...
  * All Rights Reserved.
  */
 #include "xfs.h"
-#include <linux/stddef.h>
-#include <linux/errno.h>
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/vmalloc.h>
-#include <linux/bio.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/workqueue.h>
-#include <linux/percpu.h>
-#include <linux/blkdev.h>
-#include <linux/hash.h>
-#include <linux/kthread.h>
-#include <linux/migrate.h>
 #include <linux/backing-dev.h>
-#include <linux/freezer.h>
 
+#include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
...
 #include "xfs_mount.h"
 #include "xfs_trace.h"
 #include "xfs_log.h"
+#include "xfs_log_recover.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
 #include "xfs_errortag.h"
 #include "xfs_error.h"
 
...
  *	lru_lock
  *	  b_lock (trylock due to inversion)
  */
+
+static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
+
+static inline int
+xfs_buf_submit(
+        struct xfs_buf          *bp)
+{
+        return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
+}
 
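The wrapper above folds the wait decision into the flags: a submission without XBF_ASYNC blocks until the I/O completes, while an XBF_ASYNC submission returns immediately and completion processing releases the buffer. A minimal caller-side sketch of the two modes (illustrative only, assuming a locked, referenced buffer as the rest of this file requires; `example_write` is an invented name):

```c
/* Illustrative only: how the wait semantics fall out of XBF_ASYNC. */
static int example_write(struct xfs_buf *bp, bool async)
{
        bp->b_flags |= XBF_WRITE;
        if (async) {
                bp->b_flags |= XBF_ASYNC;
                return xfs_buf_submit(bp);  /* no wait; ioend drops bp */
        }
        bp->b_flags &= ~XBF_ASYNC;
        return xfs_buf_submit(bp);          /* waits, returns I/O result */
}
```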
 static inline int
 xfs_buf_is_vmapped(
...
         }
 }
 
-struct xfs_buf *
+static int
 _xfs_buf_alloc(
         struct xfs_buftarg      *target,
         struct xfs_buf_map      *map,
         int                     nmaps,
-        xfs_buf_flags_t         flags)
+        xfs_buf_flags_t         flags,
+        struct xfs_buf          **bpp)
 {
         struct xfs_buf          *bp;
         int                     error;
         int                     i;
 
-        bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
-        if (unlikely(!bp))
-                return NULL;
+        *bpp = NULL;
+        bp = kmem_cache_zalloc(xfs_buf_zone, GFP_NOFS | __GFP_NOFAIL);
 
         /*
          * We don't want certain flags to appear in b_flags unless they are
...
         sema_init(&bp->b_sema, 0); /* held, no waiters */
         spin_lock_init(&bp->b_lock);
         bp->b_target = target;
+        bp->b_mount = target->bt_mount;
         bp->b_flags = flags;
 
         /*
...
          */
         error = xfs_buf_get_maps(bp, nmaps);
         if (error) {
-                kmem_zone_free(xfs_buf_zone, bp);
-                return NULL;
+                kmem_cache_free(xfs_buf_zone, bp);
+                return error;
         }
 
         bp->b_bn = map[0].bm_bn;
...
                 bp->b_maps[i].bm_len = map[i].bm_len;
                 bp->b_length += map[i].bm_len;
         }
-        bp->b_io_length = bp->b_length;
 
         atomic_set(&bp->b_pin_count, 0);
         init_waitqueue_head(&bp->b_waiters);
 
-        XFS_STATS_INC(target->bt_mount, xb_create);
+        XFS_STATS_INC(bp->b_mount, xb_create);
         trace_xfs_buf_init(bp, _RET_IP_);
 
-        return bp;
+        *bpp = bp;
+        return 0;
 }
 
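Since the allocation itself can no longer fail (__GFP_NOFAIL), the remaining error source is xfs_buf_get_maps(); callers now get a plain error code and receive the buffer through the out parameter. A hypothetical internal caller, with names invented for illustration:

```c
/* Hypothetical caller of the reworked constructor (not from the patch). */
static int example_alloc_buf(struct xfs_buftarg *target,
                struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags)
{
        struct xfs_buf  *bp;
        int             error;

        error = _xfs_buf_alloc(target, map, nmaps, flags, &bp);
        if (error)
                return error;   /* *bpp stayed NULL; nothing to clean up */
        /* ... set up pages, insert into the cache, etc ... */
        return 0;
}
```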
 /*
...
  * The buffer must not be on any hash - use xfs_buf_rele instead for
  * hashed and refcounted buffers
  */
-void
+static void
 xfs_buf_free(
         xfs_buf_t               *bp)
 {
...
 
                         __free_page(page);
                 }
+                if (current->reclaim_state)
+                        current->reclaim_state->reclaimed_slab +=
+                                                bp->b_page_count;
         } else if (bp->b_flags & _XBF_KMEM)
                 kmem_free(bp->b_addr);
         _xfs_buf_free_pages(bp);
         xfs_buf_free_maps(bp);
-        kmem_zone_free(xfs_buf_zone, bp);
+        kmem_cache_free(xfs_buf_zone, bp);
 }
 
 /*
...
         unsigned short          page_count, i;
         xfs_off_t               start, end;
         int                     error;
+        xfs_km_flags_t          kmflag_mask = 0;
+
+        /*
+         * assure zeroed buffer for non-read cases.
+         */
+        if (!(flags & XBF_READ)) {
+                kmflag_mask |= KM_ZERO;
+                gfp_mask |= __GFP_ZERO;
+        }
 
         /*
          * for buffers that are contained within a single page, just allocate
...
          */
         size = BBTOB(bp->b_length);
         if (size < PAGE_SIZE) {
-                bp->b_addr = kmem_alloc(size, KM_NOFS);
+                int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
+                bp->b_addr = kmem_alloc_io(size, align_mask,
+                                           KM_NOFS | kmflag_mask);
                 if (!bp->b_addr) {
                         /* low memory - use alloc_page loop instead */
                         goto use_alloc_page;
...
                 }
                 bp->b_offset = offset_in_page(bp->b_addr);
                 bp->b_pages = bp->b_page_array;
-                bp->b_pages[0] = virt_to_page(bp->b_addr);
+                bp->b_pages[0] = kmem_to_page(bp->b_addr);
                 bp->b_page_count = 1;
                 bp->b_flags |= _XBF_KMEM;
                 return 0;
...
                                         current->comm, current->pid,
                                         __func__, gfp_mask);
 
-                        XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
+                        XFS_STATS_INC(bp->b_mount, xb_page_retries);
                         congestion_wait(BLK_RW_ASYNC, HZ/50);
                         goto retry;
                 }
 
-                XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);
+                XFS_STATS_INC(bp->b_mount, xb_page_found);
 
                 nbytes = min_t(size_t, size, PAGE_SIZE - offset);
                 size -= nbytes;
...
                 unsigned nofs_flag;
 
                 /*
-                 * vm_map_ram() will allocate auxillary structures (e.g.
+                 * vm_map_ram() will allocate auxiliary structures (e.g.
                  * pagetables) with GFP_KERNEL, yet we are likely to be under
                  * GFP_NOFS context here. Hence we need to tell memory reclaim
                  * that we are in such a context via PF_MEMALLOC_NOFS to prevent
...
                 nofs_flag = memalloc_nofs_save();
                 do {
                         bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-                                                -1, PAGE_KERNEL);
+                                                -1);
                         if (bp->b_addr)
                                 break;
                         vm_unmap_aliases();
...
          */
         if (bp->b_flags & XBF_STALE) {
                 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
-                ASSERT(bp->b_iodone == NULL);
                 bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
                 bp->b_ops = NULL;
         }
...
  * cache hits, as metadata intensive workloads will see 3 orders of magnitude
  * more hits than misses.
  */
-struct xfs_buf *
+int
 xfs_buf_get_map(
         struct xfs_buftarg      *target,
         struct xfs_buf_map      *map,
         int                     nmaps,
-        xfs_buf_flags_t         flags)
+        xfs_buf_flags_t         flags,
+        struct xfs_buf          **bpp)
 {
         struct xfs_buf          *bp;
         struct xfs_buf          *new_bp;
         int                     error = 0;
 
+        *bpp = NULL;
         error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
-
-        switch (error) {
-        case 0:
-                /* cache hit */
+        if (!error)
                 goto found;
-        case -EAGAIN:
-                /* cache hit, trylock failure, caller handles failure */
-                ASSERT(flags & XBF_TRYLOCK);
-                return NULL;
-        case -ENOENT:
-                /* cache miss, go for insert */
-                break;
-        case -EFSCORRUPTED:
-        default:
-                /*
-                 * None of the higher layers understand failure types
-                 * yet, so return NULL to signal a fatal lookup error.
-                 */
-                return NULL;
-        }
+        if (error != -ENOENT)
+                return error;
 
-        new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
-        if (unlikely(!new_bp))
-                return NULL;
+        error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
+        if (error)
+                return error;
 
         error = xfs_buf_allocate_memory(new_bp, flags);
         if (error) {
                 xfs_buf_free(new_bp);
-                return NULL;
+                return error;
         }
 
         error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
         if (error) {
                 xfs_buf_free(new_bp);
-                return NULL;
+                return error;
         }
 
         if (bp != new_bp)
...
         if (!bp->b_addr) {
                 error = _xfs_buf_map_pages(bp, flags);
                 if (unlikely(error)) {
-                        xfs_warn(target->bt_mount,
-                                "%s: failed to map pages\n", __func__);
+                        xfs_warn_ratelimited(target->bt_mount,
+                                "%s: failed to map %u pages", __func__,
+                                bp->b_page_count);
                         xfs_buf_relse(bp);
-                        return NULL;
+                        return error;
                 }
         }
 
...
 
         XFS_STATS_INC(target->bt_mount, xb_get);
         trace_xfs_buf_get(bp, flags, _RET_IP_);
-        return bp;
+        *bpp = bp;
+        return 0;
 }
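Callers of xfs_buf_get_map() migrate from NULL checks to error codes; a trylock miss now surfaces as -EAGAIN instead of an ambiguous NULL, because xfs_buf_find()'s result is passed through unchanged. A before/after sketch of the calling convention (illustrative, not taken from the patch):

```c
/* Before: NULL conflated trylock failure, ENOMEM and corruption. */
bp = xfs_buf_get_map(target, map, nmaps, flags);
if (!bp)
        return -ENOMEM;         /* a guess - the real cause was lost */

/* After: the underlying error is propagated to the caller. */
error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
if (error)
        return error;           /* -EAGAIN, -ENOMEM, -EFSCORRUPTED, ... */
```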
 
-STATIC int
+int
 _xfs_buf_read(
         xfs_buf_t               *bp,
         xfs_buf_flags_t         flags)
...
         ASSERT(!(flags & XBF_WRITE));
         ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
 
-        bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
+        bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
         bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
 
         return xfs_buf_submit(bp);
 }
 
-xfs_buf_t *
+/*
+ * Reverify a buffer found in cache without an attached ->b_ops.
+ *
+ * If the caller passed an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use it to verify the contents. If verification
+ * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
+ * already in XBF_DONE state on entry.
+ *
+ * Under normal operations, every in-core buffer is verified on read I/O
+ * completion. There are two scenarios that can lead to in-core buffers without
+ * an assigned ->b_ops. The first is during log recovery of buffers on a V4
+ * filesystem, though these buffers are purged at the end of recovery. The
+ * other is online repair, which intentionally reads with a NULL buffer ops to
+ * run several verifiers across an in-core buffer in order to establish buffer
+ * type. If repair can't establish that, the buffer will be left in memory
+ * with NULL buffer ops.
+ */
+int
+xfs_buf_reverify(
+        struct xfs_buf          *bp,
+        const struct xfs_buf_ops *ops)
+{
+        ASSERT(bp->b_flags & XBF_DONE);
+        ASSERT(bp->b_error == 0);
+
+        if (!ops || bp->b_ops)
+                return 0;
+
+        bp->b_ops = ops;
+        bp->b_ops->verify_read(bp);
+        if (bp->b_error)
+                bp->b_flags &= ~XBF_DONE;
+        return bp->b_error;
+}
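One point worth noting about the helper: it assigns ->b_ops the first time it runs with a non-NULL ops, so it is effectively a one-shot check; a later call with a different ops returns 0 without verifying anything. A hedged usage sketch (the function name is invented):

```c
/* Illustrative: attach a candidate verifier to a cached, ops-less buffer. */
static int example_check_cached(struct xfs_buf *bp,
                const struct xfs_buf_ops *candidate_ops)
{
        int error = xfs_buf_reverify(bp, candidate_ops);

        /* On failure, XBF_DONE was cleared so a later read hits the disk. */
        return error;
}
```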
+
+int
 xfs_buf_read_map(
         struct xfs_buftarg      *target,
         struct xfs_buf_map      *map,
         int                     nmaps,
         xfs_buf_flags_t         flags,
-        const struct xfs_buf_ops *ops)
+        struct xfs_buf          **bpp,
+        const struct xfs_buf_ops *ops,
+        xfs_failaddr_t          fa)
 {
         struct xfs_buf          *bp;
+        int                     error;
 
         flags |= XBF_READ;
+        *bpp = NULL;
 
-        bp = xfs_buf_get_map(target, map, nmaps, flags);
-        if (bp) {
-                trace_xfs_buf_read(bp, flags, _RET_IP_);
+        error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
+        if (error)
+                return error;
 
-                if (!(bp->b_flags & XBF_DONE)) {
-                        XFS_STATS_INC(target->bt_mount, xb_get_read);
-                        bp->b_ops = ops;
-                        _xfs_buf_read(bp, flags);
-                } else if (flags & XBF_ASYNC) {
-                        /*
-                         * Read ahead call which is already satisfied,
-                         * drop the buffer
-                         */
+        trace_xfs_buf_read(bp, flags, _RET_IP_);
+
+        if (!(bp->b_flags & XBF_DONE)) {
+                /* Initiate the buffer read and wait. */
+                XFS_STATS_INC(target->bt_mount, xb_get_read);
+                bp->b_ops = ops;
+                error = _xfs_buf_read(bp, flags);
+
+                /* Readahead iodone already dropped the buffer, so exit. */
+                if (flags & XBF_ASYNC)
+                        return 0;
+        } else {
+                /* Buffer already read; all we need to do is check it. */
+                error = xfs_buf_reverify(bp, ops);
+
+                /* Readahead already finished; drop the buffer and exit. */
+                if (flags & XBF_ASYNC) {
                         xfs_buf_relse(bp);
-                        return NULL;
-                } else {
-                        /* We do not want read in the flags */
-                        bp->b_flags &= ~XBF_READ;
+                        return 0;
                 }
+
+                /* We do not want read in the flags */
+                bp->b_flags &= ~XBF_READ;
+                ASSERT(bp->b_ops != NULL || ops == NULL);
         }
 
-        return bp;
+        /*
+         * If we've had a read error, then the contents of the buffer are
+         * invalid and should not be used. To ensure that a followup read tries
+         * to pull the buffer from disk again, we clear the XBF_DONE flag and
+         * mark the buffer stale. This ensures that anyone who has a current
+         * reference to the buffer will interpret its contents correctly and
+         * future cache lookups will also treat it as an empty, uninitialised
+         * buffer.
+         */
+        if (error) {
+                if (!XFS_FORCED_SHUTDOWN(target->bt_mount))
+                        xfs_buf_ioerror_alert(bp, fa);
+
+                bp->b_flags &= ~XBF_DONE;
+                xfs_buf_stale(bp);
+                xfs_buf_relse(bp);
+
+                /* bad CRC means corrupted metadata */
+                if (error == -EFSBADCRC)
+                        error = -EFSCORRUPTED;
+                return error;
+        }
+
+        *bpp = bp;
+        return 0;
 }
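Most code reaches this through higher-level read wrappers, but a direct call now looks roughly like the sketch below (illustrative; the helper name is invented). The new xfs_failaddr_t argument is typically __this_address, so the "%pS" in the I/O error alert points at the call site rather than at xfs_buf_read_map() itself:

```c
/* Illustrative direct use of the reworked read path. */
static int example_read(struct xfs_buftarg *target, xfs_daddr_t daddr,
                size_t numblks, const struct xfs_buf_ops *ops,
                struct xfs_buf **bpp)
{
        DEFINE_SINGLE_BUF_MAP(map, daddr, numblks);

        return xfs_buf_read_map(target, &map, 1, 0, bpp, ops,
                        __this_address);
}
```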
 
 /*
...
         int                     nmaps,
         const struct xfs_buf_ops *ops)
 {
+        struct xfs_buf          *bp;
+
         if (bdi_read_congested(target->bt_bdev->bd_bdi))
                 return;
 
         xfs_buf_read_map(target, map, nmaps,
-                     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
+                     XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
+                     __this_address);
 }
 
 /*
...
         const struct xfs_buf_ops *ops)
 {
         struct xfs_buf          *bp;
+        int                     error;
 
         *bpp = NULL;
 
-        bp = xfs_buf_get_uncached(target, numblks, flags);
-        if (!bp)
-                return -ENOMEM;
+        error = xfs_buf_get_uncached(target, numblks, flags, &bp);
+        if (error)
+                return error;
 
         /* set up the buffer for a read IO */
         ASSERT(bp->b_map_count == 1);
...
 
         xfs_buf_submit(bp);
         if (bp->b_error) {
-                int     error = bp->b_error;
+                error = bp->b_error;
                 xfs_buf_relse(bp);
                 return error;
         }
...
         return 0;
 }
 
-/*
- * Return a buffer allocated as an empty buffer and associated to external
- * memory via xfs_buf_associate_memory() back to it's empty state.
- */
-void
-xfs_buf_set_empty(
-        struct xfs_buf          *bp,
-        size_t                  numblks)
-{
-        if (bp->b_pages)
-                _xfs_buf_free_pages(bp);
-
-        bp->b_pages = NULL;
-        bp->b_page_count = 0;
-        bp->b_addr = NULL;
-        bp->b_length = numblks;
-        bp->b_io_length = numblks;
-
-        ASSERT(bp->b_map_count == 1);
-        bp->b_bn = XFS_BUF_DADDR_NULL;
-        bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
-        bp->b_maps[0].bm_len = bp->b_length;
-}
-
-static inline struct page *
-mem_to_page(
-        void                    *addr)
-{
-        if ((!is_vmalloc_addr(addr))) {
-                return virt_to_page(addr);
-        } else {
-                return vmalloc_to_page(addr);
-        }
-}
-
 int
-xfs_buf_associate_memory(
-        xfs_buf_t               *bp,
-        void                    *mem,
-        size_t                  len)
-{
-        int                     rval;
-        int                     i = 0;
-        unsigned long           pageaddr;
-        unsigned long           offset;
-        size_t                  buflen;
-        int                     page_count;
-
-        pageaddr = (unsigned long)mem & PAGE_MASK;
-        offset = (unsigned long)mem - pageaddr;
-        buflen = PAGE_ALIGN(len + offset);
-        page_count = buflen >> PAGE_SHIFT;
-
-        /* Free any previous set of page pointers */
-        if (bp->b_pages)
-                _xfs_buf_free_pages(bp);
-
-        bp->b_pages = NULL;
-        bp->b_addr = mem;
-
-        rval = _xfs_buf_get_pages(bp, page_count);
-        if (rval)
-                return rval;
-
-        bp->b_offset = offset;
-
-        for (i = 0; i < bp->b_page_count; i++) {
-                bp->b_pages[i] = mem_to_page((void *)pageaddr);
-                pageaddr += PAGE_SIZE;
-        }
-
-        bp->b_io_length = BTOBB(len);
-        bp->b_length = BTOBB(buflen);
-
-        return 0;
-}
-
-xfs_buf_t *
 xfs_buf_get_uncached(
         struct xfs_buftarg      *target,
         size_t                  numblks,
-        int                     flags)
+        int                     flags,
+        struct xfs_buf          **bpp)
 {
         unsigned long           page_count;
         int                     error, i;
         struct xfs_buf          *bp;
         DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
 
+        *bpp = NULL;
+
         /* flags might contain irrelevant bits, pass only what we care about */
-        bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
-        if (unlikely(bp == NULL))
+        error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
+        if (error)
                 goto fail;
 
         page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
...
 
         for (i = 0; i < page_count; i++) {
                 bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
-                if (!bp->b_pages[i])
+                if (!bp->b_pages[i]) {
+                        error = -ENOMEM;
                         goto fail_free_mem;
+                }
         }
         bp->b_flags |= _XBF_PAGES;
 
...
         }
 
         trace_xfs_buf_get_uncached(bp, _RET_IP_);
-        return bp;
+        *bpp = bp;
+        return 0;
 
 fail_free_mem:
         while (--i >= 0)
...
         _xfs_buf_free_pages(bp);
 fail_free_buf:
         xfs_buf_free_maps(bp);
-        kmem_zone_free(xfs_buf_zone, bp);
+        kmem_cache_free(xfs_buf_zone, bp);
 fail:
-        return NULL;
+        return error;
 }
 
 /*
...
         trace_xfs_buf_lock(bp, _RET_IP_);
 
         if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
-                xfs_log_force(bp->b_target->bt_mount, 0);
+                xfs_log_force(bp->b_mount, 0);
         down(&bp->b_sema);
 
         trace_xfs_buf_lock_done(bp, _RET_IP_);
...
         set_current_state(TASK_RUNNING);
 }
 
-/*
- * Buffer Utility Routines
- */
+static void
+xfs_buf_ioerror_alert_ratelimited(
+        struct xfs_buf          *bp)
+{
+        static unsigned long    lasttime;
+        static struct xfs_buftarg *lasttarg;
 
-void
+        if (bp->b_target != lasttarg ||
+            time_after(jiffies, (lasttime + 5*HZ))) {
+                lasttime = jiffies;
+                xfs_buf_ioerror_alert(bp, __this_address);
+        }
+        lasttarg = bp->b_target;
+}
+
+/*
+ * Account for this latest trip around the retry handler, and decide if
+ * we've failed enough times to constitute a permanent failure.
+ */
+static bool
+xfs_buf_ioerror_permanent(
+        struct xfs_buf          *bp,
+        struct xfs_error_cfg    *cfg)
+{
+        struct xfs_mount        *mp = bp->b_mount;
+
+        if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
+            ++bp->b_retries > cfg->max_retries)
+                return true;
+        if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+            time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
+                return true;
+
+        /* At unmount we may treat errors differently */
+        if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
+                return true;
+
+        return false;
+}
+
+/*
+ * On a sync write or shutdown we just want to stale the buffer and let the
+ * caller handle the error in bp->b_error appropriately.
+ *
+ * If the write was asynchronous then no one will be looking for the error. If
+ * this is the first failure of this type, clear the error state and write the
+ * buffer out again. This means we always retry an async write failure at least
+ * once, but we also need to set the buffer up to behave correctly now for
+ * repeated failures.
+ *
+ * If we get repeated async write failures, then we take action according to the
+ * error configuration we have been set up to use.
+ *
+ * Returns true if this function took care of error handling and the caller must
+ * not touch the buffer again. Return false if the caller should proceed with
+ * normal I/O completion handling.
+ */
+static bool
+xfs_buf_ioend_handle_error(
+        struct xfs_buf          *bp)
+{
+        struct xfs_mount        *mp = bp->b_mount;
+        struct xfs_error_cfg    *cfg;
+
+        /*
+         * If we've already decided to shutdown the filesystem because of I/O
+         * errors, there's no point in giving this a retry.
+         */
+        if (XFS_FORCED_SHUTDOWN(mp))
+                goto out_stale;
+
+        xfs_buf_ioerror_alert_ratelimited(bp);
+
+        /*
+         * We're not going to bother about retrying this during recovery.
+         * One strike!
+         */
+        if (bp->b_flags & _XBF_LOGRECOVERY) {
+                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+                return false;
+        }
+
+        /*
+         * Synchronous writes will have callers process the error.
+         */
+        if (!(bp->b_flags & XBF_ASYNC))
+                goto out_stale;
+
+        trace_xfs_buf_iodone_async(bp, _RET_IP_);
+
+        cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+        if (bp->b_last_error != bp->b_error ||
+            !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
+                bp->b_last_error = bp->b_error;
+                if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+                    !bp->b_first_retry_time)
+                        bp->b_first_retry_time = jiffies;
+                goto resubmit;
+        }
+
+        /*
+         * Permanent error - we need to trigger a shutdown if we haven't already
+         * to indicate that inconsistency will result from this action.
+         */
+        if (xfs_buf_ioerror_permanent(bp, cfg)) {
+                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+                goto out_stale;
+        }
+
+        /* Still considered a transient error. Caller will schedule retries. */
+        if (bp->b_flags & _XBF_INODES)
+                xfs_buf_inode_io_fail(bp);
+        else if (bp->b_flags & _XBF_DQUOTS)
+                xfs_buf_dquot_io_fail(bp);
+        else
+                ASSERT(list_empty(&bp->b_li_list));
+        xfs_buf_ioerror(bp, 0);
+        xfs_buf_relse(bp);
+        return true;
+
+resubmit:
+        xfs_buf_ioerror(bp, 0);
+        bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
+        xfs_buf_submit(bp);
+        return true;
+out_stale:
+        xfs_buf_stale(bp);
+        bp->b_flags |= XBF_DONE;
+        bp->b_flags &= ~XBF_WRITE;
+        trace_xfs_buf_error_relse(bp, _RET_IP_);
+        return false;
+}
+
+static void
 xfs_buf_ioend(
         struct xfs_buf          *bp)
 {
-        bool                    read = bp->b_flags & XBF_READ;
-
         trace_xfs_buf_iodone(bp, _RET_IP_);
-
-        bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 
         /*
          * Pull in IO completion errors now. We are guaranteed to be running
...
         if (!bp->b_error && bp->b_io_error)
                 xfs_buf_ioerror(bp, bp->b_io_error);
 
-        /* Only validate buffers that were read without errors */
-        if (read && !bp->b_error && bp->b_ops) {
-                ASSERT(!bp->b_iodone);
-                bp->b_ops->verify_read(bp);
+        if (bp->b_flags & XBF_READ) {
+                if (!bp->b_error && bp->b_ops)
+                        bp->b_ops->verify_read(bp);
+                if (!bp->b_error)
+                        bp->b_flags |= XBF_DONE;
+        } else {
+                if (!bp->b_error) {
+                        bp->b_flags &= ~XBF_WRITE_FAIL;
+                        bp->b_flags |= XBF_DONE;
+                }
+
+                if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
+                        return;
+
+                /* clear the retry state */
+                bp->b_last_error = 0;
+                bp->b_retries = 0;
+                bp->b_first_retry_time = 0;
+
+                /*
+                 * Note that for things like remote attribute buffers, there may
+                 * not be a buffer log item here, so processing the buffer log
+                 * item must remain optional.
+                 */
+                if (bp->b_log_item)
+                        xfs_buf_item_done(bp);
+
+                if (bp->b_flags & _XBF_INODES)
+                        xfs_buf_inode_iodone(bp);
+                else if (bp->b_flags & _XBF_DQUOTS)
+                        xfs_buf_dquot_iodone(bp);
+
         }
 
-        if (!bp->b_error) {
-                bp->b_flags &= ~XBF_WRITE_FAIL;
-                bp->b_flags |= XBF_DONE;
-        }
+        bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
+                         _XBF_LOGRECOVERY);
 
-        if (bp->b_iodone)
-                (*(bp->b_iodone))(bp);
-        else if (bp->b_flags & XBF_ASYNC)
+        if (bp->b_flags & XBF_ASYNC)
                 xfs_buf_relse(bp);
         else
                 complete(&bp->b_iowait);
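With ->b_iodone gone, completion routing is declarative: the buffer owner tags the buffer type and xfs_buf_ioend() dispatches on the tag instead of indirecting through a per-buffer callback. A sketch of what opting in looks like (illustrative; the real flag management lives in the inode and dquot buf item code):

```c
/* Illustrative: route this buffer's ioend to the inode handler. */
static void example_mark_inode_buffer(struct xfs_buf *bp)
{
        bp->b_flags |= _XBF_INODES;     /* ioend -> xfs_buf_inode_iodone() */
}
```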
...
         struct xfs_buf          *bp)
 {
         INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
-        queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
+        queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
 }
 
 void
...
 void
 xfs_buf_ioerror_alert(
         struct xfs_buf          *bp,
-        const char              *func)
+        xfs_failaddr_t          func)
 {
-        xfs_alert(bp->b_target->bt_mount,
-"metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d",
-                func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
-                -bp->b_error);
+        xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
+                "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
+                func, (uint64_t)XFS_BUF_ADDR(bp),
+                bp->b_length, -bp->b_error);
+}
+
+/*
+ * To simulate an I/O failure, the buffer must be locked and held with at least
+ * three references. The LRU reference is dropped by the stale call. The buf
+ * item reference is dropped via ioend processing. The third reference is owned
+ * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
+ */
+void
+xfs_buf_ioend_fail(
+        struct xfs_buf          *bp)
+{
+        bp->b_flags &= ~XBF_DONE;
+        xfs_buf_stale(bp);
+        xfs_buf_ioerror(bp, -EIO);
+        xfs_buf_ioend(bp);
 }
 
 int
...
                          XBF_DONE);
 
         error = xfs_buf_submit(bp);
-        if (error) {
-                xfs_force_shutdown(bp->b_target->bt_mount,
-                                   SHUTDOWN_META_IO_ERROR);
-        }
+        if (error)
+                xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
         return error;
 }
 
...
         struct bio              *bio)
 {
         struct xfs_buf          *bp = (struct xfs_buf *)bio->bi_private;
+
+        if (!bio->bi_status &&
+            (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
+            XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
+                bio->bi_status = BLK_STS_IOERR;
 
         /*
          * don't overwrite existing errors - otherwise we can lose errors on
...
         int             map,
         int             *buf_offset,
         int             *count,
-        int             op,
-        int             op_flags)
+        int             op)
 {
         int             page_index;
         int             total_nr_pages = bp->b_page_count;
...
         bio->bi_iter.bi_sector = sector;
         bio->bi_end_io = xfs_buf_bio_end_io;
         bio->bi_private = bp;
-        bio_set_op_attrs(bio, op, op_flags);
+        bio->bi_opf = op;
 
         for (; size && nr_pages; nr_pages--, page_index++) {
                 int     rbytes, nbytes = PAGE_SIZE - offset;
...
 {
         struct blk_plug plug;
         int             op;
-        int             op_flags = 0;
         int             offset;
         int             size;
         int             i;
...
          */
         bp->b_error = 0;
 
-        /*
-         * Initialize the I/O completion workqueue if we haven't yet or the
-         * submitter has not opted to specify a custom one.
-         */
-        if (!bp->b_ioend_wq)
-                bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;
-
         if (bp->b_flags & XBF_WRITE) {
                 op = REQ_OP_WRITE;
-                if (bp->b_flags & XBF_SYNCIO)
-                        op_flags = REQ_SYNC;
-                if (bp->b_flags & XBF_FUA)
-                        op_flags |= REQ_FUA;
-                if (bp->b_flags & XBF_FLUSH)
-                        op_flags |= REQ_PREFLUSH;
 
                 /*
                  * Run the write verifier callback function if it exists. If
...
                 if (bp->b_ops) {
                         bp->b_ops->verify_write(bp);
                         if (bp->b_error) {
-                                xfs_force_shutdown(bp->b_target->bt_mount,
+                                xfs_force_shutdown(bp->b_mount,
                                                    SHUTDOWN_CORRUPT_INCORE);
                                 return;
                         }
                 } else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
-                        struct xfs_mount *mp = bp->b_target->bt_mount;
+                        struct xfs_mount *mp = bp->b_mount;
 
                         /*
                          * non-crc filesystems don't attach verifiers during
...
                                 dump_stack();
                         }
                 }
-        } else if (bp->b_flags & XBF_READ_AHEAD) {
-                op = REQ_OP_READ;
-                op_flags = REQ_RAHEAD;
         } else {
                 op = REQ_OP_READ;
+                if (bp->b_flags & XBF_READ_AHEAD)
+                        op |= REQ_RAHEAD;
         }
 
         /* we only use the buffer cache for meta-data */
-        op_flags |= REQ_META;
+        op |= REQ_META;
 
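Since bio_set_op_attrs() is gone, the request op and its modifier flags are composed in the single `op` word that lands in bi_opf. For reference, the three combinations the code above can produce are (a summary sketch, not new logic; REQ_* come from <linux/blk_types.h>):

```c
/* The bi_opf values composed by _xfs_buf_ioapply(), spelled out. */
const unsigned int opf_write     = REQ_OP_WRITE | REQ_META;
const unsigned int opf_read      = REQ_OP_READ  | REQ_META;
const unsigned int opf_readahead = REQ_OP_READ  | REQ_RAHEAD | REQ_META;
```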
         /*
          * Walk all the vectors issuing IO on them. Set up the initial offset
...
          * subsequent call.
          */
         offset = bp->b_offset;
-        size = BBTOB(bp->b_io_length);
+        size = BBTOB(bp->b_length);
         blk_start_plug(&plug);
         for (i = 0; i < bp->b_map_count; i++) {
-                xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
+                xfs_buf_ioapply_map(bp, i, &offset, &size, op);
                 if (bp->b_error)
                         break;
                 if (size <= 0)
...
  * safe to reference the buffer after a call to this function unless the caller
  * holds an additional reference itself.
  */
-int
+static int
 __xfs_buf_submit(
         struct xfs_buf          *bp,
         bool                    wait)
...
         ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 
         /* on shutdown we stale and complete the buffer immediately */
-        if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
-                xfs_buf_ioerror(bp, -EIO);
-                bp->b_flags &= ~XBF_DONE;
-                xfs_buf_stale(bp);
-                xfs_buf_ioend(bp);
+        if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+                xfs_buf_ioend_fail(bp);
                 return -EIO;
         }
 
...
         return page_address(page) + (offset & (PAGE_SIZE-1));
 }
 
-/*
- * Move data into or out of a buffer.
- */
 void
-xfs_buf_iomove(
-        xfs_buf_t               *bp,    /* buffer to process            */
-        size_t                  boff,   /* starting buffer offset       */
-        size_t                  bsize,  /* length to copy               */
-        void                    *data,  /* data address                 */
-        xfs_buf_rw_t            mode)   /* read/write/zero flag         */
+xfs_buf_zero(
+        struct xfs_buf          *bp,
+        size_t                  boff,
+        size_t                  bsize)
 {
         size_t                  bend;
...
                 page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
                 page = bp->b_pages[page_index];
                 csize = min_t(size_t, PAGE_SIZE - page_offset,
-                              BBTOB(bp->b_io_length) - boff);
+                              BBTOB(bp->b_length) - boff);
 
                 ASSERT((csize + page_offset) <= PAGE_SIZE);
 
-                switch (mode) {
-                case XBRW_ZERO:
-                        memset(page_address(page) + page_offset, 0, csize);
-                        break;
-                case XBRW_READ:
-                        memcpy(data, page_address(page) + page_offset, csize);
-                        break;
-                case XBRW_WRITE:
-                        memcpy(page_address(page) + page_offset, data, csize);
-                }
+                memset(page_address(page) + page_offset, 0, csize);
 
                 boff += csize;
-                data += csize;
         }
+}
+
+/*
+ * Log a message about and stale a buffer that a caller has decided is corrupt.
+ *
+ * This function should be called for the kinds of metadata corruption that
+ * cannot be detected from a verifier, such as incorrect inter-block
+ * relationship data. Do /not/ call this function from a verifier function.
+ *
+ * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
+ * be marked stale, but b_error will not be set. The caller is responsible for
+ * releasing the buffer or fixing it.
+ */
+void
+__xfs_buf_mark_corrupt(
+        struct xfs_buf          *bp,
+        xfs_failaddr_t          fa)
+{
+        ASSERT(bp->b_flags & XBF_DONE);
+
+        xfs_buf_corruption_error(bp, fa);
+        xfs_buf_stale(bp);
 }
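The read/write modes of xfs_buf_iomove() had no remaining callers, so the function collapses to pure zeroing. A typical call clears a sub-range of the buffer contents, for example (illustrative):

```c
/* Illustrative: zero an entire buffer's content before reuse. */
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
```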
 
 /*
...
         struct xfs_buftarg      *btp)
 {
         LIST_HEAD(dispose);
-        int loop = 0;
+        int                     loop = 0;
+        bool                    write_fail = false;
 
         /*
          * First wait on the buftarg I/O count for all in-flight buffers to be
...
                         bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
                         list_del_init(&bp->b_lru);
                         if (bp->b_flags & XBF_WRITE_FAIL) {
-                                xfs_alert(btp->bt_mount,
+                                write_fail = true;
+                                xfs_buf_alert_ratelimited(bp,
+                                        "XFS: Corruption Alert",
 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
                                         (long long)bp->b_bn);
-                                xfs_alert(btp->bt_mount,
-"Please run xfs_repair to determine the extent of the problem.");
                         }
                         xfs_buf_rele(bp);
                 }
                 if (loop++ != 0)
                         delay(100);
+        }
+
+        /*
+         * If one or more failed buffers were freed, that means dirty metadata
+         * was thrown away. This should only ever happen after I/O completion
+         * handling has elevated I/O error(s) to permanent failures and shuts
+         * down the fs.
+         */
+        if (write_fail) {
+                ASSERT(XFS_FORCED_SHUTDOWN(btp->bt_mount));
+                xfs_alert(btp->bt_mount,
+"Please run xfs_repair to determine the extent of the problem.");
         }
 }
 
...
 {
         xfs_buftarg_t           *btp;
 
-        btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
+        btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
 
         btp->bt_mount = mp;
         btp->bt_dev = bdev->bd_dev;
         btp->bt_bdev = bdev;
         btp->bt_daxdev = dax_dev;
+
+        /*
+         * Buffer IO error rate limiting. Limit it to no more than 10 messages
+         * per 30 seconds so as to not spam logs too much on repeated errors.
+         */
+        ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
+                             DEFAULT_RATELIMIT_BURST);
 
         if (xfs_setsize_buftarg_early(btp, bdev))
                 goto error_free;
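The per-buftarg ratelimit state is consumed through the standard <linux/ratelimit.h> machinery; conceptually, the xfs_buf_alert_ratelimited() helper used earlier in this patch (defined elsewhere) is expected to gate on it roughly as sketched below. This is an assumption about its behaviour, not code from the patch:

```c
/* Sketch: how a per-buftarg ratelimited alert is expected to gate. */
static void example_alert_ratelimited(struct xfs_buf *bp, const char *fmt, ...)
{
        /* __ratelimit() returns nonzero while under the 10-per-30s budget. */
        if (!__ratelimit(&bp->b_target->bt_ioerror_rl))
                return;
        /* ... format the arguments and emit via xfs_alert() ... */
}
```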
...
         struct list_head        *wait_list)
 {
         struct xfs_buf          *bp, *n;
-        LIST_HEAD               (submit_list);
         int                     pinned = 0;
         struct blk_plug         plug;
 
...
 int __init
 xfs_buf_init(void)
 {
-        xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
-                                                KM_ZONE_HWALIGN, NULL);
+        xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+                                         SLAB_HWCACHE_ALIGN |
+                                         SLAB_RECLAIM_ACCOUNT |
+                                         SLAB_MEM_SPREAD,
+                                         NULL);
         if (!xfs_buf_zone)
                 goto out;
 
...
 void
 xfs_buf_terminate(void)
 {
-        kmem_zone_destroy(xfs_buf_zone);
+        kmem_cache_destroy(xfs_buf_zone);
 }
 
 void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
...
          * This allows userspace to disrupt buffer caching for debug/testing
          * purposes.
          */
-        if (XFS_TEST_ERROR(false, bp->b_target->bt_mount,
-                           XFS_ERRTAG_BUF_LRU_REF))
+        if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
                 lru_ref = 0;
 
         atomic_set(&bp->b_lru_ref, lru_ref);
 }
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic(
+        struct xfs_buf          *bp,
+        __be32                  dmagic)
+{
+        struct xfs_mount        *mp = bp->b_mount;
+        int                     idx;
+
+        idx = xfs_sb_version_hascrc(&mp->m_sb);
+        if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
+                return false;
+        return dmagic == bp->b_ops->magic[idx];
+}
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic16(
+        struct xfs_buf          *bp,
+        __be16                  dmagic)
+{
+        struct xfs_mount        *mp = bp->b_mount;
+        int                     idx;
+
+        idx = xfs_sb_version_hascrc(&mp->m_sb);
+        if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
+                return false;
+        return dmagic == bp->b_ops->magic16[idx];
+}
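A verifier opts in by filling the two-element magic array in its xfs_buf_ops: index 0 holds the V4 (non-CRC) magic and index 1 the V5 (CRC) magic, both in disk byte order, and xfs_sb_version_hascrc() selects the slot. An illustrative ops table and read verifier follow; all `example_*` names and magic values are hypothetical, while xfs_verifier_error() is the usual way XFS verifiers record a failure:

```c
/* Illustrative verifier wiring for the magic helpers above. */
static void example_verify_read(struct xfs_buf *bp)
{
        struct example_disk_hdr *hdr = bp->b_addr;      /* hypothetical */

        if (!xfs_verify_magic(bp, hdr->magic))
                xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
}

const struct xfs_buf_ops example_buf_ops = {
        .name           = "example",
        .magic          = { cpu_to_be32(EXAMPLE_MAGIC),       /* V4 slot */
                            cpu_to_be32(EXAMPLE_CRC_MAGIC) }, /* V5 slot */
        .verify_read    = example_verify_read,
        /* .verify_write elided for brevity */
};
```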
---|