From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:20:52 +0000
Subject: [PATCH] btrfs: sync extent_io.c with updated extent I/O code

---
 kernel/fs/btrfs/extent_io.c | 1804 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 951 insertions(+), 853 deletions(-)

diff --git a/kernel/fs/btrfs/extent_io.c b/kernel/fs/btrfs/extent_io.c
index dabf153..685a375 100644
--- a/kernel/fs/btrfs/extent_io.c
+++ b/kernel/fs/btrfs/extent_io.c
@@ -14,6 +14,7 @@
 #include <linux/prefetch.h>
 #include <linux/cleancache.h>
 #include "extent_io.h"
+#include "extent-io-tree.h"
 #include "extent_map.h"
 #include "ctree.h"
 #include "btrfs_inode.h"
@@ -34,36 +35,59 @@
 }
 
 #ifdef CONFIG_BTRFS_DEBUG
-static LIST_HEAD(buffers);
 static LIST_HEAD(states);
-
 static DEFINE_SPINLOCK(leak_lock);
 
-static inline
-void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
+static inline void btrfs_leak_debug_add(spinlock_t *lock,
+					struct list_head *new,
+					struct list_head *head)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&leak_lock, flags);
+	spin_lock_irqsave(lock, flags);
 	list_add(new, head);
-	spin_unlock_irqrestore(&leak_lock, flags);
+	spin_unlock_irqrestore(lock, flags);
 }
 
-static inline
-void btrfs_leak_debug_del(struct list_head *entry)
+static inline void btrfs_leak_debug_del(spinlock_t *lock,
+					struct list_head *entry)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&leak_lock, flags);
+	spin_lock_irqsave(lock, flags);
 	list_del(entry);
-	spin_unlock_irqrestore(&leak_lock, flags);
+	spin_unlock_irqrestore(lock, flags);
 }
 
-static inline
-void btrfs_leak_debug_check(void)
+void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
+{
+	struct extent_buffer *eb;
+	unsigned long flags;
+
+	/*
+	 * If we didn't get into open_ctree our allocated_ebs will not be
+	 * initialized, so just skip this.
+ */ + if (!fs_info->allocated_ebs.next) + return; + + spin_lock_irqsave(&fs_info->eb_leak_lock, flags); + while (!list_empty(&fs_info->allocated_ebs)) { + eb = list_first_entry(&fs_info->allocated_ebs, + struct extent_buffer, leak_list); + pr_err( + "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", + eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, + btrfs_header_owner(eb)); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } + spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); +} + +static inline void btrfs_extent_state_leak_debug_check(void) { struct extent_state *state; - struct extent_buffer *eb; while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, leak_list); @@ -74,14 +98,6 @@ list_del(&state->leak_list); kmem_cache_free(extent_state_cache, state); } - - while (!list_empty(&buffers)) { - eb = list_entry(buffers.next, struct extent_buffer, leak_list); - pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n", - eb->start, eb->len, atomic_read(&eb->refs), eb->bflags); - list_del(&eb->leak_list); - kmem_cache_free(extent_buffer_cache, eb); - } } #define btrfs_debug_check_extent_io_range(tree, start, end) \ @@ -89,18 +105,25 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - if (tree->ops && tree->ops->check_extent_io_range) - tree->ops->check_extent_io_range(tree->private_data, caller, - start, end); + struct inode *inode = tree->private_data; + u64 isize; + + if (!inode || !is_data_inode(inode)) + return; + + isize = i_size_read(inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", + caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + } } #else -#define btrfs_leak_debug_add(new, head) do {} while (0) -#define btrfs_leak_debug_del(entry) do {} while (0) -#define btrfs_leak_debug_check() do {} while (0) +#define btrfs_leak_debug_add(lock, new, head) do {} while (0) +#define btrfs_leak_debug_del(lock, entry) do {} while (0) +#define btrfs_extent_state_leak_debug_check() do {} while (0) #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif - -#define BUFFER_LRU_MAX 64 struct tree_entry { u64 start; @@ -110,7 +133,6 @@ struct extent_page_data { struct bio *bio; - struct extent_io_tree *tree; /* tells writepage not to lock the state bits for this range * it still does the unlocking */ @@ -138,24 +160,20 @@ return ret; } -static int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) +int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags) { blk_status_t ret = 0; - struct bio_vec *bvec = bio_last_bvec_all(bio); - struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; - u64 start; - - start = page_offset(page) + bvec->bv_offset; bio->bi_private = NULL; - if (tree->ops) - ret = tree->ops->submit_bio_hook(tree->private_data, bio, - mirror_num, bio_flags, start); + if (is_data_inode(tree->private_data)) + ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, + bio_flags); else - btrfsic_submit_bio(bio); + ret = btrfs_submit_metadata_bio(tree->private_data, bio, + mirror_num, bio_flags); return blk_status_to_errno(ret); } @@ -194,19 +212,23 @@ return ret; } -int __init extent_io_init(void) +int __init extent_state_cache_init(void) { extent_state_cache = 
kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; + return 0; +} +int __init extent_io_init(void) +{ extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) - goto free_state_cache; + return -ENOMEM; if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_io_bio, bio), @@ -224,35 +246,76 @@ free_buffer_cache: kmem_cache_destroy(extent_buffer_cache); extent_buffer_cache = NULL; - -free_state_cache: - kmem_cache_destroy(extent_state_cache); - extent_state_cache = NULL; return -ENOMEM; +} + +void __cold extent_state_cache_exit(void) +{ + btrfs_extent_state_leak_debug_check(); + kmem_cache_destroy(extent_state_cache); } void __cold extent_io_exit(void) { - btrfs_leak_debug_check(); - /* * Make sure all delayed rcu free are flushed before we * destroy caches. */ rcu_barrier(); - kmem_cache_destroy(extent_state_cache); kmem_cache_destroy(extent_buffer_cache); bioset_exit(&btrfs_bioset); } -void extent_io_tree_init(struct extent_io_tree *tree, +/* + * For the file_extent_tree, we want to hold the inode lock when we lookup and + * update the disk_i_size, but lockdep will complain because our io_tree we hold + * the tree lock and get the inode lock when setting delalloc. These two things + * are unrelated, so make a class for the file_extent_tree so we don't get the + * two locking patterns mixed up. + */ +static struct lock_class_key file_extent_tree_class; + +void extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner, void *private_data) { + tree->fs_info = fs_info; tree->state = RB_ROOT; - tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); tree->private_data = private_data; + tree->owner = owner; + if (owner == IO_TREE_INODE_FILE_EXTENT) + lockdep_set_class(&tree->lock, &file_extent_tree_class); +} + +void extent_io_tree_release(struct extent_io_tree *tree) +{ + spin_lock(&tree->lock); + /* + * Do a single barrier for the waitqueue_active check here, the state + * of the waitqueue should not change once extent_io_tree_release is + * called. + */ + smp_mb(); + while (!RB_EMPTY_ROOT(&tree->state)) { + struct rb_node *node; + struct extent_state *state; + + node = rb_first(&tree->state); + state = rb_entry(node, struct extent_state, rb_node); + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + /* + * btree io trees aren't supposed to have tasks waiting for + * changes in the flags of extent states ever. + */ + ASSERT(!waitqueue_active(&state->wq)); + free_extent_state(state); + + cond_resched_lock(&tree->lock); + } + spin_unlock(&tree->lock); } static struct extent_state *alloc_extent_state(gfp_t mask) @@ -270,7 +333,7 @@ state->state = 0; state->failrec = NULL; RB_CLEAR_NODE(&state->rb_node); - btrfs_leak_debug_add(&state->leak_list, &states); + btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states); refcount_set(&state->refs, 1); init_waitqueue_head(&state->wq); trace_alloc_extent_state(state, mask, _RET_IP_); @@ -283,7 +346,7 @@ return; if (refcount_dec_and_test(&state->refs)) { WARN_ON(extent_state_in_tree(state)); - btrfs_leak_debug_del(&state->leak_list); + btrfs_leak_debug_del(&leak_lock, &state->leak_list); trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } @@ -325,9 +388,27 @@ return NULL; } +/** + * __etree_search - searche @tree for an entry that contains @offset. 
Such + * entry would have entry->start <= offset && entry->end >= offset. + * + * @tree - the tree to search + * @offset - offset that should fall within an entry in @tree + * @next_ret - pointer to the first entry whose range ends after @offset + * @prev - pointer to the first entry whose range begins before @offset + * @p_ret - pointer where new node should be anchored (used when inserting an + * entry in the tree) + * @parent_ret - points to entry which would have been the parent of the entry, + * containing @offset + * + * This function returns a pointer to the entry that contains @offset byte + * address. If no such entry exists, then NULL is returned and the other + * pointer arguments to the function are filled, otherwise the found entry is + * returned and other pointers are left untouched. + */ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **prev_ret, struct rb_node **next_ret, + struct rb_node **prev_ret, struct rb_node ***p_ret, struct rb_node **parent_ret) { @@ -356,23 +437,23 @@ if (parent_ret) *parent_ret = prev; - if (prev_ret) { + if (next_ret) { orig_prev = prev; while (prev && offset > prev_entry->end) { prev = rb_next(prev); prev_entry = rb_entry(prev, struct tree_entry, rb_node); } - *prev_ret = prev; + *next_ret = prev; prev = orig_prev; } - if (next_ret) { + if (prev_ret) { prev_entry = rb_entry(prev, struct tree_entry, rb_node); while (prev && offset < prev_entry->start) { prev = rb_prev(prev); prev_entry = rb_entry(prev, struct tree_entry, rb_node); } - *next_ret = prev; + *prev_ret = prev; } return NULL; } @@ -383,12 +464,12 @@ struct rb_node ***p_ret, struct rb_node **parent_ret) { - struct rb_node *prev = NULL; + struct rb_node *next= NULL; struct rb_node *ret; - ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); + ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret); if (!ret) - return prev; + return next; return ret; } @@ -396,13 +477,6 @@ u64 offset) { return tree_search_for_insert(tree, offset, NULL, NULL); -} - -static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, - struct extent_state *other) -{ - if (tree->ops && tree->ops->merge_extent_hook) - tree->ops->merge_extent_hook(tree->private_data, new, other); } /* @@ -420,7 +494,7 @@ struct extent_state *other; struct rb_node *other_node; - if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) return; other_node = rb_prev(&state->rb_node); @@ -428,7 +502,10 @@ other = rb_entry(other_node, struct extent_state, rb_node); if (other->end == state->start - 1 && other->state == state->state) { - merge_cb(tree, state, other); + if (tree->private_data && + is_data_inode(tree->private_data)) + btrfs_merge_delalloc_extent(tree->private_data, + state, other); state->start = other->start; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -440,27 +517,16 @@ other = rb_entry(other_node, struct extent_state, rb_node); if (other->start == state->end + 1 && other->state == state->state) { - merge_cb(tree, state, other); + if (tree->private_data && + is_data_inode(tree->private_data)) + btrfs_merge_delalloc_extent(tree->private_data, + state, other); state->end = other->end; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); free_extent_state(other); } } -} - -static void set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits) -{ - if (tree->ops && tree->ops->set_bit_hook) - 
tree->ops->set_bit_hook(tree->private_data, state, bits); -} - -static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits) -{ - if (tree->ops && tree->ops->clear_bit_hook) - tree->ops->clear_bit_hook(tree->private_data, state, bits); } static void set_state_bits(struct extent_io_tree *tree, @@ -485,9 +551,11 @@ { struct rb_node *node; - if (end < start) - WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", - end, start); + if (end < start) { + btrfs_err(tree->fs_info, + "insert state: end < start %llu %llu", end, start); + WARN_ON(1); + } state->start = start; state->end = end; @@ -497,19 +565,13 @@ if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); - pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n", + btrfs_err(tree->fs_info, + "found node %llu %llu on insert of %llu %llu", found->start, found->end, start, end); return -EEXIST; } merge_state(tree, state); return 0; -} - -static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, - u64 split) -{ - if (tree->ops && tree->ops->split_extent_hook) - tree->ops->split_extent_hook(tree->private_data, orig, split); } /* @@ -531,7 +593,8 @@ { struct rb_node *node; - split_cb(tree, orig, split); + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_split_delalloc_extent(tree->private_data, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -558,7 +621,7 @@ /* * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1). + * it will optionally wake up anyone waiting on this state (wake == 1). * * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree @@ -577,7 +640,10 @@ WARN_ON(range > tree->dirty_bytes); tree->dirty_bytes -= range; } - clear_state_cb(tree, state, bits); + + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_clear_delalloc_extent(tree->private_data, state, bits); + ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); state->state &= ~bits_to_clear; @@ -610,9 +676,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) { - struct inode *inode = tree->private_data; - - btrfs_panic(btrfs_sb(inode->i_sb), err, + btrfs_panic(tree->fs_info, err, "locking error: extent tree was modified by another thread while locked"); } @@ -642,15 +706,15 @@ int clear = 0; btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; if (delete) bits |= ~EXTENT_CTLBITS; - bits |= EXTENT_FIRST_DELALLOC; - if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) clear = 1; again: if (!prealloc && gfpflags_allow_blocking(mask)) { @@ -854,7 +918,9 @@ unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; int ret; - set_state_cb(tree, state, bits); + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_set_delalloc_extent(tree->private_data, state, bits); + if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; @@ -880,7 +946,7 @@ struct extent_state **cached_ptr) { return cache_state_if_flags(state, cached_ptr, - EXTENT_IOBITS | EXTENT_BOUNDARY); + EXTENT_LOCKED | EXTENT_BOUNDARY); } /* @@ -910,8 +976,8 @@ u64 last_end; btrfs_debug_check_extent_io_range(tree, start, 
end); + trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); - bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && gfpflags_allow_blocking(mask)) { /* @@ -1002,6 +1068,16 @@ *failed_start = start; err = -EEXIST; goto out; + } + + /* + * If this extent already has all the bits we want set, then + * skip it, not necessary to split it or do anything with it. + */ + if ((state->state & bits) == bits) { + start = state->end + 1; + cache_state(state, cached_state); + goto search_again; } prealloc = alloc_extent_state_atomic(prealloc); @@ -1143,6 +1219,8 @@ bool first_iteration = true; btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, + clear_bits); again: if (!prealloc) { @@ -1342,6 +1420,13 @@ changeset); } +int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits) +{ + return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, + GFP_NOWAIT, NULL); +} + int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached) @@ -1478,20 +1563,15 @@ struct extent_state **cached_state) { struct extent_state *state; - struct rb_node *n; int ret = 1; spin_lock(&tree->lock); if (cached_state && *cached_state) { state = *cached_state; if (state->end == start - 1 && extent_state_in_tree(state)) { - n = rb_next(&state->rb_node); - while (n) { - state = rb_entry(n, struct extent_state, - rb_node); + while ((state = next_state(state)) != NULL) { if (state->state & bits) goto got_it; - n = rb_next(n); } free_extent_state(*cached_state); *cached_state = NULL; @@ -1514,20 +1594,174 @@ return ret; } +/** + * find_contiguous_extent_bit: find a contiguous area of bits + * @tree - io tree to check + * @start - offset to start the search from + * @start_ret - the first offset we found with the bits set + * @end_ret - the final contiguous range of the bits that were set + * @bits - bits to look for + * + * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges + * to set bits appropriately, and then merge them again. During this time it + * will drop the tree->lock, so use this helper if you want to find the actual + * contiguous area for given bits. We will search to the first bit we find, and + * then walk down the tree until we find a non-contiguous area. The area + * returned will be the full contiguous area with the bits set. + */ +int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits) +{ + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, start, bits); + if (state) { + *start_ret = state->start; + *end_ret = state->end; + while ((state = next_state(state)) != NULL) { + if (state->start > (*end_ret + 1)) + break; + *end_ret = state->end; + } + ret = 0; + } + spin_unlock(&tree->lock); + return ret; +} + +/** + * find_first_clear_extent_bit - find the first range that has @bits not set. + * This range could start before @start. 
+ * + * @tree - the tree to search + * @start - the offset at/after which the found extent should start + * @start_ret - records the beginning of the range + * @end_ret - records the end of the range (inclusive) + * @bits - the set of bits which must be unset + * + * Since unallocated range is also considered one which doesn't have the bits + * set it's possible that @end_ret contains -1, this happens in case the range + * spans (last_range_end, end of device]. In this case it's up to the caller to + * trim @end_ret to the appropriate size. + */ +void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits) +{ + struct extent_state *state; + struct rb_node *node, *prev = NULL, *next; + + spin_lock(&tree->lock); + + /* Find first extent with bits cleared */ + while (1) { + node = __etree_search(tree, start, &next, &prev, NULL, NULL); + if (!node && !next && !prev) { + /* + * Tree is completely empty, send full range and let + * caller deal with it + */ + *start_ret = 0; + *end_ret = -1; + goto out; + } else if (!node && !next) { + /* + * We are past the last allocated chunk, set start at + * the end of the last extent. + */ + state = rb_entry(prev, struct extent_state, rb_node); + *start_ret = state->end + 1; + *end_ret = -1; + goto out; + } else if (!node) { + node = next; + } + /* + * At this point 'node' either contains 'start' or start is + * before 'node' + */ + state = rb_entry(node, struct extent_state, rb_node); + + if (in_range(start, state->start, state->end - state->start + 1)) { + if (state->state & bits) { + /* + * |--range with bits sets--| + * | + * start + */ + start = state->end + 1; + } else { + /* + * 'start' falls within a range that doesn't + * have the bits set, so take its start as + * the beginning of the desired range + * + * |--range with bits cleared----| + * | + * start + */ + *start_ret = state->start; + break; + } + } else { + /* + * |---prev range---|---hole/unset---|---node range---| + * | + * start + * + * or + * + * |---hole/unset--||--first node--| + * 0 | + * start + */ + if (prev) { + state = rb_entry(prev, struct extent_state, + rb_node); + *start_ret = state->end + 1; + } else { + *start_ret = 0; + } + break; + } + } + + /* + * Find the longest stretch from start until an entry which has the + * bits set + */ + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && !(state->state & bits)) { + *end_ret = state->end; + } else { + *end_ret = state->start - 1; + break; + } + + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); +} + /* * find a contiguous range of bytes in the file marked as delalloc, not * more than 'max_bytes'. 
start and end are used to return the range, * - * 1 is returned if we find something, 0 if nothing was in the tree + * true is returned if we find something, false if nothing was in the tree */ -static noinline u64 find_delalloc_range(struct extent_io_tree *tree, - u64 *start, u64 *end, u64 max_bytes, - struct extent_state **cached_state) +bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, + u64 *end, u64 max_bytes, + struct extent_state **cached_state) { struct rb_node *node; struct extent_state *state; u64 cur_start = *start; - u64 found = 0; + bool found = false; u64 total_bytes = 0; spin_lock(&tree->lock); @@ -1538,8 +1772,7 @@ */ node = tree_search(tree, cur_start); if (!node) { - if (!found) - *end = (u64)-1; + *end = (u64)-1; goto out; } @@ -1559,7 +1792,7 @@ *cached_state = state; refcount_inc(&state->refs); } - found++; + found = true; *end = state->end; cur_start = state->end + 1; node = rb_next(node); @@ -1617,19 +1850,22 @@ } /* - * find a contiguous range of bytes in the file marked as delalloc, not - * more than 'max_bytes'. start and end are used to return the range, + * Find and lock a contiguous range of bytes in the file marked as delalloc, no + * more than @max_bytes. @Start and @end are used to return the range, * - * 1 is returned if we find something, 0 if nothing was in the tree + * Return: true if we find something + * false if nothing was in the tree */ -STATIC u64 find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, +EXPORT_FOR_TESTS +noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes) + u64 *end) { + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; - u64 found; + bool found; struct extent_state *cached_state = NULL; int ret; int loops = 0; @@ -1638,13 +1874,13 @@ /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; delalloc_end = 0; - found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, - max_bytes, &cached_state); + found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, + max_bytes, &cached_state); if (!found || delalloc_end <= *start) { *start = delalloc_start; *end = delalloc_end; free_extent_state(cached_state); - return 0; + return false; } /* @@ -1664,6 +1900,7 @@ /* step two, lock all the pages after the page that has start */ ret = lock_delalloc_pages(inode, locked_page, delalloc_start, delalloc_end); + ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { /* some of the pages are gone, lets avoid looping by * shortening the size of the delalloc range we're searching @@ -1675,11 +1912,10 @@ loops = 1; goto again; } else { - found = 0; + found = false; goto out_failed; } } - BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); @@ -1741,7 +1977,7 @@ if (page_ops & PAGE_SET_PRIVATE2) SetPagePrivate2(pages[i]); - if (pages[i] == locked_page) { + if (locked_page && pages[i] == locked_page) { put_page(pages[i]); pages_locked++; continue; @@ -1780,15 +2016,14 @@ return err; } -void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, - u64 delalloc_end, struct page *locked_page, - unsigned clear_bits, - unsigned long page_ops) +void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + struct page *locked_page, + 
unsigned clear_bits, + unsigned long page_ops) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0, - NULL); + clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); - __process_pages_contig(inode->i_mapping, locked_page, + __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, start >> PAGE_SHIFT, end >> PAGE_SHIFT, page_ops, NULL); } @@ -1857,8 +2092,8 @@ * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. */ -static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record *failrec) +int set_state_failrec(struct extent_io_tree *tree, u64 start, + struct io_failure_record *failrec) { struct rb_node *node; struct extent_state *state; @@ -1885,12 +2120,11 @@ return ret; } -static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record **failrec) +struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start) { struct rb_node *node; struct extent_state *state; - int ret = 0; + struct io_failure_record *failrec; spin_lock(&tree->lock); /* @@ -1899,18 +2133,19 @@ */ node = tree_search(tree, start); if (!node) { - ret = -ENOENT; + failrec = ERR_PTR(-ENOENT); goto out; } state = rb_entry(node, struct extent_state, rb_node); if (state->start != start) { - ret = -ENOENT; + failrec = ERR_PTR(-ENOENT); goto out; } - *failrec = state->failrec; + + failrec = state->failrec; out: spin_unlock(&tree->lock); - return ret; + return failrec; } /* @@ -2096,9 +2331,9 @@ return 0; } -int repair_eb_io_failure(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int mirror_num) +int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { + struct btrfs_fs_info *fs_info = eb->fs_info; u64 start = eb->start; int i, num_pages = num_extent_pages(eb); int ret = 0; @@ -2140,8 +2375,8 @@ if (!ret) return 0; - ret = get_state_failrec(failure_tree, start, &failrec); - if (ret) + failrec = get_state_failrec(failure_tree, start); + if (IS_ERR(failrec)) return 0; BUG_ON(!failrec->this_mirror); @@ -2213,8 +2448,8 @@ spin_unlock(&failure_tree->lock); } -int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, - struct io_failure_record **failrec_ret) +static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, + u64 start, u64 end) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct io_failure_record *failrec; @@ -2225,65 +2460,8 @@ int ret; u64 logical; - ret = get_state_failrec(failure_tree, start, &failrec); - if (ret) { - failrec = kzalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return -ENOMEM; - - failrec->start = start; - failrec->len = end - start + 1; - failrec->this_mirror = 0; - failrec->bio_flags = 0; - failrec->in_validation = 0; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (!em) { - read_unlock(&em_tree->lock); - kfree(failrec); - return -EIO; - } - - if (em->start > start || em->start + em->len <= start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - if (!em) { - kfree(failrec); - return -EIO; - } - - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, - em->compress_type); - } - - btrfs_debug(fs_info, - "Get IO Failure Record: (new) 
logical=%llu, start=%llu, len=%llu", - logical, start, failrec->len); - - failrec->logical = logical; - free_extent_map(em); - - /* set the bits in the private failure tree */ - ret = set_extent_bits(failure_tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY); - if (ret >= 0) - ret = set_state_failrec(failure_tree, start, failrec); - /* set the bits in the inode's tree */ - if (ret >= 0) - ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); - if (ret < 0) { - kfree(failrec); - return ret; - } - } else { + failrec = get_state_failrec(failure_tree, start); + if (!IS_ERR(failrec)) { btrfs_debug(fs_info, "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", failrec->logical, failrec->start, failrec->len, @@ -2293,15 +2471,71 @@ * (e.g. with a list for failed_mirror) to make * clean_io_failure() clean all those errors at once. */ + + return failrec; } - *failrec_ret = failrec; + failrec = kzalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return ERR_PTR(-ENOMEM); - return 0; + failrec->start = start; + failrec->len = end - start + 1; + failrec->this_mirror = 0; + failrec->bio_flags = 0; + failrec->in_validation = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (!em) { + read_unlock(&em_tree->lock); + kfree(failrec); + return ERR_PTR(-EIO); + } + + if (em->start > start || em->start + em->len <= start) { + free_extent_map(em); + em = NULL; + } + read_unlock(&em_tree->lock); + if (!em) { + kfree(failrec); + return ERR_PTR(-EIO); + } + + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, em->compress_type); + } + + btrfs_debug(fs_info, + "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", + logical, start, failrec->len); + + failrec->logical = logical; + free_extent_map(em); + + /* Set the bits in the private failure tree */ + ret = set_extent_bits(failure_tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY); + if (ret >= 0) { + ret = set_state_failrec(failure_tree, start, failrec); + /* Set the bits in the inode's tree */ + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); + } else if (ret < 0) { + kfree(failrec); + return ERR_PTR(ret); + } + + return failrec; } -bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, - struct io_failure_record *failrec, int failed_mirror) +static bool btrfs_check_repairable(struct inode *inode, bool needs_validation, + struct io_failure_record *failrec, + int failed_mirror) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int num_copies; @@ -2324,7 +2558,7 @@ * a) deliver good data to the caller * b) correct the bad sectors on disk */ - if (failed_bio_pages > 1) { + if (needs_validation) { /* * to fulfill b), we need to know the exact failing sectors, as * we don't want to rewrite any more than the failed ones. 
thus, @@ -2363,97 +2597,114 @@ return true; } - -struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, - struct io_failure_record *failrec, - struct page *page, int pg_offset, int icsum, - bio_end_io_t *endio_func, void *data) +static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct bio *bio; - struct btrfs_io_bio *btrfs_failed_bio; - struct btrfs_io_bio *btrfs_bio; + u64 len = 0; + const u32 blocksize = inode->i_sb->s_blocksize; - bio = btrfs_io_bio_alloc(1); - bio->bi_end_io = endio_func; - bio->bi_iter.bi_sector = failrec->logical >> 9; - bio_set_dev(bio, fs_info->fs_devices->latest_bdev); - bio->bi_iter.bi_size = 0; - bio->bi_private = data; + /* + * If bi_status is BLK_STS_OK, then this was a checksum error, not an + * I/O error. In this case, we already know exactly which sector was + * bad, so we don't need to validate. + */ + if (bio->bi_status == BLK_STS_OK) + return false; - btrfs_failed_bio = btrfs_io_bio(failed_bio); - if (btrfs_failed_bio->csum) { - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + /* + * We need to validate each sector individually if the failed I/O was + * for multiple sectors. + * + * There are a few possible bios that can end up here: + * 1. A buffered read bio, which is not cloned. + * 2. A direct I/O read bio, which is cloned. + * 3. A (buffered or direct) repair bio, which is not cloned. + * + * For cloned bios (case 2), we can get the size from + * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get + * it from the bvecs. + */ + if (bio_flagged(bio, BIO_CLONED)) { + if (btrfs_io_bio(bio)->iter.bi_size > blocksize) + return true; + } else { + struct bio_vec *bvec; + int i; - btrfs_bio = btrfs_io_bio(bio); - btrfs_bio->csum = btrfs_bio->csum_inline; - icsum *= csum_size; - memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum, - csum_size); + bio_for_each_bvec_all(bvec, bio, i) { + len += bvec->bv_len; + if (len > blocksize) + return true; + } } - - bio_add_page(bio, page, failrec->len, pg_offset); - - return bio; + return false; } -/* - * this is a generic handler for readpage errors (default - * readpage_io_failed_hook). if other copies exist, read those and write back - * good data to the failed position. 
does not investigate in remapping the - * failed extent elsewhere, hoping the device will be smart enough to do this as - * needed - */ - -static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int failed_mirror) +blk_status_t btrfs_submit_read_repair(struct inode *inode, + struct bio *failed_bio, u64 phy_offset, + struct page *page, unsigned int pgoff, + u64 start, u64 end, int failed_mirror, + submit_bio_hook_t *submit_bio_hook) { struct io_failure_record *failrec; - struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct bio *bio; - int read_mode = 0; + struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); + const int icsum = phy_offset >> inode->i_sb->s_blocksize_bits; + bool need_validation; + struct bio *repair_bio; + struct btrfs_io_bio *repair_io_bio; blk_status_t status; - int ret; - unsigned failed_bio_pages = bio_pages_all(failed_bio); + + btrfs_debug(fs_info, + "repair read error: read error at %llu", start); BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - ret = btrfs_get_io_failure_record(inode, start, end, &failrec); - if (ret) - return ret; + failrec = btrfs_get_io_failure_record(inode, start, end); + if (IS_ERR(failrec)) + return errno_to_blk_status(PTR_ERR(failrec)); - if (!btrfs_check_repairable(inode, failed_bio_pages, failrec, + need_validation = btrfs_io_needs_validation(inode, failed_bio); + + if (!btrfs_check_repairable(inode, need_validation, failrec, failed_mirror)) { free_io_failure(failure_tree, tree, failrec); - return -EIO; + return BLK_STS_IOERR; } - if (failed_bio_pages > 1) - read_mode |= REQ_FAILFAST_DEV; + repair_bio = btrfs_io_bio_alloc(1); + repair_io_bio = btrfs_io_bio(repair_bio); + repair_bio->bi_opf = REQ_OP_READ; + if (need_validation) + repair_bio->bi_opf |= REQ_FAILFAST_DEV; + repair_bio->bi_end_io = failed_bio->bi_end_io; + repair_bio->bi_iter.bi_sector = failrec->logical >> 9; + repair_bio->bi_private = failed_bio->bi_private; - phy_offset >>= inode->i_sb->s_blocksize_bits; - bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, - start - page_offset(page), - (int)phy_offset, failed_bio->bi_end_io, - NULL); - bio->bi_opf = REQ_OP_READ | read_mode; + if (failed_io_bio->csum) { + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + + repair_io_bio->csum = repair_io_bio->csum_inline; + memcpy(repair_io_bio->csum, + failed_io_bio->csum + csum_size * icsum, csum_size); + } + + bio_add_page(repair_bio, page, failrec->len, pgoff); + repair_io_bio->logical = failrec->start; + repair_io_bio->iter = repair_bio->bi_iter; btrfs_debug(btrfs_sb(inode->i_sb), - "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", - read_mode, failrec->this_mirror, failrec->in_validation); +"repair read error: submitting new read to mirror %d, in_validation=%d", + failrec->this_mirror, failrec->in_validation); - status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror, - failrec->bio_flags, 0); + status = submit_bio_hook(inode, repair_bio, failrec->this_mirror, + failrec->bio_flags); if (status) { free_io_failure(failure_tree, tree, failrec); - bio_put(bio); - ret = blk_status_to_errno(status); + bio_put(repair_bio); } - - return ret; + return status; } /* lots and lots of room for performance fixes in the end_bio funcs */ @@ -2461,14 +2712,9 @@ void 
end_extent_writepage(struct page *page, int err, u64 start, u64 end) { int uptodate = (err == 0); - struct extent_io_tree *tree; int ret = 0; - tree = &BTRFS_I(page->mapping->host)->io_tree; - - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, end, NULL, - uptodate); + btrfs_writepage_endio_finish_ordered(page, start, end, uptodate); if (!uptodate) { ClearPageUptodate(page); @@ -2493,10 +2739,10 @@ struct bio_vec *bvec; u64 start; u64 end; - int i; + struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2564,10 +2810,10 @@ u64 extent_len = 0; int mirror; int ret; - int i; + struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2600,10 +2846,13 @@ len = bvec->bv_len; mirror = io_bio->mirror_num; - if (likely(uptodate && tree->ops)) { - ret = tree->ops->readpage_end_io_hook(io_bio, offset, - page, start, end, - mirror); + if (likely(uptodate)) { + if (is_data_inode(inode)) + ret = btrfs_verify_data_csum(io_bio, offset, page, + start, end, mirror); + else + ret = btrfs_validate_metadata_buffer(io_bio, + offset, page, start, end, mirror); if (ret) uptodate = 0; else @@ -2616,38 +2865,36 @@ if (likely(uptodate)) goto readpage_ok; - if (tree->ops) { - ret = tree->ops->readpage_io_failed_hook(page, mirror); - if (ret == -EAGAIN) { - /* - * Data inode's readpage_io_failed_hook() always - * returns -EAGAIN. - * - * The generic bio_readpage_error handles errors - * the following way: If possible, new read - * requests are created and submitted and will - * end up in end_bio_extent_readpage as well (if - * we're lucky, not in the !uptodate case). In - * that case it returns 0 and we just go on with - * the next page in our bio. If it can't handle - * the error it will return -EIO and we remain - * responsible for that page. - */ - ret = bio_readpage_error(bio, offset, page, - start, end, mirror); - if (ret == 0) { - uptodate = !bio->bi_status; - offset += len; - continue; - } - } + if (is_data_inode(inode)) { /* - * metadata's readpage_io_failed_hook() always returns - * -EIO and fixes nothing. -EIO is also returned if - * data inode error could not be fixed. + * The generic bio_readpage_error handles errors the + * following way: If possible, new read requests are + * created and submitted and will end up in + * end_bio_extent_readpage as well (if we're lucky, + * not in the !uptodate case). In that case it returns + * 0 and we just go on with the next page in our bio. + * If it can't handle the error it will return -EIO and + * we remain responsible for that page. 
*/ - ASSERT(ret == -EIO); + if (!btrfs_submit_read_repair(inode, bio, offset, page, + start - page_offset(page), + start, end, mirror, + btrfs_submit_data_bio)) { + uptodate = !bio->bi_status; + offset += len; + continue; + } + } else { + struct extent_buffer *eb; + + eb = (struct extent_buffer *)page->private; + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = mirror; + atomic_dec(&eb->io_pages); + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, + &eb->bflags)) + btree_readahead_hook(eb, -EIO); } readpage_ok: if (likely(uptodate)) { @@ -2656,7 +2903,7 @@ unsigned off; /* Zero out the end if this page straddles i_size */ - off = i_size & (PAGE_SIZE-1); + off = offset_in_page(i_size); if (page->index == end_index && off) zero_user_segment(page, off, PAGE_SIZE); SetPageUptodate(page); @@ -2693,8 +2940,7 @@ if (extent_len) endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); - if (io_bio->end_io) - io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status)); + btrfs_io_bio_free_csum(io_bio); bio_put(bio); } @@ -2713,12 +2959,11 @@ * never fail. We're returning a bio right now but you can call btrfs_io_bio * for the appropriate container_of magic */ -struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) +struct bio *btrfs_bio_alloc(u64 first_byte) { struct bio *bio; bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; @@ -2766,25 +3011,22 @@ /* * @opf: bio REQ_OP_* and REQ_* flags as one value - * @tree: tree so we can call our merge_bio hook * @wbc: optional writeback control for io accounting * @page: page to add to the bio * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one * @size: portion of page that we want to write * @offset: starting offset in the page - * @bdev: attach newly created bios to this bdev * @bio_ret: must be valid pointer, newly allocated bio will be stored there * @end_io_func: end_io callback for new bio * @mirror_num: desired mirror to read/write * @prev_bio_flags: flags of previous bio to see if we can merge the current one * @bio_flags: flags of the current bio to see if we can merge them */ -static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, +static int submit_extent_page(unsigned int opf, struct writeback_control *wbc, struct page *page, u64 offset, size_t size, unsigned long pg_offset, - struct block_device *bdev, struct bio **bio_ret, bio_end_io_t end_io_func, int mirror_num, @@ -2796,6 +3038,7 @@ struct bio *bio; size_t page_size = min_t(size_t, size, PAGE_SIZE); sector_t sector = offset >> 9; + struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; ASSERT(bio_ret); @@ -2809,8 +3052,7 @@ else contig = bio_end_sector(bio) == sector; - if (tree->ops && btrfs_merge_bio_hook(page, offset, page_size, - bio, bio_flags)) + if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) can_merge = false; if (prev_bio_flags != bio_flags || !contig || !can_merge || @@ -2824,20 +3066,24 @@ bio = NULL; } else { if (wbc) - wbc_account_io(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, page_size); return 0; } } - bio = btrfs_bio_alloc(bdev, offset); + bio = btrfs_bio_alloc(offset); bio_add_page(bio, page, page_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; bio->bi_write_hint = page->mapping->host->i_write_hint; bio->bi_opf = opf; if (wbc) { + struct block_device 
*bdev; + + bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev; + bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); - wbc_account_io(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, page_size); } *bio_ret = bio; @@ -2848,28 +3094,21 @@ static void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) { - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, (unsigned long)eb); - } else { + if (!PagePrivate(page)) + attach_page_private(page, eb); + else WARN_ON(page->private != (unsigned long)eb); - } } void set_page_extent_mapped(struct page *page) { - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, EXTENT_PAGE_PRIVATE); - } + if (!PagePrivate(page)) + attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); } static struct extent_map * __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 len, get_extent_t *get_extent, - struct extent_map **em_cached) + u64 start, u64 len, struct extent_map **em_cached) { struct extent_map *em; @@ -2885,7 +3124,7 @@ *em_cached = NULL; } - em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -2900,13 +3139,9 @@ * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int __do_readpage(struct extent_io_tree *tree, - struct page *page, - get_extent_t *get_extent, - struct extent_map **em_cached, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, unsigned int read_flags, - u64 *prev_em_start) +int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + struct bio **bio, unsigned long *bio_flags, + unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); @@ -2917,7 +3152,6 @@ u64 block_start; u64 cur_end; struct extent_map *em; - struct block_device *bdev; int ret = 0; int nr = 0; size_t pg_offset = 0; @@ -2925,6 +3159,7 @@ size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; unsigned long this_bio_flag = 0; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; set_page_extent_mapped(page); @@ -2938,7 +3173,7 @@ if (page->index == last_byte >> PAGE_SHIFT) { char *userpage; - size_t zero_offset = last_byte & (PAGE_SIZE - 1); + size_t zero_offset = offset_in_page(last_byte); if (zero_offset) { iosize = PAGE_SIZE - zero_offset; @@ -2968,7 +3203,7 @@ break; } em = __get_extent_map(inode, page, pg_offset, cur, - end - cur + 1, get_extent, em_cached); + end - cur + 1, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); unlock_extent(tree, cur, end); @@ -2994,14 +3229,13 @@ offset = em->block_start + extent_offset; disk_io_size = iosize; } - bdev = em->bdev; block_start = em->block_start; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) block_start = EXTENT_MAP_HOLE; /* * If we have a file range that points to a compressed extent - * and it's followed by a consecutive file range that points to + * and it's followed by a consecutive file range that points * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a @@ -3082,10 +3316,10 @@ continue; } - ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, + ret = 
submit_extent_page(REQ_OP_READ | read_flags, NULL, page, offset, disk_io_size, - pg_offset, bdev, bio, - end_bio_extent_readpage, mirror_num, + pg_offset, bio, + end_bio_extent_readpage, 0, *bio_flags, this_bio_flag, force_bio_submit); @@ -3109,118 +3343,23 @@ return ret; } -static inline void __do_contiguous_readpages(struct extent_io_tree *tree, - struct page *pages[], int nr_pages, +static inline void contiguous_readpages(struct page *pages[], int nr_pages, u64 start, u64 end, struct extent_map **em_cached, struct bio **bio, unsigned long *bio_flags, u64 *prev_em_start) { - struct inode *inode; - struct btrfs_ordered_extent *ordered; + struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); int index; - inode = pages[0]->mapping->host; - while (1) { - lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - end - start + 1); - if (!ordered) - break; - unlock_extent(tree, start, end); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, - bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); + btrfs_do_readpage(pages[index], em_cached, bio, bio_flags, + REQ_RAHEAD, prev_em_start); put_page(pages[index]); } -} - -static void __extent_readpages(struct extent_io_tree *tree, - struct page *pages[], - int nr_pages, - struct extent_map **em_cached, - struct bio **bio, unsigned long *bio_flags, - u64 *prev_em_start) -{ - u64 start = 0; - u64 end = 0; - u64 page_start; - int index; - int first_index = 0; - - for (index = 0; index < nr_pages; index++) { - page_start = page_offset(pages[index]); - if (!end) { - start = page_start; - end = start + PAGE_SIZE - 1; - first_index = index; - } else if (end + 1 == page_start) { - end += PAGE_SIZE; - } else { - __do_contiguous_readpages(tree, &pages[first_index], - index - first_index, start, - end, em_cached, - bio, bio_flags, - prev_em_start); - start = page_start; - end = start + PAGE_SIZE - 1; - first_index = index; - } - } - - if (end) - __do_contiguous_readpages(tree, &pages[first_index], - index - first_index, start, - end, em_cached, bio, - bio_flags, prev_em_start); -} - -static int __extent_read_full_page(struct extent_io_tree *tree, - struct page *page, - get_extent_t *get_extent, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, - unsigned int read_flags) -{ - struct inode *inode = page->mapping->host; - struct btrfs_ordered_extent *ordered; - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - int ret; - - while (1) { - lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - PAGE_SIZE); - if (!ordered) - break; - unlock_extent(tree, start, end); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } - - ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, - bio_flags, read_flags, NULL); - return ret; -} - -int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num) -{ - struct bio *bio = NULL; - unsigned long bio_flags = 0; - int ret; - - ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, - &bio_flags, 0); - if (bio) - ret = submit_one_bio(bio, mirror_num, bio_flags); - return ret; } static void update_nr_written(struct writeback_control *wbc, @@ -3239,36 +3378,28 @@ * This returns 0 if all went well (page 
still locked) * This returns < 0 if there were errors (page still locked) */ -static noinline_for_stack int writepage_delalloc(struct inode *inode, - struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, - u64 delalloc_start, - unsigned long *nr_written) +static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, + struct page *page, struct writeback_control *wbc, + u64 delalloc_start, unsigned long *nr_written) { - struct extent_io_tree *tree = epd->tree; u64 page_end = delalloc_start + PAGE_SIZE - 1; - u64 nr_delalloc; + bool found; u64 delalloc_to_write = 0; u64 delalloc_end = 0; int ret; int page_started = 0; - if (epd->extent_locked) - return 0; while (delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(inode, tree, - page, + found = find_lock_delalloc_range(&inode->vfs_inode, page, &delalloc_start, - &delalloc_end, - BTRFS_MAX_EXTENT_SIZE); - if (nr_delalloc == 0) { + &delalloc_end); + if (!found) { delalloc_start = delalloc_end + 1; continue; } ret = btrfs_run_delalloc_range(inode, page, delalloc_start, delalloc_end, &page_started, nr_written, wbc); - /* File system has been set read-only */ if (ret) { SetPageError(page); /* @@ -3277,8 +3408,7 @@ * started, so we don't want to return > 0 unless * things are going well. */ - ret = ret < 0 ? ret : -EIO; - goto done; + return ret < 0 ? ret : -EIO; } /* * delalloc_end is already one less than the total length, so @@ -3310,10 +3440,7 @@ return 1; } - ret = 0; - -done: - return ret; + return 0; } /* @@ -3324,15 +3451,15 @@ * 0 if all went well (page still locked) * < 0 if there were errors (page still locked) */ -static noinline_for_stack int __extent_writepage_io(struct inode *inode, +static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc, struct extent_page_data *epd, loff_t i_size, unsigned long nr_written, - unsigned int write_flags, int *nr_ret) + int *nr_ret) { - struct extent_io_tree *tree = epd->tree; + struct extent_io_tree *tree = &inode->io_tree; u64 start = page_offset(page); u64 page_end = start + PAGE_SIZE - 1; u64 end; @@ -3341,27 +3468,20 @@ u64 block_start; u64 iosize; struct extent_map *em; - struct block_device *bdev; size_t pg_offset = 0; size_t blocksize; int ret = 0; int nr = 0; + const unsigned int write_flags = wbc_to_write_flags(wbc); bool compressed; - if (tree->ops && tree->ops->writepage_start_hook) { - ret = tree->ops->writepage_start_hook(page, start, - page_end); - if (ret) { - /* Fixup worker will requeue */ - if (ret == -EBUSY) - wbc->pages_skipped++; - else - redirty_page_for_writepage(wbc, page); - - update_nr_written(wbc, nr_written); - unlock_page(page); - return 1; - } + ret = btrfs_writepage_cow_fixup(page, start, page_end); + if (ret) { + /* Fixup worker will requeue */ + redirty_page_for_writepage(wbc, page); + update_nr_written(wbc, nr_written); + unlock_page(page); + return 1; } /* @@ -3371,27 +3491,18 @@ update_nr_written(wbc, nr_written + 1); end = page_end; - if (i_size <= start) { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, - page_end, NULL, 1); - goto done; - } - - blocksize = inode->i_sb->s_blocksize; + blocksize = inode->vfs_inode.i_sb->s_blocksize; while (cur <= end) { u64 em_end; u64 offset; if (cur >= i_size) { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, cur, - page_end, NULL, 1); + btrfs_writepage_endio_finish_ordered(page, cur, + page_end, 1); 
break; } - em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, - end - cur + 1, 1); + em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); ret = PTR_ERR_OR_ZERO(em); @@ -3405,7 +3516,6 @@ iosize = min(em_end - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); offset = em->block_start + extent_offset; - bdev = em->bdev; block_start = em->block_start; compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); free_extent_map(em); @@ -3417,23 +3527,11 @@ */ if (compressed || block_start == EXTENT_MAP_HOLE || block_start == EXTENT_MAP_INLINE) { - /* - * end_io notification does not happen here for - * compressed extents - */ - if (!compressed && tree->ops && - tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, cur, - cur + iosize - 1, - NULL, 1); - else if (compressed) { - /* we don't want to end_page_writeback on - * a compressed extent. this happens - * elsewhere - */ + if (compressed) nr++; - } - + else + btrfs_writepage_endio_finish_ordered(page, cur, + cur + iosize - 1, 1); cur += iosize; pg_offset += iosize; continue; @@ -3441,14 +3539,14 @@ btrfs_set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { - btrfs_err(BTRFS_I(inode)->root->fs_info, + btrfs_err(inode->root->fs_info, "page %lu not writeback, cur %llu end %llu", page->index, cur, end); } - ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page, offset, iosize, pg_offset, - bdev, &epd->bio, + &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); if (ret) { @@ -3461,7 +3559,6 @@ pg_offset += iosize; nr++; } -done: *nr_ret = nr; return ret; } @@ -3483,13 +3580,10 @@ u64 page_end = start + PAGE_SIZE - 1; int ret; int nr = 0; - size_t pg_offset = 0; + size_t pg_offset; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - unsigned int write_flags = 0; unsigned long nr_written = 0; - - write_flags = wbc_to_write_flags(wbc); trace___extent_writepage(page, inode, wbc); @@ -3497,7 +3591,7 @@ ClearPageError(page); - pg_offset = i_size & (PAGE_SIZE - 1); + pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); @@ -3515,20 +3609,21 @@ flush_dcache_page(page); } - pg_offset = 0; - set_page_extent_mapped(page); - ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); - if (ret == 1) - goto done_unlocked; - if (ret) - goto done; + if (!epd->extent_locked) { + ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start, + &nr_written); + if (ret == 1) + return 0; + if (ret) + goto done; + } - ret = __extent_writepage_io(inode, page, wbc, epd, - i_size, nr_written, write_flags, &nr); + ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size, + nr_written, &nr); if (ret == 1) - goto done_unlocked; + return 0; done: if (nr == 0) { @@ -3543,9 +3638,6 @@ unlock_page(page); ASSERT(ret <= 0); return ret; - -done_unlocked: - return 0; } void wait_on_extent_buffer_writeback(struct extent_buffer *eb) @@ -3568,11 +3660,10 @@ * Return >0 is same as 0, except bio is not submitted * Return <0 if something went wrong, no page is locked */ -static noinline_for_stack int -lock_extent_buffer_for_io(struct extent_buffer *eb, - struct btrfs_fs_info *fs_info, +static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, struct extent_page_data *epd) { + struct btrfs_fs_info *fs_info = 
eb->fs_info; int i, num_pages, failed_page_nr; int flush = 0; int ret = 0; @@ -3672,10 +3763,25 @@ static void set_btree_ioerr(struct page *page) { struct extent_buffer *eb = (struct extent_buffer *)page->private; + struct btrfs_fs_info *fs_info; SetPageError(page); if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; + + /* + * A read may stumble upon this buffer later, make sure that it gets an + * error and knows there was an error. + */ + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + + /* + * If we error out, we should add back the dirty_metadata_bytes + * to make it consistent. + */ + fs_info = eb->fs_info; + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + eb->len, fs_info->dirty_metadata_batch); /* * If writeback for a btree extent that doesn't belong to a log tree @@ -3734,10 +3840,11 @@ { struct bio_vec *bvec; struct extent_buffer *eb; - int i, done; + int done; + struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; eb = (struct extent_buffer *)page->private; @@ -3762,12 +3869,9 @@ } static noinline_for_stack int write_one_eb(struct extent_buffer *eb, - struct btrfs_fs_info *fs_info, struct writeback_control *wbc, struct extent_page_data *epd) { - struct block_device *bdev = fs_info->fs_devices->latest_bdev; - struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; u64 offset = eb->start; u32 nritems; int i, num_pages; @@ -3791,7 +3895,7 @@ * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ start = btrfs_item_nr_offset(nritems); - end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb); + end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); memzero_extent_buffer(eb, start, end - start); } @@ -3800,8 +3904,8 @@ clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, - p, offset, PAGE_SIZE, 0, bdev, + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, + p, offset, PAGE_SIZE, 0, &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, false); @@ -3833,15 +3937,13 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; - struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; struct extent_buffer *eb, *prev_eb = NULL; struct extent_page_data epd = { .bio = NULL, - .tree = tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; + struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; int done = 0; int nr_to_write_done = 0; @@ -3850,12 +3952,17 @@ pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; + /* + * Start from the beginning does not need to cycle over the + * range, mark it as scanned. 
+ */ + scanned = (index == 0); } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; @@ -3873,7 +3980,6 @@ tag))) { unsigned i; - scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -3909,7 +4015,7 @@ continue; prev_eb = eb; - ret = lock_extent_buffer_for_io(eb, fs_info, &epd); + ret = lock_extent_buffer_for_io(eb, &epd); if (!ret) { free_extent_buffer(eb); continue; @@ -3919,7 +4025,7 @@ break; } - ret = write_one_eb(eb, fs_info, wbc, &epd); + ret = write_one_eb(eb, wbc, &epd); if (ret) { done = 1; free_extent_buffer(eb); @@ -3928,11 +4034,12 @@ free_extent_buffer(eb); /* - * the filesystem may choose to bump up nr_to_write. + * The filesystem may choose to bump up nr_to_write. * We have to make sure to honor the new nr_to_write - * at any time + * at any time. */ - nr_to_write_done = wbc->nr_to_write <= 0; + nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE && + wbc->nr_to_write <= 0); } pagevec_release(&pvec); cond_resched(); @@ -3981,7 +4088,7 @@ if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { ret = flush_write_bio(&epd); } else { - ret = -EUCLEAN; + ret = -EROFS; end_write_bio(&epd, ret); } return ret; @@ -4016,7 +4123,7 @@ pgoff_t done_index; int range_whole = 0; int scanned = 0; - int tag; + xa_mark_t tag; /* * We have to hold onto the inode so that ordered extents can do their @@ -4034,6 +4141,11 @@ if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; + /* + * Start from the beginning does not need to cycle over the + * range, mark it as scanned. + */ + scanned = (index == 0); } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; @@ -4067,7 +4179,6 @@ &index, end, tag))) { unsigned i; - scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -4105,11 +4216,6 @@ } ret = __extent_writepage(page, wbc, epd); - - if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { - unlock_page(page); - ret = 0; - } if (ret < 0) { done = 1; break; @@ -4156,7 +4262,6 @@ int ret; struct extent_page_data epd = { .bio = NULL, - .tree = &BTRFS_I(page->mapping->host)->io_tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4177,16 +4282,13 @@ int mode) { int ret = 0; - int flush_ret; struct address_space *mapping = inode->i_mapping; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *page; unsigned long nr_pages = (end - start + PAGE_SIZE) >> PAGE_SHIFT; struct extent_page_data epd = { .bio = NULL, - .tree = tree, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, }; @@ -4195,25 +4297,32 @@ .nr_to_write = nr_pages * 2, .range_start = start, .range_end = end + 1, + /* We're called from an async helper function */ + .punt_to_cgroup = 1, + .no_cgroup_owner = 1, }; + wbc_attach_fdatawrite_inode(&wbc_writepages, inode); while (start <= end) { page = find_get_page(mapping, start >> PAGE_SHIFT); if (clear_page_dirty_for_io(page)) ret = __extent_writepage(page, &wbc_writepages, &epd); else { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, - start + PAGE_SIZE - 1, - NULL, 1); + btrfs_writepage_endio_finish_ordered(page, start, + start + PAGE_SIZE - 1, 1); unlock_page(page); } put_page(page); start += PAGE_SIZE; } - flush_ret = flush_write_bio(&epd); - BUG_ON(flush_ret < 0); + ASSERT(ret <= 0); + if (ret == 0) + ret = flush_write_bio(&epd); + else + end_write_bio(&epd, ret); + + wbc_detach_inode(&wbc_writepages); return ret; } @@ -4221,63 +4330,48 @@ 
struct writeback_control *wbc) { int ret = 0; - int flush_ret; struct extent_page_data epd = { .bio = NULL, - .tree = &BTRFS_I(mapping->host)->io_tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; ret = extent_write_cache_pages(mapping, wbc, &epd); - flush_ret = flush_write_bio(&epd); - BUG_ON(flush_ret < 0); + ASSERT(ret <= 0); + if (ret < 0) { + end_write_bio(&epd, ret); + return ret; + } + ret = flush_write_bio(&epd); return ret; } -int extent_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages) +void extent_readahead(struct readahead_control *rac) { struct bio *bio = NULL; - unsigned page_idx; unsigned long bio_flags = 0; struct page *pagepool[16]; - struct page *page; struct extent_map *em_cached = NULL; - struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; - int nr = 0; u64 prev_em_start = (u64)-1; + int nr; - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - page = list_entry(pages->prev, struct page, lru); + while ((nr = readahead_page_batch(rac, pagepool))) { + u64 contig_start = page_offset(pagepool[0]); + u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1; - prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, - page->index, - readahead_gfp_mask(mapping))) { - put_page(page); - continue; - } + ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); - pagepool[nr++] = page; - if (nr < ARRAY_SIZE(pagepool)) - continue; - __extent_readpages(tree, pagepool, nr, &em_cached, &bio, - &bio_flags, &prev_em_start); - nr = 0; + contiguous_readpages(pagepool, nr, contig_start, contig_end, + &em_cached, &bio, &bio_flags, &prev_em_start); } - if (nr) - __extent_readpages(tree, pagepool, nr, &em_cached, &bio, - &bio_flags, &prev_em_start); if (em_cached) free_extent_map(em_cached); - BUG_ON(!list_empty(pages)); - if (bio) - return submit_one_bio(bio, 0, bio_flags); - return 0; + if (bio) { + if (submit_one_bio(bio, 0, bio_flags)) + return; + } } /* @@ -4299,10 +4393,8 @@ lock_extent_bits(tree, start, end, &cached_state); wait_on_page_writeback(page); - clear_extent_bit(tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, - 1, 1, &cached_state); + clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 1, 1, &cached_state); return 0; } @@ -4318,10 +4410,9 @@ u64 end = start + PAGE_SIZE - 1; int ret = 1; - if (test_range_bit(tree, start, end, - EXTENT_IOBITS, 0, NULL)) + if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { ret = 0; - else { + } else { /* * at this point we can safely clear everything except the * locked bit and the nodatasum bit @@ -4359,6 +4450,9 @@ page->mapping->host->i_size > SZ_16M) { u64 len; while (start <= end) { + struct btrfs_fs_info *fs_info; + u64 cur_gen; + len = end - start + 1; write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); @@ -4372,16 +4466,45 @@ free_extent_map(em); break; } - if (!test_range_bit(tree, em->start, - extent_map_end(em) - 1, - EXTENT_LOCKED | EXTENT_WRITEBACK, - 0, NULL)) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &btrfs_inode->runtime_flags); - remove_extent_mapping(map, em); - /* once for the rb tree */ - free_extent_map(em); - } + if (test_range_bit(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED, 0, NULL)) + goto next; + /* + * If it's not in the list of modified extents, used + * by a fast fsync, we can remove it. 
If it's being + * logged we can safely remove it since fsync took an + * extra reference on the em. + */ + if (list_empty(&em->list) || + test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + goto remove_em; + /* + * If it's in the list of modified extents, remove it + * only if its generation is older then the current one, + * in which case we don't need it for a fast fsync. + * Otherwise don't remove it, we could be racing with an + * ongoing fast fsync that could miss the new extent. + */ + fs_info = btrfs_inode->root->fs_info; + spin_lock(&fs_info->trans_lock); + cur_gen = fs_info->generation; + spin_unlock(&fs_info->trans_lock); + if (em->generation >= cur_gen) + goto next; +remove_em: + /* + * We only remove extent maps that are not in the list of + * modified extents or that are in the list but with a + * generation lower then the current generation, so there + * is no need to set the full fsync flag on the inode (it + * hurts the fsync performance for workloads with a data + * size that exceeds or is close to the system's memory). + */ + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); +next: start = extent_map_end(em); write_unlock(&map->lock); @@ -4398,7 +4521,7 @@ * helper function for fiemap, which doesn't want to see any holes. * This maps until we find something past 'last' */ -static struct extent_map *get_extent_skip_holes(struct inode *inode, +static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, u64 offset, u64 last) { u64 sectorsize = btrfs_inode_sectorsize(inode); @@ -4413,8 +4536,7 @@ if (len == 0) break; len = ALIGN(len, sectorsize); - em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset, - len, 0); + em = btrfs_get_extent_fiemap(inode, offset, len); if (IS_ERR_OR_NULL(em)) return em; @@ -4465,7 +4587,7 @@ /* * Sanity check, extent_fiemap() should have ensured that new - * fiemap extent won't overlap with cahced one. + * fiemap extent won't overlap with cached one. * Not recoverable. * * NOTE: Physical address can overlap, due to compression @@ -4527,8 +4649,7 @@ * In this case, the first extent range will be cached but not emitted. * So we must emit it before ending extent_fiemap(). 
*/ -static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info, - struct fiemap_extent_info *fieinfo, +static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache) { int ret; @@ -4544,24 +4665,26 @@ return ret; } -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) { int ret = 0; - u64 off = start; + u64 off; u64 max = start + len; u32 flags = 0; u32 found_type; u64 last; u64 last_for_get_extent = 0; u64 disko = 0; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct fiemap_cache cache = { 0 }; + struct ulist *roots; + struct ulist *tmp_ulist; int end = 0; u64 em_start = 0; u64 em_len = 0; @@ -4575,6 +4698,18 @@ return -ENOMEM; path->leave_spinning = 1; + roots = ulist_alloc(GFP_KERNEL); + tmp_ulist = ulist_alloc(GFP_KERNEL); + if (!roots || !tmp_ulist) { + ret = -ENOMEM; + goto out_free_ulist; + } + + /* + * We can't initialize that to 'start' as this could miss extents due + * to extent item merging + */ + off = 0; start = round_down(start, btrfs_inode_sectorsize(inode)); len = round_up(max, btrfs_inode_sectorsize(inode)) - start; @@ -4582,11 +4717,10 @@ * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size */ - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), -1, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, + 0); if (ret < 0) { - btrfs_free_path(path); - return ret; + goto out_free_ulist; } else { WARN_ON(!ret); if (ret == 1) @@ -4598,7 +4732,7 @@ found_type = found_key.type; /* No extents, but there might be delalloc bits */ - if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (found_key.objectid != btrfs_ino(inode) || found_type != BTRFS_EXTENT_DATA_KEY) { /* have to trust i_size as the end */ last = (u64)-1; @@ -4624,7 +4758,7 @@ last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, + lock_extent_bits(&inode->io_tree, start, start + len - 1, &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent); @@ -4693,9 +4827,8 @@ * then we're just getting a count and we can skip the * lookup stuff. 
*/ - ret = btrfs_check_shared(root, - btrfs_ino(BTRFS_I(inode)), - bytenr); + ret = btrfs_check_shared(root, btrfs_ino(inode), + bytenr, roots, tmp_ulist); if (ret < 0) goto out_free; if (ret) @@ -4735,22 +4868,25 @@ } out_free: if (!ret) - ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache); + ret = emit_last_fiemap_cache(fieinfo, &cache); free_extent_map(em); out: - btrfs_free_path(path); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, + unlock_extent_cached(&inode->io_tree, start, start + len - 1, &cached_state); + +out_free_ulist: + btrfs_free_path(path); + ulist_free(roots); + ulist_free(tmp_ulist); return ret; } static void __free_extent_buffer(struct extent_buffer *eb) { - btrfs_leak_debug_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } -int extent_buffer_under_io(struct extent_buffer *eb) +int extent_buffer_under_io(const struct extent_buffer *eb) { return (atomic_read(&eb->io_pages) || test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || @@ -4792,10 +4928,7 @@ * We need to make sure we haven't be attached * to a new eb. */ - ClearPagePrivate(page); - set_page_private(page, 0); - /* One for the page private */ - put_page(page); + detach_page_private(page); } if (mapped) @@ -4812,6 +4945,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { btrfs_release_extent_buffer_pages(eb); + btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); __free_extent_buffer(eb); } @@ -4827,17 +4961,14 @@ eb->fs_info = fs_info; eb->bflags = 0; rwlock_init(&eb->lock); - atomic_set(&eb->write_locks, 0); - atomic_set(&eb->read_locks, 0); atomic_set(&eb->blocking_readers, 0); - atomic_set(&eb->blocking_writers, 0); - atomic_set(&eb->spinning_readers, 0); - atomic_set(&eb->spinning_writers, 0); - eb->lock_nested = 0; + eb->blocking_writers = 0; + eb->lock_recursed = false; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); - btrfs_leak_debug_add(&eb->leak_list, &buffers); + btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, + &fs_info->allocated_ebs); spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); @@ -4850,10 +4981,17 @@ > MAX_INLINE_EXTENT_BUFFER_SIZE); BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); +#ifdef CONFIG_BTRFS_DEBUG + eb->spinning_writers = 0; + atomic_set(&eb->spinning_readers, 0); + atomic_set(&eb->read_locks, 0); + eb->write_locks = 0; +#endif + return eb; } -struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) +struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { int i; struct page *p; @@ -5042,13 +5180,6 @@ check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); - /* - * We will free dummy extent buffer's if they come into - * free_extent_buffer with a ref count of 2, but if we are using this we - * want the buffers to stay in memory until we're done with them, so - * bump the ref count again. 
- */ - atomic_inc(&eb->refs); return eb; free_eb: btrfs_release_extent_buffer(eb); @@ -5187,6 +5318,7 @@ } static int release_extent_buffer(struct extent_buffer *eb) + __releases(&eb->refs_lock) { lockdep_assert_held(&eb->refs_lock); @@ -5205,6 +5337,7 @@ spin_unlock(&eb->refs_lock); } + btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_pages(eb); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -5230,7 +5363,9 @@ while (1) { refs = atomic_read(&eb->refs); - if (refs <= 3) + if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) + || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && + refs == 1)) break; old = atomic_cmpxchg(&eb->refs, refs, refs - 1); if (old == refs) @@ -5238,10 +5373,6 @@ } spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) == 2 && - test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) - atomic_dec(&eb->refs); - if (atomic_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && !extent_buffer_under_io(eb) && @@ -5269,7 +5400,7 @@ release_extent_buffer(eb); } -void clear_extent_buffer_dirty(struct extent_buffer *eb) +void clear_extent_buffer_dirty(const struct extent_buffer *eb) { int i; int num_pages; @@ -5287,11 +5418,9 @@ clear_page_dirty_for_io(page); xa_lock_irq(&page->mapping->i_pages); - if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->i_pages, - page_index(page), - PAGECACHE_TAG_DIRTY); - } + if (!PageDirty(page)) + __xa_clear_mark(&page->mapping->i_pages, + page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irq(&page->mapping->i_pages); ClearPageError(page); unlock_page(page); @@ -5299,11 +5428,11 @@ WARN_ON(atomic_read(&eb->refs) == 0); } -int set_extent_buffer_dirty(struct extent_buffer *eb) +bool set_extent_buffer_dirty(struct extent_buffer *eb) { int i; int num_pages; - int was_dirty = 0; + bool was_dirty; check_buffer_tree_ref(eb); @@ -5313,8 +5442,15 @@ WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); + if (!was_dirty) + for (i = 0; i < num_pages; i++) + set_page_dirty(eb->pages[i]); + +#ifdef CONFIG_BTRFS_DEBUG for (i = 0; i < num_pages; i++) - set_page_dirty(eb->pages[i]); + ASSERT(PageDirty(eb->pages[i])); +#endif + return was_dirty; } @@ -5347,8 +5483,7 @@ } } -int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, int wait, int mirror_num) +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) { int i; struct page *page; @@ -5412,20 +5547,19 @@ } ClearPageError(page); - err = __extent_read_full_page(tree, page, - btree_get_extent, &bio, - mirror_num, &bio_flags, - REQ_META); + err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, + page, page_offset(page), PAGE_SIZE, 0, + &bio, end_bio_extent_readpage, + mirror_num, 0, 0, false); if (err) { - ret = err; /* - * We use &bio in above __extent_read_full_page, - * so we ensure that if it returns error, the - * current page fails to add itself to bio and - * it's been unlocked. - * - * We must dec io_pages by ourselves. + * We failed to submit the bio so it's the + * caller's responsibility to perform cleanup + * i.e unlock page/set error bit. 
*/ + ret = err; + SetPageError(page); + unlock_page(page); atomic_dec(&eb->io_pages); } } else { @@ -5460,6 +5594,36 @@ return ret; } +static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, + unsigned long len) +{ + btrfs_warn(eb->fs_info, + "access to eb bytenr %llu len %lu out of range start %lu len %lu", + eb->start, eb->len, start, len); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + + return true; +} + +/* + * Check if the [start, start + len) range is valid before reading/writing + * the eb. + * NOTE: @start and @len are offset inside the eb, not logical address. + * + * Caller should not touch the dst/src memory if this function returns error. + */ +static inline int check_eb_range(const struct extent_buffer *eb, + unsigned long start, unsigned long len) +{ + unsigned long offset; + + /* start, start + len should not go beyond eb->len nor overflow */ + if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) + return report_eb_range(eb, start, len); + + return false; +} + void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { @@ -5468,17 +5632,18 @@ struct page *page; char *kaddr; char *dst = (char *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_SHIFT; + unsigned long i = start >> PAGE_SHIFT; - if (start + len > eb->len) { - WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", - eb->start, eb->len, start, len); - memset(dst, 0, len); + if (check_eb_range(eb, start, len)) { + /* + * Invalid range hit, reset the memory, so callers won't get + * some random garbage for their uninitialzed memory. + */ + memset(dstv, 0, len); return; } - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start); while (len > 0) { page = eb->pages[i]; @@ -5503,21 +5668,20 @@ struct page *page; char *kaddr; char __user *dst = (char __user *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_SHIFT; + unsigned long i = start >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start); while (len > 0) { page = eb->pages[i]; cur = min(len, (PAGE_SIZE - offset)); kaddr = page_address(page); - if (probe_user_write(dst, kaddr + offset, cur)) { + if (copy_to_user_nofault(dst, kaddr + offset, cur)) { ret = -EFAULT; break; } @@ -5531,48 +5695,6 @@ return ret; } -/* - * return 0 if the item is found within a page. - * return 1 if the item spans two pages. - * return -EINVAL otherwise. 
- */ -int map_private_extent_buffer(const struct extent_buffer *eb, - unsigned long start, unsigned long min_len, - char **map, unsigned long *map_start, - unsigned long *map_len) -{ - size_t offset = start & (PAGE_SIZE - 1); - char *kaddr; - struct page *p; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_SHIFT; - unsigned long end_i = (start_offset + start + min_len - 1) >> - PAGE_SHIFT; - - if (start + min_len > eb->len) { - WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", - eb->start, eb->len, start, min_len); - return -EINVAL; - } - - if (i != end_i) - return 1; - - if (i == 0) { - offset = start_offset; - *map_start = 0; - } else { - offset = 0; - *map_start = ((u64)i << PAGE_SHIFT) - start_offset; - } - - p = eb->pages[i]; - kaddr = page_address(p); - *map = kaddr + offset; - *map_len = PAGE_SIZE - offset; - return 0; -} - int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) { @@ -5581,14 +5703,13 @@ struct page *page; char *kaddr; char *ptr = (char *)ptrv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_SHIFT; + unsigned long i = start >> PAGE_SHIFT; int ret = 0; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return -EINVAL; - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start); while (len > 0) { page = eb->pages[i]; @@ -5608,7 +5729,7 @@ return ret; } -void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, +void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, const void *srcv) { char *kaddr; @@ -5619,7 +5740,7 @@ BTRFS_FSID_SIZE); } -void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv) +void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) { char *kaddr; @@ -5629,7 +5750,7 @@ BTRFS_FSID_SIZE); } -void write_extent_buffer(struct extent_buffer *eb, const void *srcv, +void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len) { size_t cur; @@ -5637,13 +5758,12 @@ struct page *page; char *kaddr; char *src = (char *)srcv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_SHIFT; + unsigned long i = start >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return; - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start); while (len > 0) { page = eb->pages[i]; @@ -5660,20 +5780,19 @@ } } -void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, +void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, unsigned long len) { size_t cur; size_t offset; struct page *page; char *kaddr; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_SHIFT; + unsigned long i = start >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return; - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start); while (len > 0) { page = eb->pages[i]; @@ -5689,8 +5808,8 @@ } } -void copy_extent_buffer_full(struct extent_buffer *dst, - struct extent_buffer *src) +void copy_extent_buffer_full(const struct extent_buffer *dst, + 
const struct extent_buffer *src) { int i; int num_pages; @@ -5703,7 +5822,8 @@ page_address(src->pages[i])); } -void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, +void copy_extent_buffer(const struct extent_buffer *dst, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { @@ -5712,13 +5832,15 @@ size_t offset; struct page *page; char *kaddr; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); - unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; + unsigned long i = dst_offset >> PAGE_SHIFT; + + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(src, src_offset, len)) + return; WARN_ON(src->len != dst_len); - offset = (start_offset + dst_offset) & - (PAGE_SIZE - 1); + offset = offset_in_page(dst_offset); while (len > 0) { page = dst->pages[i]; @@ -5749,12 +5871,11 @@ * This helper hides the ugliness of finding the byte in an extent buffer which * contains a given bit. */ -static inline void eb_bitmap_offset(struct extent_buffer *eb, +static inline void eb_bitmap_offset(const struct extent_buffer *eb, unsigned long start, unsigned long nr, unsigned long *page_index, size_t *page_offset) { - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); size_t byte_offset = BIT_BYTE(nr); size_t offset; @@ -5763,10 +5884,10 @@ * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. */ - offset = start_offset + start + byte_offset; + offset = start + byte_offset; *page_index = offset >> PAGE_SHIFT; - *page_offset = offset & (PAGE_SIZE - 1); + *page_offset = offset_in_page(offset); } /** @@ -5775,7 +5896,7 @@ * @start: offset of the bitmap item in the extent buffer * @nr: bit number to test */ -int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, +int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, unsigned long nr) { u8 *kaddr; @@ -5797,7 +5918,7 @@ * @pos: bit number of the first bit * @len: number of bits to set */ -void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, +void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { u8 *kaddr; @@ -5839,8 +5960,9 @@ * @pos: bit number of the first bit * @len: number of bits to clear */ -void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, - unsigned long pos, unsigned long len) +void extent_buffer_bitmap_clear(const struct extent_buffer *eb, + unsigned long start, unsigned long pos, + unsigned long len) { u8 *kaddr; struct page *page; @@ -5901,38 +6023,26 @@ memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); } -void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len) +void memcpy_extent_buffer(const struct extent_buffer *dst, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) { - struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); unsigned long dst_i; unsigned long src_i; - if (src_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus src_offset %lu move len %lu dst len %lu", - src_offset, len, dst->len); - BUG_ON(1); - } - if (dst_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus dst_offset %lu move len %lu dst len %lu", - dst_offset, len, dst->len); - BUG_ON(1); - } + if (check_eb_range(dst, dst_offset, len) || + 
check_eb_range(dst, src_offset, len)) + return; while (len > 0) { - dst_off_in_page = (start_offset + dst_offset) & - (PAGE_SIZE - 1); - src_off_in_page = (start_offset + src_offset) & - (PAGE_SIZE - 1); + dst_off_in_page = offset_in_page(dst_offset); + src_off_in_page = offset_in_page(src_offset); - dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; - src_i = (start_offset + src_offset) >> PAGE_SHIFT; + dst_i = dst_offset >> PAGE_SHIFT; + src_i = src_offset >> PAGE_SHIFT; cur = min(len, (unsigned long)(PAGE_SIZE - src_off_in_page)); @@ -5948,43 +6058,31 @@ } } -void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len) +void memmove_extent_buffer(const struct extent_buffer *dst, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) { - struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; unsigned long dst_end = dst_offset + len - 1; unsigned long src_end = src_offset + len - 1; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); unsigned long dst_i; unsigned long src_i; - if (src_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus src_offset %lu move len %lu len %lu", - src_offset, len, dst->len); - BUG_ON(1); - } - if (dst_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus dst_offset %lu move len %lu len %lu", - dst_offset, len, dst->len); - BUG_ON(1); - } + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } while (len > 0) { - dst_i = (start_offset + dst_end) >> PAGE_SHIFT; - src_i = (start_offset + src_end) >> PAGE_SHIFT; + dst_i = dst_end >> PAGE_SHIFT; + src_i = src_end >> PAGE_SHIFT; - dst_off_in_page = (start_offset + dst_end) & - (PAGE_SIZE - 1); - src_off_in_page = (start_offset + src_end) & - (PAGE_SIZE - 1); + dst_off_in_page = offset_in_page(dst_end); + src_off_in_page = offset_in_page(src_end); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); -- Gitblit v1.6.2
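
Note on the bounds-check pattern introduced above: the hunks replace the old WARN-and-continue handling in the extent buffer read/write/copy helpers with a single check_eb_range() guard built on check_add_overflow(), so a start/len pair that wraps around can no longer slip past a plain "start + len > eb->len" comparison. The following is a minimal userspace sketch of that guard, not the kernel code itself; BUF_LEN and range_out_of_bounds() are hypothetical stand-ins for eb->len and check_eb_range(), and the GCC/Clang builtin is used in place of the kernel's check_add_overflow() wrapper.

/*
 * Illustrative sketch only: reject [start, start + len) if the sum
 * overflows or runs past the buffer, before any copy touches memory.
 */
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical buffer length standing in for eb->len. */
#define BUF_LEN 4096UL

/* Return true (error) when start + len overflows or exceeds BUF_LEN. */
static bool range_out_of_bounds(unsigned long start, unsigned long len)
{
	unsigned long end;

	if (__builtin_add_overflow(start, len, &end) || end > BUF_LEN)
		return true;
	return false;
}

int main(void)
{
	/* In bounds: the copy helpers would proceed. */
	printf("%d\n", range_out_of_bounds(4000, 96));   /* prints 0 */
	/* Past the end: the helpers now bail out (or zero the dst). */
	printf("%d\n", range_out_of_bounds(4000, 200));  /* prints 1 */
	/* Wrapping sum: a naive "start + len > BUF_LEN" check would pass. */
	printf("%d\n", range_out_of_bounds(~0UL, 2));    /* prints 1 */
	return 0;
}

The design point is that the overflow test and the length test happen in one place, so callers such as read_extent_buffer(), write_extent_buffer() and the memcpy/memmove helpers only need to check a single boolean and return early, rather than each open-coding its own WARN plus BUG_ON.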