From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/fs/btrfs/tree-log.c | 1765 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 1083 insertions(+), 682 deletions(-) diff --git a/kernel/fs/btrfs/tree-log.c b/kernel/fs/btrfs/tree-log.c index 1f5facd..372ebbc 100644 --- a/kernel/fs/btrfs/tree-log.c +++ b/kernel/fs/btrfs/tree-log.c @@ -8,6 +8,7 @@ #include <linux/blkdev.h> #include <linux/list_sort.h> #include <linux/iversion.h> +#include "misc.h" #include "ctree.h" #include "tree-log.h" #include "disk-io.h" @@ -17,6 +18,8 @@ #include "compression.h" #include "qgroup.h" #include "inode-map.h" +#include "block-group.h" +#include "space-info.h" /* magic values for the inode_only field in btrfs_log_inode: * * LOG_INODE_ALL means to log everything * LOG_INODE_EXISTS means to log just enough to recreate the inode * during log replay */ -#define LOG_INODE_ALL 0 -#define LOG_INODE_EXISTS 1 -#define LOG_OTHER_INODE 2 +enum { + LOG_INODE_ALL, + LOG_INODE_EXISTS, + LOG_OTHER_INODE, + LOG_OTHER_INODE_ALL, +}; /* * directory trouble cases @@ -80,16 +86,16 @@ * The last stage is to deal with directories and links and extents * and all the other fun semantics */ -#define LOG_WALK_PIN_ONLY 0 -#define LOG_WALK_REPLAY_INODES 1 -#define LOG_WALK_REPLAY_DIR_INDEX 2 -#define LOG_WALK_REPLAY_ALL 3 +enum { + LOG_WALK_PIN_ONLY, + LOG_WALK_REPLAY_INODES, + LOG_WALK_REPLAY_DIR_INDEX, + LOG_WALK_REPLAY_ALL, +}; static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -138,7 +144,7 @@ mutex_lock(&root->log_mutex); if (root->log_root) { - if (btrfs_need_log_full_commit(fs_info, trans)) { + if (btrfs_need_log_full_commit(trans)) { ret = -EAGAIN; goto out; } @@ -161,13 +167,14 @@ if (ret) goto out; + set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); root->log_start_pid = current->pid; } atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); - if (ctx) { + if (ctx && !ctx->logging_new_name) { int index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); ctx->log_transid = root->log_transid; @@ -187,9 +194,8 @@ { int ret = -ENOENT; - smp_mb(); - if (!root->log_root) - return -ENOENT; + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) + return ret; mutex_lock(&root->log_mutex); if (root->log_root) { @@ -205,14 +211,9 @@ * until you call btrfs_end_log_trans() or it makes any future * log transactions wait until you call btrfs_end_log_trans() */ -int btrfs_pin_log_trans(struct btrfs_root *root) +void btrfs_pin_log_trans(struct btrfs_root *root) { - int ret = -ENOENT; - - mutex_lock(&root->log_mutex); atomic_inc(&root->log_writers); - mutex_unlock(&root->log_mutex); - return ret; } /* @@ -227,6 +228,17 @@ } } +static int btrfs_write_tree_block(struct extent_buffer *buf) +{ + return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, + buf->start + buf->len - 1); +} + +static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +{ + filemap_fdatawait_range(buf->pages[0]->mapping, + buf->start, buf->start + buf->len - 1); +} /* * the walk control struct is used to pass state down the chain when @@ -301,12 +313,12 @@ } if (wc->pin) - ret =
btrfs_pin_extent_for_log_replay(fs_info, eb->start, + ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, eb->len); if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { if (wc->pin && btrfs_header_level(eb) == 0) - ret = btrfs_exclude_logged_extents(fs_info, eb); + ret = btrfs_exclude_logged_extents(eb); if (wc->write) btrfs_write_tree_block(eb); if (wc->wait) @@ -335,7 +347,6 @@ struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; u32 item_size; u64 saved_i_size = 0; @@ -456,10 +467,9 @@ found_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); if (found_size > item_size) - btrfs_truncate_item(fs_info, path, item_size, 1); + btrfs_truncate_item(path, item_size, 1); else if (found_size < item_size) - btrfs_extend_item(fs_info, path, - item_size - found_size); + btrfs_extend_item(path, item_size - found_size); } else if (ret) { return ret; } @@ -495,13 +505,8 @@ */ if (S_ISREG(btrfs_inode_mode(eb, src_item)) && S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && - ino_size != 0) { - struct btrfs_map_token token; - - btrfs_init_map_token(&token); - btrfs_set_token_inode_size(dst_eb, dst_item, - ino_size, &token); - } + ino_size != 0) + btrfs_set_inode_size(dst_eb, dst_item, ino_size); goto no_copy; } @@ -545,13 +550,9 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root, u64 objectid) { - struct btrfs_key key; struct inode *inode; - key.objectid = objectid; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + inode = btrfs_iget(root->fs_info->sb, objectid, root); if (IS_ERR(inode)) inode = NULL; return inode; @@ -696,20 +697,27 @@ goto out; if (ins.objectid > 0) { + struct btrfs_ref ref = { 0 }; u64 csum_start; u64 csum_end; LIST_HEAD(ordered_sums); + /* * is this extent already allocated in the extent * allocation tree? 
If so, just add a reference */ ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); - if (ret == 0) { - ret = btrfs_inc_extent_ref(trans, root, - ins.objectid, ins.offset, - 0, root->root_key.objectid, + if (ret < 0) { + goto out; + } else if (ret == 0) { + btrfs_init_generic_ref(&ref, + BTRFS_ADD_DELAYED_REF, + ins.objectid, ins.offset, 0); + btrfs_init_data_ref(&ref, + root->root_key.objectid, key->objectid, offset); + ret = btrfs_inc_extent_ref(trans, &ref); if (ret) goto out; } else { @@ -816,6 +824,11 @@ if (ret) goto out; } + + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, + extent_end - start); + if (ret) + goto out; inode_add_bytes(inode, nbytes); update_inode: @@ -941,54 +954,32 @@ const char *name, int namelen) { struct btrfs_path *path; - struct btrfs_inode_ref *ref; - unsigned long ptr; - unsigned long ptr_end; - unsigned long name_ptr; - int found_name_len; - int item_size; int ret; - int match = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(NULL, log, key, path, 0, 0); - if (ret != 0) + if (ret < 0) { goto out; + } else if (ret == 1) { + ret = 0; + goto out; + } - ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - - if (key->type == BTRFS_INODE_EXTREF_KEY) { - if (btrfs_find_name_in_ext_backref(path->nodes[0], + if (key->type == BTRFS_INODE_EXTREF_KEY) + ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], + ref_objectid, + name, namelen); + else + ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - ref_objectid, - name, namelen, NULL)) - match = 1; - - goto out; - } - - item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - ptr_end = ptr + item_size; - while (ptr < ptr_end) { - ref = (struct btrfs_inode_ref *)ptr; - found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); - if (found_name_len == namelen) { - name_ptr = (unsigned long)(ref + 1); - ret = memcmp_extent_buffer(path->nodes[0], name, - name_ptr, namelen); - if (ret == 0) { - match = 1; - goto out; - } - } - ptr = (unsigned long)(ref + 1) + found_name_len; - } + name, namelen); out: btrfs_free_path(path); - return match; + return ret; } static inline int __add_inode_ref(struct btrfs_trans_handle *trans, @@ -1046,10 +1037,13 @@ (unsigned long)(victim_ref + 1), victim_name_len); - if (!backref_in_log(log_root, &search_key, - parent_objectid, - victim_name, - victim_name_len)) { + ret = backref_in_log(log_root, &search_key, + parent_objectid, victim_name, + victim_name_len); + if (ret < 0) { + kfree(victim_name); + return ret; + } else if (!ret) { inc_nlink(&inode->vfs_inode); btrfs_release_path(path); @@ -1081,7 +1075,9 @@ extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, inode_objectid, parent_objectid, 0, 0); - if (!IS_ERR_OR_NULL(extref)) { + if (IS_ERR(extref)) { + return PTR_ERR(extref); + } else if (extref) { u32 item_size; u32 cur_offset = 0; unsigned long base; @@ -1111,10 +1107,13 @@ search_key.offset = btrfs_extref_hash(parent_objectid, victim_name, victim_name_len); - ret = 0; - if (!backref_in_log(log_root, &search_key, - parent_objectid, victim_name, - victim_name_len)) { + ret = backref_in_log(log_root, &search_key, + parent_objectid, victim_name, + victim_name_len); + if (ret < 0) { + kfree(victim_name); + return ret; + } else if (!ret) { ret = -ENOENT; victim_parent = read_one_inode(root, parent_objectid); @@ -1159,7 +1158,7 @@ } btrfs_release_path(path); - /* look for a conflicing name */ + /* look for a conflicting name */ di =
btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, namelen, 0); if (IS_ERR(di)) { @@ -1268,12 +1267,12 @@ goto out; if (key->type == BTRFS_INODE_EXTREF_KEY) - ret = btrfs_find_name_in_ext_backref(log_eb, log_slot, - parent_id, name, - namelen, NULL); + ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, + parent_id, name, + namelen); else - ret = btrfs_find_name_in_backref(log_eb, log_slot, name, - namelen, NULL); + ret = !!btrfs_find_name_in_backref(log_eb, log_slot, + name, namelen); if (!ret) { struct inode *dir; @@ -1289,6 +1288,15 @@ inode, name, namelen); kfree(name); iput(dir); + /* + * Whenever we need to check if a name exists or not, we + * check the subvolume tree. So after an unlink we must + * run delayed items, so that future checks for a name + * during log replay see that the name does not exist + * anymore. + */ + if (!ret) + ret = btrfs_run_delayed_items(trans); if (ret) goto out; goto again; @@ -1335,15 +1343,75 @@ goto out; } if (key.type == BTRFS_INODE_EXTREF_KEY) - ret = btrfs_find_name_in_ext_backref(path->nodes[0], - path->slots[0], parent_id, - name, namelen, NULL); + ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], parent_id, name, namelen); else - ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, namelen, NULL); + ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, namelen); out: btrfs_free_path(path); + return ret; +} + +static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct inode *dir, struct inode *inode, const char *name, + int namelen, u64 ref_index) +{ + struct btrfs_dir_item *dir_item; + struct btrfs_key key; + struct btrfs_path *path; + struct inode *other_inode = NULL; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + dir_item = btrfs_lookup_dir_item(NULL, root, path, + btrfs_ino(BTRFS_I(dir)), + name, namelen, 0); + if (!dir_item) { + btrfs_release_path(path); + goto add_link; + } else if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + goto out; + } + + /* + * Our inode's dentry collides with the dentry of another inode which is + * in the log but not yet processed since it has a higher inode number. + * So delete that other dentry. + */ + btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key); + btrfs_release_path(path); + other_inode = read_one_inode(root, key.objectid); + if (!other_inode) { + ret = -ENOENT; + goto out; + } + ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode), + name, namelen); + if (ret) + goto out; + /* + * If we dropped the link count to 0, bump it so that later the iput() + * on the inode will not free it. We will fixup the link count later. + */ + if (other_inode->i_nlink == 0) + inc_nlink(other_inode); + + ret = btrfs_run_delayed_items(trans); + if (ret) + goto out; +add_link: + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + name, namelen, 0, ref_index); +out: + iput(other_inode); + btrfs_free_path(path); + return ret; } @@ -1480,14 +1548,22 @@ */ if (!ret && inode->i_nlink == 0) inc_nlink(inode); + /* + * Whenever we need to check if a name exists or + * not, we check the subvolume tree. So after an + * unlink we must run delayed items, so that future + * checks for a name during log replay see that the + * name does not exist anymore.
+ */ + if (!ret) + ret = btrfs_run_delayed_items(trans); } if (ret < 0) goto out; /* insert our name */ - ret = btrfs_add_link(trans, BTRFS_I(dir), - BTRFS_I(inode), - name, namelen, 0, ref_index); + ret = add_link(trans, root, dir, inode, name, namelen, + ref_index); if (ret) goto out; @@ -1829,30 +1905,6 @@ } /* - * Return true if an inode reference exists in the log for the given name, - * inode and parent inode. - */ -static bool name_in_log_ref(struct btrfs_root *log_root, - const char *name, const int name_len, - const u64 dirid, const u64 ino) -{ - struct btrfs_key search_key; - - search_key.objectid = ino; - search_key.type = BTRFS_INODE_REF_KEY; - search_key.offset = dirid; - if (backref_in_log(log_root, &search_key, dirid, name, name_len)) - return true; - - search_key.type = BTRFS_INODE_EXTREF_KEY; - search_key.offset = btrfs_extref_hash(dirid, name, name_len); - if (backref_in_log(log_root, &search_key, dirid, name, name_len)) - return true; - - return false; -} - -/* * take a single entry in a log directory item and replay it into * the subvolume. * @@ -1975,8 +2027,31 @@ return ret; insert: - if (name_in_log_ref(root->log_root, name, name_len, - key->objectid, log_key.objectid)) { + /* + * Check if the inode reference exists in the log for the given name, + * inode and parent inode + */ + found_key.objectid = log_key.objectid; + found_key.type = BTRFS_INODE_REF_KEY; + found_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &found_key, 0, name, name_len); + if (ret < 0) { + goto out; + } else if (ret) { + /* The dentry will be added later. */ + ret = 0; + update_size = false; + goto out; + } + + found_key.objectid = log_key.objectid; + found_key.type = BTRFS_INODE_EXTREF_KEY; + found_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &found_key, key->objectid, name, + name_len); + if (ret < 0) { + goto out; + } else if (ret) { /* The dentry will be added later. 
*/ ret = 0; update_size = false; @@ -2629,29 +2704,45 @@ return ret; } +/* + * Correctly adjust the reserved bytes occupied by a log tree extent buffer + */ +static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) +{ + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(fs_info, start); + if (!cache) { + btrfs_err(fs_info, "unable to find block group for %llu", start); + return; + } + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->reserved -= fs_info->nodesize; + cache->space_info->bytes_reserved -= fs_info->nodesize; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + btrfs_put_block_group(cache); +} + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int *level, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 root_owner; u64 bytenr; u64 ptr_gen; struct extent_buffer *next; struct extent_buffer *cur; - struct extent_buffer *parent; u32 blocksize; int ret = 0; - - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); while (*level > 0) { struct btrfs_key first_key; - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); cur = path->nodes[*level]; WARN_ON(btrfs_header_level(cur) != *level); @@ -2664,9 +2755,6 @@ ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); blocksize = fs_info->nodesize; - - parent = path->nodes[*level]; - root_owner = btrfs_header_owner(parent); next = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(next)) @@ -2691,23 +2779,20 @@ if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - clean_tree_block(fs_info, next); + btrfs_set_lock_blocking_write(next); + btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(trans, + bytenr, blocksize); + if (ret) { + free_extent_buffer(next); + return ret; + } } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); - } - - WARN_ON(root_owner != - BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent( - fs_info, bytenr, - blocksize); - if (ret) { - free_extent_buffer(next); - return ret; + unaccount_log_buffer(fs_info, bytenr); } } free_extent_buffer(next); @@ -2719,7 +2804,6 @@ return ret; } - WARN_ON(*level <= 0); if (path->nodes[*level-1]) free_extent_buffer(path->nodes[*level-1]); path->nodes[*level-1] = next; @@ -2727,9 +2811,6 @@ path->slots[*level] = 0; cond_resched(); } - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); cond_resched(); @@ -2742,7 +2823,6 @@ struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 root_owner; int i; int slot; int ret; @@ -2755,13 +2835,6 @@ WARN_ON(*level == 0); return 0; } else { - struct extent_buffer *parent; - if (path->nodes[*level] == root->node) - parent = path->nodes[*level]; - else - parent = path->nodes[*level + 1]; - - root_owner = btrfs_header_owner(parent); ret = wc->process_func(root, path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level]), *level); @@ -2775,22 +2848,22 @@ if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - clean_tree_block(fs_info, next); + btrfs_set_lock_blocking_write(next); + btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + ret = 
btrfs_pin_reserved_extent(trans, + path->nodes[*level]->start, + path->nodes[*level]->len); + if (ret) + return ret; } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); - } - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent( - fs_info, - path->nodes[*level]->start, - path->nodes[*level]->len); - if (ret) - return ret; + unaccount_log_buffer(fs_info, + path->nodes[*level]->start); + } } free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; @@ -2822,7 +2895,7 @@ level = btrfs_header_level(log->node); orig_level = level; path->nodes[level] = log->node; - extent_buffer_get(log->node); + atomic_inc(&log->node->refs); path->slots[level] = 0; while (1) { @@ -2857,21 +2930,19 @@ if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - clean_tree_block(fs_info, next); + btrfs_set_lock_blocking_write(next); + btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + ret = btrfs_pin_reserved_extent(trans, + next->start, next->len); + if (ret) + goto out; } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); + unaccount_log_buffer(fs_info, next->start); } - - WARN_ON(log->root_key.objectid != - BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent(fs_info, - next->start, next->len); - if (ret) - goto out; } } @@ -3035,7 +3106,7 @@ } /* bail out if we need to do a full commit */ - if (btrfs_need_log_full_commit(fs_info, trans)) { + if (btrfs_need_log_full_commit(trans)) { ret = -EAGAIN; mutex_unlock(&root->log_mutex); goto out; @@ -3054,7 +3125,7 @@ if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, ret); - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); mutex_unlock(&root->log_mutex); goto out; } @@ -3088,16 +3159,10 @@ btrfs_init_log_ctx(&root_log_ctx, NULL); mutex_lock(&log_root_tree->log_mutex); - atomic_inc(&log_root_tree->log_batch); - atomic_inc(&log_root_tree->log_writers); index2 = log_root_tree->log_transid % 2; list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); root_log_ctx.log_transid = log_root_tree->log_transid; - - mutex_unlock(&log_root_tree->log_mutex); - - mutex_lock(&log_root_tree->log_mutex); /* * Now we are safe to update the log_root_tree because we're under the @@ -3105,18 +3170,12 @@ * open until we drop the log_mutex. 
*/ ret = update_log_root(trans, log, &new_root_item); - - if (atomic_dec_and_test(&log_root_tree->log_writers)) { - /* atomic_dec_and_test implies a barrier */ - cond_wake_up_nomb(&log_root_tree->log_writer_wait); - } - if (ret) { if (!list_empty(&root_log_ctx.list)) list_del_init(&root_log_ctx.list); blk_finish_plug(&plug); - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); if (ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); @@ -3156,13 +3215,11 @@ root_log_ctx.log_transid - 1); } - wait_for_writer(log_root_tree); - /* * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (btrfs_need_log_full_commit(fs_info, trans)) { + if (btrfs_need_log_full_commit(trans)) { blk_finish_plug(&plug); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); @@ -3175,7 +3232,7 @@ EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); if (ret) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; @@ -3185,7 +3242,7 @@ ret = btrfs_wait_tree_log_extents(log_root_tree, EXTENT_NEW | EXTENT_DIRTY); if (ret) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } @@ -3199,7 +3256,7 @@ mutex_unlock(&log_root_tree->log_mutex); /* - * nobody else is going to jump in and write the the ctree + * Nobody else is going to jump in and write the ctree * super here because the log_commit atomic below is protecting * us. We must be called with a transaction handle pinning * the running transaction open, so a full commit can't hop @@ -3207,7 +3264,7 @@ */ ret = write_all_supers(fs_info, 1); if (ret) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); goto out_wake_log_root; } @@ -3251,8 +3308,6 @@ struct btrfs_root *log) { int ret; - u64 start; - u64 end; struct walk_control wc = { .free = 1, .process_func = process_one_buffer @@ -3266,20 +3321,10 @@ btrfs_handle_fs_error(log->fs_info, ret, NULL); } - while (1) { - ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT, - NULL); - if (ret) - break; - - clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); - } - - free_extent_buffer(log->node); - kfree(log); + clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); + extent_io_tree_release(&log->log_csum_range); + btrfs_put_root(log); } /* @@ -3291,6 +3336,7 @@ if (root->log_root) { free_log_tree(trans, root->log_root); root->log_root = NULL; + clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); } return 0; } @@ -3447,7 +3493,7 @@ out_unlock: mutex_unlock(&dir->log_mutex); if (err == -ENOSPC) { - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_set_log_full_commit(trans); err = 0; } else if (err < 0 && err != -ENOENT) { /* ENOENT can be returned if the entry hasn't been fsynced yet */ @@ -3465,7 +3511,6 @@ const char *name, int name_len, struct btrfs_inode *inode, u64 dirid) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log; u64 index; int ret; @@ -3483,7 +3528,7 @@ dirid, &index); mutex_unlock(&inode->log_mutex); if (ret == -ENOSPC) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); ret = 0; } else if (ret < 0 && ret != 
-ENOENT) btrfs_abort_transaction(trans, ret); @@ -3807,8 +3852,9 @@ found_key.offset = 0; found_key.type = 0; - ret = btrfs_bin_search(path->nodes[0], &found_key, 0, - &start_slot); + ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); + if (ret < 0) + break; ret = btrfs_del_items(trans, log, path, start_slot, path->slots[0] - start_slot + 1); @@ -3834,7 +3880,7 @@ { struct btrfs_map_token token; - btrfs_init_map_token(&token); + btrfs_init_map_token(&token, leaf); if (log_inode_only) { /* set the generation to zero so the recover code * can tell the difference between a logging * just to say 'this inode exists' and a logging * to say 'update this inode with these values' */ - btrfs_set_token_inode_generation(leaf, item, 0, &token); - btrfs_set_token_inode_size(leaf, item, logged_isize, &token); + btrfs_set_token_inode_generation(&token, item, 0); + btrfs_set_token_inode_size(&token, item, logged_isize); } else { - btrfs_set_token_inode_generation(leaf, item, - BTRFS_I(inode)->generation, - &token); - btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); + btrfs_set_token_inode_generation(&token, item, + BTRFS_I(inode)->generation); + btrfs_set_token_inode_size(&token, item, inode->i_size); } - btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); - btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); - btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); - btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); + btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); + btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); + btrfs_set_token_inode_mode(&token, item, inode->i_mode); + btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); - btrfs_set_token_timespec_sec(leaf, &item->atime, - inode->i_atime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, &item->atime, - inode->i_atime.tv_nsec, &token); + btrfs_set_token_timespec_sec(&token, &item->atime, + inode->i_atime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->atime, + inode->i_atime.tv_nsec); - btrfs_set_token_timespec_sec(leaf, &item->mtime, - inode->i_mtime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, &item->mtime, - inode->i_mtime.tv_nsec, &token); + btrfs_set_token_timespec_sec(&token, &item->mtime, + inode->i_mtime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->mtime, + inode->i_mtime.tv_nsec); - btrfs_set_token_timespec_sec(leaf, &item->ctime, - inode->i_ctime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, &item->ctime, - inode->i_ctime.tv_nsec, &token); + btrfs_set_token_timespec_sec(&token, &item->ctime, + inode->i_ctime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->ctime, + inode->i_ctime.tv_nsec); - btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), - &token); + btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); - btrfs_set_token_inode_sequence(leaf, item, - inode_peek_iversion(inode), &token); - btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); - btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); - btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); - btrfs_set_token_inode_block_group(leaf, item, 0, &token); + btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); + btrfs_set_token_inode_transid(&token, item, trans->transid); + btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); + btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); + btrfs_set_token_inode_block_group(&token, item, 0); } static int
log_inode_item(struct btrfs_trans_handle *trans, @@ -3902,11 +3945,32 @@ } static int log_csums(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, struct btrfs_root *log_root, struct btrfs_ordered_sum *sums) { + const u64 lock_end = sums->bytenr + sums->len - 1; + struct extent_state *cached_state = NULL; int ret; + /* + * If this inode was not used for reflink operations in the current + * transaction with new extents, then do the fast path, no need to + * worry about logging checksum items with overlapping ranges. + */ + if (inode->last_reflink_trans < trans->transid) + return btrfs_csum_file_blocks(trans, log_root, sums); + + /* + * Serialize logging for checksums. This is to avoid racing with the + * same checksum being logged by another task that is logging another + * file which happens to refer to the same extent as well. Such races + * can leave checksum items in the log with overlapping ranges. + */ + ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, + lock_end, &cached_state); + if (ret) + return ret; /* * Due to extent cloning, we might have logged a csum item that covers a * subrange of a cloned extent, and later we can end up logging a csum @@ -3917,10 +3981,13 @@ * trim and adjust) any existing csum items in the log for this range. */ ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); - if (ret) - return ret; + if (!ret) + ret = btrfs_csum_file_blocks(trans, log_root, sums); - return btrfs_csum_file_blocks(trans, log_root, sums); + unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, + &cached_state); + + return ret; } static noinline int copy_items(struct btrfs_trans_handle *trans, @@ -4041,7 +4108,7 @@ struct btrfs_ordered_sum, list); if (!ret) - ret = log_csums(trans, log, sums); + ret = log_csums(trans, inode, log, sums); list_del(&sums->list); kfree(sums); } @@ -4066,10 +4133,14 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_root *log_root, - const struct extent_map *em) + const struct extent_map *em, + struct btrfs_log_ctx *ctx) { + struct btrfs_ordered_extent *ordered; u64 csum_offset; u64 csum_len; + u64 mod_start = em->mod_start; + u64 mod_len = em->mod_len; LIST_HEAD(ordered_sums); int ret = 0; @@ -4078,13 +4149,71 @@ em->block_start == EXTENT_MAP_HOLE) return 0; + list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { + const u64 ordered_end = ordered->file_offset + ordered->num_bytes; + const u64 mod_end = mod_start + mod_len; + struct btrfs_ordered_sum *sums; + + if (mod_len == 0) + break; + + if (ordered_end <= mod_start) + continue; + if (mod_end <= ordered->file_offset) + break; + + /* + * We are going to copy all the csums on this ordered extent, so + * go ahead and adjust mod_start and mod_len in case this ordered + * extent has already been logged. + */ + if (ordered->file_offset > mod_start) { + if (ordered_end >= mod_end) + mod_len = ordered->file_offset - mod_start; + /* + * If we have this case + * + * |--------- logged extent ---------| + * |----- ordered extent ----| + * + * Just don't mess with mod_start and mod_len, we'll + * just end up logging more csums than we need and it + * will be ok. + */ + } else { + if (ordered_end < mod_end) { + mod_len = mod_end - ordered_end; + mod_start = ordered_end; + } else { + mod_len = 0; + } + } + + /* + * To keep us from looping for the above case of an ordered + * extent that falls inside of the logged extent. 
+ */ + if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) + continue; + + list_for_each_entry(sums, &ordered->list, list) { + ret = log_csums(trans, inode, log_root, sums); + if (ret) + return ret; + } + } + + /* We're done, found all csums in the ordered extents. */ + if (mod_len == 0) + return 0; + /* If we're compressed we have to save the entire range of csums. */ if (em->compress_type) { csum_offset = 0; csum_len = max(em->block_len, em->orig_block_len); } else { - csum_offset = em->mod_start - em->start; - csum_len = em->mod_len; + csum_offset = mod_start - em->start; + csum_len = mod_len; } /* block start is already adjusted for the file extent offset. */ @@ -4100,7 +4229,7 @@ struct btrfs_ordered_sum, list); if (!ret) - ret = log_csums(trans, log_root, sums); + ret = log_csums(trans, inode, log_root, sums); list_del(&sums->list); kfree(sums); } @@ -4124,13 +4253,11 @@ int ret; int extent_inserted = 0; - ret = log_extent_csums(trans, inode, log, em); + ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) return ret; - btrfs_init_map_token(&token); - - ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, em->start + em->len, NULL, 0, 1, sizeof(*fi), &extent_inserted); if (ret) @@ -4147,46 +4274,39 @@ return ret; } leaf = path->nodes[0]; + btrfs_init_map_token(&token, leaf); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, - &token); + btrfs_set_token_file_extent_generation(&token, fi, trans->transid); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - btrfs_set_token_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_PREALLOC, - &token); + btrfs_set_token_file_extent_type(&token, fi, + BTRFS_FILE_EXTENT_PREALLOC); else - btrfs_set_token_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_REG, - &token); + btrfs_set_token_file_extent_type(&token, fi, + BTRFS_FILE_EXTENT_REG); block_len = max(em->block_len, em->orig_block_len); if (em->compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, - em->block_start, - &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, - &token); + btrfs_set_token_file_extent_disk_bytenr(&token, fi, + em->block_start); + btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + btrfs_set_token_file_extent_disk_bytenr(&token, fi, em->block_start - - extent_offset, &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, - &token); + extent_offset); + btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); } else { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, - &token); + btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); + btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); } - btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); - btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); - btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); - btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, - &token); - btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); - btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); + btrfs_set_token_file_extent_offset(&token, fi, extent_offset); + 
btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); + btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); + btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); + btrfs_set_token_file_extent_encryption(&token, fi, 0); + btrfs_set_token_file_extent_other_encoding(&token, fi, 0); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -4196,7 +4316,7 @@ /* * Log all prealloc extents beyond the inode's i_size to make sure we do not - * lose them after doing a fast fsync and replaying the log. We scan the + * lose them after doing a full/fast fsync and replaying the log. We scan the * subvolume's root instead of iterating the inode's extent map tree because * otherwise we can log incorrect extent items based on extent map conversion. * That can happen due to the fact that extent maps are merged when they @@ -4322,12 +4442,9 @@ } } } - if (ins_nr > 0) { + if (ins_nr > 0) ret = copy_items(trans, inode, dst_path, path, start_slot, ins_nr, 1, 0); - if (ret > 0) - ret = 0; - } out: btrfs_release_path(path); btrfs_free_path(dst_path); @@ -4338,14 +4455,13 @@ struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_log_ctx *ctx, - const u64 start, - const u64 end) + struct btrfs_log_ctx *ctx) { + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; - u64 logged_start, logged_end; u64 test_gen; int ret = 0; int num = 0; @@ -4354,27 +4470,8 @@ write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; - logged_start = start; - logged_end = end; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { - /* - * Skip extents outside our logging range. It's important to do - * it for correctness because if we don't ignore them, we may - * log them before their ordered extent completes, and therefore - * we could log them without logging their respective checksums - * (the checksum items are added to the csum tree at the very - * end of btrfs_finish_ordered_io()). Also leave such extents - * outside of our range in the list, since we may have another - * ranged fsync in the near future that needs them. If an extent - * outside our range corresponds to a hole, log it to avoid - * leaving gaps between extents (fsck will complain when we are - * not using the NO_HOLES feature). - */ - if ((em->start > end || em->start + em->len <= start) && - em->block_start != EXTENT_MAP_HOLE) - continue; - list_del_init(&em->list); /* * Just an arbitrary number, this can be really CPU intensive @@ -4395,11 +4492,6 @@ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && em->start >= i_size_read(&inode->vfs_inode)) continue; - - if (em->start < logged_start) - logged_start = em->start; - if ((em->start + em->len - 1) > logged_end) - logged_end = em->start + em->len - 1; /* Need a ref to keep it from getting evicted from cache */ refcount_inc(&em->refs); @@ -4438,8 +4530,32 @@ btrfs_release_path(path); if (!ret) ret = btrfs_log_prealloc_extents(trans, inode, path); + if (ret) + return ret; - return ret; + /* + * We have logged all extents successfully, now make sure the commit of + * the current transaction waits for the ordered extents to complete + * before it commits and wipes out the log trees, otherwise we would + * lose data if an ordered extent completes after the transaction + * commits and a power failure happens after the transaction commit.
+ */ + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); + + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + spin_lock_irq(&inode->ordered_tree.lock); + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); + atomic_inc(&trans->transaction->pending_ordered); + } + spin_unlock_irq(&inode->ordered_tree.lock); + } + btrfs_put_ordered_extent(ordered); + } + + return 0; } static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, @@ -4502,6 +4618,10 @@ const u64 ino = btrfs_ino(inode); int ins_nr = 0; int start_slot = 0; + bool found_xattrs = false; + + if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) + return 0; key.objectid = ino; key.type = BTRFS_XATTR_ITEM_KEY; @@ -4540,6 +4660,7 @@ start_slot = slot; ins_nr++; path->slots[0]++; + found_xattrs = true; cond_resched(); } if (ins_nr > 0) { @@ -4548,6 +4669,9 @@ if (ret < 0) return ret; } + + if (!found_xattrs) + set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); return 0; } @@ -4585,9 +4709,7 @@ return ret; while (true) { - struct btrfs_file_extent_item *extent; struct extent_buffer *leaf = path->nodes[0]; - u64 len; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); @@ -4636,18 +4758,7 @@ leaf = path->nodes[0]; } - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(leaf, extent); - prev_extent_end = ALIGN(key.offset + len, - fs_info->sectorsize); - } else { - len = btrfs_file_extent_num_bytes(leaf, extent); - prev_extent_end = key.offset + len; - } - + prev_extent_end = btrfs_file_extent_end(path); path->slots[0]++; cond_resched(); } @@ -4714,7 +4825,7 @@ const int slot, const struct btrfs_key *key, struct btrfs_inode *inode, - u64 *other_ino) + u64 *other_ino, u64 *other_parent) { int ret; struct btrfs_path *search_path; @@ -4777,8 +4888,13 @@ btrfs_dir_item_key_to_cpu(search_path->nodes[0], di, &di_key); if (di_key.type == BTRFS_INODE_ITEM_KEY) { - ret = 1; - *other_ino = di_key.objectid; + if (di_key.objectid != key->objectid) { + ret = 1; + *other_ino = di_key.objectid; + *other_parent = parent; + } else { + ret = 0; + } } else { ret = -EAGAIN; } @@ -4795,6 +4911,334 @@ out: btrfs_free_path(search_path); kfree(name); + return ret; +} + +struct btrfs_ino_list { + u64 ino; + u64 parent; + struct list_head list; +}; + +static int log_conflicting_inodes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_log_ctx *ctx, + u64 ino, u64 parent) +{ + struct btrfs_ino_list *ino_elem; + LIST_HEAD(inode_list); + int ret = 0; + + ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); + if (!ino_elem) + return -ENOMEM; + ino_elem->ino = ino; + ino_elem->parent = parent; + list_add_tail(&ino_elem->list, &inode_list); + + while (!list_empty(&inode_list)) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + struct inode *inode; + + ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, + list); + ino = ino_elem->ino; + parent = ino_elem->parent; + list_del(&ino_elem->list); + kfree(ino_elem); + if (ret) + continue; + + btrfs_release_path(path); + + inode = btrfs_iget(fs_info->sb, ino, root); + /* + * If the other inode that had a conflicting dir entry was + * deleted in the current 
transaction, we need to log its parent + * directory. + */ + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + if (ret == -ENOENT) { + inode = btrfs_iget(fs_info->sb, parent, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + } else { + ret = btrfs_log_inode(trans, root, + BTRFS_I(inode), + LOG_OTHER_INODE_ALL, + ctx); + btrfs_add_delayed_iput(inode); + } + } + continue; + } + /* + * If the inode was already logged skip it - otherwise we can + * hit an infinite loop. Example: + * + * From the commit root (previous transaction) we have the + * following inodes: + * + * inode 257 a directory + * inode 258 with references "zz" and "zz_link" on inode 257 + * inode 259 with reference "a" on inode 257 + * + * And in the current (uncommitted) transaction we have: + * + * inode 257 a directory, unchanged + * inode 258 with references "a" and "a2" on inode 257 + * inode 259 with reference "zz_link" on inode 257 + * inode 261 with reference "zz" on inode 257 + * + * When logging inode 261 the following infinite loop could + * happen if we don't skip already logged inodes: + * + * - we detect inode 258 as a conflicting inode, with inode 261 + * on reference "zz", and log it; + * + * - we detect inode 259 as a conflicting inode, with inode 258 + * on reference "a", and log it; + * + * - we detect inode 258 as a conflicting inode, with inode 259 + * on reference "zz_link", and log it - again! After this we + * repeat the above steps forever. + */ + spin_lock(&BTRFS_I(inode)->lock); + /* + * Check the inode's logged_trans only instead of + * btrfs_inode_in_log(). This is because the last_log_commit of + * the inode is not updated when we only log that it exists and + * it has the full sync bit set (see btrfs_log_inode()). + */ + if (BTRFS_I(inode)->logged_trans == trans->transid) { + spin_unlock(&BTRFS_I(inode)->lock); + btrfs_add_delayed_iput(inode); + continue; + } + spin_unlock(&BTRFS_I(inode)->lock); + /* + * We are safe logging the other inode without acquiring its + * lock as long as we log with the LOG_INODE_EXISTS mode. We + * are safe against concurrent renames of the other inode as + * well because during a rename we pin the log and update the + * log with the new name before we unpin it. 
+ */ + ret = btrfs_log_inode(trans, root, BTRFS_I(inode), + LOG_OTHER_INODE, ctx); + if (ret) { + btrfs_add_delayed_iput(inode); + continue; + } + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + btrfs_add_delayed_iput(inode); + continue; + } + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + u64 other_ino = 0; + u64 other_parent = 0; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + break; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || + (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY)) { + ret = 0; + break; + } + + ret = btrfs_check_ref_name_override(leaf, slot, &key, + BTRFS_I(inode), &other_ino, + &other_parent); + if (ret < 0) + break; + if (ret > 0) { + ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); + if (!ino_elem) { + ret = -ENOMEM; + break; + } + ino_elem->ino = other_ino; + ino_elem->parent = other_parent; + list_add_tail(&ino_elem->list, &inode_list); + ret = 0; + } + path->slots[0]++; + } + btrfs_add_delayed_iput(inode); + } + + return ret; +} + +static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_key *min_key, + const struct btrfs_key *max_key, + struct btrfs_path *path, + struct btrfs_path *dst_path, + const u64 logged_isize, + const bool recursive_logging, + const int inode_only, + struct btrfs_log_ctx *ctx, + bool *need_log_inode_item) +{ + const u64 i_size = i_size_read(&inode->vfs_inode); + struct btrfs_root *root = inode->root; + int ins_start_slot = 0; + int ins_nr = 0; + int ret; + + while (1) { + ret = btrfs_search_forward(root, min_key, path, trans->transid); + if (ret < 0) + return ret; + if (ret > 0) { + ret = 0; + break; + } +again: + /* Note, ins_nr might be > 0 here, cleanup outside the loop */ + if (min_key->objectid != max_key->objectid) + break; + if (min_key->type > max_key->type) + break; + + if (min_key->type == BTRFS_INODE_ITEM_KEY) { + *need_log_inode_item = false; + } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && + min_key->offset >= i_size) { + /* + * Extents at and beyond eof are logged with + * btrfs_log_prealloc_extents(). + * Only regular files have BTRFS_EXTENT_DATA_KEY keys, + * and no keys greater than that, so bail out. 
+ */ + break; + } else if ((min_key->type == BTRFS_INODE_REF_KEY || + min_key->type == BTRFS_INODE_EXTREF_KEY) && + inode->generation == trans->transid && + !recursive_logging) { + u64 other_ino = 0; + u64 other_parent = 0; + + ret = btrfs_check_ref_name_override(path->nodes[0], + path->slots[0], min_key, inode, + &other_ino, &other_parent); + if (ret < 0) { + return ret; + } else if (ret > 0 && ctx && + other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { + if (ins_nr > 0) { + ins_nr++; + } else { + ins_nr = 1; + ins_start_slot = path->slots[0]; + } + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, ins_nr, + inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + + ret = log_conflicting_inodes(trans, root, path, + ctx, other_ino, other_parent); + if (ret) + return ret; + btrfs_release_path(path); + goto next_key; + } + } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { + /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ + if (ins_nr == 0) + goto next_slot; + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + goto next_slot; + } + + if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { + ins_nr++; + goto next_slot; + } else if (!ins_nr) { + ins_start_slot = path->slots[0]; + ins_nr = 1; + goto next_slot; + } + + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) + return ret; + ins_nr = 1; + ins_start_slot = path->slots[0]; +next_slot: + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key_to_cpu(path->nodes[0], min_key, + path->slots[0]); + goto again; + } + if (ins_nr) { + ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, ins_nr, inode_only, + logged_isize); + if (ret < 0) + return ret; + ins_nr = 0; + } + btrfs_release_path(path); +next_key: + if (min_key->offset < (u64)-1) { + min_key->offset++; + } else if (min_key->type < max_key->type) { + min_key->type++; + min_key->offset = 0; + } else { + break; + } + } + if (ins_nr) { + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret) + return ret; + } + + if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { + /* + * Release the path because otherwise we might attempt to double + * lock the same leaf with btrfs_log_prealloc_extents() below. + */ + btrfs_release_path(path); + ret = btrfs_log_prealloc_extents(trans, inode, dst_path); + } + return ret; } @@ -4815,27 +5259,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_path *dst_path; struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = root->log_root; int err = 0; - int ret; - int nritems; - int ins_start_slot = 0; - int ins_nr; + int ret = 0; bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &inode->extent_tree; u64 logged_isize = 0; bool need_log_inode_item = true; bool xattrs_logged = false; + bool recursive_logging = false; path = btrfs_alloc_path(); if (!path) @@ -4864,15 +5303,19 @@ max_key.offset = (u64)-1; /* - * Only run delayed items if we are a dir or a new file. 
- * Otherwise commit the delayed inode only, which is needed in - * order for the log replay code to mark inodes for link count - * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). + * Only run delayed items if we are a directory. We want to make sure + * all directory indexes hit the fs/subvolume tree so we can find them + * and figure out which index ranges have to be logged. + * + * Otherwise commit the delayed inode only if the full sync flag is set, + * as we want to make sure an up to date version is in the subvolume + * tree so copy_inode_items_to_log() / copy_items() can find it and copy + * it to the log tree. For a non full sync, we always log the inode item + * based on the in-memory struct btrfs_inode which is always up to date. */ - if (S_ISDIR(inode->vfs_inode.i_mode) || - inode->generation > fs_info->last_trans_committed) + if (S_ISDIR(inode->vfs_inode.i_mode)) ret = btrfs_commit_inode_delayed_items(trans, inode); - else + else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) ret = btrfs_commit_inode_delayed_inode(inode); if (ret) { @@ -4881,12 +5324,28 @@ return ret; } - if (inode_only == LOG_OTHER_INODE) { - inode_only = LOG_INODE_EXISTS; + if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { + recursive_logging = true; + if (inode_only == LOG_OTHER_INODE) + inode_only = LOG_INODE_EXISTS; + else + inode_only = LOG_INODE_ALL; mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); } else { mutex_lock(&inode->log_mutex); } + + /* + * For symlinks, we must always log their content, which is stored in an + * inline extent, otherwise we could end up with an empty symlink after + * log replay, which is invalid on Linux (symlink(2) returns -ENOENT if + * one attempts to create an empty symlink). + * We don't need to worry about flushing delalloc, because we create + * the inline extent when the symlink is created (we never have delalloc + * for symlinks).
+ */ + if (S_ISLNK(inode->vfs_inode.i_mode)) + inode_only = LOG_INODE_ALL; /* * a brute force approach to making sure we get the most uptodate @@ -4955,170 +5414,12 @@ goto out_unlock; } - while (1) { - ins_nr = 0; - ret = btrfs_search_forward(root, &min_key, - path, trans->transid); - if (ret < 0) { - err = ret; - goto out_unlock; - } - if (ret != 0) - break; -again: - /* note, ins_nr might be > 0 here, cleanup outside the loop */ - if (min_key.objectid != ino) - break; - if (min_key.type > max_key.type) - break; - - if (min_key.type == BTRFS_INODE_ITEM_KEY) - need_log_inode_item = false; - - if ((min_key.type == BTRFS_INODE_REF_KEY || - min_key.type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid) { - u64 other_ino = 0; - - ret = btrfs_check_ref_name_override(path->nodes[0], - path->slots[0], &min_key, inode, - &other_ino); - if (ret < 0) { - err = ret; - goto out_unlock; - } else if (ret > 0 && ctx && - other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { - struct btrfs_key inode_key; - struct inode *other_inode; - - if (ins_nr > 0) { - ins_nr++; - } else { - ins_nr = 1; - ins_start_slot = path->slots[0]; - } - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, - ins_nr, inode_only, - logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - btrfs_release_path(path); - inode_key.objectid = other_ino; - inode_key.type = BTRFS_INODE_ITEM_KEY; - inode_key.offset = 0; - other_inode = btrfs_iget(fs_info->sb, - &inode_key, root, - NULL); - /* - * If the other inode that had a conflicting dir - * entry was deleted in the current transaction, - * we don't need to do more work nor fallback to - * a transaction commit. - */ - if (other_inode == ERR_PTR(-ENOENT)) { - goto next_key; - } else if (IS_ERR(other_inode)) { - err = PTR_ERR(other_inode); - goto out_unlock; - } - /* - * We are safe logging the other inode without - * acquiring its i_mutex as long as we log with - * the LOG_INODE_EXISTS mode. We're safe against - * concurrent renames of the other inode as well - * because during a rename we pin the log and - * update the log with the new name before we - * unpin it. 
- */ - err = btrfs_log_inode(trans, root, - BTRFS_I(other_inode), - LOG_OTHER_INODE, 0, LLONG_MAX, - ctx); - btrfs_add_delayed_iput(other_inode); - if (err) - goto out_unlock; - else - goto next_key; - } - } - - /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ - if (min_key.type == BTRFS_XATTR_ITEM_KEY) { - if (ins_nr == 0) - goto next_slot; - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, - ins_nr, inode_only, logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - goto next_slot; - } - - if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { - ins_nr++; - goto next_slot; - } else if (!ins_nr) { - ins_start_slot = path->slots[0]; - ins_nr = 1; - goto next_slot; - } - - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, ins_nr, inode_only, - logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 1; - ins_start_slot = path->slots[0]; -next_slot: - - nritems = btrfs_header_nritems(path->nodes[0]); - path->slots[0]++; - if (path->slots[0] < nritems) { - btrfs_item_key_to_cpu(path->nodes[0], &min_key, - path->slots[0]); - goto again; - } - if (ins_nr) { - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, - ins_nr, inode_only, logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - } - btrfs_release_path(path); -next_key: - if (min_key.offset < (u64)-1) { - min_key.offset++; - } else if (min_key.type < max_key.type) { - min_key.type++; - min_key.offset = 0; - } else { - break; - } - } - if (ins_nr) { - ret = copy_items(trans, inode, dst_path, path, - ins_start_slot, ins_nr, inode_only, - logged_isize); - if (ret < 0) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - } + err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, + path, dst_path, logged_isize, + recursive_logging, inode_only, ctx, + &need_log_inode_item); + if (err) + goto out_unlock; btrfs_release_path(path); btrfs_release_path(dst_path); @@ -5148,7 +5449,7 @@ } if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - ctx, start, end); + ctx); if (ret) { err = ret; goto out_unlock; @@ -5157,31 +5458,8 @@ struct extent_map *em, *n; write_lock(&em_tree->lock); - /* - * We can't just remove every em if we're called for a ranged - * fsync - that is, one that doesn't cover the whole possible - * file range (0 to LLONG_MAX). This is because we can have - * em's that fall outside the range we're logging and therefore - * their ordered operations haven't completed yet - * (btrfs_finish_ordered_io() not invoked yet). This means we - * didn't get their respective file extent item in the fs/subvol - * tree yet, and need to let the next fast fsync (one which - * consults the list of modified extent maps) find the em so - * that it logs a matching file extent item and waits for the - * respective ordered operation to complete (if it's still - * running). - * - * Removing every em outside the range we're logging would make - * the next fast fsync not log their matching file extent items, - * therefore making us lose data after a log replay. 
- */ - list_for_each_entry_safe(em, n, &em_tree->modified_extents, - list) { - const u64 mod_end = em->mod_start + em->mod_len - 1; - - if (em->mod_start >= start && mod_end <= end) - list_del_init(&em->list); - } + list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) + list_del_init(&em->list); write_unlock(&em_tree->lock); } @@ -5195,19 +5473,34 @@ } /* - * Don't update last_log_commit if we logged that an inode exists after - * it was loaded to memory (full_sync bit set). - * This is to prevent data loss when we do a write to the inode, then - * the inode gets evicted after all delalloc was flushed, then we log - * it exists (due to a rename for example) and then fsync it. This last - * fsync would do nothing (not logging the extents previously written). + * If we are logging that an ancestor inode exists as part of logging a + * new name from a link or rename operation, don't mark the inode as + * logged - otherwise if an explicit fsync is made against an ancestor, + * the fsync considers the inode in the log and doesn't sync the log, + * resulting in the ancestor missing after a power failure unless the + * log was synced as part of an fsync against any other unrelated inode. + * So keep it simple for this case and just don't flag the ancestors as + * logged. */ - spin_lock(&inode->lock); - inode->logged_trans = trans->transid; - if (inode_only != LOG_INODE_EXISTS || - !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) - inode->last_log_commit = inode->last_sub_trans; - spin_unlock(&inode->lock); + if (!ctx || + !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && + &inode->vfs_inode != ctx->inode)) { + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; + /* + * Don't update last_log_commit if we logged that an inode exists + * after it was loaded to memory (full_sync bit set). + * This is to prevent data loss when we do a write to the inode, + * then the inode gets evicted after all delalloc was flushed, + * then we log it exists (due to a rename for example) and then + * fsync it. This last fsync would do nothing (not logging the + * extents previously written). + */ + if (inode_only != LOG_INODE_EXISTS || + !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); + } out_unlock: mutex_unlock(&inode->log_mutex); @@ -5244,7 +5537,7 @@ * Make sure any commits to the log are forced to be full * commits. */ - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); ret = true; } mutex_unlock(&inode->log_mutex); @@ -5432,7 +5725,7 @@ continue; btrfs_release_path(path); - di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); + di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); if (IS_ERR(di_inode)) { ret = PTR_ERR(di_inode); goto next_dir_inode; @@ -5444,10 +5737,10 @@ } ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) + if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), - log_mode, 0, LLONG_MAX, ctx); + log_mode, ctx); if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) ret = 1; @@ -5558,8 +5851,8 @@ cur_offset = item_size; } - dir_inode = btrfs_iget(fs_info->sb, &inode_key, - root, NULL); + dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, + root); /* * If the parent inode was deleted, return an error to * fallback to a transaction commit. 
This is to prevent @@ -5591,7 +5884,7 @@ if (ctx) ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), - LOG_INODE_ALL, 0, LLONG_MAX, ctx); + LOG_INODE_ALL, ctx); if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) ret = 1; @@ -5610,6 +5903,192 @@ return ret; } +static int log_new_ancestors(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_key found_key; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + + while (true) { + struct btrfs_fs_info *fs_info = root->fs_info; + const u64 last_committed = fs_info->last_trans_committed; + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_key search_key; + struct inode *inode; + u64 ino; + int ret = 0; + + btrfs_release_path(path); + + ino = found_key.offset; + + search_key.objectid = found_key.offset; + search_key.type = BTRFS_INODE_ITEM_KEY; + search_key.offset = 0; + inode = btrfs_iget(fs_info->sb, ino, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (BTRFS_I(inode)->generation > last_committed) + ret = btrfs_log_inode(trans, root, BTRFS_I(inode), + LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); + if (ret) + return ret; + + if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) + break; + + search_key.type = BTRFS_INODE_REF_KEY; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + return ret; + + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + return -ENOENT; + leaf = path->nodes[0]; + slot = path->slots[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != search_key.objectid || + found_key.type != BTRFS_INODE_REF_KEY) + return -ENOENT; + } + return 0; +} + +static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct dentry *parent, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct dentry *old_parent = NULL; + struct super_block *sb = inode->vfs_inode.i_sb; + int ret = 0; + + while (true) { + if (!parent || d_really_is_negative(parent) || + sb != parent->d_sb) + break; + + inode = BTRFS_I(d_inode(parent)); + if (root != inode->root) + break; + + if (inode->generation > fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, + LOG_INODE_EXISTS, ctx); + if (ret) + break; + } + if (IS_ROOT(parent)) + break; + + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; + } + dput(old_parent); + + return ret; +} + +static int log_all_new_ancestors(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct dentry *parent, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = inode->root; + const u64 ino = btrfs_ino(inode); + struct btrfs_path *path; + struct btrfs_key search_key; + int ret; + + /* + * For a single hard link case, go through a fast path that does not + * need to iterate the fs/subvolume tree. 
+ */ + if (inode->vfs_inode.i_nlink < 2) + return log_new_ancestors_fast(trans, inode, parent, ctx); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + search_key.objectid = ino; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = 0; +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + if (ret == 0) + path->slots[0]++; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_key found_key; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != ino || + found_key.type > BTRFS_INODE_EXTREF_KEY) + break; + + /* + * Don't deal with extended references because they are rare + * cases and too complex to deal with (we would need to keep + * track of which subitem we are processing for each item in + * this loop, etc). So just return some error to fallback to + * a transaction commit. + */ + if (found_key.type == BTRFS_INODE_EXTREF_KEY) { + ret = -EMLINK; + goto out; + } + + /* + * Logging ancestors needs to do more searches on the fs/subvol + * tree, so it releases the path as needed to avoid deadlocks. + * Keep track of the last inode ref key and resume from that key + * after logging all new ancestors for the current hard link. + */ + memcpy(&search_key, &found_key, sizeof(search_key)); + + ret = log_new_ancestors(trans, root, path, ctx); + if (ret) + goto out; + btrfs_release_path(path); + goto again; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -5619,19 +6098,15 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct dentry *parent, - const loff_t start, - const loff_t end, int inode_only, struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct super_block *sb; - struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = fs_info->last_trans_committed; bool log_dentries = false; - struct btrfs_inode *orig_inode = inode; sb = inode->vfs_inode.i_sb; @@ -5665,7 +6140,8 @@ * (since logging them is pointless, a link count of 0 means they * will never be accessible). */ - if (btrfs_inode_in_log(inode, trans->transid) || + if ((btrfs_inode_in_log(inode, trans->transid) && + list_empty(&ctx->ordered_extents)) || inode->vfs_inode.i_nlink == 0) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; @@ -5675,7 +6151,7 @@ if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); + ret = btrfs_log_inode(trans, root, inode, inode_only, ctx); if (ret) goto end_trans; @@ -5737,56 +6213,22 @@ * and has a link count of 2. */ if (inode->last_unlink_trans > last_committed) { - ret = btrfs_log_all_parents(trans, orig_inode, ctx); + ret = btrfs_log_all_parents(trans, inode, ctx); if (ret) goto end_trans; } - /* - * If a new hard link was added to the inode in the current transaction - * and its link count is now greater than 1, we need to fallback to a - * transaction commit, otherwise we can end up not logging all its new - * parents for all the hard links. 
Here just from the dentry used to - * fsync, we can not visit the ancestor inodes for all the other hard - * links to figure out if any is new, so we fallback to a transaction - * commit (instead of adding a lot of complexity of scanning a btree, - * since this scenario is not a common use case). - */ - if (inode->vfs_inode.i_nlink > 1 && - inode->last_link_trans > last_committed) { - ret = -EMLINK; + ret = log_all_new_ancestors(trans, inode, parent, ctx); + if (ret) goto end_trans; - } - while (1) { - if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) - break; - - inode = BTRFS_I(d_inode(parent)); - if (root != inode->root) - break; - - if (inode->generation > last_committed) { - ret = btrfs_log_inode(trans, root, inode, - LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); - if (ret) - goto end_trans; - } - if (IS_ROOT(parent)) - break; - - parent = dget_parent(parent); - dput(old_parent); - old_parent = parent; - } if (log_dentries) - ret = log_new_dir_dentries(trans, root, orig_inode, ctx); + ret = log_new_dir_dentries(trans, root, inode, ctx); else ret = 0; end_trans: - dput(old_parent); if (ret < 0) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); ret = 1; } @@ -5805,15 +6247,13 @@ */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, - start, end, LOG_INODE_ALL, ctx); + LOG_INODE_ALL, ctx); dput(parent); return ret; @@ -5830,12 +6270,11 @@ struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_key tmp_key; struct btrfs_root *log; struct btrfs_fs_info *fs_info = log_root_tree->fs_info; struct walk_control wc = { .process_func = process_one_buffer, - .stage = 0, + .stage = LOG_WALK_PIN_ONLY, }; path = btrfs_alloc_path(); @@ -5884,7 +6323,7 @@ if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) break; - log = btrfs_read_fs_root(log_root_tree, &found_key); + log = btrfs_read_tree_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); btrfs_handle_fs_error(fs_info, ret, @@ -5892,11 +6331,8 @@ goto error; } - tmp_key.objectid = found_key.offset; - tmp_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_key.offset = (u64)-1; - - wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); + wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, + true); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); @@ -5912,12 +6348,10 @@ * each subsequent pass. */ if (ret == -ENOENT) - ret = btrfs_pin_extent_for_log_replay(fs_info, + ret = btrfs_pin_extent_for_log_replay(trans, log->node->start, log->node->len); - free_extent_buffer(log->node); - free_extent_buffer(log->commit_root); - kfree(log); + btrfs_put_root(log); if (!ret) goto next; @@ -5953,9 +6387,8 @@ } wc.replay_dest->log_root = NULL; - free_extent_buffer(log->node); - free_extent_buffer(log->commit_root); - kfree(log); + btrfs_put_root(wc.replay_dest); + btrfs_put_root(log); if (ret) goto error; @@ -5986,10 +6419,9 @@ if (ret) return ret; - free_extent_buffer(log_root_tree->node); log_root_tree->log_root = NULL; clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); - kfree(log_root_tree); + btrfs_put_root(log_root_tree); return 0; error: @@ -6085,26 +6517,12 @@ /* * Call this after adding a new name for a file and it will properly * update the log to reflect the new name. 
- * - * @ctx can not be NULL when @sync_log is false, and should be NULL when it's - * true (because it's not used). - * - * Return value depends on whether @sync_log is true or false. - * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT - * otherwise. - * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to - * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, - * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed (without attempting to sync the log). */ -int btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx) + struct dentry *parent) { - struct btrfs_fs_info *fs_info = trans->fs_info; - int ret; + struct btrfs_log_ctx ctx; /* * this will force the logging code to walk the dentry chain @@ -6117,36 +6535,19 @@ * if this inode hasn't been logged and directory we're renaming it * from hasn't been logged, we don't need to log it */ - if (inode->logged_trans <= fs_info->last_trans_committed && - (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) - return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT : - BTRFS_DONT_NEED_LOG_SYNC; + if (!inode_logged(trans, inode) && + (!old_dir || !inode_logged(trans, old_dir))) + return; - if (sync_log) { - struct btrfs_log_ctx ctx2; - - btrfs_init_log_ctx(&ctx2, &inode->vfs_inode); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, &ctx2); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_TRANS_COMMIT; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; - - ret = btrfs_sync_log(trans, inode->root, &ctx2); - if (ret) - return BTRFS_NEED_TRANS_COMMIT; - return BTRFS_DONT_NEED_TRANS_COMMIT; - } - - ASSERT(ctx); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, ctx); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_LOG_SYNC; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; - - return BTRFS_NEED_LOG_SYNC; + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); + ctx.logging_new_name = true; + /* + * We don't care about the return value. If we fail to log the new name + * then we know the next attempt to sync the log will fallback to a full + * transaction commit (due to a call to btrfs_set_log_full_commit()), so + * we don't need to worry about getting a log committed that has an + * inconsistent state after a rename operation. + */ + btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); } -- Gitblit v1.6.2
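Illustration (not part of the patch): the log_new_ancestors_fast() hunk above can walk dentry parents because an inode with a single hard link has exactly one ancestor chain, so no fs/subvol tree search is needed. A minimal user-space C model of that walk follows; every identifier in it is invented and only the control flow mirrors the kernel code.

/* Toy model of the single-link fast path (log_new_ancestors_fast() above):
 * with one hard link there is exactly one parent chain, so walking dentry
 * parents is enough. All names here are invented; this is not kernel code.
 */
#include <stdio.h>

struct toy_dentry {
	const char *name;
	struct toy_dentry *parent;        /* NULL at the root */
	unsigned long long generation;    /* inode generation */
};

int main(void)
{
	struct toy_dentry root = { "/",    NULL,  90 };
	struct toy_dentry dir  = { "dir",  &root, 101 }; /* new this trans */
	struct toy_dentry file = { "file", &dir,  101 };
	const unsigned long long last_committed = 100;

	/* Walk up from the fsynced inode's parent, logging new ancestors. */
	for (struct toy_dentry *d = file.parent; d; d = d->parent) {
		if (d->generation > last_committed)
			printf("log '%s' with LOG_INODE_EXISTS\n", d->name);
		if (!d->parent)   /* stands in for IS_ROOT() */
			break;
	}
	return 0;
}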
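Illustration (not part of the patch): for inodes with multiple hard links, log_all_new_ancestors() above must drop its btree path before logging each ancestor chain, so it saves the last BTRFS_INODE_REF_KEY it processed and re-searches from that key, and it falls back to a transaction commit (-EMLINK) on extended refs. A toy model of that resume-and-fallback pattern, assuming a flat sorted array in place of the tree and invented names throughout:

/* Toy model of the resumable scan in log_all_new_ancestors() above.
 * The real btree path must be released before recursing, so the loop
 * remembers the last key it saw and re-searches from there.
 */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct toy_key { unsigned long long objectid, type, offset; };

#define INODE_REF    12
#define INODE_EXTREF 13

static const struct toy_key tree[] = {
	{ 257, INODE_REF, 256 },   /* hard link in dir 256 */
	{ 257, INODE_REF, 300 },   /* hard link in dir 300 */
	{ 257, 96, 0 },            /* later item type, ends the scan */
};

/* Find the first key strictly greater than *after ("search slot"). */
static int search(const struct toy_key *after)
{
	for (size_t i = 0; i < sizeof(tree) / sizeof(tree[0]); i++) {
		const struct toy_key *k = &tree[i];
		if (k->objectid > after->objectid ||
		    (k->objectid == after->objectid &&
		     (k->type > after->type ||
		      (k->type == after->type && k->offset > after->offset))))
			return (int)i;
	}
	return -1;
}

int main(void)
{
	struct toy_key search_key = { 257, INODE_REF, 0 };

	for (;;) {
		int slot = search(&search_key);

		if (slot < 0 || tree[slot].objectid != 257 ||
		    tree[slot].type > INODE_EXTREF)
			break;                    /* no more refs: done */
		if (tree[slot].type == INODE_EXTREF) {
			puts("extref: fall back to transaction commit");
			return 1;                 /* kernel returns -EMLINK */
		}
		/* "Release the path", process the ref, resume from its key. */
		printf("log ancestors for link in dir %llu\n",
		       tree[slot].offset);
		memcpy(&search_key, &tree[slot], sizeof(search_key));
	}
	puts("all new ancestors logged");
	return 0;
}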
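Illustration (not part of the patch): the new void btrfs_log_new_name() above can ignore its own result because any failure forces the next log sync into a full transaction commit; the only decision it keeps is the early-out when neither the renamed inode nor the old directory is in the current log. A hedged sketch of that check, with toy types standing in for btrfs_inode and a simplified version of the inode_logged() test:

/* Toy user-space model of the "skip logging a new name" check above.
 * Field names mirror the kernel's, but this is NOT kernel code: it only
 * demonstrates why a rename/link can skip log work entirely when neither
 * inode was logged in the current (uncommitted) transaction.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_inode {
	unsigned long long logged_trans;  /* transaction that last logged it */
};

static const unsigned long long last_trans_committed = 100;

/* Mirrors the inode_logged() idea: logged in a not-yet-committed trans? */
static bool toy_inode_logged(const struct toy_inode *inode)
{
	return inode && inode->logged_trans > last_trans_committed;
}

static void toy_log_new_name(const struct toy_inode *inode,
			     const struct toy_inode *old_dir)
{
	/* Same early-out as btrfs_log_new_name(): nothing stale in the log. */
	if (!toy_inode_logged(inode) && !toy_inode_logged(old_dir)) {
		puts("skip: neither inode is in the current log");
		return;
	}
	puts("log the new name (LOG_INODE_EXISTS, logging_new_name = true)");
}

int main(void)
{
	struct toy_inode cold = { .logged_trans = 95 };  /* logged long ago */
	struct toy_inode hot  = { .logged_trans = 101 }; /* logged this trans */

	toy_log_new_name(&cold, NULL);  /* skip */
	toy_log_new_name(&hot, &cold);  /* must log the new name */
	return 0;
}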