2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/fs/btrfs/ctree.c
@@ -12,6 +12,8 @@
 #include "transaction.h"
 #include "print-tree.h"
 #include "locking.h"
+#include "volumes.h"
+#include "qgroup.h"

 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		   *root, struct btrfs_path *path, int level);
@@ -19,73 +21,61 @@
 			  const struct btrfs_key *ins_key, struct btrfs_path *path,
 			  int data_size, int extend);
 static int push_node_left(struct btrfs_trans_handle *trans,
-			  struct btrfs_fs_info *fs_info,
 			  struct extent_buffer *dst,
 			  struct extent_buffer *src, int empty);
 static int balance_node_right(struct btrfs_trans_handle *trans,
-			      struct btrfs_fs_info *fs_info,
 			      struct extent_buffer *dst_buf,
 			      struct extent_buffer *src_buf);
 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
 		    int level, int slot);

+static const struct btrfs_csums {
+	u16		size;
+	const char	name[10];
+	const char	driver[12];
+} btrfs_csums[] = {
+	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
+	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
+	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
+	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
+				     .driver = "blake2b-256" },
+};
+
+int btrfs_super_csum_size(const struct btrfs_super_block *s)
+{
+	u16 t = btrfs_super_csum_type(s);
+	/*
+	 * csum type is validated at mount time
+	 */
+	return btrfs_csums[t].size;
+}
+
+const char *btrfs_super_csum_name(u16 csum_type)
+{
+	/* csum type is validated at mount time */
+	return btrfs_csums[csum_type].name;
+}
+
+/*
+ * Return driver name if defined, otherwise the name that's also a valid driver
+ * name
+ */
+const char *btrfs_super_csum_driver(u16 csum_type)
+{
+	/* csum type is validated at mount time */
+	return btrfs_csums[csum_type].driver[0] ?
+		btrfs_csums[csum_type].driver :
+		btrfs_csums[csum_type].name;
+}
+
+size_t __attribute_const__ btrfs_get_num_csums(void)
+{
+	return ARRAY_SIZE(btrfs_csums);
+}
+
 struct btrfs_path *btrfs_alloc_path(void)
 {
 	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
-}
-
-/*
- * set all locked nodes in the path to blocking locks. This should
- * be done before scheduling
- */
-noinline void btrfs_set_path_blocking(struct btrfs_path *p)
-{
-	int i;
-	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
-		if (!p->nodes[i] || !p->locks[i])
-			continue;
-		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
-		if (p->locks[i] == BTRFS_READ_LOCK)
-			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
-		else if (p->locks[i] == BTRFS_WRITE_LOCK)
-			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
-	}
-}
-
-/*
- * reset all the locked nodes in the patch to spinning locks.
- *
- * held is used to keep lockdep happy, when lockdep is enabled
- * we set held to a blocking lock before we go around and
- * retake all the spinlocks in the path. You can safely use NULL
- * for held
- */
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
-					struct extent_buffer *held, int held_rw)
-{
-	int i;
-
-	if (held) {
-		btrfs_set_lock_blocking_rw(held, held_rw);
-		if (held_rw == BTRFS_WRITE_LOCK)
-			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
-		else if (held_rw == BTRFS_READ_LOCK)
-			held_rw = BTRFS_READ_LOCK_BLOCKING;
-	}
-	btrfs_set_path_blocking(p);
-
-	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
-		if (p->nodes[i] && p->locks[i]) {
-			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
-			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
-				p->locks[i] = BTRFS_WRITE_LOCK;
-			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
-				p->locks[i] = BTRFS_READ_LOCK;
-		}
-	}
-
-	if (held)
-		btrfs_clear_lock_blocking_rw(held, held_rw);
 }

 /* this also releases the path */
@@ -154,47 +144,10 @@
 	return eb;
 }

-/* loop around taking references on and locking the root node of the
- * tree until you end up with a lock on the root. A locked buffer
- * is returned, with a reference held.
- */
-struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
-{
-	struct extent_buffer *eb;
-
-	while (1) {
-		eb = btrfs_root_node(root);
-		btrfs_tree_lock(eb);
-		if (eb == root->node)
-			break;
-		btrfs_tree_unlock(eb);
-		free_extent_buffer(eb);
-	}
-	return eb;
-}
-
-/* loop around taking references on and locking the root node of the
- * tree until you end up with a lock on the root. A locked buffer
- * is returned, with a reference held.
- */
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
-{
-	struct extent_buffer *eb;
-
-	while (1) {
-		eb = btrfs_root_node(root);
-		btrfs_tree_read_lock(eb);
-		if (eb == root->node)
-			break;
-		btrfs_tree_read_unlock(eb);
-		free_extent_buffer(eb);
-	}
-	return eb;
-}
-
-/* cowonly root (everything not a reference counted cow subvolume), just get
- * put onto a simple dirty list. transaction.c walks this to make sure they
- * get properly updated on disk.
+/*
+ * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
+ * just get put onto a simple dirty list. Transaction walks this list to make
+ * sure they get properly updated on disk.
  */
 static void add_root_to_dirty_list(struct btrfs_root *root)
 {
@@ -207,7 +160,7 @@
 	spin_lock(&fs_info->trans_lock);
 	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
 		/* Want the extent tree to be the last on the list */
-		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
 			list_move_tail(&root->dirty_list,
 				       &fs_info->dirty_cowonly_roots);
 		else
@@ -233,9 +186,9 @@
 	int level;
 	struct btrfs_disk_key disk_key;

-	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 		trans->transid != fs_info->running_transaction->transid);
-	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 		trans->transid != root->last_trans);

 	level = btrfs_header_level(buf);
@@ -245,7 +198,8 @@
 		btrfs_node_key(buf, &disk_key, 0);

 	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
-				     &disk_key, level, buf->start, 0);
+				     &disk_key, level, buf->start, 0,
+				     BTRFS_NESTING_NEW_ROOT);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);

@@ -260,7 +214,7 @@
 	else
 		btrfs_set_header_owner(cow, new_root_objectid);

-	write_extent_buffer_fsid(cow, fs_info->fsid);
+	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
@@ -355,7 +309,6 @@
 	struct rb_root *tm_root;
 	struct rb_node *node;
 	struct rb_node *next;
-	struct seq_list *cur_elem;
 	struct tree_mod_elem *tm;
 	u64 min_seq = (u64)-1;
 	u64 seq_putting = elem->seq;
@@ -367,18 +320,20 @@
 	list_del(&elem->list);
 	elem->seq = 0;

-	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
-		if (cur_elem->seq < min_seq) {
-			if (seq_putting > cur_elem->seq) {
-				/*
-				 * blocker with lower sequence number exists, we
-				 * cannot remove anything from the log
-				 */
-				write_unlock(&fs_info->tree_mod_log_lock);
-				return;
-			}
-			min_seq = cur_elem->seq;
+	if (!list_empty(&fs_info->tree_mod_seq_list)) {
+		struct seq_list *first;
+
+		first = list_first_entry(&fs_info->tree_mod_seq_list,
+					 struct seq_list, list);
+		if (seq_putting > first->seq) {
+			/*
+			 * Blocker with lower sequence number exists, we
+			 * cannot remove anything from the log.
+			 */
+			write_unlock(&fs_info->tree_mod_log_lock);
+			return;
 		}
+		min_seq = first->seq;
 	}

 	/*
@@ -404,8 +359,6 @@
  * The 'start address' is the logical address of the *new* root node
  * for root replace operations, or the logical address of the affected
  * block for all other operations.
- *
- * Note: must be called with write lock for fs_info::tree_mod_log_lock.
  */
 static noinline int
 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
@@ -414,6 +367,8 @@
 	struct rb_node **new;
 	struct rb_node *parent = NULL;
 	struct tree_mod_elem *cur;
+
+	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

 	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

@@ -752,11 +707,11 @@
 	return __tree_mod_log_search(fs_info, start, min_seq, 0);
 }

-static noinline int
-tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
 		     struct extent_buffer *src, unsigned long dst_offset,
 		     unsigned long src_offset, int nr_items)
 {
+	struct btrfs_fs_info *fs_info = dst->fs_info;
 	int ret = 0;
 	struct tree_mod_elem **tm_list = NULL;
 	struct tree_mod_elem **tm_list_add, **tm_list_rem;
@@ -876,12 +831,11 @@
 			      struct extent_buffer *buf)
 {
 	/*
-	 * Tree blocks not in reference counted trees and tree roots
-	 * are never shared. If a block was allocated after the last
-	 * snapshot and the block was not allocated by tree relocation,
-	 * we know the block is not shared.
+	 * Tree blocks not in shareable trees and tree roots are never shared.
+	 * If a block was allocated after the last snapshot and the block was
+	 * not allocated by tree relocation, we know the block is not shared.
 	 */
-	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 	    buf != root->node && buf != root->commit_root &&
 	    (btrfs_header_generation(buf) <=
 	     btrfs_root_last_snapshot(&root->root_item) ||
@@ -976,9 +930,7 @@
 	if (new_flags != 0) {
 		int level = btrfs_header_level(buf);

-		ret = btrfs_set_disk_extent_flags(trans, fs_info,
-						  buf->start,
-						  buf->len,
+		ret = btrfs_set_disk_extent_flags(trans, buf,
 						  new_flags, level, 0);
 		if (ret)
 			return ret;
@@ -996,7 +948,7 @@
 			if (ret)
 				return ret;
 		}
-		clean_tree_block(fs_info, buf);
+		btrfs_clean_tree_block(buf);
 		*last_ref = 1;
 	}
 	return 0;
@@ -1009,7 +961,8 @@
 					   const struct btrfs_disk_key *disk_key,
 					   int level,
 					   u64 hint,
-					   u64 empty_size)
+					   u64 empty_size,
+					   enum btrfs_lock_nesting nest)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_buffer *ret;
@@ -1038,7 +991,7 @@

 	ret = btrfs_alloc_tree_block(trans, root, parent_start,
 				     root->root_key.objectid, disk_key, level,
-				     hint, empty_size);
+				     hint, empty_size, nest);
 	trans->can_flush_pending_bgs = true;

 	return ret;
@@ -1061,7 +1014,8 @@
 			     struct extent_buffer *buf,
 			     struct extent_buffer *parent, int parent_slot,
 			     struct extent_buffer **cow_ret,
-			     u64 search_start, u64 empty_size)
+			     u64 search_start, u64 empty_size,
+			     enum btrfs_lock_nesting nest)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_disk_key disk_key;
@@ -1076,9 +1030,9 @@

 	btrfs_assert_tree_locked(buf);

-	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 		trans->transid != fs_info->running_transaction->transid);
-	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 		trans->transid != root->last_trans);

 	level = btrfs_header_level(buf);
@@ -1092,7 +1046,7 @@
 		parent_start = parent->start;

 	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
-					   level, search_start, empty_size);
+					   level, search_start, empty_size, nest);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);

@@ -1109,7 +1063,7 @@
 	else
 		btrfs_set_header_owner(cow, root->root_key.objectid);

-	write_extent_buffer_fsid(cow, fs_info->fsid);
+	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

 	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
 	if (ret) {
@@ -1119,7 +1073,7 @@
 		return ret;
 	}

-	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
+	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
 		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
 		if (ret) {
 			btrfs_tree_unlock(cow);
@@ -1135,7 +1089,7 @@
 	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
 		parent_start = buf->start;

-	extent_buffer_get(cow);
+	atomic_inc(&cow->refs);
 	ret = tree_mod_log_insert_root(root->node, cow, 1);
 	BUG_ON(ret < 0);
 	rcu_assign_pointer(root->node, cow);
@@ -1254,7 +1208,7 @@
 	switch (tm->op) {
 	case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
 		BUG_ON(tm->slot < n);
-		/* Fallthrough */
+		fallthrough;
 	case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
 	case MOD_LOG_KEY_REMOVE:
 		btrfs_set_node_key(eb, &tm->key, tm->slot);
@@ -1328,7 +1282,7 @@
 		return eb;

 	btrfs_set_path_blocking(path);
-	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+	btrfs_set_lock_blocking_read(eb);

 	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
 		BUG_ON(tm->slot != 0);
@@ -1352,7 +1306,6 @@
 		}
 	}

-	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
 	btrfs_tree_read_unlock_blocking(eb);
 	free_extent_buffer(eb);

@@ -1445,7 +1398,7 @@
 		free_extent_buffer(eb_root);
 		eb = alloc_dummy_extent_buffer(fs_info, logical);
 	} else {
-		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
+		btrfs_set_lock_blocking_read(eb_root);
 		eb = btrfs_clone_extent_buffer(eb_root);
 		btrfs_tree_read_unlock_blocking(eb_root);
 		free_extent_buffer(eb_root);
@@ -1507,7 +1460,7 @@
 	 *
 	 * What is forced COW:
 	 *    when we create snapshot during committing the transaction,
-	 *    after we've finished coping src root, we must COW the shared
+	 *    after we've finished copying src root, we must COW the shared
 	 *    block to ensure the metadata consistency.
 	 */
 	if (btrfs_header_generation(buf) == trans->transid &&
....@@ -1527,11 +1480,16 @@
15271480 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
15281481 struct btrfs_root *root, struct extent_buffer *buf,
15291482 struct extent_buffer *parent, int parent_slot,
1530
- struct extent_buffer **cow_ret)
1483
+ struct extent_buffer **cow_ret,
1484
+ enum btrfs_lock_nesting nest)
15311485 {
15321486 struct btrfs_fs_info *fs_info = root->fs_info;
15331487 u64 search_start;
15341488 int ret;
1489
+
1490
+ if (test_bit(BTRFS_ROOT_DELETING, &root->state))
1491
+ btrfs_err(fs_info,
1492
+ "COW'ing blocks on a fs root that's being dropped");
15351493
15361494 if (trans->transaction != fs_info->running_transaction)
15371495 WARN(1, KERN_CRIT "trans %llu running %llu\n",
....@@ -1551,11 +1509,18 @@
15511509 search_start = buf->start & ~((u64)SZ_1G - 1);
15521510
15531511 if (parent)
1554
- btrfs_set_lock_blocking(parent);
1555
- btrfs_set_lock_blocking(buf);
1512
+ btrfs_set_lock_blocking_write(parent);
1513
+ btrfs_set_lock_blocking_write(buf);
15561514
1515
+ /*
1516
+ * Before CoWing this block for later modification, check if it's
1517
+ * the subtree root and do the delayed subtree trace if needed.
1518
+ *
1519
+ * Also We don't care about the error, as it's handled internally.
1520
+ */
1521
+ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
15571522 ret = __btrfs_cow_block(trans, root, buf, parent,
1558
- parent_slot, cow_ret, search_start, 0);
1523
+ parent_slot, cow_ret, search_start, 0, nest);
15591524
15601525 trace_btrfs_cow_block(root, buf, *cow_ret);
15611526
....@@ -1575,6 +1540,22 @@
15751540 return 0;
15761541 }
15771542
1543
+#ifdef __LITTLE_ENDIAN
1544
+
1545
+/*
1546
+ * Compare two keys, on little-endian the disk order is same as CPU order and
1547
+ * we can avoid the conversion.
1548
+ */
1549
+static int comp_keys(const struct btrfs_disk_key *disk_key,
1550
+ const struct btrfs_key *k2)
1551
+{
1552
+ const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
1553
+
1554
+ return btrfs_comp_cpu_keys(k1, k2);
1555
+}
1556
+
1557
+#else
1558
+
15781559 /*
15791560 * compare two keys in a memcmp fashion
15801561 */
....@@ -1587,11 +1568,12 @@
15871568
15881569 return btrfs_comp_cpu_keys(&k1, k2);
15891570 }
1571
+#endif
15901572
15911573 /*
15921574 * same as comp_keys only with two btrfs_key's
15931575 */
1594
-int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1576
+int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
15951577 {
15961578 if (k1->objectid > k2->objectid)
15971579 return 1;
....@@ -1647,7 +1629,7 @@
16471629 if (parent_nritems <= 1)
16481630 return 0;
16491631
1650
- btrfs_set_lock_blocking(parent);
1632
+ btrfs_set_lock_blocking_write(parent);
16511633
16521634 for (i = start_slot; i <= end_slot; i++) {
16531635 struct btrfs_key first_key;
....@@ -1706,11 +1688,12 @@
17061688 search_start = last_block;
17071689
17081690 btrfs_tree_lock(cur);
1709
- btrfs_set_lock_blocking(cur);
1691
+ btrfs_set_lock_blocking_write(cur);
17101692 err = __btrfs_cow_block(trans, root, cur, parent, i,
17111693 &cur, search_start,
17121694 min(16 * blocksize,
1713
- (end_slot - i) * blocksize));
1695
+ (end_slot - i) * blocksize),
1696
+ BTRFS_NESTING_COW);
17141697 if (err) {
17151698 btrfs_tree_unlock(cur);
17161699 free_extent_buffer(cur);
....@@ -1742,15 +1725,8 @@
17421725 {
17431726 int low = 0;
17441727 int high = max;
1745
- int mid;
17461728 int ret;
1747
- struct btrfs_disk_key *tmp = NULL;
1748
- struct btrfs_disk_key unaligned;
1749
- unsigned long offset;
1750
- char *kaddr = NULL;
1751
- unsigned long map_start = 0;
1752
- unsigned long map_len = 0;
1753
- int err;
1729
+ const int key_size = sizeof(struct btrfs_disk_key);
17541730
17551731 if (low > high) {
17561732 btrfs_err(eb->fs_info,
....@@ -1761,32 +1737,26 @@
17611737 }
17621738
17631739 while (low < high) {
1740
+ unsigned long oip;
1741
+ unsigned long offset;
1742
+ struct btrfs_disk_key *tmp;
1743
+ struct btrfs_disk_key unaligned;
1744
+ int mid;
1745
+
17641746 mid = (low + high) / 2;
17651747 offset = p + mid * item_size;
1748
+ oip = offset_in_page(offset);
17661749
1767
- if (!kaddr || offset < map_start ||
1768
- (offset + sizeof(struct btrfs_disk_key)) >
1769
- map_start + map_len) {
1750
+ if (oip + key_size <= PAGE_SIZE) {
1751
+ const unsigned long idx = offset >> PAGE_SHIFT;
1752
+ char *kaddr = page_address(eb->pages[idx]);
17701753
1771
- err = map_private_extent_buffer(eb, offset,
1772
- sizeof(struct btrfs_disk_key),
1773
- &kaddr, &map_start, &map_len);
1774
-
1775
- if (!err) {
1776
- tmp = (struct btrfs_disk_key *)(kaddr + offset -
1777
- map_start);
1778
- } else if (err == 1) {
1779
- read_extent_buffer(eb, &unaligned,
1780
- offset, sizeof(unaligned));
1781
- tmp = &unaligned;
1782
- } else {
1783
- return err;
1784
- }
1785
-
1754
+ tmp = (struct btrfs_disk_key *)(kaddr + oip);
17861755 } else {
1787
- tmp = (struct btrfs_disk_key *)(kaddr + offset -
1788
- map_start);
1756
+ read_extent_buffer(eb, &unaligned, offset, key_size);
1757
+ tmp = &unaligned;
17891758 }
1759
+
17901760 ret = comp_keys(tmp, key);
17911761
17921762 if (ret < 0)
....@@ -1807,9 +1777,9 @@
18071777 * leaves vs nodes
18081778 */
18091779 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1810
- int level, int *slot)
1780
+ int *slot)
18111781 {
1812
- if (level == 0)
1782
+ if (btrfs_header_level(eb) == 0)
18131783 return generic_bin_search(eb,
18141784 offsetof(struct btrfs_leaf, items),
18151785 sizeof(struct btrfs_item),
....@@ -1842,9 +1812,8 @@
18421812 /* given a node and slot number, this reads the blocks it points to. The
18431813 * extent buffer is returned with a reference taken (but unlocked).
18441814 */
1845
-static noinline struct extent_buffer *
1846
-read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
1847
- int slot)
1815
+struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
1816
+ int slot)
18481817 {
18491818 int level = btrfs_header_level(parent);
18501819 struct extent_buffer *eb;
....@@ -1856,7 +1825,7 @@
18561825 BUG_ON(level == 0);
18571826
18581827 btrfs_node_key_to_cpu(parent, &first_key, slot);
1859
- eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
1828
+ eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
18601829 btrfs_node_ptr_generation(parent, slot),
18611830 level - 1, &first_key);
18621831 if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
....@@ -1887,8 +1856,7 @@
18871856 int orig_slot = path->slots[level];
18881857 u64 orig_ptr;
18891858
1890
- if (level == 0)
1891
- return 0;
1859
+ ASSERT(level > 0);
18921860
18931861 mid = path->nodes[level];
18941862
....@@ -1914,7 +1882,7 @@
19141882 return 0;
19151883
19161884 /* promote the child to a root */
1917
- child = read_node_slot(fs_info, mid, 0);
1885
+ child = btrfs_read_node_slot(mid, 0);
19181886 if (IS_ERR(child)) {
19191887 ret = PTR_ERR(child);
19201888 btrfs_handle_fs_error(fs_info, ret, NULL);
....@@ -1922,8 +1890,9 @@
19221890 }
19231891
19241892 btrfs_tree_lock(child);
1925
- btrfs_set_lock_blocking(child);
1926
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1893
+ btrfs_set_lock_blocking_write(child);
1894
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
1895
+ BTRFS_NESTING_COW);
19271896 if (ret) {
19281897 btrfs_tree_unlock(child);
19291898 free_extent_buffer(child);
....@@ -1939,7 +1908,7 @@
19391908
19401909 path->locks[level] = 0;
19411910 path->nodes[level] = NULL;
1942
- clean_tree_block(fs_info, mid);
1911
+ btrfs_clean_tree_block(mid);
19431912 btrfs_tree_unlock(mid);
19441913 /* once for the path */
19451914 free_extent_buffer(mid);
....@@ -1954,30 +1923,32 @@
19541923 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
19551924 return 0;
19561925
1957
- left = read_node_slot(fs_info, parent, pslot - 1);
1926
+ left = btrfs_read_node_slot(parent, pslot - 1);
19581927 if (IS_ERR(left))
19591928 left = NULL;
19601929
19611930 if (left) {
1962
- btrfs_tree_lock(left);
1963
- btrfs_set_lock_blocking(left);
1931
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1932
+ btrfs_set_lock_blocking_write(left);
19641933 wret = btrfs_cow_block(trans, root, left,
1965
- parent, pslot - 1, &left);
1934
+ parent, pslot - 1, &left,
1935
+ BTRFS_NESTING_LEFT_COW);
19661936 if (wret) {
19671937 ret = wret;
19681938 goto enospc;
19691939 }
19701940 }
19711941
1972
- right = read_node_slot(fs_info, parent, pslot + 1);
1942
+ right = btrfs_read_node_slot(parent, pslot + 1);
19731943 if (IS_ERR(right))
19741944 right = NULL;
19751945
19761946 if (right) {
1977
- btrfs_tree_lock(right);
1978
- btrfs_set_lock_blocking(right);
1947
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1948
+ btrfs_set_lock_blocking_write(right);
19791949 wret = btrfs_cow_block(trans, root, right,
1980
- parent, pslot + 1, &right);
1950
+ parent, pslot + 1, &right,
1951
+ BTRFS_NESTING_RIGHT_COW);
19811952 if (wret) {
19821953 ret = wret;
19831954 goto enospc;
....@@ -1987,7 +1958,7 @@
19871958 /* first, try to make some room in the middle buffer */
19881959 if (left) {
19891960 orig_slot += btrfs_header_nritems(left);
1990
- wret = push_node_left(trans, fs_info, left, mid, 1);
1961
+ wret = push_node_left(trans, left, mid, 1);
19911962 if (wret < 0)
19921963 ret = wret;
19931964 }
....@@ -1996,11 +1967,11 @@
19961967 * then try to empty the right most buffer into the middle
19971968 */
19981969 if (right) {
1999
- wret = push_node_left(trans, fs_info, mid, right, 1);
1970
+ wret = push_node_left(trans, mid, right, 1);
20001971 if (wret < 0 && wret != -ENOSPC)
20011972 ret = wret;
20021973 if (btrfs_header_nritems(right) == 0) {
2003
- clean_tree_block(fs_info, right);
1974
+ btrfs_clean_tree_block(right);
20041975 btrfs_tree_unlock(right);
20051976 del_ptr(root, path, level + 1, pslot + 1);
20061977 root_sub_used(root, right->len);
....@@ -2032,20 +2003,20 @@
20322003 btrfs_handle_fs_error(fs_info, ret, NULL);
20332004 goto enospc;
20342005 }
2035
- wret = balance_node_right(trans, fs_info, mid, left);
2006
+ wret = balance_node_right(trans, mid, left);
20362007 if (wret < 0) {
20372008 ret = wret;
20382009 goto enospc;
20392010 }
20402011 if (wret == 1) {
2041
- wret = push_node_left(trans, fs_info, left, mid, 1);
2012
+ wret = push_node_left(trans, left, mid, 1);
20422013 if (wret < 0)
20432014 ret = wret;
20442015 }
20452016 BUG_ON(wret == 1);
20462017 }
20472018 if (btrfs_header_nritems(mid) == 0) {
2048
- clean_tree_block(fs_info, mid);
2019
+ btrfs_clean_tree_block(mid);
20492020 btrfs_tree_unlock(mid);
20502021 del_ptr(root, path, level + 1, pslot);
20512022 root_sub_used(root, mid->len);
....@@ -2066,7 +2037,7 @@
20662037 /* update the path */
20672038 if (left) {
20682039 if (btrfs_header_nritems(left) > orig_slot) {
2069
- extent_buffer_get(left);
2040
+ atomic_inc(&left->refs);
20702041 /* left was locked after cow */
20712042 path->nodes[level] = left;
20722043 path->slots[level + 1] -= 1;
....@@ -2129,7 +2100,7 @@
21292100 if (!parent)
21302101 return 1;
21312102
2132
- left = read_node_slot(fs_info, parent, pslot - 1);
2103
+ left = btrfs_read_node_slot(parent, pslot - 1);
21332104 if (IS_ERR(left))
21342105 left = NULL;
21352106
....@@ -2137,20 +2108,20 @@
21372108 if (left) {
21382109 u32 left_nr;
21392110
2140
- btrfs_tree_lock(left);
2141
- btrfs_set_lock_blocking(left);
2111
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
2112
+ btrfs_set_lock_blocking_write(left);
21422113
21432114 left_nr = btrfs_header_nritems(left);
21442115 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
21452116 wret = 1;
21462117 } else {
21472118 ret = btrfs_cow_block(trans, root, left, parent,
2148
- pslot - 1, &left);
2119
+ pslot - 1, &left,
2120
+ BTRFS_NESTING_LEFT_COW);
21492121 if (ret)
21502122 wret = 1;
21512123 else {
2152
- wret = push_node_left(trans, fs_info,
2153
- left, mid, 0);
2124
+ wret = push_node_left(trans, left, mid, 0);
21542125 }
21552126 }
21562127 if (wret < 0)
....@@ -2182,7 +2153,7 @@
21822153 btrfs_tree_unlock(left);
21832154 free_extent_buffer(left);
21842155 }
2185
- right = read_node_slot(fs_info, parent, pslot + 1);
2156
+ right = btrfs_read_node_slot(parent, pslot + 1);
21862157 if (IS_ERR(right))
21872158 right = NULL;
21882159
....@@ -2192,8 +2163,8 @@
21922163 if (right) {
21932164 u32 right_nr;
21942165
2195
- btrfs_tree_lock(right);
2196
- btrfs_set_lock_blocking(right);
2166
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
2167
+ btrfs_set_lock_blocking_write(right);
21972168
21982169 right_nr = btrfs_header_nritems(right);
21992170 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
....@@ -2201,12 +2172,11 @@
22012172 } else {
22022173 ret = btrfs_cow_block(trans, root, right,
22032174 parent, pslot + 1,
2204
- &right);
2175
+ &right, BTRFS_NESTING_RIGHT_COW);
22052176 if (ret)
22062177 wret = 1;
22072178 else {
2208
- wret = balance_node_right(trans, fs_info,
2209
- right, mid);
2179
+ wret = balance_node_right(trans, right, mid);
22102180 }
22112181 }
22122182 if (wret < 0)
....@@ -2411,32 +2381,6 @@
24112381 }
24122382
24132383 /*
2414
- * This releases any locks held in the path starting at level and
2415
- * going all the way up to the root.
2416
- *
2417
- * btrfs_search_slot will keep the lock held on higher nodes in a few
2418
- * corner cases, such as COW of the block at slot zero in the node. This
2419
- * ignores those rules, and it should only be called when there are no
2420
- * more updates to be done higher up in the tree.
2421
- */
2422
-noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2423
-{
2424
- int i;
2425
-
2426
- if (path->keep_locks)
2427
- return;
2428
-
2429
- for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2430
- if (!path->nodes[i])
2431
- continue;
2432
- if (!path->locks[i])
2433
- continue;
2434
- btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2435
- path->locks[i] = 0;
2436
- }
2437
-}
2438
-
2439
-/*
24402384 * helper function for btrfs_search_slot. The goal is to find a block
24412385 * in cache without setting the path to blocking. If we find the block
24422386 * we return zero and the path is unchanged.
....@@ -2452,16 +2396,15 @@
24522396 struct btrfs_fs_info *fs_info = root->fs_info;
24532397 u64 blocknr;
24542398 u64 gen;
2455
- struct extent_buffer *b = *eb_ret;
24562399 struct extent_buffer *tmp;
24572400 struct btrfs_key first_key;
24582401 int ret;
24592402 int parent_level;
24602403
2461
- blocknr = btrfs_node_blockptr(b, slot);
2462
- gen = btrfs_node_ptr_generation(b, slot);
2463
- parent_level = btrfs_header_level(b);
2464
- btrfs_node_key_to_cpu(b, &first_key, slot);
2404
+ blocknr = btrfs_node_blockptr(*eb_ret, slot);
2405
+ gen = btrfs_node_ptr_generation(*eb_ret, slot);
2406
+ parent_level = btrfs_header_level(*eb_ret);
2407
+ btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
24652408
24662409 tmp = find_extent_buffer(fs_info, blocknr);
24672410 if (tmp) {
....@@ -2472,7 +2415,7 @@
24722415 * being cached, read from scrub, or have multiple
24732416 * parents (shared tree blocks).
24742417 */
2475
- if (btrfs_verify_level_key(fs_info, tmp,
2418
+ if (btrfs_verify_level_key(tmp,
24762419 parent_level - 1, &first_key, gen)) {
24772420 free_extent_buffer(tmp);
24782421 return -EUCLEAN;
....@@ -2565,7 +2508,6 @@
25652508 btrfs_set_path_blocking(p);
25662509 reada_for_balance(fs_info, p, level);
25672510 sret = split_node(trans, root, p, level);
2568
- btrfs_clear_path_blocking(p, NULL, 0);
25692511
25702512 BUG_ON(sret > 0);
25712513 if (sret) {
....@@ -2586,7 +2528,6 @@
25862528 btrfs_set_path_blocking(p);
25872529 reada_for_balance(fs_info, p, level);
25882530 sret = balance_level(trans, root, p, level);
2589
- btrfs_clear_path_blocking(p, NULL, 0);
25902531
25912532 if (sret) {
25922533 ret = sret;
....@@ -2605,40 +2546,6 @@
26052546 ret = -EAGAIN;
26062547 done:
26072548 return ret;
2608
-}
2609
-
2610
-static void key_search_validate(struct extent_buffer *b,
2611
- const struct btrfs_key *key,
2612
- int level)
2613
-{
2614
-#ifdef CONFIG_BTRFS_ASSERT
2615
- struct btrfs_disk_key disk_key;
2616
-
2617
- btrfs_cpu_key_to_disk(&disk_key, key);
2618
-
2619
- if (level == 0)
2620
- ASSERT(!memcmp_extent_buffer(b, &disk_key,
2621
- offsetof(struct btrfs_leaf, items[0].key),
2622
- sizeof(disk_key)));
2623
- else
2624
- ASSERT(!memcmp_extent_buffer(b, &disk_key,
2625
- offsetof(struct btrfs_node, ptrs[0].key),
2626
- sizeof(disk_key)));
2627
-#endif
2628
-}
2629
-
2630
-static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
2631
- int level, int *prev_cmp, int *slot)
2632
-{
2633
- if (*prev_cmp != 0) {
2634
- *prev_cmp = btrfs_bin_search(b, key, level, slot);
2635
- return *prev_cmp;
2636
- }
2637
-
2638
- key_search_validate(b, key, level);
2639
- *slot = 0;
2640
-
2641
- return 0;
26422549 }
26432550
26442551 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
....@@ -2682,11 +2589,8 @@
26822589 {
26832590 struct btrfs_fs_info *fs_info = root->fs_info;
26842591 struct extent_buffer *b;
2685
- int root_lock;
2592
+ int root_lock = 0;
26862593 int level = 0;
2687
-
2688
- /* We try very hard to do read locks on the root */
2689
- root_lock = BTRFS_READ_LOCK;
26902594
26912595 if (p->search_commit_root) {
26922596 /*
....@@ -2707,7 +2611,7 @@
27072611
27082612 } else {
27092613 b = root->commit_root;
2710
- extent_buffer_get(b);
2614
+ atomic_inc(&b->refs);
27112615 }
27122616 level = btrfs_header_level(b);
27132617 /*
....@@ -2725,6 +2629,9 @@
27252629 goto out;
27262630 }
27272631
2632
+ /* We try very hard to do read locks on the root */
2633
+ root_lock = BTRFS_READ_LOCK;
2634
+
27282635 /*
27292636 * If the level is set to maximum, we can skip trying to get the read
27302637 * lock.
....@@ -2734,7 +2641,7 @@
27342641 * We don't know the level of the root node until we actually
27352642 * have it read locked
27362643 */
2737
- b = btrfs_read_lock_root_node(root);
2644
+ b = __btrfs_read_lock_root_node(root, p->recurse);
27382645 level = btrfs_header_level(b);
27392646 if (level > write_lock_level)
27402647 goto out;
....@@ -2751,6 +2658,17 @@
27512658 level = btrfs_header_level(b);
27522659
27532660 out:
2661
+ /*
2662
+ * The root may have failed to write out at some point, and thus is no
2663
+ * longer valid, return an error in this case.
2664
+ */
2665
+ if (!extent_buffer_uptodate(b)) {
2666
+ if (root_lock)
2667
+ btrfs_tree_unlock_rw(b, root_lock);
2668
+ free_extent_buffer(b);
2669
+ return ERR_PTR(-EIO);
2670
+ }
2671
+
27542672 p->nodes[level] = b;
27552673 if (!p->skip_locking)
27562674 p->locks[level] = root_lock;
....@@ -2790,7 +2708,6 @@
27902708 const struct btrfs_key *key, struct btrfs_path *p,
27912709 int ins_len, int cow)
27922710 {
2793
- struct btrfs_fs_info *fs_info = root->fs_info;
27942711 struct extent_buffer *b;
27952712 int slot;
27962713 int ret;
....@@ -2841,12 +2758,10 @@
28412758 }
28422759
28432760 while (b) {
2761
+ int dec = 0;
2762
+
28442763 level = btrfs_header_level(b);
28452764
2846
- /*
2847
- * setup the path here so we can release it under lock
2848
- * contention with the cow code
2849
- */
28502765 if (cow) {
28512766 bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
28522767
....@@ -2876,11 +2791,13 @@
28762791 btrfs_set_path_blocking(p);
28772792 if (last_level)
28782793 err = btrfs_cow_block(trans, root, b, NULL, 0,
2879
- &b);
2794
+ &b,
2795
+ BTRFS_NESTING_COW);
28802796 else
28812797 err = btrfs_cow_block(trans, root, b,
28822798 p->nodes[level + 1],
2883
- p->slots[level + 1], &b);
2799
+ p->slots[level + 1], &b,
2800
+ BTRFS_NESTING_COW);
28842801 if (err) {
28852802 ret = err;
28862803 goto done;
....@@ -2888,7 +2805,10 @@
28882805 }
28892806 cow_done:
28902807 p->nodes[level] = b;
2891
- btrfs_clear_path_blocking(p, NULL, 0);
2808
+ /*
2809
+ * Leave path with blocking locks to avoid massive
2810
+ * lock context switch, this is made on purpose.
2811
+ */
28922812
28932813 /*
28942814 * we have a lock on b and as long as we aren't changing
....@@ -2910,86 +2830,28 @@
29102830 }
29112831 }
29122832
2913
- ret = key_search(b, key, level, &prev_cmp, &slot);
2914
- if (ret < 0)
2915
- goto done;
2916
-
2917
- if (level != 0) {
2918
- int dec = 0;
2919
- if (ret && slot > 0) {
2920
- dec = 1;
2921
- slot -= 1;
2922
- }
2923
- p->slots[level] = slot;
2924
- err = setup_nodes_for_search(trans, root, p, b, level,
2925
- ins_len, &write_lock_level);
2926
- if (err == -EAGAIN)
2927
- goto again;
2928
- if (err) {
2929
- ret = err;
2930
- goto done;
2931
- }
2932
- b = p->nodes[level];
2933
- slot = p->slots[level];
2934
-
2935
- /*
2936
- * slot 0 is special, if we change the key
2937
- * we have to update the parent pointer
2938
- * which means we must have a write lock
2939
- * on the parent
2940
- */
2941
- if (slot == 0 && ins_len &&
2942
- write_lock_level < level + 1) {
2943
- write_lock_level = level + 1;
2944
- btrfs_release_path(p);
2945
- goto again;
2946
- }
2947
-
2948
- unlock_up(p, level, lowest_unlock,
2949
- min_write_lock_level, &write_lock_level);
2950
-
2951
- if (level == lowest_level) {
2952
- if (dec)
2953
- p->slots[level]++;
2954
- goto done;
2955
- }
2956
-
2957
- err = read_block_for_search(root, p, &b, level,
2958
- slot, key);
2959
- if (err == -EAGAIN)
2960
- goto again;
2961
- if (err) {
2962
- ret = err;
2963
- goto done;
2964
- }
2965
-
2966
- if (!p->skip_locking) {
2967
- level = btrfs_header_level(b);
2968
- if (level <= write_lock_level) {
2969
- err = btrfs_try_tree_write_lock(b);
2970
- if (!err) {
2971
- btrfs_set_path_blocking(p);
2972
- btrfs_tree_lock(b);
2973
- btrfs_clear_path_blocking(p, b,
2974
- BTRFS_WRITE_LOCK);
2975
- }
2976
- p->locks[level] = BTRFS_WRITE_LOCK;
2977
- } else {
2978
- err = btrfs_tree_read_lock_atomic(b);
2979
- if (!err) {
2980
- btrfs_set_path_blocking(p);
2981
- btrfs_tree_read_lock(b);
2982
- btrfs_clear_path_blocking(p, b,
2983
- BTRFS_READ_LOCK);
2984
- }
2985
- p->locks[level] = BTRFS_READ_LOCK;
2986
- }
2987
- p->nodes[level] = b;
2988
- }
2833
+ /*
2834
+ * If btrfs_bin_search returns an exact match (prev_cmp == 0)
2835
+ * we can safely assume the target key will always be in slot 0
2836
+ * on lower levels due to the invariants BTRFS' btree provides,
2837
+ * namely that a btrfs_key_ptr entry always points to the
2838
+ * lowest key in the child node, thus we can skip searching
2839
+ * lower levels
2840
+ */
2841
+ if (prev_cmp == 0) {
2842
+ slot = 0;
2843
+ ret = 0;
29892844 } else {
2845
+ ret = btrfs_bin_search(b, key, &slot);
2846
+ prev_cmp = ret;
2847
+ if (ret < 0)
2848
+ goto done;
2849
+ }
2850
+
2851
+ if (level == 0) {
29902852 p->slots[level] = slot;
29912853 if (ins_len > 0 &&
2992
- btrfs_leaf_free_space(fs_info, b) < ins_len) {
2854
+ btrfs_leaf_free_space(b) < ins_len) {
29932855 if (write_lock_level < 1) {
29942856 write_lock_level = 1;
29952857 btrfs_release_path(p);
....@@ -2999,7 +2861,6 @@
29992861 btrfs_set_path_blocking(p);
30002862 err = split_leaf(trans, root, key,
30012863 p, ins_len, ret == 0);
3002
- btrfs_clear_path_blocking(p, NULL, 0);
30032864
30042865 BUG_ON(err > 0);
30052866 if (err) {
....@@ -3009,8 +2870,70 @@
30092870 }
30102871 if (!p->search_for_split)
30112872 unlock_up(p, level, lowest_unlock,
3012
- min_write_lock_level, &write_lock_level);
2873
+ min_write_lock_level, NULL);
30132874 goto done;
2875
+ }
2876
+ if (ret && slot > 0) {
2877
+ dec = 1;
2878
+ slot--;
2879
+ }
2880
+ p->slots[level] = slot;
2881
+ err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2882
+ &write_lock_level);
2883
+ if (err == -EAGAIN)
2884
+ goto again;
2885
+ if (err) {
2886
+ ret = err;
2887
+ goto done;
2888
+ }
2889
+ b = p->nodes[level];
2890
+ slot = p->slots[level];
2891
+
2892
+ /*
2893
+ * Slot 0 is special, if we change the key we have to update
2894
+ * the parent pointer which means we must have a write lock on
2895
+ * the parent
2896
+ */
2897
+ if (slot == 0 && ins_len && write_lock_level < level + 1) {
2898
+ write_lock_level = level + 1;
2899
+ btrfs_release_path(p);
2900
+ goto again;
2901
+ }
2902
+
2903
+ unlock_up(p, level, lowest_unlock, min_write_lock_level,
2904
+ &write_lock_level);
2905
+
2906
+ if (level == lowest_level) {
2907
+ if (dec)
2908
+ p->slots[level]++;
2909
+ goto done;
2910
+ }
2911
+
2912
+ err = read_block_for_search(root, p, &b, level, slot, key);
2913
+ if (err == -EAGAIN)
2914
+ goto again;
2915
+ if (err) {
2916
+ ret = err;
2917
+ goto done;
2918
+ }
2919
+
2920
+ if (!p->skip_locking) {
2921
+ level = btrfs_header_level(b);
2922
+ if (level <= write_lock_level) {
2923
+ if (!btrfs_try_tree_write_lock(b)) {
2924
+ btrfs_set_path_blocking(p);
2925
+ btrfs_tree_lock(b);
2926
+ }
2927
+ p->locks[level] = BTRFS_WRITE_LOCK;
2928
+ } else {
2929
+ if (!btrfs_tree_read_lock_atomic(b)) {
2930
+ btrfs_set_path_blocking(p);
2931
+ __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
2932
+ p->recurse);
2933
+ }
2934
+ p->locks[level] = BTRFS_READ_LOCK;
2935
+ }
2936
+ p->nodes[level] = b;
30142937 }
30152938 }
30162939 ret = 1;
....@@ -3048,7 +2971,6 @@
30482971 int level;
30492972 int lowest_unlock = 1;
30502973 u8 lowest_level = 0;
3051
- int prev_cmp = -1;
30522974
30532975 lowest_level = p->lowest_level;
30542976 WARN_ON(p->nodes[0] != NULL);
....@@ -3068,9 +2990,10 @@
30682990 p->locks[level] = BTRFS_READ_LOCK;
30692991
30702992 while (b) {
2993
+ int dec = 0;
2994
+
30712995 level = btrfs_header_level(b);
30722996 p->nodes[level] = b;
3073
- btrfs_clear_path_blocking(p, NULL, 0);
30742997
30752998 /*
30762999 * we have a lock on b and as long as we aren't changing
....@@ -3080,57 +3003,49 @@
30803003 */
30813004 btrfs_unlock_up_safe(p, level + 1);
30823005
3083
- /*
3084
- * Since we can unwind ebs we want to do a real search every
3085
- * time.
3086
- */
3087
- prev_cmp = -1;
3088
- ret = key_search(b, key, level, &prev_cmp, &slot);
3006
+ ret = btrfs_bin_search(b, key, &slot);
3007
+ if (ret < 0)
3008
+ goto done;
30893009
3090
- if (level != 0) {
3091
- int dec = 0;
3092
- if (ret && slot > 0) {
3093
- dec = 1;
3094
- slot -= 1;
3095
- }
3096
- p->slots[level] = slot;
3097
- unlock_up(p, level, lowest_unlock, 0, NULL);
3098
-
3099
- if (level == lowest_level) {
3100
- if (dec)
3101
- p->slots[level]++;
3102
- goto done;
3103
- }
3104
-
3105
- err = read_block_for_search(root, p, &b, level,
3106
- slot, key);
3107
- if (err == -EAGAIN)
3108
- goto again;
3109
- if (err) {
3110
- ret = err;
3111
- goto done;
3112
- }
3113
-
3114
- level = btrfs_header_level(b);
3115
- err = btrfs_tree_read_lock_atomic(b);
3116
- if (!err) {
3117
- btrfs_set_path_blocking(p);
3118
- btrfs_tree_read_lock(b);
3119
- btrfs_clear_path_blocking(p, b,
3120
- BTRFS_READ_LOCK);
3121
- }
3122
- b = tree_mod_log_rewind(fs_info, p, b, time_seq);
3123
- if (!b) {
3124
- ret = -ENOMEM;
3125
- goto done;
3126
- }
3127
- p->locks[level] = BTRFS_READ_LOCK;
3128
- p->nodes[level] = b;
3129
- } else {
3010
+ if (level == 0) {
31303011 p->slots[level] = slot;
31313012 unlock_up(p, level, lowest_unlock, 0, NULL);
31323013 goto done;
31333014 }
3015
+
3016
+ if (ret && slot > 0) {
3017
+ dec = 1;
3018
+ slot--;
3019
+ }
3020
+ p->slots[level] = slot;
3021
+ unlock_up(p, level, lowest_unlock, 0, NULL);
3022
+
3023
+ if (level == lowest_level) {
3024
+ if (dec)
3025
+ p->slots[level]++;
3026
+ goto done;
3027
+ }
3028
+
3029
+ err = read_block_for_search(root, p, &b, level, slot, key);
3030
+ if (err == -EAGAIN)
3031
+ goto again;
3032
+ if (err) {
3033
+ ret = err;
3034
+ goto done;
3035
+ }
3036
+
3037
+ level = btrfs_header_level(b);
3038
+ if (!btrfs_tree_read_lock_atomic(b)) {
3039
+ btrfs_set_path_blocking(p);
3040
+ btrfs_tree_read_lock(b);
3041
+ }
3042
+ b = tree_mod_log_rewind(fs_info, p, b, time_seq);
3043
+ if (!b) {
3044
+ ret = -ENOMEM;
3045
+ goto done;
3046
+ }
3047
+ p->locks[level] = BTRFS_READ_LOCK;
3048
+ p->nodes[level] = b;
31343049 }
31353050 ret = 1;
31363051 done:
....@@ -3268,11 +3183,31 @@
32683183 slot = path->slots[0];
32693184 if (slot > 0) {
32703185 btrfs_item_key(eb, &disk_key, slot - 1);
3271
- BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3186
+ if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
3187
+ btrfs_crit(fs_info,
3188
+ "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3189
+ slot, btrfs_disk_key_objectid(&disk_key),
3190
+ btrfs_disk_key_type(&disk_key),
3191
+ btrfs_disk_key_offset(&disk_key),
3192
+ new_key->objectid, new_key->type,
3193
+ new_key->offset);
3194
+ btrfs_print_leaf(eb);
3195
+ BUG();
3196
+ }
32723197 }
32733198 if (slot < btrfs_header_nritems(eb) - 1) {
32743199 btrfs_item_key(eb, &disk_key, slot + 1);
3275
- BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3200
+ if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
3201
+ btrfs_crit(fs_info,
3202
+ "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3203
+ slot, btrfs_disk_key_objectid(&disk_key),
3204
+ btrfs_disk_key_type(&disk_key),
3205
+ btrfs_disk_key_offset(&disk_key),
3206
+ new_key->objectid, new_key->type,
3207
+ new_key->offset);
3208
+ btrfs_print_leaf(eb);
3209
+ BUG();
3210
+ }
32763211 }
32773212
32783213 btrfs_cpu_key_to_disk(&disk_key, new_key);
....@@ -3283,6 +3218,58 @@
32833218 }
32843219
32853220 /*
3221
+ * Check key order of two sibling extent buffers.
3222
+ *
3223
+ * Return true if something is wrong.
3224
+ * Return false if everything is fine.
3225
+ *
3226
+ * Tree-checker only works inside one tree block, thus the following
3227
+ * corruption can not be detected by tree-checker:
3228
+ *
3229
+ * Leaf @left | Leaf @right
3230
+ * --------------------------------------------------------------
3231
+ * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
3232
+ *
3233
+ * Key f6 in leaf @left itself is valid, but not valid when the next
3234
+ * key in leaf @right is 7.
3235
+ * This can only be checked at tree block merge time.
3236
+ * And since tree checker has ensured all key order in each tree block
3237
+ * is correct, we only need to bother the last key of @left and the first
3238
+ * key of @right.
3239
+ */
3240
+static bool check_sibling_keys(struct extent_buffer *left,
3241
+ struct extent_buffer *right)
3242
+{
3243
+ struct btrfs_key left_last;
3244
+ struct btrfs_key right_first;
3245
+ int level = btrfs_header_level(left);
3246
+ int nr_left = btrfs_header_nritems(left);
3247
+ int nr_right = btrfs_header_nritems(right);
3248
+
3249
+ /* No key to check in one of the tree blocks */
3250
+ if (!nr_left || !nr_right)
3251
+ return false;
3252
+
3253
+ if (level) {
3254
+ btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
3255
+ btrfs_node_key_to_cpu(right, &right_first, 0);
3256
+ } else {
3257
+ btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
3258
+ btrfs_item_key_to_cpu(right, &right_first, 0);
3259
+ }
3260
+
3261
+ if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
3262
+ btrfs_crit(left->fs_info,
3263
+"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
3264
+ left_last.objectid, left_last.type,
3265
+ left_last.offset, right_first.objectid,
3266
+ right_first.type, right_first.offset);
3267
+ return true;
3268
+ }
3269
+ return false;
3270
+}
3271
+
3272
+/*
32863273 * try to push data from one node into the next node left in the
32873274 * tree.
32883275 *
....@@ -3290,10 +3277,10 @@
32903277 * error, and > 0 if there was no room in the left hand block.
32913278 */
32923279 static int push_node_left(struct btrfs_trans_handle *trans,
3293
- struct btrfs_fs_info *fs_info,
32943280 struct extent_buffer *dst,
32953281 struct extent_buffer *src, int empty)
32963282 {
3283
+ struct btrfs_fs_info *fs_info = trans->fs_info;
32973284 int push_items = 0;
32983285 int src_nritems;
32993286 int dst_nritems;
....@@ -3326,8 +3313,13 @@
33263313 } else
33273314 push_items = min(src_nritems - 8, push_items);
33283315
3329
- ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
3330
- push_items);
3316
+ /* dst is the left eb, src is the middle eb */
3317
+ if (check_sibling_keys(dst, src)) {
3318
+ ret = -EUCLEAN;
3319
+ btrfs_abort_transaction(trans, ret);
3320
+ return ret;
3321
+ }
3322
+ ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
33313323 if (ret) {
33323324 btrfs_abort_transaction(trans, ret);
33333325 return ret;
....@@ -3365,10 +3357,10 @@
33653357 * this will only push up to 1/2 the contents of the left node over
33663358 */
33673359 static int balance_node_right(struct btrfs_trans_handle *trans,
3368
- struct btrfs_fs_info *fs_info,
33693360 struct extent_buffer *dst,
33703361 struct extent_buffer *src)
33713362 {
3363
+ struct btrfs_fs_info *fs_info = trans->fs_info;
33723364 int push_items = 0;
33733365 int max_push;
33743366 int src_nritems;
....@@ -3395,6 +3387,12 @@
33953387 if (max_push < push_items)
33963388 push_items = max_push;
33973389
3390
+ /* dst is the right eb, src is the middle eb */
3391
+ if (check_sibling_keys(src, dst)) {
3392
+ ret = -EUCLEAN;
3393
+ btrfs_abort_transaction(trans, ret);
3394
+ return ret;
3395
+ }
33983396 ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
33993397 BUG_ON(ret < 0);
34003398 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
....@@ -3402,8 +3400,8 @@
34023400 (dst_nritems) *
34033401 sizeof(struct btrfs_key_ptr));
34043402
3405
- ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
3406
- src_nritems - push_items, push_items);
3403
+ ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
3404
+ push_items);
34073405 if (ret) {
34083406 btrfs_abort_transaction(trans, ret);
34093407 return ret;
....@@ -3451,7 +3449,8 @@
34513449 btrfs_node_key(lower, &lower_key, 0);
34523450
34533451 c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
3454
- root->node->start, 0);
3452
+ root->node->start, 0,
3453
+ BTRFS_NESTING_NEW_ROOT);
34553454 if (IS_ERR(c))
34563455 return PTR_ERR(c);
34573456
....@@ -3476,7 +3475,7 @@
34763475 free_extent_buffer(old);
34773476
34783477 add_root_to_dirty_list(root);
3479
- extent_buffer_get(c);
3478
+ atomic_inc(&c->refs);
34803479 path->nodes[level] = c;
34813480 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
34823481 path->slots[level] = 0;
....@@ -3491,7 +3490,7 @@
34913490 * blocknr is the block the key points to.
34923491 */
34933492 static void insert_ptr(struct btrfs_trans_handle *trans,
3494
- struct btrfs_fs_info *fs_info, struct btrfs_path *path,
3493
+ struct btrfs_path *path,
34953494 struct btrfs_disk_key *key, u64 bytenr,
34963495 int slot, int level)
34973496 {
....@@ -3504,7 +3503,7 @@
35043503 lower = path->nodes[level];
35053504 nritems = btrfs_header_nritems(lower);
35063505 BUG_ON(slot > nritems);
3507
- BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
3506
+ BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
35083507 if (slot != nritems) {
35093508 if (level) {
35103509 ret = tree_mod_log_insert_move(lower, slot + 1, slot,
....@@ -3581,15 +3580,17 @@
35813580 btrfs_node_key(c, &disk_key, mid);
35823581
35833582 split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3584
- c->start, 0);
3583
+ c->start, 0, BTRFS_NESTING_SPLIT);
35853584 if (IS_ERR(split))
35863585 return PTR_ERR(split);
35873586
35883587 root_add_used(root, fs_info->nodesize);
35893588 ASSERT(btrfs_header_level(c) == level);
35903589
3591
- ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
3590
+ ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
35923591 if (ret) {
3592
+ btrfs_tree_unlock(split);
3593
+ free_extent_buffer(split);
35933594 btrfs_abort_transaction(trans, ret);
35943595 return ret;
35953596 }
....@@ -3604,7 +3605,7 @@
36043605 btrfs_mark_buffer_dirty(c);
36053606 btrfs_mark_buffer_dirty(split);
36063607
3607
- insert_ptr(trans, fs_info, path, &disk_key, split->start,
3608
+ insert_ptr(trans, path, &disk_key, split->start,
36083609 path->slots[level + 1] + 1, level + 1);
36093610
36103611 if (path->slots[level] >= mid) {
....@@ -3629,19 +3630,17 @@
36293630 {
36303631 struct btrfs_item *start_item;
36313632 struct btrfs_item *end_item;
3632
- struct btrfs_map_token token;
36333633 int data_len;
36343634 int nritems = btrfs_header_nritems(l);
36353635 int end = min(nritems, start + nr) - 1;
36363636
36373637 if (!nr)
36383638 return 0;
3639
- btrfs_init_map_token(&token);
36403639 start_item = btrfs_item_nr(start);
36413640 end_item = btrfs_item_nr(end);
3642
- data_len = btrfs_token_item_offset(l, start_item, &token) +
3643
- btrfs_token_item_size(l, start_item, &token);
3644
- data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3641
+ data_len = btrfs_item_offset(l, start_item) +
3642
+ btrfs_item_size(l, start_item);
3643
+ data_len = data_len - btrfs_item_offset(l, end_item);
36453644 data_len += sizeof(struct btrfs_item) * nr;
36463645 WARN_ON(data_len < 0);
36473646 return data_len;
....@@ -3652,9 +3651,9 @@
36523651 * the start of the leaf data. IOW, how much room
36533652 * the leaf has left for both items and data
36543653 */
3655
-noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3656
- struct extent_buffer *leaf)
3654
+noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
36573655 {
3656
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
36583657 int nritems = btrfs_header_nritems(leaf);
36593658 int ret;
36603659
....@@ -3673,13 +3672,13 @@
36733672 * min slot controls the lowest index we're willing to push to the
36743673 * right. We'll push up to and including min_slot, but no lower
36753674 */
3676
-static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
3677
- struct btrfs_path *path,
3675
+static noinline int __push_leaf_right(struct btrfs_path *path,
36783676 int data_size, int empty,
36793677 struct extent_buffer *right,
36803678 int free_space, u32 left_nritems,
36813679 u32 min_slot)
36823680 {
3681
+ struct btrfs_fs_info *fs_info = right->fs_info;
36833682 struct extent_buffer *left = path->nodes[0];
36843683 struct extent_buffer *upper = path->nodes[1];
36853684 struct btrfs_map_token token;
....@@ -3693,8 +3692,6 @@
36933692 u32 right_nritems;
36943693 u32 data_end;
36953694 u32 this_item_size;
3696
-
3697
- btrfs_init_map_token(&token);
36983695
36993696 if (empty)
37003697 nr = 0;
....@@ -3713,7 +3710,8 @@
37133710 if (path->slots[0] > i)
37143711 break;
37153712 if (path->slots[0] == i) {
3716
- int space = btrfs_leaf_free_space(fs_info, left);
3713
+ int space = btrfs_leaf_free_space(left);
3714
+
37173715 if (space + push_space * 2 > free_space)
37183716 break;
37193717 }
....@@ -3742,10 +3740,10 @@
37423740 right_nritems = btrfs_header_nritems(right);
37433741
37443742 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3745
- push_space -= leaf_data_end(fs_info, left);
3743
+ push_space -= leaf_data_end(left);
37463744
37473745 /* make room in the right data area */
3748
- data_end = leaf_data_end(fs_info, right);
3746
+ data_end = leaf_data_end(right);
37493747 memmove_extent_buffer(right,
37503748 BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
37513749 BTRFS_LEAF_DATA_OFFSET + data_end,
....@@ -3754,7 +3752,7 @@
37543752 /* copy from the left data area */
37553753 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
37563754 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3757
- BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
3755
+ BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
37583756 push_space);
37593757
37603758 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
....@@ -3767,13 +3765,14 @@
37673765 push_items * sizeof(struct btrfs_item));
37683766
37693767 /* update the item pointers */
3768
+ btrfs_init_map_token(&token, right);
37703769 right_nritems += push_items;
37713770 btrfs_set_header_nritems(right, right_nritems);
37723771 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
37733772 for (i = 0; i < right_nritems; i++) {
37743773 item = btrfs_item_nr(i);
3775
- push_space -= btrfs_token_item_size(right, item, &token);
3776
- btrfs_set_token_item_offset(right, item, push_space, &token);
3774
+ push_space -= btrfs_token_item_size(&token, item);
3775
+ btrfs_set_token_item_offset(&token, item, push_space);
37773776 }
37783777
37793778 left_nritems -= push_items;
....@@ -3782,7 +3781,7 @@
37823781 if (left_nritems)
37833782 btrfs_mark_buffer_dirty(left);
37843783 else
3785
- clean_tree_block(fs_info, left);
3784
+ btrfs_clean_tree_block(left);
37863785
37873786 btrfs_mark_buffer_dirty(right);
37883787
....@@ -3794,7 +3793,7 @@
37943793 if (path->slots[0] >= left_nritems) {
37953794 path->slots[0] -= left_nritems;
37963795 if (btrfs_header_nritems(path->nodes[0]) == 0)
3797
- clean_tree_block(fs_info, path->nodes[0]);
3796
+ btrfs_clean_tree_block(path->nodes[0]);
37983797 btrfs_tree_unlock(path->nodes[0]);
37993798 free_extent_buffer(path->nodes[0]);
38003799 path->nodes[0] = right;
....@@ -3826,7 +3825,6 @@
38263825 int min_data_size, int data_size,
38273826 int empty, u32 min_slot)
38283827 {
3829
- struct btrfs_fs_info *fs_info = root->fs_info;
38303828 struct extent_buffer *left = path->nodes[0];
38313829 struct extent_buffer *right;
38323830 struct extent_buffer *upper;
....@@ -3845,7 +3843,7 @@
38453843
38463844 btrfs_assert_tree_locked(path->nodes[1]);
38473845
3848
- right = read_node_slot(fs_info, upper, slot + 1);
3846
+ right = btrfs_read_node_slot(upper, slot + 1);
38493847 /*
38503848 * slot + 1 is not valid or we fail to read the right node,
38513849 * no big deal, just return.
....@@ -3853,20 +3851,20 @@
38533851 if (IS_ERR(right))
38543852 return 1;
38553853
3856
- btrfs_tree_lock(right);
3857
- btrfs_set_lock_blocking(right);
3854
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3855
+ btrfs_set_lock_blocking_write(right);
38583856
3859
- free_space = btrfs_leaf_free_space(fs_info, right);
3857
+ free_space = btrfs_leaf_free_space(right);
38603858 if (free_space < data_size)
38613859 goto out_unlock;
38623860
38633861 /* cow and double check */
38643862 ret = btrfs_cow_block(trans, root, right, upper,
3865
- slot + 1, &right);
3863
+ slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
38663864 if (ret)
38673865 goto out_unlock;
38683866
3869
- free_space = btrfs_leaf_free_space(fs_info, right);
3867
+ free_space = btrfs_leaf_free_space(right);
38703868 if (free_space < data_size)
38713869 goto out_unlock;
38723870
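Sibling leaves are now locked with an explicit lockdep nesting class, the blocking conversion uses the write-specific helper, and the COW call carries a matching nesting hint. A before/after sketch with the identifiers from the hunks above:

	/* pre-patch */
	btrfs_tree_lock(right);
	btrfs_set_lock_blocking(right);
	ret = btrfs_cow_block(trans, root, right, upper, slot + 1, &right);

	/* post-patch: tell lockdep this is the right sibling of a node we already hold */
	__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
	btrfs_set_lock_blocking_write(right);
	ret = btrfs_cow_block(trans, root, right, upper, slot + 1, &right,
			      BTRFS_NESTING_RIGHT_COW);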
....@@ -3874,11 +3872,18 @@
38743872 if (left_nritems == 0)
38753873 goto out_unlock;
38763874
3875
+ if (check_sibling_keys(left, right)) {
3876
+ ret = -EUCLEAN;
3877
+ btrfs_abort_transaction(trans, ret);
3878
+ btrfs_tree_unlock(right);
3879
+ free_extent_buffer(right);
3880
+ return ret;
3881
+ }
38773882 if (path->slots[0] == left_nritems && !empty) {
38783883 /* Key greater than all keys in the leaf, right neighbor has
38793884 * enough room for it and we're not emptying our leaf to delete
38803885 * it, therefore use right neighbor to insert the new item and
3881
- * no need to touch/dirty our left leaft. */
3886
+ * no need to touch/dirty our left leaf. */
38823887 btrfs_tree_unlock(left);
38833888 free_extent_buffer(left);
38843889 path->nodes[0] = right;
....@@ -3887,7 +3892,7 @@
38873892 return 0;
38883893 }
38893894
3890
- return __push_leaf_right(fs_info, path, min_data_size, empty,
3895
+ return __push_leaf_right(path, min_data_size, empty,
38913896 right, free_space, left_nritems, min_slot);
38923897 out_unlock:
38933898 btrfs_tree_unlock(right);
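push_leaf_right() (and push_leaf_left() further down) now refuse to merge siblings whose keys are mis-ordered, treating that as corruption: the transaction is aborted and -EUCLEAN is returned. check_sibling_keys() itself is introduced elsewhere in this patch, so the sketch below only illustrates the invariant being enforced, not the actual implementation:

	/* Illustrative only: the last key of left must sort before the first key of right. */
	static int check_sibling_keys_sketch(struct extent_buffer *left,
					     struct extent_buffer *right)
	{
		struct btrfs_key left_last, right_first;
		int nr = btrfs_header_nritems(left);

		if (btrfs_header_level(left) == 0) {
			btrfs_item_key_to_cpu(left, &left_last, nr - 1);
			btrfs_item_key_to_cpu(right, &right_first, 0);
		} else {
			btrfs_node_key_to_cpu(left, &left_last, nr - 1);
			btrfs_node_key_to_cpu(right, &right_first, 0);
		}
		/* non-zero means the siblings are out of order */
		return btrfs_comp_cpu_keys(&left_last, &right_first) >= 0;
	}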
....@@ -3903,12 +3908,12 @@
39033908 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
39043909 * items
39053910 */
3906
-static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
3907
- struct btrfs_path *path, int data_size,
3911
+static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
39083912 int empty, struct extent_buffer *left,
39093913 int free_space, u32 right_nritems,
39103914 u32 max_slot)
39113915 {
3916
+ struct btrfs_fs_info *fs_info = left->fs_info;
39123917 struct btrfs_disk_key disk_key;
39133918 struct extent_buffer *right = path->nodes[0];
39143919 int i;
....@@ -3922,8 +3927,6 @@
39223927 u32 old_left_item_size;
39233928 struct btrfs_map_token token;
39243929
3925
- btrfs_init_map_token(&token);
3926
-
39273930 if (empty)
39283931 nr = min(right_nritems, max_slot);
39293932 else
....@@ -3936,7 +3939,8 @@
39363939 if (path->slots[0] < i)
39373940 break;
39383941 if (path->slots[0] == i) {
3939
- int space = btrfs_leaf_free_space(fs_info, right);
3942
+ int space = btrfs_leaf_free_space(right);
3943
+
39403944 if (space + push_space * 2 > free_space)
39413945 break;
39423946 }
....@@ -3969,23 +3973,23 @@
39693973 btrfs_item_offset_nr(right, push_items - 1);
39703974
39713975 copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3972
- leaf_data_end(fs_info, left) - push_space,
3976
+ leaf_data_end(left) - push_space,
39733977 BTRFS_LEAF_DATA_OFFSET +
39743978 btrfs_item_offset_nr(right, push_items - 1),
39753979 push_space);
39763980 old_left_nritems = btrfs_header_nritems(left);
39773981 BUG_ON(old_left_nritems <= 0);
39783982
3983
+ btrfs_init_map_token(&token, left);
39793984 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
39803985 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
39813986 u32 ioff;
39823987
39833988 item = btrfs_item_nr(i);
39843989
3985
- ioff = btrfs_token_item_offset(left, item, &token);
3986
- btrfs_set_token_item_offset(left, item,
3987
- ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
3988
- &token);
3990
+ ioff = btrfs_token_item_offset(&token, item);
3991
+ btrfs_set_token_item_offset(&token, item,
3992
+ ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
39893993 }
39903994 btrfs_set_header_nritems(left, old_left_nritems + push_items);
39913995
....@@ -3996,33 +4000,34 @@
39964000
39974001 if (push_items < right_nritems) {
39984002 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3999
- leaf_data_end(fs_info, right);
4003
+ leaf_data_end(right);
40004004 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
40014005 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
40024006 BTRFS_LEAF_DATA_OFFSET +
4003
- leaf_data_end(fs_info, right), push_space);
4007
+ leaf_data_end(right), push_space);
40044008
40054009 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
40064010 btrfs_item_nr_offset(push_items),
40074011 (btrfs_header_nritems(right) - push_items) *
40084012 sizeof(struct btrfs_item));
40094013 }
4014
+
4015
+ btrfs_init_map_token(&token, right);
40104016 right_nritems -= push_items;
40114017 btrfs_set_header_nritems(right, right_nritems);
40124018 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
40134019 for (i = 0; i < right_nritems; i++) {
40144020 item = btrfs_item_nr(i);
40154021
4016
- push_space = push_space - btrfs_token_item_size(right,
4017
- item, &token);
4018
- btrfs_set_token_item_offset(right, item, push_space, &token);
4022
+ push_space = push_space - btrfs_token_item_size(&token, item);
4023
+ btrfs_set_token_item_offset(&token, item, push_space);
40194024 }
40204025
40214026 btrfs_mark_buffer_dirty(left);
40224027 if (right_nritems)
40234028 btrfs_mark_buffer_dirty(right);
40244029 else
4025
- clean_tree_block(fs_info, right);
4030
+ btrfs_clean_tree_block(right);
40264031
40274032 btrfs_item_key(right, &disk_key, 0);
40284033 fixup_low_keys(path, &disk_key, 1);
....@@ -4059,7 +4064,6 @@
40594064 *root, struct btrfs_path *path, int min_data_size,
40604065 int data_size, int empty, u32 max_slot)
40614066 {
4062
- struct btrfs_fs_info *fs_info = root->fs_info;
40634067 struct extent_buffer *right = path->nodes[0];
40644068 struct extent_buffer *left;
40654069 int slot;
....@@ -4079,7 +4083,7 @@
40794083
40804084 btrfs_assert_tree_locked(path->nodes[1]);
40814085
4082
- left = read_node_slot(fs_info, path->nodes[1], slot - 1);
4086
+ left = btrfs_read_node_slot(path->nodes[1], slot - 1);
40834087 /*
40844088 * slot - 1 is not valid or we fail to read the left node,
40854089 * no big deal, just return.
....@@ -4087,10 +4091,10 @@
40874091 if (IS_ERR(left))
40884092 return 1;
40894093
4090
- btrfs_tree_lock(left);
4091
- btrfs_set_lock_blocking(left);
4094
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
4095
+ btrfs_set_lock_blocking_write(left);
40924096
4093
- free_space = btrfs_leaf_free_space(fs_info, left);
4097
+ free_space = btrfs_leaf_free_space(left);
40944098 if (free_space < data_size) {
40954099 ret = 1;
40964100 goto out;
....@@ -4098,7 +4102,8 @@
40984102
40994103 /* cow and double check */
41004104 ret = btrfs_cow_block(trans, root, left,
4101
- path->nodes[1], slot - 1, &left);
4105
+ path->nodes[1], slot - 1, &left,
4106
+ BTRFS_NESTING_LEFT_COW);
41024107 if (ret) {
41034108 /* we hit -ENOSPC, but it isn't fatal here */
41044109 if (ret == -ENOSPC)
....@@ -4106,13 +4111,18 @@
41064111 goto out;
41074112 }
41084113
4109
- free_space = btrfs_leaf_free_space(fs_info, left);
4114
+ free_space = btrfs_leaf_free_space(left);
41104115 if (free_space < data_size) {
41114116 ret = 1;
41124117 goto out;
41134118 }
41144119
4115
- return __push_leaf_left(fs_info, path, min_data_size,
4120
+ if (check_sibling_keys(left, right)) {
4121
+ ret = -EUCLEAN;
4122
+ btrfs_abort_transaction(trans, ret);
4123
+ goto out;
4124
+ }
4125
+ return __push_leaf_left(path, min_data_size,
41164126 empty, left, free_space, right_nritems,
41174127 max_slot);
41184128 out:
....@@ -4126,23 +4136,21 @@
41264136 * available for the resulting leaf level of the path.
41274137 */
41284138 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4129
- struct btrfs_fs_info *fs_info,
41304139 struct btrfs_path *path,
41314140 struct extent_buffer *l,
41324141 struct extent_buffer *right,
41334142 int slot, int mid, int nritems)
41344143 {
4144
+ struct btrfs_fs_info *fs_info = trans->fs_info;
41354145 int data_copy_size;
41364146 int rt_data_off;
41374147 int i;
41384148 struct btrfs_disk_key disk_key;
41394149 struct btrfs_map_token token;
41404150
4141
- btrfs_init_map_token(&token);
4142
-
41434151 nritems = nritems - mid;
41444152 btrfs_set_header_nritems(right, nritems);
4145
- data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);
4153
+ data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
41464154
41474155 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
41484156 btrfs_item_nr_offset(mid),
....@@ -4151,23 +4159,22 @@
41514159 copy_extent_buffer(right, l,
41524160 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
41534161 data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4154
- leaf_data_end(fs_info, l), data_copy_size);
4162
+ leaf_data_end(l), data_copy_size);
41554163
41564164 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
41574165
4166
+ btrfs_init_map_token(&token, right);
41584167 for (i = 0; i < nritems; i++) {
41594168 struct btrfs_item *item = btrfs_item_nr(i);
41604169 u32 ioff;
41614170
4162
- ioff = btrfs_token_item_offset(right, item, &token);
4163
- btrfs_set_token_item_offset(right, item,
4164
- ioff + rt_data_off, &token);
4171
+ ioff = btrfs_token_item_offset(&token, item);
4172
+ btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
41654173 }
41664174
41674175 btrfs_set_header_nritems(l, mid);
41684176 btrfs_item_key(right, &disk_key, 0);
4169
- insert_ptr(trans, fs_info, path, &disk_key, right->start,
4170
- path->slots[1] + 1, 1);
4177
+ insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
41714178
41724179 btrfs_mark_buffer_dirty(right);
41734180 btrfs_mark_buffer_dirty(l);
....@@ -4202,7 +4209,6 @@
42024209 struct btrfs_path *path,
42034210 int data_size)
42044211 {
4205
- struct btrfs_fs_info *fs_info = root->fs_info;
42064212 int ret;
42074213 int progress = 0;
42084214 int slot;
....@@ -4211,7 +4217,7 @@
42114217
42124218 slot = path->slots[0];
42134219 if (slot < btrfs_header_nritems(path->nodes[0]))
4214
- space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4220
+ space_needed -= btrfs_leaf_free_space(path->nodes[0]);
42154221
42164222 /*
42174223 * try to push all the items after our slot into the
....@@ -4232,14 +4238,14 @@
42324238 if (path->slots[0] == 0 || path->slots[0] == nritems)
42334239 return 0;
42344240
4235
- if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4241
+ if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
42364242 return 0;
42374243
42384244 /* try to push all the items before our slot into the next leaf */
42394245 slot = path->slots[0];
42404246 space_needed = data_size;
42414247 if (slot > 0)
4242
- space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4248
+ space_needed -= btrfs_leaf_free_space(path->nodes[0]);
42434249 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
42444250 if (ret < 0)
42454251 return ret;
....@@ -4288,7 +4294,7 @@
42884294 int space_needed = data_size;
42894295
42904296 if (slot < btrfs_header_nritems(l))
4291
- space_needed -= btrfs_leaf_free_space(fs_info, l);
4297
+ space_needed -= btrfs_leaf_free_space(l);
42924298
42934299 wret = push_leaf_right(trans, root, path, space_needed,
42944300 space_needed, 0, 0);
....@@ -4297,8 +4303,7 @@
42974303 if (wret) {
42984304 space_needed = data_size;
42994305 if (slot > 0)
4300
- space_needed -= btrfs_leaf_free_space(fs_info,
4301
- l);
4306
+ space_needed -= btrfs_leaf_free_space(l);
43024307 wret = push_leaf_left(trans, root, path, space_needed,
43034308 space_needed, 0, (u32)-1);
43044309 if (wret < 0)
....@@ -4307,7 +4312,7 @@
43074312 l = path->nodes[0];
43084313
43094314 /* did the pushes work? */
4310
- if (btrfs_leaf_free_space(fs_info, l) >= data_size)
4315
+ if (btrfs_leaf_free_space(l) >= data_size)
43114316 return 0;
43124317 }
43134318
....@@ -4365,8 +4370,18 @@
43654370 else
43664371 btrfs_item_key(l, &disk_key, mid);
43674372
4373
+ /*
4374
+ * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double

4375
+ * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
4376
+ * subclasses, which is 8 at the time of this patch, and we've maxed it
4377
+ * out. In the future we could add a
4378
+ * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
4379
+ * use BTRFS_NESTING_NEW_ROOT.
4380
+ */
43684381 right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4369
- l->start, 0);
4382
+ l->start, 0, num_doubles ?
4383
+ BTRFS_NESTING_NEW_ROOT :
4384
+ BTRFS_NESTING_SPLIT);
43704385 if (IS_ERR(right))
43714386 return PTR_ERR(right);
43724387
....@@ -4375,7 +4390,7 @@
43754390 if (split == 0) {
43764391 if (mid <= slot) {
43774392 btrfs_set_header_nritems(right, 0);
4378
- insert_ptr(trans, fs_info, path, &disk_key,
4393
+ insert_ptr(trans, path, &disk_key,
43794394 right->start, path->slots[1] + 1, 1);
43804395 btrfs_tree_unlock(path->nodes[0]);
43814396 free_extent_buffer(path->nodes[0]);
....@@ -4384,7 +4399,7 @@
43844399 path->slots[1] += 1;
43854400 } else {
43864401 btrfs_set_header_nritems(right, 0);
4387
- insert_ptr(trans, fs_info, path, &disk_key,
4402
+ insert_ptr(trans, path, &disk_key,
43884403 right->start, path->slots[1], 1);
43894404 btrfs_tree_unlock(path->nodes[0]);
43904405 free_extent_buffer(path->nodes[0]);
....@@ -4401,7 +4416,7 @@
44014416 return ret;
44024417 }
44034418
4404
- copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);
4419
+ copy_for_split(trans, path, l, right, slot, mid, nritems);
44054420
44064421 if (split == 2) {
44074422 BUG_ON(num_doubles != 0);
....@@ -4414,7 +4429,7 @@
44144429 push_for_double:
44154430 push_for_double_split(trans, root, path, data_size);
44164431 tried_avoid_double = 1;
4417
- if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4432
+ if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
44184433 return 0;
44194434 goto again;
44204435 }
....@@ -4423,7 +4438,6 @@
44234438 struct btrfs_root *root,
44244439 struct btrfs_path *path, int ins_len)
44254440 {
4426
- struct btrfs_fs_info *fs_info = root->fs_info;
44274441 struct btrfs_key key;
44284442 struct extent_buffer *leaf;
44294443 struct btrfs_file_extent_item *fi;
....@@ -4437,7 +4451,7 @@
44374451 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
44384452 key.type != BTRFS_EXTENT_CSUM_KEY);
44394453
4440
- if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
4454
+ if (btrfs_leaf_free_space(leaf) >= ins_len)
44414455 return 0;
44424456
44434457 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
....@@ -4464,7 +4478,7 @@
44644478 goto err;
44654479
44664480 /* the leaf has changed, it now has room. return now */
4467
- if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
4481
+ if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
44684482 goto err;
44694483
44704484 if (key.type == BTRFS_EXTENT_DATA_KEY) {
....@@ -4487,8 +4501,7 @@
44874501 return ret;
44884502 }
44894503
4490
-static noinline int split_item(struct btrfs_fs_info *fs_info,
4491
- struct btrfs_path *path,
4504
+static noinline int split_item(struct btrfs_path *path,
44924505 const struct btrfs_key *new_key,
44934506 unsigned long split_offset)
44944507 {
....@@ -4503,7 +4516,7 @@
45034516 struct btrfs_disk_key disk_key;
45044517
45054518 leaf = path->nodes[0];
4506
- BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));
4519
+ BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
45074520
45084521 btrfs_set_path_blocking(path);
45094522
....@@ -4552,7 +4565,7 @@
45524565 item_size - split_offset);
45534566 btrfs_mark_buffer_dirty(leaf);
45544567
4555
- BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
4568
+ BUG_ON(btrfs_leaf_free_space(leaf) < 0);
45564569 kfree(buf);
45574570 return 0;
45584571 }
....@@ -4584,7 +4597,7 @@
45844597 if (ret)
45854598 return ret;
45864599
4587
- ret = split_item(root->fs_info, path, new_key, split_offset);
4600
+ ret = split_item(path, new_key, split_offset);
45884601 return ret;
45894602 }
45904603
....@@ -4613,9 +4626,7 @@
46134626 return ret;
46144627
46154628 path->slots[0]++;
4616
- setup_items_for_insert(root, path, new_key, &item_size,
4617
- item_size, item_size +
4618
- sizeof(struct btrfs_item), 1);
4629
+ setup_items_for_insert(root, path, new_key, &item_size, 1);
46194630 leaf = path->nodes[0];
46204631 memcpy_extent_buffer(leaf,
46214632 btrfs_item_ptr_offset(leaf, path->slots[0]),
....@@ -4630,8 +4641,7 @@
46304641 * off the end of the item or if we shift the item to chop bytes off
46314642 * the front.
46324643 */
4633
-void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
4634
- struct btrfs_path *path, u32 new_size, int from_end)
4644
+void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
46354645 {
46364646 int slot;
46374647 struct extent_buffer *leaf;
....@@ -4644,8 +4654,6 @@
46444654 int i;
46454655 struct btrfs_map_token token;
46464656
4647
- btrfs_init_map_token(&token);
4648
-
46494657 leaf = path->nodes[0];
46504658 slot = path->slots[0];
46514659
....@@ -4654,7 +4662,7 @@
46544662 return;
46554663
46564664 nritems = btrfs_header_nritems(leaf);
4657
- data_end = leaf_data_end(fs_info, leaf);
4665
+ data_end = leaf_data_end(leaf);
46584666
46594667 old_data_start = btrfs_item_offset_nr(leaf, slot);
46604668
....@@ -4667,13 +4675,13 @@
46674675 * item0..itemN ... dataN.offset..dataN.size .. data0.size
46684676 */
46694677 /* first correct the data pointers */
4678
+ btrfs_init_map_token(&token, leaf);
46704679 for (i = slot; i < nritems; i++) {
46714680 u32 ioff;
46724681 item = btrfs_item_nr(i);
46734682
4674
- ioff = btrfs_token_item_offset(leaf, item, &token);
4675
- btrfs_set_token_item_offset(leaf, item,
4676
- ioff + size_diff, &token);
4683
+ ioff = btrfs_token_item_offset(&token, item);
4684
+ btrfs_set_token_item_offset(&token, item, ioff + size_diff);
46774685 }
46784686
46794687 /* shift the data */
....@@ -4720,7 +4728,7 @@
47204728 btrfs_set_item_size(leaf, item, new_size);
47214729 btrfs_mark_buffer_dirty(leaf);
47224730
4723
- if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4731
+ if (btrfs_leaf_free_space(leaf) < 0) {
47244732 btrfs_print_leaf(leaf);
47254733 BUG();
47264734 }
....@@ -4729,8 +4737,7 @@
47294737 /*
47304738 * make the item pointed to by the path bigger, data_size is the added size.
47314739 */
4732
-void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
4733
- u32 data_size)
4740
+void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
47344741 {
47354742 int slot;
47364743 struct extent_buffer *leaf;
....@@ -4742,14 +4749,12 @@
47424749 int i;
47434750 struct btrfs_map_token token;
47444751
4745
- btrfs_init_map_token(&token);
4746
-
47474752 leaf = path->nodes[0];
47484753
47494754 nritems = btrfs_header_nritems(leaf);
4750
- data_end = leaf_data_end(fs_info, leaf);
4755
+ data_end = leaf_data_end(leaf);
47514756
4752
- if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
4757
+ if (btrfs_leaf_free_space(leaf) < data_size) {
47534758 btrfs_print_leaf(leaf);
47544759 BUG();
47554760 }
....@@ -4759,22 +4764,22 @@
47594764 BUG_ON(slot < 0);
47604765 if (slot >= nritems) {
47614766 btrfs_print_leaf(leaf);
4762
- btrfs_crit(fs_info, "slot %d too large, nritems %d",
4767
+ btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
47634768 slot, nritems);
4764
- BUG_ON(1);
4769
+ BUG();
47654770 }
47664771
47674772 /*
47684773 * item0..itemN ... dataN.offset..dataN.size .. data0.size
47694774 */
47704775 /* first correct the data pointers */
4776
+ btrfs_init_map_token(&token, leaf);
47714777 for (i = slot; i < nritems; i++) {
47724778 u32 ioff;
47734779 item = btrfs_item_nr(i);
47744780
4775
- ioff = btrfs_token_item_offset(leaf, item, &token);
4776
- btrfs_set_token_item_offset(leaf, item,
4777
- ioff - data_size, &token);
4781
+ ioff = btrfs_token_item_offset(&token, item);
4782
+ btrfs_set_token_item_offset(&token, item, ioff - data_size);
47784783 }
47794784
47804785 /* shift the data */
....@@ -4788,20 +4793,26 @@
47884793 btrfs_set_item_size(leaf, item, old_size + data_size);
47894794 btrfs_mark_buffer_dirty(leaf);
47904795
4791
- if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4796
+ if (btrfs_leaf_free_space(leaf) < 0) {
47924797 btrfs_print_leaf(leaf);
47934798 BUG();
47944799 }
47954800 }
47964801
4797
-/*
4798
- * this is a helper for btrfs_insert_empty_items, the main goal here is
4799
- * to save stack depth by doing the bulk of the work in a function
4800
- * that doesn't call btrfs_search_slot
4802
+/**
4803
+ * setup_items_for_insert - Helper called before inserting one or more items
4804
+ * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
4805
+ * in a function that doesn't call btrfs_search_slot
4806
+ *
4807
+ * @root: root we are inserting items to
4808
+ * @path: points to the leaf/slot where we are going to insert new items
4809
+ * @cpu_key: array of keys for items to be inserted
4810
+ * @data_size: size of the body of each item we are going to insert
4811
+ * @nr: size of @cpu_key/@data_size arrays
48014812 */
48024813 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
48034814 const struct btrfs_key *cpu_key, u32 *data_size,
4804
- u32 total_data, u32 total_size, int nr)
4815
+ int nr)
48054816 {
48064817 struct btrfs_fs_info *fs_info = root->fs_info;
48074818 struct btrfs_item *item;
....@@ -4812,6 +4823,12 @@
48124823 struct extent_buffer *leaf;
48134824 int slot;
48144825 struct btrfs_map_token token;
4826
+ u32 total_size;
4827
+ u32 total_data = 0;
4828
+
4829
+ for (i = 0; i < nr; i++)
4830
+ total_data += data_size[i];
4831
+ total_size = total_data + (nr * sizeof(struct btrfs_item));
48154832
48164833 if (path->slots[0] == 0) {
48174834 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
....@@ -4819,29 +4836,29 @@
48194836 }
48204837 btrfs_unlock_up_safe(path, 1);
48214838
4822
- btrfs_init_map_token(&token);
4823
-
48244839 leaf = path->nodes[0];
48254840 slot = path->slots[0];
48264841
48274842 nritems = btrfs_header_nritems(leaf);
4828
- data_end = leaf_data_end(fs_info, leaf);
4843
+ data_end = leaf_data_end(leaf);
48294844
4830
- if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
4845
+ if (btrfs_leaf_free_space(leaf) < total_size) {
48314846 btrfs_print_leaf(leaf);
48324847 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4833
- total_size, btrfs_leaf_free_space(fs_info, leaf));
4848
+ total_size, btrfs_leaf_free_space(leaf));
48344849 BUG();
48354850 }
48364851
4852
+ btrfs_init_map_token(&token, leaf);
48374853 if (slot != nritems) {
48384854 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
48394855
48404856 if (old_data < data_end) {
48414857 btrfs_print_leaf(leaf);
4842
- btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
4858
+ btrfs_crit(fs_info,
4859
+ "item at slot %d with data offset %u beyond data end of leaf %u",
48434860 slot, old_data, data_end);
4844
- BUG_ON(1);
4861
+ BUG();
48454862 }
48464863 /*
48474864 * item0..itemN ... dataN.offset..dataN.size .. data0.size
....@@ -4851,9 +4868,9 @@
48514868 u32 ioff;
48524869
48534870 item = btrfs_item_nr(i);
4854
- ioff = btrfs_token_item_offset(leaf, item, &token);
4855
- btrfs_set_token_item_offset(leaf, item,
4856
- ioff - total_data, &token);
4871
+ ioff = btrfs_token_item_offset(&token, item);
4872
+ btrfs_set_token_item_offset(&token, item,
4873
+ ioff - total_data);
48574874 }
48584875 /* shift the items */
48594876 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
....@@ -4872,16 +4889,15 @@
48724889 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
48734890 btrfs_set_item_key(leaf, &disk_key, slot + i);
48744891 item = btrfs_item_nr(slot + i);
4875
- btrfs_set_token_item_offset(leaf, item,
4876
- data_end - data_size[i], &token);
48774892 data_end -= data_size[i];
4878
- btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4893
+ btrfs_set_token_item_offset(&token, item, data_end);
4894
+ btrfs_set_token_item_size(&token, item, data_size[i]);
48794895 }
48804896
48814897 btrfs_set_header_nritems(leaf, nritems + nr);
48824898 btrfs_mark_buffer_dirty(leaf);
48834899
4884
- if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4900
+ if (btrfs_leaf_free_space(leaf) < 0) {
48854901 btrfs_print_leaf(leaf);
48864902 BUG();
48874903 }
....@@ -4916,8 +4932,7 @@
49164932 slot = path->slots[0];
49174933 BUG_ON(slot < 0);
49184934
4919
- setup_items_for_insert(root, path, cpu_key, data_size,
4920
- total_data, total_size, nr);
4935
+ setup_items_for_insert(root, path, cpu_key, data_size, nr);
49214936 return 0;
49224937 }
49234938
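setup_items_for_insert() now derives total_data and total_size from the data_size array itself (see the summing loop added earlier in this patch), so callers pass only the per-item sizes and the item count. A sketch of the call-site change, as in btrfs_insert_empty_items() above:

	/* pre-patch: every caller pre-computed both totals */
	for (i = 0; i < nr; i++)
		total_data += data_size[i];
	total_size = total_data + nr * sizeof(struct btrfs_item);
	setup_items_for_insert(root, path, cpu_key, data_size,
			       total_data, total_size, nr);

	/* post-patch */
	setup_items_for_insert(root, path, cpu_key, data_size, nr);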
....@@ -5020,7 +5035,7 @@
50205035
50215036 root_sub_used(root, leaf->len);
50225037
5023
- extent_buffer_get(leaf);
5038
+ atomic_inc(&leaf->refs);
50245039 btrfs_free_tree_block(trans, root, leaf, 0, 1);
50255040 free_extent_buffer_stale(leaf);
50265041 }
....@@ -5040,9 +5055,6 @@
50405055 int wret;
50415056 int i;
50425057 u32 nritems;
5043
- struct btrfs_map_token token;
5044
-
5045
- btrfs_init_map_token(&token);
50465058
50475059 leaf = path->nodes[0];
50485060 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
....@@ -5053,20 +5065,21 @@
50535065 nritems = btrfs_header_nritems(leaf);
50545066
50555067 if (slot + nr != nritems) {
5056
- int data_end = leaf_data_end(fs_info, leaf);
5068
+ int data_end = leaf_data_end(leaf);
5069
+ struct btrfs_map_token token;
50575070
50585071 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
50595072 data_end + dsize,
50605073 BTRFS_LEAF_DATA_OFFSET + data_end,
50615074 last_off - data_end);
50625075
5076
+ btrfs_init_map_token(&token, leaf);
50635077 for (i = slot + nr; i < nritems; i++) {
50645078 u32 ioff;
50655079
50665080 item = btrfs_item_nr(i);
5067
- ioff = btrfs_token_item_offset(leaf, item, &token);
5068
- btrfs_set_token_item_offset(leaf, item,
5069
- ioff + dsize, &token);
5081
+ ioff = btrfs_token_item_offset(&token, item);
5082
+ btrfs_set_token_item_offset(&token, item, ioff + dsize);
50705083 }
50715084
50725085 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
....@@ -5083,7 +5096,7 @@
50835096 btrfs_set_header_level(leaf, 0);
50845097 } else {
50855098 btrfs_set_path_blocking(path);
5086
- clean_tree_block(fs_info, leaf);
5099
+ btrfs_clean_tree_block(leaf);
50875100 btrfs_del_leaf(trans, root, path, leaf);
50885101 }
50895102 } else {
....@@ -5102,7 +5115,7 @@
51025115 * for possible call to del_ptr below
51035116 */
51045117 slot = path->slots[1];
5105
- extent_buffer_get(leaf);
5118
+ atomic_inc(&leaf->refs);
51065119
51075120 btrfs_set_path_blocking(path);
51085121 wret = push_leaf_left(trans, root, path, 1, 1,
....@@ -5151,10 +5164,12 @@
51515164 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
51525165 {
51535166 struct btrfs_key key;
5167
+ struct btrfs_key orig_key;
51545168 struct btrfs_disk_key found_key;
51555169 int ret;
51565170
51575171 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5172
+ orig_key = key;
51585173
51595174 if (key.offset > 0) {
51605175 key.offset--;
....@@ -5171,8 +5186,36 @@
51715186
51725187 btrfs_release_path(path);
51735188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5174
- if (ret < 0)
5189
+ if (ret <= 0)
51755190 return ret;
5191
+
5192
+ /*
5193
+ * Previous key not found. Even if we were at slot 0 of the leaf we had
5194
+ * before releasing the path and calling btrfs_search_slot(), we now may
5195
+ * be in a slot pointing to the same original key - this can happen if
5196
+ * after we released the path, one or more items were moved from a
5197
+ * sibling leaf into the front of the leaf we had due to an insertion
5198
+ * (see push_leaf_right()).
5199
+ * If we hit this case and our slot is > 0, just decrement the slot
5200
+ * so that the caller does not process the same key again, which may or
5201
+ * may not break the caller, depending on its logic.
5202
+ */
5203
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5204
+ btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
5205
+ ret = comp_keys(&found_key, &orig_key);
5206
+ if (ret == 0) {
5207
+ if (path->slots[0] > 0) {
5208
+ path->slots[0]--;
5209
+ return 0;
5210
+ }
5211
+ /*
5212
+ * At slot 0, same key as before, it means orig_key is
5213
+ * the lowest, leftmost key in the tree. We're done.
5214
+ */
5215
+ return 1;
5216
+ }
5217
+ }
5218
+
51765219 btrfs_item_key(path->nodes[0], &found_key, 0);
51775220 ret = comp_keys(&found_key, &key);
51785221 /*
....@@ -5213,7 +5256,6 @@
52135256 struct btrfs_path *path,
52145257 u64 min_trans)
52155258 {
5216
- struct btrfs_fs_info *fs_info = root->fs_info;
52175259 struct extent_buffer *cur;
52185260 struct btrfs_key found_key;
52195261 int slot;
....@@ -5238,7 +5280,11 @@
52385280 while (1) {
52395281 nritems = btrfs_header_nritems(cur);
52405282 level = btrfs_header_level(cur);
5241
- sret = btrfs_bin_search(cur, min_key, level, &slot);
5283
+ sret = btrfs_bin_search(cur, min_key, &slot);
5284
+ if (sret < 0) {
5285
+ ret = sret;
5286
+ goto out;
5287
+ }
52425288
52435289 /* at the lowest level, we're done, setup the path and exit */
52445290 if (level == path->lowest_level) {
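btrfs_bin_search() drops the level argument and may now fail, so callers must check for a negative return before trusting the slot. A sketch of the new calling convention, as used in btrfs_search_forward() above:

	sret = btrfs_bin_search(cur, min_key, &slot);	/* was (cur, min_key, level, &slot) */
	if (sret < 0) {
		ret = sret;	/* propagate the read error instead of continuing */
		goto out;
	}
	/* sret == 0: exact match at slot; sret > 0: slot is where the key would be inserted */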
....@@ -5253,7 +5299,7 @@
52535299 slot--;
52545300 /*
52555301 * check this node pointer against the min_trans parameters.
5256
- * If it is too old, old, skip to the next one.
5302
+ * If it is too old, skip to the next one.
52575303 */
52585304 while (slot < nritems) {
52595305 u64 gen;
....@@ -5290,7 +5336,7 @@
52905336 goto out;
52915337 }
52925338 btrfs_set_path_blocking(path);
5293
- cur = read_node_slot(fs_info, cur, slot);
5339
+ cur = btrfs_read_node_slot(cur, slot);
52945340 if (IS_ERR(cur)) {
52955341 ret = PTR_ERR(cur);
52965342 goto out;
....@@ -5301,7 +5347,6 @@
53015347 path->locks[level - 1] = BTRFS_READ_LOCK;
53025348 path->nodes[level - 1] = cur;
53035349 unlock_up(path, level, 1, 0, NULL);
5304
- btrfs_clear_path_blocking(path, NULL, 0);
53055350 }
53065351 out:
53075352 path->keep_locks = keep_locks;
....@@ -5310,374 +5355,6 @@
53105355 btrfs_set_path_blocking(path);
53115356 memcpy(min_key, &found_key, sizeof(found_key));
53125357 }
5313
- return ret;
5314
-}
5315
-
5316
-static int tree_move_down(struct btrfs_fs_info *fs_info,
5317
- struct btrfs_path *path,
5318
- int *level)
5319
-{
5320
- struct extent_buffer *eb;
5321
-
5322
- BUG_ON(*level == 0);
5323
- eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
5324
- if (IS_ERR(eb))
5325
- return PTR_ERR(eb);
5326
-
5327
- path->nodes[*level - 1] = eb;
5328
- path->slots[*level - 1] = 0;
5329
- (*level)--;
5330
- return 0;
5331
-}
5332
-
5333
-static int tree_move_next_or_upnext(struct btrfs_path *path,
5334
- int *level, int root_level)
5335
-{
5336
- int ret = 0;
5337
- int nritems;
5338
- nritems = btrfs_header_nritems(path->nodes[*level]);
5339
-
5340
- path->slots[*level]++;
5341
-
5342
- while (path->slots[*level] >= nritems) {
5343
- if (*level == root_level)
5344
- return -1;
5345
-
5346
- /* move upnext */
5347
- path->slots[*level] = 0;
5348
- free_extent_buffer(path->nodes[*level]);
5349
- path->nodes[*level] = NULL;
5350
- (*level)++;
5351
- path->slots[*level]++;
5352
-
5353
- nritems = btrfs_header_nritems(path->nodes[*level]);
5354
- ret = 1;
5355
- }
5356
- return ret;
5357
-}
5358
-
5359
-/*
5360
- * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5361
- * or down.
5362
- */
5363
-static int tree_advance(struct btrfs_fs_info *fs_info,
5364
- struct btrfs_path *path,
5365
- int *level, int root_level,
5366
- int allow_down,
5367
- struct btrfs_key *key)
5368
-{
5369
- int ret;
5370
-
5371
- if (*level == 0 || !allow_down) {
5372
- ret = tree_move_next_or_upnext(path, level, root_level);
5373
- } else {
5374
- ret = tree_move_down(fs_info, path, level);
5375
- }
5376
- if (ret >= 0) {
5377
- if (*level == 0)
5378
- btrfs_item_key_to_cpu(path->nodes[*level], key,
5379
- path->slots[*level]);
5380
- else
5381
- btrfs_node_key_to_cpu(path->nodes[*level], key,
5382
- path->slots[*level]);
5383
- }
5384
- return ret;
5385
-}
5386
-
5387
-static int tree_compare_item(struct btrfs_path *left_path,
5388
- struct btrfs_path *right_path,
5389
- char *tmp_buf)
5390
-{
5391
- int cmp;
5392
- int len1, len2;
5393
- unsigned long off1, off2;
5394
-
5395
- len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5396
- len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5397
- if (len1 != len2)
5398
- return 1;
5399
-
5400
- off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5401
- off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5402
- right_path->slots[0]);
5403
-
5404
- read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5405
-
5406
- cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5407
- if (cmp)
5408
- return 1;
5409
- return 0;
5410
-}
5411
-
5412
-#define ADVANCE 1
5413
-#define ADVANCE_ONLY_NEXT -1
5414
-
5415
-/*
5416
- * This function compares two trees and calls the provided callback for
5417
- * every changed/new/deleted item it finds.
5418
- * If shared tree blocks are encountered, whole subtrees are skipped, making
5419
- * the compare pretty fast on snapshotted subvolumes.
5420
- *
5421
- * This currently works on commit roots only. As commit roots are read only,
5422
- * we don't do any locking. The commit roots are protected with transactions.
5423
- * Transactions are ended and rejoined when a commit is tried in between.
5424
- *
5425
- * This function checks for modifications done to the trees while comparing.
5426
- * If it detects a change, it aborts immediately.
5427
- */
5428
-int btrfs_compare_trees(struct btrfs_root *left_root,
5429
- struct btrfs_root *right_root,
5430
- btrfs_changed_cb_t changed_cb, void *ctx)
5431
-{
5432
- struct btrfs_fs_info *fs_info = left_root->fs_info;
5433
- int ret;
5434
- int cmp;
5435
- struct btrfs_path *left_path = NULL;
5436
- struct btrfs_path *right_path = NULL;
5437
- struct btrfs_key left_key;
5438
- struct btrfs_key right_key;
5439
- char *tmp_buf = NULL;
5440
- int left_root_level;
5441
- int right_root_level;
5442
- int left_level;
5443
- int right_level;
5444
- int left_end_reached;
5445
- int right_end_reached;
5446
- int advance_left;
5447
- int advance_right;
5448
- u64 left_blockptr;
5449
- u64 right_blockptr;
5450
- u64 left_gen;
5451
- u64 right_gen;
5452
-
5453
- left_path = btrfs_alloc_path();
5454
- if (!left_path) {
5455
- ret = -ENOMEM;
5456
- goto out;
5457
- }
5458
- right_path = btrfs_alloc_path();
5459
- if (!right_path) {
5460
- ret = -ENOMEM;
5461
- goto out;
5462
- }
5463
-
5464
- tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
5465
- if (!tmp_buf) {
5466
- ret = -ENOMEM;
5467
- goto out;
5468
- }
5469
-
5470
- left_path->search_commit_root = 1;
5471
- left_path->skip_locking = 1;
5472
- right_path->search_commit_root = 1;
5473
- right_path->skip_locking = 1;
5474
-
5475
- /*
5476
- * Strategy: Go to the first items of both trees. Then do
5477
- *
5478
- * If both trees are at level 0
5479
- * Compare keys of current items
5480
- * If left < right treat left item as new, advance left tree
5481
- * and repeat
5482
- * If left > right treat right item as deleted, advance right tree
5483
- * and repeat
5484
- * If left == right do deep compare of items, treat as changed if
5485
- * needed, advance both trees and repeat
5486
- * If both trees are at the same level but not at level 0
5487
- * Compare keys of current nodes/leafs
5488
- * If left < right advance left tree and repeat
5489
- * If left > right advance right tree and repeat
5490
- * If left == right compare blockptrs of the next nodes/leafs
5491
- * If they match advance both trees but stay at the same level
5492
- * and repeat
5493
- * If they don't match advance both trees while allowing to go
5494
- * deeper and repeat
5495
- * If tree levels are different
5496
- * Advance the tree that needs it and repeat
5497
- *
5498
- * Advancing a tree means:
5499
- * If we are at level 0, try to go to the next slot. If that's not
5500
- * possible, go one level up and repeat. Stop when we found a level
5501
- * where we could go to the next slot. We may at this point be on a
5502
- * node or a leaf.
5503
- *
5504
- * If we are not at level 0 and not on shared tree blocks, go one
5505
- * level deeper.
5506
- *
5507
- * If we are not at level 0 and on shared tree blocks, go one slot to
5508
- * the right if possible or go up and right.
5509
- */
5510
-
5511
- down_read(&fs_info->commit_root_sem);
5512
- left_level = btrfs_header_level(left_root->commit_root);
5513
- left_root_level = left_level;
5514
- left_path->nodes[left_level] =
5515
- btrfs_clone_extent_buffer(left_root->commit_root);
5516
- if (!left_path->nodes[left_level]) {
5517
- up_read(&fs_info->commit_root_sem);
5518
- ret = -ENOMEM;
5519
- goto out;
5520
- }
5521
- extent_buffer_get(left_path->nodes[left_level]);
5522
-
5523
- right_level = btrfs_header_level(right_root->commit_root);
5524
- right_root_level = right_level;
5525
- right_path->nodes[right_level] =
5526
- btrfs_clone_extent_buffer(right_root->commit_root);
5527
- if (!right_path->nodes[right_level]) {
5528
- up_read(&fs_info->commit_root_sem);
5529
- ret = -ENOMEM;
5530
- goto out;
5531
- }
5532
- extent_buffer_get(right_path->nodes[right_level]);
5533
- up_read(&fs_info->commit_root_sem);
5534
-
5535
- if (left_level == 0)
5536
- btrfs_item_key_to_cpu(left_path->nodes[left_level],
5537
- &left_key, left_path->slots[left_level]);
5538
- else
5539
- btrfs_node_key_to_cpu(left_path->nodes[left_level],
5540
- &left_key, left_path->slots[left_level]);
5541
- if (right_level == 0)
5542
- btrfs_item_key_to_cpu(right_path->nodes[right_level],
5543
- &right_key, right_path->slots[right_level]);
5544
- else
5545
- btrfs_node_key_to_cpu(right_path->nodes[right_level],
5546
- &right_key, right_path->slots[right_level]);
5547
-
5548
- left_end_reached = right_end_reached = 0;
5549
- advance_left = advance_right = 0;
5550
-
5551
- while (1) {
5552
- cond_resched();
5553
- if (advance_left && !left_end_reached) {
5554
- ret = tree_advance(fs_info, left_path, &left_level,
5555
- left_root_level,
5556
- advance_left != ADVANCE_ONLY_NEXT,
5557
- &left_key);
5558
- if (ret == -1)
5559
- left_end_reached = ADVANCE;
5560
- else if (ret < 0)
5561
- goto out;
5562
- advance_left = 0;
5563
- }
5564
- if (advance_right && !right_end_reached) {
5565
- ret = tree_advance(fs_info, right_path, &right_level,
5566
- right_root_level,
5567
- advance_right != ADVANCE_ONLY_NEXT,
5568
- &right_key);
5569
- if (ret == -1)
5570
- right_end_reached = ADVANCE;
5571
- else if (ret < 0)
5572
- goto out;
5573
- advance_right = 0;
5574
- }
5575
-
5576
- if (left_end_reached && right_end_reached) {
5577
- ret = 0;
5578
- goto out;
5579
- } else if (left_end_reached) {
5580
- if (right_level == 0) {
5581
- ret = changed_cb(left_path, right_path,
5582
- &right_key,
5583
- BTRFS_COMPARE_TREE_DELETED,
5584
- ctx);
5585
- if (ret < 0)
5586
- goto out;
5587
- }
5588
- advance_right = ADVANCE;
5589
- continue;
5590
- } else if (right_end_reached) {
5591
- if (left_level == 0) {
5592
- ret = changed_cb(left_path, right_path,
5593
- &left_key,
5594
- BTRFS_COMPARE_TREE_NEW,
5595
- ctx);
5596
- if (ret < 0)
5597
- goto out;
5598
- }
5599
- advance_left = ADVANCE;
5600
- continue;
5601
- }
5602
-
5603
- if (left_level == 0 && right_level == 0) {
5604
- cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5605
- if (cmp < 0) {
5606
- ret = changed_cb(left_path, right_path,
5607
- &left_key,
5608
- BTRFS_COMPARE_TREE_NEW,
5609
- ctx);
5610
- if (ret < 0)
5611
- goto out;
5612
- advance_left = ADVANCE;
5613
- } else if (cmp > 0) {
5614
- ret = changed_cb(left_path, right_path,
5615
- &right_key,
5616
- BTRFS_COMPARE_TREE_DELETED,
5617
- ctx);
5618
- if (ret < 0)
5619
- goto out;
5620
- advance_right = ADVANCE;
5621
- } else {
5622
- enum btrfs_compare_tree_result result;
5623
-
5624
- WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5625
- ret = tree_compare_item(left_path, right_path,
5626
- tmp_buf);
5627
- if (ret)
5628
- result = BTRFS_COMPARE_TREE_CHANGED;
5629
- else
5630
- result = BTRFS_COMPARE_TREE_SAME;
5631
- ret = changed_cb(left_path, right_path,
5632
- &left_key, result, ctx);
5633
- if (ret < 0)
5634
- goto out;
5635
- advance_left = ADVANCE;
5636
- advance_right = ADVANCE;
5637
- }
5638
- } else if (left_level == right_level) {
5639
- cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5640
- if (cmp < 0) {
5641
- advance_left = ADVANCE;
5642
- } else if (cmp > 0) {
5643
- advance_right = ADVANCE;
5644
- } else {
5645
- left_blockptr = btrfs_node_blockptr(
5646
- left_path->nodes[left_level],
5647
- left_path->slots[left_level]);
5648
- right_blockptr = btrfs_node_blockptr(
5649
- right_path->nodes[right_level],
5650
- right_path->slots[right_level]);
5651
- left_gen = btrfs_node_ptr_generation(
5652
- left_path->nodes[left_level],
5653
- left_path->slots[left_level]);
5654
- right_gen = btrfs_node_ptr_generation(
5655
- right_path->nodes[right_level],
5656
- right_path->slots[right_level]);
5657
- if (left_blockptr == right_blockptr &&
5658
- left_gen == right_gen) {
5659
- /*
5660
- * As we're on a shared block, don't
5661
- * allow to go deeper.
5662
- */
5663
- advance_left = ADVANCE_ONLY_NEXT;
5664
- advance_right = ADVANCE_ONLY_NEXT;
5665
- } else {
5666
- advance_left = ADVANCE;
5667
- advance_right = ADVANCE;
5668
- }
5669
- }
5670
- } else if (left_level < right_level) {
5671
- advance_right = ADVANCE;
5672
- } else {
5673
- advance_left = ADVANCE;
5674
- }
5675
- }
5676
-
5677
-out:
5678
- btrfs_free_path(left_path);
5679
- btrfs_free_path(right_path);
5680
- kvfree(tmp_buf);
56815358 return ret;
56825359 }
56835360
....@@ -5698,7 +5375,7 @@
56985375 int slot;
56995376 struct extent_buffer *c;
57005377
5701
- WARN_ON(!path->keep_locks);
5378
+ WARN_ON(!path->keep_locks && !path->skip_locking);
57025379 while (level < BTRFS_MAX_LEVEL) {
57035380 if (!path->nodes[level])
57045381 return 1;
....@@ -5714,7 +5391,7 @@
57145391 !path->nodes[level + 1])
57155392 return 1;
57165393
5717
- if (path->locks[level + 1]) {
5394
+ if (path->locks[level + 1] || path->skip_locking) {
57185395 level++;
57195396 continue;
57205397 }
....@@ -5886,9 +5563,9 @@
58865563 }
58875564 if (!ret) {
58885565 btrfs_set_path_blocking(path);
5889
- btrfs_tree_read_lock(next);
5890
- btrfs_clear_path_blocking(path, next,
5891
- BTRFS_READ_LOCK);
5566
+ __btrfs_tree_read_lock(next,
5567
+ BTRFS_NESTING_RIGHT,
5568
+ path->recurse);
58925569 }
58935570 next_rw_lock = BTRFS_READ_LOCK;
58945571 }
....@@ -5923,9 +5600,9 @@
59235600 ret = btrfs_try_tree_read_lock(next);
59245601 if (!ret) {
59255602 btrfs_set_path_blocking(path);
5926
- btrfs_tree_read_lock(next);
5927
- btrfs_clear_path_blocking(path, next,
5928
- BTRFS_READ_LOCK);
5603
+ __btrfs_tree_read_lock(next,
5604
+ BTRFS_NESTING_RIGHT,
5605
+ path->recurse);
59295606 }
59305607 next_rw_lock = BTRFS_READ_LOCK;
59315608 }