2023-12-06 08f87f769b595151be1afeff53e144f543faa614
kernel/fs/btrfs/backref.c
....@@ -13,6 +13,7 @@
1313 #include "transaction.h"
1414 #include "delayed-ref.h"
1515 #include "locking.h"
16
+#include "misc.h"
1617
1718 /* Just an arbitrary number so we can be sure this happened */
1819 #define BACKREF_FOUND_SHARED 6
....@@ -112,11 +113,11 @@
112113 }
113114
114115 struct preftree {
115
- struct rb_root root;
116
+ struct rb_root_cached root;
116117 unsigned int count;
117118 };
118119
119
-#define PREFTREE_INIT { .root = RB_ROOT, .count = 0 }
120
+#define PREFTREE_INIT { .root = RB_ROOT_CACHED, .count = 0 }
120121
121122 struct preftrees {
122123 struct preftree direct; /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
....@@ -136,6 +137,7 @@
136137 u64 root_objectid;
137138 u64 inum;
138139 int share_count;
140
+ bool have_delayed_delete_refs;
139141 };
140142
141143 static inline int extent_is_shared(struct share_check *sc)
....@@ -225,14 +227,15 @@
225227 struct prelim_ref *newref,
226228 struct share_check *sc)
227229 {
228
- struct rb_root *root;
230
+ struct rb_root_cached *root;
229231 struct rb_node **p;
230232 struct rb_node *parent = NULL;
231233 struct prelim_ref *ref;
232234 int result;
235
+ bool leftmost = true;
233236
234237 root = &preftree->root;
235
- p = &root->rb_node;
238
+ p = &root->rb_root.rb_node;
236239
237240 while (*p) {
238241 parent = *p;
....@@ -242,6 +245,7 @@
242245 p = &(*p)->rb_left;
243246 } else if (result > 0) {
244247 p = &(*p)->rb_right;
248
+ leftmost = false;
245249 } else {
246250 /* Identical refs, merge them and free @newref */
247251 struct extent_inode_elem *eie = ref->inode_list;
....@@ -272,7 +276,7 @@
272276 preftree->count++;
273277 trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
274278 rb_link_node(&newref->rbnode, parent, p);
275
- rb_insert_color(&newref->rbnode, root);
279
+ rb_insert_color_cached(&newref->rbnode, root, leftmost);
276280 }
277281
278282 /*
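
The hunk above converts the preftree root from rb_root to rb_root_cached and tracks, during insertion, whether the walk ever descended to the right; if it never did, the new node is the leftmost one and the cached pointer is updated, so a later rb_first_cached() is O(1). Below is a minimal user-space sketch of that cached-leftmost idea, using a plain binary search tree rather than the kernel rbtree API:

/*
 * Sketch only: track whether an insertion ever went right; if it never
 * did, the new node becomes the cached minimum, making "first node"
 * lookups O(1). Plain BST, not the kernel rb_root_cached API.
 */
#include <stdio.h>
#include <stdlib.h>

struct node {
	long key;
	struct node *left, *right;
};

struct cached_tree {
	struct node *root;
	struct node *leftmost;	/* cached minimum */
};

static void insert(struct cached_tree *t, struct node *n)
{
	struct node **p = &t->root;
	int leftmost = 1;

	while (*p) {
		if (n->key < (*p)->key) {
			p = &(*p)->left;
		} else {
			p = &(*p)->right;
			leftmost = 0;	/* went right at least once */
		}
	}
	*p = n;
	if (leftmost)
		t->leftmost = n;	/* O(1) "first" from now on */
}

int main(void)
{
	struct cached_tree t = { NULL, NULL };
	long keys[] = { 5, 2, 8, 1, 9 };

	for (int i = 0; i < 5; i++) {
		struct node *n = calloc(1, sizeof(*n));
		if (!n)
			return 1;
		n->key = keys[i];
		insert(&t, n);
	}
	printf("cached first key: %ld\n", t.leftmost->key);	/* prints 1 */
	return 0;
}
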
....@@ -283,11 +287,13 @@
283287 {
284288 struct prelim_ref *ref, *next_ref;
285289
286
- rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root,
287
- rbnode)
290
+ rbtree_postorder_for_each_entry_safe(ref, next_ref,
291
+ &preftree->root.rb_root, rbnode) {
292
+ free_inode_elem_list(ref->inode_list);
288293 free_pref(ref);
294
+ }
289295
290
- preftree->root = RB_ROOT;
296
+ preftree->root = RB_ROOT_CACHED;
291297 preftree->count = 0;
292298 }
293299
....@@ -345,33 +351,10 @@
345351 return -ENOMEM;
346352
347353 ref->root_id = root_id;
348
- if (key) {
354
+ if (key)
349355 ref->key_for_search = *key;
350
- /*
351
- * We can often find data backrefs with an offset that is too
352
- * large (>= LLONG_MAX, maximum allowed file offset) due to
353
- * underflows when subtracting a file's offset with the data
354
- * offset of its corresponding extent data item. This can
355
- * happen for example in the clone ioctl.
356
- * So if we detect such case we set the search key's offset to
357
- * zero to make sure we will find the matching file extent item
358
- * at add_all_parents(), otherwise we will miss it because the
359
- * offset taken form the backref is much larger then the offset
360
- * of the file extent item. This can make us scan a very large
361
- * number of file extent items, but at least it will not make
362
- * us miss any.
363
- * This is an ugly workaround for a behaviour that should have
364
- * never existed, but it does and a fix for the clone ioctl
365
- * would touch a lot of places, cause backwards incompatibility
366
- * and would not fix the problem for extents cloned with older
367
- * kernels.
368
- */
369
- if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY &&
370
- ref->key_for_search.offset >= LLONG_MAX)
371
- ref->key_for_search.offset = 0;
372
- } else {
356
+ else
373357 memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
374
- }
375358
376359 ref->inode_list = NULL;
377360 ref->level = level;
....@@ -407,10 +390,36 @@
407390 wanted_disk_byte, count, sc, gfp_mask);
408391 }
409392
393
+static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr)
394
+{
395
+ struct rb_node **p = &preftrees->direct.root.rb_root.rb_node;
396
+ struct rb_node *parent = NULL;
397
+ struct prelim_ref *ref = NULL;
398
+ struct prelim_ref target = {};
399
+ int result;
400
+
401
+ target.parent = bytenr;
402
+
403
+ while (*p) {
404
+ parent = *p;
405
+ ref = rb_entry(parent, struct prelim_ref, rbnode);
406
+ result = prelim_ref_compare(ref, &target);
407
+
408
+ if (result < 0)
409
+ p = &(*p)->rb_left;
410
+ else if (result > 0)
411
+ p = &(*p)->rb_right;
412
+ else
413
+ return 1;
414
+ }
415
+ return 0;
416
+}
417
+
410418 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
411
- struct ulist *parents, struct prelim_ref *ref,
419
+ struct ulist *parents,
420
+ struct preftrees *preftrees, struct prelim_ref *ref,
412421 int level, u64 time_seq, const u64 *extent_item_pos,
413
- u64 total_refs, bool ignore_offset)
422
+ bool ignore_offset)
414423 {
415424 int ret = 0;
416425 int slot;
....@@ -422,6 +431,7 @@
422431 u64 disk_byte;
423432 u64 wanted_disk_byte = ref->wanted_disk_byte;
424433 u64 count = 0;
434
+ u64 data_offset;
425435
426436 if (level != 0) {
427437 eb = path->nodes[level];
....@@ -432,18 +442,26 @@
432442 }
433443
434444 /*
435
- * We normally enter this function with the path already pointing to
436
- * the first item to check. But sometimes, we may enter it with
437
- * slot==nritems. In that case, go to the next leaf before we continue.
445
+ * 1. We normally enter this function with the path already pointing to
446
+ * the first item to check. But sometimes, we may enter it with
447
+ * slot == nritems.
448
+ * 2. We are searching for normal backref but bytenr of this leaf
449
+ * matches shared data backref
450
+ * 3. The leaf owner is not equal to the root we are searching
451
+ *
452
+ * For these cases, go to the next leaf before we continue.
438453 */
439
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
454
+ eb = path->nodes[0];
455
+ if (path->slots[0] >= btrfs_header_nritems(eb) ||
456
+ is_shared_data_backref(preftrees, eb->start) ||
457
+ ref->root_id != btrfs_header_owner(eb)) {
440458 if (time_seq == SEQ_LAST)
441459 ret = btrfs_next_leaf(root, path);
442460 else
443461 ret = btrfs_next_old_leaf(root, path, time_seq);
444462 }
445463
446
- while (!ret && count < total_refs) {
464
+ while (!ret && count < ref->count) {
447465 eb = path->nodes[0];
448466 slot = path->slots[0];
449467
....@@ -453,13 +471,31 @@
453471 key.type != BTRFS_EXTENT_DATA_KEY)
454472 break;
455473
474
+ /*
475
+ * We are searching for normal backref but bytenr of this leaf
476
+ * matches shared data backref, OR
477
+ * the leaf owner is not equal to the root we are searching for
478
+ */
479
+ if (slot == 0 &&
480
+ (is_shared_data_backref(preftrees, eb->start) ||
481
+ ref->root_id != btrfs_header_owner(eb))) {
482
+ if (time_seq == SEQ_LAST)
483
+ ret = btrfs_next_leaf(root, path);
484
+ else
485
+ ret = btrfs_next_old_leaf(root, path, time_seq);
486
+ continue;
487
+ }
456488 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
457489 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
490
+ data_offset = btrfs_file_extent_offset(eb, fi);
458491
459492 if (disk_byte == wanted_disk_byte) {
460493 eie = NULL;
461494 old = NULL;
462
- count++;
495
+ if (ref->key_for_search.offset == key.offset - data_offset)
496
+ count++;
497
+ else
498
+ goto next;
463499 if (extent_item_pos) {
464500 ret = check_extent_in_eb(&key, eb, fi,
465501 *extent_item_pos,
....@@ -500,33 +536,41 @@
500536 */
501537 static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
502538 struct btrfs_path *path, u64 time_seq,
539
+ struct preftrees *preftrees,
503540 struct prelim_ref *ref, struct ulist *parents,
504
- const u64 *extent_item_pos, u64 total_refs,
505
- bool ignore_offset)
541
+ const u64 *extent_item_pos, bool ignore_offset)
506542 {
507543 struct btrfs_root *root;
508
- struct btrfs_key root_key;
509544 struct extent_buffer *eb;
510545 int ret = 0;
511546 int root_level;
512547 int level = ref->level;
513
- int index;
548
+ struct btrfs_key search_key = ref->key_for_search;
514549
515
- root_key.objectid = ref->root_id;
516
- root_key.type = BTRFS_ROOT_ITEM_KEY;
517
- root_key.offset = (u64)-1;
518
-
519
- index = srcu_read_lock(&fs_info->subvol_srcu);
520
-
521
- root = btrfs_get_fs_root(fs_info, &root_key, false);
550
+ /*
551
+ * If we're search_commit_root we could possibly be holding locks on
552
+ * other tree nodes. This happens when qgroups does backref walks when
553
+ * adding new delayed refs. To deal with this we need to look in cache
554
+ * for the root, and if we don't find it then we need to search the
555
+ * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage
556
+ * here.
557
+ */
558
+ if (path->search_commit_root)
559
+ root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id);
560
+ else
561
+ root = btrfs_get_fs_root(fs_info, ref->root_id, false);
522562 if (IS_ERR(root)) {
523
- srcu_read_unlock(&fs_info->subvol_srcu, index);
524563 ret = PTR_ERR(root);
564
+ goto out_free;
565
+ }
566
+
567
+ if (!path->search_commit_root &&
568
+ test_bit(BTRFS_ROOT_DELETING, &root->state)) {
569
+ ret = -ENOENT;
525570 goto out;
526571 }
527572
528573 if (btrfs_is_testing(fs_info)) {
529
- srcu_read_unlock(&fs_info->subvol_srcu, index);
530574 ret = -ENOENT;
531575 goto out;
532576 }
....@@ -538,21 +582,36 @@
538582 else
539583 root_level = btrfs_old_root_level(root, time_seq);
540584
541
- if (root_level + 1 == level) {
542
- srcu_read_unlock(&fs_info->subvol_srcu, index);
585
+ if (root_level + 1 == level)
543586 goto out;
544
- }
545587
588
+ /*
589
+ * We can often find data backrefs with an offset that is too large
590
+ * (>= LLONG_MAX, maximum allowed file offset) due to underflows when
591
+ * subtracting a file's offset with the data offset of its
592
+ * corresponding extent data item. This can happen for example in the
593
+ * clone ioctl.
594
+ *
595
+ * So if we detect such case we set the search key's offset to zero to
596
+ * make sure we will find the matching file extent item at
597
+ * add_all_parents(), otherwise we will miss it because the offset
598
+ * taken form the backref is much larger then the offset of the file
599
+ * extent item. This can make us scan a very large number of file
600
+ * extent items, but at least it will not make us miss any.
601
+ *
602
+ * This is an ugly workaround for a behaviour that should have never
603
+ * existed, but it does and a fix for the clone ioctl would touch a lot
604
+ * of places, cause backwards incompatibility and would not fix the
605
+ * problem for extents cloned with older kernels.
606
+ */
607
+ if (search_key.type == BTRFS_EXTENT_DATA_KEY &&
608
+ search_key.offset >= LLONG_MAX)
609
+ search_key.offset = 0;
546610 path->lowest_level = level;
547611 if (time_seq == SEQ_LAST)
548
- ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
549
- 0, 0);
612
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
550613 else
551
- ret = btrfs_search_old_slot(root, &ref->key_for_search, path,
552
- time_seq);
553
-
554
- /* root node has been locked, we can release @subvol_srcu safely here */
555
- srcu_read_unlock(&fs_info->subvol_srcu, index);
614
+ ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
556615
557616 btrfs_debug(fs_info,
558617 "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)",
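
The long comment moved into this hunk describes a u64 underflow: the offset recorded for a data backref is the file offset minus the extent's data offset, which wraps around when the data offset is larger, producing a value >= LLONG_MAX that the code then clamps to zero before searching. A tiny stand-alone illustration of that wraparound (made-up numbers, not real extent values):

/*
 * Sketch of the underflow the comment describes: subtracting a larger
 * data offset from a smaller file offset wraps a u64 to a huge value.
 */
#include <stdio.h>
#include <stdint.h>
#include <limits.h>

int main(void)
{
	uint64_t file_offset = 0;	/* start of the cloned range */
	uint64_t data_offset = 4096;	/* offset into the shared extent */
	uint64_t backref_offset = file_offset - data_offset;	/* wraps */

	printf("backref offset: %llu\n",
	       (unsigned long long)backref_offset);	/* 18446744073709547520 */
	printf("clamp search offset to 0: %s\n",
	       backref_offset >= LLONG_MAX ? "yes" : "no");
	return 0;
}
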
....@@ -572,9 +631,11 @@
572631 eb = path->nodes[level];
573632 }
574633
575
- ret = add_all_parents(root, path, parents, ref, level, time_seq,
576
- extent_item_pos, total_refs, ignore_offset);
634
+ ret = add_all_parents(root, path, parents, preftrees, ref, level,
635
+ time_seq, extent_item_pos, ignore_offset);
577636 out:
637
+ btrfs_put_root(root);
638
+out_free:
578639 path->lowest_level = 0;
579640 btrfs_release_path(path);
580641 return ret;
....@@ -588,8 +649,20 @@
588649 return (struct extent_inode_elem *)(uintptr_t)node->aux;
589650 }
590651
652
+static void free_leaf_list(struct ulist *ulist)
653
+{
654
+ struct ulist_node *node;
655
+ struct ulist_iterator uiter;
656
+
657
+ ULIST_ITER_INIT(&uiter);
658
+ while ((node = ulist_next(ulist, &uiter)))
659
+ free_inode_elem_list(unode_aux_to_inode_list(node));
660
+
661
+ ulist_free(ulist);
662
+}
663
+
591664 /*
592
- * We maintain three seperate rbtrees: one for direct refs, one for
665
+ * We maintain three separate rbtrees: one for direct refs, one for
593666 * indirect refs which have a key, and one for indirect refs which do not
594667 * have a key. Each tree does merge on insertion.
595668 *
....@@ -607,7 +680,7 @@
607680 static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
608681 struct btrfs_path *path, u64 time_seq,
609682 struct preftrees *preftrees,
610
- const u64 *extent_item_pos, u64 total_refs,
683
+ const u64 *extent_item_pos,
611684 struct share_check *sc, bool ignore_offset)
612685 {
613686 int err;
....@@ -627,7 +700,7 @@
627700 * freeing the entire indirect tree when we're done. In some test
628701 * cases, the tree can grow quite large (~200k objects).
629702 */
630
- while ((rnode = rb_first(&preftrees->indirect.root))) {
703
+ while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
631704 struct prelim_ref *ref;
632705
633706 ref = rb_entry(rnode, struct prelim_ref, rbnode);
....@@ -637,7 +710,7 @@
637710 goto out;
638711 }
639712
640
- rb_erase(&ref->rbnode, &preftrees->indirect.root);
713
+ rb_erase_cached(&ref->rbnode, &preftrees->indirect.root);
641714 preftrees->indirect.count--;
642715
643716 if (ref->count == 0) {
....@@ -651,9 +724,9 @@
651724 ret = BACKREF_FOUND_SHARED;
652725 goto out;
653726 }
654
- err = resolve_indirect_ref(fs_info, path, time_seq, ref,
655
- parents, extent_item_pos,
656
- total_refs, ignore_offset);
727
+ err = resolve_indirect_ref(fs_info, path, time_seq, preftrees,
728
+ ref, parents, extent_item_pos,
729
+ ignore_offset);
657730 /*
658731 * we can only tolerate ENOENT,otherwise,we should catch error
659732 * and return directly.
....@@ -693,7 +766,7 @@
693766 }
694767
695768 /*
696
- * Now it's a direct ref, put it in the the direct tree. We must
769
+ * Now it's a direct ref, put it in the direct tree. We must
697770 * do this last because the ref could be merged/freed here.
698771 */
699772 prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL);
....@@ -702,7 +775,11 @@
702775 cond_resched();
703776 }
704777 out:
705
- ulist_free(parents);
778
+ /*
779
+ * We may have inode lists attached to refs in the parents ulist, so we
780
+ * must free them before freeing the ulist and its refs.
781
+ */
782
+ free_leaf_list(parents);
706783 return ret;
707784 }
708785
....@@ -717,9 +794,9 @@
717794 struct preftree *tree = &preftrees->indirect_missing_keys;
718795 struct rb_node *node;
719796
720
- while ((node = rb_first(&tree->root))) {
797
+ while ((node = rb_first_cached(&tree->root))) {
721798 ref = rb_entry(node, struct prelim_ref, rbnode);
722
- rb_erase(node, &tree->root);
799
+ rb_erase_cached(node, &tree->root);
723800
724801 BUG_ON(ref->parent); /* should not be a direct ref */
725802 BUG_ON(ref->key_for_search.type);
....@@ -756,22 +833,16 @@
756833 */
757834 static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
758835 struct btrfs_delayed_ref_head *head, u64 seq,
759
- struct preftrees *preftrees, u64 *total_refs,
760
- struct share_check *sc)
836
+ struct preftrees *preftrees, struct share_check *sc)
761837 {
762838 struct btrfs_delayed_ref_node *node;
763
- struct btrfs_delayed_extent_op *extent_op = head->extent_op;
764839 struct btrfs_key key;
765
- struct btrfs_key tmp_op_key;
766840 struct rb_node *n;
767841 int count;
768842 int ret = 0;
769843
770
- if (extent_op && extent_op->update_key)
771
- btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
772
-
773844 spin_lock(&head->lock);
774
- for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) {
845
+ for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
775846 node = rb_entry(n, struct btrfs_delayed_ref_node,
776847 ref_node);
777848 if (node->seq > seq)
....@@ -789,17 +860,22 @@
789860 count = node->ref_mod * -1;
790861 break;
791862 default:
792
- BUG_ON(1);
863
+ BUG();
793864 }
794
- *total_refs += count;
795865 switch (node->type) {
796866 case BTRFS_TREE_BLOCK_REF_KEY: {
797867 /* NORMAL INDIRECT METADATA backref */
798868 struct btrfs_delayed_tree_ref *ref;
869
+ struct btrfs_key *key_ptr = NULL;
870
+
871
+ if (head->extent_op && head->extent_op->update_key) {
872
+ btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
873
+ key_ptr = &key;
874
+ }
799875
800876 ref = btrfs_delayed_node_to_tree_ref(node);
801877 ret = add_indirect_ref(fs_info, preftrees, ref->root,
802
- &tmp_op_key, ref->level + 1,
878
+ key_ptr, ref->level + 1,
803879 node->bytenr, count, sc,
804880 GFP_ATOMIC);
805881 break;
....@@ -825,13 +901,22 @@
825901 key.offset = ref->offset;
826902
827903 /*
828
- * Found a inum that doesn't match our known inum, we
829
- * know it's shared.
904
+ * If we have a share check context and a reference for
905
+ * another inode, we can't exit immediately. This is
906
+ * because even if this is a BTRFS_ADD_DELAYED_REF
907
+ * reference we may find next a BTRFS_DROP_DELAYED_REF
908
+ * which cancels out this ADD reference.
909
+ *
910
+ * If this is a DROP reference and there was no previous
911
+ * ADD reference, then we need to signal that when we
912
+ * process references from the extent tree (through
913
+ * add_inline_refs() and add_keyed_refs()), we should
914
+ * not exit early if we find a reference for another
915
+ * inode, because one of the delayed DROP references
916
+ * may cancel that reference in the extent tree.
830917 */
831
- if (sc && sc->inum && ref->objectid != sc->inum) {
832
- ret = BACKREF_FOUND_SHARED;
833
- goto out;
834
- }
918
+ if (sc && count < 0)
919
+ sc->have_delayed_delete_refs = true;
835920
836921 ret = add_indirect_ref(fs_info, preftrees, ref->root,
837922 &key, 0, node->bytenr, count, sc,
....@@ -861,7 +946,7 @@
861946 }
862947 if (!ret)
863948 ret = extent_is_shared(sc);
864
-out:
949
+
865950 spin_unlock(&head->lock);
866951 return ret;
867952 }
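
The reworked comment in this hunk explains why a delayed ref for a foreign inode no longer makes the extent "shared" on the spot: an ADD may be cancelled by a later DROP in the same delayed list, so the walk only records have_delayed_delete_refs and lets the extent-tree pass decide. A self-contained sketch of that idea, folding signed ref modifications before judging sharedness (hypothetical layout, not the btrfs delayed-ref structures):

/*
 * Sketch: fold delayed ref modifications per owning inode before
 * deciding "shared". A foreign ADD can be cancelled by a later DROP,
 * so bailing out on the first foreign ADD would be wrong.
 */
#include <stdio.h>
#include <stdbool.h>

struct delayed_ref {
	unsigned long long inum;	/* owning inode */
	int mod;			/* +count for ADD, -count for DROP */
};

int main(void)
{
	const unsigned long long my_inum = 257;
	/* Foreign inode 300 gets an ADD that a later DROP cancels. */
	struct delayed_ref refs[] = {
		{ 257, +1 }, { 300, +1 }, { 300, -1 },
	};
	int foreign_mod = 0;
	bool have_delayed_delete_refs = false;

	for (unsigned i = 0; i < sizeof(refs) / sizeof(refs[0]); i++) {
		if (refs[i].mod < 0)
			have_delayed_delete_refs = true;
		if (refs[i].inum != my_inum)
			foreign_mod += refs[i].mod;
	}

	/* Only a surviving foreign reference means the extent is shared. */
	printf("foreign refs after folding: %d -> %s\n", foreign_mod,
	       foreign_mod > 0 ? "shared" : "not shared (from delayed refs)");
	printf("extent tree pass may not exit early: %s\n",
	       have_delayed_delete_refs ? "true" : "false");
	return 0;
}
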
....@@ -874,7 +959,7 @@
874959 static int add_inline_refs(const struct btrfs_fs_info *fs_info,
875960 struct btrfs_path *path, u64 bytenr,
876961 int *info_level, struct preftrees *preftrees,
877
- u64 *total_refs, struct share_check *sc)
962
+ struct share_check *sc)
878963 {
879964 int ret = 0;
880965 int slot;
....@@ -898,7 +983,6 @@
898983
899984 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
900985 flags = btrfs_extent_flags(leaf, ei);
901
- *total_refs += btrfs_extent_refs(leaf, ei);
902986 btrfs_item_key_to_cpu(leaf, &found_key, slot);
903987
904988 ptr = (unsigned long)(ei + 1);
....@@ -965,7 +1049,8 @@
9651049 key.type = BTRFS_EXTENT_DATA_KEY;
9661050 key.offset = btrfs_extent_data_ref_offset(leaf, dref);
9671051
968
- if (sc && sc->inum && key.objectid != sc->inum) {
1052
+ if (sc && sc->inum && key.objectid != sc->inum &&
1053
+ !sc->have_delayed_delete_refs) {
9691054 ret = BACKREF_FOUND_SHARED;
9701055 break;
9711056 }
....@@ -975,6 +1060,7 @@
9751060 ret = add_indirect_ref(fs_info, preftrees, root,
9761061 &key, 0, bytenr, count,
9771062 sc, GFP_NOFS);
1063
+
9781064 break;
9791065 }
9801066 default:
....@@ -1064,7 +1150,8 @@
10641150 key.type = BTRFS_EXTENT_DATA_KEY;
10651151 key.offset = btrfs_extent_data_ref_offset(leaf, dref);
10661152
1067
- if (sc && sc->inum && key.objectid != sc->inum) {
1153
+ if (sc && sc->inum && key.objectid != sc->inum &&
1154
+ !sc->have_delayed_delete_refs) {
10681155 ret = BACKREF_FOUND_SHARED;
10691156 break;
10701157 }
....@@ -1123,8 +1210,6 @@
11231210 struct prelim_ref *ref;
11241211 struct rb_node *node;
11251212 struct extent_inode_elem *eie = NULL;
1126
- /* total of both direct AND indirect refs! */
1127
- u64 total_refs = 0;
11281213 struct preftrees preftrees = {
11291214 .direct = PREFTREE_INIT,
11301215 .indirect = PREFTREE_INIT,
....@@ -1198,7 +1283,7 @@
11981283 }
11991284 spin_unlock(&delayed_refs->lock);
12001285 ret = add_delayed_refs(fs_info, head, time_seq,
1201
- &preftrees, &total_refs, sc);
1286
+ &preftrees, sc);
12021287 mutex_unlock(&head->mutex);
12031288 if (ret)
12041289 goto out;
....@@ -1219,8 +1304,7 @@
12191304 (key.type == BTRFS_EXTENT_ITEM_KEY ||
12201305 key.type == BTRFS_METADATA_ITEM_KEY)) {
12211306 ret = add_inline_refs(fs_info, path, bytenr,
1222
- &info_level, &preftrees,
1223
- &total_refs, sc);
1307
+ &info_level, &preftrees, sc);
12241308 if (ret)
12251309 goto out;
12261310 ret = add_keyed_refs(fs_info, path, bytenr, info_level,
....@@ -1236,14 +1320,14 @@
12361320 if (ret)
12371321 goto out;
12381322
1239
- WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root));
1323
+ WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));
12401324
12411325 ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
1242
- extent_item_pos, total_refs, sc, ignore_offset);
1326
+ extent_item_pos, sc, ignore_offset);
12431327 if (ret)
12441328 goto out;
12451329
1246
- WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root));
1330
+ WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root));
12471331
12481332 /*
12491333 * This walks the tree of merged and resolved refs. Tree blocks are
....@@ -1252,7 +1336,7 @@
12521336 *
12531337 * We release the entire tree in one go before returning.
12541338 */
1255
- node = rb_first(&preftrees.direct.root);
1339
+ node = rb_first_cached(&preftrees.direct.root);
12561340 while (node) {
12571341 ref = rb_entry(node, struct prelim_ref, rbnode);
12581342 node = rb_next(&ref->rbnode);
....@@ -1293,9 +1377,10 @@
12931377 ret = -EIO;
12941378 goto out;
12951379 }
1380
+
12961381 if (!path->skip_locking) {
12971382 btrfs_tree_read_lock(eb);
1298
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1383
+ btrfs_set_lock_blocking_read(eb);
12991384 }
13001385 ret = find_extent_in_eb(eb, bytenr,
13011386 *extent_item_pos, &eie, ignore_offset);
....@@ -1305,6 +1390,12 @@
13051390 if (ret < 0)
13061391 goto out;
13071392 ref->inode_list = eie;
1393
+ /*
1394
+ * We transferred the list ownership to the ref,
1395
+ * so set to NULL to avoid a double free in case
1396
+ * an error happens after this.
1397
+ */
1398
+ eie = NULL;
13081399 }
13091400 ret = ulist_add_merge_ptr(refs, ref->parent,
13101401 ref->inode_list,
....@@ -1330,6 +1421,14 @@
13301421 eie->next = ref->inode_list;
13311422 }
13321423 eie = NULL;
1424
+ /*
1425
+ * We have transferred the inode list ownership from
1426
+ * this ref to the ref we added to the 'refs' ulist.
1427
+ * So set this ref's inode list to NULL to avoid
1428
+ * use-after-free when our caller uses it or double
1429
+ * frees in case an error happens before we return.
1430
+ */
1431
+ ref->inode_list = NULL;
13331432 }
13341433 cond_resched();
13351434 }
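
The comments added in this hunk and the previous one state the same ownership rule: once the inode list has been handed to another structure, the local pointer must be cleared so an error path cannot free the same list twice. A minimal sketch of that transfer-then-NULL pattern with an ordinary heap allocation (generic C, not the btrfs types):

/*
 * Sketch of the transfer-then-NULL ownership rule: after giving the
 * list to its new owner, clear the old pointer so a later error path
 * cannot free the same memory twice.
 */
#include <stdio.h>
#include <stdlib.h>

struct inode_elem {
	long val;
	struct inode_elem *next;
};

struct owner {
	struct inode_elem *list;
};

static void free_list(struct inode_elem *e)
{
	while (e) {
		struct inode_elem *next = e->next;

		free(e);
		e = next;
	}
}

int main(void)
{
	struct inode_elem *eie = calloc(1, sizeof(*eie));
	struct owner ref = { NULL };

	if (!eie)
		return 1;
	eie->val = 42;

	/* Transfer ownership to the ref ... */
	ref.list = eie;
	/* ... and immediately drop our reference to it. */
	eie = NULL;

	/* Error path: freeing both sides is now safe, no double free. */
	free_list(eie);		/* no-op */
	free_list(ref.list);
	printf("freed once, not twice\n");
	return 0;
}
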
....@@ -1346,24 +1445,6 @@
13461445 return ret;
13471446 }
13481447
1349
-static void free_leaf_list(struct ulist *blocks)
1350
-{
1351
- struct ulist_node *node = NULL;
1352
- struct extent_inode_elem *eie;
1353
- struct ulist_iterator uiter;
1354
-
1355
- ULIST_ITER_INIT(&uiter);
1356
- while ((node = ulist_next(blocks, &uiter))) {
1357
- if (!node->aux)
1358
- continue;
1359
- eie = unode_aux_to_inode_list(node);
1360
- free_inode_elem_list(eie);
1361
- node->aux = 0;
1362
- }
1363
-
1364
- ulist_free(blocks);
1365
-}
1366
-
13671448 /*
13681449 * Finds all leafs with a reference to the specified combination of bytenr and
13691450 * offset. key_list_head will point to a list of corresponding keys (caller must
....@@ -1372,10 +1453,10 @@
13721453 *
13731454 * returns 0 on success, <0 on error
13741455 */
1375
-static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
1376
- struct btrfs_fs_info *fs_info, u64 bytenr,
1377
- u64 time_seq, struct ulist **leafs,
1378
- const u64 *extent_item_pos, bool ignore_offset)
1456
+int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
1457
+ struct btrfs_fs_info *fs_info, u64 bytenr,
1458
+ u64 time_seq, struct ulist **leafs,
1459
+ const u64 *extent_item_pos, bool ignore_offset)
13791460 {
13801461 int ret;
13811462
....@@ -1476,28 +1557,24 @@
14761557 *
14771558 * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
14781559 */
1479
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
1560
+int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
1561
+ struct ulist *roots, struct ulist *tmp)
14801562 {
14811563 struct btrfs_fs_info *fs_info = root->fs_info;
14821564 struct btrfs_trans_handle *trans;
1483
- struct ulist *tmp = NULL;
1484
- struct ulist *roots = NULL;
14851565 struct ulist_iterator uiter;
14861566 struct ulist_node *node;
14871567 struct seq_list elem = SEQ_LIST_INIT(elem);
14881568 int ret = 0;
14891569 struct share_check shared = {
1490
- .root_objectid = root->objectid,
1570
+ .root_objectid = root->root_key.objectid,
14911571 .inum = inum,
14921572 .share_count = 0,
1573
+ .have_delayed_delete_refs = false,
14931574 };
14941575
1495
- tmp = ulist_alloc(GFP_NOFS);
1496
- roots = ulist_alloc(GFP_NOFS);
1497
- if (!tmp || !roots) {
1498
- ret = -ENOMEM;
1499
- goto out;
1500
- }
1576
+ ulist_init(roots);
1577
+ ulist_init(tmp);
15011578
15021579 trans = btrfs_join_transaction_nostart(root);
15031580 if (IS_ERR(trans)) {
....@@ -1528,6 +1605,7 @@
15281605 break;
15291606 bytenr = node->val;
15301607 shared.share_count = 0;
1608
+ shared.have_delayed_delete_refs = false;
15311609 cond_resched();
15321610 }
15331611
....@@ -1538,8 +1616,8 @@
15381616 up_read(&fs_info->commit_root_sem);
15391617 }
15401618 out:
1541
- ulist_free(tmp);
1542
- ulist_free(roots);
1619
+ ulist_release(roots);
1620
+ ulist_release(tmp);
15431621 return ret;
15441622 }
15451623
....@@ -1671,7 +1749,7 @@
16711749 /* make sure we can use eb after releasing the path */
16721750 if (eb != eb_in) {
16731751 if (!path->skip_locking)
1674
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1752
+ btrfs_set_lock_blocking_read(eb);
16751753 path->nodes[0] = NULL;
16761754 path->locks[0] = 0;
16771755 }
....@@ -1762,7 +1840,7 @@
17621840 else if (flags & BTRFS_EXTENT_FLAG_DATA)
17631841 *flags_ret = BTRFS_EXTENT_FLAG_DATA;
17641842 else
1765
- BUG_ON(1);
1843
+ BUG();
17661844 return 0;
17671845 }
17681846
....@@ -1982,10 +2060,29 @@
19822060 return ret;
19832061 }
19842062
2063
+static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
2064
+{
2065
+ struct btrfs_data_container *inodes = ctx;
2066
+ const size_t c = 3 * sizeof(u64);
2067
+
2068
+ if (inodes->bytes_left >= c) {
2069
+ inodes->bytes_left -= c;
2070
+ inodes->val[inodes->elem_cnt] = inum;
2071
+ inodes->val[inodes->elem_cnt + 1] = offset;
2072
+ inodes->val[inodes->elem_cnt + 2] = root;
2073
+ inodes->elem_cnt += 3;
2074
+ } else {
2075
+ inodes->bytes_missing += c - inodes->bytes_left;
2076
+ inodes->bytes_left = 0;
2077
+ inodes->elem_missed += 3;
2078
+ }
2079
+
2080
+ return 0;
2081
+}
2082
+
19852083 int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
19862084 struct btrfs_path *path,
1987
- iterate_extent_inodes_t *iterate, void *ctx,
1988
- bool ignore_offset)
2085
+ void *ctx, bool ignore_offset)
19892086 {
19902087 int ret;
19912088 u64 extent_item_pos;
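
build_ino_list(), added in this hunk so iterate_inodes_from_logical() no longer takes an iterate callback, appends (inode, offset, root) triples to the ioctl's data container and, once space runs out, keeps counting what could not fit in bytes_missing and elem_missed so the caller can retry with a bigger buffer. A stand-alone sketch of that accounting with a simplified container (not the real struct btrfs_data_container layout):

/*
 * Sketch of the "record what didn't fit" accounting: each record is
 * three u64 values; once bytes_left is exhausted, track the missing
 * bytes and elements instead of storing them.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_VALS 6	/* room for exactly two triples */

struct data_container {
	uint64_t bytes_left;
	uint64_t bytes_missing;
	uint32_t elem_cnt;
	uint32_t elem_missed;
	uint64_t val[MAX_VALS];
};

static void add_triple(struct data_container *c, uint64_t inum,
		       uint64_t offset, uint64_t root)
{
	const uint64_t sz = 3 * sizeof(uint64_t);

	if (c->bytes_left >= sz) {
		c->bytes_left -= sz;
		c->val[c->elem_cnt] = inum;
		c->val[c->elem_cnt + 1] = offset;
		c->val[c->elem_cnt + 2] = root;
		c->elem_cnt += 3;
	} else {
		c->bytes_missing += sz - c->bytes_left;
		c->bytes_left = 0;
		c->elem_missed += 3;
	}
}

int main(void)
{
	struct data_container c = { .bytes_left = MAX_VALS * sizeof(uint64_t) };

	add_triple(&c, 257, 0, 5);
	add_triple(&c, 258, 4096, 5);
	add_triple(&c, 259, 8192, 5);	/* does not fit */

	printf("stored %u values, missed %u, missing %llu bytes\n",
	       c.elem_cnt, c.elem_missed,
	       (unsigned long long)c.bytes_missing);
	return 0;
}
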
....@@ -2003,7 +2100,7 @@
20032100 extent_item_pos = logical - found_key.objectid;
20042101 ret = iterate_extent_inodes(fs_info, found_key.objectid,
20052102 extent_item_pos, search_commit_root,
2006
- iterate, ctx, ignore_offset);
2103
+ build_ino_list, ctx, ignore_offset);
20072104
20082105 return ret;
20092106 }
....@@ -2047,9 +2144,6 @@
20472144 ret = -ENOMEM;
20482145 break;
20492146 }
2050
- extent_buffer_get(eb);
2051
- btrfs_tree_read_lock(eb);
2052
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
20532147 btrfs_release_path(path);
20542148
20552149 item = btrfs_item_nr(slot);
....@@ -2060,7 +2154,8 @@
20602154 /* path must be released before calling iterate()! */
20612155 btrfs_debug(fs_root->fs_info,
20622156 "following ref at offset %u for inode %llu in tree %llu",
2063
- cur, found_key.objectid, fs_root->objectid);
2157
+ cur, found_key.objectid,
2158
+ fs_root->root_key.objectid);
20642159 ret = iterate(parent, name_len,
20652160 (unsigned long)(iref + 1), eb, ctx);
20662161 if (ret)
....@@ -2068,7 +2163,6 @@
20682163 len = sizeof(*iref) + name_len;
20692164 iref = (struct btrfs_inode_ref *)((char *)iref + len);
20702165 }
2071
- btrfs_tree_read_unlock_blocking(eb);
20722166 free_extent_buffer(eb);
20732167 }
20742168
....@@ -2109,10 +2203,6 @@
21092203 ret = -ENOMEM;
21102204 break;
21112205 }
2112
- extent_buffer_get(eb);
2113
-
2114
- btrfs_tree_read_lock(eb);
2115
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
21162206 btrfs_release_path(path);
21172207
21182208 item_size = btrfs_item_size_nr(eb, slot);
....@@ -2133,7 +2223,6 @@
21332223 cur_offset += btrfs_inode_extref_name_len(eb, extref);
21342224 cur_offset += sizeof(*extref);
21352225 }
2136
- btrfs_tree_read_unlock_blocking(eb);
21372226 free_extent_buffer(eb);
21382227
21392228 offset++;
....@@ -2276,3 +2365,824 @@
22762365 kvfree(ipath->fspath);
22772366 kfree(ipath);
22782367 }
2368
+
2369
+struct btrfs_backref_iter *btrfs_backref_iter_alloc(
2370
+ struct btrfs_fs_info *fs_info, gfp_t gfp_flag)
2371
+{
2372
+ struct btrfs_backref_iter *ret;
2373
+
2374
+ ret = kzalloc(sizeof(*ret), gfp_flag);
2375
+ if (!ret)
2376
+ return NULL;
2377
+
2378
+ ret->path = btrfs_alloc_path();
2379
+ if (!ret->path) {
2380
+ kfree(ret);
2381
+ return NULL;
2382
+ }
2383
+
2384
+ /* Current backref iterator only supports iteration in commit root */
2385
+ ret->path->search_commit_root = 1;
2386
+ ret->path->skip_locking = 1;
2387
+ ret->fs_info = fs_info;
2388
+
2389
+ return ret;
2390
+}
2391
+
2392
+int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
2393
+{
2394
+ struct btrfs_fs_info *fs_info = iter->fs_info;
2395
+ struct btrfs_path *path = iter->path;
2396
+ struct btrfs_extent_item *ei;
2397
+ struct btrfs_key key;
2398
+ int ret;
2399
+
2400
+ key.objectid = bytenr;
2401
+ key.type = BTRFS_METADATA_ITEM_KEY;
2402
+ key.offset = (u64)-1;
2403
+ iter->bytenr = bytenr;
2404
+
2405
+ ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
2406
+ if (ret < 0)
2407
+ return ret;
2408
+ if (ret == 0) {
2409
+ ret = -EUCLEAN;
2410
+ goto release;
2411
+ }
2412
+ if (path->slots[0] == 0) {
2413
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
2414
+ ret = -EUCLEAN;
2415
+ goto release;
2416
+ }
2417
+ path->slots[0]--;
2418
+
2419
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2420
+ if ((key.type != BTRFS_EXTENT_ITEM_KEY &&
2421
+ key.type != BTRFS_METADATA_ITEM_KEY) || key.objectid != bytenr) {
2422
+ ret = -ENOENT;
2423
+ goto release;
2424
+ }
2425
+ memcpy(&iter->cur_key, &key, sizeof(key));
2426
+ iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
2427
+ path->slots[0]);
2428
+ iter->end_ptr = (u32)(iter->item_ptr +
2429
+ btrfs_item_size_nr(path->nodes[0], path->slots[0]));
2430
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2431
+ struct btrfs_extent_item);
2432
+
2433
+ /*
2434
+ * Only support iteration on tree backref yet.
2435
+ *
2436
+ * This is an extra precaution for non skinny-metadata, where
2437
+ * EXTENT_ITEM is also used for tree blocks, that we can only use
2438
+ * extent flags to determine if it's a tree block.
2439
+ */
2440
+ if (btrfs_extent_flags(path->nodes[0], ei) & BTRFS_EXTENT_FLAG_DATA) {
2441
+ ret = -ENOTSUPP;
2442
+ goto release;
2443
+ }
2444
+ iter->cur_ptr = (u32)(iter->item_ptr + sizeof(*ei));
2445
+
2446
+ /* If there is no inline backref, go search for keyed backref */
2447
+ if (iter->cur_ptr >= iter->end_ptr) {
2448
+ ret = btrfs_next_item(fs_info->extent_root, path);
2449
+
2450
+ /* No inline nor keyed ref */
2451
+ if (ret > 0) {
2452
+ ret = -ENOENT;
2453
+ goto release;
2454
+ }
2455
+ if (ret < 0)
2456
+ goto release;
2457
+
2458
+ btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key,
2459
+ path->slots[0]);
2460
+ if (iter->cur_key.objectid != bytenr ||
2461
+ (iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY &&
2462
+ iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY)) {
2463
+ ret = -ENOENT;
2464
+ goto release;
2465
+ }
2466
+ iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
2467
+ path->slots[0]);
2468
+ iter->item_ptr = iter->cur_ptr;
2469
+ iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size_nr(
2470
+ path->nodes[0], path->slots[0]));
2471
+ }
2472
+
2473
+ return 0;
2474
+release:
2475
+ btrfs_backref_iter_release(iter);
2476
+ return ret;
2477
+}
2478
+
2479
+/*
2480
+ * Go to the next backref item of current bytenr, can be either inlined or
2481
+ * keyed.
2482
+ *
2483
+ * Caller needs to check whether it's inline ref or not by iter->cur_key.
2484
+ *
2485
+ * Return 0 if we get next backref without problem.
2486
+ * Return >0 if there is no extra backref for this bytenr.
2487
+ * Return <0 if there is something wrong happened.
2488
+ */
2489
+int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
2490
+{
2491
+ struct extent_buffer *eb = btrfs_backref_get_eb(iter);
2492
+ struct btrfs_path *path = iter->path;
2493
+ struct btrfs_extent_inline_ref *iref;
2494
+ int ret;
2495
+ u32 size;
2496
+
2497
+ if (btrfs_backref_iter_is_inline_ref(iter)) {
2498
+ /* We're still inside the inline refs */
2499
+ ASSERT(iter->cur_ptr < iter->end_ptr);
2500
+
2501
+ if (btrfs_backref_has_tree_block_info(iter)) {
2502
+ /* First tree block info */
2503
+ size = sizeof(struct btrfs_tree_block_info);
2504
+ } else {
2505
+ /* Use inline ref type to determine the size */
2506
+ int type;
2507
+
2508
+ iref = (struct btrfs_extent_inline_ref *)
2509
+ ((unsigned long)iter->cur_ptr);
2510
+ type = btrfs_extent_inline_ref_type(eb, iref);
2511
+
2512
+ size = btrfs_extent_inline_ref_size(type);
2513
+ }
2514
+ iter->cur_ptr += size;
2515
+ if (iter->cur_ptr < iter->end_ptr)
2516
+ return 0;
2517
+
2518
+ /* All inline items iterated, fall through */
2519
+ }
2520
+
2521
+ /* We're at keyed items, there is no inline item, go to the next one */
2522
+ ret = btrfs_next_item(iter->fs_info->extent_root, iter->path);
2523
+ if (ret)
2524
+ return ret;
2525
+
2526
+ btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key, path->slots[0]);
2527
+ if (iter->cur_key.objectid != iter->bytenr ||
2528
+ (iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY &&
2529
+ iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY))
2530
+ return 1;
2531
+ iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0],
2532
+ path->slots[0]);
2533
+ iter->cur_ptr = iter->item_ptr;
2534
+ iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size_nr(path->nodes[0],
2535
+ path->slots[0]);
2536
+ return 0;
2537
+}
2538
+
2539
+void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
2540
+ struct btrfs_backref_cache *cache, int is_reloc)
2541
+{
2542
+ int i;
2543
+
2544
+ cache->rb_root = RB_ROOT;
2545
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
2546
+ INIT_LIST_HEAD(&cache->pending[i]);
2547
+ INIT_LIST_HEAD(&cache->changed);
2548
+ INIT_LIST_HEAD(&cache->detached);
2549
+ INIT_LIST_HEAD(&cache->leaves);
2550
+ INIT_LIST_HEAD(&cache->pending_edge);
2551
+ INIT_LIST_HEAD(&cache->useless_node);
2552
+ cache->fs_info = fs_info;
2553
+ cache->is_reloc = is_reloc;
2554
+}
2555
+
2556
+struct btrfs_backref_node *btrfs_backref_alloc_node(
2557
+ struct btrfs_backref_cache *cache, u64 bytenr, int level)
2558
+{
2559
+ struct btrfs_backref_node *node;
2560
+
2561
+ ASSERT(level >= 0 && level < BTRFS_MAX_LEVEL);
2562
+ node = kzalloc(sizeof(*node), GFP_NOFS);
2563
+ if (!node)
2564
+ return node;
2565
+
2566
+ INIT_LIST_HEAD(&node->list);
2567
+ INIT_LIST_HEAD(&node->upper);
2568
+ INIT_LIST_HEAD(&node->lower);
2569
+ RB_CLEAR_NODE(&node->rb_node);
2570
+ cache->nr_nodes++;
2571
+ node->level = level;
2572
+ node->bytenr = bytenr;
2573
+
2574
+ return node;
2575
+}
2576
+
2577
+struct btrfs_backref_edge *btrfs_backref_alloc_edge(
2578
+ struct btrfs_backref_cache *cache)
2579
+{
2580
+ struct btrfs_backref_edge *edge;
2581
+
2582
+ edge = kzalloc(sizeof(*edge), GFP_NOFS);
2583
+ if (edge)
2584
+ cache->nr_edges++;
2585
+ return edge;
2586
+}
2587
+
2588
+/*
2589
+ * Drop the backref node from cache, also cleaning up all its
2590
+ * upper edges and any uncached nodes in the path.
2591
+ *
2592
+ * This cleanup happens bottom up, thus the node should either
2593
+ * be the lowest node in the cache or a detached node.
2594
+ */
2595
+void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
2596
+ struct btrfs_backref_node *node)
2597
+{
2598
+ struct btrfs_backref_node *upper;
2599
+ struct btrfs_backref_edge *edge;
2600
+
2601
+ if (!node)
2602
+ return;
2603
+
2604
+ BUG_ON(!node->lowest && !node->detached);
2605
+ while (!list_empty(&node->upper)) {
2606
+ edge = list_entry(node->upper.next, struct btrfs_backref_edge,
2607
+ list[LOWER]);
2608
+ upper = edge->node[UPPER];
2609
+ list_del(&edge->list[LOWER]);
2610
+ list_del(&edge->list[UPPER]);
2611
+ btrfs_backref_free_edge(cache, edge);
2612
+
2613
+ /*
2614
+ * Add the node to leaf node list if no other child block
2615
+ * cached.
2616
+ */
2617
+ if (list_empty(&upper->lower)) {
2618
+ list_add_tail(&upper->lower, &cache->leaves);
2619
+ upper->lowest = 1;
2620
+ }
2621
+ }
2622
+
2623
+ btrfs_backref_drop_node(cache, node);
2624
+}
2625
+
2626
+/*
2627
+ * Release all nodes/edges from current cache
2628
+ */
2629
+void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
2630
+{
2631
+ struct btrfs_backref_node *node;
2632
+ int i;
2633
+
2634
+ while (!list_empty(&cache->detached)) {
2635
+ node = list_entry(cache->detached.next,
2636
+ struct btrfs_backref_node, list);
2637
+ btrfs_backref_cleanup_node(cache, node);
2638
+ }
2639
+
2640
+ while (!list_empty(&cache->leaves)) {
2641
+ node = list_entry(cache->leaves.next,
2642
+ struct btrfs_backref_node, lower);
2643
+ btrfs_backref_cleanup_node(cache, node);
2644
+ }
2645
+
2646
+ cache->last_trans = 0;
2647
+
2648
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
2649
+ ASSERT(list_empty(&cache->pending[i]));
2650
+ ASSERT(list_empty(&cache->pending_edge));
2651
+ ASSERT(list_empty(&cache->useless_node));
2652
+ ASSERT(list_empty(&cache->changed));
2653
+ ASSERT(list_empty(&cache->detached));
2654
+ ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
2655
+ ASSERT(!cache->nr_nodes);
2656
+ ASSERT(!cache->nr_edges);
2657
+}
2658
+
2659
+/*
2660
+ * Handle direct tree backref
2661
+ *
2662
+ * Direct tree backref means, the backref item shows its parent bytenr
2663
+ * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined).
2664
+ *
2665
+ * @ref_key: The converted backref key.
2666
+ * For keyed backref, it's the item key.
2667
+ * For inlined backref, objectid is the bytenr,
2668
+ * type is btrfs_inline_ref_type, offset is
2669
+ * btrfs_inline_ref_offset.
2670
+ */
2671
+static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
2672
+ struct btrfs_key *ref_key,
2673
+ struct btrfs_backref_node *cur)
2674
+{
2675
+ struct btrfs_backref_edge *edge;
2676
+ struct btrfs_backref_node *upper;
2677
+ struct rb_node *rb_node;
2678
+
2679
+ ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY);
2680
+
2681
+ /* Only reloc root uses backref pointing to itself */
2682
+ if (ref_key->objectid == ref_key->offset) {
2683
+ struct btrfs_root *root;
2684
+
2685
+ cur->is_reloc_root = 1;
2686
+ /* Only reloc backref cache cares about a specific root */
2687
+ if (cache->is_reloc) {
2688
+ root = find_reloc_root(cache->fs_info, cur->bytenr);
2689
+ if (!root)
2690
+ return -ENOENT;
2691
+ cur->root = root;
2692
+ } else {
2693
+ /*
2694
+ * For generic purpose backref cache, reloc root node
2695
+ * is useless.
2696
+ */
2697
+ list_add(&cur->list, &cache->useless_node);
2698
+ }
2699
+ return 0;
2700
+ }
2701
+
2702
+ edge = btrfs_backref_alloc_edge(cache);
2703
+ if (!edge)
2704
+ return -ENOMEM;
2705
+
2706
+ rb_node = rb_simple_search(&cache->rb_root, ref_key->offset);
2707
+ if (!rb_node) {
2708
+ /* Parent node not yet cached */
2709
+ upper = btrfs_backref_alloc_node(cache, ref_key->offset,
2710
+ cur->level + 1);
2711
+ if (!upper) {
2712
+ btrfs_backref_free_edge(cache, edge);
2713
+ return -ENOMEM;
2714
+ }
2715
+
2716
+ /*
2717
+ * Backrefs for the upper level block isn't cached, add the
2718
+ * block to pending list
2719
+ */
2720
+ list_add_tail(&edge->list[UPPER], &cache->pending_edge);
2721
+ } else {
2722
+ /* Parent node already cached */
2723
+ upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
2724
+ ASSERT(upper->checked);
2725
+ INIT_LIST_HEAD(&edge->list[UPPER]);
2726
+ }
2727
+ btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER);
2728
+ return 0;
2729
+}
2730
+
2731
+/*
2732
+ * Handle indirect tree backref
2733
+ *
2734
+ * Indirect tree backref means, we only know which tree the node belongs to.
2735
+ * We still need to do a tree search to find out the parents. This is for
2736
+ * TREE_BLOCK_REF backref (keyed or inlined).
2737
+ *
2738
+ * @ref_key: The same as @ref_key in handle_direct_tree_backref()
2739
+ * @tree_key: The first key of this tree block.
2740
+ * @path: A clean (released) path, to avoid allocating path everytime
2741
+ * the function get called.
2742
+ */
2743
+static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
2744
+ struct btrfs_path *path,
2745
+ struct btrfs_key *ref_key,
2746
+ struct btrfs_key *tree_key,
2747
+ struct btrfs_backref_node *cur)
2748
+{
2749
+ struct btrfs_fs_info *fs_info = cache->fs_info;
2750
+ struct btrfs_backref_node *upper;
2751
+ struct btrfs_backref_node *lower;
2752
+ struct btrfs_backref_edge *edge;
2753
+ struct extent_buffer *eb;
2754
+ struct btrfs_root *root;
2755
+ struct rb_node *rb_node;
2756
+ int level;
2757
+ bool need_check = true;
2758
+ int ret;
2759
+
2760
+ root = btrfs_get_fs_root(fs_info, ref_key->offset, false);
2761
+ if (IS_ERR(root))
2762
+ return PTR_ERR(root);
2763
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
2764
+ cur->cowonly = 1;
2765
+
2766
+ if (btrfs_root_level(&root->root_item) == cur->level) {
2767
+ /* Tree root */
2768
+ ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr);
2769
+ /*
2770
+ * For reloc backref cache, we may ignore reloc root. But for
2771
+ * general purpose backref cache, we can't rely on
2772
+ * btrfs_should_ignore_reloc_root() as it may conflict with
2773
+ * current running relocation and lead to missing root.
2774
+ *
2775
+ * For general purpose backref cache, reloc root detection is
2776
+ * completely relying on direct backref (key->offset is parent
2777
+ * bytenr), thus only do such check for reloc cache.
2778
+ */
2779
+ if (btrfs_should_ignore_reloc_root(root) && cache->is_reloc) {
2780
+ btrfs_put_root(root);
2781
+ list_add(&cur->list, &cache->useless_node);
2782
+ } else {
2783
+ cur->root = root;
2784
+ }
2785
+ return 0;
2786
+ }
2787
+
2788
+ level = cur->level + 1;
2789
+
2790
+ /* Search the tree to find parent blocks referring to the block */
2791
+ path->search_commit_root = 1;
2792
+ path->skip_locking = 1;
2793
+ path->lowest_level = level;
2794
+ ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0);
2795
+ path->lowest_level = 0;
2796
+ if (ret < 0) {
2797
+ btrfs_put_root(root);
2798
+ return ret;
2799
+ }
2800
+ if (ret > 0 && path->slots[level] > 0)
2801
+ path->slots[level]--;
2802
+
2803
+ eb = path->nodes[level];
2804
+ if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
2805
+ btrfs_err(fs_info,
2806
+"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
2807
+ cur->bytenr, level - 1, root->root_key.objectid,
2808
+ tree_key->objectid, tree_key->type, tree_key->offset);
2809
+ btrfs_put_root(root);
2810
+ ret = -ENOENT;
2811
+ goto out;
2812
+ }
2813
+ lower = cur;
2814
+
2815
+ /* Add all nodes and edges in the path */
2816
+ for (; level < BTRFS_MAX_LEVEL; level++) {
2817
+ if (!path->nodes[level]) {
2818
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
2819
+ lower->bytenr);
2820
+ /* Same as previous should_ignore_reloc_root() call */
2821
+ if (btrfs_should_ignore_reloc_root(root) &&
2822
+ cache->is_reloc) {
2823
+ btrfs_put_root(root);
2824
+ list_add(&lower->list, &cache->useless_node);
2825
+ } else {
2826
+ lower->root = root;
2827
+ }
2828
+ break;
2829
+ }
2830
+
2831
+ edge = btrfs_backref_alloc_edge(cache);
2832
+ if (!edge) {
2833
+ btrfs_put_root(root);
2834
+ ret = -ENOMEM;
2835
+ goto out;
2836
+ }
2837
+
2838
+ eb = path->nodes[level];
2839
+ rb_node = rb_simple_search(&cache->rb_root, eb->start);
2840
+ if (!rb_node) {
2841
+ upper = btrfs_backref_alloc_node(cache, eb->start,
2842
+ lower->level + 1);
2843
+ if (!upper) {
2844
+ btrfs_put_root(root);
2845
+ btrfs_backref_free_edge(cache, edge);
2846
+ ret = -ENOMEM;
2847
+ goto out;
2848
+ }
2849
+ upper->owner = btrfs_header_owner(eb);
2850
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
2851
+ upper->cowonly = 1;
2852
+
2853
+ /*
2854
+ * If we know the block isn't shared we can avoid
2855
+ * checking its backrefs.
2856
+ */
2857
+ if (btrfs_block_can_be_shared(root, eb))
2858
+ upper->checked = 0;
2859
+ else
2860
+ upper->checked = 1;
2861
+
2862
+ /*
2863
+ * Add the block to pending list if we need to check its
2864
+ * backrefs, we only do this once while walking up a
2865
+ * tree as we will catch anything else later on.
2866
+ */
2867
+ if (!upper->checked && need_check) {
2868
+ need_check = false;
2869
+ list_add_tail(&edge->list[UPPER],
2870
+ &cache->pending_edge);
2871
+ } else {
2872
+ if (upper->checked)
2873
+ need_check = true;
2874
+ INIT_LIST_HEAD(&edge->list[UPPER]);
2875
+ }
2876
+ } else {
2877
+ upper = rb_entry(rb_node, struct btrfs_backref_node,
2878
+ rb_node);
2879
+ ASSERT(upper->checked);
2880
+ INIT_LIST_HEAD(&edge->list[UPPER]);
2881
+ if (!upper->owner)
2882
+ upper->owner = btrfs_header_owner(eb);
2883
+ }
2884
+ btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER);
2885
+
2886
+ if (rb_node) {
2887
+ btrfs_put_root(root);
2888
+ break;
2889
+ }
2890
+ lower = upper;
2891
+ upper = NULL;
2892
+ }
2893
+out:
2894
+ btrfs_release_path(path);
2895
+ return ret;
2896
+}
2897
+
2898
+/*
2899
+ * Add backref node @cur into @cache.
2900
+ *
2901
+ * NOTE: Even if the function returned 0, @cur is not yet cached as its upper
2902
+ * links aren't yet bi-directional. Needs to finish such links.
2903
+ * Use btrfs_backref_finish_upper_links() to finish such linkage.
2904
+ *
2905
+ * @path: Released path for indirect tree backref lookup
2906
+ * @iter: Released backref iter for extent tree search
2907
+ * @node_key: The first key of the tree block
2908
+ */
2909
+int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
2910
+ struct btrfs_path *path,
2911
+ struct btrfs_backref_iter *iter,
2912
+ struct btrfs_key *node_key,
2913
+ struct btrfs_backref_node *cur)
2914
+{
2915
+ struct btrfs_fs_info *fs_info = cache->fs_info;
2916
+ struct btrfs_backref_edge *edge;
2917
+ struct btrfs_backref_node *exist;
2918
+ int ret;
2919
+
2920
+ ret = btrfs_backref_iter_start(iter, cur->bytenr);
2921
+ if (ret < 0)
2922
+ return ret;
2923
+ /*
2924
+ * We skip the first btrfs_tree_block_info, as we don't use the key
2925
+ * stored in it, but fetch it from the tree block
2926
+ */
2927
+ if (btrfs_backref_has_tree_block_info(iter)) {
2928
+ ret = btrfs_backref_iter_next(iter);
2929
+ if (ret < 0)
2930
+ goto out;
2931
+ /* No extra backref? This means the tree block is corrupted */
2932
+ if (ret > 0) {
2933
+ ret = -EUCLEAN;
2934
+ goto out;
2935
+ }
2936
+ }
2937
+ WARN_ON(cur->checked);
2938
+ if (!list_empty(&cur->upper)) {
2939
+ /*
2940
+ * The backref was added previously when processing backref of
2941
+ * type BTRFS_TREE_BLOCK_REF_KEY
2942
+ */
2943
+ ASSERT(list_is_singular(&cur->upper));
2944
+ edge = list_entry(cur->upper.next, struct btrfs_backref_edge,
2945
+ list[LOWER]);
2946
+ ASSERT(list_empty(&edge->list[UPPER]));
2947
+ exist = edge->node[UPPER];
2948
+ /*
2949
+ * Add the upper level block to pending list if we need check
2950
+ * its backrefs
2951
+ */
2952
+ if (!exist->checked)
2953
+ list_add_tail(&edge->list[UPPER], &cache->pending_edge);
2954
+ } else {
2955
+ exist = NULL;
2956
+ }
2957
+
2958
+ for (; ret == 0; ret = btrfs_backref_iter_next(iter)) {
2959
+ struct extent_buffer *eb;
2960
+ struct btrfs_key key;
2961
+ int type;
2962
+
2963
+ cond_resched();
2964
+ eb = btrfs_backref_get_eb(iter);
2965
+
2966
+ key.objectid = iter->bytenr;
2967
+ if (btrfs_backref_iter_is_inline_ref(iter)) {
2968
+ struct btrfs_extent_inline_ref *iref;
2969
+
2970
+ /* Update key for inline backref */
2971
+ iref = (struct btrfs_extent_inline_ref *)
2972
+ ((unsigned long)iter->cur_ptr);
2973
+ type = btrfs_get_extent_inline_ref_type(eb, iref,
2974
+ BTRFS_REF_TYPE_BLOCK);
2975
+ if (type == BTRFS_REF_TYPE_INVALID) {
2976
+ ret = -EUCLEAN;
2977
+ goto out;
2978
+ }
2979
+ key.type = type;
2980
+ key.offset = btrfs_extent_inline_ref_offset(eb, iref);
2981
+ } else {
2982
+ key.type = iter->cur_key.type;
2983
+ key.offset = iter->cur_key.offset;
2984
+ }
2985
+
2986
+ /*
2987
+ * Parent node found and matches current inline ref, no need to
2988
+ * rebuild this node for this inline ref
2989
+ */
2990
+ if (exist &&
2991
+ ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
2992
+ exist->owner == key.offset) ||
2993
+ (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
2994
+ exist->bytenr == key.offset))) {
2995
+ exist = NULL;
2996
+ continue;
2997
+ }
2998
+
2999
+ /* SHARED_BLOCK_REF means key.offset is the parent bytenr */
3000
+ if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
3001
+ ret = handle_direct_tree_backref(cache, &key, cur);
3002
+ if (ret < 0)
3003
+ goto out;
3004
+ continue;
3005
+ } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
3006
+ ret = -EINVAL;
3007
+ btrfs_print_v0_err(fs_info);
3008
+ btrfs_handle_fs_error(fs_info, ret, NULL);
3009
+ goto out;
3010
+ } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
3011
+ continue;
3012
+ }
3013
+
3014
+ /*
3015
+ * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
3016
+ * means the root objectid. We need to search the tree to get
3017
+ * its parent bytenr.
3018
+ */
3019
+ ret = handle_indirect_tree_backref(cache, path, &key, node_key,
3020
+ cur);
3021
+ if (ret < 0)
3022
+ goto out;
3023
+ }
3024
+ ret = 0;
3025
+ cur->checked = 1;
3026
+ WARN_ON(exist);
3027
+out:
3028
+ btrfs_backref_iter_release(iter);
3029
+ return ret;
3030
+}
3031
+
3032
+/*
3033
+ * Finish the upwards linkage created by btrfs_backref_add_tree_node()
3034
+ */
3035
+int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
3036
+ struct btrfs_backref_node *start)
3037
+{
3038
+ struct list_head *useless_node = &cache->useless_node;
3039
+ struct btrfs_backref_edge *edge;
3040
+ struct rb_node *rb_node;
3041
+ LIST_HEAD(pending_edge);
3042
+
3043
+ ASSERT(start->checked);
3044
+
3045
+ /* Insert this node to cache if it's not COW-only */
3046
+ if (!start->cowonly) {
3047
+ rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
3048
+ &start->rb_node);
3049
+ if (rb_node)
3050
+ btrfs_backref_panic(cache->fs_info, start->bytenr,
3051
+ -EEXIST);
3052
+ list_add_tail(&start->lower, &cache->leaves);
3053
+ }
3054
+
3055
+ /*
3056
+ * Use breadth first search to iterate all related edges.
3057
+ *
3058
+ * The starting points are all the edges of this node
3059
+ */
3060
+ list_for_each_entry(edge, &start->upper, list[LOWER])
3061
+ list_add_tail(&edge->list[UPPER], &pending_edge);
3062
+
3063
+ while (!list_empty(&pending_edge)) {
3064
+ struct btrfs_backref_node *upper;
3065
+ struct btrfs_backref_node *lower;
3066
+
3067
+ edge = list_first_entry(&pending_edge,
3068
+ struct btrfs_backref_edge, list[UPPER]);
3069
+ list_del_init(&edge->list[UPPER]);
3070
+ upper = edge->node[UPPER];
3071
+ lower = edge->node[LOWER];
3072
+
3073
+ /* Parent is detached, no need to keep any edges */
3074
+ if (upper->detached) {
3075
+ list_del(&edge->list[LOWER]);
3076
+ btrfs_backref_free_edge(cache, edge);
3077
+
3078
+ /* Lower node is orphan, queue for cleanup */
3079
+ if (list_empty(&lower->upper))
3080
+ list_add(&lower->list, useless_node);
3081
+ continue;
3082
+ }
3083
+
3084
+ /*
3085
+ * All new nodes added in current build_backref_tree() haven't
3086
+ * been linked to the cache rb tree.
3087
+ * So if we have upper->rb_node populated, this means a cache
3088
+ * hit. We only need to link the edge, as @upper and all its
3089
+ * parents have already been linked.
3090
+ */
3091
+ if (!RB_EMPTY_NODE(&upper->rb_node)) {
3092
+ if (upper->lowest) {
3093
+ list_del_init(&upper->lower);
3094
+ upper->lowest = 0;
3095
+ }
3096
+
3097
+ list_add_tail(&edge->list[UPPER], &upper->lower);
3098
+ continue;
3099
+ }
3100
+
3101
+ /* Sanity check, we shouldn't have any unchecked nodes */
3102
+ if (!upper->checked) {
3103
+ ASSERT(0);
3104
+ return -EUCLEAN;
3105
+ }
3106
+
3107
+ /* Sanity check, COW-only node has non-COW-only parent */
3108
+ if (start->cowonly != upper->cowonly) {
3109
+ ASSERT(0);
3110
+ return -EUCLEAN;
3111
+ }
3112
+
3113
+ /* Only cache non-COW-only (subvolume trees) tree blocks */
3114
+ if (!upper->cowonly) {
3115
+ rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
3116
+ &upper->rb_node);
3117
+ if (rb_node) {
3118
+ btrfs_backref_panic(cache->fs_info,
3119
+ upper->bytenr, -EEXIST);
3120
+ return -EUCLEAN;
3121
+ }
3122
+ }
3123
+
3124
+ list_add_tail(&edge->list[UPPER], &upper->lower);
3125
+
3126
+ /*
3127
+ * Also queue all the parent edges of this uncached node
3128
+ * to finish the upper linkage
3129
+ */
3130
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
3131
+ list_add_tail(&edge->list[UPPER], &pending_edge);
3132
+ }
3133
+ return 0;
3134
+}
3135
+
3136
+void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache,
3137
+ struct btrfs_backref_node *node)
3138
+{
3139
+ struct btrfs_backref_node *lower;
3140
+ struct btrfs_backref_node *upper;
3141
+ struct btrfs_backref_edge *edge;
3142
+
3143
+ while (!list_empty(&cache->useless_node)) {
3144
+ lower = list_first_entry(&cache->useless_node,
3145
+ struct btrfs_backref_node, list);
3146
+ list_del_init(&lower->list);
3147
+ }
3148
+ while (!list_empty(&cache->pending_edge)) {
3149
+ edge = list_first_entry(&cache->pending_edge,
3150
+ struct btrfs_backref_edge, list[UPPER]);
3151
+ list_del(&edge->list[UPPER]);
3152
+ list_del(&edge->list[LOWER]);
3153
+ lower = edge->node[LOWER];
3154
+ upper = edge->node[UPPER];
3155
+ btrfs_backref_free_edge(cache, edge);
3156
+
3157
+ /*
3158
+ * Lower is no longer linked to any upper backref nodes and
3159
+ * isn't in the cache, we can free it ourselves.
3160
+ */
3161
+ if (list_empty(&lower->upper) &&
3162
+ RB_EMPTY_NODE(&lower->rb_node))
3163
+ list_add(&lower->list, &cache->useless_node);
3164
+
3165
+ if (!RB_EMPTY_NODE(&upper->rb_node))
3166
+ continue;
3167
+
3168
+ /* Add this guy's upper edges to the list to process */
3169
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
3170
+ list_add_tail(&edge->list[UPPER],
3171
+ &cache->pending_edge);
3172
+ if (list_empty(&upper->upper))
3173
+ list_add(&upper->list, &cache->useless_node);
3174
+ }
3175
+
3176
+ while (!list_empty(&cache->useless_node)) {
3177
+ lower = list_first_entry(&cache->useless_node,
3178
+ struct btrfs_backref_node, list);
3179
+ list_del_init(&lower->list);
3180
+ if (lower == node)
3181
+ node = NULL;
3182
+ btrfs_backref_drop_node(cache, lower);
3183
+ }
3184
+
3185
+ btrfs_backref_cleanup_node(cache, node);
3186
+ ASSERT(list_empty(&cache->useless_node) &&
3187
+ list_empty(&cache->pending_edge));
3188
+}
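
The backref iterator added above follows the convention spelled out in its comment: btrfs_backref_iter_next() returns 0 when it lands on another backref item, >0 when the current bytenr has no more, and <0 on error, with btrfs_backref_iter_start() positioning on the first item and btrfs_backref_iter_release() dropping the path, as btrfs_backref_add_tree_node() does in its for loop. A minimal user-space analogue of driving an iterator with that return convention (array-backed toy, not the kernel API):

/*
 * Sketch of the 0 / >0 / <0 iterator convention: start() positions on
 * the first item, next() advances, and the caller loops while the
 * return value is 0.
 */
#include <stdio.h>

struct toy_iter {
	const int *items;
	int count;
	int pos;
};

static int toy_iter_start(struct toy_iter *it, const int *items, int count)
{
	if (count < 0)
		return -1;		/* error */
	it->items = items;
	it->count = count;
	it->pos = 0;
	return count ? 0 : 1;		/* 1: nothing to iterate */
}

static int toy_iter_next(struct toy_iter *it)
{
	if (++it->pos >= it->count)
		return 1;		/* no more items */
	return 0;
}

int main(void)
{
	const int backrefs[] = { 100, 200, 300 };
	struct toy_iter it;
	int ret;

	for (ret = toy_iter_start(&it, backrefs, 3); ret == 0;
	     ret = toy_iter_next(&it))
		printf("backref item %d\n", it.items[it.pos]);

	if (ret < 0)
		fprintf(stderr, "iteration failed\n");
	return ret < 0 ? 1 : 0;
}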