hc
2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/fs/btrfs/reada.c
....@@ -14,6 +14,7 @@
1414 #include "disk-io.h"
1515 #include "transaction.h"
1616 #include "dev-replace.h"
17
+#include "block-group.h"
1718
1819 #undef DEBUG
1920
....@@ -226,7 +227,7 @@
226227 struct btrfs_fs_info *fs_info = dev->fs_info;
227228 int ret;
228229 struct reada_zone *zone;
229
- struct btrfs_block_group_cache *cache = NULL;
230
+ struct btrfs_block_group *cache = NULL;
230231 u64 start;
231232 u64 end;
232233 int i;
....@@ -247,8 +248,8 @@
247248 if (!cache)
248249 return NULL;
249250
250
- start = cache->key.objectid;
251
- end = start + cache->key.offset - 1;
251
+ start = cache->start;
252
+ end = start + cache->length - 1;
252253 btrfs_put_block_group(cache);
253254
254255 zone = kzalloc(sizeof(*zone), GFP_KERNEL);
....@@ -376,26 +377,28 @@
376377 goto error;
377378 }
378379
380
+ /* Insert extent in reada tree + all per-device trees, all or nothing */
381
+ down_read(&fs_info->dev_replace.rwsem);
379382 ret = radix_tree_preload(GFP_KERNEL);
380
- if (ret)
383
+ if (ret) {
384
+ up_read(&fs_info->dev_replace.rwsem);
381385 goto error;
386
+ }
382387
383
- /* insert extent in reada_tree + all per-device trees, all or nothing */
384
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
385388 spin_lock(&fs_info->reada_lock);
386389 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
387390 if (ret == -EEXIST) {
388391 re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
389392 re_exist->refcnt++;
390393 spin_unlock(&fs_info->reada_lock);
391
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
392394 radix_tree_preload_end();
395
+ up_read(&fs_info->dev_replace.rwsem);
393396 goto error;
394397 }
395398 if (ret) {
396399 spin_unlock(&fs_info->reada_lock);
397
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
398400 radix_tree_preload_end();
401
+ up_read(&fs_info->dev_replace.rwsem);
399402 goto error;
400403 }
401404 radix_tree_preload_end();
....@@ -418,6 +421,9 @@
418421 if (!dev->bdev)
419422 continue;
420423
424
+ if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state))
425
+ continue;
426
+
421427 if (dev_replace_is_ongoing &&
422428 dev == fs_info->dev_replace.tgtdev) {
423429 /*
....@@ -437,7 +443,7 @@
437443 }
438444 radix_tree_delete(&fs_info->reada_tree, index);
439445 spin_unlock(&fs_info->reada_lock);
440
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
446
+ up_read(&fs_info->dev_replace.rwsem);
441447 goto error;
442448 }
443449 have_zone = 1;
....@@ -445,7 +451,7 @@
445451 if (!have_zone)
446452 radix_tree_delete(&fs_info->reada_tree, index);
447453 spin_unlock(&fs_info->reada_lock);
448
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
454
+ up_read(&fs_info->dev_replace.rwsem);
449455
450456 if (!have_zone)
451457 goto error;
....@@ -638,6 +644,35 @@
638644 return 1;
639645 }
640646
647
+static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
648
+ int mirror_num, struct extent_buffer **eb)
649
+{
650
+ struct extent_buffer *buf = NULL;
651
+ int ret;
652
+
653
+ buf = btrfs_find_create_tree_block(fs_info, bytenr);
654
+ if (IS_ERR(buf))
655
+ return 0;
656
+
657
+ set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
658
+
659
+ ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
660
+ if (ret) {
661
+ free_extent_buffer_stale(buf);
662
+ return ret;
663
+ }
664
+
665
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
666
+ free_extent_buffer_stale(buf);
667
+ return -EIO;
668
+ } else if (extent_buffer_uptodate(buf)) {
669
+ *eb = buf;
670
+ } else {
671
+ free_extent_buffer(buf);
672
+ }
673
+ return 0;
674
+}
675
+
641676 static int reada_start_machine_dev(struct btrfs_device *dev)
642677 {
643678 struct btrfs_fs_info *fs_info = dev->fs_info;
....@@ -737,31 +772,39 @@
737772 kfree(rmw);
738773 }
739774
740
-static void __reada_start_machine(struct btrfs_fs_info *fs_info)
775
+/* Try to start up to 10k READA requests for a group of devices */
776
+static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
741777 {
742
- struct btrfs_device *device;
743
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
744778 u64 enqueued;
745779 u64 total = 0;
746
- int i;
780
+ struct btrfs_device *device;
747781
748
-again:
749782 do {
750783 enqueued = 0;
751
- mutex_lock(&fs_devices->device_list_mutex);
752784 list_for_each_entry(device, &fs_devices->devices, dev_list) {
753785 if (atomic_read(&device->reada_in_flight) <
754786 MAX_IN_FLIGHT)
755787 enqueued += reada_start_machine_dev(device);
756788 }
757
- mutex_unlock(&fs_devices->device_list_mutex);
758789 total += enqueued;
759790 } while (enqueued && total < 10000);
760
- if (fs_devices->seed) {
761
- fs_devices = fs_devices->seed;
762
- goto again;
763
- }
764791
792
+ return total;
793
+}
794
+
795
+static void __reada_start_machine(struct btrfs_fs_info *fs_info)
796
+{
797
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
798
+ int i;
799
+ u64 enqueued = 0;
800
+
801
+ mutex_lock(&fs_devices->device_list_mutex);
802
+
803
+ enqueued += reada_start_for_fsdevs(fs_devices);
804
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
805
+ enqueued += reada_start_for_fsdevs(seed_devs);
806
+
807
+ mutex_unlock(&fs_devices->device_list_mutex);
765808 if (enqueued == 0)
766809 return;
767810
....@@ -789,8 +832,7 @@
789832 /* FIXME we cannot handle this properly right now */
790833 BUG();
791834 }
792
- btrfs_init_work(&rmw->work, btrfs_readahead_helper,
793
- reada_start_machine_worker, NULL, NULL);
835
+ btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
794836 rmw->fs_info = fs_info;
795837
796838 btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
....@@ -983,3 +1025,45 @@
9831025
9841026 kref_put(&rc->refcnt, reada_control_release);
9851027 }
1028
+
1029
+/*
1030
+ * Before removing a device (device replace or device remove ioctls), call this
1031
+ * function to wait for all existing readahead requests on the device and to
1032
+ * make sure no one queues more readahead requests for the device.
1033
+ *
1034
+ * Must be called without holding either the device list mutex or the device
1035
+ * replace semaphore, otherwise it will deadlock.
1036
+ */
1037
+void btrfs_reada_remove_dev(struct btrfs_device *dev)
1038
+{
1039
+ struct btrfs_fs_info *fs_info = dev->fs_info;
1040
+
1041
+ /* Serialize with readahead extent creation at reada_find_extent(). */
1042
+ spin_lock(&fs_info->reada_lock);
1043
+ set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
1044
+ spin_unlock(&fs_info->reada_lock);
1045
+
1046
+ /*
1047
+ * There might be readahead requests added to the radix trees which
1048
+ * were not yet added to the readahead work queue. We need to start
1049
+ * them and wait for their completion, otherwise we can end up with
1050
+ * use-after-free problems when dropping the last reference on the
1051
+ * readahead extents and their zones, as they need to access the
1052
+ * device structure.
1053
+ */
1054
+ reada_start_machine(fs_info);
1055
+ btrfs_flush_workqueue(fs_info->readahead_workers);
1056
+}
1057
+
1058
+/*
1059
+ * If an error happens while removing a device (device replace or device remove
1060
+ * ioctls) after calling btrfs_reada_remove_dev(), call this to undo what that
1061
+ * function did. This is safe to call even if btrfs_reada_remove_dev() was not
1062
+ * called before.
1063
+ */
1064
+void btrfs_reada_undo_remove_dev(struct btrfs_device *dev)
1065
+{
1066
+ spin_lock(&dev->fs_info->reada_lock);
1067
+ clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
1068
+ spin_unlock(&dev->fs_info->reada_lock);
1069
+}