.. | .. |
---|
14 | 14 | #include "disk-io.h" |
---|
15 | 15 | #include "transaction.h" |
---|
16 | 16 | #include "dev-replace.h" |
---|
| 17 | +#include "block-group.h" |
---|
17 | 18 | |
---|
18 | 19 | #undef DEBUG |
---|
19 | 20 | |
---|
.. | .. |
---|
226 | 227 | struct btrfs_fs_info *fs_info = dev->fs_info; |
---|
227 | 228 | int ret; |
---|
228 | 229 | struct reada_zone *zone; |
---|
229 | | - struct btrfs_block_group_cache *cache = NULL; |
---|
| 230 | + struct btrfs_block_group *cache = NULL; |
---|
230 | 231 | u64 start; |
---|
231 | 232 | u64 end; |
---|
232 | 233 | int i; |
---|
.. | .. |
---|
247 | 248 | if (!cache) |
---|
248 | 249 | return NULL; |
---|
249 | 250 | |
---|
250 | | - start = cache->key.objectid; |
---|
251 | | - end = start + cache->key.offset - 1; |
---|
| 251 | + start = cache->start; |
---|
| 252 | + end = start + cache->length - 1; |
---|
252 | 253 | btrfs_put_block_group(cache); |
---|
253 | 254 | |
---|
254 | 255 | zone = kzalloc(sizeof(*zone), GFP_KERNEL); |
---|
.. | .. |
---|
376 | 377 | goto error; |
---|
377 | 378 | } |
---|
378 | 379 | |
---|
| 380 | + /* Insert extent in reada tree + all per-device trees, all or nothing */ |
---|
| 381 | + down_read(&fs_info->dev_replace.rwsem); |
---|
379 | 382 | ret = radix_tree_preload(GFP_KERNEL); |
---|
380 | | - if (ret) |
---|
| 383 | + if (ret) { |
---|
| 384 | + up_read(&fs_info->dev_replace.rwsem); |
---|
381 | 385 | goto error; |
---|
| 386 | + } |
---|
382 | 387 | |
---|
383 | | - /* insert extent in reada_tree + all per-device trees, all or nothing */ |
---|
384 | | - btrfs_dev_replace_read_lock(&fs_info->dev_replace); |
---|
385 | 388 | spin_lock(&fs_info->reada_lock); |
---|
386 | 389 | ret = radix_tree_insert(&fs_info->reada_tree, index, re); |
---|
387 | 390 | if (ret == -EEXIST) { |
---|
388 | 391 | re_exist = radix_tree_lookup(&fs_info->reada_tree, index); |
---|
389 | 392 | re_exist->refcnt++; |
---|
390 | 393 | spin_unlock(&fs_info->reada_lock); |
---|
391 | | - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); |
---|
392 | 394 | radix_tree_preload_end(); |
---|
| 395 | + up_read(&fs_info->dev_replace.rwsem); |
---|
393 | 396 | goto error; |
---|
394 | 397 | } |
---|
395 | 398 | if (ret) { |
---|
396 | 399 | spin_unlock(&fs_info->reada_lock); |
---|
397 | | - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); |
---|
398 | 400 | radix_tree_preload_end(); |
---|
| 401 | + up_read(&fs_info->dev_replace.rwsem); |
---|
399 | 402 | goto error; |
---|
400 | 403 | } |
---|
401 | 404 | radix_tree_preload_end(); |
---|
.. | .. |
---|
418 | 421 | if (!dev->bdev) |
---|
419 | 422 | continue; |
---|
420 | 423 | |
---|
| 424 | + if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state)) |
---|
| 425 | + continue; |
---|
| 426 | + |
---|
421 | 427 | if (dev_replace_is_ongoing && |
---|
422 | 428 | dev == fs_info->dev_replace.tgtdev) { |
---|
423 | 429 | /* |
---|
.. | .. |
---|
437 | 443 | } |
---|
438 | 444 | radix_tree_delete(&fs_info->reada_tree, index); |
---|
439 | 445 | spin_unlock(&fs_info->reada_lock); |
---|
440 | | - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); |
---|
| 446 | + up_read(&fs_info->dev_replace.rwsem); |
---|
441 | 447 | goto error; |
---|
442 | 448 | } |
---|
443 | 449 | have_zone = 1; |
---|
.. | .. |
---|
445 | 451 | if (!have_zone) |
---|
446 | 452 | radix_tree_delete(&fs_info->reada_tree, index); |
---|
447 | 453 | spin_unlock(&fs_info->reada_lock); |
---|
448 | | - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); |
---|
| 454 | + up_read(&fs_info->dev_replace.rwsem); |
---|
449 | 455 | |
---|
450 | 456 | if (!have_zone) |
---|
451 | 457 | goto error; |
---|
.. | .. |
---|
638 | 644 | return 1; |
---|
639 | 645 | } |
---|
640 | 646 | |
---|
| 647 | +static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, |
---|
| 648 | + int mirror_num, struct extent_buffer **eb) |
---|
| 649 | +{ |
---|
| 650 | + struct extent_buffer *buf = NULL; |
---|
| 651 | + int ret; |
---|
| 652 | + |
---|
| 653 | + buf = btrfs_find_create_tree_block(fs_info, bytenr); |
---|
| 654 | + if (IS_ERR(buf)) |
---|
| 655 | + return 0; |
---|
| 656 | + |
---|
| 657 | + set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); |
---|
| 658 | + |
---|
| 659 | + ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num); |
---|
| 660 | + if (ret) { |
---|
| 661 | + free_extent_buffer_stale(buf); |
---|
| 662 | + return ret; |
---|
| 663 | + } |
---|
| 664 | + |
---|
| 665 | + if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { |
---|
| 666 | + free_extent_buffer_stale(buf); |
---|
| 667 | + return -EIO; |
---|
| 668 | + } else if (extent_buffer_uptodate(buf)) { |
---|
| 669 | + *eb = buf; |
---|
| 670 | + } else { |
---|
| 671 | + free_extent_buffer(buf); |
---|
| 672 | + } |
---|
| 673 | + return 0; |
---|
| 674 | +} |
---|
| 675 | + |
---|
641 | 676 | static int reada_start_machine_dev(struct btrfs_device *dev) |
---|
642 | 677 | { |
---|
643 | 678 | struct btrfs_fs_info *fs_info = dev->fs_info; |
---|
.. | .. |
---|
737 | 772 | kfree(rmw); |
---|
738 | 773 | } |
---|
739 | 774 | |
---|
740 | | -static void __reada_start_machine(struct btrfs_fs_info *fs_info) |
---|
| 775 | +/* Try to start up to 10k READA requests for a group of devices */ |
---|
| 776 | +static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices) |
---|
741 | 777 | { |
---|
742 | | - struct btrfs_device *device; |
---|
743 | | - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
---|
744 | 778 | u64 enqueued; |
---|
745 | 779 | u64 total = 0; |
---|
746 | | - int i; |
---|
| 780 | + struct btrfs_device *device; |
---|
747 | 781 | |
---|
748 | | -again: |
---|
749 | 782 | do { |
---|
750 | 783 | enqueued = 0; |
---|
751 | | - mutex_lock(&fs_devices->device_list_mutex); |
---|
752 | 784 | list_for_each_entry(device, &fs_devices->devices, dev_list) { |
---|
753 | 785 | if (atomic_read(&device->reada_in_flight) < |
---|
754 | 786 | MAX_IN_FLIGHT) |
---|
755 | 787 | enqueued += reada_start_machine_dev(device); |
---|
756 | 788 | } |
---|
757 | | - mutex_unlock(&fs_devices->device_list_mutex); |
---|
758 | 789 | total += enqueued; |
---|
759 | 790 | } while (enqueued && total < 10000); |
---|
760 | | - if (fs_devices->seed) { |
---|
761 | | - fs_devices = fs_devices->seed; |
---|
762 | | - goto again; |
---|
763 | | - } |
---|
764 | 791 | |
---|
| 792 | + return total; |
---|
| 793 | +} |
---|
| 794 | + |
---|
| 795 | +static void __reada_start_machine(struct btrfs_fs_info *fs_info) |
---|
| 796 | +{ |
---|
| 797 | + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; |
---|
| 798 | + int i; |
---|
| 799 | + u64 enqueued = 0; |
---|
| 800 | + |
---|
| 801 | + mutex_lock(&fs_devices->device_list_mutex); |
---|
| 802 | + |
---|
| 803 | + enqueued += reada_start_for_fsdevs(fs_devices); |
---|
| 804 | + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) |
---|
| 805 | + enqueued += reada_start_for_fsdevs(seed_devs); |
---|
| 806 | + |
---|
| 807 | + mutex_unlock(&fs_devices->device_list_mutex); |
---|
765 | 808 | if (enqueued == 0) |
---|
766 | 809 | return; |
---|
767 | 810 | |
---|
.. | .. |
---|
789 | 832 | /* FIXME we cannot handle this properly right now */ |
---|
790 | 833 | BUG(); |
---|
791 | 834 | } |
---|
792 | | - btrfs_init_work(&rmw->work, btrfs_readahead_helper, |
---|
793 | | - reada_start_machine_worker, NULL, NULL); |
---|
| 835 | + btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); |
---|
794 | 836 | rmw->fs_info = fs_info; |
---|
795 | 837 | |
---|
796 | 838 | btrfs_queue_work(fs_info->readahead_workers, &rmw->work); |
---|
.. | .. |
---|
983 | 1025 | |
---|
984 | 1026 | kref_put(&rc->refcnt, reada_control_release); |
---|
985 | 1027 | } |
---|
| 1028 | + |
---|
| 1029 | +/* |
---|
| 1030 | + * Before removing a device (device replace or device remove ioctls), call this |
---|
| 1031 | + * function to wait for all existing readahead requests on the device and to |
---|
| 1032 | + * make sure no one queues more readahead requests for the device. |
---|
| 1033 | + * |
---|
| 1034 | + * Must be called without holding either the device list mutex or the device |
---|
| 1035 | + * replace semaphore, otherwise it will deadlock. |
---|
| 1036 | + */ |
---|
| 1037 | +void btrfs_reada_remove_dev(struct btrfs_device *dev) |
---|
| 1038 | +{ |
---|
| 1039 | + struct btrfs_fs_info *fs_info = dev->fs_info; |
---|
| 1040 | + |
---|
| 1041 | + /* Serialize with readahead extent creation at reada_find_extent(). */ |
---|
| 1042 | + spin_lock(&fs_info->reada_lock); |
---|
| 1043 | + set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state); |
---|
| 1044 | + spin_unlock(&fs_info->reada_lock); |
---|
| 1045 | + |
---|
| 1046 | + /* |
---|
| 1047 | + * There might be readahead requests added to the radix trees which |
---|
| 1048 | + * were not yet added to the readahead work queue. We need to start |
---|
| 1049 | + * them and wait for their completion, otherwise we can end up with |
---|
| 1050 | + * use-after-free problems when dropping the last reference on the |
---|
| 1051 | + * readahead extents and their zones, as they need to access the |
---|
| 1052 | + * device structure. |
---|
| 1053 | + */ |
---|
| 1054 | + reada_start_machine(fs_info); |
---|
| 1055 | + btrfs_flush_workqueue(fs_info->readahead_workers); |
---|
| 1056 | +} |
---|
| 1057 | + |
---|
| 1058 | +/* |
---|
| 1059 | + * If an error happens during device removal (device replace or device remove |
---|
| 1060 | + * ioctls) after btrfs_reada_remove_dev() was called, call this to undo what |
---|
| 1061 | + * that function did. This is safe to call even if btrfs_reada_remove_dev() was |
---|
| 1062 | + * not called before. |
---|
| 1063 | + */ |
---|
| 1064 | +void btrfs_reada_undo_remove_dev(struct btrfs_device *dev) |
---|
| 1065 | +{ |
---|
| 1066 | + spin_lock(&dev->fs_info->reada_lock); |
---|
| 1067 | + clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state); |
---|
| 1068 | + spin_unlock(&dev->fs_info->reada_lock); |
---|
| 1069 | +} |
---|