.. | .. |
6 | 6 | #include <linux/blkdev.h> |
7 | 7 | #include <linux/ratelimit.h> |
8 | 8 | #include <linux/sched/mm.h> |
| 9 | +#include <crypto/hash.h> |
9 | 10 | #include "ctree.h" |
| 11 | +#include "discard.h" |
10 | 12 | #include "volumes.h" |
11 | 13 | #include "disk-io.h" |
12 | 14 | #include "ordered-data.h" |
.. | .. |
17 | 19 | #include "check-integrity.h" |
18 | 20 | #include "rcu-string.h" |
19 | 21 | #include "raid56.h" |
| 22 | +#include "block-group.h" |
20 | 23 | |
21 | 24 | /* |
22 | 25 | * This is only the first step towards a full-featured scrub. It reads all |
.. | .. |
146 | 149 | */ |
147 | 150 | unsigned long *ebitmap; |
148 | 151 | |
149 | | - unsigned long bitmap[0]; |
| 152 | + unsigned long bitmap[]; |
150 | 153 | }; |
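
The bitmap[0] -> bitmap[] change above replaces a GNU zero-length array with a C99 flexible array member; the allocation math is unchanged because sizeof() ignores the flexible member in both forms. A minimal userspace sketch of the sizing pattern (the ebitmap struct and helper below are illustrative, not from the patch):

#include <stdlib.h>
#include <string.h>

struct ebitmap {
	unsigned long nbits;
	unsigned long bitmap[];		/* flexible array member, must be last */
};

static struct ebitmap *ebitmap_alloc(unsigned long nbits)
{
	size_t words = (nbits + 8 * sizeof(unsigned long) - 1) /
		       (8 * sizeof(unsigned long));
	/* sizeof(*eb) does not include the flexible array */
	struct ebitmap *eb = malloc(sizeof(*eb) + words * sizeof(unsigned long));

	if (eb) {
		eb->nbits = nbits;
		memset(eb->bitmap, 0, words * sizeof(unsigned long));
	}
	return eb;
}
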
151 | 154 | |
152 | 155 | struct scrub_ctx { |
.. | .. |
322 | 325 | struct rb_node *parent = NULL; |
323 | 326 | struct full_stripe_lock *entry; |
324 | 327 | struct full_stripe_lock *ret; |
325 | | - unsigned int nofs_flag; |
326 | 328 | |
327 | 329 | lockdep_assert_held(&locks_root->lock); |
328 | 330 | |
.. | .. |
342 | 344 | |
343 | 345 | /* |
344 | 346 | * Insert new lock. |
345 | | - * |
346 | | - * We must use GFP_NOFS because the scrub task might be waiting for a |
347 | | - * worker task executing this function and in turn a transaction commit |
348 | | - * might be waiting the scrub task to pause (which needs to wait for all |
349 | | - * the worker tasks to complete before pausing). |
350 | 347 | */ |
351 | | - nofs_flag = memalloc_nofs_save(); |
352 | 348 | ret = kmalloc(sizeof(*ret), GFP_KERNEL); |
353 | | - memalloc_nofs_restore(nofs_flag); |
354 | 349 | if (!ret) |
355 | 350 | return ERR_PTR(-ENOMEM); |
356 | 351 | ret->logical = fstripe_logical; |
.. | .. |
395 | 390 | * |
396 | 391 | * Caller must ensure @cache is a RAID56 block group. |
397 | 392 | */ |
398 | | -static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache, |
399 | | - u64 bytenr) |
| 393 | +static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr) |
400 | 394 | { |
401 | 395 | u64 ret; |
402 | 396 | |
.. | .. |
410 | 404 | * round_down() can only handle power of 2, while RAID56 full |
411 | 405 | * stripe length can be 64KiB * n, so we need to manually round down. |
412 | 406 | */ |
413 | | - ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) * |
414 | | - cache->full_stripe_len + cache->key.objectid; |
| 407 | + ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) * |
| 408 | + cache->full_stripe_len + cache->start; |
415 | 409 | return ret; |
416 | 410 | } |
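
The hunk above only renames cache->key.objectid to cache->start; the round-down math itself is untouched. Because a RAID56 full stripe length is 64KiB * n_data_stripes and thus usually not a power of two, round_down() cannot be used and the start is computed by integer divide-then-multiply. A standalone sketch of that arithmetic (illustrative names, plain C99):

#include <stdint.h>
#include <stdio.h>

/* Round bytenr down to its full stripe start for a stripe length that
 * need not be a power of two; mirrors the div64_u64() math above. */
static uint64_t full_stripe_start(uint64_t bg_start, uint64_t full_stripe_len,
				  uint64_t bytenr)
{
	return (bytenr - bg_start) / full_stripe_len * full_stripe_len +
	       bg_start;
}

int main(void)
{
	/* Block group at 1MiB, 3 data stripes of 64KiB => 192KiB full stripes */
	uint64_t bg = 1048576, fsl = 3 * 65536;

	/* 200KiB into the block group rounds down to 192KiB: prints 1245184 */
	printf("%llu\n", (unsigned long long)full_stripe_start(bg, fsl,
							       bg + 204800));
	return 0;
}
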
417 | 411 | |
.. | .. |
429 | 423 | static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, |
430 | 424 | bool *locked_ret) |
431 | 425 | { |
432 | | - struct btrfs_block_group_cache *bg_cache; |
| 426 | + struct btrfs_block_group *bg_cache; |
433 | 427 | struct btrfs_full_stripe_locks_tree *locks_root; |
434 | 428 | struct full_stripe_lock *existing; |
435 | 429 | u64 fstripe_start; |
.. | .. |
476 | 470 | static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, |
477 | 471 | bool locked) |
478 | 472 | { |
479 | | - struct btrfs_block_group_cache *bg_cache; |
| 473 | + struct btrfs_block_group *bg_cache; |
480 | 474 | struct btrfs_full_stripe_locks_tree *locks_root; |
481 | 475 | struct full_stripe_lock *fstripe_lock; |
482 | 476 | u64 fstripe_start; |
.. | .. |
604 | 598 | sbio->index = i; |
605 | 599 | sbio->sctx = sctx; |
606 | 600 | sbio->page_count = 0; |
607 | | - btrfs_init_work(&sbio->work, btrfs_scrub_helper, |
608 | | - scrub_bio_end_io_worker, NULL, NULL); |
| 601 | + btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL, |
| 602 | + NULL); |
609 | 603 | |
610 | 604 | if (i != SCRUB_BIOS_PER_SCTX - 1) |
611 | 605 | sctx->bios[i]->next_free = i + 1; |
.. | .. |
653 | 647 | struct btrfs_fs_info *fs_info = swarn->dev->fs_info; |
654 | 648 | struct inode_fs_paths *ipath = NULL; |
655 | 649 | struct btrfs_root *local_root; |
656 | | - struct btrfs_key root_key; |
657 | 650 | struct btrfs_key key; |
658 | 651 | |
659 | | - root_key.objectid = root; |
660 | | - root_key.type = BTRFS_ROOT_ITEM_KEY; |
661 | | - root_key.offset = (u64)-1; |
662 | | - local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); |
| 652 | + local_root = btrfs_get_fs_root(fs_info, root, true); |
663 | 653 | if (IS_ERR(local_root)) { |
664 | 654 | ret = PTR_ERR(local_root); |
665 | 655 | goto err; |
.. | .. |
674 | 664 | |
675 | 665 | ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); |
676 | 666 | if (ret) { |
| 667 | + btrfs_put_root(local_root); |
677 | 668 | btrfs_release_path(swarn->path); |
678 | 669 | goto err; |
679 | 670 | } |
.. | .. |
694 | 685 | ipath = init_ipath(4096, local_root, swarn->path); |
695 | 686 | memalloc_nofs_restore(nofs_flag); |
696 | 687 | if (IS_ERR(ipath)) { |
| 688 | + btrfs_put_root(local_root); |
697 | 689 | ret = PTR_ERR(ipath); |
698 | 690 | ipath = NULL; |
699 | 691 | goto err; |
.. | .. |
717 | 709 | min(isize - offset, (u64)PAGE_SIZE), nlink, |
718 | 710 | (char *)(unsigned long)ipath->fspath->val[i]); |
719 | 711 | |
| 712 | + btrfs_put_root(local_root); |
720 | 713 | free_ipath(ipath); |
721 | 714 | return 0; |
722 | 715 | |
.. | .. |
841 | 834 | int page_num; |
842 | 835 | int success; |
843 | 836 | bool full_stripe_locked; |
844 | | - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, |
| 837 | + unsigned int nofs_flag; |
| 838 | + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, |
845 | 839 | DEFAULT_RATELIMIT_BURST); |
846 | 840 | |
847 | 841 | BUG_ON(sblock_to_check->page_count < 1); |
.. | .. |
866 | 860 | dev = sblock_to_check->pagev[0]->dev; |
867 | 861 | |
868 | 862 | /* |
| 863 | + * We must use GFP_NOFS because the scrub task might be waiting for a |
| 864 | + * worker task executing this function and in turn a transaction commit |
| 865 | + * might be waiting for the scrub task to pause (which needs to wait for all |
| 866 | + * the worker tasks to complete before pausing). |
| 867 | + * We do allocations in the workers through insert_full_stripe_lock() |
| 868 | + * and scrub_add_page_to_wr_bio(), which happens down the call chain of |
| 869 | + * this function. |
| 870 | + */ |
| 871 | + nofs_flag = memalloc_nofs_save(); |
| 872 | + /* |
869 | 873 | * For RAID5/6, race can happen for a different device scrub thread. |
870 | 874 | * For data corruption, Parity and Data threads will both try |
871 | 875 | * to recover the data. |
.. | .. |
874 | 878 | */ |
875 | 879 | ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); |
876 | 880 | if (ret < 0) { |
| 881 | + memalloc_nofs_restore(nofs_flag); |
877 | 882 | spin_lock(&sctx->stat_lock); |
878 | 883 | if (ret == -ENOMEM) |
879 | 884 | sctx->stat.malloc_errors++; |
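
The comment added above explains why the NOFS scope now lives at the top of scrub_handle_errored_block(): everything allocated between memalloc_nofs_save() and memalloc_nofs_restore() behaves as GFP_NOFS even when a callee passes GFP_KERNEL, which is why the per-helper save/restore pairs (and the GFP_NOFS flags) are removed elsewhere in this patch. A minimal kernel-style sketch of the pattern, not taken from the patch itself:

#include <linux/sched/mm.h>
#include <linux/slab.h>

/*
 * Sketch of the scoped-NOFS pattern: every allocation between save and
 * restore behaves as GFP_NOFS even if the callee asks for GFP_KERNEL,
 * so deep helpers no longer need their own save/restore pairs.
 */
static void *alloc_in_nofs_scope(size_t size)
{
	unsigned int nofs_flag;
	void *p;

	nofs_flag = memalloc_nofs_save();
	p = kmalloc(size, GFP_KERNEL);	/* effectively GFP_NOFS here */
	memalloc_nofs_restore(nofs_flag);
	return p;
}
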
.. | .. |
913 | 918 | */ |
914 | 919 | |
915 | 920 | sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, |
916 | | - sizeof(*sblocks_for_recheck), GFP_NOFS); |
| 921 | + sizeof(*sblocks_for_recheck), GFP_KERNEL); |
917 | 922 | if (!sblocks_for_recheck) { |
918 | 923 | spin_lock(&sctx->stat_lock); |
919 | 924 | sctx->stat.malloc_errors++; |
.. | .. |
964 | 969 | spin_lock(&sctx->stat_lock); |
965 | 970 | sctx->stat.read_errors++; |
966 | 971 | spin_unlock(&sctx->stat_lock); |
967 | | - if (__ratelimit(&_rs)) |
| 972 | + if (__ratelimit(&rs)) |
968 | 973 | scrub_print_warning("i/o error", sblock_to_check); |
969 | 974 | btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); |
970 | 975 | } else if (sblock_bad->checksum_error) { |
971 | 976 | spin_lock(&sctx->stat_lock); |
972 | 977 | sctx->stat.csum_errors++; |
973 | 978 | spin_unlock(&sctx->stat_lock); |
974 | | - if (__ratelimit(&_rs)) |
| 979 | + if (__ratelimit(&rs)) |
975 | 980 | scrub_print_warning("checksum error", sblock_to_check); |
976 | 981 | btrfs_dev_stat_inc_and_print(dev, |
977 | 982 | BTRFS_DEV_STAT_CORRUPTION_ERRS); |
.. | .. |
979 | 984 | spin_lock(&sctx->stat_lock); |
980 | 985 | sctx->stat.verify_errors++; |
981 | 986 | spin_unlock(&sctx->stat_lock); |
982 | | - if (__ratelimit(&_rs)) |
| 987 | + if (__ratelimit(&rs)) |
983 | 988 | scrub_print_warning("checksum/header error", |
984 | 989 | sblock_to_check); |
985 | 990 | if (sblock_bad->generation_error) |
.. | .. |
1133 | 1138 | |
1134 | 1139 | if (scrub_write_page_to_dev_replace(sblock_other, |
1135 | 1140 | page_num) != 0) { |
1136 | | - btrfs_dev_replace_stats_inc( |
| 1141 | + atomic64_inc( |
1137 | 1142 | &fs_info->dev_replace.num_write_errors); |
1138 | 1143 | success = 0; |
1139 | 1144 | } |
.. | .. |
1211 | 1216 | } |
1212 | 1217 | |
1213 | 1218 | ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); |
| 1219 | + memalloc_nofs_restore(nofs_flag); |
1214 | 1220 | if (ret < 0) |
1215 | 1221 | return ret; |
1216 | 1222 | return 0; |
.. | .. |
1573 | 1579 | if (btrfsic_submit_bio_wait(bio)) { |
1574 | 1580 | btrfs_dev_stat_inc_and_print(page_bad->dev, |
1575 | 1581 | BTRFS_DEV_STAT_WRITE_ERRS); |
1576 | | - btrfs_dev_replace_stats_inc( |
1577 | | - &fs_info->dev_replace.num_write_errors); |
| 1582 | + atomic64_inc(&fs_info->dev_replace.num_write_errors); |
1578 | 1583 | bio_put(bio); |
1579 | 1584 | return -EIO; |
1580 | 1585 | } |
.. | .. |
1601 | 1606 | |
1602 | 1607 | ret = scrub_write_page_to_dev_replace(sblock, page_num); |
1603 | 1608 | if (ret) |
1604 | | - btrfs_dev_replace_stats_inc( |
1605 | | - &fs_info->dev_replace.num_write_errors); |
| 1609 | + atomic64_inc(&fs_info->dev_replace.num_write_errors); |
1606 | 1610 | } |
1607 | 1611 | } |
1608 | 1612 | |
.. | .. |
1612 | 1616 | struct scrub_page *spage = sblock->pagev[page_num]; |
1613 | 1617 | |
1614 | 1618 | BUG_ON(spage->page == NULL); |
1615 | | - if (spage->io_error) { |
1616 | | - void *mapped_buffer = kmap_atomic(spage->page); |
| 1619 | + if (spage->io_error) |
| 1620 | + clear_page(page_address(spage->page)); |
1617 | 1621 | |
1618 | | - clear_page(mapped_buffer); |
1619 | | - flush_dcache_page(spage->page); |
1620 | | - kunmap_atomic(mapped_buffer); |
1621 | | - } |
1622 | 1622 | return scrub_add_page_to_wr_bio(sblock->sctx, spage); |
1623 | 1623 | } |
1624 | 1624 | |
.. | .. |
1631 | 1631 | mutex_lock(&sctx->wr_lock); |
1632 | 1632 | again: |
1633 | 1633 | if (!sctx->wr_curr_bio) { |
1634 | | - unsigned int nofs_flag; |
1635 | | - |
1636 | | - /* |
1637 | | - * We must use GFP_NOFS because the scrub task might be waiting |
1638 | | - * for a worker task executing this function and in turn a |
1639 | | - * transaction commit might be waiting the scrub task to pause |
1640 | | - * (which needs to wait for all the worker tasks to complete |
1641 | | - * before pausing). |
1642 | | - */ |
1643 | | - nofs_flag = memalloc_nofs_save(); |
1644 | 1634 | sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio), |
1645 | 1635 | GFP_KERNEL); |
1646 | | - memalloc_nofs_restore(nofs_flag); |
1647 | 1636 | if (!sctx->wr_curr_bio) { |
1648 | 1637 | mutex_unlock(&sctx->wr_lock); |
1649 | 1638 | return -ENOMEM; |
.. | .. |
1726 | 1715 | sbio->status = bio->bi_status; |
1727 | 1716 | sbio->bio = bio; |
1728 | 1717 | |
1729 | | - btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, |
1730 | | - scrub_wr_bio_end_io_worker, NULL, NULL); |
| 1718 | + btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); |
1731 | 1719 | btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); |
1732 | 1720 | } |
1733 | 1721 | |
.. | .. |
1746 | 1734 | struct scrub_page *spage = sbio->pagev[i]; |
1747 | 1735 | |
1748 | 1736 | spage->io_error = 1; |
1749 | | - btrfs_dev_replace_stats_inc(&dev_replace-> |
1750 | | - num_write_errors); |
| 1737 | + atomic64_inc(&dev_replace->num_write_errors); |
1751 | 1738 | } |
1752 | 1739 | } |
1753 | 1740 | |
.. | .. |
1796 | 1783 | static int scrub_checksum_data(struct scrub_block *sblock) |
1797 | 1784 | { |
1798 | 1785 | struct scrub_ctx *sctx = sblock->sctx; |
| 1786 | + struct btrfs_fs_info *fs_info = sctx->fs_info; |
| 1787 | + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); |
1799 | 1788 | u8 csum[BTRFS_CSUM_SIZE]; |
1800 | | - u8 *on_disk_csum; |
1801 | | - struct page *page; |
1802 | | - void *buffer; |
1803 | | - u32 crc = ~(u32)0; |
1804 | | - u64 len; |
1805 | | - int index; |
| 1789 | + struct scrub_page *spage; |
| 1790 | + char *kaddr; |
1806 | 1791 | |
1807 | 1792 | BUG_ON(sblock->page_count < 1); |
1808 | | - if (!sblock->pagev[0]->have_csum) |
| 1793 | + spage = sblock->pagev[0]; |
| 1794 | + if (!spage->have_csum) |
1809 | 1795 | return 0; |
1810 | 1796 | |
1811 | | - on_disk_csum = sblock->pagev[0]->csum; |
1812 | | - page = sblock->pagev[0]->page; |
1813 | | - buffer = kmap_atomic(page); |
| 1797 | + kaddr = page_address(spage->page); |
1814 | 1798 | |
1815 | | - len = sctx->fs_info->sectorsize; |
1816 | | - index = 0; |
1817 | | - for (;;) { |
1818 | | - u64 l = min_t(u64, len, PAGE_SIZE); |
| 1799 | + shash->tfm = fs_info->csum_shash; |
| 1800 | + crypto_shash_init(shash); |
| 1801 | + crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum); |
1819 | 1802 | |
1820 | | - crc = btrfs_csum_data(buffer, crc, l); |
1821 | | - kunmap_atomic(buffer); |
1822 | | - len -= l; |
1823 | | - if (len == 0) |
1824 | | - break; |
1825 | | - index++; |
1826 | | - BUG_ON(index >= sblock->page_count); |
1827 | | - BUG_ON(!sblock->pagev[index]->page); |
1828 | | - page = sblock->pagev[index]->page; |
1829 | | - buffer = kmap_atomic(page); |
1830 | | - } |
1831 | | - |
1832 | | - btrfs_csum_final(crc, csum); |
1833 | | - if (memcmp(csum, on_disk_csum, sctx->csum_size)) |
| 1803 | + if (memcmp(csum, spage->csum, sctx->csum_size)) |
1834 | 1804 | sblock->checksum_error = 1; |
1835 | 1805 | |
1836 | 1806 | return sblock->checksum_error; |
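
scrub_checksum_data() above now goes through the generic crypto shash API instead of the removed btrfs_csum_data()/btrfs_csum_final() pair. A kernel-style sketch of the one-shot pattern it uses (the helper name is illustrative; fs_info->csum_shash is the transform allocated once at mount):

#include <crypto/hash.h>

/*
 * Sketch of the one-shot shash usage above (illustrative helper):
 * crypto_shash_digest() is init + update + final in a single call,
 * and SHASH_DESC_ON_STACK() avoids an allocation for the descriptor.
 */
static int csum_buffer(struct crypto_shash *tfm, const u8 *buf,
		       unsigned int len, u8 *out)
{
	SHASH_DESC_ON_STACK(shash, tfm);

	shash->tfm = tfm;
	return crypto_shash_digest(shash, buf, len, out);
}
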
.. | .. |
1841 | 1811 | struct scrub_ctx *sctx = sblock->sctx; |
1842 | 1812 | struct btrfs_header *h; |
1843 | 1813 | struct btrfs_fs_info *fs_info = sctx->fs_info; |
| 1814 | + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); |
1844 | 1815 | u8 calculated_csum[BTRFS_CSUM_SIZE]; |
1845 | 1816 | u8 on_disk_csum[BTRFS_CSUM_SIZE]; |
1846 | | - struct page *page; |
1847 | | - void *mapped_buffer; |
1848 | | - u64 mapped_size; |
1849 | | - void *p; |
1850 | | - u32 crc = ~(u32)0; |
1851 | | - u64 len; |
1852 | | - int index; |
| 1817 | + const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT; |
| 1818 | + int i; |
| 1819 | + struct scrub_page *spage; |
| 1820 | + char *kaddr; |
1853 | 1821 | |
1854 | 1822 | BUG_ON(sblock->page_count < 1); |
1855 | | - page = sblock->pagev[0]->page; |
1856 | | - mapped_buffer = kmap_atomic(page); |
1857 | | - h = (struct btrfs_header *)mapped_buffer; |
| 1823 | + spage = sblock->pagev[0]; |
| 1824 | + kaddr = page_address(spage->page); |
| 1825 | + h = (struct btrfs_header *)kaddr; |
1858 | 1826 | memcpy(on_disk_csum, h->csum, sctx->csum_size); |
1859 | 1827 | |
1860 | 1828 | /* |
.. | .. |
1862 | 1830 | * a) don't have an extent buffer and |
1863 | 1831 | * b) the page is already kmapped |
1864 | 1832 | */ |
1865 | | - if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h)) |
| 1833 | + if (spage->logical != btrfs_stack_header_bytenr(h)) |
1866 | 1834 | sblock->header_error = 1; |
1867 | 1835 | |
1868 | | - if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) { |
| 1836 | + if (spage->generation != btrfs_stack_header_generation(h)) { |
1869 | 1837 | sblock->header_error = 1; |
1870 | 1838 | sblock->generation_error = 1; |
1871 | 1839 | } |
1872 | 1840 | |
1873 | | - if (!scrub_check_fsid(h->fsid, sblock->pagev[0])) |
| 1841 | + if (!scrub_check_fsid(h->fsid, spage)) |
1874 | 1842 | sblock->header_error = 1; |
1875 | 1843 | |
1876 | 1844 | if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, |
1877 | 1845 | BTRFS_UUID_SIZE)) |
1878 | 1846 | sblock->header_error = 1; |
1879 | 1847 | |
1880 | | - len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE; |
1881 | | - mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; |
1882 | | - p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; |
1883 | | - index = 0; |
1884 | | - for (;;) { |
1885 | | - u64 l = min_t(u64, len, mapped_size); |
| 1848 | + shash->tfm = fs_info->csum_shash; |
| 1849 | + crypto_shash_init(shash); |
| 1850 | + crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, |
| 1851 | + PAGE_SIZE - BTRFS_CSUM_SIZE); |
1886 | 1852 | |
1887 | | - crc = btrfs_csum_data(p, crc, l); |
1888 | | - kunmap_atomic(mapped_buffer); |
1889 | | - len -= l; |
1890 | | - if (len == 0) |
1891 | | - break; |
1892 | | - index++; |
1893 | | - BUG_ON(index >= sblock->page_count); |
1894 | | - BUG_ON(!sblock->pagev[index]->page); |
1895 | | - page = sblock->pagev[index]->page; |
1896 | | - mapped_buffer = kmap_atomic(page); |
1897 | | - mapped_size = PAGE_SIZE; |
1898 | | - p = mapped_buffer; |
| 1853 | + for (i = 1; i < num_pages; i++) { |
| 1854 | + kaddr = page_address(sblock->pagev[i]->page); |
| 1855 | + crypto_shash_update(shash, kaddr, PAGE_SIZE); |
1899 | 1856 | } |
1900 | 1857 | |
1901 | | - btrfs_csum_final(crc, calculated_csum); |
| 1858 | + crypto_shash_final(shash, calculated_csum); |
1902 | 1859 | if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) |
1903 | 1860 | sblock->checksum_error = 1; |
1904 | 1861 | |
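
For tree blocks the buffer can span several pages, so the hunk above uses the incremental form instead of the one-shot digest: one crypto_shash_init(), an update per page (the first one skipping the on-disk checksum bytes), and a single crypto_shash_final(). A kernel-style sketch of that shape (illustrative helper, not from the patch):

#include <crypto/hash.h>

/*
 * Sketch of the incremental shash shape used for tree blocks above:
 * skip the stored checksum in the first chunk, then hash whole pages.
 * (Illustrative helper; error handling trimmed for brevity.)
 */
static void csum_pages(struct crypto_shash *tfm, char *first, char **rest,
		       int nrest, unsigned int csum_size, u8 *out)
{
	SHASH_DESC_ON_STACK(shash, tfm);
	int i;

	shash->tfm = tfm;
	crypto_shash_init(shash);
	crypto_shash_update(shash, first + csum_size, PAGE_SIZE - csum_size);
	for (i = 0; i < nrest; i++)
		crypto_shash_update(shash, rest[i], PAGE_SIZE);
	crypto_shash_final(shash, out);
}
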
.. | .. |
1909 | 1866 | { |
1910 | 1867 | struct btrfs_super_block *s; |
1911 | 1868 | struct scrub_ctx *sctx = sblock->sctx; |
| 1869 | + struct btrfs_fs_info *fs_info = sctx->fs_info; |
| 1870 | + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); |
1912 | 1871 | u8 calculated_csum[BTRFS_CSUM_SIZE]; |
1913 | | - u8 on_disk_csum[BTRFS_CSUM_SIZE]; |
1914 | | - struct page *page; |
1915 | | - void *mapped_buffer; |
1916 | | - u64 mapped_size; |
1917 | | - void *p; |
1918 | | - u32 crc = ~(u32)0; |
| 1872 | + struct scrub_page *spage; |
| 1873 | + char *kaddr; |
1919 | 1874 | int fail_gen = 0; |
1920 | 1875 | int fail_cor = 0; |
1921 | | - u64 len; |
1922 | | - int index; |
1923 | 1876 | |
1924 | 1877 | BUG_ON(sblock->page_count < 1); |
1925 | | - page = sblock->pagev[0]->page; |
1926 | | - mapped_buffer = kmap_atomic(page); |
1927 | | - s = (struct btrfs_super_block *)mapped_buffer; |
1928 | | - memcpy(on_disk_csum, s->csum, sctx->csum_size); |
| 1878 | + spage = sblock->pagev[0]; |
| 1879 | + kaddr = page_address(spage->page); |
| 1880 | + s = (struct btrfs_super_block *)kaddr; |
1929 | 1881 | |
1930 | | - if (sblock->pagev[0]->logical != btrfs_super_bytenr(s)) |
| 1882 | + if (spage->logical != btrfs_super_bytenr(s)) |
1931 | 1883 | ++fail_cor; |
1932 | 1884 | |
1933 | | - if (sblock->pagev[0]->generation != btrfs_super_generation(s)) |
| 1885 | + if (spage->generation != btrfs_super_generation(s)) |
1934 | 1886 | ++fail_gen; |
1935 | 1887 | |
1936 | | - if (!scrub_check_fsid(s->fsid, sblock->pagev[0])) |
| 1888 | + if (!scrub_check_fsid(s->fsid, spage)) |
1937 | 1889 | ++fail_cor; |
1938 | 1890 | |
1939 | | - len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; |
1940 | | - mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; |
1941 | | - p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; |
1942 | | - index = 0; |
1943 | | - for (;;) { |
1944 | | - u64 l = min_t(u64, len, mapped_size); |
| 1891 | + shash->tfm = fs_info->csum_shash; |
| 1892 | + crypto_shash_init(shash); |
| 1893 | + crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE, |
| 1894 | + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum); |
1945 | 1895 | |
1946 | | - crc = btrfs_csum_data(p, crc, l); |
1947 | | - kunmap_atomic(mapped_buffer); |
1948 | | - len -= l; |
1949 | | - if (len == 0) |
1950 | | - break; |
1951 | | - index++; |
1952 | | - BUG_ON(index >= sblock->page_count); |
1953 | | - BUG_ON(!sblock->pagev[index]->page); |
1954 | | - page = sblock->pagev[index]->page; |
1955 | | - mapped_buffer = kmap_atomic(page); |
1956 | | - mapped_size = PAGE_SIZE; |
1957 | | - p = mapped_buffer; |
1958 | | - } |
1959 | | - |
1960 | | - btrfs_csum_final(crc, calculated_csum); |
1961 | | - if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) |
| 1896 | + if (memcmp(calculated_csum, s->csum, sctx->csum_size)) |
1962 | 1897 | ++fail_cor; |
1963 | 1898 | |
1964 | 1899 | if (fail_cor + fail_gen) { |
.. | .. |
1971 | 1906 | ++sctx->stat.super_errors; |
1972 | 1907 | spin_unlock(&sctx->stat_lock); |
1973 | 1908 | if (fail_cor) |
1974 | | - btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, |
| 1909 | + btrfs_dev_stat_inc_and_print(spage->dev, |
1975 | 1910 | BTRFS_DEV_STAT_CORRUPTION_ERRS); |
1976 | 1911 | else |
1977 | | - btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, |
| 1912 | + btrfs_dev_stat_inc_and_print(spage->dev, |
1978 | 1913 | BTRFS_DEV_STAT_GENERATION_ERRS); |
1979 | 1914 | } |
1980 | 1915 | |
.. | .. |
2199 | 2134 | raid56_add_scrub_pages(rbio, spage->page, spage->logical); |
2200 | 2135 | } |
2201 | 2136 | |
2202 | | - btrfs_init_work(&sblock->work, btrfs_scrub_helper, |
2203 | | - scrub_missing_raid56_worker, NULL, NULL); |
| 2137 | + btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL); |
2204 | 2138 | scrub_block_get(sblock); |
2205 | 2139 | scrub_pending_bio_inc(sctx); |
2206 | 2140 | raid56_submit_missing_rbio(rbio); |
.. | .. |
2456 | 2390 | ASSERT(index < UINT_MAX); |
2457 | 2391 | |
2458 | 2392 | num_sectors = sum->len / sctx->fs_info->sectorsize; |
2459 | | - memcpy(csum, sum->sums + index, sctx->csum_size); |
| 2393 | + memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size); |
2460 | 2394 | if (index == num_sectors - 1) { |
2461 | 2395 | list_del(&sum->list); |
2462 | 2396 | kfree(sum); |
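
The one-line fix above matters because sum->sums is a flat byte array of packed checksums: without multiplying by csum_size, the old code stepped through it one byte at a time instead of one checksum at a time for index > 0. An illustrative userspace equivalent:

#include <stdint.h>
#include <string.h>

/*
 * sums is a packed array of csum_size-byte checksums (4 bytes for
 * crc32c); indexing must advance in whole checksums, not bytes.
 */
static void copy_nth_csum(uint8_t *dst, const uint8_t *sums,
			  unsigned int index, unsigned int csum_size)
{
	memcpy(dst, sums + (size_t)index * csum_size, csum_size);
}
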
.. | .. |
2668 | 2602 | u64 last_offset; |
2669 | 2603 | u32 stripe_index; |
2670 | 2604 | u32 rot; |
| 2605 | + const int data_stripes = nr_data_stripes(map); |
2671 | 2606 | |
2672 | | - last_offset = (physical - map->stripes[num].physical) * |
2673 | | - nr_data_stripes(map); |
| 2607 | + last_offset = (physical - map->stripes[num].physical) * data_stripes; |
2674 | 2608 | if (stripe_start) |
2675 | 2609 | *stripe_start = last_offset; |
2676 | 2610 | |
2677 | 2611 | *offset = last_offset; |
2678 | | - for (i = 0; i < nr_data_stripes(map); i++) { |
| 2612 | + for (i = 0; i < data_stripes; i++) { |
2679 | 2613 | *offset = last_offset + i * map->stripe_len; |
2680 | 2614 | |
2681 | 2615 | stripe_nr = div64_u64(*offset, map->stripe_len); |
2682 | | - stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); |
| 2616 | + stripe_nr = div_u64(stripe_nr, data_stripes); |
2683 | 2617 | |
2684 | 2618 | /* Work out the disk rotation on this stripe-set */ |
2685 | 2619 | stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); |
.. | .. |
2738 | 2672 | |
2739 | 2673 | bio_put(bio); |
2740 | 2674 | |
2741 | | - btrfs_init_work(&sparity->work, btrfs_scrubparity_helper, |
2742 | | - scrub_parity_bio_endio_worker, NULL, NULL); |
| 2675 | + btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL, |
| 2676 | + NULL); |
2743 | 2677 | btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work); |
2744 | 2678 | } |
2745 | 2679 | |
.. | .. |
3041 | 2975 | static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, |
3042 | 2976 | struct map_lookup *map, |
3043 | 2977 | struct btrfs_device *scrub_dev, |
3044 | | - int num, u64 base, u64 length) |
| 2978 | + int num, u64 base, u64 length, |
| 2979 | + struct btrfs_block_group *cache) |
3045 | 2980 | { |
3046 | 2981 | struct btrfs_path *path, *ppath; |
3047 | 2982 | struct btrfs_fs_info *fs_info = sctx->fs_info; |
.. | .. |
3087 | 3022 | offset = map->stripe_len * (num / map->sub_stripes); |
3088 | 3023 | increment = map->stripe_len * factor; |
3089 | 3024 | mirror_num = num % map->sub_stripes + 1; |
3090 | | - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { |
| 3025 | + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { |
3091 | 3026 | increment = map->stripe_len; |
3092 | 3027 | mirror_num = num % map->num_stripes + 1; |
3093 | 3028 | } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { |
.. | .. |
3279 | 3214 | break; |
3280 | 3215 | } |
3281 | 3216 | |
| 3217 | + /* |
| 3218 | + * If our block group was removed in the meanwhile, just |
| 3219 | + * stop scrubbing since there is no point in continuing. |
| 3220 | + * Continuing would prevent reusing its device extents |
| 3221 | + * for new block groups for a long time. |
| 3222 | + */ |
| 3223 | + spin_lock(&cache->lock); |
| 3224 | + if (cache->removed) { |
| 3225 | + spin_unlock(&cache->lock); |
| 3226 | + ret = 0; |
| 3227 | + goto out; |
| 3228 | + } |
| 3229 | + spin_unlock(&cache->lock); |
| 3230 | + |
3282 | 3231 | extent = btrfs_item_ptr(l, slot, |
3283 | 3232 | struct btrfs_extent_item); |
3284 | 3233 | flags = btrfs_extent_flags(l, extent); |
.. | .. |
3323 | 3272 | &extent_dev, |
3324 | 3273 | &extent_mirror_num); |
3325 | 3274 | |
3326 | | - ret = btrfs_lookup_csums_range(csum_root, |
3327 | | - extent_logical, |
3328 | | - extent_logical + |
3329 | | - extent_len - 1, |
3330 | | - &sctx->csum_list, 1); |
3331 | | - if (ret) |
3332 | | - goto out; |
| 3275 | + if (flags & BTRFS_EXTENT_FLAG_DATA) { |
| 3276 | + ret = btrfs_lookup_csums_range(csum_root, |
| 3277 | + extent_logical, |
| 3278 | + extent_logical + extent_len - 1, |
| 3279 | + &sctx->csum_list, 1); |
| 3280 | + if (ret) |
| 3281 | + goto out; |
| 3282 | + } |
3333 | 3283 | |
3334 | 3284 | ret = scrub_extent(sctx, map, extent_logical, extent_len, |
3335 | 3285 | extent_physical, extent_dev, flags, |
.. | .. |
3415 | 3365 | struct btrfs_device *scrub_dev, |
3416 | 3366 | u64 chunk_offset, u64 length, |
3417 | 3367 | u64 dev_offset, |
3418 | | - struct btrfs_block_group_cache *cache) |
| 3368 | + struct btrfs_block_group *cache) |
3419 | 3369 | { |
3420 | 3370 | struct btrfs_fs_info *fs_info = sctx->fs_info; |
3421 | | - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; |
| 3371 | + struct extent_map_tree *map_tree = &fs_info->mapping_tree; |
3422 | 3372 | struct map_lookup *map; |
3423 | 3373 | struct extent_map *em; |
3424 | 3374 | int i; |
3425 | 3375 | int ret = 0; |
3426 | 3376 | |
3427 | | - read_lock(&map_tree->map_tree.lock); |
3428 | | - em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); |
3429 | | - read_unlock(&map_tree->map_tree.lock); |
| 3377 | + read_lock(&map_tree->lock); |
| 3378 | + em = lookup_extent_mapping(map_tree, chunk_offset, 1); |
| 3379 | + read_unlock(&map_tree->lock); |
3430 | 3380 | |
3431 | 3381 | if (!em) { |
3432 | 3382 | /* |
.. | .. |
3452 | 3402 | if (map->stripes[i].dev->bdev == scrub_dev->bdev && |
3453 | 3403 | map->stripes[i].physical == dev_offset) { |
3454 | 3404 | ret = scrub_stripe(sctx, map, scrub_dev, i, |
3455 | | - chunk_offset, length); |
| 3405 | + chunk_offset, length, cache); |
3456 | 3406 | if (ret) |
3457 | 3407 | goto out; |
3458 | 3408 | } |
.. | .. |
3479 | 3429 | struct extent_buffer *l; |
3480 | 3430 | struct btrfs_key key; |
3481 | 3431 | struct btrfs_key found_key; |
3482 | | - struct btrfs_block_group_cache *cache; |
| 3432 | + struct btrfs_block_group *cache; |
3483 | 3433 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; |
3484 | 3434 | |
3485 | 3435 | path = btrfs_alloc_path(); |
.. | .. |
3550 | 3500 | goto skip; |
3551 | 3501 | |
3552 | 3502 | /* |
| 3503 | + * Make sure that while we are scrubbing the corresponding block |
| 3504 | + * group doesn't get its logical address and its device extents |
| 3505 | + * reused for another block group, which can possibly be of a |
| 3506 | + * different type and different profile. We do this to prevent |
| 3507 | + * false error detections and crashes due to bogus attempts to |
| 3508 | + * repair extents. |
| 3509 | + */ |
| 3510 | + spin_lock(&cache->lock); |
| 3511 | + if (cache->removed) { |
| 3512 | + spin_unlock(&cache->lock); |
| 3513 | + btrfs_put_block_group(cache); |
| 3514 | + goto skip; |
| 3515 | + } |
| 3516 | + btrfs_freeze_block_group(cache); |
| 3517 | + spin_unlock(&cache->lock); |
| 3518 | + |
| 3519 | + /* |
3553 | 3520 | * we need call btrfs_inc_block_group_ro() with scrubs_paused, |
3554 | 3521 | * to avoid deadlock caused by: |
3555 | 3522 | * btrfs_inc_block_group_ro() |
.. | .. |
3558 | 3525 | * -> btrfs_scrub_pause() |
3559 | 3526 | */ |
3560 | 3527 | scrub_pause_on(fs_info); |
3561 | | - ret = btrfs_inc_block_group_ro(cache); |
3562 | | - if (!ret && sctx->is_dev_replace) { |
3563 | | - /* |
3564 | | - * If we are doing a device replace wait for any tasks |
3565 | | - * that started dellaloc right before we set the block |
3566 | | - * group to RO mode, as they might have just allocated |
3567 | | - * an extent from it or decided they could do a nocow |
3568 | | - * write. And if any such tasks did that, wait for their |
3569 | | - * ordered extents to complete and then commit the |
3570 | | - * current transaction, so that we can later see the new |
3571 | | - * extent items in the extent tree - the ordered extents |
3572 | | - * create delayed data references (for cow writes) when |
3573 | | - * they complete, which will be run and insert the |
3574 | | - * corresponding extent items into the extent tree when |
3575 | | - * we commit the transaction they used when running |
3576 | | - * inode.c:btrfs_finish_ordered_io(). We later use |
3577 | | - * the commit root of the extent tree to find extents |
3578 | | - * to copy from the srcdev into the tgtdev, and we don't |
3579 | | - * want to miss any new extents. |
3580 | | - */ |
3581 | | - btrfs_wait_block_group_reservations(cache); |
3582 | | - btrfs_wait_nocow_writers(cache); |
3583 | | - ret = btrfs_wait_ordered_roots(fs_info, U64_MAX, |
3584 | | - cache->key.objectid, |
3585 | | - cache->key.offset); |
3586 | | - if (ret > 0) { |
3587 | | - struct btrfs_trans_handle *trans; |
3588 | 3528 | |
3589 | | - trans = btrfs_join_transaction(root); |
3590 | | - if (IS_ERR(trans)) |
3591 | | - ret = PTR_ERR(trans); |
3592 | | - else |
3593 | | - ret = btrfs_commit_transaction(trans); |
3594 | | - if (ret) { |
3595 | | - scrub_pause_off(fs_info); |
3596 | | - btrfs_put_block_group(cache); |
3597 | | - break; |
3598 | | - } |
3599 | | - } |
3600 | | - } |
3601 | | - scrub_pause_off(fs_info); |
3602 | | - |
| 3529 | + /* |
| 3530 | + * Don't do chunk preallocation for scrub. |
| 3531 | + * |
| 3532 | + * This is especially important for SYSTEM bgs, or we can hit |
| 3533 | + * -EFBIG from btrfs_finish_chunk_alloc() like: |
| 3534 | + * 1. The only SYSTEM bg is marked RO. |
| 3535 | + * Since SYSTEM bg is small, that's pretty common. |
| 3536 | + * 2. New SYSTEM bg will be allocated |
| 3537 | + * Since the regular chunk allocator will allocate a new chunk. |
| 3538 | + * 3. New SYSTEM bg is empty and will get cleaned up |
| 3539 | + * Before cleanup really happens, it's marked RO again. |
| 3540 | + * 4. Empty SYSTEM bg gets scrubbed |
| 3541 | + * We go back to 2. |
| 3542 | + * |
| 3543 | + * This can easily boost the amount of SYSTEM chunks if the cleaner |
| 3544 | + * thread can't be triggered fast enough, and use up all space |
| 3545 | + * of btrfs_super_block::sys_chunk_array |
| 3546 | + * |
| 3547 | + * While for dev replace, we need to try our best to mark block |
| 3548 | + * group RO, to prevent race between: |
| 3549 | + * - Write duplication |
| 3550 | + * Contains latest data |
| 3551 | + * - Scrub copy |
| 3552 | + * Contains data from commit tree |
| 3553 | + * |
| 3554 | + * If target block group is not marked RO, nocow writes can |
| 3555 | + * be overwritten by scrub copy, causing data corruption. |
| 3556 | + * So for dev-replace, it's not allowed to continue if a block |
| 3557 | + * group is not RO. |
| 3558 | + */ |
| 3559 | + ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); |
3603 | 3560 | if (ret == 0) { |
3604 | 3561 | ro_set = 1; |
3605 | | - } else if (ret == -ENOSPC) { |
| 3562 | + } else if (ret == -ENOSPC && !sctx->is_dev_replace && |
| 3563 | + !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) { |
3606 | 3564 | /* |
3607 | 3565 | * btrfs_inc_block_group_ro return -ENOSPC when it |
3608 | 3566 | * failed in creating new chunk for metadata. |
3609 | | - * It is not a problem for scrub/replace, because |
| 3567 | + * It is not a problem for scrub, because |
3610 | 3568 | * metadata are always cowed, and our scrub paused |
3611 | 3569 | * commit_transactions. |
| 3570 | + * |
| 3571 | + * For RAID56 chunks, we have to mark them read-only |
| 3572 | + * for scrub, as later we would use our own cache |
| 3573 | + * out of RAID56 realm. |
| 3574 | + * Thus we want the RAID56 bg to be marked RO to |
| 3575 | + * prevent RMW from screwing up our cache. |
3612 | 3576 | */ |
3613 | 3577 | ro_set = 0; |
| 3578 | + } else if (ret == -ETXTBSY) { |
| 3579 | + btrfs_warn(fs_info, |
| 3580 | + "skipping scrub of block group %llu due to active swapfile", |
| 3581 | + cache->start); |
| 3582 | + scrub_pause_off(fs_info); |
| 3583 | + ret = 0; |
| 3584 | + goto skip_unfreeze; |
3614 | 3585 | } else { |
3615 | 3586 | btrfs_warn(fs_info, |
3616 | 3587 | "failed setting block group ro: %d", ret); |
| 3588 | + btrfs_unfreeze_block_group(cache); |
3617 | 3589 | btrfs_put_block_group(cache); |
| 3590 | + scrub_pause_off(fs_info); |
3618 | 3591 | break; |
3619 | 3592 | } |
3620 | 3593 | |
3621 | | - btrfs_dev_replace_write_lock(&fs_info->dev_replace); |
| 3594 | + /* |
| 3595 | + * Now the target block is marked RO, wait for nocow writes to |
| 3596 | + * finish before dev-replace. |
| 3597 | + * COW is fine, as COW never overwrites extents in commit tree. |
| 3598 | + */ |
| 3599 | + if (sctx->is_dev_replace) { |
| 3600 | + btrfs_wait_nocow_writers(cache); |
| 3601 | + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, |
| 3602 | + cache->length); |
| 3603 | + } |
| 3604 | + |
| 3605 | + scrub_pause_off(fs_info); |
| 3606 | + down_write(&dev_replace->rwsem); |
3622 | 3607 | dev_replace->cursor_right = found_key.offset + length; |
3623 | 3608 | dev_replace->cursor_left = found_key.offset; |
3624 | 3609 | dev_replace->item_needs_writeback = 1; |
3625 | | - btrfs_dev_replace_write_unlock(&fs_info->dev_replace); |
| 3610 | + up_write(&dev_replace->rwsem); |
| 3611 | + |
3626 | 3612 | ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, |
3627 | 3613 | found_key.offset, cache); |
3628 | 3614 | |
.. | .. |
3658 | 3644 | |
3659 | 3645 | scrub_pause_off(fs_info); |
3660 | 3646 | |
3661 | | - btrfs_dev_replace_write_lock(&fs_info->dev_replace); |
| 3647 | + down_write(&dev_replace->rwsem); |
3662 | 3648 | dev_replace->cursor_left = dev_replace->cursor_right; |
3663 | 3649 | dev_replace->item_needs_writeback = 1; |
3664 | | - btrfs_dev_replace_write_unlock(&fs_info->dev_replace); |
| 3650 | + up_write(&dev_replace->rwsem); |
3665 | 3651 | |
3666 | 3652 | if (ro_set) |
3667 | 3653 | btrfs_dec_block_group_ro(cache); |
.. | .. |
3675 | 3661 | */ |
3676 | 3662 | spin_lock(&cache->lock); |
3677 | 3663 | if (!cache->removed && !cache->ro && cache->reserved == 0 && |
3678 | | - btrfs_block_group_used(&cache->item) == 0) { |
| 3664 | + cache->used == 0) { |
3679 | 3665 | spin_unlock(&cache->lock); |
3680 | | - btrfs_mark_bg_unused(cache); |
| 3666 | + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) |
| 3667 | + btrfs_discard_queue_work(&fs_info->discard_ctl, |
| 3668 | + cache); |
| 3669 | + else |
| 3670 | + btrfs_mark_bg_unused(cache); |
3681 | 3671 | } else { |
3682 | 3672 | spin_unlock(&cache->lock); |
3683 | 3673 | } |
3684 | | - |
| 3674 | +skip_unfreeze: |
| 3675 | + btrfs_unfreeze_block_group(cache); |
3685 | 3676 | btrfs_put_block_group(cache); |
3686 | 3677 | if (ret) |
3687 | 3678 | break; |
.. | .. |
3714 | 3705 | struct btrfs_fs_info *fs_info = sctx->fs_info; |
3715 | 3706 | |
3716 | 3707 | if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) |
3717 | | - return -EIO; |
| 3708 | + return -EROFS; |
3718 | 3709 | |
3719 | 3710 | /* Seed devices of a new filesystem have their own generation. */ |
3720 | 3711 | if (scrub_dev->fs_devices != fs_info->fs_devices) |
.. | .. |
3739 | 3730 | return 0; |
3740 | 3731 | } |
3741 | 3732 | |
| 3733 | +static void scrub_workers_put(struct btrfs_fs_info *fs_info) |
| 3734 | +{ |
| 3735 | + if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, |
| 3736 | + &fs_info->scrub_lock)) { |
| 3737 | + struct btrfs_workqueue *scrub_workers = NULL; |
| 3738 | + struct btrfs_workqueue *scrub_wr_comp = NULL; |
| 3739 | + struct btrfs_workqueue *scrub_parity = NULL; |
| 3740 | + |
| 3741 | + scrub_workers = fs_info->scrub_workers; |
| 3742 | + scrub_wr_comp = fs_info->scrub_wr_completion_workers; |
| 3743 | + scrub_parity = fs_info->scrub_parity_workers; |
| 3744 | + |
| 3745 | + fs_info->scrub_workers = NULL; |
| 3746 | + fs_info->scrub_wr_completion_workers = NULL; |
| 3747 | + fs_info->scrub_parity_workers = NULL; |
| 3748 | + mutex_unlock(&fs_info->scrub_lock); |
| 3749 | + |
| 3750 | + btrfs_destroy_workqueue(scrub_workers); |
| 3751 | + btrfs_destroy_workqueue(scrub_wr_comp); |
| 3752 | + btrfs_destroy_workqueue(scrub_parity); |
| 3753 | + } |
| 3754 | +} |
| 3755 | + |
3742 | 3756 | /* |
3743 | 3757 | * get a reference count on fs_info->scrub_workers. start worker if necessary |
3744 | 3758 | */ |
3745 | 3759 | static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, |
3746 | 3760 | int is_dev_replace) |
3747 | 3761 | { |
| 3762 | + struct btrfs_workqueue *scrub_workers = NULL; |
| 3763 | + struct btrfs_workqueue *scrub_wr_comp = NULL; |
| 3764 | + struct btrfs_workqueue *scrub_parity = NULL; |
3748 | 3765 | unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; |
3749 | 3766 | int max_active = fs_info->thread_pool_size; |
| 3767 | + int ret = -ENOMEM; |
3750 | 3768 | |
3751 | | - if (fs_info->scrub_workers_refcnt == 0) { |
3752 | | - fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", |
3753 | | - flags, is_dev_replace ? 1 : max_active, 4); |
3754 | | - if (!fs_info->scrub_workers) |
3755 | | - goto fail_scrub_workers; |
| 3769 | + if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) |
| 3770 | + return 0; |
3756 | 3771 | |
3757 | | - fs_info->scrub_wr_completion_workers = |
3758 | | - btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, |
| 3772 | + scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags, |
| 3773 | + is_dev_replace ? 1 : max_active, 4); |
| 3774 | + if (!scrub_workers) |
| 3775 | + goto fail_scrub_workers; |
| 3776 | + |
| 3777 | + scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, |
3759 | 3778 | max_active, 2); |
3760 | | - if (!fs_info->scrub_wr_completion_workers) |
3761 | | - goto fail_scrub_wr_completion_workers; |
| 3779 | + if (!scrub_wr_comp) |
| 3780 | + goto fail_scrub_wr_completion_workers; |
3762 | 3781 | |
3763 | | - fs_info->scrub_parity_workers = |
3764 | | - btrfs_alloc_workqueue(fs_info, "scrubparity", flags, |
3765 | | - max_active, 2); |
3766 | | - if (!fs_info->scrub_parity_workers) |
3767 | | - goto fail_scrub_parity_workers; |
| 3782 | + scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags, |
| 3783 | + max_active, 2); |
| 3784 | + if (!scrub_parity) |
| 3785 | + goto fail_scrub_parity_workers; |
| 3786 | + |
| 3787 | + mutex_lock(&fs_info->scrub_lock); |
| 3788 | + if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { |
| 3789 | + ASSERT(fs_info->scrub_workers == NULL && |
| 3790 | + fs_info->scrub_wr_completion_workers == NULL && |
| 3791 | + fs_info->scrub_parity_workers == NULL); |
| 3792 | + fs_info->scrub_workers = scrub_workers; |
| 3793 | + fs_info->scrub_wr_completion_workers = scrub_wr_comp; |
| 3794 | + fs_info->scrub_parity_workers = scrub_parity; |
| 3795 | + refcount_set(&fs_info->scrub_workers_refcnt, 1); |
| 3796 | + mutex_unlock(&fs_info->scrub_lock); |
| 3797 | + return 0; |
3768 | 3798 | } |
3769 | | - ++fs_info->scrub_workers_refcnt; |
3770 | | - return 0; |
| 3799 | + /* Other thread raced in and created the workers for us */ |
| 3800 | + refcount_inc(&fs_info->scrub_workers_refcnt); |
| 3801 | + mutex_unlock(&fs_info->scrub_lock); |
3771 | 3802 | |
| 3803 | + ret = 0; |
| 3804 | + btrfs_destroy_workqueue(scrub_parity); |
3772 | 3805 | fail_scrub_parity_workers: |
3773 | | - btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); |
| 3806 | + btrfs_destroy_workqueue(scrub_wr_comp); |
3774 | 3807 | fail_scrub_wr_completion_workers: |
3775 | | - btrfs_destroy_workqueue(fs_info->scrub_workers); |
| 3808 | + btrfs_destroy_workqueue(scrub_workers); |
3776 | 3809 | fail_scrub_workers: |
3777 | | - return -ENOMEM; |
| 3810 | + return ret; |
3778 | 3811 | } |
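
The reworked scrub_workers_get() above turns fs_info->scrub_workers_refcnt into a refcount_t: the fast path takes a reference with refcount_inc_not_zero() without the lock, the slow path allocates the workqueues outside scrub_lock and re-checks under it, and the loser of a race simply destroys its spare copies. A userspace sketch of the same shape (pthreads/stdatomic stand-ins, not the kernel API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int refcnt;	/* 0 means "no workers exist" */
static void *workers;

static int workers_get(void)
{
	int old = atomic_load(&refcnt);
	void *new;

	/* Fast path, like refcount_inc_not_zero(): only bump a live count */
	while (old != 0)
		if (atomic_compare_exchange_weak(&refcnt, &old, old + 1))
			return 0;

	new = malloc(64);	/* stand-in for the three workqueues */
	if (!new)
		return -1;

	pthread_mutex_lock(&lock);
	if (atomic_load(&refcnt) == 0) {
		workers = new;		/* we won: publish under the lock */
		atomic_store(&refcnt, 1);
		pthread_mutex_unlock(&lock);
		return 0;
	}
	atomic_fetch_add(&refcnt, 1);	/* another thread created them */
	pthread_mutex_unlock(&lock);
	free(new);			/* drop our unused copy */
	return 0;
}
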
3779 | 3812 | |
3780 | 3813 | int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, |
.. | .. |
3785 | 3818 | int ret; |
3786 | 3819 | struct btrfs_device *dev; |
3787 | 3820 | unsigned int nofs_flag; |
3788 | | - struct btrfs_workqueue *scrub_workers = NULL; |
3789 | | - struct btrfs_workqueue *scrub_wr_comp = NULL; |
3790 | | - struct btrfs_workqueue *scrub_parity = NULL; |
| 3821 | + bool need_commit = false; |
3791 | 3822 | |
3792 | 3823 | if (btrfs_fs_closing(fs_info)) |
3793 | | - return -EINVAL; |
| 3824 | + return -EAGAIN; |
3794 | 3825 | |
3795 | 3826 | if (fs_info->nodesize > BTRFS_STRIPE_LEN) { |
3796 | 3827 | /* |
.. | .. |
3834 | 3865 | if (IS_ERR(sctx)) |
3835 | 3866 | return PTR_ERR(sctx); |
3836 | 3867 | |
| 3868 | + ret = scrub_workers_get(fs_info, is_dev_replace); |
| 3869 | + if (ret) |
| 3870 | + goto out_free_ctx; |
| 3871 | + |
3837 | 3872 | mutex_lock(&fs_info->fs_devices->device_list_mutex); |
3838 | 3873 | dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); |
3839 | 3874 | if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && |
3840 | 3875 | !is_dev_replace)) { |
3841 | 3876 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); |
3842 | 3877 | ret = -ENODEV; |
3843 | | - goto out_free_ctx; |
| 3878 | + goto out; |
3844 | 3879 | } |
3845 | 3880 | |
3846 | 3881 | if (!is_dev_replace && !readonly && |
3847 | 3882 | !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { |
3848 | 3883 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); |
3849 | | - btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", |
3850 | | - rcu_str_deref(dev->name)); |
| 3884 | + btrfs_err_in_rcu(fs_info, |
| 3885 | + "scrub on devid %llu: filesystem on %s is not writable", |
| 3886 | + devid, rcu_str_deref(dev->name)); |
3851 | 3887 | ret = -EROFS; |
3852 | | - goto out_free_ctx; |
| 3888 | + goto out; |
3853 | 3889 | } |
3854 | 3890 | |
3855 | 3891 | mutex_lock(&fs_info->scrub_lock); |
.. | .. |
3858 | 3894 | mutex_unlock(&fs_info->scrub_lock); |
3859 | 3895 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); |
3860 | 3896 | ret = -EIO; |
3861 | | - goto out_free_ctx; |
| 3897 | + goto out; |
3862 | 3898 | } |
3863 | 3899 | |
3864 | | - btrfs_dev_replace_read_lock(&fs_info->dev_replace); |
| 3900 | + down_read(&fs_info->dev_replace.rwsem); |
3865 | 3901 | if (dev->scrub_ctx || |
3866 | 3902 | (!is_dev_replace && |
3867 | 3903 | btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { |
3868 | | - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); |
| 3904 | + up_read(&fs_info->dev_replace.rwsem); |
3869 | 3905 | mutex_unlock(&fs_info->scrub_lock); |
3870 | 3906 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); |
3871 | 3907 | ret = -EINPROGRESS; |
3872 | | - goto out_free_ctx; |
| 3908 | + goto out; |
3873 | 3909 | } |
3874 | | - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); |
3875 | | - |
3876 | | - ret = scrub_workers_get(fs_info, is_dev_replace); |
3877 | | - if (ret) { |
3878 | | - mutex_unlock(&fs_info->scrub_lock); |
3879 | | - mutex_unlock(&fs_info->fs_devices->device_list_mutex); |
3880 | | - goto out_free_ctx; |
3881 | | - } |
| 3910 | + up_read(&fs_info->dev_replace.rwsem); |
3882 | 3911 | |
3883 | 3912 | sctx->readonly = readonly; |
3884 | 3913 | dev->scrub_ctx = sctx; |
.. | .. |
3903 | 3932 | */ |
3904 | 3933 | nofs_flag = memalloc_nofs_save(); |
3905 | 3934 | if (!is_dev_replace) { |
| 3935 | + u64 old_super_errors; |
| 3936 | + |
| 3937 | + spin_lock(&sctx->stat_lock); |
| 3938 | + old_super_errors = sctx->stat.super_errors; |
| 3939 | + spin_unlock(&sctx->stat_lock); |
| 3940 | + |
| 3941 | + btrfs_info(fs_info, "scrub: started on devid %llu", devid); |
3906 | 3942 | /* |
3907 | 3943 | * by holding device list mutex, we can |
3908 | 3944 | * kick off writing super in log tree sync. |
.. | .. |
3910 | 3946 | mutex_lock(&fs_info->fs_devices->device_list_mutex); |
3911 | 3947 | ret = scrub_supers(sctx, dev); |
3912 | 3948 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); |
| 3949 | + |
| 3950 | + spin_lock(&sctx->stat_lock); |
| 3951 | + /* |
| 3952 | + * Super block errors found, but we cannot commit a transaction |
| 3953 | + * in the current context, since btrfs_commit_transaction() needs |
| 3954 | + * to pause the currently running scrub (held by ourselves). |
| 3955 | + */ |
| 3956 | + if (sctx->stat.super_errors > old_super_errors && !sctx->readonly) |
| 3957 | + need_commit = true; |
| 3958 | + spin_unlock(&sctx->stat_lock); |
3913 | 3959 | } |
3914 | 3960 | |
3915 | 3961 | if (!ret) |
.. | .. |
3925 | 3971 | if (progress) |
3926 | 3972 | memcpy(progress, &sctx->stat, sizeof(*progress)); |
3927 | 3973 | |
| 3974 | + if (!is_dev_replace) |
| 3975 | + btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d", |
| 3976 | + ret ? "not finished" : "finished", devid, ret); |
| 3977 | + |
3928 | 3978 | mutex_lock(&fs_info->scrub_lock); |
3929 | 3979 | dev->scrub_ctx = NULL; |
3930 | | - if (--fs_info->scrub_workers_refcnt == 0) { |
3931 | | - scrub_workers = fs_info->scrub_workers; |
3932 | | - scrub_wr_comp = fs_info->scrub_wr_completion_workers; |
3933 | | - scrub_parity = fs_info->scrub_parity_workers; |
3934 | | - } |
3935 | 3980 | mutex_unlock(&fs_info->scrub_lock); |
3936 | 3981 | |
3937 | | - btrfs_destroy_workqueue(scrub_workers); |
3938 | | - btrfs_destroy_workqueue(scrub_wr_comp); |
3939 | | - btrfs_destroy_workqueue(scrub_parity); |
| 3982 | + scrub_workers_put(fs_info); |
3940 | 3983 | scrub_put_ctx(sctx); |
3941 | 3984 | |
3942 | | - return ret; |
| 3985 | + /* |
| 3986 | + * We found some super block errors before, now try to force a |
| 3987 | + * transaction commit, as scrub has finished. |
| 3988 | + */ |
| 3989 | + if (need_commit) { |
| 3990 | + struct btrfs_trans_handle *trans; |
3943 | 3991 | |
| 3992 | + trans = btrfs_start_transaction(fs_info->tree_root, 0); |
| 3993 | + if (IS_ERR(trans)) { |
| 3994 | + ret = PTR_ERR(trans); |
| 3995 | + btrfs_err(fs_info, |
| 3996 | + "scrub: failed to start transaction to fix super block errors: %d", ret); |
| 3997 | + return ret; |
| 3998 | + } |
| 3999 | + ret = btrfs_commit_transaction(trans); |
| 4000 | + if (ret < 0) |
| 4001 | + btrfs_err(fs_info, |
| 4002 | + "scrub: failed to commit transaction to fix super block errors: %d", ret); |
| 4003 | + } |
| 4004 | + return ret; |
| 4005 | +out: |
| 4006 | + scrub_workers_put(fs_info); |
3944 | 4007 | out_free_ctx: |
3945 | 4008 | scrub_free_ctx(sctx); |
3946 | 4009 | |
.. | .. |
3989 | 4052 | return 0; |
3990 | 4053 | } |
3991 | 4054 | |
3992 | | -int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, |
3993 | | - struct btrfs_device *dev) |
| 4055 | +int btrfs_scrub_cancel_dev(struct btrfs_device *dev) |
3994 | 4056 | { |
| 4057 | + struct btrfs_fs_info *fs_info = dev->fs_info; |
3995 | 4058 | struct scrub_ctx *sctx; |
3996 | 4059 | |
3997 | 4060 | mutex_lock(&fs_info->scrub_lock); |
---|