hc
2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/fs/btrfs/scrub.c
....@@ -6,7 +6,9 @@
66 #include <linux/blkdev.h>
77 #include <linux/ratelimit.h>
88 #include <linux/sched/mm.h>
9
+#include <crypto/hash.h>
910 #include "ctree.h"
11
+#include "discard.h"
1012 #include "volumes.h"
1113 #include "disk-io.h"
1214 #include "ordered-data.h"
....@@ -17,6 +19,7 @@
1719 #include "check-integrity.h"
1820 #include "rcu-string.h"
1921 #include "raid56.h"
22
+#include "block-group.h"
2023
2124 /*
2225 * This is only the first step towards a full-features scrub. It reads all
....@@ -146,7 +149,7 @@
146149 */
147150 unsigned long *ebitmap;
148151
149
- unsigned long bitmap[0];
152
+ unsigned long bitmap[];
150153 };
151154
152155 struct scrub_ctx {
....@@ -322,7 +325,6 @@
322325 struct rb_node *parent = NULL;
323326 struct full_stripe_lock *entry;
324327 struct full_stripe_lock *ret;
325
- unsigned int nofs_flag;
326328
327329 lockdep_assert_held(&locks_root->lock);
328330
....@@ -342,15 +344,8 @@
342344
343345 /*
344346 * Insert new lock.
345
- *
346
- * We must use GFP_NOFS because the scrub task might be waiting for a
347
- * worker task executing this function and in turn a transaction commit
348
- * might be waiting the scrub task to pause (which needs to wait for all
349
- * the worker tasks to complete before pausing).
350347 */
351
- nofs_flag = memalloc_nofs_save();
352348 ret = kmalloc(sizeof(*ret), GFP_KERNEL);
353
- memalloc_nofs_restore(nofs_flag);
354349 if (!ret)
355350 return ERR_PTR(-ENOMEM);
356351 ret->logical = fstripe_logical;
....@@ -395,8 +390,7 @@
395390 *
396391 * Caller must ensure @cache is a RAID56 block group.
397392 */
398
-static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
399
- u64 bytenr)
393
+static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
400394 {
401395 u64 ret;
402396
....@@ -410,8 +404,8 @@
410404 * round_down() can only handle power of 2, while RAID56 full
411405 * stripe length can be 64KiB * n, so we need to manually round down.
412406 */
413
- ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
414
- cache->full_stripe_len + cache->key.objectid;
407
+ ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
408
+ cache->full_stripe_len + cache->start;
415409 return ret;
416410 }
417411
....@@ -429,7 +423,7 @@
429423 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
430424 bool *locked_ret)
431425 {
432
- struct btrfs_block_group_cache *bg_cache;
426
+ struct btrfs_block_group *bg_cache;
433427 struct btrfs_full_stripe_locks_tree *locks_root;
434428 struct full_stripe_lock *existing;
435429 u64 fstripe_start;
....@@ -476,7 +470,7 @@
476470 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
477471 bool locked)
478472 {
479
- struct btrfs_block_group_cache *bg_cache;
473
+ struct btrfs_block_group *bg_cache;
480474 struct btrfs_full_stripe_locks_tree *locks_root;
481475 struct full_stripe_lock *fstripe_lock;
482476 u64 fstripe_start;
....@@ -604,8 +598,8 @@
604598 sbio->index = i;
605599 sbio->sctx = sctx;
606600 sbio->page_count = 0;
607
- btrfs_init_work(&sbio->work, btrfs_scrub_helper,
608
- scrub_bio_end_io_worker, NULL, NULL);
601
+ btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
602
+ NULL);
609603
610604 if (i != SCRUB_BIOS_PER_SCTX - 1)
611605 sctx->bios[i]->next_free = i + 1;
....@@ -653,13 +647,9 @@
653647 struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
654648 struct inode_fs_paths *ipath = NULL;
655649 struct btrfs_root *local_root;
656
- struct btrfs_key root_key;
657650 struct btrfs_key key;
658651
659
- root_key.objectid = root;
660
- root_key.type = BTRFS_ROOT_ITEM_KEY;
661
- root_key.offset = (u64)-1;
662
- local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
652
+ local_root = btrfs_get_fs_root(fs_info, root, true);
663653 if (IS_ERR(local_root)) {
664654 ret = PTR_ERR(local_root);
665655 goto err;
....@@ -674,6 +664,7 @@
674664
675665 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
676666 if (ret) {
667
+ btrfs_put_root(local_root);
677668 btrfs_release_path(swarn->path);
678669 goto err;
679670 }
....@@ -694,6 +685,7 @@
694685 ipath = init_ipath(4096, local_root, swarn->path);
695686 memalloc_nofs_restore(nofs_flag);
696687 if (IS_ERR(ipath)) {
688
+ btrfs_put_root(local_root);
697689 ret = PTR_ERR(ipath);
698690 ipath = NULL;
699691 goto err;
....@@ -717,6 +709,7 @@
717709 min(isize - offset, (u64)PAGE_SIZE), nlink,
718710 (char *)(unsigned long)ipath->fspath->val[i]);
719711
712
+ btrfs_put_root(local_root);
720713 free_ipath(ipath);
721714 return 0;
722715
....@@ -841,7 +834,8 @@
841834 int page_num;
842835 int success;
843836 bool full_stripe_locked;
844
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
837
+ unsigned int nofs_flag;
838
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
845839 DEFAULT_RATELIMIT_BURST);
846840
847841 BUG_ON(sblock_to_check->page_count < 1);
....@@ -866,6 +860,16 @@
866860 dev = sblock_to_check->pagev[0]->dev;
867861
868862 /*
863
+ * We must use GFP_NOFS because the scrub task might be waiting for a
864
+ * worker task executing this function and in turn a transaction commit
865
+ * might be waiting the scrub task to pause (which needs to wait for all
866
+ * the worker tasks to complete before pausing).
867
+ * We do allocations in the workers through insert_full_stripe_lock()
868
+ * and scrub_add_page_to_wr_bio(), which happens down the call chain of
869
+ * this function.
870
+ */
871
+ nofs_flag = memalloc_nofs_save();
872
+ /*
869873 * For RAID5/6, race can happen for a different device scrub thread.
870874 * For data corruption, Parity and Data threads will both try
871875 * to recover the data.
....@@ -874,6 +878,7 @@
874878 */
875879 ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
876880 if (ret < 0) {
881
+ memalloc_nofs_restore(nofs_flag);
877882 spin_lock(&sctx->stat_lock);
878883 if (ret == -ENOMEM)
879884 sctx->stat.malloc_errors++;
....@@ -913,7 +918,7 @@
913918 */
914919
915920 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
916
- sizeof(*sblocks_for_recheck), GFP_NOFS);
921
+ sizeof(*sblocks_for_recheck), GFP_KERNEL);
917922 if (!sblocks_for_recheck) {
918923 spin_lock(&sctx->stat_lock);
919924 sctx->stat.malloc_errors++;
....@@ -964,14 +969,14 @@
964969 spin_lock(&sctx->stat_lock);
965970 sctx->stat.read_errors++;
966971 spin_unlock(&sctx->stat_lock);
967
- if (__ratelimit(&_rs))
972
+ if (__ratelimit(&rs))
968973 scrub_print_warning("i/o error", sblock_to_check);
969974 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
970975 } else if (sblock_bad->checksum_error) {
971976 spin_lock(&sctx->stat_lock);
972977 sctx->stat.csum_errors++;
973978 spin_unlock(&sctx->stat_lock);
974
- if (__ratelimit(&_rs))
979
+ if (__ratelimit(&rs))
975980 scrub_print_warning("checksum error", sblock_to_check);
976981 btrfs_dev_stat_inc_and_print(dev,
977982 BTRFS_DEV_STAT_CORRUPTION_ERRS);
....@@ -979,7 +984,7 @@
979984 spin_lock(&sctx->stat_lock);
980985 sctx->stat.verify_errors++;
981986 spin_unlock(&sctx->stat_lock);
982
- if (__ratelimit(&_rs))
987
+ if (__ratelimit(&rs))
983988 scrub_print_warning("checksum/header error",
984989 sblock_to_check);
985990 if (sblock_bad->generation_error)
....@@ -1133,7 +1138,7 @@
11331138
11341139 if (scrub_write_page_to_dev_replace(sblock_other,
11351140 page_num) != 0) {
1136
- btrfs_dev_replace_stats_inc(
1141
+ atomic64_inc(
11371142 &fs_info->dev_replace.num_write_errors);
11381143 success = 0;
11391144 }
....@@ -1211,6 +1216,7 @@
12111216 }
12121217
12131218 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1219
+ memalloc_nofs_restore(nofs_flag);
12141220 if (ret < 0)
12151221 return ret;
12161222 return 0;
....@@ -1573,8 +1579,7 @@
15731579 if (btrfsic_submit_bio_wait(bio)) {
15741580 btrfs_dev_stat_inc_and_print(page_bad->dev,
15751581 BTRFS_DEV_STAT_WRITE_ERRS);
1576
- btrfs_dev_replace_stats_inc(
1577
- &fs_info->dev_replace.num_write_errors);
1582
+ atomic64_inc(&fs_info->dev_replace.num_write_errors);
15781583 bio_put(bio);
15791584 return -EIO;
15801585 }
....@@ -1601,8 +1606,7 @@
16011606
16021607 ret = scrub_write_page_to_dev_replace(sblock, page_num);
16031608 if (ret)
1604
- btrfs_dev_replace_stats_inc(
1605
- &fs_info->dev_replace.num_write_errors);
1609
+ atomic64_inc(&fs_info->dev_replace.num_write_errors);
16061610 }
16071611 }
16081612
....@@ -1612,13 +1616,9 @@
16121616 struct scrub_page *spage = sblock->pagev[page_num];
16131617
16141618 BUG_ON(spage->page == NULL);
1615
- if (spage->io_error) {
1616
- void *mapped_buffer = kmap_atomic(spage->page);
1619
+ if (spage->io_error)
1620
+ clear_page(page_address(spage->page));
16171621
1618
- clear_page(mapped_buffer);
1619
- flush_dcache_page(spage->page);
1620
- kunmap_atomic(mapped_buffer);
1621
- }
16221622 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
16231623 }
16241624
....@@ -1631,19 +1631,8 @@
16311631 mutex_lock(&sctx->wr_lock);
16321632 again:
16331633 if (!sctx->wr_curr_bio) {
1634
- unsigned int nofs_flag;
1635
-
1636
- /*
1637
- * We must use GFP_NOFS because the scrub task might be waiting
1638
- * for a worker task executing this function and in turn a
1639
- * transaction commit might be waiting the scrub task to pause
1640
- * (which needs to wait for all the worker tasks to complete
1641
- * before pausing).
1642
- */
1643
- nofs_flag = memalloc_nofs_save();
16441634 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
16451635 GFP_KERNEL);
1646
- memalloc_nofs_restore(nofs_flag);
16471636 if (!sctx->wr_curr_bio) {
16481637 mutex_unlock(&sctx->wr_lock);
16491638 return -ENOMEM;
....@@ -1726,8 +1715,7 @@
17261715 sbio->status = bio->bi_status;
17271716 sbio->bio = bio;
17281717
1729
- btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1730
- scrub_wr_bio_end_io_worker, NULL, NULL);
1718
+ btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
17311719 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
17321720 }
17331721
....@@ -1746,8 +1734,7 @@
17461734 struct scrub_page *spage = sbio->pagev[i];
17471735
17481736 spage->io_error = 1;
1749
- btrfs_dev_replace_stats_inc(&dev_replace->
1750
- num_write_errors);
1737
+ atomic64_inc(&dev_replace->num_write_errors);
17511738 }
17521739 }
17531740
....@@ -1796,41 +1783,24 @@
17961783 static int scrub_checksum_data(struct scrub_block *sblock)
17971784 {
17981785 struct scrub_ctx *sctx = sblock->sctx;
1786
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
1787
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
17991788 u8 csum[BTRFS_CSUM_SIZE];
1800
- u8 *on_disk_csum;
1801
- struct page *page;
1802
- void *buffer;
1803
- u32 crc = ~(u32)0;
1804
- u64 len;
1805
- int index;
1789
+ struct scrub_page *spage;
1790
+ char *kaddr;
18061791
18071792 BUG_ON(sblock->page_count < 1);
1808
- if (!sblock->pagev[0]->have_csum)
1793
+ spage = sblock->pagev[0];
1794
+ if (!spage->have_csum)
18091795 return 0;
18101796
1811
- on_disk_csum = sblock->pagev[0]->csum;
1812
- page = sblock->pagev[0]->page;
1813
- buffer = kmap_atomic(page);
1797
+ kaddr = page_address(spage->page);
18141798
1815
- len = sctx->fs_info->sectorsize;
1816
- index = 0;
1817
- for (;;) {
1818
- u64 l = min_t(u64, len, PAGE_SIZE);
1799
+ shash->tfm = fs_info->csum_shash;
1800
+ crypto_shash_init(shash);
1801
+ crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
18191802
1820
- crc = btrfs_csum_data(buffer, crc, l);
1821
- kunmap_atomic(buffer);
1822
- len -= l;
1823
- if (len == 0)
1824
- break;
1825
- index++;
1826
- BUG_ON(index >= sblock->page_count);
1827
- BUG_ON(!sblock->pagev[index]->page);
1828
- page = sblock->pagev[index]->page;
1829
- buffer = kmap_atomic(page);
1830
- }
1831
-
1832
- btrfs_csum_final(crc, csum);
1833
- if (memcmp(csum, on_disk_csum, sctx->csum_size))
1803
+ if (memcmp(csum, spage->csum, sctx->csum_size))
18341804 sblock->checksum_error = 1;
18351805
18361806 return sblock->checksum_error;
....@@ -1841,20 +1811,18 @@
18411811 struct scrub_ctx *sctx = sblock->sctx;
18421812 struct btrfs_header *h;
18431813 struct btrfs_fs_info *fs_info = sctx->fs_info;
1814
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
18441815 u8 calculated_csum[BTRFS_CSUM_SIZE];
18451816 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1846
- struct page *page;
1847
- void *mapped_buffer;
1848
- u64 mapped_size;
1849
- void *p;
1850
- u32 crc = ~(u32)0;
1851
- u64 len;
1852
- int index;
1817
+ const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT;
1818
+ int i;
1819
+ struct scrub_page *spage;
1820
+ char *kaddr;
18531821
18541822 BUG_ON(sblock->page_count < 1);
1855
- page = sblock->pagev[0]->page;
1856
- mapped_buffer = kmap_atomic(page);
1857
- h = (struct btrfs_header *)mapped_buffer;
1823
+ spage = sblock->pagev[0];
1824
+ kaddr = page_address(spage->page);
1825
+ h = (struct btrfs_header *)kaddr;
18581826 memcpy(on_disk_csum, h->csum, sctx->csum_size);
18591827
18601828 /*
....@@ -1862,43 +1830,32 @@
18621830 * a) don't have an extent buffer and
18631831 * b) the page is already kmapped
18641832 */
1865
- if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1833
+ if (spage->logical != btrfs_stack_header_bytenr(h))
18661834 sblock->header_error = 1;
18671835
1868
- if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
1836
+ if (spage->generation != btrfs_stack_header_generation(h)) {
18691837 sblock->header_error = 1;
18701838 sblock->generation_error = 1;
18711839 }
18721840
1873
- if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1841
+ if (!scrub_check_fsid(h->fsid, spage))
18741842 sblock->header_error = 1;
18751843
18761844 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
18771845 BTRFS_UUID_SIZE))
18781846 sblock->header_error = 1;
18791847
1880
- len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
1881
- mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1882
- p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1883
- index = 0;
1884
- for (;;) {
1885
- u64 l = min_t(u64, len, mapped_size);
1848
+ shash->tfm = fs_info->csum_shash;
1849
+ crypto_shash_init(shash);
1850
+ crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1851
+ PAGE_SIZE - BTRFS_CSUM_SIZE);
18861852
1887
- crc = btrfs_csum_data(p, crc, l);
1888
- kunmap_atomic(mapped_buffer);
1889
- len -= l;
1890
- if (len == 0)
1891
- break;
1892
- index++;
1893
- BUG_ON(index >= sblock->page_count);
1894
- BUG_ON(!sblock->pagev[index]->page);
1895
- page = sblock->pagev[index]->page;
1896
- mapped_buffer = kmap_atomic(page);
1897
- mapped_size = PAGE_SIZE;
1898
- p = mapped_buffer;
1853
+ for (i = 1; i < num_pages; i++) {
1854
+ kaddr = page_address(sblock->pagev[i]->page);
1855
+ crypto_shash_update(shash, kaddr, PAGE_SIZE);
18991856 }
19001857
1901
- btrfs_csum_final(crc, calculated_csum);
1858
+ crypto_shash_final(shash, calculated_csum);
19021859 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
19031860 sblock->checksum_error = 1;
19041861
....@@ -1909,56 +1866,34 @@
19091866 {
19101867 struct btrfs_super_block *s;
19111868 struct scrub_ctx *sctx = sblock->sctx;
1869
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
1870
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
19121871 u8 calculated_csum[BTRFS_CSUM_SIZE];
1913
- u8 on_disk_csum[BTRFS_CSUM_SIZE];
1914
- struct page *page;
1915
- void *mapped_buffer;
1916
- u64 mapped_size;
1917
- void *p;
1918
- u32 crc = ~(u32)0;
1872
+ struct scrub_page *spage;
1873
+ char *kaddr;
19191874 int fail_gen = 0;
19201875 int fail_cor = 0;
1921
- u64 len;
1922
- int index;
19231876
19241877 BUG_ON(sblock->page_count < 1);
1925
- page = sblock->pagev[0]->page;
1926
- mapped_buffer = kmap_atomic(page);
1927
- s = (struct btrfs_super_block *)mapped_buffer;
1928
- memcpy(on_disk_csum, s->csum, sctx->csum_size);
1878
+ spage = sblock->pagev[0];
1879
+ kaddr = page_address(spage->page);
1880
+ s = (struct btrfs_super_block *)kaddr;
19291881
1930
- if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
1882
+ if (spage->logical != btrfs_super_bytenr(s))
19311883 ++fail_cor;
19321884
1933
- if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1885
+ if (spage->generation != btrfs_super_generation(s))
19341886 ++fail_gen;
19351887
1936
- if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
1888
+ if (!scrub_check_fsid(s->fsid, spage))
19371889 ++fail_cor;
19381890
1939
- len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1940
- mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1941
- p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1942
- index = 0;
1943
- for (;;) {
1944
- u64 l = min_t(u64, len, mapped_size);
1891
+ shash->tfm = fs_info->csum_shash;
1892
+ crypto_shash_init(shash);
1893
+ crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1894
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
19451895
1946
- crc = btrfs_csum_data(p, crc, l);
1947
- kunmap_atomic(mapped_buffer);
1948
- len -= l;
1949
- if (len == 0)
1950
- break;
1951
- index++;
1952
- BUG_ON(index >= sblock->page_count);
1953
- BUG_ON(!sblock->pagev[index]->page);
1954
- page = sblock->pagev[index]->page;
1955
- mapped_buffer = kmap_atomic(page);
1956
- mapped_size = PAGE_SIZE;
1957
- p = mapped_buffer;
1958
- }
1959
-
1960
- btrfs_csum_final(crc, calculated_csum);
1961
- if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1896
+ if (memcmp(calculated_csum, s->csum, sctx->csum_size))
19621897 ++fail_cor;
19631898
19641899 if (fail_cor + fail_gen) {
....@@ -1971,10 +1906,10 @@
19711906 ++sctx->stat.super_errors;
19721907 spin_unlock(&sctx->stat_lock);
19731908 if (fail_cor)
1974
- btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1909
+ btrfs_dev_stat_inc_and_print(spage->dev,
19751910 BTRFS_DEV_STAT_CORRUPTION_ERRS);
19761911 else
1977
- btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1912
+ btrfs_dev_stat_inc_and_print(spage->dev,
19781913 BTRFS_DEV_STAT_GENERATION_ERRS);
19791914 }
19801915
....@@ -2199,8 +2134,7 @@
21992134 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
22002135 }
22012136
2202
- btrfs_init_work(&sblock->work, btrfs_scrub_helper,
2203
- scrub_missing_raid56_worker, NULL, NULL);
2137
+ btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
22042138 scrub_block_get(sblock);
22052139 scrub_pending_bio_inc(sctx);
22062140 raid56_submit_missing_rbio(rbio);
....@@ -2456,7 +2390,7 @@
24562390 ASSERT(index < UINT_MAX);
24572391
24582392 num_sectors = sum->len / sctx->fs_info->sectorsize;
2459
- memcpy(csum, sum->sums + index, sctx->csum_size);
2393
+ memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size);
24602394 if (index == num_sectors - 1) {
24612395 list_del(&sum->list);
24622396 kfree(sum);
....@@ -2668,18 +2602,18 @@
26682602 u64 last_offset;
26692603 u32 stripe_index;
26702604 u32 rot;
2605
+ const int data_stripes = nr_data_stripes(map);
26712606
2672
- last_offset = (physical - map->stripes[num].physical) *
2673
- nr_data_stripes(map);
2607
+ last_offset = (physical - map->stripes[num].physical) * data_stripes;
26742608 if (stripe_start)
26752609 *stripe_start = last_offset;
26762610
26772611 *offset = last_offset;
2678
- for (i = 0; i < nr_data_stripes(map); i++) {
2612
+ for (i = 0; i < data_stripes; i++) {
26792613 *offset = last_offset + i * map->stripe_len;
26802614
26812615 stripe_nr = div64_u64(*offset, map->stripe_len);
2682
- stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
2616
+ stripe_nr = div_u64(stripe_nr, data_stripes);
26832617
26842618 /* Work out the disk rotation on this stripe-set */
26852619 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
....@@ -2738,8 +2672,8 @@
27382672
27392673 bio_put(bio);
27402674
2741
- btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
2742
- scrub_parity_bio_endio_worker, NULL, NULL);
2675
+ btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
2676
+ NULL);
27432677 btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
27442678 }
27452679
....@@ -3041,7 +2975,8 @@
30412975 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
30422976 struct map_lookup *map,
30432977 struct btrfs_device *scrub_dev,
3044
- int num, u64 base, u64 length)
2978
+ int num, u64 base, u64 length,
2979
+ struct btrfs_block_group *cache)
30452980 {
30462981 struct btrfs_path *path, *ppath;
30472982 struct btrfs_fs_info *fs_info = sctx->fs_info;
....@@ -3087,7 +3022,7 @@
30873022 offset = map->stripe_len * (num / map->sub_stripes);
30883023 increment = map->stripe_len * factor;
30893024 mirror_num = num % map->sub_stripes + 1;
3090
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3025
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
30913026 increment = map->stripe_len;
30923027 mirror_num = num % map->num_stripes + 1;
30933028 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
....@@ -3279,6 +3214,20 @@
32793214 break;
32803215 }
32813216
3217
+ /*
3218
+ * If our block group was removed in the meanwhile, just
3219
+ * stop scrubbing since there is no point in continuing.
3220
+ * Continuing would prevent reusing its device extents
3221
+ * for new block groups for a long time.
3222
+ */
3223
+ spin_lock(&cache->lock);
3224
+ if (cache->removed) {
3225
+ spin_unlock(&cache->lock);
3226
+ ret = 0;
3227
+ goto out;
3228
+ }
3229
+ spin_unlock(&cache->lock);
3230
+
32823231 extent = btrfs_item_ptr(l, slot,
32833232 struct btrfs_extent_item);
32843233 flags = btrfs_extent_flags(l, extent);
....@@ -3323,13 +3272,14 @@
33233272 &extent_dev,
33243273 &extent_mirror_num);
33253274
3326
- ret = btrfs_lookup_csums_range(csum_root,
3327
- extent_logical,
3328
- extent_logical +
3329
- extent_len - 1,
3330
- &sctx->csum_list, 1);
3331
- if (ret)
3332
- goto out;
3275
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
3276
+ ret = btrfs_lookup_csums_range(csum_root,
3277
+ extent_logical,
3278
+ extent_logical + extent_len - 1,
3279
+ &sctx->csum_list, 1);
3280
+ if (ret)
3281
+ goto out;
3282
+ }
33333283
33343284 ret = scrub_extent(sctx, map, extent_logical, extent_len,
33353285 extent_physical, extent_dev, flags,
....@@ -3415,18 +3365,18 @@
34153365 struct btrfs_device *scrub_dev,
34163366 u64 chunk_offset, u64 length,
34173367 u64 dev_offset,
3418
- struct btrfs_block_group_cache *cache)
3368
+ struct btrfs_block_group *cache)
34193369 {
34203370 struct btrfs_fs_info *fs_info = sctx->fs_info;
3421
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3371
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
34223372 struct map_lookup *map;
34233373 struct extent_map *em;
34243374 int i;
34253375 int ret = 0;
34263376
3427
- read_lock(&map_tree->map_tree.lock);
3428
- em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3429
- read_unlock(&map_tree->map_tree.lock);
3377
+ read_lock(&map_tree->lock);
3378
+ em = lookup_extent_mapping(map_tree, chunk_offset, 1);
3379
+ read_unlock(&map_tree->lock);
34303380
34313381 if (!em) {
34323382 /*
....@@ -3452,7 +3402,7 @@
34523402 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
34533403 map->stripes[i].physical == dev_offset) {
34543404 ret = scrub_stripe(sctx, map, scrub_dev, i,
3455
- chunk_offset, length);
3405
+ chunk_offset, length, cache);
34563406 if (ret)
34573407 goto out;
34583408 }
....@@ -3479,7 +3429,7 @@
34793429 struct extent_buffer *l;
34803430 struct btrfs_key key;
34813431 struct btrfs_key found_key;
3482
- struct btrfs_block_group_cache *cache;
3432
+ struct btrfs_block_group *cache;
34833433 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
34843434
34853435 path = btrfs_alloc_path();
....@@ -3550,6 +3500,23 @@
35503500 goto skip;
35513501
35523502 /*
3503
+ * Make sure that while we are scrubbing the corresponding block
3504
+ * group doesn't get its logical address and its device extents
3505
+ * reused for another block group, which can possibly be of a
3506
+ * different type and different profile. We do this to prevent
3507
+ * false error detections and crashes due to bogus attempts to
3508
+ * repair extents.
3509
+ */
3510
+ spin_lock(&cache->lock);
3511
+ if (cache->removed) {
3512
+ spin_unlock(&cache->lock);
3513
+ btrfs_put_block_group(cache);
3514
+ goto skip;
3515
+ }
3516
+ btrfs_freeze_block_group(cache);
3517
+ spin_unlock(&cache->lock);
3518
+
3519
+ /*
35533520 * we need call btrfs_inc_block_group_ro() with scrubs_paused,
35543521 * to avoid deadlock caused by:
35553522 * btrfs_inc_block_group_ro()
....@@ -3558,71 +3525,90 @@
35583525 * -> btrfs_scrub_pause()
35593526 */
35603527 scrub_pause_on(fs_info);
3561
- ret = btrfs_inc_block_group_ro(cache);
3562
- if (!ret && sctx->is_dev_replace) {
3563
- /*
3564
- * If we are doing a device replace wait for any tasks
3565
- * that started dellaloc right before we set the block
3566
- * group to RO mode, as they might have just allocated
3567
- * an extent from it or decided they could do a nocow
3568
- * write. And if any such tasks did that, wait for their
3569
- * ordered extents to complete and then commit the
3570
- * current transaction, so that we can later see the new
3571
- * extent items in the extent tree - the ordered extents
3572
- * create delayed data references (for cow writes) when
3573
- * they complete, which will be run and insert the
3574
- * corresponding extent items into the extent tree when
3575
- * we commit the transaction they used when running
3576
- * inode.c:btrfs_finish_ordered_io(). We later use
3577
- * the commit root of the extent tree to find extents
3578
- * to copy from the srcdev into the tgtdev, and we don't
3579
- * want to miss any new extents.
3580
- */
3581
- btrfs_wait_block_group_reservations(cache);
3582
- btrfs_wait_nocow_writers(cache);
3583
- ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
3584
- cache->key.objectid,
3585
- cache->key.offset);
3586
- if (ret > 0) {
3587
- struct btrfs_trans_handle *trans;
35883528
3589
- trans = btrfs_join_transaction(root);
3590
- if (IS_ERR(trans))
3591
- ret = PTR_ERR(trans);
3592
- else
3593
- ret = btrfs_commit_transaction(trans);
3594
- if (ret) {
3595
- scrub_pause_off(fs_info);
3596
- btrfs_put_block_group(cache);
3597
- break;
3598
- }
3599
- }
3600
- }
3601
- scrub_pause_off(fs_info);
3602
-
3529
+ /*
3530
+ * Don't do chunk preallocation for scrub.
3531
+ *
3532
+ * This is especially important for SYSTEM bgs, or we can hit
3533
+ * -EFBIG from btrfs_finish_chunk_alloc() like:
3534
+ * 1. The only SYSTEM bg is marked RO.
3535
+ * Since SYSTEM bg is small, that's pretty common.
3536
+ * 2. New SYSTEM bg will be allocated
3537
+ * Due to regular version will allocate new chunk.
3538
+ * 3. New SYSTEM bg is empty and will get cleaned up
3539
+ * Before cleanup really happens, it's marked RO again.
3540
+ * 4. Empty SYSTEM bg gets scrubbed
3541
+ * We go back to 2.
3542
+ *
3543
+ * This can easily boost the amount of SYSTEM chunks if cleaner
3544
+ * thread can't be triggered fast enough, and use up all space
3545
+ * of btrfs_super_block::sys_chunk_array
3546
+ *
3547
+ * While for dev replace, we need to try our best to mark block
3548
+ * group RO, to prevent race between:
3549
+ * - Write duplication
3550
+ * Contains latest data
3551
+ * - Scrub copy
3552
+ * Contains data from commit tree
3553
+ *
3554
+ * If target block group is not marked RO, nocow writes can
3555
+ * be overwritten by scrub copy, causing data corruption.
3556
+ * So for dev-replace, it's not allowed to continue if a block
3557
+ * group is not RO.
3558
+ */
3559
+ ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
36033560 if (ret == 0) {
36043561 ro_set = 1;
3605
- } else if (ret == -ENOSPC) {
3562
+ } else if (ret == -ENOSPC && !sctx->is_dev_replace &&
3563
+ !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
36063564 /*
36073565 * btrfs_inc_block_group_ro return -ENOSPC when it
36083566 * failed in creating new chunk for metadata.
3609
- * It is not a problem for scrub/replace, because
3567
+ * It is not a problem for scrub, because
36103568 * metadata are always cowed, and our scrub paused
36113569 * commit_transactions.
3570
+ *
3571
+ * For RAID56 chunks, we have to mark them read-only
3572
+ * for scrub, as later we would use our own cache
3573
+ * out of RAID56 realm.
3574
+ * Thus we want the RAID56 bg to be marked RO to
3575
+ * prevent RMW from screwing up our cache.
36123576 */
36133577 ro_set = 0;
3578
+ } else if (ret == -ETXTBSY) {
3579
+ btrfs_warn(fs_info,
3580
+ "skipping scrub of block group %llu due to active swapfile",
3581
+ cache->start);
3582
+ scrub_pause_off(fs_info);
3583
+ ret = 0;
3584
+ goto skip_unfreeze;
36143585 } else {
36153586 btrfs_warn(fs_info,
36163587 "failed setting block group ro: %d", ret);
3588
+ btrfs_unfreeze_block_group(cache);
36173589 btrfs_put_block_group(cache);
3590
+ scrub_pause_off(fs_info);
36183591 break;
36193592 }
36203593
3621
- btrfs_dev_replace_write_lock(&fs_info->dev_replace);
3594
+ /*
3595
+ * Now the target block is marked RO, wait for nocow writes to
3596
+ * finish before dev-replace.
3597
+ * COW is fine, as COW never overwrites extents in commit tree.
3598
+ */
3599
+ if (sctx->is_dev_replace) {
3600
+ btrfs_wait_nocow_writers(cache);
3601
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3602
+ cache->length);
3603
+ }
3604
+
3605
+ scrub_pause_off(fs_info);
3606
+ down_write(&dev_replace->rwsem);
36223607 dev_replace->cursor_right = found_key.offset + length;
36233608 dev_replace->cursor_left = found_key.offset;
36243609 dev_replace->item_needs_writeback = 1;
3625
- btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
3610
+ up_write(&dev_replace->rwsem);
3611
+
36263612 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
36273613 found_key.offset, cache);
36283614
....@@ -3658,10 +3644,10 @@
36583644
36593645 scrub_pause_off(fs_info);
36603646
3661
- btrfs_dev_replace_write_lock(&fs_info->dev_replace);
3647
+ down_write(&dev_replace->rwsem);
36623648 dev_replace->cursor_left = dev_replace->cursor_right;
36633649 dev_replace->item_needs_writeback = 1;
3664
- btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
3650
+ up_write(&dev_replace->rwsem);
36653651
36663652 if (ro_set)
36673653 btrfs_dec_block_group_ro(cache);
....@@ -3675,13 +3661,18 @@
36753661 */
36763662 spin_lock(&cache->lock);
36773663 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3678
- btrfs_block_group_used(&cache->item) == 0) {
3664
+ cache->used == 0) {
36793665 spin_unlock(&cache->lock);
3680
- btrfs_mark_bg_unused(cache);
3666
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3667
+ btrfs_discard_queue_work(&fs_info->discard_ctl,
3668
+ cache);
3669
+ else
3670
+ btrfs_mark_bg_unused(cache);
36813671 } else {
36823672 spin_unlock(&cache->lock);
36833673 }
3684
-
3674
+skip_unfreeze:
3675
+ btrfs_unfreeze_block_group(cache);
36853676 btrfs_put_block_group(cache);
36863677 if (ret)
36873678 break;
....@@ -3714,7 +3705,7 @@
37143705 struct btrfs_fs_info *fs_info = sctx->fs_info;
37153706
37163707 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3717
- return -EIO;
3708
+ return -EROFS;
37183709
37193710 /* Seed devices of a new filesystem have their own generation. */
37203711 if (scrub_dev->fs_devices != fs_info->fs_devices)
....@@ -3739,42 +3730,84 @@
37393730 return 0;
37403731 }
37413732
3733
+static void scrub_workers_put(struct btrfs_fs_info *fs_info)
3734
+{
3735
+ if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
3736
+ &fs_info->scrub_lock)) {
3737
+ struct btrfs_workqueue *scrub_workers = NULL;
3738
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
3739
+ struct btrfs_workqueue *scrub_parity = NULL;
3740
+
3741
+ scrub_workers = fs_info->scrub_workers;
3742
+ scrub_wr_comp = fs_info->scrub_wr_completion_workers;
3743
+ scrub_parity = fs_info->scrub_parity_workers;
3744
+
3745
+ fs_info->scrub_workers = NULL;
3746
+ fs_info->scrub_wr_completion_workers = NULL;
3747
+ fs_info->scrub_parity_workers = NULL;
3748
+ mutex_unlock(&fs_info->scrub_lock);
3749
+
3750
+ btrfs_destroy_workqueue(scrub_workers);
3751
+ btrfs_destroy_workqueue(scrub_wr_comp);
3752
+ btrfs_destroy_workqueue(scrub_parity);
3753
+ }
3754
+}
3755
+
37423756 /*
37433757 * Get a reference count on fs_info->scrub_workers. Start the workers if necessary.
37443758 */
37453759 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
37463760 int is_dev_replace)
37473761 {
3762
+ struct btrfs_workqueue *scrub_workers = NULL;
3763
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
3764
+ struct btrfs_workqueue *scrub_parity = NULL;
37483765 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
37493766 int max_active = fs_info->thread_pool_size;
3767
+ int ret = -ENOMEM;
37503768
3751
- if (fs_info->scrub_workers_refcnt == 0) {
3752
- fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
3753
- flags, is_dev_replace ? 1 : max_active, 4);
3754
- if (!fs_info->scrub_workers)
3755
- goto fail_scrub_workers;
3769
+ if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
3770
+ return 0;
37563771
3757
- fs_info->scrub_wr_completion_workers =
3758
- btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
3772
+ scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
3773
+ is_dev_replace ? 1 : max_active, 4);
3774
+ if (!scrub_workers)
3775
+ goto fail_scrub_workers;
3776
+
3777
+ scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
37593778 max_active, 2);
3760
- if (!fs_info->scrub_wr_completion_workers)
3761
- goto fail_scrub_wr_completion_workers;
3779
+ if (!scrub_wr_comp)
3780
+ goto fail_scrub_wr_completion_workers;
37623781
3763
- fs_info->scrub_parity_workers =
3764
- btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
3765
- max_active, 2);
3766
- if (!fs_info->scrub_parity_workers)
3767
- goto fail_scrub_parity_workers;
3782
+ scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
3783
+ max_active, 2);
3784
+ if (!scrub_parity)
3785
+ goto fail_scrub_parity_workers;
3786
+
3787
+ mutex_lock(&fs_info->scrub_lock);
3788
+ if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
3789
+ ASSERT(fs_info->scrub_workers == NULL &&
3790
+ fs_info->scrub_wr_completion_workers == NULL &&
3791
+ fs_info->scrub_parity_workers == NULL);
3792
+ fs_info->scrub_workers = scrub_workers;
3793
+ fs_info->scrub_wr_completion_workers = scrub_wr_comp;
3794
+ fs_info->scrub_parity_workers = scrub_parity;
3795
+ refcount_set(&fs_info->scrub_workers_refcnt, 1);
3796
+ mutex_unlock(&fs_info->scrub_lock);
3797
+ return 0;
37683798 }
3769
- ++fs_info->scrub_workers_refcnt;
3770
- return 0;
3799
+ /* Other thread raced in and created the workers for us */
3800
+ refcount_inc(&fs_info->scrub_workers_refcnt);
3801
+ mutex_unlock(&fs_info->scrub_lock);
37713802
3803
+ ret = 0;
3804
+ btrfs_destroy_workqueue(scrub_parity);
37723805 fail_scrub_parity_workers:
3773
- btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
3806
+ btrfs_destroy_workqueue(scrub_wr_comp);
37743807 fail_scrub_wr_completion_workers:
3775
- btrfs_destroy_workqueue(fs_info->scrub_workers);
3808
+ btrfs_destroy_workqueue(scrub_workers);
37763809 fail_scrub_workers:
3777
- return -ENOMEM;
3810
+ return ret;
37783811 }
37793812
37803813 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
....@@ -3785,12 +3818,10 @@
37853818 int ret;
37863819 struct btrfs_device *dev;
37873820 unsigned int nofs_flag;
3788
- struct btrfs_workqueue *scrub_workers = NULL;
3789
- struct btrfs_workqueue *scrub_wr_comp = NULL;
3790
- struct btrfs_workqueue *scrub_parity = NULL;
3821
+ bool need_commit = false;
37913822
37923823 if (btrfs_fs_closing(fs_info))
3793
- return -EINVAL;
3824
+ return -EAGAIN;
37943825
37953826 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
37963827 /*
....@@ -3834,22 +3865,27 @@
38343865 if (IS_ERR(sctx))
38353866 return PTR_ERR(sctx);
38363867
3868
+ ret = scrub_workers_get(fs_info, is_dev_replace);
3869
+ if (ret)
3870
+ goto out_free_ctx;
3871
+
38373872 mutex_lock(&fs_info->fs_devices->device_list_mutex);
38383873 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
38393874 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
38403875 !is_dev_replace)) {
38413876 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
38423877 ret = -ENODEV;
3843
- goto out_free_ctx;
3878
+ goto out;
38443879 }
38453880
38463881 if (!is_dev_replace && !readonly &&
38473882 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
38483883 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3849
- btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
3850
- rcu_str_deref(dev->name));
3884
+ btrfs_err_in_rcu(fs_info,
3885
+ "scrub on devid %llu: filesystem on %s is not writable",
3886
+ devid, rcu_str_deref(dev->name));
38513887 ret = -EROFS;
3852
- goto out_free_ctx;
3888
+ goto out;
38533889 }
38543890
38553891 mutex_lock(&fs_info->scrub_lock);
....@@ -3858,27 +3894,20 @@
38583894 mutex_unlock(&fs_info->scrub_lock);
38593895 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
38603896 ret = -EIO;
3861
- goto out_free_ctx;
3897
+ goto out;
38623898 }
38633899
3864
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
3900
+ down_read(&fs_info->dev_replace.rwsem);
38653901 if (dev->scrub_ctx ||
38663902 (!is_dev_replace &&
38673903 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3868
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
3904
+ up_read(&fs_info->dev_replace.rwsem);
38693905 mutex_unlock(&fs_info->scrub_lock);
38703906 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
38713907 ret = -EINPROGRESS;
3872
- goto out_free_ctx;
3908
+ goto out;
38733909 }
3874
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
3875
-
3876
- ret = scrub_workers_get(fs_info, is_dev_replace);
3877
- if (ret) {
3878
- mutex_unlock(&fs_info->scrub_lock);
3879
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3880
- goto out_free_ctx;
3881
- }
3910
+ up_read(&fs_info->dev_replace.rwsem);
38823911
38833912 sctx->readonly = readonly;
38843913 dev->scrub_ctx = sctx;
....@@ -3903,6 +3932,13 @@
39033932 */
39043933 nofs_flag = memalloc_nofs_save();
39053934 if (!is_dev_replace) {
3935
+ u64 old_super_errors;
3936
+
3937
+ spin_lock(&sctx->stat_lock);
3938
+ old_super_errors = sctx->stat.super_errors;
3939
+ spin_unlock(&sctx->stat_lock);
3940
+
3941
+ btrfs_info(fs_info, "scrub: started on devid %llu", devid);
39063942 /*
39073943 * by holding device list mutex, we can
39083944 * kick off writing super in log tree sync.
....@@ -3910,6 +3946,16 @@
39103946 mutex_lock(&fs_info->fs_devices->device_list_mutex);
39113947 ret = scrub_supers(sctx, dev);
39123948 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3949
+
3950
+ spin_lock(&sctx->stat_lock);
3951
+ /*
3952
+ * Super block errors were found, but we cannot commit a transaction
3953
+ * in the current context, since btrfs_commit_transaction() needs
3954
+ * to pause the currently running scrub (held by ourselves).
3955
+ */
3956
+ if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
3957
+ need_commit = true;
3958
+ spin_unlock(&sctx->stat_lock);
39133959 }
39143960
39153961 if (!ret)
....@@ -3925,22 +3971,39 @@
39253971 if (progress)
39263972 memcpy(progress, &sctx->stat, sizeof(*progress));
39273973
3974
+ if (!is_dev_replace)
3975
+ btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
3976
+ ret ? "not finished" : "finished", devid, ret);
3977
+
39283978 mutex_lock(&fs_info->scrub_lock);
39293979 dev->scrub_ctx = NULL;
3930
- if (--fs_info->scrub_workers_refcnt == 0) {
3931
- scrub_workers = fs_info->scrub_workers;
3932
- scrub_wr_comp = fs_info->scrub_wr_completion_workers;
3933
- scrub_parity = fs_info->scrub_parity_workers;
3934
- }
39353980 mutex_unlock(&fs_info->scrub_lock);
39363981
3937
- btrfs_destroy_workqueue(scrub_workers);
3938
- btrfs_destroy_workqueue(scrub_wr_comp);
3939
- btrfs_destroy_workqueue(scrub_parity);
3982
+ scrub_workers_put(fs_info);
39403983 scrub_put_ctx(sctx);
39413984
3942
- return ret;
3985
+ /*
3986
+ * We found some super block errors before, now try to force a
3987
+ * transaction commit, as scrub has finished.
3988
+ */
3989
+ if (need_commit) {
3990
+ struct btrfs_trans_handle *trans;
39433991
3992
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
3993
+ if (IS_ERR(trans)) {
3994
+ ret = PTR_ERR(trans);
3995
+ btrfs_err(fs_info,
3996
+ "scrub: failed to start transaction to fix super block errors: %d", ret);
3997
+ return ret;
3998
+ }
3999
+ ret = btrfs_commit_transaction(trans);
4000
+ if (ret < 0)
4001
+ btrfs_err(fs_info,
4002
+ "scrub: failed to commit transaction to fix super block errors: %d", ret);
4003
+ }
4004
+ return ret;
4005
+out:
4006
+ scrub_workers_put(fs_info);
39444007 out_free_ctx:
39454008 scrub_free_ctx(sctx);
39464009
....@@ -3989,9 +4052,9 @@
39894052 return 0;
39904053 }
39914054
3992
-int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
3993
- struct btrfs_device *dev)
4055
+int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
39944056 {
4057
+ struct btrfs_fs_info *fs_info = dev->fs_info;
39954058 struct scrub_ctx *sctx;
39964059
39974060 mutex_lock(&fs_info->scrub_lock);