~hc/RK356X_SDK_RELEASE.git

..	..	@@ -6,7 +6,9 @@
6	6	#include <linux/blkdev.h>
7	7	#include <linux/ratelimit.h>
8	8	#include <linux/sched/mm.h>
	9	+#include <crypto/hash.h>
9	10	#include "ctree.h"
	11	+#include "discard.h"
10	12	#include "volumes.h"
11	13	#include "disk-io.h"
12	14	#include "ordered-data.h"
..	..	@@ -17,6 +19,7 @@
17	19	#include "check-integrity.h"
18	20	#include "rcu-string.h"
19	21	#include "raid56.h"
	22	+#include "block-group.h"
20	23
21	24	/*
22	25	* This is only the first step towards a full-features scrub. It reads all
..	..	@@ -146,7 +149,7 @@
146	149	*/
147	150	unsigned long *ebitmap;
148	151
149		- unsigned long bitmap[0];
	152	+ unsigned long bitmap[];
150	153	};
151	154
152	155	struct scrub_ctx {
..	..	@@ -322,7 +325,6 @@
322	325	struct rb_node *parent = NULL;
323	326	struct full_stripe_lock *entry;
324	327	struct full_stripe_lock *ret;
325		- unsigned int nofs_flag;
326	328
327	329	lockdep_assert_held(&locks_root->lock);
328	330
..	..	@@ -342,15 +344,8 @@
342	344
343	345	/*
344	346	* Insert new lock.
345		- *
346		- * We must use GFP_NOFS because the scrub task might be waiting for a
347		- * worker task executing this function and in turn a transaction commit
348		- * might be waiting the scrub task to pause (which needs to wait for all
349		- * the worker tasks to complete before pausing).
350	347	*/
351		- nofs_flag = memalloc_nofs_save();
352	348	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
353		- memalloc_nofs_restore(nofs_flag);
354	349	if (!ret)
355	350	return ERR_PTR(-ENOMEM);
356	351	ret->logical = fstripe_logical;
..	..	@@ -395,8 +390,7 @@
395	390	*
396	391	* Caller must ensure @cache is a RAID56 block group.
397	392	*/
398		-static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
399		- u64 bytenr)
	393	+static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
400	394	{
401	395	u64 ret;
402	396
..	..	@@ -410,8 +404,8 @@
410	404	* round_down() can only handle power of 2, while RAID56 full
411	405	* stripe length can be 64KiB * n, so we need to manually round down.
412	406	*/
413		- ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
414		- cache->full_stripe_len + cache->key.objectid;
	407	+ ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
	408	+ cache->full_stripe_len + cache->start;
415	409	return ret;
416	410	}
417	411
..	..	@@ -429,7 +423,7 @@
429	423	static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
430	424	bool *locked_ret)
431	425	{
432		- struct btrfs_block_group_cache *bg_cache;
	426	+ struct btrfs_block_group *bg_cache;
433	427	struct btrfs_full_stripe_locks_tree *locks_root;
434	428	struct full_stripe_lock *existing;
435	429	u64 fstripe_start;
..	..	@@ -476,7 +470,7 @@
476	470	static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
477	471	bool locked)
478	472	{
479		- struct btrfs_block_group_cache *bg_cache;
	473	+ struct btrfs_block_group *bg_cache;
480	474	struct btrfs_full_stripe_locks_tree *locks_root;
481	475	struct full_stripe_lock *fstripe_lock;
482	476	u64 fstripe_start;
..	..	@@ -604,8 +598,8 @@
604	598	sbio->index = i;
605	599	sbio->sctx = sctx;
606	600	sbio->page_count = 0;
607		- btrfs_init_work(&sbio->work, btrfs_scrub_helper,
608		- scrub_bio_end_io_worker, NULL, NULL);
	601	+ btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
	602	+ NULL);
609	603
610	604	if (i != SCRUB_BIOS_PER_SCTX - 1)
611	605	sctx->bios[i]->next_free = i + 1;
..	..	@@ -653,13 +647,9 @@
653	647	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
654	648	struct inode_fs_paths *ipath = NULL;
655	649	struct btrfs_root *local_root;
656		- struct btrfs_key root_key;
657	650	struct btrfs_key key;
658	651
659		- root_key.objectid = root;
660		- root_key.type = BTRFS_ROOT_ITEM_KEY;
661		- root_key.offset = (u64)-1;
662		- local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	652	+ local_root = btrfs_get_fs_root(fs_info, root, true);
663	653	if (IS_ERR(local_root)) {
664	654	ret = PTR_ERR(local_root);
665	655	goto err;
..	..	@@ -674,6 +664,7 @@
674	664
675	665	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
676	666	if (ret) {
	667	+ btrfs_put_root(local_root);
677	668	btrfs_release_path(swarn->path);
678	669	goto err;
679	670	}
..	..	@@ -694,6 +685,7 @@
694	685	ipath = init_ipath(4096, local_root, swarn->path);
695	686	memalloc_nofs_restore(nofs_flag);
696	687	if (IS_ERR(ipath)) {
	688	+ btrfs_put_root(local_root);
697	689	ret = PTR_ERR(ipath);
698	690	ipath = NULL;
699	691	goto err;
..	..	@@ -717,6 +709,7 @@
717	709	min(isize - offset, (u64)PAGE_SIZE), nlink,
718	710	(char *)(unsigned long)ipath->fspath->val[i]);
719	711
	712	+ btrfs_put_root(local_root);
720	713	free_ipath(ipath);
721	714	return 0;
722	715
..	..	@@ -841,7 +834,8 @@
841	834	int page_num;
842	835	int success;
843	836	bool full_stripe_locked;
844		- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
	837	+ unsigned int nofs_flag;
	838	+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
845	839	DEFAULT_RATELIMIT_BURST);
846	840
847	841	BUG_ON(sblock_to_check->page_count < 1);
..	..	@@ -866,6 +860,16 @@
866	860	dev = sblock_to_check->pagev[0]->dev;
867	861
868	862	/*
	863	+ * We must use GFP_NOFS because the scrub task might be waiting for a
	864	+ * worker task executing this function and in turn a transaction commit
	865	+ * might be waiting for the scrub task to pause (which needs to wait for all
	866	+ * the worker tasks to complete before pausing).
	867	+ * We do allocations in the workers through insert_full_stripe_lock()
	868	+ * and scrub_add_page_to_wr_bio(), which happens down the call chain of
	869	+ * this function.
	870	+ */
	871	+ nofs_flag = memalloc_nofs_save();
	872	+ /*
869	873	* For RAID5/6, race can happen for a different device scrub thread.
870	874	* For data corruption, Parity and Data threads will both try
871	875	* to recover the data.
..	..	@@ -874,6 +878,7 @@
874	878	*/
875	879	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
876	880	if (ret < 0) {
	881	+ memalloc_nofs_restore(nofs_flag);
877	882	spin_lock(&sctx->stat_lock);
878	883	if (ret == -ENOMEM)
879	884	sctx->stat.malloc_errors++;
..	..	@@ -913,7 +918,7 @@
913	918	*/
914	919
915	920	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
916		- sizeof(*sblocks_for_recheck), GFP_NOFS);
	921	+ sizeof(*sblocks_for_recheck), GFP_KERNEL);
917	922	if (!sblocks_for_recheck) {
918	923	spin_lock(&sctx->stat_lock);
919	924	sctx->stat.malloc_errors++;
..	..	@@ -964,14 +969,14 @@
964	969	spin_lock(&sctx->stat_lock);
965	970	sctx->stat.read_errors++;
966	971	spin_unlock(&sctx->stat_lock);
967		- if (__ratelimit(&_rs))
	972	+ if (__ratelimit(&rs))
968	973	scrub_print_warning("i/o error", sblock_to_check);
969	974	btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
970	975	} else if (sblock_bad->checksum_error) {
971	976	spin_lock(&sctx->stat_lock);
972	977	sctx->stat.csum_errors++;
973	978	spin_unlock(&sctx->stat_lock);
974		- if (__ratelimit(&_rs))
	979	+ if (__ratelimit(&rs))
975	980	scrub_print_warning("checksum error", sblock_to_check);
976	981	btrfs_dev_stat_inc_and_print(dev,
977	982	BTRFS_DEV_STAT_CORRUPTION_ERRS);
..	..	@@ -979,7 +984,7 @@
979	984	spin_lock(&sctx->stat_lock);
980	985	sctx->stat.verify_errors++;
981	986	spin_unlock(&sctx->stat_lock);
982		- if (__ratelimit(&_rs))
	987	+ if (__ratelimit(&rs))
983	988	scrub_print_warning("checksum/header error",
984	989	sblock_to_check);
985	990	if (sblock_bad->generation_error)
..	..	@@ -1133,7 +1138,7 @@
1133	1138
1134	1139	if (scrub_write_page_to_dev_replace(sblock_other,
1135	1140	page_num) != 0) {
1136		- btrfs_dev_replace_stats_inc(
	1141	+ atomic64_inc(
1137	1142	&fs_info->dev_replace.num_write_errors);
1138	1143	success = 0;
1139	1144	}
..	..	@@ -1211,6 +1216,7 @@
1211	1216	}
1212	1217
1213	1218	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
	1219	+ memalloc_nofs_restore(nofs_flag);
1214	1220	if (ret < 0)
1215	1221	return ret;
1216	1222	return 0;
..	..	@@ -1573,8 +1579,7 @@
1573	1579	if (btrfsic_submit_bio_wait(bio)) {
1574	1580	btrfs_dev_stat_inc_and_print(page_bad->dev,
1575	1581	BTRFS_DEV_STAT_WRITE_ERRS);
1576		- btrfs_dev_replace_stats_inc(
1577		- &fs_info->dev_replace.num_write_errors);
	1582	+ atomic64_inc(&fs_info->dev_replace.num_write_errors);
1578	1583	bio_put(bio);
1579	1584	return -EIO;
1580	1585	}
..	..	@@ -1601,8 +1606,7 @@
1601	1606
1602	1607	ret = scrub_write_page_to_dev_replace(sblock, page_num);
1603	1608	if (ret)
1604		- btrfs_dev_replace_stats_inc(
1605		- &fs_info->dev_replace.num_write_errors);
	1609	+ atomic64_inc(&fs_info->dev_replace.num_write_errors);
1606	1610	}
1607	1611	}
1608	1612
..	..	@@ -1612,13 +1616,9 @@
1612	1616	struct scrub_page *spage = sblock->pagev[page_num];
1613	1617
1614	1618	BUG_ON(spage->page == NULL);
1615		- if (spage->io_error) {
1616		- void *mapped_buffer = kmap_atomic(spage->page);
	1619	+ if (spage->io_error)
	1620	+ clear_page(page_address(spage->page));
1617	1621
1618		- clear_page(mapped_buffer);
1619		- flush_dcache_page(spage->page);
1620		- kunmap_atomic(mapped_buffer);
1621		- }
1622	1622	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1623	1623	}
1624	1624
..	..	@@ -1631,19 +1631,8 @@
1631	1631	mutex_lock(&sctx->wr_lock);
1632	1632	again:
1633	1633	if (!sctx->wr_curr_bio) {
1634		- unsigned int nofs_flag;
1635		-
1636		- /*
1637		- * We must use GFP_NOFS because the scrub task might be waiting
1638		- * for a worker task executing this function and in turn a
1639		- * transaction commit might be waiting the scrub task to pause
1640		- * (which needs to wait for all the worker tasks to complete
1641		- * before pausing).
1642		- */
1643		- nofs_flag = memalloc_nofs_save();
1644	1634	sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1645	1635	GFP_KERNEL);
1646		- memalloc_nofs_restore(nofs_flag);
1647	1636	if (!sctx->wr_curr_bio) {
1648	1637	mutex_unlock(&sctx->wr_lock);
1649	1638	return -ENOMEM;
..	..	@@ -1726,8 +1715,7 @@
1726	1715	sbio->status = bio->bi_status;
1727	1716	sbio->bio = bio;
1728	1717
1729		- btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1730		- scrub_wr_bio_end_io_worker, NULL, NULL);
	1718	+ btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1731	1719	btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1732	1720	}
1733	1721
..	..	@@ -1746,8 +1734,7 @@
1746	1734	struct scrub_page *spage = sbio->pagev[i];
1747	1735
1748	1736	spage->io_error = 1;
1749		- btrfs_dev_replace_stats_inc(&dev_replace->
1750		- num_write_errors);
	1737	+ atomic64_inc(&dev_replace->num_write_errors);
1751	1738	}
1752	1739	}
1753	1740
..	..	@@ -1796,41 +1783,24 @@
1796	1783	static int scrub_checksum_data(struct scrub_block *sblock)
1797	1784	{
1798	1785	struct scrub_ctx *sctx = sblock->sctx;
	1786	+ struct btrfs_fs_info *fs_info = sctx->fs_info;
	1787	+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1799	1788	u8 csum[BTRFS_CSUM_SIZE];
1800		- u8 *on_disk_csum;
1801		- struct page *page;
1802		- void *buffer;
1803		- u32 crc = ~(u32)0;
1804		- u64 len;
1805		- int index;
	1789	+ struct scrub_page *spage;
	1790	+ char *kaddr;
1806	1791
1807	1792	BUG_ON(sblock->page_count < 1);
1808		- if (!sblock->pagev[0]->have_csum)
	1793	+ spage = sblock->pagev[0];
	1794	+ if (!spage->have_csum)
1809	1795	return 0;
1810	1796
1811		- on_disk_csum = sblock->pagev[0]->csum;
1812		- page = sblock->pagev[0]->page;
1813		- buffer = kmap_atomic(page);
	1797	+ kaddr = page_address(spage->page);
1814	1798
1815		- len = sctx->fs_info->sectorsize;
1816		- index = 0;
1817		- for (;;) {
1818		- u64 l = min_t(u64, len, PAGE_SIZE);
	1799	+ shash->tfm = fs_info->csum_shash;
	1800	+ crypto_shash_init(shash);
	1801	+ crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
1819	1802
1820		- crc = btrfs_csum_data(buffer, crc, l);
1821		- kunmap_atomic(buffer);
1822		- len -= l;
1823		- if (len == 0)
1824		- break;
1825		- index++;
1826		- BUG_ON(index >= sblock->page_count);
1827		- BUG_ON(!sblock->pagev[index]->page);
1828		- page = sblock->pagev[index]->page;
1829		- buffer = kmap_atomic(page);
1830		- }
1831		-
1832		- btrfs_csum_final(crc, csum);
1833		- if (memcmp(csum, on_disk_csum, sctx->csum_size))
	1803	+ if (memcmp(csum, spage->csum, sctx->csum_size))
1834	1804	sblock->checksum_error = 1;
1835	1805
1836	1806	return sblock->checksum_error;
..	..	@@ -1841,20 +1811,18 @@
1841	1811	struct scrub_ctx *sctx = sblock->sctx;
1842	1812	struct btrfs_header *h;
1843	1813	struct btrfs_fs_info *fs_info = sctx->fs_info;
	1814	+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1844	1815	u8 calculated_csum[BTRFS_CSUM_SIZE];
1845	1816	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1846		- struct page *page;
1847		- void *mapped_buffer;
1848		- u64 mapped_size;
1849		- void *p;
1850		- u32 crc = ~(u32)0;
1851		- u64 len;
1852		- int index;
	1817	+ const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT;
	1818	+ int i;
	1819	+ struct scrub_page *spage;
	1820	+ char *kaddr;
1853	1821
1854	1822	BUG_ON(sblock->page_count < 1);
1855		- page = sblock->pagev[0]->page;
1856		- mapped_buffer = kmap_atomic(page);
1857		- h = (struct btrfs_header *)mapped_buffer;
	1823	+ spage = sblock->pagev[0];
	1824	+ kaddr = page_address(spage->page);
	1825	+ h = (struct btrfs_header *)kaddr;
1858	1826	memcpy(on_disk_csum, h->csum, sctx->csum_size);
1859	1827
1860	1828	/*
..	..	@@ -1862,43 +1830,32 @@
1862	1830	* a) don't have an extent buffer and
1863	1831	* b) the page is already kmapped
1864	1832	*/
1865		- if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
	1833	+ if (spage->logical != btrfs_stack_header_bytenr(h))
1866	1834	sblock->header_error = 1;
1867	1835
1868		- if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
	1836	+ if (spage->generation != btrfs_stack_header_generation(h)) {
1869	1837	sblock->header_error = 1;
1870	1838	sblock->generation_error = 1;
1871	1839	}
1872	1840
1873		- if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
	1841	+ if (!scrub_check_fsid(h->fsid, spage))
1874	1842	sblock->header_error = 1;
1875	1843
1876	1844	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1877	1845	BTRFS_UUID_SIZE))
1878	1846	sblock->header_error = 1;
1879	1847
1880		- len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
1881		- mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1882		- p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1883		- index = 0;
1884		- for (;;) {
1885		- u64 l = min_t(u64, len, mapped_size);
	1848	+ shash->tfm = fs_info->csum_shash;
	1849	+ crypto_shash_init(shash);
	1850	+ crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
	1851	+ PAGE_SIZE - BTRFS_CSUM_SIZE);
1886	1852
1887		- crc = btrfs_csum_data(p, crc, l);
1888		- kunmap_atomic(mapped_buffer);
1889		- len -= l;
1890		- if (len == 0)
1891		- break;
1892		- index++;
1893		- BUG_ON(index >= sblock->page_count);
1894		- BUG_ON(!sblock->pagev[index]->page);
1895		- page = sblock->pagev[index]->page;
1896		- mapped_buffer = kmap_atomic(page);
1897		- mapped_size = PAGE_SIZE;
1898		- p = mapped_buffer;
	1853	+ for (i = 1; i < num_pages; i++) {
	1854	+ kaddr = page_address(sblock->pagev[i]->page);
	1855	+ crypto_shash_update(shash, kaddr, PAGE_SIZE);
1899	1856	}
1900	1857
1901		- btrfs_csum_final(crc, calculated_csum);
	1858	+ crypto_shash_final(shash, calculated_csum);
1902	1859	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1903	1860	sblock->checksum_error = 1;
1904	1861
..	..	@@ -1909,56 +1866,34 @@
1909	1866	{
1910	1867	struct btrfs_super_block *s;
1911	1868	struct scrub_ctx *sctx = sblock->sctx;
	1869	+ struct btrfs_fs_info *fs_info = sctx->fs_info;
	1870	+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1912	1871	u8 calculated_csum[BTRFS_CSUM_SIZE];
1913		- u8 on_disk_csum[BTRFS_CSUM_SIZE];
1914		- struct page *page;
1915		- void *mapped_buffer;
1916		- u64 mapped_size;
1917		- void *p;
1918		- u32 crc = ~(u32)0;
	1872	+ struct scrub_page *spage;
	1873	+ char *kaddr;
1919	1874	int fail_gen = 0;
1920	1875	int fail_cor = 0;
1921		- u64 len;
1922		- int index;
1923	1876
1924	1877	BUG_ON(sblock->page_count < 1);
1925		- page = sblock->pagev[0]->page;
1926		- mapped_buffer = kmap_atomic(page);
1927		- s = (struct btrfs_super_block *)mapped_buffer;
1928		- memcpy(on_disk_csum, s->csum, sctx->csum_size);
	1878	+ spage = sblock->pagev[0];
	1879	+ kaddr = page_address(spage->page);
	1880	+ s = (struct btrfs_super_block *)kaddr;
1929	1881
1930		- if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
	1882	+ if (spage->logical != btrfs_super_bytenr(s))
1931	1883	++fail_cor;
1932	1884
1933		- if (sblock->pagev[0]->generation != btrfs_super_generation(s))
	1885	+ if (spage->generation != btrfs_super_generation(s))
1934	1886	++fail_gen;
1935	1887
1936		- if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
	1888	+ if (!scrub_check_fsid(s->fsid, spage))
1937	1889	++fail_cor;
1938	1890
1939		- len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1940		- mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1941		- p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1942		- index = 0;
1943		- for (;;) {
1944		- u64 l = min_t(u64, len, mapped_size);
	1891	+ shash->tfm = fs_info->csum_shash;
	1892	+ crypto_shash_init(shash);
	1893	+ crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
	1894	+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1945	1895
1946		- crc = btrfs_csum_data(p, crc, l);
1947		- kunmap_atomic(mapped_buffer);
1948		- len -= l;
1949		- if (len == 0)
1950		- break;
1951		- index++;
1952		- BUG_ON(index >= sblock->page_count);
1953		- BUG_ON(!sblock->pagev[index]->page);
1954		- page = sblock->pagev[index]->page;
1955		- mapped_buffer = kmap_atomic(page);
1956		- mapped_size = PAGE_SIZE;
1957		- p = mapped_buffer;
1958		- }
1959		-
1960		- btrfs_csum_final(crc, calculated_csum);
1961		- if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
	1896	+ if (memcmp(calculated_csum, s->csum, sctx->csum_size))
1962	1897	++fail_cor;
1963	1898
1964	1899	if (fail_cor + fail_gen) {
..	..	@@ -1971,10 +1906,10 @@
1971	1906	++sctx->stat.super_errors;
1972	1907	spin_unlock(&sctx->stat_lock);
1973	1908	if (fail_cor)
1974		- btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
	1909	+ btrfs_dev_stat_inc_and_print(spage->dev,
1975	1910	BTRFS_DEV_STAT_CORRUPTION_ERRS);
1976	1911	else
1977		- btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
	1912	+ btrfs_dev_stat_inc_and_print(spage->dev,
1978	1913	BTRFS_DEV_STAT_GENERATION_ERRS);
1979	1914	}
1980	1915
..	..	@@ -2199,8 +2134,7 @@
2199	2134	raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2200	2135	}
2201	2136
2202		- btrfs_init_work(&sblock->work, btrfs_scrub_helper,
2203		- scrub_missing_raid56_worker, NULL, NULL);
	2137	+ btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
2204	2138	scrub_block_get(sblock);
2205	2139	scrub_pending_bio_inc(sctx);
2206	2140	raid56_submit_missing_rbio(rbio);
..	..	@@ -2456,7 +2390,7 @@
2456	2390	ASSERT(index < UINT_MAX);
2457	2391
2458	2392	num_sectors = sum->len / sctx->fs_info->sectorsize;
2459		- memcpy(csum, sum->sums + index, sctx->csum_size);
	2393	+ memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size);
2460	2394	if (index == num_sectors - 1) {
2461	2395	list_del(&sum->list);
2462	2396	kfree(sum);
..	..	@@ -2668,18 +2602,18 @@
2668	2602	u64 last_offset;
2669	2603	u32 stripe_index;
2670	2604	u32 rot;
	2605	+ const int data_stripes = nr_data_stripes(map);
2671	2606
2672		- last_offset = (physical - map->stripes[num].physical) *
2673		- nr_data_stripes(map);
	2607	+ last_offset = (physical - map->stripes[num].physical) * data_stripes;
2674	2608	if (stripe_start)
2675	2609	*stripe_start = last_offset;
2676	2610
2677	2611	*offset = last_offset;
2678		- for (i = 0; i < nr_data_stripes(map); i++) {
	2612	+ for (i = 0; i < data_stripes; i++) {
2679	2613	*offset = last_offset + i * map->stripe_len;
2680	2614
2681	2615	stripe_nr = div64_u64(*offset, map->stripe_len);
2682		- stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
	2616	+ stripe_nr = div_u64(stripe_nr, data_stripes);
2683	2617
2684	2618	/* Work out the disk rotation on this stripe-set */
2685	2619	stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
..	..	@@ -2738,8 +2672,8 @@
2738	2672
2739	2673	bio_put(bio);
2740	2674
2741		- btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
2742		- scrub_parity_bio_endio_worker, NULL, NULL);
	2675	+ btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
	2676	+ NULL);
2743	2677	btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
2744	2678	}
2745	2679
..	..	@@ -3041,7 +2975,8 @@
3041	2975	static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3042	2976	struct map_lookup *map,
3043	2977	struct btrfs_device *scrub_dev,
3044		- int num, u64 base, u64 length)
	2978	+ int num, u64 base, u64 length,
	2979	+ struct btrfs_block_group *cache)
3045	2980	{
3046	2981	struct btrfs_path path, ppath;
3047	2982	struct btrfs_fs_info *fs_info = sctx->fs_info;
..	..	@@ -3087,7 +3022,7 @@
3087	3022	offset = map->stripe_len * (num / map->sub_stripes);
3088	3023	increment = map->stripe_len * factor;
3089	3024	mirror_num = num % map->sub_stripes + 1;
3090		- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
	3025	+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
3091	3026	increment = map->stripe_len;
3092	3027	mirror_num = num % map->num_stripes + 1;
3093	3028	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
..	..	@@ -3279,6 +3214,20 @@
3279	3214	break;
3280	3215	}
3281	3216
	3217	+ /*
	3218	+ * If our block group was removed in the meanwhile, just
	3219	+ * stop scrubbing since there is no point in continuing.
	3220	+ * Continuing would prevent reusing its device extents
	3221	+ * for new block groups for a long time.
	3222	+ */
	3223	+ spin_lock(&cache->lock);
	3224	+ if (cache->removed) {
	3225	+ spin_unlock(&cache->lock);
	3226	+ ret = 0;
	3227	+ goto out;
	3228	+ }
	3229	+ spin_unlock(&cache->lock);
	3230	+
3282	3231	extent = btrfs_item_ptr(l, slot,
3283	3232	struct btrfs_extent_item);
3284	3233	flags = btrfs_extent_flags(l, extent);
..	..	@@ -3323,13 +3272,14 @@
3323	3272	&extent_dev,
3324	3273	&extent_mirror_num);
3325	3274
3326		- ret = btrfs_lookup_csums_range(csum_root,
3327		- extent_logical,
3328		- extent_logical +
3329		- extent_len - 1,
3330		- &sctx->csum_list, 1);
3331		- if (ret)
3332		- goto out;
	3275	+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
	3276	+ ret = btrfs_lookup_csums_range(csum_root,
	3277	+ extent_logical,
	3278	+ extent_logical + extent_len - 1,
	3279	+ &sctx->csum_list, 1);
	3280	+ if (ret)
	3281	+ goto out;
	3282	+ }
3333	3283
3334	3284	ret = scrub_extent(sctx, map, extent_logical, extent_len,
3335	3285	extent_physical, extent_dev, flags,
..	..	@@ -3415,18 +3365,18 @@
3415	3365	struct btrfs_device *scrub_dev,
3416	3366	u64 chunk_offset, u64 length,
3417	3367	u64 dev_offset,
3418		- struct btrfs_block_group_cache *cache)
	3368	+ struct btrfs_block_group *cache)
3419	3369	{
3420	3370	struct btrfs_fs_info *fs_info = sctx->fs_info;
3421		- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	3371	+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3422	3372	struct map_lookup *map;
3423	3373	struct extent_map *em;
3424	3374	int i;
3425	3375	int ret = 0;
3426	3376
3427		- read_lock(&map_tree->map_tree.lock);
3428		- em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3429		- read_unlock(&map_tree->map_tree.lock);
	3377	+ read_lock(&map_tree->lock);
	3378	+ em = lookup_extent_mapping(map_tree, chunk_offset, 1);
	3379	+ read_unlock(&map_tree->lock);
3430	3380
3431	3381	if (!em) {
3432	3382	/*
..	..	@@ -3452,7 +3402,7 @@
3452	3402	if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3453	3403	map->stripes[i].physical == dev_offset) {
3454	3404	ret = scrub_stripe(sctx, map, scrub_dev, i,
3455		- chunk_offset, length);
	3405	+ chunk_offset, length, cache);
3456	3406	if (ret)
3457	3407	goto out;
3458	3408	}
..	..	@@ -3479,7 +3429,7 @@
3479	3429	struct extent_buffer *l;
3480	3430	struct btrfs_key key;
3481	3431	struct btrfs_key found_key;
3482		- struct btrfs_block_group_cache *cache;
	3432	+ struct btrfs_block_group *cache;
3483	3433	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3484	3434
3485	3435	path = btrfs_alloc_path();
..	..	@@ -3550,6 +3500,23 @@
3550	3500	goto skip;
3551	3501
3552	3502	/*
	3503	+ * Make sure that while we are scrubbing the corresponding block
	3504	+ * group doesn't get its logical address and its device extents
	3505	+ * reused for another block group, which can possibly be of a
	3506	+ * different type and different profile. We do this to prevent
	3507	+ * false error detections and crashes due to bogus attempts to
	3508	+ * repair extents.
	3509	+ */
	3510	+ spin_lock(&cache->lock);
	3511	+ if (cache->removed) {
	3512	+ spin_unlock(&cache->lock);
	3513	+ btrfs_put_block_group(cache);
	3514	+ goto skip;
	3515	+ }
	3516	+ btrfs_freeze_block_group(cache);
	3517	+ spin_unlock(&cache->lock);
	3518	+
	3519	+ /*
3553	3520	* we need call btrfs_inc_block_group_ro() with scrubs_paused,
3554	3521	* to avoid deadlock caused by:
3555	3522	* btrfs_inc_block_group_ro()
..	..	@@ -3558,71 +3525,90 @@
3558	3525	* -> btrfs_scrub_pause()
3559	3526	*/
3560	3527	scrub_pause_on(fs_info);
3561		- ret = btrfs_inc_block_group_ro(cache);
3562		- if (!ret && sctx->is_dev_replace) {
3563		- /*
3564		- * If we are doing a device replace wait for any tasks
3565		- * that started dellaloc right before we set the block
3566		- * group to RO mode, as they might have just allocated
3567		- * an extent from it or decided they could do a nocow
3568		- * write. And if any such tasks did that, wait for their
3569		- * ordered extents to complete and then commit the
3570		- * current transaction, so that we can later see the new
3571		- * extent items in the extent tree - the ordered extents
3572		- * create delayed data references (for cow writes) when
3573		- * they complete, which will be run and insert the
3574		- * corresponding extent items into the extent tree when
3575		- * we commit the transaction they used when running
3576		- * inode.c:btrfs_finish_ordered_io(). We later use
3577		- * the commit root of the extent tree to find extents
3578		- * to copy from the srcdev into the tgtdev, and we don't
3579		- * want to miss any new extents.
3580		- */
3581		- btrfs_wait_block_group_reservations(cache);
3582		- btrfs_wait_nocow_writers(cache);
3583		- ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
3584		- cache->key.objectid,
3585		- cache->key.offset);
3586		- if (ret > 0) {
3587		- struct btrfs_trans_handle *trans;
3588	3528
3589		- trans = btrfs_join_transaction(root);
3590		- if (IS_ERR(trans))
3591		- ret = PTR_ERR(trans);
3592		- else
3593		- ret = btrfs_commit_transaction(trans);
3594		- if (ret) {
3595		- scrub_pause_off(fs_info);
3596		- btrfs_put_block_group(cache);
3597		- break;
3598		- }
3599		- }
3600		- }
3601		- scrub_pause_off(fs_info);
3602		-
	3529	+ /*
	3530	+ * Don't do chunk preallocation for scrub.
	3531	+ *
	3532	+ * This is especially important for SYSTEM bgs, or we can hit
	3533	+ * -EFBIG from btrfs_finish_chunk_alloc() like:
	3534	+ * 1. The only SYSTEM bg is marked RO.
	3535	+ * Since SYSTEM bg is small, that's pretty common.
	3536	+ * 2. New SYSTEM bg will be allocated
	3537	+ * Because the regular version will allocate a new chunk.
	3538	+ * 3. New SYSTEM bg is empty and will get cleaned up
	3539	+ * Before cleanup really happens, it's marked RO again.
	3540	+ * 4. Empty SYSTEM bg gets scrubbed
	3541	+ * We go back to 2.
	3542	+ *
	3543	+ * This can easily boost the amount of SYSTEM chunks if cleaner
	3544	+ * thread can't be triggered fast enough, and use up all space
	3545	+ * of btrfs_super_block::sys_chunk_array
	3546	+ *
	3547	+ * While for dev replace, we need to try our best to mark block
	3548	+ * group RO, to prevent race between:
	3549	+ * - Write duplication
	3550	+ * Contains latest data
	3551	+ * - Scrub copy
	3552	+ * Contains data from commit tree
	3553	+ *
	3554	+ * If target block group is not marked RO, nocow writes can
	3555	+ * be overwritten by scrub copy, causing data corruption.
	3556	+ * So for dev-replace, it's not allowed to continue if a block
	3557	+ * group is not RO.
	3558	+ */
	3559	+ ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3603	3560	if (ret == 0) {
3604	3561	ro_set = 1;
3605		- } else if (ret == -ENOSPC) {
	3562	+ } else if (ret == -ENOSPC && !sctx->is_dev_replace &&
	3563	+ !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
3606	3564	/*
3607	3565	* btrfs_inc_block_group_ro return -ENOSPC when it
3608	3566	* failed in creating new chunk for metadata.
3609		- * It is not a problem for scrub/replace, because
	3567	+ * It is not a problem for scrub, because
3610	3568	* metadata are always cowed, and our scrub paused
3611	3569	* commit_transactions.
	3570	+ *
	3571	+ * For RAID56 chunks, we have to mark them read-only
	3572	+ * for scrub, as later we would use our own cache
	3573	+ * out of RAID56 realm.
	3574	+ * Thus we want the RAID56 bg to be marked RO to
	3575	+ * prevent RMW from screwing up our cache.
3612	3576	*/
3613	3577	ro_set = 0;
	3578	+ } else if (ret == -ETXTBSY) {
	3579	+ btrfs_warn(fs_info,
	3580	+ "skipping scrub of block group %llu due to active swapfile",
	3581	+ cache->start);
	3582	+ scrub_pause_off(fs_info);
	3583	+ ret = 0;
	3584	+ goto skip_unfreeze;
3614	3585	} else {
3615	3586	btrfs_warn(fs_info,
3616	3587	"failed setting block group ro: %d", ret);
	3588	+ btrfs_unfreeze_block_group(cache);
3617	3589	btrfs_put_block_group(cache);
	3590	+ scrub_pause_off(fs_info);
3618	3591	break;
3619	3592	}
3620	3593
3621		- btrfs_dev_replace_write_lock(&fs_info->dev_replace);
	3594	+ /*
	3595	+ * Now the target block is marked RO, wait for nocow writes to
	3596	+ * finish before dev-replace.
	3597	+ * COW is fine, as COW never overwrites extents in commit tree.
	3598	+ */
	3599	+ if (sctx->is_dev_replace) {
	3600	+ btrfs_wait_nocow_writers(cache);
	3601	+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
	3602	+ cache->length);
	3603	+ }
	3604	+
	3605	+ scrub_pause_off(fs_info);
	3606	+ down_write(&dev_replace->rwsem);
3622	3607	dev_replace->cursor_right = found_key.offset + length;
3623	3608	dev_replace->cursor_left = found_key.offset;
3624	3609	dev_replace->item_needs_writeback = 1;
3625		- btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
	3610	+ up_write(&dev_replace->rwsem);
	3611	+
3626	3612	ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3627	3613	found_key.offset, cache);
3628	3614
..	..	@@ -3658,10 +3644,10 @@
3658	3644
3659	3645	scrub_pause_off(fs_info);
3660	3646
3661		- btrfs_dev_replace_write_lock(&fs_info->dev_replace);
	3647	+ down_write(&dev_replace->rwsem);
3662	3648	dev_replace->cursor_left = dev_replace->cursor_right;
3663	3649	dev_replace->item_needs_writeback = 1;
3664		- btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
	3650	+ up_write(&dev_replace->rwsem);
3665	3651
3666	3652	if (ro_set)
3667	3653	btrfs_dec_block_group_ro(cache);
..	..	@@ -3675,13 +3661,18 @@
3675	3661	*/
3676	3662	spin_lock(&cache->lock);
3677	3663	if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3678		- btrfs_block_group_used(&cache->item) == 0) {
	3664	+ cache->used == 0) {
3679	3665	spin_unlock(&cache->lock);
3680		- btrfs_mark_bg_unused(cache);
	3666	+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
	3667	+ btrfs_discard_queue_work(&fs_info->discard_ctl,
	3668	+ cache);
	3669	+ else
	3670	+ btrfs_mark_bg_unused(cache);
3681	3671	} else {
3682	3672	spin_unlock(&cache->lock);
3683	3673	}
3684		-
	3674	+skip_unfreeze:
	3675	+ btrfs_unfreeze_block_group(cache);
3685	3676	btrfs_put_block_group(cache);
3686	3677	if (ret)
3687	3678	break;
..	..	@@ -3714,7 +3705,7 @@
3714	3705	struct btrfs_fs_info *fs_info = sctx->fs_info;
3715	3706
3716	3707	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3717		- return -EIO;
	3708	+ return -EROFS;
3718	3709
3719	3710	/* Seed devices of a new filesystem has their own generation. */
3720	3711	if (scrub_dev->fs_devices != fs_info->fs_devices)
..	..	@@ -3739,42 +3730,84 @@
3739	3730	return 0;
3740	3731	}
3741	3732
	3733	+static void scrub_workers_put(struct btrfs_fs_info *fs_info)
	3734	+{
		/*
		 * Drop one reference on the scrub workqueues of @fs_info.
		 *
		 * The branch below runs with fs_info->scrub_lock held; it is unlocked
		 * again before the workqueues are destroyed. Per the kernel refcount
		 * API, refcount_dec_and_mutex_lock() returns true (with the mutex
		 * taken) only when the decrement brought the count to zero, so the
		 * branch is the last-user teardown path.
		 */
	3735	+ if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
	3736	+ &fs_info->scrub_lock)) {
	3737	+ struct btrfs_workqueue *scrub_workers = NULL;
	3738	+ struct btrfs_workqueue *scrub_wr_comp = NULL;
	3739	+ struct btrfs_workqueue *scrub_parity = NULL;
	3740	+
		/* Save the three workqueue pointers in locals for use after unlocking. */
	3741	+ scrub_workers = fs_info->scrub_workers;
	3742	+ scrub_wr_comp = fs_info->scrub_wr_completion_workers;
	3743	+ scrub_parity = fs_info->scrub_parity_workers;
	3744	+
		/* Clear the fs_info pointers while scrub_lock is still held. */
	3745	+ fs_info->scrub_workers = NULL;
	3746	+ fs_info->scrub_wr_completion_workers = NULL;
	3747	+ fs_info->scrub_parity_workers = NULL;
	3748	+ mutex_unlock(&fs_info->scrub_lock);
	3749	+
		/* The workqueues are destroyed outside scrub_lock, via the saved copies. */
	3750	+ btrfs_destroy_workqueue(scrub_workers);
	3751	+ btrfs_destroy_workqueue(scrub_wr_comp);
	3752	+ btrfs_destroy_workqueue(scrub_parity);
	3753	+ }
	3754	+}
	3755	+
3742	3756	/*
3743	3757	* get a reference count on fs_info->scrub_workers. start worker if necessary
3744	3758	*/
3745	3759	static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3746	3760	int is_dev_replace)
3747	3761	{
		/*
		 * Reading this hunk: rows marked "-" are the removed implementation,
		 * which allocated directly into the fs_info fields and tested/bumped
		 * scrub_workers_refcnt as a plain integer. Rows marked "+" are the
		 * replacement, which allocates into locals and uses the refcount_*()
		 * helpers plus fs_info->scrub_lock.
		 *
		 * Returns 0 once a reference is held, or -ENOMEM if one of the three
		 * workqueue allocations fails.
		 */
	3762	+ struct btrfs_workqueue *scrub_workers = NULL;
	3763	+ struct btrfs_workqueue *scrub_wr_comp = NULL;
	3764	+ struct btrfs_workqueue *scrub_parity = NULL;
3748	3765	unsigned int flags = WQ_FREEZABLE \| WQ_UNBOUND;
3749	3766	int max_active = fs_info->thread_pool_size;
		/* ret starts as -ENOMEM so each allocation-failure label below returns it. */
	3767	+ int ret = -ENOMEM;
3750	3768
3751		- if (fs_info->scrub_workers_refcnt == 0) {
3752		- fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
3753		- flags, is_dev_replace ? 1 : max_active, 4);
3754		- if (!fs_info->scrub_workers)
3755		- goto fail_scrub_workers;
		/*
		 * Fast path: if the count is already non-zero, just take one more
		 * reference; nothing needs to be allocated.
		 */
	3769	+ if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
	3770	+ return 0;
3756	3771
3757		- fs_info->scrub_wr_completion_workers =
3758		- btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
		/*
		 * Slow path: all three workqueues are allocated into locals before
		 * scrub_lock is taken. The "scrub" queue is given 1 instead of
		 * max_active when is_dev_replace is set (presumably a concurrency
		 * limit; confirm against btrfs_alloc_workqueue()).
		 */
	3772	+ scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
	3773	+ is_dev_replace ? 1 : max_active, 4);
	3774	+ if (!scrub_workers)
	3775	+ goto fail_scrub_workers;
	3776	+
	3777	+ scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
3759	3778	max_active, 2);
3760		- if (!fs_info->scrub_wr_completion_workers)
3761		- goto fail_scrub_wr_completion_workers;
	3779	+ if (!scrub_wr_comp)
	3780	+ goto fail_scrub_wr_completion_workers;
3762	3781
3763		- fs_info->scrub_parity_workers =
3764		- btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
3765		- max_active, 2);
3766		- if (!fs_info->scrub_parity_workers)
3767		- goto fail_scrub_parity_workers;
	3782	+ scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
	3783	+ max_active, 2);
	3784	+ if (!scrub_parity)
	3785	+ goto fail_scrub_parity_workers;
	3786	+
		/*
		 * Publish the new workqueues only if the count still reads zero once
		 * scrub_lock is held; in that case the count is set to 1 and the
		 * locals now belong to fs_info.
		 */
	3787	+ mutex_lock(&fs_info->scrub_lock);
	3788	+ if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
	3789	+ ASSERT(fs_info->scrub_workers == NULL &&
	3790	+ fs_info->scrub_wr_completion_workers == NULL &&
	3791	+ fs_info->scrub_parity_workers == NULL);
	3792	+ fs_info->scrub_workers = scrub_workers;
	3793	+ fs_info->scrub_wr_completion_workers = scrub_wr_comp;
	3794	+ fs_info->scrub_parity_workers = scrub_parity;
	3795	+ refcount_set(&fs_info->scrub_workers_refcnt, 1);
	3796	+ mutex_unlock(&fs_info->scrub_lock);
	3797	+ return 0;
3768	3798	}
3769		- ++fs_info->scrub_workers_refcnt;
3770		- return 0;
	3799	+ /* Other thread raced in and created the workers for us */
	3800	+ refcount_inc(&fs_info->scrub_workers_refcnt);
	3801	+ mutex_unlock(&fs_info->scrub_lock);
3771	3802
		/*
		 * Losing the race is not an error: ret becomes 0, and execution then
		 * falls through the labels below to destroy our own, now unused,
		 * workqueues. The same labels serve the allocation-failure gotos,
		 * which arrive with ret still -ENOMEM.
		 */
	3803	+ ret = 0;
	3804	+ btrfs_destroy_workqueue(scrub_parity);
3772	3805	fail_scrub_parity_workers:
3773		- btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
	3806	+ btrfs_destroy_workqueue(scrub_wr_comp);
3774	3807	fail_scrub_wr_completion_workers:
3775		- btrfs_destroy_workqueue(fs_info->scrub_workers);
	3808	+ btrfs_destroy_workqueue(scrub_workers);
3776	3809	fail_scrub_workers:
3777		- return -ENOMEM;
	3810	+ return ret;
3778	3811	}
3779	3812
3780	3813	int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
..	..	@@ -3785,12 +3818,10 @@
3785	3818	int ret;
3786	3819	struct btrfs_device *dev;
3787	3820	unsigned int nofs_flag;
3788		- struct btrfs_workqueue *scrub_workers = NULL;
3789		- struct btrfs_workqueue *scrub_wr_comp = NULL;
3790		- struct btrfs_workqueue *scrub_parity = NULL;
	3821	+ bool need_commit = false;
3791	3822
3792	3823	if (btrfs_fs_closing(fs_info))
3793		- return -EINVAL;
	3824	+ return -EAGAIN;
3794	3825
3795	3826	if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
3796	3827	/*
..	..	@@ -3834,22 +3865,27 @@
3834	3865	if (IS_ERR(sctx))
3835	3866	return PTR_ERR(sctx);
3836	3867
	3868	+ ret = scrub_workers_get(fs_info, is_dev_replace);
	3869	+ if (ret)
	3870	+ goto out_free_ctx;
	3871	+
3837	3872	mutex_lock(&fs_info->fs_devices->device_list_mutex);
3838	3873	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
3839	3874	if (!dev \|\| (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
3840	3875	!is_dev_replace)) {
3841	3876	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3842	3877	ret = -ENODEV;
3843		- goto out_free_ctx;
	3878	+ goto out;
3844	3879	}
3845	3880
3846	3881	if (!is_dev_replace && !readonly &&
3847	3882	!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
3848	3883	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3849		- btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
3850		- rcu_str_deref(dev->name));
	3884	+ btrfs_err_in_rcu(fs_info,
	3885	+ "scrub on devid %llu: filesystem on %s is not writable",
	3886	+ devid, rcu_str_deref(dev->name));
3851	3887	ret = -EROFS;
3852		- goto out_free_ctx;
	3888	+ goto out;
3853	3889	}
3854	3890
3855	3891	mutex_lock(&fs_info->scrub_lock);
..	..	@@ -3858,27 +3894,20 @@
3858	3894	mutex_unlock(&fs_info->scrub_lock);
3859	3895	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3860	3896	ret = -EIO;
3861		- goto out_free_ctx;
	3897	+ goto out;
3862	3898	}
3863	3899
3864		- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	3900	+ down_read(&fs_info->dev_replace.rwsem);
3865	3901	if (dev->scrub_ctx \|\|
3866	3902	(!is_dev_replace &&
3867	3903	btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3868		- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
	3904	+ up_read(&fs_info->dev_replace.rwsem);
3869	3905	mutex_unlock(&fs_info->scrub_lock);
3870	3906	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3871	3907	ret = -EINPROGRESS;
3872		- goto out_free_ctx;
	3908	+ goto out;
3873	3909	}
3874		- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
3875		-
3876		- ret = scrub_workers_get(fs_info, is_dev_replace);
3877		- if (ret) {
3878		- mutex_unlock(&fs_info->scrub_lock);
3879		- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3880		- goto out_free_ctx;
3881		- }
	3910	+ up_read(&fs_info->dev_replace.rwsem);
3882	3911
3883	3912	sctx->readonly = readonly;
3884	3913	dev->scrub_ctx = sctx;
..	..	@@ -3903,6 +3932,13 @@
3903	3932	*/
3904	3933	nofs_flag = memalloc_nofs_save();
3905	3934	if (!is_dev_replace) {
	3935	+ u64 old_super_errors;
	3936	+
	3937	+ spin_lock(&sctx->stat_lock);
	3938	+ old_super_errors = sctx->stat.super_errors;
	3939	+ spin_unlock(&sctx->stat_lock);
	3940	+
	3941	+ btrfs_info(fs_info, "scrub: started on devid %llu", devid);
3906	3942	/*
3907	3943	* by holding device list mutex, we can
3908	3944	* kick off writing super in log tree sync.
..	..	@@ -3910,6 +3946,16 @@
3910	3946	mutex_lock(&fs_info->fs_devices->device_list_mutex);
3911	3947	ret = scrub_supers(sctx, dev);
3912	3948	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	3949	+
	3950	+ spin_lock(&sctx->stat_lock);
	3951	+ /*
	3952	+ * Super block errors found, but we can not commit transaction
	3953	+ * at current context, since btrfs_commit_transaction() needs
	3954	+ * to pause the currently running scrub (held by ourselves).
	3955	+ */
	3956	+ if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
	3957	+ need_commit = true;
	3958	+ spin_unlock(&sctx->stat_lock);
3913	3959	}
3914	3960
3915	3961	if (!ret)
..	..	@@ -3925,22 +3971,39 @@
3925	3971	if (progress)
3926	3972	memcpy(progress, &sctx->stat, sizeof(*progress));
3927	3973
	3974	+ if (!is_dev_replace)
	3975	+ btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
	3976	+ ret ? "not finished" : "finished", devid, ret);
	3977	+
3928	3978	mutex_lock(&fs_info->scrub_lock);
3929	3979	dev->scrub_ctx = NULL;
3930		- if (--fs_info->scrub_workers_refcnt == 0) {
3931		- scrub_workers = fs_info->scrub_workers;
3932		- scrub_wr_comp = fs_info->scrub_wr_completion_workers;
3933		- scrub_parity = fs_info->scrub_parity_workers;
3934		- }
3935	3980	mutex_unlock(&fs_info->scrub_lock);
3936	3981
3937		- btrfs_destroy_workqueue(scrub_workers);
3938		- btrfs_destroy_workqueue(scrub_wr_comp);
3939		- btrfs_destroy_workqueue(scrub_parity);
	3982	+ scrub_workers_put(fs_info);
3940	3983	scrub_put_ctx(sctx);
3941	3984
3942		- return ret;
	3985	+ /*
	3986	+ * We found some super block errors before, now try to force a
	3987	+ * transaction commit, as scrub has finished.
	3988	+ */
	3989	+ if (need_commit) {
	3990	+ struct btrfs_trans_handle *trans;
3943	3991
	3992	+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
	3993	+ if (IS_ERR(trans)) {
	3994	+ ret = PTR_ERR(trans);
	3995	+ btrfs_err(fs_info,
	3996	+ "scrub: failed to start transaction to fix super block errors: %d", ret);
	3997	+ return ret;
	3998	+ }
	3999	+ ret = btrfs_commit_transaction(trans);
	4000	+ if (ret < 0)
	4001	+ btrfs_err(fs_info,
	4002	+ "scrub: failed to commit transaction to fix super block errors: %d", ret);
	4003	+ }
	4004	+ return ret;
	4005	+out:
	4006	+ scrub_workers_put(fs_info);
3944	4007	out_free_ctx:
3945	4008	scrub_free_ctx(sctx);
3946	4009
..	..	@@ -3989,9 +4052,9 @@
3989	4052	return 0;
3990	4053	}
3991	4054
3992		-int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
3993		- struct btrfs_device *dev)
	4055	+int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
3994	4056	{
	4057	+ struct btrfs_fs_info *fs_info = dev->fs_info;
3995	4058	struct scrub_ctx *sctx;
3996	4059
3997	4060	mutex_lock(&fs_info->scrub_lock);