2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
--- a/kernel/drivers/md/raid10.c
+++ b/kernel/drivers/md/raid10.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * raid10.c : Multiple Devices driver for Linux
  *
@@ -6,16 +7,6 @@
  * RAID-10 support for md.
  *
  * Base on code in raid1.c. See raid1.c for further copyright information.
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */

 #include <linux/slab.h>
@@ -25,6 +16,7 @@
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
+#include <linux/raid/md_p.h>
 #include <trace/events/block.h>
 #include "md.h"
 #include "raid10.h"
@@ -72,31 +64,6 @@
  * [B A] [D C] [B A] [E C D]
  */

-/*
- * Number of guaranteed r10bios in case of extreme VM load:
- */
-#define NR_RAID10_BIOS 256
-
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio *)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting devs[n].bio to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
-/* When there are this many requests queued to be written by
- * the raid10 thread, we become 'congested' to provide back-pressure
- * for writeback.
- */
-static int max_queued_requests = 1024;
-
 static void allow_barrier(struct r10conf *conf);
 static void lower_barrier(struct r10conf *conf);
 static int _enough(struct r10conf *conf, int previous, int ignore);
@@ -129,11 +96,6 @@
 	/* allocate a r10bio with room for raid_disks entries in the
 	 * bios array */
 	return kzalloc(size, gfp_flags);
-}
-
-static void r10bio_pool_free(void *r10_bio, void *data)
-{
-	kfree(r10_bio);
 }

 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
@@ -241,7 +203,7 @@
 	}
 	kfree(rps);
 out_free_r10bio:
-	r10bio_pool_free(r10_bio, conf);
+	rbio_pool_free(r10_bio, conf);
 	return NULL;
 }

@@ -269,7 +231,7 @@
 	/* resync pages array stored in the 1st bio's .bi_private */
 	kfree(rp);

-	r10bio_pool_free(r10bio, conf);
+	rbio_pool_free(r10bio, conf);
 }

 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
@@ -503,19 +465,21 @@
 			if (test_bit(FailFast, &rdev->flags) &&
 			    (bio->bi_opf & MD_FAILFAST)) {
 				md_error(rdev->mddev, rdev);
-				if (!test_bit(Faulty, &rdev->flags))
-					/* This is the only remaining device,
-					 * We need to retry the write without
-					 * FailFast
-					 */
-					set_bit(R10BIO_WriteError, &r10_bio->state);
-				else {
-					r10_bio->devs[slot].bio = NULL;
-					to_put = bio;
-					dec_rdev = 1;
-				}
-			} else
+			}
+
+			/*
+			 * When the device is faulty, it is not necessary to
+			 * handle write error.
+			 */
+			if (!test_bit(Faulty, &rdev->flags))
 				set_bit(R10BIO_WriteError, &r10_bio->state);
+			else {
+				/* Fail the request */
+				set_bit(R10BIO_Degraded, &r10_bio->state);
+				r10_bio->devs[slot].bio = NULL;
+				to_put = bio;
+				dec_rdev = 1;
+			}
 		}
 	} else {
 		/*
@@ -745,15 +709,19 @@
 	int sectors = r10_bio->sectors;
 	int best_good_sectors;
 	sector_t new_distance, best_dist;
-	struct md_rdev *best_rdev, *rdev = NULL;
+	struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
 	int do_balance;
-	int best_slot;
+	int best_dist_slot, best_pending_slot;
+	bool has_nonrot_disk = false;
+	unsigned int min_pending;
 	struct geom *geo = &conf->geo;

 	raid10_find_phys(conf, r10_bio);
 	rcu_read_lock();
-	best_slot = -1;
-	best_rdev = NULL;
+	best_dist_slot = -1;
+	min_pending = UINT_MAX;
+	best_dist_rdev = NULL;
+	best_pending_rdev = NULL;
 	best_dist = MaxSector;
 	best_good_sectors = 0;
 	do_balance = 1;
@@ -775,14 +743,24 @@
 		sector_t first_bad;
 		int bad_sectors;
 		sector_t dev_sector;
+		unsigned int pending;
+		bool nonrot;

 		if (r10_bio->devs[slot].bio == IO_BLOCKED)
 			continue;
 		disk = r10_bio->devs[slot].devnum;
 		rdev = rcu_dereference(conf->mirrors[disk].replacement);
 		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
-		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
+		    r10_bio->devs[slot].addr + sectors >
+		    rdev->recovery_offset) {
+			/*
+			 * Read replacement first to prevent reading both rdev
+			 * and replacement as NULL during replacement replace
+			 * rdev.
+			 */
+			smp_mb();
 			rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		}
 		if (rdev == NULL ||
 		    test_bit(Faulty, &rdev->flags))
 			continue;
@@ -811,8 +789,8 @@
 					first_bad - dev_sector;
 				if (good_sectors > best_good_sectors) {
 					best_good_sectors = good_sectors;
-					best_slot = slot;
-					best_rdev = rdev;
+					best_dist_slot = slot;
+					best_dist_rdev = rdev;
 				}
 				if (!do_balance)
 					/* Must read from here */
@@ -825,14 +803,23 @@
 		if (!do_balance)
 			break;

-		if (best_slot >= 0)
+		nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+		has_nonrot_disk |= nonrot;
+		pending = atomic_read(&rdev->nr_pending);
+		if (min_pending > pending && nonrot) {
+			min_pending = pending;
+			best_pending_slot = slot;
+			best_pending_rdev = rdev;
+		}
+
+		if (best_dist_slot >= 0)
 			/* At least 2 disks to choose from so failfast is OK */
 			set_bit(R10BIO_FailFast, &r10_bio->state);
 		/* This optimisation is debatable, and completely destroys
 		 * sequential read speed for 'far copies' arrays. So only
 		 * keep it for 'near' arrays, and review those later.
 		 */
-		if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
+		if (geo->near_copies > 1 && !pending)
 			new_distance = 0;

 		/* for far > 1 always use the lowest address */
@@ -841,15 +828,21 @@
 		else
 			new_distance = abs(r10_bio->devs[slot].addr -
 					   conf->mirrors[disk].head_position);
+
 		if (new_distance < best_dist) {
 			best_dist = new_distance;
-			best_slot = slot;
-			best_rdev = rdev;
+			best_dist_slot = slot;
+			best_dist_rdev = rdev;
 		}
 	}
 	if (slot >= conf->copies) {
-		slot = best_slot;
-		rdev = best_rdev;
+		if (has_nonrot_disk) {
+			slot = best_pending_slot;
+			rdev = best_pending_rdev;
+		} else {
+			slot = best_dist_slot;
+			rdev = best_dist_rdev;
+		}
 	}

 	if (slot >= 0) {
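The read_balance() hunks above switch the selection policy: when at least one mirror sits on a non-rotational device, the copy with the fewest in-flight requests (nr_pending) wins; otherwise the classic shortest-head-distance copy is kept. A minimal sketch of that final decision, reusing the patch's variable names but wrapped in a hypothetical helper (kernel-style C, illustrative only, not code from this patch):

	static int choose_read_slot(bool has_nonrot_disk,
				    int best_pending_slot, int best_dist_slot)
	{
		/* An SSD is present: seek distance is meaningless, take the least-loaded copy. */
		if (has_nonrot_disk)
			return best_pending_slot;
		/* All rotational: take the copy with the shortest head movement. */
		return best_dist_slot;
	}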
@@ -861,31 +854,6 @@
 	*max_sectors = best_good_sectors;

 	return rdev;
-}
-
-static int raid10_congested(struct mddev *mddev, int bits)
-{
-	struct r10conf *conf = mddev->private;
-	int i, ret = 0;
-
-	if ((bits & (1 << WB_async_congested)) &&
-	    conf->pending_count >= max_queued_requests)
-		return 1;
-
-	rcu_read_lock();
-	for (i = 0;
-	     (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
-		     && ret == 0;
-	     i++) {
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags)) {
-			struct request_queue *q = bdev_get_queue(rdev->bdev);
-
-			ret |= bdi_congested(q->backing_dev_info, bits);
-		}
-	}
-	rcu_read_unlock();
-	return ret;
 }

 static void flush_pending_writes(struct r10conf *conf)
@@ -932,8 +900,9 @@
 				/* Just ignore it */
 				bio_endio(bio);
 			else
-				generic_make_request(bio);
+				submit_bio_noacct(bio);
 			bio = next;
+			cond_resched();
 		}
 		blk_finish_plug(&plug);
 	} else
@@ -995,6 +964,7 @@
 {
 	spin_lock_irq(&conf->resync_lock);
 	if (conf->barrier) {
+		struct bio_list *bio_list = current->bio_list;
 		conf->nr_waiting++;
 		/* Wait for the barrier to drop.
 		 * However if there are already pending
@@ -1009,9 +979,16 @@
 		wait_event_lock_irq(conf->wait_barrier,
 				    !conf->barrier ||
 				    (atomic_read(&conf->nr_pending) &&
-				     current->bio_list &&
-				     (!bio_list_empty(&current->bio_list[0]) ||
-				      !bio_list_empty(&current->bio_list[1]))),
+				     bio_list &&
+				     (!bio_list_empty(&bio_list[0]) ||
+				      !bio_list_empty(&bio_list[1]))) ||
+				     /* move on if recovery thread is
+				      * blocked by us
+				      */
+				     (conf->mddev->thread->tsk == current &&
+				      test_bit(MD_RECOVERY_RUNNING,
+					       &conf->mddev->recovery) &&
+				      conf->nr_queued > 0),
 				    conf->resync_lock);
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
@@ -1117,10 +1094,34 @@
 			/* Just ignore it */
 			bio_endio(bio);
 		else
-			generic_make_request(bio);
+			submit_bio_noacct(bio);
 		bio = next;
+		cond_resched();
 	}
 	kfree(plug);
+}
+
+/*
+ * 1. Register the new request and wait if the reconstruction thread has put
+ * up a bar for new requests. Continue immediately if no resync is active
+ * currently.
+ * 2. If IO spans the reshape position. Need to wait for reshape to pass.
+ */
+static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
+				 struct bio *bio, sector_t sectors)
+{
+	wait_barrier(conf);
+	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+	    bio->bi_iter.bi_sector < conf->reshape_progress &&
+	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
+		raid10_log(conf->mddev, "wait reshape");
+		allow_barrier(conf);
+		wait_event(conf->wait_barrier,
+			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
+			   conf->reshape_progress >= bio->bi_iter.bi_sector +
+			   sectors);
+		wait_barrier(conf);
+	}
 }

 static void raid10_read_request(struct mddev *mddev, struct bio *bio,
@@ -1131,7 +1132,6 @@
 	const int op = bio_op(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
 	int max_sectors;
-	sector_t sectors;
 	struct md_rdev *rdev;
 	char b[BDEVNAME_SIZE];
 	int slot = r10_bio->read_slot;
@@ -1165,30 +1165,8 @@
 		}
 		rcu_read_unlock();
 	}
-	/*
-	 * Register the new request and wait if the reconstruction
-	 * thread has put up a bar for new requests.
-	 * Continue immediately if no resync is active currently.
-	 */
-	wait_barrier(conf);

-	sectors = r10_bio->sectors;
-	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-	    bio->bi_iter.bi_sector < conf->reshape_progress &&
-	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
-		/*
-		 * IO spans the reshape position. Need to wait for reshape to
-		 * pass
-		 */
-		raid10_log(conf->mddev, "wait reshape");
-		allow_barrier(conf);
-		wait_event(conf->wait_barrier,
-			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
-			   conf->reshape_progress >= bio->bi_iter.bi_sector +
-			   sectors);
-		wait_barrier(conf);
-	}
-
+	regular_request_wait(mddev, conf, bio, r10_bio->sectors);
 	rdev = read_balance(conf, r10_bio, &max_sectors);
 	if (!rdev) {
 		if (err_rdev) {
@@ -1209,7 +1187,7 @@
 					      gfp, &conf->bio_split);
 		bio_chain(split, bio);
 		allow_barrier(conf);
-		generic_make_request(bio);
+		submit_bio_noacct(bio);
 		wait_barrier(conf);
 		bio = split;
 		r10_bio->master_bio = bio;
@@ -1236,7 +1214,7 @@
 		trace_block_bio_remap(read_bio->bi_disk->queue,
 				      read_bio, disk_devt(mddev->gendisk),
 				      r10_bio->sector);
-	generic_make_request(read_bio);
+	submit_bio_noacct(read_bio);
 	return;
 }

@@ -1333,30 +1311,8 @@
 		finish_wait(&conf->wait_barrier, &w);
 	}

-	/*
-	 * Register the new request and wait if the reconstruction
-	 * thread has put up a bar for new requests.
-	 * Continue immediately if no resync is active currently.
-	 */
-	wait_barrier(conf);
-
 	sectors = r10_bio->sectors;
-	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-	    bio->bi_iter.bi_sector < conf->reshape_progress &&
-	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
-		/*
-		 * IO spans the reshape position. Need to wait for reshape to
-		 * pass
-		 */
-		raid10_log(conf->mddev, "wait reshape");
-		allow_barrier(conf);
-		wait_event(conf->wait_barrier,
-			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
-			   conf->reshape_progress >= bio->bi_iter.bi_sector +
-			   sectors);
-		wait_barrier(conf);
-	}
-
+	regular_request_wait(mddev, conf, bio, sectors);
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    (mddev->reshape_backwards
 	     ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
@@ -1400,9 +1356,15 @@

 	for (i = 0; i < conf->copies; i++) {
 		int d = r10_bio->devs[i].devnum;
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
-		struct md_rdev *rrdev = rcu_dereference(
-			conf->mirrors[d].replacement);
+		struct md_rdev *rdev, *rrdev;
+
+		rrdev = rcu_dereference(conf->mirrors[d].replacement);
+		/*
+		 * Read replacement first to prevent reading both rdev and
+		 * replacement as NULL during replacement replace rdev.
+		 */
+		smp_mb();
+		rdev = rcu_dereference(conf->mirrors[d].rdev);
 		if (rdev == rrdev)
 			rrdev = NULL;
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
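The smp_mb() added here, and the matching one in read_balance() above, order the two RCU reads so that replacement is always read before rdev. This pairs with the update side in the md hot-replace path, where the replacement is promoted to rdev and only then is the replacement pointer cleared, with a barrier in between. The shape of the pairing, as a sketch rather than the exact surrounding code:

	/* updater (remove/replace path): */
	p->rdev = p->replacement;
	smp_mb();			/* publish the new rdev first */
	p->replacement = NULL;

	/* reader (this patch): replacement first, then rdev */
	rrdev = rcu_dereference(p->replacement);
	smp_mb();			/* pairs with the updater's barrier */
	rdev = rcu_dereference(p->rdev);

A racing reader may observe both pointers naming the same device, but can never observe both as NULL.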
@@ -1516,7 +1478,7 @@
 					      GFP_NOIO, &conf->bio_split);
 		bio_chain(split, bio);
 		allow_barrier(conf);
-		generic_make_request(bio);
+		submit_bio_noacct(bio);
 		wait_barrier(conf);
 		bio = split;
 		r10_bio->master_bio = bio;
@@ -1677,12 +1639,12 @@

 	/*
 	 * If it is not operational, then we have already marked it as dead
-	 * else if it is the last working disks, ignore the error, let the
-	 * next level up know.
+	 * else if it is the last working disks with "fail_last_dev == false",
+	 * ignore the error, let the next level up know.
 	 * else mark the drive as failed
 	 */
 	spin_lock_irqsave(&conf->device_lock, flags);
-	if (test_bit(In_sync, &rdev->flags)
+	if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
 	    && !enough(conf, rdev->raid_disk)) {
 		/*
 		 * Don't fail the drive, just return an IO error.
@@ -1863,9 +1825,12 @@
 	int err = 0;
 	int number = rdev->raid_disk;
 	struct md_rdev **rdevp;
-	struct raid10_info *p = conf->mirrors + number;
+	struct raid10_info *p;

 	print_conf(conf);
+	if (unlikely(number >= mddev->raid_disks))
+		return 0;
+	p = conf->mirrors + number;
 	if (rdev == p->rdev)
 		rdevp = &p->rdev;
 	else if (rdev == p->replacement)
@@ -2137,7 +2102,7 @@
 			tbio->bi_opf |= MD_FAILFAST;
 		tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
 		bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
-		generic_make_request(tbio);
+		submit_bio_noacct(tbio);
 	}

 	/* Now write out to any replacement devices
@@ -2156,7 +2121,7 @@
 		atomic_inc(&r10_bio->remaining);
 		md_sync_acct(conf->mirrors[d].replacement->bdev,
 			     bio_sectors(tbio));
-		generic_make_request(tbio);
+		submit_bio_noacct(tbio);
 	}

 done:
@@ -2263,11 +2228,22 @@
 {
 	struct r10conf *conf = mddev->private;
 	int d;
-	struct bio *wbio, *wbio2;
+	struct bio *wbio = r10_bio->devs[1].bio;
+	struct bio *wbio2 = r10_bio->devs[1].repl_bio;
+
+	/* Need to test wbio2->bi_end_io before we call
+	 * submit_bio_noacct as if the former is NULL,
+	 * the latter is free to free wbio2.
+	 */
+	if (wbio2 && !wbio2->bi_end_io)
+		wbio2 = NULL;

 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
 		fix_recovery_read_error(r10_bio);
-		end_sync_request(r10_bio);
+		if (wbio->bi_end_io)
+			end_sync_request(r10_bio);
+		if (wbio2)
+			end_sync_request(r10_bio);
 		return;
 	}

@@ -2276,24 +2252,16 @@
 	 * and submit the write request
 	 */
 	d = r10_bio->devs[1].devnum;
-	wbio = r10_bio->devs[1].bio;
-	wbio2 = r10_bio->devs[1].repl_bio;
-	/* Need to test wbio2->bi_end_io before we call
-	 * generic_make_request as if the former is NULL,
-	 * the latter is free to free wbio2.
-	 */
-	if (wbio2 && !wbio2->bi_end_io)
-		wbio2 = NULL;
 	if (wbio->bi_end_io) {
 		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
-		generic_make_request(wbio);
+		submit_bio_noacct(wbio);
 	}
 	if (wbio2) {
 		atomic_inc(&conf->mirrors[d].replacement->nr_pending);
 		md_sync_acct(conf->mirrors[d].replacement->bdev,
 			     bio_sectors(wbio2));
-		generic_make_request(wbio2);
+		submit_bio_noacct(wbio2);
 	}
 }

@@ -2927,7 +2895,7 @@
  * a number of r10_bio structures, one for each out-of-sync device.
  * As we setup these structures, we collect all bio's together into a list
  * which we then process collectively to add pages, and then process again
- * to pass to generic_make_request.
+ * to pass to submit_bio_noacct.
  *
  * The r10_bio structures are linked using a borrowed master_bio pointer.
  * This link is counted in ->remaining. When the r10_bio that points to NULL
@@ -2951,10 +2919,6 @@
 	sector_t chunk_mask = conf->geo.chunk_mask;
 	int page_idx = 0;

-	if (!mempool_initialized(&conf->r10buf_pool))
-		if (init_resync(conf))
-			return 0;
-
 	/*
 	 * Allow skipping a full rebuild for incremental assembly
 	 * of a clean array, like RAID1 does.
@@ -2969,6 +2933,10 @@
 		*skipped = 1;
 		return mddev->dev_sectors - sector_nr;
 	}
+
+	if (!mempool_initialized(&conf->r10buf_pool))
+		if (init_resync(conf))
+			return 0;

 skipped:
 	max_sector = mddev->dev_sectors;
@@ -3084,6 +3052,7 @@
 			sector_t sect;
 			int must_sync;
 			int any_working;
+			int need_recover = 0;
 			struct raid10_info *mirror = &conf->mirrors[i];
 			struct md_rdev *mrdev, *mreplace;

@@ -3091,11 +3060,14 @@
 			mrdev = rcu_dereference(mirror->rdev);
 			mreplace = rcu_dereference(mirror->replacement);

-			if ((mrdev == NULL ||
-			     test_bit(Faulty, &mrdev->flags) ||
-			     test_bit(In_sync, &mrdev->flags)) &&
-			    (mreplace == NULL ||
-			     test_bit(Faulty, &mreplace->flags))) {
+			if (mrdev != NULL &&
+			    !test_bit(Faulty, &mrdev->flags) &&
+			    !test_bit(In_sync, &mrdev->flags))
+				need_recover = 1;
+			if (mreplace && test_bit(Faulty, &mreplace->flags))
+				mreplace = NULL;
+
+			if (!need_recover && !mreplace) {
 				rcu_read_unlock();
 				continue;
 			}
@@ -3111,8 +3083,6 @@
 				rcu_read_unlock();
 				continue;
 			}
-			if (mreplace && test_bit(Faulty, &mreplace->flags))
-				mreplace = NULL;
 			/* Unless we are doing a full sync, or a replacement
 			 * we only need to recover the block if it is set in
 			 * the bitmap
@@ -3218,7 +3188,7 @@
 				r10_bio->devs[1].devnum = i;
 				r10_bio->devs[1].addr = to_addr;

-				if (!test_bit(In_sync, &mrdev->flags)) {
+				if (need_recover) {
 					bio = r10_bio->devs[1].bio;
 					bio->bi_next = biolist;
 					biolist = bio;
@@ -3235,16 +3205,11 @@
 				bio = r10_bio->devs[1].repl_bio;
 				if (bio)
 					bio->bi_end_io = NULL;
-				/* Note: if mreplace != NULL, then bio
+				/* Note: if replace is not NULL, then bio
 				 * cannot be NULL as r10buf_pool_alloc will
 				 * have allocated it.
-				 * So the second test here is pointless.
-				 * But it keeps semantic-checkers happy, and
-				 * this comment keeps human reviewers
-				 * happy.
 				 */
-				if (mreplace == NULL || bio == NULL ||
-				    test_bit(Faulty, &mreplace->flags))
+				if (!mreplace)
 					break;
 				bio->bi_next = biolist;
 				biolist = bio;
@@ -3533,7 +3498,7 @@
 		if (bio->bi_end_io == end_sync_read) {
 			md_sync_acct_bio(bio, nr_sectors);
 			bio->bi_status = 0;
-			generic_make_request(bio);
+			submit_bio_noacct(bio);
 		}
 	}

@@ -3665,6 +3630,20 @@
 	return nc*fc;
 }

+static void raid10_free_conf(struct r10conf *conf)
+{
+	if (!conf)
+		return;
+
+	mempool_exit(&conf->r10bio_pool);
+	kfree(conf->mirrors);
+	kfree(conf->mirrors_old);
+	kfree(conf->mirrors_new);
+	safe_put_page(conf->tmppage);
+	bioset_exit(&conf->bio_split);
+	kfree(conf);
+}
+
 static struct r10conf *setup_conf(struct mddev *mddev)
 {
 	struct r10conf *conf = NULL;
@@ -3704,8 +3683,8 @@

 	conf->geo = geo;
 	conf->copies = copies;
-	err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc,
-			   r10bio_pool_free, conf);
+	err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
+			   rbio_pool_free, conf);
 	if (err)
 		goto out;

@@ -3747,20 +3726,24 @@
 	return conf;

 out:
-	if (conf) {
-		mempool_exit(&conf->r10bio_pool);
-		kfree(conf->mirrors);
-		safe_put_page(conf->tmppage);
-		bioset_exit(&conf->bio_split);
-		kfree(conf);
-	}
+	raid10_free_conf(conf);
 	return ERR_PTR(err);
+}
+
+static void raid10_set_io_opt(struct r10conf *conf)
+{
+	int raid_disks = conf->geo.raid_disks;
+
+	if (!(conf->geo.raid_disks % conf->geo.near_copies))
+		raid_disks /= conf->geo.near_copies;
+	blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
+			 raid_disks);
 }

 static int raid10_run(struct mddev *mddev)
 {
 	struct r10conf *conf;
-	int i, disk_idx, chunk_size;
+	int i, disk_idx;
 	struct raid10_info *disk;
 	struct md_rdev *rdev;
 	sector_t size;
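raid10_set_io_opt() centralises the io_opt calculation that raid10_run() and end_reshape() previously open-coded: optimal I/O size = chunk size * number of data-bearing stripes. A worked example under assumed geometry (not taken from the patch): with 4 member disks, near_copies = 2 and 512 KiB chunks, 4 % 2 == 0, so raid_disks becomes 4 / 2 = 2 and io_opt = 512 KiB * 2 = 1 MiB; with 3 disks and near_copies = 2 the division is skipped and io_opt = 512 KiB * 3 = 1.5 MiB, matching the behaviour of the removed open-coded branches.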
@@ -3781,6 +3764,9 @@
 	if (!conf)
 		goto out;

+	mddev->thread = conf->thread;
+	conf->thread = NULL;
+
 	if (mddev_is_clustered(conf->mddev)) {
 		int fc, fo;

@@ -3793,21 +3779,13 @@
 		}
 	}

-	mddev->thread = conf->thread;
-	conf->thread = NULL;
-
-	chunk_size = mddev->chunk_sectors << 9;
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
 					      mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
-		blk_queue_io_min(mddev->queue, chunk_size);
-		if (conf->geo.raid_disks % conf->geo.near_copies)
-			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
-		else
-			blk_queue_io_opt(mddev->queue, chunk_size *
-					 (conf->geo.raid_disks / conf->geo.near_copies));
+		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+		raid10_set_io_opt(conf);
 	}

 	rdev_for_each(rdev, mddev) {
@@ -3922,19 +3900,6 @@
 	mddev->resync_max_sectors = size;
 	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);

-	if (mddev->queue) {
-		int stripe = conf->geo.raid_disks *
-			((mddev->chunk_sectors << 9) / PAGE_SIZE);
-
-		/* Calculate max read-ahead size.
-		 * We need to readahead at least twice a whole stripe....
-		 * maybe...
-		 */
-		stripe /= conf->geo.near_copies;
-		if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
-			mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
-	}
-
 	if (md_integrity_register(mddev))
 		goto out_free_conf;

@@ -3967,10 +3932,7 @@

 out_free_conf:
 	md_unregister_thread(&mddev->thread);
-	mempool_exit(&conf->r10bio_pool);
-	safe_put_page(conf->tmppage);
-	kfree(conf->mirrors);
-	kfree(conf);
+	raid10_free_conf(conf);
 	mddev->private = NULL;
 out:
 	return -EIO;
@@ -3978,15 +3940,7 @@

 static void raid10_free(struct mddev *mddev, void *priv)
 {
-	struct r10conf *conf = priv;
-
-	mempool_exit(&conf->r10bio_pool);
-	safe_put_page(conf->tmppage);
-	kfree(conf->mirrors);
-	kfree(conf->mirrors_old);
-	kfree(conf->mirrors_new);
-	bioset_exit(&conf->bio_split);
-	kfree(conf);
+	raid10_free_conf(priv);
 }

 static void raid10_quiesce(struct mddev *mddev, int quiesce)
@@ -4293,12 +4247,46 @@
 	spin_unlock_irq(&conf->device_lock);

 	if (mddev->delta_disks && mddev->bitmap) {
-		ret = md_bitmap_resize(mddev->bitmap,
-				       raid10_size(mddev, 0, conf->geo.raid_disks),
-				       0, 0);
+		struct mdp_superblock_1 *sb = NULL;
+		sector_t oldsize, newsize;
+
+		oldsize = raid10_size(mddev, 0, 0);
+		newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
+
+		if (!mddev_is_clustered(mddev)) {
+			ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
+			if (ret)
+				goto abort;
+			else
+				goto out;
+		}
+
+		rdev_for_each(rdev, mddev) {
+			if (rdev->raid_disk > -1 &&
+			    !test_bit(Faulty, &rdev->flags))
+				sb = page_address(rdev->sb_page);
+		}
+
+		/*
+		 * some node is already performing reshape, and no need to
+		 * call md_bitmap_resize again since it should be called when
+		 * receiving BITMAP_RESIZE msg
+		 */
+		if ((sb && (le32_to_cpu(sb->feature_map) &
+			    MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
+			goto out;
+
+		ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
 		if (ret)
 			goto abort;
+
+		ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
+		if (ret) {
+			md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
+			goto abort;
+		}
 	}
+out:
 	if (mddev->delta_disks > 0) {
 		rdev_for_each(rdev, mddev)
 			if (rdev->raid_disk < 0 &&
@@ -4310,8 +4298,8 @@
 				else
 					rdev->recovery_offset = 0;

-				if (sysfs_link_rdev(mddev, rdev))
-					/* Failure here is OK */;
+				/* Failure here is OK */
+				sysfs_link_rdev(mddev, rdev);
 			}
 		} else if (rdev->raid_disk >= conf->prev.raid_disks
 			   && !test_bit(Faulty, &rdev->flags)) {
@@ -4457,7 +4445,7 @@
 	sector_nr = conf->reshape_progress;
 	if (sector_nr) {
 		mddev->curr_resync_completed = sector_nr;
-		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+		sysfs_notify_dirent_safe(mddev->sysfs_completed);
 		*skipped = 1;
 		return sector_nr;
 	}
@@ -4486,8 +4474,8 @@
 		last = conf->reshape_progress - 1;
 		sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
 					       & conf->prev.chunk_mask);
-		if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
-			sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
+		if (sector_nr + RESYNC_SECTORS < last)
+			sector_nr = last + 1 - RESYNC_SECTORS;
 	} else {
 		/* 'next' is after the last device address that we
 		 * might write to for this chunk in the new layout
@@ -4509,8 +4497,8 @@
 		last = sector_nr | (conf->geo.chunk_mask
 				    & conf->prev.chunk_mask);

-		if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
-			last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
+		if (sector_nr + RESYNC_SECTORS <= last)
+			last = sector_nr + RESYNC_SECTORS - 1;
 	}

 	if (need_flush ||
@@ -4575,6 +4563,32 @@
 	r10_bio->master_bio = read_bio;
 	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;

+	/*
+	 * Broadcast RESYNC message to other nodes, so all nodes would not
+	 * write to the region to avoid conflict.
+	 */
+	if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
+		struct mdp_superblock_1 *sb = NULL;
+		int sb_reshape_pos = 0;
+
+		conf->cluster_sync_low = sector_nr;
+		conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
+		sb = page_address(rdev->sb_page);
+		if (sb) {
+			sb_reshape_pos = le64_to_cpu(sb->reshape_position);
+			/*
+			 * Set cluster_sync_low again if next address for array
+			 * reshape is less than cluster_sync_low. Since we can't
+			 * update cluster_sync_low until it has finished reshape.
+			 */
+			if (sb_reshape_pos < conf->cluster_sync_low)
+				conf->cluster_sync_low = sb_reshape_pos;
+		}
+
+		md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
+						   conf->cluster_sync_high);
+	}
+
 	/* Now find the locations in the new layout */
 	__raid10_find_phys(&conf->geo, r10_bio);

@@ -4631,7 +4645,7 @@
 	md_sync_acct_bio(read_bio, r10_bio->sectors);
 	atomic_inc(&r10_bio->remaining);
 	read_bio->bi_next = NULL;
-	generic_make_request(read_bio);
+	submit_bio_noacct(read_bio);
 	sectors_done += nr_sectors;
 	if (sector_nr <= last)
 		goto read_more;
@@ -4694,7 +4708,7 @@
 		md_sync_acct_bio(b, r10_bio->sectors);
 		atomic_inc(&r10_bio->remaining);
 		b->bi_next = NULL;
-		generic_make_request(b);
+		submit_bio_noacct(b);
 	}
 	end_reshape_request(r10_bio);
 }
@@ -4712,17 +4726,22 @@
 	conf->reshape_safe = MaxSector;
 	spin_unlock_irq(&conf->device_lock);

-	/* read-ahead size must cover two whole stripes, which is
-	 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
-	 */
-	if (conf->mddev->queue) {
-		int stripe = conf->geo.raid_disks *
-			((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
-		stripe /= conf->geo.near_copies;
-		if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
-			conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
-	}
+	if (conf->mddev->queue)
+		raid10_set_io_opt(conf);
 	conf->fullsync = 0;
+}
+
+static void raid10_update_reshape_pos(struct mddev *mddev)
+{
+	struct r10conf *conf = mddev->private;
+	sector_t lo, hi;
+
+	md_cluster_ops->resync_info_get(mddev, &lo, &hi);
+	if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
+	    || mddev->reshape_position == MaxSector)
+		conf->reshape_progress = mddev->reshape_position;
+	else
+		WARN_ON_ONCE(1);
 }

47284747 static int handle_reshape_read_error(struct mddev *mddev,
....@@ -4736,8 +4755,7 @@
47364755 int idx = 0;
47374756 struct page **pages;
47384757
4739
- r10b = kmalloc(sizeof(*r10b) +
4740
- sizeof(struct r10dev) * conf->copies, GFP_NOIO);
4758
+ r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
47414759 if (!r10b) {
47424760 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
47434761 return -ENOMEM;
@@ -4893,7 +4911,7 @@
 	.check_reshape	= raid10_check_reshape,
 	.start_reshape	= raid10_start_reshape,
 	.finish_reshape	= raid10_finish_reshape,
-	.congested	= raid10_congested,
+	.update_reshape_pos = raid10_update_reshape_pos,
 };

 static int __init raid_init(void)