forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-11 072de836f53be56a70cecf70b43ae43b7ce17376
kernel/drivers/md/raid10.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * raid10.c : Multiple Devices driver for Linux
  *
@@ -6,16 +7,6 @@
  * RAID-10 support for md.
  *
  * Base on code in raid1.c. See raid1.c for further copyright information.
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */

 #include <linux/slab.h>
@@ -25,6 +16,7 @@
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
+#include <linux/raid/md_p.h>
 #include <trace/events/block.h>
 #include "md.h"
 #include "raid10.h"
@@ -72,31 +64,6 @@
  * [B A] [D C]    [B A] [E C D]
  */

-/*
- * Number of guaranteed r10bios in case of extreme VM load:
- */
-#define NR_RAID10_BIOS 256
-
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio *)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting devs[n].bio to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
-/* When there are this many requests queued to be written by
- * the raid10 thread, we become 'congested' to provide back-pressure
- * for writeback.
- */
-static int max_queued_requests = 1024;
-
 static void allow_barrier(struct r10conf *conf);
 static void lower_barrier(struct r10conf *conf);
 static int _enough(struct r10conf *conf, int previous, int ignore);
@@ -129,11 +96,6 @@
 	/* allocate a r10bio with room for raid_disks entries in the
 	 * bios array */
 	return kzalloc(size, gfp_flags);
-}
-
-static void r10bio_pool_free(void *r10_bio, void *data)
-{
-	kfree(r10_bio);
 }

 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
@@ -241,7 +203,7 @@
 	}
 	kfree(rps);
 out_free_r10bio:
-	r10bio_pool_free(r10_bio, conf);
+	rbio_pool_free(r10_bio, conf);
 	return NULL;
 }

@@ -269,7 +231,7 @@
 	/* resync pages array stored in the 1st bio's .bi_private */
 	kfree(rp);

-	r10bio_pool_free(r10bio, conf);
+	rbio_pool_free(r10bio, conf);
 }

 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
@@ -503,19 +465,21 @@
 			if (test_bit(FailFast, &rdev->flags) &&
 			    (bio->bi_opf & MD_FAILFAST)) {
 				md_error(rdev->mddev, rdev);
-				if (!test_bit(Faulty, &rdev->flags))
-					/* This is the only remaining device,
-					 * We need to retry the write without
-					 * FailFast
-					 */
-					set_bit(R10BIO_WriteError, &r10_bio->state);
-				else {
-					r10_bio->devs[slot].bio = NULL;
-					to_put = bio;
-					dec_rdev = 1;
-				}
-			} else
+			}
+
+			/*
+			 * When the device is faulty, it is not necessary to
+			 * handle write error.
+			 */
+			if (!test_bit(Faulty, &rdev->flags))
 				set_bit(R10BIO_WriteError, &r10_bio->state);
+			else {
+				/* Fail the request */
+				set_bit(R10BIO_Degraded, &r10_bio->state);
+				r10_bio->devs[slot].bio = NULL;
+				to_put = bio;
+				dec_rdev = 1;
+			}
 		}
 	} else {
 		/*
@@ -745,15 +709,19 @@
 	int sectors = r10_bio->sectors;
 	int best_good_sectors;
 	sector_t new_distance, best_dist;
-	struct md_rdev *best_rdev, *rdev = NULL;
+	struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
 	int do_balance;
-	int best_slot;
+	int best_dist_slot, best_pending_slot;
+	bool has_nonrot_disk = false;
+	unsigned int min_pending;
 	struct geom *geo = &conf->geo;

 	raid10_find_phys(conf, r10_bio);
 	rcu_read_lock();
-	best_slot = -1;
-	best_rdev = NULL;
+	best_dist_slot = -1;
+	min_pending = UINT_MAX;
+	best_dist_rdev = NULL;
+	best_pending_rdev = NULL;
 	best_dist = MaxSector;
 	best_good_sectors = 0;
 	do_balance = 1;
@@ -775,6 +743,8 @@
 		sector_t first_bad;
 		int bad_sectors;
 		sector_t dev_sector;
+		unsigned int pending;
+		bool nonrot;

 		if (r10_bio->devs[slot].bio == IO_BLOCKED)
 			continue;
@@ -811,8 +781,8 @@
 					first_bad - dev_sector;
 				if (good_sectors > best_good_sectors) {
 					best_good_sectors = good_sectors;
-					best_slot = slot;
-					best_rdev = rdev;
+					best_dist_slot = slot;
+					best_dist_rdev = rdev;
 				}
 				if (!do_balance)
 					/* Must read from here */
@@ -825,14 +795,23 @@
 		if (!do_balance)
 			break;

-		if (best_slot >= 0)
+		nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+		has_nonrot_disk |= nonrot;
+		pending = atomic_read(&rdev->nr_pending);
+		if (min_pending > pending && nonrot) {
+			min_pending = pending;
+			best_pending_slot = slot;
+			best_pending_rdev = rdev;
+		}
+
+		if (best_dist_slot >= 0)
 			/* At least 2 disks to choose from so failfast is OK */
 			set_bit(R10BIO_FailFast, &r10_bio->state);
 		/* This optimisation is debatable, and completely destroys
 		 * sequential read speed for 'far copies' arrays. So only
 		 * keep it for 'near' arrays, and review those later.
 		 */
-		if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
+		if (geo->near_copies > 1 && !pending)
 			new_distance = 0;

 		/* for far > 1 always use the lowest address */
@@ -841,15 +820,21 @@
 		else
 			new_distance = abs(r10_bio->devs[slot].addr -
 					   conf->mirrors[disk].head_position);
+
 		if (new_distance < best_dist) {
 			best_dist = new_distance;
-			best_slot = slot;
-			best_rdev = rdev;
+			best_dist_slot = slot;
+			best_dist_rdev = rdev;
 		}
 	}
 	if (slot >= conf->copies) {
-		slot = best_slot;
-		rdev = best_rdev;
+		if (has_nonrot_disk) {
+			slot = best_pending_slot;
+			rdev = best_pending_rdev;
+		} else {
+			slot = best_dist_slot;
+			rdev = best_dist_rdev;
+		}
 	}

 	if (slot >= 0) {
@@ -861,31 +846,6 @@
 	*max_sectors = best_good_sectors;

 	return rdev;
-}
-
-static int raid10_congested(struct mddev *mddev, int bits)
-{
-	struct r10conf *conf = mddev->private;
-	int i, ret = 0;
-
-	if ((bits & (1 << WB_async_congested)) &&
-	    conf->pending_count >= max_queued_requests)
-		return 1;
-
-	rcu_read_lock();
-	for (i = 0;
-	     (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
-	     && ret == 0;
-	     i++) {
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags)) {
-			struct request_queue *q = bdev_get_queue(rdev->bdev);
-
-			ret |= bdi_congested(q->backing_dev_info, bits);
-		}
-	}
-	rcu_read_unlock();
-	return ret;
 }

 static void flush_pending_writes(struct r10conf *conf)
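The read_balance() hunks above replace the single best_slot/best_rdev pair with two candidates: the copy with the shortest head distance and, among non-rotational devices, the copy with the fewest in-flight requests; if any copy sits on a non-rotational disk, the pending-based pick wins. A minimal standalone sketch of just that selection policy (plain C with a hypothetical slot_info type and pick_read_slot() helper, not the kernel code, and ignoring the bad-block and do_balance handling):

#include <limits.h>
#include <stdbool.h>

struct slot_info {
	bool nonrot;             /* copy lives on a non-rotational device */
	unsigned int pending;    /* in-flight requests on that device */
	unsigned long distance;  /* |target sector - current head position| */
};

/* Pick the copy to read from; returns -1 only if nslots == 0. */
int pick_read_slot(const struct slot_info *s, int nslots)
{
	int best_dist_slot = -1, best_pending_slot = -1;
	unsigned long best_dist = ULONG_MAX;
	unsigned int min_pending = UINT_MAX;
	bool has_nonrot_disk = false;

	for (int i = 0; i < nslots; i++) {
		has_nonrot_disk |= s[i].nonrot;
		if (s[i].nonrot && s[i].pending < min_pending) {
			min_pending = s[i].pending;
			best_pending_slot = i;
		}
		if (s[i].distance < best_dist) {
			best_dist = s[i].distance;
			best_dist_slot = i;
		}
	}
	/* SSDs present: balance by queue depth; otherwise minimise seeks. */
	return has_nonrot_disk ? best_pending_slot : best_dist_slot;
}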
@@ -932,7 +892,7 @@
 				/* Just ignore it */
 				bio_endio(bio);
 			else
-				generic_make_request(bio);
+				submit_bio_noacct(bio);
 			bio = next;
 		}
 		blk_finish_plug(&plug);
@@ -995,6 +955,7 @@
 {
 	spin_lock_irq(&conf->resync_lock);
 	if (conf->barrier) {
+		struct bio_list *bio_list = current->bio_list;
 		conf->nr_waiting++;
 		/* Wait for the barrier to drop.
 		 * However if there are already pending
@@ -1009,9 +970,16 @@
 		wait_event_lock_irq(conf->wait_barrier,
 				    !conf->barrier ||
 				    (atomic_read(&conf->nr_pending) &&
-				     current->bio_list &&
-				     (!bio_list_empty(&current->bio_list[0]) ||
-				      !bio_list_empty(&current->bio_list[1]))),
+				     bio_list &&
+				     (!bio_list_empty(&bio_list[0]) ||
+				      !bio_list_empty(&bio_list[1]))) ||
+				    /* move on if recovery thread is
+				     * blocked by us
+				     */
+				    (conf->mddev->thread->tsk == current &&
+				     test_bit(MD_RECOVERY_RUNNING,
+					      &conf->mddev->recovery) &&
+				     conf->nr_queued > 0),
 				    conf->resync_lock);
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
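The extra terms added to the wait_event_lock_irq() condition above guard against a self-deadlock: if raid10d itself calls wait_barrier() while failed bios it must retire are counted in conf->nr_queued, the barrier might never drop. As a rough standalone model of when a wait_barrier() caller may proceed instead of sleeping (hypothetical barrier_state type and may_proceed() helper, not the kernel code):

#include <stdbool.h>

struct barrier_state {
	bool barrier_raised;          /* resync/reshape holds the barrier */
	int nr_pending;               /* regular I/O already admitted */
	int nr_queued;                /* failed bios waiting for raid10d */
	bool caller_has_plugged_bios; /* current->bio_list is non-empty */
	bool caller_is_md_thread;     /* caller is the array's own raid10d */
	bool recovery_running;        /* MD_RECOVERY_RUNNING is set */
};

bool may_proceed(const struct barrier_state *st)
{
	if (!st->barrier_raised)
		return true;
	/* a submitter still holding plugged bios must not block forever */
	if (st->nr_pending && st->caller_has_plugged_bios)
		return true;
	/* raid10d must stay runnable so it can drain nr_queued */
	if (st->caller_is_md_thread && st->recovery_running && st->nr_queued > 0)
		return true;
	return false;
}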
@@ -1117,10 +1085,33 @@
 			/* Just ignore it */
 			bio_endio(bio);
 		else
-			generic_make_request(bio);
+			submit_bio_noacct(bio);
 		bio = next;
 	}
 	kfree(plug);
+}
+
+/*
+ * 1. Register the new request and wait if the reconstruction thread has put
+ * up a bar for new requests. Continue immediately if no resync is active
+ * currently.
+ * 2. If IO spans the reshape position. Need to wait for reshape to pass.
+ */
+static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
+				 struct bio *bio, sector_t sectors)
+{
+	wait_barrier(conf);
+	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+	    bio->bi_iter.bi_sector < conf->reshape_progress &&
+	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
+		raid10_log(conf->mddev, "wait reshape");
+		allow_barrier(conf);
+		wait_event(conf->wait_barrier,
+			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
+			   conf->reshape_progress >= bio->bi_iter.bi_sector +
+			   sectors);
+		wait_barrier(conf);
+	}
 }

 static void raid10_read_request(struct mddev *mddev, struct bio *bio,
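regular_request_wait(), introduced above, folds the barrier registration and the open-coded reshape wait that used to be duplicated in the read and write paths into one helper. The reshape part is an interval test: a request only has to wait while it straddles conf->reshape_progress. A standalone sketch of that predicate (hypothetical spans_reshape() name, not the kernel helper):

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* true while [start, start + sectors) crosses the reshape frontier */
bool spans_reshape(sector_t start, sector_t sectors, sector_t reshape_progress)
{
	return start < reshape_progress && start + sectors > reshape_progress;
}

/*
 * With reshape_progress = 1000:
 *   spans_reshape(990, 20, 1000)  -> true  (990..1010 crosses 1000)
 *   spans_reshape(1000, 20, 1000) -> false (starts at the frontier)
 *   spans_reshape(980, 20, 1000)  -> false (ends at the frontier)
 */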
@@ -1131,7 +1122,6 @@
 	const int op = bio_op(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
 	int max_sectors;
-	sector_t sectors;
 	struct md_rdev *rdev;
 	char b[BDEVNAME_SIZE];
 	int slot = r10_bio->read_slot;
@@ -1165,30 +1155,8 @@
 		}
 		rcu_read_unlock();
 	}
-	/*
-	 * Register the new request and wait if the reconstruction
-	 * thread has put up a bar for new requests.
-	 * Continue immediately if no resync is active currently.
-	 */
-	wait_barrier(conf);

-	sectors = r10_bio->sectors;
-	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-	    bio->bi_iter.bi_sector < conf->reshape_progress &&
-	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
-		/*
-		 * IO spans the reshape position. Need to wait for reshape to
-		 * pass
-		 */
-		raid10_log(conf->mddev, "wait reshape");
-		allow_barrier(conf);
-		wait_event(conf->wait_barrier,
-			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
-			   conf->reshape_progress >= bio->bi_iter.bi_sector +
-			   sectors);
-		wait_barrier(conf);
-	}
-
+	regular_request_wait(mddev, conf, bio, r10_bio->sectors);
 	rdev = read_balance(conf, r10_bio, &max_sectors);
 	if (!rdev) {
 		if (err_rdev) {
@@ -1209,7 +1177,7 @@
 					      gfp, &conf->bio_split);
 		bio_chain(split, bio);
 		allow_barrier(conf);
-		generic_make_request(bio);
+		submit_bio_noacct(bio);
 		wait_barrier(conf);
 		bio = split;
 		r10_bio->master_bio = bio;
@@ -1236,7 +1204,7 @@
 		trace_block_bio_remap(read_bio->bi_disk->queue,
 				      read_bio, disk_devt(mddev->gendisk),
 				      r10_bio->sector);
-	generic_make_request(read_bio);
+	submit_bio_noacct(read_bio);
 	return;
 }

@@ -1333,30 +1301,8 @@
 		finish_wait(&conf->wait_barrier, &w);
 	}

-	/*
-	 * Register the new request and wait if the reconstruction
-	 * thread has put up a bar for new requests.
-	 * Continue immediately if no resync is active currently.
-	 */
-	wait_barrier(conf);
-
 	sectors = r10_bio->sectors;
-	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-	    bio->bi_iter.bi_sector < conf->reshape_progress &&
-	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
-		/*
-		 * IO spans the reshape position. Need to wait for reshape to
-		 * pass
-		 */
-		raid10_log(conf->mddev, "wait reshape");
-		allow_barrier(conf);
-		wait_event(conf->wait_barrier,
-			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
-			   conf->reshape_progress >= bio->bi_iter.bi_sector +
-			   sectors);
-		wait_barrier(conf);
-	}
-
+	regular_request_wait(mddev, conf, bio, sectors);
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    (mddev->reshape_backwards
 	     ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
@@ -1516,7 +1462,7 @@
 					      GFP_NOIO, &conf->bio_split);
 		bio_chain(split, bio);
 		allow_barrier(conf);
-		generic_make_request(bio);
+		submit_bio_noacct(bio);
 		wait_barrier(conf);
 		bio = split;
 		r10_bio->master_bio = bio;
@@ -1677,12 +1623,12 @@

 	/*
 	 * If it is not operational, then we have already marked it as dead
-	 * else if it is the last working disks, ignore the error, let the
-	 * next level up know.
+	 * else if it is the last working disks with "fail_last_dev == false",
+	 * ignore the error, let the next level up know.
 	 * else mark the drive as failed
 	 */
 	spin_lock_irqsave(&conf->device_lock, flags);
-	if (test_bit(In_sync, &rdev->flags)
+	if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
 	    && !enough(conf, rdev->raid_disk)) {
 		/*
 		 * Don't fail the drive, just return an IO error.
@@ -1863,9 +1809,12 @@
 	int err = 0;
 	int number = rdev->raid_disk;
 	struct md_rdev **rdevp;
-	struct raid10_info *p = conf->mirrors + number;
+	struct raid10_info *p;

 	print_conf(conf);
+	if (unlikely(number >= mddev->raid_disks))
+		return 0;
+	p = conf->mirrors + number;
 	if (rdev == p->rdev)
 		rdevp = &p->rdev;
 	else if (rdev == p->replacement)
@@ -2137,7 +2086,7 @@
 			tbio->bi_opf |= MD_FAILFAST;
 		tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
 		bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
-		generic_make_request(tbio);
+		submit_bio_noacct(tbio);
 	}

 	/* Now write out to any replacement devices
@@ -2156,7 +2105,7 @@
 		atomic_inc(&r10_bio->remaining);
 		md_sync_acct(conf->mirrors[d].replacement->bdev,
 			     bio_sectors(tbio));
-		generic_make_request(tbio);
+		submit_bio_noacct(tbio);
 	}

 done:
@@ -2279,7 +2228,7 @@
 	wbio = r10_bio->devs[1].bio;
 	wbio2 = r10_bio->devs[1].repl_bio;
 	/* Need to test wbio2->bi_end_io before we call
-	 * generic_make_request as if the former is NULL,
+	 * submit_bio_noacct as if the former is NULL,
 	 * the latter is free to free wbio2.
 	 */
 	if (wbio2 && !wbio2->bi_end_io)
@@ -2287,13 +2236,13 @@
 	if (wbio->bi_end_io) {
 		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
-		generic_make_request(wbio);
+		submit_bio_noacct(wbio);
 	}
 	if (wbio2) {
 		atomic_inc(&conf->mirrors[d].replacement->nr_pending);
 		md_sync_acct(conf->mirrors[d].replacement->bdev,
 			     bio_sectors(wbio2));
-		generic_make_request(wbio2);
+		submit_bio_noacct(wbio2);
 	}
 }

@@ -2927,7 +2876,7 @@
  * a number of r10_bio structures, one for each out-of-sync device.
  * As we setup these structures, we collect all bio's together into a list
  * which we then process collectively to add pages, and then process again
- * to pass to generic_make_request.
+ * to pass to submit_bio_noacct.
  *
  * The r10_bio structures are linked using a borrowed master_bio pointer.
  * This link is counted in ->remaining. When the r10_bio that points to NULL
@@ -3084,6 +3033,8 @@
 			sector_t sect;
 			int must_sync;
 			int any_working;
+			int need_recover = 0;
+			int need_replace = 0;
 			struct raid10_info *mirror = &conf->mirrors[i];
 			struct md_rdev *mrdev, *mreplace;

@@ -3091,11 +3042,15 @@
 			mrdev = rcu_dereference(mirror->rdev);
 			mreplace = rcu_dereference(mirror->replacement);

-			if ((mrdev == NULL ||
-			     test_bit(Faulty, &mrdev->flags) ||
-			     test_bit(In_sync, &mrdev->flags)) &&
-			    (mreplace == NULL ||
-			     test_bit(Faulty, &mreplace->flags))) {
+			if (mrdev != NULL &&
+			    !test_bit(Faulty, &mrdev->flags) &&
+			    !test_bit(In_sync, &mrdev->flags))
+				need_recover = 1;
+			if (mreplace != NULL &&
+			    !test_bit(Faulty, &mreplace->flags))
+				need_replace = 1;
+
+			if (!need_recover && !need_replace) {
 				rcu_read_unlock();
 				continue;
 			}
@@ -3218,7 +3173,7 @@
 				r10_bio->devs[1].devnum = i;
 				r10_bio->devs[1].addr = to_addr;

-				if (!test_bit(In_sync, &mrdev->flags)) {
+				if (need_recover) {
 					bio = r10_bio->devs[1].bio;
 					bio->bi_next = biolist;
 					biolist = bio;
@@ -3235,16 +3190,11 @@
 				bio = r10_bio->devs[1].repl_bio;
 				if (bio)
 					bio->bi_end_io = NULL;
-				/* Note: if mreplace != NULL, then bio
+				/* Note: if need_replace, then bio
 				 * cannot be NULL as r10buf_pool_alloc will
 				 * have allocated it.
-				 * So the second test here is pointless.
-				 * But it keeps semantic-checkers happy, and
-				 * this comment keeps human reviewers
-				 * happy.
 				 */
-				if (mreplace == NULL || bio == NULL ||
-				    test_bit(Faulty, &mreplace->flags))
+				if (!need_replace)
 					break;
 				bio->bi_next = biolist;
 				biolist = bio;
@@ -3533,7 +3483,7 @@
 		if (bio->bi_end_io == end_sync_read) {
 			md_sync_acct_bio(bio, nr_sectors);
 			bio->bi_status = 0;
-			generic_make_request(bio);
+			submit_bio_noacct(bio);
 		}
 	}

@@ -3704,8 +3654,8 @@

 	conf->geo = geo;
 	conf->copies = copies;
-	err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc,
-			   r10bio_pool_free, conf);
+	err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
+			   rbio_pool_free, conf);
 	if (err)
 		goto out;

@@ -3757,10 +3707,20 @@
 	return ERR_PTR(err);
 }

+static void raid10_set_io_opt(struct r10conf *conf)
+{
+	int raid_disks = conf->geo.raid_disks;
+
+	if (!(conf->geo.raid_disks % conf->geo.near_copies))
+		raid_disks /= conf->geo.near_copies;
+	blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
+			 raid_disks);
+}
+
 static int raid10_run(struct mddev *mddev)
 {
 	struct r10conf *conf;
-	int i, disk_idx, chunk_size;
+	int i, disk_idx;
 	struct raid10_info *disk;
 	struct md_rdev *rdev;
 	sector_t size;
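raid10_set_io_opt(), added above, preserves the io_opt policy that raid10_run() used to open-code: the optimal I/O size is the chunk size times the number of effective data stripes (raid_disks / near_copies when that divides evenly, otherwise all raid_disks). A worked standalone version in byte units (hypothetical helper name, not the kernel function):

#include <stdio.h>

/* io_opt in bytes, mirroring the policy of raid10_set_io_opt() above */
unsigned long long raid10_io_opt_bytes(unsigned int raid_disks,
				       unsigned int near_copies,
				       unsigned int chunk_sectors)
{
	unsigned int stripes = raid_disks;

	if (raid_disks % near_copies == 0)
		stripes = raid_disks / near_copies;
	return (unsigned long long)chunk_sectors * 512 * stripes;
}

int main(void)
{
	/* e.g. 4 disks, 2 near copies, 1024-sector (512 KiB) chunks -> 1 MiB */
	printf("%llu\n", raid10_io_opt_bytes(4, 2, 1024));
	return 0;
}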
@@ -3796,18 +3756,13 @@
 	mddev->thread = conf->thread;
 	conf->thread = NULL;

-	chunk_size = mddev->chunk_sectors << 9;
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
 					      mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
-		blk_queue_io_min(mddev->queue, chunk_size);
-		if (conf->geo.raid_disks % conf->geo.near_copies)
-			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
-		else
-			blk_queue_io_opt(mddev->queue, chunk_size *
-					 (conf->geo.raid_disks / conf->geo.near_copies));
+		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+		raid10_set_io_opt(conf);
 	}

 	rdev_for_each(rdev, mddev) {
@@ -3921,19 +3876,6 @@
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
 	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
-
-	if (mddev->queue) {
-		int stripe = conf->geo.raid_disks *
-			((mddev->chunk_sectors << 9) / PAGE_SIZE);
-
-		/* Calculate max read-ahead size.
-		 * We need to readahead at least twice a whole stripe....
-		 * maybe...
-		 */
-		stripe /= conf->geo.near_copies;
-		if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
-			mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
-	}

 	if (md_integrity_register(mddev))
 		goto out_free_conf;
@@ -4293,12 +4235,46 @@
 	spin_unlock_irq(&conf->device_lock);

 	if (mddev->delta_disks && mddev->bitmap) {
-		ret = md_bitmap_resize(mddev->bitmap,
-				       raid10_size(mddev, 0, conf->geo.raid_disks),
-				       0, 0);
+		struct mdp_superblock_1 *sb = NULL;
+		sector_t oldsize, newsize;
+
+		oldsize = raid10_size(mddev, 0, 0);
+		newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
+
+		if (!mddev_is_clustered(mddev)) {
+			ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
+			if (ret)
+				goto abort;
+			else
+				goto out;
+		}
+
+		rdev_for_each(rdev, mddev) {
+			if (rdev->raid_disk > -1 &&
+			    !test_bit(Faulty, &rdev->flags))
+				sb = page_address(rdev->sb_page);
+		}
+
+		/*
+		 * some node is already performing reshape, and no need to
+		 * call md_bitmap_resize again since it should be called when
+		 * receiving BITMAP_RESIZE msg
+		 */
+		if ((sb && (le32_to_cpu(sb->feature_map) &
+			    MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
+			goto out;
+
+		ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
 		if (ret)
 			goto abort;
+
+		ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
+		if (ret) {
+			md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
+			goto abort;
+		}
 	}
+out:
 	if (mddev->delta_disks > 0) {
 		rdev_for_each(rdev, mddev)
 			if (rdev->raid_disk < 0 &&
@@ -4310,8 +4286,8 @@
 				else
 					rdev->recovery_offset = 0;

-				if (sysfs_link_rdev(mddev, rdev))
-					/* Failure here is OK */;
+				/* Failure here is OK */
+				sysfs_link_rdev(mddev, rdev);
 			}
 		} else if (rdev->raid_disk >= conf->prev.raid_disks
 			   && !test_bit(Faulty, &rdev->flags)) {
@@ -4457,7 +4433,7 @@
 		sector_nr = conf->reshape_progress;
 	if (sector_nr) {
 		mddev->curr_resync_completed = sector_nr;
-		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+		sysfs_notify_dirent_safe(mddev->sysfs_completed);
 		*skipped = 1;
 		return sector_nr;
 	}
@@ -4486,8 +4462,8 @@
 		last = conf->reshape_progress - 1;
 		sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
 					       & conf->prev.chunk_mask);
-		if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
-			sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
+		if (sector_nr + RESYNC_SECTORS < last)
+			sector_nr = last + 1 - RESYNC_SECTORS;
 	} else {
 		/* 'next' is after the last device address that we
 		 * might write to for this chunk in the new layout
@@ -4509,8 +4485,8 @@
 		last = sector_nr | (conf->geo.chunk_mask
 				    & conf->prev.chunk_mask);

-		if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
-			last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
+		if (sector_nr + RESYNC_SECTORS <= last)
+			last = sector_nr + RESYNC_SECTORS - 1;
 	}

 	if (need_flush ||
@@ -4575,6 +4551,32 @@
 	r10_bio->master_bio = read_bio;
 	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;

+	/*
+	 * Broadcast RESYNC message to other nodes, so all nodes would not
+	 * write to the region to avoid conflict.
+	 */
+	if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
+		struct mdp_superblock_1 *sb = NULL;
+		int sb_reshape_pos = 0;
+
+		conf->cluster_sync_low = sector_nr;
+		conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
+		sb = page_address(rdev->sb_page);
+		if (sb) {
+			sb_reshape_pos = le64_to_cpu(sb->reshape_position);
+			/*
+			 * Set cluster_sync_low again if next address for array
+			 * reshape is less than cluster_sync_low. Since we can't
+			 * update cluster_sync_low until it has finished reshape.
+			 */
+			if (sb_reshape_pos < conf->cluster_sync_low)
+				conf->cluster_sync_low = sb_reshape_pos;
+		}
+
+		md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
+						   conf->cluster_sync_high);
+	}
+
 	/* Now find the locations in the new layout */
 	__raid10_find_phys(&conf->geo, r10_bio);

@@ -4631,7 +4633,7 @@
 	md_sync_acct_bio(read_bio, r10_bio->sectors);
 	atomic_inc(&r10_bio->remaining);
 	read_bio->bi_next = NULL;
-	generic_make_request(read_bio);
+	submit_bio_noacct(read_bio);
 	sectors_done += nr_sectors;
 	if (sector_nr <= last)
 		goto read_more;
@@ -4694,7 +4696,7 @@
 		md_sync_acct_bio(b, r10_bio->sectors);
 		atomic_inc(&r10_bio->remaining);
 		b->bi_next = NULL;
-		generic_make_request(b);
+		submit_bio_noacct(b);
 	}
 	end_reshape_request(r10_bio);
 }
@@ -4712,17 +4714,22 @@
 	conf->reshape_safe = MaxSector;
 	spin_unlock_irq(&conf->device_lock);

-	/* read-ahead size must cover two whole stripes, which is
-	 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
-	 */
-	if (conf->mddev->queue) {
-		int stripe = conf->geo.raid_disks *
-			((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
-		stripe /= conf->geo.near_copies;
-		if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
-			conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
-	}
+	if (conf->mddev->queue)
+		raid10_set_io_opt(conf);
 	conf->fullsync = 0;
+}
+
+static void raid10_update_reshape_pos(struct mddev *mddev)
+{
+	struct r10conf *conf = mddev->private;
+	sector_t lo, hi;
+
+	md_cluster_ops->resync_info_get(mddev, &lo, &hi);
+	if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
+	    || mddev->reshape_position == MaxSector)
+		conf->reshape_progress = mddev->reshape_position;
+	else
+		WARN_ON_ONCE(1);
 }

 static int handle_reshape_read_error(struct mddev *mddev,
@@ -4736,8 +4743,7 @@
 	int idx = 0;
 	struct page **pages;

-	r10b = kmalloc(sizeof(*r10b) +
-		       sizeof(struct r10dev) * conf->copies, GFP_NOIO);
+	r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
 	if (!r10b) {
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		return -ENOMEM;
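The kmalloc() change above swaps the open-coded "sizeof(*r10b) + sizeof(struct r10dev) * conf->copies" for struct_size() from <linux/overflow.h>, which computes the same trailing-flexible-array size but saturates instead of wrapping if the multiplication overflows. A standalone illustration of the allocation pattern with a toy struct (not the md structures):

#include <stdio.h>
#include <stdlib.h>

struct item {
	int a, b;
};

struct container {
	int count;
	struct item devs[];	/* flexible array member, like the r10bio devs[] */
};

int main(void)
{
	size_t n = 4;
	/* what struct_size(c, devs, n) evaluates to, minus the overflow check */
	size_t bytes = sizeof(struct container) + sizeof(struct item) * n;
	struct container *c = malloc(bytes);

	if (!c)
		return 1;
	c->count = (int)n;
	printf("allocated %zu bytes for %zu trailing items\n", bytes, n);
	free(c);
	return 0;
}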
@@ -4893,7 +4899,7 @@
 	.check_reshape = raid10_check_reshape,
 	.start_reshape = raid10_start_reshape,
 	.finish_reshape = raid10_finish_reshape,
-	.congested = raid10_congested,
+	.update_reshape_pos = raid10_update_reshape_pos,
 };

 static int __init raid_init(void)