forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-11 072de836f53be56a70cecf70b43ae43b7ce17376
kernel/drivers/md/raid1.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * raid1.c : Multiple Devices driver for Linux
  *
@@ -20,15 +21,6 @@
  *
  * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
  * - persistent bitmap code
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #include <linux/slab.h>
@@ -37,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
+#include <linux/interval_tree_generic.h>
 
 #include <trace/events/block.h>
 
@@ -50,31 +43,6 @@
          (1L << MD_HAS_PPL) | \
          (1L << MD_HAS_MULTIPLE_PPLS))
 
-/*
- * Number of guaranteed r1bios in case of extreme VM load:
- */
-#define NR_RAID1_BIOS 256
-
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio *)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting devs[n].bio to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
-/* When there are this many requests queue to be written by
- * the raid1 thread, we become 'congested' to provide back-pressure
- * for writeback.
- */
-static int max_queued_requests = 1024;
-
 static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
 static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
 
@@ -82,6 +50,73 @@
         do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
 
 #include "raid1-10.c"
+
+#define START(node) ((node)->start)
+#define LAST(node) ((node)->last)
+INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
+                     START, LAST, static inline, raid1_rb);
+
+static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
+                                struct serial_info *si, int idx)
+{
+        unsigned long flags;
+        int ret = 0;
+        sector_t lo = r1_bio->sector;
+        sector_t hi = lo + r1_bio->sectors;
+        struct serial_in_rdev *serial = &rdev->serial[idx];
+
+        spin_lock_irqsave(&serial->serial_lock, flags);
+        /* collision happened */
+        if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
+                ret = -EBUSY;
+        else {
+                si->start = lo;
+                si->last = hi;
+                raid1_rb_insert(si, &serial->serial_rb);
+        }
+        spin_unlock_irqrestore(&serial->serial_lock, flags);
+
+        return ret;
+}
+
+static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
+{
+        struct mddev *mddev = rdev->mddev;
+        struct serial_info *si;
+        int idx = sector_to_idx(r1_bio->sector);
+        struct serial_in_rdev *serial = &rdev->serial[idx];
+
+        if (WARN_ON(!mddev->serial_info_pool))
+                return;
+        si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
+        wait_event(serial->serial_io_wait,
+                   check_and_add_serial(rdev, r1_bio, si, idx) == 0);
+}
+
+static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
+{
+        struct serial_info *si;
+        unsigned long flags;
+        int found = 0;
+        struct mddev *mddev = rdev->mddev;
+        int idx = sector_to_idx(lo);
+        struct serial_in_rdev *serial = &rdev->serial[idx];
+
+        spin_lock_irqsave(&serial->serial_lock, flags);
+        for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
+             si; si = raid1_rb_iter_next(si, lo, hi)) {
+                if (si->start == lo && si->last == hi) {
+                        raid1_rb_remove(si, &serial->serial_rb);
+                        mempool_free(si, mddev->serial_info_pool);
+                        found = 1;
+                        break;
+                }
+        }
+        if (!found)
+                WARN(1, "The write IO is not recorded for serialization\n");
+        spin_unlock_irqrestore(&serial->serial_lock, flags);
+        wake_up(&serial->serial_io_wait);
+}
 
 /*
  * for resync bio, r1bio pointer can be retrieved from the per-bio
@@ -99,11 +134,6 @@
 
         /* allocate a r1bio with room for raid_disks entries in the bios array */
         return kzalloc(size, gfp_flags);
-}
-
-static void r1bio_pool_free(void *r1_bio, void *data)
-{
-        kfree(r1_bio);
 }
 
 #define RESYNC_DEPTH 32
@@ -181,7 +211,7 @@
         kfree(rps);
 
 out_free_r1bio:
-        r1bio_pool_free(r1_bio, data);
+        rbio_pool_free(r1_bio, data);
         return NULL;
 }
 
@@ -201,7 +231,7 @@
         /* resync pages array stored in the 1st bio's .bi_private */
         kfree(rp);
 
-        r1bio_pool_free(r1bio, data);
+        rbio_pool_free(r1bio, data);
 }
 
 static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
@@ -266,22 +296,17 @@
 static void call_bio_endio(struct r1bio *r1_bio)
 {
         struct bio *bio = r1_bio->master_bio;
-        struct r1conf *conf = r1_bio->mddev->private;
 
         if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
                 bio->bi_status = BLK_STS_IOERR;
 
         bio_endio(bio);
-        /*
-         * Wake up any possible resync thread that waits for the device
-         * to go idle.
-         */
-        allow_barrier(conf, r1_bio->sector);
 }
 
 static void raid_end_bio_io(struct r1bio *r1_bio)
 {
         struct bio *bio = r1_bio->master_bio;
+        struct r1conf *conf = r1_bio->mddev->private;
 
         /* if nobody has done the final endio yet, do it now */
         if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
@@ -292,6 +317,12 @@
 
                 call_bio_endio(r1_bio);
         }
+        /*
+         * Wake up any possible resync thread that waits for the device
+         * to go idle. All I/Os, even write-behind writes, are done.
+         */
+        allow_barrier(conf, r1_bio->sector);
+
         free_r1bio(r1_bio);
 }
 
@@ -417,6 +448,8 @@
         int mirror = find_bio_disk(r1_bio, bio);
         struct md_rdev *rdev = conf->mirrors[mirror].rdev;
         bool discard_error;
+        sector_t lo = r1_bio->sector;
+        sector_t hi = r1_bio->sector + r1_bio->sectors;
 
         discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
 
@@ -439,8 +472,6 @@
                 /*
                  * When the device is faulty, it is not necessary to
                  * handle write error.
-                 * For failfast, this is the only remaining device,
-                 * We need to retry the write without FailFast.
                  */
                 if (!test_bit(Faulty, &rdev->flags))
                         set_bit(R1BIO_WriteError, &r1_bio->state);
@@ -488,6 +519,8 @@
         }
 
         if (behind) {
+                if (test_bit(CollisionCheck, &rdev->flags))
+                        remove_serial(rdev, lo, hi);
                 if (test_bit(WriteMostly, &rdev->flags))
                         atomic_dec(&r1_bio->behind_remaining);
 
@@ -510,7 +543,8 @@
                                 call_bio_endio(r1_bio);
                         }
                 }
-        }
+        } else if (rdev->mddev->serialize_policy)
+                remove_serial(rdev, lo, hi);
         if (r1_bio->bios[mirror] == NULL)
                 rdev_dec_pending(rdev, conf->mddev);
 
@@ -752,36 +786,6 @@
         return best_disk;
 }
 
-static int raid1_congested(struct mddev *mddev, int bits)
-{
-        struct r1conf *conf = mddev->private;
-        int i, ret = 0;
-
-        if ((bits & (1 << WB_async_congested)) &&
-            conf->pending_count >= max_queued_requests)
-                return 1;
-
-        rcu_read_lock();
-        for (i = 0; i < conf->raid_disks * 2; i++) {
-                struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
-                if (rdev && !test_bit(Faulty, &rdev->flags)) {
-                        struct request_queue *q = bdev_get_queue(rdev->bdev);
-
-                        BUG_ON(!q);
-
-                        /* Note the '|| 1' - when read_balance prefers
-                         * non-congested targets, it can be removed
-                         */
-                        if ((bits & (1 << WB_async_congested)) || 1)
-                                ret |= bdi_congested(q->backing_dev_info, bits);
-                        else
-                                ret &= bdi_congested(q->backing_dev_info, bits);
-                }
-        }
-        rcu_read_unlock();
-        return ret;
-}
-
 static void flush_bio_list(struct r1conf *conf, struct bio *bio)
 {
         /* flush any pending bitmap writes to disk before proceeding w/ I/O */
@@ -800,8 +804,9 @@
                         /* Just ignore it */
                         bio_endio(bio);
                 else
-                        generic_make_request(bio);
+                        submit_bio_noacct(bio);
                 bio = next;
+                cond_resched();
         }
 }
 
@@ -857,8 +862,11 @@
  * backgroup IO calls must call raise_barrier. Once that returns
  * there is no normal IO happeing. It must arrange to call
  * lower_barrier when the particular background IO completes.
+ *
+ * If resync/recovery is interrupted, returns -EINTR;
+ * Otherwise, returns 0.
  */
-static sector_t raise_barrier(struct r1conf *conf, sector_t sector_nr)
+static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
 {
         int idx = sector_to_idx(sector_nr);
 
@@ -1274,7 +1282,7 @@
                 struct bio *split = bio_split(bio, max_sectors,
                                               gfp, &conf->bio_split);
                 bio_chain(split, bio);
-                generic_make_request(bio);
+                submit_bio_noacct(bio);
                 bio = split;
                 r1_bio->master_bio = bio;
                 r1_bio->sectors = max_sectors;
@@ -1300,7 +1308,7 @@
                 trace_block_bio_remap(read_bio->bi_disk->queue, read_bio,
                                 disk_devt(mddev->gendisk), r1_bio->sector);
 
-        generic_make_request(read_bio);
+        submit_bio_noacct(read_bio);
 }
 
 static void raid1_write_request(struct mddev *mddev, struct bio *bio,
@@ -1445,7 +1453,7 @@
                 struct bio *split = bio_split(bio, max_sectors,
                                               GFP_NOIO, &conf->bio_split);
                 bio_chain(split, bio);
-                generic_make_request(bio);
+                submit_bio_noacct(bio);
                 bio = split;
                 r1_bio->master_bio = bio;
                 r1_bio->sectors = max_sectors;
@@ -1458,9 +1466,9 @@
 
         for (i = 0; i < disks; i++) {
                 struct bio *mbio = NULL;
+                struct md_rdev *rdev = conf->mirrors[i].rdev;
                 if (!r1_bio->bios[i])
                         continue;
-
 
                 if (first_clone) {
                         /* do behind I/O ?
@@ -1486,9 +1494,12 @@
                 mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
 
                 if (r1_bio->behind_master_bio) {
-                        if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
+                        if (test_bit(CollisionCheck, &rdev->flags))
+                                wait_for_serialization(rdev, r1_bio);
+                        if (test_bit(WriteMostly, &rdev->flags))
                                 atomic_inc(&r1_bio->behind_remaining);
-                }
+                } else if (mddev->serialize_policy)
+                        wait_for_serialization(rdev, r1_bio);
 
                 r1_bio->bios[i] = mbio;
 
@@ -1588,12 +1599,12 @@
 
         /*
          * If it is not operational, then we have already marked it as dead
-         * else if it is the last working disks, ignore the error, let the
-         * next level up know.
+         * else if it is the last working disks with "fail_last_dev == false",
+         * ignore the error, let the next level up know.
          * else mark the drive as failed
          */
         spin_lock_irqsave(&conf->device_lock, flags);
-        if (test_bit(In_sync, &rdev->flags)
+        if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
             && (conf->raid_disks - mddev->degraded) == 1) {
                 /*
                  * Don't fail the drive, act as though we were just a
@@ -1606,11 +1617,9 @@
                 return;
         }
         set_bit(Blocked, &rdev->flags);
-        if (test_and_clear_bit(In_sync, &rdev->flags)) {
+        if (test_and_clear_bit(In_sync, &rdev->flags))
                 mddev->degraded++;
-                set_bit(Faulty, &rdev->flags);
-        } else
-                set_bit(Faulty, &rdev->flags);
+        set_bit(Faulty, &rdev->flags);
         spin_unlock_irqrestore(&conf->device_lock, flags);
         /*
          * if recovery is running, make sure it aborts.
@@ -1742,9 +1751,8 @@
                 first = last = rdev->saved_raid_disk;
 
         for (mirror = first; mirror <= last; mirror++) {
-                p = conf->mirrors+mirror;
+                p = conf->mirrors + mirror;
                 if (!p->rdev) {
-
                         if (mddev->gendisk)
                                 disk_stack_limits(mddev->gendisk, rdev->bdev,
                                                   rdev->data_offset << 9);
@@ -1880,6 +1888,22 @@
         } while (sectors_to_go > 0);
 }
 
+static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
+{
+        if (atomic_dec_and_test(&r1_bio->remaining)) {
+                struct mddev *mddev = r1_bio->mddev;
+                int s = r1_bio->sectors;
+
+                if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+                    test_bit(R1BIO_WriteError, &r1_bio->state))
+                        reschedule_retry(r1_bio);
+                else {
+                        put_buf(r1_bio);
+                        md_done_sync(mddev, s, uptodate);
+                }
+        }
+}
+
 static void end_sync_write(struct bio *bio)
 {
         int uptodate = !bio->bi_status;
@@ -1906,16 +1930,7 @@
                 )
                 set_bit(R1BIO_MadeGood, &r1_bio->state);
 
-        if (atomic_dec_and_test(&r1_bio->remaining)) {
-                int s = r1_bio->sectors;
-                if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
-                    test_bit(R1BIO_WriteError, &r1_bio->state))
-                        reschedule_retry(r1_bio);
-                else {
-                        put_buf(r1_bio);
-                        md_done_sync(mddev, s, uptodate);
-                }
-        }
+        put_sync_write_buf(r1_bio, uptodate);
 }
 
 static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
@@ -2115,7 +2130,7 @@
         }
         r1_bio->read_disk = primary;
         for (i = 0; i < conf->raid_disks * 2; i++) {
-                int j;
+                int j = 0;
                 struct bio *pbio = r1_bio->bios[primary];
                 struct bio *sbio = r1_bio->bios[i];
                 blk_status_t status = sbio->bi_status;
@@ -2123,14 +2138,15 @@
                 struct page **spages = get_resync_pages(sbio)->pages;
                 struct bio_vec *bi;
                 int page_len[RESYNC_PAGES] = { 0 };
+                struct bvec_iter_all iter_all;
 
                 if (sbio->bi_end_io != end_sync_read)
                         continue;
                 /* Now we can 'fixup' the error value */
                 sbio->bi_status = 0;
 
-                bio_for_each_segment_all(bi, sbio, j)
-                        page_len[j] = bi->bv_len;
+                bio_for_each_segment_all(bi, sbio, iter_all)
+                        page_len[j++] = bi->bv_len;
 
                 if (!status) {
                         for (j = vcnt; j-- ; ) {
@@ -2194,20 +2210,10 @@
                 atomic_inc(&r1_bio->remaining);
                 md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
 
-                generic_make_request(wbio);
+                submit_bio_noacct(wbio);
         }
 
-        if (atomic_dec_and_test(&r1_bio->remaining)) {
-                /* if we're here, all write(s) have completed, so clean up */
-                int s = r1_bio->sectors;
-                if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
-                    test_bit(R1BIO_WriteError, &r1_bio->state))
-                        reschedule_retry(r1_bio);
-                else {
-                        put_buf(r1_bio);
-                        md_done_sync(mddev, s, 1);
-                }
-        }
+        put_sync_write_buf(r1_bio, 1);
 }
 
 /*
@@ -2890,7 +2896,7 @@
                                 md_sync_acct_bio(bio, nr_sectors);
                                 if (read_targets == 1)
                                         bio->bi_opf &= ~MD_FAILFAST;
-                                generic_make_request(bio);
+                                submit_bio_noacct(bio);
                         }
                 }
         } else {
@@ -2899,8 +2905,7 @@
                 md_sync_acct_bio(bio, nr_sectors);
                 if (read_targets == 1)
                         bio->bi_opf &= ~MD_FAILFAST;
-                generic_make_request(bio);
-
+                submit_bio_noacct(bio);
         }
         return nr_sectors;
 }
@@ -2959,8 +2964,8 @@
         if (!conf->poolinfo)
                 goto abort;
         conf->poolinfo->raid_disks = mddev->raid_disks * 2;
-        err = mempool_init(&conf->r1bio_pool, NR_RAID1_BIOS, r1bio_pool_alloc,
-                           r1bio_pool_free, conf->poolinfo);
+        err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc,
+                           rbio_pool_free, conf->poolinfo);
         if (err)
                 goto abort;
 
@@ -3101,7 +3106,7 @@
         }
 
         mddev->degraded = 0;
-        for (i=0; i < conf->raid_disks; i++)
+        for (i = 0; i < conf->raid_disks; i++)
                 if (conf->mirrors[i].rdev == NULL ||
                     !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
                     test_bit(Faulty, &conf->mirrors[i].rdev->flags))
@@ -3143,7 +3148,7 @@
                                 mddev->queue);
         }
 
-        ret =  md_integrity_register(mddev);
+        ret = md_integrity_register(mddev);
         if (ret) {
                 md_unregister_thread(&mddev->thread);
                 goto abort;
@@ -3255,8 +3260,8 @@
         newpoolinfo->mddev = mddev;
         newpoolinfo->raid_disks = raid_disks * 2;
 
-        ret = mempool_init(&newpool, NR_RAID1_BIOS, r1bio_pool_alloc,
-                           r1bio_pool_free, newpoolinfo);
+        ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc,
+                           rbio_pool_free, newpoolinfo);
         if (ret) {
                 kfree(newpoolinfo);
                 return ret;
@@ -3361,7 +3366,6 @@
         .check_reshape = raid1_reshape,
         .quiesce = raid1_quiesce,
         .takeover = raid1_takeover,
-        .congested = raid1_congested,
 };
 
 static int __init raid_init(void)