2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
--- a/kernel/drivers/md/raid1.c
+++ b/kernel/drivers/md/raid1.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * raid1.c : Multiple Devices driver for Linux
  *
@@ -20,15 +21,6 @@
  *
  * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
  * - persistent bitmap code
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */

 #include <linux/slab.h>
@@ -37,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
+#include <linux/interval_tree_generic.h>

 #include <trace/events/block.h>

@@ -50,31 +43,6 @@
          (1L << MD_HAS_PPL) |          \
          (1L << MD_HAS_MULTIPLE_PPLS))

-/*
- * Number of guaranteed r1bios in case of extreme VM load:
- */
-#define NR_RAID1_BIOS 256
-
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio *)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting devs[n].bio to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
-/* When there are this many requests queue to be written by
- * the raid1 thread, we become 'congested' to provide back-pressure
- * for writeback.
- */
-static int max_queued_requests = 1024;
-
 static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
 static void lower_barrier(struct r1conf *conf, sector_t sector_nr);

@@ -82,6 +50,73 @@
         do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)

 #include "raid1-10.c"
+
+#define START(node) ((node)->start)
+#define LAST(node) ((node)->last)
+INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
+                     START, LAST, static inline, raid1_rb);
+
+static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
+                                struct serial_info *si, int idx)
+{
+        unsigned long flags;
+        int ret = 0;
+        sector_t lo = r1_bio->sector;
+        sector_t hi = lo + r1_bio->sectors;
+        struct serial_in_rdev *serial = &rdev->serial[idx];
+
+        spin_lock_irqsave(&serial->serial_lock, flags);
+        /* collision happened */
+        if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
+                ret = -EBUSY;
+        else {
+                si->start = lo;
+                si->last = hi;
+                raid1_rb_insert(si, &serial->serial_rb);
+        }
+        spin_unlock_irqrestore(&serial->serial_lock, flags);
+
+        return ret;
+}
+
+static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
+{
+        struct mddev *mddev = rdev->mddev;
+        struct serial_info *si;
+        int idx = sector_to_idx(r1_bio->sector);
+        struct serial_in_rdev *serial = &rdev->serial[idx];
+
+        if (WARN_ON(!mddev->serial_info_pool))
+                return;
+        si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
+        wait_event(serial->serial_io_wait,
+                   check_and_add_serial(rdev, r1_bio, si, idx) == 0);
+}
+
+static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
+{
+        struct serial_info *si;
+        unsigned long flags;
+        int found = 0;
+        struct mddev *mddev = rdev->mddev;
+        int idx = sector_to_idx(lo);
+        struct serial_in_rdev *serial = &rdev->serial[idx];
+
+        spin_lock_irqsave(&serial->serial_lock, flags);
+        for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
+             si; si = raid1_rb_iter_next(si, lo, hi)) {
+                if (si->start == lo && si->last == hi) {
+                        raid1_rb_remove(si, &serial->serial_rb);
+                        mempool_free(si, mddev->serial_info_pool);
+                        found = 1;
+                        break;
+                }
+        }
+        if (!found)
+                WARN(1, "The write IO is not recorded for serialization\n");
+        spin_unlock_irqrestore(&serial->serial_lock, flags);
+        wake_up(&serial->serial_io_wait);
+}

 /*
  * for resync bio, r1bio pointer can be retrieved from the per-bio
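
The hunk above adds the write-serialization helpers: per-rdev, per-bucket interval trees (declared with INTERVAL_TREE_DEFINE and keyed by sector range) record in-flight writes; check_and_add_serial() reports -EBUSY on any overlap, wait_for_serialization() retries it under wait_event() until the range can be inserted, and remove_serial() deletes the range on completion and wakes waiters. Below is a minimal userspace C sketch of that same check-insert-or-wait / remove-and-wake idea, using a mutex, a condition variable, and a fixed array in place of the kernel's interval tree, spinlock, and wait queue; every name in it is illustrative, not the driver's.

/* Toy model of check_and_add_serial()/wait_for_serialization()/remove_serial():
 * writers to overlapping sector ranges [lo, hi) are forced to run one at a time. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_INFLIGHT 16

struct range { unsigned long lo, hi; bool used; };

static struct range inflight[MAX_INFLIGHT];
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_done = PTHREAD_COND_INITIALIZER;

/* Try to record [lo, hi); fail if it overlaps an in-flight write (caller holds lock). */
static int check_and_add(unsigned long lo, unsigned long hi)
{
        int i, free_slot = -1;

        for (i = 0; i < MAX_INFLIGHT; i++) {
                if (!inflight[i].used) {
                        free_slot = i;
                        continue;
                }
                if (lo < inflight[i].hi && inflight[i].lo < hi)
                        return -1;      /* collision happened */
        }
        if (free_slot < 0)
                return -1;              /* table full: wait for a completion */
        inflight[free_slot] = (struct range){ .lo = lo, .hi = hi, .used = true };
        return 0;
}

/* Block until [lo, hi) no longer overlaps any in-flight write, then record it. */
static void wait_for_serialization(unsigned long lo, unsigned long hi)
{
        pthread_mutex_lock(&lock);
        while (check_and_add(lo, hi) != 0)
                pthread_cond_wait(&io_done, &lock);
        pthread_mutex_unlock(&lock);
}

/* Drop the record for a completed write and wake any waiting writer. */
static void remove_serial(unsigned long lo, unsigned long hi)
{
        pthread_mutex_lock(&lock);
        for (int i = 0; i < MAX_INFLIGHT; i++)
                if (inflight[i].used && inflight[i].lo == lo && inflight[i].hi == hi)
                        inflight[i].used = false;
        pthread_cond_broadcast(&io_done);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        wait_for_serialization(0, 8);   /* first write: recorded immediately */
        remove_serial(0, 8);            /* completion wakes any overlapping writer */
        printf("serialized one write over sectors [0, 8)\n");
        return 0;
}

In the driver itself the ranges live in per-bucket trees selected with sector_to_idx(), and the check is only applied when CollisionCheck or serialize_policy is set, as the later hunks show.
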
@@ -99,11 +134,6 @@

         /* allocate a r1bio with room for raid_disks entries in the bios array */
         return kzalloc(size, gfp_flags);
-}
-
-static void r1bio_pool_free(void *r1_bio, void *data)
-{
-        kfree(r1_bio);
 }

 #define RESYNC_DEPTH 32
@@ -181,7 +211,7 @@
         kfree(rps);

 out_free_r1bio:
-        r1bio_pool_free(r1_bio, data);
+        rbio_pool_free(r1_bio, data);
         return NULL;
 }

@@ -201,7 +231,7 @@
         /* resync pages array stored in the 1st bio's .bi_private */
         kfree(rp);

-        r1bio_pool_free(r1bio, data);
+        rbio_pool_free(r1bio, data);
 }

 static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
@@ -266,22 +296,17 @@
 static void call_bio_endio(struct r1bio *r1_bio)
 {
         struct bio *bio = r1_bio->master_bio;
-        struct r1conf *conf = r1_bio->mddev->private;

         if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
                 bio->bi_status = BLK_STS_IOERR;

         bio_endio(bio);
-        /*
-         * Wake up any possible resync thread that waits for the device
-         * to go idle.
-         */
-        allow_barrier(conf, r1_bio->sector);
 }

 static void raid_end_bio_io(struct r1bio *r1_bio)
 {
         struct bio *bio = r1_bio->master_bio;
+        struct r1conf *conf = r1_bio->mddev->private;

         /* if nobody has done the final endio yet, do it now */
         if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
@@ -292,6 +317,12 @@

                 call_bio_endio(r1_bio);
         }
+        /*
+         * Wake up any possible resync thread that waits for the device
+         * to go idle. All I/Os, even write-behind writes, are done.
+         */
+        allow_barrier(conf, r1_bio->sector);
+
         free_r1bio(r1_bio);
 }

@@ -417,6 +448,8 @@
         int mirror = find_bio_disk(r1_bio, bio);
         struct md_rdev *rdev = conf->mirrors[mirror].rdev;
         bool discard_error;
+        sector_t lo = r1_bio->sector;
+        sector_t hi = r1_bio->sector + r1_bio->sectors;

         discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;

@@ -439,8 +472,6 @@
                 /*
                  * When the device is faulty, it is not necessary to
                  * handle write error.
-                 * For failfast, this is the only remaining device,
-                 * We need to retry the write without FailFast.
                  */
                 if (!test_bit(Faulty, &rdev->flags))
                         set_bit(R1BIO_WriteError, &r1_bio->state);
@@ -488,6 +519,8 @@
         }

         if (behind) {
+                if (test_bit(CollisionCheck, &rdev->flags))
+                        remove_serial(rdev, lo, hi);
                 if (test_bit(WriteMostly, &rdev->flags))
                         atomic_dec(&r1_bio->behind_remaining);

@@ -510,7 +543,8 @@
                                 call_bio_endio(r1_bio);
                         }
                 }
-        }
+        } else if (rdev->mddev->serialize_policy)
+                remove_serial(rdev, lo, hi);
         if (r1_bio->bios[mirror] == NULL)
                 rdev_dec_pending(rdev, conf->mddev);

@@ -752,36 +786,6 @@
         return best_disk;
 }

-static int raid1_congested(struct mddev *mddev, int bits)
-{
-        struct r1conf *conf = mddev->private;
-        int i, ret = 0;
-
-        if ((bits & (1 << WB_async_congested)) &&
-            conf->pending_count >= max_queued_requests)
-                return 1;
-
-        rcu_read_lock();
-        for (i = 0; i < conf->raid_disks * 2; i++) {
-                struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
-                if (rdev && !test_bit(Faulty, &rdev->flags)) {
-                        struct request_queue *q = bdev_get_queue(rdev->bdev);
-
-                        BUG_ON(!q);
-
-                        /* Note the '|| 1' - when read_balance prefers
-                         * non-congested targets, it can be removed
-                         */
-                        if ((bits & (1 << WB_async_congested)) || 1)
-                                ret |= bdi_congested(q->backing_dev_info, bits);
-                        else
-                                ret &= bdi_congested(q->backing_dev_info, bits);
-                }
-        }
-        rcu_read_unlock();
-        return ret;
-}
-
 static void flush_bio_list(struct r1conf *conf, struct bio *bio)
 {
         /* flush any pending bitmap writes to disk before proceeding w/ I/O */
@@ -800,8 +804,9 @@
                         /* Just ignore it */
                         bio_endio(bio);
                 else
-                        generic_make_request(bio);
+                        submit_bio_noacct(bio);
                 bio = next;
+                cond_resched();
         }
 }

@@ -857,8 +862,11 @@
  * backgroup IO calls must call raise_barrier. Once that returns
  * there is no normal IO happeing. It must arrange to call
  * lower_barrier when the particular background IO completes.
+ *
+ * If resync/recovery is interrupted, returns -EINTR;
+ * Otherwise, returns 0.
  */
-static sector_t raise_barrier(struct r1conf *conf, sector_t sector_nr)
+static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
 {
         int idx = sector_to_idx(sector_nr);

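
The comment above documents the barrier handshake between normal I/O and resync: raise_barrier() is called by resync/recovery to hold off new normal I/O on a sector bucket and wait for in-flight requests to drain, lower_barrier() releases it, and regular requests bracket their lifetime with wait_barrier()/allow_barrier(); the signature change lets an interrupted resync report -EINTR instead of overloading the sector return value. Below is a rough single-bucket userspace analogue of that counter-plus-wakeup pattern; the names, the single bucket, and the interrupted flag are simplifications of mine, not the driver's code, which also handles array freezing and per-bucket counters.

/* Toy single-bucket version of the raise_barrier()/lower_barrier()/
 * wait_barrier()/allow_barrier() handshake used by RAID1 resync. */
#include <pthread.h>
#include <stdbool.h>
#include <errno.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int barrier;        /* raised by resync/recovery */
static int nr_pending;     /* normal I/O in flight */
static bool interrupted;   /* stands in for MD_RECOVERY_INTR being set */

/* Resync: raise the barrier, then wait for normal I/O to drain.
 * Returns 0 on success, -EINTR if recovery was interrupted while waiting. */
static int raise_barrier(void)
{
        pthread_mutex_lock(&lock);
        barrier++;
        while (nr_pending > 0 && !interrupted)
                pthread_cond_wait(&cond, &lock);
        if (interrupted) {
                barrier--;
                pthread_cond_broadcast(&cond);
                pthread_mutex_unlock(&lock);
                return -EINTR;
        }
        pthread_mutex_unlock(&lock);
        return 0;
}

static void lower_barrier(void)
{
        pthread_mutex_lock(&lock);
        barrier--;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
}

/* Normal I/O: wait while a barrier is raised, then count itself as pending. */
static void wait_barrier(void)
{
        pthread_mutex_lock(&lock);
        while (barrier > 0)
                pthread_cond_wait(&cond, &lock);
        nr_pending++;
        pthread_mutex_unlock(&lock);
}

/* Normal I/O completion: drop the pending count and wake a waiting resync. */
static void allow_barrier(void)
{
        pthread_mutex_lock(&lock);
        nr_pending--;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        wait_barrier();                 /* a normal write enters */
        allow_barrier();                /* ...and completes */
        if (raise_barrier() == 0)       /* resync can now start */
                lower_barrier();
        return 0;
}
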
@@ -1274,7 +1282,7 @@
                 struct bio *split = bio_split(bio, max_sectors,
                                               gfp, &conf->bio_split);
                 bio_chain(split, bio);
-                generic_make_request(bio);
+                submit_bio_noacct(bio);
                 bio = split;
                 r1_bio->master_bio = bio;
                 r1_bio->sectors = max_sectors;
@@ -1300,7 +1308,7 @@
                 trace_block_bio_remap(read_bio->bi_disk->queue, read_bio,
                                       disk_devt(mddev->gendisk), r1_bio->sector);

-        generic_make_request(read_bio);
+        submit_bio_noacct(read_bio);
 }

 static void raid1_write_request(struct mddev *mddev, struct bio *bio,
@@ -1445,7 +1453,7 @@
                 struct bio *split = bio_split(bio, max_sectors,
                                               GFP_NOIO, &conf->bio_split);
                 bio_chain(split, bio);
-                generic_make_request(bio);
+                submit_bio_noacct(bio);
                 bio = split;
                 r1_bio->master_bio = bio;
                 r1_bio->sectors = max_sectors;
@@ -1458,9 +1466,9 @@

         for (i = 0; i < disks; i++) {
                 struct bio *mbio = NULL;
+                struct md_rdev *rdev = conf->mirrors[i].rdev;
                 if (!r1_bio->bios[i])
                         continue;
-

                 if (first_clone) {
                         /* do behind I/O ?
@@ -1486,9 +1494,12 @@
                         mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);

                 if (r1_bio->behind_master_bio) {
-                        if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
+                        if (test_bit(CollisionCheck, &rdev->flags))
+                                wait_for_serialization(rdev, r1_bio);
+                        if (test_bit(WriteMostly, &rdev->flags))
                                 atomic_inc(&r1_bio->behind_remaining);
-                }
+                } else if (mddev->serialize_policy)
+                        wait_for_serialization(rdev, r1_bio);

                 r1_bio->bios[i] = mbio;

@@ -1588,12 +1599,12 @@

         /*
          * If it is not operational, then we have already marked it as dead
-         * else if it is the last working disks, ignore the error, let the
-         * next level up know.
+         * else if it is the last working disks with "fail_last_dev == false",
+         * ignore the error, let the next level up know.
          * else mark the drive as failed
          */
         spin_lock_irqsave(&conf->device_lock, flags);
-        if (test_bit(In_sync, &rdev->flags)
+        if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
             && (conf->raid_disks - mddev->degraded) == 1) {
                 /*
                  * Don't fail the drive, act as though we were just a
@@ -1606,11 +1617,9 @@
                 return;
         }
         set_bit(Blocked, &rdev->flags);
-        if (test_and_clear_bit(In_sync, &rdev->flags)) {
+        if (test_and_clear_bit(In_sync, &rdev->flags))
                 mddev->degraded++;
-                set_bit(Faulty, &rdev->flags);
-        } else
-                set_bit(Faulty, &rdev->flags);
+        set_bit(Faulty, &rdev->flags);
         spin_unlock_irqrestore(&conf->device_lock, flags);
         /*
          * if recovery is running, make sure it aborts.
@@ -1742,9 +1751,8 @@
                 first = last = rdev->saved_raid_disk;

         for (mirror = first; mirror <= last; mirror++) {
-                p = conf->mirrors+mirror;
+                p = conf->mirrors + mirror;
                 if (!p->rdev) {
-
                         if (mddev->gendisk)
                                 disk_stack_limits(mddev->gendisk, rdev->bdev,
                                                   rdev->data_offset << 9);
@@ -1784,6 +1792,9 @@
         int err = 0;
         int number = rdev->raid_disk;
         struct raid1_info *p = conf->mirrors + number;
+
+        if (unlikely(number >= conf->raid_disks))
+                goto abort;

         if (rdev != p->rdev)
                 p = conf->mirrors + conf->raid_disks + number;
@@ -1880,6 +1891,22 @@
         } while (sectors_to_go > 0);
 }

+static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
+{
+        if (atomic_dec_and_test(&r1_bio->remaining)) {
+                struct mddev *mddev = r1_bio->mddev;
+                int s = r1_bio->sectors;
+
+                if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+                    test_bit(R1BIO_WriteError, &r1_bio->state))
+                        reschedule_retry(r1_bio);
+                else {
+                        put_buf(r1_bio);
+                        md_done_sync(mddev, s, uptodate);
+                }
+        }
+}
+
 static void end_sync_write(struct bio *bio)
 {
         int uptodate = !bio->bi_status;
@@ -1906,16 +1933,7 @@
                 )
                 set_bit(R1BIO_MadeGood, &r1_bio->state);

-        if (atomic_dec_and_test(&r1_bio->remaining)) {
-                int s = r1_bio->sectors;
-                if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
-                    test_bit(R1BIO_WriteError, &r1_bio->state))
-                        reschedule_retry(r1_bio);
-                else {
-                        put_buf(r1_bio);
-                        md_done_sync(mddev, s, uptodate);
-                }
-        }
+        put_sync_write_buf(r1_bio, uptodate);
 }

 static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
@@ -2115,7 +2133,7 @@
         }
         r1_bio->read_disk = primary;
         for (i = 0; i < conf->raid_disks * 2; i++) {
-                int j;
+                int j = 0;
                 struct bio *pbio = r1_bio->bios[primary];
                 struct bio *sbio = r1_bio->bios[i];
                 blk_status_t status = sbio->bi_status;
@@ -2123,14 +2141,15 @@
                 struct page **spages = get_resync_pages(sbio)->pages;
                 struct bio_vec *bi;
                 int page_len[RESYNC_PAGES] = { 0 };
+                struct bvec_iter_all iter_all;

                 if (sbio->bi_end_io != end_sync_read)
                         continue;
                 /* Now we can 'fixup' the error value */
                 sbio->bi_status = 0;

-                bio_for_each_segment_all(bi, sbio, j)
-                        page_len[j] = bi->bv_len;
+                bio_for_each_segment_all(bi, sbio, iter_all)
+                        page_len[j++] = bi->bv_len;

                 if (!status) {
                         for (j = vcnt; j-- ; ) {
@@ -2194,20 +2213,10 @@
                 atomic_inc(&r1_bio->remaining);
                 md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));

-                generic_make_request(wbio);
+                submit_bio_noacct(wbio);
         }

-        if (atomic_dec_and_test(&r1_bio->remaining)) {
-                /* if we're here, all write(s) have completed, so clean up */
-                int s = r1_bio->sectors;
-                if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
-                    test_bit(R1BIO_WriteError, &r1_bio->state))
-                        reschedule_retry(r1_bio);
-                else {
-                        put_buf(r1_bio);
-                        md_done_sync(mddev, s, 1);
-                }
-        }
+        put_sync_write_buf(r1_bio, 1);
 }

 /*
22132222 /*
....@@ -2890,7 +2899,7 @@
28902899 md_sync_acct_bio(bio, nr_sectors);
28912900 if (read_targets == 1)
28922901 bio->bi_opf &= ~MD_FAILFAST;
2893
- generic_make_request(bio);
2902
+ submit_bio_noacct(bio);
28942903 }
28952904 }
28962905 } else {
....@@ -2899,8 +2908,7 @@
28992908 md_sync_acct_bio(bio, nr_sectors);
29002909 if (read_targets == 1)
29012910 bio->bi_opf &= ~MD_FAILFAST;
2902
- generic_make_request(bio);
2903
-
2911
+ submit_bio_noacct(bio);
29042912 }
29052913 return nr_sectors;
29062914 }
....@@ -2959,8 +2967,8 @@
29592967 if (!conf->poolinfo)
29602968 goto abort;
29612969 conf->poolinfo->raid_disks = mddev->raid_disks * 2;
2962
- err = mempool_init(&conf->r1bio_pool, NR_RAID1_BIOS, r1bio_pool_alloc,
2963
- r1bio_pool_free, conf->poolinfo);
2970
+ err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc,
2971
+ rbio_pool_free, conf->poolinfo);
29642972 if (err)
29652973 goto abort;
29662974
....@@ -3101,7 +3109,7 @@
31013109 }
31023110
31033111 mddev->degraded = 0;
3104
- for (i=0; i < conf->raid_disks; i++)
3112
+ for (i = 0; i < conf->raid_disks; i++)
31053113 if (conf->mirrors[i].rdev == NULL ||
31063114 !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
31073115 test_bit(Faulty, &conf->mirrors[i].rdev->flags))
....@@ -3110,6 +3118,7 @@
31103118 * RAID1 needs at least one disk in active
31113119 */
31123120 if (conf->raid_disks - mddev->degraded < 1) {
3121
+ md_unregister_thread(&conf->thread);
31133122 ret = -EINVAL;
31143123 goto abort;
31153124 }
....@@ -3143,7 +3152,7 @@
31433152 mddev->queue);
31443153 }
31453154
3146
- ret = md_integrity_register(mddev);
3155
+ ret = md_integrity_register(mddev);
31473156 if (ret) {
31483157 md_unregister_thread(&mddev->thread);
31493158 goto abort;
....@@ -3255,8 +3264,8 @@
32553264 newpoolinfo->mddev = mddev;
32563265 newpoolinfo->raid_disks = raid_disks * 2;
32573266
3258
- ret = mempool_init(&newpool, NR_RAID1_BIOS, r1bio_pool_alloc,
3259
- r1bio_pool_free, newpoolinfo);
3267
+ ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc,
3268
+ rbio_pool_free, newpoolinfo);
32603269 if (ret) {
32613270 kfree(newpoolinfo);
32623271 return ret;
....@@ -3361,7 +3370,6 @@
33613370 .check_reshape = raid1_reshape,
33623371 .quiesce = raid1_quiesce,
33633372 .takeover = raid1_takeover,
3364
- .congested = raid1_congested,
33653373 };
33663374
33673375 static int __init raid_init(void)