2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
--- a/kernel/drivers/md/raid10.c
+++ b/kernel/drivers/md/raid10.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * raid10.c : Multiple Devices driver for Linux
  *
@@ -6,16 +7,6 @@
  * RAID-10 support for md.
  *
  * Base on code in raid1.c. See raid1.c for further copyright information.
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */

 #include <linux/slab.h>
@@ -25,6 +16,7 @@
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
+#include <linux/raid/md_p.h>
 #include <trace/events/block.h>
 #include "md.h"
 #include "raid10.h"
@@ -72,31 +64,6 @@
  * [B A] [D C] [B A] [E C D]
  */

-/*
- * Number of guaranteed r10bios in case of extreme VM load:
- */
-#define NR_RAID10_BIOS 256
-
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio *)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting devs[n].bio to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
-/* When there are this many requests queued to be written by
- * the raid10 thread, we become 'congested' to provide back-pressure
- * for writeback.
- */
-static int max_queued_requests = 1024;
-
 static void allow_barrier(struct r10conf *conf);
 static void lower_barrier(struct r10conf *conf);
 static int _enough(struct r10conf *conf, int previous, int ignore);
@@ -129,11 +96,6 @@
 	/* allocate a r10bio with room for raid_disks entries in the
 	 * bios array */
 	return kzalloc(size, gfp_flags);
-}
-
-static void r10bio_pool_free(void *r10_bio, void *data)
-{
-	kfree(r10_bio);
 }

 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
@@ -241,7 +203,7 @@
 	}
 	kfree(rps);
 out_free_r10bio:
-	r10bio_pool_free(r10_bio, conf);
+	rbio_pool_free(r10_bio, conf);
 	return NULL;
 }

@@ -269,7 +231,7 @@
 	/* resync pages array stored in the 1st bio's .bi_private */
 	kfree(rp);

-	r10bio_pool_free(r10bio, conf);
+	rbio_pool_free(r10bio, conf);
 }

 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
@@ -503,19 +465,21 @@
 			if (test_bit(FailFast, &rdev->flags) &&
 			    (bio->bi_opf & MD_FAILFAST)) {
 				md_error(rdev->mddev, rdev);
-				if (!test_bit(Faulty, &rdev->flags))
-					/* This is the only remaining device,
-					 * We need to retry the write without
-					 * FailFast
-					 */
-					set_bit(R10BIO_WriteError, &r10_bio->state);
-				else {
-					r10_bio->devs[slot].bio = NULL;
-					to_put = bio;
-					dec_rdev = 1;
-				}
-			} else
+			}
+
+			/*
+			 * When the device is faulty, it is not necessary to
+			 * handle write error.
+			 */
+			if (!test_bit(Faulty, &rdev->flags))
 				set_bit(R10BIO_WriteError, &r10_bio->state);
+			else {
+				/* Fail the request */
+				set_bit(R10BIO_Degraded, &r10_bio->state);
+				r10_bio->devs[slot].bio = NULL;
+				to_put = bio;
+				dec_rdev = 1;
+			}
 		}
 	} else {
 		/*
@@ -745,15 +709,19 @@
 	int sectors = r10_bio->sectors;
 	int best_good_sectors;
 	sector_t new_distance, best_dist;
-	struct md_rdev *best_rdev, *rdev = NULL;
+	struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
 	int do_balance;
-	int best_slot;
+	int best_dist_slot, best_pending_slot;
+	bool has_nonrot_disk = false;
+	unsigned int min_pending;
 	struct geom *geo = &conf->geo;

 	raid10_find_phys(conf, r10_bio);
 	rcu_read_lock();
-	best_slot = -1;
-	best_rdev = NULL;
+	best_dist_slot = -1;
+	min_pending = UINT_MAX;
+	best_dist_rdev = NULL;
+	best_pending_rdev = NULL;
 	best_dist = MaxSector;
 	best_good_sectors = 0;
 	do_balance = 1;
@@ -775,14 +743,24 @@
 		sector_t first_bad;
 		int bad_sectors;
 		sector_t dev_sector;
+		unsigned int pending;
+		bool nonrot;

 		if (r10_bio->devs[slot].bio == IO_BLOCKED)
 			continue;
 		disk = r10_bio->devs[slot].devnum;
 		rdev = rcu_dereference(conf->mirrors[disk].replacement);
 		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
-		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
+		    r10_bio->devs[slot].addr + sectors >
+		    rdev->recovery_offset) {
+			/*
+			 * Read replacement first to prevent reading both rdev
+			 * and replacement as NULL during replacement replace
+			 * rdev.
+			 */
+			smp_mb();
 			rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		}
 		if (rdev == NULL ||
 		    test_bit(Faulty, &rdev->flags))
 			continue;
@@ -811,8 +789,8 @@
 					first_bad - dev_sector;
 				if (good_sectors > best_good_sectors) {
 					best_good_sectors = good_sectors;
-					best_slot = slot;
-					best_rdev = rdev;
+					best_dist_slot = slot;
+					best_dist_rdev = rdev;
 				}
 				if (!do_balance)
 					/* Must read from here */
@@ -825,14 +803,23 @@
 		if (!do_balance)
 			break;

-		if (best_slot >= 0)
+		nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+		has_nonrot_disk |= nonrot;
+		pending = atomic_read(&rdev->nr_pending);
+		if (min_pending > pending && nonrot) {
+			min_pending = pending;
+			best_pending_slot = slot;
+			best_pending_rdev = rdev;
+		}
+
+		if (best_dist_slot >= 0)
 			/* At least 2 disks to choose from so failfast is OK */
 			set_bit(R10BIO_FailFast, &r10_bio->state);
 		/* This optimisation is debatable, and completely destroys
 		 * sequential read speed for 'far copies' arrays. So only
 		 * keep it for 'near' arrays, and review those later.
 		 */
-		if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
+		if (geo->near_copies > 1 && !pending)
 			new_distance = 0;

 		/* for far > 1 always use the lowest address */
@@ -841,15 +828,21 @@
 		else
 			new_distance = abs(r10_bio->devs[slot].addr -
 					   conf->mirrors[disk].head_position);
+
 		if (new_distance < best_dist) {
 			best_dist = new_distance;
-			best_slot = slot;
-			best_rdev = rdev;
+			best_dist_slot = slot;
+			best_dist_rdev = rdev;
 		}
 	}
 	if (slot >= conf->copies) {
-		slot = best_slot;
-		rdev = best_rdev;
+		if (has_nonrot_disk) {
+			slot = best_pending_slot;
+			rdev = best_pending_rdev;
+		} else {
+			slot = best_dist_slot;
+			rdev = best_dist_rdev;
+		}
 	}

 	if (slot >= 0) {
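The read_balance() hunks above switch the selection policy: when at least one mirror sits on a non-rotational device, the copy with the fewest in-flight requests (nr_pending) wins; otherwise the classic shortest-head-distance copy is kept. A minimal sketch of that final decision, reusing the patch's variable names but wrapped in a hypothetical helper (kernel-style C, illustrative only, not code from this patch):

	static int choose_read_slot(bool has_nonrot_disk,
				    int best_pending_slot, int best_dist_slot)
	{
		/* An SSD is present: seek distance is meaningless, take the least-loaded copy. */
		if (has_nonrot_disk)
			return best_pending_slot;
		/* All rotational: take the copy with the shortest head movement. */
		return best_dist_slot;
	}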
@@ -861,31 +854,6 @@
 	*max_sectors = best_good_sectors;

 	return rdev;
-}
-
-static int raid10_congested(struct mddev *mddev, int bits)
-{
-	struct r10conf *conf = mddev->private;
-	int i, ret = 0;
-
-	if ((bits & (1 << WB_async_congested)) &&
-	    conf->pending_count >= max_queued_requests)
-		return 1;
-
-	rcu_read_lock();
-	for (i = 0;
-	     (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
-		     && ret == 0;
-	     i++) {
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags)) {
-			struct request_queue *q = bdev_get_queue(rdev->bdev);
-
-			ret |= bdi_congested(q->backing_dev_info, bits);
-		}
-	}
-	rcu_read_unlock();
-	return ret;
 }

 static void flush_pending_writes(struct r10conf *conf)
@@ -932,8 +900,9 @@
 				/* Just ignore it */
 				bio_endio(bio);
 			else
-				generic_make_request(bio);
+				submit_bio_noacct(bio);
 			bio = next;
+			cond_resched();
 		}
 		blk_finish_plug(&plug);
 	} else
@@ -995,6 +964,7 @@
 {
 	spin_lock_irq(&conf->resync_lock);
 	if (conf->barrier) {
+		struct bio_list *bio_list = current->bio_list;
 		conf->nr_waiting++;
 		/* Wait for the barrier to drop.
 		 * However if there are already pending
@@ -1009,9 +979,16 @@
 		wait_event_lock_irq(conf->wait_barrier,
 				    !conf->barrier ||
 				    (atomic_read(&conf->nr_pending) &&
-				     current->bio_list &&
-				     (!bio_list_empty(&current->bio_list[0]) ||
-				      !bio_list_empty(&current->bio_list[1]))),
+				     bio_list &&
+				     (!bio_list_empty(&bio_list[0]) ||
+				      !bio_list_empty(&bio_list[1]))) ||
+				     /* move on if recovery thread is
+				      * blocked by us
+				      */
+				     (conf->mddev->thread->tsk == current &&
+				      test_bit(MD_RECOVERY_RUNNING,
+					       &conf->mddev->recovery) &&
+				      conf->nr_queued > 0),
 				    conf->resync_lock);
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
@@ -1117,10 +1094,34 @@
 			/* Just ignore it */
 			bio_endio(bio);
 		else
-			generic_make_request(bio);
+			submit_bio_noacct(bio);
 		bio = next;
+		cond_resched();
 	}
 	kfree(plug);
+}
+
+/*
+ * 1. Register the new request and wait if the reconstruction thread has put
+ * up a bar for new requests. Continue immediately if no resync is active
+ * currently.
+ * 2. If IO spans the reshape position. Need to wait for reshape to pass.
+ */
+static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
+				 struct bio *bio, sector_t sectors)
+{
+	wait_barrier(conf);
+	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+	    bio->bi_iter.bi_sector < conf->reshape_progress &&
+	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
+		raid10_log(conf->mddev, "wait reshape");
+		allow_barrier(conf);
+		wait_event(conf->wait_barrier,
+			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
+			   conf->reshape_progress >= bio->bi_iter.bi_sector +
+			   sectors);
+		wait_barrier(conf);
+	}
 }

 static void raid10_read_request(struct mddev *mddev, struct bio *bio,
@@ -1131,7 +1132,6 @@
 	const int op = bio_op(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
 	int max_sectors;
-	sector_t sectors;
 	struct md_rdev *rdev;
 	char b[BDEVNAME_SIZE];
 	int slot = r10_bio->read_slot;
@@ -1165,30 +1165,8 @@
 		}
 		rcu_read_unlock();
 	}
-	/*
-	 * Register the new request and wait if the reconstruction
-	 * thread has put up a bar for new requests.
-	 * Continue immediately if no resync is active currently.
-	 */
-	wait_barrier(conf);

-	sectors = r10_bio->sectors;
-	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-	    bio->bi_iter.bi_sector < conf->reshape_progress &&
-	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
-		/*
-		 * IO spans the reshape position. Need to wait for reshape to
-		 * pass
-		 */
-		raid10_log(conf->mddev, "wait reshape");
-		allow_barrier(conf);
-		wait_event(conf->wait_barrier,
-			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
-			   conf->reshape_progress >= bio->bi_iter.bi_sector +
-			   sectors);
-		wait_barrier(conf);
-	}
-
+	regular_request_wait(mddev, conf, bio, r10_bio->sectors);
 	rdev = read_balance(conf, r10_bio, &max_sectors);
 	if (!rdev) {
 		if (err_rdev) {
@@ -1209,7 +1187,7 @@
 					      gfp, &conf->bio_split);
 		bio_chain(split, bio);
 		allow_barrier(conf);
-		generic_make_request(bio);
+		submit_bio_noacct(bio);
 		wait_barrier(conf);
 		bio = split;
 		r10_bio->master_bio = bio;
@@ -1236,7 +1214,7 @@
 		trace_block_bio_remap(read_bio->bi_disk->queue,
 				      read_bio, disk_devt(mddev->gendisk),
 				      r10_bio->sector);
-	generic_make_request(read_bio);
+	submit_bio_noacct(read_bio);
 	return;
 }

@@ -1333,30 +1311,8 @@
 		finish_wait(&conf->wait_barrier, &w);
 	}

-	/*
-	 * Register the new request and wait if the reconstruction
-	 * thread has put up a bar for new requests.
-	 * Continue immediately if no resync is active currently.
-	 */
-	wait_barrier(conf);
-
 	sectors = r10_bio->sectors;
-	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-	    bio->bi_iter.bi_sector < conf->reshape_progress &&
-	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
-		/*
-		 * IO spans the reshape position. Need to wait for reshape to
-		 * pass
-		 */
-		raid10_log(conf->mddev, "wait reshape");
-		allow_barrier(conf);
-		wait_event(conf->wait_barrier,
-			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
-			   conf->reshape_progress >= bio->bi_iter.bi_sector +
-			   sectors);
-		wait_barrier(conf);
-	}
-
+	regular_request_wait(mddev, conf, bio, sectors);
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    (mddev->reshape_backwards
 	     ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
@@ -1400,9 +1356,15 @@

 	for (i = 0; i < conf->copies; i++) {
 		int d = r10_bio->devs[i].devnum;
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
-		struct md_rdev *rrdev = rcu_dereference(
-			conf->mirrors[d].replacement);
+		struct md_rdev *rdev, *rrdev;
+
+		rrdev = rcu_dereference(conf->mirrors[d].replacement);
+		/*
+		 * Read replacement first to prevent reading both rdev and
+		 * replacement as NULL during replacement replace rdev.
+		 */
+		smp_mb();
+		rdev = rcu_dereference(conf->mirrors[d].rdev);
 		if (rdev == rrdev)
 			rrdev = NULL;
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
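The smp_mb() added here, and the matching one in read_balance() above, order the two RCU reads so that replacement is always read before rdev. This pairs with the update side in the md hot-replace path, where the replacement is promoted to rdev and only then is the replacement pointer cleared, with a barrier in between. The shape of the pairing, as a sketch rather than the exact surrounding code:

	/* updater (remove/replace path): */
	p->rdev = p->replacement;
	smp_mb();			/* publish the new rdev first */
	p->replacement = NULL;

	/* reader (this patch): replacement first, then rdev */
	rrdev = rcu_dereference(p->replacement);
	smp_mb();			/* pairs with the updater's barrier */
	rdev = rcu_dereference(p->rdev);

A racing reader may observe both pointers naming the same device, but can never observe both as NULL.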
@@ -1516,7 +1478,7 @@
 					      GFP_NOIO, &conf->bio_split);
 		bio_chain(split, bio);
 		allow_barrier(conf);
-		generic_make_request(bio);
+		submit_bio_noacct(bio);
 		wait_barrier(conf);
 		bio = split;
 		r10_bio->master_bio = bio;
@@ -1677,12 +1639,12 @@

 	/*
 	 * If it is not operational, then we have already marked it as dead
-	 * else if it is the last working disks, ignore the error, let the
-	 * next level up know.
+	 * else if it is the last working disks with "fail_last_dev == false",
+	 * ignore the error, let the next level up know.
 	 * else mark the drive as failed
 	 */
 	spin_lock_irqsave(&conf->device_lock, flags);
-	if (test_bit(In_sync, &rdev->flags)
+	if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
 	    && !enough(conf, rdev->raid_disk)) {
 		/*
 		 * Don't fail the drive, just return an IO error.
@@ -1863,9 +1825,12 @@
 	int err = 0;
 	int number = rdev->raid_disk;
 	struct md_rdev **rdevp;
-	struct raid10_info *p = conf->mirrors + number;
+	struct raid10_info *p;

 	print_conf(conf);
+	if (unlikely(number >= mddev->raid_disks))
+		return 0;
+	p = conf->mirrors + number;
 	if (rdev == p->rdev)
 		rdevp = &p->rdev;
 	else if (rdev == p->replacement)
@@ -2137,7 +2102,7 @@
 			tbio->bi_opf |= MD_FAILFAST;
 		tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
 		bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
-		generic_make_request(tbio);
+		submit_bio_noacct(tbio);
 	}

 	/* Now write out to any replacement devices
@@ -2156,7 +2121,7 @@
 		atomic_inc(&r10_bio->remaining);
 		md_sync_acct(conf->mirrors[d].replacement->bdev,
 			     bio_sectors(tbio));
-		generic_make_request(tbio);
+		submit_bio_noacct(tbio);
 	}

 done:
@@ -2263,11 +2228,22 @@
 {
 	struct r10conf *conf = mddev->private;
 	int d;
-	struct bio *wbio, *wbio2;
+	struct bio *wbio = r10_bio->devs[1].bio;
+	struct bio *wbio2 = r10_bio->devs[1].repl_bio;
+
+	/* Need to test wbio2->bi_end_io before we call
+	 * submit_bio_noacct as if the former is NULL,
+	 * the latter is free to free wbio2.
+	 */
+	if (wbio2 && !wbio2->bi_end_io)
+		wbio2 = NULL;

 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
 		fix_recovery_read_error(r10_bio);
-		end_sync_request(r10_bio);
+		if (wbio->bi_end_io)
+			end_sync_request(r10_bio);
+		if (wbio2)
+			end_sync_request(r10_bio);
 		return;
 	}

@@ -2276,24 +2252,16 @@
 	 * and submit the write request
 	 */
 	d = r10_bio->devs[1].devnum;
-	wbio = r10_bio->devs[1].bio;
-	wbio2 = r10_bio->devs[1].repl_bio;
-	/* Need to test wbio2->bi_end_io before we call
-	 * generic_make_request as if the former is NULL,
-	 * the latter is free to free wbio2.
-	 */
-	if (wbio2 && !wbio2->bi_end_io)
-		wbio2 = NULL;
 	if (wbio->bi_end_io) {
 		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
-		generic_make_request(wbio);
+		submit_bio_noacct(wbio);
 	}
 	if (wbio2) {
 		atomic_inc(&conf->mirrors[d].replacement->nr_pending);
 		md_sync_acct(conf->mirrors[d].replacement->bdev,
 			     bio_sectors(wbio2));
-		generic_make_request(wbio2);
+		submit_bio_noacct(wbio2);
 	}
 }

@@ -2927,7 +2895,7 @@
  * a number of r10_bio structures, one for each out-of-sync device.
  * As we setup these structures, we collect all bio's together into a list
  * which we then process collectively to add pages, and then process again
- * to pass to generic_make_request.
+ * to pass to submit_bio_noacct.
  *
  * The r10_bio structures are linked using a borrowed master_bio pointer.
  * This link is counted in ->remaining. When the r10_bio that points to NULL
@@ -2951,10 +2919,6 @@
 	sector_t chunk_mask = conf->geo.chunk_mask;
 	int page_idx = 0;

-	if (!mempool_initialized(&conf->r10buf_pool))
-		if (init_resync(conf))
-			return 0;
-
 	/*
 	 * Allow skipping a full rebuild for incremental assembly
 	 * of a clean array, like RAID1 does.
@@ -2969,6 +2933,10 @@
 		*skipped = 1;
 		return mddev->dev_sectors - sector_nr;
 	}
+
+	if (!mempool_initialized(&conf->r10buf_pool))
+		if (init_resync(conf))
+			return 0;

 skipped:
 	max_sector = mddev->dev_sectors;
@@ -3084,6 +3052,7 @@
 			sector_t sect;
 			int must_sync;
 			int any_working;
+			int need_recover = 0;
 			struct raid10_info *mirror = &conf->mirrors[i];
 			struct md_rdev *mrdev, *mreplace;

@@ -3091,11 +3060,14 @@
 			mrdev = rcu_dereference(mirror->rdev);
 			mreplace = rcu_dereference(mirror->replacement);

-			if ((mrdev == NULL ||
-			     test_bit(Faulty, &mrdev->flags) ||
-			     test_bit(In_sync, &mrdev->flags)) &&
-			    (mreplace == NULL ||
-			     test_bit(Faulty, &mreplace->flags))) {
+			if (mrdev != NULL &&
+			    !test_bit(Faulty, &mrdev->flags) &&
+			    !test_bit(In_sync, &mrdev->flags))
+				need_recover = 1;
+			if (mreplace && test_bit(Faulty, &mreplace->flags))
+				mreplace = NULL;
+
+			if (!need_recover && !mreplace) {
 				rcu_read_unlock();
 				continue;
 			}
@@ -3111,8 +3083,6 @@
 				rcu_read_unlock();
 				continue;
 			}
-			if (mreplace && test_bit(Faulty, &mreplace->flags))
-				mreplace = NULL;
 			/* Unless we are doing a full sync, or a replacement
 			 * we only need to recover the block if it is set in
 			 * the bitmap
@@ -3218,7 +3188,7 @@
 				r10_bio->devs[1].devnum = i;
 				r10_bio->devs[1].addr = to_addr;

-				if (!test_bit(In_sync, &mrdev->flags)) {
+				if (need_recover) {
 					bio = r10_bio->devs[1].bio;
 					bio->bi_next = biolist;
 					biolist = bio;
@@ -3235,16 +3205,11 @@
 				bio = r10_bio->devs[1].repl_bio;
 				if (bio)
 					bio->bi_end_io = NULL;
-				/* Note: if mreplace != NULL, then bio
+				/* Note: if replace is not NULL, then bio
 				 * cannot be NULL as r10buf_pool_alloc will
 				 * have allocated it.
-				 * So the second test here is pointless.
-				 * But it keeps semantic-checkers happy, and
-				 * this comment keeps human reviewers
-				 * happy.
 				 */
-				if (mreplace == NULL || bio == NULL ||
-				    test_bit(Faulty, &mreplace->flags))
+				if (!mreplace)
 					break;
 				bio->bi_next = biolist;
 				biolist = bio;
@@ -3533,7 +3498,7 @@
 		if (bio->bi_end_io == end_sync_read) {
 			md_sync_acct_bio(bio, nr_sectors);
 			bio->bi_status = 0;
-			generic_make_request(bio);
+			submit_bio_noacct(bio);
 		}
 	}

@@ -3665,6 +3630,20 @@
 	return nc*fc;
 }

+static void raid10_free_conf(struct r10conf *conf)
+{
+	if (!conf)
+		return;
+
+	mempool_exit(&conf->r10bio_pool);
+	kfree(conf->mirrors);
+	kfree(conf->mirrors_old);
+	kfree(conf->mirrors_new);
+	safe_put_page(conf->tmppage);
+	bioset_exit(&conf->bio_split);
+	kfree(conf);
+}
+
 static struct r10conf *setup_conf(struct mddev *mddev)
 {
 	struct r10conf *conf = NULL;
@@ -3704,8 +3683,8 @@

 	conf->geo = geo;
 	conf->copies = copies;
-	err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc,
-			   r10bio_pool_free, conf);
+	err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
+			   rbio_pool_free, conf);
 	if (err)
 		goto out;

@@ -3747,20 +3726,24 @@
 	return conf;

 out:
-	if (conf) {
-		mempool_exit(&conf->r10bio_pool);
-		kfree(conf->mirrors);
-		safe_put_page(conf->tmppage);
-		bioset_exit(&conf->bio_split);
-		kfree(conf);
-	}
+	raid10_free_conf(conf);
 	return ERR_PTR(err);
+}
+
+static void raid10_set_io_opt(struct r10conf *conf)
+{
+	int raid_disks = conf->geo.raid_disks;
+
+	if (!(conf->geo.raid_disks % conf->geo.near_copies))
+		raid_disks /= conf->geo.near_copies;
+	blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
+			 raid_disks);
 }

 static int raid10_run(struct mddev *mddev)
 {
 	struct r10conf *conf;
-	int i, disk_idx, chunk_size;
+	int i, disk_idx;
 	struct raid10_info *disk;
 	struct md_rdev *rdev;
 	sector_t size;
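raid10_set_io_opt() centralises the io_opt calculation that raid10_run() and end_reshape() previously open-coded: optimal I/O size = chunk size * number of data-bearing stripes. A worked example under assumed geometry (not taken from the patch): with 4 member disks, near_copies = 2 and 512 KiB chunks, 4 % 2 == 0, so raid_disks becomes 4 / 2 = 2 and io_opt = 512 KiB * 2 = 1 MiB; with 3 disks and near_copies = 2 the division is skipped and io_opt = 512 KiB * 3 = 1.5 MiB, matching the behaviour of the removed open-coded branches.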
@@ -3781,6 +3764,9 @@
 	if (!conf)
 		goto out;

+	mddev->thread = conf->thread;
+	conf->thread = NULL;
+
 	if (mddev_is_clustered(conf->mddev)) {
 		int fc, fo;

@@ -3793,21 +3779,13 @@
 		}
 	}

-	mddev->thread = conf->thread;
-	conf->thread = NULL;
-
-	chunk_size = mddev->chunk_sectors << 9;
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
 					      mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
-		blk_queue_io_min(mddev->queue, chunk_size);
-		if (conf->geo.raid_disks % conf->geo.near_copies)
-			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
-		else
-			blk_queue_io_opt(mddev->queue, chunk_size *
-					 (conf->geo.raid_disks / conf->geo.near_copies));
+		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+		raid10_set_io_opt(conf);
 	}

 	rdev_for_each(rdev, mddev) {
@@ -3922,19 +3900,6 @@
 	mddev->resync_max_sectors = size;
 	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);

-	if (mddev->queue) {
-		int stripe = conf->geo.raid_disks *
-			((mddev->chunk_sectors << 9) / PAGE_SIZE);
-
-		/* Calculate max read-ahead size.
-		 * We need to readahead at least twice a whole stripe....
-		 * maybe...
-		 */
-		stripe /= conf->geo.near_copies;
-		if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
-			mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
-	}
-
 	if (md_integrity_register(mddev))
 		goto out_free_conf;

@@ -3967,10 +3932,7 @@

 out_free_conf:
 	md_unregister_thread(&mddev->thread);
-	mempool_exit(&conf->r10bio_pool);
-	safe_put_page(conf->tmppage);
-	kfree(conf->mirrors);
-	kfree(conf);
+	raid10_free_conf(conf);
 	mddev->private = NULL;
 out:
 	return -EIO;
@@ -3978,15 +3940,7 @@

 static void raid10_free(struct mddev *mddev, void *priv)
 {
-	struct r10conf *conf = priv;
-
-	mempool_exit(&conf->r10bio_pool);
-	safe_put_page(conf->tmppage);
-	kfree(conf->mirrors);
-	kfree(conf->mirrors_old);
-	kfree(conf->mirrors_new);
-	bioset_exit(&conf->bio_split);
-	kfree(conf);
+	raid10_free_conf(priv);
 }

 static void raid10_quiesce(struct mddev *mddev, int quiesce)
@@ -4293,12 +4247,46 @@
 	spin_unlock_irq(&conf->device_lock);

 	if (mddev->delta_disks && mddev->bitmap) {
-		ret = md_bitmap_resize(mddev->bitmap,
-				       raid10_size(mddev, 0, conf->geo.raid_disks),
-				       0, 0);
+		struct mdp_superblock_1 *sb = NULL;
+		sector_t oldsize, newsize;
+
+		oldsize = raid10_size(mddev, 0, 0);
+		newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
+
+		if (!mddev_is_clustered(mddev)) {
+			ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
+			if (ret)
+				goto abort;
+			else
+				goto out;
+		}
+
+		rdev_for_each(rdev, mddev) {
+			if (rdev->raid_disk > -1 &&
+			    !test_bit(Faulty, &rdev->flags))
+				sb = page_address(rdev->sb_page);
+		}
+
+		/*
+		 * some node is already performing reshape, and no need to
+		 * call md_bitmap_resize again since it should be called when
+		 * receiving BITMAP_RESIZE msg
+		 */
+		if ((sb && (le32_to_cpu(sb->feature_map) &
+			    MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
+			goto out;
+
+		ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
 		if (ret)
 			goto abort;
+
+		ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
+		if (ret) {
+			md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
+			goto abort;
+		}
 	}
+out:
 	if (mddev->delta_disks > 0) {
 		rdev_for_each(rdev, mddev)
 			if (rdev->raid_disk < 0 &&
@@ -4310,8 +4298,8 @@
 				else
 					rdev->recovery_offset = 0;

-				if (sysfs_link_rdev(mddev, rdev))
-					/* Failure here is OK */;
+				/* Failure here is OK */
+				sysfs_link_rdev(mddev, rdev);
 			}
 		} else if (rdev->raid_disk >= conf->prev.raid_disks
 			   && !test_bit(Faulty, &rdev->flags)) {
@@ -4457,7 +4445,7 @@
 	sector_nr = conf->reshape_progress;
 	if (sector_nr) {
 		mddev->curr_resync_completed = sector_nr;
-		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+		sysfs_notify_dirent_safe(mddev->sysfs_completed);
 		*skipped = 1;
 		return sector_nr;
 	}
@@ -4486,8 +4474,8 @@
 		last = conf->reshape_progress - 1;
 		sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
 					       & conf->prev.chunk_mask);
-		if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
-			sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
+		if (sector_nr + RESYNC_SECTORS < last)
+			sector_nr = last + 1 - RESYNC_SECTORS;
 	} else {
 		/* 'next' is after the last device address that we
 		 * might write to for this chunk in the new layout
@@ -4509,8 +4497,8 @@
 		last = sector_nr | (conf->geo.chunk_mask
 				    & conf->prev.chunk_mask);

-		if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
-			last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
+		if (sector_nr + RESYNC_SECTORS <= last)
+			last = sector_nr + RESYNC_SECTORS - 1;
 	}

 	if (need_flush ||
@@ -4575,6 +4563,32 @@
 	r10_bio->master_bio = read_bio;
 	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;

+	/*
+	 * Broadcast RESYNC message to other nodes, so all nodes would not
+	 * write to the region to avoid conflict.
+	 */
+	if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
+		struct mdp_superblock_1 *sb = NULL;
+		int sb_reshape_pos = 0;
+
+		conf->cluster_sync_low = sector_nr;
+		conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
+		sb = page_address(rdev->sb_page);
+		if (sb) {
+			sb_reshape_pos = le64_to_cpu(sb->reshape_position);
+			/*
+			 * Set cluster_sync_low again if next address for array
+			 * reshape is less than cluster_sync_low. Since we can't
+			 * update cluster_sync_low until it has finished reshape.
+			 */
+			if (sb_reshape_pos < conf->cluster_sync_low)
+				conf->cluster_sync_low = sb_reshape_pos;
+		}
+
+		md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
+						   conf->cluster_sync_high);
+	}
+
 	/* Now find the locations in the new layout */
 	__raid10_find_phys(&conf->geo, r10_bio);

@@ -4631,7 +4645,7 @@
 	md_sync_acct_bio(read_bio, r10_bio->sectors);
 	atomic_inc(&r10_bio->remaining);
 	read_bio->bi_next = NULL;
-	generic_make_request(read_bio);
+	submit_bio_noacct(read_bio);
 	sectors_done += nr_sectors;
 	if (sector_nr <= last)
 		goto read_more;
@@ -4694,7 +4708,7 @@
 		md_sync_acct_bio(b, r10_bio->sectors);
 		atomic_inc(&r10_bio->remaining);
 		b->bi_next = NULL;
-		generic_make_request(b);
+		submit_bio_noacct(b);
 	}
 	end_reshape_request(r10_bio);
 }
@@ -4712,17 +4726,22 @@
 	conf->reshape_safe = MaxSector;
 	spin_unlock_irq(&conf->device_lock);

-	/* read-ahead size must cover two whole stripes, which is
-	 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
-	 */
-	if (conf->mddev->queue) {
-		int stripe = conf->geo.raid_disks *
-			((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
-		stripe /= conf->geo.near_copies;
-		if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
-			conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
-	}
+	if (conf->mddev->queue)
+		raid10_set_io_opt(conf);
 	conf->fullsync = 0;
+}
+
+static void raid10_update_reshape_pos(struct mddev *mddev)
+{
+	struct r10conf *conf = mddev->private;
+	sector_t lo, hi;
+
+	md_cluster_ops->resync_info_get(mddev, &lo, &hi);
+	if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
+	    || mddev->reshape_position == MaxSector)
+		conf->reshape_progress = mddev->reshape_position;
+	else
+		WARN_ON_ONCE(1);
 }

47284747 static int handle_reshape_read_error(struct mddev *mddev,
....@@ -4736,8 +4755,7 @@
47364755 int idx = 0;
47374756 struct page **pages;
47384757
4739
- r10b = kmalloc(sizeof(*r10b) +
4740
- sizeof(struct r10dev) * conf->copies, GFP_NOIO);
4758
+ r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
47414759 if (!r10b) {
47424760 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
47434761 return -ENOMEM;
@@ -4893,7 +4911,7 @@
 	.check_reshape	= raid10_check_reshape,
 	.start_reshape	= raid10_start_reshape,
 	.finish_reshape	= raid10_finish_reshape,
-	.congested	= raid10_congested,
+	.update_reshape_pos = raid10_update_reshape_pos,
 };

 static int __init raid_init(void)