forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-11 072de836f53be56a70cecf70b43ae43b7ce17376
kernel/drivers/md/raid10.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * raid10.c : Multiple Devices driver for Linux
  *
@@ -6,16 +7,6 @@
  * RAID-10 support for md.
  *
  * Base on code in raid1.c. See raid1.c for further copyright information.
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */

 #include <linux/slab.h>
@@ -25,6 +16,7 @@
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
+#include <linux/raid/md_p.h>
 #include <trace/events/block.h>
 #include "md.h"
 #include "raid10.h"
@@ -72,31 +64,6 @@
  * [B A] [D C]    [B A] [E C D]
  */

-/*
- * Number of guaranteed r10bios in case of extreme VM load:
- */
-#define NR_RAID10_BIOS 256
-
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio *)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting devs[n].bio to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
-/* When there are this many requests queued to be written by
- * the raid10 thread, we become 'congested' to provide back-pressure
- * for writeback.
- */
-static int max_queued_requests = 1024;
-
 static void allow_barrier(struct r10conf *conf);
 static void lower_barrier(struct r10conf *conf);
 static int _enough(struct r10conf *conf, int previous, int ignore);
@@ -129,11 +96,6 @@
 	/* allocate a r10bio with room for raid_disks entries in the
 	 * bios array */
 	return kzalloc(size, gfp_flags);
-}
-
-static void r10bio_pool_free(void *r10_bio, void *data)
-{
-	kfree(r10_bio);
 }

 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
@@ -241,7 +203,7 @@
 	}
 	kfree(rps);
 out_free_r10bio:
-	r10bio_pool_free(r10_bio, conf);
+	rbio_pool_free(r10_bio, conf);
 	return NULL;
 }

@@ -269,7 +231,7 @@
 	/* resync pages array stored in the 1st bio's .bi_private */
 	kfree(rp);

-	r10bio_pool_free(r10bio, conf);
+	rbio_pool_free(r10bio, conf);
 }

 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
@@ -503,19 +465,21 @@
 			if (test_bit(FailFast, &rdev->flags) &&
 			    (bio->bi_opf & MD_FAILFAST)) {
 				md_error(rdev->mddev, rdev);
-				if (!test_bit(Faulty, &rdev->flags))
-					/* This is the only remaining device,
-					 * We need to retry the write without
-					 * FailFast
-					 */
-					set_bit(R10BIO_WriteError, &r10_bio->state);
-				else {
-					r10_bio->devs[slot].bio = NULL;
-					to_put = bio;
-					dec_rdev = 1;
-				}
-			} else
+			}
+
+			/*
+			 * When the device is faulty, it is not necessary to
+			 * handle write error.
+			 */
+			if (!test_bit(Faulty, &rdev->flags))
 				set_bit(R10BIO_WriteError, &r10_bio->state);
+			else {
+				/* Fail the request */
+				set_bit(R10BIO_Degraded, &r10_bio->state);
+				r10_bio->devs[slot].bio = NULL;
+				to_put = bio;
+				dec_rdev = 1;
+			}
 		}
 	} else {
 		/*
@@ -745,15 +709,19 @@
 	int sectors = r10_bio->sectors;
 	int best_good_sectors;
 	sector_t new_distance, best_dist;
-	struct md_rdev *best_rdev, *rdev = NULL;
+	struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
 	int do_balance;
-	int best_slot;
+	int best_dist_slot, best_pending_slot;
+	bool has_nonrot_disk = false;
+	unsigned int min_pending;
 	struct geom *geo = &conf->geo;

 	raid10_find_phys(conf, r10_bio);
 	rcu_read_lock();
-	best_slot = -1;
-	best_rdev = NULL;
+	best_dist_slot = -1;
+	min_pending = UINT_MAX;
+	best_dist_rdev = NULL;
+	best_pending_rdev = NULL;
 	best_dist = MaxSector;
 	best_good_sectors = 0;
 	do_balance = 1;
@@ -775,6 +743,8 @@
 		sector_t first_bad;
 		int bad_sectors;
 		sector_t dev_sector;
+		unsigned int pending;
+		bool nonrot;

 		if (r10_bio->devs[slot].bio == IO_BLOCKED)
 			continue;
@@ -811,8 +781,8 @@
 					first_bad - dev_sector;
 				if (good_sectors > best_good_sectors) {
 					best_good_sectors = good_sectors;
-					best_slot = slot;
-					best_rdev = rdev;
+					best_dist_slot = slot;
+					best_dist_rdev = rdev;
 				}
 				if (!do_balance)
 					/* Must read from here */
@@ -825,14 +795,23 @@
 		if (!do_balance)
 			break;

-		if (best_slot >= 0)
+		nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+		has_nonrot_disk |= nonrot;
+		pending = atomic_read(&rdev->nr_pending);
+		if (min_pending > pending && nonrot) {
+			min_pending = pending;
+			best_pending_slot = slot;
+			best_pending_rdev = rdev;
+		}
+
+		if (best_dist_slot >= 0)
 			/* At least 2 disks to choose from so failfast is OK */
 			set_bit(R10BIO_FailFast, &r10_bio->state);
 		/* This optimisation is debatable, and completely destroys
 		 * sequential read speed for 'far copies' arrays. So only
 		 * keep it for 'near' arrays, and review those later.
 		 */
-		if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
+		if (geo->near_copies > 1 && !pending)
 			new_distance = 0;

 		/* for far > 1 always use the lowest address */
@@ -841,15 +820,21 @@
 		else
 			new_distance = abs(r10_bio->devs[slot].addr -
 					   conf->mirrors[disk].head_position);
+
 		if (new_distance < best_dist) {
 			best_dist = new_distance;
-			best_slot = slot;
-			best_rdev = rdev;
+			best_dist_slot = slot;
+			best_dist_rdev = rdev;
 		}
 	}
 	if (slot >= conf->copies) {
-		slot = best_slot;
-		rdev = best_rdev;
+		if (has_nonrot_disk) {
+			slot = best_pending_slot;
+			rdev = best_pending_rdev;
+		} else {
+			slot = best_dist_slot;
+			rdev = best_dist_rdev;
+		}
 	}

 	if (slot >= 0) {
@@ -861,31 +846,6 @@
 	*max_sectors = best_good_sectors;

 	return rdev;
-}
-
-static int raid10_congested(struct mddev *mddev, int bits)
-{
-	struct r10conf *conf = mddev->private;
-	int i, ret = 0;
-
-	if ((bits & (1 << WB_async_congested)) &&
-	    conf->pending_count >= max_queued_requests)
-		return 1;
-
-	rcu_read_lock();
-	for (i = 0;
-	     (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
-	     && ret == 0;
-	     i++) {
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		if (rdev && !test_bit(Faulty, &rdev->flags)) {
-			struct request_queue *q = bdev_get_queue(rdev->bdev);
-
-			ret |= bdi_congested(q->backing_dev_info, bits);
-		}
-	}
-	rcu_read_unlock();
-	return ret;
 }

 static void flush_pending_writes(struct r10conf *conf)
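The read_balance() hunks above replace the single best_slot/best_rdev pair with two candidates: the copy with the shortest head distance and, among non-rotational devices, the copy with the fewest in-flight requests; if any copy sits on a non-rotational disk, the pending-based pick wins. A minimal standalone sketch of just that selection policy (plain C with a hypothetical slot_info type and pick_read_slot() helper, not the kernel code, and ignoring the bad-block and do_balance handling):

#include <limits.h>
#include <stdbool.h>

struct slot_info {
	bool nonrot;             /* copy lives on a non-rotational device */
	unsigned int pending;    /* in-flight requests on that device */
	unsigned long distance;  /* |target sector - current head position| */
};

/* Pick the copy to read from; returns -1 only if nslots == 0. */
int pick_read_slot(const struct slot_info *s, int nslots)
{
	int best_dist_slot = -1, best_pending_slot = -1;
	unsigned long best_dist = ULONG_MAX;
	unsigned int min_pending = UINT_MAX;
	bool has_nonrot_disk = false;

	for (int i = 0; i < nslots; i++) {
		has_nonrot_disk |= s[i].nonrot;
		if (s[i].nonrot && s[i].pending < min_pending) {
			min_pending = s[i].pending;
			best_pending_slot = i;
		}
		if (s[i].distance < best_dist) {
			best_dist = s[i].distance;
			best_dist_slot = i;
		}
	}
	/* SSDs present: balance by queue depth; otherwise minimise seeks. */
	return has_nonrot_disk ? best_pending_slot : best_dist_slot;
}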
@@ -932,7 +892,7 @@
 				/* Just ignore it */
 				bio_endio(bio);
 			else
-				generic_make_request(bio);
+				submit_bio_noacct(bio);
 			bio = next;
 		}
 		blk_finish_plug(&plug);
@@ -995,6 +955,7 @@
 {
 	spin_lock_irq(&conf->resync_lock);
 	if (conf->barrier) {
+		struct bio_list *bio_list = current->bio_list;
 		conf->nr_waiting++;
 		/* Wait for the barrier to drop.
 		 * However if there are already pending
@@ -1009,9 +970,16 @@
 		wait_event_lock_irq(conf->wait_barrier,
 				    !conf->barrier ||
 				    (atomic_read(&conf->nr_pending) &&
-				     current->bio_list &&
-				     (!bio_list_empty(&current->bio_list[0]) ||
-				      !bio_list_empty(&current->bio_list[1]))),
+				     bio_list &&
+				     (!bio_list_empty(&bio_list[0]) ||
+				      !bio_list_empty(&bio_list[1]))) ||
+				    /* move on if recovery thread is
+				     * blocked by us
+				     */
+				    (conf->mddev->thread->tsk == current &&
+				     test_bit(MD_RECOVERY_RUNNING,
+					      &conf->mddev->recovery) &&
+				     conf->nr_queued > 0),
 				    conf->resync_lock);
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
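The extra terms added to the wait_event_lock_irq() condition above guard against a self-deadlock: if raid10d itself calls wait_barrier() while failed bios it must retire are counted in conf->nr_queued, the barrier might never drop. As a rough standalone model of when a wait_barrier() caller may proceed instead of sleeping (hypothetical barrier_state type and may_proceed() helper, not the kernel code):

#include <stdbool.h>

struct barrier_state {
	bool barrier_raised;          /* resync/reshape holds the barrier */
	int nr_pending;               /* regular I/O already admitted */
	int nr_queued;                /* failed bios waiting for raid10d */
	bool caller_has_plugged_bios; /* current->bio_list is non-empty */
	bool caller_is_md_thread;     /* caller is the array's own raid10d */
	bool recovery_running;        /* MD_RECOVERY_RUNNING is set */
};

bool may_proceed(const struct barrier_state *st)
{
	if (!st->barrier_raised)
		return true;
	/* a submitter still holding plugged bios must not block forever */
	if (st->nr_pending && st->caller_has_plugged_bios)
		return true;
	/* raid10d must stay runnable so it can drain nr_queued */
	if (st->caller_is_md_thread && st->recovery_running && st->nr_queued > 0)
		return true;
	return false;
}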
@@ -1117,10 +1085,33 @@
 			/* Just ignore it */
 			bio_endio(bio);
 		else
-			generic_make_request(bio);
+			submit_bio_noacct(bio);
 		bio = next;
 	}
 	kfree(plug);
+}
+
+/*
+ * 1. Register the new request and wait if the reconstruction thread has put
+ * up a bar for new requests. Continue immediately if no resync is active
+ * currently.
+ * 2. If IO spans the reshape position. Need to wait for reshape to pass.
+ */
+static void regular_request_wait(struct mddev *mddev, struct r10conf *conf,
+				 struct bio *bio, sector_t sectors)
+{
+	wait_barrier(conf);
+	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+	    bio->bi_iter.bi_sector < conf->reshape_progress &&
+	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
+		raid10_log(conf->mddev, "wait reshape");
+		allow_barrier(conf);
+		wait_event(conf->wait_barrier,
+			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
+			   conf->reshape_progress >= bio->bi_iter.bi_sector +
+			   sectors);
+		wait_barrier(conf);
+	}
 }

 static void raid10_read_request(struct mddev *mddev, struct bio *bio,
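regular_request_wait(), introduced above, folds the barrier registration and the open-coded reshape wait that used to be duplicated in the read and write paths into one helper. The reshape part is an interval test: a request only has to wait while it straddles conf->reshape_progress. A standalone sketch of that predicate (hypothetical spans_reshape() name, not the kernel helper):

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* true while [start, start + sectors) crosses the reshape frontier */
bool spans_reshape(sector_t start, sector_t sectors, sector_t reshape_progress)
{
	return start < reshape_progress && start + sectors > reshape_progress;
}

/*
 * With reshape_progress = 1000:
 *   spans_reshape(990, 20, 1000)  -> true  (990..1010 crosses 1000)
 *   spans_reshape(1000, 20, 1000) -> false (starts at the frontier)
 *   spans_reshape(980, 20, 1000)  -> false (ends at the frontier)
 */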
@@ -1131,7 +1122,6 @@
 	const int op = bio_op(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
 	int max_sectors;
-	sector_t sectors;
 	struct md_rdev *rdev;
 	char b[BDEVNAME_SIZE];
 	int slot = r10_bio->read_slot;
@@ -1165,30 +1155,8 @@
 		}
 		rcu_read_unlock();
 	}
-	/*
-	 * Register the new request and wait if the reconstruction
-	 * thread has put up a bar for new requests.
-	 * Continue immediately if no resync is active currently.
-	 */
-	wait_barrier(conf);

-	sectors = r10_bio->sectors;
-	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-	    bio->bi_iter.bi_sector < conf->reshape_progress &&
-	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
-		/*
-		 * IO spans the reshape position. Need to wait for reshape to
-		 * pass
-		 */
-		raid10_log(conf->mddev, "wait reshape");
-		allow_barrier(conf);
-		wait_event(conf->wait_barrier,
-			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
-			   conf->reshape_progress >= bio->bi_iter.bi_sector +
-			   sectors);
-		wait_barrier(conf);
-	}
-
+	regular_request_wait(mddev, conf, bio, r10_bio->sectors);
 	rdev = read_balance(conf, r10_bio, &max_sectors);
 	if (!rdev) {
 		if (err_rdev) {
@@ -1209,7 +1177,7 @@
 					      gfp, &conf->bio_split);
 		bio_chain(split, bio);
 		allow_barrier(conf);
-		generic_make_request(bio);
+		submit_bio_noacct(bio);
 		wait_barrier(conf);
 		bio = split;
 		r10_bio->master_bio = bio;
@@ -1236,7 +1204,7 @@
 		trace_block_bio_remap(read_bio->bi_disk->queue,
 				      read_bio, disk_devt(mddev->gendisk),
 				      r10_bio->sector);
-	generic_make_request(read_bio);
+	submit_bio_noacct(read_bio);
 	return;
 }

@@ -1333,30 +1301,8 @@
 		finish_wait(&conf->wait_barrier, &w);
 	}

-	/*
-	 * Register the new request and wait if the reconstruction
-	 * thread has put up a bar for new requests.
-	 * Continue immediately if no resync is active currently.
-	 */
-	wait_barrier(conf);
-
 	sectors = r10_bio->sectors;
-	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-	    bio->bi_iter.bi_sector < conf->reshape_progress &&
-	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
-		/*
-		 * IO spans the reshape position. Need to wait for reshape to
-		 * pass
-		 */
-		raid10_log(conf->mddev, "wait reshape");
-		allow_barrier(conf);
-		wait_event(conf->wait_barrier,
-			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
-			   conf->reshape_progress >= bio->bi_iter.bi_sector +
-			   sectors);
-		wait_barrier(conf);
-	}
-
+	regular_request_wait(mddev, conf, bio, sectors);
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    (mddev->reshape_backwards
 	     ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
@@ -1516,7 +1462,7 @@
 					      GFP_NOIO, &conf->bio_split);
 		bio_chain(split, bio);
 		allow_barrier(conf);
-		generic_make_request(bio);
+		submit_bio_noacct(bio);
 		wait_barrier(conf);
 		bio = split;
 		r10_bio->master_bio = bio;
@@ -1677,12 +1623,12 @@

 	/*
 	 * If it is not operational, then we have already marked it as dead
-	 * else if it is the last working disks, ignore the error, let the
-	 * next level up know.
+	 * else if it is the last working disks with "fail_last_dev == false",
+	 * ignore the error, let the next level up know.
 	 * else mark the drive as failed
 	 */
 	spin_lock_irqsave(&conf->device_lock, flags);
-	if (test_bit(In_sync, &rdev->flags)
+	if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
 	    && !enough(conf, rdev->raid_disk)) {
 		/*
 		 * Don't fail the drive, just return an IO error.
@@ -1863,9 +1809,12 @@
 	int err = 0;
 	int number = rdev->raid_disk;
 	struct md_rdev **rdevp;
-	struct raid10_info *p = conf->mirrors + number;
+	struct raid10_info *p;

 	print_conf(conf);
+	if (unlikely(number >= mddev->raid_disks))
+		return 0;
+	p = conf->mirrors + number;
 	if (rdev == p->rdev)
 		rdevp = &p->rdev;
 	else if (rdev == p->replacement)
@@ -2137,7 +2086,7 @@
 			tbio->bi_opf |= MD_FAILFAST;
 		tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
 		bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
-		generic_make_request(tbio);
+		submit_bio_noacct(tbio);
 	}

 	/* Now write out to any replacement devices
@@ -2156,7 +2105,7 @@
 		atomic_inc(&r10_bio->remaining);
 		md_sync_acct(conf->mirrors[d].replacement->bdev,
 			     bio_sectors(tbio));
-		generic_make_request(tbio);
+		submit_bio_noacct(tbio);
 	}

 done:
@@ -2279,7 +2228,7 @@
 	wbio = r10_bio->devs[1].bio;
 	wbio2 = r10_bio->devs[1].repl_bio;
 	/* Need to test wbio2->bi_end_io before we call
-	 * generic_make_request as if the former is NULL,
+	 * submit_bio_noacct as if the former is NULL,
 	 * the latter is free to free wbio2.
 	 */
 	if (wbio2 && !wbio2->bi_end_io)
@@ -2287,13 +2236,13 @@
 	if (wbio->bi_end_io) {
 		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
-		generic_make_request(wbio);
+		submit_bio_noacct(wbio);
 	}
 	if (wbio2) {
 		atomic_inc(&conf->mirrors[d].replacement->nr_pending);
 		md_sync_acct(conf->mirrors[d].replacement->bdev,
 			     bio_sectors(wbio2));
-		generic_make_request(wbio2);
+		submit_bio_noacct(wbio2);
 	}
 }

@@ -2927,7 +2876,7 @@
  * a number of r10_bio structures, one for each out-of-sync device.
  * As we setup these structures, we collect all bio's together into a list
  * which we then process collectively to add pages, and then process again
- * to pass to generic_make_request.
+ * to pass to submit_bio_noacct.
  *
  * The r10_bio structures are linked using a borrowed master_bio pointer.
  * This link is counted in ->remaining. When the r10_bio that points to NULL
@@ -3084,6 +3033,8 @@
 			sector_t sect;
 			int must_sync;
 			int any_working;
+			int need_recover = 0;
+			int need_replace = 0;
 			struct raid10_info *mirror = &conf->mirrors[i];
 			struct md_rdev *mrdev, *mreplace;

@@ -3091,11 +3042,15 @@
 			mrdev = rcu_dereference(mirror->rdev);
 			mreplace = rcu_dereference(mirror->replacement);

-			if ((mrdev == NULL ||
-			     test_bit(Faulty, &mrdev->flags) ||
-			     test_bit(In_sync, &mrdev->flags)) &&
-			    (mreplace == NULL ||
-			     test_bit(Faulty, &mreplace->flags))) {
+			if (mrdev != NULL &&
+			    !test_bit(Faulty, &mrdev->flags) &&
+			    !test_bit(In_sync, &mrdev->flags))
+				need_recover = 1;
+			if (mreplace != NULL &&
+			    !test_bit(Faulty, &mreplace->flags))
+				need_replace = 1;
+
+			if (!need_recover && !need_replace) {
 				rcu_read_unlock();
 				continue;
 			}
@@ -3218,7 +3173,7 @@
 				r10_bio->devs[1].devnum = i;
 				r10_bio->devs[1].addr = to_addr;

-				if (!test_bit(In_sync, &mrdev->flags)) {
+				if (need_recover) {
 					bio = r10_bio->devs[1].bio;
 					bio->bi_next = biolist;
 					biolist = bio;
@@ -3235,16 +3190,11 @@
 				bio = r10_bio->devs[1].repl_bio;
 				if (bio)
 					bio->bi_end_io = NULL;
-				/* Note: if mreplace != NULL, then bio
+				/* Note: if need_replace, then bio
 				 * cannot be NULL as r10buf_pool_alloc will
 				 * have allocated it.
-				 * So the second test here is pointless.
-				 * But it keeps semantic-checkers happy, and
-				 * this comment keeps human reviewers
-				 * happy.
 				 */
-				if (mreplace == NULL || bio == NULL ||
-				    test_bit(Faulty, &mreplace->flags))
+				if (!need_replace)
 					break;
 				bio->bi_next = biolist;
 				biolist = bio;
@@ -3533,7 +3483,7 @@
 		if (bio->bi_end_io == end_sync_read) {
 			md_sync_acct_bio(bio, nr_sectors);
 			bio->bi_status = 0;
-			generic_make_request(bio);
+			submit_bio_noacct(bio);
 		}
 	}

@@ -3704,8 +3654,8 @@

 	conf->geo = geo;
 	conf->copies = copies;
-	err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc,
-			   r10bio_pool_free, conf);
+	err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
+			   rbio_pool_free, conf);
 	if (err)
 		goto out;

@@ -3757,10 +3707,20 @@
 	return ERR_PTR(err);
 }

+static void raid10_set_io_opt(struct r10conf *conf)
+{
+	int raid_disks = conf->geo.raid_disks;
+
+	if (!(conf->geo.raid_disks % conf->geo.near_copies))
+		raid_disks /= conf->geo.near_copies;
+	blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
+			 raid_disks);
+}
+
 static int raid10_run(struct mddev *mddev)
 {
 	struct r10conf *conf;
-	int i, disk_idx, chunk_size;
+	int i, disk_idx;
 	struct raid10_info *disk;
 	struct md_rdev *rdev;
 	sector_t size;
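raid10_set_io_opt(), added above, preserves the io_opt policy that raid10_run() used to open-code: the optimal I/O size is the chunk size times the number of effective data stripes (raid_disks / near_copies when that divides evenly, otherwise all raid_disks). A worked standalone version in byte units (hypothetical helper name, not the kernel function):

#include <stdio.h>

/* io_opt in bytes, mirroring the policy of raid10_set_io_opt() above */
unsigned long long raid10_io_opt_bytes(unsigned int raid_disks,
				       unsigned int near_copies,
				       unsigned int chunk_sectors)
{
	unsigned int stripes = raid_disks;

	if (raid_disks % near_copies == 0)
		stripes = raid_disks / near_copies;
	return (unsigned long long)chunk_sectors * 512 * stripes;
}

int main(void)
{
	/* e.g. 4 disks, 2 near copies, 1024-sector (512 KiB) chunks -> 1 MiB */
	printf("%llu\n", raid10_io_opt_bytes(4, 2, 1024));
	return 0;
}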
@@ -3796,18 +3756,13 @@
 	mddev->thread = conf->thread;
 	conf->thread = NULL;

-	chunk_size = mddev->chunk_sectors << 9;
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
 					      mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
-		blk_queue_io_min(mddev->queue, chunk_size);
-		if (conf->geo.raid_disks % conf->geo.near_copies)
-			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
-		else
-			blk_queue_io_opt(mddev->queue, chunk_size *
-					 (conf->geo.raid_disks / conf->geo.near_copies));
+		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+		raid10_set_io_opt(conf);
 	}

 	rdev_for_each(rdev, mddev) {
@@ -3921,19 +3876,6 @@
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
 	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
-
-	if (mddev->queue) {
-		int stripe = conf->geo.raid_disks *
-			((mddev->chunk_sectors << 9) / PAGE_SIZE);
-
-		/* Calculate max read-ahead size.
-		 * We need to readahead at least twice a whole stripe....
-		 * maybe...
-		 */
-		stripe /= conf->geo.near_copies;
-		if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
-			mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
-	}

 	if (md_integrity_register(mddev))
 		goto out_free_conf;
@@ -4293,12 +4235,46 @@
 	spin_unlock_irq(&conf->device_lock);

 	if (mddev->delta_disks && mddev->bitmap) {
-		ret = md_bitmap_resize(mddev->bitmap,
-				       raid10_size(mddev, 0, conf->geo.raid_disks),
-				       0, 0);
+		struct mdp_superblock_1 *sb = NULL;
+		sector_t oldsize, newsize;
+
+		oldsize = raid10_size(mddev, 0, 0);
+		newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
+
+		if (!mddev_is_clustered(mddev)) {
+			ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
+			if (ret)
+				goto abort;
+			else
+				goto out;
+		}
+
+		rdev_for_each(rdev, mddev) {
+			if (rdev->raid_disk > -1 &&
+			    !test_bit(Faulty, &rdev->flags))
+				sb = page_address(rdev->sb_page);
+		}
+
+		/*
+		 * some node is already performing reshape, and no need to
+		 * call md_bitmap_resize again since it should be called when
+		 * receiving BITMAP_RESIZE msg
+		 */
+		if ((sb && (le32_to_cpu(sb->feature_map) &
+			    MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
+			goto out;
+
+		ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
 		if (ret)
 			goto abort;
+
+		ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
+		if (ret) {
+			md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
+			goto abort;
+		}
 	}
+out:
 	if (mddev->delta_disks > 0) {
 		rdev_for_each(rdev, mddev)
 			if (rdev->raid_disk < 0 &&
@@ -4310,8 +4286,8 @@
 				else
 					rdev->recovery_offset = 0;

-				if (sysfs_link_rdev(mddev, rdev))
-					/* Failure here is OK */;
+				/* Failure here is OK */
+				sysfs_link_rdev(mddev, rdev);
 			}
 		} else if (rdev->raid_disk >= conf->prev.raid_disks
 			   && !test_bit(Faulty, &rdev->flags)) {
@@ -4457,7 +4433,7 @@
 		sector_nr = conf->reshape_progress;
 	if (sector_nr) {
 		mddev->curr_resync_completed = sector_nr;
-		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+		sysfs_notify_dirent_safe(mddev->sysfs_completed);
 		*skipped = 1;
 		return sector_nr;
 	}
@@ -4486,8 +4462,8 @@
 		last = conf->reshape_progress - 1;
 		sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
 					       & conf->prev.chunk_mask);
-		if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
-			sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
+		if (sector_nr + RESYNC_SECTORS < last)
+			sector_nr = last + 1 - RESYNC_SECTORS;
 	} else {
 		/* 'next' is after the last device address that we
 		 * might write to for this chunk in the new layout
@@ -4509,8 +4485,8 @@
 		last = sector_nr | (conf->geo.chunk_mask
 				    & conf->prev.chunk_mask);

-		if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
-			last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
+		if (sector_nr + RESYNC_SECTORS <= last)
+			last = sector_nr + RESYNC_SECTORS - 1;
 	}

 	if (need_flush ||
@@ -4575,6 +4551,32 @@
 	r10_bio->master_bio = read_bio;
 	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;

+	/*
+	 * Broadcast RESYNC message to other nodes, so all nodes would not
+	 * write to the region to avoid conflict.
+	 */
+	if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
+		struct mdp_superblock_1 *sb = NULL;
+		int sb_reshape_pos = 0;
+
+		conf->cluster_sync_low = sector_nr;
+		conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
+		sb = page_address(rdev->sb_page);
+		if (sb) {
+			sb_reshape_pos = le64_to_cpu(sb->reshape_position);
+			/*
+			 * Set cluster_sync_low again if next address for array
+			 * reshape is less than cluster_sync_low. Since we can't
+			 * update cluster_sync_low until it has finished reshape.
+			 */
+			if (sb_reshape_pos < conf->cluster_sync_low)
+				conf->cluster_sync_low = sb_reshape_pos;
+		}
+
+		md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
+						   conf->cluster_sync_high);
+	}
+
 	/* Now find the locations in the new layout */
 	__raid10_find_phys(&conf->geo, r10_bio);

@@ -4631,7 +4633,7 @@
 	md_sync_acct_bio(read_bio, r10_bio->sectors);
 	atomic_inc(&r10_bio->remaining);
 	read_bio->bi_next = NULL;
-	generic_make_request(read_bio);
+	submit_bio_noacct(read_bio);
 	sectors_done += nr_sectors;
 	if (sector_nr <= last)
 		goto read_more;
@@ -4694,7 +4696,7 @@
 		md_sync_acct_bio(b, r10_bio->sectors);
 		atomic_inc(&r10_bio->remaining);
 		b->bi_next = NULL;
-		generic_make_request(b);
+		submit_bio_noacct(b);
 	}
 	end_reshape_request(r10_bio);
 }
@@ -4712,17 +4714,22 @@
 	conf->reshape_safe = MaxSector;
 	spin_unlock_irq(&conf->device_lock);

-	/* read-ahead size must cover two whole stripes, which is
-	 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
-	 */
-	if (conf->mddev->queue) {
-		int stripe = conf->geo.raid_disks *
-			((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
-		stripe /= conf->geo.near_copies;
-		if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
-			conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
-	}
+	if (conf->mddev->queue)
+		raid10_set_io_opt(conf);
 	conf->fullsync = 0;
+}
+
+static void raid10_update_reshape_pos(struct mddev *mddev)
+{
+	struct r10conf *conf = mddev->private;
+	sector_t lo, hi;
+
+	md_cluster_ops->resync_info_get(mddev, &lo, &hi);
+	if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
+	    || mddev->reshape_position == MaxSector)
+		conf->reshape_progress = mddev->reshape_position;
+	else
+		WARN_ON_ONCE(1);
 }

 static int handle_reshape_read_error(struct mddev *mddev,
@@ -4736,8 +4743,7 @@
 	int idx = 0;
 	struct page **pages;

-	r10b = kmalloc(sizeof(*r10b) +
-		       sizeof(struct r10dev) * conf->copies, GFP_NOIO);
+	r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
 	if (!r10b) {
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		return -ENOMEM;
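The kmalloc() change above swaps the open-coded "sizeof(*r10b) + sizeof(struct r10dev) * conf->copies" for struct_size() from <linux/overflow.h>, which computes the same trailing-flexible-array size but saturates instead of wrapping if the multiplication overflows. A standalone illustration of the allocation pattern with a toy struct (not the md structures):

#include <stdio.h>
#include <stdlib.h>

struct item {
	int a, b;
};

struct container {
	int count;
	struct item devs[];	/* flexible array member, like the r10bio devs[] */
};

int main(void)
{
	size_t n = 4;
	/* what struct_size(c, devs, n) evaluates to, minus the overflow check */
	size_t bytes = sizeof(struct container) + sizeof(struct item) * n;
	struct container *c = malloc(bytes);

	if (!c)
		return 1;
	c->count = (int)n;
	printf("allocated %zu bytes for %zu trailing items\n", bytes, n);
	free(c);
	return 0;
}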
@@ -4893,7 +4899,7 @@
 	.check_reshape = raid10_check_reshape,
 	.start_reshape = raid10_start_reshape,
 	.finish_reshape = raid10_finish_reshape,
-	.congested = raid10_congested,
+	.update_reshape_pos = raid10_update_reshape_pos,
 };

 static int __init raid_init(void)