forked from ~ljy/RK356X_SDK_RELEASE

Author: hc
Date: 2023-12-09
Commit: b22da3d8526a935aa31e086e63f60ff3246cb61c
File: kernel/drivers/md/raid5.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * raid5.c : Multiple Devices driver for Linux
34 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
....@@ -7,15 +8,6 @@
78 * RAID-4/5/6 management functions.
89 * Thanks to Penguin Computing for making the RAID-6 development possible
910 * by donating a test server!
10
- *
11
- * This program is free software; you can redistribute it and/or modify
12
- * it under the terms of the GNU General Public License as published by
13
- * the Free Software Foundation; either version 2, or (at your option)
14
- * any later version.
15
- *
16
- * You should have received a copy of the GNU General Public License
17
- * (for example /usr/src/linux/COPYING); if not, write to the Free
18
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1911 */
2012
2113 /*
....@@ -44,6 +36,7 @@
4436 */
4537
4638 #include <linux/blkdev.h>
39
+#include <linux/delay.h>
4740 #include <linux/kthread.h>
4841 #include <linux/raid/pq.h>
4942 #include <linux/async_tx.h>
....@@ -54,7 +47,6 @@
5447 #include <linux/slab.h>
5548 #include <linux/ratelimit.h>
5649 #include <linux/nodemask.h>
57
-#include <linux/flex_array.h>
5850
5951 #include <trace/events/block.h>
6052 #include <linux/list_sort.h>
....@@ -78,13 +70,13 @@
7870
7971 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
8072 {
81
- int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
73
+ int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
8274 return &conf->stripe_hashtbl[hash];
8375 }
8476
85
-static inline int stripe_hash_locks_hash(sector_t sect)
77
+static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
8678 {
87
- return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
79
+ return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
8880 }
8981
9082 static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
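These two hunks drop the compile-time STRIPE_SHIFT in favour of a per-array value read from the r5conf, which is what later allows the stripe size to be configured independently of PAGE_SIZE. A minimal sketch of the macros the new code assumes (the real definitions belong in raid5.h and are not part of this diff; the stripe_size/stripe_shift/stripe_sectors field names are taken from the upstream configurable-stripe-size series):

#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
/* 4K pages: keep the historical compile-time constants */
#define RAID5_STRIPE_SIZE(conf)     STRIPE_SIZE
#define RAID5_STRIPE_SHIFT(conf)    STRIPE_SHIFT
#define RAID5_STRIPE_SECTORS(conf)  STRIPE_SECTORS
#else
/* 64K (or other) pages: stripe geometry is carried in the r5conf */
#define RAID5_STRIPE_SIZE(conf)     ((conf)->stripe_size)
#define RAID5_STRIPE_SHIFT(conf)    ((conf)->stripe_shift)
#define RAID5_STRIPE_SECTORS(conf)  ((conf)->stripe_sectors)
#endif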
....@@ -457,13 +449,74 @@
457449 return sh;
458450 }
459451
452
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
453
+static void free_stripe_pages(struct stripe_head *sh)
454
+{
455
+ int i;
456
+ struct page *p;
457
+
458
+ /* Have not allocate page pool */
459
+ if (!sh->pages)
460
+ return;
461
+
462
+ for (i = 0; i < sh->nr_pages; i++) {
463
+ p = sh->pages[i];
464
+ if (p)
465
+ put_page(p);
466
+ sh->pages[i] = NULL;
467
+ }
468
+}
469
+
470
+static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
471
+{
472
+ int i;
473
+ struct page *p;
474
+
475
+ for (i = 0; i < sh->nr_pages; i++) {
476
+ /* The page have allocated. */
477
+ if (sh->pages[i])
478
+ continue;
479
+
480
+ p = alloc_page(gfp);
481
+ if (!p) {
482
+ free_stripe_pages(sh);
483
+ return -ENOMEM;
484
+ }
485
+ sh->pages[i] = p;
486
+ }
487
+ return 0;
488
+}
489
+
490
+static int
491
+init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
492
+{
493
+ int nr_pages, cnt;
494
+
495
+ if (sh->pages)
496
+ return 0;
497
+
498
+ /* Each of the sh->dev[i] need one conf->stripe_size */
499
+ cnt = PAGE_SIZE / conf->stripe_size;
500
+ nr_pages = (disks + cnt - 1) / cnt;
501
+
502
+ sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
503
+ if (!sh->pages)
504
+ return -ENOMEM;
505
+ sh->nr_pages = nr_pages;
506
+ sh->stripes_per_page = cnt;
507
+ return 0;
508
+}
509
+#endif
510
+
460511 static void shrink_buffers(struct stripe_head *sh)
461512 {
462
- struct page *p;
463513 int i;
464514 int num = sh->raid_conf->pool_size;
465515
516
+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
466517 for (i = 0; i < num ; i++) {
518
+ struct page *p;
519
+
467520 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
468521 p = sh->dev[i].page;
469522 if (!p)
....@@ -471,6 +524,11 @@
471524 sh->dev[i].page = NULL;
472525 put_page(p);
473526 }
527
+#else
528
+ for (i = 0; i < num; i++)
529
+ sh->dev[i].page = NULL;
530
+ free_stripe_pages(sh); /* Free pages */
531
+#endif
474532 }
475533
476534 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
....@@ -478,6 +536,7 @@
478536 int i;
479537 int num = sh->raid_conf->pool_size;
480538
539
+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
481540 for (i = 0; i < num; i++) {
482541 struct page *page;
483542
....@@ -486,8 +545,18 @@
486545 }
487546 sh->dev[i].page = page;
488547 sh->dev[i].orig_page = page;
548
+ sh->dev[i].offset = 0;
489549 }
550
+#else
551
+ if (alloc_stripe_pages(sh, gfp))
552
+ return -ENOMEM;
490553
554
+ for (i = 0; i < num; i++) {
555
+ sh->dev[i].page = raid5_get_dev_page(sh, i);
556
+ sh->dev[i].orig_page = sh->dev[i].page;
557
+ sh->dev[i].offset = raid5_get_page_offset(sh, i);
558
+ }
559
+#endif
491560 return 0;
492561 }
493562
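When PAGE_SIZE is larger than DEFAULT_STRIPE_SIZE, several stripe units share one page, and grow_buffers() above points each sh->dev[i] at a slice of the shared sh->pages[] pool instead of giving it a page of its own. A rough sketch of the two accessors this relies on, assuming they are defined in raid5.h along the lines of the upstream shared-page series:

static inline struct page *
raid5_get_dev_page(struct stripe_head *sh, int disk_idx)
{
	/* stripes_per_page consecutive devices share one backing page */
	return sh->pages[disk_idx / sh->stripes_per_page];
}

static inline unsigned int
raid5_get_page_offset(struct stripe_head *sh, int disk_idx)
{
	/* byte offset of this device's stripe unit inside that page */
	return (disk_idx % sh->stripes_per_page) * RAID5_STRIPE_SIZE(sh->raid_conf);
}

With a 64K page and a 4K stripe size this packs 16 devices per page, matching the cnt = PAGE_SIZE / conf->stripe_size computed in init_stripe_shared_pages() earlier in this diff.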
....@@ -618,17 +687,17 @@
618687 return degraded;
619688 }
620689
621
-static int has_failed(struct r5conf *conf)
690
+static bool has_failed(struct r5conf *conf)
622691 {
623
- int degraded;
692
+ int degraded = conf->mddev->degraded;
624693
625
- if (conf->mddev->reshape_position == MaxSector)
626
- return conf->mddev->degraded > conf->max_degraded;
694
+ if (test_bit(MD_BROKEN, &conf->mddev->flags))
695
+ return true;
627696
628
- degraded = raid5_calc_degraded(conf);
629
- if (degraded > conf->max_degraded)
630
- return 1;
631
- return 0;
697
+ if (conf->mddev->reshape_position != MaxSector)
698
+ degraded = raid5_calc_degraded(conf);
699
+
700
+ return degraded > conf->max_degraded;
632701 }
633702
634703 struct stripe_head *
....@@ -636,7 +705,7 @@
636705 int previous, int noblock, int noquiesce)
637706 {
638707 struct stripe_head *sh;
639
- int hash = stripe_hash_locks_hash(sector);
708
+ int hash = stripe_hash_locks_hash(conf, sector);
640709 int inc_empty_inactive_list_flag;
641710
642711 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
....@@ -712,6 +781,8 @@
712781 }
713782
714783 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
784
+ __acquires(&sh1->stripe_lock)
785
+ __acquires(&sh2->stripe_lock)
715786 {
716787 if (sh1 > sh2) {
717788 spin_lock_irq(&sh2->stripe_lock);
....@@ -723,6 +794,8 @@
723794 }
724795
725796 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
797
+ __releases(&sh1->stripe_lock)
798
+ __releases(&sh2->stripe_lock)
726799 {
727800 spin_unlock(&sh1->stripe_lock);
728801 spin_unlock_irq(&sh2->stripe_lock);
....@@ -753,9 +826,9 @@
753826 tmp_sec = sh->sector;
754827 if (!sector_div(tmp_sec, conf->chunk_sectors))
755828 return;
756
- head_sector = sh->sector - STRIPE_SECTORS;
829
+ head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
757830
758
- hash = stripe_hash_locks_hash(head_sector);
831
+ hash = stripe_hash_locks_hash(conf, head_sector);
759832 spin_lock_irq(conf->hash_locks + hash);
760833 head = __find_stripe(conf, head_sector, conf->generation);
761834 if (head && !atomic_inc_not_zero(&head->count)) {
....@@ -878,7 +951,7 @@
878951 struct bio *bio;
879952
880953 while ((bio = bio_list_pop(tmp)))
881
- generic_make_request(bio);
954
+ submit_bio_noacct(bio);
882955 }
883956
884957 static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
....@@ -1062,7 +1135,7 @@
10621135 test_bit(WriteErrorSeen, &rdev->flags)) {
10631136 sector_t first_bad;
10641137 int bad_sectors;
1065
- int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
1138
+ int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
10661139 &first_bad, &bad_sectors);
10671140 if (!bad)
10681141 break;
....@@ -1094,7 +1167,7 @@
10941167 if (rdev) {
10951168 if (s->syncing || s->expanding || s->expanded
10961169 || s->replacing)
1097
- md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1170
+ md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
10981171
10991172 set_bit(STRIPE_IO_STARTED, &sh->state);
11001173
....@@ -1134,12 +1207,12 @@
11341207 else
11351208 sh->dev[i].vec.bv_page = sh->dev[i].page;
11361209 bi->bi_vcnt = 1;
1137
- bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1138
- bi->bi_io_vec[0].bv_offset = 0;
1139
- bi->bi_iter.bi_size = STRIPE_SIZE;
1210
+ bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1211
+ bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1212
+ bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
11401213 bi->bi_write_hint = sh->dev[i].write_hint;
11411214 if (!rrdev)
1142
- sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
1215
+ sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
11431216 /*
11441217 * If this is discard request, set bi_vcnt 0. We don't
11451218 * want to confuse SCSI because SCSI will replace payload
....@@ -1156,12 +1229,12 @@
11561229 if (should_defer && op_is_write(op))
11571230 bio_list_add(&pending_bios, bi);
11581231 else
1159
- generic_make_request(bi);
1232
+ submit_bio_noacct(bi);
11601233 }
11611234 if (rrdev) {
11621235 if (s->syncing || s->expanding || s->expanded
11631236 || s->replacing)
1164
- md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
1237
+ md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
11651238
11661239 set_bit(STRIPE_IO_STARTED, &sh->state);
11671240
....@@ -1188,11 +1261,11 @@
11881261 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
11891262 sh->dev[i].rvec.bv_page = sh->dev[i].page;
11901263 rbi->bi_vcnt = 1;
1191
- rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1192
- rbi->bi_io_vec[0].bv_offset = 0;
1193
- rbi->bi_iter.bi_size = STRIPE_SIZE;
1264
+ rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1265
+ rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1266
+ rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
11941267 rbi->bi_write_hint = sh->dev[i].write_hint;
1195
- sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
1268
+ sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
11961269 /*
11971270 * If this is discard request, set bi_vcnt 0. We don't
11981271 * want to confuse SCSI because SCSI will replace payload
....@@ -1206,7 +1279,7 @@
12061279 if (should_defer && op_is_write(op))
12071280 bio_list_add(&pending_bios, rbi);
12081281 else
1209
- generic_make_request(rbi);
1282
+ submit_bio_noacct(rbi);
12101283 }
12111284 if (!rdev && !rrdev) {
12121285 if (op_is_write(op))
....@@ -1231,7 +1304,7 @@
12311304
12321305 static struct dma_async_tx_descriptor *
12331306 async_copy_data(int frombio, struct bio *bio, struct page **page,
1234
- sector_t sector, struct dma_async_tx_descriptor *tx,
1307
+ unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
12351308 struct stripe_head *sh, int no_skipcopy)
12361309 {
12371310 struct bio_vec bvl;
....@@ -1240,6 +1313,7 @@
12401313 int page_offset;
12411314 struct async_submit_ctl submit;
12421315 enum async_tx_flags flags = 0;
1316
+ struct r5conf *conf = sh->raid_conf;
12431317
12441318 if (bio->bi_iter.bi_sector >= sector)
12451319 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
....@@ -1261,8 +1335,8 @@
12611335 len -= b_offset;
12621336 }
12631337
1264
- if (len > 0 && page_offset + len > STRIPE_SIZE)
1265
- clen = STRIPE_SIZE - page_offset;
1338
+ if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
1339
+ clen = RAID5_STRIPE_SIZE(conf) - page_offset;
12661340 else
12671341 clen = len;
12681342
....@@ -1270,17 +1344,17 @@
12701344 b_offset += bvl.bv_offset;
12711345 bio_page = bvl.bv_page;
12721346 if (frombio) {
1273
- if (sh->raid_conf->skip_copy &&
1347
+ if (conf->skip_copy &&
12741348 b_offset == 0 && page_offset == 0 &&
1275
- clen == STRIPE_SIZE &&
1349
+ clen == RAID5_STRIPE_SIZE(conf) &&
12761350 !no_skipcopy)
12771351 *page = bio_page;
12781352 else
1279
- tx = async_memcpy(*page, bio_page, page_offset,
1353
+ tx = async_memcpy(*page, bio_page, page_offset + poff,
12801354 b_offset, clen, &submit);
12811355 } else
12821356 tx = async_memcpy(bio_page, *page, b_offset,
1283
- page_offset, clen, &submit);
1357
+ page_offset + poff, clen, &submit);
12841358 }
12851359 /* chain the operations */
12861360 submit.depend_tx = tx;
....@@ -1297,6 +1371,7 @@
12971371 {
12981372 struct stripe_head *sh = stripe_head_ref;
12991373 int i;
1374
+ struct r5conf *conf = sh->raid_conf;
13001375
13011376 pr_debug("%s: stripe %llu\n", __func__,
13021377 (unsigned long long)sh->sector);
....@@ -1317,8 +1392,8 @@
13171392 rbi = dev->read;
13181393 dev->read = NULL;
13191394 while (rbi && rbi->bi_iter.bi_sector <
1320
- dev->sector + STRIPE_SECTORS) {
1321
- rbi2 = r5_next_bio(rbi, dev->sector);
1395
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1396
+ rbi2 = r5_next_bio(conf, rbi, dev->sector);
13221397 bio_endio(rbi);
13231398 rbi = rbi2;
13241399 }
....@@ -1335,6 +1410,7 @@
13351410 struct dma_async_tx_descriptor *tx = NULL;
13361411 struct async_submit_ctl submit;
13371412 int i;
1413
+ struct r5conf *conf = sh->raid_conf;
13381414
13391415 BUG_ON(sh->batch_head);
13401416 pr_debug("%s: stripe %llu\n", __func__,
....@@ -1349,10 +1425,11 @@
13491425 dev->toread = NULL;
13501426 spin_unlock_irq(&sh->stripe_lock);
13511427 while (rbi && rbi->bi_iter.bi_sector <
1352
- dev->sector + STRIPE_SECTORS) {
1428
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
13531429 tx = async_copy_data(0, rbi, &dev->page,
1430
+ dev->offset,
13541431 dev->sector, tx, sh, 0);
1355
- rbi = r5_next_bio(rbi, dev->sector);
1432
+ rbi = r5_next_bio(conf, rbi, dev->sector);
13561433 }
13571434 }
13581435 }
....@@ -1394,22 +1471,25 @@
13941471 }
13951472
13961473 /* return a pointer to the address conversion region of the scribble buffer */
1397
-static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1398
- struct raid5_percpu *percpu, int i)
1474
+static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
13991475 {
1400
- void *addr;
1401
-
1402
- addr = flex_array_get(percpu->scribble, i);
1403
- return addr + sizeof(struct page *) * (sh->disks + 2);
1476
+ return percpu->scribble + i * percpu->scribble_obj_size;
14041477 }
14051478
14061479 /* return a pointer to the address conversion region of the scribble buffer */
1407
-static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1480
+static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1481
+ struct raid5_percpu *percpu, int i)
14081482 {
1409
- void *addr;
1483
+ return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
1484
+}
14101485
1411
- addr = flex_array_get(percpu->scribble, i);
1412
- return addr;
1486
+/*
1487
+ * Return a pointer to record offset address.
1488
+ */
1489
+static unsigned int *
1490
+to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
1491
+{
1492
+ return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2);
14131493 }
14141494
14151495 static struct dma_async_tx_descriptor *
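to_addr_page(), to_addr_conv() and to_addr_offs() carve one per-cpu scribble object into three consecutive regions; scribble_alloc() further down sizes each object to match. The layout implied by the pointer arithmetic above (illustrative only, with "disks" standing in for sh->disks):

	/* one scribble object of percpu->scribble_obj_size bytes */
	struct page *srcs[disks + 2];       /* returned by to_addr_page(percpu, i)     */
	addr_conv_t  addr_conv[disks + 2];  /* returned by to_addr_conv(sh, percpu, i) */
	unsigned int offs[disks + 2];       /* returned by to_addr_offs(sh, percpu)    */

Note that to_addr_offs() always indexes object 0; that is presumably safe because the per-device offsets depend only on the disk index, so they are identical for every stripe in a batch.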
....@@ -1417,9 +1497,11 @@
14171497 {
14181498 int disks = sh->disks;
14191499 struct page **xor_srcs = to_addr_page(percpu, 0);
1500
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
14201501 int target = sh->ops.target;
14211502 struct r5dev *tgt = &sh->dev[target];
14221503 struct page *xor_dest = tgt->page;
1504
+ unsigned int off_dest = tgt->offset;
14231505 int count = 0;
14241506 struct dma_async_tx_descriptor *tx;
14251507 struct async_submit_ctl submit;
....@@ -1431,24 +1513,30 @@
14311513 __func__, (unsigned long long)sh->sector, target);
14321514 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
14331515
1434
- for (i = disks; i--; )
1435
- if (i != target)
1516
+ for (i = disks; i--; ) {
1517
+ if (i != target) {
1518
+ off_srcs[count] = sh->dev[i].offset;
14361519 xor_srcs[count++] = sh->dev[i].page;
1520
+ }
1521
+ }
14371522
14381523 atomic_inc(&sh->count);
14391524
14401525 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
14411526 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
14421527 if (unlikely(count == 1))
1443
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1528
+ tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
1529
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
14441530 else
1445
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1531
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1532
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
14461533
14471534 return tx;
14481535 }
14491536
14501537 /* set_syndrome_sources - populate source buffers for gen_syndrome
14511538 * @srcs - (struct page *) array of size sh->disks
1539
+ * @offs - (unsigned int) array of offset for each page
14521540 * @sh - stripe_head to parse
14531541 *
14541542 * Populates srcs in proper layout order for the stripe and returns the
....@@ -1457,6 +1545,7 @@
14571545 * is recorded in srcs[count+1]].
14581546 */
14591547 static int set_syndrome_sources(struct page **srcs,
1548
+ unsigned int *offs,
14601549 struct stripe_head *sh,
14611550 int srctype)
14621551 {
....@@ -1487,6 +1576,12 @@
14871576 srcs[slot] = sh->dev[i].orig_page;
14881577 else
14891578 srcs[slot] = sh->dev[i].page;
1579
+ /*
1580
+ * For R5_InJournal, PAGE_SIZE must be 4KB and will
1581
+ * not shared page. In that case, dev[i].offset
1582
+ * is 0.
1583
+ */
1584
+ offs[slot] = sh->dev[i].offset;
14901585 }
14911586 i = raid6_next_disk(i, disks);
14921587 } while (i != d0_idx);
....@@ -1499,12 +1594,14 @@
14991594 {
15001595 int disks = sh->disks;
15011596 struct page **blocks = to_addr_page(percpu, 0);
1597
+ unsigned int *offs = to_addr_offs(sh, percpu);
15021598 int target;
15031599 int qd_idx = sh->qd_idx;
15041600 struct dma_async_tx_descriptor *tx;
15051601 struct async_submit_ctl submit;
15061602 struct r5dev *tgt;
15071603 struct page *dest;
1604
+ unsigned int dest_off;
15081605 int i;
15091606 int count;
15101607
....@@ -1523,30 +1620,34 @@
15231620 tgt = &sh->dev[target];
15241621 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
15251622 dest = tgt->page;
1623
+ dest_off = tgt->offset;
15261624
15271625 atomic_inc(&sh->count);
15281626
15291627 if (target == qd_idx) {
1530
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1628
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
15311629 blocks[count] = NULL; /* regenerating p is not necessary */
15321630 BUG_ON(blocks[count+1] != dest); /* q should already be set */
15331631 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
15341632 ops_complete_compute, sh,
15351633 to_addr_conv(sh, percpu, 0));
1536
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1634
+ tx = async_gen_syndrome(blocks, offs, count+2,
1635
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
15371636 } else {
15381637 /* Compute any data- or p-drive using XOR */
15391638 count = 0;
15401639 for (i = disks; i-- ; ) {
15411640 if (i == target || i == qd_idx)
15421641 continue;
1642
+ offs[count] = sh->dev[i].offset;
15431643 blocks[count++] = sh->dev[i].page;
15441644 }
15451645
15461646 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
15471647 NULL, ops_complete_compute, sh,
15481648 to_addr_conv(sh, percpu, 0));
1549
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1649
+ tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1650
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
15501651 }
15511652
15521653 return tx;
....@@ -1565,6 +1666,7 @@
15651666 struct r5dev *tgt2 = &sh->dev[target2];
15661667 struct dma_async_tx_descriptor *tx;
15671668 struct page **blocks = to_addr_page(percpu, 0);
1669
+ unsigned int *offs = to_addr_offs(sh, percpu);
15681670 struct async_submit_ctl submit;
15691671
15701672 BUG_ON(sh->batch_head);
....@@ -1577,13 +1679,16 @@
15771679 /* we need to open-code set_syndrome_sources to handle the
15781680 * slot number conversion for 'faila' and 'failb'
15791681 */
1580
- for (i = 0; i < disks ; i++)
1682
+ for (i = 0; i < disks ; i++) {
1683
+ offs[i] = 0;
15811684 blocks[i] = NULL;
1685
+ }
15821686 count = 0;
15831687 i = d0_idx;
15841688 do {
15851689 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
15861690
1691
+ offs[slot] = sh->dev[i].offset;
15871692 blocks[slot] = sh->dev[i].page;
15881693
15891694 if (i == target)
....@@ -1608,10 +1713,12 @@
16081713 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
16091714 ops_complete_compute, sh,
16101715 to_addr_conv(sh, percpu, 0));
1611
- return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1612
- STRIPE_SIZE, &submit);
1716
+ return async_gen_syndrome(blocks, offs, syndrome_disks+2,
1717
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1718
+ &submit);
16131719 } else {
16141720 struct page *dest;
1721
+ unsigned int dest_off;
16151722 int data_target;
16161723 int qd_idx = sh->qd_idx;
16171724
....@@ -1625,22 +1732,26 @@
16251732 for (i = disks; i-- ; ) {
16261733 if (i == data_target || i == qd_idx)
16271734 continue;
1735
+ offs[count] = sh->dev[i].offset;
16281736 blocks[count++] = sh->dev[i].page;
16291737 }
16301738 dest = sh->dev[data_target].page;
1739
+ dest_off = sh->dev[data_target].offset;
16311740 init_async_submit(&submit,
16321741 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
16331742 NULL, NULL, NULL,
16341743 to_addr_conv(sh, percpu, 0));
1635
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1744
+ tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1745
+ RAID5_STRIPE_SIZE(sh->raid_conf),
16361746 &submit);
16371747
1638
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1748
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
16391749 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
16401750 ops_complete_compute, sh,
16411751 to_addr_conv(sh, percpu, 0));
1642
- return async_gen_syndrome(blocks, 0, count+2,
1643
- STRIPE_SIZE, &submit);
1752
+ return async_gen_syndrome(blocks, offs, count+2,
1753
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1754
+ &submit);
16441755 }
16451756 } else {
16461757 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
....@@ -1649,13 +1760,15 @@
16491760 if (failb == syndrome_disks) {
16501761 /* We're missing D+P. */
16511762 return async_raid6_datap_recov(syndrome_disks+2,
1652
- STRIPE_SIZE, faila,
1653
- blocks, &submit);
1763
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1764
+ faila,
1765
+ blocks, offs, &submit);
16541766 } else {
16551767 /* We're missing D+D. */
16561768 return async_raid6_2data_recov(syndrome_disks+2,
1657
- STRIPE_SIZE, faila, failb,
1658
- blocks, &submit);
1769
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1770
+ faila, failb,
1771
+ blocks, offs, &submit);
16591772 }
16601773 }
16611774 }
....@@ -1681,10 +1794,12 @@
16811794 {
16821795 int disks = sh->disks;
16831796 struct page **xor_srcs = to_addr_page(percpu, 0);
1797
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
16841798 int count = 0, pd_idx = sh->pd_idx, i;
16851799 struct async_submit_ctl submit;
16861800
16871801 /* existing parity data subtracted */
1802
+ unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
16881803 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
16891804
16901805 BUG_ON(sh->batch_head);
....@@ -1694,15 +1809,23 @@
16941809 for (i = disks; i--; ) {
16951810 struct r5dev *dev = &sh->dev[i];
16961811 /* Only process blocks that are known to be uptodate */
1697
- if (test_bit(R5_InJournal, &dev->flags))
1812
+ if (test_bit(R5_InJournal, &dev->flags)) {
1813
+ /*
1814
+ * For this case, PAGE_SIZE must be equal to 4KB and
1815
+ * page offset is zero.
1816
+ */
1817
+ off_srcs[count] = dev->offset;
16981818 xor_srcs[count++] = dev->orig_page;
1699
- else if (test_bit(R5_Wantdrain, &dev->flags))
1819
+ } else if (test_bit(R5_Wantdrain, &dev->flags)) {
1820
+ off_srcs[count] = dev->offset;
17001821 xor_srcs[count++] = dev->page;
1822
+ }
17011823 }
17021824
17031825 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
17041826 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1705
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1827
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1828
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
17061829
17071830 return tx;
17081831 }
....@@ -1712,17 +1835,19 @@
17121835 struct dma_async_tx_descriptor *tx)
17131836 {
17141837 struct page **blocks = to_addr_page(percpu, 0);
1838
+ unsigned int *offs = to_addr_offs(sh, percpu);
17151839 int count;
17161840 struct async_submit_ctl submit;
17171841
17181842 pr_debug("%s: stripe %llu\n", __func__,
17191843 (unsigned long long)sh->sector);
17201844
1721
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
1845
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN);
17221846
17231847 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
17241848 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1725
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1849
+ tx = async_gen_syndrome(blocks, offs, count+2,
1850
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
17261851
17271852 return tx;
17281853 }
....@@ -1763,7 +1888,7 @@
17631888 WARN_ON(dev->page != dev->orig_page);
17641889
17651890 while (wbi && wbi->bi_iter.bi_sector <
1766
- dev->sector + STRIPE_SECTORS) {
1891
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
17671892 if (wbi->bi_opf & REQ_FUA)
17681893 set_bit(R5_WantFUA, &dev->flags);
17691894 if (wbi->bi_opf & REQ_SYNC)
....@@ -1772,6 +1897,7 @@
17721897 set_bit(R5_Discard, &dev->flags);
17731898 else {
17741899 tx = async_copy_data(1, wbi, &dev->page,
1900
+ dev->offset,
17751901 dev->sector, tx, sh,
17761902 r5c_is_writeback(conf->log));
17771903 if (dev->page != dev->orig_page &&
....@@ -1781,7 +1907,7 @@
17811907 clear_bit(R5_OVERWRITE, &dev->flags);
17821908 }
17831909 }
1784
- wbi = r5_next_bio(wbi, dev->sector);
1910
+ wbi = r5_next_bio(conf, wbi, dev->sector);
17851911 }
17861912
17871913 if (head_sh->batch_head) {
....@@ -1851,9 +1977,11 @@
18511977 {
18521978 int disks = sh->disks;
18531979 struct page **xor_srcs;
1980
+ unsigned int *off_srcs;
18541981 struct async_submit_ctl submit;
18551982 int count, pd_idx = sh->pd_idx, i;
18561983 struct page *xor_dest;
1984
+ unsigned int off_dest;
18571985 int prexor = 0;
18581986 unsigned long flags;
18591987 int j = 0;
....@@ -1878,24 +2006,31 @@
18782006 again:
18792007 count = 0;
18802008 xor_srcs = to_addr_page(percpu, j);
2009
+ off_srcs = to_addr_offs(sh, percpu);
18812010 /* check if prexor is active which means only process blocks
18822011 * that are part of a read-modify-write (written)
18832012 */
18842013 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
18852014 prexor = 1;
2015
+ off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
18862016 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
18872017 for (i = disks; i--; ) {
18882018 struct r5dev *dev = &sh->dev[i];
18892019 if (head_sh->dev[i].written ||
1890
- test_bit(R5_InJournal, &head_sh->dev[i].flags))
2020
+ test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
2021
+ off_srcs[count] = dev->offset;
18912022 xor_srcs[count++] = dev->page;
2023
+ }
18922024 }
18932025 } else {
18942026 xor_dest = sh->dev[pd_idx].page;
2027
+ off_dest = sh->dev[pd_idx].offset;
18952028 for (i = disks; i--; ) {
18962029 struct r5dev *dev = &sh->dev[i];
1897
- if (i != pd_idx)
2030
+ if (i != pd_idx) {
2031
+ off_srcs[count] = dev->offset;
18982032 xor_srcs[count++] = dev->page;
2033
+ }
18992034 }
19002035 }
19012036
....@@ -1921,9 +2056,11 @@
19212056 }
19222057
19232058 if (unlikely(count == 1))
1924
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
2059
+ tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
2060
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
19252061 else
1926
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
2062
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2063
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
19272064 if (!last_stripe) {
19282065 j++;
19292066 sh = list_first_entry(&sh->batch_list, struct stripe_head,
....@@ -1938,6 +2075,7 @@
19382075 {
19392076 struct async_submit_ctl submit;
19402077 struct page **blocks;
2078
+ unsigned int *offs;
19412079 int count, i, j = 0;
19422080 struct stripe_head *head_sh = sh;
19432081 int last_stripe;
....@@ -1962,6 +2100,7 @@
19622100
19632101 again:
19642102 blocks = to_addr_page(percpu, j);
2103
+ offs = to_addr_offs(sh, percpu);
19652104
19662105 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
19672106 synflags = SYNDROME_SRC_WRITTEN;
....@@ -1971,7 +2110,7 @@
19712110 txflags = ASYNC_TX_ACK;
19722111 }
19732112
1974
- count = set_syndrome_sources(blocks, sh, synflags);
2113
+ count = set_syndrome_sources(blocks, offs, sh, synflags);
19752114 last_stripe = !head_sh->batch_head ||
19762115 list_first_entry(&sh->batch_list,
19772116 struct stripe_head, batch_list) == head_sh;
....@@ -1983,7 +2122,8 @@
19832122 } else
19842123 init_async_submit(&submit, 0, tx, NULL, NULL,
19852124 to_addr_conv(sh, percpu, j));
1986
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
2125
+ tx = async_gen_syndrome(blocks, offs, count+2,
2126
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
19872127 if (!last_stripe) {
19882128 j++;
19892129 sh = list_first_entry(&sh->batch_list, struct stripe_head,
....@@ -2010,7 +2150,9 @@
20102150 int pd_idx = sh->pd_idx;
20112151 int qd_idx = sh->qd_idx;
20122152 struct page *xor_dest;
2153
+ unsigned int off_dest;
20132154 struct page **xor_srcs = to_addr_page(percpu, 0);
2155
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
20142156 struct dma_async_tx_descriptor *tx;
20152157 struct async_submit_ctl submit;
20162158 int count;
....@@ -2022,16 +2164,20 @@
20222164 BUG_ON(sh->batch_head);
20232165 count = 0;
20242166 xor_dest = sh->dev[pd_idx].page;
2167
+ off_dest = sh->dev[pd_idx].offset;
2168
+ off_srcs[count] = off_dest;
20252169 xor_srcs[count++] = xor_dest;
20262170 for (i = disks; i--; ) {
20272171 if (i == pd_idx || i == qd_idx)
20282172 continue;
2173
+ off_srcs[count] = sh->dev[i].offset;
20292174 xor_srcs[count++] = sh->dev[i].page;
20302175 }
20312176
20322177 init_async_submit(&submit, 0, NULL, NULL, NULL,
20332178 to_addr_conv(sh, percpu, 0));
2034
- tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
2179
+ tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2180
+ RAID5_STRIPE_SIZE(sh->raid_conf),
20352181 &sh->ops.zero_sum_result, &submit);
20362182
20372183 atomic_inc(&sh->count);
....@@ -2042,6 +2188,7 @@
20422188 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
20432189 {
20442190 struct page **srcs = to_addr_page(percpu, 0);
2191
+ unsigned int *offs = to_addr_offs(sh, percpu);
20452192 struct async_submit_ctl submit;
20462193 int count;
20472194
....@@ -2049,15 +2196,16 @@
20492196 (unsigned long long)sh->sector, checkp);
20502197
20512198 BUG_ON(sh->batch_head);
2052
- count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
2199
+ count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL);
20532200 if (!checkp)
20542201 srcs[count] = NULL;
20552202
20562203 atomic_inc(&sh->count);
20572204 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
20582205 sh, to_addr_conv(sh, percpu, 0));
2059
- async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
2060
- &sh->ops.zero_sum_result, percpu->spare_page, &submit);
2206
+ async_syndrome_val(srcs, offs, count+2,
2207
+ RAID5_STRIPE_SIZE(sh->raid_conf),
2208
+ &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit);
20612209 }
20622210
20632211 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
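Every async_tx call in this file now passes per-source page offsets alongside the page array. The prototypes these call sites assume look roughly as below (hedged; the authoritative declarations live in include/linux/async_tx.h of the matching kernel):

struct dma_async_tx_descriptor *
async_xor_offs(struct page *dest, unsigned int offset,
	       struct page **src_list, unsigned int *src_offs,
	       int src_cnt, size_t len, struct async_submit_ctl *submit);

struct dma_async_tx_descriptor *
async_gen_syndrome(struct page **blocks, unsigned int *offsets, int disks,
		   size_t len, struct async_submit_ctl *submit);

struct dma_async_tx_descriptor *
async_syndrome_val(struct page **blocks, unsigned int *offsets, int disks,
		   size_t len, enum sum_check_flags *pqres, struct page *spare,
		   unsigned int s_off, struct async_submit_ctl *submit);

In other words, async_gen_syndrome()/async_syndrome_val() replace their old single offset argument with an offsets[] array, and the async_xor_offs()/async_xor_val_offs() variants additionally take a destination offset plus a per-source offset array.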
....@@ -2136,6 +2284,9 @@
21362284
21372285 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
21382286 {
2287
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2288
+ kfree(sh->pages);
2289
+#endif
21392290 if (sh->ppl_page)
21402291 __free_page(sh->ppl_page);
21412292 kmem_cache_free(sc, sh);
....@@ -2169,9 +2320,15 @@
21692320 sh->ppl_page = alloc_page(gfp);
21702321 if (!sh->ppl_page) {
21712322 free_stripe(sc, sh);
2172
- sh = NULL;
2323
+ return NULL;
21732324 }
21742325 }
2326
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2327
+ if (init_stripe_shared_pages(sh, conf, disks)) {
2328
+ free_stripe(sc, sh);
2329
+ return NULL;
2330
+ }
2331
+#endif
21752332 }
21762333 return sh;
21772334 }
....@@ -2228,10 +2385,13 @@
22282385 }
22292386
22302387 /**
2231
- * scribble_len - return the required size of the scribble region
2232
- * @num - total number of disks in the array
2388
+ * scribble_alloc - allocate percpu scribble buffer for required size
2389
+ * of the scribble region
2390
+ * @percpu: from for_each_present_cpu() of the caller
2391
+ * @num: total number of disks in the array
2392
+ * @cnt: scribble objs count for required size of the scribble region
22332393 *
2234
- * The size must be enough to contain:
2394
+ * The scribble buffer size must be enough to contain:
22352395 * 1/ a struct page pointer for each device in the array +2
22362396 * 2/ room to convert each entry in (1) to its corresponding dma
22372397 * (dma_map_page()) or page (page_address()) address.
....@@ -2240,21 +2400,29 @@
22402400 * calculate over all devices (not just the data blocks), using zeros in place
22412401 * of the P and Q blocks.
22422402 */
2243
-static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
2403
+static int scribble_alloc(struct raid5_percpu *percpu,
2404
+ int num, int cnt)
22442405 {
2245
- struct flex_array *ret;
2246
- size_t len;
2406
+ size_t obj_size =
2407
+ sizeof(struct page *) * (num + 2) +
2408
+ sizeof(addr_conv_t) * (num + 2) +
2409
+ sizeof(unsigned int) * (num + 2);
2410
+ void *scribble;
22472411
2248
- len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
2249
- ret = flex_array_alloc(len, cnt, flags);
2250
- if (!ret)
2251
- return NULL;
2252
- /* always prealloc all elements, so no locking is required */
2253
- if (flex_array_prealloc(ret, 0, cnt, flags)) {
2254
- flex_array_free(ret);
2255
- return NULL;
2256
- }
2257
- return ret;
2412
+ /*
2413
+ * If here is in raid array suspend context, it is in memalloc noio
2414
+ * context as well, there is no potential recursive memory reclaim
2415
+ * I/Os with the GFP_KERNEL flag.
2416
+ */
2417
+ scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
2418
+ if (!scribble)
2419
+ return -ENOMEM;
2420
+
2421
+ kvfree(percpu->scribble);
2422
+
2423
+ percpu->scribble = scribble;
2424
+ percpu->scribble_obj_size = obj_size;
2425
+ return 0;
22582426 }
22592427
22602428 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
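For a sense of the allocation size (an illustration, not taken from the patch): with num = 8 array members on a 64-bit build, each scribble object comes to

	obj_size = 8*(8+2) + 8*(8+2) + 4*(8+2) = 200 bytes

assuming sizeof(struct page *) == sizeof(addr_conv_t) == 8 and sizeof(unsigned int) == 4. kvmalloc_array() then allocates cnt such objects per CPU, where resize_chunks(), whose updated body follows, passes cnt = new_sectors / RAID5_STRIPE_SECTORS(conf).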
....@@ -2272,23 +2440,17 @@
22722440 return 0;
22732441 mddev_suspend(conf->mddev);
22742442 get_online_cpus();
2443
+
22752444 for_each_present_cpu(cpu) {
22762445 struct raid5_percpu *percpu;
2277
- struct flex_array *scribble;
22782446
22792447 percpu = per_cpu_ptr(conf->percpu, cpu);
2280
- scribble = scribble_alloc(new_disks,
2281
- new_sectors / STRIPE_SECTORS,
2282
- GFP_NOIO);
2283
-
2284
- if (scribble) {
2285
- flex_array_free(percpu->scribble);
2286
- percpu->scribble = scribble;
2287
- } else {
2288
- err = -ENOMEM;
2448
+ err = scribble_alloc(percpu, new_disks,
2449
+ new_sectors / RAID5_STRIPE_SECTORS(conf));
2450
+ if (err)
22892451 break;
2290
- }
22912452 }
2453
+
22922454 put_online_cpus();
22932455 mddev_resume(conf->mddev);
22942456 if (!err) {
....@@ -2376,9 +2538,16 @@
23762538 osh = get_free_stripe(conf, hash);
23772539 unlock_device_hash_lock(conf, hash);
23782540
2541
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2542
+ for (i = 0; i < osh->nr_pages; i++) {
2543
+ nsh->pages[i] = osh->pages[i];
2544
+ osh->pages[i] = NULL;
2545
+ }
2546
+#endif
23792547 for(i=0; i<conf->pool_size; i++) {
23802548 nsh->dev[i].page = osh->dev[i].page;
23812549 nsh->dev[i].orig_page = osh->dev[i].page;
2550
+ nsh->dev[i].offset = osh->dev[i].offset;
23822551 }
23832552 nsh->hash_lock_index = hash;
23842553 free_stripe(conf->slab_cache, osh);
....@@ -2427,14 +2596,33 @@
24272596 nsh = list_entry(newstripes.next, struct stripe_head, lru);
24282597 list_del_init(&nsh->lru);
24292598
2599
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2600
+ for (i = 0; i < nsh->nr_pages; i++) {
2601
+ if (nsh->pages[i])
2602
+ continue;
2603
+ nsh->pages[i] = alloc_page(GFP_NOIO);
2604
+ if (!nsh->pages[i])
2605
+ err = -ENOMEM;
2606
+ }
2607
+
2608
+ for (i = conf->raid_disks; i < newsize; i++) {
2609
+ if (nsh->dev[i].page)
2610
+ continue;
2611
+ nsh->dev[i].page = raid5_get_dev_page(nsh, i);
2612
+ nsh->dev[i].orig_page = nsh->dev[i].page;
2613
+ nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
2614
+ }
2615
+#else
24302616 for (i=conf->raid_disks; i < newsize; i++)
24312617 if (nsh->dev[i].page == NULL) {
24322618 struct page *p = alloc_page(GFP_NOIO);
24332619 nsh->dev[i].page = p;
24342620 nsh->dev[i].orig_page = p;
2621
+ nsh->dev[i].offset = 0;
24352622 if (!p)
24362623 err = -ENOMEM;
24372624 }
2625
+#endif
24382626 raid5_release_stripe(nsh);
24392627 }
24402628 /* critical section pass, GFP_NOIO no longer needed */
....@@ -2518,10 +2706,10 @@
25182706 */
25192707 pr_info_ratelimited(
25202708 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2521
- mdname(conf->mddev), STRIPE_SECTORS,
2709
+ mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
25222710 (unsigned long long)s,
25232711 bdevname(rdev->bdev, b));
2524
- atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2712
+ atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
25252713 clear_bit(R5_ReadError, &sh->dev[i].flags);
25262714 clear_bit(R5_ReWrite, &sh->dev[i].flags);
25272715 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
....@@ -2566,10 +2754,16 @@
25662754 (unsigned long long)s,
25672755 bdn);
25682756 } else if (atomic_read(&rdev->read_errors)
2569
- > conf->max_nr_stripes)
2570
- pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2571
- mdname(conf->mddev), bdn);
2572
- else
2757
+ > conf->max_nr_stripes) {
2758
+ if (!test_bit(Faulty, &rdev->flags)) {
2759
+ pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2760
+ mdname(conf->mddev),
2761
+ atomic_read(&rdev->read_errors),
2762
+ conf->max_nr_stripes);
2763
+ pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2764
+ mdname(conf->mddev), bdn);
2765
+ }
2766
+ } else
25732767 retry = 1;
25742768 if (set_bad && test_bit(In_sync, &rdev->flags)
25752769 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
....@@ -2588,7 +2782,7 @@
25882782 if (!(set_bad
25892783 && test_bit(In_sync, &rdev->flags)
25902784 && rdev_set_badblocks(
2591
- rdev, sh->sector, STRIPE_SECTORS, 0)))
2785
+ rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
25922786 md_error(conf->mddev, rdev);
25932787 }
25942788 }
....@@ -2604,7 +2798,7 @@
26042798 struct stripe_head *sh = bi->bi_private;
26052799 struct r5conf *conf = sh->raid_conf;
26062800 int disks = sh->disks, i;
2607
- struct md_rdev *uninitialized_var(rdev);
2801
+ struct md_rdev *rdev;
26082802 sector_t first_bad;
26092803 int bad_sectors;
26102804 int replacement = 0;
....@@ -2640,7 +2834,7 @@
26402834 if (bi->bi_status)
26412835 md_error(conf->mddev, rdev);
26422836 else if (is_badblock(rdev, sh->sector,
2643
- STRIPE_SECTORS,
2837
+ RAID5_STRIPE_SECTORS(conf),
26442838 &first_bad, &bad_sectors))
26452839 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
26462840 } else {
....@@ -2652,7 +2846,7 @@
26522846 set_bit(MD_RECOVERY_NEEDED,
26532847 &rdev->mddev->recovery);
26542848 } else if (is_badblock(rdev, sh->sector,
2655
- STRIPE_SECTORS,
2849
+ RAID5_STRIPE_SECTORS(conf),
26562850 &first_bad, &bad_sectors)) {
26572851 set_bit(R5_MadeGood, &sh->dev[i].flags);
26582852 if (test_bit(R5_ReadError, &sh->dev[i].flags))
....@@ -2672,10 +2866,10 @@
26722866 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
26732867 clear_bit(R5_LOCKED, &sh->dev[i].flags);
26742868 set_bit(STRIPE_HANDLE, &sh->state);
2675
- raid5_release_stripe(sh);
26762869
26772870 if (sh->batch_head && sh != sh->batch_head)
26782871 raid5_release_stripe(sh->batch_head);
2872
+ raid5_release_stripe(sh);
26792873 }
26802874
26812875 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
....@@ -2685,22 +2879,31 @@
26852879 unsigned long flags;
26862880 pr_debug("raid456: error called\n");
26872881
2882
+ pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n",
2883
+ mdname(mddev), bdevname(rdev->bdev, b));
2884
+
26882885 spin_lock_irqsave(&conf->device_lock, flags);
26892886 set_bit(Faulty, &rdev->flags);
26902887 clear_bit(In_sync, &rdev->flags);
26912888 mddev->degraded = raid5_calc_degraded(conf);
2889
+
2890
+ if (has_failed(conf)) {
2891
+ set_bit(MD_BROKEN, &conf->mddev->flags);
2892
+ conf->recovery_disabled = mddev->recovery_disabled;
2893
+
2894
+ pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
2895
+ mdname(mddev), mddev->degraded, conf->raid_disks);
2896
+ } else {
2897
+ pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
2898
+ mdname(mddev), conf->raid_disks - mddev->degraded);
2899
+ }
2900
+
26922901 spin_unlock_irqrestore(&conf->device_lock, flags);
26932902 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
26942903
26952904 set_bit(Blocked, &rdev->flags);
26962905 set_mask_bits(&mddev->sb_flags, 0,
26972906 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2698
- pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2699
- "md/raid:%s: Operation continuing on %d devices.\n",
2700
- mdname(mddev),
2701
- bdevname(rdev->bdev, b),
2702
- mdname(mddev),
2703
- conf->raid_disks - mddev->degraded);
27042907 r5c_update_on_rdev_error(mddev, rdev);
27052908 }
27062909
....@@ -3274,13 +3477,13 @@
32743477 /* check if page is covered */
32753478 sector_t sector = sh->dev[dd_idx].sector;
32763479 for (bi=sh->dev[dd_idx].towrite;
3277
- sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
3480
+ sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
32783481 bi && bi->bi_iter.bi_sector <= sector;
3279
- bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
3482
+ bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
32803483 if (bio_end_sector(bi) >= sector)
32813484 sector = bio_end_sector(bi);
32823485 }
3283
- if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
3486
+ if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
32843487 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
32853488 sh->overwrite_disks++;
32863489 }
....@@ -3305,7 +3508,7 @@
33053508 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
33063509 spin_unlock_irq(&sh->stripe_lock);
33073510 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3308
- STRIPE_SECTORS, 0);
3511
+ RAID5_STRIPE_SECTORS(conf), 0);
33093512 spin_lock_irq(&sh->stripe_lock);
33103513 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
33113514 if (!sh->batch_head) {
....@@ -3367,7 +3570,7 @@
33673570 if (!rdev_set_badblocks(
33683571 rdev,
33693572 sh->sector,
3370
- STRIPE_SECTORS, 0))
3573
+ RAID5_STRIPE_SECTORS(conf), 0))
33713574 md_error(conf->mddev, rdev);
33723575 rdev_dec_pending(rdev, conf->mddev);
33733576 }
....@@ -3387,8 +3590,8 @@
33873590 wake_up(&conf->wait_for_overlap);
33883591
33893592 while (bi && bi->bi_iter.bi_sector <
3390
- sh->dev[i].sector + STRIPE_SECTORS) {
3391
- struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
3593
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3594
+ struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
33923595
33933596 md_write_end(conf->mddev);
33943597 bio_io_error(bi);
....@@ -3396,7 +3599,7 @@
33963599 }
33973600 if (bitmap_end)
33983601 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3399
- STRIPE_SECTORS, 0, 0);
3602
+ RAID5_STRIPE_SECTORS(conf), 0, 0);
34003603 bitmap_end = 0;
34013604 /* and fail all 'written' */
34023605 bi = sh->dev[i].written;
....@@ -3408,8 +3611,8 @@
34083611
34093612 if (bi) bitmap_end = 1;
34103613 while (bi && bi->bi_iter.bi_sector <
3411
- sh->dev[i].sector + STRIPE_SECTORS) {
3412
- struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
3614
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3615
+ struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
34133616
34143617 md_write_end(conf->mddev);
34153618 bio_io_error(bi);
....@@ -3432,9 +3635,9 @@
34323635 if (bi)
34333636 s->to_read--;
34343637 while (bi && bi->bi_iter.bi_sector <
3435
- sh->dev[i].sector + STRIPE_SECTORS) {
3638
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
34363639 struct bio *nextbi =
3437
- r5_next_bio(bi, sh->dev[i].sector);
3640
+ r5_next_bio(conf, bi, sh->dev[i].sector);
34383641
34393642 bio_io_error(bi);
34403643 bi = nextbi;
....@@ -3442,7 +3645,7 @@
34423645 }
34433646 if (bitmap_end)
34443647 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3445
- STRIPE_SECTORS, 0, 0);
3648
+ RAID5_STRIPE_SECTORS(conf), 0, 0);
34463649 /* If we were in the middle of a write the parity block might
34473650 * still be locked - so just clear all R5_LOCKED flags
34483651 */
....@@ -3487,14 +3690,14 @@
34873690 && !test_bit(Faulty, &rdev->flags)
34883691 && !test_bit(In_sync, &rdev->flags)
34893692 && !rdev_set_badblocks(rdev, sh->sector,
3490
- STRIPE_SECTORS, 0))
3693
+ RAID5_STRIPE_SECTORS(conf), 0))
34913694 abort = 1;
34923695 rdev = rcu_dereference(conf->disks[i].replacement);
34933696 if (rdev
34943697 && !test_bit(Faulty, &rdev->flags)
34953698 && !test_bit(In_sync, &rdev->flags)
34963699 && !rdev_set_badblocks(rdev, sh->sector,
3497
- STRIPE_SECTORS, 0))
3700
+ RAID5_STRIPE_SECTORS(conf), 0))
34983701 abort = 1;
34993702 }
35003703 rcu_read_unlock();
....@@ -3502,7 +3705,7 @@
35023705 conf->recovery_disabled =
35033706 conf->mddev->recovery_disabled;
35043707 }
3505
- md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
3708
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
35063709 }
35073710
35083711 static int want_replace(struct stripe_head *sh, int disk_idx)
....@@ -3529,6 +3732,7 @@
35293732 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
35303733 &sh->dev[s->failed_num[1]] };
35313734 int i;
3735
+ bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
35323736
35333737
35343738 if (test_bit(R5_LOCKED, &dev->flags) ||
....@@ -3587,18 +3791,27 @@
35873791 * devices must be read.
35883792 */
35893793 return 1;
3794
+
3795
+ if (s->failed >= 2 &&
3796
+ (fdev[i]->towrite ||
3797
+ s->failed_num[i] == sh->pd_idx ||
3798
+ s->failed_num[i] == sh->qd_idx) &&
3799
+ !test_bit(R5_UPTODATE, &fdev[i]->flags))
3800
+ /* In max degraded raid6, If the failed disk is P, Q,
3801
+ * or we want to read the failed disk, we need to do
3802
+ * reconstruct-write.
3803
+ */
3804
+ force_rcw = true;
35903805 }
35913806
3592
- /* If we are forced to do a reconstruct-write, either because
3593
- * the current RAID6 implementation only supports that, or
3594
- * because parity cannot be trusted and we are currently
3595
- * recovering it, there is extra need to be careful.
3807
+ /* If we are forced to do a reconstruct-write, because parity
3808
+ * cannot be trusted and we are currently recovering it, there
3809
+ * is extra need to be careful.
35963810 * If one of the devices that we would need to read, because
35973811 * it is not being overwritten (and maybe not written at all)
35983812 * is missing/faulty, then we need to read everything we can.
35993813 */
3600
- if (sh->raid_conf->level != 6 &&
3601
- sh->raid_conf->rmw_level != PARITY_DISABLE_RMW &&
3814
+ if (!force_rcw &&
36023815 sh->sector < sh->raid_conf->mddev->recovery_cp)
36033816 /* reconstruct-write isn't being forced */
36043817 return 0;
....@@ -3702,7 +3915,7 @@
37023915 return 0;
37033916 }
37043917
3705
-/**
3918
+/*
37063919 * handle_stripe_fill - read or compute data to satisfy pending requests.
37073920 */
37083921 static void handle_stripe_fill(struct stripe_head *sh,
....@@ -3725,7 +3938,7 @@
37253938 * back cache (prexor with orig_page, and then xor with
37263939 * page) in the read path
37273940 */
3728
- if (s->injournal && s->failed) {
3941
+ if (s->to_read && s->injournal && s->failed) {
37293942 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
37303943 r5c_make_stripe_write_out(sh);
37313944 goto out;
....@@ -3777,14 +3990,14 @@
37773990 wbi = dev->written;
37783991 dev->written = NULL;
37793992 while (wbi && wbi->bi_iter.bi_sector <
3780
- dev->sector + STRIPE_SECTORS) {
3781
- wbi2 = r5_next_bio(wbi, dev->sector);
3993
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
3994
+ wbi2 = r5_next_bio(conf, wbi, dev->sector);
37823995 md_write_end(conf->mddev);
37833996 bio_endio(wbi);
37843997 wbi = wbi2;
37853998 }
37863999 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3787
- STRIPE_SECTORS,
4000
+ RAID5_STRIPE_SECTORS(conf),
37884001 !test_bit(STRIPE_DEGRADED, &sh->state),
37894002 0);
37904003 if (head_sh->batch_head) {
....@@ -3968,10 +4181,8 @@
39684181 set_bit(R5_LOCKED, &dev->flags);
39694182 set_bit(R5_Wantread, &dev->flags);
39704183 s->locked++;
3971
- } else {
4184
+ } else
39724185 set_bit(STRIPE_DELAYED, &sh->state);
3973
- set_bit(STRIPE_HANDLE, &sh->state);
3974
- }
39754186 }
39764187 }
39774188 }
....@@ -3996,10 +4207,8 @@
39964207 set_bit(R5_Wantread, &dev->flags);
39974208 s->locked++;
39984209 qread++;
3999
- } else {
4210
+ } else
40004211 set_bit(STRIPE_DELAYED, &sh->state);
4001
- set_bit(STRIPE_HANDLE, &sh->state);
4002
- }
40034212 }
40044213 }
40054214 if (rcw && conf->mddev->queue)
....@@ -4049,7 +4258,7 @@
40494258 break;
40504259 }
40514260 dev = &sh->dev[s->failed_num[0]];
4052
- /* fall through */
4261
+ fallthrough;
40534262 case check_state_compute_result:
40544263 sh->check_state = check_state_idle;
40554264 if (!dev)
....@@ -4091,7 +4300,7 @@
40914300 */
40924301 set_bit(STRIPE_INSYNC, &sh->state);
40934302 else {
4094
- atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4303
+ atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
40954304 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
40964305 /* don't try to repair!! */
40974306 set_bit(STRIPE_INSYNC, &sh->state);
....@@ -4099,7 +4308,7 @@
40994308 "%llu-%llu\n", mdname(conf->mddev),
41004309 (unsigned long long) sh->sector,
41014310 (unsigned long long) sh->sector +
4102
- STRIPE_SECTORS);
4311
+ RAID5_STRIPE_SECTORS(conf));
41034312 } else {
41044313 sh->check_state = check_state_compute_run;
41054314 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
....@@ -4180,7 +4389,7 @@
41804389
41814390 /* we have 2-disk failure */
41824391 BUG_ON(s->failed != 2);
4183
- /* fall through */
4392
+ fallthrough;
41844393 case check_state_compute_result:
41854394 sh->check_state = check_state_idle;
41864395
....@@ -4256,7 +4465,7 @@
42564465 */
42574466 }
42584467 } else {
4259
- atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4468
+ atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
42604469 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
42614470 /* don't try to repair!! */
42624471 set_bit(STRIPE_INSYNC, &sh->state);
....@@ -4264,7 +4473,7 @@
42644473 "%llu-%llu\n", mdname(conf->mddev),
42654474 (unsigned long long) sh->sector,
42664475 (unsigned long long) sh->sector +
4267
- STRIPE_SECTORS);
4476
+ RAID5_STRIPE_SECTORS(conf));
42684477 } else {
42694478 int *target = &sh->ops.target;
42704479
....@@ -4335,7 +4544,8 @@
43354544 /* place all the copies on one channel */
43364545 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
43374546 tx = async_memcpy(sh2->dev[dd_idx].page,
4338
- sh->dev[i].page, 0, 0, STRIPE_SIZE,
4547
+ sh->dev[i].page, sh2->dev[dd_idx].offset,
4548
+ sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
43394549 &submit);
43404550
43414551 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
....@@ -4434,8 +4644,8 @@
44344644 */
44354645 rdev = rcu_dereference(conf->disks[i].replacement);
44364646 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4437
- rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
4438
- !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4647
+ rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4648
+ !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
44394649 &first_bad, &bad_sectors))
44404650 set_bit(R5_ReadRepl, &dev->flags);
44414651 else {
....@@ -4449,7 +4659,7 @@
44494659 if (rdev && test_bit(Faulty, &rdev->flags))
44504660 rdev = NULL;
44514661 if (rdev) {
4452
- is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4662
+ is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
44534663 &first_bad, &bad_sectors);
44544664 if (s->blocked_rdev == NULL
44554665 && (test_bit(Blocked, &rdev->flags)
....@@ -4476,7 +4686,7 @@
44764686 }
44774687 } else if (test_bit(In_sync, &rdev->flags))
44784688 set_bit(R5_Insync, &dev->flags);
4479
- else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
4689
+ else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
44804690 /* in sync if before recovery_offset */
44814691 set_bit(R5_Insync, &dev->flags);
44824692 else if (test_bit(R5_UPTODATE, &dev->flags) &&
....@@ -4565,12 +4775,12 @@
45654775 rcu_read_unlock();
45664776 }
45674777
4778
+/*
4779
+ * Return '1' if this is a member of batch, or '0' if it is a lone stripe or
4780
+ * a head which can now be handled.
4781
+ */
45684782 static int clear_batch_ready(struct stripe_head *sh)
45694783 {
4570
- /* Return '1' if this is a member of batch, or
4571
- * '0' if it is a lone stripe or a head which can now be
4572
- * handled.
4573
- */
45744784 struct stripe_head *tmp;
45754785 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
45764786 return (sh->batch_head && sh->batch_head != sh);
....@@ -4620,7 +4830,6 @@
46204830 (1 << STRIPE_FULL_WRITE) |
46214831 (1 << STRIPE_BIOFILL_RUN) |
46224832 (1 << STRIPE_COMPUTE_RUN) |
4623
- (1 << STRIPE_OPS_REQ_PENDING) |
46244833 (1 << STRIPE_DISCARD) |
46254834 (1 << STRIPE_BATCH_READY) |
46264835 (1 << STRIPE_BATCH_ERR) |
....@@ -4675,15 +4884,20 @@
46754884 struct r5dev *pdev, *qdev;
46764885
46774886 clear_bit(STRIPE_HANDLE, &sh->state);
4887
+
4888
+ /*
4889
+ * handle_stripe should not continue handle the batched stripe, only
4890
+ * the head of batch list or lone stripe can continue. Otherwise we
4891
+ * could see break_stripe_batch_list warns about the STRIPE_ACTIVE
4892
+ * is set for the batched stripe.
4893
+ */
4894
+ if (clear_batch_ready(sh))
4895
+ return;
4896
+
46784897 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
46794898 /* already being handled, ensure it gets handled
46804899 * again when current action finishes */
46814900 set_bit(STRIPE_HANDLE, &sh->state);
4682
- return;
4683
- }
4684
-
4685
- if (clear_batch_ready(sh) ) {
4686
- clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
46874901 return;
46884902 }
46894903
....@@ -4920,7 +5134,7 @@
49205134 if ((s.syncing || s.replacing) && s.locked == 0 &&
49215135 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
49225136 test_bit(STRIPE_INSYNC, &sh->state)) {
4923
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
5137
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
49245138 clear_bit(STRIPE_SYNCING, &sh->state);
49255139 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
49265140 wake_up(&conf->wait_for_overlap);
....@@ -4939,14 +5153,11 @@
49395153 if (!test_bit(R5_ReWrite, &dev->flags)) {
49405154 set_bit(R5_Wantwrite, &dev->flags);
49415155 set_bit(R5_ReWrite, &dev->flags);
4942
- set_bit(R5_LOCKED, &dev->flags);
4943
- s.locked++;
4944
- } else {
5156
+ } else
49455157 /* let's read it back */
49465158 set_bit(R5_Wantread, &dev->flags);
4947
- set_bit(R5_LOCKED, &dev->flags);
4948
- s.locked++;
4949
- }
5159
+ set_bit(R5_LOCKED, &dev->flags);
5160
+ s.locked++;
49505161 }
49515162 }
49525163
....@@ -4988,7 +5199,7 @@
49885199 clear_bit(STRIPE_EXPAND_READY, &sh->state);
49895200 atomic_dec(&conf->reshape_stripes);
49905201 wake_up(&conf->wait_for_overlap);
4991
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
5202
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
49925203 }
49935204
49945205 if (s.expanding && s.locked == 0 &&
....@@ -5018,14 +5229,14 @@
50185229 /* We own a safe reference to the rdev */
50195230 rdev = conf->disks[i].rdev;
50205231 if (!rdev_set_badblocks(rdev, sh->sector,
5021
- STRIPE_SECTORS, 0))
5232
+ RAID5_STRIPE_SECTORS(conf), 0))
50225233 md_error(conf->mddev, rdev);
50235234 rdev_dec_pending(rdev, conf->mddev);
50245235 }
50255236 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
50265237 rdev = conf->disks[i].rdev;
50275238 rdev_clear_badblocks(rdev, sh->sector,
5028
- STRIPE_SECTORS, 0);
5239
+ RAID5_STRIPE_SECTORS(conf), 0);
50295240 rdev_dec_pending(rdev, conf->mddev);
50305241 }
50315242 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
....@@ -5034,7 +5245,7 @@
50345245 /* rdev have been moved down */
50355246 rdev = conf->disks[i].rdev;
50365247 rdev_clear_badblocks(rdev, sh->sector,
5037
- STRIPE_SECTORS, 0);
5248
+ RAID5_STRIPE_SECTORS(conf), 0);
50385249 rdev_dec_pending(rdev, conf->mddev);
50395250 }
50405251 }
....@@ -5090,28 +5301,6 @@
50905301 hash = sh->hash_lock_index;
50915302 __release_stripe(conf, sh, &temp_inactive_list[hash]);
50925303 }
5093
-}
5094
-
5095
-static int raid5_congested(struct mddev *mddev, int bits)
5096
-{
5097
- struct r5conf *conf = mddev->private;
5098
-
5099
- /* No difference between reads and writes. Just check
5100
- * how busy the stripe_cache is
5101
- */
5102
-
5103
- if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
5104
- return 1;
5105
-
5106
- /* Also checks whether there is pressure on r5cache log space */
5107
- if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
5108
- return 1;
5109
- if (conf->quiesce)
5110
- return 1;
5111
- if (atomic_read(&conf->empty_inactive_list_nr))
5112
- return 1;
5113
-
5114
- return 0;
51155304 }
51165305
51175306 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
....@@ -5259,7 +5448,6 @@
52595448 rcu_read_unlock();
52605449 raid_bio->bi_next = (void*)rdev;
52615450 bio_set_dev(align_bi, rdev->bdev);
5262
- bio_clear_flag(align_bi, BIO_SEG_VALID);
52635451
52645452 if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
52655453 bio_sectors(align_bi),
....@@ -5283,7 +5471,7 @@
52835471 trace_block_bio_remap(align_bi->bi_disk->queue,
52845472 align_bi, disk_devt(mddev->gendisk),
52855473 raid_bio->bi_iter.bi_sector);
5286
- generic_make_request(align_bi);
5474
+ submit_bio_noacct(align_bi);
52875475 return 1;
52885476 } else {
52895477 rcu_read_unlock();
....@@ -5303,7 +5491,7 @@
53035491 struct r5conf *conf = mddev->private;
53045492 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
53055493 bio_chain(split, raid_bio);
5306
- generic_make_request(raid_bio);
5494
+ submit_bio_noacct(raid_bio);
53075495 raid_bio = split;
53085496 }
53095497
....@@ -5499,8 +5687,8 @@
54995687 /* Skip discard while reshape is happening */
55005688 return;
55015689
5502
- logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5503
- last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
5690
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5691
+ last_sector = bio_end_sector(bi);
55045692
55055693 bi->bi_next = NULL;
55065694
....@@ -5514,7 +5702,7 @@
55145702 last_sector *= conf->chunk_sectors;
55155703
55165704 for (; logical_sector < last_sector;
5517
- logical_sector += STRIPE_SECTORS) {
5705
+ logical_sector += RAID5_STRIPE_SECTORS(conf)) {
55185706 DEFINE_WAIT(w);
55195707 int d;
55205708 again:
....@@ -5559,7 +5747,7 @@
55595747 d++)
55605748 md_bitmap_startwrite(mddev->bitmap,
55615749 sh->sector,
5562
- STRIPE_SECTORS,
5750
+ RAID5_STRIPE_SECTORS(conf),
55635751 0);
55645752 sh->bm_seq = conf->seq_flush + 1;
55655753 set_bit(STRIPE_BIT_DELAY, &sh->state);
....@@ -5624,12 +5812,12 @@
56245812 return true;
56255813 }
56265814
5627
- logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5815
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
56285816 last_sector = bio_end_sector(bi);
56295817 bi->bi_next = NULL;
56305818
56315819 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5632
- for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
5820
+ for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
56335821 int previous;
56345822 int seq;
56355823
....@@ -5727,8 +5915,7 @@
57275915 do_flush = false;
57285916 }
57295917
5730
- if (!sh->batch_head || sh == sh->batch_head)
5731
- set_bit(STRIPE_HANDLE, &sh->state);
5918
+ set_bit(STRIPE_HANDLE, &sh->state);
57325919 clear_bit(STRIPE_DELAYED, &sh->state);
57335920 if ((!sh->batch_head || sh == sh->batch_head) &&
57345921 (bi->bi_opf & REQ_SYNC) &&
....@@ -5793,7 +5980,7 @@
57935980 sector_div(sector_nr, new_data_disks);
57945981 if (sector_nr) {
57955982 mddev->curr_resync_completed = sector_nr;
5796
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5983
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
57975984 *skipped = 1;
57985985 retn = sector_nr;
57995986 goto finish;
....@@ -5907,11 +6094,11 @@
59076094 conf->reshape_safe = mddev->reshape_position;
59086095 spin_unlock_irq(&conf->device_lock);
59096096 wake_up(&conf->wait_for_overlap);
5910
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6097
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
59116098 }
59126099
59136100 INIT_LIST_HEAD(&stripes);
5914
- for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
6101
+ for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
59156102 int j;
59166103 int skipped_disk = 0;
59176104 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
....@@ -5932,7 +6119,7 @@
59326119 skipped_disk = 1;
59336120 continue;
59346121 }
5935
- memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
6122
+ memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
59366123 set_bit(R5_Expanded, &sh->dev[j].flags);
59376124 set_bit(R5_UPTODATE, &sh->dev[j].flags);
59386125 }
....@@ -5967,7 +6154,7 @@
59676154 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
59686155 set_bit(STRIPE_HANDLE, &sh->state);
59696156 raid5_release_stripe(sh);
5970
- first_sector += STRIPE_SECTORS;
6157
+ first_sector += RAID5_STRIPE_SECTORS(conf);
59716158 }
59726159 /* Now that the sources are clearly marked, we can release
59736160 * the destination stripes
....@@ -6014,7 +6201,7 @@
60146201 conf->reshape_safe = mddev->reshape_position;
60156202 spin_unlock_irq(&conf->device_lock);
60166203 wake_up(&conf->wait_for_overlap);
6017
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6204
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
60186205 }
60196206 ret:
60206207 return retn;
....@@ -6073,11 +6260,12 @@
60736260 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
60746261 !conf->fullsync &&
60756262 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6076
- sync_blocks >= STRIPE_SECTORS) {
6263
+ sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
60776264 /* we can skip this block, and probably more */
6078
- sync_blocks /= STRIPE_SECTORS;
6265
+ do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
60796266 *skipped = 1;
6080
- return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
6267
+ /* keep things rounded to whole stripes */
6268
+ return sync_blocks * RAID5_STRIPE_SECTORS(conf);
60816269 }
60826270
60836271 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
....@@ -6110,7 +6298,7 @@
61106298
61116299 raid5_release_stripe(sh);
61126300
6113
- return STRIPE_SECTORS;
6301
+ return RAID5_STRIPE_SECTORS(conf);
61146302 }
61156303
61166304 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
....@@ -6133,14 +6321,14 @@
61336321 int handled = 0;
61346322
61356323 logical_sector = raid_bio->bi_iter.bi_sector &
6136
- ~((sector_t)STRIPE_SECTORS-1);
6324
+ ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
61376325 sector = raid5_compute_sector(conf, logical_sector,
61386326 0, &dd_idx, NULL);
61396327 last_sector = bio_end_sector(raid_bio);
61406328
61416329 for (; logical_sector < last_sector;
6142
- logical_sector += STRIPE_SECTORS,
6143
- sector += STRIPE_SECTORS,
6330
+ logical_sector += RAID5_STRIPE_SECTORS(conf),
6331
+ sector += RAID5_STRIPE_SECTORS(conf),
61446332 scnt++) {
61456333
61466334 if (scnt < offset)
....@@ -6179,6 +6367,8 @@
61796367 static int handle_active_stripes(struct r5conf *conf, int group,
61806368 struct r5worker *worker,
61816369 struct list_head *temp_inactive_list)
6370
+ __releases(&conf->device_lock)
6371
+ __acquires(&conf->device_lock)
61826372 {
61836373 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
61846374 int i, batch_size = 0, hash;
....@@ -6331,7 +6521,18 @@
63316521 spin_unlock_irq(&conf->device_lock);
63326522 md_check_recovery(mddev);
63336523 spin_lock_irq(&conf->device_lock);
6524
+
6525
+ /*
6526
+ * Waiting on MD_SB_CHANGE_PENDING below may deadlock
6527
+ * because md_check_recovery() is needed to clear
6528
+ * the flag when using mdmon.
6529
+ */
6530
+ continue;
63346531 }
6532
+
6533
+ wait_event_lock_irq(mddev->sb_wait,
6534
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6535
+ conf->device_lock);
63356536 }
63366537 pr_debug("%d stripes handled\n", handled);
63376538
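In the raid5d loop above, the thread now continues straight after md_check_recovery() instead of falling through to the new wait on MD_SB_CHANGE_PENDING, because that wait could deadlock when md_check_recovery() (or mdmon) is what clears the flag. A rough control-flow sketch under assumed flag handling, with fake_md_check_recovery() standing in for the real helper:

/* Control-flow sketch only: flag name mimics the kernel's, helpers are stand-ins. */
#include <stdbool.h>
#include <stdio.h>

#define MD_SB_CHANGE_PENDING (1u << 0)

static bool ran_check_recovery;

static void fake_md_check_recovery(unsigned int *sb_flags)
{
        ran_check_recovery = true;
        *sb_flags &= ~MD_SB_CHANGE_PENDING;     /* recovery path clears the flag */
}

int main(void)
{
        unsigned int sb_flags = MD_SB_CHANGE_PENDING;

        for (int iter = 0; iter < 3; iter++) {
                int handled = 0;                /* pretend no stripes were handled */

                if (!handled && (sb_flags & MD_SB_CHANGE_PENDING)) {
                        fake_md_check_recovery(&sb_flags);
                        /* Do NOT fall through to the wait below: waiting before the
                         * flag is cleared could deadlock, so re-run the loop instead. */
                        continue;
                }
                /* only reached once the flag is clear; in the kernel this is
                 * wait_event_lock_irq(sb_wait, !MD_SB_CHANGE_PENDING, ...) */
                printf("iter %d: safe to wait, sb_flags=0x%x\n", iter, sb_flags);
        }
        return ran_check_recovery ? 0 : 1;
}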
....@@ -6471,6 +6672,100 @@
64716672 raid5_show_rmw_level,
64726673 raid5_store_rmw_level);
64736674
6675
+static ssize_t
6676
+raid5_show_stripe_size(struct mddev *mddev, char *page)
6677
+{
6678
+ struct r5conf *conf;
6679
+ int ret = 0;
6680
+
6681
+ spin_lock(&mddev->lock);
6682
+ conf = mddev->private;
6683
+ if (conf)
6684
+ ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
6685
+ spin_unlock(&mddev->lock);
6686
+ return ret;
6687
+}
6688
+
6689
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6690
+static ssize_t
6691
+raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
6692
+{
6693
+ struct r5conf *conf;
6694
+ unsigned long new;
6695
+ int err;
6696
+ int size;
6697
+
6698
+ if (len >= PAGE_SIZE)
6699
+ return -EINVAL;
6700
+ if (kstrtoul(page, 10, &new))
6701
+ return -EINVAL;
6702
+
6703
+ /*
6704
+ * The value must not be bigger than PAGE_SIZE. It is required to
6705
+ * be a multiple of DEFAULT_STRIPE_SIZE and a power
6706
+ * of two.
6707
+ */
6708
+ if (new % DEFAULT_STRIPE_SIZE != 0 ||
6709
+ new > PAGE_SIZE || new == 0 ||
6710
+ new != roundup_pow_of_two(new))
6711
+ return -EINVAL;
6712
+
6713
+ err = mddev_lock(mddev);
6714
+ if (err)
6715
+ return err;
6716
+
6717
+ conf = mddev->private;
6718
+ if (!conf) {
6719
+ err = -ENODEV;
6720
+ goto out_unlock;
6721
+ }
6722
+
6723
+ if (new == conf->stripe_size)
6724
+ goto out_unlock;
6725
+
6726
+ pr_debug("md/raid: change stripe_size from %lu to %lu\n",
6727
+ conf->stripe_size, new);
6728
+
6729
+ if (mddev->sync_thread ||
6730
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6731
+ mddev->reshape_position != MaxSector ||
6732
+ mddev->sysfs_active) {
6733
+ err = -EBUSY;
6734
+ goto out_unlock;
6735
+ }
6736
+
6737
+ mddev_suspend(mddev);
6738
+ mutex_lock(&conf->cache_size_mutex);
6739
+ size = conf->max_nr_stripes;
6740
+
6741
+ shrink_stripes(conf);
6742
+
6743
+ conf->stripe_size = new;
6744
+ conf->stripe_shift = ilog2(new) - 9;
6745
+ conf->stripe_sectors = new >> 9;
6746
+ if (grow_stripes(conf, size)) {
6747
+ pr_warn("md/raid:%s: couldn't allocate buffers\n",
6748
+ mdname(mddev));
6749
+ err = -ENOMEM;
6750
+ }
6751
+ mutex_unlock(&conf->cache_size_mutex);
6752
+ mddev_resume(mddev);
6753
+
6754
+out_unlock:
6755
+ mddev_unlock(mddev);
6756
+ return err ?: len;
6757
+}
6758
+
6759
+static struct md_sysfs_entry
6760
+raid5_stripe_size = __ATTR(stripe_size, 0644,
6761
+ raid5_show_stripe_size,
6762
+ raid5_store_stripe_size);
6763
+#else
6764
+static struct md_sysfs_entry
6765
+raid5_stripe_size = __ATTR(stripe_size, 0444,
6766
+ raid5_show_stripe_size,
6767
+ NULL);
6768
+#endif
64746769
64756770 static ssize_t
64766771 raid5_show_preread_threshold(struct mddev *mddev, char *page)
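The new stripe_size attribute above only accepts a value that is a power of two, a multiple of DEFAULT_STRIPE_SIZE and no larger than PAGE_SIZE, and it is only writable when PAGE_SIZE differs from the 4 KiB default. A small standalone check with the same rules, where the 64 KiB page size and the candidate values are assumptions for illustration:

#include <stdbool.h>
#include <stdio.h>

#define TOY_DEFAULT_STRIPE_SIZE 4096UL  /* assumption: 4 KiB default stripe size */
#define TOY_PAGE_SIZE           65536UL /* e.g. a 64 KiB-page arm64/ppc64 kernel */

static bool stripe_size_ok(unsigned long new)
{
        if (new == 0 || new > TOY_PAGE_SIZE)
                return false;
        if (new % TOY_DEFAULT_STRIPE_SIZE)
                return false;
        return (new & (new - 1)) == 0;  /* power of two */
}

int main(void)
{
        unsigned long candidates[] = { 4096, 8192, 12288, 16384, 65536, 131072 };

        for (unsigned int i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++)
                printf("%lu -> %s\n", candidates[i],
                       stripe_size_ok(candidates[i]) ? "accepted" : "rejected");
        return 0;
}

On such a kernel the knob would typically be driven through sysfs, e.g. echo 16384 > /sys/block/md0/md/stripe_size.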
....@@ -6550,14 +6845,14 @@
65506845 if (!conf)
65516846 err = -ENODEV;
65526847 else if (new != conf->skip_copy) {
6848
+ struct request_queue *q = mddev->queue;
6849
+
65536850 mddev_suspend(mddev);
65546851 conf->skip_copy = new;
65556852 if (new)
6556
- mddev->queue->backing_dev_info->capabilities |=
6557
- BDI_CAP_STABLE_WRITES;
6853
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
65586854 else
6559
- mddev->queue->backing_dev_info->capabilities &=
6560
- ~BDI_CAP_STABLE_WRITES;
6855
+ blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
65616856 mddev_resume(mddev);
65626857 }
65636858 mddev_unlock(mddev);
....@@ -6597,7 +6892,6 @@
65976892
65986893 static int alloc_thread_groups(struct r5conf *conf, int cnt,
65996894 int *group_cnt,
6600
- int *worker_cnt_per_group,
66016895 struct r5worker_group **worker_groups);
66026896 static ssize_t
66036897 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
....@@ -6606,7 +6900,7 @@
66066900 unsigned int new;
66076901 int err;
66086902 struct r5worker_group *new_groups, *old_groups;
6609
- int group_cnt, worker_cnt_per_group;
6903
+ int group_cnt;
66106904
66116905 if (len >= PAGE_SIZE)
66126906 return -EINVAL;
....@@ -6629,13 +6923,11 @@
66296923 if (old_groups)
66306924 flush_workqueue(raid5_wq);
66316925
6632
- err = alloc_thread_groups(conf, new,
6633
- &group_cnt, &worker_cnt_per_group,
6634
- &new_groups);
6926
+ err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
66356927 if (!err) {
66366928 spin_lock_irq(&conf->device_lock);
66376929 conf->group_cnt = group_cnt;
6638
- conf->worker_cnt_per_group = worker_cnt_per_group;
6930
+ conf->worker_cnt_per_group = new;
66396931 conf->worker_groups = new_groups;
66406932 spin_unlock_irq(&conf->device_lock);
66416933
....@@ -6662,7 +6954,9 @@
66626954 &raid5_group_thread_cnt.attr,
66636955 &raid5_skip_copy.attr,
66646956 &raid5_rmw_level.attr,
6957
+ &raid5_stripe_size.attr,
66656958 &r5c_journal_mode.attr,
6959
+ &ppl_write_hint.attr,
66666960 NULL,
66676961 };
66686962 static struct attribute_group raid5_attrs_group = {
....@@ -6670,16 +6964,13 @@
66706964 .attrs = raid5_attrs,
66716965 };
66726966
6673
-static int alloc_thread_groups(struct r5conf *conf, int cnt,
6674
- int *group_cnt,
6675
- int *worker_cnt_per_group,
6967
+static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
66766968 struct r5worker_group **worker_groups)
66776969 {
66786970 int i, j, k;
66796971 ssize_t size;
66806972 struct r5worker *workers;
66816973
6682
- *worker_cnt_per_group = cnt;
66836974 if (cnt == 0) {
66846975 *group_cnt = 0;
66856976 *worker_groups = NULL;
....@@ -6745,25 +7036,25 @@
67457036 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
67467037 {
67477038 safe_put_page(percpu->spare_page);
6748
- if (percpu->scribble)
6749
- flex_array_free(percpu->scribble);
67507039 percpu->spare_page = NULL;
7040
+ kvfree(percpu->scribble);
67517041 percpu->scribble = NULL;
67527042 }
67537043
67547044 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
67557045 {
6756
- if (conf->level == 6 && !percpu->spare_page)
7046
+ if (conf->level == 6 && !percpu->spare_page) {
67577047 percpu->spare_page = alloc_page(GFP_KERNEL);
6758
- if (!percpu->scribble)
6759
- percpu->scribble = scribble_alloc(max(conf->raid_disks,
6760
- conf->previous_raid_disks),
6761
- max(conf->chunk_sectors,
6762
- conf->prev_chunk_sectors)
6763
- / STRIPE_SECTORS,
6764
- GFP_KERNEL);
7048
+ if (!percpu->spare_page)
7049
+ return -ENOMEM;
7050
+ }
67657051
6766
- if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
7052
+ if (scribble_alloc(percpu,
7053
+ max(conf->raid_disks,
7054
+ conf->previous_raid_disks),
7055
+ max(conf->chunk_sectors,
7056
+ conf->prev_chunk_sectors)
7057
+ / RAID5_STRIPE_SECTORS(conf))) {
67677058 free_scratch_buffer(conf, percpu);
67687059 return -ENOMEM;
67697060 }
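alloc_scratch_buffer() above now sizes the per-CPU scribble region for the worst of the old and new geometry: max(raid_disks, previous_raid_disks) columns by max(chunk_sectors, prev_chunk_sectors) / RAID5_STRIPE_SECTORS(conf) rows. A worked sizing example with assumed array parameters:

#include <stdio.h>

/* Assumed example geometry, not values read from a real array. */
int main(void)
{
        unsigned int raid_disks = 8, previous_raid_disks = 6;
        unsigned int chunk_sectors = 1024, prev_chunk_sectors = 512; /* 512 KiB / 256 KiB chunks */
        unsigned int stripe_sectors = 8;                             /* 4 KiB stripe, 512-byte sectors */

        unsigned int cols = raid_disks > previous_raid_disks ?
                            raid_disks : previous_raid_disks;
        unsigned int rows = (chunk_sectors > prev_chunk_sectors ?
                             chunk_sectors : prev_chunk_sectors) / stripe_sectors;

        printf("scribble region: %u disks x %u stripes per chunk = %u slots\n",
               cols, rows, cols * rows);
        return 0;
}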
....@@ -6829,6 +7120,7 @@
68297120 conf->percpu = alloc_percpu(struct raid5_percpu);
68307121 if (!conf->percpu)
68317122 return -ENOMEM;
7123
+
68327124 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
68337125 if (!err) {
68347126 conf->scribble_disks = max(conf->raid_disks,
....@@ -6879,7 +7171,7 @@
68797171 struct disk_info *disk;
68807172 char pers_name[6];
68817173 int i;
6882
- int group_cnt, worker_cnt_per_group;
7174
+ int group_cnt;
68837175 struct r5worker_group *new_group;
68847176 int ret;
68857177
....@@ -6915,6 +7207,12 @@
69157207 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
69167208 if (conf == NULL)
69177209 goto abort;
7210
+
7211
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7212
+ conf->stripe_size = DEFAULT_STRIPE_SIZE;
7213
+ conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
7214
+ conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
7215
+#endif
69187216 INIT_LIST_HEAD(&conf->free_list);
69197217 INIT_LIST_HEAD(&conf->pending_list);
69207218 conf->pending_data = kcalloc(PENDING_IO_MAX,
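When PAGE_SIZE != DEFAULT_STRIPE_SIZE the conf fields above are seeded from the 4 KiB default, with stripe_shift = ilog2(size) - 9 (the shift from 512-byte sectors to stripe units) and stripe_sectors = size >> 9. A quick check of that arithmetic for a few candidate sizes:

#include <stdio.h>

static unsigned int ilog2u(unsigned long v)
{
        unsigned int r = 0;

        while (v >>= 1)
                r++;
        return r;
}

int main(void)
{
        unsigned long sizes[] = { 4096, 8192, 16384, 65536 };

        for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                unsigned long size = sizes[i];

                printf("stripe_size=%lu  stripe_shift=%u  stripe_sectors=%lu\n",
                       size, ilog2u(size) - 9, size >> 9);
        }
        return 0;
}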
....@@ -6925,15 +7223,14 @@
69257223 for (i = 0; i < PENDING_IO_MAX; i++)
69267224 list_add(&conf->pending_data[i].sibling, &conf->free_list);
69277225 /* Don't enable multi-threading by default*/
6928
- if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
6929
- &new_group)) {
7226
+ if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) {
69307227 conf->group_cnt = group_cnt;
6931
- conf->worker_cnt_per_group = worker_cnt_per_group;
7228
+ conf->worker_cnt_per_group = 0;
69327229 conf->worker_groups = new_group;
69337230 } else
69347231 goto abort;
69357232 spin_lock_init(&conf->device_lock);
6936
- seqcount_init(&conf->gen_lock);
7233
+ seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
69377234 mutex_init(&conf->cache_size_mutex);
69387235 init_waitqueue_head(&conf->wait_for_quiescent);
69397236 init_waitqueue_head(&conf->wait_for_stripe);
....@@ -7067,8 +7364,8 @@
70677364 conf->min_nr_stripes = NR_STRIPES;
70687365 if (mddev->reshape_position != MaxSector) {
70697366 int stripes = max_t(int,
7070
- ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
7071
- ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
7367
+ ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
7368
+ ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
70727369 conf->min_nr_stripes = max(NR_STRIPES, stripes);
70737370 if (conf->min_nr_stripes != NR_STRIPES)
70747371 pr_info("md/raid:%s: force stripe size %d for reshape\n",
....@@ -7141,6 +7438,12 @@
71417438 return 1;
71427439 }
71437440 return 0;
7441
+}
7442
+
7443
+static void raid5_set_io_opt(struct r5conf *conf)
7444
+{
7445
+ blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
7446
+ (conf->raid_disks - conf->max_degraded));
71447447 }
71457448
71467449 static int raid5_run(struct mddev *mddev)
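raid5_set_io_opt() advertises one full data stripe as the optimal I/O size: chunk bytes times the number of data disks (raid_disks - max_degraded). With an assumed 512 KiB chunk on a 6-disk RAID5 that comes to 5 * 512 KiB = 2560 KiB:

#include <stdio.h>

int main(void)
{
        /* Assumed example array, not values read from a real mddev. */
        unsigned int chunk_sectors = 1024;      /* 512 KiB chunk */
        unsigned int raid_disks = 6, max_degraded = 1;

        unsigned long io_opt = (unsigned long)(chunk_sectors << 9) *
                               (raid_disks - max_degraded);

        printf("io_opt = %lu bytes (%lu KiB)\n", io_opt, io_opt >> 10);
        return 0;
}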
....@@ -7427,13 +7730,10 @@
74277730 int data_disks = conf->previous_raid_disks - conf->max_degraded;
74287731 int stripe = data_disks *
74297732 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
7430
- if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
7431
- mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
74327733
74337734 chunk_size = mddev->chunk_sectors << 9;
74347735 blk_queue_io_min(mddev->queue, chunk_size);
7435
- blk_queue_io_opt(mddev->queue, chunk_size *
7436
- (conf->raid_disks - conf->max_degraded));
7736
+ raid5_set_io_opt(conf);
74377737 mddev->queue->limits.raid_partial_stripes_expensive = 1;
74387738 /*
74397739 * We can only discard a whole stripe. It doesn't make sense to
....@@ -7718,6 +8018,7 @@
77188018 */
77198019 if (rdev->saved_raid_disk >= 0 &&
77208020 rdev->saved_raid_disk >= first &&
8021
+ rdev->saved_raid_disk <= last &&
77218022 conf->disks[rdev->saved_raid_disk].rdev == NULL)
77228023 first = rdev->saved_raid_disk;
77238024
....@@ -7799,14 +8100,14 @@
77998100 * stripe_heads first.
78008101 */
78018102 struct r5conf *conf = mddev->private;
7802
- if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
8103
+ if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
78038104 > conf->min_nr_stripes ||
7804
- ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
8105
+ ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
78058106 > conf->min_nr_stripes) {
78068107 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
78078108 mdname(mddev),
78088109 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7809
- / STRIPE_SIZE)*4);
8110
+ / RAID5_STRIPE_SIZE(conf))*4);
78108111 return 0;
78118112 }
78128113 return 1;
....@@ -7942,8 +8243,8 @@
79428243 else
79438244 rdev->recovery_offset = 0;
79448245
7945
- if (sysfs_link_rdev(mddev, rdev))
7946
- /* Failure here is OK */;
8246
+ /* Failure here is OK */
8247
+ sysfs_link_rdev(mddev, rdev);
79478248 }
79488249 } else if (rdev->raid_disk >= conf->previous_raid_disks
79498250 && !test_bit(Faulty, &rdev->flags)) {
....@@ -8017,16 +8318,8 @@
80178318 spin_unlock_irq(&conf->device_lock);
80188319 wake_up(&conf->wait_for_overlap);
80198320
8020
- /* read-ahead size must cover two whole stripes, which is
8021
- * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
8022
- */
8023
- if (conf->mddev->queue) {
8024
- int data_disks = conf->raid_disks - conf->max_degraded;
8025
- int stripe = data_disks * ((conf->chunk_sectors << 9)
8026
- / PAGE_SIZE);
8027
- if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
8028
- conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
8029
- }
8321
+ if (conf->mddev->queue)
8322
+ raid5_set_io_opt(conf);
80308323 }
80318324 }
80328325
....@@ -8138,7 +8431,7 @@
81388431 while (chunksect && (mddev->array_sectors & (chunksect-1)))
81398432 chunksect >>= 1;
81408433
8141
- if ((chunksect<<9) < STRIPE_SIZE)
8434
+ if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
81428435 /* array size does not allow a suitable chunk size */
81438436 return ERR_PTR(-EINVAL);
81448437
....@@ -8425,7 +8718,6 @@
84258718 .finish_reshape = raid5_finish_reshape,
84268719 .quiesce = raid5_quiesce,
84278720 .takeover = raid6_takeover,
8428
- .congested = raid5_congested,
84298721 .change_consistency_policy = raid5_change_consistency_policy,
84308722 };
84318723 static struct md_personality raid5_personality =
....@@ -8450,7 +8742,6 @@
84508742 .finish_reshape = raid5_finish_reshape,
84518743 .quiesce = raid5_quiesce,
84528744 .takeover = raid5_takeover,
8453
- .congested = raid5_congested,
84548745 .change_consistency_policy = raid5_change_consistency_policy,
84558746 };
84568747
....@@ -8476,7 +8767,6 @@
84768767 .finish_reshape = raid5_finish_reshape,
84778768 .quiesce = raid5_quiesce,
84788769 .takeover = raid4_takeover,
8479
- .congested = raid5_congested,
84808770 .change_consistency_policy = raid5_change_consistency_policy,
84818771 };
84828772