forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-05-13 9d77db3c730780c8ef5ccd4b66403ff5675cfe4e
kernel/drivers/md/raid5.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * raid5.c : Multiple Devices driver for Linux
  * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
@@ -7,15 +8,6 @@
  * RAID-4/5/6 management functions.
  * Thanks to Penguin Computing for making the RAID-6 development possible
  * by donating a test server!
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */

 /*
@@ -44,6 +36,7 @@
  */

 #include <linux/blkdev.h>
+#include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/raid/pq.h>
 #include <linux/async_tx.h>
@@ -54,7 +47,6 @@
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/nodemask.h>
-#include <linux/flex_array.h>

 #include <trace/events/block.h>
 #include <linux/list_sort.h>
@@ -78,13 +70,13 @@

 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
 {
-        int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
+        int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
         return &conf->stripe_hashtbl[hash];
 }

-static inline int stripe_hash_locks_hash(sector_t sect)
+static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
 {
-        return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
+        return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
 }

 static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
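Throughout this diff the compile-time STRIPE_SIZE/STRIPE_SHIFT/STRIPE_SECTORS constants are replaced with RAID5_STRIPE_*(conf) macros, so the stripe geometry is read from the r5conf and no longer has to match PAGE_SIZE. The macros themselves are added to raid5.h, which is not part of this file; a minimal sketch of what they are assumed to expand to (a sketch only, the exact definitions are not shown in this diff):

#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
#define RAID5_STRIPE_SIZE(conf)         STRIPE_SIZE
#define RAID5_STRIPE_SHIFT(conf)        STRIPE_SHIFT
#define RAID5_STRIPE_SECTORS(conf)      STRIPE_SECTORS
#else
#define RAID5_STRIPE_SIZE(conf)         ((conf)->stripe_size)
#define RAID5_STRIPE_SHIFT(conf)        ((conf)->stripe_shift)
#define RAID5_STRIPE_SECTORS(conf)      ((conf)->stripe_sectors)
#endif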
@@ -457,13 +449,74 @@
         return sh;
 }

+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
+static void free_stripe_pages(struct stripe_head *sh)
+{
+        int i;
+        struct page *p;
+
+        /* Have not allocate page pool */
+        if (!sh->pages)
+                return;
+
+        for (i = 0; i < sh->nr_pages; i++) {
+                p = sh->pages[i];
+                if (p)
+                        put_page(p);
+                sh->pages[i] = NULL;
+        }
+}
+
+static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
+{
+        int i;
+        struct page *p;
+
+        for (i = 0; i < sh->nr_pages; i++) {
+                /* The page have allocated. */
+                if (sh->pages[i])
+                        continue;
+
+                p = alloc_page(gfp);
+                if (!p) {
+                        free_stripe_pages(sh);
+                        return -ENOMEM;
+                }
+                sh->pages[i] = p;
+        }
+        return 0;
+}
+
+static int
+init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
+{
+        int nr_pages, cnt;
+
+        if (sh->pages)
+                return 0;
+
+        /* Each of the sh->dev[i] need one conf->stripe_size */
+        cnt = PAGE_SIZE / conf->stripe_size;
+        nr_pages = (disks + cnt - 1) / cnt;
+
+        sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+        if (!sh->pages)
+                return -ENOMEM;
+        sh->nr_pages = nr_pages;
+        sh->stripes_per_page = cnt;
+        return 0;
+}
+#endif
+
 static void shrink_buffers(struct stripe_head *sh)
 {
-        struct page *p;
         int i;
         int num = sh->raid_conf->pool_size;

+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
         for (i = 0; i < num ; i++) {
+                struct page *p;
+
                 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
                 p = sh->dev[i].page;
                 if (!p)
@@ -471,6 +524,11 @@
                 sh->dev[i].page = NULL;
                 put_page(p);
         }
+#else
+        for (i = 0; i < num; i++)
+                sh->dev[i].page = NULL;
+        free_stripe_pages(sh); /* Free pages */
+#endif
 }

 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -478,6 +536,7 @@
         int i;
         int num = sh->raid_conf->pool_size;

+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
         for (i = 0; i < num; i++) {
                 struct page *page;

@@ -486,8 +545,18 @@
                 }
                 sh->dev[i].page = page;
                 sh->dev[i].orig_page = page;
+                sh->dev[i].offset = 0;
         }
+#else
+        if (alloc_stripe_pages(sh, gfp))
+                return -ENOMEM;

+        for (i = 0; i < num; i++) {
+                sh->dev[i].page = raid5_get_dev_page(sh, i);
+                sh->dev[i].orig_page = sh->dev[i].page;
+                sh->dev[i].offset = raid5_get_page_offset(sh, i);
+        }
+#endif
         return 0;
 }

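When PAGE_SIZE is larger than the configured stripe size, grow_buffers() no longer gives every r5dev its own page: several stripe-sized slices share one entry of sh->pages[], and each r5dev remembers its slice in dev->offset. A minimal sketch of the two lookup helpers used above, assuming the raid5.h side of this series (they are not shown in this file):

static inline struct page *raid5_get_dev_page(struct stripe_head *sh, int disk_idx)
{
        /* which shared page holds this device's stripe-sized slice */
        return sh->pages[disk_idx / sh->stripes_per_page];
}

static inline int raid5_get_page_offset(struct stripe_head *sh, int disk_idx)
{
        /* byte offset of that slice inside the shared page */
        return (disk_idx % sh->stripes_per_page) * RAID5_STRIPE_SIZE(sh->raid_conf);
}

With a 64 KiB PAGE_SIZE and a 4 KiB stripe_size, stripes_per_page is 16, so dev[17] maps to pages[1] at offset 4096.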
@@ -618,17 +687,17 @@
         return degraded;
 }

-static int has_failed(struct r5conf *conf)
+static bool has_failed(struct r5conf *conf)
 {
-        int degraded;
+        int degraded = conf->mddev->degraded;

-        if (conf->mddev->reshape_position == MaxSector)
-                return conf->mddev->degraded > conf->max_degraded;
+        if (test_bit(MD_BROKEN, &conf->mddev->flags))
+                return true;

-        degraded = raid5_calc_degraded(conf);
-        if (degraded > conf->max_degraded)
-                return 1;
-        return 0;
+        if (conf->mddev->reshape_position != MaxSector)
+                degraded = raid5_calc_degraded(conf);
+
+        return degraded > conf->max_degraded;
 }

 struct stripe_head *
@@ -636,7 +705,7 @@
                         int previous, int noblock, int noquiesce)
 {
         struct stripe_head *sh;
-        int hash = stripe_hash_locks_hash(sector);
+        int hash = stripe_hash_locks_hash(conf, sector);
         int inc_empty_inactive_list_flag;

         pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
....@@ -712,6 +781,8 @@
712781 }
713782
714783 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
784
+ __acquires(&sh1->stripe_lock)
785
+ __acquires(&sh2->stripe_lock)
715786 {
716787 if (sh1 > sh2) {
717788 spin_lock_irq(&sh2->stripe_lock);
....@@ -723,6 +794,8 @@
723794 }
724795
725796 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
797
+ __releases(&sh1->stripe_lock)
798
+ __releases(&sh2->stripe_lock)
726799 {
727800 spin_unlock(&sh1->stripe_lock);
728801 spin_unlock_irq(&sh2->stripe_lock);
....@@ -753,9 +826,9 @@
753826 tmp_sec = sh->sector;
754827 if (!sector_div(tmp_sec, conf->chunk_sectors))
755828 return;
756
- head_sector = sh->sector - STRIPE_SECTORS;
829
+ head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
757830
758
- hash = stripe_hash_locks_hash(head_sector);
831
+ hash = stripe_hash_locks_hash(conf, head_sector);
759832 spin_lock_irq(conf->hash_locks + hash);
760833 head = __find_stripe(conf, head_sector, conf->generation);
761834 if (head && !atomic_inc_not_zero(&head->count)) {
....@@ -878,7 +951,7 @@
878951 struct bio *bio;
879952
880953 while ((bio = bio_list_pop(tmp)))
881
- generic_make_request(bio);
954
+ submit_bio_noacct(bio);
882955 }
883956
884957 static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
....@@ -1062,7 +1135,7 @@
10621135 test_bit(WriteErrorSeen, &rdev->flags)) {
10631136 sector_t first_bad;
10641137 int bad_sectors;
1065
- int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
1138
+ int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
10661139 &first_bad, &bad_sectors);
10671140 if (!bad)
10681141 break;
....@@ -1094,7 +1167,7 @@
10941167 if (rdev) {
10951168 if (s->syncing || s->expanding || s->expanded
10961169 || s->replacing)
1097
- md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1170
+ md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
10981171
10991172 set_bit(STRIPE_IO_STARTED, &sh->state);
11001173
....@@ -1134,12 +1207,12 @@
11341207 else
11351208 sh->dev[i].vec.bv_page = sh->dev[i].page;
11361209 bi->bi_vcnt = 1;
1137
- bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1138
- bi->bi_io_vec[0].bv_offset = 0;
1139
- bi->bi_iter.bi_size = STRIPE_SIZE;
1210
+ bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1211
+ bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1212
+ bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
11401213 bi->bi_write_hint = sh->dev[i].write_hint;
11411214 if (!rrdev)
1142
- sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
1215
+ sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
11431216 /*
11441217 * If this is discard request, set bi_vcnt 0. We don't
11451218 * want to confuse SCSI because SCSI will replace payload
....@@ -1156,12 +1229,12 @@
11561229 if (should_defer && op_is_write(op))
11571230 bio_list_add(&pending_bios, bi);
11581231 else
1159
- generic_make_request(bi);
1232
+ submit_bio_noacct(bi);
11601233 }
11611234 if (rrdev) {
11621235 if (s->syncing || s->expanding || s->expanded
11631236 || s->replacing)
1164
- md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
1237
+ md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
11651238
11661239 set_bit(STRIPE_IO_STARTED, &sh->state);
11671240
....@@ -1188,11 +1261,11 @@
11881261 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
11891262 sh->dev[i].rvec.bv_page = sh->dev[i].page;
11901263 rbi->bi_vcnt = 1;
1191
- rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1192
- rbi->bi_io_vec[0].bv_offset = 0;
1193
- rbi->bi_iter.bi_size = STRIPE_SIZE;
1264
+ rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1265
+ rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1266
+ rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
11941267 rbi->bi_write_hint = sh->dev[i].write_hint;
1195
- sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
1268
+ sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
11961269 /*
11971270 * If this is discard request, set bi_vcnt 0. We don't
11981271 * want to confuse SCSI because SCSI will replace payload
....@@ -1206,7 +1279,7 @@
12061279 if (should_defer && op_is_write(op))
12071280 bio_list_add(&pending_bios, rbi);
12081281 else
1209
- generic_make_request(rbi);
1282
+ submit_bio_noacct(rbi);
12101283 }
12111284 if (!rdev && !rrdev) {
12121285 if (op_is_write(op))
....@@ -1231,7 +1304,7 @@
12311304
12321305 static struct dma_async_tx_descriptor *
12331306 async_copy_data(int frombio, struct bio *bio, struct page **page,
1234
- sector_t sector, struct dma_async_tx_descriptor *tx,
1307
+ unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
12351308 struct stripe_head *sh, int no_skipcopy)
12361309 {
12371310 struct bio_vec bvl;
....@@ -1240,6 +1313,7 @@
12401313 int page_offset;
12411314 struct async_submit_ctl submit;
12421315 enum async_tx_flags flags = 0;
1316
+ struct r5conf *conf = sh->raid_conf;
12431317
12441318 if (bio->bi_iter.bi_sector >= sector)
12451319 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
....@@ -1261,8 +1335,8 @@
12611335 len -= b_offset;
12621336 }
12631337
1264
- if (len > 0 && page_offset + len > STRIPE_SIZE)
1265
- clen = STRIPE_SIZE - page_offset;
1338
+ if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
1339
+ clen = RAID5_STRIPE_SIZE(conf) - page_offset;
12661340 else
12671341 clen = len;
12681342
....@@ -1270,17 +1344,17 @@
12701344 b_offset += bvl.bv_offset;
12711345 bio_page = bvl.bv_page;
12721346 if (frombio) {
1273
- if (sh->raid_conf->skip_copy &&
1347
+ if (conf->skip_copy &&
12741348 b_offset == 0 && page_offset == 0 &&
1275
- clen == STRIPE_SIZE &&
1349
+ clen == RAID5_STRIPE_SIZE(conf) &&
12761350 !no_skipcopy)
12771351 *page = bio_page;
12781352 else
1279
- tx = async_memcpy(*page, bio_page, page_offset,
1353
+ tx = async_memcpy(*page, bio_page, page_offset + poff,
12801354 b_offset, clen, &submit);
12811355 } else
12821356 tx = async_memcpy(bio_page, *page, b_offset,
1283
- page_offset, clen, &submit);
1357
+ page_offset + poff, clen, &submit);
12841358 }
12851359 /* chain the operations */
12861360 submit.depend_tx = tx;
....@@ -1297,6 +1371,7 @@
12971371 {
12981372 struct stripe_head *sh = stripe_head_ref;
12991373 int i;
1374
+ struct r5conf *conf = sh->raid_conf;
13001375
13011376 pr_debug("%s: stripe %llu\n", __func__,
13021377 (unsigned long long)sh->sector);
....@@ -1317,8 +1392,8 @@
13171392 rbi = dev->read;
13181393 dev->read = NULL;
13191394 while (rbi && rbi->bi_iter.bi_sector <
1320
- dev->sector + STRIPE_SECTORS) {
1321
- rbi2 = r5_next_bio(rbi, dev->sector);
1395
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1396
+ rbi2 = r5_next_bio(conf, rbi, dev->sector);
13221397 bio_endio(rbi);
13231398 rbi = rbi2;
13241399 }
....@@ -1335,6 +1410,7 @@
13351410 struct dma_async_tx_descriptor *tx = NULL;
13361411 struct async_submit_ctl submit;
13371412 int i;
1413
+ struct r5conf *conf = sh->raid_conf;
13381414
13391415 BUG_ON(sh->batch_head);
13401416 pr_debug("%s: stripe %llu\n", __func__,
....@@ -1349,10 +1425,11 @@
13491425 dev->toread = NULL;
13501426 spin_unlock_irq(&sh->stripe_lock);
13511427 while (rbi && rbi->bi_iter.bi_sector <
1352
- dev->sector + STRIPE_SECTORS) {
1428
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
13531429 tx = async_copy_data(0, rbi, &dev->page,
1430
+ dev->offset,
13541431 dev->sector, tx, sh, 0);
1355
- rbi = r5_next_bio(rbi, dev->sector);
1432
+ rbi = r5_next_bio(conf, rbi, dev->sector);
13561433 }
13571434 }
13581435 }
@@ -1394,22 +1471,25 @@
 }

 /* return a pointer to the address conversion region of the scribble buffer */
-static addr_conv_t *to_addr_conv(struct stripe_head *sh,
-                                 struct raid5_percpu *percpu, int i)
+static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
 {
-        void *addr;
-
-        addr = flex_array_get(percpu->scribble, i);
-        return addr + sizeof(struct page *) * (sh->disks + 2);
+        return percpu->scribble + i * percpu->scribble_obj_size;
 }

 /* return a pointer to the address conversion region of the scribble buffer */
-static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
+static addr_conv_t *to_addr_conv(struct stripe_head *sh,
+                                 struct raid5_percpu *percpu, int i)
 {
-        void *addr;
+        return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
+}

-        addr = flex_array_get(percpu->scribble, i);
-        return addr;
+/*
+ * Return a pointer to record offset address.
+ */
+static unsigned int *
+to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
+{
+        return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2);
 }

 static struct dma_async_tx_descriptor *
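The three accessors above now index one flat per-CPU scribble object instead of a flex_array element; each region simply starts where the previous one ends. Roughly, one object looks like this (region sizes match the scribble_alloc() rewrite later in this diff):

/*
 * percpu->scribble + i * percpu->scribble_obj_size:
 *
 *      struct page *srcs[disks + 2];   <- to_addr_page(percpu, i)
 *      addr_conv_t  conv[disks + 2];   <- to_addr_conv(sh, percpu, i)
 *      unsigned int offs[disks + 2];   <- to_addr_offs(sh, percpu)
 *                                         (always reads object 0)
 */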
....@@ -1417,9 +1497,11 @@
14171497 {
14181498 int disks = sh->disks;
14191499 struct page **xor_srcs = to_addr_page(percpu, 0);
1500
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
14201501 int target = sh->ops.target;
14211502 struct r5dev *tgt = &sh->dev[target];
14221503 struct page *xor_dest = tgt->page;
1504
+ unsigned int off_dest = tgt->offset;
14231505 int count = 0;
14241506 struct dma_async_tx_descriptor *tx;
14251507 struct async_submit_ctl submit;
....@@ -1431,24 +1513,30 @@
14311513 __func__, (unsigned long long)sh->sector, target);
14321514 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
14331515
1434
- for (i = disks; i--; )
1435
- if (i != target)
1516
+ for (i = disks; i--; ) {
1517
+ if (i != target) {
1518
+ off_srcs[count] = sh->dev[i].offset;
14361519 xor_srcs[count++] = sh->dev[i].page;
1520
+ }
1521
+ }
14371522
14381523 atomic_inc(&sh->count);
14391524
14401525 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
14411526 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
14421527 if (unlikely(count == 1))
1443
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1528
+ tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
1529
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
14441530 else
1445
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1531
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1532
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
14461533
14471534 return tx;
14481535 }
14491536
14501537 /* set_syndrome_sources - populate source buffers for gen_syndrome
14511538 * @srcs - (struct page *) array of size sh->disks
1539
+ * @offs - (unsigned int) array of offset for each page
14521540 * @sh - stripe_head to parse
14531541 *
14541542 * Populates srcs in proper layout order for the stripe and returns the
....@@ -1457,6 +1545,7 @@
14571545 * is recorded in srcs[count+1]].
14581546 */
14591547 static int set_syndrome_sources(struct page **srcs,
1548
+ unsigned int *offs,
14601549 struct stripe_head *sh,
14611550 int srctype)
14621551 {
....@@ -1487,6 +1576,12 @@
14871576 srcs[slot] = sh->dev[i].orig_page;
14881577 else
14891578 srcs[slot] = sh->dev[i].page;
1579
+ /*
1580
+ * For R5_InJournal, PAGE_SIZE must be 4KB and will
1581
+ * not shared page. In that case, dev[i].offset
1582
+ * is 0.
1583
+ */
1584
+ offs[slot] = sh->dev[i].offset;
14901585 }
14911586 i = raid6_next_disk(i, disks);
14921587 } while (i != d0_idx);
....@@ -1499,12 +1594,14 @@
14991594 {
15001595 int disks = sh->disks;
15011596 struct page **blocks = to_addr_page(percpu, 0);
1597
+ unsigned int *offs = to_addr_offs(sh, percpu);
15021598 int target;
15031599 int qd_idx = sh->qd_idx;
15041600 struct dma_async_tx_descriptor *tx;
15051601 struct async_submit_ctl submit;
15061602 struct r5dev *tgt;
15071603 struct page *dest;
1604
+ unsigned int dest_off;
15081605 int i;
15091606 int count;
15101607
....@@ -1523,30 +1620,34 @@
15231620 tgt = &sh->dev[target];
15241621 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
15251622 dest = tgt->page;
1623
+ dest_off = tgt->offset;
15261624
15271625 atomic_inc(&sh->count);
15281626
15291627 if (target == qd_idx) {
1530
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1628
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
15311629 blocks[count] = NULL; /* regenerating p is not necessary */
15321630 BUG_ON(blocks[count+1] != dest); /* q should already be set */
15331631 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
15341632 ops_complete_compute, sh,
15351633 to_addr_conv(sh, percpu, 0));
1536
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1634
+ tx = async_gen_syndrome(blocks, offs, count+2,
1635
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
15371636 } else {
15381637 /* Compute any data- or p-drive using XOR */
15391638 count = 0;
15401639 for (i = disks; i-- ; ) {
15411640 if (i == target || i == qd_idx)
15421641 continue;
1642
+ offs[count] = sh->dev[i].offset;
15431643 blocks[count++] = sh->dev[i].page;
15441644 }
15451645
15461646 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
15471647 NULL, ops_complete_compute, sh,
15481648 to_addr_conv(sh, percpu, 0));
1549
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1649
+ tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1650
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
15501651 }
15511652
15521653 return tx;
....@@ -1565,6 +1666,7 @@
15651666 struct r5dev *tgt2 = &sh->dev[target2];
15661667 struct dma_async_tx_descriptor *tx;
15671668 struct page **blocks = to_addr_page(percpu, 0);
1669
+ unsigned int *offs = to_addr_offs(sh, percpu);
15681670 struct async_submit_ctl submit;
15691671
15701672 BUG_ON(sh->batch_head);
....@@ -1577,13 +1679,16 @@
15771679 /* we need to open-code set_syndrome_sources to handle the
15781680 * slot number conversion for 'faila' and 'failb'
15791681 */
1580
- for (i = 0; i < disks ; i++)
1682
+ for (i = 0; i < disks ; i++) {
1683
+ offs[i] = 0;
15811684 blocks[i] = NULL;
1685
+ }
15821686 count = 0;
15831687 i = d0_idx;
15841688 do {
15851689 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
15861690
1691
+ offs[slot] = sh->dev[i].offset;
15871692 blocks[slot] = sh->dev[i].page;
15881693
15891694 if (i == target)
....@@ -1608,10 +1713,12 @@
16081713 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
16091714 ops_complete_compute, sh,
16101715 to_addr_conv(sh, percpu, 0));
1611
- return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1612
- STRIPE_SIZE, &submit);
1716
+ return async_gen_syndrome(blocks, offs, syndrome_disks+2,
1717
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1718
+ &submit);
16131719 } else {
16141720 struct page *dest;
1721
+ unsigned int dest_off;
16151722 int data_target;
16161723 int qd_idx = sh->qd_idx;
16171724
....@@ -1625,22 +1732,26 @@
16251732 for (i = disks; i-- ; ) {
16261733 if (i == data_target || i == qd_idx)
16271734 continue;
1735
+ offs[count] = sh->dev[i].offset;
16281736 blocks[count++] = sh->dev[i].page;
16291737 }
16301738 dest = sh->dev[data_target].page;
1739
+ dest_off = sh->dev[data_target].offset;
16311740 init_async_submit(&submit,
16321741 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
16331742 NULL, NULL, NULL,
16341743 to_addr_conv(sh, percpu, 0));
1635
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1744
+ tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1745
+ RAID5_STRIPE_SIZE(sh->raid_conf),
16361746 &submit);
16371747
1638
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1748
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
16391749 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
16401750 ops_complete_compute, sh,
16411751 to_addr_conv(sh, percpu, 0));
1642
- return async_gen_syndrome(blocks, 0, count+2,
1643
- STRIPE_SIZE, &submit);
1752
+ return async_gen_syndrome(blocks, offs, count+2,
1753
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1754
+ &submit);
16441755 }
16451756 } else {
16461757 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
....@@ -1649,13 +1760,15 @@
16491760 if (failb == syndrome_disks) {
16501761 /* We're missing D+P. */
16511762 return async_raid6_datap_recov(syndrome_disks+2,
1652
- STRIPE_SIZE, faila,
1653
- blocks, &submit);
1763
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1764
+ faila,
1765
+ blocks, offs, &submit);
16541766 } else {
16551767 /* We're missing D+D. */
16561768 return async_raid6_2data_recov(syndrome_disks+2,
1657
- STRIPE_SIZE, faila, failb,
1658
- blocks, &submit);
1769
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1770
+ faila, failb,
1771
+ blocks, offs, &submit);
16591772 }
16601773 }
16611774 }
....@@ -1681,10 +1794,12 @@
16811794 {
16821795 int disks = sh->disks;
16831796 struct page **xor_srcs = to_addr_page(percpu, 0);
1797
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
16841798 int count = 0, pd_idx = sh->pd_idx, i;
16851799 struct async_submit_ctl submit;
16861800
16871801 /* existing parity data subtracted */
1802
+ unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
16881803 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
16891804
16901805 BUG_ON(sh->batch_head);
....@@ -1694,15 +1809,23 @@
16941809 for (i = disks; i--; ) {
16951810 struct r5dev *dev = &sh->dev[i];
16961811 /* Only process blocks that are known to be uptodate */
1697
- if (test_bit(R5_InJournal, &dev->flags))
1812
+ if (test_bit(R5_InJournal, &dev->flags)) {
1813
+ /*
1814
+ * For this case, PAGE_SIZE must be equal to 4KB and
1815
+ * page offset is zero.
1816
+ */
1817
+ off_srcs[count] = dev->offset;
16981818 xor_srcs[count++] = dev->orig_page;
1699
- else if (test_bit(R5_Wantdrain, &dev->flags))
1819
+ } else if (test_bit(R5_Wantdrain, &dev->flags)) {
1820
+ off_srcs[count] = dev->offset;
17001821 xor_srcs[count++] = dev->page;
1822
+ }
17011823 }
17021824
17031825 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
17041826 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1705
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1827
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1828
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
17061829
17071830 return tx;
17081831 }
....@@ -1712,17 +1835,19 @@
17121835 struct dma_async_tx_descriptor *tx)
17131836 {
17141837 struct page **blocks = to_addr_page(percpu, 0);
1838
+ unsigned int *offs = to_addr_offs(sh, percpu);
17151839 int count;
17161840 struct async_submit_ctl submit;
17171841
17181842 pr_debug("%s: stripe %llu\n", __func__,
17191843 (unsigned long long)sh->sector);
17201844
1721
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
1845
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN);
17221846
17231847 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
17241848 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1725
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1849
+ tx = async_gen_syndrome(blocks, offs, count+2,
1850
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
17261851
17271852 return tx;
17281853 }
....@@ -1763,7 +1888,7 @@
17631888 WARN_ON(dev->page != dev->orig_page);
17641889
17651890 while (wbi && wbi->bi_iter.bi_sector <
1766
- dev->sector + STRIPE_SECTORS) {
1891
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
17671892 if (wbi->bi_opf & REQ_FUA)
17681893 set_bit(R5_WantFUA, &dev->flags);
17691894 if (wbi->bi_opf & REQ_SYNC)
....@@ -1772,6 +1897,7 @@
17721897 set_bit(R5_Discard, &dev->flags);
17731898 else {
17741899 tx = async_copy_data(1, wbi, &dev->page,
1900
+ dev->offset,
17751901 dev->sector, tx, sh,
17761902 r5c_is_writeback(conf->log));
17771903 if (dev->page != dev->orig_page &&
....@@ -1781,7 +1907,7 @@
17811907 clear_bit(R5_OVERWRITE, &dev->flags);
17821908 }
17831909 }
1784
- wbi = r5_next_bio(wbi, dev->sector);
1910
+ wbi = r5_next_bio(conf, wbi, dev->sector);
17851911 }
17861912
17871913 if (head_sh->batch_head) {
....@@ -1851,9 +1977,11 @@
18511977 {
18521978 int disks = sh->disks;
18531979 struct page **xor_srcs;
1980
+ unsigned int *off_srcs;
18541981 struct async_submit_ctl submit;
18551982 int count, pd_idx = sh->pd_idx, i;
18561983 struct page *xor_dest;
1984
+ unsigned int off_dest;
18571985 int prexor = 0;
18581986 unsigned long flags;
18591987 int j = 0;
....@@ -1878,24 +2006,31 @@
18782006 again:
18792007 count = 0;
18802008 xor_srcs = to_addr_page(percpu, j);
2009
+ off_srcs = to_addr_offs(sh, percpu);
18812010 /* check if prexor is active which means only process blocks
18822011 * that are part of a read-modify-write (written)
18832012 */
18842013 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
18852014 prexor = 1;
2015
+ off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
18862016 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
18872017 for (i = disks; i--; ) {
18882018 struct r5dev *dev = &sh->dev[i];
18892019 if (head_sh->dev[i].written ||
1890
- test_bit(R5_InJournal, &head_sh->dev[i].flags))
2020
+ test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
2021
+ off_srcs[count] = dev->offset;
18912022 xor_srcs[count++] = dev->page;
2023
+ }
18922024 }
18932025 } else {
18942026 xor_dest = sh->dev[pd_idx].page;
2027
+ off_dest = sh->dev[pd_idx].offset;
18952028 for (i = disks; i--; ) {
18962029 struct r5dev *dev = &sh->dev[i];
1897
- if (i != pd_idx)
2030
+ if (i != pd_idx) {
2031
+ off_srcs[count] = dev->offset;
18982032 xor_srcs[count++] = dev->page;
2033
+ }
18992034 }
19002035 }
19012036
....@@ -1921,9 +2056,11 @@
19212056 }
19222057
19232058 if (unlikely(count == 1))
1924
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
2059
+ tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
2060
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
19252061 else
1926
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
2062
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2063
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
19272064 if (!last_stripe) {
19282065 j++;
19292066 sh = list_first_entry(&sh->batch_list, struct stripe_head,
....@@ -1938,6 +2075,7 @@
19382075 {
19392076 struct async_submit_ctl submit;
19402077 struct page **blocks;
2078
+ unsigned int *offs;
19412079 int count, i, j = 0;
19422080 struct stripe_head *head_sh = sh;
19432081 int last_stripe;
....@@ -1962,6 +2100,7 @@
19622100
19632101 again:
19642102 blocks = to_addr_page(percpu, j);
2103
+ offs = to_addr_offs(sh, percpu);
19652104
19662105 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
19672106 synflags = SYNDROME_SRC_WRITTEN;
....@@ -1971,7 +2110,7 @@
19712110 txflags = ASYNC_TX_ACK;
19722111 }
19732112
1974
- count = set_syndrome_sources(blocks, sh, synflags);
2113
+ count = set_syndrome_sources(blocks, offs, sh, synflags);
19752114 last_stripe = !head_sh->batch_head ||
19762115 list_first_entry(&sh->batch_list,
19772116 struct stripe_head, batch_list) == head_sh;
....@@ -1983,7 +2122,8 @@
19832122 } else
19842123 init_async_submit(&submit, 0, tx, NULL, NULL,
19852124 to_addr_conv(sh, percpu, j));
1986
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
2125
+ tx = async_gen_syndrome(blocks, offs, count+2,
2126
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
19872127 if (!last_stripe) {
19882128 j++;
19892129 sh = list_first_entry(&sh->batch_list, struct stripe_head,
....@@ -2010,7 +2150,9 @@
20102150 int pd_idx = sh->pd_idx;
20112151 int qd_idx = sh->qd_idx;
20122152 struct page *xor_dest;
2153
+ unsigned int off_dest;
20132154 struct page **xor_srcs = to_addr_page(percpu, 0);
2155
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
20142156 struct dma_async_tx_descriptor *tx;
20152157 struct async_submit_ctl submit;
20162158 int count;
....@@ -2022,16 +2164,20 @@
20222164 BUG_ON(sh->batch_head);
20232165 count = 0;
20242166 xor_dest = sh->dev[pd_idx].page;
2167
+ off_dest = sh->dev[pd_idx].offset;
2168
+ off_srcs[count] = off_dest;
20252169 xor_srcs[count++] = xor_dest;
20262170 for (i = disks; i--; ) {
20272171 if (i == pd_idx || i == qd_idx)
20282172 continue;
2173
+ off_srcs[count] = sh->dev[i].offset;
20292174 xor_srcs[count++] = sh->dev[i].page;
20302175 }
20312176
20322177 init_async_submit(&submit, 0, NULL, NULL, NULL,
20332178 to_addr_conv(sh, percpu, 0));
2034
- tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
2179
+ tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2180
+ RAID5_STRIPE_SIZE(sh->raid_conf),
20352181 &sh->ops.zero_sum_result, &submit);
20362182
20372183 atomic_inc(&sh->count);
....@@ -2042,6 +2188,7 @@
20422188 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
20432189 {
20442190 struct page **srcs = to_addr_page(percpu, 0);
2191
+ unsigned int *offs = to_addr_offs(sh, percpu);
20452192 struct async_submit_ctl submit;
20462193 int count;
20472194
....@@ -2049,15 +2196,16 @@
20492196 (unsigned long long)sh->sector, checkp);
20502197
20512198 BUG_ON(sh->batch_head);
2052
- count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
2199
+ count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL);
20532200 if (!checkp)
20542201 srcs[count] = NULL;
20552202
20562203 atomic_inc(&sh->count);
20572204 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
20582205 sh, to_addr_conv(sh, percpu, 0));
2059
- async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
2060
- &sh->ops.zero_sum_result, percpu->spare_page, &submit);
2206
+ async_syndrome_val(srcs, offs, count+2,
2207
+ RAID5_STRIPE_SIZE(sh->raid_conf),
2208
+ &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit);
20612209 }
20622210
20632211 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
....@@ -2134,6 +2282,9 @@
21342282
21352283 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
21362284 {
2285
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2286
+ kfree(sh->pages);
2287
+#endif
21372288 if (sh->ppl_page)
21382289 __free_page(sh->ppl_page);
21392290 kmem_cache_free(sc, sh);
....@@ -2167,9 +2318,15 @@
21672318 sh->ppl_page = alloc_page(gfp);
21682319 if (!sh->ppl_page) {
21692320 free_stripe(sc, sh);
2170
- sh = NULL;
2321
+ return NULL;
21712322 }
21722323 }
2324
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2325
+ if (init_stripe_shared_pages(sh, conf, disks)) {
2326
+ free_stripe(sc, sh);
2327
+ return NULL;
2328
+ }
2329
+#endif
21732330 }
21742331 return sh;
21752332 }
@@ -2226,10 +2383,13 @@
 }

 /**
- * scribble_len - return the required size of the scribble region
- * @num - total number of disks in the array
+ * scribble_alloc - allocate percpu scribble buffer for required size
+ * of the scribble region
+ * @percpu: from for_each_present_cpu() of the caller
+ * @num: total number of disks in the array
+ * @cnt: scribble objs count for required size of the scribble region
  *
- * The size must be enough to contain:
+ * The scribble buffer size must be enough to contain:
  * 1/ a struct page pointer for each device in the array +2
  * 2/ room to convert each entry in (1) to its corresponding dma
  * (dma_map_page()) or page (page_address()) address.
@@ -2238,21 +2398,29 @@
  * calculate over all devices (not just the data blocks), using zeros in place
  * of the P and Q blocks.
  */
-static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
+static int scribble_alloc(struct raid5_percpu *percpu,
+                          int num, int cnt)
 {
-        struct flex_array *ret;
-        size_t len;
+        size_t obj_size =
+                sizeof(struct page *) * (num + 2) +
+                sizeof(addr_conv_t) * (num + 2) +
+                sizeof(unsigned int) * (num + 2);
+        void *scribble;

-        len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
-        ret = flex_array_alloc(len, cnt, flags);
-        if (!ret)
-                return NULL;
-        /* always prealloc all elements, so no locking is required */
-        if (flex_array_prealloc(ret, 0, cnt, flags)) {
-                flex_array_free(ret);
-                return NULL;
-        }
-        return ret;
+        /*
+         * If here is in raid array suspend context, it is in memalloc noio
+         * context as well, there is no potential recursive memory reclaim
+         * I/Os with the GFP_KERNEL flag.
+         */
+        scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
+        if (!scribble)
+                return -ENOMEM;
+
+        kvfree(percpu->scribble);
+
+        percpu->scribble = scribble;
+        percpu->scribble_obj_size = obj_size;
+        return 0;
 }

 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
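scribble_alloc() now reserves the new per-device offset region next to the page pointers and the address-conversion slots, each num + 2 entries (the + 2 covers the P and Q blocks). A rough sizing sketch, assuming a 64-bit build where addr_conv_t is 8 bytes:

/* one scribble object for a hypothetical 10-disk array */
size_t obj_size = sizeof(struct page *) * 12    /*  96 bytes: source/dest pages  */
                + sizeof(addr_conv_t)  * 12     /*  96 bytes: dma/page addresses */
                + sizeof(unsigned int) * 12;    /*  48 bytes: page offsets       */
/* => 240 bytes per object; resize_chunks() below asks for
 * cnt = new_sectors / RAID5_STRIPE_SECTORS(conf) objects per CPU. */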
....@@ -2270,23 +2438,17 @@
22702438 return 0;
22712439 mddev_suspend(conf->mddev);
22722440 get_online_cpus();
2441
+
22732442 for_each_present_cpu(cpu) {
22742443 struct raid5_percpu *percpu;
2275
- struct flex_array *scribble;
22762444
22772445 percpu = per_cpu_ptr(conf->percpu, cpu);
2278
- scribble = scribble_alloc(new_disks,
2279
- new_sectors / STRIPE_SECTORS,
2280
- GFP_NOIO);
2281
-
2282
- if (scribble) {
2283
- flex_array_free(percpu->scribble);
2284
- percpu->scribble = scribble;
2285
- } else {
2286
- err = -ENOMEM;
2446
+ err = scribble_alloc(percpu, new_disks,
2447
+ new_sectors / RAID5_STRIPE_SECTORS(conf));
2448
+ if (err)
22872449 break;
2288
- }
22892450 }
2451
+
22902452 put_online_cpus();
22912453 mddev_resume(conf->mddev);
22922454 if (!err) {
....@@ -2374,9 +2536,16 @@
23742536 osh = get_free_stripe(conf, hash);
23752537 unlock_device_hash_lock(conf, hash);
23762538
2539
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2540
+ for (i = 0; i < osh->nr_pages; i++) {
2541
+ nsh->pages[i] = osh->pages[i];
2542
+ osh->pages[i] = NULL;
2543
+ }
2544
+#endif
23772545 for(i=0; i<conf->pool_size; i++) {
23782546 nsh->dev[i].page = osh->dev[i].page;
23792547 nsh->dev[i].orig_page = osh->dev[i].page;
2548
+ nsh->dev[i].offset = osh->dev[i].offset;
23802549 }
23812550 nsh->hash_lock_index = hash;
23822551 free_stripe(conf->slab_cache, osh);
....@@ -2425,14 +2594,33 @@
24252594 nsh = list_entry(newstripes.next, struct stripe_head, lru);
24262595 list_del_init(&nsh->lru);
24272596
2597
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2598
+ for (i = 0; i < nsh->nr_pages; i++) {
2599
+ if (nsh->pages[i])
2600
+ continue;
2601
+ nsh->pages[i] = alloc_page(GFP_NOIO);
2602
+ if (!nsh->pages[i])
2603
+ err = -ENOMEM;
2604
+ }
2605
+
2606
+ for (i = conf->raid_disks; i < newsize; i++) {
2607
+ if (nsh->dev[i].page)
2608
+ continue;
2609
+ nsh->dev[i].page = raid5_get_dev_page(nsh, i);
2610
+ nsh->dev[i].orig_page = nsh->dev[i].page;
2611
+ nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
2612
+ }
2613
+#else
24282614 for (i=conf->raid_disks; i < newsize; i++)
24292615 if (nsh->dev[i].page == NULL) {
24302616 struct page *p = alloc_page(GFP_NOIO);
24312617 nsh->dev[i].page = p;
24322618 nsh->dev[i].orig_page = p;
2619
+ nsh->dev[i].offset = 0;
24332620 if (!p)
24342621 err = -ENOMEM;
24352622 }
2623
+#endif
24362624 raid5_release_stripe(nsh);
24372625 }
24382626 /* critical section pass, GFP_NOIO no longer needed */
....@@ -2516,10 +2704,10 @@
25162704 */
25172705 pr_info_ratelimited(
25182706 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2519
- mdname(conf->mddev), STRIPE_SECTORS,
2707
+ mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
25202708 (unsigned long long)s,
25212709 bdevname(rdev->bdev, b));
2522
- atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2710
+ atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
25232711 clear_bit(R5_ReadError, &sh->dev[i].flags);
25242712 clear_bit(R5_ReWrite, &sh->dev[i].flags);
25252713 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
....@@ -2564,10 +2752,16 @@
25642752 (unsigned long long)s,
25652753 bdn);
25662754 } else if (atomic_read(&rdev->read_errors)
2567
- > conf->max_nr_stripes)
2568
- pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2569
- mdname(conf->mddev), bdn);
2570
- else
2755
+ > conf->max_nr_stripes) {
2756
+ if (!test_bit(Faulty, &rdev->flags)) {
2757
+ pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2758
+ mdname(conf->mddev),
2759
+ atomic_read(&rdev->read_errors),
2760
+ conf->max_nr_stripes);
2761
+ pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2762
+ mdname(conf->mddev), bdn);
2763
+ }
2764
+ } else
25712765 retry = 1;
25722766 if (set_bad && test_bit(In_sync, &rdev->flags)
25732767 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
....@@ -2586,7 +2780,7 @@
25862780 if (!(set_bad
25872781 && test_bit(In_sync, &rdev->flags)
25882782 && rdev_set_badblocks(
2589
- rdev, sh->sector, STRIPE_SECTORS, 0)))
2783
+ rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
25902784 md_error(conf->mddev, rdev);
25912785 }
25922786 }
....@@ -2602,7 +2796,7 @@
26022796 struct stripe_head *sh = bi->bi_private;
26032797 struct r5conf *conf = sh->raid_conf;
26042798 int disks = sh->disks, i;
2605
- struct md_rdev *uninitialized_var(rdev);
2799
+ struct md_rdev *rdev;
26062800 sector_t first_bad;
26072801 int bad_sectors;
26082802 int replacement = 0;
....@@ -2638,7 +2832,7 @@
26382832 if (bi->bi_status)
26392833 md_error(conf->mddev, rdev);
26402834 else if (is_badblock(rdev, sh->sector,
2641
- STRIPE_SECTORS,
2835
+ RAID5_STRIPE_SECTORS(conf),
26422836 &first_bad, &bad_sectors))
26432837 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
26442838 } else {
....@@ -2650,7 +2844,7 @@
26502844 set_bit(MD_RECOVERY_NEEDED,
26512845 &rdev->mddev->recovery);
26522846 } else if (is_badblock(rdev, sh->sector,
2653
- STRIPE_SECTORS,
2847
+ RAID5_STRIPE_SECTORS(conf),
26542848 &first_bad, &bad_sectors)) {
26552849 set_bit(R5_MadeGood, &sh->dev[i].flags);
26562850 if (test_bit(R5_ReadError, &sh->dev[i].flags))
....@@ -2670,10 +2864,10 @@
26702864 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
26712865 clear_bit(R5_LOCKED, &sh->dev[i].flags);
26722866 set_bit(STRIPE_HANDLE, &sh->state);
2673
- raid5_release_stripe(sh);
26742867
26752868 if (sh->batch_head && sh != sh->batch_head)
26762869 raid5_release_stripe(sh->batch_head);
2870
+ raid5_release_stripe(sh);
26772871 }
26782872
26792873 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
@@ -2683,22 +2877,31 @@
         unsigned long flags;
         pr_debug("raid456: error called\n");

+        pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n",
+                mdname(mddev), bdevname(rdev->bdev, b));
+
         spin_lock_irqsave(&conf->device_lock, flags);
         set_bit(Faulty, &rdev->flags);
         clear_bit(In_sync, &rdev->flags);
         mddev->degraded = raid5_calc_degraded(conf);
+
+        if (has_failed(conf)) {
+                set_bit(MD_BROKEN, &conf->mddev->flags);
+                conf->recovery_disabled = mddev->recovery_disabled;
+
+                pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
+                        mdname(mddev), mddev->degraded, conf->raid_disks);
+        } else {
+                pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
+                        mdname(mddev), conf->raid_disks - mddev->degraded);
+        }
+
         spin_unlock_irqrestore(&conf->device_lock, flags);
         set_bit(MD_RECOVERY_INTR, &mddev->recovery);

         set_bit(Blocked, &rdev->flags);
         set_mask_bits(&mddev->sb_flags, 0,
                       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
-        pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
-                "md/raid:%s: Operation continuing on %d devices.\n",
-                mdname(mddev),
-                bdevname(rdev->bdev, b),
-                mdname(mddev),
-                conf->raid_disks - mddev->degraded);
         r5c_update_on_rdev_error(mddev, rdev);
 }

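The old single pr_crit() always claimed the array was continuing; the rewritten path logs the disk failure first and then reports whether the array actually survives, setting MD_BROKEN in the fatal case so has_failed() keeps returning true. For a hypothetical 4-disk RAID5 (max_degraded = 1) named md0 that loses sdb as its second failed member, the messages would read roughly:

    md/raid:md0: Disk failure on sdb, disabling device.
    md/raid:md0: Cannot continue operation (2/4 failed).

while a first failure would still end with the familiar "Operation continuing on 3 devices." line.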
....@@ -3272,13 +3475,13 @@
32723475 /* check if page is covered */
32733476 sector_t sector = sh->dev[dd_idx].sector;
32743477 for (bi=sh->dev[dd_idx].towrite;
3275
- sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
3478
+ sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
32763479 bi && bi->bi_iter.bi_sector <= sector;
3277
- bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
3480
+ bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
32783481 if (bio_end_sector(bi) >= sector)
32793482 sector = bio_end_sector(bi);
32803483 }
3281
- if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
3484
+ if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
32823485 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
32833486 sh->overwrite_disks++;
32843487 }
....@@ -3303,7 +3506,7 @@
33033506 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
33043507 spin_unlock_irq(&sh->stripe_lock);
33053508 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3306
- STRIPE_SECTORS, 0);
3509
+ RAID5_STRIPE_SECTORS(conf), 0);
33073510 spin_lock_irq(&sh->stripe_lock);
33083511 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
33093512 if (!sh->batch_head) {
....@@ -3365,7 +3568,7 @@
33653568 if (!rdev_set_badblocks(
33663569 rdev,
33673570 sh->sector,
3368
- STRIPE_SECTORS, 0))
3571
+ RAID5_STRIPE_SECTORS(conf), 0))
33693572 md_error(conf->mddev, rdev);
33703573 rdev_dec_pending(rdev, conf->mddev);
33713574 }
....@@ -3385,8 +3588,8 @@
33853588 wake_up(&conf->wait_for_overlap);
33863589
33873590 while (bi && bi->bi_iter.bi_sector <
3388
- sh->dev[i].sector + STRIPE_SECTORS) {
3389
- struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
3591
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3592
+ struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
33903593
33913594 md_write_end(conf->mddev);
33923595 bio_io_error(bi);
....@@ -3394,7 +3597,7 @@
33943597 }
33953598 if (bitmap_end)
33963599 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3397
- STRIPE_SECTORS, 0, 0);
3600
+ RAID5_STRIPE_SECTORS(conf), 0, 0);
33983601 bitmap_end = 0;
33993602 /* and fail all 'written' */
34003603 bi = sh->dev[i].written;
....@@ -3406,8 +3609,8 @@
34063609
34073610 if (bi) bitmap_end = 1;
34083611 while (bi && bi->bi_iter.bi_sector <
3409
- sh->dev[i].sector + STRIPE_SECTORS) {
3410
- struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
3612
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3613
+ struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
34113614
34123615 md_write_end(conf->mddev);
34133616 bio_io_error(bi);
....@@ -3430,9 +3633,9 @@
34303633 if (bi)
34313634 s->to_read--;
34323635 while (bi && bi->bi_iter.bi_sector <
3433
- sh->dev[i].sector + STRIPE_SECTORS) {
3636
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
34343637 struct bio *nextbi =
3435
- r5_next_bio(bi, sh->dev[i].sector);
3638
+ r5_next_bio(conf, bi, sh->dev[i].sector);
34363639
34373640 bio_io_error(bi);
34383641 bi = nextbi;
....@@ -3440,7 +3643,7 @@
34403643 }
34413644 if (bitmap_end)
34423645 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3443
- STRIPE_SECTORS, 0, 0);
3646
+ RAID5_STRIPE_SECTORS(conf), 0, 0);
34443647 /* If we were in the middle of a write the parity block might
34453648 * still be locked - so just clear all R5_LOCKED flags
34463649 */
....@@ -3485,14 +3688,14 @@
34853688 && !test_bit(Faulty, &rdev->flags)
34863689 && !test_bit(In_sync, &rdev->flags)
34873690 && !rdev_set_badblocks(rdev, sh->sector,
3488
- STRIPE_SECTORS, 0))
3691
+ RAID5_STRIPE_SECTORS(conf), 0))
34893692 abort = 1;
34903693 rdev = rcu_dereference(conf->disks[i].replacement);
34913694 if (rdev
34923695 && !test_bit(Faulty, &rdev->flags)
34933696 && !test_bit(In_sync, &rdev->flags)
34943697 && !rdev_set_badblocks(rdev, sh->sector,
3495
- STRIPE_SECTORS, 0))
3698
+ RAID5_STRIPE_SECTORS(conf), 0))
34963699 abort = 1;
34973700 }
34983701 rcu_read_unlock();
....@@ -3500,7 +3703,7 @@
35003703 conf->recovery_disabled =
35013704 conf->mddev->recovery_disabled;
35023705 }
3503
- md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
3706
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
35043707 }
35053708
35063709 static int want_replace(struct stripe_head *sh, int disk_idx)
....@@ -3527,6 +3730,7 @@
35273730 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
35283731 &sh->dev[s->failed_num[1]] };
35293732 int i;
3733
+ bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
35303734
35313735
35323736 if (test_bit(R5_LOCKED, &dev->flags) ||
....@@ -3585,18 +3789,27 @@
35853789 * devices must be read.
35863790 */
35873791 return 1;
3792
+
3793
+ if (s->failed >= 2 &&
3794
+ (fdev[i]->towrite ||
3795
+ s->failed_num[i] == sh->pd_idx ||
3796
+ s->failed_num[i] == sh->qd_idx) &&
3797
+ !test_bit(R5_UPTODATE, &fdev[i]->flags))
3798
+ /* In max degraded raid6, If the failed disk is P, Q,
3799
+ * or we want to read the failed disk, we need to do
3800
+ * reconstruct-write.
3801
+ */
3802
+ force_rcw = true;
35883803 }
35893804
3590
- /* If we are forced to do a reconstruct-write, either because
3591
- * the current RAID6 implementation only supports that, or
3592
- * because parity cannot be trusted and we are currently
3593
- * recovering it, there is extra need to be careful.
3805
+ /* If we are forced to do a reconstruct-write, because parity
3806
+ * cannot be trusted and we are currently recovering it, there
3807
+ * is extra need to be careful.
35943808 * If one of the devices that we would need to read, because
35953809 * it is not being overwritten (and maybe not written at all)
35963810 * is missing/faulty, then we need to read everything we can.
35973811 */
3598
- if (sh->raid_conf->level != 6 &&
3599
- sh->raid_conf->rmw_level != PARITY_DISABLE_RMW &&
3812
+ if (!force_rcw &&
36003813 sh->sector < sh->raid_conf->mddev->recovery_cp)
36013814 /* reconstruct-write isn't being forced */
36023815 return 0;
....@@ -3700,7 +3913,7 @@
37003913 return 0;
37013914 }
37023915
3703
-/**
3916
+/*
37043917 * handle_stripe_fill - read or compute data to satisfy pending requests.
37053918 */
37063919 static void handle_stripe_fill(struct stripe_head *sh,
....@@ -3723,7 +3936,7 @@
37233936 * back cache (prexor with orig_page, and then xor with
37243937 * page) in the read path
37253938 */
3726
- if (s->injournal && s->failed) {
3939
+ if (s->to_read && s->injournal && s->failed) {
37273940 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
37283941 r5c_make_stripe_write_out(sh);
37293942 goto out;
....@@ -3775,14 +3988,14 @@
37753988 wbi = dev->written;
37763989 dev->written = NULL;
37773990 while (wbi && wbi->bi_iter.bi_sector <
3778
- dev->sector + STRIPE_SECTORS) {
3779
- wbi2 = r5_next_bio(wbi, dev->sector);
3991
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
3992
+ wbi2 = r5_next_bio(conf, wbi, dev->sector);
37803993 md_write_end(conf->mddev);
37813994 bio_endio(wbi);
37823995 wbi = wbi2;
37833996 }
37843997 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3785
- STRIPE_SECTORS,
3998
+ RAID5_STRIPE_SECTORS(conf),
37863999 !test_bit(STRIPE_DEGRADED, &sh->state),
37874000 0);
37884001 if (head_sh->batch_head) {
....@@ -3966,10 +4179,8 @@
39664179 set_bit(R5_LOCKED, &dev->flags);
39674180 set_bit(R5_Wantread, &dev->flags);
39684181 s->locked++;
3969
- } else {
4182
+ } else
39704183 set_bit(STRIPE_DELAYED, &sh->state);
3971
- set_bit(STRIPE_HANDLE, &sh->state);
3972
- }
39734184 }
39744185 }
39754186 }
....@@ -3994,10 +4205,8 @@
39944205 set_bit(R5_Wantread, &dev->flags);
39954206 s->locked++;
39964207 qread++;
3997
- } else {
4208
+ } else
39984209 set_bit(STRIPE_DELAYED, &sh->state);
3999
- set_bit(STRIPE_HANDLE, &sh->state);
4000
- }
40014210 }
40024211 }
40034212 if (rcw && conf->mddev->queue)
....@@ -4047,7 +4256,7 @@
40474256 break;
40484257 }
40494258 dev = &sh->dev[s->failed_num[0]];
4050
- /* fall through */
4259
+ fallthrough;
40514260 case check_state_compute_result:
40524261 sh->check_state = check_state_idle;
40534262 if (!dev)
....@@ -4089,7 +4298,7 @@
40894298 */
40904299 set_bit(STRIPE_INSYNC, &sh->state);
40914300 else {
4092
- atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4301
+ atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
40934302 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
40944303 /* don't try to repair!! */
40954304 set_bit(STRIPE_INSYNC, &sh->state);
....@@ -4097,7 +4306,7 @@
40974306 "%llu-%llu\n", mdname(conf->mddev),
40984307 (unsigned long long) sh->sector,
40994308 (unsigned long long) sh->sector +
4100
- STRIPE_SECTORS);
4309
+ RAID5_STRIPE_SECTORS(conf));
41014310 } else {
41024311 sh->check_state = check_state_compute_run;
41034312 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
....@@ -4178,7 +4387,7 @@
41784387
41794388 /* we have 2-disk failure */
41804389 BUG_ON(s->failed != 2);
4181
- /* fall through */
4390
+ fallthrough;
41824391 case check_state_compute_result:
41834392 sh->check_state = check_state_idle;
41844393
....@@ -4254,7 +4463,7 @@
42544463 */
42554464 }
42564465 } else {
4257
- atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4466
+ atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
42584467 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
42594468 /* don't try to repair!! */
42604469 set_bit(STRIPE_INSYNC, &sh->state);
....@@ -4262,7 +4471,7 @@
42624471 "%llu-%llu\n", mdname(conf->mddev),
42634472 (unsigned long long) sh->sector,
42644473 (unsigned long long) sh->sector +
4265
- STRIPE_SECTORS);
4474
+ RAID5_STRIPE_SECTORS(conf));
42664475 } else {
42674476 int *target = &sh->ops.target;
42684477
....@@ -4333,7 +4542,8 @@
43334542 /* place all the copies on one channel */
43344543 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
43354544 tx = async_memcpy(sh2->dev[dd_idx].page,
4336
- sh->dev[i].page, 0, 0, STRIPE_SIZE,
4545
+ sh->dev[i].page, sh2->dev[dd_idx].offset,
4546
+ sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
43374547 &submit);
43384548
43394549 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
....@@ -4432,8 +4642,8 @@
44324642 */
44334643 rdev = rcu_dereference(conf->disks[i].replacement);
44344644 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4435
- rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
4436
- !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4645
+ rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4646
+ !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
44374647 &first_bad, &bad_sectors))
44384648 set_bit(R5_ReadRepl, &dev->flags);
44394649 else {
....@@ -4447,7 +4657,7 @@
44474657 if (rdev && test_bit(Faulty, &rdev->flags))
44484658 rdev = NULL;
44494659 if (rdev) {
4450
- is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4660
+ is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
44514661 &first_bad, &bad_sectors);
44524662 if (s->blocked_rdev == NULL
44534663 && (test_bit(Blocked, &rdev->flags)
....@@ -4474,7 +4684,7 @@
44744684 }
44754685 } else if (test_bit(In_sync, &rdev->flags))
44764686 set_bit(R5_Insync, &dev->flags);
4477
- else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
4687
+ else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
44784688 /* in sync if before recovery_offset */
44794689 set_bit(R5_Insync, &dev->flags);
44804690 else if (test_bit(R5_UPTODATE, &dev->flags) &&
....@@ -4563,12 +4773,12 @@
45634773 rcu_read_unlock();
45644774 }
45654775
4776
+/*
4777
+ * Return '1' if this is a member of batch, or '0' if it is a lone stripe or
4778
+ * a head which can now be handled.
4779
+ */
45664780 static int clear_batch_ready(struct stripe_head *sh)
45674781 {
4568
- /* Return '1' if this is a member of batch, or
4569
- * '0' if it is a lone stripe or a head which can now be
4570
- * handled.
4571
- */
45724782 struct stripe_head *tmp;
45734783 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
45744784 return (sh->batch_head && sh->batch_head != sh);
....@@ -4618,7 +4828,6 @@
46184828 (1 << STRIPE_FULL_WRITE) |
46194829 (1 << STRIPE_BIOFILL_RUN) |
46204830 (1 << STRIPE_COMPUTE_RUN) |
4621
- (1 << STRIPE_OPS_REQ_PENDING) |
46224831 (1 << STRIPE_DISCARD) |
46234832 (1 << STRIPE_BATCH_READY) |
46244833 (1 << STRIPE_BATCH_ERR) |
....@@ -4673,15 +4882,20 @@
46734882 struct r5dev *pdev, *qdev;
46744883
46754884 clear_bit(STRIPE_HANDLE, &sh->state);
4885
+
4886
+ /*
4887
+ * handle_stripe should not continue handle the batched stripe, only
4888
+ * the head of batch list or lone stripe can continue. Otherwise we
4889
+ * could see break_stripe_batch_list warns about the STRIPE_ACTIVE
4890
+ * is set for the batched stripe.
4891
+ */
4892
+ if (clear_batch_ready(sh))
4893
+ return;
4894
+
46764895 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
46774896 /* already being handled, ensure it gets handled
46784897 * again when current action finishes */
46794898 set_bit(STRIPE_HANDLE, &sh->state);
4680
- return;
4681
- }
4682
-
4683
- if (clear_batch_ready(sh) ) {
4684
- clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
46854899 return;
46864900 }
46874901
....@@ -4918,7 +5132,7 @@
49185132 if ((s.syncing || s.replacing) && s.locked == 0 &&
49195133 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
49205134 test_bit(STRIPE_INSYNC, &sh->state)) {
4921
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
5135
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
49225136 clear_bit(STRIPE_SYNCING, &sh->state);
49235137 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
49245138 wake_up(&conf->wait_for_overlap);
....@@ -4937,14 +5151,11 @@
49375151 if (!test_bit(R5_ReWrite, &dev->flags)) {
49385152 set_bit(R5_Wantwrite, &dev->flags);
49395153 set_bit(R5_ReWrite, &dev->flags);
4940
- set_bit(R5_LOCKED, &dev->flags);
4941
- s.locked++;
4942
- } else {
5154
+ } else
49435155 /* let's read it back */
49445156 set_bit(R5_Wantread, &dev->flags);
4945
- set_bit(R5_LOCKED, &dev->flags);
4946
- s.locked++;
4947
- }
5157
+ set_bit(R5_LOCKED, &dev->flags);
5158
+ s.locked++;
49485159 }
49495160 }
49505161
....@@ -4986,7 +5197,7 @@
49865197 clear_bit(STRIPE_EXPAND_READY, &sh->state);
49875198 atomic_dec(&conf->reshape_stripes);
49885199 wake_up(&conf->wait_for_overlap);
4989
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
5200
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
49905201 }
49915202
49925203 if (s.expanding && s.locked == 0 &&
....@@ -5016,14 +5227,14 @@
50165227 /* We own a safe reference to the rdev */
50175228 rdev = conf->disks[i].rdev;
50185229 if (!rdev_set_badblocks(rdev, sh->sector,
5019
- STRIPE_SECTORS, 0))
5230
+ RAID5_STRIPE_SECTORS(conf), 0))
50205231 md_error(conf->mddev, rdev);
50215232 rdev_dec_pending(rdev, conf->mddev);
50225233 }
50235234 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
50245235 rdev = conf->disks[i].rdev;
50255236 rdev_clear_badblocks(rdev, sh->sector,
5026
- STRIPE_SECTORS, 0);
5237
+ RAID5_STRIPE_SECTORS(conf), 0);
50275238 rdev_dec_pending(rdev, conf->mddev);
50285239 }
50295240 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
....@@ -5032,7 +5243,7 @@
50325243 /* rdev have been moved down */
50335244 rdev = conf->disks[i].rdev;
50345245 rdev_clear_badblocks(rdev, sh->sector,
5035
- STRIPE_SECTORS, 0);
5246
+ RAID5_STRIPE_SECTORS(conf), 0);
50365247 rdev_dec_pending(rdev, conf->mddev);
50375248 }
50385249 }
....@@ -5088,28 +5299,6 @@
50885299 hash = sh->hash_lock_index;
50895300 __release_stripe(conf, sh, &temp_inactive_list[hash]);
50905301 }
5091
-}
5092
-
5093
-static int raid5_congested(struct mddev *mddev, int bits)
5094
-{
5095
- struct r5conf *conf = mddev->private;
5096
-
5097
- /* No difference between reads and writes. Just check
5098
- * how busy the stripe_cache is
5099
- */
5100
-
5101
- if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
5102
- return 1;
5103
-
5104
- /* Also checks whether there is pressure on r5cache log space */
5105
- if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
5106
- return 1;
5107
- if (conf->quiesce)
5108
- return 1;
5109
- if (atomic_read(&conf->empty_inactive_list_nr))
5110
- return 1;
5111
-
5112
- return 0;
51135302 }
51145303
51155304 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
....@@ -5257,7 +5446,6 @@
52575446 rcu_read_unlock();
52585447 raid_bio->bi_next = (void*)rdev;
52595448 bio_set_dev(align_bi, rdev->bdev);
5260
- bio_clear_flag(align_bi, BIO_SEG_VALID);
52615449
52625450 if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
52635451 bio_sectors(align_bi),
....@@ -5281,7 +5469,7 @@
52815469 trace_block_bio_remap(align_bi->bi_disk->queue,
52825470 align_bi, disk_devt(mddev->gendisk),
52835471 raid_bio->bi_iter.bi_sector);
5284
- generic_make_request(align_bi);
5472
+ submit_bio_noacct(align_bi);
52855473 return 1;
52865474 } else {
52875475 rcu_read_unlock();
....@@ -5301,7 +5489,7 @@
53015489 struct r5conf *conf = mddev->private;
53025490 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
53035491 bio_chain(split, raid_bio);
5304
- generic_make_request(raid_bio);
5492
+ submit_bio_noacct(raid_bio);
53055493 raid_bio = split;
53065494 }
53075495
....@@ -5497,8 +5685,8 @@
54975685 /* Skip discard while reshape is happening */
54985686 return;
54995687
5500
- logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5501
- last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
5688
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5689
+ last_sector = bio_end_sector(bi);
55025690
55035691 bi->bi_next = NULL;
55045692
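
The rounding in the hunk above relies on RAID5_STRIPE_SECTORS(conf) being a power of two: masking with ~(n - 1) rounds a sector number down to the previous stripe boundary. For example, with the default 4 KiB stripe unit (8 sectors), a discard starting at sector 1234 is aligned down to 1234 & ~7 = 1232; last_sector now simply comes from bio_end_sector() instead of being recomputed by hand.
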
....@@ -5512,7 +5700,7 @@
55125700 last_sector *= conf->chunk_sectors;
55135701
55145702 for (; logical_sector < last_sector;
5515
- logical_sector += STRIPE_SECTORS) {
5703
+ logical_sector += RAID5_STRIPE_SECTORS(conf)) {
55165704 DEFINE_WAIT(w);
55175705 int d;
55185706 again:
....@@ -5557,7 +5745,7 @@
55575745 d++)
55585746 md_bitmap_startwrite(mddev->bitmap,
55595747 sh->sector,
5560
- STRIPE_SECTORS,
5748
+ RAID5_STRIPE_SECTORS(conf),
55615749 0);
55625750 sh->bm_seq = conf->seq_flush + 1;
55635751 set_bit(STRIPE_BIT_DELAY, &sh->state);
....@@ -5622,12 +5810,12 @@
56225810 return true;
56235811 }
56245812
5625
- logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5813
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
56265814 last_sector = bio_end_sector(bi);
56275815 bi->bi_next = NULL;
56285816
56295817 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5630
- for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
5818
+ for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
56315819 int previous;
56325820 int seq;
56335821
....@@ -5725,8 +5913,7 @@
57255913 do_flush = false;
57265914 }
57275915
5728
- if (!sh->batch_head || sh == sh->batch_head)
5729
- set_bit(STRIPE_HANDLE, &sh->state);
5916
+ set_bit(STRIPE_HANDLE, &sh->state);
57305917 clear_bit(STRIPE_DELAYED, &sh->state);
57315918 if ((!sh->batch_head || sh == sh->batch_head) &&
57325919 (bi->bi_opf & REQ_SYNC) &&
....@@ -5791,7 +5978,7 @@
57915978 sector_div(sector_nr, new_data_disks);
57925979 if (sector_nr) {
57935980 mddev->curr_resync_completed = sector_nr;
5794
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5981
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
57955982 *skipped = 1;
57965983 retn = sector_nr;
57975984 goto finish;
....@@ -5905,11 +6092,11 @@
59056092 conf->reshape_safe = mddev->reshape_position;
59066093 spin_unlock_irq(&conf->device_lock);
59076094 wake_up(&conf->wait_for_overlap);
5908
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6095
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
59096096 }
59106097
59116098 INIT_LIST_HEAD(&stripes);
5912
- for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
6099
+ for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
59136100 int j;
59146101 int skipped_disk = 0;
59156102 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
....@@ -5930,7 +6117,7 @@
59306117 skipped_disk = 1;
59316118 continue;
59326119 }
5933
- memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
6120
+ memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
59346121 set_bit(R5_Expanded, &sh->dev[j].flags);
59356122 set_bit(R5_UPTODATE, &sh->dev[j].flags);
59366123 }
....@@ -5965,7 +6152,7 @@
59656152 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
59666153 set_bit(STRIPE_HANDLE, &sh->state);
59676154 raid5_release_stripe(sh);
5968
- first_sector += STRIPE_SECTORS;
6155
+ first_sector += RAID5_STRIPE_SECTORS(conf);
59696156 }
59706157 /* Now that the sources are clearly marked, we can release
59716158 * the destination stripes
....@@ -6012,7 +6199,7 @@
60126199 conf->reshape_safe = mddev->reshape_position;
60136200 spin_unlock_irq(&conf->device_lock);
60146201 wake_up(&conf->wait_for_overlap);
6015
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6202
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
60166203 }
60176204 ret:
60186205 return retn;
....@@ -6071,11 +6258,12 @@
60716258 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
60726259 !conf->fullsync &&
60736260 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6074
- sync_blocks >= STRIPE_SECTORS) {
6261
+ sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
60756262 /* we can skip this block, and probably more */
6076
- sync_blocks /= STRIPE_SECTORS;
6263
+ do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
60776264 *skipped = 1;
6078
- return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
6265
+ /* keep things rounded to whole stripes */
6266
+ return sync_blocks * RAID5_STRIPE_SECTORS(conf);
60796267 }
60806268
60816269 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
....@@ -6108,7 +6296,7 @@
61086296
61096297 raid5_release_stripe(sh);
61106298
6111
- return STRIPE_SECTORS;
6299
+ return RAID5_STRIPE_SECTORS(conf);
61126300 }
61136301
61146302 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
....@@ -6131,14 +6319,14 @@
61316319 int handled = 0;
61326320
61336321 logical_sector = raid_bio->bi_iter.bi_sector &
6134
- ~((sector_t)STRIPE_SECTORS-1);
6322
+ ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
61356323 sector = raid5_compute_sector(conf, logical_sector,
61366324 0, &dd_idx, NULL);
61376325 last_sector = bio_end_sector(raid_bio);
61386326
61396327 for (; logical_sector < last_sector;
6140
- logical_sector += STRIPE_SECTORS,
6141
- sector += STRIPE_SECTORS,
6328
+ logical_sector += RAID5_STRIPE_SECTORS(conf),
6329
+ sector += RAID5_STRIPE_SECTORS(conf),
61426330 scnt++) {
61436331
61446332 if (scnt < offset)
....@@ -6177,6 +6365,8 @@
61776365 static int handle_active_stripes(struct r5conf *conf, int group,
61786366 struct r5worker *worker,
61796367 struct list_head *temp_inactive_list)
6368
+ __releases(&conf->device_lock)
6369
+ __acquires(&conf->device_lock)
61806370 {
61816371 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
61826372 int i, batch_size = 0, hash;
....@@ -6329,7 +6519,18 @@
63296519 spin_unlock_irq(&conf->device_lock);
63306520 md_check_recovery(mddev);
63316521 spin_lock_irq(&conf->device_lock);
6522
+
6523
+ /*
6524
+			 * Waiting on MD_SB_CHANGE_PENDING below may deadlock,
6525
+			 * since md_check_recovery() is needed to clear
6526
+			 * the flag when using mdmon.
6527
+ */
6528
+ continue;
63326529 }
6530
+
6531
+ wait_event_lock_irq(mddev->sb_wait,
6532
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6533
+ conf->device_lock);
63336534 }
63346535 pr_debug("%d stripes handled\n", handled);
63356536
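
The raid5d hunk above stops the daemon from spinning while a superblock write is pending: after md_check_recovery() the loop restarts, and otherwise the thread sleeps on mddev->sb_wait until MD_SB_CHANGE_PENDING is cleared. wait_event_lock_irq() is used because the loop runs under conf->device_lock; roughly, and glossing over details of the real macro in <linux/wait.h>, it behaves like this sketch:

	/*
	 * Simplified sketch of wait_event_lock_irq(wq, condition, lock):
	 * the condition is re-tested with the lock held, and the lock is
	 * dropped only around the actual sleep, so it is held on return.
	 */
	#define sketch_wait_event_lock_irq(wq, condition, lock)			\
	do {									\
		DEFINE_WAIT(__wait);						\
		for (;;) {							\
			prepare_to_wait(&(wq), &__wait, TASK_UNINTERRUPTIBLE);	\
			if (condition)						\
				break;						\
			spin_unlock_irq(&(lock));				\
			schedule();						\
			spin_lock_irq(&(lock));					\
		}								\
		finish_wait(&(wq), &__wait);					\
	} while (0)
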
....@@ -6469,6 +6670,100 @@
64696670 raid5_show_rmw_level,
64706671 raid5_store_rmw_level);
64716672
6673
+static ssize_t
6674
+raid5_show_stripe_size(struct mddev *mddev, char *page)
6675
+{
6676
+ struct r5conf *conf;
6677
+ int ret = 0;
6678
+
6679
+ spin_lock(&mddev->lock);
6680
+ conf = mddev->private;
6681
+ if (conf)
6682
+ ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
6683
+ spin_unlock(&mddev->lock);
6684
+ return ret;
6685
+}
6686
+
6687
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6688
+static ssize_t
6689
+raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
6690
+{
6691
+ struct r5conf *conf;
6692
+ unsigned long new;
6693
+ int err;
6694
+ int size;
6695
+
6696
+ if (len >= PAGE_SIZE)
6697
+ return -EINVAL;
6698
+ if (kstrtoul(page, 10, &new))
6699
+ return -EINVAL;
6700
+
6701
+ /*
6702
+	 * The value must not be bigger than PAGE_SIZE. It must be a
6703
+	 * multiple of DEFAULT_STRIPE_SIZE and the value must be a power
6704
+	 * of two.
6705
+ */
6706
+ if (new % DEFAULT_STRIPE_SIZE != 0 ||
6707
+ new > PAGE_SIZE || new == 0 ||
6708
+ new != roundup_pow_of_two(new))
6709
+ return -EINVAL;
6710
+
6711
+ err = mddev_lock(mddev);
6712
+ if (err)
6713
+ return err;
6714
+
6715
+ conf = mddev->private;
6716
+ if (!conf) {
6717
+ err = -ENODEV;
6718
+ goto out_unlock;
6719
+ }
6720
+
6721
+ if (new == conf->stripe_size)
6722
+ goto out_unlock;
6723
+
6724
+ pr_debug("md/raid: change stripe_size from %lu to %lu\n",
6725
+ conf->stripe_size, new);
6726
+
6727
+ if (mddev->sync_thread ||
6728
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6729
+ mddev->reshape_position != MaxSector ||
6730
+ mddev->sysfs_active) {
6731
+ err = -EBUSY;
6732
+ goto out_unlock;
6733
+ }
6734
+
6735
+ mddev_suspend(mddev);
6736
+ mutex_lock(&conf->cache_size_mutex);
6737
+ size = conf->max_nr_stripes;
6738
+
6739
+ shrink_stripes(conf);
6740
+
6741
+ conf->stripe_size = new;
6742
+ conf->stripe_shift = ilog2(new) - 9;
6743
+ conf->stripe_sectors = new >> 9;
6744
+ if (grow_stripes(conf, size)) {
6745
+ pr_warn("md/raid:%s: couldn't allocate buffers\n",
6746
+ mdname(mddev));
6747
+ err = -ENOMEM;
6748
+ }
6749
+ mutex_unlock(&conf->cache_size_mutex);
6750
+ mddev_resume(mddev);
6751
+
6752
+out_unlock:
6753
+ mddev_unlock(mddev);
6754
+ return err ?: len;
6755
+}
6756
+
6757
+static struct md_sysfs_entry
6758
+raid5_stripe_size = __ATTR(stripe_size, 0644,
6759
+ raid5_show_stripe_size,
6760
+ raid5_store_stripe_size);
6761
+#else
6762
+static struct md_sysfs_entry
6763
+raid5_stripe_size = __ATTR(stripe_size, 0444,
6764
+ raid5_show_stripe_size,
6765
+ NULL);
6766
+#endif
64726767
64736768 static ssize_t
64746769 raid5_show_preread_threshold(struct mddev *mddev, char *page)
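
With this, stripe_size becomes a regular md sysfs attribute: read-only on 4 KiB-page kernels (the #else branch above), writable otherwise, with writes rejected as -EINVAL or -EBUSY under the conditions checked in raid5_store_stripe_size(). A hedged user-space illustration follows; the device name md0 and the value 16384 are placeholders, and 16384 is only accepted on a kernel whose PAGE_SIZE is at least 16 KiB:

	/* Hypothetical example: request a 16 KiB stripe_size on md0. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *attr = "/sys/block/md0/md/stripe_size";
		const char *val = "16384\n";
		int fd = open(attr, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, val, strlen(val)) < 0)
			perror("write");	/* e.g. EINVAL for an invalid size */
		close(fd);
		return 0;
	}
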
....@@ -6548,14 +6843,14 @@
65486843 if (!conf)
65496844 err = -ENODEV;
65506845 else if (new != conf->skip_copy) {
6846
+ struct request_queue *q = mddev->queue;
6847
+
65516848 mddev_suspend(mddev);
65526849 conf->skip_copy = new;
65536850 if (new)
6554
- mddev->queue->backing_dev_info->capabilities |=
6555
- BDI_CAP_STABLE_WRITES;
6851
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
65566852 else
6557
- mddev->queue->backing_dev_info->capabilities &=
6558
- ~BDI_CAP_STABLE_WRITES;
6853
+ blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
65596854 mddev_resume(mddev);
65606855 }
65616856 mddev_unlock(mddev);
....@@ -6595,7 +6890,6 @@
65956890
65966891 static int alloc_thread_groups(struct r5conf *conf, int cnt,
65976892 int *group_cnt,
6598
- int *worker_cnt_per_group,
65996893 struct r5worker_group **worker_groups);
66006894 static ssize_t
66016895 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
....@@ -6604,7 +6898,7 @@
66046898 unsigned int new;
66056899 int err;
66066900 struct r5worker_group *new_groups, *old_groups;
6607
- int group_cnt, worker_cnt_per_group;
6901
+ int group_cnt;
66086902
66096903 if (len >= PAGE_SIZE)
66106904 return -EINVAL;
....@@ -6627,13 +6921,11 @@
66276921 if (old_groups)
66286922 flush_workqueue(raid5_wq);
66296923
6630
- err = alloc_thread_groups(conf, new,
6631
- &group_cnt, &worker_cnt_per_group,
6632
- &new_groups);
6924
+ err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
66336925 if (!err) {
66346926 spin_lock_irq(&conf->device_lock);
66356927 conf->group_cnt = group_cnt;
6636
- conf->worker_cnt_per_group = worker_cnt_per_group;
6928
+ conf->worker_cnt_per_group = new;
66376929 conf->worker_groups = new_groups;
66386930 spin_unlock_irq(&conf->device_lock);
66396931
....@@ -6660,7 +6952,9 @@
66606952 &raid5_group_thread_cnt.attr,
66616953 &raid5_skip_copy.attr,
66626954 &raid5_rmw_level.attr,
6955
+ &raid5_stripe_size.attr,
66636956 &r5c_journal_mode.attr,
6957
+ &ppl_write_hint.attr,
66646958 NULL,
66656959 };
66666960 static struct attribute_group raid5_attrs_group = {
....@@ -6668,16 +6962,13 @@
66686962 .attrs = raid5_attrs,
66696963 };
66706964
6671
-static int alloc_thread_groups(struct r5conf *conf, int cnt,
6672
- int *group_cnt,
6673
- int *worker_cnt_per_group,
6965
+static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
66746966 struct r5worker_group **worker_groups)
66756967 {
66766968 int i, j, k;
66776969 ssize_t size;
66786970 struct r5worker *workers;
66796971
6680
- *worker_cnt_per_group = cnt;
66816972 if (cnt == 0) {
66826973 *group_cnt = 0;
66836974 *worker_groups = NULL;
....@@ -6743,25 +7034,25 @@
67437034 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
67447035 {
67457036 safe_put_page(percpu->spare_page);
6746
- if (percpu->scribble)
6747
- flex_array_free(percpu->scribble);
67487037 percpu->spare_page = NULL;
7038
+ kvfree(percpu->scribble);
67497039 percpu->scribble = NULL;
67507040 }
67517041
67527042 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
67537043 {
6754
- if (conf->level == 6 && !percpu->spare_page)
7044
+ if (conf->level == 6 && !percpu->spare_page) {
67557045 percpu->spare_page = alloc_page(GFP_KERNEL);
6756
- if (!percpu->scribble)
6757
- percpu->scribble = scribble_alloc(max(conf->raid_disks,
6758
- conf->previous_raid_disks),
6759
- max(conf->chunk_sectors,
6760
- conf->prev_chunk_sectors)
6761
- / STRIPE_SECTORS,
6762
- GFP_KERNEL);
7046
+ if (!percpu->spare_page)
7047
+ return -ENOMEM;
7048
+ }
67637049
6764
- if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
7050
+ if (scribble_alloc(percpu,
7051
+ max(conf->raid_disks,
7052
+ conf->previous_raid_disks),
7053
+ max(conf->chunk_sectors,
7054
+ conf->prev_chunk_sectors)
7055
+ / RAID5_STRIPE_SECTORS(conf))) {
67657056 free_scratch_buffer(conf, percpu);
67667057 return -ENOMEM;
67677058 }
....@@ -6877,7 +7168,7 @@
68777168 struct disk_info *disk;
68787169 char pers_name[6];
68797170 int i;
6880
- int group_cnt, worker_cnt_per_group;
7171
+ int group_cnt;
68817172 struct r5worker_group *new_group;
68827173 int ret;
68837174
....@@ -6913,6 +7204,12 @@
69137204 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
69147205 if (conf == NULL)
69157206 goto abort;
7207
+
7208
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7209
+ conf->stripe_size = DEFAULT_STRIPE_SIZE;
7210
+ conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
7211
+ conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
7212
+#endif
69167213 INIT_LIST_HEAD(&conf->free_list);
69177214 INIT_LIST_HEAD(&conf->pending_list);
69187215 conf->pending_data = kcalloc(PENDING_IO_MAX,
....@@ -6923,15 +7220,14 @@
69237220 for (i = 0; i < PENDING_IO_MAX; i++)
69247221 list_add(&conf->pending_data[i].sibling, &conf->free_list);
69257222 /* Don't enable multi-threading by default*/
6926
- if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
6927
- &new_group)) {
7223
+ if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) {
69287224 conf->group_cnt = group_cnt;
6929
- conf->worker_cnt_per_group = worker_cnt_per_group;
7225
+ conf->worker_cnt_per_group = 0;
69307226 conf->worker_groups = new_group;
69317227 } else
69327228 goto abort;
69337229 spin_lock_init(&conf->device_lock);
6934
- seqcount_init(&conf->gen_lock);
7230
+ seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
69357231 mutex_init(&conf->cache_size_mutex);
69367232 init_waitqueue_head(&conf->wait_for_quiescent);
69377233 init_waitqueue_head(&conf->wait_for_stripe);
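
The three conf->stripe_* fields seeded above stay mutually consistent: assuming the usual DEFAULT_STRIPE_SIZE of 4096 bytes, stripe_shift = ilog2(4096) - 9 = 3 and stripe_sectors = 4096 >> 9 = 8, i.e. eight 512-byte sectors per stripe unit. raid5_store_stripe_size() recomputes the same trio whenever the size is changed through sysfs, which is why the rest of the code can rely on the RAID5_STRIPE_*() accessors instead of the old constants.
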
....@@ -7065,8 +7361,8 @@
70657361 conf->min_nr_stripes = NR_STRIPES;
70667362 if (mddev->reshape_position != MaxSector) {
70677363 int stripes = max_t(int,
7068
- ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
7069
- ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
7364
+ ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
7365
+ ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
70707366 conf->min_nr_stripes = max(NR_STRIPES, stripes);
70717367 if (conf->min_nr_stripes != NR_STRIPES)
70727368 pr_info("md/raid:%s: force stripe size %d for reshape\n",
....@@ -7139,6 +7435,12 @@
71397435 return 1;
71407436 }
71417437 return 0;
7438
+}
7439
+
7440
+static void raid5_set_io_opt(struct r5conf *conf)
7441
+{
7442
+ blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
7443
+ (conf->raid_disks - conf->max_degraded));
71427444 }
71437445
71447446 static int raid5_run(struct mddev *mddev)
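
raid5_set_io_opt() centralises the optimal-I/O-size hint used both at raid5_run() time and in the reshape-completion path further down: io_opt is one full data stripe, i.e. the chunk size in bytes times the number of data disks (raid_disks - max_degraded). Purely as an illustration, a six-device RAID6 with 512 KiB chunks would advertise io_opt = 512 KiB * (6 - 2) = 2 MiB.
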
....@@ -7425,13 +7727,10 @@
74257727 int data_disks = conf->previous_raid_disks - conf->max_degraded;
74267728 int stripe = data_disks *
74277729 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
7428
- if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
7429
- mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
74307730
74317731 chunk_size = mddev->chunk_sectors << 9;
74327732 blk_queue_io_min(mddev->queue, chunk_size);
7433
- blk_queue_io_opt(mddev->queue, chunk_size *
7434
- (conf->raid_disks - conf->max_degraded));
7733
+ raid5_set_io_opt(conf);
74357734 mddev->queue->limits.raid_partial_stripes_expensive = 1;
74367735 /*
74377736 * We can only discard a whole stripe. It doesn't make sense to
....@@ -7716,6 +8015,7 @@
77168015 */
77178016 if (rdev->saved_raid_disk >= 0 &&
77188017 rdev->saved_raid_disk >= first &&
8018
+ rdev->saved_raid_disk <= last &&
77198019 conf->disks[rdev->saved_raid_disk].rdev == NULL)
77208020 first = rdev->saved_raid_disk;
77218021
....@@ -7797,14 +8097,14 @@
77978097 * stripe_heads first.
77988098 */
77998099 struct r5conf *conf = mddev->private;
7800
- if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
8100
+ if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
78018101 > conf->min_nr_stripes ||
7802
- ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
8102
+ ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
78038103 > conf->min_nr_stripes) {
78048104 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
78058105 mdname(mddev),
78068106 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7807
- / STRIPE_SIZE)*4);
8107
+ / RAID5_STRIPE_SIZE(conf))*4);
78088108 return 0;
78098109 }
78108110 return 1;
....@@ -7940,8 +8240,8 @@
79408240 else
79418241 rdev->recovery_offset = 0;
79428242
7943
- if (sysfs_link_rdev(mddev, rdev))
7944
- /* Failure here is OK */;
8243
+ /* Failure here is OK */
8244
+ sysfs_link_rdev(mddev, rdev);
79458245 }
79468246 } else if (rdev->raid_disk >= conf->previous_raid_disks
79478247 && !test_bit(Faulty, &rdev->flags)) {
....@@ -8015,16 +8315,8 @@
80158315 spin_unlock_irq(&conf->device_lock);
80168316 wake_up(&conf->wait_for_overlap);
80178317
8018
- /* read-ahead size must cover two whole stripes, which is
8019
- * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
8020
- */
8021
- if (conf->mddev->queue) {
8022
- int data_disks = conf->raid_disks - conf->max_degraded;
8023
- int stripe = data_disks * ((conf->chunk_sectors << 9)
8024
- / PAGE_SIZE);
8025
- if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
8026
- conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
8027
- }
8318
+ if (conf->mddev->queue)
8319
+ raid5_set_io_opt(conf);
80288320 }
80298321 }
80308322
....@@ -8136,7 +8428,7 @@
81368428 while (chunksect && (mddev->array_sectors & (chunksect-1)))
81378429 chunksect >>= 1;
81388430
8139
- if ((chunksect<<9) < STRIPE_SIZE)
8431
+ if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
81408432 /* array size does not allow a suitable chunk size */
81418433 return ERR_PTR(-EINVAL);
81428434
....@@ -8423,7 +8715,6 @@
84238715 .finish_reshape = raid5_finish_reshape,
84248716 .quiesce = raid5_quiesce,
84258717 .takeover = raid6_takeover,
8426
- .congested = raid5_congested,
84278718 .change_consistency_policy = raid5_change_consistency_policy,
84288719 };
84298720 static struct md_personality raid5_personality =
....@@ -8448,7 +8739,6 @@
84488739 .finish_reshape = raid5_finish_reshape,
84498740 .quiesce = raid5_quiesce,
84508741 .takeover = raid5_takeover,
8451
- .congested = raid5_congested,
84528742 .change_consistency_policy = raid5_change_consistency_policy,
84538743 };
84548744
....@@ -8474,7 +8764,6 @@
84748764 .finish_reshape = raid5_finish_reshape,
84758765 .quiesce = raid5_quiesce,
84768766 .takeover = raid4_takeover,
8477
- .congested = raid5_congested,
84788767 .change_consistency_policy = raid5_change_consistency_policy,
84798768 };
84808769