hc
2023-12-11 6778948f9de86c3cfaf36725a7c87dcff9ba247f
kernel/drivers/md/raid5.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * raid5.c : Multiple Devices driver for Linux
34 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
....@@ -7,15 +8,6 @@
78 * RAID-4/5/6 management functions.
89 * Thanks to Penguin Computing for making the RAID-6 development possible
910 * by donating a test server!
10
- *
11
- * This program is free software; you can redistribute it and/or modify
12
- * it under the terms of the GNU General Public License as published by
13
- * the Free Software Foundation; either version 2, or (at your option)
14
- * any later version.
15
- *
16
- * You should have received a copy of the GNU General Public License
17
- * (for example /usr/src/linux/COPYING); if not, write to the Free
18
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1911 */
2012
2113 /*
....@@ -44,6 +36,7 @@
4436 */
4537
4638 #include <linux/blkdev.h>
39
+#include <linux/delay.h>
4740 #include <linux/kthread.h>
4841 #include <linux/raid/pq.h>
4942 #include <linux/async_tx.h>
....@@ -54,7 +47,6 @@
5447 #include <linux/slab.h>
5548 #include <linux/ratelimit.h>
5649 #include <linux/nodemask.h>
57
-#include <linux/flex_array.h>
5850
5951 #include <trace/events/block.h>
6052 #include <linux/list_sort.h>
....@@ -78,13 +70,13 @@
7870
7971 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
8072 {
81
- int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
73
+ int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
8274 return &conf->stripe_hashtbl[hash];
8375 }
8476
85
-static inline int stripe_hash_locks_hash(sector_t sect)
77
+static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
8678 {
87
- return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
79
+ return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
8880 }
8981
9082 static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
....@@ -457,13 +449,74 @@
457449 return sh;
458450 }
459451
452
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
453
+static void free_stripe_pages(struct stripe_head *sh)
454
+{
455
+ int i;
456
+ struct page *p;
457
+
458
+ /* Have not allocate page pool */
459
+ if (!sh->pages)
460
+ return;
461
+
462
+ for (i = 0; i < sh->nr_pages; i++) {
463
+ p = sh->pages[i];
464
+ if (p)
465
+ put_page(p);
466
+ sh->pages[i] = NULL;
467
+ }
468
+}
469
+
470
+static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
471
+{
472
+ int i;
473
+ struct page *p;
474
+
475
+ for (i = 0; i < sh->nr_pages; i++) {
476
+ /* The page have allocated. */
477
+ if (sh->pages[i])
478
+ continue;
479
+
480
+ p = alloc_page(gfp);
481
+ if (!p) {
482
+ free_stripe_pages(sh);
483
+ return -ENOMEM;
484
+ }
485
+ sh->pages[i] = p;
486
+ }
487
+ return 0;
488
+}
489
+
490
+static int
491
+init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
492
+{
493
+ int nr_pages, cnt;
494
+
495
+ if (sh->pages)
496
+ return 0;
497
+
498
+ /* Each of the sh->dev[i] need one conf->stripe_size */
499
+ cnt = PAGE_SIZE / conf->stripe_size;
500
+ nr_pages = (disks + cnt - 1) / cnt;
501
+
502
+ sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
503
+ if (!sh->pages)
504
+ return -ENOMEM;
505
+ sh->nr_pages = nr_pages;
506
+ sh->stripes_per_page = cnt;
507
+ return 0;
508
+}
509
+#endif
510
+
460511 static void shrink_buffers(struct stripe_head *sh)
461512 {
462
- struct page *p;
463513 int i;
464514 int num = sh->raid_conf->pool_size;
465515
516
+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
466517 for (i = 0; i < num ; i++) {
518
+ struct page *p;
519
+
467520 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
468521 p = sh->dev[i].page;
469522 if (!p)
....@@ -471,6 +524,11 @@
471524 sh->dev[i].page = NULL;
472525 put_page(p);
473526 }
527
+#else
528
+ for (i = 0; i < num; i++)
529
+ sh->dev[i].page = NULL;
530
+ free_stripe_pages(sh); /* Free pages */
531
+#endif
474532 }
475533
476534 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
....@@ -478,6 +536,7 @@
478536 int i;
479537 int num = sh->raid_conf->pool_size;
480538
539
+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
481540 for (i = 0; i < num; i++) {
482541 struct page *page;
483542
....@@ -486,8 +545,18 @@
486545 }
487546 sh->dev[i].page = page;
488547 sh->dev[i].orig_page = page;
548
+ sh->dev[i].offset = 0;
489549 }
550
+#else
551
+ if (alloc_stripe_pages(sh, gfp))
552
+ return -ENOMEM;
490553
554
+ for (i = 0; i < num; i++) {
555
+ sh->dev[i].page = raid5_get_dev_page(sh, i);
556
+ sh->dev[i].orig_page = sh->dev[i].page;
557
+ sh->dev[i].offset = raid5_get_page_offset(sh, i);
558
+ }
559
+#endif
491560 return 0;
492561 }
493562
....@@ -618,17 +687,17 @@
618687 return degraded;
619688 }
620689
621
-static int has_failed(struct r5conf *conf)
690
+static bool has_failed(struct r5conf *conf)
622691 {
623
- int degraded;
692
+ int degraded = conf->mddev->degraded;
624693
625
- if (conf->mddev->reshape_position == MaxSector)
626
- return conf->mddev->degraded > conf->max_degraded;
694
+ if (test_bit(MD_BROKEN, &conf->mddev->flags))
695
+ return true;
627696
628
- degraded = raid5_calc_degraded(conf);
629
- if (degraded > conf->max_degraded)
630
- return 1;
631
- return 0;
697
+ if (conf->mddev->reshape_position != MaxSector)
698
+ degraded = raid5_calc_degraded(conf);
699
+
700
+ return degraded > conf->max_degraded;
632701 }
633702
634703 struct stripe_head *
....@@ -636,7 +705,7 @@
636705 int previous, int noblock, int noquiesce)
637706 {
638707 struct stripe_head *sh;
639
- int hash = stripe_hash_locks_hash(sector);
708
+ int hash = stripe_hash_locks_hash(conf, sector);
640709 int inc_empty_inactive_list_flag;
641710
642711 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
....@@ -712,6 +781,8 @@
712781 }
713782
714783 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
784
+ __acquires(&sh1->stripe_lock)
785
+ __acquires(&sh2->stripe_lock)
715786 {
716787 if (sh1 > sh2) {
717788 spin_lock_irq(&sh2->stripe_lock);
....@@ -723,6 +794,8 @@
723794 }
724795
725796 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
797
+ __releases(&sh1->stripe_lock)
798
+ __releases(&sh2->stripe_lock)
726799 {
727800 spin_unlock(&sh1->stripe_lock);
728801 spin_unlock_irq(&sh2->stripe_lock);
....@@ -753,9 +826,9 @@
753826 tmp_sec = sh->sector;
754827 if (!sector_div(tmp_sec, conf->chunk_sectors))
755828 return;
756
- head_sector = sh->sector - STRIPE_SECTORS;
829
+ head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
757830
758
- hash = stripe_hash_locks_hash(head_sector);
831
+ hash = stripe_hash_locks_hash(conf, head_sector);
759832 spin_lock_irq(conf->hash_locks + hash);
760833 head = __find_stripe(conf, head_sector, conf->generation);
761834 if (head && !atomic_inc_not_zero(&head->count)) {
....@@ -878,7 +951,7 @@
878951 struct bio *bio;
879952
880953 while ((bio = bio_list_pop(tmp)))
881
- generic_make_request(bio);
954
+ submit_bio_noacct(bio);
882955 }
883956
884957 static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
....@@ -1062,7 +1135,7 @@
10621135 test_bit(WriteErrorSeen, &rdev->flags)) {
10631136 sector_t first_bad;
10641137 int bad_sectors;
1065
- int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
1138
+ int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
10661139 &first_bad, &bad_sectors);
10671140 if (!bad)
10681141 break;
....@@ -1094,7 +1167,7 @@
10941167 if (rdev) {
10951168 if (s->syncing || s->expanding || s->expanded
10961169 || s->replacing)
1097
- md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1170
+ md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
10981171
10991172 set_bit(STRIPE_IO_STARTED, &sh->state);
11001173
....@@ -1134,12 +1207,12 @@
11341207 else
11351208 sh->dev[i].vec.bv_page = sh->dev[i].page;
11361209 bi->bi_vcnt = 1;
1137
- bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1138
- bi->bi_io_vec[0].bv_offset = 0;
1139
- bi->bi_iter.bi_size = STRIPE_SIZE;
1210
+ bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1211
+ bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1212
+ bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
11401213 bi->bi_write_hint = sh->dev[i].write_hint;
11411214 if (!rrdev)
1142
- sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
1215
+ sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
11431216 /*
11441217 * If this is discard request, set bi_vcnt 0. We don't
11451218 * want to confuse SCSI because SCSI will replace payload
....@@ -1156,12 +1229,12 @@
11561229 if (should_defer && op_is_write(op))
11571230 bio_list_add(&pending_bios, bi);
11581231 else
1159
- generic_make_request(bi);
1232
+ submit_bio_noacct(bi);
11601233 }
11611234 if (rrdev) {
11621235 if (s->syncing || s->expanding || s->expanded
11631236 || s->replacing)
1164
- md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
1237
+ md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
11651238
11661239 set_bit(STRIPE_IO_STARTED, &sh->state);
11671240
....@@ -1188,11 +1261,11 @@
11881261 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
11891262 sh->dev[i].rvec.bv_page = sh->dev[i].page;
11901263 rbi->bi_vcnt = 1;
1191
- rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1192
- rbi->bi_io_vec[0].bv_offset = 0;
1193
- rbi->bi_iter.bi_size = STRIPE_SIZE;
1264
+ rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1265
+ rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1266
+ rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
11941267 rbi->bi_write_hint = sh->dev[i].write_hint;
1195
- sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
1268
+ sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
11961269 /*
11971270 * If this is discard request, set bi_vcnt 0. We don't
11981271 * want to confuse SCSI because SCSI will replace payload
....@@ -1206,7 +1279,7 @@
12061279 if (should_defer && op_is_write(op))
12071280 bio_list_add(&pending_bios, rbi);
12081281 else
1209
- generic_make_request(rbi);
1282
+ submit_bio_noacct(rbi);
12101283 }
12111284 if (!rdev && !rrdev) {
12121285 if (op_is_write(op))
....@@ -1231,7 +1304,7 @@
12311304
12321305 static struct dma_async_tx_descriptor *
12331306 async_copy_data(int frombio, struct bio *bio, struct page **page,
1234
- sector_t sector, struct dma_async_tx_descriptor *tx,
1307
+ unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
12351308 struct stripe_head *sh, int no_skipcopy)
12361309 {
12371310 struct bio_vec bvl;
....@@ -1240,6 +1313,7 @@
12401313 int page_offset;
12411314 struct async_submit_ctl submit;
12421315 enum async_tx_flags flags = 0;
1316
+ struct r5conf *conf = sh->raid_conf;
12431317
12441318 if (bio->bi_iter.bi_sector >= sector)
12451319 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
....@@ -1261,8 +1335,8 @@
12611335 len -= b_offset;
12621336 }
12631337
1264
- if (len > 0 && page_offset + len > STRIPE_SIZE)
1265
- clen = STRIPE_SIZE - page_offset;
1338
+ if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
1339
+ clen = RAID5_STRIPE_SIZE(conf) - page_offset;
12661340 else
12671341 clen = len;
12681342
....@@ -1270,17 +1344,17 @@
12701344 b_offset += bvl.bv_offset;
12711345 bio_page = bvl.bv_page;
12721346 if (frombio) {
1273
- if (sh->raid_conf->skip_copy &&
1347
+ if (conf->skip_copy &&
12741348 b_offset == 0 && page_offset == 0 &&
1275
- clen == STRIPE_SIZE &&
1349
+ clen == RAID5_STRIPE_SIZE(conf) &&
12761350 !no_skipcopy)
12771351 *page = bio_page;
12781352 else
1279
- tx = async_memcpy(*page, bio_page, page_offset,
1353
+ tx = async_memcpy(*page, bio_page, page_offset + poff,
12801354 b_offset, clen, &submit);
12811355 } else
12821356 tx = async_memcpy(bio_page, *page, b_offset,
1283
- page_offset, clen, &submit);
1357
+ page_offset + poff, clen, &submit);
12841358 }
12851359 /* chain the operations */
12861360 submit.depend_tx = tx;
....@@ -1297,6 +1371,7 @@
12971371 {
12981372 struct stripe_head *sh = stripe_head_ref;
12991373 int i;
1374
+ struct r5conf *conf = sh->raid_conf;
13001375
13011376 pr_debug("%s: stripe %llu\n", __func__,
13021377 (unsigned long long)sh->sector);
....@@ -1317,8 +1392,8 @@
13171392 rbi = dev->read;
13181393 dev->read = NULL;
13191394 while (rbi && rbi->bi_iter.bi_sector <
1320
- dev->sector + STRIPE_SECTORS) {
1321
- rbi2 = r5_next_bio(rbi, dev->sector);
1395
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1396
+ rbi2 = r5_next_bio(conf, rbi, dev->sector);
13221397 bio_endio(rbi);
13231398 rbi = rbi2;
13241399 }
....@@ -1335,6 +1410,7 @@
13351410 struct dma_async_tx_descriptor *tx = NULL;
13361411 struct async_submit_ctl submit;
13371412 int i;
1413
+ struct r5conf *conf = sh->raid_conf;
13381414
13391415 BUG_ON(sh->batch_head);
13401416 pr_debug("%s: stripe %llu\n", __func__,
....@@ -1349,10 +1425,11 @@
13491425 dev->toread = NULL;
13501426 spin_unlock_irq(&sh->stripe_lock);
13511427 while (rbi && rbi->bi_iter.bi_sector <
1352
- dev->sector + STRIPE_SECTORS) {
1428
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
13531429 tx = async_copy_data(0, rbi, &dev->page,
1430
+ dev->offset,
13541431 dev->sector, tx, sh, 0);
1355
- rbi = r5_next_bio(rbi, dev->sector);
1432
+ rbi = r5_next_bio(conf, rbi, dev->sector);
13561433 }
13571434 }
13581435 }
....@@ -1394,22 +1471,25 @@
13941471 }
13951472
13961473 /* return a pointer to the address conversion region of the scribble buffer */
1397
-static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1398
- struct raid5_percpu *percpu, int i)
1474
+static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
13991475 {
1400
- void *addr;
1401
-
1402
- addr = flex_array_get(percpu->scribble, i);
1403
- return addr + sizeof(struct page *) * (sh->disks + 2);
1476
+ return percpu->scribble + i * percpu->scribble_obj_size;
14041477 }
14051478
14061479 /* return a pointer to the address conversion region of the scribble buffer */
1407
-static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1480
+static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1481
+ struct raid5_percpu *percpu, int i)
14081482 {
1409
- void *addr;
1483
+ return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
1484
+}
14101485
1411
- addr = flex_array_get(percpu->scribble, i);
1412
- return addr;
1486
+/*
1487
+ * Return a pointer to record offset address.
1488
+ */
1489
+static unsigned int *
1490
+to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
1491
+{
1492
+ return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2);
14131493 }
14141494
14151495 static struct dma_async_tx_descriptor *
....@@ -1417,9 +1497,11 @@
14171497 {
14181498 int disks = sh->disks;
14191499 struct page **xor_srcs = to_addr_page(percpu, 0);
1500
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
14201501 int target = sh->ops.target;
14211502 struct r5dev *tgt = &sh->dev[target];
14221503 struct page *xor_dest = tgt->page;
1504
+ unsigned int off_dest = tgt->offset;
14231505 int count = 0;
14241506 struct dma_async_tx_descriptor *tx;
14251507 struct async_submit_ctl submit;
....@@ -1431,24 +1513,30 @@
14311513 __func__, (unsigned long long)sh->sector, target);
14321514 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
14331515
1434
- for (i = disks; i--; )
1435
- if (i != target)
1516
+ for (i = disks; i--; ) {
1517
+ if (i != target) {
1518
+ off_srcs[count] = sh->dev[i].offset;
14361519 xor_srcs[count++] = sh->dev[i].page;
1520
+ }
1521
+ }
14371522
14381523 atomic_inc(&sh->count);
14391524
14401525 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
14411526 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
14421527 if (unlikely(count == 1))
1443
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1528
+ tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
1529
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
14441530 else
1445
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1531
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1532
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
14461533
14471534 return tx;
14481535 }
14491536
14501537 /* set_syndrome_sources - populate source buffers for gen_syndrome
14511538 * @srcs - (struct page *) array of size sh->disks
1539
+ * @offs - (unsigned int) array of offset for each page
14521540 * @sh - stripe_head to parse
14531541 *
14541542 * Populates srcs in proper layout order for the stripe and returns the
....@@ -1457,6 +1545,7 @@
14571545 * is recorded in srcs[count+1]].
14581546 */
14591547 static int set_syndrome_sources(struct page **srcs,
1548
+ unsigned int *offs,
14601549 struct stripe_head *sh,
14611550 int srctype)
14621551 {
....@@ -1487,6 +1576,12 @@
14871576 srcs[slot] = sh->dev[i].orig_page;
14881577 else
14891578 srcs[slot] = sh->dev[i].page;
1579
+ /*
1580
+ * For R5_InJournal, PAGE_SIZE must be 4KB and will
1581
+ * not shared page. In that case, dev[i].offset
1582
+ * is 0.
1583
+ */
1584
+ offs[slot] = sh->dev[i].offset;
14901585 }
14911586 i = raid6_next_disk(i, disks);
14921587 } while (i != d0_idx);
....@@ -1499,12 +1594,14 @@
14991594 {
15001595 int disks = sh->disks;
15011596 struct page **blocks = to_addr_page(percpu, 0);
1597
+ unsigned int *offs = to_addr_offs(sh, percpu);
15021598 int target;
15031599 int qd_idx = sh->qd_idx;
15041600 struct dma_async_tx_descriptor *tx;
15051601 struct async_submit_ctl submit;
15061602 struct r5dev *tgt;
15071603 struct page *dest;
1604
+ unsigned int dest_off;
15081605 int i;
15091606 int count;
15101607
....@@ -1523,30 +1620,34 @@
15231620 tgt = &sh->dev[target];
15241621 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
15251622 dest = tgt->page;
1623
+ dest_off = tgt->offset;
15261624
15271625 atomic_inc(&sh->count);
15281626
15291627 if (target == qd_idx) {
1530
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1628
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
15311629 blocks[count] = NULL; /* regenerating p is not necessary */
15321630 BUG_ON(blocks[count+1] != dest); /* q should already be set */
15331631 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
15341632 ops_complete_compute, sh,
15351633 to_addr_conv(sh, percpu, 0));
1536
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1634
+ tx = async_gen_syndrome(blocks, offs, count+2,
1635
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
15371636 } else {
15381637 /* Compute any data- or p-drive using XOR */
15391638 count = 0;
15401639 for (i = disks; i-- ; ) {
15411640 if (i == target || i == qd_idx)
15421641 continue;
1642
+ offs[count] = sh->dev[i].offset;
15431643 blocks[count++] = sh->dev[i].page;
15441644 }
15451645
15461646 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
15471647 NULL, ops_complete_compute, sh,
15481648 to_addr_conv(sh, percpu, 0));
1549
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1649
+ tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1650
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
15501651 }
15511652
15521653 return tx;
....@@ -1565,6 +1666,7 @@
15651666 struct r5dev *tgt2 = &sh->dev[target2];
15661667 struct dma_async_tx_descriptor *tx;
15671668 struct page **blocks = to_addr_page(percpu, 0);
1669
+ unsigned int *offs = to_addr_offs(sh, percpu);
15681670 struct async_submit_ctl submit;
15691671
15701672 BUG_ON(sh->batch_head);
....@@ -1577,13 +1679,16 @@
15771679 /* we need to open-code set_syndrome_sources to handle the
15781680 * slot number conversion for 'faila' and 'failb'
15791681 */
1580
- for (i = 0; i < disks ; i++)
1682
+ for (i = 0; i < disks ; i++) {
1683
+ offs[i] = 0;
15811684 blocks[i] = NULL;
1685
+ }
15821686 count = 0;
15831687 i = d0_idx;
15841688 do {
15851689 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
15861690
1691
+ offs[slot] = sh->dev[i].offset;
15871692 blocks[slot] = sh->dev[i].page;
15881693
15891694 if (i == target)
....@@ -1608,10 +1713,12 @@
16081713 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
16091714 ops_complete_compute, sh,
16101715 to_addr_conv(sh, percpu, 0));
1611
- return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1612
- STRIPE_SIZE, &submit);
1716
+ return async_gen_syndrome(blocks, offs, syndrome_disks+2,
1717
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1718
+ &submit);
16131719 } else {
16141720 struct page *dest;
1721
+ unsigned int dest_off;
16151722 int data_target;
16161723 int qd_idx = sh->qd_idx;
16171724
....@@ -1625,22 +1732,26 @@
16251732 for (i = disks; i-- ; ) {
16261733 if (i == data_target || i == qd_idx)
16271734 continue;
1735
+ offs[count] = sh->dev[i].offset;
16281736 blocks[count++] = sh->dev[i].page;
16291737 }
16301738 dest = sh->dev[data_target].page;
1739
+ dest_off = sh->dev[data_target].offset;
16311740 init_async_submit(&submit,
16321741 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
16331742 NULL, NULL, NULL,
16341743 to_addr_conv(sh, percpu, 0));
1635
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1744
+ tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1745
+ RAID5_STRIPE_SIZE(sh->raid_conf),
16361746 &submit);
16371747
1638
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1748
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
16391749 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
16401750 ops_complete_compute, sh,
16411751 to_addr_conv(sh, percpu, 0));
1642
- return async_gen_syndrome(blocks, 0, count+2,
1643
- STRIPE_SIZE, &submit);
1752
+ return async_gen_syndrome(blocks, offs, count+2,
1753
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1754
+ &submit);
16441755 }
16451756 } else {
16461757 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
....@@ -1649,13 +1760,15 @@
16491760 if (failb == syndrome_disks) {
16501761 /* We're missing D+P. */
16511762 return async_raid6_datap_recov(syndrome_disks+2,
1652
- STRIPE_SIZE, faila,
1653
- blocks, &submit);
1763
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1764
+ faila,
1765
+ blocks, offs, &submit);
16541766 } else {
16551767 /* We're missing D+D. */
16561768 return async_raid6_2data_recov(syndrome_disks+2,
1657
- STRIPE_SIZE, faila, failb,
1658
- blocks, &submit);
1769
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1770
+ faila, failb,
1771
+ blocks, offs, &submit);
16591772 }
16601773 }
16611774 }
....@@ -1681,10 +1794,12 @@
16811794 {
16821795 int disks = sh->disks;
16831796 struct page **xor_srcs = to_addr_page(percpu, 0);
1797
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
16841798 int count = 0, pd_idx = sh->pd_idx, i;
16851799 struct async_submit_ctl submit;
16861800
16871801 /* existing parity data subtracted */
1802
+ unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
16881803 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
16891804
16901805 BUG_ON(sh->batch_head);
....@@ -1694,15 +1809,23 @@
16941809 for (i = disks; i--; ) {
16951810 struct r5dev *dev = &sh->dev[i];
16961811 /* Only process blocks that are known to be uptodate */
1697
- if (test_bit(R5_InJournal, &dev->flags))
1812
+ if (test_bit(R5_InJournal, &dev->flags)) {
1813
+ /*
1814
+ * For this case, PAGE_SIZE must be equal to 4KB and
1815
+ * page offset is zero.
1816
+ */
1817
+ off_srcs[count] = dev->offset;
16981818 xor_srcs[count++] = dev->orig_page;
1699
- else if (test_bit(R5_Wantdrain, &dev->flags))
1819
+ } else if (test_bit(R5_Wantdrain, &dev->flags)) {
1820
+ off_srcs[count] = dev->offset;
17001821 xor_srcs[count++] = dev->page;
1822
+ }
17011823 }
17021824
17031825 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
17041826 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1705
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1827
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1828
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
17061829
17071830 return tx;
17081831 }
....@@ -1712,17 +1835,19 @@
17121835 struct dma_async_tx_descriptor *tx)
17131836 {
17141837 struct page **blocks = to_addr_page(percpu, 0);
1838
+ unsigned int *offs = to_addr_offs(sh, percpu);
17151839 int count;
17161840 struct async_submit_ctl submit;
17171841
17181842 pr_debug("%s: stripe %llu\n", __func__,
17191843 (unsigned long long)sh->sector);
17201844
1721
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
1845
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN);
17221846
17231847 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
17241848 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1725
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1849
+ tx = async_gen_syndrome(blocks, offs, count+2,
1850
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
17261851
17271852 return tx;
17281853 }
....@@ -1763,7 +1888,7 @@
17631888 WARN_ON(dev->page != dev->orig_page);
17641889
17651890 while (wbi && wbi->bi_iter.bi_sector <
1766
- dev->sector + STRIPE_SECTORS) {
1891
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
17671892 if (wbi->bi_opf & REQ_FUA)
17681893 set_bit(R5_WantFUA, &dev->flags);
17691894 if (wbi->bi_opf & REQ_SYNC)
....@@ -1772,6 +1897,7 @@
17721897 set_bit(R5_Discard, &dev->flags);
17731898 else {
17741899 tx = async_copy_data(1, wbi, &dev->page,
1900
+ dev->offset,
17751901 dev->sector, tx, sh,
17761902 r5c_is_writeback(conf->log));
17771903 if (dev->page != dev->orig_page &&
....@@ -1781,7 +1907,7 @@
17811907 clear_bit(R5_OVERWRITE, &dev->flags);
17821908 }
17831909 }
1784
- wbi = r5_next_bio(wbi, dev->sector);
1910
+ wbi = r5_next_bio(conf, wbi, dev->sector);
17851911 }
17861912
17871913 if (head_sh->batch_head) {
....@@ -1851,9 +1977,11 @@
18511977 {
18521978 int disks = sh->disks;
18531979 struct page **xor_srcs;
1980
+ unsigned int *off_srcs;
18541981 struct async_submit_ctl submit;
18551982 int count, pd_idx = sh->pd_idx, i;
18561983 struct page *xor_dest;
1984
+ unsigned int off_dest;
18571985 int prexor = 0;
18581986 unsigned long flags;
18591987 int j = 0;
....@@ -1878,24 +2006,31 @@
18782006 again:
18792007 count = 0;
18802008 xor_srcs = to_addr_page(percpu, j);
2009
+ off_srcs = to_addr_offs(sh, percpu);
18812010 /* check if prexor is active which means only process blocks
18822011 * that are part of a read-modify-write (written)
18832012 */
18842013 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
18852014 prexor = 1;
2015
+ off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
18862016 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
18872017 for (i = disks; i--; ) {
18882018 struct r5dev *dev = &sh->dev[i];
18892019 if (head_sh->dev[i].written ||
1890
- test_bit(R5_InJournal, &head_sh->dev[i].flags))
2020
+ test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
2021
+ off_srcs[count] = dev->offset;
18912022 xor_srcs[count++] = dev->page;
2023
+ }
18922024 }
18932025 } else {
18942026 xor_dest = sh->dev[pd_idx].page;
2027
+ off_dest = sh->dev[pd_idx].offset;
18952028 for (i = disks; i--; ) {
18962029 struct r5dev *dev = &sh->dev[i];
1897
- if (i != pd_idx)
2030
+ if (i != pd_idx) {
2031
+ off_srcs[count] = dev->offset;
18982032 xor_srcs[count++] = dev->page;
2033
+ }
18992034 }
19002035 }
19012036
....@@ -1921,9 +2056,11 @@
19212056 }
19222057
19232058 if (unlikely(count == 1))
1924
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
2059
+ tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
2060
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
19252061 else
1926
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
2062
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2063
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
19272064 if (!last_stripe) {
19282065 j++;
19292066 sh = list_first_entry(&sh->batch_list, struct stripe_head,
....@@ -1938,6 +2075,7 @@
19382075 {
19392076 struct async_submit_ctl submit;
19402077 struct page **blocks;
2078
+ unsigned int *offs;
19412079 int count, i, j = 0;
19422080 struct stripe_head *head_sh = sh;
19432081 int last_stripe;
....@@ -1962,6 +2100,7 @@
19622100
19632101 again:
19642102 blocks = to_addr_page(percpu, j);
2103
+ offs = to_addr_offs(sh, percpu);
19652104
19662105 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
19672106 synflags = SYNDROME_SRC_WRITTEN;
....@@ -1971,7 +2110,7 @@
19712110 txflags = ASYNC_TX_ACK;
19722111 }
19732112
1974
- count = set_syndrome_sources(blocks, sh, synflags);
2113
+ count = set_syndrome_sources(blocks, offs, sh, synflags);
19752114 last_stripe = !head_sh->batch_head ||
19762115 list_first_entry(&sh->batch_list,
19772116 struct stripe_head, batch_list) == head_sh;
....@@ -1983,7 +2122,8 @@
19832122 } else
19842123 init_async_submit(&submit, 0, tx, NULL, NULL,
19852124 to_addr_conv(sh, percpu, j));
1986
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
2125
+ tx = async_gen_syndrome(blocks, offs, count+2,
2126
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
19872127 if (!last_stripe) {
19882128 j++;
19892129 sh = list_first_entry(&sh->batch_list, struct stripe_head,
....@@ -2010,7 +2150,9 @@
20102150 int pd_idx = sh->pd_idx;
20112151 int qd_idx = sh->qd_idx;
20122152 struct page *xor_dest;
2153
+ unsigned int off_dest;
20132154 struct page **xor_srcs = to_addr_page(percpu, 0);
2155
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
20142156 struct dma_async_tx_descriptor *tx;
20152157 struct async_submit_ctl submit;
20162158 int count;
....@@ -2022,16 +2164,20 @@
20222164 BUG_ON(sh->batch_head);
20232165 count = 0;
20242166 xor_dest = sh->dev[pd_idx].page;
2167
+ off_dest = sh->dev[pd_idx].offset;
2168
+ off_srcs[count] = off_dest;
20252169 xor_srcs[count++] = xor_dest;
20262170 for (i = disks; i--; ) {
20272171 if (i == pd_idx || i == qd_idx)
20282172 continue;
2173
+ off_srcs[count] = sh->dev[i].offset;
20292174 xor_srcs[count++] = sh->dev[i].page;
20302175 }
20312176
20322177 init_async_submit(&submit, 0, NULL, NULL, NULL,
20332178 to_addr_conv(sh, percpu, 0));
2034
- tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
2179
+ tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2180
+ RAID5_STRIPE_SIZE(sh->raid_conf),
20352181 &sh->ops.zero_sum_result, &submit);
20362182
20372183 atomic_inc(&sh->count);
....@@ -2042,6 +2188,7 @@
20422188 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
20432189 {
20442190 struct page **srcs = to_addr_page(percpu, 0);
2191
+ unsigned int *offs = to_addr_offs(sh, percpu);
20452192 struct async_submit_ctl submit;
20462193 int count;
20472194
....@@ -2049,15 +2196,16 @@
20492196 (unsigned long long)sh->sector, checkp);
20502197
20512198 BUG_ON(sh->batch_head);
2052
- count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
2199
+ count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL);
20532200 if (!checkp)
20542201 srcs[count] = NULL;
20552202
20562203 atomic_inc(&sh->count);
20572204 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
20582205 sh, to_addr_conv(sh, percpu, 0));
2059
- async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
2060
- &sh->ops.zero_sum_result, percpu->spare_page, &submit);
2206
+ async_syndrome_val(srcs, offs, count+2,
2207
+ RAID5_STRIPE_SIZE(sh->raid_conf),
2208
+ &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit);
20612209 }
20622210
20632211 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
....@@ -2069,9 +2217,8 @@
20692217 struct raid5_percpu *percpu;
20702218 unsigned long cpu;
20712219
2072
- cpu = get_cpu_light();
2220
+ cpu = get_cpu();
20732221 percpu = per_cpu_ptr(conf->percpu, cpu);
2074
- spin_lock(&percpu->lock);
20752222 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
20762223 ops_run_biofill(sh);
20772224 overlap_clear++;
....@@ -2130,12 +2277,14 @@
21302277 if (test_and_clear_bit(R5_Overlap, &dev->flags))
21312278 wake_up(&sh->raid_conf->wait_for_overlap);
21322279 }
2133
- spin_unlock(&percpu->lock);
2134
- put_cpu_light();
2280
+ put_cpu();
21352281 }
21362282
21372283 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
21382284 {
2285
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2286
+ kfree(sh->pages);
2287
+#endif
21392288 if (sh->ppl_page)
21402289 __free_page(sh->ppl_page);
21412290 kmem_cache_free(sc, sh);
....@@ -2169,9 +2318,15 @@
21692318 sh->ppl_page = alloc_page(gfp);
21702319 if (!sh->ppl_page) {
21712320 free_stripe(sc, sh);
2172
- sh = NULL;
2321
+ return NULL;
21732322 }
21742323 }
2324
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2325
+ if (init_stripe_shared_pages(sh, conf, disks)) {
2326
+ free_stripe(sc, sh);
2327
+ return NULL;
2328
+ }
2329
+#endif
21752330 }
21762331 return sh;
21772332 }
....@@ -2228,10 +2383,13 @@
22282383 }
22292384
22302385 /**
2231
- * scribble_len - return the required size of the scribble region
2232
- * @num - total number of disks in the array
2386
+ * scribble_alloc - allocate percpu scribble buffer for required size
2387
+ * of the scribble region
2388
+ * @percpu: from for_each_present_cpu() of the caller
2389
+ * @num: total number of disks in the array
2390
+ * @cnt: scribble objs count for required size of the scribble region
22332391 *
2234
- * The size must be enough to contain:
2392
+ * The scribble buffer size must be enough to contain:
22352393 * 1/ a struct page pointer for each device in the array +2
22362394 * 2/ room to convert each entry in (1) to its corresponding dma
22372395 * (dma_map_page()) or page (page_address()) address.
....@@ -2240,21 +2398,29 @@
22402398 * calculate over all devices (not just the data blocks), using zeros in place
22412399 * of the P and Q blocks.
22422400 */
2243
-static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
2401
+static int scribble_alloc(struct raid5_percpu *percpu,
2402
+ int num, int cnt)
22442403 {
2245
- struct flex_array *ret;
2246
- size_t len;
2404
+ size_t obj_size =
2405
+ sizeof(struct page *) * (num + 2) +
2406
+ sizeof(addr_conv_t) * (num + 2) +
2407
+ sizeof(unsigned int) * (num + 2);
2408
+ void *scribble;
22472409
2248
- len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
2249
- ret = flex_array_alloc(len, cnt, flags);
2250
- if (!ret)
2251
- return NULL;
2252
- /* always prealloc all elements, so no locking is required */
2253
- if (flex_array_prealloc(ret, 0, cnt, flags)) {
2254
- flex_array_free(ret);
2255
- return NULL;
2256
- }
2257
- return ret;
2410
+ /*
2411
+ * If here is in raid array suspend context, it is in memalloc noio
2412
+ * context as well, there is no potential recursive memory reclaim
2413
+ * I/Os with the GFP_KERNEL flag.
2414
+ */
2415
+ scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
2416
+ if (!scribble)
2417
+ return -ENOMEM;
2418
+
2419
+ kvfree(percpu->scribble);
2420
+
2421
+ percpu->scribble = scribble;
2422
+ percpu->scribble_obj_size = obj_size;
2423
+ return 0;
22582424 }
22592425
22602426 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
....@@ -2272,23 +2438,17 @@
22722438 return 0;
22732439 mddev_suspend(conf->mddev);
22742440 get_online_cpus();
2441
+
22752442 for_each_present_cpu(cpu) {
22762443 struct raid5_percpu *percpu;
2277
- struct flex_array *scribble;
22782444
22792445 percpu = per_cpu_ptr(conf->percpu, cpu);
2280
- scribble = scribble_alloc(new_disks,
2281
- new_sectors / STRIPE_SECTORS,
2282
- GFP_NOIO);
2283
-
2284
- if (scribble) {
2285
- flex_array_free(percpu->scribble);
2286
- percpu->scribble = scribble;
2287
- } else {
2288
- err = -ENOMEM;
2446
+ err = scribble_alloc(percpu, new_disks,
2447
+ new_sectors / RAID5_STRIPE_SECTORS(conf));
2448
+ if (err)
22892449 break;
2290
- }
22912450 }
2451
+
22922452 put_online_cpus();
22932453 mddev_resume(conf->mddev);
22942454 if (!err) {
....@@ -2376,9 +2536,16 @@
23762536 osh = get_free_stripe(conf, hash);
23772537 unlock_device_hash_lock(conf, hash);
23782538
2539
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2540
+ for (i = 0; i < osh->nr_pages; i++) {
2541
+ nsh->pages[i] = osh->pages[i];
2542
+ osh->pages[i] = NULL;
2543
+ }
2544
+#endif
23792545 for(i=0; i<conf->pool_size; i++) {
23802546 nsh->dev[i].page = osh->dev[i].page;
23812547 nsh->dev[i].orig_page = osh->dev[i].page;
2548
+ nsh->dev[i].offset = osh->dev[i].offset;
23822549 }
23832550 nsh->hash_lock_index = hash;
23842551 free_stripe(conf->slab_cache, osh);
....@@ -2427,14 +2594,33 @@
24272594 nsh = list_entry(newstripes.next, struct stripe_head, lru);
24282595 list_del_init(&nsh->lru);
24292596
2597
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2598
+ for (i = 0; i < nsh->nr_pages; i++) {
2599
+ if (nsh->pages[i])
2600
+ continue;
2601
+ nsh->pages[i] = alloc_page(GFP_NOIO);
2602
+ if (!nsh->pages[i])
2603
+ err = -ENOMEM;
2604
+ }
2605
+
2606
+ for (i = conf->raid_disks; i < newsize; i++) {
2607
+ if (nsh->dev[i].page)
2608
+ continue;
2609
+ nsh->dev[i].page = raid5_get_dev_page(nsh, i);
2610
+ nsh->dev[i].orig_page = nsh->dev[i].page;
2611
+ nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
2612
+ }
2613
+#else
24302614 for (i=conf->raid_disks; i < newsize; i++)
24312615 if (nsh->dev[i].page == NULL) {
24322616 struct page *p = alloc_page(GFP_NOIO);
24332617 nsh->dev[i].page = p;
24342618 nsh->dev[i].orig_page = p;
2619
+ nsh->dev[i].offset = 0;
24352620 if (!p)
24362621 err = -ENOMEM;
24372622 }
2623
+#endif
24382624 raid5_release_stripe(nsh);
24392625 }
24402626 /* critical section pass, GFP_NOIO no longer needed */
....@@ -2518,10 +2704,10 @@
25182704 */
25192705 pr_info_ratelimited(
25202706 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2521
- mdname(conf->mddev), STRIPE_SECTORS,
2707
+ mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
25222708 (unsigned long long)s,
25232709 bdevname(rdev->bdev, b));
2524
- atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2710
+ atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
25252711 clear_bit(R5_ReadError, &sh->dev[i].flags);
25262712 clear_bit(R5_ReWrite, &sh->dev[i].flags);
25272713 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
....@@ -2566,10 +2752,16 @@
25662752 (unsigned long long)s,
25672753 bdn);
25682754 } else if (atomic_read(&rdev->read_errors)
2569
- > conf->max_nr_stripes)
2570
- pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2571
- mdname(conf->mddev), bdn);
2572
- else
2755
+ > conf->max_nr_stripes) {
2756
+ if (!test_bit(Faulty, &rdev->flags)) {
2757
+ pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2758
+ mdname(conf->mddev),
2759
+ atomic_read(&rdev->read_errors),
2760
+ conf->max_nr_stripes);
2761
+ pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2762
+ mdname(conf->mddev), bdn);
2763
+ }
2764
+ } else
25732765 retry = 1;
25742766 if (set_bad && test_bit(In_sync, &rdev->flags)
25752767 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
....@@ -2588,7 +2780,7 @@
25882780 if (!(set_bad
25892781 && test_bit(In_sync, &rdev->flags)
25902782 && rdev_set_badblocks(
2591
- rdev, sh->sector, STRIPE_SECTORS, 0)))
2783
+ rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
25922784 md_error(conf->mddev, rdev);
25932785 }
25942786 }
....@@ -2604,7 +2796,7 @@
26042796 struct stripe_head *sh = bi->bi_private;
26052797 struct r5conf *conf = sh->raid_conf;
26062798 int disks = sh->disks, i;
2607
- struct md_rdev *uninitialized_var(rdev);
2799
+ struct md_rdev *rdev;
26082800 sector_t first_bad;
26092801 int bad_sectors;
26102802 int replacement = 0;
....@@ -2640,7 +2832,7 @@
26402832 if (bi->bi_status)
26412833 md_error(conf->mddev, rdev);
26422834 else if (is_badblock(rdev, sh->sector,
2643
- STRIPE_SECTORS,
2835
+ RAID5_STRIPE_SECTORS(conf),
26442836 &first_bad, &bad_sectors))
26452837 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
26462838 } else {
....@@ -2652,7 +2844,7 @@
26522844 set_bit(MD_RECOVERY_NEEDED,
26532845 &rdev->mddev->recovery);
26542846 } else if (is_badblock(rdev, sh->sector,
2655
- STRIPE_SECTORS,
2847
+ RAID5_STRIPE_SECTORS(conf),
26562848 &first_bad, &bad_sectors)) {
26572849 set_bit(R5_MadeGood, &sh->dev[i].flags);
26582850 if (test_bit(R5_ReadError, &sh->dev[i].flags))
....@@ -2672,10 +2864,10 @@
26722864 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
26732865 clear_bit(R5_LOCKED, &sh->dev[i].flags);
26742866 set_bit(STRIPE_HANDLE, &sh->state);
2675
- raid5_release_stripe(sh);
26762867
26772868 if (sh->batch_head && sh != sh->batch_head)
26782869 raid5_release_stripe(sh->batch_head);
2870
+ raid5_release_stripe(sh);
26792871 }
26802872
26812873 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
....@@ -2685,22 +2877,31 @@
26852877 unsigned long flags;
26862878 pr_debug("raid456: error called\n");
26872879
2880
+ pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n",
2881
+ mdname(mddev), bdevname(rdev->bdev, b));
2882
+
26882883 spin_lock_irqsave(&conf->device_lock, flags);
26892884 set_bit(Faulty, &rdev->flags);
26902885 clear_bit(In_sync, &rdev->flags);
26912886 mddev->degraded = raid5_calc_degraded(conf);
2887
+
2888
+ if (has_failed(conf)) {
2889
+ set_bit(MD_BROKEN, &conf->mddev->flags);
2890
+ conf->recovery_disabled = mddev->recovery_disabled;
2891
+
2892
+ pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
2893
+ mdname(mddev), mddev->degraded, conf->raid_disks);
2894
+ } else {
2895
+ pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
2896
+ mdname(mddev), conf->raid_disks - mddev->degraded);
2897
+ }
2898
+
26922899 spin_unlock_irqrestore(&conf->device_lock, flags);
26932900 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
26942901
26952902 set_bit(Blocked, &rdev->flags);
26962903 set_mask_bits(&mddev->sb_flags, 0,
26972904 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2698
- pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2699
- "md/raid:%s: Operation continuing on %d devices.\n",
2700
- mdname(mddev),
2701
- bdevname(rdev->bdev, b),
2702
- mdname(mddev),
2703
- conf->raid_disks - mddev->degraded);
27042905 r5c_update_on_rdev_error(mddev, rdev);
27052906 }
27062907
....@@ -3274,13 +3475,13 @@
32743475 /* check if page is covered */
32753476 sector_t sector = sh->dev[dd_idx].sector;
32763477 for (bi=sh->dev[dd_idx].towrite;
3277
- sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
3478
+ sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
32783479 bi && bi->bi_iter.bi_sector <= sector;
3279
- bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
3480
+ bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
32803481 if (bio_end_sector(bi) >= sector)
32813482 sector = bio_end_sector(bi);
32823483 }
3283
- if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
3484
+ if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
32843485 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
32853486 sh->overwrite_disks++;
32863487 }
....@@ -3305,7 +3506,7 @@
33053506 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
33063507 spin_unlock_irq(&sh->stripe_lock);
33073508 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3308
- STRIPE_SECTORS, 0);
3509
+ RAID5_STRIPE_SECTORS(conf), 0);
33093510 spin_lock_irq(&sh->stripe_lock);
33103511 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
33113512 if (!sh->batch_head) {
....@@ -3367,7 +3568,7 @@
33673568 if (!rdev_set_badblocks(
33683569 rdev,
33693570 sh->sector,
3370
- STRIPE_SECTORS, 0))
3571
+ RAID5_STRIPE_SECTORS(conf), 0))
33713572 md_error(conf->mddev, rdev);
33723573 rdev_dec_pending(rdev, conf->mddev);
33733574 }
....@@ -3387,8 +3588,8 @@
33873588 wake_up(&conf->wait_for_overlap);
33883589
33893590 while (bi && bi->bi_iter.bi_sector <
3390
- sh->dev[i].sector + STRIPE_SECTORS) {
3391
- struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
3591
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3592
+ struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
33923593
33933594 md_write_end(conf->mddev);
33943595 bio_io_error(bi);
....@@ -3396,7 +3597,7 @@
33963597 }
33973598 if (bitmap_end)
33983599 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3399
- STRIPE_SECTORS, 0, 0);
3600
+ RAID5_STRIPE_SECTORS(conf), 0, 0);
34003601 bitmap_end = 0;
34013602 /* and fail all 'written' */
34023603 bi = sh->dev[i].written;
....@@ -3408,8 +3609,8 @@
34083609
34093610 if (bi) bitmap_end = 1;
34103611 while (bi && bi->bi_iter.bi_sector <
3411
- sh->dev[i].sector + STRIPE_SECTORS) {
3412
- struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
3612
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3613
+ struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
34133614
34143615 md_write_end(conf->mddev);
34153616 bio_io_error(bi);
....@@ -3432,9 +3633,9 @@
34323633 if (bi)
34333634 s->to_read--;
34343635 while (bi && bi->bi_iter.bi_sector <
3435
- sh->dev[i].sector + STRIPE_SECTORS) {
3636
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
34363637 struct bio *nextbi =
3437
- r5_next_bio(bi, sh->dev[i].sector);
3638
+ r5_next_bio(conf, bi, sh->dev[i].sector);
34383639
34393640 bio_io_error(bi);
34403641 bi = nextbi;
....@@ -3442,7 +3643,7 @@
34423643 }
34433644 if (bitmap_end)
34443645 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3445
- STRIPE_SECTORS, 0, 0);
3646
+ RAID5_STRIPE_SECTORS(conf), 0, 0);
34463647 /* If we were in the middle of a write the parity block might
34473648 * still be locked - so just clear all R5_LOCKED flags
34483649 */
....@@ -3487,14 +3688,14 @@
34873688 && !test_bit(Faulty, &rdev->flags)
34883689 && !test_bit(In_sync, &rdev->flags)
34893690 && !rdev_set_badblocks(rdev, sh->sector,
3490
- STRIPE_SECTORS, 0))
3691
+ RAID5_STRIPE_SECTORS(conf), 0))
34913692 abort = 1;
34923693 rdev = rcu_dereference(conf->disks[i].replacement);
34933694 if (rdev
34943695 && !test_bit(Faulty, &rdev->flags)
34953696 && !test_bit(In_sync, &rdev->flags)
34963697 && !rdev_set_badblocks(rdev, sh->sector,
3497
- STRIPE_SECTORS, 0))
3698
+ RAID5_STRIPE_SECTORS(conf), 0))
34983699 abort = 1;
34993700 }
35003701 rcu_read_unlock();
....@@ -3502,7 +3703,7 @@
35023703 conf->recovery_disabled =
35033704 conf->mddev->recovery_disabled;
35043705 }
3505
- md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
3706
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
35063707 }
35073708
35083709 static int want_replace(struct stripe_head *sh, int disk_idx)
....@@ -3529,6 +3730,7 @@
35293730 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
35303731 &sh->dev[s->failed_num[1]] };
35313732 int i;
3733
+ bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
35323734
35333735
35343736 if (test_bit(R5_LOCKED, &dev->flags) ||
....@@ -3587,18 +3789,27 @@
35873789 * devices must be read.
35883790 */
35893791 return 1;
3792
+
3793
+ if (s->failed >= 2 &&
3794
+ (fdev[i]->towrite ||
3795
+ s->failed_num[i] == sh->pd_idx ||
3796
+ s->failed_num[i] == sh->qd_idx) &&
3797
+ !test_bit(R5_UPTODATE, &fdev[i]->flags))
3798
+ /* In max degraded raid6, If the failed disk is P, Q,
3799
+ * or we want to read the failed disk, we need to do
3800
+ * reconstruct-write.
3801
+ */
3802
+ force_rcw = true;
35903803 }
35913804
3592
- /* If we are forced to do a reconstruct-write, either because
3593
- * the current RAID6 implementation only supports that, or
3594
- * because parity cannot be trusted and we are currently
3595
- * recovering it, there is extra need to be careful.
3805
+ /* If we are forced to do a reconstruct-write, because parity
3806
+ * cannot be trusted and we are currently recovering it, there
3807
+ * is extra need to be careful.
35963808 * If one of the devices that we would need to read, because
35973809 * it is not being overwritten (and maybe not written at all)
35983810 * is missing/faulty, then we need to read everything we can.
35993811 */
3600
- if (sh->raid_conf->level != 6 &&
3601
- sh->raid_conf->rmw_level != PARITY_DISABLE_RMW &&
3812
+ if (!force_rcw &&
36023813 sh->sector < sh->raid_conf->mddev->recovery_cp)
36033814 /* reconstruct-write isn't being forced */
36043815 return 0;
....@@ -3702,7 +3913,7 @@
37023913 return 0;
37033914 }
37043915
3705
-/**
3916
+/*
37063917 * handle_stripe_fill - read or compute data to satisfy pending requests.
37073918 */
37083919 static void handle_stripe_fill(struct stripe_head *sh,
....@@ -3725,7 +3936,7 @@
37253936 * back cache (prexor with orig_page, and then xor with
37263937 * page) in the read path
37273938 */
3728
- if (s->injournal && s->failed) {
3939
+ if (s->to_read && s->injournal && s->failed) {
37293940 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
37303941 r5c_make_stripe_write_out(sh);
37313942 goto out;
....@@ -3777,14 +3988,14 @@
37773988 wbi = dev->written;
37783989 dev->written = NULL;
37793990 while (wbi && wbi->bi_iter.bi_sector <
3780
- dev->sector + STRIPE_SECTORS) {
3781
- wbi2 = r5_next_bio(wbi, dev->sector);
3991
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
3992
+ wbi2 = r5_next_bio(conf, wbi, dev->sector);
37823993 md_write_end(conf->mddev);
37833994 bio_endio(wbi);
37843995 wbi = wbi2;
37853996 }
37863997 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3787
- STRIPE_SECTORS,
3998
+ RAID5_STRIPE_SECTORS(conf),
37883999 !test_bit(STRIPE_DEGRADED, &sh->state),
37894000 0);
37904001 if (head_sh->batch_head) {
....@@ -3968,10 +4179,8 @@
39684179 set_bit(R5_LOCKED, &dev->flags);
39694180 set_bit(R5_Wantread, &dev->flags);
39704181 s->locked++;
3971
- } else {
4182
+ } else
39724183 set_bit(STRIPE_DELAYED, &sh->state);
3973
- set_bit(STRIPE_HANDLE, &sh->state);
3974
- }
39754184 }
39764185 }
39774186 }
....@@ -3996,10 +4205,8 @@
39964205 set_bit(R5_Wantread, &dev->flags);
39974206 s->locked++;
39984207 qread++;
3999
- } else {
4208
+ } else
40004209 set_bit(STRIPE_DELAYED, &sh->state);
4001
- set_bit(STRIPE_HANDLE, &sh->state);
4002
- }
40034210 }
40044211 }
40054212 if (rcw && conf->mddev->queue)
....@@ -4049,7 +4256,7 @@
40494256 break;
40504257 }
40514258 dev = &sh->dev[s->failed_num[0]];
4052
- /* fall through */
4259
+ fallthrough;
40534260 case check_state_compute_result:
40544261 sh->check_state = check_state_idle;
40554262 if (!dev)
....@@ -4091,7 +4298,7 @@
40914298 */
40924299 set_bit(STRIPE_INSYNC, &sh->state);
40934300 else {
4094
- atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4301
+ atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
40954302 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
40964303 /* don't try to repair!! */
40974304 set_bit(STRIPE_INSYNC, &sh->state);
....@@ -4099,7 +4306,7 @@
40994306 "%llu-%llu\n", mdname(conf->mddev),
41004307 (unsigned long long) sh->sector,
41014308 (unsigned long long) sh->sector +
4102
- STRIPE_SECTORS);
4309
+ RAID5_STRIPE_SECTORS(conf));
41034310 } else {
41044311 sh->check_state = check_state_compute_run;
41054312 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
....@@ -4180,7 +4387,7 @@
41804387
41814388 /* we have 2-disk failure */
41824389 BUG_ON(s->failed != 2);
4183
- /* fall through */
4390
+ fallthrough;
41844391 case check_state_compute_result:
41854392 sh->check_state = check_state_idle;
41864393
....@@ -4256,7 +4463,7 @@
42564463 */
42574464 }
42584465 } else {
4259
- atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4466
+ atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
42604467 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
42614468 /* don't try to repair!! */
42624469 set_bit(STRIPE_INSYNC, &sh->state);
....@@ -4264,7 +4471,7 @@
42644471 "%llu-%llu\n", mdname(conf->mddev),
42654472 (unsigned long long) sh->sector,
42664473 (unsigned long long) sh->sector +
4267
- STRIPE_SECTORS);
4474
+ RAID5_STRIPE_SECTORS(conf));
42684475 } else {
42694476 int *target = &sh->ops.target;
42704477
....@@ -4335,7 +4542,8 @@
43354542 /* place all the copies on one channel */
43364543 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
43374544 tx = async_memcpy(sh2->dev[dd_idx].page,
4338
- sh->dev[i].page, 0, 0, STRIPE_SIZE,
4545
+ sh->dev[i].page, sh2->dev[dd_idx].offset,
4546
+ sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
43394547 &submit);
43404548
43414549 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
....@@ -4434,8 +4642,8 @@
44344642 */
44354643 rdev = rcu_dereference(conf->disks[i].replacement);
44364644 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4437
- rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
4438
- !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4645
+ rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4646
+ !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
44394647 &first_bad, &bad_sectors))
44404648 set_bit(R5_ReadRepl, &dev->flags);
44414649 else {
....@@ -4449,7 +4657,7 @@
44494657 if (rdev && test_bit(Faulty, &rdev->flags))
44504658 rdev = NULL;
44514659 if (rdev) {
4452
- is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4660
+ is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
44534661 &first_bad, &bad_sectors);
44544662 if (s->blocked_rdev == NULL
44554663 && (test_bit(Blocked, &rdev->flags)
....@@ -4476,7 +4684,7 @@
44764684 }
44774685 } else if (test_bit(In_sync, &rdev->flags))
44784686 set_bit(R5_Insync, &dev->flags);
4479
- else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
4687
+ else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
44804688 /* in sync if before recovery_offset */
44814689 set_bit(R5_Insync, &dev->flags);
44824690 else if (test_bit(R5_UPTODATE, &dev->flags) &&
....@@ -4565,12 +4773,12 @@
45654773 rcu_read_unlock();
45664774 }
45674775
4776
+/*
4777
+ * Return '1' if this is a member of batch, or '0' if it is a lone stripe or
4778
+ * a head which can now be handled.
4779
+ */
45684780 static int clear_batch_ready(struct stripe_head *sh)
45694781 {
4570
- /* Return '1' if this is a member of batch, or
4571
- * '0' if it is a lone stripe or a head which can now be
4572
- * handled.
4573
- */
45744782 struct stripe_head *tmp;
45754783 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
45764784 return (sh->batch_head && sh->batch_head != sh);
....@@ -4620,7 +4828,6 @@
46204828 (1 << STRIPE_FULL_WRITE) |
46214829 (1 << STRIPE_BIOFILL_RUN) |
46224830 (1 << STRIPE_COMPUTE_RUN) |
4623
- (1 << STRIPE_OPS_REQ_PENDING) |
46244831 (1 << STRIPE_DISCARD) |
46254832 (1 << STRIPE_BATCH_READY) |
46264833 (1 << STRIPE_BATCH_ERR) |
....@@ -4675,15 +4882,20 @@
46754882 struct r5dev *pdev, *qdev;
46764883
46774884 clear_bit(STRIPE_HANDLE, &sh->state);
4885
+
4886
+ /*
4887
+ * handle_stripe should not continue handle the batched stripe, only
4888
+ * the head of batch list or lone stripe can continue. Otherwise we
4889
+ * could see break_stripe_batch_list warns about the STRIPE_ACTIVE
4890
+ * is set for the batched stripe.
4891
+ */
4892
+ if (clear_batch_ready(sh))
4893
+ return;
4894
+
46784895 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
46794896 /* already being handled, ensure it gets handled
46804897 * again when current action finishes */
46814898 set_bit(STRIPE_HANDLE, &sh->state);
4682
- return;
4683
- }
4684
-
4685
- if (clear_batch_ready(sh) ) {
4686
- clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
46874899 return;
46884900 }
46894901
....@@ -4920,7 +5132,7 @@
49205132 if ((s.syncing || s.replacing) && s.locked == 0 &&
49215133 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
49225134 test_bit(STRIPE_INSYNC, &sh->state)) {
4923
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
5135
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
49245136 clear_bit(STRIPE_SYNCING, &sh->state);
49255137 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
49265138 wake_up(&conf->wait_for_overlap);
....@@ -4939,14 +5151,11 @@
49395151 if (!test_bit(R5_ReWrite, &dev->flags)) {
49405152 set_bit(R5_Wantwrite, &dev->flags);
49415153 set_bit(R5_ReWrite, &dev->flags);
4942
- set_bit(R5_LOCKED, &dev->flags);
4943
- s.locked++;
4944
- } else {
5154
+ } else
49455155 /* let's read it back */
49465156 set_bit(R5_Wantread, &dev->flags);
4947
- set_bit(R5_LOCKED, &dev->flags);
4948
- s.locked++;
4949
- }
5157
+ set_bit(R5_LOCKED, &dev->flags);
5158
+ s.locked++;
49505159 }
49515160 }
49525161
....@@ -4988,7 +5197,7 @@
49885197 clear_bit(STRIPE_EXPAND_READY, &sh->state);
49895198 atomic_dec(&conf->reshape_stripes);
49905199 wake_up(&conf->wait_for_overlap);
4991
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
5200
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
49925201 }
49935202
49945203 if (s.expanding && s.locked == 0 &&
....@@ -5018,14 +5227,14 @@
50185227 /* We own a safe reference to the rdev */
50195228 rdev = conf->disks[i].rdev;
50205229 if (!rdev_set_badblocks(rdev, sh->sector,
5021
- STRIPE_SECTORS, 0))
5230
+ RAID5_STRIPE_SECTORS(conf), 0))
50225231 md_error(conf->mddev, rdev);
50235232 rdev_dec_pending(rdev, conf->mddev);
50245233 }
50255234 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
50265235 rdev = conf->disks[i].rdev;
50275236 rdev_clear_badblocks(rdev, sh->sector,
5028
- STRIPE_SECTORS, 0);
5237
+ RAID5_STRIPE_SECTORS(conf), 0);
50295238 rdev_dec_pending(rdev, conf->mddev);
50305239 }
50315240 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
....@@ -5034,7 +5243,7 @@
50345243 /* rdev has been moved down */
50355244 rdev = conf->disks[i].rdev;
50365245 rdev_clear_badblocks(rdev, sh->sector,
5037
- STRIPE_SECTORS, 0);
5246
+ RAID5_STRIPE_SECTORS(conf), 0);
50385247 rdev_dec_pending(rdev, conf->mddev);
50395248 }
50405249 }
....@@ -5090,28 +5299,6 @@
50905299 hash = sh->hash_lock_index;
50915300 __release_stripe(conf, sh, &temp_inactive_list[hash]);
50925301 }
5093
-}
5094
-
5095
-static int raid5_congested(struct mddev *mddev, int bits)
5096
-{
5097
- struct r5conf *conf = mddev->private;
5098
-
5099
- /* No difference between reads and writes. Just check
5100
- * how busy the stripe_cache is
5101
- */
5102
-
5103
- if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
5104
- return 1;
5105
-
5106
- /* Also checks whether there is pressure on r5cache log space */
5107
- if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
5108
- return 1;
5109
- if (conf->quiesce)
5110
- return 1;
5111
- if (atomic_read(&conf->empty_inactive_list_nr))
5112
- return 1;
5113
-
5114
- return 0;
51155302 }
51165303
51175304 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
....@@ -5259,7 +5446,6 @@
52595446 rcu_read_unlock();
52605447 raid_bio->bi_next = (void*)rdev;
52615448 bio_set_dev(align_bi, rdev->bdev);
5262
- bio_clear_flag(align_bi, BIO_SEG_VALID);
52635449
52645450 if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
52655451 bio_sectors(align_bi),
....@@ -5283,7 +5469,7 @@
52835469 trace_block_bio_remap(align_bi->bi_disk->queue,
52845470 align_bi, disk_devt(mddev->gendisk),
52855471 raid_bio->bi_iter.bi_sector);
5286
- generic_make_request(align_bi);
5472
+ submit_bio_noacct(align_bi);
52875473 return 1;
52885474 } else {
52895475 rcu_read_unlock();
....@@ -5303,7 +5489,7 @@
53035489 struct r5conf *conf = mddev->private;
53045490 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
53055491 bio_chain(split, raid_bio);
5306
- generic_make_request(raid_bio);
5492
+ submit_bio_noacct(raid_bio);
53075493 raid_bio = split;
53085494 }
53095495
....@@ -5499,8 +5685,8 @@
54995685 /* Skip discard while reshape is happening */
55005686 return;
55015687
5502
- logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5503
- last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
5688
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5689
+ last_sector = bio_end_sector(bi);
55045690
55055691 bi->bi_next = NULL;
55065692
....@@ -5514,7 +5700,7 @@
55145700 last_sector *= conf->chunk_sectors;
55155701
55165702 for (; logical_sector < last_sector;
5517
- logical_sector += STRIPE_SECTORS) {
5703
+ logical_sector += RAID5_STRIPE_SECTORS(conf)) {
55185704 DEFINE_WAIT(w);
55195705 int d;
55205706 again:
....@@ -5559,7 +5745,7 @@
55595745 d++)
55605746 md_bitmap_startwrite(mddev->bitmap,
55615747 sh->sector,
5562
- STRIPE_SECTORS,
5748
+ RAID5_STRIPE_SECTORS(conf),
55635749 0);
55645750 sh->bm_seq = conf->seq_flush + 1;
55655751 set_bit(STRIPE_BIT_DELAY, &sh->state);
....@@ -5624,12 +5810,12 @@
56245810 return true;
56255811 }
56265812
5627
- logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5813
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
56285814 last_sector = bio_end_sector(bi);
56295815 bi->bi_next = NULL;
56305816
56315817 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5632
- for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
5818
+ for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
56335819 int previous;
56345820 int seq;
56355821
....@@ -5727,8 +5913,7 @@
57275913 do_flush = false;
57285914 }
57295915
5730
- if (!sh->batch_head || sh == sh->batch_head)
5731
- set_bit(STRIPE_HANDLE, &sh->state);
5916
+ set_bit(STRIPE_HANDLE, &sh->state);
57325917 clear_bit(STRIPE_DELAYED, &sh->state);
57335918 if ((!sh->batch_head || sh == sh->batch_head) &&
57345919 (bi->bi_opf & REQ_SYNC) &&
....@@ -5793,7 +5978,7 @@
57935978 sector_div(sector_nr, new_data_disks);
57945979 if (sector_nr) {
57955980 mddev->curr_resync_completed = sector_nr;
5796
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5981
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
57975982 *skipped = 1;
57985983 retn = sector_nr;
57995984 goto finish;
....@@ -5907,11 +6092,11 @@
59076092 conf->reshape_safe = mddev->reshape_position;
59086093 spin_unlock_irq(&conf->device_lock);
59096094 wake_up(&conf->wait_for_overlap);
5910
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6095
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
59116096 }
59126097
59136098 INIT_LIST_HEAD(&stripes);
5914
- for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
6099
+ for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
59156100 int j;
59166101 int skipped_disk = 0;
59176102 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
....@@ -5932,7 +6117,7 @@
59326117 skipped_disk = 1;
59336118 continue;
59346119 }
5935
- memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
6120
+ memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
59366121 set_bit(R5_Expanded, &sh->dev[j].flags);
59376122 set_bit(R5_UPTODATE, &sh->dev[j].flags);
59386123 }
....@@ -5967,7 +6152,7 @@
59676152 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
59686153 set_bit(STRIPE_HANDLE, &sh->state);
59696154 raid5_release_stripe(sh);
5970
- first_sector += STRIPE_SECTORS;
6155
+ first_sector += RAID5_STRIPE_SECTORS(conf);
59716156 }
59726157 /* Now that the sources are clearly marked, we can release
59736158 * the destination stripes
....@@ -6014,7 +6199,7 @@
60146199 conf->reshape_safe = mddev->reshape_position;
60156200 spin_unlock_irq(&conf->device_lock);
60166201 wake_up(&conf->wait_for_overlap);
6017
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6202
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
60186203 }
60196204 ret:
60206205 return retn;
....@@ -6073,11 +6258,12 @@
60736258 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
60746259 !conf->fullsync &&
60756260 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6076
- sync_blocks >= STRIPE_SECTORS) {
6261
+ sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
60776262 /* we can skip this block, and probably more */
6078
- sync_blocks /= STRIPE_SECTORS;
6263
+ do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
60796264 *skipped = 1;
6080
- return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
6265
+ /* keep things rounded to whole stripes */
6266
+ return sync_blocks * RAID5_STRIPE_SECTORS(conf);
60816267 }
60826268
60836269 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
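The plain division of sync_blocks becomes do_div() above, presumably because sync_blocks is a 64-bit sector count and the divisor is no longer a compile-time power of two, so 32-bit builds need the explicit 64/32 division helper. A minimal usage sketch with made-up values:

/* Sketch only: do_div() divides the 64-bit value in place and returns the remainder. */
u64 sync_blocks = 4096;        /* hypothetical run length, in 512-byte sectors */
u32 stripe_sectors = 16;       /* e.g. an 8KiB stripe_size >> 9 */
u32 rem = do_div(sync_blocks, stripe_sectors);   /* sync_blocks == 256, rem == 0 */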
....@@ -6110,7 +6296,7 @@
61106296
61116297 raid5_release_stripe(sh);
61126298
6113
- return STRIPE_SECTORS;
6299
+ return RAID5_STRIPE_SECTORS(conf);
61146300 }
61156301
61166302 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
....@@ -6133,14 +6319,14 @@
61336319 int handled = 0;
61346320
61356321 logical_sector = raid_bio->bi_iter.bi_sector &
6136
- ~((sector_t)STRIPE_SECTORS-1);
6322
+ ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
61376323 sector = raid5_compute_sector(conf, logical_sector,
61386324 0, &dd_idx, NULL);
61396325 last_sector = bio_end_sector(raid_bio);
61406326
61416327 for (; logical_sector < last_sector;
6142
- logical_sector += STRIPE_SECTORS,
6143
- sector += STRIPE_SECTORS,
6328
+ logical_sector += RAID5_STRIPE_SECTORS(conf),
6329
+ sector += RAID5_STRIPE_SECTORS(conf),
61446330 scnt++) {
61456331
61466332 if (scnt < offset)
....@@ -6179,6 +6365,8 @@
61796365 static int handle_active_stripes(struct r5conf *conf, int group,
61806366 struct r5worker *worker,
61816367 struct list_head *temp_inactive_list)
6368
+ __releases(&conf->device_lock)
6369
+ __acquires(&conf->device_lock)
61826370 {
61836371 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
61846372 int i, batch_size = 0, hash;
....@@ -6331,7 +6519,18 @@
63316519 spin_unlock_irq(&conf->device_lock);
63326520 md_check_recovery(mddev);
63336521 spin_lock_irq(&conf->device_lock);
6522
+
6523
+ /*
6524
+ * Waiting on MD_SB_CHANGE_PENDING below may deadlock, since
6525
+ * md_check_recovery() is needed to clear
6526
+ * the flag when using mdmon.
6527
+ */
6528
+ continue;
63346529 }
6530
+
6531
+ wait_event_lock_irq(mddev->sb_wait,
6532
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6533
+ conf->device_lock);
63356534 }
63366535 pr_debug("%d stripes handled\n", handled);
63376536
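The added wait relies on wait_event_lock_irq() releasing conf->device_lock while sleeping and re-acquiring it before returning, so raid5d() does not hold the lock across the superblock update it is waiting for. A simplified sketch of that pattern (standard wait.h behaviour, shown for orientation only, not part of the patch):

/* Lock held on entry and on return, dropped around the actual sleep. */
spin_lock_irq(&conf->device_lock);
wait_event_lock_irq(mddev->sb_wait,
                    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
                    conf->device_lock);
/* conf->device_lock is held again here */
spin_unlock_irq(&conf->device_lock);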
....@@ -6471,6 +6670,100 @@
64716670 raid5_show_rmw_level,
64726671 raid5_store_rmw_level);
64736672
6673
+static ssize_t
6674
+raid5_show_stripe_size(struct mddev *mddev, char *page)
6675
+{
6676
+ struct r5conf *conf;
6677
+ int ret = 0;
6678
+
6679
+ spin_lock(&mddev->lock);
6680
+ conf = mddev->private;
6681
+ if (conf)
6682
+ ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
6683
+ spin_unlock(&mddev->lock);
6684
+ return ret;
6685
+}
6686
+
6687
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6688
+static ssize_t
6689
+raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
6690
+{
6691
+ struct r5conf *conf;
6692
+ unsigned long new;
6693
+ int err;
6694
+ int size;
6695
+
6696
+ if (len >= PAGE_SIZE)
6697
+ return -EINVAL;
6698
+ if (kstrtoul(page, 10, &new))
6699
+ return -EINVAL;
6700
+
6701
+ /*
6702
+ * The value must not be bigger than PAGE_SIZE. It must be a
6703
+ * multiple of DEFAULT_STRIPE_SIZE and a power
6704
+ * of two.
6705
+ */
6706
+ if (new % DEFAULT_STRIPE_SIZE != 0 ||
6707
+ new > PAGE_SIZE || new == 0 ||
6708
+ new != roundup_pow_of_two(new))
6709
+ return -EINVAL;
6710
+
6711
+ err = mddev_lock(mddev);
6712
+ if (err)
6713
+ return err;
6714
+
6715
+ conf = mddev->private;
6716
+ if (!conf) {
6717
+ err = -ENODEV;
6718
+ goto out_unlock;
6719
+ }
6720
+
6721
+ if (new == conf->stripe_size)
6722
+ goto out_unlock;
6723
+
6724
+ pr_debug("md/raid: change stripe_size from %lu to %lu\n",
6725
+ conf->stripe_size, new);
6726
+
6727
+ if (mddev->sync_thread ||
6728
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6729
+ mddev->reshape_position != MaxSector ||
6730
+ mddev->sysfs_active) {
6731
+ err = -EBUSY;
6732
+ goto out_unlock;
6733
+ }
6734
+
6735
+ mddev_suspend(mddev);
6736
+ mutex_lock(&conf->cache_size_mutex);
6737
+ size = conf->max_nr_stripes;
6738
+
6739
+ shrink_stripes(conf);
6740
+
6741
+ conf->stripe_size = new;
6742
+ conf->stripe_shift = ilog2(new) - 9;
6743
+ conf->stripe_sectors = new >> 9;
6744
+ if (grow_stripes(conf, size)) {
6745
+ pr_warn("md/raid:%s: couldn't allocate buffers\n",
6746
+ mdname(mddev));
6747
+ err = -ENOMEM;
6748
+ }
6749
+ mutex_unlock(&conf->cache_size_mutex);
6750
+ mddev_resume(mddev);
6751
+
6752
+out_unlock:
6753
+ mddev_unlock(mddev);
6754
+ return err ?: len;
6755
+}
6756
+
6757
+static struct md_sysfs_entry
6758
+raid5_stripe_size = __ATTR(stripe_size, 0644,
6759
+ raid5_show_stripe_size,
6760
+ raid5_store_stripe_size);
6761
+#else
6762
+static struct md_sysfs_entry
6763
+raid5_stripe_size = __ATTR(stripe_size, 0444,
6764
+ raid5_show_stripe_size,
6765
+ NULL);
6766
+#endif
64746767
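As a worked example of the store path above (illustrative values; the writable variant only exists when PAGE_SIZE differs from DEFAULT_STRIPE_SIZE, e.g. on a 64KiB-page machine): writing 8192 passes the checks (a power of two, a multiple of DEFAULT_STRIPE_SIZE, and no bigger than PAGE_SIZE) and leaves the geometry fields as:

/* Illustrative only: derived fields for a hypothetical stripe_size of 8192. */
conf->stripe_size    = 8192;
conf->stripe_shift   = ilog2(8192) - 9;   /* 13 - 9 = 4 */
conf->stripe_sectors = 8192 >> 9;         /* 16 sectors */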
64756768 static ssize_t
64766769 raid5_show_preread_threshold(struct mddev *mddev, char *page)
....@@ -6550,14 +6843,14 @@
65506843 if (!conf)
65516844 err = -ENODEV;
65526845 else if (new != conf->skip_copy) {
6846
+ struct request_queue *q = mddev->queue;
6847
+
65536848 mddev_suspend(mddev);
65546849 conf->skip_copy = new;
65556850 if (new)
6556
- mddev->queue->backing_dev_info->capabilities |=
6557
- BDI_CAP_STABLE_WRITES;
6851
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
65586852 else
6559
- mddev->queue->backing_dev_info->capabilities &=
6560
- ~BDI_CAP_STABLE_WRITES;
6853
+ blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
65616854 mddev_resume(mddev);
65626855 }
65636856 mddev_unlock(mddev);
....@@ -6597,7 +6890,6 @@
65976890
65986891 static int alloc_thread_groups(struct r5conf *conf, int cnt,
65996892 int *group_cnt,
6600
- int *worker_cnt_per_group,
66016893 struct r5worker_group **worker_groups);
66026894 static ssize_t
66036895 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
....@@ -6606,7 +6898,7 @@
66066898 unsigned int new;
66076899 int err;
66086900 struct r5worker_group *new_groups, *old_groups;
6609
- int group_cnt, worker_cnt_per_group;
6901
+ int group_cnt;
66106902
66116903 if (len >= PAGE_SIZE)
66126904 return -EINVAL;
....@@ -6629,13 +6921,11 @@
66296921 if (old_groups)
66306922 flush_workqueue(raid5_wq);
66316923
6632
- err = alloc_thread_groups(conf, new,
6633
- &group_cnt, &worker_cnt_per_group,
6634
- &new_groups);
6924
+ err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
66356925 if (!err) {
66366926 spin_lock_irq(&conf->device_lock);
66376927 conf->group_cnt = group_cnt;
6638
- conf->worker_cnt_per_group = worker_cnt_per_group;
6928
+ conf->worker_cnt_per_group = new;
66396929 conf->worker_groups = new_groups;
66406930 spin_unlock_irq(&conf->device_lock);
66416931
....@@ -6662,7 +6952,9 @@
66626952 &raid5_group_thread_cnt.attr,
66636953 &raid5_skip_copy.attr,
66646954 &raid5_rmw_level.attr,
6955
+ &raid5_stripe_size.attr,
66656956 &r5c_journal_mode.attr,
6957
+ &ppl_write_hint.attr,
66666958 NULL,
66676959 };
66686960 static struct attribute_group raid5_attrs_group = {
....@@ -6670,16 +6962,13 @@
66706962 .attrs = raid5_attrs,
66716963 };
66726964
6673
-static int alloc_thread_groups(struct r5conf *conf, int cnt,
6674
- int *group_cnt,
6675
- int *worker_cnt_per_group,
6965
+static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
66766966 struct r5worker_group **worker_groups)
66776967 {
66786968 int i, j, k;
66796969 ssize_t size;
66806970 struct r5worker *workers;
66816971
6682
- *worker_cnt_per_group = cnt;
66836972 if (cnt == 0) {
66846973 *group_cnt = 0;
66856974 *worker_groups = NULL;
....@@ -6745,25 +7034,25 @@
67457034 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
67467035 {
67477036 safe_put_page(percpu->spare_page);
6748
- if (percpu->scribble)
6749
- flex_array_free(percpu->scribble);
67507037 percpu->spare_page = NULL;
7038
+ kvfree(percpu->scribble);
67517039 percpu->scribble = NULL;
67527040 }
67537041
67547042 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
67557043 {
6756
- if (conf->level == 6 && !percpu->spare_page)
7044
+ if (conf->level == 6 && !percpu->spare_page) {
67577045 percpu->spare_page = alloc_page(GFP_KERNEL);
6758
- if (!percpu->scribble)
6759
- percpu->scribble = scribble_alloc(max(conf->raid_disks,
6760
- conf->previous_raid_disks),
6761
- max(conf->chunk_sectors,
6762
- conf->prev_chunk_sectors)
6763
- / STRIPE_SECTORS,
6764
- GFP_KERNEL);
7046
+ if (!percpu->spare_page)
7047
+ return -ENOMEM;
7048
+ }
67657049
6766
- if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
7050
+ if (scribble_alloc(percpu,
7051
+ max(conf->raid_disks,
7052
+ conf->previous_raid_disks),
7053
+ max(conf->chunk_sectors,
7054
+ conf->prev_chunk_sectors)
7055
+ / RAID5_STRIPE_SECTORS(conf))) {
67677056 free_scratch_buffer(conf, percpu);
67687057 return -ENOMEM;
67697058 }
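The last argument to scribble_alloc() is now the number of stripe-size units per chunk, computed with RAID5_STRIPE_SECTORS(conf) instead of the old constant. A worked example with hypothetical geometry:

/* Illustrative sizing inputs only, not part of the patch. */
int disks = max(conf->raid_disks, conf->previous_raid_disks);    /* e.g. 6 */
int units = max(conf->chunk_sectors, conf->prev_chunk_sectors)
            / RAID5_STRIPE_SECTORS(conf);                        /* e.g. 1024 / 8 = 128 */
/* scribble_alloc(percpu, disks, units) sizes the per-CPU buffer from these. */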
....@@ -6818,7 +7107,6 @@
68187107 __func__, cpu);
68197108 return -ENOMEM;
68207109 }
6821
- spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
68227110 return 0;
68237111 }
68247112
....@@ -6829,6 +7117,7 @@
68297117 conf->percpu = alloc_percpu(struct raid5_percpu);
68307118 if (!conf->percpu)
68317119 return -ENOMEM;
7120
+
68327121 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
68337122 if (!err) {
68347123 conf->scribble_disks = max(conf->raid_disks,
....@@ -6879,7 +7168,7 @@
68797168 struct disk_info *disk;
68807169 char pers_name[6];
68817170 int i;
6882
- int group_cnt, worker_cnt_per_group;
7171
+ int group_cnt;
68837172 struct r5worker_group *new_group;
68847173 int ret;
68857174
....@@ -6915,6 +7204,12 @@
69157204 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
69167205 if (conf == NULL)
69177206 goto abort;
7207
+
7208
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7209
+ conf->stripe_size = DEFAULT_STRIPE_SIZE;
7210
+ conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
7211
+ conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
7212
+#endif
69187213 INIT_LIST_HEAD(&conf->free_list);
69197214 INIT_LIST_HEAD(&conf->pending_list);
69207215 conf->pending_data = kcalloc(PENDING_IO_MAX,
....@@ -6925,15 +7220,14 @@
69257220 for (i = 0; i < PENDING_IO_MAX; i++)
69267221 list_add(&conf->pending_data[i].sibling, &conf->free_list);
69277222 /* Don't enable multi-threading by default */
6928
- if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
6929
- &new_group)) {
7223
+ if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) {
69307224 conf->group_cnt = group_cnt;
6931
- conf->worker_cnt_per_group = worker_cnt_per_group;
7225
+ conf->worker_cnt_per_group = 0;
69327226 conf->worker_groups = new_group;
69337227 } else
69347228 goto abort;
69357229 spin_lock_init(&conf->device_lock);
6936
- seqcount_init(&conf->gen_lock);
7230
+ seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
69377231 mutex_init(&conf->cache_size_mutex);
69387232 init_waitqueue_head(&conf->wait_for_quiescent);
69397233 init_waitqueue_head(&conf->wait_for_stripe);
....@@ -7067,8 +7361,8 @@
70677361 conf->min_nr_stripes = NR_STRIPES;
70687362 if (mddev->reshape_position != MaxSector) {
70697363 int stripes = max_t(int,
7070
- ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
7071
- ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
7364
+ ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
7365
+ ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
70727366 conf->min_nr_stripes = max(NR_STRIPES, stripes);
70737367 if (conf->min_nr_stripes != NR_STRIPES)
70747368 pr_info("md/raid:%s: force stripe size %d for reshape\n",
....@@ -7141,6 +7435,12 @@
71417435 return 1;
71427436 }
71437437 return 0;
7438
+}
7439
+
7440
+static void raid5_set_io_opt(struct r5conf *conf)
7441
+{
7442
+ blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
7443
+ (conf->raid_disks - conf->max_degraded));
71447444 }
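raid5_set_io_opt() advertises one full data stripe as the optimal I/O size. For a hypothetical 4-drive RAID5 (max_degraded = 1) with 512KiB chunks (chunk_sectors = 1024):

/* Illustrative arithmetic only, not part of the patch. */
unsigned int io_opt = (1024 << 9) * (4 - 1);   /* 524288 * 3 = 1572864 bytes, i.e. 1.5 MiB */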
71457445
71467446 static int raid5_run(struct mddev *mddev)
....@@ -7427,13 +7727,10 @@
74277727 int data_disks = conf->previous_raid_disks - conf->max_degraded;
74287728 int stripe = data_disks *
74297729 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
7430
- if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
7431
- mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
74327730
74337731 chunk_size = mddev->chunk_sectors << 9;
74347732 blk_queue_io_min(mddev->queue, chunk_size);
7435
- blk_queue_io_opt(mddev->queue, chunk_size *
7436
- (conf->raid_disks - conf->max_degraded));
7733
+ raid5_set_io_opt(conf);
74377734 mddev->queue->limits.raid_partial_stripes_expensive = 1;
74387735 /*
74397736 * We can only discard a whole stripe. It doesn't make sense to
....@@ -7718,6 +8015,7 @@
77188015 */
77198016 if (rdev->saved_raid_disk >= 0 &&
77208017 rdev->saved_raid_disk >= first &&
8018
+ rdev->saved_raid_disk <= last &&
77218019 conf->disks[rdev->saved_raid_disk].rdev == NULL)
77228020 first = rdev->saved_raid_disk;
77238021
....@@ -7799,14 +8097,14 @@
77998097 * stripe_heads first.
78008098 */
78018099 struct r5conf *conf = mddev->private;
7802
- if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
8100
+ if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
78038101 > conf->min_nr_stripes ||
7804
- ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
8102
+ ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
78058103 > conf->min_nr_stripes) {
78068104 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
78078105 mdname(mddev),
78088106 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7809
- / STRIPE_SIZE)*4);
8107
+ / RAID5_STRIPE_SIZE(conf))*4);
78108108 return 0;
78118109 }
78128110 return 1;
....@@ -7942,8 +8240,8 @@
79428240 else
79438241 rdev->recovery_offset = 0;
79448242
7945
- if (sysfs_link_rdev(mddev, rdev))
7946
- /* Failure here is OK */;
8243
+ /* Failure here is OK */
8244
+ sysfs_link_rdev(mddev, rdev);
79478245 }
79488246 } else if (rdev->raid_disk >= conf->previous_raid_disks
79498247 && !test_bit(Faulty, &rdev->flags)) {
....@@ -8017,16 +8315,8 @@
80178315 spin_unlock_irq(&conf->device_lock);
80188316 wake_up(&conf->wait_for_overlap);
80198317
8020
- /* read-ahead size must cover two whole stripes, which is
8021
- * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
8022
- */
8023
- if (conf->mddev->queue) {
8024
- int data_disks = conf->raid_disks - conf->max_degraded;
8025
- int stripe = data_disks * ((conf->chunk_sectors << 9)
8026
- / PAGE_SIZE);
8027
- if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
8028
- conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
8029
- }
8318
+ if (conf->mddev->queue)
8319
+ raid5_set_io_opt(conf);
80308320 }
80318321 }
80328322
....@@ -8138,7 +8428,7 @@
81388428 while (chunksect && (mddev->array_sectors & (chunksect-1)))
81398429 chunksect >>= 1;
81408430
8141
- if ((chunksect<<9) < STRIPE_SIZE)
8431
+ if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
81428432 /* array size does not allow a suitable chunk size */
81438433 return ERR_PTR(-EINVAL);
81448434
....@@ -8425,7 +8715,6 @@
84258715 .finish_reshape = raid5_finish_reshape,
84268716 .quiesce = raid5_quiesce,
84278717 .takeover = raid6_takeover,
8428
- .congested = raid5_congested,
84298718 .change_consistency_policy = raid5_change_consistency_policy,
84308719 };
84318720 static struct md_personality raid5_personality =
....@@ -8450,7 +8739,6 @@
84508739 .finish_reshape = raid5_finish_reshape,
84518740 .quiesce = raid5_quiesce,
84528741 .takeover = raid5_takeover,
8453
- .congested = raid5_congested,
84548742 .change_consistency_policy = raid5_change_consistency_policy,
84558743 };
84568744
....@@ -8476,7 +8764,6 @@
84768764 .finish_reshape = raid5_finish_reshape,
84778765 .quiesce = raid5_quiesce,
84788766 .takeover = raid4_takeover,
8479
- .congested = raid5_congested,
84808767 .change_consistency_policy = raid5_change_consistency_policy,
84818768 };
84828769