forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/drivers/md/raid5.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * raid5.c : Multiple Devices driver for Linux
34 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
....@@ -7,15 +8,6 @@
78 * RAID-4/5/6 management functions.
89 * Thanks to Penguin Computing for making the RAID-6 development possible
910 * by donating a test server!
10
- *
11
- * This program is free software; you can redistribute it and/or modify
12
- * it under the terms of the GNU General Public License as published by
13
- * the Free Software Foundation; either version 2, or (at your option)
14
- * any later version.
15
- *
16
- * You should have received a copy of the GNU General Public License
17
- * (for example /usr/src/linux/COPYING); if not, write to the Free
18
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1911 */
2012
2113 /*
....@@ -44,6 +36,7 @@
4436 */
4537
4638 #include <linux/blkdev.h>
39
+#include <linux/delay.h>
4740 #include <linux/kthread.h>
4841 #include <linux/raid/pq.h>
4942 #include <linux/async_tx.h>
....@@ -54,7 +47,6 @@
5447 #include <linux/slab.h>
5548 #include <linux/ratelimit.h>
5649 #include <linux/nodemask.h>
57
-#include <linux/flex_array.h>
5850
5951 #include <trace/events/block.h>
6052 #include <linux/list_sort.h>
....@@ -78,13 +70,13 @@
7870
7971 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
8072 {
81
- int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
73
+ int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
8274 return &conf->stripe_hashtbl[hash];
8375 }
8476
85
-static inline int stripe_hash_locks_hash(sector_t sect)
77
+static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
8678 {
87
- return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
79
+ return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
8880 }
8981
9082 static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
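Note: STRIPE_SHIFT and friends stop being compile-time constants in this change. A minimal sketch, assuming the usual raid5.h definitions behind this diff (they are not shown in it), of what the RAID5_STRIPE_*() helpers expand to:

    /* Sketch only: assumed raid5.h backing for the RAID5_STRIPE_*() helpers */
    #if PAGE_SIZE == DEFAULT_STRIPE_SIZE
    /* 4K pages: keep the historical compile-time constants */
    #define RAID5_STRIPE_SIZE(conf)     STRIPE_SIZE
    #define RAID5_STRIPE_SHIFT(conf)    STRIPE_SHIFT
    #define RAID5_STRIPE_SECTORS(conf)  STRIPE_SECTORS
    #else
    /* larger pages: stripe size becomes a per-array r5conf field */
    #define RAID5_STRIPE_SIZE(conf)     ((conf)->stripe_size)
    #define RAID5_STRIPE_SHIFT(conf)    ((conf)->stripe_shift)
    #define RAID5_STRIPE_SECTORS(conf)  ((conf)->stripe_sectors)
    #endif

This is why stripe_hash() and stripe_hash_locks_hash() now take conf.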
....@@ -457,13 +449,74 @@
457449 return sh;
458450 }
459451
452
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
453
+static void free_stripe_pages(struct stripe_head *sh)
454
+{
455
+ int i;
456
+ struct page *p;
457
+
458
+ /* The page pool has not been allocated yet */
459
+ if (!sh->pages)
460
+ return;
461
+
462
+ for (i = 0; i < sh->nr_pages; i++) {
463
+ p = sh->pages[i];
464
+ if (p)
465
+ put_page(p);
466
+ sh->pages[i] = NULL;
467
+ }
468
+}
469
+
470
+static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
471
+{
472
+ int i;
473
+ struct page *p;
474
+
475
+ for (i = 0; i < sh->nr_pages; i++) {
476
+ /* The page has already been allocated. */
477
+ if (sh->pages[i])
478
+ continue;
479
+
480
+ p = alloc_page(gfp);
481
+ if (!p) {
482
+ free_stripe_pages(sh);
483
+ return -ENOMEM;
484
+ }
485
+ sh->pages[i] = p;
486
+ }
487
+ return 0;
488
+}
489
+
490
+static int
491
+init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
492
+{
493
+ int nr_pages, cnt;
494
+
495
+ if (sh->pages)
496
+ return 0;
497
+
498
+ /* Each sh->dev[i] needs one conf->stripe_size */
499
+ cnt = PAGE_SIZE / conf->stripe_size;
500
+ nr_pages = (disks + cnt - 1) / cnt;
501
+
502
+ sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
503
+ if (!sh->pages)
504
+ return -ENOMEM;
505
+ sh->nr_pages = nr_pages;
506
+ sh->stripes_per_page = cnt;
507
+ return 0;
508
+}
509
+#endif
510
+
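Note: grow_buffers() below maps each sh->dev[i] onto the shared page pool via raid5_get_dev_page() and raid5_get_page_offset(). Their bodies live in raid5.h and are not part of this diff; a minimal sketch of the assumed mapping, consistent with the nr_pages and stripes_per_page computed above:

    /* Sketch: assumed device-index -> (page, offset) mapping */
    static inline int raid5_get_page_index(struct stripe_head *sh, int disk_idx)
    {
            WARN_ON(!sh->pages);
            return disk_idx / sh->stripes_per_page;
    }

    static inline int raid5_get_page_offset(struct stripe_head *sh, int disk_idx)
    {
            WARN_ON(!sh->pages);
            return (disk_idx % sh->stripes_per_page) *
                    RAID5_STRIPE_SIZE(sh->raid_conf);
    }

    static inline struct page *raid5_get_dev_page(struct stripe_head *sh, int disk_idx)
    {
            WARN_ON(!sh->pages);
            return sh->pages[raid5_get_page_index(sh, disk_idx)];
    }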
460511 static void shrink_buffers(struct stripe_head *sh)
461512 {
462
- struct page *p;
463513 int i;
464514 int num = sh->raid_conf->pool_size;
465515
516
+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
466517 for (i = 0; i < num ; i++) {
518
+ struct page *p;
519
+
467520 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
468521 p = sh->dev[i].page;
469522 if (!p)
....@@ -471,6 +524,11 @@
471524 sh->dev[i].page = NULL;
472525 put_page(p);
473526 }
527
+#else
528
+ for (i = 0; i < num; i++)
529
+ sh->dev[i].page = NULL;
530
+ free_stripe_pages(sh); /* Free pages */
531
+#endif
474532 }
475533
476534 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
....@@ -478,6 +536,7 @@
478536 int i;
479537 int num = sh->raid_conf->pool_size;
480538
539
+#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
481540 for (i = 0; i < num; i++) {
482541 struct page *page;
483542
....@@ -486,8 +545,18 @@
486545 }
487546 sh->dev[i].page = page;
488547 sh->dev[i].orig_page = page;
548
+ sh->dev[i].offset = 0;
489549 }
550
+#else
551
+ if (alloc_stripe_pages(sh, gfp))
552
+ return -ENOMEM;
490553
554
+ for (i = 0; i < num; i++) {
555
+ sh->dev[i].page = raid5_get_dev_page(sh, i);
556
+ sh->dev[i].orig_page = sh->dev[i].page;
557
+ sh->dev[i].offset = raid5_get_page_offset(sh, i);
558
+ }
559
+#endif
491560 return 0;
492561 }
493562
....@@ -618,17 +687,17 @@
618687 return degraded;
619688 }
620689
621
-static int has_failed(struct r5conf *conf)
690
+static bool has_failed(struct r5conf *conf)
622691 {
623
- int degraded;
692
+ int degraded = conf->mddev->degraded;
624693
625
- if (conf->mddev->reshape_position == MaxSector)
626
- return conf->mddev->degraded > conf->max_degraded;
694
+ if (test_bit(MD_BROKEN, &conf->mddev->flags))
695
+ return true;
627696
628
- degraded = raid5_calc_degraded(conf);
629
- if (degraded > conf->max_degraded)
630
- return 1;
631
- return 0;
697
+ if (conf->mddev->reshape_position != MaxSector)
698
+ degraded = raid5_calc_degraded(conf);
699
+
700
+ return degraded > conf->max_degraded;
632701 }
633702
634703 struct stripe_head *
....@@ -636,7 +705,7 @@
636705 int previous, int noblock, int noquiesce)
637706 {
638707 struct stripe_head *sh;
639
- int hash = stripe_hash_locks_hash(sector);
708
+ int hash = stripe_hash_locks_hash(conf, sector);
640709 int inc_empty_inactive_list_flag;
641710
642711 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
....@@ -712,6 +781,8 @@
712781 }
713782
714783 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
784
+ __acquires(&sh1->stripe_lock)
785
+ __acquires(&sh2->stripe_lock)
715786 {
716787 if (sh1 > sh2) {
717788 spin_lock_irq(&sh2->stripe_lock);
....@@ -723,6 +794,8 @@
723794 }
724795
725796 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
797
+ __releases(&sh1->stripe_lock)
798
+ __releases(&sh2->stripe_lock)
726799 {
727800 spin_unlock(&sh1->stripe_lock);
728801 spin_unlock_irq(&sh2->stripe_lock);
....@@ -753,9 +826,9 @@
753826 tmp_sec = sh->sector;
754827 if (!sector_div(tmp_sec, conf->chunk_sectors))
755828 return;
756
- head_sector = sh->sector - STRIPE_SECTORS;
829
+ head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
757830
758
- hash = stripe_hash_locks_hash(head_sector);
831
+ hash = stripe_hash_locks_hash(conf, head_sector);
759832 spin_lock_irq(conf->hash_locks + hash);
760833 head = __find_stripe(conf, head_sector, conf->generation);
761834 if (head && !atomic_inc_not_zero(&head->count)) {
....@@ -878,7 +951,7 @@
878951 struct bio *bio;
879952
880953 while ((bio = bio_list_pop(tmp)))
881
- generic_make_request(bio);
954
+ submit_bio_noacct(bio);
882955 }
883956
884957 static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
....@@ -1062,7 +1135,7 @@
10621135 test_bit(WriteErrorSeen, &rdev->flags)) {
10631136 sector_t first_bad;
10641137 int bad_sectors;
1065
- int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
1138
+ int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
10661139 &first_bad, &bad_sectors);
10671140 if (!bad)
10681141 break;
....@@ -1094,7 +1167,7 @@
10941167 if (rdev) {
10951168 if (s->syncing || s->expanding || s->expanded
10961169 || s->replacing)
1097
- md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1170
+ md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
10981171
10991172 set_bit(STRIPE_IO_STARTED, &sh->state);
11001173
....@@ -1134,12 +1207,12 @@
11341207 else
11351208 sh->dev[i].vec.bv_page = sh->dev[i].page;
11361209 bi->bi_vcnt = 1;
1137
- bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1138
- bi->bi_io_vec[0].bv_offset = 0;
1139
- bi->bi_iter.bi_size = STRIPE_SIZE;
1210
+ bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1211
+ bi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1212
+ bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
11401213 bi->bi_write_hint = sh->dev[i].write_hint;
11411214 if (!rrdev)
1142
- sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
1215
+ sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
11431216 /*
11441217 * If this is discard request, set bi_vcnt 0. We don't
11451218 * want to confuse SCSI because SCSI will replace payload
....@@ -1156,12 +1229,12 @@
11561229 if (should_defer && op_is_write(op))
11571230 bio_list_add(&pending_bios, bi);
11581231 else
1159
- generic_make_request(bi);
1232
+ submit_bio_noacct(bi);
11601233 }
11611234 if (rrdev) {
11621235 if (s->syncing || s->expanding || s->expanded
11631236 || s->replacing)
1164
- md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
1237
+ md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
11651238
11661239 set_bit(STRIPE_IO_STARTED, &sh->state);
11671240
....@@ -1188,11 +1261,11 @@
11881261 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
11891262 sh->dev[i].rvec.bv_page = sh->dev[i].page;
11901263 rbi->bi_vcnt = 1;
1191
- rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1192
- rbi->bi_io_vec[0].bv_offset = 0;
1193
- rbi->bi_iter.bi_size = STRIPE_SIZE;
1264
+ rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
1265
+ rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset;
1266
+ rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
11941267 rbi->bi_write_hint = sh->dev[i].write_hint;
1195
- sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
1268
+ sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
11961269 /*
11971270 * If this is discard request, set bi_vcnt 0. We don't
11981271 * want to confuse SCSI because SCSI will replace payload
....@@ -1206,7 +1279,7 @@
12061279 if (should_defer && op_is_write(op))
12071280 bio_list_add(&pending_bios, rbi);
12081281 else
1209
- generic_make_request(rbi);
1282
+ submit_bio_noacct(rbi);
12101283 }
12111284 if (!rdev && !rrdev) {
12121285 if (op_is_write(op))
....@@ -1231,7 +1304,7 @@
12311304
12321305 static struct dma_async_tx_descriptor *
12331306 async_copy_data(int frombio, struct bio *bio, struct page **page,
1234
- sector_t sector, struct dma_async_tx_descriptor *tx,
1307
+ unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
12351308 struct stripe_head *sh, int no_skipcopy)
12361309 {
12371310 struct bio_vec bvl;
....@@ -1240,6 +1313,7 @@
12401313 int page_offset;
12411314 struct async_submit_ctl submit;
12421315 enum async_tx_flags flags = 0;
1316
+ struct r5conf *conf = sh->raid_conf;
12431317
12441318 if (bio->bi_iter.bi_sector >= sector)
12451319 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
....@@ -1261,8 +1335,8 @@
12611335 len -= b_offset;
12621336 }
12631337
1264
- if (len > 0 && page_offset + len > STRIPE_SIZE)
1265
- clen = STRIPE_SIZE - page_offset;
1338
+ if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
1339
+ clen = RAID5_STRIPE_SIZE(conf) - page_offset;
12661340 else
12671341 clen = len;
12681342
....@@ -1270,17 +1344,17 @@
12701344 b_offset += bvl.bv_offset;
12711345 bio_page = bvl.bv_page;
12721346 if (frombio) {
1273
- if (sh->raid_conf->skip_copy &&
1347
+ if (conf->skip_copy &&
12741348 b_offset == 0 && page_offset == 0 &&
1275
- clen == STRIPE_SIZE &&
1349
+ clen == RAID5_STRIPE_SIZE(conf) &&
12761350 !no_skipcopy)
12771351 *page = bio_page;
12781352 else
1279
- tx = async_memcpy(*page, bio_page, page_offset,
1353
+ tx = async_memcpy(*page, bio_page, page_offset + poff,
12801354 b_offset, clen, &submit);
12811355 } else
12821356 tx = async_memcpy(bio_page, *page, b_offset,
1283
- page_offset, clen, &submit);
1357
+ page_offset + poff, clen, &submit);
12841358 }
12851359 /* chain the operations */
12861360 submit.depend_tx = tx;
....@@ -1297,6 +1371,7 @@
12971371 {
12981372 struct stripe_head *sh = stripe_head_ref;
12991373 int i;
1374
+ struct r5conf *conf = sh->raid_conf;
13001375
13011376 pr_debug("%s: stripe %llu\n", __func__,
13021377 (unsigned long long)sh->sector);
....@@ -1317,8 +1392,8 @@
13171392 rbi = dev->read;
13181393 dev->read = NULL;
13191394 while (rbi && rbi->bi_iter.bi_sector <
1320
- dev->sector + STRIPE_SECTORS) {
1321
- rbi2 = r5_next_bio(rbi, dev->sector);
1395
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1396
+ rbi2 = r5_next_bio(conf, rbi, dev->sector);
13221397 bio_endio(rbi);
13231398 rbi = rbi2;
13241399 }
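Note: r5_next_bio() now needs the r5conf so it can step by the per-array stripe size instead of the fixed STRIPE_SECTORS. A minimal sketch, assuming its raid5.h body keeps its old shape:

    /* Sketch: next bio still covering this device stripe, or NULL */
    static inline struct bio *r5_next_bio(struct r5conf *conf, struct bio *bio,
                                          sector_t sector)
    {
            if (bio_end_sector(bio) < sector + RAID5_STRIPE_SECTORS(conf))
                    return bio->bi_next;
            return NULL;
    }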
....@@ -1335,6 +1410,7 @@
13351410 struct dma_async_tx_descriptor *tx = NULL;
13361411 struct async_submit_ctl submit;
13371412 int i;
1413
+ struct r5conf *conf = sh->raid_conf;
13381414
13391415 BUG_ON(sh->batch_head);
13401416 pr_debug("%s: stripe %llu\n", __func__,
....@@ -1349,10 +1425,11 @@
13491425 dev->toread = NULL;
13501426 spin_unlock_irq(&sh->stripe_lock);
13511427 while (rbi && rbi->bi_iter.bi_sector <
1352
- dev->sector + STRIPE_SECTORS) {
1428
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
13531429 tx = async_copy_data(0, rbi, &dev->page,
1430
+ dev->offset,
13541431 dev->sector, tx, sh, 0);
1355
- rbi = r5_next_bio(rbi, dev->sector);
1432
+ rbi = r5_next_bio(conf, rbi, dev->sector);
13561433 }
13571434 }
13581435 }
....@@ -1394,22 +1471,25 @@
13941471 }
13951472
13961473 /* return a pointer to the address conversion region of the scribble buffer */
1397
-static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1398
- struct raid5_percpu *percpu, int i)
1474
+static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
13991475 {
1400
- void *addr;
1401
-
1402
- addr = flex_array_get(percpu->scribble, i);
1403
- return addr + sizeof(struct page *) * (sh->disks + 2);
1476
+ return percpu->scribble + i * percpu->scribble_obj_size;
14041477 }
14051478
14061479 /* return a pointer to the address conversion region of the scribble buffer */
1407
-static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1480
+static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1481
+ struct raid5_percpu *percpu, int i)
14081482 {
1409
- void *addr;
1483
+ return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
1484
+}
14101485
1411
- addr = flex_array_get(percpu->scribble, i);
1412
- return addr;
1486
+/*
1487
+ * Return a pointer to record offset address.
1488
+ */
1489
+static unsigned int *
1490
+to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu)
1491
+{
1492
+ return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2);
14131493 }
14141494
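Note: to_addr_page(), to_addr_conv() and to_addr_offs() all slice the same per-CPU scribble object; scribble_alloc() further down sizes one object to exactly these three regions. Layout sketch, derived from the pointer arithmetic in this diff:

    /*
     * One scribble object, base = percpu->scribble + i * scribble_obj_size:
     *
     *   struct page *  [disks + 2]   <- to_addr_page(percpu, i)
     *   addr_conv_t    [disks + 2]   <- to_addr_conv(sh, percpu, i)
     *   unsigned int   [disks + 2]   <- to_addr_offs(sh, percpu)  (object 0)
     *
     * which is exactly the obj_size computed in scribble_alloc().
     */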
14151495 static struct dma_async_tx_descriptor *
....@@ -1417,9 +1497,11 @@
14171497 {
14181498 int disks = sh->disks;
14191499 struct page **xor_srcs = to_addr_page(percpu, 0);
1500
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
14201501 int target = sh->ops.target;
14211502 struct r5dev *tgt = &sh->dev[target];
14221503 struct page *xor_dest = tgt->page;
1504
+ unsigned int off_dest = tgt->offset;
14231505 int count = 0;
14241506 struct dma_async_tx_descriptor *tx;
14251507 struct async_submit_ctl submit;
....@@ -1431,24 +1513,30 @@
14311513 __func__, (unsigned long long)sh->sector, target);
14321514 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
14331515
1434
- for (i = disks; i--; )
1435
- if (i != target)
1516
+ for (i = disks; i--; ) {
1517
+ if (i != target) {
1518
+ off_srcs[count] = sh->dev[i].offset;
14361519 xor_srcs[count++] = sh->dev[i].page;
1520
+ }
1521
+ }
14371522
14381523 atomic_inc(&sh->count);
14391524
14401525 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
14411526 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
14421527 if (unlikely(count == 1))
1443
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1528
+ tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
1529
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
14441530 else
1445
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1531
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1532
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
14461533
14471534 return tx;
14481535 }
14491536
14501537 /* set_syndrome_sources - populate source buffers for gen_syndrome
14511538 * @srcs - (struct page *) array of size sh->disks
1539
+ * @offs - (unsigned int) array of offset for each page
14521540 * @sh - stripe_head to parse
14531541 *
14541542 * Populates srcs in proper layout order for the stripe and returns the
....@@ -1457,6 +1545,7 @@
14571545 * is recorded in srcs[count+1]].
14581546 */
14591547 static int set_syndrome_sources(struct page **srcs,
1548
+ unsigned int *offs,
14601549 struct stripe_head *sh,
14611550 int srctype)
14621551 {
....@@ -1487,6 +1576,12 @@
14871576 srcs[slot] = sh->dev[i].orig_page;
14881577 else
14891578 srcs[slot] = sh->dev[i].page;
1579
+ /*
1580
+ * For R5_InJournal, PAGE_SIZE must be 4KB and the
1581
+ * page is not shared. In that case, dev[i].offset
1582
+ * is 0.
1583
+ */
1584
+ offs[slot] = sh->dev[i].offset;
14901585 }
14911586 i = raid6_next_disk(i, disks);
14921587 } while (i != d0_idx);
....@@ -1499,12 +1594,14 @@
14991594 {
15001595 int disks = sh->disks;
15011596 struct page **blocks = to_addr_page(percpu, 0);
1597
+ unsigned int *offs = to_addr_offs(sh, percpu);
15021598 int target;
15031599 int qd_idx = sh->qd_idx;
15041600 struct dma_async_tx_descriptor *tx;
15051601 struct async_submit_ctl submit;
15061602 struct r5dev *tgt;
15071603 struct page *dest;
1604
+ unsigned int dest_off;
15081605 int i;
15091606 int count;
15101607
....@@ -1523,30 +1620,34 @@
15231620 tgt = &sh->dev[target];
15241621 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
15251622 dest = tgt->page;
1623
+ dest_off = tgt->offset;
15261624
15271625 atomic_inc(&sh->count);
15281626
15291627 if (target == qd_idx) {
1530
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1628
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
15311629 blocks[count] = NULL; /* regenerating p is not necessary */
15321630 BUG_ON(blocks[count+1] != dest); /* q should already be set */
15331631 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
15341632 ops_complete_compute, sh,
15351633 to_addr_conv(sh, percpu, 0));
1536
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1634
+ tx = async_gen_syndrome(blocks, offs, count+2,
1635
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
15371636 } else {
15381637 /* Compute any data- or p-drive using XOR */
15391638 count = 0;
15401639 for (i = disks; i-- ; ) {
15411640 if (i == target || i == qd_idx)
15421641 continue;
1642
+ offs[count] = sh->dev[i].offset;
15431643 blocks[count++] = sh->dev[i].page;
15441644 }
15451645
15461646 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
15471647 NULL, ops_complete_compute, sh,
15481648 to_addr_conv(sh, percpu, 0));
1549
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1649
+ tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1650
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
15501651 }
15511652
15521653 return tx;
....@@ -1565,6 +1666,7 @@
15651666 struct r5dev *tgt2 = &sh->dev[target2];
15661667 struct dma_async_tx_descriptor *tx;
15671668 struct page **blocks = to_addr_page(percpu, 0);
1669
+ unsigned int *offs = to_addr_offs(sh, percpu);
15681670 struct async_submit_ctl submit;
15691671
15701672 BUG_ON(sh->batch_head);
....@@ -1577,13 +1679,16 @@
15771679 /* we need to open-code set_syndrome_sources to handle the
15781680 * slot number conversion for 'faila' and 'failb'
15791681 */
1580
- for (i = 0; i < disks ; i++)
1682
+ for (i = 0; i < disks ; i++) {
1683
+ offs[i] = 0;
15811684 blocks[i] = NULL;
1685
+ }
15821686 count = 0;
15831687 i = d0_idx;
15841688 do {
15851689 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
15861690
1691
+ offs[slot] = sh->dev[i].offset;
15871692 blocks[slot] = sh->dev[i].page;
15881693
15891694 if (i == target)
....@@ -1608,10 +1713,12 @@
16081713 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
16091714 ops_complete_compute, sh,
16101715 to_addr_conv(sh, percpu, 0));
1611
- return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1612
- STRIPE_SIZE, &submit);
1716
+ return async_gen_syndrome(blocks, offs, syndrome_disks+2,
1717
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1718
+ &submit);
16131719 } else {
16141720 struct page *dest;
1721
+ unsigned int dest_off;
16151722 int data_target;
16161723 int qd_idx = sh->qd_idx;
16171724
....@@ -1625,22 +1732,26 @@
16251732 for (i = disks; i-- ; ) {
16261733 if (i == data_target || i == qd_idx)
16271734 continue;
1735
+ offs[count] = sh->dev[i].offset;
16281736 blocks[count++] = sh->dev[i].page;
16291737 }
16301738 dest = sh->dev[data_target].page;
1739
+ dest_off = sh->dev[data_target].offset;
16311740 init_async_submit(&submit,
16321741 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
16331742 NULL, NULL, NULL,
16341743 to_addr_conv(sh, percpu, 0));
1635
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1744
+ tx = async_xor_offs(dest, dest_off, blocks, offs, count,
1745
+ RAID5_STRIPE_SIZE(sh->raid_conf),
16361746 &submit);
16371747
1638
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1748
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL);
16391749 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
16401750 ops_complete_compute, sh,
16411751 to_addr_conv(sh, percpu, 0));
1642
- return async_gen_syndrome(blocks, 0, count+2,
1643
- STRIPE_SIZE, &submit);
1752
+ return async_gen_syndrome(blocks, offs, count+2,
1753
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1754
+ &submit);
16441755 }
16451756 } else {
16461757 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
....@@ -1649,13 +1760,15 @@
16491760 if (failb == syndrome_disks) {
16501761 /* We're missing D+P. */
16511762 return async_raid6_datap_recov(syndrome_disks+2,
1652
- STRIPE_SIZE, faila,
1653
- blocks, &submit);
1763
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1764
+ faila,
1765
+ blocks, offs, &submit);
16541766 } else {
16551767 /* We're missing D+D. */
16561768 return async_raid6_2data_recov(syndrome_disks+2,
1657
- STRIPE_SIZE, faila, failb,
1658
- blocks, &submit);
1769
+ RAID5_STRIPE_SIZE(sh->raid_conf),
1770
+ faila, failb,
1771
+ blocks, offs, &submit);
16591772 }
16601773 }
16611774 }
....@@ -1681,10 +1794,12 @@
16811794 {
16821795 int disks = sh->disks;
16831796 struct page **xor_srcs = to_addr_page(percpu, 0);
1797
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
16841798 int count = 0, pd_idx = sh->pd_idx, i;
16851799 struct async_submit_ctl submit;
16861800
16871801 /* existing parity data subtracted */
1802
+ unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
16881803 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
16891804
16901805 BUG_ON(sh->batch_head);
....@@ -1694,15 +1809,23 @@
16941809 for (i = disks; i--; ) {
16951810 struct r5dev *dev = &sh->dev[i];
16961811 /* Only process blocks that are known to be uptodate */
1697
- if (test_bit(R5_InJournal, &dev->flags))
1812
+ if (test_bit(R5_InJournal, &dev->flags)) {
1813
+ /*
1814
+ * In this case, PAGE_SIZE must be 4KB and the
1815
+ * page offset is zero.
1816
+ */
1817
+ off_srcs[count] = dev->offset;
16981818 xor_srcs[count++] = dev->orig_page;
1699
- else if (test_bit(R5_Wantdrain, &dev->flags))
1819
+ } else if (test_bit(R5_Wantdrain, &dev->flags)) {
1820
+ off_srcs[count] = dev->offset;
17001821 xor_srcs[count++] = dev->page;
1822
+ }
17011823 }
17021824
17031825 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
17041826 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1705
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1827
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
1828
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
17061829
17071830 return tx;
17081831 }
....@@ -1712,17 +1835,19 @@
17121835 struct dma_async_tx_descriptor *tx)
17131836 {
17141837 struct page **blocks = to_addr_page(percpu, 0);
1838
+ unsigned int *offs = to_addr_offs(sh, percpu);
17151839 int count;
17161840 struct async_submit_ctl submit;
17171841
17181842 pr_debug("%s: stripe %llu\n", __func__,
17191843 (unsigned long long)sh->sector);
17201844
1721
- count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
1845
+ count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN);
17221846
17231847 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
17241848 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1725
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1849
+ tx = async_gen_syndrome(blocks, offs, count+2,
1850
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
17261851
17271852 return tx;
17281853 }
....@@ -1763,7 +1888,7 @@
17631888 WARN_ON(dev->page != dev->orig_page);
17641889
17651890 while (wbi && wbi->bi_iter.bi_sector <
1766
- dev->sector + STRIPE_SECTORS) {
1891
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
17671892 if (wbi->bi_opf & REQ_FUA)
17681893 set_bit(R5_WantFUA, &dev->flags);
17691894 if (wbi->bi_opf & REQ_SYNC)
....@@ -1772,6 +1897,7 @@
17721897 set_bit(R5_Discard, &dev->flags);
17731898 else {
17741899 tx = async_copy_data(1, wbi, &dev->page,
1900
+ dev->offset,
17751901 dev->sector, tx, sh,
17761902 r5c_is_writeback(conf->log));
17771903 if (dev->page != dev->orig_page &&
....@@ -1781,7 +1907,7 @@
17811907 clear_bit(R5_OVERWRITE, &dev->flags);
17821908 }
17831909 }
1784
- wbi = r5_next_bio(wbi, dev->sector);
1910
+ wbi = r5_next_bio(conf, wbi, dev->sector);
17851911 }
17861912
17871913 if (head_sh->batch_head) {
....@@ -1851,9 +1977,11 @@
18511977 {
18521978 int disks = sh->disks;
18531979 struct page **xor_srcs;
1980
+ unsigned int *off_srcs;
18541981 struct async_submit_ctl submit;
18551982 int count, pd_idx = sh->pd_idx, i;
18561983 struct page *xor_dest;
1984
+ unsigned int off_dest;
18571985 int prexor = 0;
18581986 unsigned long flags;
18591987 int j = 0;
....@@ -1878,24 +2006,31 @@
18782006 again:
18792007 count = 0;
18802008 xor_srcs = to_addr_page(percpu, j);
2009
+ off_srcs = to_addr_offs(sh, percpu);
18812010 /* check if prexor is active which means only process blocks
18822011 * that are part of a read-modify-write (written)
18832012 */
18842013 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
18852014 prexor = 1;
2015
+ off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
18862016 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
18872017 for (i = disks; i--; ) {
18882018 struct r5dev *dev = &sh->dev[i];
18892019 if (head_sh->dev[i].written ||
1890
- test_bit(R5_InJournal, &head_sh->dev[i].flags))
2020
+ test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
2021
+ off_srcs[count] = dev->offset;
18912022 xor_srcs[count++] = dev->page;
2023
+ }
18922024 }
18932025 } else {
18942026 xor_dest = sh->dev[pd_idx].page;
2027
+ off_dest = sh->dev[pd_idx].offset;
18952028 for (i = disks; i--; ) {
18962029 struct r5dev *dev = &sh->dev[i];
1897
- if (i != pd_idx)
2030
+ if (i != pd_idx) {
2031
+ off_srcs[count] = dev->offset;
18982032 xor_srcs[count++] = dev->page;
2033
+ }
18992034 }
19002035 }
19012036
....@@ -1921,9 +2056,11 @@
19212056 }
19222057
19232058 if (unlikely(count == 1))
1924
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
2059
+ tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0],
2060
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
19252061 else
1926
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
2062
+ tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2063
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
19272064 if (!last_stripe) {
19282065 j++;
19292066 sh = list_first_entry(&sh->batch_list, struct stripe_head,
....@@ -1938,6 +2075,7 @@
19382075 {
19392076 struct async_submit_ctl submit;
19402077 struct page **blocks;
2078
+ unsigned int *offs;
19412079 int count, i, j = 0;
19422080 struct stripe_head *head_sh = sh;
19432081 int last_stripe;
....@@ -1962,6 +2100,7 @@
19622100
19632101 again:
19642102 blocks = to_addr_page(percpu, j);
2103
+ offs = to_addr_offs(sh, percpu);
19652104
19662105 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
19672106 synflags = SYNDROME_SRC_WRITTEN;
....@@ -1971,7 +2110,7 @@
19712110 txflags = ASYNC_TX_ACK;
19722111 }
19732112
1974
- count = set_syndrome_sources(blocks, sh, synflags);
2113
+ count = set_syndrome_sources(blocks, offs, sh, synflags);
19752114 last_stripe = !head_sh->batch_head ||
19762115 list_first_entry(&sh->batch_list,
19772116 struct stripe_head, batch_list) == head_sh;
....@@ -1983,7 +2122,8 @@
19832122 } else
19842123 init_async_submit(&submit, 0, tx, NULL, NULL,
19852124 to_addr_conv(sh, percpu, j));
1986
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
2125
+ tx = async_gen_syndrome(blocks, offs, count+2,
2126
+ RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
19872127 if (!last_stripe) {
19882128 j++;
19892129 sh = list_first_entry(&sh->batch_list, struct stripe_head,
....@@ -2010,7 +2150,9 @@
20102150 int pd_idx = sh->pd_idx;
20112151 int qd_idx = sh->qd_idx;
20122152 struct page *xor_dest;
2153
+ unsigned int off_dest;
20132154 struct page **xor_srcs = to_addr_page(percpu, 0);
2155
+ unsigned int *off_srcs = to_addr_offs(sh, percpu);
20142156 struct dma_async_tx_descriptor *tx;
20152157 struct async_submit_ctl submit;
20162158 int count;
....@@ -2022,16 +2164,20 @@
20222164 BUG_ON(sh->batch_head);
20232165 count = 0;
20242166 xor_dest = sh->dev[pd_idx].page;
2167
+ off_dest = sh->dev[pd_idx].offset;
2168
+ off_srcs[count] = off_dest;
20252169 xor_srcs[count++] = xor_dest;
20262170 for (i = disks; i--; ) {
20272171 if (i == pd_idx || i == qd_idx)
20282172 continue;
2173
+ off_srcs[count] = sh->dev[i].offset;
20292174 xor_srcs[count++] = sh->dev[i].page;
20302175 }
20312176
20322177 init_async_submit(&submit, 0, NULL, NULL, NULL,
20332178 to_addr_conv(sh, percpu, 0));
2034
- tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
2179
+ tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count,
2180
+ RAID5_STRIPE_SIZE(sh->raid_conf),
20352181 &sh->ops.zero_sum_result, &submit);
20362182
20372183 atomic_inc(&sh->count);
....@@ -2042,6 +2188,7 @@
20422188 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
20432189 {
20442190 struct page **srcs = to_addr_page(percpu, 0);
2191
+ unsigned int *offs = to_addr_offs(sh, percpu);
20452192 struct async_submit_ctl submit;
20462193 int count;
20472194
....@@ -2049,15 +2196,16 @@
20492196 (unsigned long long)sh->sector, checkp);
20502197
20512198 BUG_ON(sh->batch_head);
2052
- count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
2199
+ count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL);
20532200 if (!checkp)
20542201 srcs[count] = NULL;
20552202
20562203 atomic_inc(&sh->count);
20572204 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
20582205 sh, to_addr_conv(sh, percpu, 0));
2059
- async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
2060
- &sh->ops.zero_sum_result, percpu->spare_page, &submit);
2206
+ async_syndrome_val(srcs, offs, count+2,
2207
+ RAID5_STRIPE_SIZE(sh->raid_conf),
2208
+ &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit);
20612209 }
20622210
20632211 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
....@@ -2069,8 +2217,9 @@
20692217 struct raid5_percpu *percpu;
20702218 unsigned long cpu;
20712219
2072
- cpu = get_cpu();
2220
+ cpu = get_cpu_light();
20732221 percpu = per_cpu_ptr(conf->percpu, cpu);
2222
+ spin_lock(&percpu->lock);
20742223 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
20752224 ops_run_biofill(sh);
20762225 overlap_clear++;
....@@ -2129,11 +2278,15 @@
21292278 if (test_and_clear_bit(R5_Overlap, &dev->flags))
21302279 wake_up(&sh->raid_conf->wait_for_overlap);
21312280 }
2132
- put_cpu();
2281
+ spin_unlock(&percpu->lock);
2282
+ put_cpu_light();
21332283 }
21342284
21352285 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
21362286 {
2287
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2288
+ kfree(sh->pages);
2289
+#endif
21372290 if (sh->ppl_page)
21382291 __free_page(sh->ppl_page);
21392292 kmem_cache_free(sc, sh);
....@@ -2167,9 +2320,15 @@
21672320 sh->ppl_page = alloc_page(gfp);
21682321 if (!sh->ppl_page) {
21692322 free_stripe(sc, sh);
2170
- sh = NULL;
2323
+ return NULL;
21712324 }
21722325 }
2326
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2327
+ if (init_stripe_shared_pages(sh, conf, disks)) {
2328
+ free_stripe(sc, sh);
2329
+ return NULL;
2330
+ }
2331
+#endif
21732332 }
21742333 return sh;
21752334 }
....@@ -2226,10 +2385,13 @@
22262385 }
22272386
22282387 /**
2229
- * scribble_len - return the required size of the scribble region
2230
- * @num - total number of disks in the array
2388
+ * scribble_alloc - allocate percpu scribble buffer for required size
2389
+ * of the scribble region
2390
+ * @percpu: from for_each_present_cpu() of the caller
2391
+ * @num: total number of disks in the array
2392
+ * @cnt: scribble objs count for required size of the scribble region
22312393 *
2232
- * The size must be enough to contain:
2394
+ * The scribble buffer size must be enough to contain:
22332395 * 1/ a struct page pointer for each device in the array +2
22342396 * 2/ room to convert each entry in (1) to its corresponding dma
22352397 * (dma_map_page()) or page (page_address()) address.
....@@ -2238,21 +2400,29 @@
22382400 * calculate over all devices (not just the data blocks), using zeros in place
22392401 * of the P and Q blocks.
22402402 */
2241
-static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
2403
+static int scribble_alloc(struct raid5_percpu *percpu,
2404
+ int num, int cnt)
22422405 {
2243
- struct flex_array *ret;
2244
- size_t len;
2406
+ size_t obj_size =
2407
+ sizeof(struct page *) * (num + 2) +
2408
+ sizeof(addr_conv_t) * (num + 2) +
2409
+ sizeof(unsigned int) * (num + 2);
2410
+ void *scribble;
22452411
2246
- len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
2247
- ret = flex_array_alloc(len, cnt, flags);
2248
- if (!ret)
2249
- return NULL;
2250
- /* always prealloc all elements, so no locking is required */
2251
- if (flex_array_prealloc(ret, 0, cnt, flags)) {
2252
- flex_array_free(ret);
2253
- return NULL;
2254
- }
2255
- return ret;
2412
+ /*
2413
+ * If this runs in the raid array suspend context, it is also in
2414
+ * memalloc noio context, so there is no risk of recursive memory
2415
+ * reclaim I/O even with the GFP_KERNEL flag.
2416
+ */
2417
+ scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
2418
+ if (!scribble)
2419
+ return -ENOMEM;
2420
+
2421
+ kvfree(percpu->scribble);
2422
+
2423
+ percpu->scribble = scribble;
2424
+ percpu->scribble_obj_size = obj_size;
2425
+ return 0;
22562426 }
22572427
22582428 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
....@@ -2270,23 +2440,17 @@
22702440 return 0;
22712441 mddev_suspend(conf->mddev);
22722442 get_online_cpus();
2443
+
22732444 for_each_present_cpu(cpu) {
22742445 struct raid5_percpu *percpu;
2275
- struct flex_array *scribble;
22762446
22772447 percpu = per_cpu_ptr(conf->percpu, cpu);
2278
- scribble = scribble_alloc(new_disks,
2279
- new_sectors / STRIPE_SECTORS,
2280
- GFP_NOIO);
2281
-
2282
- if (scribble) {
2283
- flex_array_free(percpu->scribble);
2284
- percpu->scribble = scribble;
2285
- } else {
2286
- err = -ENOMEM;
2448
+ err = scribble_alloc(percpu, new_disks,
2449
+ new_sectors / RAID5_STRIPE_SECTORS(conf));
2450
+ if (err)
22872451 break;
2288
- }
22892452 }
2453
+
22902454 put_online_cpus();
22912455 mddev_resume(conf->mddev);
22922456 if (!err) {
....@@ -2374,9 +2538,16 @@
23742538 osh = get_free_stripe(conf, hash);
23752539 unlock_device_hash_lock(conf, hash);
23762540
2541
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2542
+ for (i = 0; i < osh->nr_pages; i++) {
2543
+ nsh->pages[i] = osh->pages[i];
2544
+ osh->pages[i] = NULL;
2545
+ }
2546
+#endif
23772547 for(i=0; i<conf->pool_size; i++) {
23782548 nsh->dev[i].page = osh->dev[i].page;
23792549 nsh->dev[i].orig_page = osh->dev[i].page;
2550
+ nsh->dev[i].offset = osh->dev[i].offset;
23802551 }
23812552 nsh->hash_lock_index = hash;
23822553 free_stripe(conf->slab_cache, osh);
....@@ -2425,14 +2596,33 @@
24252596 nsh = list_entry(newstripes.next, struct stripe_head, lru);
24262597 list_del_init(&nsh->lru);
24272598
2599
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2600
+ for (i = 0; i < nsh->nr_pages; i++) {
2601
+ if (nsh->pages[i])
2602
+ continue;
2603
+ nsh->pages[i] = alloc_page(GFP_NOIO);
2604
+ if (!nsh->pages[i])
2605
+ err = -ENOMEM;
2606
+ }
2607
+
2608
+ for (i = conf->raid_disks; i < newsize; i++) {
2609
+ if (nsh->dev[i].page)
2610
+ continue;
2611
+ nsh->dev[i].page = raid5_get_dev_page(nsh, i);
2612
+ nsh->dev[i].orig_page = nsh->dev[i].page;
2613
+ nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
2614
+ }
2615
+#else
24282616 for (i=conf->raid_disks; i < newsize; i++)
24292617 if (nsh->dev[i].page == NULL) {
24302618 struct page *p = alloc_page(GFP_NOIO);
24312619 nsh->dev[i].page = p;
24322620 nsh->dev[i].orig_page = p;
2621
+ nsh->dev[i].offset = 0;
24332622 if (!p)
24342623 err = -ENOMEM;
24352624 }
2625
+#endif
24362626 raid5_release_stripe(nsh);
24372627 }
24382628 /* critical section pass, GFP_NOIO no longer needed */
....@@ -2516,10 +2706,10 @@
25162706 */
25172707 pr_info_ratelimited(
25182708 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2519
- mdname(conf->mddev), STRIPE_SECTORS,
2709
+ mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
25202710 (unsigned long long)s,
25212711 bdevname(rdev->bdev, b));
2522
- atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2712
+ atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
25232713 clear_bit(R5_ReadError, &sh->dev[i].flags);
25242714 clear_bit(R5_ReWrite, &sh->dev[i].flags);
25252715 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
....@@ -2564,10 +2754,16 @@
25642754 (unsigned long long)s,
25652755 bdn);
25662756 } else if (atomic_read(&rdev->read_errors)
2567
- > conf->max_nr_stripes)
2568
- pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2569
- mdname(conf->mddev), bdn);
2570
- else
2757
+ > conf->max_nr_stripes) {
2758
+ if (!test_bit(Faulty, &rdev->flags)) {
2759
+ pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2760
+ mdname(conf->mddev),
2761
+ atomic_read(&rdev->read_errors),
2762
+ conf->max_nr_stripes);
2763
+ pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2764
+ mdname(conf->mddev), bdn);
2765
+ }
2766
+ } else
25712767 retry = 1;
25722768 if (set_bad && test_bit(In_sync, &rdev->flags)
25732769 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
....@@ -2586,7 +2782,7 @@
25862782 if (!(set_bad
25872783 && test_bit(In_sync, &rdev->flags)
25882784 && rdev_set_badblocks(
2589
- rdev, sh->sector, STRIPE_SECTORS, 0)))
2785
+ rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
25902786 md_error(conf->mddev, rdev);
25912787 }
25922788 }
....@@ -2602,7 +2798,7 @@
26022798 struct stripe_head *sh = bi->bi_private;
26032799 struct r5conf *conf = sh->raid_conf;
26042800 int disks = sh->disks, i;
2605
- struct md_rdev *uninitialized_var(rdev);
2801
+ struct md_rdev *rdev;
26062802 sector_t first_bad;
26072803 int bad_sectors;
26082804 int replacement = 0;
....@@ -2638,7 +2834,7 @@
26382834 if (bi->bi_status)
26392835 md_error(conf->mddev, rdev);
26402836 else if (is_badblock(rdev, sh->sector,
2641
- STRIPE_SECTORS,
2837
+ RAID5_STRIPE_SECTORS(conf),
26422838 &first_bad, &bad_sectors))
26432839 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
26442840 } else {
....@@ -2650,7 +2846,7 @@
26502846 set_bit(MD_RECOVERY_NEEDED,
26512847 &rdev->mddev->recovery);
26522848 } else if (is_badblock(rdev, sh->sector,
2653
- STRIPE_SECTORS,
2849
+ RAID5_STRIPE_SECTORS(conf),
26542850 &first_bad, &bad_sectors)) {
26552851 set_bit(R5_MadeGood, &sh->dev[i].flags);
26562852 if (test_bit(R5_ReadError, &sh->dev[i].flags))
....@@ -2670,10 +2866,10 @@
26702866 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
26712867 clear_bit(R5_LOCKED, &sh->dev[i].flags);
26722868 set_bit(STRIPE_HANDLE, &sh->state);
2673
- raid5_release_stripe(sh);
26742869
26752870 if (sh->batch_head && sh != sh->batch_head)
26762871 raid5_release_stripe(sh->batch_head);
2872
+ raid5_release_stripe(sh);
26772873 }
26782874
26792875 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
....@@ -2683,22 +2879,31 @@
26832879 unsigned long flags;
26842880 pr_debug("raid456: error called\n");
26852881
2882
+ pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n",
2883
+ mdname(mddev), bdevname(rdev->bdev, b));
2884
+
26862885 spin_lock_irqsave(&conf->device_lock, flags);
26872886 set_bit(Faulty, &rdev->flags);
26882887 clear_bit(In_sync, &rdev->flags);
26892888 mddev->degraded = raid5_calc_degraded(conf);
2889
+
2890
+ if (has_failed(conf)) {
2891
+ set_bit(MD_BROKEN, &conf->mddev->flags);
2892
+ conf->recovery_disabled = mddev->recovery_disabled;
2893
+
2894
+ pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
2895
+ mdname(mddev), mddev->degraded, conf->raid_disks);
2896
+ } else {
2897
+ pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
2898
+ mdname(mddev), conf->raid_disks - mddev->degraded);
2899
+ }
2900
+
26902901 spin_unlock_irqrestore(&conf->device_lock, flags);
26912902 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
26922903
26932904 set_bit(Blocked, &rdev->flags);
26942905 set_mask_bits(&mddev->sb_flags, 0,
26952906 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2696
- pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2697
- "md/raid:%s: Operation continuing on %d devices.\n",
2698
- mdname(mddev),
2699
- bdevname(rdev->bdev, b),
2700
- mdname(mddev),
2701
- conf->raid_disks - mddev->degraded);
27022907 r5c_update_on_rdev_error(mddev, rdev);
27032908 }
27042909
....@@ -3272,13 +3477,13 @@
32723477 /* check if page is covered */
32733478 sector_t sector = sh->dev[dd_idx].sector;
32743479 for (bi=sh->dev[dd_idx].towrite;
3275
- sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
3480
+ sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
32763481 bi && bi->bi_iter.bi_sector <= sector;
3277
- bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
3482
+ bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
32783483 if (bio_end_sector(bi) >= sector)
32793484 sector = bio_end_sector(bi);
32803485 }
3281
- if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
3486
+ if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
32823487 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
32833488 sh->overwrite_disks++;
32843489 }
....@@ -3303,7 +3508,7 @@
33033508 set_bit(STRIPE_BITMAP_PENDING, &sh->state);
33043509 spin_unlock_irq(&sh->stripe_lock);
33053510 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3306
- STRIPE_SECTORS, 0);
3511
+ RAID5_STRIPE_SECTORS(conf), 0);
33073512 spin_lock_irq(&sh->stripe_lock);
33083513 clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
33093514 if (!sh->batch_head) {
....@@ -3365,7 +3570,7 @@
33653570 if (!rdev_set_badblocks(
33663571 rdev,
33673572 sh->sector,
3368
- STRIPE_SECTORS, 0))
3573
+ RAID5_STRIPE_SECTORS(conf), 0))
33693574 md_error(conf->mddev, rdev);
33703575 rdev_dec_pending(rdev, conf->mddev);
33713576 }
....@@ -3385,8 +3590,8 @@
33853590 wake_up(&conf->wait_for_overlap);
33863591
33873592 while (bi && bi->bi_iter.bi_sector <
3388
- sh->dev[i].sector + STRIPE_SECTORS) {
3389
- struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
3593
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3594
+ struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
33903595
33913596 md_write_end(conf->mddev);
33923597 bio_io_error(bi);
....@@ -3394,7 +3599,7 @@
33943599 }
33953600 if (bitmap_end)
33963601 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3397
- STRIPE_SECTORS, 0, 0);
3602
+ RAID5_STRIPE_SECTORS(conf), 0, 0);
33983603 bitmap_end = 0;
33993604 /* and fail all 'written' */
34003605 bi = sh->dev[i].written;
....@@ -3406,8 +3611,8 @@
34063611
34073612 if (bi) bitmap_end = 1;
34083613 while (bi && bi->bi_iter.bi_sector <
3409
- sh->dev[i].sector + STRIPE_SECTORS) {
3410
- struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
3614
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3615
+ struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
34113616
34123617 md_write_end(conf->mddev);
34133618 bio_io_error(bi);
....@@ -3430,9 +3635,9 @@
34303635 if (bi)
34313636 s->to_read--;
34323637 while (bi && bi->bi_iter.bi_sector <
3433
- sh->dev[i].sector + STRIPE_SECTORS) {
3638
+ sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
34343639 struct bio *nextbi =
3435
- r5_next_bio(bi, sh->dev[i].sector);
3640
+ r5_next_bio(conf, bi, sh->dev[i].sector);
34363641
34373642 bio_io_error(bi);
34383643 bi = nextbi;
....@@ -3440,7 +3645,7 @@
34403645 }
34413646 if (bitmap_end)
34423647 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3443
- STRIPE_SECTORS, 0, 0);
3648
+ RAID5_STRIPE_SECTORS(conf), 0, 0);
34443649 /* If we were in the middle of a write the parity block might
34453650 * still be locked - so just clear all R5_LOCKED flags
34463651 */
....@@ -3485,14 +3690,14 @@
34853690 && !test_bit(Faulty, &rdev->flags)
34863691 && !test_bit(In_sync, &rdev->flags)
34873692 && !rdev_set_badblocks(rdev, sh->sector,
3488
- STRIPE_SECTORS, 0))
3693
+ RAID5_STRIPE_SECTORS(conf), 0))
34893694 abort = 1;
34903695 rdev = rcu_dereference(conf->disks[i].replacement);
34913696 if (rdev
34923697 && !test_bit(Faulty, &rdev->flags)
34933698 && !test_bit(In_sync, &rdev->flags)
34943699 && !rdev_set_badblocks(rdev, sh->sector,
3495
- STRIPE_SECTORS, 0))
3700
+ RAID5_STRIPE_SECTORS(conf), 0))
34963701 abort = 1;
34973702 }
34983703 rcu_read_unlock();
....@@ -3500,7 +3705,7 @@
35003705 conf->recovery_disabled =
35013706 conf->mddev->recovery_disabled;
35023707 }
3503
- md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
3708
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
35043709 }
35053710
35063711 static int want_replace(struct stripe_head *sh, int disk_idx)
....@@ -3527,6 +3732,7 @@
35273732 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
35283733 &sh->dev[s->failed_num[1]] };
35293734 int i;
3735
+ bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
35303736
35313737
35323738 if (test_bit(R5_LOCKED, &dev->flags) ||
....@@ -3585,18 +3791,27 @@
35853791 * devices must be read.
35863792 */
35873793 return 1;
3794
+
3795
+ if (s->failed >= 2 &&
3796
+ (fdev[i]->towrite ||
3797
+ s->failed_num[i] == sh->pd_idx ||
3798
+ s->failed_num[i] == sh->qd_idx) &&
3799
+ !test_bit(R5_UPTODATE, &fdev[i]->flags))
3800
+ /* In max degraded raid6, If the failed disk is P, Q,
3801
+ * or we want to read the failed disk, we need to do
3802
+ * reconstruct-write.
3803
+ */
3804
+ force_rcw = true;
35883805 }
35893806
3590
- /* If we are forced to do a reconstruct-write, either because
3591
- * the current RAID6 implementation only supports that, or
3592
- * because parity cannot be trusted and we are currently
3593
- * recovering it, there is extra need to be careful.
3807
+ /* If we are forced to do a reconstruct-write, because parity
3808
+ * cannot be trusted and we are currently recovering it, there
3809
+ * is extra need to be careful.
35943810 * If one of the devices that we would need to read, because
35953811 * it is not being overwritten (and maybe not written at all)
35963812 * is missing/faulty, then we need to read everything we can.
35973813 */
3598
- if (sh->raid_conf->level != 6 &&
3599
- sh->raid_conf->rmw_level != PARITY_DISABLE_RMW &&
3814
+ if (!force_rcw &&
36003815 sh->sector < sh->raid_conf->mddev->recovery_cp)
36013816 /* reconstruct-write isn't being forced */
36023817 return 0;
....@@ -3700,7 +3915,7 @@
37003915 return 0;
37013916 }
37023917
3703
-/**
3918
+/*
37043919 * handle_stripe_fill - read or compute data to satisfy pending requests.
37053920 */
37063921 static void handle_stripe_fill(struct stripe_head *sh,
....@@ -3723,7 +3938,7 @@
37233938 * back cache (prexor with orig_page, and then xor with
37243939 * page) in the read path
37253940 */
3726
- if (s->injournal && s->failed) {
3941
+ if (s->to_read && s->injournal && s->failed) {
37273942 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
37283943 r5c_make_stripe_write_out(sh);
37293944 goto out;
....@@ -3775,14 +3990,14 @@
37753990 wbi = dev->written;
37763991 dev->written = NULL;
37773992 while (wbi && wbi->bi_iter.bi_sector <
3778
- dev->sector + STRIPE_SECTORS) {
3779
- wbi2 = r5_next_bio(wbi, dev->sector);
3993
+ dev->sector + RAID5_STRIPE_SECTORS(conf)) {
3994
+ wbi2 = r5_next_bio(conf, wbi, dev->sector);
37803995 md_write_end(conf->mddev);
37813996 bio_endio(wbi);
37823997 wbi = wbi2;
37833998 }
37843999 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3785
- STRIPE_SECTORS,
4000
+ RAID5_STRIPE_SECTORS(conf),
37864001 !test_bit(STRIPE_DEGRADED, &sh->state),
37874002 0);
37884003 if (head_sh->batch_head) {
....@@ -3966,10 +4181,8 @@
39664181 set_bit(R5_LOCKED, &dev->flags);
39674182 set_bit(R5_Wantread, &dev->flags);
39684183 s->locked++;
3969
- } else {
4184
+ } else
39704185 set_bit(STRIPE_DELAYED, &sh->state);
3971
- set_bit(STRIPE_HANDLE, &sh->state);
3972
- }
39734186 }
39744187 }
39754188 }
....@@ -3994,10 +4207,8 @@
39944207 set_bit(R5_Wantread, &dev->flags);
39954208 s->locked++;
39964209 qread++;
3997
- } else {
4210
+ } else
39984211 set_bit(STRIPE_DELAYED, &sh->state);
3999
- set_bit(STRIPE_HANDLE, &sh->state);
4000
- }
40014212 }
40024213 }
40034214 if (rcw && conf->mddev->queue)
....@@ -4047,7 +4258,7 @@
40474258 break;
40484259 }
40494260 dev = &sh->dev[s->failed_num[0]];
4050
- /* fall through */
4261
+ fallthrough;
40514262 case check_state_compute_result:
40524263 sh->check_state = check_state_idle;
40534264 if (!dev)
....@@ -4089,7 +4300,7 @@
40894300 */
40904301 set_bit(STRIPE_INSYNC, &sh->state);
40914302 else {
4092
- atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4303
+ atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
40934304 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
40944305 /* don't try to repair!! */
40954306 set_bit(STRIPE_INSYNC, &sh->state);
....@@ -4097,7 +4308,7 @@
40974308 "%llu-%llu\n", mdname(conf->mddev),
40984309 (unsigned long long) sh->sector,
40994310 (unsigned long long) sh->sector +
4100
- STRIPE_SECTORS);
4311
+ RAID5_STRIPE_SECTORS(conf));
41014312 } else {
41024313 sh->check_state = check_state_compute_run;
41034314 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
....@@ -4178,7 +4389,7 @@
41784389
41794390 /* we have 2-disk failure */
41804391 BUG_ON(s->failed != 2);
4181
- /* fall through */
4392
+ fallthrough;
41824393 case check_state_compute_result:
41834394 sh->check_state = check_state_idle;
41844395
....@@ -4254,7 +4465,7 @@
42544465 */
42554466 }
42564467 } else {
4257
- atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4468
+ atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
42584469 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
42594470 /* don't try to repair!! */
42604471 set_bit(STRIPE_INSYNC, &sh->state);
....@@ -4262,7 +4473,7 @@
42624473 "%llu-%llu\n", mdname(conf->mddev),
42634474 (unsigned long long) sh->sector,
42644475 (unsigned long long) sh->sector +
4265
- STRIPE_SECTORS);
4476
+ RAID5_STRIPE_SECTORS(conf));
42664477 } else {
42674478 int *target = &sh->ops.target;
42684479
....@@ -4333,7 +4544,8 @@
43334544 /* place all the copies on one channel */
43344545 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
43354546 tx = async_memcpy(sh2->dev[dd_idx].page,
4336
- sh->dev[i].page, 0, 0, STRIPE_SIZE,
4547
+ sh->dev[i].page, sh2->dev[dd_idx].offset,
4548
+ sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
43374549 &submit);
43384550
43394551 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
....@@ -4432,8 +4644,8 @@
44324644 */
44334645 rdev = rcu_dereference(conf->disks[i].replacement);
44344646 if (rdev && !test_bit(Faulty, &rdev->flags) &&
4435
- rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
4436
- !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4647
+ rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4648
+ !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
44374649 &first_bad, &bad_sectors))
44384650 set_bit(R5_ReadRepl, &dev->flags);
44394651 else {
....@@ -4447,7 +4659,7 @@
44474659 if (rdev && test_bit(Faulty, &rdev->flags))
44484660 rdev = NULL;
44494661 if (rdev) {
4450
- is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4662
+ is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
44514663 &first_bad, &bad_sectors);
44524664 if (s->blocked_rdev == NULL
44534665 && (test_bit(Blocked, &rdev->flags)
....@@ -4474,7 +4686,7 @@
44744686 }
44754687 } else if (test_bit(In_sync, &rdev->flags))
44764688 set_bit(R5_Insync, &dev->flags);
4477
- else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
4689
+ else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
44784690 /* in sync if before recovery_offset */
44794691 set_bit(R5_Insync, &dev->flags);
44804692 else if (test_bit(R5_UPTODATE, &dev->flags) &&
....@@ -4563,12 +4775,12 @@
45634775 rcu_read_unlock();
45644776 }
45654777
4778
+/*
4779
+ * Return '1' if this is a member of batch, or '0' if it is a lone stripe or
4780
+ * a head which can now be handled.
4781
+ */
45664782 static int clear_batch_ready(struct stripe_head *sh)
45674783 {
4568
- /* Return '1' if this is a member of batch, or
4569
- * '0' if it is a lone stripe or a head which can now be
4570
- * handled.
4571
- */
45724784 struct stripe_head *tmp;
45734785 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
45744786 return (sh->batch_head && sh->batch_head != sh);
....@@ -4618,7 +4830,6 @@
46184830 (1 << STRIPE_FULL_WRITE) |
46194831 (1 << STRIPE_BIOFILL_RUN) |
46204832 (1 << STRIPE_COMPUTE_RUN) |
4621
- (1 << STRIPE_OPS_REQ_PENDING) |
46224833 (1 << STRIPE_DISCARD) |
46234834 (1 << STRIPE_BATCH_READY) |
46244835 (1 << STRIPE_BATCH_ERR) |
....@@ -4673,15 +4884,20 @@
46734884 struct r5dev *pdev, *qdev;
46744885
46754886 clear_bit(STRIPE_HANDLE, &sh->state);
4887
+
4888
+ /*
4889
+ * handle_stripe should not continue handle the batched stripe, only
4890
+ * the head of batch list or lone stripe can continue. Otherwise we
4891
+ * could see break_stripe_batch_list warns about the STRIPE_ACTIVE
4892
+ * is set for the batched stripe.
4893
+ */
4894
+ if (clear_batch_ready(sh))
4895
+ return;
4896
+
46764897 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
46774898 /* already being handled, ensure it gets handled
46784899 * again when current action finishes */
46794900 set_bit(STRIPE_HANDLE, &sh->state);
4680
- return;
4681
- }
4682
-
4683
- if (clear_batch_ready(sh) ) {
4684
- clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
46854901 return;
46864902 }
46874903
....@@ -4918,7 +5134,7 @@
49185134 if ((s.syncing || s.replacing) && s.locked == 0 &&
49195135 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
49205136 test_bit(STRIPE_INSYNC, &sh->state)) {
4921
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
5137
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
49225138 clear_bit(STRIPE_SYNCING, &sh->state);
49235139 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
49245140 wake_up(&conf->wait_for_overlap);
....@@ -4937,14 +5153,11 @@
49375153 if (!test_bit(R5_ReWrite, &dev->flags)) {
49385154 set_bit(R5_Wantwrite, &dev->flags);
49395155 set_bit(R5_ReWrite, &dev->flags);
4940
- set_bit(R5_LOCKED, &dev->flags);
4941
- s.locked++;
4942
- } else {
5156
+ } else
49435157 /* let's read it back */
49445158 set_bit(R5_Wantread, &dev->flags);
4945
- set_bit(R5_LOCKED, &dev->flags);
4946
- s.locked++;
4947
- }
5159
+ set_bit(R5_LOCKED, &dev->flags);
5160
+ s.locked++;
49485161 }
49495162 }
49505163
....@@ -4986,7 +5199,7 @@
49865199 clear_bit(STRIPE_EXPAND_READY, &sh->state);
49875200 atomic_dec(&conf->reshape_stripes);
49885201 wake_up(&conf->wait_for_overlap);
4989
- md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
5202
+ md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
49905203 }
49915204
49925205 if (s.expanding && s.locked == 0 &&
....@@ -5016,14 +5229,14 @@
50165229 /* We own a safe reference to the rdev */
50175230 rdev = conf->disks[i].rdev;
50185231 if (!rdev_set_badblocks(rdev, sh->sector,
5019
- STRIPE_SECTORS, 0))
5232
+ RAID5_STRIPE_SECTORS(conf), 0))
50205233 md_error(conf->mddev, rdev);
50215234 rdev_dec_pending(rdev, conf->mddev);
50225235 }
50235236 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
50245237 rdev = conf->disks[i].rdev;
50255238 rdev_clear_badblocks(rdev, sh->sector,
5026
- STRIPE_SECTORS, 0);
5239
+ RAID5_STRIPE_SECTORS(conf), 0);
50275240 rdev_dec_pending(rdev, conf->mddev);
50285241 }
50295242 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
....@@ -5032,7 +5245,7 @@
50325245 /* rdev have been moved down */
50335246 rdev = conf->disks[i].rdev;
50345247 rdev_clear_badblocks(rdev, sh->sector,
5035
- STRIPE_SECTORS, 0);
5248
+ RAID5_STRIPE_SECTORS(conf), 0);
50365249 rdev_dec_pending(rdev, conf->mddev);
50375250 }
50385251 }
....@@ -5088,28 +5301,6 @@
50885301 hash = sh->hash_lock_index;
50895302 __release_stripe(conf, sh, &temp_inactive_list[hash]);
50905303 }
5091
-}
5092
-
5093
-static int raid5_congested(struct mddev *mddev, int bits)
5094
-{
5095
- struct r5conf *conf = mddev->private;
5096
-
5097
- /* No difference between reads and writes. Just check
5098
- * how busy the stripe_cache is
5099
- */
5100
-
5101
- if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
5102
- return 1;
5103
-
5104
- /* Also checks whether there is pressure on r5cache log space */
5105
- if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
5106
- return 1;
5107
- if (conf->quiesce)
5108
- return 1;
5109
- if (atomic_read(&conf->empty_inactive_list_nr))
5110
- return 1;
5111
-
5112
- return 0;
51135304 }
51145305
51155306 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
....@@ -5257,7 +5448,6 @@
52575448 rcu_read_unlock();
52585449 raid_bio->bi_next = (void*)rdev;
52595450 bio_set_dev(align_bi, rdev->bdev);
5260
- bio_clear_flag(align_bi, BIO_SEG_VALID);
52615451
52625452 if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
52635453 bio_sectors(align_bi),
....@@ -5281,7 +5471,7 @@
52815471 trace_block_bio_remap(align_bi->bi_disk->queue,
52825472 align_bi, disk_devt(mddev->gendisk),
52835473 raid_bio->bi_iter.bi_sector);
5284
- generic_make_request(align_bi);
5474
+ submit_bio_noacct(align_bi);
52855475 return 1;
52865476 } else {
52875477 rcu_read_unlock();
....@@ -5301,7 +5491,7 @@
53015491 struct r5conf *conf = mddev->private;
53025492 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
53035493 bio_chain(split, raid_bio);
5304
- generic_make_request(raid_bio);
5494
+ submit_bio_noacct(raid_bio);
53055495 raid_bio = split;
53065496 }
53075497
....@@ -5497,8 +5687,8 @@
54975687 /* Skip discard while reshape is happening */
54985688 return;
54995689
5500
- logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5501
- last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
5690
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
5691
+ last_sector = bio_end_sector(bi);
55025692
55035693 bi->bi_next = NULL;
55045694
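With the stripe size no longer a compile-time constant, the mask above is built from RAID5_STRIPE_SECTORS(conf) instead of STRIPE_SECTORS; rounding with a mask only works because the stripe size is kept a power of two. A short stand-alone sketch of the same sector arithmetic, with made-up sizes:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

int main(void)
{
	unsigned int stripe_size    = 16384;		/* e.g. 16K stripes on a 64K-page machine */
	unsigned int stripe_sectors = stripe_size >> 9;	/* 512-byte sectors per stripe */
	sector_t bi_sector   = 100005;			/* bio start, in sectors */
	sector_t last_sector = bi_sector + 128;		/* bio end, in sectors */

	/* round down to a stripe boundary (power-of-two stripe_sectors) */
	sector_t logical_sector = bi_sector & ~((sector_t)stripe_sectors - 1);

	for (; logical_sector < last_sector; logical_sector += stripe_sectors)
		printf("stripe starting at sector %llu\n",
		       (unsigned long long)logical_sector);
	return 0;
}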
....@@ -5512,7 +5702,7 @@
55125702 last_sector *= conf->chunk_sectors;
55135703
55145704 for (; logical_sector < last_sector;
5515
- logical_sector += STRIPE_SECTORS) {
5705
+ logical_sector += RAID5_STRIPE_SECTORS(conf)) {
55165706 DEFINE_WAIT(w);
55175707 int d;
55185708 again:
....@@ -5557,7 +5747,7 @@
55575747 d++)
55585748 md_bitmap_startwrite(mddev->bitmap,
55595749 sh->sector,
5560
- STRIPE_SECTORS,
5750
+ RAID5_STRIPE_SECTORS(conf),
55615751 0);
55625752 sh->bm_seq = conf->seq_flush + 1;
55635753 set_bit(STRIPE_BIT_DELAY, &sh->state);
....@@ -5622,12 +5812,12 @@
56225812 return true;
56235813 }
56245814
5625
- logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5815
+ logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
56265816 last_sector = bio_end_sector(bi);
56275817 bi->bi_next = NULL;
56285818
56295819 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5630
- for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
5820
+ for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
56315821 int previous;
56325822 int seq;
56335823
....@@ -5725,8 +5915,7 @@
57255915 do_flush = false;
57265916 }
57275917
5728
- if (!sh->batch_head || sh == sh->batch_head)
5729
- set_bit(STRIPE_HANDLE, &sh->state);
5918
+ set_bit(STRIPE_HANDLE, &sh->state);
57305919 clear_bit(STRIPE_DELAYED, &sh->state);
57315920 if ((!sh->batch_head || sh == sh->batch_head) &&
57325921 (bi->bi_opf & REQ_SYNC) &&
....@@ -5791,7 +5980,7 @@
57915980 sector_div(sector_nr, new_data_disks);
57925981 if (sector_nr) {
57935982 mddev->curr_resync_completed = sector_nr;
5794
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5983
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
57955984 *skipped = 1;
57965985 retn = sector_nr;
57975986 goto finish;
....@@ -5905,11 +6094,11 @@
59056094 conf->reshape_safe = mddev->reshape_position;
59066095 spin_unlock_irq(&conf->device_lock);
59076096 wake_up(&conf->wait_for_overlap);
5908
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6097
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
59096098 }
59106099
59116100 INIT_LIST_HEAD(&stripes);
5912
- for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
6101
+ for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
59136102 int j;
59146103 int skipped_disk = 0;
59156104 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
....@@ -5930,7 +6119,7 @@
59306119 skipped_disk = 1;
59316120 continue;
59326121 }
5933
- memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
6122
+ memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
59346123 set_bit(R5_Expanded, &sh->dev[j].flags);
59356124 set_bit(R5_UPTODATE, &sh->dev[j].flags);
59366125 }
....@@ -5965,7 +6154,7 @@
59656154 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
59666155 set_bit(STRIPE_HANDLE, &sh->state);
59676156 raid5_release_stripe(sh);
5968
- first_sector += STRIPE_SECTORS;
6157
+ first_sector += RAID5_STRIPE_SECTORS(conf);
59696158 }
59706159 /* Now that the sources are clearly marked, we can release
59716160 * the destination stripes
....@@ -6012,7 +6201,7 @@
60126201 conf->reshape_safe = mddev->reshape_position;
60136202 spin_unlock_irq(&conf->device_lock);
60146203 wake_up(&conf->wait_for_overlap);
6015
- sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6204
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
60166205 }
60176206 ret:
60186207 return retn;
....@@ -6071,11 +6260,12 @@
60716260 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
60726261 !conf->fullsync &&
60736262 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
6074
- sync_blocks >= STRIPE_SECTORS) {
6263
+ sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
60756264 /* we can skip this block, and probably more */
6076
- sync_blocks /= STRIPE_SECTORS;
6265
+ do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
60776266 *skipped = 1;
6078
- return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
6267
+ /* keep things rounded to whole stripes */
6268
+ return sync_blocks * RAID5_STRIPE_SECTORS(conf);
60796269 }
60806270
60816271 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
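sync_blocks is a 64-bit sector count and the divisor is now a run-time value, so the plain division above became do_div(), the kernel helper that divides in place and returns the remainder (a plain 64-by-32 '/' with a variable divisor would drag in a libgcc division helper that kernel builds avoid). A user-space stand-in with the same shape, using invented numbers:

#include <stdint.h>
#include <stdio.h>

/* quotient is left in *n, remainder is returned, mirroring do_div() */
static uint32_t div64_in_place(uint64_t *n, uint32_t base)
{
	uint32_t rem = (uint32_t)(*n % base);

	*n /= base;
	return rem;
}

int main(void)
{
	uint64_t sync_blocks = 100003;	/* blocks the bitmap says are in sync */
	uint32_t stripe_sectors = 8;	/* RAID5_STRIPE_SECTORS for 4K stripes */

	div64_in_place(&sync_blocks, stripe_sectors);
	/* keep things rounded to whole stripes, as the driver does */
	printf("can skip %llu sectors\n",
	       (unsigned long long)(sync_blocks * stripe_sectors));
	return 0;
}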
....@@ -6108,7 +6298,7 @@
61086298
61096299 raid5_release_stripe(sh);
61106300
6111
- return STRIPE_SECTORS;
6301
+ return RAID5_STRIPE_SECTORS(conf);
61126302 }
61136303
61146304 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
....@@ -6131,14 +6321,14 @@
61316321 int handled = 0;
61326322
61336323 logical_sector = raid_bio->bi_iter.bi_sector &
6134
- ~((sector_t)STRIPE_SECTORS-1);
6324
+ ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
61356325 sector = raid5_compute_sector(conf, logical_sector,
61366326 0, &dd_idx, NULL);
61376327 last_sector = bio_end_sector(raid_bio);
61386328
61396329 for (; logical_sector < last_sector;
6140
- logical_sector += STRIPE_SECTORS,
6141
- sector += STRIPE_SECTORS,
6330
+ logical_sector += RAID5_STRIPE_SECTORS(conf),
6331
+ sector += RAID5_STRIPE_SECTORS(conf),
61426332 scnt++) {
61436333
61446334 if (scnt < offset)
....@@ -6177,6 +6367,8 @@
61776367 static int handle_active_stripes(struct r5conf *conf, int group,
61786368 struct r5worker *worker,
61796369 struct list_head *temp_inactive_list)
6370
+ __releases(&conf->device_lock)
6371
+ __acquires(&conf->device_lock)
61806372 {
61816373 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
61826374 int i, batch_size = 0, hash;
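The two annotations added above are sparse hints: handle_active_stripes() is entered with conf->device_lock held, drops it while it submits a batch, and re-takes it before returning. Outside a sparse run they compile away to nothing, roughly as in this simplified stand-alone sketch (a pthread mutex stands in for the spinlock, and the macro bodies are abbreviated):

#include <pthread.h>

/* simplified stand-ins; only sparse (__CHECKER__) sees real attributes */
#ifdef __CHECKER__
#define __releases(x)	__attribute__((context(x, 1, 0)))
#define __acquires(x)	__attribute__((context(x, 0, 1)))
#else
#define __releases(x)
#define __acquires(x)
#endif

static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;

/* caller holds device_lock; drop it for the slow work, retake it after */
static int batch_work(void)
	__releases(&device_lock)
	__acquires(&device_lock)
{
	pthread_mutex_unlock(&device_lock);
	/* ... handle a batch of stripes without the lock held ... */
	pthread_mutex_lock(&device_lock);
	return 0;
}

int main(void)
{
	pthread_mutex_lock(&device_lock);
	batch_work();
	pthread_mutex_unlock(&device_lock);
	return 0;
}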
....@@ -6329,7 +6521,18 @@
63296521 spin_unlock_irq(&conf->device_lock);
63306522 md_check_recovery(mddev);
63316523 spin_lock_irq(&conf->device_lock);
6524
+
6525
+ /*
6526
+ * Waiting on MD_SB_CHANGE_PENDING below may deadlock,
6527
+ * since md_check_recovery() is needed to clear
6528
+ * the flag when using mdmon.
6529
+ */
6530
+ continue;
63326531 }
6532
+
6533
+ wait_event_lock_irq(mddev->sb_wait,
6534
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6535
+ conf->device_lock);
63336536 }
63346537 pr_debug("%d stripes handled\n", handled);
63356538
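The added wait parks raid5d until some other context clears MD_SB_CHANGE_PENDING, releasing conf->device_lock while it sleeps, and the new 'continue' skips that wait on the iteration where md_check_recovery() has just been given a chance to clear the flag itself. wait_event_lock_irq() has no direct user-space counterpart; the condition-variable sketch below only mirrors the shape, and every name in it is invented:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t sb_wait = PTHREAD_COND_INITIALIZER;
static bool sb_change_pending = true;

/* the daemon side: sleep with the lock released until the flag clears */
static void *daemon_side(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&device_lock);
	while (sb_change_pending)
		pthread_cond_wait(&sb_wait, &device_lock);
	printf("superblock settled, back to handling stripes\n");
	pthread_mutex_unlock(&device_lock);
	return NULL;
}

/* the superblock-writer side: clear the flag and wake the waiter */
static void superblock_written(void)
{
	pthread_mutex_lock(&device_lock);
	sb_change_pending = false;
	pthread_cond_broadcast(&sb_wait);
	pthread_mutex_unlock(&device_lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, daemon_side, NULL);
	superblock_written();
	pthread_join(t, NULL);
	return 0;
}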
....@@ -6469,6 +6672,100 @@
64696672 raid5_show_rmw_level,
64706673 raid5_store_rmw_level);
64716674
6675
+static ssize_t
6676
+raid5_show_stripe_size(struct mddev *mddev, char *page)
6677
+{
6678
+ struct r5conf *conf;
6679
+ int ret = 0;
6680
+
6681
+ spin_lock(&mddev->lock);
6682
+ conf = mddev->private;
6683
+ if (conf)
6684
+ ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
6685
+ spin_unlock(&mddev->lock);
6686
+ return ret;
6687
+}
6688
+
6689
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6690
+static ssize_t
6691
+raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
6692
+{
6693
+ struct r5conf *conf;
6694
+ unsigned long new;
6695
+ int err;
6696
+ int size;
6697
+
6698
+ if (len >= PAGE_SIZE)
6699
+ return -EINVAL;
6700
+ if (kstrtoul(page, 10, &new))
6701
+ return -EINVAL;
6702
+
6703
+ /*
6704
+ * The value must not be bigger than PAGE_SIZE. It must be a
6705
+ * multiple of DEFAULT_STRIPE_SIZE and a power
6706
+ * of two.
6707
+ */
6708
+ if (new % DEFAULT_STRIPE_SIZE != 0 ||
6709
+ new > PAGE_SIZE || new == 0 ||
6710
+ new != roundup_pow_of_two(new))
6711
+ return -EINVAL;
6712
+
6713
+ err = mddev_lock(mddev);
6714
+ if (err)
6715
+ return err;
6716
+
6717
+ conf = mddev->private;
6718
+ if (!conf) {
6719
+ err = -ENODEV;
6720
+ goto out_unlock;
6721
+ }
6722
+
6723
+ if (new == conf->stripe_size)
6724
+ goto out_unlock;
6725
+
6726
+ pr_debug("md/raid: change stripe_size from %lu to %lu\n",
6727
+ conf->stripe_size, new);
6728
+
6729
+ if (mddev->sync_thread ||
6730
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6731
+ mddev->reshape_position != MaxSector ||
6732
+ mddev->sysfs_active) {
6733
+ err = -EBUSY;
6734
+ goto out_unlock;
6735
+ }
6736
+
6737
+ mddev_suspend(mddev);
6738
+ mutex_lock(&conf->cache_size_mutex);
6739
+ size = conf->max_nr_stripes;
6740
+
6741
+ shrink_stripes(conf);
6742
+
6743
+ conf->stripe_size = new;
6744
+ conf->stripe_shift = ilog2(new) - 9;
6745
+ conf->stripe_sectors = new >> 9;
6746
+ if (grow_stripes(conf, size)) {
6747
+ pr_warn("md/raid:%s: couldn't allocate buffers\n",
6748
+ mdname(mddev));
6749
+ err = -ENOMEM;
6750
+ }
6751
+ mutex_unlock(&conf->cache_size_mutex);
6752
+ mddev_resume(mddev);
6753
+
6754
+out_unlock:
6755
+ mddev_unlock(mddev);
6756
+ return err ?: len;
6757
+}
6758
+
6759
+static struct md_sysfs_entry
6760
+raid5_stripe_size = __ATTR(stripe_size, 0644,
6761
+ raid5_show_stripe_size,
6762
+ raid5_store_stripe_size);
6763
+#else
6764
+static struct md_sysfs_entry
6765
+raid5_stripe_size = __ATTR(stripe_size, 0444,
6766
+ raid5_show_stripe_size,
6767
+ NULL);
6768
+#endif
64726769
64736770 static ssize_t
64746771 raid5_show_preread_threshold(struct mddev *mddev, char *page)
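The new store handler accepts a stripe_size only if it is non-zero, no bigger than PAGE_SIZE, a multiple of DEFAULT_STRIPE_SIZE and a power of two, and it then re-derives stripe_shift and stripe_sectors before regrowing the cache. The same arithmetic, pulled out into a stand-alone sketch (the 64K page size and 4K default below are example values):

#include <stdio.h>

#define EX_PAGE_SIZE		65536UL	/* example: 64K pages */
#define EX_DEFAULT_STRIPE_SIZE	4096UL

static int ilog2_ul(unsigned long v)
{
	int l = -1;

	while (v) {
		v >>= 1;
		l++;
	}
	return l;
}

static int stripe_size_valid(unsigned long new_size)
{
	return new_size != 0 &&
	       new_size <= EX_PAGE_SIZE &&
	       new_size % EX_DEFAULT_STRIPE_SIZE == 0 &&
	       (new_size & (new_size - 1)) == 0;	/* power of two */
}

int main(void)
{
	unsigned long candidates[] = { 4096, 8192, 12288, 16384, 65536, 131072 };
	unsigned int i;

	for (i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++) {
		unsigned long new_size = candidates[i];

		if (!stripe_size_valid(new_size)) {
			printf("%lu: rejected\n", new_size);
			continue;
		}
		printf("%lu: stripe_shift=%d stripe_sectors=%lu\n",
		       new_size, ilog2_ul(new_size) - 9, new_size >> 9);
	}
	return 0;
}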
....@@ -6548,14 +6845,14 @@
65486845 if (!conf)
65496846 err = -ENODEV;
65506847 else if (new != conf->skip_copy) {
6848
+ struct request_queue *q = mddev->queue;
6849
+
65516850 mddev_suspend(mddev);
65526851 conf->skip_copy = new;
65536852 if (new)
6554
- mddev->queue->backing_dev_info->capabilities |=
6555
- BDI_CAP_STABLE_WRITES;
6853
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
65566854 else
6557
- mddev->queue->backing_dev_info->capabilities &=
6558
- ~BDI_CAP_STABLE_WRITES;
6855
+ blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
65596856 mddev_resume(mddev);
65606857 }
65616858 mddev_unlock(mddev);
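skip_copy makes the driver compute parity straight from the caller's pages instead of first copying them into the stripe cache, so those pages must not change while the write is in flight; that requirement is now expressed with the QUEUE_FLAG_STABLE_WRITES queue flag rather than the old backing_dev_info capability bit. The toy XOR example below shows what breaks when a buffer is modified after parity was computed from it (sizes and fill patterns are arbitrary):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NDATA 3
#define CHUNK 8

int main(void)
{
	uint8_t data[NDATA][CHUNK], parity[CHUNK], rebuilt[CHUNK];
	int d, i;

	for (d = 0; d < NDATA; d++)
		memset(data[d], 0x11 * (d + 1), CHUNK);

	/* parity computed directly from the live buffers (the skip_copy case) */
	memset(parity, 0, CHUNK);
	for (d = 0; d < NDATA; d++)
		for (i = 0; i < CHUNK; i++)
			parity[i] ^= data[d][i];

	/* the page changes while the write is still in flight... */
	data[1][0] ^= 0xff;

	/* ...and device 0 can no longer be reconstructed from the others */
	for (i = 0; i < CHUNK; i++)
		rebuilt[i] = parity[i] ^ data[1][i] ^ data[2][i];

	printf("rebuilt 0x%02x, expected 0x%02x\n", rebuilt[0], data[0][0]);
	return 0;
}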
....@@ -6595,7 +6892,6 @@
65956892
65966893 static int alloc_thread_groups(struct r5conf *conf, int cnt,
65976894 int *group_cnt,
6598
- int *worker_cnt_per_group,
65996895 struct r5worker_group **worker_groups);
66006896 static ssize_t
66016897 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
....@@ -6604,7 +6900,7 @@
66046900 unsigned int new;
66056901 int err;
66066902 struct r5worker_group *new_groups, *old_groups;
6607
- int group_cnt, worker_cnt_per_group;
6903
+ int group_cnt;
66086904
66096905 if (len >= PAGE_SIZE)
66106906 return -EINVAL;
....@@ -6627,13 +6923,11 @@
66276923 if (old_groups)
66286924 flush_workqueue(raid5_wq);
66296925
6630
- err = alloc_thread_groups(conf, new,
6631
- &group_cnt, &worker_cnt_per_group,
6632
- &new_groups);
6926
+ err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
66336927 if (!err) {
66346928 spin_lock_irq(&conf->device_lock);
66356929 conf->group_cnt = group_cnt;
6636
- conf->worker_cnt_per_group = worker_cnt_per_group;
6930
+ conf->worker_cnt_per_group = new;
66376931 conf->worker_groups = new_groups;
66386932 spin_unlock_irq(&conf->device_lock);
66396933
....@@ -6660,7 +6954,9 @@
66606954 &raid5_group_thread_cnt.attr,
66616955 &raid5_skip_copy.attr,
66626956 &raid5_rmw_level.attr,
6957
+ &raid5_stripe_size.attr,
66636958 &r5c_journal_mode.attr,
6959
+ &ppl_write_hint.attr,
66646960 NULL,
66656961 };
66666962 static struct attribute_group raid5_attrs_group = {
....@@ -6668,16 +6964,13 @@
66686964 .attrs = raid5_attrs,
66696965 };
66706966
6671
-static int alloc_thread_groups(struct r5conf *conf, int cnt,
6672
- int *group_cnt,
6673
- int *worker_cnt_per_group,
6967
+static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
66746968 struct r5worker_group **worker_groups)
66756969 {
66766970 int i, j, k;
66776971 ssize_t size;
66786972 struct r5worker *workers;
66796973
6680
- *worker_cnt_per_group = cnt;
66816974 if (cnt == 0) {
66826975 *group_cnt = 0;
66836976 *worker_groups = NULL;
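alloc_thread_groups() no longer reports the per-group worker count back through an extra pointer; the caller now records the value it asked for. The allocation itself is one flat array of workers sliced into groups, roughly as in this stand-alone sketch (the counts are examples; in the driver the number of groups follows the number of NUMA nodes):

#include <stdio.h>
#include <stdlib.h>

struct toy_worker { int id; };
struct toy_worker_group { struct toy_worker *workers; };

/* cnt workers per group, group_cnt groups, one flat allocation */
static int toy_alloc_groups(int cnt, int group_cnt,
			    struct toy_worker_group **out_groups)
{
	struct toy_worker_group *groups;
	struct toy_worker *workers;
	int i, j;

	if (cnt == 0) {
		*out_groups = NULL;
		return 0;
	}
	groups = calloc(group_cnt, sizeof(*groups));
	workers = calloc((size_t)group_cnt * cnt, sizeof(*workers));
	if (!groups || !workers) {
		free(groups);
		free(workers);
		return -1;
	}
	for (i = 0; i < group_cnt; i++) {
		groups[i].workers = workers + i * cnt;
		for (j = 0; j < cnt; j++)
			groups[i].workers[j].id = i * cnt + j;
	}
	*out_groups = groups;
	return 0;
}

int main(void)
{
	struct toy_worker_group *groups;

	if (toy_alloc_groups(2 /* per group */, 4 /* groups */, &groups))
		return 1;
	printf("group 3, worker 1 has id %d\n", groups[3].workers[1].id);
	free(groups[0].workers);
	free(groups);
	return 0;
}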
....@@ -6743,25 +7036,25 @@
67437036 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
67447037 {
67457038 safe_put_page(percpu->spare_page);
6746
- if (percpu->scribble)
6747
- flex_array_free(percpu->scribble);
67487039 percpu->spare_page = NULL;
7040
+ kvfree(percpu->scribble);
67497041 percpu->scribble = NULL;
67507042 }
67517043
67527044 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
67537045 {
6754
- if (conf->level == 6 && !percpu->spare_page)
7046
+ if (conf->level == 6 && !percpu->spare_page) {
67557047 percpu->spare_page = alloc_page(GFP_KERNEL);
6756
- if (!percpu->scribble)
6757
- percpu->scribble = scribble_alloc(max(conf->raid_disks,
6758
- conf->previous_raid_disks),
6759
- max(conf->chunk_sectors,
6760
- conf->prev_chunk_sectors)
6761
- / STRIPE_SECTORS,
6762
- GFP_KERNEL);
7048
+ if (!percpu->spare_page)
7049
+ return -ENOMEM;
7050
+ }
67637051
6764
- if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
7052
+ if (scribble_alloc(percpu,
7053
+ max(conf->raid_disks,
7054
+ conf->previous_raid_disks),
7055
+ max(conf->chunk_sectors,
7056
+ conf->prev_chunk_sectors)
7057
+ / RAID5_STRIPE_SECTORS(conf))) {
67657058 free_scratch_buffer(conf, percpu);
67667059 return -ENOMEM;
67677060 }
....@@ -6816,6 +7109,7 @@
68167109 __func__, cpu);
68177110 return -ENOMEM;
68187111 }
7112
+ spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
68197113 return 0;
68207114 }
68217115
....@@ -6877,7 +7171,7 @@
68777171 struct disk_info *disk;
68787172 char pers_name[6];
68797173 int i;
6880
- int group_cnt, worker_cnt_per_group;
7174
+ int group_cnt;
68817175 struct r5worker_group *new_group;
68827176 int ret;
68837177
....@@ -6913,6 +7207,12 @@
69137207 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
69147208 if (conf == NULL)
69157209 goto abort;
7210
+
7211
+#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7212
+ conf->stripe_size = DEFAULT_STRIPE_SIZE;
7213
+ conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
7214
+ conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
7215
+#endif
69167216 INIT_LIST_HEAD(&conf->free_list);
69177217 INIT_LIST_HEAD(&conf->pending_list);
69187218 conf->pending_data = kcalloc(PENDING_IO_MAX,
....@@ -6923,15 +7223,14 @@
69237223 for (i = 0; i < PENDING_IO_MAX; i++)
69247224 list_add(&conf->pending_data[i].sibling, &conf->free_list);
69257225 /* Don't enable multi-threading by default*/
6926
- if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
6927
- &new_group)) {
7226
+ if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) {
69287227 conf->group_cnt = group_cnt;
6929
- conf->worker_cnt_per_group = worker_cnt_per_group;
7228
+ conf->worker_cnt_per_group = 0;
69307229 conf->worker_groups = new_group;
69317230 } else
69327231 goto abort;
69337232 spin_lock_init(&conf->device_lock);
6934
- seqcount_init(&conf->gen_lock);
7233
+ seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
69357234 mutex_init(&conf->cache_size_mutex);
69367235 init_waitqueue_head(&conf->wait_for_quiescent);
69377236 init_waitqueue_head(&conf->wait_for_stripe);
....@@ -7065,8 +7364,8 @@
70657364 conf->min_nr_stripes = NR_STRIPES;
70667365 if (mddev->reshape_position != MaxSector) {
70677366 int stripes = max_t(int,
7068
- ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
7069
- ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
7367
+ ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
7368
+ ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
70707369 conf->min_nr_stripes = max(NR_STRIPES, stripes);
70717370 if (conf->min_nr_stripes != NR_STRIPES)
70727371 pr_info("md/raid:%s: force stripe size %d for reshape\n",
....@@ -7139,6 +7438,12 @@
71397438 return 1;
71407439 }
71417440 return 0;
7441
+}
7442
+
7443
+static void raid5_set_io_opt(struct r5conf *conf)
7444
+{
7445
+ blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
7446
+ (conf->raid_disks - conf->max_degraded));
71427447 }
71437448
71447449 static int raid5_run(struct mddev *mddev)
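raid5_set_io_opt() gathers the optimal-I/O-size hint in one place: io_min stays one chunk, io_opt becomes one full data stripe, i.e. the chunk size times the number of data disks. The arithmetic, with an example geometry:

#include <stdio.h>

int main(void)
{
	unsigned int chunk_sectors = 1024;	/* 512K chunks */
	int raid_disks = 6, max_degraded = 1;	/* RAID5 across six devices */

	unsigned int io_min = chunk_sectors << 9;
	unsigned int io_opt = io_min * (raid_disks - max_degraded);

	printf("io_min=%u bytes, io_opt=%u bytes\n", io_min, io_opt);
	return 0;
}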
....@@ -7425,13 +7730,10 @@
74257730 int data_disks = conf->previous_raid_disks - conf->max_degraded;
74267731 int stripe = data_disks *
74277732 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
7428
- if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
7429
- mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
74307733
74317734 chunk_size = mddev->chunk_sectors << 9;
74327735 blk_queue_io_min(mddev->queue, chunk_size);
7433
- blk_queue_io_opt(mddev->queue, chunk_size *
7434
- (conf->raid_disks - conf->max_degraded));
7736
+ raid5_set_io_opt(conf);
74357737 mddev->queue->limits.raid_partial_stripes_expensive = 1;
74367738 /*
74377739 * We can only discard a whole stripe. It doesn't make sense to
....@@ -7716,6 +8018,7 @@
77168018 */
77178019 if (rdev->saved_raid_disk >= 0 &&
77188020 rdev->saved_raid_disk >= first &&
8021
+ rdev->saved_raid_disk <= last &&
77198022 conf->disks[rdev->saved_raid_disk].rdev == NULL)
77208023 first = rdev->saved_raid_disk;
77218024
....@@ -7797,14 +8100,14 @@
77978100 * stripe_heads first.
77988101 */
77998102 struct r5conf *conf = mddev->private;
7800
- if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
8103
+ if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
78018104 > conf->min_nr_stripes ||
7802
- ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
8105
+ ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
78038106 > conf->min_nr_stripes) {
78048107 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
78058108 mdname(mddev),
78068109 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7807
- / STRIPE_SIZE)*4);
8110
+ / RAID5_STRIPE_SIZE(conf))*4);
78088111 return 0;
78098112 }
78108113 return 1;
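The reshape check wants four chunks' worth of stripe_heads for both the old and the new layout, and the divisor is now the per-array RAID5_STRIPE_SIZE(conf). A stand-alone version of the sizing test; the 256 floor assumes the usual NR_STRIPES default and the chunk sizes are examples:

#include <stdio.h>

static unsigned long stripes_needed(unsigned int chunk_sectors,
				    unsigned long stripe_size)
{
	return ((unsigned long)chunk_sectors << 9) / stripe_size * 4;
}

int main(void)
{
	unsigned long stripe_size = 4096;
	unsigned int chunk_sectors = 1024;	/* current: 512K chunks */
	unsigned int new_chunk_sectors = 2048;	/* target: 1M chunks */
	unsigned long min_nr_stripes = 256;	/* assumed NR_STRIPES floor */
	unsigned long need;

	need = stripes_needed(chunk_sectors, stripe_size);
	if (stripes_needed(new_chunk_sectors, stripe_size) > need)
		need = stripes_needed(new_chunk_sectors, stripe_size);

	printf("need %lu stripe_heads, cache floor is %lu: %s\n",
	       need, min_nr_stripes,
	       need > min_nr_stripes ? "grow stripe_cache_size first" : "ok");
	return 0;
}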
....@@ -7940,8 +8243,8 @@
79408243 else
79418244 rdev->recovery_offset = 0;
79428245
7943
- if (sysfs_link_rdev(mddev, rdev))
7944
- /* Failure here is OK */;
8246
+ /* Failure here is OK */
8247
+ sysfs_link_rdev(mddev, rdev);
79458248 }
79468249 } else if (rdev->raid_disk >= conf->previous_raid_disks
79478250 && !test_bit(Faulty, &rdev->flags)) {
....@@ -8015,16 +8318,8 @@
80158318 spin_unlock_irq(&conf->device_lock);
80168319 wake_up(&conf->wait_for_overlap);
80178320
8018
- /* read-ahead size must cover two whole stripes, which is
8019
- * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
8020
- */
8021
- if (conf->mddev->queue) {
8022
- int data_disks = conf->raid_disks - conf->max_degraded;
8023
- int stripe = data_disks * ((conf->chunk_sectors << 9)
8024
- / PAGE_SIZE);
8025
- if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
8026
- conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
8027
- }
8321
+ if (conf->mddev->queue)
8322
+ raid5_set_io_opt(conf);
80288323 }
80298324 }
80308325
....@@ -8136,7 +8431,7 @@
81368431 while (chunksect && (mddev->array_sectors & (chunksect-1)))
81378432 chunksect >>= 1;
81388433
8139
- if ((chunksect<<9) < STRIPE_SIZE)
8434
+ if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
81408435 /* array size does not allow a suitable chunk size */
81418436 return ERR_PTR(-EINVAL);
81428437
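This takeover path shrinks the candidate chunk, one power of two at a time, until it divides the array size, and the hunk above makes the final sanity check compare against the array's own stripe size. A stand-alone sketch of that loop; the array size is an example and the 64K starting point follows what the driver uses by default:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

int main(void)
{
	sector_t array_sectors = 7811072;	/* example raid1 size, in sectors */
	unsigned int chunksect = 64 * 2;	/* start at 64K, counted in sectors */
	unsigned long stripe_size = 4096;

	/* shrink the chunk until it evenly divides the array size */
	while (chunksect && (array_sectors & (chunksect - 1)))
		chunksect >>= 1;

	if ((chunksect << 9) < stripe_size)
		printf("array size does not allow a suitable chunk size\n");
	else
		printf("takeover chunk: %u sectors (%u KiB)\n",
		       chunksect, chunksect / 2);
	return 0;
}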
....@@ -8423,7 +8718,6 @@
84238718 .finish_reshape = raid5_finish_reshape,
84248719 .quiesce = raid5_quiesce,
84258720 .takeover = raid6_takeover,
8426
- .congested = raid5_congested,
84278721 .change_consistency_policy = raid5_change_consistency_policy,
84288722 };
84298723 static struct md_personality raid5_personality =
....@@ -8448,7 +8742,6 @@
84488742 .finish_reshape = raid5_finish_reshape,
84498743 .quiesce = raid5_quiesce,
84508744 .takeover = raid5_takeover,
8451
- .congested = raid5_congested,
84528745 .change_consistency_policy = raid5_change_consistency_policy,
84538746 };
84548747
....@@ -8474,7 +8767,6 @@
84748767 .finish_reshape = raid5_finish_reshape,
84758768 .quiesce = raid5_quiesce,
84768769 .takeover = raid4_takeover,
8477
- .congested = raid5_congested,
84788770 .change_consistency_policy = raid5_change_consistency_policy,
84798771 };
84808772