From 9d77db3c730780c8ef5ccd4b66403ff5675cfe4e Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Mon, 13 May 2024 10:30:14 +0000 Subject: [PATCH] modify sin led gpio --- kernel/drivers/md/raid5.c | 901 +++++++++++++++++++++++++++++++++++++------------------- 1 files changed, 595 insertions(+), 306 deletions(-) diff --git a/kernel/drivers/md/raid5.c b/kernel/drivers/md/raid5.c index c7bda4b..0dea4aa 100644 --- a/kernel/drivers/md/raid5.c +++ b/kernel/drivers/md/raid5.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * raid5.c : Multiple Devices driver for Linux * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman @@ -7,15 +8,6 @@ * RAID-4/5/6 management functions. * Thanks to Penguin Computing for making the RAID-6 development possible * by donating a test server! - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* @@ -44,6 +36,7 @@ */ #include <linux/blkdev.h> +#include <linux/delay.h> #include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/async_tx.h> @@ -54,7 +47,6 @@ #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/nodemask.h> -#include <linux/flex_array.h> #include <trace/events/block.h> #include <linux/list_sort.h> @@ -78,13 +70,13 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) { - int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; + int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK; return &conf->stripe_hashtbl[hash]; } -static inline int stripe_hash_locks_hash(sector_t sect) +static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect) { - return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; + return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK; } static inline void lock_device_hash_lock(struct r5conf *conf, int hash) @@ -457,13 +449,74 @@ return sh; } +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE +static void free_stripe_pages(struct stripe_head *sh) +{ + int i; + struct page *p; + + /* Have not allocate page pool */ + if (!sh->pages) + return; + + for (i = 0; i < sh->nr_pages; i++) { + p = sh->pages[i]; + if (p) + put_page(p); + sh->pages[i] = NULL; + } +} + +static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp) +{ + int i; + struct page *p; + + for (i = 0; i < sh->nr_pages; i++) { + /* The page have allocated. */ + if (sh->pages[i]) + continue; + + p = alloc_page(gfp); + if (!p) { + free_stripe_pages(sh); + return -ENOMEM; + } + sh->pages[i] = p; + } + return 0; +} + +static int +init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks) +{ + int nr_pages, cnt; + + if (sh->pages) + return 0; + + /* Each of the sh->dev[i] need one conf->stripe_size */ + cnt = PAGE_SIZE / conf->stripe_size; + nr_pages = (disks + cnt - 1) / cnt; + + sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!sh->pages) + return -ENOMEM; + sh->nr_pages = nr_pages; + sh->stripes_per_page = cnt; + return 0; +} +#endif + static void shrink_buffers(struct stripe_head *sh) { - struct page *p; int i; int num = sh->raid_conf->pool_size; +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE for (i = 0; i < num ; i++) { + struct page *p; + WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); p = sh->dev[i].page; if (!p) @@ -471,6 +524,11 @@ sh->dev[i].page = NULL; put_page(p); } +#else + for (i = 0; i < num; i++) + sh->dev[i].page = NULL; + free_stripe_pages(sh); /* Free pages */ +#endif } static int grow_buffers(struct stripe_head *sh, gfp_t gfp) @@ -478,6 +536,7 @@ int i; int num = sh->raid_conf->pool_size; +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE for (i = 0; i < num; i++) { struct page *page; @@ -486,8 +545,18 @@ } sh->dev[i].page = page; sh->dev[i].orig_page = page; + sh->dev[i].offset = 0; } +#else + if (alloc_stripe_pages(sh, gfp)) + return -ENOMEM; + for (i = 0; i < num; i++) { + sh->dev[i].page = raid5_get_dev_page(sh, i); + sh->dev[i].orig_page = sh->dev[i].page; + sh->dev[i].offset = raid5_get_page_offset(sh, i); + } +#endif return 0; } @@ -618,17 +687,17 @@ return degraded; } -static int has_failed(struct r5conf *conf) +static bool has_failed(struct r5conf *conf) { - int degraded; + int degraded = conf->mddev->degraded; - if (conf->mddev->reshape_position == MaxSector) - return conf->mddev->degraded > conf->max_degraded; + if (test_bit(MD_BROKEN, &conf->mddev->flags)) + return true; - degraded = raid5_calc_degraded(conf); - if (degraded > conf->max_degraded) - return 1; - return 0; + if (conf->mddev->reshape_position != MaxSector) + degraded = raid5_calc_degraded(conf); + + return degraded > conf->max_degraded; } struct stripe_head * @@ -636,7 +705,7 @@ int previous, int noblock, int noquiesce) { struct stripe_head *sh; - int hash = stripe_hash_locks_hash(sector); + int hash = stripe_hash_locks_hash(conf, sector); int inc_empty_inactive_list_flag; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); @@ -712,6 +781,8 @@ } static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) + __acquires(&sh1->stripe_lock) + __acquires(&sh2->stripe_lock) { if (sh1 > sh2) { spin_lock_irq(&sh2->stripe_lock); @@ -723,6 +794,8 @@ } static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) + __releases(&sh1->stripe_lock) + __releases(&sh2->stripe_lock) { spin_unlock(&sh1->stripe_lock); spin_unlock_irq(&sh2->stripe_lock); @@ -753,9 +826,9 @@ tmp_sec = sh->sector; if (!sector_div(tmp_sec, conf->chunk_sectors)) return; - head_sector = sh->sector - STRIPE_SECTORS; + head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf); - hash = stripe_hash_locks_hash(head_sector); + hash = stripe_hash_locks_hash(conf, head_sector); spin_lock_irq(conf->hash_locks + hash); head = __find_stripe(conf, head_sector, conf->generation); if (head && !atomic_inc_not_zero(&head->count)) { @@ -878,7 +951,7 @@ struct bio *bio; while ((bio = bio_list_pop(tmp))) - generic_make_request(bio); + submit_bio_noacct(bio); } static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b) @@ -1062,7 +1135,7 @@ test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; int bad_sectors; - int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, + int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors); if (!bad) break; @@ -1094,7 +1167,7 @@ if (rdev) { if (s->syncing || s->expanding || s->expanded || s->replacing) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); + md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf)); set_bit(STRIPE_IO_STARTED, &sh->state); @@ -1134,12 +1207,12 @@ else sh->dev[i].vec.bv_page = sh->dev[i].page; bi->bi_vcnt = 1; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; - bi->bi_io_vec[0].bv_offset = 0; - bi->bi_iter.bi_size = STRIPE_SIZE; + bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); + bi->bi_io_vec[0].bv_offset = sh->dev[i].offset; + bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); bi->bi_write_hint = sh->dev[i].write_hint; if (!rrdev) - sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; + sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; /* * If this is discard request, set bi_vcnt 0. We don't * want to confuse SCSI because SCSI will replace payload @@ -1156,12 +1229,12 @@ if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, bi); else - generic_make_request(bi); + submit_bio_noacct(bi); } if (rrdev) { if (s->syncing || s->expanding || s->expanded || s->replacing) - md_sync_acct(rrdev->bdev, STRIPE_SECTORS); + md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf)); set_bit(STRIPE_IO_STARTED, &sh->state); @@ -1188,11 +1261,11 @@ WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); sh->dev[i].rvec.bv_page = sh->dev[i].page; rbi->bi_vcnt = 1; - rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; - rbi->bi_io_vec[0].bv_offset = 0; - rbi->bi_iter.bi_size = STRIPE_SIZE; + rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); + rbi->bi_io_vec[0].bv_offset = sh->dev[i].offset; + rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); rbi->bi_write_hint = sh->dev[i].write_hint; - sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; + sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; /* * If this is discard request, set bi_vcnt 0. We don't * want to confuse SCSI because SCSI will replace payload @@ -1206,7 +1279,7 @@ if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, rbi); else - generic_make_request(rbi); + submit_bio_noacct(rbi); } if (!rdev && !rrdev) { if (op_is_write(op)) @@ -1231,7 +1304,7 @@ static struct dma_async_tx_descriptor * async_copy_data(int frombio, struct bio *bio, struct page **page, - sector_t sector, struct dma_async_tx_descriptor *tx, + unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx, struct stripe_head *sh, int no_skipcopy) { struct bio_vec bvl; @@ -1240,6 +1313,7 @@ int page_offset; struct async_submit_ctl submit; enum async_tx_flags flags = 0; + struct r5conf *conf = sh->raid_conf; if (bio->bi_iter.bi_sector >= sector) page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; @@ -1261,8 +1335,8 @@ len -= b_offset; } - if (len > 0 && page_offset + len > STRIPE_SIZE) - clen = STRIPE_SIZE - page_offset; + if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf)) + clen = RAID5_STRIPE_SIZE(conf) - page_offset; else clen = len; @@ -1270,17 +1344,17 @@ b_offset += bvl.bv_offset; bio_page = bvl.bv_page; if (frombio) { - if (sh->raid_conf->skip_copy && + if (conf->skip_copy && b_offset == 0 && page_offset == 0 && - clen == STRIPE_SIZE && + clen == RAID5_STRIPE_SIZE(conf) && !no_skipcopy) *page = bio_page; else - tx = async_memcpy(*page, bio_page, page_offset, + tx = async_memcpy(*page, bio_page, page_offset + poff, b_offset, clen, &submit); } else tx = async_memcpy(bio_page, *page, b_offset, - page_offset, clen, &submit); + page_offset + poff, clen, &submit); } /* chain the operations */ submit.depend_tx = tx; @@ -1297,6 +1371,7 @@ { struct stripe_head *sh = stripe_head_ref; int i; + struct r5conf *conf = sh->raid_conf; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1317,8 +1392,8 @@ rbi = dev->read; dev->read = NULL; while (rbi && rbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { - rbi2 = r5_next_bio(rbi, dev->sector); + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + rbi2 = r5_next_bio(conf, rbi, dev->sector); bio_endio(rbi); rbi = rbi2; } @@ -1335,6 +1410,7 @@ struct dma_async_tx_descriptor *tx = NULL; struct async_submit_ctl submit; int i; + struct r5conf *conf = sh->raid_conf; BUG_ON(sh->batch_head); pr_debug("%s: stripe %llu\n", __func__, @@ -1349,10 +1425,11 @@ dev->toread = NULL; spin_unlock_irq(&sh->stripe_lock); while (rbi && rbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { + dev->sector + RAID5_STRIPE_SECTORS(conf)) { tx = async_copy_data(0, rbi, &dev->page, + dev->offset, dev->sector, tx, sh, 0); - rbi = r5_next_bio(rbi, dev->sector); + rbi = r5_next_bio(conf, rbi, dev->sector); } } } @@ -1394,22 +1471,25 @@ } /* return a pointer to the address conversion region of the scribble buffer */ -static addr_conv_t *to_addr_conv(struct stripe_head *sh, - struct raid5_percpu *percpu, int i) +static struct page **to_addr_page(struct raid5_percpu *percpu, int i) { - void *addr; - - addr = flex_array_get(percpu->scribble, i); - return addr + sizeof(struct page *) * (sh->disks + 2); + return percpu->scribble + i * percpu->scribble_obj_size; } /* return a pointer to the address conversion region of the scribble buffer */ -static struct page **to_addr_page(struct raid5_percpu *percpu, int i) +static addr_conv_t *to_addr_conv(struct stripe_head *sh, + struct raid5_percpu *percpu, int i) { - void *addr; + return (void *) (to_addr_page(percpu, i) + sh->disks + 2); +} - addr = flex_array_get(percpu->scribble, i); - return addr; +/* + * Return a pointer to record offset address. + */ +static unsigned int * +to_addr_offs(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + return (unsigned int *) (to_addr_conv(sh, percpu, 0) + sh->disks + 2); } static struct dma_async_tx_descriptor * @@ -1417,9 +1497,11 @@ { int disks = sh->disks; struct page **xor_srcs = to_addr_page(percpu, 0); + unsigned int *off_srcs = to_addr_offs(sh, percpu); int target = sh->ops.target; struct r5dev *tgt = &sh->dev[target]; struct page *xor_dest = tgt->page; + unsigned int off_dest = tgt->offset; int count = 0; struct dma_async_tx_descriptor *tx; struct async_submit_ctl submit; @@ -1431,24 +1513,30 @@ __func__, (unsigned long long)sh->sector, target); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); - for (i = disks; i--; ) - if (i != target) + for (i = disks; i--; ) { + if (i != target) { + off_srcs[count] = sh->dev[i].offset; xor_srcs[count++] = sh->dev[i].page; + } + } atomic_inc(&sh->count); init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); + tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0], + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; } /* set_syndrome_sources - populate source buffers for gen_syndrome * @srcs - (struct page *) array of size sh->disks + * @offs - (unsigned int) array of offset for each page * @sh - stripe_head to parse * * Populates srcs in proper layout order for the stripe and returns the @@ -1457,6 +1545,7 @@ * is recorded in srcs[count+1]]. */ static int set_syndrome_sources(struct page **srcs, + unsigned int *offs, struct stripe_head *sh, int srctype) { @@ -1487,6 +1576,12 @@ srcs[slot] = sh->dev[i].orig_page; else srcs[slot] = sh->dev[i].page; + /* + * For R5_InJournal, PAGE_SIZE must be 4KB and will + * not shared page. In that case, dev[i].offset + * is 0. + */ + offs[slot] = sh->dev[i].offset; } i = raid6_next_disk(i, disks); } while (i != d0_idx); @@ -1499,12 +1594,14 @@ { int disks = sh->disks; struct page **blocks = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); int target; int qd_idx = sh->qd_idx; struct dma_async_tx_descriptor *tx; struct async_submit_ctl submit; struct r5dev *tgt; struct page *dest; + unsigned int dest_off; int i; int count; @@ -1523,30 +1620,34 @@ tgt = &sh->dev[target]; BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); dest = tgt->page; + dest_off = tgt->offset; atomic_inc(&sh->count); if (target == qd_idx) { - count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); + count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL); blocks[count] = NULL; /* regenerating p is not necessary */ BUG_ON(blocks[count+1] != dest); /* q should already be set */ init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + tx = async_gen_syndrome(blocks, offs, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } else { /* Compute any data- or p-drive using XOR */ count = 0; for (i = disks; i-- ; ) { if (i == target || i == qd_idx) continue; + offs[count] = sh->dev[i].offset; blocks[count++] = sh->dev[i].page; } init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); + tx = async_xor_offs(dest, dest_off, blocks, offs, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } return tx; @@ -1565,6 +1666,7 @@ struct r5dev *tgt2 = &sh->dev[target2]; struct dma_async_tx_descriptor *tx; struct page **blocks = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); struct async_submit_ctl submit; BUG_ON(sh->batch_head); @@ -1577,13 +1679,16 @@ /* we need to open-code set_syndrome_sources to handle the * slot number conversion for 'faila' and 'failb' */ - for (i = 0; i < disks ; i++) + for (i = 0; i < disks ; i++) { + offs[i] = 0; blocks[i] = NULL; + } count = 0; i = d0_idx; do { int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + offs[slot] = sh->dev[i].offset; blocks[slot] = sh->dev[i].page; if (i == target) @@ -1608,10 +1713,12 @@ init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - return async_gen_syndrome(blocks, 0, syndrome_disks+2, - STRIPE_SIZE, &submit); + return async_gen_syndrome(blocks, offs, syndrome_disks+2, + RAID5_STRIPE_SIZE(sh->raid_conf), + &submit); } else { struct page *dest; + unsigned int dest_off; int data_target; int qd_idx = sh->qd_idx; @@ -1625,22 +1732,26 @@ for (i = disks; i-- ; ) { if (i == data_target || i == qd_idx) continue; + offs[count] = sh->dev[i].offset; blocks[count++] = sh->dev[i].page; } dest = sh->dev[data_target].page; + dest_off = sh->dev[data_target].offset; init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, NULL, NULL, to_addr_conv(sh, percpu, 0)); - tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, + tx = async_xor_offs(dest, dest_off, blocks, offs, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); - count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); + count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_ALL); init_async_submit(&submit, ASYNC_TX_FENCE, tx, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - return async_gen_syndrome(blocks, 0, count+2, - STRIPE_SIZE, &submit); + return async_gen_syndrome(blocks, offs, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), + &submit); } } else { init_async_submit(&submit, ASYNC_TX_FENCE, NULL, @@ -1649,13 +1760,15 @@ if (failb == syndrome_disks) { /* We're missing D+P. */ return async_raid6_datap_recov(syndrome_disks+2, - STRIPE_SIZE, faila, - blocks, &submit); + RAID5_STRIPE_SIZE(sh->raid_conf), + faila, + blocks, offs, &submit); } else { /* We're missing D+D. */ return async_raid6_2data_recov(syndrome_disks+2, - STRIPE_SIZE, faila, failb, - blocks, &submit); + RAID5_STRIPE_SIZE(sh->raid_conf), + faila, failb, + blocks, offs, &submit); } } } @@ -1681,10 +1794,12 @@ { int disks = sh->disks; struct page **xor_srcs = to_addr_page(percpu, 0); + unsigned int *off_srcs = to_addr_offs(sh, percpu); int count = 0, pd_idx = sh->pd_idx, i; struct async_submit_ctl submit; /* existing parity data subtracted */ + unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset; struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; BUG_ON(sh->batch_head); @@ -1694,15 +1809,23 @@ for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Only process blocks that are known to be uptodate */ - if (test_bit(R5_InJournal, &dev->flags)) + if (test_bit(R5_InJournal, &dev->flags)) { + /* + * For this case, PAGE_SIZE must be equal to 4KB and + * page offset is zero. + */ + off_srcs[count] = dev->offset; xor_srcs[count++] = dev->orig_page; - else if (test_bit(R5_Wantdrain, &dev->flags)) + } else if (test_bit(R5_Wantdrain, &dev->flags)) { + off_srcs[count] = dev->offset; xor_srcs[count++] = dev->page; + } } init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; } @@ -1712,17 +1835,19 @@ struct dma_async_tx_descriptor *tx) { struct page **blocks = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); int count; struct async_submit_ctl submit; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); + count = set_syndrome_sources(blocks, offs, sh, SYNDROME_SRC_WANT_DRAIN); init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + tx = async_gen_syndrome(blocks, offs, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; } @@ -1763,7 +1888,7 @@ WARN_ON(dev->page != dev->orig_page); while (wbi && wbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { + dev->sector + RAID5_STRIPE_SECTORS(conf)) { if (wbi->bi_opf & REQ_FUA) set_bit(R5_WantFUA, &dev->flags); if (wbi->bi_opf & REQ_SYNC) @@ -1772,6 +1897,7 @@ set_bit(R5_Discard, &dev->flags); else { tx = async_copy_data(1, wbi, &dev->page, + dev->offset, dev->sector, tx, sh, r5c_is_writeback(conf->log)); if (dev->page != dev->orig_page && @@ -1781,7 +1907,7 @@ clear_bit(R5_OVERWRITE, &dev->flags); } } - wbi = r5_next_bio(wbi, dev->sector); + wbi = r5_next_bio(conf, wbi, dev->sector); } if (head_sh->batch_head) { @@ -1851,9 +1977,11 @@ { int disks = sh->disks; struct page **xor_srcs; + unsigned int *off_srcs; struct async_submit_ctl submit; int count, pd_idx = sh->pd_idx, i; struct page *xor_dest; + unsigned int off_dest; int prexor = 0; unsigned long flags; int j = 0; @@ -1878,24 +2006,31 @@ again: count = 0; xor_srcs = to_addr_page(percpu, j); + off_srcs = to_addr_offs(sh, percpu); /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { prexor = 1; + off_dest = off_srcs[count] = sh->dev[pd_idx].offset; xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (head_sh->dev[i].written || - test_bit(R5_InJournal, &head_sh->dev[i].flags)) + test_bit(R5_InJournal, &head_sh->dev[i].flags)) { + off_srcs[count] = dev->offset; xor_srcs[count++] = dev->page; + } } } else { xor_dest = sh->dev[pd_idx].page; + off_dest = sh->dev[pd_idx].offset; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (i != pd_idx) + if (i != pd_idx) { + off_srcs[count] = dev->offset; xor_srcs[count++] = dev->page; + } } } @@ -1921,9 +2056,11 @@ } if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); + tx = async_memcpy(xor_dest, xor_srcs[0], off_dest, off_srcs[0], + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + tx = async_xor_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); if (!last_stripe) { j++; sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -1938,6 +2075,7 @@ { struct async_submit_ctl submit; struct page **blocks; + unsigned int *offs; int count, i, j = 0; struct stripe_head *head_sh = sh; int last_stripe; @@ -1962,6 +2100,7 @@ again: blocks = to_addr_page(percpu, j); + offs = to_addr_offs(sh, percpu); if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { synflags = SYNDROME_SRC_WRITTEN; @@ -1971,7 +2110,7 @@ txflags = ASYNC_TX_ACK; } - count = set_syndrome_sources(blocks, sh, synflags); + count = set_syndrome_sources(blocks, offs, sh, synflags); last_stripe = !head_sh->batch_head || list_first_entry(&sh->batch_list, struct stripe_head, batch_list) == head_sh; @@ -1983,7 +2122,8 @@ } else init_async_submit(&submit, 0, tx, NULL, NULL, to_addr_conv(sh, percpu, j)); - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + tx = async_gen_syndrome(blocks, offs, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); if (!last_stripe) { j++; sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -2010,7 +2150,9 @@ int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; struct page *xor_dest; + unsigned int off_dest; struct page **xor_srcs = to_addr_page(percpu, 0); + unsigned int *off_srcs = to_addr_offs(sh, percpu); struct dma_async_tx_descriptor *tx; struct async_submit_ctl submit; int count; @@ -2022,16 +2164,20 @@ BUG_ON(sh->batch_head); count = 0; xor_dest = sh->dev[pd_idx].page; + off_dest = sh->dev[pd_idx].offset; + off_srcs[count] = off_dest; xor_srcs[count++] = xor_dest; for (i = disks; i--; ) { if (i == pd_idx || i == qd_idx) continue; + off_srcs[count] = sh->dev[i].offset; xor_srcs[count++] = sh->dev[i].page; } init_async_submit(&submit, 0, NULL, NULL, NULL, to_addr_conv(sh, percpu, 0)); - tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + tx = async_xor_val_offs(xor_dest, off_dest, xor_srcs, off_srcs, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &sh->ops.zero_sum_result, &submit); atomic_inc(&sh->count); @@ -2042,6 +2188,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) { struct page **srcs = to_addr_page(percpu, 0); + unsigned int *offs = to_addr_offs(sh, percpu); struct async_submit_ctl submit; int count; @@ -2049,15 +2196,16 @@ (unsigned long long)sh->sector, checkp); BUG_ON(sh->batch_head); - count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); + count = set_syndrome_sources(srcs, offs, sh, SYNDROME_SRC_ALL); if (!checkp) srcs[count] = NULL; atomic_inc(&sh->count); init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, sh, to_addr_conv(sh, percpu, 0)); - async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, - &sh->ops.zero_sum_result, percpu->spare_page, &submit); + async_syndrome_val(srcs, offs, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), + &sh->ops.zero_sum_result, percpu->spare_page, 0, &submit); } static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) @@ -2134,6 +2282,9 @@ static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) { +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + kfree(sh->pages); +#endif if (sh->ppl_page) __free_page(sh->ppl_page); kmem_cache_free(sc, sh); @@ -2167,9 +2318,15 @@ sh->ppl_page = alloc_page(gfp); if (!sh->ppl_page) { free_stripe(sc, sh); - sh = NULL; + return NULL; } } +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + if (init_stripe_shared_pages(sh, conf, disks)) { + free_stripe(sc, sh); + return NULL; + } +#endif } return sh; } @@ -2226,10 +2383,13 @@ } /** - * scribble_len - return the required size of the scribble region - * @num - total number of disks in the array + * scribble_alloc - allocate percpu scribble buffer for required size + * of the scribble region + * @percpu: from for_each_present_cpu() of the caller + * @num: total number of disks in the array + * @cnt: scribble objs count for required size of the scribble region * - * The size must be enough to contain: + * The scribble buffer size must be enough to contain: * 1/ a struct page pointer for each device in the array +2 * 2/ room to convert each entry in (1) to its corresponding dma * (dma_map_page()) or page (page_address()) address. @@ -2238,21 +2398,29 @@ * calculate over all devices (not just the data blocks), using zeros in place * of the P and Q blocks. */ -static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) +static int scribble_alloc(struct raid5_percpu *percpu, + int num, int cnt) { - struct flex_array *ret; - size_t len; + size_t obj_size = + sizeof(struct page *) * (num + 2) + + sizeof(addr_conv_t) * (num + 2) + + sizeof(unsigned int) * (num + 2); + void *scribble; - len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); - ret = flex_array_alloc(len, cnt, flags); - if (!ret) - return NULL; - /* always prealloc all elements, so no locking is required */ - if (flex_array_prealloc(ret, 0, cnt, flags)) { - flex_array_free(ret); - return NULL; - } - return ret; + /* + * If here is in raid array suspend context, it is in memalloc noio + * context as well, there is no potential recursive memory reclaim + * I/Os with the GFP_KERNEL flag. + */ + scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL); + if (!scribble) + return -ENOMEM; + + kvfree(percpu->scribble); + + percpu->scribble = scribble; + percpu->scribble_obj_size = obj_size; + return 0; } static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) @@ -2270,23 +2438,17 @@ return 0; mddev_suspend(conf->mddev); get_online_cpus(); + for_each_present_cpu(cpu) { struct raid5_percpu *percpu; - struct flex_array *scribble; percpu = per_cpu_ptr(conf->percpu, cpu); - scribble = scribble_alloc(new_disks, - new_sectors / STRIPE_SECTORS, - GFP_NOIO); - - if (scribble) { - flex_array_free(percpu->scribble); - percpu->scribble = scribble; - } else { - err = -ENOMEM; + err = scribble_alloc(percpu, new_disks, + new_sectors / RAID5_STRIPE_SECTORS(conf)); + if (err) break; - } } + put_online_cpus(); mddev_resume(conf->mddev); if (!err) { @@ -2374,9 +2536,16 @@ osh = get_free_stripe(conf, hash); unlock_device_hash_lock(conf, hash); +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + for (i = 0; i < osh->nr_pages; i++) { + nsh->pages[i] = osh->pages[i]; + osh->pages[i] = NULL; + } +#endif for(i=0; i<conf->pool_size; i++) { nsh->dev[i].page = osh->dev[i].page; nsh->dev[i].orig_page = osh->dev[i].page; + nsh->dev[i].offset = osh->dev[i].offset; } nsh->hash_lock_index = hash; free_stripe(conf->slab_cache, osh); @@ -2425,14 +2594,33 @@ nsh = list_entry(newstripes.next, struct stripe_head, lru); list_del_init(&nsh->lru); +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + for (i = 0; i < nsh->nr_pages; i++) { + if (nsh->pages[i]) + continue; + nsh->pages[i] = alloc_page(GFP_NOIO); + if (!nsh->pages[i]) + err = -ENOMEM; + } + + for (i = conf->raid_disks; i < newsize; i++) { + if (nsh->dev[i].page) + continue; + nsh->dev[i].page = raid5_get_dev_page(nsh, i); + nsh->dev[i].orig_page = nsh->dev[i].page; + nsh->dev[i].offset = raid5_get_page_offset(nsh, i); + } +#else for (i=conf->raid_disks; i < newsize; i++) if (nsh->dev[i].page == NULL) { struct page *p = alloc_page(GFP_NOIO); nsh->dev[i].page = p; nsh->dev[i].orig_page = p; + nsh->dev[i].offset = 0; if (!p) err = -ENOMEM; } +#endif raid5_release_stripe(nsh); } /* critical section pass, GFP_NOIO no longer needed */ @@ -2516,10 +2704,10 @@ */ pr_info_ratelimited( "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, + mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf), (unsigned long long)s, bdevname(rdev->bdev, b)); - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); + atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) @@ -2564,10 +2752,16 @@ (unsigned long long)s, bdn); } else if (atomic_read(&rdev->read_errors) - > conf->max_nr_stripes) - pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", - mdname(conf->mddev), bdn); - else + > conf->max_nr_stripes) { + if (!test_bit(Faulty, &rdev->flags)) { + pr_warn("md/raid:%s: %d read_errors > %d stripes\n", + mdname(conf->mddev), + atomic_read(&rdev->read_errors), + conf->max_nr_stripes); + pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", + mdname(conf->mddev), bdn); + } + } else retry = 1; if (set_bad && test_bit(In_sync, &rdev->flags) && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) @@ -2586,7 +2780,7 @@ if (!(set_bad && test_bit(In_sync, &rdev->flags) && rdev_set_badblocks( - rdev, sh->sector, STRIPE_SECTORS, 0))) + rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0))) md_error(conf->mddev, rdev); } } @@ -2602,7 +2796,7 @@ struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; - struct md_rdev *uninitialized_var(rdev); + struct md_rdev *rdev; sector_t first_bad; int bad_sectors; int replacement = 0; @@ -2638,7 +2832,7 @@ if (bi->bi_status) md_error(conf->mddev, rdev); else if (is_badblock(rdev, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors)) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { @@ -2650,7 +2844,7 @@ set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); } else if (is_badblock(rdev, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors)) { set_bit(R5_MadeGood, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) @@ -2670,10 +2864,10 @@ if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); - raid5_release_stripe(sh); if (sh->batch_head && sh != sh->batch_head) raid5_release_stripe(sh->batch_head); + raid5_release_stripe(sh); } static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) @@ -2683,22 +2877,31 @@ unsigned long flags; pr_debug("raid456: error called\n"); + pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n", + mdname(mddev), bdevname(rdev->bdev, b)); + spin_lock_irqsave(&conf->device_lock, flags); set_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); mddev->degraded = raid5_calc_degraded(conf); + + if (has_failed(conf)) { + set_bit(MD_BROKEN, &conf->mddev->flags); + conf->recovery_disabled = mddev->recovery_disabled; + + pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n", + mdname(mddev), mddev->degraded, conf->raid_disks); + } else { + pr_crit("md/raid:%s: Operation continuing on %d devices.\n", + mdname(mddev), conf->raid_disks - mddev->degraded); + } + spin_unlock_irqrestore(&conf->device_lock, flags); set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(Blocked, &rdev->flags); set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); - pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" - "md/raid:%s: Operation continuing on %d devices.\n", - mdname(mddev), - bdevname(rdev->bdev, b), - mdname(mddev), - conf->raid_disks - mddev->degraded); r5c_update_on_rdev_error(mddev, rdev); } @@ -3272,13 +3475,13 @@ /* check if page is covered */ sector_t sector = sh->dev[dd_idx].sector; for (bi=sh->dev[dd_idx].towrite; - sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && + sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) && bi && bi->bi_iter.bi_sector <= sector; - bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { + bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) { if (bio_end_sector(bi) >= sector) sector = bio_end_sector(bi); } - if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) + if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf)) if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) sh->overwrite_disks++; } @@ -3303,7 +3506,7 @@ set_bit(STRIPE_BITMAP_PENDING, &sh->state); spin_unlock_irq(&sh->stripe_lock); md_bitmap_startwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0); + RAID5_STRIPE_SECTORS(conf), 0); spin_lock_irq(&sh->stripe_lock); clear_bit(STRIPE_BITMAP_PENDING, &sh->state); if (!sh->batch_head) { @@ -3365,7 +3568,7 @@ if (!rdev_set_badblocks( rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) md_error(conf->mddev, rdev); rdev_dec_pending(rdev, conf->mddev); } @@ -3385,8 +3588,8 @@ wake_up(&conf->wait_for_overlap); while (bi && bi->bi_iter.bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { - struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { + struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector); md_write_end(conf->mddev); bio_io_error(bi); @@ -3394,7 +3597,7 @@ } if (bitmap_end) md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0, 0); + RAID5_STRIPE_SECTORS(conf), 0, 0); bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; @@ -3406,8 +3609,8 @@ if (bi) bitmap_end = 1; while (bi && bi->bi_iter.bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { - struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { + struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector); md_write_end(conf->mddev); bio_io_error(bi); @@ -3430,9 +3633,9 @@ if (bi) s->to_read--; while (bi && bi->bi_iter.bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { struct bio *nextbi = - r5_next_bio(bi, sh->dev[i].sector); + r5_next_bio(conf, bi, sh->dev[i].sector); bio_io_error(bi); bi = nextbi; @@ -3440,7 +3643,7 @@ } if (bitmap_end) md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0, 0); + RAID5_STRIPE_SECTORS(conf), 0, 0); /* If we were in the middle of a write the parity block might * still be locked - so just clear all R5_LOCKED flags */ @@ -3485,14 +3688,14 @@ && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && !rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) abort = 1; rdev = rcu_dereference(conf->disks[i].replacement); if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && !rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) abort = 1; } rcu_read_unlock(); @@ -3500,7 +3703,7 @@ conf->recovery_disabled = conf->mddev->recovery_disabled; } - md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort); } static int want_replace(struct stripe_head *sh, int disk_idx) @@ -3527,6 +3730,7 @@ struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], &sh->dev[s->failed_num[1]] }; int i; + bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW); if (test_bit(R5_LOCKED, &dev->flags) || @@ -3585,18 +3789,27 @@ * devices must be read. */ return 1; + + if (s->failed >= 2 && + (fdev[i]->towrite || + s->failed_num[i] == sh->pd_idx || + s->failed_num[i] == sh->qd_idx) && + !test_bit(R5_UPTODATE, &fdev[i]->flags)) + /* In max degraded raid6, If the failed disk is P, Q, + * or we want to read the failed disk, we need to do + * reconstruct-write. + */ + force_rcw = true; } - /* If we are forced to do a reconstruct-write, either because - * the current RAID6 implementation only supports that, or - * because parity cannot be trusted and we are currently - * recovering it, there is extra need to be careful. + /* If we are forced to do a reconstruct-write, because parity + * cannot be trusted and we are currently recovering it, there + * is extra need to be careful. * If one of the devices that we would need to read, because * it is not being overwritten (and maybe not written at all) * is missing/faulty, then we need to read everything we can. */ - if (sh->raid_conf->level != 6 && - sh->raid_conf->rmw_level != PARITY_DISABLE_RMW && + if (!force_rcw && sh->sector < sh->raid_conf->mddev->recovery_cp) /* reconstruct-write isn't being forced */ return 0; @@ -3700,7 +3913,7 @@ return 0; } -/** +/* * handle_stripe_fill - read or compute data to satisfy pending requests. */ static void handle_stripe_fill(struct stripe_head *sh, @@ -3723,7 +3936,7 @@ * back cache (prexor with orig_page, and then xor with * page) in the read path */ - if (s->injournal && s->failed) { + if (s->to_read && s->injournal && s->failed) { if (test_bit(STRIPE_R5C_CACHING, &sh->state)) r5c_make_stripe_write_out(sh); goto out; @@ -3775,14 +3988,14 @@ wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { - wbi2 = r5_next_bio(wbi, dev->sector); + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + wbi2 = r5_next_bio(conf, wbi, dev->sector); md_write_end(conf->mddev); bio_endio(wbi); wbi = wbi2; } md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), !test_bit(STRIPE_DEGRADED, &sh->state), 0); if (head_sh->batch_head) { @@ -3966,10 +4179,8 @@ set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; - } else { + } else set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } } } } @@ -3994,10 +4205,8 @@ set_bit(R5_Wantread, &dev->flags); s->locked++; qread++; - } else { + } else set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } } } if (rcw && conf->mddev->queue) @@ -4047,7 +4256,7 @@ break; } dev = &sh->dev[s->failed_num[0]]; - /* fall through */ + fallthrough; case check_state_compute_result: sh->check_state = check_state_idle; if (!dev) @@ -4089,7 +4298,7 @@ */ set_bit(STRIPE_INSYNC, &sh->state); else { - atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); + atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); @@ -4097,7 +4306,7 @@ "%llu-%llu\n", mdname(conf->mddev), (unsigned long long) sh->sector, (unsigned long long) sh->sector + - STRIPE_SECTORS); + RAID5_STRIPE_SECTORS(conf)); } else { sh->check_state = check_state_compute_run; set_bit(STRIPE_COMPUTE_RUN, &sh->state); @@ -4178,7 +4387,7 @@ /* we have 2-disk failure */ BUG_ON(s->failed != 2); - /* fall through */ + fallthrough; case check_state_compute_result: sh->check_state = check_state_idle; @@ -4254,7 +4463,7 @@ */ } } else { - atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); + atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); @@ -4262,7 +4471,7 @@ "%llu-%llu\n", mdname(conf->mddev), (unsigned long long) sh->sector, (unsigned long long) sh->sector + - STRIPE_SECTORS); + RAID5_STRIPE_SECTORS(conf)); } else { int *target = &sh->ops.target; @@ -4333,7 +4542,8 @@ /* place all the copies on one channel */ init_async_submit(&submit, 0, tx, NULL, NULL, NULL); tx = async_memcpy(sh2->dev[dd_idx].page, - sh->dev[i].page, 0, 0, STRIPE_SIZE, + sh->dev[i].page, sh2->dev[dd_idx].offset, + sh->dev[i].offset, RAID5_STRIPE_SIZE(conf), &submit); set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); @@ -4432,8 +4642,8 @@ */ rdev = rcu_dereference(conf->disks[i].replacement); if (rdev && !test_bit(Faulty, &rdev->flags) && - rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && - !is_badblock(rdev, sh->sector, STRIPE_SECTORS, + rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) && + !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors)) set_bit(R5_ReadRepl, &dev->flags); else { @@ -4447,7 +4657,7 @@ if (rdev && test_bit(Faulty, &rdev->flags)) rdev = NULL; if (rdev) { - is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, + is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors); if (s->blocked_rdev == NULL && (test_bit(Blocked, &rdev->flags) @@ -4474,7 +4684,7 @@ } } else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); - else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset) /* in sync if before recovery_offset */ set_bit(R5_Insync, &dev->flags); else if (test_bit(R5_UPTODATE, &dev->flags) && @@ -4563,12 +4773,12 @@ rcu_read_unlock(); } +/* + * Return '1' if this is a member of batch, or '0' if it is a lone stripe or + * a head which can now be handled. + */ static int clear_batch_ready(struct stripe_head *sh) { - /* Return '1' if this is a member of batch, or - * '0' if it is a lone stripe or a head which can now be - * handled. - */ struct stripe_head *tmp; if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) return (sh->batch_head && sh->batch_head != sh); @@ -4618,7 +4828,6 @@ (1 << STRIPE_FULL_WRITE) | (1 << STRIPE_BIOFILL_RUN) | (1 << STRIPE_COMPUTE_RUN) | - (1 << STRIPE_OPS_REQ_PENDING) | (1 << STRIPE_DISCARD) | (1 << STRIPE_BATCH_READY) | (1 << STRIPE_BATCH_ERR) | @@ -4673,15 +4882,20 @@ struct r5dev *pdev, *qdev; clear_bit(STRIPE_HANDLE, &sh->state); + + /* + * handle_stripe should not continue handle the batched stripe, only + * the head of batch list or lone stripe can continue. Otherwise we + * could see break_stripe_batch_list warns about the STRIPE_ACTIVE + * is set for the batched stripe. + */ + if (clear_batch_ready(sh)) + return; + if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { /* already being handled, ensure it gets handled * again when current action finishes */ set_bit(STRIPE_HANDLE, &sh->state); - return; - } - - if (clear_batch_ready(sh) ) { - clear_bit_unlock(STRIPE_ACTIVE, &sh->state); return; } @@ -4918,7 +5132,7 @@ if ((s.syncing || s.replacing) && s.locked == 0 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && test_bit(STRIPE_INSYNC, &sh->state)) { - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); clear_bit(STRIPE_SYNCING, &sh->state); if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) wake_up(&conf->wait_for_overlap); @@ -4937,14 +5151,11 @@ if (!test_bit(R5_ReWrite, &dev->flags)) { set_bit(R5_Wantwrite, &dev->flags); set_bit(R5_ReWrite, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - s.locked++; - } else { + } else /* let's read it back */ set_bit(R5_Wantread, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - s.locked++; - } + set_bit(R5_LOCKED, &dev->flags); + s.locked++; } } @@ -4986,7 +5197,7 @@ clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); } if (s.expanding && s.locked == 0 && @@ -5016,14 +5227,14 @@ /* We own a safe reference to the rdev */ rdev = conf->disks[i].rdev; if (!rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) md_error(conf->mddev, rdev); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0); + RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { @@ -5032,7 +5243,7 @@ /* rdev have been moved down */ rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0); + RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); } } @@ -5088,28 +5299,6 @@ hash = sh->hash_lock_index; __release_stripe(conf, sh, &temp_inactive_list[hash]); } -} - -static int raid5_congested(struct mddev *mddev, int bits) -{ - struct r5conf *conf = mddev->private; - - /* No difference between reads and writes. Just check - * how busy the stripe_cache is - */ - - if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) - return 1; - - /* Also checks whether there is pressure on r5cache log space */ - if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) - return 1; - if (conf->quiesce) - return 1; - if (atomic_read(&conf->empty_inactive_list_nr)) - return 1; - - return 0; } static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) @@ -5257,7 +5446,6 @@ rcu_read_unlock(); raid_bio->bi_next = (void*)rdev; bio_set_dev(align_bi, rdev->bdev); - bio_clear_flag(align_bi, BIO_SEG_VALID); if (is_badblock(rdev, align_bi->bi_iter.bi_sector, bio_sectors(align_bi), @@ -5281,7 +5469,7 @@ trace_block_bio_remap(align_bi->bi_disk->queue, align_bi, disk_devt(mddev->gendisk), raid_bio->bi_iter.bi_sector); - generic_make_request(align_bi); + submit_bio_noacct(align_bi); return 1; } else { rcu_read_unlock(); @@ -5301,7 +5489,7 @@ struct r5conf *conf = mddev->private; split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); bio_chain(split, raid_bio); - generic_make_request(raid_bio); + submit_bio_noacct(raid_bio); raid_bio = split; } @@ -5497,8 +5685,8 @@ /* Skip discard while reshape is happening */ return; - logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); - last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); + last_sector = bio_end_sector(bi); bi->bi_next = NULL; @@ -5512,7 +5700,7 @@ last_sector *= conf->chunk_sectors; for (; logical_sector < last_sector; - logical_sector += STRIPE_SECTORS) { + logical_sector += RAID5_STRIPE_SECTORS(conf)) { DEFINE_WAIT(w); int d; again: @@ -5557,7 +5745,7 @@ d++) md_bitmap_startwrite(mddev->bitmap, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), 0); sh->bm_seq = conf->seq_flush + 1; set_bit(STRIPE_BIT_DELAY, &sh->state); @@ -5622,12 +5810,12 @@ return true; } - logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { + for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { int previous; int seq; @@ -5725,8 +5913,7 @@ do_flush = false; } - if (!sh->batch_head || sh == sh->batch_head) - set_bit(STRIPE_HANDLE, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); if ((!sh->batch_head || sh == sh->batch_head) && (bi->bi_opf & REQ_SYNC) && @@ -5791,7 +5978,7 @@ sector_div(sector_nr, new_data_disks); if (sector_nr) { mddev->curr_resync_completed = sector_nr; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); *skipped = 1; retn = sector_nr; goto finish; @@ -5905,11 +6092,11 @@ conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } INIT_LIST_HEAD(&stripes); - for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { + for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) { int j; int skipped_disk = 0; sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); @@ -5930,7 +6117,7 @@ skipped_disk = 1; continue; } - memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); + memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf)); set_bit(R5_Expanded, &sh->dev[j].flags); set_bit(R5_UPTODATE, &sh->dev[j].flags); } @@ -5965,7 +6152,7 @@ set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); - first_sector += STRIPE_SECTORS; + first_sector += RAID5_STRIPE_SECTORS(conf); } /* Now that the sources are clearly marked, we can release * the destination stripes @@ -6012,7 +6199,7 @@ conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } ret: return retn; @@ -6071,11 +6258,12 @@ if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !conf->fullsync && !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && - sync_blocks >= STRIPE_SECTORS) { + sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { /* we can skip this block, and probably more */ - sync_blocks /= STRIPE_SECTORS; + do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf)); *skipped = 1; - return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ + /* keep things rounded to whole stripes */ + return sync_blocks * RAID5_STRIPE_SECTORS(conf); } md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); @@ -6108,7 +6296,7 @@ raid5_release_stripe(sh); - return STRIPE_SECTORS; + return RAID5_STRIPE_SECTORS(conf); } static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, @@ -6131,14 +6319,14 @@ int handled = 0; logical_sector = raid_bio->bi_iter.bi_sector & - ~((sector_t)STRIPE_SECTORS-1); + ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); sector = raid5_compute_sector(conf, logical_sector, 0, &dd_idx, NULL); last_sector = bio_end_sector(raid_bio); for (; logical_sector < last_sector; - logical_sector += STRIPE_SECTORS, - sector += STRIPE_SECTORS, + logical_sector += RAID5_STRIPE_SECTORS(conf), + sector += RAID5_STRIPE_SECTORS(conf), scnt++) { if (scnt < offset) @@ -6177,6 +6365,8 @@ static int handle_active_stripes(struct r5conf *conf, int group, struct r5worker *worker, struct list_head *temp_inactive_list) + __releases(&conf->device_lock) + __acquires(&conf->device_lock) { struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; int i, batch_size = 0, hash; @@ -6329,7 +6519,18 @@ spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); spin_lock_irq(&conf->device_lock); + + /* + * Waiting on MD_SB_CHANGE_PENDING below may deadlock + * seeing md_check_recovery() is needed to clear + * the flag when using mdmon. + */ + continue; } + + wait_event_lock_irq(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), + conf->device_lock); } pr_debug("%d stripes handled\n", handled); @@ -6469,6 +6670,100 @@ raid5_show_rmw_level, raid5_store_rmw_level); +static ssize_t +raid5_show_stripe_size(struct mddev *mddev, char *page) +{ + struct r5conf *conf; + int ret = 0; + + spin_lock(&mddev->lock); + conf = mddev->private; + if (conf) + ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf)); + spin_unlock(&mddev->lock); + return ret; +} + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE +static ssize_t +raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf; + unsigned long new; + int err; + int size; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (kstrtoul(page, 10, &new)) + return -EINVAL; + + /* + * The value should not be bigger than PAGE_SIZE. It requires to + * be multiple of DEFAULT_STRIPE_SIZE and the value should be power + * of two. + */ + if (new % DEFAULT_STRIPE_SIZE != 0 || + new > PAGE_SIZE || new == 0 || + new != roundup_pow_of_two(new)) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + return err; + + conf = mddev->private; + if (!conf) { + err = -ENODEV; + goto out_unlock; + } + + if (new == conf->stripe_size) + goto out_unlock; + + pr_debug("md/raid: change stripe_size from %lu to %lu\n", + conf->stripe_size, new); + + if (mddev->sync_thread || + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + mddev->reshape_position != MaxSector || + mddev->sysfs_active) { + err = -EBUSY; + goto out_unlock; + } + + mddev_suspend(mddev); + mutex_lock(&conf->cache_size_mutex); + size = conf->max_nr_stripes; + + shrink_stripes(conf); + + conf->stripe_size = new; + conf->stripe_shift = ilog2(new) - 9; + conf->stripe_sectors = new >> 9; + if (grow_stripes(conf, size)) { + pr_warn("md/raid:%s: couldn't allocate buffers\n", + mdname(mddev)); + err = -ENOMEM; + } + mutex_unlock(&conf->cache_size_mutex); + mddev_resume(mddev); + +out_unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry +raid5_stripe_size = __ATTR(stripe_size, 0644, + raid5_show_stripe_size, + raid5_store_stripe_size); +#else +static struct md_sysfs_entry +raid5_stripe_size = __ATTR(stripe_size, 0444, + raid5_show_stripe_size, + NULL); +#endif static ssize_t raid5_show_preread_threshold(struct mddev *mddev, char *page) @@ -6548,14 +6843,14 @@ if (!conf) err = -ENODEV; else if (new != conf->skip_copy) { + struct request_queue *q = mddev->queue; + mddev_suspend(mddev); conf->skip_copy = new; if (new) - mddev->queue->backing_dev_info->capabilities |= - BDI_CAP_STABLE_WRITES; + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); else - mddev->queue->backing_dev_info->capabilities &= - ~BDI_CAP_STABLE_WRITES; + blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); mddev_resume(mddev); } mddev_unlock(mddev); @@ -6595,7 +6890,6 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt, - int *worker_cnt_per_group, struct r5worker_group **worker_groups); static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) @@ -6604,7 +6898,7 @@ unsigned int new; int err; struct r5worker_group *new_groups, *old_groups; - int group_cnt, worker_cnt_per_group; + int group_cnt; if (len >= PAGE_SIZE) return -EINVAL; @@ -6627,13 +6921,11 @@ if (old_groups) flush_workqueue(raid5_wq); - err = alloc_thread_groups(conf, new, - &group_cnt, &worker_cnt_per_group, - &new_groups); + err = alloc_thread_groups(conf, new, &group_cnt, &new_groups); if (!err) { spin_lock_irq(&conf->device_lock); conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_cnt_per_group = new; conf->worker_groups = new_groups; spin_unlock_irq(&conf->device_lock); @@ -6660,7 +6952,9 @@ &raid5_group_thread_cnt.attr, &raid5_skip_copy.attr, &raid5_rmw_level.attr, + &raid5_stripe_size.attr, &r5c_journal_mode.attr, + &ppl_write_hint.attr, NULL, }; static struct attribute_group raid5_attrs_group = { @@ -6668,16 +6962,13 @@ .attrs = raid5_attrs, }; -static int alloc_thread_groups(struct r5conf *conf, int cnt, - int *group_cnt, - int *worker_cnt_per_group, +static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt, struct r5worker_group **worker_groups) { int i, j, k; ssize_t size; struct r5worker *workers; - *worker_cnt_per_group = cnt; if (cnt == 0) { *group_cnt = 0; *worker_groups = NULL; @@ -6743,25 +7034,25 @@ static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) { safe_put_page(percpu->spare_page); - if (percpu->scribble) - flex_array_free(percpu->scribble); percpu->spare_page = NULL; + kvfree(percpu->scribble); percpu->scribble = NULL; } static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) { - if (conf->level == 6 && !percpu->spare_page) + if (conf->level == 6 && !percpu->spare_page) { percpu->spare_page = alloc_page(GFP_KERNEL); - if (!percpu->scribble) - percpu->scribble = scribble_alloc(max(conf->raid_disks, - conf->previous_raid_disks), - max(conf->chunk_sectors, - conf->prev_chunk_sectors) - / STRIPE_SECTORS, - GFP_KERNEL); + if (!percpu->spare_page) + return -ENOMEM; + } - if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { + if (scribble_alloc(percpu, + max(conf->raid_disks, + conf->previous_raid_disks), + max(conf->chunk_sectors, + conf->prev_chunk_sectors) + / RAID5_STRIPE_SECTORS(conf))) { free_scratch_buffer(conf, percpu); return -ENOMEM; } @@ -6877,7 +7168,7 @@ struct disk_info *disk; char pers_name[6]; int i; - int group_cnt, worker_cnt_per_group; + int group_cnt; struct r5worker_group *new_group; int ret; @@ -6913,6 +7204,12 @@ conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); if (conf == NULL) goto abort; + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + conf->stripe_size = DEFAULT_STRIPE_SIZE; + conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9; + conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9; +#endif INIT_LIST_HEAD(&conf->free_list); INIT_LIST_HEAD(&conf->pending_list); conf->pending_data = kcalloc(PENDING_IO_MAX, @@ -6923,15 +7220,14 @@ for (i = 0; i < PENDING_IO_MAX; i++) list_add(&conf->pending_data[i].sibling, &conf->free_list); /* Don't enable multi-threading by default*/ - if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, - &new_group)) { + if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) { conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_cnt_per_group = 0; conf->worker_groups = new_group; } else goto abort; spin_lock_init(&conf->device_lock); - seqcount_init(&conf->gen_lock); + seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock); mutex_init(&conf->cache_size_mutex); init_waitqueue_head(&conf->wait_for_quiescent); init_waitqueue_head(&conf->wait_for_stripe); @@ -7065,8 +7361,8 @@ conf->min_nr_stripes = NR_STRIPES; if (mddev->reshape_position != MaxSector) { int stripes = max_t(int, - ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, - ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); + ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4, + ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4); conf->min_nr_stripes = max(NR_STRIPES, stripes); if (conf->min_nr_stripes != NR_STRIPES) pr_info("md/raid:%s: force stripe size %d for reshape\n", @@ -7139,6 +7435,12 @@ return 1; } return 0; +} + +static void raid5_set_io_opt(struct r5conf *conf) +{ + blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) * + (conf->raid_disks - conf->max_degraded)); } static int raid5_run(struct mddev *mddev) @@ -7425,13 +7727,10 @@ int data_disks = conf->previous_raid_disks - conf->max_degraded; int stripe = data_disks * ((mddev->chunk_sectors << 9) / PAGE_SIZE); - if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) - mddev->queue->backing_dev_info->ra_pages = 2 * stripe; chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); - blk_queue_io_opt(mddev->queue, chunk_size * - (conf->raid_disks - conf->max_degraded)); + raid5_set_io_opt(conf); mddev->queue->limits.raid_partial_stripes_expensive = 1; /* * We can only discard a whole stripe. It doesn't make sense to @@ -7716,6 +8015,7 @@ */ if (rdev->saved_raid_disk >= 0 && rdev->saved_raid_disk >= first && + rdev->saved_raid_disk <= last && conf->disks[rdev->saved_raid_disk].rdev == NULL) first = rdev->saved_raid_disk; @@ -7797,14 +8097,14 @@ * stripe_heads first. */ struct r5conf *conf = mddev->private; - if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 + if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4 > conf->min_nr_stripes || - ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 + ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4 > conf->min_nr_stripes) { pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", mdname(mddev), ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) - / STRIPE_SIZE)*4); + / RAID5_STRIPE_SIZE(conf))*4); return 0; } return 1; @@ -7940,8 +8240,8 @@ else rdev->recovery_offset = 0; - if (sysfs_link_rdev(mddev, rdev)) - /* Failure here is OK */; + /* Failure here is OK */ + sysfs_link_rdev(mddev, rdev); } } else if (rdev->raid_disk >= conf->previous_raid_disks && !test_bit(Faulty, &rdev->flags)) { @@ -8015,16 +8315,8 @@ spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); - /* read-ahead size must cover two whole stripes, which is - * 2 * (datadisks) * chunksize where 'n' is the number of raid devices - */ - if (conf->mddev->queue) { - int data_disks = conf->raid_disks - conf->max_degraded; - int stripe = data_disks * ((conf->chunk_sectors << 9) - / PAGE_SIZE); - if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) - conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; - } + if (conf->mddev->queue) + raid5_set_io_opt(conf); } } @@ -8136,7 +8428,7 @@ while (chunksect && (mddev->array_sectors & (chunksect-1))) chunksect >>= 1; - if ((chunksect<<9) < STRIPE_SIZE) + if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private)) /* array size does not allow a suitable chunk size */ return ERR_PTR(-EINVAL); @@ -8423,7 +8715,6 @@ .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid6_takeover, - .congested = raid5_congested, .change_consistency_policy = raid5_change_consistency_policy, }; static struct md_personality raid5_personality = @@ -8448,7 +8739,6 @@ .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid5_takeover, - .congested = raid5_congested, .change_consistency_policy = raid5_change_consistency_policy, }; @@ -8474,7 +8764,6 @@ .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid4_takeover, - .congested = raid5_congested, .change_consistency_policy = raid5_change_consistency_policy, }; -- Gitblit v1.6.2