From 072de836f53be56a70cecf70b43ae43b7ce17376 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Mon, 11 Dec 2023 10:08:36 +0000
Subject: [PATCH] mk-rootfs.sh
---
kernel/drivers/md/raid1.c | 268 +++++++++++++++++++++++++++--------------------------
1 files changed, 136 insertions(+), 132 deletions(-)
diff --git a/kernel/drivers/md/raid1.c b/kernel/drivers/md/raid1.c
index 876d3e1..fb31e5d 100644
--- a/kernel/drivers/md/raid1.c
+++ b/kernel/drivers/md/raid1.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* raid1.c : Multiple Devices driver for Linux
*
@@ -20,15 +21,6 @@
*
* Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
* - persistent bitmap code
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/slab.h>
@@ -37,6 +29,7 @@
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
+#include <linux/interval_tree_generic.h>
#include <trace/events/block.h>
@@ -50,31 +43,6 @@
(1L << MD_HAS_PPL) | \
(1L << MD_HAS_MULTIPLE_PPLS))
-/*
- * Number of guaranteed r1bios in case of extreme VM load:
- */
-#define NR_RAID1_BIOS 256
-
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio *)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting devs[n].bio to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
-/* When there are this many requests queue to be written by
- * the raid1 thread, we become 'congested' to provide back-pressure
- * for writeback.
- */
-static int max_queued_requests = 1024;
-
static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
@@ -82,6 +50,73 @@
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
#include "raid1-10.c"
+
+#define START(node) ((node)->start) /* lower key of a serial_info range */
+#define LAST(node) ((node)->last) /* upper key of a serial_info range */
+INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
+ START, LAST, static inline, raid1_rb); /* emits the static inline raid1_rb_*() helpers used by the serialization code below */
+/*
+ * check_and_add_serial() - try to record the range of @r1_bio in bucket @idx.
+ *
+ * With rdev->serial[idx].serial_lock held, asks the interval tree whether any
+ * recorded entry matches the range from r1_bio->sector to
+ * r1_bio->sector + r1_bio->sectors.  If one is found, nothing is inserted and
+ * -EBUSY is returned.  Otherwise @si is filled in with that range, inserted
+ * into the tree, and 0 is returned.
+ */
+static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
+ struct serial_info *si, int idx)
+{
+ unsigned long flags;
+ int ret = 0;
+ sector_t lo = r1_bio->sector;
+ sector_t hi = lo + r1_bio->sectors; /* remove_serial() matches on exactly this lo/hi pair */
+ struct serial_in_rdev *serial = &rdev->serial[idx];
+
+ spin_lock_irqsave(&serial->serial_lock, flags);
+ /* an entry for this range is already recorded: report a collision */
+ if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
+ ret = -EBUSY;
+ else {
+ si->start = lo;
+ si->last = hi;
+ raid1_rb_insert(si, &serial->serial_rb);
+ }
+ spin_unlock_irqrestore(&serial->serial_lock, flags);
+
+ return ret;
+}
+/*
+ * wait_for_serialization() - sleep until @r1_bio's range is recorded for @rdev.
+ *
+ * Selects the bucket with sector_to_idx(r1_bio->sector), takes a serial_info
+ * from mddev->serial_info_pool and waits on that bucket's serial_io_wait
+ * until check_and_add_serial() returns 0.  The serial_info is left in the
+ * tree on return; in this file it is remove_serial() that takes it out and
+ * hands it back to the pool.
+ */
+static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
+{
+ struct mddev *mddev = rdev->mddev;
+ struct serial_info *si;
+ int idx = sector_to_idx(r1_bio->sector);
+ struct serial_in_rdev *serial = &rdev->serial[idx];
+
+ if (WARN_ON(!mddev->serial_info_pool)) /* no pool: warn and return without recording the range */
+ return;
+ si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
+ wait_event(serial->serial_io_wait,
+ check_and_add_serial(rdev, r1_bio, si, idx) == 0); /* remove_serial() wakes this queue after dropping an entry */
+}
+/*
+ * remove_serial() - drop the entry recorded for [@lo, @hi] on @rdev.
+ *
+ * With the serial_lock of bucket sector_to_idx(@lo) held, iterates over the
+ * entries the interval tree returns for @lo/@hi and removes the first one
+ * whose start and last equal @lo and @hi exactly, returning it to
+ * mddev->serial_info_pool.  If no such entry exists a WARN is emitted.  In
+ * both cases serial_io_wait is woken once the lock has been released.
+ */
+static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
+{
+ struct serial_info *si;
+ unsigned long flags;
+ int found = 0;
+ struct mddev *mddev = rdev->mddev;
+ int idx = sector_to_idx(lo);
+ struct serial_in_rdev *serial = &rdev->serial[idx];
+
+ spin_lock_irqsave(&serial->serial_lock, flags);
+ for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
+ si; si = raid1_rb_iter_next(si, lo, hi)) {
+ if (si->start == lo && si->last == hi) { /* exact match only; other returned entries are skipped */
+ raid1_rb_remove(si, &serial->serial_rb);
+ mempool_free(si, mddev->serial_info_pool);
+ found = 1;
+ break; /* si was just freed, so stop before the iterator touches it again */
+ }
+ }
+ if (!found)
+ WARN(1, "The write IO is not recorded for serialization\n");
+ spin_unlock_irqrestore(&serial->serial_lock, flags);
+ wake_up(&serial->serial_io_wait); /* let waiters in wait_for_serialization() retry */
+}
/*
* for resync bio, r1bio pointer can be retrieved from the per-bio
@@ -99,11 +134,6 @@
/* allocate a r1bio with room for raid_disks entries in the bios array */
return kzalloc(size, gfp_flags);
-}
-
-static void r1bio_pool_free(void *r1_bio, void *data)
-{
- kfree(r1_bio);
}
#define RESYNC_DEPTH 32
@@ -181,7 +211,7 @@
kfree(rps);
out_free_r1bio:
- r1bio_pool_free(r1_bio, data);
+ rbio_pool_free(r1_bio, data);
return NULL;
}
@@ -201,7 +231,7 @@
/* resync pages array stored in the 1st bio's .bi_private */
kfree(rp);
- r1bio_pool_free(r1bio, data);
+ rbio_pool_free(r1bio, data);
}
static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
@@ -266,22 +296,17 @@
static void call_bio_endio(struct r1bio *r1_bio)
{
struct bio *bio = r1_bio->master_bio;
- struct r1conf *conf = r1_bio->mddev->private;
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
- /*
- * Wake up any possible resync thread that waits for the device
- * to go idle.
- */
- allow_barrier(conf, r1_bio->sector);
}
static void raid_end_bio_io(struct r1bio *r1_bio)
{
struct bio *bio = r1_bio->master_bio;
+ struct r1conf *conf = r1_bio->mddev->private;
/* if nobody has done the final endio yet, do it now */
if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
@@ -292,6 +317,12 @@
call_bio_endio(r1_bio);
}
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle. All I/Os, even write-behind writes, are done.
+ */
+ allow_barrier(conf, r1_bio->sector);
+
free_r1bio(r1_bio);
}
@@ -417,6 +448,8 @@
int mirror = find_bio_disk(r1_bio, bio);
struct md_rdev *rdev = conf->mirrors[mirror].rdev;
bool discard_error;
+ sector_t lo = r1_bio->sector;
+ sector_t hi = r1_bio->sector + r1_bio->sectors;
discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
@@ -439,8 +472,6 @@
/*
* When the device is faulty, it is not necessary to
* handle write error.
- * For failfast, this is the only remaining device,
- * We need to retry the write without FailFast.
*/
if (!test_bit(Faulty, &rdev->flags))
set_bit(R1BIO_WriteError, &r1_bio->state);
@@ -488,6 +519,8 @@
}
if (behind) {
+ if (test_bit(CollisionCheck, &rdev->flags))
+ remove_serial(rdev, lo, hi);
if (test_bit(WriteMostly, &rdev->flags))
atomic_dec(&r1_bio->behind_remaining);
@@ -510,7 +543,8 @@
call_bio_endio(r1_bio);
}
}
- }
+ } else if (rdev->mddev->serialize_policy)
+ remove_serial(rdev, lo, hi);
if (r1_bio->bios[mirror] == NULL)
rdev_dec_pending(rdev, conf->mddev);
@@ -752,36 +786,6 @@
return best_disk;
}
-static int raid1_congested(struct mddev *mddev, int bits)
-{
- struct r1conf *conf = mddev->private;
- int i, ret = 0;
-
- if ((bits & (1 << WB_async_congested)) &&
- conf->pending_count >= max_queued_requests)
- return 1;
-
- rcu_read_lock();
- for (i = 0; i < conf->raid_disks * 2; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
- if (rdev && !test_bit(Faulty, &rdev->flags)) {
- struct request_queue *q = bdev_get_queue(rdev->bdev);
-
- BUG_ON(!q);
-
- /* Note the '|| 1' - when read_balance prefers
- * non-congested targets, it can be removed
- */
- if ((bits & (1 << WB_async_congested)) || 1)
- ret |= bdi_congested(q->backing_dev_info, bits);
- else
- ret &= bdi_congested(q->backing_dev_info, bits);
- }
- }
- rcu_read_unlock();
- return ret;
-}
-
static void flush_bio_list(struct r1conf *conf, struct bio *bio)
{
/* flush any pending bitmap writes to disk before proceeding w/ I/O */
@@ -800,8 +804,9 @@
/* Just ignore it */
bio_endio(bio);
else
- generic_make_request(bio);
+ submit_bio_noacct(bio);
bio = next;
+ cond_resched();
}
}
@@ -857,8 +862,11 @@
* backgroup IO calls must call raise_barrier. Once that returns
* there is no normal IO happeing. It must arrange to call
* lower_barrier when the particular background IO completes.
+ *
+ * If resync/recovery is interrupted, returns -EINTR;
+ * Otherwise, returns 0.
*/
-static sector_t raise_barrier(struct r1conf *conf, sector_t sector_nr)
+static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
{
int idx = sector_to_idx(sector_nr);
@@ -1274,7 +1282,7 @@
struct bio *split = bio_split(bio, max_sectors,
gfp, &conf->bio_split);
bio_chain(split, bio);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
bio = split;
r1_bio->master_bio = bio;
r1_bio->sectors = max_sectors;
@@ -1300,7 +1308,7 @@
trace_block_bio_remap(read_bio->bi_disk->queue, read_bio,
disk_devt(mddev->gendisk), r1_bio->sector);
- generic_make_request(read_bio);
+ submit_bio_noacct(read_bio);
}
static void raid1_write_request(struct mddev *mddev, struct bio *bio,
@@ -1445,7 +1453,7 @@
struct bio *split = bio_split(bio, max_sectors,
GFP_NOIO, &conf->bio_split);
bio_chain(split, bio);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
bio = split;
r1_bio->master_bio = bio;
r1_bio->sectors = max_sectors;
@@ -1458,9 +1466,9 @@
for (i = 0; i < disks; i++) {
struct bio *mbio = NULL;
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
if (!r1_bio->bios[i])
continue;
-
if (first_clone) {
/* do behind I/O ?
@@ -1486,9 +1494,12 @@
mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
if (r1_bio->behind_master_bio) {
- if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
+ if (test_bit(CollisionCheck, &rdev->flags))
+ wait_for_serialization(rdev, r1_bio);
+ if (test_bit(WriteMostly, &rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
- }
+ } else if (mddev->serialize_policy)
+ wait_for_serialization(rdev, r1_bio);
r1_bio->bios[i] = mbio;
@@ -1588,12 +1599,12 @@
/*
* If it is not operational, then we have already marked it as dead
- * else if it is the last working disks, ignore the error, let the
- * next level up know.
+ * else if it is the last working disk and "fail_last_dev" is false,
+ * ignore the error and let the next level up know.
* else mark the drive as failed
*/
spin_lock_irqsave(&conf->device_lock, flags);
- if (test_bit(In_sync, &rdev->flags)
+ if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
&& (conf->raid_disks - mddev->degraded) == 1) {
/*
* Don't fail the drive, act as though we were just a
@@ -1606,11 +1617,9 @@
return;
}
set_bit(Blocked, &rdev->flags);
- if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ if (test_and_clear_bit(In_sync, &rdev->flags))
mddev->degraded++;
- set_bit(Faulty, &rdev->flags);
- } else
- set_bit(Faulty, &rdev->flags);
+ set_bit(Faulty, &rdev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
/*
* if recovery is running, make sure it aborts.
@@ -1742,9 +1751,8 @@
first = last = rdev->saved_raid_disk;
for (mirror = first; mirror <= last; mirror++) {
- p = conf->mirrors+mirror;
+ p = conf->mirrors + mirror;
if (!p->rdev) {
-
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
@@ -1880,6 +1888,22 @@
} while (sectors_to_go > 0);
}
+static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
+{ /* Drop one count from r1_bio->remaining; only the drop that reaches zero finishes the r1bio. */
+ if (atomic_dec_and_test(&r1_bio->remaining)) {
+ struct mddev *mddev = r1_bio->mddev;
+ int s = r1_bio->sectors;
+ /*
+ * mddev and s are copied out first, so md_done_sync() below works from
+ * the copies after put_buf(r1_bio) has run.  An r1bio flagged
+ * R1BIO_MadeGood or R1BIO_WriteError is handed to reschedule_retry()
+ * instead of being completed here.
+ */
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, uptodate); /* @uptodate is passed through unchanged from the caller */
+ }
+ }
+}
+
static void end_sync_write(struct bio *bio)
{
int uptodate = !bio->bi_status;
@@ -1906,16 +1930,7 @@
)
set_bit(R1BIO_MadeGood, &r1_bio->state);
- if (atomic_dec_and_test(&r1_bio->remaining)) {
- int s = r1_bio->sectors;
- if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
- test_bit(R1BIO_WriteError, &r1_bio->state))
- reschedule_retry(r1_bio);
- else {
- put_buf(r1_bio);
- md_done_sync(mddev, s, uptodate);
- }
- }
+ put_sync_write_buf(r1_bio, uptodate);
}
static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
@@ -2115,7 +2130,7 @@
}
r1_bio->read_disk = primary;
for (i = 0; i < conf->raid_disks * 2; i++) {
- int j;
+ int j = 0;
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
blk_status_t status = sbio->bi_status;
@@ -2123,14 +2138,15 @@
struct page **spages = get_resync_pages(sbio)->pages;
struct bio_vec *bi;
int page_len[RESYNC_PAGES] = { 0 };
+ struct bvec_iter_all iter_all;
if (sbio->bi_end_io != end_sync_read)
continue;
/* Now we can 'fixup' the error value */
sbio->bi_status = 0;
- bio_for_each_segment_all(bi, sbio, j)
- page_len[j] = bi->bv_len;
+ bio_for_each_segment_all(bi, sbio, iter_all)
+ page_len[j++] = bi->bv_len;
if (!status) {
for (j = vcnt; j-- ; ) {
@@ -2194,20 +2210,10 @@
atomic_inc(&r1_bio->remaining);
md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
- generic_make_request(wbio);
+ submit_bio_noacct(wbio);
}
- if (atomic_dec_and_test(&r1_bio->remaining)) {
- /* if we're here, all write(s) have completed, so clean up */
- int s = r1_bio->sectors;
- if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
- test_bit(R1BIO_WriteError, &r1_bio->state))
- reschedule_retry(r1_bio);
- else {
- put_buf(r1_bio);
- md_done_sync(mddev, s, 1);
- }
- }
+ put_sync_write_buf(r1_bio, 1);
}
/*
@@ -2890,7 +2896,7 @@
md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
}
} else {
@@ -2899,8 +2905,7 @@
md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
- generic_make_request(bio);
-
+ submit_bio_noacct(bio);
}
return nr_sectors;
}
@@ -2959,8 +2964,8 @@
if (!conf->poolinfo)
goto abort;
conf->poolinfo->raid_disks = mddev->raid_disks * 2;
- err = mempool_init(&conf->r1bio_pool, NR_RAID1_BIOS, r1bio_pool_alloc,
- r1bio_pool_free, conf->poolinfo);
+ err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc,
+ rbio_pool_free, conf->poolinfo);
if (err)
goto abort;
@@ -3101,7 +3106,7 @@
}
mddev->degraded = 0;
- for (i=0; i < conf->raid_disks; i++)
+ for (i = 0; i < conf->raid_disks; i++)
if (conf->mirrors[i].rdev == NULL ||
!test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
test_bit(Faulty, &conf->mirrors[i].rdev->flags))
@@ -3143,7 +3148,7 @@
mddev->queue);
}
- ret = md_integrity_register(mddev);
+ ret = md_integrity_register(mddev);
if (ret) {
md_unregister_thread(&mddev->thread);
goto abort;
@@ -3255,8 +3260,8 @@
newpoolinfo->mddev = mddev;
newpoolinfo->raid_disks = raid_disks * 2;
- ret = mempool_init(&newpool, NR_RAID1_BIOS, r1bio_pool_alloc,
- r1bio_pool_free, newpoolinfo);
+ ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc,
+ rbio_pool_free, newpoolinfo);
if (ret) {
kfree(newpoolinfo);
return ret;
@@ -3361,7 +3366,6 @@
.check_reshape = raid1_reshape,
.quiesce = raid1_quiesce,
.takeover = raid1_takeover,
- .congested = raid1_congested,
};
static int __init raid_init(void)
--
Gitblit v1.6.2