From 01573e231f18eb2d99162747186f59511f56b64d Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Fri, 08 Dec 2023 10:40:48 +0000
Subject: [PATCH] 移去rt
---
kernel/fs/btrfs/raid56.c | 264 ++++++++++++++++++++++++++++------------------------
1 files changed, 142 insertions(+), 122 deletions(-)
diff --git a/kernel/fs/btrfs/raid56.c b/kernel/fs/btrfs/raid56.c
index a91f74c..9eb6a43 100644
--- a/kernel/fs/btrfs/raid56.c
+++ b/kernel/fs/btrfs/raid56.c
@@ -35,6 +35,22 @@
#define RBIO_CACHE_SIZE 1024
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
+/* Used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+ struct list_head hash_list;
+ spinlock_t lock;
+};
+
+/* Used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+ struct list_head stripe_cache;
+ spinlock_t cache_lock;
+ int cache_size;
+ struct btrfs_stripe_hash table[];
+};
+
enum btrfs_rbio_ops {
BTRFS_RBIO_WRITE,
BTRFS_RBIO_READ_REBUILD,
@@ -174,7 +190,7 @@
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
- btrfs_init_work(&rbio->work, btrfs_rmw_helper, work_func, NULL, NULL);
+ btrfs_init_work(&rbio->work, work_func, NULL, NULL);
btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
}
@@ -190,7 +206,6 @@
struct btrfs_stripe_hash *h;
int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
int i;
- int table_size;
if (info->stripe_hash_table)
return 0;
@@ -202,8 +217,7 @@
* Try harder to allocate and fallback to vmalloc to lower the chance
* of a failing mount.
*/
- table_size = sizeof(*table) + sizeof(*h) * num_entries;
- table = kvzalloc(table_size, GFP_KERNEL);
+ table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
if (!table)
return -ENOMEM;
@@ -318,6 +332,9 @@
{
bio_list_merge(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
+ /* Also inherit the bitmaps from @victim. */
+ bitmap_or(dest->dbitmap, victim->dbitmap, dest->dbitmap,
+ dest->stripe_npages);
dest->generic_bio_cnt += victim->generic_bio_cnt;
bio_list_init(&victim->bio_list);
}
@@ -655,8 +672,7 @@
*/
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
- int bucket = rbio_bucket(rbio);
- struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
+ struct btrfs_stripe_hash *h;
struct btrfs_raid_bio *cur;
struct btrfs_raid_bio *pending;
unsigned long flags;
@@ -664,64 +680,63 @@
struct btrfs_raid_bio *cache_drop = NULL;
int ret = 0;
+ h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
+
spin_lock_irqsave(&h->lock, flags);
list_for_each_entry(cur, &h->hash_list, hash_list) {
- if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
- spin_lock(&cur->bio_list_lock);
+ if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
+ continue;
- /* can we steal this cached rbio's pages? */
- if (bio_list_empty(&cur->bio_list) &&
- list_empty(&cur->plug_list) &&
- test_bit(RBIO_CACHE_BIT, &cur->flags) &&
- !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
- list_del_init(&cur->hash_list);
- refcount_dec(&cur->refs);
+ spin_lock(&cur->bio_list_lock);
- steal_rbio(cur, rbio);
- cache_drop = cur;
- spin_unlock(&cur->bio_list_lock);
+ /* Can we steal this cached rbio's pages? */
+ if (bio_list_empty(&cur->bio_list) &&
+ list_empty(&cur->plug_list) &&
+ test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+ !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+ list_del_init(&cur->hash_list);
+ refcount_dec(&cur->refs);
- goto lockit;
- }
+ steal_rbio(cur, rbio);
+ cache_drop = cur;
+ spin_unlock(&cur->bio_list_lock);
- /* can we merge into the lock owner? */
- if (rbio_can_merge(cur, rbio)) {
- merge_rbio(cur, rbio);
+ goto lockit;
+ }
+
+ /* Can we merge into the lock owner? */
+ if (rbio_can_merge(cur, rbio)) {
+ merge_rbio(cur, rbio);
+ spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
+ ret = 1;
+ goto out;
+ }
+
+
+ /*
+ * We couldn't merge with the running rbio, see if we can merge
+ * with the pending ones. We don't have to check for rmw_locked
+ * because there is no way they are inside finish_rmw right now
+ */
+ list_for_each_entry(pending, &cur->plug_list, plug_list) {
+ if (rbio_can_merge(pending, rbio)) {
+ merge_rbio(pending, rbio);
spin_unlock(&cur->bio_list_lock);
freeit = rbio;
ret = 1;
goto out;
}
-
-
- /*
- * we couldn't merge with the running
- * rbio, see if we can merge with the
- * pending ones. We don't have to
- * check for rmw_locked because there
- * is no way they are inside finish_rmw
- * right now
- */
- list_for_each_entry(pending, &cur->plug_list,
- plug_list) {
- if (rbio_can_merge(pending, rbio)) {
- merge_rbio(pending, rbio);
- spin_unlock(&cur->bio_list_lock);
- freeit = rbio;
- ret = 1;
- goto out;
- }
- }
-
- /* no merging, put us on the tail of the plug list,
- * our rbio will be started with the currently
- * running rbio unlocks
- */
- list_add_tail(&rbio->plug_list, &cur->plug_list);
- spin_unlock(&cur->bio_list_lock);
- ret = 1;
- goto out;
}
+
+ /*
+ * No merging, put us on the tail of the plug list, our rbio
+ * will be started with the currently running rbio unlocks
+ */
+ list_add_tail(&rbio->plug_list, &cur->plug_list);
+ spin_unlock(&cur->bio_list_lock);
+ ret = 1;
+ goto out;
}
lockit:
refcount_inc(&rbio->refs);
@@ -862,6 +877,12 @@
if (rbio->generic_bio_cnt)
btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+ /*
+ * Clear the data bitmap, as the rbio may be cached for later usage.
+ * do this before before unlock_stripe() so there will be no new bio
+ * for this bio.
+ */
+ bitmap_clear(rbio->dbitmap, 0, rbio->stripe_npages);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
@@ -1071,7 +1092,6 @@
unsigned long bio_max_len)
{
struct bio *last = bio_list->tail;
- u64 last_end = 0;
int ret;
struct bio *bio;
struct btrfs_bio_stripe *stripe;
@@ -1086,15 +1106,14 @@
/* see if we can add this page onto our existing bio */
if (last) {
- last_end = (u64)last->bi_iter.bi_sector << 9;
+ u64 last_end = (u64)last->bi_iter.bi_sector << 9;
last_end += last->bi_iter.bi_size;
/*
* we can't merge these if they are from different
* devices or if they are not contiguous
*/
- if (last_end == disk_start && stripe->dev->bdev &&
- !last->bi_status &&
+ if (last_end == disk_start && !last->bi_status &&
last->bi_disk == stripe->dev->bdev->bd_disk &&
last->bi_partno == stripe->dev->bdev->bd_partno) {
ret = bio_add_page(last, page, PAGE_SIZE, 0);
@@ -1105,6 +1124,7 @@
/* put a new bio on the list */
bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
+ btrfs_io_bio(bio)->device = stripe->dev;
bio->bi_iter.bi_size = 0;
bio_set_dev(bio, stripe->dev->bdev);
bio->bi_iter.bi_sector = disk_start >> 9;
@@ -1196,6 +1216,9 @@
else
BUG();
+ /* We should have at least one data sector. */
+ ASSERT(bitmap_weight(rbio->dbitmap, rbio->stripe_npages));
+
/* at this point we either have a full stripe,
* or we've read the full stripe from the drive.
* recalculate the parity and write the new results.
@@ -1269,6 +1292,11 @@
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(pagenr, rbio->dbitmap))
+ continue;
+
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
if (!page)
@@ -1293,6 +1321,11 @@
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(pagenr, rbio->dbitmap))
+ continue;
+
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
if (!page)
@@ -1313,11 +1346,7 @@
atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
bio->bi_opf = REQ_OP_WRITE;
@@ -1342,7 +1371,6 @@
struct bio *bio)
{
u64 physical = bio->bi_iter.bi_sector;
- u64 stripe_start;
int i;
struct btrfs_bio_stripe *stripe;
@@ -1350,9 +1378,7 @@
for (i = 0; i < rbio->bbio->num_stripes; i++) {
stripe = &rbio->bbio->stripes[i];
- stripe_start = stripe->physical;
- if (physical >= stripe_start &&
- physical < stripe_start + rbio->stripe_len &&
+ if (in_range(physical, stripe->physical, rbio->stripe_len) &&
stripe->dev->bdev &&
bio->bi_disk == stripe->dev->bdev->bd_disk &&
bio->bi_partno == stripe->dev->bdev->bd_partno) {
@@ -1370,18 +1396,14 @@
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
- u64 logical = bio->bi_iter.bi_sector;
- u64 stripe_start;
+ u64 logical = (u64)bio->bi_iter.bi_sector << 9;
int i;
- logical <<= 9;
-
for (i = 0; i < rbio->nr_data; i++) {
- stripe_start = rbio->bbio->raid_map[i];
- if (logical >= stripe_start &&
- logical < stripe_start + rbio->stripe_len) {
+ u64 stripe_start = rbio->bbio->raid_map[i];
+
+ if (in_range(logical, stripe_start, rbio->stripe_len))
return i;
- }
}
return -1;
}
@@ -1439,11 +1461,11 @@
static void set_bio_pages_uptodate(struct bio *bio)
{
struct bio_vec *bvec;
- int i;
+ struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, iter_all)
SetPageUptodate(bvec->bv_page);
}
@@ -1555,11 +1577,7 @@
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_rmw_end_io;
bio->bi_opf = REQ_OP_READ;
@@ -1724,13 +1742,39 @@
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
- btrfs_init_work(&plug->work, btrfs_rmw_helper,
- unplug_work, NULL, NULL);
+ btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
btrfs_queue_work(plug->info->rmw_workers,
&plug->work);
return;
}
run_plug(plug);
+}
+
+/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
+static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
+{
+ const struct btrfs_fs_info *fs_info = rbio->fs_info;
+ const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ const u64 full_stripe_start = rbio->bbio->raid_map[0];
+ const u32 orig_len = orig_bio->bi_iter.bi_size;
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 cur_logical;
+
+ ASSERT(orig_logical >= full_stripe_start &&
+ orig_logical + orig_len <= full_stripe_start +
+ rbio->nr_data * rbio->stripe_len);
+
+ bio_list_add(&rbio->bio_list, orig_bio);
+ rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
+
+ /* Update the dbitmap. */
+ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
+ cur_logical += sectorsize) {
+ int bit = ((u32)(cur_logical - full_stripe_start) >>
+ PAGE_SHIFT) % rbio->stripe_npages;
+
+ set_bit(bit, rbio->dbitmap);
+ }
}
/*
@@ -1749,9 +1793,8 @@
btrfs_put_bbio(bbio);
return PTR_ERR(rbio);
}
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
rbio->operation = BTRFS_RBIO_WRITE;
+ rbio_add_bio(rbio, bio);
btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
@@ -1867,11 +1910,8 @@
}
/* make sure our ps and qs are in order */
- if (faila > failb) {
- int tmp = failb;
- failb = faila;
- faila = tmp;
- }
+ if (faila > failb)
+ swap(faila, failb);
/* if the q stripe is failed, do a pstripe reconstruction
* from the xors.
@@ -1977,7 +2017,7 @@
* - In case of single failure, where rbio->failb == -1:
*
* Cache this rbio iff the above read reconstruction is
- * excuted without problems.
+ * executed without problems.
*/
if (err == BLK_STS_OK && rbio->failb < 0)
cache_rbio_pages(rbio);
@@ -2053,9 +2093,12 @@
atomic_set(&rbio->error, 0);
/*
- * read everything that hasn't failed. Thanks to the
- * stripe cache, it is possible that some or all of these
- * pages are going to be uptodate.
+ * Read everything that hasn't failed. However this time we will
+ * not trust any cached sector.
+ * As we may read out some stale data but higher layer is not reading
+ * that stale part.
+ *
+ * So here we always re-read everything in recovery path.
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
if (rbio->faila == stripe || rbio->failb == stripe) {
@@ -2064,16 +2107,6 @@
}
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *p;
-
- /*
- * the rmw code may have already read this
- * page in
- */
- p = rbio_stripe_page(rbio, stripe, pagenr);
- if (PageUptodate(p))
- continue;
-
ret = rbio_add_io_page(rbio, &bio_list,
rbio_stripe_page(rbio, stripe, pagenr),
stripe, pagenr, rbio->stripe_len);
@@ -2091,7 +2124,7 @@
*/
if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
__raid_recover_end_io(rbio);
- goto out;
+ return 0;
} else {
goto cleanup;
}
@@ -2102,11 +2135,7 @@
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_recover_end_io;
bio->bi_opf = REQ_OP_READ;
@@ -2115,7 +2144,7 @@
submit_bio(bio);
}
-out:
+
return 0;
cleanup:
@@ -2155,8 +2184,7 @@
}
rbio->operation = BTRFS_RBIO_READ_REBUILD;
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
+ rbio_add_bio(rbio, bio);
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
@@ -2470,11 +2498,7 @@
atomic_set(&rbio->stripes_pending, nr_data);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
bio->bi_opf = REQ_OP_WRITE;
@@ -2652,11 +2676,7 @@
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
- while (1) {
- bio = bio_list_pop(&bio_list);
- if (!bio)
- break;
-
+ while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid56_parity_scrub_end_io;
bio->bi_opf = REQ_OP_READ;
--
Gitblit v1.6.2