From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/fs/btrfs/raid56.c | 264 ++++++++++++++++++++++++++++------------------------ 1 files changed, 142 insertions(+), 122 deletions(-) diff --git a/kernel/fs/btrfs/raid56.c b/kernel/fs/btrfs/raid56.c index a91f74c..9eb6a43 100644 --- a/kernel/fs/btrfs/raid56.c +++ b/kernel/fs/btrfs/raid56.c @@ -35,6 +35,22 @@ #define RBIO_CACHE_SIZE 1024 +#define BTRFS_STRIPE_HASH_TABLE_BITS 11 + +/* Used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash { + struct list_head hash_list; + spinlock_t lock; +}; + +/* Used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash_table { + struct list_head stripe_cache; + spinlock_t cache_lock; + int cache_size; + struct btrfs_stripe_hash table[]; +}; + enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, @@ -174,7 +190,7 @@ static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) { - btrfs_init_work(&rbio->work, btrfs_rmw_helper, work_func, NULL, NULL); + btrfs_init_work(&rbio->work, work_func, NULL, NULL); btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); } @@ -190,7 +206,6 @@ struct btrfs_stripe_hash *h; int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; int i; - int table_size; if (info->stripe_hash_table) return 0; @@ -202,8 +217,7 @@ * Try harder to allocate and fallback to vmalloc to lower the chance * of a failing mount. */ - table_size = sizeof(*table) + sizeof(*h) * num_entries; - table = kvzalloc(table_size, GFP_KERNEL); + table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); if (!table) return -ENOMEM; @@ -318,6 +332,9 @@ { bio_list_merge(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; + /* Also inherit the bitmaps from @victim. */ + bitmap_or(dest->dbitmap, victim->dbitmap, dest->dbitmap, + dest->stripe_npages); dest->generic_bio_cnt += victim->generic_bio_cnt; bio_list_init(&victim->bio_list); } @@ -655,8 +672,7 @@ */ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) { - int bucket = rbio_bucket(rbio); - struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; + struct btrfs_stripe_hash *h; struct btrfs_raid_bio *cur; struct btrfs_raid_bio *pending; unsigned long flags; @@ -664,64 +680,63 @@ struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; + h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio); + spin_lock_irqsave(&h->lock, flags); list_for_each_entry(cur, &h->hash_list, hash_list) { - if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) { - spin_lock(&cur->bio_list_lock); + if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0]) + continue; - /* can we steal this cached rbio's pages? */ - if (bio_list_empty(&cur->bio_list) && - list_empty(&cur->plug_list) && - test_bit(RBIO_CACHE_BIT, &cur->flags) && - !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { - list_del_init(&cur->hash_list); - refcount_dec(&cur->refs); + spin_lock(&cur->bio_list_lock); - steal_rbio(cur, rbio); - cache_drop = cur; - spin_unlock(&cur->bio_list_lock); + /* Can we steal this cached rbio's pages? 
*/ + if (bio_list_empty(&cur->bio_list) && + list_empty(&cur->plug_list) && + test_bit(RBIO_CACHE_BIT, &cur->flags) && + !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { + list_del_init(&cur->hash_list); + refcount_dec(&cur->refs); - goto lockit; - } + steal_rbio(cur, rbio); + cache_drop = cur; + spin_unlock(&cur->bio_list_lock); - /* can we merge into the lock owner? */ - if (rbio_can_merge(cur, rbio)) { - merge_rbio(cur, rbio); + goto lockit; + } + + /* Can we merge into the lock owner? */ + if (rbio_can_merge(cur, rbio)) { + merge_rbio(cur, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + + + /* + * We couldn't merge with the running rbio, see if we can merge + * with the pending ones. We don't have to check for rmw_locked + * because there is no way they are inside finish_rmw right now + */ + list_for_each_entry(pending, &cur->plug_list, plug_list) { + if (rbio_can_merge(pending, rbio)) { + merge_rbio(pending, rbio); spin_unlock(&cur->bio_list_lock); freeit = rbio; ret = 1; goto out; } - - - /* - * we couldn't merge with the running - * rbio, see if we can merge with the - * pending ones. We don't have to - * check for rmw_locked because there - * is no way they are inside finish_rmw - * right now - */ - list_for_each_entry(pending, &cur->plug_list, - plug_list) { - if (rbio_can_merge(pending, rbio)) { - merge_rbio(pending, rbio); - spin_unlock(&cur->bio_list_lock); - freeit = rbio; - ret = 1; - goto out; - } - } - - /* no merging, put us on the tail of the plug list, - * our rbio will be started with the currently - * running rbio unlocks - */ - list_add_tail(&rbio->plug_list, &cur->plug_list); - spin_unlock(&cur->bio_list_lock); - ret = 1; - goto out; } + + /* + * No merging, put us on the tail of the plug list, our rbio + * will be started with the currently running rbio unlocks + */ + list_add_tail(&rbio->plug_list, &cur->plug_list); + spin_unlock(&cur->bio_list_lock); + ret = 1; + goto out; } lockit: refcount_inc(&rbio->refs); @@ -862,6 +877,12 @@ if (rbio->generic_bio_cnt) btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); + /* + * Clear the data bitmap, as the rbio may be cached for later usage. + * do this before before unlock_stripe() so there will be no new bio + * for this bio. + */ + bitmap_clear(rbio->dbitmap, 0, rbio->stripe_npages); /* * At this moment, rbio->bio_list is empty, however since rbio does not @@ -1071,7 +1092,6 @@ unsigned long bio_max_len) { struct bio *last = bio_list->tail; - u64 last_end = 0; int ret; struct bio *bio; struct btrfs_bio_stripe *stripe; @@ -1086,15 +1106,14 @@ /* see if we can add this page onto our existing bio */ if (last) { - last_end = (u64)last->bi_iter.bi_sector << 9; + u64 last_end = (u64)last->bi_iter.bi_sector << 9; last_end += last->bi_iter.bi_size; /* * we can't merge these if they are from different * devices or if they are not contiguous */ - if (last_end == disk_start && stripe->dev->bdev && - !last->bi_status && + if (last_end == disk_start && !last->bi_status && last->bi_disk == stripe->dev->bdev->bd_disk && last->bi_partno == stripe->dev->bdev->bd_partno) { ret = bio_add_page(last, page, PAGE_SIZE, 0); @@ -1105,6 +1124,7 @@ /* put a new bio on the list */ bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); + btrfs_io_bio(bio)->device = stripe->dev; bio->bi_iter.bi_size = 0; bio_set_dev(bio, stripe->dev->bdev); bio->bi_iter.bi_sector = disk_start >> 9; @@ -1196,6 +1216,9 @@ else BUG(); + /* We should have at least one data sector. 
*/ + ASSERT(bitmap_weight(rbio->dbitmap, rbio->stripe_npages)); + /* at this point we either have a full stripe, * or we've read the full stripe from the drive. * recalculate the parity and write the new results. @@ -1269,6 +1292,11 @@ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; + + /* This vertical stripe has no data, skip it. */ + if (!test_bit(pagenr, rbio->dbitmap)) + continue; + if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); if (!page) @@ -1293,6 +1321,11 @@ for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; + + /* This vertical stripe has no data, skip it. */ + if (!test_bit(pagenr, rbio->dbitmap)) + continue; + if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); if (!page) @@ -1313,11 +1346,7 @@ atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); BUG_ON(atomic_read(&rbio->stripes_pending) == 0); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; bio->bi_opf = REQ_OP_WRITE; @@ -1342,7 +1371,6 @@ struct bio *bio) { u64 physical = bio->bi_iter.bi_sector; - u64 stripe_start; int i; struct btrfs_bio_stripe *stripe; @@ -1350,9 +1378,7 @@ for (i = 0; i < rbio->bbio->num_stripes; i++) { stripe = &rbio->bbio->stripes[i]; - stripe_start = stripe->physical; - if (physical >= stripe_start && - physical < stripe_start + rbio->stripe_len && + if (in_range(physical, stripe->physical, rbio->stripe_len) && stripe->dev->bdev && bio->bi_disk == stripe->dev->bdev->bd_disk && bio->bi_partno == stripe->dev->bdev->bd_partno) { @@ -1370,18 +1396,14 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { - u64 logical = bio->bi_iter.bi_sector; - u64 stripe_start; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; int i; - logical <<= 9; - for (i = 0; i < rbio->nr_data; i++) { - stripe_start = rbio->bbio->raid_map[i]; - if (logical >= stripe_start && - logical < stripe_start + rbio->stripe_len) { + u64 stripe_start = rbio->bbio->raid_map[i]; + + if (in_range(logical, stripe_start, rbio->stripe_len)) return i; - } } return -1; } @@ -1439,11 +1461,11 @@ static void set_bio_pages_uptodate(struct bio *bio) { struct bio_vec *bvec; - int i; + struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, iter_all) SetPageUptodate(bvec->bv_page); } @@ -1555,11 +1577,7 @@ * not to touch it after that */ atomic_set(&rbio->stripes_pending, bios_to_read); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid_rmw_end_io; bio->bi_opf = REQ_OP_READ; @@ -1724,13 +1742,39 @@ plug = container_of(cb, struct btrfs_plug_cb, cb); if (from_schedule) { - btrfs_init_work(&plug->work, btrfs_rmw_helper, - unplug_work, NULL, NULL); + btrfs_init_work(&plug->work, unplug_work, NULL, NULL); btrfs_queue_work(plug->info->rmw_workers, &plug->work); return; } run_plug(plug); +} + +/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. 
*/ +static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) +{ + const struct btrfs_fs_info *fs_info = rbio->fs_info; + const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; + const u64 full_stripe_start = rbio->bbio->raid_map[0]; + const u32 orig_len = orig_bio->bi_iter.bi_size; + const u32 sectorsize = fs_info->sectorsize; + u64 cur_logical; + + ASSERT(orig_logical >= full_stripe_start && + orig_logical + orig_len <= full_stripe_start + + rbio->nr_data * rbio->stripe_len); + + bio_list_add(&rbio->bio_list, orig_bio); + rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; + + /* Update the dbitmap. */ + for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; + cur_logical += sectorsize) { + int bit = ((u32)(cur_logical - full_stripe_start) >> + PAGE_SHIFT) % rbio->stripe_npages; + + set_bit(bit, rbio->dbitmap); + } } /* @@ -1749,9 +1793,8 @@ btrfs_put_bbio(bbio); return PTR_ERR(rbio); } - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->operation = BTRFS_RBIO_WRITE; + rbio_add_bio(rbio, bio); btrfs_bio_counter_inc_noblocked(fs_info); rbio->generic_bio_cnt = 1; @@ -1867,11 +1910,8 @@ } /* make sure our ps and qs are in order */ - if (faila > failb) { - int tmp = failb; - failb = faila; - faila = tmp; - } + if (faila > failb) + swap(faila, failb); /* if the q stripe is failed, do a pstripe reconstruction * from the xors. @@ -1977,7 +2017,7 @@ * - In case of single failure, where rbio->failb == -1: * * Cache this rbio iff the above read reconstruction is - * excuted without problems. + * executed without problems. */ if (err == BLK_STS_OK && rbio->failb < 0) cache_rbio_pages(rbio); @@ -2053,9 +2093,12 @@ atomic_set(&rbio->error, 0); /* - * read everything that hasn't failed. Thanks to the - * stripe cache, it is possible that some or all of these - * pages are going to be uptodate. + * Read everything that hasn't failed. However this time we will + * not trust any cached sector. + * As we may read out some stale data but higher layer is not reading + * that stale part. + * + * So here we always re-read everything in recovery path. 
*/ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { if (rbio->faila == stripe || rbio->failb == stripe) { @@ -2064,16 +2107,6 @@ } for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *p; - - /* - * the rmw code may have already read this - * page in - */ - p = rbio_stripe_page(rbio, stripe, pagenr); - if (PageUptodate(p)) - continue; - ret = rbio_add_io_page(rbio, &bio_list, rbio_stripe_page(rbio, stripe, pagenr), stripe, pagenr, rbio->stripe_len); @@ -2091,7 +2124,7 @@ */ if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { __raid_recover_end_io(rbio); - goto out; + return 0; } else { goto cleanup; } @@ -2102,11 +2135,7 @@ * not to touch it after that */ atomic_set(&rbio->stripes_pending, bios_to_read); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid_recover_end_io; bio->bi_opf = REQ_OP_READ; @@ -2115,7 +2144,7 @@ submit_bio(bio); } -out: + return 0; cleanup: @@ -2155,8 +2184,7 @@ } rbio->operation = BTRFS_RBIO_READ_REBUILD; - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; + rbio_add_bio(rbio, bio); rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { @@ -2470,11 +2498,7 @@ atomic_set(&rbio->stripes_pending, nr_data); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; bio->bi_opf = REQ_OP_WRITE; @@ -2652,11 +2676,7 @@ * not to touch it after that */ atomic_set(&rbio->stripes_pending, bios_to_read); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid56_parity_scrub_end_io; bio->bi_opf = REQ_OP_READ; -- Gitblit v1.6.2
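
Editor's note (not part of the patch): the core addition above is the dbitmap accounting in rbio_add_bio(), which records which vertical stripes of the full stripe actually carry data so that finish_rmw() only writes those pages. The standalone userspace sketch below reproduces only the bit-index arithmetic of that per-sector loop. Every constant and value in it (4 KiB pages, a 64 KiB stripe length, a 4 KiB sectorsize, the example offsets, and the sketch_set_bit()/sketch_test_bit() helpers) is an illustrative assumption for this sketch, not taken from the patch or from a real filesystem.

/*
 * Minimal sketch of the dbitmap bit-index math used by rbio_add_bio()
 * in the patch above.  All constants are illustrative assumptions.
 */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_PAGE_SHIFT	12
#define SKETCH_PAGE_SIZE	(1UL << SKETCH_PAGE_SHIFT)
#define SKETCH_STRIPE_LEN	(64 * 1024)	/* hypothetical stripe length */
#define SKETCH_SECTORSIZE	4096		/* hypothetical sectorsize */
#define SKETCH_STRIPE_NPAGES	(SKETCH_STRIPE_LEN / SKETCH_PAGE_SIZE)

/* Stand-ins for the kernel's set_bit()/test_bit() on a tiny bitmap. */
static void sketch_set_bit(unsigned long *map, int bit)
{
	map[bit / (8 * sizeof(*map))] |= 1UL << (bit % (8 * sizeof(*map)));
}

static int sketch_test_bit(const unsigned long *map, int bit)
{
	return !!(map[bit / (8 * sizeof(*map))] &
		  (1UL << (bit % (8 * sizeof(*map)))));
}

int main(void)
{
	/* Hypothetical full stripe start and a bio covering 8 KiB of it. */
	const uint64_t full_stripe_start = 1 * 1024 * 1024;
	const uint64_t orig_logical = full_stripe_start + 20 * 1024;
	const uint32_t orig_len = 8 * 1024;
	unsigned long dbitmap[1] = { 0 };
	uint64_t cur_logical;
	int pagenr;

	/* Same per-sector loop shape as rbio_add_bio() in the patch. */
	for (cur_logical = orig_logical;
	     cur_logical < orig_logical + orig_len;
	     cur_logical += SKETCH_SECTORSIZE) {
		int bit = (uint32_t)((cur_logical - full_stripe_start) >>
				     SKETCH_PAGE_SHIFT) % SKETCH_STRIPE_NPAGES;

		sketch_set_bit(dbitmap, bit);
	}

	/*
	 * finish_rmw() skips every vertical stripe whose bit is clear; with
	 * the assumed values only vertical stripes 5 and 6 are marked.
	 */
	for (pagenr = 0; pagenr < SKETCH_STRIPE_NPAGES; pagenr++)
		printf("pagenr %2d: %s\n", pagenr,
		       sketch_test_bit(dbitmap, pagenr) ? "has data" : "skipped");

	return 0;
}

Compiled and run as-is, the sketch prints "has data" only for vertical stripes 5 and 6, which is the behaviour the patch relies on when it makes finish_rmw() skip pages whose dbitmap bit is clear.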