From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:20:52 +0000
Subject: [PATCH] btrfs: scrub: sync scrub.c with upstream fixes and cleanups

---
 kernel/fs/btrfs/scrub.c | 619 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 341 insertions(+), 278 deletions(-)

diff --git a/kernel/fs/btrfs/scrub.c b/kernel/fs/btrfs/scrub.c
index fee8995..715a032 100644
--- a/kernel/fs/btrfs/scrub.c
+++ b/kernel/fs/btrfs/scrub.c
@@ -6,7 +6,9 @@
 #include <linux/blkdev.h>
 #include <linux/ratelimit.h>
 #include <linux/sched/mm.h>
+#include <crypto/hash.h>
 #include "ctree.h"
+#include "discard.h"
 #include "volumes.h"
 #include "disk-io.h"
 #include "ordered-data.h"
@@ -17,6 +19,7 @@
 #include "check-integrity.h"
 #include "rcu-string.h"
 #include "raid56.h"
+#include "block-group.h"
 
 /*
  * This is only the first step towards a full-featured scrub. It reads all
@@ -146,7 +149,7 @@
 	 */
 	unsigned long		*ebitmap;
 
-	unsigned long		bitmap[0];
+	unsigned long		bitmap[];
 };
 
 struct scrub_ctx {
@@ -322,7 +325,6 @@
 	struct rb_node *parent = NULL;
 	struct full_stripe_lock *entry;
 	struct full_stripe_lock *ret;
-	unsigned int nofs_flag;
 
 	lockdep_assert_held(&locks_root->lock);
 
@@ -342,15 +344,8 @@
 	/*
 	 * Insert new lock.
-	 *
-	 * We must use GFP_NOFS because the scrub task might be waiting for a
-	 * worker task executing this function and in turn a transaction commit
-	 * might be waiting the scrub task to pause (which needs to wait for all
-	 * the worker tasks to complete before pausing).
 	 */
-	nofs_flag = memalloc_nofs_save();
 	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
-	memalloc_nofs_restore(nofs_flag);
 	if (!ret)
 		return ERR_PTR(-ENOMEM);
 	ret->logical = fstripe_logical;
@@ -395,8 +390,7 @@
  *
  * Caller must ensure @cache is a RAID56 block group.
  */
-static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
-				   u64 bytenr)
+static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
 {
 	u64 ret;
 
@@ -410,8 +404,8 @@
 	 * round_down() can only handle power of 2, while RAID56 full
 	 * stripe length can be 64KiB * n, so we need to manually round down.
	 */
-	ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
-	      cache->full_stripe_len + cache->key.objectid;
+	ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
+	      cache->full_stripe_len + cache->start;
 	return ret;
 }
 
@@ -429,7 +423,7 @@
 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
 			    bool *locked_ret)
 {
-	struct btrfs_block_group_cache *bg_cache;
+	struct btrfs_block_group *bg_cache;
 	struct btrfs_full_stripe_locks_tree *locks_root;
 	struct full_stripe_lock *existing;
 	u64 fstripe_start;
@@ -476,7 +470,7 @@
 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
 			      bool locked)
 {
-	struct btrfs_block_group_cache *bg_cache;
+	struct btrfs_block_group *bg_cache;
 	struct btrfs_full_stripe_locks_tree *locks_root;
 	struct full_stripe_lock *fstripe_lock;
 	u64 fstripe_start;
@@ -604,8 +598,8 @@
 		sbio->index = i;
 		sbio->sctx = sctx;
 		sbio->page_count = 0;
-		btrfs_init_work(&sbio->work, btrfs_scrub_helper,
-				scrub_bio_end_io_worker, NULL, NULL);
+		btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
+				NULL);
 
 		if (i != SCRUB_BIOS_PER_SCTX - 1)
 			sctx->bios[i]->next_free = i + 1;
@@ -653,13 +647,9 @@
 	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
 	struct inode_fs_paths *ipath = NULL;
 	struct btrfs_root *local_root;
-	struct btrfs_key root_key;
 	struct btrfs_key key;
 
-	root_key.objectid = root;
-	root_key.type = BTRFS_ROOT_ITEM_KEY;
-	root_key.offset = (u64)-1;
-	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+	local_root = btrfs_get_fs_root(fs_info, root, true);
 	if (IS_ERR(local_root)) {
 		ret = PTR_ERR(local_root);
 		goto err;
 	}
@@ -674,6 +664,7 @@
 	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
 	if (ret) {
+		btrfs_put_root(local_root);
 		btrfs_release_path(swarn->path);
 		goto err;
 	}
@@ -694,6 +685,7 @@
 	ipath = init_ipath(4096, local_root, swarn->path);
 	memalloc_nofs_restore(nofs_flag);
 	if (IS_ERR(ipath)) {
+		btrfs_put_root(local_root);
 		ret = PTR_ERR(ipath);
 		ipath = NULL;
 		goto err;
 	}
@@ -717,6 +709,7 @@
 				  min(isize - offset, (u64)PAGE_SIZE), nlink,
 				  (char *)(unsigned long)ipath->fspath->val[i]);
 
+	btrfs_put_root(local_root);
 	free_ipath(ipath);
 	return 0;
 
@@ -841,7 +834,8 @@
 	int page_num;
 	int success;
 	bool full_stripe_locked;
-	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+	unsigned int nofs_flag;
+	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
 
 	BUG_ON(sblock_to_check->page_count < 1);
@@ -866,6 +860,16 @@
 	dev = sblock_to_check->pagev[0]->dev;
 
 	/*
+	 * We must use GFP_NOFS because the scrub task might be waiting for a
+	 * worker task executing this function and in turn a transaction commit
+	 * might be waiting for the scrub task to pause (which needs to wait
+	 * for all the worker tasks to complete before pausing).
+	 * We do allocations in the workers through insert_full_stripe_lock()
+	 * and scrub_add_page_to_wr_bio(), which happens down the call chain of
+	 * this function.
+	 */
+	nofs_flag = memalloc_nofs_save();
+	/*
 	 * For RAID5/6, race can happen for a different device scrub thread.
 	 * For data corruption, Parity and Data threads will both try
 	 * to recover the data.
@@ -874,6 +878,7 @@
 	 */
 	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
 	if (ret < 0) {
+		memalloc_nofs_restore(nofs_flag);
 		spin_lock(&sctx->stat_lock);
 		if (ret == -ENOMEM)
 			sctx->stat.malloc_errors++;
@@ -913,7 +918,7 @@
 	 */
 	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
-				      sizeof(*sblocks_for_recheck), GFP_NOFS);
+				      sizeof(*sblocks_for_recheck), GFP_KERNEL);
 	if (!sblocks_for_recheck) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.malloc_errors++;
@@ -964,14 +969,14 @@
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.read_errors++;
 		spin_unlock(&sctx->stat_lock);
-		if (__ratelimit(&_rs))
+		if (__ratelimit(&rs))
 			scrub_print_warning("i/o error", sblock_to_check);
 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 	} else if (sblock_bad->checksum_error) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.csum_errors++;
 		spin_unlock(&sctx->stat_lock);
-		if (__ratelimit(&_rs))
+		if (__ratelimit(&rs))
 			scrub_print_warning("checksum error", sblock_to_check);
 		btrfs_dev_stat_inc_and_print(dev,
 					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -979,7 +984,7 @@
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.verify_errors++;
 		spin_unlock(&sctx->stat_lock);
-		if (__ratelimit(&_rs))
+		if (__ratelimit(&rs))
 			scrub_print_warning("checksum/header error",
 					    sblock_to_check);
 		if (sblock_bad->generation_error)
@@ -1133,7 +1138,7 @@
 				if (scrub_write_page_to_dev_replace(
 						sblock_other, page_num) != 0) {
-					btrfs_dev_replace_stats_inc(
+					atomic64_inc(
 						&fs_info->dev_replace.num_write_errors);
 					success = 0;
 				}
@@ -1211,6 +1216,7 @@
 	}
 
 	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
+	memalloc_nofs_restore(nofs_flag);
 	if (ret < 0)
 		return ret;
 	return 0;
@@ -1573,8 +1579,7 @@
 		if (btrfsic_submit_bio_wait(bio)) {
 			btrfs_dev_stat_inc_and_print(page_bad->dev,
 				BTRFS_DEV_STAT_WRITE_ERRS);
-			btrfs_dev_replace_stats_inc(
-				&fs_info->dev_replace.num_write_errors);
+			atomic64_inc(&fs_info->dev_replace.num_write_errors);
 			bio_put(bio);
 			return -EIO;
 		}
@@ -1601,8 +1606,7 @@
 
 		ret = scrub_write_page_to_dev_replace(sblock, page_num);
 		if (ret)
-			btrfs_dev_replace_stats_inc(
-				&fs_info->dev_replace.num_write_errors);
+			atomic64_inc(&fs_info->dev_replace.num_write_errors);
 	}
 }
 
@@ -1612,13 +1616,9 @@
 	struct scrub_page *spage = sblock->pagev[page_num];
 
 	BUG_ON(spage->page == NULL);
-	if (spage->io_error) {
-		void *mapped_buffer = kmap_atomic(spage->page);
+	if (spage->io_error)
+		clear_page(page_address(spage->page));
 
-		clear_page(mapped_buffer);
-		flush_dcache_page(spage->page);
-		kunmap_atomic(mapped_buffer);
-	}
 	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
 }
 
@@ -1631,19 +1631,8 @@
 	mutex_lock(&sctx->wr_lock);
 again:
 	if (!sctx->wr_curr_bio) {
-		unsigned int nofs_flag;
-
-		/*
-		 * We must use GFP_NOFS because the scrub task might be waiting
-		 * for a worker task executing this function and in turn a
-		 * transaction commit might be waiting the scrub task to pause
-		 * (which needs to wait for all the worker tasks to complete
-		 * before pausing).
-		 */
-		nofs_flag = memalloc_nofs_save();
 		sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
 					    GFP_KERNEL);
-		memalloc_nofs_restore(nofs_flag);
 		if (!sctx->wr_curr_bio) {
 			mutex_unlock(&sctx->wr_lock);
 			return -ENOMEM;
 		}
@@ -1726,8 +1715,7 @@
 	sbio->status = bio->bi_status;
 	sbio->bio = bio;
 
-	btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
-			scrub_wr_bio_end_io_worker, NULL, NULL);
+	btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
 	btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
 }
 
@@ -1746,8 +1734,7 @@
 			struct scrub_page *spage = sbio->pagev[i];
 
 			spage->io_error = 1;
-			btrfs_dev_replace_stats_inc(&dev_replace->
-						    num_write_errors);
+			atomic64_inc(&dev_replace->num_write_errors);
 		}
 	}
 
@@ -1796,41 +1783,24 @@
 static int scrub_checksum_data(struct scrub_block *sblock)
 {
 	struct scrub_ctx *sctx = sblock->sctx;
+	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	u8 csum[BTRFS_CSUM_SIZE];
-	u8 *on_disk_csum;
-	struct page *page;
-	void *buffer;
-	u32 crc = ~(u32)0;
-	u64 len;
-	int index;
+	struct scrub_page *spage;
+	char *kaddr;
 
 	BUG_ON(sblock->page_count < 1);
-	if (!sblock->pagev[0]->have_csum)
+	spage = sblock->pagev[0];
+	if (!spage->have_csum)
 		return 0;
 
-	on_disk_csum = sblock->pagev[0]->csum;
-	page = sblock->pagev[0]->page;
-	buffer = kmap_atomic(page);
+	kaddr = page_address(spage->page);
 
-	len = sctx->fs_info->sectorsize;
-	index = 0;
-	for (;;) {
-		u64 l = min_t(u64, len, PAGE_SIZE);
+	shash->tfm = fs_info->csum_shash;
+	crypto_shash_init(shash);
+	crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
 
-		crc = btrfs_csum_data(buffer, crc, l);
-		kunmap_atomic(buffer);
-		len -= l;
-		if (len == 0)
-			break;
-		index++;
-		BUG_ON(index >= sblock->page_count);
-		BUG_ON(!sblock->pagev[index]->page);
-		page = sblock->pagev[index]->page;
-		buffer = kmap_atomic(page);
-	}
-
-	btrfs_csum_final(crc, csum);
-	if (memcmp(csum, on_disk_csum, sctx->csum_size))
+	if (memcmp(csum, spage->csum, sctx->csum_size))
 		sblock->checksum_error = 1;
 
 	return sblock->checksum_error;
@@ -1841,20 +1811,18 @@
 	struct scrub_ctx *sctx = sblock->sctx;
 	struct btrfs_header *h;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	u8 calculated_csum[BTRFS_CSUM_SIZE];
 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
-	struct page *page;
-	void *mapped_buffer;
-	u64 mapped_size;
-	void *p;
-	u32 crc = ~(u32)0;
-	u64 len;
-	int index;
+	const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT;
+	int i;
+	struct scrub_page *spage;
+	char *kaddr;
 
 	BUG_ON(sblock->page_count < 1);
-	page = sblock->pagev[0]->page;
-	mapped_buffer = kmap_atomic(page);
-	h = (struct btrfs_header *)mapped_buffer;
+	spage = sblock->pagev[0];
+	kaddr = page_address(spage->page);
+	h = (struct btrfs_header *)kaddr;
 	memcpy(on_disk_csum, h->csum, sctx->csum_size);
 
 	/*
@@ -1862,43 +1830,32 @@
 	 *    a) don't have an extent buffer and
 	 *    b) the page is already kmapped
 	 */
-	if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
+	if (spage->logical != btrfs_stack_header_bytenr(h))
 		sblock->header_error = 1;
 
-	if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
+	if (spage->generation != btrfs_stack_header_generation(h)) {
 		sblock->header_error = 1;
 		sblock->generation_error = 1;
 	}
 
-	if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
+	if (!scrub_check_fsid(h->fsid, spage))
 		sblock->header_error = 1;
 
 	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
 		   BTRFS_UUID_SIZE))
 		sblock->header_error = 1;
 
-	len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
-	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
-	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
-	index = 0;
-	for (;;) {
-		u64 l = min_t(u64, len, mapped_size);
+	shash->tfm = fs_info->csum_shash;
+	crypto_shash_init(shash);
+	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
+			    PAGE_SIZE - BTRFS_CSUM_SIZE);
 
-		crc = btrfs_csum_data(p, crc, l);
-		kunmap_atomic(mapped_buffer);
-		len -= l;
-		if (len == 0)
-			break;
-		index++;
-		BUG_ON(index >= sblock->page_count);
-		BUG_ON(!sblock->pagev[index]->page);
-		page = sblock->pagev[index]->page;
-		mapped_buffer = kmap_atomic(page);
-		mapped_size = PAGE_SIZE;
-		p = mapped_buffer;
+	for (i = 1; i < num_pages; i++) {
+		kaddr = page_address(sblock->pagev[i]->page);
+		crypto_shash_update(shash, kaddr, PAGE_SIZE);
 	}
 
-	btrfs_csum_final(crc, calculated_csum);
+	crypto_shash_final(shash, calculated_csum);
 	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
 		sblock->checksum_error = 1;
 
@@ -1909,56 +1866,34 @@
 {
 	struct btrfs_super_block *s;
 	struct scrub_ctx *sctx = sblock->sctx;
+	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	u8 calculated_csum[BTRFS_CSUM_SIZE];
-	u8 on_disk_csum[BTRFS_CSUM_SIZE];
-	struct page *page;
-	void *mapped_buffer;
-	u64 mapped_size;
-	void *p;
-	u32 crc = ~(u32)0;
+	struct scrub_page *spage;
+	char *kaddr;
 	int fail_gen = 0;
 	int fail_cor = 0;
-	u64 len;
-	int index;
 
 	BUG_ON(sblock->page_count < 1);
-	page = sblock->pagev[0]->page;
-	mapped_buffer = kmap_atomic(page);
-	s = (struct btrfs_super_block *)mapped_buffer;
-	memcpy(on_disk_csum, s->csum, sctx->csum_size);
+	spage = sblock->pagev[0];
+	kaddr = page_address(spage->page);
+	s = (struct btrfs_super_block *)kaddr;
 
-	if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
+	if (spage->logical != btrfs_super_bytenr(s))
 		++fail_cor;
 
-	if (sblock->pagev[0]->generation != btrfs_super_generation(s))
+	if (spage->generation != btrfs_super_generation(s))
 		++fail_gen;
 
-	if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
+	if (!scrub_check_fsid(s->fsid, spage))
 		++fail_cor;
 
-	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
-	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
-	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
-	index = 0;
-	for (;;) {
-		u64 l = min_t(u64, len, mapped_size);
+	shash->tfm = fs_info->csum_shash;
+	crypto_shash_init(shash);
+	crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
+			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
 
-		crc = btrfs_csum_data(p, crc, l);
-		kunmap_atomic(mapped_buffer);
-		len -= l;
-		if (len == 0)
-			break;
-		index++;
-		BUG_ON(index >= sblock->page_count);
-		BUG_ON(!sblock->pagev[index]->page);
-		page = sblock->pagev[index]->page;
-		mapped_buffer = kmap_atomic(page);
-		mapped_size = PAGE_SIZE;
-		p = mapped_buffer;
-	}
-
-	btrfs_csum_final(crc, calculated_csum);
-	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+	if (memcmp(calculated_csum, s->csum, sctx->csum_size))
 		++fail_cor;
 
 	if (fail_cor + fail_gen) {
@@ -1971,10 +1906,10 @@
 		++sctx->stat.super_errors;
 		spin_unlock(&sctx->stat_lock);
 		if (fail_cor)
-			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+			btrfs_dev_stat_inc_and_print(spage->dev,
 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
 		else
-			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+			btrfs_dev_stat_inc_and_print(spage->dev,
 				BTRFS_DEV_STAT_GENERATION_ERRS);
 	}
 
@@ -2199,8 +2134,7 @@
 		raid56_add_scrub_pages(rbio, spage->page, spage->logical);
 	}
 
-	btrfs_init_work(&sblock->work, btrfs_scrub_helper,
-			scrub_missing_raid56_worker, NULL, NULL);
+	btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
 	scrub_block_get(sblock);
 	scrub_pending_bio_inc(sctx);
 	raid56_submit_missing_rbio(rbio);
@@ -2456,7 +2390,7 @@
 	ASSERT(index < UINT_MAX);
 	num_sectors = sum->len / sctx->fs_info->sectorsize;
-	memcpy(csum, sum->sums + index, sctx->csum_size);
+	memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size);
 	if (index == num_sectors - 1) {
 		list_del(&sum->list);
 		kfree(sum);
 	}
@@ -2668,18 +2602,18 @@
 	u64 last_offset;
 	u32 stripe_index;
 	u32 rot;
+	const int data_stripes = nr_data_stripes(map);
 
-	last_offset = (physical - map->stripes[num].physical) *
-		      nr_data_stripes(map);
+	last_offset = (physical - map->stripes[num].physical) * data_stripes;
 	if (stripe_start)
 		*stripe_start = last_offset;
 
 	*offset = last_offset;
-	for (i = 0; i < nr_data_stripes(map); i++) {
+	for (i = 0; i < data_stripes; i++) {
 		*offset = last_offset + i * map->stripe_len;
 
 		stripe_nr = div64_u64(*offset, map->stripe_len);
-		stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
+		stripe_nr = div_u64(stripe_nr, data_stripes);
 
 		/* Work out the disk rotation on this stripe-set */
 		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
@@ -2738,8 +2672,8 @@
 
 	bio_put(bio);
 
-	btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
-			scrub_parity_bio_endio_worker, NULL, NULL);
+	btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
+			NULL);
 	btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
 }
 
@@ -3041,7 +2975,8 @@
 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 					   struct map_lookup *map,
 					   struct btrfs_device *scrub_dev,
-					   int num, u64 base, u64 length)
+					   int num, u64 base, u64 length,
+					   struct btrfs_block_group *cache)
 {
 	struct btrfs_path *path, *ppath;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
@@ -3087,7 +3022,7 @@
 		offset = map->stripe_len * (num / map->sub_stripes);
 		increment = map->stripe_len * factor;
 		mirror_num = num % map->sub_stripes + 1;
-	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
 		increment = map->stripe_len;
 		mirror_num = num % map->num_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
@@ -3279,6 +3214,20 @@
 				break;
 			}
 
+			/*
+			 * If our block group was removed in the meanwhile, just
+			 * stop scrubbing since there is no point in continuing.
+			 * Continuing would prevent reusing its device extents
+			 * for new block groups for a long time.
+			 */
+			spin_lock(&cache->lock);
+			if (cache->removed) {
+				spin_unlock(&cache->lock);
+				ret = 0;
+				goto out;
+			}
+			spin_unlock(&cache->lock);
+
 			extent = btrfs_item_ptr(l, slot,
 						struct btrfs_extent_item);
 			flags = btrfs_extent_flags(l, extent);
@@ -3323,13 +3272,14 @@
 					       &extent_dev,
 					       &extent_mirror_num);
 
-			ret = btrfs_lookup_csums_range(csum_root,
-						       extent_logical,
-						       extent_logical +
-						       extent_len - 1,
-						       &sctx->csum_list, 1);
-			if (ret)
-				goto out;
+			if (flags & BTRFS_EXTENT_FLAG_DATA) {
+				ret = btrfs_lookup_csums_range(csum_root,
+						extent_logical,
+						extent_logical + extent_len - 1,
+						&sctx->csum_list, 1);
+				if (ret)
+					goto out;
+			}
 
 			ret = scrub_extent(sctx, map, extent_logical, extent_len,
 					   extent_physical, extent_dev, flags,
@@ -3415,18 +3365,18 @@
 			   struct btrfs_device *scrub_dev,
 			   u64 chunk_offset, u64 length,
 			   u64 dev_offset,
-			   struct btrfs_block_group_cache *cache)
+			   struct btrfs_block_group *cache)
 {
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
 	int i;
 	int ret = 0;
 
-	read_lock(&map_tree->map_tree.lock);
-	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
-	read_unlock(&map_tree->map_tree.lock);
+	read_lock(&map_tree->lock);
+	em = lookup_extent_mapping(map_tree, chunk_offset, 1);
+	read_unlock(&map_tree->lock);
 
 	if (!em) {
 		/*
@@ -3452,7 +3402,7 @@
 		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
 		    map->stripes[i].physical == dev_offset) {
 			ret = scrub_stripe(sctx, map, scrub_dev, i,
-					   chunk_offset, length);
+					   chunk_offset, length, cache);
 			if (ret)
 				goto out;
 		}
 	}
@@ -3479,7 +3429,7 @@
 	struct extent_buffer *l;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	struct btrfs_block_group_cache *cache;
+	struct btrfs_block_group *cache;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
 	path = btrfs_alloc_path();
@@ -3550,6 +3500,23 @@
 			goto skip;
 
 		/*
+		 * Make sure that while we are scrubbing the corresponding block
+		 * group doesn't get its logical address and its device extents
+		 * reused for another block group, which can possibly be of a
+		 * different type and different profile. We do this to prevent
+		 * false error detections and crashes due to bogus attempts to
+		 * repair extents.
+		 */
+		spin_lock(&cache->lock);
+		if (cache->removed) {
+			spin_unlock(&cache->lock);
+			btrfs_put_block_group(cache);
+			goto skip;
+		}
+		btrfs_freeze_block_group(cache);
+		spin_unlock(&cache->lock);
+
+		/*
 		 * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
 		 * to avoid deadlock caused by:
 		 * btrfs_inc_block_group_ro()
 		 * -> btrfs_wait_for_commit()
@@ -3558,71 +3525,90 @@
 		 * -> btrfs_scrub_pause()
 		 */
 		scrub_pause_on(fs_info);
-		ret = btrfs_inc_block_group_ro(cache);
-		if (!ret && sctx->is_dev_replace) {
-			/*
-			 * If we are doing a device replace wait for any tasks
-			 * that started dellaloc right before we set the block
-			 * group to RO mode, as they might have just allocated
-			 * an extent from it or decided they could do a nocow
-			 * write. And if any such tasks did that, wait for their
-			 * ordered extents to complete and then commit the
-			 * current transaction, so that we can later see the new
-			 * extent items in the extent tree - the ordered extents
-			 * create delayed data references (for cow writes) when
-			 * they complete, which will be run and insert the
-			 * corresponding extent items into the extent tree when
-			 * we commit the transaction they used when running
-			 * inode.c:btrfs_finish_ordered_io(). We later use
-			 * the commit root of the extent tree to find extents
-			 * to copy from the srcdev into the tgtdev, and we don't
-			 * want to miss any new extents.
-			 */
-			btrfs_wait_block_group_reservations(cache);
-			btrfs_wait_nocow_writers(cache);
-			ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
-						       cache->key.objectid,
-						       cache->key.offset);
-			if (ret > 0) {
-				struct btrfs_trans_handle *trans;
-
-				trans = btrfs_join_transaction(root);
-				if (IS_ERR(trans))
-					ret = PTR_ERR(trans);
-				else
-					ret = btrfs_commit_transaction(trans);
-				if (ret) {
-					scrub_pause_off(fs_info);
-					btrfs_put_block_group(cache);
-					break;
-				}
-			}
-		}
-		scrub_pause_off(fs_info);
-
+		/*
+		 * Don't do chunk preallocation for scrub.
+		 *
+		 * This is especially important for SYSTEM bgs, or we can hit
+		 * -EFBIG from btrfs_finish_chunk_alloc() like:
+		 * 1. The only SYSTEM bg is marked RO.
+		 *    Since SYSTEM bg is small, that's pretty common.
+		 * 2. New SYSTEM bg will be allocated
+		 *    Because the regular chunk allocation path will create a
+		 *    new chunk.
+		 * 3. New SYSTEM bg is empty and will get cleaned up
+		 *    Before cleanup really happens, it's marked RO again.
+		 * 4. Empty SYSTEM bg gets scrubbed
+		 *    We go back to 2.
+		 *
+		 * This can easily boost the amount of SYSTEM chunks if cleaner
+		 * thread can't be triggered fast enough, and use up all the
+		 * space of btrfs_super_block::sys_chunk_array.
+		 *
+		 * While for dev replace, we need to try our best to mark block
+		 * group RO, to prevent race between:
+		 * - Write duplication
+		 *   Contains latest data
+		 * - Scrub copy
+		 *   Contains data from commit tree
+		 *
+		 * If target block group is not marked RO, nocow writes can
+		 * be overwritten by scrub copy, causing data corruption.
+		 * So for dev-replace, it's not allowed to continue if a block
+		 * group is not RO.
+		 */
+		ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
 		if (ret == 0) {
 			ro_set = 1;
-		} else if (ret == -ENOSPC) {
+		} else if (ret == -ENOSPC && !sctx->is_dev_replace &&
+			   !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
 			/*
 			 * btrfs_inc_block_group_ro returns -ENOSPC when it
 			 * failed in creating new chunk for metadata.
-			 * It is not a problem for scrub/replace, because
+			 * It is not a problem for scrub, because
 			 * metadata are always cowed, and our scrub paused
 			 * commit_transactions.
+			 *
+			 * For RAID56 chunks, we have to mark them read-only
+			 * for scrub, as later we would use our own cache
+			 * out of the RAID56 realm.
+			 * Thus we want the RAID56 bg to be marked RO to
+			 * prevent RMW from screwing up our cache.
 			 */
 			ro_set = 0;
+		} else if (ret == -ETXTBSY) {
+			btrfs_warn(fs_info,
+		   "skipping scrub of block group %llu due to active swapfile",
+				   cache->start);
+			scrub_pause_off(fs_info);
+			ret = 0;
+			goto skip_unfreeze;
 		} else {
 			btrfs_warn(fs_info,
 				   "failed setting block group ro: %d", ret);
+			btrfs_unfreeze_block_group(cache);
 			btrfs_put_block_group(cache);
+			scrub_pause_off(fs_info);
 			break;
 		}
 
-		btrfs_dev_replace_write_lock(&fs_info->dev_replace);
+		/*
+		 * Now the target block is marked RO, wait for nocow writes to
+		 * finish before dev-replace.
+		 * COW is fine, as COW never overwrites extents in commit tree.
+		 */
+		if (sctx->is_dev_replace) {
+			btrfs_wait_nocow_writers(cache);
+			btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
+					cache->length);
+		}
+
+		scrub_pause_off(fs_info);
+		down_write(&dev_replace->rwsem);
 		dev_replace->cursor_right = found_key.offset + length;
 		dev_replace->cursor_left = found_key.offset;
 		dev_replace->item_needs_writeback = 1;
-		btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
+		up_write(&dev_replace->rwsem);
+
 		ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
 				  found_key.offset, cache);
@@ -3658,10 +3644,10 @@
 
 		scrub_pause_off(fs_info);
 
-		btrfs_dev_replace_write_lock(&fs_info->dev_replace);
+		down_write(&dev_replace->rwsem);
 		dev_replace->cursor_left = dev_replace->cursor_right;
 		dev_replace->item_needs_writeback = 1;
-		btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
+		up_write(&dev_replace->rwsem);
 
 		if (ro_set)
 			btrfs_dec_block_group_ro(cache);
@@ -3675,13 +3661,18 @@
 		 */
 		spin_lock(&cache->lock);
 		if (!cache->removed && !cache->ro && cache->reserved == 0 &&
-		    btrfs_block_group_used(&cache->item) == 0) {
+		    cache->used == 0) {
 			spin_unlock(&cache->lock);
-			btrfs_mark_bg_unused(cache);
+			if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
+				btrfs_discard_queue_work(&fs_info->discard_ctl,
+							 cache);
+			else
+				btrfs_mark_bg_unused(cache);
 		} else {
 			spin_unlock(&cache->lock);
 		}
-
+skip_unfreeze:
+		btrfs_unfreeze_block_group(cache);
 		btrfs_put_block_group(cache);
 		if (ret)
 			break;
@@ -3714,7 +3705,7 @@
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 
 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
-		return -EIO;
+		return -EROFS;
 
 	/* Seed devices of a new filesystem have their own generation. */
 	if (scrub_dev->fs_devices != fs_info->fs_devices)
@@ -3739,42 +3730,84 @@
 	return 0;
 }
 
+static void scrub_workers_put(struct btrfs_fs_info *fs_info)
+{
+	if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
+					&fs_info->scrub_lock)) {
+		struct btrfs_workqueue *scrub_workers = NULL;
+		struct btrfs_workqueue *scrub_wr_comp = NULL;
+		struct btrfs_workqueue *scrub_parity = NULL;
+
+		scrub_workers = fs_info->scrub_workers;
+		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+		scrub_parity = fs_info->scrub_parity_workers;
+
+		fs_info->scrub_workers = NULL;
+		fs_info->scrub_wr_completion_workers = NULL;
+		fs_info->scrub_parity_workers = NULL;
+		mutex_unlock(&fs_info->scrub_lock);
+
+		btrfs_destroy_workqueue(scrub_workers);
+		btrfs_destroy_workqueue(scrub_wr_comp);
+		btrfs_destroy_workqueue(scrub_parity);
+	}
+}
+
 /*
  * get a reference count on fs_info->scrub_workers. start worker if necessary
  */
 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 						int is_dev_replace)
 {
+	struct btrfs_workqueue *scrub_workers = NULL;
+	struct btrfs_workqueue *scrub_wr_comp = NULL;
+	struct btrfs_workqueue *scrub_parity = NULL;
 	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
 	int max_active = fs_info->thread_pool_size;
+	int ret = -ENOMEM;
 
-	if (fs_info->scrub_workers_refcnt == 0) {
-		fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
-				flags, is_dev_replace ? 1 : max_active, 4);
-		if (!fs_info->scrub_workers)
-			goto fail_scrub_workers;
+	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
+		return 0;
 
-		fs_info->scrub_wr_completion_workers =
-			btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
+	scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
					      is_dev_replace ? 1 : max_active, 4);
+	if (!scrub_workers)
+		goto fail_scrub_workers;
+
+	scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
 					      max_active, 2);
-		if (!fs_info->scrub_wr_completion_workers)
-			goto fail_scrub_wr_completion_workers;
+	if (!scrub_wr_comp)
+		goto fail_scrub_wr_completion_workers;
 
-		fs_info->scrub_parity_workers =
-			btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
-					      max_active, 2);
-		if (!fs_info->scrub_parity_workers)
-			goto fail_scrub_parity_workers;
+	scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
+					     max_active, 2);
+	if (!scrub_parity)
+		goto fail_scrub_parity_workers;
+
+	mutex_lock(&fs_info->scrub_lock);
+	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+		ASSERT(fs_info->scrub_workers == NULL &&
+		       fs_info->scrub_wr_completion_workers == NULL &&
+		       fs_info->scrub_parity_workers == NULL);
+		fs_info->scrub_workers = scrub_workers;
+		fs_info->scrub_wr_completion_workers = scrub_wr_comp;
+		fs_info->scrub_parity_workers = scrub_parity;
+		refcount_set(&fs_info->scrub_workers_refcnt, 1);
+		mutex_unlock(&fs_info->scrub_lock);
+		return 0;
 	}
-	++fs_info->scrub_workers_refcnt;
-	return 0;
+	/* Other thread raced in and created the workers for us */
+	refcount_inc(&fs_info->scrub_workers_refcnt);
+	mutex_unlock(&fs_info->scrub_lock);
 
+	ret = 0;
+	btrfs_destroy_workqueue(scrub_parity);
 fail_scrub_parity_workers:
-	btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+	btrfs_destroy_workqueue(scrub_wr_comp);
 fail_scrub_wr_completion_workers:
-	btrfs_destroy_workqueue(fs_info->scrub_workers);
+	btrfs_destroy_workqueue(scrub_workers);
 fail_scrub_workers:
-	return -ENOMEM;
+	return ret;
 }
 
 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -3785,12 +3818,10 @@
 	int ret;
 	struct btrfs_device *dev;
 	unsigned int nofs_flag;
-	struct btrfs_workqueue *scrub_workers = NULL;
-	struct btrfs_workqueue *scrub_wr_comp = NULL;
-	struct btrfs_workqueue *scrub_parity = NULL;
+	bool need_commit = false;
 
 	if (btrfs_fs_closing(fs_info))
-		return -EINVAL;
+		return -EAGAIN;
 
 	if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
 		/*
@@ -3834,22 +3865,27 @@
 	if (IS_ERR(sctx))
 		return PTR_ERR(sctx);
 
+	ret = scrub_workers_get(fs_info, is_dev_replace);
+	if (ret)
+		goto out_free_ctx;
+
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
 	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
 		     !is_dev_replace)) {
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		ret = -ENODEV;
-		goto out_free_ctx;
+		goto out;
 	}
 
 	if (!is_dev_replace && !readonly &&
 	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
-				 rcu_str_deref(dev->name));
+		btrfs_err_in_rcu(fs_info,
+			"scrub on devid %llu: filesystem on %s is not writable",
+				 devid, rcu_str_deref(dev->name));
 		ret = -EROFS;
-		goto out_free_ctx;
+		goto out;
 	}
 
 	mutex_lock(&fs_info->scrub_lock);
@@ -3858,27 +3894,20 @@
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		ret = -EIO;
-		goto out_free_ctx;
+		goto out;
 	}
 
-	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
+	down_read(&fs_info->dev_replace.rwsem);
 	if (dev->scrub_ctx ||
 	    (!is_dev_replace &&
 	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
-		btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+		up_read(&fs_info->dev_replace.rwsem);
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		ret = -EINPROGRESS;
-		goto out_free_ctx;
+		goto out;
 	}
-	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
-
-	ret = scrub_workers_get(fs_info, is_dev_replace);
-	if (ret) {
-		mutex_unlock(&fs_info->scrub_lock);
-		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		goto out_free_ctx;
-	}
+	up_read(&fs_info->dev_replace.rwsem);
 
 	sctx->readonly = readonly;
 	dev->scrub_ctx = sctx;
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3903,6 +3932,13 @@
 	 */
 	nofs_flag = memalloc_nofs_save();
 	if (!is_dev_replace) {
+		u64 old_super_errors;
+
+		spin_lock(&sctx->stat_lock);
+		old_super_errors = sctx->stat.super_errors;
+		spin_unlock(&sctx->stat_lock);
+
+		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
 		/*
 		 * by holding device list mutex, we can
 		 * kick off writing super in log tree sync.
@@ -3910,6 +3946,16 @@
 		mutex_lock(&fs_info->fs_devices->device_list_mutex);
 		ret = scrub_supers(sctx, dev);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+		spin_lock(&sctx->stat_lock);
+		/*
+		 * Super block errors were found, but we cannot commit a
+		 * transaction in the current context, since
+		 * btrfs_commit_transaction() needs to pause the currently
+		 * running scrub (held by ourselves).
+		 */
+		if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
+			need_commit = true;
+		spin_unlock(&sctx->stat_lock);
 	}
 
 	if (!ret)
@@ -3925,22 +3971,39 @@
 	if (progress)
 		memcpy(progress, &sctx->stat, sizeof(*progress));
 
+	if (!is_dev_replace)
+		btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
+			   ret ? "not finished" : "finished", devid, ret);
+
 	mutex_lock(&fs_info->scrub_lock);
 	dev->scrub_ctx = NULL;
-	if (--fs_info->scrub_workers_refcnt == 0) {
-		scrub_workers = fs_info->scrub_workers;
-		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
-		scrub_parity = fs_info->scrub_parity_workers;
-	}
 	mutex_unlock(&fs_info->scrub_lock);
 
-	btrfs_destroy_workqueue(scrub_workers);
-	btrfs_destroy_workqueue(scrub_wr_comp);
-	btrfs_destroy_workqueue(scrub_parity);
+	scrub_workers_put(fs_info);
 	scrub_put_ctx(sctx);
 
-	return ret;
+	/*
+	 * We found some super block errors before; now that scrub has
+	 * finished, try to force a transaction commit.
+	 */
+	if (need_commit) {
+		struct btrfs_trans_handle *trans;
+
+		trans = btrfs_start_transaction(fs_info->tree_root, 0);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			btrfs_err(fs_info,
+	"scrub: failed to start transaction to fix super block errors: %d", ret);
+			return ret;
+		}
+		ret = btrfs_commit_transaction(trans);
+		if (ret < 0)
+			btrfs_err(fs_info,
+	"scrub: failed to commit transaction to fix super block errors: %d", ret);
+	}
+	return ret;
+out:
+	scrub_workers_put(fs_info);
 out_free_ctx:
 	scrub_free_ctx(sctx);
 
@@ -3989,9 +4052,9 @@
 	return 0;
 }
 
-int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
-			   struct btrfs_device *dev)
+int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
 {
+	struct btrfs_fs_info *fs_info = dev->fs_info;
 	struct scrub_ctx *sctx;
 
 	mutex_lock(&fs_info->scrub_lock);
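
A note on the GFP_NOFS rework above: instead of wrapping the single kmalloc()
inside insert_full_stripe_lock(), the whole repair path under
scrub_handle_errored_block() now runs in a NOFS scope, so every allocation
below it (insert_full_stripe_lock(), scrub_add_page_to_wr_bio(), the kcalloc()
of sblocks_for_recheck) can pass plain GFP_KERNEL and still be implicitly
degraded to GFP_NOFS. A minimal sketch of that scoping pattern follows;
repair_block() and the buffer size are illustrative, not from the patch:

#include <linux/sched/mm.h>
#include <linux/slab.h>

/*
 * Everything called inside the save/restore window allocates as if
 * GFP_NOFS had been passed, so deep helpers can keep using GFP_KERNEL.
 */
static int repair_block(void)
{
	unsigned int nofs_flag;
	void *buf;
	int ret = 0;

	/* A transaction commit may be waiting on scrub: forbid FS reclaim. */
	nofs_flag = memalloc_nofs_save();

	buf = kmalloc(4096, GFP_KERNEL);	/* effectively GFP_NOFS here */
	if (!buf)
		ret = -ENOMEM;
	kfree(buf);

	memalloc_nofs_restore(nofs_flag);
	return ret;
}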
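
The checksum rework in scrub_checksum_data(), scrub_checksum_tree_block() and
scrub_checksum_super() drops the open-coded btrfs_csum_data()/btrfs_csum_final()
loop in favour of the generic crypto shash API, driven by fs_info->csum_shash
(a tfm allocated once at mount). The same init/update/final flow, reduced to a
self-contained helper: csum_two_pages() is a made-up name, the tfm allocation
is shown only for completeness, and the pages are assumed to be lowmem so
page_address() is valid (as in the patch):

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/mm.h>

/* Compute a crc32c over two pages; out must hold the 4-byte digest. */
static int csum_two_pages(struct page *p0, struct page *p1, u8 *out)
{
	struct crypto_shash *tfm;
	int ret;

	tfm = crypto_alloc_shash("crc32c", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(shash, tfm);

		shash->tfm = tfm;
		ret = crypto_shash_init(shash);
		if (!ret)
			ret = crypto_shash_update(shash, page_address(p0),
						  PAGE_SIZE);
		if (!ret)
			ret = crypto_shash_update(shash, page_address(p1),
						  PAGE_SIZE);
		if (!ret)
			ret = crypto_shash_final(shash, out);
	}

	crypto_free_shash(tfm);
	return ret;
}

When the whole input is contiguous, crypto_shash_digest() collapses the
init/update/final triple into one call, which is exactly what the patched
scrub_checksum_data() and scrub_checksum_super() do.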
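
Finally, scrub_workers_get()/scrub_workers_put() replace the integer
scrub_workers_refcnt guarded by scrub_lock with a refcount_t: an unlocked
refcount_inc_not_zero() fast path, allocation outside the lock, and a locked
re-check that either publishes the new workqueues or drops them when another
thread won the race. The shape of that pattern on a generic lazily-created
object (all names here are hypothetical):

#include <linux/mutex.h>
#include <linux/refcount.h>
#include <linux/slab.h>

static DEFINE_MUTEX(res_lock);
static refcount_t res_refs = REFCOUNT_INIT(0);
static void *res;

static int resource_get(void)
{
	void *tmp;

	/* Fast path: already created, just take a reference. */
	if (refcount_inc_not_zero(&res_refs))
		return 0;

	/* Slow path: build the resource without holding the lock... */
	tmp = kzalloc(64, GFP_KERNEL);
	if (!tmp)
		return -ENOMEM;

	mutex_lock(&res_lock);
	if (refcount_read(&res_refs) == 0) {
		/* ...and publish it if we are first. */
		res = tmp;
		refcount_set(&res_refs, 1);
		mutex_unlock(&res_lock);
		return 0;
	}
	/* Somebody else raced in and created it for us. */
	refcount_inc(&res_refs);
	mutex_unlock(&res_lock);
	kfree(tmp);
	return 0;
}

static void resource_put(void)
{
	/* Returns true, with the lock held, only for the last reference. */
	if (refcount_dec_and_mutex_lock(&res_refs, &res_lock)) {
		void *tmp = res;

		res = NULL;
		mutex_unlock(&res_lock);
		kfree(tmp);
	}
}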