From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/fs/btrfs/free-space-cache.c | 949 ++++++++++++++++++++++++++++++++++++++++++---------------- 1 files changed, 679 insertions(+), 270 deletions(-) diff --git a/kernel/fs/btrfs/free-space-cache.c b/kernel/fs/btrfs/free-space-cache.c index 6511cb7..4989c60 100644 --- a/kernel/fs/btrfs/free-space-cache.c +++ b/kernel/fs/btrfs/free-space-cache.c @@ -18,9 +18,14 @@ #include "extent_io.h" #include "inode-map.h" #include "volumes.h" +#include "space-info.h" +#include "delalloc-space.h" +#include "block-group.h" +#include "discard.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) -#define MAX_CACHE_BYTES_PER_GIG SZ_32K +#define MAX_CACHE_BYTES_PER_GIG SZ_64K +#define FORCE_EXTENT_THRESHOLD SZ_1M struct btrfs_trim_range { u64 start; @@ -28,6 +33,8 @@ struct list_head list; }; +static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info); static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, @@ -75,7 +82,7 @@ * sure NOFS is set to keep us from deadlocking. */ nofs_flag = memalloc_nofs_save(); - inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path); + inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path); btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) @@ -88,10 +95,10 @@ return inode; } -struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache - *block_group, struct btrfs_path *path) +struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, + struct btrfs_path *path) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct inode *inode = NULL; u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; @@ -103,7 +110,7 @@ return inode; inode = __lookup_free_space_inode(fs_info->tree_root, path, - block_group->key.objectid); + block_group->start); if (IS_ERR(inode)) return inode; @@ -185,20 +192,19 @@ return 0; } -int create_free_space_inode(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, +int create_free_space_inode(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, struct btrfs_path *path) { int ret; u64 ino; - ret = btrfs_find_free_objectid(fs_info->tree_root, &ino); + ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino); if (ret < 0) return ret; - return __create_free_space_inode(fs_info->tree_root, trans, path, ino, - block_group->key.objectid); + return __create_free_space_inode(trans->fs_info->tree_root, trans, path, + ino, block_group->start); } int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, @@ -208,8 +214,8 @@ int ret; /* 1 for slack space, 1 for updating the inode */ - needed_bytes = btrfs_calc_trunc_metadata_size(fs_info, 1) + - btrfs_calc_trans_metadata_size(fs_info, 1); + needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + + btrfs_calc_metadata_size(fs_info, 1); spin_lock(&rsv->lock); if (rsv->reserved < needed_bytes) @@ -221,7 +227,7 @@ } int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -365,10 +371,10 @@ } } -static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode, - int uptodate) +static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) { struct page *page; + struct inode *inode = io_ctl->inode; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int i; @@ -407,8 +413,6 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *val; - io_ctl_map_page(io_ctl, 1); /* @@ -423,14 +427,13 @@ io_ctl->size -= sizeof(u64) * 2; } - val = io_ctl->cur; - *val = cpu_to_le64(generation); + put_unaligned_le64(generation, io_ctl->cur); io_ctl->cur += sizeof(u64); } static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *gen; + u64 cache_gen; /* * Skip the crc area. If we don't check crcs then we just have a 64bit @@ -445,11 +448,11 @@ io_ctl->size -= sizeof(u64) * 2; } - gen = io_ctl->cur; - if (le64_to_cpu(*gen) != generation) { + cache_gen = get_unaligned_le64(io_ctl->cur); + if (cache_gen != generation) { btrfs_err_rl(io_ctl->fs_info, "space cache generation (%llu) does not match inode (%llu)", - *gen, generation); + cache_gen, generation); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -471,9 +474,8 @@ if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - crc = btrfs_csum_data(io_ctl->orig + offset, crc, - PAGE_SIZE - offset); - btrfs_csum_final(crc, (u8 *)&crc); + crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + btrfs_crc32c_final(crc, (u8 *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); tmp += index; @@ -499,9 +501,8 @@ val = *tmp; io_ctl_map_page(io_ctl, 0); - crc = btrfs_csum_data(io_ctl->orig + offset, crc, - PAGE_SIZE - offset); - btrfs_csum_final(crc, (u8 *)&crc); + crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + btrfs_crc32c_final(crc, (u8 *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->fs_info, "csum mismatch on free space cache"); @@ -521,8 +522,8 @@ return -ENOSPC; entry = io_ctl->cur; - entry->offset = cpu_to_le64(offset); - entry->bytes = cpu_to_le64(bytes); + put_unaligned_le64(offset, &entry->offset); + put_unaligned_le64(bytes, &entry->bytes); entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : BTRFS_FREE_SPACE_EXTENT; io_ctl->cur += sizeof(struct btrfs_free_space_entry); @@ -595,8 +596,8 @@ } e = io_ctl->cur; - entry->offset = le64_to_cpu(e->offset); - entry->bytes = le64_to_cpu(e->bytes); + entry->offset = get_unaligned_le64(&e->offset); + entry->bytes = get_unaligned_le64(&e->bytes); *type = e->type; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); @@ -728,7 +729,7 @@ readahead_cache(inode); - ret = io_ctl_prepare_pages(&io_ctl, inode, 1); + ret = io_ctl_prepare_pages(&io_ctl, true); if (ret) goto out; @@ -753,6 +754,16 @@ kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } + + /* + * Sync discard ensures that the free space cache is always + * trimmed. So when reading this in, the state should reflect + * that. We also do this for async as a stop gap for lack of + * persistence. + */ + if (btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC)) + e->trim_state = BTRFS_TRIM_STATE_TRIMMED; if (!e->bytes) { ret = -1; @@ -783,15 +794,16 @@ } spin_lock(&ctl->tree_lock); ret = link_free_space(ctl, e); - ctl->total_bitmaps++; - ctl->op->recalc_thresholds(ctl); - spin_unlock(&ctl->tree_lock); if (ret) { + spin_unlock(&ctl->tree_lock); btrfs_err(fs_info, "Duplicate entries in free space cache, dumping"); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } + ctl->total_bitmaps++; + ctl->op->recalc_thresholds(ctl); + spin_unlock(&ctl->tree_lock); list_add_tail(&e->list, &bitmaps); } @@ -809,12 +821,19 @@ ret = io_ctl_read_bitmap(&io_ctl, e); if (ret) goto free_cache; + e->bitmap_extents = count_bitmap_extents(ctl, e); + if (!btrfs_free_space_trimmed(e)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += + e->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] += e->bytes; + } } io_ctl_drop_pages(&io_ctl); merge_space_tree(ctl); ret = 1; out: + btrfs_discard_update_discardable(ctl->private, ctl); io_ctl_free(&io_ctl); return ret; free_cache: @@ -823,15 +842,15 @@ goto out; } -int load_free_space_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group) +int load_free_space_cache(struct btrfs_block_group *block_group) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; struct btrfs_path *path; int ret = 0; bool matched; - u64 used = btrfs_block_group_used(&block_group->item); + u64 used = block_group->used; /* * If this block group has been marked to be cleared for one reason or @@ -869,7 +888,7 @@ * once created get their ->cached field set to BTRFS_CACHE_FINISHED so * we will never try to read their inode item while the fs is mounted. */ - inode = lookup_free_space_inode(fs_info, block_group, path); + inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode)) { btrfs_free_path(path); return 0; @@ -885,13 +904,13 @@ spin_unlock(&block_group->lock); ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, - path, block_group->key.objectid); + path, block_group->start); btrfs_free_path(path); if (ret <= 0) goto out; spin_lock(&ctl->tree_lock); - matched = (ctl->free_space == (block_group->key.offset - used - + matched = (ctl->free_space == (block_group->length - used - block_group->bytes_super)); spin_unlock(&ctl->tree_lock); @@ -899,7 +918,7 @@ __btrfs_remove_free_space_cache(ctl); btrfs_warn(fs_info, "block group %llu has wrong amount of free space", - block_group->key.objectid); + block_group->start); ret = -1; } out: @@ -912,7 +931,7 @@ btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuilding it now", - block_group->key.objectid); + block_group->start); } iput(inode); @@ -922,7 +941,7 @@ static noinline_for_stack int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space_ctl *ctl, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, int *entries, int *bitmaps, struct list_head *bitmap_list) { @@ -1015,7 +1034,7 @@ ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); + EXTENT_DELALLOC, 0, 0, NULL); goto fail; } leaf = path->nodes[0]; @@ -1027,9 +1046,8 @@ if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, - NULL); + inode->i_size - 1, EXTENT_DELALLOC, 0, + 0, NULL); btrfs_release_path(path); goto fail; } @@ -1050,9 +1068,9 @@ return -1; } -static noinline_for_stack int -write_pinned_extent_entries(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, +static noinline_for_stack int write_pinned_extent_entries( + struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, int *entries) { @@ -1070,11 +1088,11 @@ * We shouldn't have switched the pinned extents yet so this is the * right one */ - unpin = fs_info->pinned_extents; + unpin = &trans->transaction->pinned_extents; - start = block_group->key.objectid; + start = block_group->start; - while (start < block_group->key.objectid + block_group->key.offset) { + while (start < block_group->start + block_group->length) { ret = find_first_extent_bit(unpin, start, &extent_start, &extent_end, EXTENT_DIRTY, NULL); @@ -1082,13 +1100,12 @@ return 0; /* This pinned extent is out of our range */ - if (extent_start >= block_group->key.objectid + - block_group->key.offset) + if (extent_start >= block_group->start + block_group->length) return 0; extent_start = max(extent_start, start); - extent_end = min(block_group->key.objectid + - block_group->key.offset, extent_end + 1); + extent_end = min(block_group->start + block_group->length, + extent_end + 1); len = extent_end - extent_start; *entries += 1; @@ -1126,7 +1143,7 @@ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); + EXTENT_DELALLOC, 0, 0, NULL); return ret; } @@ -1152,7 +1169,7 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root, struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, struct btrfs_path *path, u64 offset) { @@ -1174,13 +1191,10 @@ if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; - if (block_group) { -#ifdef DEBUG - btrfs_err(root->fs_info, - "failed to write free space cache for block group %llu", - block_group->key.objectid); -#endif - } + if (block_group) + btrfs_debug(root->fs_info, + "failed to write free space cache for block group %llu error %d", + block_group->start, ret); } btrfs_update_inode(trans, root, inode); @@ -1220,12 +1234,12 @@ } int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path) { return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, block_group, &block_group->io_ctl, - path, block_group->key.objectid); + path, block_group->start); } /** @@ -1241,11 +1255,10 @@ */ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, struct btrfs_trans_handle *trans) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_state *cached_state = NULL; LIST_HEAD(bitmap_list); int entries = 0; @@ -1277,7 +1290,7 @@ } /* Lock all pages first so we can lock the extent safely. */ - ret = io_ctl_prepare_pages(io_ctl, inode, 0); + ret = io_ctl_prepare_pages(io_ctl, false); if (ret) goto out_unlock; @@ -1303,8 +1316,7 @@ * If this changes while we are working we'll get added back to * the dirty list and redo it. No locking needed */ - ret = write_pinned_extent_entries(fs_info, block_group, - io_ctl, &entries); + ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); if (ret) goto out_nospc_locked; @@ -1323,8 +1335,9 @@ io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. */ - ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0, - i_size_read(inode), &cached_state); + ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, + io_ctl->num_pages, 0, i_size_read(inode), + &cached_state); if (ret) goto out_nospc; @@ -1342,7 +1355,7 @@ /* * at this point the pages are under IO and we're happy, - * The caller is responsible for waiting on them and updating the + * The caller is responsible for waiting on them and updating * the cache and the inode */ io_ctl->entries = entries; @@ -1353,18 +1366,6 @@ goto out; return 0; - -out: - io_ctl->inode = NULL; - io_ctl_free(io_ctl); - if (ret) { - invalidate_inode_pages2(inode->i_mapping); - BTRFS_I(inode)->generation = 0; - } - btrfs_update_inode(trans, root, inode); - if (must_iput) - iput(inode); - return ret; out_nospc_locked: cleanup_bitmap_list(&bitmap_list); @@ -1378,14 +1379,24 @@ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); - goto out; +out: + io_ctl->inode = NULL; + io_ctl_free(io_ctl); + if (ret) { + invalidate_inode_pages2(inode->i_mapping); + BTRFS_I(inode)->generation = 0; + } + btrfs_update_inode(trans, root, inode); + if (must_iput) + iput(inode); + return ret; } -int btrfs_write_out_cache(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, +int btrfs_write_out_cache(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, struct btrfs_path *path) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; @@ -1397,18 +1408,16 @@ } spin_unlock(&block_group->lock); - inode = lookup_free_space_inode(fs_info, block_group, path); + inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode)) return 0; ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, block_group, &block_group->io_ctl, trans); if (ret) { -#ifdef DEBUG - btrfs_err(fs_info, - "failed to write free space cache for block group %llu", - block_group->key.objectid); -#endif + btrfs_debug(fs_info, + "failed to write free space cache for block group %llu error %d", + block_group->start, ret); spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); @@ -1633,6 +1642,11 @@ { rb_erase(&info->offset_index, &ctl->free_space_offset); ctl->free_extents--; + + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR]--; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; + } } static void unlink_free_space(struct btrfs_free_space_ctl *ctl, @@ -1653,6 +1667,11 @@ if (ret) return ret; + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR]++; + ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; + } + ctl->free_space += info->bytes; ctl->free_extents++; return ret; @@ -1660,11 +1679,11 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) { - struct btrfs_block_group_cache *block_group = ctl->private; + struct btrfs_block_group *block_group = ctl->private; u64 max_bytes; u64 bitmap_bytes; u64 extent_bytes; - u64 size = block_group->key.offset; + u64 size = block_group->length; u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); @@ -1673,26 +1692,17 @@ ASSERT(ctl->total_bitmaps <= max_bitmaps); /* - * The goal is to keep the total amount of memory used per 1gb of space - * at or below 32k, so we need to adjust how much memory we allow to be - * used by extent based free space tracking + * We are trying to keep the total amount of memory used per 1GiB of + * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation + * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of + * bitmaps, we may end up using more memory than this. */ if (size < SZ_1G) max_bytes = MAX_CACHE_BYTES_PER_GIG; else max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); - /* - * we want to account for 1 more bitmap than what we have so we can make - * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as - * we add more bitmaps. - */ - bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit; - - if (bitmap_bytes >= max_bytes) { - ctl->extents_thresh = 0; - return; - } + bitmap_bytes = ctl->total_bitmaps * ctl->unit; /* * we want the extent entry threshold to always be at most 1/2 the max @@ -1709,17 +1719,31 @@ struct btrfs_free_space *info, u64 offset, u64 bytes) { - unsigned long start, count; + unsigned long start, count, end; + int extent_delta = -1; start = offset_to_bit(info->offset, ctl->unit, offset); count = bytes_to_bits(bytes, ctl->unit); - ASSERT(start + count <= BITS_PER_BITMAP); + end = start + count; + ASSERT(end <= BITS_PER_BITMAP); bitmap_clear(info->bitmap, start, count); info->bytes -= bytes; if (info->max_extent_size > ctl->unit) info->max_extent_size = 0; + + if (start && test_bit(start - 1, info->bitmap)) + extent_delta++; + + if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) + extent_delta++; + + info->bitmap_extents += extent_delta; + if (!btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; + } } static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, @@ -1734,16 +1758,30 @@ struct btrfs_free_space *info, u64 offset, u64 bytes) { - unsigned long start, count; + unsigned long start, count, end; + int extent_delta = 1; start = offset_to_bit(info->offset, ctl->unit, offset); count = bytes_to_bits(bytes, ctl->unit); - ASSERT(start + count <= BITS_PER_BITMAP); + end = start + count; + ASSERT(end <= BITS_PER_BITMAP); bitmap_set(info->bitmap, start, count); info->bytes += bytes; ctl->free_space += bytes; + + if (start && test_bit(start - 1, info->bitmap)) + extent_delta--; + + if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) + extent_delta--; + + info->bitmap_extents += extent_delta; + if (!btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; + ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; + } } /* @@ -1879,11 +1917,35 @@ return NULL; } +static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info) +{ + struct btrfs_block_group *block_group = ctl->private; + u64 bytes = bitmap_info->bytes; + unsigned int rs, re; + int count = 0; + + if (!block_group || !bytes) + return count; + + bitmap_for_each_set_region(bitmap_info->bitmap, rs, re, 0, + BITS_PER_BITMAP) { + bytes -= (rs - re) * ctl->unit; + count++; + + if (!bytes) + break; + } + + return count; +} + static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset) { info->offset = offset_to_bitmap(ctl, offset); info->bytes = 0; + info->bitmap_extents = 0; INIT_LIST_HEAD(&info->list); link_free_space(ctl, info); ctl->total_bitmaps++; @@ -1894,6 +1956,18 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info) { + /* + * Normally when this is called, the bitmap is completely empty. However, + * if we are blowing up the free space cache for one reason or another + * via __btrfs_remove_free_space_cache(), then it may not be freed and + * we may leave stats on the table. + */ + if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] -= + bitmap_info->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; + + } unlink_free_space(ctl, bitmap_info); kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); kmem_cache_free(btrfs_free_space_cachep, bitmap_info); @@ -1980,10 +2054,23 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, - u64 bytes) + u64 bytes, enum btrfs_trim_state trim_state) { u64 bytes_to_set = 0; u64 end; + + /* + * This is a tradeoff to make bitmap trim state minimal. We mark the + * whole bitmap untrimmed if at any point we add untrimmed regions. + */ + if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { + if (btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += + info->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; + } + info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + } end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); @@ -2004,7 +2091,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { - struct btrfs_block_group_cache *block_group = ctl->private; + struct btrfs_block_group *block_group = ctl->private; struct btrfs_fs_info *fs_info = block_group->fs_info; bool forced = false; @@ -2012,6 +2099,10 @@ if (btrfs_should_fragment_free_space(block_group)) forced = true; #endif + + /* This is a way to reclaim large regions from the bitmaps. */ + if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) + return false; /* * If we are below the extents threshold then we can add this as an @@ -2025,8 +2116,8 @@ * of cache left then go ahead an dadd them, no sense in adding * the overhead of a bitmap if we don't have to. */ - if (info->bytes <= fs_info->sectorsize * 4) { - if (ctl->free_extents * 2 <= ctl->extents_thresh) + if (info->bytes <= fs_info->sectorsize * 8) { + if (ctl->free_extents * 3 <= ctl->extents_thresh) return false; } else { return false; @@ -2041,7 +2132,7 @@ * so allow those block groups to still be allowed to have a bitmap * entry. */ - if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset) + if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length) return false; return true; @@ -2056,13 +2147,15 @@ struct btrfs_free_space *info) { struct btrfs_free_space *bitmap_info; - struct btrfs_block_group_cache *block_group = NULL; + struct btrfs_block_group *block_group = NULL; int added = 0; u64 bytes, offset, bytes_added; + enum btrfs_trim_state trim_state; int ret; bytes = info->bytes; offset = info->offset; + trim_state = info->trim_state; if (!ctl->op->use_bitmap(ctl, info)) return 0; @@ -2097,8 +2190,8 @@ } if (entry->offset == offset_to_bitmap(ctl, offset)) { - bytes_added = add_bytes_to_bitmap(ctl, entry, - offset, bytes); + bytes_added = add_bytes_to_bitmap(ctl, entry, offset, + bytes, trim_state); bytes -= bytes_added; offset += bytes_added; } @@ -2117,7 +2210,8 @@ goto new_bitmap; } - bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, + trim_state); bytes -= bytes_added; offset += bytes_added; added = 0; @@ -2151,6 +2245,7 @@ /* allocate the bitmap */ info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); + info->trim_state = BTRFS_TRIM_STATE_TRIMMED; spin_lock(&ctl->tree_lock); if (!info->bitmap) { ret = -ENOMEM; @@ -2170,6 +2265,22 @@ return ret; } +/* + * Free space merging rules: + * 1) Merge trimmed areas together + * 2) Let untrimmed areas coalesce with trimmed areas + * 3) Always pull neighboring regions from bitmaps + * + * The above rules are for when we merge free space based on btrfs_trim_state. + * Rules 2 and 3 are subtle because they are suboptimal, but are done for the + * same reason: to promote larger extent regions which makes life easier for + * find_free_extent(). Rule 2 enables coalescing based on the common path + * being returning free space from btrfs_finish_extent_commit(). So when free + * space is trimmed, it will prevent aggregating trimmed new region and + * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents + * and provide find_free_extent() with the largest extents possible hoping for + * the reuse path. + */ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { @@ -2178,6 +2289,7 @@ bool merged = false; u64 offset = info->offset; u64 bytes = info->bytes; + const bool is_trimmed = btrfs_free_space_trimmed(info); /* * first we want to see if there is free space adjacent to the range we @@ -2191,7 +2303,9 @@ else if (!right_info) left_info = tree_search_offset(ctl, offset - 1, 0, 0); - if (right_info && !right_info->bitmap) { + /* See try_merge_free_space() comment. */ + if (right_info && !right_info->bitmap && + (!is_trimmed || btrfs_free_space_trimmed(right_info))) { if (update_stat) unlink_free_space(ctl, right_info); else @@ -2201,8 +2315,10 @@ merged = true; } + /* See try_merge_free_space() comment. */ if (left_info && !left_info->bitmap && - left_info->offset + left_info->bytes == offset) { + left_info->offset + left_info->bytes == offset && + (!is_trimmed || btrfs_free_space_trimmed(left_info))) { if (update_stat) unlink_free_space(ctl, left_info); else @@ -2237,6 +2353,10 @@ return false; bytes = (j - i) * ctl->unit; info->bytes += bytes; + + /* See try_merge_free_space() comment. */ + if (!btrfs_free_space_trimmed(bitmap)) + info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; if (update_stat) bitmap_clear_bits(ctl, bitmap, end, bytes); @@ -2291,6 +2411,10 @@ info->offset -= bytes; info->bytes += bytes; + /* See try_merge_free_space() comment. */ + if (!btrfs_free_space_trimmed(bitmap)) + info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (update_stat) bitmap_clear_bits(ctl, bitmap, info->offset, bytes); else @@ -2340,10 +2464,13 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, struct btrfs_free_space_ctl *ctl, - u64 offset, u64 bytes) + u64 offset, u64 bytes, + enum btrfs_trim_state trim_state) { + struct btrfs_block_group *block_group = ctl->private; struct btrfs_free_space *info; int ret = 0; + u64 filter_bytes = bytes; info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) @@ -2351,6 +2478,7 @@ info->offset = offset; info->bytes = bytes; + info->trim_state = trim_state; RB_CLEAR_NODE(&info->offset_index); spin_lock(&ctl->tree_lock); @@ -2379,10 +2507,13 @@ */ steal_from_bitmap(ctl, info, true); + filter_bytes = max(filter_bytes, info->bytes); + ret = link_free_space(ctl, info); if (ret) kmem_cache_free(btrfs_free_space_cachep, info); out: + btrfs_discard_update_discardable(block_group, ctl); spin_unlock(&ctl->tree_lock); if (ret) { @@ -2390,10 +2521,47 @@ ASSERT(ret != -EEXIST); } + if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { + btrfs_discard_check_filter(block_group, filter_bytes); + btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); + } + return ret; } -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, +int btrfs_add_free_space(struct btrfs_block_group *block_group, + u64 bytenr, u64 size) +{ + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) + trim_state = BTRFS_TRIM_STATE_TRIMMED; + + return __btrfs_add_free_space(block_group->fs_info, + block_group->free_space_ctl, + bytenr, size, trim_state); +} + +/* + * This is a subtle distinction because when adding free space back in general, + * we want it to be added as untrimmed for async. But in the case where we add + * it on loading of a block group, we want to consider it trimmed. + */ +int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, + u64 bytenr, u64 size) +{ + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || + btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) + trim_state = BTRFS_TRIM_STATE_TRIMMED; + + return __btrfs_add_free_space(block_group->fs_info, + block_group->free_space_ctl, + bytenr, size, trim_state); +} + +int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -2465,8 +2633,10 @@ } spin_unlock(&ctl->tree_lock); - ret = btrfs_add_free_space(block_group, offset + bytes, - old_end - (offset + bytes)); + ret = __btrfs_add_free_space(block_group->fs_info, ctl, + offset + bytes, + old_end - (offset + bytes), + info->trim_state); WARN_ON(ret); goto out; } @@ -2478,12 +2648,13 @@ goto again; } out_lock: + btrfs_discard_update_discardable(block_group, ctl); spin_unlock(&ctl->tree_lock); out: return ret; } -void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, +void btrfs_dump_free_space(struct btrfs_block_group *block_group, u64 bytes) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -2508,14 +2679,14 @@ "%d blocks of free space at or bigger than bytes is", count); } -void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) +void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; spin_lock_init(&ctl->tree_lock); ctl->unit = fs_info->sectorsize; - ctl->start = block_group->key.objectid; + ctl->start = block_group->start; ctl->private = block_group; ctl->op = &free_space_op; INIT_LIST_HEAD(&ctl->trimming_ranges); @@ -2535,9 +2706,8 @@ * pointed to by the cluster, someone else raced in and freed the * cluster already. In that case, we just return without changing anything */ -static int -__btrfs_return_cluster_to_free_space( - struct btrfs_block_group_cache *block_group, +static void __btrfs_return_cluster_to_free_space( + struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -2545,8 +2715,10 @@ struct rb_node *node; spin_lock(&cluster->lock); - if (cluster->block_group != block_group) - goto out; + if (cluster->block_group != block_group) { + spin_unlock(&cluster->lock); + return; + } cluster->block_group = NULL; cluster->window_start = 0; @@ -2563,18 +2735,29 @@ bitmap = (entry->bitmap != NULL); if (!bitmap) { + /* Merging treats extents as if they were new */ + if (!btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR]--; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= + entry->bytes; + } + try_merge_free_space(ctl, entry, false); steal_from_bitmap(ctl, entry, false); + + /* As we insert directly, update these statistics */ + if (!btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR]++; + ctl->discardable_bytes[BTRFS_STAT_CURR] += + entry->bytes; + } } tree_insert_offset(&ctl->free_space_offset, entry->offset, &entry->offset_index, bitmap); } cluster->root = RB_ROOT; - -out: spin_unlock(&cluster->lock); btrfs_put_block_group(block_group); - return 0; } static void __btrfs_remove_free_space_cache_locked( @@ -2600,10 +2783,12 @@ { spin_lock(&ctl->tree_lock); __btrfs_remove_free_space_cache_locked(ctl); + if (ctl->private) + btrfs_discard_update_discardable(ctl->private, ctl); spin_unlock(&ctl->tree_lock); } -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_cluster *cluster; @@ -2621,20 +2806,55 @@ cond_resched_lock(&ctl->tree_lock); } __btrfs_remove_free_space_cache_locked(ctl); + btrfs_discard_update_discardable(block_group, ctl); spin_unlock(&ctl->tree_lock); } -u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, +/** + * btrfs_is_free_space_trimmed - see if everything is trimmed + * @block_group: block_group of interest + * + * Walk @block_group's free space rb_tree to determine if everything is trimmed. + */ +bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *info; + struct rb_node *node; + bool ret = true; + + spin_lock(&ctl->tree_lock); + node = rb_first(&ctl->free_space_offset); + + while (node) { + info = rb_entry(node, struct btrfs_free_space, offset_index); + + if (!btrfs_free_space_trimmed(info)) { + ret = false; + break; + } + + node = rb_next(node); + } + + spin_unlock(&ctl->tree_lock); + return ret; +} + +u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; struct btrfs_free_space *entry = NULL; u64 bytes_search = bytes + empty_size; u64 ret = 0; u64 align_gap = 0; u64 align_gap_len = 0; + enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, @@ -2645,12 +2865,20 @@ ret = offset; if (entry->bitmap) { bitmap_clear_bits(ctl, entry, offset, bytes); + + if (!btrfs_free_space_trimmed(entry)) + atomic64_add(bytes, &discard_ctl->discard_bytes_saved); + if (!entry->bytes) free_bitmap(ctl, entry); } else { unlink_free_space(ctl, entry); align_gap_len = offset - entry->offset; align_gap = entry->offset; + align_gap_trim_state = entry->trim_state; + + if (!btrfs_free_space_trimmed(entry)) + atomic64_add(bytes, &discard_ctl->discard_bytes_saved); entry->offset = offset + bytes; WARN_ON(entry->bytes < bytes + align_gap_len); @@ -2662,11 +2890,13 @@ link_free_space(ctl, entry); } out: + btrfs_discard_update_discardable(block_group, ctl); spin_unlock(&ctl->tree_lock); if (align_gap_len) __btrfs_add_free_space(block_group->fs_info, ctl, - align_gap, align_gap_len); + align_gap, align_gap_len, + align_gap_trim_state); return ret; } @@ -2678,12 +2908,11 @@ * Otherwise, it'll get a reference on the block group pointed to by the * cluster and remove the cluster from it. */ -int btrfs_return_cluster_to_free_space( - struct btrfs_block_group_cache *block_group, +void btrfs_return_cluster_to_free_space( + struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl; - int ret; /* first, get a safe pointer to the block group */ spin_lock(&cluster->lock); @@ -2691,29 +2920,30 @@ block_group = cluster->block_group; if (!block_group) { spin_unlock(&cluster->lock); - return 0; + return; } } else if (cluster->block_group != block_group) { /* someone else has already freed it don't redo their work */ spin_unlock(&cluster->lock); - return 0; + return; } - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); spin_unlock(&cluster->lock); ctl = block_group->free_space_ctl; /* now return any extents the cluster had on it */ spin_lock(&ctl->tree_lock); - ret = __btrfs_return_cluster_to_free_space(block_group, cluster); + __btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&ctl->tree_lock); + + btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); /* finally drop our ref */ btrfs_put_block_group(block_group); - return ret; } -static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, +static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct btrfs_free_space *entry, u64 bytes, u64 min_start, @@ -2746,11 +2976,13 @@ * if it couldn't find anything suitably large, or a logical disk offset * if things worked out */ -u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, +u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 bytes, u64 min_start, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; struct btrfs_free_space *entry = NULL; struct rb_node *node; u64 ret = 0; @@ -2803,8 +3035,6 @@ entry->bytes -= bytes; } - if (entry->bytes == 0) - rb_erase(&entry->offset_index, &cluster->root); break; } out: @@ -2815,24 +3045,35 @@ spin_lock(&ctl->tree_lock); + if (!btrfs_free_space_trimmed(entry)) + atomic64_add(bytes, &discard_ctl->discard_bytes_saved); + ctl->free_space -= bytes; + if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) + ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; + + spin_lock(&cluster->lock); if (entry->bytes == 0) { + rb_erase(&entry->offset_index, &cluster->root); ctl->free_extents--; if (entry->bitmap) { kmem_cache_free(btrfs_free_space_bitmap_cachep, entry->bitmap); ctl->total_bitmaps--; ctl->op->recalc_thresholds(ctl); + } else if (!btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR]--; } kmem_cache_free(btrfs_free_space_cachep, entry); } + spin_unlock(&cluster->lock); spin_unlock(&ctl->tree_lock); return ret; } -static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, +static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, struct btrfs_free_space *entry, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, @@ -2914,7 +3155,7 @@ * extent of cont1_bytes, and other clusters of at least min_bytes. */ static noinline int -setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, +setup_cluster_no_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, u64 cont1_bytes, u64 min_bytes) @@ -3005,7 +3246,7 @@ * that we have already failed to find extents that will work. */ static noinline int -setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, +setup_cluster_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, u64 cont1_bytes, u64 min_bytes) @@ -3055,11 +3296,11 @@ * returns zero and sets up cluster if things worked out, otherwise * it returns -enospc */ -int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, +int btrfs_find_space_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry, *tmp; LIST_HEAD(bitmaps); @@ -3118,7 +3359,7 @@ list_del_init(&entry->list); if (!ret) { - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; @@ -3146,9 +3387,10 @@ cluster->block_group = NULL; } -static int do_trimming(struct btrfs_block_group_cache *block_group, +static int do_trimming(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 bytes, u64 reserved_start, u64 reserved_bytes, + enum btrfs_trim_state reserved_trim_state, struct btrfs_trim_range *trim_entry) { struct btrfs_space_info *space_info = block_group->space_info; @@ -3156,6 +3398,9 @@ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; int update = 0; + const u64 end = start + bytes; + const u64 reserved_end = reserved_start + reserved_bytes; + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; u64 trimmed = 0; spin_lock(&space_info->lock); @@ -3169,11 +3414,20 @@ spin_unlock(&space_info->lock); ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); - if (!ret) + if (!ret) { *total_trimmed += trimmed; + trim_state = BTRFS_TRIM_STATE_TRIMMED; + } mutex_lock(&ctl->cache_writeout_mutex); - btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + if (reserved_start < start) + __btrfs_add_free_space(fs_info, ctl, reserved_start, + start - reserved_start, + reserved_trim_state); + if (start + bytes < reserved_start + reserved_bytes) + __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end, + reserved_trim_state); + __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state); list_del(&trim_entry->list); mutex_unlock(&ctl->cache_writeout_mutex); @@ -3184,23 +3438,31 @@ space_info->bytes_readonly += reserved_bytes; block_group->reserved -= reserved_bytes; space_info->bytes_reserved -= reserved_bytes; - spin_unlock(&space_info->lock); spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); } return ret; } -static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, - u64 *total_trimmed, u64 start, u64 end, u64 minlen) +/* + * If @async is set, then we will trim 1 region and return. + */ +static int trim_no_bitmap(struct btrfs_block_group *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen, + bool async) { + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; struct rb_node *node; int ret = 0; u64 extent_start; u64 extent_bytes; + enum btrfs_trim_state extent_trim_state; u64 bytes; + const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); while (start < end) { struct btrfs_trim_range trim_entry; @@ -3208,49 +3470,66 @@ mutex_lock(&ctl->cache_writeout_mutex); spin_lock(&ctl->tree_lock); - if (ctl->free_space < minlen) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - break; - } + if (ctl->free_space < minlen) + goto out_unlock; entry = tree_search_offset(ctl, start, 0, 1); - if (!entry) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - break; - } + if (!entry) + goto out_unlock; - /* skip bitmaps */ - while (entry->bitmap) { + /* Skip bitmaps and if async, already trimmed entries */ + while (entry->bitmap || + (async && btrfs_free_space_trimmed(entry))) { node = rb_next(&entry->offset_index); - if (!node) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - goto out; - } + if (!node) + goto out_unlock; entry = rb_entry(node, struct btrfs_free_space, offset_index); } - if (entry->offset >= end) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - break; - } + if (entry->offset >= end) + goto out_unlock; extent_start = entry->offset; extent_bytes = entry->bytes; - start = max(start, extent_start); - bytes = min(extent_start + extent_bytes, end) - start; - if (bytes < minlen) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - goto next; - } + extent_trim_state = entry->trim_state; + if (async) { + start = entry->offset; + bytes = entry->bytes; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto next; + } + unlink_free_space(ctl, entry); + /* + * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. + * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim + * X when we come back around. So trim it now. + */ + if (max_discard_size && + bytes >= (max_discard_size + + BTRFS_ASYNC_DISCARD_MIN_FILTER)) { + bytes = max_discard_size; + extent_bytes = max_discard_size; + entry->offset += max_discard_size; + entry->bytes -= max_discard_size; + link_free_space(ctl, entry); + } else { + kmem_cache_free(btrfs_free_space_cachep, entry); + } + } else { + start = max(start, extent_start); + bytes = min(extent_start + extent_bytes, end) - start; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto next; + } - unlink_free_space(ctl, entry); - kmem_cache_free(btrfs_free_space_cachep, entry); + unlink_free_space(ctl, entry); + kmem_cache_free(btrfs_free_space_cachep, entry); + } spin_unlock(&ctl->tree_lock); trim_entry.start = extent_start; @@ -3259,11 +3538,17 @@ mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, - extent_start, extent_bytes, &trim_entry); - if (ret) + extent_start, extent_bytes, extent_trim_state, + &trim_entry); + if (ret) { + block_group->discard_cursor = start + bytes; break; + } next: start += bytes; + block_group->discard_cursor = start; + if (async && *total_trimmed) + break; if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; @@ -3272,19 +3557,76 @@ cond_resched(); } -out: + + return ret; + +out_unlock: + block_group->discard_cursor = btrfs_block_group_end(block_group); + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + return ret; } -static int trim_bitmaps(struct btrfs_block_group_cache *block_group, - u64 *total_trimmed, u64 start, u64 end, u64 minlen) +/* + * If we break out of trimming a bitmap prematurely, we should reset the + * trimming bit. In a rather contrieved case, it's possible to race here so + * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. + * + * start = start of bitmap + * end = near end of bitmap + * + * Thread 1: Thread 2: + * trim_bitmaps(start) + * trim_bitmaps(end) + * end_trimming_bitmap() + * reset_trimming_bitmap() + */ +static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { + struct btrfs_free_space *entry; + + spin_lock(&ctl->tree_lock); + entry = tree_search_offset(ctl, offset, 1, 0); + if (entry) { + if (btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += + entry->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; + } + entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + } + + spin_unlock(&ctl->tree_lock); +} + +static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *entry) +{ + if (btrfs_free_space_trimming_bitmap(entry)) { + entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; + ctl->discardable_extents[BTRFS_STAT_CURR] -= + entry->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; + } +} + +/* + * If @async is set, then we will trim 1 region and return. + */ +static int trim_bitmaps(struct btrfs_block_group *block_group, + u64 *total_trimmed, u64 start, u64 end, u64 minlen, + u64 maxlen, bool async) +{ + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; int ret = 0; int ret2; u64 bytes; u64 offset = offset_to_bitmap(ctl, start); + const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); while (offset < end) { bool next_bitmap = false; @@ -3294,34 +3636,83 @@ spin_lock(&ctl->tree_lock); if (ctl->free_space < minlen) { + block_group->discard_cursor = + btrfs_block_group_end(block_group); spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); break; } entry = tree_search_offset(ctl, offset, 1, 0); - if (!entry) { + /* + * Bitmaps are marked trimmed lossily now to prevent constant + * discarding of the same bitmap (the reason why we are bound + * by the filters). So, retrim the block group bitmaps when we + * are preparing to punt to the unused_bgs list. This uses + * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED + * which is the only discard index which sets minlen to 0. + */ + if (!entry || (async && minlen && start == offset && + btrfs_free_space_trimmed(entry))) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } + + /* + * Async discard bitmap trimming begins at by setting the start + * to be key.objectid and the offset_to_bitmap() aligns to the + * start of the bitmap. This lets us know we are fully + * scanning the bitmap rather than only some portion of it. + */ + if (start == offset) + entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; bytes = minlen; ret2 = search_bitmap(ctl, entry, &start, &bytes, false); if (ret2 || start >= end) { + /* + * We lossily consider a bitmap trimmed if we only skip + * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. + */ + if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) + end_trimming_bitmap(ctl, entry); + else + entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } + /* + * We already trimmed a region, but are using the locking above + * to reset the trim_state. + */ + if (async && *total_trimmed) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto out; + } + bytes = min(bytes, end - start); - if (bytes < minlen) { + if (bytes < minlen || (async && maxlen && bytes > maxlen)) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); goto next; } + + /* + * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. + * If X < @minlen, we won't trim X when we come back around. + * So trim it now. We differ here from trimming extents as we + * don't keep individual state per bit. + */ + if (async && + max_discard_size && + bytes > (max_discard_size + minlen)) + bytes = max_discard_size; bitmap_clear_bits(ctl, entry, start, bytes); if (entry->bytes == 0) @@ -3334,19 +3725,25 @@ mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, - start, bytes, &trim_entry); - if (ret) + start, bytes, 0, &trim_entry); + if (ret) { + reset_trimming_bitmap(ctl, offset); + block_group->discard_cursor = + btrfs_block_group_end(block_group); break; + } next: if (next_bitmap) { offset += BITS_PER_BITMAP * ctl->unit; + start = offset; } else { start += bytes; - if (start >= offset + BITS_PER_BITMAP * ctl->unit) - offset += BITS_PER_BITMAP * ctl->unit; } + block_group->discard_cursor = start; if (fatal_signal_pending(current)) { + if (start != offset) + reset_trimming_bitmap(ctl, offset); ret = -ERESTARTSYS; break; } @@ -3354,55 +3751,47 @@ cond_resched(); } + if (offset >= end) + block_group->discard_cursor = end; + +out: return ret; } -void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache) +int btrfs_trim_block_group(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) { - atomic_inc(&cache->trimming); -} + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + int ret; + u64 rem = 0; -void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - struct extent_map_tree *em_tree; - struct extent_map *em; - bool cleanup; + *trimmed = 0; spin_lock(&block_group->lock); - cleanup = (atomic_dec_and_test(&block_group->trimming) && - block_group->removed); + if (block_group->removed) { + spin_unlock(&block_group->lock); + return 0; + } + btrfs_freeze_block_group(block_group); spin_unlock(&block_group->lock); - if (cleanup) { - mutex_lock(&fs_info->chunk_mutex); - em_tree = &fs_info->mapping_tree.map_tree; - write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, block_group->key.objectid, - 1); - BUG_ON(!em); /* logic error, can't happen */ - /* - * remove_extent_mapping() will delete us from the pinned_chunks - * list, which is protected by the chunk mutex. - */ - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - mutex_unlock(&fs_info->chunk_mutex); + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false); + if (ret) + goto out; - /* once for us and once for the tree */ - free_extent_map(em); - free_extent_map(em); - - /* - * We've left one free space entry and other tasks trimming - * this block group have left 1 entry each one. Free them. - */ - __btrfs_remove_free_space_cache(block_group->free_space_ctl); - } + ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false); + div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem); + /* If we ended in the middle of a bitmap, reset the trimming flag */ + if (rem) + reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end)); +out: + btrfs_unfreeze_block_group(block_group); + return ret; } -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, - u64 *trimmed, u64 start, u64 end, u64 minlen) +int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen, + bool async) { int ret; @@ -3413,16 +3802,36 @@ spin_unlock(&block_group->lock); return 0; } - btrfs_get_block_group_trimming(block_group); + btrfs_freeze_block_group(block_group); spin_unlock(&block_group->lock); - ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); - if (ret) - goto out; + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async); + btrfs_unfreeze_block_group(block_group); - ret = trim_bitmaps(block_group, trimmed, start, end, minlen); -out: - btrfs_put_block_group_trimming(block_group); + return ret; +} + +int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen, + u64 maxlen, bool async) +{ + int ret; + + *trimmed = 0; + + spin_lock(&block_group->lock); + if (block_group->removed) { + spin_unlock(&block_group->lock); + return 0; + } + btrfs_freeze_block_group(block_group); + spin_unlock(&block_group->lock); + + ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen, + async); + + btrfs_unfreeze_block_group(block_group); + return ret; } @@ -3582,11 +3991,9 @@ if (release_metadata) btrfs_delalloc_release_metadata(BTRFS_I(inode), inode->i_size, true); -#ifdef DEBUG - btrfs_err(fs_info, - "failed to write free ino cache for root %llu", - root->root_key.objectid); -#endif + btrfs_debug(fs_info, + "failed to write free ino cache for root %llu error %d", + root->root_key.objectid, ret); } return ret; @@ -3599,12 +4006,13 @@ * how the free space cache loading stuff works, so you can get really weird * configurations. */ -int test_add_free_space_entry(struct btrfs_block_group_cache *cache, +int test_add_free_space_entry(struct btrfs_block_group *cache, u64 offset, u64 bytes, bool bitmap) { struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; struct btrfs_free_space *info = NULL, *bitmap_info; void *map = NULL; + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED; u64 bytes_added; int ret; @@ -3646,7 +4054,8 @@ info = NULL; } - bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, + trim_state); bytes -= bytes_added; offset += bytes_added; @@ -3667,7 +4076,7 @@ * just used to check the absence of space, so if there is free space in the * range at all we will return 1. */ -int test_check_exists(struct btrfs_block_group_cache *cache, +int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes) { struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; -- Gitblit v1.6.2