From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/fs/btrfs/inode.c | 5243 ++++++++++++++++++++++++++++------------------------------- 1 files changed, 2,479 insertions(+), 2,764 deletions(-) diff --git a/kernel/fs/btrfs/inode.c b/kernel/fs/btrfs/inode.c index f314b2c..c900a39 100644 --- a/kernel/fs/btrfs/inode.c +++ b/kernel/fs/btrfs/inode.c @@ -3,9 +3,9 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ +#include <crypto/hash.h> #include <linux/kernel.h> #include <linux/bio.h> -#include <linux/buffer_head.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -27,7 +27,12 @@ #include <linux/uio.h> #include <linux/magic.h> #include <linux/iversion.h> +#include <linux/swap.h> +#include <linux/migrate.h> +#include <linux/sched/mm.h> +#include <linux/iomap.h> #include <asm/unaligned.h> +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -41,32 +46,31 @@ #include "locking.h" #include "free-space-cache.h" #include "inode-map.h" -#include "backref.h" #include "props.h" #include "qgroup.h" -#include "dedupe.h" +#include "delalloc-space.h" +#include "block-group.h" +#include "space-info.h" struct btrfs_iget_args { - struct btrfs_key *location; + u64 ino; struct btrfs_root *root; }; struct btrfs_dio_data { u64 reserve; - u64 unsubmitted_oe_range_start; - u64 unsubmitted_oe_range_end; - int overwrite; + loff_t length; + ssize_t submitted; + struct extent_changeset *data_reserved; + bool sync; }; static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; -static const struct inode_operations btrfs_dir_ro_inode_operations; static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; -static const struct address_space_operations btrfs_symlink_aops; static const struct file_operations btrfs_dir_file_operations; -static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; @@ -74,38 +78,26 @@ struct kmem_cache *btrfs_free_space_cachep; struct kmem_cache *btrfs_free_space_bitmap_cachep; -#define S_SHIFT 12 -static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, - [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, - [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, -}; - static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); -static noinline int cow_file_range(struct inode *inode, +static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, u64 delalloc_end, - int *page_started, unsigned long *nr_written, - int unlock, struct btrfs_dedupe_hash *hash); -static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, - u64 orig_start, u64 block_start, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock); +static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, + u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int compress_type, int type); -static void __endio_write_update_ordered(struct inode *inode, +static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate); /* * Cleanup all submitted ordered extents in specified range to handle errors - * from the fill_dellaloc() callback. + * from the btrfs_run_delalloc_range() callback. * * NOTE: caller must ensure that when an error happens, it can not call * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING @@ -113,7 +105,7 @@ * to be released, which we want to happen only when finishing the ordered * extent (btrfs_finish_ordered_io()). */ -static inline void btrfs_cleanup_ordered_extents(struct inode *inode, +static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, struct page *locked_page, u64 offset, u64 bytes) { @@ -125,7 +117,7 @@ struct page *page; while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); + page = find_get_page(inode->vfs_inode.i_mapping, index); index++; if (!page) continue; @@ -147,13 +139,6 @@ } static int btrfs_dirty_inode(struct inode *inode); - -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode) -{ - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; -} -#endif static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -187,6 +172,9 @@ int ret; size_t cur_size = size; unsigned long offset; + + ASSERT((compressed_size > 0 && compressed_pages) || + (compressed_size == 0 && !compressed_pages)); if (compressed_size && compressed_pages) cur_size = compressed_size; @@ -241,13 +229,22 @@ start >> PAGE_SHIFT); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_atomic(page); - offset = start & (PAGE_SIZE - 1); + offset = offset_in_page(start); write_extent_buffer(leaf, kaddr + offset, ptr, size); kunmap_atomic(kaddr); put_page(page); } btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); + + /* + * We align size to sectorsize for inline extents just for simplicity + * sake. + */ + size = ALIGN(size, root->fs_info->sectorsize); + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size); + if (ret) + goto fail; /* * we're an inline extent, so nobody can @@ -271,15 +268,15 @@ * does the checks required to make sure the data is small enough * to fit as an inline extent. */ -static noinline int cow_file_range_inline(struct inode *inode, u64 start, +static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, u64 end, size_t compressed_size, int compress_type, struct page **compressed_pages) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); u64 actual_end = min(end + 1, isize); u64 inline_len = actual_end - start; u64 aligned_end = ALIGN(end, fs_info->sectorsize); @@ -311,7 +308,7 @@ btrfs_free_path(path); return PTR_ERR(trans); } - trans->block_rsv = &BTRFS_I(inode)->block_rsv; + trans->block_rsv = &inode->block_rsv; if (compressed_size && compressed_pages) extent_item_size = btrfs_file_extent_calc_inline_size( @@ -320,9 +317,9 @@ extent_item_size = btrfs_file_extent_calc_inline_size( inline_len); - ret = __btrfs_drop_extents(trans, root, inode, path, - start, aligned_end, NULL, - 1, 1, extent_item_size, &extent_inserted); + ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end, + NULL, 1, 1, extent_item_size, + &extent_inserted); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -331,7 +328,7 @@ if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); ret = insert_inline_extent(trans, path, extent_inserted, - root, inode, start, + root, &inode->vfs_inode, start, inline_len, compressed_size, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { @@ -342,8 +339,8 @@ goto out; } - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: /* * Don't forget to free the reserved space, as for inlined extent @@ -367,18 +364,25 @@ struct list_head list; }; -struct async_cow { +struct async_chunk { struct inode *inode; - struct btrfs_root *root; struct page *locked_page; u64 start; u64 end; unsigned int write_flags; struct list_head extents; + struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; + atomic_t *pending; }; -static noinline int add_async_extent(struct async_cow *cow, +struct async_cow { + /* Number of chunks in flight; must be first in the structure */ + atomic_t num_chunks; + struct async_chunk chunks[]; +}; + +static noinline int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, u64 compressed_size, struct page **pages, @@ -402,10 +406,10 @@ /* * Check if the inode has flags compatible with compression */ -static inline bool inode_can_compress(struct inode *inode) +static inline bool inode_can_compress(struct btrfs_inode *inode) { - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW || - BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATACOW || + inode->flags & BTRFS_INODE_NODATASUM) return false; return true; } @@ -414,29 +418,30 @@ * Check if the inode needs to be submitted to compression, based on mount * options, defragmentation, properties or heuristics. */ -static inline int inode_need_compress(struct inode *inode, u64 start, u64 end) +static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, + u64 end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; if (!inode_can_compress(inode)) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: unexpected compression for ino %llu\n", - btrfs_ino(BTRFS_I(inode))); + btrfs_ino(inode)); return 0; } /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; /* defrag ioctl */ - if (BTRFS_I(inode)->defrag_compress) + if (inode->defrag_compress) return 1; /* bad compression ratios */ - if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + if (inode->flags & BTRFS_INODE_NOCOMPRESS) return 0; if (btrfs_test_opt(fs_info, COMPRESS) || - BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || - BTRFS_I(inode)->prop_compress) - return btrfs_compress_heuristic(inode, start, end); + inode->flags & BTRFS_INODE_COMPRESS || + inode->prop_compress) + return btrfs_compress_heuristic(&inode->vfs_inode, start, end); return 0; } @@ -466,16 +471,15 @@ * are written in the same order that the flusher thread sent them * down. */ -static noinline void compress_file_range(struct inode *inode, - struct page *locked_page, - u64 start, u64 end, - struct async_cow *async_cow, - int *num_added) +static noinline int compress_file_range(struct async_chunk *async_chunk) { + struct inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 blocksize = fs_info->sectorsize; + u64 start = async_chunk->start; + u64 end = async_chunk->end; u64 actual_end; - u64 isize = i_size_read(inode); + u64 i_size; int ret = 0; struct page **pages = NULL; unsigned long nr_pages; @@ -484,12 +488,25 @@ int i; int will_compress; int compress_type = fs_info->compress_type; + int compressed_extents = 0; int redirty = 0; inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, SZ_16K); - actual_end = min_t(u64, isize, end + 1); + /* + * We need to save i_size before now because it could change in between + * us evaluating the size and assigning it. This is because we lock and + * unlock the page in truncate and fallocate, and then modify the i_size + * later on. + * + * The barriers are to emulate READ_ONCE, remove that once i_size_read + * does that for us. + */ + barrier(); + i_size = i_size_read(inode); + barrier(); + actual_end = min_t(u64, i_size, end + 1); again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; @@ -530,7 +547,7 @@ * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (inode_need_compress(inode, start, end)) { + if (inode_need_compress(BTRFS_I(inode), start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { @@ -571,8 +588,7 @@ &total_compressed); if (!ret) { - unsigned long offset = total_compressed & - (PAGE_SIZE - 1); + unsigned long offset = offset_in_page(total_compressed); struct page *page = pages[nr_pages - 1]; char *kaddr; @@ -595,11 +611,12 @@ /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(inode, start, end, 0, - BTRFS_COMPRESS_NONE, NULL); + ret = cow_file_range_inline(BTRFS_I(inode), start, end, + 0, BTRFS_COMPRESS_NONE, + NULL); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(inode, start, end, + ret = cow_file_range_inline(BTRFS_I(inode), start, end, total_compressed, compress_type, pages); } @@ -621,8 +638,9 @@ * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, end, - NULL, clear_flags, + extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, + NULL, + clear_flags, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | @@ -641,8 +659,7 @@ } kfree(pages); } - - return; + return 0; } } @@ -661,14 +678,14 @@ */ total_in = ALIGN(total_in, PAGE_SIZE); if (total_compressed + blocksize <= total_in) { - *num_added += 1; + compressed_extents++; /* * The async work queues will take care of doing actual * allocation on disk for these compressed pages, and * will submit them to the elevator. */ - add_async_extent(async_cow, start, total_in, + add_async_extent(async_chunk, start, total_in, total_compressed, pages, nr_pages, compress_type); @@ -678,7 +695,7 @@ cond_resched(); goto again; } - return; + return compressed_extents; } } if (pages) { @@ -708,18 +725,20 @@ * to our extent and set things up for the async work queue to run * cow_file_range to do the normal delalloc dance. */ - if (page_offset(locked_page) >= start && - page_offset(locked_page) <= end) - __set_page_dirty_nobuffers(locked_page); + if (async_chunk->locked_page && + (page_offset(async_chunk->locked_page) >= start && + page_offset(async_chunk->locked_page)) <= end) { + __set_page_dirty_nobuffers(async_chunk->locked_page); /* unlocked later on in the async handlers */ + } if (redirty) extent_range_redirty_for_io(inode, start, end); - add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, + add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); - *num_added += 1; + compressed_extents++; - return; + return compressed_extents; } static void free_async_extent_pages(struct async_extent *async_extent) @@ -744,45 +763,38 @@ * queued. We walk all the async extents created by compress_file_range * and send them down to the disk. */ -static noinline void submit_compressed_extents(struct inode *inode, - struct async_cow *async_cow) +static noinline void submit_compressed_extents(struct async_chunk *async_chunk) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_extent *async_extent; u64 alloc_hint = 0; struct btrfs_key ins; struct extent_map *em; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree; + struct btrfs_root *root = inode->root; + struct extent_io_tree *io_tree = &inode->io_tree; int ret = 0; again: - while (!list_empty(&async_cow->extents)) { - async_extent = list_entry(async_cow->extents.next, + while (!list_empty(&async_chunk->extents)) { + async_extent = list_entry(async_chunk->extents.next, struct async_extent, list); list_del(&async_extent->list); - io_tree = &BTRFS_I(inode)->io_tree; - retry: + lock_extent(io_tree, async_extent->start, + async_extent->start + async_extent->ram_size - 1); /* did the compression code fall back to uncompressed IO? */ if (!async_extent->pages) { int page_started = 0; unsigned long nr_written = 0; - lock_extent(io_tree, async_extent->start, - async_extent->start + - async_extent->ram_size - 1); - /* allocate blocks */ - ret = cow_file_range(inode, async_cow->locked_page, + ret = cow_file_range(inode, async_chunk->locked_page, async_extent->start, async_extent->start + async_extent->ram_size - 1, - async_extent->start + - async_extent->ram_size - 1, - &page_started, &nr_written, 0, - NULL); + &page_started, &nr_written, 0); /* JDM XXX */ @@ -793,20 +805,17 @@ * all those pages down to the drive. */ if (!page_started && !ret) - extent_write_locked_range(inode, + extent_write_locked_range(&inode->vfs_inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, WB_SYNC_ALL); - else if (ret) - unlock_page(async_cow->locked_page); + else if (ret && async_chunk->locked_page) + unlock_page(async_chunk->locked_page); kfree(async_extent); cond_resched(); continue; } - - lock_extent(io_tree, async_extent->start, - async_extent->start + async_extent->ram_size - 1); ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, @@ -826,7 +835,7 @@ * will not submit these pages down to lower * layers. */ - extent_range_redirty_for_io(inode, + extent_range_redirty_for_io(&inode->vfs_inode, async_extent->start, async_extent->start + async_extent->ram_size - 1); @@ -861,8 +870,7 @@ BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); if (ret) { - btrfs_drop_extent_cache(BTRFS_I(inode), - async_extent->start, + btrfs_drop_extent_cache(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, 0); goto out_free_reserve; @@ -875,29 +883,25 @@ extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, - async_extent->start + - async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK); - if (btrfs_submit_compressed_write(inode, - async_extent->start, + if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, async_extent->nr_pages, - async_cow->write_flags)) { - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + async_chunk->write_flags, + async_chunk->blkcg_css)) { struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; - p->mapping = inode->i_mapping; - tree->ops->writepage_end_io_hook(p, start, end, - NULL, 0); + p->mapping = inode->vfs_inode.i_mapping; + btrfs_writepage_endio_finish_ordered(p, start, end, 0); + p->mapping = NULL; - extent_clear_unlock_delalloc(inode, start, end, end, - NULL, 0, + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); @@ -914,8 +918,6 @@ extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, - async_extent->start + - async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, @@ -927,10 +929,10 @@ goto again; } -static u64 get_extent_allocation_hint(struct inode *inode, u64 start, +static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, u64 num_bytes) { - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 alloc_hint = 0; @@ -972,14 +974,13 @@ * required to start IO on it. It may be clean and already done with * IO when we return. */ -static noinline int cow_file_range(struct inode *inode, +static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, u64 delalloc_end, - int *page_started, unsigned long *nr_written, - int unlock, struct btrfs_dedupe_hash *hash) + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 alloc_hint = 0; u64 num_bytes; unsigned long ram_size; @@ -993,8 +994,7 @@ bool extent_reserved = false; int ret = 0; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) { - WARN_ON_ONCE(1); + if (btrfs_is_free_space_inode(inode)) { ret = -EINVAL; goto out_unlock; } @@ -1003,7 +1003,7 @@ num_bytes = max(blocksize, num_bytes); ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); - inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K); + inode_should_defrag(inode, start, end, num_bytes, SZ_64K); if (start == 0) { /* lets try to make an inline extent */ @@ -1016,8 +1016,7 @@ * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, - delalloc_end, NULL, + extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1033,8 +1032,7 @@ } alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); - btrfs_drop_extent_cache(BTRFS_I(inode), start, - start + num_bytes - 1, 0); + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); /* * Relocation relies on the relocated extents to have exactly the same @@ -1098,7 +1096,7 @@ * skip current ordered extent. */ if (ret) - btrfs_drop_extent_cache(BTRFS_I(inode), start, + btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); } @@ -1114,9 +1112,8 @@ page_ops = unlock ? PAGE_UNLOCK : 0; page_ops |= PAGE_SET_PRIVATE2; - extent_clear_unlock_delalloc(inode, start, - start + ram_size - 1, - delalloc_end, locked_page, + extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, + locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); if (num_bytes < cur_alloc_size) @@ -1139,7 +1136,7 @@ return ret; out_drop_extent_cache: - btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0); + btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); @@ -1161,7 +1158,6 @@ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - start + cur_alloc_size - 1, locked_page, clear_bits, page_ops); @@ -1169,8 +1165,7 @@ if (start >= end) goto out; } - extent_clear_unlock_delalloc(inode, start, end, delalloc_end, - locked_page, + extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits | EXTENT_CLEAR_DATA_RESV, page_ops); goto out; @@ -1181,16 +1176,15 @@ */ static noinline void async_cow_start(struct btrfs_work *work) { - struct async_cow *async_cow; - int num_added = 0; - async_cow = container_of(work, struct async_cow, work); + struct async_chunk *async_chunk; + int compressed_extents; - compress_file_range(async_cow->inode, async_cow->locked_page, - async_cow->start, async_cow->end, async_cow, - &num_added); - if (num_added == 0) { - btrfs_add_delayed_iput(async_cow->inode); - async_cow->inode = NULL; + async_chunk = container_of(work, struct async_chunk, work); + + compressed_extents = compress_file_range(async_chunk); + if (compressed_extents == 0) { + btrfs_add_delayed_iput(async_chunk->inode); + async_chunk->inode = NULL; } } @@ -1199,77 +1193,153 @@ */ static noinline void async_cow_submit(struct btrfs_work *work) { - struct btrfs_fs_info *fs_info; - struct async_cow *async_cow; - struct btrfs_root *root; + struct async_chunk *async_chunk = container_of(work, struct async_chunk, + work); + struct btrfs_fs_info *fs_info = btrfs_work_owner(work); unsigned long nr_pages; - async_cow = container_of(work, struct async_cow, work); - - root = async_cow->root; - fs_info = root->fs_info; - nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >> + nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; + + /* + * ->inode could be NULL if async_chunk_start has failed to compress, + * in which case we don't have anything to submit, yet we need to + * always adjust ->async_delalloc_pages as its paired with the init + * happening in cow_file_range_async + */ + if (async_chunk->inode) + submit_compressed_extents(async_chunk); /* atomic_sub_return implies a barrier */ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < 5 * SZ_1M) cond_wake_up_nomb(&fs_info->async_submit_wait); - - if (async_cow->inode) - submit_compressed_extents(async_cow->inode, async_cow); } static noinline void async_cow_free(struct btrfs_work *work) { - struct async_cow *async_cow; - async_cow = container_of(work, struct async_cow, work); - if (async_cow->inode) - btrfs_add_delayed_iput(async_cow->inode); - kfree(async_cow); + struct async_chunk *async_chunk; + + async_chunk = container_of(work, struct async_chunk, work); + if (async_chunk->inode) + btrfs_add_delayed_iput(async_chunk->inode); + if (async_chunk->blkcg_css) + css_put(async_chunk->blkcg_css); + /* + * Since the pointer to 'pending' is at the beginning of the array of + * async_chunk's, freeing it ensures the whole array has been freed. + */ + if (atomic_dec_and_test(async_chunk->pending)) + kvfree(async_chunk->pending); } -static int cow_file_range_async(struct inode *inode, struct page *locked_page, +static int cow_file_range_async(struct btrfs_inode *inode, + struct writeback_control *wbc, + struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, - unsigned int write_flags) + unsigned long *nr_written) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct async_cow *async_cow; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); + struct async_cow *ctx; + struct async_chunk *async_chunk; unsigned long nr_pages; u64 cur_end; + u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); + int i; + bool should_compress; + unsigned nofs_flag; + const unsigned int write_flags = wbc_to_write_flags(wbc); - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, - 1, 0, NULL); - while (start < end) { - async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); - BUG_ON(!async_cow); /* -ENOMEM */ - async_cow->inode = igrab(inode); - async_cow->root = root; - async_cow->locked_page = locked_page; - async_cow->start = start; - async_cow->write_flags = write_flags; + unlock_extent(&inode->io_tree, start, end); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && - !btrfs_test_opt(fs_info, FORCE_COMPRESS)) - cur_end = end; - else + if (inode->flags & BTRFS_INODE_NOCOMPRESS && + !btrfs_test_opt(fs_info, FORCE_COMPRESS)) { + num_chunks = 1; + should_compress = false; + } else { + should_compress = true; + } + + nofs_flag = memalloc_nofs_save(); + ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + + if (!ctx) { + unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING; + unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | + PAGE_SET_ERROR; + + extent_clear_unlock_delalloc(inode, start, end, locked_page, + clear_bits, page_ops); + return -ENOMEM; + } + + async_chunk = ctx->chunks; + atomic_set(&ctx->num_chunks, num_chunks); + + for (i = 0; i < num_chunks; i++) { + if (should_compress) cur_end = min(end, start + SZ_512K - 1); + else + cur_end = end; - async_cow->end = cur_end; - INIT_LIST_HEAD(&async_cow->extents); + /* + * igrab is called higher up in the call chain, take only the + * lightweight reference for the callback lifetime + */ + ihold(&inode->vfs_inode); + async_chunk[i].pending = &ctx->num_chunks; + async_chunk[i].inode = &inode->vfs_inode; + async_chunk[i].start = start; + async_chunk[i].end = cur_end; + async_chunk[i].write_flags = write_flags; + INIT_LIST_HEAD(&async_chunk[i].extents); - btrfs_init_work(&async_cow->work, - btrfs_delalloc_helper, - async_cow_start, async_cow_submit, - async_cow_free); + /* + * The locked_page comes all the way from writepage and its + * the original page we were actually given. As we spread + * this large delalloc region across multiple async_chunk + * structs, only the first struct needs a pointer to locked_page + * + * This way we don't need racey decisions about who is supposed + * to unlock it. + */ + if (locked_page) { + /* + * Depending on the compressibility, the pages might or + * might not go through async. We want all of them to + * be accounted against wbc once. Let's do it here + * before the paths diverge. wbc accounting is used + * only for foreign writeback detection and doesn't + * need full accuracy. Just account the whole thing + * against the first page. + */ + wbc_account_cgroup_owner(wbc, locked_page, + cur_end - start); + async_chunk[i].locked_page = locked_page; + locked_page = NULL; + } else { + async_chunk[i].locked_page = NULL; + } - nr_pages = (cur_end - start + PAGE_SIZE) >> - PAGE_SHIFT; + if (blkcg_css != blkcg_root_css) { + css_get(blkcg_css); + async_chunk[i].blkcg_css = blkcg_css; + } else { + async_chunk[i].blkcg_css = NULL; + } + + btrfs_init_work(&async_chunk[i].work, async_cow_start, + async_cow_submit, async_cow_free); + + nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); - btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work); + btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); *nr_written += nr_pages; start = cur_end + 1; @@ -1300,6 +1370,73 @@ return 1; } +static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, + const u64 start, const u64 end, + int *page_started, unsigned long *nr_written) +{ + const bool is_space_ino = btrfs_is_free_space_inode(inode); + const bool is_reloc_ino = (inode->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID); + const u64 range_bytes = end + 1 - start; + struct extent_io_tree *io_tree = &inode->io_tree; + u64 range_start = start; + u64 count; + + /* + * If EXTENT_NORESERVE is set it means that when the buffered write was + * made we had not enough available data space and therefore we did not + * reserve data space for it, since we though we could do NOCOW for the + * respective file range (either there is prealloc extent or the inode + * has the NOCOW bit set). + * + * However when we need to fallback to COW mode (because for example the + * block group for the corresponding extent was turned to RO mode by a + * scrub or relocation) we need to do the following: + * + * 1) We increment the bytes_may_use counter of the data space info. + * If COW succeeds, it allocates a new data extent and after doing + * that it decrements the space info's bytes_may_use counter and + * increments its bytes_reserved counter by the same amount (we do + * this at btrfs_add_reserved_bytes()). So we need to increment the + * bytes_may_use counter to compensate (when space is reserved at + * buffered write time, the bytes_may_use counter is incremented); + * + * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so + * that if the COW path fails for any reason, it decrements (through + * extent_clear_unlock_delalloc()) the bytes_may_use counter of the + * data space info, which we incremented in the step above. + * + * If we need to fallback to cow and the inode corresponds to a free + * space cache inode or an inode of the data relocation tree, we must + * also increment bytes_may_use of the data space_info for the same + * reason. Space caches and relocated data extents always get a prealloc + * extent for them, however scrub or balance may have set the block + * group that contains that extent to RO mode and therefore force COW + * when starting writeback. + */ + count = count_range_bits(io_tree, &range_start, end, range_bytes, + EXTENT_NORESERVE, 0); + if (count > 0 || is_space_ino || is_reloc_ino) { + u64 bytes = count; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_space_info *sinfo = fs_info->data_sinfo; + + if (is_space_ino || is_reloc_ino) + bytes = range_bytes; + + spin_lock(&sinfo->lock); + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); + spin_unlock(&sinfo->lock); + + if (count > 0) + clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, + 0, 0, NULL); + } + + return cow_file_range(inode, locked_page, start, end, page_started, + nr_written, 1); +} + /* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. @@ -1307,38 +1444,27 @@ * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ -static noinline int run_delalloc_nocow(struct inode *inode, +static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, int force, - unsigned long *nr_written) + const u64 start, const u64 end, + int *page_started, int force, + unsigned long *nr_written) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_buffer *leaf; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *root = inode->root; struct btrfs_path *path; - struct btrfs_file_extent_item *fi; - struct btrfs_key found_key; - struct extent_map *em; - u64 cow_start; - u64 cur_offset; - u64 extent_end; - u64 extent_offset; - u64 disk_bytenr; - u64 num_bytes; - u64 disk_num_bytes; - u64 ram_bytes; - int extent_type; + u64 cow_start = (u64)-1; + u64 cur_offset = start; int ret; - int type; - int nocow; - int check_prev = 1; - bool nolock; - u64 ino = btrfs_ino(BTRFS_I(inode)); + bool check_prev = true; + const bool freespace_inode = btrfs_is_free_space_inode(inode); + u64 ino = btrfs_ino(inode); + bool nocow = false; + u64 disk_bytenr = 0; path = btrfs_alloc_path(); if (!path) { - extent_clear_unlock_delalloc(inode, start, end, end, - locked_page, + extent_clear_unlock_delalloc(inode, start, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, PAGE_UNLOCK | @@ -1348,15 +1474,29 @@ return -ENOMEM; } - nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); - - cow_start = (u64)-1; - cur_offset = start; while (1) { + struct btrfs_key found_key; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + u64 extent_end; + u64 extent_offset; + u64 num_bytes = 0; + u64 disk_num_bytes; + u64 ram_bytes; + int extent_type; + + nocow = false; + ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); if (ret < 0) goto error; + + /* + * If there is no extent for our range when doing the initial + * search, then go back to the previous slot as it will be the + * one containing the search offset + */ if (ret > 0 && path->slots[0] > 0 && check_prev) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, @@ -1365,8 +1505,9 @@ found_key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } - check_prev = 0; + check_prev = false; next_slot: + /* Go to next leaf if we have exhausted the current one */ leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); @@ -1380,28 +1521,40 @@ leaf = path->nodes[0]; } - nocow = 0; - disk_bytenr = 0; - num_bytes = 0; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + /* Didn't find anything for our INO */ if (found_key.objectid > ino) break; + /* + * Keep searching until we find an EXTENT_ITEM or there are no + * more extents for this inode + */ if (WARN_ON_ONCE(found_key.objectid < ino) || found_key.type < BTRFS_EXTENT_DATA_KEY) { path->slots[0]++; goto next_slot; } + + /* Found key is not EXTENT_DATA_KEY or starts after req range */ if (found_key.type > BTRFS_EXTENT_DATA_KEY || found_key.offset > end) break; + /* + * If the found extent starts after requested offset, then + * adjust extent_end to be right before this extent begins + */ if (found_key.offset > cur_offset) { extent_end = found_key.offset; extent_type = 0; goto out_check; } + /* + * Found extent which begins before our range and potentially + * intersect it + */ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); @@ -1415,31 +1568,41 @@ btrfs_file_extent_num_bytes(leaf, fi); disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - if (extent_end <= start) { + /* + * If the extent we got ends before our current offset, + * skip to the next extent. + */ + if (extent_end <= cur_offset) { path->slots[0]++; goto next_slot; } + /* Skip holes */ if (disk_bytenr == 0) goto out_check; + /* Skip compressed/encrypted/encoded extents */ if (btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) goto out_check; /* - * Do the same check as in btrfs_cross_ref_exist but - * without the unnecessary search. + * If extent is created before the last volume's snapshot + * this implies the extent is shared, hence we can't do + * nocow. This is the same check as in + * btrfs_cross_ref_exist but without calling + * btrfs_search_slot. */ - if (!nolock && + if (!freespace_inode && btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) goto out_check; + /* If extent is RO, we must COW it */ if (btrfs_extent_readonly(fs_info, disk_bytenr)) goto out_check; ret = btrfs_cross_ref_exist(root, ino, found_key.offset - - extent_offset, disk_bytenr); + extent_offset, disk_bytenr, false); if (ret) { /* * ret could be -EIO if the above fails to read @@ -1451,17 +1614,17 @@ goto error; } - WARN_ON_ONCE(nolock); + WARN_ON_ONCE(freespace_inode); goto out_check; } disk_bytenr += extent_offset; disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* - * if there are pending snapshots for this root, - * we fall into common COW way. + * If there are pending snapshots for this root, we + * fall into common COW way */ - if (!nolock && atomic_read(&root->snapshot_force_cow)) + if (!freespace_inode && atomic_read(&root->snapshot_force_cow)) goto out_check; /* * force cow if csum exists in the range. @@ -1480,27 +1643,29 @@ cur_offset = cow_start; goto error; } - WARN_ON_ONCE(nolock); + WARN_ON_ONCE(freespace_inode); goto out_check; } if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) goto out_check; - nocow = 1; + nocow = true; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - extent_end = found_key.offset + - btrfs_file_extent_ram_bytes(leaf, fi); - extent_end = ALIGN(extent_end, - fs_info->sectorsize); + extent_end = found_key.offset + ram_bytes; + extent_end = ALIGN(extent_end, fs_info->sectorsize); + /* Skip extents outside of our requested range */ + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } } else { - BUG_ON(1); + /* If this triggers then we have a memory corruption */ + BUG(); } out_check: - if (extent_end <= start) { - path->slots[0]++; - if (nocow) - btrfs_dec_nocow_writers(fs_info, disk_bytenr); - goto next_slot; - } + /* + * If nocow is false then record the beginning of the range + * that needs to be COWed + */ if (!nocow) { if (cow_start == (u64)-1) cow_start = cur_offset; @@ -1512,22 +1677,24 @@ } btrfs_release_path(path); + + /* + * COW range from cow_start to found_key.offset - 1. As the key + * will contain the beginning of the first extent that can be + * NOCOW, following one which needs to be COW'ed + */ if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, - cow_start, found_key.offset - 1, - end, page_started, nr_written, 1, - NULL); - if (ret) { - if (nocow) - btrfs_dec_nocow_writers(fs_info, - disk_bytenr); + ret = fallback_to_cow(inode, locked_page, + cow_start, found_key.offset - 1, + page_started, nr_written); + if (ret) goto error; - } cow_start = (u64)-1; } if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 orig_start = found_key.offset - extent_offset; + struct extent_map *em; em = create_io_em(inode, cur_offset, num_bytes, orig_start, @@ -1537,26 +1704,32 @@ ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { - if (nocow) - btrfs_dec_nocow_writers(fs_info, - disk_bytenr); ret = PTR_ERR(em); goto error; } free_extent_map(em); - } - - if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - type = BTRFS_ORDERED_PREALLOC; + ret = btrfs_add_ordered_extent(inode, cur_offset, + disk_bytenr, num_bytes, + num_bytes, + BTRFS_ORDERED_PREALLOC); + if (ret) { + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + num_bytes - 1, + 0); + goto error; + } } else { - type = BTRFS_ORDERED_NOCOW; + ret = btrfs_add_ordered_extent(inode, cur_offset, + disk_bytenr, num_bytes, + num_bytes, + BTRFS_ORDERED_NOCOW); + if (ret) + goto error; } - ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, - num_bytes, num_bytes, type); if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); - BUG_ON(ret); /* -ENOMEM */ + nocow = false; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) @@ -1569,7 +1742,7 @@ num_bytes); extent_clear_unlock_delalloc(inode, cur_offset, - cur_offset + num_bytes - 1, end, + cur_offset + num_bytes - 1, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, @@ -1594,15 +1767,18 @@ if (cow_start != (u64)-1) { cur_offset = end; - ret = cow_file_range(inode, locked_page, cow_start, end, end, - page_started, nr_written, 1, NULL); + ret = fallback_to_cow(inode, locked_page, cow_start, end, + page_started, nr_written); if (ret) goto error; } error: + if (nocow) + btrfs_dec_nocow_writers(fs_info, disk_bytenr); + if (ret && cur_offset < end) - extent_clear_unlock_delalloc(inode, cur_offset, end, end, + extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1613,11 +1789,11 @@ return ret; } -static inline int need_force_cow(struct inode *inode, u64 start, u64 end) +static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end) { - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && - !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) + if (!(inode->flags & BTRFS_INODE_NODATACOW) && + !(inode->flags & BTRFS_INODE_PREALLOC)) return 0; /* @@ -1625,9 +1801,8 @@ * if is not zero, it means the file is defragging. * Force cow if given extent needs to be defragged. */ - if (BTRFS_I(inode)->defrag_bytes && - test_range_bit(&BTRFS_I(inode)->io_tree, start, end, - EXTENT_DEFRAG, 0, NULL)) + if (inode->defrag_bytes && + test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL)) return 1; return 0; @@ -1637,31 +1812,27 @@ * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ -int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc) { - struct inode *inode = private_data; int ret; int force_cow = need_force_cow(inode, start, end); - unsigned int write_flags = wbc_to_write_flags(wbc); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { + if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { + } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); } else if (!inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { - ret = cow_file_range(inode, locked_page, start, end, end, - page_started, nr_written, 1, NULL); + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); } else { - set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags); - ret = cow_file_range_async(inode, locked_page, start, end, - page_started, nr_written, - write_flags); + set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); + ret = cow_file_range_async(inode, wbc, locked_page, start, end, + page_started, nr_written); } if (ret) btrfs_cleanup_ordered_extents(inode, locked_page, start, @@ -1669,10 +1840,9 @@ return ret; } -static void btrfs_split_extent_hook(void *private_data, - struct extent_state *orig, u64 split) +void btrfs_split_delalloc_extent(struct inode *inode, + struct extent_state *orig, u64 split) { - struct inode *inode = private_data; u64 size; /* not delalloc, ignore it */ @@ -1685,7 +1855,7 @@ u64 new_size; /* - * See the explanation in btrfs_merge_extent_hook, the same + * See the explanation in btrfs_merge_delalloc_extent, the same * applies here, just in reverse. */ new_size = orig->end - split + 1; @@ -1702,16 +1872,13 @@ } /* - * extent_io.c merge_extent_hook, used to track merged delayed allocation - * extents so we can keep track of new extents that are just merged onto old - * extents, such as when we are doing sequential writes, so we can properly - * account for the metadata space we'll need. + * Handle merged delayed allocation extents so we can keep track of new extents + * that are just merged onto old extents, such as when we are doing sequential + * writes, so we can properly account for the metadata space we'll need. */ -static void btrfs_merge_extent_hook(void *private_data, - struct extent_state *new, - struct extent_state *other) +void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, + struct extent_state *other) { - struct inode *inode = private_data; u64 new_size, old_size; u32 num_extents; @@ -1815,15 +1982,12 @@ } /* - * extent_io.c set_bit_hook, used to track delayed allocation - * bytes in this file, and to maintain the list of inodes that - * have pending delalloc work to be done. + * Properly track delayed allocation bytes in the inode and to maintain the + * list of inodes that have pending delalloc work to be done. */ -static void btrfs_set_bit_hook(void *private_data, - struct extent_state *state, unsigned *bits) +void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + unsigned *bits) { - struct inode *inode = private_data; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) @@ -1869,14 +2033,14 @@ } /* - * extent_io.c clear_bit_hook, see set_bit_hook for why + * Once a range is no longer delalloc this function ensures that proper + * accounting happens. */ -static void btrfs_clear_bit_hook(void *private_data, - struct extent_state *state, - unsigned *bits) +void btrfs_clear_delalloc_extent(struct inode *vfs_inode, + struct extent_state *state, unsigned *bits) { - struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(len); @@ -1901,7 +2065,7 @@ /* * We don't reserve metadata space for space cache inodes so we - * don't need to call dellalloc_release_metadata if there is an + * don't need to call delalloc_release_metadata if there is an * error. */ if (*bits & EXTENT_CLEAR_META_RESV && @@ -1915,9 +2079,7 @@ if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE) && (*bits & EXTENT_CLEAR_DATA_RESV)) - btrfs_free_reserved_data_space_noquota( - &inode->vfs_inode, - state->start, len); + btrfs_free_reserved_data_space_noquota(fs_info, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); @@ -1940,16 +2102,21 @@ } /* - * Merge bio hook, this must check the chunk tree to make sure we don't create - * bios that span stripes or chunks + * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit + * in a chunk's stripe. This function ensures that bios do not span a + * stripe/chunk * - * return 1 if page cannot be merged to bio - * return 0 if page can be merged to bio + * @page - The page we are about to add to the bio + * @size - size we want to add to the bio + * @bio - bio we want to ensure is smaller than a stripe + * @bio_flags - flags of the bio + * + * return 1 if page cannot be added to the bio + * return 0 if page can be added to the bio * return error otherwise */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, - unsigned long bio_flags) +int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, + unsigned long bio_flags) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1957,17 +2124,19 @@ u64 length = 0; u64 map_length; int ret; + struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - NULL, 0); + ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length, + &geom); if (ret < 0) return ret; - if (map_length < length + size) + + if (geom.len < length + size) return 1; return 0; } @@ -1984,34 +2153,8 @@ u64 bio_offset) { struct inode *inode = private_data; - blk_status_t ret = 0; - ret = btrfs_csum_one_bio(inode, bio, 0, 0); - BUG_ON(ret); /* -ENOMEM */ - return 0; -} - -/* - * in order to insert checksums into the metadata in large chunks, - * we wait until bio submission time. All the pages in the bio are - * checksummed and sums are attached onto the ordered extent record. - * - * At IO completion time the cums attached on the ordered extent record - * are inserted into the btree - */ -blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num) -{ - struct inode *inode = private_data; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - blk_status_t ret; - - ret = btrfs_map_bio(fs_info, bio, mirror_num, 1); - if (ret) { - bio->bi_status = ret; - bio_endio(bio); - } - return ret; + return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } /* @@ -2032,11 +2175,10 @@ * * c-3) otherwise: async submit */ -static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) + { - struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; @@ -2060,7 +2202,7 @@ bio_flags); goto out; } else if (!skip_sum) { - ret = btrfs_lookup_bio_sums(inode, bio, NULL); + ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL); if (ret) goto out; } @@ -2071,17 +2213,16 @@ goto mapit; /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags, - bio_offset, inode, - btrfs_submit_bio_start); + 0, inode, btrfs_submit_bio_start); goto out; } else if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, 0, 0); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); if (ret) goto out; } mapit: - ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, bio, mirror_num); out: if (ret) { @@ -2095,16 +2236,15 @@ * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. */ -static noinline int add_pending_csums(struct btrfs_trans_handle *trans, - struct inode *inode, struct list_head *list) +static int add_pending_csums(struct btrfs_trans_handle *trans, + struct list_head *list) { struct btrfs_ordered_sum *sum; int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; - ret = btrfs_csum_file_blocks(trans, - BTRFS_I(inode)->root->fs_info->csum_root, sum); + ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum); trans->adding_csums = false; if (ret) return ret; @@ -2112,18 +2252,77 @@ return 0; } -int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, - unsigned int extra_bits, - struct extent_state **cached_state, int dedupe) +static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, + const u64 start, + const u64 len, + struct extent_state **cached_state) { - WARN_ON((end & (PAGE_SIZE - 1)) == 0); - return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - extra_bits, cached_state); + u64 search_start = start; + const u64 end = start + len - 1; + + while (search_start < end) { + const u64 search_len = end - search_start + 1; + struct extent_map *em; + u64 em_len; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start != EXTENT_MAP_HOLE) + goto next; + + em_len = em->len; + if (em->start < search_start) + em_len -= search_start - em->start; + if (em_len > search_len) + em_len = search_len; + + ret = set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, + NULL, cached_state, GFP_NOFS); +next: + search_start = extent_map_end(em); + free_extent_map(em); + if (ret) + return ret; + } + return 0; +} + +int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + unsigned int extra_bits, + struct extent_state **cached_state) +{ + WARN_ON(PAGE_ALIGNED(end)); + + if (start >= i_size_read(&inode->vfs_inode) && + !(inode->flags & BTRFS_INODE_PREALLOC)) { + /* + * There can't be any extents following eof in this case so just + * set the delalloc new bit for the range directly. + */ + extra_bits |= EXTENT_DELALLOC_NEW; + } else { + int ret; + + ret = btrfs_find_new_delalloc_bytes(inode, start, + end + 1 - start, + cached_state); + if (ret) + return ret; + } + + return set_extent_delalloc(&inode->io_tree, start, end, extra_bits, + cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; + struct inode *inode; struct btrfs_work work; }; @@ -2134,75 +2333,126 @@ struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; struct page *page; - struct inode *inode; + struct btrfs_inode *inode; u64 page_start; u64 page_end; - int ret; + int ret = 0; + bool free_delalloc_space = true; fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; -again: - lock_page(page); - if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { - ClearPageChecked(page); - goto out_page; - } - - inode = page->mapping->host; + inode = BTRFS_I(fixup->inode); page_start = page_offset(page); page_end = page_offset(page) + PAGE_SIZE - 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, - &cached_state); + /* + * This is similar to page_mkwrite, we need to reserve the space before + * we take the page lock. + */ + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, + PAGE_SIZE); +again: + lock_page(page); + + /* + * Before we queued this fixup, we took a reference on the page. + * page->mapping may go NULL, but it shouldn't be moved to a different + * address space. + */ + if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + /* + * Unfortunately this is a little tricky, either + * + * 1) We got here and our page had already been dealt with and + * we reserved our space, thus ret == 0, so we need to just + * drop our space reservation and bail. This can happen the + * first time we come into the fixup worker, or could happen + * while waiting for the ordered extent. + * 2) Our page was already dealt with, but we happened to get an + * ENOSPC above from the btrfs_delalloc_reserve_space. In + * this case we obviously don't have anything to release, but + * because the page was already dealt with we don't want to + * mark the page with an error, so make sure we're resetting + * ret to 0. This is why we have this check _before_ the ret + * check, because we do not want to have a surprise ENOSPC + * when the page was already properly dealt with. + */ + if (!ret) { + btrfs_delalloc_release_extents(inode, PAGE_SIZE); + btrfs_delalloc_release_space(inode, data_reserved, + page_start, PAGE_SIZE, + true); + } + ret = 0; + goto out_page; + } + + /* + * We can't mess with the page state unless it is locked, so now that + * it is locked bail if we failed to make our space reservation. + */ + if (ret) + goto out_page; + + lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (PagePrivate2(page)) - goto out; + goto out_reserved; - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, - PAGE_SIZE); + ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, - page_end, &cached_state); + unlock_extent_cached(&inode->io_tree, page_start, page_end, + &cached_state); unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - PAGE_SIZE); - if (ret) { - mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); - ClearPageChecked(page); - goto out; - } - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, - &cached_state, 0); - if (ret) { - mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); - ClearPageChecked(page); - goto out_reserved; - } - - ClearPageChecked(page); - set_page_dirty(page); -out_reserved: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + &cached_state); if (ret) + goto out_reserved; + + /* + * Everything went as planned, we're now the owner of a dirty page with + * delayed allocation bits set and space reserved for our COW + * destination. + * + * The page was dirty when we started, nothing should have cleaned it. + */ + BUG_ON(!PageDirty(page)); + free_delalloc_space = false; +out_reserved: + btrfs_delalloc_release_extents(inode, PAGE_SIZE); + if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); -out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, + unlock_extent_cached(&inode->io_tree, page_start, page_end, &cached_state); out_page: + if (ret) { + /* + * We hit ENOSPC or other errors. Update the mapping and page + * to reflect the errors and clean the page. + */ + mapping_set_error(page->mapping, ret); + end_extent_writepage(page, ret, page_start, page_end); + clear_page_dirty_for_io(page); + SetPageError(page); + } + ClearPageChecked(page); unlock_page(page); put_page(page); kfree(fixup); extent_changeset_free(data_reserved); + /* + * As a precaution, do a delayed iput in case it would be the last iput + * that could need flushing space. Recursing back to fixup worker would + * deadlock. + */ + btrfs_add_delayed_iput(&inode->vfs_inode); } /* @@ -2216,7 +2466,7 @@ * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the page. */ -static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) +int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2226,6 +2476,13 @@ if (TestClearPagePrivate2(page)) return 0; + /* + * PageChecked is set below when we create a fixup worker for this page, + * don't try to create another one if we're already PageChecked() + * + * The extent_io writepage code will redirty the page if we send back + * EAGAIN. + */ if (PageChecked(page)) return -EAGAIN; @@ -2233,28 +2490,36 @@ if (!fixup) return -EAGAIN; + /* + * We are already holding a reference to this inode from + * write_cache_pages. We need to hold it because the space reservation + * takes place outside of the page lock, and we can't trust + * page->mapping outside of the page lock. + */ + ihold(inode); SetPageChecked(page); get_page(page); - btrfs_init_work(&fixup->work, btrfs_fixup_helper, - btrfs_writepage_fixup_worker, NULL, NULL); + btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; + fixup->inode = inode; btrfs_queue_work(fs_info->fixup_workers, &fixup->work); - return -EBUSY; + + return -EAGAIN; } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, - struct inode *inode, u64 file_pos, - u64 disk_bytenr, u64 disk_num_bytes, - u64 num_bytes, u64 ram_bytes, - u8 compression, u8 encryption, - u16 other_encoding, int extent_type) + struct btrfs_inode *inode, u64 file_pos, + struct btrfs_file_extent_item *stack_fi, + u64 qgroup_reserved) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_file_extent_item *fi; + struct btrfs_root *root = inode->root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; - u64 qg_released; + u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); + u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); + u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); + u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); int extent_inserted = 0; int ret; @@ -2273,709 +2538,52 @@ */ ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, file_pos + num_bytes, NULL, 0, - 1, sizeof(*fi), &extent_inserted); + 1, sizeof(*stack_fi), &extent_inserted); if (ret) goto out; if (!extent_inserted) { - ins.objectid = btrfs_ino(BTRFS_I(inode)); + ins.objectid = btrfs_ino(inode); ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &ins, - sizeof(*fi)); + sizeof(*stack_fi)); if (ret) goto out; } leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_set_file_extent_type(leaf, fi, extent_type); - btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); - btrfs_set_file_extent_offset(leaf, fi, 0); - btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); - btrfs_set_file_extent_compression(leaf, fi, compression); - btrfs_set_file_extent_encryption(leaf, fi, encryption); - btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); + write_extent_buffer(leaf, stack_fi, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(struct btrfs_file_extent_item)); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - inode_add_bytes(inode, num_bytes); + inode_add_bytes(&inode->vfs_inode, num_bytes); ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; - /* - * Release the reserved range from inode dirty range map, as it is - * already moved into delayed_ref_head - */ - ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes); - if (ret < 0) - goto out; - qg_released = ret; - ret = btrfs_alloc_reserved_file_extent(trans, root, - btrfs_ino(BTRFS_I(inode)), - file_pos, qg_released, &ins); -out: - btrfs_free_path(path); - - return ret; -} - -/* snapshot-aware defrag */ -struct sa_defrag_extent_backref { - struct rb_node node; - struct old_sa_defrag_extent *old; - u64 root_id; - u64 inum; - u64 file_pos; - u64 extent_offset; - u64 num_bytes; - u64 generation; -}; - -struct old_sa_defrag_extent { - struct list_head list; - struct new_sa_defrag_extent *new; - - u64 extent_offset; - u64 bytenr; - u64 offset; - u64 len; - int count; -}; - -struct new_sa_defrag_extent { - struct rb_root root; - struct list_head head; - struct btrfs_path *path; - struct inode *inode; - u64 file_pos; - u64 len; - u64 bytenr; - u64 disk_len; - u8 compress_type; -}; - -static int backref_comp(struct sa_defrag_extent_backref *b1, - struct sa_defrag_extent_backref *b2) -{ - if (b1->root_id < b2->root_id) - return -1; - else if (b1->root_id > b2->root_id) - return 1; - - if (b1->inum < b2->inum) - return -1; - else if (b1->inum > b2->inum) - return 1; - - if (b1->file_pos < b2->file_pos) - return -1; - else if (b1->file_pos > b2->file_pos) - return 1; - - /* - * [------------------------------] ===> (a range of space) - * |<--->| |<---->| =============> (fs/file tree A) - * |<---------------------------->| ===> (fs/file tree B) - * - * A range of space can refer to two file extents in one tree while - * refer to only one file extent in another tree. - * - * So we may process a disk offset more than one time(two extents in A) - * and locate at the same extent(one extent in B), then insert two same - * backrefs(both refer to the extent in B). - */ - return 0; -} - -static void backref_insert(struct rb_root *root, - struct sa_defrag_extent_backref *backref) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct sa_defrag_extent_backref *entry; - int ret; - - while (*p) { - parent = *p; - entry = rb_entry(parent, struct sa_defrag_extent_backref, node); - - ret = backref_comp(backref, entry); - if (ret < 0) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - - rb_link_node(&backref->node, parent, p); - rb_insert_color(&backref->node, root); -} - -/* - * Note the backref might has changed, and in this case we just return 0. - */ -static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, - void *ctx) -{ - struct btrfs_file_extent_item *extent; - struct old_sa_defrag_extent *old = ctx; - struct new_sa_defrag_extent *new = old->new; - struct btrfs_path *path = new->path; - struct btrfs_key key; - struct btrfs_root *root; - struct sa_defrag_extent_backref *backref; - struct extent_buffer *leaf; - struct inode *inode = new->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int slot; - int ret; - u64 extent_offset; - u64 num_bytes; - - if (BTRFS_I(inode)->root->root_key.objectid == root_id && - inum == btrfs_ino(BTRFS_I(inode))) - return 0; - - key.objectid = root_id; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(root)) { - if (PTR_ERR(root) == -ENOENT) - return 0; - WARN_ON(1); - btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu", - inum, offset, root_id); - return PTR_ERR(root); - } - - key.objectid = inum; - key.type = BTRFS_EXTENT_DATA_KEY; - if (offset > (u64)-1 << 32) - key.offset = 0; - else - key.offset = offset; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (WARN_ON(ret < 0)) - return ret; - ret = 0; - - while (1) { - cond_resched(); - - leaf = path->nodes[0]; - slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - goto out; - } - continue; - } - - path->slots[0]++; - - btrfs_item_key_to_cpu(leaf, &key, slot); - - if (key.objectid > inum) - goto out; - - if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) - continue; - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - - if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) - continue; - - /* - * 'offset' refers to the exact key.offset, - * NOT the 'offset' field in btrfs_extent_data_ref, ie. - * (key.offset - extent_offset). - */ - if (key.offset != offset) - continue; - - extent_offset = btrfs_file_extent_offset(leaf, extent); - num_bytes = btrfs_file_extent_num_bytes(leaf, extent); - - if (extent_offset >= old->extent_offset + old->offset + - old->len || extent_offset + num_bytes <= - old->extent_offset + old->offset) - continue; - break; - } - - backref = kmalloc(sizeof(*backref), GFP_NOFS); - if (!backref) { - ret = -ENOENT; - goto out; - } - - backref->root_id = root_id; - backref->inum = inum; - backref->file_pos = offset; - backref->num_bytes = num_bytes; - backref->extent_offset = extent_offset; - backref->generation = btrfs_file_extent_generation(leaf, extent); - backref->old = old; - backref_insert(&new->root, backref); - old->count++; -out: - btrfs_release_path(path); - WARN_ON(ret); - return ret; -} - -static noinline bool record_extent_backrefs(struct btrfs_path *path, - struct new_sa_defrag_extent *new) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); - struct old_sa_defrag_extent *old, *tmp; - int ret; - - new->path = path; - - list_for_each_entry_safe(old, tmp, &new->head, list) { - ret = iterate_inodes_from_logical(old->bytenr + - old->extent_offset, fs_info, - path, record_one_backref, - old, false); - if (ret < 0 && ret != -ENOENT) - return false; - - /* no backref to be processed for this extent */ - if (!old->count) { - list_del(&old->list); - kfree(old); - } - } - - if (list_empty(&new->head)) - return false; - - return true; -} - -static int relink_is_mergable(struct extent_buffer *leaf, - struct btrfs_file_extent_item *fi, - struct new_sa_defrag_extent *new) -{ - if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr) - return 0; - - if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) - return 0; - - if (btrfs_file_extent_compression(leaf, fi) != new->compress_type) - return 0; - - if (btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)) - return 0; - - return 1; -} - -/* - * Note the backref might has changed, and in this case we just return 0. - */ -static noinline int relink_extent_backref(struct btrfs_path *path, - struct sa_defrag_extent_backref *prev, - struct sa_defrag_extent_backref *backref) -{ - struct btrfs_file_extent_item *extent; - struct btrfs_file_extent_item *item; - struct btrfs_ordered_extent *ordered; - struct btrfs_trans_handle *trans; - struct btrfs_root *root; - struct btrfs_key key; - struct extent_buffer *leaf; - struct old_sa_defrag_extent *old = backref->old; - struct new_sa_defrag_extent *new = old->new; - struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); - struct inode *inode; - struct extent_state *cached = NULL; - int ret = 0; - u64 start; - u64 len; - u64 lock_start; - u64 lock_end; - bool merge = false; - int index; - - if (prev && prev->root_id == backref->root_id && - prev->inum == backref->inum && - prev->file_pos + prev->num_bytes == backref->file_pos) - merge = true; - - /* step 1: get root */ - key.objectid = backref->root_id; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - index = srcu_read_lock(&fs_info->subvol_srcu); - - root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - if (PTR_ERR(root) == -ENOENT) - return 0; - return PTR_ERR(root); - } - - if (btrfs_root_readonly(root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - return 0; - } - - /* step 2: get inode */ - key.objectid = backref->inum; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - inode = btrfs_iget(fs_info->sb, &key, root, NULL); - if (IS_ERR(inode)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - return 0; - } - - srcu_read_unlock(&fs_info->subvol_srcu, index); - - /* step 3: relink backref */ - lock_start = backref->file_pos; - lock_end = backref->file_pos + backref->num_bytes - 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - &cached); - - ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); - if (ordered) { - btrfs_put_ordered_extent(ordered); - goto out_unlock; - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_unlock; - } - - key.objectid = backref->inum; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = backref->file_pos; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - goto out_free_path; - } else if (ret > 0) { - ret = 0; - goto out_free_path; - } - - extent = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - - if (btrfs_file_extent_generation(path->nodes[0], extent) != - backref->generation) - goto out_free_path; - - btrfs_release_path(path); - - start = backref->file_pos; - if (backref->extent_offset < old->extent_offset + old->offset) - start += old->extent_offset + old->offset - - backref->extent_offset; - - len = min(backref->extent_offset + backref->num_bytes, - old->extent_offset + old->offset + old->len); - len -= max(backref->extent_offset, old->extent_offset + old->offset); - - ret = btrfs_drop_extents(trans, root, inode, start, - start + len, 1); + ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) - goto out_free_path; -again: - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; - - path->leave_spinning = 1; - if (merge) { - struct btrfs_file_extent_item *fi; - u64 extent_len; - struct btrfs_key found_key; - - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) - goto out_free_path; - - path->slots[0]--; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_len = btrfs_file_extent_num_bytes(leaf, fi); - - if (extent_len + found_key.offset == start && - relink_is_mergable(leaf, fi, new)) { - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_len + len); - btrfs_mark_buffer_dirty(leaf); - inode_add_bytes(inode, len); - - ret = 1; - goto out_free_path; - } else { - merge = false; - btrfs_release_path(path); - goto again; - } - } - - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(*extent)); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_free_path; - } - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); - btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); - btrfs_set_file_extent_num_bytes(leaf, item, len); - btrfs_set_file_extent_ram_bytes(leaf, item, new->len); - btrfs_set_file_extent_generation(leaf, item, trans->transid); - btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_compression(leaf, item, new->compress_type); - btrfs_set_file_extent_encryption(leaf, item, 0); - btrfs_set_file_extent_other_encoding(leaf, item, 0); - - btrfs_mark_buffer_dirty(leaf); - inode_add_bytes(inode, len); - btrfs_release_path(path); - - ret = btrfs_inc_extent_ref(trans, root, new->bytenr, - new->disk_len, 0, - backref->root_id, backref->inum, - new->file_pos); /* start - extent_offset */ - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_free_path; - } - - ret = 1; -out_free_path: - btrfs_release_path(path); - path->leave_spinning = 0; - btrfs_end_transaction(trans); -out_unlock: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - &cached); - iput(inode); - return ret; -} - -static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) -{ - struct old_sa_defrag_extent *old, *tmp; - - if (!new) - return; - - list_for_each_entry_safe(old, tmp, &new->head, list) { - kfree(old); - } - kfree(new); -} - -static void relink_file_extents(struct new_sa_defrag_extent *new) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); - struct btrfs_path *path; - struct sa_defrag_extent_backref *backref; - struct sa_defrag_extent_backref *prev = NULL; - struct inode *inode; - struct rb_node *node; - int ret; - - inode = new->inode; - - path = btrfs_alloc_path(); - if (!path) - return; - - if (!record_extent_backrefs(path, new)) { - btrfs_free_path(path); goto out; - } - btrfs_release_path(path); - while (1) { - node = rb_first(&new->root); - if (!node) - break; - rb_erase(node, &new->root); - - backref = rb_entry(node, struct sa_defrag_extent_backref, node); - - ret = relink_extent_backref(path, prev, backref); - WARN_ON(ret < 0); - - kfree(prev); - - if (ret == 1) - prev = backref; - else - prev = NULL; - cond_resched(); - } - kfree(prev); - - btrfs_free_path(path); + ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), + file_pos, qgroup_reserved, &ins); out: - free_sa_defrag_extent(new); - - atomic_dec(&fs_info->defrag_running); - wake_up(&fs_info->transaction_wait); -} - -static struct new_sa_defrag_extent * -record_old_file_extents(struct inode *inode, - struct btrfs_ordered_extent *ordered) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path; - struct btrfs_key key; - struct old_sa_defrag_extent *old; - struct new_sa_defrag_extent *new; - int ret; - - new = kmalloc(sizeof(*new), GFP_NOFS); - if (!new) - return NULL; - - new->inode = inode; - new->file_pos = ordered->file_offset; - new->len = ordered->len; - new->bytenr = ordered->start; - new->disk_len = ordered->disk_len; - new->compress_type = ordered->compress_type; - new->root = RB_ROOT; - INIT_LIST_HEAD(&new->head); - - path = btrfs_alloc_path(); - if (!path) - goto out_kfree; - - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = new->file_pos; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out_free_path; - if (ret > 0 && path->slots[0] > 0) - path->slots[0]--; - - /* find out all the old extents for the file range */ - while (1) { - struct btrfs_file_extent_item *extent; - struct extent_buffer *l; - int slot; - u64 num_bytes; - u64 offset; - u64 end; - u64 disk_bytenr; - u64 extent_offset; - - l = path->nodes[0]; - slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out_free_path; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.objectid != btrfs_ino(BTRFS_I(inode))) - break; - if (key.type != BTRFS_EXTENT_DATA_KEY) - break; - if (key.offset >= new->file_pos + new->len) - break; - - extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); - - num_bytes = btrfs_file_extent_num_bytes(l, extent); - if (key.offset + num_bytes < new->file_pos) - goto next; - - disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); - if (!disk_bytenr) - goto next; - - extent_offset = btrfs_file_extent_offset(l, extent); - - old = kmalloc(sizeof(*old), GFP_NOFS); - if (!old) - goto out_free_path; - - offset = max(new->file_pos, key.offset); - end = min(new->file_pos + new->len, key.offset + num_bytes); - - old->bytenr = disk_bytenr; - old->extent_offset = extent_offset; - old->offset = offset - key.offset; - old->len = end - offset; - old->new = new; - old->count = 0; - list_add_tail(&old->list, &new->head); -next: - path->slots[0]++; - cond_resched(); - } - btrfs_free_path(path); - atomic_inc(&fs_info->defrag_running); - return new; - -out_free_path: - btrfs_free_path(path); -out_kfree: - free_sa_defrag_extent(new); - return NULL; + return ret; } static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, u64 start, u64 len) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); @@ -2987,7 +2595,33 @@ btrfs_put_block_group(cache); } -/* as ordered data IO finishes, this gets called so we can finish +static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_ordered_extent *oe) +{ + struct btrfs_file_extent_item stack_fi; + u64 logical_len; + + memset(&stack_fi, 0, sizeof(stack_fi)); + btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); + btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, + oe->disk_num_bytes); + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) + logical_len = oe->truncated_len; + else + logical_len = oe->num_bytes; + btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len); + btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len); + btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); + /* Encryption and other encoding is reserved and all 0 */ + + return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), + oe->file_offset, &stack_fi, + oe->qgroup_rsv); +} + +/* + * As ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. */ @@ -2999,32 +2633,33 @@ struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct new_sa_defrag_extent *new = NULL; + u64 start, end; int compress_type = 0; int ret = 0; - u64 logical_len = ordered_extent->len; - bool nolock; + u64 logical_len = ordered_extent->num_bytes; + bool freespace_inode; bool truncated = false; bool range_locked = false; bool clear_new_delalloc_bytes = false; bool clear_reserved_extent = true; + unsigned int clear_bits; + + start = ordered_extent->file_offset; + end = start + ordered_extent->num_bytes - 1; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) clear_new_delalloc_bytes = true; - nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); + freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode)); if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { ret = -EIO; goto out; } - btrfs_free_io_failure_record(BTRFS_I(inode), - ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1); + btrfs_free_io_failure_record(BTRFS_I(inode), start, end); if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; @@ -3037,16 +2672,9 @@ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - /* - * For mwrite(mmap + memset to write) case, we still reserve - * space for NOCOW range. - * As NOCOW won't cause a new delayed ref, just free the space - */ - btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, - ordered_extent->len); - btrfs_ordered_update_i_size(inode, 0, ordered_extent); - if (nolock) - trans = btrfs_join_transaction_nolock(root); + btrfs_inode_safe_disk_i_size_write(inode, 0); + if (freespace_inode) + trans = btrfs_join_transaction_spacecache(root); else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3062,26 +2690,10 @@ } range_locked = true; - lock_extent_bits(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - &cached_state); + lock_extent_bits(io_tree, start, end, &cached_state); - ret = test_range_bit(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 0, cached_state); - if (ret) { - u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); - if (0 && last_snapshot >= BTRFS_I(inode)->generation) - /* the inode is shared */ - new = record_old_file_extents(inode, ordered_extent); - - clear_extent_bit(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 0, 0, &cached_state); - } - - if (nolock) - trans = btrfs_join_transaction_nolock(root); + if (freespace_inode) + trans = btrfs_join_transaction_spacecache(root); else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3096,43 +2708,35 @@ compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); - btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, - ordered_extent->len); ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), ordered_extent->file_offset, ordered_extent->file_offset + logical_len); } else { BUG_ON(root == fs_info->tree_root); - ret = insert_reserved_file_extent(trans, inode, - ordered_extent->file_offset, - ordered_extent->start, - ordered_extent->disk_len, - logical_len, logical_len, - compress_type, 0, 0, - BTRFS_FILE_EXTENT_REG); + ret = insert_ordered_extent_file_extent(trans, ordered_extent); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, - ordered_extent->start, - ordered_extent->disk_len); + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } } unpin_extent_cache(&BTRFS_I(inode)->extent_tree, - ordered_extent->file_offset, ordered_extent->len, - trans->transid); + ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; } - ret = add_pending_csums(trans, inode, &ordered_extent->list); + ret = add_pending_csums(trans, &ordered_extent->list); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - btrfs_ordered_update_i_size(inode, 0, ordered_extent); + btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); @@ -3140,27 +2744,20 @@ } ret = 0; out: - if (range_locked || clear_new_delalloc_bytes) { - unsigned int clear_bits = 0; - - if (range_locked) - clear_bits |= EXTENT_LOCKED; - if (clear_new_delalloc_bytes) - clear_bits |= EXTENT_DELALLOC_NEW; - clear_extent_bit(&BTRFS_I(inode)->io_tree, - ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, - clear_bits, - (clear_bits & EXTENT_LOCKED) ? 1 : 0, - 0, &cached_state); - } + clear_bits = EXTENT_DEFRAG; + if (range_locked) + clear_bits |= EXTENT_LOCKED; + if (clear_new_delalloc_bytes) + clear_bits |= EXTENT_DELALLOC_NEW; + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, + (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, + &cached_state); if (trans) btrfs_end_transaction(trans); if (ret || truncated) { - u64 start, end; + u64 unwritten_start = start; /* * If we failed to finish this ordered extent for any reason we @@ -3175,14 +2772,11 @@ mapping_set_error(ordered_extent->inode->i_mapping, -EIO); if (truncated) - start = ordered_extent->file_offset + logical_len; - else - start = ordered_extent->file_offset; - end = ordered_extent->file_offset + ordered_extent->len - 1; - clear_extent_uptodate(io_tree, start, end, NULL); + unwritten_start += logical_len; + clear_extent_uptodate(io_tree, unwritten_start, end, NULL); /* Drop the cache for the part of the extent we didn't write. */ - btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0); /* * If the ordered extent had an IOERR or something else went @@ -3197,28 +2791,27 @@ if ((ret || !logical_len) && clear_reserved_extent && !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && - !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + /* + * Discard the range before returning it back to the + * free space pool + */ + if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC)) + btrfs_discard_extent(fs_info, + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes, + NULL); btrfs_free_reserved_extent(fs_info, - ordered_extent->start, - ordered_extent->disk_len, 1); + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes, 1); + } } - /* * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. */ - btrfs_remove_ordered_extent(inode, ordered_extent); - - /* for snapshot-aware defrag */ - if (new) { - if (ret) { - free_sa_defrag_extent(new); - atomic_dec(&fs_info->defrag_running); - } else { - relink_file_extents(new); - } - } + btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent); /* once for us */ btrfs_put_ordered_extent(ordered_extent); @@ -3235,14 +2828,13 @@ btrfs_finish_ordered_io(ordered_extent); } -static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state, int uptodate) +void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, + u64 end, int uptodate) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_extent *ordered_extent = NULL; struct btrfs_workqueue *wq; - btrfs_work_func_t func; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); @@ -3251,34 +2843,34 @@ end - start + 1, uptodate)) return; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (btrfs_is_free_space_inode(inode)) wq = fs_info->endio_freespace_worker; - func = btrfs_freespace_write_helper; - } else { + else wq = fs_info->endio_write_workers; - func = btrfs_endio_write_helper; - } - btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, - NULL); + btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &ordered_extent->work); } -static int __readpage_endio_check(struct inode *inode, - struct btrfs_io_bio *io_bio, - int icsum, struct page *page, - int pgoff, u64 start, size_t len) +static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, + int icsum, struct page *page, int pgoff, u64 start, + size_t len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; - u32 csum_expected; - u32 csum = ~(u32)0; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + u8 *csum_expected; + u8 csum[BTRFS_CSUM_SIZE]; - csum_expected = *(((u32 *)io_bio->csum) + icsum); + csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size; kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr + pgoff, csum, len); - btrfs_csum_final(csum, (u8 *)&csum); - if (csum != csum_expected) + shash->tfm = fs_info->csum_shash; + + crypto_shash_digest(shash, kaddr + pgoff, len, csum); + + if (memcmp(csum, csum_expected, csum_size)) goto zeroit; kunmap_atomic(kaddr); @@ -3286,6 +2878,9 @@ zeroit: btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, io_bio->mirror_num); + if (io_bio->device) + btrfs_dev_stat_inc_and_print(io_bio->device, + BTRFS_DEV_STAT_CORRUPTION_ERRS); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); kunmap_atomic(kaddr); @@ -3297,9 +2892,8 @@ * if there's a match, we allow the bio to finish. If not, the code in * extent_io.c will try to find good copies for us. */ -static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, - u64 phy_offset, struct page *page, - u64 start, u64 end, int mirror) +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, int mirror) { size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; @@ -3321,8 +2915,8 @@ } phy_offset >>= inode->i_sb->s_blocksize_bits; - return __readpage_endio_check(inode, io_bio, phy_offset, page, offset, - start, (size_t)(end - start + 1)); + return check_data_csum(inode, io_bio, phy_offset, page, offset, start, + (size_t)(end - start + 1)); } /* @@ -3343,10 +2937,35 @@ if (atomic_add_unless(&inode->i_count, -1, 1)) return; + atomic_inc(&fs_info->nr_delayed_iputs); spin_lock(&fs_info->delayed_iput_lock); ASSERT(list_empty(&binode->delayed_iput)); list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); spin_unlock(&fs_info->delayed_iput_lock); + if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); +} + +static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + list_del_init(&inode->delayed_iput); + spin_unlock(&fs_info->delayed_iput_lock); + iput(&inode->vfs_inode); + if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) + wake_up(&fs_info->delayed_iputs_wait); + spin_lock(&fs_info->delayed_iput_lock); +} + +static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + if (!list_empty(&inode->delayed_iput)) { + spin_lock(&fs_info->delayed_iput_lock); + if (!list_empty(&inode->delayed_iput)) + run_delayed_iput_locked(fs_info, inode); + spin_unlock(&fs_info->delayed_iput_lock); + } } void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) @@ -3358,12 +2977,29 @@ inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); - list_del_init(&inode->delayed_iput); - spin_unlock(&fs_info->delayed_iput_lock); - iput(&inode->vfs_inode); - spin_lock(&fs_info->delayed_iput_lock); + run_delayed_iput_locked(fs_info, inode); + cond_resched_lock(&fs_info->delayed_iput_lock); } spin_unlock(&fs_info->delayed_iput_lock); +} + +/** + * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running + * @fs_info - the fs_info for this fs + * @return - EINTR if we were killed, 0 if nothing's pending + * + * This will wait on any delayed iputs that are currently running with KILLABLE + * set. Once they are all done running we will return, unless we are killed in + * which case we return EINTR. This helps in user operations like fallocate etc + * that might get blocked on the iputs. + */ +int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) +{ + int ret = wait_event_killable(fs_info->delayed_iputs_wait, + atomic_read(&fs_info->nr_delayed_iputs) == 0); + if (ret) + return -EINTR; + return 0; } /* @@ -3471,14 +3107,13 @@ found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; - inode = btrfs_iget(fs_info->sb, &found_key, root, NULL); + inode = btrfs_iget(fs_info->sb, last_objectid, root); ret = PTR_ERR_OR_ZERO(inode); if (ret && ret != -ENOENT) goto out; if (ret == -ENOENT && root == fs_info->tree_root) { struct btrfs_root *dead_root; - struct btrfs_fs_info *fs_info = root->fs_info; int is_dead_root = 0; /* @@ -3490,18 +3125,16 @@ * orphan must not get deleted. * find_dead_roots already ran before us, so if this * is a snapshot deletion, we should find the root - * in the dead_roots list + * in the fs_roots radix tree. */ - spin_lock(&fs_info->trans_lock); - list_for_each_entry(dead_root, &fs_info->dead_roots, - root_list) { - if (dead_root->root_key.objectid == - found_key.objectid) { - is_dead_root = 1; - break; - } - } - spin_unlock(&fs_info->trans_lock); + + spin_lock(&fs_info->fs_roots_radix_lock); + dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)found_key.objectid); + if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) + is_dead_root = 1; + spin_unlock(&fs_info->fs_roots_radix_lock); + if (is_dead_root) { /* prevent this orphan from being found again */ key.offset = found_key.objectid - 1; @@ -3551,8 +3184,6 @@ /* this will do delete_inode and everything for us */ iput(inode); - if (ret) - goto out; } /* release the path since we're done with it */ btrfs_release_path(path); @@ -3694,6 +3325,8 @@ i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); + btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, + round_up(i_size_read(inode), fs_info->sectorsize)); inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); @@ -3764,21 +3397,14 @@ * inode is not a directory, logging its parent unnecessarily. */ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + /* - * Similar reasoning for last_link_trans, needs to be set otherwise - * for a case like the following: - * - * mkdir A - * touch foo - * ln foo A/bar - * echo 2 > /proc/sys/vm/drop_caches - * fsync foo - * <power failure> - * - * Would result in link bar and directory A not existing after the power - * failure. + * Same logic as for last_unlink_trans. We don't persist the generation + * of the last transaction where this inode was used for a reflink + * operation, so after eviction and reloading the inode we must be + * pessimistic and assume the last transaction that modified the inode. */ - BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans; + BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; path->slots[0]++; if (inode->i_nlink != 1 || @@ -3827,7 +3453,6 @@ switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; break; @@ -3838,7 +3463,7 @@ case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->a_ops = &btrfs_aops; break; default: inode->i_op = &btrfs_special_inode_operations; @@ -3860,45 +3485,42 @@ { struct btrfs_map_token token; - btrfs_init_map_token(&token); + btrfs_init_map_token(&token, leaf); - btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); - btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); - btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, - &token); - btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); - btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); + btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); + btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); + btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_token_inode_mode(&token, item, inode->i_mode); + btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); - btrfs_set_token_timespec_sec(leaf, &item->atime, - inode->i_atime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, &item->atime, - inode->i_atime.tv_nsec, &token); + btrfs_set_token_timespec_sec(&token, &item->atime, + inode->i_atime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->atime, + inode->i_atime.tv_nsec); - btrfs_set_token_timespec_sec(leaf, &item->mtime, - inode->i_mtime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, &item->mtime, - inode->i_mtime.tv_nsec, &token); + btrfs_set_token_timespec_sec(&token, &item->mtime, + inode->i_mtime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->mtime, + inode->i_mtime.tv_nsec); - btrfs_set_token_timespec_sec(leaf, &item->ctime, - inode->i_ctime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, &item->ctime, - inode->i_ctime.tv_nsec, &token); + btrfs_set_token_timespec_sec(&token, &item->ctime, + inode->i_ctime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->ctime, + inode->i_ctime.tv_nsec); - btrfs_set_token_timespec_sec(leaf, &item->otime, - BTRFS_I(inode)->i_otime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, &item->otime, - BTRFS_I(inode)->i_otime.tv_nsec, &token); + btrfs_set_token_timespec_sec(&token, &item->otime, + BTRFS_I(inode)->i_otime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->otime, + BTRFS_I(inode)->i_otime.tv_nsec); - btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), - &token); - btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, - &token); - btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode), - &token); - btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); - btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); - btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); - btrfs_set_token_inode_block_group(leaf, item, 0, &token); + btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); + btrfs_set_token_inode_generation(&token, item, + BTRFS_I(inode)->generation); + btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); + btrfs_set_token_inode_transid(&token, item, trans->transid); + btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); + btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); + btrfs_set_token_inode_block_group(&token, item, 0); } /* @@ -3931,7 +3553,7 @@ fill_inode_item(trans, leaf, inode_item, inode); btrfs_mark_buffer_dirty(leaf); - btrfs_set_inode_last_trans(trans, inode); + btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); ret = 0; failed: btrfs_free_path(path); @@ -3961,7 +3583,7 @@ ret = btrfs_delayed_update_inode(trans, root, inode); if (!ret) - btrfs_set_inode_last_trans(trans, inode); + btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); return ret; } @@ -3994,9 +3616,7 @@ struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; int ret = 0; - struct extent_buffer *leaf; struct btrfs_dir_item *di; - struct btrfs_key key; u64 index; u64 ino = btrfs_ino(inode); u64 dir_ino = btrfs_ino(dir); @@ -4010,16 +3630,10 @@ path->leave_spinning = 1; di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; goto err; } - if (!di) { - ret = -ENOENT; - goto err; - } - leaf = path->nodes[0]; - btrfs_dir_item_key_to_cpu(leaf, di, &key); ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto err; @@ -4072,6 +3686,17 @@ ret = 0; else if (ret) btrfs_abort_transaction(trans, ret); + + /* + * If we have a pending delayed iput we could end up with the final iput + * being run in btrfs-cleaner context. If we have enough of these built + * up we can end up burning a lot of time in btrfs-cleaner without any + * way to throttle the unlinks. Since we're currently holding a ref on + * the inode we can run the delayed iput here without any issues as the + * final iput won't be done until after we drop the ref we're currently + * holding. + */ + btrfs_run_delayed_iput(fs_info, inode); err: btrfs_free_path(path); if (ret) @@ -4120,7 +3745,7 @@ * 1 for the inode ref * 1 for the inode */ - return btrfs_start_transaction_fallback_global_rsv(root, 5, 5); + return btrfs_start_transaction_fallback_global_rsv(root, 5); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -4187,10 +3812,7 @@ di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); if (IS_ERR_OR_NULL(di)) { - if (!di) - ret = -ENOENT; - else - ret = PTR_ERR(di); + ret = di ? PTR_ERR(di) : -ENOENT; goto out; } @@ -4393,18 +4015,24 @@ * again is not run concurrently. */ spin_lock(&dest->root_item_lock); - root_flags = btrfs_root_flags(&dest->root_item); - if (dest->send_in_progress == 0) { - btrfs_set_root_flags(&dest->root_item, - root_flags | BTRFS_ROOT_SUBVOL_DEAD); - spin_unlock(&dest->root_item_lock); - } else { + if (dest->send_in_progress) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", dest->root_key.objectid); return -EPERM; } + if (atomic_read(&dest->nr_swapfiles)) { + spin_unlock(&dest->root_item_lock); + btrfs_warn(fs_info, + "attempt to delete subvolume %llu with active swapfile", + root->root_key.objectid); + return -EPERM; + } + root_flags = btrfs_root_flags(&dest->root_item); + btrfs_set_root_flags(&dest->root_item, + root_flags | BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); down_write(&fs_info->subvol_sem); @@ -4487,7 +4115,7 @@ err = ret; inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); out_up_write: up_write(&fs_info->subvol_sem); if (err) { @@ -4566,31 +4194,6 @@ return err; } -static int truncate_space_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytes_deleted) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - int ret; - - /* - * This is only used to apply pressure to the enospc system, we don't - * intend to use this reservation at all. - */ - bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted); - bytes_deleted *= fs_info->nodesize; - ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, - bytes_deleted, BTRFS_RESERVE_NO_FLUSH); - if (!ret) { - trace_btrfs_space_reservation(fs_info, "transaction", - trans->transid, - bytes_deleted, 1); - trans->bytes_reserved += bytes_deleted; - } - return ret; - -} - /* * Return this if we need to call truncate_block for the last bit of the * truncate. @@ -4635,16 +4238,18 @@ u64 bytes_deleted = 0; bool be_nice = false; bool should_throttle = false; - bool should_end = false; + const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); + struct extent_state *cached_state = NULL; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); /* - * for non-free space inodes and ref cows, we want to back off from - * time to time + * For non-free space inodes and non-shareable roots, we want to back + * off from time to time. This means all inodes in subvolume roots, + * reloc roots, and data reloc roots. */ if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && - test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) be_nice = true; path = btrfs_alloc_path(); @@ -4652,21 +4257,24 @@ return -ENOMEM; path->reada = READA_BACK; - /* - * We want to drop from the next block forward in case this new size is - * not block aligned since we will be keeping the last block of the - * extent just the way it is. - */ - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || - root == fs_info->tree_root) + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + &cached_state); + + /* + * We want to drop from the next block forward in case this + * new size is not block aligned since we will be keeping the + * last block of the extent just the way it is. + */ btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size, fs_info->sectorsize), (u64)-1, 0); + } /* * This function is also used to drop the items in the log tree before * we relog the inode, so if root != BTRFS_I(inode)->root, it means - * it is used to drop the loged items. So we shouldn't kill the delayed + * it is used to drop the logged items. So we shouldn't kill the delayed * items. */ if (min_type == 0 && root == BTRFS_I(inode)->root) @@ -4688,7 +4296,6 @@ goto out; } - path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; @@ -4704,6 +4311,8 @@ } while (1) { + u64 clear_start = 0, clear_len = 0; + fi = NULL; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); @@ -4754,6 +4363,8 @@ if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; + + clear_start = found_key.offset; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); if (!del_item) { u64 orig_num_bytes = @@ -4761,11 +4372,12 @@ extent_num_bytes = ALIGN(new_size - found_key.offset, fs_info->sectorsize); + clear_start = ALIGN(new_size, fs_info->sectorsize); btrfs_set_file_extent_num_bytes(leaf, fi, extent_num_bytes); num_dec = (orig_num_bytes - extent_num_bytes); - if (test_bit(BTRFS_ROOT_REF_COWS, + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && extent_start != 0) inode_sub_bytes(inode, num_dec); @@ -4781,11 +4393,12 @@ num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) { found_extent = 1; - if (test_bit(BTRFS_ROOT_REF_COWS, + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) inode_sub_bytes(inode, num_dec); } } + clear_len = num_dec; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { /* * we can't truncate inline items that have had @@ -4799,7 +4412,7 @@ btrfs_set_file_extent_ram_bytes(leaf, fi, size); size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(root->fs_info, path, size, 1); + btrfs_truncate_item(path, size, 1); } else if (!del_item) { /* * We have to bail so the last_size is set to @@ -4807,12 +4420,33 @@ */ ret = NEED_TRUNCATE_BLOCK; break; + } else { + /* + * Inline extents are special, we just treat + * them as a full sector worth in the file + * extent tree just for simplicity sake. + */ + clear_len = fs_info->sectorsize; } - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) inode_sub_bytes(inode, item_end + 1 - new_size); } delete: + /* + * We use btrfs_truncate_inode_items() to clean up log trees for + * multiple fsyncs, and in this case we don't want to clear the + * file extent range because it's just the log. + */ + if (root == BTRFS_I(inode)->root) { + ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), + clear_start, clear_len); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } + } + if (del_item) last_size = found_key.offset; else @@ -4836,29 +4470,23 @@ should_throttle = false; if (found_extent && - (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || - root == fs_info->tree_root)) { - btrfs_set_path_blocking(path); + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_ref ref = { 0 }; + bytes_deleted += extent_num_bytes; - ret = btrfs_free_extent(trans, root, extent_start, - extent_num_bytes, 0, - btrfs_header_owner(leaf), - ino, extent_offset); + + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, + extent_start, extent_num_bytes, 0); + ref.real_root = root->root_key.objectid; + btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), + ino, extent_offset); + ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } - if (btrfs_should_throttle_delayed_refs(trans, fs_info)) - btrfs_async_run_delayed_refs(fs_info, - trans->delayed_ref_updates * 2, - trans->transid, 0); if (be_nice) { - if (truncate_space_check(trans, root, - extent_num_bytes)) { - should_end = true; - } - if (btrfs_should_throttle_delayed_refs(trans, - fs_info)) + if (btrfs_should_throttle_delayed_refs(trans)) should_throttle = true; } } @@ -4868,7 +4496,7 @@ if (path->slots[0] == 0 || path->slots[0] != pending_del_slot || - should_throttle || should_end) { + should_throttle) { if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, @@ -4880,23 +4508,24 @@ pending_del_nr = 0; } btrfs_release_path(path); - if (should_throttle) { - unsigned long updates = trans->delayed_ref_updates; - if (updates) { - trans->delayed_ref_updates = 0; - ret = btrfs_run_delayed_refs(trans, - updates * 2); - if (ret) - break; - } - } + /* - * if we failed to refill our space rsv, bail out - * and let the transaction restart + * We can generate a lot of delayed refs, so we need to + * throttle every once and a while and make sure we're + * adding enough space to keep up with the work we are + * generating. Since we hold a transaction here we + * can't flush, and we don't want to FLUSH_LIMIT because + * we could have generated too many delayed refs to + * actually allocate, so just bail if we're short and + * let the normal reservation dance happen higher up. */ - if (should_end) { - ret = -EAGAIN; - break; + if (should_throttle) { + ret = btrfs_delayed_refs_rsv_refill(fs_info, + BTRFS_RESERVE_NO_FLUSH); + if (ret) { + ret = -EAGAIN; + break; + } } goto search_again; } else { @@ -4918,22 +4547,12 @@ ASSERT(last_size >= new_size); if (!ret && last_size > new_size) last_size = new_size; - btrfs_ordered_update_i_size(inode, last_size, NULL); + btrfs_inode_safe_disk_i_size_write(inode, last_size); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, + (u64)-1, &cached_state); } btrfs_free_path(path); - - if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) { - unsigned long updates = trans->delayed_ref_updates; - int err; - - if (updates) { - trans->delayed_ref_updates = 0; - err = btrfs_run_delayed_refs(trans, updates * 2); - if (err) - ret = err; - } - } return ret; } @@ -4958,11 +4577,13 @@ struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; char *kaddr; + bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); struct page *page; gfp_t mask = btrfs_alloc_write_mask(mapping); + size_t write_bytes = blocksize; int ret = 0; u64 block_start; u64 block_end; @@ -4974,15 +4595,28 @@ block_start = round_down(from, blocksize); block_end = block_start + blocksize - 1; - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, - block_start, blocksize); - if (ret) + ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, + block_start, blocksize); + if (ret < 0) { + if (btrfs_check_nocow_lock(BTRFS_I(inode), block_start, + &write_bytes) > 0) { + /* For nocow case, no need to reserve data space */ + only_release_metadata = true; + } else { + goto out; + } + } + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize); + if (ret < 0) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(BTRFS_I(inode), + data_reserved, block_start, blocksize); goto out; - + } again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(inode, data_reserved, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, block_start, blocksize, true); btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); ret = -ENOMEM; @@ -5007,24 +4641,23 @@ lock_extent_bits(io_tree, block_start, block_end, &cached_state); set_page_extent_mapped(page); - ordered = btrfs_lookup_ordered_extent(inode, block_start); + ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), block_start); if (ordered) { unlock_extent_cached(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state); + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, &cached_state); - ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, - &cached_state, 0); + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), block_start, block_end, 0, + &cached_state); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, &cached_state); @@ -5048,14 +4681,26 @@ set_page_dirty(page); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); + if (only_release_metadata) + set_extent_bit(&BTRFS_I(inode)->io_tree, block_start, + block_end, EXTENT_NORESERVE, NULL, NULL, + GFP_NOFS); + out_unlock: - if (ret) - btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize, true); + if (ret) { + if (only_release_metadata) + btrfs_delalloc_release_metadata(BTRFS_I(inode), + blocksize, true); + else + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, + block_start, blocksize, true); + } btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page); out: + if (only_release_metadata) + btrfs_check_nocow_unlock(BTRFS_I(inode)); extent_changeset_free(data_reserved); return ret; } @@ -5137,25 +4782,12 @@ if (size <= hole_start) return 0; - while (1) { - struct btrfs_ordered_extent *ordered; - - lock_extent_bits(io_tree, hole_start, block_end - 1, - &cached_state); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start, - block_end - hole_start); - if (!ordered) - break; - unlock_extent_cached(io_tree, hole_start, block_end - 1, - &cached_state); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } - + btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start, + block_end - 1, &cached_state); cur_offset = hole_start; while (1) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, - block_end - cur_offset, 0); + block_end - cur_offset); if (IS_ERR(em)) { err = PTR_ERR(em); em = NULL; @@ -5163,14 +4795,21 @@ } last_byte = min(extent_map_end(em), block_end); last_byte = ALIGN(last_byte, fs_info->sectorsize); + hole_size = last_byte - cur_offset; + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; - hole_size = last_byte - cur_offset; err = maybe_insert_hole(root, inode, cur_offset, hole_size); if (err) break; + + err = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + cur_offset, hole_size); + if (err) + break; + btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); @@ -5187,7 +4826,6 @@ hole_em->block_len = 0; hole_em->orig_block_len = 0; hole_em->ram_bytes = hole_size; - hole_em->bdev = fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = fs_info->generation; @@ -5203,6 +4841,11 @@ hole_size - 1, 0); } free_extent_map(hole_em); + } else { + err = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + cur_offset, hole_size); + if (err) + break; } next: free_extent_map(em); @@ -5246,42 +4889,39 @@ * truncation, it must capture all writes that happened before * this truncation. */ - btrfs_wait_for_snapshot_creation(root); + btrfs_drew_write_lock(&root->snapshot_lock); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); return ret; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); return PTR_ERR(trans); } i_size_write(inode, newsize); - btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); + btrfs_inode_safe_disk_i_size_write(inode, 0); pagecache_isize_extended(inode, oldsize, newsize); ret = btrfs_update_inode(trans, root, inode); - btrfs_end_write_no_snapshotting(root); + btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { /* * We're truncating a file that used to have good data down to - * zero. Make sure it gets into the ordered flush list so that - * any new writes get down to disk quickly. + * zero. Make sure any new writes to the file get on disk + * on close. */ if (newsize == 0) - set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags); truncate_setsize(inode, newsize); - /* Disable nonlocked read DIO to avoid the end less truncate */ - btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); - btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); ret = btrfs_truncate(inode, newsize == oldsize); if (ret && inode->i_nlink) { @@ -5356,10 +4996,10 @@ truncate_inode_pages_final(&inode->i_data); write_lock(&map_tree->lock); - while (!RB_EMPTY_ROOT(&map_tree->map)) { + while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) { struct extent_map *em; - node = rb_first(&map_tree->map); + node = rb_first_cached(&map_tree->map); em = rb_entry(node, struct extent_map, rb_node); clear_bit(EXTENT_FLAG_PINNED, &em->flags); clear_bit(EXTENT_FLAG_LOGGING, &em->flags); @@ -5375,8 +5015,8 @@ /* * Keep looping until we have no more ranges in the io tree. - * We can have ongoing bios started by readpages (called from readahead) - * that have their endio callback (extent_io.c:end_bio_extent_readpage) + * We can have ongoing bios started by readahead that have + * their endio callback (extent_io.c:end_bio_extent_readpage) * still in progress (unlocked the pages in the bio but did not yet * unlocked the ranges in the io tree). Therefore this means some * ranges can still be locked and eviction started because before @@ -5415,12 +5055,13 @@ * Note, end is the bytenr of last byte, so we need + 1 here. */ if (state_flags & EXTENT_DELALLOC) - btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); + btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, + end - start + 1); clear_extent_bit(io_tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 1, &cached_state); + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, + &cached_state); cond_resched(); spin_lock(&io_tree->lock); @@ -5429,43 +5070,54 @@ } static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, - struct btrfs_block_rsv *rsv, - u64 min_size) + struct btrfs_block_rsv *rsv) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - int failures = 0; + struct btrfs_trans_handle *trans; + u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); + int ret; - for (;;) { - struct btrfs_trans_handle *trans; - int ret; - - ret = btrfs_block_rsv_refill(root, rsv, min_size, - BTRFS_RESERVE_FLUSH_LIMIT); - - if (ret && ++failures > 2) { - btrfs_warn(fs_info, - "could not allocate space for a delete; will truncate on mount"); - return ERR_PTR(-ENOSPC); - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans) || !ret) - return trans; - + /* + * Eviction should be taking place at some place safe because of our + * delayed iputs. However the normal flushing code will run delayed + * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock. + * + * We reserve the delayed_refs_extra here again because we can't use + * btrfs_start_transaction(root, 0) for the same deadlocky reason as + * above. We reserve our extra bit here because we generate a ton of + * delayed refs activity by truncating. + * + * If we cannot make our reservation we'll attempt to steal from the + * global reserve, because we really want to be able to free up space. + */ + ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra, + BTRFS_RESERVE_FLUSH_EVICT); + if (ret) { /* * Try to steal from the global reserve if there is space for * it. */ - if (!btrfs_check_space_for_delayed_refs(trans, fs_info) && - !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0)) - return trans; - - /* If not, commit and try again. */ - ret = btrfs_commit_transaction(trans); - if (ret) - return ERR_PTR(ret); + if (btrfs_check_space_for_delayed_refs(fs_info) || + btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) { + btrfs_warn(fs_info, + "could not allocate space for delete; will truncate on mount"); + return ERR_PTR(-ENOSPC); + } + delayed_refs_extra = 0; } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return trans; + + if (delayed_refs_extra) { + trans->block_rsv = &fs_info->trans_block_rsv; + trans->bytes_reserved = delayed_refs_extra; + btrfs_block_rsv_migrate(rsv, trans->block_rsv, + delayed_refs_extra, 1); + } + return trans; } void btrfs_evict_inode(struct inode *inode) @@ -5474,7 +5126,6 @@ struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; - u64 min_size; int ret; trace_btrfs_inode_evict(inode); @@ -5483,8 +5134,6 @@ clear_inode(inode); return; } - - min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); evict_inode_truncate_pages(inode); @@ -5496,9 +5145,6 @@ if (is_bad_inode(inode)) goto no_delete; - /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ - if (!special_file(inode->i_mode)) - btrfs_wait_ordered_range(inode, 0, (u64)-1); btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); @@ -5518,13 +5164,13 @@ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) goto no_delete; - rsv->size = min_size; + rsv->size = btrfs_calc_metadata_size(fs_info, 1); rsv->failfast = 1; btrfs_i_size_write(BTRFS_I(inode), 0); while (1) { - trans = evict_refill_and_join(root, rsv, min_size); + trans = evict_refill_and_join(root, rsv); if (IS_ERR(trans)) goto free_rsv; @@ -5549,7 +5195,7 @@ * If it turns out that we are dropping too many of these, we might want * to add a mechanism for retrying these after a commit. */ - trans = evict_refill_and_join(root, rsv, min_size); + trans = evict_refill_and_join(root, rsv); if (!IS_ERR(trans)) { trans->block_rsv = rsv; btrfs_orphan_del(trans, BTRFS_I(inode)); @@ -5596,12 +5242,8 @@ di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), name, namelen, 0); - if (!di) { - ret = -ENOENT; - goto out; - } - if (IS_ERR(di)) { - ret = PTR_ERR(di); + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; goto out; } @@ -5672,7 +5314,7 @@ btrfs_release_path(path); - new_root = btrfs_read_fs_root_no_name(fs_info, location); + new_root = btrfs_get_fs_root(fs_info, location->objectid, true); if (IS_ERR(new_root)) { err = PTR_ERR(new_root); goto out; @@ -5724,15 +5366,15 @@ spin_unlock(&root->inode_lock); } -static void inode_tree_del(struct inode *inode) +static void inode_tree_del(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; int empty = 0; spin_lock(&root->inode_lock); - if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { - rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + if (!RB_EMPTY_NODE(&inode->rb_node)) { + rb_erase(&inode->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&inode->rb_node); empty = RB_EMPTY_ROOT(&root->inode_tree); } spin_unlock(&root->inode_lock); @@ -5750,29 +5392,32 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; - inode->i_ino = args->location->objectid; - memcpy(&BTRFS_I(inode)->location, args->location, - sizeof(*args->location)); - BTRFS_I(inode)->root = args->root; + + inode->i_ino = args->ino; + BTRFS_I(inode)->location.objectid = args->ino; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + BTRFS_I(inode)->root = btrfs_grab_root(args->root); + BUG_ON(args->root && !BTRFS_I(inode)->root); return 0; } static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; - return args->location->objectid == BTRFS_I(inode)->location.objectid && + + return args->ino == BTRFS_I(inode)->location.objectid && args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(struct super_block *s, - struct btrfs_key *location, +static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; - unsigned long hashval = btrfs_inode_hash(location->objectid, root); + unsigned long hashval = btrfs_inode_hash(ino, root); - args.location = location; + args.ino = ino; args.root = root; inode = iget5_locked(s, hashval, btrfs_find_actor, @@ -5781,16 +5426,18 @@ return inode; } -/* Get an inode object given its location and corresponding root. - * Returns in *is_new if the inode was read from disk +/* + * Get an inode object given its inode number and corresponding root. + * Path can be preallocated to prevent recursing back to iget through + * allocator. NULL is also valid but may require an additional allocation + * later. */ -struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *new, - struct btrfs_path *path) +struct inode *btrfs_iget_path(struct super_block *s, u64 ino, + struct btrfs_root *root, struct btrfs_path *path) { struct inode *inode; - inode = btrfs_iget_locked(s, location, root); + inode = btrfs_iget_locked(s, ino, root); if (!inode) return ERR_PTR(-ENOMEM); @@ -5801,8 +5448,6 @@ if (!ret) { inode_tree_add(inode); unlock_new_inode(inode); - if (new) - *new = 1; } else { iget_failed(inode); /* @@ -5819,10 +5464,9 @@ return inode; } -struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *new) +struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root) { - return btrfs_iget_path(s, location, root, new, NULL); + return btrfs_iget_path(s, ino, root, NULL); } static struct inode *new_simple_dir(struct super_block *s, @@ -5834,12 +5478,16 @@ if (!inode) return ERR_PTR(-ENOMEM); - BTRFS_I(inode)->root = root; + BTRFS_I(inode)->root = btrfs_grab_root(root); memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; - inode->i_op = &btrfs_dir_ro_inode_operations; + /* + * We only need lookup, the rest is read-only and there's no inode + * associated with the dentry + */ + inode->i_op = &simple_dir_inode_operations; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; @@ -5853,7 +5501,20 @@ static inline u8 btrfs_inode_type(struct inode *inode) { - return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; + /* + * Compile-time asserts that generic FT_* types still match + * BTRFS_FT_* types + */ + BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN); + BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE); + BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR); + BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV); + BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV); + BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO); + BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK); + BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK); + + return fs_umode_to_ftype(inode->i_mode); } struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) @@ -5864,7 +5525,6 @@ struct btrfs_root *sub_root = root; struct btrfs_key location; u8 di_type = 0; - int index; int ret = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) @@ -5875,7 +5535,7 @@ return ERR_PTR(ret); if (location.type == BTRFS_INODE_ITEM_KEY) { - inode = btrfs_iget(dir->i_sb, &location, root, NULL); + inode = btrfs_iget(dir->i_sb, location.objectid, root); if (IS_ERR(inode)) return inode; @@ -5891,7 +5551,6 @@ return inode; } - index = srcu_read_lock(&fs_info->subvol_srcu); ret = fixup_tree_root_location(fs_info, dir, dentry, &location, &sub_root); if (ret < 0) { @@ -5900,9 +5559,10 @@ else inode = new_simple_dir(dir->i_sb, &location, sub_root); } else { - inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); + inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); } - srcu_read_unlock(&fs_info->subvol_srcu, index); + if (root != sub_root) + btrfs_put_root(sub_root); if (!IS_ERR(inode) && root != sub_root) { down_read(&fs_info->cleanup_work_sem); @@ -5940,22 +5600,12 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct inode *inode; + struct inode *inode = btrfs_lookup_dentry(dir, dentry); - inode = btrfs_lookup_dentry(dir, dentry); - if (IS_ERR(inode)) { - if (PTR_ERR(inode) == -ENOENT) - inode = NULL; - else - return ERR_CAST(inode); - } - + if (inode == ERR_PTR(-ENOENT)) + inode = NULL; return d_splice_alias(inode, dentry); } - -unsigned char btrfs_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; /* * All this infrastructure exists because dir_emit can fault, and we are holding @@ -6095,7 +5745,7 @@ name_ptr = (char *)(entry + 1); read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), name_len); - put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)], + put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)), &entry->type); btrfs_dir_item_key_to_cpu(leaf, di, &location); put_unaligned(location.objectid, &entry->ino); @@ -6167,7 +5817,7 @@ return PTR_ERR(trans); ret = btrfs_update_inode(trans, root, inode); - if (ret && ret == -ENOSPC) { + if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); trans = btrfs_start_transaction(root, 1); @@ -6290,7 +5940,8 @@ static int btrfs_insert_inode_locked(struct inode *inode) { struct btrfs_iget_args args; - args.location = &BTRFS_I(inode)->location; + + args.ino = BTRFS_I(inode)->location.objectid; args.root = BTRFS_I(inode)->root; return insert_inode_locked4(inode, @@ -6346,13 +5997,16 @@ u32 sizes[2]; int nitems = name ? 2 : 1; unsigned long ptr; + unsigned int nofs_flag; int ret; path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); + nofs_flag = memalloc_nofs_save(); inode = new_inode(fs_info->sb); + memalloc_nofs_restore(nofs_flag); if (!inode) { btrfs_free_path(path); return ERR_PTR(-ENOMEM); @@ -6390,7 +6044,7 @@ */ BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->dir_index = *index; - BTRFS_I(inode)->root = root; + BTRFS_I(inode)->root = btrfs_grab_root(root); BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; @@ -6477,7 +6131,7 @@ inode_tree_add(inode); trace_btrfs_inode_new(inode); - btrfs_set_inode_last_trans(trans, inode); + btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); btrfs_update_root_times(trans, root); @@ -6535,8 +6189,7 @@ if (ret) return ret; - ret = btrfs_insert_dir_item(trans, root, name, name_len, - parent_inode, &key, + ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key, btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; @@ -6620,7 +6273,7 @@ if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6684,7 +6337,7 @@ if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6719,7 +6372,6 @@ if (err) goto out_unlock; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; d_instantiate_new(dentry, inode); out_unlock: @@ -6744,7 +6396,7 @@ int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ - if (root->objectid != BTRFS_I(inode)->root->objectid) + if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) return -EXDEV; if (inode->i_nlink >= BTRFS_LINK_MAX) @@ -6782,7 +6434,6 @@ drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; - int ret; err = btrfs_update_inode(trans, root, inode); if (err) @@ -6796,14 +6447,8 @@ if (err) goto fail; } - BTRFS_I(inode)->last_link_trans = trans->transid; d_instantiate(dentry, inode); - ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, - true, NULL); - if (ret == BTRFS_NEED_TRANS_COMMIT) { - err = btrfs_commit_transaction(trans); - trans = NULL; - } + btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); } fail: @@ -6824,7 +6469,6 @@ struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; int err = 0; - int drop_on_err = 0; u64 objectid = 0; u64 index = 0; @@ -6837,7 +6481,7 @@ if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_fail; @@ -6850,7 +6494,6 @@ goto out_fail; } - drop_on_err = 1; /* these must be set before we unlock the inode */ inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; @@ -6871,7 +6514,6 @@ goto out_fail; d_instantiate_new(dentry, inode); - drop_on_err = 0; out_fail: btrfs_end_transaction(trans); @@ -6929,26 +6571,34 @@ return ret; } -/* - * a bit scary, this does extent mapping from logical file offset to the disk. - * the ugly parts come from merging extents from the disk with the in-ram - * representation. This gets more complex because of the data=ordered code, - * where the in-ram extents might be locked pending data=ordered completion. +/** + * btrfs_get_extent - Lookup the first extent overlapping a range in a file. + * @inode: file to search in + * @page: page to read extent data into if the extent is inline + * @pg_offset: offset into @page to copy to + * @start: file offset + * @len: length of range starting at @start * - * This also copies inline extents directly into the page. + * This returns the first &struct extent_map which overlaps with the given + * range, reading it from the B-tree and caching it if necessary. Note that + * there may be more extents which overlap the given range after the returned + * extent_map. + * + * If @page is not NULL and the extent is inline, this also reads the extent + * data directly into the page and marks the extent up to date in the io_tree. + * + * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) + struct page *page, size_t pg_offset, + u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - int ret; - int err = 0; + int ret = 0; u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); - u32 found_type; + int extent_type = -1; struct btrfs_path *path = NULL; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *item; @@ -6957,12 +6607,9 @@ struct extent_map *em = NULL; struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_io_tree *io_tree = &inode->io_tree; - const bool new_inline = !page || create; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); - if (em) - em->bdev = fs_info->fs_devices->latest_bdev; read_unlock(&em_tree->lock); if (em) { @@ -6975,48 +6622,47 @@ } em = alloc_extent_map(); if (!em) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } - em->bdev = fs_info->fs_devices->latest_bdev; em->start = EXTENT_MAP_HOLE; em->orig_start = EXTENT_MAP_HOLE; em->len = (u64)-1; em->block_len = (u64)-1; + path = btrfs_alloc_path(); if (!path) { - path = btrfs_alloc_path(); - if (!path) { - err = -ENOMEM; - goto out; - } - /* - * Chances are we'll be called again, so go ahead and do - * readahead - */ - path->reada = READA_FORWARD; - } - - ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); - if (ret < 0) { - err = ret; + ret = -ENOMEM; goto out; } - if (ret != 0) { + /* Chances are we'll be called again, so go ahead and do readahead */ + path->reada = READA_FORWARD; + + /* + * Unless we're going to uncompress the inline extent, no sleep would + * happen. + */ + path->leave_spinning = 1; + + path->recurse = btrfs_is_free_space_inode(inode); + + ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { if (path->slots[0] == 0) goto not_found; path->slots[0]--; + ret = 0; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - /* are we inside the extent that was found? */ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - found_type = found_key.type; if (found_key.objectid != objectid || - found_type != BTRFS_EXTENT_DATA_KEY) { + found_key.type != BTRFS_EXTENT_DATA_KEY) { /* * If we backup past the first extent we want to move forward * and see if there is an extent in front of us, otherwise we'll @@ -7027,30 +6673,22 @@ goto next; } - found_type = btrfs_file_extent_type(leaf, item); + extent_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = btrfs_file_extent_end(path); + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ if (!S_ISREG(inode->vfs_inode.i_mode)) { - err = -EUCLEAN; + ret = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", btrfs_ino(inode)); goto out; } - extent_end = extent_start + - btrfs_file_extent_num_bytes(leaf, item); - trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, extent_start); - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size_t size; - - size = btrfs_file_extent_ram_bytes(leaf, item); - extent_end = ALIGN(extent_start + size, - fs_info->sectorsize); - + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, path->slots[0], extent_start); @@ -7060,12 +6698,11 @@ path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } - if (ret > 0) + else if (ret > 0) goto not_found; + leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); @@ -7076,26 +6713,28 @@ goto not_found; if (start > found_key.offset) goto next; + + /* New extent overlaps with existing one */ em->start = start; em->orig_start = start; em->len = found_key.offset - start; - goto not_found_em; + em->block_start = EXTENT_MAP_HOLE; + goto insert; } - btrfs_extent_item_to_extent_map(inode, path, item, - new_inline, em); + btrfs_extent_item_to_extent_map(inode, path, item, !page, em); - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { goto insert; - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { unsigned long ptr; char *map; size_t size; size_t extent_offset; size_t copy_size; - if (new_inline) + if (!page) goto out; size = btrfs_file_extent_ram_bytes(leaf, item); @@ -7107,15 +6746,15 @@ em->orig_block_len = em->len; em->orig_start = em->start; ptr = btrfs_file_extent_inline_start(item) + extent_offset; + + btrfs_set_path_blocking(path); if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, page, pg_offset, extent_offset, item); - if (ret) { - err = ret; + if (ret) goto out; - } } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -7137,49 +6776,45 @@ em->start = start; em->orig_start = start; em->len = len; -not_found_em: em->block_start = EXTENT_MAP_HOLE; insert: + ret = 0; btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); - err = -EIO; + ret = -EIO; goto out; } - err = 0; write_lock(&em_tree->lock); - err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); out: + btrfs_free_path(path); trace_btrfs_get_extent(root, inode, em); - btrfs_free_path(path); - if (err) { + if (ret) { free_extent_map(em); - return ERR_PTR(err); + return ERR_PTR(ret); } - BUG_ON(!em); /* Error is always set */ return em; } struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) + u64 start, u64 len) { struct extent_map *em; struct extent_map *hole_em = NULL; - u64 range_start = start; + u64 delalloc_start = start; u64 end; - u64 found; - u64 found_end; + u64 delalloc_len; + u64 delalloc_end; int err = 0; - em = btrfs_get_extent(inode, page, pg_offset, start, len, create); + em = btrfs_get_extent(inode, NULL, 0, start, len); if (IS_ERR(em)) return em; /* @@ -7204,80 +6839,83 @@ em = NULL; /* ok, we didn't find anything, lets look for delalloc */ - found = count_range_bits(&inode->io_tree, &range_start, + delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start, end, len, EXTENT_DELALLOC, 1); - found_end = range_start + found; - if (found_end < range_start) - found_end = (u64)-1; + delalloc_end = delalloc_start + delalloc_len; + if (delalloc_end < delalloc_start) + delalloc_end = (u64)-1; /* - * we didn't find anything useful, return - * the original results from get_extent() + * We didn't find anything useful, return the original results from + * get_extent() */ - if (range_start > end || found_end <= start) { + if (delalloc_start > end || delalloc_end <= start) { em = hole_em; hole_em = NULL; goto out; } - /* adjust the range_start to make sure it doesn't - * go backwards from the start they passed in + /* + * Adjust the delalloc_start to make sure it doesn't go backwards from + * the start they passed in */ - range_start = max(start, range_start); - found = found_end - range_start; + delalloc_start = max(start, delalloc_start); + delalloc_len = delalloc_end - delalloc_start; - if (found > 0) { - u64 hole_start = start; - u64 hole_len = len; + if (delalloc_len > 0) { + u64 hole_start; + u64 hole_len; + const u64 hole_end = extent_map_end(hole_em); em = alloc_extent_map(); if (!em) { err = -ENOMEM; goto out; } - /* - * when btrfs_get_extent can't find anything it - * returns one huge hole - * - * make sure what it found really fits our range, and - * adjust to make sure it is based on the start from - * the caller - */ - if (hole_em) { - u64 calc_end = extent_map_end(hole_em); - if (calc_end <= start || (hole_em->start > end)) { - free_extent_map(hole_em); - hole_em = NULL; - } else { - hole_start = max(hole_em->start, start); - hole_len = calc_end - hole_start; - } + ASSERT(hole_em); + /* + * When btrfs_get_extent can't find anything it returns one + * huge hole + * + * Make sure what it found really fits our range, and adjust to + * make sure it is based on the start from the caller + */ + if (hole_end <= start || hole_em->start > end) { + free_extent_map(hole_em); + hole_em = NULL; + } else { + hole_start = max(hole_em->start, start); + hole_len = hole_end - hole_start; } - em->bdev = NULL; - if (hole_em && range_start > hole_start) { - /* our hole starts before our delalloc, so we - * have to return just the parts of the hole - * that go until the delalloc starts + + if (hole_em && delalloc_start > hole_start) { + /* + * Our hole starts before our delalloc, so we have to + * return just the parts of the hole that go until the + * delalloc starts */ - em->len = min(hole_len, - range_start - hole_start); + em->len = min(hole_len, delalloc_start - hole_start); em->start = hole_start; em->orig_start = hole_start; /* - * don't adjust block start at all, - * it is fixed at EXTENT_MAP_HOLE + * Don't adjust block start at all, it is fixed at + * EXTENT_MAP_HOLE */ em->block_start = hole_em->block_start; em->block_len = hole_len; if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) set_bit(EXTENT_FLAG_PREALLOC, &em->flags); } else { - em->start = range_start; - em->len = found; - em->orig_start = range_start; + /* + * Hole is out of passed range or it starts after + * delalloc range + */ + em->start = delalloc_start; + em->len = delalloc_len; + em->orig_start = delalloc_start; em->block_start = EXTENT_MAP_DELALLOC; - em->block_len = found; + em->block_len = delalloc_len; } } else { return hole_em; @@ -7292,7 +6930,7 @@ return em; } -static struct extent_map *btrfs_create_dio_extent(struct inode *inode, +static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, const u64 start, const u64 len, const u64 orig_start, @@ -7306,21 +6944,19 @@ int ret; if (type != BTRFS_ORDERED_NOCOW) { - em = create_io_em(inode, start, len, orig_start, - block_start, block_len, orig_block_len, - ram_bytes, + em = create_io_em(inode, start, len, orig_start, block_start, + block_len, orig_block_len, ram_bytes, BTRFS_COMPRESS_NONE, /* compress_type */ type); if (IS_ERR(em)) goto out; } - ret = btrfs_add_ordered_extent_dio(inode, start, block_start, - len, block_len, type); + ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, + block_len, type); if (ret) { if (em) { free_extent_map(em); - btrfs_drop_extent_cache(BTRFS_I(inode), start, - start + len - 1, 0); + btrfs_drop_extent_cache(inode, start, start + len - 1, 0); } em = ERR_PTR(ret); } @@ -7329,11 +6965,11 @@ return em; } -static struct extent_map *btrfs_new_extent_direct(struct inode *inode, +static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, u64 start, u64 len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_map *em; struct btrfs_key ins; u64 alloc_hint; @@ -7350,19 +6986,38 @@ ins.offset, BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(em)) - btrfs_free_reserved_extent(fs_info, ins.objectid, - ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, + 1); return em; } /* - * returns 1 when the nocow is safe, < 1 on error, 0 if the - * block must be cow'd + * Check if we can do nocow write into the range [@offset, @offset + @len) + * + * @offset: File offset + * @len: The length to write, will be updated to the nocow writeable + * range + * @orig_start: (optional) Return the original file offset of the file extent + * @orig_len: (optional) Return the original on-disk length of the file extent + * @ram_bytes: (optional) Return the ram_bytes of the file extent + * @strict: if true, omit optimizations that might force us into unnecessary + * cow. e.g., don't trust generation number. + * + * This function will flush ordered extents in the range to ensure proper + * nocow checks for (nowait == false) case. + * + * Return: + * >0 and update @len if we can do nocow write + * 0 if we can't do nocow write + * <0 if error happened + * + * NOTE: This only checks the file extents, caller is responsible to wait for + * any ordered extents. */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes) + u64 *ram_bytes, bool strict) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_path *path; @@ -7440,8 +7095,9 @@ * Do the same check as in btrfs_cross_ref_exist but without the * unnecessary search. */ - if (btrfs_file_extent_generation(leaf, fi) <= - btrfs_root_last_snapshot(&root->root_item)) + if (!strict && + (btrfs_file_extent_generation(leaf, fi) <= + btrfs_root_last_snapshot(&root->root_item))) goto out; backref_offset = btrfs_file_extent_offset(leaf, fi); @@ -7477,7 +7133,8 @@ */ ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), - key.offset - backref_offset, disk_bytenr); + key.offset - backref_offset, disk_bytenr, + strict); if (ret) { ret = 0; goto out; @@ -7505,7 +7162,7 @@ } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, int writing) + struct extent_state **cached_state, bool writing) { struct btrfs_ordered_extent *ordered; int ret = 0; @@ -7554,7 +7211,7 @@ */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); else ret = -ENOTBLK; btrfs_put_ordered_extent(ordered); @@ -7564,11 +7221,11 @@ * for it to complete) and then invalidate the pages for * this range (through invalidate_inode_pages2_range()), * but that can lead us to a deadlock with a concurrent - * call to readpages() (a buffered read or a defrag call + * call to readahead (a buffered read or a defrag call * triggered a readahead) on a page lock due to an * ordered dio extent we created before but did not have * yet a corresponding bio submitted (whence it can not - * complete), which makes readpages() wait for that + * complete), which makes readahead wait for that * ordered extent to complete while holding a lock on * that page. */ @@ -7585,15 +7242,14 @@ } /* The callers of this must take lock_extent() */ -static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, - u64 orig_start, u64 block_start, +static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, + u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int compress_type, int type) { struct extent_map_tree *em_tree; struct extent_map *em; - struct btrfs_root *root = BTRFS_I(inode)->root; int ret; ASSERT(type == BTRFS_ORDERED_PREALLOC || @@ -7601,7 +7257,7 @@ type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_REGULAR); - em_tree = &BTRFS_I(inode)->extent_tree; + em_tree = &inode->extent_tree; em = alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); @@ -7611,7 +7267,6 @@ em->len = len; em->block_len = block_len; em->block_start = block_start; - em->bdev = root->fs_info->fs_devices->latest_bdev; em->orig_block_len = orig_block_len; em->ram_bytes = ram_bytes; em->generation = -1; @@ -7624,8 +7279,8 @@ } do { - btrfs_drop_extent_cache(BTRFS_I(inode), em->start, - em->start + em->len - 1, 0); + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); @@ -7645,28 +7300,7 @@ } -static int btrfs_get_blocks_direct_read(struct extent_map *em, - struct buffer_head *bh_result, - struct inode *inode, - u64 start, u64 len) -{ - if (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return -ENOENT; - - len = min(len, em->len - (start - em->start)); - - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = em->bdev; - set_buffer_mapped(bh_result); - - return 0; -} - static int btrfs_get_blocks_direct_write(struct extent_map **map, - struct buffer_head *bh_result, struct inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 len) @@ -7698,11 +7332,11 @@ block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1 && + &orig_block_len, &ram_bytes, false) == 1 && btrfs_inc_nocow_writers(fs_info, block_start)) { struct extent_map *em2; - em2 = btrfs_create_dio_extent(inode, start, len, + em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, orig_start, block_start, len, orig_block_len, ram_bytes, type); @@ -7721,16 +7355,14 @@ * use the existing or preallocated extent, so does not * need to adjust btrfs_space_info's bytes_may_use. */ - btrfs_free_reserved_data_space_noquota(inode, start, - len); + btrfs_free_reserved_data_space_noquota(fs_info, len); goto skip_cow; } } /* this will cow the extent */ - len = bh_result->b_size; free_extent_map(em); - *map = em = btrfs_new_extent_direct(inode, start, len); + *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -7739,72 +7371,93 @@ len = min(len, em->len - (start - em->start)); skip_cow: - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = em->bdev; - set_buffer_mapped(bh_result); - - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - set_buffer_new(bh_result); - /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. */ - if (!dio_data->overwrite && start + len > i_size_read(inode)) + if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; - dio_data->unsubmitted_oe_range_end = start + len; - current->journal_info = dio_data; out: return ret; } -static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + loff_t length, unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = NULL; - u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; - u64 len = bh_result->b_size; - int unlock_bits = EXTENT_LOCKED; + const bool write = !!(flags & IOMAP_WRITE); int ret = 0; + u64 len = length; + bool unlock_extents = false; + bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB); - if (create) - unlock_bits |= EXTENT_DIRTY; - else + /* + * We used current->journal_info here to see if we were sync, but + * there's a lot of tests in the enospc machinery to not do flushing if + * we have a journal_info set, so we need to clear this out and re-set + * it in iomap_end. + */ + ASSERT(current->journal_info == NULL || + current->journal_info == BTRFS_DIO_SYNC_STUB); + current->journal_info = NULL; + + if (!write) len = min_t(u64, len, fs_info->sectorsize); lockstart = start; lockend = start + len - 1; - if (current->journal_info) { - /* - * Need to pull our outstanding extents and set journal_info to NULL so - * that anything that needs to check if there's a transaction doesn't get - * confused. - */ - dio_data = current->journal_info; - current->journal_info = NULL; + /* + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so we + * need to flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk. + */ + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + if (ret) + return ret; } + + dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); + if (!dio_data) + return -ENOMEM; + + dio_data->sync = sync; + dio_data->length = length; + if (write) { + dio_data->reserve = round_up(length, fs_info->sectorsize); + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, dio_data->reserve); + if (ret) { + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + return ret; + } + } + iomap->private = dio_data; + /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. */ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, - create)) { + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { ret = -ENOTBLK; goto err; } - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; @@ -7827,443 +7480,253 @@ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || em->block_start == EXTENT_MAP_INLINE) { free_extent_map(em); - ret = -ENOTBLK; + /* + * If we are in a NOWAIT context, return -EAGAIN in order to + * fallback to buffered IO. This is not only because we can + * block with buffered IO (no support for NOWAIT semantics at + * the moment) but also to avoid returning short reads to user + * space - this happens if we were able to read some data from + * previous non-compressed extents and then when we fallback to + * buffered IO, at btrfs_file_read_iter() by calling + * filemap_read(), we fail to fault in pages for the read buffer, + * in which case filemap_read() returns a short read (the number + * of bytes previously read is > 0, so it does not return -EFAULT). + */ + ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK; goto unlock_err; } - if (create) { - ret = btrfs_get_blocks_direct_write(&em, bh_result, inode, - dio_data, start, len); + len = min(len, em->len - (start - em->start)); + if (write) { + ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, + start, len); if (ret < 0) goto unlock_err; - - /* clear and unlock the entire range */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, 1, 0, &cached_state); + unlock_extents = true; + /* Recalc len in case the new em is smaller than requested */ + len = min(len, em->len - (start - em->start)); } else { - ret = btrfs_get_blocks_direct_read(em, bh_result, inode, - start, len); - /* Can be negative only if we read from a hole */ - if (ret < 0) { - ret = 0; - free_extent_map(em); - goto unlock_err; - } /* * We need to unlock only the end area that we aren't using. * The rest is going to be unlocked by the endio routine. */ - lockstart = start + bh_result->b_size; - if (lockstart < lockend) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, unlock_bits, 1, 0, - &cached_state); - } else { - free_extent_state(cached_state); - } + lockstart = start + len; + if (lockstart < lockend) + unlock_extents = true; } + + if (unlock_extents) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state); + else + free_extent_state(cached_state); + + /* + * Translate extent map information to iomap. + * We trim the extents (and move the addr) even though iomap code does + * that, since we have locked only the parts we are performing I/O in. + */ + if ((em->block_start == EXTENT_MAP_HOLE) || + (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + } else { + iomap->addr = em->block_start + (start - em->start); + iomap->type = IOMAP_MAPPED; + } + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_bdev; + iomap->length = len; free_extent_map(em); return 0; unlock_err: - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, 1, 0, &cached_state); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); err: - if (dio_data) - current->journal_info = dio_data; + if (dio_data) { + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, start, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + } return ret; } -static inline blk_status_t submit_dio_repair_bio(struct inode *inode, - struct bio *bio, - int mirror_num) +static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, struct iomap *iomap) { + int ret = 0; + struct btrfs_dio_data *dio_data = iomap->private; + size_t submitted = dio_data->submitted; + const bool write = !!(flags & IOMAP_WRITE); + + if (!write && (iomap->type == IOMAP_HOLE)) { + /* If reading from a hole, unlock and return */ + unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); + goto out; + } + + if (submitted < length) { + pos += submitted; + length -= submitted; + if (write) + __endio_write_update_ordered(BTRFS_I(inode), pos, + length, false); + else + unlock_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1); + ret = -ENOTBLK; + } + + if (write) { + if (dio_data->reserve) + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, pos, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); + extent_changeset_free(dio_data->data_reserved); + } +out: + /* + * We're all done, we can re-set the current->journal_info now safely + * for our endio. + */ + if (dio_data->sync) { + ASSERT(current->journal_info == NULL); + current->journal_info = BTRFS_DIO_SYNC_STUB; + } + kfree(dio_data); + iomap->private = NULL; + + return ret; +} + +static void btrfs_dio_private_put(struct btrfs_dio_private *dip) +{ + /* + * This implies a barrier so that stores to dio_bio->bi_status before + * this and loads of dio_bio->bi_status after this are fully ordered. + */ + if (!refcount_dec_and_test(&dip->refs)) + return; + + if (bio_op(dip->dio_bio) == REQ_OP_WRITE) { + __endio_write_update_ordered(BTRFS_I(dip->inode), + dip->logical_offset, + dip->bytes, + !dip->dio_bio->bi_status); + } else { + unlock_extent(&BTRFS_I(dip->inode)->io_tree, + dip->logical_offset, + dip->logical_offset + dip->bytes - 1); + } + + bio_endio(dip->dio_bio); + kfree(dip); +} + +static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int mirror_num, + unsigned long bio_flags) +{ + struct btrfs_dio_private *dip = bio->bi_private; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); blk_status_t ret; BUG_ON(bio_op(bio) == REQ_OP_WRITE); - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR); + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); if (ret) return ret; - ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); - + refcount_inc(&dip->refs); + ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (ret) + refcount_dec(&dip->refs); return ret; } -static int btrfs_check_dio_repairable(struct inode *inode, - struct bio *failed_bio, - struct io_failure_record *failrec, - int failed_mirror) +static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, + struct btrfs_io_bio *io_bio, + const bool uptodate) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int num_copies; - - num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); - if (num_copies == 1) { - /* - * we only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. no - * matter what the error is, it is very likely to persist. - */ - btrfs_debug(fs_info, - "Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return 0; - } - - failrec->failed_mirror = failed_mirror; - failrec->this_mirror++; - if (failrec->this_mirror == failed_mirror) - failrec->this_mirror++; - - if (failrec->this_mirror > num_copies) { - btrfs_debug(fs_info, - "Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return 0; - } - - return 1; -} - -static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - bio_end_io_t *repair_endio, void *repair_arg) -{ - struct io_failure_record *failrec; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct bio *bio; - int isector; - unsigned int read_mode = 0; - int segs; - int ret; - blk_status_t status; - struct bio_vec bvec; - - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - - ret = btrfs_get_io_failure_record(inode, start, end, &failrec); - if (ret) - return errno_to_blk_status(ret); - - ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, - failed_mirror); - if (!ret) { - free_io_failure(failure_tree, io_tree, failrec); - return BLK_STS_IOERR; - } - - segs = bio_segments(failed_bio); - bio_get_first_bvec(failed_bio, &bvec); - if (segs > 1 || - (bvec.bv_len > btrfs_inode_sectorsize(inode))) - read_mode |= REQ_FAILFAST_DEV; - - isector = start - btrfs_io_bio(failed_bio)->logical; - isector >>= inode->i_sb->s_blocksize_bits; - bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, - pgoff, isector, repair_endio, repair_arg); - bio->bi_opf = REQ_OP_READ | read_mode; - - btrfs_debug(BTRFS_I(inode)->root->fs_info, - "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d", - read_mode, failrec->this_mirror, failrec->in_validation); - - status = submit_dio_repair_bio(inode, bio, failrec->this_mirror); - if (status) { - free_io_failure(failure_tree, io_tree, failrec); - bio_put(bio); - } - - return status; -} - -struct btrfs_retry_complete { - struct completion done; - struct inode *inode; - u64 start; - int uptodate; -}; - -static void btrfs_retry_endio_nocsum(struct bio *bio) -{ - struct btrfs_retry_complete *done = bio->bi_private; - struct inode *inode = done->inode; - struct bio_vec *bvec; - struct extent_io_tree *io_tree, *failure_tree; - int i; - - if (bio->bi_status) - goto end; - - ASSERT(bio->bi_vcnt == 1); - io_tree = &BTRFS_I(inode)->io_tree; - failure_tree = &BTRFS_I(inode)->io_failure_tree; - ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode)); - - done->uptodate = 1; - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) - clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, - io_tree, done->start, bvec->bv_page, - btrfs_ino(BTRFS_I(inode)), 0); -end: - complete(&done->done); - bio_put(bio); -} - -static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode, - struct btrfs_io_bio *io_bio) -{ - struct btrfs_fs_info *fs_info; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct bio_vec bvec; struct bvec_iter iter; - struct btrfs_retry_complete done; - u64 start; - unsigned int pgoff; - u32 sectorsize; - int nr_sectors; - blk_status_t ret; + u64 start = io_bio->logical; + int icsum = 0; blk_status_t err = BLK_STS_OK; - fs_info = BTRFS_I(inode)->root->fs_info; - sectorsize = fs_info->sectorsize; + __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) { + unsigned int i, nr_sectors, pgoff; - start = io_bio->logical; - done.inode = inode; - io_bio->bio.bi_iter = io_bio->iter; - - bio_for_each_segment(bvec, &io_bio->bio, iter) { nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); pgoff = bvec.bv_offset; - -next_block_or_try_again: - done.uptodate = 0; - done.start = start; - init_completion(&done.done); - - ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, - pgoff, start, start + sectorsize - 1, - io_bio->mirror_num, - btrfs_retry_endio_nocsum, &done); - if (ret) { - err = ret; - goto next; - } - - wait_for_completion_io(&done.done); - - if (!done.uptodate) { - /* We might have another mirror, so try again */ - goto next_block_or_try_again; - } - -next: - start += sectorsize; - - nr_sectors--; - if (nr_sectors) { - pgoff += sectorsize; + for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); - goto next_block_or_try_again; + if (uptodate && + (!csum || !check_data_csum(inode, io_bio, icsum, + bvec.bv_page, pgoff, + start, sectorsize))) { + clean_io_failure(fs_info, failure_tree, io_tree, + start, bvec.bv_page, + btrfs_ino(BTRFS_I(inode)), + pgoff); + } else { + blk_status_t status; + + status = btrfs_submit_read_repair(inode, + &io_bio->bio, + start - io_bio->logical, + bvec.bv_page, pgoff, + start, + start + sectorsize - 1, + io_bio->mirror_num, + submit_dio_repair_bio); + if (status) + err = status; + } + start += sectorsize; + icsum++; + pgoff += sectorsize; } } - return err; } -static void btrfs_retry_endio(struct bio *bio) -{ - struct btrfs_retry_complete *done = bio->bi_private; - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - struct extent_io_tree *io_tree, *failure_tree; - struct inode *inode = done->inode; - struct bio_vec *bvec; - int uptodate; - int ret; - int i; - - if (bio->bi_status) - goto end; - - uptodate = 1; - - ASSERT(bio->bi_vcnt == 1); - ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode)); - - io_tree = &BTRFS_I(inode)->io_tree; - failure_tree = &BTRFS_I(inode)->io_failure_tree; - - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) { - ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, - bvec->bv_offset, done->start, - bvec->bv_len); - if (!ret) - clean_io_failure(BTRFS_I(inode)->root->fs_info, - failure_tree, io_tree, done->start, - bvec->bv_page, - btrfs_ino(BTRFS_I(inode)), - bvec->bv_offset); - else - uptodate = 0; - } - - done->uptodate = uptodate; -end: - complete(&done->done); - bio_put(bio); -} - -static blk_status_t __btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, blk_status_t err) -{ - struct btrfs_fs_info *fs_info; - struct bio_vec bvec; - struct bvec_iter iter; - struct btrfs_retry_complete done; - u64 start; - u64 offset = 0; - u32 sectorsize; - int nr_sectors; - unsigned int pgoff; - int csum_pos; - bool uptodate = (err == 0); - int ret; - blk_status_t status; - - fs_info = BTRFS_I(inode)->root->fs_info; - sectorsize = fs_info->sectorsize; - - err = BLK_STS_OK; - start = io_bio->logical; - done.inode = inode; - io_bio->bio.bi_iter = io_bio->iter; - - bio_for_each_segment(bvec, &io_bio->bio, iter) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - - pgoff = bvec.bv_offset; -next_block: - if (uptodate) { - csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); - ret = __readpage_endio_check(inode, io_bio, csum_pos, - bvec.bv_page, pgoff, start, sectorsize); - if (likely(!ret)) - goto next; - } -try_again: - done.uptodate = 0; - done.start = start; - init_completion(&done.done); - - status = dio_read_error(inode, &io_bio->bio, bvec.bv_page, - pgoff, start, start + sectorsize - 1, - io_bio->mirror_num, btrfs_retry_endio, - &done); - if (status) { - err = status; - goto next; - } - - wait_for_completion_io(&done.done); - - if (!done.uptodate) { - /* We might have another mirror, so try again */ - goto try_again; - } -next: - offset += sectorsize; - start += sectorsize; - - ASSERT(nr_sectors); - - nr_sectors--; - if (nr_sectors) { - pgoff += sectorsize; - ASSERT(pgoff < PAGE_SIZE); - goto next_block; - } - } - - return err; -} - -static blk_status_t btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, blk_status_t err) -{ - bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - - if (skip_csum) { - if (unlikely(err)) - return __btrfs_correct_data_nocsum(inode, io_bio); - else - return BLK_STS_OK; - } else { - return __btrfs_subio_endio_read(inode, io_bio, err); - } -} - -static void btrfs_endio_direct_read(struct bio *bio) -{ - struct btrfs_dio_private *dip = bio->bi_private; - struct inode *inode = dip->inode; - struct bio *dio_bio; - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - blk_status_t err = bio->bi_status; - - if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) - err = btrfs_subio_endio_read(inode, io_bio, err); - - unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, - dip->logical_offset + dip->bytes - 1); - dio_bio = dip->dio_bio; - - kfree(dip); - - dio_bio->bi_status = err; - dio_end_io(dio_bio); - - if (io_bio->end_io) - io_bio->end_io(io_bio, blk_status_to_errno(err)); - bio_put(bio); -} - -static void __endio_write_update_ordered(struct inode *inode, +static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_extent *ordered = NULL; struct btrfs_workqueue *wq; - btrfs_work_func_t func; u64 ordered_offset = offset; u64 ordered_bytes = bytes; u64 last_offset; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (btrfs_is_free_space_inode(inode)) wq = fs_info->endio_freespace_worker; - func = btrfs_freespace_write_helper; - } else { + else wq = fs_info->endio_write_workers; - func = btrfs_endio_write_helper; - } while (ordered_offset < offset + bytes) { last_offset = ordered_offset; if (btrfs_dec_test_first_ordered_pending(inode, &ordered, - &ordered_offset, - ordered_bytes, - uptodate)) { - btrfs_init_work(&ordered->work, func, - finish_ordered_fn, - NULL, NULL); + &ordered_offset, + ordered_bytes, + uptodate)) { + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, + NULL); btrfs_queue_work(wq, &ordered->work); } /* @@ -8274,7 +7737,7 @@ return; /* * Our bio might span multiple ordered extents. In this case - * we keep goin until we have accounted the whole dio. + * we keep going until we have accounted the whole dio. */ if (ordered_offset < offset + bytes) { ordered_bytes = offset + bytes - ordered_offset; @@ -8283,29 +7746,12 @@ } } -static void btrfs_endio_direct_write(struct bio *bio) -{ - struct btrfs_dio_private *dip = bio->bi_private; - struct bio *dio_bio = dip->dio_bio; - - __endio_write_update_ordered(dip->inode, dip->logical_offset, - dip->bytes, !bio->bi_status); - - kfree(dip); - - dio_bio->bi_status = bio->bi_status; - dio_end_io(dio_bio); - bio_put(bio); -} - static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data, struct bio *bio, u64 offset) { struct inode *inode = private_data; - blk_status_t ret; - ret = btrfs_csum_one_bio(inode, bio, offset, 1); - BUG_ON(ret); /* -ENOMEM */ - return 0; + + return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1); } static void btrfs_end_dio_bio(struct bio *bio) @@ -8321,62 +7767,16 @@ (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); - if (dip->subio_endio) - err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err); - - if (err) { - /* - * We want to perceive the errors flag being set before - * decrementing the reference count. We don't need a barrier - * since atomic operations with a return value are fully - * ordered as per atomic_t.txt - */ - dip->errors = 1; + if (bio_op(bio) == REQ_OP_READ) { + err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio), + !err); } - /* if there are more bios still pending for this dio, just exit */ - if (!atomic_dec_and_test(&dip->pending_bios)) - goto out; + if (err) + dip->dio_bio->bi_status = err; - if (dip->errors) { - bio_io_error(dip->orig_bio); - } else { - dip->dio_bio->bi_status = BLK_STS_OK; - bio_endio(dip->orig_bio); - } -out: bio_put(bio); -} - -static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, - struct btrfs_dio_private *dip, - struct bio *bio, - u64 file_offset) -{ - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); - blk_status_t ret; - - /* - * We load all the csum data we need when we submit - * the first bio to reduce the csum tree search and - * contention. - */ - if (dip->logical_offset == file_offset) { - ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio, - file_offset); - if (ret) - return ret; - } - - if (bio == dip->orig_bio) - return 0; - - file_offset -= dip->logical_offset; - file_offset >>= inode->i_sb->s_blocksize_bits; - io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset); - - return 0; + btrfs_dio_private_put(dip); } static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, @@ -8410,222 +7810,169 @@ * If we aren't doing async submit, calculate the csum of the * bio now. */ - ret = btrfs_csum_one_bio(inode, bio, file_offset, 1); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1); if (ret) goto err; } else { - ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio, - file_offset); - if (ret) - goto err; + u64 csum_offset; + + csum_offset = file_offset - dip->logical_offset; + csum_offset >>= inode->i_sb->s_blocksize_bits; + csum_offset *= btrfs_super_csum_size(fs_info->super_copy); + btrfs_io_bio(bio)->csum = dip->csums + csum_offset; } map: - ret = btrfs_map_bio(fs_info, bio, 0, 0); + ret = btrfs_map_bio(fs_info, bio, 0); err: return ret; } -static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) +/* + * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked + * or ordered extents whether or not we submit any bios. + */ +static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, + struct inode *inode, + loff_t file_offset) { - struct inode *inode = dip->inode; + const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); + size_t dip_size; + struct btrfs_dio_private *dip; + + dip_size = sizeof(*dip); + if (!write && csum) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + size_t nblocks; + + nblocks = dio_bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; + dip_size += csum_size * nblocks; + } + + dip = kzalloc(dip_size, GFP_NOFS); + if (!dip) + return NULL; + + dip->inode = inode; + dip->logical_offset = file_offset; + dip->bytes = dio_bio->bi_iter.bi_size; + dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; + dip->dio_bio = dio_bio; + refcount_set(&dip->refs, 1); + return dip; +} + +static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, + struct bio *dio_bio, loff_t file_offset) +{ + const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const bool raid56 = (btrfs_data_alloc_profile(fs_info) & + BTRFS_BLOCK_GROUP_RAID56_MASK); + struct btrfs_dio_private *dip; struct bio *bio; - struct bio *orig_bio = dip->orig_bio; - u64 start_sector = orig_bio->bi_iter.bi_sector; - u64 file_offset = dip->logical_offset; - u64 map_length; + u64 start_sector; int async_submit = 0; u64 submit_len; int clone_offset = 0; int clone_len; int ret; blk_status_t status; + struct btrfs_io_geometry geom; + struct btrfs_dio_data *dio_data = iomap->private; - map_length = orig_bio->bi_iter.bi_size; - submit_len = map_length; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, - &map_length, NULL, 0); - if (ret) - return -EIO; - - if (map_length >= submit_len) { - bio = orig_bio; - dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; - goto submit; + dip = btrfs_create_dio_private(dio_bio, inode, file_offset); + if (!dip) { + if (!write) { + unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, + file_offset + dio_bio->bi_iter.bi_size - 1); + } + dio_bio->bi_status = BLK_STS_RESOURCE; + bio_endio(dio_bio); + return BLK_QC_T_NONE; } - /* async crcs make it difficult to collect full stripe writes. */ - if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK) - async_submit = 0; - else - async_submit = 1; + if (!write && csum) { + /* + * Load the csums up front to reduce csum tree searches and + * contention when submitting bios. + */ + status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset, + dip->csums); + if (status != BLK_STS_OK) + goto out_err; + } - /* bio split */ - ASSERT(map_length <= INT_MAX); + start_sector = dio_bio->bi_iter.bi_sector; + submit_len = dio_bio->bi_iter.bi_size; + do { - clone_len = min_t(int, submit_len, map_length); + ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio), + start_sector << 9, submit_len, + &geom); + if (ret) { + status = errno_to_blk_status(ret); + goto out_err; + } + ASSERT(geom.len <= INT_MAX); + + clone_len = min_t(int, submit_len, geom.len); /* * This will never fail as it's passing GPF_NOFS and * the allocation is backed by btrfs_bioset. */ - bio = btrfs_bio_clone_partial(orig_bio, clone_offset, - clone_len); + bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; ASSERT(submit_len >= clone_len); submit_len -= clone_len; - if (submit_len == 0) - break; /* * Increase the count before we submit the bio so we know * the end IO handler won't happen before we increase the * count. Otherwise, the dip might get freed before we're * done setting it up. + * + * We transfer the initial reference to the last bio, so we + * don't need to increment the reference count for the last one. */ - atomic_inc(&dip->pending_bios); + if (submit_len > 0) { + refcount_inc(&dip->refs); + /* + * If we are submitting more than one bio, submit them + * all asynchronously. The exception is RAID 5 or 6, as + * asynchronous checksums make it difficult to collect + * full stripe writes. + */ + if (!raid56) + async_submit = 1; + } status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); if (status) { bio_put(bio); - atomic_dec(&dip->pending_bios); + if (submit_len > 0) + refcount_dec(&dip->refs); goto out_err; } + dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; - - map_length = submit_len; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), - start_sector << 9, &map_length, NULL, 0); - if (ret) - goto out_err; } while (submit_len > 0); + return BLK_QC_T_NONE; -submit: - status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); - if (!status) - return 0; - - if (bio != orig_bio) - bio_put(bio); out_err: - dip->errors = 1; - /* - * Before atomic variable goto zero, we must make sure dip->errors is - * perceived to be set. This ordering is ensured by the fact that an - * atomic operations with a return value are fully ordered as per - * atomic_t.txt - */ - if (atomic_dec_and_test(&dip->pending_bios)) - bio_io_error(dip->orig_bio); - - /* bio_end_io() will handle error, so we needn't return it */ - return 0; -} - -static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, - loff_t file_offset) -{ - struct btrfs_dio_private *dip = NULL; - struct bio *bio = NULL; - struct btrfs_io_bio *io_bio; - bool write = (bio_op(dio_bio) == REQ_OP_WRITE); - int ret = 0; - - bio = btrfs_bio_clone(dio_bio); - - dip = kzalloc(sizeof(*dip), GFP_NOFS); - if (!dip) { - ret = -ENOMEM; - goto free_ordered; - } - - dip->private = dio_bio->bi_private; - dip->inode = inode; - dip->logical_offset = file_offset; - dip->bytes = dio_bio->bi_iter.bi_size; - dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; - bio->bi_private = dip; - dip->orig_bio = bio; - dip->dio_bio = dio_bio; - atomic_set(&dip->pending_bios, 1); - io_bio = btrfs_io_bio(bio); - io_bio->logical = file_offset; - - if (write) { - bio->bi_end_io = btrfs_endio_direct_write; - } else { - bio->bi_end_io = btrfs_endio_direct_read; - dip->subio_endio = btrfs_subio_endio_read; - } - - /* - * Reset the range for unsubmitted ordered extents (to a 0 length range) - * even if we fail to submit a bio, because in such case we do the - * corresponding error handling below and it must not be done a second - * time by btrfs_direct_IO(). - */ - if (write) { - struct btrfs_dio_data *dio_data = current->journal_info; - - dio_data->unsubmitted_oe_range_end = dip->logical_offset + - dip->bytes; - dio_data->unsubmitted_oe_range_start = - dio_data->unsubmitted_oe_range_end; - } - - ret = btrfs_submit_direct_hook(dip); - if (!ret) - return; - - if (io_bio->end_io) - io_bio->end_io(io_bio, ret); - -free_ordered: - /* - * If we arrived here it means either we failed to submit the dip - * or we either failed to clone the dio_bio or failed to allocate the - * dip. If we cloned the dio_bio and allocated the dip, we can just - * call bio_endio against our io_bio so that we get proper resource - * cleanup if we fail to submit the dip, otherwise, we must do the - * same as btrfs_endio_direct_[write|read] because we can't call these - * callbacks - they require an allocated dip and a clone of dio_bio. - */ - if (bio && dip) { - bio_io_error(bio); - /* - * The end io callbacks free our dip, do the final put on bio - * and all the cleanup and final put for dio_bio (through - * dio_end_io()). - */ - dip = NULL; - bio = NULL; - } else { - if (write) - __endio_write_update_ordered(inode, - file_offset, - dio_bio->bi_iter.bi_size, - false); - else - unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, - file_offset + dio_bio->bi_iter.bi_size - 1); - - dio_bio->bi_status = BLK_STS_IOERR; - /* - * Releases and cleans up our dio_bio, no need to bio_put() - * nor bio_endio()/bio_io_error() against dio_bio. - */ - dio_end_io(dio_bio); - } - if (bio) - bio_put(bio); - kfree(dip); + dip->dio_bio->bi_status = status; + btrfs_dio_private_put(dip); + return BLK_QC_T_NONE; } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, @@ -8661,37 +8008,63 @@ return retval; } -static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned flags) +{ + /* + * Now if we're still in the context of our submitter we know we can't + * safely run generic_write_sync(), so clear our flag here so that the + * caller knows to follow up with a sync. + */ + if (current->journal_info == BTRFS_DIO_SYNC_STUB) { + current->journal_info = NULL; + return error; + } + + if (error) + return error; + + if (size) { + iocb->ki_flags |= IOCB_DSYNC; + return generic_write_sync(iocb, size); + } + + return 0; +} + +static const struct iomap_ops btrfs_dio_iomap_ops = { + .iomap_begin = btrfs_dio_iomap_begin, + .iomap_end = btrfs_dio_iomap_end, +}; + +static const struct iomap_dio_ops btrfs_dio_ops = { + .submit_io = btrfs_submit_direct, +}; + +static const struct iomap_dio_ops btrfs_sync_dops = { + .submit_io = btrfs_submit_direct, + .end_io = btrfs_maybe_fsync_end_io, +}; + +ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_dio_data dio_data = { 0 }; struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; - int flags = 0; - bool wakeup = true; bool relock = false; ssize_t ret; - if (check_direct_IO(fs_info, iter, offset)) + if (check_direct_IO(fs_info, iter, offset)) { + ASSERT(current->journal_info == NULL || + current->journal_info == BTRFS_DIO_SYNC_STUB); + current->journal_info = NULL; return 0; + } - inode_dio_begin(inode); - - /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so - * we need to flush the dirty pages again to make absolutely sure - * that any outstanding dirty pages are on disk. - */ count = iov_iter_count(iter); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, offset, - offset + count - 1); - if (iov_iter_rw(iter) == WRITE) { /* * If the write DIO is beyond the EOF, we need update @@ -8699,65 +8072,29 @@ * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { - dio_data.overwrite = 1; inode_unlock(inode); relock = true; } - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, - offset, count); - if (ret) - goto out; - - /* - * We need to know how many extents we reserved so that we can - * do the accounting properly if we go over the number we - * originally calculated. Abuse current->journal_info for this. - */ - dio_data.reserve = round_up(count, - fs_info->sectorsize); - dio_data.unsubmitted_oe_range_start = (u64)offset; - dio_data.unsubmitted_oe_range_end = (u64)offset; - current->journal_info = &dio_data; down_read(&BTRFS_I(inode)->dio_sem); - } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags)) { - inode_dio_end(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - wakeup = false; } - ret = __blockdev_direct_IO(iocb, inode, - fs_info->fs_devices->latest_bdev, - iter, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, flags); - if (iov_iter_rw(iter) == WRITE) { + /* + * We have are actually a sync iocb, so we need our fancy endio to know + * if we need to sync. + */ + if (current->journal_info) + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_sync_dops, is_sync_kiocb(iocb)); + else + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_dio_ops, is_sync_kiocb(iocb)); + + if (ret == -ENOTBLK) + ret = 0; + + if (iov_iter_rw(iter) == WRITE) up_read(&BTRFS_I(inode)->dio_sem); - current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) { - if (dio_data.reserve) - btrfs_delalloc_release_space(inode, data_reserved, - offset, dio_data.reserve, true); - /* - * On error we might have left some ordered extents - * without submitting corresponding bios for them, so - * cleanup them up to avoid other tasks getting them - * and waiting for them to complete forever. - */ - if (dio_data.unsubmitted_oe_range_start < - dio_data.unsubmitted_oe_range_end) - __endio_write_update_ordered(inode, - dio_data.unsubmitted_oe_range_start, - dio_data.unsubmitted_oe_range_end - - dio_data.unsubmitted_oe_range_start, - false); - } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, data_reserved, - offset, count - (size_t)ret, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), count); - } -out: - if (wakeup) - inode_dio_end(inode); + if (relock) inode_lock(inode); @@ -8765,25 +8102,33 @@ return ret; } -#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) - static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) + u64 start, u64 len) { int ret; - ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); + ret = fiemap_prep(inode, fieinfo, start, &len, 0); if (ret) return ret; - return extent_fiemap(inode, fieinfo, start, len); + return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } int btrfs_readpage(struct file *file, struct page *page) { - struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btrfs_get_extent, 0); + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; + unsigned long bio_flags = 0; + struct bio *bio = NULL; + int ret; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL); + if (bio) + ret = submit_one_bio(bio, 0, bio_flags); + return ret; } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) @@ -8817,21 +8162,16 @@ return extent_writepages(mapping, wbc); } -static int -btrfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void btrfs_readahead(struct readahead_control *rac) { - return extent_readpages(mapping, pages, nr_pages); + extent_readahead(rac); } static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); - if (ret == 1) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + if (ret == 1) + detach_page_private(page); return ret; } @@ -8842,18 +8182,45 @@ return __btrfs_releasepage(page, gfp_flags); } +#ifdef CONFIG_MIGRATION +static int btrfs_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + int ret; + + ret = migrate_page_move_mapping(mapping, newpage, page, 0); + if (ret != MIGRATEPAGE_SUCCESS) + return ret; + + if (page_has_private(page)) + attach_page_private(newpage, detach_page_private(page)); + + if (PagePrivate2(page)) { + ClearPagePrivate2(page); + SetPagePrivate2(newpage); + } + + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); + return MIGRATEPAGE_SUCCESS; +} +#endif + static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - struct inode *inode = page->mapping->host; - struct extent_io_tree *tree; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct extent_io_tree *tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_SIZE - 1; u64 start; u64 end; - int inode_evicting = inode->i_state & I_FREEING; + int inode_evicting = inode->vfs_inode.i_state & I_FREEING; /* * we have the page locked, so new writeback can't start, @@ -8864,28 +8231,39 @@ */ wait_on_page_writeback(page); - tree = &BTRFS_I(inode)->io_tree; - if (offset) { + /* + * For subpage case, we have call sites like + * btrfs_punch_hole_lock_range() which passes range not aligned to + * sectorsize. + * If the range doesn't cover the full page, we don't need to and + * shouldn't clear page extent mapped, as page->private can still + * record subpage dirty bits for other part of the range. + * + * For cases that can invalidate the full even the range doesn't + * cover the full page, like invalidating the last page, we're + * still safe to wait for ordered extent to finish. + */ + if (!(offset == 0 && length == PAGE_SIZE)) { btrfs_releasepage(page, GFP_NOFS); return; } if (!inode_evicting) lock_extent_bits(tree, page_start, page_end, &cached_state); -again: + start = page_start; - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - page_end - start + 1); +again: + ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { - end = min(page_end, ordered->file_offset + ordered->len - 1); + end = min(page_end, + ordered->file_offset + ordered->num_bytes - 1); /* * IO on this page will never be started, so we need * to account for any ordered extents now */ if (!inode_evicting) clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state); /* @@ -8896,7 +8274,7 @@ struct btrfs_ordered_inode_tree *tree; u64 new_len; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -8937,8 +8315,7 @@ */ btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { - clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | + clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state); @@ -8947,11 +8324,7 @@ } ClearPageChecked(page); - if (PagePrivate(page)) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + detach_page_private(page); } /* @@ -9004,8 +8377,8 @@ * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - reserved_space); + ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, + page_start, reserved_space); if (!ret2) { ret2 = file_update_time(vmf->vma->vm_file); reserved = 1; @@ -9042,7 +8415,7 @@ unlock_extent_cached(io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -9052,9 +8425,9 @@ fs_info->sectorsize); if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; - btrfs_delalloc_release_space(inode, data_reserved, - page_start, PAGE_SIZE - reserved_space, - true); + btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, page_start, + PAGE_SIZE - reserved_space, true); } } @@ -9066,23 +8439,21 @@ * reserve data&meta space before lock_page() (see above comments). */ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state); + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 0, 0, &cached_state); - ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0, - &cached_state, 0); + ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, + &cached_state); if (ret2) { unlock_extent_cached(io_tree, page_start, page_end, &cached_state); ret = VM_FAULT_SIGBUS; goto out_unlock; } - ret2 = 0; /* page is wholly or partially inside EOF */ if (page_start + PAGE_SIZE > size) - zero_start = size & ~PAGE_MASK; + zero_start = offset_in_page(size); else zero_start = PAGE_SIZE; @@ -9096,24 +8467,20 @@ set_page_dirty(page); SetPageUptodate(page); - BTRFS_I(inode)->last_trans = fs_info->generation; - BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; - BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; + btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); unlock_extent_cached(io_tree, page_start, page_end, &cached_state); - if (!ret2) { - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return VM_FAULT_LOCKED; - } + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); + return VM_FAULT_LOCKED; out_unlock: unlock_page(page); out: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - btrfs_delalloc_release_space(inode, data_reserved, page_start, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, reserved_space, (ret != 0)); out_noreserve: sb_end_pagefault(inode->i_sb); @@ -9129,7 +8496,7 @@ int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; - u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); + u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), @@ -9184,7 +8551,7 @@ /* Migrate the slack space for the truncate to our reserve */ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, - min_size, 0); + min_size, false); BUG_ON(ret); /* @@ -9219,9 +8586,9 @@ break; } - btrfs_block_rsv_release(fs_info, rsv, -1); + btrfs_block_rsv_release(fs_info, rsv, -1, NULL); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, 0); + rsv, min_size, false); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; } @@ -9244,7 +8611,7 @@ ret = PTR_ERR(trans); goto out; } - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + btrfs_inode_safe_disk_i_size_write(inode, 0); } if (trans) { @@ -9327,7 +8694,7 @@ ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; - ei->last_link_trans = 0; + ei->last_reflink_trans = 0; ei->last_log_commit = 0; spin_lock_init(&ei->lock); @@ -9346,13 +8713,15 @@ inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); - extent_io_tree_init(&ei->io_tree, inode); - extent_io_tree_init(&ei->io_failure_tree, inode); - ei->io_tree.track_uptodate = 1; - ei->io_failure_tree.track_uptodate = 1; + extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); + extent_io_tree_init(fs_info, &ei->io_failure_tree, + IO_TREE_INODE_IO_FAILURE, inode); + extent_io_tree_init(fs_info, &ei->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT, inode); + ei->io_tree.track_uptodate = true; + ei->io_failure_tree.track_uptodate = true; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); - mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); @@ -9370,27 +8739,26 @@ } #endif -static void btrfs_i_callback(struct rcu_head *head) +void btrfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } -void btrfs_destroy_inode(struct inode *inode) +void btrfs_destroy_inode(struct inode *vfs_inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_root *root = inode->root; - WARN_ON(!hlist_empty(&inode->i_dentry)); - WARN_ON(inode->i_data.nrpages); - WARN_ON(BTRFS_I(inode)->block_rsv.reserved); - WARN_ON(BTRFS_I(inode)->block_rsv.size); - WARN_ON(BTRFS_I(inode)->outstanding_extents); - WARN_ON(BTRFS_I(inode)->delalloc_bytes); - WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); - WARN_ON(BTRFS_I(inode)->csum_bytes); - WARN_ON(BTRFS_I(inode)->defrag_bytes); + WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); + WARN_ON(vfs_inode->i_data.nrpages); + WARN_ON(inode->block_rsv.reserved); + WARN_ON(inode->block_rsv.size); + WARN_ON(inode->outstanding_extents); + WARN_ON(inode->delalloc_bytes); + WARN_ON(inode->new_delalloc_bytes); + WARN_ON(inode->csum_bytes); + WARN_ON(inode->defrag_bytes); /* * This can happen where we create an inode, but somebody else also @@ -9398,16 +8766,16 @@ * created. */ if (!root) - goto free; + return; while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (!ordered) break; else { - btrfs_err(fs_info, + btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", - ordered->file_offset, ordered->len); + ordered->file_offset, ordered->num_bytes); btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -9415,9 +8783,9 @@ } btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); - btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); -free: - call_rcu(&inode->i_rcu, btrfs_i_callback); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); + btrfs_put_root(inode->root); } int btrfs_drop_inode(struct inode *inode) @@ -9542,19 +8910,15 @@ struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec64 ctime = current_time(old_inode); - struct dentry *parent; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; + int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; - struct btrfs_log_ctx ctx_root; - struct btrfs_log_ctx ctx_dest; - bool sync_log_root = false; - bool sync_log_dest = false; - bool commit_transaction = false; + bool need_abort = false; /* * For non-subvolumes allow exchange only within one subvolume, in the @@ -9565,9 +8929,6 @@ (old_ino != BTRFS_FIRST_FREE_OBJECTID || new_ino != BTRFS_FIRST_FREE_OBJECTID)) return -EXDEV; - - btrfs_init_log_ctx(&ctx_root, old_inode); - btrfs_init_log_ctx(&ctx_dest, new_inode); /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || @@ -9608,7 +8969,7 @@ /* Reference for the source. */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */ - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); } else { btrfs_pin_log_trans(root); root_log_pinned = true; @@ -9620,12 +8981,13 @@ old_idx); if (ret) goto out_fail; + need_abort = true; } /* And now for the dest. */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */ - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); } else { btrfs_pin_log_trans(dest); dest_log_pinned = true; @@ -9635,8 +8997,11 @@ new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); - if (ret) + if (ret) { + if (need_abort) + btrfs_abort_transaction(trans, ret); goto out_fail; + } } /* Update inode version and ctime/mtime. */ @@ -9710,30 +9075,14 @@ BTRFS_I(new_inode)->dir_index = new_idx; if (root_log_pinned) { - parent = new_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx_root); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_root = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); root_log_pinned = false; } if (dest_log_pinned) { - if (!commit_transaction) { - parent = old_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(new_inode), - BTRFS_I(new_dir), parent, - false, &ctx_dest); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_dest = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; - } + btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), + old_dentry->d_parent); btrfs_end_log_trans(dest); dest_log_pinned = false; } @@ -9755,7 +9104,7 @@ btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || (new_inode && btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); if (root_log_pinned) { btrfs_end_log_trans(root); @@ -9766,45 +9115,12 @@ dest_log_pinned = false; } } - if (!ret && sync_log_root && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, - &ctx_root); - if (ret) - commit_transaction = true; - } - if (!ret && sync_log_dest && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root, - &ctx_dest); - if (ret) - commit_transaction = true; - } - if (commit_transaction) { - /* - * We may have set commit_transaction when logging the new name - * in the destination root, in which case we left the source - * root context in the list of log contextes. So make sure we - * remove it to avoid invalid memory accesses, since the context - * was allocated in our stack frame. - */ - if (sync_log_root) { - mutex_lock(&root->log_mutex); - list_del_init(&ctx_root.list); - mutex_unlock(&root->log_mutex); - } - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID || old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); - - ASSERT(list_empty(&ctx_root.list)); - ASSERT(list_empty(&ctx_dest.list)); return ret; } @@ -9819,7 +9135,7 @@ u64 objectid; u64 index; - ret = btrfs_find_free_ino(root, &objectid); + ret = btrfs_find_free_objectid(root, &objectid); if (ret) return ret; @@ -9873,11 +9189,9 @@ struct inode *old_inode = d_inode(old_dentry); u64 index = 0; int ret; + int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); bool log_pinned = false; - struct btrfs_log_ctx ctx; - bool sync_log = false; - bool commit_transaction = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9954,7 +9268,7 @@ BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */ - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); } else { btrfs_pin_log_trans(root); log_pinned = true; @@ -10027,17 +9341,8 @@ BTRFS_I(old_inode)->dir_index = index; if (log_pinned) { - struct dentry *parent = new_dentry->d_parent; - - btrfs_init_log_ctx(&ctx, old_inode); - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); log_pinned = false; } @@ -10069,28 +9374,13 @@ btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || (new_inode && btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); log_pinned = false; } - if (!ret && sync_log) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx); - if (ret) - commit_transaction = true; - } else if (sync_log) { - mutex_lock(&root->log_mutex); - list_del(&ctx.list); - mutex_unlock(&root->log_mutex); - } - if (commit_transaction) { - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); @@ -10147,9 +9437,7 @@ init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - WARN_ON_ONCE(!inode); - btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, - btrfs_run_delalloc_work, NULL, NULL); + btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); return work; } @@ -10158,7 +9446,9 @@ * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) +static int start_delalloc_inodes(struct btrfs_root *root, + struct writeback_control *wbc, bool snapshot, + bool in_reclaim_context) { struct btrfs_inode *binode; struct inode *inode; @@ -10166,6 +9456,7 @@ struct list_head works; struct list_head splice; int ret = 0; + bool full_flush = wbc->nr_to_write == LONG_MAX; INIT_LIST_HEAD(&works); INIT_LIST_HEAD(&splice); @@ -10179,6 +9470,11 @@ list_move_tail(&binode->delalloc_inodes, &root->delalloc_inodes); + + if (in_reclaim_context && + test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) + continue; + inode = igrab(&binode->vfs_inode); if (!inode) { cond_resched_lock(&root->delalloc_lock); @@ -10189,18 +9485,26 @@ if (snapshot) set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &binode->runtime_flags); - work = btrfs_alloc_delalloc_work(inode); - if (!work) { - iput(inode); - ret = -ENOMEM; - goto out; + if (full_flush) { + work = btrfs_alloc_delalloc_work(inode); + if (!work) { + iput(inode); + ret = -ENOMEM; + goto out; + } + list_add_tail(&work->list, &works); + btrfs_queue_work(root->fs_info->flush_workers, + &work->work); + } else { + ret = sync_inode(inode, wbc); + if (!ret && + test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = sync_inode(inode, wbc); + btrfs_add_delayed_iput(inode); + if (ret || wbc->nr_to_write <= 0) + goto out; } - list_add_tail(&work->list, &works); - btrfs_queue_work(root->fs_info->flush_workers, - &work->work); - ret++; - if (nr != -1 && ret >= nr) - goto out; cond_resched(); spin_lock(&root->delalloc_lock); } @@ -10224,20 +9528,29 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root) { + struct writeback_control wbc = { + .nr_to_write = LONG_MAX, + .sync_mode = WB_SYNC_NONE, + .range_start = 0, + .range_end = LLONG_MAX, + }; struct btrfs_fs_info *fs_info = root->fs_info; - int ret; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = start_delalloc_inodes(root, -1, true); - if (ret > 0) - ret = 0; - return ret; + return start_delalloc_inodes(root, &wbc, true, false); } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, + bool in_reclaim_context) { + struct writeback_control wbc = { + .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr, + .sync_mode = WB_SYNC_NONE, + .range_start = 0, + .range_end = LLONG_MAX, + }; struct btrfs_root *root; struct list_head splice; int ret; @@ -10251,23 +9564,25 @@ spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); while (!list_empty(&splice) && nr) { + /* + * Reset nr_to_write here so we know that we're doing a full + * flush. + */ + if (nr == U64_MAX) + wbc.nr_to_write = LONG_MAX; + root = list_first_entry(&splice, struct btrfs_root, delalloc_root); - root = btrfs_grab_fs_root(root); + root = btrfs_grab_root(root); BUG_ON(!root); list_move_tail(&root->delalloc_root, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, nr, false); - btrfs_put_fs_root(root); - if (ret < 0) + ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); + btrfs_put_root(root); + if (ret < 0 || wbc.nr_to_write <= 0) goto out; - - if (nr != -1) { - nr -= ret; - WARN_ON(nr < 0); - } spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); @@ -10316,7 +9631,7 @@ if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -10338,7 +9653,6 @@ inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) @@ -10377,7 +9691,6 @@ inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_symlink_aops; inode_set_bytes(inode, name_len); btrfs_i_size_write(BTRFS_I(inode), name_len); err = btrfs_update_inode(trans, root, inode); @@ -10404,6 +9717,65 @@ return err; } +static struct btrfs_trans_handle *insert_prealloc_file_extent( + struct btrfs_trans_handle *trans_in, + struct inode *inode, struct btrfs_key *ins, + u64 file_offset) +{ + struct btrfs_file_extent_item stack_fi; + struct btrfs_replace_extent_info extent_info; + struct btrfs_trans_handle *trans = trans_in; + struct btrfs_path *path; + u64 start = ins->objectid; + u64 len = ins->offset; + int ret; + + memset(&stack_fi, 0, sizeof(stack_fi)); + + btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC); + btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start); + btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len); + btrfs_set_stack_file_extent_num_bytes(&stack_fi, len); + btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len); + btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); + /* Encryption and other encoding is reserved and all 0 */ + + ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len); + if (ret < 0) + return ERR_PTR(ret); + + if (trans) { + ret = insert_reserved_file_extent(trans, BTRFS_I(inode), + file_offset, &stack_fi, ret); + if (ret) + return ERR_PTR(ret); + return trans; + } + + extent_info.disk_offset = start; + extent_info.disk_len = len; + extent_info.data_offset = 0; + extent_info.data_len = len; + extent_info.file_offset = file_offset; + extent_info.extent_buf = (char *)&stack_fi; + extent_info.is_new_extent = true; + extent_info.qgroup_reserved = ret; + extent_info.insertions = 0; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_replace_file_extents(inode, path, file_offset, + file_offset + len - 1, &extent_info, + &trans); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + return trans; +} + static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint, @@ -10426,14 +9798,6 @@ if (trans) own_trans = false; while (num_bytes > 0) { - if (own_trans) { - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - } - cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); /* @@ -10445,11 +9809,8 @@ cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); - if (ret) { - if (own_trans) - btrfs_end_transaction(trans); + if (ret) break; - } /* * We've reserved this space, and thus converted it from @@ -10459,20 +9820,20 @@ * clear_offset by our extent size. */ clear_offset += ins.offset; - btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; - ret = insert_reserved_file_extent(trans, inode, - cur_offset, ins.objectid, - ins.offset, ins.offset, - ins.offset, 0, 0, 0, - BTRFS_FILE_EXTENT_PREALLOC); - if (ret) { + trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); + /* + * Now that we inserted the prealloc extent we can finally + * decrement the number of reservations in the block group. + * If we did it before, we could race with relocation and have + * relocation miss the reserved extent, making it fail later. + */ + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); - btrfs_abort_transaction(trans, ret); - if (own_trans) - btrfs_end_transaction(trans); break; } @@ -10493,7 +9854,6 @@ em->block_len = ins.offset; em->orig_block_len = ins.offset; em->ram_bytes = ins.offset; - em->bdev = fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PREALLOC, &em->flags); em->generation = trans->transid; @@ -10524,7 +9884,7 @@ else i_size = cur_offset; i_size_write(inode, i_size); - btrfs_ordered_update_i_size(inode, i_size, NULL); + btrfs_inode_safe_disk_i_size_write(inode, 0); } ret = btrfs_update_inode(trans, root, inode); @@ -10536,11 +9896,13 @@ break; } - if (own_trans) + if (own_trans) { btrfs_end_transaction(trans); + trans = NULL; + } } if (clear_offset < end) - btrfs_free_reserved_data_space(inode, NULL, clear_offset, + btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, end - clear_offset + 1); return ret; } @@ -10600,7 +9962,7 @@ if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_find_free_ino(root, &objectid); + ret = btrfs_find_free_objectid(root, &objectid); if (ret) goto out; @@ -10616,7 +9978,6 @@ inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; ret = btrfs_init_inode_security(trans, inode, dir, NULL); if (ret) @@ -10648,26 +10009,6 @@ return ret; } -__attribute__((const)) -static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror) -{ - return -EAGAIN; -} - -static void btrfs_check_extent_io_range(void *private_data, const char *caller, - u64 start, u64 end) -{ - struct inode *inode = private_data; - u64 isize; - - isize = i_size_read(inode); - if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, - "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); - } -} - void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { struct inode *inode = tree->private_data; @@ -10683,6 +10024,403 @@ index++; } } + +#ifdef CONFIG_SWAP +/* + * Add an entry indicating a block group or device which is pinned by a + * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a + * negative errno on failure. + */ +static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, + bool is_block_group) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_swapfile_pin *sp, *entry; + struct rb_node **p; + struct rb_node *parent = NULL; + + sp = kmalloc(sizeof(*sp), GFP_NOFS); + if (!sp) + return -ENOMEM; + sp->ptr = ptr; + sp->inode = inode; + sp->is_block_group = is_block_group; + sp->bg_extent_count = 1; + + spin_lock(&fs_info->swapfile_pins_lock); + p = &fs_info->swapfile_pins.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_swapfile_pin, node); + if (sp->ptr < entry->ptr || + (sp->ptr == entry->ptr && sp->inode < entry->inode)) { + p = &(*p)->rb_left; + } else if (sp->ptr > entry->ptr || + (sp->ptr == entry->ptr && sp->inode > entry->inode)) { + p = &(*p)->rb_right; + } else { + if (is_block_group) + entry->bg_extent_count++; + spin_unlock(&fs_info->swapfile_pins_lock); + kfree(sp); + return 1; + } + } + rb_link_node(&sp->node, parent, p); + rb_insert_color(&sp->node, &fs_info->swapfile_pins); + spin_unlock(&fs_info->swapfile_pins_lock); + return 0; +} + +/* Free all of the entries pinned by this swapfile. */ +static void btrfs_free_swapfile_pins(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_swapfile_pin *sp; + struct rb_node *node, *next; + + spin_lock(&fs_info->swapfile_pins_lock); + node = rb_first(&fs_info->swapfile_pins); + while (node) { + next = rb_next(node); + sp = rb_entry(node, struct btrfs_swapfile_pin, node); + if (sp->inode == inode) { + rb_erase(&sp->node, &fs_info->swapfile_pins); + if (sp->is_block_group) { + btrfs_dec_block_group_swap_extents(sp->ptr, + sp->bg_extent_count); + btrfs_put_block_group(sp->ptr); + } + kfree(sp); + } + node = next; + } + spin_unlock(&fs_info->swapfile_pins_lock); +} + +struct btrfs_swap_info { + u64 start; + u64 block_start; + u64 block_len; + u64 lowest_ppage; + u64 highest_ppage; + unsigned long nr_pages; + int nr_extents; +}; + +static int btrfs_add_swap_extent(struct swap_info_struct *sis, + struct btrfs_swap_info *bsi) +{ + unsigned long nr_pages; + unsigned long max_pages; + u64 first_ppage, first_ppage_reported, next_ppage; + int ret; + + /* + * Our swapfile may have had its size extended after the swap header was + * written. In that case activating the swapfile should not go beyond + * the max size set in the swap header. + */ + if (bsi->nr_pages >= sis->max) + return 0; + + max_pages = sis->max - bsi->nr_pages; + first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; + next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, + PAGE_SIZE) >> PAGE_SHIFT; + + if (first_ppage >= next_ppage) + return 0; + nr_pages = next_ppage - first_ppage; + nr_pages = min(nr_pages, max_pages); + + first_ppage_reported = first_ppage; + if (bsi->start == 0) + first_ppage_reported++; + if (bsi->lowest_ppage > first_ppage_reported) + bsi->lowest_ppage = first_ppage_reported; + if (bsi->highest_ppage < (next_ppage - 1)) + bsi->highest_ppage = next_ppage - 1; + + ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); + if (ret < 0) + return ret; + bsi->nr_extents += ret; + bsi->nr_pages += nr_pages; + return 0; +} + +static void btrfs_swap_deactivate(struct file *file) +{ + struct inode *inode = file_inode(file); + + btrfs_free_swapfile_pins(inode); + atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); +} + +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + struct inode *inode = file_inode(file); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_state *cached_state = NULL; + struct extent_map *em = NULL; + struct btrfs_device *device = NULL; + struct btrfs_swap_info bsi = { + .lowest_ppage = (sector_t)-1ULL, + }; + int ret = 0; + u64 isize; + u64 start; + + /* + * If the swap file was just created, make sure delalloc is done. If the + * file changes again after this, the user is doing something stupid and + * we don't really care. + */ + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (ret) + return ret; + + /* + * The inode is locked, so these flags won't change after we check them. + */ + if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { + btrfs_warn(fs_info, "swapfile must not be compressed"); + return -EINVAL; + } + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { + btrfs_warn(fs_info, "swapfile must not be copy-on-write"); + return -EINVAL; + } + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + btrfs_warn(fs_info, "swapfile must not be checksummed"); + return -EINVAL; + } + + /* + * Balance or device remove/replace/resize can move stuff around from + * under us. The exclop protection makes sure they aren't running/won't + * run concurrently while we are mapping the swap extents, and + * fs_info->swapfile_pins prevents them from running while the swap + * file is active and moving the extents. Note that this also prevents + * a concurrent device add which isn't actually necessary, but it's not + * really worth the trouble to allow it. + */ + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { + btrfs_warn(fs_info, + "cannot activate swapfile while exclusive operation is running"); + return -EBUSY; + } + + /* + * Prevent snapshot creation while we are activating the swap file. + * We do not want to race with snapshot creation. If snapshot creation + * already started before we bumped nr_swapfiles from 0 to 1 and + * completes before the first write into the swap file after it is + * activated, than that write would fallback to COW. + */ + if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) { + btrfs_exclop_finish(fs_info); + btrfs_warn(fs_info, + "cannot activate swapfile because snapshot creation is in progress"); + return -EINVAL; + } + /* + * Snapshots can create extents which require COW even if NODATACOW is + * set. We use this counter to prevent snapshots. We must increment it + * before walking the extents because we don't want a concurrent + * snapshot to run after we've already checked the extents. + * + * It is possible that subvolume is marked for deletion but still not + * removed yet. To prevent this race, we check the root status before + * activating the swapfile. + */ + spin_lock(&root->root_item_lock); + if (btrfs_root_dead(root)) { + spin_unlock(&root->root_item_lock); + + btrfs_exclop_finish(fs_info); + btrfs_warn(fs_info, + "cannot activate swapfile because subvolume %llu is being deleted", + root->root_key.objectid); + return -EPERM; + } + atomic_inc(&root->nr_swapfiles); + spin_unlock(&root->root_item_lock); + + isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); + + lock_extent_bits(io_tree, 0, isize - 1, &cached_state); + start = 0; + while (start < isize) { + u64 logical_block_start, physical_block_start; + struct btrfs_block_group *bg; + u64 len = isize - start; + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (em->block_start == EXTENT_MAP_HOLE) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } + if (em->block_start == EXTENT_MAP_INLINE) { + /* + * It's unlikely we'll ever actually find ourselves + * here, as a file small enough to fit inline won't be + * big enough to store more than the swap header, but in + * case something changes in the future, let's catch it + * here rather than later. + */ + btrfs_warn(fs_info, "swapfile must not be inline"); + ret = -EINVAL; + goto out; + } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + btrfs_warn(fs_info, "swapfile must not be compressed"); + ret = -EINVAL; + goto out; + } + + logical_block_start = em->block_start + (start - em->start); + len = min(len, em->len - (start - em->start)); + free_extent_map(em); + em = NULL; + + ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true); + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + } else { + btrfs_warn(fs_info, + "swapfile must not be copy-on-write"); + ret = -EINVAL; + goto out; + } + + em = btrfs_get_chunk_map(fs_info, logical_block_start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + btrfs_warn(fs_info, + "swapfile must have single data profile"); + ret = -EINVAL; + goto out; + } + + if (device == NULL) { + device = em->map_lookup->stripes[0].dev; + ret = btrfs_add_swapfile_pin(inode, device, false); + if (ret == 1) + ret = 0; + else if (ret) + goto out; + } else if (device != em->map_lookup->stripes[0].dev) { + btrfs_warn(fs_info, "swapfile must be on one device"); + ret = -EINVAL; + goto out; + } + + physical_block_start = (em->map_lookup->stripes[0].physical + + (logical_block_start - em->start)); + len = min(len, em->len - (logical_block_start - em->start)); + free_extent_map(em); + em = NULL; + + bg = btrfs_lookup_block_group(fs_info, logical_block_start); + if (!bg) { + btrfs_warn(fs_info, + "could not find block group containing swapfile"); + ret = -EINVAL; + goto out; + } + + if (!btrfs_inc_block_group_swap_extents(bg)) { + btrfs_warn(fs_info, + "block group for swapfile at %llu is read-only%s", + bg->start, + atomic_read(&fs_info->scrubs_running) ? + " (scrub running)" : ""); + btrfs_put_block_group(bg); + ret = -EINVAL; + goto out; + } + + ret = btrfs_add_swapfile_pin(inode, bg, true); + if (ret) { + btrfs_put_block_group(bg); + if (ret == 1) + ret = 0; + else + goto out; + } + + if (bsi.block_len && + bsi.block_start + bsi.block_len == physical_block_start) { + bsi.block_len += len; + } else { + if (bsi.block_len) { + ret = btrfs_add_swap_extent(sis, &bsi); + if (ret) + goto out; + } + bsi.start = start; + bsi.block_start = physical_block_start; + bsi.block_len = len; + } + + start += len; + } + + if (bsi.block_len) + ret = btrfs_add_swap_extent(sis, &bsi); + +out: + if (!IS_ERR_OR_NULL(em)) + free_extent_map(em); + + unlock_extent_cached(io_tree, 0, isize - 1, &cached_state); + + if (ret) + btrfs_swap_deactivate(file); + + btrfs_drew_write_unlock(&root->snapshot_lock); + + btrfs_exclop_finish(fs_info); + + if (ret) + return ret; + + if (device) + sis->bdev = device->bdev; + *span = bsi.highest_ppage - bsi.lowest_ppage + 1; + sis->max = bsi.nr_pages; + sis->pages = bsi.nr_pages - 1; + sis->highest_bit = bsi.nr_pages - 1; + return bsi.nr_extents; +} +#else +static void btrfs_swap_deactivate(struct file *file) +{ +} + +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + return -EOPNOTSUPP; +} +#endif static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, @@ -10703,11 +10441,6 @@ .update_time = btrfs_update_time, .tmpfile = btrfs_tmpfile, }; -static const struct inode_operations btrfs_dir_ro_inode_operations = { - .lookup = btrfs_lookup, - .permission = btrfs_permission, - .update_time = btrfs_update_time, -}; static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, @@ -10720,22 +10453,6 @@ #endif .release = btrfs_release_file, .fsync = btrfs_sync_file, -}; - -static const struct extent_io_ops btrfs_extent_io_ops = { - /* mandatory callbacks */ - .submit_bio_hook = btrfs_submit_bio_hook, - .readpage_end_io_hook = btrfs_readpage_end_io_hook, - .readpage_io_failed_hook = btrfs_readpage_io_failed_hook, - - /* optional callbacks */ - .writepage_end_io_hook = btrfs_writepage_end_io_hook, - .writepage_start_hook = btrfs_writepage_start_hook, - .set_bit_hook = btrfs_set_bit_hook, - .clear_bit_hook = btrfs_clear_bit_hook, - .merge_extent_hook = btrfs_merge_extent_hook, - .split_extent_hook = btrfs_split_extent_hook, - .check_extent_io_range = btrfs_check_extent_io_range, }; /* @@ -10754,19 +10471,17 @@ .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, - .readpages = btrfs_readpages, - .direct_IO = btrfs_direct_IO, + .readahead = btrfs_readahead, + .direct_IO = noop_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, +#ifdef CONFIG_MIGRATION + .migratepage = btrfs_migratepage, +#endif .set_page_dirty = btrfs_set_page_dirty, .error_remove_page = generic_error_remove_page, -}; - -static const struct address_space_operations btrfs_symlink_aops = { - .readpage = btrfs_readpage, - .writepage = btrfs_writepage, - .invalidatepage = btrfs_invalidatepage, - .releasepage = btrfs_releasepage, + .swap_activate = btrfs_swap_activate, + .swap_deactivate = btrfs_swap_deactivate, }; static const struct inode_operations btrfs_file_inode_operations = { -- Gitblit v1.6.2