From 37f49e37ab4cb5d0bc4c60eb5c6d4dd57db767bb Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Fri, 10 May 2024 07:44:59 +0000
Subject: [PATCH] gmac get mac form eeprom
---
kernel/fs/btrfs/file.c | 1151 +++++++++++++++++++++++++++++++++++----------------------
1 files changed, 701 insertions(+), 450 deletions(-)
diff --git a/kernel/fs/btrfs/file.c b/kernel/fs/btrfs/file.c
index 41ad37f..416a1b7 100644
--- a/kernel/fs/btrfs/file.c
+++ b/kernel/fs/btrfs/file.c
@@ -26,6 +26,8 @@
#include "volumes.h"
#include "qgroup.h"
#include "compression.h"
+#include "delalloc-space.h"
+#include "reflink.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
@@ -273,34 +275,23 @@
{
struct btrfs_root *inode_root;
struct inode *inode;
- struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
int num_defrag;
- int index;
int ret;
/* get the inode */
- key.objectid = defrag->root;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
if (IS_ERR(inode_root)) {
ret = PTR_ERR(inode_root);
goto cleanup;
}
- key.objectid = defrag->ino;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+ inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
+ btrfs_put_root(inode_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto cleanup;
}
- srcu_read_unlock(&fs_info->subvol_srcu, index);
/* do a chunk of defrag */
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
@@ -336,7 +327,6 @@
iput(inode);
return 0;
cleanup:
- srcu_read_unlock(&fs_info->subvol_srcu, index);
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
return ret;
}
@@ -399,7 +389,7 @@
size_t copied = 0;
size_t total_copied = 0;
int pg = 0;
- int offset = pos & (PAGE_SIZE - 1);
+ int offset = offset_in_page(pos);
while (write_bytes > 0) {
size_t count = min_t(size_t,
@@ -462,47 +452,6 @@
}
}
-static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
- const u64 start,
- const u64 len,
- struct extent_state **cached_state)
-{
- u64 search_start = start;
- const u64 end = start + len - 1;
-
- while (search_start < end) {
- const u64 search_len = end - search_start + 1;
- struct extent_map *em;
- u64 em_len;
- int ret = 0;
-
- em = btrfs_get_extent(inode, NULL, 0, search_start,
- search_len, 0);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
- if (em->block_start != EXTENT_MAP_HOLE)
- goto next;
-
- em_len = em->len;
- if (em->start < search_start)
- em_len -= search_start - em->start;
- if (em_len > search_len)
- em_len = search_len;
-
- ret = set_extent_bit(&inode->io_tree, search_start,
- search_start + em_len - 1,
- EXTENT_DELALLOC_NEW,
- NULL, cached_state, GFP_NOFS);
-next:
- search_start = extent_map_end(em);
- free_extent_map(em);
- if (ret)
- return ret;
- }
- return 0;
-}
-
/*
* after copy_from_user, pages need to be dirtied and we need to make
* sure holes are created between the current EOF and the start of
@@ -511,18 +460,18 @@
* this also makes the decision about creating an inline extent vs
* doing real data extents, marking pages dirty and delalloc as required.
*/
-int btrfs_dirty_pages(struct inode *inode, struct page **pages,
+int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int err = 0;
int i;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
u64 end_pos = pos + write_bytes;
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(&inode->vfs_inode);
unsigned int extra_bits = 0;
start_pos = pos & ~((u64) fs_info->sectorsize - 1);
@@ -535,30 +484,12 @@
* The pages may have already been dirty, clear out old accounting so
* we can set things up properly
*/
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached);
-
- if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
- if (start_pos >= isize &&
- !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
- /*
- * There can't be any extents following eof in this case
- * so just set the delalloc new bit for the range
- * directly.
- */
- extra_bits |= EXTENT_DELALLOC_NEW;
- } else {
- err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode),
- start_pos,
- num_bytes, cached);
- if (err)
- return err;
- }
- }
+ clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ 0, 0, cached);
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
- extra_bits, cached, 0);
+ extra_bits, cached);
if (err)
return err;
@@ -575,7 +506,7 @@
* at this time.
*/
if (end_pos > isize)
- i_size_write(inode, end_pos);
+ i_size_write(&inode->vfs_inode, end_pos);
return 0;
}
@@ -666,7 +597,6 @@
}
split->generation = gen;
- split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
replace_extent_mapping(em_tree, em, split, modified);
@@ -679,7 +609,6 @@
split->start = start + len;
split->len = em->start + em->len - (start + len);
- split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
split->generation = gen;
@@ -744,7 +673,7 @@
* is deleted from the tree.
*/
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
+ struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_path *path, u64 start, u64 end,
u64 *drop_end, int drop_cache,
int replace_extent,
@@ -754,9 +683,11 @@
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
+ struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ struct inode *vfs_inode = &inode->vfs_inode;
+ u64 ino = btrfs_ino(inode);
u64 search_start = start;
u64 disk_bytenr = 0;
u64 num_bytes = 0;
@@ -774,13 +705,12 @@
int leafs_visited = 0;
if (drop_cache)
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0);
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
- if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
+ if (start >= inode->disk_i_size && !replace_extent)
modify_tree = 0;
- update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- root == fs_info->tree_root);
+ update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -909,11 +839,14 @@
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0) {
- ret = btrfs_inc_extent_ref(trans, root,
- disk_bytenr, num_bytes, 0,
+ btrfs_init_generic_ref(&ref,
+ BTRFS_ADD_DELAYED_REF,
+ disk_bytenr, num_bytes, 0);
+ btrfs_init_data_ref(&ref,
root->root_key.objectid,
new_key.objectid,
start - extent_offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
}
key.offset = start;
@@ -944,7 +877,7 @@
extent_end - end);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
- inode_sub_bytes(inode, end - key.offset);
+ inode_sub_bytes(vfs_inode, end - key.offset);
break;
}
@@ -964,7 +897,7 @@
start - key.offset);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
- inode_sub_bytes(inode, extent_end - start);
+ inode_sub_bytes(vfs_inode, extent_end - start);
if (end == extent_end)
break;
@@ -988,18 +921,21 @@
if (update_refs &&
extent_type == BTRFS_FILE_EXTENT_INLINE) {
- inode_sub_bytes(inode,
+ inode_sub_bytes(vfs_inode,
extent_end - key.offset);
extent_end = ALIGN(extent_end,
fs_info->sectorsize);
} else if (update_refs && disk_bytenr > 0) {
- ret = btrfs_free_extent(trans, root,
- disk_bytenr, num_bytes, 0,
+ btrfs_init_generic_ref(&ref,
+ BTRFS_DROP_DELAYED_REF,
+ disk_bytenr, num_bytes, 0);
+ btrfs_init_data_ref(&ref,
root->root_key.objectid,
- key.objectid, key.offset -
- extent_offset);
+ key.objectid,
+ key.offset - extent_offset);
+ ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
- inode_sub_bytes(inode,
+ inode_sub_bytes(vfs_inode,
extent_end - key.offset);
}
@@ -1025,7 +961,7 @@
continue;
}
- BUG_ON(1);
+ BUG();
}
if (!ret && del_nr > 0) {
@@ -1050,7 +986,7 @@
if (!ret && replace_extent && leafs_visited == 1 &&
(path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
path->locks[0] == BTRFS_WRITE_LOCK) &&
- btrfs_leaf_free_space(fs_info, leaf) >=
+ btrfs_leaf_free_space(leaf) >=
sizeof(struct btrfs_item) + extent_item_size) {
key.objectid = ino;
@@ -1063,11 +999,7 @@
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key,
- &extent_item_size,
- extent_item_size,
- sizeof(struct btrfs_item) +
- extent_item_size, 1);
+ setup_items_for_insert(root, path, &key, &extent_item_size, 1);
*key_inserted = 1;
}
@@ -1088,8 +1020,8 @@
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
- drop_cache, 0, 0, NULL);
+ ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start,
+ end, NULL, drop_cache, 0, 0, NULL);
btrfs_free_path(path);
return ret;
}
@@ -1142,6 +1074,7 @@
struct extent_buffer *leaf;
struct btrfs_path *path;
struct btrfs_file_extent_item *fi;
+ struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
u64 bytenr;
@@ -1287,9 +1220,11 @@
extent_end - split);
btrfs_mark_buffer_dirty(leaf);
- ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
- 0, root->root_key.objectid,
- ino, orig_offset);
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
+ num_bytes, 0);
+ btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
+ orig_offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -1311,6 +1246,9 @@
other_start = end;
other_end = 0;
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
+ num_bytes, 0);
+ btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
if (extent_mergeable(leaf, path->slots[0] + 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
@@ -1321,9 +1259,7 @@
extent_end = other_end;
del_slot = path->slots[0] + 1;
del_nr++;
- ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
- 0, root->root_key.objectid,
- ino, orig_offset);
+ ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -1341,9 +1277,7 @@
key.offset = other_start;
del_slot = path->slots[0];
del_nr++;
- ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
- 0, root->root_key.objectid,
- ino, orig_offset);
+ ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -1481,9 +1415,7 @@
int ret = 0;
start_pos = round_down(pos, fs_info->sectorsize);
- last_pos = start_pos
- + round_up(pos + write_bytes - start_pos,
- fs_info->sectorsize) - 1;
+ last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
if (start_pos < inode->vfs_inode.i_size) {
struct btrfs_ordered_extent *ordered;
@@ -1493,7 +1425,7 @@
ordered = btrfs_lookup_ordered_range(inode, start_pos,
last_pos - start_pos + 1);
if (ordered &&
- ordered->file_offset + ordered->len > start_pos &&
+ ordered->file_offset + ordered->num_bytes > start_pos &&
ordered->file_offset <= last_pos) {
unlock_extent_cached(&inode->io_tree, start_pos,
last_pos, cached_state);
@@ -1501,8 +1433,7 @@
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_start_ordered_extent(&inode->vfs_inode,
- ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
return -EAGAIN;
}
@@ -1536,50 +1467,94 @@
return ret;
}
-static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
+static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes, bool nowait)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
- struct btrfs_ordered_extent *ordered;
u64 lockstart, lockend;
u64 num_bytes;
int ret;
- ret = btrfs_start_write_no_snapshotting(root);
- if (!ret)
- return -ENOSPC;
+ if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ return 0;
+
+ if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
+ return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
+ num_bytes = lockend - lockstart + 1;
- while (1) {
- lock_extent(&inode->io_tree, lockstart, lockend);
+ if (nowait) {
+ struct btrfs_ordered_extent *ordered;
+
+ if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
+ return -EAGAIN;
+
ordered = btrfs_lookup_ordered_range(inode, lockstart,
- lockend - lockstart + 1);
- if (!ordered) {
- break;
+ num_bytes);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ ret = -EAGAIN;
+ goto out_unlock;
}
- unlock_extent(&inode->io_tree, lockstart, lockend);
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
+ } else {
+ btrfs_lock_and_flush_ordered_range(inode, lockstart,
+ lockend, NULL);
}
- num_bytes = lockend - lockstart + 1;
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, false);
if (ret <= 0) {
ret = 0;
- btrfs_end_write_no_snapshotting(root);
+ if (!nowait)
+ btrfs_drew_write_unlock(&root->snapshot_lock);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
}
-
+out_unlock:
unlock_extent(&inode->io_tree, lockstart, lockend);
return ret;
+}
+
+static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes)
+{
+ return check_can_nocow(inode, pos, write_bytes, true);
+}
+
+/*
+ * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
+ *
+ * @pos: File offset
+ * @write_bytes: The length to write, will be updated to the nocow writeable
+ * range
+ *
+ * This function will flush ordered extents in the range to ensure proper
+ * nocow checks.
+ *
+ * Return:
+ * >0 and update @write_bytes if we can do nocow write
+ * 0 if we can't do nocow write
+ * -EAGAIN if we can't get the needed lock or there are ordered extents
+ * for * (nowait == true) case
+ * <0 if other error happened
+ *
+ * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
+ */
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes)
+{
+ return check_can_nocow(inode, pos, write_bytes, false);
+}
+
+void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
+{
+ btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}
static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
@@ -1589,7 +1564,6 @@
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
@@ -1610,8 +1584,8 @@
return -ENOMEM;
while (iov_iter_count(i) > 0) {
- size_t offset = pos & (PAGE_SIZE - 1);
struct extent_state *cached_state = NULL;
+ size_t offset = offset_in_page(pos);
size_t sector_offset;
size_t write_bytes = min(iov_iter_count(i),
nrptrs * (size_t)PAGE_SIZE -
@@ -1642,13 +1616,12 @@
fs_info->sectorsize);
extent_changeset_release(data_reserved);
- ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
+ &data_reserved, pos,
write_bytes);
if (ret < 0) {
- if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC)) &&
- check_can_nocow(BTRFS_I(inode), pos,
- &write_bytes) > 0) {
+ if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
+ &write_bytes) > 0) {
/*
* For nodata cow case, no need to reserve
* data space.
@@ -1673,11 +1646,11 @@
reserve_bytes);
if (ret) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(inode,
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, pos,
write_bytes);
else
- btrfs_end_write_no_snapshotting(root);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
break;
}
@@ -1747,7 +1720,7 @@
__pos = round_down(pos,
fs_info->sectorsize) +
(dirty_pages << PAGE_SHIFT);
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(BTRFS_I(inode),
data_reserved, __pos,
release_bytes, true);
}
@@ -1757,8 +1730,9 @@
fs_info->sectorsize);
if (copied > 0)
- ret = btrfs_dirty_pages(inode, pages, dirty_pages,
- pos, copied, &cached_state);
+ ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
+ dirty_pages, pos, copied,
+ &cached_state);
/*
* If we have not locked the extent range, because the range's
@@ -1781,7 +1755,7 @@
release_bytes = 0;
if (only_release_metadata)
- btrfs_end_write_no_snapshotting(root);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
if (only_release_metadata && copied > 0) {
lockstart = round_down(pos,
@@ -1799,8 +1773,6 @@
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
- if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1)
- btrfs_btree_balance_dirty(fs_info);
pos += copied;
num_written += copied;
@@ -1810,11 +1782,12 @@
if (release_bytes) {
if (only_release_metadata) {
- btrfs_end_write_no_snapshotting(root);
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
} else {
- btrfs_delalloc_release_space(inode, data_reserved,
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ data_reserved,
round_down(pos, fs_info->sectorsize),
release_bytes, true);
}
@@ -1834,7 +1807,7 @@
loff_t endbyte;
int err;
- written = generic_file_direct_write(iocb, from);
+ written = btrfs_direct_IO(iocb, from);
if (written < 0 || !iov_iter_count(from))
return written;
@@ -1888,11 +1861,10 @@
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
u64 start_pos;
u64 end_pos;
ssize_t num_written = 0;
- bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
+ const bool sync = iocb->ki_flags & IOCB_DSYNC;
ssize_t err;
loff_t pos;
size_t count;
@@ -1919,13 +1891,23 @@
pos = iocb->ki_pos;
count = iov_iter_count(from);
if (iocb->ki_flags & IOCB_NOWAIT) {
+ size_t nocow_bytes = count;
+
/*
* We will allocate space in case nodatacow is not set,
* so bail
*/
- if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC)) ||
- check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
+ if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes)
+ <= 0) {
+ inode_unlock(inode);
+ return -EAGAIN;
+ }
+ /*
+ * There are holes in the range or parts of the range that must
+ * be COWed (shared extents, RO block groups, etc), so just bail
+ * out.
+ */
+ if (nocow_bytes < count) {
inode_unlock(inode);
return -EAGAIN;
}
@@ -1977,7 +1959,40 @@
atomic_inc(&BTRFS_I(inode)->sync_writers);
if (iocb->ki_flags & IOCB_DIRECT) {
+ /*
+ * 1. We must always clear IOCB_DSYNC in order to not deadlock
+ * in iomap, as it calls generic_write_sync() in this case.
+ * 2. If we are async, we can call iomap_dio_complete() either
+ * in
+ *
+ * 2.1. A worker thread from the last bio completed. In this
+ * case we need to mark the btrfs_dio_data that it is
+ * async in order to call generic_write_sync() properly.
+ * This is handled by setting BTRFS_DIO_SYNC_STUB in the
+ * current->journal_info.
+ * 2.2 The submitter context, because all IO completed
+ * before we exited iomap_dio_rw(). In this case we can
+ * just re-set the IOCB_DSYNC on the iocb and we'll do
+ * the sync below. If our ->end_io() gets called and
+ * current->journal_info is set, then we know we're in
+ * our current context and we will clear
+ * current->journal_info to indicate that we need to
+ * sync below.
+ */
+ if (sync) {
+ ASSERT(current->journal_info == NULL);
+ iocb->ki_flags &= ~IOCB_DSYNC;
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
+ }
num_written = __btrfs_direct_write(iocb, from);
+
+ /*
+ * As stated above, we cleared journal_info, so we need to do
+ * the sync ourselves.
+ */
+ if (sync && current->journal_info == NULL)
+ iocb->ki_flags |= IOCB_DSYNC;
+ current->journal_info = NULL;
} else {
num_written = btrfs_buffered_write(iocb, from);
if (num_written > 0)
@@ -1989,14 +2004,8 @@
inode_unlock(inode);
- /*
- * We also have to set last_sub_trans to the current log transid,
- * otherwise subsequent syncs to a file that's been synced in this
- * transaction will appear to have already occurred.
- */
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->last_sub_trans = root->log_transid;
- spin_unlock(&BTRFS_I(inode)->lock);
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
+
if (num_written > 0)
num_written = generic_write_sync(iocb, num_written);
@@ -2017,12 +2026,12 @@
filp->private_data = NULL;
/*
- * ordered_data_close is set by settattr when we are about to truncate
- * a file from a non-zero size to a zero size. This tries to
- * flush down new bytes that may have been written if the
- * application were using truncate to replace a file in place.
+ * Set by setattr when we are about to truncate a file from a non-zero
+ * size to a zero size. This tries to flush down new bytes that may
+ * have been written if the application were using truncate to replace
+ * a file in place.
*/
- if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
return 0;
@@ -2048,6 +2057,30 @@
return ret;
}
+static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_inode *inode = BTRFS_I(ctx->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (btrfs_inode_in_log(inode, fs_info->generation) &&
+ list_empty(&ctx->ordered_extents))
+ return true;
+
+ /*
+ * If we are doing a fast fsync we can not bail out if the inode's
+ * last_trans is <= then the last committed transaction, because we only
+ * update the last_trans of the inode during ordered extent completion,
+ * and for a fast fsync we don't wait for that, we only wait for the
+ * writeback to complete.
+ */
+ if (inode->last_trans <= fs_info->last_trans_committed &&
+ (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
+ list_empty(&ctx->ordered_extents)))
+ return true;
+
+ return false;
+}
+
/*
* fsync call for both files and directories. This logs the inode into
* the tree log instead of forcing full commits whenever possible.
@@ -2063,25 +2096,28 @@
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
+ u64 len;
+ bool full_sync;
trace_btrfs_sync_file(file, datasync);
btrfs_init_log_ctx(&ctx, inode);
/*
- * Set the range to full if the NO_HOLES feature is not enabled.
- * This is to avoid missing file extent items representing holes after
- * replaying the log.
+ * Always set the range to a full range, otherwise we can get into
+ * several problems, from missing file extent items to represent holes
+ * when not using the NO_HOLES feature, to log tree corruption due to
+ * races between hole detection during logging and completion of ordered
+ * extents outside the range, to missing checksums due to ordered extents
+ * for which we flushed only a subset of their pages.
*/
- if (!btrfs_fs_incompat(fs_info, NO_HOLES)) {
- start = 0;
- end = LLONG_MAX;
- }
+ start = 0;
+ end = LLONG_MAX;
+ len = (u64)LLONG_MAX + 1;
/*
* We write the dirty pages in the range and wait until they complete
@@ -2105,17 +2141,12 @@
atomic_inc(&root->log_batch);
/*
- * If the inode needs a full sync, make sure we use a full range to
- * avoid log tree corruption, due to hole detection racing with ordered
- * extent completion for adjacent ranges, and assertion failures during
- * hole detection. Do this while holding the inode lock, to avoid races
- * with other tasks.
+ * Always check for the full sync flag while holding the inode's lock,
+ * to avoid races with other tasks. The flag must be either set all the
+ * time during logging or always off all the time while logging.
*/
- if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- start = 0;
- end = LLONG_MAX;
- }
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
/*
* Before we acquired the inode's lock, someone may have dirtied more
@@ -2144,22 +2175,35 @@
/*
* We have to do this here to avoid the priority inversion of waiting on
- * IO of a lower priority task while holding a transaciton open.
+ * IO of a lower priority task while holding a transaction open.
*
- * Also, the range length can be represented by u64, we have to do the
- * typecasts to avoid signed overflow if it's [0, LLONG_MAX].
+ * For a full fsync we wait for the ordered extents to complete while
+ * for a fast fsync we wait just for writeback to complete, and then
+ * attach the ordered extents to the transaction so that a transaction
+ * commit waits for their completion, to avoid data loss if we fsync,
+ * the current transaction commits before the ordered extents complete
+ * and a power failure happens right after that.
*/
- ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1);
- if (ret) {
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ if (full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ } else {
+ /*
+ * Get our ordered extents as soon as possible to avoid doing
+ * checksum lookups in the csum tree, and use instead the
+ * checksums attached to the ordered extents.
+ */
+ btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
+ &ctx.ordered_extents);
+ ret = filemap_fdatawait_range(inode->i_mapping, start, end);
}
+
+ if (ret)
+ goto out_release_extents;
+
atomic_inc(&root->log_batch);
smp_mb();
- if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
- BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) {
+ if (skip_inode_logging(&ctx)) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -2175,9 +2219,7 @@
* checked called fsync.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
/*
@@ -2187,20 +2229,18 @@
* here we could get into a situation where we're waiting on IO to
* happen that is blocked on a transaction trying to commit. With start
* we inc the extwriter counter, so we wait for all extwriters to exit
- * before we start blocking join'ers. This comment is to keep somebody
+ * before we start blocking joiners. This comment is to keep somebody
* from thinking they are super smart and changing this to
* btrfs_join_transaction *cough*Josef*cough*.
*/
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
- trans->sync = true;
- ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
+ ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
+ btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -2219,30 +2259,18 @@
up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
- /*
- * If any of the ordered extents had an error, just return it to user
- * space, so that the application knows some writes didn't succeed and
- * can take proper action (retry for e.g.). Blindly committing the
- * transaction in this case, would fool userspace that everything was
- * successful. And we also want to make sure our log doesn't contain
- * file extent items pointing to extents that weren't fully written to -
- * just like in the non fast fsync path, where we check for the ordered
- * operation's error flag before writing to the log tree and return -EIO
- * if any of them had this flag set (btrfs_wait_ordered_range) -
- * therefore we need to check for errors in the ordered operations,
- * which are indicated by ctx.io_err.
- */
- if (ctx.io_err) {
- btrfs_end_transaction(trans);
- ret = ctx.io_err;
- goto out;
- }
-
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) {
ret = btrfs_sync_log(trans, root, &ctx);
if (!ret) {
ret = btrfs_end_transaction(trans);
+ goto out;
+ }
+ }
+ if (!full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ if (ret) {
+ btrfs_end_transaction(trans);
goto out;
}
}
@@ -2256,6 +2284,12 @@
if (!ret)
ret = err;
return ret > 0 ? -EIO : ret;
+
+out_release_extents:
+ btrfs_release_log_ctx_extents(&ctx);
+ up_write(&BTRFS_I(inode)->dio_sem);
+ inode_unlock(inode);
+ goto out;
}
static const struct vm_operations_struct btrfs_file_vm_ops = {
@@ -2391,7 +2425,6 @@
hole_em->block_start = EXTENT_MAP_HOLE;
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
- hole_em->bdev = fs_info->fs_devices->latest_bdev;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
@@ -2424,7 +2457,7 @@
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
round_down(*start, fs_info->sectorsize),
- round_up(*len, fs_info->sectorsize), 0);
+ round_up(*len, fs_info->sectorsize));
if (IS_ERR(em))
return PTR_ERR(em);
@@ -2452,7 +2485,8 @@
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
+ lockend);
/*
* We need to make sure we have no ordered extents in this range
@@ -2460,7 +2494,7 @@
* we need to try again.
*/
if ((!ordered ||
- (ordered->file_offset + ordered->len <= lockstart ||
+ (ordered->file_offset + ordered->num_bytes <= lockstart ||
ordered->file_offset > lockend)) &&
!filemap_range_has_page(inode->i_mapping,
lockstart, lockend)) {
@@ -2480,27 +2514,340 @@
return 0;
}
-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_replace_extent_info *extent_info,
+ const u64 replace_len)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_file_extent_item *extent;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int slot;
+ struct btrfs_ref ref = { 0 };
+ int ret;
+
+ if (replace_len == 0)
+ return 0;
+
+ if (extent_info->disk_offset == 0 &&
+ btrfs_fs_incompat(fs_info, NO_HOLES))
+ return 0;
+
+ key.objectid = btrfs_ino(BTRFS_I(inode));
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = extent_info->file_offset;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(struct btrfs_file_extent_item));
+ if (ret)
+ return ret;
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ write_extent_buffer(leaf, extent_info->extent_buf,
+ btrfs_item_ptr_offset(leaf, slot),
+ sizeof(struct btrfs_file_extent_item));
+ extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
+ btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
+ if (extent_info->is_new_extent)
+ btrfs_set_file_extent_generation(leaf, extent, trans->transid);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+ extent_info->file_offset, replace_len);
+ if (ret)
+ return ret;
+
+ /* If it's a hole, nothing more needs to be done. */
+ if (extent_info->disk_offset == 0)
+ return 0;
+
+ inode_add_bytes(inode, replace_len);
+
+ if (extent_info->is_new_extent && extent_info->insertions == 0) {
+ key.objectid = extent_info->disk_offset;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = extent_info->disk_len;
+ ret = btrfs_alloc_reserved_file_extent(trans, root,
+ btrfs_ino(BTRFS_I(inode)),
+ extent_info->file_offset,
+ extent_info->qgroup_reserved,
+ &key);
+ } else {
+ u64 ref_offset;
+
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+ extent_info->disk_offset,
+ extent_info->disk_len, 0);
+ ref_offset = extent_info->file_offset - extent_info->data_offset;
+ btrfs_init_data_ref(&ref, root->root_key.objectid,
+ btrfs_ino(BTRFS_I(inode)), ref_offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ }
+
+ extent_info->insertions++;
+
+ return ret;
+}
+
+/*
+ * The respective range must have been previously locked, as well as the inode.
+ * The end offset is inclusive (last byte of the range).
+ * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
+ * the file range with an extent.
+ * When not punching a hole, we don't want to end up in a state where we dropped
+ * extents without inserting a new one, so we must abort the transaction to avoid
+ * a corruption.
+ */
+int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
+ const u64 start, const u64 end,
+ struct btrfs_replace_extent_info *extent_info,
+ struct btrfs_trans_handle **trans_out)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
+ u64 ino_size = round_up(inode->i_size, fs_info->sectorsize);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_block_rsv *rsv;
+ unsigned int rsv_count;
+ u64 cur_offset;
+ u64 drop_end;
+ u64 len = end - start;
+ int ret = 0;
+
+ if (end <= start)
+ return -EINVAL;
+
+ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
+ rsv->failfast = 1;
+
+ /*
+ * 1 - update the inode
+ * 1 - removing the extents in the range
+ * 1 - adding the hole extent if no_holes isn't set or if we are
+ * replacing the range with a new extent
+ */
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
+ rsv_count = 3;
+ else
+ rsv_count = 2;
+
+ trans = btrfs_start_transaction(root, rsv_count);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out_free;
+ }
+
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
+ min_size, false);
+ BUG_ON(ret);
+ trans->block_rsv = rsv;
+
+ cur_offset = start;
+ while (cur_offset < end) {
+ ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path,
+ cur_offset, end + 1, &drop_end,
+ 1, 0, 0, NULL);
+ if (ret != -ENOSPC) {
+ /*
+ * The only time we don't want to abort is if we are
+ * attempting to clone a partial inline extent, in which
+ * case we'll get EOPNOTSUPP. However if we aren't
+ * clone we need to abort no matter what, because if we
+ * got EOPNOTSUPP via prealloc then we messed up and
+ * need to abort.
+ */
+ if (ret &&
+ (ret != -EOPNOTSUPP ||
+ (extent_info && extent_info->is_new_extent)))
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+
+ trans->block_rsv = &fs_info->trans_block_rsv;
+
+ if (!extent_info && cur_offset < drop_end &&
+ cur_offset < ino_size) {
+ ret = fill_holes(trans, BTRFS_I(inode), path,
+ cur_offset, drop_end);
+ if (ret) {
+ /*
+ * If we failed then we didn't insert our hole
+ * entries for the area we dropped, so now the
+ * fs is corrupted, so we must abort the
+ * transaction.
+ */
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ } else if (!extent_info && cur_offset < drop_end) {
+ /*
+ * We are past the i_size here, but since we didn't
+ * insert holes we need to clear the mapped area so we
+ * know to not set disk_i_size in this area until a new
+ * file extent is inserted here.
+ */
+ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+ cur_offset, drop_end - cur_offset);
+ if (ret) {
+ /*
+ * We couldn't clear our area, so we could
+ * presumably adjust up and corrupt the fs, so
+ * we need to abort.
+ */
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ }
+
+ if (extent_info && drop_end > extent_info->file_offset) {
+ u64 replace_len = drop_end - extent_info->file_offset;
+
+ ret = btrfs_insert_replace_extent(trans, inode, path,
+ extent_info, replace_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ extent_info->data_len -= replace_len;
+ extent_info->data_offset += replace_len;
+ extent_info->file_offset += replace_len;
+ }
+
+ cur_offset = drop_end;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ break;
+
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+
+ trans = btrfs_start_transaction(root, rsv_count);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
+ rsv, min_size, false);
+ BUG_ON(ret); /* shouldn't happen */
+ trans->block_rsv = rsv;
+
+ if (!extent_info) {
+ ret = find_first_non_hole(inode, &cur_offset, &len);
+ if (unlikely(ret < 0))
+ break;
+ if (ret && !len) {
+ ret = 0;
+ break;
+ }
+ }
+ }
+
+ /*
+ * If we were cloning, force the next fsync to be a full one since we
+ * we replaced (or just dropped in the case of cloning holes when
+ * NO_HOLES is enabled) extents and extent maps.
+ * This is for the sake of simplicity, and cloning into files larger
+ * than 16Mb would force the full fsync any way (when
+ * try_release_extent_mapping() is invoked during page cache truncation.
+ */
+ if (extent_info && !extent_info->is_new_extent)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+
+ if (ret)
+ goto out_trans;
+
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ /*
+ * If we are using the NO_HOLES feature we might have had already an
+ * hole that overlaps a part of the region [lockstart, lockend] and
+ * ends at (or beyond) lockend. Since we have no file extent items to
+ * represent holes, drop_end can be less than lockend and so we must
+ * make sure we have an extent map representing the existing hole (the
+ * call to __btrfs_drop_extents() might have dropped the existing extent
+ * map representing the existing hole), otherwise the fast fsync path
+ * will not record the existence of the hole region
+ * [existing_hole_start, lockend].
+ */
+ if (drop_end <= end)
+ drop_end = end + 1;
+ /*
+ * Don't insert file hole extent item if it's for a range beyond eof
+ * (because it's useless) or if it represents a 0 bytes range (when
+ * cur_offset == drop_end).
+ */
+ if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) {
+ ret = fill_holes(trans, BTRFS_I(inode), path,
+ cur_offset, drop_end);
+ if (ret) {
+ /* Same comment as above. */
+ btrfs_abort_transaction(trans, ret);
+ goto out_trans;
+ }
+ } else if (!extent_info && cur_offset < drop_end) {
+ /* See the comment in the loop above for the reasoning here. */
+ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+ cur_offset, drop_end - cur_offset);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_trans;
+ }
+
+ }
+ if (extent_info) {
+ ret = btrfs_insert_replace_extent(trans, inode, path, extent_info,
+ extent_info->data_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_trans;
+ }
+ }
+
+out_trans:
+ if (!trans)
+ goto out_free;
+
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ if (ret)
+ btrfs_end_transaction(trans);
+ else
+ *trans_out = trans;
+out_free:
+ btrfs_free_block_rsv(fs_info, rsv);
+out:
+ return ret;
+}
+
+static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
+{
+ struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
- struct btrfs_block_rsv *rsv;
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
u64 lockstart;
u64 lockend;
u64 tail_start;
u64 tail_len;
u64 orig_start = offset;
- u64 cur_offset;
- u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1);
- u64 drop_end;
int ret = 0;
- int err = 0;
- unsigned int rsv_count;
bool same_block;
- bool no_holes = btrfs_fs_incompat(fs_info, NO_HOLES);
u64 ino_size;
bool truncated_block = false;
bool updated_inode = false;
@@ -2520,9 +2867,13 @@
goto out_only_mutex;
}
- lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
+ ret = file_modified(file);
+ if (ret)
+ goto out_only_mutex;
+
+ lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
lockend = round_down(offset + len,
- btrfs_inode_sectorsize(inode)) - 1;
+ btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
@@ -2607,145 +2958,24 @@
goto out;
}
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv) {
- ret = -ENOMEM;
- goto out_free;
- }
- rsv->size = btrfs_calc_trans_metadata_size(fs_info, 1);
- rsv->failfast = 1;
+ ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL,
+ &trans);
+ btrfs_free_path(path);
+ if (ret)
+ goto out;
- /*
- * 1 - update the inode
- * 1 - removing the extents in the range
- * 1 - adding the hole extent if no_holes isn't set
- */
- rsv_count = no_holes ? 2 : 3;
- trans = btrfs_start_transaction(root, rsv_count);
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
- goto out_free;
- }
-
- ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
- min_size, 0);
- BUG_ON(ret);
- trans->block_rsv = rsv;
-
- cur_offset = lockstart;
- len = lockend - cur_offset;
- while (cur_offset < lockend) {
- ret = __btrfs_drop_extents(trans, root, inode, path,
- cur_offset, lockend + 1,
- &drop_end, 1, 0, 0, NULL);
- if (ret != -ENOSPC)
- break;
-
- trans->block_rsv = &fs_info->trans_block_rsv;
-
- if (cur_offset < drop_end && cur_offset < ino_size) {
- ret = fill_holes(trans, BTRFS_I(inode), path,
- cur_offset, drop_end);
- if (ret) {
- /*
- * If we failed then we didn't insert our hole
- * entries for the area we dropped, so now the
- * fs is corrupted, so we must abort the
- * transaction.
- */
- btrfs_abort_transaction(trans, ret);
- err = ret;
- break;
- }
- }
-
- cur_offset = drop_end;
-
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- err = ret;
- break;
- }
-
- btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(fs_info);
-
- trans = btrfs_start_transaction(root, rsv_count);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- break;
- }
-
- ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, 0);
- BUG_ON(ret); /* shouldn't happen */
- trans->block_rsv = rsv;
-
- ret = find_first_non_hole(inode, &cur_offset, &len);
- if (unlikely(ret < 0))
- break;
- if (ret && !len) {
- ret = 0;
- break;
- }
- }
-
- if (ret) {
- err = ret;
- goto out_trans;
- }
-
- trans->block_rsv = &fs_info->trans_block_rsv;
- /*
- * If we are using the NO_HOLES feature we might have had already an
- * hole that overlaps a part of the region [lockstart, lockend] and
- * ends at (or beyond) lockend. Since we have no file extent items to
- * represent holes, drop_end can be less than lockend and so we must
- * make sure we have an extent map representing the existing hole (the
- * call to __btrfs_drop_extents() might have dropped the existing extent
- * map representing the existing hole), otherwise the fast fsync path
- * will not record the existence of the hole region
- * [existing_hole_start, lockend].
- */
- if (drop_end <= lockend)
- drop_end = lockend + 1;
- /*
- * Don't insert file hole extent item if it's for a range beyond eof
- * (because it's useless) or if it represents a 0 bytes range (when
- * cur_offset == drop_end).
- */
- if (cur_offset < ino_size && cur_offset < drop_end) {
- ret = fill_holes(trans, BTRFS_I(inode), path,
- cur_offset, drop_end);
- if (ret) {
- /* Same comment as above. */
- btrfs_abort_transaction(trans, ret);
- err = ret;
- goto out_trans;
- }
- }
-
-out_trans:
- if (!trans)
- goto out_free;
-
+ ASSERT(trans != NULL);
inode_inc_iversion(inode);
inode->i_mtime = inode->i_ctime = current_time(inode);
-
- trans->block_rsv = &fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
updated_inode = true;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
-out_free:
- btrfs_free_path(path);
- btrfs_free_block_rsv(fs_info, rsv);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
out_only_mutex:
- if (!updated_inode && truncated_block && !ret && !err) {
+ if (!updated_inode && truncated_block && !ret) {
/*
* If we only end up zeroing part of a page, we still need to
* update the inode item, so that all the time fields are
@@ -2760,16 +2990,18 @@
inode->i_ctime = now;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
} else {
- err = btrfs_update_inode(trans, root, inode);
- ret = btrfs_end_transaction(trans);
+ int ret2;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ ret2 = btrfs_end_transaction(trans);
+ if (!ret)
+ ret = ret2;
}
}
inode_unlock(inode);
- if (ret && !err)
- err = ret;
- return err;
+ return ret;
}
/* Helper structure to record which range is already reserved */
@@ -2830,7 +3062,7 @@
inode->i_ctime = current_time(inode);
i_size_write(inode, end);
- btrfs_ordered_update_i_size(inode, end, NULL);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode(trans, root, inode);
ret2 = btrfs_end_transaction(trans);
@@ -2838,12 +3070,12 @@
}
enum {
- RANGE_BOUNDARY_WRITTEN_EXTENT = 0,
- RANGE_BOUNDARY_PREALLOC_EXTENT = 1,
- RANGE_BOUNDARY_HOLE = 2,
+ RANGE_BOUNDARY_WRITTEN_EXTENT,
+ RANGE_BOUNDARY_PREALLOC_EXTENT,
+ RANGE_BOUNDARY_HOLE,
};
-static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
u64 offset)
{
const u64 sectorsize = btrfs_inode_sectorsize(inode);
@@ -2851,7 +3083,7 @@
int ret;
offset = round_down(offset, sectorsize);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -2876,7 +3108,7 @@
struct extent_changeset *data_reserved = NULL;
int ret;
u64 alloc_hint = 0;
- const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
@@ -2884,8 +3116,8 @@
inode_dio_wait(inode);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
- alloc_start, alloc_end - alloc_start, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
+ alloc_end - alloc_start);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -2928,8 +3160,8 @@
if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
- alloc_start, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
+ sectorsize);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -2966,7 +3198,8 @@
* to cover them.
*/
if (!IS_ALIGNED(offset, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode, offset);
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
+ offset);
if (ret < 0)
goto out;
if (ret == RANGE_BOUNDARY_HOLE) {
@@ -2982,7 +3215,7 @@
}
if (!IS_ALIGNED(offset + len, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode,
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
offset + len);
if (ret < 0)
goto out;
@@ -3014,7 +3247,7 @@
&cached_state);
if (ret)
goto out;
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
if (ret) {
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
@@ -3036,7 +3269,7 @@
ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
out:
if (ret && space_reserved)
- btrfs_free_reserved_data_space(inode, data_reserved,
+ btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
alloc_start, bytes_to_reserve);
extent_changeset_free(data_reserved);
@@ -3060,7 +3293,7 @@
u64 locked_end;
u64 actual_end = 0;
struct extent_map *em;
- int blocksize = btrfs_inode_sectorsize(inode);
+ int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
int ret;
alloc_start = round_down(offset, blocksize);
@@ -3073,7 +3306,7 @@
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
- return btrfs_punch_hole(inode, offset, len);
+ return btrfs_punch_hole(file, offset, len);
/*
* Only trigger disk allocation, don't trigger qgroup reserve
@@ -3094,6 +3327,10 @@
if (ret)
goto out;
}
+
+ ret = file_modified(file);
+ if (ret)
+ goto out;
/*
* TODO: Move these two operations after we have checked
@@ -3142,10 +3379,11 @@
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
locked_end, &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
+ locked_end);
if (ordered &&
- ordered->file_offset + ordered->len > alloc_start &&
+ ordered->file_offset + ordered->num_bytes > alloc_start &&
ordered->file_offset < alloc_end) {
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
@@ -3170,7 +3408,7 @@
INIT_LIST_HEAD(&reserve_list);
while (cur_offset < alloc_end) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
- alloc_end - cur_offset, 0);
+ alloc_end - cur_offset);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
break;
@@ -3187,8 +3425,9 @@
free_extent_map(em);
break;
}
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
- cur_offset, last_byte - cur_offset);
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
+ &data_reserved, cur_offset,
+ last_byte - cur_offset);
if (ret < 0) {
cur_offset = last_byte;
free_extent_map(em);
@@ -3200,8 +3439,9 @@
* range, free reserved data space first, otherwise
* it'll result in false ENOSPC error.
*/
- btrfs_free_reserved_data_space(inode, data_reserved,
- cur_offset, last_byte - cur_offset);
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ data_reserved, cur_offset,
+ last_byte - cur_offset);
}
free_extent_map(em);
cur_offset = last_byte;
@@ -3218,7 +3458,7 @@
range->len, i_blocksize(inode),
offset + len, &alloc_hint);
else
- btrfs_free_reserved_data_space(inode,
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, range->start,
range->len);
list_del(&range->list);
@@ -3239,35 +3479,36 @@
inode_unlock(inode);
/* Let go of our reservation. */
if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
- btrfs_free_reserved_data_space(inode, data_reserved,
+ btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
cur_offset, alloc_end - cur_offset);
extent_changeset_free(data_reserved);
return ret;
}
-static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
+static loff_t find_desired_extent(struct inode *inode, loff_t offset,
+ int whence)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
+ loff_t i_size = inode->i_size;
u64 lockstart;
u64 lockend;
u64 start;
u64 len;
int ret = 0;
- if (inode->i_size == 0)
+ if (i_size == 0 || offset >= i_size)
return -ENXIO;
/*
- * *offset can be negative, in this case we start finding DATA/HOLE from
+ * offset can be negative, in this case we start finding DATA/HOLE from
* the very start of the file.
*/
- start = max_t(loff_t, 0, *offset);
+ start = max_t(loff_t, 0, offset);
lockstart = round_down(start, fs_info->sectorsize);
- lockend = round_up(i_size_read(inode),
- fs_info->sectorsize);
+ lockend = round_up(i_size, fs_info->sectorsize);
if (lockend <= lockstart)
lockend = lockstart + fs_info->sectorsize;
lockend--;
@@ -3276,9 +3517,8 @@
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
- while (start < inode->i_size) {
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0,
- start, len, 0);
+ while (start < i_size) {
+ em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
em = NULL;
@@ -3300,59 +3540,71 @@
cond_resched();
}
free_extent_map(em);
- if (!ret) {
- if (whence == SEEK_DATA && start >= inode->i_size)
- ret = -ENXIO;
- else
- *offset = min_t(loff_t, start, inode->i_size);
- }
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
- return ret;
+ if (ret) {
+ offset = ret;
+ } else {
+ if (whence == SEEK_DATA && start >= i_size)
+ offset = -ENXIO;
+ else
+ offset = min_t(loff_t, start, i_size);
+ }
+
+ return offset;
}
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- int ret;
- inode_lock(inode);
switch (whence) {
- case SEEK_END:
- case SEEK_CUR:
- offset = generic_file_llseek(file, offset, whence);
- goto out;
+ default:
+ return generic_file_llseek(file, offset, whence);
case SEEK_DATA:
case SEEK_HOLE:
- if (offset >= i_size_read(inode)) {
- inode_unlock(inode);
- return -ENXIO;
- }
-
- ret = find_desired_extent(inode, &offset, whence);
- if (ret) {
- inode_unlock(inode);
- return ret;
- }
+ inode_lock_shared(inode);
+ offset = find_desired_extent(inode, offset, whence);
+ inode_unlock_shared(inode);
+ break;
}
- offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
-out:
- inode_unlock(inode);
- return offset;
+ if (offset < 0)
+ return offset;
+
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
- filp->f_mode |= FMODE_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
return generic_file_open(inode, filp);
+}
+
+static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret = 0;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ inode_lock_shared(inode);
+ ret = btrfs_direct_IO(iocb, to);
+ inode_unlock_shared(inode);
+ if (ret < 0 || !iov_iter_count(to) ||
+ iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
+ return ret;
+ }
+
+ return generic_file_buffered_read(iocb, to, ret);
}
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = btrfs_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
+ .splice_write = iter_file_splice_write,
.mmap = btrfs_file_mmap,
.open = btrfs_file_open,
.release = btrfs_release_file,
@@ -3362,8 +3614,7 @@
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_compat_ioctl,
#endif
- .clone_file_range = btrfs_clone_file_range,
- .dedupe_file_range = btrfs_dedupe_file_range,
+ .remap_file_range = btrfs_remap_file_range,
};
void __cold btrfs_auto_defrag_exit(void)
--
Gitblit v1.6.2