From 95099d4622f8cb224d94e314c7a8e0df60b13f87 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Sat, 09 Dec 2023 08:38:01 +0000
Subject: [PATCH] enable docker ppp
---
kernel/fs/btrfs/send.c | 974 ++++++++++++++++++++++++++++++++++++++++++---------------
1 files changed, 719 insertions(+), 255 deletions(-)
diff --git a/kernel/fs/btrfs/send.c b/kernel/fs/btrfs/send.c
index eb2f8e8..4a6ba09 100644
--- a/kernel/fs/btrfs/send.c
+++ b/kernel/fs/btrfs/send.c
@@ -122,8 +122,6 @@
struct file_ra_state ra;
- char *read_buf;
-
/*
* We process inodes by their increasing order, so if before an
* incremental send we reverse the parent/child relationship of
@@ -268,6 +266,16 @@
int need_later_update;
int name_len;
char name[];
+};
+
+#define ADVANCE 1
+#define ADVANCE_ONLY_NEXT -1
+
+enum btrfs_compare_tree_result {
+ BTRFS_COMPARE_TREE_NEW,
+ BTRFS_COMPARE_TREE_DELETED,
+ BTRFS_COMPARE_TREE_CHANGED,
+ BTRFS_COMPARE_TREE_SAME,
};
__cold
@@ -570,8 +578,8 @@
return -EOVERFLOW;
hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
- hdr->tlv_type = cpu_to_le16(attr);
- hdr->tlv_len = cpu_to_le16(len);
+ put_unaligned_le16(attr, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
memcpy(hdr + 1, data, len);
sctx->send_size += total_len;
@@ -681,7 +689,7 @@
sctx->send_size += sizeof(*hdr);
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->cmd = cpu_to_le16(cmd);
+ put_unaligned_le16(cmd, &hdr->cmd);
return 0;
}
@@ -693,17 +701,17 @@
u32 crc;
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
- hdr->crc = 0;
+ put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
+ put_unaligned_le32(0, &hdr->crc);
- crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
- hdr->crc = cpu_to_le32(crc);
+ crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+ put_unaligned_le32(crc, &hdr->crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
&sctx->send_off);
sctx->total_send_size += sctx->send_size;
- sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
+ sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size;
sctx->send_size = 0;
return ret;
@@ -1170,7 +1178,6 @@
struct backref_ctx {
struct send_ctx *sctx;
- struct btrfs_path *path;
/* number of total found references */
u64 found;
@@ -1196,9 +1203,9 @@
u64 root = (u64)(uintptr_t)key;
struct clone_root *cr = (struct clone_root *)elt;
- if (root < cr->root->objectid)
+ if (root < cr->root->root_key.objectid)
return -1;
- if (root > cr->root->objectid)
+ if (root > cr->root->root_key.objectid)
return 1;
return 0;
}
@@ -1208,9 +1215,9 @@
struct clone_root *cr1 = (struct clone_root *)e1;
struct clone_root *cr2 = (struct clone_root *)e2;
- if (cr1->root->objectid < cr2->root->objectid)
+ if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
return -1;
- if (cr1->root->objectid > cr2->root->objectid)
+ if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
return 1;
return 0;
}
@@ -1223,8 +1230,6 @@
{
struct backref_ctx *bctx = ctx_;
struct clone_root *found;
- int ret;
- u64 i_size;
/* First check if the root is in the list of accepted clone sources */
found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
@@ -1241,30 +1246,26 @@
}
/*
- * There are inodes that have extents that lie behind its i_size. Don't
- * accept clones from these extents.
- */
- ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
- NULL, NULL, NULL);
- btrfs_release_path(bctx->path);
- if (ret < 0)
- return ret;
-
- if (offset + bctx->data_offset + bctx->extent_len > i_size)
- return 0;
-
- /*
* Make sure we don't consider clones from send_root that are
* behind the current inode/offset.
*/
if (found->root == bctx->sctx->send_root) {
/*
- * TODO for the moment we don't accept clones from the inode
- * that is currently send. We may change this when
- * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
- * file.
+ * If the source inode was not yet processed we can't issue a
+ * clone operation, as the source extent does not exist yet at
+ * the destination of the stream.
*/
- if (ino >= bctx->cur_objectid)
+ if (ino > bctx->cur_objectid)
+ return 0;
+ /*
+ * We clone from the inode currently being sent as long as the
+ * source extent is already processed, otherwise we could try
+ * to clone from an extent that does not exist yet at the
+ * destination of the stream.
+ */
+ if (ino == bctx->cur_objectid &&
+ offset + bctx->extent_len >
+ bctx->sctx->cur_inode_next_write_offset)
return 0;
}
@@ -1329,8 +1330,6 @@
ret = -ENOMEM;
goto out;
}
-
- backref_ctx->path = tmp_path;
if (data_offset >= ino_size) {
/*
@@ -1718,12 +1717,8 @@
di = btrfs_lookup_dir_item(NULL, root, path,
dir, name, name_len, 0);
- if (!di) {
- ret = -ENOENT;
- goto out;
- }
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
@@ -2267,7 +2262,7 @@
* inodes "orphan" name instead of the real name and stop. Same with new inodes
* that were not created yet and overwritten inodes/refs.
*
- * When do we have have orphan inodes:
+ * When do we have orphan inodes:
* 1. When an inode is freshly created and thus no valid refs are available yet
* 2. When a directory lost all it's refs (deleted) but still has dir items
* inside which were not processed yet (pending for move/delete). If anyone
@@ -2371,7 +2366,7 @@
return -ENOMEM;
}
- key.objectid = send_root->objectid;
+ key.objectid = send_root->root_key.objectid;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = 0;
@@ -2387,7 +2382,7 @@
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.type != BTRFS_ROOT_BACKREF_KEY ||
- key.objectid != send_root->objectid) {
+ key.objectid != send_root->root_key.objectid) {
ret = -ENOENT;
goto out;
}
@@ -3957,52 +3952,56 @@
goto out;
}
+ /*
+ * Before doing any rename and link operations, do a first pass on the
+ * new references to orphanize any unprocessed inodes that may have a
+ * reference that conflicts with one of the new references of the current
+ * inode. This needs to happen first because a new reference may conflict
+ * with the old reference of a parent directory, so we must make sure
+ * that the path used for link and rename commands don't use an
+ * orphanized name when an ancestor was not yet orphanized.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir/ (ino 259)
+ * | |----- a (ino 257)
+ * |
+ * |----- b (ino 258)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir_2/ (ino 259)
+ * | |----- a (ino 260)
+ * |
+ * |----- testdir (ino 257)
+ * |----- b (ino 257)
+ * |----- b2 (ino 258)
+ *
+ * Processing the new reference for inode 257 with name "b" may happen
+ * before processing the new reference with name "testdir". If so, we
+ * must make sure that by the time we send a link command to create the
+ * hard link "b", inode 259 was already orphanized, since the generated
+ * path in "valid_path" already contains the orphanized name for 259.
+ * We are processing inode 257, so only later when processing 259 we do
+ * the rename operation to change its temporary (orphanized) name to
+ * "testdir_2".
+ */
list_for_each_entry(cur, &sctx->new_refs, list) {
- /*
- * We may have refs where the parent directory does not exist
- * yet. This happens if the parent directories inum is higher
- * the the current inum. To handle this case, we create the
- * parent directory out of order. But we need to check if this
- * did already happen before due to other refs in the same dir.
- */
ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- if (ret == inode_state_will_create) {
- ret = 0;
- /*
- * First check if any of the current inodes refs did
- * already create the dir.
- */
- list_for_each_entry(cur2, &sctx->new_refs, list) {
- if (cur == cur2)
- break;
- if (cur2->dir == cur->dir) {
- ret = 1;
- break;
- }
- }
-
- /*
- * If that did not happen, check if a previous inode
- * did already create the dir.
- */
- if (!ret)
- ret = did_create_dir(sctx, cur->dir);
- if (ret < 0)
- goto out;
- if (!ret) {
- ret = send_create_inode(sctx, cur->dir);
- if (ret < 0)
- goto out;
- }
- }
+ if (ret == inode_state_will_create)
+ continue;
/*
- * Check if this new ref would overwrite the first ref of
- * another unprocessed inode. If yes, orphanize the
- * overwritten inode. If we find an overwritten ref that is
- * not the first ref, simply unlink it.
+ * Check if this new ref would overwrite the first ref of another
+ * unprocessed inode. If yes, orphanize the overwritten inode.
+ * If we find an overwritten ref that is not the first ref,
+ * simply unlink it.
*/
ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
cur->name, cur->name_len,
@@ -4093,6 +4092,49 @@
goto out;
}
ret = send_unlink(sctx, cur->full_path);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
+ }
+
+ list_for_each_entry(cur, &sctx->new_refs, list) {
+ /*
+ * We may have refs where the parent directory does not exist
+ * yet. This happens if the parent directories inum is higher
+ * than the current inum. To handle this case, we create the
+ * parent directory out of order. But we need to check if this
+ * did already happen before due to other refs in the same dir.
+ */
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ if (ret == inode_state_will_create) {
+ ret = 0;
+ /*
+ * First check if any of the current inodes refs did
+ * already create the dir.
+ */
+ list_for_each_entry(cur2, &sctx->new_refs, list) {
+ if (cur == cur2)
+ break;
+ if (cur2->dir == cur->dir) {
+ ret = 1;
+ break;
+ }
+ }
+
+ /*
+ * If that did not happen, check if a previous inode
+ * did already create the dir.
+ */
+ if (!ret)
+ ret = did_create_dir(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = send_create_inode(sctx, cur->dir);
if (ret < 0)
goto out;
}
@@ -4893,35 +4935,43 @@
return ret;
}
-static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+static inline u64 max_send_read_size(const struct send_ctx *sctx)
+{
+ return sctx->send_max_size - SZ_16K;
+}
+
+static int put_data_header(struct send_ctx *sctx, u32 len)
+{
+ struct btrfs_tlv_header *hdr;
+
+ if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
+ return -EOVERFLOW;
+ hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
+ put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
+ sctx->send_size += sizeof(*hdr);
+ return 0;
+}
+
+static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode;
struct page *page;
char *addr;
- struct btrfs_key key;
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
- unsigned pg_offset = offset & ~PAGE_MASK;
- ssize_t ret = 0;
+ unsigned pg_offset = offset_in_page(offset);
+ int ret;
- key.objectid = sctx->cur_ino;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
+ ret = put_data_header(sctx, len);
+ if (ret)
+ return ret;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
-
- if (offset + len > i_size_read(inode)) {
- if (offset > i_size_read(inode))
- len = 0;
- else
- len = offset - i_size_read(inode);
- }
- if (len == 0)
- goto out;
last_index = (offset + len - 1) >> PAGE_SHIFT;
@@ -4967,16 +5017,16 @@
}
addr = kmap(page);
- memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
+ memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset,
+ cur_len);
kunmap(page);
unlock_page(page);
put_page(page);
index++;
pg_offset = 0;
len -= cur_len;
- ret += cur_len;
+ sctx->send_size += cur_len;
}
-out:
iput(inode);
return ret;
}
@@ -4990,20 +5040,12 @@
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- ssize_t num_read = 0;
p = fs_path_alloc();
if (!p)
return -ENOMEM;
btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
-
- num_read = fill_read_buf(sctx, offset, len);
- if (num_read <= 0) {
- if (num_read < 0)
- ret = num_read;
- goto out;
- }
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
@@ -5015,16 +5057,16 @@
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
+ ret = put_file_data(sctx, offset, len);
+ if (ret < 0)
+ goto out;
ret = send_cmd(sctx);
tlv_put_failure:
out:
fs_path_free(p);
- if (ret < 0)
- return ret;
- return num_read;
+ return ret;
}
/*
@@ -5040,8 +5082,8 @@
btrfs_debug(sctx->send_root->fs_info,
"send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
- offset, len, clone_root->root->objectid, clone_root->ino,
- clone_root->offset);
+ offset, len, clone_root->root->root_key.objectid,
+ clone_root->ino, clone_root->offset);
p = fs_path_alloc();
if (!p)
@@ -5136,8 +5178,8 @@
static int send_hole(struct send_ctx *sctx, u64 end)
{
struct fs_path *p = NULL;
+ u64 read_size = max_send_read_size(sctx);
u64 offset = sctx->cur_inode_last_extent;
- u64 len;
int ret = 0;
/*
@@ -5164,16 +5206,19 @@
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
if (ret < 0)
goto tlv_put_failure;
- memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
while (offset < end) {
- len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
+ u64 len = min(end - offset, read_size);
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
break;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
+ ret = put_data_header(sctx, len);
+ if (ret < 0)
+ break;
+ memset(sctx->send_buf + sctx->send_size, 0, len);
+ sctx->send_size += len;
ret = send_cmd(sctx);
if (ret < 0)
break;
@@ -5189,23 +5234,20 @@
const u64 offset,
const u64 len)
{
+ u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, len);
while (sent < len) {
- u64 size = len - sent;
+ u64 size = min(len - sent, read_size);
int ret;
- if (size > BTRFS_SEND_READ_SIZE)
- size = BTRFS_SEND_READ_SIZE;
ret = send_write(sctx, offset + sent, size);
if (ret < 0)
return ret;
- if (!ret)
- break;
- sent += ret;
+ sent += size;
}
return 0;
}
@@ -5278,6 +5320,7 @@
struct btrfs_path *path;
struct btrfs_key key;
int ret;
+ u64 clone_src_i_size = 0;
/*
* Prevent cloning from a zero offset with a length matching the sector
@@ -5301,6 +5344,16 @@
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
+
+ /*
+ * There are inodes that have extents that lie behind its i_size. Don't
+ * accept clones from these extents.
+ */
+ ret = __get_inode_info(clone_root->root, path, clone_root->ino,
+ &clone_src_i_size, NULL, NULL, NULL, NULL, NULL);
+ btrfs_release_path(path);
+ if (ret < 0)
+ goto out;
/*
* We can't send a clone operation for the entire range if we find
@@ -5344,6 +5397,8 @@
u8 type;
u64 ext_len;
u64 clone_len;
+ u64 clone_data_offset;
+ bool crossed_src_i_size = false;
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(clone_root->root, path);
@@ -5397,13 +5452,94 @@
if (key.offset >= clone_root->offset + len)
break;
+ if (key.offset >= clone_src_i_size)
+ break;
+
+ if (key.offset + ext_len > clone_src_i_size) {
+ ext_len = clone_src_i_size - key.offset;
+ crossed_src_i_size = true;
+ }
+
+ clone_data_offset = btrfs_file_extent_offset(leaf, ei);
+ if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
+ clone_root->offset = key.offset;
+ if (clone_data_offset < data_offset &&
+ clone_data_offset + ext_len > data_offset) {
+ u64 extent_offset;
+
+ extent_offset = data_offset - clone_data_offset;
+ ext_len -= extent_offset;
+ clone_data_offset += extent_offset;
+ clone_root->offset += extent_offset;
+ }
+ }
+
clone_len = min_t(u64, ext_len, len);
if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
- btrfs_file_extent_offset(leaf, ei) == data_offset)
- ret = send_clone(sctx, offset, clone_len, clone_root);
- else
+ clone_data_offset == data_offset) {
+ const u64 src_end = clone_root->offset + clone_len;
+ const u64 sectorsize = SZ_64K;
+
+ /*
+ * We can't clone the last block, when its size is not
+ * sector size aligned, into the middle of a file. If we
+ * do so, the receiver will get a failure (-EINVAL) when
+ * trying to clone or will silently corrupt the data in
+ * the destination file if it's on a kernel without the
+ * fix introduced by commit ac765f83f1397646
+ * ("Btrfs: fix data corruption due to cloning of eof
+ * block).
+ *
+ * So issue a clone of the aligned down range plus a
+ * regular write for the eof block, if we hit that case.
+ *
+ * Also, we use the maximum possible sector size, 64K,
+ * because we don't know what's the sector size of the
+ * filesystem that receives the stream, so we have to
+ * assume the largest possible sector size.
+ */
+ if (src_end == clone_src_i_size &&
+ !IS_ALIGNED(src_end, sectorsize) &&
+ offset + clone_len < sctx->cur_inode_size) {
+ u64 slen;
+
+ slen = ALIGN_DOWN(src_end - clone_root->offset,
+ sectorsize);
+ if (slen > 0) {
+ ret = send_clone(sctx, offset, slen,
+ clone_root);
+ if (ret < 0)
+ goto out;
+ }
+ ret = send_extent_data(sctx, offset + slen,
+ clone_len - slen);
+ } else {
+ ret = send_clone(sctx, offset, clone_len,
+ clone_root);
+ }
+ } else if (crossed_src_i_size && clone_len < len) {
+ /*
+ * If we are at i_size of the clone source inode and we
+ * can not clone from it, terminate the loop. This is
+ * to avoid sending two write operations, one with a
+ * length matching clone_len and the final one after
+ * this loop with a length of len - clone_len.
+ *
+ * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
+ * was passed to the send ioctl), this helps avoid
+ * sending an encoded write for an offset that is not
+ * sector size aligned, in case the i_size of the source
+ * inode is not sector size aligned. That will make the
+ * receiver fallback to decompression of the data and
+ * writing it using regular buffered IO, therefore while
+ * not incorrect, it's not optimal due decompression and
+ * possible re-compression at the receiver.
+ */
+ break;
+ } else {
ret = send_extent_data(sctx, offset, clone_len);
+ }
if (ret < 0)
goto out;
@@ -5413,6 +5549,21 @@
break;
offset += clone_len;
clone_root->offset += clone_len;
+
+ /*
+ * If we are cloning from the file we are currently processing,
+ * and using the send root as the clone root, we must stop once
+ * the current clone offset reaches the current eof of the file
+ * at the receiver, otherwise we would issue an invalid clone
+ * operation (source range going beyond eof) and cause the
+ * receiver to fail. So if we reach the current eof, bail out
+ * and fallback to a regular write.
+ */
+ if (clone_root->root == sctx->send_root &&
+ clone_root->ino == sctx->cur_ino &&
+ clone_root->offset >= sctx->cur_inode_next_write_offset)
+ break;
+
data_offset += clone_len;
next:
path->slots[0]++;
@@ -5433,51 +5584,29 @@
struct clone_root *clone_root)
{
int ret = 0;
- struct btrfs_file_extent_item *ei;
u64 offset = key->offset;
- u64 len;
- u8 type;
+ u64 end;
u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], ei);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
- /*
- * it is possible the inline item won't cover the whole page,
- * but there may be items after this page. Make
- * sure to send the whole thing
- */
- len = PAGE_ALIGN(len);
- } else {
- len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
- }
+ end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
+ if (offset >= end)
+ return 0;
- if (offset >= sctx->cur_inode_size) {
- ret = 0;
- goto out;
- }
- if (offset + len > sctx->cur_inode_size)
- len = sctx->cur_inode_size - offset;
- if (len == 0) {
- ret = 0;
- goto out;
- }
-
- if (clone_root && IS_ALIGNED(offset + len, bs)) {
+ if (clone_root && IS_ALIGNED(end, bs)) {
+ struct btrfs_file_extent_item *ei;
u64 disk_byte;
u64 data_offset;
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
ret = clone_range(sctx, clone_root, disk_byte, data_offset,
- offset, len);
+ offset, end - offset);
} else {
- ret = send_extent_data(sctx, offset, len);
+ ret = send_extent_data(sctx, offset, end - offset);
}
- sctx->cur_inode_next_write_offset = offset + len;
-out:
+ sctx->cur_inode_next_write_offset = end;
return ret;
}
@@ -5675,10 +5804,7 @@
{
struct btrfs_path *path;
struct btrfs_root *root = sctx->send_root;
- struct btrfs_file_extent_item *fi;
struct btrfs_key key;
- u64 extent_end;
- u8 type;
int ret;
path = alloc_path_for_send();
@@ -5698,18 +5824,7 @@
if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
goto out;
- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], fi);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi);
- extent_end = ALIGN(key.offset + size,
- sctx->send_root->fs_info->sectorsize);
- } else {
- extent_end = key.offset +
- btrfs_file_extent_num_bytes(path->nodes[0], fi);
- }
- sctx->cur_inode_last_extent = extent_end;
+ sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
out:
btrfs_free_path(path);
return ret;
@@ -5763,16 +5878,7 @@
break;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE) {
- u64 size = btrfs_file_extent_ram_bytes(leaf, fi);
-
- extent_end = ALIGN(key.offset + size,
- root->fs_info->sectorsize);
- } else {
- extent_end = key.offset +
- btrfs_file_extent_num_bytes(leaf, fi);
- }
+ extent_end = btrfs_file_extent_end(path);
if (extent_end <= start)
goto next;
if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
@@ -5793,9 +5899,6 @@
static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
struct btrfs_key *key)
{
- struct btrfs_file_extent_item *fi;
- u64 extent_end;
- u8 type;
int ret = 0;
if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
@@ -5805,18 +5908,6 @@
ret = get_last_extent(sctx, key->offset - 1);
if (ret)
return ret;
- }
-
- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], fi);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi);
- extent_end = ALIGN(key->offset + size,
- sctx->send_root->fs_info->sectorsize);
- } else {
- extent_end = key->offset +
- btrfs_file_extent_num_bytes(path->nodes[0], fi);
}
if (path->slots[0] == 0 &&
@@ -5844,7 +5935,7 @@
else
ret = 0;
}
- sctx->cur_inode_last_extent = extent_end;
+ sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
return ret;
}
@@ -6654,6 +6745,365 @@
return ret;
}
+static int tree_move_down(struct btrfs_path *path, int *level)
+{
+ struct extent_buffer *eb;
+
+ BUG_ON(*level == 0);
+ eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]);
+ if (IS_ERR(eb))
+ return PTR_ERR(eb);
+
+ path->nodes[*level - 1] = eb;
+ path->slots[*level - 1] = 0;
+ (*level)--;
+ return 0;
+}
+
+static int tree_move_next_or_upnext(struct btrfs_path *path,
+ int *level, int root_level)
+{
+ int ret = 0;
+ int nritems;
+ nritems = btrfs_header_nritems(path->nodes[*level]);
+
+ path->slots[*level]++;
+
+ while (path->slots[*level] >= nritems) {
+ if (*level == root_level)
+ return -1;
+
+ /* move upnext */
+ path->slots[*level] = 0;
+ free_extent_buffer(path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ (*level)++;
+ path->slots[*level]++;
+
+ nritems = btrfs_header_nritems(path->nodes[*level]);
+ ret = 1;
+ }
+ return ret;
+}
+
+/*
+ * Returns 1 if it had to move up and next. 0 is returned if it moved only next
+ * or down.
+ */
+static int tree_advance(struct btrfs_path *path,
+ int *level, int root_level,
+ int allow_down,
+ struct btrfs_key *key)
+{
+ int ret;
+
+ if (*level == 0 || !allow_down) {
+ ret = tree_move_next_or_upnext(path, level, root_level);
+ } else {
+ ret = tree_move_down(path, level);
+ }
+ if (ret >= 0) {
+ if (*level == 0)
+ btrfs_item_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+ else
+ btrfs_node_key_to_cpu(path->nodes[*level], key,
+ path->slots[*level]);
+ }
+ return ret;
+}
+
+static int tree_compare_item(struct btrfs_path *left_path,
+ struct btrfs_path *right_path,
+ char *tmp_buf)
+{
+ int cmp;
+ int len1, len2;
+ unsigned long off1, off2;
+
+ len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
+ len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
+ if (len1 != len2)
+ return 1;
+
+ off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
+ off2 = btrfs_item_ptr_offset(right_path->nodes[0],
+ right_path->slots[0]);
+
+ read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
+
+ cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
+ if (cmp)
+ return 1;
+ return 0;
+}
+
+/*
+ * This function compares two trees and calls the provided callback for
+ * every changed/new/deleted item it finds.
+ * If shared tree blocks are encountered, whole subtrees are skipped, making
+ * the compare pretty fast on snapshotted subvolumes.
+ *
+ * This currently works on commit roots only. As commit roots are read only,
+ * we don't do any locking. The commit roots are protected with transactions.
+ * Transactions are ended and rejoined when a commit is tried in between.
+ *
+ * This function checks for modifications done to the trees while comparing.
+ * If it detects a change, it aborts immediately.
+ */
+static int btrfs_compare_trees(struct btrfs_root *left_root,
+ struct btrfs_root *right_root, void *ctx)
+{
+ struct btrfs_fs_info *fs_info = left_root->fs_info;
+ int ret;
+ int cmp;
+ struct btrfs_path *left_path = NULL;
+ struct btrfs_path *right_path = NULL;
+ struct btrfs_key left_key;
+ struct btrfs_key right_key;
+ char *tmp_buf = NULL;
+ int left_root_level;
+ int right_root_level;
+ int left_level;
+ int right_level;
+ int left_end_reached;
+ int right_end_reached;
+ int advance_left;
+ int advance_right;
+ u64 left_blockptr;
+ u64 right_blockptr;
+ u64 left_gen;
+ u64 right_gen;
+
+ left_path = btrfs_alloc_path();
+ if (!left_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ right_path = btrfs_alloc_path();
+ if (!right_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
+ if (!tmp_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ left_path->search_commit_root = 1;
+ left_path->skip_locking = 1;
+ right_path->search_commit_root = 1;
+ right_path->skip_locking = 1;
+
+ /*
+ * Strategy: Go to the first items of both trees. Then do
+ *
+ * If both trees are at level 0
+ * Compare keys of current items
+ * If left < right treat left item as new, advance left tree
+ * and repeat
+ * If left > right treat right item as deleted, advance right tree
+ * and repeat
+ * If left == right do deep compare of items, treat as changed if
+ * needed, advance both trees and repeat
+ * If both trees are at the same level but not at level 0
+ * Compare keys of current nodes/leafs
+ * If left < right advance left tree and repeat
+ * If left > right advance right tree and repeat
+ * If left == right compare blockptrs of the next nodes/leafs
+ * If they match advance both trees but stay at the same level
+ * and repeat
+ * If they don't match advance both trees while allowing to go
+ * deeper and repeat
+ * If tree levels are different
+ * Advance the tree that needs it and repeat
+ *
+ * Advancing a tree means:
+ * If we are at level 0, try to go to the next slot. If that's not
+ * possible, go one level up and repeat. Stop when we found a level
+ * where we could go to the next slot. We may at this point be on a
+ * node or a leaf.
+ *
+ * If we are not at level 0 and not on shared tree blocks, go one
+ * level deeper.
+ *
+ * If we are not at level 0 and on shared tree blocks, go one slot to
+ * the right if possible or go up and right.
+ */
+
+ down_read(&fs_info->commit_root_sem);
+ left_level = btrfs_header_level(left_root->commit_root);
+ left_root_level = left_level;
+ left_path->nodes[left_level] =
+ btrfs_clone_extent_buffer(left_root->commit_root);
+ if (!left_path->nodes[left_level]) {
+ up_read(&fs_info->commit_root_sem);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ right_level = btrfs_header_level(right_root->commit_root);
+ right_root_level = right_level;
+ right_path->nodes[right_level] =
+ btrfs_clone_extent_buffer(right_root->commit_root);
+ if (!right_path->nodes[right_level]) {
+ up_read(&fs_info->commit_root_sem);
+ ret = -ENOMEM;
+ goto out;
+ }
+ up_read(&fs_info->commit_root_sem);
+
+ if (left_level == 0)
+ btrfs_item_key_to_cpu(left_path->nodes[left_level],
+ &left_key, left_path->slots[left_level]);
+ else
+ btrfs_node_key_to_cpu(left_path->nodes[left_level],
+ &left_key, left_path->slots[left_level]);
+ if (right_level == 0)
+ btrfs_item_key_to_cpu(right_path->nodes[right_level],
+ &right_key, right_path->slots[right_level]);
+ else
+ btrfs_node_key_to_cpu(right_path->nodes[right_level],
+ &right_key, right_path->slots[right_level]);
+
+ left_end_reached = right_end_reached = 0;
+ advance_left = advance_right = 0;
+
+ while (1) {
+ cond_resched();
+ if (advance_left && !left_end_reached) {
+ ret = tree_advance(left_path, &left_level,
+ left_root_level,
+ advance_left != ADVANCE_ONLY_NEXT,
+ &left_key);
+ if (ret == -1)
+ left_end_reached = ADVANCE;
+ else if (ret < 0)
+ goto out;
+ advance_left = 0;
+ }
+ if (advance_right && !right_end_reached) {
+ ret = tree_advance(right_path, &right_level,
+ right_root_level,
+ advance_right != ADVANCE_ONLY_NEXT,
+ &right_key);
+ if (ret == -1)
+ right_end_reached = ADVANCE;
+ else if (ret < 0)
+ goto out;
+ advance_right = 0;
+ }
+
+ if (left_end_reached && right_end_reached) {
+ ret = 0;
+ goto out;
+ } else if (left_end_reached) {
+ if (right_level == 0) {
+ ret = changed_cb(left_path, right_path,
+ &right_key,
+ BTRFS_COMPARE_TREE_DELETED,
+ ctx);
+ if (ret < 0)
+ goto out;
+ }
+ advance_right = ADVANCE;
+ continue;
+ } else if (right_end_reached) {
+ if (left_level == 0) {
+ ret = changed_cb(left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_NEW,
+ ctx);
+ if (ret < 0)
+ goto out;
+ }
+ advance_left = ADVANCE;
+ continue;
+ }
+
+ if (left_level == 0 && right_level == 0) {
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+ if (cmp < 0) {
+ ret = changed_cb(left_path, right_path,
+ &left_key,
+ BTRFS_COMPARE_TREE_NEW,
+ ctx);
+ if (ret < 0)
+ goto out;
+ advance_left = ADVANCE;
+ } else if (cmp > 0) {
+ ret = changed_cb(left_path, right_path,
+ &right_key,
+ BTRFS_COMPARE_TREE_DELETED,
+ ctx);
+ if (ret < 0)
+ goto out;
+ advance_right = ADVANCE;
+ } else {
+ enum btrfs_compare_tree_result result;
+
+ WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
+ ret = tree_compare_item(left_path, right_path,
+ tmp_buf);
+ if (ret)
+ result = BTRFS_COMPARE_TREE_CHANGED;
+ else
+ result = BTRFS_COMPARE_TREE_SAME;
+ ret = changed_cb(left_path, right_path,
+ &left_key, result, ctx);
+ if (ret < 0)
+ goto out;
+ advance_left = ADVANCE;
+ advance_right = ADVANCE;
+ }
+ } else if (left_level == right_level) {
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+ if (cmp < 0) {
+ advance_left = ADVANCE;
+ } else if (cmp > 0) {
+ advance_right = ADVANCE;
+ } else {
+ left_blockptr = btrfs_node_blockptr(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_blockptr = btrfs_node_blockptr(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ left_gen = btrfs_node_ptr_generation(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_gen = btrfs_node_ptr_generation(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ if (left_blockptr == right_blockptr &&
+ left_gen == right_gen) {
+ /*
+ * As we're on a shared block, don't
+ * allow to go deeper.
+ */
+ advance_left = ADVANCE_ONLY_NEXT;
+ advance_right = ADVANCE_ONLY_NEXT;
+ } else {
+ advance_left = ADVANCE;
+ advance_right = ADVANCE;
+ }
+ }
+ } else if (left_level < right_level) {
+ advance_right = ADVANCE;
+ } else {
+ advance_left = ADVANCE;
+ }
+ }
+
+out:
+ btrfs_free_path(left_path);
+ btrfs_free_path(right_path);
+ kvfree(tmp_buf);
+ return ret;
+}
+
static int send_subvol(struct send_ctx *sctx)
{
int ret;
@@ -6669,8 +7119,7 @@
goto out;
if (sctx->parent_root) {
- ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
- changed_cb, sctx);
+ ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
if (ret < 0)
goto out;
ret = finish_inode_if_needed(sctx, 1);
@@ -6779,20 +7228,25 @@
spin_unlock(&root->root_item_lock);
}
+static void dedupe_in_progress_warn(const struct btrfs_root *root)
+{
+ btrfs_warn_rl(root->fs_info,
+"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
+ root->root_key.objectid, root->dedupe_in_progress);
+}
+
long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
{
int ret = 0;
struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root;
struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
- struct btrfs_key key;
struct send_ctx *sctx = NULL;
u32 i;
u64 *clone_sources_tmp = NULL;
int clone_sources_to_rollback = 0;
- unsigned alloc_size;
+ size_t alloc_size;
int sort_clone_roots = 0;
- int index;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -6802,6 +7256,11 @@
* making it RW. This also protects against deletion.
*/
spin_lock(&send_root->root_item_lock);
+ if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
+ dedupe_in_progress_warn(send_root);
+ spin_unlock(&send_root->root_item_lock);
+ return -EAGAIN;
+ }
send_root->send_in_progress++;
spin_unlock(&send_root->root_item_lock);
@@ -6822,13 +7281,6 @@
if (arg->clone_sources_count >
ULONG_MAX / sizeof(struct clone_root) - 1) {
ret = -EINVAL;
- goto out;
- }
-
- if (!access_ok(VERIFY_READ, arg->clone_sources,
- sizeof(*arg->clone_sources) *
- arg->clone_sources_count)) {
- ret = -EFAULT;
goto out;
}
@@ -6875,25 +7327,20 @@
goto out;
}
- sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL);
- if (!sctx->read_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
sctx->orphan_dirs = RB_ROOT;
- alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
-
- sctx->clone_roots = kvzalloc(alloc_size, GFP_KERNEL);
+ sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
+ arg->clone_sources_count + 1,
+ GFP_KERNEL);
if (!sctx->clone_roots) {
ret = -ENOMEM;
goto out;
}
- alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
+ alloc_size = array_size(sizeof(*arg->clone_sources),
+ arg->clone_sources_count);
if (arg->clone_sources_count) {
clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
@@ -6910,15 +7357,9 @@
}
for (i = 0; i < arg->clone_sources_count; i++) {
- key.objectid = clone_sources_tmp[i];
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ clone_root = btrfs_get_fs_root(fs_info,
+ clone_sources_tmp[i], true);
if (IS_ERR(clone_root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(clone_root);
goto out;
}
@@ -6926,13 +7367,19 @@
if (!btrfs_root_readonly(clone_root) ||
btrfs_root_dead(clone_root)) {
spin_unlock(&clone_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ btrfs_put_root(clone_root);
ret = -EPERM;
+ goto out;
+ }
+ if (clone_root->dedupe_in_progress) {
+ dedupe_in_progress_warn(clone_root);
+ spin_unlock(&clone_root->root_item_lock);
+ btrfs_put_root(clone_root);
+ ret = -EAGAIN;
goto out;
}
clone_root->send_in_progress++;
spin_unlock(&clone_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
sctx->clone_roots[i].root = clone_root;
clone_sources_to_rollback = i + 1;
@@ -6942,15 +7389,9 @@
}
if (arg->parent_root) {
- key.objectid = arg->parent_root;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root,
+ true);
if (IS_ERR(sctx->parent_root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(sctx->parent_root);
goto out;
}
@@ -6960,13 +7401,16 @@
if (!btrfs_root_readonly(sctx->parent_root) ||
btrfs_root_dead(sctx->parent_root)) {
spin_unlock(&sctx->parent_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -EPERM;
goto out;
}
+ if (sctx->parent_root->dedupe_in_progress) {
+ dedupe_in_progress_warn(sctx->parent_root);
+ spin_unlock(&sctx->parent_root->root_item_lock);
+ ret = -EAGAIN;
+ goto out;
+ }
spin_unlock(&sctx->parent_root->root_item_lock);
-
- srcu_read_unlock(&fs_info->subvol_srcu, index);
}
/*
@@ -6974,7 +7418,8 @@
* is behind the current send position. This is checked while searching
* for possible clone sources.
*/
- sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
+ sctx->clone_roots[sctx->clone_roots_cnt++].root =
+ btrfs_grab_root(sctx->send_root);
/* We do a bsearch later */
sort(sctx->clone_roots, sctx->clone_roots_cnt,
@@ -6990,9 +7435,23 @@
if (ret)
goto out;
+ mutex_lock(&fs_info->balance_mutex);
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ btrfs_warn_rl(fs_info,
+ "cannot run send because a balance operation is in progress");
+ ret = -EAGAIN;
+ goto out;
+ }
+ fs_info->send_in_progress++;
+ mutex_unlock(&fs_info->balance_mutex);
+
current->journal_info = BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
current->journal_info = NULL;
+ mutex_lock(&fs_info->balance_mutex);
+ fs_info->send_in_progress--;
+ mutex_unlock(&fs_info->balance_mutex);
if (ret < 0)
goto out;
@@ -7045,18 +7504,24 @@
}
if (sort_clone_roots) {
- for (i = 0; i < sctx->clone_roots_cnt; i++)
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
btrfs_root_dec_send_in_progress(
sctx->clone_roots[i].root);
+ btrfs_put_root(sctx->clone_roots[i].root);
+ }
} else {
- for (i = 0; sctx && i < clone_sources_to_rollback; i++)
+ for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
btrfs_root_dec_send_in_progress(
sctx->clone_roots[i].root);
+ btrfs_put_root(sctx->clone_roots[i].root);
+ }
btrfs_root_dec_send_in_progress(send_root);
}
- if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
+ if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
btrfs_root_dec_send_in_progress(sctx->parent_root);
+ btrfs_put_root(sctx->parent_root);
+ }
kvfree(clone_sources_tmp);
@@ -7066,7 +7531,6 @@
kvfree(sctx->clone_roots);
kvfree(sctx->send_buf);
- kvfree(sctx->read_buf);
name_cache_free(sctx);
--
Gitblit v1.6.2