From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/fs/btrfs/send.c | 980 ++++++++++++++++++++++++++++++++++++++++++--------------- 1 files changed, 722 insertions(+), 258 deletions(-) diff --git a/kernel/fs/btrfs/send.c b/kernel/fs/btrfs/send.c index eb2f8e8..b081b61 100644 --- a/kernel/fs/btrfs/send.c +++ b/kernel/fs/btrfs/send.c @@ -122,8 +122,6 @@ struct file_ra_state ra; - char *read_buf; - /* * We process inodes by their increasing order, so if before an * incremental send we reverse the parent/child relationship of @@ -268,6 +266,16 @@ int need_later_update; int name_len; char name[]; +}; + +#define ADVANCE 1 +#define ADVANCE_ONLY_NEXT -1 + +enum btrfs_compare_tree_result { + BTRFS_COMPARE_TREE_NEW, + BTRFS_COMPARE_TREE_DELETED, + BTRFS_COMPARE_TREE_CHANGED, + BTRFS_COMPARE_TREE_SAME, }; __cold @@ -570,8 +578,8 @@ return -EOVERFLOW; hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size); - hdr->tlv_type = cpu_to_le16(attr); - hdr->tlv_len = cpu_to_le16(len); + put_unaligned_le16(attr, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); memcpy(hdr + 1, data, len); sctx->send_size += total_len; @@ -681,7 +689,7 @@ sctx->send_size += sizeof(*hdr); hdr = (struct btrfs_cmd_header *)sctx->send_buf; - hdr->cmd = cpu_to_le16(cmd); + put_unaligned_le16(cmd, &hdr->cmd); return 0; } @@ -693,17 +701,17 @@ u32 crc; hdr = (struct btrfs_cmd_header *)sctx->send_buf; - hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); - hdr->crc = 0; + put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); + put_unaligned_le32(0, &hdr->crc); - crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); - hdr->crc = cpu_to_le32(crc); + crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + put_unaligned_le32(crc, &hdr->crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, &sctx->send_off); sctx->total_send_size += sctx->send_size; - sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; + sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size; sctx->send_size = 0; return ret; @@ -1170,7 +1178,6 @@ struct backref_ctx { struct send_ctx *sctx; - struct btrfs_path *path; /* number of total found references */ u64 found; @@ -1196,9 +1203,9 @@ u64 root = (u64)(uintptr_t)key; struct clone_root *cr = (struct clone_root *)elt; - if (root < cr->root->objectid) + if (root < cr->root->root_key.objectid) return -1; - if (root > cr->root->objectid) + if (root > cr->root->root_key.objectid) return 1; return 0; } @@ -1208,9 +1215,9 @@ struct clone_root *cr1 = (struct clone_root *)e1; struct clone_root *cr2 = (struct clone_root *)e2; - if (cr1->root->objectid < cr2->root->objectid) + if (cr1->root->root_key.objectid < cr2->root->root_key.objectid) return -1; - if (cr1->root->objectid > cr2->root->objectid) + if (cr1->root->root_key.objectid > cr2->root->root_key.objectid) return 1; return 0; } @@ -1223,8 +1230,6 @@ { struct backref_ctx *bctx = ctx_; struct clone_root *found; - int ret; - u64 i_size; /* First check if the root is in the list of accepted clone sources */ found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots, @@ -1241,30 +1246,26 @@ } /* - * There are inodes that have extents that lie behind its i_size. Don't - * accept clones from these extents. 
- */ - ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL, - NULL, NULL, NULL); - btrfs_release_path(bctx->path); - if (ret < 0) - return ret; - - if (offset + bctx->data_offset + bctx->extent_len > i_size) - return 0; - - /* * Make sure we don't consider clones from send_root that are * behind the current inode/offset. */ if (found->root == bctx->sctx->send_root) { /* - * TODO for the moment we don't accept clones from the inode - * that is currently send. We may change this when - * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same - * file. + * If the source inode was not yet processed we can't issue a + * clone operation, as the source extent does not exist yet at + * the destination of the stream. */ - if (ino >= bctx->cur_objectid) + if (ino > bctx->cur_objectid) + return 0; + /* + * We clone from the inode currently being sent as long as the + * source extent is already processed, otherwise we could try + * to clone from an extent that does not exist yet at the + * destination of the stream. + */ + if (ino == bctx->cur_objectid && + offset + bctx->extent_len > + bctx->sctx->cur_inode_next_write_offset) return 0; } @@ -1329,8 +1330,6 @@ ret = -ENOMEM; goto out; } - - backref_ctx->path = tmp_path; if (data_offset >= ino_size) { /* @@ -1718,12 +1717,8 @@ di = btrfs_lookup_dir_item(NULL, root, path, dir, name, name_len, 0); - if (!di) { - ret = -ENOENT; - goto out; - } - if (IS_ERR(di)) { - ret = PTR_ERR(di); + if (IS_ERR_OR_NULL(di)) { + ret = di ? PTR_ERR(di) : -ENOENT; goto out; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); @@ -2267,7 +2262,7 @@ * inodes "orphan" name instead of the real name and stop. Same with new inodes * that were not created yet and overwritten inodes/refs. * - * When do we have have orphan inodes: + * When do we have orphan inodes: * 1. When an inode is freshly created and thus no valid refs are available yet * 2. When a directory lost all it's refs (deleted) but still has dir items * inside which were not processed yet (pending for move/delete). If anyone @@ -2371,7 +2366,7 @@ return -ENOMEM; } - key.objectid = send_root->objectid; + key.objectid = send_root->root_key.objectid; key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = 0; @@ -2387,7 +2382,7 @@ leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type != BTRFS_ROOT_BACKREF_KEY || - key.objectid != send_root->objectid) { + key.objectid != send_root->root_key.objectid) { ret = -ENOENT; goto out; } @@ -3957,52 +3952,56 @@ goto out; } + /* + * Before doing any rename and link operations, do a first pass on the + * new references to orphanize any unprocessed inodes that may have a + * reference that conflicts with one of the new references of the current + * inode. This needs to happen first because a new reference may conflict + * with the old reference of a parent directory, so we must make sure + * that the path used for link and rename commands don't use an + * orphanized name when an ancestor was not yet orphanized. + * + * Example: + * + * Parent snapshot: + * + * . (ino 256) + * |----- testdir/ (ino 259) + * | |----- a (ino 257) + * | + * |----- b (ino 258) + * + * Send snapshot: + * + * . (ino 256) + * |----- testdir_2/ (ino 259) + * | |----- a (ino 260) + * | + * |----- testdir (ino 257) + * |----- b (ino 257) + * |----- b2 (ino 258) + * + * Processing the new reference for inode 257 with name "b" may happen + * before processing the new reference with name "testdir". 
If so, we + * must make sure that by the time we send a link command to create the + * hard link "b", inode 259 was already orphanized, since the generated + * path in "valid_path" already contains the orphanized name for 259. + * We are processing inode 257, so only later when processing 259 we do + * the rename operation to change its temporary (orphanized) name to + * "testdir_2". + */ list_for_each_entry(cur, &sctx->new_refs, list) { - /* - * We may have refs where the parent directory does not exist - * yet. This happens if the parent directories inum is higher - * the the current inum. To handle this case, we create the - * parent directory out of order. But we need to check if this - * did already happen before due to other refs in the same dir. - */ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; - if (ret == inode_state_will_create) { - ret = 0; - /* - * First check if any of the current inodes refs did - * already create the dir. - */ - list_for_each_entry(cur2, &sctx->new_refs, list) { - if (cur == cur2) - break; - if (cur2->dir == cur->dir) { - ret = 1; - break; - } - } - - /* - * If that did not happen, check if a previous inode - * did already create the dir. - */ - if (!ret) - ret = did_create_dir(sctx, cur->dir); - if (ret < 0) - goto out; - if (!ret) { - ret = send_create_inode(sctx, cur->dir); - if (ret < 0) - goto out; - } - } + if (ret == inode_state_will_create) + continue; /* - * Check if this new ref would overwrite the first ref of - * another unprocessed inode. If yes, orphanize the - * overwritten inode. If we find an overwritten ref that is - * not the first ref, simply unlink it. + * Check if this new ref would overwrite the first ref of another + * unprocessed inode. If yes, orphanize the overwritten inode. + * If we find an overwritten ref that is not the first ref, + * simply unlink it. */ ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, cur->name, cur->name_len, @@ -4093,6 +4092,49 @@ goto out; } ret = send_unlink(sctx, cur->full_path); + if (ret < 0) + goto out; + } + } + + } + + list_for_each_entry(cur, &sctx->new_refs, list) { + /* + * We may have refs where the parent directory does not exist + * yet. This happens if the parent directories inum is higher + * than the current inum. To handle this case, we create the + * parent directory out of order. But we need to check if this + * did already happen before due to other refs in the same dir. + */ + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) { + ret = 0; + /* + * First check if any of the current inodes refs did + * already create the dir. + */ + list_for_each_entry(cur2, &sctx->new_refs, list) { + if (cur == cur2) + break; + if (cur2->dir == cur->dir) { + ret = 1; + break; + } + } + + /* + * If that did not happen, check if a previous inode + * did already create the dir. 
+ */ + if (!ret) + ret = did_create_dir(sctx, cur->dir); + if (ret < 0) + goto out; + if (!ret) { + ret = send_create_inode(sctx, cur->dir); if (ret < 0) goto out; } @@ -4893,35 +4935,43 @@ return ret; } -static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +static inline u64 max_send_read_size(const struct send_ctx *sctx) +{ + return sctx->send_max_size - SZ_16K; +} + +static int put_data_header(struct send_ctx *sctx, u32 len) +{ + struct btrfs_tlv_header *hdr; + + if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + return -EOVERFLOW; + hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); + put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); + sctx->send_size += sizeof(*hdr); + return 0; +} + +static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode; struct page *page; char *addr; - struct btrfs_key key; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; - unsigned pg_offset = offset & ~PAGE_MASK; - ssize_t ret = 0; + unsigned pg_offset = offset_in_page(offset); + int ret; - key.objectid = sctx->cur_ino; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; + ret = put_data_header(sctx, len); + if (ret) + return ret; - inode = btrfs_iget(fs_info->sb, &key, root, NULL); + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); - - if (offset + len > i_size_read(inode)) { - if (offset > i_size_read(inode)) - len = 0; - else - len = offset - i_size_read(inode); - } - if (len == 0) - goto out; last_index = (offset + len - 1) >> PAGE_SHIFT; @@ -4967,16 +5017,16 @@ } addr = kmap(page); - memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); + memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset, + cur_len); kunmap(page); unlock_page(page); put_page(page); index++; pg_offset = 0; len -= cur_len; - ret += cur_len; + sctx->send_size += cur_len; } -out: iput(inode); return ret; } @@ -4990,20 +5040,12 @@ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - ssize_t num_read = 0; p = fs_path_alloc(); if (!p) return -ENOMEM; btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); - - num_read = fill_read_buf(sctx, offset, len); - if (num_read <= 0) { - if (num_read < 0) - ret = num_read; - goto out; - } ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) @@ -5015,16 +5057,16 @@ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read); + ret = put_file_data(sctx, offset, len); + if (ret < 0) + goto out; ret = send_cmd(sctx); tlv_put_failure: out: fs_path_free(p); - if (ret < 0) - return ret; - return num_read; + return ret; } /* @@ -5040,8 +5082,8 @@ btrfs_debug(sctx->send_root->fs_info, "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu", - offset, len, clone_root->root->objectid, clone_root->ino, - clone_root->offset); + offset, len, clone_root->root->root_key.objectid, + clone_root->ino, clone_root->offset); p = fs_path_alloc(); if (!p) @@ -5136,8 +5178,8 @@ static int send_hole(struct send_ctx *sctx, u64 end) { struct fs_path *p = NULL; + u64 read_size = max_send_read_size(sctx); u64 offset = sctx->cur_inode_last_extent; - u64 len; int ret = 0; /* @@ -5164,16 +5206,19 @@ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); if (ret < 
0) goto tlv_put_failure; - memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); while (offset < end) { - len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + u64 len = min(end - offset, read_size); ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) break; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = put_data_header(sctx, len); + if (ret < 0) + break; + memset(sctx->send_buf + sctx->send_size, 0, len); + sctx->send_size += len; ret = send_cmd(sctx); if (ret < 0) break; @@ -5189,23 +5234,20 @@ const u64 offset, const u64 len) { + u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); while (sent < len) { - u64 size = len - sent; + u64 size = min(len - sent, read_size); int ret; - if (size > BTRFS_SEND_READ_SIZE) - size = BTRFS_SEND_READ_SIZE; ret = send_write(sctx, offset + sent, size); if (ret < 0) return ret; - if (!ret) - break; - sent += ret; + sent += size; } return 0; } @@ -5278,6 +5320,7 @@ struct btrfs_path *path; struct btrfs_key key; int ret; + u64 clone_src_i_size = 0; /* * Prevent cloning from a zero offset with a length matching the sector @@ -5301,6 +5344,16 @@ path = alloc_path_for_send(); if (!path) return -ENOMEM; + + /* + * There are inodes that have extents that lie behind its i_size. Don't + * accept clones from these extents. + */ + ret = __get_inode_info(clone_root->root, path, clone_root->ino, + &clone_src_i_size, NULL, NULL, NULL, NULL, NULL); + btrfs_release_path(path); + if (ret < 0) + goto out; /* * We can't send a clone operation for the entire range if we find @@ -5344,6 +5397,8 @@ u8 type; u64 ext_len; u64 clone_len; + u64 clone_data_offset; + bool crossed_src_i_size = false; if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(clone_root->root, path); @@ -5397,13 +5452,94 @@ if (key.offset >= clone_root->offset + len) break; + if (key.offset >= clone_src_i_size) + break; + + if (key.offset + ext_len > clone_src_i_size) { + ext_len = clone_src_i_size - key.offset; + crossed_src_i_size = true; + } + + clone_data_offset = btrfs_file_extent_offset(leaf, ei); + if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) { + clone_root->offset = key.offset; + if (clone_data_offset < data_offset && + clone_data_offset + ext_len > data_offset) { + u64 extent_offset; + + extent_offset = data_offset - clone_data_offset; + ext_len -= extent_offset; + clone_data_offset += extent_offset; + clone_root->offset += extent_offset; + } + } + clone_len = min_t(u64, ext_len, len); if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte && - btrfs_file_extent_offset(leaf, ei) == data_offset) - ret = send_clone(sctx, offset, clone_len, clone_root); - else + clone_data_offset == data_offset) { + const u64 src_end = clone_root->offset + clone_len; + const u64 sectorsize = SZ_64K; + + /* + * We can't clone the last block, when its size is not + * sector size aligned, into the middle of a file. If we + * do so, the receiver will get a failure (-EINVAL) when + * trying to clone or will silently corrupt the data in + * the destination file if it's on a kernel without the + * fix introduced by commit ac765f83f1397646 + * ("Btrfs: fix data corruption due to cloning of eof + * block). + * + * So issue a clone of the aligned down range plus a + * regular write for the eof block, if we hit that case. 
+ * + * Also, we use the maximum possible sector size, 64K, + * because we don't know what's the sector size of the + * filesystem that receives the stream, so we have to + * assume the largest possible sector size. + */ + if (src_end == clone_src_i_size && + !IS_ALIGNED(src_end, sectorsize) && + offset + clone_len < sctx->cur_inode_size) { + u64 slen; + + slen = ALIGN_DOWN(src_end - clone_root->offset, + sectorsize); + if (slen > 0) { + ret = send_clone(sctx, offset, slen, + clone_root); + if (ret < 0) + goto out; + } + ret = send_extent_data(sctx, offset + slen, + clone_len - slen); + } else { + ret = send_clone(sctx, offset, clone_len, + clone_root); + } + } else if (crossed_src_i_size && clone_len < len) { + /* + * If we are at i_size of the clone source inode and we + * can not clone from it, terminate the loop. This is + * to avoid sending two write operations, one with a + * length matching clone_len and the final one after + * this loop with a length of len - clone_len. + * + * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED + * was passed to the send ioctl), this helps avoid + * sending an encoded write for an offset that is not + * sector size aligned, in case the i_size of the source + * inode is not sector size aligned. That will make the + * receiver fallback to decompression of the data and + * writing it using regular buffered IO, therefore while + * not incorrect, it's not optimal due decompression and + * possible re-compression at the receiver. + */ + break; + } else { ret = send_extent_data(sctx, offset, clone_len); + } if (ret < 0) goto out; @@ -5413,6 +5549,21 @@ break; offset += clone_len; clone_root->offset += clone_len; + + /* + * If we are cloning from the file we are currently processing, + * and using the send root as the clone root, we must stop once + * the current clone offset reaches the current eof of the file + * at the receiver, otherwise we would issue an invalid clone + * operation (source range going beyond eof) and cause the + * receiver to fail. So if we reach the current eof, bail out + * and fallback to a regular write. + */ + if (clone_root->root == sctx->send_root && + clone_root->ino == sctx->cur_ino && + clone_root->offset >= sctx->cur_inode_next_write_offset) + break; + data_offset += clone_len; next: path->slots[0]++; @@ -5433,51 +5584,29 @@ struct clone_root *clone_root) { int ret = 0; - struct btrfs_file_extent_item *ei; u64 offset = key->offset; - u64 len; - u8 type; + u64 end; u64 bs = sctx->send_root->fs_info->sb->s_blocksize; - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], ei); - if (type == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); - /* - * it is possible the inline item won't cover the whole page, - * but there may be items after this page. 
Make - * sure to send the whole thing - */ - len = PAGE_ALIGN(len); - } else { - len = btrfs_file_extent_num_bytes(path->nodes[0], ei); - } + end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); + if (offset >= end) + return 0; - if (offset >= sctx->cur_inode_size) { - ret = 0; - goto out; - } - if (offset + len > sctx->cur_inode_size) - len = sctx->cur_inode_size - offset; - if (len == 0) { - ret = 0; - goto out; - } - - if (clone_root && IS_ALIGNED(offset + len, bs)) { + if (clone_root && IS_ALIGNED(end, bs)) { + struct btrfs_file_extent_item *ei; u64 disk_byte; u64 data_offset; + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); data_offset = btrfs_file_extent_offset(path->nodes[0], ei); ret = clone_range(sctx, clone_root, disk_byte, data_offset, - offset, len); + offset, end - offset); } else { - ret = send_extent_data(sctx, offset, len); + ret = send_extent_data(sctx, offset, end - offset); } - sctx->cur_inode_next_write_offset = offset + len; -out: + sctx->cur_inode_next_write_offset = end; return ret; } @@ -5675,10 +5804,7 @@ { struct btrfs_path *path; struct btrfs_root *root = sctx->send_root; - struct btrfs_file_extent_item *fi; struct btrfs_key key; - u64 extent_end; - u8 type; int ret; path = alloc_path_for_send(); @@ -5698,18 +5824,7 @@ if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) goto out; - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], fi); - if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi); - extent_end = ALIGN(key.offset + size, - sctx->send_root->fs_info->sectorsize); - } else { - extent_end = key.offset + - btrfs_file_extent_num_bytes(path->nodes[0], fi); - } - sctx->cur_inode_last_extent = extent_end; + sctx->cur_inode_last_extent = btrfs_file_extent_end(path); out: btrfs_free_path(path); return ret; @@ -5763,16 +5878,7 @@ break; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_ram_bytes(leaf, fi); - - extent_end = ALIGN(key.offset + size, - root->fs_info->sectorsize); - } else { - extent_end = key.offset + - btrfs_file_extent_num_bytes(leaf, fi); - } + extent_end = btrfs_file_extent_end(path); if (extent_end <= start) goto next; if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) { @@ -5793,9 +5899,6 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) { - struct btrfs_file_extent_item *fi; - u64 extent_end; - u8 type; int ret = 0; if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) @@ -5805,18 +5908,6 @@ ret = get_last_extent(sctx, key->offset - 1); if (ret) return ret; - } - - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], fi); - if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi); - extent_end = ALIGN(key->offset + size, - sctx->send_root->fs_info->sectorsize); - } else { - extent_end = key->offset + - btrfs_file_extent_num_bytes(path->nodes[0], fi); } if (path->slots[0] == 0 && @@ -5844,7 +5935,7 @@ else ret = 0; } - sctx->cur_inode_last_extent = extent_end; + sctx->cur_inode_last_extent = btrfs_file_extent_end(path); return ret; } @@ -6654,6 +6745,365 @@ return ret; } +static int 
tree_move_down(struct btrfs_path *path, int *level) +{ + struct extent_buffer *eb; + + BUG_ON(*level == 0); + eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]); + if (IS_ERR(eb)) + return PTR_ERR(eb); + + path->nodes[*level - 1] = eb; + path->slots[*level - 1] = 0; + (*level)--; + return 0; +} + +static int tree_move_next_or_upnext(struct btrfs_path *path, + int *level, int root_level) +{ + int ret = 0; + int nritems; + nritems = btrfs_header_nritems(path->nodes[*level]); + + path->slots[*level]++; + + while (path->slots[*level] >= nritems) { + if (*level == root_level) + return -1; + + /* move upnext */ + path->slots[*level] = 0; + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + (*level)++; + path->slots[*level]++; + + nritems = btrfs_header_nritems(path->nodes[*level]); + ret = 1; + } + return ret; +} + +/* + * Returns 1 if it had to move up and next. 0 is returned if it moved only next + * or down. + */ +static int tree_advance(struct btrfs_path *path, + int *level, int root_level, + int allow_down, + struct btrfs_key *key) +{ + int ret; + + if (*level == 0 || !allow_down) { + ret = tree_move_next_or_upnext(path, level, root_level); + } else { + ret = tree_move_down(path, level); + } + if (ret >= 0) { + if (*level == 0) + btrfs_item_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + else + btrfs_node_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + } + return ret; +} + +static int tree_compare_item(struct btrfs_path *left_path, + struct btrfs_path *right_path, + char *tmp_buf) +{ + int cmp; + int len1, len2; + unsigned long off1, off2; + + len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]); + len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]); + if (len1 != len2) + return 1; + + off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); + off2 = btrfs_item_ptr_offset(right_path->nodes[0], + right_path->slots[0]); + + read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); + + cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); + if (cmp) + return 1; + return 0; +} + +/* + * This function compares two trees and calls the provided callback for + * every changed/new/deleted item it finds. + * If shared tree blocks are encountered, whole subtrees are skipped, making + * the compare pretty fast on snapshotted subvolumes. + * + * This currently works on commit roots only. As commit roots are read only, + * we don't do any locking. The commit roots are protected with transactions. + * Transactions are ended and rejoined when a commit is tried in between. + * + * This function checks for modifications done to the trees while comparing. + * If it detects a change, it aborts immediately. 
+ */ +static int btrfs_compare_trees(struct btrfs_root *left_root, + struct btrfs_root *right_root, void *ctx) +{ + struct btrfs_fs_info *fs_info = left_root->fs_info; + int ret; + int cmp; + struct btrfs_path *left_path = NULL; + struct btrfs_path *right_path = NULL; + struct btrfs_key left_key; + struct btrfs_key right_key; + char *tmp_buf = NULL; + int left_root_level; + int right_root_level; + int left_level; + int right_level; + int left_end_reached; + int right_end_reached; + int advance_left; + int advance_right; + u64 left_blockptr; + u64 right_blockptr; + u64 left_gen; + u64 right_gen; + + left_path = btrfs_alloc_path(); + if (!left_path) { + ret = -ENOMEM; + goto out; + } + right_path = btrfs_alloc_path(); + if (!right_path) { + ret = -ENOMEM; + goto out; + } + + tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); + if (!tmp_buf) { + ret = -ENOMEM; + goto out; + } + + left_path->search_commit_root = 1; + left_path->skip_locking = 1; + right_path->search_commit_root = 1; + right_path->skip_locking = 1; + + /* + * Strategy: Go to the first items of both trees. Then do + * + * If both trees are at level 0 + * Compare keys of current items + * If left < right treat left item as new, advance left tree + * and repeat + * If left > right treat right item as deleted, advance right tree + * and repeat + * If left == right do deep compare of items, treat as changed if + * needed, advance both trees and repeat + * If both trees are at the same level but not at level 0 + * Compare keys of current nodes/leafs + * If left < right advance left tree and repeat + * If left > right advance right tree and repeat + * If left == right compare blockptrs of the next nodes/leafs + * If they match advance both trees but stay at the same level + * and repeat + * If they don't match advance both trees while allowing to go + * deeper and repeat + * If tree levels are different + * Advance the tree that needs it and repeat + * + * Advancing a tree means: + * If we are at level 0, try to go to the next slot. If that's not + * possible, go one level up and repeat. Stop when we found a level + * where we could go to the next slot. We may at this point be on a + * node or a leaf. + * + * If we are not at level 0 and not on shared tree blocks, go one + * level deeper. + * + * If we are not at level 0 and on shared tree blocks, go one slot to + * the right if possible or go up and right. 
+ */ + + down_read(&fs_info->commit_root_sem); + left_level = btrfs_header_level(left_root->commit_root); + left_root_level = left_level; + left_path->nodes[left_level] = + btrfs_clone_extent_buffer(left_root->commit_root); + if (!left_path->nodes[left_level]) { + up_read(&fs_info->commit_root_sem); + ret = -ENOMEM; + goto out; + } + + right_level = btrfs_header_level(right_root->commit_root); + right_root_level = right_level; + right_path->nodes[right_level] = + btrfs_clone_extent_buffer(right_root->commit_root); + if (!right_path->nodes[right_level]) { + up_read(&fs_info->commit_root_sem); + ret = -ENOMEM; + goto out; + } + up_read(&fs_info->commit_root_sem); + + if (left_level == 0) + btrfs_item_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + else + btrfs_node_key_to_cpu(left_path->nodes[left_level], + &left_key, left_path->slots[left_level]); + if (right_level == 0) + btrfs_item_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + else + btrfs_node_key_to_cpu(right_path->nodes[right_level], + &right_key, right_path->slots[right_level]); + + left_end_reached = right_end_reached = 0; + advance_left = advance_right = 0; + + while (1) { + cond_resched(); + if (advance_left && !left_end_reached) { + ret = tree_advance(left_path, &left_level, + left_root_level, + advance_left != ADVANCE_ONLY_NEXT, + &left_key); + if (ret == -1) + left_end_reached = ADVANCE; + else if (ret < 0) + goto out; + advance_left = 0; + } + if (advance_right && !right_end_reached) { + ret = tree_advance(right_path, &right_level, + right_root_level, + advance_right != ADVANCE_ONLY_NEXT, + &right_key); + if (ret == -1) + right_end_reached = ADVANCE; + else if (ret < 0) + goto out; + advance_right = 0; + } + + if (left_end_reached && right_end_reached) { + ret = 0; + goto out; + } else if (left_end_reached) { + if (right_level == 0) { + ret = changed_cb(left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + } + advance_right = ADVANCE; + continue; + } else if (right_end_reached) { + if (left_level == 0) { + ret = changed_cb(left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + } + advance_left = ADVANCE; + continue; + } + + if (left_level == 0 && right_level == 0) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + ret = changed_cb(left_path, right_path, + &left_key, + BTRFS_COMPARE_TREE_NEW, + ctx); + if (ret < 0) + goto out; + advance_left = ADVANCE; + } else if (cmp > 0) { + ret = changed_cb(left_path, right_path, + &right_key, + BTRFS_COMPARE_TREE_DELETED, + ctx); + if (ret < 0) + goto out; + advance_right = ADVANCE; + } else { + enum btrfs_compare_tree_result result; + + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); + ret = tree_compare_item(left_path, right_path, + tmp_buf); + if (ret) + result = BTRFS_COMPARE_TREE_CHANGED; + else + result = BTRFS_COMPARE_TREE_SAME; + ret = changed_cb(left_path, right_path, + &left_key, result, ctx); + if (ret < 0) + goto out; + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } else if (left_level == right_level) { + cmp = btrfs_comp_cpu_keys(&left_key, &right_key); + if (cmp < 0) { + advance_left = ADVANCE; + } else if (cmp > 0) { + advance_right = ADVANCE; + } else { + left_blockptr = btrfs_node_blockptr( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_blockptr = btrfs_node_blockptr( + right_path->nodes[right_level], + 
right_path->slots[right_level]); + left_gen = btrfs_node_ptr_generation( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_gen = btrfs_node_ptr_generation( + right_path->nodes[right_level], + right_path->slots[right_level]); + if (left_blockptr == right_blockptr && + left_gen == right_gen) { + /* + * As we're on a shared block, don't + * allow to go deeper. + */ + advance_left = ADVANCE_ONLY_NEXT; + advance_right = ADVANCE_ONLY_NEXT; + } else { + advance_left = ADVANCE; + advance_right = ADVANCE; + } + } + } else if (left_level < right_level) { + advance_right = ADVANCE; + } else { + advance_left = ADVANCE; + } + } + +out: + btrfs_free_path(left_path); + btrfs_free_path(right_path); + kvfree(tmp_buf); + return ret; +} + static int send_subvol(struct send_ctx *sctx) { int ret; @@ -6669,8 +7119,7 @@ goto out; if (sctx->parent_root) { - ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, - changed_cb, sctx); + ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx); if (ret < 0) goto out; ret = finish_inode_if_needed(sctx, 1); @@ -6779,20 +7228,25 @@ spin_unlock(&root->root_item_lock); } +static void dedupe_in_progress_warn(const struct btrfs_root *root) +{ + btrfs_warn_rl(root->fs_info, +"cannot use root %llu for send while deduplications on it are in progress (%d in progress)", + root->root_key.objectid, root->dedupe_in_progress); +} + long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) { int ret = 0; struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; - struct btrfs_key key; struct send_ctx *sctx = NULL; u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; - unsigned alloc_size; + size_t alloc_size; int sort_clone_roots = 0; - int index; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -6802,6 +7256,11 @@ * making it RW. This also protects against deletion. */ spin_lock(&send_root->root_item_lock); + if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) { + dedupe_in_progress_warn(send_root); + spin_unlock(&send_root->root_item_lock); + return -EAGAIN; + } send_root->send_in_progress++; spin_unlock(&send_root->root_item_lock); @@ -6817,18 +7276,11 @@ /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside - * access_ok. + * access_ok. Also set an upper limit for allocation size so this can't + * easily exhaust memory. Max number of clone sources is about 200K. 
*/ - if (arg->clone_sources_count > - ULONG_MAX / sizeof(struct clone_root) - 1) { + if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) { ret = -EINVAL; - goto out; - } - - if (!access_ok(VERIFY_READ, arg->clone_sources, - sizeof(*arg->clone_sources) * - arg->clone_sources_count)) { - ret = -EFAULT; goto out; } @@ -6875,25 +7327,20 @@ goto out; } - sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL); - if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; - } - sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; - alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - - sctx->clone_roots = kvzalloc(alloc_size, GFP_KERNEL); + sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), + arg->clone_sources_count + 1, + GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; goto out; } - alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); + alloc_size = array_size(sizeof(*arg->clone_sources), + arg->clone_sources_count); if (arg->clone_sources_count) { clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); @@ -6910,15 +7357,9 @@ } for (i = 0; i < arg->clone_sources_count; i++) { - key.objectid = clone_sources_tmp[i]; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - index = srcu_read_lock(&fs_info->subvol_srcu); - - clone_root = btrfs_read_fs_root_no_name(fs_info, &key); + clone_root = btrfs_get_fs_root(fs_info, + clone_sources_tmp[i], true); if (IS_ERR(clone_root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(clone_root); goto out; } @@ -6926,13 +7367,19 @@ if (!btrfs_root_readonly(clone_root) || btrfs_root_dead(clone_root)) { spin_unlock(&clone_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); + btrfs_put_root(clone_root); ret = -EPERM; + goto out; + } + if (clone_root->dedupe_in_progress) { + dedupe_in_progress_warn(clone_root); + spin_unlock(&clone_root->root_item_lock); + btrfs_put_root(clone_root); + ret = -EAGAIN; goto out; } clone_root->send_in_progress++; spin_unlock(&clone_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); sctx->clone_roots[i].root = clone_root; clone_sources_to_rollback = i + 1; @@ -6942,15 +7389,9 @@ } if (arg->parent_root) { - key.objectid = arg->parent_root; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - index = srcu_read_lock(&fs_info->subvol_srcu); - - sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); + sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root, + true); if (IS_ERR(sctx->parent_root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = PTR_ERR(sctx->parent_root); goto out; } @@ -6960,13 +7401,16 @@ if (!btrfs_root_readonly(sctx->parent_root) || btrfs_root_dead(sctx->parent_root)) { spin_unlock(&sctx->parent_root->root_item_lock); - srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; goto out; } + if (sctx->parent_root->dedupe_in_progress) { + dedupe_in_progress_warn(sctx->parent_root); + spin_unlock(&sctx->parent_root->root_item_lock); + ret = -EAGAIN; + goto out; + } spin_unlock(&sctx->parent_root->root_item_lock); - - srcu_read_unlock(&fs_info->subvol_srcu, index); } /* @@ -6974,7 +7418,8 @@ * is behind the current send position. This is checked while searching * for possible clone sources. 
*/ - sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root; + sctx->clone_roots[sctx->clone_roots_cnt++].root = + btrfs_grab_root(sctx->send_root); /* We do a bsearch later */ sort(sctx->clone_roots, sctx->clone_roots_cnt, @@ -6990,9 +7435,23 @@ if (ret) goto out; + mutex_lock(&fs_info->balance_mutex); + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + mutex_unlock(&fs_info->balance_mutex); + btrfs_warn_rl(fs_info, + "cannot run send because a balance operation is in progress"); + ret = -EAGAIN; + goto out; + } + fs_info->send_in_progress++; + mutex_unlock(&fs_info->balance_mutex); + current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); current->journal_info = NULL; + mutex_lock(&fs_info->balance_mutex); + fs_info->send_in_progress--; + mutex_unlock(&fs_info->balance_mutex); if (ret < 0) goto out; @@ -7045,18 +7504,24 @@ } if (sort_clone_roots) { - for (i = 0; i < sctx->clone_roots_cnt; i++) + for (i = 0; i < sctx->clone_roots_cnt; i++) { btrfs_root_dec_send_in_progress( sctx->clone_roots[i].root); + btrfs_put_root(sctx->clone_roots[i].root); + } } else { - for (i = 0; sctx && i < clone_sources_to_rollback; i++) + for (i = 0; sctx && i < clone_sources_to_rollback; i++) { btrfs_root_dec_send_in_progress( sctx->clone_roots[i].root); + btrfs_put_root(sctx->clone_roots[i].root); + } btrfs_root_dec_send_in_progress(send_root); } - if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) + if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) { btrfs_root_dec_send_in_progress(sctx->parent_root); + btrfs_put_root(sctx->parent_root); + } kvfree(clone_sources_tmp); @@ -7066,7 +7531,6 @@ kvfree(sctx->clone_roots); kvfree(sctx->send_buf); - kvfree(sctx->read_buf); name_cache_free(sctx); -- Gitblit v1.6.2
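
Why this patch replaces cpu_to_le16() stores with put_unaligned_le16() in tlv_put() and in the command header: a TLV header can start at any byte offset inside send_buf, so the little-endian stores must not assume natural alignment. The following is a minimal userspace sketch of that framing, not the kernel code; struct tlv_header's layout, the ATTR_DEMO constant and the -1 error return are local stand-ins.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	struct tlv_header {
		uint16_t tlv_type;
		uint16_t tlv_len;
	} __attribute__((packed));

	#define ATTR_DEMO 1	/* hypothetical attribute id */

	/* memcpy-based unaligned little-endian store, as put_unaligned_le16() does */
	static void store_le16(uint16_t val, void *p)
	{
		uint8_t b[2] = { val & 0xff, val >> 8 };

		memcpy(p, b, sizeof(b));
	}

	/* append one attribute to the stream buffer, like tlv_put() */
	static int tlv_put(uint8_t *buf, size_t *size, size_t max,
			   uint16_t attr, const void *data, uint16_t len)
	{
		size_t total = sizeof(struct tlv_header) + len;
		struct tlv_header *hdr;

		if (max - *size < total)
			return -1;		/* models the -EOVERFLOW check */

		hdr = (struct tlv_header *)(buf + *size);
		store_le16(attr, &hdr->tlv_type);
		store_le16(len, &hdr->tlv_len);
		memcpy(hdr + 1, data, len);	/* payload follows the header */
		*size += total;
		return 0;
	}

	int main(void)
	{
		uint8_t buf[64];
		size_t size = 1;	/* odd start offset: the header is unaligned */

		tlv_put(buf, &size, sizeof(buf), ATTR_DEMO, "x", 1);
		printf("stream size now %zu\n", size);
		return 0;
	}

The same concern motivates reading hdr->cmd back with get_unaligned_le16() when accounting cmd_send_size in send_cmd().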
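
On the removal of read_buf: put_data_header() reserves the BTRFS_SEND_A_DATA header in send_buf up front, and put_file_data() then copies page contents directly behind it, so file data is no longer staged in a separate buffer. send_extent_data() caps each WRITE command at max_send_read_size(), i.e. send_max_size minus 16K of headroom for the command header and the other attributes. A self-contained model of the chunking follows; SEND_MAX_SIZE is an assumed buffer size, and send_write() only prints what the real function would emit.

	#include <stdint.h>
	#include <stdio.h>

	#define SEND_MAX_SIZE	(64 * 1024)
	#define HDR_RESERVE	(16 * 1024)	/* stands in for SZ_16K */

	static uint64_t max_send_read_size(void)
	{
		return SEND_MAX_SIZE - HDR_RESERVE;
	}

	/* pretend to emit one WRITE command covering [offset, offset + len) */
	static int send_write(uint64_t offset, uint64_t len)
	{
		printf("WRITE offset=%llu len=%llu\n",
		       (unsigned long long)offset, (unsigned long long)len);
		return 0;
	}

	static int send_extent_data(uint64_t offset, uint64_t len)
	{
		uint64_t read_size = max_send_read_size();
		uint64_t sent = 0;

		while (sent < len) {
			uint64_t size = len - sent;

			if (size > read_size)
				size = read_size;
			if (send_write(offset + sent, size) < 0)
				return -1;
			sent += size;	/* no short reads, so advance by size */
		}
		return 0;
	}

	int main(void)
	{
		/* a 100000-byte extent is split into chunks of at most 49152 bytes */
		return send_extent_data(4096, 100000);
	}

This is also why the old "if (!ret) break; sent += ret;" dance disappears: send_write() no longer reports a byte count, it either queues the full chunk or fails.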
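
The eof-block case in clone_range(): when the clone source range ends exactly at the source inode's i_size and that end is not aligned to the largest possible sector size (64K is assumed because the receiver's sector size is unknown), the patch clones only the aligned-down prefix and falls back to a regular write for the tail. The arithmetic, as a stand-alone sketch with made-up numbers:

	#include <stdint.h>
	#include <stdio.h>

	#define MAX_SECTORSIZE	(64 * 1024)

	/* general align-down; the kernel macro requires a power-of-2 alignment */
	#define ALIGN_DOWN(x, a) ((x) - ((x) % (a)))

	static void split_clone(uint64_t clone_offset, uint64_t clone_len,
				uint64_t src_i_size,
				uint64_t *cloned, uint64_t *written)
	{
		uint64_t src_end = clone_offset + clone_len;

		if (src_end == src_i_size && src_end % MAX_SECTORSIZE != 0) {
			/* clone the aligned prefix, write the unaligned tail */
			*cloned = ALIGN_DOWN(src_end - clone_offset,
					     MAX_SECTORSIZE);
			*written = clone_len - *cloned;
		} else {
			*cloned = clone_len;
			*written = 0;
		}
	}

	int main(void)
	{
		uint64_t cloned, written;

		/* source file is 200000 bytes; clone its last 134928 bytes */
		split_clone(65072, 134928, 200000, &cloned, &written);
		printf("clone %llu bytes, write %llu bytes\n",
		       (unsigned long long)cloned, (unsigned long long)written);
		return 0;
	}

With these numbers the result is a 131072-byte clone plus a 3856-byte regular write, which is exactly the send_clone()-then-send_extent_data() pair the hunk issues.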
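
At level 0, the strategy described in the btrfs_compare_trees() comment reduces to a merge walk over two key-sorted item streams, with the left stream being the send root and the right stream the parent root. This sketch replays that walk over plain sorted arrays and maps the outcomes onto the four btrfs_compare_tree_result cases; the shared-block subtree skipping that makes the real compare fast on snapshots has no analogue here.

	#include <stdio.h>

	struct item { int key; int val; };

	static void compare(const struct item *l, int nl,
			    const struct item *r, int nr)
	{
		int i = 0, j = 0;

		while (i < nl || j < nr) {
			if (j >= nr || (i < nl && l[i].key < r[j].key)) {
				printf("NEW      key=%d\n", l[i].key);	/* only in send root */
				i++;
			} else if (i >= nl || l[i].key > r[j].key) {
				printf("DELETED  key=%d\n", r[j].key);	/* only in parent root */
				j++;
			} else {
				/* same key: deep-compare the items */
				if (l[i].val != r[j].val)
					printf("CHANGED  key=%d\n", l[i].key);
				else
					printf("SAME     key=%d\n", l[i].key);
				i++;
				j++;
			}
		}
	}

	int main(void)
	{
		/* left = send root items, right = parent root items */
		struct item left[]  = { {1, 10}, {2, 20}, {4, 40} };
		struct item right[] = { {1, 10}, {2, 21}, {3, 30} };

		compare(left, 3, right, 3);
		return 0;
	}

The val comparison stands in for tree_compare_item(), which memcmp()s the raw item payloads via the tmp_buf bounce buffer.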
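
Finally, the clone-root lookup now keys on root->root_key.objectid in both comparators. The same sort-then-bsearch pattern in plain libc, for illustration only; the kernel variant packs the id into the key pointer itself via (u64)(uintptr_t), while this sketch passes an ordinary pointer, and the struct fields are stand-ins.

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct clone_root { uint64_t objectid; /* ... offset, found_refs ... */ };

	/* like __clone_root_cmp_sort(): order the array by objectid */
	static int cmp_sort(const void *a, const void *b)
	{
		const struct clone_root *c1 = a, *c2 = b;

		if (c1->objectid < c2->objectid)
			return -1;
		if (c1->objectid > c2->objectid)
			return 1;
		return 0;
	}

	/* like __clone_root_cmp_bsearch(): locate one objectid */
	static int cmp_bsearch(const void *key, const void *elt)
	{
		uint64_t root = *(const uint64_t *)key;
		const struct clone_root *cr = elt;

		if (root < cr->objectid)
			return -1;
		if (root > cr->objectid)
			return 1;
		return 0;
	}

	int main(void)
	{
		struct clone_root roots[] = { {260}, {256}, {259} };
		uint64_t want = 259;
		struct clone_root *found;

		qsort(roots, 3, sizeof(roots[0]), cmp_sort);
		found = bsearch(&want, roots, 3, sizeof(roots[0]), cmp_bsearch);
		printf("found: %s\n", found ? "yes" : "no");
		return 0;
	}

Sorting once and bsearching per extent backref is what keeps find_extent_clone() cheap even with many clone sources.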