hc
2024-05-13 9d77db3c730780c8ef5ccd4b66403ff5675cfe4e
kernel/fs/btrfs/send.c
....@@ -122,8 +122,6 @@
122122
123123 struct file_ra_state ra;
124124
125
- char *read_buf;
126
-
127125 /*
128126 * We process inodes by their increasing order, so if before an
129127 * incremental send we reverse the parent/child relationship of
....@@ -268,6 +266,16 @@
268266 int need_later_update;
269267 int name_len;
270268 char name[];
269
+};
270
+
271
+#define ADVANCE 1
272
+#define ADVANCE_ONLY_NEXT -1
273
+
274
+enum btrfs_compare_tree_result {
275
+ BTRFS_COMPARE_TREE_NEW,
276
+ BTRFS_COMPARE_TREE_DELETED,
277
+ BTRFS_COMPARE_TREE_CHANGED,
278
+ BTRFS_COMPARE_TREE_SAME,
271279 };
272280
273281 __cold
....@@ -570,8 +578,8 @@
570578 return -EOVERFLOW;
571579
572580 hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
573
- hdr->tlv_type = cpu_to_le16(attr);
574
- hdr->tlv_len = cpu_to_le16(len);
581
+ put_unaligned_le16(attr, &hdr->tlv_type);
582
+ put_unaligned_le16(len, &hdr->tlv_len);
575583 memcpy(hdr + 1, data, len);
576584 sctx->send_size += total_len;
577585
....@@ -681,7 +689,7 @@
681689
682690 sctx->send_size += sizeof(*hdr);
683691 hdr = (struct btrfs_cmd_header *)sctx->send_buf;
684
- hdr->cmd = cpu_to_le16(cmd);
692
+ put_unaligned_le16(cmd, &hdr->cmd);
685693
686694 return 0;
687695 }
....@@ -693,17 +701,17 @@
693701 u32 crc;
694702
695703 hdr = (struct btrfs_cmd_header *)sctx->send_buf;
696
- hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
697
- hdr->crc = 0;
704
+ put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
705
+ put_unaligned_le32(0, &hdr->crc);
698706
699
- crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
700
- hdr->crc = cpu_to_le32(crc);
707
+ crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
708
+ put_unaligned_le32(crc, &hdr->crc);
701709
702710 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
703711 &sctx->send_off);
704712
705713 sctx->total_send_size += sctx->send_size;
706
- sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
714
+ sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size;
707715 sctx->send_size = 0;
708716
709717 return ret;
....@@ -1170,7 +1178,6 @@
11701178 struct backref_ctx {
11711179 struct send_ctx *sctx;
11721180
1173
- struct btrfs_path *path;
11741181 /* number of total found references */
11751182 u64 found;
11761183
....@@ -1196,9 +1203,9 @@
11961203 u64 root = (u64)(uintptr_t)key;
11971204 struct clone_root *cr = (struct clone_root *)elt;
11981205
1199
- if (root < cr->root->objectid)
1206
+ if (root < cr->root->root_key.objectid)
12001207 return -1;
1201
- if (root > cr->root->objectid)
1208
+ if (root > cr->root->root_key.objectid)
12021209 return 1;
12031210 return 0;
12041211 }
....@@ -1208,9 +1215,9 @@
12081215 struct clone_root *cr1 = (struct clone_root *)e1;
12091216 struct clone_root *cr2 = (struct clone_root *)e2;
12101217
1211
- if (cr1->root->objectid < cr2->root->objectid)
1218
+ if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
12121219 return -1;
1213
- if (cr1->root->objectid > cr2->root->objectid)
1220
+ if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
12141221 return 1;
12151222 return 0;
12161223 }
....@@ -1223,8 +1230,6 @@
12231230 {
12241231 struct backref_ctx *bctx = ctx_;
12251232 struct clone_root *found;
1226
- int ret;
1227
- u64 i_size;
12281233
12291234 /* First check if the root is in the list of accepted clone sources */
12301235 found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
....@@ -1241,30 +1246,26 @@
12411246 }
12421247
12431248 /*
1244
- * There are inodes that have extents that lie behind its i_size. Don't
1245
- * accept clones from these extents.
1246
- */
1247
- ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
1248
- NULL, NULL, NULL);
1249
- btrfs_release_path(bctx->path);
1250
- if (ret < 0)
1251
- return ret;
1252
-
1253
- if (offset + bctx->data_offset + bctx->extent_len > i_size)
1254
- return 0;
1255
-
1256
- /*
12571249 * Make sure we don't consider clones from send_root that are
12581250 * behind the current inode/offset.
12591251 */
12601252 if (found->root == bctx->sctx->send_root) {
12611253 /*
1262
- * TODO for the moment we don't accept clones from the inode
1263
- * that is currently send. We may change this when
1264
- * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
1265
- * file.
1254
+ * If the source inode was not yet processed we can't issue a
1255
+ * clone operation, as the source extent does not exist yet at
1256
+ * the destination of the stream.
12661257 */
1267
- if (ino >= bctx->cur_objectid)
1258
+ if (ino > bctx->cur_objectid)
1259
+ return 0;
1260
+ /*
1261
+ * We clone from the inode currently being sent as long as the
1262
+ * source extent is already processed, otherwise we could try
1263
+ * to clone from an extent that does not exist yet at the
1264
+ * destination of the stream.
1265
+ */
1266
+ if (ino == bctx->cur_objectid &&
1267
+ offset + bctx->extent_len >
1268
+ bctx->sctx->cur_inode_next_write_offset)
12681269 return 0;
12691270 }
12701271
....@@ -1329,8 +1330,6 @@
13291330 ret = -ENOMEM;
13301331 goto out;
13311332 }
1332
-
1333
- backref_ctx->path = tmp_path;
13341333
13351334 if (data_offset >= ino_size) {
13361335 /*
....@@ -1718,12 +1717,8 @@
17181717
17191718 di = btrfs_lookup_dir_item(NULL, root, path,
17201719 dir, name, name_len, 0);
1721
- if (!di) {
1722
- ret = -ENOENT;
1723
- goto out;
1724
- }
1725
- if (IS_ERR(di)) {
1726
- ret = PTR_ERR(di);
1720
+ if (IS_ERR_OR_NULL(di)) {
1721
+ ret = di ? PTR_ERR(di) : -ENOENT;
17271722 goto out;
17281723 }
17291724 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
....@@ -2267,7 +2262,7 @@
22672262 * inodes "orphan" name instead of the real name and stop. Same with new inodes
22682263 * that were not created yet and overwritten inodes/refs.
22692264 *
2270
- * When do we have have orphan inodes:
2265
+ * When do we have orphan inodes:
22712266 * 1. When an inode is freshly created and thus no valid refs are available yet
22722267 * 2. When a directory lost all it's refs (deleted) but still has dir items
22732268 * inside which were not processed yet (pending for move/delete). If anyone
....@@ -2371,7 +2366,7 @@
23712366 return -ENOMEM;
23722367 }
23732368
2374
- key.objectid = send_root->objectid;
2369
+ key.objectid = send_root->root_key.objectid;
23752370 key.type = BTRFS_ROOT_BACKREF_KEY;
23762371 key.offset = 0;
23772372
....@@ -2387,7 +2382,7 @@
23872382 leaf = path->nodes[0];
23882383 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
23892384 if (key.type != BTRFS_ROOT_BACKREF_KEY ||
2390
- key.objectid != send_root->objectid) {
2385
+ key.objectid != send_root->root_key.objectid) {
23912386 ret = -ENOENT;
23922387 goto out;
23932388 }
....@@ -3957,52 +3952,56 @@
39573952 goto out;
39583953 }
39593954
3955
+ /*
3956
+ * Before doing any rename and link operations, do a first pass on the
3957
+ * new references to orphanize any unprocessed inodes that may have a
3958
+ * reference that conflicts with one of the new references of the current
3959
+ * inode. This needs to happen first because a new reference may conflict
3960
+ * with the old reference of a parent directory, so we must make sure
3961
+ * that the path used for link and rename commands don't use an
3962
+ * orphanized name when an ancestor was not yet orphanized.
3963
+ *
3964
+ * Example:
3965
+ *
3966
+ * Parent snapshot:
3967
+ *
3968
+ * . (ino 256)
3969
+ * |----- testdir/ (ino 259)
3970
+ * | |----- a (ino 257)
3971
+ * |
3972
+ * |----- b (ino 258)
3973
+ *
3974
+ * Send snapshot:
3975
+ *
3976
+ * . (ino 256)
3977
+ * |----- testdir_2/ (ino 259)
3978
+ * | |----- a (ino 260)
3979
+ * |
3980
+ * |----- testdir (ino 257)
3981
+ * |----- b (ino 257)
3982
+ * |----- b2 (ino 258)
3983
+ *
3984
+ * Processing the new reference for inode 257 with name "b" may happen
3985
+ * before processing the new reference with name "testdir". If so, we
3986
+ * must make sure that by the time we send a link command to create the
3987
+ * hard link "b", inode 259 was already orphanized, since the generated
3988
+ * path in "valid_path" already contains the orphanized name for 259.
3989
+ * We are processing inode 257, so only later when processing 259 we do
3990
+ * the rename operation to change its temporary (orphanized) name to
3991
+ * "testdir_2".
3992
+ */
39603993 list_for_each_entry(cur, &sctx->new_refs, list) {
3961
- /*
3962
- * We may have refs where the parent directory does not exist
3963
- * yet. This happens if the parent directories inum is higher
3964
- * the the current inum. To handle this case, we create the
3965
- * parent directory out of order. But we need to check if this
3966
- * did already happen before due to other refs in the same dir.
3967
- */
39683994 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
39693995 if (ret < 0)
39703996 goto out;
3971
- if (ret == inode_state_will_create) {
3972
- ret = 0;
3973
- /*
3974
- * First check if any of the current inodes refs did
3975
- * already create the dir.
3976
- */
3977
- list_for_each_entry(cur2, &sctx->new_refs, list) {
3978
- if (cur == cur2)
3979
- break;
3980
- if (cur2->dir == cur->dir) {
3981
- ret = 1;
3982
- break;
3983
- }
3984
- }
3985
-
3986
- /*
3987
- * If that did not happen, check if a previous inode
3988
- * did already create the dir.
3989
- */
3990
- if (!ret)
3991
- ret = did_create_dir(sctx, cur->dir);
3992
- if (ret < 0)
3993
- goto out;
3994
- if (!ret) {
3995
- ret = send_create_inode(sctx, cur->dir);
3996
- if (ret < 0)
3997
- goto out;
3998
- }
3999
- }
3997
+ if (ret == inode_state_will_create)
3998
+ continue;
40003999
40014000 /*
4002
- * Check if this new ref would overwrite the first ref of
4003
- * another unprocessed inode. If yes, orphanize the
4004
- * overwritten inode. If we find an overwritten ref that is
4005
- * not the first ref, simply unlink it.
4001
+ * Check if this new ref would overwrite the first ref of another
4002
+ * unprocessed inode. If yes, orphanize the overwritten inode.
4003
+ * If we find an overwritten ref that is not the first ref,
4004
+ * simply unlink it.
40064005 */
40074006 ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
40084007 cur->name, cur->name_len,
....@@ -4093,6 +4092,49 @@
40934092 goto out;
40944093 }
40954094 ret = send_unlink(sctx, cur->full_path);
4095
+ if (ret < 0)
4096
+ goto out;
4097
+ }
4098
+ }
4099
+
4100
+ }
4101
+
4102
+ list_for_each_entry(cur, &sctx->new_refs, list) {
4103
+ /*
4104
+ * We may have refs where the parent directory does not exist
4105
+ * yet. This happens if the parent directories inum is higher
4106
+ * than the current inum. To handle this case, we create the
4107
+ * parent directory out of order. But we need to check if this
4108
+ * did already happen before due to other refs in the same dir.
4109
+ */
4110
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
4111
+ if (ret < 0)
4112
+ goto out;
4113
+ if (ret == inode_state_will_create) {
4114
+ ret = 0;
4115
+ /*
4116
+ * First check if any of the current inodes refs did
4117
+ * already create the dir.
4118
+ */
4119
+ list_for_each_entry(cur2, &sctx->new_refs, list) {
4120
+ if (cur == cur2)
4121
+ break;
4122
+ if (cur2->dir == cur->dir) {
4123
+ ret = 1;
4124
+ break;
4125
+ }
4126
+ }
4127
+
4128
+ /*
4129
+ * If that did not happen, check if a previous inode
4130
+ * did already create the dir.
4131
+ */
4132
+ if (!ret)
4133
+ ret = did_create_dir(sctx, cur->dir);
4134
+ if (ret < 0)
4135
+ goto out;
4136
+ if (!ret) {
4137
+ ret = send_create_inode(sctx, cur->dir);
40964138 if (ret < 0)
40974139 goto out;
40984140 }
....@@ -4893,35 +4935,43 @@
48934935 return ret;
48944936 }
48954937
4896
-static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
4938
+static inline u64 max_send_read_size(const struct send_ctx *sctx)
4939
+{
4940
+ return sctx->send_max_size - SZ_16K;
4941
+}
4942
+
4943
+static int put_data_header(struct send_ctx *sctx, u32 len)
4944
+{
4945
+ struct btrfs_tlv_header *hdr;
4946
+
4947
+ if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
4948
+ return -EOVERFLOW;
4949
+ hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
4950
+ put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
4951
+ put_unaligned_le16(len, &hdr->tlv_len);
4952
+ sctx->send_size += sizeof(*hdr);
4953
+ return 0;
4954
+}
4955
+
4956
+static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
48974957 {
48984958 struct btrfs_root *root = sctx->send_root;
48994959 struct btrfs_fs_info *fs_info = root->fs_info;
49004960 struct inode *inode;
49014961 struct page *page;
49024962 char *addr;
4903
- struct btrfs_key key;
49044963 pgoff_t index = offset >> PAGE_SHIFT;
49054964 pgoff_t last_index;
4906
- unsigned pg_offset = offset & ~PAGE_MASK;
4907
- ssize_t ret = 0;
4965
+ unsigned pg_offset = offset_in_page(offset);
4966
+ int ret;
49084967
4909
- key.objectid = sctx->cur_ino;
4910
- key.type = BTRFS_INODE_ITEM_KEY;
4911
- key.offset = 0;
4968
+ ret = put_data_header(sctx, len);
4969
+ if (ret)
4970
+ return ret;
49124971
4913
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
4972
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
49144973 if (IS_ERR(inode))
49154974 return PTR_ERR(inode);
4916
-
4917
- if (offset + len > i_size_read(inode)) {
4918
- if (offset > i_size_read(inode))
4919
- len = 0;
4920
- else
4921
- len = offset - i_size_read(inode);
4922
- }
4923
- if (len == 0)
4924
- goto out;
49254975
49264976 last_index = (offset + len - 1) >> PAGE_SHIFT;
49274977
....@@ -4967,16 +5017,16 @@
49675017 }
49685018
49695019 addr = kmap(page);
4970
- memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
5020
+ memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset,
5021
+ cur_len);
49715022 kunmap(page);
49725023 unlock_page(page);
49735024 put_page(page);
49745025 index++;
49755026 pg_offset = 0;
49765027 len -= cur_len;
4977
- ret += cur_len;
5028
+ sctx->send_size += cur_len;
49785029 }
4979
-out:
49805030 iput(inode);
49815031 return ret;
49825032 }
....@@ -4990,20 +5040,12 @@
49905040 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
49915041 int ret = 0;
49925042 struct fs_path *p;
4993
- ssize_t num_read = 0;
49945043
49955044 p = fs_path_alloc();
49965045 if (!p)
49975046 return -ENOMEM;
49985047
49995048 btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
5000
-
5001
- num_read = fill_read_buf(sctx, offset, len);
5002
- if (num_read <= 0) {
5003
- if (num_read < 0)
5004
- ret = num_read;
5005
- goto out;
5006
- }
50075049
50085050 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
50095051 if (ret < 0)
....@@ -5015,16 +5057,16 @@
50155057
50165058 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
50175059 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5018
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
5060
+ ret = put_file_data(sctx, offset, len);
5061
+ if (ret < 0)
5062
+ goto out;
50195063
50205064 ret = send_cmd(sctx);
50215065
50225066 tlv_put_failure:
50235067 out:
50245068 fs_path_free(p);
5025
- if (ret < 0)
5026
- return ret;
5027
- return num_read;
5069
+ return ret;
50285070 }
50295071
50305072 /*
....@@ -5040,8 +5082,8 @@
50405082
50415083 btrfs_debug(sctx->send_root->fs_info,
50425084 "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
5043
- offset, len, clone_root->root->objectid, clone_root->ino,
5044
- clone_root->offset);
5085
+ offset, len, clone_root->root->root_key.objectid,
5086
+ clone_root->ino, clone_root->offset);
50455087
50465088 p = fs_path_alloc();
50475089 if (!p)
....@@ -5136,8 +5178,8 @@
51365178 static int send_hole(struct send_ctx *sctx, u64 end)
51375179 {
51385180 struct fs_path *p = NULL;
5181
+ u64 read_size = max_send_read_size(sctx);
51395182 u64 offset = sctx->cur_inode_last_extent;
5140
- u64 len;
51415183 int ret = 0;
51425184
51435185 /*
....@@ -5164,16 +5206,19 @@
51645206 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
51655207 if (ret < 0)
51665208 goto tlv_put_failure;
5167
- memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
51685209 while (offset < end) {
5169
- len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
5210
+ u64 len = min(end - offset, read_size);
51705211
51715212 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
51725213 if (ret < 0)
51735214 break;
51745215 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
51755216 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5176
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
5217
+ ret = put_data_header(sctx, len);
5218
+ if (ret < 0)
5219
+ break;
5220
+ memset(sctx->send_buf + sctx->send_size, 0, len);
5221
+ sctx->send_size += len;
51775222 ret = send_cmd(sctx);
51785223 if (ret < 0)
51795224 break;
....@@ -5189,23 +5234,20 @@
51895234 const u64 offset,
51905235 const u64 len)
51915236 {
5237
+ u64 read_size = max_send_read_size(sctx);
51925238 u64 sent = 0;
51935239
51945240 if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
51955241 return send_update_extent(sctx, offset, len);
51965242
51975243 while (sent < len) {
5198
- u64 size = len - sent;
5244
+ u64 size = min(len - sent, read_size);
51995245 int ret;
52005246
5201
- if (size > BTRFS_SEND_READ_SIZE)
5202
- size = BTRFS_SEND_READ_SIZE;
52035247 ret = send_write(sctx, offset + sent, size);
52045248 if (ret < 0)
52055249 return ret;
5206
- if (!ret)
5207
- break;
5208
- sent += ret;
5250
+ sent += size;
52095251 }
52105252 return 0;
52115253 }
....@@ -5278,6 +5320,7 @@
52785320 struct btrfs_path *path;
52795321 struct btrfs_key key;
52805322 int ret;
5323
+ u64 clone_src_i_size = 0;
52815324
52825325 /*
52835326 * Prevent cloning from a zero offset with a length matching the sector
....@@ -5301,6 +5344,16 @@
53015344 path = alloc_path_for_send();
53025345 if (!path)
53035346 return -ENOMEM;
5347
+
5348
+ /*
5349
+ * There are inodes that have extents that lie behind its i_size. Don't
5350
+ * accept clones from these extents.
5351
+ */
5352
+ ret = __get_inode_info(clone_root->root, path, clone_root->ino,
5353
+ &clone_src_i_size, NULL, NULL, NULL, NULL, NULL);
5354
+ btrfs_release_path(path);
5355
+ if (ret < 0)
5356
+ goto out;
53045357
53055358 /*
53065359 * We can't send a clone operation for the entire range if we find
....@@ -5344,6 +5397,8 @@
53445397 u8 type;
53455398 u64 ext_len;
53465399 u64 clone_len;
5400
+ u64 clone_data_offset;
5401
+ bool crossed_src_i_size = false;
53475402
53485403 if (slot >= btrfs_header_nritems(leaf)) {
53495404 ret = btrfs_next_leaf(clone_root->root, path);
....@@ -5397,13 +5452,94 @@
53975452 if (key.offset >= clone_root->offset + len)
53985453 break;
53995454
5455
+ if (key.offset >= clone_src_i_size)
5456
+ break;
5457
+
5458
+ if (key.offset + ext_len > clone_src_i_size) {
5459
+ ext_len = clone_src_i_size - key.offset;
5460
+ crossed_src_i_size = true;
5461
+ }
5462
+
5463
+ clone_data_offset = btrfs_file_extent_offset(leaf, ei);
5464
+ if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
5465
+ clone_root->offset = key.offset;
5466
+ if (clone_data_offset < data_offset &&
5467
+ clone_data_offset + ext_len > data_offset) {
5468
+ u64 extent_offset;
5469
+
5470
+ extent_offset = data_offset - clone_data_offset;
5471
+ ext_len -= extent_offset;
5472
+ clone_data_offset += extent_offset;
5473
+ clone_root->offset += extent_offset;
5474
+ }
5475
+ }
5476
+
54005477 clone_len = min_t(u64, ext_len, len);
54015478
54025479 if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
5403
- btrfs_file_extent_offset(leaf, ei) == data_offset)
5404
- ret = send_clone(sctx, offset, clone_len, clone_root);
5405
- else
5480
+ clone_data_offset == data_offset) {
5481
+ const u64 src_end = clone_root->offset + clone_len;
5482
+ const u64 sectorsize = SZ_64K;
5483
+
5484
+ /*
5485
+ * We can't clone the last block, when its size is not
5486
+ * sector size aligned, into the middle of a file. If we
5487
+ * do so, the receiver will get a failure (-EINVAL) when
5488
+ * trying to clone or will silently corrupt the data in
5489
+ * the destination file if it's on a kernel without the
5490
+ * fix introduced by commit ac765f83f1397646
5491
+ * ("Btrfs: fix data corruption due to cloning of eof
5492
+ * block).
5493
+ *
5494
+ * So issue a clone of the aligned down range plus a
5495
+ * regular write for the eof block, if we hit that case.
5496
+ *
5497
+ * Also, we use the maximum possible sector size, 64K,
5498
+ * because we don't know what's the sector size of the
5499
+ * filesystem that receives the stream, so we have to
5500
+ * assume the largest possible sector size.
5501
+ */
5502
+ if (src_end == clone_src_i_size &&
5503
+ !IS_ALIGNED(src_end, sectorsize) &&
5504
+ offset + clone_len < sctx->cur_inode_size) {
5505
+ u64 slen;
5506
+
5507
+ slen = ALIGN_DOWN(src_end - clone_root->offset,
5508
+ sectorsize);
5509
+ if (slen > 0) {
5510
+ ret = send_clone(sctx, offset, slen,
5511
+ clone_root);
5512
+ if (ret < 0)
5513
+ goto out;
5514
+ }
5515
+ ret = send_extent_data(sctx, offset + slen,
5516
+ clone_len - slen);
5517
+ } else {
5518
+ ret = send_clone(sctx, offset, clone_len,
5519
+ clone_root);
5520
+ }
5521
+ } else if (crossed_src_i_size && clone_len < len) {
5522
+ /*
5523
+ * If we are at i_size of the clone source inode and we
5524
+ * can not clone from it, terminate the loop. This is
5525
+ * to avoid sending two write operations, one with a
5526
+ * length matching clone_len and the final one after
5527
+ * this loop with a length of len - clone_len.
5528
+ *
5529
+ * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
5530
+ * was passed to the send ioctl), this helps avoid
5531
+ * sending an encoded write for an offset that is not
5532
+ * sector size aligned, in case the i_size of the source
5533
+ * inode is not sector size aligned. That will make the
5534
+ * receiver fallback to decompression of the data and
5535
+ * writing it using regular buffered IO, therefore while
5536
+ * not incorrect, it's not optimal due decompression and
5537
+ * possible re-compression at the receiver.
5538
+ */
5539
+ break;
5540
+ } else {
54065541 ret = send_extent_data(sctx, offset, clone_len);
5542
+ }
54075543
54085544 if (ret < 0)
54095545 goto out;
....@@ -5413,6 +5549,21 @@
54135549 break;
54145550 offset += clone_len;
54155551 clone_root->offset += clone_len;
5552
+
5553
+ /*
5554
+ * If we are cloning from the file we are currently processing,
5555
+ * and using the send root as the clone root, we must stop once
5556
+ * the current clone offset reaches the current eof of the file
5557
+ * at the receiver, otherwise we would issue an invalid clone
5558
+ * operation (source range going beyond eof) and cause the
5559
+ * receiver to fail. So if we reach the current eof, bail out
5560
+ * and fallback to a regular write.
5561
+ */
5562
+ if (clone_root->root == sctx->send_root &&
5563
+ clone_root->ino == sctx->cur_ino &&
5564
+ clone_root->offset >= sctx->cur_inode_next_write_offset)
5565
+ break;
5566
+
54165567 data_offset += clone_len;
54175568 next:
54185569 path->slots[0]++;
....@@ -5433,51 +5584,29 @@
54335584 struct clone_root *clone_root)
54345585 {
54355586 int ret = 0;
5436
- struct btrfs_file_extent_item *ei;
54375587 u64 offset = key->offset;
5438
- u64 len;
5439
- u8 type;
5588
+ u64 end;
54405589 u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
54415590
5442
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
5443
- struct btrfs_file_extent_item);
5444
- type = btrfs_file_extent_type(path->nodes[0], ei);
5445
- if (type == BTRFS_FILE_EXTENT_INLINE) {
5446
- len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
5447
- /*
5448
- * it is possible the inline item won't cover the whole page,
5449
- * but there may be items after this page. Make
5450
- * sure to send the whole thing
5451
- */
5452
- len = PAGE_ALIGN(len);
5453
- } else {
5454
- len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
5455
- }
5591
+ end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
5592
+ if (offset >= end)
5593
+ return 0;
54565594
5457
- if (offset >= sctx->cur_inode_size) {
5458
- ret = 0;
5459
- goto out;
5460
- }
5461
- if (offset + len > sctx->cur_inode_size)
5462
- len = sctx->cur_inode_size - offset;
5463
- if (len == 0) {
5464
- ret = 0;
5465
- goto out;
5466
- }
5467
-
5468
- if (clone_root && IS_ALIGNED(offset + len, bs)) {
5595
+ if (clone_root && IS_ALIGNED(end, bs)) {
5596
+ struct btrfs_file_extent_item *ei;
54695597 u64 disk_byte;
54705598 u64 data_offset;
54715599
5600
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
5601
+ struct btrfs_file_extent_item);
54725602 disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
54735603 data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
54745604 ret = clone_range(sctx, clone_root, disk_byte, data_offset,
5475
- offset, len);
5605
+ offset, end - offset);
54765606 } else {
5477
- ret = send_extent_data(sctx, offset, len);
5607
+ ret = send_extent_data(sctx, offset, end - offset);
54785608 }
5479
- sctx->cur_inode_next_write_offset = offset + len;
5480
-out:
5609
+ sctx->cur_inode_next_write_offset = end;
54815610 return ret;
54825611 }
54835612
....@@ -5675,10 +5804,7 @@
56755804 {
56765805 struct btrfs_path *path;
56775806 struct btrfs_root *root = sctx->send_root;
5678
- struct btrfs_file_extent_item *fi;
56795807 struct btrfs_key key;
5680
- u64 extent_end;
5681
- u8 type;
56825808 int ret;
56835809
56845810 path = alloc_path_for_send();
....@@ -5698,18 +5824,7 @@
56985824 if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
56995825 goto out;
57005826
5701
- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
5702
- struct btrfs_file_extent_item);
5703
- type = btrfs_file_extent_type(path->nodes[0], fi);
5704
- if (type == BTRFS_FILE_EXTENT_INLINE) {
5705
- u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi);
5706
- extent_end = ALIGN(key.offset + size,
5707
- sctx->send_root->fs_info->sectorsize);
5708
- } else {
5709
- extent_end = key.offset +
5710
- btrfs_file_extent_num_bytes(path->nodes[0], fi);
5711
- }
5712
- sctx->cur_inode_last_extent = extent_end;
5827
+ sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
57135828 out:
57145829 btrfs_free_path(path);
57155830 return ret;
....@@ -5763,16 +5878,7 @@
57635878 break;
57645879
57655880 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5766
- if (btrfs_file_extent_type(leaf, fi) ==
5767
- BTRFS_FILE_EXTENT_INLINE) {
5768
- u64 size = btrfs_file_extent_ram_bytes(leaf, fi);
5769
-
5770
- extent_end = ALIGN(key.offset + size,
5771
- root->fs_info->sectorsize);
5772
- } else {
5773
- extent_end = key.offset +
5774
- btrfs_file_extent_num_bytes(leaf, fi);
5775
- }
5881
+ extent_end = btrfs_file_extent_end(path);
57765882 if (extent_end <= start)
57775883 goto next;
57785884 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
....@@ -5793,9 +5899,6 @@
57935899 static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
57945900 struct btrfs_key *key)
57955901 {
5796
- struct btrfs_file_extent_item *fi;
5797
- u64 extent_end;
5798
- u8 type;
57995902 int ret = 0;
58005903
58015904 if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
....@@ -5805,18 +5908,6 @@
58055908 ret = get_last_extent(sctx, key->offset - 1);
58065909 if (ret)
58075910 return ret;
5808
- }
5809
-
5810
- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
5811
- struct btrfs_file_extent_item);
5812
- type = btrfs_file_extent_type(path->nodes[0], fi);
5813
- if (type == BTRFS_FILE_EXTENT_INLINE) {
5814
- u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi);
5815
- extent_end = ALIGN(key->offset + size,
5816
- sctx->send_root->fs_info->sectorsize);
5817
- } else {
5818
- extent_end = key->offset +
5819
- btrfs_file_extent_num_bytes(path->nodes[0], fi);
58205911 }
58215912
58225913 if (path->slots[0] == 0 &&
....@@ -5844,7 +5935,7 @@
58445935 else
58455936 ret = 0;
58465937 }
5847
- sctx->cur_inode_last_extent = extent_end;
5938
+ sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
58485939 return ret;
58495940 }
58505941
....@@ -6654,6 +6745,365 @@
66546745 return ret;
66556746 }
66566747
6748
+static int tree_move_down(struct btrfs_path *path, int *level)
6749
+{
6750
+ struct extent_buffer *eb;
6751
+
6752
+ BUG_ON(*level == 0);
6753
+ eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]);
6754
+ if (IS_ERR(eb))
6755
+ return PTR_ERR(eb);
6756
+
6757
+ path->nodes[*level - 1] = eb;
6758
+ path->slots[*level - 1] = 0;
6759
+ (*level)--;
6760
+ return 0;
6761
+}
6762
+
6763
+static int tree_move_next_or_upnext(struct btrfs_path *path,
6764
+ int *level, int root_level)
6765
+{
6766
+ int ret = 0;
6767
+ int nritems;
6768
+ nritems = btrfs_header_nritems(path->nodes[*level]);
6769
+
6770
+ path->slots[*level]++;
6771
+
6772
+ while (path->slots[*level] >= nritems) {
6773
+ if (*level == root_level)
6774
+ return -1;
6775
+
6776
+ /* move upnext */
6777
+ path->slots[*level] = 0;
6778
+ free_extent_buffer(path->nodes[*level]);
6779
+ path->nodes[*level] = NULL;
6780
+ (*level)++;
6781
+ path->slots[*level]++;
6782
+
6783
+ nritems = btrfs_header_nritems(path->nodes[*level]);
6784
+ ret = 1;
6785
+ }
6786
+ return ret;
6787
+}
6788
+
6789
+/*
6790
+ * Returns 1 if it had to move up and next. 0 is returned if it moved only next
6791
+ * or down.
6792
+ */
6793
+static int tree_advance(struct btrfs_path *path,
6794
+ int *level, int root_level,
6795
+ int allow_down,
6796
+ struct btrfs_key *key)
6797
+{
6798
+ int ret;
6799
+
6800
+ if (*level == 0 || !allow_down) {
6801
+ ret = tree_move_next_or_upnext(path, level, root_level);
6802
+ } else {
6803
+ ret = tree_move_down(path, level);
6804
+ }
6805
+ if (ret >= 0) {
6806
+ if (*level == 0)
6807
+ btrfs_item_key_to_cpu(path->nodes[*level], key,
6808
+ path->slots[*level]);
6809
+ else
6810
+ btrfs_node_key_to_cpu(path->nodes[*level], key,
6811
+ path->slots[*level]);
6812
+ }
6813
+ return ret;
6814
+}
6815
+
6816
+static int tree_compare_item(struct btrfs_path *left_path,
6817
+ struct btrfs_path *right_path,
6818
+ char *tmp_buf)
6819
+{
6820
+ int cmp;
6821
+ int len1, len2;
6822
+ unsigned long off1, off2;
6823
+
6824
+ len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
6825
+ len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
6826
+ if (len1 != len2)
6827
+ return 1;
6828
+
6829
+ off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
6830
+ off2 = btrfs_item_ptr_offset(right_path->nodes[0],
6831
+ right_path->slots[0]);
6832
+
6833
+ read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
6834
+
6835
+ cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
6836
+ if (cmp)
6837
+ return 1;
6838
+ return 0;
6839
+}
6840
+
6841
+/*
6842
+ * This function compares two trees and calls the provided callback for
6843
+ * every changed/new/deleted item it finds.
6844
+ * If shared tree blocks are encountered, whole subtrees are skipped, making
6845
+ * the compare pretty fast on snapshotted subvolumes.
6846
+ *
6847
+ * This currently works on commit roots only. As commit roots are read only,
6848
+ * we don't do any locking. The commit roots are protected with transactions.
6849
+ * Transactions are ended and rejoined when a commit is tried in between.
6850
+ *
6851
+ * This function checks for modifications done to the trees while comparing.
6852
+ * If it detects a change, it aborts immediately.
6853
+ */
6854
+static int btrfs_compare_trees(struct btrfs_root *left_root,
6855
+ struct btrfs_root *right_root, void *ctx)
6856
+{
6857
+ struct btrfs_fs_info *fs_info = left_root->fs_info;
6858
+ int ret;
6859
+ int cmp;
6860
+ struct btrfs_path *left_path = NULL;
6861
+ struct btrfs_path *right_path = NULL;
6862
+ struct btrfs_key left_key;
6863
+ struct btrfs_key right_key;
6864
+ char *tmp_buf = NULL;
6865
+ int left_root_level;
6866
+ int right_root_level;
6867
+ int left_level;
6868
+ int right_level;
6869
+ int left_end_reached;
6870
+ int right_end_reached;
6871
+ int advance_left;
6872
+ int advance_right;
6873
+ u64 left_blockptr;
6874
+ u64 right_blockptr;
6875
+ u64 left_gen;
6876
+ u64 right_gen;
6877
+
6878
+ left_path = btrfs_alloc_path();
6879
+ if (!left_path) {
6880
+ ret = -ENOMEM;
6881
+ goto out;
6882
+ }
6883
+ right_path = btrfs_alloc_path();
6884
+ if (!right_path) {
6885
+ ret = -ENOMEM;
6886
+ goto out;
6887
+ }
6888
+
6889
+ tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
6890
+ if (!tmp_buf) {
6891
+ ret = -ENOMEM;
6892
+ goto out;
6893
+ }
6894
+
6895
+ left_path->search_commit_root = 1;
6896
+ left_path->skip_locking = 1;
6897
+ right_path->search_commit_root = 1;
6898
+ right_path->skip_locking = 1;
6899
+
6900
+ /*
6901
+ * Strategy: Go to the first items of both trees. Then do
6902
+ *
6903
+ * If both trees are at level 0
6904
+ * Compare keys of current items
6905
+ * If left < right treat left item as new, advance left tree
6906
+ * and repeat
6907
+ * If left > right treat right item as deleted, advance right tree
6908
+ * and repeat
6909
+ * If left == right do deep compare of items, treat as changed if
6910
+ * needed, advance both trees and repeat
6911
+ * If both trees are at the same level but not at level 0
6912
+ * Compare keys of current nodes/leafs
6913
+ * If left < right advance left tree and repeat
6914
+ * If left > right advance right tree and repeat
6915
+ * If left == right compare blockptrs of the next nodes/leafs
6916
+ * If they match advance both trees but stay at the same level
6917
+ * and repeat
6918
+ * If they don't match advance both trees while allowing to go
6919
+ * deeper and repeat
6920
+ * If tree levels are different
6921
+ * Advance the tree that needs it and repeat
6922
+ *
6923
+ * Advancing a tree means:
6924
+ * If we are at level 0, try to go to the next slot. If that's not
6925
+ * possible, go one level up and repeat. Stop when we found a level
6926
+ * where we could go to the next slot. We may at this point be on a
6927
+ * node or a leaf.
6928
+ *
6929
+ * If we are not at level 0 and not on shared tree blocks, go one
6930
+ * level deeper.
6931
+ *
6932
+ * If we are not at level 0 and on shared tree blocks, go one slot to
6933
+ * the right if possible or go up and right.
6934
+ */
6935
+
6936
+ down_read(&fs_info->commit_root_sem);
6937
+ left_level = btrfs_header_level(left_root->commit_root);
6938
+ left_root_level = left_level;
6939
+ left_path->nodes[left_level] =
6940
+ btrfs_clone_extent_buffer(left_root->commit_root);
6941
+ if (!left_path->nodes[left_level]) {
6942
+ up_read(&fs_info->commit_root_sem);
6943
+ ret = -ENOMEM;
6944
+ goto out;
6945
+ }
6946
+
6947
+ right_level = btrfs_header_level(right_root->commit_root);
6948
+ right_root_level = right_level;
6949
+ right_path->nodes[right_level] =
6950
+ btrfs_clone_extent_buffer(right_root->commit_root);
6951
+ if (!right_path->nodes[right_level]) {
6952
+ up_read(&fs_info->commit_root_sem);
6953
+ ret = -ENOMEM;
6954
+ goto out;
6955
+ }
6956
+ up_read(&fs_info->commit_root_sem);
6957
+
6958
+ if (left_level == 0)
6959
+ btrfs_item_key_to_cpu(left_path->nodes[left_level],
6960
+ &left_key, left_path->slots[left_level]);
6961
+ else
6962
+ btrfs_node_key_to_cpu(left_path->nodes[left_level],
6963
+ &left_key, left_path->slots[left_level]);
6964
+ if (right_level == 0)
6965
+ btrfs_item_key_to_cpu(right_path->nodes[right_level],
6966
+ &right_key, right_path->slots[right_level]);
6967
+ else
6968
+ btrfs_node_key_to_cpu(right_path->nodes[right_level],
6969
+ &right_key, right_path->slots[right_level]);
6970
+
6971
+ left_end_reached = right_end_reached = 0;
6972
+ advance_left = advance_right = 0;
6973
+
6974
+ while (1) {
6975
+ cond_resched();
6976
+ if (advance_left && !left_end_reached) {
6977
+ ret = tree_advance(left_path, &left_level,
6978
+ left_root_level,
6979
+ advance_left != ADVANCE_ONLY_NEXT,
6980
+ &left_key);
6981
+ if (ret == -1)
6982
+ left_end_reached = ADVANCE;
6983
+ else if (ret < 0)
6984
+ goto out;
6985
+ advance_left = 0;
6986
+ }
6987
+ if (advance_right && !right_end_reached) {
6988
+ ret = tree_advance(right_path, &right_level,
6989
+ right_root_level,
6990
+ advance_right != ADVANCE_ONLY_NEXT,
6991
+ &right_key);
6992
+ if (ret == -1)
6993
+ right_end_reached = ADVANCE;
6994
+ else if (ret < 0)
6995
+ goto out;
6996
+ advance_right = 0;
6997
+ }
6998
+
6999
+ if (left_end_reached && right_end_reached) {
7000
+ ret = 0;
7001
+ goto out;
7002
+ } else if (left_end_reached) {
7003
+ if (right_level == 0) {
7004
+ ret = changed_cb(left_path, right_path,
7005
+ &right_key,
7006
+ BTRFS_COMPARE_TREE_DELETED,
7007
+ ctx);
7008
+ if (ret < 0)
7009
+ goto out;
7010
+ }
7011
+ advance_right = ADVANCE;
7012
+ continue;
7013
+ } else if (right_end_reached) {
7014
+ if (left_level == 0) {
7015
+ ret = changed_cb(left_path, right_path,
7016
+ &left_key,
7017
+ BTRFS_COMPARE_TREE_NEW,
7018
+ ctx);
7019
+ if (ret < 0)
7020
+ goto out;
7021
+ }
7022
+ advance_left = ADVANCE;
7023
+ continue;
7024
+ }
7025
+
7026
+ if (left_level == 0 && right_level == 0) {
7027
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
7028
+ if (cmp < 0) {
7029
+ ret = changed_cb(left_path, right_path,
7030
+ &left_key,
7031
+ BTRFS_COMPARE_TREE_NEW,
7032
+ ctx);
7033
+ if (ret < 0)
7034
+ goto out;
7035
+ advance_left = ADVANCE;
7036
+ } else if (cmp > 0) {
7037
+ ret = changed_cb(left_path, right_path,
7038
+ &right_key,
7039
+ BTRFS_COMPARE_TREE_DELETED,
7040
+ ctx);
7041
+ if (ret < 0)
7042
+ goto out;
7043
+ advance_right = ADVANCE;
7044
+ } else {
7045
+ enum btrfs_compare_tree_result result;
7046
+
7047
+ WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
7048
+ ret = tree_compare_item(left_path, right_path,
7049
+ tmp_buf);
7050
+ if (ret)
7051
+ result = BTRFS_COMPARE_TREE_CHANGED;
7052
+ else
7053
+ result = BTRFS_COMPARE_TREE_SAME;
7054
+ ret = changed_cb(left_path, right_path,
7055
+ &left_key, result, ctx);
7056
+ if (ret < 0)
7057
+ goto out;
7058
+ advance_left = ADVANCE;
7059
+ advance_right = ADVANCE;
7060
+ }
7061
+ } else if (left_level == right_level) {
7062
+ cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
7063
+ if (cmp < 0) {
7064
+ advance_left = ADVANCE;
7065
+ } else if (cmp > 0) {
7066
+ advance_right = ADVANCE;
7067
+ } else {
7068
+ left_blockptr = btrfs_node_blockptr(
7069
+ left_path->nodes[left_level],
7070
+ left_path->slots[left_level]);
7071
+ right_blockptr = btrfs_node_blockptr(
7072
+ right_path->nodes[right_level],
7073
+ right_path->slots[right_level]);
7074
+ left_gen = btrfs_node_ptr_generation(
7075
+ left_path->nodes[left_level],
7076
+ left_path->slots[left_level]);
7077
+ right_gen = btrfs_node_ptr_generation(
7078
+ right_path->nodes[right_level],
7079
+ right_path->slots[right_level]);
7080
+ if (left_blockptr == right_blockptr &&
7081
+ left_gen == right_gen) {
7082
+ /*
7083
+ * As we're on a shared block, don't
7084
+ * allow to go deeper.
7085
+ */
7086
+ advance_left = ADVANCE_ONLY_NEXT;
7087
+ advance_right = ADVANCE_ONLY_NEXT;
7088
+ } else {
7089
+ advance_left = ADVANCE;
7090
+ advance_right = ADVANCE;
7091
+ }
7092
+ }
7093
+ } else if (left_level < right_level) {
7094
+ advance_right = ADVANCE;
7095
+ } else {
7096
+ advance_left = ADVANCE;
7097
+ }
7098
+ }
7099
+
7100
+out:
7101
+ btrfs_free_path(left_path);
7102
+ btrfs_free_path(right_path);
7103
+ kvfree(tmp_buf);
7104
+ return ret;
7105
+}
7106
+
66577107 static int send_subvol(struct send_ctx *sctx)
66587108 {
66597109 int ret;
....@@ -6669,8 +7119,7 @@
66697119 goto out;
66707120
66717121 if (sctx->parent_root) {
6672
- ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
6673
- changed_cb, sctx);
7122
+ ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
66747123 if (ret < 0)
66757124 goto out;
66767125 ret = finish_inode_if_needed(sctx, 1);
....@@ -6779,20 +7228,25 @@
67797228 spin_unlock(&root->root_item_lock);
67807229 }
67817230
7231
+static void dedupe_in_progress_warn(const struct btrfs_root *root)
7232
+{
7233
+ btrfs_warn_rl(root->fs_info,
7234
+"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
7235
+ root->root_key.objectid, root->dedupe_in_progress);
7236
+}
7237
+
67827238 long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
67837239 {
67847240 int ret = 0;
67857241 struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root;
67867242 struct btrfs_fs_info *fs_info = send_root->fs_info;
67877243 struct btrfs_root *clone_root;
6788
- struct btrfs_key key;
67897244 struct send_ctx *sctx = NULL;
67907245 u32 i;
67917246 u64 *clone_sources_tmp = NULL;
67927247 int clone_sources_to_rollback = 0;
6793
- unsigned alloc_size;
7248
+ size_t alloc_size;
67947249 int sort_clone_roots = 0;
6795
- int index;
67967250
67977251 if (!capable(CAP_SYS_ADMIN))
67987252 return -EPERM;
....@@ -6802,6 +7256,11 @@
68027256 * making it RW. This also protects against deletion.
68037257 */
68047258 spin_lock(&send_root->root_item_lock);
7259
+ if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
7260
+ dedupe_in_progress_warn(send_root);
7261
+ spin_unlock(&send_root->root_item_lock);
7262
+ return -EAGAIN;
7263
+ }
68057264 send_root->send_in_progress++;
68067265 spin_unlock(&send_root->root_item_lock);
68077266
....@@ -6817,18 +7276,11 @@
68177276 /*
68187277 * Check that we don't overflow at later allocations, we request
68197278 * clone_sources_count + 1 items, and compare to unsigned long inside
6820
- * access_ok.
7279
+ * access_ok. Also set an upper limit for allocation size so this can't
7280
+ * easily exhaust memory. Max number of clone sources is about 200K.
68217281 */
6822
- if (arg->clone_sources_count >
6823
- ULONG_MAX / sizeof(struct clone_root) - 1) {
7282
+ if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
68247283 ret = -EINVAL;
6825
- goto out;
6826
- }
6827
-
6828
- if (!access_ok(VERIFY_READ, arg->clone_sources,
6829
- sizeof(*arg->clone_sources) *
6830
- arg->clone_sources_count)) {
6831
- ret = -EFAULT;
68327284 goto out;
68337285 }
68347286
....@@ -6875,25 +7327,20 @@
68757327 goto out;
68767328 }
68777329
6878
- sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL);
6879
- if (!sctx->read_buf) {
6880
- ret = -ENOMEM;
6881
- goto out;
6882
- }
6883
-
68847330 sctx->pending_dir_moves = RB_ROOT;
68857331 sctx->waiting_dir_moves = RB_ROOT;
68867332 sctx->orphan_dirs = RB_ROOT;
68877333
6888
- alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
6889
-
6890
- sctx->clone_roots = kvzalloc(alloc_size, GFP_KERNEL);
7334
+ sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
7335
+ arg->clone_sources_count + 1,
7336
+ GFP_KERNEL);
68917337 if (!sctx->clone_roots) {
68927338 ret = -ENOMEM;
68937339 goto out;
68947340 }
68957341
6896
- alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
7342
+ alloc_size = array_size(sizeof(*arg->clone_sources),
7343
+ arg->clone_sources_count);
68977344
68987345 if (arg->clone_sources_count) {
68997346 clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
....@@ -6910,15 +7357,9 @@
69107357 }
69117358
69127359 for (i = 0; i < arg->clone_sources_count; i++) {
6913
- key.objectid = clone_sources_tmp[i];
6914
- key.type = BTRFS_ROOT_ITEM_KEY;
6915
- key.offset = (u64)-1;
6916
-
6917
- index = srcu_read_lock(&fs_info->subvol_srcu);
6918
-
6919
- clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
7360
+ clone_root = btrfs_get_fs_root(fs_info,
7361
+ clone_sources_tmp[i], true);
69207362 if (IS_ERR(clone_root)) {
6921
- srcu_read_unlock(&fs_info->subvol_srcu, index);
69227363 ret = PTR_ERR(clone_root);
69237364 goto out;
69247365 }
....@@ -6926,13 +7367,19 @@
69267367 if (!btrfs_root_readonly(clone_root) ||
69277368 btrfs_root_dead(clone_root)) {
69287369 spin_unlock(&clone_root->root_item_lock);
6929
- srcu_read_unlock(&fs_info->subvol_srcu, index);
7370
+ btrfs_put_root(clone_root);
69307371 ret = -EPERM;
7372
+ goto out;
7373
+ }
7374
+ if (clone_root->dedupe_in_progress) {
7375
+ dedupe_in_progress_warn(clone_root);
7376
+ spin_unlock(&clone_root->root_item_lock);
7377
+ btrfs_put_root(clone_root);
7378
+ ret = -EAGAIN;
69317379 goto out;
69327380 }
69337381 clone_root->send_in_progress++;
69347382 spin_unlock(&clone_root->root_item_lock);
6935
- srcu_read_unlock(&fs_info->subvol_srcu, index);
69367383
69377384 sctx->clone_roots[i].root = clone_root;
69387385 clone_sources_to_rollback = i + 1;
....@@ -6942,15 +7389,9 @@
69427389 }
69437390
69447391 if (arg->parent_root) {
6945
- key.objectid = arg->parent_root;
6946
- key.type = BTRFS_ROOT_ITEM_KEY;
6947
- key.offset = (u64)-1;
6948
-
6949
- index = srcu_read_lock(&fs_info->subvol_srcu);
6950
-
6951
- sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
7392
+ sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root,
7393
+ true);
69527394 if (IS_ERR(sctx->parent_root)) {
6953
- srcu_read_unlock(&fs_info->subvol_srcu, index);
69547395 ret = PTR_ERR(sctx->parent_root);
69557396 goto out;
69567397 }
....@@ -6960,13 +7401,16 @@
69607401 if (!btrfs_root_readonly(sctx->parent_root) ||
69617402 btrfs_root_dead(sctx->parent_root)) {
69627403 spin_unlock(&sctx->parent_root->root_item_lock);
6963
- srcu_read_unlock(&fs_info->subvol_srcu, index);
69647404 ret = -EPERM;
69657405 goto out;
69667406 }
7407
+ if (sctx->parent_root->dedupe_in_progress) {
7408
+ dedupe_in_progress_warn(sctx->parent_root);
7409
+ spin_unlock(&sctx->parent_root->root_item_lock);
7410
+ ret = -EAGAIN;
7411
+ goto out;
7412
+ }
69677413 spin_unlock(&sctx->parent_root->root_item_lock);
6968
-
6969
- srcu_read_unlock(&fs_info->subvol_srcu, index);
69707414 }
69717415
69727416 /*
....@@ -6974,7 +7418,8 @@
69747418 * is behind the current send position. This is checked while searching
69757419 * for possible clone sources.
69767420 */
6977
- sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
7421
+ sctx->clone_roots[sctx->clone_roots_cnt++].root =
7422
+ btrfs_grab_root(sctx->send_root);
69787423
69797424 /* We do a bsearch later */
69807425 sort(sctx->clone_roots, sctx->clone_roots_cnt,
....@@ -6990,9 +7435,23 @@
69907435 if (ret)
69917436 goto out;
69927437
7438
+ mutex_lock(&fs_info->balance_mutex);
7439
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
7440
+ mutex_unlock(&fs_info->balance_mutex);
7441
+ btrfs_warn_rl(fs_info,
7442
+ "cannot run send because a balance operation is in progress");
7443
+ ret = -EAGAIN;
7444
+ goto out;
7445
+ }
7446
+ fs_info->send_in_progress++;
7447
+ mutex_unlock(&fs_info->balance_mutex);
7448
+
69937449 current->journal_info = BTRFS_SEND_TRANS_STUB;
69947450 ret = send_subvol(sctx);
69957451 current->journal_info = NULL;
7452
+ mutex_lock(&fs_info->balance_mutex);
7453
+ fs_info->send_in_progress--;
7454
+ mutex_unlock(&fs_info->balance_mutex);
69967455 if (ret < 0)
69977456 goto out;
69987457
....@@ -7045,18 +7504,24 @@
70457504 }
70467505
70477506 if (sort_clone_roots) {
7048
- for (i = 0; i < sctx->clone_roots_cnt; i++)
7507
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
70497508 btrfs_root_dec_send_in_progress(
70507509 sctx->clone_roots[i].root);
7510
+ btrfs_put_root(sctx->clone_roots[i].root);
7511
+ }
70517512 } else {
7052
- for (i = 0; sctx && i < clone_sources_to_rollback; i++)
7513
+ for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
70537514 btrfs_root_dec_send_in_progress(
70547515 sctx->clone_roots[i].root);
7516
+ btrfs_put_root(sctx->clone_roots[i].root);
7517
+ }
70557518
70567519 btrfs_root_dec_send_in_progress(send_root);
70577520 }
7058
- if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
7521
+ if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
70597522 btrfs_root_dec_send_in_progress(sctx->parent_root);
7523
+ btrfs_put_root(sctx->parent_root);
7524
+ }
70607525
70617526 kvfree(clone_sources_tmp);
70627527
....@@ -7066,7 +7531,6 @@
70667531
70677532 kvfree(sctx->clone_roots);
70687533 kvfree(sctx->send_buf);
7069
- kvfree(sctx->read_buf);
70707534
70717535 name_cache_free(sctx);
70727536