hc
2024-09-20 cf4ce59b3b70238352c7f1729f0f7223214828ad
kernel/fs/btrfs/tree-log.c
....@@ -8,6 +8,7 @@
88 #include <linux/blkdev.h>
99 #include <linux/list_sort.h>
1010 #include <linux/iversion.h>
11
+#include "misc.h"
1112 #include "ctree.h"
1213 #include "tree-log.h"
1314 #include "disk-io.h"
....@@ -17,6 +18,8 @@
1718 #include "compression.h"
1819 #include "qgroup.h"
1920 #include "inode-map.h"
21
+#include "block-group.h"
22
+#include "space-info.h"
2023
2124 /* magic values for the inode_only field in btrfs_log_inode:
2225 *
....@@ -24,9 +27,12 @@
2427 * LOG_INODE_EXISTS means to log just enough to recreate the inode
2528 * during log replay
2629 */
27
-#define LOG_INODE_ALL 0
28
-#define LOG_INODE_EXISTS 1
29
-#define LOG_OTHER_INODE 2
30
+enum {
31
+ LOG_INODE_ALL,
32
+ LOG_INODE_EXISTS,
33
+ LOG_OTHER_INODE,
34
+ LOG_OTHER_INODE_ALL,
35
+};
3036
3137 /*
3238 * directory trouble cases
....@@ -80,16 +86,16 @@
8086 * The last stage is to deal with directories and links and extents
8187 * and all the other fun semantics
8288 */
83
-#define LOG_WALK_PIN_ONLY 0
84
-#define LOG_WALK_REPLAY_INODES 1
85
-#define LOG_WALK_REPLAY_DIR_INDEX 2
86
-#define LOG_WALK_REPLAY_ALL 3
89
+enum {
90
+ LOG_WALK_PIN_ONLY,
91
+ LOG_WALK_REPLAY_INODES,
92
+ LOG_WALK_REPLAY_DIR_INDEX,
93
+ LOG_WALK_REPLAY_ALL,
94
+};
8795
8896 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
8997 struct btrfs_root *root, struct btrfs_inode *inode,
9098 int inode_only,
91
- const loff_t start,
92
- const loff_t end,
9399 struct btrfs_log_ctx *ctx);
94100 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
95101 struct btrfs_root *root,
....@@ -138,7 +144,7 @@
138144 mutex_lock(&root->log_mutex);
139145
140146 if (root->log_root) {
141
- if (btrfs_need_log_full_commit(fs_info, trans)) {
147
+ if (btrfs_need_log_full_commit(trans)) {
142148 ret = -EAGAIN;
143149 goto out;
144150 }
....@@ -161,13 +167,14 @@
161167 if (ret)
162168 goto out;
163169
170
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
164171 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
165172 root->log_start_pid = current->pid;
166173 }
167174
168175 atomic_inc(&root->log_batch);
169176 atomic_inc(&root->log_writers);
170
- if (ctx) {
177
+ if (ctx && !ctx->logging_new_name) {
171178 int index = root->log_transid % 2;
172179 list_add_tail(&ctx->list, &root->log_ctxs[index]);
173180 ctx->log_transid = root->log_transid;
....@@ -187,9 +194,8 @@
187194 {
188195 int ret = -ENOENT;
189196
190
- smp_mb();
191
- if (!root->log_root)
192
- return -ENOENT;
197
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
198
+ return ret;
193199
194200 mutex_lock(&root->log_mutex);
195201 if (root->log_root) {
....@@ -205,14 +211,9 @@
205211 * until you call btrfs_end_log_trans() or it makes any future
206212 * log transactions wait until you call btrfs_end_log_trans()
207213 */
208
-int btrfs_pin_log_trans(struct btrfs_root *root)
214
+void btrfs_pin_log_trans(struct btrfs_root *root)
209215 {
210
- int ret = -ENOENT;
211
-
212
- mutex_lock(&root->log_mutex);
213216 atomic_inc(&root->log_writers);
214
- mutex_unlock(&root->log_mutex);
215
- return ret;
216217 }
217218
218219 /*
....@@ -227,6 +228,17 @@
227228 }
228229 }
229230
231
+static int btrfs_write_tree_block(struct extent_buffer *buf)
232
+{
233
+ return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
234
+ buf->start + buf->len - 1);
235
+}
236
+
237
+static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
238
+{
239
+ filemap_fdatawait_range(buf->pages[0]->mapping,
240
+ buf->start, buf->start + buf->len - 1);
241
+}
230242
231243 /*
232244 * the walk control struct is used to pass state down the chain when
....@@ -301,12 +313,12 @@
301313 }
302314
303315 if (wc->pin)
304
- ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
316
+ ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
305317 eb->len);
306318
307319 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
308320 if (wc->pin && btrfs_header_level(eb) == 0)
309
- ret = btrfs_exclude_logged_extents(fs_info, eb);
321
+ ret = btrfs_exclude_logged_extents(eb);
310322 if (wc->write)
311323 btrfs_write_tree_block(eb);
312324 if (wc->wait)
....@@ -335,7 +347,6 @@
335347 struct extent_buffer *eb, int slot,
336348 struct btrfs_key *key)
337349 {
338
- struct btrfs_fs_info *fs_info = root->fs_info;
339350 int ret;
340351 u32 item_size;
341352 u64 saved_i_size = 0;
....@@ -456,10 +467,9 @@
456467 found_size = btrfs_item_size_nr(path->nodes[0],
457468 path->slots[0]);
458469 if (found_size > item_size)
459
- btrfs_truncate_item(fs_info, path, item_size, 1);
470
+ btrfs_truncate_item(path, item_size, 1);
460471 else if (found_size < item_size)
461
- btrfs_extend_item(fs_info, path,
462
- item_size - found_size);
472
+ btrfs_extend_item(path, item_size - found_size);
463473 } else if (ret) {
464474 return ret;
465475 }
....@@ -495,13 +505,8 @@
495505 */
496506 if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
497507 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
498
- ino_size != 0) {
499
- struct btrfs_map_token token;
500
-
501
- btrfs_init_map_token(&token);
502
- btrfs_set_token_inode_size(dst_eb, dst_item,
503
- ino_size, &token);
504
- }
508
+ ino_size != 0)
509
+ btrfs_set_inode_size(dst_eb, dst_item, ino_size);
505510 goto no_copy;
506511 }
507512
....@@ -545,13 +550,9 @@
545550 static noinline struct inode *read_one_inode(struct btrfs_root *root,
546551 u64 objectid)
547552 {
548
- struct btrfs_key key;
549553 struct inode *inode;
550554
551
- key.objectid = objectid;
552
- key.type = BTRFS_INODE_ITEM_KEY;
553
- key.offset = 0;
554
- inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
555
+ inode = btrfs_iget(root->fs_info->sb, objectid, root);
555556 if (IS_ERR(inode))
556557 inode = NULL;
557558 return inode;
....@@ -696,20 +697,27 @@
696697 goto out;
697698
698699 if (ins.objectid > 0) {
700
+ struct btrfs_ref ref = { 0 };
699701 u64 csum_start;
700702 u64 csum_end;
701703 LIST_HEAD(ordered_sums);
704
+
702705 /*
703706 * is this extent already allocated in the extent
704707 * allocation tree? If so, just add a reference
705708 */
706709 ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
707710 ins.offset);
708
- if (ret == 0) {
709
- ret = btrfs_inc_extent_ref(trans, root,
710
- ins.objectid, ins.offset,
711
- 0, root->root_key.objectid,
711
+ if (ret < 0) {
712
+ goto out;
713
+ } else if (ret == 0) {
714
+ btrfs_init_generic_ref(&ref,
715
+ BTRFS_ADD_DELAYED_REF,
716
+ ins.objectid, ins.offset, 0);
717
+ btrfs_init_data_ref(&ref,
718
+ root->root_key.objectid,
712719 key->objectid, offset);
720
+ ret = btrfs_inc_extent_ref(trans, &ref);
713721 if (ret)
714722 goto out;
715723 } else {
....@@ -816,6 +824,11 @@
816824 if (ret)
817825 goto out;
818826 }
827
+
828
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
829
+ extent_end - start);
830
+ if (ret)
831
+ goto out;
819832
820833 inode_add_bytes(inode, nbytes);
821834 update_inode:
....@@ -941,54 +954,32 @@
941954 const char *name, int namelen)
942955 {
943956 struct btrfs_path *path;
944
- struct btrfs_inode_ref *ref;
945
- unsigned long ptr;
946
- unsigned long ptr_end;
947
- unsigned long name_ptr;
948
- int found_name_len;
949
- int item_size;
950957 int ret;
951
- int match = 0;
952958
953959 path = btrfs_alloc_path();
954960 if (!path)
955961 return -ENOMEM;
956962
957963 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
958
- if (ret != 0)
964
+ if (ret < 0) {
959965 goto out;
966
+ } else if (ret == 1) {
967
+ ret = 0;
968
+ goto out;
969
+ }
960970
961
- ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
962
-
963
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
964
- if (btrfs_find_name_in_ext_backref(path->nodes[0],
971
+ if (key->type == BTRFS_INODE_EXTREF_KEY)
972
+ ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
973
+ path->slots[0],
974
+ ref_objectid,
975
+ name, namelen);
976
+ else
977
+ ret = !!btrfs_find_name_in_backref(path->nodes[0],
965978 path->slots[0],
966
- ref_objectid,
967
- name, namelen, NULL))
968
- match = 1;
969
-
970
- goto out;
971
- }
972
-
973
- item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
974
- ptr_end = ptr + item_size;
975
- while (ptr < ptr_end) {
976
- ref = (struct btrfs_inode_ref *)ptr;
977
- found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
978
- if (found_name_len == namelen) {
979
- name_ptr = (unsigned long)(ref + 1);
980
- ret = memcmp_extent_buffer(path->nodes[0], name,
981
- name_ptr, namelen);
982
- if (ret == 0) {
983
- match = 1;
984
- goto out;
985
- }
986
- }
987
- ptr = (unsigned long)(ref + 1) + found_name_len;
988
- }
979
+ name, namelen);
989980 out:
990981 btrfs_free_path(path);
991
- return match;
982
+ return ret;
992983 }
993984
994985 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
....@@ -1046,10 +1037,13 @@
10461037 (unsigned long)(victim_ref + 1),
10471038 victim_name_len);
10481039
1049
- if (!backref_in_log(log_root, &search_key,
1050
- parent_objectid,
1051
- victim_name,
1052
- victim_name_len)) {
1040
+ ret = backref_in_log(log_root, &search_key,
1041
+ parent_objectid, victim_name,
1042
+ victim_name_len);
1043
+ if (ret < 0) {
1044
+ kfree(victim_name);
1045
+ return ret;
1046
+ } else if (!ret) {
10531047 inc_nlink(&inode->vfs_inode);
10541048 btrfs_release_path(path);
10551049
....@@ -1081,7 +1075,9 @@
10811075 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
10821076 inode_objectid, parent_objectid, 0,
10831077 0);
1084
- if (!IS_ERR_OR_NULL(extref)) {
1078
+ if (IS_ERR(extref)) {
1079
+ return PTR_ERR(extref);
1080
+ } else if (extref) {
10851081 u32 item_size;
10861082 u32 cur_offset = 0;
10871083 unsigned long base;
....@@ -1111,10 +1107,13 @@
11111107 search_key.offset = btrfs_extref_hash(parent_objectid,
11121108 victim_name,
11131109 victim_name_len);
1114
- ret = 0;
1115
- if (!backref_in_log(log_root, &search_key,
1116
- parent_objectid, victim_name,
1117
- victim_name_len)) {
1110
+ ret = backref_in_log(log_root, &search_key,
1111
+ parent_objectid, victim_name,
1112
+ victim_name_len);
1113
+ if (ret < 0) {
1114
+ kfree(victim_name);
1115
+ return ret;
1116
+ } else if (!ret) {
11181117 ret = -ENOENT;
11191118 victim_parent = read_one_inode(root,
11201119 parent_objectid);
....@@ -1159,7 +1158,7 @@
11591158 }
11601159 btrfs_release_path(path);
11611160
1162
- /* look for a conflicing name */
1161
+ /* look for a conflicting name */
11631162 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
11641163 name, namelen, 0);
11651164 if (IS_ERR(di)) {
....@@ -1268,12 +1267,12 @@
12681267 goto out;
12691268
12701269 if (key->type == BTRFS_INODE_EXTREF_KEY)
1271
- ret = btrfs_find_name_in_ext_backref(log_eb, log_slot,
1272
- parent_id, name,
1273
- namelen, NULL);
1270
+ ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
1271
+ parent_id, name,
1272
+ namelen);
12741273 else
1275
- ret = btrfs_find_name_in_backref(log_eb, log_slot, name,
1276
- namelen, NULL);
1274
+ ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
1275
+ name, namelen);
12771276
12781277 if (!ret) {
12791278 struct inode *dir;
....@@ -1289,6 +1288,15 @@
12891288 inode, name, namelen);
12901289 kfree(name);
12911290 iput(dir);
1291
+ /*
1292
+ * Whenever we need to check if a name exists or not, we
1293
+ * check the subvolume tree. So after an unlink we must
1294
+ * run delayed items, so that future checks for a name
1295
+ * during log replay see that the name does not exists
1296
+ * anymore.
1297
+ */
1298
+ if (!ret)
1299
+ ret = btrfs_run_delayed_items(trans);
12921300 if (ret)
12931301 goto out;
12941302 goto again;
....@@ -1335,15 +1343,75 @@
13351343 goto out;
13361344 }
13371345 if (key.type == BTRFS_INODE_EXTREF_KEY)
1338
- ret = btrfs_find_name_in_ext_backref(path->nodes[0],
1339
- path->slots[0], parent_id,
1340
- name, namelen, NULL);
1346
+ ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1347
+ path->slots[0], parent_id, name, namelen);
13411348 else
1342
- ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
1343
- name, namelen, NULL);
1349
+ ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
1350
+ name, namelen);
13441351
13451352 out:
13461353 btrfs_free_path(path);
1354
+ return ret;
1355
+}
1356
+
1357
+static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1358
+ struct inode *dir, struct inode *inode, const char *name,
1359
+ int namelen, u64 ref_index)
1360
+{
1361
+ struct btrfs_dir_item *dir_item;
1362
+ struct btrfs_key key;
1363
+ struct btrfs_path *path;
1364
+ struct inode *other_inode = NULL;
1365
+ int ret;
1366
+
1367
+ path = btrfs_alloc_path();
1368
+ if (!path)
1369
+ return -ENOMEM;
1370
+
1371
+ dir_item = btrfs_lookup_dir_item(NULL, root, path,
1372
+ btrfs_ino(BTRFS_I(dir)),
1373
+ name, namelen, 0);
1374
+ if (!dir_item) {
1375
+ btrfs_release_path(path);
1376
+ goto add_link;
1377
+ } else if (IS_ERR(dir_item)) {
1378
+ ret = PTR_ERR(dir_item);
1379
+ goto out;
1380
+ }
1381
+
1382
+ /*
1383
+ * Our inode's dentry collides with the dentry of another inode which is
1384
+ * in the log but not yet processed since it has a higher inode number.
1385
+ * So delete that other dentry.
1386
+ */
1387
+ btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
1388
+ btrfs_release_path(path);
1389
+ other_inode = read_one_inode(root, key.objectid);
1390
+ if (!other_inode) {
1391
+ ret = -ENOENT;
1392
+ goto out;
1393
+ }
1394
+ ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
1395
+ name, namelen);
1396
+ if (ret)
1397
+ goto out;
1398
+ /*
1399
+ * If we dropped the link count to 0, bump it so that later the iput()
1400
+ * on the inode will not free it. We will fixup the link count later.
1401
+ */
1402
+ if (other_inode->i_nlink == 0)
1403
+ inc_nlink(other_inode);
1404
+
1405
+ ret = btrfs_run_delayed_items(trans);
1406
+ if (ret)
1407
+ goto out;
1408
+add_link:
1409
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
1410
+ name, namelen, 0, ref_index);
1411
+out:
1412
+ iput(other_inode);
1413
+ btrfs_free_path(path);
1414
+
13471415 return ret;
13481416 }
13491417
....@@ -1480,14 +1548,22 @@
14801548 */
14811549 if (!ret && inode->i_nlink == 0)
14821550 inc_nlink(inode);
1551
+ /*
1552
+ * Whenever we need to check if a name exists or
1553
+ * not, we check the subvolume tree. So after an
1554
+ * unlink we must run delayed items, so that future
1555
+ * checks for a name during log replay see that the
1556
+ * name does not exists anymore.
1557
+ */
1558
+ if (!ret)
1559
+ ret = btrfs_run_delayed_items(trans);
14831560 }
14841561 if (ret < 0)
14851562 goto out;
14861563
14871564 /* insert our name */
1488
- ret = btrfs_add_link(trans, BTRFS_I(dir),
1489
- BTRFS_I(inode),
1490
- name, namelen, 0, ref_index);
1565
+ ret = add_link(trans, root, dir, inode, name, namelen,
1566
+ ref_index);
14911567 if (ret)
14921568 goto out;
14931569
....@@ -1829,30 +1905,6 @@
18291905 }
18301906
18311907 /*
1832
- * Return true if an inode reference exists in the log for the given name,
1833
- * inode and parent inode.
1834
- */
1835
-static bool name_in_log_ref(struct btrfs_root *log_root,
1836
- const char *name, const int name_len,
1837
- const u64 dirid, const u64 ino)
1838
-{
1839
- struct btrfs_key search_key;
1840
-
1841
- search_key.objectid = ino;
1842
- search_key.type = BTRFS_INODE_REF_KEY;
1843
- search_key.offset = dirid;
1844
- if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1845
- return true;
1846
-
1847
- search_key.type = BTRFS_INODE_EXTREF_KEY;
1848
- search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1849
- if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1850
- return true;
1851
-
1852
- return false;
1853
-}
1854
-
1855
-/*
18561908 * take a single entry in a log directory item and replay it into
18571909 * the subvolume.
18581910 *
....@@ -1975,8 +2027,31 @@
19752027 return ret;
19762028
19772029 insert:
1978
- if (name_in_log_ref(root->log_root, name, name_len,
1979
- key->objectid, log_key.objectid)) {
2030
+ /*
2031
+ * Check if the inode reference exists in the log for the given name,
2032
+ * inode and parent inode
2033
+ */
2034
+ found_key.objectid = log_key.objectid;
2035
+ found_key.type = BTRFS_INODE_REF_KEY;
2036
+ found_key.offset = key->objectid;
2037
+ ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
2038
+ if (ret < 0) {
2039
+ goto out;
2040
+ } else if (ret) {
2041
+ /* The dentry will be added later. */
2042
+ ret = 0;
2043
+ update_size = false;
2044
+ goto out;
2045
+ }
2046
+
2047
+ found_key.objectid = log_key.objectid;
2048
+ found_key.type = BTRFS_INODE_EXTREF_KEY;
2049
+ found_key.offset = key->objectid;
2050
+ ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
2051
+ name_len);
2052
+ if (ret < 0) {
2053
+ goto out;
2054
+ } else if (ret) {
19802055 /* The dentry will be added later. */
19812056 ret = 0;
19822057 update_size = false;
....@@ -2629,29 +2704,45 @@
26292704 return ret;
26302705 }
26312706
2707
+/*
2708
+ * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2709
+ */
2710
+static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2711
+{
2712
+ struct btrfs_block_group *cache;
2713
+
2714
+ cache = btrfs_lookup_block_group(fs_info, start);
2715
+ if (!cache) {
2716
+ btrfs_err(fs_info, "unable to find block group for %llu", start);
2717
+ return;
2718
+ }
2719
+
2720
+ spin_lock(&cache->space_info->lock);
2721
+ spin_lock(&cache->lock);
2722
+ cache->reserved -= fs_info->nodesize;
2723
+ cache->space_info->bytes_reserved -= fs_info->nodesize;
2724
+ spin_unlock(&cache->lock);
2725
+ spin_unlock(&cache->space_info->lock);
2726
+
2727
+ btrfs_put_block_group(cache);
2728
+}
2729
+
26322730 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
26332731 struct btrfs_root *root,
26342732 struct btrfs_path *path, int *level,
26352733 struct walk_control *wc)
26362734 {
26372735 struct btrfs_fs_info *fs_info = root->fs_info;
2638
- u64 root_owner;
26392736 u64 bytenr;
26402737 u64 ptr_gen;
26412738 struct extent_buffer *next;
26422739 struct extent_buffer *cur;
2643
- struct extent_buffer *parent;
26442740 u32 blocksize;
26452741 int ret = 0;
2646
-
2647
- WARN_ON(*level < 0);
2648
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
26492742
26502743 while (*level > 0) {
26512744 struct btrfs_key first_key;
26522745
2653
- WARN_ON(*level < 0);
2654
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
26552746 cur = path->nodes[*level];
26562747
26572748 WARN_ON(btrfs_header_level(cur) != *level);
....@@ -2664,9 +2755,6 @@
26642755 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
26652756 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
26662757 blocksize = fs_info->nodesize;
2667
-
2668
- parent = path->nodes[*level];
2669
- root_owner = btrfs_header_owner(parent);
26702758
26712759 next = btrfs_find_create_tree_block(fs_info, bytenr);
26722760 if (IS_ERR(next))
....@@ -2691,23 +2779,20 @@
26912779
26922780 if (trans) {
26932781 btrfs_tree_lock(next);
2694
- btrfs_set_lock_blocking(next);
2695
- clean_tree_block(fs_info, next);
2782
+ btrfs_set_lock_blocking_write(next);
2783
+ btrfs_clean_tree_block(next);
26962784 btrfs_wait_tree_block_writeback(next);
26972785 btrfs_tree_unlock(next);
2786
+ ret = btrfs_pin_reserved_extent(trans,
2787
+ bytenr, blocksize);
2788
+ if (ret) {
2789
+ free_extent_buffer(next);
2790
+ return ret;
2791
+ }
26982792 } else {
26992793 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
27002794 clear_extent_buffer_dirty(next);
2701
- }
2702
-
2703
- WARN_ON(root_owner !=
2704
- BTRFS_TREE_LOG_OBJECTID);
2705
- ret = btrfs_free_and_pin_reserved_extent(
2706
- fs_info, bytenr,
2707
- blocksize);
2708
- if (ret) {
2709
- free_extent_buffer(next);
2710
- return ret;
2795
+ unaccount_log_buffer(fs_info, bytenr);
27112796 }
27122797 }
27132798 free_extent_buffer(next);
....@@ -2719,7 +2804,6 @@
27192804 return ret;
27202805 }
27212806
2722
- WARN_ON(*level <= 0);
27232807 if (path->nodes[*level-1])
27242808 free_extent_buffer(path->nodes[*level-1]);
27252809 path->nodes[*level-1] = next;
....@@ -2727,9 +2811,6 @@
27272811 path->slots[*level] = 0;
27282812 cond_resched();
27292813 }
2730
- WARN_ON(*level < 0);
2731
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
2732
-
27332814 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
27342815
27352816 cond_resched();
....@@ -2742,7 +2823,6 @@
27422823 struct walk_control *wc)
27432824 {
27442825 struct btrfs_fs_info *fs_info = root->fs_info;
2745
- u64 root_owner;
27462826 int i;
27472827 int slot;
27482828 int ret;
....@@ -2755,13 +2835,6 @@
27552835 WARN_ON(*level == 0);
27562836 return 0;
27572837 } else {
2758
- struct extent_buffer *parent;
2759
- if (path->nodes[*level] == root->node)
2760
- parent = path->nodes[*level];
2761
- else
2762
- parent = path->nodes[*level + 1];
2763
-
2764
- root_owner = btrfs_header_owner(parent);
27652838 ret = wc->process_func(root, path->nodes[*level], wc,
27662839 btrfs_header_generation(path->nodes[*level]),
27672840 *level);
....@@ -2775,22 +2848,22 @@
27752848
27762849 if (trans) {
27772850 btrfs_tree_lock(next);
2778
- btrfs_set_lock_blocking(next);
2779
- clean_tree_block(fs_info, next);
2851
+ btrfs_set_lock_blocking_write(next);
2852
+ btrfs_clean_tree_block(next);
27802853 btrfs_wait_tree_block_writeback(next);
27812854 btrfs_tree_unlock(next);
2855
+ ret = btrfs_pin_reserved_extent(trans,
2856
+ path->nodes[*level]->start,
2857
+ path->nodes[*level]->len);
2858
+ if (ret)
2859
+ return ret;
27822860 } else {
27832861 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
27842862 clear_extent_buffer_dirty(next);
2785
- }
27862863
2787
- WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2788
- ret = btrfs_free_and_pin_reserved_extent(
2789
- fs_info,
2790
- path->nodes[*level]->start,
2791
- path->nodes[*level]->len);
2792
- if (ret)
2793
- return ret;
2864
+ unaccount_log_buffer(fs_info,
2865
+ path->nodes[*level]->start);
2866
+ }
27942867 }
27952868 free_extent_buffer(path->nodes[*level]);
27962869 path->nodes[*level] = NULL;
....@@ -2822,7 +2895,7 @@
28222895 level = btrfs_header_level(log->node);
28232896 orig_level = level;
28242897 path->nodes[level] = log->node;
2825
- extent_buffer_get(log->node);
2898
+ atomic_inc(&log->node->refs);
28262899 path->slots[level] = 0;
28272900
28282901 while (1) {
....@@ -2857,21 +2930,19 @@
28572930
28582931 if (trans) {
28592932 btrfs_tree_lock(next);
2860
- btrfs_set_lock_blocking(next);
2861
- clean_tree_block(fs_info, next);
2933
+ btrfs_set_lock_blocking_write(next);
2934
+ btrfs_clean_tree_block(next);
28622935 btrfs_wait_tree_block_writeback(next);
28632936 btrfs_tree_unlock(next);
2937
+ ret = btrfs_pin_reserved_extent(trans,
2938
+ next->start, next->len);
2939
+ if (ret)
2940
+ goto out;
28642941 } else {
28652942 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
28662943 clear_extent_buffer_dirty(next);
2944
+ unaccount_log_buffer(fs_info, next->start);
28672945 }
2868
-
2869
- WARN_ON(log->root_key.objectid !=
2870
- BTRFS_TREE_LOG_OBJECTID);
2871
- ret = btrfs_free_and_pin_reserved_extent(fs_info,
2872
- next->start, next->len);
2873
- if (ret)
2874
- goto out;
28752946 }
28762947 }
28772948
....@@ -3035,7 +3106,7 @@
30353106 }
30363107
30373108 /* bail out if we need to do a full commit */
3038
- if (btrfs_need_log_full_commit(fs_info, trans)) {
3109
+ if (btrfs_need_log_full_commit(trans)) {
30393110 ret = -EAGAIN;
30403111 mutex_unlock(&root->log_mutex);
30413112 goto out;
....@@ -3054,7 +3125,7 @@
30543125 if (ret) {
30553126 blk_finish_plug(&plug);
30563127 btrfs_abort_transaction(trans, ret);
3057
- btrfs_set_log_full_commit(fs_info, trans);
3128
+ btrfs_set_log_full_commit(trans);
30583129 mutex_unlock(&root->log_mutex);
30593130 goto out;
30603131 }
....@@ -3088,16 +3159,10 @@
30883159 btrfs_init_log_ctx(&root_log_ctx, NULL);
30893160
30903161 mutex_lock(&log_root_tree->log_mutex);
3091
- atomic_inc(&log_root_tree->log_batch);
3092
- atomic_inc(&log_root_tree->log_writers);
30933162
30943163 index2 = log_root_tree->log_transid % 2;
30953164 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
30963165 root_log_ctx.log_transid = log_root_tree->log_transid;
3097
-
3098
- mutex_unlock(&log_root_tree->log_mutex);
3099
-
3100
- mutex_lock(&log_root_tree->log_mutex);
31013166
31023167 /*
31033168 * Now we are safe to update the log_root_tree because we're under the
....@@ -3105,18 +3170,12 @@
31053170 * open until we drop the log_mutex.
31063171 */
31073172 ret = update_log_root(trans, log, &new_root_item);
3108
-
3109
- if (atomic_dec_and_test(&log_root_tree->log_writers)) {
3110
- /* atomic_dec_and_test implies a barrier */
3111
- cond_wake_up_nomb(&log_root_tree->log_writer_wait);
3112
- }
3113
-
31143173 if (ret) {
31153174 if (!list_empty(&root_log_ctx.list))
31163175 list_del_init(&root_log_ctx.list);
31173176
31183177 blk_finish_plug(&plug);
3119
- btrfs_set_log_full_commit(fs_info, trans);
3178
+ btrfs_set_log_full_commit(trans);
31203179
31213180 if (ret != -ENOSPC) {
31223181 btrfs_abort_transaction(trans, ret);
....@@ -3156,13 +3215,11 @@
31563215 root_log_ctx.log_transid - 1);
31573216 }
31583217
3159
- wait_for_writer(log_root_tree);
3160
-
31613218 /*
31623219 * now that we've moved on to the tree of log tree roots,
31633220 * check the full commit flag again
31643221 */
3165
- if (btrfs_need_log_full_commit(fs_info, trans)) {
3222
+ if (btrfs_need_log_full_commit(trans)) {
31663223 blk_finish_plug(&plug);
31673224 btrfs_wait_tree_log_extents(log, mark);
31683225 mutex_unlock(&log_root_tree->log_mutex);
....@@ -3175,7 +3232,7 @@
31753232 EXTENT_DIRTY | EXTENT_NEW);
31763233 blk_finish_plug(&plug);
31773234 if (ret) {
3178
- btrfs_set_log_full_commit(fs_info, trans);
3235
+ btrfs_set_log_full_commit(trans);
31793236 btrfs_abort_transaction(trans, ret);
31803237 mutex_unlock(&log_root_tree->log_mutex);
31813238 goto out_wake_log_root;
....@@ -3185,7 +3242,7 @@
31853242 ret = btrfs_wait_tree_log_extents(log_root_tree,
31863243 EXTENT_NEW | EXTENT_DIRTY);
31873244 if (ret) {
3188
- btrfs_set_log_full_commit(fs_info, trans);
3245
+ btrfs_set_log_full_commit(trans);
31893246 mutex_unlock(&log_root_tree->log_mutex);
31903247 goto out_wake_log_root;
31913248 }
....@@ -3199,7 +3256,7 @@
31993256 mutex_unlock(&log_root_tree->log_mutex);
32003257
32013258 /*
3202
- * nobody else is going to jump in and write the the ctree
3259
+ * Nobody else is going to jump in and write the ctree
32033260 * super here because the log_commit atomic below is protecting
32043261 * us. We must be called with a transaction handle pinning
32053262 * the running transaction open, so a full commit can't hop
....@@ -3207,7 +3264,7 @@
32073264 */
32083265 ret = write_all_supers(fs_info, 1);
32093266 if (ret) {
3210
- btrfs_set_log_full_commit(fs_info, trans);
3267
+ btrfs_set_log_full_commit(trans);
32113268 btrfs_abort_transaction(trans, ret);
32123269 goto out_wake_log_root;
32133270 }
....@@ -3251,8 +3308,6 @@
32513308 struct btrfs_root *log)
32523309 {
32533310 int ret;
3254
- u64 start;
3255
- u64 end;
32563311 struct walk_control wc = {
32573312 .free = 1,
32583313 .process_func = process_one_buffer
....@@ -3266,20 +3321,10 @@
32663321 btrfs_handle_fs_error(log->fs_info, ret, NULL);
32673322 }
32683323
3269
- while (1) {
3270
- ret = find_first_extent_bit(&log->dirty_log_pages,
3271
- 0, &start, &end,
3272
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT,
3273
- NULL);
3274
- if (ret)
3275
- break;
3276
-
3277
- clear_extent_bits(&log->dirty_log_pages, start, end,
3278
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3279
- }
3280
-
3281
- free_extent_buffer(log->node);
3282
- kfree(log);
3324
+ clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3325
+ EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3326
+ extent_io_tree_release(&log->log_csum_range);
3327
+ btrfs_put_root(log);
32833328 }
32843329
32853330 /*
....@@ -3291,6 +3336,7 @@
32913336 if (root->log_root) {
32923337 free_log_tree(trans, root->log_root);
32933338 root->log_root = NULL;
3339
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
32943340 }
32953341 return 0;
32963342 }
....@@ -3447,7 +3493,7 @@
34473493 out_unlock:
34483494 mutex_unlock(&dir->log_mutex);
34493495 if (err == -ENOSPC) {
3450
- btrfs_set_log_full_commit(root->fs_info, trans);
3496
+ btrfs_set_log_full_commit(trans);
34513497 err = 0;
34523498 } else if (err < 0 && err != -ENOENT) {
34533499 /* ENOENT can be returned if the entry hasn't been fsynced yet */
....@@ -3465,7 +3511,6 @@
34653511 const char *name, int name_len,
34663512 struct btrfs_inode *inode, u64 dirid)
34673513 {
3468
- struct btrfs_fs_info *fs_info = root->fs_info;
34693514 struct btrfs_root *log;
34703515 u64 index;
34713516 int ret;
....@@ -3483,7 +3528,7 @@
34833528 dirid, &index);
34843529 mutex_unlock(&inode->log_mutex);
34853530 if (ret == -ENOSPC) {
3486
- btrfs_set_log_full_commit(fs_info, trans);
3531
+ btrfs_set_log_full_commit(trans);
34873532 ret = 0;
34883533 } else if (ret < 0 && ret != -ENOENT)
34893534 btrfs_abort_transaction(trans, ret);
....@@ -3807,8 +3852,9 @@
38073852
38083853 found_key.offset = 0;
38093854 found_key.type = 0;
3810
- ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3811
- &start_slot);
3855
+ ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
3856
+ if (ret < 0)
3857
+ break;
38123858
38133859 ret = btrfs_del_items(trans, log, path, start_slot,
38143860 path->slots[0] - start_slot + 1);
....@@ -3834,7 +3880,7 @@
38343880 {
38353881 struct btrfs_map_token token;
38363882
3837
- btrfs_init_map_token(&token);
3883
+ btrfs_init_map_token(&token, leaf);
38383884
38393885 if (log_inode_only) {
38403886 /* set the generation to zero so the recover code
....@@ -3842,44 +3888,41 @@
38423888 * just to say 'this inode exists' and a logging
38433889 * to say 'update this inode with these values'
38443890 */
3845
- btrfs_set_token_inode_generation(leaf, item, 0, &token);
3846
- btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3891
+ btrfs_set_token_inode_generation(&token, item, 0);
3892
+ btrfs_set_token_inode_size(&token, item, logged_isize);
38473893 } else {
3848
- btrfs_set_token_inode_generation(leaf, item,
3849
- BTRFS_I(inode)->generation,
3850
- &token);
3851
- btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3894
+ btrfs_set_token_inode_generation(&token, item,
3895
+ BTRFS_I(inode)->generation);
3896
+ btrfs_set_token_inode_size(&token, item, inode->i_size);
38523897 }
38533898
3854
- btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3855
- btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3856
- btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3857
- btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3899
+ btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3900
+ btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3901
+ btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3902
+ btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
38583903
3859
- btrfs_set_token_timespec_sec(leaf, &item->atime,
3860
- inode->i_atime.tv_sec, &token);
3861
- btrfs_set_token_timespec_nsec(leaf, &item->atime,
3862
- inode->i_atime.tv_nsec, &token);
3904
+ btrfs_set_token_timespec_sec(&token, &item->atime,
3905
+ inode->i_atime.tv_sec);
3906
+ btrfs_set_token_timespec_nsec(&token, &item->atime,
3907
+ inode->i_atime.tv_nsec);
38633908
3864
- btrfs_set_token_timespec_sec(leaf, &item->mtime,
3865
- inode->i_mtime.tv_sec, &token);
3866
- btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3867
- inode->i_mtime.tv_nsec, &token);
3909
+ btrfs_set_token_timespec_sec(&token, &item->mtime,
3910
+ inode->i_mtime.tv_sec);
3911
+ btrfs_set_token_timespec_nsec(&token, &item->mtime,
3912
+ inode->i_mtime.tv_nsec);
38683913
3869
- btrfs_set_token_timespec_sec(leaf, &item->ctime,
3870
- inode->i_ctime.tv_sec, &token);
3871
- btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3872
- inode->i_ctime.tv_nsec, &token);
3914
+ btrfs_set_token_timespec_sec(&token, &item->ctime,
3915
+ inode->i_ctime.tv_sec);
3916
+ btrfs_set_token_timespec_nsec(&token, &item->ctime,
3917
+ inode->i_ctime.tv_nsec);
38733918
3874
- btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3875
- &token);
3919
+ btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
38763920
3877
- btrfs_set_token_inode_sequence(leaf, item,
3878
- inode_peek_iversion(inode), &token);
3879
- btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3880
- btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3881
- btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3882
- btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3921
+ btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3922
+ btrfs_set_token_inode_transid(&token, item, trans->transid);
3923
+ btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3924
+ btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
3925
+ btrfs_set_token_inode_block_group(&token, item, 0);
38833926 }
38843927
38853928 static int log_inode_item(struct btrfs_trans_handle *trans,
....@@ -3902,11 +3945,32 @@
39023945 }
39033946
39043947 static int log_csums(struct btrfs_trans_handle *trans,
3948
+ struct btrfs_inode *inode,
39053949 struct btrfs_root *log_root,
39063950 struct btrfs_ordered_sum *sums)
39073951 {
3952
+ const u64 lock_end = sums->bytenr + sums->len - 1;
3953
+ struct extent_state *cached_state = NULL;
39083954 int ret;
39093955
3956
+ /*
3957
+ * If this inode was not used for reflink operations in the current
3958
+ * transaction with new extents, then do the fast path, no need to
3959
+ * worry about logging checksum items with overlapping ranges.
3960
+ */
3961
+ if (inode->last_reflink_trans < trans->transid)
3962
+ return btrfs_csum_file_blocks(trans, log_root, sums);
3963
+
3964
+ /*
3965
+ * Serialize logging for checksums. This is to avoid racing with the
3966
+ * same checksum being logged by another task that is logging another
3967
+ * file which happens to refer to the same extent as well. Such races
3968
+ * can leave checksum items in the log with overlapping ranges.
3969
+ */
3970
+ ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
3971
+ lock_end, &cached_state);
3972
+ if (ret)
3973
+ return ret;
39103974 /*
39113975 * Due to extent cloning, we might have logged a csum item that covers a
39123976 * subrange of a cloned extent, and later we can end up logging a csum
....@@ -3917,10 +3981,13 @@
39173981 * trim and adjust) any existing csum items in the log for this range.
39183982 */
39193983 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
3920
- if (ret)
3921
- return ret;
3984
+ if (!ret)
3985
+ ret = btrfs_csum_file_blocks(trans, log_root, sums);
39223986
3923
- return btrfs_csum_file_blocks(trans, log_root, sums);
3987
+ unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
3988
+ &cached_state);
3989
+
3990
+ return ret;
39243991 }
39253992
39263993 static noinline int copy_items(struct btrfs_trans_handle *trans,
....@@ -4041,7 +4108,7 @@
40414108 struct btrfs_ordered_sum,
40424109 list);
40434110 if (!ret)
4044
- ret = log_csums(trans, log, sums);
4111
+ ret = log_csums(trans, inode, log, sums);
40454112 list_del(&sums->list);
40464113 kfree(sums);
40474114 }
....@@ -4066,10 +4133,14 @@
40664133 static int log_extent_csums(struct btrfs_trans_handle *trans,
40674134 struct btrfs_inode *inode,
40684135 struct btrfs_root *log_root,
4069
- const struct extent_map *em)
4136
+ const struct extent_map *em,
4137
+ struct btrfs_log_ctx *ctx)
40704138 {
4139
+ struct btrfs_ordered_extent *ordered;
40714140 u64 csum_offset;
40724141 u64 csum_len;
4142
+ u64 mod_start = em->mod_start;
4143
+ u64 mod_len = em->mod_len;
40734144 LIST_HEAD(ordered_sums);
40744145 int ret = 0;
40754146
....@@ -4078,13 +4149,71 @@
40784149 em->block_start == EXTENT_MAP_HOLE)
40794150 return 0;
40804151
4152
+ list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4153
+ const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4154
+ const u64 mod_end = mod_start + mod_len;
4155
+ struct btrfs_ordered_sum *sums;
4156
+
4157
+ if (mod_len == 0)
4158
+ break;
4159
+
4160
+ if (ordered_end <= mod_start)
4161
+ continue;
4162
+ if (mod_end <= ordered->file_offset)
4163
+ break;
4164
+
4165
+ /*
4166
+ * We are going to copy all the csums on this ordered extent, so
4167
+ * go ahead and adjust mod_start and mod_len in case this ordered
4168
+ * extent has already been logged.
4169
+ */
4170
+ if (ordered->file_offset > mod_start) {
4171
+ if (ordered_end >= mod_end)
4172
+ mod_len = ordered->file_offset - mod_start;
4173
+ /*
4174
+ * If we have this case
4175
+ *
4176
+ * |--------- logged extent ---------|
4177
+ * |----- ordered extent ----|
4178
+ *
4179
+ * Just don't mess with mod_start and mod_len, we'll
4180
+ * just end up logging more csums than we need and it
4181
+ * will be ok.
4182
+ */
4183
+ } else {
4184
+ if (ordered_end < mod_end) {
4185
+ mod_len = mod_end - ordered_end;
4186
+ mod_start = ordered_end;
4187
+ } else {
4188
+ mod_len = 0;
4189
+ }
4190
+ }
4191
+
4192
+ /*
4193
+ * To keep us from looping for the above case of an ordered
4194
+ * extent that falls inside of the logged extent.
4195
+ */
4196
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4197
+ continue;
4198
+
4199
+ list_for_each_entry(sums, &ordered->list, list) {
4200
+ ret = log_csums(trans, inode, log_root, sums);
4201
+ if (ret)
4202
+ return ret;
4203
+ }
4204
+ }
4205
+
4206
+ /* We're done, found all csums in the ordered extents. */
4207
+ if (mod_len == 0)
4208
+ return 0;
4209
+
40814210 /* If we're compressed we have to save the entire range of csums. */
40824211 if (em->compress_type) {
40834212 csum_offset = 0;
40844213 csum_len = max(em->block_len, em->orig_block_len);
40854214 } else {
4086
- csum_offset = em->mod_start - em->start;
4087
- csum_len = em->mod_len;
4215
+ csum_offset = mod_start - em->start;
4216
+ csum_len = mod_len;
40884217 }
40894218
40904219 /* block start is already adjusted for the file extent offset. */
....@@ -4100,7 +4229,7 @@
41004229 struct btrfs_ordered_sum,
41014230 list);
41024231 if (!ret)
4103
- ret = log_csums(trans, log_root, sums);
4232
+ ret = log_csums(trans, inode, log_root, sums);
41044233 list_del(&sums->list);
41054234 kfree(sums);
41064235 }
....@@ -4124,13 +4253,11 @@
41244253 int ret;
41254254 int extent_inserted = 0;
41264255
4127
- ret = log_extent_csums(trans, inode, log, em);
4256
+ ret = log_extent_csums(trans, inode, log, em, ctx);
41284257 if (ret)
41294258 return ret;
41304259
4131
- btrfs_init_map_token(&token);
4132
-
4133
- ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
4260
+ ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
41344261 em->start + em->len, NULL, 0, 1,
41354262 sizeof(*fi), &extent_inserted);
41364263 if (ret)
....@@ -4147,46 +4274,39 @@
41474274 return ret;
41484275 }
41494276 leaf = path->nodes[0];
4277
+ btrfs_init_map_token(&token, leaf);
41504278 fi = btrfs_item_ptr(leaf, path->slots[0],
41514279 struct btrfs_file_extent_item);
41524280
4153
- btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
4154
- &token);
4281
+ btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
41554282 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4156
- btrfs_set_token_file_extent_type(leaf, fi,
4157
- BTRFS_FILE_EXTENT_PREALLOC,
4158
- &token);
4283
+ btrfs_set_token_file_extent_type(&token, fi,
4284
+ BTRFS_FILE_EXTENT_PREALLOC);
41594285 else
4160
- btrfs_set_token_file_extent_type(leaf, fi,
4161
- BTRFS_FILE_EXTENT_REG,
4162
- &token);
4286
+ btrfs_set_token_file_extent_type(&token, fi,
4287
+ BTRFS_FILE_EXTENT_REG);
41634288
41644289 block_len = max(em->block_len, em->orig_block_len);
41654290 if (em->compress_type != BTRFS_COMPRESS_NONE) {
4166
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4167
- em->block_start,
4168
- &token);
4169
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4170
- &token);
4291
+ btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4292
+ em->block_start);
4293
+ btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
41714294 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4172
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4295
+ btrfs_set_token_file_extent_disk_bytenr(&token, fi,
41734296 em->block_start -
4174
- extent_offset, &token);
4175
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4176
- &token);
4297
+ extent_offset);
4298
+ btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
41774299 } else {
4178
- btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
4179
- btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
4180
- &token);
4300
+ btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
4301
+ btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
41814302 }
41824303
4183
- btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
4184
- btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
4185
- btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
4186
- btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
4187
- &token);
4188
- btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
4189
- btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
4304
+ btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
4305
+ btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
4306
+ btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
4307
+ btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
4308
+ btrfs_set_token_file_extent_encryption(&token, fi, 0);
4309
+ btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
41904310 btrfs_mark_buffer_dirty(leaf);
41914311
41924312 btrfs_release_path(path);
....@@ -4196,7 +4316,7 @@
41964316
41974317 /*
41984318 * Log all prealloc extents beyond the inode's i_size to make sure we do not
4199
- * lose them after doing a fast fsync and replaying the log. We scan the
4319
+ * lose them after doing a full/fast fsync and replaying the log. We scan the
42004320 * subvolume's root instead of iterating the inode's extent map tree because
42014321 * otherwise we can log incorrect extent items based on extent map conversion.
42024322 * That can happen due to the fact that extent maps are merged when they
....@@ -4322,12 +4442,9 @@
43224442 }
43234443 }
43244444 }
4325
- if (ins_nr > 0) {
4445
+ if (ins_nr > 0)
43264446 ret = copy_items(trans, inode, dst_path, path,
43274447 start_slot, ins_nr, 1, 0);
4328
- if (ret > 0)
4329
- ret = 0;
4330
- }
43314448 out:
43324449 btrfs_release_path(path);
43334450 btrfs_free_path(dst_path);
....@@ -4338,14 +4455,13 @@
43384455 struct btrfs_root *root,
43394456 struct btrfs_inode *inode,
43404457 struct btrfs_path *path,
4341
- struct btrfs_log_ctx *ctx,
4342
- const u64 start,
4343
- const u64 end)
4458
+ struct btrfs_log_ctx *ctx)
43444459 {
4460
+ struct btrfs_ordered_extent *ordered;
4461
+ struct btrfs_ordered_extent *tmp;
43454462 struct extent_map *em, *n;
43464463 struct list_head extents;
43474464 struct extent_map_tree *tree = &inode->extent_tree;
4348
- u64 logged_start, logged_end;
43494465 u64 test_gen;
43504466 int ret = 0;
43514467 int num = 0;
....@@ -4354,27 +4470,8 @@
43544470
43554471 write_lock(&tree->lock);
43564472 test_gen = root->fs_info->last_trans_committed;
4357
- logged_start = start;
4358
- logged_end = end;
43594473
43604474 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4361
- /*
4362
- * Skip extents outside our logging range. It's important to do
4363
- * it for correctness because if we don't ignore them, we may
4364
- * log them before their ordered extent completes, and therefore
4365
- * we could log them without logging their respective checksums
4366
- * (the checksum items are added to the csum tree at the very
4367
- * end of btrfs_finish_ordered_io()). Also leave such extents
4368
- * outside of our range in the list, since we may have another
4369
- * ranged fsync in the near future that needs them. If an extent
4370
- * outside our range corresponds to a hole, log it to avoid
4371
- * leaving gaps between extents (fsck will complain when we are
4372
- * not using the NO_HOLES feature).
4373
- */
4374
- if ((em->start > end || em->start + em->len <= start) &&
4375
- em->block_start != EXTENT_MAP_HOLE)
4376
- continue;
4377
-
43784475 list_del_init(&em->list);
43794476 /*
43804477 * Just an arbitrary number, this can be really CPU intensive
....@@ -4395,11 +4492,6 @@
43954492 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
43964493 em->start >= i_size_read(&inode->vfs_inode))
43974494 continue;
4398
-
4399
- if (em->start < logged_start)
4400
- logged_start = em->start;
4401
- if ((em->start + em->len - 1) > logged_end)
4402
- logged_end = em->start + em->len - 1;
44034495
44044496 /* Need a ref to keep it from getting evicted from cache */
44054497 refcount_inc(&em->refs);
....@@ -4438,8 +4530,32 @@
44384530 btrfs_release_path(path);
44394531 if (!ret)
44404532 ret = btrfs_log_prealloc_extents(trans, inode, path);
4533
+ if (ret)
4534
+ return ret;
44414535
4442
- return ret;
4536
+ /*
4537
+ * We have logged all extents successfully, now make sure the commit of
4538
+ * the current transaction waits for the ordered extents to complete
4539
+ * before it commits and wipes out the log trees, otherwise we would
4540
+ * lose data if an ordered extent completes after the transaction
4541
+ * commits and a power failure happens after the transaction commit.
4542
+ */
4543
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4544
+ list_del_init(&ordered->log_list);
4545
+ set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4546
+
4547
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4548
+ spin_lock_irq(&inode->ordered_tree.lock);
4549
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4550
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4551
+ atomic_inc(&trans->transaction->pending_ordered);
4552
+ }
4553
+ spin_unlock_irq(&inode->ordered_tree.lock);
4554
+ }
4555
+ btrfs_put_ordered_extent(ordered);
4556
+ }
4557
+
4558
+ return 0;
44434559 }
44444560
44454561 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
....@@ -4502,6 +4618,10 @@
45024618 const u64 ino = btrfs_ino(inode);
45034619 int ins_nr = 0;
45044620 int start_slot = 0;
4621
+ bool found_xattrs = false;
4622
+
4623
+ if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
4624
+ return 0;
45054625
45064626 key.objectid = ino;
45074627 key.type = BTRFS_XATTR_ITEM_KEY;
....@@ -4540,6 +4660,7 @@
45404660 start_slot = slot;
45414661 ins_nr++;
45424662 path->slots[0]++;
4663
+ found_xattrs = true;
45434664 cond_resched();
45444665 }
45454666 if (ins_nr > 0) {
....@@ -4548,6 +4669,9 @@
45484669 if (ret < 0)
45494670 return ret;
45504671 }
4672
+
4673
+ if (!found_xattrs)
4674
+ set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
45514675
45524676 return 0;
45534677 }
....@@ -4585,9 +4709,7 @@
45854709 return ret;
45864710
45874711 while (true) {
4588
- struct btrfs_file_extent_item *extent;
45894712 struct extent_buffer *leaf = path->nodes[0];
4590
- u64 len;
45914713
45924714 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
45934715 ret = btrfs_next_leaf(root, path);
....@@ -4636,18 +4758,7 @@
46364758 leaf = path->nodes[0];
46374759 }
46384760
4639
- extent = btrfs_item_ptr(leaf, path->slots[0],
4640
- struct btrfs_file_extent_item);
4641
- if (btrfs_file_extent_type(leaf, extent) ==
4642
- BTRFS_FILE_EXTENT_INLINE) {
4643
- len = btrfs_file_extent_ram_bytes(leaf, extent);
4644
- prev_extent_end = ALIGN(key.offset + len,
4645
- fs_info->sectorsize);
4646
- } else {
4647
- len = btrfs_file_extent_num_bytes(leaf, extent);
4648
- prev_extent_end = key.offset + len;
4649
- }
4650
-
4761
+ prev_extent_end = btrfs_file_extent_end(path);
46514762 path->slots[0]++;
46524763 cond_resched();
46534764 }
....@@ -4714,7 +4825,7 @@
47144825 const int slot,
47154826 const struct btrfs_key *key,
47164827 struct btrfs_inode *inode,
4717
- u64 *other_ino)
4828
+ u64 *other_ino, u64 *other_parent)
47184829 {
47194830 int ret;
47204831 struct btrfs_path *search_path;
....@@ -4777,8 +4888,13 @@
47774888 btrfs_dir_item_key_to_cpu(search_path->nodes[0],
47784889 di, &di_key);
47794890 if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4780
- ret = 1;
4781
- *other_ino = di_key.objectid;
4891
+ if (di_key.objectid != key->objectid) {
4892
+ ret = 1;
4893
+ *other_ino = di_key.objectid;
4894
+ *other_parent = parent;
4895
+ } else {
4896
+ ret = 0;
4897
+ }
47824898 } else {
47834899 ret = -EAGAIN;
47844900 }
....@@ -4795,6 +4911,334 @@
47954911 out:
47964912 btrfs_free_path(search_path);
47974913 kfree(name);
4914
+ return ret;
4915
+}
4916
+
4917
+struct btrfs_ino_list {
4918
+ u64 ino;
4919
+ u64 parent;
4920
+ struct list_head list;
4921
+};
4922
+
4923
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
4924
+ struct btrfs_root *root,
4925
+ struct btrfs_path *path,
4926
+ struct btrfs_log_ctx *ctx,
4927
+ u64 ino, u64 parent)
4928
+{
4929
+ struct btrfs_ino_list *ino_elem;
4930
+ LIST_HEAD(inode_list);
4931
+ int ret = 0;
4932
+
4933
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
4934
+ if (!ino_elem)
4935
+ return -ENOMEM;
4936
+ ino_elem->ino = ino;
4937
+ ino_elem->parent = parent;
4938
+ list_add_tail(&ino_elem->list, &inode_list);
4939
+
4940
+ while (!list_empty(&inode_list)) {
4941
+ struct btrfs_fs_info *fs_info = root->fs_info;
4942
+ struct btrfs_key key;
4943
+ struct inode *inode;
4944
+
4945
+ ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
4946
+ list);
4947
+ ino = ino_elem->ino;
4948
+ parent = ino_elem->parent;
4949
+ list_del(&ino_elem->list);
4950
+ kfree(ino_elem);
4951
+ if (ret)
4952
+ continue;
4953
+
4954
+ btrfs_release_path(path);
4955
+
4956
+ inode = btrfs_iget(fs_info->sb, ino, root);
4957
+ /*
4958
+ * If the other inode that had a conflicting dir entry was
4959
+ * deleted in the current transaction, we need to log its parent
4960
+ * directory.
4961
+ */
4962
+ if (IS_ERR(inode)) {
4963
+ ret = PTR_ERR(inode);
4964
+ if (ret == -ENOENT) {
4965
+ inode = btrfs_iget(fs_info->sb, parent, root);
4966
+ if (IS_ERR(inode)) {
4967
+ ret = PTR_ERR(inode);
4968
+ } else {
4969
+ ret = btrfs_log_inode(trans, root,
4970
+ BTRFS_I(inode),
4971
+ LOG_OTHER_INODE_ALL,
4972
+ ctx);
4973
+ btrfs_add_delayed_iput(inode);
4974
+ }
4975
+ }
4976
+ continue;
4977
+ }
4978
+ /*
4979
+ * If the inode was already logged skip it - otherwise we can
4980
+ * hit an infinite loop. Example:
4981
+ *
4982
+ * From the commit root (previous transaction) we have the
4983
+ * following inodes:
4984
+ *
4985
+ * inode 257 a directory
4986
+ * inode 258 with references "zz" and "zz_link" on inode 257
4987
+ * inode 259 with reference "a" on inode 257
4988
+ *
4989
+ * And in the current (uncommitted) transaction we have:
4990
+ *
4991
+ * inode 257 a directory, unchanged
4992
+ * inode 258 with references "a" and "a2" on inode 257
4993
+ * inode 259 with reference "zz_link" on inode 257
4994
+ * inode 261 with reference "zz" on inode 257
4995
+ *
4996
+ * When logging inode 261 the following infinite loop could
4997
+ * happen if we don't skip already logged inodes:
4998
+ *
4999
+ * - we detect inode 258 as a conflicting inode, with inode 261
5000
+ * on reference "zz", and log it;
5001
+ *
5002
+ * - we detect inode 259 as a conflicting inode, with inode 258
5003
+ * on reference "a", and log it;
5004
+ *
5005
+ * - we detect inode 258 as a conflicting inode, with inode 259
5006
+ * on reference "zz_link", and log it - again! After this we
5007
+ * repeat the above steps forever.
5008
+ */
5009
+ spin_lock(&BTRFS_I(inode)->lock);
5010
+ /*
5011
+ * Check the inode's logged_trans only instead of
5012
+ * btrfs_inode_in_log(). This is because the last_log_commit of
5013
+ * the inode is not updated when we only log that it exists and
5014
+ * it has the full sync bit set (see btrfs_log_inode()).
5015
+ */
5016
+ if (BTRFS_I(inode)->logged_trans == trans->transid) {
5017
+ spin_unlock(&BTRFS_I(inode)->lock);
5018
+ btrfs_add_delayed_iput(inode);
5019
+ continue;
5020
+ }
5021
+ spin_unlock(&BTRFS_I(inode)->lock);
5022
+ /*
5023
+ * We are safe logging the other inode without acquiring its
5024
+ * lock as long as we log with the LOG_INODE_EXISTS mode. We
5025
+ * are safe against concurrent renames of the other inode as
5026
+ * well because during a rename we pin the log and update the
5027
+ * log with the new name before we unpin it.
5028
+ */
5029
+ ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5030
+ LOG_OTHER_INODE, ctx);
5031
+ if (ret) {
5032
+ btrfs_add_delayed_iput(inode);
5033
+ continue;
5034
+ }
5035
+
5036
+ key.objectid = ino;
5037
+ key.type = BTRFS_INODE_REF_KEY;
5038
+ key.offset = 0;
5039
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5040
+ if (ret < 0) {
5041
+ btrfs_add_delayed_iput(inode);
5042
+ continue;
5043
+ }
5044
+
5045
+ while (true) {
5046
+ struct extent_buffer *leaf = path->nodes[0];
5047
+ int slot = path->slots[0];
5048
+ u64 other_ino = 0;
5049
+ u64 other_parent = 0;
5050
+
5051
+ if (slot >= btrfs_header_nritems(leaf)) {
5052
+ ret = btrfs_next_leaf(root, path);
5053
+ if (ret < 0) {
5054
+ break;
5055
+ } else if (ret > 0) {
5056
+ ret = 0;
5057
+ break;
5058
+ }
5059
+ continue;
5060
+ }
5061
+
5062
+ btrfs_item_key_to_cpu(leaf, &key, slot);
5063
+ if (key.objectid != ino ||
5064
+ (key.type != BTRFS_INODE_REF_KEY &&
5065
+ key.type != BTRFS_INODE_EXTREF_KEY)) {
5066
+ ret = 0;
5067
+ break;
5068
+ }
5069
+
5070
+ ret = btrfs_check_ref_name_override(leaf, slot, &key,
5071
+ BTRFS_I(inode), &other_ino,
5072
+ &other_parent);
5073
+ if (ret < 0)
5074
+ break;
5075
+ if (ret > 0) {
5076
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5077
+ if (!ino_elem) {
5078
+ ret = -ENOMEM;
5079
+ break;
5080
+ }
5081
+ ino_elem->ino = other_ino;
5082
+ ino_elem->parent = other_parent;
5083
+ list_add_tail(&ino_elem->list, &inode_list);
5084
+ ret = 0;
5085
+ }
5086
+ path->slots[0]++;
5087
+ }
5088
+ btrfs_add_delayed_iput(inode);
5089
+ }
5090
+
5091
+ return ret;
5092
+}
5093
+
5094
+static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5095
+ struct btrfs_inode *inode,
5096
+ struct btrfs_key *min_key,
5097
+ const struct btrfs_key *max_key,
5098
+ struct btrfs_path *path,
5099
+ struct btrfs_path *dst_path,
5100
+ const u64 logged_isize,
5101
+ const bool recursive_logging,
5102
+ const int inode_only,
5103
+ struct btrfs_log_ctx *ctx,
5104
+ bool *need_log_inode_item)
5105
+{
5106
+ const u64 i_size = i_size_read(&inode->vfs_inode);
5107
+ struct btrfs_root *root = inode->root;
5108
+ int ins_start_slot = 0;
5109
+ int ins_nr = 0;
5110
+ int ret;
5111
+
5112
+ while (1) {
5113
+ ret = btrfs_search_forward(root, min_key, path, trans->transid);
5114
+ if (ret < 0)
5115
+ return ret;
5116
+ if (ret > 0) {
5117
+ ret = 0;
5118
+ break;
5119
+ }
5120
+again:
5121
+ /* Note, ins_nr might be > 0 here, cleanup outside the loop */
5122
+ if (min_key->objectid != max_key->objectid)
5123
+ break;
5124
+ if (min_key->type > max_key->type)
5125
+ break;
5126
+
5127
+ if (min_key->type == BTRFS_INODE_ITEM_KEY) {
5128
+ *need_log_inode_item = false;
5129
+ } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5130
+ min_key->offset >= i_size) {
5131
+ /*
5132
+ * Extents at and beyond eof are logged with
5133
+ * btrfs_log_prealloc_extents().
5134
+ * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5135
+ * and no keys greater than that, so bail out.
5136
+ */
5137
+ break;
5138
+ } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5139
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5140
+ inode->generation == trans->transid &&
5141
+ !recursive_logging) {
5142
+ u64 other_ino = 0;
5143
+ u64 other_parent = 0;
5144
+
5145
+ ret = btrfs_check_ref_name_override(path->nodes[0],
5146
+ path->slots[0], min_key, inode,
5147
+ &other_ino, &other_parent);
5148
+ if (ret < 0) {
5149
+ return ret;
5150
+ } else if (ret > 0 && ctx &&
5151
+ other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5152
+ if (ins_nr > 0) {
5153
+ ins_nr++;
5154
+ } else {
5155
+ ins_nr = 1;
5156
+ ins_start_slot = path->slots[0];
5157
+ }
5158
+ ret = copy_items(trans, inode, dst_path, path,
5159
+ ins_start_slot, ins_nr,
5160
+ inode_only, logged_isize);
5161
+ if (ret < 0)
5162
+ return ret;
5163
+ ins_nr = 0;
5164
+
5165
+ ret = log_conflicting_inodes(trans, root, path,
5166
+ ctx, other_ino, other_parent);
5167
+ if (ret)
5168
+ return ret;
5169
+ btrfs_release_path(path);
5170
+ goto next_key;
5171
+ }
5172
+ } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5173
+ /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
5174
+ if (ins_nr == 0)
5175
+ goto next_slot;
5176
+ ret = copy_items(trans, inode, dst_path, path,
5177
+ ins_start_slot,
5178
+ ins_nr, inode_only, logged_isize);
5179
+ if (ret < 0)
5180
+ return ret;
5181
+ ins_nr = 0;
5182
+ goto next_slot;
5183
+ }
5184
+
5185
+ if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5186
+ ins_nr++;
5187
+ goto next_slot;
5188
+ } else if (!ins_nr) {
5189
+ ins_start_slot = path->slots[0];
5190
+ ins_nr = 1;
5191
+ goto next_slot;
5192
+ }
5193
+
5194
+ ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5195
+ ins_nr, inode_only, logged_isize);
5196
+ if (ret < 0)
5197
+ return ret;
5198
+ ins_nr = 1;
5199
+ ins_start_slot = path->slots[0];
5200
+next_slot:
5201
+ path->slots[0]++;
5202
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5203
+ btrfs_item_key_to_cpu(path->nodes[0], min_key,
5204
+ path->slots[0]);
5205
+ goto again;
5206
+ }
5207
+ if (ins_nr) {
5208
+ ret = copy_items(trans, inode, dst_path, path,
5209
+ ins_start_slot, ins_nr, inode_only,
5210
+ logged_isize);
5211
+ if (ret < 0)
5212
+ return ret;
5213
+ ins_nr = 0;
5214
+ }
5215
+ btrfs_release_path(path);
5216
+next_key:
5217
+ if (min_key->offset < (u64)-1) {
5218
+ min_key->offset++;
5219
+ } else if (min_key->type < max_key->type) {
5220
+ min_key->type++;
5221
+ min_key->offset = 0;
5222
+ } else {
5223
+ break;
5224
+ }
5225
+ }
5226
+ if (ins_nr) {
5227
+ ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5228
+ ins_nr, inode_only, logged_isize);
5229
+ if (ret)
5230
+ return ret;
5231
+ }
5232
+
5233
+ if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
5234
+ /*
5235
+ * Release the path because otherwise we might attempt to double
5236
+ * lock the same leaf with btrfs_log_prealloc_extents() below.
5237
+ */
5238
+ btrfs_release_path(path);
5239
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
5240
+ }
5241
+
47985242 return ret;
47995243 }
48005244
....@@ -4815,27 +5259,22 @@
48155259 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
48165260 struct btrfs_root *root, struct btrfs_inode *inode,
48175261 int inode_only,
4818
- const loff_t start,
4819
- const loff_t end,
48205262 struct btrfs_log_ctx *ctx)
48215263 {
4822
- struct btrfs_fs_info *fs_info = root->fs_info;
48235264 struct btrfs_path *path;
48245265 struct btrfs_path *dst_path;
48255266 struct btrfs_key min_key;
48265267 struct btrfs_key max_key;
48275268 struct btrfs_root *log = root->log_root;
48285269 int err = 0;
4829
- int ret;
4830
- int nritems;
4831
- int ins_start_slot = 0;
4832
- int ins_nr;
5270
+ int ret = 0;
48335271 bool fast_search = false;
48345272 u64 ino = btrfs_ino(inode);
48355273 struct extent_map_tree *em_tree = &inode->extent_tree;
48365274 u64 logged_isize = 0;
48375275 bool need_log_inode_item = true;
48385276 bool xattrs_logged = false;
5277
+ bool recursive_logging = false;
48395278
48405279 path = btrfs_alloc_path();
48415280 if (!path)
....@@ -4864,15 +5303,19 @@
48645303 max_key.offset = (u64)-1;
48655304
48665305 /*
4867
- * Only run delayed items if we are a dir or a new file.
4868
- * Otherwise commit the delayed inode only, which is needed in
4869
- * order for the log replay code to mark inodes for link count
4870
- * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
5306
+ * Only run delayed items if we are a directory. We want to make sure
5307
+ * all directory indexes hit the fs/subvolume tree so we can find them
5308
+ * and figure out which index ranges have to be logged.
5309
+ *
5310
+ * Otherwise commit the delayed inode only if the full sync flag is set,
5311
+ * as we want to make sure an up to date version is in the subvolume
5312
+ * tree so copy_inode_items_to_log() / copy_items() can find it and copy
5313
+ * it to the log tree. For a non full sync, we always log the inode item
5314
+ * based on the in-memory struct btrfs_inode which is always up to date.
48715315 */
4872
- if (S_ISDIR(inode->vfs_inode.i_mode) ||
4873
- inode->generation > fs_info->last_trans_committed)
5316
+ if (S_ISDIR(inode->vfs_inode.i_mode))
48745317 ret = btrfs_commit_inode_delayed_items(trans, inode);
4875
- else
5318
+ else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
48765319 ret = btrfs_commit_inode_delayed_inode(inode);
48775320
48785321 if (ret) {
....@@ -4881,12 +5324,28 @@
48815324 return ret;
48825325 }
48835326
4884
- if (inode_only == LOG_OTHER_INODE) {
4885
- inode_only = LOG_INODE_EXISTS;
5327
+ if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
5328
+ recursive_logging = true;
5329
+ if (inode_only == LOG_OTHER_INODE)
5330
+ inode_only = LOG_INODE_EXISTS;
5331
+ else
5332
+ inode_only = LOG_INODE_ALL;
48865333 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
48875334 } else {
48885335 mutex_lock(&inode->log_mutex);
48895336 }
5337
+
5338
+ /*
5339
+ * For symlinks, we must always log their content, which is stored in an
5340
+ * inline extent, otherwise we could end up with an empty symlink after
5341
+ * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
5342
+ * one attempts to create an empty symlink).
5343
+ * We don't need to worry about flushing delalloc, because when we create
5344
+ * the inline extent when the symlink is created (we never have delalloc
5345
+ * for symlinks).
5346
+ */
5347
+ if (S_ISLNK(inode->vfs_inode.i_mode))
5348
+ inode_only = LOG_INODE_ALL;
48905349
48915350 /*
48925351 * a brute force approach to making sure we get the most uptodate
....@@ -4955,170 +5414,12 @@
49555414 goto out_unlock;
49565415 }
49575416
4958
- while (1) {
4959
- ins_nr = 0;
4960
- ret = btrfs_search_forward(root, &min_key,
4961
- path, trans->transid);
4962
- if (ret < 0) {
4963
- err = ret;
4964
- goto out_unlock;
4965
- }
4966
- if (ret != 0)
4967
- break;
4968
-again:
4969
- /* note, ins_nr might be > 0 here, cleanup outside the loop */
4970
- if (min_key.objectid != ino)
4971
- break;
4972
- if (min_key.type > max_key.type)
4973
- break;
4974
-
4975
- if (min_key.type == BTRFS_INODE_ITEM_KEY)
4976
- need_log_inode_item = false;
4977
-
4978
- if ((min_key.type == BTRFS_INODE_REF_KEY ||
4979
- min_key.type == BTRFS_INODE_EXTREF_KEY) &&
4980
- inode->generation == trans->transid) {
4981
- u64 other_ino = 0;
4982
-
4983
- ret = btrfs_check_ref_name_override(path->nodes[0],
4984
- path->slots[0], &min_key, inode,
4985
- &other_ino);
4986
- if (ret < 0) {
4987
- err = ret;
4988
- goto out_unlock;
4989
- } else if (ret > 0 && ctx &&
4990
- other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
4991
- struct btrfs_key inode_key;
4992
- struct inode *other_inode;
4993
-
4994
- if (ins_nr > 0) {
4995
- ins_nr++;
4996
- } else {
4997
- ins_nr = 1;
4998
- ins_start_slot = path->slots[0];
4999
- }
5000
- ret = copy_items(trans, inode, dst_path, path,
5001
- ins_start_slot,
5002
- ins_nr, inode_only,
5003
- logged_isize);
5004
- if (ret < 0) {
5005
- err = ret;
5006
- goto out_unlock;
5007
- }
5008
- ins_nr = 0;
5009
- btrfs_release_path(path);
5010
- inode_key.objectid = other_ino;
5011
- inode_key.type = BTRFS_INODE_ITEM_KEY;
5012
- inode_key.offset = 0;
5013
- other_inode = btrfs_iget(fs_info->sb,
5014
- &inode_key, root,
5015
- NULL);
5016
- /*
5017
- * If the other inode that had a conflicting dir
5018
- * entry was deleted in the current transaction,
5019
- * we don't need to do more work nor fallback to
5020
- * a transaction commit.
5021
- */
5022
- if (other_inode == ERR_PTR(-ENOENT)) {
5023
- goto next_key;
5024
- } else if (IS_ERR(other_inode)) {
5025
- err = PTR_ERR(other_inode);
5026
- goto out_unlock;
5027
- }
5028
- /*
5029
- * We are safe logging the other inode without
5030
- * acquiring its i_mutex as long as we log with
5031
- * the LOG_INODE_EXISTS mode. We're safe against
5032
- * concurrent renames of the other inode as well
5033
- * because during a rename we pin the log and
5034
- * update the log with the new name before we
5035
- * unpin it.
5036
- */
5037
- err = btrfs_log_inode(trans, root,
5038
- BTRFS_I(other_inode),
5039
- LOG_OTHER_INODE, 0, LLONG_MAX,
5040
- ctx);
5041
- btrfs_add_delayed_iput(other_inode);
5042
- if (err)
5043
- goto out_unlock;
5044
- else
5045
- goto next_key;
5046
- }
5047
- }
5048
-
5049
- /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
5050
- if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
5051
- if (ins_nr == 0)
5052
- goto next_slot;
5053
- ret = copy_items(trans, inode, dst_path, path,
5054
- ins_start_slot,
5055
- ins_nr, inode_only, logged_isize);
5056
- if (ret < 0) {
5057
- err = ret;
5058
- goto out_unlock;
5059
- }
5060
- ins_nr = 0;
5061
- goto next_slot;
5062
- }
5063
-
5064
- if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5065
- ins_nr++;
5066
- goto next_slot;
5067
- } else if (!ins_nr) {
5068
- ins_start_slot = path->slots[0];
5069
- ins_nr = 1;
5070
- goto next_slot;
5071
- }
5072
-
5073
- ret = copy_items(trans, inode, dst_path, path,
5074
- ins_start_slot, ins_nr, inode_only,
5075
- logged_isize);
5076
- if (ret < 0) {
5077
- err = ret;
5078
- goto out_unlock;
5079
- }
5080
- ins_nr = 1;
5081
- ins_start_slot = path->slots[0];
5082
-next_slot:
5083
-
5084
- nritems = btrfs_header_nritems(path->nodes[0]);
5085
- path->slots[0]++;
5086
- if (path->slots[0] < nritems) {
5087
- btrfs_item_key_to_cpu(path->nodes[0], &min_key,
5088
- path->slots[0]);
5089
- goto again;
5090
- }
5091
- if (ins_nr) {
5092
- ret = copy_items(trans, inode, dst_path, path,
5093
- ins_start_slot,
5094
- ins_nr, inode_only, logged_isize);
5095
- if (ret < 0) {
5096
- err = ret;
5097
- goto out_unlock;
5098
- }
5099
- ins_nr = 0;
5100
- }
5101
- btrfs_release_path(path);
5102
-next_key:
5103
- if (min_key.offset < (u64)-1) {
5104
- min_key.offset++;
5105
- } else if (min_key.type < max_key.type) {
5106
- min_key.type++;
5107
- min_key.offset = 0;
5108
- } else {
5109
- break;
5110
- }
5111
- }
5112
- if (ins_nr) {
5113
- ret = copy_items(trans, inode, dst_path, path,
5114
- ins_start_slot, ins_nr, inode_only,
5115
- logged_isize);
5116
- if (ret < 0) {
5117
- err = ret;
5118
- goto out_unlock;
5119
- }
5120
- ins_nr = 0;
5121
- }
5417
+ err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
5418
+ path, dst_path, logged_isize,
5419
+ recursive_logging, inode_only, ctx,
5420
+ &need_log_inode_item);
5421
+ if (err)
5422
+ goto out_unlock;
51225423
51235424 btrfs_release_path(path);
51245425 btrfs_release_path(dst_path);
....@@ -5148,7 +5449,7 @@
51485449 }
51495450 if (fast_search) {
51505451 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
5151
- ctx, start, end);
5452
+ ctx);
51525453 if (ret) {
51535454 err = ret;
51545455 goto out_unlock;
....@@ -5157,31 +5458,8 @@
51575458 struct extent_map *em, *n;
51585459
51595460 write_lock(&em_tree->lock);
5160
- /*
5161
- * We can't just remove every em if we're called for a ranged
5162
- * fsync - that is, one that doesn't cover the whole possible
5163
- * file range (0 to LLONG_MAX). This is because we can have
5164
- * em's that fall outside the range we're logging and therefore
5165
- * their ordered operations haven't completed yet
5166
- * (btrfs_finish_ordered_io() not invoked yet). This means we
5167
- * didn't get their respective file extent item in the fs/subvol
5168
- * tree yet, and need to let the next fast fsync (one which
5169
- * consults the list of modified extent maps) find the em so
5170
- * that it logs a matching file extent item and waits for the
5171
- * respective ordered operation to complete (if it's still
5172
- * running).
5173
- *
5174
- * Removing every em outside the range we're logging would make
5175
- * the next fast fsync not log their matching file extent items,
5176
- * therefore making us lose data after a log replay.
5177
- */
5178
- list_for_each_entry_safe(em, n, &em_tree->modified_extents,
5179
- list) {
5180
- const u64 mod_end = em->mod_start + em->mod_len - 1;
5181
-
5182
- if (em->mod_start >= start && mod_end <= end)
5183
- list_del_init(&em->list);
5184
- }
5461
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
5462
+ list_del_init(&em->list);
51855463 write_unlock(&em_tree->lock);
51865464 }
51875465
....@@ -5195,19 +5473,34 @@
51955473 }
51965474
51975475 /*
5198
- * Don't update last_log_commit if we logged that an inode exists after
5199
- * it was loaded to memory (full_sync bit set).
5200
- * This is to prevent data loss when we do a write to the inode, then
5201
- * the inode gets evicted after all delalloc was flushed, then we log
5202
- * it exists (due to a rename for example) and then fsync it. This last
5203
- * fsync would do nothing (not logging the extents previously written).
5476
+ * If we are logging that an ancestor inode exists as part of logging a
5477
+ * new name from a link or rename operation, don't mark the inode as
5478
+ * logged - otherwise if an explicit fsync is made against an ancestor,
5479
+ * the fsync considers the inode in the log and doesn't sync the log,
5480
+ * resulting in the ancestor missing after a power failure unless the
5481
+ * log was synced as part of an fsync against any other unrelated inode.
5482
+ * So keep it simple for this case and just don't flag the ancestors as
5483
+ * logged.
52045484 */
5205
- spin_lock(&inode->lock);
5206
- inode->logged_trans = trans->transid;
5207
- if (inode_only != LOG_INODE_EXISTS ||
5208
- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5209
- inode->last_log_commit = inode->last_sub_trans;
5210
- spin_unlock(&inode->lock);
5485
+ if (!ctx ||
5486
+ !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
5487
+ &inode->vfs_inode != ctx->inode)) {
5488
+ spin_lock(&inode->lock);
5489
+ inode->logged_trans = trans->transid;
5490
+ /*
5491
+ * Don't update last_log_commit if we logged that an inode exists
5492
+ * after it was loaded to memory (full_sync bit set).
5493
+ * This is to prevent data loss when we do a write to the inode,
5494
+ * then the inode gets evicted after all delalloc was flushed,
5495
+ * then we log it exists (due to a rename for example) and then
5496
+ * fsync it. This last fsync would do nothing (not logging the
5497
+ * extents previously written).
5498
+ */
5499
+ if (inode_only != LOG_INODE_EXISTS ||
5500
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5501
+ inode->last_log_commit = inode->last_sub_trans;
5502
+ spin_unlock(&inode->lock);
5503
+ }
52115504 out_unlock:
52125505 mutex_unlock(&inode->log_mutex);
52135506
....@@ -5244,7 +5537,7 @@
52445537 * Make sure any commits to the log are forced to be full
52455538 * commits.
52465539 */
5247
- btrfs_set_log_full_commit(fs_info, trans);
5540
+ btrfs_set_log_full_commit(trans);
52485541 ret = true;
52495542 }
52505543 mutex_unlock(&inode->log_mutex);
....@@ -5432,7 +5725,7 @@
54325725 continue;
54335726
54345727 btrfs_release_path(path);
5435
- di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
5728
+ di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
54365729 if (IS_ERR(di_inode)) {
54375730 ret = PTR_ERR(di_inode);
54385731 goto next_dir_inode;
....@@ -5444,10 +5737,10 @@
54445737 }
54455738
54465739 ctx->log_new_dentries = false;
5447
- if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
5740
+ if (type == BTRFS_FT_DIR)
54485741 log_mode = LOG_INODE_ALL;
54495742 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
5450
- log_mode, 0, LLONG_MAX, ctx);
5743
+ log_mode, ctx);
54515744 if (!ret &&
54525745 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
54535746 ret = 1;
....@@ -5558,8 +5851,8 @@
55585851 cur_offset = item_size;
55595852 }
55605853
5561
- dir_inode = btrfs_iget(fs_info->sb, &inode_key,
5562
- root, NULL);
5854
+ dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
5855
+ root);
55635856 /*
55645857 * If the parent inode was deleted, return an error to
55655858 * fallback to a transaction commit. This is to prevent
....@@ -5591,7 +5884,7 @@
55915884 if (ctx)
55925885 ctx->log_new_dentries = false;
55935886 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
5594
- LOG_INODE_ALL, 0, LLONG_MAX, ctx);
5887
+ LOG_INODE_ALL, ctx);
55955888 if (!ret &&
55965889 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
55975890 ret = 1;
....@@ -5610,6 +5903,192 @@
56105903 return ret;
56115904 }
56125905
5906
+static int log_new_ancestors(struct btrfs_trans_handle *trans,
5907
+ struct btrfs_root *root,
5908
+ struct btrfs_path *path,
5909
+ struct btrfs_log_ctx *ctx)
5910
+{
5911
+ struct btrfs_key found_key;
5912
+
5913
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5914
+
5915
+ while (true) {
5916
+ struct btrfs_fs_info *fs_info = root->fs_info;
5917
+ const u64 last_committed = fs_info->last_trans_committed;
5918
+ struct extent_buffer *leaf = path->nodes[0];
5919
+ int slot = path->slots[0];
5920
+ struct btrfs_key search_key;
5921
+ struct inode *inode;
5922
+ u64 ino;
5923
+ int ret = 0;
5924
+
5925
+ btrfs_release_path(path);
5926
+
5927
+ ino = found_key.offset;
5928
+
5929
+ search_key.objectid = found_key.offset;
5930
+ search_key.type = BTRFS_INODE_ITEM_KEY;
5931
+ search_key.offset = 0;
5932
+ inode = btrfs_iget(fs_info->sb, ino, root);
5933
+ if (IS_ERR(inode))
5934
+ return PTR_ERR(inode);
5935
+
5936
+ if (BTRFS_I(inode)->generation > last_committed)
5937
+ ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5938
+ LOG_INODE_EXISTS, ctx);
5939
+ btrfs_add_delayed_iput(inode);
5940
+ if (ret)
5941
+ return ret;
5942
+
5943
+ if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
5944
+ break;
5945
+
5946
+ search_key.type = BTRFS_INODE_REF_KEY;
5947
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5948
+ if (ret < 0)
5949
+ return ret;
5950
+
5951
+ leaf = path->nodes[0];
5952
+ slot = path->slots[0];
5953
+ if (slot >= btrfs_header_nritems(leaf)) {
5954
+ ret = btrfs_next_leaf(root, path);
5955
+ if (ret < 0)
5956
+ return ret;
5957
+ else if (ret > 0)
5958
+ return -ENOENT;
5959
+ leaf = path->nodes[0];
5960
+ slot = path->slots[0];
5961
+ }
5962
+
5963
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
5964
+ if (found_key.objectid != search_key.objectid ||
5965
+ found_key.type != BTRFS_INODE_REF_KEY)
5966
+ return -ENOENT;
5967
+ }
5968
+ return 0;
5969
+}
5970
+
5971
+static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
5972
+ struct btrfs_inode *inode,
5973
+ struct dentry *parent,
5974
+ struct btrfs_log_ctx *ctx)
5975
+{
5976
+ struct btrfs_root *root = inode->root;
5977
+ struct btrfs_fs_info *fs_info = root->fs_info;
5978
+ struct dentry *old_parent = NULL;
5979
+ struct super_block *sb = inode->vfs_inode.i_sb;
5980
+ int ret = 0;
5981
+
5982
+ while (true) {
5983
+ if (!parent || d_really_is_negative(parent) ||
5984
+ sb != parent->d_sb)
5985
+ break;
5986
+
5987
+ inode = BTRFS_I(d_inode(parent));
5988
+ if (root != inode->root)
5989
+ break;
5990
+
5991
+ if (inode->generation > fs_info->last_trans_committed) {
5992
+ ret = btrfs_log_inode(trans, root, inode,
5993
+ LOG_INODE_EXISTS, ctx);
5994
+ if (ret)
5995
+ break;
5996
+ }
5997
+ if (IS_ROOT(parent))
5998
+ break;
5999
+
6000
+ parent = dget_parent(parent);
6001
+ dput(old_parent);
6002
+ old_parent = parent;
6003
+ }
6004
+ dput(old_parent);
6005
+
6006
+ return ret;
6007
+}
6008
+
6009
+static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6010
+ struct btrfs_inode *inode,
6011
+ struct dentry *parent,
6012
+ struct btrfs_log_ctx *ctx)
6013
+{
6014
+ struct btrfs_root *root = inode->root;
6015
+ const u64 ino = btrfs_ino(inode);
6016
+ struct btrfs_path *path;
6017
+ struct btrfs_key search_key;
6018
+ int ret;
6019
+
6020
+ /*
6021
+ * For a single hard link case, go through a fast path that does not
6022
+ * need to iterate the fs/subvolume tree.
6023
+ */
6024
+ if (inode->vfs_inode.i_nlink < 2)
6025
+ return log_new_ancestors_fast(trans, inode, parent, ctx);
6026
+
6027
+ path = btrfs_alloc_path();
6028
+ if (!path)
6029
+ return -ENOMEM;
6030
+
6031
+ search_key.objectid = ino;
6032
+ search_key.type = BTRFS_INODE_REF_KEY;
6033
+ search_key.offset = 0;
6034
+again:
6035
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6036
+ if (ret < 0)
6037
+ goto out;
6038
+ if (ret == 0)
6039
+ path->slots[0]++;
6040
+
6041
+ while (true) {
6042
+ struct extent_buffer *leaf = path->nodes[0];
6043
+ int slot = path->slots[0];
6044
+ struct btrfs_key found_key;
6045
+
6046
+ if (slot >= btrfs_header_nritems(leaf)) {
6047
+ ret = btrfs_next_leaf(root, path);
6048
+ if (ret < 0)
6049
+ goto out;
6050
+ else if (ret > 0)
6051
+ break;
6052
+ continue;
6053
+ }
6054
+
6055
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
6056
+ if (found_key.objectid != ino ||
6057
+ found_key.type > BTRFS_INODE_EXTREF_KEY)
6058
+ break;
6059
+
6060
+ /*
6061
+ * Don't deal with extended references because they are rare
6062
+ * cases and too complex to deal with (we would need to keep
6063
+ * track of which subitem we are processing for each item in
6064
+ * this loop, etc). So just return some error to fallback to
6065
+ * a transaction commit.
6066
+ */
6067
+ if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
6068
+ ret = -EMLINK;
6069
+ goto out;
6070
+ }
6071
+
6072
+ /*
6073
+ * Logging ancestors needs to do more searches on the fs/subvol
6074
+ * tree, so it releases the path as needed to avoid deadlocks.
6075
+ * Keep track of the last inode ref key and resume from that key
6076
+ * after logging all new ancestors for the current hard link.
6077
+ */
6078
+ memcpy(&search_key, &found_key, sizeof(search_key));
6079
+
6080
+ ret = log_new_ancestors(trans, root, path, ctx);
6081
+ if (ret)
6082
+ goto out;
6083
+ btrfs_release_path(path);
6084
+ goto again;
6085
+ }
6086
+ ret = 0;
6087
+out:
6088
+ btrfs_free_path(path);
6089
+ return ret;
6090
+}
6091
+
56136092 /*
56146093 * helper function around btrfs_log_inode to make sure newly created
56156094 * parent directories also end up in the log. A minimal inode and backref
....@@ -5619,19 +6098,15 @@
56196098 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
56206099 struct btrfs_inode *inode,
56216100 struct dentry *parent,
5622
- const loff_t start,
5623
- const loff_t end,
56246101 int inode_only,
56256102 struct btrfs_log_ctx *ctx)
56266103 {
56276104 struct btrfs_root *root = inode->root;
56286105 struct btrfs_fs_info *fs_info = root->fs_info;
56296106 struct super_block *sb;
5630
- struct dentry *old_parent = NULL;
56316107 int ret = 0;
56326108 u64 last_committed = fs_info->last_trans_committed;
56336109 bool log_dentries = false;
5634
- struct btrfs_inode *orig_inode = inode;
56356110
56366111 sb = inode->vfs_inode.i_sb;
56376112
....@@ -5665,7 +6140,8 @@
56656140 * (since logging them is pointless, a link count of 0 means they
56666141 * will never be accessible).
56676142 */
5668
- if (btrfs_inode_in_log(inode, trans->transid) ||
6143
+ if ((btrfs_inode_in_log(inode, trans->transid) &&
6144
+ list_empty(&ctx->ordered_extents)) ||
56696145 inode->vfs_inode.i_nlink == 0) {
56706146 ret = BTRFS_NO_LOG_SYNC;
56716147 goto end_no_trans;
....@@ -5675,7 +6151,7 @@
56756151 if (ret)
56766152 goto end_no_trans;
56776153
5678
- ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
6154
+ ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
56796155 if (ret)
56806156 goto end_trans;
56816157
....@@ -5737,56 +6213,22 @@
57376213 * and has a link count of 2.
57386214 */
57396215 if (inode->last_unlink_trans > last_committed) {
5740
- ret = btrfs_log_all_parents(trans, orig_inode, ctx);
6216
+ ret = btrfs_log_all_parents(trans, inode, ctx);
57416217 if (ret)
57426218 goto end_trans;
57436219 }
57446220
5745
- /*
5746
- * If a new hard link was added to the inode in the current transaction
5747
- * and its link count is now greater than 1, we need to fallback to a
5748
- * transaction commit, otherwise we can end up not logging all its new
5749
- * parents for all the hard links. Here just from the dentry used to
5750
- * fsync, we can not visit the ancestor inodes for all the other hard
5751
- * links to figure out if any is new, so we fallback to a transaction
5752
- * commit (instead of adding a lot of complexity of scanning a btree,
5753
- * since this scenario is not a common use case).
5754
- */
5755
- if (inode->vfs_inode.i_nlink > 1 &&
5756
- inode->last_link_trans > last_committed) {
5757
- ret = -EMLINK;
6221
+ ret = log_all_new_ancestors(trans, inode, parent, ctx);
6222
+ if (ret)
57586223 goto end_trans;
5759
- }
57606224
5761
- while (1) {
5762
- if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5763
- break;
5764
-
5765
- inode = BTRFS_I(d_inode(parent));
5766
- if (root != inode->root)
5767
- break;
5768
-
5769
- if (inode->generation > last_committed) {
5770
- ret = btrfs_log_inode(trans, root, inode,
5771
- LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
5772
- if (ret)
5773
- goto end_trans;
5774
- }
5775
- if (IS_ROOT(parent))
5776
- break;
5777
-
5778
- parent = dget_parent(parent);
5779
- dput(old_parent);
5780
- old_parent = parent;
5781
- }
57826225 if (log_dentries)
5783
- ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
6226
+ ret = log_new_dir_dentries(trans, root, inode, ctx);
57846227 else
57856228 ret = 0;
57866229 end_trans:
5787
- dput(old_parent);
57886230 if (ret < 0) {
5789
- btrfs_set_log_full_commit(fs_info, trans);
6231
+ btrfs_set_log_full_commit(trans);
57906232 ret = 1;
57916233 }
57926234
....@@ -5805,15 +6247,13 @@
58056247 */
58066248 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
58076249 struct dentry *dentry,
5808
- const loff_t start,
5809
- const loff_t end,
58106250 struct btrfs_log_ctx *ctx)
58116251 {
58126252 struct dentry *parent = dget_parent(dentry);
58136253 int ret;
58146254
58156255 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
5816
- start, end, LOG_INODE_ALL, ctx);
6256
+ LOG_INODE_ALL, ctx);
58176257 dput(parent);
58186258
58196259 return ret;
....@@ -5830,12 +6270,11 @@
58306270 struct btrfs_trans_handle *trans;
58316271 struct btrfs_key key;
58326272 struct btrfs_key found_key;
5833
- struct btrfs_key tmp_key;
58346273 struct btrfs_root *log;
58356274 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
58366275 struct walk_control wc = {
58376276 .process_func = process_one_buffer,
5838
- .stage = 0,
6277
+ .stage = LOG_WALK_PIN_ONLY,
58396278 };
58406279
58416280 path = btrfs_alloc_path();
....@@ -5884,7 +6323,7 @@
58846323 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
58856324 break;
58866325
5887
- log = btrfs_read_fs_root(log_root_tree, &found_key);
6326
+ log = btrfs_read_tree_root(log_root_tree, &found_key);
58886327 if (IS_ERR(log)) {
58896328 ret = PTR_ERR(log);
58906329 btrfs_handle_fs_error(fs_info, ret,
....@@ -5892,11 +6331,8 @@
58926331 goto error;
58936332 }
58946333
5895
- tmp_key.objectid = found_key.offset;
5896
- tmp_key.type = BTRFS_ROOT_ITEM_KEY;
5897
- tmp_key.offset = (u64)-1;
5898
-
5899
- wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
6334
+ wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
6335
+ true);
59006336 if (IS_ERR(wc.replay_dest)) {
59016337 ret = PTR_ERR(wc.replay_dest);
59026338
....@@ -5912,12 +6348,10 @@
59126348 * each subsequent pass.
59136349 */
59146350 if (ret == -ENOENT)
5915
- ret = btrfs_pin_extent_for_log_replay(fs_info,
6351
+ ret = btrfs_pin_extent_for_log_replay(trans,
59166352 log->node->start,
59176353 log->node->len);
5918
- free_extent_buffer(log->node);
5919
- free_extent_buffer(log->commit_root);
5920
- kfree(log);
6354
+ btrfs_put_root(log);
59216355
59226356 if (!ret)
59236357 goto next;
....@@ -5953,9 +6387,8 @@
59536387 }
59546388
59556389 wc.replay_dest->log_root = NULL;
5956
- free_extent_buffer(log->node);
5957
- free_extent_buffer(log->commit_root);
5958
- kfree(log);
6390
+ btrfs_put_root(wc.replay_dest);
6391
+ btrfs_put_root(log);
59596392
59606393 if (ret)
59616394 goto error;
....@@ -5986,10 +6419,9 @@
59866419 if (ret)
59876420 return ret;
59886421
5989
- free_extent_buffer(log_root_tree->node);
59906422 log_root_tree->log_root = NULL;
59916423 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
5992
- kfree(log_root_tree);
6424
+ btrfs_put_root(log_root_tree);
59936425
59946426 return 0;
59956427 error:
....@@ -6085,26 +6517,12 @@
60856517 /*
60866518 * Call this after adding a new name for a file and it will properly
60876519 * update the log to reflect the new name.
6088
- *
6089
- * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
6090
- * true (because it's not used).
6091
- *
6092
- * Return value depends on whether @sync_log is true or false.
6093
- * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
6094
- * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
6095
- * otherwise.
6096
- * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
6097
- * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
6098
- * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
6099
- * committed (without attempting to sync the log).
61006520 */
6101
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
6521
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
61026522 struct btrfs_inode *inode, struct btrfs_inode *old_dir,
6103
- struct dentry *parent,
6104
- bool sync_log, struct btrfs_log_ctx *ctx)
6523
+ struct dentry *parent)
61056524 {
6106
- struct btrfs_fs_info *fs_info = trans->fs_info;
6107
- int ret;
6525
+ struct btrfs_log_ctx ctx;
61086526
61096527 /*
61106528 * this will force the logging code to walk the dentry chain
....@@ -6117,36 +6535,19 @@
61176535 * if this inode hasn't been logged and directory we're renaming it
61186536 * from hasn't been logged, we don't need to log it
61196537 */
6120
- if (inode->logged_trans <= fs_info->last_trans_committed &&
6121
- (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
6122
- return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
6123
- BTRFS_DONT_NEED_LOG_SYNC;
6538
+ if (!inode_logged(trans, inode) &&
6539
+ (!old_dir || !inode_logged(trans, old_dir)))
6540
+ return;
61246541
6125
- if (sync_log) {
6126
- struct btrfs_log_ctx ctx2;
6127
-
6128
- btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
6129
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
6130
- LOG_INODE_EXISTS, &ctx2);
6131
- if (ret == BTRFS_NO_LOG_SYNC)
6132
- return BTRFS_DONT_NEED_TRANS_COMMIT;
6133
- else if (ret)
6134
- return BTRFS_NEED_TRANS_COMMIT;
6135
-
6136
- ret = btrfs_sync_log(trans, inode->root, &ctx2);
6137
- if (ret)
6138
- return BTRFS_NEED_TRANS_COMMIT;
6139
- return BTRFS_DONT_NEED_TRANS_COMMIT;
6140
- }
6141
-
6142
- ASSERT(ctx);
6143
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
6144
- LOG_INODE_EXISTS, ctx);
6145
- if (ret == BTRFS_NO_LOG_SYNC)
6146
- return BTRFS_DONT_NEED_LOG_SYNC;
6147
- else if (ret)
6148
- return BTRFS_NEED_TRANS_COMMIT;
6149
-
6150
- return BTRFS_NEED_LOG_SYNC;
6542
+ btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
6543
+ ctx.logging_new_name = true;
6544
+ /*
6545
+ * We don't care about the return value. If we fail to log the new name
6546
+ * then we know the next attempt to sync the log will fallback to a full
6547
+ * transaction commit (due to a call to btrfs_set_log_full_commit()), so
6548
+ * we don't need to worry about getting a log committed that has an
6549
+ * inconsistent state after a rename operation.
6550
+ */
6551
+ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
61516552 }
61526553