hc
2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/fs/btrfs/file.c
....@@ -26,6 +26,8 @@
2626 #include "volumes.h"
2727 #include "qgroup.h"
2828 #include "compression.h"
29
+#include "delalloc-space.h"
30
+#include "reflink.h"
2931
3032 static struct kmem_cache *btrfs_inode_defrag_cachep;
3133 /*
....@@ -273,34 +275,23 @@
273275 {
274276 struct btrfs_root *inode_root;
275277 struct inode *inode;
276
- struct btrfs_key key;
277278 struct btrfs_ioctl_defrag_range_args range;
278279 int num_defrag;
279
- int index;
280280 int ret;
281281
282282 /* get the inode */
283
- key.objectid = defrag->root;
284
- key.type = BTRFS_ROOT_ITEM_KEY;
285
- key.offset = (u64)-1;
286
-
287
- index = srcu_read_lock(&fs_info->subvol_srcu);
288
-
289
- inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
283
+ inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
290284 if (IS_ERR(inode_root)) {
291285 ret = PTR_ERR(inode_root);
292286 goto cleanup;
293287 }
294288
295
- key.objectid = defrag->ino;
296
- key.type = BTRFS_INODE_ITEM_KEY;
297
- key.offset = 0;
298
- inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
289
+ inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
290
+ btrfs_put_root(inode_root);
299291 if (IS_ERR(inode)) {
300292 ret = PTR_ERR(inode);
301293 goto cleanup;
302294 }
303
- srcu_read_unlock(&fs_info->subvol_srcu, index);
304295
305296 /* do a chunk of defrag */
306297 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
....@@ -336,7 +327,6 @@
336327 iput(inode);
337328 return 0;
338329 cleanup:
339
- srcu_read_unlock(&fs_info->subvol_srcu, index);
340330 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
341331 return ret;
342332 }
....@@ -399,7 +389,7 @@
399389 size_t copied = 0;
400390 size_t total_copied = 0;
401391 int pg = 0;
402
- int offset = pos & (PAGE_SIZE - 1);
392
+ int offset = offset_in_page(pos);
403393
404394 while (write_bytes > 0) {
405395 size_t count = min_t(size_t,
....@@ -462,47 +452,6 @@
462452 }
463453 }
464454
465
-static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
466
- const u64 start,
467
- const u64 len,
468
- struct extent_state **cached_state)
469
-{
470
- u64 search_start = start;
471
- const u64 end = start + len - 1;
472
-
473
- while (search_start < end) {
474
- const u64 search_len = end - search_start + 1;
475
- struct extent_map *em;
476
- u64 em_len;
477
- int ret = 0;
478
-
479
- em = btrfs_get_extent(inode, NULL, 0, search_start,
480
- search_len, 0);
481
- if (IS_ERR(em))
482
- return PTR_ERR(em);
483
-
484
- if (em->block_start != EXTENT_MAP_HOLE)
485
- goto next;
486
-
487
- em_len = em->len;
488
- if (em->start < search_start)
489
- em_len -= search_start - em->start;
490
- if (em_len > search_len)
491
- em_len = search_len;
492
-
493
- ret = set_extent_bit(&inode->io_tree, search_start,
494
- search_start + em_len - 1,
495
- EXTENT_DELALLOC_NEW,
496
- NULL, cached_state, GFP_NOFS);
497
-next:
498
- search_start = extent_map_end(em);
499
- free_extent_map(em);
500
- if (ret)
501
- return ret;
502
- }
503
- return 0;
504
-}
505
-
506455 /*
507456 * after copy_from_user, pages need to be dirtied and we need to make
508457 * sure holes are created between the current EOF and the start of
....@@ -511,18 +460,18 @@
511460 * this also makes the decision about creating an inline extent vs
512461 * doing real data extents, marking pages dirty and delalloc as required.
513462 */
514
-int btrfs_dirty_pages(struct inode *inode, struct page **pages,
463
+int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
515464 size_t num_pages, loff_t pos, size_t write_bytes,
516465 struct extent_state **cached)
517466 {
518
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
467
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
519468 int err = 0;
520469 int i;
521470 u64 num_bytes;
522471 u64 start_pos;
523472 u64 end_of_last_block;
524473 u64 end_pos = pos + write_bytes;
525
- loff_t isize = i_size_read(inode);
474
+ loff_t isize = i_size_read(&inode->vfs_inode);
526475 unsigned int extra_bits = 0;
527476
528477 start_pos = pos & ~((u64) fs_info->sectorsize - 1);
....@@ -535,30 +484,12 @@
535484 * The pages may have already been dirty, clear out old accounting so
536485 * we can set things up properly
537486 */
538
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
539
- EXTENT_DIRTY | EXTENT_DELALLOC |
540
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached);
541
-
542
- if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
543
- if (start_pos >= isize &&
544
- !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
545
- /*
546
- * There can't be any extents following eof in this case
547
- * so just set the delalloc new bit for the range
548
- * directly.
549
- */
550
- extra_bits |= EXTENT_DELALLOC_NEW;
551
- } else {
552
- err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode),
553
- start_pos,
554
- num_bytes, cached);
555
- if (err)
556
- return err;
557
- }
558
- }
487
+ clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
488
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
489
+ 0, 0, cached);
559490
560491 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
561
- extra_bits, cached, 0);
492
+ extra_bits, cached);
562493 if (err)
563494 return err;
564495
....@@ -575,7 +506,7 @@
575506 * at this time.
576507 */
577508 if (end_pos > isize)
578
- i_size_write(inode, end_pos);
509
+ i_size_write(&inode->vfs_inode, end_pos);
579510 return 0;
580511 }
581512
....@@ -666,7 +597,6 @@
666597 }
667598
668599 split->generation = gen;
669
- split->bdev = em->bdev;
670600 split->flags = flags;
671601 split->compress_type = em->compress_type;
672602 replace_extent_mapping(em_tree, em, split, modified);
....@@ -679,7 +609,6 @@
679609
680610 split->start = start + len;
681611 split->len = em->start + em->len - (start + len);
682
- split->bdev = em->bdev;
683612 split->flags = flags;
684613 split->compress_type = em->compress_type;
685614 split->generation = gen;
....@@ -744,7 +673,7 @@
744673 * is deleted from the tree.
745674 */
746675 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
747
- struct btrfs_root *root, struct inode *inode,
676
+ struct btrfs_root *root, struct btrfs_inode *inode,
748677 struct btrfs_path *path, u64 start, u64 end,
749678 u64 *drop_end, int drop_cache,
750679 int replace_extent,
....@@ -754,9 +683,11 @@
754683 struct btrfs_fs_info *fs_info = root->fs_info;
755684 struct extent_buffer *leaf;
756685 struct btrfs_file_extent_item *fi;
686
+ struct btrfs_ref ref = { 0 };
757687 struct btrfs_key key;
758688 struct btrfs_key new_key;
759
- u64 ino = btrfs_ino(BTRFS_I(inode));
689
+ struct inode *vfs_inode = &inode->vfs_inode;
690
+ u64 ino = btrfs_ino(inode);
760691 u64 search_start = start;
761692 u64 disk_bytenr = 0;
762693 u64 num_bytes = 0;
....@@ -774,13 +705,12 @@
774705 int leafs_visited = 0;
775706
776707 if (drop_cache)
777
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0);
708
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
778709
779
- if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
710
+ if (start >= inode->disk_i_size && !replace_extent)
780711 modify_tree = 0;
781712
782
- update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
783
- root == fs_info->tree_root);
713
+ update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
784714 while (1) {
785715 recow = 0;
786716 ret = btrfs_lookup_file_extent(trans, root, path, ino,
....@@ -909,11 +839,14 @@
909839 btrfs_mark_buffer_dirty(leaf);
910840
911841 if (update_refs && disk_bytenr > 0) {
912
- ret = btrfs_inc_extent_ref(trans, root,
913
- disk_bytenr, num_bytes, 0,
842
+ btrfs_init_generic_ref(&ref,
843
+ BTRFS_ADD_DELAYED_REF,
844
+ disk_bytenr, num_bytes, 0);
845
+ btrfs_init_data_ref(&ref,
914846 root->root_key.objectid,
915847 new_key.objectid,
916848 start - extent_offset);
849
+ ret = btrfs_inc_extent_ref(trans, &ref);
917850 BUG_ON(ret); /* -ENOMEM */
918851 }
919852 key.offset = start;
....@@ -944,7 +877,7 @@
944877 extent_end - end);
945878 btrfs_mark_buffer_dirty(leaf);
946879 if (update_refs && disk_bytenr > 0)
947
- inode_sub_bytes(inode, end - key.offset);
880
+ inode_sub_bytes(vfs_inode, end - key.offset);
948881 break;
949882 }
950883
....@@ -964,7 +897,7 @@
964897 start - key.offset);
965898 btrfs_mark_buffer_dirty(leaf);
966899 if (update_refs && disk_bytenr > 0)
967
- inode_sub_bytes(inode, extent_end - start);
900
+ inode_sub_bytes(vfs_inode, extent_end - start);
968901 if (end == extent_end)
969902 break;
970903
....@@ -988,18 +921,21 @@
988921
989922 if (update_refs &&
990923 extent_type == BTRFS_FILE_EXTENT_INLINE) {
991
- inode_sub_bytes(inode,
924
+ inode_sub_bytes(vfs_inode,
992925 extent_end - key.offset);
993926 extent_end = ALIGN(extent_end,
994927 fs_info->sectorsize);
995928 } else if (update_refs && disk_bytenr > 0) {
996
- ret = btrfs_free_extent(trans, root,
997
- disk_bytenr, num_bytes, 0,
929
+ btrfs_init_generic_ref(&ref,
930
+ BTRFS_DROP_DELAYED_REF,
931
+ disk_bytenr, num_bytes, 0);
932
+ btrfs_init_data_ref(&ref,
998933 root->root_key.objectid,
999
- key.objectid, key.offset -
1000
- extent_offset);
934
+ key.objectid,
935
+ key.offset - extent_offset);
936
+ ret = btrfs_free_extent(trans, &ref);
1001937 BUG_ON(ret); /* -ENOMEM */
1002
- inode_sub_bytes(inode,
938
+ inode_sub_bytes(vfs_inode,
1003939 extent_end - key.offset);
1004940 }
1005941
....@@ -1025,7 +961,7 @@
1025961 continue;
1026962 }
1027963
1028
- BUG_ON(1);
964
+ BUG();
1029965 }
1030966
1031967 if (!ret && del_nr > 0) {
....@@ -1050,7 +986,7 @@
1050986 if (!ret && replace_extent && leafs_visited == 1 &&
1051987 (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
1052988 path->locks[0] == BTRFS_WRITE_LOCK) &&
1053
- btrfs_leaf_free_space(fs_info, leaf) >=
989
+ btrfs_leaf_free_space(leaf) >=
1054990 sizeof(struct btrfs_item) + extent_item_size) {
1055991
1056992 key.objectid = ino;
....@@ -1063,11 +999,7 @@
1063999 if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
10641000 path->slots[0]++;
10651001 }
1066
- setup_items_for_insert(root, path, &key,
1067
- &extent_item_size,
1068
- extent_item_size,
1069
- sizeof(struct btrfs_item) +
1070
- extent_item_size, 1);
1002
+ setup_items_for_insert(root, path, &key, &extent_item_size, 1);
10711003 *key_inserted = 1;
10721004 }
10731005
....@@ -1088,8 +1020,8 @@
10881020 path = btrfs_alloc_path();
10891021 if (!path)
10901022 return -ENOMEM;
1091
- ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
1092
- drop_cache, 0, 0, NULL);
1023
+ ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start,
1024
+ end, NULL, drop_cache, 0, 0, NULL);
10931025 btrfs_free_path(path);
10941026 return ret;
10951027 }
....@@ -1142,6 +1074,7 @@
11421074 struct extent_buffer *leaf;
11431075 struct btrfs_path *path;
11441076 struct btrfs_file_extent_item *fi;
1077
+ struct btrfs_ref ref = { 0 };
11451078 struct btrfs_key key;
11461079 struct btrfs_key new_key;
11471080 u64 bytenr;
....@@ -1287,9 +1220,11 @@
12871220 extent_end - split);
12881221 btrfs_mark_buffer_dirty(leaf);
12891222
1290
- ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
1291
- 0, root->root_key.objectid,
1292
- ino, orig_offset);
1223
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
1224
+ num_bytes, 0);
1225
+ btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
1226
+ orig_offset);
1227
+ ret = btrfs_inc_extent_ref(trans, &ref);
12931228 if (ret) {
12941229 btrfs_abort_transaction(trans, ret);
12951230 goto out;
....@@ -1311,6 +1246,9 @@
13111246
13121247 other_start = end;
13131248 other_end = 0;
1249
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1250
+ num_bytes, 0);
1251
+ btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
13141252 if (extent_mergeable(leaf, path->slots[0] + 1,
13151253 ino, bytenr, orig_offset,
13161254 &other_start, &other_end)) {
....@@ -1321,9 +1259,7 @@
13211259 extent_end = other_end;
13221260 del_slot = path->slots[0] + 1;
13231261 del_nr++;
1324
- ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1325
- 0, root->root_key.objectid,
1326
- ino, orig_offset);
1262
+ ret = btrfs_free_extent(trans, &ref);
13271263 if (ret) {
13281264 btrfs_abort_transaction(trans, ret);
13291265 goto out;
....@@ -1341,9 +1277,7 @@
13411277 key.offset = other_start;
13421278 del_slot = path->slots[0];
13431279 del_nr++;
1344
- ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1345
- 0, root->root_key.objectid,
1346
- ino, orig_offset);
1280
+ ret = btrfs_free_extent(trans, &ref);
13471281 if (ret) {
13481282 btrfs_abort_transaction(trans, ret);
13491283 goto out;
....@@ -1481,9 +1415,7 @@
14811415 int ret = 0;
14821416
14831417 start_pos = round_down(pos, fs_info->sectorsize);
1484
- last_pos = start_pos
1485
- + round_up(pos + write_bytes - start_pos,
1486
- fs_info->sectorsize) - 1;
1418
+ last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
14871419
14881420 if (start_pos < inode->vfs_inode.i_size) {
14891421 struct btrfs_ordered_extent *ordered;
....@@ -1493,7 +1425,7 @@
14931425 ordered = btrfs_lookup_ordered_range(inode, start_pos,
14941426 last_pos - start_pos + 1);
14951427 if (ordered &&
1496
- ordered->file_offset + ordered->len > start_pos &&
1428
+ ordered->file_offset + ordered->num_bytes > start_pos &&
14971429 ordered->file_offset <= last_pos) {
14981430 unlock_extent_cached(&inode->io_tree, start_pos,
14991431 last_pos, cached_state);
....@@ -1501,8 +1433,7 @@
15011433 unlock_page(pages[i]);
15021434 put_page(pages[i]);
15031435 }
1504
- btrfs_start_ordered_extent(&inode->vfs_inode,
1505
- ordered, 1);
1436
+ btrfs_start_ordered_extent(ordered, 1);
15061437 btrfs_put_ordered_extent(ordered);
15071438 return -EAGAIN;
15081439 }
....@@ -1536,50 +1467,94 @@
15361467 return ret;
15371468 }
15381469
1539
-static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
1540
- size_t *write_bytes)
1470
+static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
1471
+ size_t *write_bytes, bool nowait)
15411472 {
15421473 struct btrfs_fs_info *fs_info = inode->root->fs_info;
15431474 struct btrfs_root *root = inode->root;
1544
- struct btrfs_ordered_extent *ordered;
15451475 u64 lockstart, lockend;
15461476 u64 num_bytes;
15471477 int ret;
15481478
1549
- ret = btrfs_start_write_no_snapshotting(root);
1550
- if (!ret)
1551
- return -ENOSPC;
1479
+ if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1480
+ return 0;
1481
+
1482
+ if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
1483
+ return -EAGAIN;
15521484
15531485 lockstart = round_down(pos, fs_info->sectorsize);
15541486 lockend = round_up(pos + *write_bytes,
15551487 fs_info->sectorsize) - 1;
1488
+ num_bytes = lockend - lockstart + 1;
15561489
1557
- while (1) {
1558
- lock_extent(&inode->io_tree, lockstart, lockend);
1490
+ if (nowait) {
1491
+ struct btrfs_ordered_extent *ordered;
1492
+
1493
+ if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
1494
+ return -EAGAIN;
1495
+
15591496 ordered = btrfs_lookup_ordered_range(inode, lockstart,
1560
- lockend - lockstart + 1);
1561
- if (!ordered) {
1562
- break;
1497
+ num_bytes);
1498
+ if (ordered) {
1499
+ btrfs_put_ordered_extent(ordered);
1500
+ ret = -EAGAIN;
1501
+ goto out_unlock;
15631502 }
1564
- unlock_extent(&inode->io_tree, lockstart, lockend);
1565
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
1566
- btrfs_put_ordered_extent(ordered);
1503
+ } else {
1504
+ btrfs_lock_and_flush_ordered_range(inode, lockstart,
1505
+ lockend, NULL);
15671506 }
15681507
1569
- num_bytes = lockend - lockstart + 1;
15701508 ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
1571
- NULL, NULL, NULL);
1509
+ NULL, NULL, NULL, false);
15721510 if (ret <= 0) {
15731511 ret = 0;
1574
- btrfs_end_write_no_snapshotting(root);
1512
+ if (!nowait)
1513
+ btrfs_drew_write_unlock(&root->snapshot_lock);
15751514 } else {
15761515 *write_bytes = min_t(size_t, *write_bytes ,
15771516 num_bytes - pos + lockstart);
15781517 }
1579
-
1518
+out_unlock:
15801519 unlock_extent(&inode->io_tree, lockstart, lockend);
15811520
15821521 return ret;
1522
+}
1523
+
1524
+static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
1525
+ size_t *write_bytes)
1526
+{
1527
+ return check_can_nocow(inode, pos, write_bytes, true);
1528
+}
1529
+
1530
+/*
1531
+ * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
1532
+ *
1533
+ * @pos: File offset
1534
+ * @write_bytes: The length to write, will be updated to the nocow writeable
1535
+ * range
1536
+ *
1537
+ * This function will flush ordered extents in the range to ensure proper
1538
+ * nocow checks.
1539
+ *
1540
+ * Return:
1541
+ * >0 and update @write_bytes if we can do nocow write
1542
+ * 0 if we can't do nocow write
1543
+ * -EAGAIN if we can't get the needed lock or there are ordered extents
1544
+ * for * (nowait == true) case
1545
+ * <0 if other error happened
1546
+ *
1547
+ * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
1548
+ */
1549
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
1550
+ size_t *write_bytes)
1551
+{
1552
+ return check_can_nocow(inode, pos, write_bytes, false);
1553
+}
1554
+
1555
+void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1556
+{
1557
+ btrfs_drew_write_unlock(&inode->root->snapshot_lock);
15831558 }
15841559
15851560 static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
....@@ -1589,7 +1564,6 @@
15891564 loff_t pos = iocb->ki_pos;
15901565 struct inode *inode = file_inode(file);
15911566 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1592
- struct btrfs_root *root = BTRFS_I(inode)->root;
15931567 struct page **pages = NULL;
15941568 struct extent_changeset *data_reserved = NULL;
15951569 u64 release_bytes = 0;
....@@ -1610,8 +1584,8 @@
16101584 return -ENOMEM;
16111585
16121586 while (iov_iter_count(i) > 0) {
1613
- size_t offset = pos & (PAGE_SIZE - 1);
16141587 struct extent_state *cached_state = NULL;
1588
+ size_t offset = offset_in_page(pos);
16151589 size_t sector_offset;
16161590 size_t write_bytes = min(iov_iter_count(i),
16171591 nrptrs * (size_t)PAGE_SIZE -
....@@ -1642,13 +1616,12 @@
16421616 fs_info->sectorsize);
16431617
16441618 extent_changeset_release(data_reserved);
1645
- ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
1619
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
1620
+ &data_reserved, pos,
16461621 write_bytes);
16471622 if (ret < 0) {
1648
- if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1649
- BTRFS_INODE_PREALLOC)) &&
1650
- check_can_nocow(BTRFS_I(inode), pos,
1651
- &write_bytes) > 0) {
1623
+ if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1624
+ &write_bytes) > 0) {
16521625 /*
16531626 * For nodata cow case, no need to reserve
16541627 * data space.
....@@ -1673,11 +1646,11 @@
16731646 reserve_bytes);
16741647 if (ret) {
16751648 if (!only_release_metadata)
1676
- btrfs_free_reserved_data_space(inode,
1649
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
16771650 data_reserved, pos,
16781651 write_bytes);
16791652 else
1680
- btrfs_end_write_no_snapshotting(root);
1653
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
16811654 break;
16821655 }
16831656
....@@ -1747,7 +1720,7 @@
17471720 __pos = round_down(pos,
17481721 fs_info->sectorsize) +
17491722 (dirty_pages << PAGE_SHIFT);
1750
- btrfs_delalloc_release_space(inode,
1723
+ btrfs_delalloc_release_space(BTRFS_I(inode),
17511724 data_reserved, __pos,
17521725 release_bytes, true);
17531726 }
....@@ -1757,8 +1730,9 @@
17571730 fs_info->sectorsize);
17581731
17591732 if (copied > 0)
1760
- ret = btrfs_dirty_pages(inode, pages, dirty_pages,
1761
- pos, copied, &cached_state);
1733
+ ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
1734
+ dirty_pages, pos, copied,
1735
+ &cached_state);
17621736
17631737 /*
17641738 * If we have not locked the extent range, because the range's
....@@ -1781,7 +1755,7 @@
17811755
17821756 release_bytes = 0;
17831757 if (only_release_metadata)
1784
- btrfs_end_write_no_snapshotting(root);
1758
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
17851759
17861760 if (only_release_metadata && copied > 0) {
17871761 lockstart = round_down(pos,
....@@ -1799,8 +1773,6 @@
17991773 cond_resched();
18001774
18011775 balance_dirty_pages_ratelimited(inode->i_mapping);
1802
- if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1)
1803
- btrfs_btree_balance_dirty(fs_info);
18041776
18051777 pos += copied;
18061778 num_written += copied;
....@@ -1810,11 +1782,12 @@
18101782
18111783 if (release_bytes) {
18121784 if (only_release_metadata) {
1813
- btrfs_end_write_no_snapshotting(root);
1785
+ btrfs_check_nocow_unlock(BTRFS_I(inode));
18141786 btrfs_delalloc_release_metadata(BTRFS_I(inode),
18151787 release_bytes, true);
18161788 } else {
1817
- btrfs_delalloc_release_space(inode, data_reserved,
1789
+ btrfs_delalloc_release_space(BTRFS_I(inode),
1790
+ data_reserved,
18181791 round_down(pos, fs_info->sectorsize),
18191792 release_bytes, true);
18201793 }
....@@ -1834,7 +1807,7 @@
18341807 loff_t endbyte;
18351808 int err;
18361809
1837
- written = generic_file_direct_write(iocb, from);
1810
+ written = btrfs_direct_IO(iocb, from);
18381811
18391812 if (written < 0 || !iov_iter_count(from))
18401813 return written;
....@@ -1888,11 +1861,10 @@
18881861 struct file *file = iocb->ki_filp;
18891862 struct inode *inode = file_inode(file);
18901863 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1891
- struct btrfs_root *root = BTRFS_I(inode)->root;
18921864 u64 start_pos;
18931865 u64 end_pos;
18941866 ssize_t num_written = 0;
1895
- bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1867
+ const bool sync = iocb->ki_flags & IOCB_DSYNC;
18961868 ssize_t err;
18971869 loff_t pos;
18981870 size_t count;
....@@ -1919,13 +1891,23 @@
19191891 pos = iocb->ki_pos;
19201892 count = iov_iter_count(from);
19211893 if (iocb->ki_flags & IOCB_NOWAIT) {
1894
+ size_t nocow_bytes = count;
1895
+
19221896 /*
19231897 * We will allocate space in case nodatacow is not set,
19241898 * so bail
19251899 */
1926
- if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1927
- BTRFS_INODE_PREALLOC)) ||
1928
- check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
1900
+ if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes)
1901
+ <= 0) {
1902
+ inode_unlock(inode);
1903
+ return -EAGAIN;
1904
+ }
1905
+ /*
1906
+ * There are holes in the range or parts of the range that must
1907
+ * be COWed (shared extents, RO block groups, etc), so just bail
1908
+ * out.
1909
+ */
1910
+ if (nocow_bytes < count) {
19291911 inode_unlock(inode);
19301912 return -EAGAIN;
19311913 }
....@@ -1977,7 +1959,40 @@
19771959 atomic_inc(&BTRFS_I(inode)->sync_writers);
19781960
19791961 if (iocb->ki_flags & IOCB_DIRECT) {
1962
+ /*
1963
+ * 1. We must always clear IOCB_DSYNC in order to not deadlock
1964
+ * in iomap, as it calls generic_write_sync() in this case.
1965
+ * 2. If we are async, we can call iomap_dio_complete() either
1966
+ * in
1967
+ *
1968
+ * 2.1. A worker thread from the last bio completed. In this
1969
+ * case we need to mark the btrfs_dio_data that it is
1970
+ * async in order to call generic_write_sync() properly.
1971
+ * This is handled by setting BTRFS_DIO_SYNC_STUB in the
1972
+ * current->journal_info.
1973
+ * 2.2 The submitter context, because all IO completed
1974
+ * before we exited iomap_dio_rw(). In this case we can
1975
+ * just re-set the IOCB_DSYNC on the iocb and we'll do
1976
+ * the sync below. If our ->end_io() gets called and
1977
+ * current->journal_info is set, then we know we're in
1978
+ * our current context and we will clear
1979
+ * current->journal_info to indicate that we need to
1980
+ * sync below.
1981
+ */
1982
+ if (sync) {
1983
+ ASSERT(current->journal_info == NULL);
1984
+ iocb->ki_flags &= ~IOCB_DSYNC;
1985
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
1986
+ }
19801987 num_written = __btrfs_direct_write(iocb, from);
1988
+
1989
+ /*
1990
+ * As stated above, we cleared journal_info, so we need to do
1991
+ * the sync ourselves.
1992
+ */
1993
+ if (sync && current->journal_info == NULL)
1994
+ iocb->ki_flags |= IOCB_DSYNC;
1995
+ current->journal_info = NULL;
19811996 } else {
19821997 num_written = btrfs_buffered_write(iocb, from);
19831998 if (num_written > 0)
....@@ -1989,14 +2004,8 @@
19892004
19902005 inode_unlock(inode);
19912006
1992
- /*
1993
- * We also have to set last_sub_trans to the current log transid,
1994
- * otherwise subsequent syncs to a file that's been synced in this
1995
- * transaction will appear to have already occurred.
1996
- */
1997
- spin_lock(&BTRFS_I(inode)->lock);
1998
- BTRFS_I(inode)->last_sub_trans = root->log_transid;
1999
- spin_unlock(&BTRFS_I(inode)->lock);
2007
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
2008
+
20002009 if (num_written > 0)
20012010 num_written = generic_write_sync(iocb, num_written);
20022011
....@@ -2017,12 +2026,12 @@
20172026 filp->private_data = NULL;
20182027
20192028 /*
2020
- * ordered_data_close is set by settattr when we are about to truncate
2021
- * a file from a non-zero size to a zero size. This tries to
2022
- * flush down new bytes that may have been written if the
2023
- * application were using truncate to replace a file in place.
2029
+ * Set by setattr when we are about to truncate a file from a non-zero
2030
+ * size to a zero size. This tries to flush down new bytes that may
2031
+ * have been written if the application were using truncate to replace
2032
+ * a file in place.
20242033 */
2025
- if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
2034
+ if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
20262035 &BTRFS_I(inode)->runtime_flags))
20272036 filemap_flush(inode->i_mapping);
20282037 return 0;
....@@ -2048,6 +2057,30 @@
20482057 return ret;
20492058 }
20502059
2060
+static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
2061
+{
2062
+ struct btrfs_inode *inode = BTRFS_I(ctx->inode);
2063
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
2064
+
2065
+ if (btrfs_inode_in_log(inode, fs_info->generation) &&
2066
+ list_empty(&ctx->ordered_extents))
2067
+ return true;
2068
+
2069
+ /*
2070
+ * If we are doing a fast fsync we can not bail out if the inode's
2071
+ * last_trans is <= than the last committed transaction, because we only
2072
+ * update the last_trans of the inode during ordered extent completion,
2073
+ * and for a fast fsync we don't wait for that, we only wait for the
2074
+ * writeback to complete.
2075
+ */
2076
+ if (inode->last_trans <= fs_info->last_trans_committed &&
2077
+ (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
2078
+ list_empty(&ctx->ordered_extents)))
2079
+ return true;
2080
+
2081
+ return false;
2082
+}
2083
+
20512084 /*
20522085 * fsync call for both files and directories. This logs the inode into
20532086 * the tree log instead of forcing full commits whenever possible.
....@@ -2063,25 +2096,28 @@
20632096 {
20642097 struct dentry *dentry = file_dentry(file);
20652098 struct inode *inode = d_inode(dentry);
2066
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
20672099 struct btrfs_root *root = BTRFS_I(inode)->root;
20682100 struct btrfs_trans_handle *trans;
20692101 struct btrfs_log_ctx ctx;
20702102 int ret = 0, err;
2103
+ u64 len;
2104
+ bool full_sync;
20712105
20722106 trace_btrfs_sync_file(file, datasync);
20732107
20742108 btrfs_init_log_ctx(&ctx, inode);
20752109
20762110 /*
2077
- * Set the range to full if the NO_HOLES feature is not enabled.
2078
- * This is to avoid missing file extent items representing holes after
2079
- * replaying the log.
2111
+ * Always set the range to a full range, otherwise we can get into
2112
+ * several problems, from missing file extent items to represent holes
2113
+ * when not using the NO_HOLES feature, to log tree corruption due to
2114
+ * races between hole detection during logging and completion of ordered
2115
+ * extents outside the range, to missing checksums due to ordered extents
2116
+ * for which we flushed only a subset of their pages.
20802117 */
2081
- if (!btrfs_fs_incompat(fs_info, NO_HOLES)) {
2082
- start = 0;
2083
- end = LLONG_MAX;
2084
- }
2118
+ start = 0;
2119
+ end = LLONG_MAX;
2120
+ len = (u64)LLONG_MAX + 1;
20852121
20862122 /*
20872123 * We write the dirty pages in the range and wait until they complete
....@@ -2105,17 +2141,12 @@
21052141 atomic_inc(&root->log_batch);
21062142
21072143 /*
2108
- * If the inode needs a full sync, make sure we use a full range to
2109
- * avoid log tree corruption, due to hole detection racing with ordered
2110
- * extent completion for adjacent ranges, and assertion failures during
2111
- * hole detection. Do this while holding the inode lock, to avoid races
2112
- * with other tasks.
2144
+ * Always check for the full sync flag while holding the inode's lock,
2145
+ * to avoid races with other tasks. The flag must be either set all the
2146
+ * time during logging or always off all the time while logging.
21132147 */
2114
- if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2115
- &BTRFS_I(inode)->runtime_flags)) {
2116
- start = 0;
2117
- end = LLONG_MAX;
2118
- }
2148
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2149
+ &BTRFS_I(inode)->runtime_flags);
21192150
21202151 /*
21212152 * Before we acquired the inode's lock, someone may have dirtied more
....@@ -2144,22 +2175,35 @@
21442175
21452176 /*
21462177 * We have to do this here to avoid the priority inversion of waiting on
2147
- * IO of a lower priority task while holding a transaciton open.
2178
+ * IO of a lower priority task while holding a transaction open.
21482179 *
2149
- * Also, the range length can be represented by u64, we have to do the
2150
- * typecasts to avoid signed overflow if it's [0, LLONG_MAX].
2180
+ * For a full fsync we wait for the ordered extents to complete while
2181
+ * for a fast fsync we wait just for writeback to complete, and then
2182
+ * attach the ordered extents to the transaction so that a transaction
2183
+ * commit waits for their completion, to avoid data loss if we fsync,
2184
+ * the current transaction commits before the ordered extents complete
2185
+ * and a power failure happens right after that.
21512186 */
2152
- ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1);
2153
- if (ret) {
2154
- up_write(&BTRFS_I(inode)->dio_sem);
2155
- inode_unlock(inode);
2156
- goto out;
2187
+ if (full_sync) {
2188
+ ret = btrfs_wait_ordered_range(inode, start, len);
2189
+ } else {
2190
+ /*
2191
+ * Get our ordered extents as soon as possible to avoid doing
2192
+ * checksum lookups in the csum tree, and use instead the
2193
+ * checksums attached to the ordered extents.
2194
+ */
2195
+ btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
2196
+ &ctx.ordered_extents);
2197
+ ret = filemap_fdatawait_range(inode->i_mapping, start, end);
21572198 }
2199
+
2200
+ if (ret)
2201
+ goto out_release_extents;
2202
+
21582203 atomic_inc(&root->log_batch);
21592204
21602205 smp_mb();
2161
- if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
2162
- BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) {
2206
+ if (skip_inode_logging(&ctx)) {
21632207 /*
21642208 * We've had everything committed since the last time we were
21652209 * modified so clear this flag in case it was set for whatever
....@@ -2175,9 +2219,7 @@
21752219 * checked called fsync.
21762220 */
21772221 ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
2178
- up_write(&BTRFS_I(inode)->dio_sem);
2179
- inode_unlock(inode);
2180
- goto out;
2222
+ goto out_release_extents;
21812223 }
21822224
21832225 /*
....@@ -2187,20 +2229,18 @@
21872229 * here we could get into a situation where we're waiting on IO to
21882230 * happen that is blocked on a transaction trying to commit. With start
21892231 * we inc the extwriter counter, so we wait for all extwriters to exit
2190
- * before we start blocking join'ers. This comment is to keep somebody
2232
+ * before we start blocking joiners. This comment is to keep somebody
21912233 * from thinking they are super smart and changing this to
21922234 * btrfs_join_transaction *cough*Josef*cough*.
21932235 */
21942236 trans = btrfs_start_transaction(root, 0);
21952237 if (IS_ERR(trans)) {
21962238 ret = PTR_ERR(trans);
2197
- up_write(&BTRFS_I(inode)->dio_sem);
2198
- inode_unlock(inode);
2199
- goto out;
2239
+ goto out_release_extents;
22002240 }
2201
- trans->sync = true;
22022241
2203
- ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
2242
+ ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
2243
+ btrfs_release_log_ctx_extents(&ctx);
22042244 if (ret < 0) {
22052245 /* Fallthrough and commit/free transaction. */
22062246 ret = 1;
....@@ -2219,30 +2259,18 @@
22192259 up_write(&BTRFS_I(inode)->dio_sem);
22202260 inode_unlock(inode);
22212261
2222
- /*
2223
- * If any of the ordered extents had an error, just return it to user
2224
- * space, so that the application knows some writes didn't succeed and
2225
- * can take proper action (retry for e.g.). Blindly committing the
2226
- * transaction in this case, would fool userspace that everything was
2227
- * successful. And we also want to make sure our log doesn't contain
2228
- * file extent items pointing to extents that weren't fully written to -
2229
- * just like in the non fast fsync path, where we check for the ordered
2230
- * operation's error flag before writing to the log tree and return -EIO
2231
- * if any of them had this flag set (btrfs_wait_ordered_range) -
2232
- * therefore we need to check for errors in the ordered operations,
2233
- * which are indicated by ctx.io_err.
2234
- */
2235
- if (ctx.io_err) {
2236
- btrfs_end_transaction(trans);
2237
- ret = ctx.io_err;
2238
- goto out;
2239
- }
2240
-
22412262 if (ret != BTRFS_NO_LOG_SYNC) {
22422263 if (!ret) {
22432264 ret = btrfs_sync_log(trans, root, &ctx);
22442265 if (!ret) {
22452266 ret = btrfs_end_transaction(trans);
2267
+ goto out;
2268
+ }
2269
+ }
2270
+ if (!full_sync) {
2271
+ ret = btrfs_wait_ordered_range(inode, start, len);
2272
+ if (ret) {
2273
+ btrfs_end_transaction(trans);
22462274 goto out;
22472275 }
22482276 }
....@@ -2256,6 +2284,12 @@
22562284 if (!ret)
22572285 ret = err;
22582286 return ret > 0 ? -EIO : ret;
2287
+
2288
+out_release_extents:
2289
+ btrfs_release_log_ctx_extents(&ctx);
2290
+ up_write(&BTRFS_I(inode)->dio_sem);
2291
+ inode_unlock(inode);
2292
+ goto out;
22592293 }
22602294
22612295 static const struct vm_operations_struct btrfs_file_vm_ops = {
....@@ -2391,7 +2425,6 @@
23912425 hole_em->block_start = EXTENT_MAP_HOLE;
23922426 hole_em->block_len = 0;
23932427 hole_em->orig_block_len = 0;
2394
- hole_em->bdev = fs_info->fs_devices->latest_bdev;
23952428 hole_em->compress_type = BTRFS_COMPRESS_NONE;
23962429 hole_em->generation = trans->transid;
23972430
....@@ -2424,7 +2457,7 @@
24242457
24252458 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
24262459 round_down(*start, fs_info->sectorsize),
2427
- round_up(*len, fs_info->sectorsize), 0);
2460
+ round_up(*len, fs_info->sectorsize));
24282461 if (IS_ERR(em))
24292462 return PTR_ERR(em);
24302463
....@@ -2452,7 +2485,8 @@
24522485
24532486 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
24542487 cached_state);
2455
- ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
2488
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
2489
+ lockend);
24562490
24572491 /*
24582492 * We need to make sure we have no ordered extents in this range
....@@ -2460,7 +2494,7 @@
24602494 * we need to try again.
24612495 */
24622496 if ((!ordered ||
2463
- (ordered->file_offset + ordered->len <= lockstart ||
2497
+ (ordered->file_offset + ordered->num_bytes <= lockstart ||
24642498 ordered->file_offset > lockend)) &&
24652499 !filemap_range_has_page(inode->i_mapping,
24662500 lockstart, lockend)) {
....@@ -2480,27 +2514,340 @@
24802514 return 0;
24812515 }
24822516
2483
-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2517
+static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2518
+ struct inode *inode,
2519
+ struct btrfs_path *path,
2520
+ struct btrfs_replace_extent_info *extent_info,
2521
+ const u64 replace_len)
24842522 {
2523
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2524
+ struct btrfs_root *root = BTRFS_I(inode)->root;
2525
+ struct btrfs_file_extent_item *extent;
2526
+ struct extent_buffer *leaf;
2527
+ struct btrfs_key key;
2528
+ int slot;
2529
+ struct btrfs_ref ref = { 0 };
2530
+ int ret;
2531
+
2532
+ if (replace_len == 0)
2533
+ return 0;
2534
+
2535
+ if (extent_info->disk_offset == 0 &&
2536
+ btrfs_fs_incompat(fs_info, NO_HOLES))
2537
+ return 0;
2538
+
2539
+ key.objectid = btrfs_ino(BTRFS_I(inode));
2540
+ key.type = BTRFS_EXTENT_DATA_KEY;
2541
+ key.offset = extent_info->file_offset;
2542
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
2543
+ sizeof(struct btrfs_file_extent_item));
2544
+ if (ret)
2545
+ return ret;
2546
+ leaf = path->nodes[0];
2547
+ slot = path->slots[0];
2548
+ write_extent_buffer(leaf, extent_info->extent_buf,
2549
+ btrfs_item_ptr_offset(leaf, slot),
2550
+ sizeof(struct btrfs_file_extent_item));
2551
+ extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2552
+ ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2553
+ btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2554
+ btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2555
+ if (extent_info->is_new_extent)
2556
+ btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2557
+ btrfs_mark_buffer_dirty(leaf);
2558
+ btrfs_release_path(path);
2559
+
2560
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
2561
+ extent_info->file_offset, replace_len);
2562
+ if (ret)
2563
+ return ret;
2564
+
2565
+ /* If it's a hole, nothing more needs to be done. */
2566
+ if (extent_info->disk_offset == 0)
2567
+ return 0;
2568
+
2569
+ inode_add_bytes(inode, replace_len);
2570
+
2571
+ if (extent_info->is_new_extent && extent_info->insertions == 0) {
2572
+ key.objectid = extent_info->disk_offset;
2573
+ key.type = BTRFS_EXTENT_ITEM_KEY;
2574
+ key.offset = extent_info->disk_len;
2575
+ ret = btrfs_alloc_reserved_file_extent(trans, root,
2576
+ btrfs_ino(BTRFS_I(inode)),
2577
+ extent_info->file_offset,
2578
+ extent_info->qgroup_reserved,
2579
+ &key);
2580
+ } else {
2581
+ u64 ref_offset;
2582
+
2583
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2584
+ extent_info->disk_offset,
2585
+ extent_info->disk_len, 0);
2586
+ ref_offset = extent_info->file_offset - extent_info->data_offset;
2587
+ btrfs_init_data_ref(&ref, root->root_key.objectid,
2588
+ btrfs_ino(BTRFS_I(inode)), ref_offset);
2589
+ ret = btrfs_inc_extent_ref(trans, &ref);
2590
+ }
2591
+
2592
+ extent_info->insertions++;
2593
+
2594
+ return ret;
2595
+}
2596
+
2597
+/*
2598
+ * The respective range must have been previously locked, as well as the inode.
2599
+ * The end offset is inclusive (last byte of the range).
2600
+ * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2601
+ * the file range with an extent.
2602
+ * When not punching a hole, we don't want to end up in a state where we dropped
2603
+ * extents without inserting a new one, so we must abort the transaction to avoid
2604
+ * a corruption.
2605
+ */
2606
+int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
2607
+ const u64 start, const u64 end,
2608
+ struct btrfs_replace_extent_info *extent_info,
2609
+ struct btrfs_trans_handle **trans_out)
2610
+{
2611
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2612
+ u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2613
+ u64 ino_size = round_up(inode->i_size, fs_info->sectorsize);
2614
+ struct btrfs_root *root = BTRFS_I(inode)->root;
2615
+ struct btrfs_trans_handle *trans = NULL;
2616
+ struct btrfs_block_rsv *rsv;
2617
+ unsigned int rsv_count;
2618
+ u64 cur_offset;
2619
+ u64 drop_end;
2620
+ u64 len = end - start;
2621
+ int ret = 0;
2622
+
2623
+ if (end <= start)
2624
+ return -EINVAL;
2625
+
2626
+ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2627
+ if (!rsv) {
2628
+ ret = -ENOMEM;
2629
+ goto out;
2630
+ }
2631
+ rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2632
+ rsv->failfast = 1;
2633
+
2634
+ /*
2635
+ * 1 - update the inode
2636
+ * 1 - removing the extents in the range
2637
+ * 1 - adding the hole extent if no_holes isn't set or if we are
2638
+ * replacing the range with a new extent
2639
+ */
2640
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2641
+ rsv_count = 3;
2642
+ else
2643
+ rsv_count = 2;
2644
+
2645
+ trans = btrfs_start_transaction(root, rsv_count);
2646
+ if (IS_ERR(trans)) {
2647
+ ret = PTR_ERR(trans);
2648
+ trans = NULL;
2649
+ goto out_free;
2650
+ }
2651
+
2652
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2653
+ min_size, false);
2654
+ BUG_ON(ret);
2655
+ trans->block_rsv = rsv;
2656
+
2657
+ cur_offset = start;
2658
+ while (cur_offset < end) {
2659
+ ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path,
2660
+ cur_offset, end + 1, &drop_end,
2661
+ 1, 0, 0, NULL);
2662
+ if (ret != -ENOSPC) {
2663
+ /*
2664
+ * The only time we don't want to abort is if we are
2665
+ * attempting to clone a partial inline extent, in which
2666
+ * case we'll get EOPNOTSUPP. However if we aren't
2667
+ * cloning we need to abort no matter what, because if we
2668
+ * got EOPNOTSUPP via prealloc then we messed up and
2669
+ * need to abort.
2670
+ */
2671
+ if (ret &&
2672
+ (ret != -EOPNOTSUPP ||
2673
+ (extent_info && extent_info->is_new_extent)))
2674
+ btrfs_abort_transaction(trans, ret);
2675
+ break;
2676
+ }
2677
+
2678
+ trans->block_rsv = &fs_info->trans_block_rsv;
2679
+
2680
+ if (!extent_info && cur_offset < drop_end &&
2681
+ cur_offset < ino_size) {
2682
+ ret = fill_holes(trans, BTRFS_I(inode), path,
2683
+ cur_offset, drop_end);
2684
+ if (ret) {
2685
+ /*
2686
+ * If we failed then we didn't insert our hole
2687
+ * entries for the area we dropped, so now the
2688
+ * fs is corrupted, so we must abort the
2689
+ * transaction.
2690
+ */
2691
+ btrfs_abort_transaction(trans, ret);
2692
+ break;
2693
+ }
2694
+ } else if (!extent_info && cur_offset < drop_end) {
2695
+ /*
2696
+ * We are past the i_size here, but since we didn't
2697
+ * insert holes we need to clear the mapped area so we
2698
+ * know to not set disk_i_size in this area until a new
2699
+ * file extent is inserted here.
2700
+ */
2701
+ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
2702
+ cur_offset, drop_end - cur_offset);
2703
+ if (ret) {
2704
+ /*
2705
+ * We couldn't clear our area, so we could
2706
+ * presumably adjust up and corrupt the fs, so
2707
+ * we need to abort.
2708
+ */
2709
+ btrfs_abort_transaction(trans, ret);
2710
+ break;
2711
+ }
2712
+ }
2713
+
2714
+ if (extent_info && drop_end > extent_info->file_offset) {
2715
+ u64 replace_len = drop_end - extent_info->file_offset;
2716
+
2717
+ ret = btrfs_insert_replace_extent(trans, inode, path,
2718
+ extent_info, replace_len);
2719
+ if (ret) {
2720
+ btrfs_abort_transaction(trans, ret);
2721
+ break;
2722
+ }
2723
+ extent_info->data_len -= replace_len;
2724
+ extent_info->data_offset += replace_len;
2725
+ extent_info->file_offset += replace_len;
2726
+ }
2727
+
2728
+ cur_offset = drop_end;
2729
+
2730
+ ret = btrfs_update_inode(trans, root, inode);
2731
+ if (ret)
2732
+ break;
2733
+
2734
+ btrfs_end_transaction(trans);
2735
+ btrfs_btree_balance_dirty(fs_info);
2736
+
2737
+ trans = btrfs_start_transaction(root, rsv_count);
2738
+ if (IS_ERR(trans)) {
2739
+ ret = PTR_ERR(trans);
2740
+ trans = NULL;
2741
+ break;
2742
+ }
2743
+
2744
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2745
+ rsv, min_size, false);
2746
+ BUG_ON(ret); /* shouldn't happen */
2747
+ trans->block_rsv = rsv;
2748
+
2749
+ if (!extent_info) {
2750
+ ret = find_first_non_hole(inode, &cur_offset, &len);
2751
+ if (unlikely(ret < 0))
2752
+ break;
2753
+ if (ret && !len) {
2754
+ ret = 0;
2755
+ break;
2756
+ }
2757
+ }
2758
+ }
2759
+
2760
+ /*
2761
+ * If we were cloning, force the next fsync to be a full one since we
2762
+ * replaced (or just dropped in the case of cloning holes when
2763
+ * NO_HOLES is enabled) extents and extent maps.
2764
+ * This is for the sake of simplicity, and cloning into files larger
2765
+ * than 16MB would force the full fsync anyway (when
2766
+ * try_release_extent_mapping() is invoked during page cache truncation).
2767
+ */
2768
+ if (extent_info && !extent_info->is_new_extent)
2769
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2770
+ &BTRFS_I(inode)->runtime_flags);
2771
+
2772
+ if (ret)
2773
+ goto out_trans;
2774
+
2775
+ trans->block_rsv = &fs_info->trans_block_rsv;
2776
+ /*
2777
+ * If we are using the NO_HOLES feature we might have already had a
2778
+ * hole that overlaps a part of the region [lockstart, lockend] and
2779
+ * ends at (or beyond) lockend. Since we have no file extent items to
2780
+ * represent holes, drop_end can be less than lockend and so we must
2781
+ * make sure we have an extent map representing the existing hole (the
2782
+ * call to __btrfs_drop_extents() might have dropped the existing extent
2783
+ * map representing the existing hole), otherwise the fast fsync path
2784
+ * will not record the existence of the hole region
2785
+ * [existing_hole_start, lockend].
2786
+ */
2787
+ if (drop_end <= end)
2788
+ drop_end = end + 1;
2789
+ /*
2790
+ * Don't insert file hole extent item if it's for a range beyond eof
2791
+ * (because it's useless) or if it represents a 0 bytes range (when
2792
+ * cur_offset == drop_end).
2793
+ */
2794
+ if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) {
2795
+ ret = fill_holes(trans, BTRFS_I(inode), path,
2796
+ cur_offset, drop_end);
2797
+ if (ret) {
2798
+ /* Same comment as above. */
2799
+ btrfs_abort_transaction(trans, ret);
2800
+ goto out_trans;
2801
+ }
2802
+ } else if (!extent_info && cur_offset < drop_end) {
2803
+ /* See the comment in the loop above for the reasoning here. */
2804
+ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
2805
+ cur_offset, drop_end - cur_offset);
2806
+ if (ret) {
2807
+ btrfs_abort_transaction(trans, ret);
2808
+ goto out_trans;
2809
+ }
2810
+
2811
+ }
2812
+ if (extent_info) {
2813
+ ret = btrfs_insert_replace_extent(trans, inode, path, extent_info,
2814
+ extent_info->data_len);
2815
+ if (ret) {
2816
+ btrfs_abort_transaction(trans, ret);
2817
+ goto out_trans;
2818
+ }
2819
+ }
2820
+
2821
+out_trans:
2822
+ if (!trans)
2823
+ goto out_free;
2824
+
2825
+ trans->block_rsv = &fs_info->trans_block_rsv;
2826
+ if (ret)
2827
+ btrfs_end_transaction(trans);
2828
+ else
2829
+ *trans_out = trans;
2830
+out_free:
2831
+ btrfs_free_block_rsv(fs_info, rsv);
2832
+out:
2833
+ return ret;
2834
+}
2835
+
2836
+static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2837
+{
2838
+ struct inode *inode = file_inode(file);
24852839 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
24862840 struct btrfs_root *root = BTRFS_I(inode)->root;
24872841 struct extent_state *cached_state = NULL;
24882842 struct btrfs_path *path;
2489
- struct btrfs_block_rsv *rsv;
2490
- struct btrfs_trans_handle *trans;
2843
+ struct btrfs_trans_handle *trans = NULL;
24912844 u64 lockstart;
24922845 u64 lockend;
24932846 u64 tail_start;
24942847 u64 tail_len;
24952848 u64 orig_start = offset;
2496
- u64 cur_offset;
2497
- u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1);
2498
- u64 drop_end;
24992849 int ret = 0;
2500
- int err = 0;
2501
- unsigned int rsv_count;
25022850 bool same_block;
2503
- bool no_holes = btrfs_fs_incompat(fs_info, NO_HOLES);
25042851 u64 ino_size;
25052852 bool truncated_block = false;
25062853 bool updated_inode = false;
....@@ -2520,9 +2867,13 @@
25202867 goto out_only_mutex;
25212868 }
25222869
2523
- lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
2870
+ ret = file_modified(file);
2871
+ if (ret)
2872
+ goto out_only_mutex;
2873
+
2874
+ lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
25242875 lockend = round_down(offset + len,
2525
- btrfs_inode_sectorsize(inode)) - 1;
2876
+ btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
25262877 same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
25272878 == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
25282879 /*
....@@ -2607,145 +2958,24 @@
26072958 goto out;
26082959 }
26092960
2610
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2611
- if (!rsv) {
2612
- ret = -ENOMEM;
2613
- goto out_free;
2614
- }
2615
- rsv->size = btrfs_calc_trans_metadata_size(fs_info, 1);
2616
- rsv->failfast = 1;
2961
+ ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL,
2962
+ &trans);
2963
+ btrfs_free_path(path);
2964
+ if (ret)
2965
+ goto out;
26172966
2618
- /*
2619
- * 1 - update the inode
2620
- * 1 - removing the extents in the range
2621
- * 1 - adding the hole extent if no_holes isn't set
2622
- */
2623
- rsv_count = no_holes ? 2 : 3;
2624
- trans = btrfs_start_transaction(root, rsv_count);
2625
- if (IS_ERR(trans)) {
2626
- err = PTR_ERR(trans);
2627
- goto out_free;
2628
- }
2629
-
2630
- ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2631
- min_size, 0);
2632
- BUG_ON(ret);
2633
- trans->block_rsv = rsv;
2634
-
2635
- cur_offset = lockstart;
2636
- len = lockend - cur_offset;
2637
- while (cur_offset < lockend) {
2638
- ret = __btrfs_drop_extents(trans, root, inode, path,
2639
- cur_offset, lockend + 1,
2640
- &drop_end, 1, 0, 0, NULL);
2641
- if (ret != -ENOSPC)
2642
- break;
2643
-
2644
- trans->block_rsv = &fs_info->trans_block_rsv;
2645
-
2646
- if (cur_offset < drop_end && cur_offset < ino_size) {
2647
- ret = fill_holes(trans, BTRFS_I(inode), path,
2648
- cur_offset, drop_end);
2649
- if (ret) {
2650
- /*
2651
- * If we failed then we didn't insert our hole
2652
- * entries for the area we dropped, so now the
2653
- * fs is corrupted, so we must abort the
2654
- * transaction.
2655
- */
2656
- btrfs_abort_transaction(trans, ret);
2657
- err = ret;
2658
- break;
2659
- }
2660
- }
2661
-
2662
- cur_offset = drop_end;
2663
-
2664
- ret = btrfs_update_inode(trans, root, inode);
2665
- if (ret) {
2666
- err = ret;
2667
- break;
2668
- }
2669
-
2670
- btrfs_end_transaction(trans);
2671
- btrfs_btree_balance_dirty(fs_info);
2672
-
2673
- trans = btrfs_start_transaction(root, rsv_count);
2674
- if (IS_ERR(trans)) {
2675
- ret = PTR_ERR(trans);
2676
- trans = NULL;
2677
- break;
2678
- }
2679
-
2680
- ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2681
- rsv, min_size, 0);
2682
- BUG_ON(ret); /* shouldn't happen */
2683
- trans->block_rsv = rsv;
2684
-
2685
- ret = find_first_non_hole(inode, &cur_offset, &len);
2686
- if (unlikely(ret < 0))
2687
- break;
2688
- if (ret && !len) {
2689
- ret = 0;
2690
- break;
2691
- }
2692
- }
2693
-
2694
- if (ret) {
2695
- err = ret;
2696
- goto out_trans;
2697
- }
2698
-
2699
- trans->block_rsv = &fs_info->trans_block_rsv;
2700
- /*
2701
- * If we are using the NO_HOLES feature we might have had already an
2702
- * hole that overlaps a part of the region [lockstart, lockend] and
2703
- * ends at (or beyond) lockend. Since we have no file extent items to
2704
- * represent holes, drop_end can be less than lockend and so we must
2705
- * make sure we have an extent map representing the existing hole (the
2706
- * call to __btrfs_drop_extents() might have dropped the existing extent
2707
- * map representing the existing hole), otherwise the fast fsync path
2708
- * will not record the existence of the hole region
2709
- * [existing_hole_start, lockend].
2710
- */
2711
- if (drop_end <= lockend)
2712
- drop_end = lockend + 1;
2713
- /*
2714
- * Don't insert file hole extent item if it's for a range beyond eof
2715
- * (because it's useless) or if it represents a 0 bytes range (when
2716
- * cur_offset == drop_end).
2717
- */
2718
- if (cur_offset < ino_size && cur_offset < drop_end) {
2719
- ret = fill_holes(trans, BTRFS_I(inode), path,
2720
- cur_offset, drop_end);
2721
- if (ret) {
2722
- /* Same comment as above. */
2723
- btrfs_abort_transaction(trans, ret);
2724
- err = ret;
2725
- goto out_trans;
2726
- }
2727
- }
2728
-
2729
-out_trans:
2730
- if (!trans)
2731
- goto out_free;
2732
-
2967
+ ASSERT(trans != NULL);
27332968 inode_inc_iversion(inode);
27342969 inode->i_mtime = inode->i_ctime = current_time(inode);
2735
-
2736
- trans->block_rsv = &fs_info->trans_block_rsv;
27372970 ret = btrfs_update_inode(trans, root, inode);
27382971 updated_inode = true;
27392972 btrfs_end_transaction(trans);
27402973 btrfs_btree_balance_dirty(fs_info);
2741
-out_free:
2742
- btrfs_free_path(path);
2743
- btrfs_free_block_rsv(fs_info, rsv);
27442974 out:
27452975 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
27462976 &cached_state);
27472977 out_only_mutex:
2748
- if (!updated_inode && truncated_block && !ret && !err) {
2978
+ if (!updated_inode && truncated_block && !ret) {
27492979 /*
27502980 * If we only end up zeroing part of a page, we still need to
27512981 * update the inode item, so that all the time fields are
....@@ -2760,16 +2990,18 @@
27602990 inode->i_ctime = now;
27612991 trans = btrfs_start_transaction(root, 1);
27622992 if (IS_ERR(trans)) {
2763
- err = PTR_ERR(trans);
2993
+ ret = PTR_ERR(trans);
27642994 } else {
2765
- err = btrfs_update_inode(trans, root, inode);
2766
- ret = btrfs_end_transaction(trans);
2995
+ int ret2;
2996
+
2997
+ ret = btrfs_update_inode(trans, root, inode);
2998
+ ret2 = btrfs_end_transaction(trans);
2999
+ if (!ret)
3000
+ ret = ret2;
27673001 }
27683002 }
27693003 inode_unlock(inode);
2770
- if (ret && !err)
2771
- err = ret;
2772
- return err;
3004
+ return ret;
27733005 }
27743006
27753007 /* Helper structure to record which range is already reserved */
....@@ -2830,7 +3062,7 @@
28303062
28313063 inode->i_ctime = current_time(inode);
28323064 i_size_write(inode, end);
2833
- btrfs_ordered_update_i_size(inode, end, NULL);
3065
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
28343066 ret = btrfs_update_inode(trans, root, inode);
28353067 ret2 = btrfs_end_transaction(trans);
28363068
....@@ -2838,12 +3070,12 @@
28383070 }
28393071
28403072 enum {
2841
- RANGE_BOUNDARY_WRITTEN_EXTENT = 0,
2842
- RANGE_BOUNDARY_PREALLOC_EXTENT = 1,
2843
- RANGE_BOUNDARY_HOLE = 2,
3073
+ RANGE_BOUNDARY_WRITTEN_EXTENT,
3074
+ RANGE_BOUNDARY_PREALLOC_EXTENT,
3075
+ RANGE_BOUNDARY_HOLE,
28443076 };
28453077
2846
-static int btrfs_zero_range_check_range_boundary(struct inode *inode,
3078
+static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
28473079 u64 offset)
28483080 {
28493081 const u64 sectorsize = btrfs_inode_sectorsize(inode);
....@@ -2851,7 +3083,7 @@
28513083 int ret;
28523084
28533085 offset = round_down(offset, sectorsize);
2854
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
3086
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
28553087 if (IS_ERR(em))
28563088 return PTR_ERR(em);
28573089
....@@ -2876,7 +3108,7 @@
28763108 struct extent_changeset *data_reserved = NULL;
28773109 int ret;
28783110 u64 alloc_hint = 0;
2879
- const u64 sectorsize = btrfs_inode_sectorsize(inode);
3111
+ const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
28803112 u64 alloc_start = round_down(offset, sectorsize);
28813113 u64 alloc_end = round_up(offset + len, sectorsize);
28823114 u64 bytes_to_reserve = 0;
....@@ -2884,8 +3116,8 @@
28843116
28853117 inode_dio_wait(inode);
28863118
2887
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
2888
- alloc_start, alloc_end - alloc_start, 0);
3119
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3120
+ alloc_end - alloc_start);
28893121 if (IS_ERR(em)) {
28903122 ret = PTR_ERR(em);
28913123 goto out;
....@@ -2928,8 +3160,8 @@
29283160
29293161 if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
29303162 BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
2931
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
2932
- alloc_start, sectorsize, 0);
3163
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3164
+ sectorsize);
29333165 if (IS_ERR(em)) {
29343166 ret = PTR_ERR(em);
29353167 goto out;
....@@ -2966,7 +3198,8 @@
29663198 * to cover them.
29673199 */
29683200 if (!IS_ALIGNED(offset, sectorsize)) {
2969
- ret = btrfs_zero_range_check_range_boundary(inode, offset);
3201
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3202
+ offset);
29703203 if (ret < 0)
29713204 goto out;
29723205 if (ret == RANGE_BOUNDARY_HOLE) {
....@@ -2982,7 +3215,7 @@
29823215 }
29833216
29843217 if (!IS_ALIGNED(offset + len, sectorsize)) {
2985
- ret = btrfs_zero_range_check_range_boundary(inode,
3218
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
29863219 offset + len);
29873220 if (ret < 0)
29883221 goto out;
....@@ -3014,7 +3247,7 @@
30143247 &cached_state);
30153248 if (ret)
30163249 goto out;
3017
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
3250
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
30183251 alloc_start, bytes_to_reserve);
30193252 if (ret) {
30203253 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
....@@ -3036,7 +3269,7 @@
30363269 ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
30373270 out:
30383271 if (ret && space_reserved)
3039
- btrfs_free_reserved_data_space(inode, data_reserved,
3272
+ btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
30403273 alloc_start, bytes_to_reserve);
30413274 extent_changeset_free(data_reserved);
30423275
....@@ -3060,7 +3293,7 @@
30603293 u64 locked_end;
30613294 u64 actual_end = 0;
30623295 struct extent_map *em;
3063
- int blocksize = btrfs_inode_sectorsize(inode);
3296
+ int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
30643297 int ret;
30653298
30663299 alloc_start = round_down(offset, blocksize);
....@@ -3073,7 +3306,7 @@
30733306 return -EOPNOTSUPP;
30743307
30753308 if (mode & FALLOC_FL_PUNCH_HOLE)
3076
- return btrfs_punch_hole(inode, offset, len);
3309
+ return btrfs_punch_hole(file, offset, len);
30773310
30783311 /*
30793312 * Only trigger disk allocation, don't trigger qgroup reserve
....@@ -3094,6 +3327,10 @@
30943327 if (ret)
30953328 goto out;
30963329 }
3330
+
3331
+ ret = file_modified(file);
3332
+ if (ret)
3333
+ goto out;
30973334
30983335 /*
30993336 * TODO: Move these two operations after we have checked
....@@ -3142,10 +3379,11 @@
31423379 */
31433380 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
31443381 locked_end, &cached_state);
3145
- ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);
3382
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
3383
+ locked_end);
31463384
31473385 if (ordered &&
3148
- ordered->file_offset + ordered->len > alloc_start &&
3386
+ ordered->file_offset + ordered->num_bytes > alloc_start &&
31493387 ordered->file_offset < alloc_end) {
31503388 btrfs_put_ordered_extent(ordered);
31513389 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
....@@ -3170,7 +3408,7 @@
31703408 INIT_LIST_HEAD(&reserve_list);
31713409 while (cur_offset < alloc_end) {
31723410 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
3173
- alloc_end - cur_offset, 0);
3411
+ alloc_end - cur_offset);
31743412 if (IS_ERR(em)) {
31753413 ret = PTR_ERR(em);
31763414 break;
....@@ -3187,8 +3425,9 @@
31873425 free_extent_map(em);
31883426 break;
31893427 }
3190
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
3191
- cur_offset, last_byte - cur_offset);
3428
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3429
+ &data_reserved, cur_offset,
3430
+ last_byte - cur_offset);
31923431 if (ret < 0) {
31933432 cur_offset = last_byte;
31943433 free_extent_map(em);
....@@ -3200,8 +3439,9 @@
32003439 * range, free reserved data space first, otherwise
32013440 * it'll result in false ENOSPC error.
32023441 */
3203
- btrfs_free_reserved_data_space(inode, data_reserved,
3204
- cur_offset, last_byte - cur_offset);
3442
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
3443
+ data_reserved, cur_offset,
3444
+ last_byte - cur_offset);
32053445 }
32063446 free_extent_map(em);
32073447 cur_offset = last_byte;
....@@ -3218,7 +3458,7 @@
32183458 range->len, i_blocksize(inode),
32193459 offset + len, &alloc_hint);
32203460 else
3221
- btrfs_free_reserved_data_space(inode,
3461
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
32223462 data_reserved, range->start,
32233463 range->len);
32243464 list_del(&range->list);
....@@ -3239,35 +3479,36 @@
32393479 inode_unlock(inode);
32403480 /* Let go of our reservation. */
32413481 if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
3242
- btrfs_free_reserved_data_space(inode, data_reserved,
3482
+ btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
32433483 cur_offset, alloc_end - cur_offset);
32443484 extent_changeset_free(data_reserved);
32453485 return ret;
32463486 }
32473487
3248
-static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
3488
+static loff_t find_desired_extent(struct inode *inode, loff_t offset,
3489
+ int whence)
32493490 {
32503491 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
32513492 struct extent_map *em = NULL;
32523493 struct extent_state *cached_state = NULL;
3494
+ loff_t i_size = inode->i_size;
32533495 u64 lockstart;
32543496 u64 lockend;
32553497 u64 start;
32563498 u64 len;
32573499 int ret = 0;
32583500
3259
- if (inode->i_size == 0)
3501
+ if (i_size == 0 || offset >= i_size)
32603502 return -ENXIO;
32613503
32623504 /*
3263
- * *offset can be negative, in this case we start finding DATA/HOLE from
3505
+ * offset can be negative, in this case we start finding DATA/HOLE from
32643506 * the very start of the file.
32653507 */
3266
- start = max_t(loff_t, 0, *offset);
3508
+ start = max_t(loff_t, 0, offset);
32673509
32683510 lockstart = round_down(start, fs_info->sectorsize);
3269
- lockend = round_up(i_size_read(inode),
3270
- fs_info->sectorsize);
3511
+ lockend = round_up(i_size, fs_info->sectorsize);
32713512 if (lockend <= lockstart)
32723513 lockend = lockstart + fs_info->sectorsize;
32733514 lockend--;
....@@ -3276,9 +3517,8 @@
32763517 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
32773518 &cached_state);
32783519
3279
- while (start < inode->i_size) {
3280
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0,
3281
- start, len, 0);
3520
+ while (start < i_size) {
3521
+ em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
32823522 if (IS_ERR(em)) {
32833523 ret = PTR_ERR(em);
32843524 em = NULL;
....@@ -3300,59 +3540,71 @@
33003540 cond_resched();
33013541 }
33023542 free_extent_map(em);
3303
- if (!ret) {
3304
- if (whence == SEEK_DATA && start >= inode->i_size)
3305
- ret = -ENXIO;
3306
- else
3307
- *offset = min_t(loff_t, start, inode->i_size);
3308
- }
33093543 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
33103544 &cached_state);
3311
- return ret;
3545
+ if (ret) {
3546
+ offset = ret;
3547
+ } else {
3548
+ if (whence == SEEK_DATA && start >= i_size)
3549
+ offset = -ENXIO;
3550
+ else
3551
+ offset = min_t(loff_t, start, i_size);
3552
+ }
3553
+
3554
+ return offset;
33123555 }
33133556
33143557 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
33153558 {
33163559 struct inode *inode = file->f_mapping->host;
3317
- int ret;
33183560
3319
- inode_lock(inode);
33203561 switch (whence) {
3321
- case SEEK_END:
3322
- case SEEK_CUR:
3323
- offset = generic_file_llseek(file, offset, whence);
3324
- goto out;
3562
+ default:
3563
+ return generic_file_llseek(file, offset, whence);
33253564 case SEEK_DATA:
33263565 case SEEK_HOLE:
3327
- if (offset >= i_size_read(inode)) {
3328
- inode_unlock(inode);
3329
- return -ENXIO;
3330
- }
3331
-
3332
- ret = find_desired_extent(inode, &offset, whence);
3333
- if (ret) {
3334
- inode_unlock(inode);
3335
- return ret;
3336
- }
3566
+ inode_lock_shared(inode);
3567
+ offset = find_desired_extent(inode, offset, whence);
3568
+ inode_unlock_shared(inode);
3569
+ break;
33373570 }
33383571
3339
- offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3340
-out:
3341
- inode_unlock(inode);
3342
- return offset;
3572
+ if (offset < 0)
3573
+ return offset;
3574
+
3575
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
33433576 }
33443577
33453578 static int btrfs_file_open(struct inode *inode, struct file *filp)
33463579 {
3347
- filp->f_mode |= FMODE_NOWAIT;
3580
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
33483581 return generic_file_open(inode, filp);
3582
+}
3583
+
3584
+static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3585
+{
3586
+ ssize_t ret = 0;
3587
+
3588
+ if (iocb->ki_flags & IOCB_DIRECT) {
3589
+ struct inode *inode = file_inode(iocb->ki_filp);
3590
+
3591
+ inode_lock_shared(inode);
3592
+ ret = btrfs_direct_IO(iocb, to);
3593
+ inode_unlock_shared(inode);
3594
+ if (ret < 0 || !iov_iter_count(to) ||
3595
+ iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3596
+ return ret;
3597
+ }
3598
+
3599
+ return generic_file_buffered_read(iocb, to, ret);
33493600 }
33503601
33513602 const struct file_operations btrfs_file_operations = {
33523603 .llseek = btrfs_file_llseek,
3353
- .read_iter = generic_file_read_iter,
3604
+ .read_iter = btrfs_file_read_iter,
33543605 .splice_read = generic_file_splice_read,
33553606 .write_iter = btrfs_file_write_iter,
3607
+ .splice_write = iter_file_splice_write,
33563608 .mmap = btrfs_file_mmap,
33573609 .open = btrfs_file_open,
33583610 .release = btrfs_release_file,
....@@ -3362,8 +3614,7 @@
33623614 #ifdef CONFIG_COMPAT
33633615 .compat_ioctl = btrfs_compat_ioctl,
33643616 #endif
3365
- .clone_file_range = btrfs_clone_file_range,
3366
- .dedupe_file_range = btrfs_dedupe_file_range,
3617
+ .remap_file_range = btrfs_remap_file_range,
33673618 };
33683619
33693620 void __cold btrfs_auto_defrag_exit(void)