hc
2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/fs/btrfs/inode.c
....@@ -3,9 +3,9 @@
33 * Copyright (C) 2007 Oracle. All rights reserved.
44 */
55
6
+#include <crypto/hash.h>
67 #include <linux/kernel.h>
78 #include <linux/bio.h>
8
-#include <linux/buffer_head.h>
99 #include <linux/file.h>
1010 #include <linux/fs.h>
1111 #include <linux/pagemap.h>
....@@ -27,7 +27,12 @@
2727 #include <linux/uio.h>
2828 #include <linux/magic.h>
2929 #include <linux/iversion.h>
30
+#include <linux/swap.h>
31
+#include <linux/migrate.h>
32
+#include <linux/sched/mm.h>
33
+#include <linux/iomap.h>
3034 #include <asm/unaligned.h>
35
+#include "misc.h"
3136 #include "ctree.h"
3237 #include "disk-io.h"
3338 #include "transaction.h"
....@@ -41,32 +46,31 @@
4146 #include "locking.h"
4247 #include "free-space-cache.h"
4348 #include "inode-map.h"
44
-#include "backref.h"
4549 #include "props.h"
4650 #include "qgroup.h"
47
-#include "dedupe.h"
51
+#include "delalloc-space.h"
52
+#include "block-group.h"
53
+#include "space-info.h"
4854
4955 struct btrfs_iget_args {
50
- struct btrfs_key *location;
56
+ u64 ino;
5157 struct btrfs_root *root;
5258 };
5359
5460 struct btrfs_dio_data {
5561 u64 reserve;
56
- u64 unsubmitted_oe_range_start;
57
- u64 unsubmitted_oe_range_end;
58
- int overwrite;
62
+ loff_t length;
63
+ ssize_t submitted;
64
+ struct extent_changeset *data_reserved;
65
+ bool sync;
5966 };
6067
6168 static const struct inode_operations btrfs_dir_inode_operations;
6269 static const struct inode_operations btrfs_symlink_inode_operations;
63
-static const struct inode_operations btrfs_dir_ro_inode_operations;
6470 static const struct inode_operations btrfs_special_inode_operations;
6571 static const struct inode_operations btrfs_file_inode_operations;
6672 static const struct address_space_operations btrfs_aops;
67
-static const struct address_space_operations btrfs_symlink_aops;
6873 static const struct file_operations btrfs_dir_file_operations;
69
-static const struct extent_io_ops btrfs_extent_io_ops;
7074
7175 static struct kmem_cache *btrfs_inode_cachep;
7276 struct kmem_cache *btrfs_trans_handle_cachep;
....@@ -74,38 +78,26 @@
7478 struct kmem_cache *btrfs_free_space_cachep;
7579 struct kmem_cache *btrfs_free_space_bitmap_cachep;
7680
77
-#define S_SHIFT 12
78
-static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
79
- [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
80
- [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
81
- [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
82
- [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
83
- [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
84
- [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
85
- [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
86
-};
87
-
8881 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
8982 static int btrfs_truncate(struct inode *inode, bool skip_writeback);
9083 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
91
-static noinline int cow_file_range(struct inode *inode,
84
+static noinline int cow_file_range(struct btrfs_inode *inode,
9285 struct page *locked_page,
93
- u64 start, u64 end, u64 delalloc_end,
94
- int *page_started, unsigned long *nr_written,
95
- int unlock, struct btrfs_dedupe_hash *hash);
96
-static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
97
- u64 orig_start, u64 block_start,
86
+ u64 start, u64 end, int *page_started,
87
+ unsigned long *nr_written, int unlock);
88
+static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
89
+ u64 len, u64 orig_start, u64 block_start,
9890 u64 block_len, u64 orig_block_len,
9991 u64 ram_bytes, int compress_type,
10092 int type);
10193
102
-static void __endio_write_update_ordered(struct inode *inode,
94
+static void __endio_write_update_ordered(struct btrfs_inode *inode,
10395 const u64 offset, const u64 bytes,
10496 const bool uptodate);
10597
10698 /*
10799 * Cleanup all submitted ordered extents in specified range to handle errors
108
- * from the fill_dellaloc() callback.
100
+ * from the btrfs_run_delalloc_range() callback.
109101 *
110102 * NOTE: caller must ensure that when an error happens, it can not call
111103 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
....@@ -113,7 +105,7 @@
113105 * to be released, which we want to happen only when finishing the ordered
114106 * extent (btrfs_finish_ordered_io()).
115107 */
116
-static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
108
+static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
117109 struct page *locked_page,
118110 u64 offset, u64 bytes)
119111 {
....@@ -125,7 +117,7 @@
125117 struct page *page;
126118
127119 while (index <= end_index) {
128
- page = find_get_page(inode->i_mapping, index);
120
+ page = find_get_page(inode->vfs_inode.i_mapping, index);
129121 index++;
130122 if (!page)
131123 continue;
....@@ -147,13 +139,6 @@
147139 }
148140
149141 static int btrfs_dirty_inode(struct inode *inode);
150
-
151
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
152
-void btrfs_test_inode_set_ops(struct inode *inode)
153
-{
154
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
155
-}
156
-#endif
157142
158143 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
159144 struct inode *inode, struct inode *dir,
....@@ -187,6 +172,9 @@
187172 int ret;
188173 size_t cur_size = size;
189174 unsigned long offset;
175
+
176
+ ASSERT((compressed_size > 0 && compressed_pages) ||
177
+ (compressed_size == 0 && !compressed_pages));
190178
191179 if (compressed_size && compressed_pages)
192180 cur_size = compressed_size;
....@@ -241,13 +229,22 @@
241229 start >> PAGE_SHIFT);
242230 btrfs_set_file_extent_compression(leaf, ei, 0);
243231 kaddr = kmap_atomic(page);
244
- offset = start & (PAGE_SIZE - 1);
232
+ offset = offset_in_page(start);
245233 write_extent_buffer(leaf, kaddr + offset, ptr, size);
246234 kunmap_atomic(kaddr);
247235 put_page(page);
248236 }
249237 btrfs_mark_buffer_dirty(leaf);
250238 btrfs_release_path(path);
239
+
240
+ /*
241
+ * We align size to sectorsize for inline extents just for simplicity
242
+ * sake.
243
+ */
244
+ size = ALIGN(size, root->fs_info->sectorsize);
245
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
246
+ if (ret)
247
+ goto fail;
251248
252249 /*
253250 * we're an inline extent, so nobody can
....@@ -271,15 +268,15 @@
271268 * does the checks required to make sure the data is small enough
272269 * to fit as an inline extent.
273270 */
274
-static noinline int cow_file_range_inline(struct inode *inode, u64 start,
271
+static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
275272 u64 end, size_t compressed_size,
276273 int compress_type,
277274 struct page **compressed_pages)
278275 {
279
- struct btrfs_root *root = BTRFS_I(inode)->root;
276
+ struct btrfs_root *root = inode->root;
280277 struct btrfs_fs_info *fs_info = root->fs_info;
281278 struct btrfs_trans_handle *trans;
282
- u64 isize = i_size_read(inode);
279
+ u64 isize = i_size_read(&inode->vfs_inode);
283280 u64 actual_end = min(end + 1, isize);
284281 u64 inline_len = actual_end - start;
285282 u64 aligned_end = ALIGN(end, fs_info->sectorsize);
....@@ -311,7 +308,7 @@
311308 btrfs_free_path(path);
312309 return PTR_ERR(trans);
313310 }
314
- trans->block_rsv = &BTRFS_I(inode)->block_rsv;
311
+ trans->block_rsv = &inode->block_rsv;
315312
316313 if (compressed_size && compressed_pages)
317314 extent_item_size = btrfs_file_extent_calc_inline_size(
....@@ -320,9 +317,9 @@
320317 extent_item_size = btrfs_file_extent_calc_inline_size(
321318 inline_len);
322319
323
- ret = __btrfs_drop_extents(trans, root, inode, path,
324
- start, aligned_end, NULL,
325
- 1, 1, extent_item_size, &extent_inserted);
320
+ ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end,
321
+ NULL, 1, 1, extent_item_size,
322
+ &extent_inserted);
326323 if (ret) {
327324 btrfs_abort_transaction(trans, ret);
328325 goto out;
....@@ -331,7 +328,7 @@
331328 if (isize > actual_end)
332329 inline_len = min_t(u64, isize, actual_end);
333330 ret = insert_inline_extent(trans, path, extent_inserted,
334
- root, inode, start,
331
+ root, &inode->vfs_inode, start,
335332 inline_len, compressed_size,
336333 compress_type, compressed_pages);
337334 if (ret && ret != -ENOSPC) {
....@@ -342,8 +339,8 @@
342339 goto out;
343340 }
344341
345
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
346
- btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
342
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
343
+ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
347344 out:
348345 /*
349346 * Don't forget to free the reserved space, as for inlined extent
....@@ -367,18 +364,25 @@
367364 struct list_head list;
368365 };
369366
370
-struct async_cow {
367
+struct async_chunk {
371368 struct inode *inode;
372
- struct btrfs_root *root;
373369 struct page *locked_page;
374370 u64 start;
375371 u64 end;
376372 unsigned int write_flags;
377373 struct list_head extents;
374
+ struct cgroup_subsys_state *blkcg_css;
378375 struct btrfs_work work;
376
+ atomic_t *pending;
379377 };
380378
381
-static noinline int add_async_extent(struct async_cow *cow,
379
+struct async_cow {
380
+ /* Number of chunks in flight; must be first in the structure */
381
+ atomic_t num_chunks;
382
+ struct async_chunk chunks[];
383
+};
384
+
385
+static noinline int add_async_extent(struct async_chunk *cow,
382386 u64 start, u64 ram_size,
383387 u64 compressed_size,
384388 struct page **pages,
....@@ -402,10 +406,10 @@
402406 /*
403407 * Check if the inode has flags compatible with compression
404408 */
405
-static inline bool inode_can_compress(struct inode *inode)
409
+static inline bool inode_can_compress(struct btrfs_inode *inode)
406410 {
407
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
408
- BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
411
+ if (inode->flags & BTRFS_INODE_NODATACOW ||
412
+ inode->flags & BTRFS_INODE_NODATASUM)
409413 return false;
410414 return true;
411415 }
....@@ -414,29 +418,30 @@
414418 * Check if the inode needs to be submitted to compression, based on mount
415419 * options, defragmentation, properties or heuristics.
416420 */
417
-static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
421
+static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
422
+ u64 end)
418423 {
419
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
424
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
420425
421426 if (!inode_can_compress(inode)) {
422427 WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
423428 KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
424
- btrfs_ino(BTRFS_I(inode)));
429
+ btrfs_ino(inode));
425430 return 0;
426431 }
427432 /* force compress */
428433 if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
429434 return 1;
430435 /* defrag ioctl */
431
- if (BTRFS_I(inode)->defrag_compress)
436
+ if (inode->defrag_compress)
432437 return 1;
433438 /* bad compression ratios */
434
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
439
+ if (inode->flags & BTRFS_INODE_NOCOMPRESS)
435440 return 0;
436441 if (btrfs_test_opt(fs_info, COMPRESS) ||
437
- BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
438
- BTRFS_I(inode)->prop_compress)
439
- return btrfs_compress_heuristic(inode, start, end);
442
+ inode->flags & BTRFS_INODE_COMPRESS ||
443
+ inode->prop_compress)
444
+ return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
440445 return 0;
441446 }
442447
....@@ -466,16 +471,15 @@
466471 * are written in the same order that the flusher thread sent them
467472 * down.
468473 */
469
-static noinline void compress_file_range(struct inode *inode,
470
- struct page *locked_page,
471
- u64 start, u64 end,
472
- struct async_cow *async_cow,
473
- int *num_added)
474
+static noinline int compress_file_range(struct async_chunk *async_chunk)
474475 {
476
+ struct inode *inode = async_chunk->inode;
475477 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
476478 u64 blocksize = fs_info->sectorsize;
479
+ u64 start = async_chunk->start;
480
+ u64 end = async_chunk->end;
477481 u64 actual_end;
478
- u64 isize = i_size_read(inode);
482
+ u64 i_size;
479483 int ret = 0;
480484 struct page **pages = NULL;
481485 unsigned long nr_pages;
....@@ -484,12 +488,25 @@
484488 int i;
485489 int will_compress;
486490 int compress_type = fs_info->compress_type;
491
+ int compressed_extents = 0;
487492 int redirty = 0;
488493
489494 inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
490495 SZ_16K);
491496
492
- actual_end = min_t(u64, isize, end + 1);
497
+ /*
498
+ * We need to save i_size before now because it could change in between
499
+ * us evaluating the size and assigning it. This is because we lock and
500
+ * unlock the page in truncate and fallocate, and then modify the i_size
501
+ * later on.
502
+ *
503
+ * The barriers are to emulate READ_ONCE, remove that once i_size_read
504
+ * does that for us.
505
+ */
506
+ barrier();
507
+ i_size = i_size_read(inode);
508
+ barrier();
509
+ actual_end = min_t(u64, i_size, end + 1);
493510 again:
494511 will_compress = 0;
495512 nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
....@@ -530,7 +547,7 @@
530547 * inode has not been flagged as nocompress. This flag can
531548 * change at any time if we discover bad compression ratios.
532549 */
533
- if (inode_need_compress(inode, start, end)) {
550
+ if (inode_need_compress(BTRFS_I(inode), start, end)) {
534551 WARN_ON(pages);
535552 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
536553 if (!pages) {
....@@ -571,8 +588,7 @@
571588 &total_compressed);
572589
573590 if (!ret) {
574
- unsigned long offset = total_compressed &
575
- (PAGE_SIZE - 1);
591
+ unsigned long offset = offset_in_page(total_compressed);
576592 struct page *page = pages[nr_pages - 1];
577593 char *kaddr;
578594
....@@ -595,11 +611,12 @@
595611 /* we didn't compress the entire range, try
596612 * to make an uncompressed inline extent.
597613 */
598
- ret = cow_file_range_inline(inode, start, end, 0,
599
- BTRFS_COMPRESS_NONE, NULL);
614
+ ret = cow_file_range_inline(BTRFS_I(inode), start, end,
615
+ 0, BTRFS_COMPRESS_NONE,
616
+ NULL);
600617 } else {
601618 /* try making a compressed inline extent */
602
- ret = cow_file_range_inline(inode, start, end,
619
+ ret = cow_file_range_inline(BTRFS_I(inode), start, end,
603620 total_compressed,
604621 compress_type, pages);
605622 }
....@@ -621,8 +638,9 @@
621638 * our outstanding extent for clearing delalloc for this
622639 * range.
623640 */
624
- extent_clear_unlock_delalloc(inode, start, end, end,
625
- NULL, clear_flags,
641
+ extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
642
+ NULL,
643
+ clear_flags,
626644 PAGE_UNLOCK |
627645 PAGE_CLEAR_DIRTY |
628646 PAGE_SET_WRITEBACK |
....@@ -641,8 +659,7 @@
641659 }
642660 kfree(pages);
643661 }
644
-
645
- return;
662
+ return 0;
646663 }
647664 }
648665
....@@ -661,14 +678,14 @@
661678 */
662679 total_in = ALIGN(total_in, PAGE_SIZE);
663680 if (total_compressed + blocksize <= total_in) {
664
- *num_added += 1;
681
+ compressed_extents++;
665682
666683 /*
667684 * The async work queues will take care of doing actual
668685 * allocation on disk for these compressed pages, and
669686 * will submit them to the elevator.
670687 */
671
- add_async_extent(async_cow, start, total_in,
688
+ add_async_extent(async_chunk, start, total_in,
672689 total_compressed, pages, nr_pages,
673690 compress_type);
674691
....@@ -678,7 +695,7 @@
678695 cond_resched();
679696 goto again;
680697 }
681
- return;
698
+ return compressed_extents;
682699 }
683700 }
684701 if (pages) {
....@@ -708,18 +725,20 @@
708725 * to our extent and set things up for the async work queue to run
709726 * cow_file_range to do the normal delalloc dance.
710727 */
711
- if (page_offset(locked_page) >= start &&
712
- page_offset(locked_page) <= end)
713
- __set_page_dirty_nobuffers(locked_page);
728
+ if (async_chunk->locked_page &&
729
+ (page_offset(async_chunk->locked_page) >= start &&
730
+ page_offset(async_chunk->locked_page)) <= end) {
731
+ __set_page_dirty_nobuffers(async_chunk->locked_page);
714732 /* unlocked later on in the async handlers */
733
+ }
715734
716735 if (redirty)
717736 extent_range_redirty_for_io(inode, start, end);
718
- add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
737
+ add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
719738 BTRFS_COMPRESS_NONE);
720
- *num_added += 1;
739
+ compressed_extents++;
721740
722
- return;
741
+ return compressed_extents;
723742 }
724743
725744 static void free_async_extent_pages(struct async_extent *async_extent)
....@@ -744,45 +763,38 @@
744763 * queued. We walk all the async extents created by compress_file_range
745764 * and send them down to the disk.
746765 */
747
-static noinline void submit_compressed_extents(struct inode *inode,
748
- struct async_cow *async_cow)
766
+static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
749767 {
750
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
768
+ struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
769
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
751770 struct async_extent *async_extent;
752771 u64 alloc_hint = 0;
753772 struct btrfs_key ins;
754773 struct extent_map *em;
755
- struct btrfs_root *root = BTRFS_I(inode)->root;
756
- struct extent_io_tree *io_tree;
774
+ struct btrfs_root *root = inode->root;
775
+ struct extent_io_tree *io_tree = &inode->io_tree;
757776 int ret = 0;
758777
759778 again:
760
- while (!list_empty(&async_cow->extents)) {
761
- async_extent = list_entry(async_cow->extents.next,
779
+ while (!list_empty(&async_chunk->extents)) {
780
+ async_extent = list_entry(async_chunk->extents.next,
762781 struct async_extent, list);
763782 list_del(&async_extent->list);
764783
765
- io_tree = &BTRFS_I(inode)->io_tree;
766
-
767784 retry:
785
+ lock_extent(io_tree, async_extent->start,
786
+ async_extent->start + async_extent->ram_size - 1);
768787 /* did the compression code fall back to uncompressed IO? */
769788 if (!async_extent->pages) {
770789 int page_started = 0;
771790 unsigned long nr_written = 0;
772791
773
- lock_extent(io_tree, async_extent->start,
774
- async_extent->start +
775
- async_extent->ram_size - 1);
776
-
777792 /* allocate blocks */
778
- ret = cow_file_range(inode, async_cow->locked_page,
793
+ ret = cow_file_range(inode, async_chunk->locked_page,
779794 async_extent->start,
780795 async_extent->start +
781796 async_extent->ram_size - 1,
782
- async_extent->start +
783
- async_extent->ram_size - 1,
784
- &page_started, &nr_written, 0,
785
- NULL);
797
+ &page_started, &nr_written, 0);
786798
787799 /* JDM XXX */
788800
....@@ -793,20 +805,17 @@
793805 * all those pages down to the drive.
794806 */
795807 if (!page_started && !ret)
796
- extent_write_locked_range(inode,
808
+ extent_write_locked_range(&inode->vfs_inode,
797809 async_extent->start,
798810 async_extent->start +
799811 async_extent->ram_size - 1,
800812 WB_SYNC_ALL);
801
- else if (ret)
802
- unlock_page(async_cow->locked_page);
813
+ else if (ret && async_chunk->locked_page)
814
+ unlock_page(async_chunk->locked_page);
803815 kfree(async_extent);
804816 cond_resched();
805817 continue;
806818 }
807
-
808
- lock_extent(io_tree, async_extent->start,
809
- async_extent->start + async_extent->ram_size - 1);
810819
811820 ret = btrfs_reserve_extent(root, async_extent->ram_size,
812821 async_extent->compressed_size,
....@@ -826,7 +835,7 @@
826835 * will not submit these pages down to lower
827836 * layers.
828837 */
829
- extent_range_redirty_for_io(inode,
838
+ extent_range_redirty_for_io(&inode->vfs_inode,
830839 async_extent->start,
831840 async_extent->start +
832841 async_extent->ram_size - 1);
....@@ -861,8 +870,7 @@
861870 BTRFS_ORDERED_COMPRESSED,
862871 async_extent->compress_type);
863872 if (ret) {
864
- btrfs_drop_extent_cache(BTRFS_I(inode),
865
- async_extent->start,
873
+ btrfs_drop_extent_cache(inode, async_extent->start,
866874 async_extent->start +
867875 async_extent->ram_size - 1, 0);
868876 goto out_free_reserve;
....@@ -875,29 +883,25 @@
875883 extent_clear_unlock_delalloc(inode, async_extent->start,
876884 async_extent->start +
877885 async_extent->ram_size - 1,
878
- async_extent->start +
879
- async_extent->ram_size - 1,
880886 NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
881887 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
882888 PAGE_SET_WRITEBACK);
883
- if (btrfs_submit_compressed_write(inode,
884
- async_extent->start,
889
+ if (btrfs_submit_compressed_write(inode, async_extent->start,
885890 async_extent->ram_size,
886891 ins.objectid,
887892 ins.offset, async_extent->pages,
888893 async_extent->nr_pages,
889
- async_cow->write_flags)) {
890
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
894
+ async_chunk->write_flags,
895
+ async_chunk->blkcg_css)) {
891896 struct page *p = async_extent->pages[0];
892897 const u64 start = async_extent->start;
893898 const u64 end = start + async_extent->ram_size - 1;
894899
895
- p->mapping = inode->i_mapping;
896
- tree->ops->writepage_end_io_hook(p, start, end,
897
- NULL, 0);
900
+ p->mapping = inode->vfs_inode.i_mapping;
901
+ btrfs_writepage_endio_finish_ordered(p, start, end, 0);
902
+
898903 p->mapping = NULL;
899
- extent_clear_unlock_delalloc(inode, start, end, end,
900
- NULL, 0,
904
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
901905 PAGE_END_WRITEBACK |
902906 PAGE_SET_ERROR);
903907 free_async_extent_pages(async_extent);
....@@ -914,8 +918,6 @@
914918 extent_clear_unlock_delalloc(inode, async_extent->start,
915919 async_extent->start +
916920 async_extent->ram_size - 1,
917
- async_extent->start +
918
- async_extent->ram_size - 1,
919921 NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
920922 EXTENT_DELALLOC_NEW |
921923 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
....@@ -927,10 +929,10 @@
927929 goto again;
928930 }
929931
930
-static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
932
+static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
931933 u64 num_bytes)
932934 {
933
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
935
+ struct extent_map_tree *em_tree = &inode->extent_tree;
934936 struct extent_map *em;
935937 u64 alloc_hint = 0;
936938
....@@ -972,14 +974,13 @@
972974 * required to start IO on it. It may be clean and already done with
973975 * IO when we return.
974976 */
975
-static noinline int cow_file_range(struct inode *inode,
977
+static noinline int cow_file_range(struct btrfs_inode *inode,
976978 struct page *locked_page,
977
- u64 start, u64 end, u64 delalloc_end,
978
- int *page_started, unsigned long *nr_written,
979
- int unlock, struct btrfs_dedupe_hash *hash)
979
+ u64 start, u64 end, int *page_started,
980
+ unsigned long *nr_written, int unlock)
980981 {
981
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
982
- struct btrfs_root *root = BTRFS_I(inode)->root;
982
+ struct btrfs_root *root = inode->root;
983
+ struct btrfs_fs_info *fs_info = root->fs_info;
983984 u64 alloc_hint = 0;
984985 u64 num_bytes;
985986 unsigned long ram_size;
....@@ -993,8 +994,7 @@
993994 bool extent_reserved = false;
994995 int ret = 0;
995996
996
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
997
- WARN_ON_ONCE(1);
997
+ if (btrfs_is_free_space_inode(inode)) {
998998 ret = -EINVAL;
999999 goto out_unlock;
10001000 }
....@@ -1003,7 +1003,7 @@
10031003 num_bytes = max(blocksize, num_bytes);
10041004 ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
10051005
1006
- inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
1006
+ inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
10071007
10081008 if (start == 0) {
10091009 /* lets try to make an inline extent */
....@@ -1016,8 +1016,7 @@
10161016 * our outstanding extent for clearing delalloc for this
10171017 * range.
10181018 */
1019
- extent_clear_unlock_delalloc(inode, start, end,
1020
- delalloc_end, NULL,
1019
+ extent_clear_unlock_delalloc(inode, start, end, NULL,
10211020 EXTENT_LOCKED | EXTENT_DELALLOC |
10221021 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
10231022 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
....@@ -1033,8 +1032,7 @@
10331032 }
10341033
10351034 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
1036
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
1037
- start + num_bytes - 1, 0);
1035
+ btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
10381036
10391037 /*
10401038 * Relocation relies on the relocated extents to have exactly the same
....@@ -1098,7 +1096,7 @@
10981096 * skip current ordered extent.
10991097 */
11001098 if (ret)
1101
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
1099
+ btrfs_drop_extent_cache(inode, start,
11021100 start + ram_size - 1, 0);
11031101 }
11041102
....@@ -1114,9 +1112,8 @@
11141112 page_ops = unlock ? PAGE_UNLOCK : 0;
11151113 page_ops |= PAGE_SET_PRIVATE2;
11161114
1117
- extent_clear_unlock_delalloc(inode, start,
1118
- start + ram_size - 1,
1119
- delalloc_end, locked_page,
1115
+ extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
1116
+ locked_page,
11201117 EXTENT_LOCKED | EXTENT_DELALLOC,
11211118 page_ops);
11221119 if (num_bytes < cur_alloc_size)
....@@ -1139,7 +1136,7 @@
11391136 return ret;
11401137
11411138 out_drop_extent_cache:
1142
- btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
1139
+ btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
11431140 out_reserve:
11441141 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
11451142 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
....@@ -1161,7 +1158,6 @@
11611158 if (extent_reserved) {
11621159 extent_clear_unlock_delalloc(inode, start,
11631160 start + cur_alloc_size - 1,
1164
- start + cur_alloc_size - 1,
11651161 locked_page,
11661162 clear_bits,
11671163 page_ops);
....@@ -1169,8 +1165,7 @@
11691165 if (start >= end)
11701166 goto out;
11711167 }
1172
- extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
1173
- locked_page,
1168
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
11741169 clear_bits | EXTENT_CLEAR_DATA_RESV,
11751170 page_ops);
11761171 goto out;
....@@ -1181,16 +1176,15 @@
11811176 */
11821177 static noinline void async_cow_start(struct btrfs_work *work)
11831178 {
1184
- struct async_cow *async_cow;
1185
- int num_added = 0;
1186
- async_cow = container_of(work, struct async_cow, work);
1179
+ struct async_chunk *async_chunk;
1180
+ int compressed_extents;
11871181
1188
- compress_file_range(async_cow->inode, async_cow->locked_page,
1189
- async_cow->start, async_cow->end, async_cow,
1190
- &num_added);
1191
- if (num_added == 0) {
1192
- btrfs_add_delayed_iput(async_cow->inode);
1193
- async_cow->inode = NULL;
1182
+ async_chunk = container_of(work, struct async_chunk, work);
1183
+
1184
+ compressed_extents = compress_file_range(async_chunk);
1185
+ if (compressed_extents == 0) {
1186
+ btrfs_add_delayed_iput(async_chunk->inode);
1187
+ async_chunk->inode = NULL;
11941188 }
11951189 }
11961190
....@@ -1199,77 +1193,153 @@
11991193 */
12001194 static noinline void async_cow_submit(struct btrfs_work *work)
12011195 {
1202
- struct btrfs_fs_info *fs_info;
1203
- struct async_cow *async_cow;
1204
- struct btrfs_root *root;
1196
+ struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1197
+ work);
1198
+ struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
12051199 unsigned long nr_pages;
12061200
1207
- async_cow = container_of(work, struct async_cow, work);
1208
-
1209
- root = async_cow->root;
1210
- fs_info = root->fs_info;
1211
- nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
1201
+ nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
12121202 PAGE_SHIFT;
1203
+
1204
+ /*
1205
+ * ->inode could be NULL if async_chunk_start has failed to compress,
1206
+ * in which case we don't have anything to submit, yet we need to
1207
+ * always adjust ->async_delalloc_pages as its paired with the init
1208
+ * happening in cow_file_range_async
1209
+ */
1210
+ if (async_chunk->inode)
1211
+ submit_compressed_extents(async_chunk);
12131212
12141213 /* atomic_sub_return implies a barrier */
12151214 if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
12161215 5 * SZ_1M)
12171216 cond_wake_up_nomb(&fs_info->async_submit_wait);
1218
-
1219
- if (async_cow->inode)
1220
- submit_compressed_extents(async_cow->inode, async_cow);
12211217 }
12221218
12231219 static noinline void async_cow_free(struct btrfs_work *work)
12241220 {
1225
- struct async_cow *async_cow;
1226
- async_cow = container_of(work, struct async_cow, work);
1227
- if (async_cow->inode)
1228
- btrfs_add_delayed_iput(async_cow->inode);
1229
- kfree(async_cow);
1221
+ struct async_chunk *async_chunk;
1222
+
1223
+ async_chunk = container_of(work, struct async_chunk, work);
1224
+ if (async_chunk->inode)
1225
+ btrfs_add_delayed_iput(async_chunk->inode);
1226
+ if (async_chunk->blkcg_css)
1227
+ css_put(async_chunk->blkcg_css);
1228
+ /*
1229
+ * Since the pointer to 'pending' is at the beginning of the array of
1230
+ * async_chunk's, freeing it ensures the whole array has been freed.
1231
+ */
1232
+ if (atomic_dec_and_test(async_chunk->pending))
1233
+ kvfree(async_chunk->pending);
12301234 }
12311235
1232
-static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1236
+static int cow_file_range_async(struct btrfs_inode *inode,
1237
+ struct writeback_control *wbc,
1238
+ struct page *locked_page,
12331239 u64 start, u64 end, int *page_started,
1234
- unsigned long *nr_written,
1235
- unsigned int write_flags)
1240
+ unsigned long *nr_written)
12361241 {
1237
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1238
- struct async_cow *async_cow;
1239
- struct btrfs_root *root = BTRFS_I(inode)->root;
1242
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
1243
+ struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1244
+ struct async_cow *ctx;
1245
+ struct async_chunk *async_chunk;
12401246 unsigned long nr_pages;
12411247 u64 cur_end;
1248
+ u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1249
+ int i;
1250
+ bool should_compress;
1251
+ unsigned nofs_flag;
1252
+ const unsigned int write_flags = wbc_to_write_flags(wbc);
12421253
1243
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1244
- 1, 0, NULL);
1245
- while (start < end) {
1246
- async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1247
- BUG_ON(!async_cow); /* -ENOMEM */
1248
- async_cow->inode = igrab(inode);
1249
- async_cow->root = root;
1250
- async_cow->locked_page = locked_page;
1251
- async_cow->start = start;
1252
- async_cow->write_flags = write_flags;
1254
+ unlock_extent(&inode->io_tree, start, end);
12531255
1254
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1255
- !btrfs_test_opt(fs_info, FORCE_COMPRESS))
1256
- cur_end = end;
1257
- else
1256
+ if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
1257
+ !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
1258
+ num_chunks = 1;
1259
+ should_compress = false;
1260
+ } else {
1261
+ should_compress = true;
1262
+ }
1263
+
1264
+ nofs_flag = memalloc_nofs_save();
1265
+ ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1266
+ memalloc_nofs_restore(nofs_flag);
1267
+
1268
+ if (!ctx) {
1269
+ unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
1270
+ EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1271
+ EXTENT_DO_ACCOUNTING;
1272
+ unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
1273
+ PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
1274
+ PAGE_SET_ERROR;
1275
+
1276
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
1277
+ clear_bits, page_ops);
1278
+ return -ENOMEM;
1279
+ }
1280
+
1281
+ async_chunk = ctx->chunks;
1282
+ atomic_set(&ctx->num_chunks, num_chunks);
1283
+
1284
+ for (i = 0; i < num_chunks; i++) {
1285
+ if (should_compress)
12581286 cur_end = min(end, start + SZ_512K - 1);
1287
+ else
1288
+ cur_end = end;
12591289
1260
- async_cow->end = cur_end;
1261
- INIT_LIST_HEAD(&async_cow->extents);
1290
+ /*
1291
+ * igrab is called higher up in the call chain, take only the
1292
+ * lightweight reference for the callback lifetime
1293
+ */
1294
+ ihold(&inode->vfs_inode);
1295
+ async_chunk[i].pending = &ctx->num_chunks;
1296
+ async_chunk[i].inode = &inode->vfs_inode;
1297
+ async_chunk[i].start = start;
1298
+ async_chunk[i].end = cur_end;
1299
+ async_chunk[i].write_flags = write_flags;
1300
+ INIT_LIST_HEAD(&async_chunk[i].extents);
12621301
1263
- btrfs_init_work(&async_cow->work,
1264
- btrfs_delalloc_helper,
1265
- async_cow_start, async_cow_submit,
1266
- async_cow_free);
1302
+ /*
1303
+ * The locked_page comes all the way from writepage and its
1304
+ * the original page we were actually given. As we spread
1305
+ * this large delalloc region across multiple async_chunk
1306
+ * structs, only the first struct needs a pointer to locked_page
1307
+ *
1308
+ * This way we don't need racey decisions about who is supposed
1309
+ * to unlock it.
1310
+ */
1311
+ if (locked_page) {
1312
+ /*
1313
+ * Depending on the compressibility, the pages might or
1314
+ * might not go through async. We want all of them to
1315
+ * be accounted against wbc once. Let's do it here
1316
+ * before the paths diverge. wbc accounting is used
1317
+ * only for foreign writeback detection and doesn't
1318
+ * need full accuracy. Just account the whole thing
1319
+ * against the first page.
1320
+ */
1321
+ wbc_account_cgroup_owner(wbc, locked_page,
1322
+ cur_end - start);
1323
+ async_chunk[i].locked_page = locked_page;
1324
+ locked_page = NULL;
1325
+ } else {
1326
+ async_chunk[i].locked_page = NULL;
1327
+ }
12671328
1268
- nr_pages = (cur_end - start + PAGE_SIZE) >>
1269
- PAGE_SHIFT;
1329
+ if (blkcg_css != blkcg_root_css) {
1330
+ css_get(blkcg_css);
1331
+ async_chunk[i].blkcg_css = blkcg_css;
1332
+ } else {
1333
+ async_chunk[i].blkcg_css = NULL;
1334
+ }
1335
+
1336
+ btrfs_init_work(&async_chunk[i].work, async_cow_start,
1337
+ async_cow_submit, async_cow_free);
1338
+
1339
+ nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
12701340 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
12711341
1272
- btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);
1342
+ btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
12731343
12741344 *nr_written += nr_pages;
12751345 start = cur_end + 1;
....@@ -1300,6 +1370,73 @@
13001370 return 1;
13011371 }
13021372
1373
+static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
1374
+ const u64 start, const u64 end,
1375
+ int *page_started, unsigned long *nr_written)
1376
+{
1377
+ const bool is_space_ino = btrfs_is_free_space_inode(inode);
1378
+ const bool is_reloc_ino = (inode->root->root_key.objectid ==
1379
+ BTRFS_DATA_RELOC_TREE_OBJECTID);
1380
+ const u64 range_bytes = end + 1 - start;
1381
+ struct extent_io_tree *io_tree = &inode->io_tree;
1382
+ u64 range_start = start;
1383
+ u64 count;
1384
+
1385
+ /*
1386
+ * If EXTENT_NORESERVE is set it means that when the buffered write was
1387
+ * made we had not enough available data space and therefore we did not
1388
+ * reserve data space for it, since we though we could do NOCOW for the
1389
+ * respective file range (either there is prealloc extent or the inode
1390
+ * has the NOCOW bit set).
1391
+ *
1392
+ * However when we need to fallback to COW mode (because for example the
1393
+ * block group for the corresponding extent was turned to RO mode by a
1394
+ * scrub or relocation) we need to do the following:
1395
+ *
1396
+ * 1) We increment the bytes_may_use counter of the data space info.
1397
+ * If COW succeeds, it allocates a new data extent and after doing
1398
+ * that it decrements the space info's bytes_may_use counter and
1399
+ * increments its bytes_reserved counter by the same amount (we do
1400
+ * this at btrfs_add_reserved_bytes()). So we need to increment the
1401
+ * bytes_may_use counter to compensate (when space is reserved at
1402
+ * buffered write time, the bytes_may_use counter is incremented);
1403
+ *
1404
+ * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1405
+ * that if the COW path fails for any reason, it decrements (through
1406
+ * extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1407
+ * data space info, which we incremented in the step above.
1408
+ *
1409
+ * If we need to fallback to cow and the inode corresponds to a free
1410
+ * space cache inode or an inode of the data relocation tree, we must
1411
+ * also increment bytes_may_use of the data space_info for the same
1412
+ * reason. Space caches and relocated data extents always get a prealloc
1413
+ * extent for them, however scrub or balance may have set the block
1414
+ * group that contains that extent to RO mode and therefore force COW
1415
+ * when starting writeback.
1416
+ */
1417
+ count = count_range_bits(io_tree, &range_start, end, range_bytes,
1418
+ EXTENT_NORESERVE, 0);
1419
+ if (count > 0 || is_space_ino || is_reloc_ino) {
1420
+ u64 bytes = count;
1421
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
1422
+ struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1423
+
1424
+ if (is_space_ino || is_reloc_ino)
1425
+ bytes = range_bytes;
1426
+
1427
+ spin_lock(&sinfo->lock);
1428
+ btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
1429
+ spin_unlock(&sinfo->lock);
1430
+
1431
+ if (count > 0)
1432
+ clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1433
+ 0, 0, NULL);
1434
+ }
1435
+
1436
+ return cow_file_range(inode, locked_page, start, end, page_started,
1437
+ nr_written, 1);
1438
+}
1439
+
13031440 /*
13041441 * when nowcow writeback call back. This checks for snapshots or COW copies
13051442 * of the extents that exist in the file, and COWs the file as required.
....@@ -1307,38 +1444,27 @@
13071444 * If no cow copies or snapshots exist, we write directly to the existing
13081445 * blocks on disk
13091446 */
1310
-static noinline int run_delalloc_nocow(struct inode *inode,
1447
+static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
13111448 struct page *locked_page,
1312
- u64 start, u64 end, int *page_started, int force,
1313
- unsigned long *nr_written)
1449
+ const u64 start, const u64 end,
1450
+ int *page_started, int force,
1451
+ unsigned long *nr_written)
13141452 {
1315
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1316
- struct btrfs_root *root = BTRFS_I(inode)->root;
1317
- struct extent_buffer *leaf;
1453
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
1454
+ struct btrfs_root *root = inode->root;
13181455 struct btrfs_path *path;
1319
- struct btrfs_file_extent_item *fi;
1320
- struct btrfs_key found_key;
1321
- struct extent_map *em;
1322
- u64 cow_start;
1323
- u64 cur_offset;
1324
- u64 extent_end;
1325
- u64 extent_offset;
1326
- u64 disk_bytenr;
1327
- u64 num_bytes;
1328
- u64 disk_num_bytes;
1329
- u64 ram_bytes;
1330
- int extent_type;
1456
+ u64 cow_start = (u64)-1;
1457
+ u64 cur_offset = start;
13311458 int ret;
1332
- int type;
1333
- int nocow;
1334
- int check_prev = 1;
1335
- bool nolock;
1336
- u64 ino = btrfs_ino(BTRFS_I(inode));
1459
+ bool check_prev = true;
1460
+ const bool freespace_inode = btrfs_is_free_space_inode(inode);
1461
+ u64 ino = btrfs_ino(inode);
1462
+ bool nocow = false;
1463
+ u64 disk_bytenr = 0;
13371464
13381465 path = btrfs_alloc_path();
13391466 if (!path) {
1340
- extent_clear_unlock_delalloc(inode, start, end, end,
1341
- locked_page,
1467
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
13421468 EXTENT_LOCKED | EXTENT_DELALLOC |
13431469 EXTENT_DO_ACCOUNTING |
13441470 EXTENT_DEFRAG, PAGE_UNLOCK |
....@@ -1348,15 +1474,29 @@
13481474 return -ENOMEM;
13491475 }
13501476
1351
- nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1352
-
1353
- cow_start = (u64)-1;
1354
- cur_offset = start;
13551477 while (1) {
1478
+ struct btrfs_key found_key;
1479
+ struct btrfs_file_extent_item *fi;
1480
+ struct extent_buffer *leaf;
1481
+ u64 extent_end;
1482
+ u64 extent_offset;
1483
+ u64 num_bytes = 0;
1484
+ u64 disk_num_bytes;
1485
+ u64 ram_bytes;
1486
+ int extent_type;
1487
+
1488
+ nocow = false;
1489
+
13561490 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
13571491 cur_offset, 0);
13581492 if (ret < 0)
13591493 goto error;
1494
+
1495
+ /*
1496
+ * If there is no extent for our range when doing the initial
1497
+ * search, then go back to the previous slot as it will be the
1498
+ * one containing the search offset
1499
+ */
13601500 if (ret > 0 && path->slots[0] > 0 && check_prev) {
13611501 leaf = path->nodes[0];
13621502 btrfs_item_key_to_cpu(leaf, &found_key,
....@@ -1365,8 +1505,9 @@
13651505 found_key.type == BTRFS_EXTENT_DATA_KEY)
13661506 path->slots[0]--;
13671507 }
1368
- check_prev = 0;
1508
+ check_prev = false;
13691509 next_slot:
1510
+ /* Go to next leaf if we have exhausted the current one */
13701511 leaf = path->nodes[0];
13711512 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
13721513 ret = btrfs_next_leaf(root, path);
....@@ -1380,28 +1521,40 @@
13801521 leaf = path->nodes[0];
13811522 }
13821523
1383
- nocow = 0;
1384
- disk_bytenr = 0;
1385
- num_bytes = 0;
13861524 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
13871525
1526
+ /* Didn't find anything for our INO */
13881527 if (found_key.objectid > ino)
13891528 break;
1529
+ /*
1530
+ * Keep searching until we find an EXTENT_ITEM or there are no
1531
+ * more extents for this inode
1532
+ */
13901533 if (WARN_ON_ONCE(found_key.objectid < ino) ||
13911534 found_key.type < BTRFS_EXTENT_DATA_KEY) {
13921535 path->slots[0]++;
13931536 goto next_slot;
13941537 }
1538
+
1539
+ /* Found key is not EXTENT_DATA_KEY or starts after req range */
13951540 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
13961541 found_key.offset > end)
13971542 break;
13981543
1544
+ /*
1545
+ * If the found extent starts after requested offset, then
1546
+ * adjust extent_end to be right before this extent begins
1547
+ */
13991548 if (found_key.offset > cur_offset) {
14001549 extent_end = found_key.offset;
14011550 extent_type = 0;
14021551 goto out_check;
14031552 }
14041553
1554
+ /*
1555
+ * Found extent which begins before our range and potentially
1556
+ * intersect it
1557
+ */
14051558 fi = btrfs_item_ptr(leaf, path->slots[0],
14061559 struct btrfs_file_extent_item);
14071560 extent_type = btrfs_file_extent_type(leaf, fi);
....@@ -1415,31 +1568,41 @@
14151568 btrfs_file_extent_num_bytes(leaf, fi);
14161569 disk_num_bytes =
14171570 btrfs_file_extent_disk_num_bytes(leaf, fi);
1418
- if (extent_end <= start) {
1571
+ /*
1572
+ * If the extent we got ends before our current offset,
1573
+ * skip to the next extent.
1574
+ */
1575
+ if (extent_end <= cur_offset) {
14191576 path->slots[0]++;
14201577 goto next_slot;
14211578 }
1579
+ /* Skip holes */
14221580 if (disk_bytenr == 0)
14231581 goto out_check;
1582
+ /* Skip compressed/encrypted/encoded extents */
14241583 if (btrfs_file_extent_compression(leaf, fi) ||
14251584 btrfs_file_extent_encryption(leaf, fi) ||
14261585 btrfs_file_extent_other_encoding(leaf, fi))
14271586 goto out_check;
14281587 /*
1429
- * Do the same check as in btrfs_cross_ref_exist but
1430
- * without the unnecessary search.
1588
+ * If extent is created before the last volume's snapshot
1589
+ * this implies the extent is shared, hence we can't do
1590
+ * nocow. This is the same check as in
1591
+ * btrfs_cross_ref_exist but without calling
1592
+ * btrfs_search_slot.
14311593 */
1432
- if (!nolock &&
1594
+ if (!freespace_inode &&
14331595 btrfs_file_extent_generation(leaf, fi) <=
14341596 btrfs_root_last_snapshot(&root->root_item))
14351597 goto out_check;
14361598 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
14371599 goto out_check;
1600
+ /* If extent is RO, we must COW it */
14381601 if (btrfs_extent_readonly(fs_info, disk_bytenr))
14391602 goto out_check;
14401603 ret = btrfs_cross_ref_exist(root, ino,
14411604 found_key.offset -
1442
- extent_offset, disk_bytenr);
1605
+ extent_offset, disk_bytenr, false);
14431606 if (ret) {
14441607 /*
14451608 * ret could be -EIO if the above fails to read
....@@ -1451,17 +1614,17 @@
14511614 goto error;
14521615 }
14531616
1454
- WARN_ON_ONCE(nolock);
1617
+ WARN_ON_ONCE(freespace_inode);
14551618 goto out_check;
14561619 }
14571620 disk_bytenr += extent_offset;
14581621 disk_bytenr += cur_offset - found_key.offset;
14591622 num_bytes = min(end + 1, extent_end) - cur_offset;
14601623 /*
1461
- * if there are pending snapshots for this root,
1462
- * we fall into common COW way.
1624
+ * If there are pending snapshots for this root, we
1625
+ * fall into common COW way
14631626 */
1464
- if (!nolock && atomic_read(&root->snapshot_force_cow))
1627
+ if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
14651628 goto out_check;
14661629 /*
14671630 * force cow if csum exists in the range.
....@@ -1480,27 +1643,29 @@
14801643 cur_offset = cow_start;
14811644 goto error;
14821645 }
1483
- WARN_ON_ONCE(nolock);
1646
+ WARN_ON_ONCE(freespace_inode);
14841647 goto out_check;
14851648 }
14861649 if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
14871650 goto out_check;
1488
- nocow = 1;
1651
+ nocow = true;
14891652 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1490
- extent_end = found_key.offset +
1491
- btrfs_file_extent_ram_bytes(leaf, fi);
1492
- extent_end = ALIGN(extent_end,
1493
- fs_info->sectorsize);
1653
+ extent_end = found_key.offset + ram_bytes;
1654
+ extent_end = ALIGN(extent_end, fs_info->sectorsize);
1655
+ /* Skip extents outside of our requested range */
1656
+ if (extent_end <= start) {
1657
+ path->slots[0]++;
1658
+ goto next_slot;
1659
+ }
14941660 } else {
1495
- BUG_ON(1);
1661
+ /* If this triggers then we have a memory corruption */
1662
+ BUG();
14961663 }
14971664 out_check:
1498
- if (extent_end <= start) {
1499
- path->slots[0]++;
1500
- if (nocow)
1501
- btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1502
- goto next_slot;
1503
- }
1665
+ /*
1666
+ * If nocow is false then record the beginning of the range
1667
+ * that needs to be COWed
1668
+ */
15041669 if (!nocow) {
15051670 if (cow_start == (u64)-1)
15061671 cow_start = cur_offset;
....@@ -1512,22 +1677,24 @@
15121677 }
15131678
15141679 btrfs_release_path(path);
1680
+
1681
+ /*
1682
+ * COW range from cow_start to found_key.offset - 1. As the key
1683
+ * will contain the beginning of the first extent that can be
1684
+ * NOCOW, following one which needs to be COW'ed
1685
+ */
15151686 if (cow_start != (u64)-1) {
1516
- ret = cow_file_range(inode, locked_page,
1517
- cow_start, found_key.offset - 1,
1518
- end, page_started, nr_written, 1,
1519
- NULL);
1520
- if (ret) {
1521
- if (nocow)
1522
- btrfs_dec_nocow_writers(fs_info,
1523
- disk_bytenr);
1687
+ ret = fallback_to_cow(inode, locked_page,
1688
+ cow_start, found_key.offset - 1,
1689
+ page_started, nr_written);
1690
+ if (ret)
15241691 goto error;
1525
- }
15261692 cow_start = (u64)-1;
15271693 }
15281694
15291695 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
15301696 u64 orig_start = found_key.offset - extent_offset;
1697
+ struct extent_map *em;
15311698
15321699 em = create_io_em(inode, cur_offset, num_bytes,
15331700 orig_start,
....@@ -1537,26 +1704,32 @@
15371704 ram_bytes, BTRFS_COMPRESS_NONE,
15381705 BTRFS_ORDERED_PREALLOC);
15391706 if (IS_ERR(em)) {
1540
- if (nocow)
1541
- btrfs_dec_nocow_writers(fs_info,
1542
- disk_bytenr);
15431707 ret = PTR_ERR(em);
15441708 goto error;
15451709 }
15461710 free_extent_map(em);
1547
- }
1548
-
1549
- if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1550
- type = BTRFS_ORDERED_PREALLOC;
1711
+ ret = btrfs_add_ordered_extent(inode, cur_offset,
1712
+ disk_bytenr, num_bytes,
1713
+ num_bytes,
1714
+ BTRFS_ORDERED_PREALLOC);
1715
+ if (ret) {
1716
+ btrfs_drop_extent_cache(inode, cur_offset,
1717
+ cur_offset + num_bytes - 1,
1718
+ 0);
1719
+ goto error;
1720
+ }
15511721 } else {
1552
- type = BTRFS_ORDERED_NOCOW;
1722
+ ret = btrfs_add_ordered_extent(inode, cur_offset,
1723
+ disk_bytenr, num_bytes,
1724
+ num_bytes,
1725
+ BTRFS_ORDERED_NOCOW);
1726
+ if (ret)
1727
+ goto error;
15531728 }
15541729
1555
- ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1556
- num_bytes, num_bytes, type);
15571730 if (nocow)
15581731 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1559
- BUG_ON(ret); /* -ENOMEM */
1732
+ nocow = false;
15601733
15611734 if (root->root_key.objectid ==
15621735 BTRFS_DATA_RELOC_TREE_OBJECTID)
....@@ -1569,7 +1742,7 @@
15691742 num_bytes);
15701743
15711744 extent_clear_unlock_delalloc(inode, cur_offset,
1572
- cur_offset + num_bytes - 1, end,
1745
+ cur_offset + num_bytes - 1,
15731746 locked_page, EXTENT_LOCKED |
15741747 EXTENT_DELALLOC |
15751748 EXTENT_CLEAR_DATA_RESV,
....@@ -1594,15 +1767,18 @@
15941767
15951768 if (cow_start != (u64)-1) {
15961769 cur_offset = end;
1597
- ret = cow_file_range(inode, locked_page, cow_start, end, end,
1598
- page_started, nr_written, 1, NULL);
1770
+ ret = fallback_to_cow(inode, locked_page, cow_start, end,
1771
+ page_started, nr_written);
15991772 if (ret)
16001773 goto error;
16011774 }
16021775
16031776 error:
1777
+ if (nocow)
1778
+ btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1779
+
16041780 if (ret && cur_offset < end)
1605
- extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1781
+ extent_clear_unlock_delalloc(inode, cur_offset, end,
16061782 locked_page, EXTENT_LOCKED |
16071783 EXTENT_DELALLOC | EXTENT_DEFRAG |
16081784 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
....@@ -1613,11 +1789,11 @@
16131789 return ret;
16141790 }
16151791
1616
-static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1792
+static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end)
16171793 {
16181794
1619
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1620
- !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1795
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
1796
+ !(inode->flags & BTRFS_INODE_PREALLOC))
16211797 return 0;
16221798
16231799 /*
....@@ -1625,9 +1801,8 @@
16251801 * if is not zero, it means the file is defragging.
16261802 * Force cow if given extent needs to be defragged.
16271803 */
1628
- if (BTRFS_I(inode)->defrag_bytes &&
1629
- test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1630
- EXTENT_DEFRAG, 0, NULL))
1804
+ if (inode->defrag_bytes &&
1805
+ test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL))
16311806 return 1;
16321807
16331808 return 0;
....@@ -1637,31 +1812,27 @@
16371812 * Function to process delayed allocation (create CoW) for ranges which are
16381813 * being touched for the first time.
16391814 */
1640
-int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
1815
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
16411816 u64 start, u64 end, int *page_started, unsigned long *nr_written,
16421817 struct writeback_control *wbc)
16431818 {
1644
- struct inode *inode = private_data;
16451819 int ret;
16461820 int force_cow = need_force_cow(inode, start, end);
1647
- unsigned int write_flags = wbc_to_write_flags(wbc);
16481821
1649
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1822
+ if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) {
16501823 ret = run_delalloc_nocow(inode, locked_page, start, end,
16511824 page_started, 1, nr_written);
1652
- } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1825
+ } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) {
16531826 ret = run_delalloc_nocow(inode, locked_page, start, end,
16541827 page_started, 0, nr_written);
16551828 } else if (!inode_can_compress(inode) ||
16561829 !inode_need_compress(inode, start, end)) {
1657
- ret = cow_file_range(inode, locked_page, start, end, end,
1658
- page_started, nr_written, 1, NULL);
1830
+ ret = cow_file_range(inode, locked_page, start, end,
1831
+ page_started, nr_written, 1);
16591832 } else {
1660
- set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1661
- &BTRFS_I(inode)->runtime_flags);
1662
- ret = cow_file_range_async(inode, locked_page, start, end,
1663
- page_started, nr_written,
1664
- write_flags);
1833
+ set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
1834
+ ret = cow_file_range_async(inode, wbc, locked_page, start, end,
1835
+ page_started, nr_written);
16651836 }
16661837 if (ret)
16671838 btrfs_cleanup_ordered_extents(inode, locked_page, start,
....@@ -1669,10 +1840,9 @@
16691840 return ret;
16701841 }
16711842
1672
-static void btrfs_split_extent_hook(void *private_data,
1673
- struct extent_state *orig, u64 split)
1843
+void btrfs_split_delalloc_extent(struct inode *inode,
1844
+ struct extent_state *orig, u64 split)
16741845 {
1675
- struct inode *inode = private_data;
16761846 u64 size;
16771847
16781848 /* not delalloc, ignore it */
....@@ -1685,7 +1855,7 @@
16851855 u64 new_size;
16861856
16871857 /*
1688
- * See the explanation in btrfs_merge_extent_hook, the same
1858
+ * See the explanation in btrfs_merge_delalloc_extent, the same
16891859 * applies here, just in reverse.
16901860 */
16911861 new_size = orig->end - split + 1;
....@@ -1702,16 +1872,13 @@
17021872 }
17031873
17041874 /*
1705
- * extent_io.c merge_extent_hook, used to track merged delayed allocation
1706
- * extents so we can keep track of new extents that are just merged onto old
1707
- * extents, such as when we are doing sequential writes, so we can properly
1708
- * account for the metadata space we'll need.
1875
+ * Handle merged delayed allocation extents so we can keep track of new extents
1876
+ * that are just merged onto old extents, such as when we are doing sequential
1877
+ * writes, so we can properly account for the metadata space we'll need.
17091878 */
1710
-static void btrfs_merge_extent_hook(void *private_data,
1711
- struct extent_state *new,
1712
- struct extent_state *other)
1879
+void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
1880
+ struct extent_state *other)
17131881 {
1714
- struct inode *inode = private_data;
17151882 u64 new_size, old_size;
17161883 u32 num_extents;
17171884
....@@ -1815,15 +1982,12 @@
18151982 }
18161983
18171984 /*
1818
- * extent_io.c set_bit_hook, used to track delayed allocation
1819
- * bytes in this file, and to maintain the list of inodes that
1820
- * have pending delalloc work to be done.
1985
+ * Properly track delayed allocation bytes in the inode and to maintain the
1986
+ * list of inodes that have pending delalloc work to be done.
18211987 */
1822
-static void btrfs_set_bit_hook(void *private_data,
1823
- struct extent_state *state, unsigned *bits)
1988
+void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
1989
+ unsigned *bits)
18241990 {
1825
- struct inode *inode = private_data;
1826
-
18271991 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
18281992
18291993 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
....@@ -1869,14 +2033,14 @@
18692033 }
18702034
18712035 /*
1872
- * extent_io.c clear_bit_hook, see set_bit_hook for why
2036
+ * Once a range is no longer delalloc this function ensures that proper
2037
+ * accounting happens.
18732038 */
1874
-static void btrfs_clear_bit_hook(void *private_data,
1875
- struct extent_state *state,
1876
- unsigned *bits)
2039
+void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
2040
+ struct extent_state *state, unsigned *bits)
18772041 {
1878
- struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
1879
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
2042
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
2043
+ struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
18802044 u64 len = state->end + 1 - state->start;
18812045 u32 num_extents = count_max_extents(len);
18822046
....@@ -1901,7 +2065,7 @@
19012065
19022066 /*
19032067 * We don't reserve metadata space for space cache inodes so we
1904
- * don't need to call dellalloc_release_metadata if there is an
2068
+ * don't need to call delalloc_release_metadata if there is an
19052069 * error.
19062070 */
19072071 if (*bits & EXTENT_CLEAR_META_RESV &&
....@@ -1915,9 +2079,7 @@
19152079 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
19162080 do_list && !(state->state & EXTENT_NORESERVE) &&
19172081 (*bits & EXTENT_CLEAR_DATA_RESV))
1918
- btrfs_free_reserved_data_space_noquota(
1919
- &inode->vfs_inode,
1920
- state->start, len);
2082
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
19212083
19222084 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
19232085 fs_info->delalloc_batch);
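
percpu_counter_add_batch() is what keeps these set/clear paths cheap: each CPU accumulates a local delta and only folds it into the shared delalloc_bytes total once it crosses fs_info->delalloc_batch. A single-context model of that batching pattern (illustrative only; the kernel's percpu_counter keeps one local per CPU and locks around the fold):

#include <stdint.h>
#include <stdio.h>

struct batched_counter {
        int64_t total;  /* shared total, touched rarely */
        int64_t local;  /* per-context delta (per CPU in the kernel) */
        int64_t batch;  /* fold threshold */
};

static void counter_add(struct batched_counter *c, int64_t delta)
{
        c->local += delta;
        if (c->local >= c->batch || c->local <= -c->batch) {
                c->total += c->local;  /* the kernel takes a spinlock here */
                c->local = 0;
        }
}

int main(void)
{
        struct batched_counter delalloc = { .batch = 1 << 20 };

        counter_add(&delalloc, 4096);   /* a set_delalloc_extent of one page */
        counter_add(&delalloc, -4096);  /* the matching clear */
        printf("total=%lld local=%lld\n",
               (long long)delalloc.total, (long long)delalloc.local);
        return 0;
}
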
....@@ -1940,16 +2102,21 @@
19402102 }
19412103
19422104 /*
1943
- * Merge bio hook, this must check the chunk tree to make sure we don't create
1944
- * bios that span stripes or chunks
2105
+ * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
2106
+ * in a chunk's stripe. This function ensures that bios do not span a
2107
+ * stripe/chunk.
19452108 *
1946
- * return 1 if page cannot be merged to bio
1947
- * return 0 if page can be merged to bio
2109
+ * @page - The page we are about to add to the bio
2110
+ * @size - size we want to add to the bio
2111
+ * @bio - bio we want to ensure is smaller than a stripe
2112
+ * @bio_flags - flags of the bio
2113
+ *
2114
+ * return 1 if page cannot be added to the bio
2115
+ * return 0 if page can be added to the bio
19482116 * return error otherwise
19492117 */
1950
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1951
- size_t size, struct bio *bio,
1952
- unsigned long bio_flags)
2118
+int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
2119
+ unsigned long bio_flags)
19532120 {
19542121 struct inode *inode = page->mapping->host;
19552122 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
....@@ -1957,17 +2124,19 @@
19572124 u64 length = 0;
19582125 u64 map_length;
19592126 int ret;
2127
+ struct btrfs_io_geometry geom;
19602128
19612129 if (bio_flags & EXTENT_BIO_COMPRESSED)
19622130 return 0;
19632131
19642132 length = bio->bi_iter.bi_size;
19652133 map_length = length;
1966
- ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1967
- NULL, 0);
2134
+ ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length,
2135
+ &geom);
19682136 if (ret < 0)
19692137 return ret;
1970
- if (map_length < length + size)
2138
+
2139
+ if (geom.len < length + size)
19712140 return 1;
19722141 return 0;
19732142 }
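
The geometry check above boils down to: how many bytes remain in the stripe that `logical` falls in, and does the current bio plus the new page still fit. A toy model under the assumption of a fixed 64K stripe length (geom.len in the kernel comes from the chunk mapping and varies by RAID profile; both helpers below are stand-ins):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define STRIPE_LEN (64ULL * 1024)  /* assumed fixed stripe length */

/* stand-in for the btrfs_io_geometry.len the kernel computes */
static uint64_t stripe_bytes_left(uint64_t logical)
{
        return STRIPE_LEN - (logical % STRIPE_LEN);
}

/* true means "can be added", mirroring the function's 0 return */
static bool page_fits_in_stripe(uint64_t logical, uint64_t bio_len,
                                uint64_t size)
{
        return stripe_bytes_left(logical) >= bio_len + size;
}

int main(void)
{
        uint64_t logical = 60ULL * 1024;  /* 4K short of a stripe boundary */

        printf("add 4K: %s\n",
               page_fits_in_stripe(logical, 0, 4096) ? "fits" : "start new bio");
        printf("add 8K: %s\n",
               page_fits_in_stripe(logical, 0, 8192) ? "fits" : "start new bio");
        return 0;
}
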
....@@ -1984,34 +2153,8 @@
19842153 u64 bio_offset)
19852154 {
19862155 struct inode *inode = private_data;
1987
- blk_status_t ret = 0;
19882156
1989
- ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1990
- BUG_ON(ret); /* -ENOMEM */
1991
- return 0;
1992
-}
1993
-
1994
-/*
1995
- * in order to insert checksums into the metadata in large chunks,
1996
- * we wait until bio submission time. All the pages in the bio are
1997
- * checksummed and sums are attached onto the ordered extent record.
1998
- *
1999
- * At IO completion time the cums attached on the ordered extent record
2000
- * are inserted into the btree
2001
- */
2002
-blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
2003
- int mirror_num)
2004
-{
2005
- struct inode *inode = private_data;
2006
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2007
- blk_status_t ret;
2008
-
2009
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
2010
- if (ret) {
2011
- bio->bi_status = ret;
2012
- bio_endio(bio);
2013
- }
2014
- return ret;
2157
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
20152158 }
20162159
20172160 /*
....@@ -2032,11 +2175,10 @@
20322175 *
20332176 * c-3) otherwise: async submit
20342177 */
2035
-static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
2036
- int mirror_num, unsigned long bio_flags,
2037
- u64 bio_offset)
2178
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
2179
+ int mirror_num, unsigned long bio_flags)
20382181 {
2039
- struct inode *inode = private_data;
20402182 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
20412183 struct btrfs_root *root = BTRFS_I(inode)->root;
20422184 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
....@@ -2060,7 +2202,7 @@
20602202 bio_flags);
20612203 goto out;
20622204 } else if (!skip_sum) {
2063
- ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2205
+ ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL);
20642206 if (ret)
20652207 goto out;
20662208 }
....@@ -2071,17 +2213,16 @@
20712213 goto mapit;
20722214 /* we're doing a write, do the async checksumming */
20732215 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
2074
- bio_offset, inode,
2075
- btrfs_submit_bio_start);
2216
+ 0, inode, btrfs_submit_bio_start);
20762217 goto out;
20772218 } else if (!skip_sum) {
2078
- ret = btrfs_csum_one_bio(inode, bio, 0, 0);
2219
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
20792220 if (ret)
20802221 goto out;
20812222 }
20822223
20832224 mapit:
2084
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
2225
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
20852226
20862227 out:
20872228 if (ret) {
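
The c-1/c-2/c-3 cases in the comment above amount to a small decision table: reads look up existing checksums, writes either compute them inline or hand the bio to the async workers. A sketch of that policy with illustrative flags (none of these fields are the kernel's; async_disabled stands in for the metadata/sync checks btrfs actually performs):

#include <stdbool.h>
#include <stdio.h>

struct submit_ctx {
        bool is_write;
        bool skip_sum;       /* NODATASUM inode or free-space inode */
        bool async_disabled; /* sync write / checksum offload not wanted */
};

static const char *submit_policy(const struct submit_ctx *c)
{
        if (!c->is_write)
                return c->skip_sum ? "map directly" : "lookup csums, then map";
        if (c->async_disabled)
                return c->skip_sum ? "map directly" : "csum inline, then map";
        return "queue for async checksumming";  /* c-3 */
}

int main(void)
{
        struct submit_ctx read  = { .is_write = false };
        struct submit_ctx write = { .is_write = true };

        printf("read:  %s\n", submit_policy(&read));
        printf("write: %s\n", submit_policy(&write));
        return 0;
}
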
....@@ -2095,16 +2236,15 @@
20952236 * given a list of ordered sums record them in the inode. This happens
20962237 * at IO completion time based on sums calculated at bio submission time.
20972238 */
2098
-static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2099
- struct inode *inode, struct list_head *list)
2239
+static int add_pending_csums(struct btrfs_trans_handle *trans,
2240
+ struct list_head *list)
21002241 {
21012242 struct btrfs_ordered_sum *sum;
21022243 int ret;
21032244
21042245 list_for_each_entry(sum, list, list) {
21052246 trans->adding_csums = true;
2106
- ret = btrfs_csum_file_blocks(trans,
2107
- BTRFS_I(inode)->root->fs_info->csum_root, sum);
2247
+ ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
21082248 trans->adding_csums = false;
21092249 if (ret)
21102250 return ret;
....@@ -2112,18 +2252,77 @@
21122252 return 0;
21132253 }
21142254
2115
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2116
- unsigned int extra_bits,
2117
- struct extent_state **cached_state, int dedupe)
2255
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2256
+ const u64 start,
2257
+ const u64 len,
2258
+ struct extent_state **cached_state)
21182259 {
2119
- WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2120
- return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2121
- extra_bits, cached_state);
2260
+ u64 search_start = start;
2261
+ const u64 end = start + len - 1;
2262
+
2263
+ while (search_start < end) {
2264
+ const u64 search_len = end - search_start + 1;
2265
+ struct extent_map *em;
2266
+ u64 em_len;
2267
+ int ret = 0;
2268
+
2269
+ em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
2270
+ if (IS_ERR(em))
2271
+ return PTR_ERR(em);
2272
+
2273
+ if (em->block_start != EXTENT_MAP_HOLE)
2274
+ goto next;
2275
+
2276
+ em_len = em->len;
2277
+ if (em->start < search_start)
2278
+ em_len -= search_start - em->start;
2279
+ if (em_len > search_len)
2280
+ em_len = search_len;
2281
+
2282
+ ret = set_extent_bit(&inode->io_tree, search_start,
2283
+ search_start + em_len - 1,
2284
+ EXTENT_DELALLOC_NEW,
2285
+ NULL, cached_state, GFP_NOFS);
2286
+next:
2287
+ search_start = extent_map_end(em);
2288
+ free_extent_map(em);
2289
+ if (ret)
2290
+ return ret;
2291
+ }
2292
+ return 0;
2293
+}
2294
+
2295
+int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2296
+ unsigned int extra_bits,
2297
+ struct extent_state **cached_state)
2298
+{
2299
+ WARN_ON(PAGE_ALIGNED(end));
2300
+
2301
+ if (start >= i_size_read(&inode->vfs_inode) &&
2302
+ !(inode->flags & BTRFS_INODE_PREALLOC)) {
2303
+ /*
2304
+ * There can't be any extents following eof in this case so just
2305
+ * set the delalloc new bit for the range directly.
2306
+ */
2307
+ extra_bits |= EXTENT_DELALLOC_NEW;
2308
+ } else {
2309
+ int ret;
2310
+
2311
+ ret = btrfs_find_new_delalloc_bytes(inode, start,
2312
+ end + 1 - start,
2313
+ cached_state);
2314
+ if (ret)
2315
+ return ret;
2316
+ }
2317
+
2318
+ return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
2319
+ cached_state);
21222320 }
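
btrfs_find_new_delalloc_bytes() walks the extent maps covering the range and tags only the hole portions with EXTENT_DELALLOC_NEW; ranges entirely beyond EOF skip the walk because nothing can be mapped there. A userspace sketch of the hole walk, with a flat array standing in for btrfs_get_extent() lookups:

#include <stdint.h>
#include <stdio.h>

struct ext { uint64_t start, len; int hole; };

static void mark_new_delalloc(const struct ext *map, int n,
                              uint64_t start, uint64_t end)
{
        uint64_t pos = start;

        for (int i = 0; i < n && pos <= end; i++) {
                uint64_t e_end = map[i].start + map[i].len;  /* exclusive */

                if (e_end <= pos)
                        continue;
                uint64_t lo = pos > map[i].start ? pos : map[i].start;
                uint64_t hi = e_end - 1 < end ? e_end - 1 : end;
                if (map[i].hole)  /* where EXTENT_DELALLOC_NEW would be set */
                        printf("DELALLOC_NEW on [%llu, %llu]\n",
                               (unsigned long long)lo, (unsigned long long)hi);
                pos = e_end;
        }
}

int main(void)
{
        /* 0-4K written, 4K-12K hole, 12K-16K written */
        struct ext map[] = {
                { 0, 4096, 0 }, { 4096, 8192, 1 }, { 12288, 4096, 0 },
        };

        mark_new_delalloc(map, 3, 0, 16383);
        return 0;
}
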
21232321
21242322 /* see btrfs_writepage_start_hook for details on why this is required */
21252323 struct btrfs_writepage_fixup {
21262324 struct page *page;
2325
+ struct inode *inode;
21272326 struct btrfs_work work;
21282327 };
21292328
....@@ -2134,75 +2333,126 @@
21342333 struct extent_state *cached_state = NULL;
21352334 struct extent_changeset *data_reserved = NULL;
21362335 struct page *page;
2137
- struct inode *inode;
2336
+ struct btrfs_inode *inode;
21382337 u64 page_start;
21392338 u64 page_end;
2140
- int ret;
2339
+ int ret = 0;
2340
+ bool free_delalloc_space = true;
21412341
21422342 fixup = container_of(work, struct btrfs_writepage_fixup, work);
21432343 page = fixup->page;
2144
-again:
2145
- lock_page(page);
2146
- if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2147
- ClearPageChecked(page);
2148
- goto out_page;
2149
- }
2150
-
2151
- inode = page->mapping->host;
2344
+ inode = BTRFS_I(fixup->inode);
21522345 page_start = page_offset(page);
21532346 page_end = page_offset(page) + PAGE_SIZE - 1;
21542347
2155
- lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2156
- &cached_state);
2348
+ /*
2349
+ * This is similar to page_mkwrite, we need to reserve the space before
2350
+ * we take the page lock.
2351
+ */
2352
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2353
+ PAGE_SIZE);
2354
+again:
2355
+ lock_page(page);
2356
+
2357
+ /*
2358
+ * Before we queued this fixup, we took a reference on the page.
2359
+ * page->mapping may go NULL, but it shouldn't be moved to a different
2360
+ * address space.
2361
+ */
2362
+ if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2363
+ /*
2364
+ * Unfortunately this is a little tricky, either
2365
+ *
2366
+ * 1) We got here and our page had already been dealt with and
2367
+ * we reserved our space, thus ret == 0, so we need to just
2368
+ * drop our space reservation and bail. This can happen the
2369
+ * first time we come into the fixup worker, or could happen
2370
+ * while waiting for the ordered extent.
2371
+ * 2) Our page was already dealt with, but we happened to get an
2372
+ * ENOSPC above from the btrfs_delalloc_reserve_space. In
2373
+ * this case we obviously don't have anything to release, but
2374
+ * because the page was already dealt with we don't want to
2375
+ * mark the page with an error, so make sure we're resetting
2376
+ * ret to 0. This is why we have this check _before_ the ret
2377
+ * check, because we do not want to have a surprise ENOSPC
2378
+ * when the page was already properly dealt with.
2379
+ */
2380
+ if (!ret) {
2381
+ btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2382
+ btrfs_delalloc_release_space(inode, data_reserved,
2383
+ page_start, PAGE_SIZE,
2384
+ true);
2385
+ }
2386
+ ret = 0;
2387
+ goto out_page;
2388
+ }
2389
+
2390
+ /*
2391
+ * We can't mess with the page state unless it is locked, so now that
2392
+ * it is locked bail if we failed to make our space reservation.
2393
+ */
2394
+ if (ret)
2395
+ goto out_page;
2396
+
2397
+ lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
21572398
21582399 /* already ordered? We're done */
21592400 if (PagePrivate2(page))
2160
- goto out;
2401
+ goto out_reserved;
21612402
2162
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2163
- PAGE_SIZE);
2403
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
21642404 if (ordered) {
2165
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2166
- page_end, &cached_state);
2405
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
2406
+ &cached_state);
21672407 unlock_page(page);
2168
- btrfs_start_ordered_extent(inode, ordered, 1);
2408
+ btrfs_start_ordered_extent(ordered, 1);
21692409 btrfs_put_ordered_extent(ordered);
21702410 goto again;
21712411 }
21722412
2173
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2174
- PAGE_SIZE);
2175
- if (ret) {
2176
- mapping_set_error(page->mapping, ret);
2177
- end_extent_writepage(page, ret, page_start, page_end);
2178
- ClearPageChecked(page);
2179
- goto out;
2180
- }
2181
-
21822413 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2183
- &cached_state, 0);
2184
- if (ret) {
2185
- mapping_set_error(page->mapping, ret);
2186
- end_extent_writepage(page, ret, page_start, page_end);
2187
- ClearPageChecked(page);
2188
- goto out_reserved;
2189
- }
2190
-
2191
- ClearPageChecked(page);
2192
- set_page_dirty(page);
2193
-out_reserved:
2194
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
2414
+ &cached_state);
21952415 if (ret)
2416
+ goto out_reserved;
2417
+
2418
+ /*
2419
+ * Everything went as planned, we're now the owner of a dirty page with
2420
+ * delayed allocation bits set and space reserved for our COW
2421
+ * destination.
2422
+ *
2423
+ * The page was dirty when we started, nothing should have cleaned it.
2424
+ */
2425
+ BUG_ON(!PageDirty(page));
2426
+ free_delalloc_space = false;
2427
+out_reserved:
2428
+ btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2429
+ if (free_delalloc_space)
21962430 btrfs_delalloc_release_space(inode, data_reserved, page_start,
21972431 PAGE_SIZE, true);
2198
-out:
2199
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2432
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
22002433 &cached_state);
22012434 out_page:
2435
+ if (ret) {
2436
+ /*
2437
+ * We hit ENOSPC or other errors. Update the mapping and page
2438
+ * to reflect the errors and clean the page.
2439
+ */
2440
+ mapping_set_error(page->mapping, ret);
2441
+ end_extent_writepage(page, ret, page_start, page_end);
2442
+ clear_page_dirty_for_io(page);
2443
+ SetPageError(page);
2444
+ }
2445
+ ClearPageChecked(page);
22022446 unlock_page(page);
22032447 put_page(page);
22042448 kfree(fixup);
22052449 extent_changeset_free(data_reserved);
2450
+ /*
2451
+ * As a precaution, do a delayed iput in case it would be the last iput
2452
+ * that could need flushing space. Recursing back to fixup worker would
2453
+ * deadlock.
2454
+ */
2455
+ btrfs_add_delayed_iput(&inode->vfs_inode);
22062456 }
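
The control flow the comments above describe, reduced to its shape: reserve before locking (the reservation may block or flush), re-check the page under the lock, and hand a successful reservation back if the page turned out to be already handled. A minimal non-kernel model (all helpers hypothetical):

#include <stdbool.h>
#include <stdio.h>

static int reserve_space(void)          { return 0; /* 0 or -ENOSPC */ }
static void release_space(void)         { puts("released reservation"); }
static bool page_still_needs_fix(void)  { return false; /* raced: done */ }

static int fixup_worker(void)
{
        int ret = reserve_space();  /* taken before the page lock */

        /* lock_page() would happen here */
        if (!page_still_needs_fix()) {
                if (!ret)           /* case 1: reserved, nothing left to do */
                        release_space();
                return 0;           /* case 2: a late ENOSPC is also fine */
        }
        if (ret)
                return ret;         /* locked but no space: fail the page */

        puts("set delalloc, keep the reservation for the COW destination");
        return 0;
}

int main(void) { return fixup_worker(); }
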
22072457
22082458 /*
....@@ -2216,7 +2466,7 @@
22162466 * to fix it up. The async helper will wait for ordered extents, set
22172467 * the delalloc bit and make it safe to write the page.
22182468 */
2219
-static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2469
+int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
22202470 {
22212471 struct inode *inode = page->mapping->host;
22222472 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
....@@ -2226,6 +2476,13 @@
22262476 if (TestClearPagePrivate2(page))
22272477 return 0;
22282478
2479
+ /*
2480
+ * PageChecked is set below when we create a fixup worker for this page;
2481
+ * don't try to create another one if we're already PageChecked().
2482
+ *
2483
+ * The extent_io writepage code will redirty the page if we send back
2484
+ * EAGAIN.
2485
+ */
22292486 if (PageChecked(page))
22302487 return -EAGAIN;
22312488
....@@ -2233,28 +2490,36 @@
22332490 if (!fixup)
22342491 return -EAGAIN;
22352492
2493
+ /*
2494
+ * We are already holding a reference to this inode from
2495
+ * write_cache_pages. We need to hold it because the space reservation
2496
+ * takes place outside of the page lock, and we can't trust
2497
+ * page->mapping outside of the page lock.
2498
+ */
2499
+ ihold(inode);
22362500 SetPageChecked(page);
22372501 get_page(page);
2238
- btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2239
- btrfs_writepage_fixup_worker, NULL, NULL);
2502
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
22402503 fixup->page = page;
2504
+ fixup->inode = inode;
22412505 btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2242
- return -EBUSY;
2506
+
2507
+ return -EAGAIN;
22432508 }
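
Pulled out of the page-flag details, the fixup entry point is a three-way protocol: PagePrivate2 means an ordered extent already covers the page (the write may proceed), PageChecked means a fixup is already queued, and -EAGAIN asks the writepage path to redirty and move on. A sketch with plain booleans in place of the page bits:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_page { bool private2, checked; };

static int cow_fixup(struct fake_page *p)
{
        if (p->private2) {   /* ordered extent present: write can proceed */
                p->private2 = false;
                return 0;
        }
        if (p->checked)      /* a fixup is already queued: just redirty */
                return -EAGAIN;
        p->checked = true;   /* queue the fixup worker exactly once */
        return -EAGAIN;
}

int main(void)
{
        struct fake_page p = { 0 };

        printf("first pass:  %d\n", cow_fixup(&p));  /* -EAGAIN, queued */
        printf("second pass: %d\n", cow_fixup(&p));  /* -EAGAIN, no requeue */
        return 0;
}
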
22442509
22452510 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2246
- struct inode *inode, u64 file_pos,
2247
- u64 disk_bytenr, u64 disk_num_bytes,
2248
- u64 num_bytes, u64 ram_bytes,
2249
- u8 compression, u8 encryption,
2250
- u16 other_encoding, int extent_type)
2511
+ struct btrfs_inode *inode, u64 file_pos,
2512
+ struct btrfs_file_extent_item *stack_fi,
2513
+ u64 qgroup_reserved)
22512514 {
2252
- struct btrfs_root *root = BTRFS_I(inode)->root;
2253
- struct btrfs_file_extent_item *fi;
2515
+ struct btrfs_root *root = inode->root;
22542516 struct btrfs_path *path;
22552517 struct extent_buffer *leaf;
22562518 struct btrfs_key ins;
2257
- u64 qg_released;
2519
+ u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2520
+ u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2521
+ u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2522
+ u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
22582523 int extent_inserted = 0;
22592524 int ret;
22602525
....@@ -2273,709 +2538,52 @@
22732538 */
22742539 ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
22752540 file_pos + num_bytes, NULL, 0,
2276
- 1, sizeof(*fi), &extent_inserted);
2541
+ 1, sizeof(*stack_fi), &extent_inserted);
22772542 if (ret)
22782543 goto out;
22792544
22802545 if (!extent_inserted) {
2281
- ins.objectid = btrfs_ino(BTRFS_I(inode));
2546
+ ins.objectid = btrfs_ino(inode);
22822547 ins.offset = file_pos;
22832548 ins.type = BTRFS_EXTENT_DATA_KEY;
22842549
22852550 path->leave_spinning = 1;
22862551 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2287
- sizeof(*fi));
2552
+ sizeof(*stack_fi));
22882553 if (ret)
22892554 goto out;
22902555 }
22912556 leaf = path->nodes[0];
2292
- fi = btrfs_item_ptr(leaf, path->slots[0],
2293
- struct btrfs_file_extent_item);
2294
- btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2295
- btrfs_set_file_extent_type(leaf, fi, extent_type);
2296
- btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2297
- btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2298
- btrfs_set_file_extent_offset(leaf, fi, 0);
2299
- btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2300
- btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2301
- btrfs_set_file_extent_compression(leaf, fi, compression);
2302
- btrfs_set_file_extent_encryption(leaf, fi, encryption);
2303
- btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2557
+ btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2558
+ write_extent_buffer(leaf, stack_fi,
2559
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
2560
+ sizeof(struct btrfs_file_extent_item));
23042561
23052562 btrfs_mark_buffer_dirty(leaf);
23062563 btrfs_release_path(path);
23072564
2308
- inode_add_bytes(inode, num_bytes);
2565
+ inode_add_bytes(&inode->vfs_inode, num_bytes);
23092566
23102567 ins.objectid = disk_bytenr;
23112568 ins.offset = disk_num_bytes;
23122569 ins.type = BTRFS_EXTENT_ITEM_KEY;
23132570
2314
- /*
2315
- * Release the reserved range from inode dirty range map, as it is
2316
- * already moved into delayed_ref_head
2317
- */
2318
- ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2319
- if (ret < 0)
2320
- goto out;
2321
- qg_released = ret;
2322
- ret = btrfs_alloc_reserved_file_extent(trans, root,
2323
- btrfs_ino(BTRFS_I(inode)),
2324
- file_pos, qg_released, &ins);
2325
-out:
2326
- btrfs_free_path(path);
2327
-
2328
- return ret;
2329
-}
2330
-
2331
-/* snapshot-aware defrag */
2332
-struct sa_defrag_extent_backref {
2333
- struct rb_node node;
2334
- struct old_sa_defrag_extent *old;
2335
- u64 root_id;
2336
- u64 inum;
2337
- u64 file_pos;
2338
- u64 extent_offset;
2339
- u64 num_bytes;
2340
- u64 generation;
2341
-};
2342
-
2343
-struct old_sa_defrag_extent {
2344
- struct list_head list;
2345
- struct new_sa_defrag_extent *new;
2346
-
2347
- u64 extent_offset;
2348
- u64 bytenr;
2349
- u64 offset;
2350
- u64 len;
2351
- int count;
2352
-};
2353
-
2354
-struct new_sa_defrag_extent {
2355
- struct rb_root root;
2356
- struct list_head head;
2357
- struct btrfs_path *path;
2358
- struct inode *inode;
2359
- u64 file_pos;
2360
- u64 len;
2361
- u64 bytenr;
2362
- u64 disk_len;
2363
- u8 compress_type;
2364
-};
2365
-
2366
-static int backref_comp(struct sa_defrag_extent_backref *b1,
2367
- struct sa_defrag_extent_backref *b2)
2368
-{
2369
- if (b1->root_id < b2->root_id)
2370
- return -1;
2371
- else if (b1->root_id > b2->root_id)
2372
- return 1;
2373
-
2374
- if (b1->inum < b2->inum)
2375
- return -1;
2376
- else if (b1->inum > b2->inum)
2377
- return 1;
2378
-
2379
- if (b1->file_pos < b2->file_pos)
2380
- return -1;
2381
- else if (b1->file_pos > b2->file_pos)
2382
- return 1;
2383
-
2384
- /*
2385
- * [------------------------------] ===> (a range of space)
2386
- * |<--->| |<---->| =============> (fs/file tree A)
2387
- * |<---------------------------->| ===> (fs/file tree B)
2388
- *
2389
- * A range of space can refer to two file extents in one tree while
2390
- * refer to only one file extent in another tree.
2391
- *
2392
- * So we may process a disk offset more than one time(two extents in A)
2393
- * and locate at the same extent(one extent in B), then insert two same
2394
- * backrefs(both refer to the extent in B).
2395
- */
2396
- return 0;
2397
-}
2398
-
2399
-static void backref_insert(struct rb_root *root,
2400
- struct sa_defrag_extent_backref *backref)
2401
-{
2402
- struct rb_node **p = &root->rb_node;
2403
- struct rb_node *parent = NULL;
2404
- struct sa_defrag_extent_backref *entry;
2405
- int ret;
2406
-
2407
- while (*p) {
2408
- parent = *p;
2409
- entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2410
-
2411
- ret = backref_comp(backref, entry);
2412
- if (ret < 0)
2413
- p = &(*p)->rb_left;
2414
- else
2415
- p = &(*p)->rb_right;
2416
- }
2417
-
2418
- rb_link_node(&backref->node, parent, p);
2419
- rb_insert_color(&backref->node, root);
2420
-}
2421
-
2422
-/*
2423
- * Note the backref might has changed, and in this case we just return 0.
2424
- */
2425
-static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2426
- void *ctx)
2427
-{
2428
- struct btrfs_file_extent_item *extent;
2429
- struct old_sa_defrag_extent *old = ctx;
2430
- struct new_sa_defrag_extent *new = old->new;
2431
- struct btrfs_path *path = new->path;
2432
- struct btrfs_key key;
2433
- struct btrfs_root *root;
2434
- struct sa_defrag_extent_backref *backref;
2435
- struct extent_buffer *leaf;
2436
- struct inode *inode = new->inode;
2437
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2438
- int slot;
2439
- int ret;
2440
- u64 extent_offset;
2441
- u64 num_bytes;
2442
-
2443
- if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2444
- inum == btrfs_ino(BTRFS_I(inode)))
2445
- return 0;
2446
-
2447
- key.objectid = root_id;
2448
- key.type = BTRFS_ROOT_ITEM_KEY;
2449
- key.offset = (u64)-1;
2450
-
2451
- root = btrfs_read_fs_root_no_name(fs_info, &key);
2452
- if (IS_ERR(root)) {
2453
- if (PTR_ERR(root) == -ENOENT)
2454
- return 0;
2455
- WARN_ON(1);
2456
- btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2457
- inum, offset, root_id);
2458
- return PTR_ERR(root);
2459
- }
2460
-
2461
- key.objectid = inum;
2462
- key.type = BTRFS_EXTENT_DATA_KEY;
2463
- if (offset > (u64)-1 << 32)
2464
- key.offset = 0;
2465
- else
2466
- key.offset = offset;
2467
-
2468
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2469
- if (WARN_ON(ret < 0))
2470
- return ret;
2471
- ret = 0;
2472
-
2473
- while (1) {
2474
- cond_resched();
2475
-
2476
- leaf = path->nodes[0];
2477
- slot = path->slots[0];
2478
-
2479
- if (slot >= btrfs_header_nritems(leaf)) {
2480
- ret = btrfs_next_leaf(root, path);
2481
- if (ret < 0) {
2482
- goto out;
2483
- } else if (ret > 0) {
2484
- ret = 0;
2485
- goto out;
2486
- }
2487
- continue;
2488
- }
2489
-
2490
- path->slots[0]++;
2491
-
2492
- btrfs_item_key_to_cpu(leaf, &key, slot);
2493
-
2494
- if (key.objectid > inum)
2495
- goto out;
2496
-
2497
- if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2498
- continue;
2499
-
2500
- extent = btrfs_item_ptr(leaf, slot,
2501
- struct btrfs_file_extent_item);
2502
-
2503
- if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2504
- continue;
2505
-
2506
- /*
2507
- * 'offset' refers to the exact key.offset,
2508
- * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2509
- * (key.offset - extent_offset).
2510
- */
2511
- if (key.offset != offset)
2512
- continue;
2513
-
2514
- extent_offset = btrfs_file_extent_offset(leaf, extent);
2515
- num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2516
-
2517
- if (extent_offset >= old->extent_offset + old->offset +
2518
- old->len || extent_offset + num_bytes <=
2519
- old->extent_offset + old->offset)
2520
- continue;
2521
- break;
2522
- }
2523
-
2524
- backref = kmalloc(sizeof(*backref), GFP_NOFS);
2525
- if (!backref) {
2526
- ret = -ENOENT;
2527
- goto out;
2528
- }
2529
-
2530
- backref->root_id = root_id;
2531
- backref->inum = inum;
2532
- backref->file_pos = offset;
2533
- backref->num_bytes = num_bytes;
2534
- backref->extent_offset = extent_offset;
2535
- backref->generation = btrfs_file_extent_generation(leaf, extent);
2536
- backref->old = old;
2537
- backref_insert(&new->root, backref);
2538
- old->count++;
2539
-out:
2540
- btrfs_release_path(path);
2541
- WARN_ON(ret);
2542
- return ret;
2543
-}
2544
-
2545
-static noinline bool record_extent_backrefs(struct btrfs_path *path,
2546
- struct new_sa_defrag_extent *new)
2547
-{
2548
- struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2549
- struct old_sa_defrag_extent *old, *tmp;
2550
- int ret;
2551
-
2552
- new->path = path;
2553
-
2554
- list_for_each_entry_safe(old, tmp, &new->head, list) {
2555
- ret = iterate_inodes_from_logical(old->bytenr +
2556
- old->extent_offset, fs_info,
2557
- path, record_one_backref,
2558
- old, false);
2559
- if (ret < 0 && ret != -ENOENT)
2560
- return false;
2561
-
2562
- /* no backref to be processed for this extent */
2563
- if (!old->count) {
2564
- list_del(&old->list);
2565
- kfree(old);
2566
- }
2567
- }
2568
-
2569
- if (list_empty(&new->head))
2570
- return false;
2571
-
2572
- return true;
2573
-}
2574
-
2575
-static int relink_is_mergable(struct extent_buffer *leaf,
2576
- struct btrfs_file_extent_item *fi,
2577
- struct new_sa_defrag_extent *new)
2578
-{
2579
- if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2580
- return 0;
2581
-
2582
- if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2583
- return 0;
2584
-
2585
- if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2586
- return 0;
2587
-
2588
- if (btrfs_file_extent_encryption(leaf, fi) ||
2589
- btrfs_file_extent_other_encoding(leaf, fi))
2590
- return 0;
2591
-
2592
- return 1;
2593
-}
2594
-
2595
-/*
2596
- * Note the backref might has changed, and in this case we just return 0.
2597
- */
2598
-static noinline int relink_extent_backref(struct btrfs_path *path,
2599
- struct sa_defrag_extent_backref *prev,
2600
- struct sa_defrag_extent_backref *backref)
2601
-{
2602
- struct btrfs_file_extent_item *extent;
2603
- struct btrfs_file_extent_item *item;
2604
- struct btrfs_ordered_extent *ordered;
2605
- struct btrfs_trans_handle *trans;
2606
- struct btrfs_root *root;
2607
- struct btrfs_key key;
2608
- struct extent_buffer *leaf;
2609
- struct old_sa_defrag_extent *old = backref->old;
2610
- struct new_sa_defrag_extent *new = old->new;
2611
- struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2612
- struct inode *inode;
2613
- struct extent_state *cached = NULL;
2614
- int ret = 0;
2615
- u64 start;
2616
- u64 len;
2617
- u64 lock_start;
2618
- u64 lock_end;
2619
- bool merge = false;
2620
- int index;
2621
-
2622
- if (prev && prev->root_id == backref->root_id &&
2623
- prev->inum == backref->inum &&
2624
- prev->file_pos + prev->num_bytes == backref->file_pos)
2625
- merge = true;
2626
-
2627
- /* step 1: get root */
2628
- key.objectid = backref->root_id;
2629
- key.type = BTRFS_ROOT_ITEM_KEY;
2630
- key.offset = (u64)-1;
2631
-
2632
- index = srcu_read_lock(&fs_info->subvol_srcu);
2633
-
2634
- root = btrfs_read_fs_root_no_name(fs_info, &key);
2635
- if (IS_ERR(root)) {
2636
- srcu_read_unlock(&fs_info->subvol_srcu, index);
2637
- if (PTR_ERR(root) == -ENOENT)
2638
- return 0;
2639
- return PTR_ERR(root);
2640
- }
2641
-
2642
- if (btrfs_root_readonly(root)) {
2643
- srcu_read_unlock(&fs_info->subvol_srcu, index);
2644
- return 0;
2645
- }
2646
-
2647
- /* step 2: get inode */
2648
- key.objectid = backref->inum;
2649
- key.type = BTRFS_INODE_ITEM_KEY;
2650
- key.offset = 0;
2651
-
2652
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2653
- if (IS_ERR(inode)) {
2654
- srcu_read_unlock(&fs_info->subvol_srcu, index);
2655
- return 0;
2656
- }
2657
-
2658
- srcu_read_unlock(&fs_info->subvol_srcu, index);
2659
-
2660
- /* step 3: relink backref */
2661
- lock_start = backref->file_pos;
2662
- lock_end = backref->file_pos + backref->num_bytes - 1;
2663
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2664
- &cached);
2665
-
2666
- ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2667
- if (ordered) {
2668
- btrfs_put_ordered_extent(ordered);
2669
- goto out_unlock;
2670
- }
2671
-
2672
- trans = btrfs_join_transaction(root);
2673
- if (IS_ERR(trans)) {
2674
- ret = PTR_ERR(trans);
2675
- goto out_unlock;
2676
- }
2677
-
2678
- key.objectid = backref->inum;
2679
- key.type = BTRFS_EXTENT_DATA_KEY;
2680
- key.offset = backref->file_pos;
2681
-
2682
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2683
- if (ret < 0) {
2684
- goto out_free_path;
2685
- } else if (ret > 0) {
2686
- ret = 0;
2687
- goto out_free_path;
2688
- }
2689
-
2690
- extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2691
- struct btrfs_file_extent_item);
2692
-
2693
- if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2694
- backref->generation)
2695
- goto out_free_path;
2696
-
2697
- btrfs_release_path(path);
2698
-
2699
- start = backref->file_pos;
2700
- if (backref->extent_offset < old->extent_offset + old->offset)
2701
- start += old->extent_offset + old->offset -
2702
- backref->extent_offset;
2703
-
2704
- len = min(backref->extent_offset + backref->num_bytes,
2705
- old->extent_offset + old->offset + old->len);
2706
- len -= max(backref->extent_offset, old->extent_offset + old->offset);
2707
-
2708
- ret = btrfs_drop_extents(trans, root, inode, start,
2709
- start + len, 1);
2571
+ ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
27102572 if (ret)
2711
- goto out_free_path;
2712
-again:
2713
- key.objectid = btrfs_ino(BTRFS_I(inode));
2714
- key.type = BTRFS_EXTENT_DATA_KEY;
2715
- key.offset = start;
2716
-
2717
- path->leave_spinning = 1;
2718
- if (merge) {
2719
- struct btrfs_file_extent_item *fi;
2720
- u64 extent_len;
2721
- struct btrfs_key found_key;
2722
-
2723
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2724
- if (ret < 0)
2725
- goto out_free_path;
2726
-
2727
- path->slots[0]--;
2728
- leaf = path->nodes[0];
2729
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2730
-
2731
- fi = btrfs_item_ptr(leaf, path->slots[0],
2732
- struct btrfs_file_extent_item);
2733
- extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2734
-
2735
- if (extent_len + found_key.offset == start &&
2736
- relink_is_mergable(leaf, fi, new)) {
2737
- btrfs_set_file_extent_num_bytes(leaf, fi,
2738
- extent_len + len);
2739
- btrfs_mark_buffer_dirty(leaf);
2740
- inode_add_bytes(inode, len);
2741
-
2742
- ret = 1;
2743
- goto out_free_path;
2744
- } else {
2745
- merge = false;
2746
- btrfs_release_path(path);
2747
- goto again;
2748
- }
2749
- }
2750
-
2751
- ret = btrfs_insert_empty_item(trans, root, path, &key,
2752
- sizeof(*extent));
2753
- if (ret) {
2754
- btrfs_abort_transaction(trans, ret);
2755
- goto out_free_path;
2756
- }
2757
-
2758
- leaf = path->nodes[0];
2759
- item = btrfs_item_ptr(leaf, path->slots[0],
2760
- struct btrfs_file_extent_item);
2761
- btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2762
- btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2763
- btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2764
- btrfs_set_file_extent_num_bytes(leaf, item, len);
2765
- btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2766
- btrfs_set_file_extent_generation(leaf, item, trans->transid);
2767
- btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2768
- btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2769
- btrfs_set_file_extent_encryption(leaf, item, 0);
2770
- btrfs_set_file_extent_other_encoding(leaf, item, 0);
2771
-
2772
- btrfs_mark_buffer_dirty(leaf);
2773
- inode_add_bytes(inode, len);
2774
- btrfs_release_path(path);
2775
-
2776
- ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2777
- new->disk_len, 0,
2778
- backref->root_id, backref->inum,
2779
- new->file_pos); /* start - extent_offset */
2780
- if (ret) {
2781
- btrfs_abort_transaction(trans, ret);
2782
- goto out_free_path;
2783
- }
2784
-
2785
- ret = 1;
2786
-out_free_path:
2787
- btrfs_release_path(path);
2788
- path->leave_spinning = 0;
2789
- btrfs_end_transaction(trans);
2790
-out_unlock:
2791
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2792
- &cached);
2793
- iput(inode);
2794
- return ret;
2795
-}
2796
-
2797
-static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2798
-{
2799
- struct old_sa_defrag_extent *old, *tmp;
2800
-
2801
- if (!new)
2802
- return;
2803
-
2804
- list_for_each_entry_safe(old, tmp, &new->head, list) {
2805
- kfree(old);
2806
- }
2807
- kfree(new);
2808
-}
2809
-
2810
-static void relink_file_extents(struct new_sa_defrag_extent *new)
2811
-{
2812
- struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2813
- struct btrfs_path *path;
2814
- struct sa_defrag_extent_backref *backref;
2815
- struct sa_defrag_extent_backref *prev = NULL;
2816
- struct inode *inode;
2817
- struct rb_node *node;
2818
- int ret;
2819
-
2820
- inode = new->inode;
2821
-
2822
- path = btrfs_alloc_path();
2823
- if (!path)
2824
- return;
2825
-
2826
- if (!record_extent_backrefs(path, new)) {
2827
- btrfs_free_path(path);
28282573 goto out;
2829
- }
2830
- btrfs_release_path(path);
28312574
2832
- while (1) {
2833
- node = rb_first(&new->root);
2834
- if (!node)
2835
- break;
2836
- rb_erase(node, &new->root);
2837
-
2838
- backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2839
-
2840
- ret = relink_extent_backref(path, prev, backref);
2841
- WARN_ON(ret < 0);
2842
-
2843
- kfree(prev);
2844
-
2845
- if (ret == 1)
2846
- prev = backref;
2847
- else
2848
- prev = NULL;
2849
- cond_resched();
2850
- }
2851
- kfree(prev);
2852
-
2853
- btrfs_free_path(path);
2575
+ ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
2576
+ file_pos, qgroup_reserved, &ins);
28542577 out:
2855
- free_sa_defrag_extent(new);
2856
-
2857
- atomic_dec(&fs_info->defrag_running);
2858
- wake_up(&fs_info->transaction_wait);
2859
-}
2860
-
2861
-static struct new_sa_defrag_extent *
2862
-record_old_file_extents(struct inode *inode,
2863
- struct btrfs_ordered_extent *ordered)
2864
-{
2865
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2866
- struct btrfs_root *root = BTRFS_I(inode)->root;
2867
- struct btrfs_path *path;
2868
- struct btrfs_key key;
2869
- struct old_sa_defrag_extent *old;
2870
- struct new_sa_defrag_extent *new;
2871
- int ret;
2872
-
2873
- new = kmalloc(sizeof(*new), GFP_NOFS);
2874
- if (!new)
2875
- return NULL;
2876
-
2877
- new->inode = inode;
2878
- new->file_pos = ordered->file_offset;
2879
- new->len = ordered->len;
2880
- new->bytenr = ordered->start;
2881
- new->disk_len = ordered->disk_len;
2882
- new->compress_type = ordered->compress_type;
2883
- new->root = RB_ROOT;
2884
- INIT_LIST_HEAD(&new->head);
2885
-
2886
- path = btrfs_alloc_path();
2887
- if (!path)
2888
- goto out_kfree;
2889
-
2890
- key.objectid = btrfs_ino(BTRFS_I(inode));
2891
- key.type = BTRFS_EXTENT_DATA_KEY;
2892
- key.offset = new->file_pos;
2893
-
2894
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2895
- if (ret < 0)
2896
- goto out_free_path;
2897
- if (ret > 0 && path->slots[0] > 0)
2898
- path->slots[0]--;
2899
-
2900
- /* find out all the old extents for the file range */
2901
- while (1) {
2902
- struct btrfs_file_extent_item *extent;
2903
- struct extent_buffer *l;
2904
- int slot;
2905
- u64 num_bytes;
2906
- u64 offset;
2907
- u64 end;
2908
- u64 disk_bytenr;
2909
- u64 extent_offset;
2910
-
2911
- l = path->nodes[0];
2912
- slot = path->slots[0];
2913
-
2914
- if (slot >= btrfs_header_nritems(l)) {
2915
- ret = btrfs_next_leaf(root, path);
2916
- if (ret < 0)
2917
- goto out_free_path;
2918
- else if (ret > 0)
2919
- break;
2920
- continue;
2921
- }
2922
-
2923
- btrfs_item_key_to_cpu(l, &key, slot);
2924
-
2925
- if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2926
- break;
2927
- if (key.type != BTRFS_EXTENT_DATA_KEY)
2928
- break;
2929
- if (key.offset >= new->file_pos + new->len)
2930
- break;
2931
-
2932
- extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2933
-
2934
- num_bytes = btrfs_file_extent_num_bytes(l, extent);
2935
- if (key.offset + num_bytes < new->file_pos)
2936
- goto next;
2937
-
2938
- disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2939
- if (!disk_bytenr)
2940
- goto next;
2941
-
2942
- extent_offset = btrfs_file_extent_offset(l, extent);
2943
-
2944
- old = kmalloc(sizeof(*old), GFP_NOFS);
2945
- if (!old)
2946
- goto out_free_path;
2947
-
2948
- offset = max(new->file_pos, key.offset);
2949
- end = min(new->file_pos + new->len, key.offset + num_bytes);
2950
-
2951
- old->bytenr = disk_bytenr;
2952
- old->extent_offset = extent_offset;
2953
- old->offset = offset - key.offset;
2954
- old->len = end - offset;
2955
- old->new = new;
2956
- old->count = 0;
2957
- list_add_tail(&old->list, &new->head);
2958
-next:
2959
- path->slots[0]++;
2960
- cond_resched();
2961
- }
2962
-
29632578 btrfs_free_path(path);
2964
- atomic_inc(&fs_info->defrag_running);
29652579
2966
- return new;
2967
-
2968
-out_free_path:
2969
- btrfs_free_path(path);
2970
-out_kfree:
2971
- free_sa_defrag_extent(new);
2972
- return NULL;
2580
+ return ret;
29732581 }
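
The refactor above replaces nine per-field setters on the mapped leaf with one item built on the stack and copied in wholesale, which is the point of the stack_fi parameter. A userspace sketch of the pattern (the struct layout here is illustrative; the real on-disk item is little-endian and defined in ctree.h):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct file_extent_item {  /* stand-in for btrfs_file_extent_item */
        uint64_t generation, disk_bytenr, disk_num_bytes;
        uint64_t num_bytes, ram_bytes;
        uint8_t type, compression;
} __attribute__((packed));

int main(void)
{
        unsigned char leaf[4096];  /* stand-in for the extent buffer */
        size_t item_off = 512;     /* stand-in for btrfs_item_ptr_offset() */

        /* build the whole item on the stack first... */
        struct file_extent_item stack_fi = {
                .generation = 1234, .disk_bytenr = 1 << 20,
                .disk_num_bytes = 4096, .num_bytes = 4096, .ram_bytes = 4096,
                .type = 1 /* REG */, .compression = 0,
        };

        /* ...then land it with a single copy, as write_extent_buffer() does */
        memcpy(leaf + item_off, &stack_fi, sizeof(stack_fi));
        printf("wrote %zu bytes at offset %zu\n", sizeof(stack_fi), item_off);
        return 0;
}
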
29742582
29752583 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
29762584 u64 start, u64 len)
29772585 {
2978
- struct btrfs_block_group_cache *cache;
2586
+ struct btrfs_block_group *cache;
29792587
29802588 cache = btrfs_lookup_block_group(fs_info, start);
29812589 ASSERT(cache);
....@@ -2987,7 +2595,33 @@
29872595 btrfs_put_block_group(cache);
29882596 }
29892597
2990
-/* as ordered data IO finishes, this gets called so we can finish
2598
+static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
2599
+ struct btrfs_ordered_extent *oe)
2600
+{
2601
+ struct btrfs_file_extent_item stack_fi;
2602
+ u64 logical_len;
2603
+
2604
+ memset(&stack_fi, 0, sizeof(stack_fi));
2605
+ btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
2606
+ btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
2607
+ btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
2608
+ oe->disk_num_bytes);
2609
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
2610
+ logical_len = oe->truncated_len;
2611
+ else
2612
+ logical_len = oe->num_bytes;
2613
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len);
2614
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len);
2615
+ btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
2616
+ /* Encryption and other encoding are reserved and all 0 */
2617
+
2618
+ return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
2619
+ oe->file_offset, &stack_fi,
2620
+ oe->qgroup_rsv);
2621
+}
2622
+
2623
+/*
2624
+ * As ordered data IO finishes, this gets called so we can finish
29912625 * an ordered extent if the range of bytes in the file it covers are
29922626 * fully written.
29932627 */
....@@ -2999,32 +2633,33 @@
29992633 struct btrfs_trans_handle *trans = NULL;
30002634 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
30012635 struct extent_state *cached_state = NULL;
3002
- struct new_sa_defrag_extent *new = NULL;
2636
+ u64 start, end;
30032637 int compress_type = 0;
30042638 int ret = 0;
3005
- u64 logical_len = ordered_extent->len;
3006
- bool nolock;
2639
+ u64 logical_len = ordered_extent->num_bytes;
2640
+ bool freespace_inode;
30072641 bool truncated = false;
30082642 bool range_locked = false;
30092643 bool clear_new_delalloc_bytes = false;
30102644 bool clear_reserved_extent = true;
2645
+ unsigned int clear_bits;
2646
+
2647
+ start = ordered_extent->file_offset;
2648
+ end = start + ordered_extent->num_bytes - 1;
30112649
30122650 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
30132651 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
30142652 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
30152653 clear_new_delalloc_bytes = true;
30162654
3017
- nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2655
+ freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
30182656
30192657 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
30202658 ret = -EIO;
30212659 goto out;
30222660 }
30232661
3024
- btrfs_free_io_failure_record(BTRFS_I(inode),
3025
- ordered_extent->file_offset,
3026
- ordered_extent->file_offset +
3027
- ordered_extent->len - 1);
2662
+ btrfs_free_io_failure_record(BTRFS_I(inode), start, end);
30282663
30292664 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
30302665 truncated = true;
....@@ -3037,16 +2672,9 @@
30372672 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
30382673 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
30392674
3040
- /*
3041
- * For mwrite(mmap + memset to write) case, we still reserve
3042
- * space for NOCOW range.
3043
- * As NOCOW won't cause a new delayed ref, just free the space
3044
- */
3045
- btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3046
- ordered_extent->len);
3047
- btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3048
- if (nolock)
3049
- trans = btrfs_join_transaction_nolock(root);
2675
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
2676
+ if (freespace_inode)
2677
+ trans = btrfs_join_transaction_spacecache(root);
30502678 else
30512679 trans = btrfs_join_transaction(root);
30522680 if (IS_ERR(trans)) {
....@@ -3062,26 +2690,10 @@
30622690 }
30632691
30642692 range_locked = true;
3065
- lock_extent_bits(io_tree, ordered_extent->file_offset,
3066
- ordered_extent->file_offset + ordered_extent->len - 1,
3067
- &cached_state);
2693
+ lock_extent_bits(io_tree, start, end, &cached_state);
30682694
3069
- ret = test_range_bit(io_tree, ordered_extent->file_offset,
3070
- ordered_extent->file_offset + ordered_extent->len - 1,
3071
- EXTENT_DEFRAG, 0, cached_state);
3072
- if (ret) {
3073
- u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
3074
- if (0 && last_snapshot >= BTRFS_I(inode)->generation)
3075
- /* the inode is shared */
3076
- new = record_old_file_extents(inode, ordered_extent);
3077
-
3078
- clear_extent_bit(io_tree, ordered_extent->file_offset,
3079
- ordered_extent->file_offset + ordered_extent->len - 1,
3080
- EXTENT_DEFRAG, 0, 0, &cached_state);
3081
- }
3082
-
3083
- if (nolock)
3084
- trans = btrfs_join_transaction_nolock(root);
2695
+ if (freespace_inode)
2696
+ trans = btrfs_join_transaction_spacecache(root);
30852697 else
30862698 trans = btrfs_join_transaction(root);
30872699 if (IS_ERR(trans)) {
....@@ -3096,43 +2708,35 @@
30962708 compress_type = ordered_extent->compress_type;
30972709 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
30982710 BUG_ON(compress_type);
3099
- btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3100
- ordered_extent->len);
31012711 ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
31022712 ordered_extent->file_offset,
31032713 ordered_extent->file_offset +
31042714 logical_len);
31052715 } else {
31062716 BUG_ON(root == fs_info->tree_root);
3107
- ret = insert_reserved_file_extent(trans, inode,
3108
- ordered_extent->file_offset,
3109
- ordered_extent->start,
3110
- ordered_extent->disk_len,
3111
- logical_len, logical_len,
3112
- compress_type, 0, 0,
3113
- BTRFS_FILE_EXTENT_REG);
2717
+ ret = insert_ordered_extent_file_extent(trans, ordered_extent);
31142718 if (!ret) {
31152719 clear_reserved_extent = false;
31162720 btrfs_release_delalloc_bytes(fs_info,
3117
- ordered_extent->start,
3118
- ordered_extent->disk_len);
2721
+ ordered_extent->disk_bytenr,
2722
+ ordered_extent->disk_num_bytes);
31192723 }
31202724 }
31212725 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3122
- ordered_extent->file_offset, ordered_extent->len,
3123
- trans->transid);
2726
+ ordered_extent->file_offset,
2727
+ ordered_extent->num_bytes, trans->transid);
31242728 if (ret < 0) {
31252729 btrfs_abort_transaction(trans, ret);
31262730 goto out;
31272731 }
31282732
3129
- ret = add_pending_csums(trans, inode, &ordered_extent->list);
2733
+ ret = add_pending_csums(trans, &ordered_extent->list);
31302734 if (ret) {
31312735 btrfs_abort_transaction(trans, ret);
31322736 goto out;
31332737 }
31342738
3135
- btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2739
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
31362740 ret = btrfs_update_inode_fallback(trans, root, inode);
31372741 if (ret) { /* -ENOMEM or corruption */
31382742 btrfs_abort_transaction(trans, ret);
....@@ -3140,27 +2744,20 @@
31402744 }
31412745 ret = 0;
31422746 out:
3143
- if (range_locked || clear_new_delalloc_bytes) {
3144
- unsigned int clear_bits = 0;
3145
-
3146
- if (range_locked)
3147
- clear_bits |= EXTENT_LOCKED;
3148
- if (clear_new_delalloc_bytes)
3149
- clear_bits |= EXTENT_DELALLOC_NEW;
3150
- clear_extent_bit(&BTRFS_I(inode)->io_tree,
3151
- ordered_extent->file_offset,
3152
- ordered_extent->file_offset +
3153
- ordered_extent->len - 1,
3154
- clear_bits,
3155
- (clear_bits & EXTENT_LOCKED) ? 1 : 0,
3156
- 0, &cached_state);
3157
- }
2747
+ clear_bits = EXTENT_DEFRAG;
2748
+ if (range_locked)
2749
+ clear_bits |= EXTENT_LOCKED;
2750
+ if (clear_new_delalloc_bytes)
2751
+ clear_bits |= EXTENT_DELALLOC_NEW;
2752
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits,
2753
+ (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
2754
+ &cached_state);
31582755
31592756 if (trans)
31602757 btrfs_end_transaction(trans);
31612758
31622759 if (ret || truncated) {
3163
- u64 start, end;
2760
+ u64 unwritten_start = start;
31642761
31652762 /*
31662763 * If we failed to finish this ordered extent for any reason we
....@@ -3175,14 +2772,11 @@
31752772 mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
31762773
31772774 if (truncated)
3178
- start = ordered_extent->file_offset + logical_len;
3179
- else
3180
- start = ordered_extent->file_offset;
3181
- end = ordered_extent->file_offset + ordered_extent->len - 1;
3182
- clear_extent_uptodate(io_tree, start, end, NULL);
2775
+ unwritten_start += logical_len;
2776
+ clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
31832777
31842778 /* Drop the cache for the part of the extent we didn't write. */
3185
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
2779
+ btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0);
31862780
31872781 /*
31882782 * If the ordered extent had an IOERR or something else went
....@@ -3197,28 +2791,27 @@
31972791 if ((ret || !logical_len) &&
31982792 clear_reserved_extent &&
31992793 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3200
- !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2794
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2795
+ /*
2796
+ * Discard the range before returning it back to the
2797
+ * free space pool
2798
+ */
2799
+ if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
2800
+ btrfs_discard_extent(fs_info,
2801
+ ordered_extent->disk_bytenr,
2802
+ ordered_extent->disk_num_bytes,
2803
+ NULL);
32012804 btrfs_free_reserved_extent(fs_info,
3202
- ordered_extent->start,
3203
- ordered_extent->disk_len, 1);
2805
+ ordered_extent->disk_bytenr,
2806
+ ordered_extent->disk_num_bytes, 1);
2807
+ }
32042808 }
3205
-
32062809
32072810 /*
32082811 * This needs to be done to make sure anybody waiting knows we are done
32092812 * updating everything for this ordered extent.
32102813 */
3211
- btrfs_remove_ordered_extent(inode, ordered_extent);
3212
-
3213
- /* for snapshot-aware defrag */
3214
- if (new) {
3215
- if (ret) {
3216
- free_sa_defrag_extent(new);
3217
- atomic_dec(&fs_info->defrag_running);
3218
- } else {
3219
- relink_file_extents(new);
3220
- }
3221
- }
2814
+ btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent);
32222815
32232816 /* once for us */
32242817 btrfs_put_ordered_extent(ordered_extent);
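
The cleanup consolidation in this function's out: label always clears EXTENT_DEFRAG now and ORs in the optional bits, instead of guarding the whole clear_extent_bit() call. The bit composition on its own (bit values are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

#define EXTENT_DEFRAG        (1u << 0)
#define EXTENT_LOCKED        (1u << 1)
#define EXTENT_DELALLOC_NEW  (1u << 2)

static unsigned int cleanup_bits(bool range_locked, bool clear_new)
{
        unsigned int clear_bits = EXTENT_DEFRAG;  /* unconditionally dropped */

        if (range_locked)
                clear_bits |= EXTENT_LOCKED;
        if (clear_new)
                clear_bits |= EXTENT_DELALLOC_NEW;
        return clear_bits;
}

int main(void)
{
        printf("nocow/prealloc write: 0x%x\n", cleanup_bits(true, false));
        printf("plain COW write:      0x%x\n", cleanup_bits(true, true));
        return 0;
}
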
....@@ -3235,14 +2828,13 @@
32352828 btrfs_finish_ordered_io(ordered_extent);
32362829 }
32372830
3238
-static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
3239
- struct extent_state *state, int uptodate)
2831
+void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
2832
+ u64 end, int uptodate)
32402833 {
3241
- struct inode *inode = page->mapping->host;
3242
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2834
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
2835
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
32432836 struct btrfs_ordered_extent *ordered_extent = NULL;
32442837 struct btrfs_workqueue *wq;
3245
- btrfs_work_func_t func;
32462838
32472839 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
32482840
....@@ -3251,34 +2843,34 @@
32512843 end - start + 1, uptodate))
32522844 return;
32532845
3254
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
2846
+ if (btrfs_is_free_space_inode(inode))
32552847 wq = fs_info->endio_freespace_worker;
3256
- func = btrfs_freespace_write_helper;
3257
- } else {
2848
+ else
32582849 wq = fs_info->endio_write_workers;
3259
- func = btrfs_endio_write_helper;
3260
- }
32612850
3262
- btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3263
- NULL);
2851
+ btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
32642852 btrfs_queue_work(wq, &ordered_extent->work);
32652853 }
32662854
3267
-static int __readpage_endio_check(struct inode *inode,
3268
- struct btrfs_io_bio *io_bio,
3269
- int icsum, struct page *page,
3270
- int pgoff, u64 start, size_t len)
2855
+static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
2856
+ int icsum, struct page *page, int pgoff, u64 start,
2857
+ size_t len)
32712858 {
2859
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2860
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
32722861 char *kaddr;
3273
- u32 csum_expected;
3274
- u32 csum = ~(u32)0;
2862
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
2863
+ u8 *csum_expected;
2864
+ u8 csum[BTRFS_CSUM_SIZE];
32752865
3276
- csum_expected = *(((u32 *)io_bio->csum) + icsum);
2866
+ csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size;
32772867
32782868 kaddr = kmap_atomic(page);
3279
- csum = btrfs_csum_data(kaddr + pgoff, csum, len);
3280
- btrfs_csum_final(csum, (u8 *)&csum);
3281
- if (csum != csum_expected)
2869
+ shash->tfm = fs_info->csum_shash;
2870
+
2871
+ crypto_shash_digest(shash, kaddr + pgoff, len, csum);
2872
+
2873
+ if (memcmp(csum, csum_expected, csum_size))
32822874 goto zeroit;
32832875
32842876 kunmap_atomic(kaddr);
....@@ -3286,6 +2878,9 @@
32862878 zeroit:
32872879 btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
32882880 io_bio->mirror_num);
2881
+ if (io_bio->device)
2882
+ btrfs_dev_stat_inc_and_print(io_bio->device,
2883
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
32892884 memset(kaddr + pgoff, 1, len);
32902885 flush_dcache_page(page);
32912886 kunmap_atomic(kaddr);
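
check_data_csum()'s shape after the crypto_shash conversion: digest the block, compare csum_size bytes against the expected sum at index icsum, and poison the data on mismatch so the caller can try other mirrors. A self-contained userspace analogue, with a bitwise CRC32C standing in for the kernel's crypto_shash digest (btrfs's default csum is crc32c, but the sizes below are this sketch's, not fs_info's):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32c(const uint8_t *buf, size_t len)
{
        uint32_t crc = ~0u;

        while (len--) {
                crc ^= *buf++;
                for (int k = 0; k < 8; k++)
                        crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1));
        }
        return ~crc;
}

/* 0 on match; on mismatch the block is poisoned and -1 returned */
static int check_data_csum(uint8_t *data, size_t len,
                           const uint8_t *csums, size_t csum_size, int icsum)
{
        uint8_t csum[4];
        uint32_t c = crc32c(data, len);

        memcpy(csum, &c, sizeof(csum));
        if (!memcmp(csum, csums + (size_t)icsum * csum_size, csum_size))
                return 0;

        memset(data, 1, len);  /* kernel also flushes dcache and logs */
        return -1;
}

int main(void)
{
        uint8_t block[4096] = { 0 };
        uint8_t sums[4];
        uint32_t good = crc32c(block, sizeof(block));

        memcpy(sums, &good, sizeof(sums));
        printf("clean:   %d\n", check_data_csum(block, sizeof(block), sums, 4, 0));
        block[0] = 0xff;
        printf("corrupt: %d\n", check_data_csum(block, sizeof(block), sums, 4, 0));
        return 0;
}
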
....@@ -3297,9 +2892,8 @@
32972892 * if there's a match, we allow the bio to finish. If not, the code in
32982893 * extent_io.c will try to find good copies for us.
32992894 */
3300
-static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
3301
- u64 phy_offset, struct page *page,
3302
- u64 start, u64 end, int mirror)
2895
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
2896
+ struct page *page, u64 start, u64 end, int mirror)
33032897 {
33042898 size_t offset = start - page_offset(page);
33052899 struct inode *inode = page->mapping->host;
....@@ -3321,8 +2915,8 @@
33212915 }
33222916
33232917 phy_offset >>= inode->i_sb->s_blocksize_bits;
3324
- return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
3325
- start, (size_t)(end - start + 1));
2918
+ return check_data_csum(inode, io_bio, phy_offset, page, offset, start,
2919
+ (size_t)(end - start + 1));
33262920 }
33272921
33282922 /*
....@@ -3343,10 +2937,35 @@
33432937 if (atomic_add_unless(&inode->i_count, -1, 1))
33442938 return;
33452939
2940
+ atomic_inc(&fs_info->nr_delayed_iputs);
33462941 spin_lock(&fs_info->delayed_iput_lock);
33472942 ASSERT(list_empty(&binode->delayed_iput));
33482943 list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
33492944 spin_unlock(&fs_info->delayed_iput_lock);
2945
+ if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
2946
+ wake_up_process(fs_info->cleaner_kthread);
2947
+}
2948
+
2949
+static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
2950
+ struct btrfs_inode *inode)
2951
+{
2952
+ list_del_init(&inode->delayed_iput);
2953
+ spin_unlock(&fs_info->delayed_iput_lock);
2954
+ iput(&inode->vfs_inode);
2955
+ if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
2956
+ wake_up(&fs_info->delayed_iputs_wait);
2957
+ spin_lock(&fs_info->delayed_iput_lock);
2958
+}
2959
+
2960
+static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
2961
+ struct btrfs_inode *inode)
2962
+{
2963
+ if (!list_empty(&inode->delayed_iput)) {
2964
+ spin_lock(&fs_info->delayed_iput_lock);
2965
+ if (!list_empty(&inode->delayed_iput))
2966
+ run_delayed_iput_locked(fs_info, inode);
2967
+ spin_unlock(&fs_info->delayed_iput_lock);
2968
+ }
33502969 }
33512970
33522971 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
....@@ -3358,12 +2977,29 @@
33582977
33592978 inode = list_first_entry(&fs_info->delayed_iputs,
33602979 struct btrfs_inode, delayed_iput);
3361
- list_del_init(&inode->delayed_iput);
3362
- spin_unlock(&fs_info->delayed_iput_lock);
3363
- iput(&inode->vfs_inode);
3364
- spin_lock(&fs_info->delayed_iput_lock);
2980
+ run_delayed_iput_locked(fs_info, inode);
2981
+ cond_resched_lock(&fs_info->delayed_iput_lock);
33652982 }
33662983 spin_unlock(&fs_info->delayed_iput_lock);
2984
+}
2985
+
2986
+/**
2987
+ * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
2988
+ * @fs_info - the fs_info for this fs
2989
+ * @return - EINTR if we were killed, 0 if nothing's pending
2990
+ *
2991
+ * This will wait on any delayed iputs that are currently running with KILLABLE
2992
+ * set. Once they are all done running we will return, unless we are killed in
2993
+ * which case we return EINTR. This helps in user operations like fallocate etc
2994
+ * that might get blocked on the iputs.
2995
+ */
2996
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
2997
+{
2998
+ int ret = wait_event_killable(fs_info->delayed_iputs_wait,
2999
+ atomic_read(&fs_info->nr_delayed_iputs) == 0);
3000
+ if (ret)
3001
+ return -EINTR;
3002
+ return 0;
33673003 }
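
The new wait helper pairs the nr_delayed_iputs counter with a wait queue: queuing an iput bumps the count, running one drops it and wakes waiters when it hits zero. A userspace model with a condition variable in place of the kernel wait queue (no KILLABLE/-EINTR handling here):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static int nr_delayed_iputs;

static void add_delayed_iput(void)
{
        pthread_mutex_lock(&lock);
        nr_delayed_iputs++;
        pthread_mutex_unlock(&lock);
}

static void run_delayed_iput(void)
{
        pthread_mutex_lock(&lock);
        if (--nr_delayed_iputs == 0)
                pthread_cond_broadcast(&done);  /* wake_up(delayed_iputs_wait) */
        pthread_mutex_unlock(&lock);
}

static void wait_on_delayed_iputs(void)
{
        pthread_mutex_lock(&lock);
        while (nr_delayed_iputs != 0)
                pthread_cond_wait(&done, &lock);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        add_delayed_iput();
        run_delayed_iput();
        wait_on_delayed_iputs();  /* returns at once: nothing pending */
        puts("no pending delayed iputs");
        return 0;
}
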
33683004
33693005 /*
@@ -3471,14 +3107,13 @@
 		found_key.objectid = found_key.offset;
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
-		inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
+		inode = btrfs_iget(fs_info->sb, last_objectid, root);
 		ret = PTR_ERR_OR_ZERO(inode);
 		if (ret && ret != -ENOENT)
 			goto out;
 
 		if (ret == -ENOENT && root == fs_info->tree_root) {
 			struct btrfs_root *dead_root;
-			struct btrfs_fs_info *fs_info = root->fs_info;
 			int is_dead_root = 0;
 
 			/*
@@ -3490,18 +3125,16 @@
 			 * orphan must not get deleted.
 			 * find_dead_roots already ran before us, so if this
 			 * is a snapshot deletion, we should find the root
-			 * in the dead_roots list
+			 * in the fs_roots radix tree.
 			 */
-			spin_lock(&fs_info->trans_lock);
-			list_for_each_entry(dead_root, &fs_info->dead_roots,
-					    root_list) {
-				if (dead_root->root_key.objectid ==
-				    found_key.objectid) {
-					is_dead_root = 1;
-					break;
-				}
-			}
-			spin_unlock(&fs_info->trans_lock);
+
+			spin_lock(&fs_info->fs_roots_radix_lock);
+			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
+					(unsigned long)found_key.objectid);
+			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
+				is_dead_root = 1;
+			spin_unlock(&fs_info->fs_roots_radix_lock);
+
 			if (is_dead_root) {
 				/* prevent this orphan from being found again */
 				key.offset = found_key.objectid - 1;
@@ -3551,8 +3184,6 @@
 
 		/* this will do delete_inode and everything for us */
 		iput(inode);
-		if (ret)
-			goto out;
 	}
 	/* release the path since we're done with it */
 	btrfs_release_path(path);
@@ -3694,6 +3325,8 @@
 	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
 	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
 	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
+	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
+			round_up(i_size_read(inode), fs_info->sectorsize));
 
 	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
 	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
@@ -3764,21 +3397,14 @@
 	 * inode is not a directory, logging its parent unnecessarily.
 	 */
 	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+
 	/*
-	 * Similar reasoning for last_link_trans, needs to be set otherwise
-	 * for a case like the following:
-	 *
-	 * mkdir A
-	 * touch foo
-	 * ln foo A/bar
-	 * echo 2 > /proc/sys/vm/drop_caches
-	 * fsync foo
-	 * <power failure>
-	 *
-	 * Would result in link bar and directory A not existing after the power
-	 * failure.
+	 * Same logic as for last_unlink_trans. We don't persist the generation
+	 * of the last transaction where this inode was used for a reflink
+	 * operation, so after eviction and reloading the inode we must be
+	 * pessimistic and assume the last transaction that modified the inode.
 	 */
-	BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans;
+	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
 
 	path->slots[0]++;
 	if (inode->i_nlink != 1 ||
@@ -3827,7 +3453,6 @@
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
-		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
 		break;
@@ -3838,7 +3463,7 @@
 	case S_IFLNK:
 		inode->i_op = &btrfs_symlink_inode_operations;
 		inode_nohighmem(inode);
-		inode->i_mapping->a_ops = &btrfs_symlink_aops;
+		inode->i_mapping->a_ops = &btrfs_aops;
 		break;
 	default:
 		inode->i_op = &btrfs_special_inode_operations;
@@ -3860,45 +3485,42 @@
 {
 	struct btrfs_map_token token;
 
-	btrfs_init_map_token(&token);
+	btrfs_init_map_token(&token, leaf);
 
-	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
-	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
-	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
-				   &token);
-	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
-	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
+	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
+	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
+	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
+	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
+	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
 
-	btrfs_set_token_timespec_sec(leaf, &item->atime,
-				     inode->i_atime.tv_sec, &token);
-	btrfs_set_token_timespec_nsec(leaf, &item->atime,
-				      inode->i_atime.tv_nsec, &token);
+	btrfs_set_token_timespec_sec(&token, &item->atime,
+				     inode->i_atime.tv_sec);
+	btrfs_set_token_timespec_nsec(&token, &item->atime,
+				      inode->i_atime.tv_nsec);
 
-	btrfs_set_token_timespec_sec(leaf, &item->mtime,
-				     inode->i_mtime.tv_sec, &token);
-	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
-				      inode->i_mtime.tv_nsec, &token);
+	btrfs_set_token_timespec_sec(&token, &item->mtime,
				     inode->i_mtime.tv_sec);
+	btrfs_set_token_timespec_nsec(&token, &item->mtime,
+				      inode->i_mtime.tv_nsec);
 
-	btrfs_set_token_timespec_sec(leaf, &item->ctime,
-				     inode->i_ctime.tv_sec, &token);
-	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
-				      inode->i_ctime.tv_nsec, &token);
+	btrfs_set_token_timespec_sec(&token, &item->ctime,
+				     inode->i_ctime.tv_sec);
+	btrfs_set_token_timespec_nsec(&token, &item->ctime,
+				      inode->i_ctime.tv_nsec);
 
-	btrfs_set_token_timespec_sec(leaf, &item->otime,
-				     BTRFS_I(inode)->i_otime.tv_sec, &token);
-	btrfs_set_token_timespec_nsec(leaf, &item->otime,
-				      BTRFS_I(inode)->i_otime.tv_nsec, &token);
+	btrfs_set_token_timespec_sec(&token, &item->otime,
+				     BTRFS_I(inode)->i_otime.tv_sec);
+	btrfs_set_token_timespec_nsec(&token, &item->otime,
+				      BTRFS_I(inode)->i_otime.tv_nsec);
 
-	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
-				     &token);
-	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
-					 &token);
-	btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
-				       &token);
-	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
-	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
-	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
-	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
+	btrfs_set_token_inode_generation(&token, item,
+					 BTRFS_I(inode)->generation);
+	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
+	btrfs_set_token_inode_transid(&token, item, trans->transid);
+	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+	btrfs_set_token_inode_block_group(&token, item, 0);
 }
 
 /*
@@ -3931,7 +3553,7 @@
 
 	fill_inode_item(trans, leaf, inode_item, inode);
 	btrfs_mark_buffer_dirty(leaf);
-	btrfs_set_inode_last_trans(trans, inode);
+	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
 	ret = 0;
 failed:
 	btrfs_free_path(path);
@@ -3961,7 +3583,7 @@
 
 		ret = btrfs_delayed_update_inode(trans, root, inode);
 		if (!ret)
-			btrfs_set_inode_last_trans(trans, inode);
+			btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
 		return ret;
 	}
 
@@ -3994,9 +3616,7 @@
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_path *path;
 	int ret = 0;
-	struct extent_buffer *leaf;
 	struct btrfs_dir_item *di;
-	struct btrfs_key key;
 	u64 index;
 	u64 ino = btrfs_ino(inode);
 	u64 dir_ino = btrfs_ino(dir);
@@ -4010,16 +3630,10 @@
 	path->leave_spinning = 1;
 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
				    name, name_len, -1);
-	if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
+	if (IS_ERR_OR_NULL(di)) {
+		ret = di ? PTR_ERR(di) : -ENOENT;
 		goto err;
 	}
-	if (!di) {
-		ret = -ENOENT;
-		goto err;
-	}
-	leaf = path->nodes[0];
-	btrfs_dir_item_key_to_cpu(leaf, di, &key);
 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
 	if (ret)
 		goto err;
@@ -4072,6 +3686,17 @@
 		ret = 0;
 	else if (ret)
 		btrfs_abort_transaction(trans, ret);
+
+	/*
+	 * If we have a pending delayed iput we could end up with the final iput
+	 * being run in btrfs-cleaner context. If we have enough of these built
+	 * up we can end up burning a lot of time in btrfs-cleaner without any
+	 * way to throttle the unlinks. Since we're currently holding a ref on
+	 * the inode we can run the delayed iput here without any issues as the
+	 * final iput won't be done until after we drop the ref we're currently
+	 * holding.
+	 */
+	btrfs_run_delayed_iput(fs_info, inode);
 err:
 	btrfs_free_path(path);
 	if (ret)
@@ -4120,7 +3745,7 @@
 	 * 1 for the inode ref
 	 * 1 for the inode
 	 */
-	return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
+	return btrfs_start_transaction_fallback_global_rsv(root, 5);
 }
 
 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -4187,10 +3812,7 @@
 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
				   name, name_len, -1);
 	if (IS_ERR_OR_NULL(di)) {
-		if (!di)
-			ret = -ENOENT;
-		else
-			ret = PTR_ERR(di);
+		ret = di ? PTR_ERR(di) : -ENOENT;
 		goto out;
 	}
 
@@ -4393,18 +4015,24 @@
 	 * again is not run concurrently.
 	 */
 	spin_lock(&dest->root_item_lock);
-	root_flags = btrfs_root_flags(&dest->root_item);
-	if (dest->send_in_progress == 0) {
-		btrfs_set_root_flags(&dest->root_item,
-				     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
-		spin_unlock(&dest->root_item_lock);
-	} else {
+	if (dest->send_in_progress) {
 		spin_unlock(&dest->root_item_lock);
 		btrfs_warn(fs_info,
 			   "attempt to delete subvolume %llu during send",
 			   dest->root_key.objectid);
 		return -EPERM;
 	}
+	if (atomic_read(&dest->nr_swapfiles)) {
+		spin_unlock(&dest->root_item_lock);
+		btrfs_warn(fs_info,
+			   "attempt to delete subvolume %llu with active swapfile",
+			   root->root_key.objectid);
+		return -EPERM;
+	}
+	root_flags = btrfs_root_flags(&dest->root_item);
+	btrfs_set_root_flags(&dest->root_item,
+			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+	spin_unlock(&dest->root_item_lock);
 
 	down_write(&fs_info->subvol_sem);
 
@@ -4487,7 +4115,7 @@
 		err = ret;
 	inode->i_flags |= S_DEAD;
 out_release:
-	btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+	btrfs_subvolume_release_metadata(root, &block_rsv);
 out_up_write:
 	up_write(&fs_info->subvol_sem);
 	if (err) {
@@ -4566,31 +4194,6 @@
 	return err;
 }
 
-static int truncate_space_check(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				u64 bytes_deleted)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	int ret;
-
-	/*
-	 * This is only used to apply pressure to the enospc system, we don't
-	 * intend to use this reservation at all.
-	 */
-	bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted);
-	bytes_deleted *= fs_info->nodesize;
-	ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
-				  bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
-	if (!ret) {
-		trace_btrfs_space_reservation(fs_info, "transaction",
-					      trans->transid,
-					      bytes_deleted, 1);
-		trans->bytes_reserved += bytes_deleted;
-	}
-	return ret;
-
-}
-
 /*
  * Return this if we need to call truncate_block for the last bit of the
  * truncate.
@@ -4635,16 +4238,18 @@
 	u64 bytes_deleted = 0;
 	bool be_nice = false;
 	bool should_throttle = false;
-	bool should_end = false;
+	const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
+	struct extent_state *cached_state = NULL;
 
 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
 	/*
-	 * for non-free space inodes and ref cows, we want to back off from
-	 * time to time
+	 * For non-free space inodes and non-shareable roots, we want to back
+	 * off from time to time. This means all inodes in subvolume roots,
+	 * reloc roots, and data reloc roots.
 	 */
 	if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
-	    test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+	    test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 		be_nice = true;
 
 	path = btrfs_alloc_path();
@@ -4652,21 +4257,24 @@
 		return -ENOMEM;
 	path->reada = READA_BACK;
 
-	/*
-	 * We want to drop from the next block forward in case this new size is
-	 * not block aligned since we will be keeping the last block of the
-	 * extent just the way it is.
-	 */
-	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
-	    root == fs_info->tree_root)
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+				 &cached_state);
+
+		/*
+		 * We want to drop from the next block forward in case this
+		 * new size is not block aligned since we will be keeping the
+		 * last block of the extent just the way it is.
+		 */
 		btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
					fs_info->sectorsize),
					(u64)-1, 0);
+	}
 
 	/*
 	 * This function is also used to drop the items in the log tree before
 	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
-	 * it is used to drop the loged items. So we shouldn't kill the delayed
+	 * it is used to drop the logged items. So we shouldn't kill the delayed
 	 * items.
 	 */
 	if (min_type == 0 && root == BTRFS_I(inode)->root)
@@ -4688,7 +4296,6 @@
 		goto out;
 	}
 
-	path->leave_spinning = 1;
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0)
 		goto out;
@@ -4704,6 +4311,8 @@
 	}
 
 	while (1) {
+		u64 clear_start = 0, clear_len = 0;
+
 		fi = NULL;
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -4754,6 +4363,8 @@
 
 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 			u64 num_dec;
+
+			clear_start = found_key.offset;
 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
 			if (!del_item) {
 				u64 orig_num_bytes =
@@ -4761,11 +4372,12 @@
 				extent_num_bytes = ALIGN(new_size -
							 found_key.offset,
							 fs_info->sectorsize);
+				clear_start = ALIGN(new_size, fs_info->sectorsize);
 				btrfs_set_file_extent_num_bytes(leaf, fi,
								extent_num_bytes);
 				num_dec = (orig_num_bytes -
					   extent_num_bytes);
-				if (test_bit(BTRFS_ROOT_REF_COWS,
+				if (test_bit(BTRFS_ROOT_SHAREABLE,
					     &root->state) &&
				    extent_start != 0)
					inode_sub_bytes(inode, num_dec);
@@ -4781,11 +4393,12 @@
 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
 				if (extent_start != 0) {
 					found_extent = 1;
-					if (test_bit(BTRFS_ROOT_REF_COWS,
+					if (test_bit(BTRFS_ROOT_SHAREABLE,
						     &root->state))
						inode_sub_bytes(inode, num_dec);
 				}
 			}
+			clear_len = num_dec;
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 
 			/*
@@ -4799,7 +4412,7 @@
 
 			btrfs_set_file_extent_ram_bytes(leaf, fi, size);
 			size = btrfs_file_extent_calc_inline_size(size);
-			btrfs_truncate_item(root->fs_info, path, size, 1);
+			btrfs_truncate_item(path, size, 1);
 		} else if (!del_item) {
 			/*
 			 * We have to bail so the last_size is set to
@@ -4807,12 +4420,33 @@
 			 */
 			ret = NEED_TRUNCATE_BLOCK;
 			break;
+		} else {
+			/*
+			 * Inline extents are special, we just treat
+			 * them as a full sector worth in the file
+			 * extent tree just for simplicity sake.
+			 */
+			clear_len = fs_info->sectorsize;
 		}
 
-		if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+		if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 			inode_sub_bytes(inode, item_end + 1 - new_size);
 	}
delete:
+	/*
+	 * We use btrfs_truncate_inode_items() to clean up log trees for
+	 * multiple fsyncs, and in this case we don't want to clear the
+	 * file extent range because it's just the log.
+	 */
+	if (root == BTRFS_I(inode)->root) {
+		ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+					clear_start, clear_len);
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			break;
+		}
+	}
+
 	if (del_item)
 		last_size = found_key.offset;
 	else
@@ -4836,29 +4470,23 @@
 		should_throttle = false;
 
 		if (found_extent &&
-		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
-		     root == fs_info->tree_root)) {
-			btrfs_set_path_blocking(path);
+		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+			struct btrfs_ref ref = { 0 };
+
 			bytes_deleted += extent_num_bytes;
-			ret = btrfs_free_extent(trans, root, extent_start,
-						extent_num_bytes, 0,
-						btrfs_header_owner(leaf),
-						ino, extent_offset);
+
+			btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
+					extent_start, extent_num_bytes, 0);
+			ref.real_root = root->root_key.objectid;
+			btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+					ino, extent_offset);
+			ret = btrfs_free_extent(trans, &ref);
 			if (ret) {
 				btrfs_abort_transaction(trans, ret);
 				break;
 			}
-			if (btrfs_should_throttle_delayed_refs(trans, fs_info))
-				btrfs_async_run_delayed_refs(fs_info,
-					trans->delayed_ref_updates * 2,
-					trans->transid, 0);
 			if (be_nice) {
-				if (truncate_space_check(trans, root,
-							 extent_num_bytes)) {
-					should_end = true;
-				}
-				if (btrfs_should_throttle_delayed_refs(trans,
-								       fs_info))
+				if (btrfs_should_throttle_delayed_refs(trans))
 					should_throttle = true;
 			}
 		}
@@ -4868,7 +4496,7 @@
 
 		if (path->slots[0] == 0 ||
 		    path->slots[0] != pending_del_slot ||
-		    should_throttle || should_end) {
+		    should_throttle) {
 			if (pending_del_nr) {
 				ret = btrfs_del_items(trans, root, path,
 						      pending_del_slot,
@@ -4880,23 +4508,24 @@
 				pending_del_nr = 0;
 			}
 			btrfs_release_path(path);
-			if (should_throttle) {
-				unsigned long updates = trans->delayed_ref_updates;
-				if (updates) {
-					trans->delayed_ref_updates = 0;
-					ret = btrfs_run_delayed_refs(trans,
-								     updates * 2);
-					if (ret)
-						break;
-				}
-			}
+
 			/*
-			 * if we failed to refill our space rsv, bail out
-			 * and let the transaction restart
+			 * We can generate a lot of delayed refs, so we need to
+			 * throttle every once and a while and make sure we're
+			 * adding enough space to keep up with the work we are
+			 * generating. Since we hold a transaction here we
+			 * can't flush, and we don't want to FLUSH_LIMIT because
+			 * we could have generated too many delayed refs to
+			 * actually allocate, so just bail if we're short and
+			 * let the normal reservation dance happen higher up.
 			 */
-			if (should_end) {
-				ret = -EAGAIN;
-				break;
+			if (should_throttle) {
+				ret = btrfs_delayed_refs_rsv_refill(fs_info,
+							BTRFS_RESERVE_NO_FLUSH);
+				if (ret) {
+					ret = -EAGAIN;
+					break;
+				}
 			}
 			goto search_again;
 		} else {
@@ -4918,22 +4547,12 @@
 		ASSERT(last_size >= new_size);
 		if (!ret && last_size > new_size)
 			last_size = new_size;
-		btrfs_ordered_update_i_size(inode, last_size, NULL);
+		btrfs_inode_safe_disk_i_size_write(inode, last_size);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
+				     (u64)-1, &cached_state);
 	}
 
 	btrfs_free_path(path);
-
-	if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) {
-		unsigned long updates = trans->delayed_ref_updates;
-		int err;
-
-		if (updates) {
-			trans->delayed_ref_updates = 0;
-			err = btrfs_run_delayed_refs(trans, updates * 2);
-			if (err)
-				ret = err;
-		}
-	}
 	return ret;
 }
 
@@ -4958,11 +4577,13 @@
 	struct extent_state *cached_state = NULL;
 	struct extent_changeset *data_reserved = NULL;
 	char *kaddr;
+	bool only_release_metadata = false;
 	u32 blocksize = fs_info->sectorsize;
 	pgoff_t index = from >> PAGE_SHIFT;
 	unsigned offset = from & (blocksize - 1);
 	struct page *page;
 	gfp_t mask = btrfs_alloc_write_mask(mapping);
+	size_t write_bytes = blocksize;
 	int ret = 0;
 	u64 block_start;
 	u64 block_end;
@@ -4974,15 +4595,28 @@
 	block_start = round_down(from, blocksize);
 	block_end = block_start + blocksize - 1;
 
-	ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
-					   block_start, blocksize);
-	if (ret)
+	ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved,
+					  block_start, blocksize);
+	if (ret < 0) {
+		if (btrfs_check_nocow_lock(BTRFS_I(inode), block_start,
+					   &write_bytes) > 0) {
+			/* For nocow case, no need to reserve data space */
+			only_release_metadata = true;
+		} else {
+			goto out;
+		}
+	}
+	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize);
+	if (ret < 0) {
+		if (!only_release_metadata)
+			btrfs_free_reserved_data_space(BTRFS_I(inode),
+					data_reserved, block_start, blocksize);
 		goto out;
-
+	}
again:
 	page = find_or_create_page(mapping, index, mask);
 	if (!page) {
-		btrfs_delalloc_release_space(inode, data_reserved,
+		btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
 					     block_start, blocksize, true);
 		btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
 		ret = -ENOMEM;
@@ -5007,24 +4641,23 @@
 	lock_extent_bits(io_tree, block_start, block_end, &cached_state);
 	set_page_extent_mapped(page);
 
-	ordered = btrfs_lookup_ordered_extent(inode, block_start);
+	ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), block_start);
 	if (ordered) {
 		unlock_extent_cached(io_tree, block_start, block_end,
 				     &cached_state);
 		unlock_page(page);
 		put_page(page);
-		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_start_ordered_extent(ordered, 1);
 		btrfs_put_ordered_extent(ordered);
 		goto again;
 	}
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
-			 EXTENT_DIRTY | EXTENT_DELALLOC |
-			 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-			 0, 0, &cached_state);
+			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+			 0, 0, &cached_state);
 
-	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
-					&cached_state, 0);
+	ret = btrfs_set_extent_delalloc(BTRFS_I(inode), block_start, block_end, 0,
+					&cached_state);
 	if (ret) {
 		unlock_extent_cached(io_tree, block_start, block_end,
 				     &cached_state);
@@ -5048,14 +4681,26 @@
 	set_page_dirty(page);
 	unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
 
+	if (only_release_metadata)
+		set_extent_bit(&BTRFS_I(inode)->io_tree, block_start,
+			       block_end, EXTENT_NORESERVE, NULL, NULL,
+			       GFP_NOFS);
+
out_unlock:
-	if (ret)
-		btrfs_delalloc_release_space(inode, data_reserved, block_start,
-					     blocksize, true);
+	if (ret) {
+		if (only_release_metadata)
+			btrfs_delalloc_release_metadata(BTRFS_I(inode),
+					blocksize, true);
+		else
+			btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
+					block_start, blocksize, true);
+	}
 	btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
 	unlock_page(page);
 	put_page(page);
out:
+	if (only_release_metadata)
+		btrfs_check_nocow_unlock(BTRFS_I(inode));
 	extent_changeset_free(data_reserved);
 	return ret;
 }
@@ -5137,25 +4782,12 @@
 	if (size <= hole_start)
 		return 0;
 
-	while (1) {
-		struct btrfs_ordered_extent *ordered;
-
-		lock_extent_bits(io_tree, hole_start, block_end - 1,
-				 &cached_state);
-		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start,
-						     block_end - hole_start);
-		if (!ordered)
-			break;
-		unlock_extent_cached(io_tree, hole_start, block_end - 1,
-				     &cached_state);
-		btrfs_start_ordered_extent(inode, ordered, 1);
-		btrfs_put_ordered_extent(ordered);
-	}
-
+	btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start,
+					   block_end - 1, &cached_state);
 	cur_offset = hole_start;
 	while (1) {
 		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
-				      block_end - cur_offset, 0);
+				      block_end - cur_offset);
 		if (IS_ERR(em)) {
 			err = PTR_ERR(em);
 			em = NULL;
@@ -5163,14 +4795,21 @@
 		}
 		last_byte = min(extent_map_end(em), block_end);
 		last_byte = ALIGN(last_byte, fs_info->sectorsize);
+		hole_size = last_byte - cur_offset;
+
 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
 			struct extent_map *hole_em;
-			hole_size = last_byte - cur_offset;
 
 			err = maybe_insert_hole(root, inode, cur_offset,
 						hole_size);
 			if (err)
 				break;
+
+			err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+					cur_offset, hole_size);
+			if (err)
+				break;
+
 			btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
 						cur_offset + hole_size - 1, 0);
 			hole_em = alloc_extent_map();
@@ -5187,7 +4826,6 @@
 			hole_em->block_len = 0;
 			hole_em->orig_block_len = 0;
 			hole_em->ram_bytes = hole_size;
-			hole_em->bdev = fs_info->fs_devices->latest_bdev;
 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
 			hole_em->generation = fs_info->generation;
@@ -5203,6 +4841,11 @@
 						hole_size - 1, 0);
 			}
 			free_extent_map(hole_em);
+		} else {
+			err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+					cur_offset, hole_size);
+			if (err)
+				break;
 		}
next:
 		free_extent_map(em);
@@ -5246,42 +4889,39 @@
 		 * truncation, it must capture all writes that happened before
 		 * this truncation.
 		 */
-		btrfs_wait_for_snapshot_creation(root);
+		btrfs_drew_write_lock(&root->snapshot_lock);
 		ret = btrfs_cont_expand(inode, oldsize, newsize);
 		if (ret) {
-			btrfs_end_write_no_snapshotting(root);
+			btrfs_drew_write_unlock(&root->snapshot_lock);
 			return ret;
 		}
 
 		trans = btrfs_start_transaction(root, 1);
 		if (IS_ERR(trans)) {
-			btrfs_end_write_no_snapshotting(root);
+			btrfs_drew_write_unlock(&root->snapshot_lock);
 			return PTR_ERR(trans);
 		}
 
 		i_size_write(inode, newsize);
-		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+		btrfs_inode_safe_disk_i_size_write(inode, 0);
 		pagecache_isize_extended(inode, oldsize, newsize);
 		ret = btrfs_update_inode(trans, root, inode);
-		btrfs_end_write_no_snapshotting(root);
+		btrfs_drew_write_unlock(&root->snapshot_lock);
 		btrfs_end_transaction(trans);
 	} else {
 
 		/*
 		 * We're truncating a file that used to have good data down to
-		 * zero. Make sure it gets into the ordered flush list so that
-		 * any new writes get down to disk quickly.
+		 * zero. Make sure any new writes to the file get on disk
+		 * on close.
 		 */
 		if (newsize == 0)
-			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
				&BTRFS_I(inode)->runtime_flags);
 
 		truncate_setsize(inode, newsize);
 
-		/* Disable nonlocked read DIO to avoid the end less truncate */
-		btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
 		inode_dio_wait(inode);
-		btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
 
 		ret = btrfs_truncate(inode, newsize == oldsize);
 		if (ret && inode->i_nlink) {
@@ -5356,10 +4996,10 @@
 	truncate_inode_pages_final(&inode->i_data);
 
 	write_lock(&map_tree->lock);
-	while (!RB_EMPTY_ROOT(&map_tree->map)) {
+	while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
 		struct extent_map *em;
 
-		node = rb_first(&map_tree->map);
+		node = rb_first_cached(&map_tree->map);
 		em = rb_entry(node, struct extent_map, rb_node);
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
@@ -5375,8 +5015,8 @@
 
 	/*
 	 * Keep looping until we have no more ranges in the io tree.
-	 * We can have ongoing bios started by readpages (called from readahead)
-	 * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+	 * We can have ongoing bios started by readahead that have
+	 * their endio callback (extent_io.c:end_bio_extent_readpage)
 	 * still in progress (unlocked the pages in the bio but did not yet
 	 * unlocked the ranges in the io tree). Therefore this means some
 	 * ranges can still be locked and eviction started because before
@@ -5415,12 +5055,13 @@
 		 * Note, end is the bytenr of last byte, so we need + 1 here.
 		 */
 		if (state_flags & EXTENT_DELALLOC)
-			btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
+			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
+					       end - start + 1);
 
 		clear_extent_bit(io_tree, start, end,
-				 EXTENT_LOCKED | EXTENT_DIRTY |
-				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
-				 EXTENT_DEFRAG, 1, 1, &cached_state);
+				 EXTENT_LOCKED | EXTENT_DELALLOC |
+				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+				 &cached_state);
 
 		cond_resched();
 		spin_lock(&io_tree->lock);
@@ -5429,43 +5070,54 @@
 }
 
 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
-							struct btrfs_block_rsv *rsv,
-							u64 min_size)
+							struct btrfs_block_rsv *rsv)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-	int failures = 0;
+	struct btrfs_trans_handle *trans;
+	u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
+	int ret;
 
-	for (;;) {
-		struct btrfs_trans_handle *trans;
-		int ret;
-
-		ret = btrfs_block_rsv_refill(root, rsv, min_size,
-					     BTRFS_RESERVE_FLUSH_LIMIT);
-
-		if (ret && ++failures > 2) {
-			btrfs_warn(fs_info,
-				   "could not allocate space for a delete; will truncate on mount");
-			return ERR_PTR(-ENOSPC);
-		}
-
-		trans = btrfs_join_transaction(root);
-		if (IS_ERR(trans) || !ret)
-			return trans;
-
+	/*
+	 * Eviction should be taking place at some place safe because of our
+	 * delayed iputs. However the normal flushing code will run delayed
+	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
+	 *
+	 * We reserve the delayed_refs_extra here again because we can't use
+	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
+	 * above. We reserve our extra bit here because we generate a ton of
+	 * delayed refs activity by truncating.
+	 *
+	 * If we cannot make our reservation we'll attempt to steal from the
+	 * global reserve, because we really want to be able to free up space.
+	 */
+	ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra,
+				     BTRFS_RESERVE_FLUSH_EVICT);
+	if (ret) {
 		/*
 		 * Try to steal from the global reserve if there is space for
 		 * it.
 		 */
-		if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
-		    !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0))
-			return trans;
-
-		/* If not, commit and try again. */
-		ret = btrfs_commit_transaction(trans);
-		if (ret)
-			return ERR_PTR(ret);
+		if (btrfs_check_space_for_delayed_refs(fs_info) ||
+		    btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) {
+			btrfs_warn(fs_info,
+				   "could not allocate space for delete; will truncate on mount");
+			return ERR_PTR(-ENOSPC);
+		}
+		delayed_refs_extra = 0;
 	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans))
+		return trans;
+
+	if (delayed_refs_extra) {
+		trans->block_rsv = &fs_info->trans_block_rsv;
+		trans->bytes_reserved = delayed_refs_extra;
+		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
+					delayed_refs_extra, 1);
+	}
+	return trans;
 }
 
 void btrfs_evict_inode(struct inode *inode)
@@ -5474,7 +5126,6 @@
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv;
-	u64 min_size;
 	int ret;
 
 	trace_btrfs_inode_evict(inode);
@@ -5483,8 +5134,6 @@
 		clear_inode(inode);
 		return;
 	}
-
-	min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
 
 	evict_inode_truncate_pages(inode);
 
@@ -5496,9 +5145,6 @@
 
 	if (is_bad_inode(inode))
 		goto no_delete;
-	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
-	if (!special_file(inode->i_mode))
-		btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
 	btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
 
@@ -5518,13 +5164,13 @@
 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv)
 		goto no_delete;
-	rsv->size = min_size;
+	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
 	rsv->failfast = 1;
 
 	btrfs_i_size_write(BTRFS_I(inode), 0);
 
 	while (1) {
-		trans = evict_refill_and_join(root, rsv, min_size);
+		trans = evict_refill_and_join(root, rsv);
 		if (IS_ERR(trans))
 			goto free_rsv;
 
@@ -5549,7 +5195,7 @@
 	 * If it turns out that we are dropping too many of these, we might want
 	 * to add a mechanism for retrying these after a commit.
 	 */
-	trans = evict_refill_and_join(root, rsv, min_size);
+	trans = evict_refill_and_join(root, rsv);
 	if (!IS_ERR(trans)) {
 		trans->block_rsv = rsv;
 		btrfs_orphan_del(trans, BTRFS_I(inode));
@@ -5596,12 +5242,8 @@
 
 	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
				   name, namelen, 0);
-	if (!di) {
-		ret = -ENOENT;
-		goto out;
-	}
-	if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
+	if (IS_ERR_OR_NULL(di)) {
+		ret = di ? PTR_ERR(di) : -ENOENT;
 		goto out;
 	}
 
@@ -5672,7 +5314,7 @@
 
 	btrfs_release_path(path);
 
-	new_root = btrfs_read_fs_root_no_name(fs_info, location);
+	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
 	if (IS_ERR(new_root)) {
 		err = PTR_ERR(new_root);
 		goto out;
@@ -5724,15 +5366,15 @@
 	spin_unlock(&root->inode_lock);
 }
 
-static void inode_tree_del(struct inode *inode)
+static void inode_tree_del(struct btrfs_inode *inode)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_root *root = inode->root;
 	int empty = 0;
 
 	spin_lock(&root->inode_lock);
-	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
-		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
-		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+	if (!RB_EMPTY_NODE(&inode->rb_node)) {
+		rb_erase(&inode->rb_node, &root->inode_tree);
+		RB_CLEAR_NODE(&inode->rb_node);
 		empty = RB_EMPTY_ROOT(&root->inode_tree);
 	}
 	spin_unlock(&root->inode_lock);
@@ -5750,29 +5392,32 @@
 static int btrfs_init_locked_inode(struct inode *inode, void *p)
 {
 	struct btrfs_iget_args *args = p;
-	inode->i_ino = args->location->objectid;
-	memcpy(&BTRFS_I(inode)->location, args->location,
-	       sizeof(*args->location));
-	BTRFS_I(inode)->root = args->root;
+
+	inode->i_ino = args->ino;
+	BTRFS_I(inode)->location.objectid = args->ino;
+	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+	BTRFS_I(inode)->location.offset = 0;
+	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
+	BUG_ON(args->root && !BTRFS_I(inode)->root);
 	return 0;
 }
 
 static int btrfs_find_actor(struct inode *inode, void *opaque)
 {
 	struct btrfs_iget_args *args = opaque;
-	return args->location->objectid == BTRFS_I(inode)->location.objectid &&
+
+	return args->ino == BTRFS_I(inode)->location.objectid &&
 		args->root == BTRFS_I(inode)->root;
 }
 
-static struct inode *btrfs_iget_locked(struct super_block *s,
-				       struct btrfs_key *location,
+static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
				       struct btrfs_root *root)
 {
 	struct inode *inode;
 	struct btrfs_iget_args args;
-	unsigned long hashval = btrfs_inode_hash(location->objectid, root);
+	unsigned long hashval = btrfs_inode_hash(ino, root);
 
-	args.location = location;
+	args.ino = ino;
 	args.root = root;
 
 	inode = iget5_locked(s, hashval, btrfs_find_actor,
@@ -5781,16 +5426,18 @@
 		return inode;
 }
 
-/* Get an inode object given its location and corresponding root.
- * Returns in *is_new if the inode was read from disk
+/*
+ * Get an inode object given its inode number and corresponding root.
+ * Path can be preallocated to prevent recursing back to iget through
+ * allocator. NULL is also valid but may require an additional allocation
+ * later.
  */
-struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
-			      struct btrfs_root *root, int *new,
-			      struct btrfs_path *path)
+struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
+			      struct btrfs_root *root, struct btrfs_path *path)
 {
 	struct inode *inode;
 
-	inode = btrfs_iget_locked(s, location, root);
+	inode = btrfs_iget_locked(s, ino, root);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
@@ -5801,8 +5448,6 @@
 	if (!ret) {
 		inode_tree_add(inode);
 		unlock_new_inode(inode);
-		if (new)
-			*new = 1;
 	} else {
 		iget_failed(inode);
 		/*
@@ -5819,10 +5464,9 @@
 	return inode;
 }
 
-struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root, int *new)
+struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
 {
-	return btrfs_iget_path(s, location, root, new, NULL);
+	return btrfs_iget_path(s, ino, root, NULL);
 }
 
 static struct inode *new_simple_dir(struct super_block *s,
@@ -5834,12 +5478,16 @@
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
-	BTRFS_I(inode)->root = root;
+	BTRFS_I(inode)->root = btrfs_grab_root(root);
 	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
 	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
 
 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
-	inode->i_op = &btrfs_dir_ro_inode_operations;
+	/*
+	 * We only need lookup, the rest is read-only and there's no inode
+	 * associated with the dentry
+	 */
+	inode->i_op = &simple_dir_inode_operations;
 	inode->i_opflags &= ~IOP_XATTR;
 	inode->i_fop = &simple_dir_operations;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
@@ -5853,7 +5501,20 @@
 
 static inline u8 btrfs_inode_type(struct inode *inode)
 {
-	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
+	/*
+	 * Compile-time asserts that generic FT_* types still match
+	 * BTRFS_FT_* types
+	 */
+	BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
+	BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
+	BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
+	BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
+	BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
+	BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
+	BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
+	BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
+
+	return fs_umode_to_ftype(inode->i_mode);
 }
 
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
58595520 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
....@@ -5864,7 +5525,6 @@
58645525 struct btrfs_root *sub_root = root;
58655526 struct btrfs_key location;
58665527 u8 di_type = 0;
5867
- int index;
58685528 int ret = 0;
58695529
58705530 if (dentry->d_name.len > BTRFS_NAME_LEN)
....@@ -5875,7 +5535,7 @@
58755535 return ERR_PTR(ret);
58765536
58775537 if (location.type == BTRFS_INODE_ITEM_KEY) {
5878
- inode = btrfs_iget(dir->i_sb, &location, root, NULL);
5538
+ inode = btrfs_iget(dir->i_sb, location.objectid, root);
58795539 if (IS_ERR(inode))
58805540 return inode;
58815541
....@@ -5891,7 +5551,6 @@
58915551 return inode;
58925552 }
58935553
5894
- index = srcu_read_lock(&fs_info->subvol_srcu);
58955554 ret = fixup_tree_root_location(fs_info, dir, dentry,
58965555 &location, &sub_root);
58975556 if (ret < 0) {
....@@ -5900,9 +5559,10 @@
59005559 else
59015560 inode = new_simple_dir(dir->i_sb, &location, sub_root);
59025561 } else {
5903
- inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
5562
+ inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
59045563 }
5905
- srcu_read_unlock(&fs_info->subvol_srcu, index);
5564
+ if (root != sub_root)
5565
+ btrfs_put_root(sub_root);
59065566
59075567 if (!IS_ERR(inode) && root != sub_root) {
59085568 down_read(&fs_info->cleanup_work_sem);
....@@ -5940,22 +5600,12 @@
59405600 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
59415601 unsigned int flags)
59425602 {
5943
- struct inode *inode;
5603
+ struct inode *inode = btrfs_lookup_dentry(dir, dentry);
59445604
5945
- inode = btrfs_lookup_dentry(dir, dentry);
5946
- if (IS_ERR(inode)) {
5947
- if (PTR_ERR(inode) == -ENOENT)
5948
- inode = NULL;
5949
- else
5950
- return ERR_CAST(inode);
5951
- }
5952
-
5605
+ if (inode == ERR_PTR(-ENOENT))
5606
+ inode = NULL;
59535607 return d_splice_alias(inode, dentry);
59545608 }
5955
-
5956
-unsigned char btrfs_filetype_table[] = {
5957
- DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
5958
-};
59595609
59605610 /*
59615611 * All this infrastructure exists because dir_emit can fault, and we are holding
@@ -6095,7 +5745,7 @@
 		name_ptr = (char *)(entry + 1);
 		read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
				   name_len);
-		put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)],
+		put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
			      &entry->type);
 		btrfs_dir_item_key_to_cpu(leaf, di, &location);
 		put_unaligned(location.objectid, &entry->ino);
@@ -6167,7 +5817,7 @@
 		return PTR_ERR(trans);
 
 	ret = btrfs_update_inode(trans, root, inode);
-	if (ret && ret == -ENOSPC) {
+	if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
 		/* whoops, lets try again with the full transaction */
 		btrfs_end_transaction(trans);
 		trans = btrfs_start_transaction(root, 1);
@@ -6290,7 +5940,8 @@
 static int btrfs_insert_inode_locked(struct inode *inode)
 {
 	struct btrfs_iget_args args;
-	args.location = &BTRFS_I(inode)->location;
+
+	args.ino = BTRFS_I(inode)->location.objectid;
 	args.root = BTRFS_I(inode)->root;
 
 	return insert_inode_locked4(inode,
@@ -6346,13 +5997,16 @@
 	u32 sizes[2];
 	int nitems = name ? 2 : 1;
 	unsigned long ptr;
+	unsigned int nofs_flag;
 	int ret;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return ERR_PTR(-ENOMEM);
 
+	nofs_flag = memalloc_nofs_save();
 	inode = new_inode(fs_info->sb);
+	memalloc_nofs_restore(nofs_flag);
 	if (!inode) {
 		btrfs_free_path(path);
 		return ERR_PTR(-ENOMEM);
@@ -6390,7 +6044,7 @@
 	 */
 	BTRFS_I(inode)->index_cnt = 2;
 	BTRFS_I(inode)->dir_index = *index;
-	BTRFS_I(inode)->root = root;
+	BTRFS_I(inode)->root = btrfs_grab_root(root);
 	BTRFS_I(inode)->generation = trans->transid;
 	inode->i_generation = BTRFS_I(inode)->generation;
@@ -6477,7 +6131,7 @@
 	inode_tree_add(inode);
 
 	trace_btrfs_inode_new(inode);
-	btrfs_set_inode_last_trans(trans, inode);
+	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
 
 	btrfs_update_root_times(trans, root);
 
@@ -6535,8 +6189,7 @@
 	if (ret)
 		return ret;
 
-	ret = btrfs_insert_dir_item(trans, root, name, name_len,
-				    parent_inode, &key,
+	ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
				    btrfs_inode_type(&inode->vfs_inode), index);
 	if (ret == -EEXIST || ret == -EOVERFLOW)
 		goto fail_dir_item;
@@ -6620,7 +6273,7 @@
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	err = btrfs_find_free_ino(root, &objectid);
+	err = btrfs_find_free_objectid(root, &objectid);
 	if (err)
 		goto out_unlock;
@@ -6684,7 +6337,7 @@
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	err = btrfs_find_free_ino(root, &objectid);
+	err = btrfs_find_free_objectid(root, &objectid);
 	if (err)
 		goto out_unlock;
@@ -6719,7 +6372,6 @@
 	if (err)
 		goto out_unlock;
 
-	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	d_instantiate_new(dentry, inode);
 
 out_unlock:
@@ -6744,7 +6396,7 @@
 	int drop_inode = 0;
 
 	/* do not allow sys_link's with other subvols of the same device */
-	if (root->objectid != BTRFS_I(inode)->root->objectid)
+	if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
 		return -EXDEV;
 
 	if (inode->i_nlink >= BTRFS_LINK_MAX)
@@ -6782,7 +6434,6 @@
 		drop_inode = 1;
 	} else {
 		struct dentry *parent = dentry->d_parent;
-		int ret;
 
 		err = btrfs_update_inode(trans, root, inode);
 		if (err)
@@ -6796,14 +6447,8 @@
 			if (err)
 				goto fail;
 		}
-		BTRFS_I(inode)->last_link_trans = trans->transid;
 		d_instantiate(dentry, inode);
-		ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
-					 true, NULL);
-		if (ret == BTRFS_NEED_TRANS_COMMIT) {
-			err = btrfs_commit_transaction(trans);
-			trans = NULL;
-		}
+		btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
 	}
 
 fail:
@@ -6824,7 +6469,6 @@
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int err = 0;
-	int drop_on_err = 0;
 	u64 objectid = 0;
 	u64 index = 0;
@@ -6837,7 +6481,7 @@
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	err = btrfs_find_free_ino(root, &objectid);
+	err = btrfs_find_free_objectid(root, &objectid);
 	if (err)
 		goto out_fail;
@@ -6850,7 +6494,6 @@
 		goto out_fail;
 	}
 
-	drop_on_err = 1;
 	/* these must be set before we unlock the inode */
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
@@ -6871,7 +6514,6 @@
 		goto out_fail;
 
 	d_instantiate_new(dentry, inode);
-	drop_on_err = 0;
 
 out_fail:
 	btrfs_end_transaction(trans);
@@ -6929,26 +6571,34 @@
 	return ret;
 }
 
-/*
- * a bit scary, this does extent mapping from logical file offset to the disk.
- * the ugly parts come from merging extents from the disk with the in-ram
- * representation. This gets more complex because of the data=ordered code,
- * where the in-ram extents might be locked pending data=ordered completion.
+/**
+ * btrfs_get_extent - Lookup the first extent overlapping a range in a file.
+ * @inode:	file to search in
+ * @page:	page to read extent data into if the extent is inline
+ * @pg_offset:	offset into @page to copy to
+ * @start:	file offset
+ * @len:	length of range starting at @start
  *
- * This also copies inline extents directly into the page.
+ * This returns the first &struct extent_map which overlaps with the given
+ * range, reading it from the B-tree and caching it if necessary. Note that
+ * there may be more extents which overlap the given range after the returned
+ * extent_map.
+ *
+ * If @page is not NULL and the extent is inline, this also reads the extent
+ * data directly into the page and marks the extent up to date in the io_tree.
+ *
+ * Return: ERR_PTR on error, non-NULL extent_map on success.
 */
 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
-				    struct page *page,
-				    size_t pg_offset, u64 start, u64 len,
-				    int create)
+				    struct page *page, size_t pg_offset,
+				    u64 start, u64 len)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	int ret;
-	int err = 0;
+	int ret = 0;
 	u64 extent_start = 0;
 	u64 extent_end = 0;
 	u64 objectid = btrfs_ino(inode);
-	u32 found_type;
+	int extent_type = -1;
 	struct btrfs_path *path = NULL;
 	struct btrfs_root *root = inode->root;
 	struct btrfs_file_extent_item *item;
@@ -6957,12 +6607,9 @@
 	struct extent_map *em = NULL;
 	struct extent_map_tree *em_tree = &inode->extent_tree;
 	struct extent_io_tree *io_tree = &inode->io_tree;
-	const bool new_inline = !page || create;
 
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
-	if (em)
-		em->bdev = fs_info->fs_devices->latest_bdev;
 	read_unlock(&em_tree->lock);
 
 	if (em) {
@@ -6975,48 +6622,47 @@
 	}
 	em = alloc_extent_map();
 	if (!em) {
-		err = -ENOMEM;
+		ret = -ENOMEM;
 		goto out;
 	}
-	em->bdev = fs_info->fs_devices->latest_bdev;
 	em->start = EXTENT_MAP_HOLE;
 	em->orig_start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
 	em->block_len = (u64)-1;
 
+	path = btrfs_alloc_path();
 	if (!path) {
-		path = btrfs_alloc_path();
-		if (!path) {
-			err = -ENOMEM;
-			goto out;
-		}
-		/*
-		 * Chances are we'll be called again, so go ahead and do
-		 * readahead
-		 */
-		path->reada = READA_FORWARD;
-	}
-
-	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
-	if (ret < 0) {
-		err = ret;
+		ret = -ENOMEM;
 		goto out;
 	}
 
-	if (ret != 0) {
+	/* Chances are we'll be called again, so go ahead and do readahead */
+	path->reada = READA_FORWARD;
+
+	/*
+	 * Unless we're going to uncompress the inline extent, no sleep would
+	 * happen.
+	 */
+	path->leave_spinning = 1;
+
+	path->recurse = btrfs_is_free_space_inode(inode);
+
+	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
+	if (ret < 0) {
+		goto out;
+	} else if (ret > 0) {
 		if (path->slots[0] == 0)
 			goto not_found;
 		path->slots[0]--;
+		ret = 0;
 	}
 
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0],
			      struct btrfs_file_extent_item);
-	/* are we inside the extent that was found? */
 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-	found_type = found_key.type;
 	if (found_key.objectid != objectid ||
-	    found_type != BTRFS_EXTENT_DATA_KEY) {
+	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
 		/*
 		 * If we backup past the first extent we want to move forward
 		 * and see if there is an extent in front of us, otherwise we'll
@@ -7027,30 +6673,22 @@
 		goto next;
 	}
 
-	found_type = btrfs_file_extent_type(leaf, item);
+	extent_type = btrfs_file_extent_type(leaf, item);
 	extent_start = found_key.offset;
-	if (found_type == BTRFS_FILE_EXTENT_REG ||
-	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+	extent_end = btrfs_file_extent_end(path);
+	if (extent_type == BTRFS_FILE_EXTENT_REG ||
+	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		/* Only regular file could have regular/prealloc extent */
 		if (!S_ISREG(inode->vfs_inode.i_mode)) {
-			err = -EUCLEAN;
+			ret = -EUCLEAN;
 			btrfs_crit(fs_info,
 		"regular/prealloc extent found for non-regular inode %llu",
				   btrfs_ino(inode));
 			goto out;
 		}
-		extent_end = extent_start +
-		       btrfs_file_extent_num_bytes(leaf, item);
-
 		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
						       extent_start);
-	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-		size_t size;
-
-		size = btrfs_file_extent_ram_bytes(leaf, item);
-		extent_end = ALIGN(extent_start + size,
-				   fs_info->sectorsize);
-
+	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
						      path->slots[0],
						      extent_start);
@@ -7060,12 +6698,11 @@
 		path->slots[0]++;
 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 			ret = btrfs_next_leaf(root, path);
-			if (ret < 0) {
-				err = ret;
+			if (ret < 0)
 				goto out;
-			}
-			if (ret > 0)
+			else if (ret > 0)
 				goto not_found;
+
 			leaf = path->nodes[0];
 		}
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -7076,26 +6713,28 @@
 			goto not_found;
 		if (start > found_key.offset)
 			goto next;
+
+		/* New extent overlaps with existing one */
 		em->start = start;
 		em->orig_start = start;
 		em->len = found_key.offset - start;
-		goto not_found_em;
+		em->block_start = EXTENT_MAP_HOLE;
+		goto insert;
 	}
 
-	btrfs_extent_item_to_extent_map(inode, path, item,
-			new_inline, em);
+	btrfs_extent_item_to_extent_map(inode, path, item, !page, em);
 
-	if (found_type == BTRFS_FILE_EXTENT_REG ||
-	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+	if (extent_type == BTRFS_FILE_EXTENT_REG ||
+	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		goto insert;
-	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 		unsigned long ptr;
 		char *map;
 		size_t size;
 		size_t extent_offset;
 		size_t copy_size;
 
-		if (new_inline)
+		if (!page)
 			goto out;
 
 		size = btrfs_file_extent_ram_bytes(leaf, item);
@@ -7107,15 +6746,15 @@
 		em->orig_block_len = em->len;
 		em->orig_start = em->start;
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+
+		btrfs_set_path_blocking(path);
 		if (!PageUptodate(page)) {
 			if (btrfs_file_extent_compression(leaf, item) !=
			    BTRFS_COMPRESS_NONE) {
 				ret = uncompress_inline(path, page, pg_offset,
							extent_offset, item);
-				if (ret) {
-					err = ret;
+				if (ret)
 					goto out;
-				}
 			} else {
 				map = kmap(page);
 				read_extent_buffer(leaf, map + pg_offset, ptr,
....@@ -7137,49 +6776,45 @@
71376776 em->start = start;
71386777 em->orig_start = start;
71396778 em->len = len;
7140
-not_found_em:
71416779 em->block_start = EXTENT_MAP_HOLE;
71426780 insert:
6781
+ ret = 0;
71436782 btrfs_release_path(path);
71446783 if (em->start > start || extent_map_end(em) <= start) {
71456784 btrfs_err(fs_info,
71466785 "bad extent! em: [%llu %llu] passed [%llu %llu]",
71476786 em->start, em->len, start, len);
7148
- err = -EIO;
6787
+ ret = -EIO;
71496788 goto out;
71506789 }
71516790
7152
- err = 0;
71536791 write_lock(&em_tree->lock);
7154
- err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
6792
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
71556793 write_unlock(&em_tree->lock);
71566794 out:
6795
+ btrfs_free_path(path);
71576796
71586797 trace_btrfs_get_extent(root, inode, em);
71596798
7160
- btrfs_free_path(path);
7161
- if (err) {
6799
+ if (ret) {
71626800 free_extent_map(em);
7163
- return ERR_PTR(err);
6801
+ return ERR_PTR(ret);
71646802 }
7165
- BUG_ON(!em); /* Error is always set */
71666803 return em;
71676804 }
71686805
71696806 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
7170
- struct page *page,
7171
- size_t pg_offset, u64 start, u64 len,
7172
- int create)
6807
+ u64 start, u64 len)
71736808 {
71746809 struct extent_map *em;
71756810 struct extent_map *hole_em = NULL;
7176
- u64 range_start = start;
6811
+ u64 delalloc_start = start;
71776812 u64 end;
7178
- u64 found;
7179
- u64 found_end;
6813
+ u64 delalloc_len;
6814
+ u64 delalloc_end;
71806815 int err = 0;
71816816
7182
- em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
6817
+ em = btrfs_get_extent(inode, NULL, 0, start, len);
71836818 if (IS_ERR(em))
71846819 return em;
71856820 /*
....@@ -7204,80 +6839,83 @@
72046839 em = NULL;
72056840
72066841 /* ok, we didn't find anything, lets look for delalloc */
7207
- found = count_range_bits(&inode->io_tree, &range_start,
6842
+ delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
72086843 end, len, EXTENT_DELALLOC, 1);
7209
- found_end = range_start + found;
7210
- if (found_end < range_start)
7211
- found_end = (u64)-1;
6844
+ delalloc_end = delalloc_start + delalloc_len;
6845
+ if (delalloc_end < delalloc_start)
6846
+ delalloc_end = (u64)-1;
72126847
72136848 /*
7214
- * we didn't find anything useful, return
7215
- * the original results from get_extent()
6849
+ * We didn't find anything useful, return the original results from
6850
+ * get_extent()
72166851 */
7217
- if (range_start > end || found_end <= start) {
6852
+ if (delalloc_start > end || delalloc_end <= start) {
72186853 em = hole_em;
72196854 hole_em = NULL;
72206855 goto out;
72216856 }
72226857
7223
- /* adjust the range_start to make sure it doesn't
7224
- * go backwards from the start they passed in
6858
+ /*
6859
+ * Adjust the delalloc_start to make sure it doesn't go backwards from
6860
+ * the start they passed in
72256861 */
7226
- range_start = max(start, range_start);
7227
- found = found_end - range_start;
6862
+ delalloc_start = max(start, delalloc_start);
6863
+ delalloc_len = delalloc_end - delalloc_start;
72286864
7229
- if (found > 0) {
7230
- u64 hole_start = start;
7231
- u64 hole_len = len;
6865
+ if (delalloc_len > 0) {
6866
+ u64 hole_start;
6867
+ u64 hole_len;
6868
+ const u64 hole_end = extent_map_end(hole_em);
72326869
72336870 em = alloc_extent_map();
72346871 if (!em) {
72356872 err = -ENOMEM;
72366873 goto out;
72376874 }
7238
- /*
7239
- * when btrfs_get_extent can't find anything it
7240
- * returns one huge hole
7241
- *
7242
- * make sure what it found really fits our range, and
7243
- * adjust to make sure it is based on the start from
7244
- * the caller
7245
- */
7246
- if (hole_em) {
7247
- u64 calc_end = extent_map_end(hole_em);
72486875
7249
- if (calc_end <= start || (hole_em->start > end)) {
7250
- free_extent_map(hole_em);
7251
- hole_em = NULL;
7252
- } else {
7253
- hole_start = max(hole_em->start, start);
7254
- hole_len = calc_end - hole_start;
7255
- }
6876
+ ASSERT(hole_em);
6877
+ /*
6878
+ * When btrfs_get_extent can't find anything it returns one
6879
+ * huge hole
6880
+ *
6881
+ * Make sure what it found really fits our range, and adjust to
6882
+ * make sure it is based on the start from the caller
6883
+ */
6884
+ if (hole_end <= start || hole_em->start > end) {
6885
+ free_extent_map(hole_em);
6886
+ hole_em = NULL;
6887
+ } else {
6888
+ hole_start = max(hole_em->start, start);
6889
+ hole_len = hole_end - hole_start;
72566890 }
7257
- em->bdev = NULL;
7258
- if (hole_em && range_start > hole_start) {
7259
- /* our hole starts before our delalloc, so we
7260
- * have to return just the parts of the hole
7261
- * that go until the delalloc starts
6891
+
6892
+ if (hole_em && delalloc_start > hole_start) {
6893
+ /*
6894
+ * Our hole starts before our delalloc, so we have to
6895
+ * return just the parts of the hole that go until the
6896
+ * delalloc starts
72626897 */
7263
- em->len = min(hole_len,
7264
- range_start - hole_start);
6898
+ em->len = min(hole_len, delalloc_start - hole_start);
72656899 em->start = hole_start;
72666900 em->orig_start = hole_start;
72676901 /*
7268
- * don't adjust block start at all,
7269
- * it is fixed at EXTENT_MAP_HOLE
6902
+ * Don't adjust block start at all, it is fixed at
6903
+ * EXTENT_MAP_HOLE
72706904 */
72716905 em->block_start = hole_em->block_start;
72726906 em->block_len = hole_len;
72736907 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
72746908 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
72756909 } else {
7276
- em->start = range_start;
7277
- em->len = found;
7278
- em->orig_start = range_start;
6910
+ /*
6911
+ * Hole is out of passed range or it starts after
6912
+ * delalloc range
6913
+ */
6914
+ em->start = delalloc_start;
6915
+ em->len = delalloc_len;
6916
+ em->orig_start = delalloc_start;
72796917 em->block_start = EXTENT_MAP_DELALLOC;
7280
- em->block_len = found;
6918
+ em->block_len = delalloc_len;
72816919 }
72826920 } else {
72836921 return hole_em;
....@@ -7292,7 +6930,7 @@
72926930 return em;
72936931 }
72946932
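/*
 * Editor's worked example (illustrative, not part of the patch; numbers
 * assumed): say btrfs_get_extent() reported one big hole for [0, 1MiB)
 * and the io_tree has delalloc bytes in [256KiB, 512KiB).  For a query
 * at start == 0 the code above computes
 *
 *   delalloc_start = 256KiB, delalloc_len = 256KiB, hole_start = 0
 *   em->len = min(hole_len, delalloc_start - hole_start) = 256KiB
 *
 * i.e. the returned hole is trimmed to stop where the delalloc begins;
 * a follow-up query at 256KiB would then report the delalloc range with
 * block_start == EXTENT_MAP_DELALLOC.
 */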
7295
-static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
6933
+static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
72966934 const u64 start,
72976935 const u64 len,
72986936 const u64 orig_start,
....@@ -7306,21 +6944,19 @@
73066944 int ret;
73076945
73086946 if (type != BTRFS_ORDERED_NOCOW) {
7309
- em = create_io_em(inode, start, len, orig_start,
7310
- block_start, block_len, orig_block_len,
7311
- ram_bytes,
6947
+ em = create_io_em(inode, start, len, orig_start, block_start,
6948
+ block_len, orig_block_len, ram_bytes,
73126949 BTRFS_COMPRESS_NONE, /* compress_type */
73136950 type);
73146951 if (IS_ERR(em))
73156952 goto out;
73166953 }
7317
- ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
7318
- len, block_len, type);
6954
+ ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len,
6955
+ block_len, type);
73196956 if (ret) {
73206957 if (em) {
73216958 free_extent_map(em);
7322
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
7323
- start + len - 1, 0);
6959
+ btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
73246960 }
73256961 em = ERR_PTR(ret);
73266962 }
....@@ -7329,11 +6965,11 @@
73296965 return em;
73306966 }
73316967
7332
-static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
6968
+static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
73336969 u64 start, u64 len)
73346970 {
7335
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7336
- struct btrfs_root *root = BTRFS_I(inode)->root;
6971
+ struct btrfs_root *root = inode->root;
6972
+ struct btrfs_fs_info *fs_info = root->fs_info;
73376973 struct extent_map *em;
73386974 struct btrfs_key ins;
73396975 u64 alloc_hint;
....@@ -7350,19 +6986,38 @@
73506986 ins.offset, BTRFS_ORDERED_REGULAR);
73516987 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
73526988 if (IS_ERR(em))
7353
- btrfs_free_reserved_extent(fs_info, ins.objectid,
7354
- ins.offset, 1);
6989
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
6990
+ 1);
73556991
73566992 return em;
73576993 }
73586994
73596995 /*
7360
- * returns 1 when the nocow is safe, < 1 on error, 0 if the
7361
- * block must be cow'd
6996
+ * Check if we can do a nocow write into the range [@offset, @offset + @len)
6997
+ *
6998
+ * @offset: File offset
6999
+ * @len: The length to write, will be updated to the nocow writeable
7000
+ * range
7001
+ * @orig_start: (optional) Return the original file offset of the file extent
7002
+ * @orig_len: (optional) Return the original on-disk length of the file extent
7003
+ * @ram_bytes: (optional) Return the ram_bytes of the file extent
7004
+ * @strict: if true, omit optimizations that might force us into unnecessary
7005
+ * cow. E.g., don't trust the generation number.
7006
+ *
7007
+ * This function will flush ordered extents in the range to ensure proper
7008
+ * nocow checks for the (nowait == false) case.
7009
+ *
7010
+ * Return:
7011
+ * >0 and update @len if we can do a nocow write
7012
+ * 0 if we can't do a nocow write
7013
+ * <0 if error happened
7014
+ *
7015
+ * NOTE: This only checks the file extents; the caller is responsible for
7016
+ * waiting for any ordered extents.
73627017 */
73637018 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
73647019 u64 *orig_start, u64 *orig_block_len,
7365
- u64 *ram_bytes)
7020
+ u64 *ram_bytes, bool strict)
73667021 {
73677022 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
73687023 struct btrfs_path *path;
....@@ -7440,8 +7095,9 @@
74407095 * Do the same check as in btrfs_cross_ref_exist but without the
74417096 * unnecessary search.
74427097 */
7443
- if (btrfs_file_extent_generation(leaf, fi) <=
7444
- btrfs_root_last_snapshot(&root->root_item))
7098
+ if (!strict &&
7099
+ (btrfs_file_extent_generation(leaf, fi) <=
7100
+ btrfs_root_last_snapshot(&root->root_item)))
74457101 goto out;
74467102
74477103 backref_offset = btrfs_file_extent_offset(leaf, fi);
....@@ -7477,7 +7133,8 @@
74777133 */
74787134
74797135 ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
7480
- key.offset - backref_offset, disk_bytenr);
7136
+ key.offset - backref_offset, disk_bytenr,
7137
+ strict);
74817138 if (ret) {
74827139 ret = 0;
74837140 goto out;
....@@ -7505,7 +7162,7 @@
75057162 }
75067163
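/*
 * Editor's sketch of the calling convention documented above -- a
 * hypothetical caller, not code from this patch.  It assumes, per the
 * "(optional)" annotations, that the out-parameters may be NULL.
 */
#if 0	/* illustrative only */
	u64 len = write_len;	/* may be trimmed to the nocow-able range */
	int ret;

	/* strict == false: generation-number based optimizations allowed */
	ret = can_nocow_extent(inode, offset, &len, NULL, NULL, NULL, false);
	if (ret < 0)
		return ret;	/* lookup error */
	else if (ret == 0)
		return 0;	/* caller must fall back to COW */
	/* ret > 0: a nocow write of [offset, offset + len) is safe */
#endif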
75077164 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7508
- struct extent_state **cached_state, int writing)
7165
+ struct extent_state **cached_state, bool writing)
75097166 {
75107167 struct btrfs_ordered_extent *ordered;
75117168 int ret = 0;
....@@ -7554,7 +7211,7 @@
75547211 */
75557212 if (writing ||
75567213 test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7557
- btrfs_start_ordered_extent(inode, ordered, 1);
7214
+ btrfs_start_ordered_extent(ordered, 1);
75587215 else
75597216 ret = -ENOTBLK;
75607217 btrfs_put_ordered_extent(ordered);
....@@ -7564,11 +7221,11 @@
75647221 * for it to complete) and then invalidate the pages for
75657222 * this range (through invalidate_inode_pages2_range()),
75667223 * but that can lead us to a deadlock with a concurrent
7567
- * call to readpages() (a buffered read or a defrag call
7224
+ * call to readahead (a buffered read or a defrag call
75687225 * triggered a readahead) on a page lock due to an
75697226 * ordered dio extent we created before but did not have
75707227 * yet a corresponding bio submitted (whence it can not
7571
- * complete), which makes readpages() wait for that
7228
+ * complete), which makes readahead wait for that
75727229 * ordered extent to complete while holding a lock on
75737230 * that page.
75747231 */
....@@ -7585,15 +7242,14 @@
75857242 }
75867243
75877244 /* The callers of this must take lock_extent() */
7588
-static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
7589
- u64 orig_start, u64 block_start,
7245
+static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
7246
+ u64 len, u64 orig_start, u64 block_start,
75907247 u64 block_len, u64 orig_block_len,
75917248 u64 ram_bytes, int compress_type,
75927249 int type)
75937250 {
75947251 struct extent_map_tree *em_tree;
75957252 struct extent_map *em;
7596
- struct btrfs_root *root = BTRFS_I(inode)->root;
75977253 int ret;
75987254
75997255 ASSERT(type == BTRFS_ORDERED_PREALLOC ||
....@@ -7601,7 +7257,7 @@
76017257 type == BTRFS_ORDERED_NOCOW ||
76027258 type == BTRFS_ORDERED_REGULAR);
76037259
7604
- em_tree = &BTRFS_I(inode)->extent_tree;
7260
+ em_tree = &inode->extent_tree;
76057261 em = alloc_extent_map();
76067262 if (!em)
76077263 return ERR_PTR(-ENOMEM);
....@@ -7611,7 +7267,6 @@
76117267 em->len = len;
76127268 em->block_len = block_len;
76137269 em->block_start = block_start;
7614
- em->bdev = root->fs_info->fs_devices->latest_bdev;
76157270 em->orig_block_len = orig_block_len;
76167271 em->ram_bytes = ram_bytes;
76177272 em->generation = -1;
....@@ -7624,8 +7279,8 @@
76247279 }
76257280
76267281 do {
7627
- btrfs_drop_extent_cache(BTRFS_I(inode), em->start,
7628
- em->start + em->len - 1, 0);
7282
+ btrfs_drop_extent_cache(inode, em->start,
7283
+ em->start + em->len - 1, 0);
76297284 write_lock(&em_tree->lock);
76307285 ret = add_extent_mapping(em_tree, em, 1);
76317286 write_unlock(&em_tree->lock);
....@@ -7645,28 +7300,7 @@
76457300 }
76467301
76477302
7648
-static int btrfs_get_blocks_direct_read(struct extent_map *em,
7649
- struct buffer_head *bh_result,
7650
- struct inode *inode,
7651
- u64 start, u64 len)
7652
-{
7653
- if (em->block_start == EXTENT_MAP_HOLE ||
7654
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7655
- return -ENOENT;
7656
-
7657
- len = min(len, em->len - (start - em->start));
7658
-
7659
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7660
- inode->i_blkbits;
7661
- bh_result->b_size = len;
7662
- bh_result->b_bdev = em->bdev;
7663
- set_buffer_mapped(bh_result);
7664
-
7665
- return 0;
7666
-}
7667
-
76687303 static int btrfs_get_blocks_direct_write(struct extent_map **map,
7669
- struct buffer_head *bh_result,
76707304 struct inode *inode,
76717305 struct btrfs_dio_data *dio_data,
76727306 u64 start, u64 len)
....@@ -7698,11 +7332,11 @@
76987332 block_start = em->block_start + (start - em->start);
76997333
77007334 if (can_nocow_extent(inode, start, &len, &orig_start,
7701
- &orig_block_len, &ram_bytes) == 1 &&
7335
+ &orig_block_len, &ram_bytes, false) == 1 &&
77027336 btrfs_inc_nocow_writers(fs_info, block_start)) {
77037337 struct extent_map *em2;
77047338
7705
- em2 = btrfs_create_dio_extent(inode, start, len,
7339
+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
77067340 orig_start, block_start,
77077341 len, orig_block_len,
77087342 ram_bytes, type);
....@@ -7721,16 +7355,14 @@
77217355 * use the existing or preallocated extent, so does not
77227356 * need to adjust btrfs_space_info's bytes_may_use.
77237357 */
7724
- btrfs_free_reserved_data_space_noquota(inode, start,
7725
- len);
7358
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
77267359 goto skip_cow;
77277360 }
77287361 }
77297362
77307363 /* this will cow the extent */
7731
- len = bh_result->b_size;
77327364 free_extent_map(em);
7733
- *map = em = btrfs_new_extent_direct(inode, start, len);
7365
+ *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
77347366 if (IS_ERR(em)) {
77357367 ret = PTR_ERR(em);
77367368 goto out;
....@@ -7739,72 +7371,93 @@
77397371 len = min(len, em->len - (start - em->start));
77407372
77417373 skip_cow:
7742
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7743
- inode->i_blkbits;
7744
- bh_result->b_size = len;
7745
- bh_result->b_bdev = em->bdev;
7746
- set_buffer_mapped(bh_result);
7747
-
7748
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7749
- set_buffer_new(bh_result);
7750
-
77517374 /*
77527375 * Need to update the i_size under the extent lock so buffered
77537376 * readers will get the updated i_size when we unlock.
77547377 */
7755
- if (!dio_data->overwrite && start + len > i_size_read(inode))
7378
+ if (start + len > i_size_read(inode))
77567379 i_size_write(inode, start + len);
77577380
7758
- WARN_ON(dio_data->reserve < len);
77597381 dio_data->reserve -= len;
7760
- dio_data->unsubmitted_oe_range_end = start + len;
7761
- current->journal_info = dio_data;
77627382 out:
77637383 return ret;
77647384 }
77657385
7766
-static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7767
- struct buffer_head *bh_result, int create)
7386
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
7387
+ loff_t length, unsigned int flags, struct iomap *iomap,
7388
+ struct iomap *srcmap)
77687389 {
77697390 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
77707391 struct extent_map *em;
77717392 struct extent_state *cached_state = NULL;
77727393 struct btrfs_dio_data *dio_data = NULL;
7773
- u64 start = iblock << inode->i_blkbits;
77747394 u64 lockstart, lockend;
7775
- u64 len = bh_result->b_size;
7776
- int unlock_bits = EXTENT_LOCKED;
7395
+ const bool write = !!(flags & IOMAP_WRITE);
77777396 int ret = 0;
7397
+ u64 len = length;
7398
+ bool unlock_extents = false;
7399
+ bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB);
77787400
7779
- if (create)
7780
- unlock_bits |= EXTENT_DIRTY;
7781
- else
7401
+ /*
7402
+ * We use current->journal_info here to see if we are sync, but there
7403
+ * are many tests in the enospc machinery that skip flushing if a
7404
+ * journal_info is set, so we need to clear this out and re-set
7405
+ * it in iomap_end.
7406
+ */
7407
+ ASSERT(current->journal_info == NULL ||
7408
+ current->journal_info == BTRFS_DIO_SYNC_STUB);
7409
+ current->journal_info = NULL;
7410
+
7411
+ if (!write)
77827412 len = min_t(u64, len, fs_info->sectorsize);
77837413
77847414 lockstart = start;
77857415 lockend = start + len - 1;
77867416
7787
- if (current->journal_info) {
7788
- /*
7789
- * Need to pull our outstanding extents and set journal_info to NULL so
7790
- * that anything that needs to check if there's a transaction doesn't get
7791
- * confused.
7792
- */
7793
- dio_data = current->journal_info;
7794
- current->journal_info = NULL;
7417
+ /*
7418
+ * The generic stuff only does filemap_write_and_wait_range, which
7419
+ * isn't enough if we've written compressed pages to this area, so we
7420
+ * need to flush the dirty pages again to make absolutely sure that any
7421
+ * outstanding dirty pages are on disk.
7422
+ */
7423
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7424
+ &BTRFS_I(inode)->runtime_flags)) {
7425
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
7426
+ start + length - 1);
7427
+ if (ret)
7428
+ return ret;
77957429 }
7430
+
7431
+ dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
7432
+ if (!dio_data)
7433
+ return -ENOMEM;
7434
+
7435
+ dio_data->sync = sync;
7436
+ dio_data->length = length;
7437
+ if (write) {
7438
+ dio_data->reserve = round_up(length, fs_info->sectorsize);
7439
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
7440
+ &dio_data->data_reserved,
7441
+ start, dio_data->reserve);
7442
+ if (ret) {
7443
+ extent_changeset_free(dio_data->data_reserved);
7444
+ kfree(dio_data);
7445
+ return ret;
7446
+ }
7447
+ }
7448
+ iomap->private = dio_data;
7449
+
77967450
77977451 /*
77987452 * If this errors out it's because we couldn't invalidate pagecache for
77997453 * this range and we need to fallback to buffered.
78007454 */
7801
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
7802
- create)) {
7455
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
78037456 ret = -ENOTBLK;
78047457 goto err;
78057458 }
78067459
7807
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
7460
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
78087461 if (IS_ERR(em)) {
78097462 ret = PTR_ERR(em);
78107463 goto unlock_err;
....@@ -7827,443 +7480,253 @@
78277480 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
78287481 em->block_start == EXTENT_MAP_INLINE) {
78297482 free_extent_map(em);
7830
- ret = -ENOTBLK;
7483
+ /*
7484
+ * If we are in a NOWAIT context, return -EAGAIN in order to
7485
+ * fall back to buffered IO. This is not only because we can
7486
+ * block with buffered IO (no support for NOWAIT semantics at
7487
+ * the moment) but also to avoid returning short reads to user
7488
+ * space - this happens if we were able to read some data from
7489
+ * previous non-compressed extents and then when we fall back to
7490
+ * buffered IO, at btrfs_file_read_iter() by calling
7491
+ * filemap_read(), we fail to fault in pages for the read buffer,
7492
+ * in which case filemap_read() returns a short read (the number
7493
+ * of bytes previously read is > 0, so it does not return -EFAULT).
7494
+ */
7495
+ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
78317496 goto unlock_err;
78327497 }
78337498
7834
- if (create) {
7835
- ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
7836
- dio_data, start, len);
7499
+ len = min(len, em->len - (start - em->start));
7500
+ if (write) {
7501
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
7502
+ start, len);
78377503 if (ret < 0)
78387504 goto unlock_err;
7839
-
7840
- /* clear and unlock the entire range */
7841
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7842
- unlock_bits, 1, 0, &cached_state);
7505
+ unlock_extents = true;
7506
+ /* Recalc len in case the new em is smaller than requested */
7507
+ len = min(len, em->len - (start - em->start));
78437508 } else {
7844
- ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
7845
- start, len);
7846
- /* Can be negative only if we read from a hole */
7847
- if (ret < 0) {
7848
- ret = 0;
7849
- free_extent_map(em);
7850
- goto unlock_err;
7851
- }
78527509 /*
78537510 * We need to unlock only the end area that we aren't using.
78547511 * The rest is going to be unlocked by the endio routine.
78557512 */
7856
- lockstart = start + bh_result->b_size;
7857
- if (lockstart < lockend) {
7858
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7859
- lockend, unlock_bits, 1, 0,
7860
- &cached_state);
7861
- } else {
7862
- free_extent_state(cached_state);
7863
- }
7513
+ lockstart = start + len;
7514
+ if (lockstart < lockend)
7515
+ unlock_extents = true;
78647516 }
7517
+
7518
+ if (unlock_extents)
7519
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
7520
+ lockstart, lockend, &cached_state);
7521
+ else
7522
+ free_extent_state(cached_state);
7523
+
7524
+ /*
7525
+ * Translate extent map information to iomap.
7526
+ * We trim the extents (and move the addr) even though the iomap code does
7527
+ * that, since we have locked only the parts we are performing I/O in.
7528
+ */
7529
+ if ((em->block_start == EXTENT_MAP_HOLE) ||
7530
+ (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
7531
+ iomap->addr = IOMAP_NULL_ADDR;
7532
+ iomap->type = IOMAP_HOLE;
7533
+ } else {
7534
+ iomap->addr = em->block_start + (start - em->start);
7535
+ iomap->type = IOMAP_MAPPED;
7536
+ }
7537
+ iomap->offset = start;
7538
+ iomap->bdev = fs_info->fs_devices->latest_bdev;
7539
+ iomap->length = len;
78657540
78667541 free_extent_map(em);
78677542
78687543 return 0;
78697544
78707545 unlock_err:
7871
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7872
- unlock_bits, 1, 0, &cached_state);
7546
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7547
+ &cached_state);
78737548 err:
7874
- if (dio_data)
7875
- current->journal_info = dio_data;
7549
+ if (dio_data) {
7550
+ btrfs_delalloc_release_space(BTRFS_I(inode),
7551
+ dio_data->data_reserved, start,
7552
+ dio_data->reserve, true);
7553
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
7554
+ extent_changeset_free(dio_data->data_reserved);
7555
+ kfree(dio_data);
7556
+ }
78767557 return ret;
78777558 }
78787559
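/*
 * Editor's worked example for the extent_map -> iomap translation above
 * (numbers assumed): an extent_map with em->start == 0, em->len == 128KiB
 * and em->block_start == 1MiB, queried at start == 64KiB, yields
 *
 *   iomap->addr   = em->block_start + (start - em->start) = 1MiB + 64KiB
 *   iomap->type   = IOMAP_MAPPED
 *   iomap->length = len, already trimmed to em->len - (start - em->start)
 *
 * whereas a hole, or a prealloc extent being read, reports
 * IOMAP_NULL_ADDR/IOMAP_HOLE so the iomap layer zero-fills instead of
 * issuing a bio.
 */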
7879
-static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
7880
- struct bio *bio,
7881
- int mirror_num)
7560
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
7561
+ ssize_t written, unsigned int flags, struct iomap *iomap)
78827562 {
7563
+ int ret = 0;
7564
+ struct btrfs_dio_data *dio_data = iomap->private;
7565
+ size_t submitted = dio_data->submitted;
7566
+ const bool write = !!(flags & IOMAP_WRITE);
7567
+
7568
+ if (!write && (iomap->type == IOMAP_HOLE)) {
7569
+ /* If reading from a hole, unlock and return */
7570
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
7571
+ goto out;
7572
+ }
7573
+
7574
+ if (submitted < length) {
7575
+ pos += submitted;
7576
+ length -= submitted;
7577
+ if (write)
7578
+ __endio_write_update_ordered(BTRFS_I(inode), pos,
7579
+ length, false);
7580
+ else
7581
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos,
7582
+ pos + length - 1);
7583
+ ret = -ENOTBLK;
7584
+ }
7585
+
7586
+ if (write) {
7587
+ if (dio_data->reserve)
7588
+ btrfs_delalloc_release_space(BTRFS_I(inode),
7589
+ dio_data->data_reserved, pos,
7590
+ dio_data->reserve, true);
7591
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
7592
+ extent_changeset_free(dio_data->data_reserved);
7593
+ }
7594
+out:
7595
+ /*
7596
+ * We're all done; we can now safely re-set current->journal_info
7597
+ * for our endio.
7598
+ */
7599
+ if (dio_data->sync) {
7600
+ ASSERT(current->journal_info == NULL);
7601
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
7602
+ }
7603
+ kfree(dio_data);
7604
+ iomap->private = NULL;
7605
+
7606
+ return ret;
7607
+}
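/*
 * Editor's note on the (submitted < length) branch above, with assumed
 * numbers: if a 1MiB write dio only got bios submitted for its first
 * 512KiB before failing, pos/length are advanced past the submitted part
 * and the remaining 512KiB of ordered extent is finished as failed
 * (uptodate == false); the -ENOTBLK return is turned into 0 by
 * btrfs_direct_IO() below so the caller can redo that tail as buffered
 * IO.
 */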
7608
+
7609
+static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
7610
+{
7611
+ /*
7612
+ * This implies a barrier so that stores to dio_bio->bi_status before
7613
+ * this and loads of dio_bio->bi_status after this are fully ordered.
7614
+ */
7615
+ if (!refcount_dec_and_test(&dip->refs))
7616
+ return;
7617
+
7618
+ if (bio_op(dip->dio_bio) == REQ_OP_WRITE) {
7619
+ __endio_write_update_ordered(BTRFS_I(dip->inode),
7620
+ dip->logical_offset,
7621
+ dip->bytes,
7622
+ !dip->dio_bio->bi_status);
7623
+ } else {
7624
+ unlock_extent(&BTRFS_I(dip->inode)->io_tree,
7625
+ dip->logical_offset,
7626
+ dip->logical_offset + dip->bytes - 1);
7627
+ }
7628
+
7629
+ bio_endio(dip->dio_bio);
7630
+ kfree(dip);
7631
+}
7632
+
7633
+static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
7634
+ int mirror_num,
7635
+ unsigned long bio_flags)
7636
+{
7637
+ struct btrfs_dio_private *dip = bio->bi_private;
78837638 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
78847639 blk_status_t ret;
78857640
78867641 BUG_ON(bio_op(bio) == REQ_OP_WRITE);
78877642
7888
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
7643
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
78897644 if (ret)
78907645 return ret;
78917646
7892
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
7893
-
7647
+ refcount_inc(&dip->refs);
7648
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
7649
+ if (ret)
7650
+ refcount_dec(&dip->refs);
78947651 return ret;
78957652 }
78967653
7897
-static int btrfs_check_dio_repairable(struct inode *inode,
7898
- struct bio *failed_bio,
7899
- struct io_failure_record *failrec,
7900
- int failed_mirror)
7654
+static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
7655
+ struct btrfs_io_bio *io_bio,
7656
+ const bool uptodate)
79017657 {
7902
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7903
- int num_copies;
7904
-
7905
- num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
7906
- if (num_copies == 1) {
7907
- /*
7908
- * we only have a single copy of the data, so don't bother with
7909
- * all the retry and error correction code that follows. no
7910
- * matter what the error is, it is very likely to persist.
7911
- */
7912
- btrfs_debug(fs_info,
7913
- "Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
7914
- num_copies, failrec->this_mirror, failed_mirror);
7915
- return 0;
7916
- }
7917
-
7918
- failrec->failed_mirror = failed_mirror;
7919
- failrec->this_mirror++;
7920
- if (failrec->this_mirror == failed_mirror)
7921
- failrec->this_mirror++;
7922
-
7923
- if (failrec->this_mirror > num_copies) {
7924
- btrfs_debug(fs_info,
7925
- "Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
7926
- num_copies, failrec->this_mirror, failed_mirror);
7927
- return 0;
7928
- }
7929
-
7930
- return 1;
7931
-}
7932
-
7933
-static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
7934
- struct page *page, unsigned int pgoff,
7935
- u64 start, u64 end, int failed_mirror,
7936
- bio_end_io_t *repair_endio, void *repair_arg)
7937
-{
7938
- struct io_failure_record *failrec;
7939
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7658
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
7659
+ const u32 sectorsize = fs_info->sectorsize;
79407660 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
7941
- struct bio *bio;
7942
- int isector;
7943
- unsigned int read_mode = 0;
7944
- int segs;
7945
- int ret;
7946
- blk_status_t status;
7947
- struct bio_vec bvec;
7948
-
7949
- BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
7950
-
7951
- ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
7952
- if (ret)
7953
- return errno_to_blk_status(ret);
7954
-
7955
- ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
7956
- failed_mirror);
7957
- if (!ret) {
7958
- free_io_failure(failure_tree, io_tree, failrec);
7959
- return BLK_STS_IOERR;
7960
- }
7961
-
7962
- segs = bio_segments(failed_bio);
7963
- bio_get_first_bvec(failed_bio, &bvec);
7964
- if (segs > 1 ||
7965
- (bvec.bv_len > btrfs_inode_sectorsize(inode)))
7966
- read_mode |= REQ_FAILFAST_DEV;
7967
-
7968
- isector = start - btrfs_io_bio(failed_bio)->logical;
7969
- isector >>= inode->i_sb->s_blocksize_bits;
7970
- bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
7971
- pgoff, isector, repair_endio, repair_arg);
7972
- bio->bi_opf = REQ_OP_READ | read_mode;
7973
-
7974
- btrfs_debug(BTRFS_I(inode)->root->fs_info,
7975
- "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d",
7976
- read_mode, failrec->this_mirror, failrec->in_validation);
7977
-
7978
- status = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
7979
- if (status) {
7980
- free_io_failure(failure_tree, io_tree, failrec);
7981
- bio_put(bio);
7982
- }
7983
-
7984
- return status;
7985
-}
7986
-
7987
-struct btrfs_retry_complete {
7988
- struct completion done;
7989
- struct inode *inode;
7990
- u64 start;
7991
- int uptodate;
7992
-};
7993
-
7994
-static void btrfs_retry_endio_nocsum(struct bio *bio)
7995
-{
7996
- struct btrfs_retry_complete *done = bio->bi_private;
7997
- struct inode *inode = done->inode;
7998
- struct bio_vec *bvec;
7999
- struct extent_io_tree *io_tree, *failure_tree;
8000
- int i;
8001
-
8002
- if (bio->bi_status)
8003
- goto end;
8004
-
8005
- ASSERT(bio->bi_vcnt == 1);
8006
- io_tree = &BTRFS_I(inode)->io_tree;
8007
- failure_tree = &BTRFS_I(inode)->io_failure_tree;
8008
- ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode));
8009
-
8010
- done->uptodate = 1;
8011
- ASSERT(!bio_flagged(bio, BIO_CLONED));
8012
- bio_for_each_segment_all(bvec, bio, i)
8013
- clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
8014
- io_tree, done->start, bvec->bv_page,
8015
- btrfs_ino(BTRFS_I(inode)), 0);
8016
-end:
8017
- complete(&done->done);
8018
- bio_put(bio);
8019
-}
8020
-
8021
-static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode,
8022
- struct btrfs_io_bio *io_bio)
8023
-{
8024
- struct btrfs_fs_info *fs_info;
7661
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7662
+ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
80257663 struct bio_vec bvec;
80267664 struct bvec_iter iter;
8027
- struct btrfs_retry_complete done;
8028
- u64 start;
8029
- unsigned int pgoff;
8030
- u32 sectorsize;
8031
- int nr_sectors;
8032
- blk_status_t ret;
7665
+ u64 start = io_bio->logical;
7666
+ int icsum = 0;
80337667 blk_status_t err = BLK_STS_OK;
80347668
8035
- fs_info = BTRFS_I(inode)->root->fs_info;
8036
- sectorsize = fs_info->sectorsize;
7669
+ __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
7670
+ unsigned int i, nr_sectors, pgoff;
80377671
8038
- start = io_bio->logical;
8039
- done.inode = inode;
8040
- io_bio->bio.bi_iter = io_bio->iter;
8041
-
8042
- bio_for_each_segment(bvec, &io_bio->bio, iter) {
80437672 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
80447673 pgoff = bvec.bv_offset;
8045
-
8046
-next_block_or_try_again:
8047
- done.uptodate = 0;
8048
- done.start = start;
8049
- init_completion(&done.done);
8050
-
8051
- ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
8052
- pgoff, start, start + sectorsize - 1,
8053
- io_bio->mirror_num,
8054
- btrfs_retry_endio_nocsum, &done);
8055
- if (ret) {
8056
- err = ret;
8057
- goto next;
8058
- }
8059
-
8060
- wait_for_completion_io(&done.done);
8061
-
8062
- if (!done.uptodate) {
8063
- /* We might have another mirror, so try again */
8064
- goto next_block_or_try_again;
8065
- }
8066
-
8067
-next:
8068
- start += sectorsize;
8069
-
8070
- nr_sectors--;
8071
- if (nr_sectors) {
8072
- pgoff += sectorsize;
7674
+ for (i = 0; i < nr_sectors; i++) {
80737675 ASSERT(pgoff < PAGE_SIZE);
8074
- goto next_block_or_try_again;
7676
+ if (uptodate &&
7677
+ (!csum || !check_data_csum(inode, io_bio, icsum,
7678
+ bvec.bv_page, pgoff,
7679
+ start, sectorsize))) {
7680
+ clean_io_failure(fs_info, failure_tree, io_tree,
7681
+ start, bvec.bv_page,
7682
+ btrfs_ino(BTRFS_I(inode)),
7683
+ pgoff);
7684
+ } else {
7685
+ blk_status_t status;
7686
+
7687
+ status = btrfs_submit_read_repair(inode,
7688
+ &io_bio->bio,
7689
+ start - io_bio->logical,
7690
+ bvec.bv_page, pgoff,
7691
+ start,
7692
+ start + sectorsize - 1,
7693
+ io_bio->mirror_num,
7694
+ submit_dio_repair_bio);
7695
+ if (status)
7696
+ err = status;
7697
+ }
7698
+ start += sectorsize;
7699
+ icsum++;
7700
+ pgoff += sectorsize;
80757701 }
80767702 }
8077
-
80787703 return err;
80797704 }
80807705
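/*
 * Editor's worked example for the per-sector loop above (assuming a 4KiB
 * sectorsize): a 16KiB bvec is walked as nr_sectors == 4.  Each sector
 * that is uptodate and passes its checksum (or needs none) clears any
 * stale failure record via clean_io_failure(); each bad sector is handed
 * individually to btrfs_submit_read_repair(), so only the failed 4KiB
 * blocks are re-read from another mirror.
 */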
8081
-static void btrfs_retry_endio(struct bio *bio)
8082
-{
8083
- struct btrfs_retry_complete *done = bio->bi_private;
8084
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8085
- struct extent_io_tree *io_tree, *failure_tree;
8086
- struct inode *inode = done->inode;
8087
- struct bio_vec *bvec;
8088
- int uptodate;
8089
- int ret;
8090
- int i;
8091
-
8092
- if (bio->bi_status)
8093
- goto end;
8094
-
8095
- uptodate = 1;
8096
-
8097
- ASSERT(bio->bi_vcnt == 1);
8098
- ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode));
8099
-
8100
- io_tree = &BTRFS_I(inode)->io_tree;
8101
- failure_tree = &BTRFS_I(inode)->io_failure_tree;
8102
-
8103
- ASSERT(!bio_flagged(bio, BIO_CLONED));
8104
- bio_for_each_segment_all(bvec, bio, i) {
8105
- ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
8106
- bvec->bv_offset, done->start,
8107
- bvec->bv_len);
8108
- if (!ret)
8109
- clean_io_failure(BTRFS_I(inode)->root->fs_info,
8110
- failure_tree, io_tree, done->start,
8111
- bvec->bv_page,
8112
- btrfs_ino(BTRFS_I(inode)),
8113
- bvec->bv_offset);
8114
- else
8115
- uptodate = 0;
8116
- }
8117
-
8118
- done->uptodate = uptodate;
8119
-end:
8120
- complete(&done->done);
8121
- bio_put(bio);
8122
-}
8123
-
8124
-static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
8125
- struct btrfs_io_bio *io_bio, blk_status_t err)
8126
-{
8127
- struct btrfs_fs_info *fs_info;
8128
- struct bio_vec bvec;
8129
- struct bvec_iter iter;
8130
- struct btrfs_retry_complete done;
8131
- u64 start;
8132
- u64 offset = 0;
8133
- u32 sectorsize;
8134
- int nr_sectors;
8135
- unsigned int pgoff;
8136
- int csum_pos;
8137
- bool uptodate = (err == 0);
8138
- int ret;
8139
- blk_status_t status;
8140
-
8141
- fs_info = BTRFS_I(inode)->root->fs_info;
8142
- sectorsize = fs_info->sectorsize;
8143
-
8144
- err = BLK_STS_OK;
8145
- start = io_bio->logical;
8146
- done.inode = inode;
8147
- io_bio->bio.bi_iter = io_bio->iter;
8148
-
8149
- bio_for_each_segment(bvec, &io_bio->bio, iter) {
8150
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
8151
-
8152
- pgoff = bvec.bv_offset;
8153
-next_block:
8154
- if (uptodate) {
8155
- csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
8156
- ret = __readpage_endio_check(inode, io_bio, csum_pos,
8157
- bvec.bv_page, pgoff, start, sectorsize);
8158
- if (likely(!ret))
8159
- goto next;
8160
- }
8161
-try_again:
8162
- done.uptodate = 0;
8163
- done.start = start;
8164
- init_completion(&done.done);
8165
-
8166
- status = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
8167
- pgoff, start, start + sectorsize - 1,
8168
- io_bio->mirror_num, btrfs_retry_endio,
8169
- &done);
8170
- if (status) {
8171
- err = status;
8172
- goto next;
8173
- }
8174
-
8175
- wait_for_completion_io(&done.done);
8176
-
8177
- if (!done.uptodate) {
8178
- /* We might have another mirror, so try again */
8179
- goto try_again;
8180
- }
8181
-next:
8182
- offset += sectorsize;
8183
- start += sectorsize;
8184
-
8185
- ASSERT(nr_sectors);
8186
-
8187
- nr_sectors--;
8188
- if (nr_sectors) {
8189
- pgoff += sectorsize;
8190
- ASSERT(pgoff < PAGE_SIZE);
8191
- goto next_block;
8192
- }
8193
- }
8194
-
8195
- return err;
8196
-}
8197
-
8198
-static blk_status_t btrfs_subio_endio_read(struct inode *inode,
8199
- struct btrfs_io_bio *io_bio, blk_status_t err)
8200
-{
8201
- bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
8202
-
8203
- if (skip_csum) {
8204
- if (unlikely(err))
8205
- return __btrfs_correct_data_nocsum(inode, io_bio);
8206
- else
8207
- return BLK_STS_OK;
8208
- } else {
8209
- return __btrfs_subio_endio_read(inode, io_bio, err);
8210
- }
8211
-}
8212
-
8213
-static void btrfs_endio_direct_read(struct bio *bio)
8214
-{
8215
- struct btrfs_dio_private *dip = bio->bi_private;
8216
- struct inode *inode = dip->inode;
8217
- struct bio *dio_bio;
8218
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8219
- blk_status_t err = bio->bi_status;
8220
-
8221
- if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
8222
- err = btrfs_subio_endio_read(inode, io_bio, err);
8223
-
8224
- unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
8225
- dip->logical_offset + dip->bytes - 1);
8226
- dio_bio = dip->dio_bio;
8227
-
8228
- kfree(dip);
8229
-
8230
- dio_bio->bi_status = err;
8231
- dio_end_io(dio_bio);
8232
-
8233
- if (io_bio->end_io)
8234
- io_bio->end_io(io_bio, blk_status_to_errno(err));
8235
- bio_put(bio);
8236
-}
8237
-
8238
-static void __endio_write_update_ordered(struct inode *inode,
7706
+static void __endio_write_update_ordered(struct btrfs_inode *inode,
82397707 const u64 offset, const u64 bytes,
82407708 const bool uptodate)
82417709 {
8242
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7710
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
82437711 struct btrfs_ordered_extent *ordered = NULL;
82447712 struct btrfs_workqueue *wq;
8245
- btrfs_work_func_t func;
82467713 u64 ordered_offset = offset;
82477714 u64 ordered_bytes = bytes;
82487715 u64 last_offset;
82497716
8250
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
7717
+ if (btrfs_is_free_space_inode(inode))
82517718 wq = fs_info->endio_freespace_worker;
8252
- func = btrfs_freespace_write_helper;
8253
- } else {
7719
+ else
82547720 wq = fs_info->endio_write_workers;
8255
- func = btrfs_endio_write_helper;
8256
- }
82577721
82587722 while (ordered_offset < offset + bytes) {
82597723 last_offset = ordered_offset;
82607724 if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
8261
- &ordered_offset,
8262
- ordered_bytes,
8263
- uptodate)) {
8264
- btrfs_init_work(&ordered->work, func,
8265
- finish_ordered_fn,
8266
- NULL, NULL);
7725
+ &ordered_offset,
7726
+ ordered_bytes,
7727
+ uptodate)) {
7728
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
7729
+ NULL);
82677730 btrfs_queue_work(wq, &ordered->work);
82687731 }
82697732 /*
....@@ -8274,7 +7737,7 @@
82747737 return;
82757738 /*
82767739 * Our bio might span multiple ordered extents. In this case
8277
- * we keep goin until we have accounted the whole dio.
7740
+ * we keep going until we have accounted for the whole dio.
82787741 */
82797742 if (ordered_offset < offset + bytes) {
82807743 ordered_bytes = offset + bytes - ordered_offset;
....@@ -8283,29 +7746,12 @@
82837746 }
82847747 }
82857748
8286
-static void btrfs_endio_direct_write(struct bio *bio)
8287
-{
8288
- struct btrfs_dio_private *dip = bio->bi_private;
8289
- struct bio *dio_bio = dip->dio_bio;
8290
-
8291
- __endio_write_update_ordered(dip->inode, dip->logical_offset,
8292
- dip->bytes, !bio->bi_status);
8293
-
8294
- kfree(dip);
8295
-
8296
- dio_bio->bi_status = bio->bi_status;
8297
- dio_end_io(dio_bio);
8298
- bio_put(bio);
8299
-}
8300
-
83017749 static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
83027750 struct bio *bio, u64 offset)
83037751 {
83047752 struct inode *inode = private_data;
8305
- blk_status_t ret;
8306
- ret = btrfs_csum_one_bio(inode, bio, offset, 1);
8307
- BUG_ON(ret); /* -ENOMEM */
8308
- return 0;
7753
+
7754
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
83097755 }
83107756
83117757 static void btrfs_end_dio_bio(struct bio *bio)
....@@ -8321,62 +7767,16 @@
83217767 (unsigned long long)bio->bi_iter.bi_sector,
83227768 bio->bi_iter.bi_size, err);
83237769
8324
- if (dip->subio_endio)
8325
- err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
8326
-
8327
- if (err) {
8328
- /*
8329
- * We want to perceive the errors flag being set before
8330
- * decrementing the reference count. We don't need a barrier
8331
- * since atomic operations with a return value are fully
8332
- * ordered as per atomic_t.txt
8333
- */
8334
- dip->errors = 1;
7770
+ if (bio_op(bio) == REQ_OP_READ) {
7771
+ err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio),
7772
+ !err);
83357773 }
83367774
8337
- /* if there are more bios still pending for this dio, just exit */
8338
- if (!atomic_dec_and_test(&dip->pending_bios))
8339
- goto out;
7775
+ if (err)
7776
+ dip->dio_bio->bi_status = err;
83407777
8341
- if (dip->errors) {
8342
- bio_io_error(dip->orig_bio);
8343
- } else {
8344
- dip->dio_bio->bi_status = BLK_STS_OK;
8345
- bio_endio(dip->orig_bio);
8346
- }
8347
-out:
83487778 bio_put(bio);
8349
-}
8350
-
8351
-static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
8352
- struct btrfs_dio_private *dip,
8353
- struct bio *bio,
8354
- u64 file_offset)
8355
-{
8356
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8357
- struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
8358
- blk_status_t ret;
8359
-
8360
- /*
8361
- * We load all the csum data we need when we submit
8362
- * the first bio to reduce the csum tree search and
8363
- * contention.
8364
- */
8365
- if (dip->logical_offset == file_offset) {
8366
- ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio,
8367
- file_offset);
8368
- if (ret)
8369
- return ret;
8370
- }
8371
-
8372
- if (bio == dip->orig_bio)
8373
- return 0;
8374
-
8375
- file_offset -= dip->logical_offset;
8376
- file_offset >>= inode->i_sb->s_blocksize_bits;
8377
- io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
8378
-
8379
- return 0;
7779
+ btrfs_dio_private_put(dip);
83807780 }
83817781
83827782 static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
....@@ -8410,222 +7810,169 @@
84107810 * If we aren't doing async submit, calculate the csum of the
84117811 * bio now.
84127812 */
8413
- ret = btrfs_csum_one_bio(inode, bio, file_offset, 1);
7813
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1);
84147814 if (ret)
84157815 goto err;
84167816 } else {
8417
- ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio,
8418
- file_offset);
8419
- if (ret)
8420
- goto err;
7817
+ u64 csum_offset;
7818
+
7819
+ csum_offset = file_offset - dip->logical_offset;
7820
+ csum_offset >>= inode->i_sb->s_blocksize_bits;
7821
+ csum_offset *= btrfs_super_csum_size(fs_info->super_copy);
7822
+ btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
84217823 }
84227824 map:
8423
- ret = btrfs_map_bio(fs_info, bio, 0, 0);
7825
+ ret = btrfs_map_bio(fs_info, bio, 0);
84247826 err:
84257827 return ret;
84267828 }
84277829
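/*
 * Editor's worked example for the csum_offset math above (assuming 4KiB
 * blocks, i.e. s_blocksize_bits == 12, and 4-byte crc32c checksums): a
 * partial bio whose file_offset is 256KiB past dip->logical_offset gets
 *
 *   csum_offset = (262144 >> 12) * 4 = 64 * 4 = 256
 *
 * so it reads its checksums at dip->csums + 256, skipping the 64 sums
 * belonging to the blocks in front of it.
 */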
8428
-static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
7830
+/*
7831
+ * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked
7832
+ * or ordered extents whether or not we submit any bios.
7833
+ */
7834
+static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
7835
+ struct inode *inode,
7836
+ loff_t file_offset)
84297837 {
8430
- struct inode *inode = dip->inode;
7838
+ const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
7839
+ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
7840
+ size_t dip_size;
7841
+ struct btrfs_dio_private *dip;
7842
+
7843
+ dip_size = sizeof(*dip);
7844
+ if (!write && csum) {
7845
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7846
+ const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
7847
+ size_t nblocks;
7848
+
7849
+ nblocks = dio_bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
7850
+ dip_size += csum_size * nblocks;
7851
+ }
7852
+
7853
+ dip = kzalloc(dip_size, GFP_NOFS);
7854
+ if (!dip)
7855
+ return NULL;
7856
+
7857
+ dip->inode = inode;
7858
+ dip->logical_offset = file_offset;
7859
+ dip->bytes = dio_bio->bi_iter.bi_size;
7860
+ dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
7861
+ dip->dio_bio = dio_bio;
7862
+ refcount_set(&dip->refs, 1);
7863
+ return dip;
7864
+}
7865
+
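/*
 * Editor's worked example for the dip_size computation above (assuming
 * 4KiB blocks and 4-byte crc32c checksums): a 1MiB csummed read gets
 *
 *   nblocks  = 1MiB >> 12 = 256
 *   dip_size = sizeof(*dip) + 256 * 4 bytes
 *
 * so the checksum array presumably lives inline at the tail of the one
 * btrfs_dio_private allocation rather than in a separate buffer.
 */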
7866
+static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
7867
+ struct bio *dio_bio, loff_t file_offset)
7868
+{
7869
+ const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
7870
+ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
84317871 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7872
+ const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
7873
+ BTRFS_BLOCK_GROUP_RAID56_MASK);
7874
+ struct btrfs_dio_private *dip;
84327875 struct bio *bio;
8433
- struct bio *orig_bio = dip->orig_bio;
8434
- u64 start_sector = orig_bio->bi_iter.bi_sector;
8435
- u64 file_offset = dip->logical_offset;
8436
- u64 map_length;
7876
+ u64 start_sector;
84377877 int async_submit = 0;
84387878 u64 submit_len;
84397879 int clone_offset = 0;
84407880 int clone_len;
84417881 int ret;
84427882 blk_status_t status;
7883
+ struct btrfs_io_geometry geom;
7884
+ struct btrfs_dio_data *dio_data = iomap->private;
84437885
8444
- map_length = orig_bio->bi_iter.bi_size;
8445
- submit_len = map_length;
8446
- ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
8447
- &map_length, NULL, 0);
8448
- if (ret)
8449
- return -EIO;
8450
-
8451
- if (map_length >= submit_len) {
8452
- bio = orig_bio;
8453
- dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
8454
- goto submit;
7886
+ dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
7887
+ if (!dip) {
7888
+ if (!write) {
7889
+ unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
7890
+ file_offset + dio_bio->bi_iter.bi_size - 1);
7891
+ }
7892
+ dio_bio->bi_status = BLK_STS_RESOURCE;
7893
+ bio_endio(dio_bio);
7894
+ return BLK_QC_T_NONE;
84557895 }
84567896
8457
- /* async crcs make it difficult to collect full stripe writes. */
8458
- if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK)
8459
- async_submit = 0;
8460
- else
8461
- async_submit = 1;
7897
+ if (!write && csum) {
7898
+ /*
7899
+ * Load the csums up front to reduce csum tree searches and
7900
+ * contention when submitting bios.
7901
+ */
7902
+ status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset,
7903
+ dip->csums);
7904
+ if (status != BLK_STS_OK)
7905
+ goto out_err;
7906
+ }
84627907
8463
- /* bio split */
8464
- ASSERT(map_length <= INT_MAX);
7908
+ start_sector = dio_bio->bi_iter.bi_sector;
7909
+ submit_len = dio_bio->bi_iter.bi_size;
7910
+
84657911 do {
8466
- clone_len = min_t(int, submit_len, map_length);
7912
+ ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio),
7913
+ start_sector << 9, submit_len,
7914
+ &geom);
7915
+ if (ret) {
7916
+ status = errno_to_blk_status(ret);
7917
+ goto out_err;
7918
+ }
7919
+ ASSERT(geom.len <= INT_MAX);
7920
+
7921
+ clone_len = min_t(int, submit_len, geom.len);
84677922
84687923 /*
84697924 * This will never fail as it's passing GPF_NOFS and
84707925 * the allocation is backed by btrfs_bioset.
84717926 */
8472
- bio = btrfs_bio_clone_partial(orig_bio, clone_offset,
8473
- clone_len);
7927
+ bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
84747928 bio->bi_private = dip;
84757929 bio->bi_end_io = btrfs_end_dio_bio;
84767930 btrfs_io_bio(bio)->logical = file_offset;
84777931
84787932 ASSERT(submit_len >= clone_len);
84797933 submit_len -= clone_len;
8480
- if (submit_len == 0)
8481
- break;
84827934
84837935 /*
84847936 * Increase the count before we submit the bio so we know
84857937 * the end IO handler won't happen before we increase the
84867938 * count. Otherwise, the dip might get freed before we're
84877939 * done setting it up.
7940
+ *
7941
+ * We transfer the initial reference to the last bio, so we
7942
+ * don't need to increment the reference count for the last one.
84887943 */
8489
- atomic_inc(&dip->pending_bios);
7944
+ if (submit_len > 0) {
7945
+ refcount_inc(&dip->refs);
7946
+ /*
7947
+ * If we are submitting more than one bio, submit them
7948
+ * all asynchronously. The exception is RAID 5 or 6, as
7949
+ * asynchronous checksums make it difficult to collect
7950
+ * full stripe writes.
7951
+ */
7952
+ if (!raid56)
7953
+ async_submit = 1;
7954
+ }
84907955
84917956 status = btrfs_submit_dio_bio(bio, inode, file_offset,
84927957 async_submit);
84937958 if (status) {
84947959 bio_put(bio);
8495
- atomic_dec(&dip->pending_bios);
7960
+ if (submit_len > 0)
7961
+ refcount_dec(&dip->refs);
84967962 goto out_err;
84977963 }
84987964
7965
+ dio_data->submitted += clone_len;
84997966 clone_offset += clone_len;
85007967 start_sector += clone_len >> 9;
85017968 file_offset += clone_len;
8502
-
8503
- map_length = submit_len;
8504
- ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
8505
- start_sector << 9, &map_length, NULL, 0);
8506
- if (ret)
8507
- goto out_err;
85087969 } while (submit_len > 0);
7970
+ return BLK_QC_T_NONE;
85097971
8510
-submit:
8511
- status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
8512
- if (!status)
8513
- return 0;
8514
-
8515
- if (bio != orig_bio)
8516
- bio_put(bio);
85177972 out_err:
8518
- dip->errors = 1;
8519
- /*
8520
- * Before atomic variable goto zero, we must make sure dip->errors is
8521
- * perceived to be set. This ordering is ensured by the fact that an
8522
- * atomic operations with a return value are fully ordered as per
8523
- * atomic_t.txt
8524
- */
8525
- if (atomic_dec_and_test(&dip->pending_bios))
8526
- bio_io_error(dip->orig_bio);
8527
-
8528
- /* bio_end_io() will handle error, so we needn't return it */
8529
- return 0;
8530
-}
8531
-
8532
-static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
8533
- loff_t file_offset)
8534
-{
8535
- struct btrfs_dio_private *dip = NULL;
8536
- struct bio *bio = NULL;
8537
- struct btrfs_io_bio *io_bio;
8538
- bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
8539
- int ret = 0;
8540
-
8541
- bio = btrfs_bio_clone(dio_bio);
8542
-
8543
- dip = kzalloc(sizeof(*dip), GFP_NOFS);
8544
- if (!dip) {
8545
- ret = -ENOMEM;
8546
- goto free_ordered;
8547
- }
8548
-
8549
- dip->private = dio_bio->bi_private;
8550
- dip->inode = inode;
8551
- dip->logical_offset = file_offset;
8552
- dip->bytes = dio_bio->bi_iter.bi_size;
8553
- dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
8554
- bio->bi_private = dip;
8555
- dip->orig_bio = bio;
8556
- dip->dio_bio = dio_bio;
8557
- atomic_set(&dip->pending_bios, 1);
8558
- io_bio = btrfs_io_bio(bio);
8559
- io_bio->logical = file_offset;
8560
-
8561
- if (write) {
8562
- bio->bi_end_io = btrfs_endio_direct_write;
8563
- } else {
8564
- bio->bi_end_io = btrfs_endio_direct_read;
8565
- dip->subio_endio = btrfs_subio_endio_read;
8566
- }
8567
-
8568
- /*
8569
- * Reset the range for unsubmitted ordered extents (to a 0 length range)
8570
- * even if we fail to submit a bio, because in such case we do the
8571
- * corresponding error handling below and it must not be done a second
8572
- * time by btrfs_direct_IO().
8573
- */
8574
- if (write) {
8575
- struct btrfs_dio_data *dio_data = current->journal_info;
8576
-
8577
- dio_data->unsubmitted_oe_range_end = dip->logical_offset +
8578
- dip->bytes;
8579
- dio_data->unsubmitted_oe_range_start =
8580
- dio_data->unsubmitted_oe_range_end;
8581
- }
8582
-
8583
- ret = btrfs_submit_direct_hook(dip);
8584
- if (!ret)
8585
- return;
8586
-
8587
- if (io_bio->end_io)
8588
- io_bio->end_io(io_bio, ret);
8589
-
8590
-free_ordered:
8591
- /*
8592
- * If we arrived here it means either we failed to submit the dip
8593
- * or we either failed to clone the dio_bio or failed to allocate the
8594
- * dip. If we cloned the dio_bio and allocated the dip, we can just
8595
- * call bio_endio against our io_bio so that we get proper resource
8596
- * cleanup if we fail to submit the dip, otherwise, we must do the
8597
- * same as btrfs_endio_direct_[write|read] because we can't call these
8598
- * callbacks - they require an allocated dip and a clone of dio_bio.
8599
- */
8600
- if (bio && dip) {
8601
- bio_io_error(bio);
8602
- /*
8603
- * The end io callbacks free our dip, do the final put on bio
8604
- * and all the cleanup and final put for dio_bio (through
8605
- * dio_end_io()).
8606
- */
8607
- dip = NULL;
8608
- bio = NULL;
8609
- } else {
8610
- if (write)
8611
- __endio_write_update_ordered(inode,
8612
- file_offset,
8613
- dio_bio->bi_iter.bi_size,
8614
- false);
8615
- else
8616
- unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
8617
- file_offset + dio_bio->bi_iter.bi_size - 1);
8618
-
8619
- dio_bio->bi_status = BLK_STS_IOERR;
8620
- /*
8621
- * Releases and cleans up our dio_bio, no need to bio_put()
8622
- * nor bio_endio()/bio_io_error() against dio_bio.
8623
- */
8624
- dio_end_io(dio_bio);
8625
- }
8626
- if (bio)
8627
- bio_put(bio);
8628
- kfree(dip);
7973
+ dip->dio_bio->bi_status = status;
7974
+ btrfs_dio_private_put(dip);
7975
+ return BLK_QC_T_NONE;
86297976 }
86307977
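/*
 * Editor's sketch of the splitting loop above under an assumed layout:
 * on a RAID0 data profile with 64KiB stripes, a 1MiB dio bio starting on
 * a stripe boundary would be carved into sixteen 64KiB clones, each
 * btrfs_get_io_geometry() call reporting geom.len == 64KiB.  Every clone
 * but the last takes an extra dip reference (the last inherits the
 * initial one), and since more than one bio is submitted on a
 * non-RAID56 profile, checksumming goes through the async path.
 */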
86317978 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
....@@ -8661,37 +8008,63 @@
86618008 return retval;
86628009 }
86638010
8664
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8011
+static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size,
8012
+ int error, unsigned flags)
8013
+{
8014
+ /*
8015
+ * If we're still in the context of our submitter, we know we can't
8016
+ * safely run generic_write_sync(), so clear our flag here so that the
8017
+ * caller knows to follow up with a sync.
8018
+ */
8019
+ if (current->journal_info == BTRFS_DIO_SYNC_STUB) {
8020
+ current->journal_info = NULL;
8021
+ return error;
8022
+ }
8023
+
8024
+ if (error)
8025
+ return error;
8026
+
8027
+ if (size) {
8028
+ iocb->ki_flags |= IOCB_DSYNC;
8029
+ return generic_write_sync(iocb, size);
8030
+ }
8031
+
8032
+ return 0;
8033
+}
8034
+
8035
+static const struct iomap_ops btrfs_dio_iomap_ops = {
8036
+ .iomap_begin = btrfs_dio_iomap_begin,
8037
+ .iomap_end = btrfs_dio_iomap_end,
8038
+};
8039
+
8040
+static const struct iomap_dio_ops btrfs_dio_ops = {
8041
+ .submit_io = btrfs_submit_direct,
8042
+};
8043
+
8044
+static const struct iomap_dio_ops btrfs_sync_dops = {
8045
+ .submit_io = btrfs_submit_direct,
8046
+ .end_io = btrfs_maybe_fsync_end_io,
8047
+};
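/*
 * Editor's summary of how the ops tables above cooperate with
 * BTRFS_DIO_SYNC_STUB (the stub is set by the write path outside this
 * hunk; the flow below is reconstructed from the comments in this patch):
 *
 *   caller sets current->journal_info = BTRFS_DIO_SYNC_STUB
 *   -> btrfs_direct_IO() sees a journal_info and picks btrfs_sync_dops
 *      -> btrfs_dio_iomap_begin() clears journal_info so the enospc
 *         machinery still flushes, remembering it in dio_data->sync
 *      -> btrfs_dio_iomap_end() re-sets the stub for the endio
 *   -> btrfs_maybe_fsync_end_io(): if the stub is still set we completed
 *      in the submitter's context, so clear it and let the caller sync;
 *      otherwise set IOCB_DSYNC and run generic_write_sync().
 */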
8048
+
8049
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
86658050 {
86668051 struct file *file = iocb->ki_filp;
86678052 struct inode *inode = file->f_mapping->host;
86688053 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8669
- struct btrfs_dio_data dio_data = { 0 };
86708054 struct extent_changeset *data_reserved = NULL;
86718055 loff_t offset = iocb->ki_pos;
86728056 size_t count = 0;
8673
- int flags = 0;
8674
- bool wakeup = true;
86758057 bool relock = false;
86768058 ssize_t ret;
86778059
8678
- if (check_direct_IO(fs_info, iter, offset))
8060
+ if (check_direct_IO(fs_info, iter, offset)) {
8061
+ ASSERT(current->journal_info == NULL ||
8062
+ current->journal_info == BTRFS_DIO_SYNC_STUB);
8063
+ current->journal_info = NULL;
86798064 return 0;
8065
+ }
86808066
8681
- inode_dio_begin(inode);
8682
-
8683
- /*
8684
- * The generic stuff only does filemap_write_and_wait_range, which
8685
- * isn't enough if we've written compressed pages to this area, so
8686
- * we need to flush the dirty pages again to make absolutely sure
8687
- * that any outstanding dirty pages are on disk.
8688
- */
86898067 count = iov_iter_count(iter);
8690
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8691
- &BTRFS_I(inode)->runtime_flags))
8692
- filemap_fdatawrite_range(inode->i_mapping, offset,
8693
- offset + count - 1);
8694
-
86958068 if (iov_iter_rw(iter) == WRITE) {
86968069 /*
86978070 * If the write DIO is beyond the EOF, we need to update
....@@ -8699,65 +8072,29 @@
86998072 * not unlock the i_mutex in this case.
87008073 */
87018074 if (offset + count <= inode->i_size) {
8702
- dio_data.overwrite = 1;
87038075 inode_unlock(inode);
87048076 relock = true;
87058077 }
8706
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
8707
- offset, count);
8708
- if (ret)
8709
- goto out;
8710
-
8711
- /*
8712
- * We need to know how many extents we reserved so that we can
8713
- * do the accounting properly if we go over the number we
8714
- * originally calculated. Abuse current->journal_info for this.
8715
- */
8716
- dio_data.reserve = round_up(count,
8717
- fs_info->sectorsize);
8718
- dio_data.unsubmitted_oe_range_start = (u64)offset;
8719
- dio_data.unsubmitted_oe_range_end = (u64)offset;
8720
- current->journal_info = &dio_data;
87218078 down_read(&BTRFS_I(inode)->dio_sem);
8722
- } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
8723
- &BTRFS_I(inode)->runtime_flags)) {
8724
- inode_dio_end(inode);
8725
- flags = DIO_LOCKING | DIO_SKIP_HOLES;
8726
- wakeup = false;
87278079 }
87288080
8729
- ret = __blockdev_direct_IO(iocb, inode,
8730
- fs_info->fs_devices->latest_bdev,
8731
- iter, btrfs_get_blocks_direct, NULL,
8732
- btrfs_submit_direct, flags);
8733
- if (iov_iter_rw(iter) == WRITE) {
8081
+ /*
8082
+ * If we are actually a sync iocb, we need our fancy endio to know
8083
+ * if we need to sync.
8084
+ */
8085
+ if (current->journal_info)
8086
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
8087
+ &btrfs_sync_dops, is_sync_kiocb(iocb));
8088
+ else
8089
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
8090
+ &btrfs_dio_ops, is_sync_kiocb(iocb));
8091
+
8092
+ if (ret == -ENOTBLK)
8093
+ ret = 0;
8094
+
8095
+ if (iov_iter_rw(iter) == WRITE)
87348096 up_read(&BTRFS_I(inode)->dio_sem);
8735
- current->journal_info = NULL;
8736
- if (ret < 0 && ret != -EIOCBQUEUED) {
8737
- if (dio_data.reserve)
8738
- btrfs_delalloc_release_space(inode, data_reserved,
8739
- offset, dio_data.reserve, true);
8740
- /*
8741
- * On error we might have left some ordered extents
8742
- * without submitting corresponding bios for them, so
8743
- * cleanup them up to avoid other tasks getting them
8744
- * and waiting for them to complete forever.
8745
- */
8746
- if (dio_data.unsubmitted_oe_range_start <
8747
- dio_data.unsubmitted_oe_range_end)
8748
- __endio_write_update_ordered(inode,
8749
- dio_data.unsubmitted_oe_range_start,
8750
- dio_data.unsubmitted_oe_range_end -
8751
- dio_data.unsubmitted_oe_range_start,
8752
- false);
8753
- } else if (ret >= 0 && (size_t)ret < count)
8754
- btrfs_delalloc_release_space(inode, data_reserved,
8755
- offset, count - (size_t)ret, true);
8756
- btrfs_delalloc_release_extents(BTRFS_I(inode), count);
8757
- }
8758
-out:
8759
- if (wakeup)
8760
- inode_dio_end(inode);
8097
+
87618098 if (relock)
87628099 inode_lock(inode);
87638100
....@@ -8765,25 +8102,33 @@
87658102 return ret;
87668103 }
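One subtlety above: iomap_dio_rw() returns -ENOTBLK when the direct I/O could not be attempted at all (for example when page cache invalidation fails), and squashing that to 0 yields a short result that the write path can finish with buffered I/O. A hypothetical caller shape (the real fallback logic lives in file.c; example_buffered_fallback() is illustrative only):

	written = btrfs_direct_IO(iocb, from);
	if (written >= 0 && iov_iter_count(from) > 0)
		/* direct IO refused or cut short: finish with buffered IO */
		written = example_buffered_fallback(iocb, from, written);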
87678104
8768
-#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
8769
-
87708105 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8771
- __u64 start, __u64 len)
8106
+ u64 start, u64 len)
87728107 {
87738108 int ret;
87748109
8775
- ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
8110
+ ret = fiemap_prep(inode, fieinfo, start, &len, 0);
87768111 if (ret)
87778112 return ret;
87788113
8779
- return extent_fiemap(inode, fieinfo, start, len);
8114
+ return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
87808115 }
87818116
87828117 int btrfs_readpage(struct file *file, struct page *page)
87838118 {
8784
- struct extent_io_tree *tree;
8785
- tree = &BTRFS_I(page->mapping->host)->io_tree;
8786
- return extent_read_full_page(tree, page, btrfs_get_extent, 0);
8119
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
8120
+ u64 start = page_offset(page);
8121
+ u64 end = start + PAGE_SIZE - 1;
8122
+ unsigned long bio_flags = 0;
8123
+ struct bio *bio = NULL;
8124
+ int ret;
8125
+
8126
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
8127
+
8128
+ ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
8129
+ if (bio)
8130
+ ret = submit_one_bio(bio, 0, bio_flags);
8131
+ return ret;
87878132 }
87888133
87898134 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
....@@ -8817,21 +8162,16 @@
88178162 return extent_writepages(mapping, wbc);
88188163 }
88198164
8820
-static int
8821
-btrfs_readpages(struct file *file, struct address_space *mapping,
8822
- struct list_head *pages, unsigned nr_pages)
8165
+static void btrfs_readahead(struct readahead_control *rac)
88238166 {
8824
- return extent_readpages(mapping, pages, nr_pages);
8167
+ extent_readahead(rac);
88258168 }
88268169
88278170 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
88288171 {
88298172 int ret = try_release_extent_mapping(page, gfp_flags);
8830
- if (ret == 1) {
8831
- ClearPagePrivate(page);
8832
- set_page_private(page, 0);
8833
- put_page(page);
8834
- }
8173
+ if (ret == 1)
8174
+ detach_page_private(page);
88358175 return ret;
88368176 }
88378177
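Several hunks in this patch replace the open-coded ClearPagePrivate()/set_page_private()/put_page() triplets with the generic page-private helpers. For reference, a simplified sketch of what those helpers do (modelled on their include/linux/pagemap.h definitions around this kernel version; the _sketch suffix is only to avoid clashing with the real names):

static inline void attach_page_private_sketch(struct page *page, void *data)
{
	get_page(page);			/* the private data pins the page */
	set_page_private(page, (unsigned long)data);
	SetPagePrivate(page);
}

static inline void *detach_page_private_sketch(struct page *page)
{
	void *data;

	if (!PagePrivate(page))
		return NULL;
	data = (void *)page_private(page);
	ClearPagePrivate(page);
	set_page_private(page, 0);
	put_page(page);			/* drop the reference taken at attach */
	return data;
}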
....@@ -8842,18 +8182,45 @@
88428182 return __btrfs_releasepage(page, gfp_flags);
88438183 }
88448184
8185
+#ifdef CONFIG_MIGRATION
8186
+static int btrfs_migratepage(struct address_space *mapping,
8187
+ struct page *newpage, struct page *page,
8188
+ enum migrate_mode mode)
8189
+{
8190
+ int ret;
8191
+
8192
+ ret = migrate_page_move_mapping(mapping, newpage, page, 0);
8193
+ if (ret != MIGRATEPAGE_SUCCESS)
8194
+ return ret;
8195
+
8196
+ if (page_has_private(page))
8197
+ attach_page_private(newpage, detach_page_private(page));
8198
+
8199
+ if (PagePrivate2(page)) {
8200
+ ClearPagePrivate2(page);
8201
+ SetPagePrivate2(newpage);
8202
+ }
8203
+
8204
+ if (mode != MIGRATE_SYNC_NO_COPY)
8205
+ migrate_page_copy(newpage, page);
8206
+ else
8207
+ migrate_page_states(newpage, page);
8208
+ return MIGRATEPAGE_SUCCESS;
8209
+}
8210
+#endif
8211
+
88458212 static void btrfs_invalidatepage(struct page *page, unsigned int offset,
88468213 unsigned int length)
88478214 {
8848
- struct inode *inode = page->mapping->host;
8849
- struct extent_io_tree *tree;
8215
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
8216
+ struct extent_io_tree *tree = &inode->io_tree;
88508217 struct btrfs_ordered_extent *ordered;
88518218 struct extent_state *cached_state = NULL;
88528219 u64 page_start = page_offset(page);
88538220 u64 page_end = page_start + PAGE_SIZE - 1;
88548221 u64 start;
88558222 u64 end;
8856
- int inode_evicting = inode->i_state & I_FREEING;
8223
+ int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
88578224
88588225 /*
88598226 * we have the page locked, so new writeback can't start,
....@@ -8864,28 +8231,39 @@
88648231 */
88658232 wait_on_page_writeback(page);
88668233
8867
- tree = &BTRFS_I(inode)->io_tree;
8868
- if (offset) {
8234
+ /*
8235
+ * For the subpage case, we have call sites like
8236
+ * btrfs_punch_hole_lock_range() which pass in ranges that are not
8237
+ * aligned to the sectorsize.
8238
+ * If the range doesn't cover the full page, we don't need to and
8239
+ * shouldn't clear the page's extent-mapped state, as page->private
8240
+ * can still record subpage dirty bits for other parts of the range.
8241
+ *
8242
+ * For cases that invalidate the full page even though the range
8243
+ * doesn't cover it, like invalidating the last page, we're still
8244
+ * safe to wait for the ordered extent to finish.
8245
+ */
8246
+ if (!(offset == 0 && length == PAGE_SIZE)) {
88698247 btrfs_releasepage(page, GFP_NOFS);
88708248 return;
88718249 }
88728250
88738251 if (!inode_evicting)
88748252 lock_extent_bits(tree, page_start, page_end, &cached_state);
8875
-again:
8253
+
88768254 start = page_start;
8877
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
8878
- page_end - start + 1);
8255
+again:
8256
+ ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
88798257 if (ordered) {
8880
- end = min(page_end, ordered->file_offset + ordered->len - 1);
8258
+ end = min(page_end,
8259
+ ordered->file_offset + ordered->num_bytes - 1);
88818260 /*
88828261 * IO on this page will never be started, so we need
88838262 * to account for any ordered extents now
88848263 */
88858264 if (!inode_evicting)
88868265 clear_extent_bit(tree, start, end,
8887
- EXTENT_DIRTY | EXTENT_DELALLOC |
8888
- EXTENT_DELALLOC_NEW |
8266
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
88898267 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
88908268 EXTENT_DEFRAG, 1, 0, &cached_state);
88918269 /*
....@@ -8896,7 +8274,7 @@
88968274 struct btrfs_ordered_inode_tree *tree;
88978275 u64 new_len;
88988276
8899
- tree = &BTRFS_I(inode)->ordered_tree;
8277
+ tree = &inode->ordered_tree;
89008278
89018279 spin_lock_irq(&tree->lock);
89028280 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
....@@ -8937,8 +8315,7 @@
89378315 */
89388316 btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
89398317 if (!inode_evicting) {
8940
- clear_extent_bit(tree, page_start, page_end,
8941
- EXTENT_LOCKED | EXTENT_DIRTY |
8318
+ clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
89428319 EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
89438320 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
89448321 &cached_state);
....@@ -8947,11 +8324,7 @@
89478324 }
89488325
89498326 ClearPageChecked(page);
8950
- if (PagePrivate(page)) {
8951
- ClearPagePrivate(page);
8952
- set_page_private(page, 0);
8953
- put_page(page);
8954
- }
8327
+ detach_page_private(page);
89558328 }
89568329
89578330 /*
....@@ -9004,8 +8377,8 @@
90048377 * end up waiting indefinitely to get a lock on the page currently
90058378 * being processed by btrfs_page_mkwrite() function.
90068379 */
9007
- ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
9008
- reserved_space);
8380
+ ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
8381
+ page_start, reserved_space);
90098382 if (!ret2) {
90108383 ret2 = file_update_time(vmf->vma->vm_file);
90118384 reserved = 1;
....@@ -9042,7 +8415,7 @@
90428415 unlock_extent_cached(io_tree, page_start, page_end,
90438416 &cached_state);
90448417 unlock_page(page);
9045
- btrfs_start_ordered_extent(inode, ordered, 1);
8418
+ btrfs_start_ordered_extent(ordered, 1);
90468419 btrfs_put_ordered_extent(ordered);
90478420 goto again;
90488421 }
....@@ -9052,9 +8425,9 @@
90528425 fs_info->sectorsize);
90538426 if (reserved_space < PAGE_SIZE) {
90548427 end = page_start + reserved_space - 1;
9055
- btrfs_delalloc_release_space(inode, data_reserved,
9056
- page_start, PAGE_SIZE - reserved_space,
9057
- true);
8428
+ btrfs_delalloc_release_space(BTRFS_I(inode),
8429
+ data_reserved, page_start,
8430
+ PAGE_SIZE - reserved_space, true);
90588431 }
90598432 }
90608433
....@@ -9066,23 +8439,21 @@
90668439 * reserve data&meta space before lock_page() (see above comments).
90678440 */
90688441 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
9069
- EXTENT_DIRTY | EXTENT_DELALLOC |
9070
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
9071
- 0, 0, &cached_state);
8442
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
8443
+ EXTENT_DEFRAG, 0, 0, &cached_state);
90728444
9073
- ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0,
9074
- &cached_state, 0);
8445
+ ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
8446
+ &cached_state);
90758447 if (ret2) {
90768448 unlock_extent_cached(io_tree, page_start, page_end,
90778449 &cached_state);
90788450 ret = VM_FAULT_SIGBUS;
90798451 goto out_unlock;
90808452 }
9081
- ret2 = 0;
90828453
90838454 /* page is wholly or partially inside EOF */
90848455 if (page_start + PAGE_SIZE > size)
9085
- zero_start = size & ~PAGE_MASK;
8456
+ zero_start = offset_in_page(size);
90868457 else
90878458 zero_start = PAGE_SIZE;
90888459
....@@ -9096,24 +8467,20 @@
90968467 set_page_dirty(page);
90978468 SetPageUptodate(page);
90988469
9099
- BTRFS_I(inode)->last_trans = fs_info->generation;
9100
- BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
9101
- BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
8470
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
91028471
91038472 unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
91048473
9105
- if (!ret2) {
9106
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
9107
- sb_end_pagefault(inode->i_sb);
9108
- extent_changeset_free(data_reserved);
9109
- return VM_FAULT_LOCKED;
9110
- }
8474
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8475
+ sb_end_pagefault(inode->i_sb);
8476
+ extent_changeset_free(data_reserved);
8477
+ return VM_FAULT_LOCKED;
91118478
91128479 out_unlock:
91138480 unlock_page(page);
91148481 out:
91158482 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
9116
- btrfs_delalloc_release_space(inode, data_reserved, page_start,
8483
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
91178484 reserved_space, (ret != 0));
91188485 out_noreserve:
91198486 sb_end_pagefault(inode->i_sb);
....@@ -9129,7 +8496,7 @@
91298496 int ret;
91308497 struct btrfs_trans_handle *trans;
91318498 u64 mask = fs_info->sectorsize - 1;
9132
- u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
8499
+ u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
91338500
91348501 if (!skip_writeback) {
91358502 ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
....@@ -9184,7 +8551,7 @@
91848551
91858552 /* Migrate the slack space for the truncate to our reserve */
91868553 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
9187
- min_size, 0);
8554
+ min_size, false);
91888555 BUG_ON(ret);
91898556
91908557 /*
....@@ -9219,9 +8586,9 @@
92198586 break;
92208587 }
92218588
9222
- btrfs_block_rsv_release(fs_info, rsv, -1);
8589
+ btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
92238590 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
9224
- rsv, min_size, 0);
8591
+ rsv, min_size, false);
92258592 BUG_ON(ret); /* shouldn't happen */
92268593 trans->block_rsv = rsv;
92278594 }
....@@ -9244,7 +8611,7 @@
92448611 ret = PTR_ERR(trans);
92458612 goto out;
92468613 }
9247
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
8614
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
92488615 }
92498616
92508617 if (trans) {
....@@ -9327,7 +8694,7 @@
93278694 ei->index_cnt = (u64)-1;
93288695 ei->dir_index = 0;
93298696 ei->last_unlink_trans = 0;
9330
- ei->last_link_trans = 0;
8697
+ ei->last_reflink_trans = 0;
93318698 ei->last_log_commit = 0;
93328699
93338700 spin_lock_init(&ei->lock);
....@@ -9346,13 +8713,15 @@
93468713
93478714 inode = &ei->vfs_inode;
93488715 extent_map_tree_init(&ei->extent_tree);
9349
- extent_io_tree_init(&ei->io_tree, inode);
9350
- extent_io_tree_init(&ei->io_failure_tree, inode);
9351
- ei->io_tree.track_uptodate = 1;
9352
- ei->io_failure_tree.track_uptodate = 1;
8716
+ extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
8717
+ extent_io_tree_init(fs_info, &ei->io_failure_tree,
8718
+ IO_TREE_INODE_IO_FAILURE, inode);
8719
+ extent_io_tree_init(fs_info, &ei->file_extent_tree,
8720
+ IO_TREE_INODE_FILE_EXTENT, inode);
8721
+ ei->io_tree.track_uptodate = true;
8722
+ ei->io_failure_tree.track_uptodate = true;
93538723 atomic_set(&ei->sync_writers, 0);
93548724 mutex_init(&ei->log_mutex);
9355
- mutex_init(&ei->delalloc_mutex);
93568725 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
93578726 INIT_LIST_HEAD(&ei->delalloc_inodes);
93588727 INIT_LIST_HEAD(&ei->delayed_iput);
....@@ -9370,27 +8739,26 @@
93708739 }
93718740 #endif
93728741
9373
-static void btrfs_i_callback(struct rcu_head *head)
8742
+void btrfs_free_inode(struct inode *inode)
93748743 {
9375
- struct inode *inode = container_of(head, struct inode, i_rcu);
93768744 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
93778745 }
93788746
9379
-void btrfs_destroy_inode(struct inode *inode)
8747
+void btrfs_destroy_inode(struct inode *vfs_inode)
93808748 {
9381
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
93828749 struct btrfs_ordered_extent *ordered;
9383
- struct btrfs_root *root = BTRFS_I(inode)->root;
8750
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
8751
+ struct btrfs_root *root = inode->root;
93848752
9385
- WARN_ON(!hlist_empty(&inode->i_dentry));
9386
- WARN_ON(inode->i_data.nrpages);
9387
- WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
9388
- WARN_ON(BTRFS_I(inode)->block_rsv.size);
9389
- WARN_ON(BTRFS_I(inode)->outstanding_extents);
9390
- WARN_ON(BTRFS_I(inode)->delalloc_bytes);
9391
- WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
9392
- WARN_ON(BTRFS_I(inode)->csum_bytes);
9393
- WARN_ON(BTRFS_I(inode)->defrag_bytes);
8753
+ WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
8754
+ WARN_ON(vfs_inode->i_data.nrpages);
8755
+ WARN_ON(inode->block_rsv.reserved);
8756
+ WARN_ON(inode->block_rsv.size);
8757
+ WARN_ON(inode->outstanding_extents);
8758
+ WARN_ON(inode->delalloc_bytes);
8759
+ WARN_ON(inode->new_delalloc_bytes);
8760
+ WARN_ON(inode->csum_bytes);
8761
+ WARN_ON(inode->defrag_bytes);
93948762
93958763 /*
93968764 * This can happen where we create an inode, but somebody else also
....@@ -9398,16 +8766,16 @@
93988766 * created.
93998767 */
94008768 if (!root)
9401
- goto free;
8769
+ return;
94028770
94038771 while (1) {
94048772 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
94058773 if (!ordered)
94068774 break;
94078775 else {
9408
- btrfs_err(fs_info,
8776
+ btrfs_err(root->fs_info,
94098777 "found ordered extent %llu %llu on inode cleanup",
9410
- ordered->file_offset, ordered->len);
8778
+ ordered->file_offset, ordered->num_bytes);
94118779 btrfs_remove_ordered_extent(inode, ordered);
94128780 btrfs_put_ordered_extent(ordered);
94138781 btrfs_put_ordered_extent(ordered);
....@@ -9415,9 +8783,9 @@
94158783 }
94168784 btrfs_qgroup_check_reserved_leak(inode);
94178785 inode_tree_del(inode);
9418
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
9419
-free:
9420
- call_rcu(&inode->i_rcu, btrfs_i_callback);
8786
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
8787
+ btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
8788
+ btrfs_put_root(inode->root);
94218789 }
94228790
94238791 int btrfs_drop_inode(struct inode *inode)
....@@ -9542,19 +8910,15 @@
95428910 struct inode *new_inode = new_dentry->d_inode;
95438911 struct inode *old_inode = old_dentry->d_inode;
95448912 struct timespec64 ctime = current_time(old_inode);
9545
- struct dentry *parent;
95468913 u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
95478914 u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
95488915 u64 old_idx = 0;
95498916 u64 new_idx = 0;
95508917 int ret;
8918
+ int ret2;
95518919 bool root_log_pinned = false;
95528920 bool dest_log_pinned = false;
9553
- struct btrfs_log_ctx ctx_root;
9554
- struct btrfs_log_ctx ctx_dest;
9555
- bool sync_log_root = false;
9556
- bool sync_log_dest = false;
9557
- bool commit_transaction = false;
8921
+ bool need_abort = false;
95588922
95598923 /*
95608924 * For non-subvolumes allow exchange only within one subvolume, in the
....@@ -9565,9 +8929,6 @@
95658929 (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
95668930 new_ino != BTRFS_FIRST_FREE_OBJECTID))
95678931 return -EXDEV;
9568
-
9569
- btrfs_init_log_ctx(&ctx_root, old_inode);
9570
- btrfs_init_log_ctx(&ctx_dest, new_inode);
95718932
95728933 /* close the race window with snapshot create/destroy ioctl */
95738934 if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
....@@ -9608,7 +8969,7 @@
96088969 /* Reference for the source. */
96098970 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
96108971 /* force full log commit if subvolume involved. */
9611
- btrfs_set_log_full_commit(fs_info, trans);
8972
+ btrfs_set_log_full_commit(trans);
96128973 } else {
96138974 btrfs_pin_log_trans(root);
96148975 root_log_pinned = true;
....@@ -9620,12 +8981,13 @@
96208981 old_idx);
96218982 if (ret)
96228983 goto out_fail;
8984
+ need_abort = true;
96238985 }
96248986
96258987 /* And now for the dest. */
96268988 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
96278989 /* force full log commit if subvolume involved. */
9628
- btrfs_set_log_full_commit(fs_info, trans);
8990
+ btrfs_set_log_full_commit(trans);
96298991 } else {
96308992 btrfs_pin_log_trans(dest);
96318993 dest_log_pinned = true;
....@@ -9635,8 +8997,11 @@
96358997 new_ino,
96368998 btrfs_ino(BTRFS_I(old_dir)),
96378999 new_idx);
9638
- if (ret)
9000
+ if (ret) {
9001
+ if (need_abort)
9002
+ btrfs_abort_transaction(trans, ret);
96399003 goto out_fail;
9004
+ }
96409005 }
96419006
96429007 /* Update inode version and ctime/mtime. */
....@@ -9710,30 +9075,14 @@
97109075 BTRFS_I(new_inode)->dir_index = new_idx;
97119076
97129077 if (root_log_pinned) {
9713
- parent = new_dentry->d_parent;
9714
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
9715
- BTRFS_I(old_dir), parent,
9716
- false, &ctx_root);
9717
- if (ret == BTRFS_NEED_LOG_SYNC)
9718
- sync_log_root = true;
9719
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
9720
- commit_transaction = true;
9721
- ret = 0;
9078
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
9079
+ new_dentry->d_parent);
97229080 btrfs_end_log_trans(root);
97239081 root_log_pinned = false;
97249082 }
97259083 if (dest_log_pinned) {
9726
- if (!commit_transaction) {
9727
- parent = old_dentry->d_parent;
9728
- ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
9729
- BTRFS_I(new_dir), parent,
9730
- false, &ctx_dest);
9731
- if (ret == BTRFS_NEED_LOG_SYNC)
9732
- sync_log_dest = true;
9733
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
9734
- commit_transaction = true;
9735
- ret = 0;
9736
- }
9084
+ btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
9085
+ old_dentry->d_parent);
97379086 btrfs_end_log_trans(dest);
97389087 dest_log_pinned = false;
97399088 }
....@@ -9755,7 +9104,7 @@
97559104 btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
97569105 (new_inode &&
97579106 btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
9758
- btrfs_set_log_full_commit(fs_info, trans);
9107
+ btrfs_set_log_full_commit(trans);
97599108
97609109 if (root_log_pinned) {
97619110 btrfs_end_log_trans(root);
....@@ -9766,45 +9115,12 @@
97669115 dest_log_pinned = false;
97679116 }
97689117 }
9769
- if (!ret && sync_log_root && !commit_transaction) {
9770
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
9771
- &ctx_root);
9772
- if (ret)
9773
- commit_transaction = true;
9774
- }
9775
- if (!ret && sync_log_dest && !commit_transaction) {
9776
- ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
9777
- &ctx_dest);
9778
- if (ret)
9779
- commit_transaction = true;
9780
- }
9781
- if (commit_transaction) {
9782
- /*
9783
- * We may have set commit_transaction when logging the new name
9784
- * in the destination root, in which case we left the source
9785
- * root context in the list of log contextes. So make sure we
9786
- * remove it to avoid invalid memory accesses, since the context
9787
- * was allocated in our stack frame.
9788
- */
9789
- if (sync_log_root) {
9790
- mutex_lock(&root->log_mutex);
9791
- list_del_init(&ctx_root.list);
9792
- mutex_unlock(&root->log_mutex);
9793
- }
9794
- ret = btrfs_commit_transaction(trans);
9795
- } else {
9796
- int ret2;
9797
-
9798
- ret2 = btrfs_end_transaction(trans);
9799
- ret = ret ? ret : ret2;
9800
- }
9118
+ ret2 = btrfs_end_transaction(trans);
9119
+ ret = ret ? ret : ret2;
98019120 out_notrans:
98029121 if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
98039122 old_ino == BTRFS_FIRST_FREE_OBJECTID)
98049123 up_read(&fs_info->subvol_sem);
9805
-
9806
- ASSERT(list_empty(&ctx_root.list));
9807
- ASSERT(list_empty(&ctx_dest.list));
98089124
98099125 return ret;
98109126 }
....@@ -9819,7 +9135,7 @@
98199135 u64 objectid;
98209136 u64 index;
98219137
9822
- ret = btrfs_find_free_ino(root, &objectid);
9138
+ ret = btrfs_find_free_objectid(root, &objectid);
98239139 if (ret)
98249140 return ret;
98259141
....@@ -9873,11 +9189,9 @@
98739189 struct inode *old_inode = d_inode(old_dentry);
98749190 u64 index = 0;
98759191 int ret;
9192
+ int ret2;
98769193 u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
98779194 bool log_pinned = false;
9878
- struct btrfs_log_ctx ctx;
9879
- bool sync_log = false;
9880
- bool commit_transaction = false;
98819195
98829196 if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
98839197 return -EPERM;
....@@ -9954,7 +9268,7 @@
99549268 BTRFS_I(old_inode)->dir_index = 0ULL;
99559269 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
99569270 /* force full log commit if subvolume involved. */
9957
- btrfs_set_log_full_commit(fs_info, trans);
9271
+ btrfs_set_log_full_commit(trans);
99589272 } else {
99599273 btrfs_pin_log_trans(root);
99609274 log_pinned = true;
....@@ -10027,17 +9341,8 @@
100279341 BTRFS_I(old_inode)->dir_index = index;
100289342
100299343 if (log_pinned) {
10030
- struct dentry *parent = new_dentry->d_parent;
10031
-
10032
- btrfs_init_log_ctx(&ctx, old_inode);
10033
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
10034
- BTRFS_I(old_dir), parent,
10035
- false, &ctx);
10036
- if (ret == BTRFS_NEED_LOG_SYNC)
10037
- sync_log = true;
10038
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
10039
- commit_transaction = true;
10040
- ret = 0;
9344
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
9345
+ new_dentry->d_parent);
100419346 btrfs_end_log_trans(root);
100429347 log_pinned = false;
100439348 }
....@@ -10069,28 +9374,13 @@
100699374 btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
100709375 (new_inode &&
100719376 btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
10072
- btrfs_set_log_full_commit(fs_info, trans);
9377
+ btrfs_set_log_full_commit(trans);
100739378
100749379 btrfs_end_log_trans(root);
100759380 log_pinned = false;
100769381 }
10077
- if (!ret && sync_log) {
10078
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
10079
- if (ret)
10080
- commit_transaction = true;
10081
- } else if (sync_log) {
10082
- mutex_lock(&root->log_mutex);
10083
- list_del(&ctx.list);
10084
- mutex_unlock(&root->log_mutex);
10085
- }
10086
- if (commit_transaction) {
10087
- ret = btrfs_commit_transaction(trans);
10088
- } else {
10089
- int ret2;
10090
-
10091
- ret2 = btrfs_end_transaction(trans);
10092
- ret = ret ? ret : ret2;
10093
- }
9382
+ ret2 = btrfs_end_transaction(trans);
9383
+ ret = ret ? ret : ret2;
100949384 out_notrans:
100959385 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
100969386 up_read(&fs_info->subvol_sem);
....@@ -10147,9 +9437,7 @@
101479437 init_completion(&work->completion);
101489438 INIT_LIST_HEAD(&work->list);
101499439 work->inode = inode;
10150
- WARN_ON_ONCE(!inode);
10151
- btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
10152
- btrfs_run_delalloc_work, NULL, NULL);
9440
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
101539441
101549442 return work;
101559443 }
....@@ -10158,7 +9446,9 @@
101589446 * some fairly slow code that needs optimization. This walks the list
101599447 * of all the inodes with pending delalloc and forces them to disk.
101609448 */
10161
-static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
9449
+static int start_delalloc_inodes(struct btrfs_root *root,
9450
+ struct writeback_control *wbc, bool snapshot,
9451
+ bool in_reclaim_context)
101629452 {
101639453 struct btrfs_inode *binode;
101649454 struct inode *inode;
....@@ -10166,6 +9456,7 @@
101669456 struct list_head works;
101679457 struct list_head splice;
101689458 int ret = 0;
9459
+ bool full_flush = wbc->nr_to_write == LONG_MAX;
101699460
101709461 INIT_LIST_HEAD(&works);
101719462 INIT_LIST_HEAD(&splice);
....@@ -10179,6 +9470,11 @@
101799470
101809471 list_move_tail(&binode->delalloc_inodes,
101819472 &root->delalloc_inodes);
9473
+
9474
+ if (in_reclaim_context &&
9475
+ test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
9476
+ continue;
9477
+
101829478 inode = igrab(&binode->vfs_inode);
101839479 if (!inode) {
101849480 cond_resched_lock(&root->delalloc_lock);
....@@ -10189,18 +9485,26 @@
101899485 if (snapshot)
101909486 set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
101919487 &binode->runtime_flags);
10192
- work = btrfs_alloc_delalloc_work(inode);
10193
- if (!work) {
10194
- iput(inode);
10195
- ret = -ENOMEM;
10196
- goto out;
9488
+ if (full_flush) {
9489
+ work = btrfs_alloc_delalloc_work(inode);
9490
+ if (!work) {
9491
+ iput(inode);
9492
+ ret = -ENOMEM;
9493
+ goto out;
9494
+ }
9495
+ list_add_tail(&work->list, &works);
9496
+ btrfs_queue_work(root->fs_info->flush_workers,
9497
+ &work->work);
9498
+ } else {
9499
+ ret = sync_inode(inode, wbc);
9500
+ if (!ret &&
9501
+ test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
9502
+ &BTRFS_I(inode)->runtime_flags))
9503
+ ret = sync_inode(inode, wbc);
9504
+ btrfs_add_delayed_iput(inode);
9505
+ if (ret || wbc->nr_to_write <= 0)
9506
+ goto out;
101979507 }
10198
- list_add_tail(&work->list, &works);
10199
- btrfs_queue_work(root->fs_info->flush_workers,
10200
- &work->work);
10201
- ret++;
10202
- if (nr != -1 && ret >= nr)
10203
- goto out;
102049508 cond_resched();
102059509 spin_lock(&root->delalloc_lock);
102069510 }
....@@ -10224,20 +9528,29 @@
102249528
102259529 int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
102269530 {
9531
+ struct writeback_control wbc = {
9532
+ .nr_to_write = LONG_MAX,
9533
+ .sync_mode = WB_SYNC_NONE,
9534
+ .range_start = 0,
9535
+ .range_end = LLONG_MAX,
9536
+ };
102279537 struct btrfs_fs_info *fs_info = root->fs_info;
10228
- int ret;
102299538
102309539 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
102319540 return -EROFS;
102329541
10233
- ret = start_delalloc_inodes(root, -1, true);
10234
- if (ret > 0)
10235
- ret = 0;
10236
- return ret;
9542
+ return start_delalloc_inodes(root, &wbc, true, false);
102379543 }
102389544
10239
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
9545
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
9546
+ bool in_reclaim_context)
102409547 {
9548
+ struct writeback_control wbc = {
9549
+ .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr,
9550
+ .sync_mode = WB_SYNC_NONE,
9551
+ .range_start = 0,
9552
+ .range_end = LLONG_MAX,
9553
+ };
102419554 struct btrfs_root *root;
102429555 struct list_head splice;
102439556 int ret;
....@@ -10251,23 +9564,25 @@
102519564 spin_lock(&fs_info->delalloc_root_lock);
102529565 list_splice_init(&fs_info->delalloc_roots, &splice);
102539566 while (!list_empty(&splice) && nr) {
9567
+ /*
9568
+ * Reset nr_to_write here so that start_delalloc_inodes() treats
9569
+ * each root as a full flush.
9570
+ */
9571
+ if (nr == U64_MAX)
9572
+ wbc.nr_to_write = LONG_MAX;
9573
+
102549574 root = list_first_entry(&splice, struct btrfs_root,
102559575 delalloc_root);
10256
- root = btrfs_grab_fs_root(root);
9576
+ root = btrfs_grab_root(root);
102579577 BUG_ON(!root);
102589578 list_move_tail(&root->delalloc_root,
102599579 &fs_info->delalloc_roots);
102609580 spin_unlock(&fs_info->delalloc_root_lock);
102619581
10262
- ret = start_delalloc_inodes(root, nr, false);
10263
- btrfs_put_fs_root(root);
10264
- if (ret < 0)
9582
+ ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
9583
+ btrfs_put_root(root);
9584
+ if (ret < 0 || wbc.nr_to_write <= 0)
102659585 goto out;
10266
-
10267
- if (nr != -1) {
10268
- nr -= ret;
10269
- WARN_ON(nr < 0);
10270
- }
102719586 spin_lock(&fs_info->delalloc_root_lock);
102729587 }
102739588 spin_unlock(&fs_info->delalloc_root_lock);
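A hedged usage sketch for the reworked interface: nr is a page count that seeds wbc.nr_to_write, and in_reclaim_context makes the walk skip inodes flagged BTRFS_INODE_NO_DELALLOC_FLUSH. The values below are illustrative only:

	/* From a reclaim path: flush up to 512 pages of delalloc. */
	ret = btrfs_start_delalloc_roots(fs_info, 512, true);
	if (ret < 0)
		return ret;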
....@@ -10316,7 +9631,7 @@
103169631 if (IS_ERR(trans))
103179632 return PTR_ERR(trans);
103189633
10319
- err = btrfs_find_free_ino(root, &objectid);
9634
+ err = btrfs_find_free_objectid(root, &objectid);
103209635 if (err)
103219636 goto out_unlock;
103229637
....@@ -10338,7 +9653,6 @@
103389653 inode->i_fop = &btrfs_file_operations;
103399654 inode->i_op = &btrfs_file_inode_operations;
103409655 inode->i_mapping->a_ops = &btrfs_aops;
10341
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
103429656
103439657 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
103449658 if (err)
....@@ -10377,7 +9691,6 @@
103779691
103789692 inode->i_op = &btrfs_symlink_inode_operations;
103799693 inode_nohighmem(inode);
10380
- inode->i_mapping->a_ops = &btrfs_symlink_aops;
103819694 inode_set_bytes(inode, name_len);
103829695 btrfs_i_size_write(BTRFS_I(inode), name_len);
103839696 err = btrfs_update_inode(trans, root, inode);
....@@ -10404,6 +9717,65 @@
104049717 return err;
104059718 }
104069719
9720
+static struct btrfs_trans_handle *insert_prealloc_file_extent(
9721
+ struct btrfs_trans_handle *trans_in,
9722
+ struct inode *inode, struct btrfs_key *ins,
9723
+ u64 file_offset)
9724
+{
9725
+ struct btrfs_file_extent_item stack_fi;
9726
+ struct btrfs_replace_extent_info extent_info;
9727
+ struct btrfs_trans_handle *trans = trans_in;
9728
+ struct btrfs_path *path;
9729
+ u64 start = ins->objectid;
9730
+ u64 len = ins->offset;
9731
+ int ret;
9732
+
9733
+ memset(&stack_fi, 0, sizeof(stack_fi));
9734
+
9735
+ btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
9736
+ btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
9737
+ btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
9738
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
9739
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
9740
+ btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
9741
+ /* Encryption and other encoding are reserved and all 0 */
9742
+
9743
+ ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len);
9744
+ if (ret < 0)
9745
+ return ERR_PTR(ret);
9746
+
9747
+ if (trans) {
9748
+ ret = insert_reserved_file_extent(trans, BTRFS_I(inode),
9749
+ file_offset, &stack_fi, ret);
9750
+ if (ret)
9751
+ return ERR_PTR(ret);
9752
+ return trans;
9753
+ }
9754
+
9755
+ extent_info.disk_offset = start;
9756
+ extent_info.disk_len = len;
9757
+ extent_info.data_offset = 0;
9758
+ extent_info.data_len = len;
9759
+ extent_info.file_offset = file_offset;
9760
+ extent_info.extent_buf = (char *)&stack_fi;
9761
+ extent_info.is_new_extent = true;
9762
+ extent_info.qgroup_reserved = ret;
9763
+ extent_info.insertions = 0;
9764
+
9765
+ path = btrfs_alloc_path();
9766
+ if (!path)
9767
+ return ERR_PTR(-ENOMEM);
9768
+
9769
+ ret = btrfs_replace_file_extents(inode, path, file_offset,
9770
+ file_offset + len - 1, &extent_info,
9771
+ &trans);
9772
+ btrfs_free_path(path);
9773
+ if (ret)
9774
+ return ERR_PTR(ret);
9775
+
9776
+ return trans;
9777
+}
9778
+
104079779 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
104089780 u64 start, u64 num_bytes, u64 min_size,
104099781 loff_t actual_len, u64 *alloc_hint,
....@@ -10426,14 +9798,6 @@
104269798 if (trans)
104279799 own_trans = false;
104289800 while (num_bytes > 0) {
10429
- if (own_trans) {
10430
- trans = btrfs_start_transaction(root, 3);
10431
- if (IS_ERR(trans)) {
10432
- ret = PTR_ERR(trans);
10433
- break;
10434
- }
10435
- }
10436
-
104379801 cur_bytes = min_t(u64, num_bytes, SZ_256M);
104389802 cur_bytes = max(cur_bytes, min_size);
104399803 /*
....@@ -10445,11 +9809,8 @@
104459809 cur_bytes = min(cur_bytes, last_alloc);
104469810 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
104479811 min_size, 0, *alloc_hint, &ins, 1, 0);
10448
- if (ret) {
10449
- if (own_trans)
10450
- btrfs_end_transaction(trans);
9812
+ if (ret)
104519813 break;
10452
- }
104539814
104549815 /*
104559816 * We've reserved this space, and thus converted it from
....@@ -10459,20 +9820,20 @@
104599820 * clear_offset by our extent size.
104609821 */
104619822 clear_offset += ins.offset;
10462
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
104639823
104649824 last_alloc = ins.offset;
10465
- ret = insert_reserved_file_extent(trans, inode,
10466
- cur_offset, ins.objectid,
10467
- ins.offset, ins.offset,
10468
- ins.offset, 0, 0, 0,
10469
- BTRFS_FILE_EXTENT_PREALLOC);
10470
- if (ret) {
9825
+ trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
9826
+ /*
9827
+ * Now that we inserted the prealloc extent we can finally
9828
+ * decrement the number of reservations in the block group.
9829
+ * If we did it before, we could race with relocation and have
9830
+ * relocation miss the reserved extent, making it fail later.
9831
+ */
9832
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9833
+ if (IS_ERR(trans)) {
9834
+ ret = PTR_ERR(trans);
104719835 btrfs_free_reserved_extent(fs_info, ins.objectid,
104729836 ins.offset, 0);
10473
- btrfs_abort_transaction(trans, ret);
10474
- if (own_trans)
10475
- btrfs_end_transaction(trans);
104769837 break;
104779838 }
104789839
....@@ -10493,7 +9854,6 @@
104939854 em->block_len = ins.offset;
104949855 em->orig_block_len = ins.offset;
104959856 em->ram_bytes = ins.offset;
10496
- em->bdev = fs_info->fs_devices->latest_bdev;
104979857 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
104989858 em->generation = trans->transid;
104999859
....@@ -10524,7 +9884,7 @@
105249884 else
105259885 i_size = cur_offset;
105269886 i_size_write(inode, i_size);
10527
- btrfs_ordered_update_i_size(inode, i_size, NULL);
9887
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
105289888 }
105299889
105309890 ret = btrfs_update_inode(trans, root, inode);
....@@ -10536,11 +9896,13 @@
105369896 break;
105379897 }
105389898
10539
- if (own_trans)
9899
+ if (own_trans) {
105409900 btrfs_end_transaction(trans);
9901
+ trans = NULL;
9902
+ }
105419903 }
105429904 if (clear_offset < end)
10543
- btrfs_free_reserved_data_space(inode, NULL, clear_offset,
9905
+ btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
105449906 end - clear_offset + 1);
105459907 return ret;
105469908 }
....@@ -10600,7 +9962,7 @@
106009962 if (IS_ERR(trans))
106019963 return PTR_ERR(trans);
106029964
10603
- ret = btrfs_find_free_ino(root, &objectid);
9965
+ ret = btrfs_find_free_objectid(root, &objectid);
106049966 if (ret)
106059967 goto out;
106069968
....@@ -10616,7 +9978,6 @@
106169978 inode->i_op = &btrfs_file_inode_operations;
106179979
106189980 inode->i_mapping->a_ops = &btrfs_aops;
10619
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
106209981
106219982 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
106229983 if (ret)
....@@ -10648,26 +10009,6 @@
1064810009 return ret;
1064910010 }
1065010011
10651
-__attribute__((const))
10652
-static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror)
10653
-{
10654
- return -EAGAIN;
10655
-}
10656
-
10657
-static void btrfs_check_extent_io_range(void *private_data, const char *caller,
10658
- u64 start, u64 end)
10659
-{
10660
- struct inode *inode = private_data;
10661
- u64 isize;
10662
-
10663
- isize = i_size_read(inode);
10664
- if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
10665
- btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
10666
- "%s: ino %llu isize %llu odd range [%llu,%llu]",
10667
- caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
10668
- }
10669
-}
10670
-
1067110012 void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1067210013 {
1067310014 struct inode *inode = tree->private_data;
....@@ -10683,6 +10024,403 @@
1068310024 index++;
1068410025 }
1068510026 }
10027
+
10028
+#ifdef CONFIG_SWAP
10029
+/*
10030
+ * Add an entry indicating a block group or device which is pinned by a
10031
+ * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
10032
+ * negative errno on failure.
10033
+ */
10034
+static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
10035
+ bool is_block_group)
10036
+{
10037
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10038
+ struct btrfs_swapfile_pin *sp, *entry;
10039
+ struct rb_node **p;
10040
+ struct rb_node *parent = NULL;
10041
+
10042
+ sp = kmalloc(sizeof(*sp), GFP_NOFS);
10043
+ if (!sp)
10044
+ return -ENOMEM;
10045
+ sp->ptr = ptr;
10046
+ sp->inode = inode;
10047
+ sp->is_block_group = is_block_group;
10048
+ sp->bg_extent_count = 1;
10049
+
10050
+ spin_lock(&fs_info->swapfile_pins_lock);
10051
+ p = &fs_info->swapfile_pins.rb_node;
10052
+ while (*p) {
10053
+ parent = *p;
10054
+ entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
10055
+ if (sp->ptr < entry->ptr ||
10056
+ (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
10057
+ p = &(*p)->rb_left;
10058
+ } else if (sp->ptr > entry->ptr ||
10059
+ (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
10060
+ p = &(*p)->rb_right;
10061
+ } else {
10062
+ if (is_block_group)
10063
+ entry->bg_extent_count++;
10064
+ spin_unlock(&fs_info->swapfile_pins_lock);
10065
+ kfree(sp);
10066
+ return 1;
10067
+ }
10068
+ }
10069
+ rb_link_node(&sp->node, parent, p);
10070
+ rb_insert_color(&sp->node, &fs_info->swapfile_pins);
10071
+ spin_unlock(&fs_info->swapfile_pins_lock);
10072
+ return 0;
10073
+}
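The pin tree is keyed by (ptr, inode), comparing the pinned pointer first and the owning inode second. A hypothetical lookup helper mirroring that ordering (find_swapfile_pin() is illustrative and assumes the caller holds fs_info->swapfile_pins_lock):

static struct btrfs_swapfile_pin *find_swapfile_pin(struct btrfs_fs_info *fs_info,
						    void *ptr, struct inode *inode)
{
	struct rb_node *node = fs_info->swapfile_pins.rb_node;

	while (node) {
		struct btrfs_swapfile_pin *sp;

		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr || (ptr == sp->ptr && inode < sp->inode))
			node = node->rb_left;
		else if (ptr > sp->ptr || (ptr == sp->ptr && inode > sp->inode))
			node = node->rb_right;
		else
			return sp;
	}
	return NULL;
}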
10074
+
10075
+/* Free all of the entries pinned by this swapfile. */
10076
+static void btrfs_free_swapfile_pins(struct inode *inode)
10077
+{
10078
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10079
+ struct btrfs_swapfile_pin *sp;
10080
+ struct rb_node *node, *next;
10081
+
10082
+ spin_lock(&fs_info->swapfile_pins_lock);
10083
+ node = rb_first(&fs_info->swapfile_pins);
10084
+ while (node) {
10085
+ next = rb_next(node);
10086
+ sp = rb_entry(node, struct btrfs_swapfile_pin, node);
10087
+ if (sp->inode == inode) {
10088
+ rb_erase(&sp->node, &fs_info->swapfile_pins);
10089
+ if (sp->is_block_group) {
10090
+ btrfs_dec_block_group_swap_extents(sp->ptr,
10091
+ sp->bg_extent_count);
10092
+ btrfs_put_block_group(sp->ptr);
10093
+ }
10094
+ kfree(sp);
10095
+ }
10096
+ node = next;
10097
+ }
10098
+ spin_unlock(&fs_info->swapfile_pins_lock);
10099
+}
10100
+
10101
+struct btrfs_swap_info {
10102
+ u64 start;
10103
+ u64 block_start;
10104
+ u64 block_len;
10105
+ u64 lowest_ppage;
10106
+ u64 highest_ppage;
10107
+ unsigned long nr_pages;
10108
+ int nr_extents;
10109
+};
10110
+
10111
+static int btrfs_add_swap_extent(struct swap_info_struct *sis,
10112
+ struct btrfs_swap_info *bsi)
10113
+{
10114
+ unsigned long nr_pages;
10115
+ unsigned long max_pages;
10116
+ u64 first_ppage, first_ppage_reported, next_ppage;
10117
+ int ret;
10118
+
10119
+ /*
10120
+ * Our swapfile may have had its size extended after the swap header was
10121
+ * written. In that case activating the swapfile should not go beyond
10122
+ * the max size set in the swap header.
10123
+ */
10124
+ if (bsi->nr_pages >= sis->max)
10125
+ return 0;
10126
+
10127
+ max_pages = sis->max - bsi->nr_pages;
10128
+ first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
10129
+ next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
10130
+ PAGE_SIZE) >> PAGE_SHIFT;
10131
+
10132
+ if (first_ppage >= next_ppage)
10133
+ return 0;
10134
+ nr_pages = next_ppage - first_ppage;
10135
+ nr_pages = min(nr_pages, max_pages);
10136
+
10137
+ first_ppage_reported = first_ppage;
10138
+ if (bsi->start == 0)
10139
+ first_ppage_reported++;
10140
+ if (bsi->lowest_ppage > first_ppage_reported)
10141
+ bsi->lowest_ppage = first_ppage_reported;
10142
+ if (bsi->highest_ppage < (next_ppage - 1))
10143
+ bsi->highest_ppage = next_ppage - 1;
10144
+
10145
+ ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
10146
+ if (ret < 0)
10147
+ return ret;
10148
+ bsi->nr_extents += ret;
10149
+ bsi->nr_pages += nr_pages;
10150
+ return 0;
10151
+}
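A worked example of the rounding above, assuming 4 KiB pages: for a physical run with bsi->block_start == 13312 and bsi->block_len == 20480, first_ppage = ALIGN(13312, 4096) >> PAGE_SHIFT = 4 and next_ppage = ALIGN_DOWN(13312 + 20480, 4096) >> PAGE_SHIFT = 8, so the run contributes nr_pages = 4 swap pages. The unaligned head and tail bytes are simply not used, and the first_ppage >= next_ppage check catches runs too small to cover a whole page.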
10152
+
10153
+static void btrfs_swap_deactivate(struct file *file)
10154
+{
10155
+ struct inode *inode = file_inode(file);
10156
+
10157
+ btrfs_free_swapfile_pins(inode);
10158
+ atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
10159
+}
10160
+
10161
+static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10162
+ sector_t *span)
10163
+{
10164
+ struct inode *inode = file_inode(file);
10165
+ struct btrfs_root *root = BTRFS_I(inode)->root;
10166
+ struct btrfs_fs_info *fs_info = root->fs_info;
10167
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
10168
+ struct extent_state *cached_state = NULL;
10169
+ struct extent_map *em = NULL;
10170
+ struct btrfs_device *device = NULL;
10171
+ struct btrfs_swap_info bsi = {
10172
+ .lowest_ppage = (sector_t)-1ULL,
10173
+ };
10174
+ int ret = 0;
10175
+ u64 isize;
10176
+ u64 start;
10177
+
10178
+ /*
10179
+ * If the swap file was just created, make sure delalloc is done. If the
10180
+ * file changes again after this, the user is doing something stupid and
10181
+ * we don't really care.
10182
+ */
10183
+ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
10184
+ if (ret)
10185
+ return ret;
10186
+
10187
+ /*
10188
+ * The inode is locked, so these flags won't change after we check them.
10189
+ */
10190
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
10191
+ btrfs_warn(fs_info, "swapfile must not be compressed");
10192
+ return -EINVAL;
10193
+ }
10194
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
10195
+ btrfs_warn(fs_info, "swapfile must not be copy-on-write");
10196
+ return -EINVAL;
10197
+ }
10198
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
10199
+ btrfs_warn(fs_info, "swapfile must not be checksummed");
10200
+ return -EINVAL;
10201
+ }
10202
+
10203
+ /*
10204
+ * Balance or device remove/replace/resize can move stuff around from
10205
+ * under us. The exclop protection makes sure they aren't running/won't
10206
+ * run concurrently while we are mapping the swap extents, and
10207
+ * fs_info->swapfile_pins prevents them from running while the swap
10208
+ * file is active and moving the extents. Note that this also prevents
10209
+ * a concurrent device add which isn't actually necessary, but it's not
10210
+ * really worth the trouble to allow it.
10211
+ */
10212
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
10213
+ btrfs_warn(fs_info,
10214
+ "cannot activate swapfile while exclusive operation is running");
10215
+ return -EBUSY;
10216
+ }
10217
+
10218
+ /*
10219
+ * Prevent snapshot creation while we are activating the swap file.
10220
+ * We do not want to race with snapshot creation. If snapshot creation
10221
+ * already started before we bumped nr_swapfiles from 0 to 1 and
10222
+ * completes before the first write into the swap file after it is
10223
+ * activated, then that write would fall back to COW.
10224
+ */
10225
+ if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
10226
+ btrfs_exclop_finish(fs_info);
10227
+ btrfs_warn(fs_info,
10228
+ "cannot activate swapfile because snapshot creation is in progress");
10229
+ return -EINVAL;
10230
+ }
10231
+ /*
10232
+ * Snapshots can create extents which require COW even if NODATACOW is
10233
+ * set. We use this counter to prevent snapshots. We must increment it
10234
+ * before walking the extents because we don't want a concurrent
10235
+ * snapshot to run after we've already checked the extents.
10236
+ *
10237
+ * It is possible that the subvolume is marked for deletion but not
10238
+ * yet removed. To prevent this race, we check the root status before
10239
+ * activating the swapfile.
10240
+ */
10241
+ spin_lock(&root->root_item_lock);
10242
+ if (btrfs_root_dead(root)) {
10243
+ spin_unlock(&root->root_item_lock);
10244
+
10245
+ btrfs_exclop_finish(fs_info);
10246
+ btrfs_warn(fs_info,
10247
+ "cannot activate swapfile because subvolume %llu is being deleted",
10248
+ root->root_key.objectid);
10249
+ return -EPERM;
10250
+ }
10251
+ atomic_inc(&root->nr_swapfiles);
10252
+ spin_unlock(&root->root_item_lock);
10253
+
10254
+ isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10255
+
10256
+ lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
10257
+ start = 0;
10258
+ while (start < isize) {
10259
+ u64 logical_block_start, physical_block_start;
10260
+ struct btrfs_block_group *bg;
10261
+ u64 len = isize - start;
10262
+
10263
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
10264
+ if (IS_ERR(em)) {
10265
+ ret = PTR_ERR(em);
10266
+ goto out;
10267
+ }
10268
+
10269
+ if (em->block_start == EXTENT_MAP_HOLE) {
10270
+ btrfs_warn(fs_info, "swapfile must not have holes");
10271
+ ret = -EINVAL;
10272
+ goto out;
10273
+ }
10274
+ if (em->block_start == EXTENT_MAP_INLINE) {
10275
+ /*
10276
+ * It's unlikely we'll ever actually find ourselves
10277
+ * here, as a file small enough to fit inline won't be
10278
+ * big enough to store more than the swap header, but in
10279
+ * case something changes in the future, let's catch it
10280
+ * here rather than later.
10281
+ */
10282
+ btrfs_warn(fs_info, "swapfile must not be inline");
10283
+ ret = -EINVAL;
10284
+ goto out;
10285
+ }
10286
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10287
+ btrfs_warn(fs_info, "swapfile must not be compressed");
10288
+ ret = -EINVAL;
10289
+ goto out;
10290
+ }
10291
+
10292
+ logical_block_start = em->block_start + (start - em->start);
10293
+ len = min(len, em->len - (start - em->start));
10294
+ free_extent_map(em);
10295
+ em = NULL;
10296
+
10297
+ ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
10298
+ if (ret < 0) {
10299
+ goto out;
10300
+ } else if (ret) {
10301
+ ret = 0;
10302
+ } else {
10303
+ btrfs_warn(fs_info,
10304
+ "swapfile must not be copy-on-write");
10305
+ ret = -EINVAL;
10306
+ goto out;
10307
+ }
10308
+
10309
+ em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10310
+ if (IS_ERR(em)) {
10311
+ ret = PTR_ERR(em);
10312
+ goto out;
10313
+ }
10314
+
10315
+ if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10316
+ btrfs_warn(fs_info,
10317
+ "swapfile must have single data profile");
10318
+ ret = -EINVAL;
10319
+ goto out;
10320
+ }
10321
+
10322
+ if (device == NULL) {
10323
+ device = em->map_lookup->stripes[0].dev;
10324
+ ret = btrfs_add_swapfile_pin(inode, device, false);
10325
+ if (ret == 1)
10326
+ ret = 0;
10327
+ else if (ret)
10328
+ goto out;
10329
+ } else if (device != em->map_lookup->stripes[0].dev) {
10330
+ btrfs_warn(fs_info, "swapfile must be on one device");
10331
+ ret = -EINVAL;
10332
+ goto out;
10333
+ }
10334
+
10335
+ physical_block_start = (em->map_lookup->stripes[0].physical +
10336
+ (logical_block_start - em->start));
10337
+ len = min(len, em->len - (logical_block_start - em->start));
10338
+ free_extent_map(em);
10339
+ em = NULL;
10340
+
10341
+ bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10342
+ if (!bg) {
10343
+ btrfs_warn(fs_info,
10344
+ "could not find block group containing swapfile");
10345
+ ret = -EINVAL;
10346
+ goto out;
10347
+ }
10348
+
10349
+ if (!btrfs_inc_block_group_swap_extents(bg)) {
10350
+ btrfs_warn(fs_info,
10351
+ "block group for swapfile at %llu is read-only%s",
10352
+ bg->start,
10353
+ atomic_read(&fs_info->scrubs_running) ?
10354
+ " (scrub running)" : "");
10355
+ btrfs_put_block_group(bg);
10356
+ ret = -EINVAL;
10357
+ goto out;
10358
+ }
10359
+
10360
+ ret = btrfs_add_swapfile_pin(inode, bg, true);
10361
+ if (ret) {
10362
+ btrfs_put_block_group(bg);
10363
+ if (ret == 1)
10364
+ ret = 0;
10365
+ else
10366
+ goto out;
10367
+ }
10368
+
10369
+ if (bsi.block_len &&
10370
+ bsi.block_start + bsi.block_len == physical_block_start) {
10371
+ bsi.block_len += len;
10372
+ } else {
10373
+ if (bsi.block_len) {
10374
+ ret = btrfs_add_swap_extent(sis, &bsi);
10375
+ if (ret)
10376
+ goto out;
10377
+ }
10378
+ bsi.start = start;
10379
+ bsi.block_start = physical_block_start;
10380
+ bsi.block_len = len;
10381
+ }
10382
+
10383
+ start += len;
10384
+ }
10385
+
10386
+ if (bsi.block_len)
10387
+ ret = btrfs_add_swap_extent(sis, &bsi);
10388
+
10389
+out:
10390
+ if (!IS_ERR_OR_NULL(em))
10391
+ free_extent_map(em);
10392
+
10393
+ unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
10394
+
10395
+ if (ret)
10396
+ btrfs_swap_deactivate(file);
10397
+
10398
+ btrfs_drew_write_unlock(&root->snapshot_lock);
10399
+
10400
+ btrfs_exclop_finish(fs_info);
10401
+
10402
+ if (ret)
10403
+ return ret;
10404
+
10405
+ if (device)
10406
+ sis->bdev = device->bdev;
10407
+ *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10408
+ sis->max = bsi.nr_pages;
10409
+ sis->pages = bsi.nr_pages - 1;
10410
+ sis->highest_bit = bsi.nr_pages - 1;
10411
+ return bsi.nr_extents;
10412
+}
10413
+#else
10414
+static void btrfs_swap_deactivate(struct file *file)
10415
+{
10416
+}
10417
+
10418
+static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10419
+ sector_t *span)
10420
+{
10421
+ return -EOPNOTSUPP;
10422
+}
10423
+#endif
1068610424
1068710425 static const struct inode_operations btrfs_dir_inode_operations = {
1068810426 .getattr = btrfs_getattr,
....@@ -10703,11 +10441,6 @@
1070310441 .update_time = btrfs_update_time,
1070410442 .tmpfile = btrfs_tmpfile,
1070510443 };
10706
-static const struct inode_operations btrfs_dir_ro_inode_operations = {
10707
- .lookup = btrfs_lookup,
10708
- .permission = btrfs_permission,
10709
- .update_time = btrfs_update_time,
10710
-};
1071110444
1071210445 static const struct file_operations btrfs_dir_file_operations = {
1071310446 .llseek = generic_file_llseek,
....@@ -10720,22 +10453,6 @@
1072010453 #endif
1072110454 .release = btrfs_release_file,
1072210455 .fsync = btrfs_sync_file,
10723
-};
10724
-
10725
-static const struct extent_io_ops btrfs_extent_io_ops = {
10726
- /* mandatory callbacks */
10727
- .submit_bio_hook = btrfs_submit_bio_hook,
10728
- .readpage_end_io_hook = btrfs_readpage_end_io_hook,
10729
- .readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
10730
-
10731
- /* optional callbacks */
10732
- .writepage_end_io_hook = btrfs_writepage_end_io_hook,
10733
- .writepage_start_hook = btrfs_writepage_start_hook,
10734
- .set_bit_hook = btrfs_set_bit_hook,
10735
- .clear_bit_hook = btrfs_clear_bit_hook,
10736
- .merge_extent_hook = btrfs_merge_extent_hook,
10737
- .split_extent_hook = btrfs_split_extent_hook,
10738
- .check_extent_io_range = btrfs_check_extent_io_range,
1073910456 };
1074010457
1074110458 /*
....@@ -10754,19 +10471,17 @@
1075410471 .readpage = btrfs_readpage,
1075510472 .writepage = btrfs_writepage,
1075610473 .writepages = btrfs_writepages,
10757
- .readpages = btrfs_readpages,
10758
- .direct_IO = btrfs_direct_IO,
10474
+ .readahead = btrfs_readahead,
10475
+ .direct_IO = noop_direct_IO,
1075910476 .invalidatepage = btrfs_invalidatepage,
1076010477 .releasepage = btrfs_releasepage,
10478
+#ifdef CONFIG_MIGRATION
10479
+ .migratepage = btrfs_migratepage,
10480
+#endif
1076110481 .set_page_dirty = btrfs_set_page_dirty,
1076210482 .error_remove_page = generic_error_remove_page,
10763
-};
10764
-
10765
-static const struct address_space_operations btrfs_symlink_aops = {
10766
- .readpage = btrfs_readpage,
10767
- .writepage = btrfs_writepage,
10768
- .invalidatepage = btrfs_invalidatepage,
10769
- .releasepage = btrfs_releasepage,
10483
+ .swap_activate = btrfs_swap_activate,
10484
+ .swap_deactivate = btrfs_swap_deactivate,
1077010485 };
1077110486
1077210487 static const struct inode_operations btrfs_file_inode_operations = {