hc
2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/fs/btrfs/inode.c
....@@ -3,9 +3,9 @@
33 * Copyright (C) 2007 Oracle. All rights reserved.
44 */
55
6
+#include <crypto/hash.h>
67 #include <linux/kernel.h>
78 #include <linux/bio.h>
8
-#include <linux/buffer_head.h>
99 #include <linux/file.h>
1010 #include <linux/fs.h>
1111 #include <linux/pagemap.h>
....@@ -27,7 +27,12 @@
2727 #include <linux/uio.h>
2828 #include <linux/magic.h>
2929 #include <linux/iversion.h>
30
+#include <linux/swap.h>
31
+#include <linux/migrate.h>
32
+#include <linux/sched/mm.h>
33
+#include <linux/iomap.h>
3034 #include <asm/unaligned.h>
35
+#include "misc.h"
3136 #include "ctree.h"
3237 #include "disk-io.h"
3338 #include "transaction.h"
....@@ -41,32 +46,31 @@
4146 #include "locking.h"
4247 #include "free-space-cache.h"
4348 #include "inode-map.h"
44
-#include "backref.h"
4549 #include "props.h"
4650 #include "qgroup.h"
47
-#include "dedupe.h"
51
+#include "delalloc-space.h"
52
+#include "block-group.h"
53
+#include "space-info.h"
4854
4955 struct btrfs_iget_args {
50
- struct btrfs_key *location;
56
+ u64 ino;
5157 struct btrfs_root *root;
5258 };
5359
5460 struct btrfs_dio_data {
5561 u64 reserve;
56
- u64 unsubmitted_oe_range_start;
57
- u64 unsubmitted_oe_range_end;
58
- int overwrite;
62
+ loff_t length;
63
+ ssize_t submitted;
64
+ struct extent_changeset *data_reserved;
65
+ bool sync;
5966 };
6067
6168 static const struct inode_operations btrfs_dir_inode_operations;
6269 static const struct inode_operations btrfs_symlink_inode_operations;
63
-static const struct inode_operations btrfs_dir_ro_inode_operations;
6470 static const struct inode_operations btrfs_special_inode_operations;
6571 static const struct inode_operations btrfs_file_inode_operations;
6672 static const struct address_space_operations btrfs_aops;
67
-static const struct address_space_operations btrfs_symlink_aops;
6873 static const struct file_operations btrfs_dir_file_operations;
69
-static const struct extent_io_ops btrfs_extent_io_ops;
7074
7175 static struct kmem_cache *btrfs_inode_cachep;
7276 struct kmem_cache *btrfs_trans_handle_cachep;
....@@ -74,38 +78,26 @@
7478 struct kmem_cache *btrfs_free_space_cachep;
7579 struct kmem_cache *btrfs_free_space_bitmap_cachep;
7680
77
-#define S_SHIFT 12
78
-static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
79
- [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
80
- [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
81
- [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
82
- [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
83
- [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
84
- [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
85
- [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
86
-};
87
-
8881 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
8982 static int btrfs_truncate(struct inode *inode, bool skip_writeback);
9083 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
91
-static noinline int cow_file_range(struct inode *inode,
84
+static noinline int cow_file_range(struct btrfs_inode *inode,
9285 struct page *locked_page,
93
- u64 start, u64 end, u64 delalloc_end,
94
- int *page_started, unsigned long *nr_written,
95
- int unlock, struct btrfs_dedupe_hash *hash);
96
-static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
97
- u64 orig_start, u64 block_start,
86
+ u64 start, u64 end, int *page_started,
87
+ unsigned long *nr_written, int unlock);
88
+static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
89
+ u64 len, u64 orig_start, u64 block_start,
9890 u64 block_len, u64 orig_block_len,
9991 u64 ram_bytes, int compress_type,
10092 int type);
10193
102
-static void __endio_write_update_ordered(struct inode *inode,
94
+static void __endio_write_update_ordered(struct btrfs_inode *inode,
10395 const u64 offset, const u64 bytes,
10496 const bool uptodate);
10597
10698 /*
10799 * Cleanup all submitted ordered extents in specified range to handle errors
108
- * from the fill_dellaloc() callback.
100
+ * from the btrfs_run_delalloc_range() callback.
109101 *
110102 * NOTE: caller must ensure that when an error happens, it can not call
111103 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
....@@ -113,7 +105,7 @@
113105 * to be released, which we want to happen only when finishing the ordered
114106 * extent (btrfs_finish_ordered_io()).
115107 */
116
-static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
108
+static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
117109 struct page *locked_page,
118110 u64 offset, u64 bytes)
119111 {
....@@ -125,7 +117,7 @@
125117 struct page *page;
126118
127119 while (index <= end_index) {
128
- page = find_get_page(inode->i_mapping, index);
120
+ page = find_get_page(inode->vfs_inode.i_mapping, index);
129121 index++;
130122 if (!page)
131123 continue;
....@@ -147,13 +139,6 @@
147139 }
148140
149141 static int btrfs_dirty_inode(struct inode *inode);
150
-
151
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
152
-void btrfs_test_inode_set_ops(struct inode *inode)
153
-{
154
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
155
-}
156
-#endif
157142
158143 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
159144 struct inode *inode, struct inode *dir,
....@@ -187,6 +172,9 @@
187172 int ret;
188173 size_t cur_size = size;
189174 unsigned long offset;
175
+
176
+ ASSERT((compressed_size > 0 && compressed_pages) ||
177
+ (compressed_size == 0 && !compressed_pages));
190178
191179 if (compressed_size && compressed_pages)
192180 cur_size = compressed_size;
....@@ -241,13 +229,22 @@
241229 start >> PAGE_SHIFT);
242230 btrfs_set_file_extent_compression(leaf, ei, 0);
243231 kaddr = kmap_atomic(page);
244
- offset = start & (PAGE_SIZE - 1);
232
+ offset = offset_in_page(start);
245233 write_extent_buffer(leaf, kaddr + offset, ptr, size);
246234 kunmap_atomic(kaddr);
247235 put_page(page);
248236 }
249237 btrfs_mark_buffer_dirty(leaf);
250238 btrfs_release_path(path);
239
+
240
+ /*
241
+ * We align size to sectorsize for inline extents just for simplicity
242
+ * sake.
243
+ */
244
+ size = ALIGN(size, root->fs_info->sectorsize);
245
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
246
+ if (ret)
247
+ goto fail;
251248
252249 /*
253250 * we're an inline extent, so nobody can
....@@ -271,15 +268,15 @@
271268 * does the checks required to make sure the data is small enough
272269 * to fit as an inline extent.
273270 */
274
-static noinline int cow_file_range_inline(struct inode *inode, u64 start,
271
+static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
275272 u64 end, size_t compressed_size,
276273 int compress_type,
277274 struct page **compressed_pages)
278275 {
279
- struct btrfs_root *root = BTRFS_I(inode)->root;
276
+ struct btrfs_root *root = inode->root;
280277 struct btrfs_fs_info *fs_info = root->fs_info;
281278 struct btrfs_trans_handle *trans;
282
- u64 isize = i_size_read(inode);
279
+ u64 isize = i_size_read(&inode->vfs_inode);
283280 u64 actual_end = min(end + 1, isize);
284281 u64 inline_len = actual_end - start;
285282 u64 aligned_end = ALIGN(end, fs_info->sectorsize);
....@@ -311,7 +308,7 @@
311308 btrfs_free_path(path);
312309 return PTR_ERR(trans);
313310 }
314
- trans->block_rsv = &BTRFS_I(inode)->block_rsv;
311
+ trans->block_rsv = &inode->block_rsv;
315312
316313 if (compressed_size && compressed_pages)
317314 extent_item_size = btrfs_file_extent_calc_inline_size(
....@@ -320,9 +317,9 @@
320317 extent_item_size = btrfs_file_extent_calc_inline_size(
321318 inline_len);
322319
323
- ret = __btrfs_drop_extents(trans, root, inode, path,
324
- start, aligned_end, NULL,
325
- 1, 1, extent_item_size, &extent_inserted);
320
+ ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end,
321
+ NULL, 1, 1, extent_item_size,
322
+ &extent_inserted);
326323 if (ret) {
327324 btrfs_abort_transaction(trans, ret);
328325 goto out;
....@@ -331,7 +328,7 @@
331328 if (isize > actual_end)
332329 inline_len = min_t(u64, isize, actual_end);
333330 ret = insert_inline_extent(trans, path, extent_inserted,
334
- root, inode, start,
331
+ root, &inode->vfs_inode, start,
335332 inline_len, compressed_size,
336333 compress_type, compressed_pages);
337334 if (ret && ret != -ENOSPC) {
....@@ -342,8 +339,8 @@
342339 goto out;
343340 }
344341
345
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
346
- btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
342
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
343
+ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
347344 out:
348345 /*
349346 * Don't forget to free the reserved space, as for inlined extent
....@@ -367,18 +364,25 @@
367364 struct list_head list;
368365 };
369366
370
-struct async_cow {
367
+struct async_chunk {
371368 struct inode *inode;
372
- struct btrfs_root *root;
373369 struct page *locked_page;
374370 u64 start;
375371 u64 end;
376372 unsigned int write_flags;
377373 struct list_head extents;
374
+ struct cgroup_subsys_state *blkcg_css;
378375 struct btrfs_work work;
376
+ atomic_t *pending;
379377 };
380378
381
-static noinline int add_async_extent(struct async_cow *cow,
379
+struct async_cow {
380
+ /* Number of chunks in flight; must be first in the structure */
381
+ atomic_t num_chunks;
382
+ struct async_chunk chunks[];
383
+};
384
+
385
+static noinline int add_async_extent(struct async_chunk *cow,
382386 u64 start, u64 ram_size,
383387 u64 compressed_size,
384388 struct page **pages,
....@@ -402,10 +406,10 @@
402406 /*
403407 * Check if the inode has flags compatible with compression
404408 */
405
-static inline bool inode_can_compress(struct inode *inode)
409
+static inline bool inode_can_compress(struct btrfs_inode *inode)
406410 {
407
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
408
- BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
411
+ if (inode->flags & BTRFS_INODE_NODATACOW ||
412
+ inode->flags & BTRFS_INODE_NODATASUM)
409413 return false;
410414 return true;
411415 }
....@@ -414,29 +418,30 @@
414418 * Check if the inode needs to be submitted to compression, based on mount
415419 * options, defragmentation, properties or heuristics.
416420 */
417
-static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
421
+static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
422
+ u64 end)
418423 {
419
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
424
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
420425
421426 if (!inode_can_compress(inode)) {
422427 WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
423428 KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
424
- btrfs_ino(BTRFS_I(inode)));
429
+ btrfs_ino(inode));
425430 return 0;
426431 }
427432 /* force compress */
428433 if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
429434 return 1;
430435 /* defrag ioctl */
431
- if (BTRFS_I(inode)->defrag_compress)
436
+ if (inode->defrag_compress)
432437 return 1;
433438 /* bad compression ratios */
434
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
439
+ if (inode->flags & BTRFS_INODE_NOCOMPRESS)
435440 return 0;
436441 if (btrfs_test_opt(fs_info, COMPRESS) ||
437
- BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
438
- BTRFS_I(inode)->prop_compress)
439
- return btrfs_compress_heuristic(inode, start, end);
442
+ inode->flags & BTRFS_INODE_COMPRESS ||
443
+ inode->prop_compress)
444
+ return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
440445 return 0;
441446 }
442447
....@@ -466,16 +471,15 @@
466471 * are written in the same order that the flusher thread sent them
467472 * down.
468473 */
469
-static noinline void compress_file_range(struct inode *inode,
470
- struct page *locked_page,
471
- u64 start, u64 end,
472
- struct async_cow *async_cow,
473
- int *num_added)
474
+static noinline int compress_file_range(struct async_chunk *async_chunk)
474475 {
476
+ struct inode *inode = async_chunk->inode;
475477 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
476478 u64 blocksize = fs_info->sectorsize;
479
+ u64 start = async_chunk->start;
480
+ u64 end = async_chunk->end;
477481 u64 actual_end;
478
- u64 isize = i_size_read(inode);
482
+ u64 i_size;
479483 int ret = 0;
480484 struct page **pages = NULL;
481485 unsigned long nr_pages;
....@@ -484,12 +488,25 @@
484488 int i;
485489 int will_compress;
486490 int compress_type = fs_info->compress_type;
491
+ int compressed_extents = 0;
487492 int redirty = 0;
488493
489494 inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
490495 SZ_16K);
491496
492
- actual_end = min_t(u64, isize, end + 1);
497
+ /*
498
+ * We need to save i_size before now because it could change in between
499
+ * us evaluating the size and assigning it. This is because we lock and
500
+ * unlock the page in truncate and fallocate, and then modify the i_size
501
+ * later on.
502
+ *
503
+ * The barriers are to emulate READ_ONCE, remove that once i_size_read
504
+ * does that for us.
505
+ */
506
+ barrier();
507
+ i_size = i_size_read(inode);
508
+ barrier();
509
+ actual_end = min_t(u64, i_size, end + 1);
493510 again:
494511 will_compress = 0;
495512 nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
....@@ -530,7 +547,7 @@
530547 * inode has not been flagged as nocompress. This flag can
531548 * change at any time if we discover bad compression ratios.
532549 */
533
- if (inode_need_compress(inode, start, end)) {
550
+ if (inode_need_compress(BTRFS_I(inode), start, end)) {
534551 WARN_ON(pages);
535552 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
536553 if (!pages) {
....@@ -571,8 +588,7 @@
571588 &total_compressed);
572589
573590 if (!ret) {
574
- unsigned long offset = total_compressed &
575
- (PAGE_SIZE - 1);
591
+ unsigned long offset = offset_in_page(total_compressed);
576592 struct page *page = pages[nr_pages - 1];
577593 char *kaddr;
578594
....@@ -595,11 +611,12 @@
595611 /* we didn't compress the entire range, try
596612 * to make an uncompressed inline extent.
597613 */
598
- ret = cow_file_range_inline(inode, start, end, 0,
599
- BTRFS_COMPRESS_NONE, NULL);
614
+ ret = cow_file_range_inline(BTRFS_I(inode), start, end,
615
+ 0, BTRFS_COMPRESS_NONE,
616
+ NULL);
600617 } else {
601618 /* try making a compressed inline extent */
602
- ret = cow_file_range_inline(inode, start, end,
619
+ ret = cow_file_range_inline(BTRFS_I(inode), start, end,
603620 total_compressed,
604621 compress_type, pages);
605622 }
....@@ -621,8 +638,9 @@
621638 * our outstanding extent for clearing delalloc for this
622639 * range.
623640 */
624
- extent_clear_unlock_delalloc(inode, start, end, end,
625
- NULL, clear_flags,
641
+ extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
642
+ NULL,
643
+ clear_flags,
626644 PAGE_UNLOCK |
627645 PAGE_CLEAR_DIRTY |
628646 PAGE_SET_WRITEBACK |
....@@ -641,8 +659,7 @@
641659 }
642660 kfree(pages);
643661 }
644
-
645
- return;
662
+ return 0;
646663 }
647664 }
648665
....@@ -661,14 +678,14 @@
661678 */
662679 total_in = ALIGN(total_in, PAGE_SIZE);
663680 if (total_compressed + blocksize <= total_in) {
664
- *num_added += 1;
681
+ compressed_extents++;
665682
666683 /*
667684 * The async work queues will take care of doing actual
668685 * allocation on disk for these compressed pages, and
669686 * will submit them to the elevator.
670687 */
671
- add_async_extent(async_cow, start, total_in,
688
+ add_async_extent(async_chunk, start, total_in,
672689 total_compressed, pages, nr_pages,
673690 compress_type);
674691
....@@ -678,7 +695,7 @@
678695 cond_resched();
679696 goto again;
680697 }
681
- return;
698
+ return compressed_extents;
682699 }
683700 }
684701 if (pages) {
....@@ -708,18 +725,20 @@
708725 * to our extent and set things up for the async work queue to run
709726 * cow_file_range to do the normal delalloc dance.
710727 */
711
- if (page_offset(locked_page) >= start &&
712
- page_offset(locked_page) <= end)
713
- __set_page_dirty_nobuffers(locked_page);
728
+ if (async_chunk->locked_page &&
729
+ (page_offset(async_chunk->locked_page) >= start &&
730
+ page_offset(async_chunk->locked_page)) <= end) {
731
+ __set_page_dirty_nobuffers(async_chunk->locked_page);
714732 /* unlocked later on in the async handlers */
733
+ }
715734
716735 if (redirty)
717736 extent_range_redirty_for_io(inode, start, end);
718
- add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
737
+ add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
719738 BTRFS_COMPRESS_NONE);
720
- *num_added += 1;
739
+ compressed_extents++;
721740
722
- return;
741
+ return compressed_extents;
723742 }
724743
725744 static void free_async_extent_pages(struct async_extent *async_extent)
....@@ -744,45 +763,38 @@
744763 * queued. We walk all the async extents created by compress_file_range
745764 * and send them down to the disk.
746765 */
747
-static noinline void submit_compressed_extents(struct inode *inode,
748
- struct async_cow *async_cow)
766
+static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
749767 {
750
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
768
+ struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
769
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
751770 struct async_extent *async_extent;
752771 u64 alloc_hint = 0;
753772 struct btrfs_key ins;
754773 struct extent_map *em;
755
- struct btrfs_root *root = BTRFS_I(inode)->root;
756
- struct extent_io_tree *io_tree;
774
+ struct btrfs_root *root = inode->root;
775
+ struct extent_io_tree *io_tree = &inode->io_tree;
757776 int ret = 0;
758777
759778 again:
760
- while (!list_empty(&async_cow->extents)) {
761
- async_extent = list_entry(async_cow->extents.next,
779
+ while (!list_empty(&async_chunk->extents)) {
780
+ async_extent = list_entry(async_chunk->extents.next,
762781 struct async_extent, list);
763782 list_del(&async_extent->list);
764783
765
- io_tree = &BTRFS_I(inode)->io_tree;
766
-
767784 retry:
785
+ lock_extent(io_tree, async_extent->start,
786
+ async_extent->start + async_extent->ram_size - 1);
768787 /* did the compression code fall back to uncompressed IO? */
769788 if (!async_extent->pages) {
770789 int page_started = 0;
771790 unsigned long nr_written = 0;
772791
773
- lock_extent(io_tree, async_extent->start,
774
- async_extent->start +
775
- async_extent->ram_size - 1);
776
-
777792 /* allocate blocks */
778
- ret = cow_file_range(inode, async_cow->locked_page,
793
+ ret = cow_file_range(inode, async_chunk->locked_page,
779794 async_extent->start,
780795 async_extent->start +
781796 async_extent->ram_size - 1,
782
- async_extent->start +
783
- async_extent->ram_size - 1,
784
- &page_started, &nr_written, 0,
785
- NULL);
797
+ &page_started, &nr_written, 0);
786798
787799 /* JDM XXX */
788800
....@@ -793,20 +805,17 @@
793805 * all those pages down to the drive.
794806 */
795807 if (!page_started && !ret)
796
- extent_write_locked_range(inode,
808
+ extent_write_locked_range(&inode->vfs_inode,
797809 async_extent->start,
798810 async_extent->start +
799811 async_extent->ram_size - 1,
800812 WB_SYNC_ALL);
801
- else if (ret)
802
- unlock_page(async_cow->locked_page);
813
+ else if (ret && async_chunk->locked_page)
814
+ unlock_page(async_chunk->locked_page);
803815 kfree(async_extent);
804816 cond_resched();
805817 continue;
806818 }
807
-
808
- lock_extent(io_tree, async_extent->start,
809
- async_extent->start + async_extent->ram_size - 1);
810819
811820 ret = btrfs_reserve_extent(root, async_extent->ram_size,
812821 async_extent->compressed_size,
....@@ -826,7 +835,7 @@
826835 * will not submit these pages down to lower
827836 * layers.
828837 */
829
- extent_range_redirty_for_io(inode,
838
+ extent_range_redirty_for_io(&inode->vfs_inode,
830839 async_extent->start,
831840 async_extent->start +
832841 async_extent->ram_size - 1);
....@@ -861,8 +870,7 @@
861870 BTRFS_ORDERED_COMPRESSED,
862871 async_extent->compress_type);
863872 if (ret) {
864
- btrfs_drop_extent_cache(BTRFS_I(inode),
865
- async_extent->start,
873
+ btrfs_drop_extent_cache(inode, async_extent->start,
866874 async_extent->start +
867875 async_extent->ram_size - 1, 0);
868876 goto out_free_reserve;
....@@ -875,29 +883,25 @@
875883 extent_clear_unlock_delalloc(inode, async_extent->start,
876884 async_extent->start +
877885 async_extent->ram_size - 1,
878
- async_extent->start +
879
- async_extent->ram_size - 1,
880886 NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
881887 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
882888 PAGE_SET_WRITEBACK);
883
- if (btrfs_submit_compressed_write(inode,
884
- async_extent->start,
889
+ if (btrfs_submit_compressed_write(inode, async_extent->start,
885890 async_extent->ram_size,
886891 ins.objectid,
887892 ins.offset, async_extent->pages,
888893 async_extent->nr_pages,
889
- async_cow->write_flags)) {
890
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
894
+ async_chunk->write_flags,
895
+ async_chunk->blkcg_css)) {
891896 struct page *p = async_extent->pages[0];
892897 const u64 start = async_extent->start;
893898 const u64 end = start + async_extent->ram_size - 1;
894899
895
- p->mapping = inode->i_mapping;
896
- tree->ops->writepage_end_io_hook(p, start, end,
897
- NULL, 0);
900
+ p->mapping = inode->vfs_inode.i_mapping;
901
+ btrfs_writepage_endio_finish_ordered(p, start, end, 0);
902
+
898903 p->mapping = NULL;
899
- extent_clear_unlock_delalloc(inode, start, end, end,
900
- NULL, 0,
904
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
901905 PAGE_END_WRITEBACK |
902906 PAGE_SET_ERROR);
903907 free_async_extent_pages(async_extent);
....@@ -914,8 +918,6 @@
914918 extent_clear_unlock_delalloc(inode, async_extent->start,
915919 async_extent->start +
916920 async_extent->ram_size - 1,
917
- async_extent->start +
918
- async_extent->ram_size - 1,
919921 NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
920922 EXTENT_DELALLOC_NEW |
921923 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
....@@ -927,10 +929,10 @@
927929 goto again;
928930 }
929931
930
-static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
932
+static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
931933 u64 num_bytes)
932934 {
933
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
935
+ struct extent_map_tree *em_tree = &inode->extent_tree;
934936 struct extent_map *em;
935937 u64 alloc_hint = 0;
936938
....@@ -972,14 +974,13 @@
972974 * required to start IO on it. It may be clean and already done with
973975 * IO when we return.
974976 */
975
-static noinline int cow_file_range(struct inode *inode,
977
+static noinline int cow_file_range(struct btrfs_inode *inode,
976978 struct page *locked_page,
977
- u64 start, u64 end, u64 delalloc_end,
978
- int *page_started, unsigned long *nr_written,
979
- int unlock, struct btrfs_dedupe_hash *hash)
979
+ u64 start, u64 end, int *page_started,
980
+ unsigned long *nr_written, int unlock)
980981 {
981
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
982
- struct btrfs_root *root = BTRFS_I(inode)->root;
982
+ struct btrfs_root *root = inode->root;
983
+ struct btrfs_fs_info *fs_info = root->fs_info;
983984 u64 alloc_hint = 0;
984985 u64 num_bytes;
985986 unsigned long ram_size;
....@@ -993,8 +994,7 @@
993994 bool extent_reserved = false;
994995 int ret = 0;
995996
996
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
997
- WARN_ON_ONCE(1);
997
+ if (btrfs_is_free_space_inode(inode)) {
998998 ret = -EINVAL;
999999 goto out_unlock;
10001000 }
....@@ -1003,7 +1003,7 @@
10031003 num_bytes = max(blocksize, num_bytes);
10041004 ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
10051005
1006
- inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
1006
+ inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
10071007
10081008 if (start == 0) {
10091009 /* lets try to make an inline extent */
....@@ -1016,8 +1016,7 @@
10161016 * our outstanding extent for clearing delalloc for this
10171017 * range.
10181018 */
1019
- extent_clear_unlock_delalloc(inode, start, end,
1020
- delalloc_end, NULL,
1019
+ extent_clear_unlock_delalloc(inode, start, end, NULL,
10211020 EXTENT_LOCKED | EXTENT_DELALLOC |
10221021 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
10231022 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
....@@ -1033,8 +1032,7 @@
10331032 }
10341033
10351034 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
1036
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
1037
- start + num_bytes - 1, 0);
1035
+ btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
10381036
10391037 /*
10401038 * Relocation relies on the relocated extents to have exactly the same
....@@ -1098,7 +1096,7 @@
10981096 * skip current ordered extent.
10991097 */
11001098 if (ret)
1101
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
1099
+ btrfs_drop_extent_cache(inode, start,
11021100 start + ram_size - 1, 0);
11031101 }
11041102
....@@ -1114,9 +1112,8 @@
11141112 page_ops = unlock ? PAGE_UNLOCK : 0;
11151113 page_ops |= PAGE_SET_PRIVATE2;
11161114
1117
- extent_clear_unlock_delalloc(inode, start,
1118
- start + ram_size - 1,
1119
- delalloc_end, locked_page,
1115
+ extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
1116
+ locked_page,
11201117 EXTENT_LOCKED | EXTENT_DELALLOC,
11211118 page_ops);
11221119 if (num_bytes < cur_alloc_size)
....@@ -1139,7 +1136,7 @@
11391136 return ret;
11401137
11411138 out_drop_extent_cache:
1142
- btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
1139
+ btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
11431140 out_reserve:
11441141 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
11451142 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
....@@ -1161,7 +1158,6 @@
11611158 if (extent_reserved) {
11621159 extent_clear_unlock_delalloc(inode, start,
11631160 start + cur_alloc_size - 1,
1164
- start + cur_alloc_size - 1,
11651161 locked_page,
11661162 clear_bits,
11671163 page_ops);
....@@ -1169,8 +1165,7 @@
11691165 if (start >= end)
11701166 goto out;
11711167 }
1172
- extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
1173
- locked_page,
1168
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
11741169 clear_bits | EXTENT_CLEAR_DATA_RESV,
11751170 page_ops);
11761171 goto out;
....@@ -1181,16 +1176,15 @@
11811176 */
11821177 static noinline void async_cow_start(struct btrfs_work *work)
11831178 {
1184
- struct async_cow *async_cow;
1185
- int num_added = 0;
1186
- async_cow = container_of(work, struct async_cow, work);
1179
+ struct async_chunk *async_chunk;
1180
+ int compressed_extents;
11871181
1188
- compress_file_range(async_cow->inode, async_cow->locked_page,
1189
- async_cow->start, async_cow->end, async_cow,
1190
- &num_added);
1191
- if (num_added == 0) {
1192
- btrfs_add_delayed_iput(async_cow->inode);
1193
- async_cow->inode = NULL;
1182
+ async_chunk = container_of(work, struct async_chunk, work);
1183
+
1184
+ compressed_extents = compress_file_range(async_chunk);
1185
+ if (compressed_extents == 0) {
1186
+ btrfs_add_delayed_iput(async_chunk->inode);
1187
+ async_chunk->inode = NULL;
11941188 }
11951189 }
11961190
....@@ -1199,77 +1193,153 @@
11991193 */
12001194 static noinline void async_cow_submit(struct btrfs_work *work)
12011195 {
1202
- struct btrfs_fs_info *fs_info;
1203
- struct async_cow *async_cow;
1204
- struct btrfs_root *root;
1196
+ struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1197
+ work);
1198
+ struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
12051199 unsigned long nr_pages;
12061200
1207
- async_cow = container_of(work, struct async_cow, work);
1208
-
1209
- root = async_cow->root;
1210
- fs_info = root->fs_info;
1211
- nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
1201
+ nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
12121202 PAGE_SHIFT;
1203
+
1204
+ /*
1205
+ * ->inode could be NULL if async_chunk_start has failed to compress,
1206
+ * in which case we don't have anything to submit, yet we need to
1207
+ * always adjust ->async_delalloc_pages as its paired with the init
1208
+ * happening in cow_file_range_async
1209
+ */
1210
+ if (async_chunk->inode)
1211
+ submit_compressed_extents(async_chunk);
12131212
12141213 /* atomic_sub_return implies a barrier */
12151214 if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
12161215 5 * SZ_1M)
12171216 cond_wake_up_nomb(&fs_info->async_submit_wait);
1218
-
1219
- if (async_cow->inode)
1220
- submit_compressed_extents(async_cow->inode, async_cow);
12211217 }
12221218
12231219 static noinline void async_cow_free(struct btrfs_work *work)
12241220 {
1225
- struct async_cow *async_cow;
1226
- async_cow = container_of(work, struct async_cow, work);
1227
- if (async_cow->inode)
1228
- btrfs_add_delayed_iput(async_cow->inode);
1229
- kfree(async_cow);
1221
+ struct async_chunk *async_chunk;
1222
+
1223
+ async_chunk = container_of(work, struct async_chunk, work);
1224
+ if (async_chunk->inode)
1225
+ btrfs_add_delayed_iput(async_chunk->inode);
1226
+ if (async_chunk->blkcg_css)
1227
+ css_put(async_chunk->blkcg_css);
1228
+ /*
1229
+ * Since the pointer to 'pending' is at the beginning of the array of
1230
+ * async_chunk's, freeing it ensures the whole array has been freed.
1231
+ */
1232
+ if (atomic_dec_and_test(async_chunk->pending))
1233
+ kvfree(async_chunk->pending);
12301234 }
12311235
1232
-static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1236
+static int cow_file_range_async(struct btrfs_inode *inode,
1237
+ struct writeback_control *wbc,
1238
+ struct page *locked_page,
12331239 u64 start, u64 end, int *page_started,
1234
- unsigned long *nr_written,
1235
- unsigned int write_flags)
1240
+ unsigned long *nr_written)
12361241 {
1237
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1238
- struct async_cow *async_cow;
1239
- struct btrfs_root *root = BTRFS_I(inode)->root;
1242
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
1243
+ struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1244
+ struct async_cow *ctx;
1245
+ struct async_chunk *async_chunk;
12401246 unsigned long nr_pages;
12411247 u64 cur_end;
1248
+ u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1249
+ int i;
1250
+ bool should_compress;
1251
+ unsigned nofs_flag;
1252
+ const unsigned int write_flags = wbc_to_write_flags(wbc);
12421253
1243
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1244
- 1, 0, NULL);
1245
- while (start < end) {
1246
- async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1247
- BUG_ON(!async_cow); /* -ENOMEM */
1248
- async_cow->inode = igrab(inode);
1249
- async_cow->root = root;
1250
- async_cow->locked_page = locked_page;
1251
- async_cow->start = start;
1252
- async_cow->write_flags = write_flags;
1254
+ unlock_extent(&inode->io_tree, start, end);
12531255
1254
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1255
- !btrfs_test_opt(fs_info, FORCE_COMPRESS))
1256
- cur_end = end;
1257
- else
1256
+ if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
1257
+ !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
1258
+ num_chunks = 1;
1259
+ should_compress = false;
1260
+ } else {
1261
+ should_compress = true;
1262
+ }
1263
+
1264
+ nofs_flag = memalloc_nofs_save();
1265
+ ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1266
+ memalloc_nofs_restore(nofs_flag);
1267
+
1268
+ if (!ctx) {
1269
+ unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
1270
+ EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1271
+ EXTENT_DO_ACCOUNTING;
1272
+ unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
1273
+ PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
1274
+ PAGE_SET_ERROR;
1275
+
1276
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
1277
+ clear_bits, page_ops);
1278
+ return -ENOMEM;
1279
+ }
1280
+
1281
+ async_chunk = ctx->chunks;
1282
+ atomic_set(&ctx->num_chunks, num_chunks);
1283
+
1284
+ for (i = 0; i < num_chunks; i++) {
1285
+ if (should_compress)
12581286 cur_end = min(end, start + SZ_512K - 1);
1287
+ else
1288
+ cur_end = end;
12591289
1260
- async_cow->end = cur_end;
1261
- INIT_LIST_HEAD(&async_cow->extents);
1290
+ /*
1291
+ * igrab is called higher up in the call chain, take only the
1292
+ * lightweight reference for the callback lifetime
1293
+ */
1294
+ ihold(&inode->vfs_inode);
1295
+ async_chunk[i].pending = &ctx->num_chunks;
1296
+ async_chunk[i].inode = &inode->vfs_inode;
1297
+ async_chunk[i].start = start;
1298
+ async_chunk[i].end = cur_end;
1299
+ async_chunk[i].write_flags = write_flags;
1300
+ INIT_LIST_HEAD(&async_chunk[i].extents);
12621301
1263
- btrfs_init_work(&async_cow->work,
1264
- btrfs_delalloc_helper,
1265
- async_cow_start, async_cow_submit,
1266
- async_cow_free);
1302
+ /*
1303
+ * The locked_page comes all the way from writepage and its
1304
+ * the original page we were actually given. As we spread
1305
+ * this large delalloc region across multiple async_chunk
1306
+ * structs, only the first struct needs a pointer to locked_page
1307
+ *
1308
+ * This way we don't need racey decisions about who is supposed
1309
+ * to unlock it.
1310
+ */
1311
+ if (locked_page) {
1312
+ /*
1313
+ * Depending on the compressibility, the pages might or
1314
+ * might not go through async. We want all of them to
1315
+ * be accounted against wbc once. Let's do it here
1316
+ * before the paths diverge. wbc accounting is used
1317
+ * only for foreign writeback detection and doesn't
1318
+ * need full accuracy. Just account the whole thing
1319
+ * against the first page.
1320
+ */
1321
+ wbc_account_cgroup_owner(wbc, locked_page,
1322
+ cur_end - start);
1323
+ async_chunk[i].locked_page = locked_page;
1324
+ locked_page = NULL;
1325
+ } else {
1326
+ async_chunk[i].locked_page = NULL;
1327
+ }
12671328
1268
- nr_pages = (cur_end - start + PAGE_SIZE) >>
1269
- PAGE_SHIFT;
1329
+ if (blkcg_css != blkcg_root_css) {
1330
+ css_get(blkcg_css);
1331
+ async_chunk[i].blkcg_css = blkcg_css;
1332
+ } else {
1333
+ async_chunk[i].blkcg_css = NULL;
1334
+ }
1335
+
1336
+ btrfs_init_work(&async_chunk[i].work, async_cow_start,
1337
+ async_cow_submit, async_cow_free);
1338
+
1339
+ nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
12701340 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
12711341
1272
- btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);
1342
+ btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
12731343
12741344 *nr_written += nr_pages;
12751345 start = cur_end + 1;
....@@ -1300,6 +1370,73 @@
13001370 return 1;
13011371 }
13021372
1373
+static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
1374
+ const u64 start, const u64 end,
1375
+ int *page_started, unsigned long *nr_written)
1376
+{
1377
+ const bool is_space_ino = btrfs_is_free_space_inode(inode);
1378
+ const bool is_reloc_ino = (inode->root->root_key.objectid ==
1379
+ BTRFS_DATA_RELOC_TREE_OBJECTID);
1380
+ const u64 range_bytes = end + 1 - start;
1381
+ struct extent_io_tree *io_tree = &inode->io_tree;
1382
+ u64 range_start = start;
1383
+ u64 count;
1384
+
1385
+ /*
1386
+ * If EXTENT_NORESERVE is set it means that when the buffered write was
1387
+ * made we had not enough available data space and therefore we did not
1388
+ * reserve data space for it, since we though we could do NOCOW for the
1389
+ * respective file range (either there is prealloc extent or the inode
1390
+ * has the NOCOW bit set).
1391
+ *
1392
+ * However when we need to fallback to COW mode (because for example the
1393
+ * block group for the corresponding extent was turned to RO mode by a
1394
+ * scrub or relocation) we need to do the following:
1395
+ *
1396
+ * 1) We increment the bytes_may_use counter of the data space info.
1397
+ * If COW succeeds, it allocates a new data extent and after doing
1398
+ * that it decrements the space info's bytes_may_use counter and
1399
+ * increments its bytes_reserved counter by the same amount (we do
1400
+ * this at btrfs_add_reserved_bytes()). So we need to increment the
1401
+ * bytes_may_use counter to compensate (when space is reserved at
1402
+ * buffered write time, the bytes_may_use counter is incremented);
1403
+ *
1404
+ * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1405
+ * that if the COW path fails for any reason, it decrements (through
1406
+ * extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1407
+ * data space info, which we incremented in the step above.
1408
+ *
1409
+ * If we need to fallback to cow and the inode corresponds to a free
1410
+ * space cache inode or an inode of the data relocation tree, we must
1411
+ * also increment bytes_may_use of the data space_info for the same
1412
+ * reason. Space caches and relocated data extents always get a prealloc
1413
+ * extent for them, however scrub or balance may have set the block
1414
+ * group that contains that extent to RO mode and therefore force COW
1415
+ * when starting writeback.
1416
+ */
1417
+ count = count_range_bits(io_tree, &range_start, end, range_bytes,
1418
+ EXTENT_NORESERVE, 0);
1419
+ if (count > 0 || is_space_ino || is_reloc_ino) {
1420
+ u64 bytes = count;
1421
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
1422
+ struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1423
+
1424
+ if (is_space_ino || is_reloc_ino)
1425
+ bytes = range_bytes;
1426
+
1427
+ spin_lock(&sinfo->lock);
1428
+ btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
1429
+ spin_unlock(&sinfo->lock);
1430
+
1431
+ if (count > 0)
1432
+ clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1433
+ 0, 0, NULL);
1434
+ }
1435
+
1436
+ return cow_file_range(inode, locked_page, start, end, page_started,
1437
+ nr_written, 1);
1438
+}
1439
+
13031440 /*
13041441 * when nowcow writeback call back. This checks for snapshots or COW copies
13051442 * of the extents that exist in the file, and COWs the file as required.
....@@ -1307,38 +1444,27 @@
13071444 * If no cow copies or snapshots exist, we write directly to the existing
13081445 * blocks on disk
13091446 */
1310
-static noinline int run_delalloc_nocow(struct inode *inode,
1447
+static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
13111448 struct page *locked_page,
1312
- u64 start, u64 end, int *page_started, int force,
1313
- unsigned long *nr_written)
1449
+ const u64 start, const u64 end,
1450
+ int *page_started, int force,
1451
+ unsigned long *nr_written)
13141452 {
1315
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1316
- struct btrfs_root *root = BTRFS_I(inode)->root;
1317
- struct extent_buffer *leaf;
1453
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
1454
+ struct btrfs_root *root = inode->root;
13181455 struct btrfs_path *path;
1319
- struct btrfs_file_extent_item *fi;
1320
- struct btrfs_key found_key;
1321
- struct extent_map *em;
1322
- u64 cow_start;
1323
- u64 cur_offset;
1324
- u64 extent_end;
1325
- u64 extent_offset;
1326
- u64 disk_bytenr;
1327
- u64 num_bytes;
1328
- u64 disk_num_bytes;
1329
- u64 ram_bytes;
1330
- int extent_type;
1456
+ u64 cow_start = (u64)-1;
1457
+ u64 cur_offset = start;
13311458 int ret;
1332
- int type;
1333
- int nocow;
1334
- int check_prev = 1;
1335
- bool nolock;
1336
- u64 ino = btrfs_ino(BTRFS_I(inode));
1459
+ bool check_prev = true;
1460
+ const bool freespace_inode = btrfs_is_free_space_inode(inode);
1461
+ u64 ino = btrfs_ino(inode);
1462
+ bool nocow = false;
1463
+ u64 disk_bytenr = 0;
13371464
13381465 path = btrfs_alloc_path();
13391466 if (!path) {
1340
- extent_clear_unlock_delalloc(inode, start, end, end,
1341
- locked_page,
1467
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
13421468 EXTENT_LOCKED | EXTENT_DELALLOC |
13431469 EXTENT_DO_ACCOUNTING |
13441470 EXTENT_DEFRAG, PAGE_UNLOCK |
....@@ -1348,15 +1474,29 @@
13481474 return -ENOMEM;
13491475 }
13501476
1351
- nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1352
-
1353
- cow_start = (u64)-1;
1354
- cur_offset = start;
13551477 while (1) {
1478
+ struct btrfs_key found_key;
1479
+ struct btrfs_file_extent_item *fi;
1480
+ struct extent_buffer *leaf;
1481
+ u64 extent_end;
1482
+ u64 extent_offset;
1483
+ u64 num_bytes = 0;
1484
+ u64 disk_num_bytes;
1485
+ u64 ram_bytes;
1486
+ int extent_type;
1487
+
1488
+ nocow = false;
1489
+
13561490 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
13571491 cur_offset, 0);
13581492 if (ret < 0)
13591493 goto error;
1494
+
1495
+ /*
1496
+ * If there is no extent for our range when doing the initial
1497
+ * search, then go back to the previous slot as it will be the
1498
+ * one containing the search offset
1499
+ */
13601500 if (ret > 0 && path->slots[0] > 0 && check_prev) {
13611501 leaf = path->nodes[0];
13621502 btrfs_item_key_to_cpu(leaf, &found_key,
....@@ -1365,8 +1505,9 @@
13651505 found_key.type == BTRFS_EXTENT_DATA_KEY)
13661506 path->slots[0]--;
13671507 }
1368
- check_prev = 0;
1508
+ check_prev = false;
13691509 next_slot:
1510
+ /* Go to next leaf if we have exhausted the current one */
13701511 leaf = path->nodes[0];
13711512 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
13721513 ret = btrfs_next_leaf(root, path);
....@@ -1380,28 +1521,40 @@
13801521 leaf = path->nodes[0];
13811522 }
13821523
1383
- nocow = 0;
1384
- disk_bytenr = 0;
1385
- num_bytes = 0;
13861524 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
13871525
1526
+ /* Didn't find anything for our INO */
13881527 if (found_key.objectid > ino)
13891528 break;
1529
+ /*
1530
+ * Keep searching until we find an EXTENT_ITEM or there are no
1531
+ * more extents for this inode
1532
+ */
13901533 if (WARN_ON_ONCE(found_key.objectid < ino) ||
13911534 found_key.type < BTRFS_EXTENT_DATA_KEY) {
13921535 path->slots[0]++;
13931536 goto next_slot;
13941537 }
1538
+
1539
+ /* Found key is not EXTENT_DATA_KEY or starts after req range */
13951540 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
13961541 found_key.offset > end)
13971542 break;
13981543
1544
+ /*
1545
+ * If the found extent starts after requested offset, then
1546
+ * adjust extent_end to be right before this extent begins
1547
+ */
13991548 if (found_key.offset > cur_offset) {
14001549 extent_end = found_key.offset;
14011550 extent_type = 0;
14021551 goto out_check;
14031552 }
14041553
1554
+ /*
1555
+ * Found extent which begins before our range and potentially
1556
+ * intersect it
1557
+ */
14051558 fi = btrfs_item_ptr(leaf, path->slots[0],
14061559 struct btrfs_file_extent_item);
14071560 extent_type = btrfs_file_extent_type(leaf, fi);
....@@ -1415,31 +1568,41 @@
14151568 btrfs_file_extent_num_bytes(leaf, fi);
14161569 disk_num_bytes =
14171570 btrfs_file_extent_disk_num_bytes(leaf, fi);
1418
- if (extent_end <= start) {
1571
+ /*
1572
+ * If the extent we got ends before our current offset,
1573
+ * skip to the next extent.
1574
+ */
1575
+ if (extent_end <= cur_offset) {
14191576 path->slots[0]++;
14201577 goto next_slot;
14211578 }
1579
+ /* Skip holes */
14221580 if (disk_bytenr == 0)
14231581 goto out_check;
1582
+ /* Skip compressed/encrypted/encoded extents */
14241583 if (btrfs_file_extent_compression(leaf, fi) ||
14251584 btrfs_file_extent_encryption(leaf, fi) ||
14261585 btrfs_file_extent_other_encoding(leaf, fi))
14271586 goto out_check;
14281587 /*
1429
- * Do the same check as in btrfs_cross_ref_exist but
1430
- * without the unnecessary search.
1588
+ * If extent is created before the last volume's snapshot
1589
+ * this implies the extent is shared, hence we can't do
1590
+ * nocow. This is the same check as in
1591
+ * btrfs_cross_ref_exist but without calling
1592
+ * btrfs_search_slot.
14311593 */
1432
- if (!nolock &&
1594
+ if (!freespace_inode &&
14331595 btrfs_file_extent_generation(leaf, fi) <=
14341596 btrfs_root_last_snapshot(&root->root_item))
14351597 goto out_check;
14361598 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
14371599 goto out_check;
1600
+ /* If extent is RO, we must COW it */
14381601 if (btrfs_extent_readonly(fs_info, disk_bytenr))
14391602 goto out_check;
14401603 ret = btrfs_cross_ref_exist(root, ino,
14411604 found_key.offset -
1442
- extent_offset, disk_bytenr);
1605
+ extent_offset, disk_bytenr, false);
14431606 if (ret) {
14441607 /*
14451608 * ret could be -EIO if the above fails to read
....@@ -1451,17 +1614,17 @@
14511614 goto error;
14521615 }
14531616
1454
- WARN_ON_ONCE(nolock);
1617
+ WARN_ON_ONCE(freespace_inode);
14551618 goto out_check;
14561619 }
14571620 disk_bytenr += extent_offset;
14581621 disk_bytenr += cur_offset - found_key.offset;
14591622 num_bytes = min(end + 1, extent_end) - cur_offset;
14601623 /*
1461
- * if there are pending snapshots for this root,
1462
- * we fall into common COW way.
1624
+ * If there are pending snapshots for this root, we
1625
+ * fall into common COW way
14631626 */
1464
- if (!nolock && atomic_read(&root->snapshot_force_cow))
1627
+ if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
14651628 goto out_check;
14661629 /*
14671630 * force cow if csum exists in the range.
....@@ -1480,27 +1643,29 @@
14801643 cur_offset = cow_start;
14811644 goto error;
14821645 }
1483
- WARN_ON_ONCE(nolock);
1646
+ WARN_ON_ONCE(freespace_inode);
14841647 goto out_check;
14851648 }
14861649 if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
14871650 goto out_check;
1488
- nocow = 1;
1651
+ nocow = true;
14891652 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1490
- extent_end = found_key.offset +
1491
- btrfs_file_extent_ram_bytes(leaf, fi);
1492
- extent_end = ALIGN(extent_end,
1493
- fs_info->sectorsize);
1653
+ extent_end = found_key.offset + ram_bytes;
1654
+ extent_end = ALIGN(extent_end, fs_info->sectorsize);
1655
+ /* Skip extents outside of our requested range */
1656
+ if (extent_end <= start) {
1657
+ path->slots[0]++;
1658
+ goto next_slot;
1659
+ }
14941660 } else {
1495
- BUG_ON(1);
1661
+ /* If this triggers then we have a memory corruption */
1662
+ BUG();
14961663 }
14971664 out_check:
1498
- if (extent_end <= start) {
1499
- path->slots[0]++;
1500
- if (nocow)
1501
- btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1502
- goto next_slot;
1503
- }
1665
+ /*
1666
+ * If nocow is false then record the beginning of the range
1667
+ * that needs to be COWed
1668
+ */
15041669 if (!nocow) {
15051670 if (cow_start == (u64)-1)
15061671 cow_start = cur_offset;
....@@ -1512,22 +1677,24 @@
15121677 }
15131678
15141679 btrfs_release_path(path);
1680
+
1681
+ /*
1682
+ * COW range from cow_start to found_key.offset - 1. As the key
1683
+ * will contain the beginning of the first extent that can be
1684
+ * NOCOW, following one which needs to be COW'ed
1685
+ */
15151686 if (cow_start != (u64)-1) {
1516
- ret = cow_file_range(inode, locked_page,
1517
- cow_start, found_key.offset - 1,
1518
- end, page_started, nr_written, 1,
1519
- NULL);
1520
- if (ret) {
1521
- if (nocow)
1522
- btrfs_dec_nocow_writers(fs_info,
1523
- disk_bytenr);
1687
+ ret = fallback_to_cow(inode, locked_page,
1688
+ cow_start, found_key.offset - 1,
1689
+ page_started, nr_written);
1690
+ if (ret)
15241691 goto error;
1525
- }
15261692 cow_start = (u64)-1;
15271693 }
15281694
15291695 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
15301696 u64 orig_start = found_key.offset - extent_offset;
1697
+ struct extent_map *em;
15311698
15321699 em = create_io_em(inode, cur_offset, num_bytes,
15331700 orig_start,
....@@ -1537,26 +1704,32 @@
15371704 ram_bytes, BTRFS_COMPRESS_NONE,
15381705 BTRFS_ORDERED_PREALLOC);
15391706 if (IS_ERR(em)) {
1540
- if (nocow)
1541
- btrfs_dec_nocow_writers(fs_info,
1542
- disk_bytenr);
15431707 ret = PTR_ERR(em);
15441708 goto error;
15451709 }
15461710 free_extent_map(em);
1547
- }
1548
-
1549
- if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1550
- type = BTRFS_ORDERED_PREALLOC;
1711
+ ret = btrfs_add_ordered_extent(inode, cur_offset,
1712
+ disk_bytenr, num_bytes,
1713
+ num_bytes,
1714
+ BTRFS_ORDERED_PREALLOC);
1715
+ if (ret) {
1716
+ btrfs_drop_extent_cache(inode, cur_offset,
1717
+ cur_offset + num_bytes - 1,
1718
+ 0);
1719
+ goto error;
1720
+ }
15511721 } else {
1552
- type = BTRFS_ORDERED_NOCOW;
1722
+ ret = btrfs_add_ordered_extent(inode, cur_offset,
1723
+ disk_bytenr, num_bytes,
1724
+ num_bytes,
1725
+ BTRFS_ORDERED_NOCOW);
1726
+ if (ret)
1727
+ goto error;
15531728 }
15541729
1555
- ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1556
- num_bytes, num_bytes, type);
15571730 if (nocow)
15581731 btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1559
- BUG_ON(ret); /* -ENOMEM */
1732
+ nocow = false;
15601733
15611734 if (root->root_key.objectid ==
15621735 BTRFS_DATA_RELOC_TREE_OBJECTID)
....@@ -1569,7 +1742,7 @@
15691742 num_bytes);
15701743
15711744 extent_clear_unlock_delalloc(inode, cur_offset,
1572
- cur_offset + num_bytes - 1, end,
1745
+ cur_offset + num_bytes - 1,
15731746 locked_page, EXTENT_LOCKED |
15741747 EXTENT_DELALLOC |
15751748 EXTENT_CLEAR_DATA_RESV,
....@@ -1594,15 +1767,18 @@
15941767
15951768 if (cow_start != (u64)-1) {
15961769 cur_offset = end;
1597
- ret = cow_file_range(inode, locked_page, cow_start, end, end,
1598
- page_started, nr_written, 1, NULL);
1770
+ ret = fallback_to_cow(inode, locked_page, cow_start, end,
1771
+ page_started, nr_written);
15991772 if (ret)
16001773 goto error;
16011774 }
16021775
16031776 error:
1777
+ if (nocow)
1778
+ btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1779
+
16041780 if (ret && cur_offset < end)
1605
- extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1781
+ extent_clear_unlock_delalloc(inode, cur_offset, end,
16061782 locked_page, EXTENT_LOCKED |
16071783 EXTENT_DELALLOC | EXTENT_DEFRAG |
16081784 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
....@@ -1613,11 +1789,11 @@
16131789 return ret;
16141790 }
16151791
1616
-static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1792
+static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end)
16171793 {
16181794
1619
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1620
- !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1795
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
1796
+ !(inode->flags & BTRFS_INODE_PREALLOC))
16211797 return 0;
16221798
16231799 /*
....@@ -1625,9 +1801,8 @@
16251801 * if is not zero, it means the file is defragging.
16261802 * Force cow if given extent needs to be defragged.
16271803 */
1628
- if (BTRFS_I(inode)->defrag_bytes &&
1629
- test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1630
- EXTENT_DEFRAG, 0, NULL))
1804
+ if (inode->defrag_bytes &&
1805
+ test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL))
16311806 return 1;
16321807
16331808 return 0;
....@@ -1637,31 +1812,27 @@
16371812 * Function to process delayed allocation (create CoW) for ranges which are
16381813 * being touched for the first time.
16391814 */
1640
-int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
1815
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
16411816 u64 start, u64 end, int *page_started, unsigned long *nr_written,
16421817 struct writeback_control *wbc)
16431818 {
1644
- struct inode *inode = private_data;
16451819 int ret;
16461820 int force_cow = need_force_cow(inode, start, end);
1647
- unsigned int write_flags = wbc_to_write_flags(wbc);
16481821
1649
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1822
+ if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) {
16501823 ret = run_delalloc_nocow(inode, locked_page, start, end,
16511824 page_started, 1, nr_written);
1652
- } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1825
+ } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) {
16531826 ret = run_delalloc_nocow(inode, locked_page, start, end,
16541827 page_started, 0, nr_written);
16551828 } else if (!inode_can_compress(inode) ||
16561829 !inode_need_compress(inode, start, end)) {
1657
- ret = cow_file_range(inode, locked_page, start, end, end,
1658
- page_started, nr_written, 1, NULL);
1830
+ ret = cow_file_range(inode, locked_page, start, end,
1831
+ page_started, nr_written, 1);
16591832 } else {
1660
- set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1661
- &BTRFS_I(inode)->runtime_flags);
1662
- ret = cow_file_range_async(inode, locked_page, start, end,
1663
- page_started, nr_written,
1664
- write_flags);
1833
+ set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
1834
+ ret = cow_file_range_async(inode, wbc, locked_page, start, end,
1835
+ page_started, nr_written);
16651836 }
16661837 if (ret)
16671838 btrfs_cleanup_ordered_extents(inode, locked_page, start,
....@@ -1669,10 +1840,9 @@
16691840 return ret;
16701841 }
16711842
1672
-static void btrfs_split_extent_hook(void *private_data,
1673
- struct extent_state *orig, u64 split)
1843
+void btrfs_split_delalloc_extent(struct inode *inode,
1844
+ struct extent_state *orig, u64 split)
16741845 {
1675
- struct inode *inode = private_data;
16761846 u64 size;
16771847
16781848 /* not delalloc, ignore it */
....@@ -1685,7 +1855,7 @@
16851855 u64 new_size;
16861856
16871857 /*
1688
- * See the explanation in btrfs_merge_extent_hook, the same
1858
+ * See the explanation in btrfs_merge_delalloc_extent, the same
16891859 * applies here, just in reverse.
16901860 */
16911861 new_size = orig->end - split + 1;
....@@ -1702,16 +1872,13 @@
17021872 }
17031873
17041874 /*
1705
- * extent_io.c merge_extent_hook, used to track merged delayed allocation
1706
- * extents so we can keep track of new extents that are just merged onto old
1707
- * extents, such as when we are doing sequential writes, so we can properly
1708
- * account for the metadata space we'll need.
1875
+ * Handle merged delayed allocation extents so we can keep track of new extents
1876
+ * that are just merged onto old extents, such as when we are doing sequential
1877
+ * writes, so we can properly account for the metadata space we'll need.
17091878 */
1710
-static void btrfs_merge_extent_hook(void *private_data,
1711
- struct extent_state *new,
1712
- struct extent_state *other)
1879
+void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
1880
+ struct extent_state *other)
17131881 {
1714
- struct inode *inode = private_data;
17151882 u64 new_size, old_size;
17161883 u32 num_extents;
17171884
....@@ -1815,15 +1982,12 @@
18151982 }
18161983
18171984 /*
1818
- * extent_io.c set_bit_hook, used to track delayed allocation
1819
- * bytes in this file, and to maintain the list of inodes that
1820
- * have pending delalloc work to be done.
1985
+ * Properly track delayed allocation bytes in the inode and to maintain the
1986
+ * list of inodes that have pending delalloc work to be done.
18211987 */
1822
-static void btrfs_set_bit_hook(void *private_data,
1823
- struct extent_state *state, unsigned *bits)
1988
+void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
1989
+ unsigned *bits)
18241990 {
1825
- struct inode *inode = private_data;
1826
-
18271991 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
18281992
18291993 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
....@@ -1869,14 +2033,14 @@
18692033 }
18702034
18712035 /*
1872
- * extent_io.c clear_bit_hook, see set_bit_hook for why
2036
+ * Once a range is no longer delalloc this function ensures that proper
2037
+ * accounting happens.
18732038 */
1874
-static void btrfs_clear_bit_hook(void *private_data,
1875
- struct extent_state *state,
1876
- unsigned *bits)
2039
+void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
2040
+ struct extent_state *state, unsigned *bits)
18772041 {
1878
- struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
1879
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
2042
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
2043
+ struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
18802044 u64 len = state->end + 1 - state->start;
18812045 u32 num_extents = count_max_extents(len);
18822046
....@@ -1901,7 +2065,7 @@
19012065
19022066 /*
19032067 * We don't reserve metadata space for space cache inodes so we
1904
- * don't need to call dellalloc_release_metadata if there is an
2068
+ * don't need to call delalloc_release_metadata if there is an
19052069 * error.
19062070 */
19072071 if (*bits & EXTENT_CLEAR_META_RESV &&
....@@ -1915,9 +2079,7 @@
19152079 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
19162080 do_list && !(state->state & EXTENT_NORESERVE) &&
19172081 (*bits & EXTENT_CLEAR_DATA_RESV))
1918
- btrfs_free_reserved_data_space_noquota(
1919
- &inode->vfs_inode,
1920
- state->start, len);
2082
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
19212083
19222084 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
19232085 fs_info->delalloc_batch);
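
percpu_counter_add_batch() is what keeps these set/clear paths cheap: each CPU accumulates a local delta and only folds it into the shared delalloc_bytes total once it crosses fs_info->delalloc_batch. A single-context model of that batching pattern (illustrative only; the kernel's percpu_counter keeps one local per CPU and locks around the fold):

#include <stdint.h>
#include <stdio.h>

struct batched_counter {
        int64_t total;  /* shared total, touched rarely */
        int64_t local;  /* per-context delta (per CPU in the kernel) */
        int64_t batch;  /* fold threshold */
};

static void counter_add(struct batched_counter *c, int64_t delta)
{
        c->local += delta;
        if (c->local >= c->batch || c->local <= -c->batch) {
                c->total += c->local;  /* the kernel takes a spinlock here */
                c->local = 0;
        }
}

int main(void)
{
        struct batched_counter delalloc = { .batch = 1 << 20 };

        counter_add(&delalloc, 4096);   /* a set_delalloc_extent of one page */
        counter_add(&delalloc, -4096);  /* the matching clear */
        printf("total=%lld local=%lld\n",
               (long long)delalloc.total, (long long)delalloc.local);
        return 0;
}
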
....@@ -1940,16 +2102,21 @@
19402102 }
19412103
19422104 /*
1943
- * Merge bio hook, this must check the chunk tree to make sure we don't create
1944
- * bios that span stripes or chunks
2105
+ * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
2106
+ * in a chunk's stripe. This function ensures that bios do not span a
2107
+ * stripe/chunk.
19452108 *
1946
- * return 1 if page cannot be merged to bio
1947
- * return 0 if page can be merged to bio
2109
+ * @page - The page we are about to add to the bio
2110
+ * @size - size we want to add to the bio
2111
+ * @bio - bio we want to ensure is smaller than a stripe
2112
+ * @bio_flags - flags of the bio
2113
+ *
2114
+ * return 1 if page cannot be added to the bio
2115
+ * return 0 if page can be added to the bio
19482116 * return error otherwise
19492117 */
1950
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1951
- size_t size, struct bio *bio,
1952
- unsigned long bio_flags)
2118
+int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
2119
+ unsigned long bio_flags)
19532120 {
19542121 struct inode *inode = page->mapping->host;
19552122 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
....@@ -1957,17 +2124,19 @@
19572124 u64 length = 0;
19582125 u64 map_length;
19592126 int ret;
2127
+ struct btrfs_io_geometry geom;
19602128
19612129 if (bio_flags & EXTENT_BIO_COMPRESSED)
19622130 return 0;
19632131
19642132 length = bio->bi_iter.bi_size;
19652133 map_length = length;
1966
- ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1967
- NULL, 0);
2134
+ ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length,
2135
+ &geom);
19682136 if (ret < 0)
19692137 return ret;
1970
- if (map_length < length + size)
2138
+
2139
+ if (geom.len < length + size)
19712140 return 1;
19722141 return 0;
19732142 }
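
The geometry check above boils down to: how many bytes remain in the stripe that `logical` falls in, and does the current bio plus the new page still fit. A toy model under the assumption of a fixed 64K stripe length (geom.len in the kernel comes from the chunk mapping and varies by RAID profile; both helpers below are stand-ins):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define STRIPE_LEN (64ULL * 1024)  /* assumed fixed stripe length */

/* stand-in for the btrfs_io_geometry.len the kernel computes */
static uint64_t stripe_bytes_left(uint64_t logical)
{
        return STRIPE_LEN - (logical % STRIPE_LEN);
}

/* true means "can be added", mirroring the function's 0 return */
static bool page_fits_in_stripe(uint64_t logical, uint64_t bio_len,
                                uint64_t size)
{
        return stripe_bytes_left(logical) >= bio_len + size;
}

int main(void)
{
        uint64_t logical = 60ULL * 1024;  /* 4K short of a stripe boundary */

        printf("add 4K: %s\n",
               page_fits_in_stripe(logical, 0, 4096) ? "fits" : "start new bio");
        printf("add 8K: %s\n",
               page_fits_in_stripe(logical, 0, 8192) ? "fits" : "start new bio");
        return 0;
}
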
....@@ -1984,34 +2153,8 @@
19842153 u64 bio_offset)
19852154 {
19862155 struct inode *inode = private_data;
1987
- blk_status_t ret = 0;
19882156
1989
- ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1990
- BUG_ON(ret); /* -ENOMEM */
1991
- return 0;
1992
-}
1993
-
1994
-/*
1995
- * in order to insert checksums into the metadata in large chunks,
1996
- * we wait until bio submission time. All the pages in the bio are
1997
- * checksummed and sums are attached onto the ordered extent record.
1998
- *
1999
- * At IO completion time the cums attached on the ordered extent record
2000
- * are inserted into the btree
2001
- */
2002
-blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
2003
- int mirror_num)
2004
-{
2005
- struct inode *inode = private_data;
2006
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2007
- blk_status_t ret;
2008
-
2009
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
2010
- if (ret) {
2011
- bio->bi_status = ret;
2012
- bio_endio(bio);
2013
- }
2014
- return ret;
2157
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
20152158 }
20162159
20172160 /*
....@@ -2032,11 +2175,10 @@
20322175 *
20332176 * c-3) otherwise: async submit
20342177 */
2035
-static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
2036
- int mirror_num, unsigned long bio_flags,
2037
- u64 bio_offset)
2178
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
2179
+ int mirror_num, unsigned long bio_flags)
20382181 {
2039
- struct inode *inode = private_data;
20402182 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
20412183 struct btrfs_root *root = BTRFS_I(inode)->root;
20422184 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
....@@ -2060,7 +2202,7 @@
20602202 bio_flags);
20612203 goto out;
20622204 } else if (!skip_sum) {
2063
- ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2205
+ ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL);
20642206 if (ret)
20652207 goto out;
20662208 }
....@@ -2071,17 +2213,16 @@
20712213 goto mapit;
20722214 /* we're doing a write, do the async checksumming */
20732215 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
2074
- bio_offset, inode,
2075
- btrfs_submit_bio_start);
2216
+ 0, inode, btrfs_submit_bio_start);
20762217 goto out;
20772218 } else if (!skip_sum) {
2078
- ret = btrfs_csum_one_bio(inode, bio, 0, 0);
2219
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
20792220 if (ret)
20802221 goto out;
20812222 }
20822223
20832224 mapit:
2084
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
2225
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
20852226
20862227 out:
20872228 if (ret) {
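
The c-1/c-2/c-3 cases in the comment above amount to a small decision table: reads look up existing checksums, writes either compute them inline or hand the bio to the async workers. A sketch of that policy with illustrative flags (none of these fields are the kernel's; async_disabled stands in for the metadata/sync checks btrfs actually performs):

#include <stdbool.h>
#include <stdio.h>

struct submit_ctx {
        bool is_write;
        bool skip_sum;       /* NODATASUM inode or free-space inode */
        bool async_disabled; /* sync write / checksum offload not wanted */
};

static const char *submit_policy(const struct submit_ctx *c)
{
        if (!c->is_write)
                return c->skip_sum ? "map directly" : "lookup csums, then map";
        if (c->async_disabled)
                return c->skip_sum ? "map directly" : "csum inline, then map";
        return "queue for async checksumming";  /* c-3 */
}

int main(void)
{
        struct submit_ctx read  = { .is_write = false };
        struct submit_ctx write = { .is_write = true };

        printf("read:  %s\n", submit_policy(&read));
        printf("write: %s\n", submit_policy(&write));
        return 0;
}
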
....@@ -2095,16 +2236,15 @@
20952236 * given a list of ordered sums record them in the inode. This happens
20962237 * at IO completion time based on sums calculated at bio submission time.
20972238 */
2098
-static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2099
- struct inode *inode, struct list_head *list)
2239
+static int add_pending_csums(struct btrfs_trans_handle *trans,
2240
+ struct list_head *list)
21002241 {
21012242 struct btrfs_ordered_sum *sum;
21022243 int ret;
21032244
21042245 list_for_each_entry(sum, list, list) {
21052246 trans->adding_csums = true;
2106
- ret = btrfs_csum_file_blocks(trans,
2107
- BTRFS_I(inode)->root->fs_info->csum_root, sum);
2247
+ ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
21082248 trans->adding_csums = false;
21092249 if (ret)
21102250 return ret;
....@@ -2112,18 +2252,77 @@
21122252 return 0;
21132253 }
21142254
2115
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2116
- unsigned int extra_bits,
2117
- struct extent_state **cached_state, int dedupe)
2255
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2256
+ const u64 start,
2257
+ const u64 len,
2258
+ struct extent_state **cached_state)
21182259 {
2119
- WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2120
- return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2121
- extra_bits, cached_state);
2260
+ u64 search_start = start;
2261
+ const u64 end = start + len - 1;
2262
+
2263
+ while (search_start < end) {
2264
+ const u64 search_len = end - search_start + 1;
2265
+ struct extent_map *em;
2266
+ u64 em_len;
2267
+ int ret = 0;
2268
+
2269
+ em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
2270
+ if (IS_ERR(em))
2271
+ return PTR_ERR(em);
2272
+
2273
+ if (em->block_start != EXTENT_MAP_HOLE)
2274
+ goto next;
2275
+
2276
+ em_len = em->len;
2277
+ if (em->start < search_start)
2278
+ em_len -= search_start - em->start;
2279
+ if (em_len > search_len)
2280
+ em_len = search_len;
2281
+
2282
+ ret = set_extent_bit(&inode->io_tree, search_start,
2283
+ search_start + em_len - 1,
2284
+ EXTENT_DELALLOC_NEW,
2285
+ NULL, cached_state, GFP_NOFS);
2286
+next:
2287
+ search_start = extent_map_end(em);
2288
+ free_extent_map(em);
2289
+ if (ret)
2290
+ return ret;
2291
+ }
2292
+ return 0;
2293
+}
2294
+
2295
+int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2296
+ unsigned int extra_bits,
2297
+ struct extent_state **cached_state)
2298
+{
2299
+ WARN_ON(PAGE_ALIGNED(end));
2300
+
2301
+ if (start >= i_size_read(&inode->vfs_inode) &&
2302
+ !(inode->flags & BTRFS_INODE_PREALLOC)) {
2303
+ /*
2304
+ * There can't be any extents following eof in this case so just
2305
+ * set the delalloc new bit for the range directly.
2306
+ */
2307
+ extra_bits |= EXTENT_DELALLOC_NEW;
2308
+ } else {
2309
+ int ret;
2310
+
2311
+ ret = btrfs_find_new_delalloc_bytes(inode, start,
2312
+ end + 1 - start,
2313
+ cached_state);
2314
+ if (ret)
2315
+ return ret;
2316
+ }
2317
+
2318
+ return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
2319
+ cached_state);
21222320 }
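
btrfs_find_new_delalloc_bytes() walks the extent maps covering the range and tags only the hole portions with EXTENT_DELALLOC_NEW; ranges entirely beyond EOF skip the walk because nothing can be mapped there. A userspace sketch of the hole walk, with a flat array standing in for btrfs_get_extent() lookups:

#include <stdint.h>
#include <stdio.h>

struct ext { uint64_t start, len; int hole; };

static void mark_new_delalloc(const struct ext *map, int n,
                              uint64_t start, uint64_t end)
{
        uint64_t pos = start;

        for (int i = 0; i < n && pos <= end; i++) {
                uint64_t e_end = map[i].start + map[i].len;  /* exclusive */

                if (e_end <= pos)
                        continue;
                uint64_t lo = pos > map[i].start ? pos : map[i].start;
                uint64_t hi = e_end - 1 < end ? e_end - 1 : end;
                if (map[i].hole)  /* where EXTENT_DELALLOC_NEW would be set */
                        printf("DELALLOC_NEW on [%llu, %llu]\n",
                               (unsigned long long)lo, (unsigned long long)hi);
                pos = e_end;
        }
}

int main(void)
{
        /* 0-4K written, 4K-12K hole, 12K-16K written */
        struct ext map[] = {
                { 0, 4096, 0 }, { 4096, 8192, 1 }, { 12288, 4096, 0 },
        };

        mark_new_delalloc(map, 3, 0, 16383);
        return 0;
}
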
21232321
21242322 /* see btrfs_writepage_start_hook for details on why this is required */
21252323 struct btrfs_writepage_fixup {
21262324 struct page *page;
2325
+ struct inode *inode;
21272326 struct btrfs_work work;
21282327 };
21292328
....@@ -2134,75 +2333,126 @@
21342333 struct extent_state *cached_state = NULL;
21352334 struct extent_changeset *data_reserved = NULL;
21362335 struct page *page;
2137
- struct inode *inode;
2336
+ struct btrfs_inode *inode;
21382337 u64 page_start;
21392338 u64 page_end;
2140
- int ret;
2339
+ int ret = 0;
2340
+ bool free_delalloc_space = true;
21412341
21422342 fixup = container_of(work, struct btrfs_writepage_fixup, work);
21432343 page = fixup->page;
2144
-again:
2145
- lock_page(page);
2146
- if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2147
- ClearPageChecked(page);
2148
- goto out_page;
2149
- }
2150
-
2151
- inode = page->mapping->host;
2344
+ inode = BTRFS_I(fixup->inode);
21522345 page_start = page_offset(page);
21532346 page_end = page_offset(page) + PAGE_SIZE - 1;
21542347
2155
- lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2156
- &cached_state);
2348
+ /*
2349
+ * This is similar to page_mkwrite, we need to reserve the space before
2350
+ * we take the page lock.
2351
+ */
2352
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2353
+ PAGE_SIZE);
2354
+again:
2355
+ lock_page(page);
2356
+
2357
+ /*
2358
+ * Before we queued this fixup, we took a reference on the page.
2359
+ * page->mapping may go NULL, but it shouldn't be moved to a different
2360
+ * address space.
2361
+ */
2362
+ if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2363
+ /*
2364
+ * Unfortunately this is a little tricky, either
2365
+ *
2366
+ * 1) We got here and our page had already been dealt with and
2367
+ * we reserved our space, thus ret == 0, so we need to just
2368
+ * drop our space reservation and bail. This can happen the
2369
+ * first time we come into the fixup worker, or could happen
2370
+ * while waiting for the ordered extent.
2371
+ * 2) Our page was already dealt with, but we happened to get an
2372
+ * ENOSPC above from the btrfs_delalloc_reserve_space. In
2373
+ * this case we obviously don't have anything to release, but
2374
+ * because the page was already dealt with we don't want to
2375
+ * mark the page with an error, so make sure we're resetting
2376
+ * ret to 0. This is why we have this check _before_ the ret
2377
+ * check, because we do not want to have a surprise ENOSPC
2378
+ * when the page was already properly dealt with.
2379
+ */
2380
+ if (!ret) {
2381
+ btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2382
+ btrfs_delalloc_release_space(inode, data_reserved,
2383
+ page_start, PAGE_SIZE,
2384
+ true);
2385
+ }
2386
+ ret = 0;
2387
+ goto out_page;
2388
+ }
2389
+
2390
+ /*
2391
+ * We can't mess with the page state unless it is locked, so now that
2392
+ * it is locked bail if we failed to make our space reservation.
2393
+ */
2394
+ if (ret)
2395
+ goto out_page;
2396
+
2397
+ lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
21572398
21582399 /* already ordered? We're done */
21592400 if (PagePrivate2(page))
2160
- goto out;
2401
+ goto out_reserved;
21612402
2162
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2163
- PAGE_SIZE);
2403
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
21642404 if (ordered) {
2165
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2166
- page_end, &cached_state);
2405
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
2406
+ &cached_state);
21672407 unlock_page(page);
2168
- btrfs_start_ordered_extent(inode, ordered, 1);
2408
+ btrfs_start_ordered_extent(ordered, 1);
21692409 btrfs_put_ordered_extent(ordered);
21702410 goto again;
21712411 }
21722412
2173
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2174
- PAGE_SIZE);
2175
- if (ret) {
2176
- mapping_set_error(page->mapping, ret);
2177
- end_extent_writepage(page, ret, page_start, page_end);
2178
- ClearPageChecked(page);
2179
- goto out;
2180
- }
2181
-
21822413 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2183
- &cached_state, 0);
2184
- if (ret) {
2185
- mapping_set_error(page->mapping, ret);
2186
- end_extent_writepage(page, ret, page_start, page_end);
2187
- ClearPageChecked(page);
2188
- goto out_reserved;
2189
- }
2190
-
2191
- ClearPageChecked(page);
2192
- set_page_dirty(page);
2193
-out_reserved:
2194
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
2414
+ &cached_state);
21952415 if (ret)
2416
+ goto out_reserved;
2417
+
2418
+ /*
2419
+ * Everything went as planned, we're now the owner of a dirty page with
2420
+ * delayed allocation bits set and space reserved for our COW
2421
+ * destination.
2422
+ *
2423
+ * The page was dirty when we started, nothing should have cleaned it.
2424
+ */
2425
+ BUG_ON(!PageDirty(page));
2426
+ free_delalloc_space = false;
2427
+out_reserved:
2428
+ btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2429
+ if (free_delalloc_space)
21962430 btrfs_delalloc_release_space(inode, data_reserved, page_start,
21972431 PAGE_SIZE, true);
2198
-out:
2199
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2432
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
22002433 &cached_state);
22012434 out_page:
2435
+ if (ret) {
2436
+ /*
2437
+ * We hit ENOSPC or other errors. Update the mapping and page
2438
+ * to reflect the errors and clean the page.
2439
+ */
2440
+ mapping_set_error(page->mapping, ret);
2441
+ end_extent_writepage(page, ret, page_start, page_end);
2442
+ clear_page_dirty_for_io(page);
2443
+ SetPageError(page);
2444
+ }
2445
+ ClearPageChecked(page);
22022446 unlock_page(page);
22032447 put_page(page);
22042448 kfree(fixup);
22052449 extent_changeset_free(data_reserved);
2450
+ /*
2451
+ * As a precaution, do a delayed iput in case it would be the last iput
2452
+ * that could need flushing space. Recursing back to fixup worker would
2453
+ * deadlock.
2454
+ */
2455
+ btrfs_add_delayed_iput(&inode->vfs_inode);
22062456 }
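
The control flow the comments above describe, reduced to its shape: reserve before locking (the reservation may block or flush), re-check the page under the lock, and hand a successful reservation back if the page turned out to be already handled. A minimal non-kernel model (all helpers hypothetical):

#include <stdbool.h>
#include <stdio.h>

static int reserve_space(void)          { return 0; /* 0 or -ENOSPC */ }
static void release_space(void)         { puts("released reservation"); }
static bool page_still_needs_fix(void)  { return false; /* raced: done */ }

static int fixup_worker(void)
{
        int ret = reserve_space();  /* taken before the page lock */

        /* lock_page() would happen here */
        if (!page_still_needs_fix()) {
                if (!ret)           /* case 1: reserved, nothing left to do */
                        release_space();
                return 0;           /* case 2: a late ENOSPC is also fine */
        }
        if (ret)
                return ret;         /* locked but no space: fail the page */

        puts("set delalloc, keep the reservation for the COW destination");
        return 0;
}

int main(void) { return fixup_worker(); }
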
22072457
22082458 /*
....@@ -2216,7 +2466,7 @@
22162466 * to fix it up. The async helper will wait for ordered extents, set
22172467 * the delalloc bit and make it safe to write the page.
22182468 */
2219
-static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2469
+int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
22202470 {
22212471 struct inode *inode = page->mapping->host;
22222472 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
....@@ -2226,6 +2476,13 @@
22262476 if (TestClearPagePrivate2(page))
22272477 return 0;
22282478
2479
+ /*
2480
+ * PageChecked is set below when we create a fixup worker for this page;
2481
+ * don't try to create another one if we're already PageChecked().
2482
+ *
2483
+ * The extent_io writepage code will redirty the page if we send back
2484
+ * EAGAIN.
2485
+ */
22292486 if (PageChecked(page))
22302487 return -EAGAIN;
22312488
....@@ -2233,28 +2490,36 @@
22332490 if (!fixup)
22342491 return -EAGAIN;
22352492
2493
+ /*
2494
+ * We are already holding a reference to this inode from
2495
+ * write_cache_pages. We need to hold it because the space reservation
2496
+ * takes place outside of the page lock, and we can't trust
2497
+ * page->mapping outside of the page lock.
2498
+ */
2499
+ ihold(inode);
22362500 SetPageChecked(page);
22372501 get_page(page);
2238
- btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2239
- btrfs_writepage_fixup_worker, NULL, NULL);
2502
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
22402503 fixup->page = page;
2504
+ fixup->inode = inode;
22412505 btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2242
- return -EBUSY;
2506
+
2507
+ return -EAGAIN;
22432508 }
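
Pulled out of the page-flag details, the fixup entry point is a three-way protocol: PagePrivate2 means an ordered extent already covers the page (the write may proceed), PageChecked means a fixup is already queued, and -EAGAIN asks the writepage path to redirty and move on. A sketch with plain booleans in place of the page bits:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_page { bool private2, checked; };

static int cow_fixup(struct fake_page *p)
{
        if (p->private2) {   /* ordered extent present: write can proceed */
                p->private2 = false;
                return 0;
        }
        if (p->checked)      /* a fixup is already queued: just redirty */
                return -EAGAIN;
        p->checked = true;   /* queue the fixup worker exactly once */
        return -EAGAIN;
}

int main(void)
{
        struct fake_page p = { 0 };

        printf("first pass:  %d\n", cow_fixup(&p));  /* -EAGAIN, queued */
        printf("second pass: %d\n", cow_fixup(&p));  /* -EAGAIN, no requeue */
        return 0;
}
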
22442509
22452510 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2246
- struct inode *inode, u64 file_pos,
2247
- u64 disk_bytenr, u64 disk_num_bytes,
2248
- u64 num_bytes, u64 ram_bytes,
2249
- u8 compression, u8 encryption,
2250
- u16 other_encoding, int extent_type)
2511
+ struct btrfs_inode *inode, u64 file_pos,
2512
+ struct btrfs_file_extent_item *stack_fi,
2513
+ u64 qgroup_reserved)
22512514 {
2252
- struct btrfs_root *root = BTRFS_I(inode)->root;
2253
- struct btrfs_file_extent_item *fi;
2515
+ struct btrfs_root *root = inode->root;
22542516 struct btrfs_path *path;
22552517 struct extent_buffer *leaf;
22562518 struct btrfs_key ins;
2257
- u64 qg_released;
2519
+ u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2520
+ u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2521
+ u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2522
+ u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
22582523 int extent_inserted = 0;
22592524 int ret;
22602525
....@@ -2273,709 +2538,52 @@
22732538 */
22742539 ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
22752540 file_pos + num_bytes, NULL, 0,
2276
- 1, sizeof(*fi), &extent_inserted);
2541
+ 1, sizeof(*stack_fi), &extent_inserted);
22772542 if (ret)
22782543 goto out;
22792544
22802545 if (!extent_inserted) {
2281
- ins.objectid = btrfs_ino(BTRFS_I(inode));
2546
+ ins.objectid = btrfs_ino(inode);
22822547 ins.offset = file_pos;
22832548 ins.type = BTRFS_EXTENT_DATA_KEY;
22842549
22852550 path->leave_spinning = 1;
22862551 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2287
- sizeof(*fi));
2552
+ sizeof(*stack_fi));
22882553 if (ret)
22892554 goto out;
22902555 }
22912556 leaf = path->nodes[0];
2292
- fi = btrfs_item_ptr(leaf, path->slots[0],
2293
- struct btrfs_file_extent_item);
2294
- btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2295
- btrfs_set_file_extent_type(leaf, fi, extent_type);
2296
- btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2297
- btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2298
- btrfs_set_file_extent_offset(leaf, fi, 0);
2299
- btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2300
- btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2301
- btrfs_set_file_extent_compression(leaf, fi, compression);
2302
- btrfs_set_file_extent_encryption(leaf, fi, encryption);
2303
- btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2557
+ btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2558
+ write_extent_buffer(leaf, stack_fi,
2559
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
2560
+ sizeof(struct btrfs_file_extent_item));
23042561
23052562 btrfs_mark_buffer_dirty(leaf);
23062563 btrfs_release_path(path);
23072564
2308
- inode_add_bytes(inode, num_bytes);
2565
+ inode_add_bytes(&inode->vfs_inode, num_bytes);
23092566
23102567 ins.objectid = disk_bytenr;
23112568 ins.offset = disk_num_bytes;
23122569 ins.type = BTRFS_EXTENT_ITEM_KEY;
23132570
2314
- /*
2315
- * Release the reserved range from inode dirty range map, as it is
2316
- * already moved into delayed_ref_head
2317
- */
2318
- ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2319
- if (ret < 0)
2320
- goto out;
2321
- qg_released = ret;
2322
- ret = btrfs_alloc_reserved_file_extent(trans, root,
2323
- btrfs_ino(BTRFS_I(inode)),
2324
- file_pos, qg_released, &ins);
2325
-out:
2326
- btrfs_free_path(path);
2327
-
2328
- return ret;
2329
-}
2330
-
2331
-/* snapshot-aware defrag */
2332
-struct sa_defrag_extent_backref {
2333
- struct rb_node node;
2334
- struct old_sa_defrag_extent *old;
2335
- u64 root_id;
2336
- u64 inum;
2337
- u64 file_pos;
2338
- u64 extent_offset;
2339
- u64 num_bytes;
2340
- u64 generation;
2341
-};
2342
-
2343
-struct old_sa_defrag_extent {
2344
- struct list_head list;
2345
- struct new_sa_defrag_extent *new;
2346
-
2347
- u64 extent_offset;
2348
- u64 bytenr;
2349
- u64 offset;
2350
- u64 len;
2351
- int count;
2352
-};
2353
-
2354
-struct new_sa_defrag_extent {
2355
- struct rb_root root;
2356
- struct list_head head;
2357
- struct btrfs_path *path;
2358
- struct inode *inode;
2359
- u64 file_pos;
2360
- u64 len;
2361
- u64 bytenr;
2362
- u64 disk_len;
2363
- u8 compress_type;
2364
-};
2365
-
2366
-static int backref_comp(struct sa_defrag_extent_backref *b1,
2367
- struct sa_defrag_extent_backref *b2)
2368
-{
2369
- if (b1->root_id < b2->root_id)
2370
- return -1;
2371
- else if (b1->root_id > b2->root_id)
2372
- return 1;
2373
-
2374
- if (b1->inum < b2->inum)
2375
- return -1;
2376
- else if (b1->inum > b2->inum)
2377
- return 1;
2378
-
2379
- if (b1->file_pos < b2->file_pos)
2380
- return -1;
2381
- else if (b1->file_pos > b2->file_pos)
2382
- return 1;
2383
-
2384
- /*
2385
- * [------------------------------] ===> (a range of space)
2386
- * |<--->| |<---->| =============> (fs/file tree A)
2387
- * |<---------------------------->| ===> (fs/file tree B)
2388
- *
2389
- * A range of space can refer to two file extents in one tree while
2390
- * refer to only one file extent in another tree.
2391
- *
2392
- * So we may process a disk offset more than one time(two extents in A)
2393
- * and locate at the same extent(one extent in B), then insert two same
2394
- * backrefs(both refer to the extent in B).
2395
- */
2396
- return 0;
2397
-}
2398
-
2399
-static void backref_insert(struct rb_root *root,
2400
- struct sa_defrag_extent_backref *backref)
2401
-{
2402
- struct rb_node **p = &root->rb_node;
2403
- struct rb_node *parent = NULL;
2404
- struct sa_defrag_extent_backref *entry;
2405
- int ret;
2406
-
2407
- while (*p) {
2408
- parent = *p;
2409
- entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2410
-
2411
- ret = backref_comp(backref, entry);
2412
- if (ret < 0)
2413
- p = &(*p)->rb_left;
2414
- else
2415
- p = &(*p)->rb_right;
2416
- }
2417
-
2418
- rb_link_node(&backref->node, parent, p);
2419
- rb_insert_color(&backref->node, root);
2420
-}
2421
-
2422
-/*
2423
- * Note the backref might has changed, and in this case we just return 0.
2424
- */
2425
-static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2426
- void *ctx)
2427
-{
2428
- struct btrfs_file_extent_item *extent;
2429
- struct old_sa_defrag_extent *old = ctx;
2430
- struct new_sa_defrag_extent *new = old->new;
2431
- struct btrfs_path *path = new->path;
2432
- struct btrfs_key key;
2433
- struct btrfs_root *root;
2434
- struct sa_defrag_extent_backref *backref;
2435
- struct extent_buffer *leaf;
2436
- struct inode *inode = new->inode;
2437
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2438
- int slot;
2439
- int ret;
2440
- u64 extent_offset;
2441
- u64 num_bytes;
2442
-
2443
- if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2444
- inum == btrfs_ino(BTRFS_I(inode)))
2445
- return 0;
2446
-
2447
- key.objectid = root_id;
2448
- key.type = BTRFS_ROOT_ITEM_KEY;
2449
- key.offset = (u64)-1;
2450
-
2451
- root = btrfs_read_fs_root_no_name(fs_info, &key);
2452
- if (IS_ERR(root)) {
2453
- if (PTR_ERR(root) == -ENOENT)
2454
- return 0;
2455
- WARN_ON(1);
2456
- btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2457
- inum, offset, root_id);
2458
- return PTR_ERR(root);
2459
- }
2460
-
2461
- key.objectid = inum;
2462
- key.type = BTRFS_EXTENT_DATA_KEY;
2463
- if (offset > (u64)-1 << 32)
2464
- key.offset = 0;
2465
- else
2466
- key.offset = offset;
2467
-
2468
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2469
- if (WARN_ON(ret < 0))
2470
- return ret;
2471
- ret = 0;
2472
-
2473
- while (1) {
2474
- cond_resched();
2475
-
2476
- leaf = path->nodes[0];
2477
- slot = path->slots[0];
2478
-
2479
- if (slot >= btrfs_header_nritems(leaf)) {
2480
- ret = btrfs_next_leaf(root, path);
2481
- if (ret < 0) {
2482
- goto out;
2483
- } else if (ret > 0) {
2484
- ret = 0;
2485
- goto out;
2486
- }
2487
- continue;
2488
- }
2489
-
2490
- path->slots[0]++;
2491
-
2492
- btrfs_item_key_to_cpu(leaf, &key, slot);
2493
-
2494
- if (key.objectid > inum)
2495
- goto out;
2496
-
2497
- if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2498
- continue;
2499
-
2500
- extent = btrfs_item_ptr(leaf, slot,
2501
- struct btrfs_file_extent_item);
2502
-
2503
- if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2504
- continue;
2505
-
2506
- /*
2507
- * 'offset' refers to the exact key.offset,
2508
- * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2509
- * (key.offset - extent_offset).
2510
- */
2511
- if (key.offset != offset)
2512
- continue;
2513
-
2514
- extent_offset = btrfs_file_extent_offset(leaf, extent);
2515
- num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2516
-
2517
- if (extent_offset >= old->extent_offset + old->offset +
2518
- old->len || extent_offset + num_bytes <=
2519
- old->extent_offset + old->offset)
2520
- continue;
2521
- break;
2522
- }
2523
-
2524
- backref = kmalloc(sizeof(*backref), GFP_NOFS);
2525
- if (!backref) {
2526
- ret = -ENOENT;
2527
- goto out;
2528
- }
2529
-
2530
- backref->root_id = root_id;
2531
- backref->inum = inum;
2532
- backref->file_pos = offset;
2533
- backref->num_bytes = num_bytes;
2534
- backref->extent_offset = extent_offset;
2535
- backref->generation = btrfs_file_extent_generation(leaf, extent);
2536
- backref->old = old;
2537
- backref_insert(&new->root, backref);
2538
- old->count++;
2539
-out:
2540
- btrfs_release_path(path);
2541
- WARN_ON(ret);
2542
- return ret;
2543
-}
2544
-
2545
-static noinline bool record_extent_backrefs(struct btrfs_path *path,
2546
- struct new_sa_defrag_extent *new)
2547
-{
2548
- struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2549
- struct old_sa_defrag_extent *old, *tmp;
2550
- int ret;
2551
-
2552
- new->path = path;
2553
-
2554
- list_for_each_entry_safe(old, tmp, &new->head, list) {
2555
- ret = iterate_inodes_from_logical(old->bytenr +
2556
- old->extent_offset, fs_info,
2557
- path, record_one_backref,
2558
- old, false);
2559
- if (ret < 0 && ret != -ENOENT)
2560
- return false;
2561
-
2562
- /* no backref to be processed for this extent */
2563
- if (!old->count) {
2564
- list_del(&old->list);
2565
- kfree(old);
2566
- }
2567
- }
2568
-
2569
- if (list_empty(&new->head))
2570
- return false;
2571
-
2572
- return true;
2573
-}
2574
-
2575
-static int relink_is_mergable(struct extent_buffer *leaf,
2576
- struct btrfs_file_extent_item *fi,
2577
- struct new_sa_defrag_extent *new)
2578
-{
2579
- if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2580
- return 0;
2581
-
2582
- if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2583
- return 0;
2584
-
2585
- if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2586
- return 0;
2587
-
2588
- if (btrfs_file_extent_encryption(leaf, fi) ||
2589
- btrfs_file_extent_other_encoding(leaf, fi))
2590
- return 0;
2591
-
2592
- return 1;
2593
-}
2594
-
2595
-/*
2596
- * Note the backref might has changed, and in this case we just return 0.
2597
- */
2598
-static noinline int relink_extent_backref(struct btrfs_path *path,
2599
- struct sa_defrag_extent_backref *prev,
2600
- struct sa_defrag_extent_backref *backref)
2601
-{
2602
- struct btrfs_file_extent_item *extent;
2603
- struct btrfs_file_extent_item *item;
2604
- struct btrfs_ordered_extent *ordered;
2605
- struct btrfs_trans_handle *trans;
2606
- struct btrfs_root *root;
2607
- struct btrfs_key key;
2608
- struct extent_buffer *leaf;
2609
- struct old_sa_defrag_extent *old = backref->old;
2610
- struct new_sa_defrag_extent *new = old->new;
2611
- struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2612
- struct inode *inode;
2613
- struct extent_state *cached = NULL;
2614
- int ret = 0;
2615
- u64 start;
2616
- u64 len;
2617
- u64 lock_start;
2618
- u64 lock_end;
2619
- bool merge = false;
2620
- int index;
2621
-
2622
- if (prev && prev->root_id == backref->root_id &&
2623
- prev->inum == backref->inum &&
2624
- prev->file_pos + prev->num_bytes == backref->file_pos)
2625
- merge = true;
2626
-
2627
- /* step 1: get root */
2628
- key.objectid = backref->root_id;
2629
- key.type = BTRFS_ROOT_ITEM_KEY;
2630
- key.offset = (u64)-1;
2631
-
2632
- index = srcu_read_lock(&fs_info->subvol_srcu);
2633
-
2634
- root = btrfs_read_fs_root_no_name(fs_info, &key);
2635
- if (IS_ERR(root)) {
2636
- srcu_read_unlock(&fs_info->subvol_srcu, index);
2637
- if (PTR_ERR(root) == -ENOENT)
2638
- return 0;
2639
- return PTR_ERR(root);
2640
- }
2641
-
2642
- if (btrfs_root_readonly(root)) {
2643
- srcu_read_unlock(&fs_info->subvol_srcu, index);
2644
- return 0;
2645
- }
2646
-
2647
- /* step 2: get inode */
2648
- key.objectid = backref->inum;
2649
- key.type = BTRFS_INODE_ITEM_KEY;
2650
- key.offset = 0;
2651
-
2652
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2653
- if (IS_ERR(inode)) {
2654
- srcu_read_unlock(&fs_info->subvol_srcu, index);
2655
- return 0;
2656
- }
2657
-
2658
- srcu_read_unlock(&fs_info->subvol_srcu, index);
2659
-
2660
- /* step 3: relink backref */
2661
- lock_start = backref->file_pos;
2662
- lock_end = backref->file_pos + backref->num_bytes - 1;
2663
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2664
- &cached);
2665
-
2666
- ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2667
- if (ordered) {
2668
- btrfs_put_ordered_extent(ordered);
2669
- goto out_unlock;
2670
- }
2671
-
2672
- trans = btrfs_join_transaction(root);
2673
- if (IS_ERR(trans)) {
2674
- ret = PTR_ERR(trans);
2675
- goto out_unlock;
2676
- }
2677
-
2678
- key.objectid = backref->inum;
2679
- key.type = BTRFS_EXTENT_DATA_KEY;
2680
- key.offset = backref->file_pos;
2681
-
2682
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2683
- if (ret < 0) {
2684
- goto out_free_path;
2685
- } else if (ret > 0) {
2686
- ret = 0;
2687
- goto out_free_path;
2688
- }
2689
-
2690
- extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2691
- struct btrfs_file_extent_item);
2692
-
2693
- if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2694
- backref->generation)
2695
- goto out_free_path;
2696
-
2697
- btrfs_release_path(path);
2698
-
2699
- start = backref->file_pos;
2700
- if (backref->extent_offset < old->extent_offset + old->offset)
2701
- start += old->extent_offset + old->offset -
2702
- backref->extent_offset;
2703
-
2704
- len = min(backref->extent_offset + backref->num_bytes,
2705
- old->extent_offset + old->offset + old->len);
2706
- len -= max(backref->extent_offset, old->extent_offset + old->offset);
2707
-
2708
- ret = btrfs_drop_extents(trans, root, inode, start,
2709
- start + len, 1);
2571
+ ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
27102572 if (ret)
2711
- goto out_free_path;
2712
-again:
2713
- key.objectid = btrfs_ino(BTRFS_I(inode));
2714
- key.type = BTRFS_EXTENT_DATA_KEY;
2715
- key.offset = start;
2716
-
2717
- path->leave_spinning = 1;
2718
- if (merge) {
2719
- struct btrfs_file_extent_item *fi;
2720
- u64 extent_len;
2721
- struct btrfs_key found_key;
2722
-
2723
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2724
- if (ret < 0)
2725
- goto out_free_path;
2726
-
2727
- path->slots[0]--;
2728
- leaf = path->nodes[0];
2729
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2730
-
2731
- fi = btrfs_item_ptr(leaf, path->slots[0],
2732
- struct btrfs_file_extent_item);
2733
- extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2734
-
2735
- if (extent_len + found_key.offset == start &&
2736
- relink_is_mergable(leaf, fi, new)) {
2737
- btrfs_set_file_extent_num_bytes(leaf, fi,
2738
- extent_len + len);
2739
- btrfs_mark_buffer_dirty(leaf);
2740
- inode_add_bytes(inode, len);
2741
-
2742
- ret = 1;
2743
- goto out_free_path;
2744
- } else {
2745
- merge = false;
2746
- btrfs_release_path(path);
2747
- goto again;
2748
- }
2749
- }
2750
-
2751
- ret = btrfs_insert_empty_item(trans, root, path, &key,
2752
- sizeof(*extent));
2753
- if (ret) {
2754
- btrfs_abort_transaction(trans, ret);
2755
- goto out_free_path;
2756
- }
2757
-
2758
- leaf = path->nodes[0];
2759
- item = btrfs_item_ptr(leaf, path->slots[0],
2760
- struct btrfs_file_extent_item);
2761
- btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2762
- btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2763
- btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2764
- btrfs_set_file_extent_num_bytes(leaf, item, len);
2765
- btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2766
- btrfs_set_file_extent_generation(leaf, item, trans->transid);
2767
- btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2768
- btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2769
- btrfs_set_file_extent_encryption(leaf, item, 0);
2770
- btrfs_set_file_extent_other_encoding(leaf, item, 0);
2771
-
2772
- btrfs_mark_buffer_dirty(leaf);
2773
- inode_add_bytes(inode, len);
2774
- btrfs_release_path(path);
2775
-
2776
- ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2777
- new->disk_len, 0,
2778
- backref->root_id, backref->inum,
2779
- new->file_pos); /* start - extent_offset */
2780
- if (ret) {
2781
- btrfs_abort_transaction(trans, ret);
2782
- goto out_free_path;
2783
- }
2784
-
2785
- ret = 1;
2786
-out_free_path:
2787
- btrfs_release_path(path);
2788
- path->leave_spinning = 0;
2789
- btrfs_end_transaction(trans);
2790
-out_unlock:
2791
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2792
- &cached);
2793
- iput(inode);
2794
- return ret;
2795
-}
2796
-
2797
-static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2798
-{
2799
- struct old_sa_defrag_extent *old, *tmp;
2800
-
2801
- if (!new)
2802
- return;
2803
-
2804
- list_for_each_entry_safe(old, tmp, &new->head, list) {
2805
- kfree(old);
2806
- }
2807
- kfree(new);
2808
-}
2809
-
2810
-static void relink_file_extents(struct new_sa_defrag_extent *new)
2811
-{
2812
- struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2813
- struct btrfs_path *path;
2814
- struct sa_defrag_extent_backref *backref;
2815
- struct sa_defrag_extent_backref *prev = NULL;
2816
- struct inode *inode;
2817
- struct rb_node *node;
2818
- int ret;
2819
-
2820
- inode = new->inode;
2821
-
2822
- path = btrfs_alloc_path();
2823
- if (!path)
2824
- return;
2825
-
2826
- if (!record_extent_backrefs(path, new)) {
2827
- btrfs_free_path(path);
28282573 goto out;
2829
- }
2830
- btrfs_release_path(path);
28312574
2832
- while (1) {
2833
- node = rb_first(&new->root);
2834
- if (!node)
2835
- break;
2836
- rb_erase(node, &new->root);
2837
-
2838
- backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2839
-
2840
- ret = relink_extent_backref(path, prev, backref);
2841
- WARN_ON(ret < 0);
2842
-
2843
- kfree(prev);
2844
-
2845
- if (ret == 1)
2846
- prev = backref;
2847
- else
2848
- prev = NULL;
2849
- cond_resched();
2850
- }
2851
- kfree(prev);
2852
-
2853
- btrfs_free_path(path);
2575
+ ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
2576
+ file_pos, qgroup_reserved, &ins);
28542577 out:
2855
- free_sa_defrag_extent(new);
2856
-
2857
- atomic_dec(&fs_info->defrag_running);
2858
- wake_up(&fs_info->transaction_wait);
2859
-}
2860
-
2861
-static struct new_sa_defrag_extent *
2862
-record_old_file_extents(struct inode *inode,
2863
- struct btrfs_ordered_extent *ordered)
2864
-{
2865
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2866
- struct btrfs_root *root = BTRFS_I(inode)->root;
2867
- struct btrfs_path *path;
2868
- struct btrfs_key key;
2869
- struct old_sa_defrag_extent *old;
2870
- struct new_sa_defrag_extent *new;
2871
- int ret;
2872
-
2873
- new = kmalloc(sizeof(*new), GFP_NOFS);
2874
- if (!new)
2875
- return NULL;
2876
-
2877
- new->inode = inode;
2878
- new->file_pos = ordered->file_offset;
2879
- new->len = ordered->len;
2880
- new->bytenr = ordered->start;
2881
- new->disk_len = ordered->disk_len;
2882
- new->compress_type = ordered->compress_type;
2883
- new->root = RB_ROOT;
2884
- INIT_LIST_HEAD(&new->head);
2885
-
2886
- path = btrfs_alloc_path();
2887
- if (!path)
2888
- goto out_kfree;
2889
-
2890
- key.objectid = btrfs_ino(BTRFS_I(inode));
2891
- key.type = BTRFS_EXTENT_DATA_KEY;
2892
- key.offset = new->file_pos;
2893
-
2894
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2895
- if (ret < 0)
2896
- goto out_free_path;
2897
- if (ret > 0 && path->slots[0] > 0)
2898
- path->slots[0]--;
2899
-
2900
- /* find out all the old extents for the file range */
2901
- while (1) {
2902
- struct btrfs_file_extent_item *extent;
2903
- struct extent_buffer *l;
2904
- int slot;
2905
- u64 num_bytes;
2906
- u64 offset;
2907
- u64 end;
2908
- u64 disk_bytenr;
2909
- u64 extent_offset;
2910
-
2911
- l = path->nodes[0];
2912
- slot = path->slots[0];
2913
-
2914
- if (slot >= btrfs_header_nritems(l)) {
2915
- ret = btrfs_next_leaf(root, path);
2916
- if (ret < 0)
2917
- goto out_free_path;
2918
- else if (ret > 0)
2919
- break;
2920
- continue;
2921
- }
2922
-
2923
- btrfs_item_key_to_cpu(l, &key, slot);
2924
-
2925
- if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2926
- break;
2927
- if (key.type != BTRFS_EXTENT_DATA_KEY)
2928
- break;
2929
- if (key.offset >= new->file_pos + new->len)
2930
- break;
2931
-
2932
- extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2933
-
2934
- num_bytes = btrfs_file_extent_num_bytes(l, extent);
2935
- if (key.offset + num_bytes < new->file_pos)
2936
- goto next;
2937
-
2938
- disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2939
- if (!disk_bytenr)
2940
- goto next;
2941
-
2942
- extent_offset = btrfs_file_extent_offset(l, extent);
2943
-
2944
- old = kmalloc(sizeof(*old), GFP_NOFS);
2945
- if (!old)
2946
- goto out_free_path;
2947
-
2948
- offset = max(new->file_pos, key.offset);
2949
- end = min(new->file_pos + new->len, key.offset + num_bytes);
2950
-
2951
- old->bytenr = disk_bytenr;
2952
- old->extent_offset = extent_offset;
2953
- old->offset = offset - key.offset;
2954
- old->len = end - offset;
2955
- old->new = new;
2956
- old->count = 0;
2957
- list_add_tail(&old->list, &new->head);
2958
-next:
2959
- path->slots[0]++;
2960
- cond_resched();
2961
- }
2962
-
29632578 btrfs_free_path(path);
2964
- atomic_inc(&fs_info->defrag_running);
29652579
2966
- return new;
2967
-
2968
-out_free_path:
2969
- btrfs_free_path(path);
2970
-out_kfree:
2971
- free_sa_defrag_extent(new);
2972
- return NULL;
2580
+ return ret;
29732581 }
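
The refactor above replaces nine per-field setters on the mapped leaf with one item built on the stack and copied in wholesale, which is the point of the stack_fi parameter. A userspace sketch of the pattern (the struct layout here is illustrative; the real on-disk item is little-endian and defined in ctree.h):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct file_extent_item {  /* stand-in for btrfs_file_extent_item */
        uint64_t generation, disk_bytenr, disk_num_bytes;
        uint64_t num_bytes, ram_bytes;
        uint8_t type, compression;
} __attribute__((packed));

int main(void)
{
        unsigned char leaf[4096];  /* stand-in for the extent buffer */
        size_t item_off = 512;     /* stand-in for btrfs_item_ptr_offset() */

        /* build the whole item on the stack first... */
        struct file_extent_item stack_fi = {
                .generation = 1234, .disk_bytenr = 1 << 20,
                .disk_num_bytes = 4096, .num_bytes = 4096, .ram_bytes = 4096,
                .type = 1 /* REG */, .compression = 0,
        };

        /* ...then land it with a single copy, as write_extent_buffer() does */
        memcpy(leaf + item_off, &stack_fi, sizeof(stack_fi));
        printf("wrote %zu bytes at offset %zu\n", sizeof(stack_fi), item_off);
        return 0;
}
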
29742582
29752583 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
29762584 u64 start, u64 len)
29772585 {
2978
- struct btrfs_block_group_cache *cache;
2586
+ struct btrfs_block_group *cache;
29792587
29802588 cache = btrfs_lookup_block_group(fs_info, start);
29812589 ASSERT(cache);
....@@ -2987,7 +2595,33 @@
29872595 btrfs_put_block_group(cache);
29882596 }
29892597
2990
-/* as ordered data IO finishes, this gets called so we can finish
2598
+static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
2599
+ struct btrfs_ordered_extent *oe)
2600
+{
2601
+ struct btrfs_file_extent_item stack_fi;
2602
+ u64 logical_len;
2603
+
2604
+ memset(&stack_fi, 0, sizeof(stack_fi));
2605
+ btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
2606
+ btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
2607
+ btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
2608
+ oe->disk_num_bytes);
2609
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
2610
+ logical_len = oe->truncated_len;
2611
+ else
2612
+ logical_len = oe->num_bytes;
2613
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len);
2614
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len);
2615
+ btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
2616
+ /* Encryption and other encoding are reserved and all 0 */
2617
+
2618
+ return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
2619
+ oe->file_offset, &stack_fi,
2620
+ oe->qgroup_rsv);
2621
+}
2622
+
2623
+/*
2624
+ * As ordered data IO finishes, this gets called so we can finish
29912625 * an ordered extent if the range of bytes in the file it covers are
29922626 * fully written.
29932627 */
....@@ -2999,32 +2633,33 @@
29992633 struct btrfs_trans_handle *trans = NULL;
30002634 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
30012635 struct extent_state *cached_state = NULL;
3002
- struct new_sa_defrag_extent *new = NULL;
2636
+ u64 start, end;
30032637 int compress_type = 0;
30042638 int ret = 0;
3005
- u64 logical_len = ordered_extent->len;
3006
- bool nolock;
2639
+ u64 logical_len = ordered_extent->num_bytes;
2640
+ bool freespace_inode;
30072641 bool truncated = false;
30082642 bool range_locked = false;
30092643 bool clear_new_delalloc_bytes = false;
30102644 bool clear_reserved_extent = true;
2645
+ unsigned int clear_bits;
2646
+
2647
+ start = ordered_extent->file_offset;
2648
+ end = start + ordered_extent->num_bytes - 1;
30112649
30122650 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
30132651 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
30142652 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
30152653 clear_new_delalloc_bytes = true;
30162654
3017
- nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2655
+ freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
30182656
30192657 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
30202658 ret = -EIO;
30212659 goto out;
30222660 }
30232661
3024
- btrfs_free_io_failure_record(BTRFS_I(inode),
3025
- ordered_extent->file_offset,
3026
- ordered_extent->file_offset +
3027
- ordered_extent->len - 1);
2662
+ btrfs_free_io_failure_record(BTRFS_I(inode), start, end);
30282663
30292664 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
30302665 truncated = true;
....@@ -3037,16 +2672,9 @@
30372672 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
30382673 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
30392674
3040
- /*
3041
- * For mwrite(mmap + memset to write) case, we still reserve
3042
- * space for NOCOW range.
3043
- * As NOCOW won't cause a new delayed ref, just free the space
3044
- */
3045
- btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3046
- ordered_extent->len);
3047
- btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3048
- if (nolock)
3049
- trans = btrfs_join_transaction_nolock(root);
2675
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
2676
+ if (freespace_inode)
2677
+ trans = btrfs_join_transaction_spacecache(root);
30502678 else
30512679 trans = btrfs_join_transaction(root);
30522680 if (IS_ERR(trans)) {
....@@ -3062,26 +2690,10 @@
30622690 }
30632691
30642692 range_locked = true;
3065
- lock_extent_bits(io_tree, ordered_extent->file_offset,
3066
- ordered_extent->file_offset + ordered_extent->len - 1,
3067
- &cached_state);
2693
+ lock_extent_bits(io_tree, start, end, &cached_state);
30682694
3069
- ret = test_range_bit(io_tree, ordered_extent->file_offset,
3070
- ordered_extent->file_offset + ordered_extent->len - 1,
3071
- EXTENT_DEFRAG, 0, cached_state);
3072
- if (ret) {
3073
- u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
3074
- if (0 && last_snapshot >= BTRFS_I(inode)->generation)
3075
- /* the inode is shared */
3076
- new = record_old_file_extents(inode, ordered_extent);
3077
-
3078
- clear_extent_bit(io_tree, ordered_extent->file_offset,
3079
- ordered_extent->file_offset + ordered_extent->len - 1,
3080
- EXTENT_DEFRAG, 0, 0, &cached_state);
3081
- }
3082
-
3083
- if (nolock)
3084
- trans = btrfs_join_transaction_nolock(root);
2695
+ if (freespace_inode)
2696
+ trans = btrfs_join_transaction_spacecache(root);
30852697 else
30862698 trans = btrfs_join_transaction(root);
30872699 if (IS_ERR(trans)) {
....@@ -3096,43 +2708,35 @@
30962708 compress_type = ordered_extent->compress_type;
30972709 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
30982710 BUG_ON(compress_type);
3099
- btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3100
- ordered_extent->len);
31012711 ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
31022712 ordered_extent->file_offset,
31032713 ordered_extent->file_offset +
31042714 logical_len);
31052715 } else {
31062716 BUG_ON(root == fs_info->tree_root);
3107
- ret = insert_reserved_file_extent(trans, inode,
3108
- ordered_extent->file_offset,
3109
- ordered_extent->start,
3110
- ordered_extent->disk_len,
3111
- logical_len, logical_len,
3112
- compress_type, 0, 0,
3113
- BTRFS_FILE_EXTENT_REG);
2717
+ ret = insert_ordered_extent_file_extent(trans, ordered_extent);
31142718 if (!ret) {
31152719 clear_reserved_extent = false;
31162720 btrfs_release_delalloc_bytes(fs_info,
3117
- ordered_extent->start,
3118
- ordered_extent->disk_len);
2721
+ ordered_extent->disk_bytenr,
2722
+ ordered_extent->disk_num_bytes);
31192723 }
31202724 }
31212725 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3122
- ordered_extent->file_offset, ordered_extent->len,
3123
- trans->transid);
2726
+ ordered_extent->file_offset,
2727
+ ordered_extent->num_bytes, trans->transid);
31242728 if (ret < 0) {
31252729 btrfs_abort_transaction(trans, ret);
31262730 goto out;
31272731 }
31282732
3129
- ret = add_pending_csums(trans, inode, &ordered_extent->list);
2733
+ ret = add_pending_csums(trans, &ordered_extent->list);
31302734 if (ret) {
31312735 btrfs_abort_transaction(trans, ret);
31322736 goto out;
31332737 }
31342738
3135
- btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2739
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
31362740 ret = btrfs_update_inode_fallback(trans, root, inode);
31372741 if (ret) { /* -ENOMEM or corruption */
31382742 btrfs_abort_transaction(trans, ret);
....@@ -3140,27 +2744,20 @@
31402744 }
31412745 ret = 0;
31422746 out:
3143
- if (range_locked || clear_new_delalloc_bytes) {
3144
- unsigned int clear_bits = 0;
3145
-
3146
- if (range_locked)
3147
- clear_bits |= EXTENT_LOCKED;
3148
- if (clear_new_delalloc_bytes)
3149
- clear_bits |= EXTENT_DELALLOC_NEW;
3150
- clear_extent_bit(&BTRFS_I(inode)->io_tree,
3151
- ordered_extent->file_offset,
3152
- ordered_extent->file_offset +
3153
- ordered_extent->len - 1,
3154
- clear_bits,
3155
- (clear_bits & EXTENT_LOCKED) ? 1 : 0,
3156
- 0, &cached_state);
3157
- }
2747
+ clear_bits = EXTENT_DEFRAG;
2748
+ if (range_locked)
2749
+ clear_bits |= EXTENT_LOCKED;
2750
+ if (clear_new_delalloc_bytes)
2751
+ clear_bits |= EXTENT_DELALLOC_NEW;
2752
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits,
2753
+ (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
2754
+ &cached_state);
31582755
31592756 if (trans)
31602757 btrfs_end_transaction(trans);
31612758
31622759 if (ret || truncated) {
3163
- u64 start, end;
2760
+ u64 unwritten_start = start;
31642761
31652762 /*
31662763 * If we failed to finish this ordered extent for any reason we
....@@ -3175,14 +2772,11 @@
31752772 mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
31762773
31772774 if (truncated)
3178
- start = ordered_extent->file_offset + logical_len;
3179
- else
3180
- start = ordered_extent->file_offset;
3181
- end = ordered_extent->file_offset + ordered_extent->len - 1;
3182
- clear_extent_uptodate(io_tree, start, end, NULL);
2775
+ unwritten_start += logical_len;
2776
+ clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
31832777
31842778 /* Drop the cache for the part of the extent we didn't write. */
3185
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
2779
+ btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0);
31862780
31872781 /*
31882782 * If the ordered extent had an IOERR or something else went
....@@ -3197,28 +2791,27 @@
31972791 if ((ret || !logical_len) &&
31982792 clear_reserved_extent &&
31992793 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3200
- !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2794
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2795
+ /*
2796
+ * Discard the range before returning it back to the
2797
+ * free space pool
2798
+ */
2799
+ if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
2800
+ btrfs_discard_extent(fs_info,
2801
+ ordered_extent->disk_bytenr,
2802
+ ordered_extent->disk_num_bytes,
2803
+ NULL);
32012804 btrfs_free_reserved_extent(fs_info,
3202
- ordered_extent->start,
3203
- ordered_extent->disk_len, 1);
2805
+ ordered_extent->disk_bytenr,
2806
+ ordered_extent->disk_num_bytes, 1);
2807
+ }
32042808 }
3205
-
32062809
32072810 /*
32082811 * This needs to be done to make sure anybody waiting knows we are done
32092812 * updating everything for this ordered extent.
32102813 */
3211
- btrfs_remove_ordered_extent(inode, ordered_extent);
3212
-
3213
- /* for snapshot-aware defrag */
3214
- if (new) {
3215
- if (ret) {
3216
- free_sa_defrag_extent(new);
3217
- atomic_dec(&fs_info->defrag_running);
3218
- } else {
3219
- relink_file_extents(new);
3220
- }
3221
- }
2814
+ btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent);
32222815
32232816 /* once for us */
32242817 btrfs_put_ordered_extent(ordered_extent);
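
The cleanup consolidation in this function's out: label always clears EXTENT_DEFRAG now and ORs in the optional bits, instead of guarding the whole clear_extent_bit() call. The bit composition on its own (bit values are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

#define EXTENT_DEFRAG        (1u << 0)
#define EXTENT_LOCKED        (1u << 1)
#define EXTENT_DELALLOC_NEW  (1u << 2)

static unsigned int cleanup_bits(bool range_locked, bool clear_new)
{
        unsigned int clear_bits = EXTENT_DEFRAG;  /* unconditionally dropped */

        if (range_locked)
                clear_bits |= EXTENT_LOCKED;
        if (clear_new)
                clear_bits |= EXTENT_DELALLOC_NEW;
        return clear_bits;
}

int main(void)
{
        printf("nocow/prealloc write: 0x%x\n", cleanup_bits(true, false));
        printf("plain COW write:      0x%x\n", cleanup_bits(true, true));
        return 0;
}
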
....@@ -3235,14 +2828,13 @@
32352828 btrfs_finish_ordered_io(ordered_extent);
32362829 }
32372830
3238
-static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
3239
- struct extent_state *state, int uptodate)
2831
+void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
2832
+ u64 end, int uptodate)
32402833 {
3241
- struct inode *inode = page->mapping->host;
3242
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2834
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
2835
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
32432836 struct btrfs_ordered_extent *ordered_extent = NULL;
32442837 struct btrfs_workqueue *wq;
3245
- btrfs_work_func_t func;
32462838
32472839 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
32482840
....@@ -3251,34 +2843,34 @@
32512843 end - start + 1, uptodate))
32522844 return;
32532845
3254
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
2846
+ if (btrfs_is_free_space_inode(inode))
32552847 wq = fs_info->endio_freespace_worker;
3256
- func = btrfs_freespace_write_helper;
3257
- } else {
2848
+ else
32582849 wq = fs_info->endio_write_workers;
3259
- func = btrfs_endio_write_helper;
3260
- }
32612850
3262
- btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3263
- NULL);
2851
+ btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
32642852 btrfs_queue_work(wq, &ordered_extent->work);
32652853 }
32662854
3267
-static int __readpage_endio_check(struct inode *inode,
3268
- struct btrfs_io_bio *io_bio,
3269
- int icsum, struct page *page,
3270
- int pgoff, u64 start, size_t len)
2855
+static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
2856
+ int icsum, struct page *page, int pgoff, u64 start,
2857
+ size_t len)
32712858 {
2859
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2860
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
32722861 char *kaddr;
3273
- u32 csum_expected;
3274
- u32 csum = ~(u32)0;
2862
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
2863
+ u8 *csum_expected;
2864
+ u8 csum[BTRFS_CSUM_SIZE];
32752865
3276
- csum_expected = *(((u32 *)io_bio->csum) + icsum);
2866
+ csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size;
32772867
32782868 kaddr = kmap_atomic(page);
3279
- csum = btrfs_csum_data(kaddr + pgoff, csum, len);
3280
- btrfs_csum_final(csum, (u8 *)&csum);
3281
- if (csum != csum_expected)
2869
+ shash->tfm = fs_info->csum_shash;
2870
+
2871
+ crypto_shash_digest(shash, kaddr + pgoff, len, csum);
2872
+
2873
+ if (memcmp(csum, csum_expected, csum_size))
32822874 goto zeroit;
32832875
32842876 kunmap_atomic(kaddr);
....@@ -3286,6 +2878,9 @@
32862878 zeroit:
32872879 btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
32882880 io_bio->mirror_num);
2881
+ if (io_bio->device)
2882
+ btrfs_dev_stat_inc_and_print(io_bio->device,
2883
+ BTRFS_DEV_STAT_CORRUPTION_ERRS);
32892884 memset(kaddr + pgoff, 1, len);
32902885 flush_dcache_page(page);
32912886 kunmap_atomic(kaddr);
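
check_data_csum()'s shape after the crypto_shash conversion: digest the block, compare csum_size bytes against the expected sum at index icsum, and poison the data on mismatch so the caller can try other mirrors. A self-contained userspace analogue, with a bitwise CRC32C standing in for the kernel's crypto_shash digest (btrfs's default csum is crc32c, but the sizes below are this sketch's, not fs_info's):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32c(const uint8_t *buf, size_t len)
{
        uint32_t crc = ~0u;

        while (len--) {
                crc ^= *buf++;
                for (int k = 0; k < 8; k++)
                        crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1));
        }
        return ~crc;
}

/* 0 on match; on mismatch the block is poisoned and -1 returned */
static int check_data_csum(uint8_t *data, size_t len,
                           const uint8_t *csums, size_t csum_size, int icsum)
{
        uint8_t csum[4];
        uint32_t c = crc32c(data, len);

        memcpy(csum, &c, sizeof(csum));
        if (!memcmp(csum, csums + (size_t)icsum * csum_size, csum_size))
                return 0;

        memset(data, 1, len);  /* kernel also flushes dcache and logs */
        return -1;
}

int main(void)
{
        uint8_t block[4096] = { 0 };
        uint8_t sums[4];
        uint32_t good = crc32c(block, sizeof(block));

        memcpy(sums, &good, sizeof(sums));
        printf("clean:   %d\n", check_data_csum(block, sizeof(block), sums, 4, 0));
        block[0] = 0xff;
        printf("corrupt: %d\n", check_data_csum(block, sizeof(block), sums, 4, 0));
        return 0;
}
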
....@@ -3297,9 +2892,8 @@
32972892 * if there's a match, we allow the bio to finish. If not, the code in
32982893 * extent_io.c will try to find good copies for us.
32992894 */
3300
-static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
3301
- u64 phy_offset, struct page *page,
3302
- u64 start, u64 end, int mirror)
2895
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
2896
+ struct page *page, u64 start, u64 end, int mirror)
33032897 {
33042898 size_t offset = start - page_offset(page);
33052899 struct inode *inode = page->mapping->host;
....@@ -3321,8 +2915,8 @@
33212915 }
33222916
33232917 phy_offset >>= inode->i_sb->s_blocksize_bits;
3324
- return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
3325
- start, (size_t)(end - start + 1));
2918
+ return check_data_csum(inode, io_bio, phy_offset, page, offset, start,
2919
+ (size_t)(end - start + 1));
33262920 }
33272921
33282922 /*
....@@ -3343,10 +2937,35 @@
33432937 if (atomic_add_unless(&inode->i_count, -1, 1))
33442938 return;
33452939
2940
+ atomic_inc(&fs_info->nr_delayed_iputs);
33462941 spin_lock(&fs_info->delayed_iput_lock);
33472942 ASSERT(list_empty(&binode->delayed_iput));
33482943 list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
33492944 spin_unlock(&fs_info->delayed_iput_lock);
2945
+ if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
2946
+ wake_up_process(fs_info->cleaner_kthread);
2947
+}
2948
+
2949
+static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
2950
+ struct btrfs_inode *inode)
2951
+{
2952
+ list_del_init(&inode->delayed_iput);
2953
+ spin_unlock(&fs_info->delayed_iput_lock);
2954
+ iput(&inode->vfs_inode);
2955
+ if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
2956
+ wake_up(&fs_info->delayed_iputs_wait);
2957
+ spin_lock(&fs_info->delayed_iput_lock);
2958
+}
2959
+
2960
+static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
2961
+ struct btrfs_inode *inode)
2962
+{
2963
+ if (!list_empty(&inode->delayed_iput)) {
2964
+ spin_lock(&fs_info->delayed_iput_lock);
2965
+ if (!list_empty(&inode->delayed_iput))
2966
+ run_delayed_iput_locked(fs_info, inode);
2967
+ spin_unlock(&fs_info->delayed_iput_lock);
2968
+ }
33502969 }
33512970
33522971 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
....@@ -3358,12 +2977,29 @@
33582977
33592978 inode = list_first_entry(&fs_info->delayed_iputs,
33602979 struct btrfs_inode, delayed_iput);
3361
- list_del_init(&inode->delayed_iput);
3362
- spin_unlock(&fs_info->delayed_iput_lock);
3363
- iput(&inode->vfs_inode);
3364
- spin_lock(&fs_info->delayed_iput_lock);
2980
+ run_delayed_iput_locked(fs_info, inode);
2981
+ cond_resched_lock(&fs_info->delayed_iput_lock);
33652982 }
33662983 spin_unlock(&fs_info->delayed_iput_lock);
2984
+}
2985
+
2986
+/**
2987
+ * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
2988
+ * @fs_info - the fs_info for this fs
2989
+ * @return - EINTR if we were killed, 0 if nothing's pending
2990
+ *
2991
+ * This will wait on any delayed iputs that are currently running with KILLABLE
2992
+ * set. Once they are all done running we will return, unless we are killed in
2993
+ * which case we return EINTR. This helps in user operations like fallocate etc
2994
+ * that might get blocked on the iputs.
2995
+ */
2996
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
2997
+{
2998
+ int ret = wait_event_killable(fs_info->delayed_iputs_wait,
2999
+ atomic_read(&fs_info->nr_delayed_iputs) == 0);
3000
+ if (ret)
3001
+ return -EINTR;
3002
+ return 0;
33673003 }
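
The new wait helper pairs the nr_delayed_iputs counter with a wait queue: queuing an iput bumps the count, running one drops it and wakes waiters when it hits zero. A userspace model with a condition variable in place of the kernel wait queue (no KILLABLE/-EINTR handling here):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static int nr_delayed_iputs;

static void add_delayed_iput(void)
{
        pthread_mutex_lock(&lock);
        nr_delayed_iputs++;
        pthread_mutex_unlock(&lock);
}

static void run_delayed_iput(void)
{
        pthread_mutex_lock(&lock);
        if (--nr_delayed_iputs == 0)
                pthread_cond_broadcast(&done);  /* wake_up(delayed_iputs_wait) */
        pthread_mutex_unlock(&lock);
}

static void wait_on_delayed_iputs(void)
{
        pthread_mutex_lock(&lock);
        while (nr_delayed_iputs != 0)
                pthread_cond_wait(&done, &lock);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        add_delayed_iput();
        run_delayed_iput();
        wait_on_delayed_iputs();  /* returns at once: nothing pending */
        puts("no pending delayed iputs");
        return 0;
}
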
33683004
33693005 /*
@@ -3471,14 +3107,13 @@
 		found_key.objectid = found_key.offset;
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
-		inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
+		inode = btrfs_iget(fs_info->sb, last_objectid, root);
 		ret = PTR_ERR_OR_ZERO(inode);
 		if (ret && ret != -ENOENT)
 			goto out;
 
 		if (ret == -ENOENT && root == fs_info->tree_root) {
 			struct btrfs_root *dead_root;
-			struct btrfs_fs_info *fs_info = root->fs_info;
 			int is_dead_root = 0;
 
 			/*
@@ -3490,18 +3125,16 @@
 			 * orphan must not get deleted.
 			 * find_dead_roots already ran before us, so if this
 			 * is a snapshot deletion, we should find the root
-			 * in the dead_roots list
+			 * in the fs_roots radix tree.
 			 */
-			spin_lock(&fs_info->trans_lock);
-			list_for_each_entry(dead_root, &fs_info->dead_roots,
-					    root_list) {
-				if (dead_root->root_key.objectid ==
-				    found_key.objectid) {
-					is_dead_root = 1;
-					break;
-				}
-			}
-			spin_unlock(&fs_info->trans_lock);
+
+			spin_lock(&fs_info->fs_roots_radix_lock);
+			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
+					(unsigned long)found_key.objectid);
+			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
+				is_dead_root = 1;
+			spin_unlock(&fs_info->fs_roots_radix_lock);
+
 			if (is_dead_root) {
 				/* prevent this orphan from being found again */
 				key.offset = found_key.objectid - 1;
@@ -3551,8 +3184,6 @@
 
 		/* this will do delete_inode and everything for us */
 		iput(inode);
-		if (ret)
-			goto out;
 	}
 	/* release the path since we're done with it */
 	btrfs_release_path(path);
@@ -3694,6 +3325,8 @@
 	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
 	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
 	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
+	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
+			round_up(i_size_read(inode), fs_info->sectorsize));
 
 	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
 	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
@@ -3764,21 +3397,14 @@
 	 * inode is not a directory, logging its parent unnecessarily.
 	 */
 	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+
 	/*
-	 * Similar reasoning for last_link_trans, needs to be set otherwise
-	 * for a case like the following:
-	 *
-	 * mkdir A
-	 * touch foo
-	 * ln foo A/bar
-	 * echo 2 > /proc/sys/vm/drop_caches
-	 * fsync foo
-	 * <power failure>
-	 *
-	 * Would result in link bar and directory A not existing after the power
-	 * failure.
+	 * Same logic as for last_unlink_trans. We don't persist the generation
+	 * of the last transaction where this inode was used for a reflink
+	 * operation, so after eviction and reloading the inode we must be
+	 * pessimistic and assume the last transaction that modified the inode.
 	 */
-	BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans;
+	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
 
 	path->slots[0]++;
 	if (inode->i_nlink != 1 ||
@@ -3827,7 +3453,6 @@
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
-		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
 		break;
@@ -3838,7 +3463,7 @@
 	case S_IFLNK:
 		inode->i_op = &btrfs_symlink_inode_operations;
 		inode_nohighmem(inode);
-		inode->i_mapping->a_ops = &btrfs_symlink_aops;
+		inode->i_mapping->a_ops = &btrfs_aops;
 		break;
 	default:
 		inode->i_op = &btrfs_special_inode_operations;
@@ -3860,45 +3485,42 @@
 {
 	struct btrfs_map_token token;
 
-	btrfs_init_map_token(&token);
+	btrfs_init_map_token(&token, leaf);
 
-	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
-	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
-	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
-				   &token);
-	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
-	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
+	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
+	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
+	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
+	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
+	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
 
-	btrfs_set_token_timespec_sec(leaf, &item->atime,
-				     inode->i_atime.tv_sec, &token);
-	btrfs_set_token_timespec_nsec(leaf, &item->atime,
-				      inode->i_atime.tv_nsec, &token);
+	btrfs_set_token_timespec_sec(&token, &item->atime,
+				     inode->i_atime.tv_sec);
+	btrfs_set_token_timespec_nsec(&token, &item->atime,
+				      inode->i_atime.tv_nsec);
 
-	btrfs_set_token_timespec_sec(leaf, &item->mtime,
-				     inode->i_mtime.tv_sec, &token);
-	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
-				      inode->i_mtime.tv_nsec, &token);
+	btrfs_set_token_timespec_sec(&token, &item->mtime,
				     inode->i_mtime.tv_sec);
+	btrfs_set_token_timespec_nsec(&token, &item->mtime,
+				      inode->i_mtime.tv_nsec);
 
-	btrfs_set_token_timespec_sec(leaf, &item->ctime,
-				     inode->i_ctime.tv_sec, &token);
-	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
-				      inode->i_ctime.tv_nsec, &token);
+	btrfs_set_token_timespec_sec(&token, &item->ctime,
+				     inode->i_ctime.tv_sec);
+	btrfs_set_token_timespec_nsec(&token, &item->ctime,
+				      inode->i_ctime.tv_nsec);
 
-	btrfs_set_token_timespec_sec(leaf, &item->otime,
-				     BTRFS_I(inode)->i_otime.tv_sec, &token);
-	btrfs_set_token_timespec_nsec(leaf, &item->otime,
-				      BTRFS_I(inode)->i_otime.tv_nsec, &token);
+	btrfs_set_token_timespec_sec(&token, &item->otime,
+				     BTRFS_I(inode)->i_otime.tv_sec);
+	btrfs_set_token_timespec_nsec(&token, &item->otime,
+				      BTRFS_I(inode)->i_otime.tv_nsec);
 
-	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
-				     &token);
-	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
-					 &token);
-	btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
-				       &token);
-	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
-	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
-	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
-	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
+	btrfs_set_token_inode_generation(&token, item,
+					 BTRFS_I(inode)->generation);
+	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
+	btrfs_set_token_inode_transid(&token, item, trans->transid);
+	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+	btrfs_set_token_inode_block_group(&token, item, 0);
 }
 
 /*
@@ -3931,7 +3553,7 @@
 
 	fill_inode_item(trans, leaf, inode_item, inode);
 	btrfs_mark_buffer_dirty(leaf);
-	btrfs_set_inode_last_trans(trans, inode);
+	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
 	ret = 0;
 failed:
 	btrfs_free_path(path);
@@ -3961,7 +3583,7 @@
 
 		ret = btrfs_delayed_update_inode(trans, root, inode);
 		if (!ret)
-			btrfs_set_inode_last_trans(trans, inode);
+			btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
 		return ret;
 	}
 
@@ -3994,9 +3616,7 @@
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_path *path;
 	int ret = 0;
-	struct extent_buffer *leaf;
 	struct btrfs_dir_item *di;
-	struct btrfs_key key;
 	u64 index;
 	u64 ino = btrfs_ino(inode);
 	u64 dir_ino = btrfs_ino(dir);
@@ -4010,16 +3630,10 @@
 	path->leave_spinning = 1;
 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
				    name, name_len, -1);
-	if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
+	if (IS_ERR_OR_NULL(di)) {
+		ret = di ? PTR_ERR(di) : -ENOENT;
 		goto err;
 	}
-	if (!di) {
-		ret = -ENOENT;
-		goto err;
-	}
-	leaf = path->nodes[0];
-	btrfs_dir_item_key_to_cpu(leaf, di, &key);
 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
 	if (ret)
 		goto err;
@@ -4072,6 +3686,17 @@
 		ret = 0;
 	else if (ret)
 		btrfs_abort_transaction(trans, ret);
+
+	/*
+	 * If we have a pending delayed iput we could end up with the final iput
+	 * being run in btrfs-cleaner context. If we have enough of these built
+	 * up we can end up burning a lot of time in btrfs-cleaner without any
+	 * way to throttle the unlinks. Since we're currently holding a ref on
+	 * the inode we can run the delayed iput here without any issues as the
+	 * final iput won't be done until after we drop the ref we're currently
+	 * holding.
+	 */
+	btrfs_run_delayed_iput(fs_info, inode);
 err:
 	btrfs_free_path(path);
 	if (ret)
@@ -4120,7 +3745,7 @@
 	 * 1 for the inode ref
 	 * 1 for the inode
 	 */
-	return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
+	return btrfs_start_transaction_fallback_global_rsv(root, 5);
 }
 
 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -4187,10 +3812,7 @@
 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
				   name, name_len, -1);
 	if (IS_ERR_OR_NULL(di)) {
-		if (!di)
-			ret = -ENOENT;
-		else
-			ret = PTR_ERR(di);
+		ret = di ? PTR_ERR(di) : -ENOENT;
 		goto out;
 	}
 
@@ -4393,18 +4015,24 @@
 	 * again is not run concurrently.
 	 */
 	spin_lock(&dest->root_item_lock);
-	root_flags = btrfs_root_flags(&dest->root_item);
-	if (dest->send_in_progress == 0) {
-		btrfs_set_root_flags(&dest->root_item,
-				     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
-		spin_unlock(&dest->root_item_lock);
-	} else {
+	if (dest->send_in_progress) {
 		spin_unlock(&dest->root_item_lock);
 		btrfs_warn(fs_info,
 			   "attempt to delete subvolume %llu during send",
 			   dest->root_key.objectid);
 		return -EPERM;
 	}
+	if (atomic_read(&dest->nr_swapfiles)) {
+		spin_unlock(&dest->root_item_lock);
+		btrfs_warn(fs_info,
+			   "attempt to delete subvolume %llu with active swapfile",
+			   root->root_key.objectid);
+		return -EPERM;
+	}
+	root_flags = btrfs_root_flags(&dest->root_item);
+	btrfs_set_root_flags(&dest->root_item,
+			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+	spin_unlock(&dest->root_item_lock);
 
 	down_write(&fs_info->subvol_sem);
 
@@ -4487,7 +4115,7 @@
 		err = ret;
 	inode->i_flags |= S_DEAD;
 out_release:
-	btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+	btrfs_subvolume_release_metadata(root, &block_rsv);
 out_up_write:
 	up_write(&fs_info->subvol_sem);
 	if (err) {
@@ -4566,31 +4194,6 @@
 	return err;
 }
 
-static int truncate_space_check(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				u64 bytes_deleted)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	int ret;
-
-	/*
-	 * This is only used to apply pressure to the enospc system, we don't
-	 * intend to use this reservation at all.
-	 */
-	bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted);
-	bytes_deleted *= fs_info->nodesize;
-	ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
-				  bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
-	if (!ret) {
-		trace_btrfs_space_reservation(fs_info, "transaction",
-					      trans->transid,
-					      bytes_deleted, 1);
-		trans->bytes_reserved += bytes_deleted;
-	}
-	return ret;
-
-}
-
 /*
  * Return this if we need to call truncate_block for the last bit of the
  * truncate.
@@ -4635,16 +4238,18 @@
 	u64 bytes_deleted = 0;
 	bool be_nice = false;
 	bool should_throttle = false;
-	bool should_end = false;
+	const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
+	struct extent_state *cached_state = NULL;
 
 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
 	/*
-	 * for non-free space inodes and ref cows, we want to back off from
-	 * time to time
+	 * For non-free space inodes and non-shareable roots, we want to back
+	 * off from time to time. This means all inodes in subvolume roots,
+	 * reloc roots, and data reloc roots.
 	 */
 	if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
-	    test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+	    test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 		be_nice = true;
 
 	path = btrfs_alloc_path();
@@ -4652,21 +4257,24 @@
 		return -ENOMEM;
 	path->reada = READA_BACK;
 
-	/*
-	 * We want to drop from the next block forward in case this new size is
-	 * not block aligned since we will be keeping the last block of the
-	 * extent just the way it is.
-	 */
-	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
-	    root == fs_info->tree_root)
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+				 &cached_state);
+
+		/*
+		 * We want to drop from the next block forward in case this
+		 * new size is not block aligned since we will be keeping the
+		 * last block of the extent just the way it is.
+		 */
 		btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
					fs_info->sectorsize),
					(u64)-1, 0);
+	}
 
 	/*
 	 * This function is also used to drop the items in the log tree before
 	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
-	 * it is used to drop the loged items. So we shouldn't kill the delayed
+	 * it is used to drop the logged items. So we shouldn't kill the delayed
 	 * items.
 	 */
 	if (min_type == 0 && root == BTRFS_I(inode)->root)
@@ -4688,7 +4296,6 @@
 		goto out;
 	}
 
-	path->leave_spinning = 1;
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0)
 		goto out;
@@ -4704,6 +4311,8 @@
 	}
 
 	while (1) {
+		u64 clear_start = 0, clear_len = 0;
+
 		fi = NULL;
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -4754,6 +4363,8 @@
 
 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 			u64 num_dec;
+
+			clear_start = found_key.offset;
 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
 			if (!del_item) {
 				u64 orig_num_bytes =
@@ -4761,11 +4372,12 @@
 				extent_num_bytes = ALIGN(new_size -
							 found_key.offset,
							 fs_info->sectorsize);
+				clear_start = ALIGN(new_size, fs_info->sectorsize);
 				btrfs_set_file_extent_num_bytes(leaf, fi,
								extent_num_bytes);
 				num_dec = (orig_num_bytes -
					   extent_num_bytes);
-				if (test_bit(BTRFS_ROOT_REF_COWS,
+				if (test_bit(BTRFS_ROOT_SHAREABLE,
					     &root->state) &&
				    extent_start != 0)
					inode_sub_bytes(inode, num_dec);
@@ -4781,11 +4393,12 @@
 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
 				if (extent_start != 0) {
 					found_extent = 1;
-					if (test_bit(BTRFS_ROOT_REF_COWS,
+					if (test_bit(BTRFS_ROOT_SHAREABLE,
						     &root->state))
						inode_sub_bytes(inode, num_dec);
 				}
 			}
+			clear_len = num_dec;
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 
 			/*
@@ -4799,7 +4412,7 @@
 
 			btrfs_set_file_extent_ram_bytes(leaf, fi, size);
 			size = btrfs_file_extent_calc_inline_size(size);
-			btrfs_truncate_item(root->fs_info, path, size, 1);
+			btrfs_truncate_item(path, size, 1);
 		} else if (!del_item) {
 			/*
 			 * We have to bail so the last_size is set to
@@ -4807,12 +4420,33 @@
 			 */
 			ret = NEED_TRUNCATE_BLOCK;
 			break;
+		} else {
+			/*
+			 * Inline extents are special, we just treat
+			 * them as a full sector worth in the file
+			 * extent tree just for simplicity sake.
+			 */
+			clear_len = fs_info->sectorsize;
 		}
 
-		if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+		if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 			inode_sub_bytes(inode, item_end + 1 - new_size);
 	}
delete:
+	/*
+	 * We use btrfs_truncate_inode_items() to clean up log trees for
+	 * multiple fsyncs, and in this case we don't want to clear the
+	 * file extent range because it's just the log.
+	 */
+	if (root == BTRFS_I(inode)->root) {
+		ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+					clear_start, clear_len);
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			break;
+		}
+	}
+
 	if (del_item)
 		last_size = found_key.offset;
 	else
@@ -4836,29 +4470,23 @@
 		should_throttle = false;
 
 		if (found_extent &&
-		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
-		     root == fs_info->tree_root)) {
-			btrfs_set_path_blocking(path);
+		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+			struct btrfs_ref ref = { 0 };
+
 			bytes_deleted += extent_num_bytes;
-			ret = btrfs_free_extent(trans, root, extent_start,
-						extent_num_bytes, 0,
-						btrfs_header_owner(leaf),
-						ino, extent_offset);
+
+			btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
+					extent_start, extent_num_bytes, 0);
+			ref.real_root = root->root_key.objectid;
+			btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
+					ino, extent_offset);
+			ret = btrfs_free_extent(trans, &ref);
 			if (ret) {
 				btrfs_abort_transaction(trans, ret);
 				break;
 			}
-			if (btrfs_should_throttle_delayed_refs(trans, fs_info))
-				btrfs_async_run_delayed_refs(fs_info,
-					trans->delayed_ref_updates * 2,
-					trans->transid, 0);
 			if (be_nice) {
-				if (truncate_space_check(trans, root,
-							 extent_num_bytes)) {
-					should_end = true;
-				}
-				if (btrfs_should_throttle_delayed_refs(trans,
-								       fs_info))
+				if (btrfs_should_throttle_delayed_refs(trans))
 					should_throttle = true;
 			}
 		}
@@ -4868,7 +4496,7 @@
 
 		if (path->slots[0] == 0 ||
 		    path->slots[0] != pending_del_slot ||
-		    should_throttle || should_end) {
+		    should_throttle) {
 			if (pending_del_nr) {
 				ret = btrfs_del_items(trans, root, path,
 						      pending_del_slot,
@@ -4880,23 +4508,24 @@
 				pending_del_nr = 0;
 			}
 			btrfs_release_path(path);
-			if (should_throttle) {
-				unsigned long updates = trans->delayed_ref_updates;
-				if (updates) {
-					trans->delayed_ref_updates = 0;
-					ret = btrfs_run_delayed_refs(trans,
-								     updates * 2);
-					if (ret)
-						break;
-				}
-			}
+
 			/*
-			 * if we failed to refill our space rsv, bail out
-			 * and let the transaction restart
+			 * We can generate a lot of delayed refs, so we need to
+			 * throttle every once and a while and make sure we're
+			 * adding enough space to keep up with the work we are
+			 * generating. Since we hold a transaction here we
+			 * can't flush, and we don't want to FLUSH_LIMIT because
+			 * we could have generated too many delayed refs to
+			 * actually allocate, so just bail if we're short and
+			 * let the normal reservation dance happen higher up.
 			 */
-			if (should_end) {
-				ret = -EAGAIN;
-				break;
+			if (should_throttle) {
+				ret = btrfs_delayed_refs_rsv_refill(fs_info,
+							BTRFS_RESERVE_NO_FLUSH);
+				if (ret) {
+					ret = -EAGAIN;
+					break;
+				}
 			}
 			goto search_again;
 		} else {
@@ -4918,22 +4547,12 @@
 		ASSERT(last_size >= new_size);
 		if (!ret && last_size > new_size)
 			last_size = new_size;
-		btrfs_ordered_update_i_size(inode, last_size, NULL);
+		btrfs_inode_safe_disk_i_size_write(inode, last_size);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
+				     (u64)-1, &cached_state);
 	}
 
 	btrfs_free_path(path);
-
-	if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) {
-		unsigned long updates = trans->delayed_ref_updates;
-		int err;
-
-		if (updates) {
-			trans->delayed_ref_updates = 0;
-			err = btrfs_run_delayed_refs(trans, updates * 2);
-			if (err)
-				ret = err;
-		}
-	}
 	return ret;
 }
 
@@ -4958,11 +4577,13 @@
 	struct extent_state *cached_state = NULL;
 	struct extent_changeset *data_reserved = NULL;
 	char *kaddr;
+	bool only_release_metadata = false;
 	u32 blocksize = fs_info->sectorsize;
 	pgoff_t index = from >> PAGE_SHIFT;
 	unsigned offset = from & (blocksize - 1);
 	struct page *page;
 	gfp_t mask = btrfs_alloc_write_mask(mapping);
+	size_t write_bytes = blocksize;
 	int ret = 0;
 	u64 block_start;
 	u64 block_end;
@@ -4974,15 +4595,28 @@
 	block_start = round_down(from, blocksize);
 	block_end = block_start + blocksize - 1;
 
-	ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
-					   block_start, blocksize);
-	if (ret)
+	ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved,
+					  block_start, blocksize);
+	if (ret < 0) {
+		if (btrfs_check_nocow_lock(BTRFS_I(inode), block_start,
+					   &write_bytes) > 0) {
+			/* For nocow case, no need to reserve data space */
+			only_release_metadata = true;
+		} else {
+			goto out;
+		}
+	}
+	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize);
+	if (ret < 0) {
+		if (!only_release_metadata)
+			btrfs_free_reserved_data_space(BTRFS_I(inode),
+					data_reserved, block_start, blocksize);
 		goto out;
-
+	}
again:
 	page = find_or_create_page(mapping, index, mask);
 	if (!page) {
-		btrfs_delalloc_release_space(inode, data_reserved,
+		btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
 					     block_start, blocksize, true);
 		btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
 		ret = -ENOMEM;
@@ -5007,24 +4641,23 @@
 	lock_extent_bits(io_tree, block_start, block_end, &cached_state);
 	set_page_extent_mapped(page);
 
-	ordered = btrfs_lookup_ordered_extent(inode, block_start);
+	ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), block_start);
 	if (ordered) {
 		unlock_extent_cached(io_tree, block_start, block_end,
 				     &cached_state);
 		unlock_page(page);
 		put_page(page);
-		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_start_ordered_extent(ordered, 1);
 		btrfs_put_ordered_extent(ordered);
 		goto again;
 	}
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
-			 EXTENT_DIRTY | EXTENT_DELALLOC |
-			 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-			 0, 0, &cached_state);
+			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+			 0, 0, &cached_state);
 
-	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
-					&cached_state, 0);
+	ret = btrfs_set_extent_delalloc(BTRFS_I(inode), block_start, block_end, 0,
+					&cached_state);
 	if (ret) {
 		unlock_extent_cached(io_tree, block_start, block_end,
 				     &cached_state);
@@ -5048,14 +4681,26 @@
 	set_page_dirty(page);
 	unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
 
+	if (only_release_metadata)
+		set_extent_bit(&BTRFS_I(inode)->io_tree, block_start,
+			       block_end, EXTENT_NORESERVE, NULL, NULL,
+			       GFP_NOFS);
+
out_unlock:
-	if (ret)
-		btrfs_delalloc_release_space(inode, data_reserved, block_start,
-					     blocksize, true);
+	if (ret) {
+		if (only_release_metadata)
+			btrfs_delalloc_release_metadata(BTRFS_I(inode),
+					blocksize, true);
+		else
+			btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
+					block_start, blocksize, true);
+	}
 	btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
 	unlock_page(page);
 	put_page(page);
out:
+	if (only_release_metadata)
+		btrfs_check_nocow_unlock(BTRFS_I(inode));
 	extent_changeset_free(data_reserved);
 	return ret;
 }
@@ -5137,25 +4782,12 @@
 	if (size <= hole_start)
 		return 0;
 
-	while (1) {
-		struct btrfs_ordered_extent *ordered;
-
-		lock_extent_bits(io_tree, hole_start, block_end - 1,
-				 &cached_state);
-		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start,
-						     block_end - hole_start);
-		if (!ordered)
-			break;
-		unlock_extent_cached(io_tree, hole_start, block_end - 1,
-				     &cached_state);
-		btrfs_start_ordered_extent(inode, ordered, 1);
-		btrfs_put_ordered_extent(ordered);
-	}
-
+	btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start,
+					   block_end - 1, &cached_state);
 	cur_offset = hole_start;
 	while (1) {
 		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
-				      block_end - cur_offset, 0);
+				      block_end - cur_offset);
 		if (IS_ERR(em)) {
 			err = PTR_ERR(em);
 			em = NULL;
@@ -5163,14 +4795,21 @@
 		}
 		last_byte = min(extent_map_end(em), block_end);
 		last_byte = ALIGN(last_byte, fs_info->sectorsize);
+		hole_size = last_byte - cur_offset;
+
 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
 			struct extent_map *hole_em;
-			hole_size = last_byte - cur_offset;
 
 			err = maybe_insert_hole(root, inode, cur_offset,
 						hole_size);
 			if (err)
 				break;
+
+			err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+					cur_offset, hole_size);
+			if (err)
+				break;
+
 			btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
 						cur_offset + hole_size - 1, 0);
 			hole_em = alloc_extent_map();
@@ -5187,7 +4826,6 @@
 			hole_em->block_len = 0;
 			hole_em->orig_block_len = 0;
 			hole_em->ram_bytes = hole_size;
-			hole_em->bdev = fs_info->fs_devices->latest_bdev;
 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
 			hole_em->generation = fs_info->generation;
@@ -5203,6 +4841,11 @@
 						hole_size - 1, 0);
 			}
 			free_extent_map(hole_em);
+		} else {
+			err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+					cur_offset, hole_size);
+			if (err)
+				break;
 		}
next:
 		free_extent_map(em);
@@ -5246,42 +4889,39 @@
 		 * truncation, it must capture all writes that happened before
 		 * this truncation.
 		 */
-		btrfs_wait_for_snapshot_creation(root);
+		btrfs_drew_write_lock(&root->snapshot_lock);
 		ret = btrfs_cont_expand(inode, oldsize, newsize);
 		if (ret) {
-			btrfs_end_write_no_snapshotting(root);
+			btrfs_drew_write_unlock(&root->snapshot_lock);
 			return ret;
 		}
 
 		trans = btrfs_start_transaction(root, 1);
 		if (IS_ERR(trans)) {
-			btrfs_end_write_no_snapshotting(root);
+			btrfs_drew_write_unlock(&root->snapshot_lock);
 			return PTR_ERR(trans);
 		}
 
 		i_size_write(inode, newsize);
-		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+		btrfs_inode_safe_disk_i_size_write(inode, 0);
 		pagecache_isize_extended(inode, oldsize, newsize);
 		ret = btrfs_update_inode(trans, root, inode);
-		btrfs_end_write_no_snapshotting(root);
+		btrfs_drew_write_unlock(&root->snapshot_lock);
 		btrfs_end_transaction(trans);
 	} else {
 
 		/*
 		 * We're truncating a file that used to have good data down to
-		 * zero. Make sure it gets into the ordered flush list so that
-		 * any new writes get down to disk quickly.
+		 * zero. Make sure any new writes to the file get on disk
+		 * on close.
 		 */
 		if (newsize == 0)
-			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
				&BTRFS_I(inode)->runtime_flags);
 
 		truncate_setsize(inode, newsize);
 
-		/* Disable nonlocked read DIO to avoid the end less truncate */
-		btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
 		inode_dio_wait(inode);
-		btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
 
 		ret = btrfs_truncate(inode, newsize == oldsize);
 		if (ret && inode->i_nlink) {
@@ -5356,10 +4996,10 @@
 	truncate_inode_pages_final(&inode->i_data);
 
 	write_lock(&map_tree->lock);
-	while (!RB_EMPTY_ROOT(&map_tree->map)) {
+	while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
 		struct extent_map *em;
 
-		node = rb_first(&map_tree->map);
+		node = rb_first_cached(&map_tree->map);
 		em = rb_entry(node, struct extent_map, rb_node);
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
@@ -5375,8 +5015,8 @@
 
 	/*
 	 * Keep looping until we have no more ranges in the io tree.
-	 * We can have ongoing bios started by readpages (called from readahead)
-	 * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+	 * We can have ongoing bios started by readahead that have
+	 * their endio callback (extent_io.c:end_bio_extent_readpage)
 	 * still in progress (unlocked the pages in the bio but did not yet
 	 * unlocked the ranges in the io tree). Therefore this means some
 	 * ranges can still be locked and eviction started because before
@@ -5415,12 +5055,13 @@
 		 * Note, end is the bytenr of last byte, so we need + 1 here.
 		 */
 		if (state_flags & EXTENT_DELALLOC)
-			btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
+			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
+					       end - start + 1);
 
 		clear_extent_bit(io_tree, start, end,
-				 EXTENT_LOCKED | EXTENT_DIRTY |
-				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
-				 EXTENT_DEFRAG, 1, 1, &cached_state);
+				 EXTENT_LOCKED | EXTENT_DELALLOC |
+				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+				 &cached_state);
 
 		cond_resched();
 		spin_lock(&io_tree->lock);
@@ -5429,43 +5070,54 @@
 }
 
 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
-							struct btrfs_block_rsv *rsv,
-							u64 min_size)
+							struct btrfs_block_rsv *rsv)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-	int failures = 0;
+	struct btrfs_trans_handle *trans;
+	u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
+	int ret;
 
-	for (;;) {
-		struct btrfs_trans_handle *trans;
-		int ret;
-
-		ret = btrfs_block_rsv_refill(root, rsv, min_size,
-					     BTRFS_RESERVE_FLUSH_LIMIT);
-
-		if (ret && ++failures > 2) {
-			btrfs_warn(fs_info,
-				   "could not allocate space for a delete; will truncate on mount");
-			return ERR_PTR(-ENOSPC);
-		}
-
-		trans = btrfs_join_transaction(root);
-		if (IS_ERR(trans) || !ret)
-			return trans;
-
+	/*
+	 * Eviction should be taking place at some place safe because of our
+	 * delayed iputs. However the normal flushing code will run delayed
+	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
+	 *
+	 * We reserve the delayed_refs_extra here again because we can't use
+	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
+	 * above. We reserve our extra bit here because we generate a ton of
+	 * delayed refs activity by truncating.
+	 *
+	 * If we cannot make our reservation we'll attempt to steal from the
+	 * global reserve, because we really want to be able to free up space.
+	 */
+	ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra,
+				     BTRFS_RESERVE_FLUSH_EVICT);
+	if (ret) {
 		/*
 		 * Try to steal from the global reserve if there is space for
 		 * it.
 		 */
-		if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
-		    !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0))
-			return trans;
-
-		/* If not, commit and try again. */
-		ret = btrfs_commit_transaction(trans);
-		if (ret)
-			return ERR_PTR(ret);
+		if (btrfs_check_space_for_delayed_refs(fs_info) ||
+		    btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) {
+			btrfs_warn(fs_info,
+				   "could not allocate space for delete; will truncate on mount");
+			return ERR_PTR(-ENOSPC);
+		}
+		delayed_refs_extra = 0;
 	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans))
+		return trans;
+
+	if (delayed_refs_extra) {
+		trans->block_rsv = &fs_info->trans_block_rsv;
+		trans->bytes_reserved = delayed_refs_extra;
+		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
+					delayed_refs_extra, 1);
+	}
+	return trans;
 }
 
 void btrfs_evict_inode(struct inode *inode)
@@ -5474,7 +5126,6 @@
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv;
-	u64 min_size;
 	int ret;
 
 	trace_btrfs_inode_evict(inode);
@@ -5483,8 +5134,6 @@
 		clear_inode(inode);
 		return;
 	}
-
-	min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
 
 	evict_inode_truncate_pages(inode);
 
@@ -5496,9 +5145,6 @@
 
 	if (is_bad_inode(inode))
 		goto no_delete;
-	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
-	if (!special_file(inode->i_mode))
-		btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
 	btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
 
@@ -5518,13 +5164,13 @@
 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv)
 		goto no_delete;
-	rsv->size = min_size;
+	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
 	rsv->failfast = 1;
 
 	btrfs_i_size_write(BTRFS_I(inode), 0);
 
 	while (1) {
-		trans = evict_refill_and_join(root, rsv, min_size);
+		trans = evict_refill_and_join(root, rsv);
 		if (IS_ERR(trans))
 			goto free_rsv;
 
@@ -5549,7 +5195,7 @@
 	 * If it turns out that we are dropping too many of these, we might want
 	 * to add a mechanism for retrying these after a commit.
 	 */
-	trans = evict_refill_and_join(root, rsv, min_size);
+	trans = evict_refill_and_join(root, rsv);
 	if (!IS_ERR(trans)) {
 		trans->block_rsv = rsv;
 		btrfs_orphan_del(trans, BTRFS_I(inode));
@@ -5596,12 +5242,8 @@
 
 	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
				   name, namelen, 0);
-	if (!di) {
-		ret = -ENOENT;
-		goto out;
-	}
-	if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
+	if (IS_ERR_OR_NULL(di)) {
+		ret = di ? PTR_ERR(di) : -ENOENT;
 		goto out;
 	}
 
@@ -5672,7 +5314,7 @@
 
 	btrfs_release_path(path);
 
-	new_root = btrfs_read_fs_root_no_name(fs_info, location);
+	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
 	if (IS_ERR(new_root)) {
 		err = PTR_ERR(new_root);
 		goto out;
@@ -5724,15 +5366,15 @@
 	spin_unlock(&root->inode_lock);
 }
 
-static void inode_tree_del(struct inode *inode)
+static void inode_tree_del(struct btrfs_inode *inode)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_root *root = inode->root;
 	int empty = 0;
 
 	spin_lock(&root->inode_lock);
-	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
-		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
-		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+	if (!RB_EMPTY_NODE(&inode->rb_node)) {
+		rb_erase(&inode->rb_node, &root->inode_tree);
+		RB_CLEAR_NODE(&inode->rb_node);
 		empty = RB_EMPTY_ROOT(&root->inode_tree);
 	}
 	spin_unlock(&root->inode_lock);
@@ -5750,29 +5392,32 @@
 static int btrfs_init_locked_inode(struct inode *inode, void *p)
 {
 	struct btrfs_iget_args *args = p;
-	inode->i_ino = args->location->objectid;
-	memcpy(&BTRFS_I(inode)->location, args->location,
-	       sizeof(*args->location));
-	BTRFS_I(inode)->root = args->root;
+
+	inode->i_ino = args->ino;
+	BTRFS_I(inode)->location.objectid = args->ino;
+	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+	BTRFS_I(inode)->location.offset = 0;
+	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
+	BUG_ON(args->root && !BTRFS_I(inode)->root);
 	return 0;
 }
 
 static int btrfs_find_actor(struct inode *inode, void *opaque)
 {
 	struct btrfs_iget_args *args = opaque;
-	return args->location->objectid == BTRFS_I(inode)->location.objectid &&
+
+	return args->ino == BTRFS_I(inode)->location.objectid &&
 		args->root == BTRFS_I(inode)->root;
 }
 
-static struct inode *btrfs_iget_locked(struct super_block *s,
-				       struct btrfs_key *location,
+static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
				       struct btrfs_root *root)
 {
 	struct inode *inode;
 	struct btrfs_iget_args args;
-	unsigned long hashval = btrfs_inode_hash(location->objectid, root);
+	unsigned long hashval = btrfs_inode_hash(ino, root);
 
-	args.location = location;
+	args.ino = ino;
 	args.root = root;
 
 	inode = iget5_locked(s, hashval, btrfs_find_actor,
@@ -5781,16 +5426,18 @@
 		return inode;
 }
 
-/* Get an inode object given its location and corresponding root.
- * Returns in *is_new if the inode was read from disk
+/*
+ * Get an inode object given its inode number and corresponding root.
+ * Path can be preallocated to prevent recursing back to iget through
+ * allocator. NULL is also valid but may require an additional allocation
+ * later.
  */
-struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
-			      struct btrfs_root *root, int *new,
-			      struct btrfs_path *path)
+struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
+			      struct btrfs_root *root, struct btrfs_path *path)
 {
 	struct inode *inode;
 
-	inode = btrfs_iget_locked(s, location, root);
+	inode = btrfs_iget_locked(s, ino, root);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
@@ -5801,8 +5448,6 @@
 	if (!ret) {
 		inode_tree_add(inode);
 		unlock_new_inode(inode);
-		if (new)
-			*new = 1;
 	} else {
 		iget_failed(inode);
 		/*
@@ -5819,10 +5464,9 @@
 	return inode;
 }
 
-struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root, int *new)
+struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
 {
-	return btrfs_iget_path(s, location, root, new, NULL);
+	return btrfs_iget_path(s, ino, root, NULL);
 }
 
 static struct inode *new_simple_dir(struct super_block *s,
@@ -5834,12 +5478,16 @@
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
-	BTRFS_I(inode)->root = root;
+	BTRFS_I(inode)->root = btrfs_grab_root(root);
 	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
 	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
 
 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
-	inode->i_op = &btrfs_dir_ro_inode_operations;
+	/*
+	 * We only need lookup, the rest is read-only and there's no inode
+	 * associated with the dentry
+	 */
+	inode->i_op = &simple_dir_inode_operations;
 	inode->i_opflags &= ~IOP_XATTR;
 	inode->i_fop = &simple_dir_operations;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
@@ -5853,7 +5501,20 @@
 
 static inline u8 btrfs_inode_type(struct inode *inode)
 {
-	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
+	/*
+	 * Compile-time asserts that generic FT_* types still match
+	 * BTRFS_FT_* types
+	 */
+	BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
+	BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
+	BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
+	BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
+	BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
+	BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
+	BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
+	BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
+
+	return fs_umode_to_ftype(inode->i_mode);
 }
 
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
58595520 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
....@@ -5864,7 +5525,6 @@
58645525 struct btrfs_root *sub_root = root;
58655526 struct btrfs_key location;
58665527 u8 di_type = 0;
5867
- int index;
58685528 int ret = 0;
58695529
58705530 if (dentry->d_name.len > BTRFS_NAME_LEN)
....@@ -5875,7 +5535,7 @@
58755535 return ERR_PTR(ret);
58765536
58775537 if (location.type == BTRFS_INODE_ITEM_KEY) {
5878
- inode = btrfs_iget(dir->i_sb, &location, root, NULL);
5538
+ inode = btrfs_iget(dir->i_sb, location.objectid, root);
58795539 if (IS_ERR(inode))
58805540 return inode;
58815541
....@@ -5891,7 +5551,6 @@
58915551 return inode;
58925552 }
58935553
5894
- index = srcu_read_lock(&fs_info->subvol_srcu);
58955554 ret = fixup_tree_root_location(fs_info, dir, dentry,
58965555 &location, &sub_root);
58975556 if (ret < 0) {
....@@ -5900,9 +5559,10 @@
59005559 else
59015560 inode = new_simple_dir(dir->i_sb, &location, sub_root);
59025561 } else {
5903
- inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
5562
+ inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
59045563 }
5905
- srcu_read_unlock(&fs_info->subvol_srcu, index);
5564
+ if (root != sub_root)
5565
+ btrfs_put_root(sub_root);
59065566
59075567 if (!IS_ERR(inode) && root != sub_root) {
59085568 down_read(&fs_info->cleanup_work_sem);
....@@ -5940,22 +5600,12 @@
59405600 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
59415601 unsigned int flags)
59425602 {
5943
- struct inode *inode;
5603
+ struct inode *inode = btrfs_lookup_dentry(dir, dentry);
59445604
5945
- inode = btrfs_lookup_dentry(dir, dentry);
5946
- if (IS_ERR(inode)) {
5947
- if (PTR_ERR(inode) == -ENOENT)
5948
- inode = NULL;
5949
- else
5950
- return ERR_CAST(inode);
5951
- }
5952
-
5605
+ if (inode == ERR_PTR(-ENOENT))
5606
+ inode = NULL;
59535607 return d_splice_alias(inode, dentry);
59545608 }
5955
-
5956
-unsigned char btrfs_filetype_table[] = {
5957
- DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
5958
-};
59595609
59605610 /*
59615611 * All this infrastructure exists because dir_emit can fault, and we are holding
@@ -6095,7 +5745,7 @@
 		name_ptr = (char *)(entry + 1);
 		read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
				   name_len);
-		put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)],
+		put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
			      &entry->type);
 		btrfs_dir_item_key_to_cpu(leaf, di, &location);
 		put_unaligned(location.objectid, &entry->ino);
@@ -6167,7 +5817,7 @@
 		return PTR_ERR(trans);
 
 	ret = btrfs_update_inode(trans, root, inode);
-	if (ret && ret == -ENOSPC) {
+	if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
 		/* whoops, lets try again with the full transaction */
 		btrfs_end_transaction(trans);
 		trans = btrfs_start_transaction(root, 1);
@@ -6290,7 +5940,8 @@
 static int btrfs_insert_inode_locked(struct inode *inode)
 {
 	struct btrfs_iget_args args;
-	args.location = &BTRFS_I(inode)->location;
+
+	args.ino = BTRFS_I(inode)->location.objectid;
 	args.root = BTRFS_I(inode)->root;
 
 	return insert_inode_locked4(inode,
@@ -6346,13 +5997,16 @@
 	u32 sizes[2];
 	int nitems = name ? 2 : 1;
 	unsigned long ptr;
+	unsigned int nofs_flag;
 	int ret;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return ERR_PTR(-ENOMEM);
 
+	nofs_flag = memalloc_nofs_save();
 	inode = new_inode(fs_info->sb);
+	memalloc_nofs_restore(nofs_flag);
 	if (!inode) {
 		btrfs_free_path(path);
 		return ERR_PTR(-ENOMEM);
@@ -6390,7 +6044,7 @@
 	 */
 	BTRFS_I(inode)->index_cnt = 2;
 	BTRFS_I(inode)->dir_index = *index;
-	BTRFS_I(inode)->root = root;
+	BTRFS_I(inode)->root = btrfs_grab_root(root);
 	BTRFS_I(inode)->generation = trans->transid;
 	inode->i_generation = BTRFS_I(inode)->generation;
@@ -6477,7 +6131,7 @@
 	inode_tree_add(inode);
 
 	trace_btrfs_inode_new(inode);
-	btrfs_set_inode_last_trans(trans, inode);
+	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
 
 	btrfs_update_root_times(trans, root);
 
@@ -6535,8 +6189,7 @@
 	if (ret)
 		return ret;
 
-	ret = btrfs_insert_dir_item(trans, root, name, name_len,
-				    parent_inode, &key,
+	ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
				    btrfs_inode_type(&inode->vfs_inode), index);
 	if (ret == -EEXIST || ret == -EOVERFLOW)
 		goto fail_dir_item;
@@ -6620,7 +6273,7 @@
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	err = btrfs_find_free_ino(root, &objectid);
+	err = btrfs_find_free_objectid(root, &objectid);
 	if (err)
 		goto out_unlock;
@@ -6684,7 +6337,7 @@
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	err = btrfs_find_free_ino(root, &objectid);
+	err = btrfs_find_free_objectid(root, &objectid);
 	if (err)
 		goto out_unlock;
@@ -6719,7 +6372,6 @@
 	if (err)
 		goto out_unlock;
 
-	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	d_instantiate_new(dentry, inode);
 
 out_unlock:
@@ -6744,7 +6396,7 @@
 	int drop_inode = 0;
 
 	/* do not allow sys_link's with other subvols of the same device */
-	if (root->objectid != BTRFS_I(inode)->root->objectid)
+	if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
 		return -EXDEV;
 
 	if (inode->i_nlink >= BTRFS_LINK_MAX)
@@ -6782,7 +6434,6 @@
 		drop_inode = 1;
 	} else {
 		struct dentry *parent = dentry->d_parent;
-		int ret;
 
 		err = btrfs_update_inode(trans, root, inode);
 		if (err)
@@ -6796,14 +6447,8 @@
 			if (err)
 				goto fail;
 		}
-		BTRFS_I(inode)->last_link_trans = trans->transid;
 		d_instantiate(dentry, inode);
-		ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
-					 true, NULL);
-		if (ret == BTRFS_NEED_TRANS_COMMIT) {
-			err = btrfs_commit_transaction(trans);
-			trans = NULL;
-		}
+		btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
 	}
 
 fail:
@@ -6824,7 +6469,6 @@
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int err = 0;
-	int drop_on_err = 0;
 	u64 objectid = 0;
 	u64 index = 0;
@@ -6837,7 +6481,7 @@
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	err = btrfs_find_free_ino(root, &objectid);
+	err = btrfs_find_free_objectid(root, &objectid);
 	if (err)
 		goto out_fail;
@@ -6850,7 +6494,6 @@
 		goto out_fail;
 	}
 
-	drop_on_err = 1;
 	/* these must be set before we unlock the inode */
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
@@ -6871,7 +6514,6 @@
 		goto out_fail;
 
 	d_instantiate_new(dentry, inode);
-	drop_on_err = 0;
 
 out_fail:
 	btrfs_end_transaction(trans);
@@ -6929,26 +6571,34 @@
 	return ret;
 }
 
-/*
- * a bit scary, this does extent mapping from logical file offset to the disk.
- * the ugly parts come from merging extents from the disk with the in-ram
- * representation. This gets more complex because of the data=ordered code,
- * where the in-ram extents might be locked pending data=ordered completion.
+/**
+ * btrfs_get_extent - Lookup the first extent overlapping a range in a file.
+ * @inode:	file to search in
+ * @page:	page to read extent data into if the extent is inline
+ * @pg_offset:	offset into @page to copy to
+ * @start:	file offset
+ * @len:	length of range starting at @start
  *
- * This also copies inline extents directly into the page.
+ * This returns the first &struct extent_map which overlaps with the given
+ * range, reading it from the B-tree and caching it if necessary. Note that
+ * there may be more extents which overlap the given range after the returned
+ * extent_map.
+ *
+ * If @page is not NULL and the extent is inline, this also reads the extent
+ * data directly into the page and marks the extent up to date in the io_tree.
+ *
+ * Return: ERR_PTR on error, non-NULL extent_map on success.
 */
 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
-				    struct page *page,
-				    size_t pg_offset, u64 start, u64 len,
-				    int create)
+				    struct page *page, size_t pg_offset,
+				    u64 start, u64 len)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	int ret;
-	int err = 0;
+	int ret = 0;
 	u64 extent_start = 0;
 	u64 extent_end = 0;
 	u64 objectid = btrfs_ino(inode);
-	u32 found_type;
+	int extent_type = -1;
 	struct btrfs_path *path = NULL;
 	struct btrfs_root *root = inode->root;
 	struct btrfs_file_extent_item *item;
@@ -6957,12 +6607,9 @@
 	struct extent_map *em = NULL;
 	struct extent_map_tree *em_tree = &inode->extent_tree;
 	struct extent_io_tree *io_tree = &inode->io_tree;
-	const bool new_inline = !page || create;
 
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
-	if (em)
-		em->bdev = fs_info->fs_devices->latest_bdev;
 	read_unlock(&em_tree->lock);
 
 	if (em) {
@@ -6975,48 +6622,47 @@
 	}
 	em = alloc_extent_map();
 	if (!em) {
-		err = -ENOMEM;
+		ret = -ENOMEM;
 		goto out;
 	}
-	em->bdev = fs_info->fs_devices->latest_bdev;
 	em->start = EXTENT_MAP_HOLE;
 	em->orig_start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
 	em->block_len = (u64)-1;
 
+	path = btrfs_alloc_path();
 	if (!path) {
-		path = btrfs_alloc_path();
-		if (!path) {
-			err = -ENOMEM;
-			goto out;
-		}
-		/*
-		 * Chances are we'll be called again, so go ahead and do
-		 * readahead
-		 */
-		path->reada = READA_FORWARD;
-	}
-
-	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
-	if (ret < 0) {
-		err = ret;
+		ret = -ENOMEM;
 		goto out;
 	}
 
-	if (ret != 0) {
+	/* Chances are we'll be called again, so go ahead and do readahead */
+	path->reada = READA_FORWARD;
+
+	/*
+	 * Unless we're going to uncompress the inline extent, no sleep would
+	 * happen.
+	 */
+	path->leave_spinning = 1;
+
+	path->recurse = btrfs_is_free_space_inode(inode);
+
+	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
+	if (ret < 0) {
+		goto out;
+	} else if (ret > 0) {
 		if (path->slots[0] == 0)
 			goto not_found;
 		path->slots[0]--;
+		ret = 0;
 	}
 
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0],
			      struct btrfs_file_extent_item);
-	/* are we inside the extent that was found? */
 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-	found_type = found_key.type;
 	if (found_key.objectid != objectid ||
-	    found_type != BTRFS_EXTENT_DATA_KEY) {
+	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
 		/*
 		 * If we backup past the first extent we want to move forward
 		 * and see if there is an extent in front of us, otherwise we'll
@@ -7027,30 +6673,22 @@
 		goto next;
 	}
 
-	found_type = btrfs_file_extent_type(leaf, item);
+	extent_type = btrfs_file_extent_type(leaf, item);
 	extent_start = found_key.offset;
-	if (found_type == BTRFS_FILE_EXTENT_REG ||
-	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+	extent_end = btrfs_file_extent_end(path);
+	if (extent_type == BTRFS_FILE_EXTENT_REG ||
+	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		/* Only regular file could have regular/prealloc extent */
 		if (!S_ISREG(inode->vfs_inode.i_mode)) {
-			err = -EUCLEAN;
+			ret = -EUCLEAN;
 			btrfs_crit(fs_info,
 		"regular/prealloc extent found for non-regular inode %llu",
				   btrfs_ino(inode));
 			goto out;
 		}
-		extent_end = extent_start +
-		       btrfs_file_extent_num_bytes(leaf, item);
-
 		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
						       extent_start);
-	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-		size_t size;
-
-		size = btrfs_file_extent_ram_bytes(leaf, item);
-		extent_end = ALIGN(extent_start + size,
-				   fs_info->sectorsize);
-
+	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
						      path->slots[0],
						      extent_start);
@@ -7060,12 +6698,11 @@
 		path->slots[0]++;
 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 			ret = btrfs_next_leaf(root, path);
-			if (ret < 0) {
-				err = ret;
+			if (ret < 0)
 				goto out;
-			}
-			if (ret > 0)
+			else if (ret > 0)
 				goto not_found;
+
 			leaf = path->nodes[0];
 		}
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -7076,26 +6713,28 @@
 			goto not_found;
 		if (start > found_key.offset)
 			goto next;
+
+		/* New extent overlaps with existing one */
 		em->start = start;
 		em->orig_start = start;
 		em->len = found_key.offset - start;
-		goto not_found_em;
+		em->block_start = EXTENT_MAP_HOLE;
+		goto insert;
 	}
 
-	btrfs_extent_item_to_extent_map(inode, path, item,
-			new_inline, em);
+	btrfs_extent_item_to_extent_map(inode, path, item, !page, em);
 
-	if (found_type == BTRFS_FILE_EXTENT_REG ||
-	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+	if (extent_type == BTRFS_FILE_EXTENT_REG ||
+	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		goto insert;
-	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 		unsigned long ptr;
 		char *map;
 		size_t size;
 		size_t extent_offset;
 		size_t copy_size;
 
-		if (new_inline)
+		if (!page)
 			goto out;
 
 		size = btrfs_file_extent_ram_bytes(leaf, item);
@@ -7107,15 +6746,15 @@
 		em->orig_block_len = em->len;
 		em->orig_start = em->start;
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+
+		btrfs_set_path_blocking(path);
 		if (!PageUptodate(page)) {
 			if (btrfs_file_extent_compression(leaf, item) !=
			    BTRFS_COMPRESS_NONE) {
 				ret = uncompress_inline(path, page, pg_offset,
							extent_offset, item);
-				if (ret) {
-					err = ret;
+				if (ret)
 					goto out;
-				}
 			} else {
 				map = kmap(page);
 				read_extent_buffer(leaf, map + pg_offset, ptr,
....@@ -7137,49 +6776,45 @@
71376776 em->start = start;
71386777 em->orig_start = start;
71396778 em->len = len;
7140
-not_found_em:
71416779 em->block_start = EXTENT_MAP_HOLE;
71426780 insert:
6781
+ ret = 0;
71436782 btrfs_release_path(path);
71446783 if (em->start > start || extent_map_end(em) <= start) {
71456784 btrfs_err(fs_info,
71466785 "bad extent! em: [%llu %llu] passed [%llu %llu]",
71476786 em->start, em->len, start, len);
7148
- err = -EIO;
6787
+ ret = -EIO;
71496788 goto out;
71506789 }
71516790
7152
- err = 0;
71536791 write_lock(&em_tree->lock);
7154
- err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
6792
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
71556793 write_unlock(&em_tree->lock);
71566794 out:
6795
+ btrfs_free_path(path);
71576796
71586797 trace_btrfs_get_extent(root, inode, em);
71596798
7160
- btrfs_free_path(path);
7161
- if (err) {
6799
+ if (ret) {
71626800 free_extent_map(em);
7163
- return ERR_PTR(err);
6801
+ return ERR_PTR(ret);
71646802 }
7165
- BUG_ON(!em); /* Error is always set */
71666803 return em;
71676804 }
71686805
71696806 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
7170
- struct page *page,
7171
- size_t pg_offset, u64 start, u64 len,
7172
- int create)
6807
+ u64 start, u64 len)
71736808 {
71746809 struct extent_map *em;
71756810 struct extent_map *hole_em = NULL;
7176
- u64 range_start = start;
6811
+ u64 delalloc_start = start;
71776812 u64 end;
7178
- u64 found;
7179
- u64 found_end;
6813
+ u64 delalloc_len;
6814
+ u64 delalloc_end;
71806815 int err = 0;
71816816
7182
- em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
6817
+ em = btrfs_get_extent(inode, NULL, 0, start, len);
71836818 if (IS_ERR(em))
71846819 return em;
71856820 /*
....@@ -7204,80 +6839,83 @@
72046839 em = NULL;
72056840
72066841 /* ok, we didn't find anything, lets look for delalloc */
7207
- found = count_range_bits(&inode->io_tree, &range_start,
6842
+ delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
72086843 end, len, EXTENT_DELALLOC, 1);
7209
- found_end = range_start + found;
7210
- if (found_end < range_start)
7211
- found_end = (u64)-1;
6844
+ delalloc_end = delalloc_start + delalloc_len;
6845
+ if (delalloc_end < delalloc_start)
6846
+ delalloc_end = (u64)-1;
72126847
72136848 /*
7214
- * we didn't find anything useful, return
7215
- * the original results from get_extent()
6849
+ * We didn't find anything useful, return the original results from
6850
+ * get_extent()
72166851 */
7217
- if (range_start > end || found_end <= start) {
6852
+ if (delalloc_start > end || delalloc_end <= start) {
72186853 em = hole_em;
72196854 hole_em = NULL;
72206855 goto out;
72216856 }
72226857
7223
- /* adjust the range_start to make sure it doesn't
7224
- * go backwards from the start they passed in
6858
+ /*
6859
+ * Adjust the delalloc_start to make sure it doesn't go backwards from
6860
+ * the start they passed in
72256861 */
7226
- range_start = max(start, range_start);
7227
- found = found_end - range_start;
6862
+ delalloc_start = max(start, delalloc_start);
6863
+ delalloc_len = delalloc_end - delalloc_start;
72286864
7229
- if (found > 0) {
7230
- u64 hole_start = start;
7231
- u64 hole_len = len;
6865
+ if (delalloc_len > 0) {
6866
+ u64 hole_start;
6867
+ u64 hole_len;
6868
+ const u64 hole_end = extent_map_end(hole_em);
72326869
72336870 em = alloc_extent_map();
72346871 if (!em) {
72356872 err = -ENOMEM;
72366873 goto out;
72376874 }
7238
- /*
7239
- * when btrfs_get_extent can't find anything it
7240
- * returns one huge hole
7241
- *
7242
- * make sure what it found really fits our range, and
7243
- * adjust to make sure it is based on the start from
7244
- * the caller
7245
- */
7246
- if (hole_em) {
7247
- u64 calc_end = extent_map_end(hole_em);
72486875
7249
- if (calc_end <= start || (hole_em->start > end)) {
7250
- free_extent_map(hole_em);
7251
- hole_em = NULL;
7252
- } else {
7253
- hole_start = max(hole_em->start, start);
7254
- hole_len = calc_end - hole_start;
7255
- }
6876
+ ASSERT(hole_em);
6877
+ /*
6878
+ * When btrfs_get_extent can't find anything it returns one
6879
+ * huge hole
6880
+ *
6881
+ * Make sure what it found really fits our range, and adjust to
6882
+ * make sure it is based on the start from the caller
6883
+ */
6884
+ if (hole_end <= start || hole_em->start > end) {
6885
+ free_extent_map(hole_em);
6886
+ hole_em = NULL;
6887
+ } else {
6888
+ hole_start = max(hole_em->start, start);
6889
+ hole_len = hole_end - hole_start;
72566890 }
7257
- em->bdev = NULL;
7258
- if (hole_em && range_start > hole_start) {
7259
- /* our hole starts before our delalloc, so we
7260
- * have to return just the parts of the hole
7261
- * that go until the delalloc starts
6891
+
6892
+ if (hole_em && delalloc_start > hole_start) {
6893
+ /*
6894
+ * Our hole starts before our delalloc, so we have to
6895
+ * return just the parts of the hole that go until the
6896
+ * delalloc starts
72626897 */
7263
- em->len = min(hole_len,
7264
- range_start - hole_start);
6898
+ em->len = min(hole_len, delalloc_start - hole_start);
72656899 em->start = hole_start;
72666900 em->orig_start = hole_start;
72676901 /*
7268
- * don't adjust block start at all,
7269
- * it is fixed at EXTENT_MAP_HOLE
6902
+ * Don't adjust block start at all, it is fixed at
6903
+ * EXTENT_MAP_HOLE
72706904 */
72716905 em->block_start = hole_em->block_start;
72726906 em->block_len = hole_len;
72736907 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
72746908 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
72756909 } else {
7276
- em->start = range_start;
7277
- em->len = found;
7278
- em->orig_start = range_start;
6910
+ /*
6911
+ * Hole is out of passed range or it starts after
6912
+ * delalloc range
6913
+ */
6914
+ em->start = delalloc_start;
6915
+ em->len = delalloc_len;
6916
+ em->orig_start = delalloc_start;
72796917 em->block_start = EXTENT_MAP_DELALLOC;
7280
- em->block_len = found;
6918
+ em->block_len = delalloc_len;
72816919 }
72826920 } else {
72836921 return hole_em;
....@@ -7292,7 +6930,7 @@
72926930 return em;
72936931 }
72946932
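/*
 * Editor's worked example (illustrative, not part of the patch; numbers
 * assumed): say btrfs_get_extent() reported one big hole for [0, 1MiB)
 * and the io_tree has delalloc bytes in [256KiB, 512KiB).  For a query
 * at start == 0 the code above computes
 *
 *   delalloc_start = 256KiB, delalloc_len = 256KiB, hole_start = 0
 *   em->len = min(hole_len, delalloc_start - hole_start) = 256KiB
 *
 * i.e. the returned hole is trimmed to stop where the delalloc begins;
 * a follow-up query at 256KiB would then report the delalloc range with
 * block_start == EXTENT_MAP_DELALLOC.
 */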
7295
-static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
6933
+static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
72966934 const u64 start,
72976935 const u64 len,
72986936 const u64 orig_start,
....@@ -7306,21 +6944,19 @@
73066944 int ret;
73076945
73086946 if (type != BTRFS_ORDERED_NOCOW) {
7309
- em = create_io_em(inode, start, len, orig_start,
7310
- block_start, block_len, orig_block_len,
7311
- ram_bytes,
6947
+ em = create_io_em(inode, start, len, orig_start, block_start,
6948
+ block_len, orig_block_len, ram_bytes,
73126949 BTRFS_COMPRESS_NONE, /* compress_type */
73136950 type);
73146951 if (IS_ERR(em))
73156952 goto out;
73166953 }
7317
- ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
7318
- len, block_len, type);
6954
+ ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len,
6955
+ block_len, type);
73196956 if (ret) {
73206957 if (em) {
73216958 free_extent_map(em);
7322
- btrfs_drop_extent_cache(BTRFS_I(inode), start,
7323
- start + len - 1, 0);
6959
+ btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
73246960 }
73256961 em = ERR_PTR(ret);
73266962 }
....@@ -7329,11 +6965,11 @@
73296965 return em;
73306966 }
73316967
7332
-static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
6968
+static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
73336969 u64 start, u64 len)
73346970 {
7335
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7336
- struct btrfs_root *root = BTRFS_I(inode)->root;
6971
+ struct btrfs_root *root = inode->root;
6972
+ struct btrfs_fs_info *fs_info = root->fs_info;
73376973 struct extent_map *em;
73386974 struct btrfs_key ins;
73396975 u64 alloc_hint;
....@@ -7350,19 +6986,38 @@
73506986 ins.offset, BTRFS_ORDERED_REGULAR);
73516987 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
73526988 if (IS_ERR(em))
7353
- btrfs_free_reserved_extent(fs_info, ins.objectid,
7354
- ins.offset, 1);
6989
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
6990
+ 1);
73556991
73566992 return em;
73576993 }
73586994
73596995 /*
7360
- * returns 1 when the nocow is safe, < 1 on error, 0 if the
7361
- * block must be cow'd
6996
+ * Check if we can do a nocow write into the range [@offset, @offset + @len)
6997
+ *
6998
+ * @offset: File offset
6999
+ * @len: The length to write, will be updated to the nocow writeable
7000
+ * range
7001
+ * @orig_start: (optional) Return the original file offset of the file extent
7002
+ * @orig_len: (optional) Return the original on-disk length of the file extent
7003
+ * @ram_bytes: (optional) Return the ram_bytes of the file extent
7004
+ * @strict: if true, omit optimizations that might force us into unnecessary
7005
+ * cow. E.g., don't trust the generation number.
7006
+ *
7007
+ * This function will flush ordered extents in the range to ensure proper
7008
+ * nocow checks for the (nowait == false) case.
7009
+ *
7010
+ * Return:
7011
+ * >0 and update @len if we can do a nocow write
7012
+ * 0 if we can't do a nocow write
7013
+ * <0 if error happened
7014
+ *
7015
+ * NOTE: This only checks the file extents; the caller is responsible for
7016
+ * waiting for any ordered extents.
73627017 */
73637018 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
73647019 u64 *orig_start, u64 *orig_block_len,
7365
- u64 *ram_bytes)
7020
+ u64 *ram_bytes, bool strict)
73667021 {
73677022 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
73687023 struct btrfs_path *path;
....@@ -7440,8 +7095,9 @@
74407095 * Do the same check as in btrfs_cross_ref_exist but without the
74417096 * unnecessary search.
74427097 */
7443
- if (btrfs_file_extent_generation(leaf, fi) <=
7444
- btrfs_root_last_snapshot(&root->root_item))
7098
+ if (!strict &&
7099
+ (btrfs_file_extent_generation(leaf, fi) <=
7100
+ btrfs_root_last_snapshot(&root->root_item)))
74457101 goto out;
74467102
74477103 backref_offset = btrfs_file_extent_offset(leaf, fi);
....@@ -7477,7 +7133,8 @@
74777133 */
74787134
74797135 ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
7480
- key.offset - backref_offset, disk_bytenr);
7136
+ key.offset - backref_offset, disk_bytenr,
7137
+ strict);
74817138 if (ret) {
74827139 ret = 0;
74837140 goto out;
....@@ -7505,7 +7162,7 @@
75057162 }
75067163
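/*
 * Editor's sketch of the calling convention documented above -- a
 * hypothetical caller, not code from this patch.  It assumes, per the
 * "(optional)" annotations, that the out-parameters may be NULL.
 */
#if 0	/* illustrative only */
	u64 len = write_len;	/* may be trimmed to the nocow-able range */
	int ret;

	/* strict == false: generation-number based optimizations allowed */
	ret = can_nocow_extent(inode, offset, &len, NULL, NULL, NULL, false);
	if (ret < 0)
		return ret;	/* lookup error */
	else if (ret == 0)
		return 0;	/* caller must fall back to COW */
	/* ret > 0: a nocow write of [offset, offset + len) is safe */
#endif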
75077164 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7508
- struct extent_state **cached_state, int writing)
7165
+ struct extent_state **cached_state, bool writing)
75097166 {
75107167 struct btrfs_ordered_extent *ordered;
75117168 int ret = 0;
....@@ -7554,7 +7211,7 @@
75547211 */
75557212 if (writing ||
75567213 test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7557
- btrfs_start_ordered_extent(inode, ordered, 1);
7214
+ btrfs_start_ordered_extent(ordered, 1);
75587215 else
75597216 ret = -ENOTBLK;
75607217 btrfs_put_ordered_extent(ordered);
....@@ -7564,11 +7221,11 @@
75647221 * for it to complete) and then invalidate the pages for
75657222 * this range (through invalidate_inode_pages2_range()),
75667223 * but that can lead us to a deadlock with a concurrent
7567
- * call to readpages() (a buffered read or a defrag call
7224
+ * call to readahead (a buffered read or a defrag call
75687225 * triggered a readahead) on a page lock due to an
75697226 * ordered dio extent we created before but did not have
75707227 * yet a corresponding bio submitted (whence it can not
7571
- * complete), which makes readpages() wait for that
7228
+ * complete), which makes readahead wait for that
75727229 * ordered extent to complete while holding a lock on
75737230 * that page.
75747231 */
....@@ -7585,15 +7242,14 @@
75857242 }
75867243
75877244 /* The callers of this must take lock_extent() */
7588
-static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
7589
- u64 orig_start, u64 block_start,
7245
+static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
7246
+ u64 len, u64 orig_start, u64 block_start,
75907247 u64 block_len, u64 orig_block_len,
75917248 u64 ram_bytes, int compress_type,
75927249 int type)
75937250 {
75947251 struct extent_map_tree *em_tree;
75957252 struct extent_map *em;
7596
- struct btrfs_root *root = BTRFS_I(inode)->root;
75977253 int ret;
75987254
75997255 ASSERT(type == BTRFS_ORDERED_PREALLOC ||
....@@ -7601,7 +7257,7 @@
76017257 type == BTRFS_ORDERED_NOCOW ||
76027258 type == BTRFS_ORDERED_REGULAR);
76037259
7604
- em_tree = &BTRFS_I(inode)->extent_tree;
7260
+ em_tree = &inode->extent_tree;
76057261 em = alloc_extent_map();
76067262 if (!em)
76077263 return ERR_PTR(-ENOMEM);
....@@ -7611,7 +7267,6 @@
76117267 em->len = len;
76127268 em->block_len = block_len;
76137269 em->block_start = block_start;
7614
- em->bdev = root->fs_info->fs_devices->latest_bdev;
76157270 em->orig_block_len = orig_block_len;
76167271 em->ram_bytes = ram_bytes;
76177272 em->generation = -1;
....@@ -7624,8 +7279,8 @@
76247279 }
76257280
76267281 do {
7627
- btrfs_drop_extent_cache(BTRFS_I(inode), em->start,
7628
- em->start + em->len - 1, 0);
7282
+ btrfs_drop_extent_cache(inode, em->start,
7283
+ em->start + em->len - 1, 0);
76297284 write_lock(&em_tree->lock);
76307285 ret = add_extent_mapping(em_tree, em, 1);
76317286 write_unlock(&em_tree->lock);
....@@ -7645,28 +7300,7 @@
76457300 }
76467301
76477302
7648
-static int btrfs_get_blocks_direct_read(struct extent_map *em,
7649
- struct buffer_head *bh_result,
7650
- struct inode *inode,
7651
- u64 start, u64 len)
7652
-{
7653
- if (em->block_start == EXTENT_MAP_HOLE ||
7654
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7655
- return -ENOENT;
7656
-
7657
- len = min(len, em->len - (start - em->start));
7658
-
7659
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7660
- inode->i_blkbits;
7661
- bh_result->b_size = len;
7662
- bh_result->b_bdev = em->bdev;
7663
- set_buffer_mapped(bh_result);
7664
-
7665
- return 0;
7666
-}
7667
-
76687303 static int btrfs_get_blocks_direct_write(struct extent_map **map,
7669
- struct buffer_head *bh_result,
76707304 struct inode *inode,
76717305 struct btrfs_dio_data *dio_data,
76727306 u64 start, u64 len)
....@@ -7698,11 +7332,11 @@
76987332 block_start = em->block_start + (start - em->start);
76997333
77007334 if (can_nocow_extent(inode, start, &len, &orig_start,
7701
- &orig_block_len, &ram_bytes) == 1 &&
7335
+ &orig_block_len, &ram_bytes, false) == 1 &&
77027336 btrfs_inc_nocow_writers(fs_info, block_start)) {
77037337 struct extent_map *em2;
77047338
7705
- em2 = btrfs_create_dio_extent(inode, start, len,
7339
+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
77067340 orig_start, block_start,
77077341 len, orig_block_len,
77087342 ram_bytes, type);
....@@ -7721,16 +7355,14 @@
77217355 * use the existing or preallocated extent, so does not
77227356 * need to adjust btrfs_space_info's bytes_may_use.
77237357 */
7724
- btrfs_free_reserved_data_space_noquota(inode, start,
7725
- len);
7358
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
77267359 goto skip_cow;
77277360 }
77287361 }
77297362
77307363 /* this will cow the extent */
7731
- len = bh_result->b_size;
77327364 free_extent_map(em);
7733
- *map = em = btrfs_new_extent_direct(inode, start, len);
7365
+ *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
77347366 if (IS_ERR(em)) {
77357367 ret = PTR_ERR(em);
77367368 goto out;
....@@ -7739,72 +7371,93 @@
77397371 len = min(len, em->len - (start - em->start));
77407372
77417373 skip_cow:
7742
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7743
- inode->i_blkbits;
7744
- bh_result->b_size = len;
7745
- bh_result->b_bdev = em->bdev;
7746
- set_buffer_mapped(bh_result);
7747
-
7748
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7749
- set_buffer_new(bh_result);
7750
-
77517374 /*
77527375 * Need to update the i_size under the extent lock so buffered
77537376 * readers will get the updated i_size when we unlock.
77547377 */
7755
- if (!dio_data->overwrite && start + len > i_size_read(inode))
7378
+ if (start + len > i_size_read(inode))
77567379 i_size_write(inode, start + len);
77577380
7758
- WARN_ON(dio_data->reserve < len);
77597381 dio_data->reserve -= len;
7760
- dio_data->unsubmitted_oe_range_end = start + len;
7761
- current->journal_info = dio_data;
77627382 out:
77637383 return ret;
77647384 }
77657385
7766
-static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7767
- struct buffer_head *bh_result, int create)
7386
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
7387
+ loff_t length, unsigned int flags, struct iomap *iomap,
7388
+ struct iomap *srcmap)
77687389 {
77697390 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
77707391 struct extent_map *em;
77717392 struct extent_state *cached_state = NULL;
77727393 struct btrfs_dio_data *dio_data = NULL;
7773
- u64 start = iblock << inode->i_blkbits;
77747394 u64 lockstart, lockend;
7775
- u64 len = bh_result->b_size;
7776
- int unlock_bits = EXTENT_LOCKED;
7395
+ const bool write = !!(flags & IOMAP_WRITE);
77777396 int ret = 0;
7397
+ u64 len = length;
7398
+ bool unlock_extents = false;
7399
+ bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB);
77787400
7779
- if (create)
7780
- unlock_bits |= EXTENT_DIRTY;
7781
- else
7401
+ /*
7402
+ * We use current->journal_info here to see if we are sync, but there
7403
+ * are many tests in the enospc machinery that skip flushing if a
7404
+ * journal_info is set, so we need to clear this out and re-set
7405
+ * it in iomap_end.
7406
+ */
7407
+ ASSERT(current->journal_info == NULL ||
7408
+ current->journal_info == BTRFS_DIO_SYNC_STUB);
7409
+ current->journal_info = NULL;
7410
+
7411
+ if (!write)
77827412 len = min_t(u64, len, fs_info->sectorsize);
77837413
77847414 lockstart = start;
77857415 lockend = start + len - 1;
77867416
7787
- if (current->journal_info) {
7788
- /*
7789
- * Need to pull our outstanding extents and set journal_info to NULL so
7790
- * that anything that needs to check if there's a transaction doesn't get
7791
- * confused.
7792
- */
7793
- dio_data = current->journal_info;
7794
- current->journal_info = NULL;
7417
+ /*
7418
+ * The generic stuff only does filemap_write_and_wait_range, which
7419
+ * isn't enough if we've written compressed pages to this area, so we
7420
+ * need to flush the dirty pages again to make absolutely sure that any
7421
+ * outstanding dirty pages are on disk.
7422
+ */
7423
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7424
+ &BTRFS_I(inode)->runtime_flags)) {
7425
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
7426
+ start + length - 1);
7427
+ if (ret)
7428
+ return ret;
77957429 }
7430
+
7431
+ dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
7432
+ if (!dio_data)
7433
+ return -ENOMEM;
7434
+
7435
+ dio_data->sync = sync;
7436
+ dio_data->length = length;
7437
+ if (write) {
7438
+ dio_data->reserve = round_up(length, fs_info->sectorsize);
7439
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
7440
+ &dio_data->data_reserved,
7441
+ start, dio_data->reserve);
7442
+ if (ret) {
7443
+ extent_changeset_free(dio_data->data_reserved);
7444
+ kfree(dio_data);
7445
+ return ret;
7446
+ }
7447
+ }
7448
+ iomap->private = dio_data;
7449
+
77967450
77977451 /*
77987452 * If this errors out it's because we couldn't invalidate pagecache for
77997453 * this range and we need to fallback to buffered.
78007454 */
7801
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
7802
- create)) {
7455
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
78037456 ret = -ENOTBLK;
78047457 goto err;
78057458 }
78067459
7807
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
7460
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
78087461 if (IS_ERR(em)) {
78097462 ret = PTR_ERR(em);
78107463 goto unlock_err;
....@@ -7827,443 +7480,253 @@
78277480 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
78287481 em->block_start == EXTENT_MAP_INLINE) {
78297482 free_extent_map(em);
7830
- ret = -ENOTBLK;
7483
+ /*
7484
+ * If we are in a NOWAIT context, return -EAGAIN in order to
7485
+ * fall back to buffered IO. This is not only because we can
7486
+ * block with buffered IO (no support for NOWAIT semantics at
7487
+ * the moment) but also to avoid returning short reads to user
7488
+ * space - this happens if we were able to read some data from
7489
+ * previous non-compressed extents and then when we fall back to
7490
+ * buffered IO, at btrfs_file_read_iter() by calling
7491
+ * filemap_read(), we fail to fault in pages for the read buffer,
7492
+ * in which case filemap_read() returns a short read (the number
7493
+ * of bytes previously read is > 0, so it does not return -EFAULT).
7494
+ */
7495
+ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
78317496 goto unlock_err;
78327497 }
78337498
7834
- if (create) {
7835
- ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
7836
- dio_data, start, len);
7499
+ len = min(len, em->len - (start - em->start));
7500
+ if (write) {
7501
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
7502
+ start, len);
78377503 if (ret < 0)
78387504 goto unlock_err;
7839
-
7840
- /* clear and unlock the entire range */
7841
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7842
- unlock_bits, 1, 0, &cached_state);
7505
+ unlock_extents = true;
7506
+ /* Recalc len in case the new em is smaller than requested */
7507
+ len = min(len, em->len - (start - em->start));
78437508 } else {
7844
- ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
7845
- start, len);
7846
- /* Can be negative only if we read from a hole */
7847
- if (ret < 0) {
7848
- ret = 0;
7849
- free_extent_map(em);
7850
- goto unlock_err;
7851
- }
78527509 /*
78537510 * We need to unlock only the end area that we aren't using.
78547511 * The rest is going to be unlocked by the endio routine.
78557512 */
7856
- lockstart = start + bh_result->b_size;
7857
- if (lockstart < lockend) {
7858
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7859
- lockend, unlock_bits, 1, 0,
7860
- &cached_state);
7861
- } else {
7862
- free_extent_state(cached_state);
7863
- }
7513
+ lockstart = start + len;
7514
+ if (lockstart < lockend)
7515
+ unlock_extents = true;
78647516 }
7517
+
7518
+ if (unlock_extents)
7519
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
7520
+ lockstart, lockend, &cached_state);
7521
+ else
7522
+ free_extent_state(cached_state);
7523
+
7524
+ /*
7525
+ * Translate extent map information to iomap.
7526
+ * We trim the extents (and move the addr) even though the iomap code does
7527
+ * that, since we have locked only the parts we are performing I/O in.
7528
+ */
7529
+ if ((em->block_start == EXTENT_MAP_HOLE) ||
7530
+ (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
7531
+ iomap->addr = IOMAP_NULL_ADDR;
7532
+ iomap->type = IOMAP_HOLE;
7533
+ } else {
7534
+ iomap->addr = em->block_start + (start - em->start);
7535
+ iomap->type = IOMAP_MAPPED;
7536
+ }
7537
+ iomap->offset = start;
7538
+ iomap->bdev = fs_info->fs_devices->latest_bdev;
7539
+ iomap->length = len;
78657540
78667541 free_extent_map(em);
78677542
78687543 return 0;
78697544
78707545 unlock_err:
7871
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7872
- unlock_bits, 1, 0, &cached_state);
7546
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7547
+ &cached_state);
78737548 err:
7874
- if (dio_data)
7875
- current->journal_info = dio_data;
7549
+ if (dio_data) {
7550
+ btrfs_delalloc_release_space(BTRFS_I(inode),
7551
+ dio_data->data_reserved, start,
7552
+ dio_data->reserve, true);
7553
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
7554
+ extent_changeset_free(dio_data->data_reserved);
7555
+ kfree(dio_data);
7556
+ }
78767557 return ret;
78777558 }
78787559
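/*
 * Editor's worked example for the extent_map -> iomap translation above
 * (numbers assumed): an extent_map with em->start == 0, em->len == 128KiB
 * and em->block_start == 1MiB, queried at start == 64KiB, yields
 *
 *   iomap->addr   = em->block_start + (start - em->start) = 1MiB + 64KiB
 *   iomap->type   = IOMAP_MAPPED
 *   iomap->length = len, already trimmed to em->len - (start - em->start)
 *
 * whereas a hole, or a prealloc extent being read, reports
 * IOMAP_NULL_ADDR/IOMAP_HOLE so the iomap layer zero-fills instead of
 * issuing a bio.
 */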
7879
-static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
7880
- struct bio *bio,
7881
- int mirror_num)
7560
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
7561
+ ssize_t written, unsigned int flags, struct iomap *iomap)
78827562 {
7563
+ int ret = 0;
7564
+ struct btrfs_dio_data *dio_data = iomap->private;
7565
+ size_t submitted = dio_data->submitted;
7566
+ const bool write = !!(flags & IOMAP_WRITE);
7567
+
7568
+ if (!write && (iomap->type == IOMAP_HOLE)) {
7569
+ /* If reading from a hole, unlock and return */
7570
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
7571
+ goto out;
7572
+ }
7573
+
7574
+ if (submitted < length) {
7575
+ pos += submitted;
7576
+ length -= submitted;
7577
+ if (write)
7578
+ __endio_write_update_ordered(BTRFS_I(inode), pos,
7579
+ length, false);
7580
+ else
7581
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos,
7582
+ pos + length - 1);
7583
+ ret = -ENOTBLK;
7584
+ }
7585
+
7586
+ if (write) {
7587
+ if (dio_data->reserve)
7588
+ btrfs_delalloc_release_space(BTRFS_I(inode),
7589
+ dio_data->data_reserved, pos,
7590
+ dio_data->reserve, true);
7591
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
7592
+ extent_changeset_free(dio_data->data_reserved);
7593
+ }
7594
+out:
7595
+ /*
7596
+ * We're all done; we can now safely re-set current->journal_info
7597
+ * for our endio.
7598
+ */
7599
+ if (dio_data->sync) {
7600
+ ASSERT(current->journal_info == NULL);
7601
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
7602
+ }
7603
+ kfree(dio_data);
7604
+ iomap->private = NULL;
7605
+
7606
+ return ret;
7607
+}
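/*
 * Editor's note on the (submitted < length) branch above, with assumed
 * numbers: if a 1MiB write dio only got bios submitted for its first
 * 512KiB before failing, pos/length are advanced past the submitted part
 * and the remaining 512KiB of ordered extent is finished as failed
 * (uptodate == false); the -ENOTBLK return is turned into 0 by
 * btrfs_direct_IO() below so the caller can redo that tail as buffered
 * IO.
 */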
7608
+
7609
+static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
7610
+{
7611
+ /*
7612
+ * This implies a barrier so that stores to dio_bio->bi_status before
7613
+ * this and loads of dio_bio->bi_status after this are fully ordered.
7614
+ */
7615
+ if (!refcount_dec_and_test(&dip->refs))
7616
+ return;
7617
+
7618
+ if (bio_op(dip->dio_bio) == REQ_OP_WRITE) {
7619
+ __endio_write_update_ordered(BTRFS_I(dip->inode),
7620
+ dip->logical_offset,
7621
+ dip->bytes,
7622
+ !dip->dio_bio->bi_status);
7623
+ } else {
7624
+ unlock_extent(&BTRFS_I(dip->inode)->io_tree,
7625
+ dip->logical_offset,
7626
+ dip->logical_offset + dip->bytes - 1);
7627
+ }
7628
+
7629
+ bio_endio(dip->dio_bio);
7630
+ kfree(dip);
7631
+}
7632
+
7633
+static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
7634
+ int mirror_num,
7635
+ unsigned long bio_flags)
7636
+{
7637
+ struct btrfs_dio_private *dip = bio->bi_private;
78837638 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
78847639 blk_status_t ret;
78857640
78867641 BUG_ON(bio_op(bio) == REQ_OP_WRITE);
78877642
7888
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
7643
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
78897644 if (ret)
78907645 return ret;
78917646
7892
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
7893
-
7647
+ refcount_inc(&dip->refs);
7648
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
7649
+ if (ret)
7650
+ refcount_dec(&dip->refs);
78947651 return ret;
78957652 }
78967653
7897
-static int btrfs_check_dio_repairable(struct inode *inode,
7898
- struct bio *failed_bio,
7899
- struct io_failure_record *failrec,
7900
- int failed_mirror)
7654
+static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
7655
+ struct btrfs_io_bio *io_bio,
7656
+ const bool uptodate)
79017657 {
7902
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7903
- int num_copies;
7904
-
7905
- num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
7906
- if (num_copies == 1) {
7907
- /*
7908
- * we only have a single copy of the data, so don't bother with
7909
- * all the retry and error correction code that follows. no
7910
- * matter what the error is, it is very likely to persist.
7911
- */
7912
- btrfs_debug(fs_info,
7913
- "Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
7914
- num_copies, failrec->this_mirror, failed_mirror);
7915
- return 0;
7916
- }
7917
-
7918
- failrec->failed_mirror = failed_mirror;
7919
- failrec->this_mirror++;
7920
- if (failrec->this_mirror == failed_mirror)
7921
- failrec->this_mirror++;
7922
-
7923
- if (failrec->this_mirror > num_copies) {
7924
- btrfs_debug(fs_info,
7925
- "Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
7926
- num_copies, failrec->this_mirror, failed_mirror);
7927
- return 0;
7928
- }
7929
-
7930
- return 1;
7931
-}
7932
-
7933
-static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
7934
- struct page *page, unsigned int pgoff,
7935
- u64 start, u64 end, int failed_mirror,
7936
- bio_end_io_t *repair_endio, void *repair_arg)
7937
-{
7938
- struct io_failure_record *failrec;
7939
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7658
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
7659
+ const u32 sectorsize = fs_info->sectorsize;
79407660 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
7941
- struct bio *bio;
7942
- int isector;
7943
- unsigned int read_mode = 0;
7944
- int segs;
7945
- int ret;
7946
- blk_status_t status;
7947
- struct bio_vec bvec;
7948
-
7949
- BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
7950
-
7951
- ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
7952
- if (ret)
7953
- return errno_to_blk_status(ret);
7954
-
7955
- ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
7956
- failed_mirror);
7957
- if (!ret) {
7958
- free_io_failure(failure_tree, io_tree, failrec);
7959
- return BLK_STS_IOERR;
7960
- }
7961
-
7962
- segs = bio_segments(failed_bio);
7963
- bio_get_first_bvec(failed_bio, &bvec);
7964
- if (segs > 1 ||
7965
- (bvec.bv_len > btrfs_inode_sectorsize(inode)))
7966
- read_mode |= REQ_FAILFAST_DEV;
7967
-
7968
- isector = start - btrfs_io_bio(failed_bio)->logical;
7969
- isector >>= inode->i_sb->s_blocksize_bits;
7970
- bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
7971
- pgoff, isector, repair_endio, repair_arg);
7972
- bio->bi_opf = REQ_OP_READ | read_mode;
7973
-
7974
- btrfs_debug(BTRFS_I(inode)->root->fs_info,
7975
- "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d",
7976
- read_mode, failrec->this_mirror, failrec->in_validation);
7977
-
7978
- status = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
7979
- if (status) {
7980
- free_io_failure(failure_tree, io_tree, failrec);
7981
- bio_put(bio);
7982
- }
7983
-
7984
- return status;
7985
-}
7986
-
7987
-struct btrfs_retry_complete {
7988
- struct completion done;
7989
- struct inode *inode;
7990
- u64 start;
7991
- int uptodate;
7992
-};
7993
-
7994
-static void btrfs_retry_endio_nocsum(struct bio *bio)
7995
-{
7996
- struct btrfs_retry_complete *done = bio->bi_private;
7997
- struct inode *inode = done->inode;
7998
- struct bio_vec *bvec;
7999
- struct extent_io_tree *io_tree, *failure_tree;
8000
- int i;
8001
-
8002
- if (bio->bi_status)
8003
- goto end;
8004
-
8005
- ASSERT(bio->bi_vcnt == 1);
8006
- io_tree = &BTRFS_I(inode)->io_tree;
8007
- failure_tree = &BTRFS_I(inode)->io_failure_tree;
8008
- ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode));
8009
-
8010
- done->uptodate = 1;
8011
- ASSERT(!bio_flagged(bio, BIO_CLONED));
8012
- bio_for_each_segment_all(bvec, bio, i)
8013
- clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
8014
- io_tree, done->start, bvec->bv_page,
8015
- btrfs_ino(BTRFS_I(inode)), 0);
8016
-end:
8017
- complete(&done->done);
8018
- bio_put(bio);
8019
-}
8020
-
8021
-static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode,
8022
- struct btrfs_io_bio *io_bio)
8023
-{
8024
- struct btrfs_fs_info *fs_info;
7661
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7662
+ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
80257663 struct bio_vec bvec;
80267664 struct bvec_iter iter;
8027
- struct btrfs_retry_complete done;
8028
- u64 start;
8029
- unsigned int pgoff;
8030
- u32 sectorsize;
8031
- int nr_sectors;
8032
- blk_status_t ret;
7665
+ u64 start = io_bio->logical;
7666
+ int icsum = 0;
80337667 blk_status_t err = BLK_STS_OK;
80347668
8035
- fs_info = BTRFS_I(inode)->root->fs_info;
8036
- sectorsize = fs_info->sectorsize;
7669
+ __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
7670
+ unsigned int i, nr_sectors, pgoff;
80377671
8038
- start = io_bio->logical;
8039
- done.inode = inode;
8040
- io_bio->bio.bi_iter = io_bio->iter;
8041
-
8042
- bio_for_each_segment(bvec, &io_bio->bio, iter) {
80437672 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
80447673 pgoff = bvec.bv_offset;
8045
-
8046
-next_block_or_try_again:
8047
- done.uptodate = 0;
8048
- done.start = start;
8049
- init_completion(&done.done);
8050
-
8051
- ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
8052
- pgoff, start, start + sectorsize - 1,
8053
- io_bio->mirror_num,
8054
- btrfs_retry_endio_nocsum, &done);
8055
- if (ret) {
8056
- err = ret;
8057
- goto next;
8058
- }
8059
-
8060
- wait_for_completion_io(&done.done);
8061
-
8062
- if (!done.uptodate) {
8063
- /* We might have another mirror, so try again */
8064
- goto next_block_or_try_again;
8065
- }
8066
-
8067
-next:
8068
- start += sectorsize;
8069
-
8070
- nr_sectors--;
8071
- if (nr_sectors) {
8072
- pgoff += sectorsize;
7674
+ for (i = 0; i < nr_sectors; i++) {
80737675 ASSERT(pgoff < PAGE_SIZE);
8074
- goto next_block_or_try_again;
7676
+ if (uptodate &&
7677
+ (!csum || !check_data_csum(inode, io_bio, icsum,
7678
+ bvec.bv_page, pgoff,
7679
+ start, sectorsize))) {
7680
+ clean_io_failure(fs_info, failure_tree, io_tree,
7681
+ start, bvec.bv_page,
7682
+ btrfs_ino(BTRFS_I(inode)),
7683
+ pgoff);
7684
+ } else {
7685
+ blk_status_t status;
7686
+
7687
+ status = btrfs_submit_read_repair(inode,
7688
+ &io_bio->bio,
7689
+ start - io_bio->logical,
7690
+ bvec.bv_page, pgoff,
7691
+ start,
7692
+ start + sectorsize - 1,
7693
+ io_bio->mirror_num,
7694
+ submit_dio_repair_bio);
7695
+ if (status)
7696
+ err = status;
7697
+ }
7698
+ start += sectorsize;
7699
+ icsum++;
7700
+ pgoff += sectorsize;
80757701 }
80767702 }
8077
-
80787703 return err;
80797704 }
80807705
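/*
 * Editor's worked example for the per-sector loop above (assuming a 4KiB
 * sectorsize): a 16KiB bvec is walked as nr_sectors == 4.  Each sector
 * that is uptodate and passes its checksum (or needs none) clears any
 * stale failure record via clean_io_failure(); each bad sector is handed
 * individually to btrfs_submit_read_repair(), so only the failed 4KiB
 * blocks are re-read from another mirror.
 */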
8081
-static void btrfs_retry_endio(struct bio *bio)
8082
-{
8083
- struct btrfs_retry_complete *done = bio->bi_private;
8084
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8085
- struct extent_io_tree *io_tree, *failure_tree;
8086
- struct inode *inode = done->inode;
8087
- struct bio_vec *bvec;
8088
- int uptodate;
8089
- int ret;
8090
- int i;
8091
-
8092
- if (bio->bi_status)
8093
- goto end;
8094
-
8095
- uptodate = 1;
8096
-
8097
- ASSERT(bio->bi_vcnt == 1);
8098
- ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode));
8099
-
8100
- io_tree = &BTRFS_I(inode)->io_tree;
8101
- failure_tree = &BTRFS_I(inode)->io_failure_tree;
8102
-
8103
- ASSERT(!bio_flagged(bio, BIO_CLONED));
8104
- bio_for_each_segment_all(bvec, bio, i) {
8105
- ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
8106
- bvec->bv_offset, done->start,
8107
- bvec->bv_len);
8108
- if (!ret)
8109
- clean_io_failure(BTRFS_I(inode)->root->fs_info,
8110
- failure_tree, io_tree, done->start,
8111
- bvec->bv_page,
8112
- btrfs_ino(BTRFS_I(inode)),
8113
- bvec->bv_offset);
8114
- else
8115
- uptodate = 0;
8116
- }
8117
-
8118
- done->uptodate = uptodate;
8119
-end:
8120
- complete(&done->done);
8121
- bio_put(bio);
8122
-}
8123
-
8124
-static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
8125
- struct btrfs_io_bio *io_bio, blk_status_t err)
8126
-{
8127
- struct btrfs_fs_info *fs_info;
8128
- struct bio_vec bvec;
8129
- struct bvec_iter iter;
8130
- struct btrfs_retry_complete done;
8131
- u64 start;
8132
- u64 offset = 0;
8133
- u32 sectorsize;
8134
- int nr_sectors;
8135
- unsigned int pgoff;
8136
- int csum_pos;
8137
- bool uptodate = (err == 0);
8138
- int ret;
8139
- blk_status_t status;
8140
-
8141
- fs_info = BTRFS_I(inode)->root->fs_info;
8142
- sectorsize = fs_info->sectorsize;
8143
-
8144
- err = BLK_STS_OK;
8145
- start = io_bio->logical;
8146
- done.inode = inode;
8147
- io_bio->bio.bi_iter = io_bio->iter;
8148
-
8149
- bio_for_each_segment(bvec, &io_bio->bio, iter) {
8150
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
8151
-
8152
- pgoff = bvec.bv_offset;
8153
-next_block:
8154
- if (uptodate) {
8155
- csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
8156
- ret = __readpage_endio_check(inode, io_bio, csum_pos,
8157
- bvec.bv_page, pgoff, start, sectorsize);
8158
- if (likely(!ret))
8159
- goto next;
8160
- }
8161
-try_again:
8162
- done.uptodate = 0;
8163
- done.start = start;
8164
- init_completion(&done.done);
8165
-
8166
- status = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
8167
- pgoff, start, start + sectorsize - 1,
8168
- io_bio->mirror_num, btrfs_retry_endio,
8169
- &done);
8170
- if (status) {
8171
- err = status;
8172
- goto next;
8173
- }
8174
-
8175
- wait_for_completion_io(&done.done);
8176
-
8177
- if (!done.uptodate) {
8178
- /* We might have another mirror, so try again */
8179
- goto try_again;
8180
- }
8181
-next:
8182
- offset += sectorsize;
8183
- start += sectorsize;
8184
-
8185
- ASSERT(nr_sectors);
8186
-
8187
- nr_sectors--;
8188
- if (nr_sectors) {
8189
- pgoff += sectorsize;
8190
- ASSERT(pgoff < PAGE_SIZE);
8191
- goto next_block;
8192
- }
8193
- }
8194
-
8195
- return err;
8196
-}
8197
-
8198
-static blk_status_t btrfs_subio_endio_read(struct inode *inode,
8199
- struct btrfs_io_bio *io_bio, blk_status_t err)
8200
-{
8201
- bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
8202
-
8203
- if (skip_csum) {
8204
- if (unlikely(err))
8205
- return __btrfs_correct_data_nocsum(inode, io_bio);
8206
- else
8207
- return BLK_STS_OK;
8208
- } else {
8209
- return __btrfs_subio_endio_read(inode, io_bio, err);
8210
- }
8211
-}
8212
-
8213
-static void btrfs_endio_direct_read(struct bio *bio)
8214
-{
8215
- struct btrfs_dio_private *dip = bio->bi_private;
8216
- struct inode *inode = dip->inode;
8217
- struct bio *dio_bio;
8218
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8219
- blk_status_t err = bio->bi_status;
8220
-
8221
- if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
8222
- err = btrfs_subio_endio_read(inode, io_bio, err);
8223
-
8224
- unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
8225
- dip->logical_offset + dip->bytes - 1);
8226
- dio_bio = dip->dio_bio;
8227
-
8228
- kfree(dip);
8229
-
8230
- dio_bio->bi_status = err;
8231
- dio_end_io(dio_bio);
8232
-
8233
- if (io_bio->end_io)
8234
- io_bio->end_io(io_bio, blk_status_to_errno(err));
8235
- bio_put(bio);
8236
-}
8237
-
8238
-static void __endio_write_update_ordered(struct inode *inode,
7706
+static void __endio_write_update_ordered(struct btrfs_inode *inode,
82397707 const u64 offset, const u64 bytes,
82407708 const bool uptodate)
82417709 {
8242
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7710
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
82437711 struct btrfs_ordered_extent *ordered = NULL;
82447712 struct btrfs_workqueue *wq;
8245
- btrfs_work_func_t func;
82467713 u64 ordered_offset = offset;
82477714 u64 ordered_bytes = bytes;
82487715 u64 last_offset;
82497716
8250
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
7717
+ if (btrfs_is_free_space_inode(inode))
82517718 wq = fs_info->endio_freespace_worker;
8252
- func = btrfs_freespace_write_helper;
8253
- } else {
7719
+ else
82547720 wq = fs_info->endio_write_workers;
8255
- func = btrfs_endio_write_helper;
8256
- }
82577721
82587722 while (ordered_offset < offset + bytes) {
82597723 last_offset = ordered_offset;
82607724 if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
8261
- &ordered_offset,
8262
- ordered_bytes,
8263
- uptodate)) {
8264
- btrfs_init_work(&ordered->work, func,
8265
- finish_ordered_fn,
8266
- NULL, NULL);
7725
+ &ordered_offset,
7726
+ ordered_bytes,
7727
+ uptodate)) {
7728
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
7729
+ NULL);
82677730 btrfs_queue_work(wq, &ordered->work);
82687731 }
82697732 /*
....@@ -8274,7 +7737,7 @@
82747737 return;
82757738 /*
82767739 * Our bio might span multiple ordered extents. In this case
8277
- * we keep goin until we have accounted the whole dio.
7740
+ * we keep going until we have accounted for the whole dio.
82787741 */
82797742 if (ordered_offset < offset + bytes) {
82807743 ordered_bytes = offset + bytes - ordered_offset;
....@@ -8283,29 +7746,12 @@
82837746 }
82847747 }
82857748
8286
-static void btrfs_endio_direct_write(struct bio *bio)
8287
-{
8288
- struct btrfs_dio_private *dip = bio->bi_private;
8289
- struct bio *dio_bio = dip->dio_bio;
8290
-
8291
- __endio_write_update_ordered(dip->inode, dip->logical_offset,
8292
- dip->bytes, !bio->bi_status);
8293
-
8294
- kfree(dip);
8295
-
8296
- dio_bio->bi_status = bio->bi_status;
8297
- dio_end_io(dio_bio);
8298
- bio_put(bio);
8299
-}
8300
-
83017749 static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
83027750 struct bio *bio, u64 offset)
83037751 {
83047752 struct inode *inode = private_data;
8305
- blk_status_t ret;
8306
- ret = btrfs_csum_one_bio(inode, bio, offset, 1);
8307
- BUG_ON(ret); /* -ENOMEM */
8308
- return 0;
7753
+
7754
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
83097755 }
83107756
83117757 static void btrfs_end_dio_bio(struct bio *bio)
....@@ -8321,62 +7767,16 @@
83217767 (unsigned long long)bio->bi_iter.bi_sector,
83227768 bio->bi_iter.bi_size, err);
83237769
8324
- if (dip->subio_endio)
8325
- err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
8326
-
8327
- if (err) {
8328
- /*
8329
- * We want to perceive the errors flag being set before
8330
- * decrementing the reference count. We don't need a barrier
8331
- * since atomic operations with a return value are fully
8332
- * ordered as per atomic_t.txt
8333
- */
8334
- dip->errors = 1;
7770
+ if (bio_op(bio) == REQ_OP_READ) {
7771
+ err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio),
7772
+ !err);
83357773 }
83367774
8337
- /* if there are more bios still pending for this dio, just exit */
8338
- if (!atomic_dec_and_test(&dip->pending_bios))
8339
- goto out;
7775
+ if (err)
7776
+ dip->dio_bio->bi_status = err;
83407777
8341
- if (dip->errors) {
8342
- bio_io_error(dip->orig_bio);
8343
- } else {
8344
- dip->dio_bio->bi_status = BLK_STS_OK;
8345
- bio_endio(dip->orig_bio);
8346
- }
8347
-out:
83487778 bio_put(bio);
8349
-}
8350
-
8351
-static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
8352
- struct btrfs_dio_private *dip,
8353
- struct bio *bio,
8354
- u64 file_offset)
8355
-{
8356
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8357
- struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
8358
- blk_status_t ret;
8359
-
8360
- /*
8361
- * We load all the csum data we need when we submit
8362
- * the first bio to reduce the csum tree search and
8363
- * contention.
8364
- */
8365
- if (dip->logical_offset == file_offset) {
8366
- ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio,
8367
- file_offset);
8368
- if (ret)
8369
- return ret;
8370
- }
8371
-
8372
- if (bio == dip->orig_bio)
8373
- return 0;
8374
-
8375
- file_offset -= dip->logical_offset;
8376
- file_offset >>= inode->i_sb->s_blocksize_bits;
8377
- io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
8378
-
8379
- return 0;
7779
+ btrfs_dio_private_put(dip);
83807780 }
83817781
83827782 static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
....@@ -8410,222 +7810,169 @@
84107810 * If we aren't doing async submit, calculate the csum of the
84117811 * bio now.
84127812 */
8413
- ret = btrfs_csum_one_bio(inode, bio, file_offset, 1);
7813
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1);
84147814 if (ret)
84157815 goto err;
84167816 } else {
8417
- ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio,
8418
- file_offset);
8419
- if (ret)
8420
- goto err;
7817
+ u64 csum_offset;
7818
+
7819
+ csum_offset = file_offset - dip->logical_offset;
7820
+ csum_offset >>= inode->i_sb->s_blocksize_bits;
7821
+ csum_offset *= btrfs_super_csum_size(fs_info->super_copy);
7822
+ btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
84217823 }
84227824 map:
8423
- ret = btrfs_map_bio(fs_info, bio, 0, 0);
7825
+ ret = btrfs_map_bio(fs_info, bio, 0);
84247826 err:
84257827 return ret;
84267828 }
84277829
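/*
 * Editor's worked example for the csum_offset math above (assuming 4KiB
 * blocks, i.e. s_blocksize_bits == 12, and 4-byte crc32c checksums): a
 * partial bio whose file_offset is 256KiB past dip->logical_offset gets
 *
 *   csum_offset = (262144 >> 12) * 4 = 64 * 4 = 256
 *
 * so it reads its checksums at dip->csums + 256, skipping the 64 sums
 * belonging to the blocks in front of it.
 */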
8428
-static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
7830
+/*
7831
+ * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked
7832
+ * or ordered extents whether or not we submit any bios.
7833
+ */
7834
+static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
7835
+ struct inode *inode,
7836
+ loff_t file_offset)
84297837 {
8430
- struct inode *inode = dip->inode;
7838
+ const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
7839
+ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
7840
+ size_t dip_size;
7841
+ struct btrfs_dio_private *dip;
7842
+
7843
+ dip_size = sizeof(*dip);
7844
+ if (!write && csum) {
7845
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7846
+ const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
7847
+ size_t nblocks;
7848
+
7849
+ nblocks = dio_bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
7850
+ dip_size += csum_size * nblocks;
7851
+ }
7852
+
7853
+ dip = kzalloc(dip_size, GFP_NOFS);
7854
+ if (!dip)
7855
+ return NULL;
7856
+
7857
+ dip->inode = inode;
7858
+ dip->logical_offset = file_offset;
7859
+ dip->bytes = dio_bio->bi_iter.bi_size;
7860
+ dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
7861
+ dip->dio_bio = dio_bio;
7862
+ refcount_set(&dip->refs, 1);
7863
+ return dip;
7864
+}
7865
+
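/*
 * Editor's worked example for the dip_size computation above (assuming
 * 4KiB blocks and 4-byte crc32c checksums): a 1MiB csummed read gets
 *
 *   nblocks  = 1MiB >> 12 = 256
 *   dip_size = sizeof(*dip) + 256 * 4 bytes
 *
 * so the checksum array presumably lives inline at the tail of the one
 * btrfs_dio_private allocation rather than in a separate buffer.
 */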
7866
+static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
7867
+ struct bio *dio_bio, loff_t file_offset)
7868
+{
7869
+ const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
7870
+ const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
84317871 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7872
+ const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
7873
+ BTRFS_BLOCK_GROUP_RAID56_MASK);
7874
+ struct btrfs_dio_private *dip;
84327875 struct bio *bio;
8433
- struct bio *orig_bio = dip->orig_bio;
8434
- u64 start_sector = orig_bio->bi_iter.bi_sector;
8435
- u64 file_offset = dip->logical_offset;
8436
- u64 map_length;
7876
+ u64 start_sector;
84377877 int async_submit = 0;
84387878 u64 submit_len;
84397879 int clone_offset = 0;
84407880 int clone_len;
84417881 int ret;
84427882 blk_status_t status;
7883
+ struct btrfs_io_geometry geom;
7884
+ struct btrfs_dio_data *dio_data = iomap->private;
84437885
8444
- map_length = orig_bio->bi_iter.bi_size;
8445
- submit_len = map_length;
8446
- ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
8447
- &map_length, NULL, 0);
8448
- if (ret)
8449
- return -EIO;
8450
-
8451
- if (map_length >= submit_len) {
8452
- bio = orig_bio;
8453
- dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
8454
- goto submit;
7886
+ dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
7887
+ if (!dip) {
7888
+ if (!write) {
7889
+ unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
7890
+ file_offset + dio_bio->bi_iter.bi_size - 1);
7891
+ }
7892
+ dio_bio->bi_status = BLK_STS_RESOURCE;
7893
+ bio_endio(dio_bio);
7894
+ return BLK_QC_T_NONE;
84557895 }
84567896
8457
- /* async crcs make it difficult to collect full stripe writes. */
8458
- if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK)
8459
- async_submit = 0;
8460
- else
8461
- async_submit = 1;
7897
+ if (!write && csum) {
7898
+ /*
7899
+ * Load the csums up front to reduce csum tree searches and
7900
+ * contention when submitting bios.
7901
+ */
7902
+ status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset,
7903
+ dip->csums);
7904
+ if (status != BLK_STS_OK)
7905
+ goto out_err;
7906
+ }
84627907
8463
- /* bio split */
8464
- ASSERT(map_length <= INT_MAX);
7908
+ start_sector = dio_bio->bi_iter.bi_sector;
7909
+ submit_len = dio_bio->bi_iter.bi_size;
7910
+
84657911 do {
8466
- clone_len = min_t(int, submit_len, map_length);
7912
+ ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio),
7913
+ start_sector << 9, submit_len,
7914
+ &geom);
7915
+ if (ret) {
7916
+ status = errno_to_blk_status(ret);
7917
+ goto out_err;
7918
+ }
7919
+ ASSERT(geom.len <= INT_MAX);
7920
+
7921
+ clone_len = min_t(int, submit_len, geom.len);
84677922
84687923 /*
84697924 * This will never fail as it's passing GPF_NOFS and
84707925 * the allocation is backed by btrfs_bioset.
84717926 */
8472
- bio = btrfs_bio_clone_partial(orig_bio, clone_offset,
8473
- clone_len);
7927
+ bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
84747928 bio->bi_private = dip;
84757929 bio->bi_end_io = btrfs_end_dio_bio;
84767930 btrfs_io_bio(bio)->logical = file_offset;
84777931
84787932 ASSERT(submit_len >= clone_len);
84797933 submit_len -= clone_len;
8480
- if (submit_len == 0)
8481
- break;
84827934
84837935 /*
84847936 * Increase the count before we submit the bio so we know
84857937 * the end IO handler won't happen before we increase the
84867938 * count. Otherwise, the dip might get freed before we're
84877939 * done setting it up.
7940
+ *
7941
+ * We transfer the initial reference to the last bio, so we
7942
+ * don't need to increment the reference count for the last one.
84887943 */
8489
- atomic_inc(&dip->pending_bios);
7944
+ if (submit_len > 0) {
7945
+ refcount_inc(&dip->refs);
7946
+ /*
7947
+ * If we are submitting more than one bio, submit them
7948
+ * all asynchronously. The exception is RAID 5 or 6, as
7949
+ * asynchronous checksums make it difficult to collect
7950
+ * full stripe writes.
7951
+ */
7952
+ if (!raid56)
7953
+ async_submit = 1;
7954
+ }
84907955
84917956 status = btrfs_submit_dio_bio(bio, inode, file_offset,
84927957 async_submit);
84937958 if (status) {
84947959 bio_put(bio);
8495
- atomic_dec(&dip->pending_bios);
7960
+ if (submit_len > 0)
7961
+ refcount_dec(&dip->refs);
84967962 goto out_err;
84977963 }
84987964
7965
+ dio_data->submitted += clone_len;
84997966 clone_offset += clone_len;
85007967 start_sector += clone_len >> 9;
85017968 file_offset += clone_len;
8502
-
8503
- map_length = submit_len;
8504
- ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
8505
- start_sector << 9, &map_length, NULL, 0);
8506
- if (ret)
8507
- goto out_err;
85087969 } while (submit_len > 0);
7970
+ return BLK_QC_T_NONE;
85097971
8510
-submit:
8511
- status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
8512
- if (!status)
8513
- return 0;
8514
-
8515
- if (bio != orig_bio)
8516
- bio_put(bio);
85177972 out_err:
8518
- dip->errors = 1;
8519
- /*
8520
- * Before atomic variable goto zero, we must make sure dip->errors is
8521
- * perceived to be set. This ordering is ensured by the fact that an
8522
- * atomic operations with a return value are fully ordered as per
8523
- * atomic_t.txt
8524
- */
8525
- if (atomic_dec_and_test(&dip->pending_bios))
8526
- bio_io_error(dip->orig_bio);
8527
-
8528
- /* bio_end_io() will handle error, so we needn't return it */
8529
- return 0;
8530
-}
8531
-
8532
-static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
8533
- loff_t file_offset)
8534
-{
8535
- struct btrfs_dio_private *dip = NULL;
8536
- struct bio *bio = NULL;
8537
- struct btrfs_io_bio *io_bio;
8538
- bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
8539
- int ret = 0;
8540
-
8541
- bio = btrfs_bio_clone(dio_bio);
8542
-
8543
- dip = kzalloc(sizeof(*dip), GFP_NOFS);
8544
- if (!dip) {
8545
- ret = -ENOMEM;
8546
- goto free_ordered;
8547
- }
8548
-
8549
- dip->private = dio_bio->bi_private;
8550
- dip->inode = inode;
8551
- dip->logical_offset = file_offset;
8552
- dip->bytes = dio_bio->bi_iter.bi_size;
8553
- dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
8554
- bio->bi_private = dip;
8555
- dip->orig_bio = bio;
8556
- dip->dio_bio = dio_bio;
8557
- atomic_set(&dip->pending_bios, 1);
8558
- io_bio = btrfs_io_bio(bio);
8559
- io_bio->logical = file_offset;
8560
-
8561
- if (write) {
8562
- bio->bi_end_io = btrfs_endio_direct_write;
8563
- } else {
8564
- bio->bi_end_io = btrfs_endio_direct_read;
8565
- dip->subio_endio = btrfs_subio_endio_read;
8566
- }
8567
-
8568
- /*
8569
- * Reset the range for unsubmitted ordered extents (to a 0 length range)
8570
- * even if we fail to submit a bio, because in such case we do the
8571
- * corresponding error handling below and it must not be done a second
8572
- * time by btrfs_direct_IO().
8573
- */
8574
- if (write) {
8575
- struct btrfs_dio_data *dio_data = current->journal_info;
8576
-
8577
- dio_data->unsubmitted_oe_range_end = dip->logical_offset +
8578
- dip->bytes;
8579
- dio_data->unsubmitted_oe_range_start =
8580
- dio_data->unsubmitted_oe_range_end;
8581
- }
8582
-
8583
- ret = btrfs_submit_direct_hook(dip);
8584
- if (!ret)
8585
- return;
8586
-
8587
- if (io_bio->end_io)
8588
- io_bio->end_io(io_bio, ret);
8589
-
8590
-free_ordered:
8591
- /*
8592
- * If we arrived here it means either we failed to submit the dip
8593
- * or we either failed to clone the dio_bio or failed to allocate the
8594
- * dip. If we cloned the dio_bio and allocated the dip, we can just
8595
- * call bio_endio against our io_bio so that we get proper resource
8596
- * cleanup if we fail to submit the dip, otherwise, we must do the
8597
- * same as btrfs_endio_direct_[write|read] because we can't call these
8598
- * callbacks - they require an allocated dip and a clone of dio_bio.
8599
- */
8600
- if (bio && dip) {
8601
- bio_io_error(bio);
8602
- /*
8603
- * The end io callbacks free our dip, do the final put on bio
8604
- * and all the cleanup and final put for dio_bio (through
8605
- * dio_end_io()).
8606
- */
8607
- dip = NULL;
8608
- bio = NULL;
8609
- } else {
8610
- if (write)
8611
- __endio_write_update_ordered(inode,
8612
- file_offset,
8613
- dio_bio->bi_iter.bi_size,
8614
- false);
8615
- else
8616
- unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
8617
- file_offset + dio_bio->bi_iter.bi_size - 1);
8618
-
8619
- dio_bio->bi_status = BLK_STS_IOERR;
8620
- /*
8621
- * Releases and cleans up our dio_bio, no need to bio_put()
8622
- * nor bio_endio()/bio_io_error() against dio_bio.
8623
- */
8624
- dio_end_io(dio_bio);
8625
- }
8626
- if (bio)
8627
- bio_put(bio);
8628
- kfree(dip);
7973
+ dip->dio_bio->bi_status = status;
7974
+ btrfs_dio_private_put(dip);
7975
+ return BLK_QC_T_NONE;
86297976 }
86307977
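/*
 * Editor's sketch of the splitting loop above under an assumed layout:
 * on a RAID0 data profile with 64KiB stripes, a 1MiB dio bio starting on
 * a stripe boundary would be carved into sixteen 64KiB clones, each
 * btrfs_get_io_geometry() call reporting geom.len == 64KiB.  Every clone
 * but the last takes an extra dip reference (the last inherits the
 * initial one), and since more than one bio is submitted on a
 * non-RAID56 profile, checksumming goes through the async path.
 */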
86317978 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
....@@ -8661,37 +8008,63 @@
86618008 return retval;
86628009 }
86638010
8664
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8011
+static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size,
8012
+ int error, unsigned flags)
8013
+{
8014
+ /*
8015
+ * If we're still in the context of our submitter, we know we can't
8016
+ * safely run generic_write_sync(), so clear our flag here so that the
8017
+ * caller knows to follow up with a sync.
8018
+ */
8019
+ if (current->journal_info == BTRFS_DIO_SYNC_STUB) {
8020
+ current->journal_info = NULL;
8021
+ return error;
8022
+ }
8023
+
8024
+ if (error)
8025
+ return error;
8026
+
8027
+ if (size) {
8028
+ iocb->ki_flags |= IOCB_DSYNC;
8029
+ return generic_write_sync(iocb, size);
8030
+ }
8031
+
8032
+ return 0;
8033
+}
8034
+
8035
+static const struct iomap_ops btrfs_dio_iomap_ops = {
8036
+ .iomap_begin = btrfs_dio_iomap_begin,
8037
+ .iomap_end = btrfs_dio_iomap_end,
8038
+};
8039
+
8040
+static const struct iomap_dio_ops btrfs_dio_ops = {
8041
+ .submit_io = btrfs_submit_direct,
8042
+};
8043
+
8044
+static const struct iomap_dio_ops btrfs_sync_dops = {
8045
+ .submit_io = btrfs_submit_direct,
8046
+ .end_io = btrfs_maybe_fsync_end_io,
8047
+};
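/*
 * Editor's summary of how the ops tables above cooperate with
 * BTRFS_DIO_SYNC_STUB (the stub is set by the write path outside this
 * hunk; the flow below is reconstructed from the comments in this patch):
 *
 *   caller sets current->journal_info = BTRFS_DIO_SYNC_STUB
 *   -> btrfs_direct_IO() sees a journal_info and picks btrfs_sync_dops
 *      -> btrfs_dio_iomap_begin() clears journal_info so the enospc
 *         machinery still flushes, remembering it in dio_data->sync
 *      -> btrfs_dio_iomap_end() re-sets the stub for the endio
 *   -> btrfs_maybe_fsync_end_io(): if the stub is still set we completed
 *      in the submitter's context, so clear it and let the caller sync;
 *      otherwise set IOCB_DSYNC and run generic_write_sync().
 */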
8048
+
8049
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
86658050 {
86668051 struct file *file = iocb->ki_filp;
86678052 struct inode *inode = file->f_mapping->host;
86688053 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8669
- struct btrfs_dio_data dio_data = { 0 };
86708054 struct extent_changeset *data_reserved = NULL;
86718055 loff_t offset = iocb->ki_pos;
86728056 size_t count = 0;
8673
- int flags = 0;
8674
- bool wakeup = true;
86758057 bool relock = false;
86768058 ssize_t ret;
86778059
8678
- if (check_direct_IO(fs_info, iter, offset))
8060
+ if (check_direct_IO(fs_info, iter, offset)) {
8061
+ ASSERT(current->journal_info == NULL ||
8062
+ current->journal_info == BTRFS_DIO_SYNC_STUB);
8063
+ current->journal_info = NULL;
86798064 return 0;
8065
+ }
86808066
8681
- inode_dio_begin(inode);
8682
-
8683
- /*
8684
- * The generic stuff only does filemap_write_and_wait_range, which
8685
- * isn't enough if we've written compressed pages to this area, so
8686
- * we need to flush the dirty pages again to make absolutely sure
8687
- * that any outstanding dirty pages are on disk.
8688
- */
86898067 count = iov_iter_count(iter);
8690
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8691
- &BTRFS_I(inode)->runtime_flags))
8692
- filemap_fdatawrite_range(inode->i_mapping, offset,
8693
- offset + count - 1);
8694
-
86958068 if (iov_iter_rw(iter) == WRITE) {
86968069 /*
86978070 * If the write DIO is beyond the EOF, we need to update
....@@ -8699,65 +8072,29 @@
86998072 * not unlock the i_mutex in this case.
87008073 */
87018074 if (offset + count <= inode->i_size) {
8702
- dio_data.overwrite = 1;
87038075 inode_unlock(inode);
87048076 relock = true;
87058077 }
8706
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
8707
- offset, count);
8708
- if (ret)
8709
- goto out;
8710
-
8711
- /*
8712
- * We need to know how many extents we reserved so that we can
8713
- * do the accounting properly if we go over the number we
8714
- * originally calculated. Abuse current->journal_info for this.
8715
- */
8716
- dio_data.reserve = round_up(count,
8717
- fs_info->sectorsize);
8718
- dio_data.unsubmitted_oe_range_start = (u64)offset;
8719
- dio_data.unsubmitted_oe_range_end = (u64)offset;
8720
- current->journal_info = &dio_data;
87218078 down_read(&BTRFS_I(inode)->dio_sem);
8722
- } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
8723
- &BTRFS_I(inode)->runtime_flags)) {
8724
- inode_dio_end(inode);
8725
- flags = DIO_LOCKING | DIO_SKIP_HOLES;
8726
- wakeup = false;
87278079 }
87288080
8729
- ret = __blockdev_direct_IO(iocb, inode,
8730
- fs_info->fs_devices->latest_bdev,
8731
- iter, btrfs_get_blocks_direct, NULL,
8732
- btrfs_submit_direct, flags);
8733
- if (iov_iter_rw(iter) == WRITE) {
8081
+ /*
8082
+ * If we are actually a sync iocb, we need our fancy endio to know
8083
+ * if we need to sync.
8084
+ */
8085
+ if (current->journal_info)
8086
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
8087
+ &btrfs_sync_dops, is_sync_kiocb(iocb));
8088
+ else
8089
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
8090
+ &btrfs_dio_ops, is_sync_kiocb(iocb));
8091
+
8092
+ if (ret == -ENOTBLK)
8093
+ ret = 0;
8094
+
8095
+ if (iov_iter_rw(iter) == WRITE)
87348096 up_read(&BTRFS_I(inode)->dio_sem);
8735
- current->journal_info = NULL;
8736
- if (ret < 0 && ret != -EIOCBQUEUED) {
8737
- if (dio_data.reserve)
8738
- btrfs_delalloc_release_space(inode, data_reserved,
8739
- offset, dio_data.reserve, true);
8740
- /*
8741
- * On error we might have left some ordered extents
8742
- * without submitting corresponding bios for them, so
8743
- * cleanup them up to avoid other tasks getting them
8744
- * and waiting for them to complete forever.
8745
- */
8746
- if (dio_data.unsubmitted_oe_range_start <
8747
- dio_data.unsubmitted_oe_range_end)
8748
- __endio_write_update_ordered(inode,
8749
- dio_data.unsubmitted_oe_range_start,
8750
- dio_data.unsubmitted_oe_range_end -
8751
- dio_data.unsubmitted_oe_range_start,
8752
- false);
8753
- } else if (ret >= 0 && (size_t)ret < count)
8754
- btrfs_delalloc_release_space(inode, data_reserved,
8755
- offset, count - (size_t)ret, true);
8756
- btrfs_delalloc_release_extents(BTRFS_I(inode), count);
8757
- }
8758
-out:
8759
- if (wakeup)
8760
- inode_dio_end(inode);
8097
+
87618098 if (relock)
87628099 inode_lock(inode);
87638100
....@@ -8765,25 +8102,33 @@
87658102 return ret;
87668103 }
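One subtlety above: iomap_dio_rw() returns -ENOTBLK when the direct I/O could not be attempted at all (for example when page cache invalidation fails), and squashing that to 0 yields a short result that the write path can finish with buffered I/O. A hypothetical caller shape (the real fallback logic lives in file.c; example_buffered_fallback() is illustrative only):

	written = btrfs_direct_IO(iocb, from);
	if (written >= 0 && iov_iter_count(from) > 0)
		/* direct IO refused or cut short: finish with buffered IO */
		written = example_buffered_fallback(iocb, from, written);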
87678104
8768
-#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
8769
-
87708105 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8771
- __u64 start, __u64 len)
8106
+ u64 start, u64 len)
87728107 {
87738108 int ret;
87748109
8775
- ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
8110
+ ret = fiemap_prep(inode, fieinfo, start, &len, 0);
87768111 if (ret)
87778112 return ret;
87788113
8779
- return extent_fiemap(inode, fieinfo, start, len);
8114
+ return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
87808115 }
87818116
87828117 int btrfs_readpage(struct file *file, struct page *page)
87838118 {
8784
- struct extent_io_tree *tree;
8785
- tree = &BTRFS_I(page->mapping->host)->io_tree;
8786
- return extent_read_full_page(tree, page, btrfs_get_extent, 0);
8119
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
8120
+ u64 start = page_offset(page);
8121
+ u64 end = start + PAGE_SIZE - 1;
8122
+ unsigned long bio_flags = 0;
8123
+ struct bio *bio = NULL;
8124
+ int ret;
8125
+
8126
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
8127
+
8128
+ ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
8129
+ if (bio)
8130
+ ret = submit_one_bio(bio, 0, bio_flags);
8131
+ return ret;
87878132 }
87888133
87898134 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
....@@ -8817,21 +8162,16 @@
88178162 return extent_writepages(mapping, wbc);
88188163 }
88198164
8820
-static int
8821
-btrfs_readpages(struct file *file, struct address_space *mapping,
8822
- struct list_head *pages, unsigned nr_pages)
8165
+static void btrfs_readahead(struct readahead_control *rac)
88238166 {
8824
- return extent_readpages(mapping, pages, nr_pages);
8167
+ extent_readahead(rac);
88258168 }
88268169
88278170 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
88288171 {
88298172 int ret = try_release_extent_mapping(page, gfp_flags);
8830
- if (ret == 1) {
8831
- ClearPagePrivate(page);
8832
- set_page_private(page, 0);
8833
- put_page(page);
8834
- }
8173
+ if (ret == 1)
8174
+ detach_page_private(page);
88358175 return ret;
88368176 }
88378177
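Several hunks in this patch replace the open-coded ClearPagePrivate()/set_page_private()/put_page() triplets with the generic page-private helpers. For reference, a simplified sketch of what those helpers do (modelled on their include/linux/pagemap.h definitions around this kernel version; the _sketch suffix is only to avoid clashing with the real names):

static inline void attach_page_private_sketch(struct page *page, void *data)
{
	get_page(page);			/* the private data pins the page */
	set_page_private(page, (unsigned long)data);
	SetPagePrivate(page);
}

static inline void *detach_page_private_sketch(struct page *page)
{
	void *data;

	if (!PagePrivate(page))
		return NULL;
	data = (void *)page_private(page);
	ClearPagePrivate(page);
	set_page_private(page, 0);
	put_page(page);			/* drop the reference taken at attach */
	return data;
}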
....@@ -8842,18 +8182,45 @@
88428182 return __btrfs_releasepage(page, gfp_flags);
88438183 }
88448184
8185
+#ifdef CONFIG_MIGRATION
8186
+static int btrfs_migratepage(struct address_space *mapping,
8187
+ struct page *newpage, struct page *page,
8188
+ enum migrate_mode mode)
8189
+{
8190
+ int ret;
8191
+
8192
+ ret = migrate_page_move_mapping(mapping, newpage, page, 0);
8193
+ if (ret != MIGRATEPAGE_SUCCESS)
8194
+ return ret;
8195
+
8196
+ if (page_has_private(page))
8197
+ attach_page_private(newpage, detach_page_private(page));
8198
+
8199
+ if (PagePrivate2(page)) {
8200
+ ClearPagePrivate2(page);
8201
+ SetPagePrivate2(newpage);
8202
+ }
8203
+
8204
+ if (mode != MIGRATE_SYNC_NO_COPY)
8205
+ migrate_page_copy(newpage, page);
8206
+ else
8207
+ migrate_page_states(newpage, page);
8208
+ return MIGRATEPAGE_SUCCESS;
8209
+}
8210
+#endif
8211
+
88458212 static void btrfs_invalidatepage(struct page *page, unsigned int offset,
88468213 unsigned int length)
88478214 {
8848
- struct inode *inode = page->mapping->host;
8849
- struct extent_io_tree *tree;
8215
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
8216
+ struct extent_io_tree *tree = &inode->io_tree;
88508217 struct btrfs_ordered_extent *ordered;
88518218 struct extent_state *cached_state = NULL;
88528219 u64 page_start = page_offset(page);
88538220 u64 page_end = page_start + PAGE_SIZE - 1;
88548221 u64 start;
88558222 u64 end;
8856
- int inode_evicting = inode->i_state & I_FREEING;
8223
+ int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
88578224
88588225 /*
88598226 * we have the page locked, so new writeback can't start,
....@@ -8864,28 +8231,39 @@
88648231 */
88658232 wait_on_page_writeback(page);
88668233
8867
- tree = &BTRFS_I(inode)->io_tree;
8868
- if (offset) {
8234
+ /*
8235
+ * For the subpage case, we have call sites like
8236
+ * btrfs_punch_hole_lock_range() which pass in ranges that are not
8237
+ * aligned to the sectorsize.
8238
+ * If the range doesn't cover the full page, we don't need to and
8239
+ * shouldn't clear the page's extent-mapped state, as page->private
8240
+ * can still record subpage dirty bits for other parts of the range.
8241
+ *
8242
+ * For cases that invalidate the full page even though the range
8243
+ * doesn't cover it, like invalidating the last page, we're still
8244
+ * safe to wait for the ordered extent to finish.
8245
+ */
8246
+ if (!(offset == 0 && length == PAGE_SIZE)) {
88698247 btrfs_releasepage(page, GFP_NOFS);
88708248 return;
88718249 }
88728250
88738251 if (!inode_evicting)
88748252 lock_extent_bits(tree, page_start, page_end, &cached_state);
8875
-again:
8253
+
88768254 start = page_start;
8877
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
8878
- page_end - start + 1);
8255
+again:
8256
+ ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
88798257 if (ordered) {
8880
- end = min(page_end, ordered->file_offset + ordered->len - 1);
8258
+ end = min(page_end,
8259
+ ordered->file_offset + ordered->num_bytes - 1);
88818260 /*
88828261 * IO on this page will never be started, so we need
88838262 * to account for any ordered extents now
88848263 */
88858264 if (!inode_evicting)
88868265 clear_extent_bit(tree, start, end,
8887
- EXTENT_DIRTY | EXTENT_DELALLOC |
8888
- EXTENT_DELALLOC_NEW |
8266
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
88898267 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
88908268 EXTENT_DEFRAG, 1, 0, &cached_state);
88918269 /*
....@@ -8896,7 +8274,7 @@
88968274 struct btrfs_ordered_inode_tree *tree;
88978275 u64 new_len;
88988276
8899
- tree = &BTRFS_I(inode)->ordered_tree;
8277
+ tree = &inode->ordered_tree;
89008278
89018279 spin_lock_irq(&tree->lock);
89028280 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
....@@ -8937,8 +8315,7 @@
89378315 */
89388316 btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
89398317 if (!inode_evicting) {
8940
- clear_extent_bit(tree, page_start, page_end,
8941
- EXTENT_LOCKED | EXTENT_DIRTY |
8318
+ clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
89428319 EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
89438320 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
89448321 &cached_state);
....@@ -8947,11 +8324,7 @@
89478324 }
89488325
89498326 ClearPageChecked(page);
8950
- if (PagePrivate(page)) {
8951
- ClearPagePrivate(page);
8952
- set_page_private(page, 0);
8953
- put_page(page);
8954
- }
8327
+ detach_page_private(page);
89558328 }
89568329
89578330 /*
....@@ -9004,8 +8377,8 @@
90048377 * end up waiting indefinitely to get a lock on the page currently
90058378 * being processed by btrfs_page_mkwrite() function.
90068379 */
9007
- ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
9008
- reserved_space);
8380
+ ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
8381
+ page_start, reserved_space);
90098382 if (!ret2) {
90108383 ret2 = file_update_time(vmf->vma->vm_file);
90118384 reserved = 1;
....@@ -9042,7 +8415,7 @@
90428415 unlock_extent_cached(io_tree, page_start, page_end,
90438416 &cached_state);
90448417 unlock_page(page);
9045
- btrfs_start_ordered_extent(inode, ordered, 1);
8418
+ btrfs_start_ordered_extent(ordered, 1);
90468419 btrfs_put_ordered_extent(ordered);
90478420 goto again;
90488421 }
....@@ -9052,9 +8425,9 @@
90528425 fs_info->sectorsize);
90538426 if (reserved_space < PAGE_SIZE) {
90548427 end = page_start + reserved_space - 1;
9055
- btrfs_delalloc_release_space(inode, data_reserved,
9056
- page_start, PAGE_SIZE - reserved_space,
9057
- true);
8428
+ btrfs_delalloc_release_space(BTRFS_I(inode),
8429
+ data_reserved, page_start,
8430
+ PAGE_SIZE - reserved_space, true);
90588431 }
90598432 }
90608433
....@@ -9066,23 +8439,21 @@
90668439 * reserve data&meta space before lock_page() (see above comments).
90678440 */
90688441 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
9069
- EXTENT_DIRTY | EXTENT_DELALLOC |
9070
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
9071
- 0, 0, &cached_state);
8442
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
8443
+ EXTENT_DEFRAG, 0, 0, &cached_state);
90728444
9073
- ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0,
9074
- &cached_state, 0);
8445
+ ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
8446
+ &cached_state);
90758447 if (ret2) {
90768448 unlock_extent_cached(io_tree, page_start, page_end,
90778449 &cached_state);
90788450 ret = VM_FAULT_SIGBUS;
90798451 goto out_unlock;
90808452 }
9081
- ret2 = 0;
90828453
90838454 /* page is wholly or partially inside EOF */
90848455 if (page_start + PAGE_SIZE > size)
9085
- zero_start = size & ~PAGE_MASK;
8456
+ zero_start = offset_in_page(size);
90868457 else
90878458 zero_start = PAGE_SIZE;
90888459
....@@ -9096,24 +8467,20 @@
90968467 set_page_dirty(page);
90978468 SetPageUptodate(page);
90988469
9099
- BTRFS_I(inode)->last_trans = fs_info->generation;
9100
- BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
9101
- BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
8470
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
91028471
91038472 unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
91048473
9105
- if (!ret2) {
9106
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
9107
- sb_end_pagefault(inode->i_sb);
9108
- extent_changeset_free(data_reserved);
9109
- return VM_FAULT_LOCKED;
9110
- }
8474
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8475
+ sb_end_pagefault(inode->i_sb);
8476
+ extent_changeset_free(data_reserved);
8477
+ return VM_FAULT_LOCKED;
91118478
91128479 out_unlock:
91138480 unlock_page(page);
91148481 out:
91158482 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
9116
- btrfs_delalloc_release_space(inode, data_reserved, page_start,
8483
+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
91178484 reserved_space, (ret != 0));
91188485 out_noreserve:
91198486 sb_end_pagefault(inode->i_sb);
....@@ -9129,7 +8496,7 @@
91298496 int ret;
91308497 struct btrfs_trans_handle *trans;
91318498 u64 mask = fs_info->sectorsize - 1;
9132
- u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
8499
+ u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
91338500
91348501 if (!skip_writeback) {
91358502 ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
....@@ -9184,7 +8551,7 @@
91848551
91858552 /* Migrate the slack space for the truncate to our reserve */
91868553 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
9187
- min_size, 0);
8554
+ min_size, false);
91888555 BUG_ON(ret);
91898556
91908557 /*
....@@ -9219,9 +8586,9 @@
92198586 break;
92208587 }
92218588
9222
- btrfs_block_rsv_release(fs_info, rsv, -1);
8589
+ btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
92238590 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
9224
- rsv, min_size, 0);
8591
+ rsv, min_size, false);
92258592 BUG_ON(ret); /* shouldn't happen */
92268593 trans->block_rsv = rsv;
92278594 }
....@@ -9244,7 +8611,7 @@
92448611 ret = PTR_ERR(trans);
92458612 goto out;
92468613 }
9247
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
8614
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
92488615 }
92498616
92508617 if (trans) {
....@@ -9327,7 +8694,7 @@
93278694 ei->index_cnt = (u64)-1;
93288695 ei->dir_index = 0;
93298696 ei->last_unlink_trans = 0;
9330
- ei->last_link_trans = 0;
8697
+ ei->last_reflink_trans = 0;
93318698 ei->last_log_commit = 0;
93328699
93338700 spin_lock_init(&ei->lock);
....@@ -9346,13 +8713,15 @@
93468713
93478714 inode = &ei->vfs_inode;
93488715 extent_map_tree_init(&ei->extent_tree);
9349
- extent_io_tree_init(&ei->io_tree, inode);
9350
- extent_io_tree_init(&ei->io_failure_tree, inode);
9351
- ei->io_tree.track_uptodate = 1;
9352
- ei->io_failure_tree.track_uptodate = 1;
8716
+ extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
8717
+ extent_io_tree_init(fs_info, &ei->io_failure_tree,
8718
+ IO_TREE_INODE_IO_FAILURE, inode);
8719
+ extent_io_tree_init(fs_info, &ei->file_extent_tree,
8720
+ IO_TREE_INODE_FILE_EXTENT, inode);
8721
+ ei->io_tree.track_uptodate = true;
8722
+ ei->io_failure_tree.track_uptodate = true;
93538723 atomic_set(&ei->sync_writers, 0);
93548724 mutex_init(&ei->log_mutex);
9355
- mutex_init(&ei->delalloc_mutex);
93568725 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
93578726 INIT_LIST_HEAD(&ei->delalloc_inodes);
93588727 INIT_LIST_HEAD(&ei->delayed_iput);
....@@ -9370,27 +8739,26 @@
93708739 }
93718740 #endif
93728741
9373
-static void btrfs_i_callback(struct rcu_head *head)
8742
+void btrfs_free_inode(struct inode *inode)
93748743 {
9375
- struct inode *inode = container_of(head, struct inode, i_rcu);
93768744 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
93778745 }
93788746
9379
-void btrfs_destroy_inode(struct inode *inode)
8747
+void btrfs_destroy_inode(struct inode *vfs_inode)
93808748 {
9381
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
93828749 struct btrfs_ordered_extent *ordered;
9383
- struct btrfs_root *root = BTRFS_I(inode)->root;
8750
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
8751
+ struct btrfs_root *root = inode->root;
93848752
9385
- WARN_ON(!hlist_empty(&inode->i_dentry));
9386
- WARN_ON(inode->i_data.nrpages);
9387
- WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
9388
- WARN_ON(BTRFS_I(inode)->block_rsv.size);
9389
- WARN_ON(BTRFS_I(inode)->outstanding_extents);
9390
- WARN_ON(BTRFS_I(inode)->delalloc_bytes);
9391
- WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
9392
- WARN_ON(BTRFS_I(inode)->csum_bytes);
9393
- WARN_ON(BTRFS_I(inode)->defrag_bytes);
8753
+ WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
8754
+ WARN_ON(vfs_inode->i_data.nrpages);
8755
+ WARN_ON(inode->block_rsv.reserved);
8756
+ WARN_ON(inode->block_rsv.size);
8757
+ WARN_ON(inode->outstanding_extents);
8758
+ WARN_ON(inode->delalloc_bytes);
8759
+ WARN_ON(inode->new_delalloc_bytes);
8760
+ WARN_ON(inode->csum_bytes);
8761
+ WARN_ON(inode->defrag_bytes);
93948762
93958763 /*
93968764 * This can happen where we create an inode, but somebody else also
....@@ -9398,16 +8766,16 @@
93988766 * created.
93998767 */
94008768 if (!root)
9401
- goto free;
8769
+ return;
94028770
94038771 while (1) {
94048772 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
94058773 if (!ordered)
94068774 break;
94078775 else {
9408
- btrfs_err(fs_info,
8776
+ btrfs_err(root->fs_info,
94098777 "found ordered extent %llu %llu on inode cleanup",
9410
- ordered->file_offset, ordered->len);
8778
+ ordered->file_offset, ordered->num_bytes);
94118779 btrfs_remove_ordered_extent(inode, ordered);
94128780 btrfs_put_ordered_extent(ordered);
94138781 btrfs_put_ordered_extent(ordered);
....@@ -9415,9 +8783,9 @@
94158783 }
94168784 btrfs_qgroup_check_reserved_leak(inode);
94178785 inode_tree_del(inode);
9418
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
9419
-free:
9420
- call_rcu(&inode->i_rcu, btrfs_i_callback);
8786
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
8787
+ btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
8788
+ btrfs_put_root(inode->root);
94218789 }
94228790
94238791 int btrfs_drop_inode(struct inode *inode)
....@@ -9542,19 +8910,15 @@
95428910 struct inode *new_inode = new_dentry->d_inode;
95438911 struct inode *old_inode = old_dentry->d_inode;
95448912 struct timespec64 ctime = current_time(old_inode);
9545
- struct dentry *parent;
95468913 u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
95478914 u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
95488915 u64 old_idx = 0;
95498916 u64 new_idx = 0;
95508917 int ret;
8918
+ int ret2;
95518919 bool root_log_pinned = false;
95528920 bool dest_log_pinned = false;
9553
- struct btrfs_log_ctx ctx_root;
9554
- struct btrfs_log_ctx ctx_dest;
9555
- bool sync_log_root = false;
9556
- bool sync_log_dest = false;
9557
- bool commit_transaction = false;
8921
+ bool need_abort = false;
95588922
95598923 /*
95608924 * For non-subvolumes allow exchange only within one subvolume, in the
....@@ -9565,9 +8929,6 @@
95658929 (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
95668930 new_ino != BTRFS_FIRST_FREE_OBJECTID))
95678931 return -EXDEV;
9568
-
9569
- btrfs_init_log_ctx(&ctx_root, old_inode);
9570
- btrfs_init_log_ctx(&ctx_dest, new_inode);
95718932
95728933 /* close the race window with snapshot create/destroy ioctl */
95738934 if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
....@@ -9608,7 +8969,7 @@
96088969 /* Reference for the source. */
96098970 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
96108971 /* force full log commit if subvolume involved. */
9611
- btrfs_set_log_full_commit(fs_info, trans);
8972
+ btrfs_set_log_full_commit(trans);
96128973 } else {
96138974 btrfs_pin_log_trans(root);
96148975 root_log_pinned = true;
....@@ -9620,12 +8981,13 @@
96208981 old_idx);
96218982 if (ret)
96228983 goto out_fail;
8984
+ need_abort = true;
96238985 }
96248986
96258987 /* And now for the dest. */
96268988 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
96278989 /* force full log commit if subvolume involved. */
9628
- btrfs_set_log_full_commit(fs_info, trans);
8990
+ btrfs_set_log_full_commit(trans);
96298991 } else {
96308992 btrfs_pin_log_trans(dest);
96318993 dest_log_pinned = true;
....@@ -9635,8 +8997,11 @@
96358997 new_ino,
96368998 btrfs_ino(BTRFS_I(old_dir)),
96378999 new_idx);
9638
- if (ret)
9000
+ if (ret) {
9001
+ if (need_abort)
9002
+ btrfs_abort_transaction(trans, ret);
96399003 goto out_fail;
9004
+ }
96409005 }
96419006
96429007 /* Update inode version and ctime/mtime. */
....@@ -9710,30 +9075,14 @@
97109075 BTRFS_I(new_inode)->dir_index = new_idx;
97119076
97129077 if (root_log_pinned) {
9713
- parent = new_dentry->d_parent;
9714
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
9715
- BTRFS_I(old_dir), parent,
9716
- false, &ctx_root);
9717
- if (ret == BTRFS_NEED_LOG_SYNC)
9718
- sync_log_root = true;
9719
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
9720
- commit_transaction = true;
9721
- ret = 0;
9078
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
9079
+ new_dentry->d_parent);
97229080 btrfs_end_log_trans(root);
97239081 root_log_pinned = false;
97249082 }
97259083 if (dest_log_pinned) {
9726
- if (!commit_transaction) {
9727
- parent = old_dentry->d_parent;
9728
- ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
9729
- BTRFS_I(new_dir), parent,
9730
- false, &ctx_dest);
9731
- if (ret == BTRFS_NEED_LOG_SYNC)
9732
- sync_log_dest = true;
9733
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
9734
- commit_transaction = true;
9735
- ret = 0;
9736
- }
9084
+ btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
9085
+ old_dentry->d_parent);
97379086 btrfs_end_log_trans(dest);
97389087 dest_log_pinned = false;
97399088 }
....@@ -9755,7 +9104,7 @@
97559104 btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
97569105 (new_inode &&
97579106 btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
9758
- btrfs_set_log_full_commit(fs_info, trans);
9107
+ btrfs_set_log_full_commit(trans);
97599108
97609109 if (root_log_pinned) {
97619110 btrfs_end_log_trans(root);
....@@ -9766,45 +9115,12 @@
97669115 dest_log_pinned = false;
97679116 }
97689117 }
9769
- if (!ret && sync_log_root && !commit_transaction) {
9770
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
9771
- &ctx_root);
9772
- if (ret)
9773
- commit_transaction = true;
9774
- }
9775
- if (!ret && sync_log_dest && !commit_transaction) {
9776
- ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
9777
- &ctx_dest);
9778
- if (ret)
9779
- commit_transaction = true;
9780
- }
9781
- if (commit_transaction) {
9782
- /*
9783
- * We may have set commit_transaction when logging the new name
9784
- * in the destination root, in which case we left the source
9785
- * root context in the list of log contextes. So make sure we
9786
- * remove it to avoid invalid memory accesses, since the context
9787
- * was allocated in our stack frame.
9788
- */
9789
- if (sync_log_root) {
9790
- mutex_lock(&root->log_mutex);
9791
- list_del_init(&ctx_root.list);
9792
- mutex_unlock(&root->log_mutex);
9793
- }
9794
- ret = btrfs_commit_transaction(trans);
9795
- } else {
9796
- int ret2;
9797
-
9798
- ret2 = btrfs_end_transaction(trans);
9799
- ret = ret ? ret : ret2;
9800
- }
9118
+ ret2 = btrfs_end_transaction(trans);
9119
+ ret = ret ? ret : ret2;
98019120 out_notrans:
98029121 if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
98039122 old_ino == BTRFS_FIRST_FREE_OBJECTID)
98049123 up_read(&fs_info->subvol_sem);
9805
-
9806
- ASSERT(list_empty(&ctx_root.list));
9807
- ASSERT(list_empty(&ctx_dest.list));
98089124
98099125 return ret;
98109126 }
....@@ -9819,7 +9135,7 @@
98199135 u64 objectid;
98209136 u64 index;
98219137
9822
- ret = btrfs_find_free_ino(root, &objectid);
9138
+ ret = btrfs_find_free_objectid(root, &objectid);
98239139 if (ret)
98249140 return ret;
98259141
....@@ -9873,11 +9189,9 @@
98739189 struct inode *old_inode = d_inode(old_dentry);
98749190 u64 index = 0;
98759191 int ret;
9192
+ int ret2;
98769193 u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
98779194 bool log_pinned = false;
9878
- struct btrfs_log_ctx ctx;
9879
- bool sync_log = false;
9880
- bool commit_transaction = false;
98819195
98829196 if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
98839197 return -EPERM;
....@@ -9954,7 +9268,7 @@
99549268 BTRFS_I(old_inode)->dir_index = 0ULL;
99559269 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
99569270 /* force full log commit if subvolume involved. */
9957
- btrfs_set_log_full_commit(fs_info, trans);
9271
+ btrfs_set_log_full_commit(trans);
99589272 } else {
99599273 btrfs_pin_log_trans(root);
99609274 log_pinned = true;
....@@ -10027,17 +9341,8 @@
100279341 BTRFS_I(old_inode)->dir_index = index;
100289342
100299343 if (log_pinned) {
10030
- struct dentry *parent = new_dentry->d_parent;
10031
-
10032
- btrfs_init_log_ctx(&ctx, old_inode);
10033
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
10034
- BTRFS_I(old_dir), parent,
10035
- false, &ctx);
10036
- if (ret == BTRFS_NEED_LOG_SYNC)
10037
- sync_log = true;
10038
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
10039
- commit_transaction = true;
10040
- ret = 0;
9344
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
9345
+ new_dentry->d_parent);
100419346 btrfs_end_log_trans(root);
100429347 log_pinned = false;
100439348 }
....@@ -10069,28 +9374,13 @@
100699374 btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
100709375 (new_inode &&
100719376 btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
10072
- btrfs_set_log_full_commit(fs_info, trans);
9377
+ btrfs_set_log_full_commit(trans);
100739378
100749379 btrfs_end_log_trans(root);
100759380 log_pinned = false;
100769381 }
10077
- if (!ret && sync_log) {
10078
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
10079
- if (ret)
10080
- commit_transaction = true;
10081
- } else if (sync_log) {
10082
- mutex_lock(&root->log_mutex);
10083
- list_del(&ctx.list);
10084
- mutex_unlock(&root->log_mutex);
10085
- }
10086
- if (commit_transaction) {
10087
- ret = btrfs_commit_transaction(trans);
10088
- } else {
10089
- int ret2;
10090
-
10091
- ret2 = btrfs_end_transaction(trans);
10092
- ret = ret ? ret : ret2;
10093
- }
9382
+ ret2 = btrfs_end_transaction(trans);
9383
+ ret = ret ? ret : ret2;
100949384 out_notrans:
100959385 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
100969386 up_read(&fs_info->subvol_sem);
....@@ -10147,9 +9437,7 @@
101479437 init_completion(&work->completion);
101489438 INIT_LIST_HEAD(&work->list);
101499439 work->inode = inode;
10150
- WARN_ON_ONCE(!inode);
10151
- btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
10152
- btrfs_run_delalloc_work, NULL, NULL);
9440
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
101539441
101549442 return work;
101559443 }
....@@ -10158,7 +9446,9 @@
101589446 * some fairly slow code that needs optimization. This walks the list
101599447 * of all the inodes with pending delalloc and forces them to disk.
101609448 */
10161
-static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
9449
+static int start_delalloc_inodes(struct btrfs_root *root,
9450
+ struct writeback_control *wbc, bool snapshot,
9451
+ bool in_reclaim_context)
101629452 {
101639453 struct btrfs_inode *binode;
101649454 struct inode *inode;
....@@ -10166,6 +9456,7 @@
101669456 struct list_head works;
101679457 struct list_head splice;
101689458 int ret = 0;
9459
+ bool full_flush = wbc->nr_to_write == LONG_MAX;
101699460
101709461 INIT_LIST_HEAD(&works);
101719462 INIT_LIST_HEAD(&splice);
....@@ -10179,6 +9470,11 @@
101799470
101809471 list_move_tail(&binode->delalloc_inodes,
101819472 &root->delalloc_inodes);
9473
+
9474
+ if (in_reclaim_context &&
9475
+ test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
9476
+ continue;
9477
+
101829478 inode = igrab(&binode->vfs_inode);
101839479 if (!inode) {
101849480 cond_resched_lock(&root->delalloc_lock);
....@@ -10189,18 +9485,26 @@
101899485 if (snapshot)
101909486 set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
101919487 &binode->runtime_flags);
10192
- work = btrfs_alloc_delalloc_work(inode);
10193
- if (!work) {
10194
- iput(inode);
10195
- ret = -ENOMEM;
10196
- goto out;
9488
+ if (full_flush) {
9489
+ work = btrfs_alloc_delalloc_work(inode);
9490
+ if (!work) {
9491
+ iput(inode);
9492
+ ret = -ENOMEM;
9493
+ goto out;
9494
+ }
9495
+ list_add_tail(&work->list, &works);
9496
+ btrfs_queue_work(root->fs_info->flush_workers,
9497
+ &work->work);
9498
+ } else {
9499
+ ret = sync_inode(inode, wbc);
9500
+ if (!ret &&
9501
+ test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
9502
+ &BTRFS_I(inode)->runtime_flags))
9503
+ ret = sync_inode(inode, wbc);
9504
+ btrfs_add_delayed_iput(inode);
9505
+ if (ret || wbc->nr_to_write <= 0)
9506
+ goto out;
101979507 }
10198
- list_add_tail(&work->list, &works);
10199
- btrfs_queue_work(root->fs_info->flush_workers,
10200
- &work->work);
10201
- ret++;
10202
- if (nr != -1 && ret >= nr)
10203
- goto out;
102049508 cond_resched();
102059509 spin_lock(&root->delalloc_lock);
102069510 }
....@@ -10224,20 +9528,29 @@
102249528
102259529 int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
102269530 {
9531
+ struct writeback_control wbc = {
9532
+ .nr_to_write = LONG_MAX,
9533
+ .sync_mode = WB_SYNC_NONE,
9534
+ .range_start = 0,
9535
+ .range_end = LLONG_MAX,
9536
+ };
102279537 struct btrfs_fs_info *fs_info = root->fs_info;
10228
- int ret;
102299538
102309539 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
102319540 return -EROFS;
102329541
10233
- ret = start_delalloc_inodes(root, -1, true);
10234
- if (ret > 0)
10235
- ret = 0;
10236
- return ret;
9542
+ return start_delalloc_inodes(root, &wbc, true, false);
102379543 }
102389544
10239
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
9545
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
9546
+ bool in_reclaim_context)
102409547 {
9548
+ struct writeback_control wbc = {
9549
+ .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr,
9550
+ .sync_mode = WB_SYNC_NONE,
9551
+ .range_start = 0,
9552
+ .range_end = LLONG_MAX,
9553
+ };
102419554 struct btrfs_root *root;
102429555 struct list_head splice;
102439556 int ret;
....@@ -10251,23 +9564,25 @@
102519564 spin_lock(&fs_info->delalloc_root_lock);
102529565 list_splice_init(&fs_info->delalloc_roots, &splice);
102539566 while (!list_empty(&splice) && nr) {
9567
+ /*
9568
+ * Reset nr_to_write here so that start_delalloc_inodes() treats
9569
+ * each root as a full flush.
9570
+ */
9571
+ if (nr == U64_MAX)
9572
+ wbc.nr_to_write = LONG_MAX;
9573
+
102549574 root = list_first_entry(&splice, struct btrfs_root,
102559575 delalloc_root);
10256
- root = btrfs_grab_fs_root(root);
9576
+ root = btrfs_grab_root(root);
102579577 BUG_ON(!root);
102589578 list_move_tail(&root->delalloc_root,
102599579 &fs_info->delalloc_roots);
102609580 spin_unlock(&fs_info->delalloc_root_lock);
102619581
10262
- ret = start_delalloc_inodes(root, nr, false);
10263
- btrfs_put_fs_root(root);
10264
- if (ret < 0)
9582
+ ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
9583
+ btrfs_put_root(root);
9584
+ if (ret < 0 || wbc.nr_to_write <= 0)
102659585 goto out;
10266
-
10267
- if (nr != -1) {
10268
- nr -= ret;
10269
- WARN_ON(nr < 0);
10270
- }
102719586 spin_lock(&fs_info->delalloc_root_lock);
102729587 }
102739588 spin_unlock(&fs_info->delalloc_root_lock);
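A hedged usage sketch for the reworked interface: nr is a page count that seeds wbc.nr_to_write, and in_reclaim_context makes the walk skip inodes flagged BTRFS_INODE_NO_DELALLOC_FLUSH. The values below are illustrative only:

	/* From a reclaim path: flush up to 512 pages of delalloc. */
	ret = btrfs_start_delalloc_roots(fs_info, 512, true);
	if (ret < 0)
		return ret;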
....@@ -10316,7 +9631,7 @@
103169631 if (IS_ERR(trans))
103179632 return PTR_ERR(trans);
103189633
10319
- err = btrfs_find_free_ino(root, &objectid);
9634
+ err = btrfs_find_free_objectid(root, &objectid);
103209635 if (err)
103219636 goto out_unlock;
103229637
....@@ -10338,7 +9653,6 @@
103389653 inode->i_fop = &btrfs_file_operations;
103399654 inode->i_op = &btrfs_file_inode_operations;
103409655 inode->i_mapping->a_ops = &btrfs_aops;
10341
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
103429656
103439657 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
103449658 if (err)
....@@ -10377,7 +9691,6 @@
103779691
103789692 inode->i_op = &btrfs_symlink_inode_operations;
103799693 inode_nohighmem(inode);
10380
- inode->i_mapping->a_ops = &btrfs_symlink_aops;
103819694 inode_set_bytes(inode, name_len);
103829695 btrfs_i_size_write(BTRFS_I(inode), name_len);
103839696 err = btrfs_update_inode(trans, root, inode);
....@@ -10404,6 +9717,65 @@
104049717 return err;
104059718 }
104069719
9720
+static struct btrfs_trans_handle *insert_prealloc_file_extent(
9721
+ struct btrfs_trans_handle *trans_in,
9722
+ struct inode *inode, struct btrfs_key *ins,
9723
+ u64 file_offset)
9724
+{
9725
+ struct btrfs_file_extent_item stack_fi;
9726
+ struct btrfs_replace_extent_info extent_info;
9727
+ struct btrfs_trans_handle *trans = trans_in;
9728
+ struct btrfs_path *path;
9729
+ u64 start = ins->objectid;
9730
+ u64 len = ins->offset;
9731
+ int ret;
9732
+
9733
+ memset(&stack_fi, 0, sizeof(stack_fi));
9734
+
9735
+ btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
9736
+ btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
9737
+ btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
9738
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
9739
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
9740
+ btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
9741
+ /* Encryption and other encoding are reserved and all 0 */
9742
+
9743
+ ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len);
9744
+ if (ret < 0)
9745
+ return ERR_PTR(ret);
9746
+
9747
+ if (trans) {
9748
+ ret = insert_reserved_file_extent(trans, BTRFS_I(inode),
9749
+ file_offset, &stack_fi, ret);
9750
+ if (ret)
9751
+ return ERR_PTR(ret);
9752
+ return trans;
9753
+ }
9754
+
9755
+ extent_info.disk_offset = start;
9756
+ extent_info.disk_len = len;
9757
+ extent_info.data_offset = 0;
9758
+ extent_info.data_len = len;
9759
+ extent_info.file_offset = file_offset;
9760
+ extent_info.extent_buf = (char *)&stack_fi;
9761
+ extent_info.is_new_extent = true;
9762
+ extent_info.qgroup_reserved = ret;
9763
+ extent_info.insertions = 0;
9764
+
9765
+ path = btrfs_alloc_path();
9766
+ if (!path)
9767
+ return ERR_PTR(-ENOMEM);
9768
+
9769
+ ret = btrfs_replace_file_extents(inode, path, file_offset,
9770
+ file_offset + len - 1, &extent_info,
9771
+ &trans);
9772
+ btrfs_free_path(path);
9773
+ if (ret)
9774
+ return ERR_PTR(ret);
9775
+
9776
+ return trans;
9777
+}
9778
+
104079779 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
104089780 u64 start, u64 num_bytes, u64 min_size,
104099781 loff_t actual_len, u64 *alloc_hint,
....@@ -10426,14 +9798,6 @@
104269798 if (trans)
104279799 own_trans = false;
104289800 while (num_bytes > 0) {
10429
- if (own_trans) {
10430
- trans = btrfs_start_transaction(root, 3);
10431
- if (IS_ERR(trans)) {
10432
- ret = PTR_ERR(trans);
10433
- break;
10434
- }
10435
- }
10436
-
104379801 cur_bytes = min_t(u64, num_bytes, SZ_256M);
104389802 cur_bytes = max(cur_bytes, min_size);
104399803 /*
....@@ -10445,11 +9809,8 @@
104459809 cur_bytes = min(cur_bytes, last_alloc);
104469810 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
104479811 min_size, 0, *alloc_hint, &ins, 1, 0);
10448
- if (ret) {
10449
- if (own_trans)
10450
- btrfs_end_transaction(trans);
9812
+ if (ret)
104519813 break;
10452
- }
104539814
104549815 /*
104559816 * We've reserved this space, and thus converted it from
....@@ -10459,20 +9820,20 @@
104599820 * clear_offset by our extent size.
104609821 */
104619822 clear_offset += ins.offset;
10462
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
104639823
104649824 last_alloc = ins.offset;
10465
- ret = insert_reserved_file_extent(trans, inode,
10466
- cur_offset, ins.objectid,
10467
- ins.offset, ins.offset,
10468
- ins.offset, 0, 0, 0,
10469
- BTRFS_FILE_EXTENT_PREALLOC);
10470
- if (ret) {
9825
+ trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
9826
+ /*
9827
+ * Now that we inserted the prealloc extent we can finally
9828
+ * decrement the number of reservations in the block group.
9829
+ * If we did it before, we could race with relocation and have
9830
+ * relocation miss the reserved extent, making it fail later.
9831
+ */
9832
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9833
+ if (IS_ERR(trans)) {
9834
+ ret = PTR_ERR(trans);
104719835 btrfs_free_reserved_extent(fs_info, ins.objectid,
104729836 ins.offset, 0);
10473
- btrfs_abort_transaction(trans, ret);
10474
- if (own_trans)
10475
- btrfs_end_transaction(trans);
104769837 break;
104779838 }
104789839
....@@ -10493,7 +9854,6 @@
104939854 em->block_len = ins.offset;
104949855 em->orig_block_len = ins.offset;
104959856 em->ram_bytes = ins.offset;
10496
- em->bdev = fs_info->fs_devices->latest_bdev;
104979857 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
104989858 em->generation = trans->transid;
104999859
....@@ -10524,7 +9884,7 @@
105249884 else
105259885 i_size = cur_offset;
105269886 i_size_write(inode, i_size);
10527
- btrfs_ordered_update_i_size(inode, i_size, NULL);
9887
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
105289888 }
105299889
105309890 ret = btrfs_update_inode(trans, root, inode);
....@@ -10536,11 +9896,13 @@
105369896 break;
105379897 }
105389898
10539
- if (own_trans)
9899
+ if (own_trans) {
105409900 btrfs_end_transaction(trans);
9901
+ trans = NULL;
9902
+ }
105419903 }
105429904 if (clear_offset < end)
10543
- btrfs_free_reserved_data_space(inode, NULL, clear_offset,
9905
+ btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
105449906 end - clear_offset + 1);
105459907 return ret;
105469908 }
....@@ -10600,7 +9962,7 @@
106009962 if (IS_ERR(trans))
106019963 return PTR_ERR(trans);
106029964
10603
- ret = btrfs_find_free_ino(root, &objectid);
9965
+ ret = btrfs_find_free_objectid(root, &objectid);
106049966 if (ret)
106059967 goto out;
106069968
....@@ -10616,7 +9978,6 @@
106169978 inode->i_op = &btrfs_file_inode_operations;
106179979
106189980 inode->i_mapping->a_ops = &btrfs_aops;
10619
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
106209981
106219982 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
106229983 if (ret)
....@@ -10648,26 +10009,6 @@
1064810009 return ret;
1064910010 }
1065010011
10651
-__attribute__((const))
10652
-static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror)
10653
-{
10654
- return -EAGAIN;
10655
-}
10656
-
10657
-static void btrfs_check_extent_io_range(void *private_data, const char *caller,
10658
- u64 start, u64 end)
10659
-{
10660
- struct inode *inode = private_data;
10661
- u64 isize;
10662
-
10663
- isize = i_size_read(inode);
10664
- if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
10665
- btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
10666
- "%s: ino %llu isize %llu odd range [%llu,%llu]",
10667
- caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
10668
- }
10669
-}
10670
-
1067110012 void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1067210013 {
1067310014 struct inode *inode = tree->private_data;
....@@ -10683,6 +10024,403 @@
1068310024 index++;
1068410025 }
1068510026 }
10027
+
10028
+#ifdef CONFIG_SWAP
10029
+/*
10030
+ * Add an entry indicating a block group or device which is pinned by a
10031
+ * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
10032
+ * negative errno on failure.
10033
+ */
10034
+static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
10035
+ bool is_block_group)
10036
+{
10037
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10038
+ struct btrfs_swapfile_pin *sp, *entry;
10039
+ struct rb_node **p;
10040
+ struct rb_node *parent = NULL;
10041
+
10042
+ sp = kmalloc(sizeof(*sp), GFP_NOFS);
10043
+ if (!sp)
10044
+ return -ENOMEM;
10045
+ sp->ptr = ptr;
10046
+ sp->inode = inode;
10047
+ sp->is_block_group = is_block_group;
10048
+ sp->bg_extent_count = 1;
10049
+
10050
+ spin_lock(&fs_info->swapfile_pins_lock);
10051
+ p = &fs_info->swapfile_pins.rb_node;
10052
+ while (*p) {
10053
+ parent = *p;
10054
+ entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
10055
+ if (sp->ptr < entry->ptr ||
10056
+ (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
10057
+ p = &(*p)->rb_left;
10058
+ } else if (sp->ptr > entry->ptr ||
10059
+ (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
10060
+ p = &(*p)->rb_right;
10061
+ } else {
10062
+ if (is_block_group)
10063
+ entry->bg_extent_count++;
10064
+ spin_unlock(&fs_info->swapfile_pins_lock);
10065
+ kfree(sp);
10066
+ return 1;
10067
+ }
10068
+ }
10069
+ rb_link_node(&sp->node, parent, p);
10070
+ rb_insert_color(&sp->node, &fs_info->swapfile_pins);
10071
+ spin_unlock(&fs_info->swapfile_pins_lock);
10072
+ return 0;
10073
+}
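The pin tree is keyed by (ptr, inode), comparing the pinned pointer first and the owning inode second. A hypothetical lookup helper mirroring that ordering (find_swapfile_pin() is illustrative and assumes the caller holds fs_info->swapfile_pins_lock):

static struct btrfs_swapfile_pin *find_swapfile_pin(struct btrfs_fs_info *fs_info,
						    void *ptr, struct inode *inode)
{
	struct rb_node *node = fs_info->swapfile_pins.rb_node;

	while (node) {
		struct btrfs_swapfile_pin *sp;

		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr || (ptr == sp->ptr && inode < sp->inode))
			node = node->rb_left;
		else if (ptr > sp->ptr || (ptr == sp->ptr && inode > sp->inode))
			node = node->rb_right;
		else
			return sp;
	}
	return NULL;
}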
10074
+
10075
+/* Free all of the entries pinned by this swapfile. */
10076
+static void btrfs_free_swapfile_pins(struct inode *inode)
10077
+{
10078
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10079
+ struct btrfs_swapfile_pin *sp;
10080
+ struct rb_node *node, *next;
10081
+
10082
+ spin_lock(&fs_info->swapfile_pins_lock);
10083
+ node = rb_first(&fs_info->swapfile_pins);
10084
+ while (node) {
10085
+ next = rb_next(node);
10086
+ sp = rb_entry(node, struct btrfs_swapfile_pin, node);
10087
+ if (sp->inode == inode) {
10088
+ rb_erase(&sp->node, &fs_info->swapfile_pins);
10089
+ if (sp->is_block_group) {
10090
+ btrfs_dec_block_group_swap_extents(sp->ptr,
10091
+ sp->bg_extent_count);
10092
+ btrfs_put_block_group(sp->ptr);
10093
+ }
10094
+ kfree(sp);
10095
+ }
10096
+ node = next;
10097
+ }
10098
+ spin_unlock(&fs_info->swapfile_pins_lock);
10099
+}
10100
+
10101
+struct btrfs_swap_info {
10102
+ u64 start;
10103
+ u64 block_start;
10104
+ u64 block_len;
10105
+ u64 lowest_ppage;
10106
+ u64 highest_ppage;
10107
+ unsigned long nr_pages;
10108
+ int nr_extents;
10109
+};
10110
+
10111
+static int btrfs_add_swap_extent(struct swap_info_struct *sis,
10112
+ struct btrfs_swap_info *bsi)
10113
+{
10114
+ unsigned long nr_pages;
10115
+ unsigned long max_pages;
10116
+ u64 first_ppage, first_ppage_reported, next_ppage;
10117
+ int ret;
10118
+
10119
+ /*
10120
+ * Our swapfile may have had its size extended after the swap header was
10121
+ * written. In that case activating the swapfile should not go beyond
10122
+ * the max size set in the swap header.
10123
+ */
10124
+ if (bsi->nr_pages >= sis->max)
10125
+ return 0;
10126
+
10127
+ max_pages = sis->max - bsi->nr_pages;
10128
+ first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
10129
+ next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
10130
+ PAGE_SIZE) >> PAGE_SHIFT;
10131
+
10132
+ if (first_ppage >= next_ppage)
10133
+ return 0;
10134
+ nr_pages = next_ppage - first_ppage;
10135
+ nr_pages = min(nr_pages, max_pages);
10136
+
10137
+ first_ppage_reported = first_ppage;
10138
+ if (bsi->start == 0)
10139
+ first_ppage_reported++;
10140
+ if (bsi->lowest_ppage > first_ppage_reported)
10141
+ bsi->lowest_ppage = first_ppage_reported;
10142
+ if (bsi->highest_ppage < (next_ppage - 1))
10143
+ bsi->highest_ppage = next_ppage - 1;
10144
+
10145
+ ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
10146
+ if (ret < 0)
10147
+ return ret;
10148
+ bsi->nr_extents += ret;
10149
+ bsi->nr_pages += nr_pages;
10150
+ return 0;
10151
+}
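A worked example of the rounding above, assuming 4 KiB pages: for a physical run with bsi->block_start == 13312 and bsi->block_len == 20480, first_ppage = ALIGN(13312, 4096) >> PAGE_SHIFT = 4 and next_ppage = ALIGN_DOWN(13312 + 20480, 4096) >> PAGE_SHIFT = 8, so the run contributes nr_pages = 4 swap pages. The unaligned head and tail bytes are simply not used, and the first_ppage >= next_ppage check catches runs too small to cover a whole page.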
10152
+
10153
+static void btrfs_swap_deactivate(struct file *file)
10154
+{
10155
+ struct inode *inode = file_inode(file);
10156
+
10157
+ btrfs_free_swapfile_pins(inode);
10158
+ atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
10159
+}
10160
+
10161
+static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10162
+ sector_t *span)
10163
+{
10164
+ struct inode *inode = file_inode(file);
10165
+ struct btrfs_root *root = BTRFS_I(inode)->root;
10166
+ struct btrfs_fs_info *fs_info = root->fs_info;
10167
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
10168
+ struct extent_state *cached_state = NULL;
10169
+ struct extent_map *em = NULL;
10170
+ struct btrfs_device *device = NULL;
10171
+ struct btrfs_swap_info bsi = {
10172
+ .lowest_ppage = (sector_t)-1ULL,
10173
+ };
10174
+ int ret = 0;
10175
+ u64 isize;
10176
+ u64 start;
10177
+
10178
+ /*
10179
+ * If the swap file was just created, make sure delalloc is done. If the
10180
+ * file changes again after this, the user is doing something stupid and
10181
+ * we don't really care.
10182
+ */
10183
+ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
10184
+ if (ret)
10185
+ return ret;
10186
+
10187
+ /*
10188
+ * The inode is locked, so these flags won't change after we check them.
10189
+ */
10190
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
10191
+ btrfs_warn(fs_info, "swapfile must not be compressed");
10192
+ return -EINVAL;
10193
+ }
10194
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
10195
+ btrfs_warn(fs_info, "swapfile must not be copy-on-write");
10196
+ return -EINVAL;
10197
+ }
10198
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
10199
+ btrfs_warn(fs_info, "swapfile must not be checksummed");
10200
+ return -EINVAL;
10201
+ }
10202
+
10203
+ /*
10204
+ * Balance or device remove/replace/resize can move stuff around from
10205
+ * under us. The exclop protection makes sure they aren't running/won't
10206
+ * run concurrently while we are mapping the swap extents, and
10207
+ * fs_info->swapfile_pins prevents them from running while the swap
10208
+ * file is active and moving the extents. Note that this also prevents
10209
+ * a concurrent device add which isn't actually necessary, but it's not
10210
+ * really worth the trouble to allow it.
10211
+ */
10212
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
10213
+ btrfs_warn(fs_info,
10214
+ "cannot activate swapfile while exclusive operation is running");
10215
+ return -EBUSY;
10216
+ }
10217
+
10218
+ /*
10219
+ * Prevent snapshot creation while we are activating the swap file.
10220
+ * We do not want to race with snapshot creation. If snapshot creation
10221
+ * already started before we bumped nr_swapfiles from 0 to 1 and
10222
+ * completes before the first write into the swap file after it is
10223
+ * activated, then that write would fall back to COW.
10224
+ */
10225
+ if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
10226
+ btrfs_exclop_finish(fs_info);
10227
+ btrfs_warn(fs_info,
10228
+ "cannot activate swapfile because snapshot creation is in progress");
10229
+ return -EINVAL;
10230
+ }
10231
+ /*
10232
+ * Snapshots can create extents which require COW even if NODATACOW is
10233
+ * set. We use this counter to prevent snapshots. We must increment it
10234
+ * before walking the extents because we don't want a concurrent
10235
+ * snapshot to run after we've already checked the extents.
10236
+ *
10237
+ * It is possible that the subvolume is marked for deletion but not
10238
+ * yet removed. To prevent this race, we check the root status before
10239
+ * activating the swapfile.
10240
+ */
10241
+ spin_lock(&root->root_item_lock);
10242
+ if (btrfs_root_dead(root)) {
10243
+ spin_unlock(&root->root_item_lock);
10244
+
10245
+ btrfs_exclop_finish(fs_info);
10246
+ btrfs_warn(fs_info,
10247
+ "cannot activate swapfile because subvolume %llu is being deleted",
10248
+ root->root_key.objectid);
10249
+ return -EPERM;
10250
+ }
10251
+ atomic_inc(&root->nr_swapfiles);
10252
+ spin_unlock(&root->root_item_lock);
10253
+
10254
+ isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10255
+
10256
+ lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
10257
+ start = 0;
10258
+ while (start < isize) {
10259
+ u64 logical_block_start, physical_block_start;
10260
+ struct btrfs_block_group *bg;
10261
+ u64 len = isize - start;
10262
+
10263
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
10264
+ if (IS_ERR(em)) {
10265
+ ret = PTR_ERR(em);
10266
+ goto out;
10267
+ }
10268
+
10269
+ if (em->block_start == EXTENT_MAP_HOLE) {
10270
+ btrfs_warn(fs_info, "swapfile must not have holes");
10271
+ ret = -EINVAL;
10272
+ goto out;
10273
+ }
10274
+ if (em->block_start == EXTENT_MAP_INLINE) {
10275
+ /*
10276
+ * It's unlikely we'll ever actually find ourselves
10277
+ * here, as a file small enough to fit inline won't be
10278
+ * big enough to store more than the swap header, but in
10279
+ * case something changes in the future, let's catch it
10280
+ * here rather than later.
10281
+ */
10282
+ btrfs_warn(fs_info, "swapfile must not be inline");
10283
+ ret = -EINVAL;
10284
+ goto out;
10285
+ }
10286
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10287
+ btrfs_warn(fs_info, "swapfile must not be compressed");
10288
+ ret = -EINVAL;
10289
+ goto out;
10290
+ }
10291
+
10292
+ logical_block_start = em->block_start + (start - em->start);
10293
+ len = min(len, em->len - (start - em->start));
10294
+ free_extent_map(em);
10295
+ em = NULL;
10296
+
10297
+ ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
10298
+ if (ret < 0) {
10299
+ goto out;
10300
+ } else if (ret) {
10301
+ ret = 0;
10302
+ } else {
10303
+ btrfs_warn(fs_info,
10304
+ "swapfile must not be copy-on-write");
10305
+ ret = -EINVAL;
10306
+ goto out;
10307
+ }
10308
+
10309
+ em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10310
+ if (IS_ERR(em)) {
10311
+ ret = PTR_ERR(em);
10312
+ goto out;
10313
+ }
10314
+
10315
+ if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10316
+ btrfs_warn(fs_info,
10317
+ "swapfile must have single data profile");
10318
+ ret = -EINVAL;
10319
+ goto out;
10320
+ }
10321
+
10322
+ if (device == NULL) {
10323
+ device = em->map_lookup->stripes[0].dev;
10324
+ ret = btrfs_add_swapfile_pin(inode, device, false);
10325
+ if (ret == 1)
10326
+ ret = 0;
10327
+ else if (ret)
10328
+ goto out;
10329
+ } else if (device != em->map_lookup->stripes[0].dev) {
10330
+ btrfs_warn(fs_info, "swapfile must be on one device");
10331
+ ret = -EINVAL;
10332
+ goto out;
10333
+ }
10334
+
10335
+ physical_block_start = (em->map_lookup->stripes[0].physical +
10336
+ (logical_block_start - em->start));
10337
+ len = min(len, em->len - (logical_block_start - em->start));
10338
+ free_extent_map(em);
10339
+ em = NULL;
10340
+
10341
+ bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10342
+ if (!bg) {
10343
+ btrfs_warn(fs_info,
10344
+ "could not find block group containing swapfile");
10345
+ ret = -EINVAL;
10346
+ goto out;
10347
+ }
10348
+
10349
+ if (!btrfs_inc_block_group_swap_extents(bg)) {
10350
+ btrfs_warn(fs_info,
10351
+ "block group for swapfile at %llu is read-only%s",
10352
+ bg->start,
10353
+ atomic_read(&fs_info->scrubs_running) ?
10354
+ " (scrub running)" : "");
10355
+ btrfs_put_block_group(bg);
10356
+ ret = -EINVAL;
10357
+ goto out;
10358
+ }
10359
+
10360
+ ret = btrfs_add_swapfile_pin(inode, bg, true);
10361
+ if (ret) {
10362
+ btrfs_put_block_group(bg);
10363
+ if (ret == 1)
10364
+ ret = 0;
10365
+ else
10366
+ goto out;
10367
+ }
10368
+
10369
+ if (bsi.block_len &&
10370
+ bsi.block_start + bsi.block_len == physical_block_start) {
10371
+ bsi.block_len += len;
10372
+ } else {
10373
+ if (bsi.block_len) {
10374
+ ret = btrfs_add_swap_extent(sis, &bsi);
10375
+ if (ret)
10376
+ goto out;
10377
+ }
10378
+ bsi.start = start;
10379
+ bsi.block_start = physical_block_start;
10380
+ bsi.block_len = len;
10381
+ }
10382
+
10383
+ start += len;
10384
+ }
10385
+
10386
+ if (bsi.block_len)
10387
+ ret = btrfs_add_swap_extent(sis, &bsi);
10388
+
10389
+out:
10390
+ if (!IS_ERR_OR_NULL(em))
10391
+ free_extent_map(em);
10392
+
10393
+ unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
10394
+
10395
+ if (ret)
10396
+ btrfs_swap_deactivate(file);
10397
+
10398
+ btrfs_drew_write_unlock(&root->snapshot_lock);
10399
+
10400
+ btrfs_exclop_finish(fs_info);
10401
+
10402
+ if (ret)
10403
+ return ret;
10404
+
10405
+ if (device)
10406
+ sis->bdev = device->bdev;
10407
+ *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10408
+ sis->max = bsi.nr_pages;
10409
+ sis->pages = bsi.nr_pages - 1;
10410
+ sis->highest_bit = bsi.nr_pages - 1;
10411
+ return bsi.nr_extents;
10412
+}
10413
+#else
10414
+static void btrfs_swap_deactivate(struct file *file)
10415
+{
10416
+}
10417
+
10418
+static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10419
+ sector_t *span)
10420
+{
10421
+ return -EOPNOTSUPP;
10422
+}
10423
+#endif
1068610424
1068710425 static const struct inode_operations btrfs_dir_inode_operations = {
1068810426 .getattr = btrfs_getattr,
....@@ -10703,11 +10441,6 @@
1070310441 .update_time = btrfs_update_time,
1070410442 .tmpfile = btrfs_tmpfile,
1070510443 };
10706
-static const struct inode_operations btrfs_dir_ro_inode_operations = {
10707
- .lookup = btrfs_lookup,
10708
- .permission = btrfs_permission,
10709
- .update_time = btrfs_update_time,
10710
-};
1071110444
1071210445 static const struct file_operations btrfs_dir_file_operations = {
1071310446 .llseek = generic_file_llseek,
....@@ -10720,22 +10453,6 @@
1072010453 #endif
1072110454 .release = btrfs_release_file,
1072210455 .fsync = btrfs_sync_file,
10723
-};
10724
-
10725
-static const struct extent_io_ops btrfs_extent_io_ops = {
10726
- /* mandatory callbacks */
10727
- .submit_bio_hook = btrfs_submit_bio_hook,
10728
- .readpage_end_io_hook = btrfs_readpage_end_io_hook,
10729
- .readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
10730
-
10731
- /* optional callbacks */
10732
- .writepage_end_io_hook = btrfs_writepage_end_io_hook,
10733
- .writepage_start_hook = btrfs_writepage_start_hook,
10734
- .set_bit_hook = btrfs_set_bit_hook,
10735
- .clear_bit_hook = btrfs_clear_bit_hook,
10736
- .merge_extent_hook = btrfs_merge_extent_hook,
10737
- .split_extent_hook = btrfs_split_extent_hook,
10738
- .check_extent_io_range = btrfs_check_extent_io_range,
1073910456 };
1074010457
1074110458 /*
....@@ -10754,19 +10471,17 @@
1075410471 .readpage = btrfs_readpage,
1075510472 .writepage = btrfs_writepage,
1075610473 .writepages = btrfs_writepages,
10757
- .readpages = btrfs_readpages,
10758
- .direct_IO = btrfs_direct_IO,
10474
+ .readahead = btrfs_readahead,
10475
+ .direct_IO = noop_direct_IO,
1075910476 .invalidatepage = btrfs_invalidatepage,
1076010477 .releasepage = btrfs_releasepage,
10478
+#ifdef CONFIG_MIGRATION
10479
+ .migratepage = btrfs_migratepage,
10480
+#endif
1076110481 .set_page_dirty = btrfs_set_page_dirty,
1076210482 .error_remove_page = generic_error_remove_page,
10763
-};
10764
-
10765
-static const struct address_space_operations btrfs_symlink_aops = {
10766
- .readpage = btrfs_readpage,
10767
- .writepage = btrfs_writepage,
10768
- .invalidatepage = btrfs_invalidatepage,
10769
- .releasepage = btrfs_releasepage,
10483
+ .swap_activate = btrfs_swap_activate,
10484
+ .swap_deactivate = btrfs_swap_deactivate,
1077010485 };
1077110486
1077210487 static const struct inode_operations btrfs_file_inode_operations = {