~hc/RK356X_SDK_RELEASE.git

..	..	@@ -28,6 +28,7 @@
28	28	#include <linux/iversion.h>
29	29	#include "ctree.h"
30	30	#include "disk-io.h"
	31	+#include "export.h"
31	32	#include "transaction.h"
32	33	#include "btrfs_inode.h"
33	34	#include "print-tree.h"
..	..	@@ -43,6 +44,9 @@
43	44	#include "qgroup.h"
44	45	#include "tree-log.h"
45	46	#include "compression.h"
	47	+#include "space-info.h"
	48	+#include "delalloc-space.h"
	49	+#include "block-group.h"
46	50
47	51	#ifdef CONFIG_64BIT
48	52	/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
..	..	@@ -82,10 +86,6 @@
82	86	#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
83	87	struct btrfs_ioctl_send_args_32)
84	88	#endif
85		-
86		-static int btrfs_clone(struct inode *src, struct inode *inode,
87		- u64 off, u64 olen, u64 olen_aligned, u64 destoff,
88		- int no_time_update);
89	89
90	90	/* Mask out flags that are inappropriate for the given type of inode. */
91	91	static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
..	..	@@ -164,8 +164,11 @@
164	164	return 0;
165	165	}
166	166
167		-/* Check if @flags are a supported and valid set of FS_*_FL flags */
168		-static int check_fsflags(unsigned int flags)
	167	+/*
	168	+ * Check if @flags are a supported and valid set of FS_*_FL flags and that
	169	+ * the old and new flags are not conflicting
	170	+ */
	171	+static int check_fsflags(unsigned int old_flags, unsigned int flags)
169	172	{
170	173	if (flags & ~(FS_IMMUTABLE_FL \| FS_APPEND_FL \| \
171	174	FS_NOATIME_FL \| FS_NODUMP_FL \| \
..	..	@@ -174,7 +177,17 @@
174	177	FS_NOCOW_FL))
175	178	return -EOPNOTSUPP;
176	179
	180	+ /* COMPR and NOCOMP on new/old are valid */
177	181	if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
	182	+ return -EINVAL;
	183	+
	184	+ if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL))
	185	+ return -EINVAL;
	186	+
	187	+ /* NOCOW and compression options are mutually exclusive */
	188	+ if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL \| FS_NOCOMP_FL)))
	189	+ return -EINVAL;
	190	+ if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL \| FS_NOCOMP_FL)))
178	191	return -EINVAL;
179	192
180	193	return 0;
..	..	@@ -189,9 +202,8 @@
189	202	struct btrfs_trans_handle *trans;
190	203	unsigned int fsflags, old_fsflags;
191	204	int ret;
192		- u64 old_flags;
193		- unsigned int old_i_flags;
194		- umode_t mode;
	205	+ const char *comp = NULL;
	206	+ u32 binode_flags;
195	207
196	208	if (!inode_owner_or_capable(inode))
197	209	return -EPERM;
..	..	@@ -202,76 +214,70 @@
202	214	if (copy_from_user(&fsflags, arg, sizeof(fsflags)))
203	215	return -EFAULT;
204	216
205		- ret = check_fsflags(fsflags);
206		- if (ret)
207		- return ret;
208		-
209	217	ret = mnt_want_write_file(file);
210	218	if (ret)
211	219	return ret;
212	220
213	221	inode_lock(inode);
214		-
215		- old_flags = binode->flags;
216		- old_i_flags = inode->i_flags;
217		- mode = inode->i_mode;
218		-
219	222	fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
220	223	old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
221		- if ((fsflags ^ old_fsflags) & (FS_APPEND_FL \| FS_IMMUTABLE_FL)) {
222		- if (!capable(CAP_LINUX_IMMUTABLE)) {
223		- ret = -EPERM;
224		- goto out_unlock;
225		- }
226		- }
227	224
	225	+ ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
	226	+ if (ret)
	227	+ goto out_unlock;
	228	+
	229	+ ret = check_fsflags(old_fsflags, fsflags);
	230	+ if (ret)
	231	+ goto out_unlock;
	232	+
	233	+ binode_flags = binode->flags;
228	234	if (fsflags & FS_SYNC_FL)
229		- binode->flags \|= BTRFS_INODE_SYNC;
	235	+ binode_flags \|= BTRFS_INODE_SYNC;
230	236	else
231		- binode->flags &= ~BTRFS_INODE_SYNC;
	237	+ binode_flags &= ~BTRFS_INODE_SYNC;
232	238	if (fsflags & FS_IMMUTABLE_FL)
233		- binode->flags \|= BTRFS_INODE_IMMUTABLE;
	239	+ binode_flags \|= BTRFS_INODE_IMMUTABLE;
234	240	else
235		- binode->flags &= ~BTRFS_INODE_IMMUTABLE;
	241	+ binode_flags &= ~BTRFS_INODE_IMMUTABLE;
236	242	if (fsflags & FS_APPEND_FL)
237		- binode->flags \|= BTRFS_INODE_APPEND;
	243	+ binode_flags \|= BTRFS_INODE_APPEND;
238	244	else
239		- binode->flags &= ~BTRFS_INODE_APPEND;
	245	+ binode_flags &= ~BTRFS_INODE_APPEND;
240	246	if (fsflags & FS_NODUMP_FL)
241		- binode->flags \|= BTRFS_INODE_NODUMP;
	247	+ binode_flags \|= BTRFS_INODE_NODUMP;
242	248	else
243		- binode->flags &= ~BTRFS_INODE_NODUMP;
	249	+ binode_flags &= ~BTRFS_INODE_NODUMP;
244	250	if (fsflags & FS_NOATIME_FL)
245		- binode->flags \|= BTRFS_INODE_NOATIME;
	251	+ binode_flags \|= BTRFS_INODE_NOATIME;
246	252	else
247		- binode->flags &= ~BTRFS_INODE_NOATIME;
	253	+ binode_flags &= ~BTRFS_INODE_NOATIME;
248	254	if (fsflags & FS_DIRSYNC_FL)
249		- binode->flags \|= BTRFS_INODE_DIRSYNC;
	255	+ binode_flags \|= BTRFS_INODE_DIRSYNC;
250	256	else
251		- binode->flags &= ~BTRFS_INODE_DIRSYNC;
	257	+ binode_flags &= ~BTRFS_INODE_DIRSYNC;
252	258	if (fsflags & FS_NOCOW_FL) {
253		- if (S_ISREG(mode)) {
	259	+ if (S_ISREG(inode->i_mode)) {
254	260	/*
255	261	* It's safe to turn csums off here, no extents exist.
256	262	* Otherwise we want the flag to reflect the real COW
257	263	* status of the file and will not set it.
258	264	*/
259	265	if (inode->i_size == 0)
260		- binode->flags \|= BTRFS_INODE_NODATACOW
261		- \| BTRFS_INODE_NODATASUM;
	266	+ binode_flags \|= BTRFS_INODE_NODATACOW \|
	267	+ BTRFS_INODE_NODATASUM;
262	268	} else {
263		- binode->flags \|= BTRFS_INODE_NODATACOW;
	269	+ binode_flags \|= BTRFS_INODE_NODATACOW;
264	270	}
265	271	} else {
266	272	/*
267	273	* Revert back under same assumptions as above
268	274	*/
269		- if (S_ISREG(mode)) {
	275	+ if (S_ISREG(inode->i_mode)) {
270	276	if (inode->i_size == 0)
271		- binode->flags &= ~(BTRFS_INODE_NODATACOW
272		- \| BTRFS_INODE_NODATASUM);
	277	+ binode_flags &= ~(BTRFS_INODE_NODATACOW \|
	278	+ BTRFS_INODE_NODATASUM);
273	279	} else {
274		- binode->flags &= ~BTRFS_INODE_NODATACOW;
	280	+ binode_flags &= ~BTRFS_INODE_NODATACOW;
275	281	}
276	282	}
277	283
..	..	@@ -281,52 +287,59 @@
281	287	* things smaller.
282	288	*/
283	289	if (fsflags & FS_NOCOMP_FL) {
284		- binode->flags &= ~BTRFS_INODE_COMPRESS;
285		- binode->flags \|= BTRFS_INODE_NOCOMPRESS;
286		-
287		- ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
288		- if (ret && ret != -ENODATA)
289		- goto out_drop;
	290	+ binode_flags &= ~BTRFS_INODE_COMPRESS;
	291	+ binode_flags \|= BTRFS_INODE_NOCOMPRESS;
290	292	} else if (fsflags & FS_COMPR_FL) {
291		- const char *comp;
292	293
293		- binode->flags \|= BTRFS_INODE_COMPRESS;
294		- binode->flags &= ~BTRFS_INODE_NOCOMPRESS;
	294	+ if (IS_SWAPFILE(inode)) {
	295	+ ret = -ETXTBSY;
	296	+ goto out_unlock;
	297	+ }
	298	+
	299	+ binode_flags \|= BTRFS_INODE_COMPRESS;
	300	+ binode_flags &= ~BTRFS_INODE_NOCOMPRESS;
295	301
296	302	comp = btrfs_compress_type2str(fs_info->compress_type);
297	303	if (!comp \|\| comp[0] == 0)
298	304	comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
299		-
300		- ret = btrfs_set_prop(inode, "btrfs.compression",
301		- comp, strlen(comp), 0);
302		- if (ret)
303		- goto out_drop;
304		-
305	305	} else {
306		- ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
307		- if (ret && ret != -ENODATA)
308		- goto out_drop;
309		- binode->flags &= ~(BTRFS_INODE_COMPRESS \| BTRFS_INODE_NOCOMPRESS);
	306	+ binode_flags &= ~(BTRFS_INODE_COMPRESS \| BTRFS_INODE_NOCOMPRESS);
310	307	}
311	308
312		- trans = btrfs_start_transaction(root, 1);
	309	+ /*
	310	+ * 1 for inode item
	311	+ * 2 for properties
	312	+ */
	313	+ trans = btrfs_start_transaction(root, 3);
313	314	if (IS_ERR(trans)) {
314	315	ret = PTR_ERR(trans);
315		- goto out_drop;
	316	+ goto out_unlock;
316	317	}
317	318
	319	+ if (comp) {
	320	+ ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp,
	321	+ strlen(comp), 0);
	322	+ if (ret) {
	323	+ btrfs_abort_transaction(trans, ret);
	324	+ goto out_end_trans;
	325	+ }
	326	+ } else {
	327	+ ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL,
	328	+ 0, 0);
	329	+ if (ret && ret != -ENODATA) {
	330	+ btrfs_abort_transaction(trans, ret);
	331	+ goto out_end_trans;
	332	+ }
	333	+ }
	334	+
	335	+ binode->flags = binode_flags;
318	336	btrfs_sync_inode_flags_to_i_flags(inode);
319	337	inode_inc_iversion(inode);
320	338	inode->i_ctime = current_time(inode);
321	339	ret = btrfs_update_inode(trans, root, inode);
322	340
	341	+ out_end_trans:
323	342	btrfs_end_transaction(trans);
324		- out_drop:
325		- if (ret) {
326		- binode->flags = old_flags;
327		- inode->i_flags = old_i_flags;
328		- }
329		-
330	343	out_unlock:
331	344	inode_unlock(inode);
332	345	mnt_drop_write_file(file);
..	..	@@ -365,6 +378,18 @@
365	378	return 0;
366	379	}
367	380
	381	+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
	382	+ enum btrfs_exclusive_operation type)
	383	+{
	384	+ return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type);
	385	+}
	386	+
	387	+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
	388	+{
	389	+ WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
	390	+ sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
	391	+}
	392	+
368	393	/*
369	394	* Set the xflags from the internal inode flags. The remaining items of fsxattr
370	395	* are zeroed.
..	..	@@ -374,9 +399,7 @@
374	399	struct btrfs_inode *binode = BTRFS_I(file_inode(file));
375	400	struct fsxattr fa;
376	401
377		- memset(&fa, 0, sizeof(fa));
378		- fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);
379		-
	402	+ simple_fill_fsxattr(&fa, btrfs_inode_flags_to_xflags(binode->flags));
380	403	if (copy_to_user(arg, &fa, sizeof(fa)))
381	404	return -EFAULT;
382	405
..	..	@@ -389,7 +412,7 @@
389	412	struct btrfs_inode *binode = BTRFS_I(inode);
390	413	struct btrfs_root *root = binode->root;
391	414	struct btrfs_trans_handle *trans;
392		- struct fsxattr fa;
	415	+ struct fsxattr fa, old_fa;
393	416	unsigned old_flags;
394	417	unsigned old_i_flags;
395	418	int ret = 0;
..	..	@@ -400,7 +423,6 @@
400	423	if (btrfs_root_readonly(root))
401	424	return -EROFS;
402	425
403		- memset(&fa, 0, sizeof(fa));
404	426	if (copy_from_user(&fa, arg, sizeof(fa)))
405	427	return -EFAULT;
406	428
..	..	@@ -420,13 +442,11 @@
420	442	old_flags = binode->flags;
421	443	old_i_flags = inode->i_flags;
422	444
423		- /* We need the capabilities to change append-only or immutable inode */
424		- if (((old_flags & (BTRFS_INODE_APPEND \| BTRFS_INODE_IMMUTABLE)) \|\|
425		- (fa.fsx_xflags & (FS_XFLAG_APPEND \| FS_XFLAG_IMMUTABLE))) &&
426		- !capable(CAP_LINUX_IMMUTABLE)) {
427		- ret = -EPERM;
	445	+ simple_fill_fsxattr(&old_fa,
	446	+ btrfs_inode_flags_to_xflags(binode->flags));
	447	+ ret = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
	448	+ if (ret)
428	449	goto out_unlock;
429		- }
430	450
431	451	if (fa.fsx_xflags & FS_XFLAG_SYNC)
432	452	binode->flags \|= BTRFS_INODE_SYNC;
..	..	@@ -482,10 +502,9 @@
482	502	return put_user(inode->i_generation, arg);
483	503	}
484	504
485		-static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
	505	+static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
	506	+ void __user *arg)
486	507	{
487		- struct inode *inode = file_inode(file);
488		- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
489	508	struct btrfs_device *device;
490	509	struct request_queue *q;
491	510	struct fstrim_range range;
..	..	@@ -544,7 +563,7 @@
544	563	return 0;
545	564	}
546	565
547		-int btrfs_is_empty_uuid(u8 *uuid)
	566	+int __pure btrfs_is_empty_uuid(u8 *uuid)
548	567	{
549	568	int i;
550	569
..	..	@@ -558,7 +577,6 @@
558	577	static noinline int create_subvol(struct inode *dir,
559	578	struct dentry *dentry,
560	579	const char *name, int namelen,
561		- u64 *async_transid,
562	580	struct btrfs_qgroup_inherit *inherit)
563	581	{
564	582	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
..	..	@@ -574,10 +592,10 @@
574	592	struct inode *inode;
575	593	int ret;
576	594	int err;
	595	+ dev_t anon_dev = 0;
577	596	u64 objectid;
578	597	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
579	598	u64 index = 0;
580		- uuid_le new_uuid;
581	599
582	600	root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
583	601	if (!root_item)
..	..	@@ -585,6 +603,10 @@
585	603
586	604	ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid);
587	605	if (ret)
	606	+ goto fail_free;
	607	+
	608	+ ret = get_anon_bdev(&anon_dev);
	609	+ if (ret < 0)
588	610	goto fail_free;
589	611
590	612	/*
..	..	@@ -608,7 +630,7 @@
608	630	trans = btrfs_start_transaction(root, 0);
609	631	if (IS_ERR(trans)) {
610	632	ret = PTR_ERR(trans);
611		- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
	633	+ btrfs_subvolume_release_metadata(root, &block_rsv);
612	634	goto fail_free;
613	635	}
614	636	trans->block_rsv = &block_rsv;
..	..	@@ -618,7 +640,8 @@
618	640	if (ret)
619	641	goto fail;
620	642
621		- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
	643	+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
	644	+ BTRFS_NESTING_NORMAL);
622	645	if (IS_ERR(leaf)) {
623	646	ret = PTR_ERR(leaf);
624	647	goto fail;
..	..	@@ -647,8 +670,7 @@
647	670
648	671	btrfs_set_root_generation_v2(root_item,
649	672	btrfs_root_generation(root_item));
650		- uuid_le_gen(&new_uuid);
651		- memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
	673	+ generate_random_guid(root_item->uuid);
652	674	btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
653	675	btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
654	676	root_item->ctime = root_item->otime;
..	..	@@ -682,16 +704,20 @@
682	704	leaf = NULL;
683	705
684	706	key.offset = (u64)-1;
685		- new_root = btrfs_read_fs_root_no_name(fs_info, &key);
	707	+ new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
686	708	if (IS_ERR(new_root)) {
	709	+ free_anon_bdev(anon_dev);
687	710	ret = PTR_ERR(new_root);
688	711	btrfs_abort_transaction(trans, ret);
689	712	goto fail;
690	713	}
	714	+ /* Freeing will be done in btrfs_put_root() of new_root */
	715	+ anon_dev = 0;
691	716
692	717	btrfs_record_root_in_trans(trans, new_root);
693	718
694	719	ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
	720	+ btrfs_put_root(new_root);
695	721	if (ret) {
696	722	/* We potentially lose an unused inode item here */
697	723	btrfs_abort_transaction(trans, ret);
..	..	@@ -711,8 +737,7 @@
711	737	goto fail;
712	738	}
713	739
714		- ret = btrfs_insert_dir_item(trans, root,
715		- name, namelen, BTRFS_I(dir), &key,
	740	+ ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
716	741	BTRFS_FT_DIR, index);
717	742	if (ret) {
718	743	btrfs_abort_transaction(trans, ret);
..	..	@@ -742,16 +767,9 @@
742	767	kfree(root_item);
743	768	trans->block_rsv = NULL;
744	769	trans->bytes_reserved = 0;
745		- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
	770	+ btrfs_subvolume_release_metadata(root, &block_rsv);
746	771
747		- if (async_transid) {
748		- *async_transid = trans->transid;
749		- err = btrfs_commit_transaction_async(trans, 1);
750		- if (err)
751		- err = btrfs_commit_transaction(trans);
752		- } else {
753		- err = btrfs_commit_transaction(trans);
754		- }
	772	+ err = btrfs_commit_transaction(trans);
755	773	if (err && !ret)
756	774	ret = err;
757	775
..	..	@@ -764,13 +782,14 @@
764	782	return ret;
765	783
766	784	fail_free:
	785	+ if (anon_dev)
	786	+ free_anon_bdev(anon_dev);
767	787	kfree(root_item);
768	788	return ret;
769	789	}
770	790
771	791	static int create_snapshot(struct btrfs_root *root, struct inode *dir,
772		- struct dentry *dentry,
773		- u64 *async_transid, bool readonly,
	792	+ struct dentry *dentry, bool readonly,
774	793	struct btrfs_qgroup_inherit *inherit)
775	794	{
776	795	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
..	..	@@ -778,15 +797,23 @@
778	797	struct btrfs_pending_snapshot *pending_snapshot;
779	798	struct btrfs_trans_handle *trans;
780	799	int ret;
781		- bool snapshot_force_cow = false;
782	800
783		- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
	801	+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
784	802	return -EINVAL;
	803	+
	804	+ if (atomic_read(&root->nr_swapfiles)) {
	805	+ btrfs_warn(fs_info,
	806	+ "cannot snapshot subvolume with active swapfile");
	807	+ return -ETXTBSY;
	808	+ }
785	809
786	810	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);
787	811	if (!pending_snapshot)
788	812	return -ENOMEM;
789	813
	814	+ ret = get_anon_bdev(&pending_snapshot->anon_dev);
	815	+ if (ret < 0)
	816	+ goto free_pending;
790	817	pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
791	818	GFP_KERNEL);
792	819	pending_snapshot->path = btrfs_alloc_path();
..	..	@@ -794,31 +821,6 @@
794	821	ret = -ENOMEM;
795	822	goto free_pending;
796	823	}
797		-
798		- /*
799		- * Force new buffered writes to reserve space even when NOCOW is
800		- * possible. This is to avoid later writeback (running dealloc) to
801		- * fallback to COW mode and unexpectedly fail with ENOSPC.
802		- */
803		- atomic_inc(&root->will_be_snapshotted);
804		- smp_mb__after_atomic();
805		- /* wait for no snapshot writes */
806		- wait_event(root->subv_writers->wait,
807		- percpu_counter_sum(&root->subv_writers->counter) == 0);
808		-
809		- ret = btrfs_start_delalloc_snapshot(root);
810		- if (ret)
811		- goto dec_and_free;
812		-
813		- /*
814		- * All previous writes have started writeback in NOCOW mode, so now
815		- * we force future writes to fallback to COW mode during snapshot
816		- * creation.
817		- */
818		- atomic_inc(&root->snapshot_force_cow);
819		- snapshot_force_cow = true;
820		-
821		- btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
822	824
823	825	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
824	826	BTRFS_BLOCK_RSV_TEMP);
..	..	@@ -834,7 +836,7 @@
834	836	&pending_snapshot->block_rsv, 8,
835	837	false);
836	838	if (ret)
837		- goto dec_and_free;
	839	+ goto free_pending;
838	840
839	841	pending_snapshot->dentry = dentry;
840	842	pending_snapshot->root = root;
..	..	@@ -852,14 +854,8 @@
852	854	list_add(&pending_snapshot->list,
853	855	&trans->transaction->pending_snapshots);
854	856	spin_unlock(&fs_info->trans_lock);
855		- if (async_transid) {
856		- *async_transid = trans->transid;
857		- ret = btrfs_commit_transaction_async(trans, 1);
858		- if (ret)
859		- ret = btrfs_commit_transaction(trans);
860		- } else {
861		- ret = btrfs_commit_transaction(trans);
862		- }
	857	+
	858	+ ret = btrfs_commit_transaction(trans);
863	859	if (ret)
864	860	goto fail;
865	861
..	..	@@ -879,14 +875,16 @@
879	875
880	876	d_instantiate(dentry, inode);
881	877	ret = 0;
	878	+ pending_snapshot->anon_dev = 0;
882	879	fail:
883		- btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
884		-dec_and_free:
885		- if (snapshot_force_cow)
886		- atomic_dec(&root->snapshot_force_cow);
887		- if (atomic_dec_and_test(&root->will_be_snapshotted))
888		- wake_up_var(&root->will_be_snapshotted);
	880	+ /* Prevent double freeing of anon_dev */
	881	+ if (ret && pending_snapshot->snap)
	882	+ pending_snapshot->snap->anon_dev = 0;
	883	+ btrfs_put_root(pending_snapshot->snap);
	884	+ btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
889	885	free_pending:
	886	+ if (pending_snapshot->anon_dev)
	887	+ free_anon_bdev(pending_snapshot->anon_dev);
890	888	kfree(pending_snapshot->root_item);
891	889	btrfs_free_path(pending_snapshot->path);
892	890	kfree(pending_snapshot);
..	..	@@ -964,7 +962,7 @@
964	962	static noinline int btrfs_mksubvol(const struct path *parent,
965	963	const char *name, int namelen,
966	964	struct btrfs_root *snap_src,
967		- u64 *async_transid, bool readonly,
	965	+ bool readonly,
968	966	struct btrfs_qgroup_inherit *inherit)
969	967	{
970	968	struct inode *dir = d_inode(parent->dentry);
..	..	@@ -1000,13 +998,11 @@
1000	998	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
1001	999	goto out_up_read;
1002	1000
1003		- if (snap_src) {
1004		- error = create_snapshot(snap_src, dir, dentry,
1005		- async_transid, readonly, inherit);
1006		- } else {
1007		- error = create_subvol(dir, dentry, name, namelen,
1008		- async_transid, inherit);
1009		- }
	1001	+ if (snap_src)
	1002	+ error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
	1003	+ else
	1004	+ error = create_subvol(dir, dentry, name, namelen, inherit);
	1005	+
1010	1006	if (!error)
1011	1007	fsnotify_mkdir(dir, dentry);
1012	1008	out_up_read:
..	..	@@ -1016,6 +1012,45 @@
1016	1012	out_unlock:
1017	1013	inode_unlock(dir);
1018	1014	return error;
	1015	+}
	1016	+
	1017	+static noinline int btrfs_mksnapshot(const struct path *parent,
	1018	+ const char *name, int namelen,
	1019	+ struct btrfs_root *root,
	1020	+ bool readonly,
	1021	+ struct btrfs_qgroup_inherit *inherit)
	1022	+{
	1023	+ int ret;
	1024	+ bool snapshot_force_cow = false;
	1025	+
	1026	+ /*
	1027	+ * Force new buffered writes to reserve space even when NOCOW is
	1028	+ * possible. This is to avoid later writeback (running dealloc) to
	1029	+ * fallback to COW mode and unexpectedly fail with ENOSPC.
	1030	+ */
	1031	+ btrfs_drew_read_lock(&root->snapshot_lock);
	1032	+
	1033	+ ret = btrfs_start_delalloc_snapshot(root);
	1034	+ if (ret)
	1035	+ goto out;
	1036	+
	1037	+ /*
	1038	+ * All previous writes have started writeback in NOCOW mode, so now
	1039	+ * we force future writes to fallback to COW mode during snapshot
	1040	+ * creation.
	1041	+ */
	1042	+ atomic_inc(&root->snapshot_force_cow);
	1043	+ snapshot_force_cow = true;
	1044	+
	1045	+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
	1046	+
	1047	+ ret = btrfs_mksubvol(parent, name, namelen,
	1048	+ root, readonly, inherit);
	1049	+out:
	1050	+ if (snapshot_force_cow)
	1051	+ atomic_dec(&root->snapshot_force_cow);
	1052	+ btrfs_drew_read_unlock(&root->snapshot_lock);
	1053	+ return ret;
1019	1054	}
1020	1055
1021	1056	/*
..	..	@@ -1139,7 +1174,7 @@
1139	1174
1140	1175	/* get the big lock and read metadata off disk */
1141	1176	lock_extent_bits(io_tree, start, end, &cached);
1142		- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
	1177	+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
1143	1178	unlock_extent_cached(io_tree, start, end, &cached);
1144	1179
1145	1180	if (IS_ERR(em))
..	..	@@ -1252,6 +1287,7 @@
1252	1287	u64 page_end;
1253	1288	u64 page_cnt;
1254	1289	u64 start = (u64)start_index << PAGE_SHIFT;
	1290	+ u64 search_start;
1255	1291	int ret;
1256	1292	int i;
1257	1293	int i_done;
..	..	@@ -1267,7 +1303,7 @@
1267	1303
1268	1304	page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
1269	1305
1270		- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
	1306	+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
1271	1307	start, page_cnt << PAGE_SHIFT);
1272	1308	if (ret)
1273	1309	return ret;
..	..	@@ -1288,7 +1324,7 @@
1288	1324	while (1) {
1289	1325	lock_extent_bits(tree, page_start, page_end,
1290	1326	&cached_state);
1291		- ordered = btrfs_lookup_ordered_extent(inode,
	1327	+ ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode),
1292	1328	page_start);
1293	1329	unlock_extent_cached(tree, page_start, page_end,
1294	1330	&cached_state);
..	..	@@ -1296,7 +1332,7 @@
1296	1332	break;
1297	1333
1298	1334	unlock_page(page);
1299		- btrfs_start_ordered_extent(inode, ordered, 1);
	1335	+ btrfs_start_ordered_extent(ordered, 1);
1300	1336	btrfs_put_ordered_extent(ordered);
1301	1337	lock_page(page);
1302	1338	/*
..	..	@@ -1348,16 +1384,49 @@
1348	1384
1349	1385	lock_extent_bits(&BTRFS_I(inode)->io_tree,
1350	1386	page_start, page_end - 1, &cached_state);
	1387	+
	1388	+ /*
	1389	+ * When defragmenting we skip ranges that have holes or inline extents,
	1390	+ * (check should_defrag_range()), to avoid unnecessary IO and wasting
	1391	+ * space. At btrfs_defrag_file(), we check if a range should be defragged
	1392	+ * before locking the inode and then, if it should, we trigger a sync
	1393	+ * page cache readahead - we lock the inode only after that to avoid
	1394	+ * blocking for too long other tasks that possibly want to operate on
	1395	+ * other file ranges. But before we were able to get the inode lock,
	1396	+ * some other task may have punched a hole in the range, or we may have
	1397	+ * now an inline extent, in which case we should not defrag. So check
	1398	+ * for that here, where we have the inode and the range locked, and bail
	1399	+ * out if that happened.
	1400	+ */
	1401	+ search_start = page_start;
	1402	+ while (search_start < page_end) {
	1403	+ struct extent_map *em;
	1404	+
	1405	+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start,
	1406	+ page_end - search_start);
	1407	+ if (IS_ERR(em)) {
	1408	+ ret = PTR_ERR(em);
	1409	+ goto out_unlock_range;
	1410	+ }
	1411	+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
	1412	+ free_extent_map(em);
	1413	+ /* Ok, 0 means we did not defrag anything */
	1414	+ ret = 0;
	1415	+ goto out_unlock_range;
	1416	+ }
	1417	+ search_start = extent_map_end(em);
	1418	+ free_extent_map(em);
	1419	+ }
	1420	+
1351	1421	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
1352		- page_end - 1, EXTENT_DIRTY \| EXTENT_DELALLOC \|
1353		- EXTENT_DO_ACCOUNTING \| EXTENT_DEFRAG, 0, 0,
1354		- &cached_state);
	1422	+ page_end - 1, EXTENT_DELALLOC \| EXTENT_DO_ACCOUNTING \|
	1423	+ EXTENT_DEFRAG, 0, 0, &cached_state);
1355	1424
1356	1425	if (i_done != page_cnt) {
1357	1426	spin_lock(&BTRFS_I(inode)->lock);
1358	1427	btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
1359	1428	spin_unlock(&BTRFS_I(inode)->lock);
1360		- btrfs_delalloc_release_space(inode, data_reserved,
	1429	+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
1361	1430	start, (page_cnt - i_done) << PAGE_SHIFT, true);
1362	1431	}
1363	1432
..	..	@@ -1379,12 +1448,16 @@
1379	1448	btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
1380	1449	extent_changeset_free(data_reserved);
1381	1450	return i_done;
	1451	+
	1452	+out_unlock_range:
	1453	+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
	1454	+ page_start, page_end - 1, &cached_state);
1382	1455	out:
1383	1456	for (i = 0; i < i_done; i++) {
1384	1457	unlock_page(pages[i]);
1385	1458	put_page(pages[i]);
1386	1459	}
1387		- btrfs_delalloc_release_space(inode, data_reserved,
	1460	+ btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
1388	1461	start, page_cnt << PAGE_SHIFT, true);
1389	1462	btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
1390	1463	extent_changeset_free(data_reserved);
..	..	@@ -1424,7 +1497,7 @@
1424	1497	return -EINVAL;
1425	1498
1426	1499	if (do_compress) {
1427		- if (range->compress_type > BTRFS_COMPRESS_TYPES)
	1500	+ if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
1428	1501	return -EINVAL;
1429	1502	if (range->compress_type)
1430	1503	compress_type = range->compress_type;
..	..	@@ -1530,9 +1603,13 @@
1530	1603	}
1531	1604
1532	1605	inode_lock(inode);
1533		- if (do_compress)
1534		- BTRFS_I(inode)->defrag_compress = compress_type;
1535		- ret = cluster_pages_for_defrag(inode, pages, i, cluster);
	1606	+ if (IS_SWAPFILE(inode)) {
	1607	+ ret = -ETXTBSY;
	1608	+ } else {
	1609	+ if (do_compress)
	1610	+ BTRFS_I(inode)->defrag_compress = compress_type;
	1611	+ ret = cluster_pages_for_defrag(inode, pages, i, cluster);
	1612	+ }
1536	1613	if (ret < 0) {
1537	1614	inode_unlock(inode);
1538	1615	goto out_ra;
..	..	@@ -1623,7 +1700,7 @@
1623	1700	if (ret)
1624	1701	return ret;
1625	1702
1626		- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
	1703	+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) {
1627	1704	mnt_drop_write_file(file);
1628	1705	return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
1629	1706	}
..	..	@@ -1717,9 +1794,6 @@
1717	1794
1718	1795	new_size = round_down(new_size, fs_info->sectorsize);
1719	1796
1720		- btrfs_info_in_rcu(fs_info, "new size for %s is %llu",
1721		- rcu_str_deref(device->name), new_size);
1722		-
1723	1797	if (new_size > old_size) {
1724	1798	trans = btrfs_start_transaction(root, 0);
1725	1799	if (IS_ERR(trans)) {
..	..	@@ -1732,17 +1806,22 @@
1732	1806	ret = btrfs_shrink_device(device, new_size);
1733	1807	} /* equal, nothing need to do */
1734	1808
	1809	+ if (ret == 0 && new_size != old_size)
	1810	+ btrfs_info_in_rcu(fs_info,
	1811	+ "resize device %s (devid %llu) from %llu to %llu",
	1812	+ rcu_str_deref(device->name), device->devid,
	1813	+ old_size, new_size);
1735	1814	out_free:
1736	1815	kfree(vol_args);
1737	1816	out:
1738		- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
	1817	+ btrfs_exclop_finish(fs_info);
1739	1818	mnt_drop_write_file(file);
1740	1819	return ret;
1741	1820	}
1742	1821
1743		-static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
	1822	+static noinline int __btrfs_ioctl_snap_create(struct file *file,
1744	1823	const char *name, unsigned long fd, int subvol,
1745		- u64 *transid, bool readonly,
	1824	+ bool readonly,
1746	1825	struct btrfs_qgroup_inherit *inherit)
1747	1826	{
1748	1827	int namelen;
..	..	@@ -1769,7 +1848,7 @@
1769	1848
1770	1849	if (subvol) {
1771	1850	ret = btrfs_mksubvol(&file->f_path, name, namelen,
1772		- NULL, transid, readonly, inherit);
	1851	+ NULL, readonly, inherit);
1773	1852	} else {
1774	1853	struct fd src = fdget(fd);
1775	1854	struct inode *src_inode;
..	..	@@ -1790,9 +1869,9 @@
1790	1869	*/
1791	1870	ret = -EPERM;
1792	1871	} else {
1793		- ret = btrfs_mksubvol(&file->f_path, name, namelen,
	1872	+ ret = btrfs_mksnapshot(&file->f_path, name, namelen,
1794	1873	BTRFS_I(src_inode)->root,
1795		- transid, readonly, inherit);
	1874	+ readonly, inherit);
1796	1875	}
1797	1876	fdput(src);
1798	1877	}
..	..	@@ -1816,9 +1895,8 @@
1816	1895	return PTR_ERR(vol_args);
1817	1896	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1818	1897
1819		- ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1820		- vol_args->fd, subvol,
1821		- NULL, false, NULL);
	1898	+ ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
	1899	+ subvol, false, NULL);
1822	1900
1823	1901	kfree(vol_args);
1824	1902	return ret;
..	..	@@ -1829,8 +1907,6 @@
1829	1907	{
1830	1908	struct btrfs_ioctl_vol_args_v2 *vol_args;
1831	1909	int ret;
1832		- u64 transid = 0;
1833		- u64 *ptr = NULL;
1834	1910	bool readonly = false;
1835	1911	struct btrfs_qgroup_inherit *inherit = NULL;
1836	1912
..	..	@@ -1842,15 +1918,11 @@
1842	1918	return PTR_ERR(vol_args);
1843	1919	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
1844	1920
1845		- if (vol_args->flags &
1846		- ~(BTRFS_SUBVOL_CREATE_ASYNC \| BTRFS_SUBVOL_RDONLY \|
1847		- BTRFS_SUBVOL_QGROUP_INHERIT)) {
	1921	+ if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
1848	1922	ret = -EOPNOTSUPP;
1849	1923	goto free_args;
1850	1924	}
1851	1925
1852		- if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
1853		- ptr = &transid;
1854	1926	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1855	1927	readonly = true;
1856	1928	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
..	..	@@ -1882,18 +1954,10 @@
1882	1954	}
1883	1955	}
1884	1956
1885		- ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1886		- vol_args->fd, subvol, ptr,
1887		- readonly, inherit);
	1957	+ ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
	1958	+ subvol, readonly, inherit);
1888	1959	if (ret)
1889	1960	goto free_inherit;
1890		-
1891		- if (ptr && copy_to_user(arg +
1892		- offsetof(struct btrfs_ioctl_vol_args_v2,
1893		- transid),
1894		- ptr, sizeof(*ptr)))
1895		- ret = -EFAULT;
1896		-
1897	1961	free_inherit:
1898	1962	kfree(inherit);
1899	1963	free_args:
..	..	@@ -1949,11 +2013,6 @@
1949	2013
1950	2014	if (copy_from_user(&flags, arg, sizeof(flags))) {
1951	2015	ret = -EFAULT;
1952		- goto out_drop_write;
1953		- }
1954		-
1955		- if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
1956		- ret = -EINVAL;
1957	2016	goto out_drop_write;
1958	2017	}
1959	2018
..	..	@@ -2112,7 +2171,7 @@
2112	2171	* problem. Otherwise we'll fault and then copy the buffer in
2113	2172	* properly this next time through
2114	2173	*/
2115		- if (probe_user_write(ubuf + *sk_offset, &sh, sizeof(sh))) {
	2174	+ if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
2116	2175	ret = 0;
2117	2176	goto out;
2118	2177	}
..	..	@@ -2199,12 +2258,9 @@
2199	2258
2200	2259	if (sk->tree_id == 0) {
2201	2260	/* search the root of the inode that was passed */
2202		- root = BTRFS_I(inode)->root;
	2261	+ root = btrfs_grab_root(BTRFS_I(inode)->root);
2203	2262	} else {
2204		- key.objectid = sk->tree_id;
2205		- key.type = BTRFS_ROOT_ITEM_KEY;
2206		- key.offset = (u64)-1;
2207		- root = btrfs_read_fs_root_no_name(info, &key);
	2263	+ root = btrfs_get_fs_root(info, sk->tree_id, true);
2208	2264	if (IS_ERR(root)) {
2209	2265	btrfs_free_path(path);
2210	2266	return PTR_ERR(root);
..	..	@@ -2238,6 +2294,7 @@
2238	2294	ret = 0;
2239	2295	err:
2240	2296	sk->nr_items = num_found;
	2297	+ btrfs_put_root(root);
2241	2298	btrfs_free_path(path);
2242	2299	return ret;
2243	2300	}
..	..	@@ -2341,12 +2398,10 @@
2341	2398
2342	2399	ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1];
2343	2400
2344		- key.objectid = tree_id;
2345		- key.type = BTRFS_ROOT_ITEM_KEY;
2346		- key.offset = (u64)-1;
2347		- root = btrfs_read_fs_root_no_name(info, &key);
	2401	+ root = btrfs_get_fs_root(info, tree_id, true);
2348	2402	if (IS_ERR(root)) {
2349	2403	ret = PTR_ERR(root);
	2404	+ root = NULL;
2350	2405	goto out;
2351	2406	}
2352	2407
..	..	@@ -2397,6 +2452,7 @@
2397	2452	name[total_len] = '\0';
2398	2453	ret = 0;
2399	2454	out:
	2455	+ btrfs_put_root(root);
2400	2456	btrfs_free_path(path);
2401	2457	return ret;
2402	2458	}
..	..	@@ -2413,7 +2469,7 @@
2413	2469	unsigned long item_len;
2414	2470	struct btrfs_inode_ref *iref;
2415	2471	struct btrfs_root_ref *rref;
2416		- struct btrfs_root *root;
	2472	+ struct btrfs_root *root = NULL;
2417	2473	struct btrfs_path *path;
2418	2474	struct btrfs_key key, key2;
2419	2475	struct extent_buffer *leaf;
..	..	@@ -2435,10 +2491,7 @@
2435	2491	if (dirid != upper_limit.objectid) {
2436	2492	ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
2437	2493
2438		- key.objectid = treeid;
2439		- key.type = BTRFS_ROOT_ITEM_KEY;
2440		- key.offset = (u64)-1;
2441		- root = btrfs_read_fs_root_no_name(fs_info, &key);
	2494	+ root = btrfs_get_fs_root(fs_info, treeid, true);
2442	2495	if (IS_ERR(root)) {
2443	2496	ret = PTR_ERR(root);
2444	2497	goto out;
..	..	@@ -2450,15 +2503,15 @@
2450	2503	while (1) {
2451	2504	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2452	2505	if (ret < 0) {
2453		- goto out;
	2506	+ goto out_put;
2454	2507	} else if (ret > 0) {
2455	2508	ret = btrfs_previous_item(root, path, dirid,
2456	2509	BTRFS_INODE_REF_KEY);
2457	2510	if (ret < 0) {
2458		- goto out;
	2511	+ goto out_put;
2459	2512	} else if (ret > 0) {
2460	2513	ret = -ENOENT;
2461		- goto out;
	2514	+ goto out_put;
2462	2515	}
2463	2516	}
2464	2517
..	..	@@ -2472,7 +2525,7 @@
2472	2525	total_len += len + 1;
2473	2526	if (ptr < args->path) {
2474	2527	ret = -ENAMETOOLONG;
2475		- goto out;
	2528	+ goto out_put;
2476	2529	}
2477	2530
2478	2531	*(ptr + len) = '/';
..	..	@@ -2483,10 +2536,10 @@
2483	2536	ret = btrfs_previous_item(root, path, dirid,
2484	2537	BTRFS_INODE_ITEM_KEY);
2485	2538	if (ret < 0) {
2486		- goto out;
	2539	+ goto out_put;
2487	2540	} else if (ret > 0) {
2488	2541	ret = -ENOENT;
2489		- goto out;
	2542	+ goto out_put;
2490	2543	}
2491	2544
2492	2545	leaf = path->nodes[0];
..	..	@@ -2494,29 +2547,35 @@
2494	2547	btrfs_item_key_to_cpu(leaf, &key2, slot);
2495	2548	if (key2.objectid != dirid) {
2496	2549	ret = -ENOENT;
2497		- goto out;
	2550	+ goto out_put;
2498	2551	}
2499	2552
2500		- temp_inode = btrfs_iget(sb, &key2, root, NULL);
	2553	+ /*
	2554	+ * We don't need the path anymore, so release it and
	2555	+ * avoid deadlocks and lockdep warnings in case
	2556	+ * btrfs_iget() needs to lookup the inode from its root
	2557	+ * btree and lock the same leaf.
	2558	+ */
	2559	+ btrfs_release_path(path);
	2560	+ temp_inode = btrfs_iget(sb, key2.objectid, root);
2501	2561	if (IS_ERR(temp_inode)) {
2502	2562	ret = PTR_ERR(temp_inode);
2503		- goto out;
	2563	+ goto out_put;
2504	2564	}
2505	2565	ret = inode_permission(temp_inode, MAY_READ \| MAY_EXEC);
2506	2566	iput(temp_inode);
2507	2567	if (ret) {
2508	2568	ret = -EACCES;
2509		- goto out;
	2569	+ goto out_put;
2510	2570	}
2511	2571
2512	2572	if (key.offset == upper_limit.objectid)
2513	2573	break;
2514	2574	if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
2515	2575	ret = -EACCES;
2516		- goto out;
	2576	+ goto out_put;
2517	2577	}
2518	2578
2519		- btrfs_release_path(path);
2520	2579	key.objectid = key.offset;
2521	2580	key.offset = (u64)-1;
2522	2581	dirid = key.objectid;
..	..	@@ -2524,15 +2583,16 @@
2524	2583
2525	2584	memmove(args->path, ptr, total_len);
2526	2585	args->path[total_len] = '\0';
	2586	+ btrfs_put_root(root);
	2587	+ root = NULL;
2527	2588	btrfs_release_path(path);
2528	2589	}
2529	2590
2530	2591	/* Get the bottom subvolume's name from ROOT_REF */
2531		- root = fs_info->tree_root;
2532	2592	key.objectid = treeid;
2533	2593	key.type = BTRFS_ROOT_REF_KEY;
2534	2594	key.offset = args->treeid;
2535		- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	2595	+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
2536	2596	if (ret < 0) {
2537	2597	goto out;
2538	2598	} else if (ret > 0) {
..	..	@@ -2559,6 +2619,8 @@
2559	2619	read_extent_buffer(leaf, args->name, item_off, item_len);
2560	2620	args->name[item_len] = 0;
2561	2621
	2622	+out_put:
	2623	+ btrfs_put_root(root);
2562	2624	out:
2563	2625	btrfs_free_path(path);
2564	2626	return ret;
..	..	@@ -2681,12 +2743,10 @@
2681	2743
2682	2744	/* Get root_item of inode's subvolume */
2683	2745	key.objectid = BTRFS_I(inode)->root->root_key.objectid;
2684		- key.type = BTRFS_ROOT_ITEM_KEY;
2685		- key.offset = (u64)-1;
2686		- root = btrfs_read_fs_root_no_name(fs_info, &key);
	2746	+ root = btrfs_get_fs_root(fs_info, key.objectid, true);
2687	2747	if (IS_ERR(root)) {
2688	2748	ret = PTR_ERR(root);
2689		- goto out;
	2749	+ goto out_free;
2690	2750	}
2691	2751	root_item = &root->root_item;
2692	2752
..	..	@@ -2719,16 +2779,14 @@
2719	2779
2720	2780	if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
2721	2781	/* Search root tree for ROOT_BACKREF of this subvolume */
2722		- root = fs_info->tree_root;
2723		-
2724	2782	key.type = BTRFS_ROOT_BACKREF_KEY;
2725	2783	key.offset = 0;
2726		- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	2784	+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
2727	2785	if (ret < 0) {
2728	2786	goto out;
2729	2787	} else if (path->slots[0] >=
2730	2788	btrfs_header_nritems(path->nodes[0])) {
2731		- ret = btrfs_next_leaf(root, path);
	2789	+ ret = btrfs_next_leaf(fs_info->tree_root, path);
2732	2790	if (ret < 0) {
2733	2791	goto out;
2734	2792	} else if (ret > 0) {
..	..	@@ -2759,12 +2817,16 @@
2759	2817	}
2760	2818	}
2761	2819
	2820	+ btrfs_free_path(path);
	2821	+ path = NULL;
2762	2822	if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
2763	2823	ret = -EFAULT;
2764	2824
2765	2825	out:
	2826	+ btrfs_put_root(root);
	2827	+out_free:
2766	2828	btrfs_free_path(path);
2767		- kzfree(subvol_info);
	2829	+ kfree(subvol_info);
2768	2830	return ret;
2769	2831	}
2770	2832
..	..	@@ -2849,6 +2911,8 @@
2849	2911	}
2850	2912
2851	2913	out:
	2914	+ btrfs_free_path(path);
	2915	+
2852	2916	if (!ret \|\| ret == -EOVERFLOW) {
2853	2917	rootrefs->num_items = found;
2854	2918	/* update min_treeid for next search */
..	..	@@ -2860,13 +2924,13 @@
2860	2924	}
2861	2925
2862	2926	kfree(rootrefs);
2863		- btrfs_free_path(path);
2864	2927
2865	2928	return ret;
2866	2929	}
2867	2930
2868	2931	static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2869		- void __user *arg)
	2932	+ void __user *arg,
	2933	+ bool destroy_v2)
2870	2934	{
2871	2935	struct dentry *parent = file->f_path.dentry;
2872	2936	struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
..	..	@@ -2875,34 +2939,120 @@
2875	2939	struct inode *inode;
2876	2940	struct btrfs_root *root = BTRFS_I(dir)->root;
2877	2941	struct btrfs_root *dest = NULL;
2878		- struct btrfs_ioctl_vol_args *vol_args;
2879		- int namelen;
	2942	+ struct btrfs_ioctl_vol_args *vol_args = NULL;
	2943	+ struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
	2944	+ char *subvol_name, *subvol_name_ptr = NULL;
	2945	+ int subvol_namelen;
2880	2946	int err = 0;
	2947	+ bool destroy_parent = false;
2881	2948
2882		- if (!S_ISDIR(dir->i_mode))
2883		- return -ENOTDIR;
	2949	+ if (destroy_v2) {
	2950	+ vol_args2 = memdup_user(arg, sizeof(*vol_args2));
	2951	+ if (IS_ERR(vol_args2))
	2952	+ return PTR_ERR(vol_args2);
2884	2953
2885		- vol_args = memdup_user(arg, sizeof(*vol_args));
2886		- if (IS_ERR(vol_args))
2887		- return PTR_ERR(vol_args);
	2954	+ if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
	2955	+ err = -EOPNOTSUPP;
	2956	+ goto out;
	2957	+ }
2888	2958
2889		- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2890		- namelen = strlen(vol_args->name);
2891		- if (strchr(vol_args->name, '/') \|\|
2892		- strncmp(vol_args->name, "..", namelen) == 0) {
2893		- err = -EINVAL;
2894		- goto out;
	2959	+ /*
	2960	+ * If SPEC_BY_ID is not set, we are looking for the subvolume by
	2961	+ * name, same as v1 currently does.
	2962	+ */
	2963	+ if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
	2964	+ vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0;
	2965	+ subvol_name = vol_args2->name;
	2966	+
	2967	+ err = mnt_want_write_file(file);
	2968	+ if (err)
	2969	+ goto out;
	2970	+ } else {
	2971	+ if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
	2972	+ err = -EINVAL;
	2973	+ goto out;
	2974	+ }
	2975	+
	2976	+ err = mnt_want_write_file(file);
	2977	+ if (err)
	2978	+ goto out;
	2979	+
	2980	+ dentry = btrfs_get_dentry(fs_info->sb,
	2981	+ BTRFS_FIRST_FREE_OBJECTID,
	2982	+ vol_args2->subvolid, 0, 0);
	2983	+ if (IS_ERR(dentry)) {
	2984	+ err = PTR_ERR(dentry);
	2985	+ goto out_drop_write;
	2986	+ }
	2987	+
	2988	+ /*
	2989	+ * Change the default parent since the subvolume being
	2990	+ * deleted can be outside of the current mount point.
	2991	+ */
	2992	+ parent = btrfs_get_parent(dentry);
	2993	+
	2994	+ /*
	2995	+ * At this point dentry->d_name can point to '/' if the
	2996	+ * subvolume we want to destroy is outside of the
	2997	+ * current mount point, so we need to release the
	2998	+ * current dentry and execute the lookup to return a new
	2999	+ * one with ->d_name pointing to the
	3000	+ * <mount point>/subvol_name.
	3001	+ */
	3002	+ dput(dentry);
	3003	+ if (IS_ERR(parent)) {
	3004	+ err = PTR_ERR(parent);
	3005	+ goto out_drop_write;
	3006	+ }
	3007	+ dir = d_inode(parent);
	3008	+
	3009	+ /*
	3010	+ * If v2 was used with SPEC_BY_ID, a new parent was
	3011	+ * allocated since the subvolume can be outside of the
	3012	+ * current mount point. Later on we need to release this
	3013	+ * new parent dentry.
	3014	+ */
	3015	+ destroy_parent = true;
	3016	+
	3017	+ subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
	3018	+ fs_info, vol_args2->subvolid);
	3019	+ if (IS_ERR(subvol_name_ptr)) {
	3020	+ err = PTR_ERR(subvol_name_ptr);
	3021	+ goto free_parent;
	3022	+ }
	3023	+ /* subvol_name_ptr is already NUL terminated */
	3024	+ subvol_name = (char *)kbasename(subvol_name_ptr);
	3025	+ }
	3026	+ } else {
	3027	+ vol_args = memdup_user(arg, sizeof(*vol_args));
	3028	+ if (IS_ERR(vol_args))
	3029	+ return PTR_ERR(vol_args);
	3030	+
	3031	+ vol_args->name[BTRFS_PATH_NAME_MAX] = 0;
	3032	+ subvol_name = vol_args->name;
	3033	+
	3034	+ err = mnt_want_write_file(file);
	3035	+ if (err)
	3036	+ goto out;
2895	3037	}
2896	3038
2897		- err = mnt_want_write_file(file);
2898		- if (err)
2899		- goto out;
	3039	+ subvol_namelen = strlen(subvol_name);
2900	3040
	3041	+ if (strchr(subvol_name, '/') \|\|
	3042	+ strncmp(subvol_name, "..", subvol_namelen) == 0) {
	3043	+ err = -EINVAL;
	3044	+ goto free_subvol_name;
	3045	+ }
	3046	+
	3047	+ if (!S_ISDIR(dir->i_mode)) {
	3048	+ err = -ENOTDIR;
	3049	+ goto free_subvol_name;
	3050	+ }
2901	3051
2902	3052	err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
2903	3053	if (err == -EINTR)
2904		- goto out_drop_write;
2905		- dentry = lookup_one_len(vol_args->name, parent, namelen);
	3054	+ goto free_subvol_name;
	3055	+ dentry = lookup_one_len(subvol_name, parent, subvol_namelen);
2906	3056	if (IS_ERR(dentry)) {
2907	3057	err = PTR_ERR(dentry);
2908	3058	goto out_unlock_dir;
..	..	@@ -2963,15 +3113,21 @@
2963	3113	err = btrfs_delete_subvolume(dir, dentry);
2964	3114	inode_unlock(inode);
2965	3115	if (!err)
2966		- d_delete(dentry);
	3116	+ d_delete_notify(dir, dentry);
2967	3117
2968	3118	out_dput:
2969	3119	dput(dentry);
2970	3120	out_unlock_dir:
2971	3121	inode_unlock(dir);
	3122	+free_subvol_name:
	3123	+ kfree(subvol_name_ptr);
	3124	+free_parent:
	3125	+ if (destroy_parent)
	3126	+ dput(parent);
2972	3127	out_drop_write:
2973	3128	mnt_drop_write_file(file);
2974	3129	out:
	3130	+ kfree(vol_args2);
2975	3131	kfree(vol_args);
2976	3132	return err;
2977	3133	}
..	..	@@ -3056,7 +3212,7 @@
3056	3212	if (!capable(CAP_SYS_ADMIN))
3057	3213	return -EPERM;
3058	3214
3059		- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
	3215	+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD))
3060	3216	return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
3061	3217
3062	3218	vol_args = memdup_user(arg, sizeof(*vol_args));
..	..	@@ -3073,7 +3229,7 @@
3073	3229
3074	3230	kfree(vol_args);
3075	3231	out:
3076		- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
	3232	+ btrfs_exclop_finish(fs_info);
3077	3233	return ret;
3078	3234	}
3079	3235
..	..	@@ -3097,13 +3253,12 @@
3097	3253	goto err_drop;
3098	3254	}
3099	3255
3100		- /* Check for compatibility reject unknown flags */
3101		- if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) {
	3256	+ if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
3102	3257	ret = -EOPNOTSUPP;
3103	3258	goto out;
3104	3259	}
3105	3260
3106		- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
	3261	+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
3107	3262	ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
3108	3263	goto out;
3109	3264	}
..	..	@@ -3114,7 +3269,7 @@
3114	3269	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
3115	3270	ret = btrfs_rm_device(fs_info, vol_args->name, 0);
3116	3271	}
3117		- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
	3272	+ btrfs_exclop_finish(fs_info);
3118	3273
3119	3274	if (!ret) {
3120	3275	if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
..	..	@@ -3145,7 +3300,7 @@
3145	3300	if (ret)
3146	3301	return ret;
3147	3302
3148		- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
	3303	+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
3149	3304	ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
3150	3305	goto out_drop_write;
3151	3306	}
..	..	@@ -3163,7 +3318,7 @@
3163	3318	btrfs_info(fs_info, "disk deleted %s", vol_args->name);
3164	3319	kfree(vol_args);
3165	3320	out:
3166		- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
	3321	+ btrfs_exclop_finish(fs_info);
3167	3322	out_drop_write:
3168	3323	mnt_drop_write_file(file);
3169	3324
..	..	@@ -3176,11 +3331,15 @@
3176	3331	struct btrfs_ioctl_fs_info_args *fi_args;
3177	3332	struct btrfs_device *device;
3178	3333	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	3334	+ u64 flags_in;
3179	3335	int ret = 0;
3180	3336
3181		- fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
3182		- if (!fi_args)
3183		- return -ENOMEM;
	3337	+ fi_args = memdup_user(arg, sizeof(*fi_args));
	3338	+ if (IS_ERR(fi_args))
	3339	+ return PTR_ERR(fi_args);
	3340	+
	3341	+ flags_in = fi_args->flags;
	3342	+ memset(fi_args, 0, sizeof(*fi_args));
3184	3343
3185	3344	rcu_read_lock();
3186	3345	fi_args->num_devices = fs_devices->num_devices;
..	..	@@ -3191,10 +3350,27 @@
3191	3350	}
3192	3351	rcu_read_unlock();
3193	3352
3194		- memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
	3353	+ memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid));
3195	3354	fi_args->nodesize = fs_info->nodesize;
3196	3355	fi_args->sectorsize = fs_info->sectorsize;
3197	3356	fi_args->clone_alignment = fs_info->sectorsize;
	3357	+
	3358	+ if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) {
	3359	+ fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy);
	3360	+ fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy);
	3361	+ fi_args->flags \|= BTRFS_FS_INFO_FLAG_CSUM_INFO;
	3362	+ }
	3363	+
	3364	+ if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
	3365	+ fi_args->generation = fs_info->generation;
	3366	+ fi_args->flags \|= BTRFS_FS_INFO_FLAG_GENERATION;
	3367	+ }
	3368	+
	3369	+ if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) {
	3370	+ memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid,
	3371	+ sizeof(fi_args->metadata_uuid));
	3372	+ fi_args->flags \|= BTRFS_FS_INFO_FLAG_METADATA_UUID;
	3373	+ }
3198	3374
3199	3375	if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
3200	3376	ret = -EFAULT;
..	..	@@ -3231,13 +3407,10 @@
3231	3407	di_args->bytes_used = btrfs_device_get_bytes_used(dev);
3232	3408	di_args->total_bytes = btrfs_device_get_total_bytes(dev);
3233	3409	memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
3234		- if (dev->name) {
3235		- strncpy(di_args->path, rcu_str_deref(dev->name),
3236		- sizeof(di_args->path) - 1);
3237		- di_args->path[sizeof(di_args->path) - 1] = 0;
3238		- } else {
	3410	+ if (dev->name)
	3411	+ strscpy(di_args->path, rcu_str_deref(dev->name), sizeof(di_args->path));
	3412	+ else
3239	3413	di_args->path[0] = '\0';
3240		- }
3241	3414
3242	3415	out:
3243	3416	rcu_read_unlock();
..	..	@@ -3248,1183 +3421,6 @@
3248	3421	return ret;
3249	3422	}
3250	3423
3251		-static struct page extent_same_get_page(struct inode inode, pgoff_t index)
3252		-{
3253		- struct page *page;
3254		-
3255		- page = grab_cache_page(inode->i_mapping, index);
3256		- if (!page)
3257		- return ERR_PTR(-ENOMEM);
3258		-
3259		- if (!PageUptodate(page)) {
3260		- int ret;
3261		-
3262		- ret = btrfs_readpage(NULL, page);
3263		- if (ret)
3264		- return ERR_PTR(ret);
3265		- lock_page(page);
3266		- if (!PageUptodate(page)) {
3267		- unlock_page(page);
3268		- put_page(page);
3269		- return ERR_PTR(-EIO);
3270		- }
3271		- if (page->mapping != inode->i_mapping) {
3272		- unlock_page(page);
3273		- put_page(page);
3274		- return ERR_PTR(-EAGAIN);
3275		- }
3276		- }
3277		-
3278		- return page;
3279		-}
3280		-
3281		-static int gather_extent_pages(struct inode inode, struct page *pages,
3282		- int num_pages, u64 off)
3283		-{
3284		- int i;
3285		- pgoff_t index = off >> PAGE_SHIFT;
3286		-
3287		- for (i = 0; i < num_pages; i++) {
3288		-again:
3289		- pages[i] = extent_same_get_page(inode, index + i);
3290		- if (IS_ERR(pages[i])) {
3291		- int err = PTR_ERR(pages[i]);
3292		-
3293		- if (err == -EAGAIN)
3294		- goto again;
3295		- pages[i] = NULL;
3296		- return err;
3297		- }
3298		- }
3299		- return 0;
3300		-}
3301		-
3302		-static int lock_extent_range(struct inode *inode, u64 off, u64 len,
3303		- bool retry_range_locking)
3304		-{
3305		- /*
3306		- * Do any pending delalloc/csum calculations on inode, one way or
3307		- * another, and lock file content.
3308		- * The locking order is:
3309		- *
3310		- * 1) pages
3311		- * 2) range in the inode's io tree
3312		- */
3313		- while (1) {
3314		- struct btrfs_ordered_extent *ordered;
3315		- lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
3316		- ordered = btrfs_lookup_first_ordered_extent(inode,
3317		- off + len - 1);
3318		- if ((!ordered \|\|
3319		- ordered->file_offset + ordered->len <= off \|\|
3320		- ordered->file_offset >= off + len) &&
3321		- !test_range_bit(&BTRFS_I(inode)->io_tree, off,
3322		- off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
3323		- if (ordered)
3324		- btrfs_put_ordered_extent(ordered);
3325		- break;
3326		- }
3327		- unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
3328		- if (ordered)
3329		- btrfs_put_ordered_extent(ordered);
3330		- if (!retry_range_locking)
3331		- return -EAGAIN;
3332		- btrfs_wait_ordered_range(inode, off, len);
3333		- }
3334		- return 0;
3335		-}
3336		-
3337		-static void btrfs_double_inode_unlock(struct inode inode1, struct inode inode2)
3338		-{
3339		- inode_unlock(inode1);
3340		- inode_unlock(inode2);
3341		-}
3342		-
3343		-static void btrfs_double_inode_lock(struct inode inode1, struct inode inode2)
3344		-{
3345		- if (inode1 < inode2)
3346		- swap(inode1, inode2);
3347		-
3348		- inode_lock_nested(inode1, I_MUTEX_PARENT);
3349		- inode_lock_nested(inode2, I_MUTEX_CHILD);
3350		-}
3351		-
3352		-static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
3353		- struct inode *inode2, u64 loff2, u64 len)
3354		-{
3355		- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
3356		- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
3357		-}
3358		-
3359		-static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
3360		- struct inode *inode2, u64 loff2, u64 len,
3361		- bool retry_range_locking)
3362		-{
3363		- int ret;
3364		-
3365		- if (inode1 < inode2) {
3366		- swap(inode1, inode2);
3367		- swap(loff1, loff2);
3368		- }
3369		- ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
3370		- if (ret)
3371		- return ret;
3372		- ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
3373		- if (ret)
3374		- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
3375		- loff1 + len - 1);
3376		- return ret;
3377		-}
3378		-
3379		-struct cmp_pages {
3380		- int num_pages;
3381		- struct page **src_pages;
3382		- struct page **dst_pages;
3383		-};
3384		-
3385		-static void btrfs_cmp_data_free(struct cmp_pages *cmp)
3386		-{
3387		- int i;
3388		- struct page *pg;
3389		-
3390		- for (i = 0; i < cmp->num_pages; i++) {
3391		- pg = cmp->src_pages[i];
3392		- if (pg) {
3393		- unlock_page(pg);
3394		- put_page(pg);
3395		- cmp->src_pages[i] = NULL;
3396		- }
3397		- pg = cmp->dst_pages[i];
3398		- if (pg) {
3399		- unlock_page(pg);
3400		- put_page(pg);
3401		- cmp->dst_pages[i] = NULL;
3402		- }
3403		- }
3404		-}
3405		-
3406		-static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
3407		- struct inode *dst, u64 dst_loff,
3408		- u64 len, struct cmp_pages *cmp)
3409		-{
3410		- int ret;
3411		- int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
3412		-
3413		- cmp->num_pages = num_pages;
3414		-
3415		- ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff);
3416		- if (ret)
3417		- goto out;
3418		-
3419		- ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff);
3420		-
3421		-out:
3422		- if (ret)
3423		- btrfs_cmp_data_free(cmp);
3424		- return ret;
3425		-}
3426		-
3427		-static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp)
3428		-{
3429		- int ret = 0;
3430		- int i;
3431		- struct page src_page, dst_page;
3432		- unsigned int cmp_len = PAGE_SIZE;
3433		- void addr, dst_addr;
3434		-
3435		- i = 0;
3436		- while (len) {
3437		- if (len < PAGE_SIZE)
3438		- cmp_len = len;
3439		-
3440		- BUG_ON(i >= cmp->num_pages);
3441		-
3442		- src_page = cmp->src_pages[i];
3443		- dst_page = cmp->dst_pages[i];
3444		- ASSERT(PageLocked(src_page));
3445		- ASSERT(PageLocked(dst_page));
3446		-
3447		- addr = kmap_atomic(src_page);
3448		- dst_addr = kmap_atomic(dst_page);
3449		-
3450		- flush_dcache_page(src_page);
3451		- flush_dcache_page(dst_page);
3452		-
3453		- if (memcmp(addr, dst_addr, cmp_len))
3454		- ret = -EBADE;
3455		-
3456		- kunmap_atomic(addr);
3457		- kunmap_atomic(dst_addr);
3458		-
3459		- if (ret)
3460		- break;
3461		-
3462		- len -= cmp_len;
3463		- i++;
3464		- }
3465		-
3466		- return ret;
3467		-}
3468		-
3469		-static int extent_same_check_offsets(struct inode inode, u64 off, u64 plen,
3470		- u64 olen)
3471		-{
3472		- u64 len = *plen;
3473		- u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
3474		-
3475		- if (off + olen > inode->i_size \|\| off + olen < off)
3476		- return -EINVAL;
3477		-
3478		- /* if we extend to eof, continue to block boundary */
3479		- if (off + len == inode->i_size)
3480		- *plen = len = ALIGN(inode->i_size, bs) - off;
3481		-
3482		- /* Check that we are block aligned - btrfs_clone() requires this */
3483		- if (!IS_ALIGNED(off, bs) \|\| !IS_ALIGNED(off + len, bs))
3484		- return -EINVAL;
3485		-
3486		- return 0;
3487		-}
3488		-
3489		-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
3490		- struct inode *dst, u64 dst_loff,
3491		- struct cmp_pages *cmp)
3492		-{
3493		- int ret;
3494		- u64 len = olen;
3495		- bool same_inode = (src == dst);
3496		- u64 same_lock_start = 0;
3497		- u64 same_lock_len = 0;
3498		-
3499		- ret = extent_same_check_offsets(src, loff, &len, olen);
3500		- if (ret)
3501		- return ret;
3502		-
3503		- ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
3504		- if (ret)
3505		- return ret;
3506		-
3507		- if (same_inode) {
3508		- /*
3509		- * Single inode case wants the same checks, except we
3510		- * don't want our length pushed out past i_size as
3511		- * comparing that data range makes no sense.
3512		- *
3513		- * extent_same_check_offsets() will do this for an
3514		- * unaligned length at i_size, so catch it here and
3515		- * reject the request.
3516		- *
3517		- * This effectively means we require aligned extents
3518		- * for the single-inode case, whereas the other cases
3519		- * allow an unaligned length so long as it ends at
3520		- * i_size.
3521		- */
3522		- if (len != olen)
3523		- return -EINVAL;
3524		-
3525		- /* Check for overlapping ranges */
3526		- if (dst_loff + len > loff && dst_loff < loff + len)
3527		- return -EINVAL;
3528		-
3529		- same_lock_start = min_t(u64, loff, dst_loff);
3530		- same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
3531		- } else {
3532		- /*
3533		- * If the source and destination inodes are different, the
3534		- * source's range end offset matches the source's i_size, that
3535		- * i_size is not a multiple of the sector size, and the
3536		- * destination range does not go past the destination's i_size,
3537		- * we must round down the length to the nearest sector size
3538		- * multiple. If we don't do this adjustment we end replacing
3539		- * with zeroes the bytes in the range that starts at the
3540		- * deduplication range's end offset and ends at the next sector
3541		- * size multiple.
3542		- */
3543		- if (loff + olen == i_size_read(src) &&
3544		- dst_loff + len < i_size_read(dst)) {
3545		- const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize;
3546		-
3547		- len = round_down(i_size_read(src), sz) - loff;
3548		- if (len == 0)
3549		- return 0;
3550		- olen = len;
3551		- }
3552		- }
3553		-
3554		-again:
3555		- ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp);
3556		- if (ret)
3557		- return ret;
3558		-
3559		- if (same_inode)
3560		- ret = lock_extent_range(src, same_lock_start, same_lock_len,
3561		- false);
3562		- else
3563		- ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
3564		- false);
3565		- /*
3566		- * If one of the inodes has dirty pages in the respective range or
3567		- * ordered extents, we need to flush dellaloc and wait for all ordered
3568		- * extents in the range. We must unlock the pages and the ranges in the
3569		- * io trees to avoid deadlocks when flushing delalloc (requires locking
3570		- * pages) and when waiting for ordered extents to complete (they require
3571		- * range locking).
3572		- */
3573		- if (ret == -EAGAIN) {
3574		- /*
3575		- * Ranges in the io trees already unlocked. Now unlock all
3576		- * pages before waiting for all IO to complete.
3577		- */
3578		- btrfs_cmp_data_free(cmp);
3579		- if (same_inode) {
3580		- btrfs_wait_ordered_range(src, same_lock_start,
3581		- same_lock_len);
3582		- } else {
3583		- btrfs_wait_ordered_range(src, loff, len);
3584		- btrfs_wait_ordered_range(dst, dst_loff, len);
3585		- }
3586		- goto again;
3587		- }
3588		- ASSERT(ret == 0);
3589		- if (WARN_ON(ret)) {
3590		- /* ranges in the io trees already unlocked */
3591		- btrfs_cmp_data_free(cmp);
3592		- return ret;
3593		- }
3594		-
3595		- /* pass original length for comparison so we stay within i_size */
3596		- ret = btrfs_cmp_data(olen, cmp);
3597		- if (ret == 0)
3598		- ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
3599		-
3600		- if (same_inode)
3601		- unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
3602		- same_lock_start + same_lock_len - 1);
3603		- else
3604		- btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
3605		-
3606		- btrfs_cmp_data_free(cmp);
3607		-
3608		- return ret;
3609		-}
3610		-
3611		-#define BTRFS_MAX_DEDUPE_LEN SZ_16M
3612		-
3613		-static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
3614		- struct inode *dst, u64 dst_loff)
3615		-{
3616		- int ret;
3617		- struct cmp_pages cmp;
3618		- int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
3619		- bool same_inode = (src == dst);
3620		- u64 i, tail_len, chunk_count;
3621		-
3622		- if (olen == 0)
3623		- return 0;
3624		-
3625		- if (same_inode)
3626		- inode_lock(src);
3627		- else
3628		- btrfs_double_inode_lock(src, dst);
3629		-
3630		- /* don't make the dst file partly checksummed */
3631		- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
3632		- (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
3633		- ret = -EINVAL;
3634		- goto out_unlock;
3635		- }
3636		-
3637		- tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
3638		- chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
3639		- if (chunk_count == 0)
3640		- num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
3641		-
3642		- /*
3643		- * If deduping ranges in the same inode, locking rules make it
3644		- * mandatory to always lock pages in ascending order to avoid deadlocks
3645		- * with concurrent tasks (such as starting writeback/delalloc).
3646		- */
3647		- if (same_inode && dst_loff < loff)
3648		- swap(loff, dst_loff);
3649		-
3650		- /*
3651		- * We must gather up all the pages before we initiate our extent
3652		- * locking. We use an array for the page pointers. Size of the array is
3653		- * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN.
3654		- */
3655		- cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *),
3656		- GFP_KERNEL \| __GFP_ZERO);
3657		- cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *),
3658		- GFP_KERNEL \| __GFP_ZERO);
3659		- if (!cmp.src_pages \|\| !cmp.dst_pages) {
3660		- ret = -ENOMEM;
3661		- goto out_free;
3662		- }
3663		-
3664		- for (i = 0; i < chunk_count; i++) {
3665		- ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
3666		- dst, dst_loff, &cmp);
3667		- if (ret)
3668		- goto out_free;
3669		-
3670		- loff += BTRFS_MAX_DEDUPE_LEN;
3671		- dst_loff += BTRFS_MAX_DEDUPE_LEN;
3672		- }
3673		-
3674		- if (tail_len > 0)
3675		- ret = btrfs_extent_same_range(src, loff, tail_len, dst,
3676		- dst_loff, &cmp);
3677		-
3678		-out_free:
3679		- kvfree(cmp.src_pages);
3680		- kvfree(cmp.dst_pages);
3681		-
3682		-out_unlock:
3683		- if (same_inode)
3684		- inode_unlock(src);
3685		- else
3686		- btrfs_double_inode_unlock(src, dst);
3687		-
3688		- return ret;
3689		-}
3690		-
3691		-int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
3692		- struct file *dst_file, loff_t dst_loff,
3693		- u64 olen)
3694		-{
3695		- struct inode *src = file_inode(src_file);
3696		- struct inode *dst = file_inode(dst_file);
3697		- u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
3698		-
3699		- if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
3700		- /*
3701		- * Btrfs does not support blocksize < page_size. As a
3702		- * result, btrfs_cmp_data() won't correctly handle
3703		- * this situation without an update.
3704		- */
3705		- return -EINVAL;
3706		- }
3707		-
3708		- return btrfs_extent_same(src, src_loff, olen, dst, dst_loff);
3709		-}
3710		-
3711		-static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
3712		- struct inode *inode,
3713		- u64 endoff,
3714		- const u64 destoff,
3715		- const u64 olen,
3716		- int no_time_update)
3717		-{
3718		- struct btrfs_root *root = BTRFS_I(inode)->root;
3719		- int ret;
3720		-
3721		- inode_inc_iversion(inode);
3722		- if (!no_time_update)
3723		- inode->i_mtime = inode->i_ctime = current_time(inode);
3724		- /*
3725		- * We round up to the block size at eof when determining which
3726		- * extents to clone above, but shouldn't round up the file size.
3727		- */
3728		- if (endoff > destoff + olen)
3729		- endoff = destoff + olen;
3730		- if (endoff > inode->i_size)
3731		- btrfs_i_size_write(BTRFS_I(inode), endoff);
3732		-
3733		- ret = btrfs_update_inode(trans, root, inode);
3734		- if (ret) {
3735		- btrfs_abort_transaction(trans, ret);
3736		- btrfs_end_transaction(trans);
3737		- goto out;
3738		- }
3739		- ret = btrfs_end_transaction(trans);
3740		-out:
3741		- return ret;
3742		-}
3743		-
3744		-static void clone_update_extent_map(struct btrfs_inode *inode,
3745		- const struct btrfs_trans_handle *trans,
3746		- const struct btrfs_path *path,
3747		- const u64 hole_offset,
3748		- const u64 hole_len)
3749		-{
3750		- struct extent_map_tree *em_tree = &inode->extent_tree;
3751		- struct extent_map *em;
3752		- int ret;
3753		-
3754		- em = alloc_extent_map();
3755		- if (!em) {
3756		- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
3757		- return;
3758		- }
3759		-
3760		- if (path) {
3761		- struct btrfs_file_extent_item *fi;
3762		-
3763		- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
3764		- struct btrfs_file_extent_item);
3765		- btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
3766		- em->generation = -1;
3767		- if (btrfs_file_extent_type(path->nodes[0], fi) ==
3768		- BTRFS_FILE_EXTENT_INLINE)
3769		- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3770		- &inode->runtime_flags);
3771		- } else {
3772		- em->start = hole_offset;
3773		- em->len = hole_len;
3774		- em->ram_bytes = em->len;
3775		- em->orig_start = hole_offset;
3776		- em->block_start = EXTENT_MAP_HOLE;
3777		- em->block_len = 0;
3778		- em->orig_block_len = 0;
3779		- em->compress_type = BTRFS_COMPRESS_NONE;
3780		- em->generation = trans->transid;
3781		- }
3782		-
3783		- while (1) {
3784		- write_lock(&em_tree->lock);
3785		- ret = add_extent_mapping(em_tree, em, 1);
3786		- write_unlock(&em_tree->lock);
3787		- if (ret != -EEXIST) {
3788		- free_extent_map(em);
3789		- break;
3790		- }
3791		- btrfs_drop_extent_cache(inode, em->start,
3792		- em->start + em->len - 1, 0);
3793		- }
3794		-
3795		- if (ret)
3796		- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
3797		-}
3798		-
3799		-/*
3800		- * Make sure we do not end up inserting an inline extent into a file that has
3801		- * already other (non-inline) extents. If a file has an inline extent it can
3802		- * not have any other extents and the (single) inline extent must start at the
3803		- * file offset 0. Failing to respect these rules will lead to file corruption,
3804		- * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
3805		- *
3806		- * We can have extents that have been already written to disk or we can have
3807		- * dirty ranges still in delalloc, in which case the extent maps and items are
3808		- * created only when we run delalloc, and the delalloc ranges might fall outside
3809		- * the range we are currently locking in the inode's io tree. So we check the
3810		- * inode's i_size because of that (i_size updates are done while holding the
3811		- * i_mutex, which we are holding here).
3812		- * We also check to see if the inode has a size not greater than "datal" but has
3813		- * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
3814		- * protected against such concurrent fallocate calls by the i_mutex).
3815		- *
3816		- * If the file has no extents but a size greater than datal, do not allow the
3817		- * copy because we would need turn the inline extent into a non-inline one (even
3818		- * with NO_HOLES enabled). If we find our destination inode only has one inline
3819		- * extent, just overwrite it with the source inline extent if its size is less
3820		- * than the source extent's size, or we could copy the source inline extent's
3821		- * data into the destination inode's inline extent if the later is greater then
3822		- * the former.
3823		- */
3824		-static int clone_copy_inline_extent(struct inode *dst,
3825		- struct btrfs_trans_handle *trans,
3826		- struct btrfs_path *path,
3827		- struct btrfs_key *new_key,
3828		- const u64 drop_start,
3829		- const u64 datal,
3830		- const u64 skip,
3831		- const u64 size,
3832		- char *inline_data)
3833		-{
3834		- struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
3835		- struct btrfs_root *root = BTRFS_I(dst)->root;
3836		- const u64 aligned_end = ALIGN(new_key->offset + datal,
3837		- fs_info->sectorsize);
3838		- int ret;
3839		- struct btrfs_key key;
3840		-
3841		- if (new_key->offset > 0)
3842		- return -EOPNOTSUPP;
3843		-
3844		- key.objectid = btrfs_ino(BTRFS_I(dst));
3845		- key.type = BTRFS_EXTENT_DATA_KEY;
3846		- key.offset = 0;
3847		- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3848		- if (ret < 0) {
3849		- return ret;
3850		- } else if (ret > 0) {
3851		- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
3852		- ret = btrfs_next_leaf(root, path);
3853		- if (ret < 0)
3854		- return ret;
3855		- else if (ret > 0)
3856		- goto copy_inline_extent;
3857		- }
3858		- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3859		- if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
3860		- key.type == BTRFS_EXTENT_DATA_KEY) {
3861		- ASSERT(key.offset > 0);
3862		- return -EOPNOTSUPP;
3863		- }
3864		- } else if (i_size_read(dst) <= datal) {
3865		- struct btrfs_file_extent_item *ei;
3866		- u64 ext_len;
3867		-
3868		- /*
3869		- * If the file size is <= datal, make sure there are no other
3870		- * extents following (can happen do to an fallocate call with
3871		- * the flag FALLOC_FL_KEEP_SIZE).
3872		- */
3873		- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3874		- struct btrfs_file_extent_item);
3875		- /*
3876		- * If it's an inline extent, it can not have other extents
3877		- * following it.
3878		- */
3879		- if (btrfs_file_extent_type(path->nodes[0], ei) ==
3880		- BTRFS_FILE_EXTENT_INLINE)
3881		- goto copy_inline_extent;
3882		-
3883		- ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
3884		- if (ext_len > aligned_end)
3885		- return -EOPNOTSUPP;
3886		-
3887		- ret = btrfs_next_item(root, path);
3888		- if (ret < 0) {
3889		- return ret;
3890		- } else if (ret == 0) {
3891		- btrfs_item_key_to_cpu(path->nodes[0], &key,
3892		- path->slots[0]);
3893		- if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
3894		- key.type == BTRFS_EXTENT_DATA_KEY)
3895		- return -EOPNOTSUPP;
3896		- }
3897		- }
3898		-
3899		-copy_inline_extent:
3900		- /*
3901		- * We have no extent items, or we have an extent at offset 0 which may
3902		- * or may not be inlined. All these cases are dealt the same way.
3903		- */
3904		- if (i_size_read(dst) > datal) {
3905		- /*
3906		- * If the destination inode has an inline extent...
3907		- * This would require copying the data from the source inline
3908		- * extent into the beginning of the destination's inline extent.
3909		- * But this is really complex, both extents can be compressed
3910		- * or just one of them, which would require decompressing and
3911		- * re-compressing data (which could increase the new compressed
3912		- * size, not allowing the compressed data to fit anymore in an
3913		- * inline extent).
3914		- * So just don't support this case for now (it should be rare,
3915		- * we are not really saving space when cloning inline extents).
3916		- */
3917		- return -EOPNOTSUPP;
3918		- }
3919		-
3920		- btrfs_release_path(path);
3921		- ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
3922		- if (ret)
3923		- return ret;
3924		- ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
3925		- if (ret)
3926		- return ret;
3927		-
3928		- if (skip) {
3929		- const u32 start = btrfs_file_extent_calc_inline_size(0);
3930		-
3931		- memmove(inline_data + start, inline_data + start + skip, datal);
3932		- }
3933		-
3934		- write_extent_buffer(path->nodes[0], inline_data,
3935		- btrfs_item_ptr_offset(path->nodes[0],
3936		- path->slots[0]),
3937		- size);
3938		- inode_add_bytes(dst, datal);
3939		-
3940		- return 0;
3941		-}
3942		-
3943		-/**
3944		- * btrfs_clone() - clone a range from inode file to another
3945		- *
3946		- * @src: Inode to clone from
3947		- * @inode: Inode to clone to
3948		- * @off: Offset within source to start clone from
3949		- * @olen: Original length, passed by user, of range to clone
3950		- * @olen_aligned: Block-aligned value of olen
3951		- * @destoff: Offset within @inode to start clone
3952		- * @no_time_update: Whether to update mtime/ctime on the target inode
3953		- */
3954		-static int btrfs_clone(struct inode *src, struct inode *inode,
3955		- const u64 off, const u64 olen, const u64 olen_aligned,
3956		- const u64 destoff, int no_time_update)
3957		-{
3958		- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3959		- struct btrfs_root *root = BTRFS_I(inode)->root;
3960		- struct btrfs_path *path = NULL;
3961		- struct extent_buffer *leaf;
3962		- struct btrfs_trans_handle *trans;
3963		- char *buf = NULL;
3964		- struct btrfs_key key;
3965		- u32 nritems;
3966		- int slot;
3967		- int ret;
3968		- const u64 len = olen_aligned;
3969		- u64 last_dest_end = destoff;
3970		-
3971		- ret = -ENOMEM;
3972		- buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
3973		- if (!buf)
3974		- return ret;
3975		-
3976		- path = btrfs_alloc_path();
3977		- if (!path) {
3978		- kvfree(buf);
3979		- return ret;
3980		- }
3981		-
3982		- path->reada = READA_FORWARD;
3983		- /* clone data */
3984		- key.objectid = btrfs_ino(BTRFS_I(src));
3985		- key.type = BTRFS_EXTENT_DATA_KEY;
3986		- key.offset = off;
3987		-
3988		- while (1) {
3989		- u64 next_key_min_offset = key.offset + 1;
3990		-
3991		- /*
3992		- * note the key will change type as we walk through the
3993		- * tree.
3994		- */
3995		- path->leave_spinning = 1;
3996		- ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
3997		- 0, 0);
3998		- if (ret < 0)
3999		- goto out;
4000		- /*
4001		- * First search, if no extent item that starts at offset off was
4002		- * found but the previous item is an extent item, it's possible
4003		- * it might overlap our target range, therefore process it.
4004		- */
4005		- if (key.offset == off && ret > 0 && path->slots[0] > 0) {
4006		- btrfs_item_key_to_cpu(path->nodes[0], &key,
4007		- path->slots[0] - 1);
4008		- if (key.type == BTRFS_EXTENT_DATA_KEY)
4009		- path->slots[0]--;
4010		- }
4011		-
4012		- nritems = btrfs_header_nritems(path->nodes[0]);
4013		-process_slot:
4014		- if (path->slots[0] >= nritems) {
4015		- ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
4016		- if (ret < 0)
4017		- goto out;
4018		- if (ret > 0)
4019		- break;
4020		- nritems = btrfs_header_nritems(path->nodes[0]);
4021		- }
4022		- leaf = path->nodes[0];
4023		- slot = path->slots[0];
4024		-
4025		- btrfs_item_key_to_cpu(leaf, &key, slot);
4026		- if (key.type > BTRFS_EXTENT_DATA_KEY \|\|
4027		- key.objectid != btrfs_ino(BTRFS_I(src)))
4028		- break;
4029		-
4030		- if (key.type == BTRFS_EXTENT_DATA_KEY) {
4031		- struct btrfs_file_extent_item *extent;
4032		- int type;
4033		- u32 size;
4034		- struct btrfs_key new_key;
4035		- u64 disko = 0, diskl = 0;
4036		- u64 datao = 0, datal = 0;
4037		- u8 comp;
4038		- u64 drop_start;
4039		-
4040		- extent = btrfs_item_ptr(leaf, slot,
4041		- struct btrfs_file_extent_item);
4042		- comp = btrfs_file_extent_compression(leaf, extent);
4043		- type = btrfs_file_extent_type(leaf, extent);
4044		- if (type == BTRFS_FILE_EXTENT_REG \|\|
4045		- type == BTRFS_FILE_EXTENT_PREALLOC) {
4046		- disko = btrfs_file_extent_disk_bytenr(leaf,
4047		- extent);
4048		- diskl = btrfs_file_extent_disk_num_bytes(leaf,
4049		- extent);
4050		- datao = btrfs_file_extent_offset(leaf, extent);
4051		- datal = btrfs_file_extent_num_bytes(leaf,
4052		- extent);
4053		- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
4054		- /* take upper bound, may be compressed */
4055		- datal = btrfs_file_extent_ram_bytes(leaf,
4056		- extent);
4057		- }
4058		-
4059		- /*
4060		- * The first search might have left us at an extent
4061		- * item that ends before our target range's start, can
4062		- * happen if we have holes and NO_HOLES feature enabled.
4063		- */
4064		- if (key.offset + datal <= off) {
4065		- path->slots[0]++;
4066		- goto process_slot;
4067		- } else if (key.offset >= off + len) {
4068		- break;
4069		- }
4070		- next_key_min_offset = key.offset + datal;
4071		- size = btrfs_item_size_nr(leaf, slot);
4072		- read_extent_buffer(leaf, buf,
4073		- btrfs_item_ptr_offset(leaf, slot),
4074		- size);
4075		-
4076		- btrfs_release_path(path);
4077		- path->leave_spinning = 0;
4078		-
4079		- memcpy(&new_key, &key, sizeof(new_key));
4080		- new_key.objectid = btrfs_ino(BTRFS_I(inode));
4081		- if (off <= key.offset)
4082		- new_key.offset = key.offset + destoff - off;
4083		- else
4084		- new_key.offset = destoff;
4085		-
4086		- /*
4087		- * Deal with a hole that doesn't have an extent item
4088		- * that represents it (NO_HOLES feature enabled).
4089		- * This hole is either in the middle of the cloning
4090		- * range or at the beginning (fully overlaps it or
4091		- * partially overlaps it).
4092		- */
4093		- if (new_key.offset != last_dest_end)
4094		- drop_start = last_dest_end;
4095		- else
4096		- drop_start = new_key.offset;
4097		-
4098		- /*
4099		- * 1 - adjusting old extent (we may have to split it)
4100		- * 1 - add new extent
4101		- * 1 - inode update
4102		- */
4103		- trans = btrfs_start_transaction(root, 3);
4104		- if (IS_ERR(trans)) {
4105		- ret = PTR_ERR(trans);
4106		- goto out;
4107		- }
4108		-
4109		- if (type == BTRFS_FILE_EXTENT_REG \|\|
4110		- type == BTRFS_FILE_EXTENT_PREALLOC) {
4111		- /*
4112		- * a \| --- range to clone ---\| b
4113		- * \| ------------- extent ------------- \|
4114		- */
4115		-
4116		- /* subtract range b */
4117		- if (key.offset + datal > off + len)
4118		- datal = off + len - key.offset;
4119		-
4120		- /* subtract range a */
4121		- if (off > key.offset) {
4122		- datao += off - key.offset;
4123		- datal -= off - key.offset;
4124		- }
4125		-
4126		- ret = btrfs_drop_extents(trans, root, inode,
4127		- drop_start,
4128		- new_key.offset + datal,
4129		- 1);
4130		- if (ret) {
4131		- if (ret != -EOPNOTSUPP)
4132		- btrfs_abort_transaction(trans,
4133		- ret);
4134		- btrfs_end_transaction(trans);
4135		- goto out;
4136		- }
4137		-
4138		- ret = btrfs_insert_empty_item(trans, root, path,
4139		- &new_key, size);
4140		- if (ret) {
4141		- btrfs_abort_transaction(trans, ret);
4142		- btrfs_end_transaction(trans);
4143		- goto out;
4144		- }
4145		-
4146		- leaf = path->nodes[0];
4147		- slot = path->slots[0];
4148		- write_extent_buffer(leaf, buf,
4149		- btrfs_item_ptr_offset(leaf, slot),
4150		- size);
4151		-
4152		- extent = btrfs_item_ptr(leaf, slot,
4153		- struct btrfs_file_extent_item);
4154		-
4155		- /* disko == 0 means it's a hole */
4156		- if (!disko)
4157		- datao = 0;
4158		-
4159		- btrfs_set_file_extent_offset(leaf, extent,
4160		- datao);
4161		- btrfs_set_file_extent_num_bytes(leaf, extent,
4162		- datal);
4163		-
4164		- if (disko) {
4165		- inode_add_bytes(inode, datal);
4166		- ret = btrfs_inc_extent_ref(trans,
4167		- root,
4168		- disko, diskl, 0,
4169		- root->root_key.objectid,
4170		- btrfs_ino(BTRFS_I(inode)),
4171		- new_key.offset - datao);
4172		- if (ret) {
4173		- btrfs_abort_transaction(trans,
4174		- ret);
4175		- btrfs_end_transaction(trans);
4176		- goto out;
4177		-
4178		- }
4179		- }
4180		- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
4181		- u64 skip = 0;
4182		- u64 trim = 0;
4183		-
4184		- if (off > key.offset) {
4185		- skip = off - key.offset;
4186		- new_key.offset += skip;
4187		- }
4188		-
4189		- if (key.offset + datal > off + len)
4190		- trim = key.offset + datal - (off + len);
4191		-
4192		- if (comp && (skip \|\| trim)) {
4193		- ret = -EINVAL;
4194		- btrfs_end_transaction(trans);
4195		- goto out;
4196		- }
4197		- size -= skip + trim;
4198		- datal -= skip + trim;
4199		-
4200		- ret = clone_copy_inline_extent(inode,
4201		- trans, path,
4202		- &new_key,
4203		- drop_start,
4204		- datal,
4205		- skip, size, buf);
4206		- if (ret) {
4207		- if (ret != -EOPNOTSUPP)
4208		- btrfs_abort_transaction(trans,
4209		- ret);
4210		- btrfs_end_transaction(trans);
4211		- goto out;
4212		- }
4213		- leaf = path->nodes[0];
4214		- slot = path->slots[0];
4215		- }
4216		-
4217		- /* If we have an implicit hole (NO_HOLES feature). */
4218		- if (drop_start < new_key.offset)
4219		- clone_update_extent_map(BTRFS_I(inode), trans,
4220		- NULL, drop_start,
4221		- new_key.offset - drop_start);
4222		-
4223		- clone_update_extent_map(BTRFS_I(inode), trans,
4224		- path, 0, 0);
4225		-
4226		- btrfs_mark_buffer_dirty(leaf);
4227		- btrfs_release_path(path);
4228		-
4229		- last_dest_end = ALIGN(new_key.offset + datal,
4230		- fs_info->sectorsize);
4231		- ret = clone_finish_inode_update(trans, inode,
4232		- last_dest_end,
4233		- destoff, olen,
4234		- no_time_update);
4235		- if (ret)
4236		- goto out;
4237		- if (new_key.offset + datal >= destoff + len)
4238		- break;
4239		- }
4240		- btrfs_release_path(path);
4241		- key.offset = next_key_min_offset;
4242		-
4243		- if (fatal_signal_pending(current)) {
4244		- ret = -EINTR;
4245		- goto out;
4246		- }
4247		-
4248		- cond_resched();
4249		- }
4250		- ret = 0;
4251		-
4252		- if (last_dest_end < destoff + len) {
4253		- /*
4254		- * We have an implicit hole (NO_HOLES feature is enabled) that
4255		- * fully or partially overlaps our cloning range at its end.
4256		- */
4257		- btrfs_release_path(path);
4258		-
4259		- /*
4260		- * 1 - remove extent(s)
4261		- * 1 - inode update
4262		- */
4263		- trans = btrfs_start_transaction(root, 2);
4264		- if (IS_ERR(trans)) {
4265		- ret = PTR_ERR(trans);
4266		- goto out;
4267		- }
4268		- ret = btrfs_drop_extents(trans, root, inode,
4269		- last_dest_end, destoff + len, 1);
4270		- if (ret) {
4271		- if (ret != -EOPNOTSUPP)
4272		- btrfs_abort_transaction(trans, ret);
4273		- btrfs_end_transaction(trans);
4274		- goto out;
4275		- }
4276		- clone_update_extent_map(BTRFS_I(inode), trans, NULL,
4277		- last_dest_end,
4278		- destoff + len - last_dest_end);
4279		- ret = clone_finish_inode_update(trans, inode, destoff + len,
4280		- destoff, olen, no_time_update);
4281		- }
4282		-
4283		-out:
4284		- btrfs_free_path(path);
4285		- kvfree(buf);
4286		- return ret;
4287		-}
4288		-
4289		-static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
4290		- u64 off, u64 olen, u64 destoff)
4291		-{
4292		- struct inode *inode = file_inode(file);
4293		- struct inode *src = file_inode(file_src);
4294		- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4295		- struct btrfs_root *root = BTRFS_I(inode)->root;
4296		- int ret;
4297		- u64 len = olen;
4298		- u64 bs = fs_info->sb->s_blocksize;
4299		- int same_inode = src == inode;
4300		-
4301		- /*
4302		- * TODO:
4303		- * - split compressed inline extents. annoying: we need to
4304		- * decompress into destination's address_space (the file offset
4305		- * may change, so source mapping won't do), then recompress (or
4306		- * otherwise reinsert) a subrange.
4307		- *
4308		- * - split destination inode's inline extents. The inline extents can
4309		- * be either compressed or non-compressed.
4310		- */
4311		-
4312		- if (btrfs_root_readonly(root))
4313		- return -EROFS;
4314		-
4315		- if (file_src->f_path.mnt != file->f_path.mnt \|\|
4316		- src->i_sb != inode->i_sb)
4317		- return -EXDEV;
4318		-
4319		- if (S_ISDIR(src->i_mode) \|\| S_ISDIR(inode->i_mode))
4320		- return -EISDIR;
4321		-
4322		- if (!same_inode) {
4323		- btrfs_double_inode_lock(src, inode);
4324		- } else {
4325		- inode_lock(src);
4326		- }
4327		-
4328		- /* don't make the dst file partly checksummed */
4329		- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
4330		- (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
4331		- ret = -EINVAL;
4332		- goto out_unlock;
4333		- }
4334		-
4335		- /* determine range to clone */
4336		- ret = -EINVAL;
4337		- if (off + len > src->i_size \|\| off + len < off)
4338		- goto out_unlock;
4339		- if (len == 0)
4340		- olen = len = src->i_size - off;
4341		- /*
4342		- * If we extend to eof, continue to block boundary if and only if the
4343		- * destination end offset matches the destination file's size, otherwise
4344		- * we would be corrupting data by placing the eof block into the middle
4345		- * of a file.
4346		- */
4347		- if (off + len == src->i_size) {
4348		- if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size)
4349		- goto out_unlock;
4350		- len = ALIGN(src->i_size, bs) - off;
4351		- }
4352		-
4353		- if (len == 0) {
4354		- ret = 0;
4355		- goto out_unlock;
4356		- }
4357		-
4358		- /* verify the end result is block aligned */
4359		- if (!IS_ALIGNED(off, bs) \|\| !IS_ALIGNED(off + len, bs) \|\|
4360		- !IS_ALIGNED(destoff, bs))
4361		- goto out_unlock;
4362		-
4363		- /* verify if ranges are overlapped within the same file */
4364		- if (same_inode) {
4365		- if (destoff + len > off && destoff < off + len)
4366		- goto out_unlock;
4367		- }
4368		-
4369		- if (destoff > inode->i_size) {
4370		- ret = btrfs_cont_expand(inode, inode->i_size, destoff);
4371		- if (ret)
4372		- goto out_unlock;
4373		- }
4374		-
4375		- /*
4376		- * Lock the target range too. Right after we replace the file extent
4377		- * items in the fs tree (which now point to the cloned data), we might
4378		- * have a worker replace them with extent items relative to a write
4379		- * operation that was issued before this clone operation (i.e. confront
4380		- * with inode.c:btrfs_finish_ordered_io).
4381		- */
4382		- if (same_inode) {
4383		- u64 lock_start = min_t(u64, off, destoff);
4384		- u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
4385		-
4386		- ret = lock_extent_range(src, lock_start, lock_len, true);
4387		- } else {
4388		- ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
4389		- true);
4390		- }
4391		- ASSERT(ret == 0);
4392		- if (WARN_ON(ret)) {
4393		- /* ranges in the io trees already unlocked */
4394		- goto out_unlock;
4395		- }
4396		-
4397		- ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
4398		-
4399		- if (same_inode) {
4400		- u64 lock_start = min_t(u64, off, destoff);
4401		- u64 lock_end = max_t(u64, off, destoff) + len - 1;
4402		-
4403		- unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
4404		- } else {
4405		- btrfs_double_extent_unlock(src, off, inode, destoff, len);
4406		- }
4407		- /*
4408		- * Truncate page cache pages so that future reads will see the cloned
4409		- * data immediately and not the previous data.
4410		- */
4411		- truncate_inode_pages_range(&inode->i_data,
4412		- round_down(destoff, PAGE_SIZE),
4413		- round_up(destoff + len, PAGE_SIZE) - 1);
4414		-out_unlock:
4415		- if (!same_inode)
4416		- btrfs_double_inode_unlock(src, inode);
4417		- else
4418		- inode_unlock(src);
4419		- return ret;
4420		-}
4421		-
4422		-int btrfs_clone_file_range(struct file *src_file, loff_t off,
4423		- struct file *dst_file, loff_t destoff, u64 len)
4424		-{
4425		- return btrfs_clone_files(dst_file, src_file, off, len, destoff);
4426		-}
4427		-
4428	3424	static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
4429	3425	{
4430	3426	struct inode *inode = file_inode(file);
..	..	@@ -4433,8 +3429,7 @@
4433	3429	struct btrfs_root *new_root;
4434	3430	struct btrfs_dir_item *di;
4435	3431	struct btrfs_trans_handle *trans;
4436		- struct btrfs_path *path;
4437		- struct btrfs_key location;
	3432	+ struct btrfs_path *path = NULL;
4438	3433	struct btrfs_disk_key disk_key;
4439	3434	u64 objectid = 0;
4440	3435	u64 dir_id;
..	..	@@ -4455,53 +3450,51 @@
4455	3450	if (!objectid)
4456	3451	objectid = BTRFS_FS_TREE_OBJECTID;
4457	3452
4458		- location.objectid = objectid;
4459		- location.type = BTRFS_ROOT_ITEM_KEY;
4460		- location.offset = (u64)-1;
4461		-
4462		- new_root = btrfs_read_fs_root_no_name(fs_info, &location);
	3453	+ new_root = btrfs_get_fs_root(fs_info, objectid, true);
4463	3454	if (IS_ERR(new_root)) {
4464	3455	ret = PTR_ERR(new_root);
4465	3456	goto out;
4466	3457	}
4467		- if (!is_fstree(new_root->objectid)) {
	3458	+ if (!is_fstree(new_root->root_key.objectid)) {
4468	3459	ret = -ENOENT;
4469		- goto out;
	3460	+ goto out_free;
4470	3461	}
4471	3462
4472	3463	path = btrfs_alloc_path();
4473	3464	if (!path) {
4474	3465	ret = -ENOMEM;
4475		- goto out;
	3466	+ goto out_free;
4476	3467	}
4477	3468	path->leave_spinning = 1;
4478	3469
4479	3470	trans = btrfs_start_transaction(root, 1);
4480	3471	if (IS_ERR(trans)) {
4481		- btrfs_free_path(path);
4482	3472	ret = PTR_ERR(trans);
4483		- goto out;
	3473	+ goto out_free;
4484	3474	}
4485	3475
4486	3476	dir_id = btrfs_super_root_dir(fs_info->super_copy);
4487	3477	di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
4488	3478	dir_id, "default", 7, 1);
4489	3479	if (IS_ERR_OR_NULL(di)) {
4490		- btrfs_free_path(path);
	3480	+ btrfs_release_path(path);
4491	3481	btrfs_end_transaction(trans);
4492	3482	btrfs_err(fs_info,
4493	3483	"Umm, you don't have the default diritem, this isn't going to work");
4494	3484	ret = -ENOENT;
4495		- goto out;
	3485	+ goto out_free;
4496	3486	}
4497	3487
4498	3488	btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
4499	3489	btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
4500	3490	btrfs_mark_buffer_dirty(path->nodes[0]);
4501		- btrfs_free_path(path);
	3491	+ btrfs_release_path(path);
4502	3492
4503	3493	btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
4504	3494	btrfs_end_transaction(trans);
	3495	+out_free:
	3496	+ btrfs_put_root(new_root);
	3497	+ btrfs_free_path(path);
4505	3498	out:
4506	3499	mnt_drop_write_file(file);
4507	3500	return ret;
..	..	@@ -4510,16 +3503,15 @@
4510	3503	static void get_block_group_info(struct list_head *groups_list,
4511	3504	struct btrfs_ioctl_space_info *space)
4512	3505	{
4513		- struct btrfs_block_group_cache *block_group;
	3506	+ struct btrfs_block_group *block_group;
4514	3507
4515	3508	space->total_bytes = 0;
4516	3509	space->used_bytes = 0;
4517	3510	space->flags = 0;
4518	3511	list_for_each_entry(block_group, groups_list, list) {
4519	3512	space->flags = block_group->flags;
4520		- space->total_bytes += block_group->key.offset;
4521		- space->used_bytes +=
4522		- btrfs_block_group_used(&block_group->item);
	3513	+ space->total_bytes += block_group->length;
	3514	+ space->used_bytes += block_group->used;
4523	3515	}
4524	3516	}
4525	3517
..	..	@@ -4553,15 +3545,12 @@
4553	3545	struct btrfs_space_info *tmp;
4554	3546
4555	3547	info = NULL;
4556		- rcu_read_lock();
4557		- list_for_each_entry_rcu(tmp, &fs_info->space_info,
4558		- list) {
	3548	+ list_for_each_entry(tmp, &fs_info->space_info, list) {
4559	3549	if (tmp->flags == types[i]) {
4560	3550	info = tmp;
4561	3551	break;
4562	3552	}
4563	3553	}
4564		- rcu_read_unlock();
4565	3554
4566	3555	if (!info)
4567	3556	continue;
..	..	@@ -4609,15 +3598,12 @@
4609	3598	break;
4610	3599
4611	3600	info = NULL;
4612		- rcu_read_lock();
4613		- list_for_each_entry_rcu(tmp, &fs_info->space_info,
4614		- list) {
	3601	+ list_for_each_entry(tmp, &fs_info->space_info, list) {
4615	3602	if (tmp->flags == types[i]) {
4616	3603	info = tmp;
4617	3604	break;
4618	3605	}
4619	3606	}
4620		- rcu_read_unlock();
4621	3607
4622	3608	if (!info)
4623	3609	continue;
..	..	@@ -4722,6 +3708,11 @@
4722	3708	if (IS_ERR(sa))
4723	3709	return PTR_ERR(sa);
4724	3710
	3711	+ if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) {
	3712	+ ret = -EOPNOTSUPP;
	3713	+ goto out;
	3714	+ }
	3715	+
4725	3716	if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
4726	3717	ret = mnt_want_write_file(file);
4727	3718	if (ret)
..	..	@@ -4732,6 +3723,18 @@
4732	3723	&sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
4733	3724	0);
4734	3725
	3726	+ /*
	3727	+ * Copy scrub args to user space even if btrfs_scrub_dev() returned an
	3728	+ * error. This is important as it allows user space to know how much
	3729	+ * progress scrub has done. For example, if scrub is canceled we get
	3730	+ * -ECANCELED from btrfs_scrub_dev() and return that error back to user
	3731	+ * space. Later user space can inspect the progress from the structure
	3732	+ * btrfs_ioctl_scrub_args and resume scrub from where it left off
	3733	+ * previously (btrfs-progs does this).
	3734	+ * If we fail to copy the btrfs_ioctl_scrub_args structure to user space
	3735	+ * then return -EFAULT to signal the structure was not copied or it may
	3736	+ * be corrupt and unreliable due to a partial copy.
	3737	+ */
4735	3738	if (copy_to_user(arg, sa, sizeof(*sa)))
4736	3739	ret = -EFAULT;
4737	3740
..	..	@@ -4765,7 +3768,7 @@
4765	3768
4766	3769	ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
4767	3770
4768		- if (copy_to_user(arg, sa, sizeof(*sa)))
	3771	+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
4769	3772	ret = -EFAULT;
4770	3773
4771	3774	kfree(sa);
..	..	@@ -4789,7 +3792,7 @@
4789	3792
4790	3793	ret = btrfs_get_dev_stats(fs_info, sa);
4791	3794
4792		- if (copy_to_user(arg, sa, sizeof(*sa)))
	3795	+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
4793	3796	ret = -EFAULT;
4794	3797
4795	3798	kfree(sa);
..	..	@@ -4815,11 +3818,11 @@
4815	3818	ret = -EROFS;
4816	3819	goto out;
4817	3820	}
4818		- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
	3821	+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
4819	3822	ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
4820	3823	} else {
4821	3824	ret = btrfs_dev_replace_by_ioctl(fs_info, p);
4822		- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
	3825	+ btrfs_exclop_finish(fs_info);
4823	3826	}
4824	3827	break;
4825	3828	case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
..	..	@@ -4835,7 +3838,7 @@
4835	3838	break;
4836	3839	}
4837	3840
4838		- if (copy_to_user(arg, p, sizeof(*p)))
	3841	+ if ((ret == 0 \|\| ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
4839	3842	ret = -EFAULT;
4840	3843	out:
4841	3844	kfree(p);
..	..	@@ -4886,6 +3889,8 @@
4886	3889	ipath->fspath->val[i] = rel_ptr;
4887	3890	}
4888	3891
	3892	+ btrfs_free_path(path);
	3893	+ path = NULL;
4889	3894	ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
4890	3895	ipath->fspath, size);
4891	3896	if (ret) {
..	..	@@ -4899,26 +3904,6 @@
4899	3904	kfree(ipa);
4900	3905
4901	3906	return ret;
4902		-}
4903		-
4904		-static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
4905		-{
4906		- struct btrfs_data_container *inodes = ctx;
4907		- const size_t c = 3 * sizeof(u64);
4908		-
4909		- if (inodes->bytes_left >= c) {
4910		- inodes->bytes_left -= c;
4911		- inodes->val[inodes->elem_cnt] = inum;
4912		- inodes->val[inodes->elem_cnt + 1] = offset;
4913		- inodes->val[inodes->elem_cnt + 2] = root;
4914		- inodes->elem_cnt += 3;
4915		- } else {
4916		- inodes->bytes_missing += c - inodes->bytes_left;
4917		- inodes->bytes_left = 0;
4918		- inodes->elem_missed += 3;
4919		- }
4920		-
4921		- return 0;
4922	3907	}
4923	3908
4924	3909	static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
..	..	@@ -4956,21 +3941,20 @@
4956	3941	size = min_t(u32, loi->size, SZ_16M);
4957	3942	}
4958	3943
	3944	+ inodes = init_data_container(size);
	3945	+ if (IS_ERR(inodes)) {
	3946	+ ret = PTR_ERR(inodes);
	3947	+ goto out_loi;
	3948	+ }
	3949	+
4959	3950	path = btrfs_alloc_path();
4960	3951	if (!path) {
4961	3952	ret = -ENOMEM;
4962	3953	goto out;
4963	3954	}
4964		-
4965		- inodes = init_data_container(size);
4966		- if (IS_ERR(inodes)) {
4967		- ret = PTR_ERR(inodes);
4968		- inodes = NULL;
4969		- goto out;
4970		- }
4971		-
4972	3955	ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
4973		- build_ino_list, inodes, ignore_offset);
	3956	+ inodes, ignore_offset);
	3957	+ btrfs_free_path(path);
4974	3958	if (ret == -EINVAL)
4975	3959	ret = -ENOENT;
4976	3960	if (ret < 0)
..	..	@@ -4982,7 +3966,6 @@
4982	3966	ret = -EFAULT;
4983	3967
4984	3968	out:
4985		- btrfs_free_path(path);
4986	3969	kvfree(inodes);
4987	3970	out_loi:
4988	3971	kfree(loi);
..	..	@@ -5030,7 +4013,7 @@
5030	4013	return ret;
5031	4014
5032	4015	again:
5033		- if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
	4016	+ if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
5034	4017	mutex_lock(&fs_info->balance_mutex);
5035	4018	need_unlock = true;
5036	4019	goto locked;
..	..	@@ -5076,7 +4059,6 @@
5076	4059	}
5077	4060
5078	4061	locked:
5079		- BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
5080	4062
5081	4063	if (arg) {
5082	4064	bargs = memdup_user(arg, sizeof(*bargs));
..	..	@@ -5131,17 +4113,17 @@
5131	4113
5132	4114	do_balance:
5133	4115	/*
5134		- * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to
5135		- * btrfs_balance. bctl is freed in reset_balance_state, or, if
5136		- * restriper was paused all the way until unmount, in free_fs_info.
5137		- * The flag should be cleared after reset_balance_state.
	4116	+ * Ownership of bctl and exclusive operation goes to btrfs_balance.
	4117	+ * bctl is freed in reset_balance_state, or, if restriper was paused
	4118	+ * all the way until unmount, in free_fs_info. The flag should be
	4119	+ * cleared after reset_balance_state.
5138	4120	*/
5139	4121	need_unlock = false;
5140	4122
5141	4123	ret = btrfs_balance(fs_info, bctl, bargs);
5142	4124	bctl = NULL;
5143	4125
5144		- if (arg) {
	4126	+ if ((ret == 0 || ret == -ECANCELED) && arg) {
5145	4127	if (copy_to_user(arg, bargs, sizeof(*bargs)))
5146	4128	ret = -EFAULT;
5147	4129	}
..	..	@@ -5153,7 +4135,7 @@
5153	4135	out_unlock:
5154	4136	mutex_unlock(&fs_info->balance_mutex);
5155	4137	if (need_unlock)
5156		- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
	4138	+ btrfs_exclop_finish(fs_info);
5157	4139	out:
5158	4140	mnt_drop_write_file(file);
5159	4141	return ret;
..	..	@@ -5283,7 +4265,9 @@
5283	4265	}
5284	4266
5285	4267	/* update qgroup status and info */
	4268	+ mutex_lock(&fs_info->qgroup_ioctl_lock);
5286	4269	err = btrfs_run_qgroups(trans);
	4270	+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
5287	4271	if (err < 0)
5288	4272	btrfs_handle_fs_error(fs_info, err,
5289	4273	"failed to update qgroup status and info");
..	..	@@ -5430,10 +4414,9 @@
5430	4414	return ret;
5431	4415	}
5432	4416
5433		-static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
	4417	+static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
	4418	+ void __user *arg)
5434	4419	{
5435		- struct inode *inode = file_inode(file);
5436		- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5437	4420	struct btrfs_ioctl_quota_rescan_args *qsa;
5438	4421	int ret = 0;
5439	4422
..	..	@@ -5456,11 +4439,9 @@
5456	4439	return ret;
5457	4440	}
5458	4441
5459		-static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
	4442	+static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
	4443	+ void __user *arg)
5460	4444	{
5461		- struct inode *inode = file_inode(file);
5462		- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5463		-
5464	4445	if (!capable(CAP_SYS_ADMIN))
5465	4446	return -EPERM;
5466	4447
..	..	@@ -5632,10 +4613,9 @@
5632	4613	return ret;
5633	4614	}
5634	4615
5635		-static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
	4616	+static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
	4617	+ void __user *arg)
5636	4618	{
5637		- struct inode *inode = file_inode(file);
5638		- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5639	4619	size_t len;
5640	4620	int ret;
5641	4621	char label[BTRFS_LABEL_SIZE];
..	..	@@ -5719,10 +4699,9 @@
5719	4699	return 0;
5720	4700	}
5721	4701
5722		-static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
	4702	+static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
	4703	+ void __user *arg)
5723	4704	{
5724		- struct inode *inode = file_inode(file);
5725		- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5726	4705	struct btrfs_super_block *super_block = fs_info->super_copy;
5727	4706	struct btrfs_ioctl_feature_flags features;
5728	4707
..	..	@@ -5741,7 +4720,7 @@
5741	4720	u64 change_mask, u64 flags, u64 supported_flags,
5742	4721	u64 safe_set, u64 safe_clear)
5743	4722	{
5744		- const char *type = btrfs_feature_set_names[set];
	4723	+ const char *type = btrfs_feature_set_name(set);
5745	4724	char *names;
5746	4725	u64 disallowed, unsupported;
5747	4726	u64 set_mask = flags & change_mask;
..	..	@@ -5922,8 +4901,12 @@
5922	4901	return btrfs_ioctl_setflags(file, argp);
5923	4902	case FS_IOC_GETVERSION:
5924	4903	return btrfs_ioctl_getversion(file, argp);
	4904	+ case FS_IOC_GETFSLABEL:
	4905	+ return btrfs_ioctl_get_fslabel(fs_info, argp);
	4906	+ case FS_IOC_SETFSLABEL:
	4907	+ return btrfs_ioctl_set_fslabel(file, argp);
5925	4908	case FITRIM:
5926		- return btrfs_ioctl_fitrim(file, argp);
	4909	+ return btrfs_ioctl_fitrim(fs_info, argp);
5927	4910	case BTRFS_IOC_SNAP_CREATE:
5928	4911	return btrfs_ioctl_snap_create(file, argp, 0);
5929	4912	case BTRFS_IOC_SNAP_CREATE_V2:
..	..	@@ -5933,7 +4916,9 @@
5933	4916	case BTRFS_IOC_SUBVOL_CREATE_V2:
5934	4917	return btrfs_ioctl_snap_create_v2(file, argp, 1);
5935	4918	case BTRFS_IOC_SNAP_DESTROY:
5936		- return btrfs_ioctl_snap_destroy(file, argp);
	4919	+ return btrfs_ioctl_snap_destroy(file, argp, false);
	4920	+ case BTRFS_IOC_SNAP_DESTROY_V2:
	4921	+ return btrfs_ioctl_snap_destroy(file, argp, true);
5937	4922	case BTRFS_IOC_SUBVOL_GETFLAGS:
5938	4923	return btrfs_ioctl_subvol_getflags(file, argp);
5939	4924	case BTRFS_IOC_SUBVOL_SETFLAGS:
..	..	@@ -5975,7 +4960,7 @@
5975	4960	case BTRFS_IOC_SYNC: {
5976	4961	int ret;
5977	4962
5978		- ret = btrfs_start_delalloc_roots(fs_info, -1);
	4963	+ ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false);
5979	4964	if (ret)
5980	4965	return ret;
5981	4966	ret = btrfs_sync_fs(inode->i_sb, 1);
..	..	@@ -6028,19 +5013,15 @@
6028	5013	case BTRFS_IOC_QUOTA_RESCAN:
6029	5014	return btrfs_ioctl_quota_rescan(file, argp);
6030	5015	case BTRFS_IOC_QUOTA_RESCAN_STATUS:
6031		- return btrfs_ioctl_quota_rescan_status(file, argp);
	5016	+ return btrfs_ioctl_quota_rescan_status(fs_info, argp);
6032	5017	case BTRFS_IOC_QUOTA_RESCAN_WAIT:
6033		- return btrfs_ioctl_quota_rescan_wait(file, argp);
	5018	+ return btrfs_ioctl_quota_rescan_wait(fs_info, argp);
6034	5019	case BTRFS_IOC_DEV_REPLACE:
6035	5020	return btrfs_ioctl_dev_replace(fs_info, argp);
6036		- case BTRFS_IOC_GET_FSLABEL:
6037		- return btrfs_ioctl_get_fslabel(file, argp);
6038		- case BTRFS_IOC_SET_FSLABEL:
6039		- return btrfs_ioctl_set_fslabel(file, argp);
6040	5021	case BTRFS_IOC_GET_SUPPORTED_FEATURES:
6041	5022	return btrfs_ioctl_get_supported_features(argp);
6042	5023	case BTRFS_IOC_GET_FEATURES:
6043		- return btrfs_ioctl_get_features(file, argp);
	5024	+ return btrfs_ioctl_get_features(fs_info, argp);
6044	5025	case BTRFS_IOC_SET_FEATURES:
6045	5026	return btrfs_ioctl_set_features(file, argp);
6046	5027	case FS_IOC_FSGETXATTR: