~hc/RK356X_SDK_RELEASE.git

..	..	@@ -1,8 +1,8 @@
	1	+// SPDX-License-Identifier: GPL-2.0-only
1	2	/*
2	3	* linux/fs/namespace.c
3	4	*
4	5	* (C) Copyright Al Viro 2000, 2001
5		- * Released under GPL v2.
6	6	*
7	7	* Based on code from fs/super.c, copyright Linus Torvalds and others.
8	8	* Heavily rewritten.
..	..	@@ -14,19 +14,22 @@
14	14	#include <linux/mnt_namespace.h>
15	15	#include <linux/user_namespace.h>
16	16	#include <linux/namei.h>
17		-#include <linux/delay.h>
18	17	#include <linux/security.h>
19	18	#include <linux/cred.h>
20	19	#include <linux/idr.h>
21	20	#include <linux/init.h> /* init_rootfs */
22	21	#include <linux/fs_struct.h> /* get_fs_root et.al. */
23	22	#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
	23	+#include <linux/file.h>
24	24	#include <linux/uaccess.h>
25	25	#include <linux/proc_ns.h>
26	26	#include <linux/magic.h>
27		-#include <linux/bootmem.h>
	27	+#include <linux/memblock.h>
28	28	#include <linux/task_work.h>
29	29	#include <linux/sched/task.h>
	30	+#include <uapi/linux/mount.h>
	31	+#include <linux/fs_context.h>
	32	+#include <linux/shmem_fs.h>
30	33
31	34	#include "pnode.h"
32	35	#include "internal.h"
..	..	@@ -67,6 +70,8 @@
67	70	static struct hlist_head *mountpoint_hashtable __read_mostly;
68	71	static struct kmem_cache *mnt_cache __read_mostly;
69	72	static DECLARE_RWSEM(namespace_sem);
	73	+static HLIST_HEAD(unmounted); /* protected by namespace_sem */
	74	+static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
70	75
71	76	/* /sys/fs */
72	77	struct kobject *fs_kobj;
..	..	@@ -151,10 +156,10 @@
151	156	/*
152	157	* vfsmount lock must be held for write
153	158	*/
154		-unsigned int mnt_get_count(struct mount *mnt)
	159	+int mnt_get_count(struct mount *mnt)
155	160	{
156	161	#ifdef CONFIG_SMP
157		- unsigned int count = 0;
	162	+ int count = 0;
158	163	int cpu;
159	164
160	165	for_each_possible_cpu(cpu) {
..	..	@@ -165,14 +170,6 @@
165	170	#else
166	171	return mnt->mnt_count;
167	172	#endif
168		-}
169		-
170		-static void drop_mountpoint(struct fs_pin *p)
171		-{
172		- struct mount *m = container_of(p, struct mount, mnt_umount);
173		- dput(m->mnt_ex_mountpoint);
174		- pin_remove(p);
175		- mntput(&m->mnt);
176	173	}
177	174
178	175	static struct mount *alloc_vfsmnt(const char *name)
..	..	@@ -201,7 +198,6 @@
201	198	mnt->mnt_count = 1;
202	199	mnt->mnt_writers = 0;
203	200	#endif
204		- mnt->mnt.data = NULL;
205	201
206	202	INIT_HLIST_NODE(&mnt->mnt_hash);
207	203	INIT_LIST_HEAD(&mnt->mnt_child);
..	..	@@ -213,7 +209,7 @@
213	209	INIT_LIST_HEAD(&mnt->mnt_slave);
214	210	INIT_HLIST_NODE(&mnt->mnt_mp_list);
215	211	INIT_LIST_HEAD(&mnt->mnt_umounting);
216		- init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
	212	+ INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
217	213	}
218	214	return mnt;
219	215
..	..	@@ -247,13 +243,9 @@
247	243	* mnt_want/drop_write() will _keep_ the filesystem
248	244	* r/w.
249	245	*/
250		-int __mnt_is_readonly(struct vfsmount *mnt)
	246	+bool __mnt_is_readonly(struct vfsmount *mnt)
251	247	{
252		- if (mnt->mnt_flags & MNT_READONLY)
253		- return 1;
254		- if (sb_rdonly(mnt->mnt_sb))
255		- return 1;
256		- return 0;
	248	+ return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
257	249	}
258	250	EXPORT_SYMBOL_GPL(__mnt_is_readonly);
259	251
..	..	@@ -329,11 +321,8 @@
329	321	* incremented count after it has set MNT_WRITE_HOLD.
330	322	*/
331	323	smp_mb();
332		- while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
333		- preempt_enable();
334		- cpu_chill();
335		- preempt_disable();
336		- }
	324	+ while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
	325	+ cpu_relax();
337	326	/*
338	327	* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
339	328	* be set to match its requirements. So we must not load that until
..	..	@@ -426,7 +415,7 @@
426	415	sb_end_write(file_inode(file)->i_sb);
427	416	return ret;
428	417	}
429		-EXPORT_SYMBOL_GPL(mnt_want_write_file);
	418	+EXPORT_SYMBOL_NS_GPL(mnt_want_write_file, ANDROID_GKI_VFS_EXPORT_ONLY);
430	419
431	420	/**
432	421	* __mnt_drop_write - give up write access to a mount
..	..	@@ -468,7 +457,7 @@
468	457	__mnt_drop_write_file(file);
469	458	sb_end_write(file_inode(file)->i_sb);
470	459	}
471		-EXPORT_SYMBOL(mnt_drop_write_file);
	460	+EXPORT_SYMBOL_NS(mnt_drop_write_file, ANDROID_GKI_VFS_EXPORT_ONLY);
472	461
473	462	static int mnt_make_readonly(struct mount *mnt)
474	463	{
..	..	@@ -512,11 +501,12 @@
512	501	return ret;
513	502	}
514	503
515		-static void __mnt_unmake_readonly(struct mount *mnt)
	504	+static int __mnt_unmake_readonly(struct mount *mnt)
516	505	{
517	506	lock_mount_hash();
518	507	mnt->mnt.mnt_flags &= ~MNT_READONLY;
519	508	unlock_mount_hash();
	509	+ return 0;
520	510	}
521	511
522	512	int sb_prepare_remount_readonly(struct super_block *sb)
..	..	@@ -557,7 +547,6 @@
557	547
558	548	static void free_vfsmnt(struct mount *mnt)
559	549	{
560		- kfree(mnt->mnt.data);
561	550	kfree_const(mnt->mnt_devname);
562	551	#ifdef CONFIG_SMP
563	552	free_percpu(mnt->mnt_pcp);
..	..	@@ -659,6 +648,21 @@
659	648	return m;
660	649	}
661	650
	651	+static inline void lock_ns_list(struct mnt_namespace *ns)
	652	+{
	653	+ spin_lock(&ns->ns_lock);
	654	+}
	655	+
	656	+static inline void unlock_ns_list(struct mnt_namespace *ns)
	657	+{
	658	+ spin_unlock(&ns->ns_lock);
	659	+}
	660	+
	661	+static inline bool mnt_is_cursor(struct mount *mnt)
	662	+{
	663	+ return mnt->mnt.mnt_flags & MNT_CURSOR;
	664	+}
	665	+
662	666	/*
663	667	* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
664	668	* current mount namespace.
..	..	@@ -680,17 +684,18 @@
680	684	struct mount *mnt;
681	685	bool is_covered = false;
682	686
683		- if (!d_mountpoint(dentry))
684		- goto out;
685		-
686	687	down_read(&namespace_sem);
	688	+ lock_ns_list(ns);
687	689	list_for_each_entry(mnt, &ns->list, mnt_list) {
	690	+ if (mnt_is_cursor(mnt))
	691	+ continue;
688	692	is_covered = (mnt->mnt_mountpoint == dentry);
689	693	if (is_covered)
690	694	break;
691	695	}
	696	+ unlock_ns_list(ns);
692	697	up_read(&namespace_sem);
693		-out:
	698	+
694	699	return is_covered;
695	700	}
696	701
..	..	@@ -745,7 +750,7 @@
745	750
746	751	/* Add the new mountpoint to the hash table */
747	752	read_seqlock_excl(&mount_lock);
748		- new->m_dentry = dentry;
	753	+ new->m_dentry = dget(dentry);
749	754	new->m_count = 1;
750	755	hlist_add_head(&new->m_hash, mp_hash(dentry));
751	756	INIT_HLIST_HEAD(&new->m_list);
..	..	@@ -758,7 +763,11 @@
758	763	return mp;
759	764	}
760	765
761		-static void put_mountpoint(struct mountpoint *mp)
	766	+/*
	767	+ * vfsmount lock must be held. Additionally, the caller is responsible
	768	+ * for serializing calls for given disposal list.
	769	+ */
	770	+static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
762	771	{
763	772	if (!--mp->m_count) {
764	773	struct dentry *dentry = mp->m_dentry;
..	..	@@ -766,9 +775,16 @@
766	775	spin_lock(&dentry->d_lock);
767	776	dentry->d_flags &= ~DCACHE_MOUNTED;
768	777	spin_unlock(&dentry->d_lock);
	778	+ dput_to_list(dentry, list);
769	779	hlist_del(&mp->m_hash);
770	780	kfree(mp);
771	781	}
	782	+}
	783	+
	784	+/* called with namespace_lock and vfsmount lock */
	785	+static void put_mountpoint(struct mountpoint *mp)
	786	+{
	787	+ __put_mountpoint(mp, &ex_mountpoints);
772	788	}
773	789
774	790	static inline int check_mnt(struct mount *mnt)
..	..	@@ -801,25 +817,17 @@
801	817	/*
802	818	* vfsmount lock must be held for write
803	819	*/
804		-static void unhash_mnt(struct mount *mnt)
	820	+static struct mountpoint *unhash_mnt(struct mount *mnt)
805	821	{
	822	+ struct mountpoint *mp;
806	823	mnt->mnt_parent = mnt;
807	824	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
808	825	list_del_init(&mnt->mnt_child);
809	826	hlist_del_init_rcu(&mnt->mnt_hash);
810	827	hlist_del_init(&mnt->mnt_mp_list);
811		- put_mountpoint(mnt->mnt_mp);
	828	+ mp = mnt->mnt_mp;
812	829	mnt->mnt_mp = NULL;
813		-}
814		-
815		-/*
816		- * vfsmount lock must be held for write
817		- */
818		-static void detach_mnt(struct mount *mnt, struct path *old_path)
819		-{
820		- old_path->dentry = mnt->mnt_mountpoint;
821		- old_path->mnt = &mnt->mnt_parent->mnt;
822		- unhash_mnt(mnt);
	830	+ return mp;
823	831	}
824	832
825	833	/*
..	..	@@ -827,9 +835,7 @@
827	835	*/
828	836	static void umount_mnt(struct mount *mnt)
829	837	{
830		- /* old mountpoint will be dropped when we can do that */
831		- mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
832		- unhash_mnt(mnt);
	838	+ put_mountpoint(unhash_mnt(mnt));
833	839	}
834	840
835	841	/*
..	..	@@ -841,7 +847,7 @@
841	847	{
842	848	mp->m_count++;
843	849	mnt_add_count(mnt, 1); /* essentially, that's mntget */
844		- child_mnt->mnt_mountpoint = dget(mp->m_dentry);
	850	+ child_mnt->mnt_mountpoint = mp->m_dentry;
845	851	child_mnt->mnt_parent = mnt;
846	852	child_mnt->mnt_mp = mp;
847	853	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
..	..	@@ -868,7 +874,6 @@
868	874	void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
869	875	{
870	876	struct mountpoint *old_mp = mnt->mnt_mp;
871		- struct dentry *old_mountpoint = mnt->mnt_mountpoint;
872	877	struct mount *old_parent = mnt->mnt_parent;
873	878
874	879	list_del_init(&mnt->mnt_child);
..	..	@@ -878,22 +883,6 @@
878	883	attach_mnt(mnt, parent, mp);
879	884
880	885	put_mountpoint(old_mp);
881		-
882		- /*
883		- * Safely avoid even the suggestion this code might sleep or
884		- * lock the mount hash by taking advantage of the knowledge that
885		- * mnt_change_mountpoint will not release the final reference
886		- * to a mountpoint.
887		- *
888		- * During mounting, the mount passed in as the parent mount will
889		- * continue to use the old mountpoint and during unmounting, the
890		- * old mountpoint will continue to exist until namespace_unlock,
891		- * which happens well after mnt_change_mountpoint.
892		- */
893		- spin_lock(&old_mountpoint->d_lock);
894		- old_mountpoint->d_lockref.count--;
895		- spin_unlock(&old_mountpoint->d_lock);
896		-
897	886	mnt_add_count(old_parent, -1);
898	887	}
899	888
..	..	@@ -948,45 +937,80 @@
948	937	return p;
949	938	}
950	939
951		-struct vfsmount *
952		-vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
	940	+/**
	941	+ * vfs_create_mount - Create a mount for a configured superblock
	942	+ * @fc: The configuration context with the superblock attached
	943	+ *
	944	+ * Create a mount to an already configured superblock. If necessary, the
	945	+ * caller should invoke vfs_get_tree() before calling this.
	946	+ *
	947	+ * Note that this does not attach the mount to anything.
	948	+ */
	949	+struct vfsmount *vfs_create_mount(struct fs_context *fc)
953	950	{
954	951	struct mount *mnt;
955		- struct dentry *root;
956	952
957		- if (!type)
958		- return ERR_PTR(-ENODEV);
	953	+ if (!fc->root)
	954	+ return ERR_PTR(-EINVAL);
959	955
960		- mnt = alloc_vfsmnt(name);
	956	+ mnt = alloc_vfsmnt(fc->source ?: "none");
961	957	if (!mnt)
962	958	return ERR_PTR(-ENOMEM);
963	959
964		- if (type->alloc_mnt_data) {
965		- mnt->mnt.data = type->alloc_mnt_data();
966		- if (!mnt->mnt.data) {
967		- mnt_free_id(mnt);
968		- free_vfsmnt(mnt);
969		- return ERR_PTR(-ENOMEM);
970		- }
971		- }
972		- if (flags & SB_KERNMOUNT)
	960	+ if (fc->sb_flags & SB_KERNMOUNT)
973	961	mnt->mnt.mnt_flags = MNT_INTERNAL;
974	962
975		- root = mount_fs(type, flags, name, &mnt->mnt, data);
976		- if (IS_ERR(root)) {
977		- mnt_free_id(mnt);
978		- free_vfsmnt(mnt);
979		- return ERR_CAST(root);
980		- }
	963	+ atomic_inc(&fc->root->d_sb->s_active);
	964	+ mnt->mnt.mnt_sb = fc->root->d_sb;
	965	+ mnt->mnt.mnt_root = dget(fc->root);
	966	+ mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	967	+ mnt->mnt_parent = mnt;
981	968
982		- mnt->mnt.mnt_root = root;
983		- mnt->mnt.mnt_sb = root->d_sb;
984		- mnt->mnt_mountpoint = mnt->mnt.mnt_root;
985		- mnt->mnt_parent = mnt;
986	969	lock_mount_hash();
987		- list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
	970	+ list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
988	971	unlock_mount_hash();
989	972	return &mnt->mnt;
	973	+}
	974	+EXPORT_SYMBOL(vfs_create_mount);
	975	+
	976	+struct vfsmount *fc_mount(struct fs_context *fc)
	977	+{
	978	+ int err = vfs_get_tree(fc);
	979	+ if (!err) {
	980	+ up_write(&fc->root->d_sb->s_umount);
	981	+ return vfs_create_mount(fc);
	982	+ }
	983	+ return ERR_PTR(err);
	984	+}
	985	+EXPORT_SYMBOL(fc_mount);
	986	+
	987	+struct vfsmount *vfs_kern_mount(struct file_system_type *type,
	988	+ int flags, const char *name,
	989	+ void *data)
	990	+{
	991	+ struct fs_context *fc;
	992	+ struct vfsmount *mnt;
	993	+ int ret = 0;
	994	+
	995	+ if (!type)
	996	+ return ERR_PTR(-EINVAL);
	997	+
	998	+ fc = fs_context_for_mount(type, flags);
	999	+ if (IS_ERR(fc))
	1000	+ return ERR_CAST(fc);
	1001	+
	1002	+ if (name)
	1003	+ ret = vfs_parse_fs_string(fc, "source",
	1004	+ name, strlen(name));
	1005	+ if (!ret)
	1006	+ ret = parse_monolithic_mount_data(fc, data);
	1007	+ if (!ret)
	1008	+ mnt = fc_mount(fc);
	1009	+ else
	1010	+ mnt = ERR_PTR(ret);
	1011	+
	1012	+ put_fs_context(fc);
	1013	+ return mnt;
990	1014	}
991	1015	EXPORT_SYMBOL_GPL(vfs_kern_mount);
992	1016
..	..	@@ -1016,14 +1040,6 @@
1016	1040	if (!mnt)
1017	1041	return ERR_PTR(-ENOMEM);
1018	1042
1019		- if (sb->s_op->clone_mnt_data) {
1020		- mnt->mnt.data = sb->s_op->clone_mnt_data(old->mnt.data);
1021		- if (!mnt->mnt.data) {
1022		- err = -ENOMEM;
1023		- goto out_free;
1024		- }
1025		- }
1026		-
1027	1043	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
1028	1044	mnt->mnt_group_id = 0; /* not a peer of original */
1029	1045	else
..	..	@@ -1037,27 +1053,6 @@
1037	1053
1038	1054	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
1039	1055	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
1040		- /* Don't allow unprivileged users to change mount flags */
1041		- if (flag & CL_UNPRIVILEGED) {
1042		- mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
1043		-
1044		- if (mnt->mnt.mnt_flags & MNT_READONLY)
1045		- mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
1046		-
1047		- if (mnt->mnt.mnt_flags & MNT_NODEV)
1048		- mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
1049		-
1050		- if (mnt->mnt.mnt_flags & MNT_NOSUID)
1051		- mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
1052		-
1053		- if (mnt->mnt.mnt_flags & MNT_NOEXEC)
1054		- mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
1055		- }
1056		-
1057		- /* Don't allow unprivileged users to reveal what is under a mount */
1058		- if ((flag & CL_UNPRIVILEGED) &&
1059		- (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire)))
1060		- mnt->mnt.mnt_flags |= MNT_LOCKED;
1061	1056
1062	1057	atomic_inc(&sb->s_active);
1063	1058	mnt->mnt.mnt_sb = sb;
..	..	@@ -1102,19 +1097,22 @@
1102	1097
1103	1098	static void cleanup_mnt(struct mount *mnt)
1104	1099	{
	1100	+ struct hlist_node *p;
	1101	+ struct mount *m;
1105	1102	/*
1106		- * This probably indicates that somebody messed
1107		- * up a mnt_want/drop_write() pair. If this
1108		- * happens, the filesystem was probably unable
1109		- * to make r/w->r/o transitions.
1110		- */
1111		- /*
	1103	+ * The warning here probably indicates that somebody messed
	1104	+ * up a mnt_want/drop_write() pair. If this happens, the
	1105	+ * filesystem was probably unable to make r/w->r/o transitions.
1112	1106	* The locking used to deal with mnt_count decrement provides barriers,
1113	1107	* so mnt_get_writers() below is safe.
1114	1108	*/
1115	1109	WARN_ON(mnt_get_writers(mnt));
1116	1110	if (unlikely(mnt->mnt_pins.first))
1117	1111	mnt_pin_kill(mnt);
	1112	+ hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
	1113	+ hlist_del(&m->mnt_umount);
	1114	+ mntput(&m->mnt);
	1115	+ }
1118	1116	fsnotify_vfsmount_delete(&mnt->mnt);
1119	1117	dput(mnt->mnt.mnt_root);
1120	1118	deactivate_super(mnt->mnt.mnt_sb);
..	..	@@ -1140,6 +1138,9 @@
1140	1138
1141	1139	static void mntput_no_expire(struct mount *mnt)
1142	1140	{
	1141	+ LIST_HEAD(list);
	1142	+ int count;
	1143	+
1143	1144	rcu_read_lock();
1144	1145	if (likely(READ_ONCE(mnt->mnt_ns))) {
1145	1146	/*
..	..	@@ -1162,7 +1163,9 @@
1162	1163	*/
1163	1164	smp_mb();
1164	1165	mnt_add_count(mnt, -1);
1165		- if (mnt_get_count(mnt)) {
	1166	+ count = mnt_get_count(mnt);
	1167	+ if (count != 0) {
	1168	+ WARN_ON(count < 0);
1166	1169	rcu_read_unlock();
1167	1170	unlock_mount_hash();
1168	1171	return;
..	..	@@ -1180,16 +1183,18 @@
1180	1183	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
1181	1184	struct mount *p, *tmp;
1182	1185	list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
1183		- umount_mnt(p);
	1186	+ __put_mountpoint(unhash_mnt(p), &list);
	1187	+ hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
1184	1188	}
1185	1189	}
1186	1190	unlock_mount_hash();
	1191	+ shrink_dentry_list(&list);
1187	1192
1188	1193	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
1189	1194	struct task_struct *task = current;
1190	1195	if (likely(!(task->flags & PF_KTHREAD))) {
1191	1196	init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
1192		- if (!task_work_add(task, &mnt->mnt_rcu, true))
	1197	+ if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
1193	1198	return;
1194	1199	}
1195	1200	if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
..	..	@@ -1259,46 +1264,71 @@
1259	1264	}
1260	1265
1261	1266	#ifdef CONFIG_PROC_FS
	1267	+static struct mount *mnt_list_next(struct mnt_namespace *ns,
	1268	+ struct list_head *p)
	1269	+{
	1270	+ struct mount *mnt, *ret = NULL;
	1271	+
	1272	+ lock_ns_list(ns);
	1273	+ list_for_each_continue(p, &ns->list) {
	1274	+ mnt = list_entry(p, typeof(*mnt), mnt_list);
	1275	+ if (!mnt_is_cursor(mnt)) {
	1276	+ ret = mnt;
	1277	+ break;
	1278	+ }
	1279	+ }
	1280	+ unlock_ns_list(ns);
	1281	+
	1282	+ return ret;
	1283	+}
	1284	+
1262	1285	/* iterator; we want it to have access to namespace_sem, thus here... */
1263	1286	static void *m_start(struct seq_file *m, loff_t *pos)
1264	1287	{
1265	1288	struct proc_mounts *p = m->private;
	1289	+ struct list_head *prev;
1266	1290
1267	1291	down_read(&namespace_sem);
1268		- if (p->cached_event == p->ns->event) {
1269		- void *v = p->cached_mount;
1270		- if (*pos == p->cached_index)
1271		- return v;
1272		- if (*pos == p->cached_index + 1) {
1273		- v = seq_list_next(v, &p->ns->list, &p->cached_index);
1274		- return p->cached_mount = v;
1275		- }
	1292	+ if (!*pos) {
	1293	+ prev = &p->ns->list;
	1294	+ } else {
	1295	+ prev = &p->cursor.mnt_list;
	1296	+
	1297	+ /* Read after we'd reached the end? */
	1298	+ if (list_empty(prev))
	1299	+ return NULL;
1276	1300	}
1277	1301
1278		- p->cached_event = p->ns->event;
1279		- p->cached_mount = seq_list_start(&p->ns->list, *pos);
1280		- p->cached_index = *pos;
1281		- return p->cached_mount;
	1302	+ return mnt_list_next(p->ns, prev);
1282	1303	}
1283	1304
1284	1305	static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1285	1306	{
1286	1307	struct proc_mounts *p = m->private;
	1308	+ struct mount *mnt = v;
1287	1309
1288		- p->cached_mount = seq_list_next(v, &p->ns->list, pos);
1289		- p->cached_index = *pos;
1290		- return p->cached_mount;
	1310	+ ++*pos;
	1311	+ return mnt_list_next(p->ns, &mnt->mnt_list);
1291	1312	}
1292	1313
1293	1314	static void m_stop(struct seq_file *m, void *v)
1294	1315	{
	1316	+ struct proc_mounts *p = m->private;
	1317	+ struct mount *mnt = v;
	1318	+
	1319	+ lock_ns_list(p->ns);
	1320	+ if (mnt)
	1321	+ list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
	1322	+ else
	1323	+ list_del_init(&p->cursor.mnt_list);
	1324	+ unlock_ns_list(p->ns);
1295	1325	up_read(&namespace_sem);
1296	1326	}
1297	1327
1298	1328	static int m_show(struct seq_file *m, void *v)
1299	1329	{
1300	1330	struct proc_mounts *p = m->private;
1301		- struct mount *r = list_entry(v, struct mount, mnt_list);
	1331	+ struct mount *r = v;
1302	1332	return p->show(m, &r->mnt);
1303	1333	}
1304	1334
..	..	@@ -1308,6 +1338,15 @@
1308	1338	.stop = m_stop,
1309	1339	.show = m_show,
1310	1340	};
	1341	+
	1342	+void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
	1343	+{
	1344	+ down_read(&namespace_sem);
	1345	+ lock_ns_list(ns);
	1346	+ list_del(&cursor->mnt_list);
	1347	+ unlock_ns_list(ns);
	1348	+ up_read(&namespace_sem);
	1349	+}
1311	1350	#endif /* CONFIG_PROC_FS */
1312	1351
1313	1352	/**
..	..	@@ -1369,22 +1408,29 @@
1369	1408
1370	1409	EXPORT_SYMBOL(may_umount);
1371	1410
1372		-static HLIST_HEAD(unmounted); /* protected by namespace_sem */
1373		-
1374	1411	static void namespace_unlock(void)
1375	1412	{
1376	1413	struct hlist_head head;
	1414	+ struct hlist_node *p;
	1415	+ struct mount *m;
	1416	+ LIST_HEAD(list);
1377	1417
1378	1418	hlist_move_list(&unmounted, &head);
	1419	+ list_splice_init(&ex_mountpoints, &list);
1379	1420
1380	1421	up_write(&namespace_sem);
	1422	+
	1423	+ shrink_dentry_list(&list);
1381	1424
1382	1425	if (likely(hlist_empty(&head)))
1383	1426	return;
1384	1427
1385		- synchronize_rcu();
	1428	+ synchronize_rcu_expedited();
1386	1429
1387		- group_pin_kill(&head);
	1430	+ hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
	1431	+ hlist_del(&m->mnt_umount);
	1432	+ mntput(&m->mnt);
	1433	+ }
1388	1434	}
1389	1435
1390	1436	static inline void namespace_lock(void)
..	..	@@ -1470,9 +1516,6 @@
1470	1516	p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1471	1517
1472	1518	disconnect = disconnect_mount(p, how);
1473		-
1474		- pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
1475		- disconnect ? &unmounted : NULL);
1476	1519	if (mnt_has_parent(p)) {
1477	1520	mnt_add_count(p->mnt_parent, -1);
1478	1521	if (!disconnect) {
..	..	@@ -1483,10 +1526,35 @@
1483	1526	}
1484	1527	}
1485	1528	change_mnt_propagation(p, MS_PRIVATE);
	1529	+ if (disconnect)
	1530	+ hlist_add_head(&p->mnt_umount, &unmounted);
1486	1531	}
1487	1532	}
1488	1533
1489	1534	static void shrink_submounts(struct mount *mnt);
	1535	+
	1536	+static int do_umount_root(struct super_block *sb)
	1537	+{
	1538	+ int ret = 0;
	1539	+
	1540	+ down_write(&sb->s_umount);
	1541	+ if (!sb_rdonly(sb)) {
	1542	+ struct fs_context *fc;
	1543	+
	1544	+ fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
	1545	+ SB_RDONLY);
	1546	+ if (IS_ERR(fc)) {
	1547	+ ret = PTR_ERR(fc);
	1548	+ } else {
	1549	+ ret = parse_monolithic_mount_data(fc, NULL);
	1550	+ if (!ret)
	1551	+ ret = reconfigure_super(fc);
	1552	+ put_fs_context(fc);
	1553	+ }
	1554	+ }
	1555	+ up_write(&sb->s_umount);
	1556	+ return ret;
	1557	+}
1490	1558
1491	1559	static int do_umount(struct mount *mnt, int flags)
1492	1560	{
..	..	@@ -1553,11 +1621,7 @@
1553	1621	*/
1554	1622	if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
1555	1623	return -EPERM;
1556		- down_write(&sb->s_umount);
1557		- if (!sb_rdonly(sb))
1558		- retval = do_remount_sb(sb, SB_RDONLY, NULL, 0);
1559		- up_write(&sb->s_umount);
1560		- return retval;
	1624	+ return do_umount_root(sb);
1561	1625	}
1562	1626
1563	1627	namespace_lock();
..	..	@@ -1606,15 +1670,15 @@
1606	1670	namespace_lock();
1607	1671	lock_mount_hash();
1608	1672	mp = lookup_mountpoint(dentry);
1609		- if (IS_ERR_OR_NULL(mp))
	1673	+ if (!mp)
1610	1674	goto out_unlock;
1611	1675
1612	1676	event++;
1613	1677	while (!hlist_empty(&mp->m_list)) {
1614	1678	mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
1615	1679	if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
1616		- hlist_add_head(&mnt->mnt_umount.s_list, &unmounted);
1617	1680	umount_mnt(mnt);
	1681	+ hlist_add_head(&mnt->mnt_umount, &unmounted);
1618	1682	}
1619	1683	else umount_tree(mnt, UMOUNT_CONNECTED);
1620	1684	}
..	..	@@ -1649,52 +1713,55 @@
1649	1713	}
1650	1714	#endif
1651	1715
1652		-/*
1653		- * Now umount can handle mount points as well as block devices.
1654		- * This is important for filesystems which use unnamed block devices.
1655		- *
1656		- * We now support a flag for forced unmount like the other 'big iron'
1657		- * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
1658		- */
1659		-
1660		-int ksys_umount(char __user *name, int flags)
	1716	+static int can_umount(const struct path *path, int flags)
1661	1717	{
1662		- struct path path;
1663		- struct mount *mnt;
1664		- int retval;
1665		- int lookup_flags = 0;
1666		-
1667		- if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1668		- return -EINVAL;
	1718	+ struct mount *mnt = real_mount(path->mnt);
1669	1719
1670	1720	if (!may_mount())
1671	1721	return -EPERM;
	1722	+ if (path->dentry != path->mnt->mnt_root)
	1723	+ return -EINVAL;
	1724	+ if (!check_mnt(mnt))
	1725	+ return -EINVAL;
	1726	+ if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
	1727	+ return -EINVAL;
	1728	+ if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
	1729	+ return -EPERM;
	1730	+ return 0;
	1731	+}
	1732	+
	1733	+// caller is responsible for flags being sane
	1734	+int path_umount(struct path *path, int flags)
	1735	+{
	1736	+ struct mount *mnt = real_mount(path->mnt);
	1737	+ int ret;
	1738	+
	1739	+ ret = can_umount(path, flags);
	1740	+ if (!ret)
	1741	+ ret = do_umount(mnt, flags);
	1742	+
	1743	+ /* we mustn't call path_put() as that would clear mnt_expiry_mark */
	1744	+ dput(path->dentry);
	1745	+ mntput_no_expire(mnt);
	1746	+ return ret;
	1747	+}
	1748	+
	1749	+static int ksys_umount(char __user *name, int flags)
	1750	+{
	1751	+ int lookup_flags = LOOKUP_MOUNTPOINT;
	1752	+ struct path path;
	1753	+ int ret;
	1754	+
	1755	+ // basic validity checks done first
	1756	+ if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
	1757	+ return -EINVAL;
1672	1758
1673	1759	if (!(flags & UMOUNT_NOFOLLOW))
1674	1760	lookup_flags |= LOOKUP_FOLLOW;
1675		-
1676		- retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
1677		- if (retval)
1678		- goto out;
1679		- mnt = real_mount(path.mnt);
1680		- retval = -EINVAL;
1681		- if (path.dentry != path.mnt->mnt_root)
1682		- goto dput_and_out;
1683		- if (!check_mnt(mnt))
1684		- goto dput_and_out;
1685		- if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
1686		- goto dput_and_out;
1687		- retval = -EPERM;
1688		- if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
1689		- goto dput_and_out;
1690		-
1691		- retval = do_umount(mnt, flags);
1692		-dput_and_out:
1693		- /* we mustn't call path_put() as that would clear mnt_expiry_mark */
1694		- dput(path.dentry);
1695		- mntput_no_expire(mnt);
1696		-out:
1697		- return retval;
	1761	+ ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
	1762	+ if (ret)
	1763	+ return ret;
	1764	+ return path_umount(&path, flags);
1698	1765	}
1699	1766
1700	1767	SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
..	..	@@ -1721,9 +1788,14 @@
1721	1788	dentry->d_fsdata == &mntns_operations;
1722	1789	}
1723	1790
1724		-struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
	1791	+static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
1725	1792	{
1726	1793	return container_of(ns, struct mnt_namespace, ns);
	1794	+}
	1795	+
	1796	+struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
	1797	+{
	1798	+ return &mnt->ns;
1727	1799	}
1728	1800
1729	1801	static bool mnt_ns_loop(struct dentry *dentry)
..	..	@@ -1821,6 +1893,27 @@
1821	1893	return &tree->mnt;
1822	1894	}
1823	1895
	1896	+static void free_mnt_ns(struct mnt_namespace *);
	1897	+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
	1898	+
	1899	+void dissolve_on_fput(struct vfsmount *mnt)
	1900	+{
	1901	+ struct mnt_namespace *ns;
	1902	+ namespace_lock();
	1903	+ lock_mount_hash();
	1904	+ ns = real_mount(mnt)->mnt_ns;
	1905	+ if (ns) {
	1906	+ if (is_anon_ns(ns))
	1907	+ umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
	1908	+ else
	1909	+ ns = NULL;
	1910	+ }
	1911	+ unlock_mount_hash();
	1912	+ namespace_unlock();
	1913	+ if (ns)
	1914	+ free_mnt_ns(ns);
	1915	+}
	1916	+
1824	1917	void drop_collected_mounts(struct vfsmount *mnt)
1825	1918	{
1826	1919	namespace_lock();
..	..	@@ -1874,6 +1967,9 @@
1874	1967	if (IS_ERR(new_mnt))
1875	1968	return ERR_CAST(new_mnt);
1876	1969
	1970	+ /* Longterm mount to be removed by kern_unmount*() */
	1971	+ new_mnt->mnt_ns = MNT_NS_INTERNAL;
	1972	+
1877	1973	return &new_mnt->mnt;
1878	1974
1879	1975	invalid:
..	..	@@ -1895,6 +1991,33 @@
1895	1991	return res;
1896	1992	}
1897	1993	return 0;
	1994	+}
	1995	+
	1996	+static void lock_mnt_tree(struct mount *mnt)
	1997	+{
	1998	+ struct mount *p;
	1999	+
	2000	+ for (p = mnt; p; p = next_mnt(p, mnt)) {
	2001	+ int flags = p->mnt.mnt_flags;
	2002	+ /* Don't allow unprivileged users to change mount flags */
	2003	+ flags |= MNT_LOCK_ATIME;
	2004	+
	2005	+ if (flags & MNT_READONLY)
	2006	+ flags |= MNT_LOCK_READONLY;
	2007	+
	2008	+ if (flags & MNT_NODEV)
	2009	+ flags |= MNT_LOCK_NODEV;
	2010	+
	2011	+ if (flags & MNT_NOSUID)
	2012	+ flags |= MNT_LOCK_NOSUID;
	2013	+
	2014	+ if (flags & MNT_NOEXEC)
	2015	+ flags |= MNT_LOCK_NOEXEC;
	2016	+ /* Don't allow unprivileged users to reveal what is under a mount */
	2017	+ if (list_empty(&p->mnt_expire))
	2018	+ flags |= MNT_LOCKED;
	2019	+ p->mnt.mnt_flags = flags;
	2020	+ }
1898	2021	}
1899	2022
1900	2023	static void cleanup_group_ids(struct mount *mnt, struct mount *end)
..	..	@@ -2012,8 +2135,9 @@
2012	2135	static int attach_recursive_mnt(struct mount *source_mnt,
2013	2136	struct mount *dest_mnt,
2014	2137	struct mountpoint *dest_mp,
2015		- struct path *parent_path)
	2138	+ bool moving)
2016	2139	{
	2140	+ struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2017	2141	HLIST_HEAD(tree_list);
2018	2142	struct mnt_namespace *ns = dest_mnt->mnt_ns;
2019	2143	struct mountpoint *smp;
..	..	@@ -2029,7 +2153,7 @@
2029	2153	return PTR_ERR(smp);
2030	2154
2031	2155	/* Is there space to add these mounts to the mount namespace? */
2032		- if (!parent_path) {
	2156	+ if (!moving) {
2033	2157	err = count_mounts(ns, source_mnt);
2034	2158	if (err)
2035	2159	goto out;
..	..	@@ -2048,11 +2172,15 @@
2048	2172	} else {
2049	2173	lock_mount_hash();
2050	2174	}
2051		- if (parent_path) {
2052		- detach_mnt(source_mnt, parent_path);
	2175	+ if (moving) {
	2176	+ unhash_mnt(source_mnt);
2053	2177	attach_mnt(source_mnt, dest_mnt, dest_mp);
2054	2178	touch_mnt_namespace(source_mnt->mnt_ns);
2055	2179	} else {
	2180	+ if (source_mnt->mnt_ns) {
	2181	+ /* move from anon - the caller will destroy */
	2182	+ list_del_init(&source_mnt->mnt_ns->list);
	2183	+ }
2056	2184	mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
2057	2185	commit_tree(source_mnt);
2058	2186	}
..	..	@@ -2064,6 +2192,10 @@
2064	2192	child->mnt_mountpoint);
2065	2193	if (q)
2066	2194	mnt_change_mountpoint(child, smp, q);
	2195	+ /* Notice when we are propagating across user namespaces */
	2196	+ if (child->mnt_parent->mnt_ns->user_ns != user_ns)
	2197	+ lock_mnt_tree(child);
	2198	+ child->mnt.mnt_flags &= ~MNT_LOCKED;
2067	2199	commit_tree(child);
2068	2200	}
2069	2201	put_mountpoint(smp);
..	..	@@ -2139,7 +2271,7 @@
2139	2271	d_is_dir(mnt->mnt.mnt_root))
2140	2272	return -ENOTDIR;
2141	2273
2142		- return attach_recursive_mnt(mnt, p, mp, NULL);
	2274	+ return attach_recursive_mnt(mnt, p, mp, false);
2143	2275	}
2144	2276
2145	2277	/*
..	..	@@ -2194,6 +2326,30 @@
2194	2326	return err;
2195	2327	}
2196	2328
	2329	+static struct mount *__do_loopback(struct path *old_path, int recurse)
	2330	+{
	2331	+ struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
	2332	+
	2333	+ if (IS_MNT_UNBINDABLE(old))
	2334	+ return mnt;
	2335	+
	2336	+ if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
	2337	+ return mnt;
	2338	+
	2339	+ if (!recurse && has_locked_children(old, old_path->dentry))
	2340	+ return mnt;
	2341	+
	2342	+ if (recurse)
	2343	+ mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
	2344	+ else
	2345	+ mnt = clone_mnt(old, old_path->dentry, 0);
	2346	+
	2347	+ if (!IS_ERR(mnt))
	2348	+ mnt->mnt.mnt_flags &= ~MNT_LOCKED;
	2349	+
	2350	+ return mnt;
	2351	+}
	2352	+
2197	2353	/*
2198	2354	* do loopback mount.
2199	2355	*/
..	..	@@ -2201,7 +2357,7 @@
2201	2357	int recurse)
2202	2358	{
2203	2359	struct path old_path;
2204		- struct mount *mnt = NULL, *old, *parent;
	2360	+ struct mount *mnt = NULL, *parent;
2205	2361	struct mountpoint *mp;
2206	2362	int err;
2207	2363	if (!old_name \|\| !*old_name)
..	..	@@ -2215,37 +2371,20 @@
2215	2371	goto out;
2216	2372
2217	2373	mp = lock_mount(path);
2218		- err = PTR_ERR(mp);
2219		- if (IS_ERR(mp))
	2374	+ if (IS_ERR(mp)) {
	2375	+ err = PTR_ERR(mp);
2220	2376	goto out;
	2377	+ }
2221	2378
2222		- old = real_mount(old_path.mnt);
2223	2379	parent = real_mount(path->mnt);
2224		-
2225		- err = -EINVAL;
2226		- if (IS_MNT_UNBINDABLE(old))
2227		- goto out2;
2228		-
2229	2380	if (!check_mnt(parent))
2230	2381	goto out2;
2231	2382
2232		- if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
2233		- goto out2;
2234		-
2235		- if (!recurse && has_locked_children(old, old_path.dentry))
2236		- goto out2;
2237		-
2238		- if (recurse)
2239		- mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
2240		- else
2241		- mnt = clone_mnt(old, old_path.dentry, 0);
2242		-
	2383	+ mnt = __do_loopback(&old_path, recurse);
2243	2384	if (IS_ERR(mnt)) {
2244	2385	err = PTR_ERR(mnt);
2245	2386	goto out2;
2246	2387	}
2247		-
2248		- mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2249	2388
2250	2389	err = graft_tree(mnt, parent, mp);
2251	2390	if (err) {
..	..	@@ -2260,21 +2399,206 @@
2260	2399	return err;
2261	2400	}
2262	2401
2263		-static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
	2402	+static struct file *open_detached_copy(struct path *path, bool recursive)
2264	2403	{
2265		- int error = 0;
2266		- int readonly_request = 0;
	2404	+ struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
	2405	+ struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
	2406	+ struct mount *mnt, *p;
	2407	+ struct file *file;
2267	2408
2268		- if (ms_flags & MS_RDONLY)
2269		- readonly_request = 1;
2270		- if (readonly_request == __mnt_is_readonly(mnt))
	2409	+ if (IS_ERR(ns))
	2410	+ return ERR_CAST(ns);
	2411	+
	2412	+ namespace_lock();
	2413	+ mnt = __do_loopback(path, recursive);
	2414	+ if (IS_ERR(mnt)) {
	2415	+ namespace_unlock();
	2416	+ free_mnt_ns(ns);
	2417	+ return ERR_CAST(mnt);
	2418	+ }
	2419	+
	2420	+ lock_mount_hash();
	2421	+ for (p = mnt; p; p = next_mnt(p, mnt)) {
	2422	+ p->mnt_ns = ns;
	2423	+ ns->mounts++;
	2424	+ }
	2425	+ ns->root = mnt;
	2426	+ list_add_tail(&ns->list, &mnt->mnt_list);
	2427	+ mntget(&mnt->mnt);
	2428	+ unlock_mount_hash();
	2429	+ namespace_unlock();
	2430	+
	2431	+ mntput(path->mnt);
	2432	+ path->mnt = &mnt->mnt;
	2433	+ file = dentry_open(path, O_PATH, current_cred());
	2434	+ if (IS_ERR(file))
	2435	+ dissolve_on_fput(path->mnt);
	2436	+ else
	2437	+ file->f_mode \|= FMODE_NEED_UNMOUNT;
	2438	+ return file;
	2439	+}
	2440	+
	2441	+SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
	2442	+{
	2443	+ struct file *file;
	2444	+ struct path path;
	2445	+ int lookup_flags = LOOKUP_AUTOMOUNT \| LOOKUP_FOLLOW;
	2446	+ bool detached = flags & OPEN_TREE_CLONE;
	2447	+ int error;
	2448	+ int fd;
	2449	+
	2450	+ BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
	2451	+
	2452	+ if (flags & ~(AT_EMPTY_PATH \| AT_NO_AUTOMOUNT \| AT_RECURSIVE \|
	2453	+ AT_SYMLINK_NOFOLLOW \| OPEN_TREE_CLONE \|
	2454	+ OPEN_TREE_CLOEXEC))
	2455	+ return -EINVAL;
	2456	+
	2457	+ if ((flags & (AT_RECURSIVE \| OPEN_TREE_CLONE)) == AT_RECURSIVE)
	2458	+ return -EINVAL;
	2459	+
	2460	+ if (flags & AT_NO_AUTOMOUNT)
	2461	+ lookup_flags &= ~LOOKUP_AUTOMOUNT;
	2462	+ if (flags & AT_SYMLINK_NOFOLLOW)
	2463	+ lookup_flags &= ~LOOKUP_FOLLOW;
	2464	+ if (flags & AT_EMPTY_PATH)
	2465	+ lookup_flags \|= LOOKUP_EMPTY;
	2466	+
	2467	+ if (detached && !may_mount())
	2468	+ return -EPERM;
	2469	+
	2470	+ fd = get_unused_fd_flags(flags & O_CLOEXEC);
	2471	+ if (fd < 0)
	2472	+ return fd;
	2473	+
	2474	+ error = user_path_at(dfd, filename, lookup_flags, &path);
	2475	+ if (unlikely(error)) {
	2476	+ file = ERR_PTR(error);
	2477	+ } else {
	2478	+ if (detached)
	2479	+ file = open_detached_copy(&path, flags & AT_RECURSIVE);
	2480	+ else
	2481	+ file = dentry_open(&path, O_PATH, current_cred());
	2482	+ path_put(&path);
	2483	+ }
	2484	+ if (IS_ERR(file)) {
	2485	+ put_unused_fd(fd);
	2486	+ return PTR_ERR(file);
	2487	+ }
	2488	+ fd_install(fd, file);
	2489	+ return fd;
	2490	+}
	2491	+
	2492	+/*
	2493	+ * Don't allow locked mount flags to be cleared.
	2494	+ *
	2495	+ * No locks need to be held here while testing the various MNT_LOCK
	2496	+ * flags because those flags can never be cleared once they are set.
	2497	+ */
	2498	+static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
	2499	+{
	2500	+ unsigned int fl = mnt->mnt.mnt_flags;
	2501	+
	2502	+ if ((fl & MNT_LOCK_READONLY) &&
	2503	+ !(mnt_flags & MNT_READONLY))
	2504	+ return false;
	2505	+
	2506	+ if ((fl & MNT_LOCK_NODEV) &&
	2507	+ !(mnt_flags & MNT_NODEV))
	2508	+ return false;
	2509	+
	2510	+ if ((fl & MNT_LOCK_NOSUID) &&
	2511	+ !(mnt_flags & MNT_NOSUID))
	2512	+ return false;
	2513	+
	2514	+ if ((fl & MNT_LOCK_NOEXEC) &&
	2515	+ !(mnt_flags & MNT_NOEXEC))
	2516	+ return false;
	2517	+
	2518	+ if ((fl & MNT_LOCK_ATIME) &&
	2519	+ ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
	2520	+ return false;
	2521	+
	2522	+ return true;
	2523	+}
	2524	+
	2525	+static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
	2526	+{
	2527	+ bool readonly_request = (mnt_flags & MNT_READONLY);
	2528	+
	2529	+ if (readonly_request == __mnt_is_readonly(&mnt->mnt))
2271	2530	return 0;
2272	2531
2273	2532	if (readonly_request)
2274		- error = mnt_make_readonly(real_mount(mnt));
2275		- else
2276		- __mnt_unmake_readonly(real_mount(mnt));
2277		- return error;
	2533	+ return mnt_make_readonly(mnt);
	2534	+
	2535	+ return __mnt_unmake_readonly(mnt);
	2536	+}
	2537	+
	2538	+/*
	2539	+ * Update the user-settable attributes on a mount. The caller must hold
	2540	+ * sb->s_umount for writing.
	2541	+ */
	2542	+static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
	2543	+{
	2544	+ lock_mount_hash();
	2545	+ mnt_flags \|= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
	2546	+ mnt->mnt.mnt_flags = mnt_flags;
	2547	+ touch_mnt_namespace(mnt->mnt_ns);
	2548	+ unlock_mount_hash();
	2549	+}
	2550	+
	2551	+static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
	2552	+{
	2553	+ struct super_block *sb = mnt->mnt_sb;
	2554	+
	2555	+ if (!__mnt_is_readonly(mnt) &&
	2556	+ (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
	2557	+ char *buf = (char *)__get_free_page(GFP_KERNEL);
	2558	+ char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
	2559	+ struct tm tm;
	2560	+
	2561	+ time64_to_tm(sb->s_time_max, 0, &tm);
	2562	+
	2563	+ pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n",
	2564	+ sb->s_type->name,
	2565	+ is_mounted(mnt) ? "remounted" : "mounted",
	2566	+ mntpath,
	2567	+ tm.tm_year+1900, (unsigned long long)sb->s_time_max);
	2568	+
	2569	+ free_page((unsigned long)buf);
	2570	+ }
	2571	+}
	2572	+
	2573	+/*
	2574	+ * Handle reconfiguration of the mountpoint only without alteration of the
	2575	+ * superblock it refers to. This is triggered by specifying MS_REMOUNT\|MS_BIND
	2576	+ * to mount(2).
	2577	+ */
	2578	+static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
	2579	+{
	2580	+ struct super_block *sb = path->mnt->mnt_sb;
	2581	+ struct mount *mnt = real_mount(path->mnt);
	2582	+ int ret;
	2583	+
	2584	+ if (!check_mnt(mnt))
	2585	+ return -EINVAL;
	2586	+
	2587	+ if (path->dentry != mnt->mnt.mnt_root)
	2588	+ return -EINVAL;
	2589	+
	2590	+ if (!can_change_locked_flags(mnt, mnt_flags))
	2591	+ return -EPERM;
	2592	+
	2593	+ down_write(&sb->s_umount);
	2594	+ ret = change_mount_ro_state(mnt, mnt_flags);
	2595	+ if (ret == 0)
	2596	+ set_mount_attributes(mnt, mnt_flags);
	2597	+ up_write(&sb->s_umount);
	2598	+
	2599	+ mnt_warn_timestamp_expiry(path, &mnt->mnt);
	2600	+
	2601	+ return ret;
2278	2602	}
2279	2603
2280	2604	/*
..	..	@@ -2288,6 +2612,7 @@
2288	2612	int err;
2289	2613	struct super_block *sb = path->mnt->mnt_sb;
2290	2614	struct mount *mnt = real_mount(path->mnt);
	2615	+ struct fs_context *fc;
2291	2616
2292	2617	if (!check_mnt(mnt))
2293	2618	return -EINVAL;
..	..	@@ -2295,58 +2620,29 @@
2295	2620	if (path->dentry != path->mnt->mnt_root)
2296	2621	return -EINVAL;
2297	2622
2298		- /* Don't allow changing of locked mnt flags.
2299		- *
2300		- * No locks need to be held here while testing the various
2301		- * MNT_LOCK flags because those flags can never be cleared
2302		- * once they are set.
2303		- */
2304		- if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
2305		- !(mnt_flags & MNT_READONLY)) {
	2623	+ if (!can_change_locked_flags(mnt, mnt_flags))
2306	2624	return -EPERM;
2307		- }
2308		- if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
2309		- !(mnt_flags & MNT_NODEV)) {
2310		- return -EPERM;
2311		- }
2312		- if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
2313		- !(mnt_flags & MNT_NOSUID)) {
2314		- return -EPERM;
2315		- }
2316		- if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
2317		- !(mnt_flags & MNT_NOEXEC)) {
2318		- return -EPERM;
2319		- }
2320		- if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
2321		- ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
2322		- return -EPERM;
2323		- }
2324	2625
2325		- err = security_sb_remount(sb, data);
2326		- if (err)
2327		- return err;
	2626	+ fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
	2627	+ if (IS_ERR(fc))
	2628	+ return PTR_ERR(fc);
2328	2629
2329		- down_write(&sb->s_umount);
2330		- if (ms_flags & MS_BIND)
2331		- err = change_mount_flags(path->mnt, ms_flags);
2332		- else if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
2333		- err = -EPERM;
2334		- else {
2335		- err = do_remount_sb2(path->mnt, sb, sb_flags, data, 0);
2336		- namespace_lock();
2337		- lock_mount_hash();
2338		- propagate_remount(mnt);
2339		- unlock_mount_hash();
2340		- namespace_unlock();
2341		- }
	2630	+ fc->oldapi = true;
	2631	+ err = parse_monolithic_mount_data(fc, data);
2342	2632	if (!err) {
2343		- lock_mount_hash();
2344		- mnt_flags \|= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
2345		- mnt->mnt.mnt_flags = mnt_flags;
2346		- touch_mnt_namespace(mnt->mnt_ns);
2347		- unlock_mount_hash();
	2633	+ down_write(&sb->s_umount);
	2634	+ err = -EPERM;
	2635	+ if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
	2636	+ err = reconfigure_super(fc);
	2637	+ if (!err)
	2638	+ set_mount_attributes(mnt, mnt_flags);
	2639	+ }
	2640	+ up_write(&sb->s_umount);
2348	2641	}
2349		- up_write(&sb->s_umount);
	2642	+
	2643	+ mnt_warn_timestamp_expiry(path, &mnt->mnt);
	2644	+
	2645	+ put_fs_context(fc);
2350	2646	return err;
2351	2647	}
2352	2648
..	..	@@ -2360,144 +2656,200 @@
2360	2656	return 0;
2361	2657	}
2362	2658
2363		-static int do_move_mount(struct path *path, const char *old_name)
	2659	+/*
	2660	+ * Check that there aren't references to earlier/same mount namespaces in the
	2661	+ * specified subtree. Such references can act as pins for mount namespaces
	2662	+ * that aren't checked by the mount-cycle checking code, thereby allowing
	2663	+ * cycles to be made.
	2664	+ */
	2665	+static bool check_for_nsfs_mounts(struct mount *subtree)
2364	2666	{
2365		- struct path old_path, parent_path;
	2667	+ struct mount *p;
	2668	+ bool ret = false;
	2669	+
	2670	+ lock_mount_hash();
	2671	+ for (p = subtree; p; p = next_mnt(p, subtree))
	2672	+ if (mnt_ns_loop(p->mnt.mnt_root))
	2673	+ goto out;
	2674	+
	2675	+ ret = true;
	2676	+out:
	2677	+ unlock_mount_hash();
	2678	+ return ret;
	2679	+}
	2680	+
	2681	+static int do_move_mount(struct path *old_path, struct path *new_path)
	2682	+{
	2683	+ struct mnt_namespace *ns;
2366	2684	struct mount *p;
2367	2685	struct mount *old;
2368		- struct mountpoint *mp;
	2686	+ struct mount *parent;
	2687	+ struct mountpoint *mp, *old_mp;
2369	2688	int err;
2370		- if (!old_name \|\| !*old_name)
2371		- return -EINVAL;
2372		- err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
2373		- if (err)
2374		- return err;
	2689	+ bool attached;
2375	2690
2376		- mp = lock_mount(path);
2377		- err = PTR_ERR(mp);
	2691	+ mp = lock_mount(new_path);
2378	2692	if (IS_ERR(mp))
	2693	+ return PTR_ERR(mp);
	2694	+
	2695	+ old = real_mount(old_path->mnt);
	2696	+ p = real_mount(new_path->mnt);
	2697	+ parent = old->mnt_parent;
	2698	+ attached = mnt_has_parent(old);
	2699	+ old_mp = old->mnt_mp;
	2700	+ ns = old->mnt_ns;
	2701	+
	2702	+ err = -EINVAL;
	2703	+ /* The mountpoint must be in our namespace. */
	2704	+ if (!check_mnt(p))
2379	2705	goto out;
2380	2706
2381		- old = real_mount(old_path.mnt);
2382		- p = real_mount(path->mnt);
	2707	+ /* The thing moved must be mounted... */
	2708	+ if (!is_mounted(&old->mnt))
	2709	+ goto out;
2383	2710
2384		- err = -EINVAL;
2385		- if (!check_mnt(p) \|\| !check_mnt(old))
2386		- goto out1;
	2711	+ /* ... and either ours or the root of anon namespace */
	2712	+ if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
	2713	+ goto out;
2387	2714
2388	2715	if (old->mnt.mnt_flags & MNT_LOCKED)
2389		- goto out1;
	2716	+ goto out;
2390	2717
2391		- err = -EINVAL;
2392		- if (old_path.dentry != old_path.mnt->mnt_root)
2393		- goto out1;
	2718	+ if (old_path->dentry != old_path->mnt->mnt_root)
	2719	+ goto out;
2394	2720
2395		- if (!mnt_has_parent(old))
2396		- goto out1;
2397		-
2398		- if (d_is_dir(path->dentry) !=
2399		- d_is_dir(old_path.dentry))
2400		- goto out1;
	2721	+ if (d_is_dir(new_path->dentry) !=
	2722	+ d_is_dir(old_path->dentry))
	2723	+ goto out;
2401	2724	/*
2402	2725	* Don't move a mount residing in a shared parent.
2403	2726	*/
2404		- if (IS_MNT_SHARED(old->mnt_parent))
2405		- goto out1;
	2727	+ if (attached && IS_MNT_SHARED(parent))
	2728	+ goto out;
2406	2729	/*
2407	2730	* Don't move a mount tree containing unbindable mounts to a destination
2408	2731	* mount which is shared.
2409	2732	*/
2410	2733	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
2411		- goto out1;
	2734	+ goto out;
2412	2735	err = -ELOOP;
	2736	+ if (!check_for_nsfs_mounts(old))
	2737	+ goto out;
2413	2738	for (; mnt_has_parent(p); p = p->mnt_parent)
2414	2739	if (p == old)
2415		- goto out1;
	2740	+ goto out;
2416	2741
2417		- err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
	2742	+ err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
	2743	+ attached);
2418	2744	if (err)
2419		- goto out1;
	2745	+ goto out;
2420	2746
2421	2747	/* if the mount is moved, it should no longer be expire
2422	2748	* automatically */
2423	2749	list_del_init(&old->mnt_expire);
2424		-out1:
2425		- unlock_mount(mp);
	2750	+ if (attached)
	2751	+ put_mountpoint(old_mp);
2426	2752	out:
2427		- if (!err)
2428		- path_put(&parent_path);
2429		- path_put(&old_path);
	2753	+ unlock_mount(mp);
	2754	+ if (!err) {
	2755	+ if (attached)
	2756	+ mntput_no_expire(parent);
	2757	+ else
	2758	+ free_mnt_ns(ns);
	2759	+ }
2430	2760	return err;
2431	2761	}
2432	2762
2433		-static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
	2763	+static int do_move_mount_old(struct path *path, const char *old_name)
2434	2764	{
	2765	+ struct path old_path;
2435	2766	int err;
2436		- const char *subtype = strchr(fstype, '.');
2437		- if (subtype) {
2438		- subtype++;
2439		- err = -EINVAL;
2440		- if (!subtype[0])
2441		- goto err;
2442		- } else
2443		- subtype = "";
2444	2767
2445		- mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
2446		- err = -ENOMEM;
2447		- if (!mnt->mnt_sb->s_subtype)
2448		- goto err;
2449		- return mnt;
	2768	+ if (!old_name \|\| !*old_name)
	2769	+ return -EINVAL;
2450	2770
2451		- err:
2452		- mntput(mnt);
2453		- return ERR_PTR(err);
	2771	+ err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
	2772	+ if (err)
	2773	+ return err;
	2774	+
	2775	+ err = do_move_mount(&old_path, path);
	2776	+ path_put(&old_path);
	2777	+ return err;
2454	2778	}
2455	2779
2456	2780	/*
2457	2781	* add a mount into a namespace's mount tree
2458	2782	*/
2459		-static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
	2783	+static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
	2784	+ struct path *path, int mnt_flags)
2460	2785	{
2461		- struct mountpoint *mp;
2462		- struct mount *parent;
2463		- int err;
	2786	+ struct mount *parent = real_mount(path->mnt);
2464	2787
2465	2788	mnt_flags &= ~MNT_INTERNAL_FLAGS;
2466	2789
2467		- mp = lock_mount(path);
2468		- if (IS_ERR(mp))
2469		- return PTR_ERR(mp);
2470		-
2471		- parent = real_mount(path->mnt);
2472		- err = -EINVAL;
2473	2790	if (unlikely(!check_mnt(parent))) {
2474	2791	/* that's acceptable only for automounts done in private ns */
2475	2792	if (!(mnt_flags & MNT_SHRINKABLE))
2476		- goto unlock;
	2793	+ return -EINVAL;
2477	2794	/* ... and for those we'd better have mountpoint still alive */
2478	2795	if (!parent->mnt_ns)
2479		- goto unlock;
	2796	+ return -EINVAL;
2480	2797	}
2481	2798
2482	2799	/* Refuse the same filesystem on the same mount point */
2483		- err = -EBUSY;
2484	2800	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
2485	2801	path->mnt->mnt_root == path->dentry)
2486		- goto unlock;
	2802	+ return -EBUSY;
2487	2803
2488		- err = -EINVAL;
2489	2804	if (d_is_symlink(newmnt->mnt.mnt_root))
2490		- goto unlock;
	2805	+ return -EINVAL;
2491	2806
2492	2807	newmnt->mnt.mnt_flags = mnt_flags;
2493		- err = graft_tree(newmnt, parent, mp);
2494		-
2495		-unlock:
2496		- unlock_mount(mp);
2497		- return err;
	2808	+ return graft_tree(newmnt, parent, mp);
2498	2809	}
2499	2810
2500		-static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags);
	2811	+static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
	2812	+
	2813	+/*
	2814	+ * Create a new mount using a superblock configuration and request it
	2815	+ * be added to the namespace tree.
	2816	+ */
	2817	+static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
	2818	+ unsigned int mnt_flags)
	2819	+{
	2820	+ struct vfsmount *mnt;
	2821	+ struct mountpoint *mp;
	2822	+ struct super_block *sb = fc->root->d_sb;
	2823	+ int error;
	2824	+
	2825	+ error = security_sb_kern_mount(sb);
	2826	+ if (!error && mount_too_revealing(sb, &mnt_flags))
	2827	+ error = -EPERM;
	2828	+
	2829	+ if (unlikely(error)) {
	2830	+ fc_drop_locked(fc);
	2831	+ return error;
	2832	+ }
	2833	+
	2834	+ up_write(&sb->s_umount);
	2835	+
	2836	+ mnt = vfs_create_mount(fc);
	2837	+ if (IS_ERR(mnt))
	2838	+ return PTR_ERR(mnt);
	2839	+
	2840	+ mnt_warn_timestamp_expiry(mountpoint, mnt);
	2841	+
	2842	+ mp = lock_mount(mountpoint);
	2843	+ if (IS_ERR(mp)) {
	2844	+ mntput(mnt);
	2845	+ return PTR_ERR(mp);
	2846	+ }
	2847	+ error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
	2848	+ unlock_mount(mp);
	2849	+ if (error < 0)
	2850	+ mntput(mnt);
	2851	+ return error;
	2852	+}
2501	2853
2502	2854	/*
2503	2855	* create a new mount for userspace and request it to be added into the
..	..	@@ -2507,8 +2859,9 @@
2507	2859	int mnt_flags, const char *name, void *data)
2508	2860	{
2509	2861	struct file_system_type *type;
2510		- struct vfsmount *mnt;
2511		- int err;
	2862	+ struct fs_context *fc;
	2863	+ const char *subtype = NULL;
	2864	+ int err = 0;
2512	2865
2513	2866	if (!fstype)
2514	2867	return -EINVAL;
..	..	@@ -2517,45 +2870,99 @@
2517	2870	if (!type)
2518	2871	return -ENODEV;
2519	2872
2520		- mnt = vfs_kern_mount(type, sb_flags, name, data);
2521		- if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
2522		- !mnt->mnt_sb->s_subtype)
2523		- mnt = fs_set_subtype(mnt, fstype);
2524		-
2525		- put_filesystem(type);
2526		- if (IS_ERR(mnt))
2527		- return PTR_ERR(mnt);
2528		-
2529		- if (mount_too_revealing(mnt, &mnt_flags)) {
2530		- mntput(mnt);
2531		- return -EPERM;
	2873	+ if (type->fs_flags & FS_HAS_SUBTYPE) {
	2874	+ subtype = strchr(fstype, '.');
	2875	+ if (subtype) {
	2876	+ subtype++;
	2877	+ if (!*subtype) {
	2878	+ put_filesystem(type);
	2879	+ return -EINVAL;
	2880	+ }
	2881	+ }
2532	2882	}
2533	2883
2534		- err = do_add_mount(real_mount(mnt), path, mnt_flags);
2535		- if (err)
2536		- mntput(mnt);
	2884	+ fc = fs_context_for_mount(type, sb_flags);
	2885	+ put_filesystem(type);
	2886	+ if (IS_ERR(fc))
	2887	+ return PTR_ERR(fc);
	2888	+
	2889	+ if (subtype)
	2890	+ err = vfs_parse_fs_string(fc, "subtype",
	2891	+ subtype, strlen(subtype));
	2892	+ if (!err && name)
	2893	+ err = vfs_parse_fs_string(fc, "source", name, strlen(name));
	2894	+ if (!err)
	2895	+ err = parse_monolithic_mount_data(fc, data);
	2896	+ if (!err && !mount_capable(fc))
	2897	+ err = -EPERM;
	2898	+ if (!err)
	2899	+ err = vfs_get_tree(fc);
	2900	+ if (!err)
	2901	+ err = do_new_mount_fc(fc, path, mnt_flags);
	2902	+
	2903	+ put_fs_context(fc);
2537	2904	return err;
2538	2905	}
2539	2906
2540	2907	int finish_automount(struct vfsmount *m, struct path *path)
2541	2908	{
2542		- struct mount *mnt = real_mount(m);
	2909	+ struct dentry *dentry = path->dentry;
	2910	+ struct mountpoint *mp;
	2911	+ struct mount *mnt;
2543	2912	int err;
	2913	+
	2914	+ if (!m)
	2915	+ return 0;
	2916	+ if (IS_ERR(m))
	2917	+ return PTR_ERR(m);
	2918	+
	2919	+ mnt = real_mount(m);
2544	2920	/* The new mount record should have at least 2 refs to prevent it being
2545	2921	* expired before we get a chance to add it
2546	2922	*/
2547	2923	BUG_ON(mnt_get_count(mnt) < 2);
2548	2924
2549	2925	if (m->mnt_sb == path->mnt->mnt_sb &&
2550		- m->mnt_root == path->dentry) {
	2926	+ m->mnt_root == dentry) {
2551	2927	err = -ELOOP;
2552		- goto fail;
	2928	+ goto discard;
2553	2929	}
2554	2930
2555		- err = do_add_mount(mnt, path, path->mnt->mnt_flags \| MNT_SHRINKABLE);
2556		- if (!err)
2557		- return 0;
2558		-fail:
	2931	+ /*
	2932	+ * we don't want to use lock_mount() - in this case finding something
	2933	+ * that overmounts our mountpoint to be means "quitely drop what we've
	2934	+ * got", not "try to mount it on top".
	2935	+ */
	2936	+ inode_lock(dentry->d_inode);
	2937	+ namespace_lock();
	2938	+ if (unlikely(cant_mount(dentry))) {
	2939	+ err = -ENOENT;
	2940	+ goto discard_locked;
	2941	+ }
	2942	+ rcu_read_lock();
	2943	+ if (unlikely(__lookup_mnt(path->mnt, dentry))) {
	2944	+ rcu_read_unlock();
	2945	+ err = 0;
	2946	+ goto discard_locked;
	2947	+ }
	2948	+ rcu_read_unlock();
	2949	+ mp = get_mountpoint(dentry);
	2950	+ if (IS_ERR(mp)) {
	2951	+ err = PTR_ERR(mp);
	2952	+ goto discard_locked;
	2953	+ }
	2954	+
	2955	+ err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags \| MNT_SHRINKABLE);
	2956	+ unlock_mount(mp);
	2957	+ if (unlikely(err))
	2958	+ goto discard;
	2959	+ mntput(m);
	2960	+ return 0;
	2961	+
	2962	+discard_locked:
	2963	+ namespace_unlock();
	2964	+ inode_unlock(dentry->d_inode);
	2965	+discard:
2559	2966	/* remove m from any expiration list it may be on */
2560	2967	if (!list_empty(&mnt->mnt_expire)) {
2561	2968	namespace_lock();
..	..	@@ -2689,39 +3096,10 @@
2689	3096	}
2690	3097	}
2691	3098
2692		-/*
2693		- * Some copy_from_user() implementations do not return the exact number of
2694		- * bytes remaining to copy on a fault. But copy_mount_options() requires that.
2695		- * Note that this function differs from copy_from_user() in that it will oops
2696		- * on bad values of `to', rather than returning a short copy.
2697		- */
2698		-static long exact_copy_from_user(void to, const void __user from,
2699		- unsigned long n)
	3099	+static void *copy_mount_options(const void __user *data)
2700	3100	{
2701		- char *t = to;
2702		- const char __user *f = from;
2703		- char c;
2704		-
2705		- if (!access_ok(VERIFY_READ, from, n))
2706		- return n;
2707		-
2708		- while (n) {
2709		- if (__get_user(c, f)) {
2710		- memset(t, 0, n);
2711		- break;
2712		- }
2713		- *t++ = c;
2714		- f++;
2715		- n--;
2716		- }
2717		- return n;
2718		-}
2719		-
2720		-void copy_mount_options(const void __user data)
2721		-{
2722		- int i;
2723		- unsigned long size;
2724	3101	char *copy;
	3102	+ unsigned left, offset;
2725	3103
2726	3104	if (!data)
2727	3105	return NULL;
..	..	@@ -2730,28 +3108,33 @@
2730	3108	if (!copy)
2731	3109	return ERR_PTR(-ENOMEM);
2732	3110
2733		- /* We only care that some data at the address the user
2734		- * gave us is valid. Just in case, we'll zero
2735		- * the remainder of the page.
2736		- */
2737		- /* copy_from_user cannot cross TASK_SIZE ! */
2738		- size = TASK_SIZE - (unsigned long)untagged_addr(data);
2739		- if (size > PAGE_SIZE)
2740		- size = PAGE_SIZE;
	3111	+ left = copy_from_user(copy, data, PAGE_SIZE);
2741	3112
2742		- i = size - exact_copy_from_user(copy, data, size);
2743		- if (!i) {
	3113	+ /*
	3114	+ * Not all architectures have an exact copy_from_user(). Resort to
	3115	+ * byte at a time.
	3116	+ */
	3117	+ offset = PAGE_SIZE - left;
	3118	+ while (left) {
	3119	+ char c;
	3120	+ if (get_user(c, (const char __user *)data + offset))
	3121	+ break;
	3122	+ copy[offset] = c;
	3123	+ left--;
	3124	+ offset++;
	3125	+ }
	3126	+
	3127	+ if (left == PAGE_SIZE) {
2744	3128	kfree(copy);
2745	3129	return ERR_PTR(-EFAULT);
2746	3130	}
2747		- if (i != PAGE_SIZE)
2748		- memset(copy + i, 0, PAGE_SIZE - i);
	3131	+
2749	3132	return copy;
2750	3133	}
2751	3134
2752		-char *copy_mount_string(const void __user *data)
	3135	+static char *copy_mount_string(const void __user *data)
2753	3136	{
2754		- return data ? strndup_user(data, PAGE_SIZE) : NULL;
	3137	+ return data ? strndup_user(data, PATH_MAX) : NULL;
2755	3138	}
2756	3139
2757	3140	/*
..	..	@@ -2768,12 +3151,11 @@
2768	3151	* Therefore, if this magic number is present, it carries no information
2769	3152	* and must be discarded.
2770	3153	*/
2771		-long do_mount(const char *dev_name, const char __user *dir_name,
	3154	+int path_mount(const char *dev_name, struct path *path,
2772	3155	const char *type_page, unsigned long flags, void *data_page)
2773	3156	{
2774		- struct path path;
2775	3157	unsigned int mnt_flags = 0, sb_flags;
2776		- int retval = 0;
	3158	+ int ret;
2777	3159
2778	3160	/* Discard magic */
2779	3161	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
..	..	@@ -2786,19 +3168,13 @@
2786	3168	if (flags & MS_NOUSER)
2787	3169	return -EINVAL;
2788	3170
2789		- /* ... and get the mountpoint */
2790		- retval = user_path(dir_name, &path);
2791		- if (retval)
2792		- return retval;
2793		-
2794		- retval = security_sb_mount(dev_name, &path,
2795		- type_page, flags, data_page);
2796		- if (!retval && !may_mount())
2797		- retval = -EPERM;
2798		- if (!retval && (flags & SB_MANDLOCK) && !may_mandlock())
2799		- retval = -EPERM;
2800		- if (retval)
2801		- goto dput_out;
	3171	+ ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
	3172	+ if (ret)
	3173	+ return ret;
	3174	+ if (!may_mount())
	3175	+ return -EPERM;
	3176	+ if ((flags & SB_MANDLOCK) && !may_mandlock())
	3177	+ return -EPERM;
2802	3178
2803	3179	/* Default to relatime unless overriden */
2804	3180	if (!(flags & MS_NOATIME))
..	..	@@ -2819,13 +3195,15 @@
2819	3195	mnt_flags &= ~(MNT_RELATIME \| MNT_NOATIME);
2820	3196	if (flags & MS_RDONLY)
2821	3197	mnt_flags \|= MNT_READONLY;
	3198	+ if (flags & MS_NOSYMFOLLOW)
	3199	+ mnt_flags \|= MNT_NOSYMFOLLOW;
2822	3200
2823	3201	/* The default atime for remount is preservation */
2824	3202	if ((flags & MS_REMOUNT) &&
2825	3203	((flags & (MS_NOATIME \| MS_NODIRATIME \| MS_RELATIME \|
2826	3204	MS_STRICTATIME)) == 0)) {
2827	3205	mnt_flags &= ~MNT_ATIME_MASK;
2828		- mnt_flags \|= path.mnt->mnt_flags & MNT_ATIME_MASK;
	3206	+ mnt_flags \|= path->mnt->mnt_flags & MNT_ATIME_MASK;
2829	3207	}
2830	3208
2831	3209	sb_flags = flags & (SB_RDONLY \|
..	..	@@ -2837,21 +3215,33 @@
2837	3215	SB_LAZYTIME \|
2838	3216	SB_I_VERSION);
2839	3217
	3218	+ if ((flags & (MS_REMOUNT \| MS_BIND)) == (MS_REMOUNT \| MS_BIND))
	3219	+ return do_reconfigure_mnt(path, mnt_flags);
2840	3220	if (flags & MS_REMOUNT)
2841		- retval = do_remount(&path, flags, sb_flags, mnt_flags,
2842		- data_page);
2843		- else if (flags & MS_BIND)
2844		- retval = do_loopback(&path, dev_name, flags & MS_REC);
2845		- else if (flags & (MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
2846		- retval = do_change_type(&path, flags);
2847		- else if (flags & MS_MOVE)
2848		- retval = do_move_mount(&path, dev_name);
2849		- else
2850		- retval = do_new_mount(&path, type_page, sb_flags, mnt_flags,
2851		- dev_name, data_page);
2852		-dput_out:
	3221	+ return do_remount(path, flags, sb_flags, mnt_flags, data_page);
	3222	+ if (flags & MS_BIND)
	3223	+ return do_loopback(path, dev_name, flags & MS_REC);
	3224	+ if (flags & (MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
	3225	+ return do_change_type(path, flags);
	3226	+ if (flags & MS_MOVE)
	3227	+ return do_move_mount_old(path, dev_name);
	3228	+
	3229	+ return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
	3230	+ data_page);
	3231	+}
	3232	+
	3233	+long do_mount(const char *dev_name, const char __user *dir_name,
	3234	+ const char *type_page, unsigned long flags, void *data_page)
	3235	+{
	3236	+ struct path path;
	3237	+ int ret;
	3238	+
	3239	+ ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
	3240	+ if (ret)
	3241	+ return ret;
	3242	+ ret = path_mount(dev_name, &path, type_page, flags, data_page);
2853	3243	path_put(&path);
2854		- return retval;
	3244	+ return ret;
2855	3245	}
2856	3246
2857	3247	static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
..	..	@@ -2866,7 +3256,8 @@
2866	3256
2867	3257	static void free_mnt_ns(struct mnt_namespace *ns)
2868	3258	{
2869		- ns_free_inum(&ns->ns);
	3259	+ if (!is_anon_ns(ns))
	3260	+ ns_free_inum(&ns->ns);
2870	3261	dec_mnt_namespaces(ns->ucounts);
2871	3262	put_user_ns(ns->user_ns);
2872	3263	kfree(ns);
..	..	@@ -2881,7 +3272,7 @@
2881	3272	*/
2882	3273	static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
2883	3274
2884		-static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
	3275	+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
2885	3276	{
2886	3277	struct mnt_namespace *new_ns;
2887	3278	struct ucounts *ucounts;
..	..	@@ -2891,28 +3282,28 @@
2891	3282	if (!ucounts)
2892	3283	return ERR_PTR(-ENOSPC);
2893	3284
2894		- new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
	3285	+ new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
2895	3286	if (!new_ns) {
2896	3287	dec_mnt_namespaces(ucounts);
2897	3288	return ERR_PTR(-ENOMEM);
2898	3289	}
2899		- ret = ns_alloc_inum(&new_ns->ns);
2900		- if (ret) {
2901		- kfree(new_ns);
2902		- dec_mnt_namespaces(ucounts);
2903		- return ERR_PTR(ret);
	3290	+ if (!anon) {
	3291	+ ret = ns_alloc_inum(&new_ns->ns);
	3292	+ if (ret) {
	3293	+ kfree(new_ns);
	3294	+ dec_mnt_namespaces(ucounts);
	3295	+ return ERR_PTR(ret);
	3296	+ }
2904	3297	}
2905	3298	new_ns->ns.ops = &mntns_operations;
2906		- new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
	3299	+ if (!anon)
	3300	+ new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
2907	3301	atomic_set(&new_ns->count, 1);
2908		- new_ns->root = NULL;
2909	3302	INIT_LIST_HEAD(&new_ns->list);
2910	3303	init_waitqueue_head(&new_ns->poll);
2911		- new_ns->event = 0;
	3304	+ spin_lock_init(&new_ns->ns_lock);
2912	3305	new_ns->user_ns = get_user_ns(user_ns);
2913	3306	new_ns->ucounts = ucounts;
2914		- new_ns->mounts = 0;
2915		- new_ns->pending_mounts = 0;
2916	3307	return new_ns;
2917	3308	}
2918	3309
..	..	@@ -2936,7 +3327,7 @@
2936	3327
2937	3328	old = ns->root;
2938	3329
2939		- new_ns = alloc_mnt_ns(user_ns);
	3330	+ new_ns = alloc_mnt_ns(user_ns, false);
2940	3331	if (IS_ERR(new_ns))
2941	3332	return new_ns;
2942	3333
..	..	@@ -2944,12 +3335,17 @@
2944	3335	/* First pass: copy the tree topology */
2945	3336	copy_flags = CL_COPY_UNBINDABLE \| CL_EXPIRE;
2946	3337	if (user_ns != ns->user_ns)
2947		- copy_flags \|= CL_SHARED_TO_SLAVE \| CL_UNPRIVILEGED;
	3338	+ copy_flags \|= CL_SHARED_TO_SLAVE;
2948	3339	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
2949	3340	if (IS_ERR(new)) {
2950	3341	namespace_unlock();
2951	3342	free_mnt_ns(new_ns);
2952	3343	return ERR_CAST(new);
	3344	+ }
	3345	+ if (user_ns != ns->user_ns) {
	3346	+ lock_mount_hash();
	3347	+ lock_mnt_tree(new);
	3348	+ unlock_mount_hash();
2953	3349	}
2954	3350	new_ns->root = new;
2955	3351	list_add_tail(&new_ns->list, &new->mnt_list);
..	..	@@ -2991,37 +3387,25 @@
2991	3387	return new_ns;
2992	3388	}
2993	3389
2994		-/**
2995		- * create_mnt_ns - creates a private namespace and adds a root filesystem
2996		- * @mnt: pointer to the new root filesystem mountpoint
2997		- */
2998		-static struct mnt_namespace create_mnt_ns(struct vfsmount m)
	3390	+struct dentry mount_subtree(struct vfsmount m, const char *name)
2999	3391	{
3000		- struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
3001		- if (!IS_ERR(new_ns)) {
3002		- struct mount *mnt = real_mount(m);
3003		- mnt->mnt_ns = new_ns;
3004		- new_ns->root = mnt;
3005		- new_ns->mounts++;
3006		- list_add(&mnt->mnt_list, &new_ns->list);
3007		- } else {
3008		- mntput(m);
3009		- }
3010		- return new_ns;
3011		-}
3012		-
3013		-struct dentry mount_subtree(struct vfsmount mnt, const char *name)
3014		-{
	3392	+ struct mount *mnt = real_mount(m);
3015	3393	struct mnt_namespace *ns;
3016	3394	struct super_block *s;
3017	3395	struct path path;
3018	3396	int err;
3019	3397
3020		- ns = create_mnt_ns(mnt);
3021		- if (IS_ERR(ns))
	3398	+ ns = alloc_mnt_ns(&init_user_ns, true);
	3399	+ if (IS_ERR(ns)) {
	3400	+ mntput(m);
3022	3401	return ERR_CAST(ns);
	3402	+ }
	3403	+ mnt->mnt_ns = ns;
	3404	+ ns->root = mnt;
	3405	+ ns->mounts++;
	3406	+ list_add(&mnt->mnt_list, &ns->list);
3023	3407
3024		- err = vfs_path_lookup(mnt->mnt_root, mnt,
	3408	+ err = vfs_path_lookup(m->mnt_root, m,
3025	3409	name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &path);
3026	3410
3027	3411	put_mnt_ns(ns);
..	..	@@ -3040,8 +3424,8 @@
3040	3424	}
3041	3425	EXPORT_SYMBOL(mount_subtree);
3042	3426
3043		-int ksys_mount(char __user dev_name, char __user dir_name, char __user *type,
3044		- unsigned long flags, void __user *data)
	3427	+SYSCALL_DEFINE5(mount, char __user , dev_name, char __user , dir_name,
	3428	+ char __user , type, unsigned long, flags, void __user , data)
3045	3429	{
3046	3430	int ret;
3047	3431	char *kernel_type;
..	..	@@ -3074,10 +3458,202 @@
3074	3458	return ret;
3075	3459	}
3076	3460
3077		-SYSCALL_DEFINE5(mount, char __user , dev_name, char __user , dir_name,
3078		- char __user , type, unsigned long, flags, void __user , data)
	3461	+/*
	3462	+ * Create a kernel mount representation for a new, prepared superblock
	3463	+ * (specified by fs_fd) and attach to an open_tree-like file descriptor.
	3464	+ */
	3465	+SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
	3466	+ unsigned int, attr_flags)
3079	3467	{
3080		- return ksys_mount(dev_name, dir_name, type, flags, data);
	3468	+ struct mnt_namespace *ns;
	3469	+ struct fs_context *fc;
	3470	+ struct file *file;
	3471	+ struct path newmount;
	3472	+ struct mount *mnt;
	3473	+ struct fd f;
	3474	+ unsigned int mnt_flags = 0;
	3475	+ long ret;
	3476	+
	3477	+ if (!may_mount())
	3478	+ return -EPERM;
	3479	+
	3480	+ if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
	3481	+ return -EINVAL;
	3482	+
	3483	+ if (attr_flags & ~(MOUNT_ATTR_RDONLY \|
	3484	+ MOUNT_ATTR_NOSUID \|
	3485	+ MOUNT_ATTR_NODEV \|
	3486	+ MOUNT_ATTR_NOEXEC \|
	3487	+ MOUNT_ATTR__ATIME \|
	3488	+ MOUNT_ATTR_NODIRATIME))
	3489	+ return -EINVAL;
	3490	+
	3491	+ if (attr_flags & MOUNT_ATTR_RDONLY)
	3492	+ mnt_flags \|= MNT_READONLY;
	3493	+ if (attr_flags & MOUNT_ATTR_NOSUID)
	3494	+ mnt_flags \|= MNT_NOSUID;
	3495	+ if (attr_flags & MOUNT_ATTR_NODEV)
	3496	+ mnt_flags \|= MNT_NODEV;
	3497	+ if (attr_flags & MOUNT_ATTR_NOEXEC)
	3498	+ mnt_flags \|= MNT_NOEXEC;
	3499	+ if (attr_flags & MOUNT_ATTR_NODIRATIME)
	3500	+ mnt_flags \|= MNT_NODIRATIME;
	3501	+
	3502	+ switch (attr_flags & MOUNT_ATTR__ATIME) {
	3503	+ case MOUNT_ATTR_STRICTATIME:
	3504	+ break;
	3505	+ case MOUNT_ATTR_NOATIME:
	3506	+ mnt_flags \|= MNT_NOATIME;
	3507	+ break;
	3508	+ case MOUNT_ATTR_RELATIME:
	3509	+ mnt_flags \|= MNT_RELATIME;
	3510	+ break;
	3511	+ default:
	3512	+ return -EINVAL;
	3513	+ }
	3514	+
	3515	+ f = fdget(fs_fd);
	3516	+ if (!f.file)
	3517	+ return -EBADF;
	3518	+
	3519	+ ret = -EINVAL;
	3520	+ if (f.file->f_op != &fscontext_fops)
	3521	+ goto err_fsfd;
	3522	+
	3523	+ fc = f.file->private_data;
	3524	+
	3525	+ ret = mutex_lock_interruptible(&fc->uapi_mutex);
	3526	+ if (ret < 0)
	3527	+ goto err_fsfd;
	3528	+
	3529	+ /* There must be a valid superblock or we can't mount it */
	3530	+ ret = -EINVAL;
	3531	+ if (!fc->root)
	3532	+ goto err_unlock;
	3533	+
	3534	+ ret = -EPERM;
	3535	+ if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
	3536	+ pr_warn("VFS: Mount too revealing\n");
	3537	+ goto err_unlock;
	3538	+ }
	3539	+
	3540	+ ret = -EBUSY;
	3541	+ if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
	3542	+ goto err_unlock;
	3543	+
	3544	+ ret = -EPERM;
	3545	+ if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock())
	3546	+ goto err_unlock;
	3547	+
	3548	+ newmount.mnt = vfs_create_mount(fc);
	3549	+ if (IS_ERR(newmount.mnt)) {
	3550	+ ret = PTR_ERR(newmount.mnt);
	3551	+ goto err_unlock;
	3552	+ }
	3553	+ newmount.dentry = dget(fc->root);
	3554	+ newmount.mnt->mnt_flags = mnt_flags;
	3555	+
	3556	+ /* We've done the mount bit - now move the file context into more or
	3557	+ * less the same state as if we'd done an fspick(). We don't want to
	3558	+ * do any memory allocation or anything like that at this point as we
	3559	+ * don't want to have to handle any errors incurred.
	3560	+ */
	3561	+ vfs_clean_context(fc);
	3562	+
	3563	+ ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
	3564	+ if (IS_ERR(ns)) {
	3565	+ ret = PTR_ERR(ns);
	3566	+ goto err_path;
	3567	+ }
	3568	+ mnt = real_mount(newmount.mnt);
	3569	+ mnt->mnt_ns = ns;
	3570	+ ns->root = mnt;
	3571	+ ns->mounts = 1;
	3572	+ list_add(&mnt->mnt_list, &ns->list);
	3573	+ mntget(newmount.mnt);
	3574	+
	3575	+ /* Attach to an apparent O_PATH fd with a note that we need to unmount
	3576	+ * it, not just simply put it.
	3577	+ */
	3578	+ file = dentry_open(&newmount, O_PATH, fc->cred);
	3579	+ if (IS_ERR(file)) {
	3580	+ dissolve_on_fput(newmount.mnt);
	3581	+ ret = PTR_ERR(file);
	3582	+ goto err_path;
	3583	+ }
	3584	+ file->f_mode \|= FMODE_NEED_UNMOUNT;
	3585	+
	3586	+ ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
	3587	+ if (ret >= 0)
	3588	+ fd_install(ret, file);
	3589	+ else
	3590	+ fput(file);
	3591	+
	3592	+err_path:
	3593	+ path_put(&newmount);
	3594	+err_unlock:
	3595	+ mutex_unlock(&fc->uapi_mutex);
	3596	+err_fsfd:
	3597	+ fdput(f);
	3598	+ return ret;
	3599	+}
	3600	+
	3601	+/*
	3602	+ * Move a mount from one place to another. In combination with
	3603	+ * fsopen()/fsmount() this is used to install a new mount and in combination
	3604	+ * with open_tree(OPEN_TREE_CLONE [\| AT_RECURSIVE]) it can be used to copy
	3605	+ * a mount subtree.
	3606	+ *
	3607	+ * Note the flags value is a combination of MOVE_MOUNT_* flags.
	3608	+ */
	3609	+SYSCALL_DEFINE5(move_mount,
	3610	+ int, from_dfd, const char __user *, from_pathname,
	3611	+ int, to_dfd, const char __user *, to_pathname,
	3612	+ unsigned int, flags)
	3613	+{
	3614	+ struct path from_path, to_path;
	3615	+ unsigned int lflags;
	3616	+ int ret = 0;
	3617	+
	3618	+ if (!may_mount())
	3619	+ return -EPERM;
	3620	+
	3621	+ if (flags & ~MOVE_MOUNT__MASK)
	3622	+ return -EINVAL;
	3623	+
	3624	+ /* If someone gives a pathname, they aren't permitted to move
	3625	+ * from an fd that requires unmount as we can't get at the flag
	3626	+ * to clear it afterwards.
	3627	+ */
	3628	+ lflags = 0;
	3629	+ if (flags & MOVE_MOUNT_F_SYMLINKS) lflags \|= LOOKUP_FOLLOW;
	3630	+ if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags \|= LOOKUP_AUTOMOUNT;
	3631	+ if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags \|= LOOKUP_EMPTY;
	3632	+
	3633	+ ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
	3634	+ if (ret < 0)
	3635	+ return ret;
	3636	+
	3637	+ lflags = 0;
	3638	+ if (flags & MOVE_MOUNT_T_SYMLINKS) lflags \|= LOOKUP_FOLLOW;
	3639	+ if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags \|= LOOKUP_AUTOMOUNT;
	3640	+ if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags \|= LOOKUP_EMPTY;
	3641	+
	3642	+ ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
	3643	+ if (ret < 0)
	3644	+ goto out_from;
	3645	+
	3646	+ ret = security_move_mount(&from_path, &to_path);
	3647	+ if (ret < 0)
	3648	+ goto out_to;
	3649	+
	3650	+ ret = do_move_mount(&from_path, &to_path);
	3651	+
	3652	+out_to:
	3653	+ path_put(&to_path);
	3654	+out_from:
	3655	+ path_put(&from_path);
	3656	+ return ret;
3081	3657	}
3082	3658
3083	3659	/*
..	..	@@ -3119,7 +3695,7 @@
3119	3695	* file system may be mounted on put_old. After all, new_root is a mountpoint.
3120	3696	*
3121	3697	* Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
3122		- * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
	3698	+ * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
3123	3699	* in this situation.
3124	3700	*
3125	3701	* Notes:
..	..	@@ -3133,19 +3709,21 @@
3133	3709	SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
3134	3710	const char __user *, put_old)
3135	3711	{
3136		- struct path new, old, parent_path, root_parent, root;
3137		- struct mount new_mnt, root_mnt, *old_mnt;
	3712	+ struct path new, old, root;
	3713	+ struct mount new_mnt, root_mnt, old_mnt, root_parent, *ex_parent;
3138	3714	struct mountpoint old_mp, root_mp;
3139	3715	int error;
3140	3716
3141	3717	if (!may_mount())
3142	3718	return -EPERM;
3143	3719
3144		- error = user_path_dir(new_root, &new);
	3720	+ error = user_path_at(AT_FDCWD, new_root,
	3721	+ LOOKUP_FOLLOW \| LOOKUP_DIRECTORY, &new);
3145	3722	if (error)
3146	3723	goto out0;
3147	3724
3148		- error = user_path_dir(put_old, &old);
	3725	+ error = user_path_at(AT_FDCWD, put_old,
	3726	+ LOOKUP_FOLLOW \| LOOKUP_DIRECTORY, &old);
3149	3727	if (error)
3150	3728	goto out1;
3151	3729
..	..	@@ -3163,9 +3741,11 @@
3163	3741	new_mnt = real_mount(new.mnt);
3164	3742	root_mnt = real_mount(root.mnt);
3165	3743	old_mnt = real_mount(old.mnt);
	3744	+ ex_parent = new_mnt->mnt_parent;
	3745	+ root_parent = root_mnt->mnt_parent;
3166	3746	if (IS_MNT_SHARED(old_mnt) \|\|
3167		- IS_MNT_SHARED(new_mnt->mnt_parent) \|\|
3168		- IS_MNT_SHARED(root_mnt->mnt_parent))
	3747	+ IS_MNT_SHARED(ex_parent) \|\|
	3748	+ IS_MNT_SHARED(root_parent))
3169	3749	goto out4;
3170	3750	if (!check_mnt(root_mnt) \|\| !check_mnt(new_mnt))
3171	3751	goto out4;
..	..	@@ -3182,7 +3762,6 @@
3182	3762	goto out4; /* not a mountpoint */
3183	3763	if (!mnt_has_parent(root_mnt))
3184	3764	goto out4; /* not attached */
3185		- root_mp = root_mnt->mnt_mp;
3186	3765	if (new.mnt->mnt_root != new.dentry)
3187	3766	goto out4; /* not a mountpoint */
3188	3767	if (!mnt_has_parent(new_mnt))
..	..	@@ -3194,9 +3773,8 @@
3194	3773	if (!is_path_reachable(new_mnt, new.dentry, &root))
3195	3774	goto out4;
3196	3775	lock_mount_hash();
3197		- root_mp->m_count++; /* pin it so it won't go away */
3198		- detach_mnt(new_mnt, &parent_path);
3199		- detach_mnt(root_mnt, &root_parent);
	3776	+ umount_mnt(new_mnt);
	3777	+ root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */
3200	3778	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
3201	3779	new_mnt->mnt.mnt_flags \|= MNT_LOCKED;
3202	3780	root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
..	..	@@ -3204,7 +3782,8 @@
3204	3782	/* mount old root on put_old */
3205	3783	attach_mnt(root_mnt, old_mnt, old_mp);
3206	3784	/* mount new_root on / */
3207		- attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
	3785	+ attach_mnt(new_mnt, root_parent, root_mp);
	3786	+ mnt_add_count(root_parent, -1);
3208	3787	touch_mnt_namespace(current->nsproxy->mnt_ns);
3209	3788	/* A moved mount should not expire automatically */
3210	3789	list_del_init(&new_mnt->mnt_expire);
..	..	@@ -3214,10 +3793,8 @@
3214	3793	error = 0;
3215	3794	out4:
3216	3795	unlock_mount(old_mp);
3217		- if (!error) {
3218		- path_put(&root_parent);
3219		- path_put(&parent_path);
3220		- }
	3796	+ if (!error)
	3797	+ mntput_no_expire(ex_parent);
3221	3798	out3:
3222	3799	path_put(&root);
3223	3800	out2:
..	..	@@ -3231,22 +3808,22 @@
3231	3808	static void __init init_mount_tree(void)
3232	3809	{
3233	3810	struct vfsmount *mnt;
	3811	+ struct mount *m;
3234	3812	struct mnt_namespace *ns;
3235	3813	struct path root;
3236		- struct file_system_type *type;
3237	3814
3238		- type = get_fs_type("rootfs");
3239		- if (!type)
3240		- panic("Can't find rootfs type");
3241		- mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
3242		- put_filesystem(type);
	3815	+ mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
3243	3816	if (IS_ERR(mnt))
3244	3817	panic("Can't create rootfs");
3245	3818
3246		- ns = create_mnt_ns(mnt);
	3819	+ ns = alloc_mnt_ns(&init_user_ns, false);
3247	3820	if (IS_ERR(ns))
3248	3821	panic("Can't allocate initial namespace");
3249		-
	3822	+ m = real_mount(mnt);
	3823	+ m->mnt_ns = ns;
	3824	+ ns->root = m;
	3825	+ ns->mounts = 1;
	3826	+ list_add(&m->mnt_list, &ns->list);
3250	3827	init_task.nsproxy->mnt_ns = ns;
3251	3828	get_mnt_ns(ns);
3252	3829
..	..	@@ -3288,6 +3865,7 @@
3288	3865	fs_kobj = kobject_create_and_add("fs", NULL);
3289	3866	if (!fs_kobj)
3290	3867	printk(KERN_WARNING "%s: kobj create error\n", __func__);
	3868	+ shmem_init();
3291	3869	init_rootfs();
3292	3870	init_mount_tree();
3293	3871	}
..	..	@@ -3300,10 +3878,10 @@
3300	3878	free_mnt_ns(ns);
3301	3879	}
3302	3880
3303		-struct vfsmount kern_mount_data(struct file_system_type type, void *data)
	3881	+struct vfsmount kern_mount(struct file_system_type type)
3304	3882	{
3305	3883	struct vfsmount *mnt;
3306		- mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data);
	3884	+ mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
3307	3885	if (!IS_ERR(mnt)) {
3308	3886	/*
3309	3887	* it is a longterm mount, don't release mnt until
..	..	@@ -3313,7 +3891,7 @@
3313	3891	}
3314	3892	return mnt;
3315	3893	}
3316		-EXPORT_SYMBOL_GPL(kern_mount_data);
	3894	+EXPORT_SYMBOL_GPL(kern_mount);
3317	3895
3318	3896	void kern_unmount(struct vfsmount *mnt)
3319	3897	{
..	..	@@ -3325,6 +3903,19 @@
3325	3903	}
3326	3904	}
3327	3905	EXPORT_SYMBOL(kern_unmount);
	3906	+
	3907	+void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
	3908	+{
	3909	+ unsigned int i;
	3910	+
	3911	+ for (i = 0; i < num; i++)
	3912	+ if (mnt[i])
	3913	+ real_mount(mnt[i])->mnt_ns = NULL;
	3914	+ synchronize_rcu_expedited();
	3915	+ for (i = 0; i < num; i++)
	3916	+ mntput(mnt[i]);
	3917	+}
	3918	+EXPORT_SYMBOL(kern_unmount_array);
3328	3919
3329	3920	bool our_mnt(struct vfsmount *mnt)
3330	3921	{
..	..	@@ -3355,7 +3946,8 @@
3355	3946	return chrooted;
3356	3947	}
3357	3948
3358		-static bool mnt_already_visible(struct mnt_namespace ns, struct vfsmount new,
	3949	+static bool mnt_already_visible(struct mnt_namespace *ns,
	3950	+ const struct super_block *sb,
3359	3951	int *new_mnt_flags)
3360	3952	{
3361	3953	int new_flags = *new_mnt_flags;
..	..	@@ -3363,11 +3955,15 @@
3363	3955	bool visible = false;
3364	3956
3365	3957	down_read(&namespace_sem);
	3958	+ lock_ns_list(ns);
3366	3959	list_for_each_entry(mnt, &ns->list, mnt_list) {
3367	3960	struct mount *child;
3368	3961	int mnt_flags;
3369	3962
3370		- if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
	3963	+ if (mnt_is_cursor(mnt))
	3964	+ continue;
	3965	+
	3966	+ if (mnt->mnt.mnt_sb->s_type != sb->s_type)
3371	3967	continue;
3372	3968
3373	3969	/* This mount is not fully visible if it's root directory
..	..	@@ -3414,11 +4010,12 @@
3414	4010	next: ;
3415	4011	}
3416	4012	found:
	4013	+ unlock_ns_list(ns);
3417	4014	up_read(&namespace_sem);
3418	4015	return visible;
3419	4016	}
3420	4017
3421		-static bool mount_too_revealing(struct vfsmount mnt, int new_mnt_flags)
	4018	+static bool mount_too_revealing(const struct super_block sb, int new_mnt_flags)
3422	4019	{
3423	4020	const unsigned long required_iflags = SB_I_NOEXEC \| SB_I_NODEV;
3424	4021	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
..	..	@@ -3428,7 +4025,7 @@
3428	4025	return false;
3429	4026
3430	4027	/* Can this filesystem be too revealing? */
3431		- s_iflags = mnt->mnt_sb->s_iflags;
	4028	+ s_iflags = sb->s_iflags;
3432	4029	if (!(s_iflags & SB_I_USERNS_VISIBLE))
3433	4030	return false;
3434	4031
..	..	@@ -3438,7 +4035,7 @@
3438	4035	return true;
3439	4036	}
3440	4037
3441		- return !mnt_already_visible(ns, mnt, new_mnt_flags);
	4038	+ return !mnt_already_visible(ns, sb, new_mnt_flags);
3442	4039	}
3443	4040
3444	4041	bool mnt_may_suid(struct vfsmount *mnt)
..	..	@@ -3475,18 +4072,23 @@
3475	4072	put_mnt_ns(to_mnt_ns(ns));
3476	4073	}
3477	4074
3478		-static int mntns_install(struct nsproxy nsproxy, struct ns_common ns)
	4075	+static int mntns_install(struct nsset nsset, struct ns_common ns)
3479	4076	{
3480		- struct fs_struct *fs = current->fs;
	4077	+ struct nsproxy *nsproxy = nsset->nsproxy;
	4078	+ struct fs_struct *fs = nsset->fs;
3481	4079	struct mnt_namespace mnt_ns = to_mnt_ns(ns), old_mnt_ns;
	4080	+ struct user_namespace *user_ns = nsset->cred->user_ns;
3482	4081	struct path root;
3483	4082	int err;
3484	4083
3485	4084	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) \|\|
3486		- !ns_capable(current_user_ns(), CAP_SYS_CHROOT) \|\|
3487		- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
	4085	+ !ns_capable(user_ns, CAP_SYS_CHROOT) \|\|
	4086	+ !ns_capable(user_ns, CAP_SYS_ADMIN))
3488	4087	return -EPERM;
3489	4088
	4089	+ if (is_anon_ns(mnt_ns))
	4090	+ return -EINVAL;
	4091	+
3490	4092	if (fs->users != 1)
3491	4093	return -EINVAL;
3492	4094