~hc/RK356X_SDK_RELEASE.git

..	..	@@ -1,8 +1,8 @@
	1	+// SPDX-License-Identifier: GPL-2.0-only
1	2	/*
2	3	* linux/fs/namespace.c
3	4	*
4	5	* (C) Copyright Al Viro 2000, 2001
5		- * Released under GPL v2.
6	6	*
7	7	* Based on code from fs/super.c, copyright Linus Torvalds and others.
8	8	* Heavily rewritten.
..	..	@@ -14,18 +14,23 @@
14	14	#include <linux/mnt_namespace.h>
15	15	#include <linux/user_namespace.h>
16	16	#include <linux/namei.h>
	17	+#include <linux/delay.h>
17	18	#include <linux/security.h>
18	19	#include <linux/cred.h>
19	20	#include <linux/idr.h>
20	21	#include <linux/init.h> /* init_rootfs */
21	22	#include <linux/fs_struct.h> /* get_fs_root et.al. */
22	23	#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
	24	+#include <linux/file.h>
23	25	#include <linux/uaccess.h>
24	26	#include <linux/proc_ns.h>
25	27	#include <linux/magic.h>
26		-#include <linux/bootmem.h>
	28	+#include <linux/memblock.h>
27	29	#include <linux/task_work.h>
28	30	#include <linux/sched/task.h>
	31	+#include <uapi/linux/mount.h>
	32	+#include <linux/fs_context.h>
	33	+#include <linux/shmem_fs.h>
29	34
30	35	#include "pnode.h"
31	36	#include "internal.h"
..	..	@@ -66,6 +71,8 @@
66	71	static struct hlist_head *mountpoint_hashtable __read_mostly;
67	72	static struct kmem_cache *mnt_cache __read_mostly;
68	73	static DECLARE_RWSEM(namespace_sem);
	74	+static HLIST_HEAD(unmounted); /* protected by namespace_sem */
	75	+static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
69	76
70	77	/* /sys/fs */
71	78	struct kobject *fs_kobj;
..	..	@@ -150,10 +157,10 @@
150	157	/*
151	158	* vfsmount lock must be held for write
152	159	*/
153		-unsigned int mnt_get_count(struct mount *mnt)
	160	+int mnt_get_count(struct mount *mnt)
154	161	{
155	162	#ifdef CONFIG_SMP
156		- unsigned int count = 0;
	163	+ int count = 0;
157	164	int cpu;
158	165
159	166	for_each_possible_cpu(cpu) {
..	..	@@ -164,14 +171,6 @@
164	171	#else
165	172	return mnt->mnt_count;
166	173	#endif
167		-}
168		-
169		-static void drop_mountpoint(struct fs_pin *p)
170		-{
171		- struct mount *m = container_of(p, struct mount, mnt_umount);
172		- dput(m->mnt_ex_mountpoint);
173		- pin_remove(p);
174		- mntput(&m->mnt);
175	174	}
176	175
177	176	static struct mount *alloc_vfsmnt(const char *name)
..	..	@@ -200,7 +199,6 @@
200	199	mnt->mnt_count = 1;
201	200	mnt->mnt_writers = 0;
202	201	#endif
203		- mnt->mnt.data = NULL;
204	202
205	203	INIT_HLIST_NODE(&mnt->mnt_hash);
206	204	INIT_LIST_HEAD(&mnt->mnt_child);
..	..	@@ -212,7 +210,7 @@
212	210	INIT_LIST_HEAD(&mnt->mnt_slave);
213	211	INIT_HLIST_NODE(&mnt->mnt_mp_list);
214	212	INIT_LIST_HEAD(&mnt->mnt_umounting);
215		- init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
	213	+ INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
216	214	}
217	215	return mnt;
218	216
..	..	@@ -246,13 +244,9 @@
246	244	* mnt_want/drop_write() will _keep_ the filesystem
247	245	* r/w.
248	246	*/
249		-int __mnt_is_readonly(struct vfsmount *mnt)
	247	+bool __mnt_is_readonly(struct vfsmount *mnt)
250	248	{
251		- if (mnt->mnt_flags & MNT_READONLY)
252		- return 1;
253		- if (sb_rdonly(mnt->mnt_sb))
254		- return 1;
255		- return 0;
	249	+ return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
256	250	}
257	251	EXPORT_SYMBOL_GPL(__mnt_is_readonly);
258	252
..	..	@@ -328,8 +322,11 @@
328	322	* incremented count after it has set MNT_WRITE_HOLD.
329	323	*/
330	324	smp_mb();
331		- while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
332		- cpu_relax();
	325	+ while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
	326	+ preempt_enable();
	327	+ cpu_chill();
	328	+ preempt_disable();
	329	+ }
333	330	/*
334	331	* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
335	332	* be set to match its requirements. So we must not load that until
..	..	@@ -422,7 +419,7 @@
422	419	sb_end_write(file_inode(file)->i_sb);
423	420	return ret;
424	421	}
425		-EXPORT_SYMBOL_GPL(mnt_want_write_file);
	422	+EXPORT_SYMBOL_NS_GPL(mnt_want_write_file, ANDROID_GKI_VFS_EXPORT_ONLY);
426	423
427	424	/**
428	425	* __mnt_drop_write - give up write access to a mount
..	..	@@ -464,7 +461,7 @@
464	461	__mnt_drop_write_file(file);
465	462	sb_end_write(file_inode(file)->i_sb);
466	463	}
467		-EXPORT_SYMBOL(mnt_drop_write_file);
	464	+EXPORT_SYMBOL_NS(mnt_drop_write_file, ANDROID_GKI_VFS_EXPORT_ONLY);
468	465
469	466	static int mnt_make_readonly(struct mount *mnt)
470	467	{
..	..	@@ -508,11 +505,12 @@
508	505	return ret;
509	506	}
510	507
511		-static void __mnt_unmake_readonly(struct mount *mnt)
	508	+static int __mnt_unmake_readonly(struct mount *mnt)
512	509	{
513	510	lock_mount_hash();
514	511	mnt->mnt.mnt_flags &= ~MNT_READONLY;
515	512	unlock_mount_hash();
	513	+ return 0;
516	514	}
517	515
518	516	int sb_prepare_remount_readonly(struct super_block *sb)
..	..	@@ -553,7 +551,6 @@
553	551
554	552	static void free_vfsmnt(struct mount *mnt)
555	553	{
556		- kfree(mnt->mnt.data);
557	554	kfree_const(mnt->mnt_devname);
558	555	#ifdef CONFIG_SMP
559	556	free_percpu(mnt->mnt_pcp);
..	..	@@ -655,6 +652,21 @@
655	652	return m;
656	653	}
657	654
	655	+static inline void lock_ns_list(struct mnt_namespace *ns)
	656	+{
	657	+ spin_lock(&ns->ns_lock);
	658	+}
	659	+
	660	+static inline void unlock_ns_list(struct mnt_namespace *ns)
	661	+{
	662	+ spin_unlock(&ns->ns_lock);
	663	+}
	664	+
	665	+static inline bool mnt_is_cursor(struct mount *mnt)
	666	+{
	667	+ return mnt->mnt.mnt_flags & MNT_CURSOR;
	668	+}
	669	+
658	670	/*
659	671	* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
660	672	* current mount namespace.
..	..	@@ -676,17 +688,18 @@
676	688	struct mount *mnt;
677	689	bool is_covered = false;
678	690
679		- if (!d_mountpoint(dentry))
680		- goto out;
681		-
682	691	down_read(&namespace_sem);
	692	+ lock_ns_list(ns);
683	693	list_for_each_entry(mnt, &ns->list, mnt_list) {
	694	+ if (mnt_is_cursor(mnt))
	695	+ continue;
684	696	is_covered = (mnt->mnt_mountpoint == dentry);
685	697	if (is_covered)
686	698	break;
687	699	}
	700	+ unlock_ns_list(ns);
688	701	up_read(&namespace_sem);
689		-out:
	702	+
690	703	return is_covered;
691	704	}
692	705
..	..	@@ -741,7 +754,7 @@
741	754
742	755	/* Add the new mountpoint to the hash table */
743	756	read_seqlock_excl(&mount_lock);
744		- new->m_dentry = dentry;
	757	+ new->m_dentry = dget(dentry);
745	758	new->m_count = 1;
746	759	hlist_add_head(&new->m_hash, mp_hash(dentry));
747	760	INIT_HLIST_HEAD(&new->m_list);
..	..	@@ -754,7 +767,11 @@
754	767	return mp;
755	768	}
756	769
757		-static void put_mountpoint(struct mountpoint *mp)
	770	+/*
	771	+ * vfsmount lock must be held. Additionally, the caller is responsible
	772	+ * for serializing calls for given disposal list.
	773	+ */
	774	+static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
758	775	{
759	776	if (!--mp->m_count) {
760	777	struct dentry *dentry = mp->m_dentry;
..	..	@@ -762,9 +779,16 @@
762	779	spin_lock(&dentry->d_lock);
763	780	dentry->d_flags &= ~DCACHE_MOUNTED;
764	781	spin_unlock(&dentry->d_lock);
	782	+ dput_to_list(dentry, list);
765	783	hlist_del(&mp->m_hash);
766	784	kfree(mp);
767	785	}
	786	+}
	787	+
	788	+/* called with namespace_lock and vfsmount lock */
	789	+static void put_mountpoint(struct mountpoint *mp)
	790	+{
	791	+ __put_mountpoint(mp, &ex_mountpoints);
768	792	}
769	793
770	794	static inline int check_mnt(struct mount *mnt)
..	..	@@ -797,25 +821,17 @@
797	821	/*
798	822	* vfsmount lock must be held for write
799	823	*/
800		-static void unhash_mnt(struct mount *mnt)
	824	+static struct mountpoint *unhash_mnt(struct mount *mnt)
801	825	{
	826	+ struct mountpoint *mp;
802	827	mnt->mnt_parent = mnt;
803	828	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
804	829	list_del_init(&mnt->mnt_child);
805	830	hlist_del_init_rcu(&mnt->mnt_hash);
806	831	hlist_del_init(&mnt->mnt_mp_list);
807		- put_mountpoint(mnt->mnt_mp);
	832	+ mp = mnt->mnt_mp;
808	833	mnt->mnt_mp = NULL;
809		-}
810		-
811		-/*
812		- * vfsmount lock must be held for write
813		- */
814		-static void detach_mnt(struct mount *mnt, struct path *old_path)
815		-{
816		- old_path->dentry = mnt->mnt_mountpoint;
817		- old_path->mnt = &mnt->mnt_parent->mnt;
818		- unhash_mnt(mnt);
	834	+ return mp;
819	835	}
820	836
821	837	/*
..	..	@@ -823,9 +839,7 @@
823	839	*/
824	840	static void umount_mnt(struct mount *mnt)
825	841	{
826		- /* old mountpoint will be dropped when we can do that */
827		- mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
828		- unhash_mnt(mnt);
	842	+ put_mountpoint(unhash_mnt(mnt));
829	843	}
830	844
831	845	/*
..	..	@@ -837,7 +851,7 @@
837	851	{
838	852	mp->m_count++;
839	853	mnt_add_count(mnt, 1); /* essentially, that's mntget */
840		- child_mnt->mnt_mountpoint = dget(mp->m_dentry);
	854	+ child_mnt->mnt_mountpoint = mp->m_dentry;
841	855	child_mnt->mnt_parent = mnt;
842	856	child_mnt->mnt_mp = mp;
843	857	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
..	..	@@ -864,7 +878,6 @@
864	878	void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
865	879	{
866	880	struct mountpoint *old_mp = mnt->mnt_mp;
867		- struct dentry *old_mountpoint = mnt->mnt_mountpoint;
868	881	struct mount *old_parent = mnt->mnt_parent;
869	882
870	883	list_del_init(&mnt->mnt_child);
..	..	@@ -874,22 +887,6 @@
874	887	attach_mnt(mnt, parent, mp);
875	888
876	889	put_mountpoint(old_mp);
877		-
878		- /*
879		- * Safely avoid even the suggestion this code might sleep or
880		- * lock the mount hash by taking advantage of the knowledge that
881		- * mnt_change_mountpoint will not release the final reference
882		- * to a mountpoint.
883		- *
884		- * During mounting, the mount passed in as the parent mount will
885		- * continue to use the old mountpoint and during unmounting, the
886		- * old mountpoint will continue to exist until namespace_unlock,
887		- * which happens well after mnt_change_mountpoint.
888		- */
889		- spin_lock(&old_mountpoint->d_lock);
890		- old_mountpoint->d_lockref.count--;
891		- spin_unlock(&old_mountpoint->d_lock);
892		-
893	890	mnt_add_count(old_parent, -1);
894	891	}
895	892
..	..	@@ -944,45 +941,80 @@
944	941	return p;
945	942	}
946	943
947		-struct vfsmount *
948		-vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
	944	+/**
	945	+ * vfs_create_mount - Create a mount for a configured superblock
	946	+ * @fc: The configuration context with the superblock attached
	947	+ *
	948	+ * Create a mount to an already configured superblock. If necessary, the
	949	+ * caller should invoke vfs_get_tree() before calling this.
	950	+ *
	951	+ * Note that this does not attach the mount to anything.
	952	+ */
	953	+struct vfsmount *vfs_create_mount(struct fs_context *fc)
949	954	{
950	955	struct mount *mnt;
951		- struct dentry *root;
952	956
953		- if (!type)
954		- return ERR_PTR(-ENODEV);
	957	+ if (!fc->root)
	958	+ return ERR_PTR(-EINVAL);
955	959
956		- mnt = alloc_vfsmnt(name);
	960	+ mnt = alloc_vfsmnt(fc->source ?: "none");
957	961	if (!mnt)
958	962	return ERR_PTR(-ENOMEM);
959	963
960		- if (type->alloc_mnt_data) {
961		- mnt->mnt.data = type->alloc_mnt_data();
962		- if (!mnt->mnt.data) {
963		- mnt_free_id(mnt);
964		- free_vfsmnt(mnt);
965		- return ERR_PTR(-ENOMEM);
966		- }
967		- }
968		- if (flags & SB_KERNMOUNT)
	964	+ if (fc->sb_flags & SB_KERNMOUNT)
969	965	mnt->mnt.mnt_flags = MNT_INTERNAL;
970	966
971		- root = mount_fs(type, flags, name, &mnt->mnt, data);
972		- if (IS_ERR(root)) {
973		- mnt_free_id(mnt);
974		- free_vfsmnt(mnt);
975		- return ERR_CAST(root);
976		- }
	967	+ atomic_inc(&fc->root->d_sb->s_active);
	968	+ mnt->mnt.mnt_sb = fc->root->d_sb;
	969	+ mnt->mnt.mnt_root = dget(fc->root);
	970	+ mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	971	+ mnt->mnt_parent = mnt;
977	972
978		- mnt->mnt.mnt_root = root;
979		- mnt->mnt.mnt_sb = root->d_sb;
980		- mnt->mnt_mountpoint = mnt->mnt.mnt_root;
981		- mnt->mnt_parent = mnt;
982	973	lock_mount_hash();
983		- list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
	974	+ list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
984	975	unlock_mount_hash();
985	976	return &mnt->mnt;
	977	+}
	978	+EXPORT_SYMBOL(vfs_create_mount);
	979	+
	980	+struct vfsmount *fc_mount(struct fs_context *fc)
	981	+{
	982	+ int err = vfs_get_tree(fc);
	983	+ if (!err) {
	984	+ up_write(&fc->root->d_sb->s_umount);
	985	+ return vfs_create_mount(fc);
	986	+ }
	987	+ return ERR_PTR(err);
	988	+}
	989	+EXPORT_SYMBOL(fc_mount);
	990	+
	991	+struct vfsmount *vfs_kern_mount(struct file_system_type *type,
	992	+ int flags, const char *name,
	993	+ void *data)
	994	+{
	995	+ struct fs_context *fc;
	996	+ struct vfsmount *mnt;
	997	+ int ret = 0;
	998	+
	999	+ if (!type)
	1000	+ return ERR_PTR(-EINVAL);
	1001	+
	1002	+ fc = fs_context_for_mount(type, flags);
	1003	+ if (IS_ERR(fc))
	1004	+ return ERR_CAST(fc);
	1005	+
	1006	+ if (name)
	1007	+ ret = vfs_parse_fs_string(fc, "source",
	1008	+ name, strlen(name));
	1009	+ if (!ret)
	1010	+ ret = parse_monolithic_mount_data(fc, data);
	1011	+ if (!ret)
	1012	+ mnt = fc_mount(fc);
	1013	+ else
	1014	+ mnt = ERR_PTR(ret);
	1015	+
	1016	+ put_fs_context(fc);
	1017	+ return mnt;
986	1018	}
987	1019	EXPORT_SYMBOL_GPL(vfs_kern_mount);
988	1020
..	..	@@ -1012,14 +1044,6 @@
1012	1044	if (!mnt)
1013	1045	return ERR_PTR(-ENOMEM);
1014	1046
1015		- if (sb->s_op->clone_mnt_data) {
1016		- mnt->mnt.data = sb->s_op->clone_mnt_data(old->mnt.data);
1017		- if (!mnt->mnt.data) {
1018		- err = -ENOMEM;
1019		- goto out_free;
1020		- }
1021		- }
1022		-
1023	1047	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
1024	1048	mnt->mnt_group_id = 0; /* not a peer of original */
1025	1049	else
..	..	@@ -1033,27 +1057,6 @@
1033	1057
1034	1058	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
1035	1059	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
1036		- /* Don't allow unprivileged users to change mount flags */
1037		- if (flag & CL_UNPRIVILEGED) {
1038		- mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
1039		-
1040		- if (mnt->mnt.mnt_flags & MNT_READONLY)
1041		- mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
1042		-
1043		- if (mnt->mnt.mnt_flags & MNT_NODEV)
1044		- mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
1045		-
1046		- if (mnt->mnt.mnt_flags & MNT_NOSUID)
1047		- mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
1048		-
1049		- if (mnt->mnt.mnt_flags & MNT_NOEXEC)
1050		- mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
1051		- }
1052		-
1053		- /* Don't allow unprivileged users to reveal what is under a mount */
1054		- if ((flag & CL_UNPRIVILEGED) &&
1055		- (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire)))
1056		- mnt->mnt.mnt_flags |= MNT_LOCKED;
1057	1060
1058	1061	atomic_inc(&sb->s_active);
1059	1062	mnt->mnt.mnt_sb = sb;
..	..	@@ -1098,19 +1101,22 @@
1098	1101
1099	1102	static void cleanup_mnt(struct mount *mnt)
1100	1103	{
	1104	+ struct hlist_node *p;
	1105	+ struct mount *m;
1101	1106	/*
1102		- * This probably indicates that somebody messed
1103		- * up a mnt_want/drop_write() pair. If this
1104		- * happens, the filesystem was probably unable
1105		- * to make r/w->r/o transitions.
1106		- */
1107		- /*
	1107	+ * The warning here probably indicates that somebody messed
	1108	+ * up a mnt_want/drop_write() pair. If this happens, the
	1109	+ * filesystem was probably unable to make r/w->r/o transitions.
1108	1110	* The locking used to deal with mnt_count decrement provides barriers,
1109	1111	* so mnt_get_writers() below is safe.
1110	1112	*/
1111	1113	WARN_ON(mnt_get_writers(mnt));
1112	1114	if (unlikely(mnt->mnt_pins.first))
1113	1115	mnt_pin_kill(mnt);
	1116	+ hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
	1117	+ hlist_del(&m->mnt_umount);
	1118	+ mntput(&m->mnt);
	1119	+ }
1114	1120	fsnotify_vfsmount_delete(&mnt->mnt);
1115	1121	dput(mnt->mnt.mnt_root);
1116	1122	deactivate_super(mnt->mnt.mnt_sb);
..	..	@@ -1136,6 +1142,9 @@
1136	1142
1137	1143	static void mntput_no_expire(struct mount *mnt)
1138	1144	{
	1145	+ LIST_HEAD(list);
	1146	+ int count;
	1147	+
1139	1148	rcu_read_lock();
1140	1149	if (likely(READ_ONCE(mnt->mnt_ns))) {
1141	1150	/*
..	..	@@ -1158,7 +1167,9 @@
1158	1167	*/
1159	1168	smp_mb();
1160	1169	mnt_add_count(mnt, -1);
1161		- if (mnt_get_count(mnt)) {
	1170	+ count = mnt_get_count(mnt);
	1171	+ if (count != 0) {
	1172	+ WARN_ON(count < 0);
1162	1173	rcu_read_unlock();
1163	1174	unlock_mount_hash();
1164	1175	return;
..	..	@@ -1176,16 +1187,18 @@
1176	1187	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
1177	1188	struct mount *p, *tmp;
1178	1189	list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
1179		- umount_mnt(p);
	1190	+ __put_mountpoint(unhash_mnt(p), &list);
	1191	+ hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
1180	1192	}
1181	1193	}
1182	1194	unlock_mount_hash();
	1195	+ shrink_dentry_list(&list);
1183	1196
1184	1197	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
1185	1198	struct task_struct *task = current;
1186	1199	if (likely(!(task->flags & PF_KTHREAD))) {
1187	1200	init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
1188		- if (!task_work_add(task, &mnt->mnt_rcu, true))
	1201	+ if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
1189	1202	return;
1190	1203	}
1191	1204	if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
..	..	@@ -1255,46 +1268,71 @@
1255	1268	}
1256	1269
1257	1270	#ifdef CONFIG_PROC_FS
	1271	+static struct mount *mnt_list_next(struct mnt_namespace *ns,
	1272	+ struct list_head *p)
	1273	+{
	1274	+ struct mount *mnt, *ret = NULL;
	1275	+
	1276	+ lock_ns_list(ns);
	1277	+ list_for_each_continue(p, &ns->list) {
	1278	+ mnt = list_entry(p, typeof(*mnt), mnt_list);
	1279	+ if (!mnt_is_cursor(mnt)) {
	1280	+ ret = mnt;
	1281	+ break;
	1282	+ }
	1283	+ }
	1284	+ unlock_ns_list(ns);
	1285	+
	1286	+ return ret;
	1287	+}
	1288	+
1258	1289	/* iterator; we want it to have access to namespace_sem, thus here... */
1259	1290	static void *m_start(struct seq_file *m, loff_t *pos)
1260	1291	{
1261	1292	struct proc_mounts *p = m->private;
	1293	+ struct list_head *prev;
1262	1294
1263	1295	down_read(&namespace_sem);
1264		- if (p->cached_event == p->ns->event) {
1265		- void *v = p->cached_mount;
1266		- if (*pos == p->cached_index)
1267		- return v;
1268		- if (*pos == p->cached_index + 1) {
1269		- v = seq_list_next(v, &p->ns->list, &p->cached_index);
1270		- return p->cached_mount = v;
1271		- }
	1296	+ if (!*pos) {
	1297	+ prev = &p->ns->list;
	1298	+ } else {
	1299	+ prev = &p->cursor.mnt_list;
	1300	+
	1301	+ /* Read after we'd reached the end? */
	1302	+ if (list_empty(prev))
	1303	+ return NULL;
1272	1304	}
1273	1305
1274		- p->cached_event = p->ns->event;
1275		- p->cached_mount = seq_list_start(&p->ns->list, *pos);
1276		- p->cached_index = *pos;
1277		- return p->cached_mount;
	1306	+ return mnt_list_next(p->ns, prev);
1278	1307	}
1279	1308
1280	1309	static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1281	1310	{
1282	1311	struct proc_mounts *p = m->private;
	1312	+ struct mount *mnt = v;
1283	1313
1284		- p->cached_mount = seq_list_next(v, &p->ns->list, pos);
1285		- p->cached_index = *pos;
1286		- return p->cached_mount;
	1314	+ ++*pos;
	1315	+ return mnt_list_next(p->ns, &mnt->mnt_list);
1287	1316	}
1288	1317
1289	1318	static void m_stop(struct seq_file *m, void *v)
1290	1319	{
	1320	+ struct proc_mounts *p = m->private;
	1321	+ struct mount *mnt = v;
	1322	+
	1323	+ lock_ns_list(p->ns);
	1324	+ if (mnt)
	1325	+ list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
	1326	+ else
	1327	+ list_del_init(&p->cursor.mnt_list);
	1328	+ unlock_ns_list(p->ns);
1291	1329	up_read(&namespace_sem);
1292	1330	}
1293	1331
1294	1332	static int m_show(struct seq_file *m, void *v)
1295	1333	{
1296	1334	struct proc_mounts *p = m->private;
1297		- struct mount *r = list_entry(v, struct mount, mnt_list);
	1335	+ struct mount *r = v;
1298	1336	return p->show(m, &r->mnt);
1299	1337	}
1300	1338
..	..	@@ -1304,6 +1342,15 @@
1304	1342	.stop = m_stop,
1305	1343	.show = m_show,
1306	1344	};
	1345	+
	1346	+void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
	1347	+{
	1348	+ down_read(&namespace_sem);
	1349	+ lock_ns_list(ns);
	1350	+ list_del(&cursor->mnt_list);
	1351	+ unlock_ns_list(ns);
	1352	+ up_read(&namespace_sem);
	1353	+}
1307	1354	#endif /* CONFIG_PROC_FS */
1308	1355
1309	1356	/**
..	..	@@ -1365,22 +1412,29 @@
1365	1412
1366	1413	EXPORT_SYMBOL(may_umount);
1367	1414
1368		-static HLIST_HEAD(unmounted); /* protected by namespace_sem */
1369		-
1370	1415	static void namespace_unlock(void)
1371	1416	{
1372	1417	struct hlist_head head;
	1418	+ struct hlist_node *p;
	1419	+ struct mount *m;
	1420	+ LIST_HEAD(list);
1373	1421
1374	1422	hlist_move_list(&unmounted, &head);
	1423	+ list_splice_init(&ex_mountpoints, &list);
1375	1424
1376	1425	up_write(&namespace_sem);
	1426	+
	1427	+ shrink_dentry_list(&list);
1377	1428
1378	1429	if (likely(hlist_empty(&head)))
1379	1430	return;
1380	1431
1381		- synchronize_rcu();
	1432	+ synchronize_rcu_expedited();
1382	1433
1383		- group_pin_kill(&head);
	1434	+ hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
	1435	+ hlist_del(&m->mnt_umount);
	1436	+ mntput(&m->mnt);
	1437	+ }
1384	1438	}
1385	1439
1386	1440	static inline void namespace_lock(void)
..	..	@@ -1466,9 +1520,6 @@
1466	1520	p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1467	1521
1468	1522	disconnect = disconnect_mount(p, how);
1469		-
1470		- pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
1471		- disconnect ? &unmounted : NULL);
1472	1523	if (mnt_has_parent(p)) {
1473	1524	mnt_add_count(p->mnt_parent, -1);
1474	1525	if (!disconnect) {
..	..	@@ -1479,10 +1530,35 @@
1479	1530	}
1480	1531	}
1481	1532	change_mnt_propagation(p, MS_PRIVATE);
	1533	+ if (disconnect)
	1534	+ hlist_add_head(&p->mnt_umount, &unmounted);
1482	1535	}
1483	1536	}
1484	1537
1485	1538	static void shrink_submounts(struct mount *mnt);
	1539	+
	1540	+static int do_umount_root(struct super_block *sb)
	1541	+{
	1542	+ int ret = 0;
	1543	+
	1544	+ down_write(&sb->s_umount);
	1545	+ if (!sb_rdonly(sb)) {
	1546	+ struct fs_context *fc;
	1547	+
	1548	+ fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
	1549	+ SB_RDONLY);
	1550	+ if (IS_ERR(fc)) {
	1551	+ ret = PTR_ERR(fc);
	1552	+ } else {
	1553	+ ret = parse_monolithic_mount_data(fc, NULL);
	1554	+ if (!ret)
	1555	+ ret = reconfigure_super(fc);
	1556	+ put_fs_context(fc);
	1557	+ }
	1558	+ }
	1559	+ up_write(&sb->s_umount);
	1560	+ return ret;
	1561	+}
1486	1562
1487	1563	static int do_umount(struct mount *mnt, int flags)
1488	1564	{
..	..	@@ -1549,11 +1625,7 @@
1549	1625	*/
1550	1626	if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
1551	1627	return -EPERM;
1552		- down_write(&sb->s_umount);
1553		- if (!sb_rdonly(sb))
1554		- retval = do_remount_sb(sb, SB_RDONLY, NULL, 0);
1555		- up_write(&sb->s_umount);
1556		- return retval;
	1628	+ return do_umount_root(sb);
1557	1629	}
1558	1630
1559	1631	namespace_lock();
..	..	@@ -1602,15 +1674,15 @@
1602	1674	namespace_lock();
1603	1675	lock_mount_hash();
1604	1676	mp = lookup_mountpoint(dentry);
1605		- if (IS_ERR_OR_NULL(mp))
	1677	+ if (!mp)
1606	1678	goto out_unlock;
1607	1679
1608	1680	event++;
1609	1681	while (!hlist_empty(&mp->m_list)) {
1610	1682	mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
1611	1683	if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
1612		- hlist_add_head(&mnt->mnt_umount.s_list, &unmounted);
1613	1684	umount_mnt(mnt);
	1685	+ hlist_add_head(&mnt->mnt_umount, &unmounted);
1614	1686	}
1615	1687	else umount_tree(mnt, UMOUNT_CONNECTED);
1616	1688	}
..	..	@@ -1645,52 +1717,55 @@
1645	1717	}
1646	1718	#endif
1647	1719
1648		-/*
1649		- * Now umount can handle mount points as well as block devices.
1650		- * This is important for filesystems which use unnamed block devices.
1651		- *
1652		- * We now support a flag for forced unmount like the other 'big iron'
1653		- * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
1654		- */
1655		-
1656		-int ksys_umount(char __user *name, int flags)
	1720	+static int can_umount(const struct path *path, int flags)
1657	1721	{
1658		- struct path path;
1659		- struct mount *mnt;
1660		- int retval;
1661		- int lookup_flags = 0;
1662		-
1663		- if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1664		- return -EINVAL;
	1722	+ struct mount *mnt = real_mount(path->mnt);
1665	1723
1666	1724	if (!may_mount())
1667	1725	return -EPERM;
	1726	+ if (path->dentry != path->mnt->mnt_root)
	1727	+ return -EINVAL;
	1728	+ if (!check_mnt(mnt))
	1729	+ return -EINVAL;
	1730	+ if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
	1731	+ return -EINVAL;
	1732	+ if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
	1733	+ return -EPERM;
	1734	+ return 0;
	1735	+}
	1736	+
	1737	+// caller is responsible for flags being sane
	1738	+int path_umount(struct path *path, int flags)
	1739	+{
	1740	+ struct mount *mnt = real_mount(path->mnt);
	1741	+ int ret;
	1742	+
	1743	+ ret = can_umount(path, flags);
	1744	+ if (!ret)
	1745	+ ret = do_umount(mnt, flags);
	1746	+
	1747	+ /* we mustn't call path_put() as that would clear mnt_expiry_mark */
	1748	+ dput(path->dentry);
	1749	+ mntput_no_expire(mnt);
	1750	+ return ret;
	1751	+}
	1752	+
	1753	+static int ksys_umount(char __user *name, int flags)
	1754	+{
	1755	+ int lookup_flags = LOOKUP_MOUNTPOINT;
	1756	+ struct path path;
	1757	+ int ret;
	1758	+
	1759	+ // basic validity checks done first
	1760	+ if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
	1761	+ return -EINVAL;
1668	1762
1669	1763	if (!(flags & UMOUNT_NOFOLLOW))
1670	1764	lookup_flags |= LOOKUP_FOLLOW;
1671		-
1672		- retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
1673		- if (retval)
1674		- goto out;
1675		- mnt = real_mount(path.mnt);
1676		- retval = -EINVAL;
1677		- if (path.dentry != path.mnt->mnt_root)
1678		- goto dput_and_out;
1679		- if (!check_mnt(mnt))
1680		- goto dput_and_out;
1681		- if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
1682		- goto dput_and_out;
1683		- retval = -EPERM;
1684		- if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
1685		- goto dput_and_out;
1686		-
1687		- retval = do_umount(mnt, flags);
1688		-dput_and_out:
1689		- /* we mustn't call path_put() as that would clear mnt_expiry_mark */
1690		- dput(path.dentry);
1691		- mntput_no_expire(mnt);
1692		-out:
1693		- return retval;
	1765	+ ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
	1766	+ if (ret)
	1767	+ return ret;
	1768	+ return path_umount(&path, flags);
1694	1769	}
1695	1770
1696	1771	SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
..	..	@@ -1717,9 +1792,14 @@
1717	1792	dentry->d_fsdata == &mntns_operations;
1718	1793	}
1719	1794
1720		-struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
	1795	+static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
1721	1796	{
1722	1797	return container_of(ns, struct mnt_namespace, ns);
	1798	+}
	1799	+
	1800	+struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
	1801	+{
	1802	+ return &mnt->ns;
1723	1803	}
1724	1804
1725	1805	static bool mnt_ns_loop(struct dentry *dentry)
..	..	@@ -1817,6 +1897,27 @@
1817	1897	return &tree->mnt;
1818	1898	}
1819	1899
	1900	+static void free_mnt_ns(struct mnt_namespace *);
	1901	+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
	1902	+
	1903	+void dissolve_on_fput(struct vfsmount *mnt)
	1904	+{
	1905	+ struct mnt_namespace *ns;
	1906	+ namespace_lock();
	1907	+ lock_mount_hash();
	1908	+ ns = real_mount(mnt)->mnt_ns;
	1909	+ if (ns) {
	1910	+ if (is_anon_ns(ns))
	1911	+ umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
	1912	+ else
	1913	+ ns = NULL;
	1914	+ }
	1915	+ unlock_mount_hash();
	1916	+ namespace_unlock();
	1917	+ if (ns)
	1918	+ free_mnt_ns(ns);
	1919	+}
	1920	+
1820	1921	void drop_collected_mounts(struct vfsmount *mnt)
1821	1922	{
1822	1923	namespace_lock();
..	..	@@ -1870,6 +1971,9 @@
1870	1971	if (IS_ERR(new_mnt))
1871	1972	return ERR_CAST(new_mnt);
1872	1973
	1974	+ /* Longterm mount to be removed by kern_unmount*() */
	1975	+ new_mnt->mnt_ns = MNT_NS_INTERNAL;
	1976	+
1873	1977	return &new_mnt->mnt;
1874	1978
1875	1979	invalid:
..	..	@@ -1891,6 +1995,33 @@
1891	1995	return res;
1892	1996	}
1893	1997	return 0;
	1998	+}
	1999	+
	2000	+static void lock_mnt_tree(struct mount *mnt)
	2001	+{
	2002	+ struct mount *p;
	2003	+
	2004	+ for (p = mnt; p; p = next_mnt(p, mnt)) {
	2005	+ int flags = p->mnt.mnt_flags;
	2006	+ /* Don't allow unprivileged users to change mount flags */
	2007	+ flags |= MNT_LOCK_ATIME;
	2008	+
	2009	+ if (flags & MNT_READONLY)
	2010	+ flags |= MNT_LOCK_READONLY;
	2011	+
	2012	+ if (flags & MNT_NODEV)
	2013	+ flags |= MNT_LOCK_NODEV;
	2014	+
	2015	+ if (flags & MNT_NOSUID)
	2016	+ flags |= MNT_LOCK_NOSUID;
	2017	+
	2018	+ if (flags & MNT_NOEXEC)
	2019	+ flags |= MNT_LOCK_NOEXEC;
	2020	+ /* Don't allow unprivileged users to reveal what is under a mount */
	2021	+ if (list_empty(&p->mnt_expire))
	2022	+ flags |= MNT_LOCKED;
	2023	+ p->mnt.mnt_flags = flags;
	2024	+ }
1894	2025	}
1895	2026
1896	2027	static void cleanup_group_ids(struct mount *mnt, struct mount *end)
..	..	@@ -2008,8 +2139,9 @@
2008	2139	static int attach_recursive_mnt(struct mount *source_mnt,
2009	2140	struct mount *dest_mnt,
2010	2141	struct mountpoint *dest_mp,
2011		- struct path *parent_path)
	2142	+ bool moving)
2012	2143	{
	2144	+ struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2013	2145	HLIST_HEAD(tree_list);
2014	2146	struct mnt_namespace *ns = dest_mnt->mnt_ns;
2015	2147	struct mountpoint *smp;
..	..	@@ -2025,7 +2157,7 @@
2025	2157	return PTR_ERR(smp);
2026	2158
2027	2159	/* Is there space to add these mounts to the mount namespace? */
2028		- if (!parent_path) {
	2160	+ if (!moving) {
2029	2161	err = count_mounts(ns, source_mnt);
2030	2162	if (err)
2031	2163	goto out;
..	..	@@ -2044,11 +2176,15 @@
2044	2176	} else {
2045	2177	lock_mount_hash();
2046	2178	}
2047		- if (parent_path) {
2048		- detach_mnt(source_mnt, parent_path);
	2179	+ if (moving) {
	2180	+ unhash_mnt(source_mnt);
2049	2181	attach_mnt(source_mnt, dest_mnt, dest_mp);
2050	2182	touch_mnt_namespace(source_mnt->mnt_ns);
2051	2183	} else {
	2184	+ if (source_mnt->mnt_ns) {
	2185	+ /* move from anon - the caller will destroy */
	2186	+ list_del_init(&source_mnt->mnt_ns->list);
	2187	+ }
2052	2188	mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
2053	2189	commit_tree(source_mnt);
2054	2190	}
..	..	@@ -2060,6 +2196,10 @@
2060	2196	child->mnt_mountpoint);
2061	2197	if (q)
2062	2198	mnt_change_mountpoint(child, smp, q);
	2199	+ /* Notice when we are propagating across user namespaces */
	2200	+ if (child->mnt_parent->mnt_ns->user_ns != user_ns)
	2201	+ lock_mnt_tree(child);
	2202	+ child->mnt.mnt_flags &= ~MNT_LOCKED;
2063	2203	commit_tree(child);
2064	2204	}
2065	2205	put_mountpoint(smp);
..	..	@@ -2135,7 +2275,7 @@
2135	2275	d_is_dir(mnt->mnt.mnt_root))
2136	2276	return -ENOTDIR;
2137	2277
2138		- return attach_recursive_mnt(mnt, p, mp, NULL);
	2278	+ return attach_recursive_mnt(mnt, p, mp, false);
2139	2279	}
2140	2280
2141	2281	/*
..	..	@@ -2190,6 +2330,30 @@
2190	2330	return err;
2191	2331	}
2192	2332
	2333	+static struct mount *__do_loopback(struct path *old_path, int recurse)
	2334	+{
	2335	+ struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
	2336	+
	2337	+ if (IS_MNT_UNBINDABLE(old))
	2338	+ return mnt;
	2339	+
	2340	+ if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
	2341	+ return mnt;
	2342	+
	2343	+ if (!recurse && has_locked_children(old, old_path->dentry))
	2344	+ return mnt;
	2345	+
	2346	+ if (recurse)
	2347	+ mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
	2348	+ else
	2349	+ mnt = clone_mnt(old, old_path->dentry, 0);
	2350	+
	2351	+ if (!IS_ERR(mnt))
	2352	+ mnt->mnt.mnt_flags &= ~MNT_LOCKED;
	2353	+
	2354	+ return mnt;
	2355	+}
	2356	+
2193	2357	/*
2194	2358	* do loopback mount.
2195	2359	*/
..	..	@@ -2197,7 +2361,7 @@
2197	2361	int recurse)
2198	2362	{
2199	2363	struct path old_path;
2200		- struct mount *mnt = NULL, *old, *parent;
	2364	+ struct mount mnt = NULL, parent;
2201	2365	struct mountpoint *mp;
2202	2366	int err;
2203	2367	if (!old_name \|\| !*old_name)
..	..	@@ -2211,37 +2375,20 @@
2211	2375	goto out;
2212	2376
2213	2377	mp = lock_mount(path);
2214		- err = PTR_ERR(mp);
2215		- if (IS_ERR(mp))
	2378	+ if (IS_ERR(mp)) {
	2379	+ err = PTR_ERR(mp);
2216	2380	goto out;
	2381	+ }
2217	2382
2218		- old = real_mount(old_path.mnt);
2219	2383	parent = real_mount(path->mnt);
2220		-
2221		- err = -EINVAL;
2222		- if (IS_MNT_UNBINDABLE(old))
2223		- goto out2;
2224		-
2225	2384	if (!check_mnt(parent))
2226	2385	goto out2;
2227	2386
2228		- if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
2229		- goto out2;
2230		-
2231		- if (!recurse && has_locked_children(old, old_path.dentry))
2232		- goto out2;
2233		-
2234		- if (recurse)
2235		- mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
2236		- else
2237		- mnt = clone_mnt(old, old_path.dentry, 0);
2238		-
	2387	+ mnt = __do_loopback(&old_path, recurse);
2239	2388	if (IS_ERR(mnt)) {
2240	2389	err = PTR_ERR(mnt);
2241	2390	goto out2;
2242	2391	}
2243		-
2244		- mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2245	2392
2246	2393	err = graft_tree(mnt, parent, mp);
2247	2394	if (err) {
..	..	@@ -2256,21 +2403,206 @@
2256	2403	return err;
2257	2404	}
2258	2405
2259		-static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
	2406	+static struct file *open_detached_copy(struct path *path, bool recursive)
2260	2407	{
2261		- int error = 0;
2262		- int readonly_request = 0;
	2408	+ struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
	2409	+ struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
	2410	+ struct mount *mnt, *p;
	2411	+ struct file *file;
2263	2412
2264		- if (ms_flags & MS_RDONLY)
2265		- readonly_request = 1;
2266		- if (readonly_request == __mnt_is_readonly(mnt))
	2413	+ if (IS_ERR(ns))
	2414	+ return ERR_CAST(ns);
	2415	+
	2416	+ namespace_lock();
	2417	+ mnt = __do_loopback(path, recursive);
	2418	+ if (IS_ERR(mnt)) {
	2419	+ namespace_unlock();
	2420	+ free_mnt_ns(ns);
	2421	+ return ERR_CAST(mnt);
	2422	+ }
	2423	+
	2424	+ lock_mount_hash();
	2425	+ for (p = mnt; p; p = next_mnt(p, mnt)) {
	2426	+ p->mnt_ns = ns;
	2427	+ ns->mounts++;
	2428	+ }
	2429	+ ns->root = mnt;
	2430	+ list_add_tail(&ns->list, &mnt->mnt_list);
	2431	+ mntget(&mnt->mnt);
	2432	+ unlock_mount_hash();
	2433	+ namespace_unlock();
	2434	+
	2435	+ mntput(path->mnt);
	2436	+ path->mnt = &mnt->mnt;
	2437	+ file = dentry_open(path, O_PATH, current_cred());
	2438	+ if (IS_ERR(file))
	2439	+ dissolve_on_fput(path->mnt);
	2440	+ else
	2441	+ file->f_mode \|= FMODE_NEED_UNMOUNT;
	2442	+ return file;
	2443	+}
	2444	+
	2445	+SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
	2446	+{
	2447	+ struct file *file;
	2448	+ struct path path;
	2449	+ int lookup_flags = LOOKUP_AUTOMOUNT \| LOOKUP_FOLLOW;
	2450	+ bool detached = flags & OPEN_TREE_CLONE;
	2451	+ int error;
	2452	+ int fd;
	2453	+
	2454	+ BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
	2455	+
	2456	+ if (flags & ~(AT_EMPTY_PATH \| AT_NO_AUTOMOUNT \| AT_RECURSIVE \|
	2457	+ AT_SYMLINK_NOFOLLOW \| OPEN_TREE_CLONE \|
	2458	+ OPEN_TREE_CLOEXEC))
	2459	+ return -EINVAL;
	2460	+
	2461	+ if ((flags & (AT_RECURSIVE \| OPEN_TREE_CLONE)) == AT_RECURSIVE)
	2462	+ return -EINVAL;
	2463	+
	2464	+ if (flags & AT_NO_AUTOMOUNT)
	2465	+ lookup_flags &= ~LOOKUP_AUTOMOUNT;
	2466	+ if (flags & AT_SYMLINK_NOFOLLOW)
	2467	+ lookup_flags &= ~LOOKUP_FOLLOW;
	2468	+ if (flags & AT_EMPTY_PATH)
	2469	+ lookup_flags \|= LOOKUP_EMPTY;
	2470	+
	2471	+ if (detached && !may_mount())
	2472	+ return -EPERM;
	2473	+
	2474	+ fd = get_unused_fd_flags(flags & O_CLOEXEC);
	2475	+ if (fd < 0)
	2476	+ return fd;
	2477	+
	2478	+ error = user_path_at(dfd, filename, lookup_flags, &path);
	2479	+ if (unlikely(error)) {
	2480	+ file = ERR_PTR(error);
	2481	+ } else {
	2482	+ if (detached)
	2483	+ file = open_detached_copy(&path, flags & AT_RECURSIVE);
	2484	+ else
	2485	+ file = dentry_open(&path, O_PATH, current_cred());
	2486	+ path_put(&path);
	2487	+ }
	2488	+ if (IS_ERR(file)) {
	2489	+ put_unused_fd(fd);
	2490	+ return PTR_ERR(file);
	2491	+ }
	2492	+ fd_install(fd, file);
	2493	+ return fd;
	2494	+}
	2495	+
	2496	+/*
	2497	+ * Don't allow locked mount flags to be cleared.
	2498	+ *
	2499	+ * No locks need to be held here while testing the various MNT_LOCK
	2500	+ * flags because those flags can never be cleared once they are set.
	2501	+ */
	2502	+static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
	2503	+{
	2504	+ unsigned int fl = mnt->mnt.mnt_flags;
	2505	+
	2506	+ if ((fl & MNT_LOCK_READONLY) &&
	2507	+ !(mnt_flags & MNT_READONLY))
	2508	+ return false;
	2509	+
	2510	+ if ((fl & MNT_LOCK_NODEV) &&
	2511	+ !(mnt_flags & MNT_NODEV))
	2512	+ return false;
	2513	+
	2514	+ if ((fl & MNT_LOCK_NOSUID) &&
	2515	+ !(mnt_flags & MNT_NOSUID))
	2516	+ return false;
	2517	+
	2518	+ if ((fl & MNT_LOCK_NOEXEC) &&
	2519	+ !(mnt_flags & MNT_NOEXEC))
	2520	+ return false;
	2521	+
	2522	+ if ((fl & MNT_LOCK_ATIME) &&
	2523	+ ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
	2524	+ return false;
	2525	+
	2526	+ return true;
	2527	+}
	2528	+
	2529	+static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
	2530	+{
	2531	+ bool readonly_request = (mnt_flags & MNT_READONLY);
	2532	+
	2533	+ if (readonly_request == __mnt_is_readonly(&mnt->mnt))
2267	2534	return 0;
2268	2535
2269	2536	if (readonly_request)
2270		- error = mnt_make_readonly(real_mount(mnt));
2271		- else
2272		- __mnt_unmake_readonly(real_mount(mnt));
2273		- return error;
	2537	+ return mnt_make_readonly(mnt);
	2538	+
	2539	+ return __mnt_unmake_readonly(mnt);
	2540	+}
	2541	+
	2542	+/*
	2543	+ * Update the user-settable attributes on a mount. The caller must hold
	2544	+ * sb->s_umount for writing.
	2545	+ */
	2546	+static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
	2547	+{
	2548	+ lock_mount_hash();
	2549	+ mnt_flags \|= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
	2550	+ mnt->mnt.mnt_flags = mnt_flags;
	2551	+ touch_mnt_namespace(mnt->mnt_ns);
	2552	+ unlock_mount_hash();
	2553	+}
	2554	+
	2555	+static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
	2556	+{
	2557	+ struct super_block *sb = mnt->mnt_sb;
	2558	+
	2559	+ if (!__mnt_is_readonly(mnt) &&
	2560	+ (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
	2561	+ char *buf = (char *)__get_free_page(GFP_KERNEL);
	2562	+ char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
	2563	+ struct tm tm;
	2564	+
	2565	+ time64_to_tm(sb->s_time_max, 0, &tm);
	2566	+
	2567	+ pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n",
	2568	+ sb->s_type->name,
	2569	+ is_mounted(mnt) ? "remounted" : "mounted",
	2570	+ mntpath,
	2571	+ tm.tm_year+1900, (unsigned long long)sb->s_time_max);
	2572	+
	2573	+ free_page((unsigned long)buf);
	2574	+ }
	2575	+}
	2576	+
	2577	+/*
	2578	+ * Handle reconfiguration of the mountpoint only without alteration of the
	2579	+ * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND
	2580	+ * to mount(2).
	2581	+ */
	2582	+static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
	2583	+{
	2584	+ struct super_block *sb = path->mnt->mnt_sb;
	2585	+ struct mount *mnt = real_mount(path->mnt);
	2586	+ int ret;
	2587	+
	2588	+ if (!check_mnt(mnt))
	2589	+ return -EINVAL;
	2590	+
	2591	+ if (path->dentry != mnt->mnt.mnt_root)
	2592	+ return -EINVAL;
	2593	+
	2594	+ if (!can_change_locked_flags(mnt, mnt_flags))
	2595	+ return -EPERM;
	2596	+
	2597	+ down_write(&sb->s_umount);
	2598	+ ret = change_mount_ro_state(mnt, mnt_flags);
	2599	+ if (ret == 0)
	2600	+ set_mount_attributes(mnt, mnt_flags);
	2601	+ up_write(&sb->s_umount);
	2602	+
	2603	+ mnt_warn_timestamp_expiry(path, &mnt->mnt);
	2604	+
	2605	+ return ret;
2274	2606	}
2275	2607
2276	2608	/*
..	..	@@ -2284,6 +2616,7 @@
2284	2616	int err;
2285	2617	struct super_block *sb = path->mnt->mnt_sb;
2286	2618	struct mount *mnt = real_mount(path->mnt);
	2619	+ struct fs_context *fc;
2287	2620
2288	2621	if (!check_mnt(mnt))
2289	2622	return -EINVAL;
..	..	@@ -2291,58 +2624,29 @@
2291	2624	if (path->dentry != path->mnt->mnt_root)
2292	2625	return -EINVAL;
2293	2626
2294		- /* Don't allow changing of locked mnt flags.
2295		- *
2296		- * No locks need to be held here while testing the various
2297		- * MNT_LOCK flags because those flags can never be cleared
2298		- * once they are set.
2299		- */
2300		- if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
2301		- !(mnt_flags & MNT_READONLY)) {
	2627	+ if (!can_change_locked_flags(mnt, mnt_flags))
2302	2628	return -EPERM;
2303		- }
2304		- if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
2305		- !(mnt_flags & MNT_NODEV)) {
2306		- return -EPERM;
2307		- }
2308		- if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
2309		- !(mnt_flags & MNT_NOSUID)) {
2310		- return -EPERM;
2311		- }
2312		- if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
2313		- !(mnt_flags & MNT_NOEXEC)) {
2314		- return -EPERM;
2315		- }
2316		- if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
2317		- ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
2318		- return -EPERM;
2319		- }
2320	2629
2321		- err = security_sb_remount(sb, data);
2322		- if (err)
2323		- return err;
	2630	+ fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
	2631	+ if (IS_ERR(fc))
	2632	+ return PTR_ERR(fc);
2324	2633
2325		- down_write(&sb->s_umount);
2326		- if (ms_flags & MS_BIND)
2327		- err = change_mount_flags(path->mnt, ms_flags);
2328		- else if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
2329		- err = -EPERM;
2330		- else {
2331		- err = do_remount_sb2(path->mnt, sb, sb_flags, data, 0);
2332		- namespace_lock();
2333		- lock_mount_hash();
2334		- propagate_remount(mnt);
2335		- unlock_mount_hash();
2336		- namespace_unlock();
2337		- }
	2634	+ fc->oldapi = true;
	2635	+ err = parse_monolithic_mount_data(fc, data);
2338	2636	if (!err) {
2339		- lock_mount_hash();
2340		- mnt_flags \|= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
2341		- mnt->mnt.mnt_flags = mnt_flags;
2342		- touch_mnt_namespace(mnt->mnt_ns);
2343		- unlock_mount_hash();
	2637	+ down_write(&sb->s_umount);
	2638	+ err = -EPERM;
	2639	+ if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
	2640	+ err = reconfigure_super(fc);
	2641	+ if (!err)
	2642	+ set_mount_attributes(mnt, mnt_flags);
	2643	+ }
	2644	+ up_write(&sb->s_umount);
2344	2645	}
2345		- up_write(&sb->s_umount);
	2646	+
	2647	+ mnt_warn_timestamp_expiry(path, &mnt->mnt);
	2648	+
	2649	+ put_fs_context(fc);
2346	2650	return err;
2347	2651	}
2348	2652
..	..	@@ -2356,144 +2660,200 @@
2356	2660	return 0;
2357	2661	}
2358	2662
2359		-static int do_move_mount(struct path *path, const char *old_name)
	2663	+/*
	2664	+ * Check that there aren't references to earlier/same mount namespaces in the
	2665	+ * specified subtree. Such references can act as pins for mount namespaces
	2666	+ * that aren't checked by the mount-cycle checking code, thereby allowing
	2667	+ * cycles to be made.
	2668	+ */
	2669	+static bool check_for_nsfs_mounts(struct mount *subtree)
2360	2670	{
2361		- struct path old_path, parent_path;
	2671	+ struct mount *p;
	2672	+ bool ret = false;
	2673	+
	2674	+ lock_mount_hash();
	2675	+ for (p = subtree; p; p = next_mnt(p, subtree))
	2676	+ if (mnt_ns_loop(p->mnt.mnt_root))
	2677	+ goto out;
	2678	+
	2679	+ ret = true;
	2680	+out:
	2681	+ unlock_mount_hash();
	2682	+ return ret;
	2683	+}
	2684	+
	2685	+static int do_move_mount(struct path *old_path, struct path *new_path)
	2686	+{
	2687	+ struct mnt_namespace *ns;
2362	2688	struct mount *p;
2363	2689	struct mount *old;
2364		- struct mountpoint *mp;
	2690	+ struct mount *parent;
	2691	+ struct mountpoint *mp, *old_mp;
2365	2692	int err;
2366		- if (!old_name \|\| !*old_name)
2367		- return -EINVAL;
2368		- err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
2369		- if (err)
2370		- return err;
	2693	+ bool attached;
2371	2694
2372		- mp = lock_mount(path);
2373		- err = PTR_ERR(mp);
	2695	+ mp = lock_mount(new_path);
2374	2696	if (IS_ERR(mp))
	2697	+ return PTR_ERR(mp);
	2698	+
	2699	+ old = real_mount(old_path->mnt);
	2700	+ p = real_mount(new_path->mnt);
	2701	+ parent = old->mnt_parent;
	2702	+ attached = mnt_has_parent(old);
	2703	+ old_mp = old->mnt_mp;
	2704	+ ns = old->mnt_ns;
	2705	+
	2706	+ err = -EINVAL;
	2707	+ /* The mountpoint must be in our namespace. */
	2708	+ if (!check_mnt(p))
2375	2709	goto out;
2376	2710
2377		- old = real_mount(old_path.mnt);
2378		- p = real_mount(path->mnt);
	2711	+ /* The thing moved must be mounted... */
	2712	+ if (!is_mounted(&old->mnt))
	2713	+ goto out;
2379	2714
2380		- err = -EINVAL;
2381		- if (!check_mnt(p) \|\| !check_mnt(old))
2382		- goto out1;
	2715	+ /* ... and either ours or the root of anon namespace */
	2716	+ if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
	2717	+ goto out;
2383	2718
2384	2719	if (old->mnt.mnt_flags & MNT_LOCKED)
2385		- goto out1;
	2720	+ goto out;
2386	2721
2387		- err = -EINVAL;
2388		- if (old_path.dentry != old_path.mnt->mnt_root)
2389		- goto out1;
	2722	+ if (old_path->dentry != old_path->mnt->mnt_root)
	2723	+ goto out;
2390	2724
2391		- if (!mnt_has_parent(old))
2392		- goto out1;
2393		-
2394		- if (d_is_dir(path->dentry) !=
2395		- d_is_dir(old_path.dentry))
2396		- goto out1;
	2725	+ if (d_is_dir(new_path->dentry) !=
	2726	+ d_is_dir(old_path->dentry))
	2727	+ goto out;
2397	2728	/*
2398	2729	* Don't move a mount residing in a shared parent.
2399	2730	*/
2400		- if (IS_MNT_SHARED(old->mnt_parent))
2401		- goto out1;
	2731	+ if (attached && IS_MNT_SHARED(parent))
	2732	+ goto out;
2402	2733	/*
2403	2734	* Don't move a mount tree containing unbindable mounts to a destination
2404	2735	* mount which is shared.
2405	2736	*/
2406	2737	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
2407		- goto out1;
	2738	+ goto out;
2408	2739	err = -ELOOP;
	2740	+ if (!check_for_nsfs_mounts(old))
	2741	+ goto out;
2409	2742	for (; mnt_has_parent(p); p = p->mnt_parent)
2410	2743	if (p == old)
2411		- goto out1;
	2744	+ goto out;
2412	2745
2413		- err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
	2746	+ err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
	2747	+ attached);
2414	2748	if (err)
2415		- goto out1;
	2749	+ goto out;
2416	2750
2417	2751	/* if the mount is moved, it should no longer be expired
2418	2752	* automatically */
2419	2753	list_del_init(&old->mnt_expire);
2420		-out1:
2421		- unlock_mount(mp);
	2754	+ if (attached)
	2755	+ put_mountpoint(old_mp);
2422	2756	out:
2423		- if (!err)
2424		- path_put(&parent_path);
2425		- path_put(&old_path);
	2757	+ unlock_mount(mp);
	2758	+ if (!err) {
	2759	+ if (attached)
	2760	+ mntput_no_expire(parent);
	2761	+ else
	2762	+ free_mnt_ns(ns);
	2763	+ }
2426	2764	return err;
2427	2765	}
2428	2766
2429		-static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
	2767	+static int do_move_mount_old(struct path *path, const char *old_name)
2430	2768	{
	2769	+ struct path old_path;
2431	2770	int err;
2432		- const char *subtype = strchr(fstype, '.');
2433		- if (subtype) {
2434		- subtype++;
2435		- err = -EINVAL;
2436		- if (!subtype[0])
2437		- goto err;
2438		- } else
2439		- subtype = "";
2440	2771
2441		- mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
2442		- err = -ENOMEM;
2443		- if (!mnt->mnt_sb->s_subtype)
2444		- goto err;
2445		- return mnt;
	2772	+ if (!old_name \|\| !*old_name)
	2773	+ return -EINVAL;
2446	2774
2447		- err:
2448		- mntput(mnt);
2449		- return ERR_PTR(err);
	2775	+ err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
	2776	+ if (err)
	2777	+ return err;
	2778	+
	2779	+ err = do_move_mount(&old_path, path);
	2780	+ path_put(&old_path);
	2781	+ return err;
2450	2782	}
2451	2783
2452	2784	/*
2453	2785	* add a mount into a namespace's mount tree
2454	2786	*/
2455		-static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
	2787	+static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
	2788	+ struct path *path, int mnt_flags)
2456	2789	{
2457		- struct mountpoint *mp;
2458		- struct mount *parent;
2459		- int err;
	2790	+ struct mount *parent = real_mount(path->mnt);
2460	2791
2461	2792	mnt_flags &= ~MNT_INTERNAL_FLAGS;
2462	2793
2463		- mp = lock_mount(path);
2464		- if (IS_ERR(mp))
2465		- return PTR_ERR(mp);
2466		-
2467		- parent = real_mount(path->mnt);
2468		- err = -EINVAL;
2469	2794	if (unlikely(!check_mnt(parent))) {
2470	2795	/* that's acceptable only for automounts done in private ns */
2471	2796	if (!(mnt_flags & MNT_SHRINKABLE))
2472		- goto unlock;
	2797	+ return -EINVAL;
2473	2798	/* ... and for those we'd better have mountpoint still alive */
2474	2799	if (!parent->mnt_ns)
2475		- goto unlock;
	2800	+ return -EINVAL;
2476	2801	}
2477	2802
2478	2803	/* Refuse the same filesystem on the same mount point */
2479		- err = -EBUSY;
2480	2804	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
2481	2805	path->mnt->mnt_root == path->dentry)
2482		- goto unlock;
	2806	+ return -EBUSY;
2483	2807
2484		- err = -EINVAL;
2485	2808	if (d_is_symlink(newmnt->mnt.mnt_root))
2486		- goto unlock;
	2809	+ return -EINVAL;
2487	2810
2488	2811	newmnt->mnt.mnt_flags = mnt_flags;
2489		- err = graft_tree(newmnt, parent, mp);
2490		-
2491		-unlock:
2492		- unlock_mount(mp);
2493		- return err;
	2812	+ return graft_tree(newmnt, parent, mp);
2494	2813	}
2495	2814
2496		-static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags);
	2815	+static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
	2816	+
	2817	+/*
	2818	+ * Create a new mount using a superblock configuration and request it
	2819	+ * be added to the namespace tree.
	2820	+ */
	2821	+static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
	2822	+ unsigned int mnt_flags)
	2823	+{
	2824	+ struct vfsmount *mnt;
	2825	+ struct mountpoint *mp;
	2826	+ struct super_block *sb = fc->root->d_sb;
	2827	+ int error;
	2828	+
	2829	+ error = security_sb_kern_mount(sb);
	2830	+ if (!error && mount_too_revealing(sb, &mnt_flags))
	2831	+ error = -EPERM;
	2832	+
	2833	+ if (unlikely(error)) {
	2834	+ fc_drop_locked(fc);
	2835	+ return error;
	2836	+ }
	2837	+
	2838	+ up_write(&sb->s_umount);
	2839	+
	2840	+ mnt = vfs_create_mount(fc);
	2841	+ if (IS_ERR(mnt))
	2842	+ return PTR_ERR(mnt);
	2843	+
	2844	+ mnt_warn_timestamp_expiry(mountpoint, mnt);
	2845	+
	2846	+ mp = lock_mount(mountpoint);
	2847	+ if (IS_ERR(mp)) {
	2848	+ mntput(mnt);
	2849	+ return PTR_ERR(mp);
	2850	+ }
	2851	+ error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
	2852	+ unlock_mount(mp);
	2853	+ if (error < 0)
	2854	+ mntput(mnt);
	2855	+ return error;
	2856	+}
2497	2857
2498	2858	/*
2499	2859	* create a new mount for userspace and request it to be added into the
..	..	@@ -2503,8 +2863,9 @@
2503	2863	int mnt_flags, const char *name, void *data)
2504	2864	{
2505	2865	struct file_system_type *type;
2506		- struct vfsmount *mnt;
2507		- int err;
	2866	+ struct fs_context *fc;
	2867	+ const char *subtype = NULL;
	2868	+ int err = 0;
2508	2869
2509	2870	if (!fstype)
2510	2871	return -EINVAL;
..	..	@@ -2513,45 +2874,99 @@
2513	2874	if (!type)
2514	2875	return -ENODEV;
2515	2876
2516		- mnt = vfs_kern_mount(type, sb_flags, name, data);
2517		- if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
2518		- !mnt->mnt_sb->s_subtype)
2519		- mnt = fs_set_subtype(mnt, fstype);
2520		-
2521		- put_filesystem(type);
2522		- if (IS_ERR(mnt))
2523		- return PTR_ERR(mnt);
2524		-
2525		- if (mount_too_revealing(mnt, &mnt_flags)) {
2526		- mntput(mnt);
2527		- return -EPERM;
	2877	+ if (type->fs_flags & FS_HAS_SUBTYPE) {
	2878	+ subtype = strchr(fstype, '.');
	2879	+ if (subtype) {
	2880	+ subtype++;
	2881	+ if (!*subtype) {
	2882	+ put_filesystem(type);
	2883	+ return -EINVAL;
	2884	+ }
	2885	+ }
2528	2886	}
2529	2887
2530		- err = do_add_mount(real_mount(mnt), path, mnt_flags);
2531		- if (err)
2532		- mntput(mnt);
	2888	+ fc = fs_context_for_mount(type, sb_flags);
	2889	+ put_filesystem(type);
	2890	+ if (IS_ERR(fc))
	2891	+ return PTR_ERR(fc);
	2892	+
	2893	+ if (subtype)
	2894	+ err = vfs_parse_fs_string(fc, "subtype",
	2895	+ subtype, strlen(subtype));
	2896	+ if (!err && name)
	2897	+ err = vfs_parse_fs_string(fc, "source", name, strlen(name));
	2898	+ if (!err)
	2899	+ err = parse_monolithic_mount_data(fc, data);
	2900	+ if (!err && !mount_capable(fc))
	2901	+ err = -EPERM;
	2902	+ if (!err)
	2903	+ err = vfs_get_tree(fc);
	2904	+ if (!err)
	2905	+ err = do_new_mount_fc(fc, path, mnt_flags);
	2906	+
	2907	+ put_fs_context(fc);
2533	2908	return err;
2534	2909	}
2535	2910
2536	2911	int finish_automount(struct vfsmount *m, struct path *path)
2537	2912	{
2538		- struct mount *mnt = real_mount(m);
	2913	+ struct dentry *dentry = path->dentry;
	2914	+ struct mountpoint *mp;
	2915	+ struct mount *mnt;
2539	2916	int err;
	2917	+
	2918	+ if (!m)
	2919	+ return 0;
	2920	+ if (IS_ERR(m))
	2921	+ return PTR_ERR(m);
	2922	+
	2923	+ mnt = real_mount(m);
2540	2924	/* The new mount record should have at least 2 refs to prevent it being
2541	2925	* expired before we get a chance to add it
2542	2926	*/
2543	2927	BUG_ON(mnt_get_count(mnt) < 2);
2544	2928
2545	2929	if (m->mnt_sb == path->mnt->mnt_sb &&
2546		- m->mnt_root == path->dentry) {
	2930	+ m->mnt_root == dentry) {
2547	2931	err = -ELOOP;
2548		- goto fail;
	2932	+ goto discard;
2549	2933	}
2550	2934
2551		- err = do_add_mount(mnt, path, path->mnt->mnt_flags \| MNT_SHRINKABLE);
2552		- if (!err)
2553		- return 0;
2554		-fail:
	2935	+ /*
	2936	+ * we don't want to use lock_mount() - in this case finding something
	2937	+ * that overmounts our mountpoint means "quietly drop what we've
	2938	+ * got", not "try to mount it on top".
	2939	+ */
	2940	+ inode_lock(dentry->d_inode);
	2941	+ namespace_lock();
	2942	+ if (unlikely(cant_mount(dentry))) {
	2943	+ err = -ENOENT;
	2944	+ goto discard_locked;
	2945	+ }
	2946	+ rcu_read_lock();
	2947	+ if (unlikely(__lookup_mnt(path->mnt, dentry))) {
	2948	+ rcu_read_unlock();
	2949	+ err = 0;
	2950	+ goto discard_locked;
	2951	+ }
	2952	+ rcu_read_unlock();
	2953	+ mp = get_mountpoint(dentry);
	2954	+ if (IS_ERR(mp)) {
	2955	+ err = PTR_ERR(mp);
	2956	+ goto discard_locked;
	2957	+ }
	2958	+
	2959	+ err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags \| MNT_SHRINKABLE);
	2960	+ unlock_mount(mp);
	2961	+ if (unlikely(err))
	2962	+ goto discard;
	2963	+ mntput(m);
	2964	+ return 0;
	2965	+
	2966	+discard_locked:
	2967	+ namespace_unlock();
	2968	+ inode_unlock(dentry->d_inode);
	2969	+discard:
2555	2970	/* remove m from any expiration list it may be on */
2556	2971	if (!list_empty(&mnt->mnt_expire)) {
2557	2972	namespace_lock();
..	..	@@ -2685,39 +3100,10 @@
2685	3100	}
2686	3101	}
2687	3102
2688		-/*
2689		- * Some copy_from_user() implementations do not return the exact number of
2690		- * bytes remaining to copy on a fault. But copy_mount_options() requires that.
2691		- * Note that this function differs from copy_from_user() in that it will oops
2692		- * on bad values of `to', rather than returning a short copy.
2693		- */
2694		-static long exact_copy_from_user(void *to, const void __user *from,
2695		- unsigned long n)
	3103	+static void *copy_mount_options(const void __user *data)
2696	3104	{
2697		- char *t = to;
2698		- const char __user *f = from;
2699		- char c;
2700		-
2701		- if (!access_ok(VERIFY_READ, from, n))
2702		- return n;
2703		-
2704		- while (n) {
2705		- if (__get_user(c, f)) {
2706		- memset(t, 0, n);
2707		- break;
2708		- }
2709		- *t++ = c;
2710		- f++;
2711		- n--;
2712		- }
2713		- return n;
2714		-}
2715		-
2716		-void *copy_mount_options(const void __user *data)
2717		-{
2718		- int i;
2719		- unsigned long size;
2720	3105	char *copy;
	3106	+ unsigned left, offset;
2721	3107
2722	3108	if (!data)
2723	3109	return NULL;
..	..	@@ -2726,28 +3112,33 @@
2726	3112	if (!copy)
2727	3113	return ERR_PTR(-ENOMEM);
2728	3114
2729		- /* We only care that some data at the address the user
2730		- * gave us is valid. Just in case, we'll zero
2731		- * the remainder of the page.
2732		- */
2733		- /* copy_from_user cannot cross TASK_SIZE ! */
2734		- size = TASK_SIZE - (unsigned long)untagged_addr(data);
2735		- if (size > PAGE_SIZE)
2736		- size = PAGE_SIZE;
	3115	+ left = copy_from_user(copy, data, PAGE_SIZE);
2737	3116
2738		- i = size - exact_copy_from_user(copy, data, size);
2739		- if (!i) {
	3117	+ /*
	3118	+ * Not all architectures have an exact copy_from_user(). Resort to
	3119	+ * byte at a time.
	3120	+ */
	3121	+ offset = PAGE_SIZE - left;
	3122	+ while (left) {
	3123	+ char c;
	3124	+ if (get_user(c, (const char __user *)data + offset))
	3125	+ break;
	3126	+ copy[offset] = c;
	3127	+ left--;
	3128	+ offset++;
	3129	+ }
	3130	+
	3131	+ if (left == PAGE_SIZE) {
2740	3132	kfree(copy);
2741	3133	return ERR_PTR(-EFAULT);
2742	3134	}
2743		- if (i != PAGE_SIZE)
2744		- memset(copy + i, 0, PAGE_SIZE - i);
	3135	+
2745	3136	return copy;
2746	3137	}
2747	3138
2748		-char *copy_mount_string(const void __user *data)
	3139	+static char *copy_mount_string(const void __user *data)
2749	3140	{
2750		- return data ? strndup_user(data, PAGE_SIZE) : NULL;
	3141	+ return data ? strndup_user(data, PATH_MAX) : NULL;
2751	3142	}
2752	3143
2753	3144	/*
..	..	@@ -2764,12 +3155,11 @@
2764	3155	* Therefore, if this magic number is present, it carries no information
2765	3156	* and must be discarded.
2766	3157	*/
2767		-long do_mount(const char *dev_name, const char __user *dir_name,
	3158	+int path_mount(const char *dev_name, struct path *path,
2768	3159	const char *type_page, unsigned long flags, void *data_page)
2769	3160	{
2770		- struct path path;
2771	3161	unsigned int mnt_flags = 0, sb_flags;
2772		- int retval = 0;
	3162	+ int ret;
2773	3163
2774	3164	/* Discard magic */
2775	3165	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
..	..	@@ -2782,19 +3172,13 @@
2782	3172	if (flags & MS_NOUSER)
2783	3173	return -EINVAL;
2784	3174
2785		- /* ... and get the mountpoint */
2786		- retval = user_path(dir_name, &path);
2787		- if (retval)
2788		- return retval;
2789		-
2790		- retval = security_sb_mount(dev_name, &path,
2791		- type_page, flags, data_page);
2792		- if (!retval && !may_mount())
2793		- retval = -EPERM;
2794		- if (!retval && (flags & SB_MANDLOCK) && !may_mandlock())
2795		- retval = -EPERM;
2796		- if (retval)
2797		- goto dput_out;
	3175	+ ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
	3176	+ if (ret)
	3177	+ return ret;
	3178	+ if (!may_mount())
	3179	+ return -EPERM;
	3180	+ if ((flags & SB_MANDLOCK) && !may_mandlock())
	3181	+ return -EPERM;
2798	3182
2799	3183	/* Default to relatime unless overridden */
2800	3184	if (!(flags & MS_NOATIME))
..	..	@@ -2815,13 +3199,15 @@
2815	3199	mnt_flags &= ~(MNT_RELATIME \| MNT_NOATIME);
2816	3200	if (flags & MS_RDONLY)
2817	3201	mnt_flags \|= MNT_READONLY;
	3202	+ if (flags & MS_NOSYMFOLLOW)
	3203	+ mnt_flags \|= MNT_NOSYMFOLLOW;
2818	3204
2819	3205	/* The default atime for remount is preservation */
2820	3206	if ((flags & MS_REMOUNT) &&
2821	3207	((flags & (MS_NOATIME \| MS_NODIRATIME \| MS_RELATIME \|
2822	3208	MS_STRICTATIME)) == 0)) {
2823	3209	mnt_flags &= ~MNT_ATIME_MASK;
2824		- mnt_flags \|= path.mnt->mnt_flags & MNT_ATIME_MASK;
	3210	+ mnt_flags \|= path->mnt->mnt_flags & MNT_ATIME_MASK;
2825	3211	}
2826	3212
2827	3213	sb_flags = flags & (SB_RDONLY \|
..	..	@@ -2833,21 +3219,33 @@
2833	3219	SB_LAZYTIME \|
2834	3220	SB_I_VERSION);
2835	3221
	3222	+ if ((flags & (MS_REMOUNT \| MS_BIND)) == (MS_REMOUNT \| MS_BIND))
	3223	+ return do_reconfigure_mnt(path, mnt_flags);
2836	3224	if (flags & MS_REMOUNT)
2837		- retval = do_remount(&path, flags, sb_flags, mnt_flags,
2838		- data_page);
2839		- else if (flags & MS_BIND)
2840		- retval = do_loopback(&path, dev_name, flags & MS_REC);
2841		- else if (flags & (MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
2842		- retval = do_change_type(&path, flags);
2843		- else if (flags & MS_MOVE)
2844		- retval = do_move_mount(&path, dev_name);
2845		- else
2846		- retval = do_new_mount(&path, type_page, sb_flags, mnt_flags,
2847		- dev_name, data_page);
2848		-dput_out:
	3225	+ return do_remount(path, flags, sb_flags, mnt_flags, data_page);
	3226	+ if (flags & MS_BIND)
	3227	+ return do_loopback(path, dev_name, flags & MS_REC);
	3228	+ if (flags & (MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
	3229	+ return do_change_type(path, flags);
	3230	+ if (flags & MS_MOVE)
	3231	+ return do_move_mount_old(path, dev_name);
	3232	+
	3233	+ return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
	3234	+ data_page);
	3235	+}
	3236	+
	3237	+long do_mount(const char *dev_name, const char __user *dir_name,
	3238	+ const char *type_page, unsigned long flags, void *data_page)
	3239	+{
	3240	+ struct path path;
	3241	+ int ret;
	3242	+
	3243	+ ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
	3244	+ if (ret)
	3245	+ return ret;
	3246	+ ret = path_mount(dev_name, &path, type_page, flags, data_page);
2849	3247	path_put(&path);
2850		- return retval;
	3248	+ return ret;
2851	3249	}
2852	3250
2853	3251	static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
..	..	@@ -2862,7 +3260,8 @@
2862	3260
2863	3261	static void free_mnt_ns(struct mnt_namespace *ns)
2864	3262	{
2865		- ns_free_inum(&ns->ns);
	3263	+ if (!is_anon_ns(ns))
	3264	+ ns_free_inum(&ns->ns);
2866	3265	dec_mnt_namespaces(ns->ucounts);
2867	3266	put_user_ns(ns->user_ns);
2868	3267	kfree(ns);
..	..	@@ -2877,7 +3276,7 @@
2877	3276	*/
2878	3277	static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
2879	3278
2880		-static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
	3279	+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
2881	3280	{
2882	3281	struct mnt_namespace *new_ns;
2883	3282	struct ucounts *ucounts;
..	..	@@ -2887,28 +3286,28 @@
2887	3286	if (!ucounts)
2888	3287	return ERR_PTR(-ENOSPC);
2889	3288
2890		- new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
	3289	+ new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
2891	3290	if (!new_ns) {
2892	3291	dec_mnt_namespaces(ucounts);
2893	3292	return ERR_PTR(-ENOMEM);
2894	3293	}
2895		- ret = ns_alloc_inum(&new_ns->ns);
2896		- if (ret) {
2897		- kfree(new_ns);
2898		- dec_mnt_namespaces(ucounts);
2899		- return ERR_PTR(ret);
	3294	+ if (!anon) {
	3295	+ ret = ns_alloc_inum(&new_ns->ns);
	3296	+ if (ret) {
	3297	+ kfree(new_ns);
	3298	+ dec_mnt_namespaces(ucounts);
	3299	+ return ERR_PTR(ret);
	3300	+ }
2900	3301	}
2901	3302	new_ns->ns.ops = &mntns_operations;
2902		- new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
	3303	+ if (!anon)
	3304	+ new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
2903	3305	atomic_set(&new_ns->count, 1);
2904		- new_ns->root = NULL;
2905	3306	INIT_LIST_HEAD(&new_ns->list);
2906	3307	init_waitqueue_head(&new_ns->poll);
2907		- new_ns->event = 0;
	3308	+ spin_lock_init(&new_ns->ns_lock);
2908	3309	new_ns->user_ns = get_user_ns(user_ns);
2909	3310	new_ns->ucounts = ucounts;
2910		- new_ns->mounts = 0;
2911		- new_ns->pending_mounts = 0;
2912	3311	return new_ns;
2913	3312	}
2914	3313
..	..	@@ -2932,7 +3331,7 @@
2932	3331
2933	3332	old = ns->root;
2934	3333
2935		- new_ns = alloc_mnt_ns(user_ns);
	3334	+ new_ns = alloc_mnt_ns(user_ns, false);
2936	3335	if (IS_ERR(new_ns))
2937	3336	return new_ns;
2938	3337
..	..	@@ -2940,12 +3339,17 @@
2940	3339	/* First pass: copy the tree topology */
2941	3340	copy_flags = CL_COPY_UNBINDABLE \| CL_EXPIRE;
2942	3341	if (user_ns != ns->user_ns)
2943		- copy_flags \|= CL_SHARED_TO_SLAVE \| CL_UNPRIVILEGED;
	3342	+ copy_flags \|= CL_SHARED_TO_SLAVE;
2944	3343	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
2945	3344	if (IS_ERR(new)) {
2946	3345	namespace_unlock();
2947	3346	free_mnt_ns(new_ns);
2948	3347	return ERR_CAST(new);
	3348	+ }
	3349	+ if (user_ns != ns->user_ns) {
	3350	+ lock_mount_hash();
	3351	+ lock_mnt_tree(new);
	3352	+ unlock_mount_hash();
2949	3353	}
2950	3354	new_ns->root = new;
2951	3355	list_add_tail(&new_ns->list, &new->mnt_list);
..	..	@@ -2987,37 +3391,25 @@
2987	3391	return new_ns;
2988	3392	}
2989	3393
2990		-/**
2991		- * create_mnt_ns - creates a private namespace and adds a root filesystem
2992		- * @mnt: pointer to the new root filesystem mountpoint
2993		- */
2994		-static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
	3394	+struct dentry *mount_subtree(struct vfsmount *m, const char *name)
2995	3395	{
2996		- struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
2997		- if (!IS_ERR(new_ns)) {
2998		- struct mount *mnt = real_mount(m);
2999		- mnt->mnt_ns = new_ns;
3000		- new_ns->root = mnt;
3001		- new_ns->mounts++;
3002		- list_add(&mnt->mnt_list, &new_ns->list);
3003		- } else {
3004		- mntput(m);
3005		- }
3006		- return new_ns;
3007		-}
3008		-
3009		-struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
3010		-{
	3396	+ struct mount *mnt = real_mount(m);
3011	3397	struct mnt_namespace *ns;
3012	3398	struct super_block *s;
3013	3399	struct path path;
3014	3400	int err;
3015	3401
3016		- ns = create_mnt_ns(mnt);
3017		- if (IS_ERR(ns))
	3402	+ ns = alloc_mnt_ns(&init_user_ns, true);
	3403	+ if (IS_ERR(ns)) {
	3404	+ mntput(m);
3018	3405	return ERR_CAST(ns);
	3406	+ }
	3407	+ mnt->mnt_ns = ns;
	3408	+ ns->root = mnt;
	3409	+ ns->mounts++;
	3410	+ list_add(&mnt->mnt_list, &ns->list);
3019	3411
3020		- err = vfs_path_lookup(mnt->mnt_root, mnt,
	3412	+ err = vfs_path_lookup(m->mnt_root, m,
3021	3413	name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &path);
3022	3414
3023	3415	put_mnt_ns(ns);
..	..	@@ -3036,8 +3428,8 @@
3036	3428	}
3037	3429	EXPORT_SYMBOL(mount_subtree);
3038	3430
3039		-int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type,
3040		- unsigned long flags, void __user *data)
	3431	+SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
	3432	+ char __user *, type, unsigned long, flags, void __user *, data)
3041	3433	{
3042	3434	int ret;
3043	3435	char *kernel_type;
..	..	@@ -3070,10 +3462,202 @@
3070	3462	return ret;
3071	3463	}
3072	3464
3073		-SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
3074		- char __user *, type, unsigned long, flags, void __user *, data)
	3465	+/*
	3466	+ * Create a kernel mount representation for a new, prepared superblock
	3467	+ * (specified by fs_fd) and attach to an open_tree-like file descriptor.
		 *
		 * fs_fd:      descriptor whose file must use fscontext_fops; its
		 *             private_data is taken as the struct fs_context.
		 * flags:      only FSMOUNT_CLOEXEC is accepted; it selects O_CLOEXEC
		 *             for the descriptor that is returned.
		 * attr_flags: MOUNT_ATTR_* bits, translated below into MNT_* flags.
		 *
		 * Returns the new file descriptor on success, otherwise a negative
		 * errno:
		 *   -EPERM  may_mount() refused, the mount is too revealing, or
		 *           SB_MANDLOCK was requested without may_mandlock()
		 *   -EINVAL unknown bits in flags/attr_flags, unsupported atime mode,
		 *           fs_fd is not an fs-context file, or the context has no root
		 *   -EBADF  fs_fd is not an open descriptor
		 *   -EBUSY  the context is not in phase FS_CONTEXT_AWAITING_MOUNT
		 * or whatever mutex_lock_interruptible(), vfs_create_mount(),
		 * alloc_mnt_ns(), dentry_open() or get_unused_fd_flags() report.
	3468	+ */
	3469	+SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
	3470	+ unsigned int, attr_flags)
3075	3471	{
3076		- return ksys_mount(dev_name, dir_name, type, flags, data);
	3472	+ struct mnt_namespace *ns;
	3473	+ struct fs_context *fc;
	3474	+ struct file *file;
	3475	+ struct path newmount;
	3476	+ struct mount *mnt;
	3477	+ struct fd f;
	3478	+ unsigned int mnt_flags = 0;
	3479	+ long ret;
	3480	+
	3481	+ if (!may_mount())
	3482	+ return -EPERM;
	3483	+
		/* FSMOUNT_CLOEXEC is the only bit accepted in flags. */
	3484	+ if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
	3485	+ return -EINVAL;
	3486	+
	3487	+ if (attr_flags & ~(MOUNT_ATTR_RDONLY \|
	3488	+ MOUNT_ATTR_NOSUID \|
	3489	+ MOUNT_ATTR_NODEV \|
	3490	+ MOUNT_ATTR_NOEXEC \|
	3491	+ MOUNT_ATTR__ATIME \|
	3492	+ MOUNT_ATTR_NODIRATIME))
	3493	+ return -EINVAL;
	3494	+
		/* Translate the requested MOUNT_ATTR_* bits into MNT_* mount flags. */
	3495	+ if (attr_flags & MOUNT_ATTR_RDONLY)
	3496	+ mnt_flags \|= MNT_READONLY;
	3497	+ if (attr_flags & MOUNT_ATTR_NOSUID)
	3498	+ mnt_flags \|= MNT_NOSUID;
	3499	+ if (attr_flags & MOUNT_ATTR_NODEV)
	3500	+ mnt_flags \|= MNT_NODEV;
	3501	+ if (attr_flags & MOUNT_ATTR_NOEXEC)
	3502	+ mnt_flags \|= MNT_NOEXEC;
	3503	+ if (attr_flags & MOUNT_ATTR_NODIRATIME)
	3504	+ mnt_flags \|= MNT_NODIRATIME;
	3505	+
		/* MOUNT_ATTR__ATIME masks the atime mode; only the three modes
		 * handled below are accepted, anything else is -EINVAL.
		 */
	3506	+ switch (attr_flags & MOUNT_ATTR__ATIME) {
	3507	+ case MOUNT_ATTR_STRICTATIME:
	3508	+ break;
	3509	+ case MOUNT_ATTR_NOATIME:
	3510	+ mnt_flags \|= MNT_NOATIME;
	3511	+ break;
	3512	+ case MOUNT_ATTR_RELATIME:
	3513	+ mnt_flags \|= MNT_RELATIME;
	3514	+ break;
	3515	+ default:
	3516	+ return -EINVAL;
	3517	+ }
	3518	+
	3519	+ f = fdget(fs_fd);
	3520	+ if (!f.file)
	3521	+ return -EBADF;
	3522	+
		/* private_data is only read as an fs_context after this f_op check. */
	3523	+ ret = -EINVAL;
	3524	+ if (f.file->f_op != &fscontext_fops)
	3525	+ goto err_fsfd;
	3526	+
	3527	+ fc = f.file->private_data;
	3528	+
	3529	+ ret = mutex_lock_interruptible(&fc->uapi_mutex);
	3530	+ if (ret < 0)
	3531	+ goto err_fsfd;
	3532	+
	3533	+ /* There must be a valid superblock or we can't mount it */
	3534	+ ret = -EINVAL;
	3535	+ if (!fc->root)
	3536	+ goto err_unlock;
	3537	+
	3538	+ ret = -EPERM;
	3539	+ if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
	3540	+ pr_warn("VFS: Mount too revealing\n");
	3541	+ goto err_unlock;
	3542	+ }
	3543	+
		/* The context must be waiting to be mounted, otherwise -EBUSY. */
	3544	+ ret = -EBUSY;
	3545	+ if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
	3546	+ goto err_unlock;
	3547	+
	3548	+ ret = -EPERM;
	3549	+ if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock())
	3550	+ goto err_unlock;
	3551	+
	3552	+ newmount.mnt = vfs_create_mount(fc);
	3553	+ if (IS_ERR(newmount.mnt)) {
	3554	+ ret = PTR_ERR(newmount.mnt);
	3555	+ goto err_unlock;
	3556	+ }
	3557	+ newmount.dentry = dget(fc->root);
	3558	+ newmount.mnt->mnt_flags = mnt_flags;
	3559	+
	3560	+ /* We've done the mount bit - now move the file context into more or
	3561	+ * less the same state as if we'd done an fspick(). We don't want to
	3562	+ * do any memory allocation or anything like that at this point as we
	3563	+ * don't want to have to handle any errors incurred.
	3564	+ */
	3565	+ vfs_clean_context(fc);
	3566	+
		/* The new mount becomes the root, and the only counted mount, of a
		 * namespace allocated with the second argument set to true
		 * (presumably the "anon" flag tested inside alloc_mnt_ns() - confirm).
		 */
	3567	+ ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
	3568	+ if (IS_ERR(ns)) {
	3569	+ ret = PTR_ERR(ns);
	3570	+ goto err_path;
	3571	+ }
	3572	+ mnt = real_mount(newmount.mnt);
	3573	+ mnt->mnt_ns = ns;
	3574	+ ns->root = mnt;
	3575	+ ns->mounts = 1;
	3576	+ list_add(&mnt->mnt_list, &ns->list);
		/* NOTE(review): extra mount reference, presumably so the mount
		 * outlives the path_put() at err_path, which also runs on success -
		 * confirm.
		 */
	3577	+ mntget(newmount.mnt);
	3578	+
	3579	+ /* Attach to an apparent O_PATH fd with a note that we need to unmount
	3580	+ * it, not just simply put it.
	3581	+ */
	3582	+ file = dentry_open(&newmount, O_PATH, fc->cred);
	3583	+ if (IS_ERR(file)) {
	3584	+ dissolve_on_fput(newmount.mnt);
	3585	+ ret = PTR_ERR(file);
	3586	+ goto err_path;
	3587	+ }
	3588	+ file->f_mode \|= FMODE_NEED_UNMOUNT;
	3589	+
		/* ret becomes the syscall result: the new fd, or the error from
		 * get_unused_fd_flags(), in which case the file is dropped again.
		 */
	3590	+ ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
	3591	+ if (ret >= 0)
	3592	+ fd_install(ret, file);
	3593	+ else
	3594	+ fput(file);
	3595	+
		/* Success falls through the labels below as well, so the local path
		 * reference, the uapi mutex and the fs_fd reference are released on
		 * every path that acquired them.
		 */
	3596	+err_path:
	3597	+ path_put(&newmount);
	3598	+err_unlock:
	3599	+ mutex_unlock(&fc->uapi_mutex);
	3600	+err_fsfd:
	3601	+ fdput(f);
	3602	+ return ret;
	3603	+}
	3604	+
	3605	+/*
	3606	+ * Move a mount from one place to another. In combination with
	3607	+ * fsopen()/fsmount() this is used to install a new mount and in combination
	3608	+ * with open_tree(OPEN_TREE_CLONE [\| AT_RECURSIVE]) it can be used to copy
	3609	+ * a mount subtree.
	3610	+ *
	3611	+ * Note the flags value is a combination of MOVE_MOUNT_* flags.
		 *
		 * from_dfd/from_pathname: source, resolved with user_path_at() using
		 *                         lookup flags built from the MOVE_MOUNT_F_* bits.
		 * to_dfd/to_pathname:     destination, resolved the same way from the
		 *                         MOVE_MOUNT_T_* bits.
		 *
		 * Returns the result of do_move_mount(), or a negative errno:
		 * -EPERM if may_mount() refuses, -EINVAL for bits outside
		 * MOVE_MOUNT__MASK, or the error from either path lookup or from
		 * security_move_mount().
	3612	+ */
	3613	+SYSCALL_DEFINE5(move_mount,
	3614	+ int, from_dfd, const char __user *, from_pathname,
	3615	+ int, to_dfd, const char __user *, to_pathname,
	3616	+ unsigned int, flags)
	3617	+{
	3618	+ struct path from_path, to_path;
	3619	+ unsigned int lflags;
	3620	+ int ret = 0;
	3621	+
	3622	+ if (!may_mount())
	3623	+ return -EPERM;
	3624	+
	3625	+ if (flags & ~MOVE_MOUNT__MASK)
	3626	+ return -EINVAL;
	3627	+
	3628	+ /* If someone gives a pathname, they aren't permitted to move
	3629	+ * from an fd that requires unmount as we can't get at the flag
	3630	+ * to clear it afterwards.
	3631	+ */
		/* Lookup flags for the source come from the MOVE_MOUNT_F_* bits. */
	3632	+ lflags = 0;
	3633	+ if (flags & MOVE_MOUNT_F_SYMLINKS) lflags \|= LOOKUP_FOLLOW;
	3634	+ if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags \|= LOOKUP_AUTOMOUNT;
	3635	+ if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags \|= LOOKUP_EMPTY;
	3636	+
	3637	+ ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
	3638	+ if (ret < 0)
	3639	+ return ret;
	3640	+
		/* Lookup flags for the destination come from the MOVE_MOUNT_T_* bits. */
	3641	+ lflags = 0;
	3642	+ if (flags & MOVE_MOUNT_T_SYMLINKS) lflags \|= LOOKUP_FOLLOW;
	3643	+ if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags \|= LOOKUP_AUTOMOUNT;
	3644	+ if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags \|= LOOKUP_EMPTY;
	3645	+
	3646	+ ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
	3647	+ if (ret < 0)
	3648	+ goto out_from;
	3649	+
	3650	+ ret = security_move_mount(&from_path, &to_path);
	3651	+ if (ret < 0)
	3652	+ goto out_to;
	3653	+
	3654	+ ret = do_move_mount(&from_path, &to_path);
	3655	+
		/* Both paths are dropped in reverse order of lookup; the success
		 * path falls through these labels too.
		 */
	3656	+out_to:
	3657	+ path_put(&to_path);
	3658	+out_from:
	3659	+ path_put(&from_path);
	3660	+ return ret;
3077	3661	}
3078	3662
3079	3663	/*
..	..	@@ -3115,7 +3699,7 @@
3115	3699	* file system may be mounted on put_old. After all, new_root is a mountpoint.
3116	3700	*
3117	3701	* Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
3118		- * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
	3702	+ * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
3119	3703	* in this situation.
3120	3704	*
3121	3705	* Notes:
..	..	@@ -3129,19 +3713,21 @@
3129	3713	SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
3130	3714	const char __user *, put_old)
3131	3715	{
3132		- struct path new, old, parent_path, root_parent, root;
3133		- struct mount *new_mnt, *root_mnt, *old_mnt;
	3716	+ struct path new, old, root;
	3717	+ struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
3134	3718	struct mountpoint *old_mp, *root_mp;
3135	3719	int error;
3136	3720
3137	3721	if (!may_mount())
3138	3722	return -EPERM;
3139	3723
3140		- error = user_path_dir(new_root, &new);
	3724	+ error = user_path_at(AT_FDCWD, new_root,
	3725	+ LOOKUP_FOLLOW \| LOOKUP_DIRECTORY, &new);
3141	3726	if (error)
3142	3727	goto out0;
3143	3728
3144		- error = user_path_dir(put_old, &old);
	3729	+ error = user_path_at(AT_FDCWD, put_old,
	3730	+ LOOKUP_FOLLOW \| LOOKUP_DIRECTORY, &old);
3145	3731	if (error)
3146	3732	goto out1;
3147	3733
..	..	@@ -3159,9 +3745,11 @@
3159	3745	new_mnt = real_mount(new.mnt);
3160	3746	root_mnt = real_mount(root.mnt);
3161	3747	old_mnt = real_mount(old.mnt);
	3748	+ ex_parent = new_mnt->mnt_parent;
	3749	+ root_parent = root_mnt->mnt_parent;
3162	3750	if (IS_MNT_SHARED(old_mnt) \|\|
3163		- IS_MNT_SHARED(new_mnt->mnt_parent) \|\|
3164		- IS_MNT_SHARED(root_mnt->mnt_parent))
	3751	+ IS_MNT_SHARED(ex_parent) \|\|
	3752	+ IS_MNT_SHARED(root_parent))
3165	3753	goto out4;
3166	3754	if (!check_mnt(root_mnt) \|\| !check_mnt(new_mnt))
3167	3755	goto out4;
..	..	@@ -3178,7 +3766,6 @@
3178	3766	goto out4; /* not a mountpoint */
3179	3767	if (!mnt_has_parent(root_mnt))
3180	3768	goto out4; /* not attached */
3181		- root_mp = root_mnt->mnt_mp;
3182	3769	if (new.mnt->mnt_root != new.dentry)
3183	3770	goto out4; /* not a mountpoint */
3184	3771	if (!mnt_has_parent(new_mnt))
..	..	@@ -3190,9 +3777,8 @@
3190	3777	if (!is_path_reachable(new_mnt, new.dentry, &root))
3191	3778	goto out4;
3192	3779	lock_mount_hash();
3193		- root_mp->m_count++; /* pin it so it won't go away */
3194		- detach_mnt(new_mnt, &parent_path);
3195		- detach_mnt(root_mnt, &root_parent);
	3780	+ umount_mnt(new_mnt);
	3781	+ root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */
3196	3782	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
3197	3783	new_mnt->mnt.mnt_flags \|= MNT_LOCKED;
3198	3784	root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
..	..	@@ -3200,7 +3786,8 @@
3200	3786	/* mount old root on put_old */
3201	3787	attach_mnt(root_mnt, old_mnt, old_mp);
3202	3788	/* mount new_root on / */
3203		- attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
	3789	+ attach_mnt(new_mnt, root_parent, root_mp);
	3790	+ mnt_add_count(root_parent, -1);
3204	3791	touch_mnt_namespace(current->nsproxy->mnt_ns);
3205	3792	/* A moved mount should not expire automatically */
3206	3793	list_del_init(&new_mnt->mnt_expire);
..	..	@@ -3210,10 +3797,8 @@
3210	3797	error = 0;
3211	3798	out4:
3212	3799	unlock_mount(old_mp);
3213		- if (!error) {
3214		- path_put(&root_parent);
3215		- path_put(&parent_path);
3216		- }
	3800	+ if (!error)
	3801	+ mntput_no_expire(ex_parent);
3217	3802	out3:
3218	3803	path_put(&root);
3219	3804	out2:
..	..	@@ -3227,22 +3812,22 @@
3227	3812	static void __init init_mount_tree(void)
3228	3813	{
3229	3814	struct vfsmount *mnt;
	3815	+ struct mount *m;
3230	3816	struct mnt_namespace *ns;
3231	3817	struct path root;
3232		- struct file_system_type *type;
3233	3818
3234		- type = get_fs_type("rootfs");
3235		- if (!type)
3236		- panic("Can't find rootfs type");
3237		- mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
3238		- put_filesystem(type);
	3819	+ mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
3239	3820	if (IS_ERR(mnt))
3240	3821	panic("Can't create rootfs");
3241	3822
3242		- ns = create_mnt_ns(mnt);
	3823	+ ns = alloc_mnt_ns(&init_user_ns, false);
3243	3824	if (IS_ERR(ns))
3244	3825	panic("Can't allocate initial namespace");
3245		-
	3826	+ m = real_mount(mnt);
	3827	+ m->mnt_ns = ns;
	3828	+ ns->root = m;
	3829	+ ns->mounts = 1;
	3830	+ list_add(&m->mnt_list, &ns->list);
3246	3831	init_task.nsproxy->mnt_ns = ns;
3247	3832	get_mnt_ns(ns);
3248	3833
..	..	@@ -3284,6 +3869,7 @@
3284	3869	fs_kobj = kobject_create_and_add("fs", NULL);
3285	3870	if (!fs_kobj)
3286	3871	printk(KERN_WARNING "%s: kobj create error\n", __func__);
	3872	+ shmem_init();
3287	3873	init_rootfs();
3288	3874	init_mount_tree();
3289	3875	}
..	..	@@ -3296,10 +3882,10 @@
3296	3882	free_mnt_ns(ns);
3297	3883	}
3298	3884
3299		-struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
	3885	+struct vfsmount *kern_mount(struct file_system_type *type)
3300	3886	{
3301	3887	struct vfsmount *mnt;
3302		- mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data);
	3888	+ mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
3303	3889	if (!IS_ERR(mnt)) {
3304	3890	/*
3305	3891	* it is a longterm mount, don't release mnt until
..	..	@@ -3309,7 +3895,7 @@
3309	3895	}
3310	3896	return mnt;
3311	3897	}
3312		-EXPORT_SYMBOL_GPL(kern_mount_data);
	3898	+EXPORT_SYMBOL_GPL(kern_mount);
3313	3899
3314	3900	void kern_unmount(struct vfsmount *mnt)
3315	3901	{
..	..	@@ -3321,6 +3907,19 @@
3321	3907	}
3322	3908	}
3323	3909	EXPORT_SYMBOL(kern_unmount);
	3910	+
	/*
	 * kern_unmount_array - release an array of kernel mounts in one go.
	 *
	 * mnt: array of vfsmount pointers; NULL entries are skipped by the
	 *      first loop.
	 * num: number of entries in mnt.
	 *
	 * Clears mnt_ns on every non-NULL entry, waits once with
	 * synchronize_rcu_expedited() for the whole array, then calls mntput()
	 * on each entry.
	 *
	 * NOTE(review): the second loop passes NULL entries to mntput() as
	 * well; presumably mntput() tolerates NULL - confirm.
	 */
	3911	+void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
	3912	+{
	3913	+ unsigned int i;
	3914	+
	3915	+ for (i = 0; i < num; i++)
	3916	+ if (mnt[i])
	3917	+ real_mount(mnt[i])->mnt_ns = NULL;
	3918	+ synchronize_rcu_expedited();
	3919	+ for (i = 0; i < num; i++)
	3920	+ mntput(mnt[i]);
	3921	+}
	3922	+EXPORT_SYMBOL(kern_unmount_array);
3324	3923
3325	3924	bool our_mnt(struct vfsmount *mnt)
3326	3925	{
..	..	@@ -3351,7 +3950,8 @@
3351	3950	return chrooted;
3352	3951	}
3353	3952
3354		-static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
	3953	+static bool mnt_already_visible(struct mnt_namespace *ns,
	3954	+ const struct super_block *sb,
3355	3955	int *new_mnt_flags)
3356	3956	{
3357	3957	int new_flags = *new_mnt_flags;
..	..	@@ -3359,11 +3959,15 @@
3359	3959	bool visible = false;
3360	3960
3361	3961	down_read(&namespace_sem);
	3962	+ lock_ns_list(ns);
3362	3963	list_for_each_entry(mnt, &ns->list, mnt_list) {
3363	3964	struct mount *child;
3364	3965	int mnt_flags;
3365	3966
3366		- if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
	3967	+ if (mnt_is_cursor(mnt))
	3968	+ continue;
	3969	+
	3970	+ if (mnt->mnt.mnt_sb->s_type != sb->s_type)
3367	3971	continue;
3368	3972
3369	3973	/* This mount is not fully visible if it's root directory
..	..	@@ -3410,11 +4014,12 @@
3410	4014	next: ;
3411	4015	}
3412	4016	found:
	4017	+ unlock_ns_list(ns);
3413	4018	up_read(&namespace_sem);
3414	4019	return visible;
3415	4020	}
3416	4021
3417		-static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
	4022	+static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
3418	4023	{
3419	4024	const unsigned long required_iflags = SB_I_NOEXEC \| SB_I_NODEV;
3420	4025	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
..	..	@@ -3424,7 +4029,7 @@
3424	4029	return false;
3425	4030
3426	4031	/* Can this filesystem be too revealing? */
3427		- s_iflags = mnt->mnt_sb->s_iflags;
	4032	+ s_iflags = sb->s_iflags;
3428	4033	if (!(s_iflags & SB_I_USERNS_VISIBLE))
3429	4034	return false;
3430	4035
..	..	@@ -3434,7 +4039,7 @@
3434	4039	return true;
3435	4040	}
3436	4041
3437		- return !mnt_already_visible(ns, mnt, new_mnt_flags);
	4042	+ return !mnt_already_visible(ns, sb, new_mnt_flags);
3438	4043	}
3439	4044
3440	4045	bool mnt_may_suid(struct vfsmount *mnt)
..	..	@@ -3471,18 +4076,23 @@
3471	4076	put_mnt_ns(to_mnt_ns(ns));
3472	4077	}
3473	4078
3474		-static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
	4079	+static int mntns_install(struct nsset *nsset, struct ns_common *ns)
3475	4080	{
3476		- struct fs_struct *fs = current->fs;
	4081	+ struct nsproxy *nsproxy = nsset->nsproxy;
	4082	+ struct fs_struct *fs = nsset->fs;
3477	4083	struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
	4084	+ struct user_namespace *user_ns = nsset->cred->user_ns;
3478	4085	struct path root;
3479	4086	int err;
3480	4087
3481	4088	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) \|\|
3482		- !ns_capable(current_user_ns(), CAP_SYS_CHROOT) \|\|
3483		- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
	4089	+ !ns_capable(user_ns, CAP_SYS_CHROOT) \|\|
	4090	+ !ns_capable(user_ns, CAP_SYS_ADMIN))
3484	4091	return -EPERM;
3485	4092
	4093	+ if (is_anon_ns(mnt_ns))
	4094	+ return -EINVAL;
	4095	+
3486	4096	if (fs->users != 1)
3487	4097	return -EINVAL;
3488	4098