~hc/RK356X_SDK_RELEASE.git

..	..	@@ -1,3 +1,4 @@
	1	+// SPDX-License-Identifier: GPL-2.0-only
1	2	#include "cgroup-internal.h"
2	3
3	4	#include <linux/ctype.h>
..	..	@@ -13,8 +14,10 @@
13	14	#include <linux/delayacct.h>
14	15	#include <linux/pid_namespace.h>
15	16	#include <linux/cgroupstats.h>
	17	+#include <linux/fs_parser.h>
16	18
17	19	#include <trace/events/cgroup.h>
	20	+#include <trace/hooks/cgroup.h>
18	21
19	22	/*
20	23	* pidlists linger the following amount before being destroyed. The goal
..	..	@@ -36,10 +39,7 @@
36	39	*/
37	40	static struct workqueue_struct *cgroup_pidlist_destroy_wq;
38	41
39		-/*
40		- * Protects cgroup_subsys->release_agent_path. Modifying it also requires
41		- * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
42		- */
	42	+/* protects cgroup_subsys->release_agent_path */
43	43	static DEFINE_SPINLOCK(release_agent_path_lock);
44	44
45	45	bool cgroup1_ssid_disabled(int ssid)
..	..	@@ -58,6 +58,7 @@
58	58	int retval = 0;
59	59
60	60	mutex_lock(&cgroup_mutex);
	61	+ cpus_read_lock();
61	62	percpu_down_write(&cgroup_threadgroup_rwsem);
62	63	for_each_root(root) {
63	64	struct cgroup *from_cgrp;
..	..	@@ -74,6 +75,7 @@
74	75	break;
75	76	}
76	77	percpu_up_write(&cgroup_threadgroup_rwsem);
	78	+ cpus_read_unlock();
77	79	mutex_unlock(&cgroup_mutex);
78	80
79	81	return retval;
..	..	@@ -190,25 +192,6 @@
190	192	};
191	193
192	194	/*
193		- * The following two functions "fix" the issue where there are more pids
194		- * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
195		- * TODO: replace with a kernel-wide solution to this problem
196		- */
197		-#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
198		-static void *pidlist_allocate(int count)
199		-{
200		- if (PIDLIST_TOO_LARGE(count))
201		- return vmalloc(array_size(count, sizeof(pid_t)));
202		- else
203		- return kmalloc_array(count, sizeof(pid_t), GFP_KERNEL);
204		-}
205		-
206		-static void pidlist_free(void *p)
207		-{
208		- kvfree(p);
209		-}
210		-
211		-/*
212	195	* Used to destroy all pidlists lingering waiting for destroy timer. None
213	196	* should be left afterwards.
214	197	*/
..	..	@@ -240,7 +223,7 @@
240	223	*/
241	224	if (!delayed_work_pending(dwork)) {
242	225	list_del(&l->links);
243		- pidlist_free(l->list);
	226	+ kvfree(l->list);
244	227	put_pid_ns(l->key.ns);
245	228	tofree = l;
246	229	}
..	..	@@ -361,7 +344,7 @@
361	344	* show up until sometime later on.
362	345	*/
363	346	length = cgroup_task_count(cgrp);
364		- array = pidlist_allocate(length);
	347	+ array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);
365	348	if (!array)
366	349	return -ENOMEM;
367	350	/* now, populate the array */
..	..	@@ -386,12 +369,12 @@
386	369
387	370	l = cgroup_pidlist_find_create(cgrp, type);
388	371	if (!l) {
389		- pidlist_free(array);
	372	+ kvfree(array);
390	373	return -ENOMEM;
391	374	}
392	375
393	376	/* store array, freeing old if necessary */
394		- pidlist_free(l->list);
	377	+ kvfree(l->list);
395	378	l->list = array;
396	379	l->length = length;
397	380	*lp = l;
..	..	@@ -413,6 +396,7 @@
413	396	* next pid to display, if any
414	397	*/
415	398	struct kernfs_open_file *of = s->private;
	399	+ struct cgroup_file_ctx *ctx = of->priv;
416	400	struct cgroup *cgrp = seq_css(s)->cgroup;
417	401	struct cgroup_pidlist *l;
418	402	enum cgroup_filetype type = seq_cft(s)->private;
..	..	@@ -422,25 +406,24 @@
422	406	mutex_lock(&cgrp->pidlist_mutex);
423	407
424	408	/*
425		- * !NULL @of->priv indicates that this isn't the first start()
426		- * after open. If the matching pidlist is around, we can use that.
427		- * Look for it. Note that @of->priv can't be used directly. It
428		- * could already have been destroyed.
	409	+ * !NULL @ctx->procs1.pidlist indicates that this isn't the first
	410	+ * start() after open. If the matching pidlist is around, we can use
	411	+ * that. Look for it. Note that @ctx->procs1.pidlist can't be used
	412	+ * directly. It could already have been destroyed.
429	413	*/
430		- if (of->priv)
431		- of->priv = cgroup_pidlist_find(cgrp, type);
	414	+ if (ctx->procs1.pidlist)
	415	+ ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);
432	416
433	417	/*
434	418	* Either this is the first start() after open or the matching
435	419	* pidlist has been destroyed in between. Create a new one.
436	420	*/
437		- if (!of->priv) {
438		- ret = pidlist_array_load(cgrp, type,
439		- (struct cgroup_pidlist **)&of->priv);
	421	+ if (!ctx->procs1.pidlist) {
	422	+ ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
440	423	if (ret)
441	424	return ERR_PTR(ret);
442	425	}
443		- l = of->priv;
	426	+ l = ctx->procs1.pidlist;
444	427
445	428	if (pid) {
446	429	int end = l->length;
..	..	@@ -468,7 +451,8 @@
468	451	static void cgroup_pidlist_stop(struct seq_file *s, void *v)
469	452	{
470	453	struct kernfs_open_file *of = s->private;
471		- struct cgroup_pidlist *l = of->priv;
	454	+ struct cgroup_file_ctx *ctx = of->priv;
	455	+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
472	456
473	457	if (l)
474	458	mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
..	..	@@ -479,7 +463,8 @@
479	463	static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
480	464	{
481	465	struct kernfs_open_file *of = s->private;
482		- struct cgroup_pidlist *l = of->priv;
	466	+ struct cgroup_file_ctx *ctx = of->priv;
	467	+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
483	468	pid_t *p = v;
484	469	pid_t *end = l->list + l->length;
485	470	/*
..	..	@@ -511,21 +496,23 @@
511	496	struct task_struct *task;
512	497	const struct cred *cred, *tcred;
513	498	ssize_t ret;
	499	+ bool locked;
514	500
515	501	cgrp = cgroup_kn_lock_live(of->kn, false);
516	502	if (!cgrp)
517	503	return -ENODEV;
518	504
519		- task = cgroup_procs_write_start(buf, threadgroup);
	505	+ task = cgroup_procs_write_start(buf, threadgroup, &locked, cgrp);
520	506	ret = PTR_ERR_OR_ZERO(task);
521	507	if (ret)
522	508	goto out_unlock;
523	509
524	510	/*
525		- * Even if we're attaching all tasks in the thread group, we only
526		- * need to check permissions on one of them.
	511	+ * Even if we're attaching all tasks in the thread group, we only need
	512	+ * to check permissions on one of them. Check permissions using the
	513	+ * credentials from file open to protect against inherited fd attacks.
527	514	*/
528		- cred = current_cred();
	515	+ cred = of->file->f_cred;
529	516	tcred = get_task_cred(task);
530	517	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
531	518	!uid_eq(cred->euid, tcred->uid) &&
..	..	@@ -537,9 +524,10 @@
537	524	goto out_finish;
538	525
539	526	ret = cgroup_attach_task(cgrp, task, threadgroup);
	527	+ trace_android_vh_cgroup_set_task(ret, task);
540	528
541	529	out_finish:
542		- cgroup_procs_write_finish(task);
	530	+ cgroup_procs_write_finish(task, locked);
543	531	out_unlock:
544	532	cgroup_kn_unlock(of->kn);
545	533
..	..	@@ -562,6 +550,7 @@
562	550	char *buf, size_t nbytes, loff_t off)
563	551	{
564	552	struct cgroup *cgrp;
	553	+ struct cgroup_file_ctx *ctx;
565	554
566	555	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
567	556
..	..	@@ -569,8 +558,9 @@
569	558	* Release agent gets called with all capabilities,
570	559	* require capabilities to set release agent.
571	560	*/
572		- if ((of->file->f_cred->user_ns != &init_user_ns) \|\|
573		- !capable(CAP_SYS_ADMIN))
	561	+ ctx = of->priv;
	562	+ if ((ctx->ns->user_ns != &init_user_ns) \|\|
	563	+ !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
574	564	return -EPERM;
575	565
576	566	cgrp = cgroup_kn_lock_live(of->kn, false);
..	..	@@ -800,22 +790,29 @@
800	790	{
801	791	struct cgroup *cgrp =
802	792	container_of(work, struct cgroup, release_agent_work);
803		- char *pathbuf = NULL, *agentbuf = NULL;
	793	+ char *pathbuf, *agentbuf;
804	794	char *argv[3], *envp[3];
805	795	int ret;
806	796
807		- mutex_lock(&cgroup_mutex);
	797	+ /* snoop agent path and exit early if empty */
	798	+ if (!cgrp->root->release_agent_path[0])
	799	+ return;
808	800
	801	+ /* prepare argument buffers */
809	802	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
810		- agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
811		- if (!pathbuf \|\| !agentbuf \|\| !strlen(agentbuf))
812		- goto out;
	803	+ agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
	804	+ if (!pathbuf \|\| !agentbuf)
	805	+ goto out_free;
813	806
814		- spin_lock_irq(&css_set_lock);
815		- ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
816		- spin_unlock_irq(&css_set_lock);
	807	+ spin_lock(&release_agent_path_lock);
	808	+ strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
	809	+ spin_unlock(&release_agent_path_lock);
	810	+ if (!agentbuf[0])
	811	+ goto out_free;
	812	+
	813	+ ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
817	814	if (ret < 0 \|\| ret >= PATH_MAX)
818		- goto out;
	815	+ goto out_free;
819	816
820	817	argv[0] = agentbuf;
821	818	argv[1] = pathbuf;
..	..	@@ -826,11 +823,7 @@
826	823	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
827	824	envp[2] = NULL;
828	825
829		- mutex_unlock(&cgroup_mutex);
830	826	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
831		- goto out_free;
832		-out:
833		- mutex_unlock(&cgroup_mutex);
834	827	out_free:
835	828	kfree(agentbuf);
836	829	kfree(pathbuf);
..	..	@@ -904,179 +897,203 @@
904	897	return 0;
905	898	}
906	899
907		-static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
	900	+enum cgroup1_param {
	901	+ Opt_all,
	902	+ Opt_clone_children,
	903	+ Opt_cpuset_v2_mode,
	904	+ Opt_name,
	905	+ Opt_none,
	906	+ Opt_noprefix,
	907	+ Opt_release_agent,
	908	+ Opt_xattr,
	909	+};
	910	+
	911	+const struct fs_parameter_spec cgroup1_fs_parameters[] = {
	912	+ fsparam_flag ("all", Opt_all),
	913	+ fsparam_flag ("clone_children", Opt_clone_children),
	914	+ fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
	915	+ fsparam_string("name", Opt_name),
	916	+ fsparam_flag ("none", Opt_none),
	917	+ fsparam_flag ("noprefix", Opt_noprefix),
	918	+ fsparam_string("release_agent", Opt_release_agent),
	919	+ fsparam_flag ("xattr", Opt_xattr),
	920	+ {}
	921	+};
	922	+
	923	+int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
908	924	{
909		- char *token, *o = data;
910		- bool all_ss = false, one_ss = false;
911		- u16 mask = U16_MAX;
	925	+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
912	926	struct cgroup_subsys *ss;
913		- int nr_opts = 0;
	927	+ struct fs_parse_result result;
	928	+ int opt, i;
	929	+
	930	+ opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
	931	+ if (opt == -ENOPARAM) {
	932	+ if (strcmp(param->key, "source") == 0) {
	933	+ if (param->type != fs_value_is_string)
	934	+ return invalf(fc, "Non-string source");
	935	+ if (fc->source)
	936	+ return invalf(fc, "Multiple sources not supported");
	937	+ fc->source = param->string;
	938	+ param->string = NULL;
	939	+ return 0;
	940	+ }
	941	+ for_each_subsys(ss, i) {
	942	+ if (strcmp(param->key, ss->legacy_name))
	943	+ continue;
	944	+ if (!cgroup_ssid_enabled(i) \|\| cgroup1_ssid_disabled(i))
	945	+ return invalfc(fc, "Disabled controller '%s'",
	946	+ param->key);
	947	+ ctx->subsys_mask \|= (1 << i);
	948	+ return 0;
	949	+ }
	950	+ return invalfc(fc, "Unknown subsys name '%s'", param->key);
	951	+ }
	952	+ if (opt < 0)
	953	+ return opt;
	954	+
	955	+ switch (opt) {
	956	+ case Opt_none:
	957	+ /* Explicitly have no subsystems */
	958	+ ctx->none = true;
	959	+ break;
	960	+ case Opt_all:
	961	+ ctx->all_ss = true;
	962	+ break;
	963	+ case Opt_noprefix:
	964	+ ctx->flags \|= CGRP_ROOT_NOPREFIX;
	965	+ break;
	966	+ case Opt_clone_children:
	967	+ ctx->cpuset_clone_children = true;
	968	+ break;
	969	+ case Opt_cpuset_v2_mode:
	970	+ ctx->flags \|= CGRP_ROOT_CPUSET_V2_MODE;
	971	+ break;
	972	+ case Opt_xattr:
	973	+ ctx->flags \|= CGRP_ROOT_XATTR;
	974	+ break;
	975	+ case Opt_release_agent:
	976	+ /* Specifying two release agents is forbidden */
	977	+ if (ctx->release_agent)
	978	+ return invalfc(fc, "release_agent respecified");
	979	+ /*
	980	+ * Release agent gets called with all capabilities,
	981	+ * require capabilities to set release agent.
	982	+ */
	983	+ if ((fc->user_ns != &init_user_ns) \|\| !capable(CAP_SYS_ADMIN))
	984	+ return invalfc(fc, "Setting release_agent not allowed");
	985	+ ctx->release_agent = param->string;
	986	+ param->string = NULL;
	987	+ break;
	988	+ case Opt_name:
	989	+ /* blocked by boot param? */
	990	+ if (cgroup_no_v1_named)
	991	+ return -ENOENT;
	992	+ /* Can't specify an empty name */
	993	+ if (!param->size)
	994	+ return invalfc(fc, "Empty name");
	995	+ if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
	996	+ return invalfc(fc, "Name too long");
	997	+ /* Must match [\w.-]+ */
	998	+ for (i = 0; i < param->size; i++) {
	999	+ char c = param->string[i];
	1000	+ if (isalnum(c))
	1001	+ continue;
	1002	+ if ((c == '.') \|\| (c == '-') \|\| (c == '_'))
	1003	+ continue;
	1004	+ return invalfc(fc, "Invalid name");
	1005	+ }
	1006	+ /* Specifying two names is forbidden */
	1007	+ if (ctx->name)
	1008	+ return invalfc(fc, "name respecified");
	1009	+ ctx->name = param->string;
	1010	+ param->string = NULL;
	1011	+ break;
	1012	+ }
	1013	+ return 0;
	1014	+}
	1015	+
	1016	+static int check_cgroupfs_options(struct fs_context *fc)
	1017	+{
	1018	+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	1019	+ u16 mask = U16_MAX;
	1020	+ u16 enabled = 0;
	1021	+ struct cgroup_subsys *ss;
914	1022	int i;
915	1023
916	1024	#ifdef CONFIG_CPUSETS
917	1025	mask = ~((u16)1 << cpuset_cgrp_id);
918	1026	#endif
	1027	+ for_each_subsys(ss, i)
	1028	+ if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
	1029	+ enabled \|= 1 << i;
919	1030
920		- memset(opts, 0, sizeof(*opts));
921		-
922		- while ((token = strsep(&o, ",")) != NULL) {
923		- nr_opts++;
924		-
925		- if (!*token)
926		- return -EINVAL;
927		- if (!strcmp(token, "none")) {
928		- /* Explicitly have no subsystems */
929		- opts->none = true;
930		- continue;
931		- }
932		- if (!strcmp(token, "all")) {
933		- /* Mutually exclusive option 'all' + subsystem name */
934		- if (one_ss)
935		- return -EINVAL;
936		- all_ss = true;
937		- continue;
938		- }
939		- if (!strcmp(token, "noprefix")) {
940		- opts->flags \|= CGRP_ROOT_NOPREFIX;
941		- continue;
942		- }
943		- if (!strcmp(token, "clone_children")) {
944		- opts->cpuset_clone_children = true;
945		- continue;
946		- }
947		- if (!strcmp(token, "cpuset_v2_mode")) {
948		- opts->flags \|= CGRP_ROOT_CPUSET_V2_MODE;
949		- continue;
950		- }
951		- if (!strcmp(token, "xattr")) {
952		- opts->flags \|= CGRP_ROOT_XATTR;
953		- continue;
954		- }
955		- if (!strncmp(token, "release_agent=", 14)) {
956		- /* Specifying two release agents is forbidden */
957		- if (opts->release_agent)
958		- return -EINVAL;
959		- opts->release_agent =
960		- kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
961		- if (!opts->release_agent)
962		- return -ENOMEM;
963		- continue;
964		- }
965		- if (!strncmp(token, "name=", 5)) {
966		- const char *name = token + 5;
967		-
968		- /* blocked by boot param? */
969		- if (cgroup_no_v1_named)
970		- return -ENOENT;
971		- /* Can't specify an empty name */
972		- if (!strlen(name))
973		- return -EINVAL;
974		- /* Must match [\w.-]+ */
975		- for (i = 0; i < strlen(name); i++) {
976		- char c = name[i];
977		- if (isalnum(c))
978		- continue;
979		- if ((c == '.') \|\| (c == '-') \|\| (c == '_'))
980		- continue;
981		- return -EINVAL;
982		- }
983		- /* Specifying two names is forbidden */
984		- if (opts->name)
985		- return -EINVAL;
986		- opts->name = kstrndup(name,
987		- MAX_CGROUP_ROOT_NAMELEN - 1,
988		- GFP_KERNEL);
989		- if (!opts->name)
990		- return -ENOMEM;
991		-
992		- continue;
993		- }
994		-
995		- for_each_subsys(ss, i) {
996		- if (strcmp(token, ss->legacy_name))
997		- continue;
998		- if (!cgroup_ssid_enabled(i))
999		- continue;
1000		- if (cgroup1_ssid_disabled(i))
1001		- continue;
1002		-
1003		- /* Mutually exclusive option 'all' + subsystem name */
1004		- if (all_ss)
1005		- return -EINVAL;
1006		- opts->subsys_mask \|= (1 << i);
1007		- one_ss = true;
1008		-
1009		- break;
1010		- }
1011		- if (i == CGROUP_SUBSYS_COUNT)
1012		- return -ENOENT;
1013		- }
	1031	+ ctx->subsys_mask &= enabled;
1014	1032
1015	1033	/*
1016		- * If the 'all' option was specified select all the subsystems,
1017		- * otherwise if 'none', 'name=' and a subsystem name options were
1018		- * not specified, let's default to 'all'
	1034	+ * In absence of 'none', 'name=' or subsystem name options,
	1035	+ * let's default to 'all'.
1019	1036	*/
1020		- if (all_ss \|\| (!one_ss && !opts->none && !opts->name))
1021		- for_each_subsys(ss, i)
1022		- if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
1023		- opts->subsys_mask \|= (1 << i);
	1037	+ if (!ctx->subsys_mask && !ctx->none && !ctx->name)
	1038	+ ctx->all_ss = true;
	1039	+
	1040	+ if (ctx->all_ss) {
	1041	+ /* Mutually exclusive option 'all' + subsystem name */
	1042	+ if (ctx->subsys_mask)
	1043	+ return invalfc(fc, "subsys name conflicts with all");
	1044	+ /* 'all' => select all the subsystems */
	1045	+ ctx->subsys_mask = enabled;
	1046	+ }
1024	1047
1025	1048	/*
1026	1049	* We either have to specify by name or by subsystems. (So all
1027	1050	* empty hierarchies must have a name).
1028	1051	*/
1029		- if (!opts->subsys_mask && !opts->name)
1030		- return -EINVAL;
	1052	+ if (!ctx->subsys_mask && !ctx->name)
	1053	+ return invalfc(fc, "Need name or subsystem set");
1031	1054
1032	1055	/*
1033	1056	* Option noprefix was introduced just for backward compatibility
1034	1057	* with the old cpuset, so we allow noprefix only if mounting just
1035	1058	* the cpuset subsystem.
1036	1059	*/
1037		- if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1038		- return -EINVAL;
	1060	+ if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
	1061	+ return invalfc(fc, "noprefix used incorrectly");
1039	1062
1040	1063	/* Can't specify "none" and some subsystems */
1041		- if (opts->subsys_mask && opts->none)
1042		- return -EINVAL;
	1064	+ if (ctx->subsys_mask && ctx->none)
	1065	+ return invalfc(fc, "none used incorrectly");
1043	1066
1044	1067	return 0;
1045	1068	}
1046	1069
1047		-static int cgroup1_remount(struct kernfs_root *kf_root, int flags, char *data)
	1070	+int cgroup1_reconfigure(struct fs_context *fc)
1048	1071	{
1049		- int ret = 0;
	1072	+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	1073	+ struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
1050	1074	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1051		- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
1052		- struct cgroup_sb_opts opts;
	1075	+ int ret = 0;
1053	1076	u16 added_mask, removed_mask;
1054	1077
1055	1078	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1056	1079
1057	1080	/* See what subsystems are wanted */
1058		- ret = parse_cgroupfs_options(data, &opts);
	1081	+ ret = check_cgroupfs_options(fc);
1059	1082	if (ret)
1060	1083	goto out_unlock;
1061	1084
1062		- if (opts.subsys_mask != root->subsys_mask \|\| opts.release_agent)
	1085	+ if (ctx->subsys_mask != root->subsys_mask \|\| ctx->release_agent)
1063	1086	pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1064	1087	task_tgid_nr(current), current->comm);
1065		- /* See cgroup1_mount release_agent handling */
1066		- if (opts.release_agent &&
1067		- ((ns->user_ns != &init_user_ns) \|\| !capable(CAP_SYS_ADMIN))) {
1068		- ret = -EINVAL;
1069		- goto out_unlock;
1070		- }
1071	1088
1072		- added_mask = opts.subsys_mask & ~root->subsys_mask;
1073		- removed_mask = root->subsys_mask & ~opts.subsys_mask;
	1089	+ added_mask = ctx->subsys_mask & ~root->subsys_mask;
	1090	+ removed_mask = root->subsys_mask & ~ctx->subsys_mask;
1074	1091
1075	1092	/* Don't allow flags or name to change at remount */
1076		- if ((opts.flags ^ root->flags) \|\|
1077		- (opts.name && strcmp(opts.name, root->name))) {
1078		- pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1079		- opts.flags, opts.name ?: "", root->flags, root->name);
	1093	+ if ((ctx->flags ^ root->flags) \|\|
	1094	+ (ctx->name && strcmp(ctx->name, root->name))) {
	1095	+ errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
	1096	+ ctx->flags, ctx->name ?: "", root->flags, root->name);
1080	1097	ret = -EINVAL;
1081	1098	goto out_unlock;
1082	1099	}
..	..	@@ -1093,17 +1110,15 @@
1093	1110
1094	1111	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
1095	1112
1096		- if (opts.release_agent) {
	1113	+ if (ctx->release_agent) {
1097	1114	spin_lock(&release_agent_path_lock);
1098		- strcpy(root->release_agent_path, opts.release_agent);
	1115	+ strcpy(root->release_agent_path, ctx->release_agent);
1099	1116	spin_unlock(&release_agent_path_lock);
1100	1117	}
1101	1118
1102	1119	trace_cgroup_remount(root);
1103	1120
1104	1121	out_unlock:
1105		- kfree(opts.release_agent);
1106		- kfree(opts.name);
1107	1122	mutex_unlock(&cgroup_mutex);
1108	1123	return ret;
1109	1124	}
..	..	@@ -1111,28 +1126,30 @@
1111	1126	struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
1112	1127	.rename = cgroup1_rename,
1113	1128	.show_options = cgroup1_show_options,
1114		- .remount_fs = cgroup1_remount,
1115	1129	.mkdir = cgroup_mkdir,
1116	1130	.rmdir = cgroup_rmdir,
1117	1131	.show_path = cgroup_show_path,
1118	1132	};
1119	1133
1120		-struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
1121		- void *data, unsigned long magic,
1122		- struct cgroup_namespace *ns)
	1134	+/*
	1135	+ * The guts of cgroup1 mount - find or create cgroup_root to use.
	1136	+ * Called with cgroup_mutex held; returns 0 on success, -E... on
	1137	+ * error, and a positive value when the candidate is busy dying.
	1138	+ * On success it stashes a reference to cgroup_root into given
	1139	+ * cgroup_fs_context; that reference is NOT counting towards the
	1140	+ * cgroup_root refcount.
	1141	+ */
	1142	+static int cgroup1_root_to_use(struct fs_context *fc)
1123	1143	{
1124		- struct cgroup_sb_opts opts;
	1144	+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1125	1145	struct cgroup_root *root;
1126	1146	struct cgroup_subsys *ss;
1127		- struct dentry *dentry;
1128	1147	int i, ret;
1129	1148
1130		- cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1131		-
1132	1149	/* First find the desired set of subsystems */
1133		- ret = parse_cgroupfs_options(data, &opts);
	1150	+ ret = check_cgroupfs_options(fc);
1134	1151	if (ret)
1135		- goto out_unlock;
	1152	+ return ret;
1136	1153
1137	1154	/*
1138	1155	* Destruction of cgroup root is asynchronous, so subsystems may
..	..	@@ -1142,16 +1159,12 @@
1142	1159	* starting. Testing ref liveliness is good enough.
1143	1160	*/
1144	1161	for_each_subsys(ss, i) {
1145		- if (!(opts.subsys_mask & (1 << i)) \|\|
	1162	+ if (!(ctx->subsys_mask & (1 << i)) \|\|
1146	1163	ss->root == &cgrp_dfl_root)
1147	1164	continue;
1148	1165
1149		- if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
1150		- mutex_unlock(&cgroup_mutex);
1151		- msleep(10);
1152		- ret = restart_syscall();
1153		- goto out_free;
1154		- }
	1166	+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
	1167	+ return 1; /* restart */
1155	1168	cgroup_put(&ss->root->cgrp);
1156	1169	}
1157	1170
..	..	@@ -1166,8 +1179,8 @@
1166	1179	* name matches but subsys_mask doesn't, we should fail.
1167	1180	* Remember whether name matched.
1168	1181	*/
1169		- if (opts.name) {
1170		- if (strcmp(opts.name, root->name))
	1182	+ if (ctx->name) {
	1183	+ if (strcmp(ctx->name, root->name))
1171	1184	continue;
1172	1185	name_match = true;
1173	1186	}
..	..	@@ -1176,19 +1189,18 @@
1176	1189	* If we asked for subsystems (or explicitly for no
1177	1190	* subsystems) then they must match.
1178	1191	*/
1179		- if ((opts.subsys_mask \|\| opts.none) &&
1180		- (opts.subsys_mask != root->subsys_mask)) {
	1192	+ if ((ctx->subsys_mask \|\| ctx->none) &&
	1193	+ (ctx->subsys_mask != root->subsys_mask)) {
1181	1194	if (!name_match)
1182	1195	continue;
1183		- ret = -EBUSY;
1184		- goto out_unlock;
	1196	+ return -EBUSY;
1185	1197	}
1186	1198
1187		- if (root->flags ^ opts.flags)
	1199	+ if (root->flags ^ ctx->flags)
1188	1200	pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1189	1201
1190		- ret = 0;
1191		- goto out_unlock;
	1202	+ ctx->root = root;
	1203	+ return 0;
1192	1204	}
1193	1205
1194	1206	/*
..	..	@@ -1196,64 +1208,56 @@
1196	1208	* specification is allowed for already existing hierarchies but we
1197	1209	* can't create new one without subsys specification.
1198	1210	*/
1199		- if (!opts.subsys_mask && !opts.none) {
1200		- ret = -EINVAL;
1201		- goto out_unlock;
1202		- }
	1211	+ if (!ctx->subsys_mask && !ctx->none)
	1212	+ return invalfc(fc, "No subsys list or none specified");
1203	1213
1204	1214	/* Hierarchies may only be created in the initial cgroup namespace. */
1205		- if (ns != &init_cgroup_ns) {
1206		- ret = -EPERM;
1207		- goto out_unlock;
1208		- }
1209		- /*
1210		- * Release agent gets called with all capabilities,
1211		- * require capabilities to set release agent.
1212		- */
1213		- if (opts.release_agent &&
1214		- ((ns->user_ns != &init_user_ns) \|\| !capable(CAP_SYS_ADMIN))) {
1215		- ret = -EINVAL;
1216		- goto out_unlock;
1217		- }
	1215	+ if (ctx->ns != &init_cgroup_ns)
	1216	+ return -EPERM;
1218	1217
1219	1218	root = kzalloc(sizeof(*root), GFP_KERNEL);
1220		- if (!root) {
1221		- ret = -ENOMEM;
1222		- goto out_unlock;
1223		- }
	1219	+ if (!root)
	1220	+ return -ENOMEM;
1224	1221
1225		- init_cgroup_root(root, &opts);
	1222	+ ctx->root = root;
	1223	+ init_cgroup_root(ctx);
1226	1224
1227		- ret = cgroup_setup_root(root, opts.subsys_mask);
	1225	+ ret = cgroup_setup_root(root, ctx->subsys_mask);
1228	1226	if (ret)
1229	1227	cgroup_free_root(root);
	1228	+ return ret;
	1229	+}
1230	1230
1231		-out_unlock:
1232		- if (!ret && !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
1233		- mutex_unlock(&cgroup_mutex);
1234		- msleep(10);
1235		- ret = restart_syscall();
1236		- goto out_free;
1237		- }
	1231	+int cgroup1_get_tree(struct fs_context *fc)
	1232	+{
	1233	+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	1234	+ int ret;
	1235	+
	1236	+ /* Check if the caller has permission to mount. */
	1237	+ if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
	1238	+ return -EPERM;
	1239	+
	1240	+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
	1241	+
	1242	+ ret = cgroup1_root_to_use(fc);
	1243	+ if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
	1244	+ ret = 1; /* restart */
	1245	+
1238	1246	mutex_unlock(&cgroup_mutex);
1239		-out_free:
1240		- kfree(opts.release_agent);
1241		- kfree(opts.name);
1242	1247
1243		- if (ret)
1244		- return ERR_PTR(ret);
	1248	+ if (!ret)
	1249	+ ret = cgroup_do_get_tree(fc);
1245	1250
1246		- dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
1247		- CGROUP_SUPER_MAGIC, ns);
1248		-
1249		- if (!IS_ERR(dentry) && percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
1250		- struct super_block *sb = dentry->d_sb;
1251		- dput(dentry);
1252		- deactivate_locked_super(sb);
1253		- msleep(10);
1254		- dentry = ERR_PTR(restart_syscall());
	1251	+ if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
	1252	+ fc_drop_locked(fc);
	1253	+ ret = 1;
1255	1254	}
1256		- return dentry;
	1255	+
	1256	+ if (unlikely(ret > 0)) {
	1257	+ msleep(10);
	1258	+ return restart_syscall();
	1259	+ }
	1260	+ return ret;
1257	1261	}
1258	1262
1259	1263	static int __init cgroup1_wq_init(void)