.. | .. |
---|
54 | 54 | #include <linux/proc_ns.h> |
---|
55 | 55 | #include <linux/nsproxy.h> |
---|
56 | 56 | #include <linux/file.h> |
---|
| 57 | +#include <linux/fs_parser.h> |
---|
57 | 58 | #include <linux/sched/cputime.h> |
---|
58 | 59 | #include <linux/psi.h> |
---|
59 | 60 | #include <net/sock.h> |
---|
60 | 61 | |
---|
61 | 62 | #define CREATE_TRACE_POINTS |
---|
62 | 63 | #include <trace/events/cgroup.h> |
---|
| 64 | +#undef CREATE_TRACE_POINTS |
---|
| 65 | + |
---|
| 66 | +#include <trace/hooks/cgroup.h> |
---|
63 | 67 | |
---|
64 | 68 | #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ |
---|
65 | 69 | MAX_CFTYPE_NAME + 2) |
---|
.. | .. |
---|
86 | 90 | |
---|
87 | 91 | DEFINE_SPINLOCK(trace_cgroup_path_lock); |
---|
88 | 92 | char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; |
---|
| 93 | +bool cgroup_debug __read_mostly; |
---|
89 | 94 | |
---|
90 | 95 | /* |
---|
91 | 96 | * Protects cgroup_idr and css_idr so that IDs can be released without |
---|
.. | .. |
---|
99 | 104 | */ |
---|
100 | 105 | static DEFINE_SPINLOCK(cgroup_file_kn_lock); |
---|
101 | 106 | |
---|
102 | | -struct percpu_rw_semaphore cgroup_threadgroup_rwsem; |
---|
| 107 | +DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); |
---|
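The hunk above replaces the runtime-initialized `struct percpu_rw_semaphore` with the static `DEFINE_PERCPU_RWSEM()` initializer, making the lock usable before any explicit init call runs. For orientation, a minimal sketch of the read/write pattern this lock serves (`example_rwsem`, `reader_path` and `writer_path` are illustrative names, not kernel symbols):

```c
#include <linux/percpu-rwsem.h>

/* Statically initialized, as cgroup_threadgroup_rwsem is after this hunk. */
static DEFINE_PERCPU_RWSEM(example_rwsem);

static void reader_path(void)
{
	percpu_down_read(&example_rwsem);	/* per-CPU fast path, very cheap */
	/* read-side section, e.g. fork() linking a new task */
	percpu_up_read(&example_rwsem);
}

static void writer_path(void)
{
	percpu_down_write(&example_rwsem);	/* slow: drains all readers */
	/* write-side section, e.g. migrating a whole threadgroup */
	percpu_up_write(&example_rwsem);
}
```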
103 | 108 | |
---|
104 | 109 | #define cgroup_assert_mutex_or_rcu_locked() \ |
---|
105 | 110 | RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ |
---|
.. | .. |
---|
151 | 156 | |
---|
152 | 157 | static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); |
---|
153 | 158 | |
---|
154 | | -/* |
---|
155 | | - * The default hierarchy, reserved for the subsystems that are otherwise |
---|
156 | | - * unattached - it never has more than a single cgroup, and all tasks are |
---|
157 | | - * part of that cgroup. |
---|
158 | | - */ |
---|
| 159 | +/* the default hierarchy */ |
---|
159 | 160 | struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; |
---|
160 | 161 | EXPORT_SYMBOL_GPL(cgrp_dfl_root); |
---|
161 | 162 | |
---|
.. | .. |
---|
264 | 265 | * can be used to test whether a cgroup is on the default hierarchy for |
---|
265 | 266 | * cases where a subsystem should behave differently depending on the |
---|
266 | 267 | * interface version. |
---|
267 | | - * |
---|
268 | | - * The set of behaviors which change on the default hierarchy are still |
---|
269 | | - * being determined and the mount option is prefixed with __DEVEL__. |
---|
270 | 268 | * |
---|
271 | 269 | * List of changed behaviors: |
---|
272 | 270 | * |
---|
.. | .. |
---|
502 | 500 | |
---|
503 | 501 | rcu_read_lock(); |
---|
504 | 502 | css = cgroup_css(cgrp, ss); |
---|
505 | | - if (!css || !css_tryget_online(css)) |
---|
| 503 | + if (css && !css_tryget_online(css)) |
---|
506 | 504 | css = NULL; |
---|
507 | 505 | rcu_read_unlock(); |
---|
508 | 506 | |
---|
.. | .. |
---|
510 | 508 | } |
---|
511 | 509 | |
---|
512 | 510 | /** |
---|
513 | | - * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem |
---|
| 511 | + * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss |
---|
514 | 512 | * @cgrp: the cgroup of interest |
---|
515 | 513 | * @ss: the subsystem of interest (%NULL returns @cgrp->self) |
---|
516 | 514 | * |
---|
.. | .. |
---|
519 | 517 | * enabled. If @ss is associated with the hierarchy @cgrp is on, this |
---|
520 | 518 | * function is guaranteed to return non-NULL css. |
---|
521 | 519 | */ |
---|
522 | | -static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, |
---|
523 | | - struct cgroup_subsys *ss) |
---|
| 520 | +static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, |
---|
| 521 | + struct cgroup_subsys *ss) |
---|
524 | 522 | { |
---|
525 | 523 | lockdep_assert_held(&cgroup_mutex); |
---|
526 | 524 | |
---|
.. | .. |
---|
538 | 536 | } |
---|
539 | 537 | |
---|
540 | 538 | return cgroup_css(cgrp, ss); |
---|
| 539 | +} |
---|
| 540 | + |
---|
| 541 | +/** |
---|
| 542 | + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem |
---|
| 543 | + * @cgrp: the cgroup of interest |
---|
| 544 | + * @ss: the subsystem of interest |
---|
| 545 | + * |
---|
| 546 | + * Find and get the effective css of @cgrp for @ss. The effective css is |
---|
| 547 | + * defined as the matching css of the nearest ancestor including self which |
---|
| 548 | + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, |
---|
| 549 | + * the root css is returned, so this function always returns a valid css. |
---|
| 550 | + * |
---|
| 551 | + * The returned css is not guaranteed to be online, and therefore it is the |
---|
| 552 | + * caller's responsibility to tryget a reference for it. |
---|
| 553 | + */ |
---|
| 554 | +struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, |
---|
| 555 | + struct cgroup_subsys *ss) |
---|
| 556 | +{ |
---|
| 557 | + struct cgroup_subsys_state *css; |
---|
| 558 | + |
---|
| 559 | + do { |
---|
| 560 | + css = cgroup_css(cgrp, ss); |
---|
| 561 | + |
---|
| 562 | + if (css) |
---|
| 563 | + return css; |
---|
| 564 | + cgrp = cgroup_parent(cgrp); |
---|
| 565 | + } while (cgrp); |
---|
| 566 | + |
---|
| 567 | + return init_css_set.subsys[ss->id]; |
---|
541 | 568 | } |
---|
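Per the new kerneldoc, `cgroup_e_css()` always returns a valid css but not necessarily an online one, so pinning it falls to the caller. A hedged sketch of the intended calling pattern (`my_use_css` is a made-up helper, not a kernel function):

```c
static void my_use_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_e_css(cgrp, ss);	/* never NULL, but may be offline */
	if (!css_tryget_online(css))
		css = NULL;		/* raced with offlining; bail out */
	rcu_read_unlock();

	if (css) {
		/* ... safe to dereference while the reference is held ... */
		css_put(css);
	}
}
```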
542 | 569 | |
---|
543 | 570 | /** |
---|
.. | .. |
---|
655 | 682 | * |
---|
656 | 683 | * Should be called under cgroup_[tree_]mutex. |
---|
657 | 684 | */ |
---|
658 | | -#define for_each_e_css(css, ssid, cgrp) \ |
---|
659 | | - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ |
---|
660 | | - if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ |
---|
661 | | - ; \ |
---|
| 685 | +#define for_each_e_css(css, ssid, cgrp) \ |
---|
| 686 | + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ |
---|
| 687 | + if (!((css) = cgroup_e_css_by_mask(cgrp, \ |
---|
| 688 | + cgroup_subsys[(ssid)]))) \ |
---|
| 689 | + ; \ |
---|
662 | 690 | else |
---|
663 | 691 | |
---|
664 | 692 | /** |
---|
.. | .. |
---|
718 | 746 | * reference-counted, to improve performance when child cgroups |
---|
719 | 747 | * haven't been created. |
---|
720 | 748 | */ |
---|
721 | | -struct css_set init_css_set = { |
---|
722 | | - .refcount = REFCOUNT_INIT(1), |
---|
723 | | - .dom_cset = &init_css_set, |
---|
724 | | - .tasks = LIST_HEAD_INIT(init_css_set.tasks), |
---|
725 | | - .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), |
---|
726 | | - .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), |
---|
727 | | - .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), |
---|
728 | | - .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), |
---|
729 | | - .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), |
---|
730 | | - .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), |
---|
731 | | - .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), |
---|
732 | | - |
---|
733 | | - /* |
---|
734 | | - * The following field is re-initialized when this cset gets linked |
---|
735 | | - * in cgroup_init(). However, let's initialize the field |
---|
736 | | - * statically too so that the default cgroup can be accessed safely |
---|
737 | | - * early during boot. |
---|
738 | | - */ |
---|
739 | | - .dfl_cgrp = &cgrp_dfl_root.cgrp, |
---|
| 749 | +struct ext_css_set init_ext_css_set = { |
---|
| 750 | + .cset = { |
---|
| 751 | + .refcount = REFCOUNT_INIT(1), |
---|
| 752 | + .dom_cset = &init_css_set, |
---|
| 753 | + .tasks = LIST_HEAD_INIT(init_css_set.tasks), |
---|
| 754 | + .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), |
---|
| 755 | + .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), |
---|
| 756 | + .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), |
---|
| 757 | + .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), |
---|
| 758 | + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), |
---|
| 759 | + .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), |
---|
| 760 | + .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), |
---|
| 761 | + /* |
---|
| 762 | + * The following field is re-initialized when this cset gets linked |
---|
| 763 | + * in cgroup_init(). However, let's initialize the field |
---|
| 764 | + * statically too so that the default cgroup can be accessed safely |
---|
| 765 | + * early during boot. |
---|
| 766 | + */ |
---|
| 767 | + .dfl_cgrp = &cgrp_dfl_root.cgrp, |
---|
| 768 | + }, |
---|
| 769 | + .mg_src_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_src_preload_node), |
---|
| 770 | + .mg_dst_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_dst_preload_node), |
---|
740 | 771 | }; |
---|
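This hunk shows the vendor-extension pattern: `struct css_set` is embedded as the first member of `struct ext_css_set`, so extra list heads can be added without touching the core structure, and any existing `struct css_set *` can be mapped back to its wrapper. A schematic of the recovery step the later hunks rely on (`ext_css_set_of` is an illustrative name; the diff open-codes `container_of()` instead):

```c
#include <linux/kernel.h>

/*
 * container_of() works for any member offset; putting "cset" first
 * merely keeps the offset zero so plain pointer casts would also work.
 */
static inline struct ext_css_set *ext_css_set_of(struct css_set *cset)
{
	return container_of(cset, struct ext_css_set, cset);
}
```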
741 | 772 | |
---|
742 | 773 | static int css_set_count = 1; /* 1 for init_css_set */ |
---|
.. | .. |
---|
802 | 833 | break; |
---|
803 | 834 | |
---|
804 | 835 | cgroup1_check_for_release(cgrp); |
---|
| 836 | + TRACE_CGROUP_PATH(notify_populated, cgrp, |
---|
| 837 | + cgroup_is_populated(cgrp)); |
---|
805 | 838 | cgroup_file_notify(&cgrp->events_file); |
---|
806 | 839 | |
---|
807 | 840 | child = cgrp; |
---|
.. | .. |
---|
881 | 914 | /* |
---|
882 | 915 | * We are synchronized through cgroup_threadgroup_rwsem |
---|
883 | 916 | * against PF_EXITING setting such that we can't race |
---|
884 | | - * against cgroup_exit() changing the css_set to |
---|
885 | | - * init_css_set and dropping the old one. |
---|
| 917 | + * against cgroup_exit()/cgroup_free() dropping the css_set. |
---|
886 | 918 | */ |
---|
887 | 919 | WARN_ON_ONCE(task->flags & PF_EXITING); |
---|
888 | 920 | |
---|
.. | .. |
---|
1060 | 1092 | * @ss is in this hierarchy, so we want the |
---|
1061 | 1093 | * effective css from @cgrp. |
---|
1062 | 1094 | */ |
---|
1063 | | - template[i] = cgroup_e_css(cgrp, ss); |
---|
| 1095 | + template[i] = cgroup_e_css_by_mask(cgrp, ss); |
---|
1064 | 1096 | } else { |
---|
1065 | 1097 | /* |
---|
1066 | 1098 | * @ss is not in this hierarchy, so we don't want |
---|
.. | .. |
---|
1162 | 1194 | struct cgroup *cgrp) |
---|
1163 | 1195 | { |
---|
1164 | 1196 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; |
---|
| 1197 | + struct ext_css_set *ext_cset; |
---|
1165 | 1198 | struct css_set *cset; |
---|
1166 | 1199 | struct list_head tmp_links; |
---|
1167 | 1200 | struct cgrp_cset_link *link; |
---|
.. | .. |
---|
1182 | 1215 | if (cset) |
---|
1183 | 1216 | return cset; |
---|
1184 | 1217 | |
---|
1185 | | - cset = kzalloc(sizeof(*cset), GFP_KERNEL); |
---|
1186 | | - if (!cset) |
---|
| 1218 | + ext_cset = kzalloc(sizeof(*ext_cset), GFP_KERNEL); |
---|
| 1219 | + if (!ext_cset) |
---|
1187 | 1220 | return NULL; |
---|
| 1221 | + cset = &ext_cset->cset; |
---|
1188 | 1222 | |
---|
1189 | 1223 | /* Allocate all the cgrp_cset_link objects that we'll need */ |
---|
1190 | 1224 | if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { |
---|
.. | .. |
---|
1202 | 1236 | INIT_HLIST_NODE(&cset->hlist); |
---|
1203 | 1237 | INIT_LIST_HEAD(&cset->cgrp_links); |
---|
1204 | 1238 | INIT_LIST_HEAD(&cset->mg_preload_node); |
---|
| 1239 | + INIT_LIST_HEAD(&ext_cset->mg_src_preload_node); |
---|
| 1240 | + INIT_LIST_HEAD(&ext_cset->mg_dst_preload_node); |
---|
1205 | 1241 | INIT_LIST_HEAD(&cset->mg_node); |
---|
1206 | 1242 | |
---|
1207 | 1243 | /* Copy the set of subsystem state objects generated in |
---|
.. | .. |
---|
1291 | 1327 | |
---|
1292 | 1328 | void cgroup_free_root(struct cgroup_root *root) |
---|
1293 | 1329 | { |
---|
1294 | | - if (root) { |
---|
1295 | | - idr_destroy(&root->cgroup_idr); |
---|
1296 | | - kfree(root); |
---|
1297 | | - } |
---|
| 1330 | + kfree(root); |
---|
1298 | 1331 | } |
---|
1299 | 1332 | |
---|
1300 | 1333 | static void cgroup_destroy_root(struct cgroup_root *root) |
---|
.. | .. |
---|
1356 | 1389 | cset = current->nsproxy->cgroup_ns->root_cset; |
---|
1357 | 1390 | if (cset == &init_css_set) { |
---|
1358 | 1391 | res = &root->cgrp; |
---|
| 1392 | + } else if (root == &cgrp_dfl_root) { |
---|
| 1393 | + res = cset->dfl_cgrp; |
---|
1359 | 1394 | } else { |
---|
1360 | 1395 | struct cgrp_cset_link *link; |
---|
1361 | 1396 | |
---|
.. | .. |
---|
1412 | 1447 | struct cgroup_root *root) |
---|
1413 | 1448 | { |
---|
1414 | 1449 | /* |
---|
1415 | | - * No need to lock the task - since we hold cgroup_mutex the |
---|
1416 | | - * task can't change groups, so the only thing that can happen |
---|
1417 | | - * is that it exits and its css is set back to init_css_set. |
---|
| 1450 | + * No need to lock the task - since we hold css_set_lock the |
---|
| 1451 | + * task can't change groups. |
---|
1418 | 1452 | */ |
---|
1419 | 1453 | return cset_cgroup_from_root(task_css_set(task), root); |
---|
1420 | 1454 | } |
---|
.. | .. |
---|
1453 | 1487 | struct cgroup_subsys *ss = cft->ss; |
---|
1454 | 1488 | |
---|
1455 | 1489 | if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && |
---|
1456 | | - !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) |
---|
1457 | | - snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", |
---|
1458 | | - cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, |
---|
| 1490 | + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { |
---|
| 1491 | + const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : ""; |
---|
| 1492 | + |
---|
| 1493 | + snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", |
---|
| 1494 | + dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, |
---|
1459 | 1495 | cft->name); |
---|
1460 | | - else |
---|
| 1496 | + } else { |
---|
1461 | 1497 | strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); |
---|
| 1498 | + } |
---|
1462 | 1499 | return buf; |
---|
1463 | 1500 | } |
---|
1464 | 1501 | |
---|
.. | .. |
---|
1815 | 1852 | return len; |
---|
1816 | 1853 | } |
---|
1817 | 1854 | |
---|
1818 | | -static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) |
---|
| 1855 | +enum cgroup2_param { |
---|
| 1856 | + Opt_nsdelegate, |
---|
| 1857 | + Opt_memory_localevents, |
---|
| 1858 | + Opt_memory_recursiveprot, |
---|
| 1859 | + nr__cgroup2_params |
---|
| 1860 | +}; |
---|
| 1861 | + |
---|
| 1862 | +static const struct fs_parameter_spec cgroup2_fs_parameters[] = { |
---|
| 1863 | + fsparam_flag("nsdelegate", Opt_nsdelegate), |
---|
| 1864 | + fsparam_flag("memory_localevents", Opt_memory_localevents), |
---|
| 1865 | + fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), |
---|
| 1866 | + {} |
---|
| 1867 | +}; |
---|
| 1868 | + |
---|
| 1869 | +static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param) |
---|
1819 | 1870 | { |
---|
1820 | | - char *token; |
---|
| 1871 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
| 1872 | + struct fs_parse_result result; |
---|
| 1873 | + int opt; |
---|
1821 | 1874 | |
---|
1822 | | - *root_flags = 0; |
---|
| 1875 | + opt = fs_parse(fc, cgroup2_fs_parameters, param, &result); |
---|
| 1876 | + if (opt < 0) |
---|
| 1877 | + return opt; |
---|
1823 | 1878 | |
---|
1824 | | - if (!data || *data == '\0') |
---|
| 1879 | + switch (opt) { |
---|
| 1880 | + case Opt_nsdelegate: |
---|
| 1881 | + ctx->flags |= CGRP_ROOT_NS_DELEGATE; |
---|
1825 | 1882 | return 0; |
---|
1826 | | - |
---|
1827 | | - while ((token = strsep(&data, ",")) != NULL) { |
---|
1828 | | - if (!strcmp(token, "nsdelegate")) { |
---|
1829 | | - *root_flags |= CGRP_ROOT_NS_DELEGATE; |
---|
1830 | | - continue; |
---|
1831 | | - } |
---|
1832 | | - |
---|
1833 | | - pr_err("cgroup2: unknown option \"%s\"\n", token); |
---|
1834 | | - return -EINVAL; |
---|
| 1883 | + case Opt_memory_localevents: |
---|
| 1884 | + ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; |
---|
| 1885 | + return 0; |
---|
| 1886 | + case Opt_memory_recursiveprot: |
---|
| 1887 | + ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; |
---|
| 1888 | + return 0; |
---|
1835 | 1889 | } |
---|
1836 | | - |
---|
1837 | | - return 0; |
---|
| 1890 | + return -EINVAL; |
---|
1838 | 1891 | } |
---|
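`cgroup2_parse_param()` is invoked once per option by the new mount API, where each key arrives as a discrete `fs_parameter` rather than a comma-separated string to be `strsep()`'d. A hedged userspace sketch of what drives it (the fsopen/fsconfig/fsmount/move_mount syscalls exist since Linux 5.2; error handling is abbreviated, the mount point is an example, and CAP_SYS_ADMIN is required):

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/mount.h>	/* FSCONFIG_*, MOVE_MOUNT_F_EMPTY_PATH */
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int fsfd, mntfd;

	fsfd = syscall(SYS_fsopen, "cgroup2", 0);
	/* each flag reaches cgroup2_parse_param() as one fs_parameter */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "nsdelegate", NULL, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "memory_recursiveprot",
		NULL, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	mntfd = syscall(SYS_fsmount, fsfd, 0, 0);
	syscall(SYS_move_mount, mntfd, "", AT_FDCWD, "/sys/fs/cgroup",
		MOVE_MOUNT_F_EMPTY_PATH);
	close(mntfd);
	close(fsfd);
	return 0;
}
```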
1839 | 1892 | |
---|
1840 | 1893 | static void apply_cgroup_root_flags(unsigned int root_flags) |
---|
.. | .. |
---|
1844 | 1897 | cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; |
---|
1845 | 1898 | else |
---|
1846 | 1899 | cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; |
---|
| 1900 | + |
---|
| 1901 | + if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) |
---|
| 1902 | + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; |
---|
| 1903 | + else |
---|
| 1904 | + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; |
---|
| 1905 | + |
---|
| 1906 | + if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) |
---|
| 1907 | + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; |
---|
| 1908 | + else |
---|
| 1909 | + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; |
---|
1847 | 1910 | } |
---|
1848 | 1911 | } |
---|
1849 | 1912 | |
---|
.. | .. |
---|
1851 | 1914 | { |
---|
1852 | 1915 | if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) |
---|
1853 | 1916 | seq_puts(seq, ",nsdelegate"); |
---|
| 1917 | + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) |
---|
| 1918 | + seq_puts(seq, ",memory_localevents"); |
---|
| 1919 | + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) |
---|
| 1920 | + seq_puts(seq, ",memory_recursiveprot"); |
---|
1854 | 1921 | return 0; |
---|
1855 | 1922 | } |
---|
1856 | 1923 | |
---|
1857 | | -static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) |
---|
| 1924 | +static int cgroup_reconfigure(struct fs_context *fc) |
---|
1858 | 1925 | { |
---|
1859 | | - unsigned int root_flags; |
---|
1860 | | - int ret; |
---|
| 1926 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
1861 | 1927 | |
---|
1862 | | - ret = parse_cgroup_root_flags(data, &root_flags); |
---|
1863 | | - if (ret) |
---|
1864 | | - return ret; |
---|
1865 | | - |
---|
1866 | | - apply_cgroup_root_flags(root_flags); |
---|
| 1928 | + apply_cgroup_root_flags(ctx->flags); |
---|
1867 | 1929 | return 0; |
---|
1868 | | -} |
---|
1869 | | - |
---|
1870 | | -/* |
---|
1871 | | - * To reduce the fork() overhead for systems that are not actually using |
---|
1872 | | - * their cgroups capability, we don't maintain the lists running through |
---|
1873 | | - * each css_set to its tasks until we see the list actually used - in other |
---|
1874 | | - * words after the first mount. |
---|
1875 | | - */ |
---|
1876 | | -static bool use_task_css_set_links __read_mostly; |
---|
1877 | | - |
---|
1878 | | -static void cgroup_enable_task_cg_lists(void) |
---|
1879 | | -{ |
---|
1880 | | - struct task_struct *p, *g; |
---|
1881 | | - |
---|
1882 | | - /* |
---|
1883 | | - * We need tasklist_lock because RCU is not safe against |
---|
1884 | | - * while_each_thread(). Besides, a forking task that has passed |
---|
1885 | | - * cgroup_post_fork() without seeing use_task_css_set_links = 1 |
---|
1886 | | - * is not guaranteed to have its child immediately visible in the |
---|
1887 | | - * tasklist if we walk through it with RCU. |
---|
1888 | | - */ |
---|
1889 | | - read_lock(&tasklist_lock); |
---|
1890 | | - spin_lock_irq(&css_set_lock); |
---|
1891 | | - |
---|
1892 | | - if (use_task_css_set_links) |
---|
1893 | | - goto out_unlock; |
---|
1894 | | - |
---|
1895 | | - use_task_css_set_links = true; |
---|
1896 | | - |
---|
1897 | | - do_each_thread(g, p) { |
---|
1898 | | - WARN_ON_ONCE(!list_empty(&p->cg_list) || |
---|
1899 | | - task_css_set(p) != &init_css_set); |
---|
1900 | | - |
---|
1901 | | - /* |
---|
1902 | | - * We should check if the process is exiting, otherwise |
---|
1903 | | - * it will race with cgroup_exit() in that the list |
---|
1904 | | - * entry won't be deleted though the process has exited. |
---|
1905 | | - * Do it while holding siglock so that we don't end up |
---|
1906 | | - * racing against cgroup_exit(). |
---|
1907 | | - * |
---|
1908 | | - * Interrupts were already disabled while acquiring |
---|
1909 | | - * the css_set_lock, so we do not need to disable it |
---|
1910 | | - * again when acquiring the sighand->siglock here. |
---|
1911 | | - */ |
---|
1912 | | - spin_lock(&p->sighand->siglock); |
---|
1913 | | - if (!(p->flags & PF_EXITING)) { |
---|
1914 | | - struct css_set *cset = task_css_set(p); |
---|
1915 | | - |
---|
1916 | | - if (!css_set_populated(cset)) |
---|
1917 | | - css_set_update_populated(cset, true); |
---|
1918 | | - list_add_tail(&p->cg_list, &cset->tasks); |
---|
1919 | | - get_css_set(cset); |
---|
1920 | | - cset->nr_tasks++; |
---|
1921 | | - } |
---|
1922 | | - spin_unlock(&p->sighand->siglock); |
---|
1923 | | - } while_each_thread(g, p); |
---|
1924 | | -out_unlock: |
---|
1925 | | - spin_unlock_irq(&css_set_lock); |
---|
1926 | | - read_unlock(&tasklist_lock); |
---|
1927 | 1930 | } |
---|
1928 | 1931 | |
---|
1929 | 1932 | static void init_cgroup_housekeeping(struct cgroup *cgrp) |
---|
.. | .. |
---|
1951 | 1954 | INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); |
---|
1952 | 1955 | } |
---|
1953 | 1956 | |
---|
1954 | | -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) |
---|
| 1957 | +void init_cgroup_root(struct cgroup_fs_context *ctx) |
---|
1955 | 1958 | { |
---|
| 1959 | + struct cgroup_root *root = ctx->root; |
---|
1956 | 1960 | struct cgroup *cgrp = &root->cgrp; |
---|
1957 | 1961 | |
---|
1958 | 1962 | INIT_LIST_HEAD(&root->root_list); |
---|
1959 | 1963 | atomic_set(&root->nr_cgrps, 1); |
---|
1960 | 1964 | cgrp->root = root; |
---|
1961 | 1965 | init_cgroup_housekeeping(cgrp); |
---|
1962 | | - idr_init(&root->cgroup_idr); |
---|
1963 | 1966 | |
---|
1964 | | - root->flags = opts->flags; |
---|
1965 | | - if (opts->release_agent) |
---|
1966 | | - strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); |
---|
1967 | | - if (opts->name) |
---|
1968 | | - strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); |
---|
1969 | | - if (opts->cpuset_clone_children) |
---|
| 1967 | + root->flags = ctx->flags; |
---|
| 1968 | + if (ctx->release_agent) |
---|
| 1969 | + strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); |
---|
| 1970 | + if (ctx->name) |
---|
| 1971 | + strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); |
---|
| 1972 | + if (ctx->cpuset_clone_children) |
---|
1970 | 1973 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); |
---|
1971 | 1974 | } |
---|
1972 | 1975 | |
---|
.. | .. |
---|
1979 | 1982 | int i, ret; |
---|
1980 | 1983 | |
---|
1981 | 1984 | lockdep_assert_held(&cgroup_mutex); |
---|
1982 | | - |
---|
1983 | | - ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); |
---|
1984 | | - if (ret < 0) |
---|
1985 | | - goto out; |
---|
1986 | | - root_cgrp->id = ret; |
---|
1987 | | - root_cgrp->ancestor_ids[0] = ret; |
---|
1988 | 1985 | |
---|
1989 | 1986 | ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, |
---|
1990 | 1987 | 0, GFP_KERNEL); |
---|
.. | .. |
---|
2011 | 2008 | |
---|
2012 | 2009 | root->kf_root = kernfs_create_root(kf_sops, |
---|
2013 | 2010 | KERNFS_ROOT_CREATE_DEACTIVATED | |
---|
2014 | | - KERNFS_ROOT_SUPPORT_EXPORTOP, |
---|
| 2011 | + KERNFS_ROOT_SUPPORT_EXPORTOP | |
---|
| 2012 | + KERNFS_ROOT_SUPPORT_USER_XATTR, |
---|
2015 | 2013 | root_cgrp); |
---|
2016 | 2014 | if (IS_ERR(root->kf_root)) { |
---|
2017 | 2015 | ret = PTR_ERR(root->kf_root); |
---|
2018 | 2016 | goto exit_root_id; |
---|
2019 | 2017 | } |
---|
2020 | 2018 | root_cgrp->kn = root->kf_root->kn; |
---|
| 2019 | + WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); |
---|
| 2020 | + root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); |
---|
2021 | 2021 | |
---|
2022 | 2022 | ret = css_populate_dir(&root_cgrp->self); |
---|
2023 | 2023 | if (ret) |
---|
.. | .. |
---|
2055 | 2055 | BUG_ON(!list_empty(&root_cgrp->self.children)); |
---|
2056 | 2056 | BUG_ON(atomic_read(&root->nr_cgrps) != 1); |
---|
2057 | 2057 | |
---|
2058 | | - kernfs_activate(root_cgrp->kn); |
---|
2059 | 2058 | ret = 0; |
---|
2060 | 2059 | goto out; |
---|
2061 | 2060 | |
---|
.. | .. |
---|
2071 | 2070 | return ret; |
---|
2072 | 2071 | } |
---|
2073 | 2072 | |
---|
2074 | | -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, |
---|
2075 | | - struct cgroup_root *root, unsigned long magic, |
---|
2076 | | - struct cgroup_namespace *ns) |
---|
| 2073 | +int cgroup_do_get_tree(struct fs_context *fc) |
---|
2077 | 2074 | { |
---|
2078 | | - struct dentry *dentry; |
---|
2079 | | - bool new_sb = false; |
---|
| 2075 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
| 2076 | + int ret; |
---|
2080 | 2077 | |
---|
2081 | | - dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); |
---|
| 2078 | + ctx->kfc.root = ctx->root->kf_root; |
---|
| 2079 | + if (fc->fs_type == &cgroup2_fs_type) |
---|
| 2080 | + ctx->kfc.magic = CGROUP2_SUPER_MAGIC; |
---|
| 2081 | + else |
---|
| 2082 | + ctx->kfc.magic = CGROUP_SUPER_MAGIC; |
---|
| 2083 | + ret = kernfs_get_tree(fc); |
---|
2082 | 2084 | |
---|
2083 | 2085 | /* |
---|
2084 | 2086 | * In non-init cgroup namespace, instead of root cgroup's dentry, |
---|
2085 | 2087 | * we return the dentry corresponding to the cgroupns->root_cgrp. |
---|
2086 | 2088 | */ |
---|
2087 | | - if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { |
---|
| 2089 | + if (!ret && ctx->ns != &init_cgroup_ns) { |
---|
2088 | 2090 | struct dentry *nsdentry; |
---|
2089 | | - struct super_block *sb = dentry->d_sb; |
---|
| 2091 | + struct super_block *sb = fc->root->d_sb; |
---|
2090 | 2092 | struct cgroup *cgrp; |
---|
2091 | 2093 | |
---|
2092 | 2094 | mutex_lock(&cgroup_mutex); |
---|
2093 | 2095 | spin_lock_irq(&css_set_lock); |
---|
2094 | 2096 | |
---|
2095 | | - cgrp = cset_cgroup_from_root(ns->root_cset, root); |
---|
| 2097 | + cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root); |
---|
2096 | 2098 | |
---|
2097 | 2099 | spin_unlock_irq(&css_set_lock); |
---|
2098 | 2100 | mutex_unlock(&cgroup_mutex); |
---|
2099 | 2101 | |
---|
2100 | 2102 | nsdentry = kernfs_node_dentry(cgrp->kn, sb); |
---|
2101 | | - dput(dentry); |
---|
2102 | | - if (IS_ERR(nsdentry)) |
---|
| 2103 | + dput(fc->root); |
---|
| 2104 | + if (IS_ERR(nsdentry)) { |
---|
2103 | 2105 | deactivate_locked_super(sb); |
---|
2104 | | - dentry = nsdentry; |
---|
| 2106 | + ret = PTR_ERR(nsdentry); |
---|
| 2107 | + nsdentry = NULL; |
---|
| 2108 | + } |
---|
| 2109 | + fc->root = nsdentry; |
---|
2105 | 2110 | } |
---|
2106 | 2111 | |
---|
2107 | | - if (!new_sb) |
---|
2108 | | - cgroup_put(&root->cgrp); |
---|
| 2112 | + if (!ctx->kfc.new_sb_created) |
---|
| 2113 | + cgroup_put(&ctx->root->cgrp); |
---|
2109 | 2114 | |
---|
2110 | | - return dentry; |
---|
| 2115 | + return ret; |
---|
2111 | 2116 | } |
---|
2112 | 2117 | |
---|
2113 | | -static struct dentry *cgroup_mount(struct file_system_type *fs_type, |
---|
2114 | | - int flags, const char *unused_dev_name, |
---|
2115 | | - void *data) |
---|
| 2118 | +/* |
---|
| 2119 | + * Destroy a cgroup filesystem context. |
---|
| 2120 | + */ |
---|
| 2121 | +static void cgroup_fs_context_free(struct fs_context *fc) |
---|
2116 | 2122 | { |
---|
2117 | | - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; |
---|
2118 | | - struct dentry *dentry; |
---|
| 2123 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
| 2124 | + |
---|
| 2125 | + kfree(ctx->name); |
---|
| 2126 | + kfree(ctx->release_agent); |
---|
| 2127 | + put_cgroup_ns(ctx->ns); |
---|
| 2128 | + kernfs_free_fs_context(fc); |
---|
| 2129 | + kfree(ctx); |
---|
| 2130 | +} |
---|
| 2131 | + |
---|
| 2132 | +static int cgroup_get_tree(struct fs_context *fc) |
---|
| 2133 | +{ |
---|
| 2134 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
2119 | 2135 | int ret; |
---|
2120 | 2136 | |
---|
2121 | | - get_cgroup_ns(ns); |
---|
| 2137 | + cgrp_dfl_visible = true; |
---|
| 2138 | + cgroup_get_live(&cgrp_dfl_root.cgrp); |
---|
| 2139 | + ctx->root = &cgrp_dfl_root; |
---|
2122 | 2140 | |
---|
2123 | | - /* Check if the caller has permission to mount. */ |
---|
2124 | | - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { |
---|
2125 | | - put_cgroup_ns(ns); |
---|
2126 | | - return ERR_PTR(-EPERM); |
---|
2127 | | - } |
---|
| 2141 | + ret = cgroup_do_get_tree(fc); |
---|
| 2142 | + if (!ret) |
---|
| 2143 | + apply_cgroup_root_flags(ctx->flags); |
---|
| 2144 | + return ret; |
---|
| 2145 | +} |
---|
2128 | 2146 | |
---|
2129 | | - /* |
---|
2130 | | - * The first time anyone tries to mount a cgroup, enable the list |
---|
2131 | | - * linking each css_set to its tasks and fix up all existing tasks. |
---|
2132 | | - */ |
---|
2133 | | - if (!use_task_css_set_links) |
---|
2134 | | - cgroup_enable_task_cg_lists(); |
---|
| 2147 | +static const struct fs_context_operations cgroup_fs_context_ops = { |
---|
| 2148 | + .free = cgroup_fs_context_free, |
---|
| 2149 | + .parse_param = cgroup2_parse_param, |
---|
| 2150 | + .get_tree = cgroup_get_tree, |
---|
| 2151 | + .reconfigure = cgroup_reconfigure, |
---|
| 2152 | +}; |
---|
2135 | 2153 | |
---|
2136 | | - if (fs_type == &cgroup2_fs_type) { |
---|
2137 | | - unsigned int root_flags; |
---|
| 2154 | +static const struct fs_context_operations cgroup1_fs_context_ops = { |
---|
| 2155 | + .free = cgroup_fs_context_free, |
---|
| 2156 | + .parse_param = cgroup1_parse_param, |
---|
| 2157 | + .get_tree = cgroup1_get_tree, |
---|
| 2158 | + .reconfigure = cgroup1_reconfigure, |
---|
| 2159 | +}; |
---|
2138 | 2160 | |
---|
2139 | | - ret = parse_cgroup_root_flags(data, &root_flags); |
---|
2140 | | - if (ret) { |
---|
2141 | | - put_cgroup_ns(ns); |
---|
2142 | | - return ERR_PTR(ret); |
---|
2143 | | - } |
---|
| 2161 | +/* |
---|
| 2162 | + * Initialise the cgroup filesystem creation/reconfiguration context. Notably, |
---|
| 2163 | + * we select the namespace we're going to use. |
---|
| 2164 | + */ |
---|
| 2165 | +static int cgroup_init_fs_context(struct fs_context *fc) |
---|
| 2166 | +{ |
---|
| 2167 | + struct cgroup_fs_context *ctx; |
---|
2144 | 2168 | |
---|
2145 | | - cgrp_dfl_visible = true; |
---|
2146 | | - cgroup_get_live(&cgrp_dfl_root.cgrp); |
---|
| 2169 | + ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL); |
---|
| 2170 | + if (!ctx) |
---|
| 2171 | + return -ENOMEM; |
---|
2147 | 2172 | |
---|
2148 | | - dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, |
---|
2149 | | - CGROUP2_SUPER_MAGIC, ns); |
---|
2150 | | - if (!IS_ERR(dentry)) |
---|
2151 | | - apply_cgroup_root_flags(root_flags); |
---|
2152 | | - } else { |
---|
2153 | | - dentry = cgroup1_mount(&cgroup_fs_type, flags, data, |
---|
2154 | | - CGROUP_SUPER_MAGIC, ns); |
---|
2155 | | - } |
---|
2156 | | - |
---|
2157 | | - put_cgroup_ns(ns); |
---|
2158 | | - return dentry; |
---|
| 2173 | + ctx->ns = current->nsproxy->cgroup_ns; |
---|
| 2174 | + get_cgroup_ns(ctx->ns); |
---|
| 2175 | + fc->fs_private = &ctx->kfc; |
---|
| 2176 | + if (fc->fs_type == &cgroup2_fs_type) |
---|
| 2177 | + fc->ops = &cgroup_fs_context_ops; |
---|
| 2178 | + else |
---|
| 2179 | + fc->ops = &cgroup1_fs_context_ops; |
---|
| 2180 | + put_user_ns(fc->user_ns); |
---|
| 2181 | + fc->user_ns = get_user_ns(ctx->ns->user_ns); |
---|
| 2182 | + fc->global = true; |
---|
| 2183 | + return 0; |
---|
2159 | 2184 | } |
---|
2160 | 2185 | |
---|
2161 | 2186 | static void cgroup_kill_sb(struct super_block *sb) |
---|
.. | .. |
---|
2171 | 2196 | * And don't kill the default root. |
---|
2172 | 2197 | */ |
---|
2173 | 2198 | if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && |
---|
2174 | | - !percpu_ref_is_dying(&root->cgrp.self.refcnt)) |
---|
| 2199 | + !percpu_ref_is_dying(&root->cgrp.self.refcnt)) { |
---|
| 2200 | + cgroup_bpf_offline(&root->cgrp); |
---|
2175 | 2201 | percpu_ref_kill(&root->cgrp.self.refcnt); |
---|
| 2202 | + } |
---|
2176 | 2203 | cgroup_put(&root->cgrp); |
---|
2177 | 2204 | kernfs_kill_sb(sb); |
---|
2178 | 2205 | } |
---|
2179 | 2206 | |
---|
2180 | 2207 | struct file_system_type cgroup_fs_type = { |
---|
2181 | | - .name = "cgroup", |
---|
2182 | | - .mount = cgroup_mount, |
---|
2183 | | - .kill_sb = cgroup_kill_sb, |
---|
2184 | | - .fs_flags = FS_USERNS_MOUNT, |
---|
| 2208 | + .name = "cgroup", |
---|
| 2209 | + .init_fs_context = cgroup_init_fs_context, |
---|
| 2210 | + .parameters = cgroup1_fs_parameters, |
---|
| 2211 | + .kill_sb = cgroup_kill_sb, |
---|
| 2212 | + .fs_flags = FS_USERNS_MOUNT, |
---|
2185 | 2213 | }; |
---|
2186 | 2214 | |
---|
2187 | 2215 | static struct file_system_type cgroup2_fs_type = { |
---|
2188 | | - .name = "cgroup2", |
---|
2189 | | - .mount = cgroup_mount, |
---|
2190 | | - .kill_sb = cgroup_kill_sb, |
---|
2191 | | - .fs_flags = FS_USERNS_MOUNT, |
---|
| 2216 | + .name = "cgroup2", |
---|
| 2217 | + .init_fs_context = cgroup_init_fs_context, |
---|
| 2218 | + .parameters = cgroup2_fs_parameters, |
---|
| 2219 | + .kill_sb = cgroup_kill_sb, |
---|
| 2220 | + .fs_flags = FS_USERNS_MOUNT, |
---|
2192 | 2221 | }; |
---|
| 2222 | + |
---|
| 2223 | +#ifdef CONFIG_CPUSETS |
---|
| 2224 | +static const struct fs_context_operations cpuset_fs_context_ops = { |
---|
| 2225 | + .get_tree = cgroup1_get_tree, |
---|
| 2226 | + .free = cgroup_fs_context_free, |
---|
| 2227 | +}; |
---|
| 2228 | + |
---|
| 2229 | +/* |
---|
| 2230 | + * This is ugly, but preserves the userspace API for existing cpuset |
---|
| 2231 | + * users. If someone tries to mount the "cpuset" filesystem, we |
---|
| 2232 | + * silently switch it to mount "cgroup" instead. |
---|
| 2233 | + */ |
---|
| 2234 | +static int cpuset_init_fs_context(struct fs_context *fc) |
---|
| 2235 | +{ |
---|
| 2236 | + char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER); |
---|
| 2237 | + struct cgroup_fs_context *ctx; |
---|
| 2238 | + int err; |
---|
| 2239 | + |
---|
| 2240 | + err = cgroup_init_fs_context(fc); |
---|
| 2241 | + if (err) { |
---|
| 2242 | + kfree(agent); |
---|
| 2243 | + return err; |
---|
| 2244 | + } |
---|
| 2245 | + |
---|
| 2246 | + fc->ops = &cpuset_fs_context_ops; |
---|
| 2247 | + |
---|
| 2248 | + ctx = cgroup_fc2context(fc); |
---|
| 2249 | + ctx->subsys_mask = 1 << cpuset_cgrp_id; |
---|
| 2250 | + ctx->flags |= CGRP_ROOT_NOPREFIX; |
---|
| 2251 | + ctx->release_agent = agent; |
---|
| 2252 | + |
---|
| 2253 | + get_filesystem(&cgroup_fs_type); |
---|
| 2254 | + put_filesystem(fc->fs_type); |
---|
| 2255 | + fc->fs_type = &cgroup_fs_type; |
---|
| 2256 | + |
---|
| 2257 | + return 0; |
---|
| 2258 | +} |
---|
| 2259 | + |
---|
| 2260 | +static struct file_system_type cpuset_fs_type = { |
---|
| 2261 | + .name = "cpuset", |
---|
| 2262 | + .init_fs_context = cpuset_init_fs_context, |
---|
| 2263 | + .fs_flags = FS_USERNS_MOUNT, |
---|
| 2264 | +}; |
---|
| 2265 | +#endif |
---|
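Seen from userspace, `cpuset_init_fs_context()` means a legacy `cpuset` mount transparently becomes a cgroup v1 mount restricted to the cpuset controller, with `noprefix` and the cpuset release agent preconfigured. A minimal sketch (the target path is an example):

```c
#include <sys/mount.h>

/* equivalent to: mount -t cpuset none /dev/cpuset */
static int mount_legacy_cpuset(void)
{
	return mount("none", "/dev/cpuset", "cpuset", 0, NULL);
}
```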
2193 | 2266 | |
---|
2194 | 2267 | int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, |
---|
2195 | 2268 | struct cgroup_namespace *ns) |
---|
.. | .. |
---|
2256 | 2329 | EXPORT_SYMBOL_GPL(task_cgroup_path); |
---|
2257 | 2330 | |
---|
2258 | 2331 | /** |
---|
| 2332 | + * cgroup_attach_lock - Lock for ->attach() |
---|
| 2333 | + * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem |
---|
| 2334 | + * |
---|
| 2335 | + * cgroup migration sometimes needs to stabilize threadgroups against forks and |
---|
| 2336 | + * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() |
---|
| 2337 | + * implementations (e.g. cpuset) also need to disable CPU hotplug. |
---|
| 2338 | + * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can |
---|
| 2339 | + * lead to deadlocks. |
---|
| 2340 | + * |
---|
| 2341 | + * Bringing up a CPU may involve creating and destroying tasks which requires |
---|
| 2342 | + * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside |
---|
| 2343 | + * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while |
---|
| 2344 | + * write-locking threadgroup_rwsem, the locking order is reversed and we end up |
---|
| 2345 | + * waiting for an on-going CPU hotplug operation which in turn is waiting for |
---|
| 2346 | + * the threadgroup_rwsem to be released to create new tasks. For more details: |
---|
| 2347 | + * |
---|
| 2348 | + * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu |
---|
| 2349 | + * |
---|
| 2350 | + * Resolve the situation by always acquiring cpus_read_lock() before optionally |
---|
| 2351 | + * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that |
---|
| 2352 | + * CPU hotplug is disabled on entry. |
---|
| 2353 | + */ |
---|
| 2354 | +static void cgroup_attach_lock(bool lock_threadgroup) |
---|
| 2355 | +{ |
---|
| 2356 | + cpus_read_lock(); |
---|
| 2357 | + if (lock_threadgroup) |
---|
| 2358 | + percpu_down_write(&cgroup_threadgroup_rwsem); |
---|
| 2359 | +} |
---|
| 2360 | + |
---|
| 2361 | +/** |
---|
| 2362 | + * cgroup_attach_unlock - Undo cgroup_attach_lock() |
---|
| 2363 | + * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem |
---|
| 2364 | + */ |
---|
| 2365 | +static void cgroup_attach_unlock(bool lock_threadgroup) |
---|
| 2366 | +{ |
---|
| 2367 | + if (lock_threadgroup) |
---|
| 2368 | + percpu_up_write(&cgroup_threadgroup_rwsem); |
---|
| 2369 | + cpus_read_unlock(); |
---|
| 2370 | +} |
---|
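The ABBA scenario the comment block describes, spelled out as an interleaving; this is a sketch of the failure mode being avoided, not code from the kernel:

```c
/*
 * CPU hotplug (cpu_up)              cgroup migration (old ordering)
 * --------------------              -------------------------------
 * cpus_write_lock()
 *                                   percpu_down_write(&cgroup_threadgroup_rwsem)
 * create hotplug kthread:
 *   percpu_down_read(&...rwsem)     ->attach() calls cpus_read_lock()
 *     blocks on the writer   <--->    blocks on the hotplug writer
 *
 * Neither side can make progress. cgroup_attach_lock() breaks the cycle
 * by always taking cpus_read_lock() first, outside the threadgroup rwsem.
 */
```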
| 2371 | + |
---|
| 2372 | +/** |
---|
2259 | 2373 | * cgroup_migrate_add_task - add a migration target task to a migration context |
---|
2260 | 2374 | * @task: target task |
---|
2261 | 2375 | * @mgctx: target migration context |
---|
.. | .. |
---|
2276 | 2390 | if (task->flags & PF_EXITING) |
---|
2277 | 2391 | return; |
---|
2278 | 2392 | |
---|
2279 | | - /* leave @task alone if post_fork() hasn't linked it yet */ |
---|
2280 | | - if (list_empty(&task->cg_list)) |
---|
2281 | | - return; |
---|
| 2393 | + /* cgroup_threadgroup_rwsem protects racing against forks */ |
---|
| 2394 | + WARN_ON_ONCE(list_empty(&task->cg_list)); |
---|
2282 | 2395 | |
---|
2283 | 2396 | cset = task_css_set(task); |
---|
2284 | 2397 | if (!cset->mg_src_cgrp) |
---|
.. | .. |
---|
2310 | 2423 | |
---|
2311 | 2424 | return cgroup_taskset_next(tset, dst_cssp); |
---|
2312 | 2425 | } |
---|
| 2426 | +EXPORT_SYMBOL_GPL(cgroup_taskset_first); |
---|
2313 | 2427 | |
---|
2314 | 2428 | /** |
---|
2315 | 2429 | * cgroup_taskset_next - iterate to the next task in taskset |
---|
.. | .. |
---|
2356 | 2470 | |
---|
2357 | 2471 | return NULL; |
---|
2358 | 2472 | } |
---|
| 2473 | +EXPORT_SYMBOL_GPL(cgroup_taskset_next); |
---|
2359 | 2474 | |
---|
2360 | 2475 | /** |
---|
2361 | 2476 | * cgroup_taskset_migrate - migrate a taskset |
---|
.. | .. |
---|
2426 | 2541 | do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { |
---|
2427 | 2542 | if (ss->attach) { |
---|
2428 | 2543 | tset->ssid = ssid; |
---|
| 2544 | + trace_android_vh_cgroup_attach(ss, tset); |
---|
2429 | 2545 | ss->attach(tset); |
---|
2430 | 2546 | } |
---|
2431 | 2547 | } while_each_subsys_mask(); |
---|
.. | .. |
---|
2510 | 2626 | */ |
---|
2511 | 2627 | void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) |
---|
2512 | 2628 | { |
---|
2513 | | - LIST_HEAD(preloaded); |
---|
2514 | | - struct css_set *cset, *tmp_cset; |
---|
| 2629 | + struct ext_css_set *cset, *tmp_cset; |
---|
2515 | 2630 | |
---|
2516 | 2631 | lockdep_assert_held(&cgroup_mutex); |
---|
2517 | 2632 | |
---|
2518 | 2633 | spin_lock_irq(&css_set_lock); |
---|
2519 | 2634 | |
---|
2520 | | - list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded); |
---|
2521 | | - list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded); |
---|
| 2635 | + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, |
---|
| 2636 | + mg_src_preload_node) { |
---|
| 2637 | + cset->cset.mg_src_cgrp = NULL; |
---|
| 2638 | + cset->cset.mg_dst_cgrp = NULL; |
---|
| 2639 | + cset->cset.mg_dst_cset = NULL; |
---|
| 2640 | + list_del_init(&cset->mg_src_preload_node); |
---|
| 2641 | + put_css_set_locked(&cset->cset); |
---|
| 2642 | + } |
---|
2522 | 2643 | |
---|
2523 | | - list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) { |
---|
2524 | | - cset->mg_src_cgrp = NULL; |
---|
2525 | | - cset->mg_dst_cgrp = NULL; |
---|
2526 | | - cset->mg_dst_cset = NULL; |
---|
2527 | | - list_del_init(&cset->mg_preload_node); |
---|
2528 | | - put_css_set_locked(cset); |
---|
| 2644 | + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, |
---|
| 2645 | + mg_dst_preload_node) { |
---|
| 2646 | + cset->cset.mg_src_cgrp = NULL; |
---|
| 2647 | + cset->cset.mg_dst_cgrp = NULL; |
---|
| 2648 | + cset->cset.mg_dst_cset = NULL; |
---|
| 2649 | + list_del_init(&cset->mg_dst_preload_node); |
---|
| 2650 | + put_css_set_locked(&cset->cset); |
---|
2529 | 2651 | } |
---|
2530 | 2652 | |
---|
2531 | 2653 | spin_unlock_irq(&css_set_lock); |
---|
.. | .. |
---|
2552 | 2674 | struct cgroup_mgctx *mgctx) |
---|
2553 | 2675 | { |
---|
2554 | 2676 | struct cgroup *src_cgrp; |
---|
| 2677 | + struct ext_css_set *ext_src_cset; |
---|
2555 | 2678 | |
---|
2556 | 2679 | lockdep_assert_held(&cgroup_mutex); |
---|
2557 | 2680 | lockdep_assert_held(&css_set_lock); |
---|
.. | .. |
---|
2565 | 2688 | return; |
---|
2566 | 2689 | |
---|
2567 | 2690 | src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); |
---|
| 2691 | + ext_src_cset = container_of(src_cset, struct ext_css_set, cset); |
---|
2568 | 2692 | |
---|
2569 | | - if (!list_empty(&src_cset->mg_preload_node)) |
---|
| 2693 | + if (!list_empty(&ext_src_cset->mg_src_preload_node)) |
---|
2570 | 2694 | return; |
---|
2571 | 2695 | |
---|
2572 | 2696 | WARN_ON(src_cset->mg_src_cgrp); |
---|
.. | .. |
---|
2577 | 2701 | src_cset->mg_src_cgrp = src_cgrp; |
---|
2578 | 2702 | src_cset->mg_dst_cgrp = dst_cgrp; |
---|
2579 | 2703 | get_css_set(src_cset); |
---|
2580 | | - list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets); |
---|
| 2704 | + list_add_tail(&ext_src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); |
---|
2581 | 2705 | } |
---|
2582 | 2706 | |
---|
2583 | 2707 | /** |
---|
.. | .. |
---|
2596 | 2720 | */ |
---|
2597 | 2721 | int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) |
---|
2598 | 2722 | { |
---|
2599 | | - struct css_set *src_cset, *tmp_cset; |
---|
| 2723 | + struct ext_css_set *ext_src_set, *tmp_cset; |
---|
2600 | 2724 | |
---|
2601 | 2725 | lockdep_assert_held(&cgroup_mutex); |
---|
2602 | 2726 | |
---|
2603 | 2727 | /* look up the dst cset for each src cset and link it to src */ |
---|
2604 | | - list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, |
---|
2605 | | - mg_preload_node) { |
---|
| 2728 | + list_for_each_entry_safe(ext_src_set, tmp_cset, &mgctx->preloaded_src_csets, |
---|
| 2729 | + mg_src_preload_node) { |
---|
| 2730 | + struct css_set *src_cset = &ext_src_set->cset; |
---|
2606 | 2731 | struct css_set *dst_cset; |
---|
| 2732 | + struct ext_css_set *ext_dst_cset; |
---|
2607 | 2733 | struct cgroup_subsys *ss; |
---|
2608 | 2734 | int ssid; |
---|
2609 | 2735 | |
---|
2610 | 2736 | dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); |
---|
2611 | 2737 | if (!dst_cset) |
---|
2612 | 2738 | return -ENOMEM; |
---|
| 2739 | + ext_dst_cset = container_of(dst_cset, struct ext_css_set, cset); |
---|
2613 | 2740 | |
---|
2614 | 2741 | WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); |
---|
2615 | 2742 | |
---|
.. | .. |
---|
2621 | 2748 | if (src_cset == dst_cset) { |
---|
2622 | 2749 | src_cset->mg_src_cgrp = NULL; |
---|
2623 | 2750 | src_cset->mg_dst_cgrp = NULL; |
---|
2624 | | - list_del_init(&src_cset->mg_preload_node); |
---|
| 2751 | + list_del_init(&ext_src_set->mg_src_preload_node); |
---|
2625 | 2752 | put_css_set(src_cset); |
---|
2626 | 2753 | put_css_set(dst_cset); |
---|
2627 | 2754 | continue; |
---|
.. | .. |
---|
2629 | 2756 | |
---|
2630 | 2757 | src_cset->mg_dst_cset = dst_cset; |
---|
2631 | 2758 | |
---|
2632 | | - if (list_empty(&dst_cset->mg_preload_node)) |
---|
2633 | | - list_add_tail(&dst_cset->mg_preload_node, |
---|
| 2759 | + if (list_empty(&ext_dst_cset->mg_dst_preload_node)) |
---|
| 2760 | + list_add_tail(&ext_dst_cset->mg_dst_preload_node, |
---|
2634 | 2761 | &mgctx->preloaded_dst_csets); |
---|
2635 | 2762 | else |
---|
2636 | 2763 | put_css_set(dst_cset); |
---|
.. | .. |
---|
2698 | 2825 | { |
---|
2699 | 2826 | DEFINE_CGROUP_MGCTX(mgctx); |
---|
2700 | 2827 | struct task_struct *task; |
---|
2701 | | - int ret; |
---|
2702 | | - |
---|
2703 | | - ret = cgroup_migrate_vet_dst(dst_cgrp); |
---|
2704 | | - if (ret) |
---|
2705 | | - return ret; |
---|
| 2828 | + int ret = 0; |
---|
2706 | 2829 | |
---|
2707 | 2830 | /* look up all src csets */ |
---|
2708 | 2831 | spin_lock_irq(&css_set_lock); |
---|
.. | .. |
---|
2729 | 2852 | return ret; |
---|
2730 | 2853 | } |
---|
2731 | 2854 | |
---|
2732 | | -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) |
---|
2733 | | - __acquires(&cgroup_threadgroup_rwsem) |
---|
| 2855 | +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, |
---|
| 2856 | + bool *threadgroup_locked, |
---|
| 2857 | + struct cgroup *dst_cgrp) |
---|
2734 | 2858 | { |
---|
2735 | 2859 | struct task_struct *tsk; |
---|
2736 | 2860 | pid_t pid; |
---|
| 2861 | + bool force_migration = false; |
---|
2737 | 2862 | |
---|
2738 | 2863 | if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) |
---|
2739 | 2864 | return ERR_PTR(-EINVAL); |
---|
2740 | 2865 | |
---|
2741 | | - percpu_down_write(&cgroup_threadgroup_rwsem); |
---|
| 2866 | + /* |
---|
| 2867 | + * If we migrate a single thread, we don't care about threadgroup |
---|
| 2868 | + * stability. If the thread is `current`, it won't exit(2) under our |
---|
| 2869 | + * hands or change PID through exec(2). We exclude |
---|
| 2870 | + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write |
---|
| 2871 | + * callers by cgroup_mutex. |
---|
| 2872 | + * Therefore, we can skip the global lock. |
---|
| 2873 | + */ |
---|
| 2874 | + lockdep_assert_held(&cgroup_mutex); |
---|
| 2875 | + *threadgroup_locked = pid || threadgroup; |
---|
| 2876 | + cgroup_attach_lock(*threadgroup_locked); |
---|
2742 | 2877 | |
---|
2743 | 2878 | rcu_read_lock(); |
---|
2744 | 2879 | if (pid) { |
---|
.. | .. |
---|
2754 | 2889 | if (threadgroup) |
---|
2755 | 2890 | tsk = tsk->group_leader; |
---|
2756 | 2891 | |
---|
| 2892 | + if (tsk->flags & PF_KTHREAD) |
---|
| 2893 | + trace_android_rvh_cgroup_force_kthread_migration(tsk, dst_cgrp, &force_migration); |
---|
| 2894 | + |
---|
2757 | 2895 | /* |
---|
2758 | 2896 | * kthreads may acquire PF_NO_SETAFFINITY during initialization. |
---|
2759 | 2897 | * If userland migrates such a kthread to a non-root cgroup, it can |
---|
2760 | 2898 | * become trapped in a cpuset, or RT kthread may be born in a |
---|
2761 | 2899 | * cgroup with no rt_runtime allocated. Just say no. |
---|
2762 | 2900 | */ |
---|
2763 | | - if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { |
---|
| 2901 | + if (!force_migration && (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY))) { |
---|
2764 | 2902 | tsk = ERR_PTR(-EINVAL); |
---|
2765 | 2903 | goto out_unlock_threadgroup; |
---|
2766 | 2904 | } |
---|
.. | .. |
---|
2769 | 2907 | goto out_unlock_rcu; |
---|
2770 | 2908 | |
---|
2771 | 2909 | out_unlock_threadgroup: |
---|
2772 | | - percpu_up_write(&cgroup_threadgroup_rwsem); |
---|
| 2910 | + cgroup_attach_unlock(*threadgroup_locked); |
---|
| 2911 | + *threadgroup_locked = false; |
---|
2773 | 2912 | out_unlock_rcu: |
---|
2774 | 2913 | rcu_read_unlock(); |
---|
2775 | 2914 | return tsk; |
---|
2776 | 2915 | } |
---|
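The fast path this change carves out: when a thread writes "0" to `cgroup.threads` to migrate itself, `pid` is 0 and `threadgroup` is false, so `*threadgroup_locked` stays false and the global percpu rwsem is never write-locked. A hedged userspace sketch (`move_self` and the cgroup path are illustrative):

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int move_self(const char *cgrp_dir)
{
	char path[256];
	int fd, ok;

	snprintf(path, sizeof(path), "%s/cgroup.threads", cgrp_dir);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ok = write(fd, "0", 1) == 1;	/* "0" means the calling thread */
	close(fd);
	return ok ? 0 : -1;
}
```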
2777 | 2916 | |
---|
2778 | | -void cgroup_procs_write_finish(struct task_struct *task) |
---|
2779 | | - __releases(&cgroup_threadgroup_rwsem) |
---|
| 2917 | +void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) |
---|
2780 | 2918 | { |
---|
2781 | 2919 | struct cgroup_subsys *ss; |
---|
2782 | 2920 | int ssid; |
---|
.. | .. |
---|
2784 | 2922 | /* release reference from cgroup_procs_write_start() */ |
---|
2785 | 2923 | put_task_struct(task); |
---|
2786 | 2924 | |
---|
2787 | | - percpu_up_write(&cgroup_threadgroup_rwsem); |
---|
| 2925 | + cgroup_attach_unlock(threadgroup_locked); |
---|
| 2926 | + |
---|
2788 | 2927 | for_each_subsys(ss, ssid) |
---|
2789 | 2928 | if (ss->post_attach) |
---|
2790 | 2929 | ss->post_attach(); |
---|
.. | .. |
---|
2799 | 2938 | do_each_subsys_mask(ss, ssid, ss_mask) { |
---|
2800 | 2939 | if (printed) |
---|
2801 | 2940 | seq_putc(seq, ' '); |
---|
2802 | | - seq_printf(seq, "%s", ss->name); |
---|
| 2941 | + seq_puts(seq, ss->name); |
---|
2803 | 2942 | printed = true; |
---|
2804 | 2943 | } while_each_subsys_mask(); |
---|
2805 | 2944 | if (printed) |
---|
.. | .. |
---|
2838 | 2977 | DEFINE_CGROUP_MGCTX(mgctx); |
---|
2839 | 2978 | struct cgroup_subsys_state *d_css; |
---|
2840 | 2979 | struct cgroup *dsct; |
---|
2841 | | - struct css_set *src_cset; |
---|
| 2980 | + struct ext_css_set *ext_src_set; |
---|
| 2981 | + bool has_tasks; |
---|
2842 | 2982 | int ret; |
---|
2843 | 2983 | |
---|
2844 | 2984 | lockdep_assert_held(&cgroup_mutex); |
---|
2845 | | - |
---|
2846 | | - percpu_down_write(&cgroup_threadgroup_rwsem); |
---|
2847 | 2985 | |
---|
2848 | 2986 | /* look up all csses currently attached to @cgrp's subtree */ |
---|
2849 | 2987 | spin_lock_irq(&css_set_lock); |
---|
.. | .. |
---|
2855 | 2993 | } |
---|
2856 | 2994 | spin_unlock_irq(&css_set_lock); |
---|
2857 | 2995 | |
---|
| 2996 | + /* |
---|
| 2997 | + * We need to write-lock threadgroup_rwsem while migrating tasks. |
---|
| 2998 | + * However, if there are no source csets for @cgrp, changing its |
---|
| 2999 | + * controllers isn't gonna produce any task migrations and the |
---|
| 3000 | + * write-locking can be skipped safely. |
---|
| 3001 | + */ |
---|
| 3002 | + has_tasks = !list_empty(&mgctx.preloaded_src_csets); |
---|
| 3003 | + cgroup_attach_lock(has_tasks); |
---|
| 3004 | + |
---|
2858 | 3005 | /* NULL dst indicates self on default hierarchy */ |
---|
2859 | 3006 | ret = cgroup_migrate_prepare_dst(&mgctx); |
---|
2860 | 3007 | if (ret) |
---|
2861 | 3008 | goto out_finish; |
---|
2862 | 3009 | |
---|
2863 | 3010 | spin_lock_irq(&css_set_lock); |
---|
2864 | | - list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { |
---|
| 3011 | + list_for_each_entry(ext_src_set, &mgctx.preloaded_src_csets, |
---|
| 3012 | + mg_src_preload_node) { |
---|
2865 | 3013 | struct task_struct *task, *ntask; |
---|
2866 | 3014 | |
---|
2867 | 3015 | /* all tasks in src_csets need to be migrated */ |
---|
2868 | | - list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) |
---|
| 3016 | + list_for_each_entry_safe(task, ntask, &ext_src_set->cset.tasks, cg_list) |
---|
2869 | 3017 | cgroup_migrate_add_task(task, &mgctx); |
---|
2870 | 3018 | } |
---|
2871 | 3019 | spin_unlock_irq(&css_set_lock); |
---|
.. | .. |
---|
2873 | 3021 | ret = cgroup_migrate_execute(&mgctx); |
---|
2874 | 3022 | out_finish: |
---|
2875 | 3023 | cgroup_migrate_finish(&mgctx); |
---|
2876 | | - percpu_up_write(&cgroup_threadgroup_rwsem); |
---|
| 3024 | + cgroup_attach_unlock(has_tasks); |
---|
2877 | 3025 | return ret; |
---|
2878 | 3026 | } |
---|
2879 | 3027 | |
---|
.. | .. |
---|
3106 | 3254 | return ret; |
---|
3107 | 3255 | |
---|
3108 | 3256 | /* |
---|
3109 | | - * At this point, cgroup_e_css() results reflect the new csses |
---|
| 3257 | + * At this point, cgroup_e_css_by_mask() results reflect the new csses |
---|
3110 | 3258 | * making the following cgroup_update_dfl_csses() properly update |
---|
3111 | 3259 | * css associations of all tasks in the subtree. |
---|
3112 | 3260 | */ |
---|
.. | .. |
---|
3506 | 3654 | #ifdef CONFIG_PSI |
---|
3507 | 3655 | static int cgroup_io_pressure_show(struct seq_file *seq, void *v) |
---|
3508 | 3656 | { |
---|
3509 | | - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); |
---|
| 3657 | + struct cgroup *cgrp = seq_css(seq)->cgroup; |
---|
| 3658 | + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; |
---|
| 3659 | + |
---|
| 3660 | + return psi_show(seq, psi, PSI_IO); |
---|
3510 | 3661 | } |
---|
3511 | 3662 | static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) |
---|
3512 | 3663 | { |
---|
3513 | | - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); |
---|
| 3664 | + struct cgroup *cgrp = seq_css(seq)->cgroup; |
---|
| 3665 | + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; |
---|
| 3666 | + |
---|
| 3667 | + return psi_show(seq, psi, PSI_MEM); |
---|
3514 | 3668 | } |
---|
3515 | 3669 | static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) |
---|
3516 | 3670 | { |
---|
3517 | | - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); |
---|
| 3671 | + struct cgroup *cgrp = seq_css(seq)->cgroup; |
---|
| 3672 | + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; |
---|
| 3673 | + |
---|
| 3674 | + return psi_show(seq, psi, PSI_CPU); |
---|
3518 | 3675 | } |
---|
3519 | 3676 | |
---|
3520 | 3677 | static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, |
---|
3521 | 3678 | size_t nbytes, enum psi_res res) |
---|
3522 | 3679 | { |
---|
| 3680 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
3523 | 3681 | struct psi_trigger *new; |
---|
3524 | 3682 | struct cgroup *cgrp; |
---|
| 3683 | + struct psi_group *psi; |
---|
3525 | 3684 | |
---|
3526 | 3685 | cgrp = cgroup_kn_lock_live(of->kn, false); |
---|
3527 | 3686 | if (!cgrp) |
---|
.. | .. |
---|
3530 | 3689 | cgroup_get(cgrp); |
---|
3531 | 3690 | cgroup_kn_unlock(of->kn); |
---|
3532 | 3691 | |
---|
3533 | | - new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); |
---|
| 3692 | + /* Allow only one trigger per file descriptor */ |
---|
| 3693 | + if (ctx->psi.trigger) { |
---|
| 3694 | + cgroup_put(cgrp); |
---|
| 3695 | + return -EBUSY; |
---|
| 3696 | + } |
---|
| 3697 | + |
---|
| 3698 | + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; |
---|
| 3699 | + new = psi_trigger_create(psi, buf, nbytes, res); |
---|
3534 | 3700 | if (IS_ERR(new)) { |
---|
3535 | 3701 | cgroup_put(cgrp); |
---|
3536 | 3702 | return PTR_ERR(new); |
---|
3537 | 3703 | } |
---|
3538 | 3704 | |
---|
3539 | | - psi_trigger_replace(&of->priv, new); |
---|
3540 | | - |
---|
| 3705 | + smp_store_release(&ctx->psi.trigger, new); |
---|
3541 | 3706 | cgroup_put(cgrp); |
---|
3542 | 3707 | |
---|
3543 | 3708 | return nbytes; |
---|
.. | .. |
---|
3567 | 3732 | static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, |
---|
3568 | 3733 | poll_table *pt) |
---|
3569 | 3734 | { |
---|
3570 | | - return psi_trigger_poll(&of->priv, of->file, pt); |
---|
| 3735 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
| 3736 | + return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); |
---|
3571 | 3737 | } |
---|
3572 | 3738 | |
---|
3573 | 3739 | static void cgroup_pressure_release(struct kernfs_open_file *of) |
---|
3574 | 3740 | { |
---|
3575 | | - psi_trigger_replace(&of->priv, NULL); |
---|
| 3741 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
| 3742 | + |
---|
| 3743 | + psi_trigger_destroy(ctx->psi.trigger); |
---|
3576 | 3744 | } |
---|
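End to end, the per-fd trigger plumbing above is driven like this from userspace: open a pressure file, write one trigger (a second write on the same fd now fails with -EBUSY thanks to the `ctx->psi.trigger` check), then poll for POLLPRI. A hedged sketch (the cgroup path and thresholds are examples):

```c
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char trig[] = "some 150000 1000000"; /* 150ms stall per 1s window */
	struct pollfd pfd;
	int fd;

	fd = open("/sys/fs/cgroup/test/memory.pressure", O_RDWR | O_NONBLOCK);
	if (fd < 0)
		return 1;
	if (write(fd, trig, strlen(trig) + 1) < 0)
		return 1;	/* a second write on this fd would get EBUSY */

	pfd.fd = fd;
	pfd.events = POLLPRI;	/* PSI reports threshold crossings as POLLPRI */
	poll(&pfd, 1, -1);	/* blocks until pressure exceeds the trigger */
	close(fd);
	return 0;
}
```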
3577 | 3745 | |
---|
3578 | 3746 | bool cgroup_psi_enabled(void) |
---|
.. | .. |
---|
3625 | 3793 | static int cgroup_file_open(struct kernfs_open_file *of) |
---|
3626 | 3794 | { |
---|
3627 | 3795 | struct cftype *cft = of->kn->priv; |
---|
| 3796 | + struct cgroup_file_ctx *ctx; |
---|
| 3797 | + int ret; |
---|
3628 | 3798 | |
---|
3629 | | - if (cft->open) |
---|
3630 | | - return cft->open(of); |
---|
3631 | | - return 0; |
---|
| 3799 | + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); |
---|
| 3800 | + if (!ctx) |
---|
| 3801 | + return -ENOMEM; |
---|
| 3802 | + |
---|
| 3803 | + ctx->ns = current->nsproxy->cgroup_ns; |
---|
| 3804 | + get_cgroup_ns(ctx->ns); |
---|
| 3805 | + of->priv = ctx; |
---|
| 3806 | + |
---|
| 3807 | + if (!cft->open) |
---|
| 3808 | + return 0; |
---|
| 3809 | + |
---|
| 3810 | + ret = cft->open(of); |
---|
| 3811 | + if (ret) { |
---|
| 3812 | + put_cgroup_ns(ctx->ns); |
---|
| 3813 | + kfree(ctx); |
---|
| 3814 | + } |
---|
| 3815 | + return ret; |
---|
3632 | 3816 | } |
---|
3633 | 3817 | |
---|
3634 | 3818 | static void cgroup_file_release(struct kernfs_open_file *of) |
---|
3635 | 3819 | { |
---|
3636 | 3820 | struct cftype *cft = of->kn->priv; |
---|
| 3821 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
3637 | 3822 | |
---|
3638 | 3823 | if (cft->release) |
---|
3639 | 3824 | cft->release(of); |
---|
| 3825 | + put_cgroup_ns(ctx->ns); |
---|
| 3826 | + kfree(ctx); |
---|
3640 | 3827 | } |
---|
3641 | 3828 | |
---|
3642 | 3829 | static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, |
---|
3643 | 3830 | size_t nbytes, loff_t off) |
---|
3644 | 3831 | { |
---|
3645 | | - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; |
---|
| 3832 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
3646 | 3833 | struct cgroup *cgrp = of->kn->parent->priv; |
---|
3647 | 3834 | struct cftype *cft = of->kn->priv; |
---|
3648 | 3835 | struct cgroup_subsys_state *css; |
---|
3649 | 3836 | int ret; |
---|
| 3837 | + |
---|
| 3838 | + if (!nbytes) |
---|
| 3839 | + return 0; |
---|
3650 | 3840 | |
---|
3651 | 3841 | /* |
---|
3652 | 3842 | * If namespaces are delegation boundaries, disallow writes to |
---|
.. | .. |
---|
3656 | 3846 | */ |
---|
3657 | 3847 | if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && |
---|
3658 | 3848 | !(cft->flags & CFTYPE_NS_DELEGATABLE) && |
---|
3659 | | - ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) |
---|
| 3849 | + ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) |
---|
3660 | 3850 | return -EPERM; |
---|
3661 | 3851 | |
---|
3662 | 3852 | if (cft->write) |
---|
.. | .. |
---|
3843 | 4033 | continue; |
---|
3844 | 4034 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) |
---|
3845 | 4035 | continue; |
---|
3846 | | - |
---|
| 4036 | + if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) |
---|
| 4037 | + continue; |
---|
3847 | 4038 | if (is_add) { |
---|
3848 | 4039 | ret = cgroup_add_file(css, cgrp, cft); |
---|
3849 | 4040 | if (ret) { |
---|
.. | .. |
---|
4045 | 4236 | cft->flags |= __CFTYPE_NOT_ON_DFL; |
---|
4046 | 4237 | return cgroup_add_cftypes(ss, cfts); |
---|
4047 | 4238 | } |
---|
| 4239 | +EXPORT_SYMBOL_GPL(cgroup_add_legacy_cftypes); |
---|
4048 | 4240 | |
---|
4049 | 4241 | /** |
---|
4050 | 4242 | * cgroup_file_notify - generate a file modified event for a cgroup_file |
---|
.. | .. |
---|
4120 | 4312 | } else if (likely(!(pos->flags & CSS_RELEASED))) { |
---|
4121 | 4313 | next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); |
---|
4122 | 4314 | } else { |
---|
4123 | | - list_for_each_entry_rcu(next, &parent->children, sibling) |
---|
| 4315 | + list_for_each_entry_rcu(next, &parent->children, sibling, |
---|
| 4316 | + lockdep_is_held(&cgroup_mutex)) |
---|
4124 | 4317 | if (next->serial_nr > pos->serial_nr) |
---|
4125 | 4318 | break; |
---|
4126 | 4319 | } |
---|
.. | .. |
---|
4133 | 4326 | return next; |
---|
4134 | 4327 | return NULL; |
---|
4135 | 4328 | } |
---|
| 4329 | +EXPORT_SYMBOL_GPL(css_next_child); |
---|
4136 | 4330 | |
---|
4137 | 4331 | /** |
---|
4138 | 4332 | * css_next_descendant_pre - find the next descendant for pre-order walk |
---|
.. | .. |
---|
4182 | 4376 | |
---|
4183 | 4377 | return NULL; |
---|
4184 | 4378 | } |
---|
| 4379 | +EXPORT_SYMBOL_GPL(css_next_descendant_pre); |
---|
4185 | 4380 | |
---|
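With css_next_child() and css_next_descendant_pre() now exported, modules can reuse the core's pre-order subtree walks. A hedged sketch using the existing css_for_each_descendant_pre() wrapper; the helper name, the pr_debug() body, and the assumption that the caller already holds @root_css are illustrative only:

```c
/* Sketch: pre-order walk over a subtree, as the new exports permit.
 * @root_css is assumed to be a css the caller already holds. */
static void walk_subtree(struct cgroup_subsys_state *root_css)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root_css) {
		/* @pos may be dying; check before touching its state */
		if (css_is_dying(pos))
			continue;
		pr_debug("visiting cgroup %llu\n", cgroup_id(pos->cgroup));
	}
	rcu_read_unlock();
}
```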
4186 | 4381 | /** |
---|
4187 | 4382 | * css_rightmost_descendant - return the rightmost descendant of a css |
---|
.. | .. |
---|
4362 | 4557 | |
---|
4363 | 4558 | lockdep_assert_held(&css_set_lock); |
---|
4364 | 4559 | |
---|
4365 | | - /* Advance to the next non-empty css_set */ |
---|
4366 | | - do { |
---|
4367 | | - cset = css_task_iter_next_css_set(it); |
---|
4368 | | - if (!cset) { |
---|
4369 | | - it->task_pos = NULL; |
---|
4370 | | - return; |
---|
| 4560 | + /* Advance to the next non-empty css_set and find the first non-empty tasks list */ |
---|
| 4561 | + while ((cset = css_task_iter_next_css_set(it))) { |
---|
| 4562 | + if (!list_empty(&cset->tasks)) { |
---|
| 4563 | + it->cur_tasks_head = &cset->tasks; |
---|
| 4564 | + break; |
---|
| 4565 | + } else if (!list_empty(&cset->mg_tasks)) { |
---|
| 4566 | + it->cur_tasks_head = &cset->mg_tasks; |
---|
| 4567 | + break; |
---|
| 4568 | + } else if (!list_empty(&cset->dying_tasks)) { |
---|
| 4569 | + it->cur_tasks_head = &cset->dying_tasks; |
---|
| 4570 | + break; |
---|
4371 | 4571 | } |
---|
4372 | | - } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); |
---|
4373 | | - |
---|
4374 | | - if (!list_empty(&cset->tasks)) { |
---|
4375 | | - it->task_pos = cset->tasks.next; |
---|
4376 | | - it->cur_tasks_head = &cset->tasks; |
---|
4377 | | - } else if (!list_empty(&cset->mg_tasks)) { |
---|
4378 | | - it->task_pos = cset->mg_tasks.next; |
---|
4379 | | - it->cur_tasks_head = &cset->mg_tasks; |
---|
4380 | | - } else { |
---|
4381 | | - it->task_pos = cset->dying_tasks.next; |
---|
4382 | | - it->cur_tasks_head = &cset->dying_tasks; |
---|
4383 | 4572 | } |
---|
4384 | | - |
---|
4385 | | - it->tasks_head = &cset->tasks; |
---|
4386 | | - it->mg_tasks_head = &cset->mg_tasks; |
---|
4387 | | - it->dying_tasks_head = &cset->dying_tasks; |
---|
| 4573 | + if (!cset) { |
---|
| 4574 | + it->task_pos = NULL; |
---|
| 4575 | + return; |
---|
| 4576 | + } |
---|
| 4577 | + it->task_pos = it->cur_tasks_head->next; |
---|
4388 | 4578 | |
---|
4389 | 4579 | /* |
---|
4390 | 4580 | * We don't keep css_sets locked across iteration steps and thus |
---|
.. | .. |
---|
4429 | 4619 | repeat: |
---|
4430 | 4620 | if (it->task_pos) { |
---|
4431 | 4621 | /* |
---|
4432 | | - * Advance iterator to find next entry. cset->tasks is |
---|
4433 | | - * consumed first and then ->mg_tasks. After ->mg_tasks, |
---|
4434 | | - * we move onto the next cset. |
---|
| 4622 | + * Advance iterator to find next entry. We go through cset |
---|
| 4623 | + * tasks, mg_tasks and dying_tasks; once these are consumed we move onto |
---|
| 4624 | + * the next cset. |
---|
4435 | 4625 | */ |
---|
4436 | 4626 | if (it->flags & CSS_TASK_ITER_SKIPPED) |
---|
4437 | 4627 | it->flags &= ~CSS_TASK_ITER_SKIPPED; |
---|
4438 | 4628 | else |
---|
4439 | 4629 | it->task_pos = it->task_pos->next; |
---|
4440 | 4630 | |
---|
4441 | | - if (it->task_pos == it->tasks_head) { |
---|
4442 | | - it->task_pos = it->mg_tasks_head->next; |
---|
4443 | | - it->cur_tasks_head = it->mg_tasks_head; |
---|
| 4631 | + if (it->task_pos == &it->cur_cset->tasks) { |
---|
| 4632 | + it->cur_tasks_head = &it->cur_cset->mg_tasks; |
---|
| 4633 | + it->task_pos = it->cur_tasks_head->next; |
---|
4444 | 4634 | } |
---|
4445 | | - if (it->task_pos == it->mg_tasks_head) { |
---|
4446 | | - it->task_pos = it->dying_tasks_head->next; |
---|
4447 | | - it->cur_tasks_head = it->dying_tasks_head; |
---|
| 4635 | + if (it->task_pos == &it->cur_cset->mg_tasks) { |
---|
| 4636 | + it->cur_tasks_head = &it->cur_cset->dying_tasks; |
---|
| 4637 | + it->task_pos = it->cur_tasks_head->next; |
---|
4448 | 4638 | } |
---|
4449 | | - if (it->task_pos == it->dying_tasks_head) |
---|
| 4639 | + if (it->task_pos == &it->cur_cset->dying_tasks) |
---|
4450 | 4640 | css_task_iter_advance_css_set(it); |
---|
4451 | 4641 | } else { |
---|
4452 | 4642 | /* called from start, proceed to the first cset */ |
---|
.. | .. |
---|
4464 | 4654 | goto repeat; |
---|
4465 | 4655 | |
---|
4466 | 4656 | /* and dying leaders w/o live member threads */ |
---|
4467 | | - if (it->cur_tasks_head == it->dying_tasks_head && |
---|
| 4657 | + if (it->cur_tasks_head == &it->cur_cset->dying_tasks && |
---|
4468 | 4658 | !atomic_read(&task->signal->live)) |
---|
4469 | 4659 | goto repeat; |
---|
4470 | 4660 | } else { |
---|
4471 | 4661 | /* skip all dying ones */ |
---|
4472 | | - if (it->cur_tasks_head == it->dying_tasks_head) |
---|
| 4662 | + if (it->cur_tasks_head == &it->cur_cset->dying_tasks) |
---|
4473 | 4663 | goto repeat; |
---|
4474 | 4664 | } |
---|
4475 | 4665 | } |
---|
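The rework drops the cached tasks_head/mg_tasks_head/dying_tasks_head pointers and derives each list head from it->cur_cset on demand; the external iterator API is unchanged. A short consumer sketch of that API (the helper name count_tasks is made up for illustration):

```c
/* Sketch: count live threads attached to a css via the task iterator.
 * With flags == 0 the iterator walks individual threads. */
static int count_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int n = 0;

	css_task_iter_start(css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		n++;
	css_task_iter_end(&it);
	return n;
}
```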
.. | .. |
---|
4488 | 4678 | void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, |
---|
4489 | 4679 | struct css_task_iter *it) |
---|
4490 | 4680 | { |
---|
4491 | | - /* no one should try to iterate before mounting cgroups */ |
---|
4492 | | - WARN_ON_ONCE(!use_task_css_set_links); |
---|
4493 | | - |
---|
4494 | 4681 | memset(it, 0, sizeof(*it)); |
---|
4495 | 4682 | |
---|
4496 | 4683 | spin_lock_irq(&css_set_lock); |
---|
.. | .. |
---|
4567 | 4754 | |
---|
4568 | 4755 | static void cgroup_procs_release(struct kernfs_open_file *of) |
---|
4569 | 4756 | { |
---|
4570 | | - if (of->priv) { |
---|
4571 | | - css_task_iter_end(of->priv); |
---|
4572 | | - kfree(of->priv); |
---|
4573 | | - } |
---|
| 4757 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
| 4758 | + |
---|
| 4759 | + if (ctx->procs.started) |
---|
| 4760 | + css_task_iter_end(&ctx->procs.iter); |
---|
4574 | 4761 | } |
---|
4575 | 4762 | |
---|
4576 | 4763 | static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) |
---|
4577 | 4764 | { |
---|
4578 | 4765 | struct kernfs_open_file *of = s->private; |
---|
4579 | | - struct css_task_iter *it = of->priv; |
---|
| 4766 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
4580 | 4767 | |
---|
4581 | 4768 | if (pos) |
---|
4582 | 4769 | (*pos)++; |
---|
4583 | 4770 | |
---|
4584 | | - return css_task_iter_next(it); |
---|
| 4771 | + return css_task_iter_next(&ctx->procs.iter); |
---|
4585 | 4772 | } |
---|
4586 | 4773 | |
---|
4587 | 4774 | static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, |
---|
.. | .. |
---|
4589 | 4776 | { |
---|
4590 | 4777 | struct kernfs_open_file *of = s->private; |
---|
4591 | 4778 | struct cgroup *cgrp = seq_css(s)->cgroup; |
---|
4592 | | - struct css_task_iter *it = of->priv; |
---|
| 4779 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
| 4780 | + struct css_task_iter *it = &ctx->procs.iter; |
---|
4593 | 4781 | |
---|
4594 | 4782 | /* |
---|
4595 | 4783 | * When a seq_file is seeked, it's always traversed sequentially |
---|
4596 | 4784 | * from position 0, so we can simply keep iterating on !0 *pos. |
---|
4597 | 4785 | */ |
---|
4598 | | - if (!it) { |
---|
| 4786 | + if (!ctx->procs.started) { |
---|
4599 | 4787 | if (WARN_ON_ONCE((*pos))) |
---|
4600 | 4788 | return ERR_PTR(-EINVAL); |
---|
4601 | | - |
---|
4602 | | - it = kzalloc(sizeof(*it), GFP_KERNEL); |
---|
4603 | | - if (!it) |
---|
4604 | | - return ERR_PTR(-ENOMEM); |
---|
4605 | | - of->priv = it; |
---|
4606 | 4789 | css_task_iter_start(&cgrp->self, iter_flags, it); |
---|
| 4790 | + ctx->procs.started = true; |
---|
4607 | 4791 | } else if (!(*pos)) { |
---|
4608 | 4792 | css_task_iter_end(it); |
---|
4609 | 4793 | css_task_iter_start(&cgrp->self, iter_flags, it); |
---|
.. | .. |
---|
4636 | 4820 | return 0; |
---|
4637 | 4821 | } |
---|
4638 | 4822 | |
---|
| 4823 | +static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) |
---|
| 4824 | +{ |
---|
| 4825 | + int ret; |
---|
| 4826 | + struct inode *inode; |
---|
| 4827 | + |
---|
| 4828 | + lockdep_assert_held(&cgroup_mutex); |
---|
| 4829 | + |
---|
| 4830 | + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); |
---|
| 4831 | + if (!inode) |
---|
| 4832 | + return -ENOMEM; |
---|
| 4833 | + |
---|
| 4834 | + ret = inode_permission(inode, MAY_WRITE); |
---|
| 4835 | + iput(inode); |
---|
| 4836 | + return ret; |
---|
| 4837 | +} |
---|
| 4838 | + |
---|
4639 | 4839 | static int cgroup_procs_write_permission(struct cgroup *src_cgrp, |
---|
4640 | 4840 | struct cgroup *dst_cgrp, |
---|
4641 | | - struct super_block *sb) |
---|
| 4841 | + struct super_block *sb, |
---|
| 4842 | + struct cgroup_namespace *ns) |
---|
4642 | 4843 | { |
---|
4643 | | - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; |
---|
4644 | 4844 | struct cgroup *com_cgrp = src_cgrp; |
---|
4645 | | - struct inode *inode; |
---|
4646 | 4845 | int ret; |
---|
4647 | 4846 | |
---|
4648 | 4847 | lockdep_assert_held(&cgroup_mutex); |
---|
.. | .. |
---|
4652 | 4851 | com_cgrp = cgroup_parent(com_cgrp); |
---|
4653 | 4852 | |
---|
4654 | 4853 | /* %current should be authorized to migrate to the common ancestor */ |
---|
4655 | | - inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); |
---|
4656 | | - if (!inode) |
---|
4657 | | - return -ENOMEM; |
---|
4658 | | - |
---|
4659 | | - ret = inode_permission(inode, MAY_WRITE); |
---|
4660 | | - iput(inode); |
---|
| 4854 | + ret = cgroup_may_write(com_cgrp, sb); |
---|
4661 | 4855 | if (ret) |
---|
4662 | 4856 | return ret; |
---|
4663 | 4857 | |
---|
.. | .. |
---|
4673 | 4867 | return 0; |
---|
4674 | 4868 | } |
---|
4675 | 4869 | |
---|
| 4870 | +static int cgroup_attach_permissions(struct cgroup *src_cgrp, |
---|
| 4871 | + struct cgroup *dst_cgrp, |
---|
| 4872 | + struct super_block *sb, bool threadgroup, |
---|
| 4873 | + struct cgroup_namespace *ns) |
---|
| 4874 | +{ |
---|
| 4875 | + int ret = 0; |
---|
| 4876 | + |
---|
| 4877 | + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); |
---|
| 4878 | + if (ret) |
---|
| 4879 | + return ret; |
---|
| 4880 | + |
---|
| 4881 | + ret = cgroup_migrate_vet_dst(dst_cgrp); |
---|
| 4882 | + if (ret) |
---|
| 4883 | + return ret; |
---|
| 4884 | + |
---|
| 4885 | + if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)) |
---|
| 4886 | + ret = -EOPNOTSUPP; |
---|
| 4887 | + |
---|
| 4888 | + return ret; |
---|
| 4889 | +} |
---|
| 4890 | + |
---|
4676 | 4891 | static ssize_t cgroup_procs_write(struct kernfs_open_file *of, |
---|
4677 | 4892 | char *buf, size_t nbytes, loff_t off) |
---|
4678 | 4893 | { |
---|
| 4894 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
4679 | 4895 | struct cgroup *src_cgrp, *dst_cgrp; |
---|
4680 | 4896 | struct task_struct *task; |
---|
| 4897 | + const struct cred *saved_cred; |
---|
4681 | 4898 | ssize_t ret; |
---|
| 4899 | + bool threadgroup_locked; |
---|
4682 | 4900 | |
---|
4683 | 4901 | dst_cgrp = cgroup_kn_lock_live(of->kn, false); |
---|
4684 | 4902 | if (!dst_cgrp) |
---|
4685 | 4903 | return -ENODEV; |
---|
4686 | 4904 | |
---|
4687 | | - task = cgroup_procs_write_start(buf, true); |
---|
| 4905 | + task = cgroup_procs_write_start(buf, true, &threadgroup_locked, dst_cgrp); |
---|
4688 | 4906 | ret = PTR_ERR_OR_ZERO(task); |
---|
4689 | 4907 | if (ret) |
---|
4690 | 4908 | goto out_unlock; |
---|
.. | .. |
---|
4694 | 4912 | src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); |
---|
4695 | 4913 | spin_unlock_irq(&css_set_lock); |
---|
4696 | 4914 | |
---|
4697 | | - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, |
---|
4698 | | - of->file->f_path.dentry->d_sb); |
---|
| 4915 | + /* |
---|
| 4916 | + * Process and thread migrations follow the same delegation rule. Check |
---|
| 4917 | + * permissions using the credentials from file open to protect against |
---|
| 4918 | + * inherited fd attacks. |
---|
| 4919 | + */ |
---|
| 4920 | + saved_cred = override_creds(of->file->f_cred); |
---|
| 4921 | + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, |
---|
| 4922 | + of->file->f_path.dentry->d_sb, true, |
---|
| 4923 | + ctx->ns); |
---|
| 4924 | + revert_creds(saved_cred); |
---|
4699 | 4925 | if (ret) |
---|
4700 | 4926 | goto out_finish; |
---|
4701 | 4927 | |
---|
4702 | 4928 | ret = cgroup_attach_task(dst_cgrp, task, true); |
---|
4703 | 4929 | |
---|
4704 | 4930 | out_finish: |
---|
4705 | | - cgroup_procs_write_finish(task); |
---|
| 4931 | + cgroup_procs_write_finish(task, threadgroup_locked); |
---|
4706 | 4932 | out_unlock: |
---|
4707 | 4933 | cgroup_kn_unlock(of->kn); |
---|
4708 | 4934 | |
---|
.. | .. |
---|
4717 | 4943 | static ssize_t cgroup_threads_write(struct kernfs_open_file *of, |
---|
4718 | 4944 | char *buf, size_t nbytes, loff_t off) |
---|
4719 | 4945 | { |
---|
| 4946 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
4720 | 4947 | struct cgroup *src_cgrp, *dst_cgrp; |
---|
4721 | 4948 | struct task_struct *task; |
---|
| 4949 | + const struct cred *saved_cred; |
---|
4722 | 4950 | ssize_t ret; |
---|
| 4951 | + bool threadgroup_locked; |
---|
4723 | 4952 | |
---|
4724 | 4953 | buf = strstrip(buf); |
---|
4725 | 4954 | |
---|
.. | .. |
---|
4727 | 4956 | if (!dst_cgrp) |
---|
4728 | 4957 | return -ENODEV; |
---|
4729 | 4958 | |
---|
4730 | | - task = cgroup_procs_write_start(buf, false); |
---|
| 4959 | + task = cgroup_procs_write_start(buf, false, &threadgroup_locked, dst_cgrp); |
---|
4731 | 4960 | ret = PTR_ERR_OR_ZERO(task); |
---|
4732 | 4961 | if (ret) |
---|
4733 | 4962 | goto out_unlock; |
---|
.. | .. |
---|
4737 | 4966 | src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); |
---|
4738 | 4967 | spin_unlock_irq(&css_set_lock); |
---|
4739 | 4968 | |
---|
4740 | | - /* thread migrations follow the cgroup.procs delegation rule */ |
---|
4741 | | - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, |
---|
4742 | | - of->file->f_path.dentry->d_sb); |
---|
| 4969 | + /* |
---|
| 4970 | + * Process and thread migrations follow the same delegation rule. Check |
---|
| 4971 | + * permissions using the credentials from file open to protect against |
---|
| 4972 | + * inherited fd attacks. |
---|
| 4973 | + */ |
---|
| 4974 | + saved_cred = override_creds(of->file->f_cred); |
---|
| 4975 | + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, |
---|
| 4976 | + of->file->f_path.dentry->d_sb, false, |
---|
| 4977 | + ctx->ns); |
---|
| 4978 | + revert_creds(saved_cred); |
---|
4743 | 4979 | if (ret) |
---|
4744 | | - goto out_finish; |
---|
4745 | | - |
---|
4746 | | - /* and must be contained in the same domain */ |
---|
4747 | | - ret = -EOPNOTSUPP; |
---|
4748 | | - if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) |
---|
4749 | 4980 | goto out_finish; |
---|
4750 | 4981 | |
---|
4751 | 4982 | ret = cgroup_attach_task(dst_cgrp, task, false); |
---|
4752 | 4983 | |
---|
4753 | 4984 | out_finish: |
---|
4754 | | - cgroup_procs_write_finish(task); |
---|
| 4985 | + cgroup_procs_write_finish(task, threadgroup_locked); |
---|
4755 | 4986 | out_unlock: |
---|
4756 | 4987 | cgroup_kn_unlock(of->kn); |
---|
4757 | 4988 | |
---|
.. | .. |
---|
4823 | 5054 | }, |
---|
4824 | 5055 | { |
---|
4825 | 5056 | .name = "cpu.stat", |
---|
4826 | | - .flags = CFTYPE_NOT_ON_ROOT, |
---|
4827 | 5057 | .seq_show = cpu_stat_show, |
---|
4828 | 5058 | }, |
---|
4829 | 5059 | #ifdef CONFIG_PSI |
---|
4830 | 5060 | { |
---|
4831 | 5061 | .name = "io.pressure", |
---|
4832 | | - .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE, |
---|
| 5062 | + .flags = CFTYPE_PRESSURE, |
---|
4833 | 5063 | .seq_show = cgroup_io_pressure_show, |
---|
4834 | 5064 | .write = cgroup_io_pressure_write, |
---|
4835 | 5065 | .poll = cgroup_pressure_poll, |
---|
.. | .. |
---|
4837 | 5067 | }, |
---|
4838 | 5068 | { |
---|
4839 | 5069 | .name = "memory.pressure", |
---|
4840 | | - .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE, |
---|
| 5070 | + .flags = CFTYPE_PRESSURE, |
---|
4841 | 5071 | .seq_show = cgroup_memory_pressure_show, |
---|
4842 | 5072 | .write = cgroup_memory_pressure_write, |
---|
4843 | 5073 | .poll = cgroup_pressure_poll, |
---|
.. | .. |
---|
4845 | 5075 | }, |
---|
4846 | 5076 | { |
---|
4847 | 5077 | .name = "cpu.pressure", |
---|
4848 | | - .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE, |
---|
| 5078 | + .flags = CFTYPE_PRESSURE, |
---|
4849 | 5079 | .seq_show = cgroup_cpu_pressure_show, |
---|
4850 | 5080 | .write = cgroup_cpu_pressure_write, |
---|
4851 | 5081 | .poll = cgroup_pressure_poll, |
---|
.. | .. |
---|
4927 | 5157 | } |
---|
4928 | 5158 | } |
---|
4929 | 5159 | |
---|
4930 | | -static void css_release_work_fn(struct swork_event *sev) |
---|
| 5160 | +static void css_release_work_fn(struct work_struct *work) |
---|
4931 | 5161 | { |
---|
4932 | 5162 | struct cgroup_subsys_state *css = |
---|
4933 | | - container_of(sev, struct cgroup_subsys_state, destroy_swork); |
---|
| 5163 | + container_of(work, struct cgroup_subsys_state, destroy_work); |
---|
4934 | 5164 | struct cgroup_subsys *ss = css->ss; |
---|
4935 | 5165 | struct cgroup *cgrp = css->cgroup; |
---|
4936 | 5166 | |
---|
.. | .. |
---|
4964 | 5194 | tcgrp->nr_dying_descendants--; |
---|
4965 | 5195 | spin_unlock_irq(&css_set_lock); |
---|
4966 | 5196 | |
---|
4967 | | - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); |
---|
4968 | | - cgrp->id = -1; |
---|
4969 | | - |
---|
4970 | 5197 | /* |
---|
4971 | 5198 | * There are two control paths which try to determine |
---|
4972 | 5199 | * cgroup from dentry without going through kernfs - |
---|
.. | .. |
---|
4977 | 5204 | if (cgrp->kn) |
---|
4978 | 5205 | RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, |
---|
4979 | 5206 | NULL); |
---|
4980 | | - |
---|
4981 | | - cgroup_bpf_put(cgrp); |
---|
4982 | 5207 | } |
---|
4983 | 5208 | |
---|
4984 | 5209 | mutex_unlock(&cgroup_mutex); |
---|
.. | .. |
---|
4992 | 5217 | struct cgroup_subsys_state *css = |
---|
4993 | 5218 | container_of(ref, struct cgroup_subsys_state, refcnt); |
---|
4994 | 5219 | |
---|
4995 | | - INIT_SWORK(&css->destroy_swork, css_release_work_fn); |
---|
4996 | | - swork_queue(&css->destroy_swork); |
---|
| 5220 | + INIT_WORK(&css->destroy_work, css_release_work_fn); |
---|
| 5221 | + queue_work(cgroup_destroy_wq, &css->destroy_work); |
---|
4997 | 5222 | } |
---|
4998 | 5223 | |
---|
4999 | 5224 | static void init_and_link_css(struct cgroup_subsys_state *css, |
---|
.. | .. |
---|
5133 | 5358 | * it isn't associated with its kernfs_node and doesn't have the control |
---|
5134 | 5359 | * mask applied. |
---|
5135 | 5360 | */ |
---|
5136 | | -static struct cgroup *cgroup_create(struct cgroup *parent) |
---|
| 5361 | +static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, |
---|
| 5362 | + umode_t mode) |
---|
5137 | 5363 | { |
---|
5138 | 5364 | struct cgroup_root *root = parent->root; |
---|
5139 | 5365 | struct cgroup *cgrp, *tcgrp; |
---|
| 5366 | + struct kernfs_node *kn; |
---|
5140 | 5367 | int level = parent->level + 1; |
---|
5141 | 5368 | int ret; |
---|
5142 | 5369 | |
---|
.. | .. |
---|
5156 | 5383 | goto out_cancel_ref; |
---|
5157 | 5384 | } |
---|
5158 | 5385 | |
---|
5159 | | - /* |
---|
5160 | | - * Temporarily set the pointer to NULL, so idr_find() won't return |
---|
5161 | | - * a half-baked cgroup. |
---|
5162 | | - */ |
---|
5163 | | - cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); |
---|
5164 | | - if (cgrp->id < 0) { |
---|
5165 | | - ret = -ENOMEM; |
---|
| 5386 | + /* create the directory */ |
---|
| 5387 | + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); |
---|
| 5388 | + if (IS_ERR(kn)) { |
---|
| 5389 | + ret = PTR_ERR(kn); |
---|
5166 | 5390 | goto out_stat_exit; |
---|
5167 | 5391 | } |
---|
| 5392 | + cgrp->kn = kn; |
---|
5168 | 5393 | |
---|
5169 | 5394 | init_cgroup_housekeeping(cgrp); |
---|
5170 | 5395 | |
---|
.. | .. |
---|
5174 | 5399 | |
---|
5175 | 5400 | ret = psi_cgroup_alloc(cgrp); |
---|
5176 | 5401 | if (ret) |
---|
5177 | | - goto out_idr_free; |
---|
| 5402 | + goto out_kernfs_remove; |
---|
5178 | 5403 | |
---|
5179 | 5404 | ret = cgroup_bpf_inherit(cgrp); |
---|
5180 | 5405 | if (ret) |
---|
.. | .. |
---|
5198 | 5423 | |
---|
5199 | 5424 | spin_lock_irq(&css_set_lock); |
---|
5200 | 5425 | for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { |
---|
5201 | | - cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; |
---|
| 5426 | + cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); |
---|
5202 | 5427 | |
---|
5203 | 5428 | if (tcgrp != cgrp) { |
---|
5204 | 5429 | tcgrp->nr_descendants++; |
---|
.. | .. |
---|
5228 | 5453 | cgroup_get_live(parent); |
---|
5229 | 5454 | |
---|
5230 | 5455 | /* |
---|
5231 | | - * @cgrp is now fully operational. If something fails after this |
---|
5232 | | - * point, it'll be released via the normal destruction path. |
---|
5233 | | - */ |
---|
5234 | | - cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); |
---|
5235 | | - |
---|
5236 | | - /* |
---|
5237 | 5456 | * On the default hierarchy, a child doesn't automatically inherit |
---|
5238 | 5457 | * subtree_control from the parent. Each is configured manually. |
---|
5239 | 5458 | */ |
---|
.. | .. |
---|
5246 | 5465 | |
---|
5247 | 5466 | out_psi_free: |
---|
5248 | 5467 | psi_cgroup_free(cgrp); |
---|
5249 | | -out_idr_free: |
---|
5250 | | - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); |
---|
| 5468 | +out_kernfs_remove: |
---|
| 5469 | + kernfs_remove(cgrp->kn); |
---|
5251 | 5470 | out_stat_exit: |
---|
5252 | 5471 | if (cgroup_on_dfl(parent)) |
---|
5253 | 5472 | cgroup_rstat_exit(cgrp); |
---|
.. | .. |
---|
5284 | 5503 | int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) |
---|
5285 | 5504 | { |
---|
5286 | 5505 | struct cgroup *parent, *cgrp; |
---|
5287 | | - struct kernfs_node *kn; |
---|
5288 | 5506 | int ret; |
---|
5289 | 5507 | |
---|
5290 | 5508 | /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ |
---|
.. | .. |
---|
5300 | 5518 | goto out_unlock; |
---|
5301 | 5519 | } |
---|
5302 | 5520 | |
---|
5303 | | - cgrp = cgroup_create(parent); |
---|
| 5521 | + cgrp = cgroup_create(parent, name, mode); |
---|
5304 | 5522 | if (IS_ERR(cgrp)) { |
---|
5305 | 5523 | ret = PTR_ERR(cgrp); |
---|
5306 | 5524 | goto out_unlock; |
---|
5307 | 5525 | } |
---|
5308 | 5526 | |
---|
5309 | | - /* create the directory */ |
---|
5310 | | - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); |
---|
5311 | | - if (IS_ERR(kn)) { |
---|
5312 | | - ret = PTR_ERR(kn); |
---|
5313 | | - goto out_destroy; |
---|
5314 | | - } |
---|
5315 | | - cgrp->kn = kn; |
---|
5316 | | - |
---|
5317 | 5527 | /* |
---|
5318 | 5528 | * This extra ref will be put in cgroup_free_fn() and guarantees |
---|
5319 | 5529 | * that @cgrp->kn is always accessible. |
---|
5320 | 5530 | */ |
---|
5321 | | - kernfs_get(kn); |
---|
| 5531 | + kernfs_get(cgrp->kn); |
---|
5322 | 5532 | |
---|
5323 | | - ret = cgroup_kn_set_ugid(kn); |
---|
| 5533 | + ret = cgroup_kn_set_ugid(cgrp->kn); |
---|
5324 | 5534 | if (ret) |
---|
5325 | 5535 | goto out_destroy; |
---|
5326 | 5536 | |
---|
.. | .. |
---|
5335 | 5545 | TRACE_CGROUP_PATH(mkdir, cgrp); |
---|
5336 | 5546 | |
---|
5337 | 5547 | /* let's create and online css's */ |
---|
5338 | | - kernfs_activate(kn); |
---|
| 5548 | + kernfs_activate(cgrp->kn); |
---|
5339 | 5549 | |
---|
5340 | 5550 | ret = 0; |
---|
5341 | 5551 | goto out_unlock; |
---|
.. | .. |
---|
5512 | 5722 | |
---|
5513 | 5723 | cgroup1_check_for_release(parent); |
---|
5514 | 5724 | |
---|
| 5725 | + cgroup_bpf_offline(cgrp); |
---|
| 5726 | + |
---|
5515 | 5727 | /* put the base reference */ |
---|
5516 | 5728 | percpu_ref_kill(&cgrp->self.refcnt); |
---|
5517 | 5729 | |
---|
.. | .. |
---|
5537 | 5749 | |
---|
5538 | 5750 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { |
---|
5539 | 5751 | .show_options = cgroup_show_options, |
---|
5540 | | - .remount_fs = cgroup_remount, |
---|
5541 | 5752 | .mkdir = cgroup_mkdir, |
---|
5542 | 5753 | .rmdir = cgroup_rmdir, |
---|
5543 | 5754 | .show_path = cgroup_show_path, |
---|
.. | .. |
---|
5604 | 5815 | */ |
---|
5605 | 5816 | int __init cgroup_init_early(void) |
---|
5606 | 5817 | { |
---|
5607 | | - static struct cgroup_sb_opts __initdata opts; |
---|
| 5818 | + static struct cgroup_fs_context __initdata ctx; |
---|
5608 | 5819 | struct cgroup_subsys *ss; |
---|
5609 | 5820 | int i; |
---|
5610 | 5821 | |
---|
5611 | | - init_cgroup_root(&cgrp_dfl_root, &opts); |
---|
| 5822 | + ctx.root = &cgrp_dfl_root; |
---|
| 5823 | + init_cgroup_root(&ctx); |
---|
5612 | 5824 | cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; |
---|
5613 | 5825 | |
---|
5614 | 5826 | RCU_INIT_POINTER(init_task.cgroups, &init_css_set); |
---|
.. | .. |
---|
5644 | 5856 | int ssid; |
---|
5645 | 5857 | |
---|
5646 | 5858 | BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); |
---|
5647 | | - BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); |
---|
5648 | 5859 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); |
---|
5649 | 5860 | BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); |
---|
5650 | 5861 | |
---|
5651 | 5862 | cgroup_rstat_boot(); |
---|
5652 | 5863 | |
---|
5653 | 5864 | /* |
---|
5654 | | - * The latency of the synchronize_sched() is too high for cgroups, |
---|
| 5865 | + * The latency of the synchronize_rcu() is too high for cgroups, |
---|
5655 | 5866 | * avoid it at the cost of forcing all readers into the slow path. |
---|
5656 | 5867 | */ |
---|
5657 | 5868 | rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); |
---|
.. | .. |
---|
5735 | 5946 | WARN_ON(register_filesystem(&cgroup_fs_type)); |
---|
5736 | 5947 | WARN_ON(register_filesystem(&cgroup2_fs_type)); |
---|
5737 | 5948 | WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); |
---|
| 5949 | +#ifdef CONFIG_CPUSETS |
---|
| 5950 | + WARN_ON(register_filesystem(&cpuset_fs_type)); |
---|
| 5951 | +#endif |
---|
5738 | 5952 | |
---|
5739 | 5953 | return 0; |
---|
5740 | 5954 | } |
---|
.. | .. |
---|
5751 | 5965 | */ |
---|
5752 | 5966 | cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); |
---|
5753 | 5967 | BUG_ON(!cgroup_destroy_wq); |
---|
5754 | | - BUG_ON(swork_get()); |
---|
5755 | 5968 | return 0; |
---|
5756 | 5969 | } |
---|
5757 | 5970 | core_initcall(cgroup_wq_init); |
---|
5758 | 5971 | |
---|
5759 | | -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, |
---|
5760 | | - char *buf, size_t buflen) |
---|
| 5972 | +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) |
---|
5761 | 5973 | { |
---|
5762 | 5974 | struct kernfs_node *kn; |
---|
5763 | 5975 | |
---|
5764 | | - kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); |
---|
| 5976 | + kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); |
---|
5765 | 5977 | if (!kn) |
---|
5766 | 5978 | return; |
---|
5767 | 5979 | kernfs_path(kn, buf, buflen); |
---|
.. | .. |
---|
5851 | 6063 | * @child: pointer to task_struct of forking parent process. |
---|
5852 | 6064 | * |
---|
5853 | 6065 | * A task is associated with the init_css_set until cgroup_post_fork() |
---|
5854 | | - * attaches it to the parent's css_set. Empty cg_list indicates that |
---|
5855 | | - * @child isn't holding reference to its css_set. |
---|
| 6066 | + * attaches it to the target css_set. |
---|
5856 | 6067 | */ |
---|
5857 | 6068 | void cgroup_fork(struct task_struct *child) |
---|
5858 | 6069 | { |
---|
.. | .. |
---|
5860 | 6071 | INIT_LIST_HEAD(&child->cg_list); |
---|
5861 | 6072 | } |
---|
5862 | 6073 | |
---|
| 6074 | +static struct cgroup *cgroup_get_from_file(struct file *f) |
---|
| 6075 | +{ |
---|
| 6076 | + struct cgroup_subsys_state *css; |
---|
| 6077 | + struct cgroup *cgrp; |
---|
| 6078 | + |
---|
| 6079 | + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); |
---|
| 6080 | + if (IS_ERR(css)) |
---|
| 6081 | + return ERR_CAST(css); |
---|
| 6082 | + |
---|
| 6083 | + cgrp = css->cgroup; |
---|
| 6084 | + if (!cgroup_on_dfl(cgrp)) { |
---|
| 6085 | + cgroup_put(cgrp); |
---|
| 6086 | + return ERR_PTR(-EBADF); |
---|
| 6087 | + } |
---|
| 6088 | + |
---|
| 6089 | + return cgrp; |
---|
| 6090 | +} |
---|
| 6091 | + |
---|
| 6092 | +/** |
---|
| 6093 | + * cgroup_css_set_fork - find or create a css_set for a child process |
---|
| 6094 | + * @kargs: the arguments passed to create the child process |
---|
| 6095 | + * |
---|
| 6096 | + * This function finds or creates a new css_set which the child |
---|
| 6097 | + * process will be attached to in cgroup_post_fork(). By default, |
---|
| 6098 | + * the child process will be given the same css_set as its parent. |
---|
| 6099 | + * |
---|
| 6100 | + * If CLONE_INTO_CGROUP is specified this function will try to find an |
---|
| 6101 | + * existing css_set which includes the requested cgroup and, if not, create |
---|
| 6102 | + * a new css_set that the child will be attached to later. If this function |
---|
| 6103 | + * succeeds it will hold cgroup_threadgroup_rwsem on return. If |
---|
| 6104 | + * CLONE_INTO_CGROUP is requested this function will grab the cgroup mutex |
---|
| 6105 | + * before grabbing cgroup_threadgroup_rwsem and will hold a reference |
---|
| 6106 | + * to the target cgroup. |
---|
| 6107 | + */ |
---|
| 6108 | +static int cgroup_css_set_fork(struct kernel_clone_args *kargs) |
---|
| 6109 | + __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem) |
---|
| 6110 | +{ |
---|
| 6111 | + int ret; |
---|
| 6112 | + struct cgroup *dst_cgrp = NULL; |
---|
| 6113 | + struct css_set *cset; |
---|
| 6114 | + struct super_block *sb; |
---|
| 6115 | + struct file *f; |
---|
| 6116 | + |
---|
| 6117 | + if (kargs->flags & CLONE_INTO_CGROUP) |
---|
| 6118 | + mutex_lock(&cgroup_mutex); |
---|
| 6119 | + |
---|
| 6120 | + cgroup_threadgroup_change_begin(current); |
---|
| 6121 | + |
---|
| 6122 | + spin_lock_irq(&css_set_lock); |
---|
| 6123 | + cset = task_css_set(current); |
---|
| 6124 | + get_css_set(cset); |
---|
| 6125 | + spin_unlock_irq(&css_set_lock); |
---|
| 6126 | + |
---|
| 6127 | + if (!(kargs->flags & CLONE_INTO_CGROUP)) { |
---|
| 6128 | + kargs->cset = cset; |
---|
| 6129 | + return 0; |
---|
| 6130 | + } |
---|
| 6131 | + |
---|
| 6132 | + f = fget_raw(kargs->cgroup); |
---|
| 6133 | + if (!f) { |
---|
| 6134 | + ret = -EBADF; |
---|
| 6135 | + goto err; |
---|
| 6136 | + } |
---|
| 6137 | + sb = f->f_path.dentry->d_sb; |
---|
| 6138 | + |
---|
| 6139 | + dst_cgrp = cgroup_get_from_file(f); |
---|
| 6140 | + if (IS_ERR(dst_cgrp)) { |
---|
| 6141 | + ret = PTR_ERR(dst_cgrp); |
---|
| 6142 | + dst_cgrp = NULL; |
---|
| 6143 | + goto err; |
---|
| 6144 | + } |
---|
| 6145 | + |
---|
| 6146 | + if (cgroup_is_dead(dst_cgrp)) { |
---|
| 6147 | + ret = -ENODEV; |
---|
| 6148 | + goto err; |
---|
| 6149 | + } |
---|
| 6150 | + |
---|
| 6151 | + /* |
---|
| 6152 | + * Verify that the target cgroup is writable for us. This is |
---|
| 6153 | + * usually done by the vfs layer but since we're not going through |
---|
| 6154 | + * the vfs layer here we need to do it "manually". |
---|
| 6155 | + */ |
---|
| 6156 | + ret = cgroup_may_write(dst_cgrp, sb); |
---|
| 6157 | + if (ret) |
---|
| 6158 | + goto err; |
---|
| 6159 | + |
---|
| 6160 | + ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, |
---|
| 6161 | + !(kargs->flags & CLONE_THREAD), |
---|
| 6162 | + current->nsproxy->cgroup_ns); |
---|
| 6163 | + if (ret) |
---|
| 6164 | + goto err; |
---|
| 6165 | + |
---|
| 6166 | + kargs->cset = find_css_set(cset, dst_cgrp); |
---|
| 6167 | + if (!kargs->cset) { |
---|
| 6168 | + ret = -ENOMEM; |
---|
| 6169 | + goto err; |
---|
| 6170 | + } |
---|
| 6171 | + |
---|
| 6172 | + put_css_set(cset); |
---|
| 6173 | + fput(f); |
---|
| 6174 | + kargs->cgrp = dst_cgrp; |
---|
| 6175 | + return ret; |
---|
| 6176 | + |
---|
| 6177 | +err: |
---|
| 6178 | + cgroup_threadgroup_change_end(current); |
---|
| 6179 | + mutex_unlock(&cgroup_mutex); |
---|
| 6180 | + if (f) |
---|
| 6181 | + fput(f); |
---|
| 6182 | + if (dst_cgrp) |
---|
| 6183 | + cgroup_put(dst_cgrp); |
---|
| 6184 | + put_css_set(cset); |
---|
| 6185 | + if (kargs->cset) |
---|
| 6186 | + put_css_set(kargs->cset); |
---|
| 6187 | + return ret; |
---|
| 6188 | +} |
---|
| 6189 | + |
---|
| 6190 | +/** |
---|
| 6191 | + * cgroup_css_set_put_fork - drop references we took during fork |
---|
| 6192 | + * @kargs: the arguments passed to create the child process |
---|
| 6193 | + * |
---|
| 6194 | + * Drop references to the prepared css_set and target cgroup if |
---|
| 6195 | + * CLONE_INTO_CGROUP was requested. |
---|
| 6196 | + */ |
---|
| 6197 | +static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) |
---|
| 6198 | + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) |
---|
| 6199 | +{ |
---|
| 6200 | + cgroup_threadgroup_change_end(current); |
---|
| 6201 | + |
---|
| 6202 | + if (kargs->flags & CLONE_INTO_CGROUP) { |
---|
| 6203 | + struct cgroup *cgrp = kargs->cgrp; |
---|
| 6204 | + struct css_set *cset = kargs->cset; |
---|
| 6205 | + |
---|
| 6206 | + mutex_unlock(&cgroup_mutex); |
---|
| 6207 | + |
---|
| 6208 | + if (cset) { |
---|
| 6209 | + put_css_set(cset); |
---|
| 6210 | + kargs->cset = NULL; |
---|
| 6211 | + } |
---|
| 6212 | + |
---|
| 6213 | + if (cgrp) { |
---|
| 6214 | + cgroup_put(cgrp); |
---|
| 6215 | + kargs->cgrp = NULL; |
---|
| 6216 | + } |
---|
| 6217 | + } |
---|
| 6218 | +} |
---|
| 6219 | + |
---|
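cgroup_css_set_fork() and cgroup_css_set_put_fork() are the kernel half of CLONE_INTO_CGROUP: the target css_set is found, permission-checked with cgroup_may_write() and cgroup_attach_permissions(), and pinned before the child ever becomes visible. From userspace the feature is reached through clone3(); a hedged sketch follows — the raw syscall is used because glibc offers no wrapper, and the cgroup path is an assumption:

```c
/* Sketch: spawn a child directly into a target cgroup with clone3().
 * The path "/sys/fs/cgroup/test" is illustrative. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/sched.h>	/* struct clone_args, CLONE_INTO_CGROUP */
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

static pid_t clone_into(int cgroup_fd)
{
	struct clone_args args = {
		.flags		= CLONE_INTO_CGROUP,
		.exit_signal	= SIGCHLD,
		.cgroup		= cgroup_fd,
	};

	return syscall(__NR_clone3, &args, sizeof(args));
}

int main(void)
{
	int cgfd = open("/sys/fs/cgroup/test", O_DIRECTORY | O_CLOEXEC);
	pid_t pid;

	if (cgfd < 0)
		return 1;
	pid = clone_into(cgfd);
	if (pid < 0) {
		perror("clone3");
		return 1;
	}
	if (pid == 0) {		/* child: already in the target cgroup */
		execlp("cat", "cat", "/proc/self/cgroup", (char *)NULL);
		_exit(127);
	}
	waitpid(pid, NULL, 0);
	return 0;
}
```

A write-protected or dead target cgroup unwinds through the err: path above and surfaces as an ordinary clone3() failure rather than a partially attached child.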
5863 | 6220 | /** |
---|
5864 | 6221 | * cgroup_can_fork - called on a new task before the process is exposed |
---|
5865 | | - * @child: the task in question. |
---|
| 6222 | + * @child: the child process |
---|
5866 | 6223 | * |
---|
5867 | | - * This calls the subsystem can_fork() callbacks. If the can_fork() callback |
---|
5868 | | - * returns an error, the fork aborts with that error code. This allows for |
---|
5869 | | - * a cgroup subsystem to conditionally allow or deny new forks. |
---|
| 6224 | + * This prepares a new css_set for the child process which the child will |
---|
| 6225 | + * be attached to in cgroup_post_fork(). |
---|
| 6226 | + * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() |
---|
| 6227 | + * callback returns an error, the fork aborts with that error code. This |
---|
| 6228 | + * allows for a cgroup subsystem to conditionally allow or deny new forks. |
---|
5870 | 6229 | */ |
---|
5871 | | -int cgroup_can_fork(struct task_struct *child) |
---|
| 6230 | +int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) |
---|
5872 | 6231 | { |
---|
5873 | 6232 | struct cgroup_subsys *ss; |
---|
5874 | 6233 | int i, j, ret; |
---|
5875 | 6234 | |
---|
| 6235 | + ret = cgroup_css_set_fork(kargs); |
---|
| 6236 | + if (ret) |
---|
| 6237 | + return ret; |
---|
| 6238 | + |
---|
5876 | 6239 | do_each_subsys_mask(ss, i, have_canfork_callback) { |
---|
5877 | | - ret = ss->can_fork(child); |
---|
| 6240 | + ret = ss->can_fork(child, kargs->cset); |
---|
5878 | 6241 | if (ret) |
---|
5879 | 6242 | goto out_revert; |
---|
5880 | 6243 | } while_each_subsys_mask(); |
---|
.. | .. |
---|
5886 | 6249 | if (j >= i) |
---|
5887 | 6250 | break; |
---|
5888 | 6251 | if (ss->cancel_fork) |
---|
5889 | | - ss->cancel_fork(child); |
---|
| 6252 | + ss->cancel_fork(child, kargs->cset); |
---|
5890 | 6253 | } |
---|
| 6254 | + |
---|
| 6255 | + cgroup_css_set_put_fork(kargs); |
---|
5891 | 6256 | |
---|
5892 | 6257 | return ret; |
---|
5893 | 6258 | } |
---|
5894 | 6259 | |
---|
5895 | 6260 | /** |
---|
5896 | 6261 | * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() |
---|
5897 | | - * @child: the task in question |
---|
| 6262 | + * @child: the child process |
---|
| 6263 | + * @kargs: the arguments passed to create the child process |
---|
5898 | 6264 | * |
---|
5899 | 6265 | * This calls the cancel_fork() callbacks if a fork failed *after* |
---|
5900 | | - * cgroup_can_fork() succeded. |
---|
| 6266 | + * cgroup_can_fork() succeeded and cleans up references we took to |
---|
| 6267 | + * prepare a new css_set for the child process in cgroup_can_fork(). |
---|
5901 | 6268 | */ |
---|
5902 | | -void cgroup_cancel_fork(struct task_struct *child) |
---|
| 6269 | +void cgroup_cancel_fork(struct task_struct *child, |
---|
| 6270 | + struct kernel_clone_args *kargs) |
---|
5903 | 6271 | { |
---|
5904 | 6272 | struct cgroup_subsys *ss; |
---|
5905 | 6273 | int i; |
---|
5906 | 6274 | |
---|
5907 | 6275 | for_each_subsys(ss, i) |
---|
5908 | 6276 | if (ss->cancel_fork) |
---|
5909 | | - ss->cancel_fork(child); |
---|
| 6277 | + ss->cancel_fork(child, kargs->cset); |
---|
| 6278 | + |
---|
| 6279 | + cgroup_css_set_put_fork(kargs); |
---|
5910 | 6280 | } |
---|
5911 | 6281 | |
---|
5912 | 6282 | /** |
---|
5913 | | - * cgroup_post_fork - called on a new task after adding it to the task list |
---|
5914 | | - * @child: the task in question |
---|
| 6283 | + * cgroup_post_fork - finalize cgroup setup for the child process |
---|
| 6284 | + * @child: the child process |
---|
5915 | 6285 | * |
---|
5916 | | - * Adds the task to the list running through its css_set if necessary and |
---|
5917 | | - * call the subsystem fork() callbacks. Has to be after the task is |
---|
5918 | | - * visible on the task list in case we race with the first call to |
---|
5919 | | - * cgroup_task_iter_start() - to guarantee that the new task ends up on its |
---|
5920 | | - * list. |
---|
| 6286 | + * Attach the child process to its css_set calling the subsystem fork() |
---|
| 6287 | + * callbacks. |
---|
5921 | 6288 | */ |
---|
5922 | | -void cgroup_post_fork(struct task_struct *child) |
---|
| 6289 | +void cgroup_post_fork(struct task_struct *child, |
---|
| 6290 | + struct kernel_clone_args *kargs) |
---|
| 6291 | + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) |
---|
5923 | 6292 | { |
---|
5924 | 6293 | struct cgroup_subsys *ss; |
---|
| 6294 | + struct css_set *cset; |
---|
5925 | 6295 | int i; |
---|
5926 | 6296 | |
---|
5927 | | - /* |
---|
5928 | | - * This may race against cgroup_enable_task_cg_lists(). As that |
---|
5929 | | - * function sets use_task_css_set_links before grabbing |
---|
5930 | | - * tasklist_lock and we just went through tasklist_lock to add |
---|
5931 | | - * @child, it's guaranteed that either we see the set |
---|
5932 | | - * use_task_css_set_links or cgroup_enable_task_cg_lists() sees |
---|
5933 | | - * @child during its iteration. |
---|
5934 | | - * |
---|
5935 | | - * If we won the race, @child is associated with %current's |
---|
5936 | | - * css_set. Grabbing css_set_lock guarantees both that the |
---|
5937 | | - * association is stable, and, on completion of the parent's |
---|
5938 | | - * migration, @child is visible in the source of migration or |
---|
5939 | | - * already in the destination cgroup. This guarantee is necessary |
---|
5940 | | - * when implementing operations which need to migrate all tasks of |
---|
5941 | | - * a cgroup to another. |
---|
5942 | | - * |
---|
5943 | | - * Note that if we lose to cgroup_enable_task_cg_lists(), @child |
---|
5944 | | - * will remain in init_css_set. This is safe because all tasks are |
---|
5945 | | - * in the init_css_set before cg_links is enabled and there's no |
---|
5946 | | - * operation which transfers all tasks out of init_css_set. |
---|
5947 | | - */ |
---|
5948 | | - if (use_task_css_set_links) { |
---|
5949 | | - struct css_set *cset; |
---|
| 6297 | + cset = kargs->cset; |
---|
| 6298 | + kargs->cset = NULL; |
---|
5950 | 6299 | |
---|
5951 | | - spin_lock_irq(&css_set_lock); |
---|
5952 | | - cset = task_css_set(current); |
---|
5953 | | - if (list_empty(&child->cg_list)) { |
---|
5954 | | - get_css_set(cset); |
---|
5955 | | - cset->nr_tasks++; |
---|
5956 | | - css_set_move_task(child, NULL, cset, false); |
---|
5957 | | - } |
---|
| 6300 | + spin_lock_irq(&css_set_lock); |
---|
| 6301 | + |
---|
| 6302 | + /* init tasks are special; only link regular threads */ |
---|
| 6303 | + if (likely(child->pid)) { |
---|
| 6304 | + WARN_ON_ONCE(!list_empty(&child->cg_list)); |
---|
| 6305 | + cset->nr_tasks++; |
---|
| 6306 | + css_set_move_task(child, NULL, cset, false); |
---|
| 6307 | + } else { |
---|
| 6308 | + put_css_set(cset); |
---|
| 6309 | + cset = NULL; |
---|
| 6310 | + } |
---|
| 6311 | + |
---|
| 6312 | + /* |
---|
| 6313 | + * If the cgroup has to be frozen, the new task has too. Let's set |
---|
| 6314 | + * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the |
---|
| 6315 | + * frozen state. |
---|
| 6316 | + */ |
---|
| 6317 | + if (unlikely(cgroup_task_freeze(child))) { |
---|
| 6318 | + spin_lock(&child->sighand->siglock); |
---|
| 6319 | + WARN_ON_ONCE(child->frozen); |
---|
| 6320 | + child->jobctl |= JOBCTL_TRAP_FREEZE; |
---|
| 6321 | + spin_unlock(&child->sighand->siglock); |
---|
5958 | 6322 | |
---|
5959 | 6323 | /* |
---|
5960 | | - * If the cgroup has to be frozen, the new task has too. |
---|
5961 | | - * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get |
---|
5962 | | - * the task into the frozen state. |
---|
| 6324 | + * Calling cgroup_update_frozen() isn't required here, |
---|
| 6325 | + * because it will be called anyway a bit later from |
---|
| 6326 | + * do_freezer_trap(). So we avoid cgroup's transient switch |
---|
| 6327 | + * from the frozen state and back. |
---|
5963 | 6328 | */ |
---|
5964 | | - if (unlikely(cgroup_task_freeze(child))) { |
---|
5965 | | - spin_lock(&child->sighand->siglock); |
---|
5966 | | - WARN_ON_ONCE(child->frozen); |
---|
5967 | | - child->jobctl |= JOBCTL_TRAP_FREEZE; |
---|
5968 | | - spin_unlock(&child->sighand->siglock); |
---|
5969 | | - |
---|
5970 | | - /* |
---|
5971 | | - * Calling cgroup_update_frozen() isn't required here, |
---|
5972 | | - * because it will be called anyway a bit later |
---|
5973 | | - * from do_freezer_trap(). So we avoid cgroup's |
---|
5974 | | - * transient switch from the frozen state and back. |
---|
5975 | | - */ |
---|
5976 | | - } |
---|
5977 | | - |
---|
5978 | | - spin_unlock_irq(&css_set_lock); |
---|
5979 | 6329 | } |
---|
| 6330 | + |
---|
| 6331 | + spin_unlock_irq(&css_set_lock); |
---|
5980 | 6332 | |
---|
5981 | 6333 | /* |
---|
5982 | 6334 | * Call ss->fork(). This must happen after @child is linked on |
---|
.. | .. |
---|
5986 | 6338 | do_each_subsys_mask(ss, i, have_fork_callback) { |
---|
5987 | 6339 | ss->fork(child); |
---|
5988 | 6340 | } while_each_subsys_mask(); |
---|
| 6341 | + |
---|
| 6342 | + /* Make the new cset the root_cset of the new cgroup namespace. */ |
---|
| 6343 | + if (kargs->flags & CLONE_NEWCGROUP) { |
---|
| 6344 | + struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset; |
---|
| 6345 | + |
---|
| 6346 | + get_css_set(cset); |
---|
| 6347 | + child->nsproxy->cgroup_ns->root_cset = cset; |
---|
| 6348 | + put_css_set(rcset); |
---|
| 6349 | + } |
---|
| 6350 | + |
---|
| 6351 | + cgroup_css_set_put_fork(kargs); |
---|
5989 | 6352 | } |
---|
5990 | 6353 | |
---|
5991 | 6354 | /** |
---|
5992 | 6355 | * cgroup_exit - detach cgroup from exiting task |
---|
5993 | 6356 | * @tsk: pointer to task_struct of exiting process |
---|
5994 | 6357 | * |
---|
5995 | | - * Description: Detach cgroup from @tsk and release it. |
---|
| 6358 | + * Description: Detach cgroup from @tsk. |
---|
5996 | 6359 | * |
---|
5997 | | - * Note that cgroups marked notify_on_release force every task in |
---|
5998 | | - * them to take the global cgroup_mutex mutex when exiting. |
---|
5999 | | - * This could impact scaling on very large systems. Be reluctant to |
---|
6000 | | - * use notify_on_release cgroups where very high task exit scaling |
---|
6001 | | - * is required on large systems. |
---|
6002 | | - * |
---|
6003 | | - * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We |
---|
6004 | | - * call cgroup_exit() while the task is still competent to handle |
---|
6005 | | - * notify_on_release(), then leave the task attached to the root cgroup in |
---|
6006 | | - * each hierarchy for the remainder of its exit. No need to bother with |
---|
6007 | | - * init_css_set refcnting. init_css_set never goes away and we can't race |
---|
6008 | | - * with migration path - PF_EXITING is visible to migration path. |
---|
6009 | 6360 | */ |
---|
6010 | 6361 | void cgroup_exit(struct task_struct *tsk) |
---|
6011 | 6362 | { |
---|
.. | .. |
---|
6013 | 6364 | struct css_set *cset; |
---|
6014 | 6365 | int i; |
---|
6015 | 6366 | |
---|
6016 | | - /* |
---|
6017 | | - * Unlink from @tsk from its css_set. As migration path can't race |
---|
6018 | | - * with us, we can check css_set and cg_list without synchronization. |
---|
6019 | | - */ |
---|
| 6367 | + spin_lock_irq(&css_set_lock); |
---|
| 6368 | + |
---|
| 6369 | + WARN_ON_ONCE(list_empty(&tsk->cg_list)); |
---|
6020 | 6370 | cset = task_css_set(tsk); |
---|
| 6371 | + css_set_move_task(tsk, cset, NULL, false); |
---|
| 6372 | + list_add_tail(&tsk->cg_list, &cset->dying_tasks); |
---|
| 6373 | + cset->nr_tasks--; |
---|
6021 | 6374 | |
---|
6022 | | - if (!list_empty(&tsk->cg_list)) { |
---|
6023 | | - spin_lock_irq(&css_set_lock); |
---|
6024 | | - css_set_move_task(tsk, cset, NULL, false); |
---|
6025 | | - list_add_tail(&tsk->cg_list, &cset->dying_tasks); |
---|
6026 | | - cset->nr_tasks--; |
---|
| 6375 | + WARN_ON_ONCE(cgroup_task_frozen(tsk)); |
---|
| 6376 | + if (unlikely(cgroup_task_freeze(tsk))) |
---|
| 6377 | + cgroup_update_frozen(task_dfl_cgroup(tsk)); |
---|
6027 | 6378 | |
---|
6028 | | - if (unlikely(cgroup_task_frozen(tsk))) |
---|
6029 | | - cgroup_freezer_frozen_exit(tsk); |
---|
6030 | | - else if (unlikely(cgroup_task_freeze(tsk))) |
---|
6031 | | - cgroup_update_frozen(task_dfl_cgroup(tsk)); |
---|
6032 | | - |
---|
6033 | | - spin_unlock_irq(&css_set_lock); |
---|
6034 | | - } else { |
---|
6035 | | - get_css_set(cset); |
---|
6036 | | - } |
---|
| 6379 | + spin_unlock_irq(&css_set_lock); |
---|
6037 | 6380 | |
---|
6038 | 6381 | /* see cgroup_post_fork() for details */ |
---|
6039 | 6382 | do_each_subsys_mask(ss, i, have_exit_callback) { |
---|
.. | .. |
---|
6050 | 6393 | ss->release(task); |
---|
6051 | 6394 | } while_each_subsys_mask(); |
---|
6052 | 6395 | |
---|
6053 | | - if (use_task_css_set_links) { |
---|
6054 | | - spin_lock_irq(&css_set_lock); |
---|
6055 | | - css_set_skip_task_iters(task_css_set(task), task); |
---|
6056 | | - list_del_init(&task->cg_list); |
---|
6057 | | - spin_unlock_irq(&css_set_lock); |
---|
6058 | | - } |
---|
| 6396 | + spin_lock_irq(&css_set_lock); |
---|
| 6397 | + css_set_skip_task_iters(task_css_set(task), task); |
---|
| 6398 | + list_del_init(&task->cg_list); |
---|
| 6399 | + spin_unlock_irq(&css_set_lock); |
---|
6059 | 6400 | } |
---|
6060 | 6401 | |
---|
6061 | 6402 | void cgroup_free(struct task_struct *task) |
---|
.. | .. |
---|
6096 | 6437 | return 1; |
---|
6097 | 6438 | } |
---|
6098 | 6439 | __setup("cgroup_disable=", cgroup_disable); |
---|
| 6440 | + |
---|
| 6441 | +void __init __weak enable_debug_cgroup(void) { } |
---|
| 6442 | + |
---|
| 6443 | +static int __init enable_cgroup_debug(char *str) |
---|
| 6444 | +{ |
---|
| 6445 | + cgroup_debug = true; |
---|
| 6446 | + enable_debug_cgroup(); |
---|
| 6447 | + return 1; |
---|
| 6448 | +} |
---|
| 6449 | +__setup("cgroup_debug", enable_cgroup_debug); |
---|
6099 | 6450 | |
---|
6100 | 6451 | /** |
---|
6101 | 6452 | * css_tryget_online_from_dir - get corresponding css from a cgroup dentry |
---|
.. | .. |
---|
6196 | 6547 | */ |
---|
6197 | 6548 | struct cgroup *cgroup_get_from_fd(int fd) |
---|
6198 | 6549 | { |
---|
6199 | | - struct cgroup_subsys_state *css; |
---|
6200 | 6550 | struct cgroup *cgrp; |
---|
6201 | 6551 | struct file *f; |
---|
6202 | 6552 | |
---|
.. | .. |
---|
6204 | 6554 | if (!f) |
---|
6205 | 6555 | return ERR_PTR(-EBADF); |
---|
6206 | 6556 | |
---|
6207 | | - css = css_tryget_online_from_dir(f->f_path.dentry, NULL); |
---|
| 6557 | + cgrp = cgroup_get_from_file(f); |
---|
6208 | 6558 | fput(f); |
---|
6209 | | - if (IS_ERR(css)) |
---|
6210 | | - return ERR_CAST(css); |
---|
6211 | | - |
---|
6212 | | - cgrp = css->cgroup; |
---|
6213 | | - if (!cgroup_on_dfl(cgrp)) { |
---|
6214 | | - cgroup_put(cgrp); |
---|
6215 | | - return ERR_PTR(-EBADF); |
---|
6216 | | - } |
---|
6217 | | - |
---|
6218 | 6559 | return cgrp; |
---|
6219 | 6560 | } |
---|
6220 | 6561 | EXPORT_SYMBOL_GPL(cgroup_get_from_fd); |
---|
.. | .. |
---|
6305 | 6646 | cset = task_css_set(current); |
---|
6306 | 6647 | if (likely(cgroup_tryget(cset->dfl_cgrp))) { |
---|
6307 | 6648 | skcd->val = (unsigned long)cset->dfl_cgrp; |
---|
| 6649 | + cgroup_bpf_get(cset->dfl_cgrp); |
---|
6308 | 6650 | break; |
---|
6309 | 6651 | } |
---|
6310 | 6652 | cpu_relax(); |
---|
.. | .. |
---|
6315 | 6657 | |
---|
6316 | 6658 | void cgroup_sk_clone(struct sock_cgroup_data *skcd) |
---|
6317 | 6659 | { |
---|
6318 | | - /* Socket clone path */ |
---|
6319 | 6660 | if (skcd->val) { |
---|
6320 | 6661 | if (skcd->no_refcnt) |
---|
6321 | 6662 | return; |
---|
.. | .. |
---|
6325 | 6666 | * Don't use cgroup_get_live(). |
---|
6326 | 6667 | */ |
---|
6327 | 6668 | cgroup_get(sock_cgroup_ptr(skcd)); |
---|
| 6669 | + cgroup_bpf_get(sock_cgroup_ptr(skcd)); |
---|
6328 | 6670 | } |
---|
6329 | 6671 | } |
---|
6330 | 6672 | |
---|
6331 | 6673 | void cgroup_sk_free(struct sock_cgroup_data *skcd) |
---|
6332 | 6674 | { |
---|
| 6675 | + struct cgroup *cgrp = sock_cgroup_ptr(skcd); |
---|
| 6676 | + |
---|
6333 | 6677 | if (skcd->no_refcnt) |
---|
6334 | 6678 | return; |
---|
6335 | | - |
---|
6336 | | - cgroup_put(sock_cgroup_ptr(skcd)); |
---|
| 6679 | + cgroup_bpf_put(cgrp); |
---|
| 6680 | + cgroup_put(cgrp); |
---|
6337 | 6681 | } |
---|
6338 | 6682 | |
---|
6339 | 6683 | #endif /* CONFIG_SOCK_CGROUP_DATA */ |
---|
6340 | 6684 | |
---|
6341 | 6685 | #ifdef CONFIG_CGROUP_BPF |
---|
6342 | | -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, |
---|
6343 | | - enum bpf_attach_type type, u32 flags) |
---|
| 6686 | +int cgroup_bpf_attach(struct cgroup *cgrp, |
---|
| 6687 | + struct bpf_prog *prog, struct bpf_prog *replace_prog, |
---|
| 6688 | + struct bpf_cgroup_link *link, |
---|
| 6689 | + enum bpf_attach_type type, |
---|
| 6690 | + u32 flags) |
---|
6344 | 6691 | { |
---|
6345 | 6692 | int ret; |
---|
6346 | 6693 | |
---|
6347 | 6694 | mutex_lock(&cgroup_mutex); |
---|
6348 | | - ret = __cgroup_bpf_attach(cgrp, prog, type, flags); |
---|
| 6695 | + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); |
---|
6349 | 6696 | mutex_unlock(&cgroup_mutex); |
---|
6350 | 6697 | return ret; |
---|
6351 | 6698 | } |
---|
| 6699 | + |
---|
6352 | 6700 | int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, |
---|
6353 | | - enum bpf_attach_type type, u32 flags) |
---|
| 6701 | + enum bpf_attach_type type) |
---|
6354 | 6702 | { |
---|
6355 | 6703 | int ret; |
---|
6356 | 6704 | |
---|
6357 | 6705 | mutex_lock(&cgroup_mutex); |
---|
6358 | | - ret = __cgroup_bpf_detach(cgrp, prog, type, flags); |
---|
| 6706 | + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); |
---|
6359 | 6707 | mutex_unlock(&cgroup_mutex); |
---|
6360 | 6708 | return ret; |
---|
6361 | 6709 | } |
---|
| 6710 | + |
---|
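cgroup_bpf_attach() grows @replace_prog and @link parameters to support BPF links and program replacement, while detach no longer needs @flags. Userspace reaches link-based attachment through BPF_LINK_CREATE; a hedged libbpf sketch, where the cgroup path and the assumption that @prog_fd holds a loaded cgroup-skb program are illustrative:

```c
/* Sketch: attach a cgroup BPF program via a bpf_link (libbpf).
 * @prog_fd is assumed to be a loaded BPF_PROG_TYPE_CGROUP_SKB program. */
#include <bpf/bpf.h>
#include <fcntl.h>
#include <unistd.h>

int attach_ingress(int prog_fd)
{
	int cg_fd = open("/sys/fs/cgroup/test", O_DIRECTORY | O_CLOEXEC);
	int link_fd;

	if (cg_fd < 0)
		return -1;
	/* The link pins the attachment: it auto-detaches when the last
	 * fd referring to it is closed, instead of outliving the loader. */
	link_fd = bpf_link_create(prog_fd, cg_fd, BPF_CGROUP_INET_INGRESS, NULL);
	close(cg_fd);
	return link_fd;
}
```

Unlike a plain BPF_PROG_ATTACH, the attachment's lifetime is tied to the returned link fd, which is what the new @link plumbing in the hunk above implements on the kernel side.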
6362 | 6711 | int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, |
---|
6363 | 6712 | union bpf_attr __user *uattr) |
---|
6364 | 6713 | { |
---|
.. | .. |
---|
6419 | 6768 | static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, |
---|
6420 | 6769 | char *buf) |
---|
6421 | 6770 | { |
---|
6422 | | - return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); |
---|
| 6771 | + return snprintf(buf, PAGE_SIZE, |
---|
| 6772 | + "nsdelegate\n" |
---|
| 6773 | + "memory_localevents\n" |
---|
| 6774 | + "memory_recursiveprot\n"); |
---|
6423 | 6775 | } |
---|
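features_show() backs /sys/kernel/cgroup/features, letting userspace probe which cgroup2 mount options the running kernel understands before mounting. A hedged sketch, with the mount point and option choice illustrative:

```c
/* Sketch: mount cgroup2 only with options the kernel advertises in
 * /sys/kernel/cgroup/features (paths and options are illustrative). */
#include <stdio.h>
#include <string.h>
#include <sys/mount.h>

int main(void)
{
	char feats[256] = "";
	FILE *f = fopen("/sys/kernel/cgroup/features", "r");
	const char *opts;

	if (f) {
		fread(feats, 1, sizeof(feats) - 1, f);
		fclose(f);
	}
	/* Only request memory_recursiveprot if the kernel lists it. */
	opts = strstr(feats, "memory_recursiveprot") ?
	       "nsdelegate,memory_recursiveprot" : "nsdelegate";
	if (mount("none", "/sys/fs/cgroup", "cgroup2", 0, opts)) {
		perror("mount");
		return 1;
	}
	return 0;
}
```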
6424 | 6776 | static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); |
---|
6425 | 6777 | |
---|