2024-05-10 748e4f3d702def1a4bff191e0cf93b6a05340f01
kernel/kernel/cgroup/cgroup.c
....@@ -54,12 +54,17 @@
5454 #include <linux/proc_ns.h>
5555 #include <linux/nsproxy.h>
5656 #include <linux/file.h>
57
+#include <linux/fs_parser.h>
5758 #include <linux/sched/cputime.h>
59
+#include <linux/sched/deadline.h>
5860 #include <linux/psi.h>
5961 #include <net/sock.h>
6062
6163 #define CREATE_TRACE_POINTS
6264 #include <trace/events/cgroup.h>
65
+#undef CREATE_TRACE_POINTS
66
+
67
+#include <trace/hooks/cgroup.h>
6368
6469 #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
6570 MAX_CFTYPE_NAME + 2)
....@@ -86,6 +91,7 @@
8691
8792 DEFINE_SPINLOCK(trace_cgroup_path_lock);
8893 char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
94
+bool cgroup_debug __read_mostly;
8995
9096 /*
9197 * Protects cgroup_idr and css_idr so that IDs can be released without
....@@ -99,7 +105,7 @@
99105 */
100106 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
101107
102
-struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
108
+DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
103109
104110 #define cgroup_assert_mutex_or_rcu_locked() \
105111 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
....@@ -151,11 +157,7 @@
151157
152158 static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
153159
154
-/*
155
- * The default hierarchy, reserved for the subsystems that are otherwise
156
- * unattached - it never has more than a single cgroup, and all tasks are
157
- * part of that cgroup.
158
- */
160
+/* the default hierarchy */
159161 struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
160162 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
161163
....@@ -264,9 +266,6 @@
264266 * can be used to test whether a cgroup is on the default hierarchy for
265267 * cases where a subsystem should behave differently depending on the
266268 * interface version.
267
- *
268
- * The set of behaviors which change on the default hierarchy are still
269
- * being determined and the mount option is prefixed with __DEVEL__.
270269 *
271270 * List of changed behaviors:
272271 *
....@@ -502,7 +501,7 @@
502501
503502 rcu_read_lock();
504503 css = cgroup_css(cgrp, ss);
505
- if (!css || !css_tryget_online(css))
504
+ if (css && !css_tryget_online(css))
506505 css = NULL;
507506 rcu_read_unlock();
508507
....@@ -510,7 +509,7 @@
510509 }
511510
512511 /**
513
- * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
512
+ * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
514513 * @cgrp: the cgroup of interest
515514 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
516515 *
....@@ -519,8 +518,8 @@
519518 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
520519 * function is guaranteed to return non-NULL css.
521520 */
522
-static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
523
- struct cgroup_subsys *ss)
521
+static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
522
+ struct cgroup_subsys *ss)
524523 {
525524 lockdep_assert_held(&cgroup_mutex);
526525
....@@ -538,6 +537,35 @@
538537 }
539538
540539 return cgroup_css(cgrp, ss);
540
+}
541
+
542
+/**
543
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
544
+ * @cgrp: the cgroup of interest
545
+ * @ss: the subsystem of interest
546
+ *
547
+ * Find and get the effective css of @cgrp for @ss. The effective css is
548
+ * defined as the matching css of the nearest ancestor including self which
549
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
550
+ * the root css is returned, so this function always returns a valid css.
551
+ *
552
+ * The returned css is not guaranteed to be online, and therefore it is the
553
+ * caller's responsibility to tryget a reference for it.
554
+ */
555
+struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
556
+ struct cgroup_subsys *ss)
557
+{
558
+ struct cgroup_subsys_state *css;
559
+
560
+ do {
561
+ css = cgroup_css(cgrp, ss);
562
+
563
+ if (css)
564
+ return css;
565
+ cgrp = cgroup_parent(cgrp);
566
+ } while (cgrp);
567
+
568
+ return init_css_set.subsys[ss->id];
541569 }
542570
543571 /**
....@@ -655,10 +683,11 @@
655683 *
656684 * Should be called under cgroup_[tree_]mutex.
657685 */
658
-#define for_each_e_css(css, ssid, cgrp) \
659
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
660
- if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
661
- ; \
686
+#define for_each_e_css(css, ssid, cgrp) \
687
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
688
+ if (!((css) = cgroup_e_css_by_mask(cgrp, \
689
+ cgroup_subsys[(ssid)]))) \
690
+ ; \
662691 else
663692
664693 /**
....@@ -718,25 +747,28 @@
718747 * reference-counted, to improve performance when child cgroups
719748 * haven't been created.
720749 */
721
-struct css_set init_css_set = {
722
- .refcount = REFCOUNT_INIT(1),
723
- .dom_cset = &init_css_set,
724
- .tasks = LIST_HEAD_INIT(init_css_set.tasks),
725
- .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
726
- .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
727
- .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
728
- .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
729
- .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
730
- .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
731
- .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
732
-
733
- /*
734
- * The following field is re-initialized when this cset gets linked
735
- * in cgroup_init(). However, let's initialize the field
736
- * statically too so that the default cgroup can be accessed safely
737
- * early during boot.
738
- */
739
- .dfl_cgrp = &cgrp_dfl_root.cgrp,
750
+struct ext_css_set init_ext_css_set = {
751
+ .cset = {
752
+ .refcount = REFCOUNT_INIT(1),
753
+ .dom_cset = &init_css_set,
754
+ .tasks = LIST_HEAD_INIT(init_css_set.tasks),
755
+ .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
756
+ .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
757
+ .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
758
+ .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
759
+ .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
760
+ .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
761
+ .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
762
+ /*
763
+ * The following field is re-initialized when this cset gets linked
764
+ * in cgroup_init(). However, let's initialize the field
765
+ * statically too so that the default cgroup can be accessed safely
766
+ * early during boot.
767
+ */
768
+ .dfl_cgrp = &cgrp_dfl_root.cgrp,
769
+ },
770
+ .mg_src_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_src_preload_node),
771
+ .mg_dst_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_dst_preload_node),
740772 };
741773
742774 static int css_set_count = 1; /* 1 for init_css_set */
....@@ -802,6 +834,8 @@
802834 break;
803835
804836 cgroup1_check_for_release(cgrp);
837
+ TRACE_CGROUP_PATH(notify_populated, cgrp,
838
+ cgroup_is_populated(cgrp));
805839 cgroup_file_notify(&cgrp->events_file);
806840
807841 child = cgrp;
....@@ -881,8 +915,7 @@
881915 /*
882916 * We are synchronized through cgroup_threadgroup_rwsem
883917 * against PF_EXITING setting such that we can't race
884
- * against cgroup_exit() changing the css_set to
885
- * init_css_set and dropping the old one.
918
+ * against cgroup_exit()/cgroup_free() dropping the css_set.
886919 */
887920 WARN_ON_ONCE(task->flags & PF_EXITING);
888921
....@@ -1060,7 +1093,7 @@
10601093 * @ss is in this hierarchy, so we want the
10611094 * effective css from @cgrp.
10621095 */
1063
- template[i] = cgroup_e_css(cgrp, ss);
1096
+ template[i] = cgroup_e_css_by_mask(cgrp, ss);
10641097 } else {
10651098 /*
10661099 * @ss is not in this hierarchy, so we don't want
....@@ -1162,6 +1195,7 @@
11621195 struct cgroup *cgrp)
11631196 {
11641197 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
1198
+ struct ext_css_set *ext_cset;
11651199 struct css_set *cset;
11661200 struct list_head tmp_links;
11671201 struct cgrp_cset_link *link;
....@@ -1182,9 +1216,10 @@
11821216 if (cset)
11831217 return cset;
11841218
1185
- cset = kzalloc(sizeof(*cset), GFP_KERNEL);
1186
- if (!cset)
1219
+ ext_cset = kzalloc(sizeof(*ext_cset), GFP_KERNEL);
1220
+ if (!ext_cset)
11871221 return NULL;
1222
+ cset = &ext_cset->cset;
11881223
11891224 /* Allocate all the cgrp_cset_link objects that we'll need */
11901225 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
....@@ -1202,6 +1237,8 @@
12021237 INIT_HLIST_NODE(&cset->hlist);
12031238 INIT_LIST_HEAD(&cset->cgrp_links);
12041239 INIT_LIST_HEAD(&cset->mg_preload_node);
1240
+ INIT_LIST_HEAD(&ext_cset->mg_src_preload_node);
1241
+ INIT_LIST_HEAD(&ext_cset->mg_dst_preload_node);
12051242 INIT_LIST_HEAD(&cset->mg_node);
12061243
12071244 /* Copy the set of subsystem state objects generated in
....@@ -1291,10 +1328,7 @@
12911328
12921329 void cgroup_free_root(struct cgroup_root *root)
12931330 {
1294
- if (root) {
1295
- idr_destroy(&root->cgroup_idr);
1296
- kfree(root);
1297
- }
1331
+ kfree(root);
12981332 }
12991333
13001334 static void cgroup_destroy_root(struct cgroup_root *root)
....@@ -1356,6 +1390,8 @@
13561390 cset = current->nsproxy->cgroup_ns->root_cset;
13571391 if (cset == &init_css_set) {
13581392 res = &root->cgrp;
1393
+ } else if (root == &cgrp_dfl_root) {
1394
+ res = cset->dfl_cgrp;
13591395 } else {
13601396 struct cgrp_cset_link *link;
13611397
....@@ -1412,9 +1448,8 @@
14121448 struct cgroup_root *root)
14131449 {
14141450 /*
1415
- * No need to lock the task - since we hold cgroup_mutex the
1416
- * task can't change groups, so the only thing that can happen
1417
- * is that it exits and its css is set back to init_css_set.
1451
+ * No need to lock the task - since we hold css_set_lock the
1452
+ * task can't change groups.
14181453 */
14191454 return cset_cgroup_from_root(task_css_set(task), root);
14201455 }
....@@ -1453,12 +1488,15 @@
14531488 struct cgroup_subsys *ss = cft->ss;
14541489
14551490 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1456
- !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
1457
- snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
1458
- cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1491
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
1492
+ const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
1493
+
1494
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
1495
+ dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
14591496 cft->name);
1460
- else
1497
+ } else {
14611498 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1499
+ }
14621500 return buf;
14631501 }
14641502
....@@ -1699,7 +1737,7 @@
16991737 {
17001738 struct cgroup *dcgrp = &dst_root->cgrp;
17011739 struct cgroup_subsys *ss;
1702
- int ssid, i, ret;
1740
+ int ssid, ret;
17031741 u16 dfl_disable_ss_mask = 0;
17041742
17051743 lockdep_assert_held(&cgroup_mutex);
....@@ -1743,7 +1781,8 @@
17431781 struct cgroup_root *src_root = ss->root;
17441782 struct cgroup *scgrp = &src_root->cgrp;
17451783 struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
1746
- struct css_set *cset;
1784
+ struct css_set *cset, *cset_pos;
1785
+ struct css_task_iter *it;
17471786
17481787 WARN_ON(!css || cgroup_css(dcgrp, ss));
17491788
....@@ -1761,9 +1800,22 @@
17611800 css->cgroup = dcgrp;
17621801
17631802 spin_lock_irq(&css_set_lock);
1764
- hash_for_each(css_set_table, i, cset, hlist)
1803
+ WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
1804
+ list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
1805
+ e_cset_node[ss->id]) {
17651806 list_move_tail(&cset->e_cset_node[ss->id],
17661807 &dcgrp->e_csets[ss->id]);
1808
+ /*
1809
+ * All css_sets of scgrp are moved to dcgrp in the same order;
1810
+ * patch in-flight iterators to preserve correct iteration.
1811
+ * Since an iterator is always advanced right away and is
1812
+ * finished when it->cset_pos meets it->cset_head, updating
1813
+ * it->cset_head here is enough.
1814
+ */
1815
+ list_for_each_entry(it, &cset->task_iters, iters_node)
1816
+ if (it->cset_head == &scgrp->e_csets[ss->id])
1817
+ it->cset_head = &dcgrp->e_csets[ss->id];
1818
+ }
17671819 spin_unlock_irq(&css_set_lock);
17681820
17691821 /* default hierarchy doesn't enable controllers by default */
....@@ -1815,26 +1867,42 @@
18151867 return len;
18161868 }
18171869
1818
-static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
1870
+enum cgroup2_param {
1871
+ Opt_nsdelegate,
1872
+ Opt_memory_localevents,
1873
+ Opt_memory_recursiveprot,
1874
+ nr__cgroup2_params
1875
+};
1876
+
1877
+static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
1878
+ fsparam_flag("nsdelegate", Opt_nsdelegate),
1879
+ fsparam_flag("memory_localevents", Opt_memory_localevents),
1880
+ fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
1881
+ {}
1882
+};
1883
+
1884
+static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
18191885 {
1820
- char *token;
1886
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1887
+ struct fs_parse_result result;
1888
+ int opt;
18211889
1822
- *root_flags = 0;
1890
+ opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
1891
+ if (opt < 0)
1892
+ return opt;
18231893
1824
- if (!data || *data == '\0')
1894
+ switch (opt) {
1895
+ case Opt_nsdelegate:
1896
+ ctx->flags |= CGRP_ROOT_NS_DELEGATE;
18251897 return 0;
1826
-
1827
- while ((token = strsep(&data, ",")) != NULL) {
1828
- if (!strcmp(token, "nsdelegate")) {
1829
- *root_flags |= CGRP_ROOT_NS_DELEGATE;
1830
- continue;
1831
- }
1832
-
1833
- pr_err("cgroup2: unknown option \"%s\"\n", token);
1834
- return -EINVAL;
1898
+ case Opt_memory_localevents:
1899
+ ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1900
+ return 0;
1901
+ case Opt_memory_recursiveprot:
1902
+ ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1903
+ return 0;
18351904 }
1836
-
1837
- return 0;
1905
+ return -EINVAL;
18381906 }
18391907
18401908 static void apply_cgroup_root_flags(unsigned int root_flags)
....@@ -1844,6 +1912,16 @@
18441912 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
18451913 else
18461914 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1915
+
1916
+ if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1917
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1918
+ else
1919
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1920
+
1921
+ if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1922
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1923
+ else
1924
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
18471925 }
18481926 }
18491927
....@@ -1851,79 +1929,19 @@
18511929 {
18521930 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
18531931 seq_puts(seq, ",nsdelegate");
1932
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1933
+ seq_puts(seq, ",memory_localevents");
1934
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1935
+ seq_puts(seq, ",memory_recursiveprot");
18541936 return 0;
18551937 }
18561938
1857
-static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1939
+static int cgroup_reconfigure(struct fs_context *fc)
18581940 {
1859
- unsigned int root_flags;
1860
- int ret;
1941
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
18611942
1862
- ret = parse_cgroup_root_flags(data, &root_flags);
1863
- if (ret)
1864
- return ret;
1865
-
1866
- apply_cgroup_root_flags(root_flags);
1943
+ apply_cgroup_root_flags(ctx->flags);
18671944 return 0;
1868
-}
1869
-
1870
-/*
1871
- * To reduce the fork() overhead for systems that are not actually using
1872
- * their cgroups capability, we don't maintain the lists running through
1873
- * each css_set to its tasks until we see the list actually used - in other
1874
- * words after the first mount.
1875
- */
1876
-static bool use_task_css_set_links __read_mostly;
1877
-
1878
-static void cgroup_enable_task_cg_lists(void)
1879
-{
1880
- struct task_struct *p, *g;
1881
-
1882
- /*
1883
- * We need tasklist_lock because RCU is not safe against
1884
- * while_each_thread(). Besides, a forking task that has passed
1885
- * cgroup_post_fork() without seeing use_task_css_set_links = 1
1886
- * is not guaranteed to have its child immediately visible in the
1887
- * tasklist if we walk through it with RCU.
1888
- */
1889
- read_lock(&tasklist_lock);
1890
- spin_lock_irq(&css_set_lock);
1891
-
1892
- if (use_task_css_set_links)
1893
- goto out_unlock;
1894
-
1895
- use_task_css_set_links = true;
1896
-
1897
- do_each_thread(g, p) {
1898
- WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1899
- task_css_set(p) != &init_css_set);
1900
-
1901
- /*
1902
- * We should check if the process is exiting, otherwise
1903
- * it will race with cgroup_exit() in that the list
1904
- * entry won't be deleted though the process has exited.
1905
- * Do it while holding siglock so that we don't end up
1906
- * racing against cgroup_exit().
1907
- *
1908
- * Interrupts were already disabled while acquiring
1909
- * the css_set_lock, so we do not need to disable it
1910
- * again when acquiring the sighand->siglock here.
1911
- */
1912
- spin_lock(&p->sighand->siglock);
1913
- if (!(p->flags & PF_EXITING)) {
1914
- struct css_set *cset = task_css_set(p);
1915
-
1916
- if (!css_set_populated(cset))
1917
- css_set_update_populated(cset, true);
1918
- list_add_tail(&p->cg_list, &cset->tasks);
1919
- get_css_set(cset);
1920
- cset->nr_tasks++;
1921
- }
1922
- spin_unlock(&p->sighand->siglock);
1923
- } while_each_thread(g, p);
1924
-out_unlock:
1925
- spin_unlock_irq(&css_set_lock);
1926
- read_unlock(&tasklist_lock);
19271945 }
19281946
19291947 static void init_cgroup_housekeeping(struct cgroup *cgrp)
....@@ -1951,22 +1969,22 @@
19511969 INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
19521970 }
19531971
1954
-void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
1972
+void init_cgroup_root(struct cgroup_fs_context *ctx)
19551973 {
1974
+ struct cgroup_root *root = ctx->root;
19561975 struct cgroup *cgrp = &root->cgrp;
19571976
19581977 INIT_LIST_HEAD(&root->root_list);
19591978 atomic_set(&root->nr_cgrps, 1);
19601979 cgrp->root = root;
19611980 init_cgroup_housekeeping(cgrp);
1962
- idr_init(&root->cgroup_idr);
19631981
1964
- root->flags = opts->flags;
1965
- if (opts->release_agent)
1966
- strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
1967
- if (opts->name)
1968
- strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
1969
- if (opts->cpuset_clone_children)
1982
+ root->flags = ctx->flags;
1983
+ if (ctx->release_agent)
1984
+ strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
1985
+ if (ctx->name)
1986
+ strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
1987
+ if (ctx->cpuset_clone_children)
19701988 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
19711989 }
19721990
....@@ -1979,12 +1997,6 @@
19791997 int i, ret;
19801998
19811999 lockdep_assert_held(&cgroup_mutex);
1982
-
1983
- ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
1984
- if (ret < 0)
1985
- goto out;
1986
- root_cgrp->id = ret;
1987
- root_cgrp->ancestor_ids[0] = ret;
19882000
19892001 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
19902002 0, GFP_KERNEL);
....@@ -2011,13 +2023,16 @@
20112023
20122024 root->kf_root = kernfs_create_root(kf_sops,
20132025 KERNFS_ROOT_CREATE_DEACTIVATED |
2014
- KERNFS_ROOT_SUPPORT_EXPORTOP,
2026
+ KERNFS_ROOT_SUPPORT_EXPORTOP |
2027
+ KERNFS_ROOT_SUPPORT_USER_XATTR,
20152028 root_cgrp);
20162029 if (IS_ERR(root->kf_root)) {
20172030 ret = PTR_ERR(root->kf_root);
20182031 goto exit_root_id;
20192032 }
20202033 root_cgrp->kn = root->kf_root->kn;
2034
+ WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
2035
+ root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
20212036
20222037 ret = css_populate_dir(&root_cgrp->self);
20232038 if (ret)
....@@ -2055,7 +2070,6 @@
20552070 BUG_ON(!list_empty(&root_cgrp->self.children));
20562071 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
20572072
2058
- kernfs_activate(root_cgrp->kn);
20592073 ret = 0;
20602074 goto out;
20612075
....@@ -2071,91 +2085,117 @@
20712085 return ret;
20722086 }
20732087
2074
-struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
2075
- struct cgroup_root *root, unsigned long magic,
2076
- struct cgroup_namespace *ns)
2088
+int cgroup_do_get_tree(struct fs_context *fc)
20772089 {
2078
- struct dentry *dentry;
2079
- bool new_sb = false;
2090
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2091
+ int ret;
20802092
2081
- dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
2093
+ ctx->kfc.root = ctx->root->kf_root;
2094
+ if (fc->fs_type == &cgroup2_fs_type)
2095
+ ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
2096
+ else
2097
+ ctx->kfc.magic = CGROUP_SUPER_MAGIC;
2098
+ ret = kernfs_get_tree(fc);
20822099
20832100 /*
20842101 * In non-init cgroup namespace, instead of root cgroup's dentry,
20852102 * we return the dentry corresponding to the cgroupns->root_cgrp.
20862103 */
2087
- if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
2104
+ if (!ret && ctx->ns != &init_cgroup_ns) {
20882105 struct dentry *nsdentry;
2089
- struct super_block *sb = dentry->d_sb;
2106
+ struct super_block *sb = fc->root->d_sb;
20902107 struct cgroup *cgrp;
20912108
20922109 mutex_lock(&cgroup_mutex);
20932110 spin_lock_irq(&css_set_lock);
20942111
2095
- cgrp = cset_cgroup_from_root(ns->root_cset, root);
2112
+ cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
20962113
20972114 spin_unlock_irq(&css_set_lock);
20982115 mutex_unlock(&cgroup_mutex);
20992116
21002117 nsdentry = kernfs_node_dentry(cgrp->kn, sb);
2101
- dput(dentry);
2102
- if (IS_ERR(nsdentry))
2118
+ dput(fc->root);
2119
+ if (IS_ERR(nsdentry)) {
21032120 deactivate_locked_super(sb);
2104
- dentry = nsdentry;
2121
+ ret = PTR_ERR(nsdentry);
2122
+ nsdentry = NULL;
2123
+ }
2124
+ fc->root = nsdentry;
21052125 }
21062126
2107
- if (!new_sb)
2108
- cgroup_put(&root->cgrp);
2127
+ if (!ctx->kfc.new_sb_created)
2128
+ cgroup_put(&ctx->root->cgrp);
21092129
2110
- return dentry;
2130
+ return ret;
21112131 }
21122132
2113
-static struct dentry *cgroup_mount(struct file_system_type *fs_type,
2114
- int flags, const char *unused_dev_name,
2115
- void *data)
2133
+/*
2134
+ * Destroy a cgroup filesystem context.
2135
+ */
2136
+static void cgroup_fs_context_free(struct fs_context *fc)
21162137 {
2117
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2118
- struct dentry *dentry;
2138
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2139
+
2140
+ kfree(ctx->name);
2141
+ kfree(ctx->release_agent);
2142
+ put_cgroup_ns(ctx->ns);
2143
+ kernfs_free_fs_context(fc);
2144
+ kfree(ctx);
2145
+}
2146
+
2147
+static int cgroup_get_tree(struct fs_context *fc)
2148
+{
2149
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
21192150 int ret;
21202151
2121
- get_cgroup_ns(ns);
2152
+ cgrp_dfl_visible = true;
2153
+ cgroup_get_live(&cgrp_dfl_root.cgrp);
2154
+ ctx->root = &cgrp_dfl_root;
21222155
2123
- /* Check if the caller has permission to mount. */
2124
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
2125
- put_cgroup_ns(ns);
2126
- return ERR_PTR(-EPERM);
2127
- }
2156
+ ret = cgroup_do_get_tree(fc);
2157
+ if (!ret)
2158
+ apply_cgroup_root_flags(ctx->flags);
2159
+ return ret;
2160
+}
21282161
2129
- /*
2130
- * The first time anyone tries to mount a cgroup, enable the list
2131
- * linking each css_set to its tasks and fix up all existing tasks.
2132
- */
2133
- if (!use_task_css_set_links)
2134
- cgroup_enable_task_cg_lists();
2162
+static const struct fs_context_operations cgroup_fs_context_ops = {
2163
+ .free = cgroup_fs_context_free,
2164
+ .parse_param = cgroup2_parse_param,
2165
+ .get_tree = cgroup_get_tree,
2166
+ .reconfigure = cgroup_reconfigure,
2167
+};
21352168
2136
- if (fs_type == &cgroup2_fs_type) {
2137
- unsigned int root_flags;
2169
+static const struct fs_context_operations cgroup1_fs_context_ops = {
2170
+ .free = cgroup_fs_context_free,
2171
+ .parse_param = cgroup1_parse_param,
2172
+ .get_tree = cgroup1_get_tree,
2173
+ .reconfigure = cgroup1_reconfigure,
2174
+};
21382175
2139
- ret = parse_cgroup_root_flags(data, &root_flags);
2140
- if (ret) {
2141
- put_cgroup_ns(ns);
2142
- return ERR_PTR(ret);
2143
- }
2176
+/*
2177
+ * Initialise the cgroup filesystem creation/reconfiguration context. Notably,
2178
+ * we select the namespace we're going to use.
2179
+ */
2180
+static int cgroup_init_fs_context(struct fs_context *fc)
2181
+{
2182
+ struct cgroup_fs_context *ctx;
21442183
2145
- cgrp_dfl_visible = true;
2146
- cgroup_get_live(&cgrp_dfl_root.cgrp);
2184
+ ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
2185
+ if (!ctx)
2186
+ return -ENOMEM;
21472187
2148
- dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
2149
- CGROUP2_SUPER_MAGIC, ns);
2150
- if (!IS_ERR(dentry))
2151
- apply_cgroup_root_flags(root_flags);
2152
- } else {
2153
- dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
2154
- CGROUP_SUPER_MAGIC, ns);
2155
- }
2156
-
2157
- put_cgroup_ns(ns);
2158
- return dentry;
2188
+ ctx->ns = current->nsproxy->cgroup_ns;
2189
+ get_cgroup_ns(ctx->ns);
2190
+ fc->fs_private = &ctx->kfc;
2191
+ if (fc->fs_type == &cgroup2_fs_type)
2192
+ fc->ops = &cgroup_fs_context_ops;
2193
+ else
2194
+ fc->ops = &cgroup1_fs_context_ops;
2195
+ put_user_ns(fc->user_ns);
2196
+ fc->user_ns = get_user_ns(ctx->ns->user_ns);
2197
+ fc->global = true;
2198
+ return 0;
21592199 }
21602200
21612201 static void cgroup_kill_sb(struct super_block *sb)
....@@ -2171,25 +2211,73 @@
21712211 * And don't kill the default root.
21722212 */
21732213 if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
2174
- !percpu_ref_is_dying(&root->cgrp.self.refcnt))
2214
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
2215
+ cgroup_bpf_offline(&root->cgrp);
21752216 percpu_ref_kill(&root->cgrp.self.refcnt);
2217
+ }
21762218 cgroup_put(&root->cgrp);
21772219 kernfs_kill_sb(sb);
21782220 }
21792221
21802222 struct file_system_type cgroup_fs_type = {
2181
- .name = "cgroup",
2182
- .mount = cgroup_mount,
2183
- .kill_sb = cgroup_kill_sb,
2184
- .fs_flags = FS_USERNS_MOUNT,
2223
+ .name = "cgroup",
2224
+ .init_fs_context = cgroup_init_fs_context,
2225
+ .parameters = cgroup1_fs_parameters,
2226
+ .kill_sb = cgroup_kill_sb,
2227
+ .fs_flags = FS_USERNS_MOUNT,
21852228 };
21862229
21872230 static struct file_system_type cgroup2_fs_type = {
2188
- .name = "cgroup2",
2189
- .mount = cgroup_mount,
2190
- .kill_sb = cgroup_kill_sb,
2191
- .fs_flags = FS_USERNS_MOUNT,
2231
+ .name = "cgroup2",
2232
+ .init_fs_context = cgroup_init_fs_context,
2233
+ .parameters = cgroup2_fs_parameters,
2234
+ .kill_sb = cgroup_kill_sb,
2235
+ .fs_flags = FS_USERNS_MOUNT,
21922236 };
2237
+
2238
+#ifdef CONFIG_CPUSETS
2239
+static const struct fs_context_operations cpuset_fs_context_ops = {
2240
+ .get_tree = cgroup1_get_tree,
2241
+ .free = cgroup_fs_context_free,
2242
+};
2243
+
2244
+/*
2245
+ * This is ugly, but preserves the userspace API for existing cpuset
2246
+ * users. If someone tries to mount the "cpuset" filesystem, we
2247
+ * silently switch it to mount "cgroup" instead
2248
+ */
2249
+static int cpuset_init_fs_context(struct fs_context *fc)
2250
+{
2251
+ char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
2252
+ struct cgroup_fs_context *ctx;
2253
+ int err;
2254
+
2255
+ err = cgroup_init_fs_context(fc);
2256
+ if (err) {
2257
+ kfree(agent);
2258
+ return err;
2259
+ }
2260
+
2261
+ fc->ops = &cpuset_fs_context_ops;
2262
+
2263
+ ctx = cgroup_fc2context(fc);
2264
+ ctx->subsys_mask = 1 << cpuset_cgrp_id;
2265
+ ctx->flags |= CGRP_ROOT_NOPREFIX;
2266
+ ctx->release_agent = agent;
2267
+
2268
+ get_filesystem(&cgroup_fs_type);
2269
+ put_filesystem(fc->fs_type);
2270
+ fc->fs_type = &cgroup_fs_type;
2271
+
2272
+ return 0;
2273
+}
2274
+
2275
+static struct file_system_type cpuset_fs_type = {
2276
+ .name = "cpuset",
2277
+ .init_fs_context = cpuset_init_fs_context,
2278
+ .fs_flags = FS_USERNS_MOUNT,
2279
+};
2280
+#endif
21932281
21942282 int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
21952283 struct cgroup_namespace *ns)
....@@ -2256,6 +2344,47 @@
22562344 EXPORT_SYMBOL_GPL(task_cgroup_path);
22572345
22582346 /**
2347
+ * cgroup_attach_lock - Lock for ->attach()
2348
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
2349
+ *
2350
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
2351
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
2352
+ * implementations (e.g. cpuset) also need to disable CPU hotplug.
2353
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
2354
+ * lead to deadlocks.
2355
+ *
2356
+ * Bringing up a CPU may involve creating and destroying tasks which requires
2357
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
2358
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
2359
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
2360
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
2361
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
2362
+ *
2363
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
2364
+ *
2365
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
2366
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
2367
+ * CPU hotplug is disabled on entry.
2368
+ */
2369
+static void cgroup_attach_lock(bool lock_threadgroup)
2370
+{
2371
+ cpus_read_lock();
2372
+ if (lock_threadgroup)
2373
+ percpu_down_write(&cgroup_threadgroup_rwsem);
2374
+}
2375
+
2376
+/**
2377
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
2378
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
2379
+ */
2380
+static void cgroup_attach_unlock(bool lock_threadgroup)
2381
+{
2382
+ if (lock_threadgroup)
2383
+ percpu_up_write(&cgroup_threadgroup_rwsem);
2384
+ cpus_read_unlock();
2385
+}
2386
+
2387
+/**
22592388 * cgroup_migrate_add_task - add a migration target task to a migration context
22602389 * @task: target task
22612390 * @mgctx: target migration context
....@@ -2276,9 +2405,8 @@
22762405 if (task->flags & PF_EXITING)
22772406 return;
22782407
2279
- /* leave @task alone if post_fork() hasn't linked it yet */
2280
- if (list_empty(&task->cg_list))
2281
- return;
2408
+ /* cgroup_threadgroup_rwsem protects racing against forks */
2409
+ WARN_ON_ONCE(list_empty(&task->cg_list));
22822410
22832411 cset = task_css_set(task);
22842412 if (!cset->mg_src_cgrp)
....@@ -2310,6 +2438,7 @@
23102438
23112439 return cgroup_taskset_next(tset, dst_cssp);
23122440 }
2441
+EXPORT_SYMBOL_GPL(cgroup_taskset_first);
23132442
23142443 /**
23152444 * cgroup_taskset_next - iterate to the next task in taskset
....@@ -2356,6 +2485,7 @@
23562485
23572486 return NULL;
23582487 }
2488
+EXPORT_SYMBOL_GPL(cgroup_taskset_next);
23592489
23602490 /**
23612491 * cgroup_taskset_migrate - migrate a taskset
....@@ -2426,6 +2556,7 @@
24262556 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
24272557 if (ss->attach) {
24282558 tset->ssid = ssid;
2559
+ trace_android_vh_cgroup_attach(ss, tset);
24292560 ss->attach(tset);
24302561 }
24312562 } while_each_subsys_mask();
....@@ -2510,22 +2641,28 @@
25102641 */
25112642 void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
25122643 {
2513
- LIST_HEAD(preloaded);
2514
- struct css_set *cset, *tmp_cset;
2644
+ struct ext_css_set *cset, *tmp_cset;
25152645
25162646 lockdep_assert_held(&cgroup_mutex);
25172647
25182648 spin_lock_irq(&css_set_lock);
25192649
2520
- list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
2521
- list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
2650
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
2651
+ mg_src_preload_node) {
2652
+ cset->cset.mg_src_cgrp = NULL;
2653
+ cset->cset.mg_dst_cgrp = NULL;
2654
+ cset->cset.mg_dst_cset = NULL;
2655
+ list_del_init(&cset->mg_src_preload_node);
2656
+ put_css_set_locked(&cset->cset);
2657
+ }
25222658
2523
- list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
2524
- cset->mg_src_cgrp = NULL;
2525
- cset->mg_dst_cgrp = NULL;
2526
- cset->mg_dst_cset = NULL;
2527
- list_del_init(&cset->mg_preload_node);
2528
- put_css_set_locked(cset);
2659
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
2660
+ mg_dst_preload_node) {
2661
+ cset->cset.mg_src_cgrp = NULL;
2662
+ cset->cset.mg_dst_cgrp = NULL;
2663
+ cset->cset.mg_dst_cset = NULL;
2664
+ list_del_init(&cset->mg_dst_preload_node);
2665
+ put_css_set_locked(&cset->cset);
25292666 }
25302667
25312668 spin_unlock_irq(&css_set_lock);
....@@ -2552,6 +2689,7 @@
25522689 struct cgroup_mgctx *mgctx)
25532690 {
25542691 struct cgroup *src_cgrp;
2692
+ struct ext_css_set *ext_src_cset;
25552693
25562694 lockdep_assert_held(&cgroup_mutex);
25572695 lockdep_assert_held(&css_set_lock);
....@@ -2565,8 +2703,9 @@
25652703 return;
25662704
25672705 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2706
+ ext_src_cset = container_of(src_cset, struct ext_css_set, cset);
25682707
2569
- if (!list_empty(&src_cset->mg_preload_node))
2708
+ if (!list_empty(&ext_src_cset->mg_src_preload_node))
25702709 return;
25712710
25722711 WARN_ON(src_cset->mg_src_cgrp);
....@@ -2577,7 +2716,7 @@
25772716 src_cset->mg_src_cgrp = src_cgrp;
25782717 src_cset->mg_dst_cgrp = dst_cgrp;
25792718 get_css_set(src_cset);
2580
- list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
2719
+ list_add_tail(&ext_src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
25812720 }
25822721
25832722 /**
....@@ -2596,20 +2735,23 @@
25962735 */
25972736 int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
25982737 {
2599
- struct css_set *src_cset, *tmp_cset;
2738
+ struct ext_css_set *ext_src_set, *tmp_cset;
26002739
26012740 lockdep_assert_held(&cgroup_mutex);
26022741
26032742 /* look up the dst cset for each src cset and link it to src */
2604
- list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2605
- mg_preload_node) {
2743
+ list_for_each_entry_safe(ext_src_set, tmp_cset, &mgctx->preloaded_src_csets,
2744
+ mg_src_preload_node) {
2745
+ struct css_set *src_cset = &ext_src_set->cset;
26062746 struct css_set *dst_cset;
2747
+ struct ext_css_set *ext_dst_cset;
26072748 struct cgroup_subsys *ss;
26082749 int ssid;
26092750
26102751 dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
26112752 if (!dst_cset)
26122753 return -ENOMEM;
2754
+ ext_dst_cset = container_of(dst_cset, struct ext_css_set, cset);
26132755
26142756 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
26152757
....@@ -2621,7 +2763,7 @@
26212763 if (src_cset == dst_cset) {
26222764 src_cset->mg_src_cgrp = NULL;
26232765 src_cset->mg_dst_cgrp = NULL;
2624
- list_del_init(&src_cset->mg_preload_node);
2766
+ list_del_init(&ext_src_set->mg_src_preload_node);
26252767 put_css_set(src_cset);
26262768 put_css_set(dst_cset);
26272769 continue;
....@@ -2629,8 +2771,8 @@
26292771
26302772 src_cset->mg_dst_cset = dst_cset;
26312773
2632
- if (list_empty(&dst_cset->mg_preload_node))
2633
- list_add_tail(&dst_cset->mg_preload_node,
2774
+ if (list_empty(&ext_dst_cset->mg_dst_preload_node))
2775
+ list_add_tail(&ext_dst_cset->mg_dst_preload_node,
26342776 &mgctx->preloaded_dst_csets);
26352777 else
26362778 put_css_set(dst_cset);
....@@ -2698,11 +2840,7 @@
26982840 {
26992841 DEFINE_CGROUP_MGCTX(mgctx);
27002842 struct task_struct *task;
2701
- int ret;
2702
-
2703
- ret = cgroup_migrate_vet_dst(dst_cgrp);
2704
- if (ret)
2705
- return ret;
2843
+ int ret = 0;
27062844
27072845 /* look up all src csets */
27082846 spin_lock_irq(&css_set_lock);
....@@ -2729,16 +2867,28 @@
27292867 return ret;
27302868 }
27312869
2732
-struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
2733
- __acquires(&cgroup_threadgroup_rwsem)
2870
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
2871
+ bool *threadgroup_locked,
2872
+ struct cgroup *dst_cgrp)
27342873 {
27352874 struct task_struct *tsk;
27362875 pid_t pid;
2876
+ bool force_migration = false;
27372877
27382878 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
27392879 return ERR_PTR(-EINVAL);
27402880
2741
- percpu_down_write(&cgroup_threadgroup_rwsem);
2881
+ /*
2882
+ * If we migrate a single thread, we don't care about threadgroup
2883
+ * stability. If the thread is `current`, it won't exit(2) under our
2884
+ * hands or change PID through exec(2). We exclude
2885
+ * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
2886
+ * callers by cgroup_mutex.
2887
+ * Therefore, we can skip the global lock.
2888
+ */
2889
+ lockdep_assert_held(&cgroup_mutex);
2890
+ *threadgroup_locked = pid || threadgroup;
2891
+ cgroup_attach_lock(*threadgroup_locked);
27422892
27432893 rcu_read_lock();
27442894 if (pid) {
....@@ -2754,13 +2904,16 @@
27542904 if (threadgroup)
27552905 tsk = tsk->group_leader;
27562906
2907
+ if (tsk->flags & PF_KTHREAD)
2908
+ trace_android_rvh_cgroup_force_kthread_migration(tsk, dst_cgrp, &force_migration);
2909
+
27572910 /*
27582911 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
27592912 * If userland migrates such a kthread to a non-root cgroup, it can
27602913 * become trapped in a cpuset, or RT kthread may be born in a
27612914 * cgroup with no rt_runtime allocated. Just say no.
27622915 */
2763
- if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
2916
+ if (!force_migration && (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY))) {
27642917 tsk = ERR_PTR(-EINVAL);
27652918 goto out_unlock_threadgroup;
27662919 }
....@@ -2769,14 +2922,14 @@
27692922 goto out_unlock_rcu;
27702923
27712924 out_unlock_threadgroup:
2772
- percpu_up_write(&cgroup_threadgroup_rwsem);
2925
+ cgroup_attach_unlock(*threadgroup_locked);
2926
+ *threadgroup_locked = false;
27732927 out_unlock_rcu:
27742928 rcu_read_unlock();
27752929 return tsk;
27762930 }
27772931
2778
-void cgroup_procs_write_finish(struct task_struct *task)
2779
- __releases(&cgroup_threadgroup_rwsem)
2932
+void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
27802933 {
27812934 struct cgroup_subsys *ss;
27822935 int ssid;
....@@ -2784,7 +2937,8 @@
27842937 /* release reference from cgroup_procs_write_start() */
27852938 put_task_struct(task);
27862939
2787
- percpu_up_write(&cgroup_threadgroup_rwsem);
2940
+ cgroup_attach_unlock(threadgroup_locked);
2941
+
27882942 for_each_subsys(ss, ssid)
27892943 if (ss->post_attach)
27902944 ss->post_attach();
....@@ -2799,7 +2953,7 @@
27992953 do_each_subsys_mask(ss, ssid, ss_mask) {
28002954 if (printed)
28012955 seq_putc(seq, ' ');
2802
- seq_printf(seq, "%s", ss->name);
2956
+ seq_puts(seq, ss->name);
28032957 printed = true;
28042958 } while_each_subsys_mask();
28052959 if (printed)
....@@ -2838,12 +2992,11 @@
28382992 DEFINE_CGROUP_MGCTX(mgctx);
28392993 struct cgroup_subsys_state *d_css;
28402994 struct cgroup *dsct;
2841
- struct css_set *src_cset;
2995
+ struct ext_css_set *ext_src_set;
2996
+ bool has_tasks;
28422997 int ret;
28432998
28442999 lockdep_assert_held(&cgroup_mutex);
2845
-
2846
- percpu_down_write(&cgroup_threadgroup_rwsem);
28473000
28483001 /* look up all csses currently attached to @cgrp's subtree */
28493002 spin_lock_irq(&css_set_lock);
....@@ -2855,17 +3008,27 @@
28553008 }
28563009 spin_unlock_irq(&css_set_lock);
28573010
3011
+ /*
3012
+ * We need to write-lock threadgroup_rwsem while migrating tasks.
3013
+ * However, if there are no source csets for @cgrp, changing its
3014
+ * controllers isn't gonna produce any task migrations and the
3015
+ * write-locking can be skipped safely.
3016
+ */
3017
+ has_tasks = !list_empty(&mgctx.preloaded_src_csets);
3018
+ cgroup_attach_lock(has_tasks);
3019
+
28583020 /* NULL dst indicates self on default hierarchy */
28593021 ret = cgroup_migrate_prepare_dst(&mgctx);
28603022 if (ret)
28613023 goto out_finish;
28623024
28633025 spin_lock_irq(&css_set_lock);
2864
- list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
3026
+ list_for_each_entry(ext_src_set, &mgctx.preloaded_src_csets,
3027
+ mg_src_preload_node) {
28653028 struct task_struct *task, *ntask;
28663029
28673030 /* all tasks in src_csets need to be migrated */
2868
- list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
3031
+ list_for_each_entry_safe(task, ntask, &ext_src_set->cset.tasks, cg_list)
28693032 cgroup_migrate_add_task(task, &mgctx);
28703033 }
28713034 spin_unlock_irq(&css_set_lock);
....@@ -2873,7 +3036,7 @@
28733036 ret = cgroup_migrate_execute(&mgctx);
28743037 out_finish:
28753038 cgroup_migrate_finish(&mgctx);
2876
- percpu_up_write(&cgroup_threadgroup_rwsem);
3039
+ cgroup_attach_unlock(has_tasks);
28773040 return ret;
28783041 }
28793042
....@@ -3106,7 +3269,7 @@
31063269 return ret;
31073270
31083271 /*
3109
- * At this point, cgroup_e_css() results reflect the new csses
3272
+ * At this point, cgroup_e_css_by_mask() results reflect the new csses
31103273 * making the following cgroup_update_dfl_csses() properly update
31113274 * css associations of all tasks in the subtree.
31123275 */
....@@ -3506,22 +3669,33 @@
35063669 #ifdef CONFIG_PSI
35073670 static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
35083671 {
3509
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
3672
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
3673
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3674
+
3675
+ return psi_show(seq, psi, PSI_IO);
35103676 }
35113677 static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
35123678 {
3513
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
3679
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
3680
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3681
+
3682
+ return psi_show(seq, psi, PSI_MEM);
35143683 }
35153684 static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
35163685 {
3517
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
3686
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
3687
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3688
+
3689
+ return psi_show(seq, psi, PSI_CPU);
35183690 }
35193691
35203692 static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
35213693 size_t nbytes, enum psi_res res)
35223694 {
3695
+ struct cgroup_file_ctx *ctx = of->priv;
35233696 struct psi_trigger *new;
35243697 struct cgroup *cgrp;
3698
+ struct psi_group *psi;
35253699
35263700 cgrp = cgroup_kn_lock_live(of->kn, false);
35273701 if (!cgrp)
....@@ -3530,14 +3704,20 @@
35303704 cgroup_get(cgrp);
35313705 cgroup_kn_unlock(of->kn);
35323706
3533
- new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
3707
+ /* Allow only one trigger per file descriptor */
3708
+ if (ctx->psi.trigger) {
3709
+ cgroup_put(cgrp);
3710
+ return -EBUSY;
3711
+ }
3712
+
3713
+ psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3714
+ new = psi_trigger_create(psi, buf, nbytes, res);
35343715 if (IS_ERR(new)) {
35353716 cgroup_put(cgrp);
35363717 return PTR_ERR(new);
35373718 }
35383719
3539
- psi_trigger_replace(&of->priv, new);
3540
-
3720
+ smp_store_release(&ctx->psi.trigger, new);
35413721 cgroup_put(cgrp);
35423722
35433723 return nbytes;
....@@ -3567,12 +3747,15 @@
35673747 static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
35683748 poll_table *pt)
35693749 {
3570
- return psi_trigger_poll(&of->priv, of->file, pt);
3750
+ struct cgroup_file_ctx *ctx = of->priv;
3751
+ return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
35713752 }
35723753
35733754 static void cgroup_pressure_release(struct kernfs_open_file *of)
35743755 {
3575
- psi_trigger_replace(&of->priv, NULL);
3756
+ struct cgroup_file_ctx *ctx = of->priv;
3757
+
3758
+ psi_trigger_destroy(ctx->psi.trigger);
35763759 }
35773760
35783761 bool cgroup_psi_enabled(void)
....@@ -3625,28 +3808,50 @@
36253808 static int cgroup_file_open(struct kernfs_open_file *of)
36263809 {
36273810 struct cftype *cft = of->kn->priv;
3811
+ struct cgroup_file_ctx *ctx;
3812
+ int ret;
36283813
3629
- if (cft->open)
3630
- return cft->open(of);
3631
- return 0;
3814
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
3815
+ if (!ctx)
3816
+ return -ENOMEM;
3817
+
3818
+ ctx->ns = current->nsproxy->cgroup_ns;
3819
+ get_cgroup_ns(ctx->ns);
3820
+ of->priv = ctx;
3821
+
3822
+ if (!cft->open)
3823
+ return 0;
3824
+
3825
+ ret = cft->open(of);
3826
+ if (ret) {
3827
+ put_cgroup_ns(ctx->ns);
3828
+ kfree(ctx);
3829
+ }
3830
+ return ret;
36323831 }
36333832
36343833 static void cgroup_file_release(struct kernfs_open_file *of)
36353834 {
36363835 struct cftype *cft = of->kn->priv;
3836
+ struct cgroup_file_ctx *ctx = of->priv;
36373837
36383838 if (cft->release)
36393839 cft->release(of);
3840
+ put_cgroup_ns(ctx->ns);
3841
+ kfree(ctx);
36403842 }
36413843
36423844 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
36433845 size_t nbytes, loff_t off)
36443846 {
3645
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
3847
+ struct cgroup_file_ctx *ctx = of->priv;
36463848 struct cgroup *cgrp = of->kn->parent->priv;
36473849 struct cftype *cft = of->kn->priv;
36483850 struct cgroup_subsys_state *css;
36493851 int ret;
3852
+
3853
+ if (!nbytes)
3854
+ return 0;
36503855
36513856 /*
36523857 * If namespaces are delegation boundaries, disallow writes to
....@@ -3656,7 +3861,7 @@
36563861 */
36573862 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
36583863 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3659
- ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3864
+ ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
36603865 return -EPERM;
36613866
36623867 if (cft->write)
....@@ -3843,7 +4048,8 @@
38434048 continue;
38444049 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
38454050 continue;
3846
-
4051
+ if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
4052
+ continue;
38474053 if (is_add) {
38484054 ret = cgroup_add_file(css, cgrp, cft);
38494055 if (ret) {
....@@ -4028,6 +4234,7 @@
40284234 cft->flags |= __CFTYPE_ONLY_ON_DFL;
40294235 return cgroup_add_cftypes(ss, cfts);
40304236 }
4237
+EXPORT_SYMBOL_GPL(cgroup_add_dfl_cftypes);
40314238
40324239 /**
40334240 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
....@@ -4045,6 +4252,7 @@
40454252 cft->flags |= __CFTYPE_NOT_ON_DFL;
40464253 return cgroup_add_cftypes(ss, cfts);
40474254 }
4255
+EXPORT_SYMBOL_GPL(cgroup_add_legacy_cftypes);
40484256
40494257 /**
40504258 * cgroup_file_notify - generate a file modified event for a cgroup_file
....@@ -4120,7 +4328,8 @@
41204328 } else if (likely(!(pos->flags & CSS_RELEASED))) {
41214329 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
41224330 } else {
4123
- list_for_each_entry_rcu(next, &parent->children, sibling)
4331
+ list_for_each_entry_rcu(next, &parent->children, sibling,
4332
+ lockdep_is_held(&cgroup_mutex))
41244333 if (next->serial_nr > pos->serial_nr)
41254334 break;
41264335 }
....@@ -4133,6 +4342,7 @@
41334342 return next;
41344343 return NULL;
41354344 }
4345
+EXPORT_SYMBOL_GPL(css_next_child);
41364346
41374347 /**
41384348 * css_next_descendant_pre - find the next descendant for pre-order walk
....@@ -4182,6 +4392,7 @@
41824392
41834393 return NULL;
41844394 }
4395
+EXPORT_SYMBOL_GPL(css_next_descendant_pre);
41854396
41864397 /**
41874398 * css_rightmost_descendant - return the rightmost descendant of a css
....@@ -4362,29 +4573,24 @@
43624573
43634574 lockdep_assert_held(&css_set_lock);
43644575
4365
- /* Advance to the next non-empty css_set */
4366
- do {
4367
- cset = css_task_iter_next_css_set(it);
4368
- if (!cset) {
4369
- it->task_pos = NULL;
4370
- return;
4576
+ /* Advance to the next non-empty css_set and find the first non-empty tasks list */
4577
+ while ((cset = css_task_iter_next_css_set(it))) {
4578
+ if (!list_empty(&cset->tasks)) {
4579
+ it->cur_tasks_head = &cset->tasks;
4580
+ break;
4581
+ } else if (!list_empty(&cset->mg_tasks)) {
4582
+ it->cur_tasks_head = &cset->mg_tasks;
4583
+ break;
4584
+ } else if (!list_empty(&cset->dying_tasks)) {
4585
+ it->cur_tasks_head = &cset->dying_tasks;
4586
+ break;
43714587 }
4372
- } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
4373
-
4374
- if (!list_empty(&cset->tasks)) {
4375
- it->task_pos = cset->tasks.next;
4376
- it->cur_tasks_head = &cset->tasks;
4377
- } else if (!list_empty(&cset->mg_tasks)) {
4378
- it->task_pos = cset->mg_tasks.next;
4379
- it->cur_tasks_head = &cset->mg_tasks;
4380
- } else {
4381
- it->task_pos = cset->dying_tasks.next;
4382
- it->cur_tasks_head = &cset->dying_tasks;
43834588 }
4384
-
4385
- it->tasks_head = &cset->tasks;
4386
- it->mg_tasks_head = &cset->mg_tasks;
4387
- it->dying_tasks_head = &cset->dying_tasks;
4589
+ if (!cset) {
4590
+ it->task_pos = NULL;
4591
+ return;
4592
+ }
4593
+ it->task_pos = it->cur_tasks_head->next;
43884594
43894595 /*
43904596 * We don't keep css_sets locked across iteration steps and thus
....@@ -4429,24 +4635,24 @@
44294635 repeat:
44304636 if (it->task_pos) {
44314637 /*
4432
- * Advance iterator to find next entry. cset->tasks is
4433
- * consumed first and then ->mg_tasks. After ->mg_tasks,
4434
- * we move onto the next cset.
4638
+ * Advance iterator to find next entry. We go through cset
4639
+ * tasks, mg_tasks and dying_tasks; when consumed we move on to
4640
+ * the next cset.
44354641 */
44364642 if (it->flags & CSS_TASK_ITER_SKIPPED)
44374643 it->flags &= ~CSS_TASK_ITER_SKIPPED;
44384644 else
44394645 it->task_pos = it->task_pos->next;
44404646
4441
- if (it->task_pos == it->tasks_head) {
4442
- it->task_pos = it->mg_tasks_head->next;
4443
- it->cur_tasks_head = it->mg_tasks_head;
4647
+ if (it->task_pos == &it->cur_cset->tasks) {
4648
+ it->cur_tasks_head = &it->cur_cset->mg_tasks;
4649
+ it->task_pos = it->cur_tasks_head->next;
44444650 }
4445
- if (it->task_pos == it->mg_tasks_head) {
4446
- it->task_pos = it->dying_tasks_head->next;
4447
- it->cur_tasks_head = it->dying_tasks_head;
4651
+ if (it->task_pos == &it->cur_cset->mg_tasks) {
4652
+ it->cur_tasks_head = &it->cur_cset->dying_tasks;
4653
+ it->task_pos = it->cur_tasks_head->next;
44484654 }
4449
- if (it->task_pos == it->dying_tasks_head)
4655
+ if (it->task_pos == &it->cur_cset->dying_tasks)
44504656 css_task_iter_advance_css_set(it);
44514657 } else {
44524658 /* called from start, proceed to the first cset */
....@@ -4464,12 +4670,12 @@
44644670 goto repeat;
44654671
44664672 /* and dying leaders w/o live member threads */
4467
- if (it->cur_tasks_head == it->dying_tasks_head &&
4673
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
44684674 !atomic_read(&task->signal->live))
44694675 goto repeat;
44704676 } else {
44714677 /* skip all dying ones */
4472
- if (it->cur_tasks_head == it->dying_tasks_head)
4678
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
44734679 goto repeat;
44744680 }
44754681 }
....@@ -4488,9 +4694,6 @@
44884694 void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
44894695 struct css_task_iter *it)
44904696 {
4491
- /* no one should try to iterate before mounting cgroups */
4492
- WARN_ON_ONCE(!use_task_css_set_links);
4493
-
44944697 memset(it, 0, sizeof(*it));
44954698
44964699 spin_lock_irq(&css_set_lock);
....@@ -4567,21 +4770,21 @@
45674770
45684771 static void cgroup_procs_release(struct kernfs_open_file *of)
45694772 {
4570
- if (of->priv) {
4571
- css_task_iter_end(of->priv);
4572
- kfree(of->priv);
4573
- }
4773
+ struct cgroup_file_ctx *ctx = of->priv;
4774
+
4775
+ if (ctx->procs.started)
4776
+ css_task_iter_end(&ctx->procs.iter);
45744777 }
45754778
45764779 static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
45774780 {
45784781 struct kernfs_open_file *of = s->private;
4579
- struct css_task_iter *it = of->priv;
4782
+ struct cgroup_file_ctx *ctx = of->priv;
45804783
45814784 if (pos)
45824785 (*pos)++;
45834786
4584
- return css_task_iter_next(it);
4787
+ return css_task_iter_next(&ctx->procs.iter);
45854788 }
45864789
45874790 static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
....@@ -4589,21 +4792,18 @@
45894792 {
45904793 struct kernfs_open_file *of = s->private;
45914794 struct cgroup *cgrp = seq_css(s)->cgroup;
4592
- struct css_task_iter *it = of->priv;
4795
+ struct cgroup_file_ctx *ctx = of->priv;
4796
+ struct css_task_iter *it = &ctx->procs.iter;
45934797
45944798 /*
45954799 * When a seq_file is seeked, it's always traversed sequentially
45964800 * from position 0, so we can simply keep iterating on !0 *pos.
45974801 */
4598
- if (!it) {
4802
+ if (!ctx->procs.started) {
45994803 if (WARN_ON_ONCE((*pos)))
46004804 return ERR_PTR(-EINVAL);
4601
-
4602
- it = kzalloc(sizeof(*it), GFP_KERNEL);
4603
- if (!it)
4604
- return ERR_PTR(-ENOMEM);
4605
- of->priv = it;
46064805 css_task_iter_start(&cgrp->self, iter_flags, it);
4806
+ ctx->procs.started = true;
46074807 } else if (!(*pos)) {
46084808 css_task_iter_end(it);
46094809 css_task_iter_start(&cgrp->self, iter_flags, it);
....@@ -4636,13 +4836,28 @@
46364836 return 0;
46374837 }
46384838
4839
+static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
4840
+{
4841
+ int ret;
4842
+ struct inode *inode;
4843
+
4844
+ lockdep_assert_held(&cgroup_mutex);
4845
+
4846
+ inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
4847
+ if (!inode)
4848
+ return -ENOMEM;
4849
+
4850
+ ret = inode_permission(inode, MAY_WRITE);
4851
+ iput(inode);
4852
+ return ret;
4853
+}
4854
+
46394855 static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
46404856 struct cgroup *dst_cgrp,
4641
- struct super_block *sb)
4857
+ struct super_block *sb,
4858
+ struct cgroup_namespace *ns)
46424859 {
4643
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
46444860 struct cgroup *com_cgrp = src_cgrp;
4645
- struct inode *inode;
46464861 int ret;
46474862
46484863 lockdep_assert_held(&cgroup_mutex);
....@@ -4652,12 +4867,7 @@
46524867 com_cgrp = cgroup_parent(com_cgrp);
46534868
46544869 /* %current should be authorized to migrate to the common ancestor */
4655
- inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
4656
- if (!inode)
4657
- return -ENOMEM;
4658
-
4659
- ret = inode_permission(inode, MAY_WRITE);
4660
- iput(inode);
4870
+ ret = cgroup_may_write(com_cgrp, sb);
46614871 if (ret)
46624872 return ret;
46634873
....@@ -4673,18 +4883,42 @@
46734883 return 0;
46744884 }
46754885
4886
+static int cgroup_attach_permissions(struct cgroup *src_cgrp,
4887
+ struct cgroup *dst_cgrp,
4888
+ struct super_block *sb, bool threadgroup,
4889
+ struct cgroup_namespace *ns)
4890
+{
4891
+ int ret = 0;
4892
+
4893
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
4894
+ if (ret)
4895
+ return ret;
4896
+
4897
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
4898
+ if (ret)
4899
+ return ret;
4900
+
4901
+ if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
4902
+ ret = -EOPNOTSUPP;
4903
+
4904
+ return ret;
4905
+}
4906
+
46764907 static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
46774908 char *buf, size_t nbytes, loff_t off)
46784909 {
4910
+ struct cgroup_file_ctx *ctx = of->priv;
46794911 struct cgroup *src_cgrp, *dst_cgrp;
46804912 struct task_struct *task;
4913
+ const struct cred *saved_cred;
46814914 ssize_t ret;
4915
+ bool threadgroup_locked;
46824916
46834917 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
46844918 if (!dst_cgrp)
46854919 return -ENODEV;
46864920
4687
- task = cgroup_procs_write_start(buf, true);
4921
+ task = cgroup_procs_write_start(buf, true, &threadgroup_locked, dst_cgrp);
46884922 ret = PTR_ERR_OR_ZERO(task);
46894923 if (ret)
46904924 goto out_unlock;
....@@ -4694,15 +4928,23 @@
46944928 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
46954929 spin_unlock_irq(&css_set_lock);
46964930
4697
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
4698
- of->file->f_path.dentry->d_sb);
4931
+ /*
4932
+ * Process and thread migrations follow the same delegation rule. Check
4933
+ * permissions using the credentials from file open to protect against
4934
+ * inherited fd attacks.
4935
+ */
4936
+ saved_cred = override_creds(of->file->f_cred);
4937
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
4938
+ of->file->f_path.dentry->d_sb, true,
4939
+ ctx->ns);
4940
+ revert_creds(saved_cred);
46994941 if (ret)
47004942 goto out_finish;
47014943
47024944 ret = cgroup_attach_task(dst_cgrp, task, true);
47034945
47044946 out_finish:
4705
- cgroup_procs_write_finish(task);
4947
+ cgroup_procs_write_finish(task, threadgroup_locked);
47064948 out_unlock:
47074949 cgroup_kn_unlock(of->kn);
47084950
....@@ -4717,9 +4959,12 @@
47174959 static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
47184960 char *buf, size_t nbytes, loff_t off)
47194961 {
4962
+ struct cgroup_file_ctx *ctx = of->priv;
47204963 struct cgroup *src_cgrp, *dst_cgrp;
47214964 struct task_struct *task;
4965
+ const struct cred *saved_cred;
47224966 ssize_t ret;
4967
+ bool threadgroup_locked;
47234968
47244969 buf = strstrip(buf);
47254970
....@@ -4727,7 +4972,7 @@
47274972 if (!dst_cgrp)
47284973 return -ENODEV;
47294974
4730
- task = cgroup_procs_write_start(buf, false);
4975
+ task = cgroup_procs_write_start(buf, false, &threadgroup_locked, dst_cgrp);
47314976 ret = PTR_ERR_OR_ZERO(task);
47324977 if (ret)
47334978 goto out_unlock;
....@@ -4737,21 +4982,23 @@
47374982 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
47384983 spin_unlock_irq(&css_set_lock);
47394984
4740
- /* thread migrations follow the cgroup.procs delegation rule */
4741
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
4742
- of->file->f_path.dentry->d_sb);
4985
+ /*
4986
+ * Process and thread migrations follow the same delegation rule. Check
4987
+ * permissions using the credentials from file open to protect against
4988
+ * inherited fd attacks.
4989
+ */
4990
+ saved_cred = override_creds(of->file->f_cred);
4991
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
4992
+ of->file->f_path.dentry->d_sb, false,
4993
+ ctx->ns);
4994
+ revert_creds(saved_cred);
47434995 if (ret)
4744
- goto out_finish;
4745
-
4746
- /* and must be contained in the same domain */
4747
- ret = -EOPNOTSUPP;
4748
- if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
47494996 goto out_finish;
47504997
47514998 ret = cgroup_attach_task(dst_cgrp, task, false);
47524999
47535000 out_finish:
4754
- cgroup_procs_write_finish(task);
5001
+ cgroup_procs_write_finish(task, threadgroup_locked);
47555002 out_unlock:
47565003 cgroup_kn_unlock(of->kn);
47575004
....@@ -4823,13 +5070,12 @@
48235070 },
48245071 {
48255072 .name = "cpu.stat",
4826
- .flags = CFTYPE_NOT_ON_ROOT,
48275073 .seq_show = cpu_stat_show,
48285074 },
48295075 #ifdef CONFIG_PSI
48305076 {
48315077 .name = "io.pressure",
4832
- .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE,
5078
+ .flags = CFTYPE_PRESSURE,
48335079 .seq_show = cgroup_io_pressure_show,
48345080 .write = cgroup_io_pressure_write,
48355081 .poll = cgroup_pressure_poll,
....@@ -4837,7 +5083,7 @@
48375083 },
48385084 {
48395085 .name = "memory.pressure",
4840
- .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE,
5086
+ .flags = CFTYPE_PRESSURE,
48415087 .seq_show = cgroup_memory_pressure_show,
48425088 .write = cgroup_memory_pressure_write,
48435089 .poll = cgroup_pressure_poll,
....@@ -4845,7 +5091,7 @@
48455091 },
48465092 {
48475093 .name = "cpu.pressure",
4848
- .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE,
5094
+ .flags = CFTYPE_PRESSURE,
48495095 .seq_show = cgroup_cpu_pressure_show,
48505096 .write = cgroup_cpu_pressure_write,
48515097 .poll = cgroup_pressure_poll,
....@@ -4964,9 +5210,6 @@
49645210 tcgrp->nr_dying_descendants--;
49655211 spin_unlock_irq(&css_set_lock);
49665212
4967
- cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4968
- cgrp->id = -1;
4969
-
49705213 /*
49715214 * There are two control paths which try to determine
49725215 * cgroup from dentry without going through kernfs -
....@@ -4977,8 +5220,6 @@
49775220 if (cgrp->kn)
49785221 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
49795222 NULL);
4980
-
4981
- cgroup_bpf_put(cgrp);
49825223 }
49835224
49845225 mutex_unlock(&cgroup_mutex);
....@@ -5133,10 +5374,12 @@
51335374 * it isn't associated with its kernfs_node and doesn't have the control
51345375 * mask applied.
51355376 */
5136
-static struct cgroup *cgroup_create(struct cgroup *parent)
5377
+static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
5378
+ umode_t mode)
51375379 {
51385380 struct cgroup_root *root = parent->root;
51395381 struct cgroup *cgrp, *tcgrp;
5382
+ struct kernfs_node *kn;
51405383 int level = parent->level + 1;
51415384 int ret;
51425385
....@@ -5156,15 +5399,13 @@
51565399 goto out_cancel_ref;
51575400 }
51585401
5159
- /*
5160
- * Temporarily set the pointer to NULL, so idr_find() won't return
5161
- * a half-baked cgroup.
5162
- */
5163
- cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
5164
- if (cgrp->id < 0) {
5165
- ret = -ENOMEM;
5402
+ /* create the directory */
5403
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5404
+ if (IS_ERR(kn)) {
5405
+ ret = PTR_ERR(kn);
51665406 goto out_stat_exit;
51675407 }
5408
+ cgrp->kn = kn;
51685409
51695410 init_cgroup_housekeeping(cgrp);
51705411
....@@ -5174,7 +5415,7 @@
51745415
51755416 ret = psi_cgroup_alloc(cgrp);
51765417 if (ret)
5177
- goto out_idr_free;
5418
+ goto out_kernfs_remove;
51785419
51795420 ret = cgroup_bpf_inherit(cgrp);
51805421 if (ret)
....@@ -5198,7 +5439,7 @@
51985439
51995440 spin_lock_irq(&css_set_lock);
52005441 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5201
- cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
5442
+ cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
52025443
52035444 if (tcgrp != cgrp) {
52045445 tcgrp->nr_descendants++;
....@@ -5228,12 +5469,6 @@
52285469 cgroup_get_live(parent);
52295470
52305471 /*
5231
- * @cgrp is now fully operational. If something fails after this
5232
- * point, it'll be released via the normal destruction path.
5233
- */
5234
- cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
5235
-
5236
- /*
52375472 * On the default hierarchy, a child doesn't automatically inherit
52385473 * subtree_control from the parent. Each is configured manually.
52395474 */
....@@ -5246,8 +5481,8 @@
52465481
52475482 out_psi_free:
52485483 psi_cgroup_free(cgrp);
5249
-out_idr_free:
5250
- cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
5484
+out_kernfs_remove:
5485
+ kernfs_remove(cgrp->kn);
52515486 out_stat_exit:
52525487 if (cgroup_on_dfl(parent))
52535488 cgroup_rstat_exit(cgrp);
....@@ -5284,7 +5519,6 @@
52845519 int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
52855520 {
52865521 struct cgroup *parent, *cgrp;
5287
- struct kernfs_node *kn;
52885522 int ret;
52895523
52905524 /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
....@@ -5300,27 +5534,19 @@
53005534 goto out_unlock;
53015535 }
53025536
5303
- cgrp = cgroup_create(parent);
5537
+ cgrp = cgroup_create(parent, name, mode);
53045538 if (IS_ERR(cgrp)) {
53055539 ret = PTR_ERR(cgrp);
53065540 goto out_unlock;
53075541 }
53085542
5309
- /* create the directory */
5310
- kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5311
- if (IS_ERR(kn)) {
5312
- ret = PTR_ERR(kn);
5313
- goto out_destroy;
5314
- }
5315
- cgrp->kn = kn;
5316
-
53175543 /*
53185544 * This extra ref will be put in cgroup_free_fn() and guarantees
53195545 * that @cgrp->kn is always accessible.
53205546 */
5321
- kernfs_get(kn);
5547
+ kernfs_get(cgrp->kn);
53225548
5323
- ret = cgroup_kn_set_ugid(kn);
5549
+ ret = cgroup_kn_set_ugid(cgrp->kn);
53245550 if (ret)
53255551 goto out_destroy;
53265552
....@@ -5335,7 +5561,7 @@
53355561 TRACE_CGROUP_PATH(mkdir, cgrp);
53365562
53375563 /* let's create and online css's */
5338
- kernfs_activate(kn);
5564
+ kernfs_activate(cgrp->kn);
53395565
53405566 ret = 0;
53415567 goto out_unlock;
....@@ -5512,6 +5738,8 @@
55125738
55135739 cgroup1_check_for_release(parent);
55145740
5741
+ cgroup_bpf_offline(cgrp);
5742
+
55155743 /* put the base reference */
55165744 percpu_ref_kill(&cgrp->self.refcnt);
55175745
....@@ -5537,7 +5765,6 @@
55375765
55385766 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
55395767 .show_options = cgroup_show_options,
5540
- .remount_fs = cgroup_remount,
55415768 .mkdir = cgroup_mkdir,
55425769 .rmdir = cgroup_rmdir,
55435770 .show_path = cgroup_show_path,
....@@ -5604,11 +5831,12 @@
56045831 */
56055832 int __init cgroup_init_early(void)
56065833 {
5607
- static struct cgroup_sb_opts __initdata opts;
5834
+ static struct cgroup_fs_context __initdata ctx;
56085835 struct cgroup_subsys *ss;
56095836 int i;
56105837
5611
- init_cgroup_root(&cgrp_dfl_root, &opts);
5838
+ ctx.root = &cgrp_dfl_root;
5839
+ init_cgroup_root(&ctx);
56125840 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
56135841
56145842 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
....@@ -5644,14 +5872,13 @@
56445872 int ssid;
56455873
56465874 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
5647
- BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
56485875 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
56495876 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
56505877
56515878 cgroup_rstat_boot();
56525879
56535880 /*
5654
- * The latency of the synchronize_sched() is too high for cgroups,
5881
+ * The latency of the synchronize_rcu() is too high for cgroups,
56555882 * avoid it at the cost of forcing all readers into the slow path.
56565883 */
56575884 rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
....@@ -5735,6 +5962,9 @@
57355962 WARN_ON(register_filesystem(&cgroup_fs_type));
57365963 WARN_ON(register_filesystem(&cgroup2_fs_type));
57375964 WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
5965
+#ifdef CONFIG_CPUSETS
5966
+ WARN_ON(register_filesystem(&cpuset_fs_type));
5967
+#endif
57385968
57395969 return 0;
57405970 }
....@@ -5755,12 +5985,11 @@
57555985 }
57565986 core_initcall(cgroup_wq_init);
57575987
5758
-void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
5759
- char *buf, size_t buflen)
5988
+void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
57605989 {
57615990 struct kernfs_node *kn;
57625991
5763
- kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
5992
+ kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
57645993 if (!kn)
57655994 return;
57665995 kernfs_path(kn, buf, buflen);
....@@ -5850,8 +6079,7 @@
58506079 * @child: pointer to task_struct of forking parent process.
58516080 *
58526081 * A task is associated with the init_css_set until cgroup_post_fork()
5853
- * attaches it to the parent's css_set. Empty cg_list indicates that
5854
- * @child isn't holding reference to its css_set.
6082
+ * attaches it to the target css_set.
58556083 */
58566084 void cgroup_fork(struct task_struct *child)
58576085 {
....@@ -5859,21 +6087,172 @@
58596087 INIT_LIST_HEAD(&child->cg_list);
58606088 }
58616089
6090
+static struct cgroup *cgroup_get_from_file(struct file *f)
6091
+{
6092
+ struct cgroup_subsys_state *css;
6093
+ struct cgroup *cgrp;
6094
+
6095
+ css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
6096
+ if (IS_ERR(css))
6097
+ return ERR_CAST(css);
6098
+
6099
+ cgrp = css->cgroup;
6100
+ if (!cgroup_on_dfl(cgrp)) {
6101
+ cgroup_put(cgrp);
6102
+ return ERR_PTR(-EBADF);
6103
+ }
6104
+
6105
+ return cgrp;
6106
+}
6107
+
6108
+/**
6109
+ * cgroup_css_set_fork - find or create a css_set for a child process
6110
+ * @kargs: the arguments passed to create the child process
6111
+ *
6112
+ * This function finds or creates a new css_set which the child
6113
+ * process will be attached to in cgroup_post_fork(). By default,
6114
+ * the child process will be given the same css_set as its parent.
6115
+ *
6116
+ * If CLONE_INTO_CGROUP is specified this function will try to find an
6117
+ * existing css_set which includes the requested cgroup and if not create
6118
+ * a new css_set that the child will be attached to later. If this function
6119
+ * succeeds it will hold cgroup_threadgroup_rwsem on return. If
6120
+ * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
6121
+ * before grabbing cgroup_threadgroup_rwsem and will hold a reference
6122
+ * to the target cgroup.
6123
+ */
6124
+static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
6125
+ __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
6126
+{
6127
+ int ret;
6128
+ struct cgroup *dst_cgrp = NULL;
6129
+ struct css_set *cset;
6130
+ struct super_block *sb;
6131
+ struct file *f;
6132
+
6133
+ if (kargs->flags & CLONE_INTO_CGROUP)
6134
+ mutex_lock(&cgroup_mutex);
6135
+
6136
+ cgroup_threadgroup_change_begin(current);
6137
+
6138
+ spin_lock_irq(&css_set_lock);
6139
+ cset = task_css_set(current);
6140
+ get_css_set(cset);
6141
+ spin_unlock_irq(&css_set_lock);
6142
+
6143
+ if (!(kargs->flags & CLONE_INTO_CGROUP)) {
6144
+ kargs->cset = cset;
6145
+ return 0;
6146
+ }
6147
+
6148
+ f = fget_raw(kargs->cgroup);
6149
+ if (!f) {
6150
+ ret = -EBADF;
6151
+ goto err;
6152
+ }
6153
+ sb = f->f_path.dentry->d_sb;
6154
+
6155
+ dst_cgrp = cgroup_get_from_file(f);
6156
+ if (IS_ERR(dst_cgrp)) {
6157
+ ret = PTR_ERR(dst_cgrp);
6158
+ dst_cgrp = NULL;
6159
+ goto err;
6160
+ }
6161
+
6162
+ if (cgroup_is_dead(dst_cgrp)) {
6163
+ ret = -ENODEV;
6164
+ goto err;
6165
+ }
6166
+
6167
+ /*
6168
+ * Verify that the target cgroup is writable for us. This is
6169
+ * usually done by the vfs layer but since we're not going through
6170
+ * the vfs layer here we need to do it "manually".
6171
+ */
6172
+ ret = cgroup_may_write(dst_cgrp, sb);
6173
+ if (ret)
6174
+ goto err;
6175
+
6176
+ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
6177
+ !(kargs->flags & CLONE_THREAD),
6178
+ current->nsproxy->cgroup_ns);
6179
+ if (ret)
6180
+ goto err;
6181
+
6182
+ kargs->cset = find_css_set(cset, dst_cgrp);
6183
+ if (!kargs->cset) {
6184
+ ret = -ENOMEM;
6185
+ goto err;
6186
+ }
6187
+
6188
+ put_css_set(cset);
6189
+ fput(f);
6190
+ kargs->cgrp = dst_cgrp;
6191
+ return ret;
6192
+
6193
+err:
6194
+ cgroup_threadgroup_change_end(current);
6195
+ mutex_unlock(&cgroup_mutex);
6196
+ if (f)
6197
+ fput(f);
6198
+ if (dst_cgrp)
6199
+ cgroup_put(dst_cgrp);
6200
+ put_css_set(cset);
6201
+ if (kargs->cset)
6202
+ put_css_set(kargs->cset);
6203
+ return ret;
6204
+}
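/*
 * Editor's illustration, not part of this patch: cgroup_css_set_fork() is the
 * kernel side of clone3()'s CLONE_INTO_CGROUP. A minimal userspace sketch,
 * assuming the caller already opened the target cgroup directory on a v2
 * hierarchy; error handling is elided.
 */
#include <signal.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/sched.h>	/* struct clone_args, CLONE_INTO_CGROUP */

static pid_t fork_into_cgroup(int cgroup_fd)
{
	struct clone_args args;

	memset(&args, 0, sizeof(args));
	args.flags = CLONE_INTO_CGROUP;
	args.exit_signal = SIGCHLD;
	args.cgroup = cgroup_fd;	/* fd of the target cgroup directory */

	/* On success the child starts attached to the target cgroup's css_set. */
	return syscall(__NR_clone3, &args, sizeof(args));
}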
6205
+
6206
+/**
6207
+ * cgroup_css_set_put_fork - drop references we took during fork
6208
+ * @kargs: the arguments passed to create the child process
6209
+ *
6210
+ * Drop references to the prepared css_set and target cgroup if
6211
+ * CLONE_INTO_CGROUP was requested.
6212
+ */
6213
+static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
6214
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
6215
+{
6216
+ struct cgroup *cgrp = kargs->cgrp;
6217
+ struct css_set *cset = kargs->cset;
6218
+
6219
+ cgroup_threadgroup_change_end(current);
6220
+
6221
+ if (cset) {
6222
+ put_css_set(cset);
6223
+ kargs->cset = NULL;
6224
+ }
6225
+
6226
+ if (kargs->flags & CLONE_INTO_CGROUP) {
6227
+ mutex_unlock(&cgroup_mutex);
6228
+ if (cgrp) {
6229
+ cgroup_put(cgrp);
6230
+ kargs->cgrp = NULL;
6231
+ }
6232
+ }
6233
+}
6234
+
58626235 /**
58636236 * cgroup_can_fork - called on a new task before the process is exposed
5864
- * @child: the task in question.
6237
+ * @child: the child process
58656238 *
5866
- * This calls the subsystem can_fork() callbacks. If the can_fork() callback
5867
- * returns an error, the fork aborts with that error code. This allows for
5868
- * a cgroup subsystem to conditionally allow or deny new forks.
6239
+ * This prepares a new css_set for the child process which the child will
6240
+ * be attached to in cgroup_post_fork().
6241
+ * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
6242
+ * callback returns an error, the fork aborts with that error code. This
6243
+ * allows for a cgroup subsystem to conditionally allow or deny new forks.
58696244 */
5870
-int cgroup_can_fork(struct task_struct *child)
6245
+int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
58716246 {
58726247 struct cgroup_subsys *ss;
58736248 int i, j, ret;
58746249
6250
+ ret = cgroup_css_set_fork(kargs);
6251
+ if (ret)
6252
+ return ret;
6253
+
58756254 do_each_subsys_mask(ss, i, have_canfork_callback) {
5876
- ret = ss->can_fork(child);
6255
+ ret = ss->can_fork(child, kargs->cset);
58776256 if (ret)
58786257 goto out_revert;
58796258 } while_each_subsys_mask();
....@@ -5885,97 +6264,86 @@
58856264 if (j >= i)
58866265 break;
58876266 if (ss->cancel_fork)
5888
- ss->cancel_fork(child);
6267
+ ss->cancel_fork(child, kargs->cset);
58896268 }
6269
+
6270
+ cgroup_css_set_put_fork(kargs);
58906271
58916272 return ret;
58926273 }
58936274
58946275 /**
58956276 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
5896
- * @child: the task in question
6277
+ * @child: the child process
6278
+ * @kargs: the arguments passed to create the child process
58976279 *
58986280 * This calls the cancel_fork() callbacks if a fork failed *after*
5899
- * cgroup_can_fork() succeded.
6281
+ * cgroup_can_fork() succeeded and cleans up references we took to
6282
+ * prepare a new css_set for the child process in cgroup_can_fork().
59006283 */
5901
-void cgroup_cancel_fork(struct task_struct *child)
6284
+void cgroup_cancel_fork(struct task_struct *child,
6285
+ struct kernel_clone_args *kargs)
59026286 {
59036287 struct cgroup_subsys *ss;
59046288 int i;
59056289
59066290 for_each_subsys(ss, i)
59076291 if (ss->cancel_fork)
5908
- ss->cancel_fork(child);
6292
+ ss->cancel_fork(child, kargs->cset);
6293
+
6294
+ cgroup_css_set_put_fork(kargs);
59096295 }
59106296
59116297 /**
5912
- * cgroup_post_fork - called on a new task after adding it to the task list
5913
- * @child: the task in question
6298
+ * cgroup_post_fork - finalize cgroup setup for the child process
6299
+ * @child: the child process
59146300 *
5915
- * Adds the task to the list running through its css_set if necessary and
5916
- * call the subsystem fork() callbacks. Has to be after the task is
5917
- * visible on the task list in case we race with the first call to
5918
- * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5919
- * list.
6301
+ * Attach the child process to its css_set, calling the subsystem fork()
6302
+ * callbacks.
59206303 */
5921
-void cgroup_post_fork(struct task_struct *child)
6304
+void cgroup_post_fork(struct task_struct *child,
6305
+ struct kernel_clone_args *kargs)
6306
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
59226307 {
59236308 struct cgroup_subsys *ss;
6309
+ struct css_set *cset;
59246310 int i;
59256311
5926
- /*
5927
- * This may race against cgroup_enable_task_cg_lists(). As that
5928
- * function sets use_task_css_set_links before grabbing
5929
- * tasklist_lock and we just went through tasklist_lock to add
5930
- * @child, it's guaranteed that either we see the set
5931
- * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
5932
- * @child during its iteration.
5933
- *
5934
- * If we won the race, @child is associated with %current's
5935
- * css_set. Grabbing css_set_lock guarantees both that the
5936
- * association is stable, and, on completion of the parent's
5937
- * migration, @child is visible in the source of migration or
5938
- * already in the destination cgroup. This guarantee is necessary
5939
- * when implementing operations which need to migrate all tasks of
5940
- * a cgroup to another.
5941
- *
5942
- * Note that if we lose to cgroup_enable_task_cg_lists(), @child
5943
- * will remain in init_css_set. This is safe because all tasks are
5944
- * in the init_css_set before cg_links is enabled and there's no
5945
- * operation which transfers all tasks out of init_css_set.
5946
- */
5947
- if (use_task_css_set_links) {
5948
- struct css_set *cset;
6312
+ cset = kargs->cset;
6313
+ kargs->cset = NULL;
59496314
5950
- spin_lock_irq(&css_set_lock);
5951
- cset = task_css_set(current);
5952
- if (list_empty(&child->cg_list)) {
5953
- get_css_set(cset);
5954
- cset->nr_tasks++;
5955
- css_set_move_task(child, NULL, cset, false);
5956
- }
6315
+ spin_lock_irq(&css_set_lock);
6316
+
6317
+ /* init tasks are special, only link regular threads */
6318
+ if (likely(child->pid)) {
6319
+ WARN_ON_ONCE(!list_empty(&child->cg_list));
6320
+ cset->nr_tasks++;
6321
+ css_set_move_task(child, NULL, cset, false);
6322
+ } else {
6323
+ put_css_set(cset);
6324
+ cset = NULL;
6325
+ }
6326
+
6327
+ /*
6328
+ * If the cgroup has to be frozen, the new task has too. Let's set
6329
+ * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
6330
+ * frozen state.
6331
+ */
6332
+ if (unlikely(cgroup_task_freeze(child))) {
6333
+ spin_lock(&child->sighand->siglock);
6334
+ WARN_ON_ONCE(child->frozen);
6335
+ child->jobctl |= JOBCTL_TRAP_FREEZE;
6336
+ spin_unlock(&child->sighand->siglock);
59576337
59586338 /*
5959
- * If the cgroup has to be frozen, the new task has too.
5960
- * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get
5961
- * the task into the frozen state.
6339
+ * Calling cgroup_update_frozen() isn't required here,
6340
+ * because it will be called anyway a bit later from
6341
+ * do_freezer_trap(). So we avoid cgroup's transient switch
6342
+ * from the frozen state and back.
59626343 */
5963
- if (unlikely(cgroup_task_freeze(child))) {
5964
- spin_lock(&child->sighand->siglock);
5965
- WARN_ON_ONCE(child->frozen);
5966
- child->jobctl |= JOBCTL_TRAP_FREEZE;
5967
- spin_unlock(&child->sighand->siglock);
5968
-
5969
- /*
5970
- * Calling cgroup_update_frozen() isn't required here,
5971
- * because it will be called anyway a bit later
5972
- * from do_freezer_trap(). So we avoid cgroup's
5973
- * transient switch from the frozen state and back.
5974
- */
5975
- }
5976
-
5977
- spin_unlock_irq(&css_set_lock);
59786344 }
6345
+
6346
+ spin_unlock_irq(&css_set_lock);
59796347
59806348 /*
59816349 * Call ss->fork(). This must happen after @child is linked on
....@@ -5985,26 +6353,25 @@
59856353 do_each_subsys_mask(ss, i, have_fork_callback) {
59866354 ss->fork(child);
59876355 } while_each_subsys_mask();
6356
+
6357
+ /* Make the new cset the root_cset of the new cgroup namespace. */
6358
+ if (kargs->flags & CLONE_NEWCGROUP) {
6359
+ struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
6360
+
6361
+ get_css_set(cset);
6362
+ child->nsproxy->cgroup_ns->root_cset = cset;
6363
+ put_css_set(rcset);
6364
+ }
6365
+
6366
+ cgroup_css_set_put_fork(kargs);
59886367 }
59896368
59906369 /**
59916370 * cgroup_exit - detach cgroup from exiting task
59926371 * @tsk: pointer to task_struct of exiting process
59936372 *
5994
- * Description: Detach cgroup from @tsk and release it.
6373
+ * Description: Detach cgroup from @tsk.
59956374 *
5996
- * Note that cgroups marked notify_on_release force every task in
5997
- * them to take the global cgroup_mutex mutex when exiting.
5998
- * This could impact scaling on very large systems. Be reluctant to
5999
- * use notify_on_release cgroups where very high task exit scaling
6000
- * is required on large systems.
6001
- *
6002
- * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
6003
- * call cgroup_exit() while the task is still competent to handle
6004
- * notify_on_release(), then leave the task attached to the root cgroup in
6005
- * each hierarchy for the remainder of its exit. No need to bother with
6006
- * init_css_set refcnting. init_css_set never goes away and we can't race
6007
- * with migration path - PF_EXITING is visible to migration path.
60086375 */
60096376 void cgroup_exit(struct task_struct *tsk)
60106377 {
....@@ -6012,27 +6379,22 @@
60126379 struct css_set *cset;
60136380 int i;
60146381
6015
- /*
6016
- * Unlink from @tsk from its css_set. As migration path can't race
6017
- * with us, we can check css_set and cg_list without synchronization.
6018
- */
6382
+ spin_lock_irq(&css_set_lock);
6383
+
6384
+ WARN_ON_ONCE(list_empty(&tsk->cg_list));
60196385 cset = task_css_set(tsk);
6386
+ css_set_move_task(tsk, cset, NULL, false);
6387
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
6388
+ cset->nr_tasks--;
60206389
6021
- if (!list_empty(&tsk->cg_list)) {
6022
- spin_lock_irq(&css_set_lock);
6023
- css_set_move_task(tsk, cset, NULL, false);
6024
- list_add_tail(&tsk->cg_list, &cset->dying_tasks);
6025
- cset->nr_tasks--;
6390
+ if (dl_task(tsk))
6391
+ dec_dl_tasks_cs(tsk);
60266392
6027
- if (unlikely(cgroup_task_frozen(tsk)))
6028
- cgroup_freezer_frozen_exit(tsk);
6029
- else if (unlikely(cgroup_task_freeze(tsk)))
6030
- cgroup_update_frozen(task_dfl_cgroup(tsk));
6393
+ WARN_ON_ONCE(cgroup_task_frozen(tsk));
6394
+ if (unlikely(cgroup_task_freeze(tsk)))
6395
+ cgroup_update_frozen(task_dfl_cgroup(tsk));
60316396
6032
- spin_unlock_irq(&css_set_lock);
6033
- } else {
6034
- get_css_set(cset);
6035
- }
6397
+ spin_unlock_irq(&css_set_lock);
60366398
60376399 /* see cgroup_post_fork() for details */
60386400 do_each_subsys_mask(ss, i, have_exit_callback) {
....@@ -6049,12 +6411,10 @@
60496411 ss->release(task);
60506412 } while_each_subsys_mask();
60516413
6052
- if (use_task_css_set_links) {
6053
- spin_lock_irq(&css_set_lock);
6054
- css_set_skip_task_iters(task_css_set(task), task);
6055
- list_del_init(&task->cg_list);
6056
- spin_unlock_irq(&css_set_lock);
6057
- }
6414
+ spin_lock_irq(&css_set_lock);
6415
+ css_set_skip_task_iters(task_css_set(task), task);
6416
+ list_del_init(&task->cg_list);
6417
+ spin_unlock_irq(&css_set_lock);
60586418 }
60596419
60606420 void cgroup_free(struct task_struct *task)
....@@ -6095,6 +6455,16 @@
60956455 return 1;
60966456 }
60976457 __setup("cgroup_disable=", cgroup_disable);
6458
+
6459
+void __init __weak enable_debug_cgroup(void) { }
6460
+
6461
+static int __init enable_cgroup_debug(char *str)
6462
+{
6463
+ cgroup_debug = true;
6464
+ enable_debug_cgroup();
6465
+ return 1;
6466
+}
6467
+__setup("cgroup_debug", enable_cgroup_debug);
60986468
60996469 /**
61006470 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
....@@ -6195,7 +6565,6 @@
61956565 */
61966566 struct cgroup *cgroup_get_from_fd(int fd)
61976567 {
6198
- struct cgroup_subsys_state *css;
61996568 struct cgroup *cgrp;
62006569 struct file *f;
62016570
....@@ -6203,17 +6572,8 @@
62036572 if (!f)
62046573 return ERR_PTR(-EBADF);
62056574
6206
- css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
6575
+ cgrp = cgroup_get_from_file(f);
62076576 fput(f);
6208
- if (IS_ERR(css))
6209
- return ERR_CAST(css);
6210
-
6211
- cgrp = css->cgroup;
6212
- if (!cgroup_on_dfl(cgrp)) {
6213
- cgroup_put(cgrp);
6214
- return ERR_PTR(-EBADF);
6215
- }
6216
-
62176577 return cgrp;
62186578 }
62196579 EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
....@@ -6304,6 +6664,7 @@
63046664 cset = task_css_set(current);
63056665 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
63066666 skcd->val = (unsigned long)cset->dfl_cgrp;
6667
+ cgroup_bpf_get(cset->dfl_cgrp);
63076668 break;
63086669 }
63096670 cpu_relax();
....@@ -6314,7 +6675,6 @@
63146675
63156676 void cgroup_sk_clone(struct sock_cgroup_data *skcd)
63166677 {
6317
- /* Socket clone path */
63186678 if (skcd->val) {
63196679 if (skcd->no_refcnt)
63206680 return;
....@@ -6324,40 +6684,48 @@
63246684 * Don't use cgroup_get_live().
63256685 */
63266686 cgroup_get(sock_cgroup_ptr(skcd));
6687
+ cgroup_bpf_get(sock_cgroup_ptr(skcd));
63276688 }
63286689 }
63296690
63306691 void cgroup_sk_free(struct sock_cgroup_data *skcd)
63316692 {
6693
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
6694
+
63326695 if (skcd->no_refcnt)
63336696 return;
6334
-
6335
- cgroup_put(sock_cgroup_ptr(skcd));
6697
+ cgroup_bpf_put(cgrp);
6698
+ cgroup_put(cgrp);
63366699 }
63376700
63386701 #endif /* CONFIG_SOCK_CGROUP_DATA */
63396702
63406703 #ifdef CONFIG_CGROUP_BPF
6341
-int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
6342
- enum bpf_attach_type type, u32 flags)
6704
+int cgroup_bpf_attach(struct cgroup *cgrp,
6705
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
6706
+ struct bpf_cgroup_link *link,
6707
+ enum bpf_attach_type type,
6708
+ u32 flags)
63436709 {
63446710 int ret;
63456711
63466712 mutex_lock(&cgroup_mutex);
6347
- ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
6713
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
63486714 mutex_unlock(&cgroup_mutex);
63496715 return ret;
63506716 }
6717
+
63516718 int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
6352
- enum bpf_attach_type type, u32 flags)
6719
+ enum bpf_attach_type type)
63536720 {
63546721 int ret;
63556722
63566723 mutex_lock(&cgroup_mutex);
6357
- ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
6724
+ ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
63586725 mutex_unlock(&cgroup_mutex);
63596726 return ret;
63606727 }
6728
+
63616729 int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
63626730 union bpf_attr __user *uattr)
63636731 {
....@@ -6418,7 +6786,10 @@
64186786 static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
64196787 char *buf)
64206788 {
6421
- return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
6789
+ return snprintf(buf, PAGE_SIZE,
6790
+ "nsdelegate\n"
6791
+ "memory_localevents\n"
6792
+ "memory_recursiveprot\n");
64226793 }
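/*
 * Editor's illustration, not part of this patch: userspace can discover which
 * cgroup2 mount options this kernel advertises by reading
 * /sys/kernel/cgroup/features, the file that features_show() above backs.
 * A minimal sketch; error handling is kept short.
 */
#include <stdio.h>

static void print_cgroup2_features(void)
{
	char line[64];
	FILE *f = fopen("/sys/kernel/cgroup/features", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "nsdelegate", "memory_recursiveprot" */
	fclose(f);
}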
64236794 static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
64246795