2024-05-10 748e4f3d702def1a4bff191e0cf93b6a05340f01
kernel/kernel/cgroup/cgroup.c
....@@ -54,12 +54,17 @@
5454 #include <linux/proc_ns.h>
5555 #include <linux/nsproxy.h>
5656 #include <linux/file.h>
57
+#include <linux/fs_parser.h>
5758 #include <linux/sched/cputime.h>
59
+#include <linux/sched/deadline.h>
5860 #include <linux/psi.h>
5961 #include <net/sock.h>
6062
6163 #define CREATE_TRACE_POINTS
6264 #include <trace/events/cgroup.h>
65
+#undef CREATE_TRACE_POINTS
66
+
67
+#include <trace/hooks/cgroup.h>
6368
6469 #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
6570 MAX_CFTYPE_NAME + 2)
....@@ -86,6 +91,7 @@
8691
8792 DEFINE_SPINLOCK(trace_cgroup_path_lock);
8893 char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
94
+bool cgroup_debug __read_mostly;
8995
9096 /*
9197 * Protects cgroup_idr and css_idr so that IDs can be released without
....@@ -99,7 +105,7 @@
99105 */
100106 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
101107
102
-struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
108
+DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
103109
104110 #define cgroup_assert_mutex_or_rcu_locked() \
105111 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
....@@ -151,11 +157,7 @@
151157
152158 static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
153159
154
-/*
155
- * The default hierarchy, reserved for the subsystems that are otherwise
156
- * unattached - it never has more than a single cgroup, and all tasks are
157
- * part of that cgroup.
158
- */
160
+/* the default hierarchy */
159161 struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
160162 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
161163
....@@ -264,9 +266,6 @@
264266 * can be used to test whether a cgroup is on the default hierarchy for
265267 * cases where a subsystem should behave differently depending on the
266268 * interface version.
267
- *
268
- * The set of behaviors which change on the default hierarchy are still
269
- * being determined and the mount option is prefixed with __DEVEL__.
270269 *
271270 * List of changed behaviors:
272271 *
....@@ -502,7 +501,7 @@
502501
503502 rcu_read_lock();
504503 css = cgroup_css(cgrp, ss);
505
- if (!css || !css_tryget_online(css))
504
+ if (css && !css_tryget_online(css))
506505 css = NULL;
507506 rcu_read_unlock();
508507
....@@ -510,7 +509,7 @@
510509 }
511510
512511 /**
513
- * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
512
+ * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
514513 * @cgrp: the cgroup of interest
515514 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
516515 *
....@@ -519,8 +518,8 @@
519518 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
520519 * function is guaranteed to return non-NULL css.
521520 */
522
-static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
523
- struct cgroup_subsys *ss)
521
+static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
522
+ struct cgroup_subsys *ss)
524523 {
525524 lockdep_assert_held(&cgroup_mutex);
526525
....@@ -538,6 +537,35 @@
538537 }
539538
540539 return cgroup_css(cgrp, ss);
540
+}
541
+
542
+/**
543
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
544
+ * @cgrp: the cgroup of interest
545
+ * @ss: the subsystem of interest
546
+ *
547
+ * Find and get the effective css of @cgrp for @ss. The effective css is
548
+ * defined as the matching css of the nearest ancestor including self which
549
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
550
+ * the root css is returned, so this function always returns a valid css.
551
+ *
552
+ * The returned css is not guaranteed to be online, and therefore it is the
553
+ * caller's responsibility to tryget a reference for it.
554
+ */
555
+struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
556
+ struct cgroup_subsys *ss)
557
+{
558
+ struct cgroup_subsys_state *css;
559
+
560
+ do {
561
+ css = cgroup_css(cgrp, ss);
562
+
563
+ if (css)
564
+ return css;
565
+ cgrp = cgroup_parent(cgrp);
566
+ } while (cgrp);
567
+
568
+ return init_css_set.subsys[ss->id];
541569 }
542570
543571 /**
....@@ -655,10 +683,11 @@
655683 *
656684 * Should be called under cgroup_[tree_]mutex.
657685 */
658
-#define for_each_e_css(css, ssid, cgrp) \
659
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
660
- if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
661
- ; \
686
+#define for_each_e_css(css, ssid, cgrp) \
687
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
688
+ if (!((css) = cgroup_e_css_by_mask(cgrp, \
689
+ cgroup_subsys[(ssid)]))) \
690
+ ; \
662691 else
663692
664693 /**
....@@ -718,25 +747,28 @@
718747 * reference-counted, to improve performance when child cgroups
719748 * haven't been created.
720749 */
721
-struct css_set init_css_set = {
722
- .refcount = REFCOUNT_INIT(1),
723
- .dom_cset = &init_css_set,
724
- .tasks = LIST_HEAD_INIT(init_css_set.tasks),
725
- .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
726
- .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
727
- .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
728
- .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
729
- .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
730
- .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
731
- .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
732
-
733
- /*
734
- * The following field is re-initialized when this cset gets linked
735
- * in cgroup_init(). However, let's initialize the field
736
- * statically too so that the default cgroup can be accessed safely
737
- * early during boot.
738
- */
739
- .dfl_cgrp = &cgrp_dfl_root.cgrp,
750
+struct ext_css_set init_ext_css_set = {
751
+ .cset = {
752
+ .refcount = REFCOUNT_INIT(1),
753
+ .dom_cset = &init_css_set,
754
+ .tasks = LIST_HEAD_INIT(init_css_set.tasks),
755
+ .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
756
+ .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
757
+ .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
758
+ .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
759
+ .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
760
+ .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
761
+ .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
762
+ /*
763
+ * The following field is re-initialized when this cset gets linked
764
+ * in cgroup_init(). However, let's initialize the field
765
+ * statically too so that the default cgroup can be accessed safely
766
+ * early during boot.
767
+ */
768
+ .dfl_cgrp = &cgrp_dfl_root.cgrp,
769
+ },
770
+ .mg_src_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_src_preload_node),
771
+ .mg_dst_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_dst_preload_node),
740772 };
741773
742774 static int css_set_count = 1; /* 1 for init_css_set */
....@@ -802,6 +834,8 @@
802834 break;
803835
804836 cgroup1_check_for_release(cgrp);
837
+ TRACE_CGROUP_PATH(notify_populated, cgrp,
838
+ cgroup_is_populated(cgrp));
805839 cgroup_file_notify(&cgrp->events_file);
806840
807841 child = cgrp;
....@@ -881,8 +915,7 @@
881915 /*
882916 * We are synchronized through cgroup_threadgroup_rwsem
883917 * against PF_EXITING setting such that we can't race
884
- * against cgroup_exit() changing the css_set to
885
- * init_css_set and dropping the old one.
918
+ * against cgroup_exit()/cgroup_free() dropping the css_set.
886919 */
887920 WARN_ON_ONCE(task->flags & PF_EXITING);
888921
....@@ -1060,7 +1093,7 @@
10601093 * @ss is in this hierarchy, so we want the
10611094 * effective css from @cgrp.
10621095 */
1063
- template[i] = cgroup_e_css(cgrp, ss);
1096
+ template[i] = cgroup_e_css_by_mask(cgrp, ss);
10641097 } else {
10651098 /*
10661099 * @ss is not in this hierarchy, so we don't want
....@@ -1162,6 +1195,7 @@
11621195 struct cgroup *cgrp)
11631196 {
11641197 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
1198
+ struct ext_css_set *ext_cset;
11651199 struct css_set *cset;
11661200 struct list_head tmp_links;
11671201 struct cgrp_cset_link *link;
....@@ -1182,9 +1216,10 @@
11821216 if (cset)
11831217 return cset;
11841218
1185
- cset = kzalloc(sizeof(*cset), GFP_KERNEL);
1186
- if (!cset)
1219
+ ext_cset = kzalloc(sizeof(*ext_cset), GFP_KERNEL);
1220
+ if (!ext_cset)
11871221 return NULL;
1222
+ cset = &ext_cset->cset;
11881223
11891224 /* Allocate all the cgrp_cset_link objects that we'll need */
11901225 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
....@@ -1202,6 +1237,8 @@
12021237 INIT_HLIST_NODE(&cset->hlist);
12031238 INIT_LIST_HEAD(&cset->cgrp_links);
12041239 INIT_LIST_HEAD(&cset->mg_preload_node);
1240
+ INIT_LIST_HEAD(&ext_cset->mg_src_preload_node);
1241
+ INIT_LIST_HEAD(&ext_cset->mg_dst_preload_node);
12051242 INIT_LIST_HEAD(&cset->mg_node);
12061243
12071244 /* Copy the set of subsystem state objects generated in
....@@ -1291,10 +1328,7 @@
12911328
12921329 void cgroup_free_root(struct cgroup_root *root)
12931330 {
1294
- if (root) {
1295
- idr_destroy(&root->cgroup_idr);
1296
- kfree(root);
1297
- }
1331
+ kfree(root);
12981332 }
12991333
13001334 static void cgroup_destroy_root(struct cgroup_root *root)
....@@ -1356,6 +1390,8 @@
13561390 cset = current->nsproxy->cgroup_ns->root_cset;
13571391 if (cset == &init_css_set) {
13581392 res = &root->cgrp;
1393
+ } else if (root == &cgrp_dfl_root) {
1394
+ res = cset->dfl_cgrp;
13591395 } else {
13601396 struct cgrp_cset_link *link;
13611397
....@@ -1412,9 +1448,8 @@
14121448 struct cgroup_root *root)
14131449 {
14141450 /*
1415
- * No need to lock the task - since we hold cgroup_mutex the
1416
- * task can't change groups, so the only thing that can happen
1417
- * is that it exits and its css is set back to init_css_set.
1451
+ * No need to lock the task - since we hold css_set_lock the
1452
+ * task can't change groups.
14181453 */
14191454 return cset_cgroup_from_root(task_css_set(task), root);
14201455 }
....@@ -1453,12 +1488,15 @@
14531488 struct cgroup_subsys *ss = cft->ss;
14541489
14551490 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1456
- !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
1457
- snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
1458
- cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1491
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
1492
+ const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
1493
+
1494
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
1495
+ dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
14591496 cft->name);
1460
- else
1497
+ } else {
14611498 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1499
+ }
14621500 return buf;
14631501 }
14641502
....@@ -1699,7 +1737,7 @@
16991737 {
17001738 struct cgroup *dcgrp = &dst_root->cgrp;
17011739 struct cgroup_subsys *ss;
1702
- int ssid, i, ret;
1740
+ int ssid, ret;
17031741 u16 dfl_disable_ss_mask = 0;
17041742
17051743 lockdep_assert_held(&cgroup_mutex);
....@@ -1743,7 +1781,8 @@
17431781 struct cgroup_root *src_root = ss->root;
17441782 struct cgroup *scgrp = &src_root->cgrp;
17451783 struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
1746
- struct css_set *cset;
1784
+ struct css_set *cset, *cset_pos;
1785
+ struct css_task_iter *it;
17471786
17481787 WARN_ON(!css || cgroup_css(dcgrp, ss));
17491788
....@@ -1761,9 +1800,22 @@
17611800 css->cgroup = dcgrp;
17621801
17631802 spin_lock_irq(&css_set_lock);
1764
- hash_for_each(css_set_table, i, cset, hlist)
1803
+ WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
1804
+ list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
1805
+ e_cset_node[ss->id]) {
17651806 list_move_tail(&cset->e_cset_node[ss->id],
17661807 &dcgrp->e_csets[ss->id]);
1808
+ /*
1809
+ * All css_sets of scgrp are moved to dcgrp in the same order;
1810
+ * patch in-flight iterators to preserve correct iteration.
1811
+ * Since an iterator is always advanced right away and is
1812
+ * finished when it->cset_pos meets it->cset_head, updating
1813
+ * it->cset_head here is enough.
1814
+ */
1815
+ list_for_each_entry(it, &cset->task_iters, iters_node)
1816
+ if (it->cset_head == &scgrp->e_csets[ss->id])
1817
+ it->cset_head = &dcgrp->e_csets[ss->id];
1818
+ }
17671819 spin_unlock_irq(&css_set_lock);
17681820
17691821 /* default hierarchy doesn't enable controllers by default */
....@@ -1815,26 +1867,42 @@
18151867 return len;
18161868 }
18171869
1818
-static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
1870
+enum cgroup2_param {
1871
+ Opt_nsdelegate,
1872
+ Opt_memory_localevents,
1873
+ Opt_memory_recursiveprot,
1874
+ nr__cgroup2_params
1875
+};
1876
+
1877
+static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
1878
+ fsparam_flag("nsdelegate", Opt_nsdelegate),
1879
+ fsparam_flag("memory_localevents", Opt_memory_localevents),
1880
+ fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
1881
+ {}
1882
+};
1883
+
1884
+static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
18191885 {
1820
- char *token;
1886
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1887
+ struct fs_parse_result result;
1888
+ int opt;
18211889
1822
- *root_flags = 0;
1890
+ opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
1891
+ if (opt < 0)
1892
+ return opt;
18231893
1824
- if (!data || *data == '\0')
1894
+ switch (opt) {
1895
+ case Opt_nsdelegate:
1896
+ ctx->flags |= CGRP_ROOT_NS_DELEGATE;
18251897 return 0;
1826
-
1827
- while ((token = strsep(&data, ",")) != NULL) {
1828
- if (!strcmp(token, "nsdelegate")) {
1829
- *root_flags |= CGRP_ROOT_NS_DELEGATE;
1830
- continue;
1831
- }
1832
-
1833
- pr_err("cgroup2: unknown option \"%s\"\n", token);
1834
- return -EINVAL;
1898
+ case Opt_memory_localevents:
1899
+ ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1900
+ return 0;
1901
+ case Opt_memory_recursiveprot:
1902
+ ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1903
+ return 0;
18351904 }
1836
-
1837
- return 0;
1905
+ return -EINVAL;
18381906 }
18391907
18401908 static void apply_cgroup_root_flags(unsigned int root_flags)
....@@ -1844,6 +1912,16 @@
18441912 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
18451913 else
18461914 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1915
+
1916
+ if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1917
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1918
+ else
1919
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1920
+
1921
+ if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1922
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1923
+ else
1924
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
18471925 }
18481926 }
18491927
....@@ -1851,79 +1929,19 @@
18511929 {
18521930 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
18531931 seq_puts(seq, ",nsdelegate");
1932
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1933
+ seq_puts(seq, ",memory_localevents");
1934
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1935
+ seq_puts(seq, ",memory_recursiveprot");
18541936 return 0;
18551937 }
18561938
1857
-static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1939
+static int cgroup_reconfigure(struct fs_context *fc)
18581940 {
1859
- unsigned int root_flags;
1860
- int ret;
1941
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
18611942
1862
- ret = parse_cgroup_root_flags(data, &root_flags);
1863
- if (ret)
1864
- return ret;
1865
-
1866
- apply_cgroup_root_flags(root_flags);
1943
+ apply_cgroup_root_flags(ctx->flags);
18671944 return 0;
1868
-}
1869
-
1870
-/*
1871
- * To reduce the fork() overhead for systems that are not actually using
1872
- * their cgroups capability, we don't maintain the lists running through
1873
- * each css_set to its tasks until we see the list actually used - in other
1874
- * words after the first mount.
1875
- */
1876
-static bool use_task_css_set_links __read_mostly;
1877
-
1878
-static void cgroup_enable_task_cg_lists(void)
1879
-{
1880
- struct task_struct *p, *g;
1881
-
1882
- /*
1883
- * We need tasklist_lock because RCU is not safe against
1884
- * while_each_thread(). Besides, a forking task that has passed
1885
- * cgroup_post_fork() without seeing use_task_css_set_links = 1
1886
- * is not guaranteed to have its child immediately visible in the
1887
- * tasklist if we walk through it with RCU.
1888
- */
1889
- read_lock(&tasklist_lock);
1890
- spin_lock_irq(&css_set_lock);
1891
-
1892
- if (use_task_css_set_links)
1893
- goto out_unlock;
1894
-
1895
- use_task_css_set_links = true;
1896
-
1897
- do_each_thread(g, p) {
1898
- WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1899
- task_css_set(p) != &init_css_set);
1900
-
1901
- /*
1902
- * We should check if the process is exiting, otherwise
1903
- * it will race with cgroup_exit() in that the list
1904
- * entry won't be deleted though the process has exited.
1905
- * Do it while holding siglock so that we don't end up
1906
- * racing against cgroup_exit().
1907
- *
1908
- * Interrupts were already disabled while acquiring
1909
- * the css_set_lock, so we do not need to disable it
1910
- * again when acquiring the sighand->siglock here.
1911
- */
1912
- spin_lock(&p->sighand->siglock);
1913
- if (!(p->flags & PF_EXITING)) {
1914
- struct css_set *cset = task_css_set(p);
1915
-
1916
- if (!css_set_populated(cset))
1917
- css_set_update_populated(cset, true);
1918
- list_add_tail(&p->cg_list, &cset->tasks);
1919
- get_css_set(cset);
1920
- cset->nr_tasks++;
1921
- }
1922
- spin_unlock(&p->sighand->siglock);
1923
- } while_each_thread(g, p);
1924
-out_unlock:
1925
- spin_unlock_irq(&css_set_lock);
1926
- read_unlock(&tasklist_lock);
19271945 }
19281946
19291947 static void init_cgroup_housekeeping(struct cgroup *cgrp)
....@@ -1951,22 +1969,22 @@
19511969 INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
19521970 }
19531971
1954
-void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
1972
+void init_cgroup_root(struct cgroup_fs_context *ctx)
19551973 {
1974
+ struct cgroup_root *root = ctx->root;
19561975 struct cgroup *cgrp = &root->cgrp;
19571976
19581977 INIT_LIST_HEAD(&root->root_list);
19591978 atomic_set(&root->nr_cgrps, 1);
19601979 cgrp->root = root;
19611980 init_cgroup_housekeeping(cgrp);
1962
- idr_init(&root->cgroup_idr);
19631981
1964
- root->flags = opts->flags;
1965
- if (opts->release_agent)
1966
- strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
1967
- if (opts->name)
1968
- strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
1969
- if (opts->cpuset_clone_children)
1982
+ root->flags = ctx->flags;
1983
+ if (ctx->release_agent)
1984
+ strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
1985
+ if (ctx->name)
1986
+ strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
1987
+ if (ctx->cpuset_clone_children)
19701988 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
19711989 }
19721990
....@@ -1979,12 +1997,6 @@
19791997 int i, ret;
19801998
19811999 lockdep_assert_held(&cgroup_mutex);
1982
-
1983
- ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
1984
- if (ret < 0)
1985
- goto out;
1986
- root_cgrp->id = ret;
1987
- root_cgrp->ancestor_ids[0] = ret;
19882000
19892001 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
19902002 0, GFP_KERNEL);
....@@ -2011,13 +2023,16 @@
20112023
20122024 root->kf_root = kernfs_create_root(kf_sops,
20132025 KERNFS_ROOT_CREATE_DEACTIVATED |
2014
- KERNFS_ROOT_SUPPORT_EXPORTOP,
2026
+ KERNFS_ROOT_SUPPORT_EXPORTOP |
2027
+ KERNFS_ROOT_SUPPORT_USER_XATTR,
20152028 root_cgrp);
20162029 if (IS_ERR(root->kf_root)) {
20172030 ret = PTR_ERR(root->kf_root);
20182031 goto exit_root_id;
20192032 }
20202033 root_cgrp->kn = root->kf_root->kn;
2034
+ WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
2035
+ root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
20212036
20222037 ret = css_populate_dir(&root_cgrp->self);
20232038 if (ret)
....@@ -2055,7 +2070,6 @@
20552070 BUG_ON(!list_empty(&root_cgrp->self.children));
20562071 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
20572072
2058
- kernfs_activate(root_cgrp->kn);
20592073 ret = 0;
20602074 goto out;
20612075
....@@ -2071,91 +2085,117 @@
20712085 return ret;
20722086 }
20732087
2074
-struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
2075
- struct cgroup_root *root, unsigned long magic,
2076
- struct cgroup_namespace *ns)
2088
+int cgroup_do_get_tree(struct fs_context *fc)
20772089 {
2078
- struct dentry *dentry;
2079
- bool new_sb = false;
2090
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2091
+ int ret;
20802092
2081
- dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
2093
+ ctx->kfc.root = ctx->root->kf_root;
2094
+ if (fc->fs_type == &cgroup2_fs_type)
2095
+ ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
2096
+ else
2097
+ ctx->kfc.magic = CGROUP_SUPER_MAGIC;
2098
+ ret = kernfs_get_tree(fc);
20822099
20832100 /*
20842101 * In non-init cgroup namespace, instead of root cgroup's dentry,
20852102 * we return the dentry corresponding to the cgroupns->root_cgrp.
20862103 */
2087
- if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
2104
+ if (!ret && ctx->ns != &init_cgroup_ns) {
20882105 struct dentry *nsdentry;
2089
- struct super_block *sb = dentry->d_sb;
2106
+ struct super_block *sb = fc->root->d_sb;
20902107 struct cgroup *cgrp;
20912108
20922109 mutex_lock(&cgroup_mutex);
20932110 spin_lock_irq(&css_set_lock);
20942111
2095
- cgrp = cset_cgroup_from_root(ns->root_cset, root);
2112
+ cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
20962113
20972114 spin_unlock_irq(&css_set_lock);
20982115 mutex_unlock(&cgroup_mutex);
20992116
21002117 nsdentry = kernfs_node_dentry(cgrp->kn, sb);
2101
- dput(dentry);
2102
- if (IS_ERR(nsdentry))
2118
+ dput(fc->root);
2119
+ if (IS_ERR(nsdentry)) {
21032120 deactivate_locked_super(sb);
2104
- dentry = nsdentry;
2121
+ ret = PTR_ERR(nsdentry);
2122
+ nsdentry = NULL;
2123
+ }
2124
+ fc->root = nsdentry;
21052125 }
21062126
2107
- if (!new_sb)
2108
- cgroup_put(&root->cgrp);
2127
+ if (!ctx->kfc.new_sb_created)
2128
+ cgroup_put(&ctx->root->cgrp);
21092129
2110
- return dentry;
2130
+ return ret;
21112131 }
21122132
2113
-static struct dentry *cgroup_mount(struct file_system_type *fs_type,
2114
- int flags, const char *unused_dev_name,
2115
- void *data)
2133
+/*
2134
+ * Destroy a cgroup filesystem context.
2135
+ */
2136
+static void cgroup_fs_context_free(struct fs_context *fc)
21162137 {
2117
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2118
- struct dentry *dentry;
2138
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2139
+
2140
+ kfree(ctx->name);
2141
+ kfree(ctx->release_agent);
2142
+ put_cgroup_ns(ctx->ns);
2143
+ kernfs_free_fs_context(fc);
2144
+ kfree(ctx);
2145
+}
2146
+
2147
+static int cgroup_get_tree(struct fs_context *fc)
2148
+{
2149
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
21192150 int ret;
21202151
2121
- get_cgroup_ns(ns);
2152
+ cgrp_dfl_visible = true;
2153
+ cgroup_get_live(&cgrp_dfl_root.cgrp);
2154
+ ctx->root = &cgrp_dfl_root;
21222155
2123
- /* Check if the caller has permission to mount. */
2124
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
2125
- put_cgroup_ns(ns);
2126
- return ERR_PTR(-EPERM);
2127
- }
2156
+ ret = cgroup_do_get_tree(fc);
2157
+ if (!ret)
2158
+ apply_cgroup_root_flags(ctx->flags);
2159
+ return ret;
2160
+}
21282161
2129
- /*
2130
- * The first time anyone tries to mount a cgroup, enable the list
2131
- * linking each css_set to its tasks and fix up all existing tasks.
2132
- */
2133
- if (!use_task_css_set_links)
2134
- cgroup_enable_task_cg_lists();
2162
+static const struct fs_context_operations cgroup_fs_context_ops = {
2163
+ .free = cgroup_fs_context_free,
2164
+ .parse_param = cgroup2_parse_param,
2165
+ .get_tree = cgroup_get_tree,
2166
+ .reconfigure = cgroup_reconfigure,
2167
+};
21352168
2136
- if (fs_type == &cgroup2_fs_type) {
2137
- unsigned int root_flags;
2169
+static const struct fs_context_operations cgroup1_fs_context_ops = {
2170
+ .free = cgroup_fs_context_free,
2171
+ .parse_param = cgroup1_parse_param,
2172
+ .get_tree = cgroup1_get_tree,
2173
+ .reconfigure = cgroup1_reconfigure,
2174
+};
21382175
2139
- ret = parse_cgroup_root_flags(data, &root_flags);
2140
- if (ret) {
2141
- put_cgroup_ns(ns);
2142
- return ERR_PTR(ret);
2143
- }
2176
+/*
2177
+ * Initialise the cgroup filesystem creation/reconfiguration context. Notably,
2178
+ * we select the namespace we're going to use.
2179
+ */
2180
+static int cgroup_init_fs_context(struct fs_context *fc)
2181
+{
2182
+ struct cgroup_fs_context *ctx;
21442183
2145
- cgrp_dfl_visible = true;
2146
- cgroup_get_live(&cgrp_dfl_root.cgrp);
2184
+ ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
2185
+ if (!ctx)
2186
+ return -ENOMEM;
21472187
2148
- dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
2149
- CGROUP2_SUPER_MAGIC, ns);
2150
- if (!IS_ERR(dentry))
2151
- apply_cgroup_root_flags(root_flags);
2152
- } else {
2153
- dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
2154
- CGROUP_SUPER_MAGIC, ns);
2155
- }
2156
-
2157
- put_cgroup_ns(ns);
2158
- return dentry;
2188
+ ctx->ns = current->nsproxy->cgroup_ns;
2189
+ get_cgroup_ns(ctx->ns);
2190
+ fc->fs_private = &ctx->kfc;
2191
+ if (fc->fs_type == &cgroup2_fs_type)
2192
+ fc->ops = &cgroup_fs_context_ops;
2193
+ else
2194
+ fc->ops = &cgroup1_fs_context_ops;
2195
+ put_user_ns(fc->user_ns);
2196
+ fc->user_ns = get_user_ns(ctx->ns->user_ns);
2197
+ fc->global = true;
2198
+ return 0;
21592199 }
21602200
21612201 static void cgroup_kill_sb(struct super_block *sb)
....@@ -2171,25 +2211,73 @@
21712211 * And don't kill the default root.
21722212 */
21732213 if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
2174
- !percpu_ref_is_dying(&root->cgrp.self.refcnt))
2214
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
2215
+ cgroup_bpf_offline(&root->cgrp);
21752216 percpu_ref_kill(&root->cgrp.self.refcnt);
2217
+ }
21762218 cgroup_put(&root->cgrp);
21772219 kernfs_kill_sb(sb);
21782220 }
21792221
21802222 struct file_system_type cgroup_fs_type = {
2181
- .name = "cgroup",
2182
- .mount = cgroup_mount,
2183
- .kill_sb = cgroup_kill_sb,
2184
- .fs_flags = FS_USERNS_MOUNT,
2223
+ .name = "cgroup",
2224
+ .init_fs_context = cgroup_init_fs_context,
2225
+ .parameters = cgroup1_fs_parameters,
2226
+ .kill_sb = cgroup_kill_sb,
2227
+ .fs_flags = FS_USERNS_MOUNT,
21852228 };
21862229
21872230 static struct file_system_type cgroup2_fs_type = {
2188
- .name = "cgroup2",
2189
- .mount = cgroup_mount,
2190
- .kill_sb = cgroup_kill_sb,
2191
- .fs_flags = FS_USERNS_MOUNT,
2231
+ .name = "cgroup2",
2232
+ .init_fs_context = cgroup_init_fs_context,
2233
+ .parameters = cgroup2_fs_parameters,
2234
+ .kill_sb = cgroup_kill_sb,
2235
+ .fs_flags = FS_USERNS_MOUNT,
21922236 };
2237
+
2238
+#ifdef CONFIG_CPUSETS
2239
+static const struct fs_context_operations cpuset_fs_context_ops = {
2240
+ .get_tree = cgroup1_get_tree,
2241
+ .free = cgroup_fs_context_free,
2242
+};
2243
+
2244
+/*
2245
+ * This is ugly, but preserves the userspace API for existing cpuset
2246
+ * users. If someone tries to mount the "cpuset" filesystem, we
2247
+ * silently switch it to mount "cgroup" instead
2248
+ */
2249
+static int cpuset_init_fs_context(struct fs_context *fc)
2250
+{
2251
+ char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
2252
+ struct cgroup_fs_context *ctx;
2253
+ int err;
2254
+
2255
+ err = cgroup_init_fs_context(fc);
2256
+ if (err) {
2257
+ kfree(agent);
2258
+ return err;
2259
+ }
2260
+
2261
+ fc->ops = &cpuset_fs_context_ops;
2262
+
2263
+ ctx = cgroup_fc2context(fc);
2264
+ ctx->subsys_mask = 1 << cpuset_cgrp_id;
2265
+ ctx->flags |= CGRP_ROOT_NOPREFIX;
2266
+ ctx->release_agent = agent;
2267
+
2268
+ get_filesystem(&cgroup_fs_type);
2269
+ put_filesystem(fc->fs_type);
2270
+ fc->fs_type = &cgroup_fs_type;
2271
+
2272
+ return 0;
2273
+}
2274
+
2275
+static struct file_system_type cpuset_fs_type = {
2276
+ .name = "cpuset",
2277
+ .init_fs_context = cpuset_init_fs_context,
2278
+ .fs_flags = FS_USERNS_MOUNT,
2279
+};
2280
+#endif
21932281
21942282 int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
21952283 struct cgroup_namespace *ns)
....@@ -2256,6 +2344,47 @@
22562344 EXPORT_SYMBOL_GPL(task_cgroup_path);
22572345
22582346 /**
2347
+ * cgroup_attach_lock - Lock for ->attach()
2348
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
2349
+ *
2350
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
2351
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
2352
+ * implementations (e.g. cpuset) also need to disable CPU hotplug.
2353
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
2354
+ * lead to deadlocks.
2355
+ *
2356
+ * Bringing up a CPU may involve creating and destroying tasks which requires
2357
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
2358
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
2359
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
2360
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
2361
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
2362
+ *
2363
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
2364
+ *
2365
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
2366
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
2367
+ * CPU hotplug is disabled on entry.
2368
+ */
2369
+static void cgroup_attach_lock(bool lock_threadgroup)
2370
+{
2371
+ cpus_read_lock();
2372
+ if (lock_threadgroup)
2373
+ percpu_down_write(&cgroup_threadgroup_rwsem);
2374
+}
2375
+
2376
+/**
2377
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
2378
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
2379
+ */
2380
+static void cgroup_attach_unlock(bool lock_threadgroup)
2381
+{
2382
+ if (lock_threadgroup)
2383
+ percpu_up_write(&cgroup_threadgroup_rwsem);
2384
+ cpus_read_unlock();
2385
+}
2386
+
2387
+/**
22592388 * cgroup_migrate_add_task - add a migration target task to a migration context
22602389 * @task: target task
22612390 * @mgctx: target migration context
....@@ -2276,9 +2405,8 @@
22762405 if (task->flags & PF_EXITING)
22772406 return;
22782407
2279
- /* leave @task alone if post_fork() hasn't linked it yet */
2280
- if (list_empty(&task->cg_list))
2281
- return;
2408
+ /* cgroup_threadgroup_rwsem protects racing against forks */
2409
+ WARN_ON_ONCE(list_empty(&task->cg_list));
22822410
22832411 cset = task_css_set(task);
22842412 if (!cset->mg_src_cgrp)
....@@ -2310,6 +2438,7 @@
23102438
23112439 return cgroup_taskset_next(tset, dst_cssp);
23122440 }
2441
+EXPORT_SYMBOL_GPL(cgroup_taskset_first);
23132442
23142443 /**
23152444 * cgroup_taskset_next - iterate to the next task in taskset
....@@ -2356,6 +2485,7 @@
23562485
23572486 return NULL;
23582487 }
2488
+EXPORT_SYMBOL_GPL(cgroup_taskset_next);
23592489
23602490 /**
23612491 * cgroup_taskset_migrate - migrate a taskset
....@@ -2426,6 +2556,7 @@
24262556 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
24272557 if (ss->attach) {
24282558 tset->ssid = ssid;
2559
+ trace_android_vh_cgroup_attach(ss, tset);
24292560 ss->attach(tset);
24302561 }
24312562 } while_each_subsys_mask();
....@@ -2510,22 +2641,28 @@
25102641 */
25112642 void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
25122643 {
2513
- LIST_HEAD(preloaded);
2514
- struct css_set *cset, *tmp_cset;
2644
+ struct ext_css_set *cset, *tmp_cset;
25152645
25162646 lockdep_assert_held(&cgroup_mutex);
25172647
25182648 spin_lock_irq(&css_set_lock);
25192649
2520
- list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
2521
- list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
2650
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
2651
+ mg_src_preload_node) {
2652
+ cset->cset.mg_src_cgrp = NULL;
2653
+ cset->cset.mg_dst_cgrp = NULL;
2654
+ cset->cset.mg_dst_cset = NULL;
2655
+ list_del_init(&cset->mg_src_preload_node);
2656
+ put_css_set_locked(&cset->cset);
2657
+ }
25222658
2523
- list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
2524
- cset->mg_src_cgrp = NULL;
2525
- cset->mg_dst_cgrp = NULL;
2526
- cset->mg_dst_cset = NULL;
2527
- list_del_init(&cset->mg_preload_node);
2528
- put_css_set_locked(cset);
2659
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
2660
+ mg_dst_preload_node) {
2661
+ cset->cset.mg_src_cgrp = NULL;
2662
+ cset->cset.mg_dst_cgrp = NULL;
2663
+ cset->cset.mg_dst_cset = NULL;
2664
+ list_del_init(&cset->mg_dst_preload_node);
2665
+ put_css_set_locked(&cset->cset);
25292666 }
25302667
25312668 spin_unlock_irq(&css_set_lock);
....@@ -2552,6 +2689,7 @@
25522689 struct cgroup_mgctx *mgctx)
25532690 {
25542691 struct cgroup *src_cgrp;
2692
+ struct ext_css_set *ext_src_cset;
25552693
25562694 lockdep_assert_held(&cgroup_mutex);
25572695 lockdep_assert_held(&css_set_lock);
....@@ -2565,8 +2703,9 @@
25652703 return;
25662704
25672705 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2706
+ ext_src_cset = container_of(src_cset, struct ext_css_set, cset);
25682707
2569
- if (!list_empty(&src_cset->mg_preload_node))
2708
+ if (!list_empty(&ext_src_cset->mg_src_preload_node))
25702709 return;
25712710
25722711 WARN_ON(src_cset->mg_src_cgrp);
....@@ -2577,7 +2716,7 @@
25772716 src_cset->mg_src_cgrp = src_cgrp;
25782717 src_cset->mg_dst_cgrp = dst_cgrp;
25792718 get_css_set(src_cset);
2580
- list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
2719
+ list_add_tail(&ext_src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
25812720 }
25822721
25832722 /**
....@@ -2596,20 +2735,23 @@
25962735 */
25972736 int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
25982737 {
2599
- struct css_set *src_cset, *tmp_cset;
2738
+ struct ext_css_set *ext_src_set, *tmp_cset;
26002739
26012740 lockdep_assert_held(&cgroup_mutex);
26022741
26032742 /* look up the dst cset for each src cset and link it to src */
2604
- list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2605
- mg_preload_node) {
2743
+ list_for_each_entry_safe(ext_src_set, tmp_cset, &mgctx->preloaded_src_csets,
2744
+ mg_src_preload_node) {
2745
+ struct css_set *src_cset = &ext_src_set->cset;
26062746 struct css_set *dst_cset;
2747
+ struct ext_css_set *ext_dst_cset;
26072748 struct cgroup_subsys *ss;
26082749 int ssid;
26092750
26102751 dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
26112752 if (!dst_cset)
26122753 return -ENOMEM;
2754
+ ext_dst_cset = container_of(dst_cset, struct ext_css_set, cset);
26132755
26142756 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
26152757
....@@ -2621,7 +2763,7 @@
26212763 if (src_cset == dst_cset) {
26222764 src_cset->mg_src_cgrp = NULL;
26232765 src_cset->mg_dst_cgrp = NULL;
2624
- list_del_init(&src_cset->mg_preload_node);
2766
+ list_del_init(&ext_src_set->mg_src_preload_node);
26252767 put_css_set(src_cset);
26262768 put_css_set(dst_cset);
26272769 continue;
....@@ -2629,8 +2771,8 @@
26292771
26302772 src_cset->mg_dst_cset = dst_cset;
26312773
2632
- if (list_empty(&dst_cset->mg_preload_node))
2633
- list_add_tail(&dst_cset->mg_preload_node,
2774
+ if (list_empty(&ext_dst_cset->mg_dst_preload_node))
2775
+ list_add_tail(&ext_dst_cset->mg_dst_preload_node,
26342776 &mgctx->preloaded_dst_csets);
26352777 else
26362778 put_css_set(dst_cset);
....@@ -2698,11 +2840,7 @@
26982840 {
26992841 DEFINE_CGROUP_MGCTX(mgctx);
27002842 struct task_struct *task;
2701
- int ret;
2702
-
2703
- ret = cgroup_migrate_vet_dst(dst_cgrp);
2704
- if (ret)
2705
- return ret;
2843
+ int ret = 0;
27062844
27072845 /* look up all src csets */
27082846 spin_lock_irq(&css_set_lock);
....@@ -2729,16 +2867,28 @@
27292867 return ret;
27302868 }
27312869
2732
-struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
2733
- __acquires(&cgroup_threadgroup_rwsem)
2870
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
2871
+ bool *threadgroup_locked,
2872
+ struct cgroup *dst_cgrp)
27342873 {
27352874 struct task_struct *tsk;
27362875 pid_t pid;
2876
+ bool force_migration = false;
27372877
27382878 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
27392879 return ERR_PTR(-EINVAL);
27402880
2741
- percpu_down_write(&cgroup_threadgroup_rwsem);
2881
+ /*
2882
+ * If we migrate a single thread, we don't care about threadgroup
2883
+ * stability. If the thread is `current`, it won't exit(2) under our
2884
+ * hands or change PID through exec(2). We exclude
2885
+ * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
2886
+ * callers by cgroup_mutex.
2887
+ * Therefore, we can skip the global lock.
2888
+ */
2889
+ lockdep_assert_held(&cgroup_mutex);
2890
+ *threadgroup_locked = pid || threadgroup;
2891
+ cgroup_attach_lock(*threadgroup_locked);
27422892
27432893 rcu_read_lock();
27442894 if (pid) {
....@@ -2754,13 +2904,16 @@
27542904 if (threadgroup)
27552905 tsk = tsk->group_leader;
27562906
2907
+ if (tsk->flags & PF_KTHREAD)
2908
+ trace_android_rvh_cgroup_force_kthread_migration(tsk, dst_cgrp, &force_migration);
2909
+
27572910 /*
27582911 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
27592912 * If userland migrates such a kthread to a non-root cgroup, it can
27602913 * become trapped in a cpuset, or RT kthread may be born in a
27612914 * cgroup with no rt_runtime allocated. Just say no.
27622915 */
2763
- if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
2916
+ if (!force_migration && (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY))) {
27642917 tsk = ERR_PTR(-EINVAL);
27652918 goto out_unlock_threadgroup;
27662919 }
....@@ -2769,14 +2922,14 @@
27692922 goto out_unlock_rcu;
27702923
27712924 out_unlock_threadgroup:
2772
- percpu_up_write(&cgroup_threadgroup_rwsem);
2925
+ cgroup_attach_unlock(*threadgroup_locked);
2926
+ *threadgroup_locked = false;
27732927 out_unlock_rcu:
27742928 rcu_read_unlock();
27752929 return tsk;
27762930 }
27772931
2778
-void cgroup_procs_write_finish(struct task_struct *task)
2779
- __releases(&cgroup_threadgroup_rwsem)
2932
+void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
27802933 {
27812934 struct cgroup_subsys *ss;
27822935 int ssid;
....@@ -2784,7 +2937,8 @@
27842937 /* release reference from cgroup_procs_write_start() */
27852938 put_task_struct(task);
27862939
2787
- percpu_up_write(&cgroup_threadgroup_rwsem);
2940
+ cgroup_attach_unlock(threadgroup_locked);
2941
+
27882942 for_each_subsys(ss, ssid)
27892943 if (ss->post_attach)
27902944 ss->post_attach();
....@@ -2799,7 +2953,7 @@
27992953 do_each_subsys_mask(ss, ssid, ss_mask) {
28002954 if (printed)
28012955 seq_putc(seq, ' ');
2802
- seq_printf(seq, "%s", ss->name);
2956
+ seq_puts(seq, ss->name);
28032957 printed = true;
28042958 } while_each_subsys_mask();
28052959 if (printed)
....@@ -2838,12 +2992,11 @@
28382992 DEFINE_CGROUP_MGCTX(mgctx);
28392993 struct cgroup_subsys_state *d_css;
28402994 struct cgroup *dsct;
2841
- struct css_set *src_cset;
2995
+ struct ext_css_set *ext_src_set;
2996
+ bool has_tasks;
28422997 int ret;
28432998
28442999 lockdep_assert_held(&cgroup_mutex);
2845
-
2846
- percpu_down_write(&cgroup_threadgroup_rwsem);
28473000
28483001 /* look up all csses currently attached to @cgrp's subtree */
28493002 spin_lock_irq(&css_set_lock);
....@@ -2855,17 +3008,27 @@
28553008 }
28563009 spin_unlock_irq(&css_set_lock);
28573010
3011
+ /*
3012
+ * We need to write-lock threadgroup_rwsem while migrating tasks.
3013
+ * However, if there are no source csets for @cgrp, changing its
3014
+ * controllers isn't gonna produce any task migrations and the
3015
+ * write-locking can be skipped safely.
3016
+ */
3017
+ has_tasks = !list_empty(&mgctx.preloaded_src_csets);
3018
+ cgroup_attach_lock(has_tasks);
3019
+
28583020 /* NULL dst indicates self on default hierarchy */
28593021 ret = cgroup_migrate_prepare_dst(&mgctx);
28603022 if (ret)
28613023 goto out_finish;
28623024
28633025 spin_lock_irq(&css_set_lock);
2864
- list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
3026
+ list_for_each_entry(ext_src_set, &mgctx.preloaded_src_csets,
3027
+ mg_src_preload_node) {
28653028 struct task_struct *task, *ntask;
28663029
28673030 /* all tasks in src_csets need to be migrated */
2868
- list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
3031
+ list_for_each_entry_safe(task, ntask, &ext_src_set->cset.tasks, cg_list)
28693032 cgroup_migrate_add_task(task, &mgctx);
28703033 }
28713034 spin_unlock_irq(&css_set_lock);
....@@ -2873,7 +3036,7 @@
28733036 ret = cgroup_migrate_execute(&mgctx);
28743037 out_finish:
28753038 cgroup_migrate_finish(&mgctx);
2876
- percpu_up_write(&cgroup_threadgroup_rwsem);
3039
+ cgroup_attach_unlock(has_tasks);
28773040 return ret;
28783041 }
28793042
....@@ -3106,7 +3269,7 @@
31063269 return ret;
31073270
31083271 /*
3109
- * At this point, cgroup_e_css() results reflect the new csses
3272
+ * At this point, cgroup_e_css_by_mask() results reflect the new csses
31103273 * making the following cgroup_update_dfl_csses() properly update
31113274 * css associations of all tasks in the subtree.
31123275 */
....@@ -3506,22 +3669,33 @@
35063669 #ifdef CONFIG_PSI
35073670 static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
35083671 {
3509
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
3672
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
3673
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3674
+
3675
+ return psi_show(seq, psi, PSI_IO);
35103676 }
35113677 static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
35123678 {
3513
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
3679
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
3680
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3681
+
3682
+ return psi_show(seq, psi, PSI_MEM);
35143683 }
35153684 static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
35163685 {
3517
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
3686
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
3687
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3688
+
3689
+ return psi_show(seq, psi, PSI_CPU);
35183690 }
35193691
35203692 static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
35213693 size_t nbytes, enum psi_res res)
35223694 {
3695
+ struct cgroup_file_ctx *ctx = of->priv;
35233696 struct psi_trigger *new;
35243697 struct cgroup *cgrp;
3698
+ struct psi_group *psi;
35253699
35263700 cgrp = cgroup_kn_lock_live(of->kn, false);
35273701 if (!cgrp)
....@@ -3530,14 +3704,20 @@
35303704 cgroup_get(cgrp);
35313705 cgroup_kn_unlock(of->kn);
35323706
3533
- new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
3707
+ /* Allow only one trigger per file descriptor */
3708
+ if (ctx->psi.trigger) {
3709
+ cgroup_put(cgrp);
3710
+ return -EBUSY;
3711
+ }
3712
+
3713
+ psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3714
+ new = psi_trigger_create(psi, buf, nbytes, res);
35343715 if (IS_ERR(new)) {
35353716 cgroup_put(cgrp);
35363717 return PTR_ERR(new);
35373718 }
35383719
3539
- psi_trigger_replace(&of->priv, new);
3540
-
3720
+ smp_store_release(&ctx->psi.trigger, new);
35413721 cgroup_put(cgrp);
35423722
35433723 return nbytes;
....@@ -3567,12 +3747,15 @@
35673747 static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
35683748 poll_table *pt)
35693749 {
3570
- return psi_trigger_poll(&of->priv, of->file, pt);
3750
+ struct cgroup_file_ctx *ctx = of->priv;
3751
+ return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
35713752 }
35723753
35733754 static void cgroup_pressure_release(struct kernfs_open_file *of)
35743755 {
3575
- psi_trigger_replace(&of->priv, NULL);
3756
+ struct cgroup_file_ctx *ctx = of->priv;
3757
+
3758
+ psi_trigger_destroy(ctx->psi.trigger);
35763759 }
35773760
35783761 bool cgroup_psi_enabled(void)
....@@ -3625,28 +3808,50 @@
36253808 static int cgroup_file_open(struct kernfs_open_file *of)
36263809 {
36273810 struct cftype *cft = of->kn->priv;
3811
+ struct cgroup_file_ctx *ctx;
3812
+ int ret;
36283813
3629
- if (cft->open)
3630
- return cft->open(of);
3631
- return 0;
3814
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
3815
+ if (!ctx)
3816
+ return -ENOMEM;
3817
+
3818
+ ctx->ns = current->nsproxy->cgroup_ns;
3819
+ get_cgroup_ns(ctx->ns);
3820
+ of->priv = ctx;
3821
+
3822
+ if (!cft->open)
3823
+ return 0;
3824
+
3825
+ ret = cft->open(of);
3826
+ if (ret) {
3827
+ put_cgroup_ns(ctx->ns);
3828
+ kfree(ctx);
3829
+ }
3830
+ return ret;
36323831 }
36333832
36343833 static void cgroup_file_release(struct kernfs_open_file *of)
36353834 {
36363835 struct cftype *cft = of->kn->priv;
3836
+ struct cgroup_file_ctx *ctx = of->priv;
36373837
36383838 if (cft->release)
36393839 cft->release(of);
3840
+ put_cgroup_ns(ctx->ns);
3841
+ kfree(ctx);
36403842 }
36413843
36423844 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
36433845 size_t nbytes, loff_t off)
36443846 {
3645
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
3847
+ struct cgroup_file_ctx *ctx = of->priv;
36463848 struct cgroup *cgrp = of->kn->parent->priv;
36473849 struct cftype *cft = of->kn->priv;
36483850 struct cgroup_subsys_state *css;
36493851 int ret;
3852
+
3853
+ if (!nbytes)
3854
+ return 0;
36503855
36513856 /*
36523857 * If namespaces are delegation boundaries, disallow writes to
....@@ -3656,7 +3861,7 @@
36563861 */
36573862 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
36583863 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3659
- ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3864
+ ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
36603865 return -EPERM;
36613866
36623867 if (cft->write)
....@@ -3843,7 +4048,8 @@
38434048 continue;
38444049 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
38454050 continue;
3846
-
4051
+ if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
4052
+ continue;
38474053 if (is_add) {
38484054 ret = cgroup_add_file(css, cgrp, cft);
38494055 if (ret) {
....@@ -4028,6 +4234,7 @@
40284234 cft->flags |= __CFTYPE_ONLY_ON_DFL;
40294235 return cgroup_add_cftypes(ss, cfts);
40304236 }
4237
+EXPORT_SYMBOL_GPL(cgroup_add_dfl_cftypes);
40314238
40324239 /**
40334240 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
....@@ -4045,6 +4252,7 @@
40454252 cft->flags |= __CFTYPE_NOT_ON_DFL;
40464253 return cgroup_add_cftypes(ss, cfts);
40474254 }
4255
+EXPORT_SYMBOL_GPL(cgroup_add_legacy_cftypes);
40484256
40494257 /**
40504258 * cgroup_file_notify - generate a file modified event for a cgroup_file
....@@ -4120,7 +4328,8 @@
41204328 } else if (likely(!(pos->flags & CSS_RELEASED))) {
41214329 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
41224330 } else {
4123
- list_for_each_entry_rcu(next, &parent->children, sibling)
4331
+ list_for_each_entry_rcu(next, &parent->children, sibling,
4332
+ lockdep_is_held(&cgroup_mutex))
41244333 if (next->serial_nr > pos->serial_nr)
41254334 break;
41264335 }
....@@ -4133,6 +4342,7 @@
41334342 return next;
41344343 return NULL;
41354344 }
4345
+EXPORT_SYMBOL_GPL(css_next_child);
41364346
41374347 /**
41384348 * css_next_descendant_pre - find the next descendant for pre-order walk
....@@ -4182,6 +4392,7 @@
41824392
41834393 return NULL;
41844394 }
4395
+EXPORT_SYMBOL_GPL(css_next_descendant_pre);
41854396
41864397 /**
41874398 * css_rightmost_descendant - return the rightmost descendant of a css
....@@ -4362,29 +4573,24 @@
43624573
43634574 lockdep_assert_held(&css_set_lock);
43644575
4365
- /* Advance to the next non-empty css_set */
4366
- do {
4367
- cset = css_task_iter_next_css_set(it);
4368
- if (!cset) {
4369
- it->task_pos = NULL;
4370
- return;
4576
+ /* Advance to the next non-empty css_set and find the first non-empty tasks list */
4577
+ while ((cset = css_task_iter_next_css_set(it))) {
4578
+ if (!list_empty(&cset->tasks)) {
4579
+ it->cur_tasks_head = &cset->tasks;
4580
+ break;
4581
+ } else if (!list_empty(&cset->mg_tasks)) {
4582
+ it->cur_tasks_head = &cset->mg_tasks;
4583
+ break;
4584
+ } else if (!list_empty(&cset->dying_tasks)) {
4585
+ it->cur_tasks_head = &cset->dying_tasks;
4586
+ break;
43714587 }
4372
- } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
4373
-
4374
- if (!list_empty(&cset->tasks)) {
4375
- it->task_pos = cset->tasks.next;
4376
- it->cur_tasks_head = &cset->tasks;
4377
- } else if (!list_empty(&cset->mg_tasks)) {
4378
- it->task_pos = cset->mg_tasks.next;
4379
- it->cur_tasks_head = &cset->mg_tasks;
4380
- } else {
4381
- it->task_pos = cset->dying_tasks.next;
4382
- it->cur_tasks_head = &cset->dying_tasks;
43834588 }
4384
-
4385
- it->tasks_head = &cset->tasks;
4386
- it->mg_tasks_head = &cset->mg_tasks;
4387
- it->dying_tasks_head = &cset->dying_tasks;
4589
+ if (!cset) {
4590
+ it->task_pos = NULL;
4591
+ return;
4592
+ }
4593
+ it->task_pos = it->cur_tasks_head->next;
43884594
43894595 /*
43904596 * We don't keep css_sets locked across iteration steps and thus
....@@ -4429,24 +4635,24 @@
44294635 repeat:
44304636 if (it->task_pos) {
44314637 /*
4432
- * Advance iterator to find next entry. cset->tasks is
4433
- * consumed first and then ->mg_tasks. After ->mg_tasks,
4434
- * we move onto the next cset.
4638
+ * Advance iterator to find next entry. We go through cset
4639
+ * tasks, mg_tasks and dying_tasks; when consumed we move on to
4640
+ * the next cset.
44354641 */
44364642 if (it->flags & CSS_TASK_ITER_SKIPPED)
44374643 it->flags &= ~CSS_TASK_ITER_SKIPPED;
44384644 else
44394645 it->task_pos = it->task_pos->next;
44404646
4441
- if (it->task_pos == it->tasks_head) {
4442
- it->task_pos = it->mg_tasks_head->next;
4443
- it->cur_tasks_head = it->mg_tasks_head;
4647
+ if (it->task_pos == &it->cur_cset->tasks) {
4648
+ it->cur_tasks_head = &it->cur_cset->mg_tasks;
4649
+ it->task_pos = it->cur_tasks_head->next;
44444650 }
4445
- if (it->task_pos == it->mg_tasks_head) {
4446
- it->task_pos = it->dying_tasks_head->next;
4447
- it->cur_tasks_head = it->dying_tasks_head;
4651
+ if (it->task_pos == &it->cur_cset->mg_tasks) {
4652
+ it->cur_tasks_head = &it->cur_cset->dying_tasks;
4653
+ it->task_pos = it->cur_tasks_head->next;
44484654 }
4449
- if (it->task_pos == it->dying_tasks_head)
4655
+ if (it->task_pos == &it->cur_cset->dying_tasks)
44504656 css_task_iter_advance_css_set(it);
44514657 } else {
44524658 /* called from start, proceed to the first cset */
....@@ -4464,12 +4670,12 @@
44644670 goto repeat;
44654671
44664672 /* and dying leaders w/o live member threads */
4467
- if (it->cur_tasks_head == it->dying_tasks_head &&
4673
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
44684674 !atomic_read(&task->signal->live))
44694675 goto repeat;
44704676 } else {
44714677 /* skip all dying ones */
4472
- if (it->cur_tasks_head == it->dying_tasks_head)
4678
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
44734679 goto repeat;
44744680 }
44754681 }
....@@ -4488,9 +4694,6 @@
44884694 void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
44894695 struct css_task_iter *it)
44904696 {
4491
- /* no one should try to iterate before mounting cgroups */
4492
- WARN_ON_ONCE(!use_task_css_set_links);
4493
-
44944697 memset(it, 0, sizeof(*it));
44954698
44964699 spin_lock_irq(&css_set_lock);
....@@ -4567,21 +4770,21 @@
45674770
45684771 static void cgroup_procs_release(struct kernfs_open_file *of)
45694772 {
4570
- if (of->priv) {
4571
- css_task_iter_end(of->priv);
4572
- kfree(of->priv);
4573
- }
4773
+ struct cgroup_file_ctx *ctx = of->priv;
4774
+
4775
+ if (ctx->procs.started)
4776
+ css_task_iter_end(&ctx->procs.iter);
45744777 }
45754778
45764779 static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
45774780 {
45784781 struct kernfs_open_file *of = s->private;
4579
- struct css_task_iter *it = of->priv;
4782
+ struct cgroup_file_ctx *ctx = of->priv;
45804783
45814784 if (pos)
45824785 (*pos)++;
45834786
4584
- return css_task_iter_next(it);
4787
+ return css_task_iter_next(&ctx->procs.iter);
45854788 }
45864789
45874790 static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
....@@ -4589,21 +4792,18 @@
45894792 {
45904793 struct kernfs_open_file *of = s->private;
45914794 struct cgroup *cgrp = seq_css(s)->cgroup;
4592
- struct css_task_iter *it = of->priv;
4795
+ struct cgroup_file_ctx *ctx = of->priv;
4796
+ struct css_task_iter *it = &ctx->procs.iter;
45934797
45944798 /*
45954799 * When a seq_file is seeked, it's always traversed sequentially
45964800 * from position 0, so we can simply keep iterating on !0 *pos.
45974801 */
4598
- if (!it) {
4802
+ if (!ctx->procs.started) {
45994803 if (WARN_ON_ONCE((*pos)))
46004804 return ERR_PTR(-EINVAL);
4601
-
4602
- it = kzalloc(sizeof(*it), GFP_KERNEL);
4603
- if (!it)
4604
- return ERR_PTR(-ENOMEM);
4605
- of->priv = it;
46064805 css_task_iter_start(&cgrp->self, iter_flags, it);
4806
+ ctx->procs.started = true;
46074807 } else if (!(*pos)) {
46084808 css_task_iter_end(it);
46094809 css_task_iter_start(&cgrp->self, iter_flags, it);
....@@ -4636,13 +4836,28 @@
46364836 return 0;
46374837 }
46384838
4839
+static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
4840
+{
4841
+ int ret;
4842
+ struct inode *inode;
4843
+
4844
+ lockdep_assert_held(&cgroup_mutex);
4845
+
4846
+ inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
4847
+ if (!inode)
4848
+ return -ENOMEM;
4849
+
4850
+ ret = inode_permission(inode, MAY_WRITE);
4851
+ iput(inode);
4852
+ return ret;
4853
+}
4854
+
46394855 static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
46404856 struct cgroup *dst_cgrp,
4641
- struct super_block *sb)
4857
+ struct super_block *sb,
4858
+ struct cgroup_namespace *ns)
46424859 {
4643
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
46444860 struct cgroup *com_cgrp = src_cgrp;
4645
- struct inode *inode;
46464861 int ret;
46474862
46484863 lockdep_assert_held(&cgroup_mutex);
....@@ -4652,12 +4867,7 @@
46524867 com_cgrp = cgroup_parent(com_cgrp);
46534868
46544869 /* %current should be authorized to migrate to the common ancestor */
4655
- inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
4656
- if (!inode)
4657
- return -ENOMEM;
4658
-
4659
- ret = inode_permission(inode, MAY_WRITE);
4660
- iput(inode);
4870
+ ret = cgroup_may_write(com_cgrp, sb);
46614871 if (ret)
46624872 return ret;
46634873
....@@ -4673,18 +4883,42 @@
46734883 return 0;
46744884 }
46754885
4886
+static int cgroup_attach_permissions(struct cgroup *src_cgrp,
4887
+ struct cgroup *dst_cgrp,
4888
+ struct super_block *sb, bool threadgroup,
4889
+ struct cgroup_namespace *ns)
4890
+{
4891
+ int ret = 0;
4892
+
4893
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
4894
+ if (ret)
4895
+ return ret;
4896
+
4897
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
4898
+ if (ret)
4899
+ return ret;
4900
+
4901
+ if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
4902
+ ret = -EOPNOTSUPP;
4903
+
4904
+ return ret;
4905
+}
4906
+
46764907 static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
46774908 char *buf, size_t nbytes, loff_t off)
46784909 {
4910
+ struct cgroup_file_ctx *ctx = of->priv;
46794911 struct cgroup *src_cgrp, *dst_cgrp;
46804912 struct task_struct *task;
4913
+ const struct cred *saved_cred;
46814914 ssize_t ret;
4915
+ bool threadgroup_locked;
46824916
46834917 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
46844918 if (!dst_cgrp)
46854919 return -ENODEV;
46864920
4687
- task = cgroup_procs_write_start(buf, true);
4921
+ task = cgroup_procs_write_start(buf, true, &threadgroup_locked, dst_cgrp);
46884922 ret = PTR_ERR_OR_ZERO(task);
46894923 if (ret)
46904924 goto out_unlock;
....@@ -4694,15 +4928,23 @@
46944928 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
46954929 spin_unlock_irq(&css_set_lock);
46964930
4697
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
4698
- of->file->f_path.dentry->d_sb);
4931
+ /*
4932
+ * Process and thread migrations follow the same delegation rule. Check
4933
+ * permissions using the credentials from file open to protect against
4934
+ * inherited fd attacks.
4935
+ */
4936
+ saved_cred = override_creds(of->file->f_cred);
4937
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
4938
+ of->file->f_path.dentry->d_sb, true,
4939
+ ctx->ns);
4940
+ revert_creds(saved_cred);
46994941 if (ret)
47004942 goto out_finish;
47014943
47024944 ret = cgroup_attach_task(dst_cgrp, task, true);
47034945
47044946 out_finish:
4705
- cgroup_procs_write_finish(task);
4947
+ cgroup_procs_write_finish(task, threadgroup_locked);
47064948 out_unlock:
47074949 cgroup_kn_unlock(of->kn);
47084950
....@@ -4717,9 +4959,12 @@
47174959 static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
47184960 char *buf, size_t nbytes, loff_t off)
47194961 {
4962
+ struct cgroup_file_ctx *ctx = of->priv;
47204963 struct cgroup *src_cgrp, *dst_cgrp;
47214964 struct task_struct *task;
4965
+ const struct cred *saved_cred;
47224966 ssize_t ret;
4967
+ bool threadgroup_locked;
47234968
47244969 buf = strstrip(buf);
47254970
....@@ -4727,7 +4972,7 @@
47274972 if (!dst_cgrp)
47284973 return -ENODEV;
47294974
4730
- task = cgroup_procs_write_start(buf, false);
4975
+ task = cgroup_procs_write_start(buf, false, &threadgroup_locked, dst_cgrp);
47314976 ret = PTR_ERR_OR_ZERO(task);
47324977 if (ret)
47334978 goto out_unlock;
....@@ -4737,21 +4982,23 @@
47374982 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
47384983 spin_unlock_irq(&css_set_lock);
47394984
4740
- /* thread migrations follow the cgroup.procs delegation rule */
4741
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
4742
- of->file->f_path.dentry->d_sb);
4985
+ /*
4986
+ * Process and thread migrations follow the same delegation rule. Check
4987
+ * permissions using the credentials from file open to protect against
4988
+ * inherited fd attacks.
4989
+ */
4990
+ saved_cred = override_creds(of->file->f_cred);
4991
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
4992
+ of->file->f_path.dentry->d_sb, false,
4993
+ ctx->ns);
4994
+ revert_creds(saved_cred);
47434995 if (ret)
4744
- goto out_finish;
4745
-
4746
- /* and must be contained in the same domain */
4747
- ret = -EOPNOTSUPP;
4748
- if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
47494996 goto out_finish;
47504997
47514998 ret = cgroup_attach_task(dst_cgrp, task, false);
47524999
47535000 out_finish:
4754
- cgroup_procs_write_finish(task);
5001
+ cgroup_procs_write_finish(task, threadgroup_locked);
47555002 out_unlock:
47565003 cgroup_kn_unlock(of->kn);
47575004
....@@ -4823,13 +5070,12 @@
48235070 },
48245071 {
48255072 .name = "cpu.stat",
4826
- .flags = CFTYPE_NOT_ON_ROOT,
48275073 .seq_show = cpu_stat_show,
48285074 },
48295075 #ifdef CONFIG_PSI
48305076 {
48315077 .name = "io.pressure",
4832
- .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE,
5078
+ .flags = CFTYPE_PRESSURE,
48335079 .seq_show = cgroup_io_pressure_show,
48345080 .write = cgroup_io_pressure_write,
48355081 .poll = cgroup_pressure_poll,
....@@ -4837,7 +5083,7 @@
48375083 },
48385084 {
48395085 .name = "memory.pressure",
4840
- .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE,
5086
+ .flags = CFTYPE_PRESSURE,
48415087 .seq_show = cgroup_memory_pressure_show,
48425088 .write = cgroup_memory_pressure_write,
48435089 .poll = cgroup_pressure_poll,
....@@ -4845,7 +5091,7 @@
48455091 },
48465092 {
48475093 .name = "cpu.pressure",
4848
- .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE,
5094
+ .flags = CFTYPE_PRESSURE,
48495095 .seq_show = cgroup_cpu_pressure_show,
48505096 .write = cgroup_cpu_pressure_write,
48515097 .poll = cgroup_pressure_poll,
....@@ -4964,9 +5210,6 @@
49645210 tcgrp->nr_dying_descendants--;
49655211 spin_unlock_irq(&css_set_lock);
49665212
4967
- cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4968
- cgrp->id = -1;
4969
-
49705213 /*
49715214 * There are two control paths which try to determine
49725215 * cgroup from dentry without going through kernfs -
....@@ -4977,8 +5220,6 @@
49775220 if (cgrp->kn)
49785221 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
49795222 NULL);
4980
-
4981
- cgroup_bpf_put(cgrp);
49825223 }
49835224
49845225 mutex_unlock(&cgroup_mutex);
....@@ -5133,10 +5374,12 @@
51335374 * it isn't associated with its kernfs_node and doesn't have the control
51345375 * mask applied.
51355376 */
5136
-static struct cgroup *cgroup_create(struct cgroup *parent)
5377
+static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
5378
+ umode_t mode)
51375379 {
51385380 struct cgroup_root *root = parent->root;
51395381 struct cgroup *cgrp, *tcgrp;
5382
+ struct kernfs_node *kn;
51405383 int level = parent->level + 1;
51415384 int ret;
51425385
....@@ -5156,15 +5399,13 @@
51565399 goto out_cancel_ref;
51575400 }
51585401
5159
- /*
5160
- * Temporarily set the pointer to NULL, so idr_find() won't return
5161
- * a half-baked cgroup.
5162
- */
5163
- cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
5164
- if (cgrp->id < 0) {
5165
- ret = -ENOMEM;
5402
+ /* create the directory */
5403
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5404
+ if (IS_ERR(kn)) {
5405
+ ret = PTR_ERR(kn);
51665406 goto out_stat_exit;
51675407 }
5408
+ cgrp->kn = kn;
51685409
51695410 init_cgroup_housekeeping(cgrp);
51705411
....@@ -5174,7 +5415,7 @@
51745415
51755416 ret = psi_cgroup_alloc(cgrp);
51765417 if (ret)
5177
- goto out_idr_free;
5418
+ goto out_kernfs_remove;
51785419
51795420 ret = cgroup_bpf_inherit(cgrp);
51805421 if (ret)
....@@ -5198,7 +5439,7 @@
51985439
51995440 spin_lock_irq(&css_set_lock);
52005441 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5201
- cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
5442
+ cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
52025443
52035444 if (tcgrp != cgrp) {
52045445 tcgrp->nr_descendants++;
....@@ -5228,12 +5469,6 @@
52285469 cgroup_get_live(parent);
52295470
52305471 /*
5231
- * @cgrp is now fully operational. If something fails after this
5232
- * point, it'll be released via the normal destruction path.
5233
- */
5234
- cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
5235
-
5236
- /*
52375472 * On the default hierarchy, a child doesn't automatically inherit
52385473 * subtree_control from the parent. Each is configured manually.
52395474 */
....@@ -5246,8 +5481,8 @@
52465481
52475482 out_psi_free:
52485483 psi_cgroup_free(cgrp);
5249
-out_idr_free:
5250
- cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
5484
+out_kernfs_remove:
5485
+ kernfs_remove(cgrp->kn);
52515486 out_stat_exit:
52525487 if (cgroup_on_dfl(parent))
52535488 cgroup_rstat_exit(cgrp);
....@@ -5284,7 +5519,6 @@
52845519 int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
52855520 {
52865521 struct cgroup *parent, *cgrp;
5287
- struct kernfs_node *kn;
52885522 int ret;
52895523
52905524 /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
....@@ -5300,27 +5534,19 @@
53005534 goto out_unlock;
53015535 }
53025536
5303
- cgrp = cgroup_create(parent);
5537
+ cgrp = cgroup_create(parent, name, mode);
53045538 if (IS_ERR(cgrp)) {
53055539 ret = PTR_ERR(cgrp);
53065540 goto out_unlock;
53075541 }
53085542
5309
- /* create the directory */
5310
- kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5311
- if (IS_ERR(kn)) {
5312
- ret = PTR_ERR(kn);
5313
- goto out_destroy;
5314
- }
5315
- cgrp->kn = kn;
5316
-
53175543 /*
53185544 * This extra ref will be put in cgroup_free_fn() and guarantees
53195545 * that @cgrp->kn is always accessible.
53205546 */
5321
- kernfs_get(kn);
5547
+ kernfs_get(cgrp->kn);
53225548
5323
- ret = cgroup_kn_set_ugid(kn);
5549
+ ret = cgroup_kn_set_ugid(cgrp->kn);
53245550 if (ret)
53255551 goto out_destroy;
53265552
....@@ -5335,7 +5561,7 @@
53355561 TRACE_CGROUP_PATH(mkdir, cgrp);
53365562
53375563 /* let's create and online css's */
5338
- kernfs_activate(kn);
5564
+ kernfs_activate(cgrp->kn);
53395565
53405566 ret = 0;
53415567 goto out_unlock;
....@@ -5512,6 +5738,8 @@
55125738
55135739 cgroup1_check_for_release(parent);
55145740
5741
+ cgroup_bpf_offline(cgrp);
5742
+
55155743 /* put the base reference */
55165744 percpu_ref_kill(&cgrp->self.refcnt);
55175745
....@@ -5537,7 +5765,6 @@
55375765
55385766 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
55395767 .show_options = cgroup_show_options,
5540
- .remount_fs = cgroup_remount,
55415768 .mkdir = cgroup_mkdir,
55425769 .rmdir = cgroup_rmdir,
55435770 .show_path = cgroup_show_path,
....@@ -5604,11 +5831,12 @@
56045831 */
56055832 int __init cgroup_init_early(void)
56065833 {
5607
- static struct cgroup_sb_opts __initdata opts;
5834
+ static struct cgroup_fs_context __initdata ctx;
56085835 struct cgroup_subsys *ss;
56095836 int i;
56105837
5611
- init_cgroup_root(&cgrp_dfl_root, &opts);
5838
+ ctx.root = &cgrp_dfl_root;
5839
+ init_cgroup_root(&ctx);
56125840 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
56135841
56145842 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
....@@ -5644,14 +5872,13 @@
56445872 int ssid;
56455873
56465874 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
5647
- BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
56485875 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
56495876 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
56505877
56515878 cgroup_rstat_boot();
56525879
56535880 /*
5654
- * The latency of the synchronize_sched() is too high for cgroups,
5881
+ * The latency of the synchronize_rcu() is too high for cgroups,
56555882 * avoid it at the cost of forcing all readers into the slow path.
56565883 */
56575884 rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
....@@ -5735,6 +5962,9 @@
57355962 WARN_ON(register_filesystem(&cgroup_fs_type));
57365963 WARN_ON(register_filesystem(&cgroup2_fs_type));
57375964 WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
5965
+#ifdef CONFIG_CPUSETS
5966
+ WARN_ON(register_filesystem(&cpuset_fs_type));
5967
+#endif
57385968
57395969 return 0;
57405970 }
....@@ -5755,12 +5985,11 @@
57555985 }
57565986 core_initcall(cgroup_wq_init);
57575987
5758
-void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
5759
- char *buf, size_t buflen)
5988
+void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
57605989 {
57615990 struct kernfs_node *kn;
57625991
5763
- kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
5992
+ kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
57645993 if (!kn)
57655994 return;
57665995 kernfs_path(kn, buf, buflen);
....@@ -5850,8 +6079,7 @@
58506079 * @child: pointer to task_struct of forking parent process.
58516080 *
58526081 * A task is associated with the init_css_set until cgroup_post_fork()
5853
- * attaches it to the parent's css_set. Empty cg_list indicates that
5854
- * @child isn't holding reference to its css_set.
6082
+ * attaches it to the target css_set.
58556083 */
58566084 void cgroup_fork(struct task_struct *child)
58576085 {
....@@ -5859,21 +6087,172 @@
58596087 INIT_LIST_HEAD(&child->cg_list);
58606088 }
58616089
6090
+static struct cgroup *cgroup_get_from_file(struct file *f)
6091
+{
6092
+ struct cgroup_subsys_state *css;
6093
+ struct cgroup *cgrp;
6094
+
6095
+ css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
6096
+ if (IS_ERR(css))
6097
+ return ERR_CAST(css);
6098
+
6099
+ cgrp = css->cgroup;
6100
+ if (!cgroup_on_dfl(cgrp)) {
6101
+ cgroup_put(cgrp);
6102
+ return ERR_PTR(-EBADF);
6103
+ }
6104
+
6105
+ return cgrp;
6106
+}
6107
+
6108
+/**
6109
+ * cgroup_css_set_fork - find or create a css_set for a child process
6110
+ * @kargs: the arguments passed to create the child process
6111
+ *
6112
+ * This function finds or creates a new css_set which the child
6113
+ * process will be attached to in cgroup_post_fork(). By default,
6114
+ * the child process will be given the same css_set as its parent.
6115
+ *
6116
+ * If CLONE_INTO_CGROUP is specified this function will try to find an
6117
+ * existing css_set which includes the requested cgroup and if not create
6118
+ * a new css_set that the child will be attached to later. If this function
6119
+ * succeeds it will hold cgroup_threadgroup_rwsem on return. If
6120
+ * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
6121
+ * before grabbing cgroup_threadgroup_rwsem and will hold a reference
6122
+ * to the target cgroup.
6123
+ */
6124
+static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
6125
+ __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
6126
+{
6127
+ int ret;
6128
+ struct cgroup *dst_cgrp = NULL;
6129
+ struct css_set *cset;
6130
+ struct super_block *sb;
6131
+ struct file *f;
6132
+
6133
+ if (kargs->flags & CLONE_INTO_CGROUP)
6134
+ mutex_lock(&cgroup_mutex);
6135
+
6136
+ cgroup_threadgroup_change_begin(current);
6137
+
6138
+ spin_lock_irq(&css_set_lock);
6139
+ cset = task_css_set(current);
6140
+ get_css_set(cset);
6141
+ spin_unlock_irq(&css_set_lock);
6142
+
6143
+ if (!(kargs->flags & CLONE_INTO_CGROUP)) {
6144
+ kargs->cset = cset;
6145
+ return 0;
6146
+ }
6147
+
6148
+ f = fget_raw(kargs->cgroup);
6149
+ if (!f) {
6150
+ ret = -EBADF;
6151
+ goto err;
6152
+ }
6153
+ sb = f->f_path.dentry->d_sb;
6154
+
6155
+ dst_cgrp = cgroup_get_from_file(f);
6156
+ if (IS_ERR(dst_cgrp)) {
6157
+ ret = PTR_ERR(dst_cgrp);
6158
+ dst_cgrp = NULL;
6159
+ goto err;
6160
+ }
6161
+
6162
+ if (cgroup_is_dead(dst_cgrp)) {
6163
+ ret = -ENODEV;
6164
+ goto err;
6165
+ }
6166
+
6167
+ /*
6168
+ * Verify that the target cgroup is writable for us. This is
6169
+ * usually done by the vfs layer but since we're not going through
6170
+ * the vfs layer here we need to do it "manually".
6171
+ */
6172
+ ret = cgroup_may_write(dst_cgrp, sb);
6173
+ if (ret)
6174
+ goto err;
6175
+
6176
+ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
6177
+ !(kargs->flags & CLONE_THREAD),
6178
+ current->nsproxy->cgroup_ns);
6179
+ if (ret)
6180
+ goto err;
6181
+
6182
+ kargs->cset = find_css_set(cset, dst_cgrp);
6183
+ if (!kargs->cset) {
6184
+ ret = -ENOMEM;
6185
+ goto err;
6186
+ }
6187
+
6188
+ put_css_set(cset);
6189
+ fput(f);
6190
+ kargs->cgrp = dst_cgrp;
6191
+ return ret;
6192
+
6193
+err:
6194
+ cgroup_threadgroup_change_end(current);
6195
+ mutex_unlock(&cgroup_mutex);
6196
+ if (f)
6197
+ fput(f);
6198
+ if (dst_cgrp)
6199
+ cgroup_put(dst_cgrp);
6200
+ put_css_set(cset);
6201
+ if (kargs->cset)
6202
+ put_css_set(kargs->cset);
6203
+ return ret;
6204
+}
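/*
 * Editor's illustration, not part of this patch: cgroup_css_set_fork() is the
 * kernel side of clone3()'s CLONE_INTO_CGROUP. A minimal userspace sketch,
 * assuming the caller already opened the target cgroup directory on a v2
 * hierarchy; error handling is elided.
 */
#include <signal.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <linux/sched.h>	/* struct clone_args, CLONE_INTO_CGROUP */

static pid_t fork_into_cgroup(int cgroup_fd)
{
	struct clone_args args;

	memset(&args, 0, sizeof(args));
	args.flags = CLONE_INTO_CGROUP;
	args.exit_signal = SIGCHLD;
	args.cgroup = cgroup_fd;	/* fd of the target cgroup directory */

	/* On success the child starts attached to the target cgroup's css_set. */
	return syscall(__NR_clone3, &args, sizeof(args));
}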
6205
+
6206
+/**
6207
+ * cgroup_css_set_put_fork - drop references we took during fork
6208
+ * @kargs: the arguments passed to create the child process
6209
+ *
6210
+ * Drop references to the prepared css_set and target cgroup if
6211
+ * CLONE_INTO_CGROUP was requested.
6212
+ */
6213
+static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
6214
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
6215
+{
6216
+ struct cgroup *cgrp = kargs->cgrp;
6217
+ struct css_set *cset = kargs->cset;
6218
+
6219
+ cgroup_threadgroup_change_end(current);
6220
+
6221
+ if (cset) {
6222
+ put_css_set(cset);
6223
+ kargs->cset = NULL;
6224
+ }
6225
+
6226
+ if (kargs->flags & CLONE_INTO_CGROUP) {
6227
+ mutex_unlock(&cgroup_mutex);
6228
+ if (cgrp) {
6229
+ cgroup_put(cgrp);
6230
+ kargs->cgrp = NULL;
6231
+ }
6232
+ }
6233
+}
6234
+
58626235 /**
58636236 * cgroup_can_fork - called on a new task before the process is exposed
5864
- * @child: the task in question.
6237
+ * @child: the child process
58656238 *
5866
- * This calls the subsystem can_fork() callbacks. If the can_fork() callback
5867
- * returns an error, the fork aborts with that error code. This allows for
5868
- * a cgroup subsystem to conditionally allow or deny new forks.
6239
+ * This prepares a new css_set for the child process which the child will
6240
+ * be attached to in cgroup_post_fork().
6241
+ * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
6242
+ * callback returns an error, the fork aborts with that error code. This
6243
+ * allows for a cgroup subsystem to conditionally allow or deny new forks.
58696244 */
5870
-int cgroup_can_fork(struct task_struct *child)
6245
+int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
58716246 {
58726247 struct cgroup_subsys *ss;
58736248 int i, j, ret;
58746249
6250
+ ret = cgroup_css_set_fork(kargs);
6251
+ if (ret)
6252
+ return ret;
6253
+
58756254 do_each_subsys_mask(ss, i, have_canfork_callback) {
5876
- ret = ss->can_fork(child);
6255
+ ret = ss->can_fork(child, kargs->cset);
58776256 if (ret)
58786257 goto out_revert;
58796258 } while_each_subsys_mask();
....@@ -5885,97 +6264,86 @@
58856264 if (j >= i)
58866265 break;
58876266 if (ss->cancel_fork)
5888
- ss->cancel_fork(child);
6267
+ ss->cancel_fork(child, kargs->cset);
58896268 }
6269
+
6270
+ cgroup_css_set_put_fork(kargs);
58906271
58916272 return ret;
58926273 }
58936274
58946275 /**
58956276 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
5896
- * @child: the task in question
6277
+ * @child: the child process
6278
+ * @kargs: the arguments passed to create the child process
58976279 *
58986280 * This calls the cancel_fork() callbacks if a fork failed *after*
5899
- * cgroup_can_fork() succeded.
6281
+ * cgroup_can_fork() succeeded and cleans up references we took to
6282
+ * prepare a new css_set for the child process in cgroup_can_fork().
59006283 */
5901
-void cgroup_cancel_fork(struct task_struct *child)
6284
+void cgroup_cancel_fork(struct task_struct *child,
6285
+ struct kernel_clone_args *kargs)
59026286 {
59036287 struct cgroup_subsys *ss;
59046288 int i;
59056289
59066290 for_each_subsys(ss, i)
59076291 if (ss->cancel_fork)
5908
- ss->cancel_fork(child);
6292
+ ss->cancel_fork(child, kargs->cset);
6293
+
6294
+ cgroup_css_set_put_fork(kargs);
59096295 }
59106296
59116297 /**
5912
- * cgroup_post_fork - called on a new task after adding it to the task list
5913
- * @child: the task in question
6298
+ * cgroup_post_fork - finalize cgroup setup for the child process
6299
+ * @child: the child process
59146300 *
5915
- * Adds the task to the list running through its css_set if necessary and
5916
- * call the subsystem fork() callbacks. Has to be after the task is
5917
- * visible on the task list in case we race with the first call to
5918
- * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5919
- * list.
6301
+ * Attach the child process to its css_set, calling the subsystem fork()
6302
+ * callbacks.
59206303 */
5921
-void cgroup_post_fork(struct task_struct *child)
6304
+void cgroup_post_fork(struct task_struct *child,
6305
+ struct kernel_clone_args *kargs)
6306
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
59226307 {
59236308 struct cgroup_subsys *ss;
6309
+ struct css_set *cset;
59246310 int i;
59256311
5926
- /*
5927
- * This may race against cgroup_enable_task_cg_lists(). As that
5928
- * function sets use_task_css_set_links before grabbing
5929
- * tasklist_lock and we just went through tasklist_lock to add
5930
- * @child, it's guaranteed that either we see the set
5931
- * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
5932
- * @child during its iteration.
5933
- *
5934
- * If we won the race, @child is associated with %current's
5935
- * css_set. Grabbing css_set_lock guarantees both that the
5936
- * association is stable, and, on completion of the parent's
5937
- * migration, @child is visible in the source of migration or
5938
- * already in the destination cgroup. This guarantee is necessary
5939
- * when implementing operations which need to migrate all tasks of
5940
- * a cgroup to another.
5941
- *
5942
- * Note that if we lose to cgroup_enable_task_cg_lists(), @child
5943
- * will remain in init_css_set. This is safe because all tasks are
5944
- * in the init_css_set before cg_links is enabled and there's no
5945
- * operation which transfers all tasks out of init_css_set.
5946
- */
5947
- if (use_task_css_set_links) {
5948
- struct css_set *cset;
6312
+ cset = kargs->cset;
6313
+ kargs->cset = NULL;
59496314
5950
- spin_lock_irq(&css_set_lock);
5951
- cset = task_css_set(current);
5952
- if (list_empty(&child->cg_list)) {
5953
- get_css_set(cset);
5954
- cset->nr_tasks++;
5955
- css_set_move_task(child, NULL, cset, false);
5956
- }
6315
+ spin_lock_irq(&css_set_lock);
6316
+
6317
+ /* init tasks are special, only link regular threads */
6318
+ if (likely(child->pid)) {
6319
+ WARN_ON_ONCE(!list_empty(&child->cg_list));
6320
+ cset->nr_tasks++;
6321
+ css_set_move_task(child, NULL, cset, false);
6322
+ } else {
6323
+ put_css_set(cset);
6324
+ cset = NULL;
6325
+ }
6326
+
6327
+ /*
6328
+ * If the cgroup has to be frozen, the new task has too. Let's set
6329
+ * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
6330
+ * frozen state.
6331
+ */
6332
+ if (unlikely(cgroup_task_freeze(child))) {
6333
+ spin_lock(&child->sighand->siglock);
6334
+ WARN_ON_ONCE(child->frozen);
6335
+ child->jobctl |= JOBCTL_TRAP_FREEZE;
6336
+ spin_unlock(&child->sighand->siglock);
59576337
59586338 /*
5959
- * If the cgroup has to be frozen, the new task has too.
5960
- * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get
5961
- * the task into the frozen state.
6339
+ * Calling cgroup_update_frozen() isn't required here,
6340
+ * because it will be called anyway a bit later from
6341
+ * do_freezer_trap(). So we avoid cgroup's transient switch
6342
+ * from the frozen state and back.
59626343 */
5963
- if (unlikely(cgroup_task_freeze(child))) {
5964
- spin_lock(&child->sighand->siglock);
5965
- WARN_ON_ONCE(child->frozen);
5966
- child->jobctl |= JOBCTL_TRAP_FREEZE;
5967
- spin_unlock(&child->sighand->siglock);
5968
-
5969
- /*
5970
- * Calling cgroup_update_frozen() isn't required here,
5971
- * because it will be called anyway a bit later
5972
- * from do_freezer_trap(). So we avoid cgroup's
5973
- * transient switch from the frozen state and back.
5974
- */
5975
- }
5976
-
5977
- spin_unlock_irq(&css_set_lock);
59786344 }
6345
+
6346
+ spin_unlock_irq(&css_set_lock);
59796347
59806348 /*
59816349 * Call ss->fork(). This must happen after @child is linked on
....@@ -5985,26 +6353,25 @@
59856353 do_each_subsys_mask(ss, i, have_fork_callback) {
59866354 ss->fork(child);
59876355 } while_each_subsys_mask();
6356
+
6357
+ /* Make the new cset the root_cset of the new cgroup namespace. */
6358
+ if (kargs->flags & CLONE_NEWCGROUP) {
6359
+ struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
6360
+
6361
+ get_css_set(cset);
6362
+ child->nsproxy->cgroup_ns->root_cset = cset;
6363
+ put_css_set(rcset);
6364
+ }
6365
+
6366
+ cgroup_css_set_put_fork(kargs);
59886367 }
59896368
59906369 /**
59916370 * cgroup_exit - detach cgroup from exiting task
59926371 * @tsk: pointer to task_struct of exiting process
59936372 *
5994
- * Description: Detach cgroup from @tsk and release it.
6373
+ * Description: Detach cgroup from @tsk.
59956374 *
5996
- * Note that cgroups marked notify_on_release force every task in
5997
- * them to take the global cgroup_mutex mutex when exiting.
5998
- * This could impact scaling on very large systems. Be reluctant to
5999
- * use notify_on_release cgroups where very high task exit scaling
6000
- * is required on large systems.
6001
- *
6002
- * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
6003
- * call cgroup_exit() while the task is still competent to handle
6004
- * notify_on_release(), then leave the task attached to the root cgroup in
6005
- * each hierarchy for the remainder of its exit. No need to bother with
6006
- * init_css_set refcnting. init_css_set never goes away and we can't race
6007
- * with migration path - PF_EXITING is visible to migration path.
60086375 */
60096376 void cgroup_exit(struct task_struct *tsk)
60106377 {
....@@ -6012,27 +6379,22 @@
60126379 struct css_set *cset;
60136380 int i;
60146381
6015
- /*
6016
- * Unlink from @tsk from its css_set. As migration path can't race
6017
- * with us, we can check css_set and cg_list without synchronization.
6018
- */
6382
+ spin_lock_irq(&css_set_lock);
6383
+
6384
+ WARN_ON_ONCE(list_empty(&tsk->cg_list));
60196385 cset = task_css_set(tsk);
6386
+ css_set_move_task(tsk, cset, NULL, false);
6387
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
6388
+ cset->nr_tasks--;
60206389
6021
- if (!list_empty(&tsk->cg_list)) {
6022
- spin_lock_irq(&css_set_lock);
6023
- css_set_move_task(tsk, cset, NULL, false);
6024
- list_add_tail(&tsk->cg_list, &cset->dying_tasks);
6025
- cset->nr_tasks--;
6390
+ if (dl_task(tsk))
6391
+ dec_dl_tasks_cs(tsk);
60266392
6027
- if (unlikely(cgroup_task_frozen(tsk)))
6028
- cgroup_freezer_frozen_exit(tsk);
6029
- else if (unlikely(cgroup_task_freeze(tsk)))
6030
- cgroup_update_frozen(task_dfl_cgroup(tsk));
6393
+ WARN_ON_ONCE(cgroup_task_frozen(tsk));
6394
+ if (unlikely(cgroup_task_freeze(tsk)))
6395
+ cgroup_update_frozen(task_dfl_cgroup(tsk));
60316396
6032
- spin_unlock_irq(&css_set_lock);
6033
- } else {
6034
- get_css_set(cset);
6035
- }
6397
+ spin_unlock_irq(&css_set_lock);
60366398
60376399 /* see cgroup_post_fork() for details */
60386400 do_each_subsys_mask(ss, i, have_exit_callback) {
....@@ -6049,12 +6411,10 @@
60496411 ss->release(task);
60506412 } while_each_subsys_mask();
60516413
6052
- if (use_task_css_set_links) {
6053
- spin_lock_irq(&css_set_lock);
6054
- css_set_skip_task_iters(task_css_set(task), task);
6055
- list_del_init(&task->cg_list);
6056
- spin_unlock_irq(&css_set_lock);
6057
- }
6414
+ spin_lock_irq(&css_set_lock);
6415
+ css_set_skip_task_iters(task_css_set(task), task);
6416
+ list_del_init(&task->cg_list);
6417
+ spin_unlock_irq(&css_set_lock);
60586418 }
60596419
60606420 void cgroup_free(struct task_struct *task)
....@@ -6095,6 +6455,16 @@
60956455 return 1;
60966456 }
60976457 __setup("cgroup_disable=", cgroup_disable);
6458
+
6459
+void __init __weak enable_debug_cgroup(void) { }
6460
+
6461
+static int __init enable_cgroup_debug(char *str)
6462
+{
6463
+ cgroup_debug = true;
6464
+ enable_debug_cgroup();
6465
+ return 1;
6466
+}
6467
+__setup("cgroup_debug", enable_cgroup_debug);
60986468
60996469 /**
61006470 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
....@@ -6195,7 +6565,6 @@
61956565 */
61966566 struct cgroup *cgroup_get_from_fd(int fd)
61976567 {
6198
- struct cgroup_subsys_state *css;
61996568 struct cgroup *cgrp;
62006569 struct file *f;
62016570
....@@ -6203,17 +6572,8 @@
62036572 if (!f)
62046573 return ERR_PTR(-EBADF);
62056574
6206
- css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
6575
+ cgrp = cgroup_get_from_file(f);
62076576 fput(f);
6208
- if (IS_ERR(css))
6209
- return ERR_CAST(css);
6210
-
6211
- cgrp = css->cgroup;
6212
- if (!cgroup_on_dfl(cgrp)) {
6213
- cgroup_put(cgrp);
6214
- return ERR_PTR(-EBADF);
6215
- }
6216
-
62176577 return cgrp;
62186578 }
62196579 EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
....@@ -6304,6 +6664,7 @@
63046664 cset = task_css_set(current);
63056665 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
63066666 skcd->val = (unsigned long)cset->dfl_cgrp;
6667
+ cgroup_bpf_get(cset->dfl_cgrp);
63076668 break;
63086669 }
63096670 cpu_relax();
....@@ -6314,7 +6675,6 @@
63146675
63156676 void cgroup_sk_clone(struct sock_cgroup_data *skcd)
63166677 {
6317
- /* Socket clone path */
63186678 if (skcd->val) {
63196679 if (skcd->no_refcnt)
63206680 return;
....@@ -6324,40 +6684,48 @@
63246684 * Don't use cgroup_get_live().
63256685 */
63266686 cgroup_get(sock_cgroup_ptr(skcd));
6687
+ cgroup_bpf_get(sock_cgroup_ptr(skcd));
63276688 }
63286689 }
63296690
63306691 void cgroup_sk_free(struct sock_cgroup_data *skcd)
63316692 {
6693
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
6694
+
63326695 if (skcd->no_refcnt)
63336696 return;
6334
-
6335
- cgroup_put(sock_cgroup_ptr(skcd));
6697
+ cgroup_bpf_put(cgrp);
6698
+ cgroup_put(cgrp);
63366699 }
63376700
63386701 #endif /* CONFIG_SOCK_CGROUP_DATA */
63396702
63406703 #ifdef CONFIG_CGROUP_BPF
6341
-int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
6342
- enum bpf_attach_type type, u32 flags)
6704
+int cgroup_bpf_attach(struct cgroup *cgrp,
6705
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
6706
+ struct bpf_cgroup_link *link,
6707
+ enum bpf_attach_type type,
6708
+ u32 flags)
63436709 {
63446710 int ret;
63456711
63466712 mutex_lock(&cgroup_mutex);
6347
- ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
6713
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
63486714 mutex_unlock(&cgroup_mutex);
63496715 return ret;
63506716 }
6717
+
63516718 int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
6352
- enum bpf_attach_type type, u32 flags)
6719
+ enum bpf_attach_type type)
63536720 {
63546721 int ret;
63556722
63566723 mutex_lock(&cgroup_mutex);
6357
- ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
6724
+ ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
63586725 mutex_unlock(&cgroup_mutex);
63596726 return ret;
63606727 }
6728
+
63616729 int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
63626730 union bpf_attr __user *uattr)
63636731 {
....@@ -6418,7 +6786,10 @@
64186786 static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
64196787 char *buf)
64206788 {
6421
- return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
6789
+ return snprintf(buf, PAGE_SIZE,
6790
+ "nsdelegate\n"
6791
+ "memory_localevents\n"
6792
+ "memory_recursiveprot\n");
64226793 }
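/*
 * Editor's illustration, not part of this patch: userspace can discover which
 * cgroup2 mount options this kernel advertises by reading
 * /sys/kernel/cgroup/features, the file that features_show() above backs.
 * A minimal sketch; error handling is kept short.
 */
#include <stdio.h>

static void print_cgroup2_features(void)
{
	char line[64];
	FILE *f = fopen("/sys/kernel/cgroup/features", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "nsdelegate", "memory_recursiveprot" */
	fclose(f);
}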
64236794 static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
64246795