2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/kernel/cgroup/cgroup.c
....@@ -54,12 +54,16 @@
5454 #include <linux/proc_ns.h>
5555 #include <linux/nsproxy.h>
5656 #include <linux/file.h>
57 +#include <linux/fs_parser.h>
5758 #include <linux/sched/cputime.h>
5859 #include <linux/psi.h>
5960 #include <net/sock.h>
6061
6162 #define CREATE_TRACE_POINTS
6263 #include <trace/events/cgroup.h>
64 +#undef CREATE_TRACE_POINTS
65 +
66 +#include <trace/hooks/cgroup.h>
6367
6468 #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
6569 MAX_CFTYPE_NAME + 2)
....@@ -86,6 +90,7 @@
8690
8791 DEFINE_SPINLOCK(trace_cgroup_path_lock);
8892 char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
93 +bool cgroup_debug __read_mostly;
8994
9095 /*
9196 * Protects cgroup_idr and css_idr so that IDs can be released without
....@@ -99,7 +104,7 @@
99104 */
100105 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
101106
102 -struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
107 +DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
103108
104109 #define cgroup_assert_mutex_or_rcu_locked() \
105110 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
....@@ -151,11 +156,7 @@
151156
152157 static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
153158
154 -/*
155 - * The default hierarchy, reserved for the subsystems that are otherwise
156 - * unattached - it never has more than a single cgroup, and all tasks are
157 - * part of that cgroup.
158 - */
159 +/* the default hierarchy */
159160 struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
160161 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
161162
....@@ -264,9 +265,6 @@
264265 * can be used to test whether a cgroup is on the default hierarchy for
265266 * cases where a subsystem should behave differnetly depending on the
266267 * interface version.
267 - *
268 - * The set of behaviors which change on the default hierarchy are still
269 - * being determined and the mount option is prefixed with __DEVEL__.
270268 *
271269 * List of changed behaviors:
272270 *
....@@ -502,7 +500,7 @@
502500
503501 rcu_read_lock();
504502 css = cgroup_css(cgrp, ss);
505 - if (!css || !css_tryget_online(css))
503 + if (css && !css_tryget_online(css))
506504 css = NULL;
507505 rcu_read_unlock();
508506
....@@ -510,7 +508,7 @@
510508 }
511509
512510 /**
513 - * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
511 + * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
514512 * @cgrp: the cgroup of interest
515513 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
516514 *
....@@ -519,8 +517,8 @@
519517 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
520518 * function is guaranteed to return non-NULL css.
521519 */
522 -static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
523 - struct cgroup_subsys *ss)
520 +static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
521 + struct cgroup_subsys *ss)
524522 {
525523 lockdep_assert_held(&cgroup_mutex);
526524
....@@ -538,6 +536,35 @@
538536 }
539537
540538 return cgroup_css(cgrp, ss);
539 +}
540 +
541 +/**
542 + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
543 + * @cgrp: the cgroup of interest
544 + * @ss: the subsystem of interest
545 + *
546 + * Find and get the effective css of @cgrp for @ss. The effective css is
547 + * defined as the matching css of the nearest ancestor including self which
548 + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
549 + * the root css is returned, so this function always returns a valid css.
550 + *
551 + * The returned css is not guaranteed to be online, and therefore it is the
552 + * callers responsiblity to tryget a reference for it.
553 + */
554 +struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
555 + struct cgroup_subsys *ss)
556 +{
557 + struct cgroup_subsys_state *css;
558 +
559 + do {
560 + css = cgroup_css(cgrp, ss);
561 +
562 + if (css)
563 + return css;
564 + cgrp = cgroup_parent(cgrp);
565 + } while (cgrp);
566 +
567 + return init_css_set.subsys[ss->id];
541568 }
542569
543570 /**
....@@ -655,10 +682,11 @@
655682 *
656683 * Should be called under cgroup_[tree_]mutex.
657684 */
658 -#define for_each_e_css(css, ssid, cgrp) \
659 - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
660 - if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
661 - ; \
685 +#define for_each_e_css(css, ssid, cgrp) \
686 + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
687 + if (!((css) = cgroup_e_css_by_mask(cgrp, \
688 + cgroup_subsys[(ssid)]))) \
689 + ; \
662690 else
663691
664692 /**
....@@ -718,25 +746,28 @@
718746 * reference-counted, to improve performance when child cgroups
719747 * haven't been created.
720748 */
721 -struct css_set init_css_set = {
722 - .refcount = REFCOUNT_INIT(1),
723 - .dom_cset = &init_css_set,
724 - .tasks = LIST_HEAD_INIT(init_css_set.tasks),
725 - .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
726 - .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
727 - .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
728 - .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
729 - .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
730 - .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
731 - .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
732 -
733 - /*
734 - * The following field is re-initialized when this cset gets linked
735 - * in cgroup_init(). However, let's initialize the field
736 - * statically too so that the default cgroup can be accessed safely
737 - * early during boot.
738 - */
739 - .dfl_cgrp = &cgrp_dfl_root.cgrp,
749 +struct ext_css_set init_ext_css_set = {
750 + .cset = {
751 + .refcount = REFCOUNT_INIT(1),
752 + .dom_cset = &init_css_set,
753 + .tasks = LIST_HEAD_INIT(init_css_set.tasks),
754 + .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
755 + .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
756 + .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
757 + .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
758 + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
759 + .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
760 + .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
761 + /*
762 + * The following field is re-initialized when this cset gets linked
763 + * in cgroup_init(). However, let's initialize the field
764 + * statically too so that the default cgroup can be accessed safely
765 + * early during boot.
766 + */
767 + .dfl_cgrp = &cgrp_dfl_root.cgrp,
768 + },
769 + .mg_src_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_src_preload_node),
770 + .mg_dst_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_dst_preload_node),
740771 };
741772
742773 static int css_set_count = 1; /* 1 for init_css_set */
....@@ -802,6 +833,8 @@
802833 break;
803834
804835 cgroup1_check_for_release(cgrp);
836 + TRACE_CGROUP_PATH(notify_populated, cgrp,
837 + cgroup_is_populated(cgrp));
805838 cgroup_file_notify(&cgrp->events_file);
806839
807840 child = cgrp;
....@@ -881,8 +914,7 @@
881914 /*
882915 * We are synchronized through cgroup_threadgroup_rwsem
883916 * against PF_EXITING setting such that we can't race
884 - * against cgroup_exit() changing the css_set to
885 - * init_css_set and dropping the old one.
917 + * against cgroup_exit()/cgroup_free() dropping the css_set.
886918 */
887919 WARN_ON_ONCE(task->flags & PF_EXITING);
888920
....@@ -1060,7 +1092,7 @@
10601092 * @ss is in this hierarchy, so we want the
10611093 * effective css from @cgrp.
10621094 */
1063 - template[i] = cgroup_e_css(cgrp, ss);
1095 + template[i] = cgroup_e_css_by_mask(cgrp, ss);
10641096 } else {
10651097 /*
10661098 * @ss is not in this hierarchy, so we don't want
....@@ -1162,6 +1194,7 @@
11621194 struct cgroup *cgrp)
11631195 {
11641196 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
1197 + struct ext_css_set *ext_cset;
11651198 struct css_set *cset;
11661199 struct list_head tmp_links;
11671200 struct cgrp_cset_link *link;
....@@ -1182,9 +1215,10 @@
11821215 if (cset)
11831216 return cset;
11841217
1185 - cset = kzalloc(sizeof(*cset), GFP_KERNEL);
1186 - if (!cset)
1218 + ext_cset = kzalloc(sizeof(*ext_cset), GFP_KERNEL);
1219 + if (!ext_cset)
11871220 return NULL;
1221 + cset = &ext_cset->cset;
11881222
11891223 /* Allocate all the cgrp_cset_link objects that we'll need */
11901224 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
....@@ -1202,6 +1236,8 @@
12021236 INIT_HLIST_NODE(&cset->hlist);
12031237 INIT_LIST_HEAD(&cset->cgrp_links);
12041238 INIT_LIST_HEAD(&cset->mg_preload_node);
1239 + INIT_LIST_HEAD(&ext_cset->mg_src_preload_node);
1240 + INIT_LIST_HEAD(&ext_cset->mg_dst_preload_node);
12051241 INIT_LIST_HEAD(&cset->mg_node);
12061242
12071243 /* Copy the set of subsystem state objects generated in
....@@ -1291,10 +1327,7 @@
12911327
12921328 void cgroup_free_root(struct cgroup_root *root)
12931329 {
1294 - if (root) {
1295 - idr_destroy(&root->cgroup_idr);
1296 - kfree(root);
1297 - }
1330 + kfree(root);
12981331 }
12991332
13001333 static void cgroup_destroy_root(struct cgroup_root *root)
....@@ -1356,6 +1389,8 @@
13561389 cset = current->nsproxy->cgroup_ns->root_cset;
13571390 if (cset == &init_css_set) {
13581391 res = &root->cgrp;
1392 + } else if (root == &cgrp_dfl_root) {
1393 + res = cset->dfl_cgrp;
13591394 } else {
13601395 struct cgrp_cset_link *link;
13611396
....@@ -1412,9 +1447,8 @@
14121447 struct cgroup_root *root)
14131448 {
14141449 /*
1415 - * No need to lock the task - since we hold cgroup_mutex the
1416 - * task can't change groups, so the only thing that can happen
1417 - * is that it exits and its css is set back to init_css_set.
1450 + * No need to lock the task - since we hold css_set_lock the
1451 + * task can't change groups.
14181452 */
14191453 return cset_cgroup_from_root(task_css_set(task), root);
14201454 }
....@@ -1453,12 +1487,15 @@
14531487 struct cgroup_subsys *ss = cft->ss;
14541488
14551489 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1456 - !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
1457 - snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
1458 - cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1490 + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
1491 + const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
1492 +
1493 + snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
1494 + dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
14591495 cft->name);
1460 - else
1496 + } else {
14611497 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1498 + }
14621499 return buf;
14631500 }
14641501
....@@ -1815,26 +1852,42 @@
18151852 return len;
18161853 }
18171854
1818 -static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
1855 +enum cgroup2_param {
1856 + Opt_nsdelegate,
1857 + Opt_memory_localevents,
1858 + Opt_memory_recursiveprot,
1859 + nr__cgroup2_params
1860 +};
1861 +
1862 +static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
1863 + fsparam_flag("nsdelegate", Opt_nsdelegate),
1864 + fsparam_flag("memory_localevents", Opt_memory_localevents),
1865 + fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
1866 + {}
1867 +};
1868 +
1869 +static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
18191870 {
1820 - char *token;
1871 + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1872 + struct fs_parse_result result;
1873 + int opt;
18211874
1822 - *root_flags = 0;
1875 + opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
1876 + if (opt < 0)
1877 + return opt;
18231878
1824 - if (!data || *data == '\0')
1879 + switch (opt) {
1880 + case Opt_nsdelegate:
1881 + ctx->flags |= CGRP_ROOT_NS_DELEGATE;
18251882 return 0;
1826 -
1827 - while ((token = strsep(&data, ",")) != NULL) {
1828 - if (!strcmp(token, "nsdelegate")) {
1829 - *root_flags |= CGRP_ROOT_NS_DELEGATE;
1830 - continue;
1831 - }
1832 -
1833 - pr_err("cgroup2: unknown option \"%s\"\n", token);
1834 - return -EINVAL;
1883 + case Opt_memory_localevents:
1884 + ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1885 + return 0;
1886 + case Opt_memory_recursiveprot:
1887 + ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1888 + return 0;
18351889 }
1836 -
1837 - return 0;
1890 + return -EINVAL;
18381891 }
18391892
18401893 static void apply_cgroup_root_flags(unsigned int root_flags)
....@@ -1844,6 +1897,16 @@
18441897 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
18451898 else
18461899 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1900 +
1901 + if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1902 + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1903 + else
1904 + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1905 +
1906 + if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1907 + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1908 + else
1909 + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
18471910 }
18481911 }
18491912
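For context (annotation, not part of the commit): the hunks above route the cgroup2 mount options through fs_parse() and then apply them to cgrp_dfl_root, so from userspace they remain ordinary mount options. A minimal sketch, assuming a cgroup2 mount point at /sys/fs/cgroup and CAP_SYS_ADMIN; the paths and option mix are illustrative only:

    /* user-space sketch: pass the options handled by cgroup2_parse_param() */
    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* initial mount with two of the new flags */
            if (mount("none", "/sys/fs/cgroup", "cgroup2", 0,
                      "nsdelegate,memory_recursiveprot") < 0) {
                    perror("mount cgroup2");
                    return 1;
            }
            /* a later remount re-parses the option string and
             * apply_cgroup_root_flags() resets anything not listed */
            if (mount("none", "/sys/fs/cgroup", "cgroup2", MS_REMOUNT,
                      "memory_localevents") < 0) {
                    perror("remount cgroup2");
                    return 1;
            }
            return 0;
    }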
....@@ -1851,79 +1914,19 @@
18511914 {
18521915 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
18531916 seq_puts(seq, ",nsdelegate");
1917 + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1918 + seq_puts(seq, ",memory_localevents");
1919 + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1920 + seq_puts(seq, ",memory_recursiveprot");
18541921 return 0;
18551922 }
18561923
1857 -static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1924 +static int cgroup_reconfigure(struct fs_context *fc)
18581925 {
1859 - unsigned int root_flags;
1860 - int ret;
1926 + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
18611927
1862 - ret = parse_cgroup_root_flags(data, &root_flags);
1863 - if (ret)
1864 - return ret;
1865 -
1866 - apply_cgroup_root_flags(root_flags);
1928 + apply_cgroup_root_flags(ctx->flags);
18671929 return 0;
1868 -}
1869 -
1870 -/*
1871 - * To reduce the fork() overhead for systems that are not actually using
1872 - * their cgroups capability, we don't maintain the lists running through
1873 - * each css_set to its tasks until we see the list actually used - in other
1874 - * words after the first mount.
1875 - */
1876 -static bool use_task_css_set_links __read_mostly;
1877 -
1878 -static void cgroup_enable_task_cg_lists(void)
1879 -{
1880 - struct task_struct *p, *g;
1881 -
1882 - /*
1883 - * We need tasklist_lock because RCU is not safe against
1884 - * while_each_thread(). Besides, a forking task that has passed
1885 - * cgroup_post_fork() without seeing use_task_css_set_links = 1
1886 - * is not guaranteed to have its child immediately visible in the
1887 - * tasklist if we walk through it with RCU.
1888 - */
1889 - read_lock(&tasklist_lock);
1890 - spin_lock_irq(&css_set_lock);
1891 -
1892 - if (use_task_css_set_links)
1893 - goto out_unlock;
1894 -
1895 - use_task_css_set_links = true;
1896 -
1897 - do_each_thread(g, p) {
1898 - WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1899 - task_css_set(p) != &init_css_set);
1900 -
1901 - /*
1902 - * We should check if the process is exiting, otherwise
1903 - * it will race with cgroup_exit() in that the list
1904 - * entry won't be deleted though the process has exited.
1905 - * Do it while holding siglock so that we don't end up
1906 - * racing against cgroup_exit().
1907 - *
1908 - * Interrupts were already disabled while acquiring
1909 - * the css_set_lock, so we do not need to disable it
1910 - * again when acquiring the sighand->siglock here.
1911 - */
1912 - spin_lock(&p->sighand->siglock);
1913 - if (!(p->flags & PF_EXITING)) {
1914 - struct css_set *cset = task_css_set(p);
1915 -
1916 - if (!css_set_populated(cset))
1917 - css_set_update_populated(cset, true);
1918 - list_add_tail(&p->cg_list, &cset->tasks);
1919 - get_css_set(cset);
1920 - cset->nr_tasks++;
1921 - }
1922 - spin_unlock(&p->sighand->siglock);
1923 - } while_each_thread(g, p);
1924 -out_unlock:
1925 - spin_unlock_irq(&css_set_lock);
1926 - read_unlock(&tasklist_lock);
19271930 }
19281931
19291932 static void init_cgroup_housekeeping(struct cgroup *cgrp)
....@@ -1951,22 +1954,22 @@
19511954 INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
19521955 }
19531956
1954
-void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
1957
+void init_cgroup_root(struct cgroup_fs_context *ctx)
19551958 {
1959
+ struct cgroup_root *root = ctx->root;
19561960 struct cgroup *cgrp = &root->cgrp;
19571961
19581962 INIT_LIST_HEAD(&root->root_list);
19591963 atomic_set(&root->nr_cgrps, 1);
19601964 cgrp->root = root;
19611965 init_cgroup_housekeeping(cgrp);
1962
- idr_init(&root->cgroup_idr);
19631966
1964
- root->flags = opts->flags;
1965
- if (opts->release_agent)
1966
- strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
1967
- if (opts->name)
1968
- strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
1969
- if (opts->cpuset_clone_children)
1967
+ root->flags = ctx->flags;
1968
+ if (ctx->release_agent)
1969
+ strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
1970
+ if (ctx->name)
1971
+ strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
1972
+ if (ctx->cpuset_clone_children)
19701973 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
19711974 }
19721975
....@@ -1979,12 +1982,6 @@
19791982 int i, ret;
19801983
19811984 lockdep_assert_held(&cgroup_mutex);
1982
-
1983
- ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
1984
- if (ret < 0)
1985
- goto out;
1986
- root_cgrp->id = ret;
1987
- root_cgrp->ancestor_ids[0] = ret;
19881985
19891986 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
19901987 0, GFP_KERNEL);
....@@ -2011,13 +2008,16 @@
20112008
20122009 root->kf_root = kernfs_create_root(kf_sops,
20132010 KERNFS_ROOT_CREATE_DEACTIVATED |
2014
- KERNFS_ROOT_SUPPORT_EXPORTOP,
2011
+ KERNFS_ROOT_SUPPORT_EXPORTOP |
2012
+ KERNFS_ROOT_SUPPORT_USER_XATTR,
20152013 root_cgrp);
20162014 if (IS_ERR(root->kf_root)) {
20172015 ret = PTR_ERR(root->kf_root);
20182016 goto exit_root_id;
20192017 }
20202018 root_cgrp->kn = root->kf_root->kn;
2019
+ WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
2020
+ root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
20212021
20222022 ret = css_populate_dir(&root_cgrp->self);
20232023 if (ret)
....@@ -2055,7 +2055,6 @@
20552055 BUG_ON(!list_empty(&root_cgrp->self.children));
20562056 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
20572057
2058
- kernfs_activate(root_cgrp->kn);
20592058 ret = 0;
20602059 goto out;
20612060
....@@ -2071,91 +2070,117 @@
20712070 return ret;
20722071 }
20732072
2074
-struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
2075
- struct cgroup_root *root, unsigned long magic,
2076
- struct cgroup_namespace *ns)
2073
+int cgroup_do_get_tree(struct fs_context *fc)
20772074 {
2078
- struct dentry *dentry;
2079
- bool new_sb = false;
2075
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2076
+ int ret;
20802077
2081
- dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
2078
+ ctx->kfc.root = ctx->root->kf_root;
2079
+ if (fc->fs_type == &cgroup2_fs_type)
2080
+ ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
2081
+ else
2082
+ ctx->kfc.magic = CGROUP_SUPER_MAGIC;
2083
+ ret = kernfs_get_tree(fc);
20822084
20832085 /*
20842086 * In non-init cgroup namespace, instead of root cgroup's dentry,
20852087 * we return the dentry corresponding to the cgroupns->root_cgrp.
20862088 */
2087
- if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
2089
+ if (!ret && ctx->ns != &init_cgroup_ns) {
20882090 struct dentry *nsdentry;
2089
- struct super_block *sb = dentry->d_sb;
2091
+ struct super_block *sb = fc->root->d_sb;
20902092 struct cgroup *cgrp;
20912093
20922094 mutex_lock(&cgroup_mutex);
20932095 spin_lock_irq(&css_set_lock);
20942096
2095
- cgrp = cset_cgroup_from_root(ns->root_cset, root);
2097
+ cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
20962098
20972099 spin_unlock_irq(&css_set_lock);
20982100 mutex_unlock(&cgroup_mutex);
20992101
21002102 nsdentry = kernfs_node_dentry(cgrp->kn, sb);
2101
- dput(dentry);
2102
- if (IS_ERR(nsdentry))
2103
+ dput(fc->root);
2104
+ if (IS_ERR(nsdentry)) {
21032105 deactivate_locked_super(sb);
2104
- dentry = nsdentry;
2106
+ ret = PTR_ERR(nsdentry);
2107
+ nsdentry = NULL;
2108
+ }
2109
+ fc->root = nsdentry;
21052110 }
21062111
2107
- if (!new_sb)
2108
- cgroup_put(&root->cgrp);
2112
+ if (!ctx->kfc.new_sb_created)
2113
+ cgroup_put(&ctx->root->cgrp);
21092114
2110
- return dentry;
2115
+ return ret;
21112116 }
21122117
2113
-static struct dentry *cgroup_mount(struct file_system_type *fs_type,
2114
- int flags, const char *unused_dev_name,
2115
- void *data)
2118
+/*
2119
+ * Destroy a cgroup filesystem context.
2120
+ */
2121
+static void cgroup_fs_context_free(struct fs_context *fc)
21162122 {
2117
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2118
- struct dentry *dentry;
2123
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2124
+
2125
+ kfree(ctx->name);
2126
+ kfree(ctx->release_agent);
2127
+ put_cgroup_ns(ctx->ns);
2128
+ kernfs_free_fs_context(fc);
2129
+ kfree(ctx);
2130
+}
2131
+
2132
+static int cgroup_get_tree(struct fs_context *fc)
2133
+{
2134
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
21192135 int ret;
21202136
2121
- get_cgroup_ns(ns);
2137
+ cgrp_dfl_visible = true;
2138
+ cgroup_get_live(&cgrp_dfl_root.cgrp);
2139
+ ctx->root = &cgrp_dfl_root;
21222140
2123
- /* Check if the caller has permission to mount. */
2124
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
2125
- put_cgroup_ns(ns);
2126
- return ERR_PTR(-EPERM);
2127
- }
2141
+ ret = cgroup_do_get_tree(fc);
2142
+ if (!ret)
2143
+ apply_cgroup_root_flags(ctx->flags);
2144
+ return ret;
2145
+}
21282146
2129
- /*
2130
- * The first time anyone tries to mount a cgroup, enable the list
2131
- * linking each css_set to its tasks and fix up all existing tasks.
2132
- */
2133
- if (!use_task_css_set_links)
2134
- cgroup_enable_task_cg_lists();
2147
+static const struct fs_context_operations cgroup_fs_context_ops = {
2148
+ .free = cgroup_fs_context_free,
2149
+ .parse_param = cgroup2_parse_param,
2150
+ .get_tree = cgroup_get_tree,
2151
+ .reconfigure = cgroup_reconfigure,
2152
+};
21352153
2136
- if (fs_type == &cgroup2_fs_type) {
2137
- unsigned int root_flags;
2154
+static const struct fs_context_operations cgroup1_fs_context_ops = {
2155
+ .free = cgroup_fs_context_free,
2156
+ .parse_param = cgroup1_parse_param,
2157
+ .get_tree = cgroup1_get_tree,
2158
+ .reconfigure = cgroup1_reconfigure,
2159
+};
21382160
2139
- ret = parse_cgroup_root_flags(data, &root_flags);
2140
- if (ret) {
2141
- put_cgroup_ns(ns);
2142
- return ERR_PTR(ret);
2143
- }
2161
+/*
2162
+ * Initialise the cgroup filesystem creation/reconfiguration context. Notably,
2163
+ * we select the namespace we're going to use.
2164
+ */
2165
+static int cgroup_init_fs_context(struct fs_context *fc)
2166
+{
2167
+ struct cgroup_fs_context *ctx;
21442168
2145
- cgrp_dfl_visible = true;
2146
- cgroup_get_live(&cgrp_dfl_root.cgrp);
2169
+ ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
2170
+ if (!ctx)
2171
+ return -ENOMEM;
21472172
2148
- dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
2149
- CGROUP2_SUPER_MAGIC, ns);
2150
- if (!IS_ERR(dentry))
2151
- apply_cgroup_root_flags(root_flags);
2152
- } else {
2153
- dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
2154
- CGROUP_SUPER_MAGIC, ns);
2155
- }
2156
-
2157
- put_cgroup_ns(ns);
2158
- return dentry;
2173
+ ctx->ns = current->nsproxy->cgroup_ns;
2174
+ get_cgroup_ns(ctx->ns);
2175
+ fc->fs_private = &ctx->kfc;
2176
+ if (fc->fs_type == &cgroup2_fs_type)
2177
+ fc->ops = &cgroup_fs_context_ops;
2178
+ else
2179
+ fc->ops = &cgroup1_fs_context_ops;
2180
+ put_user_ns(fc->user_ns);
2181
+ fc->user_ns = get_user_ns(ctx->ns->user_ns);
2182
+ fc->global = true;
2183
+ return 0;
21592184 }
21602185
21612186 static void cgroup_kill_sb(struct super_block *sb)
....@@ -2171,25 +2196,73 @@
21712196 * And don't kill the default root.
21722197 */
21732198 if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
2174
- !percpu_ref_is_dying(&root->cgrp.self.refcnt))
2199
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
2200
+ cgroup_bpf_offline(&root->cgrp);
21752201 percpu_ref_kill(&root->cgrp.self.refcnt);
2202
+ }
21762203 cgroup_put(&root->cgrp);
21772204 kernfs_kill_sb(sb);
21782205 }
21792206
21802207 struct file_system_type cgroup_fs_type = {
2181
- .name = "cgroup",
2182
- .mount = cgroup_mount,
2183
- .kill_sb = cgroup_kill_sb,
2184
- .fs_flags = FS_USERNS_MOUNT,
2208
+ .name = "cgroup",
2209
+ .init_fs_context = cgroup_init_fs_context,
2210
+ .parameters = cgroup1_fs_parameters,
2211
+ .kill_sb = cgroup_kill_sb,
2212
+ .fs_flags = FS_USERNS_MOUNT,
21852213 };
21862214
21872215 static struct file_system_type cgroup2_fs_type = {
2188
- .name = "cgroup2",
2189
- .mount = cgroup_mount,
2190
- .kill_sb = cgroup_kill_sb,
2191
- .fs_flags = FS_USERNS_MOUNT,
2216
+ .name = "cgroup2",
2217
+ .init_fs_context = cgroup_init_fs_context,
2218
+ .parameters = cgroup2_fs_parameters,
2219
+ .kill_sb = cgroup_kill_sb,
2220
+ .fs_flags = FS_USERNS_MOUNT,
21922221 };
2222
+
2223
+#ifdef CONFIG_CPUSETS
2224
+static const struct fs_context_operations cpuset_fs_context_ops = {
2225
+ .get_tree = cgroup1_get_tree,
2226
+ .free = cgroup_fs_context_free,
2227
+};
2228
+
2229
+/*
2230
+ * This is ugly, but preserves the userspace API for existing cpuset
2231
+ * users. If someone tries to mount the "cpuset" filesystem, we
2232
+ * silently switch it to mount "cgroup" instead
2233
+ */
2234
+static int cpuset_init_fs_context(struct fs_context *fc)
2235
+{
2236
+ char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
2237
+ struct cgroup_fs_context *ctx;
2238
+ int err;
2239
+
2240
+ err = cgroup_init_fs_context(fc);
2241
+ if (err) {
2242
+ kfree(agent);
2243
+ return err;
2244
+ }
2245
+
2246
+ fc->ops = &cpuset_fs_context_ops;
2247
+
2248
+ ctx = cgroup_fc2context(fc);
2249
+ ctx->subsys_mask = 1 << cpuset_cgrp_id;
2250
+ ctx->flags |= CGRP_ROOT_NOPREFIX;
2251
+ ctx->release_agent = agent;
2252
+
2253
+ get_filesystem(&cgroup_fs_type);
2254
+ put_filesystem(fc->fs_type);
2255
+ fc->fs_type = &cgroup_fs_type;
2256
+
2257
+ return 0;
2258
+}
2259
+
2260
+static struct file_system_type cpuset_fs_type = {
2261
+ .name = "cpuset",
2262
+ .init_fs_context = cpuset_init_fs_context,
2263
+ .fs_flags = FS_USERNS_MOUNT,
2264
+};
2265
+#endif
21932266
21942267 int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
21952268 struct cgroup_namespace *ns)
....@@ -2256,6 +2329,47 @@
22562329 EXPORT_SYMBOL_GPL(task_cgroup_path);
22572330
22582331 /**
2332
+ * cgroup_attach_lock - Lock for ->attach()
2333
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
2334
+ *
2335
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
2336
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
2337
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
2338
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
2339
+ * lead to deadlocks.
2340
+ *
2341
+ * Bringing up a CPU may involve creating and destroying tasks which requires
2342
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
2343
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
2344
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
2345
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
2346
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
2347
+ *
2348
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
2349
+ *
2350
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
2351
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
2352
+ * CPU hotplug is disabled on entry.
2353
+ */
2354
+static void cgroup_attach_lock(bool lock_threadgroup)
2355
+{
2356
+ cpus_read_lock();
2357
+ if (lock_threadgroup)
2358
+ percpu_down_write(&cgroup_threadgroup_rwsem);
2359
+}
2360
+
2361
+/**
2362
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
2363
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
2364
+ */
2365
+static void cgroup_attach_unlock(bool lock_threadgroup)
2366
+{
2367
+ if (lock_threadgroup)
2368
+ percpu_up_write(&cgroup_threadgroup_rwsem);
2369
+ cpus_read_unlock();
2370
+}
2371
+
2372
+/**
22592373 * cgroup_migrate_add_task - add a migration target task to a migration context
22602374 * @task: target task
22612375 * @mgctx: target migration context
....@@ -2276,9 +2390,8 @@
22762390 if (task->flags & PF_EXITING)
22772391 return;
22782392
2279
- /* leave @task alone if post_fork() hasn't linked it yet */
2280
- if (list_empty(&task->cg_list))
2281
- return;
2393
+ /* cgroup_threadgroup_rwsem protects racing against forks */
2394
+ WARN_ON_ONCE(list_empty(&task->cg_list));
22822395
22832396 cset = task_css_set(task);
22842397 if (!cset->mg_src_cgrp)
....@@ -2310,6 +2423,7 @@
23102423
23112424 return cgroup_taskset_next(tset, dst_cssp);
23122425 }
2426
+EXPORT_SYMBOL_GPL(cgroup_taskset_first);
23132427
23142428 /**
23152429 * cgroup_taskset_next - iterate to the next task in taskset
....@@ -2356,6 +2470,7 @@
23562470
23572471 return NULL;
23582472 }
2473
+EXPORT_SYMBOL_GPL(cgroup_taskset_next);
23592474
23602475 /**
23612476 * cgroup_taskset_migrate - migrate a taskset
....@@ -2426,6 +2541,7 @@
24262541 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
24272542 if (ss->attach) {
24282543 tset->ssid = ssid;
2544
+ trace_android_vh_cgroup_attach(ss, tset);
24292545 ss->attach(tset);
24302546 }
24312547 } while_each_subsys_mask();
....@@ -2510,22 +2626,28 @@
25102626 */
25112627 void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
25122628 {
2513
- LIST_HEAD(preloaded);
2514
- struct css_set *cset, *tmp_cset;
2629
+ struct ext_css_set *cset, *tmp_cset;
25152630
25162631 lockdep_assert_held(&cgroup_mutex);
25172632
25182633 spin_lock_irq(&css_set_lock);
25192634
2520
- list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
2521
- list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
2635
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
2636
+ mg_src_preload_node) {
2637
+ cset->cset.mg_src_cgrp = NULL;
2638
+ cset->cset.mg_dst_cgrp = NULL;
2639
+ cset->cset.mg_dst_cset = NULL;
2640
+ list_del_init(&cset->mg_src_preload_node);
2641
+ put_css_set_locked(&cset->cset);
2642
+ }
25222643
2523
- list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
2524
- cset->mg_src_cgrp = NULL;
2525
- cset->mg_dst_cgrp = NULL;
2526
- cset->mg_dst_cset = NULL;
2527
- list_del_init(&cset->mg_preload_node);
2528
- put_css_set_locked(cset);
2644
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
2645
+ mg_dst_preload_node) {
2646
+ cset->cset.mg_src_cgrp = NULL;
2647
+ cset->cset.mg_dst_cgrp = NULL;
2648
+ cset->cset.mg_dst_cset = NULL;
2649
+ list_del_init(&cset->mg_dst_preload_node);
2650
+ put_css_set_locked(&cset->cset);
25292651 }
25302652
25312653 spin_unlock_irq(&css_set_lock);
....@@ -2552,6 +2674,7 @@
25522674 struct cgroup_mgctx *mgctx)
25532675 {
25542676 struct cgroup *src_cgrp;
2677
+ struct ext_css_set *ext_src_cset;
25552678
25562679 lockdep_assert_held(&cgroup_mutex);
25572680 lockdep_assert_held(&css_set_lock);
....@@ -2565,8 +2688,9 @@
25652688 return;
25662689
25672690 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2691
+ ext_src_cset = container_of(src_cset, struct ext_css_set, cset);
25682692
2569
- if (!list_empty(&src_cset->mg_preload_node))
2693
+ if (!list_empty(&ext_src_cset->mg_src_preload_node))
25702694 return;
25712695
25722696 WARN_ON(src_cset->mg_src_cgrp);
....@@ -2577,7 +2701,7 @@
25772701 src_cset->mg_src_cgrp = src_cgrp;
25782702 src_cset->mg_dst_cgrp = dst_cgrp;
25792703 get_css_set(src_cset);
2580
- list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
2704
+ list_add_tail(&ext_src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
25812705 }
25822706
25832707 /**
....@@ -2596,20 +2720,23 @@
25962720 */
25972721 int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
25982722 {
2599
- struct css_set *src_cset, *tmp_cset;
2723
+ struct ext_css_set *ext_src_set, *tmp_cset;
26002724
26012725 lockdep_assert_held(&cgroup_mutex);
26022726
26032727 /* look up the dst cset for each src cset and link it to src */
2604
- list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2605
- mg_preload_node) {
2728
+ list_for_each_entry_safe(ext_src_set, tmp_cset, &mgctx->preloaded_src_csets,
2729
+ mg_src_preload_node) {
2730
+ struct css_set *src_cset = &ext_src_set->cset;
26062731 struct css_set *dst_cset;
2732
+ struct ext_css_set *ext_dst_cset;
26072733 struct cgroup_subsys *ss;
26082734 int ssid;
26092735
26102736 dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
26112737 if (!dst_cset)
26122738 return -ENOMEM;
2739
+ ext_dst_cset = container_of(dst_cset, struct ext_css_set, cset);
26132740
26142741 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
26152742
....@@ -2621,7 +2748,7 @@
26212748 if (src_cset == dst_cset) {
26222749 src_cset->mg_src_cgrp = NULL;
26232750 src_cset->mg_dst_cgrp = NULL;
2624
- list_del_init(&src_cset->mg_preload_node);
2751
+ list_del_init(&ext_src_set->mg_src_preload_node);
26252752 put_css_set(src_cset);
26262753 put_css_set(dst_cset);
26272754 continue;
....@@ -2629,8 +2756,8 @@
26292756
26302757 src_cset->mg_dst_cset = dst_cset;
26312758
2632
- if (list_empty(&dst_cset->mg_preload_node))
2633
- list_add_tail(&dst_cset->mg_preload_node,
2759
+ if (list_empty(&ext_dst_cset->mg_dst_preload_node))
2760
+ list_add_tail(&ext_dst_cset->mg_dst_preload_node,
26342761 &mgctx->preloaded_dst_csets);
26352762 else
26362763 put_css_set(dst_cset);
....@@ -2698,11 +2825,7 @@
26982825 {
26992826 DEFINE_CGROUP_MGCTX(mgctx);
27002827 struct task_struct *task;
2701
- int ret;
2702
-
2703
- ret = cgroup_migrate_vet_dst(dst_cgrp);
2704
- if (ret)
2705
- return ret;
2828
+ int ret = 0;
27062829
27072830 /* look up all src csets */
27082831 spin_lock_irq(&css_set_lock);
....@@ -2729,16 +2852,28 @@
27292852 return ret;
27302853 }
27312854
2732
-struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
2733
- __acquires(&cgroup_threadgroup_rwsem)
2855
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
2856
+ bool *threadgroup_locked,
2857
+ struct cgroup *dst_cgrp)
27342858 {
27352859 struct task_struct *tsk;
27362860 pid_t pid;
2861
+ bool force_migration = false;
27372862
27382863 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
27392864 return ERR_PTR(-EINVAL);
27402865
2741
- percpu_down_write(&cgroup_threadgroup_rwsem);
2866
+ /*
2867
+ * If we migrate a single thread, we don't care about threadgroup
2868
+ * stability. If the thread is `current`, it won't exit(2) under our
2869
+ * hands or change PID through exec(2). We exclude
2870
+ * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
2871
+ * callers by cgroup_mutex.
2872
+ * Therefore, we can skip the global lock.
2873
+ */
2874
+ lockdep_assert_held(&cgroup_mutex);
2875
+ *threadgroup_locked = pid || threadgroup;
2876
+ cgroup_attach_lock(*threadgroup_locked);
27422877
27432878 rcu_read_lock();
27442879 if (pid) {
....@@ -2754,13 +2889,16 @@
27542889 if (threadgroup)
27552890 tsk = tsk->group_leader;
27562891
2892
+ if (tsk->flags & PF_KTHREAD)
2893
+ trace_android_rvh_cgroup_force_kthread_migration(tsk, dst_cgrp, &force_migration);
2894
+
27572895 /*
27582896 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
27592897 * If userland migrates such a kthread to a non-root cgroup, it can
27602898 * become trapped in a cpuset, or RT kthread may be born in a
27612899 * cgroup with no rt_runtime allocated. Just say no.
27622900 */
2763
- if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
2901
+ if (!force_migration && (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY))) {
27642902 tsk = ERR_PTR(-EINVAL);
27652903 goto out_unlock_threadgroup;
27662904 }
....@@ -2769,14 +2907,14 @@
27692907 goto out_unlock_rcu;
27702908
27712909 out_unlock_threadgroup:
2772
- percpu_up_write(&cgroup_threadgroup_rwsem);
2910
+ cgroup_attach_unlock(*threadgroup_locked);
2911
+ *threadgroup_locked = false;
27732912 out_unlock_rcu:
27742913 rcu_read_unlock();
27752914 return tsk;
27762915 }
27772916
2778
-void cgroup_procs_write_finish(struct task_struct *task)
2779
- __releases(&cgroup_threadgroup_rwsem)
2917
+void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
27802918 {
27812919 struct cgroup_subsys *ss;
27822920 int ssid;
....@@ -2784,7 +2922,8 @@
27842922 /* release reference from cgroup_procs_write_start() */
27852923 put_task_struct(task);
27862924
2787
- percpu_up_write(&cgroup_threadgroup_rwsem);
2925
+ cgroup_attach_unlock(threadgroup_locked);
2926
+
27882927 for_each_subsys(ss, ssid)
27892928 if (ss->post_attach)
27902929 ss->post_attach();
....@@ -2799,7 +2938,7 @@
27992938 do_each_subsys_mask(ss, ssid, ss_mask) {
28002939 if (printed)
28012940 seq_putc(seq, ' ');
2802
- seq_printf(seq, "%s", ss->name);
2941
+ seq_puts(seq, ss->name);
28032942 printed = true;
28042943 } while_each_subsys_mask();
28052944 if (printed)
....@@ -2838,12 +2977,11 @@
28382977 DEFINE_CGROUP_MGCTX(mgctx);
28392978 struct cgroup_subsys_state *d_css;
28402979 struct cgroup *dsct;
2841
- struct css_set *src_cset;
2980
+ struct ext_css_set *ext_src_set;
2981
+ bool has_tasks;
28422982 int ret;
28432983
28442984 lockdep_assert_held(&cgroup_mutex);
2845
-
2846
- percpu_down_write(&cgroup_threadgroup_rwsem);
28472985
28482986 /* look up all csses currently attached to @cgrp's subtree */
28492987 spin_lock_irq(&css_set_lock);
....@@ -2855,17 +2993,27 @@
28552993 }
28562994 spin_unlock_irq(&css_set_lock);
28572995
2996
+ /*
2997
+ * We need to write-lock threadgroup_rwsem while migrating tasks.
2998
+ * However, if there are no source csets for @cgrp, changing its
2999
+ * controllers isn't gonna produce any task migrations and the
3000
+ * write-locking can be skipped safely.
3001
+ */
3002
+ has_tasks = !list_empty(&mgctx.preloaded_src_csets);
3003
+ cgroup_attach_lock(has_tasks);
3004
+
28583005 /* NULL dst indicates self on default hierarchy */
28593006 ret = cgroup_migrate_prepare_dst(&mgctx);
28603007 if (ret)
28613008 goto out_finish;
28623009
28633010 spin_lock_irq(&css_set_lock);
2864
- list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
3011
+ list_for_each_entry(ext_src_set, &mgctx.preloaded_src_csets,
3012
+ mg_src_preload_node) {
28653013 struct task_struct *task, *ntask;
28663014
28673015 /* all tasks in src_csets need to be migrated */
2868
- list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
3016
+ list_for_each_entry_safe(task, ntask, &ext_src_set->cset.tasks, cg_list)
28693017 cgroup_migrate_add_task(task, &mgctx);
28703018 }
28713019 spin_unlock_irq(&css_set_lock);
....@@ -2873,7 +3021,7 @@
28733021 ret = cgroup_migrate_execute(&mgctx);
28743022 out_finish:
28753023 cgroup_migrate_finish(&mgctx);
2876
- percpu_up_write(&cgroup_threadgroup_rwsem);
3024
+ cgroup_attach_unlock(has_tasks);
28773025 return ret;
28783026 }
28793027
....@@ -3106,7 +3254,7 @@
31063254 return ret;
31073255
31083256 /*
3109
- * At this point, cgroup_e_css() results reflect the new csses
3257
+ * At this point, cgroup_e_css_by_mask() results reflect the new csses
31103258 * making the following cgroup_update_dfl_csses() properly update
31113259 * css associations of all tasks in the subtree.
31123260 */
....@@ -3506,22 +3654,33 @@
35063654 #ifdef CONFIG_PSI
35073655 static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
35083656 {
3509
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
3657
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
3658
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3659
+
3660
+ return psi_show(seq, psi, PSI_IO);
35103661 }
35113662 static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
35123663 {
3513
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
3664
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
3665
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3666
+
3667
+ return psi_show(seq, psi, PSI_MEM);
35143668 }
35153669 static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
35163670 {
3517
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
3671
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
3672
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3673
+
3674
+ return psi_show(seq, psi, PSI_CPU);
35183675 }
35193676
35203677 static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
35213678 size_t nbytes, enum psi_res res)
35223679 {
3680
+ struct cgroup_file_ctx *ctx = of->priv;
35233681 struct psi_trigger *new;
35243682 struct cgroup *cgrp;
3683
+ struct psi_group *psi;
35253684
35263685 cgrp = cgroup_kn_lock_live(of->kn, false);
35273686 if (!cgrp)
....@@ -3530,14 +3689,20 @@
35303689 cgroup_get(cgrp);
35313690 cgroup_kn_unlock(of->kn);
35323691
3533
- new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
3692
+ /* Allow only one trigger per file descriptor */
3693
+ if (ctx->psi.trigger) {
3694
+ cgroup_put(cgrp);
3695
+ return -EBUSY;
3696
+ }
3697
+
3698
+ psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
3699
+ new = psi_trigger_create(psi, buf, nbytes, res);
35343700 if (IS_ERR(new)) {
35353701 cgroup_put(cgrp);
35363702 return PTR_ERR(new);
35373703 }
35383704
3539
- psi_trigger_replace(&of->priv, new);
3540
-
3705
+ smp_store_release(&ctx->psi.trigger, new);
35413706 cgroup_put(cgrp);
35423707
35433708 return nbytes;
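Annotation (not part of the commit): with the per-fd cgroup_file_ctx introduced above, each open pressure file now carries at most one PSI trigger (hence the -EBUSY check). The userspace trigger interface itself is unchanged; a rough sketch against a hypothetical cgroup path, using the documented "<some|full> <stall us> <window us>" trigger format:

    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            /* hypothetical cgroup; any cgroup2 pressure file behaves the same */
            const char *path = "/sys/fs/cgroup/mygroup/memory.pressure";
            const char trig[] = "some 150000 1000000"; /* 150ms stall per 1s window */
            struct pollfd pfd;

            int fd = open(path, O_RDWR | O_NONBLOCK);
            if (fd < 0 || write(fd, trig, strlen(trig) + 1) < 0) {
                    perror("psi trigger");
                    return 1;
            }
            pfd.fd = fd;
            pfd.events = POLLPRI;   /* threshold breaches are delivered as POLLPRI */
            while (poll(&pfd, 1, -1) > 0 && !(pfd.revents & POLLERR))
                    printf("memory pressure event\n");
            close(fd);              /* releasing the fd destroys the trigger */
            return 0;
    }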
....@@ -3567,12 +3732,15 @@
35673732 static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
35683733 poll_table *pt)
35693734 {
3570
- return psi_trigger_poll(&of->priv, of->file, pt);
3735
+ struct cgroup_file_ctx *ctx = of->priv;
3736
+ return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
35713737 }
35723738
35733739 static void cgroup_pressure_release(struct kernfs_open_file *of)
35743740 {
3575
- psi_trigger_replace(&of->priv, NULL);
3741
+ struct cgroup_file_ctx *ctx = of->priv;
3742
+
3743
+ psi_trigger_destroy(ctx->psi.trigger);
35763744 }
35773745
35783746 bool cgroup_psi_enabled(void)
....@@ -3625,28 +3793,50 @@
36253793 static int cgroup_file_open(struct kernfs_open_file *of)
36263794 {
36273795 struct cftype *cft = of->kn->priv;
3796
+ struct cgroup_file_ctx *ctx;
3797
+ int ret;
36283798
3629
- if (cft->open)
3630
- return cft->open(of);
3631
- return 0;
3799
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
3800
+ if (!ctx)
3801
+ return -ENOMEM;
3802
+
3803
+ ctx->ns = current->nsproxy->cgroup_ns;
3804
+ get_cgroup_ns(ctx->ns);
3805
+ of->priv = ctx;
3806
+
3807
+ if (!cft->open)
3808
+ return 0;
3809
+
3810
+ ret = cft->open(of);
3811
+ if (ret) {
3812
+ put_cgroup_ns(ctx->ns);
3813
+ kfree(ctx);
3814
+ }
3815
+ return ret;
36323816 }
36333817
36343818 static void cgroup_file_release(struct kernfs_open_file *of)
36353819 {
36363820 struct cftype *cft = of->kn->priv;
3821
+ struct cgroup_file_ctx *ctx = of->priv;
36373822
36383823 if (cft->release)
36393824 cft->release(of);
3825
+ put_cgroup_ns(ctx->ns);
3826
+ kfree(ctx);
36403827 }
36413828
36423829 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
36433830 size_t nbytes, loff_t off)
36443831 {
3645
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
3832
+ struct cgroup_file_ctx *ctx = of->priv;
36463833 struct cgroup *cgrp = of->kn->parent->priv;
36473834 struct cftype *cft = of->kn->priv;
36483835 struct cgroup_subsys_state *css;
36493836 int ret;
3837
+
3838
+ if (!nbytes)
3839
+ return 0;
36503840
36513841 /*
36523842 * If namespaces are delegation boundaries, disallow writes to
....@@ -3656,7 +3846,7 @@
36563846 */
36573847 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
36583848 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
3659
- ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
3849
+ ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
36603850 return -EPERM;
36613851
36623852 if (cft->write)
....@@ -3843,7 +4033,8 @@
38434033 continue;
38444034 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
38454035 continue;
3846
-
4036
+ if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
4037
+ continue;
38474038 if (is_add) {
38484039 ret = cgroup_add_file(css, cgrp, cft);
38494040 if (ret) {
....@@ -4045,6 +4236,7 @@
40454236 cft->flags |= __CFTYPE_NOT_ON_DFL;
40464237 return cgroup_add_cftypes(ss, cfts);
40474238 }
4239
+EXPORT_SYMBOL_GPL(cgroup_add_legacy_cftypes);
40484240
40494241 /**
40504242 * cgroup_file_notify - generate a file modified event for a cgroup_file
....@@ -4120,7 +4312,8 @@
41204312 } else if (likely(!(pos->flags & CSS_RELEASED))) {
41214313 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
41224314 } else {
4123
- list_for_each_entry_rcu(next, &parent->children, sibling)
4315
+ list_for_each_entry_rcu(next, &parent->children, sibling,
4316
+ lockdep_is_held(&cgroup_mutex))
41244317 if (next->serial_nr > pos->serial_nr)
41254318 break;
41264319 }
....@@ -4133,6 +4326,7 @@
41334326 return next;
41344327 return NULL;
41354328 }
4329
+EXPORT_SYMBOL_GPL(css_next_child);
41364330
41374331 /**
41384332 * css_next_descendant_pre - find the next descendant for pre-order walk
....@@ -4182,6 +4376,7 @@
41824376
41834377 return NULL;
41844378 }
4379
+EXPORT_SYMBOL_GPL(css_next_descendant_pre);
41854380
41864381 /**
41874382 * css_rightmost_descendant - return the rightmost descendant of a css
....@@ -4362,29 +4557,24 @@
43624557
43634558 lockdep_assert_held(&css_set_lock);
43644559
4365
- /* Advance to the next non-empty css_set */
4366
- do {
4367
- cset = css_task_iter_next_css_set(it);
4368
- if (!cset) {
4369
- it->task_pos = NULL;
4370
- return;
4560
+ /* Advance to the next non-empty css_set and find first non-empty tasks list*/
4561
+ while ((cset = css_task_iter_next_css_set(it))) {
4562
+ if (!list_empty(&cset->tasks)) {
4563
+ it->cur_tasks_head = &cset->tasks;
4564
+ break;
4565
+ } else if (!list_empty(&cset->mg_tasks)) {
4566
+ it->cur_tasks_head = &cset->mg_tasks;
4567
+ break;
4568
+ } else if (!list_empty(&cset->dying_tasks)) {
4569
+ it->cur_tasks_head = &cset->dying_tasks;
4570
+ break;
43714571 }
4372
- } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
4373
-
4374
- if (!list_empty(&cset->tasks)) {
4375
- it->task_pos = cset->tasks.next;
4376
- it->cur_tasks_head = &cset->tasks;
4377
- } else if (!list_empty(&cset->mg_tasks)) {
4378
- it->task_pos = cset->mg_tasks.next;
4379
- it->cur_tasks_head = &cset->mg_tasks;
4380
- } else {
4381
- it->task_pos = cset->dying_tasks.next;
4382
- it->cur_tasks_head = &cset->dying_tasks;
43834572 }
4384
-
4385
- it->tasks_head = &cset->tasks;
4386
- it->mg_tasks_head = &cset->mg_tasks;
4387
- it->dying_tasks_head = &cset->dying_tasks;
4573
+ if (!cset) {
4574
+ it->task_pos = NULL;
4575
+ return;
4576
+ }
4577
+ it->task_pos = it->cur_tasks_head->next;
43884578
43894579 /*
43904580 * We don't keep css_sets locked across iteration steps and thus
....@@ -4429,24 +4619,24 @@
44294619 repeat:
44304620 if (it->task_pos) {
44314621 /*
4432
- * Advance iterator to find next entry. cset->tasks is
4433
- * consumed first and then ->mg_tasks. After ->mg_tasks,
4434
- * we move onto the next cset.
4622
+ * Advance iterator to find next entry. We go through cset
4623
+ * tasks, mg_tasks and dying_tasks, when consumed we move onto
4624
+ * the next cset.
44354625 */
44364626 if (it->flags & CSS_TASK_ITER_SKIPPED)
44374627 it->flags &= ~CSS_TASK_ITER_SKIPPED;
44384628 else
44394629 it->task_pos = it->task_pos->next;
44404630
4441
- if (it->task_pos == it->tasks_head) {
4442
- it->task_pos = it->mg_tasks_head->next;
4443
- it->cur_tasks_head = it->mg_tasks_head;
4631
+ if (it->task_pos == &it->cur_cset->tasks) {
4632
+ it->cur_tasks_head = &it->cur_cset->mg_tasks;
4633
+ it->task_pos = it->cur_tasks_head->next;
44444634 }
4445
- if (it->task_pos == it->mg_tasks_head) {
4446
- it->task_pos = it->dying_tasks_head->next;
4447
- it->cur_tasks_head = it->dying_tasks_head;
4635
+ if (it->task_pos == &it->cur_cset->mg_tasks) {
4636
+ it->cur_tasks_head = &it->cur_cset->dying_tasks;
4637
+ it->task_pos = it->cur_tasks_head->next;
44484638 }
4449
- if (it->task_pos == it->dying_tasks_head)
4639
+ if (it->task_pos == &it->cur_cset->dying_tasks)
44504640 css_task_iter_advance_css_set(it);
44514641 } else {
44524642 /* called from start, proceed to the first cset */
....@@ -4464,12 +4654,12 @@
44644654 goto repeat;
44654655
44664656 /* and dying leaders w/o live member threads */
4467
- if (it->cur_tasks_head == it->dying_tasks_head &&
4657
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
44684658 !atomic_read(&task->signal->live))
44694659 goto repeat;
44704660 } else {
44714661 /* skip all dying ones */
4472
- if (it->cur_tasks_head == it->dying_tasks_head)
4662
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
44734663 goto repeat;
44744664 }
44754665 }
....@@ -4488,9 +4678,6 @@
44884678 void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
44894679 struct css_task_iter *it)
44904680 {
4491
- /* no one should try to iterate before mounting cgroups */
4492
- WARN_ON_ONCE(!use_task_css_set_links);
4493
-
44944681 memset(it, 0, sizeof(*it));
44954682
44964683 spin_lock_irq(&css_set_lock);
....@@ -4567,21 +4754,21 @@
45674754
45684755 static void cgroup_procs_release(struct kernfs_open_file *of)
45694756 {
4570
- if (of->priv) {
4571
- css_task_iter_end(of->priv);
4572
- kfree(of->priv);
4573
- }
4757
+ struct cgroup_file_ctx *ctx = of->priv;
4758
+
4759
+ if (ctx->procs.started)
4760
+ css_task_iter_end(&ctx->procs.iter);
45744761 }
45754762
45764763 static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
45774764 {
45784765 struct kernfs_open_file *of = s->private;
4579
- struct css_task_iter *it = of->priv;
4766
+ struct cgroup_file_ctx *ctx = of->priv;
45804767
45814768 if (pos)
45824769 (*pos)++;
45834770
4584
- return css_task_iter_next(it);
4771
+ return css_task_iter_next(&ctx->procs.iter);
45854772 }
45864773
45874774 static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
....@@ -4589,21 +4776,18 @@
45894776 {
45904777 struct kernfs_open_file *of = s->private;
45914778 struct cgroup *cgrp = seq_css(s)->cgroup;
4592
- struct css_task_iter *it = of->priv;
4779
+ struct cgroup_file_ctx *ctx = of->priv;
4780
+ struct css_task_iter *it = &ctx->procs.iter;
45934781
45944782 /*
45954783 * When a seq_file is seeked, it's always traversed sequentially
45964784 * from position 0, so we can simply keep iterating on !0 *pos.
45974785 */
4598
- if (!it) {
4786
+ if (!ctx->procs.started) {
45994787 if (WARN_ON_ONCE((*pos)))
46004788 return ERR_PTR(-EINVAL);
4601
-
4602
- it = kzalloc(sizeof(*it), GFP_KERNEL);
4603
- if (!it)
4604
- return ERR_PTR(-ENOMEM);
4605
- of->priv = it;
46064789 css_task_iter_start(&cgrp->self, iter_flags, it);
4790
+ ctx->procs.started = true;
46074791 } else if (!(*pos)) {
46084792 css_task_iter_end(it);
46094793 css_task_iter_start(&cgrp->self, iter_flags, it);
....@@ -4636,13 +4820,28 @@
46364820 return 0;
46374821 }
46384822
4823
+static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
4824
+{
4825
+ int ret;
4826
+ struct inode *inode;
4827
+
4828
+ lockdep_assert_held(&cgroup_mutex);
4829
+
4830
+ inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
4831
+ if (!inode)
4832
+ return -ENOMEM;
4833
+
4834
+ ret = inode_permission(inode, MAY_WRITE);
4835
+ iput(inode);
4836
+ return ret;
4837
+}
4838
+
46394839 static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
46404840 struct cgroup *dst_cgrp,
4641
- struct super_block *sb)
4841
+ struct super_block *sb,
4842
+ struct cgroup_namespace *ns)
46424843 {
4643
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
46444844 struct cgroup *com_cgrp = src_cgrp;
4645
- struct inode *inode;
46464845 int ret;
46474846
46484847 lockdep_assert_held(&cgroup_mutex);
....@@ -4652,12 +4851,7 @@
46524851 com_cgrp = cgroup_parent(com_cgrp);
46534852
46544853 /* %current should be authorized to migrate to the common ancestor */
4655
- inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
4656
- if (!inode)
4657
- return -ENOMEM;
4658
-
4659
- ret = inode_permission(inode, MAY_WRITE);
4660
- iput(inode);
4854
+ ret = cgroup_may_write(com_cgrp, sb);
46614855 if (ret)
46624856 return ret;
46634857
....@@ -4673,18 +4867,42 @@
46734867 return 0;
46744868 }
46754869
4870
+static int cgroup_attach_permissions(struct cgroup *src_cgrp,
4871
+ struct cgroup *dst_cgrp,
4872
+ struct super_block *sb, bool threadgroup,
4873
+ struct cgroup_namespace *ns)
4874
+{
4875
+ int ret = 0;
4876
+
4877
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
4878
+ if (ret)
4879
+ return ret;
4880
+
4881
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
4882
+ if (ret)
4883
+ return ret;
4884
+
4885
+ if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
4886
+ ret = -EOPNOTSUPP;
4887
+
4888
+ return ret;
4889
+}
4890
+
46764891 static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
46774892 char *buf, size_t nbytes, loff_t off)
46784893 {
4894
+ struct cgroup_file_ctx *ctx = of->priv;
46794895 struct cgroup *src_cgrp, *dst_cgrp;
46804896 struct task_struct *task;
4897
+ const struct cred *saved_cred;
46814898 ssize_t ret;
4899
+ bool threadgroup_locked;
46824900
46834901 dst_cgrp = cgroup_kn_lock_live(of->kn, false);
46844902 if (!dst_cgrp)
46854903 return -ENODEV;
46864904
4687
- task = cgroup_procs_write_start(buf, true);
4905
+ task = cgroup_procs_write_start(buf, true, &threadgroup_locked, dst_cgrp);
46884906 ret = PTR_ERR_OR_ZERO(task);
46894907 if (ret)
46904908 goto out_unlock;
....@@ -4694,15 +4912,23 @@
46944912 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
46954913 spin_unlock_irq(&css_set_lock);
46964914
4697
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
4698
- of->file->f_path.dentry->d_sb);
4915
+ /*
4916
+ * Process and thread migrations follow same delegation rule. Check
4917
+ * permissions using the credentials from file open to protect against
4918
+ * inherited fd attacks.
4919
+ */
4920
+ saved_cred = override_creds(of->file->f_cred);
4921
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
4922
+ of->file->f_path.dentry->d_sb, true,
4923
+ ctx->ns);
4924
+ revert_creds(saved_cred);
46994925 if (ret)
47004926 goto out_finish;
47014927
47024928 ret = cgroup_attach_task(dst_cgrp, task, true);
47034929
47044930 out_finish:
4705
- cgroup_procs_write_finish(task);
4931
+ cgroup_procs_write_finish(task, threadgroup_locked);
47064932 out_unlock:
47074933 cgroup_kn_unlock(of->kn);
47084934
....@@ -4717,9 +4943,12 @@
47174943 static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
47184944 char *buf, size_t nbytes, loff_t off)
47194945 {
4946
+ struct cgroup_file_ctx *ctx = of->priv;
47204947 struct cgroup *src_cgrp, *dst_cgrp;
47214948 struct task_struct *task;
4949
+ const struct cred *saved_cred;
47224950 ssize_t ret;
4951
+ bool threadgroup_locked;
47234952
47244953 buf = strstrip(buf);
47254954
....@@ -4727,7 +4956,7 @@
47274956 if (!dst_cgrp)
47284957 return -ENODEV;
47294958
4730
- task = cgroup_procs_write_start(buf, false);
4959
+ task = cgroup_procs_write_start(buf, false, &threadgroup_locked, dst_cgrp);
47314960 ret = PTR_ERR_OR_ZERO(task);
47324961 if (ret)
47334962 goto out_unlock;
....@@ -4737,21 +4966,23 @@
47374966 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
47384967 spin_unlock_irq(&css_set_lock);
47394968
4740
- /* thread migrations follow the cgroup.procs delegation rule */
4741
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
4742
- of->file->f_path.dentry->d_sb);
4969
+ /*
4970
+ * Process and thread migrations follow same delegation rule. Check
4971
+ * permissions using the credentials from file open to protect against
4972
+ * inherited fd attacks.
4973
+ */
4974
+ saved_cred = override_creds(of->file->f_cred);
4975
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
4976
+ of->file->f_path.dentry->d_sb, false,
4977
+ ctx->ns);
4978
+ revert_creds(saved_cred);
47434979 if (ret)
4744
- goto out_finish;
4745
-
4746
- /* and must be contained in the same domain */
4747
- ret = -EOPNOTSUPP;
4748
- if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
47494980 goto out_finish;
47504981
47514982 ret = cgroup_attach_task(dst_cgrp, task, false);
47524983
47534984 out_finish:
4754
- cgroup_procs_write_finish(task);
4985
+ cgroup_procs_write_finish(task, threadgroup_locked);
47554986 out_unlock:
47564987 cgroup_kn_unlock(of->kn);
47574988
....@@ -4823,13 +5054,12 @@
48235054 },
48245055 {
48255056 .name = "cpu.stat",
4826
- .flags = CFTYPE_NOT_ON_ROOT,
48275057 .seq_show = cpu_stat_show,
48285058 },
48295059 #ifdef CONFIG_PSI
48305060 {
48315061 .name = "io.pressure",
4832
- .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE,
5062
+ .flags = CFTYPE_PRESSURE,
48335063 .seq_show = cgroup_io_pressure_show,
48345064 .write = cgroup_io_pressure_write,
48355065 .poll = cgroup_pressure_poll,
....@@ -4837,7 +5067,7 @@
48375067 },
48385068 {
48395069 .name = "memory.pressure",
4840
- .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE,
5070
+ .flags = CFTYPE_PRESSURE,
48415071 .seq_show = cgroup_memory_pressure_show,
48425072 .write = cgroup_memory_pressure_write,
48435073 .poll = cgroup_pressure_poll,
....@@ -4845,7 +5075,7 @@
48455075 },
48465076 {
48475077 .name = "cpu.pressure",
4848
- .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE,
5078
+ .flags = CFTYPE_PRESSURE,
48495079 .seq_show = cgroup_cpu_pressure_show,
48505080 .write = cgroup_cpu_pressure_write,
48515081 .poll = cgroup_pressure_poll,
....@@ -4964,9 +5194,6 @@
49645194 tcgrp->nr_dying_descendants--;
49655195 spin_unlock_irq(&css_set_lock);
49665196
4967
- cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4968
- cgrp->id = -1;
4969
-
49705197 /*
49715198 * There are two control paths which try to determine
49725199 * cgroup from dentry without going through kernfs -
....@@ -4977,8 +5204,6 @@
49775204 if (cgrp->kn)
49785205 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
49795206 NULL);
4980
-
4981
- cgroup_bpf_put(cgrp);
49825207 }
49835208
49845209 mutex_unlock(&cgroup_mutex);
....@@ -5133,10 +5358,12 @@
51335358 * it isn't associated with its kernfs_node and doesn't have the control
51345359 * mask applied.
51355360 */
5136
-static struct cgroup *cgroup_create(struct cgroup *parent)
5361
+static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
5362
+ umode_t mode)
51375363 {
51385364 struct cgroup_root *root = parent->root;
51395365 struct cgroup *cgrp, *tcgrp;
5366
+ struct kernfs_node *kn;
51405367 int level = parent->level + 1;
51415368 int ret;
51425369
....@@ -5156,15 +5383,13 @@
51565383 goto out_cancel_ref;
51575384 }
51585385
5159
- /*
5160
- * Temporarily set the pointer to NULL, so idr_find() won't return
5161
- * a half-baked cgroup.
5162
- */
5163
- cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
5164
- if (cgrp->id < 0) {
5165
- ret = -ENOMEM;
5386
+ /* create the directory */
5387
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5388
+ if (IS_ERR(kn)) {
5389
+ ret = PTR_ERR(kn);
51665390 goto out_stat_exit;
51675391 }
5392
+ cgrp->kn = kn;
51685393
51695394 init_cgroup_housekeeping(cgrp);
51705395
....@@ -5174,7 +5399,7 @@
51745399
51755400 ret = psi_cgroup_alloc(cgrp);
51765401 if (ret)
5177
- goto out_idr_free;
5402
+ goto out_kernfs_remove;
51785403
51795404 ret = cgroup_bpf_inherit(cgrp);
51805405 if (ret)
....@@ -5198,7 +5423,7 @@
51985423
51995424 spin_lock_irq(&css_set_lock);
52005425 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5201
- cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
5426
+ cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
52025427
52035428 if (tcgrp != cgrp) {
52045429 tcgrp->nr_descendants++;
....@@ -5228,12 +5453,6 @@
52285453 cgroup_get_live(parent);
52295454
52305455 /*
5231
- * @cgrp is now fully operational. If something fails after this
5232
- * point, it'll be released via the normal destruction path.
5233
- */
5234
- cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
5235
-
5236
- /*
52375456 * On the default hierarchy, a child doesn't automatically inherit
52385457 * subtree_control from the parent. Each is configured manually.
52395458 */
....@@ -5246,8 +5465,8 @@
52465465
52475466 out_psi_free:
52485467 psi_cgroup_free(cgrp);
5249
-out_idr_free:
5250
- cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
5468
+out_kernfs_remove:
5469
+ kernfs_remove(cgrp->kn);
52515470 out_stat_exit:
52525471 if (cgroup_on_dfl(parent))
52535472 cgroup_rstat_exit(cgrp);
....@@ -5284,7 +5503,6 @@
52845503 int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
52855504 {
52865505 struct cgroup *parent, *cgrp;
5287
- struct kernfs_node *kn;
52885506 int ret;
52895507
52905508 /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
....@@ -5300,27 +5518,19 @@
53005518 goto out_unlock;
53015519 }
53025520
5303
- cgrp = cgroup_create(parent);
5521
+ cgrp = cgroup_create(parent, name, mode);
53045522 if (IS_ERR(cgrp)) {
53055523 ret = PTR_ERR(cgrp);
53065524 goto out_unlock;
53075525 }
53085526
5309
- /* create the directory */
5310
- kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5311
- if (IS_ERR(kn)) {
5312
- ret = PTR_ERR(kn);
5313
- goto out_destroy;
5314
- }
5315
- cgrp->kn = kn;
5316
-
53175527 /*
53185528 * This extra ref will be put in cgroup_free_fn() and guarantees
53195529 * that @cgrp->kn is always accessible.
53205530 */
5321
- kernfs_get(kn);
5531
+ kernfs_get(cgrp->kn);
53225532
5323
- ret = cgroup_kn_set_ugid(kn);
5533
+ ret = cgroup_kn_set_ugid(cgrp->kn);
53245534 if (ret)
53255535 goto out_destroy;
53265536
....@@ -5335,7 +5545,7 @@
53355545 TRACE_CGROUP_PATH(mkdir, cgrp);
53365546
53375547 /* let's create and online css's */
5338
- kernfs_activate(kn);
5548
+ kernfs_activate(cgrp->kn);
53395549
53405550 ret = 0;
53415551 goto out_unlock;
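
As a reminder of the user-visible side (not part of the patch): cgroup_mkdir() is the kernfs mkdir hook, and with this change cgroup_create() builds the kernfs directory itself before the kernfs_get()/kernfs_activate() calls seen here. The trigger is an ordinary mkdir()/rmdir() in a cgroup mount; the path below is an assumption.

/* Illustrative only: create and remove a child cgroup from userspace. */
#include <sys/stat.h>
#include <unistd.h>

static int make_child_cgroup(void)
{
	/* mkdir on cgroupfs ends up in cgroup_mkdir() -> cgroup_create(). */
	if (mkdir("/sys/fs/cgroup/mygrp", 0755) < 0)
		return -1;

	/* ... enable controllers, migrate tasks, do work ... */

	/* rmdir reaches cgroup_rmdir() once the group has no tasks or children. */
	return rmdir("/sys/fs/cgroup/mygrp");
}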
....@@ -5512,6 +5722,8 @@
55125722
55135723 cgroup1_check_for_release(parent);
55145724
5725
+ cgroup_bpf_offline(cgrp);
5726
+
55155727 /* put the base reference */
55165728 percpu_ref_kill(&cgrp->self.refcnt);
55175729
....@@ -5537,7 +5749,6 @@
55375749
55385750 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
55395751 .show_options = cgroup_show_options,
5540
- .remount_fs = cgroup_remount,
55415752 .mkdir = cgroup_mkdir,
55425753 .rmdir = cgroup_rmdir,
55435754 .show_path = cgroup_show_path,
....@@ -5604,11 +5815,12 @@
56045815 */
56055816 int __init cgroup_init_early(void)
56065817 {
5607
- static struct cgroup_sb_opts __initdata opts;
5818
+ static struct cgroup_fs_context __initdata ctx;
56085819 struct cgroup_subsys *ss;
56095820 int i;
56105821
5611
- init_cgroup_root(&cgrp_dfl_root, &opts);
5822
+ ctx.root = &cgrp_dfl_root;
5823
+ init_cgroup_root(&ctx);
56125824 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
56135825
56145826 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
....@@ -5644,14 +5856,13 @@
56445856 int ssid;
56455857
56465858 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
5647
- BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
56485859 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
56495860 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
56505861
56515862 cgroup_rstat_boot();
56525863
56535864 /*
5654
- * The latency of the synchronize_sched() is too high for cgroups,
5865
+ * The latency of the synchronize_rcu() is too high for cgroups,
56555866 * avoid it at the cost of forcing all readers into the slow path.
56565867 */
56575868 rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
....@@ -5735,6 +5946,9 @@
57355946 WARN_ON(register_filesystem(&cgroup_fs_type));
57365947 WARN_ON(register_filesystem(&cgroup2_fs_type));
57375948 WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
5949
+#ifdef CONFIG_CPUSETS
5950
+ WARN_ON(register_filesystem(&cpuset_fs_type));
5951
+#endif
57385952
57395953 return 0;
57405954 }
....@@ -5755,12 +5969,11 @@
57555969 }
57565970 core_initcall(cgroup_wq_init);
57575971
5758
-void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
5759
- char *buf, size_t buflen)
5972
+void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
57605973 {
57615974 struct kernfs_node *kn;
57625975
5763
- kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
5976
+ kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
57645977 if (!kn)
57655978 return;
57665979 kernfs_path(kn, buf, buflen);
....@@ -5850,8 +6063,7 @@
58506063 * @child: pointer to task_struct of forking parent process.
58516064 *
58526065 * A task is associated with the init_css_set until cgroup_post_fork()
5853
- * attaches it to the parent's css_set. Empty cg_list indicates that
5854
- * @child isn't holding reference to its css_set.
6066
+ * attaches it to the target css_set.
58556067 */
58566068 void cgroup_fork(struct task_struct *child)
58576069 {
....@@ -5859,21 +6071,173 @@
58596071 INIT_LIST_HEAD(&child->cg_list);
58606072 }
58616073
6074
+static struct cgroup *cgroup_get_from_file(struct file *f)
6075
+{
6076
+ struct cgroup_subsys_state *css;
6077
+ struct cgroup *cgrp;
6078
+
6079
+ css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
6080
+ if (IS_ERR(css))
6081
+ return ERR_CAST(css);
6082
+
6083
+ cgrp = css->cgroup;
6084
+ if (!cgroup_on_dfl(cgrp)) {
6085
+ cgroup_put(cgrp);
6086
+ return ERR_PTR(-EBADF);
6087
+ }
6088
+
6089
+ return cgrp;
6090
+}
6091
+
6092
+/**
6093
+ * cgroup_css_set_fork - find or create a css_set for a child process
6094
+ * @kargs: the arguments passed to create the child process
6095
+ *
6096
+ * This function finds or creates a new css_set which the child
6097
+ * process will be attached to in cgroup_post_fork(). By default,
6098
+ * the child process will be given the same css_set as its parent.
6099
+ *
6100
+ * If CLONE_INTO_CGROUP is specified this function will try to find an
6101
+ * existing css_set which includes the requested cgroup and if not create
6102
+ * a new css_set that the child will be attached to later. If this function
6103
+ * succeeds it will hold cgroup_threadgroup_rwsem on return. If
6104
+ * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
6105
+ * before grabbing cgroup_threadgroup_rwsem and will hold a reference
6106
+ * to the target cgroup.
6107
+ */
6108
+static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
6109
+ __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
6110
+{
6111
+ int ret;
6112
+ struct cgroup *dst_cgrp = NULL;
6113
+ struct css_set *cset;
6114
+ struct super_block *sb;
6115
+ struct file *f;
6116
+
6117
+ if (kargs->flags & CLONE_INTO_CGROUP)
6118
+ mutex_lock(&cgroup_mutex);
6119
+
6120
+ cgroup_threadgroup_change_begin(current);
6121
+
6122
+ spin_lock_irq(&css_set_lock);
6123
+ cset = task_css_set(current);
6124
+ get_css_set(cset);
6125
+ spin_unlock_irq(&css_set_lock);
6126
+
6127
+ if (!(kargs->flags & CLONE_INTO_CGROUP)) {
6128
+ kargs->cset = cset;
6129
+ return 0;
6130
+ }
6131
+
6132
+ f = fget_raw(kargs->cgroup);
6133
+ if (!f) {
6134
+ ret = -EBADF;
6135
+ goto err;
6136
+ }
6137
+ sb = f->f_path.dentry->d_sb;
6138
+
6139
+ dst_cgrp = cgroup_get_from_file(f);
6140
+ if (IS_ERR(dst_cgrp)) {
6141
+ ret = PTR_ERR(dst_cgrp);
6142
+ dst_cgrp = NULL;
6143
+ goto err;
6144
+ }
6145
+
6146
+ if (cgroup_is_dead(dst_cgrp)) {
6147
+ ret = -ENODEV;
6148
+ goto err;
6149
+ }
6150
+
6151
+ /*
6152
+ * Verify that the target cgroup is writable for us. This is
6153
+ * usually done by the vfs layer but since we're not going through
6154
+ * the vfs layer here we need to do it "manually".
6155
+ */
6156
+ ret = cgroup_may_write(dst_cgrp, sb);
6157
+ if (ret)
6158
+ goto err;
6159
+
6160
+ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
6161
+ !(kargs->flags & CLONE_THREAD),
6162
+ current->nsproxy->cgroup_ns);
6163
+ if (ret)
6164
+ goto err;
6165
+
6166
+ kargs->cset = find_css_set(cset, dst_cgrp);
6167
+ if (!kargs->cset) {
6168
+ ret = -ENOMEM;
6169
+ goto err;
6170
+ }
6171
+
6172
+ put_css_set(cset);
6173
+ fput(f);
6174
+ kargs->cgrp = dst_cgrp;
6175
+ return ret;
6176
+
6177
+err:
6178
+ cgroup_threadgroup_change_end(current);
6179
+ mutex_unlock(&cgroup_mutex);
6180
+ if (f)
6181
+ fput(f);
6182
+ if (dst_cgrp)
6183
+ cgroup_put(dst_cgrp);
6184
+ put_css_set(cset);
6185
+ if (kargs->cset)
6186
+ put_css_set(kargs->cset);
6187
+ return ret;
6188
+}
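
This helper is the kernel half of CLONE_INTO_CGROUP: the fd passed via clone3() is resolved with fget_raw(), checked with cgroup_may_write() and cgroup_attach_permissions(), and turned into the css_set the child will join. A hedged userspace sketch of that interface follows (not from this patch), assuming uapi headers new enough to provide struct clone_args and CLONE_INTO_CGROUP; there is no glibc wrapper, hence the raw syscall.

/* Illustrative only: start a child directly inside a target cgroup. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/sched.h>	/* struct clone_args, CLONE_INTO_CGROUP */
#include <signal.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static pid_t fork_into_cgroup(const char *cgrp_dir)
{
	int cgroup_fd = open(cgrp_dir, O_RDONLY | O_DIRECTORY);
	struct clone_args args = {
		.flags		= CLONE_INTO_CGROUP,
		.exit_signal	= SIGCHLD,
		.cgroup		= (__u64)cgroup_fd,
	};

	if (cgroup_fd < 0)
		return -1;

	/* The child is attached to the target cgroup's css_set before it runs. */
	return syscall(__NR_clone3, &args, sizeof(args));
}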
6189
+
6190
+/**
6191
+ * cgroup_css_set_put_fork - drop references we took during fork
6192
+ * @kargs: the arguments passed to create the child process
6193
+ *
6194
+ * Drop references to the prepared css_set and target cgroup if
6195
+ * CLONE_INTO_CGROUP was requested.
6196
+ */
6197
+static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
6198
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
6199
+{
6200
+ cgroup_threadgroup_change_end(current);
6201
+
6202
+ if (kargs->flags & CLONE_INTO_CGROUP) {
6203
+ struct cgroup *cgrp = kargs->cgrp;
6204
+ struct css_set *cset = kargs->cset;
6205
+
6206
+ mutex_unlock(&cgroup_mutex);
6207
+
6208
+ if (cset) {
6209
+ put_css_set(cset);
6210
+ kargs->cset = NULL;
6211
+ }
6212
+
6213
+ if (cgrp) {
6214
+ cgroup_put(cgrp);
6215
+ kargs->cgrp = NULL;
6216
+ }
6217
+ }
6218
+}
6219
+
58626220 /**
58636221 * cgroup_can_fork - called on a new task before the process is exposed
5864
- * @child: the task in question.
6222
+ * @child: the child process
58656223 *
5866
- * This calls the subsystem can_fork() callbacks. If the can_fork() callback
5867
- * returns an error, the fork aborts with that error code. This allows for
5868
- * a cgroup subsystem to conditionally allow or deny new forks.
6224
+ * This prepares a new css_set for the child process which the child will
6225
+ * be attached to in cgroup_post_fork().
6226
+ * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
6227
+ * callback returns an error, the fork aborts with that error code. This
6228
+ * allows for a cgroup subsystem to conditionally allow or deny new forks.
58696229 */
5870
-int cgroup_can_fork(struct task_struct *child)
6230
+int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
58716231 {
58726232 struct cgroup_subsys *ss;
58736233 int i, j, ret;
58746234
6235
+ ret = cgroup_css_set_fork(kargs);
6236
+ if (ret)
6237
+ return ret;
6238
+
58756239 do_each_subsys_mask(ss, i, have_canfork_callback) {
5876
- ret = ss->can_fork(child);
6240
+ ret = ss->can_fork(child, kargs->cset);
58776241 if (ret)
58786242 goto out_revert;
58796243 } while_each_subsys_mask();
....@@ -5885,97 +6249,86 @@
58856249 if (j >= i)
58866250 break;
58876251 if (ss->cancel_fork)
5888
- ss->cancel_fork(child);
6252
+ ss->cancel_fork(child, kargs->cset);
58896253 }
6254
+
6255
+ cgroup_css_set_put_fork(kargs);
58906256
58916257 return ret;
58926258 }
58936259
58946260 /**
58956261 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
5896
- * @child: the task in question
6262
+ * @child: the child process
6263
+ * @kargs: the arguments passed to create the child process
58976264 *
58986265 * This calls the cancel_fork() callbacks if a fork failed *after*
5899
- * cgroup_can_fork() succeded.
6266
+ * cgroup_can_fork() succeded and cleans up references we took to
6267
+ * prepare a new css_set for the child process in cgroup_can_fork().
59006268 */
5901
-void cgroup_cancel_fork(struct task_struct *child)
6269
+void cgroup_cancel_fork(struct task_struct *child,
6270
+ struct kernel_clone_args *kargs)
59026271 {
59036272 struct cgroup_subsys *ss;
59046273 int i;
59056274
59066275 for_each_subsys(ss, i)
59076276 if (ss->cancel_fork)
5908
- ss->cancel_fork(child);
6277
+ ss->cancel_fork(child, kargs->cset);
6278
+
6279
+ cgroup_css_set_put_fork(kargs);
59096280 }
59106281
59116282 /**
5912
- * cgroup_post_fork - called on a new task after adding it to the task list
5913
- * @child: the task in question
6283
+ * cgroup_post_fork - finalize cgroup setup for the child process
6284
+ * @child: the child process
59146285 *
5915
- * Adds the task to the list running through its css_set if necessary and
5916
- * call the subsystem fork() callbacks. Has to be after the task is
5917
- * visible on the task list in case we race with the first call to
5918
- * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5919
- * list.
6286
+ * Attach the child process to its css_set calling the subsystem fork()
6287
+ * callbacks.
59206288 */
5921
-void cgroup_post_fork(struct task_struct *child)
6289
+void cgroup_post_fork(struct task_struct *child,
6290
+ struct kernel_clone_args *kargs)
6291
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
59226292 {
59236293 struct cgroup_subsys *ss;
6294
+ struct css_set *cset;
59246295 int i;
59256296
5926
- /*
5927
- * This may race against cgroup_enable_task_cg_lists(). As that
5928
- * function sets use_task_css_set_links before grabbing
5929
- * tasklist_lock and we just went through tasklist_lock to add
5930
- * @child, it's guaranteed that either we see the set
5931
- * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
5932
- * @child during its iteration.
5933
- *
5934
- * If we won the race, @child is associated with %current's
5935
- * css_set. Grabbing css_set_lock guarantees both that the
5936
- * association is stable, and, on completion of the parent's
5937
- * migration, @child is visible in the source of migration or
5938
- * already in the destination cgroup. This guarantee is necessary
5939
- * when implementing operations which need to migrate all tasks of
5940
- * a cgroup to another.
5941
- *
5942
- * Note that if we lose to cgroup_enable_task_cg_lists(), @child
5943
- * will remain in init_css_set. This is safe because all tasks are
5944
- * in the init_css_set before cg_links is enabled and there's no
5945
- * operation which transfers all tasks out of init_css_set.
5946
- */
5947
- if (use_task_css_set_links) {
5948
- struct css_set *cset;
6297
+ cset = kargs->cset;
6298
+ kargs->cset = NULL;
59496299
5950
- spin_lock_irq(&css_set_lock);
5951
- cset = task_css_set(current);
5952
- if (list_empty(&child->cg_list)) {
5953
- get_css_set(cset);
5954
- cset->nr_tasks++;
5955
- css_set_move_task(child, NULL, cset, false);
5956
- }
6300
+ spin_lock_irq(&css_set_lock);
6301
+
6302
+ /* init tasks are special, only link regular threads */
6303
+ if (likely(child->pid)) {
6304
+ WARN_ON_ONCE(!list_empty(&child->cg_list));
6305
+ cset->nr_tasks++;
6306
+ css_set_move_task(child, NULL, cset, false);
6307
+ } else {
6308
+ put_css_set(cset);
6309
+ cset = NULL;
6310
+ }
6311
+
6312
+ /*
6313
+ * If the cgroup has to be frozen, the new task has too. Let's set
6314
+ * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
6315
+ * frozen state.
6316
+ */
6317
+ if (unlikely(cgroup_task_freeze(child))) {
6318
+ spin_lock(&child->sighand->siglock);
6319
+ WARN_ON_ONCE(child->frozen);
6320
+ child->jobctl |= JOBCTL_TRAP_FREEZE;
6321
+ spin_unlock(&child->sighand->siglock);
59576322
59586323 /*
5959
- * If the cgroup has to be frozen, the new task has too.
5960
- * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get
5961
- * the task into the frozen state.
6324
+ * Calling cgroup_update_frozen() isn't required here,
6325
+ * because it will be called anyway a bit later from
6326
+ * do_freezer_trap(). So we avoid cgroup's transient switch
6327
+ * from the frozen state and back.
59626328 */
5963
- if (unlikely(cgroup_task_freeze(child))) {
5964
- spin_lock(&child->sighand->siglock);
5965
- WARN_ON_ONCE(child->frozen);
5966
- child->jobctl |= JOBCTL_TRAP_FREEZE;
5967
- spin_unlock(&child->sighand->siglock);
5968
-
5969
- /*
5970
- * Calling cgroup_update_frozen() isn't required here,
5971
- * because it will be called anyway a bit later
5972
- * from do_freezer_trap(). So we avoid cgroup's
5973
- * transient switch from the frozen state and back.
5974
- */
5975
- }
5976
-
5977
- spin_unlock_irq(&css_set_lock);
59786329 }
6330
+
6331
+ spin_unlock_irq(&css_set_lock);
59796332
59806333 /*
59816334 * Call ss->fork(). This must happen after @child is linked on
....@@ -5985,26 +6338,25 @@
59856338 do_each_subsys_mask(ss, i, have_fork_callback) {
59866339 ss->fork(child);
59876340 } while_each_subsys_mask();
6341
+
6342
+ /* Make the new cset the root_cset of the new cgroup namespace. */
6343
+ if (kargs->flags & CLONE_NEWCGROUP) {
6344
+ struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
6345
+
6346
+ get_css_set(cset);
6347
+ child->nsproxy->cgroup_ns->root_cset = cset;
6348
+ put_css_set(rcset);
6349
+ }
6350
+
6351
+ cgroup_css_set_put_fork(kargs);
59886352 }
59896353
59906354 /**
59916355 * cgroup_exit - detach cgroup from exiting task
59926356 * @tsk: pointer to task_struct of exiting process
59936357 *
5994
- * Description: Detach cgroup from @tsk and release it.
6358
+ * Description: Detach cgroup from @tsk.
59956359 *
5996
- * Note that cgroups marked notify_on_release force every task in
5997
- * them to take the global cgroup_mutex mutex when exiting.
5998
- * This could impact scaling on very large systems. Be reluctant to
5999
- * use notify_on_release cgroups where very high task exit scaling
6000
- * is required on large systems.
6001
- *
6002
- * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
6003
- * call cgroup_exit() while the task is still competent to handle
6004
- * notify_on_release(), then leave the task attached to the root cgroup in
6005
- * each hierarchy for the remainder of its exit. No need to bother with
6006
- * init_css_set refcnting. init_css_set never goes away and we can't race
6007
- * with migration path - PF_EXITING is visible to migration path.
60086360 */
60096361 void cgroup_exit(struct task_struct *tsk)
60106362 {
....@@ -6012,27 +6364,19 @@
60126364 struct css_set *cset;
60136365 int i;
60146366
6015
- /*
6016
- * Unlink from @tsk from its css_set. As migration path can't race
6017
- * with us, we can check css_set and cg_list without synchronization.
6018
- */
6367
+ spin_lock_irq(&css_set_lock);
6368
+
6369
+ WARN_ON_ONCE(list_empty(&tsk->cg_list));
60196370 cset = task_css_set(tsk);
6371
+ css_set_move_task(tsk, cset, NULL, false);
6372
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
6373
+ cset->nr_tasks--;
60206374
6021
- if (!list_empty(&tsk->cg_list)) {
6022
- spin_lock_irq(&css_set_lock);
6023
- css_set_move_task(tsk, cset, NULL, false);
6024
- list_add_tail(&tsk->cg_list, &cset->dying_tasks);
6025
- cset->nr_tasks--;
6375
+ WARN_ON_ONCE(cgroup_task_frozen(tsk));
6376
+ if (unlikely(cgroup_task_freeze(tsk)))
6377
+ cgroup_update_frozen(task_dfl_cgroup(tsk));
60266378
6027
- if (unlikely(cgroup_task_frozen(tsk)))
6028
- cgroup_freezer_frozen_exit(tsk);
6029
- else if (unlikely(cgroup_task_freeze(tsk)))
6030
- cgroup_update_frozen(task_dfl_cgroup(tsk));
6031
-
6032
- spin_unlock_irq(&css_set_lock);
6033
- } else {
6034
- get_css_set(cset);
6035
- }
6379
+ spin_unlock_irq(&css_set_lock);
60366380
60376381 /* see cgroup_post_fork() for details */
60386382 do_each_subsys_mask(ss, i, have_exit_callback) {
....@@ -6049,12 +6393,10 @@
60496393 ss->release(task);
60506394 } while_each_subsys_mask();
60516395
6052
- if (use_task_css_set_links) {
6053
- spin_lock_irq(&css_set_lock);
6054
- css_set_skip_task_iters(task_css_set(task), task);
6055
- list_del_init(&task->cg_list);
6056
- spin_unlock_irq(&css_set_lock);
6057
- }
6396
+ spin_lock_irq(&css_set_lock);
6397
+ css_set_skip_task_iters(task_css_set(task), task);
6398
+ list_del_init(&task->cg_list);
6399
+ spin_unlock_irq(&css_set_lock);
60586400 }
60596401
60606402 void cgroup_free(struct task_struct *task)
....@@ -6095,6 +6437,16 @@
60956437 return 1;
60966438 }
60976439 __setup("cgroup_disable=", cgroup_disable);
6440
+
6441
+void __init __weak enable_debug_cgroup(void) { }
6442
+
6443
+static int __init enable_cgroup_debug(char *str)
6444
+{
6445
+ cgroup_debug = true;
6446
+ enable_debug_cgroup();
6447
+ return 1;
6448
+}
6449
+__setup("cgroup_debug", enable_cgroup_debug);
60986450
60996451 /**
61006452 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
....@@ -6195,7 +6547,6 @@
61956547 */
61966548 struct cgroup *cgroup_get_from_fd(int fd)
61976549 {
6198
- struct cgroup_subsys_state *css;
61996550 struct cgroup *cgrp;
62006551 struct file *f;
62016552
....@@ -6203,17 +6554,8 @@
62036554 if (!f)
62046555 return ERR_PTR(-EBADF);
62056556
6206
- css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
6557
+ cgrp = cgroup_get_from_file(f);
62076558 fput(f);
6208
- if (IS_ERR(css))
6209
- return ERR_CAST(css);
6210
-
6211
- cgrp = css->cgroup;
6212
- if (!cgroup_on_dfl(cgrp)) {
6213
- cgroup_put(cgrp);
6214
- return ERR_PTR(-EBADF);
6215
- }
6216
-
62176559 return cgrp;
62186560 }
62196561 EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
....@@ -6304,6 +6646,7 @@
63046646 cset = task_css_set(current);
63056647 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
63066648 skcd->val = (unsigned long)cset->dfl_cgrp;
6649
+ cgroup_bpf_get(cset->dfl_cgrp);
63076650 break;
63086651 }
63096652 cpu_relax();
....@@ -6314,7 +6657,6 @@
63146657
63156658 void cgroup_sk_clone(struct sock_cgroup_data *skcd)
63166659 {
6317
- /* Socket clone path */
63186660 if (skcd->val) {
63196661 if (skcd->no_refcnt)
63206662 return;
....@@ -6324,40 +6666,48 @@
63246666 * Don't use cgroup_get_live().
63256667 */
63266668 cgroup_get(sock_cgroup_ptr(skcd));
6669
+ cgroup_bpf_get(sock_cgroup_ptr(skcd));
63276670 }
63286671 }
63296672
63306673 void cgroup_sk_free(struct sock_cgroup_data *skcd)
63316674 {
6675
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
6676
+
63326677 if (skcd->no_refcnt)
63336678 return;
6334
-
6335
- cgroup_put(sock_cgroup_ptr(skcd));
6679
+ cgroup_bpf_put(cgrp);
6680
+ cgroup_put(cgrp);
63366681 }
63376682
63386683 #endif /* CONFIG_SOCK_CGROUP_DATA */
63396684
63406685 #ifdef CONFIG_CGROUP_BPF
6341
-int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
6342
- enum bpf_attach_type type, u32 flags)
6686
+int cgroup_bpf_attach(struct cgroup *cgrp,
6687
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
6688
+ struct bpf_cgroup_link *link,
6689
+ enum bpf_attach_type type,
6690
+ u32 flags)
63436691 {
63446692 int ret;
63456693
63466694 mutex_lock(&cgroup_mutex);
6347
- ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
6695
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
63486696 mutex_unlock(&cgroup_mutex);
63496697 return ret;
63506698 }
6699
+
63516700 int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
6352
- enum bpf_attach_type type, u32 flags)
6701
+ enum bpf_attach_type type)
63536702 {
63546703 int ret;
63556704
63566705 mutex_lock(&cgroup_mutex);
6357
- ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
6706
+ ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
63586707 mutex_unlock(&cgroup_mutex);
63596708 return ret;
63606709 }
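
These wrappers back the bpf(2) BPF_PROG_ATTACH/BPF_PROG_DETACH commands on cgroup fds; the new replace_prog and bpf_cgroup_link parameters correspond, on the userspace side, to the BPF_F_REPLACE flag and link-based attachment. A minimal sketch of the raw attach path (not from this patch), assuming a program fd already obtained from BPF_PROG_LOAD:

/* Illustrative only: attach a loaded cgroup BPF program to a cgroup fd. */
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int attach_to_cgroup(int cgroup_fd, int prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd     = cgroup_fd;			/* fd of the cgroup directory */
	attr.attach_bpf_fd = prog_fd;			/* fd from BPF_PROG_LOAD */
	attr.attach_type   = BPF_CGROUP_INET_INGRESS;
	attr.attach_flags  = BPF_F_ALLOW_MULTI;		/* coexist with other programs */

	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
}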
6710
+
63616711 int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
63626712 union bpf_attr __user *uattr)
63636713 {
....@@ -6418,7 +6768,10 @@
64186768 static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
64196769 char *buf)
64206770 {
6421
- return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
6771
+ return snprintf(buf, PAGE_SIZE,
6772
+ "nsdelegate\n"
6773
+ "memory_localevents\n"
6774
+ "memory_recursiveprot\n");
64226775 }
64236776 static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
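
The strings reported here are the cgroup2 mount options this kernel supports, so userspace can probe them before mounting. A sketch (not from this patch), assuming the attribute is exposed at its usual location /sys/kernel/cgroup/features:

/* Illustrative only: probe advertised cgroup2 features, then mount. */
#include <fcntl.h>
#include <string.h>
#include <sys/mount.h>
#include <unistd.h>

static int mount_cgroup2(const char *mnt)
{
	char buf[256] = { 0 };
	int fd = open("/sys/kernel/cgroup/features", O_RDONLY);

	if (fd >= 0) {
		ssize_t n = read(fd, buf, sizeof(buf) - 1);

		if (n > 0)
			buf[n] = '\0';
		close(fd);
	}

	/* Only request memory_recursiveprot when the kernel advertises it. */
	return mount("none", mnt, "cgroup2", 0,
		     strstr(buf, "memory_recursiveprot") ?
		     "nsdelegate,memory_recursiveprot" : "nsdelegate");
}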
64246777