hc
2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/kernel/cgroup/cgroup-v1.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 #include "cgroup-internal.h"
23
34 #include <linux/ctype.h>
....@@ -13,8 +14,10 @@
1314 #include <linux/delayacct.h>
1415 #include <linux/pid_namespace.h>
1516 #include <linux/cgroupstats.h>
17
+#include <linux/fs_parser.h>
1618
1719 #include <trace/events/cgroup.h>
20
+#include <trace/hooks/cgroup.h>
1821
1922 /*
2023 * pidlists linger the following amount before being destroyed. The goal
....@@ -36,10 +39,7 @@
3639 */
3740 static struct workqueue_struct *cgroup_pidlist_destroy_wq;
3841
39
-/*
40
- * Protects cgroup_subsys->release_agent_path. Modifying it also requires
41
- * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
42
- */
42
+/* protects cgroup_root->release_agent_path */
4343 static DEFINE_SPINLOCK(release_agent_path_lock);
4444
4545 bool cgroup1_ssid_disabled(int ssid)
....@@ -58,6 +58,7 @@
5858 int retval = 0;
5959
6060 mutex_lock(&cgroup_mutex);
61
+ cpus_read_lock();
6162 percpu_down_write(&cgroup_threadgroup_rwsem);
6263 for_each_root(root) {
6364 struct cgroup *from_cgrp;
....@@ -74,6 +75,7 @@
7475 break;
7576 }
7677 percpu_up_write(&cgroup_threadgroup_rwsem);
78
+ cpus_read_unlock();
7779 mutex_unlock(&cgroup_mutex);
7880
7981 return retval;
....@@ -190,25 +192,6 @@
190192 };
191193
192194 /*
193
- * The following two functions "fix" the issue where there are more pids
194
- * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
195
- * TODO: replace with a kernel-wide solution to this problem
196
- */
197
-#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
198
-static void *pidlist_allocate(int count)
199
-{
200
- if (PIDLIST_TOO_LARGE(count))
201
- return vmalloc(array_size(count, sizeof(pid_t)));
202
- else
203
- return kmalloc_array(count, sizeof(pid_t), GFP_KERNEL);
204
-}
205
-
206
-static void pidlist_free(void *p)
207
-{
208
- kvfree(p);
209
-}
210
-
211
-/*
212195 * Used to destroy all pidlists lingering waiting for destroy timer. None
213196 * should be left afterwards.
214197 */
....@@ -240,7 +223,7 @@
240223 */
241224 if (!delayed_work_pending(dwork)) {
242225 list_del(&l->links);
243
- pidlist_free(l->list);
226
+ kvfree(l->list);
244227 put_pid_ns(l->key.ns);
245228 tofree = l;
246229 }
....@@ -361,7 +344,7 @@
361344 * show up until sometime later on.
362345 */
363346 length = cgroup_task_count(cgrp);
364
- array = pidlist_allocate(length);
347
+ array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);
365348 if (!array)
366349 return -ENOMEM;
367350 /* now, populate the array */
....@@ -386,12 +369,12 @@
386369
387370 l = cgroup_pidlist_find_create(cgrp, type);
388371 if (!l) {
389
- pidlist_free(array);
372
+ kvfree(array);
390373 return -ENOMEM;
391374 }
392375
393376 /* store array, freeing old if necessary */
394
- pidlist_free(l->list);
377
+ kvfree(l->list);
395378 l->list = array;
396379 l->length = length;
397380 *lp = l;
....@@ -413,6 +396,7 @@
413396 * next pid to display, if any
414397 */
415398 struct kernfs_open_file *of = s->private;
399
+ struct cgroup_file_ctx *ctx = of->priv;
416400 struct cgroup *cgrp = seq_css(s)->cgroup;
417401 struct cgroup_pidlist *l;
418402 enum cgroup_filetype type = seq_cft(s)->private;
....@@ -422,25 +406,24 @@
422406 mutex_lock(&cgrp->pidlist_mutex);
423407
424408 /*
425
- * !NULL @of->priv indicates that this isn't the first start()
426
- * after open. If the matching pidlist is around, we can use that.
427
- * Look for it. Note that @of->priv can't be used directly. It
428
- * could already have been destroyed.
409
+ * !NULL @ctx->procs1.pidlist indicates that this isn't the first
410
+ * start() after open. If the matching pidlist is around, we can use
411
+ * that. Look for it. Note that @ctx->procs1.pidlist can't be used
412
+ * directly. It could already have been destroyed.
429413 */
430
- if (of->priv)
431
- of->priv = cgroup_pidlist_find(cgrp, type);
414
+ if (ctx->procs1.pidlist)
415
+ ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);
432416
433417 /*
434418 * Either this is the first start() after open or the matching
435419 * pidlist has been destroyed in between. Create a new one.
436420 */
437
- if (!of->priv) {
438
- ret = pidlist_array_load(cgrp, type,
439
- (struct cgroup_pidlist **)&of->priv);
421
+ if (!ctx->procs1.pidlist) {
422
+ ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
440423 if (ret)
441424 return ERR_PTR(ret);
442425 }
443
- l = of->priv;
426
+ l = ctx->procs1.pidlist;
444427
445428 if (pid) {
446429 int end = l->length;
....@@ -468,7 +451,8 @@
468451 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
469452 {
470453 struct kernfs_open_file *of = s->private;
471
- struct cgroup_pidlist *l = of->priv;
454
+ struct cgroup_file_ctx *ctx = of->priv;
455
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
472456
473457 if (l)
474458 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
....@@ -479,7 +463,8 @@
479463 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
480464 {
481465 struct kernfs_open_file *of = s->private;
482
- struct cgroup_pidlist *l = of->priv;
466
+ struct cgroup_file_ctx *ctx = of->priv;
467
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
483468 pid_t *p = v;
484469 pid_t *end = l->list + l->length;
485470 /*
....@@ -511,21 +496,23 @@
511496 struct task_struct *task;
512497 const struct cred *cred, *tcred;
513498 ssize_t ret;
499
+ bool locked;
514500
515501 cgrp = cgroup_kn_lock_live(of->kn, false);
516502 if (!cgrp)
517503 return -ENODEV;
518504
519
- task = cgroup_procs_write_start(buf, threadgroup);
505
+ task = cgroup_procs_write_start(buf, threadgroup, &locked, cgrp);
520506 ret = PTR_ERR_OR_ZERO(task);
521507 if (ret)
522508 goto out_unlock;
523509
524510 /*
525
- * Even if we're attaching all tasks in the thread group, we only
526
- * need to check permissions on one of them.
511
+ * Even if we're attaching all tasks in the thread group, we only need
512
+ * to check permissions on one of them. Check permissions using the
513
+ * credentials from file open to protect against inherited fd attacks.
527514 */
528
- cred = current_cred();
515
+ cred = of->file->f_cred;
529516 tcred = get_task_cred(task);
530517 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
531518 !uid_eq(cred->euid, tcred->uid) &&
....@@ -537,9 +524,10 @@
537524 goto out_finish;
538525
539526 ret = cgroup_attach_task(cgrp, task, threadgroup);
527
+ trace_android_vh_cgroup_set_task(ret, task);
540528
541529 out_finish:
542
- cgroup_procs_write_finish(task);
530
+ cgroup_procs_write_finish(task, locked);
543531 out_unlock:
544532 cgroup_kn_unlock(of->kn);
545533
....@@ -562,6 +550,7 @@
562550 char *buf, size_t nbytes, loff_t off)
563551 {
564552 struct cgroup *cgrp;
553
+ struct cgroup_file_ctx *ctx;
565554
566555 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
567556
....@@ -569,8 +558,9 @@
569558 * Release agent gets called with all capabilities,
570559 * require capabilities to set release agent.
571560 */
572
- if ((of->file->f_cred->user_ns != &init_user_ns) ||
573
- !capable(CAP_SYS_ADMIN))
561
+ ctx = of->priv;
562
+ if ((ctx->ns->user_ns != &init_user_ns) ||
563
+ !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
574564 return -EPERM;
575565
576566 cgrp = cgroup_kn_lock_live(of->kn, false);
....@@ -800,22 +790,29 @@
800790 {
801791 struct cgroup *cgrp =
802792 container_of(work, struct cgroup, release_agent_work);
803
- char *pathbuf = NULL, *agentbuf = NULL;
793
+ char *pathbuf, *agentbuf;
804794 char *argv[3], *envp[3];
805795 int ret;
806796
807
- mutex_lock(&cgroup_mutex);
797
+ /* snoop agent path and exit early if empty */
798
+ if (!cgrp->root->release_agent_path[0])
799
+ return;
808800
801
+ /* prepare argument buffers */
809802 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
810
- agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
811
- if (!pathbuf || !agentbuf || !strlen(agentbuf))
812
- goto out;
803
+ agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
804
+ if (!pathbuf || !agentbuf)
805
+ goto out_free;
813806
814
- spin_lock_irq(&css_set_lock);
815
- ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
816
- spin_unlock_irq(&css_set_lock);
807
+ spin_lock(&release_agent_path_lock);
808
+ strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
809
+ spin_unlock(&release_agent_path_lock);
810
+ if (!agentbuf[0])
811
+ goto out_free;
812
+
813
+ ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
817814 if (ret < 0 || ret >= PATH_MAX)
818
- goto out;
815
+ goto out_free;
819816
820817 argv[0] = agentbuf;
821818 argv[1] = pathbuf;
....@@ -826,11 +823,7 @@
826823 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
827824 envp[2] = NULL;
828825
829
- mutex_unlock(&cgroup_mutex);
830826 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
831
- goto out_free;
832
-out:
833
- mutex_unlock(&cgroup_mutex);
834827 out_free:
835828 kfree(agentbuf);
836829 kfree(pathbuf);
....@@ -904,179 +897,203 @@
904897 return 0;
905898 }
906899
907
-static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
900
+enum cgroup1_param {
901
+ Opt_all,
902
+ Opt_clone_children,
903
+ Opt_cpuset_v2_mode,
904
+ Opt_name,
905
+ Opt_none,
906
+ Opt_noprefix,
907
+ Opt_release_agent,
908
+ Opt_xattr,
909
+};
910
+
911
+const struct fs_parameter_spec cgroup1_fs_parameters[] = {
912
+ fsparam_flag ("all", Opt_all),
913
+ fsparam_flag ("clone_children", Opt_clone_children),
914
+ fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
915
+ fsparam_string("name", Opt_name),
916
+ fsparam_flag ("none", Opt_none),
917
+ fsparam_flag ("noprefix", Opt_noprefix),
918
+ fsparam_string("release_agent", Opt_release_agent),
919
+ fsparam_flag ("xattr", Opt_xattr),
920
+ {}
921
+};
922
+
923
+int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
908924 {
909
- char *token, *o = data;
910
- bool all_ss = false, one_ss = false;
911
- u16 mask = U16_MAX;
925
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
912926 struct cgroup_subsys *ss;
913
- int nr_opts = 0;
927
+ struct fs_parse_result result;
928
+ int opt, i;
929
+
930
+ opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
931
+ if (opt == -ENOPARAM) {
932
+ if (strcmp(param->key, "source") == 0) {
933
+ if (param->type != fs_value_is_string)
934
+ return invalf(fc, "Non-string source");
935
+ if (fc->source)
936
+ return invalf(fc, "Multiple sources not supported");
937
+ fc->source = param->string;
938
+ param->string = NULL;
939
+ return 0;
940
+ }
941
+ for_each_subsys(ss, i) {
942
+ if (strcmp(param->key, ss->legacy_name))
943
+ continue;
944
+ if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
945
+ return invalfc(fc, "Disabled controller '%s'",
946
+ param->key);
947
+ ctx->subsys_mask |= (1 << i);
948
+ return 0;
949
+ }
950
+ return invalfc(fc, "Unknown subsys name '%s'", param->key);
951
+ }
952
+ if (opt < 0)
953
+ return opt;
954
+
955
+ switch (opt) {
956
+ case Opt_none:
957
+ /* Explicitly have no subsystems */
958
+ ctx->none = true;
959
+ break;
960
+ case Opt_all:
961
+ ctx->all_ss = true;
962
+ break;
963
+ case Opt_noprefix:
964
+ ctx->flags |= CGRP_ROOT_NOPREFIX;
965
+ break;
966
+ case Opt_clone_children:
967
+ ctx->cpuset_clone_children = true;
968
+ break;
969
+ case Opt_cpuset_v2_mode:
970
+ ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
971
+ break;
972
+ case Opt_xattr:
973
+ ctx->flags |= CGRP_ROOT_XATTR;
974
+ break;
975
+ case Opt_release_agent:
976
+ /* Specifying two release agents is forbidden */
977
+ if (ctx->release_agent)
978
+ return invalfc(fc, "release_agent respecified");
979
+ /*
980
+ * Release agent gets called with all capabilities,
981
+ * require capabilities to set release agent.
982
+ */
983
+ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))
984
+ return invalfc(fc, "Setting release_agent not allowed");
985
+ ctx->release_agent = param->string;
986
+ param->string = NULL;
987
+ break;
988
+ case Opt_name:
989
+ /* blocked by boot param? */
990
+ if (cgroup_no_v1_named)
991
+ return -ENOENT;
992
+ /* Can't specify an empty name */
993
+ if (!param->size)
994
+ return invalfc(fc, "Empty name");
995
+ if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
996
+ return invalfc(fc, "Name too long");
997
+ /* Must match [\w.-]+ */
998
+ for (i = 0; i < param->size; i++) {
999
+ char c = param->string[i];
1000
+ if (isalnum(c))
1001
+ continue;
1002
+ if ((c == '.') || (c == '-') || (c == '_'))
1003
+ continue;
1004
+ return invalfc(fc, "Invalid name");
1005
+ }
1006
+ /* Specifying two names is forbidden */
1007
+ if (ctx->name)
1008
+ return invalfc(fc, "name respecified");
1009
+ ctx->name = param->string;
1010
+ param->string = NULL;
1011
+ break;
1012
+ }
1013
+ return 0;
1014
+}
1015
+
1016
+static int check_cgroupfs_options(struct fs_context *fc)
1017
+{
1018
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1019
+ u16 mask = U16_MAX;
1020
+ u16 enabled = 0;
1021
+ struct cgroup_subsys *ss;
9141022 int i;
9151023
9161024 #ifdef CONFIG_CPUSETS
9171025 mask = ~((u16)1 << cpuset_cgrp_id);
9181026 #endif
1027
+ for_each_subsys(ss, i)
1028
+ if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
1029
+ enabled |= 1 << i;
9191030
920
- memset(opts, 0, sizeof(*opts));
921
-
922
- while ((token = strsep(&o, ",")) != NULL) {
923
- nr_opts++;
924
-
925
- if (!*token)
926
- return -EINVAL;
927
- if (!strcmp(token, "none")) {
928
- /* Explicitly have no subsystems */
929
- opts->none = true;
930
- continue;
931
- }
932
- if (!strcmp(token, "all")) {
933
- /* Mutually exclusive option 'all' + subsystem name */
934
- if (one_ss)
935
- return -EINVAL;
936
- all_ss = true;
937
- continue;
938
- }
939
- if (!strcmp(token, "noprefix")) {
940
- opts->flags |= CGRP_ROOT_NOPREFIX;
941
- continue;
942
- }
943
- if (!strcmp(token, "clone_children")) {
944
- opts->cpuset_clone_children = true;
945
- continue;
946
- }
947
- if (!strcmp(token, "cpuset_v2_mode")) {
948
- opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
949
- continue;
950
- }
951
- if (!strcmp(token, "xattr")) {
952
- opts->flags |= CGRP_ROOT_XATTR;
953
- continue;
954
- }
955
- if (!strncmp(token, "release_agent=", 14)) {
956
- /* Specifying two release agents is forbidden */
957
- if (opts->release_agent)
958
- return -EINVAL;
959
- opts->release_agent =
960
- kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
961
- if (!opts->release_agent)
962
- return -ENOMEM;
963
- continue;
964
- }
965
- if (!strncmp(token, "name=", 5)) {
966
- const char *name = token + 5;
967
-
968
- /* blocked by boot param? */
969
- if (cgroup_no_v1_named)
970
- return -ENOENT;
971
- /* Can't specify an empty name */
972
- if (!strlen(name))
973
- return -EINVAL;
974
- /* Must match [\w.-]+ */
975
- for (i = 0; i < strlen(name); i++) {
976
- char c = name[i];
977
- if (isalnum(c))
978
- continue;
979
- if ((c == '.') || (c == '-') || (c == '_'))
980
- continue;
981
- return -EINVAL;
982
- }
983
- /* Specifying two names is forbidden */
984
- if (opts->name)
985
- return -EINVAL;
986
- opts->name = kstrndup(name,
987
- MAX_CGROUP_ROOT_NAMELEN - 1,
988
- GFP_KERNEL);
989
- if (!opts->name)
990
- return -ENOMEM;
991
-
992
- continue;
993
- }
994
-
995
- for_each_subsys(ss, i) {
996
- if (strcmp(token, ss->legacy_name))
997
- continue;
998
- if (!cgroup_ssid_enabled(i))
999
- continue;
1000
- if (cgroup1_ssid_disabled(i))
1001
- continue;
1002
-
1003
- /* Mutually exclusive option 'all' + subsystem name */
1004
- if (all_ss)
1005
- return -EINVAL;
1006
- opts->subsys_mask |= (1 << i);
1007
- one_ss = true;
1008
-
1009
- break;
1010
- }
1011
- if (i == CGROUP_SUBSYS_COUNT)
1012
- return -ENOENT;
1013
- }
1031
+ ctx->subsys_mask &= enabled;
10141032
10151033 /*
1016
- * If the 'all' option was specified select all the subsystems,
1017
- * otherwise if 'none', 'name=' and a subsystem name options were
1018
- * not specified, let's default to 'all'
1034
+ * In absence of 'none', 'name=' or subsystem name options,
1035
+ * let's default to 'all'.
10191036 */
1020
- if (all_ss || (!one_ss && !opts->none && !opts->name))
1021
- for_each_subsys(ss, i)
1022
- if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
1023
- opts->subsys_mask |= (1 << i);
1037
+ if (!ctx->subsys_mask && !ctx->none && !ctx->name)
1038
+ ctx->all_ss = true;
1039
+
1040
+ if (ctx->all_ss) {
1041
+ /* Mutually exclusive option 'all' + subsystem name */
1042
+ if (ctx->subsys_mask)
1043
+ return invalfc(fc, "subsys name conflicts with all");
1044
+ /* 'all' => select all the subsystems */
1045
+ ctx->subsys_mask = enabled;
1046
+ }
10241047
10251048 /*
10261049 * We either have to specify by name or by subsystems. (So all
10271050 * empty hierarchies must have a name).
10281051 */
1029
- if (!opts->subsys_mask && !opts->name)
1030
- return -EINVAL;
1052
+ if (!ctx->subsys_mask && !ctx->name)
1053
+ return invalfc(fc, "Need name or subsystem set");
10311054
10321055 /*
10331056 * Option noprefix was introduced just for backward compatibility
10341057 * with the old cpuset, so we allow noprefix only if mounting just
10351058 * the cpuset subsystem.
10361059 */
1037
- if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1038
- return -EINVAL;
1060
+ if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
1061
+ return invalfc(fc, "noprefix used incorrectly");
10391062
10401063 /* Can't specify "none" and some subsystems */
1041
- if (opts->subsys_mask && opts->none)
1042
- return -EINVAL;
1064
+ if (ctx->subsys_mask && ctx->none)
1065
+ return invalfc(fc, "none used incorrectly");
10431066
10441067 return 0;
10451068 }
10461069
1047
-static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
1070
+int cgroup1_reconfigure(struct fs_context *fc)
10481071 {
1049
- int ret = 0;
1072
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1073
+ struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
10501074 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1051
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
1052
- struct cgroup_sb_opts opts;
1075
+ int ret = 0;
10531076 u16 added_mask, removed_mask;
10541077
10551078 cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
10561079
10571080 /* See what subsystems are wanted */
1058
- ret = parse_cgroupfs_options(data, &opts);
1081
+ ret = check_cgroupfs_options(fc);
10591082 if (ret)
10601083 goto out_unlock;
10611084
1062
- if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1085
+ if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
10631086 pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
10641087 task_tgid_nr(current), current->comm);
1065
- /* See cgroup1_mount release_agent handling */
1066
- if (opts.release_agent &&
1067
- ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) {
1068
- ret = -EINVAL;
1069
- goto out_unlock;
1070
- }
10711088
1072
- added_mask = opts.subsys_mask & ~root->subsys_mask;
1073
- removed_mask = root->subsys_mask & ~opts.subsys_mask;
1089
+ added_mask = ctx->subsys_mask & ~root->subsys_mask;
1090
+ removed_mask = root->subsys_mask & ~ctx->subsys_mask;
10741091
10751092 /* Don't allow flags or name to change at remount */
1076
- if ((opts.flags ^ root->flags) ||
1077
- (opts.name && strcmp(opts.name, root->name))) {
1078
- pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1079
- opts.flags, opts.name ?: "", root->flags, root->name);
1093
+ if ((ctx->flags ^ root->flags) ||
1094
+ (ctx->name && strcmp(ctx->name, root->name))) {
1095
+ errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
1096
+ ctx->flags, ctx->name ?: "", root->flags, root->name);
10801097 ret = -EINVAL;
10811098 goto out_unlock;
10821099 }
....@@ -1093,17 +1110,15 @@
10931110
10941111 WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
10951112
1096
- if (opts.release_agent) {
1113
+ if (ctx->release_agent) {
10971114 spin_lock(&release_agent_path_lock);
1098
- strcpy(root->release_agent_path, opts.release_agent);
1115
+ strcpy(root->release_agent_path, ctx->release_agent);
10991116 spin_unlock(&release_agent_path_lock);
11001117 }
11011118
11021119 trace_cgroup_remount(root);
11031120
11041121 out_unlock:
1105
- kfree(opts.release_agent);
1106
- kfree(opts.name);
11071122 mutex_unlock(&cgroup_mutex);
11081123 return ret;
11091124 }
....@@ -1111,28 +1126,30 @@
11111126 struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
11121127 .rename = cgroup1_rename,
11131128 .show_options = cgroup1_show_options,
1114
- .remount_fs = cgroup1_remount,
11151129 .mkdir = cgroup_mkdir,
11161130 .rmdir = cgroup_rmdir,
11171131 .show_path = cgroup_show_path,
11181132 };
11191133
1120
-struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
1121
- void *data, unsigned long magic,
1122
- struct cgroup_namespace *ns)
1134
+/*
1135
+ * The guts of cgroup1 mount - find or create cgroup_root to use.
1136
+ * Called with cgroup_mutex held; returns 0 on success, -E... on
1137
+ * error and a positive value when the candidate root is busy dying.
1138
+ * On success it stashes a reference to cgroup_root into given
1139
+ * cgroup_fs_context; that reference is *NOT* counting towards the
1140
+ * cgroup_root refcount.
1141
+ */
1142
+static int cgroup1_root_to_use(struct fs_context *fc)
11231143 {
1124
- struct cgroup_sb_opts opts;
1144
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
11251145 struct cgroup_root *root;
11261146 struct cgroup_subsys *ss;
1127
- struct dentry *dentry;
11281147 int i, ret;
11291148
1130
- cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1131
-
11321149 /* First find the desired set of subsystems */
1133
- ret = parse_cgroupfs_options(data, &opts);
1150
+ ret = check_cgroupfs_options(fc);
11341151 if (ret)
1135
- goto out_unlock;
1152
+ return ret;
11361153
11371154 /*
11381155 * Destruction of cgroup root is asynchronous, so subsystems may
....@@ -1142,16 +1159,12 @@
11421159 * starting. Testing ref liveliness is good enough.
11431160 */
11441161 for_each_subsys(ss, i) {
1145
- if (!(opts.subsys_mask & (1 << i)) ||
1162
+ if (!(ctx->subsys_mask & (1 << i)) ||
11461163 ss->root == &cgrp_dfl_root)
11471164 continue;
11481165
1149
- if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
1150
- mutex_unlock(&cgroup_mutex);
1151
- msleep(10);
1152
- ret = restart_syscall();
1153
- goto out_free;
1154
- }
1166
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
1167
+ return 1; /* restart */
11551168 cgroup_put(&ss->root->cgrp);
11561169 }
11571170
....@@ -1166,8 +1179,8 @@
11661179 * name matches but sybsys_mask doesn't, we should fail.
11671180 * Remember whether name matched.
11681181 */
1169
- if (opts.name) {
1170
- if (strcmp(opts.name, root->name))
1182
+ if (ctx->name) {
1183
+ if (strcmp(ctx->name, root->name))
11711184 continue;
11721185 name_match = true;
11731186 }
....@@ -1176,19 +1189,18 @@
11761189 * If we asked for subsystems (or explicitly for no
11771190 * subsystems) then they must match.
11781191 */
1179
- if ((opts.subsys_mask || opts.none) &&
1180
- (opts.subsys_mask != root->subsys_mask)) {
1192
+ if ((ctx->subsys_mask || ctx->none) &&
1193
+ (ctx->subsys_mask != root->subsys_mask)) {
11811194 if (!name_match)
11821195 continue;
1183
- ret = -EBUSY;
1184
- goto out_unlock;
1196
+ return -EBUSY;
11851197 }
11861198
1187
- if (root->flags ^ opts.flags)
1199
+ if (root->flags ^ ctx->flags)
11881200 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
11891201
1190
- ret = 0;
1191
- goto out_unlock;
1202
+ ctx->root = root;
1203
+ return 0;
11921204 }
11931205
11941206 /*
....@@ -1196,64 +1208,56 @@
11961208 * specification is allowed for already existing hierarchies but we
11971209 * can't create new one without subsys specification.
11981210 */
1199
- if (!opts.subsys_mask && !opts.none) {
1200
- ret = -EINVAL;
1201
- goto out_unlock;
1202
- }
1211
+ if (!ctx->subsys_mask && !ctx->none)
1212
+ return invalfc(fc, "No subsys list or none specified");
12031213
12041214 /* Hierarchies may only be created in the initial cgroup namespace. */
1205
- if (ns != &init_cgroup_ns) {
1206
- ret = -EPERM;
1207
- goto out_unlock;
1208
- }
1209
- /*
1210
- * Release agent gets called with all capabilities,
1211
- * require capabilities to set release agent.
1212
- */
1213
- if (opts.release_agent &&
1214
- ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) {
1215
- ret = -EINVAL;
1216
- goto out_unlock;
1217
- }
1215
+ if (ctx->ns != &init_cgroup_ns)
1216
+ return -EPERM;
12181217
12191218 root = kzalloc(sizeof(*root), GFP_KERNEL);
1220
- if (!root) {
1221
- ret = -ENOMEM;
1222
- goto out_unlock;
1223
- }
1219
+ if (!root)
1220
+ return -ENOMEM;
12241221
1225
- init_cgroup_root(root, &opts);
1222
+ ctx->root = root;
1223
+ init_cgroup_root(ctx);
12261224
1227
- ret = cgroup_setup_root(root, opts.subsys_mask);
1225
+ ret = cgroup_setup_root(root, ctx->subsys_mask);
12281226 if (ret)
12291227 cgroup_free_root(root);
1228
+ return ret;
1229
+}
12301230
1231
-out_unlock:
1232
- if (!ret && !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
1233
- mutex_unlock(&cgroup_mutex);
1234
- msleep(10);
1235
- ret = restart_syscall();
1236
- goto out_free;
1237
- }
1231
+int cgroup1_get_tree(struct fs_context *fc)
1232
+{
1233
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1234
+ int ret;
1235
+
1236
+ /* Check if the caller has permission to mount. */
1237
+ if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
1238
+ return -EPERM;
1239
+
1240
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1241
+
1242
+ ret = cgroup1_root_to_use(fc);
1243
+ if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
1244
+ ret = 1; /* restart */
1245
+
12381246 mutex_unlock(&cgroup_mutex);
1239
-out_free:
1240
- kfree(opts.release_agent);
1241
- kfree(opts.name);
12421247
1243
- if (ret)
1244
- return ERR_PTR(ret);
1248
+ if (!ret)
1249
+ ret = cgroup_do_get_tree(fc);
12451250
1246
- dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
1247
- CGROUP_SUPER_MAGIC, ns);
1248
-
1249
- if (!IS_ERR(dentry) && percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
1250
- struct super_block *sb = dentry->d_sb;
1251
- dput(dentry);
1252
- deactivate_locked_super(sb);
1253
- msleep(10);
1254
- dentry = ERR_PTR(restart_syscall());
1251
+ if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
1252
+ fc_drop_locked(fc);
1253
+ ret = 1;
12551254 }
1256
- return dentry;
1255
+
1256
+ if (unlikely(ret > 0)) {
1257
+ msleep(10);
1258
+ return restart_syscall();
1259
+ }
1260
+ return ret;
12571261 }
12581262
12591263 static int __init cgroup1_wq_init(void)