.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | #include "cgroup-internal.h" |
---|
2 | 3 | |
---|
3 | 4 | #include <linux/ctype.h> |
---|
.. | .. |
---|
13 | 14 | #include <linux/delayacct.h> |
---|
14 | 15 | #include <linux/pid_namespace.h> |
---|
15 | 16 | #include <linux/cgroupstats.h> |
---|
| 17 | +#include <linux/fs_parser.h> |
---|
16 | 18 | |
---|
17 | 19 | #include <trace/events/cgroup.h> |
---|
| 20 | +#include <trace/hooks/cgroup.h> |
---|
18 | 21 | |
---|
19 | 22 | /* |
---|
20 | 23 | * pidlists linger the following amount before being destroyed. The goal |
---|
.. | .. |
---|
36 | 39 | */ |
---|
37 | 40 | static struct workqueue_struct *cgroup_pidlist_destroy_wq; |
---|
38 | 41 | |
---|
39 | | -/* |
---|
40 | | - * Protects cgroup_subsys->release_agent_path. Modifying it also requires |
---|
41 | | - * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. |
---|
42 | | - */ |
---|
| 42 | +/* protects cgroup_root->release_agent_path */ |
---|
43 | 43 | static DEFINE_SPINLOCK(release_agent_path_lock); |
---|
44 | 44 | |
---|
45 | 45 | bool cgroup1_ssid_disabled(int ssid) |
---|
.. | .. |
---|
58 | 58 | int retval = 0; |
---|
59 | 59 | |
---|
60 | 60 | mutex_lock(&cgroup_mutex); |
---|
| 61 | + cpus_read_lock(); |
---|
61 | 62 | percpu_down_write(&cgroup_threadgroup_rwsem); |
---|
62 | 63 | for_each_root(root) { |
---|
63 | 64 | struct cgroup *from_cgrp; |
---|
.. | .. |
---|
74 | 75 | break; |
---|
75 | 76 | } |
---|
76 | 77 | percpu_up_write(&cgroup_threadgroup_rwsem); |
---|
| 78 | + cpus_read_unlock(); |
---|
77 | 79 | mutex_unlock(&cgroup_mutex); |
---|
78 | 80 | |
---|
79 | 81 | return retval; |
---|
.. | .. |
---|
190 | 192 | }; |
---|
191 | 193 | |
---|
192 | 194 | /* |
---|
193 | | - * The following two functions "fix" the issue where there are more pids |
---|
194 | | - * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. |
---|
195 | | - * TODO: replace with a kernel-wide solution to this problem |
---|
196 | | - */ |
---|
197 | | -#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) |
---|
198 | | -static void *pidlist_allocate(int count) |
---|
199 | | -{ |
---|
200 | | - if (PIDLIST_TOO_LARGE(count)) |
---|
201 | | - return vmalloc(array_size(count, sizeof(pid_t))); |
---|
202 | | - else |
---|
203 | | - return kmalloc_array(count, sizeof(pid_t), GFP_KERNEL); |
---|
204 | | -} |
---|
205 | | - |
---|
206 | | -static void pidlist_free(void *p) |
---|
207 | | -{ |
---|
208 | | - kvfree(p); |
---|
209 | | -} |
---|
210 | | - |
---|
211 | | -/* |
---|
212 | 195 | * Used to destroy all pidlists lingering waiting for destroy timer. None |
---|
213 | 196 | * should be left afterwards. |
---|
214 | 197 | */ |
---|
.. | .. |
---|
240 | 223 | */ |
---|
241 | 224 | if (!delayed_work_pending(dwork)) { |
---|
242 | 225 | list_del(&l->links); |
---|
243 | | - pidlist_free(l->list); |
---|
| 226 | + kvfree(l->list); |
---|
244 | 227 | put_pid_ns(l->key.ns); |
---|
245 | 228 | tofree = l; |
---|
246 | 229 | } |
---|
.. | .. |
---|
361 | 344 | * show up until sometime later on. |
---|
362 | 345 | */ |
---|
363 | 346 | length = cgroup_task_count(cgrp); |
---|
364 | | - array = pidlist_allocate(length); |
---|
| 347 | + array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL); |
---|
365 | 348 | if (!array) |
---|
366 | 349 | return -ENOMEM; |
---|
367 | 350 | /* now, populate the array */ |
---|
.. | .. |
---|
386 | 369 | |
---|
387 | 370 | l = cgroup_pidlist_find_create(cgrp, type); |
---|
388 | 371 | if (!l) { |
---|
389 | | - pidlist_free(array); |
---|
| 372 | + kvfree(array); |
---|
390 | 373 | return -ENOMEM; |
---|
391 | 374 | } |
---|
392 | 375 | |
---|
393 | 376 | /* store array, freeing old if necessary */ |
---|
394 | | - pidlist_free(l->list); |
---|
| 377 | + kvfree(l->list); |
---|
395 | 378 | l->list = array; |
---|
396 | 379 | l->length = length; |
---|
397 | 380 | *lp = l; |
---|
.. | .. |
---|
413 | 396 | * next pid to display, if any |
---|
414 | 397 | */ |
---|
415 | 398 | struct kernfs_open_file *of = s->private; |
---|
| 399 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
416 | 400 | struct cgroup *cgrp = seq_css(s)->cgroup; |
---|
417 | 401 | struct cgroup_pidlist *l; |
---|
418 | 402 | enum cgroup_filetype type = seq_cft(s)->private; |
---|
.. | .. |
---|
422 | 406 | mutex_lock(&cgrp->pidlist_mutex); |
---|
423 | 407 | |
---|
424 | 408 | /* |
---|
425 | | - * !NULL @of->priv indicates that this isn't the first start() |
---|
426 | | - * after open. If the matching pidlist is around, we can use that. |
---|
427 | | - * Look for it. Note that @of->priv can't be used directly. It |
---|
428 | | - * could already have been destroyed. |
---|
| 409 | + * !NULL @ctx->procs1.pidlist indicates that this isn't the first |
---|
| 410 | + * start() after open. If the matching pidlist is around, we can use |
---|
| 411 | + * that. Look for it. Note that @ctx->procs1.pidlist can't be used |
---|
| 412 | + * directly. It could already have been destroyed. |
---|
429 | 413 | */ |
---|
430 | | - if (of->priv) |
---|
431 | | - of->priv = cgroup_pidlist_find(cgrp, type); |
---|
| 414 | + if (ctx->procs1.pidlist) |
---|
| 415 | + ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); |
---|
432 | 416 | |
---|
433 | 417 | /* |
---|
434 | 418 | * Either this is the first start() after open or the matching |
---|
435 | 419 | * pidlist has been destroyed in between. Create a new one. |
---|
436 | 420 | */ |
---|
437 | | - if (!of->priv) { |
---|
438 | | - ret = pidlist_array_load(cgrp, type, |
---|
439 | | - (struct cgroup_pidlist **)&of->priv); |
---|
| 421 | + if (!ctx->procs1.pidlist) { |
---|
| 422 | + ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); |
---|
440 | 423 | if (ret) |
---|
441 | 424 | return ERR_PTR(ret); |
---|
442 | 425 | } |
---|
443 | | - l = of->priv; |
---|
| 426 | + l = ctx->procs1.pidlist; |
---|
444 | 427 | |
---|
445 | 428 | if (pid) { |
---|
446 | 429 | int end = l->length; |
---|
.. | .. |
---|
468 | 451 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) |
---|
469 | 452 | { |
---|
470 | 453 | struct kernfs_open_file *of = s->private; |
---|
471 | | - struct cgroup_pidlist *l = of->priv; |
---|
| 454 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
| 455 | + struct cgroup_pidlist *l = ctx->procs1.pidlist; |
---|
472 | 456 | |
---|
473 | 457 | if (l) |
---|
474 | 458 | mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, |
---|
.. | .. |
---|
479 | 463 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) |
---|
480 | 464 | { |
---|
481 | 465 | struct kernfs_open_file *of = s->private; |
---|
482 | | - struct cgroup_pidlist *l = of->priv; |
---|
| 466 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
| 467 | + struct cgroup_pidlist *l = ctx->procs1.pidlist; |
---|
483 | 468 | pid_t *p = v; |
---|
484 | 469 | pid_t *end = l->list + l->length; |
---|
485 | 470 | /* |
---|
.. | .. |
---|
511 | 496 | struct task_struct *task; |
---|
512 | 497 | const struct cred *cred, *tcred; |
---|
513 | 498 | ssize_t ret; |
---|
| 499 | + bool locked; |
---|
514 | 500 | |
---|
515 | 501 | cgrp = cgroup_kn_lock_live(of->kn, false); |
---|
516 | 502 | if (!cgrp) |
---|
517 | 503 | return -ENODEV; |
---|
518 | 504 | |
---|
519 | | - task = cgroup_procs_write_start(buf, threadgroup); |
---|
| 505 | + task = cgroup_procs_write_start(buf, threadgroup, &locked, cgrp); |
---|
520 | 506 | ret = PTR_ERR_OR_ZERO(task); |
---|
521 | 507 | if (ret) |
---|
522 | 508 | goto out_unlock; |
---|
523 | 509 | |
---|
524 | 510 | /* |
---|
525 | | - * Even if we're attaching all tasks in the thread group, we only |
---|
526 | | - * need to check permissions on one of them. |
---|
| 511 | + * Even if we're attaching all tasks in the thread group, we only need |
---|
| 512 | + * to check permissions on one of them. Check permissions using the |
---|
| 513 | + * credentials from file open to protect against inherited fd attacks. |
---|
527 | 514 | */ |
---|
528 | | - cred = current_cred(); |
---|
| 515 | + cred = of->file->f_cred; |
---|
529 | 516 | tcred = get_task_cred(task); |
---|
530 | 517 | if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && |
---|
531 | 518 | !uid_eq(cred->euid, tcred->uid) && |
---|
.. | .. |
---|
537 | 524 | goto out_finish; |
---|
538 | 525 | |
---|
539 | 526 | ret = cgroup_attach_task(cgrp, task, threadgroup); |
---|
| 527 | + trace_android_vh_cgroup_set_task(ret, task); |
---|
540 | 528 | |
---|
541 | 529 | out_finish: |
---|
542 | | - cgroup_procs_write_finish(task); |
---|
| 530 | + cgroup_procs_write_finish(task, locked); |
---|
543 | 531 | out_unlock: |
---|
544 | 532 | cgroup_kn_unlock(of->kn); |
---|
545 | 533 | |
---|
.. | .. |
---|
562 | 550 | char *buf, size_t nbytes, loff_t off) |
---|
563 | 551 | { |
---|
564 | 552 | struct cgroup *cgrp; |
---|
| 553 | + struct cgroup_file_ctx *ctx; |
---|
565 | 554 | |
---|
566 | 555 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); |
---|
567 | 556 | |
---|
.. | .. |
---|
569 | 558 | * Release agent gets called with all capabilities, |
---|
570 | 559 | * require capabilities to set release agent. |
---|
571 | 560 | */ |
---|
572 | | - if ((of->file->f_cred->user_ns != &init_user_ns) || |
---|
573 | | - !capable(CAP_SYS_ADMIN)) |
---|
| 561 | + ctx = of->priv; |
---|
| 562 | + if ((ctx->ns->user_ns != &init_user_ns) || |
---|
| 563 | + !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN)) |
---|
574 | 564 | return -EPERM; |
---|
575 | 565 | |
---|
576 | 566 | cgrp = cgroup_kn_lock_live(of->kn, false); |
---|
.. | .. |
---|
800 | 790 | { |
---|
801 | 791 | struct cgroup *cgrp = |
---|
802 | 792 | container_of(work, struct cgroup, release_agent_work); |
---|
803 | | - char *pathbuf = NULL, *agentbuf = NULL; |
---|
| 793 | + char *pathbuf, *agentbuf; |
---|
804 | 794 | char *argv[3], *envp[3]; |
---|
805 | 795 | int ret; |
---|
806 | 796 | |
---|
807 | | - mutex_lock(&cgroup_mutex); |
---|
| 797 | + /* snoop agent path and exit early if empty */ |
---|
| 798 | + if (!cgrp->root->release_agent_path[0]) |
---|
| 799 | + return; |
---|
808 | 800 | |
---|
| 801 | + /* prepare argument buffers */ |
---|
809 | 802 | pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); |
---|
810 | | - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); |
---|
811 | | - if (!pathbuf || !agentbuf || !strlen(agentbuf)) |
---|
812 | | - goto out; |
---|
| 803 | + agentbuf = kmalloc(PATH_MAX, GFP_KERNEL); |
---|
| 804 | + if (!pathbuf || !agentbuf) |
---|
| 805 | + goto out_free; |
---|
813 | 806 | |
---|
814 | | - spin_lock_irq(&css_set_lock); |
---|
815 | | - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); |
---|
816 | | - spin_unlock_irq(&css_set_lock); |
---|
| 807 | + spin_lock(&release_agent_path_lock); |
---|
| 808 | + strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); |
---|
| 809 | + spin_unlock(&release_agent_path_lock); |
---|
| 810 | + if (!agentbuf[0]) |
---|
| 811 | + goto out_free; |
---|
| 812 | + |
---|
| 813 | + ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); |
---|
817 | 814 | if (ret < 0 || ret >= PATH_MAX) |
---|
818 | | - goto out; |
---|
| 815 | + goto out_free; |
---|
819 | 816 | |
---|
820 | 817 | argv[0] = agentbuf; |
---|
821 | 818 | argv[1] = pathbuf; |
---|
.. | .. |
---|
826 | 823 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; |
---|
827 | 824 | envp[2] = NULL; |
---|
828 | 825 | |
---|
829 | | - mutex_unlock(&cgroup_mutex); |
---|
830 | 826 | call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); |
---|
831 | | - goto out_free; |
---|
832 | | -out: |
---|
833 | | - mutex_unlock(&cgroup_mutex); |
---|
834 | 827 | out_free: |
---|
835 | 828 | kfree(agentbuf); |
---|
836 | 829 | kfree(pathbuf); |
---|
.. | .. |
---|
904 | 897 | return 0; |
---|
905 | 898 | } |
---|
906 | 899 | |
---|
907 | | -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) |
---|
| 900 | +enum cgroup1_param { |
---|
| 901 | + Opt_all, |
---|
| 902 | + Opt_clone_children, |
---|
| 903 | + Opt_cpuset_v2_mode, |
---|
| 904 | + Opt_name, |
---|
| 905 | + Opt_none, |
---|
| 906 | + Opt_noprefix, |
---|
| 907 | + Opt_release_agent, |
---|
| 908 | + Opt_xattr, |
---|
| 909 | +}; |
---|
| 910 | + |
---|
| 911 | +const struct fs_parameter_spec cgroup1_fs_parameters[] = { |
---|
| 912 | + fsparam_flag ("all", Opt_all), |
---|
| 913 | + fsparam_flag ("clone_children", Opt_clone_children), |
---|
| 914 | + fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), |
---|
| 915 | + fsparam_string("name", Opt_name), |
---|
| 916 | + fsparam_flag ("none", Opt_none), |
---|
| 917 | + fsparam_flag ("noprefix", Opt_noprefix), |
---|
| 918 | + fsparam_string("release_agent", Opt_release_agent), |
---|
| 919 | + fsparam_flag ("xattr", Opt_xattr), |
---|
| 920 | + {} |
---|
| 921 | +}; |
---|
| 922 | + |
---|
| 923 | +int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) |
---|
908 | 924 | { |
---|
909 | | - char *token, *o = data; |
---|
910 | | - bool all_ss = false, one_ss = false; |
---|
911 | | - u16 mask = U16_MAX; |
---|
| 925 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
912 | 926 | struct cgroup_subsys *ss; |
---|
913 | | - int nr_opts = 0; |
---|
| 927 | + struct fs_parse_result result; |
---|
| 928 | + int opt, i; |
---|
| 929 | + |
---|
| 930 | + opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); |
---|
| 931 | + if (opt == -ENOPARAM) { |
---|
| 932 | + if (strcmp(param->key, "source") == 0) { |
---|
| 933 | + if (param->type != fs_value_is_string) |
---|
| 934 | + return invalf(fc, "Non-string source"); |
---|
| 935 | + if (fc->source) |
---|
| 936 | + return invalf(fc, "Multiple sources not supported"); |
---|
| 937 | + fc->source = param->string; |
---|
| 938 | + param->string = NULL; |
---|
| 939 | + return 0; |
---|
| 940 | + } |
---|
| 941 | + for_each_subsys(ss, i) { |
---|
| 942 | + if (strcmp(param->key, ss->legacy_name)) |
---|
| 943 | + continue; |
---|
| 944 | + if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) |
---|
| 945 | + return invalfc(fc, "Disabled controller '%s'", |
---|
| 946 | + param->key); |
---|
| 947 | + ctx->subsys_mask |= (1 << i); |
---|
| 948 | + return 0; |
---|
| 949 | + } |
---|
| 950 | + return invalfc(fc, "Unknown subsys name '%s'", param->key); |
---|
| 951 | + } |
---|
| 952 | + if (opt < 0) |
---|
| 953 | + return opt; |
---|
| 954 | + |
---|
| 955 | + switch (opt) { |
---|
| 956 | + case Opt_none: |
---|
| 957 | + /* Explicitly have no subsystems */ |
---|
| 958 | + ctx->none = true; |
---|
| 959 | + break; |
---|
| 960 | + case Opt_all: |
---|
| 961 | + ctx->all_ss = true; |
---|
| 962 | + break; |
---|
| 963 | + case Opt_noprefix: |
---|
| 964 | + ctx->flags |= CGRP_ROOT_NOPREFIX; |
---|
| 965 | + break; |
---|
| 966 | + case Opt_clone_children: |
---|
| 967 | + ctx->cpuset_clone_children = true; |
---|
| 968 | + break; |
---|
| 969 | + case Opt_cpuset_v2_mode: |
---|
| 970 | + ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; |
---|
| 971 | + break; |
---|
| 972 | + case Opt_xattr: |
---|
| 973 | + ctx->flags |= CGRP_ROOT_XATTR; |
---|
| 974 | + break; |
---|
| 975 | + case Opt_release_agent: |
---|
| 976 | + /* Specifying two release agents is forbidden */ |
---|
| 977 | + if (ctx->release_agent) |
---|
| 978 | + return invalfc(fc, "release_agent respecified"); |
---|
| 979 | + /* |
---|
| 980 | + * Release agent gets called with all capabilities, |
---|
| 981 | + * require capabilities to set release agent. |
---|
| 982 | + */ |
---|
| 983 | + if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) |
---|
| 984 | + return invalfc(fc, "Setting release_agent not allowed"); |
---|
| 985 | + ctx->release_agent = param->string; |
---|
| 986 | + param->string = NULL; |
---|
| 987 | + break; |
---|
| 988 | + case Opt_name: |
---|
| 989 | + /* blocked by boot param? */ |
---|
| 990 | + if (cgroup_no_v1_named) |
---|
| 991 | + return -ENOENT; |
---|
| 992 | + /* Can't specify an empty name */ |
---|
| 993 | + if (!param->size) |
---|
| 994 | + return invalfc(fc, "Empty name"); |
---|
| 995 | + if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) |
---|
| 996 | + return invalfc(fc, "Name too long"); |
---|
| 997 | + /* Must match [\w.-]+ */ |
---|
| 998 | + for (i = 0; i < param->size; i++) { |
---|
| 999 | + char c = param->string[i]; |
---|
| 1000 | + if (isalnum(c)) |
---|
| 1001 | + continue; |
---|
| 1002 | + if ((c == '.') || (c == '-') || (c == '_')) |
---|
| 1003 | + continue; |
---|
| 1004 | + return invalfc(fc, "Invalid name"); |
---|
| 1005 | + } |
---|
| 1006 | + /* Specifying two names is forbidden */ |
---|
| 1007 | + if (ctx->name) |
---|
| 1008 | + return invalfc(fc, "name respecified"); |
---|
| 1009 | + ctx->name = param->string; |
---|
| 1010 | + param->string = NULL; |
---|
| 1011 | + break; |
---|
| 1012 | + } |
---|
| 1013 | + return 0; |
---|
| 1014 | +} |
---|
| 1015 | + |
---|
| 1016 | +static int check_cgroupfs_options(struct fs_context *fc) |
---|
| 1017 | +{ |
---|
| 1018 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
| 1019 | + u16 mask = U16_MAX; |
---|
| 1020 | + u16 enabled = 0; |
---|
| 1021 | + struct cgroup_subsys *ss; |
---|
914 | 1022 | int i; |
---|
915 | 1023 | |
---|
916 | 1024 | #ifdef CONFIG_CPUSETS |
---|
917 | 1025 | mask = ~((u16)1 << cpuset_cgrp_id); |
---|
918 | 1026 | #endif |
---|
| 1027 | + for_each_subsys(ss, i) |
---|
| 1028 | + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) |
---|
| 1029 | + enabled |= 1 << i; |
---|
919 | 1030 | |
---|
920 | | - memset(opts, 0, sizeof(*opts)); |
---|
921 | | - |
---|
922 | | - while ((token = strsep(&o, ",")) != NULL) { |
---|
923 | | - nr_opts++; |
---|
924 | | - |
---|
925 | | - if (!*token) |
---|
926 | | - return -EINVAL; |
---|
927 | | - if (!strcmp(token, "none")) { |
---|
928 | | - /* Explicitly have no subsystems */ |
---|
929 | | - opts->none = true; |
---|
930 | | - continue; |
---|
931 | | - } |
---|
932 | | - if (!strcmp(token, "all")) { |
---|
933 | | - /* Mutually exclusive option 'all' + subsystem name */ |
---|
934 | | - if (one_ss) |
---|
935 | | - return -EINVAL; |
---|
936 | | - all_ss = true; |
---|
937 | | - continue; |
---|
938 | | - } |
---|
939 | | - if (!strcmp(token, "noprefix")) { |
---|
940 | | - opts->flags |= CGRP_ROOT_NOPREFIX; |
---|
941 | | - continue; |
---|
942 | | - } |
---|
943 | | - if (!strcmp(token, "clone_children")) { |
---|
944 | | - opts->cpuset_clone_children = true; |
---|
945 | | - continue; |
---|
946 | | - } |
---|
947 | | - if (!strcmp(token, "cpuset_v2_mode")) { |
---|
948 | | - opts->flags |= CGRP_ROOT_CPUSET_V2_MODE; |
---|
949 | | - continue; |
---|
950 | | - } |
---|
951 | | - if (!strcmp(token, "xattr")) { |
---|
952 | | - opts->flags |= CGRP_ROOT_XATTR; |
---|
953 | | - continue; |
---|
954 | | - } |
---|
955 | | - if (!strncmp(token, "release_agent=", 14)) { |
---|
956 | | - /* Specifying two release agents is forbidden */ |
---|
957 | | - if (opts->release_agent) |
---|
958 | | - return -EINVAL; |
---|
959 | | - opts->release_agent = |
---|
960 | | - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); |
---|
961 | | - if (!opts->release_agent) |
---|
962 | | - return -ENOMEM; |
---|
963 | | - continue; |
---|
964 | | - } |
---|
965 | | - if (!strncmp(token, "name=", 5)) { |
---|
966 | | - const char *name = token + 5; |
---|
967 | | - |
---|
968 | | - /* blocked by boot param? */ |
---|
969 | | - if (cgroup_no_v1_named) |
---|
970 | | - return -ENOENT; |
---|
971 | | - /* Can't specify an empty name */ |
---|
972 | | - if (!strlen(name)) |
---|
973 | | - return -EINVAL; |
---|
974 | | - /* Must match [\w.-]+ */ |
---|
975 | | - for (i = 0; i < strlen(name); i++) { |
---|
976 | | - char c = name[i]; |
---|
977 | | - if (isalnum(c)) |
---|
978 | | - continue; |
---|
979 | | - if ((c == '.') || (c == '-') || (c == '_')) |
---|
980 | | - continue; |
---|
981 | | - return -EINVAL; |
---|
982 | | - } |
---|
983 | | - /* Specifying two names is forbidden */ |
---|
984 | | - if (opts->name) |
---|
985 | | - return -EINVAL; |
---|
986 | | - opts->name = kstrndup(name, |
---|
987 | | - MAX_CGROUP_ROOT_NAMELEN - 1, |
---|
988 | | - GFP_KERNEL); |
---|
989 | | - if (!opts->name) |
---|
990 | | - return -ENOMEM; |
---|
991 | | - |
---|
992 | | - continue; |
---|
993 | | - } |
---|
994 | | - |
---|
995 | | - for_each_subsys(ss, i) { |
---|
996 | | - if (strcmp(token, ss->legacy_name)) |
---|
997 | | - continue; |
---|
998 | | - if (!cgroup_ssid_enabled(i)) |
---|
999 | | - continue; |
---|
1000 | | - if (cgroup1_ssid_disabled(i)) |
---|
1001 | | - continue; |
---|
1002 | | - |
---|
1003 | | - /* Mutually exclusive option 'all' + subsystem name */ |
---|
1004 | | - if (all_ss) |
---|
1005 | | - return -EINVAL; |
---|
1006 | | - opts->subsys_mask |= (1 << i); |
---|
1007 | | - one_ss = true; |
---|
1008 | | - |
---|
1009 | | - break; |
---|
1010 | | - } |
---|
1011 | | - if (i == CGROUP_SUBSYS_COUNT) |
---|
1012 | | - return -ENOENT; |
---|
1013 | | - } |
---|
| 1031 | + ctx->subsys_mask &= enabled; |
---|
1014 | 1032 | |
---|
1015 | 1033 | /* |
---|
1016 | | - * If the 'all' option was specified select all the subsystems, |
---|
1017 | | - * otherwise if 'none', 'name=' and a subsystem name options were |
---|
1018 | | - * not specified, let's default to 'all' |
---|
| 1034 | + * In absence of 'none', 'name=' or subsystem name options, |
---|
| 1035 | + * let's default to 'all'. |
---|
1019 | 1036 | */ |
---|
1020 | | - if (all_ss || (!one_ss && !opts->none && !opts->name)) |
---|
1021 | | - for_each_subsys(ss, i) |
---|
1022 | | - if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) |
---|
1023 | | - opts->subsys_mask |= (1 << i); |
---|
| 1037 | + if (!ctx->subsys_mask && !ctx->none && !ctx->name) |
---|
| 1038 | + ctx->all_ss = true; |
---|
| 1039 | + |
---|
| 1040 | + if (ctx->all_ss) { |
---|
| 1041 | + /* Mutually exclusive option 'all' + subsystem name */ |
---|
| 1042 | + if (ctx->subsys_mask) |
---|
| 1043 | + return invalfc(fc, "subsys name conflicts with all"); |
---|
| 1044 | + /* 'all' => select all the subsystems */ |
---|
| 1045 | + ctx->subsys_mask = enabled; |
---|
| 1046 | + } |
---|
1024 | 1047 | |
---|
1025 | 1048 | /* |
---|
1026 | 1049 | * We either have to specify by name or by subsystems. (So all |
---|
1027 | 1050 | * empty hierarchies must have a name). |
---|
1028 | 1051 | */ |
---|
1029 | | - if (!opts->subsys_mask && !opts->name) |
---|
1030 | | - return -EINVAL; |
---|
| 1052 | + if (!ctx->subsys_mask && !ctx->name) |
---|
| 1053 | + return invalfc(fc, "Need name or subsystem set"); |
---|
1031 | 1054 | |
---|
1032 | 1055 | /* |
---|
1033 | 1056 | * Option noprefix was introduced just for backward compatibility |
---|
1034 | 1057 | * with the old cpuset, so we allow noprefix only if mounting just |
---|
1035 | 1058 | * the cpuset subsystem. |
---|
1036 | 1059 | */ |
---|
1037 | | - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) |
---|
1038 | | - return -EINVAL; |
---|
| 1060 | + if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) |
---|
| 1061 | + return invalfc(fc, "noprefix used incorrectly"); |
---|
1039 | 1062 | |
---|
1040 | 1063 | /* Can't specify "none" and some subsystems */ |
---|
1041 | | - if (opts->subsys_mask && opts->none) |
---|
1042 | | - return -EINVAL; |
---|
| 1064 | + if (ctx->subsys_mask && ctx->none) |
---|
| 1065 | + return invalfc(fc, "none used incorrectly"); |
---|
1043 | 1066 | |
---|
1044 | 1067 | return 0; |
---|
1045 | 1068 | } |
---|
1046 | 1069 | |
---|
1047 | | -static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) |
---|
| 1070 | +int cgroup1_reconfigure(struct fs_context *fc) |
---|
1048 | 1071 | { |
---|
1049 | | - int ret = 0; |
---|
| 1072 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
| 1073 | + struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); |
---|
1050 | 1074 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); |
---|
1051 | | - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; |
---|
1052 | | - struct cgroup_sb_opts opts; |
---|
| 1075 | + int ret = 0; |
---|
1053 | 1076 | u16 added_mask, removed_mask; |
---|
1054 | 1077 | |
---|
1055 | 1078 | cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); |
---|
1056 | 1079 | |
---|
1057 | 1080 | /* See what subsystems are wanted */ |
---|
1058 | | - ret = parse_cgroupfs_options(data, &opts); |
---|
| 1081 | + ret = check_cgroupfs_options(fc); |
---|
1059 | 1082 | if (ret) |
---|
1060 | 1083 | goto out_unlock; |
---|
1061 | 1084 | |
---|
1062 | | - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) |
---|
| 1085 | + if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent) |
---|
1063 | 1086 | pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", |
---|
1064 | 1087 | task_tgid_nr(current), current->comm); |
---|
1065 | | - /* See cgroup1_mount release_agent handling */ |
---|
1066 | | - if (opts.release_agent && |
---|
1067 | | - ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) { |
---|
1068 | | - ret = -EINVAL; |
---|
1069 | | - goto out_unlock; |
---|
1070 | | - } |
---|
1071 | 1088 | |
---|
1072 | | - added_mask = opts.subsys_mask & ~root->subsys_mask; |
---|
1073 | | - removed_mask = root->subsys_mask & ~opts.subsys_mask; |
---|
| 1089 | + added_mask = ctx->subsys_mask & ~root->subsys_mask; |
---|
| 1090 | + removed_mask = root->subsys_mask & ~ctx->subsys_mask; |
---|
1074 | 1091 | |
---|
1075 | 1092 | /* Don't allow flags or name to change at remount */ |
---|
1076 | | - if ((opts.flags ^ root->flags) || |
---|
1077 | | - (opts.name && strcmp(opts.name, root->name))) { |
---|
1078 | | - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", |
---|
1079 | | - opts.flags, opts.name ?: "", root->flags, root->name); |
---|
| 1093 | + if ((ctx->flags ^ root->flags) || |
---|
| 1094 | + (ctx->name && strcmp(ctx->name, root->name))) { |
---|
| 1095 | + errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", |
---|
| 1096 | + ctx->flags, ctx->name ?: "", root->flags, root->name); |
---|
1080 | 1097 | ret = -EINVAL; |
---|
1081 | 1098 | goto out_unlock; |
---|
1082 | 1099 | } |
---|
.. | .. |
---|
1093 | 1110 | |
---|
1094 | 1111 | WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); |
---|
1095 | 1112 | |
---|
1096 | | - if (opts.release_agent) { |
---|
| 1113 | + if (ctx->release_agent) { |
---|
1097 | 1114 | spin_lock(&release_agent_path_lock); |
---|
1098 | | - strcpy(root->release_agent_path, opts.release_agent); |
---|
| 1115 | + strcpy(root->release_agent_path, ctx->release_agent); |
---|
1099 | 1116 | spin_unlock(&release_agent_path_lock); |
---|
1100 | 1117 | } |
---|
1101 | 1118 | |
---|
1102 | 1119 | trace_cgroup_remount(root); |
---|
1103 | 1120 | |
---|
1104 | 1121 | out_unlock: |
---|
1105 | | - kfree(opts.release_agent); |
---|
1106 | | - kfree(opts.name); |
---|
1107 | 1122 | mutex_unlock(&cgroup_mutex); |
---|
1108 | 1123 | return ret; |
---|
1109 | 1124 | } |
---|
.. | .. |
---|
1111 | 1126 | struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { |
---|
1112 | 1127 | .rename = cgroup1_rename, |
---|
1113 | 1128 | .show_options = cgroup1_show_options, |
---|
1114 | | - .remount_fs = cgroup1_remount, |
---|
1115 | 1129 | .mkdir = cgroup_mkdir, |
---|
1116 | 1130 | .rmdir = cgroup_rmdir, |
---|
1117 | 1131 | .show_path = cgroup_show_path, |
---|
1118 | 1132 | }; |
---|
1119 | 1133 | |
---|
1120 | | -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, |
---|
1121 | | - void *data, unsigned long magic, |
---|
1122 | | - struct cgroup_namespace *ns) |
---|
| 1134 | +/* |
---|
| 1135 | + * The guts of cgroup1 mount - find or create cgroup_root to use. |
---|
| 1136 | + * Called with cgroup_mutex held; returns 0 on success, -E... on |
---|
| 1137 | + * error, and a positive value when the candidate is busy dying. |
---|
| 1138 | + * On success it stashes a reference to cgroup_root into given |
---|
| 1139 | + * cgroup_fs_context; that reference is *NOT* counting towards the |
---|
| 1140 | + * cgroup_root refcount. |
---|
| 1141 | + */ |
---|
| 1142 | +static int cgroup1_root_to_use(struct fs_context *fc) |
---|
1123 | 1143 | { |
---|
1124 | | - struct cgroup_sb_opts opts; |
---|
| 1144 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
1125 | 1145 | struct cgroup_root *root; |
---|
1126 | 1146 | struct cgroup_subsys *ss; |
---|
1127 | | - struct dentry *dentry; |
---|
1128 | 1147 | int i, ret; |
---|
1129 | 1148 | |
---|
1130 | | - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); |
---|
1131 | | - |
---|
1132 | 1149 | /* First find the desired set of subsystems */ |
---|
1133 | | - ret = parse_cgroupfs_options(data, &opts); |
---|
| 1150 | + ret = check_cgroupfs_options(fc); |
---|
1134 | 1151 | if (ret) |
---|
1135 | | - goto out_unlock; |
---|
| 1152 | + return ret; |
---|
1136 | 1153 | |
---|
1137 | 1154 | /* |
---|
1138 | 1155 | * Destruction of cgroup root is asynchronous, so subsystems may |
---|
.. | .. |
---|
1142 | 1159 | * starting. Testing ref liveliness is good enough. |
---|
1143 | 1160 | */ |
---|
1144 | 1161 | for_each_subsys(ss, i) { |
---|
1145 | | - if (!(opts.subsys_mask & (1 << i)) || |
---|
| 1162 | + if (!(ctx->subsys_mask & (1 << i)) || |
---|
1146 | 1163 | ss->root == &cgrp_dfl_root) |
---|
1147 | 1164 | continue; |
---|
1148 | 1165 | |
---|
1149 | | - if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { |
---|
1150 | | - mutex_unlock(&cgroup_mutex); |
---|
1151 | | - msleep(10); |
---|
1152 | | - ret = restart_syscall(); |
---|
1153 | | - goto out_free; |
---|
1154 | | - } |
---|
| 1166 | + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) |
---|
| 1167 | + return 1; /* restart */ |
---|
1155 | 1168 | cgroup_put(&ss->root->cgrp); |
---|
1156 | 1169 | } |
---|
1157 | 1170 | |
---|
.. | .. |
---|
1166 | 1179 | * name matches but subsys_mask doesn't, we should fail. |
---|
1167 | 1180 | * Remember whether name matched. |
---|
1168 | 1181 | */ |
---|
1169 | | - if (opts.name) { |
---|
1170 | | - if (strcmp(opts.name, root->name)) |
---|
| 1182 | + if (ctx->name) { |
---|
| 1183 | + if (strcmp(ctx->name, root->name)) |
---|
1171 | 1184 | continue; |
---|
1172 | 1185 | name_match = true; |
---|
1173 | 1186 | } |
---|
.. | .. |
---|
1176 | 1189 | * If we asked for subsystems (or explicitly for no |
---|
1177 | 1190 | * subsystems) then they must match. |
---|
1178 | 1191 | */ |
---|
1179 | | - if ((opts.subsys_mask || opts.none) && |
---|
1180 | | - (opts.subsys_mask != root->subsys_mask)) { |
---|
| 1192 | + if ((ctx->subsys_mask || ctx->none) && |
---|
| 1193 | + (ctx->subsys_mask != root->subsys_mask)) { |
---|
1181 | 1194 | if (!name_match) |
---|
1182 | 1195 | continue; |
---|
1183 | | - ret = -EBUSY; |
---|
1184 | | - goto out_unlock; |
---|
| 1196 | + return -EBUSY; |
---|
1185 | 1197 | } |
---|
1186 | 1198 | |
---|
1187 | | - if (root->flags ^ opts.flags) |
---|
| 1199 | + if (root->flags ^ ctx->flags) |
---|
1188 | 1200 | pr_warn("new mount options do not match the existing superblock, will be ignored\n"); |
---|
1189 | 1201 | |
---|
1190 | | - ret = 0; |
---|
1191 | | - goto out_unlock; |
---|
| 1202 | + ctx->root = root; |
---|
| 1203 | + return 0; |
---|
1192 | 1204 | } |
---|
1193 | 1205 | |
---|
1194 | 1206 | /* |
---|
.. | .. |
---|
1196 | 1208 | * specification is allowed for already existing hierarchies but we |
---|
1197 | 1209 | * can't create new one without subsys specification. |
---|
1198 | 1210 | */ |
---|
1199 | | - if (!opts.subsys_mask && !opts.none) { |
---|
1200 | | - ret = -EINVAL; |
---|
1201 | | - goto out_unlock; |
---|
1202 | | - } |
---|
| 1211 | + if (!ctx->subsys_mask && !ctx->none) |
---|
| 1212 | + return invalfc(fc, "No subsys list or none specified"); |
---|
1203 | 1213 | |
---|
1204 | 1214 | /* Hierarchies may only be created in the initial cgroup namespace. */ |
---|
1205 | | - if (ns != &init_cgroup_ns) { |
---|
1206 | | - ret = -EPERM; |
---|
1207 | | - goto out_unlock; |
---|
1208 | | - } |
---|
1209 | | - /* |
---|
1210 | | - * Release agent gets called with all capabilities, |
---|
1211 | | - * require capabilities to set release agent. |
---|
1212 | | - */ |
---|
1213 | | - if (opts.release_agent && |
---|
1214 | | - ((ns->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))) { |
---|
1215 | | - ret = -EINVAL; |
---|
1216 | | - goto out_unlock; |
---|
1217 | | - } |
---|
| 1215 | + if (ctx->ns != &init_cgroup_ns) |
---|
| 1216 | + return -EPERM; |
---|
1218 | 1217 | |
---|
1219 | 1218 | root = kzalloc(sizeof(*root), GFP_KERNEL); |
---|
1220 | | - if (!root) { |
---|
1221 | | - ret = -ENOMEM; |
---|
1222 | | - goto out_unlock; |
---|
1223 | | - } |
---|
| 1219 | + if (!root) |
---|
| 1220 | + return -ENOMEM; |
---|
1224 | 1221 | |
---|
1225 | | - init_cgroup_root(root, &opts); |
---|
| 1222 | + ctx->root = root; |
---|
| 1223 | + init_cgroup_root(ctx); |
---|
1226 | 1224 | |
---|
1227 | | - ret = cgroup_setup_root(root, opts.subsys_mask); |
---|
| 1225 | + ret = cgroup_setup_root(root, ctx->subsys_mask); |
---|
1228 | 1226 | if (ret) |
---|
1229 | 1227 | cgroup_free_root(root); |
---|
| 1228 | + return ret; |
---|
| 1229 | +} |
---|
1230 | 1230 | |
---|
1231 | | -out_unlock: |
---|
1232 | | - if (!ret && !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { |
---|
1233 | | - mutex_unlock(&cgroup_mutex); |
---|
1234 | | - msleep(10); |
---|
1235 | | - ret = restart_syscall(); |
---|
1236 | | - goto out_free; |
---|
1237 | | - } |
---|
| 1231 | +int cgroup1_get_tree(struct fs_context *fc) |
---|
| 1232 | +{ |
---|
| 1233 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
| 1234 | + int ret; |
---|
| 1235 | + |
---|
| 1236 | + /* Check if the caller has permission to mount. */ |
---|
| 1237 | + if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN)) |
---|
| 1238 | + return -EPERM; |
---|
| 1239 | + |
---|
| 1240 | + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); |
---|
| 1241 | + |
---|
| 1242 | + ret = cgroup1_root_to_use(fc); |
---|
| 1243 | + if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt)) |
---|
| 1244 | + ret = 1; /* restart */ |
---|
| 1245 | + |
---|
1238 | 1246 | mutex_unlock(&cgroup_mutex); |
---|
1239 | | -out_free: |
---|
1240 | | - kfree(opts.release_agent); |
---|
1241 | | - kfree(opts.name); |
---|
1242 | 1247 | |
---|
1243 | | - if (ret) |
---|
1244 | | - return ERR_PTR(ret); |
---|
| 1248 | + if (!ret) |
---|
| 1249 | + ret = cgroup_do_get_tree(fc); |
---|
1245 | 1250 | |
---|
1246 | | - dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, |
---|
1247 | | - CGROUP_SUPER_MAGIC, ns); |
---|
1248 | | - |
---|
1249 | | - if (!IS_ERR(dentry) && percpu_ref_is_dying(&root->cgrp.self.refcnt)) { |
---|
1250 | | - struct super_block *sb = dentry->d_sb; |
---|
1251 | | - dput(dentry); |
---|
1252 | | - deactivate_locked_super(sb); |
---|
1253 | | - msleep(10); |
---|
1254 | | - dentry = ERR_PTR(restart_syscall()); |
---|
| 1251 | + if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) { |
---|
| 1252 | + fc_drop_locked(fc); |
---|
| 1253 | + ret = 1; |
---|
1255 | 1254 | } |
---|
1256 | | - return dentry; |
---|
| 1255 | + |
---|
| 1256 | + if (unlikely(ret > 0)) { |
---|
| 1257 | + msleep(10); |
---|
| 1258 | + return restart_syscall(); |
---|
| 1259 | + } |
---|
| 1260 | + return ret; |
---|
1257 | 1261 | } |
---|
1258 | 1262 | |
---|
1259 | 1263 | static int __init cgroup1_wq_init(void) |
---|