.. | .. |
---|
54 | 54 | #include <linux/proc_ns.h> |
---|
55 | 55 | #include <linux/nsproxy.h> |
---|
56 | 56 | #include <linux/file.h> |
---|
| 57 | +#include <linux/fs_parser.h> |
---|
57 | 58 | #include <linux/sched/cputime.h> |
---|
58 | 59 | #include <linux/psi.h> |
---|
59 | 60 | #include <net/sock.h> |
---|
60 | 61 | |
---|
61 | 62 | #define CREATE_TRACE_POINTS |
---|
62 | 63 | #include <trace/events/cgroup.h> |
---|
| 64 | +#undef CREATE_TRACE_POINTS |
---|
| 65 | + |
---|
| 66 | +#include <trace/hooks/cgroup.h> |
---|
63 | 67 | |
---|
64 | 68 | #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ |
---|
65 | 69 | MAX_CFTYPE_NAME + 2) |
---|
.. | .. |
---|
86 | 90 | |
---|
87 | 91 | DEFINE_SPINLOCK(trace_cgroup_path_lock); |
---|
88 | 92 | char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; |
---|
| 93 | +bool cgroup_debug __read_mostly; |
---|
89 | 94 | |
---|
90 | 95 | /* |
---|
91 | 96 | * Protects cgroup_idr and css_idr so that IDs can be released without |
---|
.. | .. |
---|
99 | 104 | */ |
---|
100 | 105 | static DEFINE_SPINLOCK(cgroup_file_kn_lock); |
---|
101 | 106 | |
---|
102 | | -struct percpu_rw_semaphore cgroup_threadgroup_rwsem; |
---|
| 107 | +DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); |
---|
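The hunk above replaces the runtime-initialized `struct percpu_rw_semaphore` with the static `DEFINE_PERCPU_RWSEM()` initializer, making the lock usable before any explicit init call runs. For orientation, a minimal sketch of the read/write pattern this lock serves (`example_rwsem`, `reader_path` and `writer_path` are illustrative names, not kernel symbols):

```c
#include <linux/percpu-rwsem.h>

/* Statically initialized, as cgroup_threadgroup_rwsem is after this hunk. */
static DEFINE_PERCPU_RWSEM(example_rwsem);

static void reader_path(void)
{
	percpu_down_read(&example_rwsem);	/* per-CPU fast path, very cheap */
	/* read-side section, e.g. fork() linking a new task */
	percpu_up_read(&example_rwsem);
}

static void writer_path(void)
{
	percpu_down_write(&example_rwsem);	/* slow: drains all readers */
	/* write-side section, e.g. migrating a whole threadgroup */
	percpu_up_write(&example_rwsem);
}
```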
103 | 108 | |
---|
104 | 109 | #define cgroup_assert_mutex_or_rcu_locked() \ |
---|
105 | 110 | RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ |
---|
.. | .. |
---|
151 | 156 | |
---|
152 | 157 | static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); |
---|
153 | 158 | |
---|
154 | | -/* |
---|
155 | | - * The default hierarchy, reserved for the subsystems that are otherwise |
---|
156 | | - * unattached - it never has more than a single cgroup, and all tasks are |
---|
157 | | - * part of that cgroup. |
---|
158 | | - */ |
---|
| 159 | +/* the default hierarchy */ |
---|
159 | 160 | struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; |
---|
160 | 161 | EXPORT_SYMBOL_GPL(cgrp_dfl_root); |
---|
161 | 162 | |
---|
.. | .. |
---|
264 | 265 | * can be used to test whether a cgroup is on the default hierarchy for |
---|
265 | 266 | * cases where a subsystem should behave differently depending on the |
---|
266 | 267 | * interface version. |
---|
267 | | - * |
---|
268 | | - * The set of behaviors which change on the default hierarchy are still |
---|
269 | | - * being determined and the mount option is prefixed with __DEVEL__. |
---|
270 | 268 | * |
---|
271 | 269 | * List of changed behaviors: |
---|
272 | 270 | * |
---|
.. | .. |
---|
502 | 500 | |
---|
503 | 501 | rcu_read_lock(); |
---|
504 | 502 | css = cgroup_css(cgrp, ss); |
---|
505 | | - if (!css || !css_tryget_online(css)) |
---|
| 503 | + if (css && !css_tryget_online(css)) |
---|
506 | 504 | css = NULL; |
---|
507 | 505 | rcu_read_unlock(); |
---|
508 | 506 | |
---|
.. | .. |
---|
510 | 508 | } |
---|
511 | 509 | |
---|
512 | 510 | /** |
---|
513 | | - * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem |
---|
| 511 | + * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss |
---|
514 | 512 | * @cgrp: the cgroup of interest |
---|
515 | 513 | * @ss: the subsystem of interest (%NULL returns @cgrp->self) |
---|
516 | 514 | * |
---|
.. | .. |
---|
519 | 517 | * enabled. If @ss is associated with the hierarchy @cgrp is on, this |
---|
520 | 518 | * function is guaranteed to return non-NULL css. |
---|
521 | 519 | */ |
---|
522 | | -static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, |
---|
523 | | - struct cgroup_subsys *ss) |
---|
| 520 | +static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, |
---|
| 521 | + struct cgroup_subsys *ss) |
---|
524 | 522 | { |
---|
525 | 523 | lockdep_assert_held(&cgroup_mutex); |
---|
526 | 524 | |
---|
.. | .. |
---|
538 | 536 | } |
---|
539 | 537 | |
---|
540 | 538 | return cgroup_css(cgrp, ss); |
---|
| 539 | +} |
---|
| 540 | + |
---|
| 541 | +/** |
---|
| 542 | + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem |
---|
| 543 | + * @cgrp: the cgroup of interest |
---|
| 544 | + * @ss: the subsystem of interest |
---|
| 545 | + * |
---|
| 546 | + * Find and get the effective css of @cgrp for @ss. The effective css is |
---|
| 547 | + * defined as the matching css of the nearest ancestor including self which |
---|
| 548 | + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, |
---|
| 549 | + * the root css is returned, so this function always returns a valid css. |
---|
| 550 | + * |
---|
| 551 | + * The returned css is not guaranteed to be online, and therefore it is the |
---|
| 552 | + * caller's responsibility to tryget a reference for it. |
---|
| 553 | + */ |
---|
| 554 | +struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, |
---|
| 555 | + struct cgroup_subsys *ss) |
---|
| 556 | +{ |
---|
| 557 | + struct cgroup_subsys_state *css; |
---|
| 558 | + |
---|
| 559 | + do { |
---|
| 560 | + css = cgroup_css(cgrp, ss); |
---|
| 561 | + |
---|
| 562 | + if (css) |
---|
| 563 | + return css; |
---|
| 564 | + cgrp = cgroup_parent(cgrp); |
---|
| 565 | + } while (cgrp); |
---|
| 566 | + |
---|
| 567 | + return init_css_set.subsys[ss->id]; |
---|
541 | 568 | } |
---|
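Per the new kerneldoc, `cgroup_e_css()` always returns a valid css but not necessarily an online one, so pinning it falls to the caller. A hedged sketch of the intended calling pattern (`my_use_css` is a made-up helper, not a kernel function):

```c
static void my_use_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_e_css(cgrp, ss);	/* never NULL, but may be offline */
	if (!css_tryget_online(css))
		css = NULL;		/* raced with offlining; bail out */
	rcu_read_unlock();

	if (css) {
		/* ... safe to dereference while the reference is held ... */
		css_put(css);
	}
}
```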
542 | 569 | |
---|
543 | 570 | /** |
---|
.. | .. |
---|
655 | 682 | * |
---|
656 | 683 | * Should be called under cgroup_[tree_]mutex. |
---|
657 | 684 | */ |
---|
658 | | -#define for_each_e_css(css, ssid, cgrp) \ |
---|
659 | | - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ |
---|
660 | | - if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ |
---|
661 | | - ; \ |
---|
| 685 | +#define for_each_e_css(css, ssid, cgrp) \ |
---|
| 686 | + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ |
---|
| 687 | + if (!((css) = cgroup_e_css_by_mask(cgrp, \ |
---|
| 688 | + cgroup_subsys[(ssid)]))) \ |
---|
| 689 | + ; \ |
---|
662 | 690 | else |
---|
663 | 691 | |
---|
664 | 692 | /** |
---|
.. | .. |
---|
718 | 746 | * reference-counted, to improve performance when child cgroups |
---|
719 | 747 | * haven't been created. |
---|
720 | 748 | */ |
---|
721 | | -struct css_set init_css_set = { |
---|
722 | | - .refcount = REFCOUNT_INIT(1), |
---|
723 | | - .dom_cset = &init_css_set, |
---|
724 | | - .tasks = LIST_HEAD_INIT(init_css_set.tasks), |
---|
725 | | - .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), |
---|
726 | | - .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), |
---|
727 | | - .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), |
---|
728 | | - .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), |
---|
729 | | - .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), |
---|
730 | | - .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), |
---|
731 | | - .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), |
---|
732 | | - |
---|
733 | | - /* |
---|
734 | | - * The following field is re-initialized when this cset gets linked |
---|
735 | | - * in cgroup_init(). However, let's initialize the field |
---|
736 | | - * statically too so that the default cgroup can be accessed safely |
---|
737 | | - * early during boot. |
---|
738 | | - */ |
---|
739 | | - .dfl_cgrp = &cgrp_dfl_root.cgrp, |
---|
| 749 | +struct ext_css_set init_ext_css_set = { |
---|
| 750 | + .cset = { |
---|
| 751 | + .refcount = REFCOUNT_INIT(1), |
---|
| 752 | + .dom_cset = &init_css_set, |
---|
| 753 | + .tasks = LIST_HEAD_INIT(init_css_set.tasks), |
---|
| 754 | + .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), |
---|
| 755 | + .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), |
---|
| 756 | + .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), |
---|
| 757 | + .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), |
---|
| 758 | + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), |
---|
| 759 | + .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), |
---|
| 760 | + .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), |
---|
| 761 | + /* |
---|
| 762 | + * The following field is re-initialized when this cset gets linked |
---|
| 763 | + * in cgroup_init(). However, let's initialize the field |
---|
| 764 | + * statically too so that the default cgroup can be accessed safely |
---|
| 765 | + * early during boot. |
---|
| 766 | + */ |
---|
| 767 | + .dfl_cgrp = &cgrp_dfl_root.cgrp, |
---|
| 768 | + }, |
---|
| 769 | + .mg_src_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_src_preload_node), |
---|
| 770 | + .mg_dst_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_dst_preload_node), |
---|
740 | 771 | }; |
---|
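This hunk shows the vendor-extension pattern: `struct css_set` is embedded as the first member of `struct ext_css_set`, so extra list heads can be added without touching the core structure, and any existing `struct css_set *` can be mapped back to its wrapper. A schematic of the recovery step the later hunks rely on (`ext_css_set_of` is an illustrative name; the diff open-codes `container_of()` instead):

```c
#include <linux/kernel.h>

/*
 * container_of() works for any member offset; putting "cset" first
 * merely keeps the offset zero so plain pointer casts would also work.
 */
static inline struct ext_css_set *ext_css_set_of(struct css_set *cset)
{
	return container_of(cset, struct ext_css_set, cset);
}
```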
741 | 772 | |
---|
742 | 773 | static int css_set_count = 1; /* 1 for init_css_set */ |
---|
.. | .. |
---|
802 | 833 | break; |
---|
803 | 834 | |
---|
804 | 835 | cgroup1_check_for_release(cgrp); |
---|
| 836 | + TRACE_CGROUP_PATH(notify_populated, cgrp, |
---|
| 837 | + cgroup_is_populated(cgrp)); |
---|
805 | 838 | cgroup_file_notify(&cgrp->events_file); |
---|
806 | 839 | |
---|
807 | 840 | child = cgrp; |
---|
.. | .. |
---|
881 | 914 | /* |
---|
882 | 915 | * We are synchronized through cgroup_threadgroup_rwsem |
---|
883 | 916 | * against PF_EXITING setting such that we can't race |
---|
884 | | - * against cgroup_exit() changing the css_set to |
---|
885 | | - * init_css_set and dropping the old one. |
---|
| 917 | + * against cgroup_exit()/cgroup_free() dropping the css_set. |
---|
886 | 918 | */ |
---|
887 | 919 | WARN_ON_ONCE(task->flags & PF_EXITING); |
---|
888 | 920 | |
---|
.. | .. |
---|
1060 | 1092 | * @ss is in this hierarchy, so we want the |
---|
1061 | 1093 | * effective css from @cgrp. |
---|
1062 | 1094 | */ |
---|
1063 | | - template[i] = cgroup_e_css(cgrp, ss); |
---|
| 1095 | + template[i] = cgroup_e_css_by_mask(cgrp, ss); |
---|
1064 | 1096 | } else { |
---|
1065 | 1097 | /* |
---|
1066 | 1098 | * @ss is not in this hierarchy, so we don't want |
---|
.. | .. |
---|
1162 | 1194 | struct cgroup *cgrp) |
---|
1163 | 1195 | { |
---|
1164 | 1196 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; |
---|
| 1197 | + struct ext_css_set *ext_cset; |
---|
1165 | 1198 | struct css_set *cset; |
---|
1166 | 1199 | struct list_head tmp_links; |
---|
1167 | 1200 | struct cgrp_cset_link *link; |
---|
.. | .. |
---|
1182 | 1215 | if (cset) |
---|
1183 | 1216 | return cset; |
---|
1184 | 1217 | |
---|
1185 | | - cset = kzalloc(sizeof(*cset), GFP_KERNEL); |
---|
1186 | | - if (!cset) |
---|
| 1218 | + ext_cset = kzalloc(sizeof(*ext_cset), GFP_KERNEL); |
---|
| 1219 | + if (!ext_cset) |
---|
1187 | 1220 | return NULL; |
---|
| 1221 | + cset = &ext_cset->cset; |
---|
1188 | 1222 | |
---|
1189 | 1223 | /* Allocate all the cgrp_cset_link objects that we'll need */ |
---|
1190 | 1224 | if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { |
---|
.. | .. |
---|
1202 | 1236 | INIT_HLIST_NODE(&cset->hlist); |
---|
1203 | 1237 | INIT_LIST_HEAD(&cset->cgrp_links); |
---|
1204 | 1238 | INIT_LIST_HEAD(&cset->mg_preload_node); |
---|
| 1239 | + INIT_LIST_HEAD(&ext_cset->mg_src_preload_node); |
---|
| 1240 | + INIT_LIST_HEAD(&ext_cset->mg_dst_preload_node); |
---|
1205 | 1241 | INIT_LIST_HEAD(&cset->mg_node); |
---|
1206 | 1242 | |
---|
1207 | 1243 | /* Copy the set of subsystem state objects generated in |
---|
.. | .. |
---|
1291 | 1327 | |
---|
1292 | 1328 | void cgroup_free_root(struct cgroup_root *root) |
---|
1293 | 1329 | { |
---|
1294 | | - if (root) { |
---|
1295 | | - idr_destroy(&root->cgroup_idr); |
---|
1296 | | - kfree(root); |
---|
1297 | | - } |
---|
| 1330 | + kfree(root); |
---|
1298 | 1331 | } |
---|
1299 | 1332 | |
---|
1300 | 1333 | static void cgroup_destroy_root(struct cgroup_root *root) |
---|
.. | .. |
---|
1356 | 1389 | cset = current->nsproxy->cgroup_ns->root_cset; |
---|
1357 | 1390 | if (cset == &init_css_set) { |
---|
1358 | 1391 | res = &root->cgrp; |
---|
| 1392 | + } else if (root == &cgrp_dfl_root) { |
---|
| 1393 | + res = cset->dfl_cgrp; |
---|
1359 | 1394 | } else { |
---|
1360 | 1395 | struct cgrp_cset_link *link; |
---|
1361 | 1396 | |
---|
.. | .. |
---|
1412 | 1447 | struct cgroup_root *root) |
---|
1413 | 1448 | { |
---|
1414 | 1449 | /* |
---|
1415 | | - * No need to lock the task - since we hold cgroup_mutex the |
---|
1416 | | - * task can't change groups, so the only thing that can happen |
---|
1417 | | - * is that it exits and its css is set back to init_css_set. |
---|
| 1450 | + * No need to lock the task - since we hold css_set_lock the |
---|
| 1451 | + * task can't change groups. |
---|
1418 | 1452 | */ |
---|
1419 | 1453 | return cset_cgroup_from_root(task_css_set(task), root); |
---|
1420 | 1454 | } |
---|
.. | .. |
---|
1453 | 1487 | struct cgroup_subsys *ss = cft->ss; |
---|
1454 | 1488 | |
---|
1455 | 1489 | if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && |
---|
1456 | | - !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) |
---|
1457 | | - snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", |
---|
1458 | | - cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, |
---|
| 1490 | + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { |
---|
| 1491 | + const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : ""; |
---|
| 1492 | + |
---|
| 1493 | + snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", |
---|
| 1494 | + dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, |
---|
1459 | 1495 | cft->name); |
---|
1460 | | - else |
---|
| 1496 | + } else { |
---|
1461 | 1497 | strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); |
---|
| 1498 | + } |
---|
1462 | 1499 | return buf; |
---|
1463 | 1500 | } |
---|
1464 | 1501 | |
---|
.. | .. |
---|
1815 | 1852 | return len; |
---|
1816 | 1853 | } |
---|
1817 | 1854 | |
---|
1818 | | -static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) |
---|
| 1855 | +enum cgroup2_param { |
---|
| 1856 | + Opt_nsdelegate, |
---|
| 1857 | + Opt_memory_localevents, |
---|
| 1858 | + Opt_memory_recursiveprot, |
---|
| 1859 | + nr__cgroup2_params |
---|
| 1860 | +}; |
---|
| 1861 | + |
---|
| 1862 | +static const struct fs_parameter_spec cgroup2_fs_parameters[] = { |
---|
| 1863 | + fsparam_flag("nsdelegate", Opt_nsdelegate), |
---|
| 1864 | + fsparam_flag("memory_localevents", Opt_memory_localevents), |
---|
| 1865 | + fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), |
---|
| 1866 | + {} |
---|
| 1867 | +}; |
---|
| 1868 | + |
---|
| 1869 | +static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param) |
---|
1819 | 1870 | { |
---|
1820 | | - char *token; |
---|
| 1871 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
| 1872 | + struct fs_parse_result result; |
---|
| 1873 | + int opt; |
---|
1821 | 1874 | |
---|
1822 | | - *root_flags = 0; |
---|
| 1875 | + opt = fs_parse(fc, cgroup2_fs_parameters, param, &result); |
---|
| 1876 | + if (opt < 0) |
---|
| 1877 | + return opt; |
---|
1823 | 1878 | |
---|
1824 | | - if (!data || *data == '\0') |
---|
| 1879 | + switch (opt) { |
---|
| 1880 | + case Opt_nsdelegate: |
---|
| 1881 | + ctx->flags |= CGRP_ROOT_NS_DELEGATE; |
---|
1825 | 1882 | return 0; |
---|
1826 | | - |
---|
1827 | | - while ((token = strsep(&data, ",")) != NULL) { |
---|
1828 | | - if (!strcmp(token, "nsdelegate")) { |
---|
1829 | | - *root_flags |= CGRP_ROOT_NS_DELEGATE; |
---|
1830 | | - continue; |
---|
1831 | | - } |
---|
1832 | | - |
---|
1833 | | - pr_err("cgroup2: unknown option \"%s\"\n", token); |
---|
1834 | | - return -EINVAL; |
---|
| 1883 | + case Opt_memory_localevents: |
---|
| 1884 | + ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; |
---|
| 1885 | + return 0; |
---|
| 1886 | + case Opt_memory_recursiveprot: |
---|
| 1887 | + ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; |
---|
| 1888 | + return 0; |
---|
1835 | 1889 | } |
---|
1836 | | - |
---|
1837 | | - return 0; |
---|
| 1890 | + return -EINVAL; |
---|
1838 | 1891 | } |
---|
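`cgroup2_parse_param()` is invoked once per option by the new mount API, where each key arrives as a discrete `fs_parameter` rather than a comma-separated string to be `strsep()`'d. A hedged userspace sketch of what drives it (the fsopen/fsconfig/fsmount/move_mount syscalls exist since Linux 5.2; error handling is abbreviated, the mount point is an example, and CAP_SYS_ADMIN is required):

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/mount.h>	/* FSCONFIG_*, MOVE_MOUNT_F_EMPTY_PATH */
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int fsfd, mntfd;

	fsfd = syscall(SYS_fsopen, "cgroup2", 0);
	/* each flag reaches cgroup2_parse_param() as one fs_parameter */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "nsdelegate", NULL, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "memory_recursiveprot",
		NULL, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	mntfd = syscall(SYS_fsmount, fsfd, 0, 0);
	syscall(SYS_move_mount, mntfd, "", AT_FDCWD, "/sys/fs/cgroup",
		MOVE_MOUNT_F_EMPTY_PATH);
	close(mntfd);
	close(fsfd);
	return 0;
}
```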
1839 | 1892 | |
---|
1840 | 1893 | static void apply_cgroup_root_flags(unsigned int root_flags) |
---|
.. | .. |
---|
1844 | 1897 | cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; |
---|
1845 | 1898 | else |
---|
1846 | 1899 | cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; |
---|
| 1900 | + |
---|
| 1901 | + if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) |
---|
| 1902 | + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; |
---|
| 1903 | + else |
---|
| 1904 | + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; |
---|
| 1905 | + |
---|
| 1906 | + if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) |
---|
| 1907 | + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; |
---|
| 1908 | + else |
---|
| 1909 | + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; |
---|
1847 | 1910 | } |
---|
1848 | 1911 | } |
---|
1849 | 1912 | |
---|
.. | .. |
---|
1851 | 1914 | { |
---|
1852 | 1915 | if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) |
---|
1853 | 1916 | seq_puts(seq, ",nsdelegate"); |
---|
| 1917 | + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) |
---|
| 1918 | + seq_puts(seq, ",memory_localevents"); |
---|
| 1919 | + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) |
---|
| 1920 | + seq_puts(seq, ",memory_recursiveprot"); |
---|
1854 | 1921 | return 0; |
---|
1855 | 1922 | } |
---|
1856 | 1923 | |
---|
1857 | | -static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) |
---|
| 1924 | +static int cgroup_reconfigure(struct fs_context *fc) |
---|
1858 | 1925 | { |
---|
1859 | | - unsigned int root_flags; |
---|
1860 | | - int ret; |
---|
| 1926 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
1861 | 1927 | |
---|
1862 | | - ret = parse_cgroup_root_flags(data, &root_flags); |
---|
1863 | | - if (ret) |
---|
1864 | | - return ret; |
---|
1865 | | - |
---|
1866 | | - apply_cgroup_root_flags(root_flags); |
---|
| 1928 | + apply_cgroup_root_flags(ctx->flags); |
---|
1867 | 1929 | return 0; |
---|
1868 | | -} |
---|
1869 | | - |
---|
1870 | | -/* |
---|
1871 | | - * To reduce the fork() overhead for systems that are not actually using |
---|
1872 | | - * their cgroups capability, we don't maintain the lists running through |
---|
1873 | | - * each css_set to its tasks until we see the list actually used - in other |
---|
1874 | | - * words after the first mount. |
---|
1875 | | - */ |
---|
1876 | | -static bool use_task_css_set_links __read_mostly; |
---|
1877 | | - |
---|
1878 | | -static void cgroup_enable_task_cg_lists(void) |
---|
1879 | | -{ |
---|
1880 | | - struct task_struct *p, *g; |
---|
1881 | | - |
---|
1882 | | - /* |
---|
1883 | | - * We need tasklist_lock because RCU is not safe against |
---|
1884 | | - * while_each_thread(). Besides, a forking task that has passed |
---|
1885 | | - * cgroup_post_fork() without seeing use_task_css_set_links = 1 |
---|
1886 | | - * is not guaranteed to have its child immediately visible in the |
---|
1887 | | - * tasklist if we walk through it with RCU. |
---|
1888 | | - */ |
---|
1889 | | - read_lock(&tasklist_lock); |
---|
1890 | | - spin_lock_irq(&css_set_lock); |
---|
1891 | | - |
---|
1892 | | - if (use_task_css_set_links) |
---|
1893 | | - goto out_unlock; |
---|
1894 | | - |
---|
1895 | | - use_task_css_set_links = true; |
---|
1896 | | - |
---|
1897 | | - do_each_thread(g, p) { |
---|
1898 | | - WARN_ON_ONCE(!list_empty(&p->cg_list) || |
---|
1899 | | - task_css_set(p) != &init_css_set); |
---|
1900 | | - |
---|
1901 | | - /* |
---|
1902 | | - * We should check if the process is exiting, otherwise |
---|
1903 | | - * it will race with cgroup_exit() in that the list |
---|
1904 | | - * entry won't be deleted though the process has exited. |
---|
1905 | | - * Do it while holding siglock so that we don't end up |
---|
1906 | | - * racing against cgroup_exit(). |
---|
1907 | | - * |
---|
1908 | | - * Interrupts were already disabled while acquiring |
---|
1909 | | - * the css_set_lock, so we do not need to disable it |
---|
1910 | | - * again when acquiring the sighand->siglock here. |
---|
1911 | | - */ |
---|
1912 | | - spin_lock(&p->sighand->siglock); |
---|
1913 | | - if (!(p->flags & PF_EXITING)) { |
---|
1914 | | - struct css_set *cset = task_css_set(p); |
---|
1915 | | - |
---|
1916 | | - if (!css_set_populated(cset)) |
---|
1917 | | - css_set_update_populated(cset, true); |
---|
1918 | | - list_add_tail(&p->cg_list, &cset->tasks); |
---|
1919 | | - get_css_set(cset); |
---|
1920 | | - cset->nr_tasks++; |
---|
1921 | | - } |
---|
1922 | | - spin_unlock(&p->sighand->siglock); |
---|
1923 | | - } while_each_thread(g, p); |
---|
1924 | | -out_unlock: |
---|
1925 | | - spin_unlock_irq(&css_set_lock); |
---|
1926 | | - read_unlock(&tasklist_lock); |
---|
1927 | 1930 | } |
---|
1928 | 1931 | |
---|
1929 | 1932 | static void init_cgroup_housekeeping(struct cgroup *cgrp) |
---|
.. | .. |
---|
1951 | 1954 | INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); |
---|
1952 | 1955 | } |
---|
1953 | 1956 | |
---|
1954 | | -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) |
---|
| 1957 | +void init_cgroup_root(struct cgroup_fs_context *ctx) |
---|
1955 | 1958 | { |
---|
| 1959 | + struct cgroup_root *root = ctx->root; |
---|
1956 | 1960 | struct cgroup *cgrp = &root->cgrp; |
---|
1957 | 1961 | |
---|
1958 | 1962 | INIT_LIST_HEAD(&root->root_list); |
---|
1959 | 1963 | atomic_set(&root->nr_cgrps, 1); |
---|
1960 | 1964 | cgrp->root = root; |
---|
1961 | 1965 | init_cgroup_housekeeping(cgrp); |
---|
1962 | | - idr_init(&root->cgroup_idr); |
---|
1963 | 1966 | |
---|
1964 | | - root->flags = opts->flags; |
---|
1965 | | - if (opts->release_agent) |
---|
1966 | | - strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); |
---|
1967 | | - if (opts->name) |
---|
1968 | | - strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); |
---|
1969 | | - if (opts->cpuset_clone_children) |
---|
| 1967 | + root->flags = ctx->flags; |
---|
| 1968 | + if (ctx->release_agent) |
---|
| 1969 | + strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); |
---|
| 1970 | + if (ctx->name) |
---|
| 1971 | + strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); |
---|
| 1972 | + if (ctx->cpuset_clone_children) |
---|
1970 | 1973 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); |
---|
1971 | 1974 | } |
---|
1972 | 1975 | |
---|
.. | .. |
---|
1979 | 1982 | int i, ret; |
---|
1980 | 1983 | |
---|
1981 | 1984 | lockdep_assert_held(&cgroup_mutex); |
---|
1982 | | - |
---|
1983 | | - ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); |
---|
1984 | | - if (ret < 0) |
---|
1985 | | - goto out; |
---|
1986 | | - root_cgrp->id = ret; |
---|
1987 | | - root_cgrp->ancestor_ids[0] = ret; |
---|
1988 | 1985 | |
---|
1989 | 1986 | ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, |
---|
1990 | 1987 | 0, GFP_KERNEL); |
---|
.. | .. |
---|
2011 | 2008 | |
---|
2012 | 2009 | root->kf_root = kernfs_create_root(kf_sops, |
---|
2013 | 2010 | KERNFS_ROOT_CREATE_DEACTIVATED | |
---|
2014 | | - KERNFS_ROOT_SUPPORT_EXPORTOP, |
---|
| 2011 | + KERNFS_ROOT_SUPPORT_EXPORTOP | |
---|
| 2012 | + KERNFS_ROOT_SUPPORT_USER_XATTR, |
---|
2015 | 2013 | root_cgrp); |
---|
2016 | 2014 | if (IS_ERR(root->kf_root)) { |
---|
2017 | 2015 | ret = PTR_ERR(root->kf_root); |
---|
2018 | 2016 | goto exit_root_id; |
---|
2019 | 2017 | } |
---|
2020 | 2018 | root_cgrp->kn = root->kf_root->kn; |
---|
| 2019 | + WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); |
---|
| 2020 | + root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); |
---|
2021 | 2021 | |
---|
2022 | 2022 | ret = css_populate_dir(&root_cgrp->self); |
---|
2023 | 2023 | if (ret) |
---|
.. | .. |
---|
2055 | 2055 | BUG_ON(!list_empty(&root_cgrp->self.children)); |
---|
2056 | 2056 | BUG_ON(atomic_read(&root->nr_cgrps) != 1); |
---|
2057 | 2057 | |
---|
2058 | | - kernfs_activate(root_cgrp->kn); |
---|
2059 | 2058 | ret = 0; |
---|
2060 | 2059 | goto out; |
---|
2061 | 2060 | |
---|
.. | .. |
---|
2071 | 2070 | return ret; |
---|
2072 | 2071 | } |
---|
2073 | 2072 | |
---|
2074 | | -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, |
---|
2075 | | - struct cgroup_root *root, unsigned long magic, |
---|
2076 | | - struct cgroup_namespace *ns) |
---|
| 2073 | +int cgroup_do_get_tree(struct fs_context *fc) |
---|
2077 | 2074 | { |
---|
2078 | | - struct dentry *dentry; |
---|
2079 | | - bool new_sb = false; |
---|
| 2075 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
| 2076 | + int ret; |
---|
2080 | 2077 | |
---|
2081 | | - dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); |
---|
| 2078 | + ctx->kfc.root = ctx->root->kf_root; |
---|
| 2079 | + if (fc->fs_type == &cgroup2_fs_type) |
---|
| 2080 | + ctx->kfc.magic = CGROUP2_SUPER_MAGIC; |
---|
| 2081 | + else |
---|
| 2082 | + ctx->kfc.magic = CGROUP_SUPER_MAGIC; |
---|
| 2083 | + ret = kernfs_get_tree(fc); |
---|
2082 | 2084 | |
---|
2083 | 2085 | /* |
---|
2084 | 2086 | * In non-init cgroup namespace, instead of root cgroup's dentry, |
---|
2085 | 2087 | * we return the dentry corresponding to the cgroupns->root_cgrp. |
---|
2086 | 2088 | */ |
---|
2087 | | - if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { |
---|
| 2089 | + if (!ret && ctx->ns != &init_cgroup_ns) { |
---|
2088 | 2090 | struct dentry *nsdentry; |
---|
2089 | | - struct super_block *sb = dentry->d_sb; |
---|
| 2091 | + struct super_block *sb = fc->root->d_sb; |
---|
2090 | 2092 | struct cgroup *cgrp; |
---|
2091 | 2093 | |
---|
2092 | 2094 | mutex_lock(&cgroup_mutex); |
---|
2093 | 2095 | spin_lock_irq(&css_set_lock); |
---|
2094 | 2096 | |
---|
2095 | | - cgrp = cset_cgroup_from_root(ns->root_cset, root); |
---|
| 2097 | + cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root); |
---|
2096 | 2098 | |
---|
2097 | 2099 | spin_unlock_irq(&css_set_lock); |
---|
2098 | 2100 | mutex_unlock(&cgroup_mutex); |
---|
2099 | 2101 | |
---|
2100 | 2102 | nsdentry = kernfs_node_dentry(cgrp->kn, sb); |
---|
2101 | | - dput(dentry); |
---|
2102 | | - if (IS_ERR(nsdentry)) |
---|
| 2103 | + dput(fc->root); |
---|
| 2104 | + if (IS_ERR(nsdentry)) { |
---|
2103 | 2105 | deactivate_locked_super(sb); |
---|
2104 | | - dentry = nsdentry; |
---|
| 2106 | + ret = PTR_ERR(nsdentry); |
---|
| 2107 | + nsdentry = NULL; |
---|
| 2108 | + } |
---|
| 2109 | + fc->root = nsdentry; |
---|
2105 | 2110 | } |
---|
2106 | 2111 | |
---|
2107 | | - if (!new_sb) |
---|
2108 | | - cgroup_put(&root->cgrp); |
---|
| 2112 | + if (!ctx->kfc.new_sb_created) |
---|
| 2113 | + cgroup_put(&ctx->root->cgrp); |
---|
2109 | 2114 | |
---|
2110 | | - return dentry; |
---|
| 2115 | + return ret; |
---|
2111 | 2116 | } |
---|
2112 | 2117 | |
---|
2113 | | -static struct dentry *cgroup_mount(struct file_system_type *fs_type, |
---|
2114 | | - int flags, const char *unused_dev_name, |
---|
2115 | | - void *data) |
---|
| 2118 | +/* |
---|
| 2119 | + * Destroy a cgroup filesystem context. |
---|
| 2120 | + */ |
---|
| 2121 | +static void cgroup_fs_context_free(struct fs_context *fc) |
---|
2116 | 2122 | { |
---|
2117 | | - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; |
---|
2118 | | - struct dentry *dentry; |
---|
| 2123 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
| 2124 | + |
---|
| 2125 | + kfree(ctx->name); |
---|
| 2126 | + kfree(ctx->release_agent); |
---|
| 2127 | + put_cgroup_ns(ctx->ns); |
---|
| 2128 | + kernfs_free_fs_context(fc); |
---|
| 2129 | + kfree(ctx); |
---|
| 2130 | +} |
---|
| 2131 | + |
---|
| 2132 | +static int cgroup_get_tree(struct fs_context *fc) |
---|
| 2133 | +{ |
---|
| 2134 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
---|
2119 | 2135 | int ret; |
---|
2120 | 2136 | |
---|
2121 | | - get_cgroup_ns(ns); |
---|
| 2137 | + cgrp_dfl_visible = true; |
---|
| 2138 | + cgroup_get_live(&cgrp_dfl_root.cgrp); |
---|
| 2139 | + ctx->root = &cgrp_dfl_root; |
---|
2122 | 2140 | |
---|
2123 | | - /* Check if the caller has permission to mount. */ |
---|
2124 | | - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { |
---|
2125 | | - put_cgroup_ns(ns); |
---|
2126 | | - return ERR_PTR(-EPERM); |
---|
2127 | | - } |
---|
| 2141 | + ret = cgroup_do_get_tree(fc); |
---|
| 2142 | + if (!ret) |
---|
| 2143 | + apply_cgroup_root_flags(ctx->flags); |
---|
| 2144 | + return ret; |
---|
| 2145 | +} |
---|
2128 | 2146 | |
---|
2129 | | - /* |
---|
2130 | | - * The first time anyone tries to mount a cgroup, enable the list |
---|
2131 | | - * linking each css_set to its tasks and fix up all existing tasks. |
---|
2132 | | - */ |
---|
2133 | | - if (!use_task_css_set_links) |
---|
2134 | | - cgroup_enable_task_cg_lists(); |
---|
| 2147 | +static const struct fs_context_operations cgroup_fs_context_ops = { |
---|
| 2148 | + .free = cgroup_fs_context_free, |
---|
| 2149 | + .parse_param = cgroup2_parse_param, |
---|
| 2150 | + .get_tree = cgroup_get_tree, |
---|
| 2151 | + .reconfigure = cgroup_reconfigure, |
---|
| 2152 | +}; |
---|
2135 | 2153 | |
---|
2136 | | - if (fs_type == &cgroup2_fs_type) { |
---|
2137 | | - unsigned int root_flags; |
---|
| 2154 | +static const struct fs_context_operations cgroup1_fs_context_ops = { |
---|
| 2155 | + .free = cgroup_fs_context_free, |
---|
| 2156 | + .parse_param = cgroup1_parse_param, |
---|
| 2157 | + .get_tree = cgroup1_get_tree, |
---|
| 2158 | + .reconfigure = cgroup1_reconfigure, |
---|
| 2159 | +}; |
---|
2138 | 2160 | |
---|
2139 | | - ret = parse_cgroup_root_flags(data, &root_flags); |
---|
2140 | | - if (ret) { |
---|
2141 | | - put_cgroup_ns(ns); |
---|
2142 | | - return ERR_PTR(ret); |
---|
2143 | | - } |
---|
| 2161 | +/* |
---|
| 2162 | + * Initialise the cgroup filesystem creation/reconfiguration context. Notably, |
---|
| 2163 | + * we select the namespace we're going to use. |
---|
| 2164 | + */ |
---|
| 2165 | +static int cgroup_init_fs_context(struct fs_context *fc) |
---|
| 2166 | +{ |
---|
| 2167 | + struct cgroup_fs_context *ctx; |
---|
2144 | 2168 | |
---|
2145 | | - cgrp_dfl_visible = true; |
---|
2146 | | - cgroup_get_live(&cgrp_dfl_root.cgrp); |
---|
| 2169 | + ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL); |
---|
| 2170 | + if (!ctx) |
---|
| 2171 | + return -ENOMEM; |
---|
2147 | 2172 | |
---|
2148 | | - dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, |
---|
2149 | | - CGROUP2_SUPER_MAGIC, ns); |
---|
2150 | | - if (!IS_ERR(dentry)) |
---|
2151 | | - apply_cgroup_root_flags(root_flags); |
---|
2152 | | - } else { |
---|
2153 | | - dentry = cgroup1_mount(&cgroup_fs_type, flags, data, |
---|
2154 | | - CGROUP_SUPER_MAGIC, ns); |
---|
2155 | | - } |
---|
2156 | | - |
---|
2157 | | - put_cgroup_ns(ns); |
---|
2158 | | - return dentry; |
---|
| 2173 | + ctx->ns = current->nsproxy->cgroup_ns; |
---|
| 2174 | + get_cgroup_ns(ctx->ns); |
---|
| 2175 | + fc->fs_private = &ctx->kfc; |
---|
| 2176 | + if (fc->fs_type == &cgroup2_fs_type) |
---|
| 2177 | + fc->ops = &cgroup_fs_context_ops; |
---|
| 2178 | + else |
---|
| 2179 | + fc->ops = &cgroup1_fs_context_ops; |
---|
| 2180 | + put_user_ns(fc->user_ns); |
---|
| 2181 | + fc->user_ns = get_user_ns(ctx->ns->user_ns); |
---|
| 2182 | + fc->global = true; |
---|
| 2183 | + return 0; |
---|
2159 | 2184 | } |
---|
2160 | 2185 | |
---|
2161 | 2186 | static void cgroup_kill_sb(struct super_block *sb) |
---|
.. | .. |
---|
2171 | 2196 | * And don't kill the default root. |
---|
2172 | 2197 | */ |
---|
2173 | 2198 | if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && |
---|
2174 | | - !percpu_ref_is_dying(&root->cgrp.self.refcnt)) |
---|
| 2199 | + !percpu_ref_is_dying(&root->cgrp.self.refcnt)) { |
---|
| 2200 | + cgroup_bpf_offline(&root->cgrp); |
---|
2175 | 2201 | percpu_ref_kill(&root->cgrp.self.refcnt); |
---|
| 2202 | + } |
---|
2176 | 2203 | cgroup_put(&root->cgrp); |
---|
2177 | 2204 | kernfs_kill_sb(sb); |
---|
2178 | 2205 | } |
---|
2179 | 2206 | |
---|
2180 | 2207 | struct file_system_type cgroup_fs_type = { |
---|
2181 | | - .name = "cgroup", |
---|
2182 | | - .mount = cgroup_mount, |
---|
2183 | | - .kill_sb = cgroup_kill_sb, |
---|
2184 | | - .fs_flags = FS_USERNS_MOUNT, |
---|
| 2208 | + .name = "cgroup", |
---|
| 2209 | + .init_fs_context = cgroup_init_fs_context, |
---|
| 2210 | + .parameters = cgroup1_fs_parameters, |
---|
| 2211 | + .kill_sb = cgroup_kill_sb, |
---|
| 2212 | + .fs_flags = FS_USERNS_MOUNT, |
---|
2185 | 2213 | }; |
---|
2186 | 2214 | |
---|
2187 | 2215 | static struct file_system_type cgroup2_fs_type = { |
---|
2188 | | - .name = "cgroup2", |
---|
2189 | | - .mount = cgroup_mount, |
---|
2190 | | - .kill_sb = cgroup_kill_sb, |
---|
2191 | | - .fs_flags = FS_USERNS_MOUNT, |
---|
| 2216 | + .name = "cgroup2", |
---|
| 2217 | + .init_fs_context = cgroup_init_fs_context, |
---|
| 2218 | + .parameters = cgroup2_fs_parameters, |
---|
| 2219 | + .kill_sb = cgroup_kill_sb, |
---|
| 2220 | + .fs_flags = FS_USERNS_MOUNT, |
---|
2192 | 2221 | }; |
---|
| 2222 | + |
---|
| 2223 | +#ifdef CONFIG_CPUSETS |
---|
| 2224 | +static const struct fs_context_operations cpuset_fs_context_ops = { |
---|
| 2225 | + .get_tree = cgroup1_get_tree, |
---|
| 2226 | + .free = cgroup_fs_context_free, |
---|
| 2227 | +}; |
---|
| 2228 | + |
---|
| 2229 | +/* |
---|
| 2230 | + * This is ugly, but preserves the userspace API for existing cpuset |
---|
| 2231 | + * users. If someone tries to mount the "cpuset" filesystem, we |
---|
| 2232 | + * silently switch it to mount "cgroup" instead. |
---|
| 2233 | + */ |
---|
| 2234 | +static int cpuset_init_fs_context(struct fs_context *fc) |
---|
| 2235 | +{ |
---|
| 2236 | + char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER); |
---|
| 2237 | + struct cgroup_fs_context *ctx; |
---|
| 2238 | + int err; |
---|
| 2239 | + |
---|
| 2240 | + err = cgroup_init_fs_context(fc); |
---|
| 2241 | + if (err) { |
---|
| 2242 | + kfree(agent); |
---|
| 2243 | + return err; |
---|
| 2244 | + } |
---|
| 2245 | + |
---|
| 2246 | + fc->ops = &cpuset_fs_context_ops; |
---|
| 2247 | + |
---|
| 2248 | + ctx = cgroup_fc2context(fc); |
---|
| 2249 | + ctx->subsys_mask = 1 << cpuset_cgrp_id; |
---|
| 2250 | + ctx->flags |= CGRP_ROOT_NOPREFIX; |
---|
| 2251 | + ctx->release_agent = agent; |
---|
| 2252 | + |
---|
| 2253 | + get_filesystem(&cgroup_fs_type); |
---|
| 2254 | + put_filesystem(fc->fs_type); |
---|
| 2255 | + fc->fs_type = &cgroup_fs_type; |
---|
| 2256 | + |
---|
| 2257 | + return 0; |
---|
| 2258 | +} |
---|
| 2259 | + |
---|
| 2260 | +static struct file_system_type cpuset_fs_type = { |
---|
| 2261 | + .name = "cpuset", |
---|
| 2262 | + .init_fs_context = cpuset_init_fs_context, |
---|
| 2263 | + .fs_flags = FS_USERNS_MOUNT, |
---|
| 2264 | +}; |
---|
| 2265 | +#endif |
---|
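Seen from userspace, `cpuset_init_fs_context()` means a legacy `cpuset` mount transparently becomes a cgroup v1 mount restricted to the cpuset controller, with `noprefix` and the cpuset release agent preconfigured. A minimal sketch (the target path is an example):

```c
#include <sys/mount.h>

/* equivalent to: mount -t cpuset none /dev/cpuset */
static int mount_legacy_cpuset(void)
{
	return mount("none", "/dev/cpuset", "cpuset", 0, NULL);
}
```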
2193 | 2266 | |
---|
2194 | 2267 | int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, |
---|
2195 | 2268 | struct cgroup_namespace *ns) |
---|
.. | .. |
---|
2256 | 2329 | EXPORT_SYMBOL_GPL(task_cgroup_path); |
---|
2257 | 2330 | |
---|
2258 | 2331 | /** |
---|
| 2332 | + * cgroup_attach_lock - Lock for ->attach() |
---|
| 2333 | + * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem |
---|
| 2334 | + * |
---|
| 2335 | + * cgroup migration sometimes needs to stabilize threadgroups against forks and |
---|
| 2336 | + * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() |
---|
| 2337 | + * implementations (e.g. cpuset) also need to disable CPU hotplug. |
---|
| 2338 | + * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can |
---|
| 2339 | + * lead to deadlocks. |
---|
| 2340 | + * |
---|
| 2341 | + * Bringing up a CPU may involve creating and destroying tasks which requires |
---|
| 2342 | + * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside |
---|
| 2343 | + * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while |
---|
| 2344 | + * write-locking threadgroup_rwsem, the locking order is reversed and we end up |
---|
| 2345 | + * waiting for an on-going CPU hotplug operation which in turn is waiting for |
---|
| 2346 | + * the threadgroup_rwsem to be released to create new tasks. For more details: |
---|
| 2347 | + * |
---|
| 2348 | + * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu |
---|
| 2349 | + * |
---|
| 2350 | + * Resolve the situation by always acquiring cpus_read_lock() before optionally |
---|
| 2351 | + * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that |
---|
| 2352 | + * CPU hotplug is disabled on entry. |
---|
| 2353 | + */ |
---|
| 2354 | +static void cgroup_attach_lock(bool lock_threadgroup) |
---|
| 2355 | +{ |
---|
| 2356 | + cpus_read_lock(); |
---|
| 2357 | + if (lock_threadgroup) |
---|
| 2358 | + percpu_down_write(&cgroup_threadgroup_rwsem); |
---|
| 2359 | +} |
---|
| 2360 | + |
---|
| 2361 | +/** |
---|
| 2362 | + * cgroup_attach_unlock - Undo cgroup_attach_lock() |
---|
| 2363 | + * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem |
---|
| 2364 | + */ |
---|
| 2365 | +static void cgroup_attach_unlock(bool lock_threadgroup) |
---|
| 2366 | +{ |
---|
| 2367 | + if (lock_threadgroup) |
---|
| 2368 | + percpu_up_write(&cgroup_threadgroup_rwsem); |
---|
| 2369 | + cpus_read_unlock(); |
---|
| 2370 | +} |
---|
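The ABBA scenario the comment block describes, spelled out as an interleaving; this is a sketch of the failure mode being avoided, not code from the kernel:

```c
/*
 * CPU hotplug (cpu_up)              cgroup migration (old ordering)
 * --------------------              -------------------------------
 * cpus_write_lock()
 *                                   percpu_down_write(&cgroup_threadgroup_rwsem)
 * create hotplug kthread:
 *   percpu_down_read(&...rwsem)     ->attach() calls cpus_read_lock()
 *     blocks on the writer   <--->    blocks on the hotplug writer
 *
 * Neither side can make progress. cgroup_attach_lock() breaks the cycle
 * by always taking cpus_read_lock() first, outside the threadgroup rwsem.
 */
```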
| 2371 | + |
---|
| 2372 | +/** |
---|
2259 | 2373 | * cgroup_migrate_add_task - add a migration target task to a migration context |
---|
2260 | 2374 | * @task: target task |
---|
2261 | 2375 | * @mgctx: target migration context |
---|
.. | .. |
---|
2276 | 2390 | if (task->flags & PF_EXITING) |
---|
2277 | 2391 | return; |
---|
2278 | 2392 | |
---|
2279 | | - /* leave @task alone if post_fork() hasn't linked it yet */ |
---|
2280 | | - if (list_empty(&task->cg_list)) |
---|
2281 | | - return; |
---|
| 2393 | + /* cgroup_threadgroup_rwsem protects racing against forks */ |
---|
| 2394 | + WARN_ON_ONCE(list_empty(&task->cg_list)); |
---|
2282 | 2395 | |
---|
2283 | 2396 | cset = task_css_set(task); |
---|
2284 | 2397 | if (!cset->mg_src_cgrp) |
---|
.. | .. |
---|
2310 | 2423 | |
---|
2311 | 2424 | return cgroup_taskset_next(tset, dst_cssp); |
---|
2312 | 2425 | } |
---|
| 2426 | +EXPORT_SYMBOL_GPL(cgroup_taskset_first); |
---|
2313 | 2427 | |
---|
2314 | 2428 | /** |
---|
2315 | 2429 | * cgroup_taskset_next - iterate to the next task in taskset |
---|
.. | .. |
---|
2356 | 2470 | |
---|
2357 | 2471 | return NULL; |
---|
2358 | 2472 | } |
---|
| 2473 | +EXPORT_SYMBOL_GPL(cgroup_taskset_next); |
---|
2359 | 2474 | |
---|
2360 | 2475 | /** |
---|
2361 | 2476 | * cgroup_taskset_migrate - migrate a taskset |
---|
.. | .. |
---|
2426 | 2541 | do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { |
---|
2427 | 2542 | if (ss->attach) { |
---|
2428 | 2543 | tset->ssid = ssid; |
---|
| 2544 | + trace_android_vh_cgroup_attach(ss, tset); |
---|
2429 | 2545 | ss->attach(tset); |
---|
2430 | 2546 | } |
---|
2431 | 2547 | } while_each_subsys_mask(); |
---|
.. | .. |
---|
2510 | 2626 | */ |
---|
2511 | 2627 | void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) |
---|
2512 | 2628 | { |
---|
2513 | | - LIST_HEAD(preloaded); |
---|
2514 | | - struct css_set *cset, *tmp_cset; |
---|
| 2629 | + struct ext_css_set *cset, *tmp_cset; |
---|
2515 | 2630 | |
---|
2516 | 2631 | lockdep_assert_held(&cgroup_mutex); |
---|
2517 | 2632 | |
---|
2518 | 2633 | spin_lock_irq(&css_set_lock); |
---|
2519 | 2634 | |
---|
2520 | | - list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded); |
---|
2521 | | - list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded); |
---|
| 2635 | + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, |
---|
| 2636 | + mg_src_preload_node) { |
---|
| 2637 | + cset->cset.mg_src_cgrp = NULL; |
---|
| 2638 | + cset->cset.mg_dst_cgrp = NULL; |
---|
| 2639 | + cset->cset.mg_dst_cset = NULL; |
---|
| 2640 | + list_del_init(&cset->mg_src_preload_node); |
---|
| 2641 | + put_css_set_locked(&cset->cset); |
---|
| 2642 | + } |
---|
2522 | 2643 | |
---|
2523 | | - list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) { |
---|
2524 | | - cset->mg_src_cgrp = NULL; |
---|
2525 | | - cset->mg_dst_cgrp = NULL; |
---|
2526 | | - cset->mg_dst_cset = NULL; |
---|
2527 | | - list_del_init(&cset->mg_preload_node); |
---|
2528 | | - put_css_set_locked(cset); |
---|
| 2644 | + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, |
---|
| 2645 | + mg_dst_preload_node) { |
---|
| 2646 | + cset->cset.mg_src_cgrp = NULL; |
---|
| 2647 | + cset->cset.mg_dst_cgrp = NULL; |
---|
| 2648 | + cset->cset.mg_dst_cset = NULL; |
---|
| 2649 | + list_del_init(&cset->mg_dst_preload_node); |
---|
| 2650 | + put_css_set_locked(&cset->cset); |
---|
2529 | 2651 | } |
---|
2530 | 2652 | |
---|
2531 | 2653 | spin_unlock_irq(&css_set_lock); |
---|
.. | .. |
---|
2552 | 2674 | struct cgroup_mgctx *mgctx) |
---|
2553 | 2675 | { |
---|
2554 | 2676 | struct cgroup *src_cgrp; |
---|
| 2677 | + struct ext_css_set *ext_src_cset; |
---|
2555 | 2678 | |
---|
2556 | 2679 | lockdep_assert_held(&cgroup_mutex); |
---|
2557 | 2680 | lockdep_assert_held(&css_set_lock); |
---|
.. | .. |
---|
2565 | 2688 | return; |
---|
2566 | 2689 | |
---|
2567 | 2690 | src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); |
---|
| 2691 | + ext_src_cset = container_of(src_cset, struct ext_css_set, cset); |
---|
2568 | 2692 | |
---|
2569 | | - if (!list_empty(&src_cset->mg_preload_node)) |
---|
| 2693 | + if (!list_empty(&ext_src_cset->mg_src_preload_node)) |
---|
2570 | 2694 | return; |
---|
2571 | 2695 | |
---|
2572 | 2696 | WARN_ON(src_cset->mg_src_cgrp); |
---|
.. | .. |
---|
2577 | 2701 | src_cset->mg_src_cgrp = src_cgrp; |
---|
2578 | 2702 | src_cset->mg_dst_cgrp = dst_cgrp; |
---|
2579 | 2703 | get_css_set(src_cset); |
---|
2580 | | - list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets); |
---|
| 2704 | + list_add_tail(&ext_src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); |
---|
2581 | 2705 | } |
---|
2582 | 2706 | |
---|
2583 | 2707 | /** |
---|
.. | .. |
---|
2596 | 2720 | */ |
---|
2597 | 2721 | int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) |
---|
2598 | 2722 | { |
---|
2599 | | - struct css_set *src_cset, *tmp_cset; |
---|
| 2723 | + struct ext_css_set *ext_src_set, *tmp_cset; |
---|
2600 | 2724 | |
---|
2601 | 2725 | lockdep_assert_held(&cgroup_mutex); |
---|
2602 | 2726 | |
---|
2603 | 2727 | /* look up the dst cset for each src cset and link it to src */ |
---|
2604 | | - list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, |
---|
2605 | | - mg_preload_node) { |
---|
| 2728 | + list_for_each_entry_safe(ext_src_set, tmp_cset, &mgctx->preloaded_src_csets, |
---|
| 2729 | + mg_src_preload_node) { |
---|
| 2730 | + struct css_set *src_cset = &ext_src_set->cset; |
---|
2606 | 2731 | struct css_set *dst_cset; |
---|
| 2732 | + struct ext_css_set *ext_dst_cset; |
---|
2607 | 2733 | struct cgroup_subsys *ss; |
---|
2608 | 2734 | int ssid; |
---|
2609 | 2735 | |
---|
2610 | 2736 | dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); |
---|
2611 | 2737 | if (!dst_cset) |
---|
2612 | 2738 | return -ENOMEM; |
---|
| 2739 | + ext_dst_cset = container_of(dst_cset, struct ext_css_set, cset); |
---|
2613 | 2740 | |
---|
2614 | 2741 | WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); |
---|
2615 | 2742 | |
---|
.. | .. |
---|
2621 | 2748 | if (src_cset == dst_cset) { |
---|
2622 | 2749 | src_cset->mg_src_cgrp = NULL; |
---|
2623 | 2750 | src_cset->mg_dst_cgrp = NULL; |
---|
2624 | | - list_del_init(&src_cset->mg_preload_node); |
---|
| 2751 | + list_del_init(&ext_src_set->mg_src_preload_node); |
---|
2625 | 2752 | put_css_set(src_cset); |
---|
2626 | 2753 | put_css_set(dst_cset); |
---|
2627 | 2754 | continue; |
---|
.. | .. |
---|
2629 | 2756 | |
---|
2630 | 2757 | src_cset->mg_dst_cset = dst_cset; |
---|
2631 | 2758 | |
---|
2632 | | - if (list_empty(&dst_cset->mg_preload_node)) |
---|
2633 | | - list_add_tail(&dst_cset->mg_preload_node, |
---|
| 2759 | + if (list_empty(&ext_dst_cset->mg_dst_preload_node)) |
---|
| 2760 | + list_add_tail(&ext_dst_cset->mg_dst_preload_node, |
---|
2634 | 2761 | &mgctx->preloaded_dst_csets); |
---|
2635 | 2762 | else |
---|
2636 | 2763 | put_css_set(dst_cset); |
---|
.. | .. |
---|
2698 | 2825 | { |
---|
2699 | 2826 | DEFINE_CGROUP_MGCTX(mgctx); |
---|
2700 | 2827 | struct task_struct *task; |
---|
2701 | | - int ret; |
---|
2702 | | - |
---|
2703 | | - ret = cgroup_migrate_vet_dst(dst_cgrp); |
---|
2704 | | - if (ret) |
---|
2705 | | - return ret; |
---|
| 2828 | + int ret = 0; |
---|
2706 | 2829 | |
---|
2707 | 2830 | /* look up all src csets */ |
---|
2708 | 2831 | spin_lock_irq(&css_set_lock); |
---|
.. | .. |
---|
2729 | 2852 | return ret; |
---|
2730 | 2853 | } |
---|
2731 | 2854 | |
---|
2732 | | -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) |
---|
2733 | | - __acquires(&cgroup_threadgroup_rwsem) |
---|
| 2855 | +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, |
---|
| 2856 | + bool *threadgroup_locked, |
---|
| 2857 | + struct cgroup *dst_cgrp) |
---|
2734 | 2858 | { |
---|
2735 | 2859 | struct task_struct *tsk; |
---|
2736 | 2860 | pid_t pid; |
---|
| 2861 | + bool force_migration = false; |
---|
2737 | 2862 | |
---|
2738 | 2863 | if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) |
---|
2739 | 2864 | return ERR_PTR(-EINVAL); |
---|
2740 | 2865 | |
---|
2741 | | - percpu_down_write(&cgroup_threadgroup_rwsem); |
---|
| 2866 | + /* |
---|
| 2867 | + * If we migrate a single thread, we don't care about threadgroup |
---|
| 2868 | + * stability. If the thread is `current`, it won't exit(2) under our |
---|
| 2869 | + * hands or change PID through exec(2). We exclude |
---|
| 2870 | + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write |
---|
| 2871 | + * callers by cgroup_mutex. |
---|
| 2872 | + * Therefore, we can skip the global lock. |
---|
| 2873 | + */ |
---|
| 2874 | + lockdep_assert_held(&cgroup_mutex); |
---|
| 2875 | + *threadgroup_locked = pid || threadgroup; |
---|
| 2876 | + cgroup_attach_lock(*threadgroup_locked); |
---|
2742 | 2877 | |
---|
2743 | 2878 | rcu_read_lock(); |
---|
2744 | 2879 | if (pid) { |
---|
.. | .. |
---|
2754 | 2889 | if (threadgroup) |
---|
2755 | 2890 | tsk = tsk->group_leader; |
---|
2756 | 2891 | |
---|
| 2892 | + if (tsk->flags & PF_KTHREAD) |
---|
| 2893 | + trace_android_rvh_cgroup_force_kthread_migration(tsk, dst_cgrp, &force_migration); |
---|
| 2894 | + |
---|
2757 | 2895 | /* |
---|
2758 | 2896 | * kthreads may acquire PF_NO_SETAFFINITY during initialization. |
---|
2759 | 2897 | * If userland migrates such a kthread to a non-root cgroup, it can |
---|
2760 | 2898 | * become trapped in a cpuset, or RT kthread may be born in a |
---|
2761 | 2899 | * cgroup with no rt_runtime allocated. Just say no. |
---|
2762 | 2900 | */ |
---|
2763 | | - if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { |
---|
| 2901 | + if (!force_migration && (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY))) { |
---|
2764 | 2902 | tsk = ERR_PTR(-EINVAL); |
---|
2765 | 2903 | goto out_unlock_threadgroup; |
---|
2766 | 2904 | } |
---|
.. | .. |
---|
2769 | 2907 | goto out_unlock_rcu; |
---|
2770 | 2908 | |
---|
2771 | 2909 | out_unlock_threadgroup: |
---|
2772 | | - percpu_up_write(&cgroup_threadgroup_rwsem); |
---|
| 2910 | + cgroup_attach_unlock(*threadgroup_locked); |
---|
| 2911 | + *threadgroup_locked = false; |
---|
2773 | 2912 | out_unlock_rcu: |
---|
2774 | 2913 | rcu_read_unlock(); |
---|
2775 | 2914 | return tsk; |
---|
2776 | 2915 | } |
---|
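The fast path this change carves out: when a thread writes "0" to `cgroup.threads` to migrate itself, `pid` is 0 and `threadgroup` is false, so `*threadgroup_locked` stays false and the global percpu rwsem is never write-locked. A hedged userspace sketch (`move_self` and the cgroup path are illustrative):

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int move_self(const char *cgrp_dir)
{
	char path[256];
	int fd, ok;

	snprintf(path, sizeof(path), "%s/cgroup.threads", cgrp_dir);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ok = write(fd, "0", 1) == 1;	/* "0" means the calling thread */
	close(fd);
	return ok ? 0 : -1;
}
```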
2777 | 2916 | |
---|
2778 | | -void cgroup_procs_write_finish(struct task_struct *task) |
---|
2779 | | - __releases(&cgroup_threadgroup_rwsem) |
---|
| 2917 | +void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) |
---|
2780 | 2918 | { |
---|
2781 | 2919 | struct cgroup_subsys *ss; |
---|
2782 | 2920 | int ssid; |
---|
.. | .. |
---|
2784 | 2922 | /* release reference from cgroup_procs_write_start() */ |
---|
2785 | 2923 | put_task_struct(task); |
---|
2786 | 2924 | |
---|
2787 | | - percpu_up_write(&cgroup_threadgroup_rwsem); |
---|
| 2925 | + cgroup_attach_unlock(threadgroup_locked); |
---|
| 2926 | + |
---|
2788 | 2927 | for_each_subsys(ss, ssid) |
---|
2789 | 2928 | if (ss->post_attach) |
---|
2790 | 2929 | ss->post_attach(); |
---|
.. | .. |
---|
2799 | 2938 | do_each_subsys_mask(ss, ssid, ss_mask) { |
---|
2800 | 2939 | if (printed) |
---|
2801 | 2940 | seq_putc(seq, ' '); |
---|
2802 | | - seq_printf(seq, "%s", ss->name); |
---|
| 2941 | + seq_puts(seq, ss->name); |
---|
2803 | 2942 | printed = true; |
---|
2804 | 2943 | } while_each_subsys_mask(); |
---|
2805 | 2944 | if (printed) |
---|
.. | .. |
---|
2838 | 2977 | DEFINE_CGROUP_MGCTX(mgctx); |
---|
2839 | 2978 | struct cgroup_subsys_state *d_css; |
---|
2840 | 2979 | struct cgroup *dsct; |
---|
2841 | | - struct css_set *src_cset; |
---|
| 2980 | + struct ext_css_set *ext_src_set; |
---|
| 2981 | + bool has_tasks; |
---|
2842 | 2982 | int ret; |
---|
2843 | 2983 | |
---|
2844 | 2984 | lockdep_assert_held(&cgroup_mutex); |
---|
2845 | | - |
---|
2846 | | - percpu_down_write(&cgroup_threadgroup_rwsem); |
---|
2847 | 2985 | |
---|
2848 | 2986 | /* look up all csses currently attached to @cgrp's subtree */ |
---|
2849 | 2987 | spin_lock_irq(&css_set_lock); |
---|
.. | .. |
---|
2855 | 2993 | } |
---|
2856 | 2994 | spin_unlock_irq(&css_set_lock); |
---|
2857 | 2995 | |
---|
| 2996 | + /* |
---|
| 2997 | + * We need to write-lock threadgroup_rwsem while migrating tasks. |
---|
| 2998 | + * However, if there are no source csets for @cgrp, changing its |
---|
| 2999 | + * controllers isn't gonna produce any task migrations and the |
---|
| 3000 | + * write-locking can be skipped safely. |
---|
| 3001 | + */ |
---|
| 3002 | + has_tasks = !list_empty(&mgctx.preloaded_src_csets); |
---|
| 3003 | + cgroup_attach_lock(has_tasks); |
---|
| 3004 | + |
---|
2858 | 3005 | /* NULL dst indicates self on default hierarchy */ |
---|
2859 | 3006 | ret = cgroup_migrate_prepare_dst(&mgctx); |
---|
2860 | 3007 | if (ret) |
---|
2861 | 3008 | goto out_finish; |
---|
2862 | 3009 | |
---|
2863 | 3010 | spin_lock_irq(&css_set_lock); |
---|
2864 | | - list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { |
---|
| 3011 | + list_for_each_entry(ext_src_set, &mgctx.preloaded_src_csets, |
---|
| 3012 | + mg_src_preload_node) { |
---|
2865 | 3013 | struct task_struct *task, *ntask; |
---|
2866 | 3014 | |
---|
2867 | 3015 | /* all tasks in src_csets need to be migrated */ |
---|
2868 | | - list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) |
---|
| 3016 | + list_for_each_entry_safe(task, ntask, &ext_src_set->cset.tasks, cg_list) |
---|
2869 | 3017 | cgroup_migrate_add_task(task, &mgctx); |
---|
2870 | 3018 | } |
---|
2871 | 3019 | spin_unlock_irq(&css_set_lock); |
---|
.. | .. |
---|
2873 | 3021 | ret = cgroup_migrate_execute(&mgctx); |
---|
2874 | 3022 | out_finish: |
---|
2875 | 3023 | cgroup_migrate_finish(&mgctx); |
---|
2876 | | - percpu_up_write(&cgroup_threadgroup_rwsem); |
---|
| 3024 | + cgroup_attach_unlock(has_tasks); |
---|
2877 | 3025 | return ret; |
---|
2878 | 3026 | } |
---|
2879 | 3027 | |
---|
.. | .. |
---|
3106 | 3254 | return ret; |
---|
3107 | 3255 | |
---|
3108 | 3256 | /* |
---|
3109 | | - * At this point, cgroup_e_css() results reflect the new csses |
---|
| 3257 | + * At this point, cgroup_e_css_by_mask() results reflect the new csses |
---|
3110 | 3258 | * making the following cgroup_update_dfl_csses() properly update |
---|
3111 | 3259 | * css associations of all tasks in the subtree. |
---|
3112 | 3260 | */ |
---|
.. | .. |
---|
3506 | 3654 | #ifdef CONFIG_PSI |
---|
3507 | 3655 | static int cgroup_io_pressure_show(struct seq_file *seq, void *v) |
---|
3508 | 3656 | { |
---|
3509 | | - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); |
---|
| 3657 | + struct cgroup *cgrp = seq_css(seq)->cgroup; |
---|
| 3658 | + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; |
---|
| 3659 | + |
---|
| 3660 | + return psi_show(seq, psi, PSI_IO); |
---|
3510 | 3661 | } |
---|
3511 | 3662 | static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) |
---|
3512 | 3663 | { |
---|
3513 | | - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); |
---|
| 3664 | + struct cgroup *cgrp = seq_css(seq)->cgroup; |
---|
| 3665 | + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; |
---|
| 3666 | + |
---|
| 3667 | + return psi_show(seq, psi, PSI_MEM); |
---|
3514 | 3668 | } |
---|
3515 | 3669 | static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) |
---|
3516 | 3670 | { |
---|
3517 | | - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); |
---|
| 3671 | + struct cgroup *cgrp = seq_css(seq)->cgroup; |
---|
| 3672 | + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; |
---|
| 3673 | + |
---|
| 3674 | + return psi_show(seq, psi, PSI_CPU); |
---|
3518 | 3675 | } |
---|
3519 | 3676 | |
---|
3520 | 3677 | static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, |
---|
3521 | 3678 | size_t nbytes, enum psi_res res) |
---|
3522 | 3679 | { |
---|
| 3680 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
3523 | 3681 | struct psi_trigger *new; |
---|
3524 | 3682 | struct cgroup *cgrp; |
---|
| 3683 | + struct psi_group *psi; |
---|
3525 | 3684 | |
---|
3526 | 3685 | cgrp = cgroup_kn_lock_live(of->kn, false); |
---|
3527 | 3686 | if (!cgrp) |
---|
.. | .. |
---|
3530 | 3689 | cgroup_get(cgrp); |
---|
3531 | 3690 | cgroup_kn_unlock(of->kn); |
---|
3532 | 3691 | |
---|
3533 | | - new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); |
---|
| 3692 | + /* Allow only one trigger per file descriptor */ |
---|
| 3693 | + if (ctx->psi.trigger) { |
---|
| 3694 | + cgroup_put(cgrp); |
---|
| 3695 | + return -EBUSY; |
---|
| 3696 | + } |
---|
| 3697 | + |
---|
| 3698 | + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; |
---|
| 3699 | + new = psi_trigger_create(psi, buf, nbytes, res); |
---|
3534 | 3700 | if (IS_ERR(new)) { |
---|
3535 | 3701 | cgroup_put(cgrp); |
---|
3536 | 3702 | return PTR_ERR(new); |
---|
3537 | 3703 | } |
---|
3538 | 3704 | |
---|
3539 | | - psi_trigger_replace(&of->priv, new); |
---|
3540 | | - |
---|
| 3705 | + smp_store_release(&ctx->psi.trigger, new); |
---|
3541 | 3706 | cgroup_put(cgrp); |
---|
3542 | 3707 | |
---|
3543 | 3708 | return nbytes; |
---|
.. | .. |
---|
3567 | 3732 | static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, |
---|
3568 | 3733 | poll_table *pt) |
---|
3569 | 3734 | { |
---|
3570 | | - return psi_trigger_poll(&of->priv, of->file, pt); |
---|
| 3735 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
| 3736 | + return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); |
---|
3571 | 3737 | } |
---|
3572 | 3738 | |
---|
3573 | 3739 | static void cgroup_pressure_release(struct kernfs_open_file *of) |
---|
3574 | 3740 | { |
---|
3575 | | - psi_trigger_replace(&of->priv, NULL); |
---|
| 3741 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
| 3742 | + |
---|
| 3743 | + psi_trigger_destroy(ctx->psi.trigger); |
---|
3576 | 3744 | } |
---|
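End to end, the per-fd trigger plumbing above is driven like this from userspace: open a pressure file, write one trigger (a second write on the same fd now fails with -EBUSY thanks to the `ctx->psi.trigger` check), then poll for POLLPRI. A hedged sketch (the cgroup path and thresholds are examples):

```c
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char trig[] = "some 150000 1000000"; /* 150ms stall per 1s window */
	struct pollfd pfd;
	int fd;

	fd = open("/sys/fs/cgroup/test/memory.pressure", O_RDWR | O_NONBLOCK);
	if (fd < 0)
		return 1;
	if (write(fd, trig, strlen(trig) + 1) < 0)
		return 1;	/* a second write on this fd would get EBUSY */

	pfd.fd = fd;
	pfd.events = POLLPRI;	/* PSI reports threshold crossings as POLLPRI */
	poll(&pfd, 1, -1);	/* blocks until pressure exceeds the trigger */
	close(fd);
	return 0;
}
```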
3577 | 3745 | |
---|
3578 | 3746 | bool cgroup_psi_enabled(void) |
---|
.. | .. |
---|
3625 | 3793 | static int cgroup_file_open(struct kernfs_open_file *of) |
---|
3626 | 3794 | { |
---|
3627 | 3795 | struct cftype *cft = of->kn->priv; |
---|
| 3796 | + struct cgroup_file_ctx *ctx; |
---|
| 3797 | + int ret; |
---|
3628 | 3798 | |
---|
3629 | | - if (cft->open) |
---|
3630 | | - return cft->open(of); |
---|
3631 | | - return 0; |
---|
| 3799 | + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); |
---|
| 3800 | + if (!ctx) |
---|
| 3801 | + return -ENOMEM; |
---|
| 3802 | + |
---|
| 3803 | + ctx->ns = current->nsproxy->cgroup_ns; |
---|
| 3804 | + get_cgroup_ns(ctx->ns); |
---|
| 3805 | + of->priv = ctx; |
---|
| 3806 | + |
---|
| 3807 | + if (!cft->open) |
---|
| 3808 | + return 0; |
---|
| 3809 | + |
---|
| 3810 | + ret = cft->open(of); |
---|
| 3811 | + if (ret) { |
---|
| 3812 | + put_cgroup_ns(ctx->ns); |
---|
| 3813 | + kfree(ctx); |
---|
| 3814 | + } |
---|
| 3815 | + return ret; |
---|
3632 | 3816 | } |
---|
3633 | 3817 | |
---|
3634 | 3818 | static void cgroup_file_release(struct kernfs_open_file *of) |
---|
3635 | 3819 | { |
---|
3636 | 3820 | struct cftype *cft = of->kn->priv; |
---|
| 3821 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
3637 | 3822 | |
---|
3638 | 3823 | if (cft->release) |
---|
3639 | 3824 | cft->release(of); |
---|
| 3825 | + put_cgroup_ns(ctx->ns); |
---|
| 3826 | + kfree(ctx); |
---|
3640 | 3827 | } |
---|
3641 | 3828 | |
---|
3642 | 3829 | static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, |
---|
3643 | 3830 | size_t nbytes, loff_t off) |
---|
3644 | 3831 | { |
---|
3645 | | - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; |
---|
| 3832 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
3646 | 3833 | struct cgroup *cgrp = of->kn->parent->priv; |
---|
3647 | 3834 | struct cftype *cft = of->kn->priv; |
---|
3648 | 3835 | struct cgroup_subsys_state *css; |
---|
3649 | 3836 | int ret; |
---|
| 3837 | + |
---|
| 3838 | + if (!nbytes) |
---|
| 3839 | + return 0; |
---|
3650 | 3840 | |
---|
3651 | 3841 | /* |
---|
3652 | 3842 | * If namespaces are delegation boundaries, disallow writes to |
---|
.. | .. |
---|
3656 | 3846 | */ |
---|
3657 | 3847 | if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && |
---|
3658 | 3848 | !(cft->flags & CFTYPE_NS_DELEGATABLE) && |
---|
3659 | | - ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) |
---|
| 3849 | + ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) |
---|
3660 | 3850 | return -EPERM; |
---|
3661 | 3851 | |
---|
3662 | 3852 | if (cft->write) |
---|
.. | .. |
---|
3843 | 4033 | continue; |
---|
3844 | 4034 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) |
---|
3845 | 4035 | continue; |
---|
3846 | | - |
---|
| 4036 | + if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) |
---|
| 4037 | + continue; |
---|
3847 | 4038 | if (is_add) { |
---|
3848 | 4039 | ret = cgroup_add_file(css, cgrp, cft); |
---|
3849 | 4040 | if (ret) { |
---|
.. | .. |
---|
4045 | 4236 | cft->flags |= __CFTYPE_NOT_ON_DFL; |
---|
4046 | 4237 | return cgroup_add_cftypes(ss, cfts); |
---|
4047 | 4238 | } |
---|
| 4239 | +EXPORT_SYMBOL_GPL(cgroup_add_legacy_cftypes); |
---|
4048 | 4240 | |
---|
4049 | 4241 | /** |
---|
4050 | 4242 | * cgroup_file_notify - generate a file modified event for a cgroup_file |
---|
.. | .. |
---|
4120 | 4312 | } else if (likely(!(pos->flags & CSS_RELEASED))) { |
---|
4121 | 4313 | next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); |
---|
4122 | 4314 | } else { |
---|
4123 | | - list_for_each_entry_rcu(next, &parent->children, sibling) |
---|
| 4315 | + list_for_each_entry_rcu(next, &parent->children, sibling, |
---|
| 4316 | + lockdep_is_held(&cgroup_mutex)) |
---|
4124 | 4317 | if (next->serial_nr > pos->serial_nr) |
---|
4125 | 4318 | break; |
---|
4126 | 4319 | } |
---|
.. | .. |
---|
4133 | 4326 | return next; |
---|
4134 | 4327 | return NULL; |
---|
4135 | 4328 | } |
---|
| 4329 | +EXPORT_SYMBOL_GPL(css_next_child); |
---|
4136 | 4330 | |
---|
4137 | 4331 | /** |
---|
4138 | 4332 | * css_next_descendant_pre - find the next descendant for pre-order walk |
---|
.. | .. |
---|
4182 | 4376 | |
---|
4183 | 4377 | return NULL; |
---|
4184 | 4378 | } |
---|
| 4379 | +EXPORT_SYMBOL_GPL(css_next_descendant_pre); |
---|
4185 | 4380 | |
---|
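With css_next_child() and css_next_descendant_pre() now exported, modules can reuse the core's pre-order subtree walks. A hedged sketch using the existing css_for_each_descendant_pre() wrapper; the helper name, the pr_debug() body, and the assumption that the caller already holds @root_css are illustrative only:

```c
/* Sketch: pre-order walk over a subtree, as the new exports permit.
 * @root_css is assumed to be a css the caller already holds. */
static void walk_subtree(struct cgroup_subsys_state *root_css)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root_css) {
		/* @pos may be dying; check before touching its state */
		if (css_is_dying(pos))
			continue;
		pr_debug("visiting cgroup %llu\n", cgroup_id(pos->cgroup));
	}
	rcu_read_unlock();
}
```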
4186 | 4381 | /** |
---|
4187 | 4382 | * css_rightmost_descendant - return the rightmost descendant of a css |
---|
.. | .. |
---|
4362 | 4557 | |
---|
4363 | 4558 | lockdep_assert_held(&css_set_lock); |
---|
4364 | 4559 | |
---|
4365 | | - /* Advance to the next non-empty css_set */ |
---|
4366 | | - do { |
---|
4367 | | - cset = css_task_iter_next_css_set(it); |
---|
4368 | | - if (!cset) { |
---|
4369 | | - it->task_pos = NULL; |
---|
4370 | | - return; |
---|
| 4560 | + /* Advance to the next non-empty css_set and find the first non-empty tasks list */ |
---|
| 4561 | + while ((cset = css_task_iter_next_css_set(it))) { |
---|
| 4562 | + if (!list_empty(&cset->tasks)) { |
---|
| 4563 | + it->cur_tasks_head = &cset->tasks; |
---|
| 4564 | + break; |
---|
| 4565 | + } else if (!list_empty(&cset->mg_tasks)) { |
---|
| 4566 | + it->cur_tasks_head = &cset->mg_tasks; |
---|
| 4567 | + break; |
---|
| 4568 | + } else if (!list_empty(&cset->dying_tasks)) { |
---|
| 4569 | + it->cur_tasks_head = &cset->dying_tasks; |
---|
| 4570 | + break; |
---|
4371 | 4571 | } |
---|
4372 | | - } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); |
---|
4373 | | - |
---|
4374 | | - if (!list_empty(&cset->tasks)) { |
---|
4375 | | - it->task_pos = cset->tasks.next; |
---|
4376 | | - it->cur_tasks_head = &cset->tasks; |
---|
4377 | | - } else if (!list_empty(&cset->mg_tasks)) { |
---|
4378 | | - it->task_pos = cset->mg_tasks.next; |
---|
4379 | | - it->cur_tasks_head = &cset->mg_tasks; |
---|
4380 | | - } else { |
---|
4381 | | - it->task_pos = cset->dying_tasks.next; |
---|
4382 | | - it->cur_tasks_head = &cset->dying_tasks; |
---|
4383 | 4572 | } |
---|
4384 | | - |
---|
4385 | | - it->tasks_head = &cset->tasks; |
---|
4386 | | - it->mg_tasks_head = &cset->mg_tasks; |
---|
4387 | | - it->dying_tasks_head = &cset->dying_tasks; |
---|
| 4573 | + if (!cset) { |
---|
| 4574 | + it->task_pos = NULL; |
---|
| 4575 | + return; |
---|
| 4576 | + } |
---|
| 4577 | + it->task_pos = it->cur_tasks_head->next; |
---|
4388 | 4578 | |
---|
4389 | 4579 | /* |
---|
4390 | 4580 | * We don't keep css_sets locked across iteration steps and thus |
---|
.. | .. |
---|
4429 | 4619 | repeat: |
---|
4430 | 4620 | if (it->task_pos) { |
---|
4431 | 4621 | /* |
---|
4432 | | - * Advance iterator to find next entry. cset->tasks is |
---|
4433 | | - * consumed first and then ->mg_tasks. After ->mg_tasks, |
---|
4434 | | - * we move onto the next cset. |
---|
| 4622 | + * Advance iterator to find next entry. We go through cset |
---|
| 4623 | + * tasks, mg_tasks and dying_tasks; once these are consumed we move onto |
---|
| 4624 | + * the next cset. |
---|
4435 | 4625 | */ |
---|
4436 | 4626 | if (it->flags & CSS_TASK_ITER_SKIPPED) |
---|
4437 | 4627 | it->flags &= ~CSS_TASK_ITER_SKIPPED; |
---|
4438 | 4628 | else |
---|
4439 | 4629 | it->task_pos = it->task_pos->next; |
---|
4440 | 4630 | |
---|
4441 | | - if (it->task_pos == it->tasks_head) { |
---|
4442 | | - it->task_pos = it->mg_tasks_head->next; |
---|
4443 | | - it->cur_tasks_head = it->mg_tasks_head; |
---|
| 4631 | + if (it->task_pos == &it->cur_cset->tasks) { |
---|
| 4632 | + it->cur_tasks_head = &it->cur_cset->mg_tasks; |
---|
| 4633 | + it->task_pos = it->cur_tasks_head->next; |
---|
4444 | 4634 | } |
---|
4445 | | - if (it->task_pos == it->mg_tasks_head) { |
---|
4446 | | - it->task_pos = it->dying_tasks_head->next; |
---|
4447 | | - it->cur_tasks_head = it->dying_tasks_head; |
---|
| 4635 | + if (it->task_pos == &it->cur_cset->mg_tasks) { |
---|
| 4636 | + it->cur_tasks_head = &it->cur_cset->dying_tasks; |
---|
| 4637 | + it->task_pos = it->cur_tasks_head->next; |
---|
4448 | 4638 | } |
---|
4449 | | - if (it->task_pos == it->dying_tasks_head) |
---|
| 4639 | + if (it->task_pos == &it->cur_cset->dying_tasks) |
---|
4450 | 4640 | css_task_iter_advance_css_set(it); |
---|
4451 | 4641 | } else { |
---|
4452 | 4642 | /* called from start, proceed to the first cset */ |
---|
.. | .. |
---|
4464 | 4654 | goto repeat; |
---|
4465 | 4655 | |
---|
4466 | 4656 | /* and dying leaders w/o live member threads */ |
---|
4467 | | - if (it->cur_tasks_head == it->dying_tasks_head && |
---|
| 4657 | + if (it->cur_tasks_head == &it->cur_cset->dying_tasks && |
---|
4468 | 4658 | !atomic_read(&task->signal->live)) |
---|
4469 | 4659 | goto repeat; |
---|
4470 | 4660 | } else { |
---|
4471 | 4661 | /* skip all dying ones */ |
---|
4472 | | - if (it->cur_tasks_head == it->dying_tasks_head) |
---|
| 4662 | + if (it->cur_tasks_head == &it->cur_cset->dying_tasks) |
---|
4473 | 4663 | goto repeat; |
---|
4474 | 4664 | } |
---|
4475 | 4665 | } |
---|
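The rework drops the cached tasks_head/mg_tasks_head/dying_tasks_head pointers and derives each list head from it->cur_cset on demand; the external iterator API is unchanged. A short consumer sketch of that API (the helper name count_tasks is made up for illustration):

```c
/* Sketch: count live threads attached to a css via the task iterator.
 * With flags == 0 the iterator walks individual threads. */
static int count_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int n = 0;

	css_task_iter_start(css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		n++;
	css_task_iter_end(&it);
	return n;
}
```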
.. | .. |
---|
4488 | 4678 | void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, |
---|
4489 | 4679 | struct css_task_iter *it) |
---|
4490 | 4680 | { |
---|
4491 | | - /* no one should try to iterate before mounting cgroups */ |
---|
4492 | | - WARN_ON_ONCE(!use_task_css_set_links); |
---|
4493 | | - |
---|
4494 | 4681 | memset(it, 0, sizeof(*it)); |
---|
4495 | 4682 | |
---|
4496 | 4683 | spin_lock_irq(&css_set_lock); |
---|
.. | .. |
---|
4567 | 4754 | |
---|
4568 | 4755 | static void cgroup_procs_release(struct kernfs_open_file *of) |
---|
4569 | 4756 | { |
---|
4570 | | - if (of->priv) { |
---|
4571 | | - css_task_iter_end(of->priv); |
---|
4572 | | - kfree(of->priv); |
---|
4573 | | - } |
---|
| 4757 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
| 4758 | + |
---|
| 4759 | + if (ctx->procs.started) |
---|
| 4760 | + css_task_iter_end(&ctx->procs.iter); |
---|
4574 | 4761 | } |
---|
4575 | 4762 | |
---|
4576 | 4763 | static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) |
---|
4577 | 4764 | { |
---|
4578 | 4765 | struct kernfs_open_file *of = s->private; |
---|
4579 | | - struct css_task_iter *it = of->priv; |
---|
| 4766 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
4580 | 4767 | |
---|
4581 | 4768 | if (pos) |
---|
4582 | 4769 | (*pos)++; |
---|
4583 | 4770 | |
---|
4584 | | - return css_task_iter_next(it); |
---|
| 4771 | + return css_task_iter_next(&ctx->procs.iter); |
---|
4585 | 4772 | } |
---|
4586 | 4773 | |
---|
4587 | 4774 | static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, |
---|
.. | .. |
---|
4589 | 4776 | { |
---|
4590 | 4777 | struct kernfs_open_file *of = s->private; |
---|
4591 | 4778 | struct cgroup *cgrp = seq_css(s)->cgroup; |
---|
4592 | | - struct css_task_iter *it = of->priv; |
---|
| 4779 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
| 4780 | + struct css_task_iter *it = &ctx->procs.iter; |
---|
4593 | 4781 | |
---|
4594 | 4782 | /* |
---|
4595 | 4783 | * When a seq_file is seeked, it's always traversed sequentially |
---|
4596 | 4784 | * from position 0, so we can simply keep iterating on !0 *pos. |
---|
4597 | 4785 | */ |
---|
4598 | | - if (!it) { |
---|
| 4786 | + if (!ctx->procs.started) { |
---|
4599 | 4787 | if (WARN_ON_ONCE((*pos))) |
---|
4600 | 4788 | return ERR_PTR(-EINVAL); |
---|
4601 | | - |
---|
4602 | | - it = kzalloc(sizeof(*it), GFP_KERNEL); |
---|
4603 | | - if (!it) |
---|
4604 | | - return ERR_PTR(-ENOMEM); |
---|
4605 | | - of->priv = it; |
---|
4606 | 4789 | css_task_iter_start(&cgrp->self, iter_flags, it); |
---|
| 4790 | + ctx->procs.started = true; |
---|
4607 | 4791 | } else if (!(*pos)) { |
---|
4608 | 4792 | css_task_iter_end(it); |
---|
4609 | 4793 | css_task_iter_start(&cgrp->self, iter_flags, it); |
---|
.. | .. |
---|
4636 | 4820 | return 0; |
---|
4637 | 4821 | } |
---|
4638 | 4822 | |
---|
| 4823 | +static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) |
---|
| 4824 | +{ |
---|
| 4825 | + int ret; |
---|
| 4826 | + struct inode *inode; |
---|
| 4827 | + |
---|
| 4828 | + lockdep_assert_held(&cgroup_mutex); |
---|
| 4829 | + |
---|
| 4830 | + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); |
---|
| 4831 | + if (!inode) |
---|
| 4832 | + return -ENOMEM; |
---|
| 4833 | + |
---|
| 4834 | + ret = inode_permission(inode, MAY_WRITE); |
---|
| 4835 | + iput(inode); |
---|
| 4836 | + return ret; |
---|
| 4837 | +} |
---|
| 4838 | + |
---|
4639 | 4839 | static int cgroup_procs_write_permission(struct cgroup *src_cgrp, |
---|
4640 | 4840 | struct cgroup *dst_cgrp, |
---|
4641 | | - struct super_block *sb) |
---|
| 4841 | + struct super_block *sb, |
---|
| 4842 | + struct cgroup_namespace *ns) |
---|
4642 | 4843 | { |
---|
4643 | | - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; |
---|
4644 | 4844 | struct cgroup *com_cgrp = src_cgrp; |
---|
4645 | | - struct inode *inode; |
---|
4646 | 4845 | int ret; |
---|
4647 | 4846 | |
---|
4648 | 4847 | lockdep_assert_held(&cgroup_mutex); |
---|
.. | .. |
---|
4652 | 4851 | com_cgrp = cgroup_parent(com_cgrp); |
---|
4653 | 4852 | |
---|
4654 | 4853 | /* %current should be authorized to migrate to the common ancestor */ |
---|
4655 | | - inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); |
---|
4656 | | - if (!inode) |
---|
4657 | | - return -ENOMEM; |
---|
4658 | | - |
---|
4659 | | - ret = inode_permission(inode, MAY_WRITE); |
---|
4660 | | - iput(inode); |
---|
| 4854 | + ret = cgroup_may_write(com_cgrp, sb); |
---|
4661 | 4855 | if (ret) |
---|
4662 | 4856 | return ret; |
---|
4663 | 4857 | |
---|
.. | .. |
---|
4673 | 4867 | return 0; |
---|
4674 | 4868 | } |
---|
4675 | 4869 | |
---|
| 4870 | +static int cgroup_attach_permissions(struct cgroup *src_cgrp, |
---|
| 4871 | + struct cgroup *dst_cgrp, |
---|
| 4872 | + struct super_block *sb, bool threadgroup, |
---|
| 4873 | + struct cgroup_namespace *ns) |
---|
| 4874 | +{ |
---|
| 4875 | + int ret = 0; |
---|
| 4876 | + |
---|
| 4877 | + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); |
---|
| 4878 | + if (ret) |
---|
| 4879 | + return ret; |
---|
| 4880 | + |
---|
| 4881 | + ret = cgroup_migrate_vet_dst(dst_cgrp); |
---|
| 4882 | + if (ret) |
---|
| 4883 | + return ret; |
---|
| 4884 | + |
---|
| 4885 | + if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)) |
---|
| 4886 | + ret = -EOPNOTSUPP; |
---|
| 4887 | + |
---|
| 4888 | + return ret; |
---|
| 4889 | +} |
---|
| 4890 | + |
---|
4676 | 4891 | static ssize_t cgroup_procs_write(struct kernfs_open_file *of, |
---|
4677 | 4892 | char *buf, size_t nbytes, loff_t off) |
---|
4678 | 4893 | { |
---|
| 4894 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
4679 | 4895 | struct cgroup *src_cgrp, *dst_cgrp; |
---|
4680 | 4896 | struct task_struct *task; |
---|
| 4897 | + const struct cred *saved_cred; |
---|
4681 | 4898 | ssize_t ret; |
---|
| 4899 | + bool threadgroup_locked; |
---|
4682 | 4900 | |
---|
4683 | 4901 | dst_cgrp = cgroup_kn_lock_live(of->kn, false); |
---|
4684 | 4902 | if (!dst_cgrp) |
---|
4685 | 4903 | return -ENODEV; |
---|
4686 | 4904 | |
---|
4687 | | - task = cgroup_procs_write_start(buf, true); |
---|
| 4905 | + task = cgroup_procs_write_start(buf, true, &threadgroup_locked, dst_cgrp); |
---|
4688 | 4906 | ret = PTR_ERR_OR_ZERO(task); |
---|
4689 | 4907 | if (ret) |
---|
4690 | 4908 | goto out_unlock; |
---|
.. | .. |
---|
4694 | 4912 | src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); |
---|
4695 | 4913 | spin_unlock_irq(&css_set_lock); |
---|
4696 | 4914 | |
---|
4697 | | - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, |
---|
4698 | | - of->file->f_path.dentry->d_sb); |
---|
| 4915 | + /* |
---|
| 4916 | + * Process and thread migrations follow the same delegation rule. Check |
---|
| 4917 | + * permissions using the credentials from file open to protect against |
---|
| 4918 | + * inherited fd attacks. |
---|
| 4919 | + */ |
---|
| 4920 | + saved_cred = override_creds(of->file->f_cred); |
---|
| 4921 | + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, |
---|
| 4922 | + of->file->f_path.dentry->d_sb, true, |
---|
| 4923 | + ctx->ns); |
---|
| 4924 | + revert_creds(saved_cred); |
---|
4699 | 4925 | if (ret) |
---|
4700 | 4926 | goto out_finish; |
---|
4701 | 4927 | |
---|
4702 | 4928 | ret = cgroup_attach_task(dst_cgrp, task, true); |
---|
4703 | 4929 | |
---|
4704 | 4930 | out_finish: |
---|
4705 | | - cgroup_procs_write_finish(task); |
---|
| 4931 | + cgroup_procs_write_finish(task, threadgroup_locked); |
---|
4706 | 4932 | out_unlock: |
---|
4707 | 4933 | cgroup_kn_unlock(of->kn); |
---|
4708 | 4934 | |
---|
.. | .. |
---|
4717 | 4943 | static ssize_t cgroup_threads_write(struct kernfs_open_file *of, |
---|
4718 | 4944 | char *buf, size_t nbytes, loff_t off) |
---|
4719 | 4945 | { |
---|
| 4946 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
4720 | 4947 | struct cgroup *src_cgrp, *dst_cgrp; |
---|
4721 | 4948 | struct task_struct *task; |
---|
| 4949 | + const struct cred *saved_cred; |
---|
4722 | 4950 | ssize_t ret; |
---|
| 4951 | + bool threadgroup_locked; |
---|
4723 | 4952 | |
---|
4724 | 4953 | buf = strstrip(buf); |
---|
4725 | 4954 | |
---|
.. | .. |
---|
4727 | 4956 | if (!dst_cgrp) |
---|
4728 | 4957 | return -ENODEV; |
---|
4729 | 4958 | |
---|
4730 | | - task = cgroup_procs_write_start(buf, false); |
---|
| 4959 | + task = cgroup_procs_write_start(buf, false, &threadgroup_locked, dst_cgrp); |
---|
4731 | 4960 | ret = PTR_ERR_OR_ZERO(task); |
---|
4732 | 4961 | if (ret) |
---|
4733 | 4962 | goto out_unlock; |
---|
.. | .. |
---|
4737 | 4966 | src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); |
---|
4738 | 4967 | spin_unlock_irq(&css_set_lock); |
---|
4739 | 4968 | |
---|
4740 | | - /* thread migrations follow the cgroup.procs delegation rule */ |
---|
4741 | | - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, |
---|
4742 | | - of->file->f_path.dentry->d_sb); |
---|
| 4969 | + /* |
---|
| 4970 | + * Process and thread migrations follow the same delegation rule. Check |
---|
| 4971 | + * permissions using the credentials from file open to protect against |
---|
| 4972 | + * inherited fd attacks. |
---|
| 4973 | + */ |
---|
| 4974 | + saved_cred = override_creds(of->file->f_cred); |
---|
| 4975 | + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, |
---|
| 4976 | + of->file->f_path.dentry->d_sb, false, |
---|
| 4977 | + ctx->ns); |
---|
| 4978 | + revert_creds(saved_cred); |
---|
4743 | 4979 | if (ret) |
---|
4744 | | - goto out_finish; |
---|
4745 | | - |
---|
4746 | | - /* and must be contained in the same domain */ |
---|
4747 | | - ret = -EOPNOTSUPP; |
---|
4748 | | - if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) |
---|
4749 | 4980 | goto out_finish; |
---|
4750 | 4981 | |
---|
4751 | 4982 | ret = cgroup_attach_task(dst_cgrp, task, false); |
---|
4752 | 4983 | |
---|
4753 | 4984 | out_finish: |
---|
4754 | | - cgroup_procs_write_finish(task); |
---|
| 4985 | + cgroup_procs_write_finish(task, threadgroup_locked); |
---|
4755 | 4986 | out_unlock: |
---|
4756 | 4987 | cgroup_kn_unlock(of->kn); |
---|
4757 | 4988 | |
---|
.. | .. |
---|
4823 | 5054 | }, |
---|
4824 | 5055 | { |
---|
4825 | 5056 | .name = "cpu.stat", |
---|
4826 | | - .flags = CFTYPE_NOT_ON_ROOT, |
---|
4827 | 5057 | .seq_show = cpu_stat_show, |
---|
4828 | 5058 | }, |
---|
4829 | 5059 | #ifdef CONFIG_PSI |
---|
4830 | 5060 | { |
---|
4831 | 5061 | .name = "io.pressure", |
---|
4832 | | - .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE, |
---|
| 5062 | + .flags = CFTYPE_PRESSURE, |
---|
4833 | 5063 | .seq_show = cgroup_io_pressure_show, |
---|
4834 | 5064 | .write = cgroup_io_pressure_write, |
---|
4835 | 5065 | .poll = cgroup_pressure_poll, |
---|
.. | .. |
---|
4837 | 5067 | }, |
---|
4838 | 5068 | { |
---|
4839 | 5069 | .name = "memory.pressure", |
---|
4840 | | - .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE, |
---|
| 5070 | + .flags = CFTYPE_PRESSURE, |
---|
4841 | 5071 | .seq_show = cgroup_memory_pressure_show, |
---|
4842 | 5072 | .write = cgroup_memory_pressure_write, |
---|
4843 | 5073 | .poll = cgroup_pressure_poll, |
---|
.. | .. |
---|
4845 | 5075 | }, |
---|
4846 | 5076 | { |
---|
4847 | 5077 | .name = "cpu.pressure", |
---|
4848 | | - .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE, |
---|
| 5078 | + .flags = CFTYPE_PRESSURE, |
---|
4849 | 5079 | .seq_show = cgroup_cpu_pressure_show, |
---|
4850 | 5080 | .write = cgroup_cpu_pressure_write, |
---|
4851 | 5081 | .poll = cgroup_pressure_poll, |
---|
.. | .. |
---|
4927 | 5157 | } |
---|
4928 | 5158 | } |
---|
4929 | 5159 | |
---|
4930 | | -static void css_release_work_fn(struct swork_event *sev) |
---|
| 5160 | +static void css_release_work_fn(struct work_struct *work) |
---|
4931 | 5161 | { |
---|
4932 | 5162 | struct cgroup_subsys_state *css = |
---|
4933 | | - container_of(sev, struct cgroup_subsys_state, destroy_swork); |
---|
| 5163 | + container_of(work, struct cgroup_subsys_state, destroy_work); |
---|
4934 | 5164 | struct cgroup_subsys *ss = css->ss; |
---|
4935 | 5165 | struct cgroup *cgrp = css->cgroup; |
---|
4936 | 5166 | |
---|
.. | .. |
---|
4964 | 5194 | tcgrp->nr_dying_descendants--; |
---|
4965 | 5195 | spin_unlock_irq(&css_set_lock); |
---|
4966 | 5196 | |
---|
4967 | | - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); |
---|
4968 | | - cgrp->id = -1; |
---|
4969 | | - |
---|
4970 | 5197 | /* |
---|
4971 | 5198 | * There are two control paths which try to determine |
---|
4972 | 5199 | * cgroup from dentry without going through kernfs - |
---|
.. | .. |
---|
4977 | 5204 | if (cgrp->kn) |
---|
4978 | 5205 | RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, |
---|
4979 | 5206 | NULL); |
---|
4980 | | - |
---|
4981 | | - cgroup_bpf_put(cgrp); |
---|
4982 | 5207 | } |
---|
4983 | 5208 | |
---|
4984 | 5209 | mutex_unlock(&cgroup_mutex); |
---|
.. | .. |
---|
4992 | 5217 | struct cgroup_subsys_state *css = |
---|
4993 | 5218 | container_of(ref, struct cgroup_subsys_state, refcnt); |
---|
4994 | 5219 | |
---|
4995 | | - INIT_SWORK(&css->destroy_swork, css_release_work_fn); |
---|
4996 | | - swork_queue(&css->destroy_swork); |
---|
| 5220 | + INIT_WORK(&css->destroy_work, css_release_work_fn); |
---|
| 5221 | + queue_work(cgroup_destroy_wq, &css->destroy_work); |
---|
4997 | 5222 | } |
---|
4998 | 5223 | |
---|
4999 | 5224 | static void init_and_link_css(struct cgroup_subsys_state *css, |
---|
.. | .. |
---|
5133 | 5358 | * it isn't associated with its kernfs_node and doesn't have the control |
---|
5134 | 5359 | * mask applied. |
---|
5135 | 5360 | */ |
---|
5136 | | -static struct cgroup *cgroup_create(struct cgroup *parent) |
---|
| 5361 | +static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, |
---|
| 5362 | + umode_t mode) |
---|
5137 | 5363 | { |
---|
5138 | 5364 | struct cgroup_root *root = parent->root; |
---|
5139 | 5365 | struct cgroup *cgrp, *tcgrp; |
---|
| 5366 | + struct kernfs_node *kn; |
---|
5140 | 5367 | int level = parent->level + 1; |
---|
5141 | 5368 | int ret; |
---|
5142 | 5369 | |
---|
.. | .. |
---|
5156 | 5383 | goto out_cancel_ref; |
---|
5157 | 5384 | } |
---|
5158 | 5385 | |
---|
5159 | | - /* |
---|
5160 | | - * Temporarily set the pointer to NULL, so idr_find() won't return |
---|
5161 | | - * a half-baked cgroup. |
---|
5162 | | - */ |
---|
5163 | | - cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); |
---|
5164 | | - if (cgrp->id < 0) { |
---|
5165 | | - ret = -ENOMEM; |
---|
| 5386 | + /* create the directory */ |
---|
| 5387 | + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); |
---|
| 5388 | + if (IS_ERR(kn)) { |
---|
| 5389 | + ret = PTR_ERR(kn); |
---|
5166 | 5390 | goto out_stat_exit; |
---|
5167 | 5391 | } |
---|
| 5392 | + cgrp->kn = kn; |
---|
5168 | 5393 | |
---|
5169 | 5394 | init_cgroup_housekeeping(cgrp); |
---|
5170 | 5395 | |
---|
.. | .. |
---|
5174 | 5399 | |
---|
5175 | 5400 | ret = psi_cgroup_alloc(cgrp); |
---|
5176 | 5401 | if (ret) |
---|
5177 | | - goto out_idr_free; |
---|
| 5402 | + goto out_kernfs_remove; |
---|
5178 | 5403 | |
---|
5179 | 5404 | ret = cgroup_bpf_inherit(cgrp); |
---|
5180 | 5405 | if (ret) |
---|
.. | .. |
---|
5198 | 5423 | |
---|
5199 | 5424 | spin_lock_irq(&css_set_lock); |
---|
5200 | 5425 | for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { |
---|
5201 | | - cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; |
---|
| 5426 | + cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); |
---|
5202 | 5427 | |
---|
5203 | 5428 | if (tcgrp != cgrp) { |
---|
5204 | 5429 | tcgrp->nr_descendants++; |
---|
.. | .. |
---|
5228 | 5453 | cgroup_get_live(parent); |
---|
5229 | 5454 | |
---|
5230 | 5455 | /* |
---|
5231 | | - * @cgrp is now fully operational. If something fails after this |
---|
5232 | | - * point, it'll be released via the normal destruction path. |
---|
5233 | | - */ |
---|
5234 | | - cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); |
---|
5235 | | - |
---|
5236 | | - /* |
---|
5237 | 5456 | * On the default hierarchy, a child doesn't automatically inherit |
---|
5238 | 5457 | * subtree_control from the parent. Each is configured manually. |
---|
5239 | 5458 | */ |
---|
.. | .. |
---|
5246 | 5465 | |
---|
5247 | 5466 | out_psi_free: |
---|
5248 | 5467 | psi_cgroup_free(cgrp); |
---|
5249 | | -out_idr_free: |
---|
5250 | | - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); |
---|
| 5468 | +out_kernfs_remove: |
---|
| 5469 | + kernfs_remove(cgrp->kn); |
---|
5251 | 5470 | out_stat_exit: |
---|
5252 | 5471 | if (cgroup_on_dfl(parent)) |
---|
5253 | 5472 | cgroup_rstat_exit(cgrp); |
---|
.. | .. |
---|
5284 | 5503 | int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) |
---|
5285 | 5504 | { |
---|
5286 | 5505 | struct cgroup *parent, *cgrp; |
---|
5287 | | - struct kernfs_node *kn; |
---|
5288 | 5506 | int ret; |
---|
5289 | 5507 | |
---|
5290 | 5508 | /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ |
---|
.. | .. |
---|
5300 | 5518 | goto out_unlock; |
---|
5301 | 5519 | } |
---|
5302 | 5520 | |
---|
5303 | | - cgrp = cgroup_create(parent); |
---|
| 5521 | + cgrp = cgroup_create(parent, name, mode); |
---|
5304 | 5522 | if (IS_ERR(cgrp)) { |
---|
5305 | 5523 | ret = PTR_ERR(cgrp); |
---|
5306 | 5524 | goto out_unlock; |
---|
5307 | 5525 | } |
---|
5308 | 5526 | |
---|
5309 | | - /* create the directory */ |
---|
5310 | | - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); |
---|
5311 | | - if (IS_ERR(kn)) { |
---|
5312 | | - ret = PTR_ERR(kn); |
---|
5313 | | - goto out_destroy; |
---|
5314 | | - } |
---|
5315 | | - cgrp->kn = kn; |
---|
5316 | | - |
---|
5317 | 5527 | /* |
---|
5318 | 5528 | * This extra ref will be put in cgroup_free_fn() and guarantees |
---|
5319 | 5529 | * that @cgrp->kn is always accessible. |
---|
5320 | 5530 | */ |
---|
5321 | | - kernfs_get(kn); |
---|
| 5531 | + kernfs_get(cgrp->kn); |
---|
5322 | 5532 | |
---|
5323 | | - ret = cgroup_kn_set_ugid(kn); |
---|
| 5533 | + ret = cgroup_kn_set_ugid(cgrp->kn); |
---|
5324 | 5534 | if (ret) |
---|
5325 | 5535 | goto out_destroy; |
---|
5326 | 5536 | |
---|
.. | .. |
---|
5335 | 5545 | TRACE_CGROUP_PATH(mkdir, cgrp); |
---|
5336 | 5546 | |
---|
5337 | 5547 | /* let's create and online css's */ |
---|
5338 | | - kernfs_activate(kn); |
---|
| 5548 | + kernfs_activate(cgrp->kn); |
---|
5339 | 5549 | |
---|
5340 | 5550 | ret = 0; |
---|
5341 | 5551 | goto out_unlock; |
---|
.. | .. |
---|
5512 | 5722 | |
---|
5513 | 5723 | cgroup1_check_for_release(parent); |
---|
5514 | 5724 | |
---|
| 5725 | + cgroup_bpf_offline(cgrp); |
---|
| 5726 | + |
---|
5515 | 5727 | /* put the base reference */ |
---|
5516 | 5728 | percpu_ref_kill(&cgrp->self.refcnt); |
---|
5517 | 5729 | |
---|
.. | .. |
---|
5537 | 5749 | |
---|
5538 | 5750 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { |
---|
5539 | 5751 | .show_options = cgroup_show_options, |
---|
5540 | | - .remount_fs = cgroup_remount, |
---|
5541 | 5752 | .mkdir = cgroup_mkdir, |
---|
5542 | 5753 | .rmdir = cgroup_rmdir, |
---|
5543 | 5754 | .show_path = cgroup_show_path, |
---|
.. | .. |
---|
5604 | 5815 | */ |
---|
5605 | 5816 | int __init cgroup_init_early(void) |
---|
5606 | 5817 | { |
---|
5607 | | - static struct cgroup_sb_opts __initdata opts; |
---|
| 5818 | + static struct cgroup_fs_context __initdata ctx; |
---|
5608 | 5819 | struct cgroup_subsys *ss; |
---|
5609 | 5820 | int i; |
---|
5610 | 5821 | |
---|
5611 | | - init_cgroup_root(&cgrp_dfl_root, &opts); |
---|
| 5822 | + ctx.root = &cgrp_dfl_root; |
---|
| 5823 | + init_cgroup_root(&ctx); |
---|
5612 | 5824 | cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; |
---|
5613 | 5825 | |
---|
5614 | 5826 | RCU_INIT_POINTER(init_task.cgroups, &init_css_set); |
---|
.. | .. |
---|
5644 | 5856 | int ssid; |
---|
5645 | 5857 | |
---|
5646 | 5858 | BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); |
---|
5647 | | - BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); |
---|
5648 | 5859 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); |
---|
5649 | 5860 | BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); |
---|
5650 | 5861 | |
---|
5651 | 5862 | cgroup_rstat_boot(); |
---|
5652 | 5863 | |
---|
5653 | 5864 | /* |
---|
5654 | | - * The latency of the synchronize_sched() is too high for cgroups, |
---|
| 5865 | + * The latency of the synchronize_rcu() is too high for cgroups, |
---|
5655 | 5866 | * avoid it at the cost of forcing all readers into the slow path. |
---|
5656 | 5867 | */ |
---|
5657 | 5868 | rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); |
---|
.. | .. |
---|
5735 | 5946 | WARN_ON(register_filesystem(&cgroup_fs_type)); |
---|
5736 | 5947 | WARN_ON(register_filesystem(&cgroup2_fs_type)); |
---|
5737 | 5948 | WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); |
---|
| 5949 | +#ifdef CONFIG_CPUSETS |
---|
| 5950 | + WARN_ON(register_filesystem(&cpuset_fs_type)); |
---|
| 5951 | +#endif |
---|
5738 | 5952 | |
---|
5739 | 5953 | return 0; |
---|
5740 | 5954 | } |
---|
.. | .. |
---|
5751 | 5965 | */ |
---|
5752 | 5966 | cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); |
---|
5753 | 5967 | BUG_ON(!cgroup_destroy_wq); |
---|
5754 | | - BUG_ON(swork_get()); |
---|
5755 | 5968 | return 0; |
---|
5756 | 5969 | } |
---|
5757 | 5970 | core_initcall(cgroup_wq_init); |
---|
5758 | 5971 | |
---|
5759 | | -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, |
---|
5760 | | - char *buf, size_t buflen) |
---|
| 5972 | +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) |
---|
5761 | 5973 | { |
---|
5762 | 5974 | struct kernfs_node *kn; |
---|
5763 | 5975 | |
---|
5764 | | - kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); |
---|
| 5976 | + kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); |
---|
5765 | 5977 | if (!kn) |
---|
5766 | 5978 | return; |
---|
5767 | 5979 | kernfs_path(kn, buf, buflen); |
---|
.. | .. |
---|
5851 | 6063 | * @child: pointer to task_struct of forking parent process. |
---|
5852 | 6064 | * |
---|
5853 | 6065 | * A task is associated with the init_css_set until cgroup_post_fork() |
---|
5854 | | - * attaches it to the parent's css_set. Empty cg_list indicates that |
---|
5855 | | - * @child isn't holding reference to its css_set. |
---|
| 6066 | + * attaches it to the target css_set. |
---|
5856 | 6067 | */ |
---|
5857 | 6068 | void cgroup_fork(struct task_struct *child) |
---|
5858 | 6069 | { |
---|
.. | .. |
---|
5860 | 6071 | INIT_LIST_HEAD(&child->cg_list); |
---|
5861 | 6072 | } |
---|
5862 | 6073 | |
---|
| 6074 | +static struct cgroup *cgroup_get_from_file(struct file *f) |
---|
| 6075 | +{ |
---|
| 6076 | + struct cgroup_subsys_state *css; |
---|
| 6077 | + struct cgroup *cgrp; |
---|
| 6078 | + |
---|
| 6079 | + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); |
---|
| 6080 | + if (IS_ERR(css)) |
---|
| 6081 | + return ERR_CAST(css); |
---|
| 6082 | + |
---|
| 6083 | + cgrp = css->cgroup; |
---|
| 6084 | + if (!cgroup_on_dfl(cgrp)) { |
---|
| 6085 | + cgroup_put(cgrp); |
---|
| 6086 | + return ERR_PTR(-EBADF); |
---|
| 6087 | + } |
---|
| 6088 | + |
---|
| 6089 | + return cgrp; |
---|
| 6090 | +} |
---|
| 6091 | + |
---|
| 6092 | +/** |
---|
| 6093 | + * cgroup_css_set_fork - find or create a css_set for a child process |
---|
| 6094 | + * @kargs: the arguments passed to create the child process |
---|
| 6095 | + * |
---|
| 6096 | + * This function finds or creates a new css_set which the child |
---|
| 6097 | + * process will be attached to in cgroup_post_fork(). By default, |
---|
| 6098 | + * the child process will be given the same css_set as its parent. |
---|
| 6099 | + * |
---|
| 6100 | + * If CLONE_INTO_CGROUP is specified this function will try to find an |
---|
| 6101 | + * existing css_set which includes the requested cgroup and, if not, create |
---|
| 6102 | + * a new css_set that the child will be attached to later. If this function |
---|
| 6103 | + * succeeds it will hold cgroup_threadgroup_rwsem on return. If |
---|
| 6104 | + * CLONE_INTO_CGROUP is requested this function will grab the cgroup mutex |
---|
| 6105 | + * before grabbing cgroup_threadgroup_rwsem and will hold a reference |
---|
| 6106 | + * to the target cgroup. |
---|
| 6107 | + */ |
---|
| 6108 | +static int cgroup_css_set_fork(struct kernel_clone_args *kargs) |
---|
| 6109 | + __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem) |
---|
| 6110 | +{ |
---|
| 6111 | + int ret; |
---|
| 6112 | + struct cgroup *dst_cgrp = NULL; |
---|
| 6113 | + struct css_set *cset; |
---|
| 6114 | + struct super_block *sb; |
---|
| 6115 | + struct file *f; |
---|
| 6116 | + |
---|
| 6117 | + if (kargs->flags & CLONE_INTO_CGROUP) |
---|
| 6118 | + mutex_lock(&cgroup_mutex); |
---|
| 6119 | + |
---|
| 6120 | + cgroup_threadgroup_change_begin(current); |
---|
| 6121 | + |
---|
| 6122 | + spin_lock_irq(&css_set_lock); |
---|
| 6123 | + cset = task_css_set(current); |
---|
| 6124 | + get_css_set(cset); |
---|
| 6125 | + spin_unlock_irq(&css_set_lock); |
---|
| 6126 | + |
---|
| 6127 | + if (!(kargs->flags & CLONE_INTO_CGROUP)) { |
---|
| 6128 | + kargs->cset = cset; |
---|
| 6129 | + return 0; |
---|
| 6130 | + } |
---|
| 6131 | + |
---|
| 6132 | + f = fget_raw(kargs->cgroup); |
---|
| 6133 | + if (!f) { |
---|
| 6134 | + ret = -EBADF; |
---|
| 6135 | + goto err; |
---|
| 6136 | + } |
---|
| 6137 | + sb = f->f_path.dentry->d_sb; |
---|
| 6138 | + |
---|
| 6139 | + dst_cgrp = cgroup_get_from_file(f); |
---|
| 6140 | + if (IS_ERR(dst_cgrp)) { |
---|
| 6141 | + ret = PTR_ERR(dst_cgrp); |
---|
| 6142 | + dst_cgrp = NULL; |
---|
| 6143 | + goto err; |
---|
| 6144 | + } |
---|
| 6145 | + |
---|
| 6146 | + if (cgroup_is_dead(dst_cgrp)) { |
---|
| 6147 | + ret = -ENODEV; |
---|
| 6148 | + goto err; |
---|
| 6149 | + } |
---|
| 6150 | + |
---|
| 6151 | + /* |
---|
| 6152 | + * Verify that the target cgroup is writable for us. This is |
---|
| 6153 | + * usually done by the vfs layer but since we're not going through |
---|
| 6154 | + * the vfs layer here we need to do it "manually". |
---|
| 6155 | + */ |
---|
| 6156 | + ret = cgroup_may_write(dst_cgrp, sb); |
---|
| 6157 | + if (ret) |
---|
| 6158 | + goto err; |
---|
| 6159 | + |
---|
| 6160 | + ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, |
---|
| 6161 | + !(kargs->flags & CLONE_THREAD), |
---|
| 6162 | + current->nsproxy->cgroup_ns); |
---|
| 6163 | + if (ret) |
---|
| 6164 | + goto err; |
---|
| 6165 | + |
---|
| 6166 | + kargs->cset = find_css_set(cset, dst_cgrp); |
---|
| 6167 | + if (!kargs->cset) { |
---|
| 6168 | + ret = -ENOMEM; |
---|
| 6169 | + goto err; |
---|
| 6170 | + } |
---|
| 6171 | + |
---|
| 6172 | + put_css_set(cset); |
---|
| 6173 | + fput(f); |
---|
| 6174 | + kargs->cgrp = dst_cgrp; |
---|
| 6175 | + return ret; |
---|
| 6176 | + |
---|
| 6177 | +err: |
---|
| 6178 | + cgroup_threadgroup_change_end(current); |
---|
| 6179 | + mutex_unlock(&cgroup_mutex); |
---|
| 6180 | + if (f) |
---|
| 6181 | + fput(f); |
---|
| 6182 | + if (dst_cgrp) |
---|
| 6183 | + cgroup_put(dst_cgrp); |
---|
| 6184 | + put_css_set(cset); |
---|
| 6185 | + if (kargs->cset) |
---|
| 6186 | + put_css_set(kargs->cset); |
---|
| 6187 | + return ret; |
---|
| 6188 | +} |
---|
| 6189 | + |
---|
| 6190 | +/** |
---|
| 6191 | + * cgroup_css_set_put_fork - drop references we took during fork |
---|
| 6192 | + * @kargs: the arguments passed to create the child process |
---|
| 6193 | + * |
---|
| 6194 | + * Drop references to the prepared css_set and target cgroup if |
---|
| 6195 | + * CLONE_INTO_CGROUP was requested. |
---|
| 6196 | + */ |
---|
| 6197 | +static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) |
---|
| 6198 | + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) |
---|
| 6199 | +{ |
---|
| 6200 | + cgroup_threadgroup_change_end(current); |
---|
| 6201 | + |
---|
| 6202 | + if (kargs->flags & CLONE_INTO_CGROUP) { |
---|
| 6203 | + struct cgroup *cgrp = kargs->cgrp; |
---|
| 6204 | + struct css_set *cset = kargs->cset; |
---|
| 6205 | + |
---|
| 6206 | + mutex_unlock(&cgroup_mutex); |
---|
| 6207 | + |
---|
| 6208 | + if (cset) { |
---|
| 6209 | + put_css_set(cset); |
---|
| 6210 | + kargs->cset = NULL; |
---|
| 6211 | + } |
---|
| 6212 | + |
---|
| 6213 | + if (cgrp) { |
---|
| 6214 | + cgroup_put(cgrp); |
---|
| 6215 | + kargs->cgrp = NULL; |
---|
| 6216 | + } |
---|
| 6217 | + } |
---|
| 6218 | +} |
---|
| 6219 | + |
---|
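cgroup_css_set_fork() and cgroup_css_set_put_fork() are the kernel half of CLONE_INTO_CGROUP: the target css_set is found, permission-checked with cgroup_may_write() and cgroup_attach_permissions(), and pinned before the child ever becomes visible. From userspace the feature is reached through clone3(); a hedged sketch follows — the raw syscall is used because glibc offers no wrapper, and the cgroup path is an assumption:

```c
/* Sketch: spawn a child directly into a target cgroup with clone3().
 * The path "/sys/fs/cgroup/test" is illustrative. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/sched.h>	/* struct clone_args, CLONE_INTO_CGROUP */
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

static pid_t clone_into(int cgroup_fd)
{
	struct clone_args args = {
		.flags		= CLONE_INTO_CGROUP,
		.exit_signal	= SIGCHLD,
		.cgroup		= cgroup_fd,
	};

	return syscall(__NR_clone3, &args, sizeof(args));
}

int main(void)
{
	int cgfd = open("/sys/fs/cgroup/test", O_DIRECTORY | O_CLOEXEC);
	pid_t pid;

	if (cgfd < 0)
		return 1;
	pid = clone_into(cgfd);
	if (pid < 0) {
		perror("clone3");
		return 1;
	}
	if (pid == 0) {		/* child: already in the target cgroup */
		execlp("cat", "cat", "/proc/self/cgroup", (char *)NULL);
		_exit(127);
	}
	waitpid(pid, NULL, 0);
	return 0;
}
```

A write-protected or dead target cgroup unwinds through the err: path above and surfaces as an ordinary clone3() failure rather than a partially attached child.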
5863 | 6220 | /** |
---|
5864 | 6221 | * cgroup_can_fork - called on a new task before the process is exposed |
---|
5865 | | - * @child: the task in question. |
---|
| 6222 | + * @child: the child process |
---|
5866 | 6223 | * |
---|
5867 | | - * This calls the subsystem can_fork() callbacks. If the can_fork() callback |
---|
5868 | | - * returns an error, the fork aborts with that error code. This allows for |
---|
5869 | | - * a cgroup subsystem to conditionally allow or deny new forks. |
---|
| 6224 | + * This prepares a new css_set for the child process which the child will |
---|
| 6225 | + * be attached to in cgroup_post_fork(). |
---|
| 6226 | + * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() |
---|
| 6227 | + * callback returns an error, the fork aborts with that error code. This |
---|
| 6228 | + * allows for a cgroup subsystem to conditionally allow or deny new forks. |
---|
5870 | 6229 | */ |
---|
5871 | | -int cgroup_can_fork(struct task_struct *child) |
---|
| 6230 | +int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) |
---|
5872 | 6231 | { |
---|
5873 | 6232 | struct cgroup_subsys *ss; |
---|
5874 | 6233 | int i, j, ret; |
---|
5875 | 6234 | |
---|
| 6235 | + ret = cgroup_css_set_fork(kargs); |
---|
| 6236 | + if (ret) |
---|
| 6237 | + return ret; |
---|
| 6238 | + |
---|
5876 | 6239 | do_each_subsys_mask(ss, i, have_canfork_callback) { |
---|
5877 | | - ret = ss->can_fork(child); |
---|
| 6240 | + ret = ss->can_fork(child, kargs->cset); |
---|
5878 | 6241 | if (ret) |
---|
5879 | 6242 | goto out_revert; |
---|
5880 | 6243 | } while_each_subsys_mask(); |
---|
.. | .. |
---|
5886 | 6249 | if (j >= i) |
---|
5887 | 6250 | break; |
---|
5888 | 6251 | if (ss->cancel_fork) |
---|
5889 | | - ss->cancel_fork(child); |
---|
| 6252 | + ss->cancel_fork(child, kargs->cset); |
---|
5890 | 6253 | } |
---|
| 6254 | + |
---|
| 6255 | + cgroup_css_set_put_fork(kargs); |
---|
5891 | 6256 | |
---|
5892 | 6257 | return ret; |
---|
5893 | 6258 | } |
---|
5894 | 6259 | |
---|
5895 | 6260 | /** |
---|
5896 | 6261 | * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() |
---|
5897 | | - * @child: the task in question |
---|
| 6262 | + * @child: the child process |
---|
| 6263 | + * @kargs: the arguments passed to create the child process |
---|
5898 | 6264 | * |
---|
5899 | 6265 | * This calls the cancel_fork() callbacks if a fork failed *after* |
---|
5900 | | - * cgroup_can_fork() succeded. |
---|
| 6266 | + * cgroup_can_fork() succeeded and cleans up references we took to |
---|
| 6267 | + * prepare a new css_set for the child process in cgroup_can_fork(). |
---|
5901 | 6268 | */ |
---|
5902 | | -void cgroup_cancel_fork(struct task_struct *child) |
---|
| 6269 | +void cgroup_cancel_fork(struct task_struct *child, |
---|
| 6270 | + struct kernel_clone_args *kargs) |
---|
5903 | 6271 | { |
---|
5904 | 6272 | struct cgroup_subsys *ss; |
---|
5905 | 6273 | int i; |
---|
5906 | 6274 | |
---|
5907 | 6275 | for_each_subsys(ss, i) |
---|
5908 | 6276 | if (ss->cancel_fork) |
---|
5909 | | - ss->cancel_fork(child); |
---|
| 6277 | + ss->cancel_fork(child, kargs->cset); |
---|
| 6278 | + |
---|
| 6279 | + cgroup_css_set_put_fork(kargs); |
---|
5910 | 6280 | } |
---|
5911 | 6281 | |
---|
5912 | 6282 | /** |
---|
5913 | | - * cgroup_post_fork - called on a new task after adding it to the task list |
---|
5914 | | - * @child: the task in question |
---|
| 6283 | + * cgroup_post_fork - finalize cgroup setup for the child process |
---|
| 6284 | + * @child: the child process |
---|
5915 | 6285 | * |
---|
5916 | | - * Adds the task to the list running through its css_set if necessary and |
---|
5917 | | - * call the subsystem fork() callbacks. Has to be after the task is |
---|
5918 | | - * visible on the task list in case we race with the first call to |
---|
5919 | | - * cgroup_task_iter_start() - to guarantee that the new task ends up on its |
---|
5920 | | - * list. |
---|
| 6286 | + * Attach the child process to its css_set calling the subsystem fork() |
---|
| 6287 | + * callbacks. |
---|
5921 | 6288 | */ |
---|
5922 | | -void cgroup_post_fork(struct task_struct *child) |
---|
| 6289 | +void cgroup_post_fork(struct task_struct *child, |
---|
| 6290 | + struct kernel_clone_args *kargs) |
---|
| 6291 | + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) |
---|
5923 | 6292 | { |
---|
5924 | 6293 | struct cgroup_subsys *ss; |
---|
| 6294 | + struct css_set *cset; |
---|
5925 | 6295 | int i; |
---|
5926 | 6296 | |
---|
5927 | | - /* |
---|
5928 | | - * This may race against cgroup_enable_task_cg_lists(). As that |
---|
5929 | | - * function sets use_task_css_set_links before grabbing |
---|
5930 | | - * tasklist_lock and we just went through tasklist_lock to add |
---|
5931 | | - * @child, it's guaranteed that either we see the set |
---|
5932 | | - * use_task_css_set_links or cgroup_enable_task_cg_lists() sees |
---|
5933 | | - * @child during its iteration. |
---|
5934 | | - * |
---|
5935 | | - * If we won the race, @child is associated with %current's |
---|
5936 | | - * css_set. Grabbing css_set_lock guarantees both that the |
---|
5937 | | - * association is stable, and, on completion of the parent's |
---|
5938 | | - * migration, @child is visible in the source of migration or |
---|
5939 | | - * already in the destination cgroup. This guarantee is necessary |
---|
5940 | | - * when implementing operations which need to migrate all tasks of |
---|
5941 | | - * a cgroup to another. |
---|
5942 | | - * |
---|
5943 | | - * Note that if we lose to cgroup_enable_task_cg_lists(), @child |
---|
5944 | | - * will remain in init_css_set. This is safe because all tasks are |
---|
5945 | | - * in the init_css_set before cg_links is enabled and there's no |
---|
5946 | | - * operation which transfers all tasks out of init_css_set. |
---|
5947 | | - */ |
---|
5948 | | - if (use_task_css_set_links) { |
---|
5949 | | - struct css_set *cset; |
---|
| 6297 | + cset = kargs->cset; |
---|
| 6298 | + kargs->cset = NULL; |
---|
5950 | 6299 | |
---|
5951 | | - spin_lock_irq(&css_set_lock); |
---|
5952 | | - cset = task_css_set(current); |
---|
5953 | | - if (list_empty(&child->cg_list)) { |
---|
5954 | | - get_css_set(cset); |
---|
5955 | | - cset->nr_tasks++; |
---|
5956 | | - css_set_move_task(child, NULL, cset, false); |
---|
5957 | | - } |
---|
| 6300 | + spin_lock_irq(&css_set_lock); |
---|
| 6301 | + |
---|
| 6302 | + /* init tasks are special; only link regular threads */ |
---|
| 6303 | + if (likely(child->pid)) { |
---|
| 6304 | + WARN_ON_ONCE(!list_empty(&child->cg_list)); |
---|
| 6305 | + cset->nr_tasks++; |
---|
| 6306 | + css_set_move_task(child, NULL, cset, false); |
---|
| 6307 | + } else { |
---|
| 6308 | + put_css_set(cset); |
---|
| 6309 | + cset = NULL; |
---|
| 6310 | + } |
---|
| 6311 | + |
---|
| 6312 | + /* |
---|
| 6313 | + * If the cgroup has to be frozen, the new task has too. Let's set |
---|
| 6314 | + * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the |
---|
| 6315 | + * frozen state. |
---|
| 6316 | + */ |
---|
| 6317 | + if (unlikely(cgroup_task_freeze(child))) { |
---|
| 6318 | + spin_lock(&child->sighand->siglock); |
---|
| 6319 | + WARN_ON_ONCE(child->frozen); |
---|
| 6320 | + child->jobctl |= JOBCTL_TRAP_FREEZE; |
---|
| 6321 | + spin_unlock(&child->sighand->siglock); |
---|
5958 | 6322 | |
---|
5959 | 6323 | /* |
---|
5960 | | - * If the cgroup has to be frozen, the new task has too. |
---|
5961 | | - * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get |
---|
5962 | | - * the task into the frozen state. |
---|
| 6324 | + * Calling cgroup_update_frozen() isn't required here, |
---|
| 6325 | + * because it will be called anyway a bit later from |
---|
| 6326 | + * do_freezer_trap(). So we avoid cgroup's transient switch |
---|
| 6327 | + * from the frozen state and back. |
---|
5963 | 6328 | */ |
---|
5964 | | - if (unlikely(cgroup_task_freeze(child))) { |
---|
5965 | | - spin_lock(&child->sighand->siglock); |
---|
5966 | | - WARN_ON_ONCE(child->frozen); |
---|
5967 | | - child->jobctl |= JOBCTL_TRAP_FREEZE; |
---|
5968 | | - spin_unlock(&child->sighand->siglock); |
---|
5969 | | - |
---|
5970 | | - /* |
---|
5971 | | - * Calling cgroup_update_frozen() isn't required here, |
---|
5972 | | - * because it will be called anyway a bit later |
---|
5973 | | - * from do_freezer_trap(). So we avoid cgroup's |
---|
5974 | | - * transient switch from the frozen state and back. |
---|
5975 | | - */ |
---|
5976 | | - } |
---|
5977 | | - |
---|
5978 | | - spin_unlock_irq(&css_set_lock); |
---|
5979 | 6329 | } |
---|
| 6330 | + |
---|
| 6331 | + spin_unlock_irq(&css_set_lock); |
---|
5980 | 6332 | |
---|
5981 | 6333 | /* |
---|
5982 | 6334 | * Call ss->fork(). This must happen after @child is linked on |
---|
.. | .. |
---|
5986 | 6338 | do_each_subsys_mask(ss, i, have_fork_callback) { |
---|
5987 | 6339 | ss->fork(child); |
---|
5988 | 6340 | } while_each_subsys_mask(); |
---|
| 6341 | + |
---|
| 6342 | + /* Make the new cset the root_cset of the new cgroup namespace. */ |
---|
| 6343 | + if (kargs->flags & CLONE_NEWCGROUP) { |
---|
| 6344 | + struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset; |
---|
| 6345 | + |
---|
| 6346 | + get_css_set(cset); |
---|
| 6347 | + child->nsproxy->cgroup_ns->root_cset = cset; |
---|
| 6348 | + put_css_set(rcset); |
---|
| 6349 | + } |
---|
| 6350 | + |
---|
| 6351 | + cgroup_css_set_put_fork(kargs); |
---|
5989 | 6352 | } |
---|
5990 | 6353 | |
---|
5991 | 6354 | /** |
---|
5992 | 6355 | * cgroup_exit - detach cgroup from exiting task |
---|
5993 | 6356 | * @tsk: pointer to task_struct of exiting process |
---|
5994 | 6357 | * |
---|
5995 | | - * Description: Detach cgroup from @tsk and release it. |
---|
| 6358 | + * Description: Detach cgroup from @tsk. |
---|
5996 | 6359 | * |
---|
5997 | | - * Note that cgroups marked notify_on_release force every task in |
---|
5998 | | - * them to take the global cgroup_mutex mutex when exiting. |
---|
5999 | | - * This could impact scaling on very large systems. Be reluctant to |
---|
6000 | | - * use notify_on_release cgroups where very high task exit scaling |
---|
6001 | | - * is required on large systems. |
---|
6002 | | - * |
---|
6003 | | - * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We |
---|
6004 | | - * call cgroup_exit() while the task is still competent to handle |
---|
6005 | | - * notify_on_release(), then leave the task attached to the root cgroup in |
---|
6006 | | - * each hierarchy for the remainder of its exit. No need to bother with |
---|
6007 | | - * init_css_set refcnting. init_css_set never goes away and we can't race |
---|
6008 | | - * with migration path - PF_EXITING is visible to migration path. |
---|
6009 | 6360 | */ |
---|
6010 | 6361 | void cgroup_exit(struct task_struct *tsk) |
---|
6011 | 6362 | { |
---|
.. | .. |
---|
6013 | 6364 | struct css_set *cset; |
---|
6014 | 6365 | int i; |
---|
6015 | 6366 | |
---|
6016 | | - /* |
---|
6017 | | - * Unlink from @tsk from its css_set. As migration path can't race |
---|
6018 | | - * with us, we can check css_set and cg_list without synchronization. |
---|
6019 | | - */ |
---|
| 6367 | + spin_lock_irq(&css_set_lock); |
---|
| 6368 | + |
---|
| 6369 | + WARN_ON_ONCE(list_empty(&tsk->cg_list)); |
---|
6020 | 6370 | cset = task_css_set(tsk); |
---|
| 6371 | + css_set_move_task(tsk, cset, NULL, false); |
---|
| 6372 | + list_add_tail(&tsk->cg_list, &cset->dying_tasks); |
---|
| 6373 | + cset->nr_tasks--; |
---|
6021 | 6374 | |
---|
6022 | | - if (!list_empty(&tsk->cg_list)) { |
---|
6023 | | - spin_lock_irq(&css_set_lock); |
---|
6024 | | - css_set_move_task(tsk, cset, NULL, false); |
---|
6025 | | - list_add_tail(&tsk->cg_list, &cset->dying_tasks); |
---|
6026 | | - cset->nr_tasks--; |
---|
| 6375 | + WARN_ON_ONCE(cgroup_task_frozen(tsk)); |
---|
| 6376 | + if (unlikely(cgroup_task_freeze(tsk))) |
---|
| 6377 | + cgroup_update_frozen(task_dfl_cgroup(tsk)); |
---|
6027 | 6378 | |
---|
6028 | | - if (unlikely(cgroup_task_frozen(tsk))) |
---|
6029 | | - cgroup_freezer_frozen_exit(tsk); |
---|
6030 | | - else if (unlikely(cgroup_task_freeze(tsk))) |
---|
6031 | | - cgroup_update_frozen(task_dfl_cgroup(tsk)); |
---|
6032 | | - |
---|
6033 | | - spin_unlock_irq(&css_set_lock); |
---|
6034 | | - } else { |
---|
6035 | | - get_css_set(cset); |
---|
6036 | | - } |
---|
| 6379 | + spin_unlock_irq(&css_set_lock); |
---|
6037 | 6380 | |
---|
6038 | 6381 | /* see cgroup_post_fork() for details */ |
---|
6039 | 6382 | do_each_subsys_mask(ss, i, have_exit_callback) { |
---|
.. | .. |
---|
6050 | 6393 | ss->release(task); |
---|
6051 | 6394 | } while_each_subsys_mask(); |
---|
6052 | 6395 | |
---|
6053 | | - if (use_task_css_set_links) { |
---|
6054 | | - spin_lock_irq(&css_set_lock); |
---|
6055 | | - css_set_skip_task_iters(task_css_set(task), task); |
---|
6056 | | - list_del_init(&task->cg_list); |
---|
6057 | | - spin_unlock_irq(&css_set_lock); |
---|
6058 | | - } |
---|
| 6396 | + spin_lock_irq(&css_set_lock); |
---|
| 6397 | + css_set_skip_task_iters(task_css_set(task), task); |
---|
| 6398 | + list_del_init(&task->cg_list); |
---|
| 6399 | + spin_unlock_irq(&css_set_lock); |
---|
6059 | 6400 | } |
---|
6060 | 6401 | |
---|
6061 | 6402 | void cgroup_free(struct task_struct *task) |
---|
.. | .. |
---|
6096 | 6437 | return 1; |
---|
6097 | 6438 | } |
---|
6098 | 6439 | __setup("cgroup_disable=", cgroup_disable); |
---|
| 6440 | + |
---|
| 6441 | +void __init __weak enable_debug_cgroup(void) { } |
---|
| 6442 | + |
---|
| 6443 | +static int __init enable_cgroup_debug(char *str) |
---|
| 6444 | +{ |
---|
| 6445 | + cgroup_debug = true; |
---|
| 6446 | + enable_debug_cgroup(); |
---|
| 6447 | + return 1; |
---|
| 6448 | +} |
---|
| 6449 | +__setup("cgroup_debug", enable_cgroup_debug); |
---|
6099 | 6450 | |
---|
6100 | 6451 | /** |
---|
6101 | 6452 | * css_tryget_online_from_dir - get corresponding css from a cgroup dentry |
---|
.. | .. |
---|
6196 | 6547 | */ |
---|
6197 | 6548 | struct cgroup *cgroup_get_from_fd(int fd) |
---|
6198 | 6549 | { |
---|
6199 | | - struct cgroup_subsys_state *css; |
---|
6200 | 6550 | struct cgroup *cgrp; |
---|
6201 | 6551 | struct file *f; |
---|
6202 | 6552 | |
---|
.. | .. |
---|
6204 | 6554 | if (!f) |
---|
6205 | 6555 | return ERR_PTR(-EBADF); |
---|
6206 | 6556 | |
---|
6207 | | - css = css_tryget_online_from_dir(f->f_path.dentry, NULL); |
---|
| 6557 | + cgrp = cgroup_get_from_file(f); |
---|
6208 | 6558 | fput(f); |
---|
6209 | | - if (IS_ERR(css)) |
---|
6210 | | - return ERR_CAST(css); |
---|
6211 | | - |
---|
6212 | | - cgrp = css->cgroup; |
---|
6213 | | - if (!cgroup_on_dfl(cgrp)) { |
---|
6214 | | - cgroup_put(cgrp); |
---|
6215 | | - return ERR_PTR(-EBADF); |
---|
6216 | | - } |
---|
6217 | | - |
---|
6218 | 6559 | return cgrp; |
---|
6219 | 6560 | } |
---|
6220 | 6561 | EXPORT_SYMBOL_GPL(cgroup_get_from_fd); |
---|
.. | .. |
---|
6305 | 6646 | cset = task_css_set(current); |
---|
6306 | 6647 | if (likely(cgroup_tryget(cset->dfl_cgrp))) { |
---|
6307 | 6648 | skcd->val = (unsigned long)cset->dfl_cgrp; |
---|
| 6649 | + cgroup_bpf_get(cset->dfl_cgrp); |
---|
6308 | 6650 | break; |
---|
6309 | 6651 | } |
---|
6310 | 6652 | cpu_relax(); |
---|
.. | .. |
---|
6315 | 6657 | |
---|
6316 | 6658 | void cgroup_sk_clone(struct sock_cgroup_data *skcd) |
---|
6317 | 6659 | { |
---|
6318 | | - /* Socket clone path */ |
---|
6319 | 6660 | if (skcd->val) { |
---|
6320 | 6661 | if (skcd->no_refcnt) |
---|
6321 | 6662 | return; |
---|
.. | .. |
---|
6325 | 6666 | * Don't use cgroup_get_live(). |
---|
6326 | 6667 | */ |
---|
6327 | 6668 | cgroup_get(sock_cgroup_ptr(skcd)); |
---|
| 6669 | + cgroup_bpf_get(sock_cgroup_ptr(skcd)); |
---|
6328 | 6670 | } |
---|
6329 | 6671 | } |
---|
6330 | 6672 | |
---|
6331 | 6673 | void cgroup_sk_free(struct sock_cgroup_data *skcd) |
---|
6332 | 6674 | { |
---|
| 6675 | + struct cgroup *cgrp = sock_cgroup_ptr(skcd); |
---|
| 6676 | + |
---|
6333 | 6677 | if (skcd->no_refcnt) |
---|
6334 | 6678 | return; |
---|
6335 | | - |
---|
6336 | | - cgroup_put(sock_cgroup_ptr(skcd)); |
---|
| 6679 | + cgroup_bpf_put(cgrp); |
---|
| 6680 | + cgroup_put(cgrp); |
---|
6337 | 6681 | } |
---|
6338 | 6682 | |
---|
6339 | 6683 | #endif /* CONFIG_SOCK_CGROUP_DATA */ |
---|
6340 | 6684 | |
---|
6341 | 6685 | #ifdef CONFIG_CGROUP_BPF |
---|
6342 | | -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, |
---|
6343 | | - enum bpf_attach_type type, u32 flags) |
---|
| 6686 | +int cgroup_bpf_attach(struct cgroup *cgrp, |
---|
| 6687 | + struct bpf_prog *prog, struct bpf_prog *replace_prog, |
---|
| 6688 | + struct bpf_cgroup_link *link, |
---|
| 6689 | + enum bpf_attach_type type, |
---|
| 6690 | + u32 flags) |
---|
6344 | 6691 | { |
---|
6345 | 6692 | int ret; |
---|
6346 | 6693 | |
---|
6347 | 6694 | mutex_lock(&cgroup_mutex); |
---|
6348 | | - ret = __cgroup_bpf_attach(cgrp, prog, type, flags); |
---|
| 6695 | + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); |
---|
6349 | 6696 | mutex_unlock(&cgroup_mutex); |
---|
6350 | 6697 | return ret; |
---|
6351 | 6698 | } |
---|
| 6699 | + |
---|
6352 | 6700 | int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, |
---|
6353 | | - enum bpf_attach_type type, u32 flags) |
---|
| 6701 | + enum bpf_attach_type type) |
---|
6354 | 6702 | { |
---|
6355 | 6703 | int ret; |
---|
6356 | 6704 | |
---|
6357 | 6705 | mutex_lock(&cgroup_mutex); |
---|
6358 | | - ret = __cgroup_bpf_detach(cgrp, prog, type, flags); |
---|
| 6706 | + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); |
---|
6359 | 6707 | mutex_unlock(&cgroup_mutex); |
---|
6360 | 6708 | return ret; |
---|
6361 | 6709 | } |
---|
| 6710 | + |
---|
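cgroup_bpf_attach() grows @replace_prog and @link parameters to support BPF links and program replacement, while detach no longer needs @flags. Userspace reaches link-based attachment through BPF_LINK_CREATE; a hedged libbpf sketch, where the cgroup path and the assumption that @prog_fd holds a loaded cgroup-skb program are illustrative:

```c
/* Sketch: attach a cgroup BPF program via a bpf_link (libbpf).
 * @prog_fd is assumed to be a loaded BPF_PROG_TYPE_CGROUP_SKB program. */
#include <bpf/bpf.h>
#include <fcntl.h>
#include <unistd.h>

int attach_ingress(int prog_fd)
{
	int cg_fd = open("/sys/fs/cgroup/test", O_DIRECTORY | O_CLOEXEC);
	int link_fd;

	if (cg_fd < 0)
		return -1;
	/* The link pins the attachment: it auto-detaches when the last
	 * fd referring to it is closed, instead of outliving the loader. */
	link_fd = bpf_link_create(prog_fd, cg_fd, BPF_CGROUP_INET_INGRESS, NULL);
	close(cg_fd);
	return link_fd;
}
```

Unlike a plain BPF_PROG_ATTACH, the attachment's lifetime is tied to the returned link fd, which is what the new @link plumbing in the hunk above implements on the kernel side.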
6362 | 6711 | int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, |
---|
6363 | 6712 | union bpf_attr __user *uattr) |
---|
6364 | 6713 | { |
---|
.. | .. |
---|
6419 | 6768 | static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, |
---|
6420 | 6769 | char *buf) |
---|
6421 | 6770 | { |
---|
6422 | | - return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); |
---|
| 6771 | + return snprintf(buf, PAGE_SIZE, |
---|
| 6772 | + "nsdelegate\n" |
---|
| 6773 | + "memory_localevents\n" |
---|
| 6774 | + "memory_recursiveprot\n"); |
---|
6423 | 6775 | } |
---|
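features_show() backs /sys/kernel/cgroup/features, letting userspace probe which cgroup2 mount options the running kernel understands before mounting. A hedged sketch, with the mount point and option choice illustrative:

```c
/* Sketch: mount cgroup2 only with options the kernel advertises in
 * /sys/kernel/cgroup/features (paths and options are illustrative). */
#include <stdio.h>
#include <string.h>
#include <sys/mount.h>

int main(void)
{
	char feats[256] = "";
	FILE *f = fopen("/sys/kernel/cgroup/features", "r");
	const char *opts;

	if (f) {
		fread(feats, 1, sizeof(feats) - 1, f);
		fclose(f);
	}
	/* Only request memory_recursiveprot if the kernel lists it. */
	opts = strstr(feats, "memory_recursiveprot") ?
	       "nsdelegate,memory_recursiveprot" : "nsdelegate";
	if (mount("none", "/sys/fs/cgroup", "cgroup2", 0, opts)) {
		perror("mount");
		return 1;
	}
	return 0;
}
```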
6424 | 6776 | static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); |
---|
6425 | 6777 | |
---|