.. | ..
54 | 54 | #include <linux/proc_ns.h>
55 | 55 | #include <linux/nsproxy.h>
56 | 56 | #include <linux/file.h>
| 57 | +#include <linux/fs_parser.h>
57 | 58 | #include <linux/sched/cputime.h>
| 59 | +#include <linux/sched/deadline.h>
58 | 60 | #include <linux/psi.h>
59 | 61 | #include <net/sock.h>
60 | 62 |
61 | 63 | #define CREATE_TRACE_POINTS
62 | 64 | #include <trace/events/cgroup.h>
| 65 | +#undef CREATE_TRACE_POINTS
| 66 | +
| 67 | +#include <trace/hooks/cgroup.h>
63 | 68 |
64 | 69 | #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
65 | 70 | MAX_CFTYPE_NAME + 2)
.. | ..
86 | 91 |
87 | 92 | DEFINE_SPINLOCK(trace_cgroup_path_lock);
88 | 93 | char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
| 94 | +bool cgroup_debug __read_mostly;
89 | 95 |
90 | 96 | /*
91 | 97 | * Protects cgroup_idr and css_idr so that IDs can be released without
.. | ..
99 | 105 | */
100 | 106 | static DEFINE_SPINLOCK(cgroup_file_kn_lock);
101 | 107 |
102 | | -struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
| 108 | +DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
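Aside: switching to DEFINE_PERCPU_RWSEM() makes the rwsem fully initialized at compile time, removing the need for a runtime percpu_init_rwsem() call during boot. A minimal sketch of the two styles (the module context here is illustrative, not part of this patch):

```c
#include <linux/percpu-rwsem.h>

/* Old style: declare, then initialize at runtime before first use. */
static struct percpu_rw_semaphore my_rwsem;

static int __init my_init(void)
{
	return percpu_init_rwsem(&my_rwsem);	/* may fail on allocation */
}

/* New style: statically defined, usable from the earliest boot stages. */
DEFINE_STATIC_PERCPU_RWSEM(my_static_rwsem);
```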
103 | 109 |
104 | 110 | #define cgroup_assert_mutex_or_rcu_locked() \
105 | 111 | RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
.. | ..
151 | 157 |
152 | 158 | static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
153 | 159 |
154 | | -/*
155 | | - * The default hierarchy, reserved for the subsystems that are otherwise
156 | | - * unattached - it never has more than a single cgroup, and all tasks are
157 | | - * part of that cgroup.
158 | | - */
| 160 | +/* the default hierarchy */
159 | 161 | struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
160 | 162 | EXPORT_SYMBOL_GPL(cgrp_dfl_root);
161 | 163 |
.. | ..
264 | 266 | * can be used to test whether a cgroup is on the default hierarchy for
265 | 267 | * cases where a subsystem should behave differently depending on the
266 | 268 | * interface version.
267 | | - *
268 | | - * The set of behaviors which change on the default hierarchy are still
269 | | - * being determined and the mount option is prefixed with __DEVEL__.
270 | 269 | *
271 | 270 | * List of changed behaviors:
272 | 271 | *
.. | ..
502 | 501 |
503 | 502 | rcu_read_lock();
504 | 503 | css = cgroup_css(cgrp, ss);
505 | | - if (!css || !css_tryget_online(css))
| 504 | + if (css && !css_tryget_online(css))
506 | 505 | css = NULL;
507 | 506 | rcu_read_unlock();
508 | 507 |
.. | ..
510 | 509 | }
511 | 510 |
512 | 511 | /**
513 | | - * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
| 512 | + * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
514 | 513 | * @cgrp: the cgroup of interest
515 | 514 | * @ss: the subsystem of interest (%NULL returns @cgrp->self)
516 | 515 | *
.. | ..
519 | 518 | * enabled. If @ss is associated with the hierarchy @cgrp is on, this
520 | 519 | * function is guaranteed to return non-NULL css.
521 | 520 | */
522 | | -static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
523 | | - struct cgroup_subsys *ss)
| 521 | +static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
| 522 | + struct cgroup_subsys *ss)
524 | 523 | {
525 | 524 | lockdep_assert_held(&cgroup_mutex);
526 | 525 |
.. | ..
538 | 537 | }
539 | 538 |
540 | 539 | return cgroup_css(cgrp, ss);
| 540 | +}
| 541 | +
| 542 | +/**
| 543 | + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
| 544 | + * @cgrp: the cgroup of interest
| 545 | + * @ss: the subsystem of interest
| 546 | + *
| 547 | + * Find and get the effective css of @cgrp for @ss. The effective css is
| 548 | + * defined as the matching css of the nearest ancestor including self which
| 549 | + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
| 550 | + * the root css is returned, so this function always returns a valid css.
| 551 | + *
| 552 | + * The returned css is not guaranteed to be online, and therefore it is the
| 553 | + * caller's responsibility to tryget a reference for it.
| 554 | + */
| 555 | +struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
| 556 | + struct cgroup_subsys *ss)
| 557 | +{
| 558 | + struct cgroup_subsys_state *css;
| 559 | +
| 560 | + do {
| 561 | + css = cgroup_css(cgrp, ss);
| 562 | +
| 563 | + if (css)
| 564 | + return css;
| 565 | + cgrp = cgroup_parent(cgrp);
| 566 | + } while (cgrp);
| 567 | +
| 568 | + return init_css_set.subsys[ss->id];
541 | 569 | }
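Because the css returned by the new cgroup_e_css() may already be offline, a caller that wants to hold onto it must tryget under RCU. A hedged caller sketch (the helper name is illustrative, not from this patch):

```c
/* Illustrative: pin the effective css of @cgrp for @ss, if still online. */
static struct cgroup_subsys_state *pin_effective_css(struct cgroup *cgrp,
						     struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_e_css(cgrp, ss);	/* never NULL, but may be offline */
	if (!css_tryget_online(css))
		css = NULL;		/* raced with the css going offline */
	rcu_read_unlock();

	return css;			/* caller must css_put() when done */
}
```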
542 | 570 |
543 | 571 | /**
.. | ..
655 | 683 | *
656 | 684 | * Should be called under cgroup_[tree_]mutex.
657 | 685 | */
658 | | -#define for_each_e_css(css, ssid, cgrp) \
659 | | - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
660 | | - if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
661 | | - ; \
| 686 | +#define for_each_e_css(css, ssid, cgrp) \
| 687 | + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
| 688 | + if (!((css) = cgroup_e_css_by_mask(cgrp, \
| 689 | + cgroup_subsys[(ssid)]))) \
| 690 | + ; \
662 | 691 | else
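The dangling-else construction lets the macro skip subsystems with no effective css while still parsing as a single statement ahead of the loop body. A usage sketch (the body is illustrative):

```c
/* Illustrative: visit every effective css of @cgrp under cgroup_mutex. */
struct cgroup_subsys_state *css;
int ssid;

for_each_e_css(css, ssid, cgrp)
	pr_debug("ssid %d -> effective css %p\n", ssid, css);
```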
663 | 692 |
664 | 693 | /**
.. | ..
718 | 747 | * reference-counted, to improve performance when child cgroups
719 | 748 | * haven't been created.
720 | 749 | */
721 | | -struct css_set init_css_set = {
722 | | - .refcount = REFCOUNT_INIT(1),
723 | | - .dom_cset = &init_css_set,
724 | | - .tasks = LIST_HEAD_INIT(init_css_set.tasks),
725 | | - .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
726 | | - .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
727 | | - .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
728 | | - .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
729 | | - .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
730 | | - .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
731 | | - .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
732 | | -
733 | | - /*
734 | | - * The following field is re-initialized when this cset gets linked
735 | | - * in cgroup_init(). However, let's initialize the field
736 | | - * statically too so that the default cgroup can be accessed safely
737 | | - * early during boot.
738 | | - */
739 | | - .dfl_cgrp = &cgrp_dfl_root.cgrp,
| 750 | +struct ext_css_set init_ext_css_set = {
| 751 | + .cset = {
| 752 | + .refcount = REFCOUNT_INIT(1),
| 753 | + .dom_cset = &init_css_set,
| 754 | + .tasks = LIST_HEAD_INIT(init_css_set.tasks),
| 755 | + .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
| 756 | + .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
| 757 | + .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
| 758 | + .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
| 759 | + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
| 760 | + .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
| 761 | + .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
| 762 | + /*
| 763 | + * The following field is re-initialized when this cset gets linked
| 764 | + * in cgroup_init(). However, let's initialize the field
| 765 | + * statically too so that the default cgroup can be accessed safely
| 766 | + * early during boot.
| 767 | + */
| 768 | + .dfl_cgrp = &cgrp_dfl_root.cgrp,
| 769 | + },
| 770 | + .mg_src_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_src_preload_node),
| 771 | + .mg_dst_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_dst_preload_node),
740 | 772 | };
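The vendor extension wraps struct css_set rather than modifying it, so any code holding a plain css_set pointer can recover the wrapper with container_of(). A minimal sketch of the pattern, assuming the field layout used in this patch:

```c
/* Assumed wrapper layout introduced by this patch (see the diff above). */
struct ext_css_set {
	struct css_set cset;			/* embedded core css_set */
	struct list_head mg_src_preload_node;
	struct list_head mg_dst_preload_node;
};

/* Recover the wrapper from a plain css_set pointer. */
static inline struct ext_css_set *to_ext_cset(struct css_set *cset)
{
	return container_of(cset, struct ext_css_set, cset);
}
```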
741 | 773 |
742 | 774 | static int css_set_count = 1; /* 1 for init_css_set */
.. | ..
802 | 834 | break;
803 | 835 |
804 | 836 | cgroup1_check_for_release(cgrp);
| 837 | + TRACE_CGROUP_PATH(notify_populated, cgrp,
| 838 | + cgroup_is_populated(cgrp));
805 | 839 | cgroup_file_notify(&cgrp->events_file);
806 | 840 |
807 | 841 | child = cgrp;
.. | ..
881 | 915 | /*
882 | 916 | * We are synchronized through cgroup_threadgroup_rwsem
883 | 917 | * against PF_EXITING setting such that we can't race
884 | | - * against cgroup_exit() changing the css_set to
885 | | - * init_css_set and dropping the old one.
| 918 | + * against cgroup_exit()/cgroup_free() dropping the css_set.
886 | 919 | */
887 | 920 | WARN_ON_ONCE(task->flags & PF_EXITING);
888 | 921 |
.. | ..
1060 | 1093 | * @ss is in this hierarchy, so we want the
1061 | 1094 | * effective css from @cgrp.
1062 | 1095 | */
1063 | | - template[i] = cgroup_e_css(cgrp, ss);
| 1096 | + template[i] = cgroup_e_css_by_mask(cgrp, ss);
1064 | 1097 | } else {
1065 | 1098 | /*
1066 | 1099 | * @ss is not in this hierarchy, so we don't want
.. | ..
1162 | 1195 | struct cgroup *cgrp)
1163 | 1196 | {
1164 | 1197 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
| 1198 | + struct ext_css_set *ext_cset;
1165 | 1199 | struct css_set *cset;
1166 | 1200 | struct list_head tmp_links;
1167 | 1201 | struct cgrp_cset_link *link;
.. | ..
1182 | 1216 | if (cset)
1183 | 1217 | return cset;
1184 | 1218 |
1185 | | - cset = kzalloc(sizeof(*cset), GFP_KERNEL);
1186 | | - if (!cset)
| 1219 | + ext_cset = kzalloc(sizeof(*ext_cset), GFP_KERNEL);
| 1220 | + if (!ext_cset)
1187 | 1221 | return NULL;
| 1222 | + cset = &ext_cset->cset;
1188 | 1223 |
1189 | 1224 | /* Allocate all the cgrp_cset_link objects that we'll need */
1190 | 1225 | if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
.. | ..
1202 | 1237 | INIT_HLIST_NODE(&cset->hlist);
1203 | 1238 | INIT_LIST_HEAD(&cset->cgrp_links);
1204 | 1239 | INIT_LIST_HEAD(&cset->mg_preload_node);
| 1240 | + INIT_LIST_HEAD(&ext_cset->mg_src_preload_node);
| 1241 | + INIT_LIST_HEAD(&ext_cset->mg_dst_preload_node);
1205 | 1242 | INIT_LIST_HEAD(&cset->mg_node);
1206 | 1243 |
1207 | 1244 | /* Copy the set of subsystem state objects generated in
.. | ..
1291 | 1328 |
1292 | 1329 | void cgroup_free_root(struct cgroup_root *root)
1293 | 1330 | {
1294 | | - if (root) {
1295 | | - idr_destroy(&root->cgroup_idr);
1296 | | - kfree(root);
1297 | | - }
| 1331 | + kfree(root);
1298 | 1332 | }
1299 | 1333 |
1300 | 1334 | static void cgroup_destroy_root(struct cgroup_root *root)
.. | ..
1356 | 1390 | cset = current->nsproxy->cgroup_ns->root_cset;
1357 | 1391 | if (cset == &init_css_set) {
1358 | 1392 | res = &root->cgrp;
| 1393 | + } else if (root == &cgrp_dfl_root) {
| 1394 | + res = cset->dfl_cgrp;
1359 | 1395 | } else {
1360 | 1396 | struct cgrp_cset_link *link;
1361 | 1397 |
.. | ..
1412 | 1448 | struct cgroup_root *root)
1413 | 1449 | {
1414 | 1450 | /*
1415 | | - * No need to lock the task - since we hold cgroup_mutex the
1416 | | - * task can't change groups, so the only thing that can happen
1417 | | - * is that it exits and its css is set back to init_css_set.
| 1451 | + * No need to lock the task - since we hold css_set_lock the
| 1452 | + * task can't change groups.
1418 | 1453 | */
1419 | 1454 | return cset_cgroup_from_root(task_css_set(task), root);
1420 | 1455 | }
.. | ..
1453 | 1488 | struct cgroup_subsys *ss = cft->ss;
1454 | 1489 |
1455 | 1490 | if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1456 | | - !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
1457 | | - snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
1458 | | - cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
| 1491 | + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
| 1492 | + const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
| 1493 | +
| 1494 | + snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
| 1495 | + dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1459 | 1496 | cft->name);
1460 | | - else
| 1497 | + } else {
1461 | 1498 | strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
| 1499 | + }
1462 | 1500 | return buf;
1463 | 1501 | }
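For illustration, the new format string yields names like "cpu.stat" normally and ".__DEBUG__.cpu.stat" for CFTYPE_DEBUG files (the subsystem and file names here are examples, not taken from this patch). A small userspace re-creation of the construction:

```c
#include <stdio.h>

int main(void)
{
	char buf[64];
	const char *dbg = ".__DEBUG__.";	/* as if CFTYPE_DEBUG were set */

	snprintf(buf, sizeof(buf), "%s%s.%s", dbg, "cpu", "stat");
	printf("%s\n", buf);	/* prints ".__DEBUG__.cpu.stat" */
	return 0;
}
```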
1464 | 1502 |
.. | ..
1699 | 1737 | {
1700 | 1738 | struct cgroup *dcgrp = &dst_root->cgrp;
1701 | 1739 | struct cgroup_subsys *ss;
1702 | | - int ssid, i, ret;
| 1740 | + int ssid, ret;
1703 | 1741 | u16 dfl_disable_ss_mask = 0;
1704 | 1742 |
1705 | 1743 | lockdep_assert_held(&cgroup_mutex);
.. | ..
1743 | 1781 | struct cgroup_root *src_root = ss->root;
1744 | 1782 | struct cgroup *scgrp = &src_root->cgrp;
1745 | 1783 | struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
1746 | | - struct css_set *cset;
| 1784 | + struct css_set *cset, *cset_pos;
| 1785 | + struct css_task_iter *it;
1747 | 1786 |
1748 | 1787 | WARN_ON(!css || cgroup_css(dcgrp, ss));
1749 | 1788 |
.. | ..
1761 | 1800 | css->cgroup = dcgrp;
1762 | 1801 |
1763 | 1802 | spin_lock_irq(&css_set_lock);
1764 | | - hash_for_each(css_set_table, i, cset, hlist)
| 1803 | + WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
| 1804 | + list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
| 1805 | + e_cset_node[ss->id]) {
1765 | 1806 | list_move_tail(&cset->e_cset_node[ss->id],
1766 | 1807 | &dcgrp->e_csets[ss->id]);
| 1808 | + /*
| 1809 | + * All css_sets of scgrp move together, in the same order, to
| 1810 | + * dcgrp; patch in-flight iterators to preserve correct iteration.
| 1811 | + * Since an iterator is always advanced right away and finishes
| 1812 | + * when it->cset_pos meets it->cset_head, updating it->cset_head
| 1813 | + * is enough here.
| 1814 | + */
| 1815 | + list_for_each_entry(it, &cset->task_iters, iters_node)
| 1816 | + if (it->cset_head == &scgrp->e_csets[ss->id])
| 1817 | + it->cset_head = &dcgrp->e_csets[ss->id];
| 1818 | + }
1767 | 1819 | spin_unlock_irq(&css_set_lock);
1768 | 1820 |
1769 | 1821 | /* default hierarchy doesn't enable controllers by default */
.. | ..
1815 | 1867 | return len;
1816 | 1868 | }
1817 | 1869 |
1818 | | -static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
| 1870 | +enum cgroup2_param {
| 1871 | + Opt_nsdelegate,
| 1872 | + Opt_memory_localevents,
| 1873 | + Opt_memory_recursiveprot,
| 1874 | + nr__cgroup2_params
| 1875 | +};
| 1876 | +
| 1877 | +static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
| 1878 | + fsparam_flag("nsdelegate", Opt_nsdelegate),
| 1879 | + fsparam_flag("memory_localevents", Opt_memory_localevents),
| 1880 | + fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
| 1881 | + {}
| 1882 | +};
| 1883 | +
| 1884 | +static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
1819 | 1885 | {
1820 | | - char *token;
| 1886 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
| 1887 | + struct fs_parse_result result;
| 1888 | + int opt;
1821 | 1889 |
1822 | | - *root_flags = 0;
| 1890 | + opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
| 1891 | + if (opt < 0)
| 1892 | + return opt;
1823 | 1893 |
1824 | | - if (!data || *data == '\0')
| 1894 | + switch (opt) {
| 1895 | + case Opt_nsdelegate:
| 1896 | + ctx->flags |= CGRP_ROOT_NS_DELEGATE;
1825 | 1897 | return 0;
1826 | | -
1827 | | - while ((token = strsep(&data, ",")) != NULL) {
1828 | | - if (!strcmp(token, "nsdelegate")) {
1829 | | - *root_flags |= CGRP_ROOT_NS_DELEGATE;
1830 | | - continue;
1831 | | - }
1832 | | -
1833 | | - pr_err("cgroup2: unknown option \"%s\"\n", token);
1834 | | - return -EINVAL;
| 1898 | + case Opt_memory_localevents:
| 1899 | + ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
| 1900 | + return 0;
| 1901 | + case Opt_memory_recursiveprot:
| 1902 | + ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
| 1903 | + return 0;
1835 | 1904 | }
1836 | | -
1837 | | - return 0;
| 1905 | + return -EINVAL;
1838 | 1906 | }
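With fs_parse(), each mount option arrives as an individual fs_parameter through the new mount API rather than as one comma-separated string. A minimal userspace sketch of feeding these flags to the parser via fsopen()/fsconfig() (error handling omitted; assumes a kernel and headers that expose the new mount-API syscalls):

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/mount.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int fsfd, mntfd;

	fsfd = syscall(SYS_fsopen, "cgroup2", 0);
	/* Each flag below reaches cgroup2_parse_param() as one fs_parameter. */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "nsdelegate", NULL, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "memory_recursiveprot", NULL, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	mntfd = syscall(SYS_fsmount, fsfd, 0, 0);
	syscall(SYS_move_mount, mntfd, "", AT_FDCWD, "/sys/fs/cgroup",
		MOVE_MOUNT_F_EMPTY_PATH);
	return 0;
}
```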
1839 | 1907 |
1840 | 1908 | static void apply_cgroup_root_flags(unsigned int root_flags)
.. | ..
1844 | 1912 | cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1845 | 1913 | else
1846 | 1914 | cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
| 1915 | +
| 1916 | + if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
| 1917 | + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
| 1918 | + else
| 1919 | + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
| 1920 | +
| 1921 | + if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
| 1922 | + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
| 1923 | + else
| 1924 | + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1847 | 1925 | }
1848 | 1926 | }
1849 | 1927 |
.. | ..
1851 | 1929 | {
1852 | 1930 | if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1853 | 1931 | seq_puts(seq, ",nsdelegate");
| 1932 | + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
| 1933 | + seq_puts(seq, ",memory_localevents");
| 1934 | + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
| 1935 | + seq_puts(seq, ",memory_recursiveprot");
1854 | 1936 | return 0;
1855 | 1937 | }
1856 | 1938 |
1857 | | -static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
| 1939 | +static int cgroup_reconfigure(struct fs_context *fc)
1858 | 1940 | {
1859 | | - unsigned int root_flags;
1860 | | - int ret;
| 1941 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1861 | 1942 |
1862 | | - ret = parse_cgroup_root_flags(data, &root_flags);
1863 | | - if (ret)
1864 | | - return ret;
1865 | | -
1866 | | - apply_cgroup_root_flags(root_flags);
| 1943 | + apply_cgroup_root_flags(ctx->flags);
1867 | 1944 | return 0;
1868 | | -}
1869 | | -
1870 | | -/*
1871 | | - * To reduce the fork() overhead for systems that are not actually using
1872 | | - * their cgroups capability, we don't maintain the lists running through
1873 | | - * each css_set to its tasks until we see the list actually used - in other
1874 | | - * words after the first mount.
1875 | | - */
1876 | | -static bool use_task_css_set_links __read_mostly;
1877 | | -
1878 | | -static void cgroup_enable_task_cg_lists(void)
1879 | | -{
1880 | | - struct task_struct *p, *g;
1881 | | -
1882 | | - /*
1883 | | - * We need tasklist_lock because RCU is not safe against
1884 | | - * while_each_thread(). Besides, a forking task that has passed
1885 | | - * cgroup_post_fork() without seeing use_task_css_set_links = 1
1886 | | - * is not guaranteed to have its child immediately visible in the
1887 | | - * tasklist if we walk through it with RCU.
1888 | | - */
1889 | | - read_lock(&tasklist_lock);
1890 | | - spin_lock_irq(&css_set_lock);
1891 | | -
1892 | | - if (use_task_css_set_links)
1893 | | - goto out_unlock;
1894 | | -
1895 | | - use_task_css_set_links = true;
1896 | | -
1897 | | - do_each_thread(g, p) {
1898 | | - WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1899 | | - task_css_set(p) != &init_css_set);
1900 | | -
1901 | | - /*
1902 | | - * We should check if the process is exiting, otherwise
1903 | | - * it will race with cgroup_exit() in that the list
1904 | | - * entry won't be deleted though the process has exited.
1905 | | - * Do it while holding siglock so that we don't end up
1906 | | - * racing against cgroup_exit().
1907 | | - *
1908 | | - * Interrupts were already disabled while acquiring
1909 | | - * the css_set_lock, so we do not need to disable it
1910 | | - * again when acquiring the sighand->siglock here.
1911 | | - */
1912 | | - spin_lock(&p->sighand->siglock);
1913 | | - if (!(p->flags & PF_EXITING)) {
1914 | | - struct css_set *cset = task_css_set(p);
1915 | | -
1916 | | - if (!css_set_populated(cset))
1917 | | - css_set_update_populated(cset, true);
1918 | | - list_add_tail(&p->cg_list, &cset->tasks);
1919 | | - get_css_set(cset);
1920 | | - cset->nr_tasks++;
1921 | | - }
1922 | | - spin_unlock(&p->sighand->siglock);
1923 | | - } while_each_thread(g, p);
1924 | | -out_unlock:
1925 | | - spin_unlock_irq(&css_set_lock);
1926 | | - read_unlock(&tasklist_lock);
1927 | 1945 | }
1928 | 1946 |
1929 | 1947 | static void init_cgroup_housekeeping(struct cgroup *cgrp)
.. | ..
1951 | 1969 | INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
1952 | 1970 | }
1953 | 1971 |
1954 | | -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
| 1972 | +void init_cgroup_root(struct cgroup_fs_context *ctx)
1955 | 1973 | {
| 1974 | + struct cgroup_root *root = ctx->root;
1956 | 1975 | struct cgroup *cgrp = &root->cgrp;
1957 | 1976 |
1958 | 1977 | INIT_LIST_HEAD(&root->root_list);
1959 | 1978 | atomic_set(&root->nr_cgrps, 1);
1960 | 1979 | cgrp->root = root;
1961 | 1980 | init_cgroup_housekeeping(cgrp);
1962 | | - idr_init(&root->cgroup_idr);
1963 | 1981 |
1964 | | - root->flags = opts->flags;
1965 | | - if (opts->release_agent)
1966 | | - strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
1967 | | - if (opts->name)
1968 | | - strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
1969 | | - if (opts->cpuset_clone_children)
| 1982 | + root->flags = ctx->flags;
| 1983 | + if (ctx->release_agent)
| 1984 | + strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
| 1985 | + if (ctx->name)
| 1986 | + strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
| 1987 | + if (ctx->cpuset_clone_children)
1970 | 1988 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1971 | 1989 | }
1972 | 1990 |
.. | ..
1979 | 1997 | int i, ret;
1980 | 1998 |
1981 | 1999 | lockdep_assert_held(&cgroup_mutex);
1982 | | -
1983 | | - ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
1984 | | - if (ret < 0)
1985 | | - goto out;
1986 | | - root_cgrp->id = ret;
1987 | | - root_cgrp->ancestor_ids[0] = ret;
1988 | 2000 |
1989 | 2001 | ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
1990 | 2002 | 0, GFP_KERNEL);
.. | ..
2011 | 2023 |
2012 | 2024 | root->kf_root = kernfs_create_root(kf_sops,
2013 | 2025 | KERNFS_ROOT_CREATE_DEACTIVATED |
2014 | | - KERNFS_ROOT_SUPPORT_EXPORTOP,
| 2026 | + KERNFS_ROOT_SUPPORT_EXPORTOP |
| 2027 | + KERNFS_ROOT_SUPPORT_USER_XATTR,
2015 | 2028 | root_cgrp);
2016 | 2029 | if (IS_ERR(root->kf_root)) {
2017 | 2030 | ret = PTR_ERR(root->kf_root);
2018 | 2031 | goto exit_root_id;
2019 | 2032 | }
2020 | 2033 | root_cgrp->kn = root->kf_root->kn;
| 2034 | + WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
| 2035 | + root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
2021 | 2036 |
2022 | 2037 | ret = css_populate_dir(&root_cgrp->self);
2023 | 2038 | if (ret)
.. | ..
2055 | 2070 | BUG_ON(!list_empty(&root_cgrp->self.children));
2056 | 2071 | BUG_ON(atomic_read(&root->nr_cgrps) != 1);
2057 | 2072 |
2058 | | - kernfs_activate(root_cgrp->kn);
2059 | 2073 | ret = 0;
2060 | 2074 | goto out;
2061 | 2075 |
.. | ..
2071 | 2085 | return ret;
2072 | 2086 | }
2073 | 2087 |
2074 | | -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
2075 | | - struct cgroup_root *root, unsigned long magic,
2076 | | - struct cgroup_namespace *ns)
| 2088 | +int cgroup_do_get_tree(struct fs_context *fc)
2077 | 2089 | {
2078 | | - struct dentry *dentry;
2079 | | - bool new_sb = false;
| 2090 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
| 2091 | + int ret;
2080 | 2092 |
2081 | | - dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
| 2093 | + ctx->kfc.root = ctx->root->kf_root;
| 2094 | + if (fc->fs_type == &cgroup2_fs_type)
| 2095 | + ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
| 2096 | + else
| 2097 | + ctx->kfc.magic = CGROUP_SUPER_MAGIC;
| 2098 | + ret = kernfs_get_tree(fc);
2082 | 2099 |
2083 | 2100 | /*
2084 | 2101 | * In non-init cgroup namespace, instead of root cgroup's dentry,
2085 | 2102 | * we return the dentry corresponding to the cgroupns->root_cgrp.
2086 | 2103 | */
2087 | | - if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
| 2104 | + if (!ret && ctx->ns != &init_cgroup_ns) {
2088 | 2105 | struct dentry *nsdentry;
2089 | | - struct super_block *sb = dentry->d_sb;
| 2106 | + struct super_block *sb = fc->root->d_sb;
2090 | 2107 | struct cgroup *cgrp;
2091 | 2108 |
2092 | 2109 | mutex_lock(&cgroup_mutex);
2093 | 2110 | spin_lock_irq(&css_set_lock);
2094 | 2111 |
2095 | | - cgrp = cset_cgroup_from_root(ns->root_cset, root);
| 2112 | + cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
2096 | 2113 |
2097 | 2114 | spin_unlock_irq(&css_set_lock);
2098 | 2115 | mutex_unlock(&cgroup_mutex);
2099 | 2116 |
2100 | 2117 | nsdentry = kernfs_node_dentry(cgrp->kn, sb);
2101 | | - dput(dentry);
2102 | | - if (IS_ERR(nsdentry))
| 2118 | + dput(fc->root);
| 2119 | + if (IS_ERR(nsdentry)) {
2103 | 2120 | deactivate_locked_super(sb);
2104 | | - dentry = nsdentry;
| 2121 | + ret = PTR_ERR(nsdentry);
| 2122 | + nsdentry = NULL;
| 2123 | + }
| 2124 | + fc->root = nsdentry;
2105 | 2125 | }
2106 | 2126 |
2107 | | - if (!new_sb)
2108 | | - cgroup_put(&root->cgrp);
| 2127 | + if (!ctx->kfc.new_sb_created)
| 2128 | + cgroup_put(&ctx->root->cgrp);
2109 | 2129 |
2110 | | - return dentry;
| 2130 | + return ret;
2111 | 2131 | }
2112 | 2132 |
2113 | | -static struct dentry *cgroup_mount(struct file_system_type *fs_type,
2114 | | - int flags, const char *unused_dev_name,
2115 | | - void *data)
| 2133 | +/*
| 2134 | + * Destroy a cgroup filesystem context.
| 2135 | + */
| 2136 | +static void cgroup_fs_context_free(struct fs_context *fc)
2116 | 2137 | {
2117 | | - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2118 | | - struct dentry *dentry;
| 2138 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
| 2139 | +
| 2140 | + kfree(ctx->name);
| 2141 | + kfree(ctx->release_agent);
| 2142 | + put_cgroup_ns(ctx->ns);
| 2143 | + kernfs_free_fs_context(fc);
| 2144 | + kfree(ctx);
| 2145 | +}
| 2146 | +
| 2147 | +static int cgroup_get_tree(struct fs_context *fc)
| 2148 | +{
| 2149 | + struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2119 | 2150 | int ret;
2120 | 2151 |
2121 | | - get_cgroup_ns(ns);
| 2152 | + cgrp_dfl_visible = true;
| 2153 | + cgroup_get_live(&cgrp_dfl_root.cgrp);
| 2154 | + ctx->root = &cgrp_dfl_root;
2122 | 2155 |
2123 | | - /* Check if the caller has permission to mount. */
2124 | | - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
2125 | | - put_cgroup_ns(ns);
2126 | | - return ERR_PTR(-EPERM);
2127 | | - }
| 2156 | + ret = cgroup_do_get_tree(fc);
| 2157 | + if (!ret)
| 2158 | + apply_cgroup_root_flags(ctx->flags);
| 2159 | + return ret;
| 2160 | +}
2128 | 2161 |
2129 | | - /*
2130 | | - * The first time anyone tries to mount a cgroup, enable the list
2131 | | - * linking each css_set to its tasks and fix up all existing tasks.
2132 | | - */
2133 | | - if (!use_task_css_set_links)
2134 | | - cgroup_enable_task_cg_lists();
| 2162 | +static const struct fs_context_operations cgroup_fs_context_ops = {
| 2163 | + .free = cgroup_fs_context_free,
| 2164 | + .parse_param = cgroup2_parse_param,
| 2165 | + .get_tree = cgroup_get_tree,
| 2166 | + .reconfigure = cgroup_reconfigure,
| 2167 | +};
2135 | 2168 |
2136 | | - if (fs_type == &cgroup2_fs_type) {
2137 | | - unsigned int root_flags;
| 2169 | +static const struct fs_context_operations cgroup1_fs_context_ops = {
| 2170 | + .free = cgroup_fs_context_free,
| 2171 | + .parse_param = cgroup1_parse_param,
| 2172 | + .get_tree = cgroup1_get_tree,
| 2173 | + .reconfigure = cgroup1_reconfigure,
| 2174 | +};
2138 | 2175 |
2139 | | - ret = parse_cgroup_root_flags(data, &root_flags);
2140 | | - if (ret) {
2141 | | - put_cgroup_ns(ns);
2142 | | - return ERR_PTR(ret);
2143 | | - }
| 2176 | +/*
| 2177 | + * Initialise the cgroup filesystem creation/reconfiguration context. Notably,
| 2178 | + * we select the namespace we're going to use.
| 2179 | + */
| 2180 | +static int cgroup_init_fs_context(struct fs_context *fc)
| 2181 | +{
| 2182 | + struct cgroup_fs_context *ctx;
2144 | 2183 |
2145 | | - cgrp_dfl_visible = true;
2146 | | - cgroup_get_live(&cgrp_dfl_root.cgrp);
| 2184 | + ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
| 2185 | + if (!ctx)
| 2186 | + return -ENOMEM;
2147 | 2187 |
2148 | | - dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
2149 | | - CGROUP2_SUPER_MAGIC, ns);
2150 | | - if (!IS_ERR(dentry))
2151 | | - apply_cgroup_root_flags(root_flags);
2152 | | - } else {
2153 | | - dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
2154 | | - CGROUP_SUPER_MAGIC, ns);
2155 | | - }
2156 | | -
2157 | | - put_cgroup_ns(ns);
2158 | | - return dentry;
| 2188 | + ctx->ns = current->nsproxy->cgroup_ns;
| 2189 | + get_cgroup_ns(ctx->ns);
| 2190 | + fc->fs_private = &ctx->kfc;
| 2191 | + if (fc->fs_type == &cgroup2_fs_type)
| 2192 | + fc->ops = &cgroup_fs_context_ops;
| 2193 | + else
| 2194 | + fc->ops = &cgroup1_fs_context_ops;
| 2195 | + put_user_ns(fc->user_ns);
| 2196 | + fc->user_ns = get_user_ns(ctx->ns->user_ns);
| 2197 | + fc->global = true;
| 2198 | + return 0;
2159 | 2199 | }
2160 | 2200 |
2161 | 2201 | static void cgroup_kill_sb(struct super_block *sb)
.. | ..
2171 | 2211 | * And don't kill the default root.
2172 | 2212 | */
2173 | 2213 | if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
2174 | | - !percpu_ref_is_dying(&root->cgrp.self.refcnt))
| 2214 | + !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
| 2215 | + cgroup_bpf_offline(&root->cgrp);
2175 | 2216 | percpu_ref_kill(&root->cgrp.self.refcnt);
| 2217 | + }
2176 | 2218 | cgroup_put(&root->cgrp);
2177 | 2219 | kernfs_kill_sb(sb);
2178 | 2220 | }
2179 | 2221 |
2180 | 2222 | struct file_system_type cgroup_fs_type = {
2181 | | - .name = "cgroup",
2182 | | - .mount = cgroup_mount,
2183 | | - .kill_sb = cgroup_kill_sb,
2184 | | - .fs_flags = FS_USERNS_MOUNT,
| 2223 | + .name = "cgroup",
| 2224 | + .init_fs_context = cgroup_init_fs_context,
| 2225 | + .parameters = cgroup1_fs_parameters,
| 2226 | + .kill_sb = cgroup_kill_sb,
| 2227 | + .fs_flags = FS_USERNS_MOUNT,
2185 | 2228 | };
2186 | 2229 |
2187 | 2230 | static struct file_system_type cgroup2_fs_type = {
2188 | | - .name = "cgroup2",
2189 | | - .mount = cgroup_mount,
2190 | | - .kill_sb = cgroup_kill_sb,
2191 | | - .fs_flags = FS_USERNS_MOUNT,
| 2231 | + .name = "cgroup2",
| 2232 | + .init_fs_context = cgroup_init_fs_context,
| 2233 | + .parameters = cgroup2_fs_parameters,
| 2234 | + .kill_sb = cgroup_kill_sb,
| 2235 | + .fs_flags = FS_USERNS_MOUNT,
2192 | 2236 | };
| 2237 | +
| 2238 | +#ifdef CONFIG_CPUSETS
| 2239 | +static const struct fs_context_operations cpuset_fs_context_ops = {
| 2240 | + .get_tree = cgroup1_get_tree,
| 2241 | + .free = cgroup_fs_context_free,
| 2242 | +};
| 2243 | +
| 2244 | +/*
| 2245 | + * This is ugly, but preserves the userspace API for existing cpuset
| 2246 | + * users. If someone tries to mount the "cpuset" filesystem, we
| 2247 | + * silently switch it to mount "cgroup" instead.
| 2248 | + */
| 2249 | +static int cpuset_init_fs_context(struct fs_context *fc)
| 2250 | +{
| 2251 | + char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
| 2252 | + struct cgroup_fs_context *ctx;
| 2253 | + int err;
| 2254 | +
| 2255 | + err = cgroup_init_fs_context(fc);
| 2256 | + if (err) {
| 2257 | + kfree(agent);
| 2258 | + return err;
| 2259 | + }
| 2260 | +
| 2261 | + fc->ops = &cpuset_fs_context_ops;
| 2262 | +
| 2263 | + ctx = cgroup_fc2context(fc);
| 2264 | + ctx->subsys_mask = 1 << cpuset_cgrp_id;
| 2265 | + ctx->flags |= CGRP_ROOT_NOPREFIX;
| 2266 | + ctx->release_agent = agent;
| 2267 | +
| 2268 | + get_filesystem(&cgroup_fs_type);
| 2269 | + put_filesystem(fc->fs_type);
| 2270 | + fc->fs_type = &cgroup_fs_type;
| 2271 | +
| 2272 | + return 0;
| 2273 | +}
| 2274 | +
| 2275 | +static struct file_system_type cpuset_fs_type = {
| 2276 | + .name = "cpuset",
| 2277 | + .init_fs_context = cpuset_init_fs_context,
| 2278 | + .fs_flags = FS_USERNS_MOUNT,
| 2279 | +};
| 2280 | +#endif
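In effect, a legacy `mount -t cpuset` request is rewritten into a cgroup v1 mount of the cpuset controller with no name prefix and the classic release agent. Expressed via mount(2), the equivalent is roughly the following (the mount point is illustrative):

```c
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* What the "cpuset" filesystem alias resolves to under the hood. */
	if (mount("none", "/dev/cpuset", "cgroup", 0,
		  "cpuset,noprefix,release_agent=/sbin/cpuset_release_agent"))
		perror("mount");
	return 0;
}
```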
2193 | 2281 |
2194 | 2282 | int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
2195 | 2283 | struct cgroup_namespace *ns)
.. | ..
2256 | 2344 | EXPORT_SYMBOL_GPL(task_cgroup_path);
2257 | 2345 |
2258 | 2346 | /**
| 2347 | + * cgroup_attach_lock - Lock for ->attach()
| 2348 | + * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
| 2349 | + *
| 2350 | + * cgroup migration sometimes needs to stabilize threadgroups against forks and
| 2351 | + * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
| 2352 | + * implementations (e.g. cpuset) also need to disable CPU hotplug.
| 2353 | + * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
| 2354 | + * lead to deadlocks.
| 2355 | + *
| 2356 | + * Bringing up a CPU may involve creating and destroying tasks which requires
| 2357 | + * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
| 2358 | + * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
| 2359 | + * write-locking threadgroup_rwsem, the locking order is reversed and we end up
| 2360 | + * waiting for an on-going CPU hotplug operation which in turn is waiting for
| 2361 | + * the threadgroup_rwsem to be released to create new tasks. For more details:
| 2362 | + *
| 2363 | + * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
| 2364 | + *
| 2365 | + * Resolve the situation by always acquiring cpus_read_lock() before optionally
| 2366 | + * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
| 2367 | + * CPU hotplug is disabled on entry.
| 2368 | + */
| 2369 | +static void cgroup_attach_lock(bool lock_threadgroup)
| 2370 | +{
| 2371 | + cpus_read_lock();
| 2372 | + if (lock_threadgroup)
| 2373 | + percpu_down_write(&cgroup_threadgroup_rwsem);
| 2374 | +}
| 2375 | +
| 2376 | +/**
| 2377 | + * cgroup_attach_unlock - Undo cgroup_attach_lock()
| 2378 | + * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
| 2379 | + */
| 2380 | +static void cgroup_attach_unlock(bool lock_threadgroup)
| 2381 | +{
| 2382 | + if (lock_threadgroup)
| 2383 | + percpu_up_write(&cgroup_threadgroup_rwsem);
| 2384 | + cpus_read_unlock();
| 2385 | +}
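The resulting lock order is cpus_read_lock() first, cgroup_threadgroup_rwsem second, and ->attach() may now assume hotplug is excluded. A hedged sketch of how a migration path would bracket its work with the pair (the callee and wrapper are illustrative):

```c
/* Illustrative only: bracket a migration with the helpers above. */
static int migrate_locked(struct cgroup *dst, struct task_struct *leader,
			  bool threadgroup)
{
	int ret;

	cgroup_attach_lock(true);	/* cpus_read_lock(), then rwsem */
	ret = cgroup_attach_task(dst, leader, threadgroup);
	cgroup_attach_unlock(true);	/* release in reverse order */
	return ret;
}
```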
| 2386 | +
| 2387 | +/**
2259 | 2388 | * cgroup_migrate_add_task - add a migration target task to a migration context
2260 | 2389 | * @task: target task
2261 | 2390 | * @mgctx: target migration context
.. | ..
2276 | 2405 | if (task->flags & PF_EXITING)
2277 | 2406 | return;
2278 | 2407 |
2279 | | - /* leave @task alone if post_fork() hasn't linked it yet */
2280 | | - if (list_empty(&task->cg_list))
2281 | | - return;
| 2408 | + /* cgroup_threadgroup_rwsem protects racing against forks */
| 2409 | + WARN_ON_ONCE(list_empty(&task->cg_list));
2282 | 2410 |
2283 | 2411 | cset = task_css_set(task);
2284 | 2412 | if (!cset->mg_src_cgrp)
.. | ..
2310 | 2438 |
2311 | 2439 | return cgroup_taskset_next(tset, dst_cssp);
2312 | 2440 | }
| 2441 | +EXPORT_SYMBOL_GPL(cgroup_taskset_first);
2313 | 2442 |
2314 | 2443 | /**
2315 | 2444 | * cgroup_taskset_next - iterate to the next task in taskset
.. | ..
2356 | 2485 |
2357 | 2486 | return NULL;
2358 | 2487 | }
| 2488 | +EXPORT_SYMBOL_GPL(cgroup_taskset_next);
2359 | 2489 |
2360 | 2490 | /**
2361 | 2491 | * cgroup_taskset_migrate - migrate a taskset
.. | ..
2426 | 2556 | do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2427 | 2557 | if (ss->attach) {
2428 | 2558 | tset->ssid = ssid;
| 2559 | + trace_android_vh_cgroup_attach(ss, tset);
2429 | 2560 | ss->attach(tset);
2430 | 2561 | }
2431 | 2562 | } while_each_subsys_mask();
.. | ..
2510 | 2641 | */
2511 | 2642 | void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
2512 | 2643 | {
2513 | | - LIST_HEAD(preloaded);
2514 | | - struct css_set *cset, *tmp_cset;
| 2644 | + struct ext_css_set *cset, *tmp_cset;
2515 | 2645 |
2516 | 2646 | lockdep_assert_held(&cgroup_mutex);
2517 | 2647 |
2518 | 2648 | spin_lock_irq(&css_set_lock);
2519 | 2649 |
2520 | | - list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
2521 | | - list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
| 2650 | + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
| 2651 | + mg_src_preload_node) {
| 2652 | + cset->cset.mg_src_cgrp = NULL;
| 2653 | + cset->cset.mg_dst_cgrp = NULL;
| 2654 | + cset->cset.mg_dst_cset = NULL;
| 2655 | + list_del_init(&cset->mg_src_preload_node);
| 2656 | + put_css_set_locked(&cset->cset);
| 2657 | + }
2522 | 2658 |
2523 | | - list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
2524 | | - cset->mg_src_cgrp = NULL;
2525 | | - cset->mg_dst_cgrp = NULL;
2526 | | - cset->mg_dst_cset = NULL;
2527 | | - list_del_init(&cset->mg_preload_node);
2528 | | - put_css_set_locked(cset);
| 2659 | + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
| 2660 | + mg_dst_preload_node) {
| 2661 | + cset->cset.mg_src_cgrp = NULL;
| 2662 | + cset->cset.mg_dst_cgrp = NULL;
| 2663 | + cset->cset.mg_dst_cset = NULL;
| 2664 | + list_del_init(&cset->mg_dst_preload_node);
| 2665 | + put_css_set_locked(&cset->cset);
2529 | 2666 | }
2530 | 2667 |
2531 | 2668 | spin_unlock_irq(&css_set_lock);
.. | ..
2552 | 2689 | struct cgroup_mgctx *mgctx)
2553 | 2690 | {
2554 | 2691 | struct cgroup *src_cgrp;
| 2692 | + struct ext_css_set *ext_src_cset;
2555 | 2693 |
2556 | 2694 | lockdep_assert_held(&cgroup_mutex);
2557 | 2695 | lockdep_assert_held(&css_set_lock);
.. | ..
2565 | 2703 | return;
2566 | 2704 |
2567 | 2705 | src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
| 2706 | + ext_src_cset = container_of(src_cset, struct ext_css_set, cset);
2568 | 2707 |
2569 | | - if (!list_empty(&src_cset->mg_preload_node))
| 2708 | + if (!list_empty(&ext_src_cset->mg_src_preload_node))
2570 | 2709 | return;
2571 | 2710 |
2572 | 2711 | WARN_ON(src_cset->mg_src_cgrp);
.. | ..
2577 | 2716 | src_cset->mg_src_cgrp = src_cgrp;
2578 | 2717 | src_cset->mg_dst_cgrp = dst_cgrp;
2579 | 2718 | get_css_set(src_cset);
2580 | | - list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
| 2719 | + list_add_tail(&ext_src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
2581 | 2720 | }
2582 | 2721 |
2583 | 2722 | /**
.. | ..
2596 | 2735 | */
2597 | 2736 | int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
2598 | 2737 | {
2599 | | - struct css_set *src_cset, *tmp_cset;
| 2738 | + struct ext_css_set *ext_src_set, *tmp_cset;
2600 | 2739 |
2601 | 2740 | lockdep_assert_held(&cgroup_mutex);
2602 | 2741 |
2603 | 2742 | /* look up the dst cset for each src cset and link it to src */
2604 | | - list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2605 | | - mg_preload_node) {
| 2743 | + list_for_each_entry_safe(ext_src_set, tmp_cset, &mgctx->preloaded_src_csets,
| 2744 | + mg_src_preload_node) {
| 2745 | + struct css_set *src_cset = &ext_src_set->cset;
2606 | 2746 | struct css_set *dst_cset;
| 2747 | + struct ext_css_set *ext_dst_cset;
2607 | 2748 | struct cgroup_subsys *ss;
2608 | 2749 | int ssid;
2609 | 2750 |
2610 | 2751 | dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
2611 | 2752 | if (!dst_cset)
2612 | 2753 | return -ENOMEM;
| 2754 | + ext_dst_cset = container_of(dst_cset, struct ext_css_set, cset);
2613 | 2755 |
2614 | 2756 | WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2615 | 2757 |
.. | ..
2621 | 2763 | if (src_cset == dst_cset) {
2622 | 2764 | src_cset->mg_src_cgrp = NULL;
2623 | 2765 | src_cset->mg_dst_cgrp = NULL;
2624 | | - list_del_init(&src_cset->mg_preload_node);
| 2766 | + list_del_init(&ext_src_set->mg_src_preload_node);
2625 | 2767 | put_css_set(src_cset);
2626 | 2768 | put_css_set(dst_cset);
2627 | 2769 | continue;
.. | ..
2629 | 2771 |
2630 | 2772 | src_cset->mg_dst_cset = dst_cset;
2631 | 2773 |
2632 | | - if (list_empty(&dst_cset->mg_preload_node))
2633 | | - list_add_tail(&dst_cset->mg_preload_node,
| 2774 | + if (list_empty(&ext_dst_cset->mg_dst_preload_node))
| 2775 | + list_add_tail(&ext_dst_cset->mg_dst_preload_node,
2634 | 2776 | &mgctx->preloaded_dst_csets);
2635 | 2777 | else
2636 | 2778 | put_css_set(dst_cset);
.. | ..
2698 | 2840 | {
2699 | 2841 | DEFINE_CGROUP_MGCTX(mgctx);
2700 | 2842 | struct task_struct *task;
2701 | | - int ret;
2702 | | -
2703 | | - ret = cgroup_migrate_vet_dst(dst_cgrp);
2704 | | - if (ret)
2705 | | - return ret;
| 2843 | + int ret = 0;
2706 | 2844 |
2707 | 2845 | /* look up all src csets */
2708 | 2846 | spin_lock_irq(&css_set_lock);
.. | ..
2729 | 2867 | return ret;
2730 | 2868 | }
2731 | 2869 |
2732 | | -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
2733 | | - __acquires(&cgroup_threadgroup_rwsem)
| 2870 | +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
| 2871 | + bool *threadgroup_locked,
| 2872 | + struct cgroup *dst_cgrp)
2734 | 2873 | {
2735 | 2874 | struct task_struct *tsk;
2736 | 2875 | pid_t pid;
| 2876 | + bool force_migration = false;
2737 | 2877 |
2738 | 2878 | if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2739 | 2879 | return ERR_PTR(-EINVAL);
2740 | 2880 |
2741 | | - percpu_down_write(&cgroup_threadgroup_rwsem);
| 2881 | + /*
| 2882 | + * If we migrate a single thread, we don't care about threadgroup
| 2883 | + * stability. If the thread is `current`, it won't exit(2) under our
| 2884 | + * hands or change PID through exec(2). We exclude
| 2885 | + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
| 2886 | + * callers by cgroup_mutex.
| 2887 | + * Therefore, we can skip the global lock.
| 2888 | + */
| 2889 | + lockdep_assert_held(&cgroup_mutex);
| 2890 | + *threadgroup_locked = pid || threadgroup;
| 2891 | + cgroup_attach_lock(*threadgroup_locked);
2742 | 2892 |
2743 | 2893 | rcu_read_lock();
2744 | 2894 | if (pid) {
.. | ..
2754 | 2904 | if (threadgroup)
2755 | 2905 | tsk = tsk->group_leader;
2756 | 2906 |
| 2907 | + if (tsk->flags & PF_KTHREAD)
| 2908 | + trace_android_rvh_cgroup_force_kthread_migration(tsk, dst_cgrp, &force_migration);
| 2909 | +
2757 | 2910 | /*
2758 | 2911 | * kthreads may acquire PF_NO_SETAFFINITY during initialization.
2759 | 2912 | * If userland migrates such a kthread to a non-root cgroup, it can
2760 | 2913 | * become trapped in a cpuset, or RT kthread may be born in a
2761 | 2914 | * cgroup with no rt_runtime allocated. Just say no.
2762 | 2915 | */
2763 | | - if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
| 2916 | + if (!force_migration && (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY))) {
2764 | 2917 | tsk = ERR_PTR(-EINVAL);
2765 | 2918 | goto out_unlock_threadgroup;
2766 | 2919 | }
.. | ..
2769 | 2922 | goto out_unlock_rcu;
2770 | 2923 |
2771 | 2924 | out_unlock_threadgroup:
2772 | | - percpu_up_write(&cgroup_threadgroup_rwsem);
| 2925 | + cgroup_attach_unlock(*threadgroup_locked);
| 2926 | + *threadgroup_locked = false;
2773 | 2927 | out_unlock_rcu:
2774 | 2928 | rcu_read_unlock();
2775 | 2929 | return tsk;
2776 | 2930 | }
2777 | 2931 |
2778 | | -void cgroup_procs_write_finish(struct task_struct *task)
2779 | | - __releases(&cgroup_threadgroup_rwsem)
| 2932 | +void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
2780 | 2933 | {
2781 | 2934 | struct cgroup_subsys *ss;
2782 | 2935 | int ssid;
.. | ..
2784 | 2937 | /* release reference from cgroup_procs_write_start() */
2785 | 2938 | put_task_struct(task);
2786 | 2939 |
2787 | | - percpu_up_write(&cgroup_threadgroup_rwsem);
| 2940 | + cgroup_attach_unlock(threadgroup_locked);
| 2941 | +
2788 | 2942 | for_each_subsys(ss, ssid)
2789 | 2943 | if (ss->post_attach)
2790 | 2944 | ss->post_attach();
.. | ..
2799 | 2953 | do_each_subsys_mask(ss, ssid, ss_mask) {
2800 | 2954 | if (printed)
2801 | 2955 | seq_putc(seq, ' ');
2802 | | - seq_printf(seq, "%s", ss->name);
| 2956 | + seq_puts(seq, ss->name);
2803 | 2957 | printed = true;
2804 | 2958 | } while_each_subsys_mask();
2805 | 2959 | if (printed)
.. | ..
2838 | 2992 | DEFINE_CGROUP_MGCTX(mgctx);
2839 | 2993 | struct cgroup_subsys_state *d_css;
2840 | 2994 | struct cgroup *dsct;
2841 | | - struct css_set *src_cset;
| 2995 | + struct ext_css_set *ext_src_set;
| 2996 | + bool has_tasks;
2842 | 2997 | int ret;
2843 | 2998 |
2844 | 2999 | lockdep_assert_held(&cgroup_mutex);
2845 | | -
2846 | | - percpu_down_write(&cgroup_threadgroup_rwsem);
2847 | 3000 |
2848 | 3001 | /* look up all csses currently attached to @cgrp's subtree */
2849 | 3002 | spin_lock_irq(&css_set_lock);
.. | ..
2855 | 3008 | }
2856 | 3009 | spin_unlock_irq(&css_set_lock);
2857 | 3010 |
| 3011 | + /*
| 3012 | + * We need to write-lock threadgroup_rwsem while migrating tasks.
| 3013 | + * However, if there are no source csets for @cgrp, changing its
| 3014 | + * controllers won't produce any task migrations and the
| 3015 | + * write-locking can be skipped safely.
| 3016 | + */
| 3017 | + has_tasks = !list_empty(&mgctx.preloaded_src_csets);
| 3018 | + cgroup_attach_lock(has_tasks);
| 3019 | +
2858 | 3020 | /* NULL dst indicates self on default hierarchy */
2859 | 3021 | ret = cgroup_migrate_prepare_dst(&mgctx);
2860 | 3022 | if (ret)
2861 | 3023 | goto out_finish;
2862 | 3024 |
2863 | 3025 | spin_lock_irq(&css_set_lock);
2864 | | - list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
| 3026 | + list_for_each_entry(ext_src_set, &mgctx.preloaded_src_csets,
| 3027 | + mg_src_preload_node) {
2865 | 3028 | struct task_struct *task, *ntask;
2866 | 3029 |
2867 | 3030 | /* all tasks in src_csets need to be migrated */
2868 | | - list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
| 3031 | + list_for_each_entry_safe(task, ntask, &ext_src_set->cset.tasks, cg_list)
2869 | 3032 | cgroup_migrate_add_task(task, &mgctx);
2870 | 3033 | }
2871 | 3034 | spin_unlock_irq(&css_set_lock);
.. | ..
2873 | 3036 | ret = cgroup_migrate_execute(&mgctx);
2874 | 3037 | out_finish:
2875 | 3038 | cgroup_migrate_finish(&mgctx);
2876 | | - percpu_up_write(&cgroup_threadgroup_rwsem);
| 3039 | + cgroup_attach_unlock(has_tasks);
2877 | 3040 | return ret;
2878 | 3041 | }
2879 | 3042 |
.. | ..
3106 | 3269 | return ret;
3107 | 3270 |
3108 | 3271 | /*
3109 | | - * At this point, cgroup_e_css() results reflect the new csses
| 3272 | + * At this point, cgroup_e_css_by_mask() results reflect the new csses
3110 | 3273 | * making the following cgroup_update_dfl_csses() properly update
3111 | 3274 | * css associations of all tasks in the subtree.
3112 | 3275 | */
.. | ..
3506 | 3669 | #ifdef CONFIG_PSI
3507 | 3670 | static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
3508 | 3671 | {
3509 | | - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
| 3672 | + struct cgroup *cgrp = seq_css(seq)->cgroup;
| 3673 | + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
| 3674 | +
| 3675 | + return psi_show(seq, psi, PSI_IO);
3510 | 3676 | }
3511 | 3677 | static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
3512 | 3678 | {
3513 | | - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
| 3679 | + struct cgroup *cgrp = seq_css(seq)->cgroup;
| 3680 | + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
| 3681 | +
| 3682 | + return psi_show(seq, psi, PSI_MEM);
3514 | 3683 | }
3515 | 3684 | static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
3516 | 3685 | {
3517 | | - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
| 3686 | + struct cgroup *cgrp = seq_css(seq)->cgroup;
| 3687 | + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
| 3688 | +
| 3689 | + return psi_show(seq, psi, PSI_CPU);
3518 | 3690 | }
3519 | 3691 |
3520 | 3692 | static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
3521 | 3693 | size_t nbytes, enum psi_res res)
3522 | 3694 | {
| 3695 | + struct cgroup_file_ctx *ctx = of->priv;
3523 | 3696 | struct psi_trigger *new;
3524 | 3697 | struct cgroup *cgrp;
| 3698 | + struct psi_group *psi;
3525 | 3699 |
3526 | 3700 | cgrp = cgroup_kn_lock_live(of->kn, false);
3527 | 3701 | if (!cgrp)
.. | ..
3530 | 3704 | cgroup_get(cgrp);
3531 | 3705 | cgroup_kn_unlock(of->kn);
3532 | 3706 |
3533 | | - new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
| 3707 | + /* Allow only one trigger per file descriptor */
| 3708 | + if (ctx->psi.trigger) {
| 3709 | + cgroup_put(cgrp);
| 3710 | + return -EBUSY;
| 3711 | + }
| 3712 | +
| 3713 | + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
| 3714 | + new = psi_trigger_create(psi, buf, nbytes, res);
3534 | 3715 | if (IS_ERR(new)) {
3535 | 3716 | cgroup_put(cgrp);
3536 | 3717 | return PTR_ERR(new);
3537 | 3718 | }
3538 | 3719 |
3539 | | - psi_trigger_replace(&of->priv, new);
3540 | | -
| 3720 | + smp_store_release(&ctx->psi.trigger, new);
3541 | 3721 | cgroup_put(cgrp);
3542 | 3722 |
3543 | 3723 | return nbytes;
.. | ..
3567 | 3747 | static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
3568 | 3748 | poll_table *pt)
3569 | 3749 | {
3570 | | - return psi_trigger_poll(&of->priv, of->file, pt);
| 3750 | + struct cgroup_file_ctx *ctx = of->priv;
| 3751 | + return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
3571 | 3752 | }
3572 | 3753 |
3573 | 3754 | static void cgroup_pressure_release(struct kernfs_open_file *of)
3574 | 3755 | {
3575 | | - psi_trigger_replace(&of->priv, NULL);
| 3756 | + struct cgroup_file_ctx *ctx = of->priv;
| 3757 | +
| 3758 | + psi_trigger_destroy(ctx->psi.trigger);
3576 | 3759 | }
---|
3627 | 3810 | struct cftype *cft = of->kn->priv; |
---|
| 3811 | + struct cgroup_file_ctx *ctx; |
---|
| 3812 | + int ret; |
---|
3628 | 3813 | |
---|
3629 | | - if (cft->open) |
---|
3630 | | - return cft->open(of); |
---|
3631 | | - return 0; |
---|
| 3814 | + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); |
---|
| 3815 | + if (!ctx) |
---|
| 3816 | + return -ENOMEM; |
---|
| 3817 | + |
---|
| 3818 | + ctx->ns = current->nsproxy->cgroup_ns; |
---|
| 3819 | + get_cgroup_ns(ctx->ns); |
---|
| 3820 | + of->priv = ctx; |
---|
| 3821 | + |
---|
| 3822 | + if (!cft->open) |
---|
| 3823 | + return 0; |
---|
| 3824 | + |
---|
| 3825 | + ret = cft->open(of); |
---|
| 3826 | + if (ret) { |
---|
| 3827 | + put_cgroup_ns(ctx->ns); |
---|
| 3828 | + kfree(ctx); |
---|
| 3829 | + } |
---|
| 3830 | + return ret; |
---|
3632 | 3831 | } |
---|
3633 | 3832 | |
---|
3634 | 3833 | static void cgroup_file_release(struct kernfs_open_file *of) |
---|
3635 | 3834 | { |
---|
3636 | 3835 | struct cftype *cft = of->kn->priv; |
---|
| 3836 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
3637 | 3837 | |
---|
3638 | 3838 | if (cft->release) |
---|
3639 | 3839 | cft->release(of); |
---|
| 3840 | + put_cgroup_ns(ctx->ns); |
---|
| 3841 | + kfree(ctx); |
---|
3640 | 3842 | } |
---|
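The per-open context allocated here is what the PSI and `cgroup.procs` handlers in this diff index into. Its layout, paraphrased from `kernel/cgroup/cgroup-internal.h` and trimmed to the fields this diff uses, is roughly:

```c
struct cgroup_file_ctx {
	struct cgroup_namespace	*ns;		/* opener's cgroup namespace */

	struct {
		void			*trigger;	/* PSI trigger, if any */
	} psi;

	struct {
		bool			started;	/* iterator initialized? */
		struct css_task_iter	iter;
	} procs;
};
```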
3641 | 3843 | |
---|
3642 | 3844 | static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, |
---|
3643 | 3845 | size_t nbytes, loff_t off) |
---|
3644 | 3846 | { |
---|
3645 | | - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; |
---|
| 3847 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
3646 | 3848 | struct cgroup *cgrp = of->kn->parent->priv; |
---|
3647 | 3849 | struct cftype *cft = of->kn->priv; |
---|
3648 | 3850 | struct cgroup_subsys_state *css; |
---|
3649 | 3851 | int ret; |
---|
| 3852 | + |
---|
| 3853 | + if (!nbytes) |
---|
| 3854 | + return 0; |
---|
3650 | 3855 | |
---|
3651 | 3856 | /* |
---|
3652 | 3857 | * If namespaces are delegation boundaries, disallow writes to |
---|
.. | .. |
---|
3656 | 3861 | */ |
---|
3657 | 3862 | if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && |
---|
3658 | 3863 | !(cft->flags & CFTYPE_NS_DELEGATABLE) && |
---|
3659 | | - ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) |
---|
| 3864 | + ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) |
---|
3660 | 3865 | return -EPERM; |
---|
3661 | 3866 | |
---|
3662 | 3867 | if (cft->write) |
---|
.. | .. |
---|
3843 | 4048 | continue; |
---|
3844 | 4049 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) |
---|
3845 | 4050 | continue; |
---|
3846 | | - |
---|
| 4051 | + if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) |
---|
| 4052 | + continue; |
---|
3847 | 4053 | if (is_add) { |
---|
3848 | 4054 | ret = cgroup_add_file(css, cgrp, cft); |
---|
3849 | 4055 | if (ret) { |
---|
.. | .. |
---|
4028 | 4234 | cft->flags |= __CFTYPE_ONLY_ON_DFL; |
---|
4029 | 4235 | return cgroup_add_cftypes(ss, cfts); |
---|
4030 | 4236 | } |
---|
| 4237 | +EXPORT_SYMBOL_GPL(cgroup_add_dfl_cftypes); |
---|
4031 | 4238 | |
---|
4032 | 4239 | /** |
---|
4033 | 4240 | * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies |
---|
.. | .. |
---|
4045 | 4252 | cft->flags |= __CFTYPE_NOT_ON_DFL; |
---|
4046 | 4253 | return cgroup_add_cftypes(ss, cfts); |
---|
4047 | 4254 | } |
---|
| 4255 | +EXPORT_SYMBOL_GPL(cgroup_add_legacy_cftypes); |
---|
4048 | 4256 | |
---|
4049 | 4257 | /** |
---|
4050 | 4258 | * cgroup_file_notify - generate a file modified event for a cgroup_file |
---|
.. | .. |
---|
4120 | 4328 | } else if (likely(!(pos->flags & CSS_RELEASED))) { |
---|
4121 | 4329 | next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); |
---|
4122 | 4330 | } else { |
---|
4123 | | - list_for_each_entry_rcu(next, &parent->children, sibling) |
---|
| 4331 | + list_for_each_entry_rcu(next, &parent->children, sibling, |
---|
| 4332 | + lockdep_is_held(&cgroup_mutex)) |
---|
4124 | 4333 | if (next->serial_nr > pos->serial_nr) |
---|
4125 | 4334 | break; |
---|
4126 | 4335 | } |
---|
.. | .. |
---|
4133 | 4342 | return next; |
---|
4134 | 4343 | return NULL; |
---|
4135 | 4344 | } |
---|
| 4345 | +EXPORT_SYMBOL_GPL(css_next_child); |
---|
4136 | 4346 | |
---|
4137 | 4347 | /** |
---|
4138 | 4348 | * css_next_descendant_pre - find the next descendant for pre-order walk |
---|
.. | .. |
---|
4182 | 4392 | |
---|
4183 | 4393 | return NULL; |
---|
4184 | 4394 | } |
---|
| 4395 | +EXPORT_SYMBOL_GPL(css_next_descendant_pre); |
---|
4185 | 4396 | |
---|
4186 | 4397 | /** |
---|
4187 | 4398 | * css_rightmost_descendant - return the rightmost descendant of a css |
---|
.. | .. |
---|
4362 | 4573 | |
---|
4363 | 4574 | lockdep_assert_held(&css_set_lock); |
---|
4364 | 4575 | |
---|
4365 | | - /* Advance to the next non-empty css_set */ |
---|
4366 | | - do { |
---|
4367 | | - cset = css_task_iter_next_css_set(it); |
---|
4368 | | - if (!cset) { |
---|
4369 | | - it->task_pos = NULL; |
---|
4370 | | - return; |
---|
| 4576 | + /* Advance to the next non-empty css_set and find the first non-empty tasks list */
---|
| 4577 | + while ((cset = css_task_iter_next_css_set(it))) { |
---|
| 4578 | + if (!list_empty(&cset->tasks)) { |
---|
| 4579 | + it->cur_tasks_head = &cset->tasks; |
---|
| 4580 | + break; |
---|
| 4581 | + } else if (!list_empty(&cset->mg_tasks)) { |
---|
| 4582 | + it->cur_tasks_head = &cset->mg_tasks; |
---|
| 4583 | + break; |
---|
| 4584 | + } else if (!list_empty(&cset->dying_tasks)) { |
---|
| 4585 | + it->cur_tasks_head = &cset->dying_tasks; |
---|
| 4586 | + break; |
---|
4371 | 4587 | } |
---|
4372 | | - } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); |
---|
4373 | | - |
---|
4374 | | - if (!list_empty(&cset->tasks)) { |
---|
4375 | | - it->task_pos = cset->tasks.next; |
---|
4376 | | - it->cur_tasks_head = &cset->tasks; |
---|
4377 | | - } else if (!list_empty(&cset->mg_tasks)) { |
---|
4378 | | - it->task_pos = cset->mg_tasks.next; |
---|
4379 | | - it->cur_tasks_head = &cset->mg_tasks; |
---|
4380 | | - } else { |
---|
4381 | | - it->task_pos = cset->dying_tasks.next; |
---|
4382 | | - it->cur_tasks_head = &cset->dying_tasks; |
---|
4383 | 4588 | } |
---|
4384 | | - |
---|
4385 | | - it->tasks_head = &cset->tasks; |
---|
4386 | | - it->mg_tasks_head = &cset->mg_tasks; |
---|
4387 | | - it->dying_tasks_head = &cset->dying_tasks; |
---|
| 4589 | + if (!cset) { |
---|
| 4590 | + it->task_pos = NULL; |
---|
| 4591 | + return; |
---|
| 4592 | + } |
---|
| 4593 | + it->task_pos = it->cur_tasks_head->next; |
---|
4388 | 4594 | |
---|
4389 | 4595 | /* |
---|
4390 | 4596 | * We don't keep css_sets locked across iteration steps and thus |
---|
.. | .. |
---|
4429 | 4635 | repeat: |
---|
4430 | 4636 | if (it->task_pos) { |
---|
4431 | 4637 | /* |
---|
4432 | | - * Advance iterator to find next entry. cset->tasks is |
---|
4433 | | - * consumed first and then ->mg_tasks. After ->mg_tasks, |
---|
4434 | | - * we move onto the next cset. |
---|
| 4638 | + * Advance iterator to find next entry. We go through cset |
---|
| 4639 | + * tasks, mg_tasks and dying_tasks; when consumed we move onto
---|
| 4640 | + * the next cset. |
---|
4435 | 4641 | */ |
---|
4436 | 4642 | if (it->flags & CSS_TASK_ITER_SKIPPED) |
---|
4437 | 4643 | it->flags &= ~CSS_TASK_ITER_SKIPPED; |
---|
4438 | 4644 | else |
---|
4439 | 4645 | it->task_pos = it->task_pos->next; |
---|
4440 | 4646 | |
---|
4441 | | - if (it->task_pos == it->tasks_head) { |
---|
4442 | | - it->task_pos = it->mg_tasks_head->next; |
---|
4443 | | - it->cur_tasks_head = it->mg_tasks_head; |
---|
| 4647 | + if (it->task_pos == &it->cur_cset->tasks) { |
---|
| 4648 | + it->cur_tasks_head = &it->cur_cset->mg_tasks; |
---|
| 4649 | + it->task_pos = it->cur_tasks_head->next; |
---|
4444 | 4650 | } |
---|
4445 | | - if (it->task_pos == it->mg_tasks_head) { |
---|
4446 | | - it->task_pos = it->dying_tasks_head->next; |
---|
4447 | | - it->cur_tasks_head = it->dying_tasks_head; |
---|
| 4651 | + if (it->task_pos == &it->cur_cset->mg_tasks) { |
---|
| 4652 | + it->cur_tasks_head = &it->cur_cset->dying_tasks; |
---|
| 4653 | + it->task_pos = it->cur_tasks_head->next; |
---|
4448 | 4654 | } |
---|
4449 | | - if (it->task_pos == it->dying_tasks_head) |
---|
| 4655 | + if (it->task_pos == &it->cur_cset->dying_tasks) |
---|
4450 | 4656 | css_task_iter_advance_css_set(it); |
---|
4451 | 4657 | } else { |
---|
4452 | 4658 | /* called from start, proceed to the first cset */ |
---|
.. | .. |
---|
4464 | 4670 | goto repeat; |
---|
4465 | 4671 | |
---|
4466 | 4672 | /* and dying leaders w/o live member threads */ |
---|
4467 | | - if (it->cur_tasks_head == it->dying_tasks_head && |
---|
| 4673 | + if (it->cur_tasks_head == &it->cur_cset->dying_tasks && |
---|
4468 | 4674 | !atomic_read(&task->signal->live)) |
---|
4469 | 4675 | goto repeat; |
---|
4470 | 4676 | } else { |
---|
4471 | 4677 | /* skip all dying ones */ |
---|
4472 | | - if (it->cur_tasks_head == it->dying_tasks_head) |
---|
| 4678 | + if (it->cur_tasks_head == &it->cur_cset->dying_tasks) |
---|
4473 | 4679 | goto repeat; |
---|
4474 | 4680 | } |
---|
4475 | 4681 | } |
---|
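The rework above drops the three cached `*_tasks_head` pointers in favor of a single `cur_tasks_head` that walks `tasks`, then `mg_tasks`, then `dying_tasks` of `it->cur_cset`. The external iterator API is unchanged; a minimal (hypothetical) caller for reference:

```c
/* Walk every live task attached to @css. Assumes the usual calling
 * context for css_task_iter; the iterator may sleep between steps. */
static void dump_css_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		pr_info("task %d\n", task_pid_nr(task));
	css_task_iter_end(&it);
}
```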
.. | .. |
---|
4488 | 4694 | void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, |
---|
4489 | 4695 | struct css_task_iter *it) |
---|
4490 | 4696 | { |
---|
4491 | | - /* no one should try to iterate before mounting cgroups */ |
---|
4492 | | - WARN_ON_ONCE(!use_task_css_set_links); |
---|
4493 | | - |
---|
4494 | 4697 | memset(it, 0, sizeof(*it)); |
---|
4495 | 4698 | |
---|
4496 | 4699 | spin_lock_irq(&css_set_lock); |
---|
.. | .. |
---|
4567 | 4770 | |
---|
4568 | 4771 | static void cgroup_procs_release(struct kernfs_open_file *of) |
---|
4569 | 4772 | { |
---|
4570 | | - if (of->priv) { |
---|
4571 | | - css_task_iter_end(of->priv); |
---|
4572 | | - kfree(of->priv); |
---|
4573 | | - } |
---|
| 4773 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
| 4774 | + |
---|
| 4775 | + if (ctx->procs.started) |
---|
| 4776 | + css_task_iter_end(&ctx->procs.iter); |
---|
4574 | 4777 | } |
---|
4575 | 4778 | |
---|
4576 | 4779 | static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) |
---|
4577 | 4780 | { |
---|
4578 | 4781 | struct kernfs_open_file *of = s->private; |
---|
4579 | | - struct css_task_iter *it = of->priv; |
---|
| 4782 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
4580 | 4783 | |
---|
4581 | 4784 | if (pos) |
---|
4582 | 4785 | (*pos)++; |
---|
4583 | 4786 | |
---|
4584 | | - return css_task_iter_next(it); |
---|
| 4787 | + return css_task_iter_next(&ctx->procs.iter); |
---|
4585 | 4788 | } |
---|
4586 | 4789 | |
---|
4587 | 4790 | static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, |
---|
.. | .. |
---|
4589 | 4792 | { |
---|
4590 | 4793 | struct kernfs_open_file *of = s->private; |
---|
4591 | 4794 | struct cgroup *cgrp = seq_css(s)->cgroup; |
---|
4592 | | - struct css_task_iter *it = of->priv; |
---|
| 4795 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
| 4796 | + struct css_task_iter *it = &ctx->procs.iter; |
---|
4593 | 4797 | |
---|
4594 | 4798 | /* |
---|
4595 | 4799 | * When a seq_file is seeked, it's always traversed sequentially |
---|
4596 | 4800 | * from position 0, so we can simply keep iterating on !0 *pos. |
---|
4597 | 4801 | */ |
---|
4598 | | - if (!it) { |
---|
| 4802 | + if (!ctx->procs.started) { |
---|
4599 | 4803 | if (WARN_ON_ONCE((*pos))) |
---|
4600 | 4804 | return ERR_PTR(-EINVAL); |
---|
4601 | | - |
---|
4602 | | - it = kzalloc(sizeof(*it), GFP_KERNEL); |
---|
4603 | | - if (!it) |
---|
4604 | | - return ERR_PTR(-ENOMEM); |
---|
4605 | | - of->priv = it; |
---|
4606 | 4805 | css_task_iter_start(&cgrp->self, iter_flags, it); |
---|
| 4806 | + ctx->procs.started = true; |
---|
4607 | 4807 | } else if (!(*pos)) { |
---|
4608 | 4808 | css_task_iter_end(it); |
---|
4609 | 4809 | css_task_iter_start(&cgrp->self, iter_flags, it); |
---|
.. | .. |
---|
4636 | 4836 | return 0; |
---|
4637 | 4837 | } |
---|
4638 | 4838 | |
---|
| 4839 | +static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) |
---|
| 4840 | +{ |
---|
| 4841 | + int ret; |
---|
| 4842 | + struct inode *inode; |
---|
| 4843 | + |
---|
| 4844 | + lockdep_assert_held(&cgroup_mutex); |
---|
| 4845 | + |
---|
| 4846 | + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); |
---|
| 4847 | + if (!inode) |
---|
| 4848 | + return -ENOMEM; |
---|
| 4849 | + |
---|
| 4850 | + ret = inode_permission(inode, MAY_WRITE); |
---|
| 4851 | + iput(inode); |
---|
| 4852 | + return ret; |
---|
| 4853 | +} |
---|
| 4854 | + |
---|
4639 | 4855 | static int cgroup_procs_write_permission(struct cgroup *src_cgrp, |
---|
4640 | 4856 | struct cgroup *dst_cgrp, |
---|
4641 | | - struct super_block *sb) |
---|
| 4857 | + struct super_block *sb, |
---|
| 4858 | + struct cgroup_namespace *ns) |
---|
4642 | 4859 | { |
---|
4643 | | - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; |
---|
4644 | 4860 | struct cgroup *com_cgrp = src_cgrp; |
---|
4645 | | - struct inode *inode; |
---|
4646 | 4861 | int ret; |
---|
4647 | 4862 | |
---|
4648 | 4863 | lockdep_assert_held(&cgroup_mutex); |
---|
.. | .. |
---|
4652 | 4867 | com_cgrp = cgroup_parent(com_cgrp); |
---|
4653 | 4868 | |
---|
4654 | 4869 | /* %current should be authorized to migrate to the common ancestor */ |
---|
4655 | | - inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); |
---|
4656 | | - if (!inode) |
---|
4657 | | - return -ENOMEM; |
---|
4658 | | - |
---|
4659 | | - ret = inode_permission(inode, MAY_WRITE); |
---|
4660 | | - iput(inode); |
---|
| 4870 | + ret = cgroup_may_write(com_cgrp, sb); |
---|
4661 | 4871 | if (ret) |
---|
4662 | 4872 | return ret; |
---|
4663 | 4873 | |
---|
.. | .. |
---|
4673 | 4883 | return 0; |
---|
4674 | 4884 | } |
---|
4675 | 4885 | |
---|
| 4886 | +static int cgroup_attach_permissions(struct cgroup *src_cgrp, |
---|
| 4887 | + struct cgroup *dst_cgrp, |
---|
| 4888 | + struct super_block *sb, bool threadgroup, |
---|
| 4889 | + struct cgroup_namespace *ns) |
---|
| 4890 | +{ |
---|
| 4891 | + int ret = 0; |
---|
| 4892 | + |
---|
| 4893 | + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); |
---|
| 4894 | + if (ret) |
---|
| 4895 | + return ret; |
---|
| 4896 | + |
---|
| 4897 | + ret = cgroup_migrate_vet_dst(dst_cgrp); |
---|
| 4898 | + if (ret) |
---|
| 4899 | + return ret; |
---|
| 4900 | + |
---|
| 4901 | + if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)) |
---|
| 4902 | + ret = -EOPNOTSUPP; |
---|
| 4903 | + |
---|
| 4904 | + return ret; |
---|
| 4905 | +} |
---|
| 4906 | + |
---|
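`cgroup_attach_permissions()` bundles the three checks every migration path now shares: write permission on `cgroup.procs` at the source/destination common ancestor, destination validity via `cgroup_migrate_vet_dst()`, and, for single-thread moves, domain equality. From userspace, the first check is what delegation rests on; a sketch (the cgroup path is hypothetical):

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Move @pid into a delegated cgroup. Fails with EACCES unless the
 * caller may write cgroup.procs up to the common ancestor. */
static int move_pid(pid_t pid)
{
	int ret = -1;
	int fd = open("/sys/fs/cgroup/user.slice/app/cgroup.procs", O_WRONLY);

	if (fd < 0)
		return -1;
	if (dprintf(fd, "%d\n", pid) > 0)
		ret = 0;
	close(fd);
	return ret;
}
```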
4676 | 4907 | static ssize_t cgroup_procs_write(struct kernfs_open_file *of, |
---|
4677 | 4908 | char *buf, size_t nbytes, loff_t off) |
---|
4678 | 4909 | { |
---|
| 4910 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
4679 | 4911 | struct cgroup *src_cgrp, *dst_cgrp; |
---|
4680 | 4912 | struct task_struct *task; |
---|
| 4913 | + const struct cred *saved_cred; |
---|
4681 | 4914 | ssize_t ret; |
---|
| 4915 | + bool threadgroup_locked; |
---|
4682 | 4916 | |
---|
4683 | 4917 | dst_cgrp = cgroup_kn_lock_live(of->kn, false); |
---|
4684 | 4918 | if (!dst_cgrp) |
---|
4685 | 4919 | return -ENODEV; |
---|
4686 | 4920 | |
---|
4687 | | - task = cgroup_procs_write_start(buf, true); |
---|
| 4921 | + task = cgroup_procs_write_start(buf, true, &threadgroup_locked, dst_cgrp); |
---|
4688 | 4922 | ret = PTR_ERR_OR_ZERO(task); |
---|
4689 | 4923 | if (ret) |
---|
4690 | 4924 | goto out_unlock; |
---|
.. | .. |
---|
4694 | 4928 | src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); |
---|
4695 | 4929 | spin_unlock_irq(&css_set_lock); |
---|
4696 | 4930 | |
---|
4697 | | - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, |
---|
4698 | | - of->file->f_path.dentry->d_sb); |
---|
| 4931 | + /* |
---|
| 4932 | + * Process and thread migrations follow the same delegation rule. Check
---|
| 4933 | + * permissions using the credentials from file open to protect against |
---|
| 4934 | + * inherited fd attacks. |
---|
| 4935 | + */ |
---|
| 4936 | + saved_cred = override_creds(of->file->f_cred); |
---|
| 4937 | + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, |
---|
| 4938 | + of->file->f_path.dentry->d_sb, true, |
---|
| 4939 | + ctx->ns); |
---|
| 4940 | + revert_creds(saved_cred); |
---|
4699 | 4941 | if (ret) |
---|
4700 | 4942 | goto out_finish; |
---|
4701 | 4943 | |
---|
4702 | 4944 | ret = cgroup_attach_task(dst_cgrp, task, true); |
---|
4703 | 4945 | |
---|
4704 | 4946 | out_finish: |
---|
4705 | | - cgroup_procs_write_finish(task); |
---|
| 4947 | + cgroup_procs_write_finish(task, threadgroup_locked); |
---|
4706 | 4948 | out_unlock: |
---|
4707 | 4949 | cgroup_kn_unlock(of->kn); |
---|
4708 | 4950 | |
---|
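Both write handlers now run the delegation check under `of->file->f_cred`, the credentials captured when the file was opened, instead of `current_cred()` at write time. The pattern in isolation (the permission check itself is a placeholder):

```c
/* Check with the opener's credentials so that handing an already-open
 * cgroup.procs fd to a more privileged task cannot widen access. */
const struct cred *saved_cred = override_creds(of->file->f_cred);
int ret = check_migration(src_cgrp, dst_cgrp);	/* hypothetical check */

revert_creds(saved_cred);
```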
.. | .. |
---|
4717 | 4959 | static ssize_t cgroup_threads_write(struct kernfs_open_file *of, |
---|
4718 | 4960 | char *buf, size_t nbytes, loff_t off) |
---|
4719 | 4961 | { |
---|
| 4962 | + struct cgroup_file_ctx *ctx = of->priv; |
---|
4720 | 4963 | struct cgroup *src_cgrp, *dst_cgrp; |
---|
4721 | 4964 | struct task_struct *task; |
---|
| 4965 | + const struct cred *saved_cred; |
---|
4722 | 4966 | ssize_t ret; |
---|
| 4967 | + bool threadgroup_locked; |
---|
4723 | 4968 | |
---|
4724 | 4969 | buf = strstrip(buf); |
---|
4725 | 4970 | |
---|
.. | .. |
---|
4727 | 4972 | if (!dst_cgrp) |
---|
4728 | 4973 | return -ENODEV; |
---|
4729 | 4974 | |
---|
4730 | | - task = cgroup_procs_write_start(buf, false); |
---|
| 4975 | + task = cgroup_procs_write_start(buf, false, &threadgroup_locked, dst_cgrp); |
---|
4731 | 4976 | ret = PTR_ERR_OR_ZERO(task); |
---|
4732 | 4977 | if (ret) |
---|
4733 | 4978 | goto out_unlock; |
---|
.. | .. |
---|
4737 | 4982 | src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); |
---|
4738 | 4983 | spin_unlock_irq(&css_set_lock); |
---|
4739 | 4984 | |
---|
4740 | | - /* thread migrations follow the cgroup.procs delegation rule */ |
---|
4741 | | - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, |
---|
4742 | | - of->file->f_path.dentry->d_sb); |
---|
| 4985 | + /* |
---|
| 4986 | + * Process and thread migrations follow the same delegation rule. Check
---|
| 4987 | + * permissions using the credentials from file open to protect against |
---|
| 4988 | + * inherited fd attacks. |
---|
| 4989 | + */ |
---|
| 4990 | + saved_cred = override_creds(of->file->f_cred); |
---|
| 4991 | + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, |
---|
| 4992 | + of->file->f_path.dentry->d_sb, false, |
---|
| 4993 | + ctx->ns); |
---|
| 4994 | + revert_creds(saved_cred); |
---|
4743 | 4995 | if (ret) |
---|
4744 | | - goto out_finish; |
---|
4745 | | - |
---|
4746 | | - /* and must be contained in the same domain */ |
---|
4747 | | - ret = -EOPNOTSUPP; |
---|
4748 | | - if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) |
---|
4749 | 4996 | goto out_finish; |
---|
4750 | 4997 | |
---|
4751 | 4998 | ret = cgroup_attach_task(dst_cgrp, task, false); |
---|
4752 | 4999 | |
---|
4753 | 5000 | out_finish: |
---|
4754 | | - cgroup_procs_write_finish(task); |
---|
| 5001 | + cgroup_procs_write_finish(task, threadgroup_locked); |
---|
4755 | 5002 | out_unlock: |
---|
4756 | 5003 | cgroup_kn_unlock(of->kn); |
---|
4757 | 5004 | |
---|
.. | .. |
---|
4823 | 5070 | }, |
---|
4824 | 5071 | { |
---|
4825 | 5072 | .name = "cpu.stat", |
---|
4826 | | - .flags = CFTYPE_NOT_ON_ROOT, |
---|
4827 | 5073 | .seq_show = cpu_stat_show, |
---|
4828 | 5074 | }, |
---|
4829 | 5075 | #ifdef CONFIG_PSI |
---|
4830 | 5076 | { |
---|
4831 | 5077 | .name = "io.pressure", |
---|
4832 | | - .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE, |
---|
| 5078 | + .flags = CFTYPE_PRESSURE, |
---|
4833 | 5079 | .seq_show = cgroup_io_pressure_show, |
---|
4834 | 5080 | .write = cgroup_io_pressure_write, |
---|
4835 | 5081 | .poll = cgroup_pressure_poll, |
---|
.. | .. |
---|
4837 | 5083 | }, |
---|
4838 | 5084 | { |
---|
4839 | 5085 | .name = "memory.pressure", |
---|
4840 | | - .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE, |
---|
| 5086 | + .flags = CFTYPE_PRESSURE, |
---|
4841 | 5087 | .seq_show = cgroup_memory_pressure_show, |
---|
4842 | 5088 | .write = cgroup_memory_pressure_write, |
---|
4843 | 5089 | .poll = cgroup_pressure_poll, |
---|
.. | .. |
---|
4845 | 5091 | }, |
---|
4846 | 5092 | { |
---|
4847 | 5093 | .name = "cpu.pressure", |
---|
4848 | | - .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE, |
---|
| 5094 | + .flags = CFTYPE_PRESSURE, |
---|
4849 | 5095 | .seq_show = cgroup_cpu_pressure_show, |
---|
4850 | 5096 | .write = cgroup_cpu_pressure_write, |
---|
4851 | 5097 | .poll = cgroup_pressure_poll, |
---|
.. | .. |
---|
4964 | 5210 | tcgrp->nr_dying_descendants--; |
---|
4965 | 5211 | spin_unlock_irq(&css_set_lock); |
---|
4966 | 5212 | |
---|
4967 | | - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); |
---|
4968 | | - cgrp->id = -1; |
---|
4969 | | - |
---|
4970 | 5213 | /* |
---|
4971 | 5214 | * There are two control paths which try to determine |
---|
4972 | 5215 | * cgroup from dentry without going through kernfs - |
---|
.. | .. |
---|
4977 | 5220 | if (cgrp->kn) |
---|
4978 | 5221 | RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, |
---|
4979 | 5222 | NULL); |
---|
4980 | | - |
---|
4981 | | - cgroup_bpf_put(cgrp); |
---|
4982 | 5223 | } |
---|
4983 | 5224 | |
---|
4984 | 5225 | mutex_unlock(&cgroup_mutex); |
---|
.. | .. |
---|
5133 | 5374 | * it isn't associated with its kernfs_node and doesn't have the control |
---|
5134 | 5375 | * mask applied. |
---|
5135 | 5376 | */ |
---|
5136 | | -static struct cgroup *cgroup_create(struct cgroup *parent) |
---|
| 5377 | +static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, |
---|
| 5378 | + umode_t mode) |
---|
5137 | 5379 | { |
---|
5138 | 5380 | struct cgroup_root *root = parent->root; |
---|
5139 | 5381 | struct cgroup *cgrp, *tcgrp; |
---|
| 5382 | + struct kernfs_node *kn; |
---|
5140 | 5383 | int level = parent->level + 1; |
---|
5141 | 5384 | int ret; |
---|
5142 | 5385 | |
---|
.. | .. |
---|
5156 | 5399 | goto out_cancel_ref; |
---|
5157 | 5400 | } |
---|
5158 | 5401 | |
---|
5159 | | - /* |
---|
5160 | | - * Temporarily set the pointer to NULL, so idr_find() won't return |
---|
5161 | | - * a half-baked cgroup. |
---|
5162 | | - */ |
---|
5163 | | - cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); |
---|
5164 | | - if (cgrp->id < 0) { |
---|
5165 | | - ret = -ENOMEM; |
---|
| 5402 | + /* create the directory */ |
---|
| 5403 | + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); |
---|
| 5404 | + if (IS_ERR(kn)) { |
---|
| 5405 | + ret = PTR_ERR(kn); |
---|
5166 | 5406 | goto out_stat_exit; |
---|
5167 | 5407 | } |
---|
| 5408 | + cgrp->kn = kn; |
---|
5168 | 5409 | |
---|
5169 | 5410 | init_cgroup_housekeeping(cgrp); |
---|
5170 | 5411 | |
---|
.. | .. |
---|
5174 | 5415 | |
---|
5175 | 5416 | ret = psi_cgroup_alloc(cgrp); |
---|
5176 | 5417 | if (ret) |
---|
5177 | | - goto out_idr_free; |
---|
| 5418 | + goto out_kernfs_remove; |
---|
5178 | 5419 | |
---|
5179 | 5420 | ret = cgroup_bpf_inherit(cgrp); |
---|
5180 | 5421 | if (ret) |
---|
.. | .. |
---|
5198 | 5439 | |
---|
5199 | 5440 | spin_lock_irq(&css_set_lock); |
---|
5200 | 5441 | for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { |
---|
5201 | | - cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; |
---|
| 5442 | + cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); |
---|
5202 | 5443 | |
---|
5203 | 5444 | if (tcgrp != cgrp) { |
---|
5204 | 5445 | tcgrp->nr_descendants++; |
---|
.. | .. |
---|
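`cgroup_id()` here replaces the old idr-based `cgrp->id`: a cgroup is now identified by the 64-bit kernfs node ID of its directory, which is also why `kernfs_create_dir()` moved to the top of `cgroup_create()` in this patch. Paraphrasing `include/linux/cgroup.h`:

```c
/* Paraphrased sketch: the cgroup ID is the kernfs node ID. */
static inline u64 cgroup_id(const struct cgroup *cgrp)
{
	return cgrp->kn->id;	/* the root cgroup's directory is inode 1 */
}
```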
5228 | 5469 | cgroup_get_live(parent); |
---|
5229 | 5470 | |
---|
5230 | 5471 | /* |
---|
5231 | | - * @cgrp is now fully operational. If something fails after this |
---|
5232 | | - * point, it'll be released via the normal destruction path. |
---|
5233 | | - */ |
---|
5234 | | - cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); |
---|
5235 | | - |
---|
5236 | | - /* |
---|
5237 | 5472 | * On the default hierarchy, a child doesn't automatically inherit |
---|
5238 | 5473 | * subtree_control from the parent. Each is configured manually. |
---|
5239 | 5474 | */ |
---|
.. | .. |
---|
5246 | 5481 | |
---|
5247 | 5482 | out_psi_free: |
---|
5248 | 5483 | psi_cgroup_free(cgrp); |
---|
5249 | | -out_idr_free: |
---|
5250 | | - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); |
---|
| 5484 | +out_kernfs_remove: |
---|
| 5485 | + kernfs_remove(cgrp->kn); |
---|
5251 | 5486 | out_stat_exit: |
---|
5252 | 5487 | if (cgroup_on_dfl(parent)) |
---|
5253 | 5488 | cgroup_rstat_exit(cgrp); |
---|
.. | .. |
---|
5284 | 5519 | int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) |
---|
5285 | 5520 | { |
---|
5286 | 5521 | struct cgroup *parent, *cgrp; |
---|
5287 | | - struct kernfs_node *kn; |
---|
5288 | 5522 | int ret; |
---|
5289 | 5523 | |
---|
5290 | 5524 | /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ |
---|
.. | .. |
---|
5300 | 5534 | goto out_unlock; |
---|
5301 | 5535 | } |
---|
5302 | 5536 | |
---|
5303 | | - cgrp = cgroup_create(parent); |
---|
| 5537 | + cgrp = cgroup_create(parent, name, mode); |
---|
5304 | 5538 | if (IS_ERR(cgrp)) { |
---|
5305 | 5539 | ret = PTR_ERR(cgrp); |
---|
5306 | 5540 | goto out_unlock; |
---|
5307 | 5541 | } |
---|
5308 | 5542 | |
---|
5309 | | - /* create the directory */ |
---|
5310 | | - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); |
---|
5311 | | - if (IS_ERR(kn)) { |
---|
5312 | | - ret = PTR_ERR(kn); |
---|
5313 | | - goto out_destroy; |
---|
5314 | | - } |
---|
5315 | | - cgrp->kn = kn; |
---|
5316 | | - |
---|
5317 | 5543 | /* |
---|
5318 | 5544 | * This extra ref will be put in cgroup_free_fn() and guarantees |
---|
5319 | 5545 | * that @cgrp->kn is always accessible. |
---|
5320 | 5546 | */ |
---|
5321 | | - kernfs_get(kn); |
---|
| 5547 | + kernfs_get(cgrp->kn); |
---|
5322 | 5548 | |
---|
5323 | | - ret = cgroup_kn_set_ugid(kn); |
---|
| 5549 | + ret = cgroup_kn_set_ugid(cgrp->kn); |
---|
5324 | 5550 | if (ret) |
---|
5325 | 5551 | goto out_destroy; |
---|
5326 | 5552 | |
---|
.. | .. |
---|
5335 | 5561 | TRACE_CGROUP_PATH(mkdir, cgrp); |
---|
5336 | 5562 | |
---|
5337 | 5563 | /* let's create and online css's */ |
---|
5338 | | - kernfs_activate(kn); |
---|
| 5564 | + kernfs_activate(cgrp->kn); |
---|
5339 | 5565 | |
---|
5340 | 5566 | ret = 0; |
---|
5341 | 5567 | goto out_unlock; |
---|
.. | .. |
---|
5512 | 5738 | |
---|
5513 | 5739 | cgroup1_check_for_release(parent); |
---|
5514 | 5740 | |
---|
| 5741 | + cgroup_bpf_offline(cgrp); |
---|
| 5742 | + |
---|
5515 | 5743 | /* put the base reference */ |
---|
5516 | 5744 | percpu_ref_kill(&cgrp->self.refcnt); |
---|
5517 | 5745 | |
---|
.. | .. |
---|
5537 | 5765 | |
---|
5538 | 5766 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { |
---|
5539 | 5767 | .show_options = cgroup_show_options, |
---|
5540 | | - .remount_fs = cgroup_remount, |
---|
5541 | 5768 | .mkdir = cgroup_mkdir, |
---|
5542 | 5769 | .rmdir = cgroup_rmdir, |
---|
5543 | 5770 | .show_path = cgroup_show_path, |
---|
.. | .. |
---|
5604 | 5831 | */ |
---|
5605 | 5832 | int __init cgroup_init_early(void) |
---|
5606 | 5833 | { |
---|
5607 | | - static struct cgroup_sb_opts __initdata opts; |
---|
| 5834 | + static struct cgroup_fs_context __initdata ctx; |
---|
5608 | 5835 | struct cgroup_subsys *ss; |
---|
5609 | 5836 | int i; |
---|
5610 | 5837 | |
---|
5611 | | - init_cgroup_root(&cgrp_dfl_root, &opts); |
---|
| 5838 | + ctx.root = &cgrp_dfl_root; |
---|
| 5839 | + init_cgroup_root(&ctx); |
---|
5612 | 5840 | cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; |
---|
5613 | 5841 | |
---|
5614 | 5842 | RCU_INIT_POINTER(init_task.cgroups, &init_css_set); |
---|
.. | .. |
---|
5644 | 5872 | int ssid; |
---|
5645 | 5873 | |
---|
5646 | 5874 | BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); |
---|
5647 | | - BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); |
---|
5648 | 5875 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); |
---|
5649 | 5876 | BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); |
---|
5650 | 5877 | |
---|
5651 | 5878 | cgroup_rstat_boot(); |
---|
5652 | 5879 | |
---|
5653 | 5880 | /* |
---|
5654 | | - * The latency of the synchronize_sched() is too high for cgroups, |
---|
| 5881 | + * The latency of the synchronize_rcu() is too high for cgroups, |
---|
5655 | 5882 | * avoid it at the cost of forcing all readers into the slow path. |
---|
5656 | 5883 | */ |
---|
5657 | 5884 | rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); |
---|
.. | .. |
---|
5735 | 5962 | WARN_ON(register_filesystem(&cgroup_fs_type)); |
---|
5736 | 5963 | WARN_ON(register_filesystem(&cgroup2_fs_type)); |
---|
5737 | 5964 | WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); |
---|
| 5965 | +#ifdef CONFIG_CPUSETS |
---|
| 5966 | + WARN_ON(register_filesystem(&cpuset_fs_type)); |
---|
| 5967 | +#endif |
---|
5738 | 5968 | |
---|
5739 | 5969 | return 0; |
---|
5740 | 5970 | } |
---|
.. | .. |
---|
5755 | 5985 | } |
---|
5756 | 5986 | core_initcall(cgroup_wq_init); |
---|
5757 | 5987 | |
---|
5758 | | -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, |
---|
5759 | | - char *buf, size_t buflen) |
---|
| 5988 | +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) |
---|
5760 | 5989 | { |
---|
5761 | 5990 | struct kernfs_node *kn; |
---|
5762 | 5991 | |
---|
5763 | | - kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); |
---|
| 5992 | + kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); |
---|
5764 | 5993 | if (!kn) |
---|
5765 | 5994 | return; |
---|
5766 | 5995 | kernfs_path(kn, buf, buflen); |
---|
.. | .. |
---|
5850 | 6079 | * @child: pointer to task_struct of forking parent process. |
---|
5851 | 6080 | * |
---|
5852 | 6081 | * A task is associated with the init_css_set until cgroup_post_fork() |
---|
5853 | | - * attaches it to the parent's css_set. Empty cg_list indicates that |
---|
5854 | | - * @child isn't holding reference to its css_set. |
---|
| 6082 | + * attaches it to the target css_set. |
---|
5855 | 6083 | */ |
---|
5856 | 6084 | void cgroup_fork(struct task_struct *child) |
---|
5857 | 6085 | { |
---|
.. | .. |
---|
5859 | 6087 | INIT_LIST_HEAD(&child->cg_list); |
---|
5860 | 6088 | } |
---|
5861 | 6089 | |
---|
| 6090 | +static struct cgroup *cgroup_get_from_file(struct file *f) |
---|
| 6091 | +{ |
---|
| 6092 | + struct cgroup_subsys_state *css; |
---|
| 6093 | + struct cgroup *cgrp; |
---|
| 6094 | + |
---|
| 6095 | + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); |
---|
| 6096 | + if (IS_ERR(css)) |
---|
| 6097 | + return ERR_CAST(css); |
---|
| 6098 | + |
---|
| 6099 | + cgrp = css->cgroup; |
---|
| 6100 | + if (!cgroup_on_dfl(cgrp)) { |
---|
| 6101 | + cgroup_put(cgrp); |
---|
| 6102 | + return ERR_PTR(-EBADF); |
---|
| 6103 | + } |
---|
| 6104 | + |
---|
| 6105 | + return cgrp; |
---|
| 6106 | +} |
---|
| 6107 | + |
---|
| 6108 | +/** |
---|
| 6109 | + * cgroup_css_set_fork - find or create a css_set for a child process |
---|
| 6110 | + * @kargs: the arguments passed to create the child process |
---|
| 6111 | + * |
---|
| 6112 | + * This function finds or creates a new css_set which the child
---|
| 6113 | + * process will be attached to in cgroup_post_fork(). By default, |
---|
| 6114 | + * the child process will be given the same css_set as its parent. |
---|
| 6115 | + * |
---|
| 6116 | + * If CLONE_INTO_CGROUP is specified this function will try to find an |
---|
| 6117 | + * existing css_set which includes the requested cgroup and if not create |
---|
| 6118 | + * a new css_set that the child will be attached to later. If this function |
---|
| 6119 | + * succeeds it will hold cgroup_threadgroup_rwsem on return. If |
---|
| 6120 | + * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex |
---|
| 6121 | + * before grabbing cgroup_threadgroup_rwsem and will hold a reference |
---|
| 6122 | + * to the target cgroup. |
---|
| 6123 | + */ |
---|
| 6124 | +static int cgroup_css_set_fork(struct kernel_clone_args *kargs) |
---|
| 6125 | + __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem) |
---|
| 6126 | +{ |
---|
| 6127 | + int ret; |
---|
| 6128 | + struct cgroup *dst_cgrp = NULL; |
---|
| 6129 | + struct css_set *cset; |
---|
| 6130 | + struct super_block *sb; |
---|
| 6131 | + struct file *f; |
---|
| 6132 | + |
---|
| 6133 | + if (kargs->flags & CLONE_INTO_CGROUP) |
---|
| 6134 | + mutex_lock(&cgroup_mutex); |
---|
| 6135 | + |
---|
| 6136 | + cgroup_threadgroup_change_begin(current); |
---|
| 6137 | + |
---|
| 6138 | + spin_lock_irq(&css_set_lock); |
---|
| 6139 | + cset = task_css_set(current); |
---|
| 6140 | + get_css_set(cset); |
---|
| 6141 | + spin_unlock_irq(&css_set_lock); |
---|
| 6142 | + |
---|
| 6143 | + if (!(kargs->flags & CLONE_INTO_CGROUP)) { |
---|
| 6144 | + kargs->cset = cset; |
---|
| 6145 | + return 0; |
---|
| 6146 | + } |
---|
| 6147 | + |
---|
| 6148 | + f = fget_raw(kargs->cgroup); |
---|
| 6149 | + if (!f) { |
---|
| 6150 | + ret = -EBADF; |
---|
| 6151 | + goto err; |
---|
| 6152 | + } |
---|
| 6153 | + sb = f->f_path.dentry->d_sb; |
---|
| 6154 | + |
---|
| 6155 | + dst_cgrp = cgroup_get_from_file(f); |
---|
| 6156 | + if (IS_ERR(dst_cgrp)) { |
---|
| 6157 | + ret = PTR_ERR(dst_cgrp); |
---|
| 6158 | + dst_cgrp = NULL; |
---|
| 6159 | + goto err; |
---|
| 6160 | + } |
---|
| 6161 | + |
---|
| 6162 | + if (cgroup_is_dead(dst_cgrp)) { |
---|
| 6163 | + ret = -ENODEV; |
---|
| 6164 | + goto err; |
---|
| 6165 | + } |
---|
| 6166 | + |
---|
| 6167 | + /* |
---|
| 6168 | + * Verify that the target cgroup is writable for us. This is
---|
| 6169 | + * usually done by the vfs layer but since we're not going through |
---|
| 6170 | + * the vfs layer here we need to do it "manually". |
---|
| 6171 | + */ |
---|
| 6172 | + ret = cgroup_may_write(dst_cgrp, sb); |
---|
| 6173 | + if (ret) |
---|
| 6174 | + goto err; |
---|
| 6175 | + |
---|
| 6176 | + ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, |
---|
| 6177 | + !(kargs->flags & CLONE_THREAD), |
---|
| 6178 | + current->nsproxy->cgroup_ns); |
---|
| 6179 | + if (ret) |
---|
| 6180 | + goto err; |
---|
| 6181 | + |
---|
| 6182 | + kargs->cset = find_css_set(cset, dst_cgrp); |
---|
| 6183 | + if (!kargs->cset) { |
---|
| 6184 | + ret = -ENOMEM; |
---|
| 6185 | + goto err; |
---|
| 6186 | + } |
---|
| 6187 | + |
---|
| 6188 | + put_css_set(cset); |
---|
| 6189 | + fput(f); |
---|
| 6190 | + kargs->cgrp = dst_cgrp; |
---|
| 6191 | + return ret; |
---|
| 6192 | + |
---|
| 6193 | +err: |
---|
| 6194 | + cgroup_threadgroup_change_end(current); |
---|
| 6195 | + mutex_unlock(&cgroup_mutex); |
---|
| 6196 | + if (f) |
---|
| 6197 | + fput(f); |
---|
| 6198 | + if (dst_cgrp) |
---|
| 6199 | + cgroup_put(dst_cgrp); |
---|
| 6200 | + put_css_set(cset); |
---|
| 6201 | + if (kargs->cset) |
---|
| 6202 | + put_css_set(kargs->cset); |
---|
| 6203 | + return ret; |
---|
| 6204 | +} |
---|
| 6205 | + |
---|
| 6206 | +/** |
---|
| 6207 | + * cgroup_css_set_put_fork - drop references we took during fork |
---|
| 6208 | + * @kargs: the arguments passed to create the child process |
---|
| 6209 | + * |
---|
| 6210 | + * Drop references to the prepared css_set and target cgroup if |
---|
| 6211 | + * CLONE_INTO_CGROUP was requested. |
---|
| 6212 | + */ |
---|
| 6213 | +static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) |
---|
| 6214 | + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) |
---|
| 6215 | +{ |
---|
| 6216 | + struct cgroup *cgrp = kargs->cgrp; |
---|
| 6217 | + struct css_set *cset = kargs->cset; |
---|
| 6218 | + |
---|
| 6219 | + cgroup_threadgroup_change_end(current); |
---|
| 6220 | + |
---|
| 6221 | + if (cset) { |
---|
| 6222 | + put_css_set(cset); |
---|
| 6223 | + kargs->cset = NULL; |
---|
| 6224 | + } |
---|
| 6225 | + |
---|
| 6226 | + if (kargs->flags & CLONE_INTO_CGROUP) { |
---|
| 6227 | + mutex_unlock(&cgroup_mutex); |
---|
| 6228 | + if (cgrp) { |
---|
| 6229 | + cgroup_put(cgrp); |
---|
| 6230 | + kargs->cgrp = NULL; |
---|
| 6231 | + } |
---|
| 6232 | + } |
---|
| 6233 | +} |
---|
| 6234 | + |
---|
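`cgroup_css_set_fork()` and `cgroup_css_set_put_fork()` are the kernel half of `CLONE_INTO_CGROUP`: the target cgroup arrives as a file descriptor in `kargs->cgroup`, is vetted with the same writability and migration checks as a `cgroup.procs` write, and the child is attached in `cgroup_post_fork()`. A userspace sketch (the cgroup path is illustrative; requires a kernel and headers with `clone3()` support):

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/sched.h>	/* struct clone_args, CLONE_INTO_CGROUP */
#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int cgfd = open("/sys/fs/cgroup/mygrp", O_RDONLY | O_DIRECTORY);
	struct clone_args args = {
		.flags	     = CLONE_INTO_CGROUP,
		.exit_signal = SIGCHLD,
		.cgroup	     = (unsigned long long)cgfd,
	};
	pid_t pid = syscall(SYS_clone3, &args, sizeof(args));

	if (pid == 0)
		_exit(0);	/* the child starts life inside mygrp */
	return pid < 0;
}
```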
5862 | 6235 | /** |
---|
5863 | 6236 | * cgroup_can_fork - called on a new task before the process is exposed |
---|
5864 | | - * @child: the task in question. |
---|
| 6237 | + * @child: the child process |
---|
5865 | 6238 | * |
---|
5866 | | - * This calls the subsystem can_fork() callbacks. If the can_fork() callback |
---|
5867 | | - * returns an error, the fork aborts with that error code. This allows for |
---|
5868 | | - * a cgroup subsystem to conditionally allow or deny new forks. |
---|
| 6239 | + * This prepares a new css_set for the child process which the child will |
---|
| 6240 | + * be attached to in cgroup_post_fork(). |
---|
| 6241 | + * This calls the subsystem can_fork() callbacks. If a can_fork()
---|
| 6242 | + * callback returns an error, the fork aborts with that error code. This
---|
| 6243 | + * allows for a cgroup subsystem to conditionally allow or deny new forks. |
---|
5869 | 6244 | */ |
---|
5870 | | -int cgroup_can_fork(struct task_struct *child) |
---|
| 6245 | +int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) |
---|
5871 | 6246 | { |
---|
5872 | 6247 | struct cgroup_subsys *ss; |
---|
5873 | 6248 | int i, j, ret; |
---|
5874 | 6249 | |
---|
| 6250 | + ret = cgroup_css_set_fork(kargs); |
---|
| 6251 | + if (ret) |
---|
| 6252 | + return ret; |
---|
| 6253 | + |
---|
5875 | 6254 | do_each_subsys_mask(ss, i, have_canfork_callback) { |
---|
5876 | | - ret = ss->can_fork(child); |
---|
| 6255 | + ret = ss->can_fork(child, kargs->cset); |
---|
5877 | 6256 | if (ret) |
---|
5878 | 6257 | goto out_revert; |
---|
5879 | 6258 | } while_each_subsys_mask(); |
---|
.. | .. |
---|
5885 | 6264 | if (j >= i) |
---|
5886 | 6265 | break; |
---|
5887 | 6266 | if (ss->cancel_fork) |
---|
5888 | | - ss->cancel_fork(child); |
---|
| 6267 | + ss->cancel_fork(child, kargs->cset); |
---|
5889 | 6268 | } |
---|
| 6269 | + |
---|
| 6270 | + cgroup_css_set_put_fork(kargs); |
---|
5890 | 6271 | |
---|
5891 | 6272 | return ret; |
---|
5892 | 6273 | } |
---|
5893 | 6274 | |
---|
5894 | 6275 | /** |
---|
5895 | 6276 | * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() |
---|
5896 | | - * @child: the task in question |
---|
| 6277 | + * @child: the child process |
---|
| 6278 | + * @kargs: the arguments passed to create the child process |
---|
5897 | 6279 | * |
---|
5898 | 6280 | * This calls the cancel_fork() callbacks if a fork failed *after* |
---|
5899 | | - * cgroup_can_fork() succeded. |
---|
| 6281 | + * cgroup_can_fork() succeded and cleans up references we took to |
---|
| 6282 | + * prepare a new css_set for the child process in cgroup_can_fork(). |
---|
5900 | 6283 | */ |
---|
5901 | | -void cgroup_cancel_fork(struct task_struct *child) |
---|
| 6284 | +void cgroup_cancel_fork(struct task_struct *child, |
---|
| 6285 | + struct kernel_clone_args *kargs) |
---|
5902 | 6286 | { |
---|
5903 | 6287 | struct cgroup_subsys *ss; |
---|
5904 | 6288 | int i; |
---|
5905 | 6289 | |
---|
5906 | 6290 | for_each_subsys(ss, i) |
---|
5907 | 6291 | if (ss->cancel_fork) |
---|
5908 | | - ss->cancel_fork(child); |
---|
| 6292 | + ss->cancel_fork(child, kargs->cset); |
---|
| 6293 | + |
---|
| 6294 | + cgroup_css_set_put_fork(kargs); |
---|
5909 | 6295 | } |
---|
5910 | 6296 | |
---|
5911 | 6297 | /** |
---|
5912 | | - * cgroup_post_fork - called on a new task after adding it to the task list |
---|
5913 | | - * @child: the task in question |
---|
| 6298 | + * cgroup_post_fork - finalize cgroup setup for the child process |
---|
| 6299 | + * @child: the child process |
---|
5914 | 6300 | * |
---|
5915 | | - * Adds the task to the list running through its css_set if necessary and |
---|
5916 | | - * call the subsystem fork() callbacks. Has to be after the task is |
---|
5917 | | - * visible on the task list in case we race with the first call to |
---|
5918 | | - * cgroup_task_iter_start() - to guarantee that the new task ends up on its |
---|
5919 | | - * list. |
---|
| 6301 | + * Attach the child process to its css_set calling the subsystem fork() |
---|
| 6302 | + * callbacks. |
---|
5920 | 6303 | */ |
---|
5921 | | -void cgroup_post_fork(struct task_struct *child) |
---|
| 6304 | +void cgroup_post_fork(struct task_struct *child, |
---|
| 6305 | + struct kernel_clone_args *kargs) |
---|
| 6306 | + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) |
---|
5922 | 6307 | { |
---|
5923 | 6308 | struct cgroup_subsys *ss; |
---|
| 6309 | + struct css_set *cset; |
---|
5924 | 6310 | int i; |
---|
5925 | 6311 | |
---|
5926 | | - /* |
---|
5927 | | - * This may race against cgroup_enable_task_cg_lists(). As that |
---|
5928 | | - * function sets use_task_css_set_links before grabbing |
---|
5929 | | - * tasklist_lock and we just went through tasklist_lock to add |
---|
5930 | | - * @child, it's guaranteed that either we see the set |
---|
5931 | | - * use_task_css_set_links or cgroup_enable_task_cg_lists() sees |
---|
5932 | | - * @child during its iteration. |
---|
5933 | | - * |
---|
5934 | | - * If we won the race, @child is associated with %current's |
---|
5935 | | - * css_set. Grabbing css_set_lock guarantees both that the |
---|
5936 | | - * association is stable, and, on completion of the parent's |
---|
5937 | | - * migration, @child is visible in the source of migration or |
---|
5938 | | - * already in the destination cgroup. This guarantee is necessary |
---|
5939 | | - * when implementing operations which need to migrate all tasks of |
---|
5940 | | - * a cgroup to another. |
---|
5941 | | - * |
---|
5942 | | - * Note that if we lose to cgroup_enable_task_cg_lists(), @child |
---|
5943 | | - * will remain in init_css_set. This is safe because all tasks are |
---|
5944 | | - * in the init_css_set before cg_links is enabled and there's no |
---|
5945 | | - * operation which transfers all tasks out of init_css_set. |
---|
5946 | | - */ |
---|
5947 | | - if (use_task_css_set_links) { |
---|
5948 | | - struct css_set *cset; |
---|
| 6312 | + cset = kargs->cset; |
---|
| 6313 | + kargs->cset = NULL; |
---|
5949 | 6314 | |
---|
5950 | | - spin_lock_irq(&css_set_lock); |
---|
5951 | | - cset = task_css_set(current); |
---|
5952 | | - if (list_empty(&child->cg_list)) { |
---|
5953 | | - get_css_set(cset); |
---|
5954 | | - cset->nr_tasks++; |
---|
5955 | | - css_set_move_task(child, NULL, cset, false); |
---|
5956 | | - } |
---|
| 6315 | + spin_lock_irq(&css_set_lock); |
---|
| 6316 | + |
---|
| 6317 | + /* init tasks are special, only link regular threads */ |
---|
| 6318 | + if (likely(child->pid)) { |
---|
| 6319 | + WARN_ON_ONCE(!list_empty(&child->cg_list)); |
---|
| 6320 | + cset->nr_tasks++; |
---|
| 6321 | + css_set_move_task(child, NULL, cset, false); |
---|
| 6322 | + } else { |
---|
| 6323 | + put_css_set(cset); |
---|
| 6324 | + cset = NULL; |
---|
| 6325 | + } |
---|
| 6326 | + |
---|
| 6327 | + /* |
---|
| 6328 | + * If the cgroup has to be frozen, the new task has too. Let's set |
---|
| 6329 | + * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the |
---|
| 6330 | + * frozen state. |
---|
| 6331 | + */ |
---|
| 6332 | + if (unlikely(cgroup_task_freeze(child))) { |
---|
| 6333 | + spin_lock(&child->sighand->siglock); |
---|
| 6334 | + WARN_ON_ONCE(child->frozen); |
---|
| 6335 | + child->jobctl |= JOBCTL_TRAP_FREEZE; |
---|
| 6336 | + spin_unlock(&child->sighand->siglock); |
---|
5957 | 6337 | |
---|
5958 | 6338 | /* |
---|
5959 | | - * If the cgroup has to be frozen, the new task has too. |
---|
5960 | | - * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get |
---|
5961 | | - * the task into the frozen state. |
---|
| 6339 | + * Calling cgroup_update_frozen() isn't required here, |
---|
| 6340 | + * because it will be called anyway a bit later from |
---|
| 6341 | + * do_freezer_trap(). So we avoid cgroup's transient switch |
---|
| 6342 | + * from the frozen state and back. |
---|
5962 | 6343 | */ |
---|
5963 | | - if (unlikely(cgroup_task_freeze(child))) { |
---|
5964 | | - spin_lock(&child->sighand->siglock); |
---|
5965 | | - WARN_ON_ONCE(child->frozen); |
---|
5966 | | - child->jobctl |= JOBCTL_TRAP_FREEZE; |
---|
5967 | | - spin_unlock(&child->sighand->siglock); |
---|
5968 | | - |
---|
5969 | | - /* |
---|
5970 | | - * Calling cgroup_update_frozen() isn't required here, |
---|
5971 | | - * because it will be called anyway a bit later |
---|
5972 | | - * from do_freezer_trap(). So we avoid cgroup's |
---|
5973 | | - * transient switch from the frozen state and back. |
---|
5974 | | - */ |
---|
5975 | | - } |
---|
5976 | | - |
---|
5977 | | - spin_unlock_irq(&css_set_lock); |
---|
5978 | 6344 | } |
---|
| 6345 | + |
---|
| 6346 | + spin_unlock_irq(&css_set_lock); |
---|
5979 | 6347 | |
---|
5980 | 6348 | /* |
---|
5981 | 6349 | * Call ss->fork(). This must happen after @child is linked on |
---|
.. | .. |
---|
5985 | 6353 | do_each_subsys_mask(ss, i, have_fork_callback) { |
---|
5986 | 6354 | ss->fork(child); |
---|
5987 | 6355 | } while_each_subsys_mask(); |
---|
| 6356 | + |
---|
| 6357 | + /* Make the new cset the root_cset of the new cgroup namespace. */ |
---|
| 6358 | + if (kargs->flags & CLONE_NEWCGROUP) { |
---|
| 6359 | + struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset; |
---|
| 6360 | + |
---|
| 6361 | + get_css_set(cset); |
---|
| 6362 | + child->nsproxy->cgroup_ns->root_cset = cset; |
---|
| 6363 | + put_css_set(rcset); |
---|
| 6364 | + } |
---|
| 6365 | + |
---|
| 6366 | + cgroup_css_set_put_fork(kargs); |
---|
5988 | 6367 | } |
---|
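Taken together, the three hooks bracket the fork path: `cgroup_can_fork()` picks or creates the css_set (and leaves the locks held), `cgroup_cancel_fork()` undoes that on a later failure, and `cgroup_post_fork()` attaches the child and drops the locks. A simplified shape of the caller side, not verbatim `kernel/fork.c`:

```c
/* Sketch under stated assumptions: rest_of_fork() is a hypothetical
 * stand-in for the work copy_process() does between the two hooks. */
static int fork_cgroup_steps(struct task_struct *p,
			     struct kernel_clone_args *kargs)
{
	int ret = cgroup_can_fork(p, kargs);	/* css_set ready, locks held */

	if (ret)
		return ret;

	if (rest_of_fork(p)) {			/* hypothetical failure */
		cgroup_cancel_fork(p, kargs);	/* undo, drop the locks */
		return -EAGAIN;
	}

	cgroup_post_fork(p, kargs);		/* attach child, drop the locks */
	return 0;
}
```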
5989 | 6368 | |
---|
5990 | 6369 | /** |
---|
5991 | 6370 | * cgroup_exit - detach cgroup from exiting task |
---|
5992 | 6371 | * @tsk: pointer to task_struct of exiting process |
---|
5993 | 6372 | * |
---|
5994 | | - * Description: Detach cgroup from @tsk and release it. |
---|
| 6373 | + * Description: Detach cgroup from @tsk. |
---|
5995 | 6374 | * |
---|
5996 | | - * Note that cgroups marked notify_on_release force every task in |
---|
5997 | | - * them to take the global cgroup_mutex mutex when exiting. |
---|
5998 | | - * This could impact scaling on very large systems. Be reluctant to |
---|
5999 | | - * use notify_on_release cgroups where very high task exit scaling |
---|
6000 | | - * is required on large systems. |
---|
6001 | | - * |
---|
6002 | | - * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We |
---|
6003 | | - * call cgroup_exit() while the task is still competent to handle |
---|
6004 | | - * notify_on_release(), then leave the task attached to the root cgroup in |
---|
6005 | | - * each hierarchy for the remainder of its exit. No need to bother with |
---|
6006 | | - * init_css_set refcnting. init_css_set never goes away and we can't race |
---|
6007 | | - * with migration path - PF_EXITING is visible to migration path. |
---|
6008 | 6375 | */ |
---|
6009 | 6376 | void cgroup_exit(struct task_struct *tsk) |
---|
6010 | 6377 | { |
---|
.. | .. |
---|
6012 | 6379 | struct css_set *cset; |
---|
6013 | 6380 | int i; |
---|
6014 | 6381 | |
---|
6015 | | - /* |
---|
6016 | | - * Unlink from @tsk from its css_set. As migration path can't race |
---|
6017 | | - * with us, we can check css_set and cg_list without synchronization. |
---|
6018 | | - */ |
---|
| 6382 | + spin_lock_irq(&css_set_lock); |
---|
| 6383 | + |
---|
| 6384 | + WARN_ON_ONCE(list_empty(&tsk->cg_list)); |
---|
6019 | 6385 | cset = task_css_set(tsk); |
---|
| 6386 | + css_set_move_task(tsk, cset, NULL, false); |
---|
| 6387 | + list_add_tail(&tsk->cg_list, &cset->dying_tasks); |
---|
| 6388 | + cset->nr_tasks--; |
---|
6020 | 6389 | |
---|
6021 | | - if (!list_empty(&tsk->cg_list)) { |
---|
6022 | | - spin_lock_irq(&css_set_lock); |
---|
6023 | | - css_set_move_task(tsk, cset, NULL, false); |
---|
6024 | | - list_add_tail(&tsk->cg_list, &cset->dying_tasks); |
---|
6025 | | - cset->nr_tasks--; |
---|
| 6390 | + if (dl_task(tsk)) |
---|
| 6391 | + dec_dl_tasks_cs(tsk); |
---|
6026 | 6392 | |
---|
6027 | | - if (unlikely(cgroup_task_frozen(tsk))) |
---|
6028 | | - cgroup_freezer_frozen_exit(tsk); |
---|
6029 | | - else if (unlikely(cgroup_task_freeze(tsk))) |
---|
6030 | | - cgroup_update_frozen(task_dfl_cgroup(tsk)); |
---|
| 6393 | + WARN_ON_ONCE(cgroup_task_frozen(tsk)); |
---|
| 6394 | + if (unlikely(cgroup_task_freeze(tsk))) |
---|
| 6395 | + cgroup_update_frozen(task_dfl_cgroup(tsk)); |
---|
6031 | 6396 | |
---|
6032 | | - spin_unlock_irq(&css_set_lock); |
---|
6033 | | - } else { |
---|
6034 | | - get_css_set(cset); |
---|
6035 | | - } |
---|
| 6397 | + spin_unlock_irq(&css_set_lock); |
---|
6036 | 6398 | |
---|
6037 | 6399 | /* see cgroup_post_fork() for details */ |
---|
6038 | 6400 | do_each_subsys_mask(ss, i, have_exit_callback) { |
---|
.. | .. |
---|
6049 | 6411 | ss->release(task); |
---|
6050 | 6412 | } while_each_subsys_mask(); |
---|
6051 | 6413 | |
---|
6052 | | - if (use_task_css_set_links) { |
---|
6053 | | - spin_lock_irq(&css_set_lock); |
---|
6054 | | - css_set_skip_task_iters(task_css_set(task), task); |
---|
6055 | | - list_del_init(&task->cg_list); |
---|
6056 | | - spin_unlock_irq(&css_set_lock); |
---|
6057 | | - } |
---|
| 6414 | + spin_lock_irq(&css_set_lock); |
---|
| 6415 | + css_set_skip_task_iters(task_css_set(task), task); |
---|
| 6416 | + list_del_init(&task->cg_list); |
---|
| 6417 | + spin_unlock_irq(&css_set_lock); |
---|
6058 | 6418 | } |
---|
6059 | 6419 | |
---|
6060 | 6420 | void cgroup_free(struct task_struct *task) |
---|
.. | .. |
---|
6095 | 6455 | return 1; |
---|
6096 | 6456 | } |
---|
6097 | 6457 | __setup("cgroup_disable=", cgroup_disable); |
---|
| 6458 | + |
---|
| 6459 | +void __init __weak enable_debug_cgroup(void) { } |
---|
| 6460 | + |
---|
| 6461 | +static int __init enable_cgroup_debug(char *str) |
---|
| 6462 | +{ |
---|
| 6463 | + cgroup_debug = true; |
---|
| 6464 | + enable_debug_cgroup(); |
---|
| 6465 | + return 1; |
---|
| 6466 | +} |
---|
| 6467 | +__setup("cgroup_debug", enable_cgroup_debug); |
---|
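The `cgroup_debug` boot parameter pairs with the `CFTYPE_DEBUG` check added to `cgroup_addrm_files()` earlier in this diff: debug-only interface files stay hidden unless the kernel was booted with `cgroup_debug`. A file declared that way would look like this (names hypothetical):

```c
static struct cftype my_debug_files[] = {
	{
		.name	  = "debug.state",	/* hypothetical file */
		.flags	  = CFTYPE_DEBUG,	/* hidden without cgroup_debug */
		.seq_show = my_debug_show,	/* hypothetical handler */
	},
	{ }	/* terminator */
};
```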
6098 | 6468 | |
---|
6099 | 6469 | /** |
---|
6100 | 6470 | * css_tryget_online_from_dir - get corresponding css from a cgroup dentry |
---|
.. | .. |
---|
6195 | 6565 | */ |
---|
6196 | 6566 | struct cgroup *cgroup_get_from_fd(int fd) |
---|
6197 | 6567 | { |
---|
6198 | | - struct cgroup_subsys_state *css; |
---|
6199 | 6568 | struct cgroup *cgrp; |
---|
6200 | 6569 | struct file *f; |
---|
6201 | 6570 | |
---|
.. | .. |
---|
6203 | 6572 | if (!f) |
---|
6204 | 6573 | return ERR_PTR(-EBADF); |
---|
6205 | 6574 | |
---|
6206 | | - css = css_tryget_online_from_dir(f->f_path.dentry, NULL); |
---|
| 6575 | + cgrp = cgroup_get_from_file(f); |
---|
6207 | 6576 | fput(f); |
---|
6208 | | - if (IS_ERR(css)) |
---|
6209 | | - return ERR_CAST(css); |
---|
6210 | | - |
---|
6211 | | - cgrp = css->cgroup; |
---|
6212 | | - if (!cgroup_on_dfl(cgrp)) { |
---|
6213 | | - cgroup_put(cgrp); |
---|
6214 | | - return ERR_PTR(-EBADF); |
---|
6215 | | - } |
---|
6216 | | - |
---|
6217 | 6577 | return cgrp; |
---|
6218 | 6578 | } |
---|
6219 | 6579 | EXPORT_SYMBOL_GPL(cgroup_get_from_fd); |
---|
.. | .. |
---|
6304 | 6664 | cset = task_css_set(current); |
---|
6305 | 6665 | if (likely(cgroup_tryget(cset->dfl_cgrp))) { |
---|
6306 | 6666 | skcd->val = (unsigned long)cset->dfl_cgrp; |
---|
| 6667 | + cgroup_bpf_get(cset->dfl_cgrp); |
---|
6307 | 6668 | break; |
---|
6308 | 6669 | } |
---|
6309 | 6670 | cpu_relax(); |
---|
.. | .. |
---|
6314 | 6675 | |
---|
6315 | 6676 | void cgroup_sk_clone(struct sock_cgroup_data *skcd) |
---|
6316 | 6677 | { |
---|
6317 | | - /* Socket clone path */ |
---|
6318 | 6678 | if (skcd->val) { |
---|
6319 | 6679 | if (skcd->no_refcnt) |
---|
6320 | 6680 | return; |
---|
.. | .. |
---|
6324 | 6684 | * Don't use cgroup_get_live(). |
---|
6325 | 6685 | */ |
---|
6326 | 6686 | cgroup_get(sock_cgroup_ptr(skcd)); |
---|
| 6687 | + cgroup_bpf_get(sock_cgroup_ptr(skcd)); |
---|
6327 | 6688 | } |
---|
6328 | 6689 | } |
---|
6329 | 6690 | |
---|
6330 | 6691 | void cgroup_sk_free(struct sock_cgroup_data *skcd) |
---|
6331 | 6692 | { |
---|
| 6693 | + struct cgroup *cgrp = sock_cgroup_ptr(skcd); |
---|
| 6694 | + |
---|
6332 | 6695 | if (skcd->no_refcnt) |
---|
6333 | 6696 | return; |
---|
6334 | | - |
---|
6335 | | - cgroup_put(sock_cgroup_ptr(skcd)); |
---|
| 6697 | + cgroup_bpf_put(cgrp); |
---|
| 6698 | + cgroup_put(cgrp); |
---|
6336 | 6699 | } |
---|
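The `cgroup_bpf_get()`/`cgroup_bpf_put()` calls added in this hunk make every socket that pins a cgroup also pin that cgroup's BPF programs, so programs keep running on sockets that outlive their cgroup. The resulting pairing, no-refcnt sockets excepted:

```c
/* Reference pairing after this change:
 *   cgroup_sk_alloc():  cgroup_get()     + cgroup_bpf_get()
 *   cgroup_sk_clone():  cgroup_get()     + cgroup_bpf_get()
 *   cgroup_sk_free():   cgroup_bpf_put() + cgroup_put()
 */
```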
6337 | 6700 | |
---|
6338 | 6701 | #endif /* CONFIG_SOCK_CGROUP_DATA */ |
---|
6339 | 6702 | |
---|
6340 | 6703 | #ifdef CONFIG_CGROUP_BPF |
---|
6341 | | -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, |
---|
6342 | | - enum bpf_attach_type type, u32 flags) |
---|
| 6704 | +int cgroup_bpf_attach(struct cgroup *cgrp, |
---|
| 6705 | + struct bpf_prog *prog, struct bpf_prog *replace_prog, |
---|
| 6706 | + struct bpf_cgroup_link *link, |
---|
| 6707 | + enum bpf_attach_type type, |
---|
| 6708 | + u32 flags) |
---|
6343 | 6709 | { |
---|
6344 | 6710 | int ret; |
---|
6345 | 6711 | |
---|
6346 | 6712 | mutex_lock(&cgroup_mutex); |
---|
6347 | | - ret = __cgroup_bpf_attach(cgrp, prog, type, flags); |
---|
| 6713 | + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); |
---|
6348 | 6714 | mutex_unlock(&cgroup_mutex); |
---|
6349 | 6715 | return ret; |
---|
6350 | 6716 | } |
---|
| 6717 | + |
---|
6351 | 6718 | int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, |
---|
6352 | | - enum bpf_attach_type type, u32 flags) |
---|
| 6719 | + enum bpf_attach_type type) |
---|
6353 | 6720 | { |
---|
6354 | 6721 | int ret; |
---|
6355 | 6722 | |
---|
6356 | 6723 | mutex_lock(&cgroup_mutex); |
---|
6357 | | - ret = __cgroup_bpf_detach(cgrp, prog, type, flags); |
---|
| 6724 | + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); |
---|
6358 | 6725 | mutex_unlock(&cgroup_mutex); |
---|
6359 | 6726 | return ret; |
---|
6360 | 6727 | } |
---|
| 6728 | + |
---|
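The new `replace_prog` and `bpf_cgroup_link` parameters plumb two attach modes through to `__cgroup_bpf_attach()`: `BPF_F_REPLACE` (atomically swap out a specific program) and link-based attachment. With libbpf, the link-based path looks roughly like this (the fds are assumed to be an already-loaded program and an open cgroup directory):

```c
#include <bpf/bpf.h>

/* Sketch: closing the returned link fd detaches the program; while the
 * link exists, no other process can displace it. */
static int attach_ingress(int prog_fd, int cgroup_fd)
{
	return bpf_link_create(prog_fd, cgroup_fd,
			       BPF_CGROUP_INET_INGRESS, NULL);
}
```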
6361 | 6729 | int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, |
---|
6362 | 6730 | union bpf_attr __user *uattr) |
---|
6363 | 6731 | { |
---|
.. | .. |
---|
6418 | 6786 | static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, |
---|
6419 | 6787 | char *buf) |
---|
6420 | 6788 | { |
---|
6421 | | - return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); |
---|
| 6789 | + return snprintf(buf, PAGE_SIZE, |
---|
| 6790 | + "nsdelegate\n" |
---|
| 6791 | + "memory_localevents\n" |
---|
| 6792 | + "memory_recursiveprot\n"); |
---|
6422 | 6793 | } |
---|
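The strings reported by `features_show()` surface in `/sys/kernel/cgroup/features`, letting userspace probe which cgroup2 mount options the kernel supports before mounting, e.g. (illustrative):

```c
#include <sys/mount.h>

/* Mount cgroup2 with two of the advertised features enabled. */
static int mount_cgroup2(void)
{
	return mount("none", "/sys/fs/cgroup", "cgroup2", 0,
		     "nsdelegate,memory_recursiveprot");
}
```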
6423 | 6794 | static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); |
---|
6424 | 6795 | |
---|