From 244b2c5ca8b14627e4a17755e5922221e121c771 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Wed, 09 Oct 2024 06:15:07 +0000
Subject: [PATCH] change system file
---
kernel/kernel/cgroup/cgroup.c | 1412 +++++++++++++++++++++++++++++++++++++---------------------
1 files changed, 891 insertions(+), 521 deletions(-)
diff --git a/kernel/kernel/cgroup/cgroup.c b/kernel/kernel/cgroup/cgroup.c
index c0430df..193f092 100644
--- a/kernel/kernel/cgroup/cgroup.c
+++ b/kernel/kernel/cgroup/cgroup.c
@@ -54,12 +54,17 @@
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
+#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/deadline.h>
#include <linux/psi.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>
+#undef CREATE_TRACE_POINTS
+
+#include <trace/hooks/cgroup.h>
#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
MAX_CFTYPE_NAME + 2)
@@ -86,6 +91,7 @@
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+bool cgroup_debug __read_mostly;
/*
* Protects cgroup_idr and css_idr so that IDs can be released without
@@ -99,7 +105,7 @@
*/
static DEFINE_SPINLOCK(cgroup_file_kn_lock);
-struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
#define cgroup_assert_mutex_or_rcu_locked() \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
@@ -151,11 +157,7 @@
static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
-/*
- * The default hierarchy, reserved for the subsystems that are otherwise
- * unattached - it never has more than a single cgroup, and all tasks are
- * part of that cgroup.
- */
+/* the default hierarchy */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
@@ -264,9 +266,6 @@
* can be used to test whether a cgroup is on the default hierarchy for
* cases where a subsystem should behave differnetly depending on the
* interface version.
- *
- * The set of behaviors which change on the default hierarchy are still
- * being determined and the mount option is prefixed with __DEVEL__.
*
* List of changed behaviors:
*
@@ -502,7 +501,7 @@
rcu_read_lock();
css = cgroup_css(cgrp, ss);
- if (!css || !css_tryget_online(css))
+ if (css && !css_tryget_online(css))
css = NULL;
rcu_read_unlock();
@@ -510,7 +509,7 @@
}
/**
- * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
*
@@ -519,8 +518,8 @@
* enabled. If @ss is associated with the hierarchy @cgrp is on, this
* function is guaranteed to return non-NULL css.
*/
-static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
- struct cgroup_subsys *ss)
+static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
{
lockdep_assert_held(&cgroup_mutex);
@@ -538,6 +537,35 @@
}
return cgroup_css(cgrp, ss);
+}
+
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss. The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ *
+ * The returned css is not guaranteed to be online, and therefore it is the
+ * callers responsiblity to tryget a reference for it.
+ */
+struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ do {
+ css = cgroup_css(cgrp, ss);
+
+ if (css)
+ return css;
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+
+ return init_css_set.subsys[ss->id];
}
/**
@@ -655,10 +683,11 @@
*
* Should be called under cgroup_[tree_]mutex.
*/
-#define for_each_e_css(css, ssid, cgrp) \
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
- if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
- ; \
+#define for_each_e_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = cgroup_e_css_by_mask(cgrp, \
+ cgroup_subsys[(ssid)]))) \
+ ; \
else
/**
@@ -718,25 +747,28 @@
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
-struct css_set init_css_set = {
- .refcount = REFCOUNT_INIT(1),
- .dom_cset = &init_css_set,
- .tasks = LIST_HEAD_INIT(init_css_set.tasks),
- .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
- .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
- .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
- .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
- .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
- .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
- .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
-
- /*
- * The following field is re-initialized when this cset gets linked
- * in cgroup_init(). However, let's initialize the field
- * statically too so that the default cgroup can be accessed safely
- * early during boot.
- */
- .dfl_cgrp = &cgrp_dfl_root.cgrp,
+struct ext_css_set init_ext_css_set = {
+ .cset = {
+ .refcount = REFCOUNT_INIT(1),
+ .dom_cset = &init_css_set,
+ .tasks = LIST_HEAD_INIT(init_css_set.tasks),
+ .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
+ .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
+ .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
+ .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
+ .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+ /*
+ * The following field is re-initialized when this cset gets linked
+ * in cgroup_init(). However, let's initialize the field
+ * statically too so that the default cgroup can be accessed safely
+ * early during boot.
+ */
+ .dfl_cgrp = &cgrp_dfl_root.cgrp,
+ },
+ .mg_src_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_src_preload_node),
+ .mg_dst_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_dst_preload_node),
};
static int css_set_count = 1; /* 1 for init_css_set */
@@ -802,6 +834,8 @@
break;
cgroup1_check_for_release(cgrp);
+ TRACE_CGROUP_PATH(notify_populated, cgrp,
+ cgroup_is_populated(cgrp));
cgroup_file_notify(&cgrp->events_file);
child = cgrp;
@@ -881,8 +915,7 @@
/*
* We are synchronized through cgroup_threadgroup_rwsem
* against PF_EXITING setting such that we can't race
- * against cgroup_exit() changing the css_set to
- * init_css_set and dropping the old one.
+ * against cgroup_exit()/cgroup_free() dropping the css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -1060,7 +1093,7 @@
* @ss is in this hierarchy, so we want the
* effective css from @cgrp.
*/
- template[i] = cgroup_e_css(cgrp, ss);
+ template[i] = cgroup_e_css_by_mask(cgrp, ss);
} else {
/*
* @ss is not in this hierarchy, so we don't want
@@ -1162,6 +1195,7 @@
struct cgroup *cgrp)
{
struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
+ struct ext_css_set *ext_cset;
struct css_set *cset;
struct list_head tmp_links;
struct cgrp_cset_link *link;
@@ -1182,9 +1216,10 @@
if (cset)
return cset;
- cset = kzalloc(sizeof(*cset), GFP_KERNEL);
- if (!cset)
+ ext_cset = kzalloc(sizeof(*ext_cset), GFP_KERNEL);
+ if (!ext_cset)
return NULL;
+ cset = &ext_cset->cset;
/* Allocate all the cgrp_cset_link objects that we'll need */
if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
@@ -1202,6 +1237,8 @@
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&ext_cset->mg_src_preload_node);
+ INIT_LIST_HEAD(&ext_cset->mg_dst_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
/* Copy the set of subsystem state objects generated in
@@ -1291,10 +1328,7 @@
void cgroup_free_root(struct cgroup_root *root)
{
- if (root) {
- idr_destroy(&root->cgroup_idr);
- kfree(root);
- }
+ kfree(root);
}
static void cgroup_destroy_root(struct cgroup_root *root)
@@ -1356,6 +1390,8 @@
cset = current->nsproxy->cgroup_ns->root_cset;
if (cset == &init_css_set) {
res = &root->cgrp;
+ } else if (root == &cgrp_dfl_root) {
+ res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
@@ -1412,9 +1448,8 @@
struct cgroup_root *root)
{
/*
- * No need to lock the task - since we hold cgroup_mutex the
- * task can't change groups, so the only thing that can happen
- * is that it exits and its css is set back to init_css_set.
+ * No need to lock the task - since we hold css_set_lock the
+ * task can't change groups.
*/
return cset_cgroup_from_root(task_css_set(task), root);
}
@@ -1453,12 +1488,15 @@
struct cgroup_subsys *ss = cft->ss;
if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
- !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
- snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
- cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
+ const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
+
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
+ dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
cft->name);
- else
+ } else {
strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ }
return buf;
}
@@ -1699,7 +1737,7 @@
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
- int ssid, i, ret;
+ int ssid, ret;
u16 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -1743,7 +1781,8 @@
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
- struct css_set *cset;
+ struct css_set *cset, *cset_pos;
+ struct css_task_iter *it;
WARN_ON(!css || cgroup_css(dcgrp, ss));
@@ -1761,9 +1800,22 @@
css->cgroup = dcgrp;
spin_lock_irq(&css_set_lock);
- hash_for_each(css_set_table, i, cset, hlist)
+ WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
+ list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
+ e_cset_node[ss->id]) {
list_move_tail(&cset->e_cset_node[ss->id],
&dcgrp->e_csets[ss->id]);
+ /*
+ * all css_sets of scgrp together in same order to dcgrp,
+ * patch in-flight iterators to preserve correct iteration.
+ * since the iterator is always advanced right away and
+ * finished when it->cset_pos meets it->cset_head, so only
+ * update it->cset_head is enough here.
+ */
+ list_for_each_entry(it, &cset->task_iters, iters_node)
+ if (it->cset_head == &scgrp->e_csets[ss->id])
+ it->cset_head = &dcgrp->e_csets[ss->id];
+ }
spin_unlock_irq(&css_set_lock);
/* default hierarchy doesn't enable controllers by default */
@@ -1815,26 +1867,42 @@
return len;
}
-static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
+enum cgroup2_param {
+ Opt_nsdelegate,
+ Opt_memory_localevents,
+ Opt_memory_recursiveprot,
+ nr__cgroup2_params
+};
+
+static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
+ fsparam_flag("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("memory_localevents", Opt_memory_localevents),
+ fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
+ {}
+};
+
+static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *token;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct fs_parse_result result;
+ int opt;
- *root_flags = 0;
+ opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
- if (!data || *data == '\0')
+ switch (opt) {
+ case Opt_nsdelegate:
+ ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
-
- while ((token = strsep(&data, ",")) != NULL) {
- if (!strcmp(token, "nsdelegate")) {
- *root_flags |= CGRP_ROOT_NS_DELEGATE;
- continue;
- }
-
- pr_err("cgroup2: unknown option \"%s\"\n", token);
- return -EINVAL;
+ case Opt_memory_localevents:
+ ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+ return 0;
+ case Opt_memory_recursiveprot:
+ ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+ return 0;
}
-
- return 0;
+ return -EINVAL;
}
static void apply_cgroup_root_flags(unsigned int root_flags)
@@ -1844,6 +1912,16 @@
cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+
+ if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+
+ if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
}
}
@@ -1851,79 +1929,19 @@
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
+ seq_puts(seq, ",memory_localevents");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
+ seq_puts(seq, ",memory_recursiveprot");
return 0;
}
-static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
+static int cgroup_reconfigure(struct fs_context *fc)
{
- unsigned int root_flags;
- int ret;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
- ret = parse_cgroup_root_flags(data, &root_flags);
- if (ret)
- return ret;
-
- apply_cgroup_root_flags(root_flags);
+ apply_cgroup_root_flags(ctx->flags);
return 0;
-}
-
-/*
- * To reduce the fork() overhead for systems that are not actually using
- * their cgroups capability, we don't maintain the lists running through
- * each css_set to its tasks until we see the list actually used - in other
- * words after the first mount.
- */
-static bool use_task_css_set_links __read_mostly;
-
-static void cgroup_enable_task_cg_lists(void)
-{
- struct task_struct *p, *g;
-
- /*
- * We need tasklist_lock because RCU is not safe against
- * while_each_thread(). Besides, a forking task that has passed
- * cgroup_post_fork() without seeing use_task_css_set_links = 1
- * is not guaranteed to have its child immediately visible in the
- * tasklist if we walk through it with RCU.
- */
- read_lock(&tasklist_lock);
- spin_lock_irq(&css_set_lock);
-
- if (use_task_css_set_links)
- goto out_unlock;
-
- use_task_css_set_links = true;
-
- do_each_thread(g, p) {
- WARN_ON_ONCE(!list_empty(&p->cg_list) ||
- task_css_set(p) != &init_css_set);
-
- /*
- * We should check if the process is exiting, otherwise
- * it will race with cgroup_exit() in that the list
- * entry won't be deleted though the process has exited.
- * Do it while holding siglock so that we don't end up
- * racing against cgroup_exit().
- *
- * Interrupts were already disabled while acquiring
- * the css_set_lock, so we do not need to disable it
- * again when acquiring the sighand->siglock here.
- */
- spin_lock(&p->sighand->siglock);
- if (!(p->flags & PF_EXITING)) {
- struct css_set *cset = task_css_set(p);
-
- if (!css_set_populated(cset))
- css_set_update_populated(cset, true);
- list_add_tail(&p->cg_list, &cset->tasks);
- get_css_set(cset);
- cset->nr_tasks++;
- }
- spin_unlock(&p->sighand->siglock);
- } while_each_thread(g, p);
-out_unlock:
- spin_unlock_irq(&css_set_lock);
- read_unlock(&tasklist_lock);
}
static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1951,22 +1969,22 @@
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
-void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
+void init_cgroup_root(struct cgroup_fs_context *ctx)
{
+ struct cgroup_root *root = ctx->root;
struct cgroup *cgrp = &root->cgrp;
INIT_LIST_HEAD(&root->root_list);
atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
- idr_init(&root->cgroup_idr);
- root->flags = opts->flags;
- if (opts->release_agent)
- strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
- if (opts->name)
- strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
- if (opts->cpuset_clone_children)
+ root->flags = ctx->flags;
+ if (ctx->release_agent)
+ strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
+ if (ctx->name)
+ strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
+ if (ctx->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
@@ -1979,12 +1997,6 @@
int i, ret;
lockdep_assert_held(&cgroup_mutex);
-
- ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
- if (ret < 0)
- goto out;
- root_cgrp->id = ret;
- root_cgrp->ancestor_ids[0] = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
0, GFP_KERNEL);
@@ -2011,13 +2023,16 @@
root->kf_root = kernfs_create_root(kf_sops,
KERNFS_ROOT_CREATE_DEACTIVATED |
- KERNFS_ROOT_SUPPORT_EXPORTOP,
+ KERNFS_ROOT_SUPPORT_EXPORTOP |
+ KERNFS_ROOT_SUPPORT_USER_XATTR,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
goto exit_root_id;
}
root_cgrp->kn = root->kf_root->kn;
+ WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
+ root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
ret = css_populate_dir(&root_cgrp->self);
if (ret)
@@ -2055,7 +2070,6 @@
BUG_ON(!list_empty(&root_cgrp->self.children));
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
- kernfs_activate(root_cgrp->kn);
ret = 0;
goto out;
@@ -2071,91 +2085,117 @@
return ret;
}
-struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
- struct cgroup_root *root, unsigned long magic,
- struct cgroup_namespace *ns)
+int cgroup_do_get_tree(struct fs_context *fc)
{
- struct dentry *dentry;
- bool new_sb = false;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ int ret;
- dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
+ ctx->kfc.root = ctx->root->kf_root;
+ if (fc->fs_type == &cgroup2_fs_type)
+ ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
+ else
+ ctx->kfc.magic = CGROUP_SUPER_MAGIC;
+ ret = kernfs_get_tree(fc);
/*
* In non-init cgroup namespace, instead of root cgroup's dentry,
* we return the dentry corresponding to the cgroupns->root_cgrp.
*/
- if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
+ if (!ret && ctx->ns != &init_cgroup_ns) {
struct dentry *nsdentry;
- struct super_block *sb = dentry->d_sb;
+ struct super_block *sb = fc->root->d_sb;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- cgrp = cset_cgroup_from_root(ns->root_cset, root);
+ cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
nsdentry = kernfs_node_dentry(cgrp->kn, sb);
- dput(dentry);
- if (IS_ERR(nsdentry))
+ dput(fc->root);
+ if (IS_ERR(nsdentry)) {
deactivate_locked_super(sb);
- dentry = nsdentry;
+ ret = PTR_ERR(nsdentry);
+ nsdentry = NULL;
+ }
+ fc->root = nsdentry;
}
- if (!new_sb)
- cgroup_put(&root->cgrp);
+ if (!ctx->kfc.new_sb_created)
+ cgroup_put(&ctx->root->cgrp);
- return dentry;
+ return ret;
}
-static struct dentry *cgroup_mount(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data)
+/*
+ * Destroy a cgroup filesystem context.
+ */
+static void cgroup_fs_context_free(struct fs_context *fc)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
- struct dentry *dentry;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+
+ kfree(ctx->name);
+ kfree(ctx->release_agent);
+ put_cgroup_ns(ctx->ns);
+ kernfs_free_fs_context(fc);
+ kfree(ctx);
+}
+
+static int cgroup_get_tree(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
- get_cgroup_ns(ns);
+ cgrp_dfl_visible = true;
+ cgroup_get_live(&cgrp_dfl_root.cgrp);
+ ctx->root = &cgrp_dfl_root;
- /* Check if the caller has permission to mount. */
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
- put_cgroup_ns(ns);
- return ERR_PTR(-EPERM);
- }
+ ret = cgroup_do_get_tree(fc);
+ if (!ret)
+ apply_cgroup_root_flags(ctx->flags);
+ return ret;
+}
- /*
- * The first time anyone tries to mount a cgroup, enable the list
- * linking each css_set to its tasks and fix up all existing tasks.
- */
- if (!use_task_css_set_links)
- cgroup_enable_task_cg_lists();
+static const struct fs_context_operations cgroup_fs_context_ops = {
+ .free = cgroup_fs_context_free,
+ .parse_param = cgroup2_parse_param,
+ .get_tree = cgroup_get_tree,
+ .reconfigure = cgroup_reconfigure,
+};
- if (fs_type == &cgroup2_fs_type) {
- unsigned int root_flags;
+static const struct fs_context_operations cgroup1_fs_context_ops = {
+ .free = cgroup_fs_context_free,
+ .parse_param = cgroup1_parse_param,
+ .get_tree = cgroup1_get_tree,
+ .reconfigure = cgroup1_reconfigure,
+};
- ret = parse_cgroup_root_flags(data, &root_flags);
- if (ret) {
- put_cgroup_ns(ns);
- return ERR_PTR(ret);
- }
+/*
+ * Initialise the cgroup filesystem creation/reconfiguration context. Notably,
+ * we select the namespace we're going to use.
+ */
+static int cgroup_init_fs_context(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx;
- cgrp_dfl_visible = true;
- cgroup_get_live(&cgrp_dfl_root.cgrp);
+ ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
- dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
- CGROUP2_SUPER_MAGIC, ns);
- if (!IS_ERR(dentry))
- apply_cgroup_root_flags(root_flags);
- } else {
- dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
- CGROUP_SUPER_MAGIC, ns);
- }
-
- put_cgroup_ns(ns);
- return dentry;
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ fc->fs_private = &ctx->kfc;
+ if (fc->fs_type == &cgroup2_fs_type)
+ fc->ops = &cgroup_fs_context_ops;
+ else
+ fc->ops = &cgroup1_fs_context_ops;
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->ns->user_ns);
+ fc->global = true;
+ return 0;
}
static void cgroup_kill_sb(struct super_block *sb)
@@ -2171,25 +2211,73 @@
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
- !percpu_ref_is_dying(&root->cgrp.self.refcnt))
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+ cgroup_bpf_offline(&root->cgrp);
percpu_ref_kill(&root->cgrp.self.refcnt);
+ }
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
struct file_system_type cgroup_fs_type = {
- .name = "cgroup",
- .mount = cgroup_mount,
- .kill_sb = cgroup_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "cgroup",
+ .init_fs_context = cgroup_init_fs_context,
+ .parameters = cgroup1_fs_parameters,
+ .kill_sb = cgroup_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
static struct file_system_type cgroup2_fs_type = {
- .name = "cgroup2",
- .mount = cgroup_mount,
- .kill_sb = cgroup_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "cgroup2",
+ .init_fs_context = cgroup_init_fs_context,
+ .parameters = cgroup2_fs_parameters,
+ .kill_sb = cgroup_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
+
+#ifdef CONFIG_CPUSETS
+static const struct fs_context_operations cpuset_fs_context_ops = {
+ .get_tree = cgroup1_get_tree,
+ .free = cgroup_fs_context_free,
+};
+
+/*
+ * This is ugly, but preserves the userspace API for existing cpuset
+ * users. If someone tries to mount the "cpuset" filesystem, we
+ * silently switch it to mount "cgroup" instead
+ */
+static int cpuset_init_fs_context(struct fs_context *fc)
+{
+ char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
+ struct cgroup_fs_context *ctx;
+ int err;
+
+ err = cgroup_init_fs_context(fc);
+ if (err) {
+ kfree(agent);
+ return err;
+ }
+
+ fc->ops = &cpuset_fs_context_ops;
+
+ ctx = cgroup_fc2context(fc);
+ ctx->subsys_mask = 1 << cpuset_cgrp_id;
+ ctx->flags |= CGRP_ROOT_NOPREFIX;
+ ctx->release_agent = agent;
+
+ get_filesystem(&cgroup_fs_type);
+ put_filesystem(fc->fs_type);
+ fc->fs_type = &cgroup_fs_type;
+
+ return 0;
+}
+
+static struct file_system_type cpuset_fs_type = {
+ .name = "cpuset",
+ .init_fs_context = cpuset_init_fs_context,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+#endif
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns)
@@ -2256,6 +2344,47 @@
EXPORT_SYMBOL_GPL(task_cgroup_path);
/**
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ */
+static void cgroup_attach_lock(bool lock_threadgroup)
+{
+ cpus_read_lock();
+ if (lock_threadgroup)
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ */
+static void cgroup_attach_unlock(bool lock_threadgroup)
+{
+ if (lock_threadgroup)
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ cpus_read_unlock();
+}
+
+/**
* cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
* @mgctx: target migration context
@@ -2276,9 +2405,8 @@
if (task->flags & PF_EXITING)
return;
- /* leave @task alone if post_fork() hasn't linked it yet */
- if (list_empty(&task->cg_list))
- return;
+ /* cgroup_threadgroup_rwsem protects racing against forks */
+ WARN_ON_ONCE(list_empty(&task->cg_list));
cset = task_css_set(task);
if (!cset->mg_src_cgrp)
@@ -2310,6 +2438,7 @@
return cgroup_taskset_next(tset, dst_cssp);
}
+EXPORT_SYMBOL_GPL(cgroup_taskset_first);
/**
* cgroup_taskset_next - iterate to the next task in taskset
@@ -2356,6 +2485,7 @@
return NULL;
}
+EXPORT_SYMBOL_GPL(cgroup_taskset_next);
/**
* cgroup_taskset_migrate - migrate a taskset
@@ -2426,6 +2556,7 @@
do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ss->attach) {
tset->ssid = ssid;
+ trace_android_vh_cgroup_attach(ss, tset);
ss->attach(tset);
}
} while_each_subsys_mask();
@@ -2510,22 +2641,28 @@
*/
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
- LIST_HEAD(preloaded);
- struct css_set *cset, *tmp_cset;
+ struct ext_css_set *cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
- list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
+ mg_src_preload_node) {
+ cset->cset.mg_src_cgrp = NULL;
+ cset->cset.mg_dst_cgrp = NULL;
+ cset->cset.mg_dst_cset = NULL;
+ list_del_init(&cset->mg_src_preload_node);
+ put_css_set_locked(&cset->cset);
+ }
- list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
- cset->mg_src_cgrp = NULL;
- cset->mg_dst_cgrp = NULL;
- cset->mg_dst_cset = NULL;
- list_del_init(&cset->mg_preload_node);
- put_css_set_locked(cset);
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
+ mg_dst_preload_node) {
+ cset->cset.mg_src_cgrp = NULL;
+ cset->cset.mg_dst_cgrp = NULL;
+ cset->cset.mg_dst_cset = NULL;
+ list_del_init(&cset->mg_dst_preload_node);
+ put_css_set_locked(&cset->cset);
}
spin_unlock_irq(&css_set_lock);
@@ -2552,6 +2689,7 @@
struct cgroup_mgctx *mgctx)
{
struct cgroup *src_cgrp;
+ struct ext_css_set *ext_src_cset;
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
@@ -2565,8 +2703,9 @@
return;
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
+ ext_src_cset = container_of(src_cset, struct ext_css_set, cset);
- if (!list_empty(&src_cset->mg_preload_node))
+ if (!list_empty(&ext_src_cset->mg_src_preload_node))
return;
WARN_ON(src_cset->mg_src_cgrp);
@@ -2577,7 +2716,7 @@
src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
- list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
+ list_add_tail(&ext_src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
}
/**
@@ -2596,20 +2735,23 @@
*/
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
- struct css_set *src_cset, *tmp_cset;
+ struct ext_css_set *ext_src_set, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
/* look up the dst cset for each src cset and link it to src */
- list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
- mg_preload_node) {
+ list_for_each_entry_safe(ext_src_set, tmp_cset, &mgctx->preloaded_src_csets,
+ mg_src_preload_node) {
+ struct css_set *src_cset = &ext_src_set->cset;
struct css_set *dst_cset;
+ struct ext_css_set *ext_dst_cset;
struct cgroup_subsys *ss;
int ssid;
dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
if (!dst_cset)
return -ENOMEM;
+ ext_dst_cset = container_of(dst_cset, struct ext_css_set, cset);
WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
@@ -2621,7 +2763,7 @@
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
src_cset->mg_dst_cgrp = NULL;
- list_del_init(&src_cset->mg_preload_node);
+ list_del_init(&ext_src_set->mg_src_preload_node);
put_css_set(src_cset);
put_css_set(dst_cset);
continue;
@@ -2629,8 +2771,8 @@
src_cset->mg_dst_cset = dst_cset;
- if (list_empty(&dst_cset->mg_preload_node))
- list_add_tail(&dst_cset->mg_preload_node,
+ if (list_empty(&ext_dst_cset->mg_dst_preload_node))
+ list_add_tail(&ext_dst_cset->mg_dst_preload_node,
&mgctx->preloaded_dst_csets);
else
put_css_set(dst_cset);
@@ -2698,11 +2840,7 @@
{
DEFINE_CGROUP_MGCTX(mgctx);
struct task_struct *task;
- int ret;
-
- ret = cgroup_migrate_vet_dst(dst_cgrp);
- if (ret)
- return ret;
+ int ret = 0;
/* look up all src csets */
spin_lock_irq(&css_set_lock);
@@ -2729,16 +2867,28 @@
return ret;
}
-struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
- __acquires(&cgroup_threadgroup_rwsem)
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
+ bool *threadgroup_locked,
+ struct cgroup *dst_cgrp)
{
struct task_struct *tsk;
pid_t pid;
+ bool force_migration = false;
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return ERR_PTR(-EINVAL);
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ /*
+ * If we migrate a single thread, we don't care about threadgroup
+ * stability. If the thread is `current`, it won't exit(2) under our
+ * hands or change PID through exec(2). We exclude
+ * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
+ * callers by cgroup_mutex.
+ * Therefore, we can skip the global lock.
+ */
+ lockdep_assert_held(&cgroup_mutex);
+ *threadgroup_locked = pid || threadgroup;
+ cgroup_attach_lock(*threadgroup_locked);
rcu_read_lock();
if (pid) {
@@ -2754,13 +2904,16 @@
if (threadgroup)
tsk = tsk->group_leader;
+ if (tsk->flags & PF_KTHREAD)
+ trace_android_rvh_cgroup_force_kthread_migration(tsk, dst_cgrp, &force_migration);
+
/*
* kthreads may acquire PF_NO_SETAFFINITY during initialization.
* If userland migrates such a kthread to a non-root cgroup, it can
* become trapped in a cpuset, or RT kthread may be born in a
* cgroup with no rt_runtime allocated. Just say no.
*/
- if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
+ if (!force_migration && (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY))) {
tsk = ERR_PTR(-EINVAL);
goto out_unlock_threadgroup;
}
@@ -2769,14 +2922,14 @@
goto out_unlock_rcu;
out_unlock_threadgroup:
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(*threadgroup_locked);
+ *threadgroup_locked = false;
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task)
- __releases(&cgroup_threadgroup_rwsem)
+void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
{
struct cgroup_subsys *ss;
int ssid;
@@ -2784,7 +2937,8 @@
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(threadgroup_locked);
+
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
@@ -2799,7 +2953,7 @@
do_each_subsys_mask(ss, ssid, ss_mask) {
if (printed)
seq_putc(seq, ' ');
- seq_printf(seq, "%s", ss->name);
+ seq_puts(seq, ss->name);
printed = true;
} while_each_subsys_mask();
if (printed)
@@ -2838,12 +2992,11 @@
DEFINE_CGROUP_MGCTX(mgctx);
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
- struct css_set *src_cset;
+ struct ext_css_set *ext_src_set;
+ bool has_tasks;
int ret;
lockdep_assert_held(&cgroup_mutex);
-
- percpu_down_write(&cgroup_threadgroup_rwsem);
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
@@ -2855,17 +3008,27 @@
}
spin_unlock_irq(&css_set_lock);
+ /*
+ * We need to write-lock threadgroup_rwsem while migrating tasks.
+ * However, if there are no source csets for @cgrp, changing its
+ * controllers isn't gonna produce any task migrations and the
+ * write-locking can be skipped safely.
+ */
+ has_tasks = !list_empty(&mgctx.preloaded_src_csets);
+ cgroup_attach_lock(has_tasks);
+
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
goto out_finish;
spin_lock_irq(&css_set_lock);
- list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
+ list_for_each_entry(ext_src_set, &mgctx.preloaded_src_csets,
+ mg_src_preload_node) {
struct task_struct *task, *ntask;
/* all tasks in src_csets need to be migrated */
- list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
+ list_for_each_entry_safe(task, ntask, &ext_src_set->cset.tasks, cg_list)
cgroup_migrate_add_task(task, &mgctx);
}
spin_unlock_irq(&css_set_lock);
@@ -2873,7 +3036,7 @@
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(has_tasks);
return ret;
}
@@ -3106,7 +3269,7 @@
return ret;
/*
- * At this point, cgroup_e_css() results reflect the new csses
+ * At this point, cgroup_e_css_by_mask() results reflect the new csses
* making the following cgroup_update_dfl_csses() properly update
* css associations of all tasks in the subtree.
*/
@@ -3506,22 +3669,33 @@
#ifdef CONFIG_PSI
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+
+ return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+
+ return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+
+ return psi_show(seq, psi, PSI_CPU);
}
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
struct cgroup *cgrp;
+ struct psi_group *psi;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
@@ -3530,14 +3704,20 @@
cgroup_get(cgrp);
cgroup_kn_unlock(of->kn);
- new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
+ /* Allow only one trigger per file descriptor */
+ if (ctx->psi.trigger) {
+ cgroup_put(cgrp);
+ return -EBUSY;
+ }
+
+ psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ new = psi_trigger_create(psi, buf, nbytes, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
}
- psi_trigger_replace(&of->priv, new);
-
+ smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);
return nbytes;
@@ -3567,12 +3747,15 @@
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt)
{
- return psi_trigger_poll(&of->priv, of->file, pt);
+ struct cgroup_file_ctx *ctx = of->priv;
+ return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
- psi_trigger_replace(&of->priv, NULL);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ psi_trigger_destroy(ctx->psi.trigger);
}
bool cgroup_psi_enabled(void)
@@ -3625,28 +3808,50 @@
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
+ struct cgroup_file_ctx *ctx;
+ int ret;
- if (cft->open)
- return cft->open(of);
- return 0;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ of->priv = ctx;
+
+ if (!cft->open)
+ return 0;
+
+ ret = cft->open(of);
+ if (ret) {
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
+ }
+ return ret;
}
static void cgroup_file_release(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
if (cft->release)
cft->release(of);
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of->kn->priv;
struct cgroup_subsys_state *css;
int ret;
+
+ if (!nbytes)
+ return 0;
/*
* If namespaces are delegation boundaries, disallow writes to
@@ -3656,7 +3861,7 @@
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
- ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+ ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
return -EPERM;
if (cft->write)
@@ -3843,7 +4048,8 @@
continue;
if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
continue;
-
+ if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
+ continue;
if (is_add) {
ret = cgroup_add_file(css, cgrp, cft);
if (ret) {
@@ -4028,6 +4234,7 @@
cft->flags |= __CFTYPE_ONLY_ON_DFL;
return cgroup_add_cftypes(ss, cfts);
}
+EXPORT_SYMBOL_GPL(cgroup_add_dfl_cftypes);
/**
* cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
@@ -4045,6 +4252,7 @@
cft->flags |= __CFTYPE_NOT_ON_DFL;
return cgroup_add_cftypes(ss, cfts);
}
+EXPORT_SYMBOL_GPL(cgroup_add_legacy_cftypes);
/**
* cgroup_file_notify - generate a file modified event for a cgroup_file
@@ -4120,7 +4328,8 @@
} else if (likely(!(pos->flags & CSS_RELEASED))) {
next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
} else {
- list_for_each_entry_rcu(next, &parent->children, sibling)
+ list_for_each_entry_rcu(next, &parent->children, sibling,
+ lockdep_is_held(&cgroup_mutex))
if (next->serial_nr > pos->serial_nr)
break;
}
@@ -4133,6 +4342,7 @@
return next;
return NULL;
}
+EXPORT_SYMBOL_GPL(css_next_child);
/**
* css_next_descendant_pre - find the next descendant for pre-order walk
@@ -4182,6 +4392,7 @@
return NULL;
}
+EXPORT_SYMBOL_GPL(css_next_descendant_pre);
/**
* css_rightmost_descendant - return the rightmost descendant of a css
@@ -4362,29 +4573,24 @@
lockdep_assert_held(&css_set_lock);
- /* Advance to the next non-empty css_set */
- do {
- cset = css_task_iter_next_css_set(it);
- if (!cset) {
- it->task_pos = NULL;
- return;
+ /* Advance to the next non-empty css_set and find first non-empty tasks list*/
+ while ((cset = css_task_iter_next_css_set(it))) {
+ if (!list_empty(&cset->tasks)) {
+ it->cur_tasks_head = &cset->tasks;
+ break;
+ } else if (!list_empty(&cset->mg_tasks)) {
+ it->cur_tasks_head = &cset->mg_tasks;
+ break;
+ } else if (!list_empty(&cset->dying_tasks)) {
+ it->cur_tasks_head = &cset->dying_tasks;
+ break;
}
- } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
-
- if (!list_empty(&cset->tasks)) {
- it->task_pos = cset->tasks.next;
- it->cur_tasks_head = &cset->tasks;
- } else if (!list_empty(&cset->mg_tasks)) {
- it->task_pos = cset->mg_tasks.next;
- it->cur_tasks_head = &cset->mg_tasks;
- } else {
- it->task_pos = cset->dying_tasks.next;
- it->cur_tasks_head = &cset->dying_tasks;
}
-
- it->tasks_head = &cset->tasks;
- it->mg_tasks_head = &cset->mg_tasks;
- it->dying_tasks_head = &cset->dying_tasks;
+ if (!cset) {
+ it->task_pos = NULL;
+ return;
+ }
+ it->task_pos = it->cur_tasks_head->next;
/*
* We don't keep css_sets locked across iteration steps and thus
@@ -4429,24 +4635,24 @@
repeat:
if (it->task_pos) {
/*
- * Advance iterator to find next entry. cset->tasks is
- * consumed first and then ->mg_tasks. After ->mg_tasks,
- * we move onto the next cset.
+ * Advance iterator to find next entry. We go through cset
+ * tasks, mg_tasks and dying_tasks, when consumed we move onto
+ * the next cset.
*/
if (it->flags & CSS_TASK_ITER_SKIPPED)
it->flags &= ~CSS_TASK_ITER_SKIPPED;
else
it->task_pos = it->task_pos->next;
- if (it->task_pos == it->tasks_head) {
- it->task_pos = it->mg_tasks_head->next;
- it->cur_tasks_head = it->mg_tasks_head;
+ if (it->task_pos == &it->cur_cset->tasks) {
+ it->cur_tasks_head = &it->cur_cset->mg_tasks;
+ it->task_pos = it->cur_tasks_head->next;
}
- if (it->task_pos == it->mg_tasks_head) {
- it->task_pos = it->dying_tasks_head->next;
- it->cur_tasks_head = it->dying_tasks_head;
+ if (it->task_pos == &it->cur_cset->mg_tasks) {
+ it->cur_tasks_head = &it->cur_cset->dying_tasks;
+ it->task_pos = it->cur_tasks_head->next;
}
- if (it->task_pos == it->dying_tasks_head)
+ if (it->task_pos == &it->cur_cset->dying_tasks)
css_task_iter_advance_css_set(it);
} else {
/* called from start, proceed to the first cset */
@@ -4464,12 +4670,12 @@
goto repeat;
/* and dying leaders w/o live member threads */
- if (it->cur_tasks_head == it->dying_tasks_head &&
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
!atomic_read(&task->signal->live))
goto repeat;
} else {
/* skip all dying ones */
- if (it->cur_tasks_head == it->dying_tasks_head)
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
goto repeat;
}
}
@@ -4488,9 +4694,6 @@
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it)
{
- /* no one should try to iterate before mounting cgroups */
- WARN_ON_ONCE(!use_task_css_set_links);
-
memset(it, 0, sizeof(*it));
spin_lock_irq(&css_set_lock);
@@ -4567,21 +4770,21 @@
static void cgroup_procs_release(struct kernfs_open_file *of)
{
- if (of->priv) {
- css_task_iter_end(of->priv);
- kfree(of->priv);
- }
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ if (ctx->procs.started)
+ css_task_iter_end(&ctx->procs.iter);
}
static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
if (pos)
(*pos)++;
- return css_task_iter_next(it);
+ return css_task_iter_next(&ctx->procs.iter);
}
static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
@@ -4589,21 +4792,18 @@
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct css_task_iter *it = &ctx->procs.iter;
/*
* When a seq_file is seeked, it's always traversed sequentially
* from position 0, so we can simply keep iterating on !0 *pos.
*/
- if (!it) {
+ if (!ctx->procs.started) {
if (WARN_ON_ONCE((*pos)))
return ERR_PTR(-EINVAL);
-
- it = kzalloc(sizeof(*it), GFP_KERNEL);
- if (!it)
- return ERR_PTR(-ENOMEM);
- of->priv = it;
css_task_iter_start(&cgrp->self, iter_flags, it);
+ ctx->procs.started = true;
} else if (!(*pos)) {
css_task_iter_end(it);
css_task_iter_start(&cgrp->self, iter_flags, it);
@@ -4636,13 +4836,28 @@
return 0;
}
+static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
+{
+ int ret;
+ struct inode *inode;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
+ if (!inode)
+ return -ENOMEM;
+
+ ret = inode_permission(inode, MAY_WRITE);
+ iput(inode);
+ return ret;
+}
+
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
- struct super_block *sb)
+ struct super_block *sb,
+ struct cgroup_namespace *ns)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *com_cgrp = src_cgrp;
- struct inode *inode;
int ret;
lockdep_assert_held(&cgroup_mutex);
@@ -4652,12 +4867,7 @@
com_cgrp = cgroup_parent(com_cgrp);
/* %current should be authorized to migrate to the common ancestor */
- inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
- if (!inode)
- return -ENOMEM;
-
- ret = inode_permission(inode, MAY_WRITE);
- iput(inode);
+ ret = cgroup_may_write(com_cgrp, sb);
if (ret)
return ret;
@@ -4673,18 +4883,42 @@
return 0;
}
+static int cgroup_attach_permissions(struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb, bool threadgroup,
+ struct cgroup_namespace *ns)
+{
+ int ret = 0;
+
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
+ if (ret)
+ return ret;
+
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
+ if (ret)
+ return ret;
+
+ if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
+ ret = -EOPNOTSUPP;
+
+ return ret;
+}
+
static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
+ const struct cred *saved_cred;
ssize_t ret;
+ bool threadgroup_locked;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, true);
+ task = cgroup_procs_write_start(buf, true, &threadgroup_locked, dst_cgrp);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -4694,15 +4928,23 @@
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb);
+ /*
+ * Process and thread migrations follow same delegation rule. Check
+ * permissions using the credentials from file open to protect against
+ * inherited fd attacks.
+ */
+ saved_cred = override_creds(of->file->f_cred);
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb, true,
+ ctx->ns);
+ revert_creds(saved_cred);
if (ret)
goto out_finish;
ret = cgroup_attach_task(dst_cgrp, task, true);
out_finish:
- cgroup_procs_write_finish(task);
+ cgroup_procs_write_finish(task, threadgroup_locked);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -4717,9 +4959,12 @@
static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
+ const struct cred *saved_cred;
ssize_t ret;
+ bool threadgroup_locked;
buf = strstrip(buf);
@@ -4727,7 +4972,7 @@
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, false);
+ task = cgroup_procs_write_start(buf, false, &threadgroup_locked, dst_cgrp);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -4737,21 +4982,23 @@
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
- /* thread migrations follow the cgroup.procs delegation rule */
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb);
+ /*
+ * Process and thread migrations follow same delegation rule. Check
+ * permissions using the credentials from file open to protect against
+ * inherited fd attacks.
+ */
+ saved_cred = override_creds(of->file->f_cred);
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb, false,
+ ctx->ns);
+ revert_creds(saved_cred);
if (ret)
- goto out_finish;
-
- /* and must be contained in the same domain */
- ret = -EOPNOTSUPP;
- if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
goto out_finish;
ret = cgroup_attach_task(dst_cgrp, task, false);
out_finish:
- cgroup_procs_write_finish(task);
+ cgroup_procs_write_finish(task, threadgroup_locked);
out_unlock:
cgroup_kn_unlock(of->kn);
@@ -4823,13 +5070,12 @@
},
{
.name = "cpu.stat",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_stat_show,
},
#ifdef CONFIG_PSI
{
.name = "io.pressure",
- .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE,
+ .flags = CFTYPE_PRESSURE,
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4837,7 +5083,7 @@
},
{
.name = "memory.pressure",
- .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE,
+ .flags = CFTYPE_PRESSURE,
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4845,7 +5091,7 @@
},
{
.name = "cpu.pressure",
- .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_PRESSURE,
+ .flags = CFTYPE_PRESSURE,
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4927,10 +5173,10 @@
}
}
-static void css_release_work_fn(struct swork_event *sev)
+static void css_release_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
- container_of(sev, struct cgroup_subsys_state, destroy_swork);
+ container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
@@ -4964,9 +5210,6 @@
tcgrp->nr_dying_descendants--;
spin_unlock_irq(&css_set_lock);
- cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
- cgrp->id = -1;
-
/*
* There are two control paths which try to determine
* cgroup from dentry without going through kernfs -
@@ -4977,8 +5220,6 @@
if (cgrp->kn)
RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
NULL);
-
- cgroup_bpf_put(cgrp);
}
mutex_unlock(&cgroup_mutex);
@@ -4992,8 +5233,8 @@
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
- INIT_SWORK(&css->destroy_swork, css_release_work_fn);
- swork_queue(&css->destroy_swork);
+ INIT_WORK(&css->destroy_work, css_release_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
}
static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5133,10 +5374,12 @@
* it isn't associated with its kernfs_node and doesn't have the control
* mask applied.
*/
-static struct cgroup *cgroup_create(struct cgroup *parent)
+static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
+ umode_t mode)
{
struct cgroup_root *root = parent->root;
struct cgroup *cgrp, *tcgrp;
+ struct kernfs_node *kn;
int level = parent->level + 1;
int ret;
@@ -5156,15 +5399,13 @@
goto out_cancel_ref;
}
- /*
- * Temporarily set the pointer to NULL, so idr_find() won't return
- * a half-baked cgroup.
- */
- cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
- if (cgrp->id < 0) {
- ret = -ENOMEM;
+ /* create the directory */
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
goto out_stat_exit;
}
+ cgrp->kn = kn;
init_cgroup_housekeeping(cgrp);
@@ -5174,7 +5415,7 @@
ret = psi_cgroup_alloc(cgrp);
if (ret)
- goto out_idr_free;
+ goto out_kernfs_remove;
ret = cgroup_bpf_inherit(cgrp);
if (ret)
@@ -5198,7 +5439,7 @@
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
+ cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
@@ -5228,12 +5469,6 @@
cgroup_get_live(parent);
/*
- * @cgrp is now fully operational. If something fails after this
- * point, it'll be released via the normal destruction path.
- */
- cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
-
- /*
* On the default hierarchy, a child doesn't automatically inherit
* subtree_control from the parent. Each is configured manually.
*/
@@ -5246,8 +5481,8 @@
out_psi_free:
psi_cgroup_free(cgrp);
-out_idr_free:
- cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_kernfs_remove:
+ kernfs_remove(cgrp->kn);
out_stat_exit:
if (cgroup_on_dfl(parent))
cgroup_rstat_exit(cgrp);
@@ -5284,7 +5519,6 @@
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
- struct kernfs_node *kn;
int ret;
/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
@@ -5300,27 +5534,19 @@
goto out_unlock;
}
- cgrp = cgroup_create(parent);
+ cgrp = cgroup_create(parent, name, mode);
if (IS_ERR(cgrp)) {
ret = PTR_ERR(cgrp);
goto out_unlock;
}
- /* create the directory */
- kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
- if (IS_ERR(kn)) {
- ret = PTR_ERR(kn);
- goto out_destroy;
- }
- cgrp->kn = kn;
-
/*
* This extra ref will be put in cgroup_free_fn() and guarantees
* that @cgrp->kn is always accessible.
*/
- kernfs_get(kn);
+ kernfs_get(cgrp->kn);
- ret = cgroup_kn_set_ugid(kn);
+ ret = cgroup_kn_set_ugid(cgrp->kn);
if (ret)
goto out_destroy;
@@ -5335,7 +5561,7 @@
TRACE_CGROUP_PATH(mkdir, cgrp);
/* let's create and online css's */
- kernfs_activate(kn);
+ kernfs_activate(cgrp->kn);
ret = 0;
goto out_unlock;
@@ -5512,6 +5738,8 @@
cgroup1_check_for_release(parent);
+ cgroup_bpf_offline(cgrp);
+
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -5537,7 +5765,6 @@
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
.show_options = cgroup_show_options,
- .remount_fs = cgroup_remount,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.show_path = cgroup_show_path,
@@ -5604,11 +5831,12 @@
*/
int __init cgroup_init_early(void)
{
- static struct cgroup_sb_opts __initdata opts;
+ static struct cgroup_fs_context __initdata ctx;
struct cgroup_subsys *ss;
int i;
- init_cgroup_root(&cgrp_dfl_root, &opts);
+ ctx.root = &cgrp_dfl_root;
+ init_cgroup_root(&ctx);
cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
@@ -5644,14 +5872,13 @@
int ssid;
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
- BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
cgroup_rstat_boot();
/*
- * The latency of the synchronize_sched() is too high for cgroups,
+ * The latency of the synchronize_rcu() is too high for cgroups,
* avoid it at the cost of forcing all readers into the slow path.
*/
rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
@@ -5735,6 +5962,9 @@
WARN_ON(register_filesystem(&cgroup_fs_type));
WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
+#ifdef CONFIG_CPUSETS
+ WARN_ON(register_filesystem(&cpuset_fs_type));
+#endif
return 0;
}
@@ -5751,17 +5981,15 @@
*/
cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
BUG_ON(!cgroup_destroy_wq);
- BUG_ON(swork_get());
return 0;
}
core_initcall(cgroup_wq_init);
-void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
- char *buf, size_t buflen)
+void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{
struct kernfs_node *kn;
- kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
return;
kernfs_path(kn, buf, buflen);
@@ -5851,8 +6079,7 @@
* @child: pointer to task_struct of forking parent process.
*
* A task is associated with the init_css_set until cgroup_post_fork()
- * attaches it to the parent's css_set. Empty cg_list indicates that
- * @child isn't holding reference to its css_set.
+ * attaches it to the target css_set.
*/
void cgroup_fork(struct task_struct *child)
{
@@ -5860,21 +6087,172 @@
INIT_LIST_HEAD(&child->cg_list);
}
+static struct cgroup *cgroup_get_from_file(struct file *f)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup *cgrp;
+
+ css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+ if (IS_ERR(css))
+ return ERR_CAST(css);
+
+ cgrp = css->cgroup;
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
+
+ return cgrp;
+}
+
+/**
+ * cgroup_css_set_fork - find or create a css_set for a child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * This functions finds or creates a new css_set which the child
+ * process will be attached to in cgroup_post_fork(). By default,
+ * the child process will be given the same css_set as its parent.
+ *
+ * If CLONE_INTO_CGROUP is specified this function will try to find an
+ * existing css_set which includes the requested cgroup and if not create
+ * a new css_set that the child will be attached to later. If this function
+ * succeeds it will hold cgroup_threadgroup_rwsem on return. If
+ * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
+ * before grabbing cgroup_threadgroup_rwsem and will hold a reference
+ * to the target cgroup.
+ */
+static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
+ __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
+{
+ int ret;
+ struct cgroup *dst_cgrp = NULL;
+ struct css_set *cset;
+ struct super_block *sb;
+ struct file *f;
+
+ if (kargs->flags & CLONE_INTO_CGROUP)
+ mutex_lock(&cgroup_mutex);
+
+ cgroup_threadgroup_change_begin(current);
+
+ spin_lock_irq(&css_set_lock);
+ cset = task_css_set(current);
+ get_css_set(cset);
+ spin_unlock_irq(&css_set_lock);
+
+ if (!(kargs->flags & CLONE_INTO_CGROUP)) {
+ kargs->cset = cset;
+ return 0;
+ }
+
+ f = fget_raw(kargs->cgroup);
+ if (!f) {
+ ret = -EBADF;
+ goto err;
+ }
+ sb = f->f_path.dentry->d_sb;
+
+ dst_cgrp = cgroup_get_from_file(f);
+ if (IS_ERR(dst_cgrp)) {
+ ret = PTR_ERR(dst_cgrp);
+ dst_cgrp = NULL;
+ goto err;
+ }
+
+ if (cgroup_is_dead(dst_cgrp)) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ /*
+ * Verify that we the target cgroup is writable for us. This is
+ * usually done by the vfs layer but since we're not going through
+ * the vfs layer here we need to do it "manually".
+ */
+ ret = cgroup_may_write(dst_cgrp, sb);
+ if (ret)
+ goto err;
+
+ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
+ !(kargs->flags & CLONE_THREAD),
+ current->nsproxy->cgroup_ns);
+ if (ret)
+ goto err;
+
+ kargs->cset = find_css_set(cset, dst_cgrp);
+ if (!kargs->cset) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ put_css_set(cset);
+ fput(f);
+ kargs->cgrp = dst_cgrp;
+ return ret;
+
+err:
+ cgroup_threadgroup_change_end(current);
+ mutex_unlock(&cgroup_mutex);
+ if (f)
+ fput(f);
+ if (dst_cgrp)
+ cgroup_put(dst_cgrp);
+ put_css_set(cset);
+ if (kargs->cset)
+ put_css_set(kargs->cset);
+ return ret;
+}
+
+/**
+ * cgroup_css_set_put_fork - drop references we took during fork
+ * @kargs: the arguments passed to create the child process
+ *
+ * Drop references to the prepared css_set and target cgroup if
+ * CLONE_INTO_CGROUP was requested.
+ */
+static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
+{
+ struct cgroup *cgrp = kargs->cgrp;
+ struct css_set *cset = kargs->cset;
+
+ cgroup_threadgroup_change_end(current);
+
+ if (cset) {
+ put_css_set(cset);
+ kargs->cset = NULL;
+ }
+
+ if (kargs->flags & CLONE_INTO_CGROUP) {
+ mutex_unlock(&cgroup_mutex);
+ if (cgrp) {
+ cgroup_put(cgrp);
+ kargs->cgrp = NULL;
+ }
+ }
+}
+
/**
* cgroup_can_fork - called on a new task before the process is exposed
- * @child: the task in question.
+ * @child: the child process
*
- * This calls the subsystem can_fork() callbacks. If the can_fork() callback
- * returns an error, the fork aborts with that error code. This allows for
- * a cgroup subsystem to conditionally allow or deny new forks.
+ * This prepares a new css_set for the child process which the child will
+ * be attached to in cgroup_post_fork().
+ * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
+ * callback returns an error, the fork aborts with that error code. This
+ * allows for a cgroup subsystem to conditionally allow or deny new forks.
*/
-int cgroup_can_fork(struct task_struct *child)
+int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{
struct cgroup_subsys *ss;
int i, j, ret;
+ ret = cgroup_css_set_fork(kargs);
+ if (ret)
+ return ret;
+
do_each_subsys_mask(ss, i, have_canfork_callback) {
- ret = ss->can_fork(child);
+ ret = ss->can_fork(child, kargs->cset);
if (ret)
goto out_revert;
} while_each_subsys_mask();
@@ -5886,97 +6264,86 @@
if (j >= i)
break;
if (ss->cancel_fork)
- ss->cancel_fork(child);
+ ss->cancel_fork(child, kargs->cset);
}
+
+ cgroup_css_set_put_fork(kargs);
return ret;
}
/**
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
- * @child: the task in question
+ * @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* This calls the cancel_fork() callbacks if a fork failed *after*
- * cgroup_can_fork() succeded.
+ * cgroup_can_fork() succeded and cleans up references we took to
+ * prepare a new css_set for the child process in cgroup_can_fork().
*/
-void cgroup_cancel_fork(struct task_struct *child)
+void cgroup_cancel_fork(struct task_struct *child,
+ struct kernel_clone_args *kargs)
{
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
if (ss->cancel_fork)
- ss->cancel_fork(child);
+ ss->cancel_fork(child, kargs->cset);
+
+ cgroup_css_set_put_fork(kargs);
}
/**
- * cgroup_post_fork - called on a new task after adding it to the task list
- * @child: the task in question
+ * cgroup_post_fork - finalize cgroup setup for the child process
+ * @child: the child process
*
- * Adds the task to the list running through its css_set if necessary and
- * call the subsystem fork() callbacks. Has to be after the task is
- * visible on the task list in case we race with the first call to
- * cgroup_task_iter_start() - to guarantee that the new task ends up on its
- * list.
+ * Attach the child process to its css_set calling the subsystem fork()
+ * callbacks.
*/
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+ struct kernel_clone_args *kargs)
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
struct cgroup_subsys *ss;
+ struct css_set *cset;
int i;
- /*
- * This may race against cgroup_enable_task_cg_lists(). As that
- * function sets use_task_css_set_links before grabbing
- * tasklist_lock and we just went through tasklist_lock to add
- * @child, it's guaranteed that either we see the set
- * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
- * @child during its iteration.
- *
- * If we won the race, @child is associated with %current's
- * css_set. Grabbing css_set_lock guarantees both that the
- * association is stable, and, on completion of the parent's
- * migration, @child is visible in the source of migration or
- * already in the destination cgroup. This guarantee is necessary
- * when implementing operations which need to migrate all tasks of
- * a cgroup to another.
- *
- * Note that if we lose to cgroup_enable_task_cg_lists(), @child
- * will remain in init_css_set. This is safe because all tasks are
- * in the init_css_set before cg_links is enabled and there's no
- * operation which transfers all tasks out of init_css_set.
- */
- if (use_task_css_set_links) {
- struct css_set *cset;
+ cset = kargs->cset;
+ kargs->cset = NULL;
- spin_lock_irq(&css_set_lock);
- cset = task_css_set(current);
- if (list_empty(&child->cg_list)) {
- get_css_set(cset);
- cset->nr_tasks++;
- css_set_move_task(child, NULL, cset, false);
- }
+ spin_lock_irq(&css_set_lock);
+
+ /* init tasks are special, only link regular threads */
+ if (likely(child->pid)) {
+ WARN_ON_ONCE(!list_empty(&child->cg_list));
+ cset->nr_tasks++;
+ css_set_move_task(child, NULL, cset, false);
+ } else {
+ put_css_set(cset);
+ cset = NULL;
+ }
+
+ /*
+ * If the cgroup has to be frozen, the new task has too. Let's set
+ * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
+ * frozen state.
+ */
+ if (unlikely(cgroup_task_freeze(child))) {
+ spin_lock(&child->sighand->siglock);
+ WARN_ON_ONCE(child->frozen);
+ child->jobctl |= JOBCTL_TRAP_FREEZE;
+ spin_unlock(&child->sighand->siglock);
/*
- * If the cgroup has to be frozen, the new task has too.
- * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get
- * the task into the frozen state.
+ * Calling cgroup_update_frozen() isn't required here,
+ * because it will be called anyway a bit later from
+ * do_freezer_trap(). So we avoid cgroup's transient switch
+ * from the frozen state and back.
*/
- if (unlikely(cgroup_task_freeze(child))) {
- spin_lock(&child->sighand->siglock);
- WARN_ON_ONCE(child->frozen);
- child->jobctl |= JOBCTL_TRAP_FREEZE;
- spin_unlock(&child->sighand->siglock);
-
- /*
- * Calling cgroup_update_frozen() isn't required here,
- * because it will be called anyway a bit later
- * from do_freezer_trap(). So we avoid cgroup's
- * transient switch from the frozen state and back.
- */
- }
-
- spin_unlock_irq(&css_set_lock);
}
+
+ spin_unlock_irq(&css_set_lock);
/*
* Call ss->fork(). This must happen after @child is linked on
@@ -5986,26 +6353,25 @@
do_each_subsys_mask(ss, i, have_fork_callback) {
ss->fork(child);
} while_each_subsys_mask();
+
+ /* Make the new cset the root_cset of the new cgroup namespace. */
+ if (kargs->flags & CLONE_NEWCGROUP) {
+ struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
+
+ get_css_set(cset);
+ child->nsproxy->cgroup_ns->root_cset = cset;
+ put_css_set(rcset);
+ }
+
+ cgroup_css_set_put_fork(kargs);
}
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
*
- * Description: Detach cgroup from @tsk and release it.
+ * Description: Detach cgroup from @tsk.
*
- * Note that cgroups marked notify_on_release force every task in
- * them to take the global cgroup_mutex mutex when exiting.
- * This could impact scaling on very large systems. Be reluctant to
- * use notify_on_release cgroups where very high task exit scaling
- * is required on large systems.
- *
- * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
- * call cgroup_exit() while the task is still competent to handle
- * notify_on_release(), then leave the task attached to the root cgroup in
- * each hierarchy for the remainder of its exit. No need to bother with
- * init_css_set refcnting. init_css_set never goes away and we can't race
- * with migration path - PF_EXITING is visible to migration path.
*/
void cgroup_exit(struct task_struct *tsk)
{
@@ -6013,27 +6379,22 @@
struct css_set *cset;
int i;
- /*
- * Unlink from @tsk from its css_set. As migration path can't race
- * with us, we can check css_set and cg_list without synchronization.
- */
+ spin_lock_irq(&css_set_lock);
+
+ WARN_ON_ONCE(list_empty(&tsk->cg_list));
cset = task_css_set(tsk);
+ css_set_move_task(tsk, cset, NULL, false);
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
+ cset->nr_tasks--;
- if (!list_empty(&tsk->cg_list)) {
- spin_lock_irq(&css_set_lock);
- css_set_move_task(tsk, cset, NULL, false);
- list_add_tail(&tsk->cg_list, &cset->dying_tasks);
- cset->nr_tasks--;
+ if (dl_task(tsk))
+ dec_dl_tasks_cs(tsk);
- if (unlikely(cgroup_task_frozen(tsk)))
- cgroup_freezer_frozen_exit(tsk);
- else if (unlikely(cgroup_task_freeze(tsk)))
- cgroup_update_frozen(task_dfl_cgroup(tsk));
+ WARN_ON_ONCE(cgroup_task_frozen(tsk));
+ if (unlikely(cgroup_task_freeze(tsk)))
+ cgroup_update_frozen(task_dfl_cgroup(tsk));
- spin_unlock_irq(&css_set_lock);
- } else {
- get_css_set(cset);
- }
+ spin_unlock_irq(&css_set_lock);
/* see cgroup_post_fork() for details */
do_each_subsys_mask(ss, i, have_exit_callback) {
@@ -6050,12 +6411,10 @@
ss->release(task);
} while_each_subsys_mask();
- if (use_task_css_set_links) {
- spin_lock_irq(&css_set_lock);
- css_set_skip_task_iters(task_css_set(task), task);
- list_del_init(&task->cg_list);
- spin_unlock_irq(&css_set_lock);
- }
+ spin_lock_irq(&css_set_lock);
+ css_set_skip_task_iters(task_css_set(task), task);
+ list_del_init(&task->cg_list);
+ spin_unlock_irq(&css_set_lock);
}
void cgroup_free(struct task_struct *task)
@@ -6096,6 +6455,16 @@
return 1;
}
__setup("cgroup_disable=", cgroup_disable);
+
+void __init __weak enable_debug_cgroup(void) { }
+
+static int __init enable_cgroup_debug(char *str)
+{
+ cgroup_debug = true;
+ enable_debug_cgroup();
+ return 1;
+}
+__setup("cgroup_debug", enable_cgroup_debug);
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
@@ -6196,7 +6565,6 @@
*/
struct cgroup *cgroup_get_from_fd(int fd)
{
- struct cgroup_subsys_state *css;
struct cgroup *cgrp;
struct file *f;
@@ -6204,17 +6572,8 @@
if (!f)
return ERR_PTR(-EBADF);
- css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+ cgrp = cgroup_get_from_file(f);
fput(f);
- if (IS_ERR(css))
- return ERR_CAST(css);
-
- cgrp = css->cgroup;
- if (!cgroup_on_dfl(cgrp)) {
- cgroup_put(cgrp);
- return ERR_PTR(-EBADF);
- }
-
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
@@ -6305,6 +6664,7 @@
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
skcd->val = (unsigned long)cset->dfl_cgrp;
+ cgroup_bpf_get(cset->dfl_cgrp);
break;
}
cpu_relax();
@@ -6315,7 +6675,6 @@
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
- /* Socket clone path */
if (skcd->val) {
if (skcd->no_refcnt)
return;
@@ -6325,40 +6684,48 @@
* Don't use cgroup_get_live().
*/
cgroup_get(sock_cgroup_ptr(skcd));
+ cgroup_bpf_get(sock_cgroup_ptr(skcd));
}
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
if (skcd->no_refcnt)
return;
-
- cgroup_put(sock_cgroup_ptr(skcd));
+ cgroup_bpf_put(cgrp);
+ cgroup_put(cgrp);
}
#endif /* CONFIG_SOCK_CGROUP_DATA */
#ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, u32 flags)
+int cgroup_bpf_attach(struct cgroup *cgrp,
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
+ struct bpf_cgroup_link *link,
+ enum bpf_attach_type type,
+ u32 flags)
{
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
mutex_unlock(&cgroup_mutex);
return ret;
}
+
int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, u32 flags)
+ enum bpf_attach_type type)
{
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+ ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
mutex_unlock(&cgroup_mutex);
return ret;
}
+
int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
@@ -6419,7 +6786,10 @@
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+ return snprintf(buf, PAGE_SIZE,
+ "nsdelegate\n"
+ "memory_localevents\n"
+ "memory_recursiveprot\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
--
Gitblit v1.6.2