From 8d2a02b24d66aa359e83eebc1ed3c0f85367a1cb Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Thu, 16 May 2024 03:11:33 +0000 Subject: [PATCH] AX88772C_eeprom and ax8872c build together --- kernel/kernel/cgroup/cpuset.c | 1367 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 files changed, 1,192 insertions(+), 175 deletions(-) diff --git a/kernel/kernel/cgroup/cpuset.c b/kernel/kernel/cgroup/cpuset.c index 45b4dbf..270310e 100644 --- a/kernel/kernel/cgroup/cpuset.c +++ b/kernel/kernel/cgroup/cpuset.c @@ -33,17 +33,20 @@ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/kmod.h> +#include <linux/kthread.h> #include <linux/list.h> #include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/memory.h> #include <linux/export.h> #include <linux/mount.h> +#include <linux/fs_context.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/sched/deadline.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/seq_file.h> @@ -63,6 +66,9 @@ #include <linux/mutex.h> #include <linux/cgroup.h> #include <linux/wait.h> + +#include <trace/hooks/sched.h> +#include <trace/hooks/cgroup.h> DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -111,6 +117,16 @@ nodemask_t effective_mems; /* + * CPUs allocated to child sub-partitions (default hierarchy only) + * - CPUs granted by the parent = effective_cpus U subparts_cpus + * - effective_cpus and subparts_cpus are mutually exclusive. + * + * effective_cpus contains only onlined CPUs, but subparts_cpus + * may have offlined ones. + */ + cpumask_var_t subparts_cpus; + + /* * This is old Memory Nodes tasks took on. * * - top_cpuset.old_mems_allowed is initialized to mems_allowed. @@ -135,6 +151,55 @@ /* for custom sched domain */ int relax_domain_level; + + /* number of CPUs in subparts_cpus */ + int nr_subparts_cpus; + + /* partition root state */ + int partition_root_state; + + /* + * Default hierarchy only: + * use_parent_ecpus - set if using parent's effective_cpus + * child_ecpus_count - # of children with use_parent_ecpus set + */ + int use_parent_ecpus; + int child_ecpus_count; + + /* + * number of SCHED_DEADLINE tasks attached to this cpuset, so that we + * know when to rebuild associated root domain bandwidth information. + */ + int nr_deadline_tasks; + int nr_migrate_dl_tasks; + u64 sum_migrate_dl_bw; +}; + +/* + * Partition root states: + * + * 0 - not a partition root + * + * 1 - partition root + * + * -1 - invalid partition root + * None of the cpus in cpus_allowed can be put into the parent's + * subparts_cpus. In this case, the cpuset is not a real partition + * root anymore. However, the CPU_EXCLUSIVE bit will still be set + * and the cpuset can be restored back to a partition root if the + * parent cpuset can give more CPUs back to this child cpuset. + */ +#define PRS_DISABLED 0 +#define PRS_ENABLED 1 +#define PRS_ERROR -1 + +/* + * Temporary cpumasks for working with partitions that are passed among + * functions to avoid memory allocation in inner functions. 
+ */ +struct tmpmasks { + cpumask_var_t addmask, delmask; /* For partition root */ + cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ }; static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) @@ -153,18 +218,19 @@ return css_cs(cs->css.parent); } -#ifdef CONFIG_NUMA -static inline bool task_has_mempolicy(struct task_struct *task) +void inc_dl_tasks_cs(struct task_struct *p) { - return task->mempolicy; -} -#else -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return false; -} -#endif + struct cpuset *cs = task_cs(p); + cs->nr_deadline_tasks++; +} + +void dec_dl_tasks_cs(struct task_struct *p) +{ + struct cpuset *cs = task_cs(p); + + cs->nr_deadline_tasks--; +} /* bits in struct cpuset flags field */ typedef enum { @@ -219,9 +285,15 @@ return test_bit(CS_SPREAD_SLAB, &cs->flags); } +static inline int is_partition_root(const struct cpuset *cs) +{ + return cs->partition_root_state > 0; +} + static struct cpuset top_cpuset = { .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), + .partition_root_state = PRS_ENABLED, }; /** @@ -289,21 +361,36 @@ */ static DEFINE_MUTEX(cpuset_mutex); + +void cpuset_lock(void) +{ + mutex_lock(&cpuset_mutex); +} + +void cpuset_unlock(void) +{ + mutex_unlock(&cpuset_mutex); +} + static DEFINE_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; /* - * CPU / memory hotplug is handled asynchronously. + * CPU / memory hotplug is handled asynchronously + * for hotplug, synchronously for resume_cpus */ -static void cpuset_hotplug_workfn(struct work_struct *work); static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); /* - * Cgroup v2 behavior is used when on default hierarchy or the - * cgroup_v2_mode flag is set. + * Cgroup v2 behavior is used on the "cpus" and "mems" control files when + * on default hierarchy or when the cpuset_v2_mode flag is set by mounting + * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. + * With v2 behavior, "cpus" and "mems" are always what the users have + * requested and won't be changed by hotplug events. Only the effective + * cpus or mems will be affected. */ static inline bool is_in_v2_mode(void) { @@ -312,58 +399,45 @@ } /* - * This is ugly, but preserves the userspace API for existing cpuset - * users. If someone tries to mount the "cpuset" filesystem, we - * silently switch it to mount "cgroup" instead - */ -static struct dentry *cpuset_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, void *data) -{ - struct file_system_type *cgroup_fs = get_fs_type("cgroup"); - struct dentry *ret = ERR_PTR(-ENODEV); - if (cgroup_fs) { - char mountopts[] = - "cpuset,noprefix," - "release_agent=/sbin/cpuset_release_agent"; - ret = cgroup_fs->mount(cgroup_fs, flags, - unused_dev_name, mountopts); - put_filesystem(cgroup_fs); - } - return ret; -} - -static struct file_system_type cpuset_fs_type = { - .name = "cpuset", - .mount = cpuset_mount, -}; - -/* - * Return in pmask the portion of a cpusets's cpus_allowed that - * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. + * Return in pmask the portion of a task's cpusets's cpus_allowed that + * are online and are capable of running the task. If none are found, + * walk up the cpuset hierarchy until we find one that does have some + * appropriate cpus. 
* * One way or another, we guarantee to return some non-empty subset - * of cpu_online_mask. + * of cpu_active_mask. * * Call with callback_lock or cpuset_mutex held. */ -static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) +static void guarantee_online_cpus(struct task_struct *tsk, + struct cpumask *pmask) { - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + struct cpuset *cs; + + if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask))) + cpumask_copy(pmask, cpu_active_mask); + + rcu_read_lock(); + cs = task_cs(tsk); + + while (!cpumask_intersects(cs->effective_cpus, pmask)) { cs = parent_cs(cs); if (unlikely(!cs)) { /* * The top cpuset doesn't have any online cpu as a * consequence of a race between cpuset_hotplug_work * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to to be + * cpuset's effective_cpus is on its way to be * identical to cpu_online_mask. */ - cpumask_copy(pmask, cpu_online_mask); - return; + goto out_unlock; } } - cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); + cpumask_and(pmask, pmask, cs->effective_cpus); + +out_unlock: + rcu_read_unlock(); } /* @@ -420,6 +494,71 @@ } /** + * alloc_cpumasks - allocate three cpumasks for cpuset + * @cs: the cpuset that have cpumasks to be allocated. + * @tmp: the tmpmasks structure pointer + * Return: 0 if successful, -ENOMEM otherwise. + * + * Only one of the two input arguments should be non-NULL. + */ +static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +{ + cpumask_var_t *pmask1, *pmask2, *pmask3; + + if (cs) { + pmask1 = &cs->cpus_allowed; + pmask2 = &cs->effective_cpus; + pmask3 = &cs->subparts_cpus; + } else { + pmask1 = &tmp->new_cpus; + pmask2 = &tmp->addmask; + pmask3 = &tmp->delmask; + } + + if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) + return -ENOMEM; + + if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) + goto free_one; + + if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) + goto free_two; + + if (cs && !zalloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) + goto free_three; + + return 0; + +free_three: + free_cpumask_var(*pmask3); +free_two: + free_cpumask_var(*pmask2); +free_one: + free_cpumask_var(*pmask1); + return -ENOMEM; +} + +/** + * free_cpumasks - free cpumasks in a tmpmasks structure + * @cs: the cpuset that have cpumasks to be free. 
+ * @tmp: the tmpmasks structure pointer + */ +static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) +{ + if (cs) { + free_cpumask_var(cs->cpus_allowed); + free_cpumask_var(cs->cpus_requested); + free_cpumask_var(cs->effective_cpus); + free_cpumask_var(cs->subparts_cpus); + } + if (tmp) { + free_cpumask_var(tmp->new_cpus); + free_cpumask_var(tmp->addmask); + free_cpumask_var(tmp->delmask); + } +} + +/** * alloc_trial_cpuset - allocate a trial cpuset * @cs: the cpuset that the trial cpuset duplicates */ @@ -431,37 +570,25 @@ if (!trial) return NULL; - if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) - goto free_cs; - if (!alloc_cpumask_var(&trial->cpus_requested, GFP_KERNEL)) - goto free_allowed; - if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL)) - goto free_cpus; + if (alloc_cpumasks(trial, NULL)) { + kfree(trial); + return NULL; + } cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->cpus_requested, cs->cpus_requested); cpumask_copy(trial->effective_cpus, cs->effective_cpus); return trial; - -free_cpus: - free_cpumask_var(trial->cpus_requested); -free_allowed: - free_cpumask_var(trial->cpus_allowed); -free_cs: - kfree(trial); - return NULL; } /** - * free_trial_cpuset - free the trial cpuset - * @trial: the trial cpuset to be freed + * free_cpuset - free the cpuset + * @cs: the cpuset to be freed */ -static void free_trial_cpuset(struct cpuset *trial) +static inline void free_cpuset(struct cpuset *cs) { - free_cpumask_var(trial->effective_cpus); - free_cpumask_var(trial->cpus_requested); - free_cpumask_var(trial->cpus_allowed); - kfree(trial); + free_cpumasks(cs, NULL); + kfree(cs); } /* @@ -612,7 +739,7 @@ * load balancing domains (sched domains) as specified by that partial * partition. * - * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt + * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst * for a background explanation of this. * * Does not return errors, on the theory that the callers of this @@ -623,11 +750,10 @@ * Must be called with cpuset_mutex held. * * The three key local variables below are: - * q - a linked-list queue of cpuset pointers, used to implement a - * top-down scan of all cpusets. This scan loads a pointer - * to each cpuset marked is_sched_load_balance into the - * array 'csa'. For our purposes, rebuilding the schedulers - * sched domains, we can ignore !is_sched_load_balance cpusets. + * cp - cpuset pointer, used (together with pos_css) to perform a + * top-down scan of all cpusets. For our purposes, rebuilding + * the schedulers sched domains, we can ignore !is_sched_load_ + * balance cpusets. 
* csa - (for CpuSet Array) Array of pointers to all the cpusets * that need to be load balanced, for convenient iterative * access by the subsequent code that finds the best partition, @@ -658,7 +784,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { - struct cpuset *cp; /* scans q */ + struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ @@ -667,13 +793,14 @@ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; + bool root_load_balance = is_sched_load_balance(&top_cpuset); doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (is_sched_load_balance(&top_cpuset)) { + if (root_load_balance && !top_cpuset.nr_subparts_cpus) { ndoms = 1; doms = alloc_sched_domains(ndoms); if (!doms) @@ -696,6 +823,8 @@ csn = 0; rcu_read_lock(); + if (root_load_balance) + csa[csn++] = &top_cpuset; cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { if (cp == &top_cpuset) continue; @@ -706,6 +835,9 @@ * parent's cpus, so just skip them, and then we call * update_domain_attr_tree() to calc relax_domain_level of * the corresponding sched domain. + * + * If root is load-balancing, we can skip @cp if it + * is a subset of the root's effective_cpus. */ if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && @@ -713,11 +845,17 @@ housekeeping_cpumask(HK_FLAG_DOMAIN)))) continue; - if (is_sched_load_balance(cp)) + if (root_load_balance && + cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) + continue; + + if (is_sched_load_balance(cp) && + !cpumask_empty(cp->effective_cpus)) csa[csn++] = cp; - /* skip @cp's subtree */ - pos_css = css_rightmost_descendant(pos_css); + /* skip @cp's subtree if not a partition root */ + if (!is_partition_root(cp)) + pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); @@ -820,6 +958,68 @@ return ndoms; } +static void dl_update_tasks_root_domain(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + if (cs->nr_deadline_tasks == 0) + return; + + css_task_iter_start(&cs->css, 0, &it); + + while ((task = css_task_iter_next(&it))) + dl_add_task_root_domain(task); + + css_task_iter_end(&it); +} + +static void dl_rebuild_rd_accounting(void) +{ + struct cpuset *cs = NULL; + struct cgroup_subsys_state *pos_css; + + lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpus_held(); + lockdep_assert_held(&sched_domains_mutex); + + rcu_read_lock(); + + /* + * Clear default root domain DL accounting, it will be computed again + * if a task belongs to it. + */ + dl_clear_root_domain(&def_root_domain); + + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { + + if (cpumask_empty(cs->effective_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + css_get(&cs->css); + + rcu_read_unlock(); + + dl_update_tasks_root_domain(cs); + + rcu_read_lock(); + css_put(&cs->css); + } + rcu_read_unlock(); +} + +static void +partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + mutex_lock(&sched_domains_mutex); + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); + dl_rebuild_rd_accounting(); + mutex_unlock(&sched_domains_mutex); +} + /* * Rebuild scheduler domains. 
* @@ -833,28 +1033,53 @@ */ static void rebuild_sched_domains_locked(void) { + struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; cpumask_var_t *doms; + struct cpuset *cs; int ndoms; lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); /* - * We have raced with CPU hotplug. Don't do anything to avoid + * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, hotplug work item will rebuild sched domains. + * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. + * + * With no CPUs in any subpartitions, top_cpuset's effective CPUs + * should be the same as the active CPUs, so checking only top_cpuset + * is enough to detect racing CPU offlines. */ - if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + if (!top_cpuset.nr_subparts_cpus && + !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) + return; + + /* + * With subpartition CPUs, however, the effective CPUs of a partition + * root should be only a subset of the active CPUs. Since a CPU in any + * partition root could be offlined, all must be checked. + */ + if (top_cpuset.nr_subparts_cpus) { + rcu_read_lock(); + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { + if (!is_partition_root(cs)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + if (!cpumask_subset(cs->effective_cpus, + cpu_active_mask)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + } /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); + partition_and_rebuild_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ static void rebuild_sched_domains_locked(void) @@ -864,9 +1089,23 @@ void rebuild_sched_domains(void) { + get_online_cpus(); mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); mutex_unlock(&cpuset_mutex); + put_online_cpus(); +} + +static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p, + const struct cpumask *new_mask) +{ + int ret = -EINVAL; + + trace_android_rvh_update_cpus_allowed(p, cs->cpus_requested, new_mask, &ret); + if (!ret) + return ret; + + return set_cpus_allowed_ptr(p, new_mask); } /** @@ -881,17 +1120,268 @@ { struct css_task_iter it; struct task_struct *task; + bool top_cs = cs == &top_cpuset; css_task_iter_start(&cs->css, 0, &it); - while ((task = css_task_iter_next(&it))) - set_cpus_allowed_ptr(task, cs->effective_cpus); + while ((task = css_task_iter_next(&it))) { + /* + * Percpu kthreads in top_cpuset are ignored + */ + if (top_cs && (task->flags & PF_KTHREAD) && + kthread_is_per_cpu(task)) + continue; + update_cpus_allowed(cs, task, cs->effective_cpus); + } css_task_iter_end(&it); +} + +/** + * compute_effective_cpumask - Compute the effective cpumask of the cpuset + * @new_cpus: the temp variable for the new effective_cpus mask + * @cs: the cpuset the need to recompute the new effective_cpus mask + * @parent: the parent cpuset + * + * If the parent has subpartition CPUs, include them in the list of + * allowable CPUs in computing the new effective_cpus mask. Since offlined + * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask + * to mask those out. 
+ */ +static void compute_effective_cpumask(struct cpumask *new_cpus, + struct cpuset *cs, struct cpuset *parent) +{ + if (parent->nr_subparts_cpus) { + cpumask_or(new_cpus, parent->effective_cpus, + parent->subparts_cpus); + cpumask_and(new_cpus, new_cpus, cs->cpus_requested); + cpumask_and(new_cpus, new_cpus, cpu_active_mask); + } else { + cpumask_and(new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus); + } +} + +/* + * Commands for update_parent_subparts_cpumask + */ +enum subparts_cmd { + partcmd_enable, /* Enable partition root */ + partcmd_disable, /* Disable partition root */ + partcmd_update, /* Update parent's subparts_cpus */ +}; + +/** + * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset + * @cpuset: The cpuset that requests change in partition root state + * @cmd: Partition root state change command + * @newmask: Optional new cpumask for partcmd_update + * @tmp: Temporary addmask and delmask + * Return: 0, 1 or an error code + * + * For partcmd_enable, the cpuset is being transformed from a non-partition + * root to a partition root. The cpus_allowed mask of the given cpuset will + * be put into parent's subparts_cpus and taken away from parent's + * effective_cpus. The function will return 0 if all the CPUs listed in + * cpus_allowed can be granted or an error code will be returned. + * + * For partcmd_disable, the cpuset is being transofrmed from a partition + * root back to a non-partition root. Any CPUs in cpus_allowed that are in + * parent's subparts_cpus will be taken away from that cpumask and put back + * into parent's effective_cpus. 0 should always be returned. + * + * For partcmd_update, if the optional newmask is specified, the cpu + * list is to be changed from cpus_allowed to newmask. Otherwise, + * cpus_allowed is assumed to remain the same. The cpuset should either + * be a partition root or an invalid partition root. The partition root + * state may change if newmask is NULL and none of the requested CPUs can + * be granted by the parent. The function will return 1 if changes to + * parent's subparts_cpus and effective_cpus happen or 0 otherwise. + * Error code should only be returned when newmask is non-NULL. + * + * The partcmd_enable and partcmd_disable commands are used by + * update_prstate(). The partcmd_update command is used by + * update_cpumasks_hier() with newmask NULL and update_cpumask() with + * newmask set. + * + * The checking is more strict when enabling partition root than the + * other two commands. + * + * Because of the implicit cpu exclusive nature of a partition root, + * cpumask changes that violates the cpu exclusivity rule will not be + * permitted when checked by validate_change(). The validate_change() + * function will also prevent any changes to the cpu list if it is not + * a superset of children's cpu lists. + */ +static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, + struct cpumask *newmask, + struct tmpmasks *tmp) +{ + struct cpuset *parent = parent_cs(cpuset); + int adding; /* Moving cpus from effective_cpus to subparts_cpus */ + int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ + int new_prs; + bool part_error = false; /* Partition error? */ + + lockdep_assert_held(&cpuset_mutex); + + /* + * The parent must be a partition root. + * The new cpumask, if present, or the current cpus_allowed must + * not be empty. 
+ */ + if (!is_partition_root(parent) || + (newmask && cpumask_empty(newmask)) || + (!newmask && cpumask_empty(cpuset->cpus_allowed))) + return -EINVAL; + + /* + * Enabling/disabling partition root is not allowed if there are + * online children. + */ + if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css)) + return -EBUSY; + + /* + * Enabling partition root is not allowed if not all the CPUs + * can be granted from parent's effective_cpus or at least one + * CPU will be left after that. + */ + if ((cmd == partcmd_enable) && + (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) || + cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus))) + return -EINVAL; + + /* + * A cpumask update cannot make parent's effective_cpus become empty. + */ + adding = deleting = false; + new_prs = cpuset->partition_root_state; + if (cmd == partcmd_enable) { + cpumask_copy(tmp->addmask, cpuset->cpus_allowed); + adding = true; + } else if (cmd == partcmd_disable) { + deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, + parent->subparts_cpus); + } else if (newmask) { + /* + * partcmd_update with newmask: + * + * delmask = cpus_allowed & ~newmask & parent->subparts_cpus + * addmask = newmask & parent->effective_cpus + * & ~parent->subparts_cpus + */ + cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask); + deleting = cpumask_and(tmp->delmask, tmp->delmask, + parent->subparts_cpus); + + cpumask_and(tmp->addmask, newmask, parent->effective_cpus); + adding = cpumask_andnot(tmp->addmask, tmp->addmask, + parent->subparts_cpus); + /* + * Return error if the new effective_cpus could become empty. + */ + if (adding && + cpumask_equal(parent->effective_cpus, tmp->addmask)) { + if (!deleting) + return -EINVAL; + /* + * As some of the CPUs in subparts_cpus might have + * been offlined, we need to compute the real delmask + * to confirm that. + */ + if (!cpumask_and(tmp->addmask, tmp->delmask, + cpu_active_mask)) + return -EINVAL; + cpumask_copy(tmp->addmask, parent->effective_cpus); + } + } else { + /* + * partcmd_update w/o newmask: + * + * addmask = cpus_allowed & parent->effective_cpus + * + * Note that parent's subparts_cpus may have been + * pre-shrunk in case there is a change in the cpu list. + * So no deletion is needed. + */ + adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed, + parent->effective_cpus); + part_error = cpumask_equal(tmp->addmask, + parent->effective_cpus); + } + + if (cmd == partcmd_update) { + int prev_prs = cpuset->partition_root_state; + + /* + * Check for possible transition between PRS_ENABLED + * and PRS_ERROR. + */ + switch (cpuset->partition_root_state) { + case PRS_ENABLED: + if (part_error) + new_prs = PRS_ERROR; + break; + case PRS_ERROR: + if (!part_error) + new_prs = PRS_ENABLED; + break; + } + /* + * Set part_error if previously in invalid state. + */ + part_error = (prev_prs == PRS_ERROR); + } + + if (!part_error && (new_prs == PRS_ERROR)) + return 0; /* Nothing need to be done */ + + if (new_prs == PRS_ERROR) { + /* + * Remove all its cpus from parent's subparts_cpus. + */ + adding = false; + deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, + parent->subparts_cpus); + } + + if (!adding && !deleting && (new_prs == cpuset->partition_root_state)) + return 0; + + /* + * Change the parent's subparts_cpus. + * Newly added CPUs will be removed from effective_cpus and + * newly deleted ones will be added back to effective_cpus. 
+ */ + spin_lock_irq(&callback_lock); + if (adding) { + cpumask_or(parent->subparts_cpus, + parent->subparts_cpus, tmp->addmask); + cpumask_andnot(parent->effective_cpus, + parent->effective_cpus, tmp->addmask); + } + if (deleting) { + cpumask_andnot(parent->subparts_cpus, + parent->subparts_cpus, tmp->delmask); + /* + * Some of the CPUs in subparts_cpus might have been offlined. + */ + cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask); + cpumask_or(parent->effective_cpus, + parent->effective_cpus, tmp->delmask); + } + + parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); + + if (cpuset->partition_root_state != new_prs) + cpuset->partition_root_state = new_prs; + spin_unlock_irq(&callback_lock); + + return cmd == partcmd_update; } /* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree - * @cs: the cpuset to consider - * @new_cpus: temp variable for calculating new effective_cpus + * @cs: the cpuset to consider + * @tmp: temp variables for calculating effective_cpus & partition setup * * When congifured cpumask is changed, the effective cpumasks of this cpuset * and all its descendants need to be updated. @@ -900,29 +1390,86 @@ * * Called with cpuset_mutex held */ -static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) +static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; bool need_rebuild_sched_domains = false; + int new_prs; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); - cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus); + compute_effective_cpumask(tmp->new_cpus, cp, parent); /* * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (is_in_v2_mode() && cpumask_empty(new_cpus)) - cpumask_copy(new_cpus, parent->effective_cpus); + if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) { + cpumask_copy(tmp->new_cpus, parent->effective_cpus); + if (!cp->use_parent_ecpus) { + cp->use_parent_ecpus = true; + parent->child_ecpus_count++; + } + } else if (cp->use_parent_ecpus) { + cp->use_parent_ecpus = false; + WARN_ON_ONCE(!parent->child_ecpus_count); + parent->child_ecpus_count--; + } - /* Skip the whole subtree if the cpumask remains the same. */ - if (cpumask_equal(new_cpus, cp->effective_cpus)) { + /* + * Skip the whole subtree if the cpumask remains the same + * and has no partition root state. + */ + if (!cp->partition_root_state && + cpumask_equal(tmp->new_cpus, cp->effective_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; + } + + /* + * update_parent_subparts_cpumask() should have been called + * for cs already in update_cpumask(). We should also call + * update_tasks_cpumask() again for tasks in the parent + * cpuset if the parent's subparts_cpus changes. + */ + new_prs = cp->partition_root_state; + if ((cp != cs) && new_prs) { + switch (parent->partition_root_state) { + case PRS_DISABLED: + /* + * If parent is not a partition root or an + * invalid partition root, clear its state + * and its CS_CPU_EXCLUSIVE flag. + */ + WARN_ON_ONCE(cp->partition_root_state + != PRS_ERROR); + new_prs = PRS_DISABLED; + + /* + * clear_bit() is an atomic operation and + * readers aren't interested in the state + * of CS_CPU_EXCLUSIVE anyway. So we can + * just update the flag without holding + * the callback_lock. 
+ */ + clear_bit(CS_CPU_EXCLUSIVE, &cp->flags); + break; + + case PRS_ENABLED: + if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp)) + update_tasks_cpumask(parent); + break; + + case PRS_ERROR: + /* + * When parent is invalid, it has to be too. + */ + new_prs = PRS_ERROR; + break; + } } if (!css_tryget_online(&cp->css)) @@ -930,7 +1477,39 @@ rcu_read_unlock(); spin_lock_irq(&callback_lock); - cpumask_copy(cp->effective_cpus, new_cpus); + + cpumask_copy(cp->effective_cpus, tmp->new_cpus); + if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) { + cp->nr_subparts_cpus = 0; + cpumask_clear(cp->subparts_cpus); + } else if (cp->nr_subparts_cpus) { + /* + * Make sure that effective_cpus & subparts_cpus + * are mutually exclusive. + * + * In the unlikely event that effective_cpus + * becomes empty. we clear cp->nr_subparts_cpus and + * let its child partition roots to compete for + * CPUs again. + */ + cpumask_andnot(cp->effective_cpus, cp->effective_cpus, + cp->subparts_cpus); + if (cpumask_empty(cp->effective_cpus)) { + cpumask_copy(cp->effective_cpus, tmp->new_cpus); + cpumask_clear(cp->subparts_cpus); + cp->nr_subparts_cpus = 0; + } else if (!cpumask_subset(cp->subparts_cpus, + tmp->new_cpus)) { + cpumask_andnot(cp->subparts_cpus, + cp->subparts_cpus, tmp->new_cpus); + cp->nr_subparts_cpus + = cpumask_weight(cp->subparts_cpus); + } + } + + if (new_prs != cp->partition_root_state) + cp->partition_root_state = new_prs; + spin_unlock_irq(&callback_lock); WARN_ON(!is_in_v2_mode() && @@ -939,11 +1518,15 @@ update_tasks_cpumask(cp); /* - * If the effective cpumask of any non-empty cpuset is changed, - * we need to rebuild sched domains. + * On legacy hierarchy, if the effective cpumask of any non- + * empty cpuset is changed, we need to rebuild sched domains. + * On default hierarchy, the cpuset needs to be a partition + * root as well. */ if (!cpumask_empty(cp->cpus_allowed) && - is_sched_load_balance(cp)) + is_sched_load_balance(cp) && + (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + is_partition_root(cp))) need_rebuild_sched_domains = true; rcu_read_lock(); @@ -956,6 +1539,45 @@ } /** + * update_sibling_cpumasks - Update siblings cpumasks + * @parent: Parent cpuset + * @cs: Current cpuset + * @tmp: Temp variables + */ +static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, + struct tmpmasks *tmp) +{ + struct cpuset *sibling; + struct cgroup_subsys_state *pos_css; + + lockdep_assert_held(&cpuset_mutex); + + /* + * Check all its siblings and call update_cpumasks_hier() + * if their use_parent_ecpus flag is set in order for them + * to use the right effective_cpus value. + * + * The update_cpumasks_hier() function may sleep. So we have to + * release the RCU read lock before calling it. 
+ */ + rcu_read_lock(); + cpuset_for_each_child(sibling, pos_css, parent) { + if (sibling == cs) + continue; + if (!sibling->use_parent_ecpus) + continue; + if (!css_tryget_online(&sibling->css)) + continue; + + rcu_read_unlock(); + update_cpumasks_hier(sibling, tmp); + rcu_read_lock(); + css_put(&sibling->css); + } + rcu_read_unlock(); +} + +/** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider * @trialcs: trial cpuset @@ -965,6 +1587,7 @@ const char *buf) { int retval; + struct tmpmasks tmp; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) @@ -997,13 +1620,50 @@ if (retval < 0) return retval; +#ifdef CONFIG_CPUMASK_OFFSTACK + /* + * Use the cpumasks in trialcs for tmpmasks when they are pointers + * to allocated cpumasks. + */ + tmp.addmask = trialcs->subparts_cpus; + tmp.delmask = trialcs->effective_cpus; + tmp.new_cpus = trialcs->cpus_allowed; +#endif + + if (cs->partition_root_state) { + /* Cpumask of a partition root cannot be empty */ + if (cpumask_empty(trialcs->cpus_allowed)) + return -EINVAL; + if (update_parent_subparts_cpumask(cs, partcmd_update, + trialcs->cpus_allowed, &tmp) < 0) + return -EINVAL; + } + spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); cpumask_copy(cs->cpus_requested, trialcs->cpus_requested); + + /* + * Make sure that subparts_cpus is a subset of cpus_allowed. + */ + if (cs->nr_subparts_cpus) { + cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed); + cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); + } spin_unlock_irq(&callback_lock); - /* use trialcs->cpus_allowed as a temp variable */ - update_cpumasks_hier(cs, trialcs->cpus_allowed); + update_cpumasks_hier(cs, &tmp); + + if (cs->partition_root_state) { + struct cpuset *parent = parent_cs(cs); + + /* + * For partition root, update the cpumasks of sibling + * cpusets if they use parent's effective_cpus. + */ + if (parent->child_ecpus_count) + update_sibling_cpumasks(parent, cs, &tmp); + } return 0; } @@ -1104,7 +1764,7 @@ guarantee_online_mems(cs, &newmems); /* - * The mpol_rebind_mm() call takes mmap_sem, which we couldn't + * The mpol_rebind_mm() call takes mmap_lock, which we couldn't * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold @@ -1209,7 +1869,7 @@ * * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, - * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, @@ -1357,7 +2017,90 @@ if (spread_flag_changed) update_tasks_flags(cs); out: - free_trial_cpuset(trialcs); + free_cpuset(trialcs); + return err; +} + +/* + * update_prstate - update partititon_root_state + * cs: the cpuset to update + * new_prs: new partition root state + * + * Call with cpuset_mutex held. + */ +static int update_prstate(struct cpuset *cs, int new_prs) +{ + int err, old_prs = cs->partition_root_state; + struct cpuset *parent = parent_cs(cs); + struct tmpmasks tmpmask; + + if (old_prs == new_prs) + return 0; + + /* + * Cannot force a partial or invalid partition root to a full + * partition root. 
+ */ + if (new_prs && (old_prs == PRS_ERROR)) + return -EINVAL; + + if (alloc_cpumasks(NULL, &tmpmask)) + return -ENOMEM; + + err = -EINVAL; + if (!old_prs) { + /* + * Turning on partition root requires setting the + * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed + * cannot be NULL. + */ + if (cpumask_empty(cs->cpus_allowed)) + goto out; + + err = update_flag(CS_CPU_EXCLUSIVE, cs, 1); + if (err) + goto out; + + err = update_parent_subparts_cpumask(cs, partcmd_enable, + NULL, &tmpmask); + if (err) { + update_flag(CS_CPU_EXCLUSIVE, cs, 0); + goto out; + } + } else { + /* + * Turning off partition root will clear the + * CS_CPU_EXCLUSIVE bit. + */ + if (old_prs == PRS_ERROR) { + update_flag(CS_CPU_EXCLUSIVE, cs, 0); + err = 0; + goto out; + } + + err = update_parent_subparts_cpumask(cs, partcmd_disable, + NULL, &tmpmask); + if (err) + goto out; + + /* Turning off CS_CPU_EXCLUSIVE will not return error */ + update_flag(CS_CPU_EXCLUSIVE, cs, 0); + } + + update_tasks_cpumask(parent); + + if (parent->child_ecpus_count) + update_sibling_cpumasks(parent, cs, &tmpmask); + + rebuild_sched_domains_locked(); +out: + if (!err) { + spin_lock_irq(&callback_lock); + cs->partition_root_state = new_prs; + spin_unlock_irq(&callback_lock); + } + + free_cpumasks(NULL, &tmpmask); return err; } @@ -1464,16 +2207,23 @@ static struct cpuset *cpuset_attach_old_cs; +static void reset_migrate_dl_data(struct cpuset *cs) +{ + cs->nr_migrate_dl_tasks = 0; + cs->sum_migrate_dl_bw = 0; +} + /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; - struct cpuset *cs; + struct cpuset *cs, *oldcs; struct task_struct *task; int ret; /* used later by cpuset_attach() */ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + oldcs = cpuset_attach_old_cs; cs = css_cs(css); mutex_lock(&cpuset_mutex); @@ -1485,14 +2235,39 @@ goto out_unlock; cgroup_taskset_for_each(task, css, tset) { - ret = task_can_attach(task, cs->cpus_allowed); + ret = task_can_attach(task); if (ret) goto out_unlock; ret = security_task_setscheduler(task); if (ret) goto out_unlock; + + if (dl_task(task)) { + cs->nr_migrate_dl_tasks++; + cs->sum_migrate_dl_bw += task->dl.dl_bw; + } } + if (!cs->nr_migrate_dl_tasks) + goto out_success; + + if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { + int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); + + if (unlikely(cpu >= nr_cpu_ids)) { + reset_migrate_dl_data(cs); + ret = -EINVAL; + goto out_unlock; + } + + ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); + if (ret) { + reset_migrate_dl_data(cs); + goto out_unlock; + } + } + +out_success: /* * Mark attach is in progress. This makes validate_change() fail * changes which zero cpus/mems_allowed. 
@@ -1513,7 +2288,17 @@ cs = css_cs(css); mutex_lock(&cpuset_mutex); - css_cs(css)->attach_in_progress--; + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); + + if (cs->nr_migrate_dl_tasks) { + int cpu = cpumask_any(cs->effective_cpus); + + dl_bw_free(cpu, cs->sum_migrate_dl_bw); + reset_migrate_dl_data(cs); + } + mutex_unlock(&cpuset_mutex); } @@ -1537,23 +2322,21 @@ cgroup_taskset_first(tset, &css); cs = css_cs(css); - cpus_read_lock(); + lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ mutex_lock(&cpuset_mutex); - - /* prepare for attach */ - if (cs == &top_cpuset) - cpumask_copy(cpus_attach, cpu_possible_mask); - else - guarantee_online_cpus(cs, cpus_attach); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, css, tset) { + if (cs != &top_cpuset) + guarantee_online_cpus(task, cpus_attach); + else + cpumask_copy(cpus_attach, task_cpu_possible_mask(task)); /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here */ - WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, task); @@ -1588,12 +2371,17 @@ cs->old_mems_allowed = cpuset_attach_nodemask_to; + if (cs->nr_migrate_dl_tasks) { + cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; + oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; + reset_migrate_dl_data(cs); + } + cs->attach_in_progress--; if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); } /* The various types of files and directories in a cpuset file system */ @@ -1604,10 +2392,12 @@ FILE_MEMLIST, FILE_EFFECTIVE_CPULIST, FILE_EFFECTIVE_MEMLIST, + FILE_SUBPARTS_CPULIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_MEM_HARDWALL, FILE_SCHED_LOAD_BALANCE, + FILE_PARTITION_ROOT, FILE_SCHED_RELAX_DOMAIN_LEVEL, FILE_MEMORY_PRESSURE_ENABLED, FILE_MEMORY_PRESSURE, @@ -1622,6 +2412,7 @@ cpuset_filetype_t type = cft->private; int retval = 0; + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; @@ -1659,6 +2450,7 @@ } out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return retval; } @@ -1669,6 +2461,7 @@ cpuset_filetype_t type = cft->private; int retval = -ENODEV; + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1683,6 +2476,7 @@ } out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return retval; } @@ -1721,6 +2515,7 @@ kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1743,9 +2538,10 @@ break; } - free_trial_cpuset(trialcs); + free_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -1780,6 +2576,9 @@ break; case FILE_EFFECTIVE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); + break; + case FILE_SUBPARTS_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus)); break; default: ret = -EINVAL; @@ -1835,12 +2634,62 @@ return 0; } +static int sched_partition_show(struct seq_file *seq, void *v) +{ + struct cpuset *cs = css_cs(seq_css(seq)); + + switch (cs->partition_root_state) { + case PRS_ENABLED: + seq_puts(seq, "root\n"); + break; + case 
PRS_DISABLED: + seq_puts(seq, "member\n"); + break; + case PRS_ERROR: + seq_puts(seq, "root invalid\n"); + break; + } + return 0; +} + +static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cpuset *cs = css_cs(of_css(of)); + int val; + int retval = -ENODEV; + + buf = strstrip(buf); + + /* + * Convert "root" to ENABLED, and convert "member" to DISABLED. + */ + if (!strcmp(buf, "root")) + val = PRS_ENABLED; + else if (!strcmp(buf, "member")) + val = PRS_DISABLED; + else + return -EINVAL; + + css_get(&cs->css); + get_online_cpus(); + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; + + retval = update_prstate(cs, val); +out_unlock: + mutex_unlock(&cpuset_mutex); + put_online_cpus(); + css_put(&cs->css); + return retval ?: nbytes; +} /* * for the common functions, 'private' gives the type of file */ -static struct cftype files[] = { +static struct cftype legacy_files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, @@ -1943,6 +2792,60 @@ }; /* + * This is currently a minimal set for the default hierarchy. It can be + * expanded later on by migrating more features and control files from v1. + */ +static struct cftype dfl_files[] = { + { + .name = "cpus", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_CPULIST, + .flags = CFTYPE_NOT_ON_ROOT, + }, + + { + .name = "mems", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * MAX_NUMNODES), + .private = FILE_MEMLIST, + .flags = CFTYPE_NOT_ON_ROOT, + }, + + { + .name = "cpus.effective", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_CPULIST, + }, + + { + .name = "mems.effective", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_MEMLIST, + }, + + { + .name = "cpus.partition", + .seq_show = sched_partition_show, + .write = sched_partition_write, + .private = FILE_PARTITION_ROOT, + .flags = CFTYPE_NOT_ON_ROOT, + }, + + { + .name = "cpus.subpartitions", + .seq_show = cpuset_common_seq_show, + .private = FILE_SUBPARTS_CPULIST, + .flags = CFTYPE_DEBUG, + }, + + { } /* terminate */ +}; + + +/* * cpuset_css_alloc - allocate a cpuset css * cgrp: control group that the new cpuset will be part of */ @@ -1958,31 +2861,19 @@ cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); - if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) - goto free_cs; - if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) - goto free_allowed; - if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) - goto free_requested; + + if (alloc_cpumasks(cs, NULL)) { + kfree(cs); + return ERR_PTR(-ENOMEM); + } set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - cpumask_clear(cs->cpus_allowed); - cpumask_clear(cs->cpus_requested); nodes_clear(cs->mems_allowed); - cpumask_clear(cs->effective_cpus); nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; return &cs->css; - -free_requested: - free_cpumask_var(cs->cpus_requested); -free_allowed: - free_cpumask_var(cs->cpus_allowed); -free_cs: - kfree(cs); - return ERR_PTR(-ENOMEM); } static int cpuset_css_online(struct cgroup_subsys_state *css) @@ -1995,6 +2886,7 @@ if (!parent) return 0; + get_online_cpus(); mutex_lock(&cpuset_mutex); set_bit(CS_ONLINE, &cs->flags); @@ -2009,6 +2901,8 @@ if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; + cs->use_parent_ecpus 
= true; + parent->child_ecpus_count++; } spin_unlock_irq(&callback_lock); @@ -2046,38 +2940,54 @@ spin_unlock_irq(&callback_lock); out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return 0; } /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). + * will call rebuild_sched_domains_locked(). That is not needed + * in the default hierarchy where only changes in partition + * will cause repartitioning. + * + * If the cpuset has the 'sched.partition' flag enabled, simulate + * turning 'sched.partition" off. */ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); + get_online_cpus(); mutex_lock(&cpuset_mutex); - if (is_sched_load_balance(cs)) + if (is_partition_root(cs)) + update_prstate(cs, 0); + + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + + if (cs->use_parent_ecpus) { + struct cpuset *parent = parent_cs(cs); + + cs->use_parent_ecpus = false; + parent->child_ecpus_count--; + } cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); + put_online_cpus(); } static void cpuset_css_free(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - free_cpumask_var(cs->effective_cpus); - free_cpumask_var(cs->cpus_allowed); - free_cpumask_var(cs->cpus_requested); - kfree(cs); + free_cpuset(cs); } static void cpuset_bind(struct cgroup_subsys_state *root_css) @@ -2105,10 +3015,13 @@ */ static void cpuset_fork(struct task_struct *task) { + int inherit_cpus = 0; if (task_css_is_root(task, cpuset_cgrp_id)) return; - set_cpus_allowed_ptr(task, ¤t->cpus_allowed); + trace_android_rvh_cpuset_fork(task, &inherit_cpus); + if (!inherit_cpus) + set_cpus_allowed_ptr(task, current->cpus_ptr); task->mems_allowed = current->mems_allowed; } @@ -2123,22 +3036,23 @@ .post_attach = cpuset_post_attach, .bind = cpuset_bind, .fork = cpuset_fork, - .legacy_cftypes = files, + .legacy_cftypes = legacy_files, + .dfl_cftypes = dfl_files, .early_init = true, + .threaded = true, }; /** * cpuset_init - initialize cpusets at system boot * - * Description: Initialize top_cpuset and the cpuset internal file system, + * Description: Initialize top_cpuset **/ int __init cpuset_init(void) { - int err = 0; - BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); @@ -2150,10 +3064,6 @@ fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); top_cpuset.relax_domain_level = -1; - - err = register_filesystem(&cpuset_fs_type); - if (err < 0) - return err; BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); @@ -2247,20 +3157,29 @@ update_tasks_nodemask(cs); } +static bool force_rebuild; + +void cpuset_force_rebuild(void) +{ + force_rebuild = true; +} + /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest + * @tmp: the tmpmasks structure pointer * * Compare @cs's cpu and mem masks against top_cpuset and if some have gone * offline, update @cs accordingly. If @cs ends up with no CPU or memory, * all its tasks are moved to the nearest ancestor with both resources. 
*/ -static void cpuset_hotplug_update_tasks(struct cpuset *cs) +static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) { static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated; bool mems_updated; + struct cpuset *parent; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -2275,9 +3194,64 @@ goto retry; } - cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus); - nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); + parent = parent_cs(cs); + compute_effective_cpumask(&new_cpus, cs, parent); + nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); + if (cs->nr_subparts_cpus) + /* + * Make sure that CPUs allocated to child partitions + * do not show up in effective_cpus. + */ + cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus); + + if (!tmp || !cs->partition_root_state) + goto update_tasks; + + /* + * In the unlikely event that a partition root has empty + * effective_cpus or its parent becomes erroneous, we have to + * transition it to the erroneous state. + */ + if (is_partition_root(cs) && (cpumask_empty(&new_cpus) || + (parent->partition_root_state == PRS_ERROR))) { + if (cs->nr_subparts_cpus) { + spin_lock_irq(&callback_lock); + cs->nr_subparts_cpus = 0; + cpumask_clear(cs->subparts_cpus); + spin_unlock_irq(&callback_lock); + compute_effective_cpumask(&new_cpus, cs, parent); + } + + /* + * If the effective_cpus is empty because the child + * partitions take away all the CPUs, we can keep + * the current partition and let the child partitions + * fight for available CPUs. + */ + if ((parent->partition_root_state == PRS_ERROR) || + cpumask_empty(&new_cpus)) { + update_parent_subparts_cpumask(cs, partcmd_disable, + NULL, tmp); + spin_lock_irq(&callback_lock); + cs->partition_root_state = PRS_ERROR; + spin_unlock_irq(&callback_lock); + } + cpuset_force_rebuild(); + } + + /* + * On the other hand, an erroneous partition root may be transitioned + * back to a regular one or a partition root with no CPU allocated + * from the parent may change to erroneous. + */ + if (is_partition_root(parent) && + ((cs->partition_root_state == PRS_ERROR) || + !cpumask_intersects(&new_cpus, parent->subparts_cpus)) && + update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp)) + cpuset_force_rebuild(); + +update_tasks: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); @@ -2289,13 +3263,6 @@ cpus_updated, mems_updated); mutex_unlock(&cpuset_mutex); -} - -static bool force_rebuild; - -void cpuset_force_rebuild(void) -{ - force_rebuild = true; } /** @@ -2314,12 +3281,16 @@ * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +void cpuset_hotplug_workfn(struct work_struct *work) { static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; bool on_dfl = is_in_v2_mode(); + struct tmpmasks tmp, *ptmp = NULL; + + if (on_dfl && !alloc_cpumasks(NULL, &tmp)) + ptmp = &tmp; mutex_lock(&cpuset_mutex); @@ -2327,14 +3298,42 @@ cpumask_copy(&new_cpus, cpu_active_mask); new_mems = node_states[N_MEMORY]; + /* + * If subparts_cpus is populated, it is likely that the check below + * will produce a false positive on cpus_updated when the cpu list + * isn't changed. It is extra work, but it is better to be safe. 
+ */ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); + + /* + * In the rare case that hotplug removes all the cpus in subparts_cpus, + * we assumed that cpus are updated. + */ + if (!cpus_updated && top_cpuset.nr_subparts_cpus) + cpus_updated = true; /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + /* + * Make sure that CPUs allocated to child partitions + * do not show up in effective_cpus. If no CPU is left, + * we clear the subparts_cpus & let the child partitions + * fight for the CPUs again. + */ + if (top_cpuset.nr_subparts_cpus) { + if (cpumask_subset(&new_cpus, + top_cpuset.subparts_cpus)) { + top_cpuset.nr_subparts_cpus = 0; + cpumask_clear(top_cpuset.subparts_cpus); + } else { + cpumask_andnot(&new_cpus, &new_cpus, + top_cpuset.subparts_cpus); + } + } cpumask_copy(top_cpuset.effective_cpus, &new_cpus); spin_unlock_irq(&callback_lock); /* we don't mess with cpumasks of tasks in top_cpuset */ @@ -2363,7 +3362,7 @@ continue; rcu_read_unlock(); - cpuset_hotplug_update_tasks(cs); + cpuset_hotplug_update_tasks(cs, ptmp); rcu_read_lock(); css_put(&cs->css); @@ -2376,6 +3375,8 @@ force_rebuild = false; rebuild_sched_domains(); } + + free_cpumasks(NULL, ptmp); } void cpuset_update_active_cpus(void) @@ -2386,6 +3387,11 @@ * to a work item to avoid reverse locking order. */ schedule_work(&cpuset_hotplug_work); +} + +void cpuset_update_active_cpus_affine(int cpu) +{ + schedule_work_on(cpu, &cpuset_hotplug_work); } void cpuset_wait_for_hotplug(void) @@ -2417,8 +3423,11 @@ */ void __init cpuset_init_smp(void) { - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - top_cpuset.mems_allowed = node_states[N_MEMORY]; + /* + * cpus_allowd/mems_allowed set to v2 values in the initial + * cpuset_bind() call will be reset to v1 values in another + * cpuset_bind() call when v1 cpuset is mounted. + */ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); @@ -2447,11 +3456,11 @@ spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); - guarantee_online_cpus(task_cs(tsk), pmask); + guarantee_online_cpus(tsk, pmask); rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); } - +EXPORT_SYMBOL_GPL(cpuset_cpus_allowed); /** * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. * @tsk: pointer to task_struct with which the scheduler is struggling @@ -2466,9 +3475,17 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + const struct cpumask *cs_mask; + rcu_read_lock(); - do_set_cpus_allowed(tsk, is_in_v2_mode() ? - task_cs(tsk)->cpus_allowed : cpu_possible_mask); + cs_mask = task_cs(tsk)->cpus_allowed; + + if (!is_in_v2_mode() || !cpumask_subset(cs_mask, possible_mask)) + goto unlock; /* select_fallback_rq will try harder */ + + do_set_cpus_allowed(tsk, cs_mask); +unlock: rcu_read_unlock(); /* @@ -2699,9 +3716,9 @@ rcu_read_lock(); cgrp = task_cs(current)->css.cgroup; - pr_info("%s cpuset=", current->comm); + pr_cont(",cpuset="); pr_cont_cgroup_name(cgrp); - pr_cont(" mems_allowed=%*pbl\n", + pr_cont(",mems_allowed=%*pbl", nodemask_pr_args(¤t->mems_allowed)); rcu_read_unlock(); -- Gitblit v1.6.2
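Notes below the trailer (anything after the "-- " line is ignored by git am):

The central invariant this patch maintains on the default hierarchy is that the CPUs granted by a parent cpuset are effective_cpus plus subparts_cpus, with the two masks kept disjoint, and that offlined CPUs may linger in subparts_cpus but are filtered out of effective_cpus via cpu_active_mask. The sketch below is a minimal userspace model of compute_effective_cpumask() using flat bitmasks instead of the kernel cpumask API; the struct, the function name and the values in main() are illustrative assumptions, not code taken from the patch.

#include <stdio.h>

typedef unsigned long mask_t;            /* one bit per CPU, CPU0 = bit 0 */

struct cs {                              /* only the fields the model needs */
        mask_t cpus_requested;
        mask_t effective_cpus;
        mask_t subparts_cpus;
        int nr_subparts_cpus;
};

/*
 * Mirrors compute_effective_cpumask(): CPUs handed to child partitions
 * still count as granted by the parent, but offlined ones (which may
 * linger in subparts_cpus) are masked out with cpu_active_mask.
 */
static mask_t compute_effective(const struct cs *c, const struct cs *parent,
                                mask_t cpu_active_mask)
{
        mask_t new_cpus;

        if (parent->nr_subparts_cpus) {
                new_cpus = parent->effective_cpus | parent->subparts_cpus;
                new_cpus &= c->cpus_requested;
                new_cpus &= cpu_active_mask;
        } else {
                new_cpus = c->cpus_requested & parent->effective_cpus;
        }
        return new_cpus;
}

int main(void)
{
        /* Parent was granted CPUs 0-5; CPUs 4-5 already belong to a child
         * partition and CPU 5 is currently offline. */
        struct cs parent = {
                .effective_cpus   = 0x0f,       /* CPUs 0-3 */
                .subparts_cpus    = 0x30,       /* CPUs 4-5 */
                .nr_subparts_cpus = 2,
        };
        struct cs child = { .cpus_requested = 0x3c };   /* wants CPUs 2-5 */
        mask_t cpu_active_mask = ~0x20UL;               /* CPU 5 offline */

        printf("effective_cpus = %#lx\n",
               compute_effective(&child, &parent, cpu_active_mask));
        /* prints 0x1c: CPUs 2-4; CPU 5 is dropped because it is offline */
        return 0;
}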
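The partcmd_update-with-newmask arithmetic in update_parent_subparts_cpumask() is compact enough to misread, so here is the same two-line computation as a standalone model, again with flat bitmasks; the CPU layout in main() is a made-up example, not a configuration from the patch.

#include <stdio.h>

typedef unsigned long mask_t;

/*
 * partcmd_update with a newmask, as in update_parent_subparts_cpumask():
 *   delmask = cpus_allowed & ~newmask & parent->subparts_cpus
 *   addmask = newmask & parent->effective_cpus & ~parent->subparts_cpus
 */
static void compute_update(mask_t cpus_allowed, mask_t newmask,
                           mask_t parent_effective, mask_t parent_subparts,
                           mask_t *addmask, mask_t *delmask)
{
        *delmask = cpus_allowed & ~newmask & parent_subparts;
        *addmask = newmask & parent_effective & ~parent_subparts;
}

int main(void)
{
        /* The parent keeps CPUs 0-1 in effective_cpus; CPUs 2-5 sit in its
         * subparts_cpus.  The child partition currently owns CPUs 2-3 and
         * now asks for CPUs 1-2 instead. */
        mask_t parent_effective = 0x03, parent_subparts = 0x3c;
        mask_t cpus_allowed = 0x0c, newmask = 0x06;
        mask_t add, del;

        compute_update(cpus_allowed, newmask, parent_effective,
                       parent_subparts, &add, &del);
        printf("addmask=%#lx delmask=%#lx\n", add, del);
        /* addmask=0x2: CPU 1 moves from the parent's effective_cpus into
         * subparts_cpus; delmask=0x8: CPU 3 goes back to the parent. */
        return 0;
}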
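Finally, the partcmd_update path without a newmask reduces to a small transition between PRS_ENABLED and PRS_ERROR. The fragment below restates that switch under the assumption that part_error means "granting the requested CPUs would leave the parent's effective_cpus empty", which is the condition the patch computes for this case.

/* Partition root states as defined by the patch. */
#define PRS_DISABLED  0
#define PRS_ENABLED   1
#define PRS_ERROR    -1

/*
 * partcmd_update without a newmask: the root flips between PRS_ENABLED
 * and PRS_ERROR depending on part_error.
 */
static int next_prs(int cur_prs, int part_error)
{
        if (cur_prs == PRS_ENABLED && part_error)
                return PRS_ERROR;
        if (cur_prs == PRS_ERROR && !part_error)
                return PRS_ENABLED;
        return cur_prs;         /* PRS_DISABLED is never reached on this path */
}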