2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/kernel/cgroup/cpuset.c
....@@ -33,17 +33,20 @@
3333 #include <linux/interrupt.h>
3434 #include <linux/kernel.h>
3535 #include <linux/kmod.h>
36
+#include <linux/kthread.h>
3637 #include <linux/list.h>
3738 #include <linux/mempolicy.h>
3839 #include <linux/mm.h>
3940 #include <linux/memory.h>
4041 #include <linux/export.h>
4142 #include <linux/mount.h>
43
+#include <linux/fs_context.h>
4244 #include <linux/namei.h>
4345 #include <linux/pagemap.h>
4446 #include <linux/proc_fs.h>
4547 #include <linux/rcupdate.h>
4648 #include <linux/sched.h>
49
+#include <linux/sched/deadline.h>
4750 #include <linux/sched/mm.h>
4851 #include <linux/sched/task.h>
4952 #include <linux/seq_file.h>
....@@ -63,6 +66,9 @@
6366 #include <linux/mutex.h>
6467 #include <linux/cgroup.h>
6568 #include <linux/wait.h>
69
+
70
+#include <trace/hooks/sched.h>
71
+#include <trace/hooks/cgroup.h>
6672
6773 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
6874 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
....@@ -111,6 +117,16 @@
111117 nodemask_t effective_mems;
112118
113119 /*
120
+ * CPUs allocated to child sub-partitions (default hierarchy only)
121
+ * - CPUs granted by the parent = effective_cpus U subparts_cpus
122
+ * - effective_cpus and subparts_cpus are mutually exclusive.
123
+ *
124
+ * effective_cpus contains only onlined CPUs, but subparts_cpus
125
+ * may have offlined ones.
126
+ */
127
+ cpumask_var_t subparts_cpus;
128
+
129
+ /*
114130 * This is old Memory Nodes tasks took on.
115131 *
116132 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
....@@ -135,6 +151,47 @@
135151
136152 /* for custom sched domain */
137153 int relax_domain_level;
154
+
155
+ /* number of CPUs in subparts_cpus */
156
+ int nr_subparts_cpus;
157
+
158
+ /* partition root state */
159
+ int partition_root_state;
160
+
161
+ /*
162
+ * Default hierarchy only:
163
+ * use_parent_ecpus - set if using parent's effective_cpus
164
+ * child_ecpus_count - # of children with use_parent_ecpus set
165
+ */
166
+ int use_parent_ecpus;
167
+ int child_ecpus_count;
168
+};
169
+
170
+/*
171
+ * Partition root states:
172
+ *
173
+ * 0 - not a partition root
174
+ *
175
+ * 1 - partition root
176
+ *
177
+ * -1 - invalid partition root
178
+ * None of the cpus in cpus_allowed can be put into the parent's
179
+ * subparts_cpus. In this case, the cpuset is not a real partition
180
+ * root anymore. However, the CPU_EXCLUSIVE bit will still be set
181
+ * and the cpuset can be restored back to a partition root if the
182
+ * parent cpuset can give more CPUs back to this child cpuset.
183
+ */
184
+#define PRS_DISABLED 0
185
+#define PRS_ENABLED 1
186
+#define PRS_ERROR -1
187
+
188
+/*
189
+ * Temporary cpumasks for working with partitions that are passed among
190
+ * functions to avoid memory allocation in inner functions.
191
+ */
192
+struct tmpmasks {
193
+ cpumask_var_t addmask, delmask; /* For partition root */
194
+ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
138195 };
139196
140197 static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
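To make the new bookkeeping concrete, here is a small illustration with invented CPU numbers (not taken from the patch): if a parent partition root is granted CPUs 0-7 and a child cpuset with cpus_allowed = 4-7 is switched to a partition root, the parent ends up with

	parent->subparts_cpus    = 4-7	/* handed down to the child partition */
	parent->effective_cpus   = 0-3	/* granted CPUs minus subparts_cpus */
	parent->nr_subparts_cpus = 4

so effective_cpus and subparts_cpus stay mutually exclusive and their union is still exactly what the parent itself was granted. The three PRS_* values correspond to the "root", "member" and "root invalid" strings reported through the cgroup v2 cpuset.cpus.partition file added later in this patch.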
....@@ -152,19 +209,6 @@
152209 {
153210 return css_cs(cs->css.parent);
154211 }
155
-
156
-#ifdef CONFIG_NUMA
157
-static inline bool task_has_mempolicy(struct task_struct *task)
158
-{
159
- return task->mempolicy;
160
-}
161
-#else
162
-static inline bool task_has_mempolicy(struct task_struct *task)
163
-{
164
- return false;
165
-}
166
-#endif
167
-
168212
169213 /* bits in struct cpuset flags field */
170214 typedef enum {
....@@ -219,9 +263,15 @@
219263 return test_bit(CS_SPREAD_SLAB, &cs->flags);
220264 }
221265
266
+static inline int is_partition_root(const struct cpuset *cs)
267
+{
268
+ return cs->partition_root_state > 0;
269
+}
270
+
222271 static struct cpuset top_cpuset = {
223272 .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
224273 (1 << CS_MEM_EXCLUSIVE)),
274
+ .partition_root_state = PRS_ENABLED,
225275 };
226276
227277 /**
....@@ -289,21 +339,25 @@
289339 */
290340
291341 static DEFINE_MUTEX(cpuset_mutex);
292
-static DEFINE_SPINLOCK(callback_lock);
342
+static DEFINE_RAW_SPINLOCK(callback_lock);
293343
294344 static struct workqueue_struct *cpuset_migrate_mm_wq;
295345
296346 /*
297
- * CPU / memory hotplug is handled asynchronously.
347
+ * CPU / memory hotplug is handled asynchronously
348
+ * for hotplug events, but synchronously for resume_cpus.
298349 */
299
-static void cpuset_hotplug_workfn(struct work_struct *work);
300350 static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
301351
302352 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
303353
304354 /*
305
- * Cgroup v2 behavior is used when on default hierarchy or the
306
- * cgroup_v2_mode flag is set.
355
+ * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
356
+ * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
357
+ * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
358
+ * With v2 behavior, "cpus" and "mems" are always what the users have
359
+ * requested and won't be changed by hotplug events. Only the effective
360
+ * cpus or mems will be affected.
307361 */
308362 static inline bool is_in_v2_mode(void)
309363 {
....@@ -312,58 +366,45 @@
312366 }
313367
314368 /*
315
- * This is ugly, but preserves the userspace API for existing cpuset
316
- * users. If someone tries to mount the "cpuset" filesystem, we
317
- * silently switch it to mount "cgroup" instead
318
- */
319
-static struct dentry *cpuset_mount(struct file_system_type *fs_type,
320
- int flags, const char *unused_dev_name, void *data)
321
-{
322
- struct file_system_type *cgroup_fs = get_fs_type("cgroup");
323
- struct dentry *ret = ERR_PTR(-ENODEV);
324
- if (cgroup_fs) {
325
- char mountopts[] =
326
- "cpuset,noprefix,"
327
- "release_agent=/sbin/cpuset_release_agent";
328
- ret = cgroup_fs->mount(cgroup_fs, flags,
329
- unused_dev_name, mountopts);
330
- put_filesystem(cgroup_fs);
331
- }
332
- return ret;
333
-}
334
-
335
-static struct file_system_type cpuset_fs_type = {
336
- .name = "cpuset",
337
- .mount = cpuset_mount,
338
-};
339
-
340
-/*
341
- * Return in pmask the portion of a cpusets's cpus_allowed that
342
- * are online. If none are online, walk up the cpuset hierarchy
343
- * until we find one that does have some online cpus.
369
+ * Return in pmask the portion of a task's cpusets's cpus_allowed that
370
+ * are online and are capable of running the task. If none are found,
371
+ * walk up the cpuset hierarchy until we find one that does have some
372
+ * appropriate cpus.
344373 *
345374 * One way or another, we guarantee to return some non-empty subset
346
- * of cpu_online_mask.
375
+ * of cpu_active_mask.
347376 *
348377 * Call with callback_lock or cpuset_mutex held.
349378 */
350
-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
379
+static void guarantee_online_cpus(struct task_struct *tsk,
380
+ struct cpumask *pmask)
351381 {
352
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
382
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
383
+ struct cpuset *cs;
384
+
385
+ if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
386
+ cpumask_copy(pmask, cpu_active_mask);
387
+
388
+ rcu_read_lock();
389
+ cs = task_cs(tsk);
390
+
391
+ while (!cpumask_intersects(cs->effective_cpus, pmask)) {
353392 cs = parent_cs(cs);
354393 if (unlikely(!cs)) {
355394 /*
356395 * The top cpuset doesn't have any online cpu as a
357396 * consequence of a race between cpuset_hotplug_work
358397 * and cpu hotplug notifier. But we know the top
359
- * cpuset's effective_cpus is on its way to to be
398
+ * cpuset's effective_cpus is on its way to be
360399 * identical to cpu_online_mask.
361400 */
362
- cpumask_copy(pmask, cpu_online_mask);
363
- return;
401
+ goto out_unlock;
364402 }
365403 }
366
- cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
404
+ cpumask_and(pmask, pmask, cs->effective_cpus);
405
+
406
+out_unlock:
407
+ rcu_read_unlock();
367408 }
368409
369410 /*
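A hypothetical walk-through of the reworked helper, assuming an asymmetric system where only CPUs 0-3 can run a given 32-bit task:

	task_cpu_possible_mask(tsk)   = 0-3
	cpu_active_mask               = 0-7
	task_cs(tsk)->effective_cpus  = 4-7	-> no intersection, walk up
	top_cpuset.effective_cpus     = 0-7	-> pmask = 0-3

The old version only intersected the cpuset's effective_cpus with cpu_online_mask, so such a task could be handed CPUs it can never run on; taking the task and its possible mask into account avoids that.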
....@@ -420,6 +461,71 @@
420461 }
421462
422463 /**
464
+ * alloc_cpumasks - allocate cpumasks for a cpuset or a tmpmasks structure
465
+ * @cs: the cpuset that has cpumasks to be allocated.
466
+ * @tmp: the tmpmasks structure pointer
467
+ * Return: 0 if successful, -ENOMEM otherwise.
468
+ *
469
+ * Only one of the two input arguments should be non-NULL.
470
+ */
471
+static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
472
+{
473
+ cpumask_var_t *pmask1, *pmask2, *pmask3;
474
+
475
+ if (cs) {
476
+ pmask1 = &cs->cpus_allowed;
477
+ pmask2 = &cs->effective_cpus;
478
+ pmask3 = &cs->subparts_cpus;
479
+ } else {
480
+ pmask1 = &tmp->new_cpus;
481
+ pmask2 = &tmp->addmask;
482
+ pmask3 = &tmp->delmask;
483
+ }
484
+
485
+ if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
486
+ return -ENOMEM;
487
+
488
+ if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
489
+ goto free_one;
490
+
491
+ if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
492
+ goto free_two;
493
+
494
+ if (cs && !zalloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
495
+ goto free_three;
496
+
497
+ return 0;
498
+
499
+free_three:
500
+ free_cpumask_var(*pmask3);
501
+free_two:
502
+ free_cpumask_var(*pmask2);
503
+free_one:
504
+ free_cpumask_var(*pmask1);
505
+ return -ENOMEM;
506
+}
507
+
508
+/**
509
+ * free_cpumasks - free cpumasks in a cpuset or a tmpmasks structure
510
+ * @cs: the cpuset whose cpumasks are to be freed.
511
+ * @tmp: the tmpmasks structure pointer
512
+ */
513
+static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
514
+{
515
+ if (cs) {
516
+ free_cpumask_var(cs->cpus_allowed);
517
+ free_cpumask_var(cs->cpus_requested);
518
+ free_cpumask_var(cs->effective_cpus);
519
+ free_cpumask_var(cs->subparts_cpus);
520
+ }
521
+ if (tmp) {
522
+ free_cpumask_var(tmp->new_cpus);
523
+ free_cpumask_var(tmp->addmask);
524
+ free_cpumask_var(tmp->delmask);
525
+ }
526
+}
527
+
528
+/**
423529 * alloc_trial_cpuset - allocate a trial cpuset
424530 * @cs: the cpuset that the trial cpuset duplicates
425531 */
....@@ -431,37 +537,25 @@
431537 if (!trial)
432538 return NULL;
433539
434
- if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
435
- goto free_cs;
436
- if (!alloc_cpumask_var(&trial->cpus_requested, GFP_KERNEL))
437
- goto free_allowed;
438
- if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
439
- goto free_cpus;
540
+ if (alloc_cpumasks(trial, NULL)) {
541
+ kfree(trial);
542
+ return NULL;
543
+ }
440544
441545 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
442546 cpumask_copy(trial->cpus_requested, cs->cpus_requested);
443547 cpumask_copy(trial->effective_cpus, cs->effective_cpus);
444548 return trial;
445
-
446
-free_cpus:
447
- free_cpumask_var(trial->cpus_requested);
448
-free_allowed:
449
- free_cpumask_var(trial->cpus_allowed);
450
-free_cs:
451
- kfree(trial);
452
- return NULL;
453549 }
454550
455551 /**
456
- * free_trial_cpuset - free the trial cpuset
457
- * @trial: the trial cpuset to be freed
552
+ * free_cpuset - free the cpuset
553
+ * @cs: the cpuset to be freed
458554 */
459
-static void free_trial_cpuset(struct cpuset *trial)
555
+static inline void free_cpuset(struct cpuset *cs)
460556 {
461
- free_cpumask_var(trial->effective_cpus);
462
- free_cpumask_var(trial->cpus_requested);
463
- free_cpumask_var(trial->cpus_allowed);
464
- kfree(trial);
557
+ free_cpumasks(cs, NULL);
558
+ kfree(cs);
465559 }
466560
467561 /*
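The two helpers are meant to be used as a pair. A minimal sketch of the tmpmasks usage (this mirrors what update_prstate() does later in the patch, with error handling trimmed):

	struct tmpmasks tmp;

	if (alloc_cpumasks(NULL, &tmp))		/* allocates new_cpus, addmask, delmask */
		return -ENOMEM;
	/* ... compute partition changes using tmp ... */
	free_cpumasks(NULL, &tmp);

Passing a cpuset instead of NULL covers cs->cpus_allowed, cs->cpus_requested, cs->effective_cpus and cs->subparts_cpus, which is what alloc_trial_cpuset() and free_cpuset() below rely on.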
....@@ -612,7 +706,7 @@
612706 * load balancing domains (sched domains) as specified by that partial
613707 * partition.
614708 *
615
- * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt
709
+ * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
616710 * for a background explanation of this.
617711 *
618712 * Does not return errors, on the theory that the callers of this
....@@ -623,11 +717,10 @@
623717 * Must be called with cpuset_mutex held.
624718 *
625719 * The three key local variables below are:
626
- * q - a linked-list queue of cpuset pointers, used to implement a
627
- * top-down scan of all cpusets. This scan loads a pointer
628
- * to each cpuset marked is_sched_load_balance into the
629
- * array 'csa'. For our purposes, rebuilding the schedulers
630
- * sched domains, we can ignore !is_sched_load_balance cpusets.
720
+ * cp - cpuset pointer, used (together with pos_css) to perform a
721
+ * top-down scan of all cpusets. For our purposes, rebuilding
722
+ * the schedulers sched domains, we can ignore !is_sched_load_
723
+ * balance cpusets.
631724 * csa - (for CpuSet Array) Array of pointers to all the cpusets
632725 * that need to be load balanced, for convenient iterative
633726 * access by the subsequent code that finds the best partition,
....@@ -658,7 +751,7 @@
658751 static int generate_sched_domains(cpumask_var_t **domains,
659752 struct sched_domain_attr **attributes)
660753 {
661
- struct cpuset *cp; /* scans q */
754
+ struct cpuset *cp; /* top-down scan of cpusets */
662755 struct cpuset **csa; /* array of all cpuset ptrs */
663756 int csn; /* how many cpuset ptrs in csa so far */
664757 int i, j, k; /* indices for partition finding loops */
....@@ -667,13 +760,14 @@
667760 int ndoms = 0; /* number of sched domains in result */
668761 int nslot; /* next empty doms[] struct cpumask slot */
669762 struct cgroup_subsys_state *pos_css;
763
+ bool root_load_balance = is_sched_load_balance(&top_cpuset);
670764
671765 doms = NULL;
672766 dattr = NULL;
673767 csa = NULL;
674768
675769 /* Special case for the 99% of systems with one, full, sched domain */
676
- if (is_sched_load_balance(&top_cpuset)) {
770
+ if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
677771 ndoms = 1;
678772 doms = alloc_sched_domains(ndoms);
679773 if (!doms)
....@@ -696,6 +790,8 @@
696790 csn = 0;
697791
698792 rcu_read_lock();
793
+ if (root_load_balance)
794
+ csa[csn++] = &top_cpuset;
699795 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
700796 if (cp == &top_cpuset)
701797 continue;
....@@ -706,6 +802,9 @@
706802 * parent's cpus, so just skip them, and then we call
707803 * update_domain_attr_tree() to calc relax_domain_level of
708804 * the corresponding sched domain.
805
+ *
806
+ * If root is load-balancing, we can skip @cp if it
807
+ * is a subset of the root's effective_cpus.
709808 */
710809 if (!cpumask_empty(cp->cpus_allowed) &&
711810 !(is_sched_load_balance(cp) &&
....@@ -713,11 +812,17 @@
713812 housekeeping_cpumask(HK_FLAG_DOMAIN))))
714813 continue;
715814
716
- if (is_sched_load_balance(cp))
815
+ if (root_load_balance &&
816
+ cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
817
+ continue;
818
+
819
+ if (is_sched_load_balance(cp) &&
820
+ !cpumask_empty(cp->effective_cpus))
717821 csa[csn++] = cp;
718822
719
- /* skip @cp's subtree */
720
- pos_css = css_rightmost_descendant(pos_css);
823
+ /* skip @cp's subtree if not a partition root */
824
+ if (!is_partition_root(cp))
825
+ pos_css = css_rightmost_descendant(pos_css);
721826 }
722827 rcu_read_unlock();
723828
....@@ -820,6 +925,65 @@
820925 return ndoms;
821926 }
822927
928
+static void update_tasks_root_domain(struct cpuset *cs)
929
+{
930
+ struct css_task_iter it;
931
+ struct task_struct *task;
932
+
933
+ css_task_iter_start(&cs->css, 0, &it);
934
+
935
+ while ((task = css_task_iter_next(&it)))
936
+ dl_add_task_root_domain(task);
937
+
938
+ css_task_iter_end(&it);
939
+}
940
+
941
+static void rebuild_root_domains(void)
942
+{
943
+ struct cpuset *cs = NULL;
944
+ struct cgroup_subsys_state *pos_css;
945
+
946
+ lockdep_assert_held(&cpuset_mutex);
947
+ lockdep_assert_cpus_held();
948
+ lockdep_assert_held(&sched_domains_mutex);
949
+
950
+ rcu_read_lock();
951
+
952
+ /*
953
+ * Clear default root domain DL accounting, it will be computed again
954
+ * if a task belongs to it.
955
+ */
956
+ dl_clear_root_domain(&def_root_domain);
957
+
958
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
959
+
960
+ if (cpumask_empty(cs->effective_cpus)) {
961
+ pos_css = css_rightmost_descendant(pos_css);
962
+ continue;
963
+ }
964
+
965
+ css_get(&cs->css);
966
+
967
+ rcu_read_unlock();
968
+
969
+ update_tasks_root_domain(cs);
970
+
971
+ rcu_read_lock();
972
+ css_put(&cs->css);
973
+ }
974
+ rcu_read_unlock();
975
+}
976
+
977
+static void
978
+partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
979
+ struct sched_domain_attr *dattr_new)
980
+{
981
+ mutex_lock(&sched_domains_mutex);
982
+ partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
983
+ rebuild_root_domains();
984
+ mutex_unlock(&sched_domains_mutex);
985
+}
986
+
823987 /*
824988 * Rebuild scheduler domains.
825989 *
....@@ -833,28 +997,53 @@
833997 */
834998 static void rebuild_sched_domains_locked(void)
835999 {
1000
+ struct cgroup_subsys_state *pos_css;
8361001 struct sched_domain_attr *attr;
8371002 cpumask_var_t *doms;
1003
+ struct cpuset *cs;
8381004 int ndoms;
8391005
8401006 lockdep_assert_held(&cpuset_mutex);
841
- get_online_cpus();
8421007
8431008 /*
844
- * We have raced with CPU hotplug. Don't do anything to avoid
1009
+ * If we have raced with CPU hotplug, return early to avoid
8451010 * passing doms with offlined cpu to partition_sched_domains().
846
- * Anyways, hotplug work item will rebuild sched domains.
1011
+ * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
1012
+ *
1013
+ * With no CPUs in any subpartitions, top_cpuset's effective CPUs
1014
+ * should be the same as the active CPUs, so checking only top_cpuset
1015
+ * is enough to detect racing CPU offlines.
8471016 */
848
- if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
849
- goto out;
1017
+ if (!top_cpuset.nr_subparts_cpus &&
1018
+ !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
1019
+ return;
1020
+
1021
+ /*
1022
+ * With subpartition CPUs, however, the effective CPUs of a partition
1023
+ * root should be only a subset of the active CPUs. Since a CPU in any
1024
+ * partition root could be offlined, all must be checked.
1025
+ */
1026
+ if (top_cpuset.nr_subparts_cpus) {
1027
+ rcu_read_lock();
1028
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
1029
+ if (!is_partition_root(cs)) {
1030
+ pos_css = css_rightmost_descendant(pos_css);
1031
+ continue;
1032
+ }
1033
+ if (!cpumask_subset(cs->effective_cpus,
1034
+ cpu_active_mask)) {
1035
+ rcu_read_unlock();
1036
+ return;
1037
+ }
1038
+ }
1039
+ rcu_read_unlock();
1040
+ }
8501041
8511042 /* Generate domain masks and attrs */
8521043 ndoms = generate_sched_domains(&doms, &attr);
8531044
8541045 /* Have scheduler rebuild the domains */
855
- partition_sched_domains(ndoms, doms, attr);
856
-out:
857
- put_online_cpus();
1046
+ partition_and_rebuild_sched_domains(ndoms, doms, attr);
8581047 }
8591048 #else /* !CONFIG_SMP */
8601049 static void rebuild_sched_domains_locked(void)
....@@ -864,9 +1053,23 @@
8641053
8651054 void rebuild_sched_domains(void)
8661055 {
1056
+ get_online_cpus();
8671057 mutex_lock(&cpuset_mutex);
8681058 rebuild_sched_domains_locked();
8691059 mutex_unlock(&cpuset_mutex);
1060
+ put_online_cpus();
1061
+}
1062
+
1063
+static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p,
1064
+ const struct cpumask *new_mask)
1065
+{
1066
+ int ret = -EINVAL;
1067
+
1068
+ trace_android_rvh_update_cpus_allowed(p, cs->cpus_requested, new_mask, &ret);
1069
+ if (!ret)
1070
+ return ret;
1071
+
1072
+ return set_cpus_allowed_ptr(p, new_mask);
8701073 }
8711074
8721075 /**
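update_cpus_allowed() is the Android-specific indirection for what used to be direct set_cpus_allowed_ptr() calls in this file: ret starts at -EINVAL, and only a registered android_rvh_update_cpus_allowed handler that sets *ret to 0 short-circuits the default set_cpus_allowed_ptr() path. A vendor handler could look roughly like the sketch below; the exact prototype is dictated by the hook declaration in trace/hooks/sched.h, so the parameter list here is an assumption:

	static void vendor_update_cpus_allowed(void *unused, struct task_struct *p,
					       cpumask_var_t cpus_requested,
					       const struct cpumask *new_mask, int *ret)
	{
		/* example policy: honour the cpuset's requested mask instead */
		*ret = set_cpus_allowed_ptr(p, cpus_requested);
	}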
....@@ -881,17 +1084,268 @@
8811084 {
8821085 struct css_task_iter it;
8831086 struct task_struct *task;
1087
+ bool top_cs = cs == &top_cpuset;
8841088
8851089 css_task_iter_start(&cs->css, 0, &it);
886
- while ((task = css_task_iter_next(&it)))
887
- set_cpus_allowed_ptr(task, cs->effective_cpus);
1090
+ while ((task = css_task_iter_next(&it))) {
1091
+ /*
1092
+ * Percpu kthreads in top_cpuset are ignored
1093
+ */
1094
+ if (top_cs && (task->flags & PF_KTHREAD) &&
1095
+ kthread_is_per_cpu(task))
1096
+ continue;
1097
+ update_cpus_allowed(cs, task, cs->effective_cpus);
1098
+ }
8881099 css_task_iter_end(&it);
1100
+}
1101
+
1102
+/**
1103
+ * compute_effective_cpumask - Compute the effective cpumask of the cpuset
1104
+ * @new_cpus: the temp variable for the new effective_cpus mask
1105
+ * @cs: the cpuset that needs to recompute the new effective_cpus mask
1106
+ * @parent: the parent cpuset
1107
+ *
1108
+ * If the parent has subpartition CPUs, include them in the list of
1109
+ * allowable CPUs in computing the new effective_cpus mask. Since offlined
1110
+ * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
1111
+ * to mask those out.
1112
+ */
1113
+static void compute_effective_cpumask(struct cpumask *new_cpus,
1114
+ struct cpuset *cs, struct cpuset *parent)
1115
+{
1116
+ if (parent->nr_subparts_cpus) {
1117
+ cpumask_or(new_cpus, parent->effective_cpus,
1118
+ parent->subparts_cpus);
1119
+ cpumask_and(new_cpus, new_cpus, cs->cpus_requested);
1120
+ cpumask_and(new_cpus, new_cpus, cpu_active_mask);
1121
+ } else {
1122
+ cpumask_and(new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
1123
+ }
1124
+}
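A quick worked example with made-up masks: if parent->effective_cpus = 0-1, parent->subparts_cpus = 2-3, cs->cpus_requested = 1-2,5 and CPU 5 is offline, the first branch computes new_cpus = ((0-1 | 2-3) & (1-2,5)) & cpu_active_mask = 1-2. When the parent has no subpartitions, the else branch is the pre-existing behaviour: cpus_requested masked by the parent's effective_cpus alone.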
1125
+
1126
+/*
1127
+ * Commands for update_parent_subparts_cpumask
1128
+ */
1129
+enum subparts_cmd {
1130
+ partcmd_enable, /* Enable partition root */
1131
+ partcmd_disable, /* Disable partition root */
1132
+ partcmd_update, /* Update parent's subparts_cpus */
1133
+};
1134
+
1135
+/**
1136
+ * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
1137
+ * @cpuset: The cpuset that requests change in partition root state
1138
+ * @cmd: Partition root state change command
1139
+ * @newmask: Optional new cpumask for partcmd_update
1140
+ * @tmp: Temporary addmask and delmask
1141
+ * Return: 0, 1 or an error code
1142
+ *
1143
+ * For partcmd_enable, the cpuset is being transformed from a non-partition
1144
+ * root to a partition root. The cpus_allowed mask of the given cpuset will
1145
+ * be put into parent's subparts_cpus and taken away from parent's
1146
+ * effective_cpus. The function will return 0 if all the CPUs listed in
1147
+ * cpus_allowed can be granted or an error code will be returned.
1148
+ *
1149
+ * For partcmd_disable, the cpuset is being transformed from a partition
1150
+ * root back to a non-partition root. Any CPUs in cpus_allowed that are in
1151
+ * parent's subparts_cpus will be taken away from that cpumask and put back
1152
+ * into parent's effective_cpus. 0 should always be returned.
1153
+ *
1154
+ * For partcmd_update, if the optional newmask is specified, the cpu
1155
+ * list is to be changed from cpus_allowed to newmask. Otherwise,
1156
+ * cpus_allowed is assumed to remain the same. The cpuset should either
1157
+ * be a partition root or an invalid partition root. The partition root
1158
+ * state may change if newmask is NULL and none of the requested CPUs can
1159
+ * be granted by the parent. The function will return 1 if changes to
1160
+ * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
1161
+ * Error code should only be returned when newmask is non-NULL.
1162
+ *
1163
+ * The partcmd_enable and partcmd_disable commands are used by
1164
+ * update_prstate(). The partcmd_update command is used by
1165
+ * update_cpumasks_hier() with newmask NULL and update_cpumask() with
1166
+ * newmask set.
1167
+ *
1168
+ * The checking is more strict when enabling partition root than the
1169
+ * other two commands.
1170
+ *
1171
+ * Because of the implicit cpu exclusive nature of a partition root,
1172
+ * cpumask changes that violate the cpu exclusivity rule will not be
1173
+ * permitted when checked by validate_change(). The validate_change()
1174
+ * function will also prevent any changes to the cpu list if it is not
1175
+ * a superset of children's cpu lists.
1176
+ */
1177
+static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
1178
+ struct cpumask *newmask,
1179
+ struct tmpmasks *tmp)
1180
+{
1181
+ struct cpuset *parent = parent_cs(cpuset);
1182
+ int adding; /* Moving cpus from effective_cpus to subparts_cpus */
1183
+ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
1184
+ int new_prs;
1185
+ bool part_error = false; /* Partition error? */
1186
+
1187
+ lockdep_assert_held(&cpuset_mutex);
1188
+
1189
+ /*
1190
+ * The parent must be a partition root.
1191
+ * The new cpumask, if present, or the current cpus_allowed must
1192
+ * not be empty.
1193
+ */
1194
+ if (!is_partition_root(parent) ||
1195
+ (newmask && cpumask_empty(newmask)) ||
1196
+ (!newmask && cpumask_empty(cpuset->cpus_allowed)))
1197
+ return -EINVAL;
1198
+
1199
+ /*
1200
+ * Enabling/disabling partition root is not allowed if there are
1201
+ * online children.
1202
+ */
1203
+ if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
1204
+ return -EBUSY;
1205
+
1206
+ /*
1207
+ * Enabling partition root is not allowed if not all the CPUs
1208
+ * can be granted from parent's effective_cpus or at least one
1209
+ * CPU will be left after that.
1210
+ */
1211
+ if ((cmd == partcmd_enable) &&
1212
+ (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
1213
+ cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
1214
+ return -EINVAL;
1215
+
1216
+ /*
1217
+ * A cpumask update cannot make parent's effective_cpus become empty.
1218
+ */
1219
+ adding = deleting = false;
1220
+ new_prs = cpuset->partition_root_state;
1221
+ if (cmd == partcmd_enable) {
1222
+ cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
1223
+ adding = true;
1224
+ } else if (cmd == partcmd_disable) {
1225
+ deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
1226
+ parent->subparts_cpus);
1227
+ } else if (newmask) {
1228
+ /*
1229
+ * partcmd_update with newmask:
1230
+ *
1231
+ * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
1232
+ * addmask = newmask & parent->effective_cpus
1233
+ * & ~parent->subparts_cpus
1234
+ */
1235
+ cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
1236
+ deleting = cpumask_and(tmp->delmask, tmp->delmask,
1237
+ parent->subparts_cpus);
1238
+
1239
+ cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
1240
+ adding = cpumask_andnot(tmp->addmask, tmp->addmask,
1241
+ parent->subparts_cpus);
1242
+ /*
1243
+ * Return error if the new effective_cpus could become empty.
1244
+ */
1245
+ if (adding &&
1246
+ cpumask_equal(parent->effective_cpus, tmp->addmask)) {
1247
+ if (!deleting)
1248
+ return -EINVAL;
1249
+ /*
1250
+ * As some of the CPUs in subparts_cpus might have
1251
+ * been offlined, we need to compute the real delmask
1252
+ * to confirm that.
1253
+ */
1254
+ if (!cpumask_and(tmp->addmask, tmp->delmask,
1255
+ cpu_active_mask))
1256
+ return -EINVAL;
1257
+ cpumask_copy(tmp->addmask, parent->effective_cpus);
1258
+ }
1259
+ } else {
1260
+ /*
1261
+ * partcmd_update w/o newmask:
1262
+ *
1263
+ * addmask = cpus_allowed & parent->effective_cpus
1264
+ *
1265
+ * Note that parent's subparts_cpus may have been
1266
+ * pre-shrunk in case there is a change in the cpu list.
1267
+ * So no deletion is needed.
1268
+ */
1269
+ adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
1270
+ parent->effective_cpus);
1271
+ part_error = cpumask_equal(tmp->addmask,
1272
+ parent->effective_cpus);
1273
+ }
1274
+
1275
+ if (cmd == partcmd_update) {
1276
+ int prev_prs = cpuset->partition_root_state;
1277
+
1278
+ /*
1279
+ * Check for possible transition between PRS_ENABLED
1280
+ * and PRS_ERROR.
1281
+ */
1282
+ switch (cpuset->partition_root_state) {
1283
+ case PRS_ENABLED:
1284
+ if (part_error)
1285
+ new_prs = PRS_ERROR;
1286
+ break;
1287
+ case PRS_ERROR:
1288
+ if (!part_error)
1289
+ new_prs = PRS_ENABLED;
1290
+ break;
1291
+ }
1292
+ /*
1293
+ * Set part_error if previously in invalid state.
1294
+ */
1295
+ part_error = (prev_prs == PRS_ERROR);
1296
+ }
1297
+
1298
+ if (!part_error && (new_prs == PRS_ERROR))
1299
+ return 0; /* Nothing needs to be done */
1300
+
1301
+ if (new_prs == PRS_ERROR) {
1302
+ /*
1303
+ * Remove all its cpus from parent's subparts_cpus.
1304
+ */
1305
+ adding = false;
1306
+ deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
1307
+ parent->subparts_cpus);
1308
+ }
1309
+
1310
+ if (!adding && !deleting && (new_prs == cpuset->partition_root_state))
1311
+ return 0;
1312
+
1313
+ /*
1314
+ * Change the parent's subparts_cpus.
1315
+ * Newly added CPUs will be removed from effective_cpus and
1316
+ * newly deleted ones will be added back to effective_cpus.
1317
+ */
1318
+ raw_spin_lock_irq(&callback_lock);
1319
+ if (adding) {
1320
+ cpumask_or(parent->subparts_cpus,
1321
+ parent->subparts_cpus, tmp->addmask);
1322
+ cpumask_andnot(parent->effective_cpus,
1323
+ parent->effective_cpus, tmp->addmask);
1324
+ }
1325
+ if (deleting) {
1326
+ cpumask_andnot(parent->subparts_cpus,
1327
+ parent->subparts_cpus, tmp->delmask);
1328
+ /*
1329
+ * Some of the CPUs in subparts_cpus might have been offlined.
1330
+ */
1331
+ cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
1332
+ cpumask_or(parent->effective_cpus,
1333
+ parent->effective_cpus, tmp->delmask);
1334
+ }
1335
+
1336
+ parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
1337
+
1338
+ if (cpuset->partition_root_state != new_prs)
1339
+ cpuset->partition_root_state = new_prs;
1340
+ raw_spin_unlock_irq(&callback_lock);
1341
+
1342
+ return cmd == partcmd_update;
8891343 }
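To make the partcmd_update arithmetic concrete (an illustrative example, not taken from the patch): assume parent->effective_cpus = 0-3, parent->subparts_cpus = 4-7, and a child partition root with cpus_allowed = 4-7 is given newmask = 2-5. Then

	delmask = cpus_allowed & ~newmask & parent->subparts_cpus           = 6-7
	addmask = newmask & parent->effective_cpus & ~parent->subparts_cpus = 2-3

so CPUs 6-7 go back into the parent's effective_cpus while CPUs 2-3 move into subparts_cpus, leaving parent->effective_cpus = 0-1,6-7 and parent->subparts_cpus = 2-5, and the function returns 1 because the parent's masks changed.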
8901344
8911345 /*
8921346 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
893
- * @cs: the cpuset to consider
894
- * @new_cpus: temp variable for calculating new effective_cpus
1347
+ * @cs: the cpuset to consider
1348
+ * @tmp: temp variables for calculating effective_cpus & partition setup
8951349 *
8961350 * When congifured cpumask is changed, the effective cpumasks of this cpuset
8971351 * and all its descendants need to be updated.
....@@ -900,38 +1354,127 @@
9001354 *
9011355 * Called with cpuset_mutex held
9021356 */
903
-static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
1357
+static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
9041358 {
9051359 struct cpuset *cp;
9061360 struct cgroup_subsys_state *pos_css;
9071361 bool need_rebuild_sched_domains = false;
1362
+ int new_prs;
9081363
9091364 rcu_read_lock();
9101365 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
9111366 struct cpuset *parent = parent_cs(cp);
9121367
913
- cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
1368
+ compute_effective_cpumask(tmp->new_cpus, cp, parent);
9141369
9151370 /*
9161371 * If it becomes empty, inherit the effective mask of the
9171372 * parent, which is guaranteed to have some CPUs.
9181373 */
919
- if (is_in_v2_mode() && cpumask_empty(new_cpus))
920
- cpumask_copy(new_cpus, parent->effective_cpus);
1374
+ if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
1375
+ cpumask_copy(tmp->new_cpus, parent->effective_cpus);
1376
+ if (!cp->use_parent_ecpus) {
1377
+ cp->use_parent_ecpus = true;
1378
+ parent->child_ecpus_count++;
1379
+ }
1380
+ } else if (cp->use_parent_ecpus) {
1381
+ cp->use_parent_ecpus = false;
1382
+ WARN_ON_ONCE(!parent->child_ecpus_count);
1383
+ parent->child_ecpus_count--;
1384
+ }
9211385
922
- /* Skip the whole subtree if the cpumask remains the same. */
923
- if (cpumask_equal(new_cpus, cp->effective_cpus)) {
1386
+ /*
1387
+ * Skip the whole subtree if the cpumask remains the same
1388
+ * and has no partition root state.
1389
+ */
1390
+ if (!cp->partition_root_state &&
1391
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
9241392 pos_css = css_rightmost_descendant(pos_css);
9251393 continue;
1394
+ }
1395
+
1396
+ /*
1397
+ * update_parent_subparts_cpumask() should have been called
1398
+ * for cs already in update_cpumask(). We should also call
1399
+ * update_tasks_cpumask() again for tasks in the parent
1400
+ * cpuset if the parent's subparts_cpus changes.
1401
+ */
1402
+ new_prs = cp->partition_root_state;
1403
+ if ((cp != cs) && new_prs) {
1404
+ switch (parent->partition_root_state) {
1405
+ case PRS_DISABLED:
1406
+ /*
1407
+ * If parent is not a partition root or an
1408
+ * invalid partition root, clear its state
1409
+ * and its CS_CPU_EXCLUSIVE flag.
1410
+ */
1411
+ WARN_ON_ONCE(cp->partition_root_state
1412
+ != PRS_ERROR);
1413
+ new_prs = PRS_DISABLED;
1414
+
1415
+ /*
1416
+ * clear_bit() is an atomic operation and
1417
+ * readers aren't interested in the state
1418
+ * of CS_CPU_EXCLUSIVE anyway. So we can
1419
+ * just update the flag without holding
1420
+ * the callback_lock.
1421
+ */
1422
+ clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
1423
+ break;
1424
+
1425
+ case PRS_ENABLED:
1426
+ if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
1427
+ update_tasks_cpumask(parent);
1428
+ break;
1429
+
1430
+ case PRS_ERROR:
1431
+ /*
1432
+ * When the parent is invalid, the child has to be invalid too.
1433
+ */
1434
+ new_prs = PRS_ERROR;
1435
+ break;
1436
+ }
9261437 }
9271438
9281439 if (!css_tryget_online(&cp->css))
9291440 continue;
9301441 rcu_read_unlock();
9311442
932
- spin_lock_irq(&callback_lock);
933
- cpumask_copy(cp->effective_cpus, new_cpus);
934
- spin_unlock_irq(&callback_lock);
1443
+ raw_spin_lock_irq(&callback_lock);
1444
+
1445
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1446
+ if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
1447
+ cp->nr_subparts_cpus = 0;
1448
+ cpumask_clear(cp->subparts_cpus);
1449
+ } else if (cp->nr_subparts_cpus) {
1450
+ /*
1451
+ * Make sure that effective_cpus & subparts_cpus
1452
+ * are mutually exclusive.
1453
+ *
1454
+ * In the unlikely event that effective_cpus
1455
+ * becomes empty, we clear cp->nr_subparts_cpus and
1456
+ * let its child partition roots compete for
1457
+ * CPUs again.
1458
+ */
1459
+ cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
1460
+ cp->subparts_cpus);
1461
+ if (cpumask_empty(cp->effective_cpus)) {
1462
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1463
+ cpumask_clear(cp->subparts_cpus);
1464
+ cp->nr_subparts_cpus = 0;
1465
+ } else if (!cpumask_subset(cp->subparts_cpus,
1466
+ tmp->new_cpus)) {
1467
+ cpumask_andnot(cp->subparts_cpus,
1468
+ cp->subparts_cpus, tmp->new_cpus);
1469
+ cp->nr_subparts_cpus
1470
+ = cpumask_weight(cp->subparts_cpus);
1471
+ }
1472
+ }
1473
+
1474
+ if (new_prs != cp->partition_root_state)
1475
+ cp->partition_root_state = new_prs;
1476
+
1477
+ raw_spin_unlock_irq(&callback_lock);
9351478
9361479 WARN_ON(!is_in_v2_mode() &&
9371480 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
....@@ -939,11 +1482,15 @@
9391482 update_tasks_cpumask(cp);
9401483
9411484 /*
942
- * If the effective cpumask of any non-empty cpuset is changed,
943
- * we need to rebuild sched domains.
1485
+ * On legacy hierarchy, if the effective cpumask of any non-
1486
+ * empty cpuset is changed, we need to rebuild sched domains.
1487
+ * On default hierarchy, the cpuset needs to be a partition
1488
+ * root as well.
9441489 */
9451490 if (!cpumask_empty(cp->cpus_allowed) &&
946
- is_sched_load_balance(cp))
1491
+ is_sched_load_balance(cp) &&
1492
+ (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
1493
+ is_partition_root(cp)))
9471494 need_rebuild_sched_domains = true;
9481495
9491496 rcu_read_lock();
....@@ -956,6 +1503,45 @@
9561503 }
9571504
9581505 /**
1506
+ * update_sibling_cpumasks - Update siblings cpumasks
1507
+ * @parent: Parent cpuset
1508
+ * @cs: Current cpuset
1509
+ * @tmp: Temp variables
1510
+ */
1511
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
1512
+ struct tmpmasks *tmp)
1513
+{
1514
+ struct cpuset *sibling;
1515
+ struct cgroup_subsys_state *pos_css;
1516
+
1517
+ lockdep_assert_held(&cpuset_mutex);
1518
+
1519
+ /*
1520
+ * Check all its siblings and call update_cpumasks_hier()
1521
+ * if their use_parent_ecpus flag is set in order for them
1522
+ * to use the right effective_cpus value.
1523
+ *
1524
+ * The update_cpumasks_hier() function may sleep. So we have to
1525
+ * release the RCU read lock before calling it.
1526
+ */
1527
+ rcu_read_lock();
1528
+ cpuset_for_each_child(sibling, pos_css, parent) {
1529
+ if (sibling == cs)
1530
+ continue;
1531
+ if (!sibling->use_parent_ecpus)
1532
+ continue;
1533
+ if (!css_tryget_online(&sibling->css))
1534
+ continue;
1535
+
1536
+ rcu_read_unlock();
1537
+ update_cpumasks_hier(sibling, tmp);
1538
+ rcu_read_lock();
1539
+ css_put(&sibling->css);
1540
+ }
1541
+ rcu_read_unlock();
1542
+}
1543
+
1544
+/**
9591545 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
9601546 * @cs: the cpuset to consider
9611547 * @trialcs: trial cpuset
....@@ -965,6 +1551,7 @@
9651551 const char *buf)
9661552 {
9671553 int retval;
1554
+ struct tmpmasks tmp;
9681555
9691556 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
9701557 if (cs == &top_cpuset)
....@@ -997,13 +1584,50 @@
9971584 if (retval < 0)
9981585 return retval;
9991586
1000
- spin_lock_irq(&callback_lock);
1587
+#ifdef CONFIG_CPUMASK_OFFSTACK
1588
+ /*
1589
+ * Use the cpumasks in trialcs for tmpmasks when they are pointers
1590
+ * to allocated cpumasks.
1591
+ */
1592
+ tmp.addmask = trialcs->subparts_cpus;
1593
+ tmp.delmask = trialcs->effective_cpus;
1594
+ tmp.new_cpus = trialcs->cpus_allowed;
1595
+#endif
1596
+
1597
+ if (cs->partition_root_state) {
1598
+ /* Cpumask of a partition root cannot be empty */
1599
+ if (cpumask_empty(trialcs->cpus_allowed))
1600
+ return -EINVAL;
1601
+ if (update_parent_subparts_cpumask(cs, partcmd_update,
1602
+ trialcs->cpus_allowed, &tmp) < 0)
1603
+ return -EINVAL;
1604
+ }
1605
+
1606
+ raw_spin_lock_irq(&callback_lock);
10011607 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
10021608 cpumask_copy(cs->cpus_requested, trialcs->cpus_requested);
1003
- spin_unlock_irq(&callback_lock);
10041609
1005
- /* use trialcs->cpus_allowed as a temp variable */
1006
- update_cpumasks_hier(cs, trialcs->cpus_allowed);
1610
+ /*
1611
+ * Make sure that subparts_cpus is a subset of cpus_allowed.
1612
+ */
1613
+ if (cs->nr_subparts_cpus) {
1614
+ cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed);
1615
+ cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
1616
+ }
1617
+ raw_spin_unlock_irq(&callback_lock);
1618
+
1619
+ update_cpumasks_hier(cs, &tmp);
1620
+
1621
+ if (cs->partition_root_state) {
1622
+ struct cpuset *parent = parent_cs(cs);
1623
+
1624
+ /*
1625
+ * For partition root, update the cpumasks of sibling
1626
+ * cpusets if they use parent's effective_cpus.
1627
+ */
1628
+ if (parent->child_ecpus_count)
1629
+ update_sibling_cpumasks(parent, cs, &tmp);
1630
+ }
10071631 return 0;
10081632 }
10091633
....@@ -1104,7 +1728,7 @@
11041728 guarantee_online_mems(cs, &newmems);
11051729
11061730 /*
1107
- * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1731
+ * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
11081732 * take while holding tasklist_lock. Forks can happen - the
11091733 * mpol_dup() cpuset_being_rebound check will catch such forks,
11101734 * and rebind their vma mempolicies too. Because we still hold
....@@ -1184,9 +1808,9 @@
11841808 continue;
11851809 rcu_read_unlock();
11861810
1187
- spin_lock_irq(&callback_lock);
1811
+ raw_spin_lock_irq(&callback_lock);
11881812 cp->effective_mems = *new_mems;
1189
- spin_unlock_irq(&callback_lock);
1813
+ raw_spin_unlock_irq(&callback_lock);
11901814
11911815 WARN_ON(!is_in_v2_mode() &&
11921816 !nodes_equal(cp->mems_allowed, cp->effective_mems));
....@@ -1209,7 +1833,7 @@
12091833 *
12101834 * Call with cpuset_mutex held. May take callback_lock during call.
12111835 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1212
- * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1836
+ * lock each such tasks mm->mmap_lock, scan its vma's and rebind
12131837 * their mempolicies to the cpusets new mems_allowed.
12141838 */
12151839 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
....@@ -1254,9 +1878,9 @@
12541878 if (retval < 0)
12551879 goto done;
12561880
1257
- spin_lock_irq(&callback_lock);
1881
+ raw_spin_lock_irq(&callback_lock);
12581882 cs->mems_allowed = trialcs->mems_allowed;
1259
- spin_unlock_irq(&callback_lock);
1883
+ raw_spin_unlock_irq(&callback_lock);
12601884
12611885 /* use trialcs->mems_allowed as a temp variable */
12621886 update_nodemasks_hier(cs, &trialcs->mems_allowed);
....@@ -1347,9 +1971,9 @@
13471971 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
13481972 || (is_spread_page(cs) != is_spread_page(trialcs)));
13491973
1350
- spin_lock_irq(&callback_lock);
1974
+ raw_spin_lock_irq(&callback_lock);
13511975 cs->flags = trialcs->flags;
1352
- spin_unlock_irq(&callback_lock);
1976
+ raw_spin_unlock_irq(&callback_lock);
13531977
13541978 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
13551979 rebuild_sched_domains_locked();
....@@ -1357,7 +1981,90 @@
13571981 if (spread_flag_changed)
13581982 update_tasks_flags(cs);
13591983 out:
1360
- free_trial_cpuset(trialcs);
1984
+ free_cpuset(trialcs);
1985
+ return err;
1986
+}
1987
+
1988
+/*
1989
+ * update_prstate - update partition_root_state
1990
+ * cs: the cpuset to update
1991
+ * new_prs: new partition root state
1992
+ *
1993
+ * Call with cpuset_mutex held.
1994
+ */
1995
+static int update_prstate(struct cpuset *cs, int new_prs)
1996
+{
1997
+ int err, old_prs = cs->partition_root_state;
1998
+ struct cpuset *parent = parent_cs(cs);
1999
+ struct tmpmasks tmpmask;
2000
+
2001
+ if (old_prs == new_prs)
2002
+ return 0;
2003
+
2004
+ /*
2005
+ * Cannot force a partial or invalid partition root to a full
2006
+ * partition root.
2007
+ */
2008
+ if (new_prs && (old_prs == PRS_ERROR))
2009
+ return -EINVAL;
2010
+
2011
+ if (alloc_cpumasks(NULL, &tmpmask))
2012
+ return -ENOMEM;
2013
+
2014
+ err = -EINVAL;
2015
+ if (!old_prs) {
2016
+ /*
2017
+ * Turning on partition root requires setting the
2018
+ * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
2019
+ * cannot be NULL.
2020
+ */
2021
+ if (cpumask_empty(cs->cpus_allowed))
2022
+ goto out;
2023
+
2024
+ err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
2025
+ if (err)
2026
+ goto out;
2027
+
2028
+ err = update_parent_subparts_cpumask(cs, partcmd_enable,
2029
+ NULL, &tmpmask);
2030
+ if (err) {
2031
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2032
+ goto out;
2033
+ }
2034
+ } else {
2035
+ /*
2036
+ * Turning off partition root will clear the
2037
+ * CS_CPU_EXCLUSIVE bit.
2038
+ */
2039
+ if (old_prs == PRS_ERROR) {
2040
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2041
+ err = 0;
2042
+ goto out;
2043
+ }
2044
+
2045
+ err = update_parent_subparts_cpumask(cs, partcmd_disable,
2046
+ NULL, &tmpmask);
2047
+ if (err)
2048
+ goto out;
2049
+
2050
+ /* Turning off CS_CPU_EXCLUSIVE will not return error */
2051
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2052
+ }
2053
+
2054
+ update_tasks_cpumask(parent);
2055
+
2056
+ if (parent->child_ecpus_count)
2057
+ update_sibling_cpumasks(parent, cs, &tmpmask);
2058
+
2059
+ rebuild_sched_domains_locked();
2060
+out:
2061
+ if (!err) {
2062
+ raw_spin_lock_irq(&callback_lock);
2063
+ cs->partition_root_state = new_prs;
2064
+ raw_spin_unlock_irq(&callback_lock);
2065
+ }
2066
+
2067
+ free_cpumasks(NULL, &tmpmask);
13612068 return err;
13622069 }
13632070
....@@ -1485,7 +2192,7 @@
14852192 goto out_unlock;
14862193
14872194 cgroup_taskset_for_each(task, css, tset) {
1488
- ret = task_can_attach(task, cs->cpus_allowed);
2195
+ ret = task_can_attach(task, cs->effective_cpus);
14892196 if (ret)
14902197 goto out_unlock;
14912198 ret = security_task_setscheduler(task);
....@@ -1507,10 +2214,8 @@
15072214 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
15082215 {
15092216 struct cgroup_subsys_state *css;
1510
- struct cpuset *cs;
15112217
15122218 cgroup_taskset_first(tset, &css);
1513
- cs = css_cs(css);
15142219
15152220 mutex_lock(&cpuset_mutex);
15162221 css_cs(css)->attach_in_progress--;
....@@ -1537,23 +2242,21 @@
15372242 cgroup_taskset_first(tset, &css);
15382243 cs = css_cs(css);
15392244
1540
- cpus_read_lock();
2245
+ lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
15412246 mutex_lock(&cpuset_mutex);
1542
-
1543
- /* prepare for attach */
1544
- if (cs == &top_cpuset)
1545
- cpumask_copy(cpus_attach, cpu_possible_mask);
1546
- else
1547
- guarantee_online_cpus(cs, cpus_attach);
15482247
15492248 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
15502249
15512250 cgroup_taskset_for_each(task, css, tset) {
2251
+ if (cs != &top_cpuset)
2252
+ guarantee_online_cpus(task, cpus_attach);
2253
+ else
2254
+ cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
15522255 /*
15532256 * can_attach beforehand should guarantee that this doesn't
15542257 * fail. TODO: have a better way to handle failure here
15552258 */
1556
- WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
2259
+ WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach));
15572260
15582261 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
15592262 cpuset_update_task_spread_flag(cs, task);
....@@ -1593,7 +2296,6 @@
15932296 wake_up(&cpuset_attach_wq);
15942297
15952298 mutex_unlock(&cpuset_mutex);
1596
- cpus_read_unlock();
15972299 }
15982300
15992301 /* The various types of files and directories in a cpuset file system */
....@@ -1604,10 +2306,12 @@
16042306 FILE_MEMLIST,
16052307 FILE_EFFECTIVE_CPULIST,
16062308 FILE_EFFECTIVE_MEMLIST,
2309
+ FILE_SUBPARTS_CPULIST,
16072310 FILE_CPU_EXCLUSIVE,
16082311 FILE_MEM_EXCLUSIVE,
16092312 FILE_MEM_HARDWALL,
16102313 FILE_SCHED_LOAD_BALANCE,
2314
+ FILE_PARTITION_ROOT,
16112315 FILE_SCHED_RELAX_DOMAIN_LEVEL,
16122316 FILE_MEMORY_PRESSURE_ENABLED,
16132317 FILE_MEMORY_PRESSURE,
....@@ -1622,6 +2326,7 @@
16222326 cpuset_filetype_t type = cft->private;
16232327 int retval = 0;
16242328
2329
+ get_online_cpus();
16252330 mutex_lock(&cpuset_mutex);
16262331 if (!is_cpuset_online(cs)) {
16272332 retval = -ENODEV;
....@@ -1659,6 +2364,7 @@
16592364 }
16602365 out_unlock:
16612366 mutex_unlock(&cpuset_mutex);
2367
+ put_online_cpus();
16622368 return retval;
16632369 }
16642370
....@@ -1669,6 +2375,7 @@
16692375 cpuset_filetype_t type = cft->private;
16702376 int retval = -ENODEV;
16712377
2378
+ get_online_cpus();
16722379 mutex_lock(&cpuset_mutex);
16732380 if (!is_cpuset_online(cs))
16742381 goto out_unlock;
....@@ -1683,6 +2390,7 @@
16832390 }
16842391 out_unlock:
16852392 mutex_unlock(&cpuset_mutex);
2393
+ put_online_cpus();
16862394 return retval;
16872395 }
16882396
....@@ -1721,6 +2429,7 @@
17212429 kernfs_break_active_protection(of->kn);
17222430 flush_work(&cpuset_hotplug_work);
17232431
2432
+ get_online_cpus();
17242433 mutex_lock(&cpuset_mutex);
17252434 if (!is_cpuset_online(cs))
17262435 goto out_unlock;
....@@ -1743,9 +2452,10 @@
17432452 break;
17442453 }
17452454
1746
- free_trial_cpuset(trialcs);
2455
+ free_cpuset(trialcs);
17472456 out_unlock:
17482457 mutex_unlock(&cpuset_mutex);
2458
+ put_online_cpus();
17492459 kernfs_unbreak_active_protection(of->kn);
17502460 css_put(&cs->css);
17512461 flush_workqueue(cpuset_migrate_mm_wq);
....@@ -1766,7 +2476,7 @@
17662476 cpuset_filetype_t type = seq_cft(sf)->private;
17672477 int ret = 0;
17682478
1769
- spin_lock_irq(&callback_lock);
2479
+ raw_spin_lock_irq(&callback_lock);
17702480
17712481 switch (type) {
17722482 case FILE_CPULIST:
....@@ -1781,11 +2491,14 @@
17812491 case FILE_EFFECTIVE_MEMLIST:
17822492 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
17832493 break;
2494
+ case FILE_SUBPARTS_CPULIST:
2495
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
2496
+ break;
17842497 default:
17852498 ret = -EINVAL;
17862499 }
17872500
1788
- spin_unlock_irq(&callback_lock);
2501
+ raw_spin_unlock_irq(&callback_lock);
17892502 return ret;
17902503 }
17912504
....@@ -1835,12 +2548,62 @@
18352548 return 0;
18362549 }
18372550
2551
+static int sched_partition_show(struct seq_file *seq, void *v)
2552
+{
2553
+ struct cpuset *cs = css_cs(seq_css(seq));
2554
+
2555
+ switch (cs->partition_root_state) {
2556
+ case PRS_ENABLED:
2557
+ seq_puts(seq, "root\n");
2558
+ break;
2559
+ case PRS_DISABLED:
2560
+ seq_puts(seq, "member\n");
2561
+ break;
2562
+ case PRS_ERROR:
2563
+ seq_puts(seq, "root invalid\n");
2564
+ break;
2565
+ }
2566
+ return 0;
2567
+}
2568
+
2569
+static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
2570
+ size_t nbytes, loff_t off)
2571
+{
2572
+ struct cpuset *cs = css_cs(of_css(of));
2573
+ int val;
2574
+ int retval = -ENODEV;
2575
+
2576
+ buf = strstrip(buf);
2577
+
2578
+ /*
2579
+ * Convert "root" to ENABLED, and convert "member" to DISABLED.
2580
+ */
2581
+ if (!strcmp(buf, "root"))
2582
+ val = PRS_ENABLED;
2583
+ else if (!strcmp(buf, "member"))
2584
+ val = PRS_DISABLED;
2585
+ else
2586
+ return -EINVAL;
2587
+
2588
+ css_get(&cs->css);
2589
+ get_online_cpus();
2590
+ mutex_lock(&cpuset_mutex);
2591
+ if (!is_cpuset_online(cs))
2592
+ goto out_unlock;
2593
+
2594
+ retval = update_prstate(cs, val);
2595
+out_unlock:
2596
+ mutex_unlock(&cpuset_mutex);
2597
+ put_online_cpus();
2598
+ css_put(&cs->css);
2599
+ return retval ?: nbytes;
2600
+}
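From userspace the partition state machine is driven through the cgroup v2 cpuset.cpus.partition file. An illustrative session (the cgroup path is an example only):

	# echo 2-3 > /sys/fs/cgroup/grp/cpuset.cpus
	# echo root > /sys/fs/cgroup/grp/cpuset.cpus.partition
	# cat /sys/fs/cgroup/grp/cpuset.cpus.partition
	root

Only "root" and "member" can be written; "root invalid" is a read-only state reported when the parent can no longer grant the requested CPUs (PRS_ERROR).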
18382601
18392602 /*
18402603 * for the common functions, 'private' gives the type of file
18412604 */
18422605
1843
-static struct cftype files[] = {
2606
+static struct cftype legacy_files[] = {
18442607 {
18452608 .name = "cpus",
18462609 .seq_show = cpuset_common_seq_show,
....@@ -1943,6 +2706,60 @@
19432706 };
19442707
19452708 /*
2709
+ * This is currently a minimal set for the default hierarchy. It can be
2710
+ * expanded later on by migrating more features and control files from v1.
2711
+ */
2712
+static struct cftype dfl_files[] = {
2713
+ {
2714
+ .name = "cpus",
2715
+ .seq_show = cpuset_common_seq_show,
2716
+ .write = cpuset_write_resmask,
2717
+ .max_write_len = (100U + 6 * NR_CPUS),
2718
+ .private = FILE_CPULIST,
2719
+ .flags = CFTYPE_NOT_ON_ROOT,
2720
+ },
2721
+
2722
+ {
2723
+ .name = "mems",
2724
+ .seq_show = cpuset_common_seq_show,
2725
+ .write = cpuset_write_resmask,
2726
+ .max_write_len = (100U + 6 * MAX_NUMNODES),
2727
+ .private = FILE_MEMLIST,
2728
+ .flags = CFTYPE_NOT_ON_ROOT,
2729
+ },
2730
+
2731
+ {
2732
+ .name = "cpus.effective",
2733
+ .seq_show = cpuset_common_seq_show,
2734
+ .private = FILE_EFFECTIVE_CPULIST,
2735
+ },
2736
+
2737
+ {
2738
+ .name = "mems.effective",
2739
+ .seq_show = cpuset_common_seq_show,
2740
+ .private = FILE_EFFECTIVE_MEMLIST,
2741
+ },
2742
+
2743
+ {
2744
+ .name = "cpus.partition",
2745
+ .seq_show = sched_partition_show,
2746
+ .write = sched_partition_write,
2747
+ .private = FILE_PARTITION_ROOT,
2748
+ .flags = CFTYPE_NOT_ON_ROOT,
2749
+ },
2750
+
2751
+ {
2752
+ .name = "cpus.subpartitions",
2753
+ .seq_show = cpuset_common_seq_show,
2754
+ .private = FILE_SUBPARTS_CPULIST,
2755
+ .flags = CFTYPE_DEBUG,
2756
+ },
2757
+
2758
+ { } /* terminate */
2759
+};
2760
+
2761
+
2762
+/*
19462763 * cpuset_css_alloc - allocate a cpuset css
19472764 * cgrp: control group that the new cpuset will be part of
19482765 */
....@@ -1958,31 +2775,19 @@
19582775 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
19592776 if (!cs)
19602777 return ERR_PTR(-ENOMEM);
1961
- if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
1962
- goto free_cs;
1963
- if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
1964
- goto free_allowed;
1965
- if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
1966
- goto free_requested;
2778
+
2779
+ if (alloc_cpumasks(cs, NULL)) {
2780
+ kfree(cs);
2781
+ return ERR_PTR(-ENOMEM);
2782
+ }
19672783
19682784 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1969
- cpumask_clear(cs->cpus_allowed);
1970
- cpumask_clear(cs->cpus_requested);
19712785 nodes_clear(cs->mems_allowed);
1972
- cpumask_clear(cs->effective_cpus);
19732786 nodes_clear(cs->effective_mems);
19742787 fmeter_init(&cs->fmeter);
19752788 cs->relax_domain_level = -1;
19762789
19772790 return &cs->css;
1978
-
1979
-free_requested:
1980
- free_cpumask_var(cs->cpus_requested);
1981
-free_allowed:
1982
- free_cpumask_var(cs->cpus_allowed);
1983
-free_cs:
1984
- kfree(cs);
1985
- return ERR_PTR(-ENOMEM);
19862791 }
19872792
19882793 static int cpuset_css_online(struct cgroup_subsys_state *css)
....@@ -1995,6 +2800,7 @@
19952800 if (!parent)
19962801 return 0;
19972802
2803
+ get_online_cpus();
19982804 mutex_lock(&cpuset_mutex);
19992805
20002806 set_bit(CS_ONLINE, &cs->flags);
....@@ -2005,12 +2811,14 @@
20052811
20062812 cpuset_inc();
20072813
2008
- spin_lock_irq(&callback_lock);
2814
+ raw_spin_lock_irq(&callback_lock);
20092815 if (is_in_v2_mode()) {
20102816 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
20112817 cs->effective_mems = parent->effective_mems;
2818
+ cs->use_parent_ecpus = true;
2819
+ parent->child_ecpus_count++;
20122820 }
2013
- spin_unlock_irq(&callback_lock);
2821
+ raw_spin_unlock_irq(&callback_lock);
20142822
20152823 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
20162824 goto out_unlock;
....@@ -2037,53 +2845,69 @@
20372845 }
20382846 rcu_read_unlock();
20392847
2040
- spin_lock_irq(&callback_lock);
2848
+ raw_spin_lock_irq(&callback_lock);
20412849 cs->mems_allowed = parent->mems_allowed;
20422850 cs->effective_mems = parent->mems_allowed;
20432851 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
20442852 cpumask_copy(cs->cpus_requested, parent->cpus_requested);
20452853 cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
2046
- spin_unlock_irq(&callback_lock);
2854
+ raw_spin_unlock_irq(&callback_lock);
20472855 out_unlock:
20482856 mutex_unlock(&cpuset_mutex);
2857
+ put_online_cpus();
20492858 return 0;
20502859 }
20512860
20522861 /*
20532862 * If the cpuset being removed has its flag 'sched_load_balance'
20542863 * enabled, then simulate turning sched_load_balance off, which
2055
- * will call rebuild_sched_domains_locked().
2864
+ * will call rebuild_sched_domains_locked(). That is not needed
2865
+ * in the default hierarchy where only changes in partition
2866
+ * will cause repartitioning.
2867
+ *
2868
+ * If the cpuset has the 'sched.partition' flag enabled, simulate
2869
+ * turning 'sched.partition' off.
20562870 */
20572871
20582872 static void cpuset_css_offline(struct cgroup_subsys_state *css)
20592873 {
20602874 struct cpuset *cs = css_cs(css);
20612875
2876
+ get_online_cpus();
20622877 mutex_lock(&cpuset_mutex);
20632878
2064
- if (is_sched_load_balance(cs))
2879
+ if (is_partition_root(cs))
2880
+ update_prstate(cs, 0);
2881
+
2882
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
2883
+ is_sched_load_balance(cs))
20652884 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2885
+
2886
+ if (cs->use_parent_ecpus) {
2887
+ struct cpuset *parent = parent_cs(cs);
2888
+
2889
+ cs->use_parent_ecpus = false;
2890
+ parent->child_ecpus_count--;
2891
+ }
20662892
20672893 cpuset_dec();
20682894 clear_bit(CS_ONLINE, &cs->flags);
20692895
20702896 mutex_unlock(&cpuset_mutex);
2897
+ put_online_cpus();
20712898 }
20722899
20732900 static void cpuset_css_free(struct cgroup_subsys_state *css)
20742901 {
20752902 struct cpuset *cs = css_cs(css);
20762903
2077
- free_cpumask_var(cs->effective_cpus);
2078
- free_cpumask_var(cs->cpus_allowed);
2079
- free_cpumask_var(cs->cpus_requested);
2080
- kfree(cs);
2904
+ free_cpuset(cs);
20812905 }
20822906
20832907 static void cpuset_bind(struct cgroup_subsys_state *root_css)
20842908 {
20852909 mutex_lock(&cpuset_mutex);
2086
- spin_lock_irq(&callback_lock);
2910
+ raw_spin_lock_irq(&callback_lock);
20872911
20882912 if (is_in_v2_mode()) {
20892913 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
....@@ -2094,7 +2918,7 @@
20942918 top_cpuset.mems_allowed = top_cpuset.effective_mems;
20952919 }
20962920
2097
- spin_unlock_irq(&callback_lock);
2921
+ raw_spin_unlock_irq(&callback_lock);
20982922 mutex_unlock(&cpuset_mutex);
20992923 }
21002924
....@@ -2105,10 +2929,13 @@
21052929 */
21062930 static void cpuset_fork(struct task_struct *task)
21072931 {
2932
+ int inherit_cpus = 0;
21082933 if (task_css_is_root(task, cpuset_cgrp_id))
21092934 return;
21102935
2111
- set_cpus_allowed_ptr(task, &current->cpus_allowed);
2936
+ trace_android_rvh_cpuset_fork(task, &inherit_cpus);
2937
+ if (!inherit_cpus)
2938
+ set_cpus_allowed_ptr(task, current->cpus_ptr);
21122939 task->mems_allowed = current->mems_allowed;
21132940 }
21142941
....@@ -2123,22 +2950,23 @@
21232950 .post_attach = cpuset_post_attach,
21242951 .bind = cpuset_bind,
21252952 .fork = cpuset_fork,
2126
- .legacy_cftypes = files,
2953
+ .legacy_cftypes = legacy_files,
2954
+ .dfl_cftypes = dfl_files,
21272955 .early_init = true,
2956
+ .threaded = true,
21282957 };
21292958
21302959 /**
21312960 * cpuset_init - initialize cpusets at system boot
21322961 *
2133
- * Description: Initialize top_cpuset and the cpuset internal file system,
2962
+ * Description: Initialize top_cpuset
21342963 **/
21352964
21362965 int __init cpuset_init(void)
21372966 {
2138
- int err = 0;
2139
-
21402967 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
21412968 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
2969
+ BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
21422970 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL));
21432971
21442972 cpumask_setall(top_cpuset.cpus_allowed);
....@@ -2150,10 +2978,6 @@
21502978 fmeter_init(&top_cpuset.fmeter);
21512979 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
21522980 top_cpuset.relax_domain_level = -1;
2153
-
2154
- err = register_filesystem(&cpuset_fs_type);
2155
- if (err < 0)
2156
- return err;
21572981
21582982 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
21592983
....@@ -2194,12 +3018,12 @@
21943018 {
21953019 bool is_empty;
21963020
2197
- spin_lock_irq(&callback_lock);
3021
+ raw_spin_lock_irq(&callback_lock);
21983022 cpumask_copy(cs->cpus_allowed, new_cpus);
21993023 cpumask_copy(cs->effective_cpus, new_cpus);
22003024 cs->mems_allowed = *new_mems;
22013025 cs->effective_mems = *new_mems;
2202
- spin_unlock_irq(&callback_lock);
3026
+ raw_spin_unlock_irq(&callback_lock);
22033027
22043028 /*
22053029 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
....@@ -2236,10 +3060,10 @@
22363060 if (nodes_empty(*new_mems))
22373061 *new_mems = parent_cs(cs)->effective_mems;
22383062
2239
- spin_lock_irq(&callback_lock);
3063
+ raw_spin_lock_irq(&callback_lock);
22403064 cpumask_copy(cs->effective_cpus, new_cpus);
22413065 cs->effective_mems = *new_mems;
2242
- spin_unlock_irq(&callback_lock);
3066
+ raw_spin_unlock_irq(&callback_lock);
22433067
22443068 if (cpus_updated)
22453069 update_tasks_cpumask(cs);
....@@ -2247,20 +3071,29 @@
22473071 update_tasks_nodemask(cs);
22483072 }
22493073
3074
+static bool force_rebuild;
3075
+
3076
+void cpuset_force_rebuild(void)
3077
+{
3078
+ force_rebuild = true;
3079
+}
3080
+
22503081 /**
22513082 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
22523083 * @cs: cpuset in interest
3084
+ * @tmp: the tmpmasks structure pointer
22533085 *
22543086 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
22553087 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
22563088 * all its tasks are moved to the nearest ancestor with both resources.
22573089 */
2258
-static void cpuset_hotplug_update_tasks(struct cpuset *cs)
3090
+static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
22593091 {
22603092 static cpumask_t new_cpus;
22613093 static nodemask_t new_mems;
22623094 bool cpus_updated;
22633095 bool mems_updated;
3096
+ struct cpuset *parent;
22643097 retry:
22653098 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
22663099
....@@ -2275,9 +3108,64 @@
22753108 goto retry;
22763109 }
22773110
2278
- cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
2279
- nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
3111
+ parent = parent_cs(cs);
3112
+ compute_effective_cpumask(&new_cpus, cs, parent);
3113
+ nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
22803114
3115
+ if (cs->nr_subparts_cpus)
3116
+ /*
3117
+ * Make sure that CPUs allocated to child partitions
3118
+ * do not show up in effective_cpus.
3119
+ */
3120
+ cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
3121
+
3122
+ if (!tmp || !cs->partition_root_state)
3123
+ goto update_tasks;
3124
+
3125
+ /*
3126
+ * In the unlikely event that a partition root has empty
3127
+ * effective_cpus or its parent becomes erroneous, we have to
3128
+ * transition it to the erroneous state.
3129
+ */
3130
+ if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
3131
+ (parent->partition_root_state == PRS_ERROR))) {
3132
+ if (cs->nr_subparts_cpus) {
3133
+ raw_spin_lock_irq(&callback_lock);
3134
+ cs->nr_subparts_cpus = 0;
3135
+ cpumask_clear(cs->subparts_cpus);
3136
+ raw_spin_unlock_irq(&callback_lock);
3137
+ compute_effective_cpumask(&new_cpus, cs, parent);
3138
+ }
3139
+
3140
+ /*
3141
+ * If the effective_cpus is empty because the child
3142
+ * partitions take away all the CPUs, we can keep
3143
+ * the current partition and let the child partitions
3144
+ * fight for available CPUs.
3145
+ */
3146
+ if ((parent->partition_root_state == PRS_ERROR) ||
3147
+ cpumask_empty(&new_cpus)) {
3148
+ update_parent_subparts_cpumask(cs, partcmd_disable,
3149
+ NULL, tmp);
3150
+ raw_spin_lock_irq(&callback_lock);
3151
+ cs->partition_root_state = PRS_ERROR;
3152
+ raw_spin_unlock_irq(&callback_lock);
3153
+ }
3154
+ cpuset_force_rebuild();
3155
+ }
3156
+
3157
+ /*
3158
+ * On the other hand, an erroneous partition root may be transitioned
3159
+ * back to a regular one or a partition root with no CPU allocated
3160
+ * from the parent may change to erroneous.
3161
+ */
3162
+ if (is_partition_root(parent) &&
3163
+ ((cs->partition_root_state == PRS_ERROR) ||
3164
+ !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
3165
+ update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
3166
+ cpuset_force_rebuild();
3167
+
3168
+update_tasks:
22813169 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
22823170 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
22833171
....@@ -2289,13 +3177,6 @@
22893177 cpus_updated, mems_updated);
22903178
22913179 mutex_unlock(&cpuset_mutex);
2292
-}
2293
-
2294
-static bool force_rebuild;
2295
-
2296
-void cpuset_force_rebuild(void)
2297
-{
2298
- force_rebuild = true;
22993180 }
23003181
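The partition handling in cpuset_hotplug_update_tasks() above boils down to a small state machine: a valid partition root whose effective CPUs vanish, or whose parent has become erroneous, drops to PRS_ERROR, while an erroneous root can be brought back once its parent is a valid partition root and can hand it CPUs again. The stand-alone sketch below models only that decision with plain booleans instead of cpumasks; it is an illustration of the logic in this hunk, not kernel code, and simplifies the role of update_parent_subparts_cpumask().

#include <stdio.h>

/* Mirrors the PRS_* values used by the patch. */
enum prs { PRS_DISABLED = 0, PRS_ENABLED = 1, PRS_ERROR = -1 };

/*
 * Simplified model of the hotplug decision: compute the next
 * partition_root_state of a cpuset from a few pre-computed conditions.
 */
static enum prs next_state(enum prs cs_state, enum prs parent_state,
			   int new_cpus_empty, int parent_gives_cpus)
{
	if (cs_state == PRS_ENABLED &&
	    (new_cpus_empty || parent_state == PRS_ERROR))
		return PRS_ERROR;	/* demoted; sched domains get rebuilt */

	if (cs_state == PRS_ERROR &&
	    parent_state == PRS_ENABLED && parent_gives_cpus)
		return PRS_ENABLED;	/* restored once the parent has CPUs */

	return cs_state;
}

int main(void)
{
	printf("%d\n", next_state(PRS_ENABLED, PRS_ENABLED, 1, 0));	/* -1 */
	printf("%d\n", next_state(PRS_ERROR, PRS_ENABLED, 0, 1));	/*  1 */
	return 0;
}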
23013182 /**
....@@ -2314,12 +3195,16 @@
23143195 * Note that CPU offlining during suspend is ignored. We don't modify
23153196 * cpusets across suspend/resume cycles at all.
23163197 */
2317
-static void cpuset_hotplug_workfn(struct work_struct *work)
3198
+void cpuset_hotplug_workfn(struct work_struct *work)
23183199 {
23193200 static cpumask_t new_cpus;
23203201 static nodemask_t new_mems;
23213202 bool cpus_updated, mems_updated;
23223203 bool on_dfl = is_in_v2_mode();
3204
+ struct tmpmasks tmp, *ptmp = NULL;
3205
+
3206
+ if (on_dfl && !alloc_cpumasks(NULL, &tmp))
3207
+ ptmp = &tmp;
23233208
23243209 mutex_lock(&cpuset_mutex);
23253210
....@@ -2327,26 +3212,54 @@
23273212 cpumask_copy(&new_cpus, cpu_active_mask);
23283213 new_mems = node_states[N_MEMORY];
23293214
3215
+ /*
3216
+ * If subparts_cpus is populated, it is likely that the check below
3217
+ * will produce a false positive on cpus_updated when the cpu list
3218
+ * isn't changed. It is extra work, but it is better to be safe.
3219
+ */
23303220 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
23313221 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
23323222
3223
+ /*
3224
+ * In the rare case that hotplug removes all the cpus in subparts_cpus,
3225
+ * we assume that cpus are updated.
3226
+ */
3227
+ if (!cpus_updated && top_cpuset.nr_subparts_cpus)
3228
+ cpus_updated = true;
3229
+
23333230 /* synchronize cpus_allowed to cpu_active_mask */
23343231 if (cpus_updated) {
2335
- spin_lock_irq(&callback_lock);
3232
+ raw_spin_lock_irq(&callback_lock);
23363233 if (!on_dfl)
23373234 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
3235
+ /*
3236
+ * Make sure that CPUs allocated to child partitions
3237
+ * do not show up in effective_cpus. If no CPU is left,
3238
+ * we clear the subparts_cpus & let the child partitions
3239
+ * fight for the CPUs again.
3240
+ */
3241
+ if (top_cpuset.nr_subparts_cpus) {
3242
+ if (cpumask_subset(&new_cpus,
3243
+ top_cpuset.subparts_cpus)) {
3244
+ top_cpuset.nr_subparts_cpus = 0;
3245
+ cpumask_clear(top_cpuset.subparts_cpus);
3246
+ } else {
3247
+ cpumask_andnot(&new_cpus, &new_cpus,
3248
+ top_cpuset.subparts_cpus);
3249
+ }
3250
+ }
23383251 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2339
- spin_unlock_irq(&callback_lock);
3252
+ raw_spin_unlock_irq(&callback_lock);
23403253 /* we don't mess with cpumasks of tasks in top_cpuset */
23413254 }
23423255
23433256 /* synchronize mems_allowed to N_MEMORY */
23443257 if (mems_updated) {
2345
- spin_lock_irq(&callback_lock);
3258
+ raw_spin_lock_irq(&callback_lock);
23463259 if (!on_dfl)
23473260 top_cpuset.mems_allowed = new_mems;
23483261 top_cpuset.effective_mems = new_mems;
2349
- spin_unlock_irq(&callback_lock);
3262
+ raw_spin_unlock_irq(&callback_lock);
23503263 update_tasks_nodemask(&top_cpuset);
23513264 }
23523265
....@@ -2363,7 +3276,7 @@
23633276 continue;
23643277 rcu_read_unlock();
23653278
2366
- cpuset_hotplug_update_tasks(cs);
3279
+ cpuset_hotplug_update_tasks(cs, ptmp);
23673280
23683281 rcu_read_lock();
23693282 css_put(&cs->css);
....@@ -2376,6 +3289,8 @@
23763289 force_rebuild = false;
23773290 rebuild_sched_domains();
23783291 }
3292
+
3293
+ free_cpumasks(NULL, ptmp);
23793294 }
23803295
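At the top_cpuset level the hotplug work above amounts to: new effective CPUs = active CPUs minus whatever has been handed to sub-partitions, except when hotplug has removed every remaining CPU, in which case subparts_cpus is cleared and the child partitions must compete for CPUs again. The fragment below restates that computation with a plain 64-bit mask standing in for a cpumask; it is a sketch for illustration, not the kernel's cpumask API.

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the top_cpuset recomputation in cpuset_hotplug_workfn(),
 * with one bit per CPU in a uint64_t.
 */
static uint64_t top_effective(uint64_t active, uint64_t *subparts)
{
	if (*subparts) {
		if ((active & ~*subparts) == 0)
			*subparts = 0;		/* all CPUs lost: take them back */
		else
			active &= ~*subparts;	/* hide CPUs owned by partitions */
	}
	return active;
}

int main(void)
{
	uint64_t subparts = 0x0c;	/* CPUs 2-3 granted to a partition */

	printf("0x%llx\n", (unsigned long long)top_effective(0xff, &subparts)); /* 0xf3 */
	subparts = 0x0f;
	printf("0x%llx\n", (unsigned long long)top_effective(0x03, &subparts)); /* 0x03 */
	return 0;
}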
23813296 void cpuset_update_active_cpus(void)
....@@ -2386,6 +3301,11 @@
23863301 * to a work item to avoid reverse locking order.
23873302 */
23883303 schedule_work(&cpuset_hotplug_work);
3304
+}
3305
+
3306
+void cpuset_update_active_cpus_affine(int cpu)
3307
+{
3308
+ schedule_work_on(cpu, &cpuset_hotplug_work);
23893309 }
23903310
23913311 void cpuset_wait_for_hotplug(void)
....@@ -2417,8 +3337,11 @@
24173337 */
24183338 void __init cpuset_init_smp(void)
24193339 {
2420
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2421
- top_cpuset.mems_allowed = node_states[N_MEMORY];
3340
+ /*
3341
+ * cpus_allowed/mems_allowed set to v2 values in the initial
3342
+ * cpuset_bind() call will be reset to v1 values in another
3343
+ * cpuset_bind() call when v1 cpuset is mounted.
3344
+ */
24223345 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
24233346
24243347 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
....@@ -2445,13 +3368,13 @@
24453368 {
24463369 unsigned long flags;
24473370
2448
- spin_lock_irqsave(&callback_lock, flags);
3371
+ raw_spin_lock_irqsave(&callback_lock, flags);
24493372 rcu_read_lock();
2450
- guarantee_online_cpus(task_cs(tsk), pmask);
3373
+ guarantee_online_cpus(tsk, pmask);
24513374 rcu_read_unlock();
2452
- spin_unlock_irqrestore(&callback_lock, flags);
3375
+ raw_spin_unlock_irqrestore(&callback_lock, flags);
24533376 }
2454
-
3377
+EXPORT_SYMBOL_GPL(cpuset_cpus_allowed);
24553378 /**
24563379 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
24573380 * @tsk: pointer to task_struct with which the scheduler is struggling
....@@ -2466,9 +3389,17 @@
24663389
24673390 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
24683391 {
3392
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
3393
+ const struct cpumask *cs_mask;
3394
+
24693395 rcu_read_lock();
2470
- do_set_cpus_allowed(tsk, is_in_v2_mode() ?
2471
- task_cs(tsk)->cpus_allowed : cpu_possible_mask);
3396
+ cs_mask = task_cs(tsk)->cpus_allowed;
3397
+
3398
+ if (!is_in_v2_mode() || !cpumask_subset(cs_mask, possible_mask))
3399
+ goto unlock; /* select_fallback_rq will try harder */
3400
+
3401
+ do_set_cpus_allowed(tsk, cs_mask);
3402
+unlock:
24723403 rcu_read_unlock();
24733404
24743405 /*
....@@ -2510,11 +3441,11 @@
25103441 nodemask_t mask;
25113442 unsigned long flags;
25123443
2513
- spin_lock_irqsave(&callback_lock, flags);
3444
+ raw_spin_lock_irqsave(&callback_lock, flags);
25143445 rcu_read_lock();
25153446 guarantee_online_mems(task_cs(tsk), &mask);
25163447 rcu_read_unlock();
2517
- spin_unlock_irqrestore(&callback_lock, flags);
3448
+ raw_spin_unlock_irqrestore(&callback_lock, flags);
25183449
25193450 return mask;
25203451 }
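cpuset_cpus_allowed() (now exported) and cpuset_mems_allowed() above report a task's allowed CPUs and memory nodes under callback_lock. The userspace view of the same information is the Cpus_allowed_list and Mems_allowed_list fields of /proc/<pid>/status; the short reader below is a minimal illustration of that interface.

/* Print the Cpus_allowed_list / Mems_allowed_list lines for the current
 * task: the userspace counterpart of what cpuset_cpus_allowed() and
 * cpuset_mems_allowed() compute in the kernel. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "Cpus_allowed_list:", 18) ||
		    !strncmp(line, "Mems_allowed_list:", 18))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}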
....@@ -2606,14 +3537,14 @@
26063537 return true;
26073538
26083539 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2609
- spin_lock_irqsave(&callback_lock, flags);
3540
+ raw_spin_lock_irqsave(&callback_lock, flags);
26103541
26113542 rcu_read_lock();
26123543 cs = nearest_hardwall_ancestor(task_cs(current));
26133544 allowed = node_isset(node, cs->mems_allowed);
26143545 rcu_read_unlock();
26153546
2616
- spin_unlock_irqrestore(&callback_lock, flags);
3547
+ raw_spin_unlock_irqrestore(&callback_lock, flags);
26173548 return allowed;
26183549 }
26193550
....@@ -2699,9 +3630,9 @@
26993630 rcu_read_lock();
27003631
27013632 cgrp = task_cs(current)->css.cgroup;
2702
- pr_info("%s cpuset=", current->comm);
3633
+ pr_cont(",cpuset=");
27033634 pr_cont_cgroup_name(cgrp);
2704
- pr_cont(" mems_allowed=%*pbl\n",
3635
+ pr_cont(",mems_allowed=%*pbl",
27053636 nodemask_pr_args(&current->mems_allowed));
27063637
27073638 rcu_read_unlock();