2024-05-16 8d2a02b24d66aa359e83eebc1ed3c0f85367a1cb
kernel/kernel/cgroup/cpuset.c
....@@ -33,17 +33,20 @@
3333 #include <linux/interrupt.h>
3434 #include <linux/kernel.h>
3535 #include <linux/kmod.h>
36
+#include <linux/kthread.h>
3637 #include <linux/list.h>
3738 #include <linux/mempolicy.h>
3839 #include <linux/mm.h>
3940 #include <linux/memory.h>
4041 #include <linux/export.h>
4142 #include <linux/mount.h>
43
+#include <linux/fs_context.h>
4244 #include <linux/namei.h>
4345 #include <linux/pagemap.h>
4446 #include <linux/proc_fs.h>
4547 #include <linux/rcupdate.h>
4648 #include <linux/sched.h>
49
+#include <linux/sched/deadline.h>
4750 #include <linux/sched/mm.h>
4851 #include <linux/sched/task.h>
4952 #include <linux/seq_file.h>
....@@ -63,6 +66,9 @@
6366 #include <linux/mutex.h>
6467 #include <linux/cgroup.h>
6568 #include <linux/wait.h>
69
+
70
+#include <trace/hooks/sched.h>
71
+#include <trace/hooks/cgroup.h>
6672
6773 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
6874 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
....@@ -111,6 +117,16 @@
111117 nodemask_t effective_mems;
112118
113119 /*
120
+ * CPUs allocated to child sub-partitions (default hierarchy only)
121
+ * - CPUs granted by the parent = effective_cpus U subparts_cpus
122
+ * - effective_cpus and subparts_cpus are mutually exclusive.
123
+ *
124
+ * effective_cpus contains only onlined CPUs, but subparts_cpus
125
+ * may have offlined ones.
126
+ */
127
+ cpumask_var_t subparts_cpus;
128
+
129
+ /*
114130 * This is old Memory Nodes tasks took on.
115131 *
116132 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
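/*
 * Illustrative sketch, not part of the patch: the rule stated in the
 * subparts_cpus comment in the hunk above (effective_cpus and
 * subparts_cpus are mutually exclusive) could be spelled out as a debug
 * helper.  The helper name is an assumption made for this example only.
 */
static inline void cpuset_check_subparts(struct cpuset *cs)
{
	/* effective_cpus and subparts_cpus must never overlap */
	WARN_ON_ONCE(cpumask_intersects(cs->effective_cpus,
					cs->subparts_cpus));
}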
....@@ -135,6 +151,55 @@
135151
136152 /* for custom sched domain */
137153 int relax_domain_level;
154
+
155
+ /* number of CPUs in subparts_cpus */
156
+ int nr_subparts_cpus;
157
+
158
+ /* partition root state */
159
+ int partition_root_state;
160
+
161
+ /*
162
+ * Default hierarchy only:
163
+ * use_parent_ecpus - set if using parent's effective_cpus
164
+ * child_ecpus_count - # of children with use_parent_ecpus set
165
+ */
166
+ int use_parent_ecpus;
167
+ int child_ecpus_count;
168
+
169
+ /*
170
+ * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
171
+ * know when to rebuild associated root domain bandwidth information.
172
+ */
173
+ int nr_deadline_tasks;
174
+ int nr_migrate_dl_tasks;
175
+ u64 sum_migrate_dl_bw;
176
+};
177
+
178
+/*
179
+ * Partition root states:
180
+ *
181
+ * 0 - not a partition root
182
+ *
183
+ * 1 - partition root
184
+ *
185
+ * -1 - invalid partition root
186
+ * None of the cpus in cpus_allowed can be put into the parent's
187
+ * subparts_cpus. In this case, the cpuset is not a real partition
188
+ * root anymore. However, the CPU_EXCLUSIVE bit will still be set
189
+ * and the cpuset can be restored back to a partition root if the
190
+ * parent cpuset can give more CPUs back to this child cpuset.
191
+ */
192
+#define PRS_DISABLED 0
193
+#define PRS_ENABLED 1
194
+#define PRS_ERROR -1
195
+
196
+/*
197
+ * Temporary cpumasks for working with partitions. They are passed among
198
+ * functions to avoid memory allocation in inner functions.
199
+ */
200
+struct tmpmasks {
201
+ cpumask_var_t addmask, delmask; /* For partition root */
202
+ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
138203 };
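/*
 * Sketch, not part of the patch: how the three partition root states above
 * correspond to the strings later reported by the cpuset.cpus.partition
 * file (see sched_partition_show() further down).  prs_to_string() is a
 * hypothetical helper used only for illustration.
 */
static const char *prs_to_string(int prs)
{
	switch (prs) {
	case PRS_ENABLED:		/*  1 */
		return "root";
	case PRS_ERROR:			/* -1 */
		return "root invalid";
	case PRS_DISABLED:		/*  0 */
	default:
		return "member";
	}
}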
139204
140205 static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
....@@ -153,18 +218,19 @@
153218 return css_cs(cs->css.parent);
154219 }
155220
156
-#ifdef CONFIG_NUMA
157
-static inline bool task_has_mempolicy(struct task_struct *task)
221
+void inc_dl_tasks_cs(struct task_struct *p)
158222 {
159
- return task->mempolicy;
160
-}
161
-#else
162
-static inline bool task_has_mempolicy(struct task_struct *task)
163
-{
164
- return false;
165
-}
166
-#endif
223
+ struct cpuset *cs = task_cs(p);
167224
225
+ cs->nr_deadline_tasks++;
226
+}
227
+
228
+void dec_dl_tasks_cs(struct task_struct *p)
229
+{
230
+ struct cpuset *cs = task_cs(p);
231
+
232
+ cs->nr_deadline_tasks--;
233
+}
168234
169235 /* bits in struct cpuset flags field */
170236 typedef enum {
....@@ -219,9 +285,15 @@
219285 return test_bit(CS_SPREAD_SLAB, &cs->flags);
220286 }
221287
288
+static inline int is_partition_root(const struct cpuset *cs)
289
+{
290
+ return cs->partition_root_state > 0;
291
+}
292
+
222293 static struct cpuset top_cpuset = {
223294 .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
224295 (1 << CS_MEM_EXCLUSIVE)),
296
+ .partition_root_state = PRS_ENABLED,
225297 };
226298
227299 /**
....@@ -289,21 +361,36 @@
289361 */
290362
291363 static DEFINE_MUTEX(cpuset_mutex);
292
-static DEFINE_RAW_SPINLOCK(callback_lock);
364
+
365
+void cpuset_lock(void)
366
+{
367
+ mutex_lock(&cpuset_mutex);
368
+}
369
+
370
+void cpuset_unlock(void)
371
+{
372
+ mutex_unlock(&cpuset_mutex);
373
+}
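/*
 * Sketch, not part of the patch: cpuset_lock()/cpuset_unlock() simply wrap
 * cpuset_mutex so that code outside this file, such as the deadline
 * scheduling class that also calls inc_dl_tasks_cs()/dec_dl_tasks_cs()
 * when a task enters or leaves SCHED_DEADLINE, can serialize against
 * cpuset changes.  The caller below is hypothetical.
 */
static void example_cpuset_locked_section(void)
{
	cpuset_lock();
	/*
	 * Per-cpuset state such as nr_deadline_tasks (updated via
	 * inc_dl_tasks_cs()/dec_dl_tasks_cs()) is stable while the
	 * mutex is held.
	 */
	cpuset_unlock();
}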
374
+
375
+static DEFINE_SPINLOCK(callback_lock);
293376
294377 static struct workqueue_struct *cpuset_migrate_mm_wq;
295378
296379 /*
297
- * CPU / memory hotplug is handled asynchronously.
380
+ * CPU / memory hotplug is handled asynchronously
381
+ * for hotplug, synchronously for resume_cpus
298382 */
299
-static void cpuset_hotplug_workfn(struct work_struct *work);
300383 static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
301384
302385 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
303386
304387 /*
305
- * Cgroup v2 behavior is used when on default hierarchy or the
306
- * cgroup_v2_mode flag is set.
388
+ * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
389
+ * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
390
+ * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
391
+ * With v2 behavior, "cpus" and "mems" are always what the users have
392
+ * requested and won't be changed by hotplug events. Only the effective
393
+ * cpus or mems will be affected.
307394 */
308395 static inline bool is_in_v2_mode(void)
309396 {
....@@ -312,58 +399,45 @@
312399 }
313400
314401 /*
315
- * This is ugly, but preserves the userspace API for existing cpuset
316
- * users. If someone tries to mount the "cpuset" filesystem, we
317
- * silently switch it to mount "cgroup" instead
318
- */
319
-static struct dentry *cpuset_mount(struct file_system_type *fs_type,
320
- int flags, const char *unused_dev_name, void *data)
321
-{
322
- struct file_system_type *cgroup_fs = get_fs_type("cgroup");
323
- struct dentry *ret = ERR_PTR(-ENODEV);
324
- if (cgroup_fs) {
325
- char mountopts[] =
326
- "cpuset,noprefix,"
327
- "release_agent=/sbin/cpuset_release_agent";
328
- ret = cgroup_fs->mount(cgroup_fs, flags,
329
- unused_dev_name, mountopts);
330
- put_filesystem(cgroup_fs);
331
- }
332
- return ret;
333
-}
334
-
335
-static struct file_system_type cpuset_fs_type = {
336
- .name = "cpuset",
337
- .mount = cpuset_mount,
338
-};
339
-
340
-/*
341
- * Return in pmask the portion of a cpusets's cpus_allowed that
342
- * are online. If none are online, walk up the cpuset hierarchy
343
- * until we find one that does have some online cpus.
402
+ * Return in pmask the portion of a task's cpusets's cpus_allowed that
403
+ * are online and are capable of running the task. If none are found,
404
+ * walk up the cpuset hierarchy until we find one that does have some
405
+ * appropriate cpus.
344406 *
345407 * One way or another, we guarantee to return some non-empty subset
346
- * of cpu_online_mask.
408
+ * of cpu_active_mask.
347409 *
348410 * Call with callback_lock or cpuset_mutex held.
349411 */
350
-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
412
+static void guarantee_online_cpus(struct task_struct *tsk,
413
+ struct cpumask *pmask)
351414 {
352
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
415
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
416
+ struct cpuset *cs;
417
+
418
+ if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
419
+ cpumask_copy(pmask, cpu_active_mask);
420
+
421
+ rcu_read_lock();
422
+ cs = task_cs(tsk);
423
+
424
+ while (!cpumask_intersects(cs->effective_cpus, pmask)) {
353425 cs = parent_cs(cs);
354426 if (unlikely(!cs)) {
355427 /*
356428 * The top cpuset doesn't have any online cpu as a
357429 * consequence of a race between cpuset_hotplug_work
358430 * and cpu hotplug notifier. But we know the top
359
- * cpuset's effective_cpus is on its way to to be
431
+ * cpuset's effective_cpus is on its way to be
360432 * identical to cpu_online_mask.
361433 */
362
- cpumask_copy(pmask, cpu_online_mask);
363
- return;
434
+ goto out_unlock;
364435 }
365436 }
366
- cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
437
+ cpumask_and(pmask, pmask, cs->effective_cpus);
438
+
439
+out_unlock:
440
+ rcu_read_unlock();
367441 }
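/*
 * Note, not part of the patch: task_cpu_possible_mask(tsk) is the set of
 * CPUs that can physically run @tsk (it differs from cpu_possible_mask on
 * asymmetric systems, e.g. for 32-bit tasks on arm64 cores without 32-bit
 * support).  Starting the walk from that mask instead of cpu_online_mask
 * is why guarantee_online_cpus() now takes the task rather than a cpuset.
 */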
368442
369443 /*
....@@ -420,6 +494,71 @@
420494 }
421495
422496 /**
497
+ * alloc_cpumasks - allocate three cpumasks for cpuset
498
+ * @cs: the cpuset that have cpumasks to be allocated.
499
+ * @tmp: the tmpmasks structure pointer
500
+ * Return: 0 if successful, -ENOMEM otherwise.
501
+ *
502
+ * Only one of the two input arguments should be non-NULL.
503
+ */
504
+static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
505
+{
506
+ cpumask_var_t *pmask1, *pmask2, *pmask3;
507
+
508
+ if (cs) {
509
+ pmask1 = &cs->cpus_allowed;
510
+ pmask2 = &cs->effective_cpus;
511
+ pmask3 = &cs->subparts_cpus;
512
+ } else {
513
+ pmask1 = &tmp->new_cpus;
514
+ pmask2 = &tmp->addmask;
515
+ pmask3 = &tmp->delmask;
516
+ }
517
+
518
+ if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
519
+ return -ENOMEM;
520
+
521
+ if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
522
+ goto free_one;
523
+
524
+ if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
525
+ goto free_two;
526
+
527
+ if (cs && !zalloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
528
+ goto free_three;
529
+
530
+ return 0;
531
+
532
+free_three:
533
+ free_cpumask_var(*pmask3);
534
+free_two:
535
+ free_cpumask_var(*pmask2);
536
+free_one:
537
+ free_cpumask_var(*pmask1);
538
+ return -ENOMEM;
539
+}
540
+
541
+/**
542
+ * free_cpumasks - free cpumasks in a cpuset or a tmpmasks structure
543
+ * @cs: the cpuset that has cpumasks to be freed.
544
+ * @tmp: the tmpmasks structure pointer
545
+ */
546
+static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
547
+{
548
+ if (cs) {
549
+ free_cpumask_var(cs->cpus_allowed);
550
+ free_cpumask_var(cs->cpus_requested);
551
+ free_cpumask_var(cs->effective_cpus);
552
+ free_cpumask_var(cs->subparts_cpus);
553
+ }
554
+ if (tmp) {
555
+ free_cpumask_var(tmp->new_cpus);
556
+ free_cpumask_var(tmp->addmask);
557
+ free_cpumask_var(tmp->delmask);
558
+ }
559
+}
560
+
561
+/**
423562 * alloc_trial_cpuset - allocate a trial cpuset
424563 * @cs: the cpuset that the trial cpuset duplicates
425564 */
....@@ -431,37 +570,25 @@
431570 if (!trial)
432571 return NULL;
433572
434
- if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
435
- goto free_cs;
436
- if (!alloc_cpumask_var(&trial->cpus_requested, GFP_KERNEL))
437
- goto free_allowed;
438
- if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
439
- goto free_cpus;
573
+ if (alloc_cpumasks(trial, NULL)) {
574
+ kfree(trial);
575
+ return NULL;
576
+ }
440577
441578 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
442579 cpumask_copy(trial->cpus_requested, cs->cpus_requested);
443580 cpumask_copy(trial->effective_cpus, cs->effective_cpus);
444581 return trial;
445
-
446
-free_cpus:
447
- free_cpumask_var(trial->cpus_requested);
448
-free_allowed:
449
- free_cpumask_var(trial->cpus_allowed);
450
-free_cs:
451
- kfree(trial);
452
- return NULL;
453582 }
454583
455584 /**
456
- * free_trial_cpuset - free the trial cpuset
457
- * @trial: the trial cpuset to be freed
585
+ * free_cpuset - free the cpuset
586
+ * @cs: the cpuset to be freed
458587 */
459
-static void free_trial_cpuset(struct cpuset *trial)
588
+static inline void free_cpuset(struct cpuset *cs)
460589 {
461
- free_cpumask_var(trial->effective_cpus);
462
- free_cpumask_var(trial->cpus_requested);
463
- free_cpumask_var(trial->cpus_allowed);
464
- kfree(trial);
590
+ free_cpumasks(cs, NULL);
591
+ kfree(cs);
465592 }
466593
467594 /*
....@@ -612,7 +739,7 @@
612739 * load balancing domains (sched domains) as specified by that partial
613740 * partition.
614741 *
615
- * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt
742
+ * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
616743 * for a background explanation of this.
617744 *
618745 * Does not return errors, on the theory that the callers of this
....@@ -623,11 +750,10 @@
623750 * Must be called with cpuset_mutex held.
624751 *
625752 * The three key local variables below are:
626
- * q - a linked-list queue of cpuset pointers, used to implement a
627
- * top-down scan of all cpusets. This scan loads a pointer
628
- * to each cpuset marked is_sched_load_balance into the
629
- * array 'csa'. For our purposes, rebuilding the schedulers
630
- * sched domains, we can ignore !is_sched_load_balance cpusets.
753
+ * cp - cpuset pointer, used (together with pos_css) to perform a
754
+ * top-down scan of all cpusets. For our purposes, rebuilding
755
+ * the schedulers sched domains, we can ignore !is_sched_load_
756
+ * balance cpusets.
631757 * csa - (for CpuSet Array) Array of pointers to all the cpusets
632758 * that need to be load balanced, for convenient iterative
633759 * access by the subsequent code that finds the best partition,
....@@ -658,7 +784,7 @@
658784 static int generate_sched_domains(cpumask_var_t **domains,
659785 struct sched_domain_attr **attributes)
660786 {
661
- struct cpuset *cp; /* scans q */
787
+ struct cpuset *cp; /* top-down scan of cpusets */
662788 struct cpuset **csa; /* array of all cpuset ptrs */
663789 int csn; /* how many cpuset ptrs in csa so far */
664790 int i, j, k; /* indices for partition finding loops */
....@@ -667,13 +793,14 @@
667793 int ndoms = 0; /* number of sched domains in result */
668794 int nslot; /* next empty doms[] struct cpumask slot */
669795 struct cgroup_subsys_state *pos_css;
796
+ bool root_load_balance = is_sched_load_balance(&top_cpuset);
670797
671798 doms = NULL;
672799 dattr = NULL;
673800 csa = NULL;
674801
675802 /* Special case for the 99% of systems with one, full, sched domain */
676
- if (is_sched_load_balance(&top_cpuset)) {
803
+ if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
677804 ndoms = 1;
678805 doms = alloc_sched_domains(ndoms);
679806 if (!doms)
....@@ -696,6 +823,8 @@
696823 csn = 0;
697824
698825 rcu_read_lock();
826
+ if (root_load_balance)
827
+ csa[csn++] = &top_cpuset;
699828 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
700829 if (cp == &top_cpuset)
701830 continue;
....@@ -706,6 +835,9 @@
706835 * parent's cpus, so just skip them, and then we call
707836 * update_domain_attr_tree() to calc relax_domain_level of
708837 * the corresponding sched domain.
838
+ *
839
+ * If root is load-balancing, we can skip @cp if it
840
+ * is a subset of the root's effective_cpus.
709841 */
710842 if (!cpumask_empty(cp->cpus_allowed) &&
711843 !(is_sched_load_balance(cp) &&
....@@ -713,11 +845,17 @@
713845 housekeeping_cpumask(HK_FLAG_DOMAIN))))
714846 continue;
715847
716
- if (is_sched_load_balance(cp))
848
+ if (root_load_balance &&
849
+ cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
850
+ continue;
851
+
852
+ if (is_sched_load_balance(cp) &&
853
+ !cpumask_empty(cp->effective_cpus))
717854 csa[csn++] = cp;
718855
719
- /* skip @cp's subtree */
720
- pos_css = css_rightmost_descendant(pos_css);
856
+ /* skip @cp's subtree if not a partition root */
857
+ if (!is_partition_root(cp))
858
+ pos_css = css_rightmost_descendant(pos_css);
721859 }
722860 rcu_read_unlock();
723861
....@@ -820,6 +958,68 @@
820958 return ndoms;
821959 }
822960
961
+static void dl_update_tasks_root_domain(struct cpuset *cs)
962
+{
963
+ struct css_task_iter it;
964
+ struct task_struct *task;
965
+
966
+ if (cs->nr_deadline_tasks == 0)
967
+ return;
968
+
969
+ css_task_iter_start(&cs->css, 0, &it);
970
+
971
+ while ((task = css_task_iter_next(&it)))
972
+ dl_add_task_root_domain(task);
973
+
974
+ css_task_iter_end(&it);
975
+}
976
+
977
+static void dl_rebuild_rd_accounting(void)
978
+{
979
+ struct cpuset *cs = NULL;
980
+ struct cgroup_subsys_state *pos_css;
981
+
982
+ lockdep_assert_held(&cpuset_mutex);
983
+ lockdep_assert_cpus_held();
984
+ lockdep_assert_held(&sched_domains_mutex);
985
+
986
+ rcu_read_lock();
987
+
988
+ /*
989
+ * Clear default root domain DL accounting, it will be computed again
990
+ * if a task belongs to it.
991
+ */
992
+ dl_clear_root_domain(&def_root_domain);
993
+
994
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
995
+
996
+ if (cpumask_empty(cs->effective_cpus)) {
997
+ pos_css = css_rightmost_descendant(pos_css);
998
+ continue;
999
+ }
1000
+
1001
+ css_get(&cs->css);
1002
+
1003
+ rcu_read_unlock();
1004
+
1005
+ dl_update_tasks_root_domain(cs);
1006
+
1007
+ rcu_read_lock();
1008
+ css_put(&cs->css);
1009
+ }
1010
+ rcu_read_unlock();
1011
+}
1012
+
1013
+static void
1014
+partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1015
+ struct sched_domain_attr *dattr_new)
1016
+{
1017
+ mutex_lock(&sched_domains_mutex);
1018
+ partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
1019
+ dl_rebuild_rd_accounting();
1020
+ mutex_unlock(&sched_domains_mutex);
1021
+}
1022
+
8231023 /*
8241024 * Rebuild scheduler domains.
8251025 *
....@@ -833,28 +1033,53 @@
8331033 */
8341034 static void rebuild_sched_domains_locked(void)
8351035 {
1036
+ struct cgroup_subsys_state *pos_css;
8361037 struct sched_domain_attr *attr;
8371038 cpumask_var_t *doms;
1039
+ struct cpuset *cs;
8381040 int ndoms;
8391041
8401042 lockdep_assert_held(&cpuset_mutex);
841
- get_online_cpus();
8421043
8431044 /*
844
- * We have raced with CPU hotplug. Don't do anything to avoid
1045
+ * If we have raced with CPU hotplug, return early to avoid
8451046 * passing doms with offlined cpu to partition_sched_domains().
846
- * Anyways, hotplug work item will rebuild sched domains.
1047
+ * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
1048
+ *
1049
+ * With no CPUs in any subpartitions, top_cpuset's effective CPUs
1050
+ * should be the same as the active CPUs, so checking only top_cpuset
1051
+ * is enough to detect racing CPU offlines.
8471052 */
848
- if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
849
- goto out;
1053
+ if (!top_cpuset.nr_subparts_cpus &&
1054
+ !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
1055
+ return;
1056
+
1057
+ /*
1058
+ * With subpartition CPUs, however, the effective CPUs of a partition
1059
+ * root should be only a subset of the active CPUs. Since a CPU in any
1060
+ * partition root could be offlined, all must be checked.
1061
+ */
1062
+ if (top_cpuset.nr_subparts_cpus) {
1063
+ rcu_read_lock();
1064
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
1065
+ if (!is_partition_root(cs)) {
1066
+ pos_css = css_rightmost_descendant(pos_css);
1067
+ continue;
1068
+ }
1069
+ if (!cpumask_subset(cs->effective_cpus,
1070
+ cpu_active_mask)) {
1071
+ rcu_read_unlock();
1072
+ return;
1073
+ }
1074
+ }
1075
+ rcu_read_unlock();
1076
+ }
8501077
8511078 /* Generate domain masks and attrs */
8521079 ndoms = generate_sched_domains(&doms, &attr);
8531080
8541081 /* Have scheduler rebuild the domains */
855
- partition_sched_domains(ndoms, doms, attr);
856
-out:
857
- put_online_cpus();
1082
+ partition_and_rebuild_sched_domains(ndoms, doms, attr);
8581083 }
8591084 #else /* !CONFIG_SMP */
8601085 static void rebuild_sched_domains_locked(void)
....@@ -864,9 +1089,23 @@
8641089
8651090 void rebuild_sched_domains(void)
8661091 {
1092
+ get_online_cpus();
8671093 mutex_lock(&cpuset_mutex);
8681094 rebuild_sched_domains_locked();
8691095 mutex_unlock(&cpuset_mutex);
1096
+ put_online_cpus();
1097
+}
1098
+
1099
+static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p,
1100
+ const struct cpumask *new_mask)
1101
+{
1102
+ int ret = -EINVAL;
1103
+
1104
+ trace_android_rvh_update_cpus_allowed(p, cs->cpus_requested, new_mask, &ret);
1105
+ if (!ret)
1106
+ return ret;
1107
+
1108
+ return set_cpus_allowed_ptr(p, new_mask);
8701109 }
8711110
8721111 /**
....@@ -881,17 +1120,268 @@
8811120 {
8821121 struct css_task_iter it;
8831122 struct task_struct *task;
1123
+ bool top_cs = cs == &top_cpuset;
8841124
8851125 css_task_iter_start(&cs->css, 0, &it);
886
- while ((task = css_task_iter_next(&it)))
887
- set_cpus_allowed_ptr(task, cs->effective_cpus);
1126
+ while ((task = css_task_iter_next(&it))) {
1127
+ /*
1128
+ * Percpu kthreads in top_cpuset are ignored
1129
+ */
1130
+ if (top_cs && (task->flags & PF_KTHREAD) &&
1131
+ kthread_is_per_cpu(task))
1132
+ continue;
1133
+ update_cpus_allowed(cs, task, cs->effective_cpus);
1134
+ }
8881135 css_task_iter_end(&it);
1136
+}
1137
+
1138
+/**
1139
+ * compute_effective_cpumask - Compute the effective cpumask of the cpuset
1140
+ * @new_cpus: the temp variable for the new effective_cpus mask
1141
+ * @cs: the cpuset that needs to recompute the new effective_cpus mask
1142
+ * @parent: the parent cpuset
1143
+ *
1144
+ * If the parent has subpartition CPUs, include them in the list of
1145
+ * allowable CPUs in computing the new effective_cpus mask. Since offlined
1146
+ * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
1147
+ * to mask those out.
1148
+ */
1149
+static void compute_effective_cpumask(struct cpumask *new_cpus,
1150
+ struct cpuset *cs, struct cpuset *parent)
1151
+{
1152
+ if (parent->nr_subparts_cpus) {
1153
+ cpumask_or(new_cpus, parent->effective_cpus,
1154
+ parent->subparts_cpus);
1155
+ cpumask_and(new_cpus, new_cpus, cs->cpus_requested);
1156
+ cpumask_and(new_cpus, new_cpus, cpu_active_mask);
1157
+ } else {
1158
+ cpumask_and(new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
1159
+ }
1160
+}
1161
+
1162
+/*
1163
+ * Commands for update_parent_subparts_cpumask
1164
+ */
1165
+enum subparts_cmd {
1166
+ partcmd_enable, /* Enable partition root */
1167
+ partcmd_disable, /* Disable partition root */
1168
+ partcmd_update, /* Update parent's subparts_cpus */
1169
+};
1170
+
1171
+/**
1172
+ * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
1173
+ * @cpuset: The cpuset that requests change in partition root state
1174
+ * @cmd: Partition root state change command
1175
+ * @newmask: Optional new cpumask for partcmd_update
1176
+ * @tmp: Temporary addmask and delmask
1177
+ * Return: 0, 1 or an error code
1178
+ *
1179
+ * For partcmd_enable, the cpuset is being transformed from a non-partition
1180
+ * root to a partition root. The cpus_allowed mask of the given cpuset will
1181
+ * be put into parent's subparts_cpus and taken away from parent's
1182
+ * effective_cpus. The function will return 0 if all the CPUs listed in
1183
+ * cpus_allowed can be granted or an error code will be returned.
1184
+ *
1185
+ * For partcmd_disable, the cpuset is being transofrmed from a partition
1186
+ * root back to a non-partition root. Any CPUs in cpus_allowed that are in
1187
+ * parent's subparts_cpus will be taken away from that cpumask and put back
1188
+ * into parent's effective_cpus. 0 should always be returned.
1189
+ *
1190
+ * For partcmd_update, if the optional newmask is specified, the cpu
1191
+ * list is to be changed from cpus_allowed to newmask. Otherwise,
1192
+ * cpus_allowed is assumed to remain the same. The cpuset should either
1193
+ * be a partition root or an invalid partition root. The partition root
1194
+ * state may change if newmask is NULL and none of the requested CPUs can
1195
+ * be granted by the parent. The function will return 1 if changes to
1196
+ * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
1197
+ * Error code should only be returned when newmask is non-NULL.
1198
+ *
1199
+ * The partcmd_enable and partcmd_disable commands are used by
1200
+ * update_prstate(). The partcmd_update command is used by
1201
+ * update_cpumasks_hier() with newmask NULL and update_cpumask() with
1202
+ * newmask set.
1203
+ *
1204
+ * The checking is more strict when enabling partition root than the
1205
+ * other two commands.
1206
+ *
1207
+ * Because of the implicit cpu exclusive nature of a partition root,
1208
+ * cpumask changes that violate the cpu exclusivity rule will not be
1209
+ * permitted when checked by validate_change(). The validate_change()
1210
+ * function will also prevent any changes to the cpu list if it is not
1211
+ * a superset of children's cpu lists.
1212
+ */
1213
+static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
1214
+ struct cpumask *newmask,
1215
+ struct tmpmasks *tmp)
1216
+{
1217
+ struct cpuset *parent = parent_cs(cpuset);
1218
+ int adding; /* Moving cpus from effective_cpus to subparts_cpus */
1219
+ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
1220
+ int new_prs;
1221
+ bool part_error = false; /* Partition error? */
1222
+
1223
+ lockdep_assert_held(&cpuset_mutex);
1224
+
1225
+ /*
1226
+ * The parent must be a partition root.
1227
+ * The new cpumask, if present, or the current cpus_allowed must
1228
+ * not be empty.
1229
+ */
1230
+ if (!is_partition_root(parent) ||
1231
+ (newmask && cpumask_empty(newmask)) ||
1232
+ (!newmask && cpumask_empty(cpuset->cpus_allowed)))
1233
+ return -EINVAL;
1234
+
1235
+ /*
1236
+ * Enabling/disabling partition root is not allowed if there are
1237
+ * online children.
1238
+ */
1239
+ if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
1240
+ return -EBUSY;
1241
+
1242
+ /*
1243
+ * Enabling partition root is not allowed if not all the CPUs
1244
+ * can be granted from parent's effective_cpus or at least one
1245
+ * CPU will be left after that.
1246
+ */
1247
+ if ((cmd == partcmd_enable) &&
1248
+ (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
1249
+ cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
1250
+ return -EINVAL;
1251
+
1252
+ /*
1253
+ * A cpumask update cannot make parent's effective_cpus become empty.
1254
+ */
1255
+ adding = deleting = false;
1256
+ new_prs = cpuset->partition_root_state;
1257
+ if (cmd == partcmd_enable) {
1258
+ cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
1259
+ adding = true;
1260
+ } else if (cmd == partcmd_disable) {
1261
+ deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
1262
+ parent->subparts_cpus);
1263
+ } else if (newmask) {
1264
+ /*
1265
+ * partcmd_update with newmask:
1266
+ *
1267
+ * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
1268
+ * addmask = newmask & parent->effective_cpus
1269
+ * & ~parent->subparts_cpus
1270
+ */
1271
+ cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
1272
+ deleting = cpumask_and(tmp->delmask, tmp->delmask,
1273
+ parent->subparts_cpus);
1274
+
1275
+ cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
1276
+ adding = cpumask_andnot(tmp->addmask, tmp->addmask,
1277
+ parent->subparts_cpus);
1278
+ /*
1279
+ * Return error if the new effective_cpus could become empty.
1280
+ */
1281
+ if (adding &&
1282
+ cpumask_equal(parent->effective_cpus, tmp->addmask)) {
1283
+ if (!deleting)
1284
+ return -EINVAL;
1285
+ /*
1286
+ * As some of the CPUs in subparts_cpus might have
1287
+ * been offlined, we need to compute the real delmask
1288
+ * to confirm that.
1289
+ */
1290
+ if (!cpumask_and(tmp->addmask, tmp->delmask,
1291
+ cpu_active_mask))
1292
+ return -EINVAL;
1293
+ cpumask_copy(tmp->addmask, parent->effective_cpus);
1294
+ }
1295
+ } else {
1296
+ /*
1297
+ * partcmd_update w/o newmask:
1298
+ *
1299
+ * addmask = cpus_allowed & parent->effective_cpus
1300
+ *
1301
+ * Note that parent's subparts_cpus may have been
1302
+ * pre-shrunk in case there is a change in the cpu list.
1303
+ * So no deletion is needed.
1304
+ */
1305
+ adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
1306
+ parent->effective_cpus);
1307
+ part_error = cpumask_equal(tmp->addmask,
1308
+ parent->effective_cpus);
1309
+ }
1310
+
1311
+ if (cmd == partcmd_update) {
1312
+ int prev_prs = cpuset->partition_root_state;
1313
+
1314
+ /*
1315
+ * Check for possible transition between PRS_ENABLED
1316
+ * and PRS_ERROR.
1317
+ */
1318
+ switch (cpuset->partition_root_state) {
1319
+ case PRS_ENABLED:
1320
+ if (part_error)
1321
+ new_prs = PRS_ERROR;
1322
+ break;
1323
+ case PRS_ERROR:
1324
+ if (!part_error)
1325
+ new_prs = PRS_ENABLED;
1326
+ break;
1327
+ }
1328
+ /*
1329
+ * Set part_error if previously in invalid state.
1330
+ */
1331
+ part_error = (prev_prs == PRS_ERROR);
1332
+ }
1333
+
1334
+ if (!part_error && (new_prs == PRS_ERROR))
1335
+ return 0; /* Nothing need to be done */
1336
+
1337
+ if (new_prs == PRS_ERROR) {
1338
+ /*
1339
+ * Remove all its cpus from parent's subparts_cpus.
1340
+ */
1341
+ adding = false;
1342
+ deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
1343
+ parent->subparts_cpus);
1344
+ }
1345
+
1346
+ if (!adding && !deleting && (new_prs == cpuset->partition_root_state))
1347
+ return 0;
1348
+
1349
+ /*
1350
+ * Change the parent's subparts_cpus.
1351
+ * Newly added CPUs will be removed from effective_cpus and
1352
+ * newly deleted ones will be added back to effective_cpus.
1353
+ */
1354
+ spin_lock_irq(&callback_lock);
1355
+ if (adding) {
1356
+ cpumask_or(parent->subparts_cpus,
1357
+ parent->subparts_cpus, tmp->addmask);
1358
+ cpumask_andnot(parent->effective_cpus,
1359
+ parent->effective_cpus, tmp->addmask);
1360
+ }
1361
+ if (deleting) {
1362
+ cpumask_andnot(parent->subparts_cpus,
1363
+ parent->subparts_cpus, tmp->delmask);
1364
+ /*
1365
+ * Some of the CPUs in subparts_cpus might have been offlined.
1366
+ */
1367
+ cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
1368
+ cpumask_or(parent->effective_cpus,
1369
+ parent->effective_cpus, tmp->delmask);
1370
+ }
1371
+
1372
+ parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
1373
+
1374
+ if (cpuset->partition_root_state != new_prs)
1375
+ cpuset->partition_root_state = new_prs;
1376
+ spin_unlock_irq(&callback_lock);
1377
+
1378
+ return cmd == partcmd_update;
8891379 }
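/*
 * Worked example, not part of the patch, for the partcmd_update-with-newmask
 * formulas in update_parent_subparts_cpumask() above.  Assume the parent has
 * effective_cpus = 0-3 and subparts_cpus = 4-7, and this partition root
 * changes its cpu list from 4-7 (cpus_allowed) to 2-5 (newmask):
 *
 *   delmask = cpus_allowed & ~newmask & parent->subparts_cpus
 *           = {4-7} & ~{2-5} & {4-7}                  = {6,7}
 *   addmask = newmask & parent->effective_cpus & ~parent->subparts_cpus
 *           = {2-5} & {0-3} & ~{4-7}                  = {2,3}
 *
 * so CPUs 6-7 are handed back to the parent's effective_cpus while CPUs 2-3
 * are pulled out of it into the parent's subparts_cpus.
 */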
8901380
8911381 /*
8921382 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
893
- * @cs: the cpuset to consider
894
- * @new_cpus: temp variable for calculating new effective_cpus
1383
+ * @cs: the cpuset to consider
1384
+ * @tmp: temp variables for calculating effective_cpus & partition setup
8951385 *
8961386 * When configured cpumask is changed, the effective cpumasks of this cpuset
8971387 * and all its descendants need to be updated.
....@@ -900,38 +1390,127 @@
9001390 *
9011391 * Called with cpuset_mutex held
9021392 */
903
-static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
1393
+static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
9041394 {
9051395 struct cpuset *cp;
9061396 struct cgroup_subsys_state *pos_css;
9071397 bool need_rebuild_sched_domains = false;
1398
+ int new_prs;
9081399
9091400 rcu_read_lock();
9101401 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
9111402 struct cpuset *parent = parent_cs(cp);
9121403
913
- cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
1404
+ compute_effective_cpumask(tmp->new_cpus, cp, parent);
9141405
9151406 /*
9161407 * If it becomes empty, inherit the effective mask of the
9171408 * parent, which is guaranteed to have some CPUs.
9181409 */
919
- if (is_in_v2_mode() && cpumask_empty(new_cpus))
920
- cpumask_copy(new_cpus, parent->effective_cpus);
1410
+ if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
1411
+ cpumask_copy(tmp->new_cpus, parent->effective_cpus);
1412
+ if (!cp->use_parent_ecpus) {
1413
+ cp->use_parent_ecpus = true;
1414
+ parent->child_ecpus_count++;
1415
+ }
1416
+ } else if (cp->use_parent_ecpus) {
1417
+ cp->use_parent_ecpus = false;
1418
+ WARN_ON_ONCE(!parent->child_ecpus_count);
1419
+ parent->child_ecpus_count--;
1420
+ }
9211421
922
- /* Skip the whole subtree if the cpumask remains the same. */
923
- if (cpumask_equal(new_cpus, cp->effective_cpus)) {
1422
+ /*
1423
+ * Skip the whole subtree if the cpumask remains the same
1424
+ * and has no partition root state.
1425
+ */
1426
+ if (!cp->partition_root_state &&
1427
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
9241428 pos_css = css_rightmost_descendant(pos_css);
9251429 continue;
1430
+ }
1431
+
1432
+ /*
1433
+ * update_parent_subparts_cpumask() should have been called
1434
+ * for cs already in update_cpumask(). We should also call
1435
+ * update_tasks_cpumask() again for tasks in the parent
1436
+ * cpuset if the parent's subparts_cpus changes.
1437
+ */
1438
+ new_prs = cp->partition_root_state;
1439
+ if ((cp != cs) && new_prs) {
1440
+ switch (parent->partition_root_state) {
1441
+ case PRS_DISABLED:
1442
+ /*
1443
+ * If parent is not a partition root or an
1444
+ * invalid partition root, clear its state
1445
+ * and its CS_CPU_EXCLUSIVE flag.
1446
+ */
1447
+ WARN_ON_ONCE(cp->partition_root_state
1448
+ != PRS_ERROR);
1449
+ new_prs = PRS_DISABLED;
1450
+
1451
+ /*
1452
+ * clear_bit() is an atomic operation and
1453
+ * readers aren't interested in the state
1454
+ * of CS_CPU_EXCLUSIVE anyway. So we can
1455
+ * just update the flag without holding
1456
+ * the callback_lock.
1457
+ */
1458
+ clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
1459
+ break;
1460
+
1461
+ case PRS_ENABLED:
1462
+ if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
1463
+ update_tasks_cpumask(parent);
1464
+ break;
1465
+
1466
+ case PRS_ERROR:
1467
+ /*
1468
+ * When the parent is invalid, the child has to be invalid too.
1469
+ */
1470
+ new_prs = PRS_ERROR;
1471
+ break;
1472
+ }
9261473 }
9271474
9281475 if (!css_tryget_online(&cp->css))
9291476 continue;
9301477 rcu_read_unlock();
9311478
932
- raw_spin_lock_irq(&callback_lock);
933
- cpumask_copy(cp->effective_cpus, new_cpus);
934
- raw_spin_unlock_irq(&callback_lock);
1479
+ spin_lock_irq(&callback_lock);
1480
+
1481
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1482
+ if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
1483
+ cp->nr_subparts_cpus = 0;
1484
+ cpumask_clear(cp->subparts_cpus);
1485
+ } else if (cp->nr_subparts_cpus) {
1486
+ /*
1487
+ * Make sure that effective_cpus & subparts_cpus
1488
+ * are mutually exclusive.
1489
+ *
1490
+ * In the unlikely event that effective_cpus
1491
+ * becomes empty. we clear cp->nr_subparts_cpus and
1492
+ * let its child partition roots to compete for
1493
+ * CPUs again.
1494
+ */
1495
+ cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
1496
+ cp->subparts_cpus);
1497
+ if (cpumask_empty(cp->effective_cpus)) {
1498
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1499
+ cpumask_clear(cp->subparts_cpus);
1500
+ cp->nr_subparts_cpus = 0;
1501
+ } else if (!cpumask_subset(cp->subparts_cpus,
1502
+ tmp->new_cpus)) {
1503
+ cpumask_andnot(cp->subparts_cpus,
1504
+ cp->subparts_cpus, tmp->new_cpus);
1505
+ cp->nr_subparts_cpus
1506
+ = cpumask_weight(cp->subparts_cpus);
1507
+ }
1508
+ }
1509
+
1510
+ if (new_prs != cp->partition_root_state)
1511
+ cp->partition_root_state = new_prs;
1512
+
1513
+ spin_unlock_irq(&callback_lock);
9351514
9361515 WARN_ON(!is_in_v2_mode() &&
9371516 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
....@@ -939,11 +1518,15 @@
9391518 update_tasks_cpumask(cp);
9401519
9411520 /*
942
- * If the effective cpumask of any non-empty cpuset is changed,
943
- * we need to rebuild sched domains.
1521
+ * On legacy hierarchy, if the effective cpumask of any non-
1522
+ * empty cpuset is changed, we need to rebuild sched domains.
1523
+ * On default hierarchy, the cpuset needs to be a partition
1524
+ * root as well.
9441525 */
9451526 if (!cpumask_empty(cp->cpus_allowed) &&
946
- is_sched_load_balance(cp))
1527
+ is_sched_load_balance(cp) &&
1528
+ (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
1529
+ is_partition_root(cp)))
9471530 need_rebuild_sched_domains = true;
9481531
9491532 rcu_read_lock();
....@@ -956,6 +1539,45 @@
9561539 }
9571540
9581541 /**
1542
+ * update_sibling_cpumasks - Update siblings cpumasks
1543
+ * @parent: Parent cpuset
1544
+ * @cs: Current cpuset
1545
+ * @tmp: Temp variables
1546
+ */
1547
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
1548
+ struct tmpmasks *tmp)
1549
+{
1550
+ struct cpuset *sibling;
1551
+ struct cgroup_subsys_state *pos_css;
1552
+
1553
+ lockdep_assert_held(&cpuset_mutex);
1554
+
1555
+ /*
1556
+ * Check all its siblings and call update_cpumasks_hier()
1557
+ * if their use_parent_ecpus flag is set in order for them
1558
+ * to use the right effective_cpus value.
1559
+ *
1560
+ * The update_cpumasks_hier() function may sleep. So we have to
1561
+ * release the RCU read lock before calling it.
1562
+ */
1563
+ rcu_read_lock();
1564
+ cpuset_for_each_child(sibling, pos_css, parent) {
1565
+ if (sibling == cs)
1566
+ continue;
1567
+ if (!sibling->use_parent_ecpus)
1568
+ continue;
1569
+ if (!css_tryget_online(&sibling->css))
1570
+ continue;
1571
+
1572
+ rcu_read_unlock();
1573
+ update_cpumasks_hier(sibling, tmp);
1574
+ rcu_read_lock();
1575
+ css_put(&sibling->css);
1576
+ }
1577
+ rcu_read_unlock();
1578
+}
1579
+
1580
+/**
9591581 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
9601582 * @cs: the cpuset to consider
9611583 * @trialcs: trial cpuset
....@@ -965,6 +1587,7 @@
9651587 const char *buf)
9661588 {
9671589 int retval;
1590
+ struct tmpmasks tmp;
9681591
9691592 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
9701593 if (cs == &top_cpuset)
....@@ -997,13 +1620,50 @@
9971620 if (retval < 0)
9981621 return retval;
9991622
1000
- raw_spin_lock_irq(&callback_lock);
1623
+#ifdef CONFIG_CPUMASK_OFFSTACK
1624
+ /*
1625
+ * Use the cpumasks in trialcs for tmpmasks when they are pointers
1626
+ * to allocated cpumasks.
1627
+ */
1628
+ tmp.addmask = trialcs->subparts_cpus;
1629
+ tmp.delmask = trialcs->effective_cpus;
1630
+ tmp.new_cpus = trialcs->cpus_allowed;
1631
+#endif
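/*
 * Note, not part of the patch: with CONFIG_CPUMASK_OFFSTACK=n a
 * cpumask_var_t is a real array, so the struct tmpmasks on the stack
 * already provides usable storage and the #ifdef block above is not
 * needed.  With CONFIG_CPUMASK_OFFSTACK=y the fields are bare pointers,
 * which is why they are aimed at the cpumasks already allocated inside
 * trialcs instead of allocating fresh ones here.
 */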
1632
+
1633
+ if (cs->partition_root_state) {
1634
+ /* Cpumask of a partition root cannot be empty */
1635
+ if (cpumask_empty(trialcs->cpus_allowed))
1636
+ return -EINVAL;
1637
+ if (update_parent_subparts_cpumask(cs, partcmd_update,
1638
+ trialcs->cpus_allowed, &tmp) < 0)
1639
+ return -EINVAL;
1640
+ }
1641
+
1642
+ spin_lock_irq(&callback_lock);
10011643 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
10021644 cpumask_copy(cs->cpus_requested, trialcs->cpus_requested);
1003
- raw_spin_unlock_irq(&callback_lock);
10041645
1005
- /* use trialcs->cpus_allowed as a temp variable */
1006
- update_cpumasks_hier(cs, trialcs->cpus_allowed);
1646
+ /*
1647
+ * Make sure that subparts_cpus is a subset of cpus_allowed.
1648
+ */
1649
+ if (cs->nr_subparts_cpus) {
1650
+ cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed);
1651
+ cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
1652
+ }
1653
+ spin_unlock_irq(&callback_lock);
1654
+
1655
+ update_cpumasks_hier(cs, &tmp);
1656
+
1657
+ if (cs->partition_root_state) {
1658
+ struct cpuset *parent = parent_cs(cs);
1659
+
1660
+ /*
1661
+ * For partition root, update the cpumasks of sibling
1662
+ * cpusets if they use parent's effective_cpus.
1663
+ */
1664
+ if (parent->child_ecpus_count)
1665
+ update_sibling_cpumasks(parent, cs, &tmp);
1666
+ }
10071667 return 0;
10081668 }
10091669
....@@ -1104,7 +1764,7 @@
11041764 guarantee_online_mems(cs, &newmems);
11051765
11061766 /*
1107
- * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1767
+ * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
11081768 * take while holding tasklist_lock. Forks can happen - the
11091769 * mpol_dup() cpuset_being_rebound check will catch such forks,
11101770 * and rebind their vma mempolicies too. Because we still hold
....@@ -1184,9 +1844,9 @@
11841844 continue;
11851845 rcu_read_unlock();
11861846
1187
- raw_spin_lock_irq(&callback_lock);
1847
+ spin_lock_irq(&callback_lock);
11881848 cp->effective_mems = *new_mems;
1189
- raw_spin_unlock_irq(&callback_lock);
1849
+ spin_unlock_irq(&callback_lock);
11901850
11911851 WARN_ON(!is_in_v2_mode() &&
11921852 !nodes_equal(cp->mems_allowed, cp->effective_mems));
....@@ -1209,7 +1869,7 @@
12091869 *
12101870 * Call with cpuset_mutex held. May take callback_lock during call.
12111871 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1212
- * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1872
+ * lock each such tasks mm->mmap_lock, scan its vma's and rebind
12131873 * their mempolicies to the cpusets new mems_allowed.
12141874 */
12151875 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
....@@ -1254,9 +1914,9 @@
12541914 if (retval < 0)
12551915 goto done;
12561916
1257
- raw_spin_lock_irq(&callback_lock);
1917
+ spin_lock_irq(&callback_lock);
12581918 cs->mems_allowed = trialcs->mems_allowed;
1259
- raw_spin_unlock_irq(&callback_lock);
1919
+ spin_unlock_irq(&callback_lock);
12601920
12611921 /* use trialcs->mems_allowed as a temp variable */
12621922 update_nodemasks_hier(cs, &trialcs->mems_allowed);
....@@ -1347,9 +2007,9 @@
13472007 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
13482008 || (is_spread_page(cs) != is_spread_page(trialcs)));
13492009
1350
- raw_spin_lock_irq(&callback_lock);
2010
+ spin_lock_irq(&callback_lock);
13512011 cs->flags = trialcs->flags;
1352
- raw_spin_unlock_irq(&callback_lock);
2012
+ spin_unlock_irq(&callback_lock);
13532013
13542014 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
13552015 rebuild_sched_domains_locked();
....@@ -1357,7 +2017,90 @@
13572017 if (spread_flag_changed)
13582018 update_tasks_flags(cs);
13592019 out:
1360
- free_trial_cpuset(trialcs);
2020
+ free_cpuset(trialcs);
2021
+ return err;
2022
+}
2023
+
2024
+/*
2025
+ * update_prstate - update partition_root_state
2026
+ * cs: the cpuset to update
2027
+ * new_prs: new partition root state
2028
+ *
2029
+ * Call with cpuset_mutex held.
2030
+ */
2031
+static int update_prstate(struct cpuset *cs, int new_prs)
2032
+{
2033
+ int err, old_prs = cs->partition_root_state;
2034
+ struct cpuset *parent = parent_cs(cs);
2035
+ struct tmpmasks tmpmask;
2036
+
2037
+ if (old_prs == new_prs)
2038
+ return 0;
2039
+
2040
+ /*
2041
+ * Cannot force a partial or invalid partition root to a full
2042
+ * partition root.
2043
+ */
2044
+ if (new_prs && (old_prs == PRS_ERROR))
2045
+ return -EINVAL;
2046
+
2047
+ if (alloc_cpumasks(NULL, &tmpmask))
2048
+ return -ENOMEM;
2049
+
2050
+ err = -EINVAL;
2051
+ if (!old_prs) {
2052
+ /*
2053
+ * Turning on partition root requires setting the
2054
+ * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
2055
+ * cannot be NULL.
2056
+ */
2057
+ if (cpumask_empty(cs->cpus_allowed))
2058
+ goto out;
2059
+
2060
+ err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
2061
+ if (err)
2062
+ goto out;
2063
+
2064
+ err = update_parent_subparts_cpumask(cs, partcmd_enable,
2065
+ NULL, &tmpmask);
2066
+ if (err) {
2067
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2068
+ goto out;
2069
+ }
2070
+ } else {
2071
+ /*
2072
+ * Turning off partition root will clear the
2073
+ * CS_CPU_EXCLUSIVE bit.
2074
+ */
2075
+ if (old_prs == PRS_ERROR) {
2076
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2077
+ err = 0;
2078
+ goto out;
2079
+ }
2080
+
2081
+ err = update_parent_subparts_cpumask(cs, partcmd_disable,
2082
+ NULL, &tmpmask);
2083
+ if (err)
2084
+ goto out;
2085
+
2086
+ /* Turning off CS_CPU_EXCLUSIVE will not return error */
2087
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2088
+ }
2089
+
2090
+ update_tasks_cpumask(parent);
2091
+
2092
+ if (parent->child_ecpus_count)
2093
+ update_sibling_cpumasks(parent, cs, &tmpmask);
2094
+
2095
+ rebuild_sched_domains_locked();
2096
+out:
2097
+ if (!err) {
2098
+ spin_lock_irq(&callback_lock);
2099
+ cs->partition_root_state = new_prs;
2100
+ spin_unlock_irq(&callback_lock);
2101
+ }
2102
+
2103
+ free_cpumasks(NULL, &tmpmask);
13612104 return err;
13622105 }
13632106
....@@ -1464,16 +2207,23 @@
14642207
14652208 static struct cpuset *cpuset_attach_old_cs;
14662209
2210
+static void reset_migrate_dl_data(struct cpuset *cs)
2211
+{
2212
+ cs->nr_migrate_dl_tasks = 0;
2213
+ cs->sum_migrate_dl_bw = 0;
2214
+}
2215
+
14672216 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
14682217 static int cpuset_can_attach(struct cgroup_taskset *tset)
14692218 {
14702219 struct cgroup_subsys_state *css;
1471
- struct cpuset *cs;
2220
+ struct cpuset *cs, *oldcs;
14722221 struct task_struct *task;
14732222 int ret;
14742223
14752224 /* used later by cpuset_attach() */
14762225 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
2226
+ oldcs = cpuset_attach_old_cs;
14772227 cs = css_cs(css);
14782228
14792229 mutex_lock(&cpuset_mutex);
....@@ -1485,14 +2235,39 @@
14852235 goto out_unlock;
14862236
14872237 cgroup_taskset_for_each(task, css, tset) {
1488
- ret = task_can_attach(task, cs->cpus_allowed);
2238
+ ret = task_can_attach(task);
14892239 if (ret)
14902240 goto out_unlock;
14912241 ret = security_task_setscheduler(task);
14922242 if (ret)
14932243 goto out_unlock;
2244
+
2245
+ if (dl_task(task)) {
2246
+ cs->nr_migrate_dl_tasks++;
2247
+ cs->sum_migrate_dl_bw += task->dl.dl_bw;
2248
+ }
14942249 }
14952250
2251
+ if (!cs->nr_migrate_dl_tasks)
2252
+ goto out_success;
2253
+
2254
+ if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
2255
+ int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
2256
+
2257
+ if (unlikely(cpu >= nr_cpu_ids)) {
2258
+ reset_migrate_dl_data(cs);
2259
+ ret = -EINVAL;
2260
+ goto out_unlock;
2261
+ }
2262
+
2263
+ ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
2264
+ if (ret) {
2265
+ reset_migrate_dl_data(cs);
2266
+ goto out_unlock;
2267
+ }
2268
+ }
2269
+
2270
+out_success:
14962271 /*
14972272 * Mark attach is in progress. This makes validate_change() fail
14982273 * changes which zero cpus/mems_allowed.
....@@ -1513,7 +2288,17 @@
15132288 cs = css_cs(css);
15142289
15152290 mutex_lock(&cpuset_mutex);
1516
- css_cs(css)->attach_in_progress--;
2291
+ cs->attach_in_progress--;
2292
+ if (!cs->attach_in_progress)
2293
+ wake_up(&cpuset_attach_wq);
2294
+
2295
+ if (cs->nr_migrate_dl_tasks) {
2296
+ int cpu = cpumask_any(cs->effective_cpus);
2297
+
2298
+ dl_bw_free(cpu, cs->sum_migrate_dl_bw);
2299
+ reset_migrate_dl_data(cs);
2300
+ }
2301
+
15172302 mutex_unlock(&cpuset_mutex);
15182303 }
15192304
....@@ -1537,23 +2322,21 @@
15372322 cgroup_taskset_first(tset, &css);
15382323 cs = css_cs(css);
15392324
1540
- cpus_read_lock();
2325
+ lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
15412326 mutex_lock(&cpuset_mutex);
1542
-
1543
- /* prepare for attach */
1544
- if (cs == &top_cpuset)
1545
- cpumask_copy(cpus_attach, cpu_possible_mask);
1546
- else
1547
- guarantee_online_cpus(cs, cpus_attach);
15482327
15492328 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
15502329
15512330 cgroup_taskset_for_each(task, css, tset) {
2331
+ if (cs != &top_cpuset)
2332
+ guarantee_online_cpus(task, cpus_attach);
2333
+ else
2334
+ cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
15522335 /*
15532336 * can_attach beforehand should guarantee that this doesn't
15542337 * fail. TODO: have a better way to handle failure here
15552338 */
1556
- WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
2339
+ WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach));
15572340
15582341 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
15592342 cpuset_update_task_spread_flag(cs, task);
....@@ -1588,12 +2371,17 @@
15882371
15892372 cs->old_mems_allowed = cpuset_attach_nodemask_to;
15902373
2374
+ if (cs->nr_migrate_dl_tasks) {
2375
+ cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
2376
+ oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
2377
+ reset_migrate_dl_data(cs);
2378
+ }
2379
+
15912380 cs->attach_in_progress--;
15922381 if (!cs->attach_in_progress)
15932382 wake_up(&cpuset_attach_wq);
15942383
15952384 mutex_unlock(&cpuset_mutex);
1596
- cpus_read_unlock();
15972385 }
15982386
15992387 /* The various types of files and directories in a cpuset file system */
....@@ -1604,10 +2392,12 @@
16042392 FILE_MEMLIST,
16052393 FILE_EFFECTIVE_CPULIST,
16062394 FILE_EFFECTIVE_MEMLIST,
2395
+ FILE_SUBPARTS_CPULIST,
16072396 FILE_CPU_EXCLUSIVE,
16082397 FILE_MEM_EXCLUSIVE,
16092398 FILE_MEM_HARDWALL,
16102399 FILE_SCHED_LOAD_BALANCE,
2400
+ FILE_PARTITION_ROOT,
16112401 FILE_SCHED_RELAX_DOMAIN_LEVEL,
16122402 FILE_MEMORY_PRESSURE_ENABLED,
16132403 FILE_MEMORY_PRESSURE,
....@@ -1622,6 +2412,7 @@
16222412 cpuset_filetype_t type = cft->private;
16232413 int retval = 0;
16242414
2415
+ get_online_cpus();
16252416 mutex_lock(&cpuset_mutex);
16262417 if (!is_cpuset_online(cs)) {
16272418 retval = -ENODEV;
....@@ -1659,6 +2450,7 @@
16592450 }
16602451 out_unlock:
16612452 mutex_unlock(&cpuset_mutex);
2453
+ put_online_cpus();
16622454 return retval;
16632455 }
16642456
....@@ -1669,6 +2461,7 @@
16692461 cpuset_filetype_t type = cft->private;
16702462 int retval = -ENODEV;
16712463
2464
+ get_online_cpus();
16722465 mutex_lock(&cpuset_mutex);
16732466 if (!is_cpuset_online(cs))
16742467 goto out_unlock;
....@@ -1683,6 +2476,7 @@
16832476 }
16842477 out_unlock:
16852478 mutex_unlock(&cpuset_mutex);
2479
+ put_online_cpus();
16862480 return retval;
16872481 }
16882482
....@@ -1721,6 +2515,7 @@
17212515 kernfs_break_active_protection(of->kn);
17222516 flush_work(&cpuset_hotplug_work);
17232517
2518
+ get_online_cpus();
17242519 mutex_lock(&cpuset_mutex);
17252520 if (!is_cpuset_online(cs))
17262521 goto out_unlock;
....@@ -1743,9 +2538,10 @@
17432538 break;
17442539 }
17452540
1746
- free_trial_cpuset(trialcs);
2541
+ free_cpuset(trialcs);
17472542 out_unlock:
17482543 mutex_unlock(&cpuset_mutex);
2544
+ put_online_cpus();
17492545 kernfs_unbreak_active_protection(of->kn);
17502546 css_put(&cs->css);
17512547 flush_workqueue(cpuset_migrate_mm_wq);
....@@ -1766,7 +2562,7 @@
17662562 cpuset_filetype_t type = seq_cft(sf)->private;
17672563 int ret = 0;
17682564
1769
- raw_spin_lock_irq(&callback_lock);
2565
+ spin_lock_irq(&callback_lock);
17702566
17712567 switch (type) {
17722568 case FILE_CPULIST:
....@@ -1781,11 +2577,14 @@
17812577 case FILE_EFFECTIVE_MEMLIST:
17822578 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
17832579 break;
2580
+ case FILE_SUBPARTS_CPULIST:
2581
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
2582
+ break;
17842583 default:
17852584 ret = -EINVAL;
17862585 }
17872586
1788
- raw_spin_unlock_irq(&callback_lock);
2587
+ spin_unlock_irq(&callback_lock);
17892588 return ret;
17902589 }
17912590
....@@ -1835,12 +2634,62 @@
18352634 return 0;
18362635 }
18372636
2637
+static int sched_partition_show(struct seq_file *seq, void *v)
2638
+{
2639
+ struct cpuset *cs = css_cs(seq_css(seq));
2640
+
2641
+ switch (cs->partition_root_state) {
2642
+ case PRS_ENABLED:
2643
+ seq_puts(seq, "root\n");
2644
+ break;
2645
+ case PRS_DISABLED:
2646
+ seq_puts(seq, "member\n");
2647
+ break;
2648
+ case PRS_ERROR:
2649
+ seq_puts(seq, "root invalid\n");
2650
+ break;
2651
+ }
2652
+ return 0;
2653
+}
2654
+
2655
+static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
2656
+ size_t nbytes, loff_t off)
2657
+{
2658
+ struct cpuset *cs = css_cs(of_css(of));
2659
+ int val;
2660
+ int retval = -ENODEV;
2661
+
2662
+ buf = strstrip(buf);
2663
+
2664
+ /*
2665
+ * Convert "root" to ENABLED, and convert "member" to DISABLED.
2666
+ */
2667
+ if (!strcmp(buf, "root"))
2668
+ val = PRS_ENABLED;
2669
+ else if (!strcmp(buf, "member"))
2670
+ val = PRS_DISABLED;
2671
+ else
2672
+ return -EINVAL;
2673
+
2674
+ css_get(&cs->css);
2675
+ get_online_cpus();
2676
+ mutex_lock(&cpuset_mutex);
2677
+ if (!is_cpuset_online(cs))
2678
+ goto out_unlock;
2679
+
2680
+ retval = update_prstate(cs, val);
2681
+out_unlock:
2682
+ mutex_unlock(&cpuset_mutex);
2683
+ put_online_cpus();
2684
+ css_put(&cs->css);
2685
+ return retval ?: nbytes;
2686
+}
18382687
18392688 /*
18402689 * for the common functions, 'private' gives the type of file
18412690 */
18422691
1843
-static struct cftype files[] = {
2692
+static struct cftype legacy_files[] = {
18442693 {
18452694 .name = "cpus",
18462695 .seq_show = cpuset_common_seq_show,
....@@ -1943,6 +2792,60 @@
19432792 };
19442793
19452794 /*
2795
+ * This is currently a minimal set for the default hierarchy. It can be
2796
+ * expanded later on by migrating more features and control files from v1.
2797
+ */
2798
+static struct cftype dfl_files[] = {
2799
+ {
2800
+ .name = "cpus",
2801
+ .seq_show = cpuset_common_seq_show,
2802
+ .write = cpuset_write_resmask,
2803
+ .max_write_len = (100U + 6 * NR_CPUS),
2804
+ .private = FILE_CPULIST,
2805
+ .flags = CFTYPE_NOT_ON_ROOT,
2806
+ },
2807
+
2808
+ {
2809
+ .name = "mems",
2810
+ .seq_show = cpuset_common_seq_show,
2811
+ .write = cpuset_write_resmask,
2812
+ .max_write_len = (100U + 6 * MAX_NUMNODES),
2813
+ .private = FILE_MEMLIST,
2814
+ .flags = CFTYPE_NOT_ON_ROOT,
2815
+ },
2816
+
2817
+ {
2818
+ .name = "cpus.effective",
2819
+ .seq_show = cpuset_common_seq_show,
2820
+ .private = FILE_EFFECTIVE_CPULIST,
2821
+ },
2822
+
2823
+ {
2824
+ .name = "mems.effective",
2825
+ .seq_show = cpuset_common_seq_show,
2826
+ .private = FILE_EFFECTIVE_MEMLIST,
2827
+ },
2828
+
2829
+ {
2830
+ .name = "cpus.partition",
2831
+ .seq_show = sched_partition_show,
2832
+ .write = sched_partition_write,
2833
+ .private = FILE_PARTITION_ROOT,
2834
+ .flags = CFTYPE_NOT_ON_ROOT,
2835
+ },
2836
+
2837
+ {
2838
+ .name = "cpus.subpartitions",
2839
+ .seq_show = cpuset_common_seq_show,
2840
+ .private = FILE_SUBPARTS_CPULIST,
2841
+ .flags = CFTYPE_DEBUG,
2842
+ },
2843
+
2844
+ { } /* terminate */
2845
+};
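/*
 * Summary, not part of the patch: on the default hierarchy the controller
 * therefore exposes cpuset.cpus, cpuset.mems, cpuset.cpus.effective,
 * cpuset.mems.effective, cpuset.cpus.partition and, when cgroup debug
 * files are enabled, cpuset.cpus.subpartitions.
 */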
2846
+
2847
+
2848
+/*
19462849 * cpuset_css_alloc - allocate a cpuset css
19472850 * cgrp: control group that the new cpuset will be part of
19482851 */
....@@ -1958,31 +2861,19 @@
19582861 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
19592862 if (!cs)
19602863 return ERR_PTR(-ENOMEM);
1961
- if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
1962
- goto free_cs;
1963
- if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
1964
- goto free_allowed;
1965
- if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
1966
- goto free_requested;
2864
+
2865
+ if (alloc_cpumasks(cs, NULL)) {
2866
+ kfree(cs);
2867
+ return ERR_PTR(-ENOMEM);
2868
+ }
19672869
19682870 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1969
- cpumask_clear(cs->cpus_allowed);
1970
- cpumask_clear(cs->cpus_requested);
19712871 nodes_clear(cs->mems_allowed);
1972
- cpumask_clear(cs->effective_cpus);
19732872 nodes_clear(cs->effective_mems);
19742873 fmeter_init(&cs->fmeter);
19752874 cs->relax_domain_level = -1;
19762875
19772876 return &cs->css;
1978
-
1979
-free_requested:
1980
- free_cpumask_var(cs->cpus_requested);
1981
-free_allowed:
1982
- free_cpumask_var(cs->cpus_allowed);
1983
-free_cs:
1984
- kfree(cs);
1985
- return ERR_PTR(-ENOMEM);
19862877 }
19872878
19882879 static int cpuset_css_online(struct cgroup_subsys_state *css)
....@@ -1995,6 +2886,7 @@
19952886 if (!parent)
19962887 return 0;
19972888
2889
+ get_online_cpus();
19982890 mutex_lock(&cpuset_mutex);
19992891
20002892 set_bit(CS_ONLINE, &cs->flags);
....@@ -2005,12 +2897,14 @@
20052897
20062898 cpuset_inc();
20072899
2008
- raw_spin_lock_irq(&callback_lock);
2900
+ spin_lock_irq(&callback_lock);
20092901 if (is_in_v2_mode()) {
20102902 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
20112903 cs->effective_mems = parent->effective_mems;
2904
+ cs->use_parent_ecpus = true;
2905
+ parent->child_ecpus_count++;
20122906 }
2013
- raw_spin_unlock_irq(&callback_lock);
2907
+ spin_unlock_irq(&callback_lock);
20142908
20152909 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
20162910 goto out_unlock;
....@@ -2037,53 +2931,69 @@
20372931 }
20382932 rcu_read_unlock();
20392933
2040
- raw_spin_lock_irq(&callback_lock);
2934
+ spin_lock_irq(&callback_lock);
20412935 cs->mems_allowed = parent->mems_allowed;
20422936 cs->effective_mems = parent->mems_allowed;
20432937 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
20442938 cpumask_copy(cs->cpus_requested, parent->cpus_requested);
20452939 cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
2046
- raw_spin_unlock_irq(&callback_lock);
2940
+ spin_unlock_irq(&callback_lock);
20472941 out_unlock:
20482942 mutex_unlock(&cpuset_mutex);
2943
+ put_online_cpus();
20492944 return 0;
20502945 }
20512946
20522947 /*
20532948 * If the cpuset being removed has its flag 'sched_load_balance'
20542949 * enabled, then simulate turning sched_load_balance off, which
2055
- * will call rebuild_sched_domains_locked().
2950
+ * will call rebuild_sched_domains_locked(). That is not needed
2951
+ * in the default hierarchy where only changes in partition
2952
+ * will cause repartitioning.
2953
+ *
2954
+ * If the cpuset has the 'sched.partition' flag enabled, simulate
2955
+ * turning 'sched.partition' off.
20562956 */
20572957
20582958 static void cpuset_css_offline(struct cgroup_subsys_state *css)
20592959 {
20602960 struct cpuset *cs = css_cs(css);
20612961
2962
+ get_online_cpus();
20622963 mutex_lock(&cpuset_mutex);
20632964
2064
- if (is_sched_load_balance(cs))
2965
+ if (is_partition_root(cs))
2966
+ update_prstate(cs, 0);
2967
+
2968
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
2969
+ is_sched_load_balance(cs))
20652970 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2971
+
2972
+ if (cs->use_parent_ecpus) {
2973
+ struct cpuset *parent = parent_cs(cs);
2974
+
2975
+ cs->use_parent_ecpus = false;
2976
+ parent->child_ecpus_count--;
2977
+ }
20662978
20672979 cpuset_dec();
20682980 clear_bit(CS_ONLINE, &cs->flags);
20692981
20702982 mutex_unlock(&cpuset_mutex);
2983
+ put_online_cpus();
20712984 }
20722985
20732986 static void cpuset_css_free(struct cgroup_subsys_state *css)
20742987 {
20752988 struct cpuset *cs = css_cs(css);
20762989
2077
- free_cpumask_var(cs->effective_cpus);
2078
- free_cpumask_var(cs->cpus_allowed);
2079
- free_cpumask_var(cs->cpus_requested);
2080
- kfree(cs);
2990
+ free_cpuset(cs);
20812991 }
20822992
20832993 static void cpuset_bind(struct cgroup_subsys_state *root_css)
20842994 {
20852995 mutex_lock(&cpuset_mutex);
2086
- raw_spin_lock_irq(&callback_lock);
2996
+ spin_lock_irq(&callback_lock);
20872997
20882998 if (is_in_v2_mode()) {
20892999 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
....@@ -2094,7 +3004,7 @@
20943004 top_cpuset.mems_allowed = top_cpuset.effective_mems;
20953005 }
20963006
2097
- raw_spin_unlock_irq(&callback_lock);
3007
+ spin_unlock_irq(&callback_lock);
20983008 mutex_unlock(&cpuset_mutex);
20993009 }
21003010
....@@ -2105,10 +3015,13 @@
21053015 */
21063016 static void cpuset_fork(struct task_struct *task)
21073017 {
3018
+ int inherit_cpus = 0;
21083019 if (task_css_is_root(task, cpuset_cgrp_id))
21093020 return;
21103021
2111
- set_cpus_allowed_ptr(task, current->cpus_ptr);
3022
+ trace_android_rvh_cpuset_fork(task, &inherit_cpus);
3023
+ if (!inherit_cpus)
3024
+ set_cpus_allowed_ptr(task, current->cpus_ptr);
21123025 task->mems_allowed = current->mems_allowed;
21133026 }
21143027
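cpuset_fork() now gives a vendor module the chance to veto the copy of current->cpus_ptr through the restricted vendor hook android_rvh_cpuset_fork. A hypothetical handler is sketched below; the header location, the register_trace_android_rvh_cpuset_fork() name and the (task, int *) prototype are inferred from the call site above and the usual Android vendor-hook pattern, not quoted from a header in this hunk.

/*
 * Hypothetical vendor-module probe.  Restricted vendor hooks cannot be
 * unregistered, so the module provides no exit path.
 */
#include <linux/module.h>
#include <linux/sched.h>
#include <trace/hooks/cgroup.h>	/* assumed location of the hook */

static void demo_cpuset_fork(void *unused, struct task_struct *task,
			     int *inherit_cpus)
{
	/* Non-zero tells cpuset_fork() to skip set_cpus_allowed_ptr(). */
	*inherit_cpus = 1;
}

static int __init demo_init(void)
{
	return register_trace_android_rvh_cpuset_fork(demo_cpuset_fork, NULL);
}
module_init(demo_init);
MODULE_LICENSE("GPL");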
....@@ -2123,22 +3036,23 @@
21233036 .post_attach = cpuset_post_attach,
21243037 .bind = cpuset_bind,
21253038 .fork = cpuset_fork,
2126
- .legacy_cftypes = files,
3039
+ .legacy_cftypes = legacy_files,
3040
+ .dfl_cftypes = dfl_files,
21273041 .early_init = true,
3042
+ .threaded = true,
21283043 };
21293044
21303045 /**
21313046 * cpuset_init - initialize cpusets at system boot
21323047 *
2133
- * Description: Initialize top_cpuset and the cpuset internal file system,
3048
+ * Description: Initialize top_cpuset
21343049 **/
21353050
21363051 int __init cpuset_init(void)
21373052 {
2138
- int err = 0;
2139
-
21403053 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
21413054 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
3055
+ BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
21423056 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL));
21433057
21443058 cpumask_setall(top_cpuset.cpus_allowed);
....@@ -2150,10 +3064,6 @@
21503064 fmeter_init(&top_cpuset.fmeter);
21513065 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
21523066 top_cpuset.relax_domain_level = -1;
2153
-
2154
- err = register_filesystem(&cpuset_fs_type);
2155
- if (err < 0)
2156
- return err;
21573067
21583068 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
21593069
....@@ -2194,12 +3104,12 @@
21943104 {
21953105 bool is_empty;
21963106
2197
- raw_spin_lock_irq(&callback_lock);
3107
+ spin_lock_irq(&callback_lock);
21983108 cpumask_copy(cs->cpus_allowed, new_cpus);
21993109 cpumask_copy(cs->effective_cpus, new_cpus);
22003110 cs->mems_allowed = *new_mems;
22013111 cs->effective_mems = *new_mems;
2202
- raw_spin_unlock_irq(&callback_lock);
3112
+ spin_unlock_irq(&callback_lock);
22033113
22043114 /*
22053115 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
....@@ -2236,10 +3146,10 @@
22363146 if (nodes_empty(*new_mems))
22373147 *new_mems = parent_cs(cs)->effective_mems;
22383148
2239
- raw_spin_lock_irq(&callback_lock);
3149
+ spin_lock_irq(&callback_lock);
22403150 cpumask_copy(cs->effective_cpus, new_cpus);
22413151 cs->effective_mems = *new_mems;
2242
- raw_spin_unlock_irq(&callback_lock);
3152
+ spin_unlock_irq(&callback_lock);
22433153
22443154 if (cpus_updated)
22453155 update_tasks_cpumask(cs);
....@@ -2247,20 +3157,29 @@
22473157 update_tasks_nodemask(cs);
22483158 }
22493159
3160
+static bool force_rebuild;
3161
+
3162
+void cpuset_force_rebuild(void)
3163
+{
3164
+ force_rebuild = true;
3165
+}
3166
+
22503167 /**
22513168 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
22523169 * @cs: cpuset in interest
3170
+ * @tmp: the tmpmasks structure pointer
22533171 *
22543172 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
22553173 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
22563174 * all its tasks are moved to the nearest ancestor with both resources.
22573175 */
2258
-static void cpuset_hotplug_update_tasks(struct cpuset *cs)
3176
+static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
22593177 {
22603178 static cpumask_t new_cpus;
22613179 static nodemask_t new_mems;
22623180 bool cpus_updated;
22633181 bool mems_updated;
3182
+ struct cpuset *parent;
22643183 retry:
22653184 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
22663185
....@@ -2275,9 +3194,64 @@
22753194 goto retry;
22763195 }
22773196
2278
- cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
2279
- nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
3197
+ parent = parent_cs(cs);
3198
+ compute_effective_cpumask(&new_cpus, cs, parent);
3199
+ nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
22803200
3201
+ if (cs->nr_subparts_cpus)
3202
+ /*
3203
+ * Make sure that CPUs allocated to child partitions
3204
+ * do not show up in effective_cpus.
3205
+ */
3206
+ cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
3207
+
3208
+ if (!tmp || !cs->partition_root_state)
3209
+ goto update_tasks;
3210
+
3211
+ /*
3212
+ * In the unlikely event that a partition root has empty
3213
+ * effective_cpus or its parent becomes erroneous, we have to
3214
+ * transition it to the erroneous state.
3215
+ */
3216
+ if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
3217
+ (parent->partition_root_state == PRS_ERROR))) {
3218
+ if (cs->nr_subparts_cpus) {
3219
+ spin_lock_irq(&callback_lock);
3220
+ cs->nr_subparts_cpus = 0;
3221
+ cpumask_clear(cs->subparts_cpus);
3222
+ spin_unlock_irq(&callback_lock);
3223
+ compute_effective_cpumask(&new_cpus, cs, parent);
3224
+ }
3225
+
3226
+ /*
3227
+ * If the effective_cpus is empty because the child
3228
+ * partitions take away all the CPUs, we can keep
3229
+ * the current partition and let the child partitions
3230
+ * fight for available CPUs.
3231
+ */
3232
+ if ((parent->partition_root_state == PRS_ERROR) ||
3233
+ cpumask_empty(&new_cpus)) {
3234
+ update_parent_subparts_cpumask(cs, partcmd_disable,
3235
+ NULL, tmp);
3236
+ spin_lock_irq(&callback_lock);
3237
+ cs->partition_root_state = PRS_ERROR;
3238
+ spin_unlock_irq(&callback_lock);
3239
+ }
3240
+ cpuset_force_rebuild();
3241
+ }
3242
+
3243
+ /*
3244
+ * On the other hand, an erroneous partition root may be transitioned
3245
+ * back to a regular one, or a partition root with no CPU allocated
3246
+ * from the parent may change to erroneous.
3247
+ */
3248
+ if (is_partition_root(parent) &&
3249
+ ((cs->partition_root_state == PRS_ERROR) ||
3250
+ !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
3251
+ update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
3252
+ cpuset_force_rebuild();
3253
+
3254
+update_tasks:
22813255 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
22823256 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
22833257
....@@ -2289,13 +3263,6 @@
22893263 cpus_updated, mems_updated);
22903264
22913265 mutex_unlock(&cpuset_mutex);
2292
-}
2293
-
2294
-static bool force_rebuild;
2295
-
2296
-void cpuset_force_rebuild(void)
2297
-{
2298
- force_rebuild = true;
22993266 }
23003267
23013268 /**
....@@ -2314,12 +3281,16 @@
23143281 * Note that CPU offlining during suspend is ignored. We don't modify
23153282 * cpusets across suspend/resume cycles at all.
23163283 */
2317
-static void cpuset_hotplug_workfn(struct work_struct *work)
3284
+void cpuset_hotplug_workfn(struct work_struct *work)
23183285 {
23193286 static cpumask_t new_cpus;
23203287 static nodemask_t new_mems;
23213288 bool cpus_updated, mems_updated;
23223289 bool on_dfl = is_in_v2_mode();
3290
+ struct tmpmasks tmp, *ptmp = NULL;
3291
+
3292
+ if (on_dfl && !alloc_cpumasks(NULL, &tmp))
3293
+ ptmp = &tmp;
23233294
23243295 mutex_lock(&cpuset_mutex);
23253296
....@@ -2327,26 +3298,54 @@
23273298 cpumask_copy(&new_cpus, cpu_active_mask);
23283299 new_mems = node_states[N_MEMORY];
23293300
3301
+ /*
3302
+ * If subparts_cpus is populated, it is likely that the check below
3303
+ * will produce a false positive on cpus_updated when the cpu list
3304
+ * isn't changed. It is extra work, but it is better to be safe.
3305
+ */
23303306 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
23313307 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
23323308
3309
+ /*
3310
+ * In the rare case that hotplug removes all the cpus in subparts_cpus,
3311
+ * we assumed that cpus are updated.
3312
+ */
3313
+ if (!cpus_updated && top_cpuset.nr_subparts_cpus)
3314
+ cpus_updated = true;
3315
+
23333316 /* synchronize cpus_allowed to cpu_active_mask */
23343317 if (cpus_updated) {
2335
- raw_spin_lock_irq(&callback_lock);
3318
+ spin_lock_irq(&callback_lock);
23363319 if (!on_dfl)
23373320 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
3321
+ /*
3322
+ * Make sure that CPUs allocated to child partitions
3323
+ * do not show up in effective_cpus. If no CPU is left,
3324
+ * we clear the subparts_cpus & let the child partitions
3325
+ * fight for the CPUs again.
3326
+ */
3327
+ if (top_cpuset.nr_subparts_cpus) {
3328
+ if (cpumask_subset(&new_cpus,
3329
+ top_cpuset.subparts_cpus)) {
3330
+ top_cpuset.nr_subparts_cpus = 0;
3331
+ cpumask_clear(top_cpuset.subparts_cpus);
3332
+ } else {
3333
+ cpumask_andnot(&new_cpus, &new_cpus,
3334
+ top_cpuset.subparts_cpus);
3335
+ }
3336
+ }
23383337 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2339
- raw_spin_unlock_irq(&callback_lock);
3338
+ spin_unlock_irq(&callback_lock);
23403339 /* we don't mess with cpumasks of tasks in top_cpuset */
23413340 }
23423341
23433342 /* synchronize mems_allowed to N_MEMORY */
23443343 if (mems_updated) {
2345
- raw_spin_lock_irq(&callback_lock);
3344
+ spin_lock_irq(&callback_lock);
23463345 if (!on_dfl)
23473346 top_cpuset.mems_allowed = new_mems;
23483347 top_cpuset.effective_mems = new_mems;
2349
- raw_spin_unlock_irq(&callback_lock);
3348
+ spin_unlock_irq(&callback_lock);
23503349 update_tasks_nodemask(&top_cpuset);
23513350 }
23523351
....@@ -2363,7 +3362,7 @@
23633362 continue;
23643363 rcu_read_unlock();
23653364
2366
- cpuset_hotplug_update_tasks(cs);
3365
+ cpuset_hotplug_update_tasks(cs, ptmp);
23673366
23683367 rcu_read_lock();
23693368 css_put(&cs->css);
....@@ -2376,6 +3375,8 @@
23763375 force_rebuild = false;
23773376 rebuild_sched_domains();
23783377 }
3378
+
3379
+ free_cpumasks(NULL, ptmp);
23793380 }
23803381
23813382 void cpuset_update_active_cpus(void)
....@@ -2386,6 +3387,11 @@
23863387 * to a work item to avoid reverse locking order.
23873388 */
23883389 schedule_work(&cpuset_hotplug_work);
3390
+}
3391
+
3392
+void cpuset_update_active_cpus_affine(int cpu)
3393
+{
3394
+ schedule_work_on(cpu, &cpuset_hotplug_work);
23893395 }
23903396
23913397 void cpuset_wait_for_hotplug(void)
....@@ -2417,8 +3423,11 @@
24173423 */
24183424 void __init cpuset_init_smp(void)
24193425 {
2420
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2421
- top_cpuset.mems_allowed = node_states[N_MEMORY];
3426
+ /*
3427
+ * cpus_allowed/mems_allowed set to v2 values in the initial
3428
+ * cpuset_bind() call will be reset to v1 values in another
3429
+ * cpuset_bind() call when v1 cpuset is mounted.
3430
+ */
24223431 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
24233432
24243433 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
....@@ -2445,13 +3454,13 @@
24453454 {
24463455 unsigned long flags;
24473456
2448
- raw_spin_lock_irqsave(&callback_lock, flags);
3457
+ spin_lock_irqsave(&callback_lock, flags);
24493458 rcu_read_lock();
2450
- guarantee_online_cpus(task_cs(tsk), pmask);
3459
+ guarantee_online_cpus(tsk, pmask);
24513460 rcu_read_unlock();
2452
- raw_spin_unlock_irqrestore(&callback_lock, flags);
3461
+ spin_unlock_irqrestore(&callback_lock, flags);
24533462 }
2454
-
3463
+EXPORT_SYMBOL_GPL(cpuset_cpus_allowed);
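The EXPORT_SYMBOL_GPL() added above makes cpuset_cpus_allowed() (declared in <linux/cpuset.h>) callable from GPL modules. A minimal, hypothetical caller:

/*
 * Hypothetical GPL-module snippet, sketch only: query the cpuset-permitted
 * CPUs of the current task and print them.
 */
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/sched.h>

static int __init cpus_allowed_demo_init(void)
{
	cpumask_var_t mask;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpuset_cpus_allowed(current, mask);
	pr_info("cpuset allows CPUs %*pbl for pid %d\n",
		cpumask_pr_args(mask), current->pid);

	free_cpumask_var(mask);
	return 0;
}
module_init(cpus_allowed_demo_init);
MODULE_LICENSE("GPL");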
24553464 /**
24563465 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
24573466 * @tsk: pointer to task_struct with which the scheduler is struggling
....@@ -2466,9 +3475,17 @@
24663475
24673476 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
24683477 {
3478
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
3479
+ const struct cpumask *cs_mask;
3480
+
24693481 rcu_read_lock();
2470
- do_set_cpus_allowed(tsk, is_in_v2_mode() ?
2471
- task_cs(tsk)->cpus_allowed : cpu_possible_mask);
3482
+ cs_mask = task_cs(tsk)->cpus_allowed;
3483
+
3484
+ if (!is_in_v2_mode() || !cpumask_subset(cs_mask, possible_mask))
3485
+ goto unlock; /* select_fallback_rq will try harder */
3486
+
3487
+ do_set_cpus_allowed(tsk, cs_mask);
3488
+unlock:
24723489 rcu_read_unlock();
24733490
24743491 /*
....@@ -2510,11 +3527,11 @@
25103527 nodemask_t mask;
25113528 unsigned long flags;
25123529
2513
- raw_spin_lock_irqsave(&callback_lock, flags);
3530
+ spin_lock_irqsave(&callback_lock, flags);
25143531 rcu_read_lock();
25153532 guarantee_online_mems(task_cs(tsk), &mask);
25163533 rcu_read_unlock();
2517
- raw_spin_unlock_irqrestore(&callback_lock, flags);
3534
+ spin_unlock_irqrestore(&callback_lock, flags);
25183535
25193536 return mask;
25203537 }
....@@ -2606,14 +3623,14 @@
26063623 return true;
26073624
26083625 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2609
- raw_spin_lock_irqsave(&callback_lock, flags);
3626
+ spin_lock_irqsave(&callback_lock, flags);
26103627
26113628 rcu_read_lock();
26123629 cs = nearest_hardwall_ancestor(task_cs(current));
26133630 allowed = node_isset(node, cs->mems_allowed);
26143631 rcu_read_unlock();
26153632
2616
- raw_spin_unlock_irqrestore(&callback_lock, flags);
3633
+ spin_unlock_irqrestore(&callback_lock, flags);
26173634 return allowed;
26183635 }
26193636
....@@ -2699,9 +3716,9 @@
26993716 rcu_read_lock();
27003717
27013718 cgrp = task_cs(current)->css.cgroup;
2702
- pr_info("%s cpuset=", current->comm);
3719
+ pr_cont(",cpuset=");
27033720 pr_cont_cgroup_name(cgrp);
2704
- pr_cont(" mems_allowed=%*pbl\n",
3721
+ pr_cont(",mems_allowed=%*pbl",
27053722 nodemask_pr_args(&current->mems_allowed));
27063723
27073724 rcu_read_unlock();