.. | .. |
---|
33 | 33 | #include <linux/interrupt.h> |
---|
34 | 34 | #include <linux/kernel.h> |
---|
35 | 35 | #include <linux/kmod.h> |
---|
| 36 | +#include <linux/kthread.h> |
---|
36 | 37 | #include <linux/list.h> |
---|
37 | 38 | #include <linux/mempolicy.h> |
---|
38 | 39 | #include <linux/mm.h> |
---|
39 | 40 | #include <linux/memory.h> |
---|
40 | 41 | #include <linux/export.h> |
---|
41 | 42 | #include <linux/mount.h> |
---|
| 43 | +#include <linux/fs_context.h> |
---|
42 | 44 | #include <linux/namei.h> |
---|
43 | 45 | #include <linux/pagemap.h> |
---|
44 | 46 | #include <linux/proc_fs.h> |
---|
45 | 47 | #include <linux/rcupdate.h> |
---|
46 | 48 | #include <linux/sched.h> |
---|
| 49 | +#include <linux/sched/deadline.h> |
---|
47 | 50 | #include <linux/sched/mm.h> |
---|
48 | 51 | #include <linux/sched/task.h> |
---|
49 | 52 | #include <linux/seq_file.h> |
---|
.. | .. |
---|
63 | 66 | #include <linux/mutex.h> |
---|
64 | 67 | #include <linux/cgroup.h> |
---|
65 | 68 | #include <linux/wait.h> |
---|
| 69 | + |
---|
| 70 | +#include <trace/hooks/sched.h> |
---|
| 71 | +#include <trace/hooks/cgroup.h> |
---|
66 | 72 | |
---|
67 | 73 | DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); |
---|
68 | 74 | DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); |
---|
.. | .. |
---|
111 | 117 | nodemask_t effective_mems; |
---|
112 | 118 | |
---|
113 | 119 | /* |
---|
| 120 | + * CPUs allocated to child sub-partitions (default hierarchy only) |
---|
| 121 | + * - CPUs granted by the parent = effective_cpus U subparts_cpus |
---|
| 122 | + * - effective_cpus and subparts_cpus are mutually exclusive. |
---|
| 123 | + * |
---|
| 124 | + * effective_cpus contains only onlined CPUs, but subparts_cpus |
---|
| 125 | + * may have offlined ones. |
---|
| 126 | + */ |
---|
| 127 | + cpumask_var_t subparts_cpus; |
---|
| 128 | + |
---|
| 129 | + /* |
---|
114 | 130 | * This is old Memory Nodes tasks took on. |
---|
115 | 131 | * |
---|
116 | 132 | * - top_cpuset.old_mems_allowed is initialized to mems_allowed. |
---|
.. | .. |
---|
135 | 151 | |
---|
136 | 152 | /* for custom sched domain */ |
---|
137 | 153 | int relax_domain_level; |
---|
| 154 | + |
---|
| 155 | + /* number of CPUs in subparts_cpus */ |
---|
| 156 | + int nr_subparts_cpus; |
---|
| 157 | + |
---|
| 158 | + /* partition root state */ |
---|
| 159 | + int partition_root_state; |
---|
| 160 | + |
---|
| 161 | + /* |
---|
| 162 | + * Default hierarchy only: |
---|
| 163 | + * use_parent_ecpus - set if using parent's effective_cpus |
---|
| 164 | + * child_ecpus_count - # of children with use_parent_ecpus set |
---|
| 165 | + */ |
---|
| 166 | + int use_parent_ecpus; |
---|
| 167 | + int child_ecpus_count; |
---|
| 168 | +}; |
---|
| 169 | + |
---|
| 170 | +/* |
---|
| 171 | + * Partition root states: |
---|
| 172 | + * |
---|
| 173 | + * 0 - not a partition root |
---|
| 174 | + * |
---|
| 175 | + * 1 - partition root |
---|
| 176 | + * |
---|
| 177 | + * -1 - invalid partition root |
---|
| 178 | + * None of the cpus in cpus_allowed can be put into the parent's |
---|
| 179 | + * subparts_cpus. In this case, the cpuset is not a real partition |
---|
| 180 | + * root anymore. However, the CPU_EXCLUSIVE bit will still be set |
---|
| 181 | + * and the cpuset can be restored back to a partition root if the |
---|
| 182 | + * parent cpuset can give more CPUs back to this child cpuset. |
---|
| 183 | + */ |
---|
| 184 | +#define PRS_DISABLED 0 |
---|
| 185 | +#define PRS_ENABLED 1 |
---|
| 186 | +#define PRS_ERROR -1 |
---|
| 187 | + |
---|
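As an aside for readers of this diff: on the cgroup v2 (default) hierarchy these three states are what the `cpuset.cpus.partition` control file conventionally reports. The minimal sketch below shows that mapping; `prs_name()` is a hypothetical helper used only for illustration (not a function added by this patch), and the user-visible strings are assumed to be the usual "member"/"root"/"root invalid" ones — the actual file handlers are outside this hunk.

```c
/* Illustration only, not part of the patch: map PRS_* to the strings
 * the cpuset.cpus.partition file conventionally shows on cgroup v2.
 */
static const char *prs_name(int prs)
{
	switch (prs) {
	case PRS_ENABLED:
		return "root";		/* a valid partition root */
	case PRS_ERROR:
		return "root invalid";	/* parent could not grant the CPUs */
	case PRS_DISABLED:
	default:
		return "member";	/* an ordinary (non-root) cpuset */
	}
}
```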
| 188 | +/* |
---|
| 189 | + * Temporary cpumasks for working with partitions that are passed among |
---|
| 190 | + * functions to avoid memory allocation in inner functions. |
---|
| 191 | + */ |
---|
| 192 | +struct tmpmasks { |
---|
| 193 | + cpumask_var_t addmask, delmask; /* For partition root */ |
---|
| 194 | + cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ |
---|
138 | 195 | }; |
---|
139 | 196 | |
---|
140 | 197 | static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) |
---|
.. | .. |
---|
152 | 209 | { |
---|
153 | 210 | return css_cs(cs->css.parent); |
---|
154 | 211 | } |
---|
155 | | - |
---|
156 | | -#ifdef CONFIG_NUMA |
---|
157 | | -static inline bool task_has_mempolicy(struct task_struct *task) |
---|
158 | | -{ |
---|
159 | | - return task->mempolicy; |
---|
160 | | -} |
---|
161 | | -#else |
---|
162 | | -static inline bool task_has_mempolicy(struct task_struct *task) |
---|
163 | | -{ |
---|
164 | | - return false; |
---|
165 | | -} |
---|
166 | | -#endif |
---|
167 | | - |
---|
168 | 212 | |
---|
169 | 213 | /* bits in struct cpuset flags field */ |
---|
170 | 214 | typedef enum { |
---|
.. | .. |
---|
219 | 263 | return test_bit(CS_SPREAD_SLAB, &cs->flags); |
---|
220 | 264 | } |
---|
221 | 265 | |
---|
| 266 | +static inline int is_partition_root(const struct cpuset *cs) |
---|
| 267 | +{ |
---|
| 268 | + return cs->partition_root_state > 0; |
---|
| 269 | +} |
---|
| 270 | + |
---|
222 | 271 | static struct cpuset top_cpuset = { |
---|
223 | 272 | .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | |
---|
224 | 273 | (1 << CS_MEM_EXCLUSIVE)), |
---|
| 274 | + .partition_root_state = PRS_ENABLED, |
---|
225 | 275 | }; |
---|
226 | 276 | |
---|
227 | 277 | /** |
---|
.. | .. |
---|
289 | 339 | */ |
---|
290 | 340 | |
---|
291 | 341 | static DEFINE_MUTEX(cpuset_mutex); |
---|
292 | | -static DEFINE_SPINLOCK(callback_lock); |
---|
| 342 | +static DEFINE_RAW_SPINLOCK(callback_lock); |
---|
293 | 343 | |
---|
294 | 344 | static struct workqueue_struct *cpuset_migrate_mm_wq; |
---|
295 | 345 | |
---|
296 | 346 | /* |
---|
297 | | - * CPU / memory hotplug is handled asynchronously. |
---|
| 347 | + * CPU / memory hotplug is handled asynchronously |
---|
| 348 | + * for hotplug, synchronously for resume_cpus |
---|
298 | 349 | */ |
---|
299 | | -static void cpuset_hotplug_workfn(struct work_struct *work); |
---|
300 | 350 | static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); |
---|
301 | 351 | |
---|
302 | 352 | static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); |
---|
303 | 353 | |
---|
304 | 354 | /* |
---|
305 | | - * Cgroup v2 behavior is used when on default hierarchy or the |
---|
306 | | - * cgroup_v2_mode flag is set. |
---|
| 355 | + * Cgroup v2 behavior is used on the "cpus" and "mems" control files when |
---|
| 356 | + * on default hierarchy or when the cpuset_v2_mode flag is set by mounting |
---|
| 357 | + * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. |
---|
| 358 | + * With v2 behavior, "cpus" and "mems" are always what the users have |
---|
| 359 | + * requested and won't be changed by hotplug events. Only the effective |
---|
| 360 | + * cpus or mems will be affected. |
---|
307 | 361 | */ |
---|
308 | 362 | static inline bool is_in_v2_mode(void) |
---|
309 | 363 | { |
---|
.. | .. |
---|
312 | 366 | } |
---|
313 | 367 | |
---|
314 | 368 | /* |
---|
315 | | - * This is ugly, but preserves the userspace API for existing cpuset |
---|
316 | | - * users. If someone tries to mount the "cpuset" filesystem, we |
---|
317 | | - * silently switch it to mount "cgroup" instead |
---|
318 | | - */ |
---|
319 | | -static struct dentry *cpuset_mount(struct file_system_type *fs_type, |
---|
320 | | - int flags, const char *unused_dev_name, void *data) |
---|
321 | | -{ |
---|
322 | | - struct file_system_type *cgroup_fs = get_fs_type("cgroup"); |
---|
323 | | - struct dentry *ret = ERR_PTR(-ENODEV); |
---|
324 | | - if (cgroup_fs) { |
---|
325 | | - char mountopts[] = |
---|
326 | | - "cpuset,noprefix," |
---|
327 | | - "release_agent=/sbin/cpuset_release_agent"; |
---|
328 | | - ret = cgroup_fs->mount(cgroup_fs, flags, |
---|
329 | | - unused_dev_name, mountopts); |
---|
330 | | - put_filesystem(cgroup_fs); |
---|
331 | | - } |
---|
332 | | - return ret; |
---|
333 | | -} |
---|
334 | | - |
---|
335 | | -static struct file_system_type cpuset_fs_type = { |
---|
336 | | - .name = "cpuset", |
---|
337 | | - .mount = cpuset_mount, |
---|
338 | | -}; |
---|
339 | | - |
---|
340 | | -/* |
---|
341 | | - * Return in pmask the portion of a cpusets's cpus_allowed that |
---|
342 | | - * are online. If none are online, walk up the cpuset hierarchy |
---|
343 | | - * until we find one that does have some online cpus. |
---|
| 369 | + * Return in pmask the portion of a task's cpuset's cpus_allowed that |
---|
| 370 | + * are online and are capable of running the task. If none are found, |
---|
| 371 | + * walk up the cpuset hierarchy until we find one that does have some |
---|
| 372 | + * appropriate cpus. |
---|
344 | 373 | * |
---|
345 | 374 | * One way or another, we guarantee to return some non-empty subset |
---|
346 | | - * of cpu_online_mask. |
---|
| 375 | + * of cpu_active_mask. |
---|
347 | 376 | * |
---|
348 | 377 | * Call with callback_lock or cpuset_mutex held. |
---|
349 | 378 | */ |
---|
350 | | -static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) |
---|
| 379 | +static void guarantee_online_cpus(struct task_struct *tsk, |
---|
| 380 | + struct cpumask *pmask) |
---|
351 | 381 | { |
---|
352 | | - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { |
---|
| 382 | + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); |
---|
| 383 | + struct cpuset *cs; |
---|
| 384 | + |
---|
| 385 | + if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask))) |
---|
| 386 | + cpumask_copy(pmask, cpu_active_mask); |
---|
| 387 | + |
---|
| 388 | + rcu_read_lock(); |
---|
| 389 | + cs = task_cs(tsk); |
---|
| 390 | + |
---|
| 391 | + while (!cpumask_intersects(cs->effective_cpus, pmask)) { |
---|
353 | 392 | cs = parent_cs(cs); |
---|
354 | 393 | if (unlikely(!cs)) { |
---|
355 | 394 | /* |
---|
356 | 395 | * The top cpuset doesn't have any online cpu as a |
---|
357 | 396 | * consequence of a race between cpuset_hotplug_work |
---|
358 | 397 | * and cpu hotplug notifier. But we know the top |
---|
359 | | - * cpuset's effective_cpus is on its way to to be |
---|
| 398 | + * cpuset's effective_cpus is on its way to be |
---|
360 | 399 | * identical to cpu_online_mask. |
---|
361 | 400 | */ |
---|
362 | | - cpumask_copy(pmask, cpu_online_mask); |
---|
363 | | - return; |
---|
| 401 | + goto out_unlock; |
---|
364 | 402 | } |
---|
365 | 403 | } |
---|
366 | | - cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); |
---|
| 404 | + cpumask_and(pmask, pmask, cs->effective_cpus); |
---|
| 405 | + |
---|
| 406 | +out_unlock: |
---|
| 407 | + rcu_read_unlock(); |
---|
367 | 408 | } |
---|
368 | 409 | |
---|
369 | 410 | /* |
---|
.. | .. |
---|
420 | 461 | } |
---|
421 | 462 | |
---|
422 | 463 | /** |
---|
| 464 | + * alloc_cpumasks - allocate the cpumasks of a cpuset or a tmpmasks struct |
---|
| 465 | + * @cs: the cpuset that has cpumasks to be allocated. |
---|
| 466 | + * @tmp: the tmpmasks structure pointer |
---|
| 467 | + * Return: 0 if successful, -ENOMEM otherwise. |
---|
| 468 | + * |
---|
| 469 | + * Only one of the two input arguments should be non-NULL. |
---|
| 470 | + */ |
---|
| 471 | +static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) |
---|
| 472 | +{ |
---|
| 473 | + cpumask_var_t *pmask1, *pmask2, *pmask3; |
---|
| 474 | + |
---|
| 475 | + if (cs) { |
---|
| 476 | + pmask1 = &cs->cpus_allowed; |
---|
| 477 | + pmask2 = &cs->effective_cpus; |
---|
| 478 | + pmask3 = &cs->subparts_cpus; |
---|
| 479 | + } else { |
---|
| 480 | + pmask1 = &tmp->new_cpus; |
---|
| 481 | + pmask2 = &tmp->addmask; |
---|
| 482 | + pmask3 = &tmp->delmask; |
---|
| 483 | + } |
---|
| 484 | + |
---|
| 485 | + if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) |
---|
| 486 | + return -ENOMEM; |
---|
| 487 | + |
---|
| 488 | + if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) |
---|
| 489 | + goto free_one; |
---|
| 490 | + |
---|
| 491 | + if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) |
---|
| 492 | + goto free_two; |
---|
| 493 | + |
---|
| 494 | + if (cs && !zalloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) |
---|
| 495 | + goto free_three; |
---|
| 496 | + |
---|
| 497 | + return 0; |
---|
| 498 | + |
---|
| 499 | +free_three: |
---|
| 500 | + free_cpumask_var(*pmask3); |
---|
| 501 | +free_two: |
---|
| 502 | + free_cpumask_var(*pmask2); |
---|
| 503 | +free_one: |
---|
| 504 | + free_cpumask_var(*pmask1); |
---|
| 505 | + return -ENOMEM; |
---|
| 506 | +} |
---|
| 507 | + |
---|
| 508 | +/** |
---|
| 509 | + * free_cpumasks - free cpumasks of a cpuset or a tmpmasks structure |
---|
| 510 | + * @cs: the cpuset that has cpumasks to be freed. |
---|
| 511 | + * @tmp: the tmpmasks structure pointer |
---|
| 512 | + */ |
---|
| 513 | +static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) |
---|
| 514 | +{ |
---|
| 515 | + if (cs) { |
---|
| 516 | + free_cpumask_var(cs->cpus_allowed); |
---|
| 517 | + free_cpumask_var(cs->cpus_requested); |
---|
| 518 | + free_cpumask_var(cs->effective_cpus); |
---|
| 519 | + free_cpumask_var(cs->subparts_cpus); |
---|
| 520 | + } |
---|
| 521 | + if (tmp) { |
---|
| 522 | + free_cpumask_var(tmp->new_cpus); |
---|
| 523 | + free_cpumask_var(tmp->addmask); |
---|
| 524 | + free_cpumask_var(tmp->delmask); |
---|
| 525 | + } |
---|
| 526 | +} |
---|
| 527 | + |
---|
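Since alloc_cpumasks()/free_cpumasks() do double duty for cpusets and for on-stack tmpmasks scratch space, the expected calling pattern for the scratch case is worth spelling out. The sketch below is illustrative only (`do_something_with()` is a placeholder, not code from this patch); it mirrors how update_prstate() later in this diff allocates a tmpmasks, uses it, and frees it.

```c
/* Illustration only: typical lifetime of a struct tmpmasks scratch area.
 * Passing a NULL cpuset means only the three temporary masks are touched.
 */
static int example_tmpmasks_user(void)
{
	struct tmpmasks tmp;
	int err;

	if (alloc_cpumasks(NULL, &tmp))	/* allocates new_cpus, addmask, delmask */
		return -ENOMEM;

	err = do_something_with(&tmp);	/* placeholder for the real work */

	free_cpumasks(NULL, &tmp);	/* NULL cpuset: free only the tmpmasks */
	return err;
}
```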
| 528 | +/** |
---|
423 | 529 | * alloc_trial_cpuset - allocate a trial cpuset |
---|
424 | 530 | * @cs: the cpuset that the trial cpuset duplicates |
---|
425 | 531 | */ |
---|
.. | .. |
---|
431 | 537 | if (!trial) |
---|
432 | 538 | return NULL; |
---|
433 | 539 | |
---|
434 | | - if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) |
---|
435 | | - goto free_cs; |
---|
436 | | - if (!alloc_cpumask_var(&trial->cpus_requested, GFP_KERNEL)) |
---|
437 | | - goto free_allowed; |
---|
438 | | - if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL)) |
---|
439 | | - goto free_cpus; |
---|
| 540 | + if (alloc_cpumasks(trial, NULL)) { |
---|
| 541 | + kfree(trial); |
---|
| 542 | + return NULL; |
---|
| 543 | + } |
---|
440 | 544 | |
---|
441 | 545 | cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); |
---|
442 | 546 | cpumask_copy(trial->cpus_requested, cs->cpus_requested); |
---|
443 | 547 | cpumask_copy(trial->effective_cpus, cs->effective_cpus); |
---|
444 | 548 | return trial; |
---|
445 | | - |
---|
446 | | -free_cpus: |
---|
447 | | - free_cpumask_var(trial->cpus_requested); |
---|
448 | | -free_allowed: |
---|
449 | | - free_cpumask_var(trial->cpus_allowed); |
---|
450 | | -free_cs: |
---|
451 | | - kfree(trial); |
---|
452 | | - return NULL; |
---|
453 | 549 | } |
---|
454 | 550 | |
---|
455 | 551 | /** |
---|
456 | | - * free_trial_cpuset - free the trial cpuset |
---|
457 | | - * @trial: the trial cpuset to be freed |
---|
| 552 | + * free_cpuset - free the cpuset |
---|
| 553 | + * @cs: the cpuset to be freed |
---|
458 | 554 | */ |
---|
459 | | -static void free_trial_cpuset(struct cpuset *trial) |
---|
| 555 | +static inline void free_cpuset(struct cpuset *cs) |
---|
460 | 556 | { |
---|
461 | | - free_cpumask_var(trial->effective_cpus); |
---|
462 | | - free_cpumask_var(trial->cpus_requested); |
---|
463 | | - free_cpumask_var(trial->cpus_allowed); |
---|
464 | | - kfree(trial); |
---|
| 557 | + free_cpumasks(cs, NULL); |
---|
| 558 | + kfree(cs); |
---|
465 | 559 | } |
---|
466 | 560 | |
---|
467 | 561 | /* |
---|
.. | .. |
---|
612 | 706 | * load balancing domains (sched domains) as specified by that partial |
---|
613 | 707 | * partition. |
---|
614 | 708 | * |
---|
615 | | - * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt |
---|
| 709 | + * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst |
---|
616 | 710 | * for a background explanation of this. |
---|
617 | 711 | * |
---|
618 | 712 | * Does not return errors, on the theory that the callers of this |
---|
.. | .. |
---|
623 | 717 | * Must be called with cpuset_mutex held. |
---|
624 | 718 | * |
---|
625 | 719 | * The three key local variables below are: |
---|
626 | | - * q - a linked-list queue of cpuset pointers, used to implement a |
---|
627 | | - * top-down scan of all cpusets. This scan loads a pointer |
---|
628 | | - * to each cpuset marked is_sched_load_balance into the |
---|
629 | | - * array 'csa'. For our purposes, rebuilding the schedulers |
---|
630 | | - * sched domains, we can ignore !is_sched_load_balance cpusets. |
---|
| 720 | + * cp - cpuset pointer, used (together with pos_css) to perform a |
---|
| 721 | + * top-down scan of all cpusets. For our purposes, rebuilding |
---|
| 722 | + * the scheduler's sched domains, we can ignore !is_sched_load_ |
---|
| 723 | + * balance cpusets. |
---|
631 | 724 | * csa - (for CpuSet Array) Array of pointers to all the cpusets |
---|
632 | 725 | * that need to be load balanced, for convenient iterative |
---|
633 | 726 | * access by the subsequent code that finds the best partition, |
---|
.. | .. |
---|
658 | 751 | static int generate_sched_domains(cpumask_var_t **domains, |
---|
659 | 752 | struct sched_domain_attr **attributes) |
---|
660 | 753 | { |
---|
661 | | - struct cpuset *cp; /* scans q */ |
---|
| 754 | + struct cpuset *cp; /* top-down scan of cpusets */ |
---|
662 | 755 | struct cpuset **csa; /* array of all cpuset ptrs */ |
---|
663 | 756 | int csn; /* how many cpuset ptrs in csa so far */ |
---|
664 | 757 | int i, j, k; /* indices for partition finding loops */ |
---|
.. | .. |
---|
667 | 760 | int ndoms = 0; /* number of sched domains in result */ |
---|
668 | 761 | int nslot; /* next empty doms[] struct cpumask slot */ |
---|
669 | 762 | struct cgroup_subsys_state *pos_css; |
---|
| 763 | + bool root_load_balance = is_sched_load_balance(&top_cpuset); |
---|
670 | 764 | |
---|
671 | 765 | doms = NULL; |
---|
672 | 766 | dattr = NULL; |
---|
673 | 767 | csa = NULL; |
---|
674 | 768 | |
---|
675 | 769 | /* Special case for the 99% of systems with one, full, sched domain */ |
---|
676 | | - if (is_sched_load_balance(&top_cpuset)) { |
---|
| 770 | + if (root_load_balance && !top_cpuset.nr_subparts_cpus) { |
---|
677 | 771 | ndoms = 1; |
---|
678 | 772 | doms = alloc_sched_domains(ndoms); |
---|
679 | 773 | if (!doms) |
---|
.. | .. |
---|
696 | 790 | csn = 0; |
---|
697 | 791 | |
---|
698 | 792 | rcu_read_lock(); |
---|
| 793 | + if (root_load_balance) |
---|
| 794 | + csa[csn++] = &top_cpuset; |
---|
699 | 795 | cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { |
---|
700 | 796 | if (cp == &top_cpuset) |
---|
701 | 797 | continue; |
---|
.. | .. |
---|
706 | 802 | * parent's cpus, so just skip them, and then we call |
---|
707 | 803 | * update_domain_attr_tree() to calc relax_domain_level of |
---|
708 | 804 | * the corresponding sched domain. |
---|
| 805 | + * |
---|
| 806 | + * If root is load-balancing, we can skip @cp if it |
---|
| 807 | + * is a subset of the root's effective_cpus. |
---|
709 | 808 | */ |
---|
710 | 809 | if (!cpumask_empty(cp->cpus_allowed) && |
---|
711 | 810 | !(is_sched_load_balance(cp) && |
---|
.. | .. |
---|
713 | 812 | housekeeping_cpumask(HK_FLAG_DOMAIN)))) |
---|
714 | 813 | continue; |
---|
715 | 814 | |
---|
716 | | - if (is_sched_load_balance(cp)) |
---|
| 815 | + if (root_load_balance && |
---|
| 816 | + cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) |
---|
| 817 | + continue; |
---|
| 818 | + |
---|
| 819 | + if (is_sched_load_balance(cp) && |
---|
| 820 | + !cpumask_empty(cp->effective_cpus)) |
---|
717 | 821 | csa[csn++] = cp; |
---|
718 | 822 | |
---|
719 | | - /* skip @cp's subtree */ |
---|
720 | | - pos_css = css_rightmost_descendant(pos_css); |
---|
| 823 | + /* skip @cp's subtree if not a partition root */ |
---|
| 824 | + if (!is_partition_root(cp)) |
---|
| 825 | + pos_css = css_rightmost_descendant(pos_css); |
---|
721 | 826 | } |
---|
722 | 827 | rcu_read_unlock(); |
---|
723 | 828 | |
---|
.. | .. |
---|
820 | 925 | return ndoms; |
---|
821 | 926 | } |
---|
822 | 927 | |
---|
| 928 | +static void update_tasks_root_domain(struct cpuset *cs) |
---|
| 929 | +{ |
---|
| 930 | + struct css_task_iter it; |
---|
| 931 | + struct task_struct *task; |
---|
| 932 | + |
---|
| 933 | + css_task_iter_start(&cs->css, 0, &it); |
---|
| 934 | + |
---|
| 935 | + while ((task = css_task_iter_next(&it))) |
---|
| 936 | + dl_add_task_root_domain(task); |
---|
| 937 | + |
---|
| 938 | + css_task_iter_end(&it); |
---|
| 939 | +} |
---|
| 940 | + |
---|
| 941 | +static void rebuild_root_domains(void) |
---|
| 942 | +{ |
---|
| 943 | + struct cpuset *cs = NULL; |
---|
| 944 | + struct cgroup_subsys_state *pos_css; |
---|
| 945 | + |
---|
| 946 | + lockdep_assert_held(&cpuset_mutex); |
---|
| 947 | + lockdep_assert_cpus_held(); |
---|
| 948 | + lockdep_assert_held(&sched_domains_mutex); |
---|
| 949 | + |
---|
| 950 | + rcu_read_lock(); |
---|
| 951 | + |
---|
| 952 | + /* |
---|
| 953 | + * Clear default root domain DL accounting, it will be computed again |
---|
| 954 | + * if a task belongs to it. |
---|
| 955 | + */ |
---|
| 956 | + dl_clear_root_domain(&def_root_domain); |
---|
| 957 | + |
---|
| 958 | + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { |
---|
| 959 | + |
---|
| 960 | + if (cpumask_empty(cs->effective_cpus)) { |
---|
| 961 | + pos_css = css_rightmost_descendant(pos_css); |
---|
| 962 | + continue; |
---|
| 963 | + } |
---|
| 964 | + |
---|
| 965 | + css_get(&cs->css); |
---|
| 966 | + |
---|
| 967 | + rcu_read_unlock(); |
---|
| 968 | + |
---|
| 969 | + update_tasks_root_domain(cs); |
---|
| 970 | + |
---|
| 971 | + rcu_read_lock(); |
---|
| 972 | + css_put(&cs->css); |
---|
| 973 | + } |
---|
| 974 | + rcu_read_unlock(); |
---|
| 975 | +} |
---|
| 976 | + |
---|
| 977 | +static void |
---|
| 978 | +partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], |
---|
| 979 | + struct sched_domain_attr *dattr_new) |
---|
| 980 | +{ |
---|
| 981 | + mutex_lock(&sched_domains_mutex); |
---|
| 982 | + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); |
---|
| 983 | + rebuild_root_domains(); |
---|
| 984 | + mutex_unlock(&sched_domains_mutex); |
---|
| 985 | +} |
---|
| 986 | + |
---|
823 | 987 | /* |
---|
824 | 988 | * Rebuild scheduler domains. |
---|
825 | 989 | * |
---|
.. | .. |
---|
833 | 997 | */ |
---|
834 | 998 | static void rebuild_sched_domains_locked(void) |
---|
835 | 999 | { |
---|
| 1000 | + struct cgroup_subsys_state *pos_css; |
---|
836 | 1001 | struct sched_domain_attr *attr; |
---|
837 | 1002 | cpumask_var_t *doms; |
---|
| 1003 | + struct cpuset *cs; |
---|
838 | 1004 | int ndoms; |
---|
839 | 1005 | |
---|
840 | 1006 | lockdep_assert_held(&cpuset_mutex); |
---|
841 | | - get_online_cpus(); |
---|
842 | 1007 | |
---|
843 | 1008 | /* |
---|
844 | | - * We have raced with CPU hotplug. Don't do anything to avoid |
---|
| 1009 | + * If we have raced with CPU hotplug, return early to avoid |
---|
845 | 1010 | * passing doms with offlined cpu to partition_sched_domains(). |
---|
846 | | - * Anyways, hotplug work item will rebuild sched domains. |
---|
| 1011 | + * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. |
---|
| 1012 | + * |
---|
| 1013 | + * With no CPUs in any subpartitions, top_cpuset's effective CPUs |
---|
| 1014 | + * should be the same as the active CPUs, so checking only top_cpuset |
---|
| 1015 | + * is enough to detect racing CPU offlines. |
---|
847 | 1016 | */ |
---|
848 | | - if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) |
---|
849 | | - goto out; |
---|
| 1017 | + if (!top_cpuset.nr_subparts_cpus && |
---|
| 1018 | + !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) |
---|
| 1019 | + return; |
---|
| 1020 | + |
---|
| 1021 | + /* |
---|
| 1022 | + * With subpartition CPUs, however, the effective CPUs of a partition |
---|
| 1023 | + * root should be only a subset of the active CPUs. Since a CPU in any |
---|
| 1024 | + * partition root could be offlined, all must be checked. |
---|
| 1025 | + */ |
---|
| 1026 | + if (top_cpuset.nr_subparts_cpus) { |
---|
| 1027 | + rcu_read_lock(); |
---|
| 1028 | + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { |
---|
| 1029 | + if (!is_partition_root(cs)) { |
---|
| 1030 | + pos_css = css_rightmost_descendant(pos_css); |
---|
| 1031 | + continue; |
---|
| 1032 | + } |
---|
| 1033 | + if (!cpumask_subset(cs->effective_cpus, |
---|
| 1034 | + cpu_active_mask)) { |
---|
| 1035 | + rcu_read_unlock(); |
---|
| 1036 | + return; |
---|
| 1037 | + } |
---|
| 1038 | + } |
---|
| 1039 | + rcu_read_unlock(); |
---|
| 1040 | + } |
---|
850 | 1041 | |
---|
851 | 1042 | /* Generate domain masks and attrs */ |
---|
852 | 1043 | ndoms = generate_sched_domains(&doms, &attr); |
---|
853 | 1044 | |
---|
854 | 1045 | /* Have scheduler rebuild the domains */ |
---|
855 | | - partition_sched_domains(ndoms, doms, attr); |
---|
856 | | -out: |
---|
857 | | - put_online_cpus(); |
---|
| 1046 | + partition_and_rebuild_sched_domains(ndoms, doms, attr); |
---|
858 | 1047 | } |
---|
859 | 1048 | #else /* !CONFIG_SMP */ |
---|
860 | 1049 | static void rebuild_sched_domains_locked(void) |
---|
.. | .. |
---|
864 | 1053 | |
---|
865 | 1054 | void rebuild_sched_domains(void) |
---|
866 | 1055 | { |
---|
| 1056 | + get_online_cpus(); |
---|
867 | 1057 | mutex_lock(&cpuset_mutex); |
---|
868 | 1058 | rebuild_sched_domains_locked(); |
---|
869 | 1059 | mutex_unlock(&cpuset_mutex); |
---|
| 1060 | + put_online_cpus(); |
---|
| 1061 | +} |
---|
| 1062 | + |
---|
| 1063 | +static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p, |
---|
| 1064 | + const struct cpumask *new_mask) |
---|
| 1065 | +{ |
---|
| 1066 | + int ret = -EINVAL; |
---|
| 1067 | + |
---|
| 1068 | + trace_android_rvh_update_cpus_allowed(p, cs->cpus_requested, new_mask, &ret); |
---|
| 1069 | + if (!ret) |
---|
| 1070 | + return ret; |
---|
| 1071 | + |
---|
| 1072 | + return set_cpus_allowed_ptr(p, new_mask); |
---|
870 | 1073 | } |
---|
871 | 1074 | |
---|
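update_cpus_allowed() gives an Android vendor module the first chance to place the task: only when a registered handler sets *ret to 0 does it return early; otherwise it falls back to set_cpus_allowed_ptr(). A rough sketch of a vendor-side handler is below. The handler prototype is an assumption based on the usual restricted-vendor-hook convention (a leading void *data argument followed by the TP_PROTO arguments); the authoritative declaration of android_rvh_update_cpus_allowed lives in include/trace/hooks/sched.h and should be checked before relying on this.

```c
/* Hypothetical vendor-module handler, for illustration only.  The exact
 * prototype is assumed; it must match the android_rvh_update_cpus_allowed
 * declaration in include/trace/hooks/sched.h.
 */
static void example_update_cpus_allowed(void *data, struct task_struct *p,
					const struct cpumask *cpus_requested,
					const struct cpumask *new_mask, int *ret)
{
	/* Leaving *ret non-zero means "not handled": update_cpus_allowed()
	 * then falls back to set_cpus_allowed_ptr(p, new_mask).  Setting
	 * *ret to 0 tells it the vendor code placed the task itself.
	 */
	*ret = set_cpus_allowed_ptr(p, new_mask);
}
```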
872 | 1075 | /** |
---|
.. | .. |
---|
881 | 1084 | { |
---|
882 | 1085 | struct css_task_iter it; |
---|
883 | 1086 | struct task_struct *task; |
---|
| 1087 | + bool top_cs = cs == &top_cpuset; |
---|
884 | 1088 | |
---|
885 | 1089 | css_task_iter_start(&cs->css, 0, &it); |
---|
886 | | - while ((task = css_task_iter_next(&it))) |
---|
887 | | - set_cpus_allowed_ptr(task, cs->effective_cpus); |
---|
| 1090 | + while ((task = css_task_iter_next(&it))) { |
---|
| 1091 | + /* |
---|
| 1092 | + * Percpu kthreads in top_cpuset are ignored |
---|
| 1093 | + */ |
---|
| 1094 | + if (top_cs && (task->flags & PF_KTHREAD) && |
---|
| 1095 | + kthread_is_per_cpu(task)) |
---|
| 1096 | + continue; |
---|
| 1097 | + update_cpus_allowed(cs, task, cs->effective_cpus); |
---|
| 1098 | + } |
---|
888 | 1099 | css_task_iter_end(&it); |
---|
| 1100 | +} |
---|
| 1101 | + |
---|
| 1102 | +/** |
---|
| 1103 | + * compute_effective_cpumask - Compute the effective cpumask of the cpuset |
---|
| 1104 | + * @new_cpus: the temp variable for the new effective_cpus mask |
---|
| 1105 | + * @cs: the cpuset that needs to recompute the new effective_cpus mask |
---|
| 1106 | + * @parent: the parent cpuset |
---|
| 1107 | + * |
---|
| 1108 | + * If the parent has subpartition CPUs, include them in the list of |
---|
| 1109 | + * allowable CPUs in computing the new effective_cpus mask. Since offlined |
---|
| 1110 | + * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask |
---|
| 1111 | + * to mask those out. |
---|
| 1112 | + */ |
---|
| 1113 | +static void compute_effective_cpumask(struct cpumask *new_cpus, |
---|
| 1114 | + struct cpuset *cs, struct cpuset *parent) |
---|
| 1115 | +{ |
---|
| 1116 | + if (parent->nr_subparts_cpus) { |
---|
| 1117 | + cpumask_or(new_cpus, parent->effective_cpus, |
---|
| 1118 | + parent->subparts_cpus); |
---|
| 1119 | + cpumask_and(new_cpus, new_cpus, cs->cpus_requested); |
---|
| 1120 | + cpumask_and(new_cpus, new_cpus, cpu_active_mask); |
---|
| 1121 | + } else { |
---|
| 1122 | + cpumask_and(new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus); |
---|
| 1123 | + } |
---|
| 1124 | +} |
---|
| 1125 | + |
---|
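A small worked example may help when reading the two branches of compute_effective_cpumask() above; the CPU numbers are invented purely for illustration.

```c
/*
 * Worked example (illustrative numbers only):
 *
 *   parent->effective_cpus = 0-3      parent->subparts_cpus = 4-5
 *   cs->cpus_requested     = 2,4,6    cpu_active_mask       = 0-5
 *
 * The parent has subpartition CPUs, so the first branch runs:
 *
 *   new_cpus = (0-3 | 4-5) & (2,4,6) & (0-5) = 2,4
 *
 * CPU 6 is dropped because the parent never granted it, and any offlined
 * CPU lingering in subparts_cpus is dropped by the cpu_active_mask step.
 * Without subpartition CPUs the result would simply be
 * cs->cpus_requested & parent->effective_cpus = 2.
 */
```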
| 1126 | +/* |
---|
| 1127 | + * Commands for update_parent_subparts_cpumask |
---|
| 1128 | + */ |
---|
| 1129 | +enum subparts_cmd { |
---|
| 1130 | + partcmd_enable, /* Enable partition root */ |
---|
| 1131 | + partcmd_disable, /* Disable partition root */ |
---|
| 1132 | + partcmd_update, /* Update parent's subparts_cpus */ |
---|
| 1133 | +}; |
---|
| 1134 | + |
---|
| 1135 | +/** |
---|
| 1136 | + * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset |
---|
| 1137 | + * @cpuset: The cpuset that requests change in partition root state |
---|
| 1138 | + * @cmd: Partition root state change command |
---|
| 1139 | + * @newmask: Optional new cpumask for partcmd_update |
---|
| 1140 | + * @tmp: Temporary addmask and delmask |
---|
| 1141 | + * Return: 0, 1 or an error code |
---|
| 1142 | + * |
---|
| 1143 | + * For partcmd_enable, the cpuset is being transformed from a non-partition |
---|
| 1144 | + * root to a partition root. The cpus_allowed mask of the given cpuset will |
---|
| 1145 | + * be put into parent's subparts_cpus and taken away from parent's |
---|
| 1146 | + * effective_cpus. The function will return 0 if all the CPUs listed in |
---|
| 1147 | + * cpus_allowed can be granted or an error code will be returned. |
---|
| 1148 | + * |
---|
| 1149 | + * For partcmd_disable, the cpuset is being transformed from a partition |
---|
| 1150 | + * root back to a non-partition root. Any CPUs in cpus_allowed that are in |
---|
| 1151 | + * parent's subparts_cpus will be taken away from that cpumask and put back |
---|
| 1152 | + * into parent's effective_cpus. 0 should always be returned. |
---|
| 1153 | + * |
---|
| 1154 | + * For partcmd_update, if the optional newmask is specified, the cpu |
---|
| 1155 | + * list is to be changed from cpus_allowed to newmask. Otherwise, |
---|
| 1156 | + * cpus_allowed is assumed to remain the same. The cpuset should either |
---|
| 1157 | + * be a partition root or an invalid partition root. The partition root |
---|
| 1158 | + * state may change if newmask is NULL and none of the requested CPUs can |
---|
| 1159 | + * be granted by the parent. The function will return 1 if changes to |
---|
| 1160 | + * parent's subparts_cpus and effective_cpus happen or 0 otherwise. |
---|
| 1161 | + * Error code should only be returned when newmask is non-NULL. |
---|
| 1162 | + * |
---|
| 1163 | + * The partcmd_enable and partcmd_disable commands are used by |
---|
| 1164 | + * update_prstate(). The partcmd_update command is used by |
---|
| 1165 | + * update_cpumasks_hier() with newmask NULL and update_cpumask() with |
---|
| 1166 | + * newmask set. |
---|
| 1167 | + * |
---|
| 1168 | + * The checking is more strict when enabling partition root than the |
---|
| 1169 | + * other two commands. |
---|
| 1170 | + * |
---|
| 1171 | + * Because of the implicit cpu exclusive nature of a partition root, |
---|
| 1172 | + * cpumask changes that violate the cpu exclusivity rule will not be |
---|
| 1173 | + * permitted when checked by validate_change(). The validate_change() |
---|
| 1174 | + * function will also prevent any changes to the cpu list if it is not |
---|
| 1175 | + * a superset of children's cpu lists. |
---|
| 1176 | + */ |
---|
| 1177 | +static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, |
---|
| 1178 | + struct cpumask *newmask, |
---|
| 1179 | + struct tmpmasks *tmp) |
---|
| 1180 | +{ |
---|
| 1181 | + struct cpuset *parent = parent_cs(cpuset); |
---|
| 1182 | + int adding; /* Moving cpus from effective_cpus to subparts_cpus */ |
---|
| 1183 | + int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ |
---|
| 1184 | + int new_prs; |
---|
| 1185 | + bool part_error = false; /* Partition error? */ |
---|
| 1186 | + |
---|
| 1187 | + lockdep_assert_held(&cpuset_mutex); |
---|
| 1188 | + |
---|
| 1189 | + /* |
---|
| 1190 | + * The parent must be a partition root. |
---|
| 1191 | + * The new cpumask, if present, or the current cpus_allowed must |
---|
| 1192 | + * not be empty. |
---|
| 1193 | + */ |
---|
| 1194 | + if (!is_partition_root(parent) || |
---|
| 1195 | + (newmask && cpumask_empty(newmask)) || |
---|
| 1196 | + (!newmask && cpumask_empty(cpuset->cpus_allowed))) |
---|
| 1197 | + return -EINVAL; |
---|
| 1198 | + |
---|
| 1199 | + /* |
---|
| 1200 | + * Enabling/disabling partition root is not allowed if there are |
---|
| 1201 | + * online children. |
---|
| 1202 | + */ |
---|
| 1203 | + if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css)) |
---|
| 1204 | + return -EBUSY; |
---|
| 1205 | + |
---|
| 1206 | + /* |
---|
| 1207 | + * Enabling partition root is not allowed if not all the CPUs |
---|
| 1208 | + * can be granted from parent's effective_cpus or at least one |
---|
| 1209 | + * CPU will be left after that. |
---|
| 1210 | + */ |
---|
| 1211 | + if ((cmd == partcmd_enable) && |
---|
| 1212 | + (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) || |
---|
| 1213 | + cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus))) |
---|
| 1214 | + return -EINVAL; |
---|
| 1215 | + |
---|
| 1216 | + /* |
---|
| 1217 | + * A cpumask update cannot make parent's effective_cpus become empty. |
---|
| 1218 | + */ |
---|
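To make the walk-up in guarantee_online_cpus() concrete, here is a small illustrative scenario; the CPU numbers and the 32-bit-capable detail are invented for the example, but an asymmetric system of this kind is exactly what task_cpu_possible_mask() is meant to describe.

```c
/*
 * Illustrative scenario: a 4+4 system where only CPUs 0-3 can run the
 * (e.g. 32-bit) task, so task_cpu_possible_mask(tsk) = 0-3, and the
 * task sits in a cpuset whose effective_cpus = 4-7.
 *
 *   pmask = possible_mask & cpu_active_mask        = 0-3
 *   cs->effective_cpus (4-7) does not intersect pmask, so walk up to
 *   the parent; suppose its effective_cpus = 0-7:
 *   pmask = pmask & parent->effective_cpus         = 0-3
 *
 * If even top_cpuset fails the intersection test (a hotplug race),
 * pmask is simply left as possible_mask & cpu_active_mask.
 */
```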
| 1219 | + adding = deleting = false; |
---|
| 1220 | + new_prs = cpuset->partition_root_state; |
---|
| 1221 | + if (cmd == partcmd_enable) { |
---|
| 1222 | + cpumask_copy(tmp->addmask, cpuset->cpus_allowed); |
---|
| 1223 | + adding = true; |
---|
| 1224 | + } else if (cmd == partcmd_disable) { |
---|
| 1225 | + deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, |
---|
| 1226 | + parent->subparts_cpus); |
---|
| 1227 | + } else if (newmask) { |
---|
| 1228 | + /* |
---|
| 1229 | + * partcmd_update with newmask: |
---|
| 1230 | + * |
---|
| 1231 | + * delmask = cpus_allowed & ~newmask & parent->subparts_cpus |
---|
| 1232 | + * addmask = newmask & parent->effective_cpus |
---|
| 1233 | + * & ~parent->subparts_cpus |
---|
| 1234 | + */ |
---|
| 1235 | + cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask); |
---|
| 1236 | + deleting = cpumask_and(tmp->delmask, tmp->delmask, |
---|
| 1237 | + parent->subparts_cpus); |
---|
| 1238 | + |
---|
| 1239 | + cpumask_and(tmp->addmask, newmask, parent->effective_cpus); |
---|
| 1240 | + adding = cpumask_andnot(tmp->addmask, tmp->addmask, |
---|
| 1241 | + parent->subparts_cpus); |
---|
| 1242 | + /* |
---|
| 1243 | + * Return error if the new effective_cpus could become empty. |
---|
| 1244 | + */ |
---|
| 1245 | + if (adding && |
---|
| 1246 | + cpumask_equal(parent->effective_cpus, tmp->addmask)) { |
---|
| 1247 | + if (!deleting) |
---|
| 1248 | + return -EINVAL; |
---|
| 1249 | + /* |
---|
| 1250 | + * As some of the CPUs in subparts_cpus might have |
---|
| 1251 | + * been offlined, we need to compute the real delmask |
---|
| 1252 | + * to confirm that. |
---|
| 1253 | + */ |
---|
| 1254 | + if (!cpumask_and(tmp->addmask, tmp->delmask, |
---|
| 1255 | + cpu_active_mask)) |
---|
| 1256 | + return -EINVAL; |
---|
| 1257 | + cpumask_copy(tmp->addmask, parent->effective_cpus); |
---|
| 1258 | + } |
---|
| 1259 | + } else { |
---|
| 1260 | + /* |
---|
| 1261 | + * partcmd_update w/o newmask: |
---|
| 1262 | + * |
---|
| 1263 | + * addmask = cpus_allowed & parent->effective_cpus |
---|
| 1264 | + * |
---|
| 1265 | + * Note that parent's subparts_cpus may have been |
---|
| 1266 | + * pre-shrunk in case there is a change in the cpu list. |
---|
| 1267 | + * So no deletion is needed. |
---|
| 1268 | + */ |
---|
| 1269 | + adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed, |
---|
| 1270 | + parent->effective_cpus); |
---|
| 1271 | + part_error = cpumask_equal(tmp->addmask, |
---|
| 1272 | + parent->effective_cpus); |
---|
| 1273 | + } |
---|
| 1274 | + |
---|
| 1275 | + if (cmd == partcmd_update) { |
---|
| 1276 | + int prev_prs = cpuset->partition_root_state; |
---|
| 1277 | + |
---|
| 1278 | + /* |
---|
| 1279 | + * Check for possible transition between PRS_ENABLED |
---|
| 1280 | + * and PRS_ERROR. |
---|
| 1281 | + */ |
---|
| 1282 | + switch (cpuset->partition_root_state) { |
---|
| 1283 | + case PRS_ENABLED: |
---|
| 1284 | + if (part_error) |
---|
| 1285 | + new_prs = PRS_ERROR; |
---|
| 1286 | + break; |
---|
| 1287 | + case PRS_ERROR: |
---|
| 1288 | + if (!part_error) |
---|
| 1289 | + new_prs = PRS_ENABLED; |
---|
| 1290 | + break; |
---|
| 1291 | + } |
---|
| 1292 | + /* |
---|
| 1293 | + * Set part_error if previously in invalid state. |
---|
| 1294 | + */ |
---|
| 1295 | + part_error = (prev_prs == PRS_ERROR); |
---|
| 1296 | + } |
---|
| 1297 | + |
---|
| 1298 | + if (!part_error && (new_prs == PRS_ERROR)) |
---|
| 1299 | + return 0; /* Nothing need to be done */ |
---|
| 1300 | + |
---|
| 1301 | + if (new_prs == PRS_ERROR) { |
---|
| 1302 | + /* |
---|
| 1303 | + * Remove all its cpus from parent's subparts_cpus. |
---|
| 1304 | + */ |
---|
| 1305 | + adding = false; |
---|
| 1306 | + deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, |
---|
| 1307 | + parent->subparts_cpus); |
---|
| 1308 | + } |
---|
| 1309 | + |
---|
| 1310 | + if (!adding && !deleting && (new_prs == cpuset->partition_root_state)) |
---|
| 1311 | + return 0; |
---|
| 1312 | + |
---|
| 1313 | + /* |
---|
| 1314 | + * Change the parent's subparts_cpus. |
---|
| 1315 | + * Newly added CPUs will be removed from effective_cpus and |
---|
| 1316 | + * newly deleted ones will be added back to effective_cpus. |
---|
| 1317 | + */ |
---|
| 1318 | + raw_spin_lock_irq(&callback_lock); |
---|
| 1319 | + if (adding) { |
---|
| 1320 | + cpumask_or(parent->subparts_cpus, |
---|
| 1321 | + parent->subparts_cpus, tmp->addmask); |
---|
| 1322 | + cpumask_andnot(parent->effective_cpus, |
---|
| 1323 | + parent->effective_cpus, tmp->addmask); |
---|
| 1324 | + } |
---|
| 1325 | + if (deleting) { |
---|
| 1326 | + cpumask_andnot(parent->subparts_cpus, |
---|
| 1327 | + parent->subparts_cpus, tmp->delmask); |
---|
| 1328 | + /* |
---|
| 1329 | + * Some of the CPUs in subparts_cpus might have been offlined. |
---|
| 1330 | + */ |
---|
| 1331 | + cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask); |
---|
| 1332 | + cpumask_or(parent->effective_cpus, |
---|
| 1333 | + parent->effective_cpus, tmp->delmask); |
---|
| 1334 | + } |
---|
| 1335 | + |
---|
| 1336 | + parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); |
---|
| 1337 | + |
---|
| 1338 | + if (cpuset->partition_root_state != new_prs) |
---|
| 1339 | + cpuset->partition_root_state = new_prs; |
---|
| 1340 | + raw_spin_unlock_irq(&callback_lock); |
---|
| 1341 | + |
---|
| 1342 | + return cmd == partcmd_update; |
---|
889 | 1343 | } |
---|
890 | 1344 | |
---|
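For the partcmd_update-with-newmask case handled above, a concrete example (numbers invented for illustration) shows how addmask and delmask drive the transfer of CPUs between the parent's effective_cpus and subparts_cpus.

```c
/*
 * Worked example for partcmd_update with a new cpumask (illustrative):
 *
 *   cpuset->cpus_allowed  = 2-5      newmask                = 4-7
 *   parent->subparts_cpus = 2-5      parent->effective_cpus = 0-1,6-7
 *
 *   delmask = cpus_allowed & ~newmask & parent->subparts_cpus
 *           = (2-5) & ~(4-7) & (2-5)                         = 2-3
 *   addmask = newmask & parent->effective_cpus & ~parent->subparts_cpus
 *           = (4-7) & (0-1,6-7) & ~(2-5)                     = 6-7
 *
 * Under callback_lock the parent then moves 6-7 from effective_cpus into
 * subparts_cpus and returns 2-3 from subparts_cpus to effective_cpus,
 * leaving parent->subparts_cpus = 4-7 and parent->effective_cpus = 0-3.
 */
```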
891 | 1345 | /* |
---|
892 | 1346 | * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree |
---|
893 | | - * @cs: the cpuset to consider |
---|
894 | | - * @new_cpus: temp variable for calculating new effective_cpus |
---|
| 1347 | + * @cs: the cpuset to consider |
---|
| 1348 | + * @tmp: temp variables for calculating effective_cpus & partition setup |
---|
895 | 1349 | * |
---|
896 | 1350 | * When configured cpumask is changed, the effective cpumasks of this cpuset |
---|
897 | 1351 | * and all its descendants need to be updated. |
---|
.. | .. |
---|
900 | 1354 | * |
---|
901 | 1355 | * Called with cpuset_mutex held |
---|
902 | 1356 | */ |
---|
903 | | -static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) |
---|
| 1357 | +static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) |
---|
904 | 1358 | { |
---|
905 | 1359 | struct cpuset *cp; |
---|
906 | 1360 | struct cgroup_subsys_state *pos_css; |
---|
907 | 1361 | bool need_rebuild_sched_domains = false; |
---|
| 1362 | + int new_prs; |
---|
908 | 1363 | |
---|
909 | 1364 | rcu_read_lock(); |
---|
910 | 1365 | cpuset_for_each_descendant_pre(cp, pos_css, cs) { |
---|
911 | 1366 | struct cpuset *parent = parent_cs(cp); |
---|
912 | 1367 | |
---|
913 | | - cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus); |
---|
| 1368 | + compute_effective_cpumask(tmp->new_cpus, cp, parent); |
---|
914 | 1369 | |
---|
915 | 1370 | /* |
---|
916 | 1371 | * If it becomes empty, inherit the effective mask of the |
---|
917 | 1372 | * parent, which is guaranteed to have some CPUs. |
---|
918 | 1373 | */ |
---|
919 | | - if (is_in_v2_mode() && cpumask_empty(new_cpus)) |
---|
920 | | - cpumask_copy(new_cpus, parent->effective_cpus); |
---|
| 1374 | + if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) { |
---|
| 1375 | + cpumask_copy(tmp->new_cpus, parent->effective_cpus); |
---|
| 1376 | + if (!cp->use_parent_ecpus) { |
---|
| 1377 | + cp->use_parent_ecpus = true; |
---|
| 1378 | + parent->child_ecpus_count++; |
---|
| 1379 | + } |
---|
| 1380 | + } else if (cp->use_parent_ecpus) { |
---|
| 1381 | + cp->use_parent_ecpus = false; |
---|
| 1382 | + WARN_ON_ONCE(!parent->child_ecpus_count); |
---|
| 1383 | + parent->child_ecpus_count--; |
---|
| 1384 | + } |
---|
921 | 1385 | |
---|
922 | | - /* Skip the whole subtree if the cpumask remains the same. */ |
---|
923 | | - if (cpumask_equal(new_cpus, cp->effective_cpus)) { |
---|
| 1386 | + /* |
---|
| 1387 | + * Skip the whole subtree if the cpumask remains the same |
---|
| 1388 | + * and has no partition root state. |
---|
| 1389 | + */ |
---|
| 1390 | + if (!cp->partition_root_state && |
---|
| 1391 | + cpumask_equal(tmp->new_cpus, cp->effective_cpus)) { |
---|
924 | 1392 | pos_css = css_rightmost_descendant(pos_css); |
---|
925 | 1393 | continue; |
---|
| 1394 | + } |
---|
| 1395 | + |
---|
| 1396 | + /* |
---|
| 1397 | + * update_parent_subparts_cpumask() should have been called |
---|
| 1398 | + * for cs already in update_cpumask(). We should also call |
---|
| 1399 | + * update_tasks_cpumask() again for tasks in the parent |
---|
| 1400 | + * cpuset if the parent's subparts_cpus changes. |
---|
| 1401 | + */ |
---|
| 1402 | + new_prs = cp->partition_root_state; |
---|
| 1403 | + if ((cp != cs) && new_prs) { |
---|
| 1404 | + switch (parent->partition_root_state) { |
---|
| 1405 | + case PRS_DISABLED: |
---|
| 1406 | + /* |
---|
| 1407 | + * If parent is not a partition root or an |
---|
| 1408 | + * invalid partition root, clear its state |
---|
| 1409 | + * and its CS_CPU_EXCLUSIVE flag. |
---|
| 1410 | + */ |
---|
| 1411 | + WARN_ON_ONCE(cp->partition_root_state |
---|
| 1412 | + != PRS_ERROR); |
---|
| 1413 | + new_prs = PRS_DISABLED; |
---|
| 1414 | + |
---|
| 1415 | + /* |
---|
| 1416 | + * clear_bit() is an atomic operation and |
---|
| 1417 | + * readers aren't interested in the state |
---|
| 1418 | + * of CS_CPU_EXCLUSIVE anyway. So we can |
---|
| 1419 | + * just update the flag without holding |
---|
| 1420 | + * the callback_lock. |
---|
| 1421 | + */ |
---|
| 1422 | + clear_bit(CS_CPU_EXCLUSIVE, &cp->flags); |
---|
| 1423 | + break; |
---|
| 1424 | + |
---|
| 1425 | + case PRS_ENABLED: |
---|
| 1426 | + if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp)) |
---|
| 1427 | + update_tasks_cpumask(parent); |
---|
| 1428 | + break; |
---|
| 1429 | + |
---|
| 1430 | + case PRS_ERROR: |
---|
| 1431 | + /* |
---|
| 1432 | + * When the parent is invalid, this cpuset has to be invalid too. |
---|
| 1433 | + */ |
---|
| 1434 | + new_prs = PRS_ERROR; |
---|
| 1435 | + break; |
---|
| 1436 | + } |
---|
926 | 1437 | } |
---|
927 | 1438 | |
---|
928 | 1439 | if (!css_tryget_online(&cp->css)) |
---|
929 | 1440 | continue; |
---|
930 | 1441 | rcu_read_unlock(); |
---|
931 | 1442 | |
---|
932 | | - spin_lock_irq(&callback_lock); |
---|
933 | | - cpumask_copy(cp->effective_cpus, new_cpus); |
---|
934 | | - spin_unlock_irq(&callback_lock); |
---|
| 1443 | + raw_spin_lock_irq(&callback_lock); |
---|
| 1444 | + |
---|
| 1445 | + cpumask_copy(cp->effective_cpus, tmp->new_cpus); |
---|
| 1446 | + if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) { |
---|
| 1447 | + cp->nr_subparts_cpus = 0; |
---|
| 1448 | + cpumask_clear(cp->subparts_cpus); |
---|
| 1449 | + } else if (cp->nr_subparts_cpus) { |
---|
| 1450 | + /* |
---|
| 1451 | + * Make sure that effective_cpus & subparts_cpus |
---|
| 1452 | + * are mutually exclusive. |
---|
| 1453 | + * |
---|
| 1454 | + * In the unlikely event that effective_cpus |
---|
| 1455 | + * becomes empty, we clear cp->nr_subparts_cpus and |
---|
| 1456 | + * let its child partition roots compete for |
---|
| 1457 | + * CPUs again. |
---|
| 1458 | + */ |
---|
| 1459 | + cpumask_andnot(cp->effective_cpus, cp->effective_cpus, |
---|
| 1460 | + cp->subparts_cpus); |
---|
| 1461 | + if (cpumask_empty(cp->effective_cpus)) { |
---|
| 1462 | + cpumask_copy(cp->effective_cpus, tmp->new_cpus); |
---|
| 1463 | + cpumask_clear(cp->subparts_cpus); |
---|
| 1464 | + cp->nr_subparts_cpus = 0; |
---|
| 1465 | + } else if (!cpumask_subset(cp->subparts_cpus, |
---|
| 1466 | + tmp->new_cpus)) { |
---|
| 1467 | + cpumask_andnot(cp->subparts_cpus, |
---|
| 1468 | + cp->subparts_cpus, tmp->new_cpus); |
---|
| 1469 | + cp->nr_subparts_cpus |
---|
| 1470 | + = cpumask_weight(cp->subparts_cpus); |
---|
| 1471 | + } |
---|
| 1472 | + } |
---|
| 1473 | + |
---|
| 1474 | + if (new_prs != cp->partition_root_state) |
---|
| 1475 | + cp->partition_root_state = new_prs; |
---|
| 1476 | + |
---|
| 1477 | + raw_spin_unlock_irq(&callback_lock); |
---|
935 | 1478 | |
---|
936 | 1479 | WARN_ON(!is_in_v2_mode() && |
---|
937 | 1480 | !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); |
---|
.. | .. |
---|
939 | 1482 | update_tasks_cpumask(cp); |
---|
940 | 1483 | |
---|
941 | 1484 | /* |
---|
942 | | - * If the effective cpumask of any non-empty cpuset is changed, |
---|
943 | | - * we need to rebuild sched domains. |
---|
| 1485 | + * On legacy hierarchy, if the effective cpumask of any non- |
---|
| 1486 | + * empty cpuset is changed, we need to rebuild sched domains. |
---|
| 1487 | + * On default hierarchy, the cpuset needs to be a partition |
---|
| 1488 | + * root as well. |
---|
944 | 1489 | */ |
---|
945 | 1490 | if (!cpumask_empty(cp->cpus_allowed) && |
---|
946 | | - is_sched_load_balance(cp)) |
---|
| 1491 | + is_sched_load_balance(cp) && |
---|
| 1492 | + (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || |
---|
| 1493 | + is_partition_root(cp))) |
---|
947 | 1494 | need_rebuild_sched_domains = true; |
---|
948 | 1495 | |
---|
949 | 1496 | rcu_read_lock(); |
---|
.. | .. |
---|
956 | 1503 | } |
---|
957 | 1504 | |
---|
958 | 1505 | /** |
---|
| 1506 | + * update_sibling_cpumasks - Update siblings cpumasks |
---|
| 1507 | + * @parent: Parent cpuset |
---|
| 1508 | + * @cs: Current cpuset |
---|
| 1509 | + * @tmp: Temp variables |
---|
| 1510 | + */ |
---|
| 1511 | +static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, |
---|
| 1512 | + struct tmpmasks *tmp) |
---|
| 1513 | +{ |
---|
| 1514 | + struct cpuset *sibling; |
---|
| 1515 | + struct cgroup_subsys_state *pos_css; |
---|
| 1516 | + |
---|
| 1517 | + lockdep_assert_held(&cpuset_mutex); |
---|
| 1518 | + |
---|
| 1519 | + /* |
---|
| 1520 | + * Check all its siblings and call update_cpumasks_hier() |
---|
| 1521 | + * if their use_parent_ecpus flag is set in order for them |
---|
| 1522 | + * to use the right effective_cpus value. |
---|
| 1523 | + * |
---|
| 1524 | + * The update_cpumasks_hier() function may sleep. So we have to |
---|
| 1525 | + * release the RCU read lock before calling it. |
---|
| 1526 | + */ |
---|
| 1527 | + rcu_read_lock(); |
---|
| 1528 | + cpuset_for_each_child(sibling, pos_css, parent) { |
---|
| 1529 | + if (sibling == cs) |
---|
| 1530 | + continue; |
---|
| 1531 | + if (!sibling->use_parent_ecpus) |
---|
| 1532 | + continue; |
---|
| 1533 | + if (!css_tryget_online(&sibling->css)) |
---|
| 1534 | + continue; |
---|
| 1535 | + |
---|
| 1536 | + rcu_read_unlock(); |
---|
| 1537 | + update_cpumasks_hier(sibling, tmp); |
---|
| 1538 | + rcu_read_lock(); |
---|
| 1539 | + css_put(&sibling->css); |
---|
| 1540 | + } |
---|
| 1541 | + rcu_read_unlock(); |
---|
| 1542 | +} |
---|
| 1543 | + |
---|
| 1544 | +/** |
---|
959 | 1545 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it |
---|
960 | 1546 | * @cs: the cpuset to consider |
---|
961 | 1547 | * @trialcs: trial cpuset |
---|
.. | .. |
---|
965 | 1551 | const char *buf) |
---|
966 | 1552 | { |
---|
967 | 1553 | int retval; |
---|
| 1554 | + struct tmpmasks tmp; |
---|
968 | 1555 | |
---|
969 | 1556 | /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ |
---|
970 | 1557 | if (cs == &top_cpuset) |
---|
.. | .. |
---|
997 | 1584 | if (retval < 0) |
---|
998 | 1585 | return retval; |
---|
999 | 1586 | |
---|
1000 | | - spin_lock_irq(&callback_lock); |
---|
| 1587 | +#ifdef CONFIG_CPUMASK_OFFSTACK |
---|
| 1588 | + /* |
---|
| 1589 | + * Use the cpumasks in trialcs for tmpmasks when they are pointers |
---|
| 1590 | + * to allocated cpumasks. |
---|
| 1591 | + */ |
---|
| 1592 | + tmp.addmask = trialcs->subparts_cpus; |
---|
| 1593 | + tmp.delmask = trialcs->effective_cpus; |
---|
| 1594 | + tmp.new_cpus = trialcs->cpus_allowed; |
---|
| 1595 | +#endif |
---|
| 1596 | + |
---|
| 1597 | + if (cs->partition_root_state) { |
---|
| 1598 | + /* Cpumask of a partition root cannot be empty */ |
---|
| 1599 | + if (cpumask_empty(trialcs->cpus_allowed)) |
---|
| 1600 | + return -EINVAL; |
---|
| 1601 | + if (update_parent_subparts_cpumask(cs, partcmd_update, |
---|
| 1602 | + trialcs->cpus_allowed, &tmp) < 0) |
---|
| 1603 | + return -EINVAL; |
---|
| 1604 | + } |
---|
| 1605 | + |
---|
| 1606 | + raw_spin_lock_irq(&callback_lock); |
---|
1001 | 1607 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); |
---|
1002 | 1608 | cpumask_copy(cs->cpus_requested, trialcs->cpus_requested); |
---|
1003 | | - spin_unlock_irq(&callback_lock); |
---|
1004 | 1609 | |
---|
1005 | | - /* use trialcs->cpus_allowed as a temp variable */ |
---|
1006 | | - update_cpumasks_hier(cs, trialcs->cpus_allowed); |
---|
| 1610 | + /* |
---|
| 1611 | + * Make sure that subparts_cpus is a subset of cpus_allowed. |
---|
| 1612 | + */ |
---|
| 1613 | + if (cs->nr_subparts_cpus) { |
---|
| 1614 | + cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed); |
---|
| 1615 | + cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); |
---|
| 1616 | + } |
---|
| 1617 | + raw_spin_unlock_irq(&callback_lock); |
---|
| 1618 | + |
---|
| 1619 | + update_cpumasks_hier(cs, &tmp); |
---|
| 1620 | + |
---|
| 1621 | + if (cs->partition_root_state) { |
---|
| 1622 | + struct cpuset *parent = parent_cs(cs); |
---|
| 1623 | + |
---|
| 1624 | + /* |
---|
| 1625 | + * For partition root, update the cpumasks of sibling |
---|
| 1626 | + * cpusets if they use parent's effective_cpus. |
---|
| 1627 | + */ |
---|
| 1628 | + if (parent->child_ecpus_count) |
---|
| 1629 | + update_sibling_cpumasks(parent, cs, &tmp); |
---|
| 1630 | + } |
---|
1007 | 1631 | return 0; |
---|
1008 | 1632 | } |
---|
1009 | 1633 | |
---|
.. | .. |
---|
1104 | 1728 | guarantee_online_mems(cs, &newmems); |
---|
1105 | 1729 | |
---|
1106 | 1730 | /* |
---|
1107 | | - * The mpol_rebind_mm() call takes mmap_sem, which we couldn't |
---|
| 1731 | + * The mpol_rebind_mm() call takes mmap_lock, which we couldn't |
---|
1108 | 1732 | * take while holding tasklist_lock. Forks can happen - the |
---|
1109 | 1733 | * mpol_dup() cpuset_being_rebound check will catch such forks, |
---|
1110 | 1734 | * and rebind their vma mempolicies too. Because we still hold |
---|
.. | .. |
---|
1184 | 1808 | continue; |
---|
1185 | 1809 | rcu_read_unlock(); |
---|
1186 | 1810 | |
---|
1187 | | - spin_lock_irq(&callback_lock); |
---|
| 1811 | + raw_spin_lock_irq(&callback_lock); |
---|
1188 | 1812 | cp->effective_mems = *new_mems; |
---|
1189 | | - spin_unlock_irq(&callback_lock); |
---|
| 1813 | + raw_spin_unlock_irq(&callback_lock); |
---|
1190 | 1814 | |
---|
1191 | 1815 | WARN_ON(!is_in_v2_mode() && |
---|
1192 | 1816 | !nodes_equal(cp->mems_allowed, cp->effective_mems)); |
---|
.. | .. |
---|
1209 | 1833 | * |
---|
1210 | 1834 | * Call with cpuset_mutex held. May take callback_lock during call. |
---|
1211 | 1835 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, |
---|
1212 | | - * lock each such tasks mm->mmap_sem, scan its vma's and rebind |
---|
| 1836 | + * lock each such tasks mm->mmap_lock, scan its vma's and rebind |
---|
1213 | 1837 | * their mempolicies to the cpusets new mems_allowed. |
---|
1214 | 1838 | */ |
---|
1215 | 1839 | static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, |
---|
.. | .. |
---|
1254 | 1878 | if (retval < 0) |
---|
1255 | 1879 | goto done; |
---|
1256 | 1880 | |
---|
1257 | | - spin_lock_irq(&callback_lock); |
---|
| 1881 | + raw_spin_lock_irq(&callback_lock); |
---|
1258 | 1882 | cs->mems_allowed = trialcs->mems_allowed; |
---|
1259 | | - spin_unlock_irq(&callback_lock); |
---|
| 1883 | + raw_spin_unlock_irq(&callback_lock); |
---|
1260 | 1884 | |
---|
1261 | 1885 | /* use trialcs->mems_allowed as a temp variable */ |
---|
1262 | 1886 | update_nodemasks_hier(cs, &trialcs->mems_allowed); |
---|
.. | .. |
---|
1347 | 1971 | spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) |
---|
1348 | 1972 | || (is_spread_page(cs) != is_spread_page(trialcs))); |
---|
1349 | 1973 | |
---|
1350 | | - spin_lock_irq(&callback_lock); |
---|
| 1974 | + raw_spin_lock_irq(&callback_lock); |
---|
1351 | 1975 | cs->flags = trialcs->flags; |
---|
1352 | | - spin_unlock_irq(&callback_lock); |
---|
| 1976 | + raw_spin_unlock_irq(&callback_lock); |
---|
1353 | 1977 | |
---|
1354 | 1978 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) |
---|
1355 | 1979 | rebuild_sched_domains_locked(); |
---|
.. | .. |
---|
1357 | 1981 | if (spread_flag_changed) |
---|
1358 | 1982 | update_tasks_flags(cs); |
---|
1359 | 1983 | out: |
---|
1360 | | - free_trial_cpuset(trialcs); |
---|
| 1984 | + free_cpuset(trialcs); |
---|
| 1985 | + return err; |
---|
| 1986 | +} |
---|
| 1987 | + |
---|
| 1988 | +/* |
---|
| 1989 | + * update_prstate - update partition_root_state |
---|
| 1990 | + * cs: the cpuset to update |
---|
| 1991 | + * new_prs: new partition root state |
---|
| 1992 | + * |
---|
| 1993 | + * Call with cpuset_mutex held. |
---|
| 1994 | + */ |
---|
| 1995 | +static int update_prstate(struct cpuset *cs, int new_prs) |
---|
| 1996 | +{ |
---|
| 1997 | + int err, old_prs = cs->partition_root_state; |
---|
| 1998 | + struct cpuset *parent = parent_cs(cs); |
---|
| 1999 | + struct tmpmasks tmpmask; |
---|
| 2000 | + |
---|
| 2001 | + if (old_prs == new_prs) |
---|
| 2002 | + return 0; |
---|
| 2003 | + |
---|
| 2004 | + /* |
---|
| 2005 | + * Cannot force a partial or invalid partition root to a full |
---|
| 2006 | + * partition root. |
---|
| 2007 | + */ |
---|
| 2008 | + if (new_prs && (old_prs == PRS_ERROR)) |
---|
| 2009 | + return -EINVAL; |
---|
| 2010 | + |
---|
| 2011 | + if (alloc_cpumasks(NULL, &tmpmask)) |
---|
| 2012 | + return -ENOMEM; |
---|
| 2013 | + |
---|
| 2014 | + err = -EINVAL; |
---|
| 2015 | + if (!old_prs) { |
---|
| 2016 | + /* |
---|
| 2017 | + * Turning on partition root requires setting the |
---|
| 2018 | + * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed |
---|
| 2019 | + * cannot be empty. |
---|
| 2020 | + */ |
---|
| 2021 | + if (cpumask_empty(cs->cpus_allowed)) |
---|
| 2022 | + goto out; |
---|
| 2023 | + |
---|
| 2024 | + err = update_flag(CS_CPU_EXCLUSIVE, cs, 1); |
---|
| 2025 | + if (err) |
---|
| 2026 | + goto out; |
---|
| 2027 | + |
---|
| 2028 | + err = update_parent_subparts_cpumask(cs, partcmd_enable, |
---|
| 2029 | + NULL, &tmpmask); |
---|
| 2030 | + if (err) { |
---|
| 2031 | + update_flag(CS_CPU_EXCLUSIVE, cs, 0); |
---|
| 2032 | + goto out; |
---|
| 2033 | + } |
---|
| 2034 | + } else { |
---|
| 2035 | + /* |
---|
| 2036 | + * Turning off partition root will clear the |
---|
| 2037 | + * CS_CPU_EXCLUSIVE bit. |
---|
| 2038 | + */ |
---|
| 2039 | + if (old_prs == PRS_ERROR) { |
---|
| 2040 | + update_flag(CS_CPU_EXCLUSIVE, cs, 0); |
---|
| 2041 | + err = 0; |
---|
| 2042 | + goto out; |
---|
| 2043 | + } |
---|
| 2044 | + |
---|
| 2045 | + err = update_parent_subparts_cpumask(cs, partcmd_disable, |
---|
| 2046 | + NULL, &tmpmask); |
---|
| 2047 | + if (err) |
---|
| 2048 | + goto out; |
---|
| 2049 | + |
---|
| 2050 | + /* Turning off CS_CPU_EXCLUSIVE will not return error */ |
---|
| 2051 | + update_flag(CS_CPU_EXCLUSIVE, cs, 0); |
---|
| 2052 | + } |
---|
| 2053 | + |
---|
| 2054 | + update_tasks_cpumask(parent); |
---|
| 2055 | + |
---|
| 2056 | + if (parent->child_ecpus_count) |
---|
| 2057 | + update_sibling_cpumasks(parent, cs, &tmpmask); |
---|
| 2058 | + |
---|
| 2059 | + rebuild_sched_domains_locked(); |
---|
| 2060 | +out: |
---|
| 2061 | + if (!err) { |
---|
| 2062 | + raw_spin_lock_irq(&callback_lock); |
---|
| 2063 | + cs->partition_root_state = new_prs; |
---|
| 2064 | + raw_spin_unlock_irq(&callback_lock); |
---|
| 2065 | + } |
---|
| 2066 | + |
---|
| 2067 | + free_cpumasks(NULL, &tmpmask); |
---|
1361 | 2068 | return err; |
---|
1362 | 2069 | } |
---|
1363 | 2070 | |
---|
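To make the transition rules above concrete, here is a minimal user-space sketch of the early validity checks in update_prstate() (the PRS_* values mirror the definitions earlier in this file; prs_check_transition() is an illustrative name, not a function in this file):

```c
#include <errno.h>
#include <stdio.h>

#define PRS_DISABLED  0   /* "member"       */
#define PRS_ENABLED   1   /* "root"         */
#define PRS_ERROR    -1   /* "root invalid" */

/*
 * Sketch of the checks update_prstate() performs before it touches the
 * parent's subparts_cpus: a no-op when the state is unchanged, and no
 * direct jump from the invalid state back to a full partition root.
 */
static int prs_check_transition(int old_prs, int new_prs)
{
	if (old_prs == new_prs)
		return 0;              /* nothing to do */
	if (new_prs && old_prs == PRS_ERROR)
		return -EINVAL;        /* must go through "member" first */
	return 0;                      /* the cpumask bookkeeping decides */
}

int main(void)
{
	printf("ERROR -> ENABLED   : %d\n",
	       prs_check_transition(PRS_ERROR, PRS_ENABLED));    /* -EINVAL */
	printf("ERROR -> DISABLED  : %d\n",
	       prs_check_transition(PRS_ERROR, PRS_DISABLED));   /* 0 */
	printf("DISABLED -> ENABLED: %d\n",
	       prs_check_transition(PRS_DISABLED, PRS_ENABLED)); /* 0 */
	return 0;
}
```

The only transition rejected outright is "root invalid" straight back to "root"; everything else is decided by the subparts_cpus updates that follow in the function.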
.. | .. |
---|
1485 | 2192 | goto out_unlock; |
---|
1486 | 2193 | |
---|
1487 | 2194 | cgroup_taskset_for_each(task, css, tset) { |
---|
1488 | | - ret = task_can_attach(task, cs->cpus_allowed); |
---|
| 2195 | + ret = task_can_attach(task, cs->effective_cpus); |
---|
1489 | 2196 | if (ret) |
---|
1490 | 2197 | goto out_unlock; |
---|
1491 | 2198 | ret = security_task_setscheduler(task); |
---|
.. | .. |
---|
1507 | 2214 | static void cpuset_cancel_attach(struct cgroup_taskset *tset) |
---|
1508 | 2215 | { |
---|
1509 | 2216 | struct cgroup_subsys_state *css; |
---|
1510 | | - struct cpuset *cs; |
---|
1511 | 2217 | |
---|
1512 | 2218 | cgroup_taskset_first(tset, &css); |
---|
1513 | | - cs = css_cs(css); |
---|
1514 | 2219 | |
---|
1515 | 2220 | mutex_lock(&cpuset_mutex); |
---|
1516 | 2221 | css_cs(css)->attach_in_progress--; |
---|
.. | .. |
---|
1537 | 2242 | cgroup_taskset_first(tset, &css); |
---|
1538 | 2243 | cs = css_cs(css); |
---|
1539 | 2244 | |
---|
1540 | | - cpus_read_lock(); |
---|
| 2245 | + lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ |
---|
1541 | 2246 | mutex_lock(&cpuset_mutex); |
---|
1542 | | - |
---|
1543 | | - /* prepare for attach */ |
---|
1544 | | - if (cs == &top_cpuset) |
---|
1545 | | - cpumask_copy(cpus_attach, cpu_possible_mask); |
---|
1546 | | - else |
---|
1547 | | - guarantee_online_cpus(cs, cpus_attach); |
---|
1548 | 2247 | |
---|
1549 | 2248 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); |
---|
1550 | 2249 | |
---|
1551 | 2250 | cgroup_taskset_for_each(task, css, tset) { |
---|
| 2251 | + if (cs != &top_cpuset) |
---|
| 2252 | + guarantee_online_cpus(task, cpus_attach); |
---|
| 2253 | + else |
---|
| 2254 | + cpumask_copy(cpus_attach, task_cpu_possible_mask(task)); |
---|
1552 | 2255 | /* |
---|
1553 | 2256 | * can_attach beforehand should guarantee that this doesn't |
---|
1554 | 2257 | * fail. TODO: have a better way to handle failure here |
---|
1555 | 2258 | */ |
---|
1556 | | - WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); |
---|
| 2259 | + WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach)); |
---|
1557 | 2260 | |
---|
1558 | 2261 | cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); |
---|
1559 | 2262 | cpuset_update_task_spread_flag(cs, task); |
---|
.. | .. |
---|
1593 | 2296 | wake_up(&cpuset_attach_wq); |
---|
1594 | 2297 | |
---|
1595 | 2298 | mutex_unlock(&cpuset_mutex); |
---|
1596 | | - cpus_read_unlock(); |
---|
1597 | 2299 | } |
---|
1598 | 2300 | |
---|
1599 | 2301 | /* The various types of files and directories in a cpuset file system */ |
---|
.. | .. |
---|
1604 | 2306 | FILE_MEMLIST, |
---|
1605 | 2307 | FILE_EFFECTIVE_CPULIST, |
---|
1606 | 2308 | FILE_EFFECTIVE_MEMLIST, |
---|
| 2309 | + FILE_SUBPARTS_CPULIST, |
---|
1607 | 2310 | FILE_CPU_EXCLUSIVE, |
---|
1608 | 2311 | FILE_MEM_EXCLUSIVE, |
---|
1609 | 2312 | FILE_MEM_HARDWALL, |
---|
1610 | 2313 | FILE_SCHED_LOAD_BALANCE, |
---|
| 2314 | + FILE_PARTITION_ROOT, |
---|
1611 | 2315 | FILE_SCHED_RELAX_DOMAIN_LEVEL, |
---|
1612 | 2316 | FILE_MEMORY_PRESSURE_ENABLED, |
---|
1613 | 2317 | FILE_MEMORY_PRESSURE, |
---|
.. | .. |
---|
1622 | 2326 | cpuset_filetype_t type = cft->private; |
---|
1623 | 2327 | int retval = 0; |
---|
1624 | 2328 | |
---|
| 2329 | + get_online_cpus(); |
---|
1625 | 2330 | mutex_lock(&cpuset_mutex); |
---|
1626 | 2331 | if (!is_cpuset_online(cs)) { |
---|
1627 | 2332 | retval = -ENODEV; |
---|
.. | .. |
---|
1659 | 2364 | } |
---|
1660 | 2365 | out_unlock: |
---|
1661 | 2366 | mutex_unlock(&cpuset_mutex); |
---|
| 2367 | + put_online_cpus(); |
---|
1662 | 2368 | return retval; |
---|
1663 | 2369 | } |
---|
1664 | 2370 | |
---|
.. | .. |
---|
1669 | 2375 | cpuset_filetype_t type = cft->private; |
---|
1670 | 2376 | int retval = -ENODEV; |
---|
1671 | 2377 | |
---|
| 2378 | + get_online_cpus(); |
---|
1672 | 2379 | mutex_lock(&cpuset_mutex); |
---|
1673 | 2380 | if (!is_cpuset_online(cs)) |
---|
1674 | 2381 | goto out_unlock; |
---|
.. | .. |
---|
1683 | 2390 | } |
---|
1684 | 2391 | out_unlock: |
---|
1685 | 2392 | mutex_unlock(&cpuset_mutex); |
---|
| 2393 | + put_online_cpus(); |
---|
1686 | 2394 | return retval; |
---|
1687 | 2395 | } |
---|
1688 | 2396 | |
---|
.. | .. |
---|
1721 | 2429 | kernfs_break_active_protection(of->kn); |
---|
1722 | 2430 | flush_work(&cpuset_hotplug_work); |
---|
1723 | 2431 | |
---|
| 2432 | + get_online_cpus(); |
---|
1724 | 2433 | mutex_lock(&cpuset_mutex); |
---|
1725 | 2434 | if (!is_cpuset_online(cs)) |
---|
1726 | 2435 | goto out_unlock; |
---|
.. | .. |
---|
1743 | 2452 | break; |
---|
1744 | 2453 | } |
---|
1745 | 2454 | |
---|
1746 | | - free_trial_cpuset(trialcs); |
---|
| 2455 | + free_cpuset(trialcs); |
---|
1747 | 2456 | out_unlock: |
---|
1748 | 2457 | mutex_unlock(&cpuset_mutex); |
---|
| 2458 | + put_online_cpus(); |
---|
1749 | 2459 | kernfs_unbreak_active_protection(of->kn); |
---|
1750 | 2460 | css_put(&cs->css); |
---|
1751 | 2461 | flush_workqueue(cpuset_migrate_mm_wq); |
---|
.. | .. |
---|
1766 | 2476 | cpuset_filetype_t type = seq_cft(sf)->private; |
---|
1767 | 2477 | int ret = 0; |
---|
1768 | 2478 | |
---|
1769 | | - spin_lock_irq(&callback_lock); |
---|
| 2479 | + raw_spin_lock_irq(&callback_lock); |
---|
1770 | 2480 | |
---|
1771 | 2481 | switch (type) { |
---|
1772 | 2482 | case FILE_CPULIST: |
---|
.. | .. |
---|
1781 | 2491 | case FILE_EFFECTIVE_MEMLIST: |
---|
1782 | 2492 | seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); |
---|
1783 | 2493 | break; |
---|
| 2494 | + case FILE_SUBPARTS_CPULIST: |
---|
| 2495 | + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus)); |
---|
| 2496 | + break; |
---|
1784 | 2497 | default: |
---|
1785 | 2498 | ret = -EINVAL; |
---|
1786 | 2499 | } |
---|
1787 | 2500 | |
---|
1788 | | - spin_unlock_irq(&callback_lock); |
---|
| 2501 | + raw_spin_unlock_irq(&callback_lock); |
---|
1789 | 2502 | return ret; |
---|
1790 | 2503 | } |
---|
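The cpumask files handled above, including the new cpuset.cpus.subpartitions, are printed in the kernel's "%*pbl" list form such as "0-3,8". A rough user-space sketch of expanding that form (parse_cpulist() is an illustrative helper limited to CPUs 0-63, not code from this file):

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Sketch: expand a cpulist string such as "0-3,8" into a bitmask.
 * Only handles CPUs 0..63 and does minimal error checking.
 */
static uint64_t parse_cpulist(const char *s)
{
	uint64_t mask = 0;

	while (*s) {
		char *end;
		long lo = strtol(s, &end, 10);
		long hi = lo;

		if (*end == '-')
			hi = strtol(end + 1, &end, 10);
		for (long cpu = lo; cpu <= hi && cpu < 64; cpu++)
			mask |= 1ULL << cpu;
		if (*end != ',')
			break;
		s = end + 1;
	}
	return mask;
}

int main(void)
{
	/* "0-3,8" -> bits 0,1,2,3,8 -> 0x10f */
	printf("0x%llx\n", (unsigned long long)parse_cpulist("0-3,8"));
	return 0;
}
```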
1791 | 2504 | |
---|
.. | .. |
---|
1835 | 2548 | return 0; |
---|
1836 | 2549 | } |
---|
1837 | 2550 | |
---|
| 2551 | +static int sched_partition_show(struct seq_file *seq, void *v) |
---|
| 2552 | +{ |
---|
| 2553 | + struct cpuset *cs = css_cs(seq_css(seq)); |
---|
| 2554 | + |
---|
| 2555 | + switch (cs->partition_root_state) { |
---|
| 2556 | + case PRS_ENABLED: |
---|
| 2557 | + seq_puts(seq, "root\n"); |
---|
| 2558 | + break; |
---|
| 2559 | + case PRS_DISABLED: |
---|
| 2560 | + seq_puts(seq, "member\n"); |
---|
| 2561 | + break; |
---|
| 2562 | + case PRS_ERROR: |
---|
| 2563 | + seq_puts(seq, "root invalid\n"); |
---|
| 2564 | + break; |
---|
| 2565 | + } |
---|
| 2566 | + return 0; |
---|
| 2567 | +} |
---|
| 2568 | + |
---|
| 2569 | +static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, |
---|
| 2570 | + size_t nbytes, loff_t off) |
---|
| 2571 | +{ |
---|
| 2572 | + struct cpuset *cs = css_cs(of_css(of)); |
---|
| 2573 | + int val; |
---|
| 2574 | + int retval = -ENODEV; |
---|
| 2575 | + |
---|
| 2576 | + buf = strstrip(buf); |
---|
| 2577 | + |
---|
| 2578 | + /* |
---|
| 2579 | + * Convert "root" to ENABLED, and convert "member" to DISABLED. |
---|
| 2580 | + */ |
---|
| 2581 | + if (!strcmp(buf, "root")) |
---|
| 2582 | + val = PRS_ENABLED; |
---|
| 2583 | + else if (!strcmp(buf, "member")) |
---|
| 2584 | + val = PRS_DISABLED; |
---|
| 2585 | + else |
---|
| 2586 | + return -EINVAL; |
---|
| 2587 | + |
---|
| 2588 | + css_get(&cs->css); |
---|
| 2589 | + get_online_cpus(); |
---|
| 2590 | + mutex_lock(&cpuset_mutex); |
---|
| 2591 | + if (!is_cpuset_online(cs)) |
---|
| 2592 | + goto out_unlock; |
---|
| 2593 | + |
---|
| 2594 | + retval = update_prstate(cs, val); |
---|
| 2595 | +out_unlock: |
---|
| 2596 | + mutex_unlock(&cpuset_mutex); |
---|
| 2597 | + put_online_cpus(); |
---|
| 2598 | + css_put(&cs->css); |
---|
| 2599 | + return retval ?: nbytes; |
---|
| 2600 | +} |
---|
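As a usage sketch for the two handlers above (the /sys/fs/cgroup path and the "demo" group name are illustrative; they assume a cgroup v2 mount with the cpuset controller enabled via "+cpuset" in the parent's cgroup.subtree_control):

```c
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

/* Write a short string to a cgroup control file, e.g. "root". */
static int cg_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0) {
		perror(path);
		if (fd >= 0)
			close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	/* Assumed layout: /sys/fs/cgroup is a cgroup v2 mount. */
	mkdir("/sys/fs/cgroup/demo", 0755);   /* ignore EEXIST on re-runs */

	/*
	 * Give the group an exclusive-capable CPU list, then make it a
	 * partition root; reading cpuset.cpus.partition back reports
	 * "root invalid" if the request could not be honoured.
	 */
	cg_write("/sys/fs/cgroup/demo/cpuset.cpus", "2-3");
	return cg_write("/sys/fs/cgroup/demo/cpuset.cpus.partition", "root");
}
```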
1838 | 2601 | |
---|
1839 | 2602 | /* |
---|
1840 | 2603 | * for the common functions, 'private' gives the type of file |
---|
1841 | 2604 | */ |
---|
1842 | 2605 | |
---|
1843 | | -static struct cftype files[] = { |
---|
| 2606 | +static struct cftype legacy_files[] = { |
---|
1844 | 2607 | { |
---|
1845 | 2608 | .name = "cpus", |
---|
1846 | 2609 | .seq_show = cpuset_common_seq_show, |
---|
.. | .. |
---|
1943 | 2706 | }; |
---|
1944 | 2707 | |
---|
1945 | 2708 | /* |
---|
| 2709 | + * This is currently a minimal set for the default hierarchy. It can be |
---|
| 2710 | + * expanded later on by migrating more features and control files from v1. |
---|
| 2711 | + */ |
---|
| 2712 | +static struct cftype dfl_files[] = { |
---|
| 2713 | + { |
---|
| 2714 | + .name = "cpus", |
---|
| 2715 | + .seq_show = cpuset_common_seq_show, |
---|
| 2716 | + .write = cpuset_write_resmask, |
---|
| 2717 | + .max_write_len = (100U + 6 * NR_CPUS), |
---|
| 2718 | + .private = FILE_CPULIST, |
---|
| 2719 | + .flags = CFTYPE_NOT_ON_ROOT, |
---|
| 2720 | + }, |
---|
| 2721 | + |
---|
| 2722 | + { |
---|
| 2723 | + .name = "mems", |
---|
| 2724 | + .seq_show = cpuset_common_seq_show, |
---|
| 2725 | + .write = cpuset_write_resmask, |
---|
| 2726 | + .max_write_len = (100U + 6 * MAX_NUMNODES), |
---|
| 2727 | + .private = FILE_MEMLIST, |
---|
| 2728 | + .flags = CFTYPE_NOT_ON_ROOT, |
---|
| 2729 | + }, |
---|
| 2730 | + |
---|
| 2731 | + { |
---|
| 2732 | + .name = "cpus.effective", |
---|
| 2733 | + .seq_show = cpuset_common_seq_show, |
---|
| 2734 | + .private = FILE_EFFECTIVE_CPULIST, |
---|
| 2735 | + }, |
---|
| 2736 | + |
---|
| 2737 | + { |
---|
| 2738 | + .name = "mems.effective", |
---|
| 2739 | + .seq_show = cpuset_common_seq_show, |
---|
| 2740 | + .private = FILE_EFFECTIVE_MEMLIST, |
---|
| 2741 | + }, |
---|
| 2742 | + |
---|
| 2743 | + { |
---|
| 2744 | + .name = "cpus.partition", |
---|
| 2745 | + .seq_show = sched_partition_show, |
---|
| 2746 | + .write = sched_partition_write, |
---|
| 2747 | + .private = FILE_PARTITION_ROOT, |
---|
| 2748 | + .flags = CFTYPE_NOT_ON_ROOT, |
---|
| 2749 | + }, |
---|
| 2750 | + |
---|
| 2751 | + { |
---|
| 2752 | + .name = "cpus.subpartitions", |
---|
| 2753 | + .seq_show = cpuset_common_seq_show, |
---|
| 2754 | + .private = FILE_SUBPARTS_CPULIST, |
---|
| 2755 | + .flags = CFTYPE_DEBUG, |
---|
| 2756 | + }, |
---|
| 2757 | + |
---|
| 2758 | + { } /* terminate */ |
---|
| 2759 | +}; |
---|
| 2760 | + |
---|
| 2761 | + |
---|
| 2762 | +/* |
---|
1946 | 2763 | * cpuset_css_alloc - allocate a cpuset css |
---|
1947 | 2764 | * cgrp: control group that the new cpuset will be part of |
---|
1948 | 2765 | */ |
---|
.. | .. |
---|
1958 | 2775 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); |
---|
1959 | 2776 | if (!cs) |
---|
1960 | 2777 | return ERR_PTR(-ENOMEM); |
---|
1961 | | - if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) |
---|
1962 | | - goto free_cs; |
---|
1963 | | - if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) |
---|
1964 | | - goto free_allowed; |
---|
1965 | | - if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) |
---|
1966 | | - goto free_requested; |
---|
| 2778 | + |
---|
| 2779 | + if (alloc_cpumasks(cs, NULL)) { |
---|
| 2780 | + kfree(cs); |
---|
| 2781 | + return ERR_PTR(-ENOMEM); |
---|
| 2782 | + } |
---|
1967 | 2783 | |
---|
1968 | 2784 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
---|
1969 | | - cpumask_clear(cs->cpus_allowed); |
---|
1970 | | - cpumask_clear(cs->cpus_requested); |
---|
1971 | 2785 | nodes_clear(cs->mems_allowed); |
---|
1972 | | - cpumask_clear(cs->effective_cpus); |
---|
1973 | 2786 | nodes_clear(cs->effective_mems); |
---|
1974 | 2787 | fmeter_init(&cs->fmeter); |
---|
1975 | 2788 | cs->relax_domain_level = -1; |
---|
1976 | 2789 | |
---|
1977 | 2790 | return &cs->css; |
---|
1978 | | - |
---|
1979 | | -free_requested: |
---|
1980 | | - free_cpumask_var(cs->cpus_requested); |
---|
1981 | | -free_allowed: |
---|
1982 | | - free_cpumask_var(cs->cpus_allowed); |
---|
1983 | | -free_cs: |
---|
1984 | | - kfree(cs); |
---|
1985 | | - return ERR_PTR(-ENOMEM); |
---|
1986 | 2791 | } |
---|
1987 | 2792 | |
---|
1988 | 2793 | static int cpuset_css_online(struct cgroup_subsys_state *css) |
---|
.. | .. |
---|
1995 | 2800 | if (!parent) |
---|
1996 | 2801 | return 0; |
---|
1997 | 2802 | |
---|
| 2803 | + get_online_cpus(); |
---|
1998 | 2804 | mutex_lock(&cpuset_mutex); |
---|
1999 | 2805 | |
---|
2000 | 2806 | set_bit(CS_ONLINE, &cs->flags); |
---|
.. | .. |
---|
2005 | 2811 | |
---|
2006 | 2812 | cpuset_inc(); |
---|
2007 | 2813 | |
---|
2008 | | - spin_lock_irq(&callback_lock); |
---|
| 2814 | + raw_spin_lock_irq(&callback_lock); |
---|
2009 | 2815 | if (is_in_v2_mode()) { |
---|
2010 | 2816 | cpumask_copy(cs->effective_cpus, parent->effective_cpus); |
---|
2011 | 2817 | cs->effective_mems = parent->effective_mems; |
---|
| 2818 | + cs->use_parent_ecpus = true; |
---|
| 2819 | + parent->child_ecpus_count++; |
---|
2012 | 2820 | } |
---|
2013 | | - spin_unlock_irq(&callback_lock); |
---|
| 2821 | + raw_spin_unlock_irq(&callback_lock); |
---|
2014 | 2822 | |
---|
2015 | 2823 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) |
---|
2016 | 2824 | goto out_unlock; |
---|
.. | .. |
---|
2037 | 2845 | } |
---|
2038 | 2846 | rcu_read_unlock(); |
---|
2039 | 2847 | |
---|
2040 | | - spin_lock_irq(&callback_lock); |
---|
| 2848 | + raw_spin_lock_irq(&callback_lock); |
---|
2041 | 2849 | cs->mems_allowed = parent->mems_allowed; |
---|
2042 | 2850 | cs->effective_mems = parent->mems_allowed; |
---|
2043 | 2851 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); |
---|
2044 | 2852 | cpumask_copy(cs->cpus_requested, parent->cpus_requested); |
---|
2045 | 2853 | cpumask_copy(cs->effective_cpus, parent->cpus_allowed); |
---|
2046 | | - spin_unlock_irq(&callback_lock); |
---|
| 2854 | + raw_spin_unlock_irq(&callback_lock); |
---|
2047 | 2855 | out_unlock: |
---|
2048 | 2856 | mutex_unlock(&cpuset_mutex); |
---|
| 2857 | + put_online_cpus(); |
---|
2049 | 2858 | return 0; |
---|
2050 | 2859 | } |
---|
2051 | 2860 | |
---|
2052 | 2861 | /* |
---|
2053 | 2862 | * If the cpuset being removed has its flag 'sched_load_balance' |
---|
2054 | 2863 | * enabled, then simulate turning sched_load_balance off, which |
---|
2055 | | - * will call rebuild_sched_domains_locked(). |
---|
| 2864 | + * will call rebuild_sched_domains_locked(). That is not needed |
---|
| 2865 | + * in the default hierarchy where only changes in partition |
---|
| 2866 | + * will cause repartitioning. |
---|
| 2867 | + * |
---|
| 2868 | + * If the cpuset has the 'sched.partition' flag enabled, simulate |
---|
| 2869 | + * turning 'sched.partition' off. |
---|
2056 | 2870 | */ |
---|
2057 | 2871 | |
---|
2058 | 2872 | static void cpuset_css_offline(struct cgroup_subsys_state *css) |
---|
2059 | 2873 | { |
---|
2060 | 2874 | struct cpuset *cs = css_cs(css); |
---|
2061 | 2875 | |
---|
| 2876 | + get_online_cpus(); |
---|
2062 | 2877 | mutex_lock(&cpuset_mutex); |
---|
2063 | 2878 | |
---|
2064 | | - if (is_sched_load_balance(cs)) |
---|
| 2879 | + if (is_partition_root(cs)) |
---|
| 2880 | + update_prstate(cs, 0); |
---|
| 2881 | + |
---|
| 2882 | + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && |
---|
| 2883 | + is_sched_load_balance(cs)) |
---|
2065 | 2884 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); |
---|
| 2885 | + |
---|
| 2886 | + if (cs->use_parent_ecpus) { |
---|
| 2887 | + struct cpuset *parent = parent_cs(cs); |
---|
| 2888 | + |
---|
| 2889 | + cs->use_parent_ecpus = false; |
---|
| 2890 | + parent->child_ecpus_count--; |
---|
| 2891 | + } |
---|
2066 | 2892 | |
---|
2067 | 2893 | cpuset_dec(); |
---|
2068 | 2894 | clear_bit(CS_ONLINE, &cs->flags); |
---|
2069 | 2895 | |
---|
2070 | 2896 | mutex_unlock(&cpuset_mutex); |
---|
| 2897 | + put_online_cpus(); |
---|
2071 | 2898 | } |
---|
2072 | 2899 | |
---|
2073 | 2900 | static void cpuset_css_free(struct cgroup_subsys_state *css) |
---|
2074 | 2901 | { |
---|
2075 | 2902 | struct cpuset *cs = css_cs(css); |
---|
2076 | 2903 | |
---|
2077 | | - free_cpumask_var(cs->effective_cpus); |
---|
2078 | | - free_cpumask_var(cs->cpus_allowed); |
---|
2079 | | - free_cpumask_var(cs->cpus_requested); |
---|
2080 | | - kfree(cs); |
---|
| 2904 | + free_cpuset(cs); |
---|
2081 | 2905 | } |
---|
2082 | 2906 | |
---|
2083 | 2907 | static void cpuset_bind(struct cgroup_subsys_state *root_css) |
---|
2084 | 2908 | { |
---|
2085 | 2909 | mutex_lock(&cpuset_mutex); |
---|
2086 | | - spin_lock_irq(&callback_lock); |
---|
| 2910 | + raw_spin_lock_irq(&callback_lock); |
---|
2087 | 2911 | |
---|
2088 | 2912 | if (is_in_v2_mode()) { |
---|
2089 | 2913 | cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); |
---|
.. | .. |
---|
2094 | 2918 | top_cpuset.mems_allowed = top_cpuset.effective_mems; |
---|
2095 | 2919 | } |
---|
2096 | 2920 | |
---|
2097 | | - spin_unlock_irq(&callback_lock); |
---|
| 2921 | + raw_spin_unlock_irq(&callback_lock); |
---|
2098 | 2922 | mutex_unlock(&cpuset_mutex); |
---|
2099 | 2923 | } |
---|
2100 | 2924 | |
---|
.. | .. |
---|
2105 | 2929 | */ |
---|
2106 | 2930 | static void cpuset_fork(struct task_struct *task) |
---|
2107 | 2931 | { |
---|
| 2932 | + int inherit_cpus = 0; |
---|
2108 | 2933 | if (task_css_is_root(task, cpuset_cgrp_id)) |
---|
2109 | 2934 | return; |
---|
2110 | 2935 | |
---|
2111 | | - set_cpus_allowed_ptr(task, ¤t->cpus_allowed); |
---|
| 2936 | + trace_android_rvh_cpuset_fork(task, &inherit_cpus); |
---|
| 2937 | + if (!inherit_cpus) |
---|
| 2938 | + set_cpus_allowed_ptr(task, current->cpus_ptr); |
---|
2112 | 2939 | task->mems_allowed = current->mems_allowed; |
---|
2113 | 2940 | } |
---|
2114 | 2941 | |
---|
.. | .. |
---|
2123 | 2950 | .post_attach = cpuset_post_attach, |
---|
2124 | 2951 | .bind = cpuset_bind, |
---|
2125 | 2952 | .fork = cpuset_fork, |
---|
2126 | | - .legacy_cftypes = files, |
---|
| 2953 | + .legacy_cftypes = legacy_files, |
---|
| 2954 | + .dfl_cftypes = dfl_files, |
---|
2127 | 2955 | .early_init = true, |
---|
| 2956 | + .threaded = true, |
---|
2128 | 2957 | }; |
---|
2129 | 2958 | |
---|
2130 | 2959 | /** |
---|
2131 | 2960 | * cpuset_init - initialize cpusets at system boot |
---|
2132 | 2961 | * |
---|
2133 | | - * Description: Initialize top_cpuset and the cpuset internal file system, |
---|
| 2962 | + * Description: Initialize top_cpuset |
---|
2134 | 2963 | **/ |
---|
2135 | 2964 | |
---|
2136 | 2965 | int __init cpuset_init(void) |
---|
2137 | 2966 | { |
---|
2138 | | - int err = 0; |
---|
2139 | | - |
---|
2140 | 2967 | BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); |
---|
2141 | 2968 | BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); |
---|
| 2969 | + BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); |
---|
2142 | 2970 | BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)); |
---|
2143 | 2971 | |
---|
2144 | 2972 | cpumask_setall(top_cpuset.cpus_allowed); |
---|
.. | .. |
---|
2150 | 2978 | fmeter_init(&top_cpuset.fmeter); |
---|
2151 | 2979 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); |
---|
2152 | 2980 | top_cpuset.relax_domain_level = -1; |
---|
2153 | | - |
---|
2154 | | - err = register_filesystem(&cpuset_fs_type); |
---|
2155 | | - if (err < 0) |
---|
2156 | | - return err; |
---|
2157 | 2981 | |
---|
2158 | 2982 | BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); |
---|
2159 | 2983 | |
---|
.. | .. |
---|
2194 | 3018 | { |
---|
2195 | 3019 | bool is_empty; |
---|
2196 | 3020 | |
---|
2197 | | - spin_lock_irq(&callback_lock); |
---|
| 3021 | + raw_spin_lock_irq(&callback_lock); |
---|
2198 | 3022 | cpumask_copy(cs->cpus_allowed, new_cpus); |
---|
2199 | 3023 | cpumask_copy(cs->effective_cpus, new_cpus); |
---|
2200 | 3024 | cs->mems_allowed = *new_mems; |
---|
2201 | 3025 | cs->effective_mems = *new_mems; |
---|
2202 | | - spin_unlock_irq(&callback_lock); |
---|
| 3026 | + raw_spin_unlock_irq(&callback_lock); |
---|
2203 | 3027 | |
---|
2204 | 3028 | /* |
---|
2205 | 3029 | * Don't call update_tasks_cpumask() if the cpuset becomes empty, |
---|
.. | .. |
---|
2236 | 3060 | if (nodes_empty(*new_mems)) |
---|
2237 | 3061 | *new_mems = parent_cs(cs)->effective_mems; |
---|
2238 | 3062 | |
---|
2239 | | - spin_lock_irq(&callback_lock); |
---|
| 3063 | + raw_spin_lock_irq(&callback_lock); |
---|
2240 | 3064 | cpumask_copy(cs->effective_cpus, new_cpus); |
---|
2241 | 3065 | cs->effective_mems = *new_mems; |
---|
2242 | | - spin_unlock_irq(&callback_lock); |
---|
| 3066 | + raw_spin_unlock_irq(&callback_lock); |
---|
2243 | 3067 | |
---|
2244 | 3068 | if (cpus_updated) |
---|
2245 | 3069 | update_tasks_cpumask(cs); |
---|
.. | .. |
---|
2247 | 3071 | update_tasks_nodemask(cs); |
---|
2248 | 3072 | } |
---|
2249 | 3073 | |
---|
| 3074 | +static bool force_rebuild; |
---|
| 3075 | + |
---|
| 3076 | +void cpuset_force_rebuild(void) |
---|
| 3077 | +{ |
---|
| 3078 | + force_rebuild = true; |
---|
| 3079 | +} |
---|
| 3080 | + |
---|
2250 | 3081 | /** |
---|
2251 | 3082 | * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug |
---|
2252 | 3083 | * @cs: cpuset in interest |
---|
| 3084 | + * @tmp: the tmpmasks structure pointer |
---|
2253 | 3085 | * |
---|
2254 | 3086 | * Compare @cs's cpu and mem masks against top_cpuset and if some have gone |
---|
2255 | 3087 | * offline, update @cs accordingly. If @cs ends up with no CPU or memory, |
---|
2256 | 3088 | * all its tasks are moved to the nearest ancestor with both resources. |
---|
2257 | 3089 | */ |
---|
2258 | | -static void cpuset_hotplug_update_tasks(struct cpuset *cs) |
---|
| 3090 | +static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) |
---|
2259 | 3091 | { |
---|
2260 | 3092 | static cpumask_t new_cpus; |
---|
2261 | 3093 | static nodemask_t new_mems; |
---|
2262 | 3094 | bool cpus_updated; |
---|
2263 | 3095 | bool mems_updated; |
---|
| 3096 | + struct cpuset *parent; |
---|
2264 | 3097 | retry: |
---|
2265 | 3098 | wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); |
---|
2266 | 3099 | |
---|
.. | .. |
---|
2275 | 3108 | goto retry; |
---|
2276 | 3109 | } |
---|
2277 | 3110 | |
---|
2278 | | - cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus); |
---|
2279 | | - nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); |
---|
| 3111 | + parent = parent_cs(cs); |
---|
| 3112 | + compute_effective_cpumask(&new_cpus, cs, parent); |
---|
| 3113 | + nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); |
---|
2280 | 3114 | |
---|
| 3115 | + if (cs->nr_subparts_cpus) |
---|
| 3116 | + /* |
---|
| 3117 | + * Make sure that CPUs allocated to child partitions |
---|
| 3118 | + * do not show up in effective_cpus. |
---|
| 3119 | + */ |
---|
| 3120 | + cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus); |
---|
| 3121 | + |
---|
| 3122 | + if (!tmp || !cs->partition_root_state) |
---|
| 3123 | + goto update_tasks; |
---|
| 3124 | + |
---|
| 3125 | + /* |
---|
| 3126 | + * In the unlikely event that a partition root has empty |
---|
| 3127 | + * effective_cpus or its parent becomes erroneous, we have to |
---|
| 3128 | + * transition it to the erroneous state. |
---|
| 3129 | + */ |
---|
| 3130 | + if (is_partition_root(cs) && (cpumask_empty(&new_cpus) || |
---|
| 3131 | + (parent->partition_root_state == PRS_ERROR))) { |
---|
| 3132 | + if (cs->nr_subparts_cpus) { |
---|
| 3133 | + raw_spin_lock_irq(&callback_lock); |
---|
| 3134 | + cs->nr_subparts_cpus = 0; |
---|
| 3135 | + cpumask_clear(cs->subparts_cpus); |
---|
| 3136 | + raw_spin_unlock_irq(&callback_lock); |
---|
| 3137 | + compute_effective_cpumask(&new_cpus, cs, parent); |
---|
| 3138 | + } |
---|
| 3139 | + |
---|
| 3140 | + /* |
---|
| 3141 | + * If effective_cpus is empty because the child |
---|
| 3142 | + * partitions take away all the CPUs, we can keep |
---|
| 3143 | + * the current partition and let the child partitions |
---|
| 3144 | + * fight for available CPUs. |
---|
| 3145 | + */ |
---|
| 3146 | + if ((parent->partition_root_state == PRS_ERROR) || |
---|
| 3147 | + cpumask_empty(&new_cpus)) { |
---|
| 3148 | + update_parent_subparts_cpumask(cs, partcmd_disable, |
---|
| 3149 | + NULL, tmp); |
---|
| 3150 | + raw_spin_lock_irq(&callback_lock); |
---|
| 3151 | + cs->partition_root_state = PRS_ERROR; |
---|
| 3152 | + raw_spin_unlock_irq(&callback_lock); |
---|
| 3153 | + } |
---|
| 3154 | + cpuset_force_rebuild(); |
---|
| 3155 | + } |
---|
| 3156 | + |
---|
| 3157 | + /* |
---|
| 3158 | + * On the other hand, an erroneous partition root may be transitioned |
---|
| 3159 | + * back to a regular one or a partition root with no CPU allocated |
---|
| 3160 | + * from the parent may change to erroneous. |
---|
| 3161 | + */ |
---|
| 3162 | + if (is_partition_root(parent) && |
---|
| 3163 | + ((cs->partition_root_state == PRS_ERROR) || |
---|
| 3164 | + !cpumask_intersects(&new_cpus, parent->subparts_cpus)) && |
---|
| 3165 | + update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp)) |
---|
| 3166 | + cpuset_force_rebuild(); |
---|
| 3167 | + |
---|
| 3168 | +update_tasks: |
---|
2281 | 3169 | cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); |
---|
2282 | 3170 | mems_updated = !nodes_equal(new_mems, cs->effective_mems); |
---|
2283 | 3171 | |
---|
.. | .. |
---|
2289 | 3177 | cpus_updated, mems_updated); |
---|
2290 | 3178 | |
---|
2291 | 3179 | mutex_unlock(&cpuset_mutex); |
---|
2292 | | -} |
---|
2293 | | - |
---|
2294 | | -static bool force_rebuild; |
---|
2295 | | - |
---|
2296 | | -void cpuset_force_rebuild(void) |
---|
2297 | | -{ |
---|
2298 | | - force_rebuild = true; |
---|
2299 | 3180 | } |
---|
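A compact model of the hotplug decisions above for a cpuset that is currently a partition root, using plain 64-bit masks in place of cpumask_t (hotplug_new_effective() is an illustrative name; the real code also consults the parent's state and goes through update_parent_subparts_cpumask()):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Sketch: after hotplug, a cpuset's effective CPUs are whatever the
 * parent still offers, minus the CPUs this cpuset has handed to its own
 * child partitions.  A partition root that ends up with nothing (or
 * whose parent is already invalid) first reclaims its subparts and, if
 * that still leaves nothing, drops to the "root invalid" state.
 */
static uint64_t hotplug_new_effective(uint64_t cpus_allowed,
				      uint64_t parent_effective,
				      uint64_t *subparts,
				      bool parent_invalid,
				      bool *became_invalid)
{
	uint64_t new_cpus = cpus_allowed & parent_effective;

	new_cpus &= ~*subparts;
	*became_invalid = false;

	if (!new_cpus || parent_invalid) {
		/* Reclaim CPUs lent to child partitions first. */
		*subparts = 0;
		new_cpus = cpus_allowed & parent_effective;
		if (!new_cpus || parent_invalid)
			*became_invalid = true;   /* PRS_ERROR */
	}
	return new_cpus;
}

int main(void)
{
	uint64_t subparts = 0x3;          /* CPUs 0-1 lent to children */
	bool invalid;
	uint64_t eff = hotplug_new_effective(0x3, 0x3, &subparts,
					     false, &invalid);

	/* All remaining CPUs were lent out: reclaim them, stay valid. */
	printf("effective=%#llx subparts=%#llx invalid=%d\n",
	       (unsigned long long)eff, (unsigned long long)subparts,
	       invalid);
	return 0;
}
```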
2300 | 3181 | |
---|
2301 | 3182 | /** |
---|
.. | .. |
---|
2314 | 3195 | * Note that CPU offlining during suspend is ignored. We don't modify |
---|
2315 | 3196 | * cpusets across suspend/resume cycles at all. |
---|
2316 | 3197 | */ |
---|
2317 | | -static void cpuset_hotplug_workfn(struct work_struct *work) |
---|
| 3198 | +void cpuset_hotplug_workfn(struct work_struct *work) |
---|
2318 | 3199 | { |
---|
2319 | 3200 | static cpumask_t new_cpus; |
---|
2320 | 3201 | static nodemask_t new_mems; |
---|
2321 | 3202 | bool cpus_updated, mems_updated; |
---|
2322 | 3203 | bool on_dfl = is_in_v2_mode(); |
---|
| 3204 | + struct tmpmasks tmp, *ptmp = NULL; |
---|
| 3205 | + |
---|
| 3206 | + if (on_dfl && !alloc_cpumasks(NULL, &tmp)) |
---|
| 3207 | + ptmp = &tmp; |
---|
2323 | 3208 | |
---|
2324 | 3209 | mutex_lock(&cpuset_mutex); |
---|
2325 | 3210 | |
---|
.. | .. |
---|
2327 | 3212 | cpumask_copy(&new_cpus, cpu_active_mask); |
---|
2328 | 3213 | new_mems = node_states[N_MEMORY]; |
---|
2329 | 3214 | |
---|
| 3215 | + /* |
---|
| 3216 | + * If subparts_cpus is populated, it is likely that the check below |
---|
| 3217 | + * will produce a false positive on cpus_updated when the cpu list |
---|
| 3218 | + * isn't changed. It is extra work, but it is better to be safe. |
---|
| 3219 | + */ |
---|
2330 | 3220 | cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); |
---|
2331 | 3221 | mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); |
---|
2332 | 3222 | |
---|
| 3223 | + /* |
---|
| 3224 | + * In the rare case that hotplug removes all the cpus in subparts_cpus, |
---|
| 3225 | + * we assumed that cpus are updated. |
---|
| 3226 | + * we assume that cpus are updated. |
---|
| 3227 | + if (!cpus_updated && top_cpuset.nr_subparts_cpus) |
---|
| 3228 | + cpus_updated = true; |
---|
| 3229 | + |
---|
2333 | 3230 | /* synchronize cpus_allowed to cpu_active_mask */ |
---|
2334 | 3231 | if (cpus_updated) { |
---|
2335 | | - spin_lock_irq(&callback_lock); |
---|
| 3232 | + raw_spin_lock_irq(&callback_lock); |
---|
2336 | 3233 | if (!on_dfl) |
---|
2337 | 3234 | cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); |
---|
| 3235 | + /* |
---|
| 3236 | + * Make sure that CPUs allocated to child partitions |
---|
| 3237 | + * do not show up in effective_cpus. If no CPU is left, |
---|
| 3238 | + * we clear the subparts_cpus & let the child partitions |
---|
| 3239 | + * fight for the CPUs again. |
---|
| 3240 | + */ |
---|
| 3241 | + if (top_cpuset.nr_subparts_cpus) { |
---|
| 3242 | + if (cpumask_subset(&new_cpus, |
---|
| 3243 | + top_cpuset.subparts_cpus)) { |
---|
| 3244 | + top_cpuset.nr_subparts_cpus = 0; |
---|
| 3245 | + cpumask_clear(top_cpuset.subparts_cpus); |
---|
| 3246 | + } else { |
---|
| 3247 | + cpumask_andnot(&new_cpus, &new_cpus, |
---|
| 3248 | + top_cpuset.subparts_cpus); |
---|
| 3249 | + } |
---|
| 3250 | + } |
---|
2338 | 3251 | cpumask_copy(top_cpuset.effective_cpus, &new_cpus); |
---|
2339 | | - spin_unlock_irq(&callback_lock); |
---|
| 3252 | + raw_spin_unlock_irq(&callback_lock); |
---|
2340 | 3253 | /* we don't mess with cpumasks of tasks in top_cpuset */ |
---|
2341 | 3254 | } |
---|
2342 | 3255 | |
---|
2343 | 3256 | /* synchronize mems_allowed to N_MEMORY */ |
---|
2344 | 3257 | if (mems_updated) { |
---|
2345 | | - spin_lock_irq(&callback_lock); |
---|
| 3258 | + raw_spin_lock_irq(&callback_lock); |
---|
2346 | 3259 | if (!on_dfl) |
---|
2347 | 3260 | top_cpuset.mems_allowed = new_mems; |
---|
2348 | 3261 | top_cpuset.effective_mems = new_mems; |
---|
2349 | | - spin_unlock_irq(&callback_lock); |
---|
| 3262 | + raw_spin_unlock_irq(&callback_lock); |
---|
2350 | 3263 | update_tasks_nodemask(&top_cpuset); |
---|
2351 | 3264 | } |
---|
2352 | 3265 | |
---|
.. | .. |
---|
2363 | 3276 | continue; |
---|
2364 | 3277 | rcu_read_unlock(); |
---|
2365 | 3278 | |
---|
2366 | | - cpuset_hotplug_update_tasks(cs); |
---|
| 3279 | + cpuset_hotplug_update_tasks(cs, ptmp); |
---|
2367 | 3280 | |
---|
2368 | 3281 | rcu_read_lock(); |
---|
2369 | 3282 | css_put(&cs->css); |
---|
.. | .. |
---|
2376 | 3289 | force_rebuild = false; |
---|
2377 | 3290 | rebuild_sched_domains(); |
---|
2378 | 3291 | } |
---|
| 3292 | + |
---|
| 3293 | + free_cpumasks(NULL, ptmp); |
---|
2379 | 3294 | } |
---|
2380 | 3295 | |
---|
2381 | 3296 | void cpuset_update_active_cpus(void) |
---|
.. | .. |
---|
2386 | 3301 | * to a work item to avoid reverse locking order. |
---|
2387 | 3302 | */ |
---|
2388 | 3303 | schedule_work(&cpuset_hotplug_work); |
---|
| 3304 | +} |
---|
| 3305 | + |
---|
| 3306 | +void cpuset_update_active_cpus_affine(int cpu) |
---|
| 3307 | +{ |
---|
| 3308 | + schedule_work_on(cpu, &cpuset_hotplug_work); |
---|
2389 | 3309 | } |
---|
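cpuset_update_active_cpus_affine() simply queues the existing hotplug work item on a chosen CPU. A minimal out-of-tree sketch of that deferral pattern (module, work item, and handler names are illustrative, not code from this file):

```c
// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

/* Normally runs on the CPU it was queued on (hotplug may migrate it). */
static void demo_workfn(struct work_struct *work)
{
	pr_info("demo work running on CPU %d\n", raw_smp_processor_id());
}

static DECLARE_WORK(demo_work, demo_workfn);

static int __init demo_init(void)
{
	/* Queue on CPU 0, mirroring schedule_work_on() in the hook above. */
	schedule_work_on(0, &demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	flush_work(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
```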
2390 | 3310 | |
---|
2391 | 3311 | void cpuset_wait_for_hotplug(void) |
---|
.. | .. |
---|
2417 | 3337 | */ |
---|
2418 | 3338 | void __init cpuset_init_smp(void) |
---|
2419 | 3339 | { |
---|
2420 | | - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
---|
2421 | | - top_cpuset.mems_allowed = node_states[N_MEMORY]; |
---|
| 3340 | + /* |
---|
| 3341 | + * cpus_allowed/mems_allowed set to v2 values in the initial |
---|
| 3342 | + * cpuset_bind() call will be reset to v1 values in another |
---|
| 3343 | + * cpuset_bind() call when v1 cpuset is mounted. |
---|
| 3344 | + */ |
---|
2422 | 3345 | top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; |
---|
2423 | 3346 | |
---|
2424 | 3347 | cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); |
---|
.. | .. |
---|
2445 | 3368 | { |
---|
2446 | 3369 | unsigned long flags; |
---|
2447 | 3370 | |
---|
2448 | | - spin_lock_irqsave(&callback_lock, flags); |
---|
| 3371 | + raw_spin_lock_irqsave(&callback_lock, flags); |
---|
2449 | 3372 | rcu_read_lock(); |
---|
2450 | | - guarantee_online_cpus(task_cs(tsk), pmask); |
---|
| 3373 | + guarantee_online_cpus(tsk, pmask); |
---|
2451 | 3374 | rcu_read_unlock(); |
---|
2452 | | - spin_unlock_irqrestore(&callback_lock, flags); |
---|
| 3375 | + raw_spin_unlock_irqrestore(&callback_lock, flags); |
---|
2453 | 3376 | } |
---|
2454 | | - |
---|
| 3377 | +EXPORT_SYMBOL_GPL(cpuset_cpus_allowed); |
---|
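With the export added above, a GPL module can query a task's cpuset-allowed CPUs directly. A minimal sketch (the module itself is illustrative, not part of this file):

```c
// SPDX-License-Identifier: GPL-2.0
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/sched.h>

static int __init cpuset_query_init(void)
{
	cpumask_var_t mask;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	/* Ask the cpuset subsystem which CPUs this task may run on. */
	cpuset_cpus_allowed(current, mask);
	pr_info("insmod task may run on CPUs %*pbl\n", cpumask_pr_args(mask));

	free_cpumask_var(mask);
	return 0;
}

static void __exit cpuset_query_exit(void)
{
}

module_init(cpuset_query_init);
module_exit(cpuset_query_exit);
MODULE_LICENSE("GPL");
```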
2455 | 3378 | /** |
---|
2456 | 3379 | * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. |
---|
2457 | 3380 | * @tsk: pointer to task_struct with which the scheduler is struggling |
---|
.. | .. |
---|
2466 | 3389 | |
---|
2467 | 3390 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) |
---|
2468 | 3391 | { |
---|
| 3392 | + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); |
---|
| 3393 | + const struct cpumask *cs_mask; |
---|
| 3394 | + |
---|
2469 | 3395 | rcu_read_lock(); |
---|
2470 | | - do_set_cpus_allowed(tsk, is_in_v2_mode() ? |
---|
2471 | | - task_cs(tsk)->cpus_allowed : cpu_possible_mask); |
---|
| 3396 | + cs_mask = task_cs(tsk)->cpus_allowed; |
---|
| 3397 | + |
---|
| 3398 | + if (!is_in_v2_mode() || !cpumask_subset(cs_mask, possible_mask)) |
---|
| 3399 | + goto unlock; /* select_fallback_rq will try harder */ |
---|
| 3400 | + |
---|
| 3401 | + do_set_cpus_allowed(tsk, cs_mask); |
---|
| 3402 | +unlock: |
---|
2472 | 3403 | rcu_read_unlock(); |
---|
2473 | 3404 | |
---|
2474 | 3405 | /* |
---|
.. | .. |
---|
2510 | 3441 | nodemask_t mask; |
---|
2511 | 3442 | unsigned long flags; |
---|
2512 | 3443 | |
---|
2513 | | - spin_lock_irqsave(&callback_lock, flags); |
---|
| 3444 | + raw_spin_lock_irqsave(&callback_lock, flags); |
---|
2514 | 3445 | rcu_read_lock(); |
---|
2515 | 3446 | guarantee_online_mems(task_cs(tsk), &mask); |
---|
2516 | 3447 | rcu_read_unlock(); |
---|
2517 | | - spin_unlock_irqrestore(&callback_lock, flags); |
---|
| 3448 | + raw_spin_unlock_irqrestore(&callback_lock, flags); |
---|
2518 | 3449 | |
---|
2519 | 3450 | return mask; |
---|
2520 | 3451 | } |
---|
.. | .. |
---|
2606 | 3537 | return true; |
---|
2607 | 3538 | |
---|
2608 | 3539 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ |
---|
2609 | | - spin_lock_irqsave(&callback_lock, flags); |
---|
| 3540 | + raw_spin_lock_irqsave(&callback_lock, flags); |
---|
2610 | 3541 | |
---|
2611 | 3542 | rcu_read_lock(); |
---|
2612 | 3543 | cs = nearest_hardwall_ancestor(task_cs(current)); |
---|
2613 | 3544 | allowed = node_isset(node, cs->mems_allowed); |
---|
2614 | 3545 | rcu_read_unlock(); |
---|
2615 | 3546 | |
---|
2616 | | - spin_unlock_irqrestore(&callback_lock, flags); |
---|
| 3547 | + raw_spin_unlock_irqrestore(&callback_lock, flags); |
---|
2617 | 3548 | return allowed; |
---|
2618 | 3549 | } |
---|
2619 | 3550 | |
---|
.. | .. |
---|
2699 | 3630 | rcu_read_lock(); |
---|
2700 | 3631 | |
---|
2701 | 3632 | cgrp = task_cs(current)->css.cgroup; |
---|
2702 | | - pr_info("%s cpuset=", current->comm); |
---|
| 3633 | + pr_cont(",cpuset="); |
---|
2703 | 3634 | pr_cont_cgroup_name(cgrp); |
---|
2704 | | - pr_cont(" mems_allowed=%*pbl\n", |
---|
| 3635 | + pr_cont(",mems_allowed=%*pbl", |
---|
2705 | 3636 | nodemask_pr_args(¤t->mems_allowed)); |
---|
2706 | 3637 | |
---|
2707 | 3638 | rcu_read_unlock(); |
---|