.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * kernel/sched/core.c |
---|
3 | 4 | * |
---|
.. | .. |
---|
5 | 6 | * |
---|
6 | 7 | * Copyright (C) 1991-2002 Linus Torvalds |
---|
7 | 8 | */ |
---|
| 9 | +#define CREATE_TRACE_POINTS |
---|
| 10 | +#include <trace/events/sched.h> |
---|
| 11 | +#undef CREATE_TRACE_POINTS |
---|
| 12 | + |
---|
8 | 13 | #include "sched.h" |
---|
9 | 14 | |
---|
10 | 15 | #include <linux/nospec.h> |
---|
.. | .. |
---|
16 | 21 | #include <asm/tlb.h> |
---|
17 | 22 | |
---|
18 | 23 | #include "../workqueue_internal.h" |
---|
| 24 | +#include "../../io_uring/io-wq.h" |
---|
19 | 25 | #include "../smpboot.h" |
---|
20 | 26 | |
---|
21 | 27 | #include "pelt.h" |
---|
| 28 | +#include "smp.h" |
---|
22 | 29 | |
---|
23 | | -#define CREATE_TRACE_POINTS |
---|
24 | | -#include <trace/events/sched.h> |
---|
| 30 | +#include <trace/hooks/sched.h> |
---|
| 31 | +#include <trace/hooks/dtask.h> |
---|
| 32 | + |
---|
| 33 | +/* |
---|
| 34 | + * Export tracepoints that act as a bare tracehook (ie: have no trace event |
---|
| 35 | + * associated with them) to allow external modules to probe them. |
---|
| 36 | + */ |
---|
| 37 | +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp); |
---|
| 38 | +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); |
---|
| 39 | +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); |
---|
| 40 | +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); |
---|
| 41 | +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); |
---|
| 42 | +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp); |
---|
| 43 | +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); |
---|
| 44 | +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); |
---|
| 45 | +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); |
---|
| 46 | +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); |
---|
| 47 | +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); |
---|
| 48 | +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_switch); |
---|
| 49 | +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking); |
---|
| 50 | +#ifdef CONFIG_SCHEDSTATS |
---|
| 51 | +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep); |
---|
| 52 | +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait); |
---|
| 53 | +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait); |
---|
| 54 | +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked); |
---|
| 55 | +#endif |
---|
25 | 56 | |
---|
26 | 57 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
---|
| 58 | +EXPORT_SYMBOL_GPL(runqueues); |
---|
27 | 59 | |
---|
28 | 60 | #ifdef CONFIG_SCHED_DEBUG |
---|
29 | 61 | /* |
---|
.. | .. |
---|
38 | 70 | const_debug unsigned int sysctl_sched_features = |
---|
39 | 71 | #include "features.h" |
---|
40 | 72 | 0; |
---|
| 73 | +EXPORT_SYMBOL_GPL(sysctl_sched_features); |
---|
41 | 74 | #undef SCHED_FEAT |
---|
42 | 75 | #endif |
---|
43 | 76 | |
---|
.. | .. |
---|
60 | 93 | * default: 0.95s |
---|
61 | 94 | */ |
---|
62 | 95 | int sysctl_sched_rt_runtime = 950000; |
---|
| 96 | + |
---|
| 97 | + |
---|
| 98 | +/* |
---|
| 99 | + * Serialization rules: |
---|
| 100 | + * |
---|
| 101 | + * Lock order: |
---|
| 102 | + * |
---|
| 103 | + * p->pi_lock |
---|
| 104 | + * rq->lock |
---|
| 105 | + * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) |
---|
| 106 | + * |
---|
| 107 | + * rq1->lock |
---|
| 108 | + * rq2->lock where: rq1 < rq2 |
---|
| 109 | + * |
---|
| 110 | + * Regular state: |
---|
| 111 | + * |
---|
| 112 | + * Normal scheduling state is serialized by rq->lock. __schedule() takes the |
---|
| 113 | + * local CPU's rq->lock, it optionally removes the task from the runqueue and |
---|
| 114 | + * always looks at the local rq data structures to find the most eligible task |
---|
| 115 | + * to run next. |
---|
| 116 | + * |
---|
| 117 | + * Task enqueue is also under rq->lock, possibly taken from another CPU. |
---|
| 118 | + * Wakeups from another LLC domain might use an IPI to transfer the enqueue to |
---|
| 119 | + * the local CPU to avoid bouncing the runqueue state around [ see |
---|
| 120 | + * ttwu_queue_wakelist() ] |
---|
| 121 | + * |
---|
| 122 | + * Task wakeup, specifically wakeups that involve migration, are horribly |
---|
| 123 | + * complicated to avoid having to take two rq->locks. |
---|
| 124 | + * |
---|
| 125 | + * Special state: |
---|
| 126 | + * |
---|
| 127 | + * System-calls and anything external will use task_rq_lock() which acquires |
---|
| 128 | + * both p->pi_lock and rq->lock. As a consequence the state they change is |
---|
| 129 | + * stable while holding either lock: |
---|
| 130 | + * |
---|
| 131 | + * - sched_setaffinity()/ |
---|
| 132 | + * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed |
---|
| 133 | + * - set_user_nice(): p->se.load, p->*prio |
---|
| 134 | + * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, |
---|
| 135 | + * p->se.load, p->rt_priority, |
---|
| 136 | + * p->dl.dl_{runtime, deadline, period, flags, bw, density} |
---|
| 137 | + * - sched_setnuma(): p->numa_preferred_nid |
---|
| 138 | + * - sched_move_task()/ |
---|
| 139 | + * cpu_cgroup_fork(): p->sched_task_group |
---|
| 140 | + * - uclamp_update_active() p->uclamp* |
---|
| 141 | + * |
---|
| 142 | + * p->state <- TASK_*: |
---|
| 143 | + * |
---|
| 144 | + * is changed locklessly using set_current_state(), __set_current_state() or |
---|
| 145 | + * set_special_state(), see their respective comments, or by |
---|
| 146 | + * try_to_wake_up(). This latter uses p->pi_lock to serialize against |
---|
| 147 | + * concurrent self. |
---|
| 148 | + * |
---|
| 149 | + * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: |
---|
| 150 | + * |
---|
| 151 | + * is set by activate_task() and cleared by deactivate_task(), under |
---|
| 152 | + * rq->lock. Non-zero indicates the task is runnable, the special |
---|
| 153 | + * ON_RQ_MIGRATING state is used for migration without holding both |
---|
| 154 | + * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). |
---|
| 155 | + * |
---|
| 156 | + * p->on_cpu <- { 0, 1 }: |
---|
| 157 | + * |
---|
| 158 | + * is set by prepare_task() and cleared by finish_task() such that it will be |
---|
| 159 | + * set before p is scheduled-in and cleared after p is scheduled-out, both |
---|
| 160 | + * under rq->lock. Non-zero indicates the task is running on its CPU. |
---|
| 161 | + * |
---|
| 162 | + * [ The astute reader will observe that it is possible for two tasks on one |
---|
| 163 | + * CPU to have ->on_cpu = 1 at the same time. ] |
---|
| 164 | + * |
---|
| 165 | + * task_cpu(p): is changed by set_task_cpu(), the rules are: |
---|
| 166 | + * |
---|
| 167 | + * - Don't call set_task_cpu() on a blocked task: |
---|
| 168 | + * |
---|
| 169 | + * We don't care what CPU we're not running on, this simplifies hotplug, |
---|
| 170 | + * the CPU assignment of blocked tasks isn't required to be valid. |
---|
| 171 | + * |
---|
| 172 | + * - for try_to_wake_up(), called under p->pi_lock: |
---|
| 173 | + * |
---|
| 174 | + * This allows try_to_wake_up() to only take one rq->lock, see its comment. |
---|
| 175 | + * |
---|
| 176 | + * - for migration called under rq->lock: |
---|
| 177 | + * [ see task_on_rq_migrating() in task_rq_lock() ] |
---|
| 178 | + * |
---|
| 179 | + * o move_queued_task() |
---|
| 180 | + * o detach_task() |
---|
| 181 | + * |
---|
| 182 | + * - for migration called under double_rq_lock(): |
---|
| 183 | + * |
---|
| 184 | + * o __migrate_swap_task() |
---|
| 185 | + * o push_rt_task() / pull_rt_task() |
---|
| 186 | + * o push_dl_task() / pull_dl_task() |
---|
| 187 | + * o dl_task_offline_migration() |
---|
| 188 | + * |
---|
| 189 | + */ |
---|
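For readers tracing the lock order described above, here is a minimal sketch of the nesting that task_rq_lock()-style callers follow. It is heavily simplified (the real __task_rq_lock()/task_rq_lock() later in this file loop and re-check task_rq(p) because the task may migrate while the caller waits), and example_lock_both() is a hypothetical name used only for illustration:

```c
/* Illustrative sketch of the documented lock nesting, simplified. */
static void example_lock_both(struct task_struct *p)
{
	struct rq *rq;
	unsigned long flags;

	raw_spin_lock_irqsave(&p->pi_lock, flags);	/* outermost: p->pi_lock */
	rq = task_rq(p);
	raw_spin_lock(&rq->lock);			/* nests inside pi_lock */

	/*
	 * hrtimer_cpu_base->lock may be taken under rq->lock, e.g. via
	 * hrtimer_start() for the bandwidth controls.
	 */

	raw_spin_unlock(&rq->lock);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
```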
63 | 190 | |
---|
64 | 191 | /* |
---|
65 | 192 | * __task_rq_lock - lock the rq @p resides on. |
---|
.. | .. |
---|
84 | 211 | cpu_relax(); |
---|
85 | 212 | } |
---|
86 | 213 | } |
---|
| 214 | +EXPORT_SYMBOL_GPL(__task_rq_lock); |
---|
87 | 215 | |
---|
88 | 216 | /* |
---|
89 | 217 | * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. |
---|
.. | .. |
---|
126 | 254 | cpu_relax(); |
---|
127 | 255 | } |
---|
128 | 256 | } |
---|
| 257 | +EXPORT_SYMBOL_GPL(task_rq_lock); |
---|
129 | 258 | |
---|
130 | 259 | /* |
---|
131 | 260 | * RQ-clock updating methods: |
---|
.. | .. |
---|
206 | 335 | rq->clock += delta; |
---|
207 | 336 | update_rq_clock_task(rq, delta); |
---|
208 | 337 | } |
---|
| 338 | +EXPORT_SYMBOL_GPL(update_rq_clock); |
---|
209 | 339 | |
---|
| 340 | +static inline void |
---|
| 341 | +rq_csd_init(struct rq *rq, struct __call_single_data *csd, smp_call_func_t func) |
---|
| 342 | +{ |
---|
| 343 | + csd->flags = 0; |
---|
| 344 | + csd->func = func; |
---|
| 345 | + csd->info = rq; |
---|
| 346 | +} |
---|
210 | 347 | |
---|
211 | 348 | #ifdef CONFIG_SCHED_HRTICK |
---|
212 | 349 | /* |
---|
.. | .. |
---|
243 | 380 | static void __hrtick_restart(struct rq *rq) |
---|
244 | 381 | { |
---|
245 | 382 | struct hrtimer *timer = &rq->hrtick_timer; |
---|
| 383 | + ktime_t time = rq->hrtick_time; |
---|
246 | 384 | |
---|
247 | | - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); |
---|
| 385 | + hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); |
---|
248 | 386 | } |
---|
249 | 387 | |
---|
250 | 388 | /* |
---|
.. | .. |
---|
257 | 395 | |
---|
258 | 396 | rq_lock(rq, &rf); |
---|
259 | 397 | __hrtick_restart(rq); |
---|
260 | | - rq->hrtick_csd_pending = 0; |
---|
261 | 398 | rq_unlock(rq, &rf); |
---|
262 | 399 | } |
---|
263 | 400 | |
---|
.. | .. |
---|
269 | 406 | void hrtick_start(struct rq *rq, u64 delay) |
---|
270 | 407 | { |
---|
271 | 408 | struct hrtimer *timer = &rq->hrtick_timer; |
---|
272 | | - ktime_t time; |
---|
273 | 409 | s64 delta; |
---|
274 | 410 | |
---|
275 | 411 | /* |
---|
.. | .. |
---|
277 | 413 | * doesn't make sense and can cause timer DoS. |
---|
278 | 414 | */ |
---|
279 | 415 | delta = max_t(s64, delay, 10000LL); |
---|
280 | | - time = ktime_add_ns(timer->base->get_time(), delta); |
---|
| 416 | + rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); |
---|
281 | 417 | |
---|
282 | | - hrtimer_set_expires(timer, time); |
---|
283 | | - |
---|
284 | | - if (rq == this_rq()) { |
---|
| 418 | + if (rq == this_rq()) |
---|
285 | 419 | __hrtick_restart(rq); |
---|
286 | | - } else if (!rq->hrtick_csd_pending) { |
---|
| 420 | + else |
---|
287 | 421 | smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); |
---|
288 | | - rq->hrtick_csd_pending = 1; |
---|
289 | | - } |
---|
290 | 422 | } |
---|
291 | 423 | |
---|
292 | 424 | #else |
---|
.. | .. |
---|
303 | 435 | */ |
---|
304 | 436 | delay = max_t(u64, delay, 10000LL); |
---|
305 | 437 | hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), |
---|
306 | | - HRTIMER_MODE_REL_PINNED); |
---|
| 438 | + HRTIMER_MODE_REL_PINNED_HARD); |
---|
307 | 439 | } |
---|
| 440 | + |
---|
308 | 441 | #endif /* CONFIG_SMP */ |
---|
309 | 442 | |
---|
310 | 443 | static void hrtick_rq_init(struct rq *rq) |
---|
311 | 444 | { |
---|
312 | 445 | #ifdef CONFIG_SMP |
---|
313 | | - rq->hrtick_csd_pending = 0; |
---|
314 | | - |
---|
315 | | - rq->hrtick_csd.flags = 0; |
---|
316 | | - rq->hrtick_csd.func = __hrtick_start; |
---|
317 | | - rq->hrtick_csd.info = rq; |
---|
| 446 | + rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); |
---|
318 | 447 | #endif |
---|
319 | | - |
---|
320 | | - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
---|
| 448 | + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); |
---|
321 | 449 | rq->hrtick_timer.function = hrtick; |
---|
322 | 450 | } |
---|
323 | 451 | #else /* CONFIG_SCHED_HRTICK */ |
---|
.. | .. |
---|
399 | 527 | #endif |
---|
400 | 528 | #endif |
---|
401 | 529 | |
---|
402 | | -void wake_q_add(struct wake_q_head *head, struct task_struct *task) |
---|
| 530 | +static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) |
---|
403 | 531 | { |
---|
404 | 532 | struct wake_q_node *node = &task->wake_q; |
---|
405 | 533 | |
---|
.. | .. |
---|
412 | 540 | * state, even in the failed case, an explicit smp_mb() must be used. |
---|
413 | 541 | */ |
---|
414 | 542 | smp_mb__before_atomic(); |
---|
415 | | - if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)) |
---|
416 | | - return; |
---|
417 | | - |
---|
418 | | - head->count++; |
---|
419 | | - |
---|
420 | | - get_task_struct(task); |
---|
| 543 | + if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) |
---|
| 544 | + return false; |
---|
421 | 545 | |
---|
422 | 546 | /* |
---|
423 | 547 | * The head is context local, there can be no concurrency. |
---|
424 | 548 | */ |
---|
425 | 549 | *head->lastp = node; |
---|
426 | 550 | head->lastp = &node->next; |
---|
| 551 | + head->count++; |
---|
| 552 | + return true; |
---|
427 | 553 | } |
---|
428 | 554 | |
---|
429 | | -static int |
---|
430 | | -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, |
---|
431 | | - int sibling_count_hint); |
---|
| 555 | +/** |
---|
| 556 | + * wake_q_add() - queue a wakeup for 'later' waking. |
---|
| 557 | + * @head: the wake_q_head to add @task to |
---|
| 558 | + * @task: the task to queue for 'later' wakeup |
---|
| 559 | + * |
---|
| 560 | + * Queue a task for later wakeup, most likely by the wake_up_q() call in the |
---|
| 561 | + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come |
---|
| 562 | + * instantly. |
---|
| 563 | + * |
---|
| 564 | + * This function must be used as-if it were wake_up_process(); IOW the task |
---|
| 565 | + * must be ready to be woken at this location. |
---|
| 566 | + */ |
---|
| 567 | +void wake_q_add(struct wake_q_head *head, struct task_struct *task) |
---|
| 568 | +{ |
---|
| 569 | + if (__wake_q_add(head, task)) |
---|
| 570 | + get_task_struct(task); |
---|
| 571 | +} |
---|
| 572 | + |
---|
| 573 | +/** |
---|
| 574 | + * wake_q_add_safe() - safely queue a wakeup for 'later' waking. |
---|
| 575 | + * @head: the wake_q_head to add @task to |
---|
| 576 | + * @task: the task to queue for 'later' wakeup |
---|
| 577 | + * |
---|
| 578 | + * Queue a task for later wakeup, most likely by the wake_up_q() call in the |
---|
| 579 | + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come |
---|
| 580 | + * instantly. |
---|
| 581 | + * |
---|
| 582 | + * This function must be used as-if it were wake_up_process(); IOW the task |
---|
| 583 | + * must be ready to be woken at this location. |
---|
| 584 | + * |
---|
| 585 | + * This function is essentially a task-safe equivalent to wake_q_add(). Callers |
---|
| 586 | + * that already hold reference to @task can call the 'safe' version and trust |
---|
| 587 | + * wake_q to do the right thing depending whether or not the @task is already |
---|
| 588 | + * queued for wakeup. |
---|
| 589 | + */ |
---|
| 590 | +void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) |
---|
| 591 | +{ |
---|
| 592 | + if (!__wake_q_add(head, task)) |
---|
| 593 | + put_task_struct(task); |
---|
| 594 | +} |
---|
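The kernel-doc above states the contract; a typical caller pattern is to queue wakeups while a lock is held and issue them after dropping it. A rough sketch follows, where example_wake_one() and the lock parameter are placeholders, not functions from this patch:

```c
/* Rough usage sketch: batch a wakeup under a lock, issue it afterwards. */
static void example_wake_one(spinlock_t *lock, struct task_struct *task)
{
	DEFINE_WAKE_Q(wake_q);

	spin_lock(lock);
	/* ... update whatever state makes @task ready to run ... */
	wake_q_add(&wake_q, task);	/* queues @task; takes a reference */
	spin_unlock(lock);

	wake_up_q(&wake_q);		/* wake_up_process() + put_task_struct() */
}
```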
432 | 595 | |
---|
433 | 596 | void wake_up_q(struct wake_q_head *head) |
---|
434 | 597 | { |
---|
.. | .. |
---|
442 | 605 | /* Task can safely be re-inserted now: */ |
---|
443 | 606 | node = node->next; |
---|
444 | 607 | task->wake_q.next = NULL; |
---|
| 608 | + task->wake_q_count = head->count; |
---|
445 | 609 | |
---|
446 | 610 | /* |
---|
447 | | - * try_to_wake_up() executes a full barrier, which pairs with |
---|
| 611 | + * wake_up_process() executes a full barrier, which pairs with |
---|
448 | 612 | * the queueing in wake_q_add() so as not to miss wakeups. |
---|
449 | 613 | */ |
---|
450 | | - try_to_wake_up(task, TASK_NORMAL, 0, head->count); |
---|
| 614 | + wake_up_process(task); |
---|
| 615 | + task->wake_q_count = 0; |
---|
451 | 616 | put_task_struct(task); |
---|
452 | 617 | } |
---|
453 | 618 | } |
---|
.. | .. |
---|
477 | 642 | return; |
---|
478 | 643 | } |
---|
479 | 644 | |
---|
480 | | -#ifdef CONFIG_PREEMPT |
---|
481 | 645 | if (set_nr_and_not_polling(curr)) |
---|
482 | | -#else |
---|
483 | | - if (set_nr_and_not_polling(curr) && (rq->curr == rq->idle)) |
---|
484 | | -#endif |
---|
485 | 646 | smp_send_reschedule(cpu); |
---|
486 | 647 | else |
---|
487 | 648 | trace_sched_wake_idle_without_ipi(cpu); |
---|
488 | 649 | } |
---|
| 650 | +EXPORT_SYMBOL_GPL(resched_curr); |
---|
489 | 651 | |
---|
490 | 652 | void resched_cpu(int cpu) |
---|
491 | 653 | { |
---|
.. | .. |
---|
510 | 672 | */ |
---|
511 | 673 | int get_nohz_timer_target(void) |
---|
512 | 674 | { |
---|
513 | | - int i, cpu = smp_processor_id(); |
---|
| 675 | + int i, cpu = smp_processor_id(), default_cpu = -1; |
---|
514 | 676 | struct sched_domain *sd; |
---|
515 | 677 | |
---|
516 | | - if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) |
---|
517 | | - return cpu; |
---|
| 678 | + if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) { |
---|
| 679 | + if (!idle_cpu(cpu)) |
---|
| 680 | + return cpu; |
---|
| 681 | + default_cpu = cpu; |
---|
| 682 | + } |
---|
518 | 683 | |
---|
519 | 684 | rcu_read_lock(); |
---|
520 | 685 | for_each_domain(cpu, sd) { |
---|
521 | | - for_each_cpu(i, sched_domain_span(sd)) { |
---|
| 686 | + for_each_cpu_and(i, sched_domain_span(sd), |
---|
| 687 | + housekeeping_cpumask(HK_FLAG_TIMER)) { |
---|
522 | 688 | if (cpu == i) |
---|
523 | 689 | continue; |
---|
524 | 690 | |
---|
525 | | - if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { |
---|
| 691 | + if (!idle_cpu(i)) { |
---|
526 | 692 | cpu = i; |
---|
527 | 693 | goto unlock; |
---|
528 | 694 | } |
---|
529 | 695 | } |
---|
530 | 696 | } |
---|
531 | 697 | |
---|
532 | | - if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) |
---|
533 | | - cpu = housekeeping_any_cpu(HK_FLAG_TIMER); |
---|
| 698 | + if (default_cpu == -1) { |
---|
| 699 | + for_each_cpu_and(i, cpu_active_mask, |
---|
| 700 | + housekeeping_cpumask(HK_FLAG_TIMER)) { |
---|
| 701 | + if (cpu == i) |
---|
| 702 | + continue; |
---|
| 703 | + |
---|
| 704 | + if (!idle_cpu(i)) { |
---|
| 705 | + cpu = i; |
---|
| 706 | + goto unlock; |
---|
| 707 | + } |
---|
| 708 | + } |
---|
| 709 | + |
---|
| 710 | + /* no active, not-idle, housekeeping CPU found. */ |
---|
| 711 | + default_cpu = cpumask_any(cpu_active_mask); |
---|
| 712 | + |
---|
| 713 | + if (unlikely(default_cpu >= nr_cpu_ids)) |
---|
| 714 | + goto unlock; |
---|
| 715 | + } |
---|
| 716 | + |
---|
| 717 | + cpu = default_cpu; |
---|
534 | 718 | unlock: |
---|
535 | 719 | rcu_read_unlock(); |
---|
536 | 720 | return cpu; |
---|
.. | .. |
---|
590 | 774 | wake_up_idle_cpu(cpu); |
---|
591 | 775 | } |
---|
592 | 776 | |
---|
593 | | -static inline bool got_nohz_idle_kick(void) |
---|
| 777 | +static void nohz_csd_func(void *info) |
---|
594 | 778 | { |
---|
595 | | - int cpu = smp_processor_id(); |
---|
596 | | - |
---|
597 | | - if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) |
---|
598 | | - return false; |
---|
599 | | - |
---|
600 | | - if (idle_cpu(cpu) && !need_resched()) |
---|
601 | | - return true; |
---|
| 779 | + struct rq *rq = info; |
---|
| 780 | + int cpu = cpu_of(rq); |
---|
| 781 | + unsigned int flags; |
---|
602 | 782 | |
---|
603 | 783 | /* |
---|
604 | | - * We can't run Idle Load Balance on this CPU for this time so we |
---|
605 | | - * cancel it and clear NOHZ_BALANCE_KICK |
---|
| 784 | + * Release the rq::nohz_csd. |
---|
606 | 785 | */ |
---|
607 | | - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); |
---|
608 | | - return false; |
---|
609 | | -} |
---|
| 786 | + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); |
---|
| 787 | + WARN_ON(!(flags & NOHZ_KICK_MASK)); |
---|
610 | 788 | |
---|
611 | | -#else /* CONFIG_NO_HZ_COMMON */ |
---|
612 | | - |
---|
613 | | -static inline bool got_nohz_idle_kick(void) |
---|
614 | | -{ |
---|
615 | | - return false; |
---|
| 789 | + rq->idle_balance = idle_cpu(cpu); |
---|
| 790 | + if (rq->idle_balance && !need_resched()) { |
---|
| 791 | + rq->nohz_idle_balance = flags; |
---|
| 792 | + raise_softirq_irqoff(SCHED_SOFTIRQ); |
---|
| 793 | + } |
---|
616 | 794 | } |
---|
617 | 795 | |
---|
618 | 796 | #endif /* CONFIG_NO_HZ_COMMON */ |
---|
.. | .. |
---|
703 | 881 | } |
---|
704 | 882 | #endif |
---|
705 | 883 | |
---|
706 | | -static void set_load_weight(struct task_struct *p, bool update_load) |
---|
| 884 | +static void set_load_weight(struct task_struct *p) |
---|
707 | 885 | { |
---|
| 886 | + bool update_load = !(READ_ONCE(p->state) & TASK_NEW); |
---|
708 | 887 | int prio = p->static_prio - MAX_RT_PRIO; |
---|
709 | 888 | struct load_weight *load = &p->se.load; |
---|
710 | 889 | |
---|
711 | 890 | /* |
---|
712 | 891 | * SCHED_IDLE tasks get minimal weight: |
---|
713 | 892 | */ |
---|
714 | | - if (idle_policy(p->policy)) { |
---|
| 893 | + if (task_has_idle_policy(p)) { |
---|
715 | 894 | load->weight = scale_load(WEIGHT_IDLEPRIO); |
---|
716 | 895 | load->inv_weight = WMULT_IDLEPRIO; |
---|
717 | | - p->se.runnable_weight = load->weight; |
---|
718 | 896 | return; |
---|
719 | 897 | } |
---|
720 | 898 | |
---|
.. | .. |
---|
727 | 905 | } else { |
---|
728 | 906 | load->weight = scale_load(sched_prio_to_weight[prio]); |
---|
729 | 907 | load->inv_weight = sched_prio_to_wmult[prio]; |
---|
730 | | - p->se.runnable_weight = load->weight; |
---|
731 | 908 | } |
---|
732 | 909 | } |
---|
733 | 910 | |
---|
.. | .. |
---|
750 | 927 | /* Max allowed maximum utilization */ |
---|
751 | 928 | unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; |
---|
752 | 929 | |
---|
| 930 | +/* |
---|
| 931 | + * By default RT tasks run at the maximum performance point/capacity of the |
---|
| 932 | + * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to |
---|
| 933 | + * SCHED_CAPACITY_SCALE. |
---|
| 934 | + * |
---|
| 935 | + * This knob allows admins to change the default behavior when uclamp is being |
---|
| 936 | + * used. In battery powered devices, particularly, running at the maximum |
---|
| 937 | + * capacity and frequency will increase energy consumption and shorten the |
---|
| 938 | + * battery life. |
---|
| 939 | + * |
---|
| 940 | + * This knob only affects RT tasks whose uclamp_se->user_defined == false. |
---|
| 941 | + * |
---|
| 942 | + * This knob will not override the system default sched_util_clamp_min defined |
---|
| 943 | + * above. |
---|
| 944 | + */ |
---|
| 945 | +unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; |
---|
| 946 | + |
---|
753 | 947 | /* All clamps are required to be less or equal than these values */ |
---|
754 | 948 | static struct uclamp_se uclamp_default[UCLAMP_CNT]; |
---|
| 949 | + |
---|
| 950 | +/* |
---|
| 951 | + * This static key is used to reduce the uclamp overhead in the fast path. It |
---|
| 952 | + * primarily disables the call to uclamp_rq_{inc, dec}() in |
---|
| 953 | + * enqueue/dequeue_task(). |
---|
| 954 | + * |
---|
| 955 | + * This allows users to continue to enable uclamp in their kernel config with |
---|
| 956 | + * minimum uclamp overhead in the fast path. |
---|
| 957 | + * |
---|
| 958 | + * As soon as userspace modifies any of the uclamp knobs, the static key is |
---|
| 959 | + * enabled, since we have actual users that make use of uclamp |
---|
| 960 | + * functionality. |
---|
| 961 | + * |
---|
| 962 | + * The knobs that would enable this static key are: |
---|
| 963 | + * |
---|
| 964 | + * * A task modifying its uclamp value with sched_setattr(). |
---|
| 965 | + * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs. |
---|
| 966 | + * * An admin modifying the cgroup cpu.uclamp.{min, max} |
---|
| 967 | + */ |
---|
| 968 | +DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); |
---|
| 969 | +EXPORT_SYMBOL_GPL(sched_uclamp_used); |
---|
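A condensed view of the static-key pattern the comment describes, with hypothetical function names; in this patch the actual guards sit inside uclamp_rq_inc()/uclamp_rq_dec() further down:

```c
static void example_fast_path(struct rq *rq, struct task_struct *p)
{
	/*
	 * While the key is disabled this compiles to a patched-out jump,
	 * so the enqueue/dequeue fast path pays essentially nothing.
	 */
	if (!static_branch_unlikely(&sched_uclamp_used))
		return;

	/* ... uclamp accounting, only once userspace touched a knob ... */
}

static void example_knob_write(void)
{
	/* Sleepable context only: enabling a static key may block. */
	static_branch_enable(&sched_uclamp_used);
}
```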
755 | 970 | |
---|
756 | 971 | /* Integer rounded range for each bucket */ |
---|
757 | 972 | #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) |
---|
.. | .. |
---|
762 | 977 | static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) |
---|
763 | 978 | { |
---|
764 | 979 | return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); |
---|
765 | | -} |
---|
766 | | - |
---|
767 | | -static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) |
---|
768 | | -{ |
---|
769 | | - return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); |
---|
770 | 980 | } |
---|
771 | 981 | |
---|
772 | 982 | static inline unsigned int uclamp_none(enum uclamp_id clamp_id) |
---|
.. | .. |
---|
808 | 1018 | if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) |
---|
809 | 1019 | return; |
---|
810 | 1020 | |
---|
811 | | - WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value); |
---|
| 1021 | + uclamp_rq_set(rq, clamp_id, clamp_value); |
---|
812 | 1022 | } |
---|
813 | 1023 | |
---|
814 | 1024 | static inline |
---|
.. | .. |
---|
832 | 1042 | return uclamp_idle_value(rq, clamp_id, clamp_value); |
---|
833 | 1043 | } |
---|
834 | 1044 | |
---|
| 1045 | +static void __uclamp_update_util_min_rt_default(struct task_struct *p) |
---|
| 1046 | +{ |
---|
| 1047 | + unsigned int default_util_min; |
---|
| 1048 | + struct uclamp_se *uc_se; |
---|
| 1049 | + |
---|
| 1050 | + lockdep_assert_held(&p->pi_lock); |
---|
| 1051 | + |
---|
| 1052 | + uc_se = &p->uclamp_req[UCLAMP_MIN]; |
---|
| 1053 | + |
---|
| 1054 | + /* Only sync if user didn't override the default */ |
---|
| 1055 | + if (uc_se->user_defined) |
---|
| 1056 | + return; |
---|
| 1057 | + |
---|
| 1058 | + default_util_min = sysctl_sched_uclamp_util_min_rt_default; |
---|
| 1059 | + uclamp_se_set(uc_se, default_util_min, false); |
---|
| 1060 | +} |
---|
| 1061 | + |
---|
| 1062 | +static void uclamp_update_util_min_rt_default(struct task_struct *p) |
---|
| 1063 | +{ |
---|
| 1064 | + struct rq_flags rf; |
---|
| 1065 | + struct rq *rq; |
---|
| 1066 | + |
---|
| 1067 | + if (!rt_task(p)) |
---|
| 1068 | + return; |
---|
| 1069 | + |
---|
| 1070 | + /* Protect updates to p->uclamp_* */ |
---|
| 1071 | + rq = task_rq_lock(p, &rf); |
---|
| 1072 | + __uclamp_update_util_min_rt_default(p); |
---|
| 1073 | + task_rq_unlock(rq, p, &rf); |
---|
| 1074 | +} |
---|
| 1075 | + |
---|
| 1076 | +static void uclamp_sync_util_min_rt_default(void) |
---|
| 1077 | +{ |
---|
| 1078 | + struct task_struct *g, *p; |
---|
| 1079 | + |
---|
| 1080 | + /* |
---|
| 1081 | + * copy_process() sysctl_uclamp |
---|
| 1082 | + * uclamp_min_rt = X; |
---|
| 1083 | + * write_lock(&tasklist_lock) read_lock(&tasklist_lock) |
---|
| 1084 | + * // link thread smp_mb__after_spinlock() |
---|
| 1085 | + * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); |
---|
| 1086 | + * sched_post_fork() for_each_process_thread() |
---|
| 1087 | + * __uclamp_sync_rt() __uclamp_sync_rt() |
---|
| 1088 | + * |
---|
| 1089 | + * Ensures that either sched_post_fork() will observe the new |
---|
| 1090 | + * uclamp_min_rt or for_each_process_thread() will observe the new |
---|
| 1091 | + * task. |
---|
| 1092 | + */ |
---|
| 1093 | + read_lock(&tasklist_lock); |
---|
| 1094 | + smp_mb__after_spinlock(); |
---|
| 1095 | + read_unlock(&tasklist_lock); |
---|
| 1096 | + |
---|
| 1097 | + rcu_read_lock(); |
---|
| 1098 | + for_each_process_thread(g, p) |
---|
| 1099 | + uclamp_update_util_min_rt_default(p); |
---|
| 1100 | + rcu_read_unlock(); |
---|
| 1101 | +} |
---|
| 1102 | + |
---|
| 1103 | +#if IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE) |
---|
| 1104 | +void rockchip_perf_uclamp_sync_util_min_rt_default(void) |
---|
| 1105 | +{ |
---|
| 1106 | + uclamp_sync_util_min_rt_default(); |
---|
| 1107 | +} |
---|
| 1108 | +EXPORT_SYMBOL(rockchip_perf_uclamp_sync_util_min_rt_default); |
---|
| 1109 | +#endif |
---|
| 1110 | + |
---|
835 | 1111 | static inline struct uclamp_se |
---|
836 | 1112 | uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) |
---|
837 | 1113 | { |
---|
| 1114 | + /* Copy by value as we could modify it */ |
---|
838 | 1115 | struct uclamp_se uc_req = p->uclamp_req[clamp_id]; |
---|
839 | 1116 | #ifdef CONFIG_UCLAMP_TASK_GROUP |
---|
840 | | - struct uclamp_se uc_max; |
---|
| 1117 | + unsigned int tg_min, tg_max, value; |
---|
841 | 1118 | |
---|
842 | 1119 | /* |
---|
843 | 1120 | * Tasks in autogroups or root task group will be |
---|
.. | .. |
---|
848 | 1125 | if (task_group(p) == &root_task_group) |
---|
849 | 1126 | return uc_req; |
---|
850 | 1127 | |
---|
851 | | - uc_max = task_group(p)->uclamp[clamp_id]; |
---|
852 | | - if (uc_req.value > uc_max.value || !uc_req.user_defined) |
---|
853 | | - return uc_max; |
---|
| 1128 | + tg_min = task_group(p)->uclamp[UCLAMP_MIN].value; |
---|
| 1129 | + tg_max = task_group(p)->uclamp[UCLAMP_MAX].value; |
---|
| 1130 | + value = uc_req.value; |
---|
| 1131 | + value = clamp(value, tg_min, tg_max); |
---|
| 1132 | + uclamp_se_set(&uc_req, value, false); |
---|
854 | 1133 | #endif |
---|
855 | 1134 | |
---|
856 | 1135 | return uc_req; |
---|
.. | .. |
---|
869 | 1148 | { |
---|
870 | 1149 | struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id); |
---|
871 | 1150 | struct uclamp_se uc_max = uclamp_default[clamp_id]; |
---|
| 1151 | + struct uclamp_se uc_eff; |
---|
| 1152 | + int ret = 0; |
---|
| 1153 | + |
---|
| 1154 | + trace_android_rvh_uclamp_eff_get(p, clamp_id, &uc_max, &uc_eff, &ret); |
---|
| 1155 | + if (ret) |
---|
| 1156 | + return uc_eff; |
---|
872 | 1157 | |
---|
873 | 1158 | /* System default restrictions always apply */ |
---|
874 | 1159 | if (unlikely(uc_req.value > uc_max.value)) |
---|
.. | .. |
---|
889 | 1174 | |
---|
890 | 1175 | return (unsigned long)uc_eff.value; |
---|
891 | 1176 | } |
---|
| 1177 | +EXPORT_SYMBOL_GPL(uclamp_eff_value); |
---|
892 | 1178 | |
---|
893 | 1179 | /* |
---|
894 | 1180 | * When a task is enqueued on a rq, the clamp bucket currently defined by the |
---|
.. | .. |
---|
925 | 1211 | if (bucket->tasks == 1 || uc_se->value > bucket->value) |
---|
926 | 1212 | bucket->value = uc_se->value; |
---|
927 | 1213 | |
---|
928 | | - if (uc_se->value > READ_ONCE(uc_rq->value)) |
---|
929 | | - WRITE_ONCE(uc_rq->value, uc_se->value); |
---|
| 1214 | + if (uc_se->value > uclamp_rq_get(rq, clamp_id)) |
---|
| 1215 | + uclamp_rq_set(rq, clamp_id, uc_se->value); |
---|
930 | 1216 | } |
---|
931 | 1217 | |
---|
932 | 1218 | /* |
---|
.. | .. |
---|
949 | 1235 | |
---|
950 | 1236 | lockdep_assert_held(&rq->lock); |
---|
951 | 1237 | |
---|
| 1238 | + /* |
---|
| 1239 | + * If sched_uclamp_used was enabled after task @p was enqueued, |
---|
| 1240 | + * we could end up with unbalanced call to uclamp_rq_dec_id(). |
---|
| 1241 | + * |
---|
| 1242 | + * In this case the uc_se->active flag should be false since no uclamp |
---|
| 1243 | + * accounting was performed at enqueue time and we can just return |
---|
| 1244 | + * here. |
---|
| 1245 | + * |
---|
| 1246 | + * Need to be careful of the following enqueue/dequeue ordering |
---|
| 1247 | + * problem too |
---|
| 1248 | + * |
---|
| 1249 | + * enqueue(taskA) |
---|
| 1250 | + * // sched_uclamp_used gets enabled |
---|
| 1251 | + * enqueue(taskB) |
---|
| 1252 | + * dequeue(taskA) |
---|
| 1253 | + * // Must not decrement bucket->tasks here |
---|
| 1254 | + * dequeue(taskB) |
---|
| 1255 | + * |
---|
| 1256 | + * where we could end up with stale data in uc_se and |
---|
| 1257 | + * bucket[uc_se->bucket_id]. |
---|
| 1258 | + * |
---|
| 1259 | + * The following check here eliminates the possibility of such race. |
---|
| 1260 | + */ |
---|
| 1261 | + if (unlikely(!uc_se->active)) |
---|
| 1262 | + return; |
---|
| 1263 | + |
---|
952 | 1264 | bucket = &uc_rq->bucket[uc_se->bucket_id]; |
---|
| 1265 | + |
---|
953 | 1266 | SCHED_WARN_ON(!bucket->tasks); |
---|
954 | 1267 | if (likely(bucket->tasks)) |
---|
955 | 1268 | bucket->tasks--; |
---|
| 1269 | + |
---|
956 | 1270 | uc_se->active = false; |
---|
957 | 1271 | |
---|
958 | 1272 | /* |
---|
.. | .. |
---|
964 | 1278 | if (likely(bucket->tasks)) |
---|
965 | 1279 | return; |
---|
966 | 1280 | |
---|
967 | | - rq_clamp = READ_ONCE(uc_rq->value); |
---|
| 1281 | + rq_clamp = uclamp_rq_get(rq, clamp_id); |
---|
968 | 1282 | /* |
---|
969 | 1283 | * Defensive programming: this should never happen. If it happens, |
---|
970 | 1284 | * e.g. due to future modification, warn and fixup the expected value. |
---|
.. | .. |
---|
972 | 1286 | SCHED_WARN_ON(bucket->value > rq_clamp); |
---|
973 | 1287 | if (bucket->value >= rq_clamp) { |
---|
974 | 1288 | bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); |
---|
975 | | - WRITE_ONCE(uc_rq->value, bkt_clamp); |
---|
| 1289 | + uclamp_rq_set(rq, clamp_id, bkt_clamp); |
---|
976 | 1290 | } |
---|
977 | 1291 | } |
---|
978 | 1292 | |
---|
979 | 1293 | static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) |
---|
980 | 1294 | { |
---|
981 | 1295 | enum uclamp_id clamp_id; |
---|
| 1296 | + |
---|
| 1297 | + /* |
---|
| 1298 | + * Avoid any overhead until uclamp is actually used by the userspace. |
---|
| 1299 | + * |
---|
| 1300 | + * The condition is constructed such that a NOP is generated when |
---|
| 1301 | + * sched_uclamp_used is disabled. |
---|
| 1302 | + */ |
---|
| 1303 | + if (!static_branch_unlikely(&sched_uclamp_used)) |
---|
| 1304 | + return; |
---|
982 | 1305 | |
---|
983 | 1306 | if (unlikely(!p->sched_class->uclamp_enabled)) |
---|
984 | 1307 | return; |
---|
.. | .. |
---|
995 | 1318 | { |
---|
996 | 1319 | enum uclamp_id clamp_id; |
---|
997 | 1320 | |
---|
| 1321 | + /* |
---|
| 1322 | + * Avoid any overhead until uclamp is actually used by the userspace. |
---|
| 1323 | + * |
---|
| 1324 | + * The condition is constructed such that a NOP is generated when |
---|
| 1325 | + * sched_uclamp_used is disabled. |
---|
| 1326 | + */ |
---|
| 1327 | + if (!static_branch_unlikely(&sched_uclamp_used)) |
---|
| 1328 | + return; |
---|
| 1329 | + |
---|
998 | 1330 | if (unlikely(!p->sched_class->uclamp_enabled)) |
---|
999 | 1331 | return; |
---|
1000 | 1332 | |
---|
.. | .. |
---|
1002 | 1334 | uclamp_rq_dec_id(rq, p, clamp_id); |
---|
1003 | 1335 | } |
---|
1004 | 1336 | |
---|
1005 | | -static inline void |
---|
1006 | | -uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) |
---|
| 1337 | +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, |
---|
| 1338 | + enum uclamp_id clamp_id) |
---|
1007 | 1339 | { |
---|
| 1340 | + if (!p->uclamp[clamp_id].active) |
---|
| 1341 | + return; |
---|
| 1342 | + |
---|
| 1343 | + uclamp_rq_dec_id(rq, p, clamp_id); |
---|
| 1344 | + uclamp_rq_inc_id(rq, p, clamp_id); |
---|
| 1345 | + |
---|
| 1346 | + /* |
---|
| 1347 | + * Make sure to clear the idle flag if we've transiently reached 0 |
---|
| 1348 | + * active tasks on rq. |
---|
| 1349 | + */ |
---|
| 1350 | + if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) |
---|
| 1351 | + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; |
---|
| 1352 | +} |
---|
| 1353 | + |
---|
| 1354 | +static inline void |
---|
| 1355 | +uclamp_update_active(struct task_struct *p) |
---|
| 1356 | +{ |
---|
| 1357 | + enum uclamp_id clamp_id; |
---|
1008 | 1358 | struct rq_flags rf; |
---|
1009 | 1359 | struct rq *rq; |
---|
1010 | 1360 | |
---|
.. | .. |
---|
1024 | 1374 | * affecting a valid clamp bucket, the next time it's enqueued, |
---|
1025 | 1375 | * it will already see the updated clamp bucket value. |
---|
1026 | 1376 | */ |
---|
1027 | | - if (p->uclamp[clamp_id].active) { |
---|
1028 | | - uclamp_rq_dec_id(rq, p, clamp_id); |
---|
1029 | | - uclamp_rq_inc_id(rq, p, clamp_id); |
---|
1030 | | - } |
---|
| 1377 | + for_each_clamp_id(clamp_id) |
---|
| 1378 | + uclamp_rq_reinc_id(rq, p, clamp_id); |
---|
1031 | 1379 | |
---|
1032 | 1380 | task_rq_unlock(rq, p, &rf); |
---|
1033 | 1381 | } |
---|
1034 | 1382 | |
---|
1035 | 1383 | #ifdef CONFIG_UCLAMP_TASK_GROUP |
---|
1036 | 1384 | static inline void |
---|
1037 | | -uclamp_update_active_tasks(struct cgroup_subsys_state *css, |
---|
1038 | | - unsigned int clamps) |
---|
| 1385 | +uclamp_update_active_tasks(struct cgroup_subsys_state *css) |
---|
1039 | 1386 | { |
---|
1040 | | - enum uclamp_id clamp_id; |
---|
1041 | 1387 | struct css_task_iter it; |
---|
1042 | 1388 | struct task_struct *p; |
---|
1043 | 1389 | |
---|
1044 | 1390 | css_task_iter_start(css, 0, &it); |
---|
1045 | | - while ((p = css_task_iter_next(&it))) { |
---|
1046 | | - for_each_clamp_id(clamp_id) { |
---|
1047 | | - if ((0x1 << clamp_id) & clamps) |
---|
1048 | | - uclamp_update_active(p, clamp_id); |
---|
1049 | | - } |
---|
1050 | | - } |
---|
| 1391 | + while ((p = css_task_iter_next(&it))) |
---|
| 1392 | + uclamp_update_active(p); |
---|
1051 | 1393 | css_task_iter_end(&it); |
---|
1052 | 1394 | } |
---|
1053 | 1395 | |
---|
.. | .. |
---|
1070 | 1412 | #endif |
---|
1071 | 1413 | |
---|
1072 | 1414 | int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, |
---|
1073 | | - void __user *buffer, size_t *lenp, |
---|
1074 | | - loff_t *ppos) |
---|
| 1415 | + void *buffer, size_t *lenp, loff_t *ppos) |
---|
1075 | 1416 | { |
---|
1076 | 1417 | bool update_root_tg = false; |
---|
1077 | | - int old_min, old_max; |
---|
| 1418 | + int old_min, old_max, old_min_rt; |
---|
1078 | 1419 | int result; |
---|
1079 | 1420 | |
---|
1080 | 1421 | mutex_lock(&uclamp_mutex); |
---|
1081 | 1422 | old_min = sysctl_sched_uclamp_util_min; |
---|
1082 | 1423 | old_max = sysctl_sched_uclamp_util_max; |
---|
| 1424 | + old_min_rt = sysctl_sched_uclamp_util_min_rt_default; |
---|
1083 | 1425 | |
---|
1084 | 1426 | result = proc_dointvec(table, write, buffer, lenp, ppos); |
---|
1085 | 1427 | if (result) |
---|
.. | .. |
---|
1088 | 1430 | goto done; |
---|
1089 | 1431 | |
---|
1090 | 1432 | if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || |
---|
1091 | | - sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { |
---|
| 1433 | + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE || |
---|
| 1434 | + sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) { |
---|
| 1435 | + |
---|
1092 | 1436 | result = -EINVAL; |
---|
1093 | 1437 | goto undo; |
---|
1094 | 1438 | } |
---|
.. | .. |
---|
1104 | 1448 | update_root_tg = true; |
---|
1105 | 1449 | } |
---|
1106 | 1450 | |
---|
1107 | | - if (update_root_tg) |
---|
| 1451 | + if (update_root_tg) { |
---|
| 1452 | + static_branch_enable(&sched_uclamp_used); |
---|
1108 | 1453 | uclamp_update_root_tg(); |
---|
| 1454 | + } |
---|
| 1455 | + |
---|
| 1456 | + if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { |
---|
| 1457 | + static_branch_enable(&sched_uclamp_used); |
---|
| 1458 | + uclamp_sync_util_min_rt_default(); |
---|
| 1459 | + } |
---|
1109 | 1460 | |
---|
1110 | 1461 | /* |
---|
1111 | 1462 | * We update all RUNNABLE tasks only when task groups are in use. |
---|
.. | .. |
---|
1118 | 1469 | undo: |
---|
1119 | 1470 | sysctl_sched_uclamp_util_min = old_min; |
---|
1120 | 1471 | sysctl_sched_uclamp_util_max = old_max; |
---|
| 1472 | + sysctl_sched_uclamp_util_min_rt_default = old_min_rt; |
---|
1121 | 1473 | done: |
---|
1122 | 1474 | mutex_unlock(&uclamp_mutex); |
---|
1123 | 1475 | |
---|
.. | .. |
---|
1127 | 1479 | static int uclamp_validate(struct task_struct *p, |
---|
1128 | 1480 | const struct sched_attr *attr) |
---|
1129 | 1481 | { |
---|
1130 | | - unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; |
---|
1131 | | - unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; |
---|
| 1482 | + int util_min = p->uclamp_req[UCLAMP_MIN].value; |
---|
| 1483 | + int util_max = p->uclamp_req[UCLAMP_MAX].value; |
---|
1132 | 1484 | |
---|
1133 | | - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) |
---|
1134 | | - lower_bound = attr->sched_util_min; |
---|
1135 | | - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) |
---|
1136 | | - upper_bound = attr->sched_util_max; |
---|
| 1485 | + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { |
---|
| 1486 | + util_min = attr->sched_util_min; |
---|
1137 | 1487 | |
---|
1138 | | - if (lower_bound > upper_bound) |
---|
| 1488 | + if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) |
---|
| 1489 | + return -EINVAL; |
---|
| 1490 | + } |
---|
| 1491 | + |
---|
| 1492 | + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { |
---|
| 1493 | + util_max = attr->sched_util_max; |
---|
| 1494 | + |
---|
| 1495 | + if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) |
---|
| 1496 | + return -EINVAL; |
---|
| 1497 | + } |
---|
| 1498 | + |
---|
| 1499 | + if (util_min != -1 && util_max != -1 && util_min > util_max) |
---|
1139 | 1500 | return -EINVAL; |
---|
1140 | | - if (upper_bound > SCHED_CAPACITY_SCALE) |
---|
1141 | | - return -EINVAL; |
---|
| 1501 | + |
---|
| 1502 | + /* |
---|
| 1503 | + * We have valid uclamp attributes; make sure uclamp is enabled. |
---|
| 1504 | + * |
---|
| 1505 | + * We need to do that here, because enabling static branches is a |
---|
| 1506 | + * blocking operation which obviously cannot be done while holding |
---|
| 1507 | + * scheduler locks. |
---|
| 1508 | + */ |
---|
| 1509 | + static_branch_enable(&sched_uclamp_used); |
---|
1142 | 1510 | |
---|
1143 | 1511 | return 0; |
---|
| 1512 | +} |
---|
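The `util_min + 1 > SCHED_CAPACITY_SCALE + 1` form above is what lets -1 through as a "reset to default" request: adding 1 to both sides maps the -1 sentinel to 0, whether the value is held in a signed int or has wrapped through an unsigned type. A quick worked check, assuming SCHED_CAPACITY_SCALE == 1024:

```c
/*
 * Worked check of the range test (SCHED_CAPACITY_SCALE == 1024):
 *
 *   util_min == -1   :  -1   + 1 ==    0  ->    0 > 1025 ?  no  -> accepted (reset)
 *   util_min == 1024 :  1024 + 1 == 1025  -> 1025 > 1025 ?  no  -> accepted
 *   util_min == 2000 :  2000 + 1 == 2001  -> 2001 > 1025 ?  yes -> -EINVAL
 *
 * The "util_min != -1 && util_max != -1" test then skips the
 * min <= max ordering check whenever either side is the reset sentinel.
 */
```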
| 1513 | + |
---|
| 1514 | +static bool uclamp_reset(const struct sched_attr *attr, |
---|
| 1515 | + enum uclamp_id clamp_id, |
---|
| 1516 | + struct uclamp_se *uc_se) |
---|
| 1517 | +{ |
---|
| 1518 | + /* Reset on sched class change for a non user-defined clamp value. */ |
---|
| 1519 | + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) && |
---|
| 1520 | + !uc_se->user_defined) |
---|
| 1521 | + return true; |
---|
| 1522 | + |
---|
| 1523 | + /* Reset on sched_util_{min,max} == -1. */ |
---|
| 1524 | + if (clamp_id == UCLAMP_MIN && |
---|
| 1525 | + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && |
---|
| 1526 | + attr->sched_util_min == -1) { |
---|
| 1527 | + return true; |
---|
| 1528 | + } |
---|
| 1529 | + |
---|
| 1530 | + if (clamp_id == UCLAMP_MAX && |
---|
| 1531 | + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && |
---|
| 1532 | + attr->sched_util_max == -1) { |
---|
| 1533 | + return true; |
---|
| 1534 | + } |
---|
| 1535 | + |
---|
| 1536 | + return false; |
---|
1144 | 1537 | } |
---|
1145 | 1538 | |
---|
1146 | 1539 | static void __setscheduler_uclamp(struct task_struct *p, |
---|
.. | .. |
---|
1148 | 1541 | { |
---|
1149 | 1542 | enum uclamp_id clamp_id; |
---|
1150 | 1543 | |
---|
1151 | | - /* |
---|
1152 | | - * On scheduling class change, reset to default clamps for tasks |
---|
1153 | | - * without a task-specific value. |
---|
1154 | | - */ |
---|
1155 | 1544 | for_each_clamp_id(clamp_id) { |
---|
1156 | 1545 | struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; |
---|
1157 | | - unsigned int clamp_value = uclamp_none(clamp_id); |
---|
| 1546 | + unsigned int value; |
---|
1158 | 1547 | |
---|
1159 | | - /* Keep using defined clamps across class changes */ |
---|
1160 | | - if (uc_se->user_defined) |
---|
| 1548 | + if (!uclamp_reset(attr, clamp_id, uc_se)) |
---|
1161 | 1549 | continue; |
---|
1162 | 1550 | |
---|
1163 | | - /* By default, RT tasks always get 100% boost */ |
---|
1164 | | - if (sched_feat(SUGOV_RT_MAX_FREQ) && |
---|
1165 | | - unlikely(rt_task(p) && |
---|
1166 | | - clamp_id == UCLAMP_MIN)) { |
---|
| 1551 | + /* |
---|
| 1552 | + * RT by default have a 100% boost value that could be modified |
---|
| 1553 | + * at runtime. |
---|
| 1554 | + */ |
---|
| 1555 | + if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) |
---|
| 1556 | + value = sysctl_sched_uclamp_util_min_rt_default; |
---|
| 1557 | + else |
---|
| 1558 | + value = uclamp_none(clamp_id); |
---|
1167 | 1559 | |
---|
1168 | | - clamp_value = uclamp_none(UCLAMP_MAX); |
---|
1169 | | - } |
---|
| 1560 | + uclamp_se_set(uc_se, value, false); |
---|
1170 | 1561 | |
---|
1171 | | - uclamp_se_set(uc_se, clamp_value, false); |
---|
1172 | 1562 | } |
---|
1173 | 1563 | |
---|
1174 | 1564 | if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) |
---|
1175 | 1565 | return; |
---|
1176 | 1566 | |
---|
1177 | | - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { |
---|
| 1567 | + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && |
---|
| 1568 | + attr->sched_util_min != -1) { |
---|
1178 | 1569 | uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], |
---|
1179 | 1570 | attr->sched_util_min, true); |
---|
| 1571 | + trace_android_vh_setscheduler_uclamp(p, UCLAMP_MIN, attr->sched_util_min); |
---|
1180 | 1572 | } |
---|
1181 | 1573 | |
---|
1182 | | - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { |
---|
| 1574 | + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && |
---|
| 1575 | + attr->sched_util_max != -1) { |
---|
1183 | 1576 | uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], |
---|
1184 | 1577 | attr->sched_util_max, true); |
---|
| 1578 | + trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max); |
---|
1185 | 1579 | } |
---|
1186 | 1580 | } |
---|
1187 | 1581 | |
---|
.. | .. |
---|
1189 | 1583 | { |
---|
1190 | 1584 | enum uclamp_id clamp_id; |
---|
1191 | 1585 | |
---|
| 1586 | + /* |
---|
| 1587 | + * We don't need to hold task_rq_lock() when updating p->uclamp_* here |
---|
| 1588 | + * as the task is still at its early fork stages. |
---|
| 1589 | + */ |
---|
1192 | 1590 | for_each_clamp_id(clamp_id) |
---|
1193 | 1591 | p->uclamp[clamp_id].active = false; |
---|
1194 | 1592 | |
---|
.. | .. |
---|
1201 | 1599 | } |
---|
1202 | 1600 | } |
---|
1203 | 1601 | |
---|
1204 | | -#ifdef CONFIG_SMP |
---|
1205 | | -unsigned int uclamp_task(struct task_struct *p) |
---|
| 1602 | +static void uclamp_post_fork(struct task_struct *p) |
---|
1206 | 1603 | { |
---|
1207 | | - unsigned long util; |
---|
1208 | | - |
---|
1209 | | - util = task_util_est(p); |
---|
1210 | | - util = max(util, uclamp_eff_value(p, UCLAMP_MIN)); |
---|
1211 | | - util = min(util, uclamp_eff_value(p, UCLAMP_MAX)); |
---|
1212 | | - |
---|
1213 | | - return util; |
---|
| 1604 | + uclamp_update_util_min_rt_default(p); |
---|
1214 | 1605 | } |
---|
1215 | 1606 | |
---|
1216 | | -bool uclamp_boosted(struct task_struct *p) |
---|
| 1607 | +static void __init init_uclamp_rq(struct rq *rq) |
---|
1217 | 1608 | { |
---|
1218 | | - return uclamp_eff_value(p, UCLAMP_MIN) > 0; |
---|
| 1609 | + enum uclamp_id clamp_id; |
---|
| 1610 | + struct uclamp_rq *uc_rq = rq->uclamp; |
---|
| 1611 | + |
---|
| 1612 | + for_each_clamp_id(clamp_id) { |
---|
| 1613 | + uc_rq[clamp_id] = (struct uclamp_rq) { |
---|
| 1614 | + .value = uclamp_none(clamp_id) |
---|
| 1615 | + }; |
---|
| 1616 | + } |
---|
| 1617 | + |
---|
| 1618 | + rq->uclamp_flags = UCLAMP_FLAG_IDLE; |
---|
1219 | 1619 | } |
---|
1220 | | - |
---|
1221 | | -bool uclamp_latency_sensitive(struct task_struct *p) |
---|
1222 | | -{ |
---|
1223 | | -#ifdef CONFIG_UCLAMP_TASK_GROUP |
---|
1224 | | - struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id); |
---|
1225 | | - struct task_group *tg; |
---|
1226 | | - |
---|
1227 | | - if (!css) |
---|
1228 | | - return false; |
---|
1229 | | - tg = container_of(css, struct task_group, css); |
---|
1230 | | - |
---|
1231 | | - return tg->latency_sensitive; |
---|
1232 | | -#else |
---|
1233 | | - return false; |
---|
1234 | | -#endif |
---|
1235 | | -} |
---|
1236 | | -#endif /* CONFIG_SMP */ |
---|
1237 | 1620 | |
---|
1238 | 1621 | static void __init init_uclamp(void) |
---|
1239 | 1622 | { |
---|
.. | .. |
---|
1241 | 1624 | enum uclamp_id clamp_id; |
---|
1242 | 1625 | int cpu; |
---|
1243 | 1626 | |
---|
1244 | | - mutex_init(&uclamp_mutex); |
---|
1245 | | - |
---|
1246 | | - for_each_possible_cpu(cpu) { |
---|
1247 | | - memset(&cpu_rq(cpu)->uclamp, 0, |
---|
1248 | | - sizeof(struct uclamp_rq)*UCLAMP_CNT); |
---|
1249 | | - cpu_rq(cpu)->uclamp_flags = 0; |
---|
1250 | | - } |
---|
| 1627 | + for_each_possible_cpu(cpu) |
---|
| 1628 | + init_uclamp_rq(cpu_rq(cpu)); |
---|
1251 | 1629 | |
---|
1252 | 1630 | for_each_clamp_id(clamp_id) { |
---|
1253 | 1631 | uclamp_se_set(&init_task.uclamp_req[clamp_id], |
---|
.. | .. |
---|
1276 | 1654 | static void __setscheduler_uclamp(struct task_struct *p, |
---|
1277 | 1655 | const struct sched_attr *attr) { } |
---|
1278 | 1656 | static inline void uclamp_fork(struct task_struct *p) { } |
---|
1279 | | - |
---|
1280 | | -long schedtune_task_margin(struct task_struct *task); |
---|
1281 | | - |
---|
1282 | | -#ifdef CONFIG_SMP |
---|
1283 | | -unsigned int uclamp_task(struct task_struct *p) |
---|
1284 | | -{ |
---|
1285 | | - unsigned long util = task_util_est(p); |
---|
1286 | | -#ifdef CONFIG_SCHED_TUNE |
---|
1287 | | - long margin = schedtune_task_margin(p); |
---|
1288 | | - |
---|
1289 | | - trace_sched_boost_task(p, util, margin); |
---|
1290 | | - |
---|
1291 | | - util += margin; |
---|
1292 | | -#endif |
---|
1293 | | - |
---|
1294 | | - return util; |
---|
1295 | | -} |
---|
1296 | | - |
---|
1297 | | -bool uclamp_boosted(struct task_struct *p) |
---|
1298 | | -{ |
---|
1299 | | -#ifdef CONFIG_SCHED_TUNE |
---|
1300 | | - return schedtune_task_boost(p) > 0; |
---|
1301 | | -#endif |
---|
1302 | | - return false; |
---|
1303 | | -} |
---|
1304 | | - |
---|
1305 | | -bool uclamp_latency_sensitive(struct task_struct *p) |
---|
1306 | | -{ |
---|
1307 | | -#ifdef CONFIG_SCHED_TUNE |
---|
1308 | | - return schedtune_prefer_idle(p) != 0; |
---|
1309 | | -#endif |
---|
1310 | | - return false; |
---|
1311 | | -} |
---|
1312 | | -#endif /* CONFIG_SMP */ |
---|
1313 | | - |
---|
| 1657 | +static inline void uclamp_post_fork(struct task_struct *p) { } |
---|
1314 | 1658 | static inline void init_uclamp(void) { } |
---|
1315 | 1659 | #endif /* CONFIG_UCLAMP_TASK */ |
---|
1316 | 1660 | |
---|
.. | .. |
---|
1325 | 1669 | } |
---|
1326 | 1670 | |
---|
1327 | 1671 | uclamp_rq_inc(rq, p); |
---|
| 1672 | + trace_android_rvh_enqueue_task(rq, p, flags); |
---|
1328 | 1673 | p->sched_class->enqueue_task(rq, p, flags); |
---|
| 1674 | + trace_android_rvh_after_enqueue_task(rq, p); |
---|
1329 | 1675 | } |
---|
1330 | 1676 | |
---|
1331 | 1677 | static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
---|
.. | .. |
---|
1339 | 1685 | } |
---|
1340 | 1686 | |
---|
1341 | 1687 | uclamp_rq_dec(rq, p); |
---|
| 1688 | + trace_android_rvh_dequeue_task(rq, p, flags); |
---|
1342 | 1689 | p->sched_class->dequeue_task(rq, p, flags); |
---|
| 1690 | + trace_android_rvh_after_dequeue_task(rq, p); |
---|
1343 | 1691 | } |
---|
1344 | 1692 | |
---|
1345 | 1693 | void activate_task(struct rq *rq, struct task_struct *p, int flags) |
---|
1346 | 1694 | { |
---|
1347 | | - if (task_contributes_to_load(p)) |
---|
1348 | | - rq->nr_uninterruptible--; |
---|
| 1695 | + if (task_on_rq_migrating(p)) |
---|
| 1696 | + flags |= ENQUEUE_MIGRATED; |
---|
1349 | 1697 | |
---|
1350 | 1698 | enqueue_task(rq, p, flags); |
---|
| 1699 | + |
---|
| 1700 | + p->on_rq = TASK_ON_RQ_QUEUED; |
---|
1351 | 1701 | } |
---|
| 1702 | +EXPORT_SYMBOL_GPL(activate_task); |
---|
1352 | 1703 | |
---|
1353 | 1704 | void deactivate_task(struct rq *rq, struct task_struct *p, int flags) |
---|
1354 | 1705 | { |
---|
1355 | | - if (task_contributes_to_load(p)) |
---|
1356 | | - rq->nr_uninterruptible++; |
---|
| 1706 | + p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; |
---|
1357 | 1707 | |
---|
1358 | 1708 | dequeue_task(rq, p, flags); |
---|
1359 | 1709 | } |
---|
| 1710 | +EXPORT_SYMBOL_GPL(deactivate_task); |
---|
1360 | 1711 | |
---|
1361 | | -/* |
---|
1362 | | - * __normal_prio - return the priority that is based on the static prio |
---|
1363 | | - */ |
---|
1364 | | -static inline int __normal_prio(struct task_struct *p) |
---|
| 1712 | +static inline int __normal_prio(int policy, int rt_prio, int nice) |
---|
1365 | 1713 | { |
---|
1366 | | - return p->static_prio; |
---|
| 1714 | + int prio; |
---|
| 1715 | + |
---|
| 1716 | + if (dl_policy(policy)) |
---|
| 1717 | + prio = MAX_DL_PRIO - 1; |
---|
| 1718 | + else if (rt_policy(policy)) |
---|
| 1719 | + prio = MAX_RT_PRIO - 1 - rt_prio; |
---|
| 1720 | + else |
---|
| 1721 | + prio = NICE_TO_PRIO(nice); |
---|
| 1722 | + |
---|
| 1723 | + return prio; |
---|
1367 | 1724 | } |
---|
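A few sample values for the rewritten helper, assuming the usual constants (MAX_RT_PRIO == 100, MAX_DL_PRIO == 0, NICE_TO_PRIO(n) == 120 + n):

```c
/*
 * __normal_prio() sample values:
 *
 *   SCHED_NORMAL,  nice   0   ->  NICE_TO_PRIO(0)      == 120
 *   SCHED_NORMAL,  nice -20   ->  NICE_TO_PRIO(-20)    == 100
 *   SCHED_FIFO,    rt_prio 10 ->  MAX_RT_PRIO - 1 - 10 ==  89
 *   SCHED_DEADLINE             ->  MAX_DL_PRIO - 1     ==  -1
 */
```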
1368 | 1725 | |
---|
1369 | 1726 | /* |
---|
.. | .. |
---|
1375 | 1732 | */ |
---|
1376 | 1733 | static inline int normal_prio(struct task_struct *p) |
---|
1377 | 1734 | { |
---|
1378 | | - int prio; |
---|
1379 | | - |
---|
1380 | | - if (task_has_dl_policy(p)) |
---|
1381 | | - prio = MAX_DL_PRIO-1; |
---|
1382 | | - else if (task_has_rt_policy(p)) |
---|
1383 | | - prio = MAX_RT_PRIO-1 - p->rt_priority; |
---|
1384 | | - else |
---|
1385 | | - prio = __normal_prio(p); |
---|
1386 | | - return prio; |
---|
| 1735 | + return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio)); |
---|
1387 | 1736 | } |
---|
1388 | 1737 | |
---|
1389 | 1738 | /* |
---|
.. | .. |
---|
1439 | 1788 | |
---|
1440 | 1789 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) |
---|
1441 | 1790 | { |
---|
1442 | | - const struct sched_class *class; |
---|
1443 | | - |
---|
1444 | | - if (p->sched_class == rq->curr->sched_class) { |
---|
| 1791 | + if (p->sched_class == rq->curr->sched_class) |
---|
1445 | 1792 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); |
---|
1446 | | - } else { |
---|
1447 | | - for_each_class(class) { |
---|
1448 | | - if (class == rq->curr->sched_class) |
---|
1449 | | - break; |
---|
1450 | | - if (class == p->sched_class) { |
---|
1451 | | - resched_curr(rq); |
---|
1452 | | - break; |
---|
1453 | | - } |
---|
1454 | | - } |
---|
1455 | | - } |
---|
| 1793 | + else if (p->sched_class > rq->curr->sched_class) |
---|
| 1794 | + resched_curr(rq); |
---|
1456 | 1795 | |
---|
1457 | 1796 | /* |
---|
1458 | 1797 | * A queue event has occurred, and we're going to schedule. In |
---|
.. | .. |
---|
1461 | 1800 | if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) |
---|
1462 | 1801 | rq_clock_skip_update(rq); |
---|
1463 | 1802 | } |
---|
| 1803 | +EXPORT_SYMBOL_GPL(check_preempt_curr); |
---|
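The bare pointer comparison in the hunk above works because the sched_class instances are now emitted by the linker into one section in ascending priority order, so a higher address means a higher-priority class and the old for_each_class() walk is no longer needed. A rough illustration of the layout this relies on (the exact symbols and the presence of stop_sched_class depend on the config):

```c
/*
 * Approximate section layout behind the class-pointer compare:
 *
 *   __begin_sched_classes
 *     idle_sched_class < fair_sched_class < rt_sched_class
 *                      < dl_sched_class  [< stop_sched_class]
 *   __end_sched_classes
 *
 * so "p->sched_class > rq->curr->sched_class" reads as
 * "p belongs to a higher-priority class than the current task".
 */
```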
1464 | 1804 | |
---|
1465 | 1805 | #ifdef CONFIG_SMP |
---|
1466 | 1806 | |
---|
1467 | | -static inline bool is_per_cpu_kthread(struct task_struct *p) |
---|
1468 | | -{ |
---|
1469 | | - if (!(p->flags & PF_KTHREAD)) |
---|
1470 | | - return false; |
---|
1471 | | - |
---|
1472 | | - if (p->nr_cpus_allowed != 1) |
---|
1473 | | - return false; |
---|
1474 | | - |
---|
1475 | | - return true; |
---|
1476 | | -} |
---|
1477 | | - |
---|
1478 | 1807 | /* |
---|
1479 | | - * Per-CPU kthreads are allowed to run on !actie && online CPUs, see |
---|
| 1808 | + * Per-CPU kthreads are allowed to run on !active && online CPUs, see |
---|
1480 | 1809 | * __set_cpus_allowed_ptr() and select_fallback_rq(). |
---|
1481 | 1810 | */ |
---|
1482 | 1811 | static inline bool is_cpu_allowed(struct task_struct *p, int cpu) |
---|
1483 | 1812 | { |
---|
1484 | | - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) |
---|
| 1813 | + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) |
---|
1485 | 1814 | return false; |
---|
1486 | 1815 | |
---|
1487 | 1816 | if (is_per_cpu_kthread(p)) |
---|
1488 | 1817 | return cpu_online(cpu); |
---|
1489 | 1818 | |
---|
1490 | | - return cpu_active(cpu); |
---|
| 1819 | + if (!cpu_active(cpu)) |
---|
| 1820 | + return false; |
---|
| 1821 | + |
---|
| 1822 | + return cpumask_test_cpu(cpu, task_cpu_possible_mask(p)); |
---|
1491 | 1823 | } |
---|
1492 | 1824 | |
---|
1493 | 1825 | /* |
---|
.. | .. |
---|
1512 | 1844 | static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, |
---|
1513 | 1845 | struct task_struct *p, int new_cpu) |
---|
1514 | 1846 | { |
---|
| 1847 | + int detached = 0; |
---|
| 1848 | + |
---|
1515 | 1849 | lockdep_assert_held(&rq->lock); |
---|
1516 | 1850 | |
---|
1517 | | - WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); |
---|
1518 | | - dequeue_task(rq, p, DEQUEUE_NOCLOCK); |
---|
1519 | | - set_task_cpu(p, new_cpu); |
---|
1520 | | - rq_unlock(rq, rf); |
---|
| 1851 | + /* |
---|
| 1852 | + * The vendor hook may drop the lock temporarily, so |
---|
| 1853 | + * pass the rq flags to unpin lock. We expect the |
---|
| 1854 | + * rq lock to be held after return. |
---|
| 1855 | + */ |
---|
| 1856 | + trace_android_rvh_migrate_queued_task(rq, rf, p, new_cpu, &detached); |
---|
| 1857 | + if (detached) |
---|
| 1858 | + goto attach; |
---|
1521 | 1859 | |
---|
| 1860 | + deactivate_task(rq, p, DEQUEUE_NOCLOCK); |
---|
| 1861 | + set_task_cpu(p, new_cpu); |
---|
| 1862 | + |
---|
| 1863 | +attach: |
---|
| 1864 | + rq_unlock(rq, rf); |
---|
1522 | 1865 | rq = cpu_rq(new_cpu); |
---|
1523 | 1866 | |
---|
1524 | 1867 | rq_lock(rq, rf); |
---|
1525 | 1868 | BUG_ON(task_cpu(p) != new_cpu); |
---|
1526 | | - enqueue_task(rq, p, 0); |
---|
1527 | | - p->on_rq = TASK_ON_RQ_QUEUED; |
---|
| 1869 | + activate_task(rq, p, 0); |
---|
1528 | 1870 | check_preempt_curr(rq, p, 0); |
---|
1529 | 1871 | |
---|
1530 | 1872 | return rq; |
---|
.. | .. |
---|
1576 | 1918 | local_irq_disable(); |
---|
1577 | 1919 | /* |
---|
1578 | 1920 | * We need to explicitly wake pending tasks before running |
---|
1579 | | - * __migrate_task() such that we will not miss enforcing cpus_allowed |
---|
| 1921 | + * __migrate_task() such that we will not miss enforcing cpus_ptr |
---|
1580 | 1922 | * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. |
---|
1581 | 1923 | */ |
---|
1582 | | - sched_ttwu_pending(); |
---|
| 1924 | + flush_smp_call_function_from_idle(); |
---|
1583 | 1925 | |
---|
1584 | 1926 | raw_spin_lock(&p->pi_lock); |
---|
1585 | 1927 | rq_lock(rq, &rf); |
---|
.. | .. |
---|
1607 | 1949 | */ |
---|
1608 | 1950 | void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) |
---|
1609 | 1951 | { |
---|
1610 | | - cpumask_copy(&p->cpus_allowed, new_mask); |
---|
| 1952 | + cpumask_copy(&p->cpus_mask, new_mask); |
---|
1611 | 1953 | p->nr_cpus_allowed = cpumask_weight(new_mask); |
---|
| 1954 | + trace_android_rvh_set_cpus_allowed_comm(p, new_mask); |
---|
1612 | 1955 | } |
---|
1613 | 1956 | |
---|
1614 | 1957 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) |
---|
.. | .. |
---|
1637 | 1980 | if (queued) |
---|
1638 | 1981 | enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); |
---|
1639 | 1982 | if (running) |
---|
1640 | | - set_curr_task(rq, p); |
---|
| 1983 | + set_next_task(rq, p); |
---|
1641 | 1984 | } |
---|
1642 | 1985 | |
---|
1643 | 1986 | /* |
---|
1644 | | - * Change a given task's CPU affinity. Migrate the thread to a |
---|
1645 | | - * proper CPU and schedule it away if the CPU it's executing on |
---|
1646 | | - * is removed from the allowed bitmask. |
---|
1647 | | - * |
---|
1648 | | - * NOTE: the caller must have a valid reference to the task, the |
---|
1649 | | - * task must not exit() & deallocate itself prematurely. The |
---|
1650 | | - * call is not atomic; no spinlocks may be held. |
---|
| 1987 | + * Called with both p->pi_lock and rq->lock held; drops both before returning. |
---|
1651 | 1988 | */ |
---|
1652 | | -static int __set_cpus_allowed_ptr(struct task_struct *p, |
---|
1653 | | - const struct cpumask *new_mask, bool check) |
---|
| 1989 | +static int __set_cpus_allowed_ptr_locked(struct task_struct *p, |
---|
| 1990 | + const struct cpumask *new_mask, |
---|
| 1991 | + bool check, |
---|
| 1992 | + struct rq *rq, |
---|
| 1993 | + struct rq_flags *rf) |
---|
1654 | 1994 | { |
---|
1655 | 1995 | const struct cpumask *cpu_valid_mask = cpu_active_mask; |
---|
| 1996 | + const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); |
---|
1656 | 1997 | unsigned int dest_cpu; |
---|
1657 | | - struct rq_flags rf; |
---|
1658 | | - struct rq *rq; |
---|
1659 | 1998 | int ret = 0; |
---|
1660 | 1999 | |
---|
1661 | | - rq = task_rq_lock(p, &rf); |
---|
1662 | 2000 | update_rq_clock(rq); |
---|
1663 | 2001 | |
---|
1664 | 2002 | if (p->flags & PF_KTHREAD) { |
---|
.. | .. |
---|
1666 | 2004 | * Kernel threads are allowed on online && !active CPUs |
---|
1667 | 2005 | */ |
---|
1668 | 2006 | cpu_valid_mask = cpu_online_mask; |
---|
| 2007 | + } else if (!cpumask_subset(new_mask, cpu_allowed_mask)) { |
---|
| 2008 | + ret = -EINVAL; |
---|
| 2009 | + goto out; |
---|
1669 | 2010 | } |
---|
1670 | 2011 | |
---|
1671 | 2012 | /* |
---|
.. | .. |
---|
1677 | 2018 | goto out; |
---|
1678 | 2019 | } |
---|
1679 | 2020 | |
---|
1680 | | - if (cpumask_equal(&p->cpus_allowed, new_mask)) |
---|
| 2021 | + if (cpumask_equal(&p->cpus_mask, new_mask)) |
---|
1681 | 2022 | goto out; |
---|
1682 | 2023 | |
---|
1683 | | - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); |
---|
| 2024 | + /* |
---|
| 2025 | + * Picking a ~random cpu helps in cases where we are changing affinity |
---|
| 2026 | + * for groups of tasks (i.e. cpuset), so that load balancing is not |
---|
| 2027 | + * immediately required to distribute the tasks within their new mask. |
---|
| 2028 | + */ |
---|
| 2029 | + dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); |
---|
1684 | 2030 | if (dest_cpu >= nr_cpu_ids) { |
---|
1685 | 2031 | ret = -EINVAL; |
---|
1686 | 2032 | goto out; |
---|
.. | .. |
---|
1705 | 2051 | if (task_running(rq, p) || p->state == TASK_WAKING) { |
---|
1706 | 2052 | struct migration_arg arg = { p, dest_cpu }; |
---|
1707 | 2053 | /* Need help from migration thread: drop lock and wait. */ |
---|
1708 | | - task_rq_unlock(rq, p, &rf); |
---|
| 2054 | + task_rq_unlock(rq, p, rf); |
---|
1709 | 2055 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
---|
1710 | | - tlb_migrate_finish(p->mm); |
---|
1711 | 2056 | return 0; |
---|
1712 | 2057 | } else if (task_on_rq_queued(p)) { |
---|
1713 | 2058 | /* |
---|
1714 | 2059 | * OK, since we're going to drop the lock immediately |
---|
1715 | 2060 | * afterwards anyway. |
---|
1716 | 2061 | */ |
---|
1717 | | - rq = move_queued_task(rq, &rf, p, dest_cpu); |
---|
| 2062 | + rq = move_queued_task(rq, rf, p, dest_cpu); |
---|
1718 | 2063 | } |
---|
1719 | 2064 | out: |
---|
1720 | | - task_rq_unlock(rq, p, &rf); |
---|
| 2065 | + task_rq_unlock(rq, p, rf); |
---|
1721 | 2066 | |
---|
1722 | 2067 | return ret; |
---|
| 2068 | +} |
---|
| 2069 | + |
---|
| 2070 | +/* |
---|
| 2071 | + * Change a given task's CPU affinity. Migrate the thread to a |
---|
| 2072 | + * proper CPU and schedule it away if the CPU it's executing on |
---|
| 2073 | + * is removed from the allowed bitmask. |
---|
| 2074 | + * |
---|
| 2075 | + * NOTE: the caller must have a valid reference to the task, the |
---|
| 2076 | + * task must not exit() & deallocate itself prematurely. The |
---|
| 2077 | + * call is not atomic; no spinlocks may be held. |
---|
| 2078 | + */ |
---|
| 2079 | +static int __set_cpus_allowed_ptr(struct task_struct *p, |
---|
| 2080 | + const struct cpumask *new_mask, bool check) |
---|
| 2081 | +{ |
---|
| 2082 | + struct rq_flags rf; |
---|
| 2083 | + struct rq *rq; |
---|
| 2084 | + |
---|
| 2085 | + rq = task_rq_lock(p, &rf); |
---|
| 2086 | + return __set_cpus_allowed_ptr_locked(p, new_mask, check, rq, &rf); |
---|
1723 | 2087 | } |
---|
1724 | 2088 | |
---|
1725 | 2089 | int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) |
---|
.. | .. |
---|
1727 | 2091 | return __set_cpus_allowed_ptr(p, new_mask, false); |
---|
1728 | 2092 | } |
---|
1729 | 2093 | EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); |
---|
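For illustration only, a minimal sketch (not part of this change) of how an in-kernel or GPL-module kthread might use the exported set_cpus_allowed_ptr(). The worker function, its pinning policy and the way the CPU number is passed are all hypothetical, and <linux/kthread.h>, <linux/sched.h> and <linux/cpumask.h> are assumed to be available:

/* Hypothetical kthread body: pins itself to the CPU passed via 'data'. */
static int my_pinned_worker(void *data)
{
	int cpu = (long)data;

	/* May fail (e.g. -EINVAL) if 'cpu' is not a valid destination. */
	if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
		pr_warn("my_pinned_worker: could not pin to CPU%d\n", cpu);

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);

	return 0;
}

As the NOTE above states, the call can sleep and must not be made with spinlocks held.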
| 2094 | + |
---|
| 2095 | +/* |
---|
| 2096 | + * Change a given task's CPU affinity to the intersection of its current |
---|
| 2097 | + * affinity mask and @subset_mask, writing the resulting mask to @new_mask. |
---|
| 2098 | + * If the resulting mask is empty, leave the affinity unchanged and return |
---|
| 2099 | + * -EINVAL. |
---|
| 2100 | + */ |
---|
| 2101 | +static int restrict_cpus_allowed_ptr(struct task_struct *p, |
---|
| 2102 | + struct cpumask *new_mask, |
---|
| 2103 | + const struct cpumask *subset_mask) |
---|
| 2104 | +{ |
---|
| 2105 | + struct rq_flags rf; |
---|
| 2106 | + struct rq *rq; |
---|
| 2107 | + |
---|
| 2108 | + rq = task_rq_lock(p, &rf); |
---|
| 2109 | + if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { |
---|
| 2110 | + task_rq_unlock(rq, p, &rf); |
---|
| 2111 | + return -EINVAL; |
---|
| 2112 | + } |
---|
| 2113 | + |
---|
| 2114 | + return __set_cpus_allowed_ptr_locked(p, new_mask, false, rq, &rf); |
---|
| 2115 | +} |
---|
| 2116 | + |
---|
| 2117 | +/* |
---|
| 2118 | + * Restrict a given task's CPU affinity so that it is a subset of |
---|
| 2119 | + * task_cpu_possible_mask(). If the resulting mask is empty, we warn and |
---|
| 2120 | + * walk up the cpuset hierarchy until we find a suitable mask. |
---|
| 2121 | + */ |
---|
| 2122 | +void force_compatible_cpus_allowed_ptr(struct task_struct *p) |
---|
| 2123 | +{ |
---|
| 2124 | + cpumask_var_t new_mask; |
---|
| 2125 | + const struct cpumask *override_mask = task_cpu_possible_mask(p); |
---|
| 2126 | + |
---|
| 2127 | + alloc_cpumask_var(&new_mask, GFP_KERNEL); |
---|
| 2128 | + |
---|
| 2129 | + /* |
---|
| 2130 | + * __migrate_task() can fail silently in the face of concurrent |
---|
| 2131 | + * offlining of the chosen destination CPU, so take the hotplug |
---|
| 2132 | + * lock to ensure that the migration succeeds. |
---|
| 2133 | + */ |
---|
| 2134 | + trace_android_rvh_force_compatible_pre(NULL); |
---|
| 2135 | + cpus_read_lock(); |
---|
| 2136 | + if (!cpumask_available(new_mask)) |
---|
| 2137 | + goto out_set_mask; |
---|
| 2138 | + |
---|
| 2139 | + if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) |
---|
| 2140 | + goto out_free_mask; |
---|
| 2141 | + |
---|
| 2142 | + /* |
---|
| 2143 | + * We failed to find a valid subset of the affinity mask for the |
---|
| 2144 | + * task, so override it based on its cpuset hierarchy. |
---|
| 2145 | + */ |
---|
| 2146 | + cpuset_cpus_allowed(p, new_mask); |
---|
| 2147 | + override_mask = new_mask; |
---|
| 2148 | + |
---|
| 2149 | +out_set_mask: |
---|
| 2150 | + if (printk_ratelimit()) { |
---|
| 2151 | + printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", |
---|
| 2152 | + task_pid_nr(p), p->comm, |
---|
| 2153 | + cpumask_pr_args(override_mask)); |
---|
| 2154 | + } |
---|
| 2155 | + |
---|
| 2156 | + WARN_ON(set_cpus_allowed_ptr(p, override_mask)); |
---|
| 2157 | +out_free_mask: |
---|
| 2158 | + cpus_read_unlock(); |
---|
| 2159 | + trace_android_rvh_force_compatible_post(NULL); |
---|
| 2160 | + free_cpumask_var(new_mask); |
---|
| 2161 | +} |
---|
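As a hedged illustration of the intended call site (nothing below is part of this patch, and the hook name is invented): an architecture could call force_compatible_cpus_allowed_ptr() when it detects a task whose affinity no longer intersects the CPUs it can physically run on, for example a 32-bit task on an asymmetric arm64 system:

/*
 * Hypothetical arch hook: force a compatible affinity when the task's
 * current mask falls outside task_cpu_possible_mask().
 */
static void arch_fixup_task_affinity(struct task_struct *p)
{
	if (!cpumask_subset(p->cpus_ptr, task_cpu_possible_mask(p)))
		force_compatible_cpus_allowed_ptr(p);
}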
1730 | 2162 | |
---|
1731 | 2163 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
---|
1732 | 2164 | { |
---|
.. | .. |
---|
1775 | 2207 | p->se.nr_migrations++; |
---|
1776 | 2208 | rseq_migrate(p); |
---|
1777 | 2209 | perf_event_task_migrate(p); |
---|
| 2210 | + trace_android_rvh_set_task_cpu(p, new_cpu); |
---|
1778 | 2211 | } |
---|
1779 | 2212 | |
---|
1780 | 2213 | __set_task_cpu(p, new_cpu); |
---|
1781 | 2214 | } |
---|
| 2215 | +EXPORT_SYMBOL_GPL(set_task_cpu); |
---|
1782 | 2216 | |
---|
1783 | | -#ifdef CONFIG_NUMA_BALANCING |
---|
1784 | 2217 | static void __migrate_swap_task(struct task_struct *p, int cpu) |
---|
1785 | 2218 | { |
---|
1786 | 2219 | if (task_on_rq_queued(p)) { |
---|
.. | .. |
---|
1793 | 2226 | rq_pin_lock(src_rq, &srf); |
---|
1794 | 2227 | rq_pin_lock(dst_rq, &drf); |
---|
1795 | 2228 | |
---|
1796 | | - p->on_rq = TASK_ON_RQ_MIGRATING; |
---|
1797 | 2229 | deactivate_task(src_rq, p, 0); |
---|
1798 | 2230 | set_task_cpu(p, cpu); |
---|
1799 | 2231 | activate_task(dst_rq, p, 0); |
---|
1800 | | - p->on_rq = TASK_ON_RQ_QUEUED; |
---|
1801 | 2232 | check_preempt_curr(dst_rq, p, 0); |
---|
1802 | 2233 | |
---|
1803 | 2234 | rq_unpin_lock(dst_rq, &drf); |
---|
.. | .. |
---|
1840 | 2271 | if (task_cpu(arg->src_task) != arg->src_cpu) |
---|
1841 | 2272 | goto unlock; |
---|
1842 | 2273 | |
---|
1843 | | - if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) |
---|
| 2274 | + if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) |
---|
1844 | 2275 | goto unlock; |
---|
1845 | 2276 | |
---|
1846 | | - if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) |
---|
| 2277 | + if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) |
---|
1847 | 2278 | goto unlock; |
---|
1848 | 2279 | |
---|
1849 | 2280 | __migrate_swap_task(arg->src_task, arg->dst_cpu); |
---|
.. | .. |
---|
1885 | 2316 | if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) |
---|
1886 | 2317 | goto out; |
---|
1887 | 2318 | |
---|
1888 | | - if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) |
---|
| 2319 | + if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr)) |
---|
1889 | 2320 | goto out; |
---|
1890 | 2321 | |
---|
1891 | | - if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) |
---|
| 2322 | + if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr)) |
---|
1892 | 2323 | goto out; |
---|
1893 | 2324 | |
---|
1894 | 2325 | trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); |
---|
.. | .. |
---|
1897 | 2328 | out: |
---|
1898 | 2329 | return ret; |
---|
1899 | 2330 | } |
---|
1900 | | -#endif /* CONFIG_NUMA_BALANCING */ |
---|
| 2331 | +EXPORT_SYMBOL_GPL(migrate_swap); |
---|
1901 | 2332 | |
---|
1902 | 2333 | /* |
---|
1903 | 2334 | * wait_task_inactive - wait for a thread to unschedule. |
---|
.. | .. |
---|
2033 | 2464 | EXPORT_SYMBOL_GPL(kick_process); |
---|
2034 | 2465 | |
---|
2035 | 2466 | /* |
---|
2036 | | - * ->cpus_allowed is protected by both rq->lock and p->pi_lock |
---|
| 2467 | + * ->cpus_ptr is protected by both rq->lock and p->pi_lock |
---|
2037 | 2468 | * |
---|
2038 | 2469 | * A few notes on cpu_active vs cpu_online: |
---|
2039 | 2470 | * |
---|
.. | .. |
---|
2059 | 2490 | int nid = cpu_to_node(cpu); |
---|
2060 | 2491 | const struct cpumask *nodemask = NULL; |
---|
2061 | 2492 | enum { cpuset, possible, fail } state = cpuset; |
---|
2062 | | - int dest_cpu; |
---|
| 2493 | + int dest_cpu = -1; |
---|
| 2494 | + |
---|
| 2495 | + trace_android_rvh_select_fallback_rq(cpu, p, &dest_cpu); |
---|
| 2496 | + if (dest_cpu >= 0) |
---|
| 2497 | + return dest_cpu; |
---|
2063 | 2498 | |
---|
2064 | 2499 | /* |
---|
2065 | 2500 | * If the node that the CPU is on has been offlined, cpu_to_node() |
---|
.. | .. |
---|
2071 | 2506 | |
---|
2072 | 2507 | /* Look for allowed, online CPU in same node. */ |
---|
2073 | 2508 | for_each_cpu(dest_cpu, nodemask) { |
---|
2074 | | - if (!cpu_active(dest_cpu)) |
---|
2075 | | - continue; |
---|
2076 | | - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) |
---|
| 2509 | + if (is_cpu_allowed(p, dest_cpu)) |
---|
2077 | 2510 | return dest_cpu; |
---|
2078 | 2511 | } |
---|
2079 | 2512 | } |
---|
2080 | 2513 | |
---|
2081 | 2514 | for (;;) { |
---|
2082 | 2515 | /* Any allowed, online CPU? */ |
---|
2083 | | - for_each_cpu(dest_cpu, &p->cpus_allowed) { |
---|
| 2516 | + for_each_cpu(dest_cpu, p->cpus_ptr) { |
---|
2084 | 2517 | if (!is_cpu_allowed(p, dest_cpu)) |
---|
2085 | 2518 | continue; |
---|
2086 | 2519 | |
---|
.. | .. |
---|
2095 | 2528 | state = possible; |
---|
2096 | 2529 | break; |
---|
2097 | 2530 | } |
---|
2098 | | - /* Fall-through */ |
---|
| 2531 | + fallthrough; |
---|
2099 | 2532 | case possible: |
---|
2100 | | - do_set_cpus_allowed(p, cpu_possible_mask); |
---|
| 2533 | + do_set_cpus_allowed(p, task_cpu_possible_mask(p)); |
---|
2101 | 2534 | state = fail; |
---|
2102 | 2535 | break; |
---|
2103 | | - |
---|
2104 | 2536 | case fail: |
---|
2105 | 2537 | BUG(); |
---|
2106 | 2538 | break; |
---|
.. | .. |
---|
2124 | 2556 | } |
---|
2125 | 2557 | |
---|
2126 | 2558 | /* |
---|
2127 | | - * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. |
---|
| 2559 | + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. |
---|
2128 | 2560 | */ |
---|
2129 | 2561 | static inline |
---|
2130 | | -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags, |
---|
2131 | | - int sibling_count_hint) |
---|
| 2562 | +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) |
---|
2132 | 2563 | { |
---|
2133 | 2564 | lockdep_assert_held(&p->pi_lock); |
---|
2134 | 2565 | |
---|
2135 | 2566 | if (p->nr_cpus_allowed > 1) |
---|
2136 | | - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags, |
---|
2137 | | - sibling_count_hint); |
---|
| 2567 | + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); |
---|
2138 | 2568 | else |
---|
2139 | | - cpu = cpumask_any(&p->cpus_allowed); |
---|
| 2569 | + cpu = cpumask_any(p->cpus_ptr); |
---|
2140 | 2570 | |
---|
2141 | 2571 | /* |
---|
2142 | 2572 | * In order not to call set_task_cpu() on a blocking task we need |
---|
2143 | | - * to rely on ttwu() to place the task on a valid ->cpus_allowed |
---|
| 2573 | + * to rely on ttwu() to place the task on a valid ->cpus_ptr |
---|
2144 | 2574 | * CPU. |
---|
2145 | 2575 | * |
---|
2146 | 2576 | * Since this is common to all placement strategies, this lives here. |
---|
.. | .. |
---|
2152 | 2582 | cpu = select_fallback_rq(task_cpu(p), p); |
---|
2153 | 2583 | |
---|
2154 | 2584 | return cpu; |
---|
2155 | | -} |
---|
2156 | | - |
---|
2157 | | -static void update_avg(u64 *avg, u64 sample) |
---|
2158 | | -{ |
---|
2159 | | - s64 diff = sample - *avg; |
---|
2160 | | - *avg += diff >> 3; |
---|
2161 | 2585 | } |
---|
2162 | 2586 | |
---|
2163 | 2587 | void sched_set_stop_task(int cpu, struct task_struct *stop) |
---|
.. | .. |
---|
2239 | 2663 | __schedstat_inc(p->se.statistics.nr_wakeups_sync); |
---|
2240 | 2664 | } |
---|
2241 | 2665 | |
---|
2242 | | -static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) |
---|
2243 | | -{ |
---|
2244 | | - activate_task(rq, p, en_flags); |
---|
2245 | | - p->on_rq = TASK_ON_RQ_QUEUED; |
---|
2246 | | - |
---|
2247 | | - /* If a worker is waking up, notify the workqueue: */ |
---|
2248 | | - if (p->flags & PF_WQ_WORKER) |
---|
2249 | | - wq_worker_waking_up(p, cpu_of(rq)); |
---|
2250 | | -} |
---|
2251 | | - |
---|
2252 | 2666 | /* |
---|
2253 | 2667 | * Mark the task runnable and perform wakeup-preemption. |
---|
2254 | 2668 | */ |
---|
.. | .. |
---|
2290 | 2704 | { |
---|
2291 | 2705 | int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; |
---|
2292 | 2706 | |
---|
| 2707 | + if (wake_flags & WF_SYNC) |
---|
| 2708 | + en_flags |= ENQUEUE_WAKEUP_SYNC; |
---|
| 2709 | + |
---|
2293 | 2710 | lockdep_assert_held(&rq->lock); |
---|
2294 | 2711 | |
---|
2295 | | -#ifdef CONFIG_SMP |
---|
2296 | 2712 | if (p->sched_contributes_to_load) |
---|
2297 | 2713 | rq->nr_uninterruptible--; |
---|
2298 | 2714 | |
---|
| 2715 | +#ifdef CONFIG_SMP |
---|
2299 | 2716 | if (wake_flags & WF_MIGRATED) |
---|
2300 | 2717 | en_flags |= ENQUEUE_MIGRATED; |
---|
| 2718 | + else |
---|
2301 | 2719 | #endif |
---|
| 2720 | + if (p->in_iowait) { |
---|
| 2721 | + delayacct_blkio_end(p); |
---|
| 2722 | + atomic_dec(&task_rq(p)->nr_iowait); |
---|
| 2723 | + } |
---|
2302 | 2724 | |
---|
2303 | | - ttwu_activate(rq, p, en_flags); |
---|
| 2725 | + activate_task(rq, p, en_flags); |
---|
2304 | 2726 | ttwu_do_wakeup(rq, p, wake_flags, rf); |
---|
2305 | 2727 | } |
---|
2306 | 2728 | |
---|
2307 | 2729 | /* |
---|
2308 | | - * Called in case the task @p isn't fully descheduled from its runqueue, |
---|
2309 | | - * in this case we must do a remote wakeup. Its a 'light' wakeup though, |
---|
2310 | | - * since all we need to do is flip p->state to TASK_RUNNING, since |
---|
2311 | | - * the task is still ->on_rq. |
---|
| 2730 | + * Consider @p being inside a wait loop: |
---|
| 2731 | + * |
---|
| 2732 | + * for (;;) { |
---|
| 2733 | + * set_current_state(TASK_UNINTERRUPTIBLE); |
---|
| 2734 | + * |
---|
| 2735 | + * if (CONDITION) |
---|
| 2736 | + * break; |
---|
| 2737 | + * |
---|
| 2738 | + * schedule(); |
---|
| 2739 | + * } |
---|
| 2740 | + * __set_current_state(TASK_RUNNING); |
---|
| 2741 | + * |
---|
| 2742 | + * between set_current_state() and schedule(). In this case @p is still |
---|
| 2743 | + * runnable, so all that needs doing is change p->state back to TASK_RUNNING in |
---|
| 2744 | + * an atomic manner. |
---|
| 2745 | + * |
---|
| 2746 | + * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq |
---|
| 2747 | + * then schedule() must still happen and p->state can be changed to |
---|
| 2748 | + * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we |
---|
| 2749 | + * need to do a full wakeup with enqueue. |
---|
| 2750 | + * |
---|
| 2751 | + * Returns: %true when the wakeup is done, |
---|
| 2752 | + * %false otherwise. |
---|
2312 | 2753 | */ |
---|
2313 | | -static int ttwu_remote(struct task_struct *p, int wake_flags) |
---|
| 2754 | +static int ttwu_runnable(struct task_struct *p, int wake_flags) |
---|
2314 | 2755 | { |
---|
2315 | 2756 | struct rq_flags rf; |
---|
2316 | 2757 | struct rq *rq; |
---|
.. | .. |
---|
2329 | 2770 | } |
---|
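To make the sleeper/waker pairing described in the comment above ttwu_runnable() concrete, here is a self-contained sketch; the 'done' flag and both function names are invented for illustration:

static bool done;

/* Sleeper side: exactly the wait loop shown in the comment above. */
static void wait_for_done(void)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(done))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}

/*
 * Waker side: set CONDITION, then wake. If the sleeper has not reached
 * schedule() yet, ttwu_runnable() just flips it back to TASK_RUNNING;
 * otherwise try_to_wake_up() performs a full enqueue wakeup.
 */
static void signal_done(struct task_struct *sleeper)
{
	WRITE_ONCE(done, true);
	wake_up_process(sleeper);
}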
2330 | 2771 | |
---|
2331 | 2772 | #ifdef CONFIG_SMP |
---|
2332 | | -void sched_ttwu_pending(void) |
---|
| 2773 | +void sched_ttwu_pending(void *arg) |
---|
2333 | 2774 | { |
---|
| 2775 | + struct llist_node *llist = arg; |
---|
2334 | 2776 | struct rq *rq = this_rq(); |
---|
2335 | | - struct llist_node *llist = llist_del_all(&rq->wake_list); |
---|
2336 | 2777 | struct task_struct *p, *t; |
---|
2337 | 2778 | struct rq_flags rf; |
---|
2338 | 2779 | |
---|
2339 | 2780 | if (!llist) |
---|
2340 | 2781 | return; |
---|
2341 | 2782 | |
---|
| 2783 | + /* |
---|
| 2784 | + * rq::ttwu_pending is a racy indication of outstanding wakeups. |
---|
| 2785 | + * Races such that false-negatives are possible, since they |
---|
| 2786 | + * are shorter lived than false-positives would be. |
---|
| 2787 | + */ |
---|
| 2788 | + WRITE_ONCE(rq->ttwu_pending, 0); |
---|
| 2789 | + |
---|
2342 | 2790 | rq_lock_irqsave(rq, &rf); |
---|
2343 | 2791 | update_rq_clock(rq); |
---|
2344 | 2792 | |
---|
2345 | | - llist_for_each_entry_safe(p, t, llist, wake_entry) |
---|
| 2793 | + llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { |
---|
| 2794 | + if (WARN_ON_ONCE(p->on_cpu)) |
---|
| 2795 | + smp_cond_load_acquire(&p->on_cpu, !VAL); |
---|
| 2796 | + |
---|
| 2797 | + if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) |
---|
| 2798 | + set_task_cpu(p, cpu_of(rq)); |
---|
| 2799 | + |
---|
2346 | 2800 | ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); |
---|
| 2801 | + } |
---|
2347 | 2802 | |
---|
2348 | 2803 | rq_unlock_irqrestore(rq, &rf); |
---|
2349 | 2804 | } |
---|
2350 | 2805 | |
---|
2351 | | -void scheduler_ipi(void) |
---|
| 2806 | +void send_call_function_single_ipi(int cpu) |
---|
2352 | 2807 | { |
---|
2353 | | - /* |
---|
2354 | | - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting |
---|
2355 | | - * TIF_NEED_RESCHED remotely (for the first time) will also send |
---|
2356 | | - * this IPI. |
---|
2357 | | - */ |
---|
2358 | | - preempt_fold_need_resched(); |
---|
| 2808 | + struct rq *rq = cpu_rq(cpu); |
---|
2359 | 2809 | |
---|
2360 | | - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) |
---|
2361 | | - return; |
---|
2362 | | - |
---|
2363 | | - /* |
---|
2364 | | - * Not all reschedule IPI handlers call irq_enter/irq_exit, since |
---|
2365 | | - * traditionally all their work was done from the interrupt return |
---|
2366 | | - * path. Now that we actually do some work, we need to make sure |
---|
2367 | | - * we do call them. |
---|
2368 | | - * |
---|
2369 | | - * Some archs already do call them, luckily irq_enter/exit nest |
---|
2370 | | - * properly. |
---|
2371 | | - * |
---|
2372 | | - * Arguably we should visit all archs and update all handlers, |
---|
2373 | | - * however a fair share of IPIs are still resched only so this would |
---|
2374 | | - * somewhat pessimize the simple resched case. |
---|
2375 | | - */ |
---|
2376 | | - irq_enter(); |
---|
2377 | | - sched_ttwu_pending(); |
---|
2378 | | - |
---|
2379 | | - /* |
---|
2380 | | - * Check if someone kicked us for doing the nohz idle load balance. |
---|
2381 | | - */ |
---|
2382 | | - if (unlikely(got_nohz_idle_kick())) { |
---|
2383 | | - this_rq()->idle_balance = 1; |
---|
2384 | | - raise_softirq_irqoff(SCHED_SOFTIRQ); |
---|
2385 | | - } |
---|
2386 | | - irq_exit(); |
---|
| 2810 | + if (!set_nr_if_polling(rq->idle)) |
---|
| 2811 | + arch_send_call_function_single_ipi(cpu); |
---|
| 2812 | + else |
---|
| 2813 | + trace_sched_wake_idle_without_ipi(cpu); |
---|
2387 | 2814 | } |
---|
2388 | 2815 | |
---|
2389 | | -static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) |
---|
| 2816 | +/* |
---|
| 2817 | + * Queue a task on the target CPU's wake_list and wake the CPU via IPI if |
---|
| 2818 | + * necessary. The wakee CPU on receipt of the IPI will queue the task |
---|
| 2819 | + * via sched_ttwu_pending() for activation so the wakee incurs the cost |
---|
| 2820 | + * of the wakeup instead of the waker. |
---|
| 2821 | + */ |
---|
| 2822 | +static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) |
---|
2390 | 2823 | { |
---|
2391 | 2824 | struct rq *rq = cpu_rq(cpu); |
---|
2392 | 2825 | |
---|
2393 | 2826 | p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); |
---|
2394 | 2827 | |
---|
2395 | | - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { |
---|
2396 | | - if (!set_nr_if_polling(rq->idle)) |
---|
2397 | | - smp_send_reschedule(cpu); |
---|
2398 | | - else |
---|
2399 | | - trace_sched_wake_idle_without_ipi(cpu); |
---|
2400 | | - } |
---|
| 2828 | + WRITE_ONCE(rq->ttwu_pending, 1); |
---|
| 2829 | + __smp_call_single_queue(cpu, &p->wake_entry.llist); |
---|
2401 | 2830 | } |
---|
2402 | 2831 | |
---|
2403 | 2832 | void wake_up_if_idle(int cpu) |
---|
.. | .. |
---|
2423 | 2852 | out: |
---|
2424 | 2853 | rcu_read_unlock(); |
---|
2425 | 2854 | } |
---|
| 2855 | +EXPORT_SYMBOL_GPL(wake_up_if_idle); |
---|
2426 | 2856 | |
---|
2427 | 2857 | bool cpus_share_cache(int this_cpu, int that_cpu) |
---|
2428 | 2858 | { |
---|
.. | .. |
---|
2431 | 2861 | |
---|
2432 | 2862 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); |
---|
2433 | 2863 | } |
---|
| 2864 | + |
---|
| 2865 | +static inline bool ttwu_queue_cond(int cpu, int wake_flags) |
---|
| 2866 | +{ |
---|
| 2867 | + /* |
---|
| 2868 | + * If the CPU does not share cache, then queue the task on the |
---|
| 2869 | + * remote rq's wakelist to avoid accessing remote data. |
---|
| 2870 | + */ |
---|
| 2871 | + if (!cpus_share_cache(smp_processor_id(), cpu)) |
---|
| 2872 | + return true; |
---|
| 2873 | + |
---|
| 2874 | + /* |
---|
| 2875 | + * If the task is descheduling and the only running task on the |
---|
| 2876 | + * CPU then use the wakelist to offload the task activation to |
---|
| 2877 | + * the soon-to-be-idle CPU as the current CPU is likely busy. |
---|
| 2878 | + * nr_running is checked to avoid unnecessary task stacking. |
---|
| 2879 | + * |
---|
| 2880 | + * Note that we can only get here with (wakee) p->on_rq=0, |
---|
| 2881 | + * p->on_cpu can be whatever, we've done the dequeue, so |
---|
| 2882 | + * the wakee has been accounted out of ->nr_running. |
---|
| 2883 | + */ |
---|
| 2884 | + if ((wake_flags & WF_ON_CPU) && !cpu_rq(cpu)->nr_running) |
---|
| 2885 | + return true; |
---|
| 2886 | + |
---|
| 2887 | + return false; |
---|
| 2888 | +} |
---|
| 2889 | + |
---|
| 2890 | +static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) |
---|
| 2891 | +{ |
---|
| 2892 | + bool cond = false; |
---|
| 2893 | + |
---|
| 2894 | + trace_android_rvh_ttwu_cond(&cond); |
---|
| 2895 | + |
---|
| 2896 | + if ((sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) || |
---|
| 2897 | + cond) { |
---|
| 2898 | + if (WARN_ON_ONCE(cpu == smp_processor_id())) |
---|
| 2899 | + return false; |
---|
| 2900 | + |
---|
| 2901 | + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ |
---|
| 2902 | + __ttwu_queue_wakelist(p, cpu, wake_flags); |
---|
| 2903 | + return true; |
---|
| 2904 | + } |
---|
| 2905 | + |
---|
| 2906 | + return false; |
---|
| 2907 | +} |
---|
| 2908 | + |
---|
| 2909 | +#else /* !CONFIG_SMP */ |
---|
| 2910 | + |
---|
| 2911 | +static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) |
---|
| 2912 | +{ |
---|
| 2913 | + return false; |
---|
| 2914 | +} |
---|
| 2915 | + |
---|
2434 | 2916 | #endif /* CONFIG_SMP */ |
---|
2435 | 2917 | |
---|
2436 | 2918 | static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) |
---|
.. | .. |
---|
2438 | 2920 | struct rq *rq = cpu_rq(cpu); |
---|
2439 | 2921 | struct rq_flags rf; |
---|
2440 | 2922 | |
---|
2441 | | -#if defined(CONFIG_SMP) |
---|
2442 | | - if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { |
---|
2443 | | - sched_clock_cpu(cpu); /* Sync clocks across CPUs */ |
---|
2444 | | - ttwu_queue_remote(p, cpu, wake_flags); |
---|
| 2923 | + if (ttwu_queue_wakelist(p, cpu, wake_flags)) |
---|
2445 | 2924 | return; |
---|
2446 | | - } |
---|
2447 | | -#endif |
---|
2448 | 2925 | |
---|
2449 | 2926 | rq_lock(rq, &rf); |
---|
2450 | 2927 | update_rq_clock(rq); |
---|
.. | .. |
---|
2500 | 2977 | * migration. However the means are completely different as there is no lock |
---|
2501 | 2978 | * chain to provide order. Instead we do: |
---|
2502 | 2979 | * |
---|
2503 | | - * 1) smp_store_release(X->on_cpu, 0) |
---|
2504 | | - * 2) smp_cond_load_acquire(!X->on_cpu) |
---|
| 2980 | + * 1) smp_store_release(X->on_cpu, 0) -- finish_task() |
---|
| 2981 | + * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() |
---|
2505 | 2982 | * |
---|
2506 | 2983 | * Example: |
---|
2507 | 2984 | * |
---|
.. | .. |
---|
2540 | 3017 | * @p: the thread to be awakened |
---|
2541 | 3018 | * @state: the mask of task states that can be woken |
---|
2542 | 3019 | * @wake_flags: wake modifier flags (WF_*) |
---|
2543 | | - * @sibling_count_hint: A hint at the number of threads that are being woken up |
---|
2544 | | - * in this event. |
---|
2545 | 3020 | * |
---|
2546 | | - * If (@state & @p->state) @p->state = TASK_RUNNING. |
---|
| 3021 | + * Conceptually does: |
---|
| 3022 | + * |
---|
| 3023 | + * If (@state & @p->state) @p->state = TASK_RUNNING. |
---|
2547 | 3024 | * |
---|
2548 | 3025 | * If the task was not queued/runnable, also place it back on a runqueue. |
---|
2549 | 3026 | * |
---|
2550 | | - * Atomic against schedule() which would dequeue a task, also see |
---|
2551 | | - * set_current_state(). |
---|
| 3027 | + * This function is atomic against schedule() which would dequeue the task. |
---|
2552 | 3028 | * |
---|
2553 | | - * This function executes a full memory barrier before accessing the task |
---|
2554 | | - * state; see set_current_state(). |
---|
| 3029 | + * It issues a full memory barrier before accessing @p->state, see the comment |
---|
| 3030 | + * with set_current_state(). |
---|
| 3031 | + * |
---|
| 3032 | + * Uses p->pi_lock to serialize against concurrent wake-ups. |
---|
| 3033 | + * |
---|
| 3034 | + * Relies on p->pi_lock stabilizing: |
---|
| 3035 | + * - p->sched_class |
---|
| 3036 | + * - p->cpus_ptr |
---|
| 3037 | + * - p->sched_task_group |
---|
| 3038 | + * in order to do migration, see its use of select_task_rq()/set_task_cpu(). |
---|
| 3039 | + * |
---|
| 3040 | + * Tries really hard to only take one task_rq(p)->lock for performance. |
---|
| 3041 | + * Takes rq->lock in: |
---|
| 3042 | + * - ttwu_runnable() -- old rq, unavoidable, see comment there; |
---|
| 3043 | + * - ttwu_queue() -- new rq, for enqueue of the task; |
---|
| 3044 | + * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. |
---|
| 3045 | + * |
---|
| 3046 | + * As a consequence we race really badly with just about everything. See the |
---|
| 3047 | + * many memory barriers and their comments for details. |
---|
2555 | 3048 | * |
---|
2556 | 3049 | * Return: %true if @p->state changes (an actual wakeup was done), |
---|
2557 | 3050 | * %false otherwise. |
---|
2558 | 3051 | */ |
---|
2559 | 3052 | static int |
---|
2560 | | -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, |
---|
2561 | | - int sibling_count_hint) |
---|
| 3053 | +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) |
---|
2562 | 3054 | { |
---|
2563 | 3055 | unsigned long flags; |
---|
2564 | 3056 | int cpu, success = 0; |
---|
2565 | 3057 | |
---|
| 3058 | + preempt_disable(); |
---|
| 3059 | + if (p == current) { |
---|
| 3060 | + /* |
---|
| 3061 | + * We're waking current, this means 'p->on_rq' and 'task_cpu(p) |
---|
| 3062 | + * == smp_processor_id()'. Together this means we can special |
---|
| 3063 | + * case the whole 'p->on_rq && ttwu_runnable()' case below |
---|
| 3064 | + * without taking any locks. |
---|
| 3065 | + * |
---|
| 3066 | + * In particular: |
---|
| 3067 | + * - we rely on Program-Order guarantees for all the ordering, |
---|
| 3068 | + * - we're serialized against set_special_state() by virtue of |
---|
| 3069 | + * it disabling IRQs (this allows not taking ->pi_lock). |
---|
| 3070 | + */ |
---|
| 3071 | + if (!(p->state & state)) |
---|
| 3072 | + goto out; |
---|
| 3073 | + |
---|
| 3074 | + success = 1; |
---|
| 3075 | + trace_sched_waking(p); |
---|
| 3076 | + p->state = TASK_RUNNING; |
---|
| 3077 | + trace_sched_wakeup(p); |
---|
| 3078 | + goto out; |
---|
| 3079 | + } |
---|
| 3080 | + |
---|
2566 | 3081 | /* |
---|
2567 | 3082 | * If we are going to wake up a thread waiting for CONDITION we |
---|
2568 | 3083 | * need to ensure that CONDITION=1 done by the caller can not be |
---|
2569 | | - * reordered with p->state check below. This pairs with mb() in |
---|
2570 | | - * set_current_state() the waiting thread does. |
---|
| 3084 | + * reordered with p->state check below. This pairs with smp_store_mb() |
---|
| 3085 | + * in set_current_state() that the waiting thread does. |
---|
2571 | 3086 | */ |
---|
2572 | 3087 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
---|
2573 | 3088 | smp_mb__after_spinlock(); |
---|
2574 | 3089 | if (!(p->state & state)) |
---|
2575 | | - goto out; |
---|
| 3090 | + goto unlock; |
---|
| 3091 | + |
---|
| 3092 | +#ifdef CONFIG_FREEZER |
---|
| 3093 | + /* |
---|
| 3094 | + * If we're going to wake up a thread which may be frozen, then |
---|
| 3095 | + * we can only do so if we have an active CPU which is capable of |
---|
| 3096 | + * running it. This may not be the case when resuming from suspend, |
---|
| 3097 | + * as the secondary CPUs may not yet be back online. See __thaw_task() |
---|
| 3098 | + * for the actual wakeup. |
---|
| 3099 | + */ |
---|
| 3100 | + if (unlikely(frozen_or_skipped(p)) && |
---|
| 3101 | + !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p))) |
---|
| 3102 | + goto unlock; |
---|
| 3103 | +#endif |
---|
2576 | 3104 | |
---|
2577 | 3105 | trace_sched_waking(p); |
---|
2578 | 3106 | |
---|
2579 | 3107 | /* We're going to change ->state: */ |
---|
2580 | 3108 | success = 1; |
---|
2581 | | - cpu = task_cpu(p); |
---|
2582 | 3109 | |
---|
2583 | 3110 | /* |
---|
2584 | 3111 | * Ensure we load p->on_rq _after_ p->state, otherwise it would |
---|
.. | .. |
---|
2599 | 3126 | * |
---|
2600 | 3127 | * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in |
---|
2601 | 3128 | * __schedule(). See the comment for smp_mb__after_spinlock(). |
---|
| 3129 | + * |
---|
| 3130 | + * A similar smp_rmb() lives in try_invoke_on_locked_down_task(). |
---|
2602 | 3131 | */ |
---|
2603 | 3132 | smp_rmb(); |
---|
2604 | | - if (p->on_rq && ttwu_remote(p, wake_flags)) |
---|
2605 | | - goto stat; |
---|
| 3133 | + if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) |
---|
| 3134 | + goto unlock; |
---|
| 3135 | + |
---|
| 3136 | + if (p->state & TASK_UNINTERRUPTIBLE) |
---|
| 3137 | + trace_sched_blocked_reason(p); |
---|
2606 | 3138 | |
---|
2607 | 3139 | #ifdef CONFIG_SMP |
---|
2608 | 3140 | /* |
---|
.. | .. |
---|
2623 | 3155 | * |
---|
2624 | 3156 | * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in |
---|
2625 | 3157 | * __schedule(). See the comment for smp_mb__after_spinlock(). |
---|
| 3158 | + * |
---|
| 3159 | + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure |
---|
| 3160 | + * schedule()'s deactivate_task() has 'happened' and p will no longer |
---|
| 3162 | + * care about its own p->state. See the comment in __schedule(). |
---|
2626 | 3162 | */ |
---|
2627 | | - smp_rmb(); |
---|
| 3163 | + smp_acquire__after_ctrl_dep(); |
---|
| 3164 | + |
---|
| 3165 | + /* |
---|
| 3166 | + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq |
---|
| 3167 | + * == 0), which means we need to do an enqueue, change p->state to |
---|
| 3168 | + * TASK_WAKING such that we can unlock p->pi_lock before doing the |
---|
| 3169 | + * enqueue, such as ttwu_queue_wakelist(). |
---|
| 3170 | + */ |
---|
| 3171 | + p->state = TASK_WAKING; |
---|
| 3172 | + |
---|
| 3173 | + /* |
---|
| 3174 | + * If the owning (remote) CPU is still in the middle of schedule() with |
---|
| 3175 | + * this task as prev, consider queueing p on the remote CPU's wake_list |
---|
| 3176 | + * which potentially sends an IPI instead of spinning on p->on_cpu to |
---|
| 3177 | + * let the waker make forward progress. This is safe because IRQs are |
---|
| 3178 | + * disabled and the IPI will deliver after on_cpu is cleared. |
---|
| 3179 | + * |
---|
| 3180 | + * Ensure we load task_cpu(p) after p->on_cpu: |
---|
| 3181 | + * |
---|
| 3182 | + * set_task_cpu(p, cpu); |
---|
| 3183 | + * STORE p->cpu = @cpu |
---|
| 3184 | + * __schedule() (switch to task 'p') |
---|
| 3185 | + * LOCK rq->lock |
---|
| 3186 | + * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) |
---|
| 3187 | + * STORE p->on_cpu = 1 LOAD p->cpu |
---|
| 3188 | + * |
---|
| 3189 | + * to ensure we observe the correct CPU on which the task is currently |
---|
| 3190 | + * scheduling. |
---|
| 3191 | + */ |
---|
| 3192 | + if (smp_load_acquire(&p->on_cpu) && |
---|
| 3193 | + ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) |
---|
| 3194 | + goto unlock; |
---|
2628 | 3195 | |
---|
2629 | 3196 | /* |
---|
2630 | 3197 | * If the owning (remote) CPU is still in the middle of schedule() with |
---|
.. | .. |
---|
2637 | 3204 | */ |
---|
2638 | 3205 | smp_cond_load_acquire(&p->on_cpu, !VAL); |
---|
2639 | 3206 | |
---|
2640 | | - p->sched_contributes_to_load = !!task_contributes_to_load(p); |
---|
2641 | | - p->state = TASK_WAKING; |
---|
| 3207 | + trace_android_rvh_try_to_wake_up(p); |
---|
2642 | 3208 | |
---|
2643 | | - if (p->in_iowait) { |
---|
2644 | | - delayacct_blkio_end(p); |
---|
2645 | | - atomic_dec(&task_rq(p)->nr_iowait); |
---|
2646 | | - } |
---|
2647 | | - |
---|
2648 | | - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags, |
---|
2649 | | - sibling_count_hint); |
---|
| 3209 | + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); |
---|
2650 | 3210 | if (task_cpu(p) != cpu) { |
---|
| 3211 | + if (p->in_iowait) { |
---|
| 3212 | + delayacct_blkio_end(p); |
---|
| 3213 | + atomic_dec(&task_rq(p)->nr_iowait); |
---|
| 3214 | + } |
---|
| 3215 | + |
---|
2651 | 3216 | wake_flags |= WF_MIGRATED; |
---|
2652 | 3217 | psi_ttwu_dequeue(p); |
---|
2653 | 3218 | set_task_cpu(p, cpu); |
---|
2654 | 3219 | } |
---|
2655 | | - |
---|
2656 | | -#else /* CONFIG_SMP */ |
---|
2657 | | - |
---|
2658 | | - if (p->in_iowait) { |
---|
2659 | | - delayacct_blkio_end(p); |
---|
2660 | | - atomic_dec(&task_rq(p)->nr_iowait); |
---|
2661 | | - } |
---|
2662 | | - |
---|
| 3220 | +#else |
---|
| 3221 | + cpu = task_cpu(p); |
---|
2663 | 3222 | #endif /* CONFIG_SMP */ |
---|
2664 | 3223 | |
---|
2665 | 3224 | ttwu_queue(p, cpu, wake_flags); |
---|
2666 | | -stat: |
---|
2667 | | - ttwu_stat(p, cpu, wake_flags); |
---|
2668 | | -out: |
---|
| 3225 | +unlock: |
---|
2669 | 3226 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
---|
| 3227 | +out: |
---|
| 3228 | + if (success) { |
---|
| 3229 | + trace_android_rvh_try_to_wake_up_success(p); |
---|
| 3230 | + ttwu_stat(p, task_cpu(p), wake_flags); |
---|
| 3231 | + } |
---|
| 3232 | + preempt_enable(); |
---|
2670 | 3233 | |
---|
2671 | 3234 | return success; |
---|
2672 | 3235 | } |
---|
2673 | 3236 | |
---|
2674 | 3237 | /** |
---|
2675 | | - * try_to_wake_up_local - try to wake up a local task with rq lock held |
---|
2676 | | - * @p: the thread to be awakened |
---|
2677 | | - * @rf: request-queue flags for pinning |
---|
| 3238 | + * try_invoke_on_locked_down_task - Invoke a function on task in fixed state |
---|
| 3239 | + * @p: Process for which the function is to be invoked, can be @current. |
---|
| 3240 | + * @func: Function to invoke. |
---|
| 3241 | + * @arg: Argument to function. |
---|
2678 | 3242 | * |
---|
2679 | | - * Put @p on the run-queue if it's not already there. The caller must |
---|
2680 | | - * ensure that this_rq() is locked, @p is bound to this_rq() and not |
---|
2681 | | - * the current task. |
---|
| 3243 | + * If the specified task can be quickly locked into a definite state |
---|
| 3244 | + * (either sleeping or on a given runqueue), arrange to keep it in that |
---|
| 3245 | + * state while invoking @func(@arg). This function can use ->on_rq and |
---|
| 3246 | + * task_curr() to work out what the state is, if required. Given that |
---|
| 3247 | + * @func can be invoked with a runqueue lock held, it had better be quite |
---|
| 3248 | + * lightweight. |
---|
| 3249 | + * |
---|
| 3250 | + * Returns: |
---|
| 3251 | + * @false if the task slipped out from under the locks. |
---|
| 3252 | + * @true if the task was locked onto a runqueue or is sleeping. |
---|
| 3253 | + * However, @func can override this by returning @false. |
---|
2682 | 3254 | */ |
---|
2683 | | -static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) |
---|
| 3255 | +bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) |
---|
2684 | 3256 | { |
---|
2685 | | - struct rq *rq = task_rq(p); |
---|
| 3257 | + struct rq_flags rf; |
---|
| 3258 | + bool ret = false; |
---|
| 3259 | + struct rq *rq; |
---|
2686 | 3260 | |
---|
2687 | | - if (WARN_ON_ONCE(rq != this_rq()) || |
---|
2688 | | - WARN_ON_ONCE(p == current)) |
---|
2689 | | - return; |
---|
2690 | | - |
---|
2691 | | - lockdep_assert_held(&rq->lock); |
---|
2692 | | - |
---|
2693 | | - if (!raw_spin_trylock(&p->pi_lock)) { |
---|
2694 | | - /* |
---|
2695 | | - * This is OK, because current is on_cpu, which avoids it being |
---|
2696 | | - * picked for load-balance and preemption/IRQs are still |
---|
2697 | | - * disabled avoiding further scheduler activity on it and we've |
---|
2698 | | - * not yet picked a replacement task. |
---|
2699 | | - */ |
---|
2700 | | - rq_unlock(rq, rf); |
---|
2701 | | - raw_spin_lock(&p->pi_lock); |
---|
2702 | | - rq_relock(rq, rf); |
---|
2703 | | - } |
---|
2704 | | - |
---|
2705 | | - if (!(p->state & TASK_NORMAL)) |
---|
2706 | | - goto out; |
---|
2707 | | - |
---|
2708 | | - trace_sched_waking(p); |
---|
2709 | | - |
---|
2710 | | - if (!task_on_rq_queued(p)) { |
---|
2711 | | - if (p->in_iowait) { |
---|
2712 | | - delayacct_blkio_end(p); |
---|
2713 | | - atomic_dec(&rq->nr_iowait); |
---|
| 3261 | + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); |
---|
| 3262 | + if (p->on_rq) { |
---|
| 3263 | + rq = __task_rq_lock(p, &rf); |
---|
| 3264 | + if (task_rq(p) == rq) |
---|
| 3265 | + ret = func(p, arg); |
---|
| 3266 | + rq_unlock(rq, &rf); |
---|
| 3267 | + } else { |
---|
| 3268 | + switch (p->state) { |
---|
| 3269 | + case TASK_RUNNING: |
---|
| 3270 | + case TASK_WAKING: |
---|
| 3271 | + break; |
---|
| 3272 | + default: |
---|
| 3273 | + smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). |
---|
| 3274 | + if (!p->on_rq) |
---|
| 3275 | + ret = func(p, arg); |
---|
2714 | 3276 | } |
---|
2715 | | - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); |
---|
2716 | 3277 | } |
---|
2717 | | - |
---|
2718 | | - ttwu_do_wakeup(rq, p, 0, rf); |
---|
2719 | | - ttwu_stat(p, smp_processor_id(), 0); |
---|
2720 | | -out: |
---|
2721 | | - raw_spin_unlock(&p->pi_lock); |
---|
| 3278 | + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); |
---|
| 3279 | + return ret; |
---|
2722 | 3280 | } |
---|
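A hedged usage sketch for try_invoke_on_locked_down_task() follows (the callback and wrapper names are invented for illustration): since @func may run with a runqueue lock held, it only samples state and returns.

/* Callback: record whether the task is queued on a runqueue. */
static bool report_queued(struct task_struct *t, void *arg)
{
	*(bool *)arg = !!t->on_rq;
	return true;
}

static bool task_queued_stable(struct task_struct *t)
{
	bool queued = false;

	/* A false return means the task slipped out from under the locks. */
	if (!try_invoke_on_locked_down_task(t, report_queued, &queued))
		return false;

	return queued;
}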
2723 | 3281 | |
---|
2724 | 3282 | /** |
---|
.. | .. |
---|
2734 | 3292 | */ |
---|
2735 | 3293 | int wake_up_process(struct task_struct *p) |
---|
2736 | 3294 | { |
---|
2737 | | - return try_to_wake_up(p, TASK_NORMAL, 0, 1); |
---|
| 3295 | + return try_to_wake_up(p, TASK_NORMAL, 0); |
---|
2738 | 3296 | } |
---|
2739 | 3297 | EXPORT_SYMBOL(wake_up_process); |
---|
2740 | 3298 | |
---|
2741 | 3299 | int wake_up_state(struct task_struct *p, unsigned int state) |
---|
2742 | 3300 | { |
---|
2743 | | - return try_to_wake_up(p, state, 0, 1); |
---|
| 3301 | + return try_to_wake_up(p, state, 0); |
---|
2744 | 3302 | } |
---|
2745 | 3303 | |
---|
2746 | 3304 | /* |
---|
.. | .. |
---|
2765 | 3323 | p->se.cfs_rq = NULL; |
---|
2766 | 3324 | #endif |
---|
2767 | 3325 | |
---|
| 3326 | + trace_android_rvh_sched_fork_init(p); |
---|
| 3327 | + |
---|
2768 | 3328 | #ifdef CONFIG_SCHEDSTATS |
---|
2769 | 3329 | /* Even if schedstat is disabled, there should not be garbage */ |
---|
2770 | 3330 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
---|
.. | .. |
---|
2785 | 3345 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
---|
2786 | 3346 | #endif |
---|
2787 | 3347 | |
---|
| 3348 | +#ifdef CONFIG_COMPACTION |
---|
| 3349 | + p->capture_control = NULL; |
---|
| 3350 | +#endif |
---|
2788 | 3351 | init_numa_balancing(clone_flags, p); |
---|
| 3352 | +#ifdef CONFIG_SMP |
---|
| 3353 | + p->wake_entry.u_flags = CSD_TYPE_TTWU; |
---|
| 3354 | +#endif |
---|
2789 | 3355 | } |
---|
2790 | 3356 | |
---|
2791 | 3357 | DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); |
---|
.. | .. |
---|
2802 | 3368 | |
---|
2803 | 3369 | #ifdef CONFIG_PROC_SYSCTL |
---|
2804 | 3370 | int sysctl_numa_balancing(struct ctl_table *table, int write, |
---|
2805 | | - void __user *buffer, size_t *lenp, loff_t *ppos) |
---|
| 3371 | + void *buffer, size_t *lenp, loff_t *ppos) |
---|
2806 | 3372 | { |
---|
2807 | 3373 | struct ctl_table t; |
---|
2808 | 3374 | int err; |
---|
.. | .. |
---|
2876 | 3442 | } |
---|
2877 | 3443 | |
---|
2878 | 3444 | #ifdef CONFIG_PROC_SYSCTL |
---|
2879 | | -int sysctl_schedstats(struct ctl_table *table, int write, |
---|
2880 | | - void __user *buffer, size_t *lenp, loff_t *ppos) |
---|
| 3445 | +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, |
---|
| 3446 | + size_t *lenp, loff_t *ppos) |
---|
2881 | 3447 | { |
---|
2882 | 3448 | struct ctl_table t; |
---|
2883 | 3449 | int err; |
---|
.. | .. |
---|
2905 | 3471 | */ |
---|
2906 | 3472 | int sched_fork(unsigned long clone_flags, struct task_struct *p) |
---|
2907 | 3473 | { |
---|
2908 | | - unsigned long flags; |
---|
| 3474 | + trace_android_rvh_sched_fork(p); |
---|
2909 | 3475 | |
---|
2910 | 3476 | __sched_fork(clone_flags, p); |
---|
2911 | 3477 | /* |
---|
.. | .. |
---|
2919 | 3485 | * Make sure we do not leak PI boosting priority to the child. |
---|
2920 | 3486 | */ |
---|
2921 | 3487 | p->prio = current->normal_prio; |
---|
| 3488 | + trace_android_rvh_prepare_prio_fork(p); |
---|
2922 | 3489 | |
---|
2923 | 3490 | uclamp_fork(p); |
---|
2924 | 3491 | |
---|
.. | .. |
---|
2933 | 3500 | } else if (PRIO_TO_NICE(p->static_prio) < 0) |
---|
2934 | 3501 | p->static_prio = NICE_TO_PRIO(0); |
---|
2935 | 3502 | |
---|
2936 | | - p->prio = p->normal_prio = __normal_prio(p); |
---|
2937 | | - set_load_weight(p, false); |
---|
| 3503 | + p->prio = p->normal_prio = p->static_prio; |
---|
| 3504 | + set_load_weight(p); |
---|
2938 | 3505 | |
---|
2939 | 3506 | /* |
---|
2940 | 3507 | * We don't need the reset flag anymore after the fork. It has |
---|
.. | .. |
---|
2951 | 3518 | p->sched_class = &fair_sched_class; |
---|
2952 | 3519 | |
---|
2953 | 3520 | init_entity_runnable_average(&p->se); |
---|
| 3521 | + trace_android_rvh_finish_prio_fork(p); |
---|
2954 | 3522 | |
---|
2955 | | - /* |
---|
2956 | | - * The child is not yet in the pid-hash so no cgroup attach races, |
---|
2957 | | - * and the cgroup is pinned to this child due to cgroup_fork() |
---|
2958 | | - * is ran before sched_fork(). |
---|
2959 | | - * |
---|
2960 | | - * Silence PROVE_RCU. |
---|
2961 | | - */ |
---|
2962 | | - raw_spin_lock_irqsave(&p->pi_lock, flags); |
---|
2963 | | - rseq_migrate(p); |
---|
2964 | | - /* |
---|
2965 | | - * We're setting the CPU for the first time, we don't migrate, |
---|
2966 | | - * so use __set_task_cpu(). |
---|
2967 | | - */ |
---|
2968 | | - __set_task_cpu(p, smp_processor_id()); |
---|
2969 | | - if (p->sched_class->task_fork) |
---|
2970 | | - p->sched_class->task_fork(p); |
---|
2971 | | - raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
---|
2972 | 3523 | |
---|
2973 | 3524 | #ifdef CONFIG_SCHED_INFO |
---|
2974 | 3525 | if (likely(sched_info_on())) |
---|
.. | .. |
---|
2983 | 3534 | RB_CLEAR_NODE(&p->pushable_dl_tasks); |
---|
2984 | 3535 | #endif |
---|
2985 | 3536 | return 0; |
---|
| 3537 | +} |
---|
| 3538 | + |
---|
| 3539 | +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) |
---|
| 3540 | +{ |
---|
| 3541 | + unsigned long flags; |
---|
| 3542 | + |
---|
| 3543 | + /* |
---|
| 3544 | + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly |
---|
| 3545 | + * required yet, but lockdep gets upset if rules are violated. |
---|
| 3546 | + */ |
---|
| 3547 | + raw_spin_lock_irqsave(&p->pi_lock, flags); |
---|
| 3548 | +#ifdef CONFIG_CGROUP_SCHED |
---|
| 3549 | + if (1) { |
---|
| 3550 | + struct task_group *tg; |
---|
| 3551 | + |
---|
| 3552 | + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], |
---|
| 3553 | + struct task_group, css); |
---|
| 3554 | + tg = autogroup_task_group(p, tg); |
---|
| 3555 | + p->sched_task_group = tg; |
---|
| 3556 | + } |
---|
| 3557 | +#endif |
---|
| 3558 | + rseq_migrate(p); |
---|
| 3559 | + /* |
---|
| 3560 | + * We're setting the CPU for the first time, we don't migrate, |
---|
| 3561 | + * so use __set_task_cpu(). |
---|
| 3562 | + */ |
---|
| 3563 | + __set_task_cpu(p, smp_processor_id()); |
---|
| 3564 | + if (p->sched_class->task_fork) |
---|
| 3565 | + p->sched_class->task_fork(p); |
---|
| 3566 | + raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
---|
| 3567 | +} |
---|
| 3568 | + |
---|
| 3569 | +void sched_post_fork(struct task_struct *p) |
---|
| 3570 | +{ |
---|
| 3571 | + uclamp_post_fork(p); |
---|
2986 | 3572 | } |
---|
2987 | 3573 | |
---|
2988 | 3574 | unsigned long to_ratio(u64 period, u64 runtime) |
---|
.. | .. |
---|
3013 | 3599 | struct rq_flags rf; |
---|
3014 | 3600 | struct rq *rq; |
---|
3015 | 3601 | |
---|
| 3602 | + trace_android_rvh_wake_up_new_task(p); |
---|
| 3603 | + |
---|
3016 | 3604 | raw_spin_lock_irqsave(&p->pi_lock, rf.flags); |
---|
3017 | 3605 | p->state = TASK_RUNNING; |
---|
3018 | 3606 | #ifdef CONFIG_SMP |
---|
3019 | 3607 | /* |
---|
3020 | 3608 | * Fork balancing, do it here and not earlier because: |
---|
3021 | | - * - cpus_allowed can change in the fork path |
---|
| 3609 | + * - cpus_ptr can change in the fork path |
---|
3022 | 3610 | * - any previously selected CPU might disappear through hotplug |
---|
3023 | 3611 | * |
---|
3024 | 3612 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, |
---|
.. | .. |
---|
3026 | 3614 | */ |
---|
3027 | 3615 | p->recent_used_cpu = task_cpu(p); |
---|
3028 | 3616 | rseq_migrate(p); |
---|
3029 | | - __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1)); |
---|
| 3617 | + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
---|
3030 | 3618 | #endif |
---|
3031 | 3619 | rq = __task_rq_lock(p, &rf); |
---|
3032 | 3620 | update_rq_clock(rq); |
---|
3033 | | - post_init_entity_util_avg(&p->se); |
---|
| 3621 | + post_init_entity_util_avg(p); |
---|
| 3622 | + trace_android_rvh_new_task_stats(p); |
---|
3034 | 3623 | |
---|
3035 | 3624 | activate_task(rq, p, ENQUEUE_NOCLOCK); |
---|
3036 | | - p->on_rq = TASK_ON_RQ_QUEUED; |
---|
3037 | 3625 | trace_sched_wakeup_new(p); |
---|
3038 | 3626 | check_preempt_curr(rq, p, WF_FORK); |
---|
3039 | 3627 | #ifdef CONFIG_SMP |
---|
.. | .. |
---|
3143 | 3731 | /* |
---|
3144 | 3732 | * Claim the task as running, we do this before switching to it |
---|
3145 | 3733 | * such that any running task will have this set. |
---|
| 3734 | + * |
---|
| 3735 | + * See the ttwu() WF_ON_CPU case and its ordering comment. |
---|
3146 | 3736 | */ |
---|
3147 | | - next->on_cpu = 1; |
---|
| 3737 | + WRITE_ONCE(next->on_cpu, 1); |
---|
3148 | 3738 | #endif |
---|
3149 | 3739 | } |
---|
3150 | 3740 | |
---|
.. | .. |
---|
3152 | 3742 | { |
---|
3153 | 3743 | #ifdef CONFIG_SMP |
---|
3154 | 3744 | /* |
---|
3155 | | - * After ->on_cpu is cleared, the task can be moved to a different CPU. |
---|
3156 | | - * We must ensure this doesn't happen until the switch is completely |
---|
| 3745 | + * This must be the very last reference to @prev from this CPU. After |
---|
| 3746 | + * p->on_cpu is cleared, the task can be moved to a different CPU. We |
---|
| 3747 | + * must ensure this doesn't happen until the switch is completely |
---|
3157 | 3748 | * finished. |
---|
3158 | 3749 | * |
---|
3159 | 3750 | * In particular, the load of prev->state in finish_task_switch() must |
---|
.. | .. |
---|
3175 | 3766 | * do an early lockdep release here: |
---|
3176 | 3767 | */ |
---|
3177 | 3768 | rq_unpin_lock(rq, rf); |
---|
3178 | | - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
---|
| 3769 | + spin_release(&rq->lock.dep_map, _THIS_IP_); |
---|
3179 | 3770 | #ifdef CONFIG_DEBUG_SPINLOCK |
---|
3180 | 3771 | /* this is a valid case when another task releases the spinlock */ |
---|
3181 | 3772 | rq->lock.owner = next; |
---|
.. | .. |
---|
3320 | 3911 | * task and put them back on the free list. |
---|
3321 | 3912 | */ |
---|
3322 | 3913 | kprobe_flush_task(prev); |
---|
| 3914 | + trace_android_rvh_flush_task(prev); |
---|
3323 | 3915 | |
---|
3324 | 3916 | /* Task is done with its stack. */ |
---|
3325 | 3917 | put_task_stack(prev); |
---|
3326 | 3918 | |
---|
3327 | | - put_task_struct(prev); |
---|
| 3919 | + put_task_struct_rcu_user(prev); |
---|
3328 | 3920 | } |
---|
3329 | 3921 | |
---|
3330 | 3922 | tick_nohz_task_switch(); |
---|
.. | .. |
---|
3403 | 3995 | context_switch(struct rq *rq, struct task_struct *prev, |
---|
3404 | 3996 | struct task_struct *next, struct rq_flags *rf) |
---|
3405 | 3997 | { |
---|
3406 | | - struct mm_struct *mm, *oldmm; |
---|
3407 | | - |
---|
3408 | 3998 | prepare_task_switch(rq, prev, next); |
---|
3409 | 3999 | |
---|
3410 | | - mm = next->mm; |
---|
3411 | | - oldmm = prev->active_mm; |
---|
3412 | 4000 | /* |
---|
3413 | 4001 | * For paravirt, this is coupled with an exit in switch_to to |
---|
3414 | 4002 | * combine the page table reload and the switch backend into |
---|
.. | .. |
---|
3417 | 4005 | arch_start_context_switch(prev); |
---|
3418 | 4006 | |
---|
3419 | 4007 | /* |
---|
3420 | | - * If mm is non-NULL, we pass through switch_mm(). If mm is |
---|
3421 | | - * NULL, we will pass through mmdrop() in finish_task_switch(). |
---|
3422 | | - * Both of these contain the full memory barrier required by |
---|
3423 | | - * membarrier after storing to rq->curr, before returning to |
---|
3424 | | - * user-space. |
---|
| 4008 | + * kernel -> kernel lazy + transfer active |
---|
| 4009 | + * user -> kernel lazy + mmgrab() active |
---|
| 4010 | + * |
---|
| 4011 | + * kernel -> user switch + mmdrop() active |
---|
| 4012 | + * user -> user switch |
---|
3425 | 4013 | */ |
---|
3426 | | - if (!mm) { |
---|
3427 | | - next->active_mm = oldmm; |
---|
3428 | | - mmgrab(oldmm); |
---|
3429 | | - enter_lazy_tlb(oldmm, next); |
---|
3430 | | - } else |
---|
3431 | | - switch_mm_irqs_off(oldmm, mm, next); |
---|
| 4014 | + if (!next->mm) { // to kernel |
---|
| 4015 | + enter_lazy_tlb(prev->active_mm, next); |
---|
3432 | 4016 | |
---|
3433 | | - if (!prev->mm) { |
---|
3434 | | - prev->active_mm = NULL; |
---|
3435 | | - rq->prev_mm = oldmm; |
---|
| 4017 | + next->active_mm = prev->active_mm; |
---|
| 4018 | + if (prev->mm) // from user |
---|
| 4019 | + mmgrab(prev->active_mm); |
---|
| 4020 | + else |
---|
| 4021 | + prev->active_mm = NULL; |
---|
| 4022 | + } else { // to user |
---|
| 4023 | + membarrier_switch_mm(rq, prev->active_mm, next->mm); |
---|
| 4024 | + /* |
---|
| 4025 | + * sys_membarrier() requires an smp_mb() between setting |
---|
| 4026 | + * rq->curr / membarrier_switch_mm() and returning to userspace. |
---|
| 4027 | + * |
---|
| 4028 | + * The below provides this either through switch_mm(), or in |
---|
| 4029 | + * case 'prev->active_mm == next->mm' through |
---|
| 4030 | + * finish_task_switch()'s mmdrop(). |
---|
| 4031 | + */ |
---|
| 4032 | + switch_mm_irqs_off(prev->active_mm, next->mm, next); |
---|
| 4033 | + |
---|
| 4034 | + if (!prev->mm) { // from kernel |
---|
| 4035 | + /* will mmdrop() in finish_task_switch(). */ |
---|
| 4036 | + rq->prev_mm = prev->active_mm; |
---|
| 4037 | + prev->active_mm = NULL; |
---|
| 4038 | + } |
---|
3436 | 4039 | } |
---|
3437 | 4040 | |
---|
3438 | 4041 | rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); |
---|
.. | .. |
---|
3469 | 4072 | * preemption, thus the result might have a time-of-check-to-time-of-use |
---|
3470 | 4073 | * race. The caller is responsible to use it correctly, for example: |
---|
3471 | 4074 | * |
---|
3472 | | - * - from a non-preemptable section (of course) |
---|
| 4075 | + * - from a non-preemptible section (of course) |
---|
3473 | 4076 | * |
---|
3474 | 4077 | * - from a thread that is bound to a single CPU |
---|
3475 | 4078 | * |
---|
.. | .. |
---|
3490 | 4093 | sum += cpu_rq(i)->nr_switches; |
---|
3491 | 4094 | |
---|
3492 | 4095 | return sum; |
---|
| 4096 | +} |
---|
| 4097 | + |
---|
| 4098 | +/* |
---|
| 4099 | + * Consumers of these two interfaces, like for example the cpuidle menu |
---|
| 4100 | + * governor, are using nonsensical data. Preferring shallow idle state selection |
---|
| 4101 | + * for a CPU that has IO-wait which might not even end up running the task when |
---|
| 4102 | + * it does become runnable. |
---|
| 4103 | + */ |
---|
| 4104 | + |
---|
| 4105 | +unsigned long nr_iowait_cpu(int cpu) |
---|
| 4106 | +{ |
---|
| 4107 | + return atomic_read(&cpu_rq(cpu)->nr_iowait); |
---|
3493 | 4108 | } |
---|
3494 | 4109 | |
---|
3495 | 4110 | /* |
---|
.. | .. |
---|
3527 | 4142 | unsigned long i, sum = 0; |
---|
3528 | 4143 | |
---|
3529 | 4144 | for_each_possible_cpu(i) |
---|
3530 | | - sum += atomic_read(&cpu_rq(i)->nr_iowait); |
---|
| 4145 | + sum += nr_iowait_cpu(i); |
---|
3531 | 4146 | |
---|
3532 | 4147 | return sum; |
---|
3533 | | -} |
---|
3534 | | - |
---|
3535 | | -/* |
---|
3536 | | - * Consumers of these two interfaces, like for example the cpufreq menu |
---|
3537 | | - * governor are using nonsensical data. Boosting frequency for a CPU that has |
---|
3538 | | - * IO-wait which might not even end up running the task when it does become |
---|
3539 | | - * runnable. |
---|
3540 | | - */ |
---|
3541 | | - |
---|
3542 | | -unsigned long nr_iowait_cpu(int cpu) |
---|
3543 | | -{ |
---|
3544 | | - struct rq *this = cpu_rq(cpu); |
---|
3545 | | - return atomic_read(&this->nr_iowait); |
---|
3546 | | -} |
---|
3547 | | - |
---|
3548 | | -void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) |
---|
3549 | | -{ |
---|
3550 | | - struct rq *rq = this_rq(); |
---|
3551 | | - *nr_waiters = atomic_read(&rq->nr_iowait); |
---|
3552 | | - *load = rq->load.weight; |
---|
3553 | 4148 | } |
---|
3554 | 4149 | |
---|
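The relocated comment and the new nr_iowait_cpu() show the summation pattern used here: one atomic counter per runqueue, read per CPU and summed only when a consumer asks. A small userspace sketch of that pattern (NR_CPUS_DEMO and the demo_* names are invented):

```c
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS_DEMO 4

static atomic_long demo_nr_iowait[NR_CPUS_DEMO];

static long demo_nr_iowait_cpu(int cpu)
{
	return atomic_load(&demo_nr_iowait[cpu]);
}

static long demo_nr_iowait_sum(void)
{
	long sum = 0;

	for (int cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
		sum += demo_nr_iowait_cpu(cpu);
	return sum;
}

int main(void)
{
	atomic_fetch_add(&demo_nr_iowait[1], 1);	/* a task blocks on IO on CPU 1 */
	printf("tasks in iowait: %ld\n", demo_nr_iowait_sum());
	return 0;
}
```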
3555 | 4150 | #ifdef CONFIG_SMP |
---|
.. | .. |
---|
3563 | 4158 | struct task_struct *p = current; |
---|
3564 | 4159 | unsigned long flags; |
---|
3565 | 4160 | int dest_cpu; |
---|
| 4161 | + bool cond = false; |
---|
| 4162 | + |
---|
| 4163 | + trace_android_rvh_sched_exec(&cond); |
---|
| 4164 | + if (cond) |
---|
| 4165 | + return; |
---|
3566 | 4166 | |
---|
3567 | 4167 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
---|
3568 | | - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1); |
---|
| 4168 | + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); |
---|
3569 | 4169 | if (dest_cpu == smp_processor_id()) |
---|
3570 | 4170 | goto unlock; |
---|
3571 | 4171 | |
---|
.. | .. |
---|
3648 | 4248 | |
---|
3649 | 4249 | return ns; |
---|
3650 | 4250 | } |
---|
| 4251 | +EXPORT_SYMBOL_GPL(task_sched_runtime); |
---|
3651 | 4252 | |
---|
3652 | 4253 | /* |
---|
3653 | 4254 | * This function gets called by the timer code, with HZ frequency. |
---|
.. | .. |
---|
3659 | 4260 | struct rq *rq = cpu_rq(cpu); |
---|
3660 | 4261 | struct task_struct *curr = rq->curr; |
---|
3661 | 4262 | struct rq_flags rf; |
---|
| 4263 | + unsigned long thermal_pressure; |
---|
3662 | 4264 | |
---|
| 4265 | + arch_scale_freq_tick(); |
---|
3663 | 4266 | sched_clock_tick(); |
---|
3664 | 4267 | |
---|
3665 | 4268 | rq_lock(rq, &rf); |
---|
3666 | 4269 | |
---|
| 4270 | + trace_android_rvh_tick_entry(rq); |
---|
3667 | 4271 | update_rq_clock(rq); |
---|
| 4272 | + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); |
---|
| 4273 | + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); |
---|
3668 | 4274 | curr->sched_class->task_tick(rq, curr, 0); |
---|
3669 | | - cpu_load_update_active(rq); |
---|
3670 | 4275 | calc_global_load_tick(rq); |
---|
3671 | 4276 | psi_task_tick(rq); |
---|
3672 | 4277 | |
---|
.. | .. |
---|
3678 | 4283 | rq->idle_balance = idle_cpu(cpu); |
---|
3679 | 4284 | trigger_load_balance(rq); |
---|
3680 | 4285 | #endif |
---|
| 4286 | + |
---|
| 4287 | + trace_android_vh_scheduler_tick(rq); |
---|
3681 | 4288 | } |
---|
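scheduler_tick() now samples arch_scale_thermal_pressure() and feeds it into a PELT-style average via update_thermal_load_avg(). The sketch below only shows the headroom idea that consumers end up with, i.e. capacity left after thermal capping; the scale value and function name are illustrative, not the kernel's exact accounting.

```c
#include <stdio.h>

#define DEMO_CAPACITY_SCALE 1024	/* stand-in for SCHED_CAPACITY_SCALE */

static unsigned long demo_thermally_capped(unsigned long capacity_orig,
					   unsigned long thermal_pressure)
{
	if (thermal_pressure >= capacity_orig)
		return 0;
	return capacity_orig - thermal_pressure;
}

int main(void)
{
	/* a big core losing a quarter of its capacity to thermal capping */
	unsigned long cap = demo_thermally_capped(DEMO_CAPACITY_SCALE, 256);

	printf("effective capacity: %lu/%d\n", cap, DEMO_CAPACITY_SCALE);
	return 0;
}
```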
3682 | 4289 | |
---|
3683 | 4290 | #ifdef CONFIG_NO_HZ_FULL |
---|
.. | .. |
---|
3735 | 4342 | * statistics and checks timeslices in a time-independent way, regardless |
---|
3736 | 4343 | * of when exactly it is running. |
---|
3737 | 4344 | */ |
---|
3738 | | - if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) |
---|
| 4345 | + if (!tick_nohz_tick_stopped_cpu(cpu)) |
---|
3739 | 4346 | goto out_requeue; |
---|
3740 | 4347 | |
---|
3741 | 4348 | rq_lock_irq(rq, &rf); |
---|
3742 | 4349 | curr = rq->curr; |
---|
3743 | | - if (is_idle_task(curr) || cpu_is_offline(cpu)) |
---|
| 4350 | + if (cpu_is_offline(cpu)) |
---|
3744 | 4351 | goto out_unlock; |
---|
3745 | 4352 | |
---|
3746 | 4353 | update_rq_clock(rq); |
---|
3747 | | - delta = rq_clock_task(rq) - curr->se.exec_start; |
---|
3748 | 4354 | |
---|
3749 | | - /* |
---|
3750 | | - * Make sure the next tick runs within a reasonable |
---|
3751 | | - * amount of time. |
---|
3752 | | - */ |
---|
3753 | | - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); |
---|
| 4355 | + if (!is_idle_task(curr)) { |
---|
| 4356 | + /* |
---|
| 4357 | + * Make sure the next tick runs within a reasonable |
---|
| 4358 | + * amount of time. |
---|
| 4359 | + */ |
---|
| 4360 | + delta = rq_clock_task(rq) - curr->se.exec_start; |
---|
| 4361 | + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); |
---|
| 4362 | + } |
---|
3754 | 4363 | curr->sched_class->task_tick(rq, curr, 0); |
---|
3755 | 4364 | |
---|
| 4365 | + calc_load_nohz_remote(rq); |
---|
3756 | 4366 | out_unlock: |
---|
3757 | 4367 | rq_unlock_irq(rq, &rf); |
---|
3758 | | - |
---|
3759 | 4368 | out_requeue: |
---|
| 4369 | + |
---|
3760 | 4370 | /* |
---|
3761 | 4371 | * Run the remote tick once per second (1Hz). This arbitrary |
---|
3762 | 4372 | * frequency is large enough to avoid overload but short enough |
---|
.. | .. |
---|
3820 | 4430 | static inline void sched_tick_stop(int cpu) { } |
---|
3821 | 4431 | #endif |
---|
3822 | 4432 | |
---|
3823 | | -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
---|
| 4433 | +#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
---|
3824 | 4434 | defined(CONFIG_TRACE_PREEMPT_TOGGLE)) |
---|
3825 | 4435 | /* |
---|
3826 | 4436 | * If the value passed in is equal to the current preempt count |
---|
.. | .. |
---|
3926 | 4536 | if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) |
---|
3927 | 4537 | && in_atomic_preempt_off()) { |
---|
3928 | 4538 | pr_err("Preemption disabled at:"); |
---|
3929 | | - print_ip_sym(preempt_disable_ip); |
---|
3930 | | - pr_cont("\n"); |
---|
| 4539 | + print_ip_sym(KERN_ERR, preempt_disable_ip); |
---|
3931 | 4540 | } |
---|
3932 | | - if (panic_on_warn) |
---|
3933 | | - panic("scheduling while atomic\n"); |
---|
| 4541 | + check_panic_on_warn("scheduling while atomic"); |
---|
| 4542 | + |
---|
| 4543 | + trace_android_rvh_schedule_bug(prev); |
---|
3934 | 4544 | |
---|
3935 | 4545 | dump_stack(); |
---|
3936 | 4546 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); |
---|
.. | .. |
---|
3939 | 4549 | /* |
---|
3940 | 4550 | * Various schedule()-time debugging checks and statistics: |
---|
3941 | 4551 | */ |
---|
3942 | | -static inline void schedule_debug(struct task_struct *prev) |
---|
| 4552 | +static inline void schedule_debug(struct task_struct *prev, bool preempt) |
---|
3943 | 4553 | { |
---|
3944 | 4554 | #ifdef CONFIG_SCHED_STACK_END_CHECK |
---|
3945 | 4555 | if (task_stack_end_corrupted(prev)) |
---|
3946 | 4556 | panic("corrupted stack end detected inside scheduler\n"); |
---|
| 4557 | + |
---|
| 4558 | + if (task_scs_end_corrupted(prev)) |
---|
| 4559 | + panic("corrupted shadow stack detected inside scheduler\n"); |
---|
| 4560 | +#endif |
---|
| 4561 | + |
---|
| 4562 | +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP |
---|
| 4563 | + if (!preempt && prev->state && prev->non_block_count) { |
---|
| 4564 | + printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", |
---|
| 4565 | + prev->comm, prev->pid, prev->non_block_count); |
---|
| 4566 | + dump_stack(); |
---|
| 4567 | + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); |
---|
| 4568 | + } |
---|
3947 | 4569 | #endif |
---|
3948 | 4570 | |
---|
3949 | 4571 | if (unlikely(in_atomic_preempt_off())) { |
---|
.. | .. |
---|
3955 | 4577 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
---|
3956 | 4578 | |
---|
3957 | 4579 | schedstat_inc(this_rq()->sched_count); |
---|
| 4580 | +} |
---|
| 4581 | + |
---|
| 4582 | +static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, |
---|
| 4583 | + struct rq_flags *rf) |
---|
| 4584 | +{ |
---|
| 4585 | +#ifdef CONFIG_SMP |
---|
| 4586 | + const struct sched_class *class; |
---|
| 4587 | + /* |
---|
| 4588 | + * We must do the balancing pass before put_prev_task(), such |
---|
| 4589 | + * that when we release the rq->lock the task is in the same |
---|
| 4590 | + * state as before we took rq->lock. |
---|
| 4591 | + * |
---|
| 4592 | + * We can terminate the balance pass as soon as we know there is |
---|
| 4593 | + * a runnable task of @class priority or higher. |
---|
| 4594 | + */ |
---|
| 4595 | + for_class_range(class, prev->sched_class, &idle_sched_class) { |
---|
| 4596 | + if (class->balance(rq, prev, rf)) |
---|
| 4597 | + break; |
---|
| 4598 | + } |
---|
| 4599 | +#endif |
---|
| 4600 | + |
---|
| 4601 | + put_prev_task(rq, prev); |
---|
3958 | 4602 | } |
---|
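put_prev_task_balance() walks the classes from prev's class downwards and stops at the first one reporting runnable work of that priority or higher. A toy illustration of that early-terminating walk (the class list and has_runnable() callbacks are made up):

```c
#include <stdbool.h>
#include <stdio.h>

struct toy_class {
	const char *name;
	bool (*has_runnable)(void);
};

static bool no_work(void)  { return false; }
static bool has_work(void) { return true; }

/* highest priority first; walking forward means walking down in priority */
static const struct toy_class toy_classes[] = {
	{ "dl", no_work }, { "rt", has_work }, { "fair", no_work },
};
#define TOY_NR_CLASSES ((int)(sizeof(toy_classes) / sizeof(toy_classes[0])))

static void toy_balance_pass(int prev_idx)
{
	for (int i = prev_idx; i < TOY_NR_CLASSES; i++) {
		printf("balance %s\n", toy_classes[i].name);
		if (toy_classes[i].has_runnable())
			break;	/* something of this priority (or higher) will run */
	}
}

int main(void)
{
	toy_balance_pass(0);	/* balances dl, then rt, then stops before fair */
	return 0;
}
```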
3959 | 4603 | |
---|
3960 | 4604 | /* |
---|
.. | .. |
---|
3972 | 4616 | * higher scheduling class, because otherwise those lose the
---|
3973 | 4617 | * opportunity to pull in more work from other CPUs. |
---|
3974 | 4618 | */ |
---|
3975 | | - if (likely((prev->sched_class == &idle_sched_class || |
---|
3976 | | - prev->sched_class == &fair_sched_class) && |
---|
| 4619 | + if (likely(prev->sched_class <= &fair_sched_class && |
---|
3977 | 4620 | rq->nr_running == rq->cfs.h_nr_running)) { |
---|
3978 | 4621 | |
---|
3979 | | - p = fair_sched_class.pick_next_task(rq, prev, rf); |
---|
| 4622 | + p = pick_next_task_fair(rq, prev, rf); |
---|
3980 | 4623 | if (unlikely(p == RETRY_TASK)) |
---|
3981 | | - goto again; |
---|
| 4624 | + goto restart; |
---|
3982 | 4625 | |
---|
3983 | 4626 | /* Assumes fair_sched_class->next == idle_sched_class */ |
---|
3984 | | - if (unlikely(!p)) |
---|
3985 | | - p = idle_sched_class.pick_next_task(rq, prev, rf); |
---|
| 4627 | + if (!p) { |
---|
| 4628 | + put_prev_task(rq, prev); |
---|
| 4629 | + p = pick_next_task_idle(rq); |
---|
| 4630 | + } |
---|
3986 | 4631 | |
---|
3987 | 4632 | return p; |
---|
3988 | 4633 | } |
---|
3989 | 4634 | |
---|
3990 | | -again: |
---|
| 4635 | +restart: |
---|
| 4636 | + put_prev_task_balance(rq, prev, rf); |
---|
| 4637 | + |
---|
3991 | 4638 | for_each_class(class) { |
---|
3992 | | - p = class->pick_next_task(rq, prev, rf); |
---|
3993 | | - if (p) { |
---|
3994 | | - if (unlikely(p == RETRY_TASK)) |
---|
3995 | | - goto again; |
---|
| 4639 | + p = class->pick_next_task(rq); |
---|
| 4640 | + if (p) |
---|
3996 | 4641 | return p; |
---|
3997 | | - } |
---|
3998 | 4642 | } |
---|
3999 | 4643 | |
---|
4000 | 4644 | /* The idle class should always have a runnable task: */ |
---|
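The rewritten fast path replaces two pointer equality tests with `prev->sched_class <= &fair_sched_class`, which only works if the class descriptors sit in memory in priority order; the kernel arranges that ordering at link time. A standalone sketch of the comparison trick, using an ordinary array instead of linker sections (all names here are invented):

```c
#include <stdbool.h>
#include <stdio.h>

struct demo_sched_class { const char *name; };

/* lowest priority at the lowest address, mirroring the link-time layout */
static const struct demo_sched_class demo_classes[] = {
	{ "idle" }, { "fair" }, { "rt" }, { "dl" }, { "stop" },
};
static const struct demo_sched_class *const demo_fair_class = &demo_classes[1];

static bool demo_fair_or_idle(const struct demo_sched_class *class)
{
	return class <= demo_fair_class;	/* one compare instead of two */
}

int main(void)
{
	printf("rt on fast path? %d\n", demo_fair_or_idle(&demo_classes[2]));	/* 0 */
	printf("idle on fast path? %d\n", demo_fair_or_idle(&demo_classes[0]));	/* 1 */
	return 0;
}
```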
.. | .. |
---|
4021 | 4665 | * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets |
---|
4022 | 4666 | * called on the nearest possible occasion: |
---|
4023 | 4667 | * |
---|
4024 | | - * - If the kernel is preemptible (CONFIG_PREEMPT=y): |
---|
| 4668 | + * - If the kernel is preemptible (CONFIG_PREEMPTION=y): |
---|
4025 | 4669 | * |
---|
4026 | 4670 | * - in syscall or exception context, at the next outermost
---|
4027 | 4671 | * preempt_enable(). (this might be as soon as the wake_up()'s |
---|
.. | .. |
---|
4030 | 4674 | * - in IRQ context, return from interrupt-handler to |
---|
4031 | 4675 | * preemptible context |
---|
4032 | 4676 | * |
---|
4033 | | - * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) |
---|
| 4677 | + * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) |
---|
4034 | 4678 | * then at the next: |
---|
4035 | 4679 | * |
---|
4036 | 4680 | * - cond_resched() call |
---|
.. | .. |
---|
4044 | 4688 | { |
---|
4045 | 4689 | struct task_struct *prev, *next; |
---|
4046 | 4690 | unsigned long *switch_count; |
---|
| 4691 | + unsigned long prev_state; |
---|
4047 | 4692 | struct rq_flags rf; |
---|
4048 | 4693 | struct rq *rq; |
---|
4049 | 4694 | int cpu; |
---|
.. | .. |
---|
4052 | 4697 | rq = cpu_rq(cpu); |
---|
4053 | 4698 | prev = rq->curr; |
---|
4054 | 4699 | |
---|
4055 | | - schedule_debug(prev); |
---|
| 4700 | + schedule_debug(prev, preempt); |
---|
4056 | 4701 | |
---|
4057 | 4702 | if (sched_feat(HRTICK)) |
---|
4058 | 4703 | hrtick_clear(rq); |
---|
.. | .. |
---|
4063 | 4708 | /* |
---|
4064 | 4709 | * Make sure that signal_pending_state()->signal_pending() below |
---|
4065 | 4710 | * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) |
---|
4066 | | - * done by the caller to avoid the race with signal_wake_up(). |
---|
| 4711 | + * done by the caller to avoid the race with signal_wake_up(): |
---|
4067 | 4712 | * |
---|
4068 | | - * The membarrier system call requires a full memory barrier |
---|
| 4713 | + * __set_current_state(@state) signal_wake_up() |
---|
| 4714 | + * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) |
---|
| 4715 | + * wake_up_state(p, state) |
---|
| 4716 | + * LOCK rq->lock LOCK p->pi_state |
---|
| 4717 | + * smp_mb__after_spinlock() smp_mb__after_spinlock() |
---|
| 4718 | + * if (signal_pending_state()) if (p->state & @state) |
---|
| 4719 | + * |
---|
| 4720 | + * Also, the membarrier system call requires a full memory barrier |
---|
4069 | 4721 | * after coming from user-space, before storing to rq->curr. |
---|
4070 | 4722 | */ |
---|
4071 | 4723 | rq_lock(rq, &rf); |
---|
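The expanded comment diagrams why the sleeper's state store and the waker's condition store must both be ordered before the opposite side's check. Below is a userspace analogue of that lost-wakeup argument, using seq_cst atomics instead of the kernel's rq->lock/pi_lock barriers (the function and variable names are invented):

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool want_sleep;	/* sleeper's advertised state */
static atomic_bool condition;	/* event the sleeper waits for */

static bool sleeper_must_block(void)
{
	atomic_store(&want_sleep, true);	/* __set_current_state(@state) */
	return !atomic_load(&condition);	/* the re-check before schedule() */
}

static bool waker_must_wake(void)
{
	atomic_store(&condition, true);		/* publish the event */
	return atomic_load(&want_sleep);	/* wake_up_state() style check */
}

int main(void)
{
	/*
	 * However the two sides interleave, the lost-wakeup outcome
	 * (sleeper blocks while the waker sees no sleeper) cannot occur.
	 */
	printf("sleeper blocks: %d, waker wakes: %d\n",
	       sleeper_must_block(), waker_must_wake());
	return 0;
}
```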
.. | .. |
---|
4076 | 4728 | update_rq_clock(rq); |
---|
4077 | 4729 | |
---|
4078 | 4730 | switch_count = &prev->nivcsw; |
---|
4079 | | - if (!preempt && prev->state) { |
---|
4080 | | - if (unlikely(signal_pending_state(prev->state, prev))) { |
---|
| 4731 | + |
---|
| 4732 | + /* |
---|
| 4733 | + * We must load prev->state once (task_struct::state is volatile), such |
---|
| 4734 | + * that: |
---|
| 4735 | + * |
---|
| 4736 | + * - we form a control dependency vs deactivate_task() below. |
---|
| 4737 | + * - ptrace_{,un}freeze_traced() can change ->state underneath us. |
---|
| 4738 | + */ |
---|
| 4739 | + prev_state = prev->state; |
---|
| 4740 | + if (!preempt && prev_state) { |
---|
| 4741 | + if (signal_pending_state(prev_state, prev)) { |
---|
4081 | 4742 | prev->state = TASK_RUNNING; |
---|
4082 | 4743 | } else { |
---|
| 4744 | + prev->sched_contributes_to_load = |
---|
| 4745 | + (prev_state & TASK_UNINTERRUPTIBLE) && |
---|
| 4746 | + !(prev_state & TASK_NOLOAD) && |
---|
| 4747 | + !(prev->flags & PF_FROZEN); |
---|
| 4748 | + |
---|
| 4749 | + if (prev->sched_contributes_to_load) |
---|
| 4750 | + rq->nr_uninterruptible++; |
---|
| 4751 | + |
---|
| 4752 | + /* |
---|
| 4753 | + * __schedule() ttwu() |
---|
| 4754 | + * prev_state = prev->state; if (p->on_rq && ...) |
---|
| 4755 | + * if (prev_state) goto out; |
---|
| 4756 | + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); |
---|
| 4757 | + * p->state = TASK_WAKING |
---|
| 4758 | + * |
---|
| 4759 | + * Where __schedule() and ttwu() have matching control dependencies. |
---|
| 4760 | + * |
---|
| 4761 | + * After this, schedule() must not care about p->state any more. |
---|
| 4762 | + */ |
---|
4083 | 4763 | deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); |
---|
4084 | | - prev->on_rq = 0; |
---|
4085 | 4764 | |
---|
4086 | 4765 | if (prev->in_iowait) { |
---|
4087 | 4766 | atomic_inc(&rq->nr_iowait); |
---|
4088 | 4767 | delayacct_blkio_start(); |
---|
4089 | | - } |
---|
4090 | | - |
---|
4091 | | - /* |
---|
4092 | | - * If a worker went to sleep, notify and ask workqueue |
---|
4093 | | - * whether it wants to wake up a task to maintain |
---|
4094 | | - * concurrency. |
---|
4095 | | - */ |
---|
4096 | | - if (prev->flags & PF_WQ_WORKER) { |
---|
4097 | | - struct task_struct *to_wakeup; |
---|
4098 | | - |
---|
4099 | | - to_wakeup = wq_worker_sleeping(prev); |
---|
4100 | | - if (to_wakeup) |
---|
4101 | | - try_to_wake_up_local(to_wakeup, &rf); |
---|
4102 | 4768 | } |
---|
4103 | 4769 | } |
---|
4104 | 4770 | switch_count = &prev->nvcsw; |
---|
.. | .. |
---|
4108 | 4774 | clear_tsk_need_resched(prev); |
---|
4109 | 4775 | clear_preempt_need_resched(); |
---|
4110 | 4776 | |
---|
| 4777 | + trace_android_rvh_schedule(prev, next, rq); |
---|
4111 | 4778 | if (likely(prev != next)) { |
---|
4112 | 4779 | rq->nr_switches++; |
---|
4113 | | - rq->curr = next; |
---|
| 4780 | + /* |
---|
| 4781 | + * RCU users of rcu_dereference(rq->curr) may not see |
---|
| 4782 | + * changes to task_struct made by pick_next_task(). |
---|
| 4783 | + */ |
---|
| 4784 | + RCU_INIT_POINTER(rq->curr, next); |
---|
4114 | 4785 | /* |
---|
4115 | 4786 | * The membarrier system call requires each architecture |
---|
4116 | 4787 | * to have a full memory barrier after updating |
---|
.. | .. |
---|
4126 | 4797 | * is a RELEASE barrier), |
---|
4127 | 4798 | */ |
---|
4128 | 4799 | ++*switch_count; |
---|
| 4800 | + |
---|
| 4801 | + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); |
---|
4129 | 4802 | |
---|
4130 | 4803 | trace_sched_switch(preempt, prev, next); |
---|
4131 | 4804 | |
---|
.. | .. |
---|
4157 | 4830 | |
---|
4158 | 4831 | static inline void sched_submit_work(struct task_struct *tsk) |
---|
4159 | 4832 | { |
---|
4160 | | - if (!tsk->state || tsk_is_pi_blocked(tsk)) |
---|
| 4833 | + unsigned int task_flags; |
---|
| 4834 | + |
---|
| 4835 | + if (!tsk->state) |
---|
4161 | 4836 | return; |
---|
| 4837 | + |
---|
| 4838 | + task_flags = tsk->flags; |
---|
| 4839 | + /* |
---|
| 4840 | + * If a worker went to sleep, notify and ask workqueue whether |
---|
| 4841 | + * it wants to wake up a task to maintain concurrency. |
---|
| 4842 | + * As this function is called inside the schedule() context, |
---|
| 4843 | + * we disable preemption to avoid it calling schedule() again |
---|
| 4844 | + * in the possible wakeup of a kworker and because wq_worker_sleeping() |
---|
| 4845 | + * requires it. |
---|
| 4846 | + */ |
---|
| 4847 | + if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { |
---|
| 4848 | + preempt_disable(); |
---|
| 4849 | + if (task_flags & PF_WQ_WORKER) |
---|
| 4850 | + wq_worker_sleeping(tsk); |
---|
| 4851 | + else |
---|
| 4852 | + io_wq_worker_sleeping(tsk); |
---|
| 4853 | + preempt_enable_no_resched(); |
---|
| 4854 | + } |
---|
| 4855 | + |
---|
| 4856 | + if (tsk_is_pi_blocked(tsk)) |
---|
| 4857 | + return; |
---|
| 4858 | + |
---|
4162 | 4859 | /* |
---|
4163 | 4860 | * If we are going to sleep and we have plugged IO queued, |
---|
4164 | 4861 | * make sure to submit it to avoid deadlocks. |
---|
4165 | 4862 | */ |
---|
4166 | 4863 | if (blk_needs_flush_plug(tsk)) |
---|
4167 | 4864 | blk_schedule_flush_plug(tsk); |
---|
| 4865 | +} |
---|
| 4866 | + |
---|
| 4867 | +static void sched_update_worker(struct task_struct *tsk) |
---|
| 4868 | +{ |
---|
| 4869 | + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { |
---|
| 4870 | + if (tsk->flags & PF_WQ_WORKER) |
---|
| 4871 | + wq_worker_running(tsk); |
---|
| 4872 | + else |
---|
| 4873 | + io_wq_worker_running(tsk); |
---|
| 4874 | + } |
---|
4168 | 4875 | } |
---|
4169 | 4876 | |
---|
4170 | 4877 | asmlinkage __visible void __sched schedule(void) |
---|
.. | .. |
---|
4177 | 4884 | __schedule(false); |
---|
4178 | 4885 | sched_preempt_enable_no_resched(); |
---|
4179 | 4886 | } while (need_resched()); |
---|
| 4887 | + sched_update_worker(tsk); |
---|
4180 | 4888 | } |
---|
4181 | 4889 | EXPORT_SYMBOL(schedule); |
---|
4182 | 4890 | |
---|
.. | .. |
---|
4265 | 4973 | } while (need_resched()); |
---|
4266 | 4974 | } |
---|
4267 | 4975 | |
---|
4268 | | -#ifdef CONFIG_PREEMPT |
---|
| 4976 | +#ifdef CONFIG_PREEMPTION |
---|
4269 | 4977 | /* |
---|
4270 | | - * this is the entry point to schedule() from in-kernel preemption |
---|
4271 | | - * off of preempt_enable. Kernel preemptions off return from interrupt |
---|
4272 | | - * occur there and call schedule directly. |
---|
| 4978 | + * This is the entry point to schedule() from in-kernel preemption |
---|
| 4979 | + * off of preempt_enable. |
---|
4273 | 4980 | */ |
---|
4274 | 4981 | asmlinkage __visible void __sched notrace preempt_schedule(void) |
---|
4275 | 4982 | { |
---|
.. | .. |
---|
4337 | 5044 | } |
---|
4338 | 5045 | EXPORT_SYMBOL_GPL(preempt_schedule_notrace); |
---|
4339 | 5046 | |
---|
4340 | | -#endif /* CONFIG_PREEMPT */ |
---|
| 5047 | +#endif /* CONFIG_PREEMPTION */ |
---|
4341 | 5048 | |
---|
4342 | 5049 | /* |
---|
4343 | | - * this is the entry point to schedule() from kernel preemption |
---|
| 5050 | + * This is the entry point to schedule() from kernel preemption |
---|
4344 | 5051 | * off of irq context. |
---|
4345 | 5052 | * Note that this is called and returns with irqs disabled. This will
---|
4346 | 5053 | * protect us against recursive calling from irq. |
---|
.. | .. |
---|
4368 | 5075 | int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, |
---|
4369 | 5076 | void *key) |
---|
4370 | 5077 | { |
---|
4371 | | - return try_to_wake_up(curr->private, mode, wake_flags, 1); |
---|
| 5078 | + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC | WF_ANDROID_VENDOR)); |
---|
| 5079 | + return try_to_wake_up(curr->private, mode, wake_flags); |
---|
4372 | 5080 | } |
---|
4373 | 5081 | EXPORT_SYMBOL(default_wake_function); |
---|
| 5082 | + |
---|
| 5083 | +static void __setscheduler_prio(struct task_struct *p, int prio) |
---|
| 5084 | +{ |
---|
| 5085 | + if (dl_prio(prio)) |
---|
| 5086 | + p->sched_class = &dl_sched_class; |
---|
| 5087 | + else if (rt_prio(prio)) |
---|
| 5088 | + p->sched_class = &rt_sched_class; |
---|
| 5089 | + else |
---|
| 5090 | + p->sched_class = &fair_sched_class; |
---|
| 5091 | + |
---|
| 5092 | + p->prio = prio; |
---|
| 5093 | +} |
---|
4374 | 5094 | |
---|
4375 | 5095 | #ifdef CONFIG_RT_MUTEXES |
---|
4376 | 5096 | |
---|
.. | .. |
---|
4408 | 5128 | struct rq_flags rf; |
---|
4409 | 5129 | struct rq *rq; |
---|
4410 | 5130 | |
---|
| 5131 | + trace_android_rvh_rtmutex_prepare_setprio(p, pi_task); |
---|
4411 | 5132 | /* XXX used to be waiter->prio, not waiter->task->prio */ |
---|
4412 | 5133 | prio = __rt_effective_prio(pi_task, p->normal_prio); |
---|
4413 | 5134 | |
---|
.. | .. |
---|
4482 | 5203 | if (!dl_prio(p->normal_prio) || |
---|
4483 | 5204 | (pi_task && dl_prio(pi_task->prio) && |
---|
4484 | 5205 | dl_entity_preempt(&pi_task->dl, &p->dl))) { |
---|
4485 | | - p->dl.dl_boosted = 1; |
---|
| 5206 | + p->dl.pi_se = pi_task->dl.pi_se; |
---|
4486 | 5207 | queue_flag |= ENQUEUE_REPLENISH; |
---|
4487 | | - } else |
---|
4488 | | - p->dl.dl_boosted = 0; |
---|
4489 | | - p->sched_class = &dl_sched_class; |
---|
| 5208 | + } else { |
---|
| 5209 | + p->dl.pi_se = &p->dl; |
---|
| 5210 | + } |
---|
4490 | 5211 | } else if (rt_prio(prio)) { |
---|
4491 | 5212 | if (dl_prio(oldprio)) |
---|
4492 | | - p->dl.dl_boosted = 0; |
---|
| 5213 | + p->dl.pi_se = &p->dl; |
---|
4493 | 5214 | if (oldprio < prio) |
---|
4494 | 5215 | queue_flag |= ENQUEUE_HEAD; |
---|
4495 | | - p->sched_class = &rt_sched_class; |
---|
4496 | 5216 | } else { |
---|
4497 | 5217 | if (dl_prio(oldprio)) |
---|
4498 | | - p->dl.dl_boosted = 0; |
---|
| 5218 | + p->dl.pi_se = &p->dl; |
---|
4499 | 5219 | if (rt_prio(oldprio)) |
---|
4500 | 5220 | p->rt.timeout = 0; |
---|
4501 | | - p->sched_class = &fair_sched_class; |
---|
4502 | 5221 | } |
---|
4503 | 5222 | |
---|
4504 | | - p->prio = prio; |
---|
| 5223 | + __setscheduler_prio(p, prio); |
---|
4505 | 5224 | |
---|
4506 | 5225 | if (queued) |
---|
4507 | 5226 | enqueue_task(rq, p, queue_flag); |
---|
4508 | 5227 | if (running) |
---|
4509 | | - set_curr_task(rq, p); |
---|
| 5228 | + set_next_task(rq, p); |
---|
4510 | 5229 | |
---|
4511 | 5230 | check_class_changed(rq, p, prev_class, oldprio); |
---|
4512 | 5231 | out_unlock: |
---|
.. | .. |
---|
4526 | 5245 | |
---|
4527 | 5246 | void set_user_nice(struct task_struct *p, long nice) |
---|
4528 | 5247 | { |
---|
4529 | | - bool queued, running; |
---|
4530 | | - int old_prio, delta; |
---|
| 5248 | + bool queued, running, allowed = false; |
---|
| 5249 | + int old_prio; |
---|
4531 | 5250 | struct rq_flags rf; |
---|
4532 | 5251 | struct rq *rq; |
---|
4533 | 5252 | |
---|
4534 | | - if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) |
---|
| 5253 | + trace_android_rvh_set_user_nice(p, &nice, &allowed); |
---|
| 5254 | + if ((task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) && !allowed) |
---|
4535 | 5255 | return; |
---|
4536 | 5256 | /* |
---|
4537 | 5257 | * We have to be careful, if called from sys_setpriority(), |
---|
.. | .. |
---|
4558 | 5278 | put_prev_task(rq, p); |
---|
4559 | 5279 | |
---|
4560 | 5280 | p->static_prio = NICE_TO_PRIO(nice); |
---|
4561 | | - set_load_weight(p, true); |
---|
| 5281 | + set_load_weight(p); |
---|
4562 | 5282 | old_prio = p->prio; |
---|
4563 | 5283 | p->prio = effective_prio(p); |
---|
4564 | | - delta = p->prio - old_prio; |
---|
4565 | 5284 | |
---|
4566 | | - if (queued) { |
---|
| 5285 | + if (queued) |
---|
4567 | 5286 | enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); |
---|
4568 | | - /* |
---|
4569 | | - * If the task increased its priority or is running and |
---|
4570 | | - * lowered its priority, then reschedule its CPU: |
---|
4571 | | - */ |
---|
4572 | | - if (delta < 0 || (delta > 0 && task_running(rq, p))) |
---|
4573 | | - resched_curr(rq); |
---|
4574 | | - } |
---|
4575 | 5287 | if (running) |
---|
4576 | | - set_curr_task(rq, p); |
---|
| 5288 | + set_next_task(rq, p); |
---|
| 5289 | + |
---|
| 5290 | + /* |
---|
| 5291 | + * If the task increased its priority or is running and |
---|
| 5292 | + * lowered its priority, then reschedule its CPU: |
---|
| 5293 | + */ |
---|
| 5294 | + p->sched_class->prio_changed(rq, p, old_prio); |
---|
| 5295 | + |
---|
4577 | 5296 | out_unlock: |
---|
4578 | 5297 | task_rq_unlock(rq, p, &rf); |
---|
4579 | 5298 | } |
---|
.. | .. |
---|
4658 | 5377 | return 0; |
---|
4659 | 5378 | |
---|
4660 | 5379 | #ifdef CONFIG_SMP |
---|
4661 | | - if (!llist_empty(&rq->wake_list)) |
---|
| 5380 | + if (rq->ttwu_pending) |
---|
4662 | 5381 | return 0; |
---|
4663 | 5382 | #endif |
---|
4664 | 5383 | |
---|
.. | .. |
---|
4681 | 5400 | |
---|
4682 | 5401 | return 1; |
---|
4683 | 5402 | } |
---|
| 5403 | +EXPORT_SYMBOL_GPL(available_idle_cpu); |
---|
4684 | 5404 | |
---|
4685 | 5405 | /** |
---|
4686 | 5406 | * idle_task - return the idle task for a given CPU. |
---|
.. | .. |
---|
4732 | 5452 | */ |
---|
4733 | 5453 | p->rt_priority = attr->sched_priority; |
---|
4734 | 5454 | p->normal_prio = normal_prio(p); |
---|
4735 | | - set_load_weight(p, true); |
---|
4736 | | -} |
---|
4737 | | - |
---|
4738 | | -/* Actually do priority change: must hold pi & rq lock. */ |
---|
4739 | | -static void __setscheduler(struct rq *rq, struct task_struct *p, |
---|
4740 | | - const struct sched_attr *attr, bool keep_boost) |
---|
4741 | | -{ |
---|
4742 | | - /* |
---|
4743 | | - * If params can't change scheduling class changes aren't allowed |
---|
4744 | | - * either. |
---|
4745 | | - */ |
---|
4746 | | - if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) |
---|
4747 | | - return; |
---|
4748 | | - |
---|
4749 | | - __setscheduler_params(p, attr); |
---|
4750 | | - |
---|
4751 | | - /* |
---|
4752 | | - * Keep a potential priority boosting if called from |
---|
4753 | | - * sched_setscheduler(). |
---|
4754 | | - */ |
---|
4755 | | - p->prio = normal_prio(p); |
---|
4756 | | - if (keep_boost) |
---|
4757 | | - p->prio = rt_effective_prio(p, p->prio); |
---|
4758 | | - |
---|
4759 | | - if (dl_prio(p->prio)) |
---|
4760 | | - p->sched_class = &dl_sched_class; |
---|
4761 | | - else if (rt_prio(p->prio)) |
---|
4762 | | - p->sched_class = &rt_sched_class; |
---|
4763 | | - else |
---|
4764 | | - p->sched_class = &fair_sched_class; |
---|
| 5455 | + set_load_weight(p); |
---|
4765 | 5456 | } |
---|
4766 | 5457 | |
---|
4767 | 5458 | /* |
---|
.. | .. |
---|
4784 | 5475 | const struct sched_attr *attr, |
---|
4785 | 5476 | bool user, bool pi) |
---|
4786 | 5477 | { |
---|
4787 | | - int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : |
---|
4788 | | - MAX_RT_PRIO - 1 - attr->sched_priority; |
---|
4789 | | - int retval, oldprio, oldpolicy = -1, queued, running; |
---|
4790 | | - int new_effective_prio, policy = attr->sched_policy; |
---|
| 5478 | + int oldpolicy = -1, policy = attr->sched_policy; |
---|
| 5479 | + int retval, oldprio, newprio, queued, running; |
---|
4791 | 5480 | const struct sched_class *prev_class; |
---|
4792 | 5481 | struct rq_flags rf; |
---|
4793 | 5482 | int reset_on_fork; |
---|
4794 | 5483 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; |
---|
4795 | 5484 | struct rq *rq; |
---|
| 5485 | + bool cpuset_locked = false; |
---|
4796 | 5486 | |
---|
4797 | 5487 | /* The pi code expects interrupts enabled */ |
---|
4798 | 5488 | BUG_ON(pi && in_interrupt()); |
---|
.. | .. |
---|
4860 | 5550 | * Treat SCHED_IDLE as nice 20. Only allow a switch to |
---|
4861 | 5551 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. |
---|
4862 | 5552 | */ |
---|
4863 | | - if (idle_policy(p->policy) && !idle_policy(policy)) { |
---|
| 5553 | + if (task_has_idle_policy(p) && !idle_policy(policy)) { |
---|
4864 | 5554 | if (!can_nice(p, task_nice(p))) |
---|
4865 | 5555 | return -EPERM; |
---|
4866 | 5556 | } |
---|
.. | .. |
---|
4871 | 5561 | |
---|
4872 | 5562 | /* Normal users shall not reset the sched_reset_on_fork flag: */ |
---|
4873 | 5563 | if (p->sched_reset_on_fork && !reset_on_fork) |
---|
| 5564 | + return -EPERM; |
---|
| 5565 | + |
---|
| 5566 | + /* Can't change util-clamps */ |
---|
| 5567 | + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) |
---|
4874 | 5568 | return -EPERM; |
---|
4875 | 5569 | } |
---|
4876 | 5570 | |
---|
.. | .. |
---|
4891 | 5585 | } |
---|
4892 | 5586 | |
---|
4893 | 5587 | /* |
---|
| 5588 | + * SCHED_DEADLINE bandwidth accounting relies on stable cpusets |
---|
| 5589 | + * information. |
---|
| 5590 | + */ |
---|
| 5591 | + if (dl_policy(policy) || dl_policy(p->policy)) { |
---|
| 5592 | + cpuset_locked = true; |
---|
| 5593 | + cpuset_lock(); |
---|
| 5594 | + } |
---|
| 5595 | + |
---|
| 5596 | + /* |
---|
4894 | 5597 | * Make sure no PI-waiters arrive (or leave) while we are |
---|
4895 | 5598 | * changing the priority of the task: |
---|
4896 | 5599 | * |
---|
.. | .. |
---|
4904 | 5607 | * Changing the policy of the stop threads is a very bad idea:
---|
4905 | 5608 | */ |
---|
4906 | 5609 | if (p == rq->stop) { |
---|
4907 | | - task_rq_unlock(rq, p, &rf); |
---|
4908 | | - return -EINVAL; |
---|
| 5610 | + retval = -EINVAL; |
---|
| 5611 | + goto unlock; |
---|
4909 | 5612 | } |
---|
4910 | 5613 | |
---|
4911 | 5614 | /* |
---|
.. | .. |
---|
4923 | 5626 | goto change; |
---|
4924 | 5627 | |
---|
4925 | 5628 | p->sched_reset_on_fork = reset_on_fork; |
---|
4926 | | - task_rq_unlock(rq, p, &rf); |
---|
4927 | | - return 0; |
---|
| 5629 | + retval = 0; |
---|
| 5630 | + goto unlock; |
---|
4928 | 5631 | } |
---|
4929 | 5632 | change: |
---|
4930 | 5633 | |
---|
.. | .. |
---|
4937 | 5640 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
---|
4938 | 5641 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
---|
4939 | 5642 | !task_group_is_autogroup(task_group(p))) { |
---|
4940 | | - task_rq_unlock(rq, p, &rf); |
---|
4941 | | - return -EPERM; |
---|
| 5643 | + retval = -EPERM; |
---|
| 5644 | + goto unlock; |
---|
4942 | 5645 | } |
---|
4943 | 5646 | #endif |
---|
4944 | 5647 | #ifdef CONFIG_SMP |
---|
.. | .. |
---|
4951 | 5654 | * the entire root_domain to become SCHED_DEADLINE. We |
---|
4952 | 5655 | * will also fail if there's no bandwidth available. |
---|
4953 | 5656 | */ |
---|
4954 | | - if (!cpumask_subset(span, &p->cpus_allowed) || |
---|
| 5657 | + if (!cpumask_subset(span, p->cpus_ptr) || |
---|
4955 | 5658 | rq->rd->dl_bw.bw == 0) { |
---|
4956 | | - task_rq_unlock(rq, p, &rf); |
---|
4957 | | - return -EPERM; |
---|
| 5659 | + retval = -EPERM; |
---|
| 5660 | + goto unlock; |
---|
4958 | 5661 | } |
---|
4959 | 5662 | } |
---|
4960 | 5663 | #endif |
---|
.. | .. |
---|
4964 | 5667 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
---|
4965 | 5668 | policy = oldpolicy = -1; |
---|
4966 | 5669 | task_rq_unlock(rq, p, &rf); |
---|
| 5670 | + if (cpuset_locked) |
---|
| 5671 | + cpuset_unlock(); |
---|
4967 | 5672 | goto recheck; |
---|
4968 | 5673 | } |
---|
4969 | 5674 | |
---|
.. | .. |
---|
4973 | 5678 | * is available. |
---|
4974 | 5679 | */ |
---|
4975 | 5680 | if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { |
---|
4976 | | - task_rq_unlock(rq, p, &rf); |
---|
4977 | | - return -EBUSY; |
---|
| 5681 | + retval = -EBUSY; |
---|
| 5682 | + goto unlock; |
---|
4978 | 5683 | } |
---|
4979 | 5684 | |
---|
4980 | 5685 | p->sched_reset_on_fork = reset_on_fork; |
---|
4981 | 5686 | oldprio = p->prio; |
---|
4982 | 5687 | |
---|
| 5688 | + newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice); |
---|
4983 | 5689 | if (pi) { |
---|
4984 | 5690 | /* |
---|
4985 | 5691 | * Take priority boosted tasks into account. If the new |
---|
.. | .. |
---|
4988 | 5694 | * the runqueue. This will be done when the task deboost |
---|
4989 | 5695 | * itself. |
---|
4990 | 5696 | */ |
---|
4991 | | - new_effective_prio = rt_effective_prio(p, newprio); |
---|
4992 | | - if (new_effective_prio == oldprio) |
---|
| 5697 | + newprio = rt_effective_prio(p, newprio); |
---|
| 5698 | + if (newprio == oldprio) |
---|
4993 | 5699 | queue_flags &= ~DEQUEUE_MOVE; |
---|
4994 | 5700 | } |
---|
4995 | 5701 | |
---|
.. | .. |
---|
5002 | 5708 | |
---|
5003 | 5709 | prev_class = p->sched_class; |
---|
5004 | 5710 | |
---|
5005 | | - __setscheduler(rq, p, attr, pi); |
---|
| 5711 | + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { |
---|
| 5712 | + __setscheduler_params(p, attr); |
---|
| 5713 | + __setscheduler_prio(p, newprio); |
---|
| 5714 | + trace_android_rvh_setscheduler(p); |
---|
| 5715 | + } |
---|
5006 | 5716 | __setscheduler_uclamp(p, attr); |
---|
5007 | 5717 | |
---|
5008 | 5718 | if (queued) { |
---|
.. | .. |
---|
5016 | 5726 | enqueue_task(rq, p, queue_flags); |
---|
5017 | 5727 | } |
---|
5018 | 5728 | if (running) |
---|
5019 | | - set_curr_task(rq, p); |
---|
| 5729 | + set_next_task(rq, p); |
---|
5020 | 5730 | |
---|
5021 | 5731 | check_class_changed(rq, p, prev_class, oldprio); |
---|
5022 | 5732 | |
---|
.. | .. |
---|
5024 | 5734 | preempt_disable(); |
---|
5025 | 5735 | task_rq_unlock(rq, p, &rf); |
---|
5026 | 5736 | |
---|
5027 | | - if (pi) |
---|
| 5737 | + if (pi) { |
---|
| 5738 | + if (cpuset_locked) |
---|
| 5739 | + cpuset_unlock(); |
---|
5028 | 5740 | rt_mutex_adjust_pi(p); |
---|
| 5741 | + } |
---|
5029 | 5742 | |
---|
5030 | 5743 | /* Run balance callbacks after we've adjusted the PI chain: */ |
---|
5031 | 5744 | balance_callback(rq); |
---|
5032 | 5745 | preempt_enable(); |
---|
5033 | 5746 | |
---|
5034 | 5747 | return 0; |
---|
| 5748 | + |
---|
| 5749 | +unlock: |
---|
| 5750 | + task_rq_unlock(rq, p, &rf); |
---|
| 5751 | + if (cpuset_locked) |
---|
| 5752 | + cpuset_unlock(); |
---|
| 5753 | + return retval; |
---|
5035 | 5754 | } |
---|
5036 | 5755 | |
---|
5037 | 5756 | static int _sched_setscheduler(struct task_struct *p, int policy, |
---|
.. | .. |
---|
5043 | 5762 | .sched_nice = PRIO_TO_NICE(p->static_prio), |
---|
5044 | 5763 | }; |
---|
5045 | 5764 | |
---|
| 5765 | + if (IS_ENABLED(CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO) && |
---|
| 5766 | + ((policy == SCHED_FIFO) || (policy == SCHED_RR))) { |
---|
| 5767 | + attr.sched_priority /= 2; |
---|
| 5768 | + if (!check) |
---|
| 5769 | + attr.sched_priority += MAX_RT_PRIO / 2; |
---|
| 5770 | + if (!attr.sched_priority) |
---|
| 5771 | + attr.sched_priority = 1; |
---|
| 5772 | + } |
---|
5046 | 5773 | /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ |
---|
5047 | 5774 | if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { |
---|
5048 | 5775 | attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; |
---|
.. | .. |
---|
5057 | 5784 | * @p: the task in question. |
---|
5058 | 5785 | * @policy: new policy. |
---|
5059 | 5786 | * @param: structure containing the new RT priority. |
---|
| 5787 | + * |
---|
| 5788 | + * Use sched_set_fifo(), read its comment. |
---|
5060 | 5789 | * |
---|
5061 | 5790 | * Return: 0 on success. An error code otherwise. |
---|
5062 | 5791 | * |
---|
.. | .. |
---|
5079 | 5808 | { |
---|
5080 | 5809 | return __sched_setscheduler(p, attr, false, true); |
---|
5081 | 5810 | } |
---|
| 5811 | +EXPORT_SYMBOL_GPL(sched_setattr_nocheck); |
---|
5082 | 5812 | |
---|
5083 | 5813 | /** |
---|
5084 | 5814 | * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. |
---|
.. | .. |
---|
5099 | 5829 | return _sched_setscheduler(p, policy, param, false); |
---|
5100 | 5830 | } |
---|
5101 | 5831 | EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); |
---|
| 5832 | + |
---|
| 5833 | +/* |
---|
| 5834 | + * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally |
---|
| 5835 | + * incapable of resource management, which is the one thing an OS really should |
---|
| 5836 | + * be doing. |
---|
| 5837 | + * |
---|
| 5838 | + * This is of course the reason it is limited to privileged users only. |
---|
| 5839 | + * |
---|
| 5840 | + * Worse still; it is fundamentally impossible to compose static priority |
---|
| 5841 | + * workloads. You cannot take two correctly working static prio workloads |
---|
| 5842 | + * and smash them together and still expect them to work. |
---|
| 5843 | + * |
---|
| 5844 | + * For this reason 'all' FIFO tasks the kernel creates are basically at: |
---|
| 5845 | + * |
---|
| 5846 | + * MAX_RT_PRIO / 2 |
---|
| 5847 | + * |
---|
| 5848 | + * The administrator _MUST_ configure the system, the kernel simply doesn't |
---|
| 5849 | + * know enough information to make a sensible choice. |
---|
| 5850 | + */ |
---|
| 5851 | +void sched_set_fifo(struct task_struct *p) |
---|
| 5852 | +{ |
---|
| 5853 | + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; |
---|
| 5854 | + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); |
---|
| 5855 | +} |
---|
| 5856 | +EXPORT_SYMBOL_GPL(sched_set_fifo); |
---|
| 5857 | + |
---|
| 5858 | +/* |
---|
| 5859 | + * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. |
---|
| 5860 | + */ |
---|
| 5861 | +void sched_set_fifo_low(struct task_struct *p) |
---|
| 5862 | +{ |
---|
| 5863 | + struct sched_param sp = { .sched_priority = 1 }; |
---|
| 5864 | + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); |
---|
| 5865 | +} |
---|
| 5866 | +EXPORT_SYMBOL_GPL(sched_set_fifo_low); |
---|
| 5867 | + |
---|
| 5868 | +void sched_set_normal(struct task_struct *p, int nice) |
---|
| 5869 | +{ |
---|
| 5870 | + struct sched_attr attr = { |
---|
| 5871 | + .sched_policy = SCHED_NORMAL, |
---|
| 5872 | + .sched_nice = nice, |
---|
| 5873 | + }; |
---|
| 5874 | + WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); |
---|
| 5875 | +} |
---|
| 5876 | +EXPORT_SYMBOL_GPL(sched_set_normal); |
---|
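With sched_set_fifo()/sched_set_fifo_low() exported, in-kernel users stop picking raw RT priorities. A hypothetical driver-side sketch of how a caller would use it (my_irq_thread and demo_start_thread are invented names; this is module context, not part of core.c):

```c
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int my_irq_thread(void *data)
{
	while (!kthread_should_stop())
		msleep(100);		/* placeholder for real work */
	return 0;
}

static struct task_struct *demo_start_thread(void)
{
	struct task_struct *t = kthread_run(my_irq_thread, NULL, "demo-fifo");

	if (!IS_ERR(t))
		sched_set_fifo(t);	/* lands at MAX_RT_PRIO / 2, admin retunes */
	return t;
}
```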
5102 | 5877 | |
---|
5103 | 5878 | static int |
---|
5104 | 5879 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
---|
.. | .. |
---|
5130 | 5905 | u32 size; |
---|
5131 | 5906 | int ret; |
---|
5132 | 5907 | |
---|
5133 | | - if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) |
---|
5134 | | - return -EFAULT; |
---|
5135 | | - |
---|
5136 | 5908 | /* Zero the full structure, so that a short copy will be nice: */ |
---|
5137 | 5909 | memset(attr, 0, sizeof(*attr)); |
---|
5138 | 5910 | |
---|
.. | .. |
---|
5140 | 5912 | if (ret) |
---|
5141 | 5913 | return ret; |
---|
5142 | 5914 | |
---|
5143 | | - /* Bail out on silly large: */ |
---|
5144 | | - if (size > PAGE_SIZE) |
---|
5145 | | - goto err_size; |
---|
5146 | | - |
---|
5147 | 5915 | /* ABI compatibility quirk: */ |
---|
5148 | 5916 | if (!size) |
---|
5149 | 5917 | size = SCHED_ATTR_SIZE_VER0; |
---|
5150 | | - |
---|
5151 | | - if (size < SCHED_ATTR_SIZE_VER0) |
---|
| 5918 | + if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) |
---|
5152 | 5919 | goto err_size; |
---|
5153 | 5920 | |
---|
5154 | | - /* |
---|
5155 | | - * If we're handed a bigger struct than we know of, |
---|
5156 | | - * ensure all the unknown bits are 0 - i.e. new |
---|
5157 | | - * user-space does not rely on any kernel feature |
---|
5158 | | - * extensions we dont know about yet. |
---|
5159 | | - */ |
---|
5160 | | - if (size > sizeof(*attr)) { |
---|
5161 | | - unsigned char __user *addr; |
---|
5162 | | - unsigned char __user *end; |
---|
5163 | | - unsigned char val; |
---|
5164 | | - |
---|
5165 | | - addr = (void __user *)uattr + sizeof(*attr); |
---|
5166 | | - end = (void __user *)uattr + size; |
---|
5167 | | - |
---|
5168 | | - for (; addr < end; addr++) { |
---|
5169 | | - ret = get_user(val, addr); |
---|
5170 | | - if (ret) |
---|
5171 | | - return ret; |
---|
5172 | | - if (val) |
---|
5173 | | - goto err_size; |
---|
5174 | | - } |
---|
5175 | | - size = sizeof(*attr); |
---|
| 5921 | + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); |
---|
| 5922 | + if (ret) { |
---|
| 5923 | + if (ret == -E2BIG) |
---|
| 5924 | + goto err_size; |
---|
| 5925 | + return ret; |
---|
5176 | 5926 | } |
---|
5177 | | - |
---|
5178 | | - ret = copy_from_user(attr, uattr, size); |
---|
5179 | | - if (ret) |
---|
5180 | | - return -EFAULT; |
---|
5181 | 5927 | |
---|
5182 | 5928 | if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && |
---|
5183 | 5929 | size < SCHED_ATTR_SIZE_VER1) |
---|
.. | .. |
---|
5194 | 5940 | err_size: |
---|
5195 | 5941 | put_user(sizeof(*attr), &uattr->size); |
---|
5196 | 5942 | return -E2BIG; |
---|
| 5943 | +} |
---|
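The open-coded tail check is replaced by copy_struct_from_user(), which copies min(ksize, usize) bytes, zero-fills any uncopied kernel tail, and rejects a larger user struct whose trailing bytes are not all zero. A userspace sketch of those semantics (copy_struct_demo is an invented name; the real helper also handles faults and returns -E2BIG):

```c
#include <stdio.h>
#include <string.h>

static int copy_struct_demo(void *dst, size_t ksize, const void *src, size_t usize)
{
	size_t size = ksize < usize ? ksize : usize;

	if (usize > ksize) {
		const unsigned char *rest = (const unsigned char *)src + ksize;

		for (size_t i = 0; i < usize - ksize; i++)
			if (rest[i])
				return -1;	/* the kernel returns -E2BIG here */
	}
	memset(dst, 0, ksize);			/* a short user copy leaves zeroes */
	memcpy(dst, src, size);
	return 0;
}

int main(void)
{
	struct { int a; } kattr;
	unsigned char uattr[8] = { 1, 0, 0, 0, 0, 0, 0, 0x42 };

	printf("%d\n", copy_struct_demo(&kattr, sizeof(kattr), uattr, 4));	/* 0: older, smaller struct */
	printf("%d\n", copy_struct_demo(&kattr, sizeof(kattr), uattr, 8));	/* -1: unknown non-zero tail */
	return 0;
}
```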
| 5944 | + |
---|
| 5945 | +static void get_params(struct task_struct *p, struct sched_attr *attr) |
---|
| 5946 | +{ |
---|
| 5947 | + if (task_has_dl_policy(p)) |
---|
| 5948 | + __getparam_dl(p, attr); |
---|
| 5949 | + else if (task_has_rt_policy(p)) |
---|
| 5950 | + attr->sched_priority = p->rt_priority; |
---|
| 5951 | + else |
---|
| 5952 | + attr->sched_nice = task_nice(p); |
---|
5197 | 5953 | } |
---|
5198 | 5954 | |
---|
5199 | 5955 | /** |
---|
.. | .. |
---|
5257 | 6013 | rcu_read_unlock(); |
---|
5258 | 6014 | |
---|
5259 | 6015 | if (likely(p)) { |
---|
| 6016 | + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) |
---|
| 6017 | + get_params(p, &attr); |
---|
5260 | 6018 | retval = sched_setattr(p, &attr); |
---|
5261 | 6019 | put_task_struct(p); |
---|
5262 | 6020 | } |
---|
.. | .. |
---|
5350 | 6108 | { |
---|
5351 | 6109 | unsigned int ksize = sizeof(*kattr); |
---|
5352 | 6110 | |
---|
5353 | | - if (!access_ok(VERIFY_WRITE, uattr, usize)) |
---|
| 6111 | + if (!access_ok(uattr, usize)) |
---|
5354 | 6112 | return -EFAULT; |
---|
5355 | 6113 | |
---|
5356 | 6114 | /* |
---|
.. | .. |
---|
5378 | 6136 | * sys_sched_getattr - similar to sched_getparam, but with sched_attr |
---|
5379 | 6137 | * @pid: the pid in question. |
---|
5380 | 6138 | * @uattr: structure containing the extended parameters. |
---|
5381 | | - * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility. |
---|
| 6139 | + * @usize: sizeof(attr) for fwd/bwd comp. |
---|
5382 | 6140 | * @flags: for future extension. |
---|
5383 | 6141 | */ |
---|
5384 | 6142 | SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, |
---|
.. | .. |
---|
5405 | 6163 | kattr.sched_policy = p->policy; |
---|
5406 | 6164 | if (p->sched_reset_on_fork) |
---|
5407 | 6165 | kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; |
---|
5408 | | - if (task_has_dl_policy(p)) |
---|
5409 | | - __getparam_dl(p, &kattr); |
---|
5410 | | - else if (task_has_rt_policy(p)) |
---|
5411 | | - kattr.sched_priority = p->rt_priority; |
---|
5412 | | - else |
---|
5413 | | - kattr.sched_nice = task_nice(p); |
---|
| 6166 | + get_params(p, &kattr); |
---|
| 6167 | + kattr.sched_flags &= SCHED_FLAG_ALL; |
---|
5414 | 6168 | |
---|
5415 | 6169 | #ifdef CONFIG_UCLAMP_TASK |
---|
| 6170 | + /* |
---|
| 6171 | + * This could race with another potential updater, but this is fine |
---|
| 6172 | + * because it'll correctly read the old or the new value. We don't need |
---|
| 6173 | + * to guarantee who wins the race as long as it doesn't return garbage. |
---|
| 6174 | + */ |
---|
5416 | 6175 | kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; |
---|
5417 | 6176 | kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; |
---|
5418 | 6177 | #endif |
---|
.. | .. |
---|
5431 | 6190 | cpumask_var_t cpus_allowed, new_mask; |
---|
5432 | 6191 | struct task_struct *p; |
---|
5433 | 6192 | int retval; |
---|
| 6193 | + int skip = 0; |
---|
5434 | 6194 | |
---|
5435 | 6195 | rcu_read_lock(); |
---|
5436 | 6196 | |
---|
.. | .. |
---|
5466 | 6226 | rcu_read_unlock(); |
---|
5467 | 6227 | } |
---|
5468 | 6228 | |
---|
| 6229 | + trace_android_vh_sched_setaffinity_early(p, in_mask, &skip); |
---|
| 6230 | + if (skip) |
---|
| 6231 | + goto out_free_new_mask; |
---|
5469 | 6232 | retval = security_task_setscheduler(p); |
---|
5470 | 6233 | if (retval) |
---|
5471 | 6234 | goto out_free_new_mask; |
---|
.. | .. |
---|
5506 | 6269 | goto again; |
---|
5507 | 6270 | } |
---|
5508 | 6271 | } |
---|
| 6272 | + |
---|
| 6273 | + trace_android_rvh_sched_setaffinity(p, in_mask, &retval); |
---|
| 6274 | + |
---|
5509 | 6275 | out_free_new_mask: |
---|
5510 | 6276 | free_cpumask_var(new_mask); |
---|
5511 | 6277 | out_free_cpus_allowed: |
---|
.. | .. |
---|
5514 | 6280 | put_task_struct(p); |
---|
5515 | 6281 | return retval; |
---|
5516 | 6282 | } |
---|
5517 | | -EXPORT_SYMBOL_GPL(sched_setaffinity); |
---|
5518 | 6283 | |
---|
5519 | 6284 | static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, |
---|
5520 | 6285 | struct cpumask *new_mask) |
---|
.. | .. |
---|
5569 | 6334 | goto out_unlock; |
---|
5570 | 6335 | |
---|
5571 | 6336 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
---|
5572 | | - cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); |
---|
| 6337 | + cpumask_and(mask, &p->cpus_mask, cpu_active_mask); |
---|
5573 | 6338 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
---|
5574 | 6339 | |
---|
5575 | 6340 | out_unlock: |
---|
.. | .. |
---|
5598 | 6363 | if (len & (sizeof(unsigned long)-1)) |
---|
5599 | 6364 | return -EINVAL; |
---|
5600 | 6365 | |
---|
5601 | | - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) |
---|
| 6366 | + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) |
---|
5602 | 6367 | return -ENOMEM; |
---|
5603 | 6368 | |
---|
5604 | 6369 | ret = sched_getaffinity(pid, mask); |
---|
5605 | 6370 | if (ret == 0) { |
---|
5606 | 6371 | unsigned int retlen = min(len, cpumask_size()); |
---|
5607 | 6372 | |
---|
5608 | | - if (copy_to_user(user_mask_ptr, mask, retlen)) |
---|
| 6373 | + if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen)) |
---|
5609 | 6374 | ret = -EFAULT; |
---|
5610 | 6375 | else |
---|
5611 | 6376 | ret = retlen; |
---|
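For reference, the user-space side of this syscall through the glibc wrapper; the kernel above copies out min(len, cpumask_size()) bytes of the now zero-allocated mask:

```c
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	if (sched_getaffinity(0, sizeof(set), &set))	/* 0 == calling thread */
		return 1;

	printf("runnable on %d CPU(s)\n", CPU_COUNT(&set));
	return 0;
}
```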
.. | .. |
---|
5633 | 6398 | schedstat_inc(rq->yld_count); |
---|
5634 | 6399 | current->sched_class->yield_task(rq); |
---|
5635 | 6400 | |
---|
| 6401 | + trace_android_rvh_do_sched_yield(rq); |
---|
| 6402 | + |
---|
5636 | 6403 | preempt_disable(); |
---|
5637 | 6404 | rq_unlock_irq(rq, &rf); |
---|
5638 | 6405 | sched_preempt_enable_no_resched(); |
---|
.. | .. |
---|
5646 | 6413 | return 0; |
---|
5647 | 6414 | } |
---|
5648 | 6415 | |
---|
5649 | | -#ifndef CONFIG_PREEMPT |
---|
| 6416 | +#ifndef CONFIG_PREEMPTION |
---|
5650 | 6417 | int __sched _cond_resched(void) |
---|
5651 | 6418 | { |
---|
5652 | 6419 | if (should_resched(0)) { |
---|
.. | .. |
---|
5663 | 6430 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
---|
5664 | 6431 | * call schedule, and on return reacquire the lock. |
---|
5665 | 6432 | * |
---|
5666 | | - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
---|
| 6433 | + * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level |
---|
5667 | 6434 | * operations here to prevent schedule() from being called twice (once via |
---|
5668 | 6435 | * spin_unlock(), once by hand). |
---|
5669 | 6436 | */ |
---|
.. | .. |
---|
5767 | 6534 | if (task_running(p_rq, p) || p->state) |
---|
5768 | 6535 | goto out_unlock; |
---|
5769 | 6536 | |
---|
5770 | | - yielded = curr->sched_class->yield_to_task(rq, p, preempt); |
---|
| 6537 | + yielded = curr->sched_class->yield_to_task(rq, p); |
---|
5771 | 6538 | if (yielded) { |
---|
5772 | 6539 | schedstat_inc(rq->yld_count); |
---|
5773 | 6540 | /* |
---|
.. | .. |
---|
5933 | 6700 | * an error code. |
---|
5934 | 6701 | */ |
---|
5935 | 6702 | SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, |
---|
5936 | | - struct timespec __user *, interval) |
---|
| 6703 | + struct __kernel_timespec __user *, interval) |
---|
5937 | 6704 | { |
---|
5938 | 6705 | struct timespec64 t; |
---|
5939 | 6706 | int retval = sched_rr_get_interval(pid, &t); |
---|
.. | .. |
---|
5944 | 6711 | return retval; |
---|
5945 | 6712 | } |
---|
5946 | 6713 | |
---|
5947 | | -#ifdef CONFIG_COMPAT |
---|
5948 | | -COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, |
---|
5949 | | - compat_pid_t, pid, |
---|
5950 | | - struct compat_timespec __user *, interval) |
---|
| 6714 | +#ifdef CONFIG_COMPAT_32BIT_TIME |
---|
| 6715 | +SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, |
---|
| 6716 | + struct old_timespec32 __user *, interval) |
---|
5951 | 6717 | { |
---|
5952 | 6718 | struct timespec64 t; |
---|
5953 | 6719 | int retval = sched_rr_get_interval(pid, &t); |
---|
5954 | 6720 | |
---|
5955 | 6721 | if (retval == 0) |
---|
5956 | | - retval = compat_put_timespec64(&t, interval); |
---|
| 6722 | + retval = put_old_timespec32(&t, interval); |
---|
5957 | 6723 | return retval; |
---|
5958 | 6724 | } |
---|
5959 | 6725 | #endif |
---|
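User-space counterpart of the interval syscall: 64-bit time_t callers hit the native entry point, while legacy 32-bit time callers go through the *_time32 variant added above.

```c
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts))	/* 0 == calling process */
		return 1;

	printf("RR timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
```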
.. | .. |
---|
5966 | 6732 | if (!try_get_task_stack(p)) |
---|
5967 | 6733 | return; |
---|
5968 | 6734 | |
---|
5969 | | - printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); |
---|
| 6735 | + pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); |
---|
5970 | 6736 | |
---|
5971 | 6737 | if (p->state == TASK_RUNNING) |
---|
5972 | | - printk(KERN_CONT " running task "); |
---|
| 6738 | + pr_cont(" running task "); |
---|
5973 | 6739 | #ifdef CONFIG_DEBUG_STACK_USAGE |
---|
5974 | 6740 | free = stack_not_used(p); |
---|
5975 | 6741 | #endif |
---|
.. | .. |
---|
5978 | 6744 | if (pid_alive(p)) |
---|
5979 | 6745 | ppid = task_pid_nr(rcu_dereference(p->real_parent)); |
---|
5980 | 6746 | rcu_read_unlock(); |
---|
5981 | | - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, |
---|
5982 | | - task_pid_nr(p), ppid, |
---|
| 6747 | + pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", |
---|
| 6748 | + free, task_pid_nr(p), ppid, |
---|
5983 | 6749 | (unsigned long)task_thread_info(p)->flags); |
---|
5984 | 6750 | |
---|
5985 | 6751 | print_worker_info(KERN_INFO, p); |
---|
5986 | | - show_stack(p, NULL); |
---|
| 6752 | + trace_android_vh_sched_show_task(p); |
---|
| 6753 | + show_stack(p, NULL, KERN_INFO); |
---|
5987 | 6754 | put_task_stack(p); |
---|
5988 | 6755 | } |
---|
5989 | 6756 | EXPORT_SYMBOL_GPL(sched_show_task); |
---|
.. | .. |
---|
6014 | 6781 | { |
---|
6015 | 6782 | struct task_struct *g, *p; |
---|
6016 | 6783 | |
---|
6017 | | -#if BITS_PER_LONG == 32 |
---|
6018 | | - printk(KERN_INFO |
---|
6019 | | - " task PC stack pid father\n"); |
---|
6020 | | -#else |
---|
6021 | | - printk(KERN_INFO |
---|
6022 | | - " task PC stack pid father\n"); |
---|
6023 | | -#endif |
---|
6024 | 6784 | rcu_read_lock(); |
---|
6025 | 6785 | for_each_process_thread(g, p) { |
---|
6026 | 6786 | /* |
---|
.. | .. |
---|
6056 | 6816 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
---|
6057 | 6817 | * flag, to make booting more robust. |
---|
6058 | 6818 | */ |
---|
6059 | | -void init_idle(struct task_struct *idle, int cpu) |
---|
| 6819 | +void __init init_idle(struct task_struct *idle, int cpu) |
---|
6060 | 6820 | { |
---|
6061 | 6821 | struct rq *rq = cpu_rq(cpu); |
---|
6062 | 6822 | unsigned long flags; |
---|
.. | .. |
---|
6069 | 6829 | idle->state = TASK_RUNNING; |
---|
6070 | 6830 | idle->se.exec_start = sched_clock(); |
---|
6071 | 6831 | idle->flags |= PF_IDLE; |
---|
6072 | | - |
---|
6073 | | - scs_task_reset(idle); |
---|
6074 | | - kasan_unpoison_task_stack(idle); |
---|
6075 | 6832 | |
---|
6076 | 6833 | #ifdef CONFIG_SMP |
---|
6077 | 6834 | /* |
---|
.. | .. |
---|
6096 | 6853 | __set_task_cpu(idle, cpu); |
---|
6097 | 6854 | rcu_read_unlock(); |
---|
6098 | 6855 | |
---|
6099 | | - rq->curr = rq->idle = idle; |
---|
| 6856 | + rq->idle = idle; |
---|
| 6857 | + rcu_assign_pointer(rq->curr, idle); |
---|
6100 | 6858 | idle->on_rq = TASK_ON_RQ_QUEUED; |
---|
6101 | 6859 | #ifdef CONFIG_SMP |
---|
6102 | 6860 | idle->on_cpu = 1; |
---|
.. | .. |
---|
6133 | 6891 | return ret; |
---|
6134 | 6892 | } |
---|
6135 | 6893 | |
---|
6136 | | -int task_can_attach(struct task_struct *p, |
---|
6137 | | - const struct cpumask *cs_cpus_allowed) |
---|
| 6894 | +int task_can_attach(struct task_struct *p) |
---|
6138 | 6895 | { |
---|
6139 | 6896 | int ret = 0; |
---|
6140 | 6897 | |
---|
.. | .. |
---|
6145 | 6902 | * allowed nodes is unnecessary. Thus, cpusets are not |
---|
6146 | 6903 | * applicable for such threads. This prevents checking for |
---|
6147 | 6904 | * success of set_cpus_allowed_ptr() on all attached tasks |
---|
6148 | | - * before cpus_allowed may be changed. |
---|
| 6905 | + * before cpus_mask may be changed. |
---|
6149 | 6906 | */ |
---|
6150 | | - if (p->flags & PF_NO_SETAFFINITY) { |
---|
| 6907 | + if (p->flags & PF_NO_SETAFFINITY) |
---|
6151 | 6908 | ret = -EINVAL; |
---|
6152 | | - goto out; |
---|
6153 | | - } |
---|
6154 | 6909 | |
---|
6155 | | - if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, |
---|
6156 | | - cs_cpus_allowed)) |
---|
6157 | | - ret = dl_task_can_attach(p, cs_cpus_allowed); |
---|
6158 | | - |
---|
6159 | | -out: |
---|
6160 | 6910 | return ret; |
---|
6161 | 6911 | } |
---|
6162 | 6912 | |
---|
.. | .. |
---|
6172 | 6922 | if (curr_cpu == target_cpu) |
---|
6173 | 6923 | return 0; |
---|
6174 | 6924 | |
---|
6175 | | - if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) |
---|
| 6925 | + if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) |
---|
6176 | 6926 | return -EINVAL; |
---|
6177 | 6927 | |
---|
6178 | 6928 | /* TODO: This is not properly updating schedstats */ |
---|
.. | .. |
---|
6205 | 6955 | if (queued) |
---|
6206 | 6956 | enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); |
---|
6207 | 6957 | if (running) |
---|
6208 | | - set_curr_task(rq, p); |
---|
| 6958 | + set_next_task(rq, p); |
---|
6209 | 6959 | task_rq_unlock(rq, p, &rf); |
---|
6210 | 6960 | } |
---|
6211 | 6961 | #endif /* CONFIG_NUMA_BALANCING */ |
---|
.. | .. |
---|
6246 | 6996 | atomic_long_add(delta, &calc_load_tasks); |
---|
6247 | 6997 | } |
---|
6248 | 6998 | |
---|
6249 | | -static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) |
---|
| 6999 | +static struct task_struct *__pick_migrate_task(struct rq *rq) |
---|
6250 | 7000 | { |
---|
| 7001 | + const struct sched_class *class; |
---|
| 7002 | + struct task_struct *next; |
---|
| 7003 | + |
---|
| 7004 | + for_each_class(class) { |
---|
| 7005 | + next = class->pick_next_task(rq); |
---|
| 7006 | + if (next) { |
---|
| 7007 | + next->sched_class->put_prev_task(rq, next); |
---|
| 7008 | + return next; |
---|
| 7009 | + } |
---|
| 7010 | + } |
---|
| 7011 | + |
---|
| 7012 | + /* The idle class should always have a runnable task */ |
---|
| 7013 | + BUG(); |
---|
6251 | 7014 | } |
---|
6252 | | - |
---|
6253 | | -static const struct sched_class fake_sched_class = { |
---|
6254 | | - .put_prev_task = put_prev_task_fake, |
---|
6255 | | -}; |
---|
6256 | | - |
---|
6257 | | -static struct task_struct fake_task = { |
---|
6258 | | - /* |
---|
6259 | | - * Avoid pull_{rt,dl}_task() |
---|
6260 | | - */ |
---|
6261 | | - .prio = MAX_PRIO + 1, |
---|
6262 | | - .sched_class = &fake_sched_class, |
---|
6263 | | -}; |
---|
6264 | 7015 | |
---|
6265 | 7016 | /* |
---|
6266 | 7017 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
---|
.. | .. |
---|
6269 | 7020 | * Called with rq->lock held even though we're in stop_machine() and
---|
6270 | 7021 | * there's no concurrency possible, we hold the required locks anyway |
---|
6271 | 7022 | * because of lock validation efforts. |
---|
| 7023 | + * |
---|
| 7024 | + * force: if false, the function will skip CPU-pinned kthreads.
---|
6272 | 7025 | */ |
---|
6273 | | -static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) |
---|
| 7026 | +static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, bool force) |
---|
6274 | 7027 | { |
---|
6275 | 7028 | struct rq *rq = dead_rq; |
---|
6276 | | - struct task_struct *next, *stop = rq->stop; |
---|
| 7029 | + struct task_struct *next, *tmp, *stop = rq->stop; |
---|
| 7030 | + LIST_HEAD(percpu_kthreads); |
---|
6277 | 7031 | struct rq_flags orf = *rf; |
---|
6278 | 7032 | int dest_cpu; |
---|
6279 | 7033 | |
---|
.. | .. |
---|
6295 | 7049 | */ |
---|
6296 | 7050 | update_rq_clock(rq); |
---|
6297 | 7051 | |
---|
| 7052 | +#ifdef CONFIG_SCHED_DEBUG |
---|
| 7053 | + /* note the clock update in orf */ |
---|
| 7054 | + orf.clock_update_flags |= RQCF_UPDATED; |
---|
| 7055 | +#endif |
---|
| 7056 | + |
---|
6298 | 7057 | for (;;) { |
---|
6299 | 7058 | /* |
---|
6300 | 7059 | * There's this thread running, bail when that's the only |
---|
.. | .. |
---|
6303 | 7062 | if (rq->nr_running == 1) |
---|
6304 | 7063 | break; |
---|
6305 | 7064 | |
---|
6306 | | - /* |
---|
6307 | | - * pick_next_task() assumes pinned rq->lock: |
---|
6308 | | - */ |
---|
6309 | | - next = pick_next_task(rq, &fake_task, rf); |
---|
6310 | | - BUG_ON(!next); |
---|
6311 | | - put_prev_task(rq, next); |
---|
| 7065 | + next = __pick_migrate_task(rq); |
---|
6312 | 7066 | |
---|
6313 | 7067 | /* |
---|
6314 | | - * Rules for changing task_struct::cpus_allowed are holding |
---|
| 7068 | + * Argh ... no iterator for tasks, we need to remove the |
---|
| 7069 | + * kthread from the run-queue to continue. |
---|
| 7070 | + */ |
---|
| 7071 | + if (!force && is_per_cpu_kthread(next)) { |
---|
| 7072 | + INIT_LIST_HEAD(&next->percpu_kthread_node); |
---|
| 7073 | + list_add(&next->percpu_kthread_node, &percpu_kthreads); |
---|
| 7074 | + |
---|
| 7075 | + /* DEQUEUE_SAVE not used due to move_entity in rt */ |
---|
| 7076 | + deactivate_task(rq, next, |
---|
| 7077 | + DEQUEUE_NOCLOCK); |
---|
| 7078 | + continue; |
---|
| 7079 | + } |
---|
| 7080 | + |
---|
| 7081 | + /* |
---|
| 7082 | + * Rules for changing task_struct::cpus_mask are holding |
---|
6315 | 7083 | * both pi_lock and rq->lock, such that holding either |
---|
6316 | 7084 | * stabilizes the mask. |
---|
6317 | 7085 | * |
---|
.. | .. |
---|
6328 | 7096 | * changed the task, WARN if weird stuff happened, because in |
---|
6329 | 7097 | * that case the above rq->lock drop is a fail too. |
---|
6330 | 7098 | */ |
---|
6331 | | - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { |
---|
| 7099 | + if (task_rq(next) != rq || !task_on_rq_queued(next)) { |
---|
| 7100 | + /* |
---|
| 7101 | + * In the !force case, there is a hole between |
---|
| 7102 | + * rq_unlock() and rq_relock(), where another CPU might |
---|
| 7103 | + * not observe an up-to-date cpu_active_mask and try to
---|
| 7104 | + * move tasks around. |
---|
| 7105 | + */ |
---|
| 7106 | + WARN_ON(force); |
---|
6332 | 7107 | raw_spin_unlock(&next->pi_lock); |
---|
6333 | 7108 | continue; |
---|
6334 | 7109 | } |
---|
.. | .. |
---|
6345 | 7120 | raw_spin_unlock(&next->pi_lock); |
---|
6346 | 7121 | } |
---|
6347 | 7122 | |
---|
| 7123 | + list_for_each_entry_safe(next, tmp, &percpu_kthreads, |
---|
| 7124 | + percpu_kthread_node) { |
---|
| 7125 | + |
---|
| 7126 | + /* ENQUEUE_RESTORE not used due to move_entity in rt */ |
---|
| 7127 | + activate_task(rq, next, ENQUEUE_NOCLOCK); |
---|
| 7128 | + list_del(&next->percpu_kthread_node); |
---|
| 7129 | + } |
---|
| 7130 | + |
---|
6348 | 7131 | rq->stop = stop; |
---|
| 7132 | +} |
---|
| 7133 | + |
---|
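The migration loop above depends on the affinity-locking rule restated in its comment: task_struct::cpus_mask only changes while both p->pi_lock and the task's rq->lock are held, so holding either one is enough to read it stably. A minimal sketch of that convention, assuming the usual kernel/sched compilation context (the helper name is invented for illustration):

        /*
         * Illustrative only: readers may take either p->pi_lock or
         * task_rq(p)->lock, because writers must hold both.
         */
        static bool stable_cpu_allowed(struct task_struct *p, int cpu)
        {
                bool allowed;

                raw_spin_lock_irq(&p->pi_lock);
                allowed = cpumask_test_cpu(cpu, p->cpus_ptr);
                raw_spin_unlock_irq(&p->pi_lock);

                return allowed;
        }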
| 7134 | +static int drain_rq_cpu_stop(void *data) |
---|
| 7135 | +{ |
---|
| 7136 | + struct rq *rq = this_rq(); |
---|
| 7137 | + struct rq_flags rf; |
---|
| 7138 | + |
---|
| 7139 | + rq_lock_irqsave(rq, &rf); |
---|
| 7140 | + migrate_tasks(rq, &rf, false); |
---|
| 7141 | + rq_unlock_irqrestore(rq, &rf); |
---|
| 7142 | + |
---|
| 7143 | + return 0; |
---|
| 7144 | +} |
---|
| 7145 | + |
---|
| 7146 | +int sched_cpu_drain_rq(unsigned int cpu) |
---|
| 7147 | +{ |
---|
| 7148 | + struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain); |
---|
| 7149 | + struct cpu_stop_done *rq_drain_done = &(cpu_rq(cpu)->drain_done); |
---|
| 7150 | + |
---|
| 7151 | + if (idle_cpu(cpu)) { |
---|
| 7152 | + rq_drain->done = NULL; |
---|
| 7153 | + return 0; |
---|
| 7154 | + } |
---|
| 7155 | + |
---|
| 7156 | + return stop_one_cpu_async(cpu, drain_rq_cpu_stop, NULL, rq_drain, |
---|
| 7157 | + rq_drain_done); |
---|
| 7158 | +} |
---|
| 7159 | + |
---|
| 7160 | +void sched_cpu_drain_rq_wait(unsigned int cpu) |
---|
| 7161 | +{ |
---|
| 7162 | + struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain); |
---|
| 7163 | + |
---|
| 7164 | + if (rq_drain->done) |
---|
| 7165 | + cpu_stop_work_wait(rq_drain); |
---|
6349 | 7166 | } |
---|
6350 | 7167 | #endif /* CONFIG_HOTPLUG_CPU */ |
---|
6351 | 7168 | |
---|
.. | .. |
---|
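The drain pair above gives callers a two-step interface: sched_cpu_drain_rq() queues asynchronous stopper work that migrates every movable task off the CPU (per-CPU kthreads stay put, since migrate_tasks() is called without force), and sched_cpu_drain_rq_wait() blocks until that work has finished. A hypothetical caller, shown only as a sketch of the intended calling pattern (pause_cpus_example() is not a real function):

        /* Hypothetical: drain several CPUs in parallel, then wait for all. */
        static int pause_cpus_example(const struct cpumask *cpus)
        {
                unsigned int cpu;
                int ret;

                for_each_cpu(cpu, cpus) {
                        /* No-op for idle CPUs, otherwise queues stopper work. */
                        ret = sched_cpu_drain_rq(cpu);
                        if (ret)
                                return ret;
                }

                for_each_cpu(cpu, cpus)
                        sched_cpu_drain_rq_wait(cpu);

                return 0;
        }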
6417 | 7234 | static int cpuset_cpu_inactive(unsigned int cpu) |
---|
6418 | 7235 | { |
---|
6419 | 7236 | if (!cpuhp_tasks_frozen) { |
---|
6420 | | - if (dl_cpu_busy(cpu)) |
---|
6421 | | - return -EBUSY; |
---|
| 7237 | + int ret = dl_bw_check_overflow(cpu); |
---|
| 7238 | + |
---|
| 7239 | + if (ret) |
---|
| 7240 | + return ret; |
---|
6422 | 7241 | cpuset_update_active_cpus(); |
---|
6423 | 7242 | } else { |
---|
6424 | 7243 | num_cpus_frozen++; |
---|
.. | .. |
---|
6467 | 7286 | return 0; |
---|
6468 | 7287 | } |
---|
6469 | 7288 | |
---|
6470 | | -int sched_cpu_deactivate(unsigned int cpu) |
---|
| 7289 | +int sched_cpus_activate(struct cpumask *cpus) |
---|
| 7290 | +{ |
---|
| 7291 | + unsigned int cpu; |
---|
| 7292 | + |
---|
| 7293 | + for_each_cpu(cpu, cpus) { |
---|
| 7294 | + if (sched_cpu_activate(cpu)) { |
---|
| 7295 | + for_each_cpu_and(cpu, cpus, cpu_active_mask) |
---|
| 7296 | + sched_cpu_deactivate(cpu); |
---|
| 7297 | + |
---|
| 7298 | + return -EBUSY; |
---|
| 7299 | + } |
---|
| 7300 | + } |
---|
| 7301 | + |
---|
| 7302 | + return 0; |
---|
| 7303 | +} |
---|
| 7304 | + |
---|
| 7305 | +int _sched_cpu_deactivate(unsigned int cpu) |
---|
6471 | 7306 | { |
---|
6472 | 7307 | int ret; |
---|
6473 | 7308 | |
---|
6474 | 7309 | set_cpu_active(cpu, false); |
---|
6475 | | - /* |
---|
6476 | | - * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU |
---|
6477 | | - * users of this state to go away such that all new such users will |
---|
6478 | | - * observe it. |
---|
6479 | | - * |
---|
6480 | | - * Do sync before park smpboot threads to take care the rcu boost case. |
---|
6481 | | - */ |
---|
6482 | | - synchronize_rcu_mult(call_rcu, call_rcu_sched); |
---|
6483 | 7310 | |
---|
6484 | 7311 | #ifdef CONFIG_SCHED_SMT |
---|
6485 | 7312 | /* |
---|
.. | .. |
---|
6498 | 7325 | return ret; |
---|
6499 | 7326 | } |
---|
6500 | 7327 | sched_domains_numa_masks_clear(cpu); |
---|
| 7328 | + |
---|
| 7329 | + update_max_interval(); |
---|
| 7330 | + |
---|
| 7331 | + return 0; |
---|
| 7332 | +} |
---|
| 7333 | + |
---|
| 7334 | +int sched_cpu_deactivate(unsigned int cpu) |
---|
| 7335 | +{ |
---|
| 7336 | + int ret = _sched_cpu_deactivate(cpu); |
---|
| 7337 | + |
---|
| 7338 | + if (ret) |
---|
| 7339 | + return ret; |
---|
| 7340 | + |
---|
| 7341 | + /* |
---|
| 7342 | + * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU |
---|
| 7343 | + * users of this state to go away such that all new such users will |
---|
| 7344 | + * observe it. |
---|
| 7345 | + * |
---|
| 7346 | + * Do the sync before parking smpboot threads to take care of the RCU boost case.
---|
| 7347 | + */ |
---|
| 7348 | + synchronize_rcu(); |
---|
| 7349 | + |
---|
| 7350 | + return 0; |
---|
| 7351 | +} |
---|
| 7352 | + |
---|
| 7353 | +int sched_cpus_deactivate_nosync(struct cpumask *cpus) |
---|
| 7354 | +{ |
---|
| 7355 | + unsigned int cpu; |
---|
| 7356 | + |
---|
| 7357 | + for_each_cpu(cpu, cpus) { |
---|
| 7358 | + if (_sched_cpu_deactivate(cpu)) { |
---|
| 7359 | + for_each_cpu(cpu, cpus) { |
---|
| 7360 | + if (!cpu_active(cpu)) |
---|
| 7361 | + sched_cpu_activate(cpu); |
---|
| 7362 | + } |
---|
| 7363 | + |
---|
| 7364 | + return -EBUSY; |
---|
| 7365 | + } |
---|
| 7366 | + } |
---|
| 7367 | + |
---|
6501 | 7368 | return 0; |
---|
6502 | 7369 | } |
---|
6503 | 7370 | |
---|
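The bulk helpers above apply activation and deactivation across a whole cpumask with all-or-nothing semantics: if any CPU fails, every CPU touched so far is rolled back and -EBUSY is returned. Note that sched_cpus_deactivate_nosync() deliberately omits the synchronize_rcu() that the single-CPU sched_cpu_deactivate() performs, presumably so a caller can pay for one grace period per batch instead of one per CPU. A sketch of that usage, with an invented wrapper name:

        /* Hypothetical: deactivate a cluster, then do a single RCU sync. */
        static int pause_cluster_example(struct cpumask *cluster)
        {
                int ret = sched_cpus_deactivate_nosync(cluster);

                if (ret)
                        return ret;     /* helper already re-activated the set */

                /* One grace period for the whole batch, mirroring
                 * what sched_cpu_deactivate() does per CPU. */
                synchronize_rcu();
                return 0;
        }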
.. | .. |
---|
6506 | 7373 | struct rq *rq = cpu_rq(cpu); |
---|
6507 | 7374 | |
---|
6508 | 7375 | rq->calc_load_update = calc_load_update; |
---|
6509 | | - update_max_interval(); |
---|
6510 | 7376 | } |
---|
6511 | 7377 | |
---|
6512 | 7378 | int sched_cpu_starting(unsigned int cpu) |
---|
6513 | 7379 | { |
---|
6514 | 7380 | sched_rq_cpu_starting(cpu); |
---|
6515 | 7381 | sched_tick_start(cpu); |
---|
| 7382 | + trace_android_rvh_sched_cpu_starting(cpu); |
---|
6516 | 7383 | return 0; |
---|
6517 | 7384 | } |
---|
6518 | 7385 | |
---|
.. | .. |
---|
6523 | 7390 | struct rq_flags rf; |
---|
6524 | 7391 | |
---|
6525 | 7392 | /* Handle pending wakeups and then migrate everything off */ |
---|
6526 | | - sched_ttwu_pending(); |
---|
6527 | 7393 | sched_tick_stop(cpu); |
---|
6528 | 7394 | |
---|
6529 | 7395 | rq_lock_irqsave(rq, &rf); |
---|
.. | .. |
---|
6531 | 7397 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
---|
6532 | 7398 | set_rq_offline(rq); |
---|
6533 | 7399 | } |
---|
6534 | | - migrate_tasks(rq, &rf); |
---|
| 7400 | + migrate_tasks(rq, &rf, true); |
---|
6535 | 7401 | BUG_ON(rq->nr_running != 1); |
---|
6536 | 7402 | rq_unlock_irqrestore(rq, &rf); |
---|
6537 | 7403 | |
---|
| 7404 | + trace_android_rvh_sched_cpu_dying(cpu); |
---|
| 7405 | + |
---|
6538 | 7406 | calc_load_migrate(rq); |
---|
6539 | | - update_max_interval(); |
---|
6540 | 7407 | nohz_balance_exit_idle(rq); |
---|
6541 | 7408 | hrtick_clear(rq); |
---|
6542 | 7409 | return 0; |
---|
.. | .. |
---|
6550 | 7417 | /* |
---|
6551 | 7418 | * There's no userspace yet to cause hotplug operations; hence all the |
---|
6552 | 7419 | * CPU masks are stable and all blatant races in the below code cannot |
---|
6553 | | - * happen. The hotplug lock is nevertheless taken to satisfy lockdep, |
---|
6554 | | - * but there won't be any contention on it. |
---|
| 7420 | + * happen. |
---|
6555 | 7421 | */ |
---|
6556 | | - cpus_read_lock(); |
---|
6557 | 7422 | mutex_lock(&sched_domains_mutex); |
---|
6558 | 7423 | sched_init_domains(cpu_active_mask); |
---|
6559 | 7424 | mutex_unlock(&sched_domains_mutex); |
---|
6560 | | - cpus_read_unlock(); |
---|
6561 | 7425 | |
---|
6562 | 7426 | /* Move init over to a non-isolated CPU */ |
---|
6563 | 7427 | if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) |
---|
6564 | 7428 | BUG(); |
---|
| 7429 | + |
---|
6565 | 7430 | sched_init_granularity(); |
---|
6566 | 7431 | |
---|
6567 | 7432 | init_sched_rt_class(); |
---|
.. | .. |
---|
6572 | 7437 | |
---|
6573 | 7438 | static int __init migration_init(void) |
---|
6574 | 7439 | { |
---|
6575 | | - sched_rq_cpu_starting(smp_processor_id()); |
---|
| 7440 | + sched_cpu_starting(smp_processor_id()); |
---|
6576 | 7441 | return 0; |
---|
6577 | 7442 | } |
---|
6578 | 7443 | early_initcall(migration_init); |
---|
.. | .. |
---|
6597 | 7462 | * Every task in system belongs to this group at bootup. |
---|
6598 | 7463 | */ |
---|
6599 | 7464 | struct task_group root_task_group; |
---|
| 7465 | +EXPORT_SYMBOL_GPL(root_task_group); |
---|
6600 | 7466 | LIST_HEAD(task_groups); |
---|
| 7467 | +EXPORT_SYMBOL_GPL(task_groups); |
---|
6601 | 7468 | |
---|
6602 | 7469 | /* Cacheline aligned slab cache for task_group */ |
---|
6603 | 7470 | static struct kmem_cache *task_group_cache __read_mostly; |
---|
.. | .. |
---|
6608 | 7475 | |
---|
6609 | 7476 | void __init sched_init(void) |
---|
6610 | 7477 | { |
---|
6611 | | - int i, j; |
---|
6612 | | - unsigned long alloc_size = 0, ptr; |
---|
| 7478 | + unsigned long ptr = 0; |
---|
| 7479 | + int i; |
---|
| 7480 | + |
---|
| 7481 | + /* Make sure the linker didn't screw up */ |
---|
| 7482 | + BUG_ON(&idle_sched_class + 1 != &fair_sched_class || |
---|
| 7483 | + &fair_sched_class + 1 != &rt_sched_class || |
---|
| 7484 | + &rt_sched_class + 1 != &dl_sched_class); |
---|
| 7485 | +#ifdef CONFIG_SMP |
---|
| 7486 | + BUG_ON(&dl_sched_class + 1 != &stop_sched_class); |
---|
| 7487 | +#endif |
---|
6613 | 7488 | |
---|
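The BUG_ON() chain at the top of sched_init() documents and enforces an assumption the scheduler now relies on: the sched_class descriptors are laid out adjacently in memory, in priority order, so the core can walk them by address instead of following per-class next pointers. A toy illustration of that idiom, assuming kernel context; the names below are invented and this is not the kernel's actual for_each_class() machinery:

        #include <linux/kernel.h>

        struct demo_class { const char *name; };

        /* Adjacent, lowest priority first, like the real descriptors. */
        static const struct demo_class demo_classes[] = {
                { "idle" }, { "fair" }, { "rt" }, { "dl" },
        };

        static void demo_walk(void)
        {
                int i;

                /* Highest priority first; adjacency is what lets the real
                 * code do the same walk with bare pointer decrements. */
                for (i = ARRAY_SIZE(demo_classes) - 1; i >= 0; i--)
                        pr_info("class %s\n", demo_classes[i].name);
        }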
6614 | 7489 | wait_bit_init(); |
---|
6615 | 7490 | |
---|
6616 | 7491 | #ifdef CONFIG_FAIR_GROUP_SCHED |
---|
6617 | | - alloc_size += 2 * nr_cpu_ids * sizeof(void **); |
---|
| 7492 | + ptr += 2 * nr_cpu_ids * sizeof(void **); |
---|
6618 | 7493 | #endif |
---|
6619 | 7494 | #ifdef CONFIG_RT_GROUP_SCHED |
---|
6620 | | - alloc_size += 2 * nr_cpu_ids * sizeof(void **); |
---|
| 7495 | + ptr += 2 * nr_cpu_ids * sizeof(void **); |
---|
6621 | 7496 | #endif |
---|
6622 | | - if (alloc_size) { |
---|
6623 | | - ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
---|
| 7497 | + if (ptr) { |
---|
| 7498 | + ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); |
---|
6624 | 7499 | |
---|
6625 | 7500 | #ifdef CONFIG_FAIR_GROUP_SCHED |
---|
6626 | 7501 | root_task_group.se = (struct sched_entity **)ptr; |
---|
.. | .. |
---|
6629 | 7504 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
---|
6630 | 7505 | ptr += nr_cpu_ids * sizeof(void **); |
---|
6631 | 7506 | |
---|
| 7507 | + root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
---|
| 7508 | + init_cfs_bandwidth(&root_task_group.cfs_bandwidth); |
---|
6632 | 7509 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
---|
6633 | 7510 | #ifdef CONFIG_RT_GROUP_SCHED |
---|
6634 | 7511 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; |
---|
.. | .. |
---|
6681 | 7558 | init_rt_rq(&rq->rt); |
---|
6682 | 7559 | init_dl_rq(&rq->dl); |
---|
6683 | 7560 | #ifdef CONFIG_FAIR_GROUP_SCHED |
---|
6684 | | - root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
---|
6685 | 7561 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
---|
6686 | 7562 | rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; |
---|
6687 | 7563 | /* |
---|
.. | .. |
---|
6703 | 7579 | * We achieve this by letting root_task_group's tasks sit |
---|
6704 | 7580 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
---|
6705 | 7581 | */ |
---|
6706 | | - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); |
---|
6707 | 7582 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
---|
6708 | 7583 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
---|
6709 | 7584 | |
---|
.. | .. |
---|
6711 | 7586 | #ifdef CONFIG_RT_GROUP_SCHED |
---|
6712 | 7587 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); |
---|
6713 | 7588 | #endif |
---|
6714 | | - |
---|
6715 | | - for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
---|
6716 | | - rq->cpu_load[j] = 0; |
---|
6717 | | - |
---|
6718 | 7589 | #ifdef CONFIG_SMP |
---|
6719 | 7590 | rq->sd = NULL; |
---|
6720 | 7591 | rq->rd = NULL; |
---|
.. | .. |
---|
6733 | 7604 | |
---|
6734 | 7605 | rq_attach_root(rq, &def_root_domain); |
---|
6735 | 7606 | #ifdef CONFIG_NO_HZ_COMMON |
---|
6736 | | - rq->last_load_update_tick = jiffies; |
---|
6737 | 7607 | rq->last_blocked_load_update_tick = jiffies; |
---|
6738 | 7608 | atomic_set(&rq->nohz_flags, 0); |
---|
| 7609 | + |
---|
| 7610 | + rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); |
---|
6739 | 7611 | #endif |
---|
6740 | 7612 | #endif /* CONFIG_SMP */ |
---|
6741 | 7613 | hrtick_rq_init(rq); |
---|
6742 | 7614 | atomic_set(&rq->nr_iowait, 0); |
---|
6743 | 7615 | } |
---|
6744 | 7616 | |
---|
6745 | | - set_load_weight(&init_task, false); |
---|
| 7617 | + set_load_weight(&init_task); |
---|
6746 | 7618 | |
---|
6747 | 7619 | /* |
---|
6748 | 7620 | * The boot idle thread does lazy MMU switching as well: |
---|
.. | .. |
---|
6811 | 7683 | rcu_sleep_check(); |
---|
6812 | 7684 | |
---|
6813 | 7685 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && |
---|
6814 | | - !is_idle_task(current)) || |
---|
| 7686 | + !is_idle_task(current) && !current->non_block_count) || |
---|
6815 | 7687 | system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || |
---|
6816 | 7688 | oops_in_progress) |
---|
6817 | 7689 | return; |
---|
.. | .. |
---|
6827 | 7699 | "BUG: sleeping function called from invalid context at %s:%d\n", |
---|
6828 | 7700 | file, line); |
---|
6829 | 7701 | printk(KERN_ERR |
---|
6830 | | - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", |
---|
6831 | | - in_atomic(), irqs_disabled(), |
---|
| 7702 | + "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", |
---|
| 7703 | + in_atomic(), irqs_disabled(), current->non_block_count, |
---|
6832 | 7704 | current->pid, current->comm); |
---|
6833 | 7705 | |
---|
6834 | 7706 | if (task_stack_end_corrupted(current)) |
---|
.. | .. |
---|
6840 | 7712 | if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) |
---|
6841 | 7713 | && !preempt_count_equals(preempt_offset)) { |
---|
6842 | 7714 | pr_err("Preemption disabled at:"); |
---|
6843 | | - print_ip_sym(preempt_disable_ip); |
---|
6844 | | - pr_cont("\n"); |
---|
| 7715 | + print_ip_sym(KERN_ERR, preempt_disable_ip); |
---|
6845 | 7716 | } |
---|
| 7717 | + |
---|
| 7718 | + trace_android_rvh_schedule_bug(NULL); |
---|
| 7719 | + |
---|
6846 | 7720 | dump_stack(); |
---|
6847 | 7721 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); |
---|
6848 | 7722 | } |
---|
6849 | 7723 | EXPORT_SYMBOL(___might_sleep); |
---|
| 7724 | + |
---|
| 7725 | +void __cant_sleep(const char *file, int line, int preempt_offset) |
---|
| 7726 | +{ |
---|
| 7727 | + static unsigned long prev_jiffy; |
---|
| 7728 | + |
---|
| 7729 | + if (irqs_disabled()) |
---|
| 7730 | + return; |
---|
| 7731 | + |
---|
| 7732 | + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) |
---|
| 7733 | + return; |
---|
| 7734 | + |
---|
| 7735 | + if (preempt_count() > preempt_offset) |
---|
| 7736 | + return; |
---|
| 7737 | + |
---|
| 7738 | + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
---|
| 7739 | + return; |
---|
| 7740 | + prev_jiffy = jiffies; |
---|
| 7741 | + |
---|
| 7742 | + printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); |
---|
| 7743 | + printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", |
---|
| 7744 | + in_atomic(), irqs_disabled(), |
---|
| 7745 | + current->pid, current->comm); |
---|
| 7746 | + |
---|
| 7747 | + debug_show_held_locks(current); |
---|
| 7748 | + dump_stack(); |
---|
| 7749 | + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); |
---|
| 7750 | +} |
---|
| 7751 | +EXPORT_SYMBOL_GPL(__cant_sleep); |
---|
6850 | 7752 | #endif |
---|
6851 | 7753 | |
---|
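__cant_sleep() is the inverse of ___might_sleep() just above: it warns when code that is supposed to run atomically is reached with preemption still enabled and interrupts on. It is normally invoked through the cant_sleep() annotation macro; a hedged sketch of a user, assuming kernel context (do_atomic_poke() and its register argument are made up for illustration):

        /* Hypothetical helper that must only run with preemption disabled;
         * CONFIG_DEBUG_ATOMIC_SLEEP builds will complain otherwise. */
        static void do_atomic_poke(void __iomem *reg, u32 val)
        {
                cant_sleep();   /* ends up in __cant_sleep() on debug builds */
                writel(val, reg);
        }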
6852 | 7754 | #ifdef CONFIG_MAGIC_SYSRQ |
---|
.. | .. |
---|
6915 | 7817 | |
---|
6916 | 7818 | #ifdef CONFIG_IA64 |
---|
6917 | 7819 | /** |
---|
6918 | | - * set_curr_task - set the current task for a given CPU. |
---|
| 7820 | + * ia64_set_curr_task - set the current task for a given CPU. |
---|
6919 | 7821 | * @cpu: the processor in question. |
---|
6920 | 7822 | * @p: the task pointer to set. |
---|
6921 | 7823 | * |
---|
.. | .. |
---|
7081 | 7983 | |
---|
7082 | 7984 | if (queued) |
---|
7083 | 7985 | enqueue_task(rq, tsk, queue_flags); |
---|
7084 | | - if (running) |
---|
7085 | | - set_curr_task(rq, tsk); |
---|
| 7986 | + if (running) { |
---|
| 7987 | + set_next_task(rq, tsk); |
---|
| 7988 | + /* |
---|
| 7989 | + * After changing group, the running task may have joined a |
---|
| 7990 | + * throttled one but it's still the running task. Trigger a |
---|
| 7991 | + * resched to make sure that task can still run. |
---|
| 7992 | + */ |
---|
| 7993 | + resched_curr(rq); |
---|
| 7994 | + } |
---|
7086 | 7995 | |
---|
7087 | 7996 | task_rq_unlock(rq, tsk, &rf); |
---|
7088 | 7997 | } |
---|
.. | .. |
---|
7121 | 8030 | |
---|
7122 | 8031 | #ifdef CONFIG_UCLAMP_TASK_GROUP |
---|
7123 | 8032 | /* Propagate the effective uclamp value for the new group */ |
---|
| 8033 | + mutex_lock(&uclamp_mutex); |
---|
| 8034 | + rcu_read_lock(); |
---|
7124 | 8035 | cpu_util_update_eff(css); |
---|
| 8036 | + rcu_read_unlock(); |
---|
| 8037 | + mutex_unlock(&uclamp_mutex); |
---|
7125 | 8038 | #endif |
---|
7126 | 8039 | |
---|
| 8040 | + trace_android_rvh_cpu_cgroup_online(css); |
---|
7127 | 8041 | return 0; |
---|
7128 | 8042 | } |
---|
7129 | 8043 | |
---|
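The css_online path now brackets cpu_util_update_eff() with uclamp_mutex and an RCU read-side critical section, matching the lockdep_assert_held() and SCHED_WARN_ON() checks added inside that helper further down. Any other caller has to follow the same pattern; a minimal sketch, with a hypothetical function name:

        /* Hypothetical caller: serialize against other uclamp updates and
         * keep the css hierarchy stable while it is walked. */
        static void uclamp_refresh_example(struct cgroup_subsys_state *css)
        {
                mutex_lock(&uclamp_mutex);
                rcu_read_lock();
                cpu_util_update_eff(css);
                rcu_read_unlock();
                mutex_unlock(&uclamp_mutex);
        }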
.. | .. |
---|
7189 | 8103 | if (ret) |
---|
7190 | 8104 | break; |
---|
7191 | 8105 | } |
---|
| 8106 | + |
---|
| 8107 | + trace_android_rvh_cpu_cgroup_can_attach(tset, &ret); |
---|
| 8108 | + |
---|
7192 | 8109 | return ret; |
---|
7193 | 8110 | } |
---|
7194 | 8111 | |
---|
.. | .. |
---|
7199 | 8116 | |
---|
7200 | 8117 | cgroup_taskset_for_each(task, css, tset) |
---|
7201 | 8118 | sched_move_task(task); |
---|
| 8119 | + |
---|
| 8120 | + trace_android_rvh_cpu_cgroup_attach(tset); |
---|
7202 | 8121 | } |
---|
7203 | 8122 | |
---|
7204 | 8123 | #ifdef CONFIG_UCLAMP_TASK_GROUP |
---|
.. | .. |
---|
7210 | 8129 | unsigned int eff[UCLAMP_CNT]; |
---|
7211 | 8130 | enum uclamp_id clamp_id; |
---|
7212 | 8131 | unsigned int clamps; |
---|
| 8132 | + |
---|
| 8133 | + lockdep_assert_held(&uclamp_mutex); |
---|
| 8134 | + SCHED_WARN_ON(!rcu_read_lock_held()); |
---|
7213 | 8135 | |
---|
7214 | 8136 | css_for_each_descendant_pre(css, top_css) { |
---|
7215 | 8137 | uc_parent = css_tg(css)->parent |
---|
.. | .. |
---|
7243 | 8165 | } |
---|
7244 | 8166 | |
---|
7245 | 8167 | /* Immediately update descendants RUNNABLE tasks */ |
---|
7246 | | - uclamp_update_active_tasks(css, clamps); |
---|
| 8168 | + uclamp_update_active_tasks(css); |
---|
7247 | 8169 | } |
---|
7248 | 8170 | } |
---|
7249 | 8171 | |
---|
.. | .. |
---|
7300 | 8222 | req = capacity_from_percent(buf); |
---|
7301 | 8223 | if (req.ret) |
---|
7302 | 8224 | return req.ret; |
---|
| 8225 | + |
---|
| 8226 | + static_branch_enable(&sched_uclamp_used); |
---|
7303 | 8227 | |
---|
7304 | 8228 | mutex_lock(&uclamp_mutex); |
---|
7305 | 8229 | rcu_read_lock(); |
---|
.. | .. |
---|
7415 | 8339 | static DEFINE_MUTEX(cfs_constraints_mutex); |
---|
7416 | 8340 | |
---|
7417 | 8341 | const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ |
---|
7418 | | -const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ |
---|
| 8342 | +static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ |
---|
| 8343 | +/* More than 203 days if BW_SHIFT equals 20. */ |
---|
| 8344 | +static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; |
---|
7419 | 8345 | |
---|
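The new bound makes the "203 days" remark concrete. Assuming MAX_BW is ((1ULL << (64 - BW_SHIFT)) - 1) with BW_SHIFT equal to 20, as defined in sched.h, and noting that the quota compared against it later is already in nanoseconds:

        max_cfs_runtime = MAX_BW * NSEC_PER_USEC
                        = (2^44 - 1) * 1000 ns
                        ≈ 1.76 * 10^16 ns
                        ≈ 1.76 * 10^7 s
                        ≈ 203.6 days

Any quota above this would risk overflowing the fixed-point bandwidth arithmetic once shifted by BW_SHIFT, which is why the later check rejects it with -EINVAL.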
7420 | 8346 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); |
---|
7421 | 8347 | |
---|
.. | .. |
---|
7441 | 8367 | * feasibility. |
---|
7442 | 8368 | */ |
---|
7443 | 8369 | if (period > max_cfs_quota_period) |
---|
| 8370 | + return -EINVAL; |
---|
| 8371 | + |
---|
| 8372 | + /* |
---|
| 8373 | + * Bound quota to defend quota against overflow during bandwidth shift. |
---|
| 8374 | + */ |
---|
| 8375 | + if (quota != RUNTIME_INF && quota > max_cfs_runtime) |
---|
7444 | 8376 | return -EINVAL; |
---|
7445 | 8377 | |
---|
7446 | 8378 | /* |
---|
.. | .. |
---|
7495 | 8427 | return ret; |
---|
7496 | 8428 | } |
---|
7497 | 8429 | |
---|
7498 | | -int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) |
---|
| 8430 | +static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) |
---|
7499 | 8431 | { |
---|
7500 | 8432 | u64 quota, period; |
---|
7501 | 8433 | |
---|
.. | .. |
---|
7510 | 8442 | return tg_set_cfs_bandwidth(tg, period, quota); |
---|
7511 | 8443 | } |
---|
7512 | 8444 | |
---|
7513 | | -long tg_get_cfs_quota(struct task_group *tg) |
---|
| 8445 | +static long tg_get_cfs_quota(struct task_group *tg) |
---|
7514 | 8446 | { |
---|
7515 | 8447 | u64 quota_us; |
---|
7516 | 8448 | |
---|
.. | .. |
---|
7523 | 8455 | return quota_us; |
---|
7524 | 8456 | } |
---|
7525 | 8457 | |
---|
7526 | | -int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) |
---|
| 8458 | +static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) |
---|
7527 | 8459 | { |
---|
7528 | 8460 | u64 quota, period; |
---|
7529 | 8461 | |
---|
.. | .. |
---|
7536 | 8468 | return tg_set_cfs_bandwidth(tg, period, quota); |
---|
7537 | 8469 | } |
---|
7538 | 8470 | |
---|
7539 | | -long tg_get_cfs_period(struct task_group *tg) |
---|
| 8471 | +static long tg_get_cfs_period(struct task_group *tg) |
---|
7540 | 8472 | { |
---|
7541 | 8473 | u64 cfs_period_us; |
---|
7542 | 8474 | |
---|
.. | .. |
---|
8013 | 8945 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
---|
8014 | 8946 | }; |
---|
8015 | 8947 | |
---|
8016 | | -#undef CREATE_TRACE_POINTS |
---|
| 8948 | +void call_trace_sched_update_nr_running(struct rq *rq, int count) |
---|
| 8949 | +{ |
---|
| 8950 | + trace_sched_update_nr_running_tp(rq, count); |
---|
| 8951 | +} |
---|