forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-05-11 297b60346df8beafee954a0fd7c2d64f33f3b9bc
kernel/kernel/sched/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  *  kernel/sched/core.c
  *
@@ -5,6 +6,10 @@
  *
  *  Copyright (C) 1991-2002  Linus Torvalds
  */
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+#undef CREATE_TRACE_POINTS
+
 #include "sched.h"
 
 #include <linux/nospec.h>
@@ -16,14 +21,41 @@
 #include <asm/tlb.h>
 
 #include "../workqueue_internal.h"
+#include "../../io_uring/io-wq.h"
 #include "../smpboot.h"
 
 #include "pelt.h"
+#include "smp.h"
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
+#include <trace/hooks/sched.h>
+#include <trace/hooks/dtask.h>
+
+/*
+ * Export tracepoints that act as a bare tracehook (ie: have no trace event
+ * associated with them) to allow external modules to probe them.
+ */
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_switch);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
+#ifdef CONFIG_SCHEDSTATS
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
+#endif
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+EXPORT_SYMBOL_GPL(runqueues);
 
 #ifdef CONFIG_SCHED_DEBUG
 /*
@@ -38,6 +70,7 @@
 const_debug unsigned int sysctl_sched_features =
 #include "features.h"
 	0;
+EXPORT_SYMBOL_GPL(sysctl_sched_features);
 #undef SCHED_FEAT
 #endif
 
@@ -45,11 +78,7 @@
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
  */
-#ifdef CONFIG_PREEMPT_RT_FULL
-const_debug unsigned int sysctl_sched_nr_migrate = 8;
-#else
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
-#endif
 
 /*
  * period over which we measure -rt task CPU usage in us.
@@ -64,6 +93,100 @@
  * default: 0.95s
  */
 int sysctl_sched_rt_runtime = 950000;
+
+
+/*
+ * Serialization rules:
+ *
+ * Lock order:
+ *
+ *   p->pi_lock
+ *     rq->lock
+ *       hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
+ *
+ *  rq1->lock
+ *    rq2->lock  where: rq1 < rq2
+ *
+ * Regular state:
+ *
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
+ * always looks at the local rq data structures to find the most eligible task
+ * to run next.
+ *
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
+ * the local CPU to avoid bouncing the runqueue state around [ see
+ * ttwu_queue_wakelist() ]
+ *
+ * Task wakeup, specifically wakeups that involve migration, are horribly
+ * complicated to avoid having to take two rq->locks.
+ *
+ * Special state:
+ *
+ * System-calls and anything external will use task_rq_lock() which acquires
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
+ * stable while holding either lock:
+ *
+ *  - sched_setaffinity()/
+ *    set_cpus_allowed_ptr():	p->cpus_ptr, p->nr_cpus_allowed
+ *  - set_user_nice():		p->se.load, p->*prio
+ *  - __sched_setscheduler():	p->sched_class, p->policy, p->*prio,
+ *				p->se.load, p->rt_priority,
+ *				p->dl.dl_{runtime, deadline, period, flags, bw, density}
+ *  - sched_setnuma():		p->numa_preferred_nid
+ *  - sched_move_task()/
+ *    cpu_cgroup_fork():	p->sched_task_group
+ *  - uclamp_update_active()	p->uclamp*
+ *
+ * p->state <- TASK_*:
+ *
+ *   is changed locklessly using set_current_state(), __set_current_state() or
+ *   set_special_state(), see their respective comments, or by
+ *   try_to_wake_up(). This latter uses p->pi_lock to serialize against
+ *   concurrent self.
+ *
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
+ *
+ *   is set by activate_task() and cleared by deactivate_task(), under
+ *   rq->lock. Non-zero indicates the task is runnable, the special
+ *   ON_RQ_MIGRATING state is used for migration without holding both
+ *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
+ *
+ * p->on_cpu <- { 0, 1 }:
+ *
+ *   is set by prepare_task() and cleared by finish_task() such that it will be
+ *   set before p is scheduled-in and cleared after p is scheduled-out, both
+ *   under rq->lock. Non-zero indicates the task is running on its CPU.
+ *
+ *   [ The astute reader will observe that it is possible for two tasks on one
+ *     CPU to have ->on_cpu = 1 at the same time. ]
+ *
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
+ *
+ *  - Don't call set_task_cpu() on a blocked task:
+ *
+ *    We don't care what CPU we're not running on, this simplifies hotplug,
+ *    the CPU assignment of blocked tasks isn't required to be valid.
+ *
+ *  - for try_to_wake_up(), called under p->pi_lock:
+ *
+ *    This allows try_to_wake_up() to only take one rq->lock, see its comment.
+ *
+ *  - for migration called under rq->lock:
+ *    [ see task_on_rq_migrating() in task_rq_lock() ]
+ *
+ *    o move_queued_task()
+ *    o detach_task()
+ *
+ *  - for migration called under double_rq_lock():
+ *
+ *    o __migrate_swap_task()
+ *    o push_rt_task() / pull_rt_task()
+ *    o push_dl_task() / pull_dl_task()
+ *    o dl_task_offline_migration()
+ *
+ */
 
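
Editor's illustration (not part of the patch): a minimal sketch of how external code is expected to follow the lock order documented above. The helper name is hypothetical; task_rq_lock() takes p->pi_lock first and then rq->lock, and only then is the "special state" stable to modify.

static void example_update_special_state(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;

	/* Acquires p->pi_lock first, then rq->lock, matching the documented order. */
	rq = task_rq_lock(p, &rf);
	update_rq_clock(rq);

	/* ... modify p->policy, p->*prio, p->sched_task_group, etc. here ... */

	task_rq_unlock(rq, p, &rf);	/* drops rq->lock, then p->pi_lock */
}
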
68191 /*
69192 * __task_rq_lock - lock the rq @p resides on.
....@@ -88,6 +211,7 @@
88211 cpu_relax();
89212 }
90213 }
214
+EXPORT_SYMBOL_GPL(__task_rq_lock);
91215
92216 /*
93217 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
....@@ -130,6 +254,7 @@
130254 cpu_relax();
131255 }
132256 }
257
+EXPORT_SYMBOL_GPL(task_rq_lock);
133258
134259 /*
135260 * RQ-clock updating methods:
....@@ -210,7 +335,15 @@
210335 rq->clock += delta;
211336 update_rq_clock_task(rq, delta);
212337 }
338
+EXPORT_SYMBOL_GPL(update_rq_clock);
213339
340
+static inline void
341
+rq_csd_init(struct rq *rq, struct __call_single_data *csd, smp_call_func_t func)
342
+{
343
+ csd->flags = 0;
344
+ csd->func = func;
345
+ csd->info = rq;
346
+}
214347
215348 #ifdef CONFIG_SCHED_HRTICK
216349 /*
....@@ -247,8 +380,9 @@
247380 static void __hrtick_restart(struct rq *rq)
248381 {
249382 struct hrtimer *timer = &rq->hrtick_timer;
383
+ ktime_t time = rq->hrtick_time;
250384
251
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
385
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
252386 }
253387
254388 /*
....@@ -261,7 +395,6 @@
261395
262396 rq_lock(rq, &rf);
263397 __hrtick_restart(rq);
264
- rq->hrtick_csd_pending = 0;
265398 rq_unlock(rq, &rf);
266399 }
267400
....@@ -273,7 +406,6 @@
273406 void hrtick_start(struct rq *rq, u64 delay)
274407 {
275408 struct hrtimer *timer = &rq->hrtick_timer;
276
- ktime_t time;
277409 s64 delta;
278410
279411 /*
....@@ -281,16 +413,12 @@
281413 * doesn't make sense and can cause timer DoS.
282414 */
283415 delta = max_t(s64, delay, 10000LL);
284
- time = ktime_add_ns(timer->base->get_time(), delta);
416
+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
285417
286
- hrtimer_set_expires(timer, time);
287
-
288
- if (rq == this_rq()) {
418
+ if (rq == this_rq())
289419 __hrtick_restart(rq);
290
- } else if (!rq->hrtick_csd_pending) {
420
+ else
291421 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
292
- rq->hrtick_csd_pending = 1;
293
- }
294422 }
295423
296424 #else
....@@ -307,20 +435,16 @@
307435 */
308436 delay = max_t(u64, delay, 10000LL);
309437 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
310
- HRTIMER_MODE_REL_PINNED);
438
+ HRTIMER_MODE_REL_PINNED_HARD);
311439 }
440
+
312441 #endif /* CONFIG_SMP */
313442
314443 static void hrtick_rq_init(struct rq *rq)
315444 {
316445 #ifdef CONFIG_SMP
317
- rq->hrtick_csd_pending = 0;
318
-
319
- rq->hrtick_csd.flags = 0;
320
- rq->hrtick_csd.func = __hrtick_start;
321
- rq->hrtick_csd.info = rq;
446
+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
322447 #endif
323
-
324448 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
325449 rq->hrtick_timer.function = hrtick;
326450 }
....@@ -403,15 +527,9 @@
403527 #endif
404528 #endif
405529
406
-void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
407
- bool sleeper)
530
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
408531 {
409
- struct wake_q_node *node;
410
-
411
- if (sleeper)
412
- node = &task->wake_q_sleeper;
413
- else
414
- node = &task->wake_q;
532
+ struct wake_q_node *node = &task->wake_q;
415533
416534 /*
417535 * Atomically grab the task, if ->wake_q is !nil already it means
....@@ -422,50 +540,79 @@
422540 * state, even in the failed case, an explicit smp_mb() must be used.
423541 */
424542 smp_mb__before_atomic();
425
- if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
426
- return;
427
-
428
- head->count++;
429
-
430
- get_task_struct(task);
543
+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
544
+ return false;
431545
432546 /*
433547 * The head is context local, there can be no concurrency.
434548 */
435549 *head->lastp = node;
436550 head->lastp = &node->next;
551
+ head->count++;
552
+ return true;
437553 }
438554
439
-static int
440
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
441
- int sibling_count_hint);
442
-void __wake_up_q(struct wake_q_head *head, bool sleeper)
+/**
+ * wake_q_add() - queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ */
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+	if (__wake_q_add(head, task))
+		get_task_struct(task);
+}
+
+/**
+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ *
+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
+ * that already hold reference to @task can call the 'safe' version and trust
+ * wake_q to do the right thing depending whether or not the @task is already
+ * queued for wakeup.
+ */
+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
+{
+	if (!__wake_q_add(head, task))
+		put_task_struct(task);
+}
+
+void wake_up_q(struct wake_q_head *head)
443597 {
444598 struct wake_q_node *node = head->first;
445599
446600 while (node != WAKE_Q_TAIL) {
447601 struct task_struct *task;
448602
449
- if (sleeper)
450
- task = container_of(node, struct task_struct, wake_q_sleeper);
451
- else
452
- task = container_of(node, struct task_struct, wake_q);
603
+ task = container_of(node, struct task_struct, wake_q);
453604 BUG_ON(!task);
454605 /* Task can safely be re-inserted now: */
455606 node = node->next;
456
- if (sleeper)
457
- task->wake_q_sleeper.next = NULL;
458
- else
459
- task->wake_q.next = NULL;
607
+ task->wake_q.next = NULL;
608
+ task->wake_q_count = head->count;
609
+
460610 /*
461611 * wake_up_process() executes a full barrier, which pairs with
462612 * the queueing in wake_q_add() so as not to miss wakeups.
463613 */
464
- if (sleeper)
465
- wake_up_lock_sleeper(task);
466
- else
467
- wake_up_process(task);
468
-
614
+ wake_up_process(task);
615
+ task->wake_q_count = 0;
469616 put_task_struct(task);
470617 }
471618 }
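
For context, a minimal sketch of how the wake_q API documented above is typically used by callers (not part of the patch; my_lock and waiter are placeholder names): wakeups are queued while a lock is held and only issued after it is dropped.

static void example_wake_one(spinlock_t *my_lock, struct task_struct *waiter)
{
	DEFINE_WAKE_Q(wake_q);

	spin_lock(my_lock);
	/* Takes a task reference; no wakeup happens yet. */
	wake_q_add(&wake_q, waiter);
	spin_unlock(my_lock);

	/* The deferred wake_up_process() calls happen here, outside the lock. */
	wake_up_q(&wake_q);
}
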
....@@ -495,57 +642,12 @@
495642 return;
496643 }
497644
498
-#ifdef CONFIG_PREEMPT
499645 if (set_nr_and_not_polling(curr))
500
-#else
501
- if (set_nr_and_not_polling(curr) && (rq->curr == rq->idle))
502
-#endif
503646 smp_send_reschedule(cpu);
504647 else
505648 trace_sched_wake_idle_without_ipi(cpu);
506649 }
507
-
508
-#ifdef CONFIG_PREEMPT_LAZY
509
-
510
-static int tsk_is_polling(struct task_struct *p)
511
-{
512
-#ifdef TIF_POLLING_NRFLAG
513
- return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
514
-#else
515
- return 0;
516
-#endif
517
-}
518
-
519
-void resched_curr_lazy(struct rq *rq)
520
-{
521
- struct task_struct *curr = rq->curr;
522
- int cpu;
523
-
524
- if (!sched_feat(PREEMPT_LAZY)) {
525
- resched_curr(rq);
526
- return;
527
- }
528
-
529
- lockdep_assert_held(&rq->lock);
530
-
531
- if (test_tsk_need_resched(curr))
532
- return;
533
-
534
- if (test_tsk_need_resched_lazy(curr))
535
- return;
536
-
537
- set_tsk_need_resched_lazy(curr);
538
-
539
- cpu = cpu_of(rq);
540
- if (cpu == smp_processor_id())
541
- return;
542
-
543
- /* NEED_RESCHED_LAZY must be visible before we test polling */
544
- smp_mb();
545
- if (!tsk_is_polling(curr))
546
- smp_send_reschedule(cpu);
547
-}
548
-#endif
650
+EXPORT_SYMBOL_GPL(resched_curr);
549651
550652 void resched_cpu(int cpu)
551653 {
....@@ -570,27 +672,49 @@
570672 */
571673 int get_nohz_timer_target(void)
572674 {
573
- int i, cpu = smp_processor_id();
675
+ int i, cpu = smp_processor_id(), default_cpu = -1;
574676 struct sched_domain *sd;
575677
576
- if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
577
- return cpu;
678
+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
679
+ if (!idle_cpu(cpu))
680
+ return cpu;
681
+ default_cpu = cpu;
682
+ }
578683
579684 rcu_read_lock();
580685 for_each_domain(cpu, sd) {
581
- for_each_cpu(i, sched_domain_span(sd)) {
686
+ for_each_cpu_and(i, sched_domain_span(sd),
687
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
582688 if (cpu == i)
583689 continue;
584690
585
- if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
691
+ if (!idle_cpu(i)) {
586692 cpu = i;
587693 goto unlock;
588694 }
589695 }
590696 }
591697
592
- if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
593
- cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
698
+ if (default_cpu == -1) {
699
+ for_each_cpu_and(i, cpu_active_mask,
700
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
701
+ if (cpu == i)
702
+ continue;
703
+
704
+ if (!idle_cpu(i)) {
705
+ cpu = i;
706
+ goto unlock;
707
+ }
708
+ }
709
+
710
+ /* no active, not-idle, housekeeping CPU found. */
711
+ default_cpu = cpumask_any(cpu_active_mask);
712
+
713
+ if (unlikely(default_cpu >= nr_cpu_ids))
714
+ goto unlock;
715
+ }
716
+
717
+ cpu = default_cpu;
594718 unlock:
595719 rcu_read_unlock();
596720 return cpu;
....@@ -650,29 +774,23 @@
650774 wake_up_idle_cpu(cpu);
651775 }
652776
653
-static inline bool got_nohz_idle_kick(void)
777
+static void nohz_csd_func(void *info)
654778 {
655
- int cpu = smp_processor_id();
656
-
657
- if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
658
- return false;
659
-
660
- if (idle_cpu(cpu) && !need_resched())
661
- return true;
779
+ struct rq *rq = info;
780
+ int cpu = cpu_of(rq);
781
+ unsigned int flags;
662782
663783 /*
664
- * We can't run Idle Load Balance on this CPU for this time so we
665
- * cancel it and clear NOHZ_BALANCE_KICK
784
+ * Release the rq::nohz_csd.
666785 */
667
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
668
- return false;
669
-}
786
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
787
+ WARN_ON(!(flags & NOHZ_KICK_MASK));
670788
671
-#else /* CONFIG_NO_HZ_COMMON */
672
-
673
-static inline bool got_nohz_idle_kick(void)
674
-{
675
- return false;
789
+ rq->idle_balance = idle_cpu(cpu);
790
+ if (rq->idle_balance && !need_resched()) {
791
+ rq->nohz_idle_balance = flags;
792
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
793
+ }
676794 }
677795
678796 #endif /* CONFIG_NO_HZ_COMMON */
....@@ -763,18 +881,18 @@
763881 }
764882 #endif
765883
766
-static void set_load_weight(struct task_struct *p, bool update_load)
884
+static void set_load_weight(struct task_struct *p)
767885 {
886
+ bool update_load = !(READ_ONCE(p->state) & TASK_NEW);
768887 int prio = p->static_prio - MAX_RT_PRIO;
769888 struct load_weight *load = &p->se.load;
770889
771890 /*
772891 * SCHED_IDLE tasks get minimal weight:
773892 */
774
- if (idle_policy(p->policy)) {
893
+ if (task_has_idle_policy(p)) {
775894 load->weight = scale_load(WEIGHT_IDLEPRIO);
776895 load->inv_weight = WMULT_IDLEPRIO;
777
- p->se.runnable_weight = load->weight;
778896 return;
779897 }
780898
....@@ -787,7 +905,6 @@
787905 } else {
788906 load->weight = scale_load(sched_prio_to_weight[prio]);
789907 load->inv_weight = sched_prio_to_wmult[prio];
790
- p->se.runnable_weight = load->weight;
791908 }
792909 }
793910
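
A worked example of the weight lookup in set_load_weight() above (not part of the patch), assuming the standard sched_prio_to_weight[] table:

/*
 *   nice   0: prio = 120 - MAX_RT_PRIO(100) = 20 -> sched_prio_to_weight[20] = 1024
 *   nice -10: prio = 10                          -> sched_prio_to_weight[10] = 9548
 *   nice  19: prio = 39                          -> sched_prio_to_weight[39] = 15
 *   SCHED_IDLE tasks skip the table and use WEIGHT_IDLEPRIO (3) instead.
 */
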
....@@ -810,8 +927,46 @@
 /* Max allowed maximum utilization */
 unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
 
+/*
+ * By default RT tasks run at the maximum performance point/capacity of the
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
+ * SCHED_CAPACITY_SCALE.
+ *
+ * This knob allows admins to change the default behavior when uclamp is being
+ * used. In battery powered devices, particularly, running at the maximum
+ * capacity and frequency will increase energy consumption and shorten the
+ * battery life.
+ *
+ * This knob only affects RT tasks whose uclamp_se->user_defined == false.
+ *
+ * This knob will not override the system default sched_util_clamp_min defined
+ * above.
+ */
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+
 /* All clamps are required to be less or equal than these values */
 static struct uclamp_se uclamp_default[UCLAMP_CNT];
+
+/*
+ * This static key is used to reduce the uclamp overhead in the fast path. It
+ * primarily disables the call to uclamp_rq_{inc, dec}() in
+ * enqueue/dequeue_task().
+ *
+ * This allows users to continue to enable uclamp in their kernel config with
+ * minimum uclamp overhead in the fast path.
+ *
+ * As soon as userspace modifies any of the uclamp knobs, the static key is
+ * enabled, since we then have actual users that make use of uclamp
+ * functionality.
+ *
+ * The knobs that would enable this static key are:
+ *
+ *   * A task modifying its uclamp value with sched_setattr().
+ *   * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
+ *   * An admin modifying the cgroup cpu.uclamp.{min, max}
+ */
+DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
+EXPORT_SYMBOL_GPL(sched_uclamp_used);
 
816971 /* Integer rounded range for each bucket */
817972 #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
....@@ -822,11 +977,6 @@
822977 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
823978 {
824979 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
825
-}
826
-
827
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
828
-{
829
- return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
830980 }
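
A worked example of the bucket mapping above (not part of the patch), assuming the default CONFIG_UCLAMP_BUCKETS_COUNT of 5:

/*
 *   UCLAMP_BUCKET_DELTA    = DIV_ROUND_CLOSEST(1024, 5) = 205
 *   uclamp_bucket_id(0)    = min(   0 / 205, 4) = 0
 *   uclamp_bucket_id(512)  = min( 512 / 205, 4) = 2
 *   uclamp_bucket_id(1024) = min(1024 / 205, 4) = 4   (the top bucket)
 */
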
831981
832982 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
....@@ -868,7 +1018,7 @@
8681018 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
8691019 return;
8701020
871
- WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
1021
+ uclamp_rq_set(rq, clamp_id, clamp_value);
8721022 }
8731023
8741024 static inline
....@@ -892,12 +1042,79 @@
8921042 return uclamp_idle_value(rq, clamp_id, clamp_value);
8931043 }
8941044
1045
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1046
+{
1047
+ unsigned int default_util_min;
1048
+ struct uclamp_se *uc_se;
1049
+
1050
+ lockdep_assert_held(&p->pi_lock);
1051
+
1052
+ uc_se = &p->uclamp_req[UCLAMP_MIN];
1053
+
1054
+ /* Only sync if user didn't override the default */
1055
+ if (uc_se->user_defined)
1056
+ return;
1057
+
1058
+ default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1059
+ uclamp_se_set(uc_se, default_util_min, false);
1060
+}
1061
+
1062
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
1063
+{
1064
+ struct rq_flags rf;
1065
+ struct rq *rq;
1066
+
1067
+ if (!rt_task(p))
1068
+ return;
1069
+
1070
+ /* Protect updates to p->uclamp_* */
1071
+ rq = task_rq_lock(p, &rf);
1072
+ __uclamp_update_util_min_rt_default(p);
1073
+ task_rq_unlock(rq, p, &rf);
1074
+}
1075
+
1076
+static void uclamp_sync_util_min_rt_default(void)
1077
+{
1078
+ struct task_struct *g, *p;
1079
+
1080
+ /*
1081
+ * copy_process() sysctl_uclamp
1082
+ * uclamp_min_rt = X;
1083
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
1084
+ * // link thread smp_mb__after_spinlock()
1085
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
1086
+ * sched_post_fork() for_each_process_thread()
1087
+ * __uclamp_sync_rt() __uclamp_sync_rt()
1088
+ *
1089
+ * Ensures that either sched_post_fork() will observe the new
1090
+ * uclamp_min_rt or for_each_process_thread() will observe the new
1091
+ * task.
1092
+ */
1093
+ read_lock(&tasklist_lock);
1094
+ smp_mb__after_spinlock();
1095
+ read_unlock(&tasklist_lock);
1096
+
1097
+ rcu_read_lock();
1098
+ for_each_process_thread(g, p)
1099
+ uclamp_update_util_min_rt_default(p);
1100
+ rcu_read_unlock();
1101
+}
1102
+
1103
+#if IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)
1104
+void rockchip_perf_uclamp_sync_util_min_rt_default(void)
1105
+{
1106
+ uclamp_sync_util_min_rt_default();
1107
+}
1108
+EXPORT_SYMBOL(rockchip_perf_uclamp_sync_util_min_rt_default);
1109
+#endif
1110
+
8951111 static inline struct uclamp_se
8961112 uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
8971113 {
1114
+ /* Copy by value as we could modify it */
8981115 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
8991116 #ifdef CONFIG_UCLAMP_TASK_GROUP
900
- struct uclamp_se uc_max;
1117
+ unsigned int tg_min, tg_max, value;
9011118
9021119 /*
9031120 * Tasks in autogroups or root task group will be
....@@ -908,9 +1125,11 @@
9081125 if (task_group(p) == &root_task_group)
9091126 return uc_req;
9101127
911
- uc_max = task_group(p)->uclamp[clamp_id];
912
- if (uc_req.value > uc_max.value || !uc_req.user_defined)
913
- return uc_max;
1128
+ tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1129
+ tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1130
+ value = uc_req.value;
1131
+ value = clamp(value, tg_min, tg_max);
1132
+ uclamp_se_set(&uc_req, value, false);
9141133 #endif
9151134
9161135 return uc_req;
....@@ -929,6 +1148,12 @@
9291148 {
9301149 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
9311150 struct uclamp_se uc_max = uclamp_default[clamp_id];
1151
+ struct uclamp_se uc_eff;
1152
+ int ret = 0;
1153
+
1154
+ trace_android_rvh_uclamp_eff_get(p, clamp_id, &uc_max, &uc_eff, &ret);
1155
+ if (ret)
1156
+ return uc_eff;
9321157
9331158 /* System default restrictions always apply */
9341159 if (unlikely(uc_req.value > uc_max.value))
....@@ -949,6 +1174,7 @@
9491174
9501175 return (unsigned long)uc_eff.value;
9511176 }
1177
+EXPORT_SYMBOL_GPL(uclamp_eff_value);
9521178
9531179 /*
9541180 * When a task is enqueued on a rq, the clamp bucket currently defined by the
....@@ -985,8 +1211,8 @@
9851211 if (bucket->tasks == 1 || uc_se->value > bucket->value)
9861212 bucket->value = uc_se->value;
9871213
988
- if (uc_se->value > READ_ONCE(uc_rq->value))
989
- WRITE_ONCE(uc_rq->value, uc_se->value);
1214
+ if (uc_se->value > uclamp_rq_get(rq, clamp_id))
1215
+ uclamp_rq_set(rq, clamp_id, uc_se->value);
9901216 }
9911217
9921218 /*
....@@ -1009,10 +1235,38 @@
10091235
10101236 lockdep_assert_held(&rq->lock);
10111237
1238
+ /*
1239
+ * If sched_uclamp_used was enabled after task @p was enqueued,
1240
+ * we could end up with unbalanced call to uclamp_rq_dec_id().
1241
+ *
1242
+ * In this case the uc_se->active flag should be false since no uclamp
1243
+ * accounting was performed at enqueue time and we can just return
1244
+ * here.
1245
+ *
1246
+ * Need to be careful of the following enqeueue/dequeue ordering
1247
+ * problem too
1248
+ *
1249
+ * enqueue(taskA)
1250
+ * // sched_uclamp_used gets enabled
1251
+ * enqueue(taskB)
1252
+ * dequeue(taskA)
1253
+ * // Must not decrement bucket->tasks here
1254
+ * dequeue(taskB)
1255
+ *
1256
+ * where we could end up with stale data in uc_se and
1257
+ * bucket[uc_se->bucket_id].
1258
+ *
1259
+ * The following check here eliminates the possibility of such race.
1260
+ */
1261
+ if (unlikely(!uc_se->active))
1262
+ return;
1263
+
10121264 bucket = &uc_rq->bucket[uc_se->bucket_id];
1265
+
10131266 SCHED_WARN_ON(!bucket->tasks);
10141267 if (likely(bucket->tasks))
10151268 bucket->tasks--;
1269
+
10161270 uc_se->active = false;
10171271
10181272 /*
....@@ -1024,7 +1278,7 @@
10241278 if (likely(bucket->tasks))
10251279 return;
10261280
1027
- rq_clamp = READ_ONCE(uc_rq->value);
1281
+ rq_clamp = uclamp_rq_get(rq, clamp_id);
10281282 /*
10291283 * Defensive programming: this should never happen. If it happens,
10301284 * e.g. due to future modification, warn and fixup the expected value.
....@@ -1032,13 +1286,22 @@
10321286 SCHED_WARN_ON(bucket->value > rq_clamp);
10331287 if (bucket->value >= rq_clamp) {
10341288 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
1035
- WRITE_ONCE(uc_rq->value, bkt_clamp);
1289
+ uclamp_rq_set(rq, clamp_id, bkt_clamp);
10361290 }
10371291 }
10381292
10391293 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
10401294 {
10411295 enum uclamp_id clamp_id;
1296
+
1297
+ /*
1298
+ * Avoid any overhead until uclamp is actually used by the userspace.
1299
+ *
1300
+ * The condition is constructed such that a NOP is generated when
1301
+ * sched_uclamp_used is disabled.
1302
+ */
1303
+ if (!static_branch_unlikely(&sched_uclamp_used))
1304
+ return;
10421305
10431306 if (unlikely(!p->sched_class->uclamp_enabled))
10441307 return;
....@@ -1055,6 +1318,15 @@
10551318 {
10561319 enum uclamp_id clamp_id;
10571320
1321
+ /*
1322
+ * Avoid any overhead until uclamp is actually used by the userspace.
1323
+ *
1324
+ * The condition is constructed such that a NOP is generated when
1325
+ * sched_uclamp_used is disabled.
1326
+ */
1327
+ if (!static_branch_unlikely(&sched_uclamp_used))
1328
+ return;
1329
+
10581330 if (unlikely(!p->sched_class->uclamp_enabled))
10591331 return;
10601332
....@@ -1062,9 +1334,27 @@
10621334 uclamp_rq_dec_id(rq, p, clamp_id);
10631335 }
10641336
1065
-static inline void
1066
-uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
1337
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
1338
+ enum uclamp_id clamp_id)
10671339 {
1340
+ if (!p->uclamp[clamp_id].active)
1341
+ return;
1342
+
1343
+ uclamp_rq_dec_id(rq, p, clamp_id);
1344
+ uclamp_rq_inc_id(rq, p, clamp_id);
1345
+
1346
+ /*
1347
+ * Make sure to clear the idle flag if we've transiently reached 0
1348
+ * active tasks on rq.
1349
+ */
1350
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1351
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1352
+}
1353
+
1354
+static inline void
1355
+uclamp_update_active(struct task_struct *p)
1356
+{
1357
+ enum uclamp_id clamp_id;
10681358 struct rq_flags rf;
10691359 struct rq *rq;
10701360
....@@ -1084,30 +1374,22 @@
10841374 * affecting a valid clamp bucket, the next time it's enqueued,
10851375 * it will already see the updated clamp bucket value.
10861376 */
1087
- if (p->uclamp[clamp_id].active) {
1088
- uclamp_rq_dec_id(rq, p, clamp_id);
1089
- uclamp_rq_inc_id(rq, p, clamp_id);
1090
- }
1377
+ for_each_clamp_id(clamp_id)
1378
+ uclamp_rq_reinc_id(rq, p, clamp_id);
10911379
10921380 task_rq_unlock(rq, p, &rf);
10931381 }
10941382
10951383 #ifdef CONFIG_UCLAMP_TASK_GROUP
10961384 static inline void
1097
-uclamp_update_active_tasks(struct cgroup_subsys_state *css,
1098
- unsigned int clamps)
1385
+uclamp_update_active_tasks(struct cgroup_subsys_state *css)
10991386 {
1100
- enum uclamp_id clamp_id;
11011387 struct css_task_iter it;
11021388 struct task_struct *p;
11031389
11041390 css_task_iter_start(css, 0, &it);
1105
- while ((p = css_task_iter_next(&it))) {
1106
- for_each_clamp_id(clamp_id) {
1107
- if ((0x1 << clamp_id) & clamps)
1108
- uclamp_update_active(p, clamp_id);
1109
- }
1110
- }
1391
+ while ((p = css_task_iter_next(&it)))
1392
+ uclamp_update_active(p);
11111393 css_task_iter_end(&it);
11121394 }
11131395
....@@ -1130,16 +1412,16 @@
11301412 #endif
11311413
11321414 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1133
- void __user *buffer, size_t *lenp,
1134
- loff_t *ppos)
1415
+ void *buffer, size_t *lenp, loff_t *ppos)
11351416 {
11361417 bool update_root_tg = false;
1137
- int old_min, old_max;
1418
+ int old_min, old_max, old_min_rt;
11381419 int result;
11391420
11401421 mutex_lock(&uclamp_mutex);
11411422 old_min = sysctl_sched_uclamp_util_min;
11421423 old_max = sysctl_sched_uclamp_util_max;
1424
+ old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
11431425
11441426 result = proc_dointvec(table, write, buffer, lenp, ppos);
11451427 if (result)
....@@ -1148,7 +1430,9 @@
11481430 goto done;
11491431
11501432 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1151
- sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
1433
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1434
+ sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1435
+
11521436 result = -EINVAL;
11531437 goto undo;
11541438 }
....@@ -1164,8 +1448,15 @@
11641448 update_root_tg = true;
11651449 }
11661450
1167
- if (update_root_tg)
1451
+ if (update_root_tg) {
1452
+ static_branch_enable(&sched_uclamp_used);
11681453 uclamp_update_root_tg();
1454
+ }
1455
+
1456
+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1457
+ static_branch_enable(&sched_uclamp_used);
1458
+ uclamp_sync_util_min_rt_default();
1459
+ }
11691460
11701461 /*
11711462 * We update all RUNNABLE tasks only when task groups are in use.
....@@ -1178,6 +1469,7 @@
11781469 undo:
11791470 sysctl_sched_uclamp_util_min = old_min;
11801471 sysctl_sched_uclamp_util_max = old_max;
1472
+ sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
11811473 done:
11821474 mutex_unlock(&uclamp_mutex);
11831475
....@@ -1187,20 +1479,61 @@
11871479 static int uclamp_validate(struct task_struct *p,
11881480 const struct sched_attr *attr)
11891481 {
1190
- unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1191
- unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1482
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
1483
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
11921484
1193
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1194
- lower_bound = attr->sched_util_min;
1195
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1196
- upper_bound = attr->sched_util_max;
1485
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1486
+ util_min = attr->sched_util_min;
11971487
1198
- if (lower_bound > upper_bound)
1488
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1489
+ return -EINVAL;
1490
+ }
1491
+
1492
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1493
+ util_max = attr->sched_util_max;
1494
+
1495
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1496
+ return -EINVAL;
1497
+ }
1498
+
1499
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
11991500 return -EINVAL;
1200
- if (upper_bound > SCHED_CAPACITY_SCALE)
1201
- return -EINVAL;
1501
+
1502
+ /*
1503
+ * We have valid uclamp attributes; make sure uclamp is enabled.
1504
+ *
1505
+ * We need to do that here, because enabling static branches is a
1506
+ * blocking operation which obviously cannot be done while holding
1507
+ * scheduler locks.
1508
+ */
1509
+ static_branch_enable(&sched_uclamp_used);
12021510
12031511 return 0;
1512
+}
1513
+
1514
+static bool uclamp_reset(const struct sched_attr *attr,
1515
+ enum uclamp_id clamp_id,
1516
+ struct uclamp_se *uc_se)
1517
+{
1518
+ /* Reset on sched class change for a non user-defined clamp value. */
1519
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1520
+ !uc_se->user_defined)
1521
+ return true;
1522
+
1523
+ /* Reset on sched_util_{min,max} == -1. */
1524
+ if (clamp_id == UCLAMP_MIN &&
1525
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1526
+ attr->sched_util_min == -1) {
1527
+ return true;
1528
+ }
1529
+
1530
+ if (clamp_id == UCLAMP_MAX &&
1531
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1532
+ attr->sched_util_max == -1) {
1533
+ return true;
1534
+ }
1535
+
1536
+ return false;
12041537 }
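
For reference, a hedged sketch of what the sched_util_{min,max} == -1 "reset" handled by uclamp_reset() above looks like from userspace (not part of the patch; the wrapper name is hypothetical, and the raw syscall is used since glibc does not wrap sched_setattr()):

#include <linux/sched.h>		/* SCHED_FLAG_* */
#include <linux/sched/types.h>		/* struct sched_attr */
#include <sys/syscall.h>
#include <unistd.h>

/* Reset a task's UCLAMP_MIN request back to the system default. */
static int example_reset_uclamp_min(pid_t pid)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_flags	= SCHED_FLAG_KEEP_ALL | SCHED_FLAG_UTIL_CLAMP_MIN,
		.sched_util_min	= -1,	/* treated as "reset" by uclamp_reset() */
	};

	return syscall(SYS_sched_setattr, pid, &attr, 0);
}
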
12051538
12061539 static void __setscheduler_uclamp(struct task_struct *p,
....@@ -1208,40 +1541,41 @@
12081541 {
12091542 enum uclamp_id clamp_id;
12101543
1211
- /*
1212
- * On scheduling class change, reset to default clamps for tasks
1213
- * without a task-specific value.
1214
- */
12151544 for_each_clamp_id(clamp_id) {
12161545 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1217
- unsigned int clamp_value = uclamp_none(clamp_id);
1546
+ unsigned int value;
12181547
1219
- /* Keep using defined clamps across class changes */
1220
- if (uc_se->user_defined)
1548
+ if (!uclamp_reset(attr, clamp_id, uc_se))
12211549 continue;
12221550
1223
- /* By default, RT tasks always get 100% boost */
1224
- if (sched_feat(SUGOV_RT_MAX_FREQ) &&
1225
- unlikely(rt_task(p) &&
1226
- clamp_id == UCLAMP_MIN)) {
1551
+ /*
1552
+ * RT by default have a 100% boost value that could be modified
1553
+ * at runtime.
1554
+ */
1555
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1556
+ value = sysctl_sched_uclamp_util_min_rt_default;
1557
+ else
1558
+ value = uclamp_none(clamp_id);
12271559
1228
- clamp_value = uclamp_none(UCLAMP_MAX);
1229
- }
1560
+ uclamp_se_set(uc_se, value, false);
12301561
1231
- uclamp_se_set(uc_se, clamp_value, false);
12321562 }
12331563
12341564 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
12351565 return;
12361566
1237
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1567
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1568
+ attr->sched_util_min != -1) {
12381569 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
12391570 attr->sched_util_min, true);
1571
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MIN, attr->sched_util_min);
12401572 }
12411573
1242
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1574
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1575
+ attr->sched_util_max != -1) {
12431576 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
12441577 attr->sched_util_max, true);
1578
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max);
12451579 }
12461580 }
12471581
....@@ -1249,6 +1583,10 @@
12491583 {
12501584 enum uclamp_id clamp_id;
12511585
1586
+ /*
1587
+ * We don't need to hold task_rq_lock() when updating p->uclamp_* here
1588
+ * as the task is still at its early fork stages.
1589
+ */
12521590 for_each_clamp_id(clamp_id)
12531591 p->uclamp[clamp_id].active = false;
12541592
....@@ -1261,39 +1599,24 @@
12611599 }
12621600 }
12631601
1264
-#ifdef CONFIG_SMP
1265
-unsigned int uclamp_task(struct task_struct *p)
1602
+static void uclamp_post_fork(struct task_struct *p)
12661603 {
1267
- unsigned long util;
1268
-
1269
- util = task_util_est(p);
1270
- util = max(util, uclamp_eff_value(p, UCLAMP_MIN));
1271
- util = min(util, uclamp_eff_value(p, UCLAMP_MAX));
1272
-
1273
- return util;
1604
+ uclamp_update_util_min_rt_default(p);
12741605 }
12751606
1276
-bool uclamp_boosted(struct task_struct *p)
1607
+static void __init init_uclamp_rq(struct rq *rq)
12771608 {
1278
- return uclamp_eff_value(p, UCLAMP_MIN) > 0;
1609
+ enum uclamp_id clamp_id;
1610
+ struct uclamp_rq *uc_rq = rq->uclamp;
1611
+
1612
+ for_each_clamp_id(clamp_id) {
1613
+ uc_rq[clamp_id] = (struct uclamp_rq) {
1614
+ .value = uclamp_none(clamp_id)
1615
+ };
1616
+ }
1617
+
1618
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
12791619 }
1280
-
1281
-bool uclamp_latency_sensitive(struct task_struct *p)
1282
-{
1283
-#ifdef CONFIG_UCLAMP_TASK_GROUP
1284
- struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id);
1285
- struct task_group *tg;
1286
-
1287
- if (!css)
1288
- return false;
1289
- tg = container_of(css, struct task_group, css);
1290
-
1291
- return tg->latency_sensitive;
1292
-#else
1293
- return false;
1294
-#endif
1295
-}
1296
-#endif /* CONFIG_SMP */
12971620
12981621 static void __init init_uclamp(void)
12991622 {
....@@ -1301,13 +1624,8 @@
13011624 enum uclamp_id clamp_id;
13021625 int cpu;
13031626
1304
- mutex_init(&uclamp_mutex);
1305
-
1306
- for_each_possible_cpu(cpu) {
1307
- memset(&cpu_rq(cpu)->uclamp, 0,
1308
- sizeof(struct uclamp_rq)*UCLAMP_CNT);
1309
- cpu_rq(cpu)->uclamp_flags = 0;
1310
- }
1627
+ for_each_possible_cpu(cpu)
1628
+ init_uclamp_rq(cpu_rq(cpu));
13111629
13121630 for_each_clamp_id(clamp_id) {
13131631 uclamp_se_set(&init_task.uclamp_req[clamp_id],
....@@ -1336,41 +1654,7 @@
13361654 static void __setscheduler_uclamp(struct task_struct *p,
13371655 const struct sched_attr *attr) { }
13381656 static inline void uclamp_fork(struct task_struct *p) { }
1339
-
1340
-long schedtune_task_margin(struct task_struct *task);
1341
-
1342
-#ifdef CONFIG_SMP
1343
-unsigned int uclamp_task(struct task_struct *p)
1344
-{
1345
- unsigned long util = task_util_est(p);
1346
-#ifdef CONFIG_SCHED_TUNE
1347
- long margin = schedtune_task_margin(p);
1348
-
1349
- trace_sched_boost_task(p, util, margin);
1350
-
1351
- util += margin;
1352
-#endif
1353
-
1354
- return util;
1355
-}
1356
-
1357
-bool uclamp_boosted(struct task_struct *p)
1358
-{
1359
-#ifdef CONFIG_SCHED_TUNE
1360
- return schedtune_task_boost(p) > 0;
1361
-#endif
1362
- return false;
1363
-}
1364
-
1365
-bool uclamp_latency_sensitive(struct task_struct *p)
1366
-{
1367
-#ifdef CONFIG_SCHED_TUNE
1368
- return schedtune_prefer_idle(p) != 0;
1369
-#endif
1370
- return false;
1371
-}
1372
-#endif /* CONFIG_SMP */
1373
-
1657
+static inline void uclamp_post_fork(struct task_struct *p) { }
13741658 static inline void init_uclamp(void) { }
13751659 #endif /* CONFIG_UCLAMP_TASK */
13761660
....@@ -1385,7 +1669,9 @@
13851669 }
13861670
13871671 uclamp_rq_inc(rq, p);
1672
+ trace_android_rvh_enqueue_task(rq, p, flags);
13881673 p->sched_class->enqueue_task(rq, p, flags);
1674
+ trace_android_rvh_after_enqueue_task(rq, p);
13891675 }
13901676
13911677 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
....@@ -1399,31 +1685,42 @@
13991685 }
14001686
14011687 uclamp_rq_dec(rq, p);
1688
+ trace_android_rvh_dequeue_task(rq, p, flags);
14021689 p->sched_class->dequeue_task(rq, p, flags);
1690
+ trace_android_rvh_after_dequeue_task(rq, p);
14031691 }
14041692
14051693 void activate_task(struct rq *rq, struct task_struct *p, int flags)
14061694 {
1407
- if (task_contributes_to_load(p))
1408
- rq->nr_uninterruptible--;
1695
+ if (task_on_rq_migrating(p))
1696
+ flags |= ENQUEUE_MIGRATED;
14091697
14101698 enqueue_task(rq, p, flags);
1699
+
1700
+ p->on_rq = TASK_ON_RQ_QUEUED;
14111701 }
1702
+EXPORT_SYMBOL_GPL(activate_task);
14121703
14131704 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
14141705 {
1415
- if (task_contributes_to_load(p))
1416
- rq->nr_uninterruptible++;
1706
+ p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
14171707
14181708 dequeue_task(rq, p, flags);
14191709 }
1710
+EXPORT_SYMBOL_GPL(deactivate_task);
14201711
1421
-/*
1422
- * __normal_prio - return the priority that is based on the static prio
1423
- */
1424
-static inline int __normal_prio(struct task_struct *p)
1712
+static inline int __normal_prio(int policy, int rt_prio, int nice)
14251713 {
1426
- return p->static_prio;
1714
+ int prio;
1715
+
1716
+ if (dl_policy(policy))
1717
+ prio = MAX_DL_PRIO - 1;
1718
+ else if (rt_policy(policy))
1719
+ prio = MAX_RT_PRIO - 1 - rt_prio;
1720
+ else
1721
+ prio = NICE_TO_PRIO(nice);
1722
+
1723
+ return prio;
14271724 }
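
Worked values for the priority mapping above (not part of the patch), using the usual constants MAX_RT_PRIO = 100 and MAX_DL_PRIO = 0; a lower number means a higher effective priority:

/*
 *   SCHED_NORMAL, nice   0:  prio = NICE_TO_PRIO(0)      = 120
 *   SCHED_NORMAL, nice -20:  prio = NICE_TO_PRIO(-20)    = 100
 *   SCHED_FIFO,  rt_prio 50: prio = MAX_RT_PRIO - 1 - 50 = 49
 *   SCHED_DEADLINE:          prio = MAX_DL_PRIO - 1      = -1
 */
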
14281725
14291726 /*
....@@ -1435,15 +1732,7 @@
14351732 */
14361733 static inline int normal_prio(struct task_struct *p)
14371734 {
1438
- int prio;
1439
-
1440
- if (task_has_dl_policy(p))
1441
- prio = MAX_DL_PRIO-1;
1442
- else if (task_has_rt_policy(p))
1443
- prio = MAX_RT_PRIO-1 - p->rt_priority;
1444
- else
1445
- prio = __normal_prio(p);
1446
- return prio;
1735
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
14471736 }
14481737
14491738 /*
....@@ -1499,20 +1788,10 @@
14991788
15001789 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
15011790 {
1502
- const struct sched_class *class;
1503
-
1504
- if (p->sched_class == rq->curr->sched_class) {
1791
+ if (p->sched_class == rq->curr->sched_class)
15051792 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1506
- } else {
1507
- for_each_class(class) {
1508
- if (class == rq->curr->sched_class)
1509
- break;
1510
- if (class == p->sched_class) {
1511
- resched_curr(rq);
1512
- break;
1513
- }
1514
- }
1515
- }
1793
+ else if (p->sched_class > rq->curr->sched_class)
1794
+ resched_curr(rq);
15161795
15171796 /*
15181797 * A queue event has occurred, and we're going to schedule. In
....@@ -1521,22 +1800,12 @@
15211800 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
15221801 rq_clock_skip_update(rq);
15231802 }
1803
+EXPORT_SYMBOL_GPL(check_preempt_curr);
15241804
15251805 #ifdef CONFIG_SMP
15261806
1527
-static inline bool is_per_cpu_kthread(struct task_struct *p)
1528
-{
1529
- if (!(p->flags & PF_KTHREAD))
1530
- return false;
1531
-
1532
- if (p->nr_cpus_allowed != 1)
1533
- return false;
1534
-
1535
- return true;
1536
-}
1537
-
15381807 /*
1539
- * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
1808
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
15401809 * __set_cpus_allowed_ptr() and select_fallback_rq().
15411810 */
15421811 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
....@@ -1544,10 +1813,13 @@
15441813 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
15451814 return false;
15461815
1547
- if (is_per_cpu_kthread(p) || __migrate_disabled(p))
1816
+ if (is_per_cpu_kthread(p))
15481817 return cpu_online(cpu);
15491818
1550
- return cpu_active(cpu);
1819
+ if (!cpu_active(cpu))
1820
+ return false;
1821
+
1822
+ return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
15511823 }
15521824
15531825 /*
....@@ -1572,19 +1844,29 @@
15721844 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
15731845 struct task_struct *p, int new_cpu)
15741846 {
1847
+ int detached = 0;
1848
+
15751849 lockdep_assert_held(&rq->lock);
15761850
1577
- WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
1578
- dequeue_task(rq, p, DEQUEUE_NOCLOCK);
1579
- set_task_cpu(p, new_cpu);
1580
- rq_unlock(rq, rf);
1851
+ /*
1852
+ * The vendor hook may drop the lock temporarily, so
1853
+ * pass the rq flags to unpin lock. We expect the
1854
+ * rq lock to be held after return.
1855
+ */
1856
+ trace_android_rvh_migrate_queued_task(rq, rf, p, new_cpu, &detached);
1857
+ if (detached)
1858
+ goto attach;
15811859
1860
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
1861
+ set_task_cpu(p, new_cpu);
1862
+
1863
+attach:
1864
+ rq_unlock(rq, rf);
15821865 rq = cpu_rq(new_cpu);
15831866
15841867 rq_lock(rq, rf);
15851868 BUG_ON(task_cpu(p) != new_cpu);
1586
- enqueue_task(rq, p, 0);
1587
- p->on_rq = TASK_ON_RQ_QUEUED;
1869
+ activate_task(rq, p, 0);
15881870 check_preempt_curr(rq, p, 0);
15891871
15901872 return rq;
....@@ -1593,7 +1875,6 @@
15931875 struct migration_arg {
15941876 struct task_struct *task;
15951877 int dest_cpu;
1596
- bool done;
15971878 };
15981879
15991880 /*
....@@ -1629,11 +1910,6 @@
16291910 struct task_struct *p = arg->task;
16301911 struct rq *rq = this_rq();
16311912 struct rq_flags rf;
1632
- int dest_cpu = arg->dest_cpu;
1633
-
1634
- /* We don't look at arg after this point. */
1635
- smp_mb();
1636
- arg->done = true;
16371913
16381914 /*
16391915 * The original target CPU might have gone down and we might
....@@ -1645,7 +1921,7 @@
16451921 * __migrate_task() such that we will not miss enforcing cpus_ptr
16461922 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
16471923 */
1648
- sched_ttwu_pending();
1924
+ flush_smp_call_function_from_idle();
16491925
16501926 raw_spin_lock(&p->pi_lock);
16511927 rq_lock(rq, &rf);
....@@ -1656,9 +1932,9 @@
16561932 */
16571933 if (task_rq(p) == rq) {
16581934 if (task_on_rq_queued(p))
1659
- rq = __migrate_task(rq, &rf, p, dest_cpu);
1935
+ rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
16601936 else
1661
- p->wake_cpu = dest_cpu;
1937
+ p->wake_cpu = arg->dest_cpu;
16621938 }
16631939 rq_unlock(rq, &rf);
16641940 raw_spin_unlock(&p->pi_lock);
....@@ -1674,17 +1950,9 @@
16741950 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
16751951 {
16761952 cpumask_copy(&p->cpus_mask, new_mask);
1677
- if (p->cpus_ptr == &p->cpus_mask)
1678
- p->nr_cpus_allowed = cpumask_weight(new_mask);
1953
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
1954
+ trace_android_rvh_set_cpus_allowed_comm(p, new_mask);
16791955 }
1680
-
1681
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
1682
-int __migrate_disabled(struct task_struct *p)
1683
-{
1684
- return p->migrate_disable;
1685
-}
1686
-EXPORT_SYMBOL_GPL(__migrate_disabled);
1687
-#endif
16881956
16891957 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
16901958 {
....@@ -1712,28 +1980,23 @@
17121980 if (queued)
17131981 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
17141982 if (running)
1715
- set_curr_task(rq, p);
1983
+ set_next_task(rq, p);
17161984 }
17171985
17181986 /*
1719
- * Change a given task's CPU affinity. Migrate the thread to a
1720
- * proper CPU and schedule it away if the CPU it's executing on
1721
- * is removed from the allowed bitmask.
1722
- *
1723
- * NOTE: the caller must have a valid reference to the task, the
1724
- * task must not exit() & deallocate itself prematurely. The
1725
- * call is not atomic; no spinlocks may be held.
1987
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
17261988 */
1727
-static int __set_cpus_allowed_ptr(struct task_struct *p,
1728
- const struct cpumask *new_mask, bool check)
1989
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
1990
+ const struct cpumask *new_mask,
1991
+ bool check,
1992
+ struct rq *rq,
1993
+ struct rq_flags *rf)
17291994 {
17301995 const struct cpumask *cpu_valid_mask = cpu_active_mask;
1996
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
17311997 unsigned int dest_cpu;
1732
- struct rq_flags rf;
1733
- struct rq *rq;
17341998 int ret = 0;
17351999
1736
- rq = task_rq_lock(p, &rf);
17372000 update_rq_clock(rq);
17382001
17392002 if (p->flags & PF_KTHREAD) {
....@@ -1741,6 +2004,9 @@
17412004 * Kernel threads are allowed on online && !active CPUs
17422005 */
17432006 cpu_valid_mask = cpu_online_mask;
2007
+ } else if (!cpumask_subset(new_mask, cpu_allowed_mask)) {
2008
+ ret = -EINVAL;
2009
+ goto out;
17442010 }
17452011
17462012 /*
....@@ -1755,7 +2021,12 @@
17552021 if (cpumask_equal(&p->cpus_mask, new_mask))
17562022 goto out;
17572023
1758
- dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
2024
+ /*
2025
+ * Picking a ~random cpu helps in cases where we are changing affinity
2026
+ * for groups of tasks (ie. cpuset), so that load balancing is not
2027
+ * immediately required to distribute the tasks within their new mask.
2028
+ */
2029
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
17592030 if (dest_cpu >= nr_cpu_ids) {
17602031 ret = -EINVAL;
17612032 goto out;
....@@ -1774,28 +2045,45 @@
17742045 }
17752046
17762047 /* Can the task run on the task's current CPU? If so, we're done */
1777
- if (cpumask_test_cpu(task_cpu(p), new_mask) ||
1778
- p->cpus_ptr != &p->cpus_mask)
2048
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
17792049 goto out;
17802050
17812051 if (task_running(rq, p) || p->state == TASK_WAKING) {
17822052 struct migration_arg arg = { p, dest_cpu };
17832053 /* Need help from migration thread: drop lock and wait. */
1784
- task_rq_unlock(rq, p, &rf);
2054
+ task_rq_unlock(rq, p, rf);
17852055 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1786
- tlb_migrate_finish(p->mm);
17872056 return 0;
17882057 } else if (task_on_rq_queued(p)) {
17892058 /*
17902059 * OK, since we're going to drop the lock immediately
17912060 * afterwards anyway.
17922061 */
1793
- rq = move_queued_task(rq, &rf, p, dest_cpu);
2062
+ rq = move_queued_task(rq, rf, p, dest_cpu);
17942063 }
17952064 out:
1796
- task_rq_unlock(rq, p, &rf);
2065
+ task_rq_unlock(rq, p, rf);
17972066
17982067 return ret;
2068
+}
2069
+
2070
+/*
2071
+ * Change a given task's CPU affinity. Migrate the thread to a
2072
+ * proper CPU and schedule it away if the CPU it's executing on
2073
+ * is removed from the allowed bitmask.
2074
+ *
2075
+ * NOTE: the caller must have a valid reference to the task, the
2076
+ * task must not exit() & deallocate itself prematurely. The
2077
+ * call is not atomic; no spinlocks may be held.
2078
+ */
2079
+static int __set_cpus_allowed_ptr(struct task_struct *p,
2080
+ const struct cpumask *new_mask, bool check)
2081
+{
2082
+ struct rq_flags rf;
2083
+ struct rq *rq;
2084
+
2085
+ rq = task_rq_lock(p, &rf);
2086
+ return __set_cpus_allowed_ptr_locked(p, new_mask, check, rq, &rf);
17992087 }
18002088
18012089 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
....@@ -1803,6 +2091,74 @@
18032091 return __set_cpus_allowed_ptr(p, new_mask, false);
18042092 }
18052093 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
2094
+
2095
+/*
2096
+ * Change a given task's CPU affinity to the intersection of its current
2097
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
2098
+ * If the resulting mask is empty, leave the affinity unchanged and return
2099
+ * -EINVAL.
2100
+ */
2101
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
2102
+ struct cpumask *new_mask,
2103
+ const struct cpumask *subset_mask)
2104
+{
2105
+ struct rq_flags rf;
2106
+ struct rq *rq;
2107
+
2108
+ rq = task_rq_lock(p, &rf);
2109
+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
2110
+ task_rq_unlock(rq, p, &rf);
2111
+ return -EINVAL;
2112
+ }
2113
+
2114
+ return __set_cpus_allowed_ptr_locked(p, new_mask, false, rq, &rf);
2115
+}
2116
+
2117
+/*
2118
+ * Restrict a given task's CPU affinity so that it is a subset of
2119
+ * task_cpu_possible_mask(). If the resulting mask is empty, we warn and
2120
+ * walk up the cpuset hierarchy until we find a suitable mask.
2121
+ */
2122
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
2123
+{
2124
+ cpumask_var_t new_mask;
2125
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
2126
+
2127
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
2128
+
2129
+ /*
2130
+ * __migrate_task() can fail silently in the face of concurrent
2131
+ * offlining of the chosen destination CPU, so take the hotplug
2132
+ * lock to ensure that the migration succeeds.
2133
+ */
2134
+ trace_android_rvh_force_compatible_pre(NULL);
2135
+ cpus_read_lock();
2136
+ if (!cpumask_available(new_mask))
2137
+ goto out_set_mask;
2138
+
2139
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
2140
+ goto out_free_mask;
2141
+
2142
+ /*
2143
+ * We failed to find a valid subset of the affinity mask for the
2144
+ * task, so override it based on its cpuset hierarchy.
2145
+ */
2146
+ cpuset_cpus_allowed(p, new_mask);
2147
+ override_mask = new_mask;
2148
+
2149
+out_set_mask:
2150
+ if (printk_ratelimit()) {
2151
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
2152
+ task_pid_nr(p), p->comm,
2153
+ cpumask_pr_args(override_mask));
2154
+ }
2155
+
2156
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
2157
+out_free_mask:
2158
+ cpus_read_unlock();
2159
+ trace_android_rvh_force_compatible_post(NULL);
2160
+ free_cpumask_var(new_mask);
2161
+}
18062162
18072163 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
18082164 {
....@@ -1851,12 +2207,13 @@
18512207 p->se.nr_migrations++;
18522208 rseq_migrate(p);
18532209 perf_event_task_migrate(p);
2210
+ trace_android_rvh_set_task_cpu(p, new_cpu);
18542211 }
18552212
18562213 __set_task_cpu(p, new_cpu);
18572214 }
2215
+EXPORT_SYMBOL_GPL(set_task_cpu);
18582216
1859
-#ifdef CONFIG_NUMA_BALANCING
18602217 static void __migrate_swap_task(struct task_struct *p, int cpu)
18612218 {
18622219 if (task_on_rq_queued(p)) {
....@@ -1869,11 +2226,9 @@
18692226 rq_pin_lock(src_rq, &srf);
18702227 rq_pin_lock(dst_rq, &drf);
18712228
1872
- p->on_rq = TASK_ON_RQ_MIGRATING;
18732229 deactivate_task(src_rq, p, 0);
18742230 set_task_cpu(p, cpu);
18752231 activate_task(dst_rq, p, 0);
1876
- p->on_rq = TASK_ON_RQ_QUEUED;
18772232 check_preempt_curr(dst_rq, p, 0);
18782233
18792234 rq_unpin_lock(dst_rq, &drf);
....@@ -1973,19 +2328,7 @@
19732328 out:
19742329 return ret;
19752330 }
1976
-#endif /* CONFIG_NUMA_BALANCING */
1977
-
1978
-static bool check_task_state(struct task_struct *p, long match_state)
1979
-{
1980
- bool match = false;
1981
-
1982
- raw_spin_lock_irq(&p->pi_lock);
1983
- if (p->state == match_state || p->saved_state == match_state)
1984
- match = true;
1985
- raw_spin_unlock_irq(&p->pi_lock);
1986
-
1987
- return match;
1988
-}
2331
+EXPORT_SYMBOL_GPL(migrate_swap);
19892332
19902333 /*
19912334 * wait_task_inactive - wait for a thread to unschedule.
....@@ -2031,7 +2374,7 @@
20312374 * is actually now running somewhere else!
20322375 */
20332376 while (task_running(rq, p)) {
2034
- if (match_state && !check_task_state(p, match_state))
2377
+ if (match_state && unlikely(p->state != match_state))
20352378 return 0;
20362379 cpu_relax();
20372380 }
....@@ -2046,8 +2389,7 @@
20462389 running = task_running(rq, p);
20472390 queued = task_on_rq_queued(p);
20482391 ncsw = 0;
2049
- if (!match_state || p->state == match_state ||
2050
- p->saved_state == match_state)
2392
+ if (!match_state || p->state == match_state)
20512393 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
20522394 task_rq_unlock(rq, p, &rf);
20532395
....@@ -2148,7 +2490,11 @@
21482490 int nid = cpu_to_node(cpu);
21492491 const struct cpumask *nodemask = NULL;
21502492 enum { cpuset, possible, fail } state = cpuset;
2151
- int dest_cpu;
2493
+ int dest_cpu = -1;
2494
+
2495
+ trace_android_rvh_select_fallback_rq(cpu, p, &dest_cpu);
2496
+ if (dest_cpu >= 0)
2497
+ return dest_cpu;
21522498
21532499 /*
21542500 * If the node that the CPU is on has been offlined, cpu_to_node()
....@@ -2160,9 +2506,7 @@
21602506
21612507 /* Look for allowed, online CPU in same node. */
21622508 for_each_cpu(dest_cpu, nodemask) {
2163
- if (!cpu_active(dest_cpu))
2164
- continue;
2165
- if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
2509
+ if (is_cpu_allowed(p, dest_cpu))
21662510 return dest_cpu;
21672511 }
21682512 }
....@@ -2184,12 +2528,11 @@
21842528 state = possible;
21852529 break;
21862530 }
2187
- /* Fall-through */
2531
+ fallthrough;
21882532 case possible:
2189
- do_set_cpus_allowed(p, cpu_possible_mask);
2533
+ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
21902534 state = fail;
21912535 break;
2192
-
21932536 case fail:
21942537 BUG();
21952538 break;
....@@ -2216,14 +2559,12 @@
22162559 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
22172560 */
22182561 static inline
2219
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
2220
- int sibling_count_hint)
2562
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
22212563 {
22222564 lockdep_assert_held(&p->pi_lock);
22232565
22242566 if (p->nr_cpus_allowed > 1)
2225
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
2226
- sibling_count_hint);
2567
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
22272568 else
22282569 cpu = cpumask_any(p->cpus_ptr);
22292570
....@@ -2241,12 +2582,6 @@
22412582 cpu = select_fallback_rq(task_cpu(p), p);
22422583
22432584 return cpu;
2244
-}
2245
-
2246
-static void update_avg(u64 *avg, u64 sample)
2247
-{
2248
- s64 diff = sample - *avg;
2249
- *avg += diff >> 3;
22502585 }
22512586
22522587 void sched_set_stop_task(int cpu, struct task_struct *stop)
....@@ -2328,12 +2663,6 @@
23282663 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
23292664 }
23302665
2331
-static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2332
-{
2333
- activate_task(rq, p, en_flags);
2334
- p->on_rq = TASK_ON_RQ_QUEUED;
2335
-}
2336
-
23372666 /*
23382667 * Mark the task runnable and perform wakeup-preemption.
23392668 */
....@@ -2375,27 +2704,54 @@
23752704 {
23762705 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
23772706
2707
+ if (wake_flags & WF_SYNC)
2708
+ en_flags |= ENQUEUE_WAKEUP_SYNC;
2709
+
23782710 lockdep_assert_held(&rq->lock);
23792711
2380
-#ifdef CONFIG_SMP
23812712 if (p->sched_contributes_to_load)
23822713 rq->nr_uninterruptible--;
23832714
2715
+#ifdef CONFIG_SMP
23842716 if (wake_flags & WF_MIGRATED)
23852717 en_flags |= ENQUEUE_MIGRATED;
2718
+ else
23862719 #endif
2720
+ if (p->in_iowait) {
2721
+ delayacct_blkio_end(p);
2722
+ atomic_dec(&task_rq(p)->nr_iowait);
2723
+ }
23872724
2388
- ttwu_activate(rq, p, en_flags);
2725
+ activate_task(rq, p, en_flags);
23892726 ttwu_do_wakeup(rq, p, wake_flags, rf);
23902727 }
23912728
23922729 /*
2393
- * Called in case the task @p isn't fully descheduled from its runqueue,
2394
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2395
- * since all we need to do is flip p->state to TASK_RUNNING, since
2396
- * the task is still ->on_rq.
2730
+ * Consider @p being inside a wait loop:
2731
+ *
2732
+ * for (;;) {
2733
+ * set_current_state(TASK_UNINTERRUPTIBLE);
2734
+ *
2735
+ * if (CONDITION)
2736
+ * break;
2737
+ *
2738
+ * schedule();
2739
+ * }
2740
+ * __set_current_state(TASK_RUNNING);
2741
+ *
2742
+ * between set_current_state() and schedule(). In this case @p is still
2743
+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
2744
+ * an atomic manner.
2745
+ *
2746
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
2747
+ * then schedule() must still happen and p->state can be changed to
2748
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
2749
+ * need to do a full wakeup with enqueue.
2750
+ *
2751
+ * Returns: %true when the wakeup is done,
2752
+ * %false otherwise.
23972753 */
2398
-static int ttwu_remote(struct task_struct *p, int wake_flags)
2754
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
23992755 {
24002756 struct rq_flags rf;
24012757 struct rq *rq;
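The wait loop quoted in the comment above is the canonical sleeper pattern; a minimal, self-contained kthread version of it is sketched below (data_ready and sleeper_fn are hypothetical names, not part of this file).

#include <linux/kthread.h>
#include <linux/sched.h>

static bool data_ready;			/* hypothetical wake condition */

static int sleeper_fn(void *unused)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (READ_ONCE(data_ready) || kthread_should_stop())
			break;

		/*
		 * If the wakeup races in between the two statements above,
		 * the task is still on its runqueue and ttwu_runnable()
		 * only has to flip p->state back to TASK_RUNNING.
		 */
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}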
....@@ -2414,75 +2770,63 @@
24142770 }
24152771
24162772 #ifdef CONFIG_SMP
2417
-void sched_ttwu_pending(void)
2773
+void sched_ttwu_pending(void *arg)
24182774 {
2775
+ struct llist_node *llist = arg;
24192776 struct rq *rq = this_rq();
2420
- struct llist_node *llist = llist_del_all(&rq->wake_list);
24212777 struct task_struct *p, *t;
24222778 struct rq_flags rf;
24232779
24242780 if (!llist)
24252781 return;
24262782
2783
+ /*
2784
+ * rq::ttwu_pending racy indication of out-standing wakeups.
2785
+ * Races such that false-negatives are possible, since they
2786
+ * are shorter lived than false-positives would be.
2787
+ */
2788
+ WRITE_ONCE(rq->ttwu_pending, 0);
2789
+
24272790 rq_lock_irqsave(rq, &rf);
24282791 update_rq_clock(rq);
24292792
2430
- llist_for_each_entry_safe(p, t, llist, wake_entry)
2793
+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
2794
+ if (WARN_ON_ONCE(p->on_cpu))
2795
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
2796
+
2797
+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
2798
+ set_task_cpu(p, cpu_of(rq));
2799
+
24312800 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
2801
+ }
24322802
24332803 rq_unlock_irqrestore(rq, &rf);
24342804 }
24352805
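The wakelist machinery above is built on the lockless llist primitives (the removed line still shows the old direct llist_del_all() on rq->wake_list; the new code receives an already detached list through the smp call-function queue). A generic sketch of that producer/consumer pattern, with a made-up item type, is given below.

#include <linux/llist.h>
#include <linux/printk.h>

struct demo_item {
	struct llist_node node;
	int payload;
};

static LLIST_HEAD(demo_list);

/* Producer side: lock-free, usable from any context. */
static void demo_queue(struct demo_item *item)
{
	/* llist_add() returns true if the list was previously empty ... */
	if (llist_add(&item->node, &demo_list))
		;	/* ... which is the point where the ttwu path sends its IPI. */
}

/* Consumer side: detach the whole batch at once, then walk it. */
static void demo_drain(void)
{
	struct llist_node *first = llist_del_all(&demo_list);
	struct demo_item *item, *tmp;

	llist_for_each_entry_safe(item, tmp, first, node)
		pr_debug("payload %d\n", item->payload);
}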
2436
-void scheduler_ipi(void)
2806
+void send_call_function_single_ipi(int cpu)
24372807 {
2438
- /*
2439
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
2440
- * TIF_NEED_RESCHED remotely (for the first time) will also send
2441
- * this IPI.
2442
- */
2443
- preempt_fold_need_resched();
2808
+ struct rq *rq = cpu_rq(cpu);
24442809
2445
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2446
- return;
2447
-
2448
- /*
2449
- * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2450
- * traditionally all their work was done from the interrupt return
2451
- * path. Now that we actually do some work, we need to make sure
2452
- * we do call them.
2453
- *
2454
- * Some archs already do call them, luckily irq_enter/exit nest
2455
- * properly.
2456
- *
2457
- * Arguably we should visit all archs and update all handlers,
2458
- * however a fair share of IPIs are still resched only so this would
2459
- * somewhat pessimize the simple resched case.
2460
- */
2461
- irq_enter();
2462
- sched_ttwu_pending();
2463
-
2464
- /*
2465
- * Check if someone kicked us for doing the nohz idle load balance.
2466
- */
2467
- if (unlikely(got_nohz_idle_kick())) {
2468
- this_rq()->idle_balance = 1;
2469
- raise_softirq_irqoff(SCHED_SOFTIRQ);
2470
- }
2471
- irq_exit();
2810
+ if (!set_nr_if_polling(rq->idle))
2811
+ arch_send_call_function_single_ipi(cpu);
2812
+ else
2813
+ trace_sched_wake_idle_without_ipi(cpu);
24722814 }
24732815
2474
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
2816
+/*
2817
+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
2818
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
2819
+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
2820
+ * of the wakeup instead of the waker.
2821
+ */
2822
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
24752823 {
24762824 struct rq *rq = cpu_rq(cpu);
24772825
24782826 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
24792827
2480
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
2481
- if (!set_nr_if_polling(rq->idle))
2482
- smp_send_reschedule(cpu);
2483
- else
2484
- trace_sched_wake_idle_without_ipi(cpu);
2485
- }
2828
+ WRITE_ONCE(rq->ttwu_pending, 1);
2829
+ __smp_call_single_queue(cpu, &p->wake_entry.llist);
24862830 }
24872831
24882832 void wake_up_if_idle(int cpu)
....@@ -2508,6 +2852,7 @@
25082852 out:
25092853 rcu_read_unlock();
25102854 }
2855
+EXPORT_SYMBOL_GPL(wake_up_if_idle);
25112856
25122857 bool cpus_share_cache(int this_cpu, int that_cpu)
25132858 {
....@@ -2516,6 +2861,58 @@
25162861
25172862 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
25182863 }
2864
+
2865
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
2866
+{
2867
+ /*
2868
+ * If the CPU does not share cache, then queue the task on the
2869
+ * remote rqs wakelist to avoid accessing remote data.
2870
+ */
2871
+ if (!cpus_share_cache(smp_processor_id(), cpu))
2872
+ return true;
2873
+
2874
+ /*
2875
+ * If the task is descheduling and the only running task on the
2876
+ * CPU then use the wakelist to offload the task activation to
2877
+ * the soon-to-be-idle CPU as the current CPU is likely busy.
2878
+ * nr_running is checked to avoid unnecessary task stacking.
2879
+ *
2880
+ * Note that we can only get here with (wakee) p->on_rq=0,
2881
+ * p->on_cpu can be whatever, we've done the dequeue, so
2882
+ * the wakee has been accounted out of ->nr_running.
2883
+ */
2884
+ if ((wake_flags & WF_ON_CPU) && !cpu_rq(cpu)->nr_running)
2885
+ return true;
2886
+
2887
+ return false;
2888
+}
2889
+
2890
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2891
+{
2892
+ bool cond = false;
2893
+
2894
+ trace_android_rvh_ttwu_cond(&cond);
2895
+
2896
+ if ((sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) ||
2897
+ cond) {
2898
+ if (WARN_ON_ONCE(cpu == smp_processor_id()))
2899
+ return false;
2900
+
2901
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2902
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
2903
+ return true;
2904
+ }
2905
+
2906
+ return false;
2907
+}
2908
+
2909
+#else /* !CONFIG_SMP */
2910
+
2911
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2912
+{
2913
+ return false;
2914
+}
2915
+
25192916 #endif /* CONFIG_SMP */
25202917
25212918 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
....@@ -2523,13 +2920,8 @@
25232920 struct rq *rq = cpu_rq(cpu);
25242921 struct rq_flags rf;
25252922
2526
-#if defined(CONFIG_SMP)
2527
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
2528
- sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2529
- ttwu_queue_remote(p, cpu, wake_flags);
2923
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
25302924 return;
2531
- }
2532
-#endif
25332925
25342926 rq_lock(rq, &rf);
25352927 update_rq_clock(rq);
....@@ -2585,8 +2977,8 @@
25852977 * migration. However the means are completely different as there is no lock
25862978 * chain to provide order. Instead we do:
25872979 *
2588
- * 1) smp_store_release(X->on_cpu, 0)
2589
- * 2) smp_cond_load_acquire(!X->on_cpu)
2980
+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
2981
+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
25902982 *
25912983 * Example:
25922984 *
....@@ -2625,64 +3017,95 @@
26253017 * @p: the thread to be awakened
26263018 * @state: the mask of task states that can be woken
26273019 * @wake_flags: wake modifier flags (WF_*)
2628
- * @sibling_count_hint: A hint at the number of threads that are being woken up
2629
- * in this event.
26303020 *
2631
- * If (@state & @p->state) @p->state = TASK_RUNNING.
3021
+ * Conceptually does:
3022
+ *
3023
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
26323024 *
26333025 * If the task was not queued/runnable, also place it back on a runqueue.
26343026 *
2635
- * Atomic against schedule() which would dequeue a task, also see
2636
- * set_current_state().
3027
+ * This function is atomic against schedule() which would dequeue the task.
26373028 *
2638
- * This function executes a full memory barrier before accessing the task
2639
- * state; see set_current_state().
3029
+ * It issues a full memory barrier before accessing @p->state, see the comment
3030
+ * with set_current_state().
3031
+ *
3032
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
3033
+ *
3034
+ * Relies on p->pi_lock stabilizing:
3035
+ * - p->sched_class
3036
+ * - p->cpus_ptr
3037
+ * - p->sched_task_group
3038
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3039
+ *
3040
+ * Tries really hard to only take one task_rq(p)->lock for performance.
3041
+ * Takes rq->lock in:
3042
+ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
3043
+ * - ttwu_queue() -- new rq, for enqueue of the task;
3044
+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3045
+ *
3046
+ * As a consequence we race really badly with just about everything. See the
3047
+ * many memory barriers and their comments for details.
26403048 *
26413049 * Return: %true if @p->state changes (an actual wakeup was done),
26423050 * %false otherwise.
26433051 */
26443052 static int
2645
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
2646
- int sibling_count_hint)
3053
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
26473054 {
26483055 unsigned long flags;
26493056 int cpu, success = 0;
26503057
2651
- /*
2652
- * If we are going to wake up a thread waiting for CONDITION we
2653
- * need to ensure that CONDITION=1 done by the caller can not be
2654
- * reordered with p->state check below. This pairs with mb() in
2655
- * set_current_state() the waiting thread does.
2656
- */
2657
- raw_spin_lock_irqsave(&p->pi_lock, flags);
2658
- smp_mb__after_spinlock();
2659
- if (!(p->state & state)) {
3058
+ preempt_disable();
3059
+ if (p == current) {
26603060 /*
2661
- * The task might be running due to a spinlock sleeper
2662
- * wakeup. Check the saved state and set it to running
2663
- * if the wakeup condition is true.
3061
+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
3062
+ * == smp_processor_id()'. Together this means we can special
3063
+ * case the whole 'p->on_rq && ttwu_runnable()' case below
3064
+ * without taking any locks.
3065
+ *
3066
+ * In particular:
3067
+ * - we rely on Program-Order guarantees for all the ordering,
3068
+ * - we're serialized against set_special_state() by virtue of
3069
+ * it disabling IRQs (this allows not taking ->pi_lock).
26643070 */
2665
- if (!(wake_flags & WF_LOCK_SLEEPER)) {
2666
- if (p->saved_state & state) {
2667
- p->saved_state = TASK_RUNNING;
2668
- success = 1;
2669
- }
2670
- }
3071
+ if (!(p->state & state))
3072
+ goto out;
3073
+
3074
+ success = 1;
3075
+ trace_sched_waking(p);
3076
+ p->state = TASK_RUNNING;
3077
+ trace_sched_wakeup(p);
26713078 goto out;
26723079 }
26733080
26743081 /*
2675
- * If this is a regular wakeup, then we can unconditionally
2676
- * clear the saved state of a "lock sleeper".
3082
+ * If we are going to wake up a thread waiting for CONDITION we
3083
+ * need to ensure that CONDITION=1 done by the caller can not be
3084
+ * reordered with p->state check below. This pairs with smp_store_mb()
3085
+ * in set_current_state() that the waiting thread does.
26773086 */
2678
- if (!(wake_flags & WF_LOCK_SLEEPER))
2679
- p->saved_state = TASK_RUNNING;
3087
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
3088
+ smp_mb__after_spinlock();
3089
+ if (!(p->state & state))
3090
+ goto unlock;
3091
+
3092
+#ifdef CONFIG_FREEZER
3093
+ /*
3094
+ * If we're going to wake up a thread which may be frozen, then
3095
+ * we can only do so if we have an active CPU which is capable of
3096
+ * running it. This may not be the case when resuming from suspend,
3097
+ * as the secondary CPUs may not yet be back online. See __thaw_task()
3098
+ * for the actual wakeup.
3099
+ */
3100
+ if (unlikely(frozen_or_skipped(p)) &&
3101
+ !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p)))
3102
+ goto unlock;
3103
+#endif
26803104
26813105 trace_sched_waking(p);
26823106
26833107 /* We're going to change ->state: */
26843108 success = 1;
2685
- cpu = task_cpu(p);
26863109
26873110 /*
26883111 * Ensure we load p->on_rq _after_ p->state, otherwise it would
....@@ -2703,10 +3126,15 @@
27033126 *
27043127 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
27053128 * __schedule(). See the comment for smp_mb__after_spinlock().
3129
+ *
3130
+ * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
27063131 */
27073132 smp_rmb();
2708
- if (p->on_rq && ttwu_remote(p, wake_flags))
2709
- goto stat;
3133
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3134
+ goto unlock;
3135
+
3136
+ if (p->state & TASK_UNINTERRUPTIBLE)
3137
+ trace_sched_blocked_reason(p);
27103138
27113139 #ifdef CONFIG_SMP
27123140 /*
....@@ -2727,8 +3155,43 @@
27273155 *
27283156 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
27293157 * __schedule(). See the comment for smp_mb__after_spinlock().
3158
+ *
3159
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
3160
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
3161
+ * care about its own p->state. See the comment in __schedule().
27303162 */
2731
- smp_rmb();
3163
+ smp_acquire__after_ctrl_dep();
3164
+
3165
+ /*
3166
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
3167
+ * == 0), which means we need to do an enqueue, change p->state to
3168
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
3169
+ * enqueue, such as ttwu_queue_wakelist().
3170
+ */
3171
+ p->state = TASK_WAKING;
3172
+
3173
+ /*
3174
+ * If the owning (remote) CPU is still in the middle of schedule() with
3175
+ * this task as prev, consider queueing p on the remote CPUs wake_list
3176
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
3177
+ * let the waker make forward progress. This is safe because IRQs are
3178
+ * disabled and the IPI will deliver after on_cpu is cleared.
3179
+ *
3180
+ * Ensure we load task_cpu(p) after p->on_cpu:
3181
+ *
3182
+ * set_task_cpu(p, cpu);
3183
+ * STORE p->cpu = @cpu
3184
+ * __schedule() (switch to task 'p')
3185
+ * LOCK rq->lock
3186
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
3187
+ * STORE p->on_cpu = 1 LOAD p->cpu
3188
+ *
3189
+ * to ensure we observe the correct CPU on which the task is currently
3190
+ * scheduling.
3191
+ */
3192
+ if (smp_load_acquire(&p->on_cpu) &&
3193
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
3194
+ goto unlock;
27323195
27333196 /*
27343197 * If the owning (remote) CPU is still in the middle of schedule() with
....@@ -2741,38 +3204,79 @@
27413204 */
27423205 smp_cond_load_acquire(&p->on_cpu, !VAL);
27433206
2744
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
2745
- p->state = TASK_WAKING;
3207
+ trace_android_rvh_try_to_wake_up(p);
27463208
2747
- if (p->in_iowait) {
2748
- delayacct_blkio_end(p);
2749
- atomic_dec(&task_rq(p)->nr_iowait);
2750
- }
2751
-
2752
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
2753
- sibling_count_hint);
3209
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
27543210 if (task_cpu(p) != cpu) {
3211
+ if (p->in_iowait) {
3212
+ delayacct_blkio_end(p);
3213
+ atomic_dec(&task_rq(p)->nr_iowait);
3214
+ }
3215
+
27553216 wake_flags |= WF_MIGRATED;
27563217 psi_ttwu_dequeue(p);
27573218 set_task_cpu(p, cpu);
27583219 }
2759
-
2760
-#else /* CONFIG_SMP */
2761
-
2762
- if (p->in_iowait) {
2763
- delayacct_blkio_end(p);
2764
- atomic_dec(&task_rq(p)->nr_iowait);
2765
- }
2766
-
3220
+#else
3221
+ cpu = task_cpu(p);
27673222 #endif /* CONFIG_SMP */
27683223
27693224 ttwu_queue(p, cpu, wake_flags);
2770
-stat:
2771
- ttwu_stat(p, cpu, wake_flags);
2772
-out:
3225
+unlock:
27733226 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3227
+out:
3228
+ if (success) {
3229
+ trace_android_rvh_try_to_wake_up_success(p);
3230
+ ttwu_stat(p, task_cpu(p), wake_flags);
3231
+ }
3232
+ preempt_enable();
27743233
27753234 return success;
3235
+}
3236
+
3237
+/**
3238
+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
3239
+ * @p: Process for which the function is to be invoked, can be @current.
3240
+ * @func: Function to invoke.
3241
+ * @arg: Argument to function.
3242
+ *
3243
+ * If the specified task can be quickly locked into a definite state
3244
+ * (either sleeping or on a given runqueue), arrange to keep it in that
3245
+ * state while invoking @func(@arg). This function can use ->on_rq and
3246
+ * task_curr() to work out what the state is, if required. Given that
3247
+ * @func can be invoked with a runqueue lock held, it had better be quite
3248
+ * lightweight.
3249
+ *
3250
+ * Returns:
3251
+ * @false if the task slipped out from under the locks.
3252
+ * @true if the task was locked onto a runqueue or is sleeping.
3253
+ * However, @func can override this by returning @false.
3254
+ */
3255
+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
3256
+{
3257
+ struct rq_flags rf;
3258
+ bool ret = false;
3259
+ struct rq *rq;
3260
+
3261
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3262
+ if (p->on_rq) {
3263
+ rq = __task_rq_lock(p, &rf);
3264
+ if (task_rq(p) == rq)
3265
+ ret = func(p, arg);
3266
+ rq_unlock(rq, &rf);
3267
+ } else {
3268
+ switch (p->state) {
3269
+ case TASK_RUNNING:
3270
+ case TASK_WAKING:
3271
+ break;
3272
+ default:
3273
+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
3274
+ if (!p->on_rq)
3275
+ ret = func(p, arg);
3276
+ }
3277
+ }
3278
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3279
+ return ret;
27763280 }
27773281
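A sketch of how a caller might use try_invoke_on_locked_down_task(); the callback and the sleeping check it performs are illustrative only, not an existing in-tree user.

static bool demo_check_sleeping(struct task_struct *t, void *arg)
{
	/* Runs under p->pi_lock (and possibly rq->lock): keep it cheap. */
	*(bool *)arg = !t->on_rq && (t->state & TASK_UNINTERRUPTIBLE);
	return true;			/* keep the "locked down" return value */
}

static bool demo_task_is_sleeping(struct task_struct *t)
{
	bool sleeping = false;

	/* A false return means the task slipped out from under the locks. */
	if (!try_invoke_on_locked_down_task(t, demo_check_sleeping, &sleeping))
		return false;

	return sleeping;
}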
27783282 /**
....@@ -2788,25 +3292,13 @@
27883292 */
27893293 int wake_up_process(struct task_struct *p)
27903294 {
2791
- return try_to_wake_up(p, TASK_NORMAL, 0, 1);
3295
+ return try_to_wake_up(p, TASK_NORMAL, 0);
27923296 }
27933297 EXPORT_SYMBOL(wake_up_process);
27943298
2795
-/**
2796
- * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
2797
- * @p: The process to be woken up.
2798
- *
2799
- * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
2800
- * the nature of the wakeup.
2801
- */
2802
-int wake_up_lock_sleeper(struct task_struct *p)
2803
-{
2804
- return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER, 1);
2805
-}
2806
-
28073299 int wake_up_state(struct task_struct *p, unsigned int state)
28083300 {
2809
- return try_to_wake_up(p, state, 0, 1);
3301
+ return try_to_wake_up(p, state, 0);
28103302 }
28113303
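This is the waker side of the sleeper sketch shown earlier at ttwu_runnable(): wake_up_process()/wake_up_state() return the try_to_wake_up() result, i.e. true only when they actually changed p->state. The flag and worker names below mirror that earlier hypothetical sketch.

#include <linux/printk.h>
#include <linux/sched.h>

static bool data_ready;			/* the sleeper's hypothetical condition */

static void demo_publish(struct task_struct *worker)
{
	WRITE_ONCE(data_ready, true);	/* the CONDITION store ... */

	/* ... which try_to_wake_up() orders against the sleeper's state check. */
	if (!wake_up_process(worker))
		pr_debug("worker was already runnable\n");
}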
28123304 /*
....@@ -2831,6 +3323,8 @@
28313323 p->se.cfs_rq = NULL;
28323324 #endif
28333325
3326
+ trace_android_rvh_sched_fork_init(p);
3327
+
28343328 #ifdef CONFIG_SCHEDSTATS
28353329 /* Even if schedstat is disabled, there should not be garbage */
28363330 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
....@@ -2851,7 +3345,13 @@
28513345 INIT_HLIST_HEAD(&p->preempt_notifiers);
28523346 #endif
28533347
3348
+#ifdef CONFIG_COMPACTION
3349
+ p->capture_control = NULL;
3350
+#endif
28543351 init_numa_balancing(clone_flags, p);
3352
+#ifdef CONFIG_SMP
3353
+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
3354
+#endif
28553355 }
28563356
28573357 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
....@@ -2868,7 +3368,7 @@
28683368
28693369 #ifdef CONFIG_PROC_SYSCTL
28703370 int sysctl_numa_balancing(struct ctl_table *table, int write,
2871
- void __user *buffer, size_t *lenp, loff_t *ppos)
3371
+ void *buffer, size_t *lenp, loff_t *ppos)
28723372 {
28733373 struct ctl_table t;
28743374 int err;
....@@ -2942,8 +3442,8 @@
29423442 }
29433443
29443444 #ifdef CONFIG_PROC_SYSCTL
2945
-int sysctl_schedstats(struct ctl_table *table, int write,
2946
- void __user *buffer, size_t *lenp, loff_t *ppos)
3445
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
3446
+ size_t *lenp, loff_t *ppos)
29473447 {
29483448 struct ctl_table t;
29493449 int err;
....@@ -2971,7 +3471,7 @@
29713471 */
29723472 int sched_fork(unsigned long clone_flags, struct task_struct *p)
29733473 {
2974
- unsigned long flags;
3474
+ trace_android_rvh_sched_fork(p);
29753475
29763476 __sched_fork(clone_flags, p);
29773477 /*
....@@ -2985,6 +3485,7 @@
29853485 * Make sure we do not leak PI boosting priority to the child.
29863486 */
29873487 p->prio = current->normal_prio;
3488
+ trace_android_rvh_prepare_prio_fork(p);
29883489
29893490 uclamp_fork(p);
29903491
....@@ -2999,8 +3500,8 @@
29993500 } else if (PRIO_TO_NICE(p->static_prio) < 0)
30003501 p->static_prio = NICE_TO_PRIO(0);
30013502
3002
- p->prio = p->normal_prio = __normal_prio(p);
3003
- set_load_weight(p, false);
3503
+ p->prio = p->normal_prio = p->static_prio;
3504
+ set_load_weight(p);
30043505
30053506 /*
30063507 * We don't need the reset flag anymore after the fork. It has
....@@ -3017,24 +3518,8 @@
30173518 p->sched_class = &fair_sched_class;
30183519
30193520 init_entity_runnable_average(&p->se);
3521
+ trace_android_rvh_finish_prio_fork(p);
30203522
3021
- /*
3022
- * The child is not yet in the pid-hash so no cgroup attach races,
3023
- * and the cgroup is pinned to this child due to cgroup_fork()
3024
- * is ran before sched_fork().
3025
- *
3026
- * Silence PROVE_RCU.
3027
- */
3028
- raw_spin_lock_irqsave(&p->pi_lock, flags);
3029
- rseq_migrate(p);
3030
- /*
3031
- * We're setting the CPU for the first time, we don't migrate,
3032
- * so use __set_task_cpu().
3033
- */
3034
- __set_task_cpu(p, smp_processor_id());
3035
- if (p->sched_class->task_fork)
3036
- p->sched_class->task_fork(p);
3037
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
30383523
30393524 #ifdef CONFIG_SCHED_INFO
30403525 if (likely(sched_info_on()))
....@@ -3044,14 +3529,46 @@
30443529 p->on_cpu = 0;
30453530 #endif
30463531 init_task_preempt_count(p);
3047
-#ifdef CONFIG_HAVE_PREEMPT_LAZY
3048
- task_thread_info(p)->preempt_lazy_count = 0;
3049
-#endif
30503532 #ifdef CONFIG_SMP
30513533 plist_node_init(&p->pushable_tasks, MAX_PRIO);
30523534 RB_CLEAR_NODE(&p->pushable_dl_tasks);
30533535 #endif
30543536 return 0;
3537
+}
3538
+
3539
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
3540
+{
3541
+ unsigned long flags;
3542
+
3543
+ /*
3544
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
3545
+ * required yet, but lockdep gets upset if rules are violated.
3546
+ */
3547
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
3548
+#ifdef CONFIG_CGROUP_SCHED
3549
+ if (1) {
3550
+ struct task_group *tg;
3551
+
3552
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
3553
+ struct task_group, css);
3554
+ tg = autogroup_task_group(p, tg);
3555
+ p->sched_task_group = tg;
3556
+ }
3557
+#endif
3558
+ rseq_migrate(p);
3559
+ /*
3560
+ * We're setting the CPU for the first time, we don't migrate,
3561
+ * so use __set_task_cpu().
3562
+ */
3563
+ __set_task_cpu(p, smp_processor_id());
3564
+ if (p->sched_class->task_fork)
3565
+ p->sched_class->task_fork(p);
3566
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3567
+}
3568
+
3569
+void sched_post_fork(struct task_struct *p)
3570
+{
3571
+ uclamp_post_fork(p);
30553572 }
30563573
30573574 unsigned long to_ratio(u64 period, u64 runtime)
....@@ -3082,6 +3599,8 @@
30823599 struct rq_flags rf;
30833600 struct rq *rq;
30843601
3602
+ trace_android_rvh_wake_up_new_task(p);
3603
+
30853604 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
30863605 p->state = TASK_RUNNING;
30873606 #ifdef CONFIG_SMP
....@@ -3095,14 +3614,14 @@
30953614 */
30963615 p->recent_used_cpu = task_cpu(p);
30973616 rseq_migrate(p);
3098
- __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
3617
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
30993618 #endif
31003619 rq = __task_rq_lock(p, &rf);
31013620 update_rq_clock(rq);
3102
- post_init_entity_util_avg(&p->se);
3621
+ post_init_entity_util_avg(p);
3622
+ trace_android_rvh_new_task_stats(p);
31033623
31043624 activate_task(rq, p, ENQUEUE_NOCLOCK);
3105
- p->on_rq = TASK_ON_RQ_QUEUED;
31063625 trace_sched_wakeup_new(p);
31073626 check_preempt_curr(rq, p, WF_FORK);
31083627 #ifdef CONFIG_SMP
....@@ -3212,8 +3731,10 @@
32123731 /*
32133732 * Claim the task as running, we do this before switching to it
32143733 * such that any running task will have this set.
3734
+ *
3735
+ * See the ttwu() WF_ON_CPU case and its ordering comment.
32153736 */
3216
- next->on_cpu = 1;
3737
+ WRITE_ONCE(next->on_cpu, 1);
32173738 #endif
32183739 }
32193740
....@@ -3221,8 +3742,9 @@
32213742 {
32223743 #ifdef CONFIG_SMP
32233744 /*
3224
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
3225
- * We must ensure this doesn't happen until the switch is completely
3745
+ * This must be the very last reference to @prev from this CPU. After
3746
+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
3747
+ * must ensure this doesn't happen until the switch is completely
32263748 * finished.
32273749 *
32283750 * In particular, the load of prev->state in finish_task_switch() must
....@@ -3244,7 +3766,7 @@
32443766 * do an early lockdep release here:
32453767 */
32463768 rq_unpin_lock(rq, rf);
3247
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3769
+ spin_release(&rq->lock.dep_map, _THIS_IP_);
32483770 #ifdef CONFIG_DEBUG_SPINLOCK
32493771 /* this is a valid case when another task releases the spinlock */
32503772 rq->lock.owner = next;
....@@ -3376,19 +3898,25 @@
33763898 * provided by mmdrop(),
33773899 * - a sync_core for SYNC_CORE.
33783900 */
3379
- /*
3380
- * We use mmdrop_delayed() here so we don't have to do the
3381
- * full __mmdrop() when we are the last user.
3382
- */
33833901 if (mm) {
33843902 membarrier_mm_sync_core_before_usermode(mm);
3385
- mmdrop_delayed(mm);
3903
+ mmdrop(mm);
33863904 }
33873905 if (unlikely(prev_state == TASK_DEAD)) {
33883906 if (prev->sched_class->task_dead)
33893907 prev->sched_class->task_dead(prev);
33903908
3391
- put_task_struct(prev);
3909
+ /*
3910
+ * Remove function-return probe instances associated with this
3911
+ * task and put them back on the free list.
3912
+ */
3913
+ kprobe_flush_task(prev);
3914
+ trace_android_rvh_flush_task(prev);
3915
+
3916
+ /* Task is done with its stack. */
3917
+ put_task_stack(prev);
3918
+
3919
+ put_task_struct_rcu_user(prev);
33923920 }
33933921
33943922 tick_nohz_task_switch();
....@@ -3467,12 +3995,8 @@
34673995 context_switch(struct rq *rq, struct task_struct *prev,
34683996 struct task_struct *next, struct rq_flags *rf)
34693997 {
3470
- struct mm_struct *mm, *oldmm;
3471
-
34723998 prepare_task_switch(rq, prev, next);
34733999
3474
- mm = next->mm;
3475
- oldmm = prev->active_mm;
34764000 /*
34774001 * For paravirt, this is coupled with an exit in switch_to to
34784002 * combine the page table reload and the switch backend into
....@@ -3481,22 +4005,37 @@
34814005 arch_start_context_switch(prev);
34824006
34834007 /*
3484
- * If mm is non-NULL, we pass through switch_mm(). If mm is
3485
- * NULL, we will pass through mmdrop() in finish_task_switch().
3486
- * Both of these contain the full memory barrier required by
3487
- * membarrier after storing to rq->curr, before returning to
3488
- * user-space.
4008
+ * kernel -> kernel lazy + transfer active
4009
+ * user -> kernel lazy + mmgrab() active
4010
+ *
4011
+ * kernel -> user switch + mmdrop() active
4012
+ * user -> user switch
34894013 */
3490
- if (!mm) {
3491
- next->active_mm = oldmm;
3492
- mmgrab(oldmm);
3493
- enter_lazy_tlb(oldmm, next);
3494
- } else
3495
- switch_mm_irqs_off(oldmm, mm, next);
4014
+ if (!next->mm) { // to kernel
4015
+ enter_lazy_tlb(prev->active_mm, next);
34964016
3497
- if (!prev->mm) {
3498
- prev->active_mm = NULL;
3499
- rq->prev_mm = oldmm;
4017
+ next->active_mm = prev->active_mm;
4018
+ if (prev->mm) // from user
4019
+ mmgrab(prev->active_mm);
4020
+ else
4021
+ prev->active_mm = NULL;
4022
+ } else { // to user
4023
+ membarrier_switch_mm(rq, prev->active_mm, next->mm);
4024
+ /*
4025
+ * sys_membarrier() requires an smp_mb() between setting
4026
+ * rq->curr / membarrier_switch_mm() and returning to userspace.
4027
+ *
4028
+ * The below provides this either through switch_mm(), or in
4029
+ * case 'prev->active_mm == next->mm' through
4030
+ * finish_task_switch()'s mmdrop().
4031
+ */
4032
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
4033
+
4034
+ if (!prev->mm) { // from kernel
4035
+ /* will mmdrop() in finish_task_switch(). */
4036
+ rq->prev_mm = prev->active_mm;
4037
+ prev->active_mm = NULL;
4038
+ }
35004039 }
35014040
35024041 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
....@@ -3533,7 +4072,7 @@
35334072 * preemption, thus the result might have a time-of-check-to-time-of-use
35344073 * race. The caller is responsible to use it correctly, for example:
35354074 *
3536
- * - from a non-preemptable section (of course)
4075
+ * - from a non-preemptible section (of course)
35374076 *
35384077 * - from a thread that is bound to a single CPU
35394078 *
....@@ -3554,6 +4093,18 @@
35544093 sum += cpu_rq(i)->nr_switches;
35554094
35564095 return sum;
4096
+}
4097
+
4098
+/*
4099
+ * Consumers of these two interfaces, like for example the cpuidle menu
4100
+ * governor, are using nonsensical data. Preferring shallow idle state selection
4101
+ * for a CPU that has IO-wait which might not even end up running the task when
4102
+ * it does become runnable.
4103
+ */
4104
+
4105
+unsigned long nr_iowait_cpu(int cpu)
4106
+{
4107
+ return atomic_read(&cpu_rq(cpu)->nr_iowait);
35574108 }
35584109
35594110 /*
....@@ -3591,29 +4142,9 @@
35914142 unsigned long i, sum = 0;
35924143
35934144 for_each_possible_cpu(i)
3594
- sum += atomic_read(&cpu_rq(i)->nr_iowait);
4145
+ sum += nr_iowait_cpu(i);
35954146
35964147 return sum;
3597
-}
3598
-
3599
-/*
3600
- * Consumers of these two interfaces, like for example the cpufreq menu
3601
- * governor are using nonsensical data. Boosting frequency for a CPU that has
3602
- * IO-wait which might not even end up running the task when it does become
3603
- * runnable.
3604
- */
3605
-
3606
-unsigned long nr_iowait_cpu(int cpu)
3607
-{
3608
- struct rq *this = cpu_rq(cpu);
3609
- return atomic_read(&this->nr_iowait);
3610
-}
3611
-
3612
-void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
3613
-{
3614
- struct rq *rq = this_rq();
3615
- *nr_waiters = atomic_read(&rq->nr_iowait);
3616
- *load = rq->load.weight;
36174148 }
36184149
36194150 #ifdef CONFIG_SMP
....@@ -3627,9 +4158,14 @@
36274158 struct task_struct *p = current;
36284159 unsigned long flags;
36294160 int dest_cpu;
4161
+ bool cond = false;
4162
+
4163
+ trace_android_rvh_sched_exec(&cond);
4164
+ if (cond)
4165
+ return;
36304166
36314167 raw_spin_lock_irqsave(&p->pi_lock, flags);
3632
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
4168
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
36334169 if (dest_cpu == smp_processor_id())
36344170 goto unlock;
36354171
....@@ -3712,6 +4248,7 @@
37124248
37134249 return ns;
37144250 }
4251
+EXPORT_SYMBOL_GPL(task_sched_runtime);
37154252
37164253 /*
37174254 * This function gets called by the timer code, with HZ frequency.
....@@ -3723,14 +4260,18 @@
37234260 struct rq *rq = cpu_rq(cpu);
37244261 struct task_struct *curr = rq->curr;
37254262 struct rq_flags rf;
4263
+ unsigned long thermal_pressure;
37264264
4265
+ arch_scale_freq_tick();
37274266 sched_clock_tick();
37284267
37294268 rq_lock(rq, &rf);
37304269
4270
+ trace_android_rvh_tick_entry(rq);
37314271 update_rq_clock(rq);
4272
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
4273
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
37324274 curr->sched_class->task_tick(rq, curr, 0);
3733
- cpu_load_update_active(rq);
37344275 calc_global_load_tick(rq);
37354276 psi_task_tick(rq);
37364277
....@@ -3742,6 +4283,8 @@
37424283 rq->idle_balance = idle_cpu(cpu);
37434284 trigger_load_balance(rq);
37444285 #endif
4286
+
4287
+ trace_android_vh_scheduler_tick(rq);
37454288 }
37464289
37474290 #ifdef CONFIG_NO_HZ_FULL
....@@ -3799,28 +4342,31 @@
37994342 * statistics and checks timeslices in a time-independent way, regardless
38004343 * of when exactly it is running.
38014344 */
3802
- if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
4345
+ if (!tick_nohz_tick_stopped_cpu(cpu))
38034346 goto out_requeue;
38044347
38054348 rq_lock_irq(rq, &rf);
38064349 curr = rq->curr;
3807
- if (is_idle_task(curr) || cpu_is_offline(cpu))
4350
+ if (cpu_is_offline(cpu))
38084351 goto out_unlock;
38094352
38104353 update_rq_clock(rq);
3811
- delta = rq_clock_task(rq) - curr->se.exec_start;
38124354
3813
- /*
3814
- * Make sure the next tick runs within a reasonable
3815
- * amount of time.
3816
- */
3817
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4355
+ if (!is_idle_task(curr)) {
4356
+ /*
4357
+ * Make sure the next tick runs within a reasonable
4358
+ * amount of time.
4359
+ */
4360
+ delta = rq_clock_task(rq) - curr->se.exec_start;
4361
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4362
+ }
38184363 curr->sched_class->task_tick(rq, curr, 0);
38194364
4365
+ calc_load_nohz_remote(rq);
38204366 out_unlock:
38214367 rq_unlock_irq(rq, &rf);
3822
-
38234368 out_requeue:
4369
+
38244370 /*
38254371 * Run the remote tick once per second (1Hz). This arbitrary
38264372 * frequency is large enough to avoid overload but short enough
....@@ -3884,7 +4430,7 @@
38844430 static inline void sched_tick_stop(int cpu) { }
38854431 #endif
38864432
3887
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4433
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
38884434 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
38894435 /*
38904436 * If the value passed in is equal to the current preempt count
....@@ -3990,11 +4536,11 @@
39904536 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
39914537 && in_atomic_preempt_off()) {
39924538 pr_err("Preemption disabled at:");
3993
- print_ip_sym(preempt_disable_ip);
3994
- pr_cont("\n");
4539
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
39954540 }
3996
- if (panic_on_warn)
3997
- panic("scheduling while atomic\n");
4541
+ check_panic_on_warn("scheduling while atomic");
4542
+
4543
+ trace_android_rvh_schedule_bug(prev);
39984544
39994545 dump_stack();
40004546 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
....@@ -4003,11 +4549,23 @@
40034549 /*
40044550 * Various schedule()-time debugging checks and statistics:
40054551 */
4006
-static inline void schedule_debug(struct task_struct *prev)
4552
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
40074553 {
40084554 #ifdef CONFIG_SCHED_STACK_END_CHECK
40094555 if (task_stack_end_corrupted(prev))
40104556 panic("corrupted stack end detected inside scheduler\n");
4557
+
4558
+ if (task_scs_end_corrupted(prev))
4559
+ panic("corrupted shadow stack detected inside scheduler\n");
4560
+#endif
4561
+
4562
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
4563
+ if (!preempt && prev->state && prev->non_block_count) {
4564
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
4565
+ prev->comm, prev->pid, prev->non_block_count);
4566
+ dump_stack();
4567
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4568
+ }
40114569 #endif
40124570
40134571 if (unlikely(in_atomic_preempt_off())) {
....@@ -4019,6 +4577,28 @@
40194577 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
40204578
40214579 schedstat_inc(this_rq()->sched_count);
4580
+}
4581
+
4582
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
4583
+ struct rq_flags *rf)
4584
+{
4585
+#ifdef CONFIG_SMP
4586
+ const struct sched_class *class;
4587
+ /*
4588
+ * We must do the balancing pass before put_prev_task(), such
4589
+ * that when we release the rq->lock the task is in the same
4590
+ * state as before we took rq->lock.
4591
+ *
4592
+ * We can terminate the balance pass as soon as we know there is
4593
+ * a runnable task of @class priority or higher.
4594
+ */
4595
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
4596
+ if (class->balance(rq, prev, rf))
4597
+ break;
4598
+ }
4599
+#endif
4600
+
4601
+ put_prev_task(rq, prev);
40224602 }
40234603
40244604 /*
....@@ -4036,36 +4616,34 @@
40364616 * higher scheduling class, because otherwise those loose the
40374617 * opportunity to pull in more work from other CPUs.
40384618 */
4039
- if (likely((prev->sched_class == &idle_sched_class ||
4040
- prev->sched_class == &fair_sched_class) &&
4619
+ if (likely(prev->sched_class <= &fair_sched_class &&
40414620 rq->nr_running == rq->cfs.h_nr_running)) {
40424621
4043
- p = fair_sched_class.pick_next_task(rq, prev, rf);
4622
+ p = pick_next_task_fair(rq, prev, rf);
40444623 if (unlikely(p == RETRY_TASK))
4045
- goto again;
4624
+ goto restart;
40464625
40474626 /* Assumes fair_sched_class->next == idle_sched_class */
4048
- if (unlikely(!p))
4049
- p = idle_sched_class.pick_next_task(rq, prev, rf);
4627
+ if (!p) {
4628
+ put_prev_task(rq, prev);
4629
+ p = pick_next_task_idle(rq);
4630
+ }
40504631
40514632 return p;
40524633 }
40534634
4054
-again:
4635
+restart:
4636
+ put_prev_task_balance(rq, prev, rf);
4637
+
40554638 for_each_class(class) {
4056
- p = class->pick_next_task(rq, prev, rf);
4057
- if (p) {
4058
- if (unlikely(p == RETRY_TASK))
4059
- goto again;
4639
+ p = class->pick_next_task(rq);
4640
+ if (p)
40604641 return p;
4061
- }
40624642 }
40634643
40644644 /* The idle class should always have a runnable task: */
40654645 BUG();
40664646 }
4067
-
4068
-static void migrate_disabled_sched(struct task_struct *p);
40694647
40704648 /*
40714649 * __schedule() is the main scheduler function.
....@@ -4087,7 +4665,7 @@
40874665 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
40884666 * called on the nearest possible occasion:
40894667 *
4090
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
4668
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
40914669 *
40924670 * - in syscall or exception context, at the next outmost
40934671 * preempt_enable(). (this might be as soon as the wake_up()'s
....@@ -4096,7 +4674,7 @@
40964674 * - in IRQ context, return from interrupt-handler to
40974675 * preemptible context
40984676 *
4099
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
4677
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
41004678 * then at the next:
41014679 *
41024680 * - cond_resched() call
....@@ -4110,6 +4688,7 @@
41104688 {
41114689 struct task_struct *prev, *next;
41124690 unsigned long *switch_count;
4691
+ unsigned long prev_state;
41134692 struct rq_flags rf;
41144693 struct rq *rq;
41154694 int cpu;
....@@ -4118,7 +4697,7 @@
41184697 rq = cpu_rq(cpu);
41194698 prev = rq->curr;
41204699
4121
- schedule_debug(prev);
4700
+ schedule_debug(prev, preempt);
41224701
41234702 if (sched_feat(HRTICK))
41244703 hrtick_clear(rq);
....@@ -4129,28 +4708,59 @@
41294708 /*
41304709 * Make sure that signal_pending_state()->signal_pending() below
41314710 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
4132
- * done by the caller to avoid the race with signal_wake_up().
4711
+ * done by the caller to avoid the race with signal_wake_up():
41334712 *
4134
- * The membarrier system call requires a full memory barrier
4713
+ * __set_current_state(@state) signal_wake_up()
4714
+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
4715
+ * wake_up_state(p, state)
4716
+ * LOCK rq->lock LOCK p->pi_state
4717
+ * smp_mb__after_spinlock() smp_mb__after_spinlock()
4718
+ * if (signal_pending_state()) if (p->state & @state)
4719
+ *
4720
+ * Also, the membarrier system call requires a full memory barrier
41354721 * after coming from user-space, before storing to rq->curr.
41364722 */
41374723 rq_lock(rq, &rf);
41384724 smp_mb__after_spinlock();
4139
-
4140
- if (__migrate_disabled(prev))
4141
- migrate_disabled_sched(prev);
41424725
41434726 /* Promote REQ to ACT */
41444727 rq->clock_update_flags <<= 1;
41454728 update_rq_clock(rq);
41464729
41474730 switch_count = &prev->nivcsw;
4148
- if (!preempt && prev->state) {
4149
- if (unlikely(signal_pending_state(prev->state, prev))) {
4731
+
4732
+ /*
4733
+ * We must load prev->state once (task_struct::state is volatile), such
4734
+ * that:
4735
+ *
4736
+ * - we form a control dependency vs deactivate_task() below.
4737
+ * - ptrace_{,un}freeze_traced() can change ->state underneath us.
4738
+ */
4739
+ prev_state = prev->state;
4740
+ if (!preempt && prev_state) {
4741
+ if (signal_pending_state(prev_state, prev)) {
41504742 prev->state = TASK_RUNNING;
41514743 } else {
4744
+ prev->sched_contributes_to_load =
4745
+ (prev_state & TASK_UNINTERRUPTIBLE) &&
4746
+ !(prev_state & TASK_NOLOAD) &&
4747
+ !(prev->flags & PF_FROZEN);
4748
+
4749
+ if (prev->sched_contributes_to_load)
4750
+ rq->nr_uninterruptible++;
4751
+
4752
+ /*
4753
+ * __schedule() ttwu()
4754
+ * prev_state = prev->state; if (p->on_rq && ...)
4755
+ * if (prev_state) goto out;
4756
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
4757
+ * p->state = TASK_WAKING
4758
+ *
4759
+ * Where __schedule() and ttwu() have matching control dependencies.
4760
+ *
4761
+ * After this, schedule() must not care about p->state any more.
4762
+ */
41524763 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
4153
- prev->on_rq = 0;
41544764
41554765 if (prev->in_iowait) {
41564766 atomic_inc(&rq->nr_iowait);
....@@ -4162,12 +4772,16 @@
41624772
41634773 next = pick_next_task(rq, prev, &rf);
41644774 clear_tsk_need_resched(prev);
4165
- clear_tsk_need_resched_lazy(prev);
41664775 clear_preempt_need_resched();
41674776
4777
+ trace_android_rvh_schedule(prev, next, rq);
41684778 if (likely(prev != next)) {
41694779 rq->nr_switches++;
4170
- rq->curr = next;
4780
+ /*
4781
+ * RCU users of rcu_dereference(rq->curr) may not see
4782
+ * changes to task_struct made by pick_next_task().
4783
+ */
4784
+ RCU_INIT_POINTER(rq->curr, next);
41714785 /*
41724786 * The membarrier system call requires each architecture
41734787 * to have a full memory barrier after updating
....@@ -4183,6 +4797,8 @@
41834797 * is a RELEASE barrier),
41844798 */
41854799 ++*switch_count;
4800
+
4801
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
41864802
41874803 trace_sched_switch(preempt, prev, next);
41884804
....@@ -4214,19 +4830,26 @@
42144830
42154831 static inline void sched_submit_work(struct task_struct *tsk)
42164832 {
4833
+ unsigned int task_flags;
4834
+
42174835 if (!tsk->state)
42184836 return;
42194837
4838
+ task_flags = tsk->flags;
42204839 /*
42214840 * If a worker went to sleep, notify and ask workqueue whether
42224841 * it wants to wake up a task to maintain concurrency.
42234842 * As this function is called inside the schedule() context,
42244843 * we disable preemption to avoid it calling schedule() again
4225
- * in the possible wakeup of a kworker.
4844
+ * in the possible wakeup of a kworker and because wq_worker_sleeping()
4845
+ * requires it.
42264846 */
4227
- if (tsk->flags & PF_WQ_WORKER) {
4847
+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
42284848 preempt_disable();
4229
- wq_worker_sleeping(tsk);
4849
+ if (task_flags & PF_WQ_WORKER)
4850
+ wq_worker_sleeping(tsk);
4851
+ else
4852
+ io_wq_worker_sleeping(tsk);
42304853 preempt_enable_no_resched();
42314854 }
42324855
....@@ -4243,8 +4866,12 @@
42434866
42444867 static void sched_update_worker(struct task_struct *tsk)
42454868 {
4246
- if (tsk->flags & PF_WQ_WORKER)
4247
- wq_worker_running(tsk);
4869
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
4870
+ if (tsk->flags & PF_WQ_WORKER)
4871
+ wq_worker_running(tsk);
4872
+ else
4873
+ io_wq_worker_running(tsk);
4874
+ }
42484875 }
42494876
42504877 asmlinkage __visible void __sched schedule(void)
....@@ -4346,35 +4973,10 @@
43464973 } while (need_resched());
43474974 }
43484975
4349
-#ifdef CONFIG_PREEMPT_LAZY
4976
+#ifdef CONFIG_PREEMPTION
43504977 /*
4351
- * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is
4352
- * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as
4353
- * preempt_lazy_count counter >0.
4354
- */
4355
-static __always_inline int preemptible_lazy(void)
4356
-{
4357
- if (test_thread_flag(TIF_NEED_RESCHED))
4358
- return 1;
4359
- if (current_thread_info()->preempt_lazy_count)
4360
- return 0;
4361
- return 1;
4362
-}
4363
-
4364
-#else
4365
-
4366
-static inline int preemptible_lazy(void)
4367
-{
4368
- return 1;
4369
-}
4370
-
4371
-#endif
4372
-
4373
-#ifdef CONFIG_PREEMPT
4374
-/*
4375
- * this is the entry point to schedule() from in-kernel preemption
4376
- * off of preempt_enable. Kernel preemptions off return from interrupt
4377
- * occur there and call schedule directly.
4978
+ * This is the entry point to schedule() from in-kernel preemption
4979
+ * off of preempt_enable.
43784980 */
43794981 asmlinkage __visible void __sched notrace preempt_schedule(void)
43804982 {
....@@ -4384,8 +4986,7 @@
43844986 */
43854987 if (likely(!preemptible()))
43864988 return;
4387
- if (!preemptible_lazy())
4388
- return;
4989
+
43894990 preempt_schedule_common();
43904991 }
43914992 NOKPROBE_SYMBOL(preempt_schedule);
....@@ -4410,9 +5011,6 @@
44105011 enum ctx_state prev_ctx;
44115012
44125013 if (likely(!preemptible()))
4413
- return;
4414
-
4415
- if (!preemptible_lazy())
44165014 return;
44175015
44185016 do {
....@@ -4446,10 +5044,10 @@
44465044 }
44475045 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
44485046
4449
-#endif /* CONFIG_PREEMPT */
5047
+#endif /* CONFIG_PREEMPTION */
44505048
44515049 /*
4452
- * this is the entry point to schedule() from kernel preemption
5050
+ * This is the entry point to schedule() from kernel preemption
44535051 * off of irq context.
44545052 * Note, that this is called and return with irqs disabled. This will
44555053 * protect us against recursive calling from irq.
....@@ -4477,9 +5075,22 @@
44775075 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
44785076 void *key)
44795077 {
4480
- return try_to_wake_up(curr->private, mode, wake_flags, 1);
5078
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC | WF_ANDROID_VENDOR));
5079
+ return try_to_wake_up(curr->private, mode, wake_flags);
44815080 }
44825081 EXPORT_SYMBOL(default_wake_function);
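default_wake_function() is what a plain wait-queue entry points at; a minimal sketch of that usage follows (the queue head and the condition are made up).

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_waitq);
static bool demo_cond;

static void demo_wait(void)
{
	wait_queue_entry_t wq_entry;

	init_waitqueue_entry(&wq_entry, current);	/* .func = default_wake_function */
	add_wait_queue(&demo_waitq, &wq_entry);

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (READ_ONCE(demo_cond))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&demo_waitq, &wq_entry);
}

/* Waker side: wake_up() walks the queue and invokes each entry's ->func. */
static void demo_signal(void)
{
	WRITE_ONCE(demo_cond, true);
	wake_up(&demo_waitq);
}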
5082
+
5083
+static void __setscheduler_prio(struct task_struct *p, int prio)
5084
+{
5085
+ if (dl_prio(prio))
5086
+ p->sched_class = &dl_sched_class;
5087
+ else if (rt_prio(prio))
5088
+ p->sched_class = &rt_sched_class;
5089
+ else
5090
+ p->sched_class = &fair_sched_class;
5091
+
5092
+ p->prio = prio;
5093
+}
44835094
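The usual caller-visible path into __setscheduler_prio() is sched_setscheduler_nocheck(); a hedged sketch of a driver promoting one of its kthreads to SCHED_FIFO is given below (the thread pointer and the priority value are arbitrary).

#include <linux/printk.h>
#include <linux/sched.h>
#include <uapi/linux/sched/types.h>

static void demo_make_fifo(struct task_struct *tsk)
{
	struct sched_param sp = { .sched_priority = 10 };

	/* Ends up in __sched_setscheduler() -> __setscheduler_prio(). */
	if (sched_setscheduler_nocheck(tsk, SCHED_FIFO, &sp))
		pr_warn("%s: could not switch %s to SCHED_FIFO\n",
			__func__, tsk->comm);
}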
44845095 #ifdef CONFIG_RT_MUTEXES
44855096
....@@ -4517,6 +5128,7 @@
45175128 struct rq_flags rf;
45185129 struct rq *rq;
45195130
5131
+ trace_android_rvh_rtmutex_prepare_setprio(p, pi_task);
45205132 /* XXX used to be waiter->prio, not waiter->task->prio */
45215133 prio = __rt_effective_prio(pi_task, p->normal_prio);
45225134
....@@ -4591,31 +5203,29 @@
45915203 if (!dl_prio(p->normal_prio) ||
45925204 (pi_task && dl_prio(pi_task->prio) &&
45935205 dl_entity_preempt(&pi_task->dl, &p->dl))) {
4594
- p->dl.dl_boosted = 1;
5206
+ p->dl.pi_se = pi_task->dl.pi_se;
45955207 queue_flag |= ENQUEUE_REPLENISH;
4596
- } else
4597
- p->dl.dl_boosted = 0;
4598
- p->sched_class = &dl_sched_class;
5208
+ } else {
5209
+ p->dl.pi_se = &p->dl;
5210
+ }
45995211 } else if (rt_prio(prio)) {
46005212 if (dl_prio(oldprio))
4601
- p->dl.dl_boosted = 0;
5213
+ p->dl.pi_se = &p->dl;
46025214 if (oldprio < prio)
46035215 queue_flag |= ENQUEUE_HEAD;
4604
- p->sched_class = &rt_sched_class;
46055216 } else {
46065217 if (dl_prio(oldprio))
4607
- p->dl.dl_boosted = 0;
5218
+ p->dl.pi_se = &p->dl;
46085219 if (rt_prio(oldprio))
46095220 p->rt.timeout = 0;
4610
- p->sched_class = &fair_sched_class;
46115221 }
46125222
4613
- p->prio = prio;
5223
+ __setscheduler_prio(p, prio);
46145224
46155225 if (queued)
46165226 enqueue_task(rq, p, queue_flag);
46175227 if (running)
4618
- set_curr_task(rq, p);
5228
+ set_next_task(rq, p);
46195229
46205230 check_class_changed(rq, p, prev_class, oldprio);
46215231 out_unlock:
....@@ -4635,12 +5245,13 @@
46355245
46365246 void set_user_nice(struct task_struct *p, long nice)
46375247 {
4638
- bool queued, running;
4639
- int old_prio, delta;
5248
+ bool queued, running, allowed = false;
5249
+ int old_prio;
46405250 struct rq_flags rf;
46415251 struct rq *rq;
46425252
4643
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
5253
+ trace_android_rvh_set_user_nice(p, &nice, &allowed);
5254
+ if ((task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) && !allowed)
46445255 return;
46455256 /*
46465257 * We have to be careful, if called from sys_setpriority(),
....@@ -4667,22 +5278,21 @@
46675278 put_prev_task(rq, p);
46685279
46695280 p->static_prio = NICE_TO_PRIO(nice);
4670
- set_load_weight(p, true);
5281
+ set_load_weight(p);
46715282 old_prio = p->prio;
46725283 p->prio = effective_prio(p);
4673
- delta = p->prio - old_prio;
46745284
4675
- if (queued) {
5285
+ if (queued)
46765286 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
4677
- /*
4678
- * If the task increased its priority or is running and
4679
- * lowered its priority, then reschedule its CPU:
4680
- */
4681
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
4682
- resched_curr(rq);
4683
- }
46845287 if (running)
4685
- set_curr_task(rq, p);
5288
+ set_next_task(rq, p);
5289
+
5290
+ /*
5291
+ * If the task increased its priority or is running and
5292
+ * lowered its priority, then reschedule its CPU:
5293
+ */
5294
+ p->sched_class->prio_changed(rq, p, old_prio);
5295
+
46865296 out_unlock:
46875297 task_rq_unlock(rq, p, &rf);
46885298 }
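set_user_nice() is the in-kernel analogue of nice(2); the usual pattern is a kthread lowering its own priority right after it starts, as in the illustrative sketch below.

#include <linux/kthread.h>
#include <linux/sched.h>

static int demo_background_thread(void *unused)
{
	set_user_nice(current, MAX_NICE);	/* nice 19: lowest CFS weight */

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);

	return 0;
}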
....@@ -4767,7 +5377,7 @@
47675377 return 0;
47685378
47695379 #ifdef CONFIG_SMP
4770
- if (!llist_empty(&rq->wake_list))
5380
+ if (rq->ttwu_pending)
47715381 return 0;
47725382 #endif
47735383
....@@ -4790,6 +5400,7 @@
47905400
47915401 return 1;
47925402 }
5403
+EXPORT_SYMBOL_GPL(available_idle_cpu);
47935404
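The GPL export above lets modules ask the same question the scheduler's own placement code asks; the scan below is purely illustrative, not an existing in-tree user.

#include <linux/cpumask.h>
#include <linux/sched.h>

static int demo_find_idle_cpu(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		/* Idle and not already targeted by a queued remote wakeup. */
		if (available_idle_cpu(cpu))
			return cpu;
	}
	return -1;
}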
47945405 /**
47955406 * idle_task - return the idle task for a given CPU.
....@@ -4841,36 +5452,7 @@
48415452 */
48425453 p->rt_priority = attr->sched_priority;
48435454 p->normal_prio = normal_prio(p);
4844
- set_load_weight(p, true);
4845
-}
4846
-
4847
-/* Actually do priority change: must hold pi & rq lock. */
4848
-static void __setscheduler(struct rq *rq, struct task_struct *p,
4849
- const struct sched_attr *attr, bool keep_boost)
4850
-{
4851
- /*
4852
- * If params can't change scheduling class changes aren't allowed
4853
- * either.
4854
- */
4855
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
4856
- return;
4857
-
4858
- __setscheduler_params(p, attr);
4859
-
4860
- /*
4861
- * Keep a potential priority boosting if called from
4862
- * sched_setscheduler().
4863
- */
4864
- p->prio = normal_prio(p);
4865
- if (keep_boost)
4866
- p->prio = rt_effective_prio(p, p->prio);
4867
-
4868
- if (dl_prio(p->prio))
4869
- p->sched_class = &dl_sched_class;
4870
- else if (rt_prio(p->prio))
4871
- p->sched_class = &rt_sched_class;
4872
- else
4873
- p->sched_class = &fair_sched_class;
5455
+ set_load_weight(p);
48745456 }
48755457
48765458 /*
....@@ -4893,15 +5475,14 @@
48935475 const struct sched_attr *attr,
48945476 bool user, bool pi)
48955477 {
4896
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4897
- MAX_RT_PRIO - 1 - attr->sched_priority;
4898
- int retval, oldprio, oldpolicy = -1, queued, running;
4899
- int new_effective_prio, policy = attr->sched_policy;
5478
+ int oldpolicy = -1, policy = attr->sched_policy;
5479
+ int retval, oldprio, newprio, queued, running;
49005480 const struct sched_class *prev_class;
49015481 struct rq_flags rf;
49025482 int reset_on_fork;
49035483 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
49045484 struct rq *rq;
5485
+ bool cpuset_locked = false;
49055486
49065487 /* The pi code expects interrupts enabled */
49075488 BUG_ON(pi && in_interrupt());
....@@ -4969,7 +5550,7 @@
49695550 * Treat SCHED_IDLE as nice 20. Only allow a switch to
49705551 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
49715552 */
4972
- if (idle_policy(p->policy) && !idle_policy(policy)) {
5553
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
49735554 if (!can_nice(p, task_nice(p)))
49745555 return -EPERM;
49755556 }
....@@ -4980,6 +5561,10 @@
49805561
49815562 /* Normal users shall not reset the sched_reset_on_fork flag: */
49825563 if (p->sched_reset_on_fork && !reset_on_fork)
5564
+ return -EPERM;
5565
+
5566
+ /* Can't change util-clamps */
5567
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
49835568 return -EPERM;
49845569 }
49855570
....@@ -5000,6 +5585,15 @@
50005585 }
50015586
50025587 /*
5588
+ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
5589
+ * information.
5590
+ */
5591
+ if (dl_policy(policy) || dl_policy(p->policy)) {
5592
+ cpuset_locked = true;
5593
+ cpuset_lock();
5594
+ }
5595
+
5596
+ /*
50035597 * Make sure no PI-waiters arrive (or leave) while we are
50045598 * changing the priority of the task:
50055599 *
....@@ -5013,8 +5607,8 @@
50135607 * Changing the policy of the stop threads its a very bad idea:
50145608 */
50155609 if (p == rq->stop) {
5016
- task_rq_unlock(rq, p, &rf);
5017
- return -EINVAL;
5610
+ retval = -EINVAL;
5611
+ goto unlock;
50185612 }
50195613
50205614 /*
....@@ -5032,8 +5626,8 @@
50325626 goto change;
50335627
50345628 p->sched_reset_on_fork = reset_on_fork;
5035
- task_rq_unlock(rq, p, &rf);
5036
- return 0;
5629
+ retval = 0;
5630
+ goto unlock;
50375631 }
50385632 change:
50395633
....@@ -5046,8 +5640,8 @@
50465640 if (rt_bandwidth_enabled() && rt_policy(policy) &&
50475641 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
50485642 !task_group_is_autogroup(task_group(p))) {
5049
- task_rq_unlock(rq, p, &rf);
5050
- return -EPERM;
5643
+ retval = -EPERM;
5644
+ goto unlock;
50515645 }
50525646 #endif
50535647 #ifdef CONFIG_SMP
....@@ -5062,8 +5656,8 @@
50625656 */
50635657 if (!cpumask_subset(span, p->cpus_ptr) ||
50645658 rq->rd->dl_bw.bw == 0) {
5065
- task_rq_unlock(rq, p, &rf);
5066
- return -EPERM;
5659
+ retval = -EPERM;
5660
+ goto unlock;
50675661 }
50685662 }
50695663 #endif
....@@ -5073,6 +5667,8 @@
50735667 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
50745668 policy = oldpolicy = -1;
50755669 task_rq_unlock(rq, p, &rf);
5670
+ if (cpuset_locked)
5671
+ cpuset_unlock();
50765672 goto recheck;
50775673 }
50785674
....@@ -5082,13 +5678,14 @@
50825678 * is available.
50835679 */
50845680 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
5085
- task_rq_unlock(rq, p, &rf);
5086
- return -EBUSY;
5681
+ retval = -EBUSY;
5682
+ goto unlock;
50875683 }
50885684
50895685 p->sched_reset_on_fork = reset_on_fork;
50905686 oldprio = p->prio;
50915687
5688
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
50925689 if (pi) {
50935690 /*
50945691 * Take priority boosted tasks into account. If the new
....@@ -5097,8 +5694,8 @@
50975694 * the runqueue. This will be done when the task deboost
50985695 * itself.
50995696 */
5100
- new_effective_prio = rt_effective_prio(p, newprio);
5101
- if (new_effective_prio == oldprio)
5697
+ newprio = rt_effective_prio(p, newprio);
5698
+ if (newprio == oldprio)
51025699 queue_flags &= ~DEQUEUE_MOVE;
51035700 }
51045701
....@@ -5111,7 +5708,11 @@
51115708
51125709 prev_class = p->sched_class;
51135710
5114
- __setscheduler(rq, p, attr, pi);
5711
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
5712
+ __setscheduler_params(p, attr);
5713
+ __setscheduler_prio(p, newprio);
5714
+ trace_android_rvh_setscheduler(p);
5715
+ }
51155716 __setscheduler_uclamp(p, attr);
51165717
51175718 if (queued) {
....@@ -5125,7 +5726,7 @@
51255726 enqueue_task(rq, p, queue_flags);
51265727 }
51275728 if (running)
5128
- set_curr_task(rq, p);
5729
+ set_next_task(rq, p);
51295730
51305731 check_class_changed(rq, p, prev_class, oldprio);
51315732
....@@ -5133,14 +5734,23 @@
51335734 preempt_disable();
51345735 task_rq_unlock(rq, p, &rf);
51355736
5136
- if (pi)
5737
+ if (pi) {
5738
+ if (cpuset_locked)
5739
+ cpuset_unlock();
51375740 rt_mutex_adjust_pi(p);
5741
+ }
51385742
51395743 /* Run balance callbacks after we've adjusted the PI chain: */
51405744 balance_callback(rq);
51415745 preempt_enable();
51425746
51435747 return 0;
5748
+
5749
+unlock:
5750
+ task_rq_unlock(rq, p, &rf);
5751
+ if (cpuset_locked)
5752
+ cpuset_unlock();
5753
+ return retval;
51445754 }
51455755
51465756 static int _sched_setscheduler(struct task_struct *p, int policy,
....@@ -5152,6 +5762,14 @@
51525762 .sched_nice = PRIO_TO_NICE(p->static_prio),
51535763 };
51545764
5765
+ if (IS_ENABLED(CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO) &&
5766
+ ((policy == SCHED_FIFO) || (policy == SCHED_RR))) {
5767
+ attr.sched_priority /= 2;
5768
+ if (!check)
5769
+ attr.sched_priority += MAX_RT_PRIO / 2;
5770
+ if (!attr.sched_priority)
5771
+ attr.sched_priority = 1;
5772
+ }
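A quick worked example of what the CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO remapping above does, assuming the stock MAX_RT_PRIO of 100 (so MAX_RT_PRIO / 2 == 50); the concrete priorities are illustrative only:

	/* check == true:  userspace request via sched_setscheduler()          */
	/*     sched_priority 99 -> 99 / 2              = 49                   */
	/*     sched_priority  1 ->  1 / 2 = 0, bumped back up to 1            */
	/* check == false: in-kernel request via sched_setscheduler_nocheck()  */
	/*     sched_priority 99 -> 99 / 2 + 50         = 99                   */
	/*     sched_priority  1 ->  1 / 2 + 50         = 50                   */

In other words, user-requested RT priorities get compressed into [1, 49] while in-kernel requests land in [50, 99]; the apparent intent is that vendor kthreads configured through the nocheck path always sit above application RT tasks.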
51555773 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
51565774 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
51575775 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
....@@ -5166,6 +5784,8 @@
51665784 * @p: the task in question.
51675785 * @policy: new policy.
51685786 * @param: structure containing the new RT priority.
5787
+ *
5788
+ * Use sched_set_fifo(), read its comment.
51695789 *
51705790 * Return: 0 on success. An error code otherwise.
51715791 *
....@@ -5188,6 +5808,7 @@
51885808 {
51895809 return __sched_setscheduler(p, attr, false, true);
51905810 }
5811
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
51915812
51925813 /**
51935814 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
....@@ -5208,6 +5829,51 @@
52085829 return _sched_setscheduler(p, policy, param, false);
52095830 }
52105831 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
5832
+
5833
+/*
5834
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
5835
+ * incapable of resource management, which is the one thing an OS really should
5836
+ * be doing.
5837
+ *
5838
+ * This is of course the reason it is limited to privileged users only.
5839
+ *
5840
+ * Worse still; it is fundamentally impossible to compose static priority
5841
+ * workloads. You cannot take two correctly working static prio workloads
5842
+ * and smash them together and still expect them to work.
5843
+ *
5844
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
5845
+ *
5846
+ * MAX_RT_PRIO / 2
5847
+ *
5848
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
5849
+ * know enough information to make a sensible choice.
5850
+ */
5851
+void sched_set_fifo(struct task_struct *p)
5852
+{
5853
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
5854
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5855
+}
5856
+EXPORT_SYMBOL_GPL(sched_set_fifo);
5857
+
5858
+/*
5859
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
5860
+ */
5861
+void sched_set_fifo_low(struct task_struct *p)
5862
+{
5863
+ struct sched_param sp = { .sched_priority = 1 };
5864
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5865
+}
5866
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
5867
+
5868
+void sched_set_normal(struct task_struct *p, int nice)
5869
+{
5870
+ struct sched_attr attr = {
5871
+ .sched_policy = SCHED_NORMAL,
5872
+ .sched_nice = nice,
5873
+ };
5874
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
5875
+}
5876
+EXPORT_SYMBOL_GPL(sched_set_normal);
52115877
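As a usage illustration for the helpers just added (hypothetical driver code, not from this tree): in-kernel users are expected to stop hard-coding RT priorities and instead ask for "FIFO, details up to the administrator" via sched_set_fifo():

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Hypothetical example module code, for illustration only. */
static struct task_struct *worker;

static int my_worker_fn(void *data)
{
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();			/* real work elided */
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static int start_worker(void)
{
	worker = kthread_run(my_worker_fn, NULL, "my-rt-worker");
	if (IS_ERR(worker))
		return PTR_ERR(worker);

	/* Lands at MAX_RT_PRIO / 2, per the comment above. */
	sched_set_fifo(worker);
	return 0;
}

sched_set_fifo_low() is the variant for when "just above SCHED_NORMAL" is enough, and sched_set_normal() drops a task back to CFS at a given nice value.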
52125878 static int
52135879 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
....@@ -5239,9 +5905,6 @@
52395905 u32 size;
52405906 int ret;
52415907
5242
- if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
5243
- return -EFAULT;
5244
-
52455908 /* Zero the full structure, so that a short copy will be nice: */
52465909 memset(attr, 0, sizeof(*attr));
52475910
....@@ -5249,44 +5912,18 @@
52495912 if (ret)
52505913 return ret;
52515914
5252
- /* Bail out on silly large: */
5253
- if (size > PAGE_SIZE)
5254
- goto err_size;
5255
-
52565915 /* ABI compatibility quirk: */
52575916 if (!size)
52585917 size = SCHED_ATTR_SIZE_VER0;
5259
-
5260
- if (size < SCHED_ATTR_SIZE_VER0)
5918
+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
52615919 goto err_size;
52625920
5263
- /*
5264
- * If we're handed a bigger struct than we know of,
5265
- * ensure all the unknown bits are 0 - i.e. new
5266
- * user-space does not rely on any kernel feature
5267
- * extensions we dont know about yet.
5268
- */
5269
- if (size > sizeof(*attr)) {
5270
- unsigned char __user *addr;
5271
- unsigned char __user *end;
5272
- unsigned char val;
5273
-
5274
- addr = (void __user *)uattr + sizeof(*attr);
5275
- end = (void __user *)uattr + size;
5276
-
5277
- for (; addr < end; addr++) {
5278
- ret = get_user(val, addr);
5279
- if (ret)
5280
- return ret;
5281
- if (val)
5282
- goto err_size;
5283
- }
5284
- size = sizeof(*attr);
5921
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
5922
+ if (ret) {
5923
+ if (ret == -E2BIG)
5924
+ goto err_size;
5925
+ return ret;
52855926 }
5286
-
5287
- ret = copy_from_user(attr, uattr, size);
5288
- if (ret)
5289
- return -EFAULT;
52905927
52915928 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
52925929 size < SCHED_ATTR_SIZE_VER1)
....@@ -5303,6 +5940,16 @@
53035940 err_size:
53045941 put_user(sizeof(*attr), &uattr->size);
53055942 return -E2BIG;
5943
+}
5944
+
5945
+static void get_params(struct task_struct *p, struct sched_attr *attr)
5946
+{
5947
+ if (task_has_dl_policy(p))
5948
+ __getparam_dl(p, attr);
5949
+ else if (task_has_rt_policy(p))
5950
+ attr->sched_priority = p->rt_priority;
5951
+ else
5952
+ attr->sched_nice = task_nice(p);
53065953 }
53075954
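For context on the sched_copy_attr() rewrite above: copy_struct_from_user() packages the whole "extensible uAPI struct" dance that the removed open-coded loop used to do by hand. It copies min(ksize, usize) bytes, zero-fills the kernel-side remainder when userspace passed a short struct, and rejects a larger struct with -E2BIG unless every trailing byte is zero. A minimal sketch of the same pattern for a hypothetical ioctl-style struct:

#include <linux/types.h>
#include <linux/uaccess.h>

struct foo_attr {		/* hypothetical, grows across ABI versions */
	u32 size;		/* sizeof() that userspace was built against */
	u32 flags;
	u64 value;		/* field added in a later version */
};

static int copy_foo_attr(struct foo_attr *attr,
			 const struct foo_attr __user *uattr, u32 usize)
{
	int ret;

	/*
	 * Old userspace (usize < sizeof(*attr)): missing fields read as 0.
	 * New userspace (usize > sizeof(*attr)): unknown trailing bytes must
	 * all be zero, otherwise -E2BIG tells it the kernel is too old.
	 */
	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, usize);
	if (ret)
		return ret;	/* -EFAULT or -E2BIG */

	return 0;
}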
53085955 /**
....@@ -5366,6 +6013,8 @@
53666013 rcu_read_unlock();
53676014
53686015 if (likely(p)) {
6016
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
6017
+ get_params(p, &attr);
53696018 retval = sched_setattr(p, &attr);
53706019 put_task_struct(p);
53716020 }
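The SCHED_FLAG_KEEP_PARAMS handling above is what lets userspace adjust a single attribute (typically the util clamps) without re-stating policy parameters it never queried: get_params() backfills them from the task before __sched_setscheduler() runs. A hypothetical userspace sketch of that usage; the struct layout and flag values mirror include/uapi/linux/sched/types.h and include/uapi/linux/sched.h, and on this tree the caller needs CAP_SYS_NICE because of the clamp permission check added earlier in this patch:

#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

struct sched_attr {			/* mirrors include/uapi/linux/sched/types.h */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime, sched_deadline, sched_period;
	uint32_t sched_util_min, sched_util_max;
};

#define SCHED_FLAG_KEEP_POLICY		0x08
#define SCHED_FLAG_KEEP_PARAMS		0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN	0x20

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_flags = SCHED_FLAG_KEEP_POLICY | SCHED_FLAG_KEEP_PARAMS |
			   SCHED_FLAG_UTIL_CLAMP_MIN;
	attr.sched_util_min = 512;	/* ask for ~50% of CPU capacity */

	/* pid 0 == calling thread; policy and RT/DL params stay untouched */
	return syscall(__NR_sched_setattr, 0, &attr, 0);
}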
....@@ -5459,7 +6108,7 @@
54596108 {
54606109 unsigned int ksize = sizeof(*kattr);
54616110
5462
- if (!access_ok(VERIFY_WRITE, uattr, usize))
6111
+ if (!access_ok(uattr, usize))
54636112 return -EFAULT;
54646113
54656114 /*
....@@ -5487,7 +6136,7 @@
54876136 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
54886137 * @pid: the pid in question.
54896138 * @uattr: structure containing the extended parameters.
5490
- * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
6139
+ * @usize: sizeof(attr) for fwd/bwd comp.
54916140 * @flags: for future extension.
54926141 */
54936142 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
....@@ -5514,14 +6163,15 @@
55146163 kattr.sched_policy = p->policy;
55156164 if (p->sched_reset_on_fork)
55166165 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5517
- if (task_has_dl_policy(p))
5518
- __getparam_dl(p, &kattr);
5519
- else if (task_has_rt_policy(p))
5520
- kattr.sched_priority = p->rt_priority;
5521
- else
5522
- kattr.sched_nice = task_nice(p);
6166
+ get_params(p, &kattr);
6167
+ kattr.sched_flags &= SCHED_FLAG_ALL;
55236168
55246169 #ifdef CONFIG_UCLAMP_TASK
6170
+ /*
6171
+ * This could race with another potential updater, but this is fine
6172
+ * because it'll correctly read the old or the new value. We don't need
6173
+ * to guarantee who wins the race as long as it doesn't return garbage.
6174
+ */
55256175 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
55266176 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
55276177 #endif
....@@ -5540,6 +6190,7 @@
55406190 cpumask_var_t cpus_allowed, new_mask;
55416191 struct task_struct *p;
55426192 int retval;
6193
+ int skip = 0;
55436194
55446195 rcu_read_lock();
55456196
....@@ -5575,6 +6226,9 @@
55756226 rcu_read_unlock();
55766227 }
55776228
6229
+ trace_android_vh_sched_setaffinity_early(p, in_mask, &skip);
6230
+ if (skip)
6231
+ goto out_free_new_mask;
55786232 retval = security_task_setscheduler(p);
55796233 if (retval)
55806234 goto out_free_new_mask;
....@@ -5615,6 +6269,9 @@
56156269 goto again;
56166270 }
56176271 }
6272
+
6273
+ trace_android_rvh_sched_setaffinity(p, in_mask, &retval);
6274
+
56186275 out_free_new_mask:
56196276 free_cpumask_var(new_mask);
56206277 out_free_cpus_allowed:
....@@ -5623,7 +6280,6 @@
56236280 put_task_struct(p);
56246281 return retval;
56256282 }
5626
-EXPORT_SYMBOL_GPL(sched_setaffinity);
56276283
56286284 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
56296285 struct cpumask *new_mask)
....@@ -5707,14 +6363,14 @@
57076363 if (len & (sizeof(unsigned long)-1))
57086364 return -EINVAL;
57096365
5710
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
6366
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
57116367 return -ENOMEM;
57126368
57136369 ret = sched_getaffinity(pid, mask);
57146370 if (ret == 0) {
57156371 unsigned int retlen = min(len, cpumask_size());
57166372
5717
- if (copy_to_user(user_mask_ptr, mask, retlen))
6373
+ if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
57186374 ret = -EFAULT;
57196375 else
57206376 ret = retlen;
....@@ -5742,6 +6398,8 @@
57426398 schedstat_inc(rq->yld_count);
57436399 current->sched_class->yield_task(rq);
57446400
6401
+ trace_android_rvh_do_sched_yield(rq);
6402
+
57456403 preempt_disable();
57466404 rq_unlock_irq(rq, &rf);
57476405 sched_preempt_enable_no_resched();
....@@ -5755,7 +6413,7 @@
57556413 return 0;
57566414 }
57576415
5758
-#ifndef CONFIG_PREEMPT
6416
+#ifndef CONFIG_PREEMPTION
57596417 int __sched _cond_resched(void)
57606418 {
57616419 if (should_resched(0)) {
....@@ -5772,7 +6430,7 @@
57726430 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
57736431 * call schedule, and on return reacquire the lock.
57746432 *
5775
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6433
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
57766434 * operations here to prevent schedule() from being called twice (once via
57776435 * spin_unlock(), once by hand).
57786436 */
....@@ -5876,7 +6534,7 @@
58766534 if (task_running(p_rq, p) || p->state)
58776535 goto out_unlock;
58786536
5879
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
6537
+ yielded = curr->sched_class->yield_to_task(rq, p);
58806538 if (yielded) {
58816539 schedstat_inc(rq->yld_count);
58826540 /*
....@@ -6042,7 +6700,7 @@
60426700 * an error code.
60436701 */
60446702 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6045
- struct timespec __user *, interval)
6703
+ struct __kernel_timespec __user *, interval)
60466704 {
60476705 struct timespec64 t;
60486706 int retval = sched_rr_get_interval(pid, &t);
....@@ -6053,16 +6711,15 @@
60536711 return retval;
60546712 }
60556713
6056
-#ifdef CONFIG_COMPAT
6057
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
6058
- compat_pid_t, pid,
6059
- struct compat_timespec __user *, interval)
6714
+#ifdef CONFIG_COMPAT_32BIT_TIME
6715
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
6716
+ struct old_timespec32 __user *, interval)
60606717 {
60616718 struct timespec64 t;
60626719 int retval = sched_rr_get_interval(pid, &t);
60636720
60646721 if (retval == 0)
6065
- retval = compat_put_timespec64(&t, interval);
6722
+ retval = put_old_timespec32(&t, interval);
60666723 return retval;
60676724 }
60686725 #endif
....@@ -6075,10 +6732,10 @@
60756732 if (!try_get_task_stack(p))
60766733 return;
60776734
6078
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
6735
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
60796736
60806737 if (p->state == TASK_RUNNING)
6081
- printk(KERN_CONT " running task ");
6738
+ pr_cont(" running task ");
60826739 #ifdef CONFIG_DEBUG_STACK_USAGE
60836740 free = stack_not_used(p);
60846741 #endif
....@@ -6087,12 +6744,13 @@
60876744 if (pid_alive(p))
60886745 ppid = task_pid_nr(rcu_dereference(p->real_parent));
60896746 rcu_read_unlock();
6090
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6091
- task_pid_nr(p), ppid,
6747
+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
6748
+ free, task_pid_nr(p), ppid,
60926749 (unsigned long)task_thread_info(p)->flags);
60936750
60946751 print_worker_info(KERN_INFO, p);
6095
- show_stack(p, NULL);
6752
+ trace_android_vh_sched_show_task(p);
6753
+ show_stack(p, NULL, KERN_INFO);
60966754 put_task_stack(p);
60976755 }
60986756 EXPORT_SYMBOL_GPL(sched_show_task);
....@@ -6123,13 +6781,6 @@
61236781 {
61246782 struct task_struct *g, *p;
61256783
6126
-#if BITS_PER_LONG == 32
6127
- printk(KERN_INFO
6128
- " task PC stack pid father\n");
6129
-#else
6130
- printk(KERN_INFO
6131
- " task PC stack pid father\n");
6132
-#endif
61336784 rcu_read_lock();
61346785 for_each_process_thread(g, p) {
61356786 /*
....@@ -6165,7 +6816,7 @@
61656816 * NOTE: this function does not set the idle thread's NEED_RESCHED
61666817 * flag, to make booting more robust.
61676818 */
6168
-void init_idle(struct task_struct *idle, int cpu)
6819
+void __init init_idle(struct task_struct *idle, int cpu)
61696820 {
61706821 struct rq *rq = cpu_rq(cpu);
61716822 unsigned long flags;
....@@ -6178,9 +6829,6 @@
61786829 idle->state = TASK_RUNNING;
61796830 idle->se.exec_start = sched_clock();
61806831 idle->flags |= PF_IDLE;
6181
-
6182
- scs_task_reset(idle);
6183
- kasan_unpoison_task_stack(idle);
61846832
61856833 #ifdef CONFIG_SMP
61866834 /*
....@@ -6205,7 +6853,8 @@
62056853 __set_task_cpu(idle, cpu);
62066854 rcu_read_unlock();
62076855
6208
- rq->curr = rq->idle = idle;
6856
+ rq->idle = idle;
6857
+ rcu_assign_pointer(rq->curr, idle);
62096858 idle->on_rq = TASK_ON_RQ_QUEUED;
62106859 #ifdef CONFIG_SMP
62116860 idle->on_cpu = 1;
....@@ -6215,9 +6864,7 @@
62156864
62166865 /* Set the preempt count _outside_ the spinlocks! */
62176866 init_idle_preempt_count(idle, cpu);
6218
-#ifdef CONFIG_HAVE_PREEMPT_LAZY
6219
- task_thread_info(idle)->preempt_lazy_count = 0;
6220
-#endif
6867
+
62216868 /*
62226869 * The idle tasks have their own, simple scheduling class:
62236870 */
....@@ -6244,8 +6891,7 @@
62446891 return ret;
62456892 }
62466893
6247
-int task_can_attach(struct task_struct *p,
6248
- const struct cpumask *cs_cpus_allowed)
6894
+int task_can_attach(struct task_struct *p)
62496895 {
62506896 int ret = 0;
62516897
....@@ -6258,16 +6904,9 @@
62586904 * success of set_cpus_allowed_ptr() on all attached tasks
62596905 * before cpus_mask may be changed.
62606906 */
6261
- if (p->flags & PF_NO_SETAFFINITY) {
6907
+ if (p->flags & PF_NO_SETAFFINITY)
62626908 ret = -EINVAL;
6263
- goto out;
6264
- }
62656909
6266
- if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
6267
- cs_cpus_allowed))
6268
- ret = dl_task_can_attach(p, cs_cpus_allowed);
6269
-
6270
-out:
62716910 return ret;
62726911 }
62736912
....@@ -6316,13 +6955,12 @@
63166955 if (queued)
63176956 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
63186957 if (running)
6319
- set_curr_task(rq, p);
6958
+ set_next_task(rq, p);
63206959 task_rq_unlock(rq, p, &rf);
63216960 }
63226961 #endif /* CONFIG_NUMA_BALANCING */
63236962
63246963 #ifdef CONFIG_HOTPLUG_CPU
6325
-
63266964 /*
63276965 * Ensure that the idle task is using init_mm right before its CPU goes
63286966 * offline.
....@@ -6358,21 +6996,22 @@
63586996 atomic_long_add(delta, &calc_load_tasks);
63596997 }
63606998
6361
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
6999
+static struct task_struct *__pick_migrate_task(struct rq *rq)
63627000 {
7001
+ const struct sched_class *class;
7002
+ struct task_struct *next;
7003
+
7004
+ for_each_class(class) {
7005
+ next = class->pick_next_task(rq);
7006
+ if (next) {
7007
+ next->sched_class->put_prev_task(rq, next);
7008
+ return next;
7009
+ }
7010
+ }
7011
+
7012
+ /* The idle class should always have a runnable task */
7013
+ BUG();
63637014 }
6364
-
6365
-static const struct sched_class fake_sched_class = {
6366
- .put_prev_task = put_prev_task_fake,
6367
-};
6368
-
6369
-static struct task_struct fake_task = {
6370
- /*
6371
- * Avoid pull_{rt,dl}_task()
6372
- */
6373
- .prio = MAX_PRIO + 1,
6374
- .sched_class = &fake_sched_class,
6375
-};
63767015
63777016 /*
63787017 * Migrate all tasks from the rq, sleeping tasks will be migrated by
....@@ -6381,11 +7020,14 @@
63817020	 * Called with rq->lock held even though we're in stop_machine() and
63827021 * there's no concurrency possible, we hold the required locks anyway
63837022 * because of lock validation efforts.
7023
+ *
7024
+ * force: if false, the function will skip CPU-pinned kthreads.
63847025 */
6385
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
7026
+static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, bool force)
63867027 {
63877028 struct rq *rq = dead_rq;
6388
- struct task_struct *next, *stop = rq->stop;
7029
+ struct task_struct *next, *tmp, *stop = rq->stop;
7030
+ LIST_HEAD(percpu_kthreads);
63897031 struct rq_flags orf = *rf;
63907032 int dest_cpu;
63917033
....@@ -6407,6 +7049,11 @@
64077049 */
64087050 update_rq_clock(rq);
64097051
7052
+#ifdef CONFIG_SCHED_DEBUG
7053
+ /* note the clock update in orf */
7054
+ orf.clock_update_flags |= RQCF_UPDATED;
7055
+#endif
7056
+
64107057 for (;;) {
64117058 /*
64127059 * There's this thread running, bail when that's the only
....@@ -6415,14 +7062,21 @@
64157062 if (rq->nr_running == 1)
64167063 break;
64177064
6418
- /*
6419
- * pick_next_task() assumes pinned rq->lock:
6420
- */
6421
- next = pick_next_task(rq, &fake_task, rf);
6422
- BUG_ON(!next);
6423
- put_prev_task(rq, next);
7065
+ next = __pick_migrate_task(rq);
64247066
6425
- WARN_ON_ONCE(__migrate_disabled(next));
7067
+ /*
7068
+ * Argh ... no iterator for tasks, we need to remove the
7069
+ * kthread from the run-queue to continue.
7070
+ */
7071
+ if (!force && is_per_cpu_kthread(next)) {
7072
+ INIT_LIST_HEAD(&next->percpu_kthread_node);
7073
+ list_add(&next->percpu_kthread_node, &percpu_kthreads);
7074
+
7075
+ /* DEQUEUE_SAVE not used due to move_entity in rt */
7076
+ deactivate_task(rq, next,
7077
+ DEQUEUE_NOCLOCK);
7078
+ continue;
7079
+ }
64267080
64277081 /*
64287082 * Rules for changing task_struct::cpus_mask are holding
....@@ -6442,7 +7096,14 @@
64427096 * changed the task, WARN if weird stuff happened, because in
64437097 * that case the above rq->lock drop is a fail too.
64447098 */
6445
- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
7099
+ if (task_rq(next) != rq || !task_on_rq_queued(next)) {
7100
+ /*
7101
+ * In the !force case, there is a hole between
7102
+ * rq_unlock() and rq_relock(), where another CPU might
7103
+ * not observe an up to date cpu_active_mask and try to
7104
+ * move tasks around.
7105
+ */
7106
+ WARN_ON(force);
64467107 raw_spin_unlock(&next->pi_lock);
64477108 continue;
64487109 }
....@@ -6459,7 +7120,49 @@
64597120 raw_spin_unlock(&next->pi_lock);
64607121 }
64617122
7123
+ list_for_each_entry_safe(next, tmp, &percpu_kthreads,
7124
+ percpu_kthread_node) {
7125
+
7126
+ /* ENQUEUE_RESTORE not used due to move_entity in rt */
7127
+ activate_task(rq, next, ENQUEUE_NOCLOCK);
7128
+ list_del(&next->percpu_kthread_node);
7129
+ }
7130
+
64627131 rq->stop = stop;
7132
+}
7133
+
7134
+static int drain_rq_cpu_stop(void *data)
7135
+{
7136
+ struct rq *rq = this_rq();
7137
+ struct rq_flags rf;
7138
+
7139
+ rq_lock_irqsave(rq, &rf);
7140
+ migrate_tasks(rq, &rf, false);
7141
+ rq_unlock_irqrestore(rq, &rf);
7142
+
7143
+ return 0;
7144
+}
7145
+
7146
+int sched_cpu_drain_rq(unsigned int cpu)
7147
+{
7148
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7149
+ struct cpu_stop_done *rq_drain_done = &(cpu_rq(cpu)->drain_done);
7150
+
7151
+ if (idle_cpu(cpu)) {
7152
+ rq_drain->done = NULL;
7153
+ return 0;
7154
+ }
7155
+
7156
+ return stop_one_cpu_async(cpu, drain_rq_cpu_stop, NULL, rq_drain,
7157
+ rq_drain_done);
7158
+}
7159
+
7160
+void sched_cpu_drain_rq_wait(unsigned int cpu)
7161
+{
7162
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7163
+
7164
+ if (rq_drain->done)
7165
+ cpu_stop_work_wait(rq_drain);
64637166 }
64647167 #endif /* CONFIG_HOTPLUG_CPU */
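sched_cpu_drain_rq() / sched_cpu_drain_rq_wait() split the old "migrate everything from the dying CPU" step into an asynchronous kick plus an explicit wait, with the !force variant of migrate_tasks() parking per-CPU kthreads on a temporary list instead of moving them. The real caller is presumably the vendor CPU pause path; a hypothetical sketch of the expected calling pattern (error handling elided):

#include <linux/cpumask.h>

/* Hypothetical pause-path helper: kick the drain on every CPU first so the
 * per-CPU stopper works can run in parallel, then wait for each of them. */
static void drain_cpus(const struct cpumask *cpus)
{
	unsigned int cpu;

	for_each_cpu(cpu, cpus)
		sched_cpu_drain_rq(cpu);	/* async; no-op if the CPU is idle */

	for_each_cpu(cpu, cpus)
		sched_cpu_drain_rq_wait(cpu);	/* blocks until that drain ran */
}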
64657168
....@@ -6531,8 +7234,10 @@
65317234 static int cpuset_cpu_inactive(unsigned int cpu)
65327235 {
65337236 if (!cpuhp_tasks_frozen) {
6534
- if (dl_cpu_busy(cpu))
6535
- return -EBUSY;
7237
+ int ret = dl_bw_check_overflow(cpu);
7238
+
7239
+ if (ret)
7240
+ return ret;
65367241 cpuset_update_active_cpus();
65377242 } else {
65387243 num_cpus_frozen++;
....@@ -6581,19 +7286,27 @@
65817286 return 0;
65827287 }
65837288
6584
-int sched_cpu_deactivate(unsigned int cpu)
7289
+int sched_cpus_activate(struct cpumask *cpus)
7290
+{
7291
+ unsigned int cpu;
7292
+
7293
+ for_each_cpu(cpu, cpus) {
7294
+ if (sched_cpu_activate(cpu)) {
7295
+ for_each_cpu_and(cpu, cpus, cpu_active_mask)
7296
+ sched_cpu_deactivate(cpu);
7297
+
7298
+ return -EBUSY;
7299
+ }
7300
+ }
7301
+
7302
+ return 0;
7303
+}
7304
+
7305
+int _sched_cpu_deactivate(unsigned int cpu)
65857306 {
65867307 int ret;
65877308
65887309 set_cpu_active(cpu, false);
6589
- /*
6590
- * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
6591
- * users of this state to go away such that all new such users will
6592
- * observe it.
6593
- *
6594
- * Do sync before park smpboot threads to take care the rcu boost case.
6595
- */
6596
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
65977310
65987311 #ifdef CONFIG_SCHED_SMT
65997312 /*
....@@ -6612,6 +7325,46 @@
66127325 return ret;
66137326 }
66147327 sched_domains_numa_masks_clear(cpu);
7328
+
7329
+ update_max_interval();
7330
+
7331
+ return 0;
7332
+}
7333
+
7334
+int sched_cpu_deactivate(unsigned int cpu)
7335
+{
7336
+ int ret = _sched_cpu_deactivate(cpu);
7337
+
7338
+ if (ret)
7339
+ return ret;
7340
+
7341
+ /*
7342
+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
7343
+ * users of this state to go away such that all new such users will
7344
+ * observe it.
7345
+ *
7346
+ * Do sync before park smpboot threads to take care the rcu boost case.
7347
+ */
7348
+ synchronize_rcu();
7349
+
7350
+ return 0;
7351
+}
7352
+
7353
+int sched_cpus_deactivate_nosync(struct cpumask *cpus)
7354
+{
7355
+ unsigned int cpu;
7356
+
7357
+ for_each_cpu(cpu, cpus) {
7358
+ if (_sched_cpu_deactivate(cpu)) {
7359
+ for_each_cpu(cpu, cpus) {
7360
+ if (!cpu_active(cpu))
7361
+ sched_cpu_activate(cpu);
7362
+ }
7363
+
7364
+ return -EBUSY;
7365
+ }
7366
+ }
7367
+
66157368 return 0;
66167369 }
66177370
....@@ -6620,13 +7373,13 @@
66207373 struct rq *rq = cpu_rq(cpu);
66217374
66227375 rq->calc_load_update = calc_load_update;
6623
- update_max_interval();
66247376 }
66257377
66267378 int sched_cpu_starting(unsigned int cpu)
66277379 {
66287380 sched_rq_cpu_starting(cpu);
66297381 sched_tick_start(cpu);
7382
+ trace_android_rvh_sched_cpu_starting(cpu);
66307383 return 0;
66317384 }
66327385
....@@ -6637,7 +7390,6 @@
66377390 struct rq_flags rf;
66387391
66397392 /* Handle pending wakeups and then migrate everything off */
6640
- sched_ttwu_pending();
66417393 sched_tick_stop(cpu);
66427394
66437395 rq_lock_irqsave(rq, &rf);
....@@ -6645,12 +7397,13 @@
66457397 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
66467398 set_rq_offline(rq);
66477399 }
6648
- migrate_tasks(rq, &rf);
7400
+ migrate_tasks(rq, &rf, true);
66497401 BUG_ON(rq->nr_running != 1);
66507402 rq_unlock_irqrestore(rq, &rf);
66517403
7404
+ trace_android_rvh_sched_cpu_dying(cpu);
7405
+
66527406 calc_load_migrate(rq);
6653
- update_max_interval();
66547407 nohz_balance_exit_idle(rq);
66557408 hrtick_clear(rq);
66567409 return 0;
....@@ -6664,18 +7417,16 @@
66647417 /*
66657418 * There's no userspace yet to cause hotplug operations; hence all the
66667419 * CPU masks are stable and all blatant races in the below code cannot
6667
- * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
6668
- * but there won't be any contention on it.
7420
+ * happen.
66697421 */
6670
- cpus_read_lock();
66717422 mutex_lock(&sched_domains_mutex);
66727423 sched_init_domains(cpu_active_mask);
66737424 mutex_unlock(&sched_domains_mutex);
6674
- cpus_read_unlock();
66757425
66767426 /* Move init over to a non-isolated CPU */
66777427 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
66787428 BUG();
7429
+
66797430 sched_init_granularity();
66807431
66817432 init_sched_rt_class();
....@@ -6686,7 +7437,7 @@
66867437
66877438 static int __init migration_init(void)
66887439 {
6689
- sched_rq_cpu_starting(smp_processor_id());
7440
+ sched_cpu_starting(smp_processor_id());
66907441 return 0;
66917442 }
66927443 early_initcall(migration_init);
....@@ -6711,7 +7462,9 @@
67117462 * Every task in system belongs to this group at bootup.
67127463 */
67137464 struct task_group root_task_group;
7465
+EXPORT_SYMBOL_GPL(root_task_group);
67147466 LIST_HEAD(task_groups);
7467
+EXPORT_SYMBOL_GPL(task_groups);
67157468
67167469 /* Cacheline aligned slab cache for task_group */
67177470 static struct kmem_cache *task_group_cache __read_mostly;
....@@ -6722,19 +7475,27 @@
67227475
67237476 void __init sched_init(void)
67247477 {
6725
- int i, j;
6726
- unsigned long alloc_size = 0, ptr;
7478
+ unsigned long ptr = 0;
7479
+ int i;
7480
+
7481
+ /* Make sure the linker didn't screw up */
7482
+ BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
7483
+ &fair_sched_class + 1 != &rt_sched_class ||
7484
+ &rt_sched_class + 1 != &dl_sched_class);
7485
+#ifdef CONFIG_SMP
7486
+ BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
7487
+#endif
67277488
67287489 wait_bit_init();
67297490
67307491 #ifdef CONFIG_FAIR_GROUP_SCHED
6731
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7492
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
67327493 #endif
67337494 #ifdef CONFIG_RT_GROUP_SCHED
6734
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7495
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
67357496 #endif
6736
- if (alloc_size) {
6737
- ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7497
+ if (ptr) {
7498
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
67387499
67397500 #ifdef CONFIG_FAIR_GROUP_SCHED
67407501 root_task_group.se = (struct sched_entity **)ptr;
....@@ -6743,6 +7504,8 @@
67437504 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
67447505 ptr += nr_cpu_ids * sizeof(void **);
67457506
7507
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
7508
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
67467509 #endif /* CONFIG_FAIR_GROUP_SCHED */
67477510 #ifdef CONFIG_RT_GROUP_SCHED
67487511 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
....@@ -6795,7 +7558,6 @@
67957558 init_rt_rq(&rq->rt);
67967559 init_dl_rq(&rq->dl);
67977560 #ifdef CONFIG_FAIR_GROUP_SCHED
6798
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
67997561 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
68007562 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
68017563 /*
....@@ -6817,7 +7579,6 @@
68177579 * We achieve this by letting root_task_group's tasks sit
68187580 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
68197581 */
6820
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
68217582 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
68227583 #endif /* CONFIG_FAIR_GROUP_SCHED */
68237584
....@@ -6825,10 +7586,6 @@
68257586 #ifdef CONFIG_RT_GROUP_SCHED
68267587 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
68277588 #endif
6828
-
6829
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6830
- rq->cpu_load[j] = 0;
6831
-
68327589 #ifdef CONFIG_SMP
68337590 rq->sd = NULL;
68347591 rq->rd = NULL;
....@@ -6847,16 +7604,17 @@
68477604
68487605 rq_attach_root(rq, &def_root_domain);
68497606 #ifdef CONFIG_NO_HZ_COMMON
6850
- rq->last_load_update_tick = jiffies;
68517607 rq->last_blocked_load_update_tick = jiffies;
68527608 atomic_set(&rq->nohz_flags, 0);
7609
+
7610
+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
68537611 #endif
68547612 #endif /* CONFIG_SMP */
68557613 hrtick_rq_init(rq);
68567614 atomic_set(&rq->nr_iowait, 0);
68577615 }
68587616
6859
- set_load_weight(&init_task, false);
7617
+ set_load_weight(&init_task);
68607618
68617619 /*
68627620 * The boot idle thread does lazy MMU switching as well:
....@@ -6891,7 +7649,7 @@
68917649 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
68927650 static inline int preempt_count_equals(int preempt_offset)
68937651 {
6894
- int nested = preempt_count() + sched_rcu_preempt_depth();
7652
+ int nested = preempt_count() + rcu_preempt_depth();
68957653
68967654 return (nested == preempt_offset);
68977655 }
....@@ -6925,7 +7683,7 @@
69257683 rcu_sleep_check();
69267684
69277685 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6928
- !is_idle_task(current)) ||
7686
+ !is_idle_task(current) && !current->non_block_count) ||
69297687 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
69307688 oops_in_progress)
69317689 return;
....@@ -6941,8 +7699,8 @@
69417699 "BUG: sleeping function called from invalid context at %s:%d\n",
69427700 file, line);
69437701 printk(KERN_ERR
6944
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6945
- in_atomic(), irqs_disabled(),
7702
+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
7703
+ in_atomic(), irqs_disabled(), current->non_block_count,
69467704 current->pid, current->comm);
69477705
69487706 if (task_stack_end_corrupted(current))
....@@ -6954,13 +7712,43 @@
69547712 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
69557713 && !preempt_count_equals(preempt_offset)) {
69567714 pr_err("Preemption disabled at:");
6957
- print_ip_sym(preempt_disable_ip);
6958
- pr_cont("\n");
7715
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
69597716 }
7717
+
7718
+ trace_android_rvh_schedule_bug(NULL);
7719
+
69607720 dump_stack();
69617721 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
69627722 }
69637723 EXPORT_SYMBOL(___might_sleep);
7724
+
7725
+void __cant_sleep(const char *file, int line, int preempt_offset)
7726
+{
7727
+ static unsigned long prev_jiffy;
7728
+
7729
+ if (irqs_disabled())
7730
+ return;
7731
+
7732
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
7733
+ return;
7734
+
7735
+ if (preempt_count() > preempt_offset)
7736
+ return;
7737
+
7738
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7739
+ return;
7740
+ prev_jiffy = jiffies;
7741
+
7742
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
7743
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7744
+ in_atomic(), irqs_disabled(),
7745
+ current->pid, current->comm);
7746
+
7747
+ debug_show_held_locks(current);
7748
+ dump_stack();
7749
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7750
+}
7751
+EXPORT_SYMBOL_GPL(__cant_sleep);
69647752 #endif
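__cant_sleep() backs the cant_sleep() annotation (the inverse of might_sleep()): it fires when code that is documented as running in atomic context is in fact entered preemptible, which is why it bails out quietly when IRQs are off or preempt_count() is already elevated. A small hypothetical use:

#include <linux/kernel.h>
#include <linux/percpu-defs.h>

static DEFINE_PER_CPU(unsigned long, my_fastpath_hits);

/* Hypothetical fast path: callers must have preemption (or IRQs) disabled.
 * cant_sleep() documents that contract and, with CONFIG_DEBUG_ATOMIC_SLEEP,
 * enforces it at runtime via __cant_sleep(). */
static void my_fastpath_account(void)
{
	cant_sleep();

	__this_cpu_inc(my_fastpath_hits);	/* only safe while non-preemptible */
}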
69657753
69667754 #ifdef CONFIG_MAGIC_SYSRQ
....@@ -7029,7 +7817,7 @@
70297817
70307818 #ifdef CONFIG_IA64
70317819 /**
7032
- * set_curr_task - set the current task for a given CPU.
7820
+ * ia64_set_curr_task - set the current task for a given CPU.
70337821 * @cpu: the processor in question.
70347822 * @p: the task pointer to set.
70357823 *
....@@ -7195,8 +7983,15 @@
71957983
71967984 if (queued)
71977985 enqueue_task(rq, tsk, queue_flags);
7198
- if (running)
7199
- set_curr_task(rq, tsk);
7986
+ if (running) {
7987
+ set_next_task(rq, tsk);
7988
+ /*
7989
+ * After changing group, the running task may have joined a
7990
+ * throttled one but it's still the running task. Trigger a
7991
+ * resched to make sure that task can still run.
7992
+ */
7993
+ resched_curr(rq);
7994
+ }
72007995
72017996 task_rq_unlock(rq, tsk, &rf);
72027997 }
....@@ -7235,9 +8030,14 @@
72358030
72368031 #ifdef CONFIG_UCLAMP_TASK_GROUP
72378032 /* Propagate the effective uclamp value for the new group */
8033
+ mutex_lock(&uclamp_mutex);
8034
+ rcu_read_lock();
72388035 cpu_util_update_eff(css);
8036
+ rcu_read_unlock();
8037
+ mutex_unlock(&uclamp_mutex);
72398038 #endif
72408039
8040
+ trace_android_rvh_cpu_cgroup_online(css);
72418041 return 0;
72428042 }
72438043
....@@ -7303,6 +8103,9 @@
73038103 if (ret)
73048104 break;
73058105 }
8106
+
8107
+ trace_android_rvh_cpu_cgroup_can_attach(tset, &ret);
8108
+
73068109 return ret;
73078110 }
73088111
....@@ -7313,6 +8116,8 @@
73138116
73148117 cgroup_taskset_for_each(task, css, tset)
73158118 sched_move_task(task);
8119
+
8120
+ trace_android_rvh_cpu_cgroup_attach(tset);
73168121 }
73178122
73188123 #ifdef CONFIG_UCLAMP_TASK_GROUP
....@@ -7324,6 +8129,9 @@
73248129 unsigned int eff[UCLAMP_CNT];
73258130 enum uclamp_id clamp_id;
73268131 unsigned int clamps;
8132
+
8133
+ lockdep_assert_held(&uclamp_mutex);
8134
+ SCHED_WARN_ON(!rcu_read_lock_held());
73278135
73288136 css_for_each_descendant_pre(css, top_css) {
73298137 uc_parent = css_tg(css)->parent
....@@ -7357,7 +8165,7 @@
73578165 }
73588166
73598167 /* Immediately update descendants RUNNABLE tasks */
7360
- uclamp_update_active_tasks(css, clamps);
8168
+ uclamp_update_active_tasks(css);
73618169 }
73628170 }
73638171
....@@ -7414,6 +8222,8 @@
74148222 req = capacity_from_percent(buf);
74158223 if (req.ret)
74168224 return req.ret;
8225
+
8226
+ static_branch_enable(&sched_uclamp_used);
74178227
74188228 mutex_lock(&uclamp_mutex);
74198229 rcu_read_lock();
....@@ -7529,7 +8339,9 @@
75298339 static DEFINE_MUTEX(cfs_constraints_mutex);
75308340
75318341 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7532
-const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
8342
+static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
8343
+/* More than 203 days if BW_SHIFT equals 20. */
8344
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
75338345
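To make the "203 days" remark above concrete (assuming the usual definitions in kernel/sched/sched.h, where BW_SHIFT is 20 and MAX_BW is (1ULL << (64 - BW_SHIFT)) - 1, expressed in microseconds):

/*
 * MAX_BW          = 2^44 - 1 us
 * 2^44 us         = 17,592,186,044,416 us ~= 1.76e7 s ~= 203.6 days
 * max_cfs_runtime = MAX_BW * NSEC_PER_USEC  (the same bound in nanoseconds)
 *
 * The schedulability check converts quota back to microseconds and computes
 * quota_us << BW_SHIFT; keeping quota_us below 2^44 keeps that product
 * within 64 bits, which appears to be the overflow the comment below guards
 * against.
 */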
75348346 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
75358347
....@@ -7555,6 +8367,12 @@
75558367 * feasibility.
75568368 */
75578369 if (period > max_cfs_quota_period)
8370
+ return -EINVAL;
8371
+
8372
+ /*
8373
+ * Bound quota to defend quota against overflow during bandwidth shift.
8374
+ */
8375
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
75588376 return -EINVAL;
75598377
75608378 /*
....@@ -7609,7 +8427,7 @@
76098427 return ret;
76108428 }
76118429
7612
-int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
8430
+static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
76138431 {
76148432 u64 quota, period;
76158433
....@@ -7624,7 +8442,7 @@
76248442 return tg_set_cfs_bandwidth(tg, period, quota);
76258443 }
76268444
7627
-long tg_get_cfs_quota(struct task_group *tg)
8445
+static long tg_get_cfs_quota(struct task_group *tg)
76288446 {
76298447 u64 quota_us;
76308448
....@@ -7637,7 +8455,7 @@
76378455 return quota_us;
76388456 }
76398457
7640
-int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
8458
+static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
76418459 {
76428460 u64 quota, period;
76438461
....@@ -7650,7 +8468,7 @@
76508468 return tg_set_cfs_bandwidth(tg, period, quota);
76518469 }
76528470
7653
-long tg_get_cfs_period(struct task_group *tg)
8471
+static long tg_get_cfs_period(struct task_group *tg)
76548472 {
76558473 u64 cfs_period_us;
76568474
....@@ -8127,172 +8945,7 @@
81278945 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
81288946 };
81298947
8130
-#undef CREATE_TRACE_POINTS
8131
-
8132
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
8133
-
8134
-static inline void
8135
-update_nr_migratory(struct task_struct *p, long delta)
8948
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
81368949 {
8137
- if (unlikely((p->sched_class == &rt_sched_class ||
8138
- p->sched_class == &dl_sched_class) &&
8139
- p->nr_cpus_allowed > 1)) {
8140
- if (p->sched_class == &rt_sched_class)
8141
- task_rq(p)->rt.rt_nr_migratory += delta;
8142
- else
8143
- task_rq(p)->dl.dl_nr_migratory += delta;
8144
- }
8950
+ trace_sched_update_nr_running_tp(rq, count);
81458951 }
8146
-
8147
-static inline void
8148
-migrate_disable_update_cpus_allowed(struct task_struct *p)
8149
-{
8150
- p->cpus_ptr = cpumask_of(smp_processor_id());
8151
- update_nr_migratory(p, -1);
8152
- p->nr_cpus_allowed = 1;
8153
-}
8154
-
8155
-static inline void
8156
-migrate_enable_update_cpus_allowed(struct task_struct *p)
8157
-{
8158
- struct rq *rq;
8159
- struct rq_flags rf;
8160
-
8161
- rq = task_rq_lock(p, &rf);
8162
- p->cpus_ptr = &p->cpus_mask;
8163
- p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
8164
- update_nr_migratory(p, 1);
8165
- task_rq_unlock(rq, p, &rf);
8166
-}
8167
-
8168
-void migrate_disable(void)
8169
-{
8170
- preempt_disable();
8171
-
8172
- if (++current->migrate_disable == 1) {
8173
- this_rq()->nr_pinned++;
8174
- preempt_lazy_disable();
8175
-#ifdef CONFIG_SCHED_DEBUG
8176
- WARN_ON_ONCE(current->pinned_on_cpu >= 0);
8177
- current->pinned_on_cpu = smp_processor_id();
8178
-#endif
8179
- }
8180
-
8181
- preempt_enable();
8182
-}
8183
-EXPORT_SYMBOL(migrate_disable);
8184
-
8185
-static void migrate_disabled_sched(struct task_struct *p)
8186
-{
8187
- if (p->migrate_disable_scheduled)
8188
- return;
8189
-
8190
- migrate_disable_update_cpus_allowed(p);
8191
- p->migrate_disable_scheduled = 1;
8192
-}
8193
-
8194
-static DEFINE_PER_CPU(struct cpu_stop_work, migrate_work);
8195
-static DEFINE_PER_CPU(struct migration_arg, migrate_arg);
8196
-
8197
-void migrate_enable(void)
8198
-{
8199
- struct task_struct *p = current;
8200
- struct rq *rq = this_rq();
8201
- int cpu = task_cpu(p);
8202
-
8203
- WARN_ON_ONCE(p->migrate_disable <= 0);
8204
- if (p->migrate_disable > 1) {
8205
- p->migrate_disable--;
8206
- return;
8207
- }
8208
-
8209
- preempt_disable();
8210
-
8211
-#ifdef CONFIG_SCHED_DEBUG
8212
- WARN_ON_ONCE(current->pinned_on_cpu != cpu);
8213
- current->pinned_on_cpu = -1;
8214
-#endif
8215
-
8216
- WARN_ON_ONCE(rq->nr_pinned < 1);
8217
-
8218
- p->migrate_disable = 0;
8219
- rq->nr_pinned--;
8220
-#ifdef CONFIG_HOTPLUG_CPU
8221
- if (rq->nr_pinned == 0 && unlikely(!cpu_active(cpu)) &&
8222
- takedown_cpu_task)
8223
- wake_up_process(takedown_cpu_task);
8224
-#endif
8225
-
8226
- if (!p->migrate_disable_scheduled)
8227
- goto out;
8228
-
8229
- p->migrate_disable_scheduled = 0;
8230
-
8231
- migrate_enable_update_cpus_allowed(p);
8232
-
8233
- WARN_ON(smp_processor_id() != cpu);
8234
- if (!is_cpu_allowed(p, cpu)) {
8235
- struct migration_arg __percpu *arg;
8236
- struct cpu_stop_work __percpu *work;
8237
- struct rq_flags rf;
8238
-
8239
- work = this_cpu_ptr(&migrate_work);
8240
- arg = this_cpu_ptr(&migrate_arg);
8241
- WARN_ON_ONCE(!arg->done && !work->disabled && work->arg);
8242
-
8243
- arg->task = p;
8244
- arg->done = false;
8245
-
8246
- rq = task_rq_lock(p, &rf);
8247
- update_rq_clock(rq);
8248
- arg->dest_cpu = select_fallback_rq(cpu, p);
8249
- task_rq_unlock(rq, p, &rf);
8250
-
8251
- stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
8252
- arg, work);
8253
- tlb_migrate_finish(p->mm);
8254
- }
8255
-
8256
-out:
8257
- preempt_lazy_enable();
8258
- preempt_enable();
8259
-}
8260
-EXPORT_SYMBOL(migrate_enable);
8261
-
8262
-int cpu_nr_pinned(int cpu)
8263
-{
8264
- struct rq *rq = cpu_rq(cpu);
8265
-
8266
- return rq->nr_pinned;
8267
-}
8268
-
8269
-#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
8270
-static void migrate_disabled_sched(struct task_struct *p)
8271
-{
8272
-}
8273
-
8274
-void migrate_disable(void)
8275
-{
8276
-#ifdef CONFIG_SCHED_DEBUG
8277
- current->migrate_disable++;
8278
-#endif
8279
- barrier();
8280
-}
8281
-EXPORT_SYMBOL(migrate_disable);
8282
-
8283
-void migrate_enable(void)
8284
-{
8285
-#ifdef CONFIG_SCHED_DEBUG
8286
- struct task_struct *p = current;
8287
-
8288
- WARN_ON_ONCE(p->migrate_disable <= 0);
8289
- p->migrate_disable--;
8290
-#endif
8291
- barrier();
8292
-}
8293
-EXPORT_SYMBOL(migrate_enable);
8294
-#else
8295
-static void migrate_disabled_sched(struct task_struct *p)
8296
-{
8297
-}
8298
-#endif