forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-11 6778948f9de86c3cfaf36725a7c87dcff9ba247f
kernel/kernel/sched/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
 * kernel/sched/core.c
 *
@@ -5,6 +6,10 @@
 *
 * Copyright (C) 1991-2002 Linus Torvalds
 */
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+#undef CREATE_TRACE_POINTS
+
 #include "sched.h"

 #include <linux/nospec.h>
@@ -16,14 +21,41 @@
 #include <asm/tlb.h>

 #include "../workqueue_internal.h"
+#include "../../io_uring/io-wq.h"
 #include "../smpboot.h"

 #include "pelt.h"
+#include "smp.h"

-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
+#include <trace/hooks/sched.h>
+#include <trace/hooks/dtask.h>
+
+/*
+ * Export tracepoints that act as a bare tracehook (ie: have no trace event
+ * associated with them) to allow external modules to probe them.
+ */
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_switch);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
+#ifdef CONFIG_SCHEDSTATS
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
+#endif

 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+EXPORT_SYMBOL_GPL(runqueues);

 #ifdef CONFIG_SCHED_DEBUG
 /*
@@ -38,6 +70,7 @@
 const_debug unsigned int sysctl_sched_features =
 #include "features.h"
 0;
+EXPORT_SYMBOL_GPL(sysctl_sched_features);
 #undef SCHED_FEAT
 #endif

@@ -45,11 +78,7 @@
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
-#ifdef CONFIG_PREEMPT_RT_FULL
-const_debug unsigned int sysctl_sched_nr_migrate = 8;
-#else
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
-#endif

 /*
 * period over which we measure -rt task CPU usage in us.
@@ -64,6 +93,100 @@
 * default: 0.95s
 */
 int sysctl_sched_rt_runtime = 950000;
+
+
+/*
+ * Serialization rules:
+ *
+ * Lock order:
+ *
+ * p->pi_lock
+ * rq->lock
+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
+ *
+ * rq1->lock
+ * rq2->lock where: rq1 < rq2
+ *
+ * Regular state:
+ *
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
+ * always looks at the local rq data structures to find the most eligible task
+ * to run next.
+ *
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
+ * the local CPU to avoid bouncing the runqueue state around [ see
+ * ttwu_queue_wakelist() ]
+ *
+ * Task wakeup, specifically wakeups that involve migration, are horribly
+ * complicated to avoid having to take two rq->locks.
+ *
+ * Special state:
+ *
+ * System-calls and anything external will use task_rq_lock() which acquires
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
+ * stable while holding either lock:
+ *
+ * - sched_setaffinity()/
+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
+ * - set_user_nice(): p->se.load, p->*prio
+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
+ * p->se.load, p->rt_priority,
+ * p->dl.dl_{runtime, deadline, period, flags, bw, density}
+ * - sched_setnuma(): p->numa_preferred_nid
+ * - sched_move_task()/
+ * cpu_cgroup_fork(): p->sched_task_group
+ * - uclamp_update_active() p->uclamp*
+ *
+ * p->state <- TASK_*:
+ *
+ * is changed locklessly using set_current_state(), __set_current_state() or
+ * set_special_state(), see their respective comments, or by
+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against
+ * concurrent self.
+ *
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
+ *
+ * is set by activate_task() and cleared by deactivate_task(), under
+ * rq->lock. Non-zero indicates the task is runnable, the special
+ * ON_RQ_MIGRATING state is used for migration without holding both
+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
+ *
+ * p->on_cpu <- { 0, 1 }:
+ *
+ * is set by prepare_task() and cleared by finish_task() such that it will be
+ * set before p is scheduled-in and cleared after p is scheduled-out, both
+ * under rq->lock. Non-zero indicates the task is running on its CPU.
+ *
+ * [ The astute reader will observe that it is possible for two tasks on one
+ * CPU to have ->on_cpu = 1 at the same time. ]
+ *
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
+ *
+ * - Don't call set_task_cpu() on a blocked task:
+ *
+ * We don't care what CPU we're not running on, this simplifies hotplug,
+ * the CPU assignment of blocked tasks isn't required to be valid.
+ *
+ * - for try_to_wake_up(), called under p->pi_lock:
+ *
+ * This allows try_to_wake_up() to only take one rq->lock, see its comment.
+ *
+ * - for migration called under rq->lock:
+ * [ see task_on_rq_migrating() in task_rq_lock() ]
+ *
+ * o move_queued_task()
+ * o detach_task()
+ *
+ * - for migration called under double_rq_lock():
+ *
+ * o __migrate_swap_task()
+ * o push_rt_task() / pull_rt_task()
+ * o push_dl_task() / pull_dl_task()
+ * o dl_task_offline_migration()
+ *
+ */

 /*
 * __task_rq_lock - lock the rq @p resides on.
@@ -88,6 +211,7 @@
 cpu_relax();
 }
 }
+EXPORT_SYMBOL_GPL(__task_rq_lock);

 /*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
@@ -130,6 +254,7 @@
 cpu_relax();
 }
 }
+EXPORT_SYMBOL_GPL(task_rq_lock);

 /*
 * RQ-clock updating methods:
@@ -210,7 +335,15 @@
 rq->clock += delta;
 update_rq_clock_task(rq, delta);
 }
+EXPORT_SYMBOL_GPL(update_rq_clock);

+static inline void
+rq_csd_init(struct rq *rq, struct __call_single_data *csd, smp_call_func_t func)
+{
+ csd->flags = 0;
+ csd->func = func;
+ csd->info = rq;
+}

 #ifdef CONFIG_SCHED_HRTICK
 /*
@@ -247,8 +380,9 @@
 static void __hrtick_restart(struct rq *rq)
 {
 struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time = rq->hrtick_time;

- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
 }

 /*
@@ -261,7 +395,6 @@

 rq_lock(rq, &rf);
 __hrtick_restart(rq);
- rq->hrtick_csd_pending = 0;
 rq_unlock(rq, &rf);
 }

@@ -273,7 +406,6 @@
 void hrtick_start(struct rq *rq, u64 delay)
 {
 struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time;
 s64 delta;

 /*
@@ -281,16 +413,12 @@
 * doesn't make sense and can cause timer DoS.
 */
 delta = max_t(s64, delay, 10000LL);
- time = ktime_add_ns(timer->base->get_time(), delta);
+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);

- hrtimer_set_expires(timer, time);
-
- if (rq == this_rq()) {
+ if (rq == this_rq())
 __hrtick_restart(rq);
- } else if (!rq->hrtick_csd_pending) {
+ else
 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
- rq->hrtick_csd_pending = 1;
- }
 }

 #else
@@ -307,20 +435,16 @@
 */
 delay = max_t(u64, delay, 10000LL);
 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
 }
+
 #endif /* CONFIG_SMP */

 static void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
- rq->hrtick_csd_pending = 0;
-
- rq->hrtick_csd.flags = 0;
- rq->hrtick_csd.func = __hrtick_start;
- rq->hrtick_csd.info = rq;
+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
 #endif
-
 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 rq->hrtick_timer.function = hrtick;
 }
@@ -403,15 +527,9 @@
 #endif
 #endif

-void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
- bool sleeper)
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
 {
- struct wake_q_node *node;
-
- if (sleeper)
- node = &task->wake_q_sleeper;
- else
- node = &task->wake_q;
+ struct wake_q_node *node = &task->wake_q;

 /*
 * Atomically grab the task, if ->wake_q is !nil already it means
@@ -422,50 +540,79 @@
 * state, even in the failed case, an explicit smp_mb() must be used.
 */
 smp_mb__before_atomic();
- if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
- return;
-
- head->count++;
-
- get_task_struct(task);
+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
+ return false;

 /*
 * The head is context local, there can be no concurrency.
 */
 *head->lastp = node;
 head->lastp = &node->next;
+ head->count++;
+ return true;
 }

-static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
- int sibling_count_hint);
-void __wake_up_q(struct wake_q_head *head, bool sleeper)
+/**
+ * wake_q_add() - queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ */
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ if (__wake_q_add(head, task))
+ get_task_struct(task);
+}
+
+/**
+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ *
+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
+ * that already hold reference to @task can call the 'safe' version and trust
+ * wake_q to do the right thing depending whether or not the @task is already
+ * queued for wakeup.
+ */
+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
+{
+ if (!__wake_q_add(head, task))
+ put_task_struct(task);
+}
+
+void wake_up_q(struct wake_q_head *head)
 {
 struct wake_q_node *node = head->first;

 while (node != WAKE_Q_TAIL) {
 struct task_struct *task;

- if (sleeper)
- task = container_of(node, struct task_struct, wake_q_sleeper);
- else
- task = container_of(node, struct task_struct, wake_q);
+ task = container_of(node, struct task_struct, wake_q);
 BUG_ON(!task);
 /* Task can safely be re-inserted now: */
 node = node->next;
- if (sleeper)
- task->wake_q_sleeper.next = NULL;
- else
- task->wake_q.next = NULL;
+ task->wake_q.next = NULL;
+ task->wake_q_count = head->count;
+
 /*
 * wake_up_process() executes a full barrier, which pairs with
 * the queueing in wake_q_add() so as not to miss wakeups.
 */
- if (sleeper)
- wake_up_lock_sleeper(task);
- else
- wake_up_process(task);
-
+ wake_up_process(task);
+ task->wake_q_count = 0;
 put_task_struct(task);
 }
 }
@@ -495,57 +642,12 @@
 return;
 }

-#ifdef CONFIG_PREEMPT
 if (set_nr_and_not_polling(curr))
-#else
- if (set_nr_and_not_polling(curr) && (rq->curr == rq->idle))
-#endif
 smp_send_reschedule(cpu);
 else
 trace_sched_wake_idle_without_ipi(cpu);
 }
-
-#ifdef CONFIG_PREEMPT_LAZY
-
-static int tsk_is_polling(struct task_struct *p)
-{
-#ifdef TIF_POLLING_NRFLAG
- return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
-#else
- return 0;
-#endif
-}
-
-void resched_curr_lazy(struct rq *rq)
-{
- struct task_struct *curr = rq->curr;
- int cpu;
-
- if (!sched_feat(PREEMPT_LAZY)) {
- resched_curr(rq);
- return;
- }
-
- lockdep_assert_held(&rq->lock);
-
- if (test_tsk_need_resched(curr))
- return;
-
- if (test_tsk_need_resched_lazy(curr))
- return;
-
- set_tsk_need_resched_lazy(curr);
-
- cpu = cpu_of(rq);
- if (cpu == smp_processor_id())
- return;
-
- /* NEED_RESCHED_LAZY must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(curr))
- smp_send_reschedule(cpu);
-}
-#endif
+EXPORT_SYMBOL_GPL(resched_curr);

 void resched_cpu(int cpu)
 {
@@ -570,27 +672,49 @@
 */
 int get_nohz_timer_target(void)
 {
- int i, cpu = smp_processor_id();
+ int i, cpu = smp_processor_id(), default_cpu = -1;
 struct sched_domain *sd;

- if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
- return cpu;
+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
+ if (!idle_cpu(cpu))
+ return cpu;
+ default_cpu = cpu;
+ }

 rcu_read_lock();
 for_each_domain(cpu, sd) {
- for_each_cpu(i, sched_domain_span(sd)) {
+ for_each_cpu_and(i, sched_domain_span(sd),
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
 if (cpu == i)
 continue;

- if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
+ if (!idle_cpu(i)) {
 cpu = i;
 goto unlock;
 }
 }
 }

- if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
- cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+ if (default_cpu == -1) {
+ for_each_cpu_and(i, cpu_active_mask,
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
+ if (cpu == i)
+ continue;
+
+ if (!idle_cpu(i)) {
+ cpu = i;
+ goto unlock;
+ }
+ }
+
+ /* no active, not-idle, housekeeping CPU found. */
+ default_cpu = cpumask_any(cpu_active_mask);
+
+ if (unlikely(default_cpu >= nr_cpu_ids))
+ goto unlock;
+ }
+
+ cpu = default_cpu;
 unlock:
 rcu_read_unlock();
 return cpu;
@@ -650,29 +774,23 @@
 wake_up_idle_cpu(cpu);
 }

-static inline bool got_nohz_idle_kick(void)
+static void nohz_csd_func(void *info)
 {
- int cpu = smp_processor_id();
-
- if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
- return false;
-
- if (idle_cpu(cpu) && !need_resched())
- return true;
+ struct rq *rq = info;
+ int cpu = cpu_of(rq);
+ unsigned int flags;

 /*
- * We can't run Idle Load Balance on this CPU for this time so we
- * cancel it and clear NOHZ_BALANCE_KICK
+ * Release the rq::nohz_csd.
 */
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
- return false;
-}
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+ WARN_ON(!(flags & NOHZ_KICK_MASK));

-#else /* CONFIG_NO_HZ_COMMON */
-
-static inline bool got_nohz_idle_kick(void)
-{
- return false;
+ rq->idle_balance = idle_cpu(cpu);
+ if (rq->idle_balance && !need_resched()) {
+ rq->nohz_idle_balance = flags;
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+ }
 }

 #endif /* CONFIG_NO_HZ_COMMON */
@@ -763,18 +881,18 @@
 }
 #endif

-static void set_load_weight(struct task_struct *p, bool update_load)
+static void set_load_weight(struct task_struct *p)
 {
+ bool update_load = !(READ_ONCE(p->state) & TASK_NEW);
 int prio = p->static_prio - MAX_RT_PRIO;
 struct load_weight *load = &p->se.load;

 /*
 * SCHED_IDLE tasks get minimal weight:
 */
- if (idle_policy(p->policy)) {
+ if (task_has_idle_policy(p)) {
 load->weight = scale_load(WEIGHT_IDLEPRIO);
 load->inv_weight = WMULT_IDLEPRIO;
- p->se.runnable_weight = load->weight;
 return;
 }

@@ -787,7 +905,6 @@
 } else {
 load->weight = scale_load(sched_prio_to_weight[prio]);
 load->inv_weight = sched_prio_to_wmult[prio];
- p->se.runnable_weight = load->weight;
 }
 }

@@ -810,8 +927,46 @@
 /* Max allowed maximum utilization */
 unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;

+/*
+ * By default RT tasks run at the maximum performance point/capacity of the
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
+ * SCHED_CAPACITY_SCALE.
+ *
+ * This knob allows admins to change the default behavior when uclamp is being
+ * used. In battery powered devices, particularly, running at the maximum
+ * capacity and frequency will increase energy consumption and shorten the
+ * battery life.
+ *
+ * This knob only affects RT tasks whose uclamp_se->user_defined == false.
+ *
+ * This knob will not override the system default sched_util_clamp_min defined
+ * above.
+ */
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+
 /* All clamps are required to be less or equal than these values */
 static struct uclamp_se uclamp_default[UCLAMP_CNT];
+
+/*
+ * This static key is used to reduce the uclamp overhead in the fast path. It
+ * primarily disables the call to uclamp_rq_{inc, dec}() in
+ * enqueue/dequeue_task().
+ *
+ * This allows users to continue to enable uclamp in their kernel config with
+ * minimum uclamp overhead in the fast path.
+ *
+ * As soon as userspace modifies any of the uclamp knobs, the static key is
+ * enabled, since we have actual users that make use of uclamp
+ * functionality.
+ *
+ * The knobs that would enable this static key are:
+ *
+ * * A task modifying its uclamp value with sched_setattr().
+ * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
+ * * An admin modifying the cgroup cpu.uclamp.{min, max}
+ */
+DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
+EXPORT_SYMBOL_GPL(sched_uclamp_used);

 /* Integer rounded range for each bucket */
 #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
@@ -822,11 +977,6 @@
 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
 {
 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
-}
-
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
-{
- return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
 }

 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
@@ -892,12 +1042,79 @@
 return uclamp_idle_value(rq, clamp_id, clamp_value);
 }

+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+ unsigned int default_util_min;
+ struct uclamp_se *uc_se;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ uc_se = &p->uclamp_req[UCLAMP_MIN];
+
+ /* Only sync if user didn't override the default */
+ if (uc_se->user_defined)
+ return;
+
+ default_util_min = sysctl_sched_uclamp_util_min_rt_default;
+ uclamp_se_set(uc_se, default_util_min, false);
+}
+
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (!rt_task(p))
+ return;
+
+ /* Protect updates to p->uclamp_* */
+ rq = task_rq_lock(p, &rf);
+ __uclamp_update_util_min_rt_default(p);
+ task_rq_unlock(rq, p, &rf);
+}
+
+static void uclamp_sync_util_min_rt_default(void)
+{
+ struct task_struct *g, *p;
+
+ /*
+ * copy_process() sysctl_uclamp
+ * uclamp_min_rt = X;
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
+ * // link thread smp_mb__after_spinlock()
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
+ * sched_post_fork() for_each_process_thread()
+ * __uclamp_sync_rt() __uclamp_sync_rt()
+ *
+ * Ensures that either sched_post_fork() will observe the new
+ * uclamp_min_rt or for_each_process_thread() will observe the new
+ * task.
+ */
+ read_lock(&tasklist_lock);
+ smp_mb__after_spinlock();
+ read_unlock(&tasklist_lock);
+
+ rcu_read_lock();
+ for_each_process_thread(g, p)
+ uclamp_update_util_min_rt_default(p);
+ rcu_read_unlock();
+}
+
+#if IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)
+void rockchip_perf_uclamp_sync_util_min_rt_default(void)
+{
+ uclamp_sync_util_min_rt_default();
+}
+EXPORT_SYMBOL(rockchip_perf_uclamp_sync_util_min_rt_default);
+#endif
+
 static inline struct uclamp_se
 uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
 {
+ /* Copy by value as we could modify it */
 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
 #ifdef CONFIG_UCLAMP_TASK_GROUP
- struct uclamp_se uc_max;
+ unsigned int tg_min, tg_max, value;

 /*
 * Tasks in autogroups or root task group will be
@@ -908,9 +1125,11 @@
 if (task_group(p) == &root_task_group)
 return uc_req;

- uc_max = task_group(p)->uclamp[clamp_id];
- if (uc_req.value > uc_max.value || !uc_req.user_defined)
- return uc_max;
+ tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
+ tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
+ value = uc_req.value;
+ value = clamp(value, tg_min, tg_max);
+ uclamp_se_set(&uc_req, value, false);
 #endif

 return uc_req;
@@ -929,6 +1148,12 @@
 {
 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
 struct uclamp_se uc_max = uclamp_default[clamp_id];
+ struct uclamp_se uc_eff;
+ int ret = 0;
+
+ trace_android_rvh_uclamp_eff_get(p, clamp_id, &uc_max, &uc_eff, &ret);
+ if (ret)
+ return uc_eff;

 /* System default restrictions always apply */
 if (unlikely(uc_req.value > uc_max.value))
@@ -949,6 +1174,7 @@

 return (unsigned long)uc_eff.value;
 }
+EXPORT_SYMBOL_GPL(uclamp_eff_value);

 /*
 * When a task is enqueued on a rq, the clamp bucket currently defined by the
@@ -1009,10 +1235,38 @@

 lockdep_assert_held(&rq->lock);

+ /*
+ * If sched_uclamp_used was enabled after task @p was enqueued,
+ * we could end up with unbalanced call to uclamp_rq_dec_id().
+ *
+ * In this case the uc_se->active flag should be false since no uclamp
+ * accounting was performed at enqueue time and we can just return
+ * here.
+ *
+ * Need to be careful of the following enqueue/dequeue ordering
+ * problem too
+ *
+ * enqueue(taskA)
+ * // sched_uclamp_used gets enabled
+ * enqueue(taskB)
+ * dequeue(taskA)
+ * // Must not decrement bucket->tasks here
+ * dequeue(taskB)
+ *
+ * where we could end up with stale data in uc_se and
+ * bucket[uc_se->bucket_id].
+ *
+ * The following check here eliminates the possibility of such a race.
+ */
+ if (unlikely(!uc_se->active))
+ return;
+
 bucket = &uc_rq->bucket[uc_se->bucket_id];
+
 SCHED_WARN_ON(!bucket->tasks);
 if (likely(bucket->tasks))
 bucket->tasks--;
+
 uc_se->active = false;

 /*
....@@ -1040,6 +1294,15 @@
10401294 {
10411295 enum uclamp_id clamp_id;
10421296
1297
+ /*
1298
+ * Avoid any overhead until uclamp is actually used by the userspace.
1299
+ *
1300
+ * The condition is constructed such that a NOP is generated when
1301
+ * sched_uclamp_used is disabled.
1302
+ */
1303
+ if (!static_branch_unlikely(&sched_uclamp_used))
1304
+ return;
1305
+
10431306 if (unlikely(!p->sched_class->uclamp_enabled))
10441307 return;
10451308
....@@ -1055,6 +1318,15 @@
10551318 {
10561319 enum uclamp_id clamp_id;
10571320
1321
+ /*
1322
+ * Avoid any overhead until uclamp is actually used by the userspace.
1323
+ *
1324
+ * The condition is constructed such that a NOP is generated when
1325
+ * sched_uclamp_used is disabled.
1326
+ */
1327
+ if (!static_branch_unlikely(&sched_uclamp_used))
1328
+ return;
1329
+
10581330 if (unlikely(!p->sched_class->uclamp_enabled))
10591331 return;
10601332
....@@ -1062,9 +1334,27 @@
10621334 uclamp_rq_dec_id(rq, p, clamp_id);
10631335 }
10641336
1065
-static inline void
1066
-uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
1337
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
1338
+ enum uclamp_id clamp_id)
10671339 {
1340
+ if (!p->uclamp[clamp_id].active)
1341
+ return;
1342
+
1343
+ uclamp_rq_dec_id(rq, p, clamp_id);
1344
+ uclamp_rq_inc_id(rq, p, clamp_id);
1345
+
1346
+ /*
1347
+ * Make sure to clear the idle flag if we've transiently reached 0
1348
+ * active tasks on rq.
1349
+ */
1350
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1351
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1352
+}
1353
+
1354
+static inline void
1355
+uclamp_update_active(struct task_struct *p)
1356
+{
1357
+ enum uclamp_id clamp_id;
10681358 struct rq_flags rf;
10691359 struct rq *rq;
10701360
....@@ -1084,30 +1374,22 @@
10841374 * affecting a valid clamp bucket, the next time it's enqueued,
10851375 * it will already see the updated clamp bucket value.
10861376 */
1087
- if (p->uclamp[clamp_id].active) {
1088
- uclamp_rq_dec_id(rq, p, clamp_id);
1089
- uclamp_rq_inc_id(rq, p, clamp_id);
1090
- }
1377
+ for_each_clamp_id(clamp_id)
1378
+ uclamp_rq_reinc_id(rq, p, clamp_id);
10911379
10921380 task_rq_unlock(rq, p, &rf);
10931381 }
10941382
10951383 #ifdef CONFIG_UCLAMP_TASK_GROUP
10961384 static inline void
1097
-uclamp_update_active_tasks(struct cgroup_subsys_state *css,
1098
- unsigned int clamps)
1385
+uclamp_update_active_tasks(struct cgroup_subsys_state *css)
10991386 {
1100
- enum uclamp_id clamp_id;
11011387 struct css_task_iter it;
11021388 struct task_struct *p;
11031389
11041390 css_task_iter_start(css, 0, &it);
1105
- while ((p = css_task_iter_next(&it))) {
1106
- for_each_clamp_id(clamp_id) {
1107
- if ((0x1 << clamp_id) & clamps)
1108
- uclamp_update_active(p, clamp_id);
1109
- }
1110
- }
1391
+ while ((p = css_task_iter_next(&it)))
1392
+ uclamp_update_active(p);
11111393 css_task_iter_end(&it);
11121394 }
11131395
....@@ -1130,16 +1412,16 @@
11301412 #endif
11311413
11321414 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1133
- void __user *buffer, size_t *lenp,
1134
- loff_t *ppos)
1415
+ void *buffer, size_t *lenp, loff_t *ppos)
11351416 {
11361417 bool update_root_tg = false;
1137
- int old_min, old_max;
1418
+ int old_min, old_max, old_min_rt;
11381419 int result;
11391420
11401421 mutex_lock(&uclamp_mutex);
11411422 old_min = sysctl_sched_uclamp_util_min;
11421423 old_max = sysctl_sched_uclamp_util_max;
1424
+ old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
11431425
11441426 result = proc_dointvec(table, write, buffer, lenp, ppos);
11451427 if (result)
....@@ -1148,7 +1430,9 @@
11481430 goto done;
11491431
11501432 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1151
- sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
1433
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1434
+ sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1435
+
11521436 result = -EINVAL;
11531437 goto undo;
11541438 }
....@@ -1164,8 +1448,15 @@
11641448 update_root_tg = true;
11651449 }
11661450
1167
- if (update_root_tg)
1451
+ if (update_root_tg) {
1452
+ static_branch_enable(&sched_uclamp_used);
11681453 uclamp_update_root_tg();
1454
+ }
1455
+
1456
+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1457
+ static_branch_enable(&sched_uclamp_used);
1458
+ uclamp_sync_util_min_rt_default();
1459
+ }
11691460
11701461 /*
11711462 * We update all RUNNABLE tasks only when task groups are in use.
....@@ -1178,6 +1469,7 @@
11781469 undo:
11791470 sysctl_sched_uclamp_util_min = old_min;
11801471 sysctl_sched_uclamp_util_max = old_max;
1472
+ sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
11811473 done:
11821474 mutex_unlock(&uclamp_mutex);
11831475
....@@ -1187,20 +1479,61 @@
11871479 static int uclamp_validate(struct task_struct *p,
11881480 const struct sched_attr *attr)
11891481 {
1190
- unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1191
- unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1482
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
1483
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
11921484
1193
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1194
- lower_bound = attr->sched_util_min;
1195
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1196
- upper_bound = attr->sched_util_max;
1485
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1486
+ util_min = attr->sched_util_min;
11971487
1198
- if (lower_bound > upper_bound)
1488
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1489
+ return -EINVAL;
1490
+ }
1491
+
1492
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1493
+ util_max = attr->sched_util_max;
1494
+
1495
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1496
+ return -EINVAL;
1497
+ }
1498
+
1499
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
11991500 return -EINVAL;
1200
- if (upper_bound > SCHED_CAPACITY_SCALE)
1201
- return -EINVAL;
1501
+
1502
+ /*
1503
+ * We have valid uclamp attributes; make sure uclamp is enabled.
1504
+ *
1505
+ * We need to do that here, because enabling static branches is a
1506
+ * blocking operation which obviously cannot be done while holding
1507
+ * scheduler locks.
1508
+ */
1509
+ static_branch_enable(&sched_uclamp_used);
12021510
12031511 return 0;
1512
+}
1513
+
1514
+static bool uclamp_reset(const struct sched_attr *attr,
1515
+ enum uclamp_id clamp_id,
1516
+ struct uclamp_se *uc_se)
1517
+{
1518
+ /* Reset on sched class change for a non user-defined clamp value. */
1519
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1520
+ !uc_se->user_defined)
1521
+ return true;
1522
+
1523
+ /* Reset on sched_util_{min,max} == -1. */
1524
+ if (clamp_id == UCLAMP_MIN &&
1525
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1526
+ attr->sched_util_min == -1) {
1527
+ return true;
1528
+ }
1529
+
1530
+ if (clamp_id == UCLAMP_MAX &&
1531
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1532
+ attr->sched_util_max == -1) {
1533
+ return true;
1534
+ }
1535
+
1536
+ return false;
12041537 }
12051538
12061539 static void __setscheduler_uclamp(struct task_struct *p,
....@@ -1208,40 +1541,41 @@
12081541 {
12091542 enum uclamp_id clamp_id;
12101543
1211
- /*
1212
- * On scheduling class change, reset to default clamps for tasks
1213
- * without a task-specific value.
1214
- */
12151544 for_each_clamp_id(clamp_id) {
12161545 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1217
- unsigned int clamp_value = uclamp_none(clamp_id);
1546
+ unsigned int value;
12181547
1219
- /* Keep using defined clamps across class changes */
1220
- if (uc_se->user_defined)
1548
+ if (!uclamp_reset(attr, clamp_id, uc_se))
12211549 continue;
12221550
1223
- /* By default, RT tasks always get 100% boost */
1224
- if (sched_feat(SUGOV_RT_MAX_FREQ) &&
1225
- unlikely(rt_task(p) &&
1226
- clamp_id == UCLAMP_MIN)) {
1551
+ /*
1552
+ * RT tasks by default have a 100% boost value that can be modified
1553
+ * at runtime.
1554
+ */
1555
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1556
+ value = sysctl_sched_uclamp_util_min_rt_default;
1557
+ else
1558
+ value = uclamp_none(clamp_id);
12271559
1228
- clamp_value = uclamp_none(UCLAMP_MAX);
1229
- }
1560
+ uclamp_se_set(uc_se, value, false);
12301561
1231
- uclamp_se_set(uc_se, clamp_value, false);
12321562 }
12331563
12341564 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
12351565 return;
12361566
1237
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1567
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1568
+ attr->sched_util_min != -1) {
12381569 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
12391570 attr->sched_util_min, true);
1571
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MIN, attr->sched_util_min);
12401572 }
12411573
1242
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1574
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1575
+ attr->sched_util_max != -1) {
12431576 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
12441577 attr->sched_util_max, true);
1578
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max);
12451579 }
12461580 }
12471581
....@@ -1249,6 +1583,10 @@
12491583 {
12501584 enum uclamp_id clamp_id;
12511585
1586
+ /*
1587
+ * We don't need to hold task_rq_lock() when updating p->uclamp_* here
1588
+ * as the task is still at its early fork stages.
1589
+ */
12521590 for_each_clamp_id(clamp_id)
12531591 p->uclamp[clamp_id].active = false;
12541592
....@@ -1261,39 +1599,24 @@
12611599 }
12621600 }
12631601
1264
-#ifdef CONFIG_SMP
1265
-unsigned int uclamp_task(struct task_struct *p)
1602
+static void uclamp_post_fork(struct task_struct *p)
12661603 {
1267
- unsigned long util;
1268
-
1269
- util = task_util_est(p);
1270
- util = max(util, uclamp_eff_value(p, UCLAMP_MIN));
1271
- util = min(util, uclamp_eff_value(p, UCLAMP_MAX));
1272
-
1273
- return util;
1604
+ uclamp_update_util_min_rt_default(p);
12741605 }
12751606
1276
-bool uclamp_boosted(struct task_struct *p)
1607
+static void __init init_uclamp_rq(struct rq *rq)
12771608 {
1278
- return uclamp_eff_value(p, UCLAMP_MIN) > 0;
1609
+ enum uclamp_id clamp_id;
1610
+ struct uclamp_rq *uc_rq = rq->uclamp;
1611
+
1612
+ for_each_clamp_id(clamp_id) {
1613
+ uc_rq[clamp_id] = (struct uclamp_rq) {
1614
+ .value = uclamp_none(clamp_id)
1615
+ };
1616
+ }
1617
+
1618
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
12791619 }
1280
-
1281
-bool uclamp_latency_sensitive(struct task_struct *p)
1282
-{
1283
-#ifdef CONFIG_UCLAMP_TASK_GROUP
1284
- struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id);
1285
- struct task_group *tg;
1286
-
1287
- if (!css)
1288
- return false;
1289
- tg = container_of(css, struct task_group, css);
1290
-
1291
- return tg->latency_sensitive;
1292
-#else
1293
- return false;
1294
-#endif
1295
-}
1296
-#endif /* CONFIG_SMP */
12971620
12981621 static void __init init_uclamp(void)
12991622 {
....@@ -1301,13 +1624,8 @@
13011624 enum uclamp_id clamp_id;
13021625 int cpu;
13031626
1304
- mutex_init(&uclamp_mutex);
1305
-
1306
- for_each_possible_cpu(cpu) {
1307
- memset(&cpu_rq(cpu)->uclamp, 0,
1308
- sizeof(struct uclamp_rq)*UCLAMP_CNT);
1309
- cpu_rq(cpu)->uclamp_flags = 0;
1310
- }
1627
+ for_each_possible_cpu(cpu)
1628
+ init_uclamp_rq(cpu_rq(cpu));
13111629
13121630 for_each_clamp_id(clamp_id) {
13131631 uclamp_se_set(&init_task.uclamp_req[clamp_id],
....@@ -1336,41 +1654,7 @@
13361654 static void __setscheduler_uclamp(struct task_struct *p,
13371655 const struct sched_attr *attr) { }
13381656 static inline void uclamp_fork(struct task_struct *p) { }
1339
-
1340
-long schedtune_task_margin(struct task_struct *task);
1341
-
1342
-#ifdef CONFIG_SMP
1343
-unsigned int uclamp_task(struct task_struct *p)
1344
-{
1345
- unsigned long util = task_util_est(p);
1346
-#ifdef CONFIG_SCHED_TUNE
1347
- long margin = schedtune_task_margin(p);
1348
-
1349
- trace_sched_boost_task(p, util, margin);
1350
-
1351
- util += margin;
1352
-#endif
1353
-
1354
- return util;
1355
-}
1356
-
1357
-bool uclamp_boosted(struct task_struct *p)
1358
-{
1359
-#ifdef CONFIG_SCHED_TUNE
1360
- return schedtune_task_boost(p) > 0;
1361
-#endif
1362
- return false;
1363
-}
1364
-
1365
-bool uclamp_latency_sensitive(struct task_struct *p)
1366
-{
1367
-#ifdef CONFIG_SCHED_TUNE
1368
- return schedtune_prefer_idle(p) != 0;
1369
-#endif
1370
- return false;
1371
-}
1372
-#endif /* CONFIG_SMP */
1373
-
1657
+static inline void uclamp_post_fork(struct task_struct *p) { }
13741658 static inline void init_uclamp(void) { }
13751659 #endif /* CONFIG_UCLAMP_TASK */
13761660
....@@ -1385,7 +1669,9 @@
13851669 }
13861670
13871671 uclamp_rq_inc(rq, p);
1672
+ trace_android_rvh_enqueue_task(rq, p, flags);
13881673 p->sched_class->enqueue_task(rq, p, flags);
1674
+ trace_android_rvh_after_enqueue_task(rq, p);
13891675 }
13901676
13911677 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
....@@ -1399,31 +1685,39 @@
13991685 }
14001686
14011687 uclamp_rq_dec(rq, p);
1688
+ trace_android_rvh_dequeue_task(rq, p, flags);
14021689 p->sched_class->dequeue_task(rq, p, flags);
1690
+ trace_android_rvh_after_dequeue_task(rq, p);
14031691 }
14041692
14051693 void activate_task(struct rq *rq, struct task_struct *p, int flags)
14061694 {
1407
- if (task_contributes_to_load(p))
1408
- rq->nr_uninterruptible--;
1409
-
14101695 enqueue_task(rq, p, flags);
1696
+
1697
+ p->on_rq = TASK_ON_RQ_QUEUED;
14111698 }
1699
+EXPORT_SYMBOL_GPL(activate_task);
14121700
14131701 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
14141702 {
1415
- if (task_contributes_to_load(p))
1416
- rq->nr_uninterruptible++;
1703
+ p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
14171704
14181705 dequeue_task(rq, p, flags);
14191706 }
1707
+EXPORT_SYMBOL_GPL(deactivate_task);
14201708
1421
-/*
1422
- * __normal_prio - return the priority that is based on the static prio
1423
- */
1424
-static inline int __normal_prio(struct task_struct *p)
1709
+static inline int __normal_prio(int policy, int rt_prio, int nice)
14251710 {
1426
- return p->static_prio;
1711
+ int prio;
1712
+
1713
+ if (dl_policy(policy))
1714
+ prio = MAX_DL_PRIO - 1;
1715
+ else if (rt_policy(policy))
1716
+ prio = MAX_RT_PRIO - 1 - rt_prio;
1717
+ else
1718
+ prio = NICE_TO_PRIO(nice);
1719
+
1720
+ return prio;
14271721 }
14281722
14291723 /*
....@@ -1435,15 +1729,7 @@
14351729 */
14361730 static inline int normal_prio(struct task_struct *p)
14371731 {
1438
- int prio;
1439
-
1440
- if (task_has_dl_policy(p))
1441
- prio = MAX_DL_PRIO-1;
1442
- else if (task_has_rt_policy(p))
1443
- prio = MAX_RT_PRIO-1 - p->rt_priority;
1444
- else
1445
- prio = __normal_prio(p);
1446
- return prio;
1732
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
14471733 }
14481734
14491735 /*
....@@ -1499,20 +1785,10 @@
14991785
15001786 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
15011787 {
1502
- const struct sched_class *class;
1503
-
1504
- if (p->sched_class == rq->curr->sched_class) {
1788
+ if (p->sched_class == rq->curr->sched_class)
15051789 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1506
- } else {
1507
- for_each_class(class) {
1508
- if (class == rq->curr->sched_class)
1509
- break;
1510
- if (class == p->sched_class) {
1511
- resched_curr(rq);
1512
- break;
1513
- }
1514
- }
1515
- }
1790
+ else if (p->sched_class > rq->curr->sched_class)
1791
+ resched_curr(rq);
15161792
15171793 /*
15181794 * A queue event has occurred, and we're going to schedule. In
....@@ -1521,22 +1797,12 @@
15211797 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
15221798 rq_clock_skip_update(rq);
15231799 }
1800
+EXPORT_SYMBOL_GPL(check_preempt_curr);
15241801
15251802 #ifdef CONFIG_SMP
15261803
1527
-static inline bool is_per_cpu_kthread(struct task_struct *p)
1528
-{
1529
- if (!(p->flags & PF_KTHREAD))
1530
- return false;
1531
-
1532
- if (p->nr_cpus_allowed != 1)
1533
- return false;
1534
-
1535
- return true;
1536
-}
1537
-
15381804 /*
1539
- * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
1805
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
15401806 * __set_cpus_allowed_ptr() and select_fallback_rq().
15411807 */
15421808 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
....@@ -1544,10 +1810,13 @@
15441810 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
15451811 return false;
15461812
1547
- if (is_per_cpu_kthread(p) || __migrate_disabled(p))
1813
+ if (is_per_cpu_kthread(p))
15481814 return cpu_online(cpu);
15491815
1550
- return cpu_active(cpu);
1816
+ if (!cpu_active(cpu))
1817
+ return false;
1818
+
1819
+ return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
15511820 }
15521821
15531822 /*
....@@ -1572,19 +1841,29 @@
15721841 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
15731842 struct task_struct *p, int new_cpu)
15741843 {
1844
+ int detached = 0;
1845
+
15751846 lockdep_assert_held(&rq->lock);
15761847
1577
- WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
1578
- dequeue_task(rq, p, DEQUEUE_NOCLOCK);
1579
- set_task_cpu(p, new_cpu);
1580
- rq_unlock(rq, rf);
1848
+ /*
1849
+ * The vendor hook may drop the lock temporarily, so
1850
+ * pass the rq flags to unpin the lock. We expect the
1851
+ * rq lock to be held after return.
1852
+ */
1853
+ trace_android_rvh_migrate_queued_task(rq, rf, p, new_cpu, &detached);
1854
+ if (detached)
1855
+ goto attach;
15811856
1857
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
1858
+ set_task_cpu(p, new_cpu);
1859
+
1860
+attach:
1861
+ rq_unlock(rq, rf);
15821862 rq = cpu_rq(new_cpu);
15831863
15841864 rq_lock(rq, rf);
15851865 BUG_ON(task_cpu(p) != new_cpu);
1586
- enqueue_task(rq, p, 0);
1587
- p->on_rq = TASK_ON_RQ_QUEUED;
1866
+ activate_task(rq, p, 0);
15881867 check_preempt_curr(rq, p, 0);
15891868
15901869 return rq;
....@@ -1593,7 +1872,6 @@
15931872 struct migration_arg {
15941873 struct task_struct *task;
15951874 int dest_cpu;
1596
- bool done;
15971875 };
15981876
15991877 /*
....@@ -1629,11 +1907,6 @@
16291907 struct task_struct *p = arg->task;
16301908 struct rq *rq = this_rq();
16311909 struct rq_flags rf;
1632
- int dest_cpu = arg->dest_cpu;
1633
-
1634
- /* We don't look at arg after this point. */
1635
- smp_mb();
1636
- arg->done = true;
16371910
16381911 /*
16391912 * The original target CPU might have gone down and we might
....@@ -1645,7 +1918,7 @@
16451918 * __migrate_task() such that we will not miss enforcing cpus_ptr
16461919 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
16471920 */
1648
- sched_ttwu_pending();
1921
+ flush_smp_call_function_from_idle();
16491922
16501923 raw_spin_lock(&p->pi_lock);
16511924 rq_lock(rq, &rf);
....@@ -1656,9 +1929,9 @@
16561929 */
16571930 if (task_rq(p) == rq) {
16581931 if (task_on_rq_queued(p))
1659
- rq = __migrate_task(rq, &rf, p, dest_cpu);
1932
+ rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
16601933 else
1661
- p->wake_cpu = dest_cpu;
1934
+ p->wake_cpu = arg->dest_cpu;
16621935 }
16631936 rq_unlock(rq, &rf);
16641937 raw_spin_unlock(&p->pi_lock);
....@@ -1674,17 +1947,9 @@
16741947 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
16751948 {
16761949 cpumask_copy(&p->cpus_mask, new_mask);
1677
- if (p->cpus_ptr == &p->cpus_mask)
1678
- p->nr_cpus_allowed = cpumask_weight(new_mask);
1950
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
1951
+ trace_android_rvh_set_cpus_allowed_comm(p, new_mask);
16791952 }
1680
-
1681
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
1682
-int __migrate_disabled(struct task_struct *p)
1683
-{
1684
- return p->migrate_disable;
1685
-}
1686
-EXPORT_SYMBOL_GPL(__migrate_disabled);
1687
-#endif
16881953
16891954 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
16901955 {
....@@ -1712,28 +1977,23 @@
17121977 if (queued)
17131978 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
17141979 if (running)
1715
- set_curr_task(rq, p);
1980
+ set_next_task(rq, p);
17161981 }
17171982
17181983 /*
1719
- * Change a given task's CPU affinity. Migrate the thread to a
1720
- * proper CPU and schedule it away if the CPU it's executing on
1721
- * is removed from the allowed bitmask.
1722
- *
1723
- * NOTE: the caller must have a valid reference to the task, the
1724
- * task must not exit() & deallocate itself prematurely. The
1725
- * call is not atomic; no spinlocks may be held.
1984
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
17261985 */
1727
-static int __set_cpus_allowed_ptr(struct task_struct *p,
1728
- const struct cpumask *new_mask, bool check)
1986
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
1987
+ const struct cpumask *new_mask,
1988
+ bool check,
1989
+ struct rq *rq,
1990
+ struct rq_flags *rf)
17291991 {
17301992 const struct cpumask *cpu_valid_mask = cpu_active_mask;
1993
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
17311994 unsigned int dest_cpu;
1732
- struct rq_flags rf;
1733
- struct rq *rq;
17341995 int ret = 0;
17351996
1736
- rq = task_rq_lock(p, &rf);
17371997 update_rq_clock(rq);
17381998
17391999 if (p->flags & PF_KTHREAD) {
....@@ -1741,6 +2001,9 @@
17412001 * Kernel threads are allowed on online && !active CPUs
17422002 */
17432003 cpu_valid_mask = cpu_online_mask;
2004
+ } else if (!cpumask_subset(new_mask, cpu_allowed_mask)) {
2005
+ ret = -EINVAL;
2006
+ goto out;
17442007 }
17452008
17462009 /*
....@@ -1755,7 +2018,12 @@
17552018 if (cpumask_equal(&p->cpus_mask, new_mask))
17562019 goto out;
17572020
1758
- dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
2021
+ /*
2022
+ * Picking a ~random cpu helps in cases where we are changing affinity
2023
+ * for groups of tasks (ie. cpuset), so that load balancing is not
2024
+ * immediately required to distribute the tasks within their new mask.
2025
+ */
2026
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
17592027 if (dest_cpu >= nr_cpu_ids) {
17602028 ret = -EINVAL;
17612029 goto out;
....@@ -1774,28 +2042,45 @@
17742042 }
17752043
17762044 /* Can the task run on the task's current CPU? If so, we're done */
1777
- if (cpumask_test_cpu(task_cpu(p), new_mask) ||
1778
- p->cpus_ptr != &p->cpus_mask)
2045
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
17792046 goto out;
17802047
17812048 if (task_running(rq, p) || p->state == TASK_WAKING) {
17822049 struct migration_arg arg = { p, dest_cpu };
17832050 /* Need help from migration thread: drop lock and wait. */
1784
- task_rq_unlock(rq, p, &rf);
2051
+ task_rq_unlock(rq, p, rf);
17852052 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1786
- tlb_migrate_finish(p->mm);
17872053 return 0;
17882054 } else if (task_on_rq_queued(p)) {
17892055 /*
17902056 * OK, since we're going to drop the lock immediately
17912057 * afterwards anyway.
17922058 */
1793
- rq = move_queued_task(rq, &rf, p, dest_cpu);
2059
+ rq = move_queued_task(rq, rf, p, dest_cpu);
17942060 }
17952061 out:
1796
- task_rq_unlock(rq, p, &rf);
2062
+ task_rq_unlock(rq, p, rf);
17972063
17982064 return ret;
2065
+}
2066
+
2067
+/*
2068
+ * Change a given task's CPU affinity. Migrate the thread to a
2069
+ * proper CPU and schedule it away if the CPU it's executing on
2070
+ * is removed from the allowed bitmask.
2071
+ *
2072
+ * NOTE: the caller must have a valid reference to the task, the
2073
+ * task must not exit() & deallocate itself prematurely. The
2074
+ * call is not atomic; no spinlocks may be held.
2075
+ */
2076
+static int __set_cpus_allowed_ptr(struct task_struct *p,
2077
+ const struct cpumask *new_mask, bool check)
2078
+{
2079
+ struct rq_flags rf;
2080
+ struct rq *rq;
2081
+
2082
+ rq = task_rq_lock(p, &rf);
2083
+ return __set_cpus_allowed_ptr_locked(p, new_mask, check, rq, &rf);
17992084 }
18002085
18012086 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
....@@ -1803,6 +2088,74 @@
18032088 return __set_cpus_allowed_ptr(p, new_mask, false);
18042089 }
18052090 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
2091
+
2092
+/*
2093
+ * Change a given task's CPU affinity to the intersection of its current
2094
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
2095
+ * If the resulting mask is empty, leave the affinity unchanged and return
2096
+ * -EINVAL.
2097
+ */
2098
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
2099
+ struct cpumask *new_mask,
2100
+ const struct cpumask *subset_mask)
2101
+{
2102
+ struct rq_flags rf;
2103
+ struct rq *rq;
2104
+
2105
+ rq = task_rq_lock(p, &rf);
2106
+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
2107
+ task_rq_unlock(rq, p, &rf);
2108
+ return -EINVAL;
2109
+ }
2110
+
2111
+ return __set_cpus_allowed_ptr_locked(p, new_mask, false, rq, &rf);
2112
+}
2113
+
2114
+/*
2115
+ * Restrict a given task's CPU affinity so that it is a subset of
2116
+ * task_cpu_possible_mask(). If the resulting mask is empty, we warn and
2117
+ * walk up the cpuset hierarchy until we find a suitable mask.
2118
+ */
2119
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
2120
+{
2121
+ cpumask_var_t new_mask;
2122
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
2123
+
2124
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
2125
+
2126
+ /*
2127
+ * __migrate_task() can fail silently in the face of concurrent
2128
+ * offlining of the chosen destination CPU, so take the hotplug
2129
+ * lock to ensure that the migration succeeds.
2130
+ */
2131
+ trace_android_rvh_force_compatible_pre(NULL);
2132
+ cpus_read_lock();
2133
+ if (!cpumask_available(new_mask))
2134
+ goto out_set_mask;
2135
+
2136
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
2137
+ goto out_free_mask;
2138
+
2139
+ /*
2140
+ * We failed to find a valid subset of the affinity mask for the
2141
+ * task, so override it based on its cpuset hierarchy.
2142
+ */
2143
+ cpuset_cpus_allowed(p, new_mask);
2144
+ override_mask = new_mask;
2145
+
2146
+out_set_mask:
2147
+ if (printk_ratelimit()) {
2148
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
2149
+ task_pid_nr(p), p->comm,
2150
+ cpumask_pr_args(override_mask));
2151
+ }
2152
+
2153
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
2154
+out_free_mask:
2155
+ cpus_read_unlock();
2156
+ trace_android_rvh_force_compatible_post(NULL);
2157
+ free_cpumask_var(new_mask);
2158
+}
18062159
18072160 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
18082161 {
....@@ -1851,12 +2204,13 @@
18512204 p->se.nr_migrations++;
18522205 rseq_migrate(p);
18532206 perf_event_task_migrate(p);
2207
+ trace_android_rvh_set_task_cpu(p, new_cpu);
18542208 }
18552209
18562210 __set_task_cpu(p, new_cpu);
18572211 }
2212
+EXPORT_SYMBOL_GPL(set_task_cpu);
18582213
1859
-#ifdef CONFIG_NUMA_BALANCING
18602214 static void __migrate_swap_task(struct task_struct *p, int cpu)
18612215 {
18622216 if (task_on_rq_queued(p)) {
....@@ -1869,11 +2223,9 @@
18692223 rq_pin_lock(src_rq, &srf);
18702224 rq_pin_lock(dst_rq, &drf);
18712225
1872
- p->on_rq = TASK_ON_RQ_MIGRATING;
18732226 deactivate_task(src_rq, p, 0);
18742227 set_task_cpu(p, cpu);
18752228 activate_task(dst_rq, p, 0);
1876
- p->on_rq = TASK_ON_RQ_QUEUED;
18772229 check_preempt_curr(dst_rq, p, 0);
18782230
18792231 rq_unpin_lock(dst_rq, &drf);
....@@ -1973,19 +2325,7 @@
19732325 out:
19742326 return ret;
19752327 }
1976
-#endif /* CONFIG_NUMA_BALANCING */
1977
-
1978
-static bool check_task_state(struct task_struct *p, long match_state)
1979
-{
1980
- bool match = false;
1981
-
1982
- raw_spin_lock_irq(&p->pi_lock);
1983
- if (p->state == match_state || p->saved_state == match_state)
1984
- match = true;
1985
- raw_spin_unlock_irq(&p->pi_lock);
1986
-
1987
- return match;
1988
-}
2328
+EXPORT_SYMBOL_GPL(migrate_swap);
19892329
19902330 /*
19912331 * wait_task_inactive - wait for a thread to unschedule.
....@@ -2031,7 +2371,7 @@
20312371 * is actually now running somewhere else!
20322372 */
20332373 while (task_running(rq, p)) {
2034
- if (match_state && !check_task_state(p, match_state))
2374
+ if (match_state && unlikely(p->state != match_state))
20352375 return 0;
20362376 cpu_relax();
20372377 }
....@@ -2046,8 +2386,7 @@
20462386 running = task_running(rq, p);
20472387 queued = task_on_rq_queued(p);
20482388 ncsw = 0;
2049
- if (!match_state || p->state == match_state ||
2050
- p->saved_state == match_state)
2389
+ if (!match_state || p->state == match_state)
20512390 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
20522391 task_rq_unlock(rq, p, &rf);
20532392
....@@ -2148,7 +2487,11 @@
21482487 int nid = cpu_to_node(cpu);
21492488 const struct cpumask *nodemask = NULL;
21502489 enum { cpuset, possible, fail } state = cpuset;
2151
- int dest_cpu;
2490
+ int dest_cpu = -1;
2491
+
2492
+ trace_android_rvh_select_fallback_rq(cpu, p, &dest_cpu);
2493
+ if (dest_cpu >= 0)
2494
+ return dest_cpu;
21522495
21532496 /*
21542497 * If the node that the CPU is on has been offlined, cpu_to_node()
....@@ -2160,9 +2503,7 @@
21602503
21612504 /* Look for allowed, online CPU in same node. */
21622505 for_each_cpu(dest_cpu, nodemask) {
2163
- if (!cpu_active(dest_cpu))
2164
- continue;
2165
- if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
2506
+ if (is_cpu_allowed(p, dest_cpu))
21662507 return dest_cpu;
21672508 }
21682509 }
....@@ -2184,12 +2525,11 @@
21842525 state = possible;
21852526 break;
21862527 }
2187
- /* Fall-through */
2528
+ fallthrough;
21882529 case possible:
2189
- do_set_cpus_allowed(p, cpu_possible_mask);
2530
+ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
21902531 state = fail;
21912532 break;
2192
-
21932533 case fail:
21942534 BUG();
21952535 break;
....@@ -2216,14 +2556,12 @@
22162556 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
22172557 */
22182558 static inline
2219
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
2220
- int sibling_count_hint)
2559
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
22212560 {
22222561 lockdep_assert_held(&p->pi_lock);
22232562
22242563 if (p->nr_cpus_allowed > 1)
2225
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
2226
- sibling_count_hint);
2564
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
22272565 else
22282566 cpu = cpumask_any(p->cpus_ptr);
22292567
....@@ -2241,12 +2579,6 @@
22412579 cpu = select_fallback_rq(task_cpu(p), p);
22422580
22432581 return cpu;
2244
-}
2245
-
2246
-static void update_avg(u64 *avg, u64 sample)
2247
-{
2248
- s64 diff = sample - *avg;
2249
- *avg += diff >> 3;
22502582 }
22512583
22522584 void sched_set_stop_task(int cpu, struct task_struct *stop)
....@@ -2328,12 +2660,6 @@
23282660 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
23292661 }
23302662
2331
-static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2332
-{
2333
- activate_task(rq, p, en_flags);
2334
- p->on_rq = TASK_ON_RQ_QUEUED;
2335
-}
2336
-
23372663 /*
23382664 * Mark the task runnable and perform wakeup-preemption.
23392665 */
....@@ -2375,27 +2701,54 @@
23752701 {
23762702 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
23772703
2704
+ if (wake_flags & WF_SYNC)
2705
+ en_flags |= ENQUEUE_WAKEUP_SYNC;
2706
+
23782707 lockdep_assert_held(&rq->lock);
23792708
2380
-#ifdef CONFIG_SMP
23812709 if (p->sched_contributes_to_load)
23822710 rq->nr_uninterruptible--;
23832711
2712
+#ifdef CONFIG_SMP
23842713 if (wake_flags & WF_MIGRATED)
23852714 en_flags |= ENQUEUE_MIGRATED;
2715
+ else
23862716 #endif
2717
+ if (p->in_iowait) {
2718
+ delayacct_blkio_end(p);
2719
+ atomic_dec(&task_rq(p)->nr_iowait);
2720
+ }
23872721
2388
- ttwu_activate(rq, p, en_flags);
2722
+ activate_task(rq, p, en_flags);
23892723 ttwu_do_wakeup(rq, p, wake_flags, rf);
23902724 }
23912725
23922726 /*
2393
- * Called in case the task @p isn't fully descheduled from its runqueue,
2394
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2395
- * since all we need to do is flip p->state to TASK_RUNNING, since
2396
- * the task is still ->on_rq.
2727
+ * Consider @p being inside a wait loop:
2728
+ *
2729
+ * for (;;) {
2730
+ * set_current_state(TASK_UNINTERRUPTIBLE);
2731
+ *
2732
+ * if (CONDITION)
2733
+ * break;
2734
+ *
2735
+ * schedule();
2736
+ * }
2737
+ * __set_current_state(TASK_RUNNING);
2738
+ *
2739
+ * between set_current_state() and schedule(). In this case @p is still
2740
+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
2741
+ * an atomic manner.
2742
+ *
2743
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
2744
+ * then schedule() must still happen and p->state can be changed to
2745
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
2746
+ * need to do a full wakeup with enqueue.
2747
+ *
2748
+ * Returns: %true when the wakeup is done,
2749
+ * %false otherwise.
23972750 */
2398
-static int ttwu_remote(struct task_struct *p, int wake_flags)
2751
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
23992752 {
24002753 struct rq_flags rf;
24012754 struct rq *rq;
....@@ -2414,75 +2767,63 @@
24142767 }
24152768
24162769 #ifdef CONFIG_SMP
2417
-void sched_ttwu_pending(void)
2770
+void sched_ttwu_pending(void *arg)
24182771 {
2772
+ struct llist_node *llist = arg;
24192773 struct rq *rq = this_rq();
2420
- struct llist_node *llist = llist_del_all(&rq->wake_list);
24212774 struct task_struct *p, *t;
24222775 struct rq_flags rf;
24232776
24242777 if (!llist)
24252778 return;
24262779
2780
+ /*
2781
+	 * rq::ttwu_pending is a racy indication of outstanding wakeups.
2782
+ * Races such that false-negatives are possible, since they
2783
+	 * are shorter lived than false-positives would be.
2784
+ */
2785
+ WRITE_ONCE(rq->ttwu_pending, 0);
2786
+
24272787 rq_lock_irqsave(rq, &rf);
24282788 update_rq_clock(rq);
24292789
2430
- llist_for_each_entry_safe(p, t, llist, wake_entry)
2790
+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
2791
+ if (WARN_ON_ONCE(p->on_cpu))
2792
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
2793
+
2794
+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
2795
+ set_task_cpu(p, cpu_of(rq));
2796
+
24312797 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
2798
+ }
24322799
24332800 rq_unlock_irqrestore(rq, &rf);
24342801 }
24352802
2436
-void scheduler_ipi(void)
2803
+void send_call_function_single_ipi(int cpu)
24372804 {
2438
- /*
2439
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
2440
- * TIF_NEED_RESCHED remotely (for the first time) will also send
2441
- * this IPI.
2442
- */
2443
- preempt_fold_need_resched();
2805
+ struct rq *rq = cpu_rq(cpu);
24442806
2445
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2446
- return;
2447
-
2448
- /*
2449
- * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2450
- * traditionally all their work was done from the interrupt return
2451
- * path. Now that we actually do some work, we need to make sure
2452
- * we do call them.
2453
- *
2454
- * Some archs already do call them, luckily irq_enter/exit nest
2455
- * properly.
2456
- *
2457
- * Arguably we should visit all archs and update all handlers,
2458
- * however a fair share of IPIs are still resched only so this would
2459
- * somewhat pessimize the simple resched case.
2460
- */
2461
- irq_enter();
2462
- sched_ttwu_pending();
2463
-
2464
- /*
2465
- * Check if someone kicked us for doing the nohz idle load balance.
2466
- */
2467
- if (unlikely(got_nohz_idle_kick())) {
2468
- this_rq()->idle_balance = 1;
2469
- raise_softirq_irqoff(SCHED_SOFTIRQ);
2470
- }
2471
- irq_exit();
2807
+ if (!set_nr_if_polling(rq->idle))
2808
+ arch_send_call_function_single_ipi(cpu);
2809
+ else
2810
+ trace_sched_wake_idle_without_ipi(cpu);
24722811 }
24732812
2474
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
2813
+/*
2814
+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
2815
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
2816
+ * via sched_ttwu_pending() for activation so the wakee incurs the cost
2817
+ * of the wakeup instead of the waker.
2818
+ */
2819
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
24752820 {
24762821 struct rq *rq = cpu_rq(cpu);
24772822
24782823 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
24792824
2480
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
2481
- if (!set_nr_if_polling(rq->idle))
2482
- smp_send_reschedule(cpu);
2483
- else
2484
- trace_sched_wake_idle_without_ipi(cpu);
2485
- }
2825
+ WRITE_ONCE(rq->ttwu_pending, 1);
2826
+ __smp_call_single_queue(cpu, &p->wake_entry.llist);
24862827 }
24872828
24882829 void wake_up_if_idle(int cpu)
....@@ -2508,6 +2849,7 @@
25082849 out:
25092850 rcu_read_unlock();
25102851 }
2852
+EXPORT_SYMBOL_GPL(wake_up_if_idle);
25112853
25122854 bool cpus_share_cache(int this_cpu, int that_cpu)
25132855 {
....@@ -2516,6 +2858,58 @@
25162858
25172859 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
25182860 }
2861
+
2862
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
2863
+{
2864
+ /*
2865
+ * If the CPU does not share cache, then queue the task on the
2866
+ * remote rqs wakelist to avoid accessing remote data.
2867
+ */
2868
+ if (!cpus_share_cache(smp_processor_id(), cpu))
2869
+ return true;
2870
+
2871
+ /*
2872
+ * If the task is descheduling and the only running task on the
2873
+ * CPU then use the wakelist to offload the task activation to
2874
+ * the soon-to-be-idle CPU as the current CPU is likely busy.
2875
+ * nr_running is checked to avoid unnecessary task stacking.
2876
+ *
2877
+ * Note that we can only get here with (wakee) p->on_rq=0,
2878
+ * p->on_cpu can be whatever, we've done the dequeue, so
2879
+ * the wakee has been accounted out of ->nr_running.
2880
+ */
2881
+ if ((wake_flags & WF_ON_CPU) && !cpu_rq(cpu)->nr_running)
2882
+ return true;
2883
+
2884
+ return false;
2885
+}
2886
+
2887
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2888
+{
2889
+ bool cond = false;
2890
+
2891
+ trace_android_rvh_ttwu_cond(&cond);
2892
+
2893
+ if ((sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) ||
2894
+ cond) {
2895
+ if (WARN_ON_ONCE(cpu == smp_processor_id()))
2896
+ return false;
2897
+
2898
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2899
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
2900
+ return true;
2901
+ }
2902
+
2903
+ return false;
2904
+}
2905
+
2906
+#else /* !CONFIG_SMP */
2907
+
2908
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2909
+{
2910
+ return false;
2911
+}
2912
+
25192913 #endif /* CONFIG_SMP */
25202914
25212915 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
....@@ -2523,13 +2917,8 @@
25232917 struct rq *rq = cpu_rq(cpu);
25242918 struct rq_flags rf;
25252919
2526
-#if defined(CONFIG_SMP)
2527
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
2528
- sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2529
- ttwu_queue_remote(p, cpu, wake_flags);
2920
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
25302921 return;
2531
- }
2532
-#endif
25332922
25342923 rq_lock(rq, &rf);
25352924 update_rq_clock(rq);
....@@ -2585,8 +2974,8 @@
25852974 * migration. However the means are completely different as there is no lock
25862975 * chain to provide order. Instead we do:
25872976 *
2588
- * 1) smp_store_release(X->on_cpu, 0)
2589
- * 2) smp_cond_load_acquire(!X->on_cpu)
2977
+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
2978
+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
25902979 *
25912980 * Example:
25922981 *
....@@ -2625,64 +3014,95 @@
26253014 * @p: the thread to be awakened
26263015 * @state: the mask of task states that can be woken
26273016 * @wake_flags: wake modifier flags (WF_*)
2628
- * @sibling_count_hint: A hint at the number of threads that are being woken up
2629
- * in this event.
26303017 *
2631
- * If (@state & @p->state) @p->state = TASK_RUNNING.
3018
+ * Conceptually does:
3019
+ *
3020
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
26323021 *
26333022 * If the task was not queued/runnable, also place it back on a runqueue.
26343023 *
2635
- * Atomic against schedule() which would dequeue a task, also see
2636
- * set_current_state().
3024
+ * This function is atomic against schedule() which would dequeue the task.
26373025 *
2638
- * This function executes a full memory barrier before accessing the task
2639
- * state; see set_current_state().
3026
+ * It issues a full memory barrier before accessing @p->state, see the comment
3027
+ * with set_current_state().
3028
+ *
3029
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
3030
+ *
3031
+ * Relies on p->pi_lock stabilizing:
3032
+ * - p->sched_class
3033
+ * - p->cpus_ptr
3034
+ * - p->sched_task_group
3035
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3036
+ *
3037
+ * Tries really hard to only take one task_rq(p)->lock for performance.
3038
+ * Takes rq->lock in:
3039
+ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
3040
+ * - ttwu_queue() -- new rq, for enqueue of the task;
3041
+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3042
+ *
3043
+ * As a consequence we race really badly with just about everything. See the
3044
+ * many memory barriers and their comments for details.
26403045 *
26413046 * Return: %true if @p->state changes (an actual wakeup was done),
26423047 * %false otherwise.
26433048 */
26443049 static int
2645
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
2646
- int sibling_count_hint)
3050
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
26473051 {
26483052 unsigned long flags;
26493053 int cpu, success = 0;
26503054
2651
- /*
2652
- * If we are going to wake up a thread waiting for CONDITION we
2653
- * need to ensure that CONDITION=1 done by the caller can not be
2654
- * reordered with p->state check below. This pairs with mb() in
2655
- * set_current_state() the waiting thread does.
2656
- */
2657
- raw_spin_lock_irqsave(&p->pi_lock, flags);
2658
- smp_mb__after_spinlock();
2659
- if (!(p->state & state)) {
3055
+ preempt_disable();
3056
+ if (p == current) {
26603057 /*
2661
- * The task might be running due to a spinlock sleeper
2662
- * wakeup. Check the saved state and set it to running
2663
- * if the wakeup condition is true.
3058
+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
3059
+ * == smp_processor_id()'. Together this means we can special
3060
+ * case the whole 'p->on_rq && ttwu_runnable()' case below
3061
+ * without taking any locks.
3062
+ *
3063
+ * In particular:
3064
+ * - we rely on Program-Order guarantees for all the ordering,
3065
+ * - we're serialized against set_special_state() by virtue of
3066
+ * it disabling IRQs (this allows not taking ->pi_lock).
26643067 */
2665
- if (!(wake_flags & WF_LOCK_SLEEPER)) {
2666
- if (p->saved_state & state) {
2667
- p->saved_state = TASK_RUNNING;
2668
- success = 1;
2669
- }
2670
- }
3068
+ if (!(p->state & state))
3069
+ goto out;
3070
+
3071
+ success = 1;
3072
+ trace_sched_waking(p);
3073
+ p->state = TASK_RUNNING;
3074
+ trace_sched_wakeup(p);
26713075 goto out;
26723076 }
26733077
26743078 /*
2675
- * If this is a regular wakeup, then we can unconditionally
2676
- * clear the saved state of a "lock sleeper".
3079
+ * If we are going to wake up a thread waiting for CONDITION we
3080
+ * need to ensure that CONDITION=1 done by the caller can not be
3081
+ * reordered with p->state check below. This pairs with smp_store_mb()
3082
+ * in set_current_state() that the waiting thread does.
26773083 */
2678
- if (!(wake_flags & WF_LOCK_SLEEPER))
2679
- p->saved_state = TASK_RUNNING;
3084
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
3085
+ smp_mb__after_spinlock();
3086
+ if (!(p->state & state))
3087
+ goto unlock;
3088
+
3089
+#ifdef CONFIG_FREEZER
3090
+ /*
3091
+ * If we're going to wake up a thread which may be frozen, then
3092
+ * we can only do so if we have an active CPU which is capable of
3093
+ * running it. This may not be the case when resuming from suspend,
3094
+ * as the secondary CPUs may not yet be back online. See __thaw_task()
3095
+ * for the actual wakeup.
3096
+ */
3097
+ if (unlikely(frozen_or_skipped(p)) &&
3098
+ !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p)))
3099
+ goto unlock;
3100
+#endif
26803101
26813102 trace_sched_waking(p);
26823103
26833104 /* We're going to change ->state: */
26843105 success = 1;
2685
- cpu = task_cpu(p);
26863106
26873107 /*
26883108 * Ensure we load p->on_rq _after_ p->state, otherwise it would
....@@ -2703,10 +3123,15 @@
27033123 *
27043124 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
27053125 * __schedule(). See the comment for smp_mb__after_spinlock().
3126
+ *
3127
+ * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
27063128 */
27073129 smp_rmb();
2708
- if (p->on_rq && ttwu_remote(p, wake_flags))
2709
- goto stat;
3130
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3131
+ goto unlock;
3132
+
3133
+ if (p->state & TASK_UNINTERRUPTIBLE)
3134
+ trace_sched_blocked_reason(p);
27103135
27113136 #ifdef CONFIG_SMP
27123137 /*
....@@ -2727,8 +3152,43 @@
27273152 *
27283153 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
27293154 * __schedule(). See the comment for smp_mb__after_spinlock().
3155
+ *
3156
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
3157
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
3158
+ * care about its own p->state. See the comment in __schedule().
27303159 */
2731
- smp_rmb();
3160
+ smp_acquire__after_ctrl_dep();
3161
+
3162
+ /*
3163
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
3164
+ * == 0), which means we need to do an enqueue, change p->state to
3165
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
3166
+ * enqueue, such as ttwu_queue_wakelist().
3167
+ */
3168
+ p->state = TASK_WAKING;
3169
+
3170
+ /*
3171
+ * If the owning (remote) CPU is still in the middle of schedule() with
3172
+ * this task as prev, consider queueing p on the remote CPU's wake_list
3173
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
3174
+ * let the waker make forward progress. This is safe because IRQs are
3175
+ * disabled and the IPI will deliver after on_cpu is cleared.
3176
+ *
3177
+ * Ensure we load task_cpu(p) after p->on_cpu:
3178
+ *
3179
+ * set_task_cpu(p, cpu);
3180
+ * STORE p->cpu = @cpu
3181
+ * __schedule() (switch to task 'p')
3182
+ * LOCK rq->lock
3183
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
3184
+ * STORE p->on_cpu = 1 LOAD p->cpu
3185
+ *
3186
+ * to ensure we observe the correct CPU on which the task is currently
3187
+ * scheduling.
3188
+ */
3189
+ if (smp_load_acquire(&p->on_cpu) &&
3190
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
3191
+ goto unlock;
27323192
27333193 /*
27343194 * If the owning (remote) CPU is still in the middle of schedule() with
....@@ -2741,38 +3201,79 @@
27413201 */
27423202 smp_cond_load_acquire(&p->on_cpu, !VAL);
27433203
2744
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
2745
- p->state = TASK_WAKING;
3204
+ trace_android_rvh_try_to_wake_up(p);
27463205
2747
- if (p->in_iowait) {
2748
- delayacct_blkio_end(p);
2749
- atomic_dec(&task_rq(p)->nr_iowait);
2750
- }
2751
-
2752
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
2753
- sibling_count_hint);
3206
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
27543207 if (task_cpu(p) != cpu) {
3208
+ if (p->in_iowait) {
3209
+ delayacct_blkio_end(p);
3210
+ atomic_dec(&task_rq(p)->nr_iowait);
3211
+ }
3212
+
27553213 wake_flags |= WF_MIGRATED;
27563214 psi_ttwu_dequeue(p);
27573215 set_task_cpu(p, cpu);
27583216 }
2759
-
2760
-#else /* CONFIG_SMP */
2761
-
2762
- if (p->in_iowait) {
2763
- delayacct_blkio_end(p);
2764
- atomic_dec(&task_rq(p)->nr_iowait);
2765
- }
2766
-
3217
+#else
3218
+ cpu = task_cpu(p);
27673219 #endif /* CONFIG_SMP */
27683220
27693221 ttwu_queue(p, cpu, wake_flags);
2770
-stat:
2771
- ttwu_stat(p, cpu, wake_flags);
2772
-out:
3222
+unlock:
27733223 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3224
+out:
3225
+ if (success) {
3226
+ trace_android_rvh_try_to_wake_up_success(p);
3227
+ ttwu_stat(p, task_cpu(p), wake_flags);
3228
+ }
3229
+ preempt_enable();
27743230
27753231 return success;
3232
+}
3233
+
3234
+/**
3235
+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
3236
+ * @p: Process for which the function is to be invoked, can be @current.
3237
+ * @func: Function to invoke.
3238
+ * @arg: Argument to function.
3239
+ *
3240
+ * If the specified task can be quickly locked into a definite state
3241
+ * (either sleeping or on a given runqueue), arrange to keep it in that
3242
+ * state while invoking @func(@arg). This function can use ->on_rq and
3243
+ * task_curr() to work out what the state is, if required. Given that
3244
+ * @func can be invoked with a runqueue lock held, it had better be quite
3245
+ * lightweight.
3246
+ *
3247
+ * Returns:
3248
+ * @false if the task slipped out from under the locks.
3249
+ * @true if the task was locked onto a runqueue or is sleeping.
3250
+ * However, @func can override this by returning @false.
3251
+ */
3252
+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
3253
+{
3254
+ struct rq_flags rf;
3255
+ bool ret = false;
3256
+ struct rq *rq;
3257
+
3258
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3259
+ if (p->on_rq) {
3260
+ rq = __task_rq_lock(p, &rf);
3261
+ if (task_rq(p) == rq)
3262
+ ret = func(p, arg);
3263
+ rq_unlock(rq, &rf);
3264
+ } else {
3265
+ switch (p->state) {
3266
+ case TASK_RUNNING:
3267
+ case TASK_WAKING:
3268
+ break;
3269
+ default:
3270
+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
3271
+ if (!p->on_rq)
3272
+ ret = func(p, arg);
3273
+ }
3274
+ }
3275
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3276
+ return ret;
27763277 }
27773278
27783279 /**
....@@ -2788,25 +3289,13 @@
27883289 */
27893290 int wake_up_process(struct task_struct *p)
27903291 {
2791
- return try_to_wake_up(p, TASK_NORMAL, 0, 1);
3292
+ return try_to_wake_up(p, TASK_NORMAL, 0);
27923293 }
27933294 EXPORT_SYMBOL(wake_up_process);
27943295
2795
-/**
2796
- * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
2797
- * @p: The process to be woken up.
2798
- *
2799
- * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
2800
- * the nature of the wakeup.
2801
- */
2802
-int wake_up_lock_sleeper(struct task_struct *p)
2803
-{
2804
- return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER, 1);
2805
-}
2806
-
28073296 int wake_up_state(struct task_struct *p, unsigned int state)
28083297 {
2809
- return try_to_wake_up(p, state, 0, 1);
3298
+ return try_to_wake_up(p, state, 0);
28103299 }
28113300
28123301 /*
....@@ -2831,6 +3320,8 @@
28313320 p->se.cfs_rq = NULL;
28323321 #endif
28333322
3323
+ trace_android_rvh_sched_fork_init(p);
3324
+
28343325 #ifdef CONFIG_SCHEDSTATS
28353326 /* Even if schedstat is disabled, there should not be garbage */
28363327 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
....@@ -2851,7 +3342,13 @@
28513342 INIT_HLIST_HEAD(&p->preempt_notifiers);
28523343 #endif
28533344
3345
+#ifdef CONFIG_COMPACTION
3346
+ p->capture_control = NULL;
3347
+#endif
28543348 init_numa_balancing(clone_flags, p);
3349
+#ifdef CONFIG_SMP
3350
+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
3351
+#endif
28553352 }
28563353
28573354 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
....@@ -2868,7 +3365,7 @@
28683365
28693366 #ifdef CONFIG_PROC_SYSCTL
28703367 int sysctl_numa_balancing(struct ctl_table *table, int write,
2871
- void __user *buffer, size_t *lenp, loff_t *ppos)
3368
+ void *buffer, size_t *lenp, loff_t *ppos)
28723369 {
28733370 struct ctl_table t;
28743371 int err;
....@@ -2942,8 +3439,8 @@
29423439 }
29433440
29443441 #ifdef CONFIG_PROC_SYSCTL
2945
-int sysctl_schedstats(struct ctl_table *table, int write,
2946
- void __user *buffer, size_t *lenp, loff_t *ppos)
3442
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
3443
+ size_t *lenp, loff_t *ppos)
29473444 {
29483445 struct ctl_table t;
29493446 int err;
....@@ -2971,7 +3468,7 @@
29713468 */
29723469 int sched_fork(unsigned long clone_flags, struct task_struct *p)
29733470 {
2974
- unsigned long flags;
3471
+ trace_android_rvh_sched_fork(p);
29753472
29763473 __sched_fork(clone_flags, p);
29773474 /*
....@@ -2985,6 +3482,7 @@
29853482 * Make sure we do not leak PI boosting priority to the child.
29863483 */
29873484 p->prio = current->normal_prio;
3485
+ trace_android_rvh_prepare_prio_fork(p);
29883486
29893487 uclamp_fork(p);
29903488
....@@ -2999,8 +3497,8 @@
29993497 } else if (PRIO_TO_NICE(p->static_prio) < 0)
30003498 p->static_prio = NICE_TO_PRIO(0);
30013499
3002
- p->prio = p->normal_prio = __normal_prio(p);
3003
- set_load_weight(p, false);
3500
+ p->prio = p->normal_prio = p->static_prio;
3501
+ set_load_weight(p);
30043502
30053503 /*
30063504 * We don't need the reset flag anymore after the fork. It has
....@@ -3017,24 +3515,8 @@
30173515 p->sched_class = &fair_sched_class;
30183516
30193517 init_entity_runnable_average(&p->se);
3518
+ trace_android_rvh_finish_prio_fork(p);
30203519
3021
- /*
3022
- * The child is not yet in the pid-hash so no cgroup attach races,
3023
- * and the cgroup is pinned to this child due to cgroup_fork()
3024
- * is ran before sched_fork().
3025
- *
3026
- * Silence PROVE_RCU.
3027
- */
3028
- raw_spin_lock_irqsave(&p->pi_lock, flags);
3029
- rseq_migrate(p);
3030
- /*
3031
- * We're setting the CPU for the first time, we don't migrate,
3032
- * so use __set_task_cpu().
3033
- */
3034
- __set_task_cpu(p, smp_processor_id());
3035
- if (p->sched_class->task_fork)
3036
- p->sched_class->task_fork(p);
3037
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
30383520
30393521 #ifdef CONFIG_SCHED_INFO
30403522 if (likely(sched_info_on()))
....@@ -3044,14 +3526,46 @@
30443526 p->on_cpu = 0;
30453527 #endif
30463528 init_task_preempt_count(p);
3047
-#ifdef CONFIG_HAVE_PREEMPT_LAZY
3048
- task_thread_info(p)->preempt_lazy_count = 0;
3049
-#endif
30503529 #ifdef CONFIG_SMP
30513530 plist_node_init(&p->pushable_tasks, MAX_PRIO);
30523531 RB_CLEAR_NODE(&p->pushable_dl_tasks);
30533532 #endif
30543533 return 0;
3534
+}
3535
+
3536
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
3537
+{
3538
+ unsigned long flags;
3539
+
3540
+ /*
3541
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
3542
+ * required yet, but lockdep gets upset if rules are violated.
3543
+ */
3544
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
3545
+#ifdef CONFIG_CGROUP_SCHED
3546
+ if (1) {
3547
+ struct task_group *tg;
3548
+
3549
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
3550
+ struct task_group, css);
3551
+ tg = autogroup_task_group(p, tg);
3552
+ p->sched_task_group = tg;
3553
+ }
3554
+#endif
3555
+ rseq_migrate(p);
3556
+ /*
3557
+ * We're setting the CPU for the first time, we don't migrate,
3558
+ * so use __set_task_cpu().
3559
+ */
3560
+ __set_task_cpu(p, smp_processor_id());
3561
+ if (p->sched_class->task_fork)
3562
+ p->sched_class->task_fork(p);
3563
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3564
+}
3565
+
3566
+void sched_post_fork(struct task_struct *p)
3567
+{
3568
+ uclamp_post_fork(p);
30553569 }
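/*
 * Editor's note (not part of this patch): with sched_cgroup_fork() and
 * sched_post_fork() split out above, the fork path is expected to call
 * these hooks roughly in the following order (sketch; the exact call
 * sites live in kernel/fork.c and may differ in detail):
 *
 *	sched_fork(clone_flags, p);	// prio/class setup, __sched_fork()
 *	sched_cgroup_fork(p, kargs);	// pin task_group, set first CPU
 *	sched_post_fork(p);		// uclamp initialization
 *	...
 *	wake_up_new_task(p);		// first activation of the child
 */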
30563570
30573571 unsigned long to_ratio(u64 period, u64 runtime)
....@@ -3082,6 +3596,8 @@
30823596 struct rq_flags rf;
30833597 struct rq *rq;
30843598
3599
+ trace_android_rvh_wake_up_new_task(p);
3600
+
30853601 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
30863602 p->state = TASK_RUNNING;
30873603 #ifdef CONFIG_SMP
....@@ -3095,14 +3611,14 @@
30953611 */
30963612 p->recent_used_cpu = task_cpu(p);
30973613 rseq_migrate(p);
3098
- __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
3614
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
30993615 #endif
31003616 rq = __task_rq_lock(p, &rf);
31013617 update_rq_clock(rq);
3102
- post_init_entity_util_avg(&p->se);
3618
+ post_init_entity_util_avg(p);
3619
+ trace_android_rvh_new_task_stats(p);
31033620
31043621 activate_task(rq, p, ENQUEUE_NOCLOCK);
3105
- p->on_rq = TASK_ON_RQ_QUEUED;
31063622 trace_sched_wakeup_new(p);
31073623 check_preempt_curr(rq, p, WF_FORK);
31083624 #ifdef CONFIG_SMP
....@@ -3212,8 +3728,10 @@
32123728 /*
32133729 * Claim the task as running, we do this before switching to it
32143730 * such that any running task will have this set.
3731
+ *
3732
+ * See the ttwu() WF_ON_CPU case and its ordering comment.
32153733 */
3216
- next->on_cpu = 1;
3734
+ WRITE_ONCE(next->on_cpu, 1);
32173735 #endif
32183736 }
32193737
....@@ -3221,8 +3739,9 @@
32213739 {
32223740 #ifdef CONFIG_SMP
32233741 /*
3224
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
3225
- * We must ensure this doesn't happen until the switch is completely
3742
+ * This must be the very last reference to @prev from this CPU. After
3743
+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
3744
+ * must ensure this doesn't happen until the switch is completely
32263745 * finished.
32273746 *
32283747 * In particular, the load of prev->state in finish_task_switch() must
....@@ -3244,7 +3763,7 @@
32443763 * do an early lockdep release here:
32453764 */
32463765 rq_unpin_lock(rq, rf);
3247
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3766
+ spin_release(&rq->lock.dep_map, _THIS_IP_);
32483767 #ifdef CONFIG_DEBUG_SPINLOCK
32493768 /* this is a valid case when another task releases the spinlock */
32503769 rq->lock.owner = next;
....@@ -3376,19 +3895,25 @@
33763895 * provided by mmdrop(),
33773896 * - a sync_core for SYNC_CORE.
33783897 */
3379
- /*
3380
- * We use mmdrop_delayed() here so we don't have to do the
3381
- * full __mmdrop() when we are the last user.
3382
- */
33833898 if (mm) {
33843899 membarrier_mm_sync_core_before_usermode(mm);
3385
- mmdrop_delayed(mm);
3900
+ mmdrop(mm);
33863901 }
33873902 if (unlikely(prev_state == TASK_DEAD)) {
33883903 if (prev->sched_class->task_dead)
33893904 prev->sched_class->task_dead(prev);
33903905
3391
- put_task_struct(prev);
3906
+ /*
3907
+ * Remove function-return probe instances associated with this
3908
+ * task and put them back on the free list.
3909
+ */
3910
+ kprobe_flush_task(prev);
3911
+ trace_android_rvh_flush_task(prev);
3912
+
3913
+ /* Task is done with its stack. */
3914
+ put_task_stack(prev);
3915
+
3916
+ put_task_struct_rcu_user(prev);
33923917 }
33933918
33943919 tick_nohz_task_switch();
....@@ -3467,12 +3992,8 @@
34673992 context_switch(struct rq *rq, struct task_struct *prev,
34683993 struct task_struct *next, struct rq_flags *rf)
34693994 {
3470
- struct mm_struct *mm, *oldmm;
3471
-
34723995 prepare_task_switch(rq, prev, next);
34733996
3474
- mm = next->mm;
3475
- oldmm = prev->active_mm;
34763997 /*
34773998 * For paravirt, this is coupled with an exit in switch_to to
34783999 * combine the page table reload and the switch backend into
....@@ -3481,22 +4002,37 @@
34814002 arch_start_context_switch(prev);
34824003
34834004 /*
3484
- * If mm is non-NULL, we pass through switch_mm(). If mm is
3485
- * NULL, we will pass through mmdrop() in finish_task_switch().
3486
- * Both of these contain the full memory barrier required by
3487
- * membarrier after storing to rq->curr, before returning to
3488
- * user-space.
4005
+ * kernel -> kernel lazy + transfer active
4006
+ * user -> kernel lazy + mmgrab() active
4007
+ *
4008
+ * kernel -> user switch + mmdrop() active
4009
+ * user -> user switch
34894010 */
3490
- if (!mm) {
3491
- next->active_mm = oldmm;
3492
- mmgrab(oldmm);
3493
- enter_lazy_tlb(oldmm, next);
3494
- } else
3495
- switch_mm_irqs_off(oldmm, mm, next);
4011
+ if (!next->mm) { // to kernel
4012
+ enter_lazy_tlb(prev->active_mm, next);
34964013
3497
- if (!prev->mm) {
3498
- prev->active_mm = NULL;
3499
- rq->prev_mm = oldmm;
4014
+ next->active_mm = prev->active_mm;
4015
+ if (prev->mm) // from user
4016
+ mmgrab(prev->active_mm);
4017
+ else
4018
+ prev->active_mm = NULL;
4019
+ } else { // to user
4020
+ membarrier_switch_mm(rq, prev->active_mm, next->mm);
4021
+ /*
4022
+ * sys_membarrier() requires an smp_mb() between setting
4023
+ * rq->curr / membarrier_switch_mm() and returning to userspace.
4024
+ *
4025
+ * The below provides this either through switch_mm(), or in
4026
+ * case 'prev->active_mm == next->mm' through
4027
+ * finish_task_switch()'s mmdrop().
4028
+ */
4029
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
4030
+
4031
+ if (!prev->mm) { // from kernel
4032
+ /* will mmdrop() in finish_task_switch(). */
4033
+ rq->prev_mm = prev->active_mm;
4034
+ prev->active_mm = NULL;
4035
+ }
35004036 }
35014037
35024038 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
....@@ -3533,7 +4069,7 @@
35334069 * preemption, thus the result might have a time-of-check-to-time-of-use
35344070 * race. The caller is responsible to use it correctly, for example:
35354071 *
3536
- * - from a non-preemptable section (of course)
4072
+ * - from a non-preemptible section (of course)
35374073 *
35384074 * - from a thread that is bound to a single CPU
35394075 *
....@@ -3554,6 +4090,18 @@
35544090 sum += cpu_rq(i)->nr_switches;
35554091
35564092 return sum;
4093
+}
4094
+
4095
+/*
4096
+ * Consumers of these two interfaces, like for example the cpuidle menu
4097
+ * governor, are using nonsensical data. Preferring shallow idle state selection
4098
+ * for a CPU that has IO-wait which might not even end up running the task when
4099
+ * it does become runnable.
4100
+ */
4101
+
4102
+unsigned long nr_iowait_cpu(int cpu)
4103
+{
4104
+ return atomic_read(&cpu_rq(cpu)->nr_iowait);
35574105 }
35584106
35594107 /*
....@@ -3591,29 +4139,9 @@
35914139 unsigned long i, sum = 0;
35924140
35934141 for_each_possible_cpu(i)
3594
- sum += atomic_read(&cpu_rq(i)->nr_iowait);
4142
+ sum += nr_iowait_cpu(i);
35954143
35964144 return sum;
3597
-}
3598
-
3599
-/*
3600
- * Consumers of these two interfaces, like for example the cpufreq menu
3601
- * governor are using nonsensical data. Boosting frequency for a CPU that has
3602
- * IO-wait which might not even end up running the task when it does become
3603
- * runnable.
3604
- */
3605
-
3606
-unsigned long nr_iowait_cpu(int cpu)
3607
-{
3608
- struct rq *this = cpu_rq(cpu);
3609
- return atomic_read(&this->nr_iowait);
3610
-}
3611
-
3612
-void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
3613
-{
3614
- struct rq *rq = this_rq();
3615
- *nr_waiters = atomic_read(&rq->nr_iowait);
3616
- *load = rq->load.weight;
36174145 }
36184146
36194147 #ifdef CONFIG_SMP
....@@ -3627,9 +4155,14 @@
36274155 struct task_struct *p = current;
36284156 unsigned long flags;
36294157 int dest_cpu;
4158
+ bool cond = false;
4159
+
4160
+ trace_android_rvh_sched_exec(&cond);
4161
+ if (cond)
4162
+ return;
36304163
36314164 raw_spin_lock_irqsave(&p->pi_lock, flags);
3632
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
4165
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
36334166 if (dest_cpu == smp_processor_id())
36344167 goto unlock;
36354168
....@@ -3712,6 +4245,7 @@
37124245
37134246 return ns;
37144247 }
4248
+EXPORT_SYMBOL_GPL(task_sched_runtime);
37154249
37164250 /*
37174251 * This function gets called by the timer code, with HZ frequency.
....@@ -3723,14 +4257,18 @@
37234257 struct rq *rq = cpu_rq(cpu);
37244258 struct task_struct *curr = rq->curr;
37254259 struct rq_flags rf;
4260
+ unsigned long thermal_pressure;
37264261
4262
+ arch_scale_freq_tick();
37274263 sched_clock_tick();
37284264
37294265 rq_lock(rq, &rf);
37304266
4267
+ trace_android_rvh_tick_entry(rq);
37314268 update_rq_clock(rq);
4269
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
4270
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
37324271 curr->sched_class->task_tick(rq, curr, 0);
3733
- cpu_load_update_active(rq);
37344272 calc_global_load_tick(rq);
37354273 psi_task_tick(rq);
37364274
....@@ -3742,6 +4280,8 @@
37424280 rq->idle_balance = idle_cpu(cpu);
37434281 trigger_load_balance(rq);
37444282 #endif
4283
+
4284
+ trace_android_vh_scheduler_tick(rq);
37454285 }
37464286
37474287 #ifdef CONFIG_NO_HZ_FULL
....@@ -3799,28 +4339,31 @@
37994339 * statistics and checks timeslices in a time-independent way, regardless
38004340 * of when exactly it is running.
38014341 */
3802
- if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
4342
+ if (!tick_nohz_tick_stopped_cpu(cpu))
38034343 goto out_requeue;
38044344
38054345 rq_lock_irq(rq, &rf);
38064346 curr = rq->curr;
3807
- if (is_idle_task(curr) || cpu_is_offline(cpu))
4347
+ if (cpu_is_offline(cpu))
38084348 goto out_unlock;
38094349
38104350 update_rq_clock(rq);
3811
- delta = rq_clock_task(rq) - curr->se.exec_start;
38124351
3813
- /*
3814
- * Make sure the next tick runs within a reasonable
3815
- * amount of time.
3816
- */
3817
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4352
+ if (!is_idle_task(curr)) {
4353
+ /*
4354
+ * Make sure the next tick runs within a reasonable
4355
+ * amount of time.
4356
+ */
4357
+ delta = rq_clock_task(rq) - curr->se.exec_start;
4358
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4359
+ }
38184360 curr->sched_class->task_tick(rq, curr, 0);
38194361
4362
+ calc_load_nohz_remote(rq);
38204363 out_unlock:
38214364 rq_unlock_irq(rq, &rf);
3822
-
38234365 out_requeue:
4366
+
38244367 /*
38254368 * Run the remote tick once per second (1Hz). This arbitrary
38264369 * frequency is large enough to avoid overload but short enough
....@@ -3884,7 +4427,7 @@
38844427 static inline void sched_tick_stop(int cpu) { }
38854428 #endif
38864429
3887
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4430
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
38884431 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
38894432 /*
38904433 * If the value passed in is equal to the current preempt count
....@@ -3990,11 +4533,12 @@
39904533 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
39914534 && in_atomic_preempt_off()) {
39924535 pr_err("Preemption disabled at:");
3993
- print_ip_sym(preempt_disable_ip);
3994
- pr_cont("\n");
4536
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
39954537 }
39964538 if (panic_on_warn)
39974539 panic("scheduling while atomic\n");
4540
+
4541
+ trace_android_rvh_schedule_bug(prev);
39984542
39994543 dump_stack();
40004544 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
....@@ -4003,11 +4547,23 @@
40034547 /*
40044548 * Various schedule()-time debugging checks and statistics:
40054549 */
4006
-static inline void schedule_debug(struct task_struct *prev)
4550
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
40074551 {
40084552 #ifdef CONFIG_SCHED_STACK_END_CHECK
40094553 if (task_stack_end_corrupted(prev))
40104554 panic("corrupted stack end detected inside scheduler\n");
4555
+
4556
+ if (task_scs_end_corrupted(prev))
4557
+ panic("corrupted shadow stack detected inside scheduler\n");
4558
+#endif
4559
+
4560
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
4561
+ if (!preempt && prev->state && prev->non_block_count) {
4562
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
4563
+ prev->comm, prev->pid, prev->non_block_count);
4564
+ dump_stack();
4565
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4566
+ }
40114567 #endif
40124568
40134569 if (unlikely(in_atomic_preempt_off())) {
....@@ -4019,6 +4575,28 @@
40194575 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
40204576
40214577 schedstat_inc(this_rq()->sched_count);
4578
+}
4579
+
4580
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
4581
+ struct rq_flags *rf)
4582
+{
4583
+#ifdef CONFIG_SMP
4584
+ const struct sched_class *class;
4585
+ /*
4586
+ * We must do the balancing pass before put_prev_task(), such
4587
+ * that when we release the rq->lock the task is in the same
4588
+ * state as before we took rq->lock.
4589
+ *
4590
+ * We can terminate the balance pass as soon as we know there is
4591
+ * a runnable task of @class priority or higher.
4592
+ */
4593
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
4594
+ if (class->balance(rq, prev, rf))
4595
+ break;
4596
+ }
4597
+#endif
4598
+
4599
+ put_prev_task(rq, prev);
40224600 }
40234601
40244602 /*
....@@ -4036,36 +4614,34 @@
40364614 * higher scheduling class, because otherwise those lose the
40374615 * opportunity to pull in more work from other CPUs.
40384616 */
4039
- if (likely((prev->sched_class == &idle_sched_class ||
4040
- prev->sched_class == &fair_sched_class) &&
4617
+ if (likely(prev->sched_class <= &fair_sched_class &&
40414618 rq->nr_running == rq->cfs.h_nr_running)) {
40424619
4043
- p = fair_sched_class.pick_next_task(rq, prev, rf);
4620
+ p = pick_next_task_fair(rq, prev, rf);
40444621 if (unlikely(p == RETRY_TASK))
4045
- goto again;
4622
+ goto restart;
40464623
40474624 /* Assumes fair_sched_class->next == idle_sched_class */
4048
- if (unlikely(!p))
4049
- p = idle_sched_class.pick_next_task(rq, prev, rf);
4625
+ if (!p) {
4626
+ put_prev_task(rq, prev);
4627
+ p = pick_next_task_idle(rq);
4628
+ }
40504629
40514630 return p;
40524631 }
40534632
4054
-again:
4633
+restart:
4634
+ put_prev_task_balance(rq, prev, rf);
4635
+
40554636 for_each_class(class) {
4056
- p = class->pick_next_task(rq, prev, rf);
4057
- if (p) {
4058
- if (unlikely(p == RETRY_TASK))
4059
- goto again;
4637
+ p = class->pick_next_task(rq);
4638
+ if (p)
40604639 return p;
4061
- }
40624640 }
40634641
40644642 /* The idle class should always have a runnable task: */
40654643 BUG();
40664644 }
4067
-
4068
-static void migrate_disabled_sched(struct task_struct *p);
40694645
40704646 /*
40714647 * __schedule() is the main scheduler function.
....@@ -4087,7 +4663,7 @@
40874663 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
40884664 * called on the nearest possible occasion:
40894665 *
4090
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
4666
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
40914667 *
40924668 * - in syscall or exception context, at the next outmost
40934669 * preempt_enable(). (this might be as soon as the wake_up()'s
....@@ -4096,7 +4672,7 @@
40964672 * - in IRQ context, return from interrupt-handler to
40974673 * preemptible context
40984674 *
4099
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
4675
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
41004676 * then at the next:
41014677 *
41024678 * - cond_resched() call
....@@ -4110,6 +4686,7 @@
41104686 {
41114687 struct task_struct *prev, *next;
41124688 unsigned long *switch_count;
4689
+ unsigned long prev_state;
41134690 struct rq_flags rf;
41144691 struct rq *rq;
41154692 int cpu;
....@@ -4118,7 +4695,7 @@
41184695 rq = cpu_rq(cpu);
41194696 prev = rq->curr;
41204697
4121
- schedule_debug(prev);
4698
+ schedule_debug(prev, preempt);
41224699
41234700 if (sched_feat(HRTICK))
41244701 hrtick_clear(rq);
....@@ -4129,28 +4706,59 @@
41294706 /*
41304707 * Make sure that signal_pending_state()->signal_pending() below
41314708 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
4132
- * done by the caller to avoid the race with signal_wake_up().
4709
+ * done by the caller to avoid the race with signal_wake_up():
41334710 *
4134
- * The membarrier system call requires a full memory barrier
4711
+ * __set_current_state(@state) signal_wake_up()
4712
+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
4713
+ * wake_up_state(p, state)
4714
+ * LOCK rq->lock LOCK p->pi_lock
4715
+ * smp_mb__after_spinlock() smp_mb__after_spinlock()
4716
+ * if (signal_pending_state()) if (p->state & @state)
4717
+ *
4718
+ * Also, the membarrier system call requires a full memory barrier
41354719 * after coming from user-space, before storing to rq->curr.
41364720 */
41374721 rq_lock(rq, &rf);
41384722 smp_mb__after_spinlock();
4139
-
4140
- if (__migrate_disabled(prev))
4141
- migrate_disabled_sched(prev);
41424723
41434724 /* Promote REQ to ACT */
41444725 rq->clock_update_flags <<= 1;
41454726 update_rq_clock(rq);
41464727
41474728 switch_count = &prev->nivcsw;
4148
- if (!preempt && prev->state) {
4149
- if (unlikely(signal_pending_state(prev->state, prev))) {
4729
+
4730
+ /*
4731
+ * We must load prev->state once (task_struct::state is volatile), such
4732
+ * that:
4733
+ *
4734
+ * - we form a control dependency vs deactivate_task() below.
4735
+ * - ptrace_{,un}freeze_traced() can change ->state underneath us.
4736
+ */
4737
+ prev_state = prev->state;
4738
+ if (!preempt && prev_state) {
4739
+ if (signal_pending_state(prev_state, prev)) {
41504740 prev->state = TASK_RUNNING;
41514741 } else {
4742
+ prev->sched_contributes_to_load =
4743
+ (prev_state & TASK_UNINTERRUPTIBLE) &&
4744
+ !(prev_state & TASK_NOLOAD) &&
4745
+ !(prev->flags & PF_FROZEN);
4746
+
4747
+ if (prev->sched_contributes_to_load)
4748
+ rq->nr_uninterruptible++;
4749
+
4750
+ /*
4751
+ * __schedule() ttwu()
4752
+ * prev_state = prev->state; if (p->on_rq && ...)
4753
+ * if (prev_state) goto out;
4754
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
4755
+ * p->state = TASK_WAKING
4756
+ *
4757
+ * Where __schedule() and ttwu() have matching control dependencies.
4758
+ *
4759
+ * After this, schedule() must not care about p->state any more.
4760
+ */
41524761 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
4153
- prev->on_rq = 0;
41544762
41554763 if (prev->in_iowait) {
41564764 atomic_inc(&rq->nr_iowait);
....@@ -4162,12 +4770,16 @@
41624770
41634771 next = pick_next_task(rq, prev, &rf);
41644772 clear_tsk_need_resched(prev);
4165
- clear_tsk_need_resched_lazy(prev);
41664773 clear_preempt_need_resched();
41674774
4775
+ trace_android_rvh_schedule(prev, next, rq);
41684776 if (likely(prev != next)) {
41694777 rq->nr_switches++;
4170
- rq->curr = next;
4778
+ /*
4779
+ * RCU users of rcu_dereference(rq->curr) may not see
4780
+ * changes to task_struct made by pick_next_task().
4781
+ */
4782
+ RCU_INIT_POINTER(rq->curr, next);
41714783 /*
41724784 * The membarrier system call requires each architecture
41734785 * to have a full memory barrier after updating
....@@ -4183,6 +4795,8 @@
41834795 * is a RELEASE barrier),
41844796 */
41854797 ++*switch_count;
4798
+
4799
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
41864800
41874801 trace_sched_switch(preempt, prev, next);
41884802
....@@ -4214,19 +4828,26 @@
42144828
42154829 static inline void sched_submit_work(struct task_struct *tsk)
42164830 {
4831
+ unsigned int task_flags;
4832
+
42174833 if (!tsk->state)
42184834 return;
42194835
4836
+ task_flags = tsk->flags;
42204837 /*
42214838 * If a worker went to sleep, notify and ask workqueue whether
42224839 * it wants to wake up a task to maintain concurrency.
42234840 * As this function is called inside the schedule() context,
42244841 * we disable preemption to avoid it calling schedule() again
4225
- * in the possible wakeup of a kworker.
4842
+ * in the possible wakeup of a kworker and because wq_worker_sleeping()
4843
+ * requires it.
42264844 */
4227
- if (tsk->flags & PF_WQ_WORKER) {
4845
+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
42284846 preempt_disable();
4229
- wq_worker_sleeping(tsk);
4847
+ if (task_flags & PF_WQ_WORKER)
4848
+ wq_worker_sleeping(tsk);
4849
+ else
4850
+ io_wq_worker_sleeping(tsk);
42304851 preempt_enable_no_resched();
42314852 }
42324853
....@@ -4243,8 +4864,12 @@
42434864
42444865 static void sched_update_worker(struct task_struct *tsk)
42454866 {
4246
- if (tsk->flags & PF_WQ_WORKER)
4247
- wq_worker_running(tsk);
4867
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
4868
+ if (tsk->flags & PF_WQ_WORKER)
4869
+ wq_worker_running(tsk);
4870
+ else
4871
+ io_wq_worker_running(tsk);
4872
+ }
42484873 }
42494874
42504875 asmlinkage __visible void __sched schedule(void)
....@@ -4346,35 +4971,10 @@
43464971 } while (need_resched());
43474972 }
43484973
4349
-#ifdef CONFIG_PREEMPT_LAZY
4974
+#ifdef CONFIG_PREEMPTION
43504975 /*
4351
- * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is
4352
- * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as
4353
- * preempt_lazy_count counter >0.
4354
- */
4355
-static __always_inline int preemptible_lazy(void)
4356
-{
4357
- if (test_thread_flag(TIF_NEED_RESCHED))
4358
- return 1;
4359
- if (current_thread_info()->preempt_lazy_count)
4360
- return 0;
4361
- return 1;
4362
-}
4363
-
4364
-#else
4365
-
4366
-static inline int preemptible_lazy(void)
4367
-{
4368
- return 1;
4369
-}
4370
-
4371
-#endif
4372
-
4373
-#ifdef CONFIG_PREEMPT
4374
-/*
4375
- * this is the entry point to schedule() from in-kernel preemption
4376
- * off of preempt_enable. Kernel preemptions off return from interrupt
4377
- * occur there and call schedule directly.
4976
+ * This is the entry point to schedule() from in-kernel preemption
4977
+ * off of preempt_enable.
43784978 */
43794979 asmlinkage __visible void __sched notrace preempt_schedule(void)
43804980 {
....@@ -4384,8 +4984,7 @@
43844984 */
43854985 if (likely(!preemptible()))
43864986 return;
4387
- if (!preemptible_lazy())
4388
- return;
4987
+
43894988 preempt_schedule_common();
43904989 }
43914990 NOKPROBE_SYMBOL(preempt_schedule);
....@@ -4410,9 +5009,6 @@
44105009 enum ctx_state prev_ctx;
44115010
44125011 if (likely(!preemptible()))
4413
- return;
4414
-
4415
- if (!preemptible_lazy())
44165012 return;
44175013
44185014 do {
....@@ -4446,10 +5042,10 @@
44465042 }
44475043 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
44485044
4449
-#endif /* CONFIG_PREEMPT */
5045
+#endif /* CONFIG_PREEMPTION */
44505046
44515047 /*
4452
- * this is the entry point to schedule() from kernel preemption
5048
+ * This is the entry point to schedule() from kernel preemption
44535049 * off of irq context.
44545050 * Note, that this is called and return with irqs disabled. This will
44555051 * protect us against recursive calling from irq.
....@@ -4477,9 +5073,22 @@
44775073 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
44785074 void *key)
44795075 {
4480
- return try_to_wake_up(curr->private, mode, wake_flags, 1);
5076
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC | WF_ANDROID_VENDOR));
5077
+ return try_to_wake_up(curr->private, mode, wake_flags);
44815078 }
44825079 EXPORT_SYMBOL(default_wake_function);
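/*
 * Illustrative sketch (editor's addition, not part of this patch):
 * default_wake_function() is what DECLARE_WAITQUEUE() installs as the
 * wait-queue entry's ->func, so a classic open-coded wait reaches
 * try_to_wake_up() through it. The helper below is hypothetical.
 */
static int wait_for_flag(wait_queue_head_t *wq, int *flag)
{
	DECLARE_WAITQUEUE(wait, current);	/* ->func = default_wake_function */

	add_wait_queue(wq, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (READ_ONCE(*flag) || signal_pending(current))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(wq, &wait);

	return READ_ONCE(*flag) ? 0 : -ERESTARTSYS;
}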
5080
+
5081
+static void __setscheduler_prio(struct task_struct *p, int prio)
5082
+{
5083
+ if (dl_prio(prio))
5084
+ p->sched_class = &dl_sched_class;
5085
+ else if (rt_prio(prio))
5086
+ p->sched_class = &rt_sched_class;
5087
+ else
5088
+ p->sched_class = &fair_sched_class;
5089
+
5090
+ p->prio = prio;
5091
+}
44835092
44845093 #ifdef CONFIG_RT_MUTEXES
44855094
....@@ -4517,6 +5126,7 @@
45175126 struct rq_flags rf;
45185127 struct rq *rq;
45195128
5129
+ trace_android_rvh_rtmutex_prepare_setprio(p, pi_task);
45205130 /* XXX used to be waiter->prio, not waiter->task->prio */
45215131 prio = __rt_effective_prio(pi_task, p->normal_prio);
45225132
....@@ -4591,31 +5201,29 @@
45915201 if (!dl_prio(p->normal_prio) ||
45925202 (pi_task && dl_prio(pi_task->prio) &&
45935203 dl_entity_preempt(&pi_task->dl, &p->dl))) {
4594
- p->dl.dl_boosted = 1;
5204
+ p->dl.pi_se = pi_task->dl.pi_se;
45955205 queue_flag |= ENQUEUE_REPLENISH;
4596
- } else
4597
- p->dl.dl_boosted = 0;
4598
- p->sched_class = &dl_sched_class;
5206
+ } else {
5207
+ p->dl.pi_se = &p->dl;
5208
+ }
45995209 } else if (rt_prio(prio)) {
46005210 if (dl_prio(oldprio))
4601
- p->dl.dl_boosted = 0;
5211
+ p->dl.pi_se = &p->dl;
46025212 if (oldprio < prio)
46035213 queue_flag |= ENQUEUE_HEAD;
4604
- p->sched_class = &rt_sched_class;
46055214 } else {
46065215 if (dl_prio(oldprio))
4607
- p->dl.dl_boosted = 0;
5216
+ p->dl.pi_se = &p->dl;
46085217 if (rt_prio(oldprio))
46095218 p->rt.timeout = 0;
4610
- p->sched_class = &fair_sched_class;
46115219 }
46125220
4613
- p->prio = prio;
5221
+ __setscheduler_prio(p, prio);
46145222
46155223 if (queued)
46165224 enqueue_task(rq, p, queue_flag);
46175225 if (running)
4618
- set_curr_task(rq, p);
5226
+ set_next_task(rq, p);
46195227
46205228 check_class_changed(rq, p, prev_class, oldprio);
46215229 out_unlock:
....@@ -4635,12 +5243,13 @@
46355243
46365244 void set_user_nice(struct task_struct *p, long nice)
46375245 {
4638
- bool queued, running;
4639
- int old_prio, delta;
5246
+ bool queued, running, allowed = false;
5247
+ int old_prio;
46405248 struct rq_flags rf;
46415249 struct rq *rq;
46425250
4643
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
5251
+ trace_android_rvh_set_user_nice(p, &nice, &allowed);
5252
+ if ((task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) && !allowed)
46445253 return;
46455254 /*
46465255 * We have to be careful, if called from sys_setpriority(),
....@@ -4667,22 +5276,21 @@
46675276 put_prev_task(rq, p);
46685277
46695278 p->static_prio = NICE_TO_PRIO(nice);
4670
- set_load_weight(p, true);
5279
+ set_load_weight(p);
46715280 old_prio = p->prio;
46725281 p->prio = effective_prio(p);
4673
- delta = p->prio - old_prio;
46745282
4675
- if (queued) {
5283
+ if (queued)
46765284 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
4677
- /*
4678
- * If the task increased its priority or is running and
4679
- * lowered its priority, then reschedule its CPU:
4680
- */
4681
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
4682
- resched_curr(rq);
4683
- }
46845285 if (running)
4685
- set_curr_task(rq, p);
5286
+ set_next_task(rq, p);
5287
+
5288
+ /*
5289
+ * If the task increased its priority or is running and
5290
+ * lowered its priority, then reschedule its CPU:
5291
+ */
5292
+ p->sched_class->prio_changed(rq, p, old_prio);
5293
+
46865294 out_unlock:
46875295 task_rq_unlock(rq, p, &rf);
46885296 }
....@@ -4767,7 +5375,7 @@
47675375 return 0;
47685376
47695377 #ifdef CONFIG_SMP
4770
- if (!llist_empty(&rq->wake_list))
5378
+ if (rq->ttwu_pending)
47715379 return 0;
47725380 #endif
47735381
....@@ -4790,6 +5398,7 @@
47905398
47915399 return 1;
47925400 }
5401
+EXPORT_SYMBOL_GPL(available_idle_cpu);
47935402
47945403 /**
47955404 * idle_task - return the idle task for a given CPU.
....@@ -4841,36 +5450,7 @@
48415450 */
48425451 p->rt_priority = attr->sched_priority;
48435452 p->normal_prio = normal_prio(p);
4844
- set_load_weight(p, true);
4845
-}
4846
-
4847
-/* Actually do priority change: must hold pi & rq lock. */
4848
-static void __setscheduler(struct rq *rq, struct task_struct *p,
4849
- const struct sched_attr *attr, bool keep_boost)
4850
-{
4851
- /*
4852
- * If params can't change scheduling class changes aren't allowed
4853
- * either.
4854
- */
4855
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
4856
- return;
4857
-
4858
- __setscheduler_params(p, attr);
4859
-
4860
- /*
4861
- * Keep a potential priority boosting if called from
4862
- * sched_setscheduler().
4863
- */
4864
- p->prio = normal_prio(p);
4865
- if (keep_boost)
4866
- p->prio = rt_effective_prio(p, p->prio);
4867
-
4868
- if (dl_prio(p->prio))
4869
- p->sched_class = &dl_sched_class;
4870
- else if (rt_prio(p->prio))
4871
- p->sched_class = &rt_sched_class;
4872
- else
4873
- p->sched_class = &fair_sched_class;
5453
+ set_load_weight(p);
48745454 }
48755455
48765456 /*
....@@ -4893,10 +5473,8 @@
48935473 const struct sched_attr *attr,
48945474 bool user, bool pi)
48955475 {
4896
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4897
- MAX_RT_PRIO - 1 - attr->sched_priority;
4898
- int retval, oldprio, oldpolicy = -1, queued, running;
4899
- int new_effective_prio, policy = attr->sched_policy;
5476
+ int oldpolicy = -1, policy = attr->sched_policy;
5477
+ int retval, oldprio, newprio, queued, running;
49005478 const struct sched_class *prev_class;
49015479 struct rq_flags rf;
49025480 int reset_on_fork;
....@@ -4969,7 +5547,7 @@
49695547 * Treat SCHED_IDLE as nice 20. Only allow a switch to
49705548 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
49715549 */
4972
- if (idle_policy(p->policy) && !idle_policy(policy)) {
5550
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
49735551 if (!can_nice(p, task_nice(p)))
49745552 return -EPERM;
49755553 }
....@@ -4980,6 +5558,10 @@
49805558
49815559 /* Normal users shall not reset the sched_reset_on_fork flag: */
49825560 if (p->sched_reset_on_fork && !reset_on_fork)
5561
+ return -EPERM;
5562
+
5563
+ /* Can't change util-clamps */
5564
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
49835565 return -EPERM;
49845566 }
49855567
....@@ -5013,8 +5595,8 @@
50135595 * Changing the policy of the stop threads its a very bad idea:
50145596 */
50155597 if (p == rq->stop) {
5016
- task_rq_unlock(rq, p, &rf);
5017
- return -EINVAL;
5598
+ retval = -EINVAL;
5599
+ goto unlock;
50185600 }
50195601
50205602 /*
....@@ -5032,8 +5614,8 @@
50325614 goto change;
50335615
50345616 p->sched_reset_on_fork = reset_on_fork;
5035
- task_rq_unlock(rq, p, &rf);
5036
- return 0;
5617
+ retval = 0;
5618
+ goto unlock;
50375619 }
50385620 change:
50395621
....@@ -5046,8 +5628,8 @@
50465628 if (rt_bandwidth_enabled() && rt_policy(policy) &&
50475629 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
50485630 !task_group_is_autogroup(task_group(p))) {
5049
- task_rq_unlock(rq, p, &rf);
5050
- return -EPERM;
5631
+ retval = -EPERM;
5632
+ goto unlock;
50515633 }
50525634 #endif
50535635 #ifdef CONFIG_SMP
....@@ -5062,8 +5644,8 @@
50625644 */
50635645 if (!cpumask_subset(span, p->cpus_ptr) ||
50645646 rq->rd->dl_bw.bw == 0) {
5065
- task_rq_unlock(rq, p, &rf);
5066
- return -EPERM;
5647
+ retval = -EPERM;
5648
+ goto unlock;
50675649 }
50685650 }
50695651 #endif
....@@ -5082,13 +5664,14 @@
50825664 * is available.
50835665 */
50845666 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
5085
- task_rq_unlock(rq, p, &rf);
5086
- return -EBUSY;
5667
+ retval = -EBUSY;
5668
+ goto unlock;
50875669 }
50885670
50895671 p->sched_reset_on_fork = reset_on_fork;
50905672 oldprio = p->prio;
50915673
5674
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
50925675 if (pi) {
50935676 /*
50945677 * Take priority boosted tasks into account. If the new
....@@ -5097,8 +5680,8 @@
50975680 * the runqueue. This will be done when the task deboost
50985681 * itself.
50995682 */
5100
- new_effective_prio = rt_effective_prio(p, newprio);
5101
- if (new_effective_prio == oldprio)
5683
+ newprio = rt_effective_prio(p, newprio);
5684
+ if (newprio == oldprio)
51025685 queue_flags &= ~DEQUEUE_MOVE;
51035686 }
51045687
....@@ -5111,7 +5694,11 @@
51115694
51125695 prev_class = p->sched_class;
51135696
5114
- __setscheduler(rq, p, attr, pi);
5697
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
5698
+ __setscheduler_params(p, attr);
5699
+ __setscheduler_prio(p, newprio);
5700
+ trace_android_rvh_setscheduler(p);
5701
+ }
51155702 __setscheduler_uclamp(p, attr);
51165703
51175704 if (queued) {
....@@ -5125,7 +5712,7 @@
51255712 enqueue_task(rq, p, queue_flags);
51265713 }
51275714 if (running)
5128
- set_curr_task(rq, p);
5715
+ set_next_task(rq, p);
51295716
51305717 check_class_changed(rq, p, prev_class, oldprio);
51315718
....@@ -5141,6 +5728,10 @@
51415728 preempt_enable();
51425729
51435730 return 0;
5731
+
5732
+unlock:
5733
+ task_rq_unlock(rq, p, &rf);
5734
+ return retval;
51445735 }
51455736
51465737 static int _sched_setscheduler(struct task_struct *p, int policy,
....@@ -5152,6 +5743,14 @@
51525743 .sched_nice = PRIO_TO_NICE(p->static_prio),
51535744 };
51545745
5746
+ if (IS_ENABLED(CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO) &&
5747
+ ((policy == SCHED_FIFO) || (policy == SCHED_RR))) {
5748
+ attr.sched_priority /= 2;
5749
+ if (!check)
5750
+ attr.sched_priority += MAX_RT_PRIO / 2;
5751
+ if (!attr.sched_priority)
5752
+ attr.sched_priority = 1;
5753
+ }
51555754 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
51565755 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
51575756 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
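
A quick check of the CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO remapping above, assuming MAX_RT_PRIO == 100: a checked request (the legacy sched_setscheduler() syscall path) for SCHED_FIFO priority 90 is halved to 45, so that path is squeezed into the lower band [1, 49]; a nocheck request (in-kernel callers of sched_setscheduler_nocheck(), such as sched_set_fifo() below) for priority 50 becomes 25 + 50 = 75, landing in the upper band [50, 99]; and a checked request for priority 1 would truncate to 0 and is bumped back up to 1. On this vendor tree, RT priorities set through the nocheck path therefore always outrank those set through the checked path.
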
....@@ -5166,6 +5765,8 @@
51665765 * @p: the task in question.
51675766 * @policy: new policy.
51685767 * @param: structure containing the new RT priority.
5768
+ *
5769
+ * Use sched_set_fifo(), read its comment.
51695770 *
51705771 * Return: 0 on success. An error code otherwise.
51715772 *
....@@ -5188,6 +5789,7 @@
51885789 {
51895790 return __sched_setscheduler(p, attr, false, true);
51905791 }
5792
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
51915793
51925794 /**
51935795 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
....@@ -5208,6 +5810,51 @@
52085810 return _sched_setscheduler(p, policy, param, false);
52095811 }
52105812 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
5813
+
5814
+/*
5815
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
5816
+ * incapable of resource management, which is the one thing an OS really should
5817
+ * be doing.
5818
+ *
5819
+ * This is of course the reason it is limited to privileged users only.
5820
+ *
5821
+ * Worse still; it is fundamentally impossible to compose static priority
5822
+ * workloads. You cannot take two correctly working static prio workloads
5823
+ * and smash them together and still expect them to work.
5824
+ *
5825
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
5826
+ *
5827
+ * MAX_RT_PRIO / 2
5828
+ *
5829
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
5830
+ * know enough information to make a sensible choice.
5831
+ */
5832
+void sched_set_fifo(struct task_struct *p)
5833
+{
5834
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
5835
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5836
+}
5837
+EXPORT_SYMBOL_GPL(sched_set_fifo);
5838
+
5839
+/*
5840
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
5841
+ */
5842
+void sched_set_fifo_low(struct task_struct *p)
5843
+{
5844
+ struct sched_param sp = { .sched_priority = 1 };
5845
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5846
+}
5847
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
5848
+
5849
+void sched_set_normal(struct task_struct *p, int nice)
5850
+{
5851
+ struct sched_attr attr = {
5852
+ .sched_policy = SCHED_NORMAL,
5853
+ .sched_nice = nice,
5854
+ };
5855
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
5856
+}
5857
+EXPORT_SYMBOL_GPL(sched_set_normal);
52115858
52125859 static int
52135860 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
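
The helpers above exist so that drivers stop hard-coding "priority 99" style requests and leave the exact value to the administrator. A minimal sketch of the intended use from a hypothetical module (the worker function and names are invented for illustration; kthread_run() and sched_set_fifo() are the real APIs):

        #include <linux/err.h>
        #include <linux/jiffies.h>
        #include <linux/kthread.h>
        #include <linux/sched.h>

        static int my_poll_fn(void *data)               /* hypothetical worker */
        {
                while (!kthread_should_stop())
                        schedule_timeout_interruptible(HZ);
                return 0;
        }

        static int my_start_worker(void)
        {
                struct task_struct *t = kthread_run(my_poll_fn, NULL, "my_poll");

                if (IS_ERR(t))
                        return PTR_ERR(t);
                /* asks for MAX_RT_PRIO / 2; the vendor remap above turns that into 75 */
                sched_set_fifo(t);
                return 0;
        }

If the thread only needs to stay ahead of SCHED_NORMAL work, sched_set_fifo_low() is the lighter default; either choice can still be re-tuned from userspace with chrt(1).
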
....@@ -5239,9 +5886,6 @@
52395886 u32 size;
52405887 int ret;
52415888
5242
- if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
5243
- return -EFAULT;
5244
-
52455889 /* Zero the full structure, so that a short copy will be nice: */
52465890 memset(attr, 0, sizeof(*attr));
52475891
....@@ -5249,44 +5893,18 @@
52495893 if (ret)
52505894 return ret;
52515895
5252
- /* Bail out on silly large: */
5253
- if (size > PAGE_SIZE)
5254
- goto err_size;
5255
-
52565896 /* ABI compatibility quirk: */
52575897 if (!size)
52585898 size = SCHED_ATTR_SIZE_VER0;
5259
-
5260
- if (size < SCHED_ATTR_SIZE_VER0)
5899
+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
52615900 goto err_size;
52625901
5263
- /*
5264
- * If we're handed a bigger struct than we know of,
5265
- * ensure all the unknown bits are 0 - i.e. new
5266
- * user-space does not rely on any kernel feature
5267
- * extensions we dont know about yet.
5268
- */
5269
- if (size > sizeof(*attr)) {
5270
- unsigned char __user *addr;
5271
- unsigned char __user *end;
5272
- unsigned char val;
5273
-
5274
- addr = (void __user *)uattr + sizeof(*attr);
5275
- end = (void __user *)uattr + size;
5276
-
5277
- for (; addr < end; addr++) {
5278
- ret = get_user(val, addr);
5279
- if (ret)
5280
- return ret;
5281
- if (val)
5282
- goto err_size;
5283
- }
5284
- size = sizeof(*attr);
5902
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
5903
+ if (ret) {
5904
+ if (ret == -E2BIG)
5905
+ goto err_size;
5906
+ return ret;
52855907 }
5286
-
5287
- ret = copy_from_user(attr, uattr, size);
5288
- if (ret)
5289
- return -EFAULT;
52905908
52915909 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
52925910 size < SCHED_ATTR_SIZE_VER1)
....@@ -5303,6 +5921,16 @@
53035921 err_size:
53045922 put_user(sizeof(*attr), &uattr->size);
53055923 return -E2BIG;
5924
+}
5925
+
5926
+static void get_params(struct task_struct *p, struct sched_attr *attr)
5927
+{
5928
+ if (task_has_dl_policy(p))
5929
+ __getparam_dl(p, attr);
5930
+ else if (task_has_rt_policy(p))
5931
+ attr->sched_priority = p->rt_priority;
5932
+ else
5933
+ attr->sched_nice = task_nice(p);
53065934 }
53075935
53085936 /**
....@@ -5366,6 +5994,8 @@
53665994 rcu_read_unlock();
53675995
53685996 if (likely(p)) {
5997
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
5998
+ get_params(p, &attr);
53695999 retval = sched_setattr(p, &attr);
53706000 put_task_struct(p);
53716001 }
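
With the SCHED_FLAG_KEEP_PARAMS handling added above, userspace can update only a task's utilization clamps without touching its policy or priority (per the "Can't change util-clamps" check added earlier in this diff, callers without CAP_SYS_NICE get -EPERM). A hedged userspace sketch using the raw syscall, since glibc provides no wrapper; the flag names are the uapi ones and the pid/clamp values are arbitrary:

        #define _GNU_SOURCE
        #include <stdint.h>
        #include <string.h>
        #include <sys/syscall.h>
        #include <sys/types.h>
        #include <unistd.h>
        #include <linux/sched.h>                /* SCHED_FLAG_*      */
        #include <linux/sched/types.h>          /* struct sched_attr */

        static int set_util_clamp(pid_t pid, uint32_t umin, uint32_t umax)
        {
                struct sched_attr attr;

                memset(&attr, 0, sizeof(attr));
                attr.size = sizeof(attr);       /* checked by copy_struct_from_user() */
                attr.sched_flags = SCHED_FLAG_KEEP_ALL |
                                   SCHED_FLAG_UTIL_CLAMP_MIN |
                                   SCHED_FLAG_UTIL_CLAMP_MAX;
                attr.sched_util_min = umin;     /* 0..1024 */
                attr.sched_util_max = umax;
                return syscall(SYS_sched_setattr, pid, &attr, 0);
        }

KEEP_POLICY keeps the current policy, KEEP_PARAMS makes the kernel refill the current nice/RT/DL parameters via get_params(), and only the clamp values change.
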
....@@ -5459,7 +6089,7 @@
54596089 {
54606090 unsigned int ksize = sizeof(*kattr);
54616091
5462
- if (!access_ok(VERIFY_WRITE, uattr, usize))
6092
+ if (!access_ok(uattr, usize))
54636093 return -EFAULT;
54646094
54656095 /*
....@@ -5487,7 +6117,7 @@
54876117 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
54886118 * @pid: the pid in question.
54896119 * @uattr: structure containing the extended parameters.
5490
- * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
6120
+ * @usize: sizeof(attr) for fwd/bwd comp.
54916121 * @flags: for future extension.
54926122 */
54936123 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
....@@ -5514,14 +6144,15 @@
55146144 kattr.sched_policy = p->policy;
55156145 if (p->sched_reset_on_fork)
55166146 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5517
- if (task_has_dl_policy(p))
5518
- __getparam_dl(p, &kattr);
5519
- else if (task_has_rt_policy(p))
5520
- kattr.sched_priority = p->rt_priority;
5521
- else
5522
- kattr.sched_nice = task_nice(p);
6147
+ get_params(p, &kattr);
6148
+ kattr.sched_flags &= SCHED_FLAG_ALL;
55236149
55246150 #ifdef CONFIG_UCLAMP_TASK
6151
+ /*
6152
+ * This could race with another potential updater, but this is fine
6153
+ * because it'll correctly read the old or the new value. We don't need
6154
+ * to guarantee who wins the race as long as it doesn't return garbage.
6155
+ */
55256156 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
55266157 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
55276158 #endif
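
Reading the clamps back goes through sched_getattr(), whose third argument is the structure size the caller was built against; that is what the ksize/usize trimming above implements. A small fragment (same caveats and headers as the sched_setattr() sketch earlier, plus <stdio.h>, with pid as before):

        struct sched_attr attr;

        if (syscall(SYS_sched_getattr, pid, &attr, sizeof(attr), 0) == 0)
                printf("policy=%u util_min=%u util_max=%u\n",
                       attr.sched_policy, attr.sched_util_min, attr.sched_util_max);

As the race comment above notes, the values reported are the last requested clamps (uclamp_req), read without any lock: a concurrent writer may win, but the reader sees either the old or the new value, never garbage.
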
....@@ -5540,6 +6171,7 @@
55406171 cpumask_var_t cpus_allowed, new_mask;
55416172 struct task_struct *p;
55426173 int retval;
6174
+ int skip = 0;
55436175
55446176 rcu_read_lock();
55456177
....@@ -5575,6 +6207,9 @@
55756207 rcu_read_unlock();
55766208 }
55776209
6210
+ trace_android_vh_sched_setaffinity_early(p, in_mask, &skip);
6211
+ if (skip)
6212
+ goto out_free_new_mask;
55786213 retval = security_task_setscheduler(p);
55796214 if (retval)
55806215 goto out_free_new_mask;
....@@ -5615,6 +6250,9 @@
56156250 goto again;
56166251 }
56176252 }
6253
+
6254
+ trace_android_rvh_sched_setaffinity(p, in_mask, &retval);
6255
+
56186256 out_free_new_mask:
56196257 free_cpumask_var(new_mask);
56206258 out_free_cpus_allowed:
....@@ -5623,7 +6261,6 @@
56236261 put_task_struct(p);
56246262 return retval;
56256263 }
5626
-EXPORT_SYMBOL_GPL(sched_setaffinity);
56276264
56286265 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
56296266 struct cpumask *new_mask)
....@@ -5742,6 +6379,8 @@
57426379 schedstat_inc(rq->yld_count);
57436380 current->sched_class->yield_task(rq);
57446381
6382
+ trace_android_rvh_do_sched_yield(rq);
6383
+
57456384 preempt_disable();
57466385 rq_unlock_irq(rq, &rf);
57476386 sched_preempt_enable_no_resched();
....@@ -5755,7 +6394,7 @@
57556394 return 0;
57566395 }
57576396
5758
-#ifndef CONFIG_PREEMPT
6397
+#ifndef CONFIG_PREEMPTION
57596398 int __sched _cond_resched(void)
57606399 {
57616400 if (should_resched(0)) {
....@@ -5772,7 +6411,7 @@
57726411 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
57736412 * call schedule, and on return reacquire the lock.
57746413 *
5775
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6414
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
57766415 * operations here to prevent schedule() from being called twice (once via
57776416 * spin_unlock(), once by hand).
57786417 */
....@@ -5876,7 +6515,7 @@
58766515 if (task_running(p_rq, p) || p->state)
58776516 goto out_unlock;
58786517
5879
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
6518
+ yielded = curr->sched_class->yield_to_task(rq, p);
58806519 if (yielded) {
58816520 schedstat_inc(rq->yld_count);
58826521 /*
....@@ -6042,7 +6681,7 @@
60426681 * an error code.
60436682 */
60446683 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6045
- struct timespec __user *, interval)
6684
+ struct __kernel_timespec __user *, interval)
60466685 {
60476686 struct timespec64 t;
60486687 int retval = sched_rr_get_interval(pid, &t);
....@@ -6053,16 +6692,15 @@
60536692 return retval;
60546693 }
60556694
6056
-#ifdef CONFIG_COMPAT
6057
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
6058
- compat_pid_t, pid,
6059
- struct compat_timespec __user *, interval)
6695
+#ifdef CONFIG_COMPAT_32BIT_TIME
6696
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
6697
+ struct old_timespec32 __user *, interval)
60606698 {
60616699 struct timespec64 t;
60626700 int retval = sched_rr_get_interval(pid, &t);
60636701
60646702 if (retval == 0)
6065
- retval = compat_put_timespec64(&t, interval);
6703
+ retval = put_old_timespec32(&t, interval);
60666704 return retval;
60676705 }
60686706 #endif
....@@ -6075,10 +6713,10 @@
60756713 if (!try_get_task_stack(p))
60766714 return;
60776715
6078
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
6716
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
60796717
60806718 if (p->state == TASK_RUNNING)
6081
- printk(KERN_CONT " running task ");
6719
+ pr_cont(" running task ");
60826720 #ifdef CONFIG_DEBUG_STACK_USAGE
60836721 free = stack_not_used(p);
60846722 #endif
....@@ -6087,12 +6725,13 @@
60876725 if (pid_alive(p))
60886726 ppid = task_pid_nr(rcu_dereference(p->real_parent));
60896727 rcu_read_unlock();
6090
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6091
- task_pid_nr(p), ppid,
6728
+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
6729
+ free, task_pid_nr(p), ppid,
60926730 (unsigned long)task_thread_info(p)->flags);
60936731
60946732 print_worker_info(KERN_INFO, p);
6095
- show_stack(p, NULL);
6733
+ trace_android_vh_sched_show_task(p);
6734
+ show_stack(p, NULL, KERN_INFO);
60966735 put_task_stack(p);
60976736 }
60986737 EXPORT_SYMBOL_GPL(sched_show_task);
....@@ -6123,13 +6762,6 @@
61236762 {
61246763 struct task_struct *g, *p;
61256764
6126
-#if BITS_PER_LONG == 32
6127
- printk(KERN_INFO
6128
- " task PC stack pid father\n");
6129
-#else
6130
- printk(KERN_INFO
6131
- " task PC stack pid father\n");
6132
-#endif
61336765 rcu_read_lock();
61346766 for_each_process_thread(g, p) {
61356767 /*
....@@ -6165,7 +6797,7 @@
61656797 * NOTE: this function does not set the idle thread's NEED_RESCHED
61666798 * flag, to make booting more robust.
61676799 */
6168
-void init_idle(struct task_struct *idle, int cpu)
6800
+void __init init_idle(struct task_struct *idle, int cpu)
61696801 {
61706802 struct rq *rq = cpu_rq(cpu);
61716803 unsigned long flags;
....@@ -6178,9 +6810,6 @@
61786810 idle->state = TASK_RUNNING;
61796811 idle->se.exec_start = sched_clock();
61806812 idle->flags |= PF_IDLE;
6181
-
6182
- scs_task_reset(idle);
6183
- kasan_unpoison_task_stack(idle);
61846813
61856814 #ifdef CONFIG_SMP
61866815 /*
....@@ -6205,7 +6834,8 @@
62056834 __set_task_cpu(idle, cpu);
62066835 rcu_read_unlock();
62076836
6208
- rq->curr = rq->idle = idle;
6837
+ rq->idle = idle;
6838
+ rcu_assign_pointer(rq->curr, idle);
62096839 idle->on_rq = TASK_ON_RQ_QUEUED;
62106840 #ifdef CONFIG_SMP
62116841 idle->on_cpu = 1;
....@@ -6215,9 +6845,7 @@
62156845
62166846 /* Set the preempt count _outside_ the spinlocks! */
62176847 init_idle_preempt_count(idle, cpu);
6218
-#ifdef CONFIG_HAVE_PREEMPT_LAZY
6219
- task_thread_info(idle)->preempt_lazy_count = 0;
6220
-#endif
6848
+
62216849 /*
62226850 * The idle tasks have their own, simple scheduling class:
62236851 */
....@@ -6245,7 +6873,7 @@
62456873 }
62466874
62476875 int task_can_attach(struct task_struct *p,
6248
- const struct cpumask *cs_cpus_allowed)
6876
+ const struct cpumask *cs_effective_cpus)
62496877 {
62506878 int ret = 0;
62516879
....@@ -6264,8 +6892,13 @@
62646892 }
62656893
62666894 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
6267
- cs_cpus_allowed))
6268
- ret = dl_task_can_attach(p, cs_cpus_allowed);
6895
+ cs_effective_cpus)) {
6896
+ int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
6897
+
6898
+ if (unlikely(cpu >= nr_cpu_ids))
6899
+ return -EINVAL;
6900
+ ret = dl_cpu_busy(cpu, p);
6901
+ }
62696902
62706903 out:
62716904 return ret;
....@@ -6316,13 +6949,12 @@
63166949 if (queued)
63176950 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
63186951 if (running)
6319
- set_curr_task(rq, p);
6952
+ set_next_task(rq, p);
63206953 task_rq_unlock(rq, p, &rf);
63216954 }
63226955 #endif /* CONFIG_NUMA_BALANCING */
63236956
63246957 #ifdef CONFIG_HOTPLUG_CPU
6325
-
63266958 /*
63276959 * Ensure that the idle task is using init_mm right before its CPU goes
63286960 * offline.
....@@ -6358,21 +6990,22 @@
63586990 atomic_long_add(delta, &calc_load_tasks);
63596991 }
63606992
6361
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
6993
+static struct task_struct *__pick_migrate_task(struct rq *rq)
63626994 {
6995
+ const struct sched_class *class;
6996
+ struct task_struct *next;
6997
+
6998
+ for_each_class(class) {
6999
+ next = class->pick_next_task(rq);
7000
+ if (next) {
7001
+ next->sched_class->put_prev_task(rq, next);
7002
+ return next;
7003
+ }
7004
+ }
7005
+
7006
+ /* The idle class should always have a runnable task */
7007
+ BUG();
63637008 }
6364
-
6365
-static const struct sched_class fake_sched_class = {
6366
- .put_prev_task = put_prev_task_fake,
6367
-};
6368
-
6369
-static struct task_struct fake_task = {
6370
- /*
6371
- * Avoid pull_{rt,dl}_task()
6372
- */
6373
- .prio = MAX_PRIO + 1,
6374
- .sched_class = &fake_sched_class,
6375
-};
63767009
63777010 /*
63787011 * Migrate all tasks from the rq, sleeping tasks will be migrated by
....@@ -6381,11 +7014,14 @@
63817014 * Called with rq->lock held even though we're in stop_machine() and
63827015 * there's no concurrency possible, we hold the required locks anyway
63837016 * because of lock validation efforts.
7017
+ *
7018
+ * force: if false, the function will skip CPU pinned kthreads.
63847019 */
6385
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
7020
+static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, bool force)
63867021 {
63877022 struct rq *rq = dead_rq;
6388
- struct task_struct *next, *stop = rq->stop;
7023
+ struct task_struct *next, *tmp, *stop = rq->stop;
7024
+ LIST_HEAD(percpu_kthreads);
63897025 struct rq_flags orf = *rf;
63907026 int dest_cpu;
63917027
....@@ -6407,6 +7043,11 @@
64077043 */
64087044 update_rq_clock(rq);
64097045
7046
+#ifdef CONFIG_SCHED_DEBUG
7047
+ /* note the clock update in orf */
7048
+ orf.clock_update_flags |= RQCF_UPDATED;
7049
+#endif
7050
+
64107051 for (;;) {
64117052 /*
64127053 * There's this thread running, bail when that's the only
....@@ -6415,14 +7056,21 @@
64157056 if (rq->nr_running == 1)
64167057 break;
64177058
6418
- /*
6419
- * pick_next_task() assumes pinned rq->lock:
6420
- */
6421
- next = pick_next_task(rq, &fake_task, rf);
6422
- BUG_ON(!next);
6423
- put_prev_task(rq, next);
7059
+ next = __pick_migrate_task(rq);
64247060
6425
- WARN_ON_ONCE(__migrate_disabled(next));
7061
+ /*
7062
+ * Argh ... no iterator for tasks, we need to remove the
7063
+ * kthread from the run-queue to continue.
7064
+ */
7065
+ if (!force && is_per_cpu_kthread(next)) {
7066
+ INIT_LIST_HEAD(&next->percpu_kthread_node);
7067
+ list_add(&next->percpu_kthread_node, &percpu_kthreads);
7068
+
7069
+ /* DEQUEUE_SAVE not used due to move_entity in rt */
7070
+ deactivate_task(rq, next,
7071
+ DEQUEUE_NOCLOCK);
7072
+ continue;
7073
+ }
64267074
64277075 /*
64287076 * Rules for changing task_struct::cpus_mask are holding
....@@ -6442,7 +7090,14 @@
64427090 * changed the task, WARN if weird stuff happened, because in
64437091 * that case the above rq->lock drop is a fail too.
64447092 */
6445
- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
7093
+ if (task_rq(next) != rq || !task_on_rq_queued(next)) {
7094
+ /*
7095
+ * In the !force case, there is a hole between
7096
+ * rq_unlock() and rq_relock(), where another CPU might
7097
+ * not observe an up to date cpu_active_mask and try to
7098
+ * move tasks around.
7099
+ */
7100
+ WARN_ON(force);
64467101 raw_spin_unlock(&next->pi_lock);
64477102 continue;
64487103 }
....@@ -6459,7 +7114,49 @@
64597114 raw_spin_unlock(&next->pi_lock);
64607115 }
64617116
7117
+ list_for_each_entry_safe(next, tmp, &percpu_kthreads,
7118
+ percpu_kthread_node) {
7119
+
7120
+ /* ENQUEUE_RESTORE not used due to move_entity in rt */
7121
+ activate_task(rq, next, ENQUEUE_NOCLOCK);
7122
+ list_del(&next->percpu_kthread_node);
7123
+ }
7124
+
64627125 rq->stop = stop;
7126
+}
7127
+
7128
+static int drain_rq_cpu_stop(void *data)
7129
+{
7130
+ struct rq *rq = this_rq();
7131
+ struct rq_flags rf;
7132
+
7133
+ rq_lock_irqsave(rq, &rf);
7134
+ migrate_tasks(rq, &rf, false);
7135
+ rq_unlock_irqrestore(rq, &rf);
7136
+
7137
+ return 0;
7138
+}
7139
+
7140
+int sched_cpu_drain_rq(unsigned int cpu)
7141
+{
7142
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7143
+ struct cpu_stop_done *rq_drain_done = &(cpu_rq(cpu)->drain_done);
7144
+
7145
+ if (idle_cpu(cpu)) {
7146
+ rq_drain->done = NULL;
7147
+ return 0;
7148
+ }
7149
+
7150
+ return stop_one_cpu_async(cpu, drain_rq_cpu_stop, NULL, rq_drain,
7151
+ rq_drain_done);
7152
+}
7153
+
7154
+void sched_cpu_drain_rq_wait(unsigned int cpu)
7155
+{
7156
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7157
+
7158
+ if (rq_drain->done)
7159
+ cpu_stop_work_wait(rq_drain);
64637160 }
64647161 #endif /* CONFIG_HOTPLUG_CPU */
64657162
....@@ -6531,8 +7228,10 @@
65317228 static int cpuset_cpu_inactive(unsigned int cpu)
65327229 {
65337230 if (!cpuhp_tasks_frozen) {
6534
- if (dl_cpu_busy(cpu))
6535
- return -EBUSY;
7231
+ int ret = dl_cpu_busy(cpu, NULL);
7232
+
7233
+ if (ret)
7234
+ return ret;
65367235 cpuset_update_active_cpus();
65377236 } else {
65387237 num_cpus_frozen++;
....@@ -6581,19 +7280,27 @@
65817280 return 0;
65827281 }
65837282
6584
-int sched_cpu_deactivate(unsigned int cpu)
7283
+int sched_cpus_activate(struct cpumask *cpus)
7284
+{
7285
+ unsigned int cpu;
7286
+
7287
+ for_each_cpu(cpu, cpus) {
7288
+ if (sched_cpu_activate(cpu)) {
7289
+ for_each_cpu_and(cpu, cpus, cpu_active_mask)
7290
+ sched_cpu_deactivate(cpu);
7291
+
7292
+ return -EBUSY;
7293
+ }
7294
+ }
7295
+
7296
+ return 0;
7297
+}
7298
+
7299
+int _sched_cpu_deactivate(unsigned int cpu)
65857300 {
65867301 int ret;
65877302
65887303 set_cpu_active(cpu, false);
6589
- /*
6590
- * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
6591
- * users of this state to go away such that all new such users will
6592
- * observe it.
6593
- *
6594
- * Do sync before park smpboot threads to take care the rcu boost case.
6595
- */
6596
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
65977304
65987305 #ifdef CONFIG_SCHED_SMT
65997306 /*
....@@ -6612,6 +7319,46 @@
66127319 return ret;
66137320 }
66147321 sched_domains_numa_masks_clear(cpu);
7322
+
7323
+ update_max_interval();
7324
+
7325
+ return 0;
7326
+}
7327
+
7328
+int sched_cpu_deactivate(unsigned int cpu)
7329
+{
7330
+ int ret = _sched_cpu_deactivate(cpu);
7331
+
7332
+ if (ret)
7333
+ return ret;
7334
+
7335
+ /*
7336
+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
7337
+ * users of this state to go away such that all new such users will
7338
+ * observe it.
7339
+ *
7340
+ * Do sync before parking smpboot threads to take care of the rcu boost case.
7341
+ */
7342
+ synchronize_rcu();
7343
+
7344
+ return 0;
7345
+}
7346
+
7347
+int sched_cpus_deactivate_nosync(struct cpumask *cpus)
7348
+{
7349
+ unsigned int cpu;
7350
+
7351
+ for_each_cpu(cpu, cpus) {
7352
+ if (_sched_cpu_deactivate(cpu)) {
7353
+ for_each_cpu(cpu, cpus) {
7354
+ if (!cpu_active(cpu))
7355
+ sched_cpu_activate(cpu);
7356
+ }
7357
+
7358
+ return -EBUSY;
7359
+ }
7360
+ }
7361
+
66157362 return 0;
66167363 }
66177364
....@@ -6620,13 +7367,13 @@
66207367 struct rq *rq = cpu_rq(cpu);
66217368
66227369 rq->calc_load_update = calc_load_update;
6623
- update_max_interval();
66247370 }
66257371
66267372 int sched_cpu_starting(unsigned int cpu)
66277373 {
66287374 sched_rq_cpu_starting(cpu);
66297375 sched_tick_start(cpu);
7376
+ trace_android_rvh_sched_cpu_starting(cpu);
66307377 return 0;
66317378 }
66327379
....@@ -6637,7 +7384,6 @@
66377384 struct rq_flags rf;
66387385
66397386 /* Handle pending wakeups and then migrate everything off */
6640
- sched_ttwu_pending();
66417387 sched_tick_stop(cpu);
66427388
66437389 rq_lock_irqsave(rq, &rf);
....@@ -6645,12 +7391,13 @@
66457391 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
66467392 set_rq_offline(rq);
66477393 }
6648
- migrate_tasks(rq, &rf);
7394
+ migrate_tasks(rq, &rf, true);
66497395 BUG_ON(rq->nr_running != 1);
66507396 rq_unlock_irqrestore(rq, &rf);
66517397
7398
+ trace_android_rvh_sched_cpu_dying(cpu);
7399
+
66527400 calc_load_migrate(rq);
6653
- update_max_interval();
66547401 nohz_balance_exit_idle(rq);
66557402 hrtick_clear(rq);
66567403 return 0;
....@@ -6664,18 +7411,16 @@
66647411 /*
66657412 * There's no userspace yet to cause hotplug operations; hence all the
66667413 * CPU masks are stable and all blatant races in the below code cannot
6667
- * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
6668
- * but there won't be any contention on it.
7414
+ * happen.
66697415 */
6670
- cpus_read_lock();
66717416 mutex_lock(&sched_domains_mutex);
66727417 sched_init_domains(cpu_active_mask);
66737418 mutex_unlock(&sched_domains_mutex);
6674
- cpus_read_unlock();
66757419
66767420 /* Move init over to a non-isolated CPU */
66777421 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
66787422 BUG();
7423
+
66797424 sched_init_granularity();
66807425
66817426 init_sched_rt_class();
....@@ -6686,7 +7431,7 @@
66867431
66877432 static int __init migration_init(void)
66887433 {
6689
- sched_rq_cpu_starting(smp_processor_id());
7434
+ sched_cpu_starting(smp_processor_id());
66907435 return 0;
66917436 }
66927437 early_initcall(migration_init);
....@@ -6711,7 +7456,9 @@
67117456 * Every task in system belongs to this group at bootup.
67127457 */
67137458 struct task_group root_task_group;
7459
+EXPORT_SYMBOL_GPL(root_task_group);
67147460 LIST_HEAD(task_groups);
7461
+EXPORT_SYMBOL_GPL(task_groups);
67157462
67167463 /* Cacheline aligned slab cache for task_group */
67177464 static struct kmem_cache *task_group_cache __read_mostly;
....@@ -6722,19 +7469,27 @@
67227469
67237470 void __init sched_init(void)
67247471 {
6725
- int i, j;
6726
- unsigned long alloc_size = 0, ptr;
7472
+ unsigned long ptr = 0;
7473
+ int i;
7474
+
7475
+ /* Make sure the linker didn't screw up */
7476
+ BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
7477
+ &fair_sched_class + 1 != &rt_sched_class ||
7478
+ &rt_sched_class + 1 != &dl_sched_class);
7479
+#ifdef CONFIG_SMP
7480
+ BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
7481
+#endif
67277482
67287483 wait_bit_init();
67297484
67307485 #ifdef CONFIG_FAIR_GROUP_SCHED
6731
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7486
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
67327487 #endif
67337488 #ifdef CONFIG_RT_GROUP_SCHED
6734
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7489
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
67357490 #endif
6736
- if (alloc_size) {
6737
- ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7491
+ if (ptr) {
7492
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
67387493
67397494 #ifdef CONFIG_FAIR_GROUP_SCHED
67407495 root_task_group.se = (struct sched_entity **)ptr;
....@@ -6743,6 +7498,8 @@
67437498 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
67447499 ptr += nr_cpu_ids * sizeof(void **);
67457500
7501
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
7502
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
67467503 #endif /* CONFIG_FAIR_GROUP_SCHED */
67477504 #ifdef CONFIG_RT_GROUP_SCHED
67487505 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
....@@ -6795,7 +7552,6 @@
67957552 init_rt_rq(&rq->rt);
67967553 init_dl_rq(&rq->dl);
67977554 #ifdef CONFIG_FAIR_GROUP_SCHED
6798
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
67997555 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
68007556 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
68017557 /*
....@@ -6817,7 +7573,6 @@
68177573 * We achieve this by letting root_task_group's tasks sit
68187574 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
68197575 */
6820
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
68217576 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
68227577 #endif /* CONFIG_FAIR_GROUP_SCHED */
68237578
....@@ -6825,10 +7580,6 @@
68257580 #ifdef CONFIG_RT_GROUP_SCHED
68267581 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
68277582 #endif
6828
-
6829
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6830
- rq->cpu_load[j] = 0;
6831
-
68327583 #ifdef CONFIG_SMP
68337584 rq->sd = NULL;
68347585 rq->rd = NULL;
....@@ -6847,16 +7598,17 @@
68477598
68487599 rq_attach_root(rq, &def_root_domain);
68497600 #ifdef CONFIG_NO_HZ_COMMON
6850
- rq->last_load_update_tick = jiffies;
68517601 rq->last_blocked_load_update_tick = jiffies;
68527602 atomic_set(&rq->nohz_flags, 0);
7603
+
7604
+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
68537605 #endif
68547606 #endif /* CONFIG_SMP */
68557607 hrtick_rq_init(rq);
68567608 atomic_set(&rq->nr_iowait, 0);
68577609 }
68587610
6859
- set_load_weight(&init_task, false);
7611
+ set_load_weight(&init_task);
68607612
68617613 /*
68627614 * The boot idle thread does lazy MMU switching as well:
....@@ -6891,7 +7643,7 @@
68917643 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
68927644 static inline int preempt_count_equals(int preempt_offset)
68937645 {
6894
- int nested = preempt_count() + sched_rcu_preempt_depth();
7646
+ int nested = preempt_count() + rcu_preempt_depth();
68957647
68967648 return (nested == preempt_offset);
68977649 }
....@@ -6925,7 +7677,7 @@
69257677 rcu_sleep_check();
69267678
69277679 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6928
- !is_idle_task(current)) ||
7680
+ !is_idle_task(current) && !current->non_block_count) ||
69297681 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
69307682 oops_in_progress)
69317683 return;
....@@ -6941,8 +7693,8 @@
69417693 "BUG: sleeping function called from invalid context at %s:%d\n",
69427694 file, line);
69437695 printk(KERN_ERR
6944
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6945
- in_atomic(), irqs_disabled(),
7696
+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
7697
+ in_atomic(), irqs_disabled(), current->non_block_count,
69467698 current->pid, current->comm);
69477699
69487700 if (task_stack_end_corrupted(current))
....@@ -6954,13 +7706,43 @@
69547706 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
69557707 && !preempt_count_equals(preempt_offset)) {
69567708 pr_err("Preemption disabled at:");
6957
- print_ip_sym(preempt_disable_ip);
6958
- pr_cont("\n");
7709
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
69597710 }
7711
+
7712
+ trace_android_rvh_schedule_bug(NULL);
7713
+
69607714 dump_stack();
69617715 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
69627716 }
69637717 EXPORT_SYMBOL(___might_sleep);
7718
+
7719
+void __cant_sleep(const char *file, int line, int preempt_offset)
7720
+{
7721
+ static unsigned long prev_jiffy;
7722
+
7723
+ if (irqs_disabled())
7724
+ return;
7725
+
7726
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
7727
+ return;
7728
+
7729
+ if (preempt_count() > preempt_offset)
7730
+ return;
7731
+
7732
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7733
+ return;
7734
+ prev_jiffy = jiffies;
7735
+
7736
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
7737
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7738
+ in_atomic(), irqs_disabled(),
7739
+ current->pid, current->comm);
7740
+
7741
+ debug_show_held_locks(current);
7742
+ dump_stack();
7743
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7744
+}
7745
+EXPORT_SYMBOL_GPL(__cant_sleep);
69647746 #endif
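
__cant_sleep() is normally reached through the cant_sleep() annotation in <linux/kernel.h>. A minimal sketch of where such an annotation belongs (the function and per-CPU counter are invented for illustration):

        #include <linux/kernel.h>
        #include <linux/percpu.h>

        static DEFINE_PER_CPU(u64, my_hits);            /* hypothetical counter */

        /* Callers must have preemption (or interrupts) disabled. */
        static void my_hits_inc(void)
        {
                cant_sleep();   /* prints "BUG: assuming atomic context" if preemptible */
                __this_cpu_inc(my_hits);
        }

Where ___might_sleep() complains about sleeping calls made from atomic context, __cant_sleep() complains about the opposite: being entered from a fully preemptible context when the caller relies on running atomically.
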
69657747
69667748 #ifdef CONFIG_MAGIC_SYSRQ
....@@ -7029,7 +7811,7 @@
70297811
70307812 #ifdef CONFIG_IA64
70317813 /**
7032
- * set_curr_task - set the current task for a given CPU.
7814
+ * ia64_set_curr_task - set the current task for a given CPU.
70337815 * @cpu: the processor in question.
70347816 * @p: the task pointer to set.
70357817 *
....@@ -7195,8 +7977,15 @@
71957977
71967978 if (queued)
71977979 enqueue_task(rq, tsk, queue_flags);
7198
- if (running)
7199
- set_curr_task(rq, tsk);
7980
+ if (running) {
7981
+ set_next_task(rq, tsk);
7982
+ /*
7983
+ * After changing group, the running task may have joined a
7984
+ * throttled one but it's still the running task. Trigger a
7985
+ * resched to make sure that task can still run.
7986
+ */
7987
+ resched_curr(rq);
7988
+ }
72007989
72017990 task_rq_unlock(rq, tsk, &rf);
72027991 }
....@@ -7235,9 +8024,14 @@
72358024
72368025 #ifdef CONFIG_UCLAMP_TASK_GROUP
72378026 /* Propagate the effective uclamp value for the new group */
8027
+ mutex_lock(&uclamp_mutex);
8028
+ rcu_read_lock();
72388029 cpu_util_update_eff(css);
8030
+ rcu_read_unlock();
8031
+ mutex_unlock(&uclamp_mutex);
72398032 #endif
72408033
8034
+ trace_android_rvh_cpu_cgroup_online(css);
72418035 return 0;
72428036 }
72438037
....@@ -7303,6 +8097,9 @@
73038097 if (ret)
73048098 break;
73058099 }
8100
+
8101
+ trace_android_rvh_cpu_cgroup_can_attach(tset, &ret);
8102
+
73068103 return ret;
73078104 }
73088105
....@@ -7313,6 +8110,8 @@
73138110
73148111 cgroup_taskset_for_each(task, css, tset)
73158112 sched_move_task(task);
8113
+
8114
+ trace_android_rvh_cpu_cgroup_attach(tset);
73168115 }
73178116
73188117 #ifdef CONFIG_UCLAMP_TASK_GROUP
....@@ -7324,6 +8123,9 @@
73248123 unsigned int eff[UCLAMP_CNT];
73258124 enum uclamp_id clamp_id;
73268125 unsigned int clamps;
8126
+
8127
+ lockdep_assert_held(&uclamp_mutex);
8128
+ SCHED_WARN_ON(!rcu_read_lock_held());
73278129
73288130 css_for_each_descendant_pre(css, top_css) {
73298131 uc_parent = css_tg(css)->parent
....@@ -7357,7 +8159,7 @@
73578159 }
73588160
73598161 /* Immediately update descendants RUNNABLE tasks */
7360
- uclamp_update_active_tasks(css, clamps);
8162
+ uclamp_update_active_tasks(css);
73618163 }
73628164 }
73638165
....@@ -7414,6 +8216,8 @@
74148216 req = capacity_from_percent(buf);
74158217 if (req.ret)
74168218 return req.ret;
8219
+
8220
+ static_branch_enable(&sched_uclamp_used);
74178221
74188222 mutex_lock(&uclamp_mutex);
74198223 rcu_read_lock();
....@@ -7529,7 +8333,9 @@
75298333 static DEFINE_MUTEX(cfs_constraints_mutex);
75308334
75318335 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7532
-const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
8336
+static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
8337
+/* More than 203 days if BW_SHIFT equals 20. */
8338
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
75338339
75348340 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
75358341
....@@ -7555,6 +8361,12 @@
75558361 * feasibility.
75568362 */
75578363 if (period > max_cfs_quota_period)
8364
+ return -EINVAL;
8365
+
8366
+ /*
8367
+ * Bound quota to defend quota against overflow during bandwidth shift.
8368
+ */
8369
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
75588370 return -EINVAL;
75598371
75608372 /*
....@@ -7609,7 +8421,7 @@
76098421 return ret;
76108422 }
76118423
7612
-int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
8424
+static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
76138425 {
76148426 u64 quota, period;
76158427
....@@ -7624,7 +8436,7 @@
76248436 return tg_set_cfs_bandwidth(tg, period, quota);
76258437 }
76268438
7627
-long tg_get_cfs_quota(struct task_group *tg)
8439
+static long tg_get_cfs_quota(struct task_group *tg)
76288440 {
76298441 u64 quota_us;
76308442
....@@ -7637,7 +8449,7 @@
76378449 return quota_us;
76388450 }
76398451
7640
-int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
8452
+static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
76418453 {
76428454 u64 quota, period;
76438455
....@@ -7650,7 +8462,7 @@
76508462 return tg_set_cfs_bandwidth(tg, period, quota);
76518463 }
76528464
7653
-long tg_get_cfs_period(struct task_group *tg)
8465
+static long tg_get_cfs_period(struct task_group *tg)
76548466 {
76558467 u64 cfs_period_us;
76568468
....@@ -8127,172 +8939,7 @@
81278939 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
81288940 };
81298941
8130
-#undef CREATE_TRACE_POINTS
8131
-
8132
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
8133
-
8134
-static inline void
8135
-update_nr_migratory(struct task_struct *p, long delta)
8942
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
81368943 {
8137
- if (unlikely((p->sched_class == &rt_sched_class ||
8138
- p->sched_class == &dl_sched_class) &&
8139
- p->nr_cpus_allowed > 1)) {
8140
- if (p->sched_class == &rt_sched_class)
8141
- task_rq(p)->rt.rt_nr_migratory += delta;
8142
- else
8143
- task_rq(p)->dl.dl_nr_migratory += delta;
8144
- }
8944
+ trace_sched_update_nr_running_tp(rq, count);
81458945 }
8146
-
8147
-static inline void
8148
-migrate_disable_update_cpus_allowed(struct task_struct *p)
8149
-{
8150
- p->cpus_ptr = cpumask_of(smp_processor_id());
8151
- update_nr_migratory(p, -1);
8152
- p->nr_cpus_allowed = 1;
8153
-}
8154
-
8155
-static inline void
8156
-migrate_enable_update_cpus_allowed(struct task_struct *p)
8157
-{
8158
- struct rq *rq;
8159
- struct rq_flags rf;
8160
-
8161
- rq = task_rq_lock(p, &rf);
8162
- p->cpus_ptr = &p->cpus_mask;
8163
- p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
8164
- update_nr_migratory(p, 1);
8165
- task_rq_unlock(rq, p, &rf);
8166
-}
8167
-
8168
-void migrate_disable(void)
8169
-{
8170
- preempt_disable();
8171
-
8172
- if (++current->migrate_disable == 1) {
8173
- this_rq()->nr_pinned++;
8174
- preempt_lazy_disable();
8175
-#ifdef CONFIG_SCHED_DEBUG
8176
- WARN_ON_ONCE(current->pinned_on_cpu >= 0);
8177
- current->pinned_on_cpu = smp_processor_id();
8178
-#endif
8179
- }
8180
-
8181
- preempt_enable();
8182
-}
8183
-EXPORT_SYMBOL(migrate_disable);
8184
-
8185
-static void migrate_disabled_sched(struct task_struct *p)
8186
-{
8187
- if (p->migrate_disable_scheduled)
8188
- return;
8189
-
8190
- migrate_disable_update_cpus_allowed(p);
8191
- p->migrate_disable_scheduled = 1;
8192
-}
8193
-
8194
-static DEFINE_PER_CPU(struct cpu_stop_work, migrate_work);
8195
-static DEFINE_PER_CPU(struct migration_arg, migrate_arg);
8196
-
8197
-void migrate_enable(void)
8198
-{
8199
- struct task_struct *p = current;
8200
- struct rq *rq = this_rq();
8201
- int cpu = task_cpu(p);
8202
-
8203
- WARN_ON_ONCE(p->migrate_disable <= 0);
8204
- if (p->migrate_disable > 1) {
8205
- p->migrate_disable--;
8206
- return;
8207
- }
8208
-
8209
- preempt_disable();
8210
-
8211
-#ifdef CONFIG_SCHED_DEBUG
8212
- WARN_ON_ONCE(current->pinned_on_cpu != cpu);
8213
- current->pinned_on_cpu = -1;
8214
-#endif
8215
-
8216
- WARN_ON_ONCE(rq->nr_pinned < 1);
8217
-
8218
- p->migrate_disable = 0;
8219
- rq->nr_pinned--;
8220
-#ifdef CONFIG_HOTPLUG_CPU
8221
- if (rq->nr_pinned == 0 && unlikely(!cpu_active(cpu)) &&
8222
- takedown_cpu_task)
8223
- wake_up_process(takedown_cpu_task);
8224
-#endif
8225
-
8226
- if (!p->migrate_disable_scheduled)
8227
- goto out;
8228
-
8229
- p->migrate_disable_scheduled = 0;
8230
-
8231
- migrate_enable_update_cpus_allowed(p);
8232
-
8233
- WARN_ON(smp_processor_id() != cpu);
8234
- if (!is_cpu_allowed(p, cpu)) {
8235
- struct migration_arg __percpu *arg;
8236
- struct cpu_stop_work __percpu *work;
8237
- struct rq_flags rf;
8238
-
8239
- work = this_cpu_ptr(&migrate_work);
8240
- arg = this_cpu_ptr(&migrate_arg);
8241
- WARN_ON_ONCE(!arg->done && !work->disabled && work->arg);
8242
-
8243
- arg->task = p;
8244
- arg->done = false;
8245
-
8246
- rq = task_rq_lock(p, &rf);
8247
- update_rq_clock(rq);
8248
- arg->dest_cpu = select_fallback_rq(cpu, p);
8249
- task_rq_unlock(rq, p, &rf);
8250
-
8251
- stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
8252
- arg, work);
8253
- tlb_migrate_finish(p->mm);
8254
- }
8255
-
8256
-out:
8257
- preempt_lazy_enable();
8258
- preempt_enable();
8259
-}
8260
-EXPORT_SYMBOL(migrate_enable);
8261
-
8262
-int cpu_nr_pinned(int cpu)
8263
-{
8264
- struct rq *rq = cpu_rq(cpu);
8265
-
8266
- return rq->nr_pinned;
8267
-}
8268
-
8269
-#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
8270
-static void migrate_disabled_sched(struct task_struct *p)
8271
-{
8272
-}
8273
-
8274
-void migrate_disable(void)
8275
-{
8276
-#ifdef CONFIG_SCHED_DEBUG
8277
- current->migrate_disable++;
8278
-#endif
8279
- barrier();
8280
-}
8281
-EXPORT_SYMBOL(migrate_disable);
8282
-
8283
-void migrate_enable(void)
8284
-{
8285
-#ifdef CONFIG_SCHED_DEBUG
8286
- struct task_struct *p = current;
8287
-
8288
- WARN_ON_ONCE(p->migrate_disable <= 0);
8289
- p->migrate_disable--;
8290
-#endif
8291
- barrier();
8292
-}
8293
-EXPORT_SYMBOL(migrate_enable);
8294
-#else
8295
-static void migrate_disabled_sched(struct task_struct *p)
8296
-{
8297
-}
8298
-#endif