2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/kernel/sched/core.c
@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * kernel/sched/core.c
34 *
@@ -5,6 +6,10 @@
56 *
67 * Copyright (C) 1991-2002 Linus Torvalds
78 */
9
+#define CREATE_TRACE_POINTS
10
+#include <trace/events/sched.h>
11
+#undef CREATE_TRACE_POINTS
12
+
813 #include "sched.h"
914
1015 #include <linux/nospec.h>
@@ -16,14 +21,41 @@
1621 #include <asm/tlb.h>
1722
1823 #include "../workqueue_internal.h"
24
+#include "../../io_uring/io-wq.h"
1925 #include "../smpboot.h"
2026
2127 #include "pelt.h"
28
+#include "smp.h"
2229
23
-#define CREATE_TRACE_POINTS
24
-#include <trace/events/sched.h>
30
+#include <trace/hooks/sched.h>
31
+#include <trace/hooks/dtask.h>
32
+
33
+/*
34
+ * Export tracepoints that act as a bare tracehook (ie: have no trace event
35
+ * associated with them) to allow external modules to probe them.
36
+ */
37
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
38
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
39
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
40
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
41
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
42
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
43
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
44
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
45
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
46
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
47
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
48
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_switch);
49
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
50
+#ifdef CONFIG_SCHEDSTATS
51
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
52
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
53
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
54
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
55
+#endif
2556
2657 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
58
+EXPORT_SYMBOL_GPL(runqueues);
2759
2860 #ifdef CONFIG_SCHED_DEBUG
2961 /*
@@ -38,6 +70,7 @@
3870 const_debug unsigned int sysctl_sched_features =
3971 #include "features.h"
4072 0;
73
+EXPORT_SYMBOL_GPL(sysctl_sched_features);
4174 #undef SCHED_FEAT
4275 #endif
4376
@@ -45,7 +78,11 @@
4578 * Number of tasks to iterate in a single balance run.
4679 * Limited because this is done with IRQs disabled.
4780 */
81
+#ifdef CONFIG_PREEMPT_RT
82
+const_debug unsigned int sysctl_sched_nr_migrate = 8;
83
+#else
4884 const_debug unsigned int sysctl_sched_nr_migrate = 32;
85
+#endif
4986
5087 /*
5188 * period over which we measure -rt task CPU usage in us.
@@ -60,6 +97,100 @@
6097 * default: 0.95s
6198 */
6299 int sysctl_sched_rt_runtime = 950000;
100
+
101
+
102
+/*
103
+ * Serialization rules:
104
+ *
105
+ * Lock order:
106
+ *
107
+ * p->pi_lock
108
+ * rq->lock
109
+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
110
+ *
111
+ * rq1->lock
112
+ * rq2->lock where: rq1 < rq2
113
+ *
114
+ * Regular state:
115
+ *
116
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
117
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
118
+ * always looks at the local rq data structures to find the most eligible task
119
+ * to run next.
120
+ *
121
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
122
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
123
+ * the local CPU to avoid bouncing the runqueue state around [ see
124
+ * ttwu_queue_wakelist() ]
125
+ *
126
+ * Task wakeup, specifically wakeups that involve migration, are horribly
127
+ * complicated to avoid having to take two rq->locks.
128
+ *
129
+ * Special state:
130
+ *
131
+ * System-calls and anything external will use task_rq_lock() which acquires
132
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
133
+ * stable while holding either lock:
134
+ *
135
+ * - sched_setaffinity()/
136
+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
137
+ * - set_user_nice(): p->se.load, p->*prio
138
+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
139
+ * p->se.load, p->rt_priority,
140
+ * p->dl.dl_{runtime, deadline, period, flags, bw, density}
141
+ * - sched_setnuma(): p->numa_preferred_nid
142
+ * - sched_move_task()/
143
+ * cpu_cgroup_fork(): p->sched_task_group
144
+ * - uclamp_update_active() p->uclamp*
145
+ *
146
+ * p->state <- TASK_*:
147
+ *
148
+ * is changed locklessly using set_current_state(), __set_current_state() or
149
+ * set_special_state(), see their respective comments, or by
150
+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against
151
+ * concurrent self.
152
+ *
153
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
154
+ *
155
+ * is set by activate_task() and cleared by deactivate_task(), under
156
+ * rq->lock. Non-zero indicates the task is runnable, the special
157
+ * ON_RQ_MIGRATING state is used for migration without holding both
158
+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
159
+ *
160
+ * p->on_cpu <- { 0, 1 }:
161
+ *
162
+ * is set by prepare_task() and cleared by finish_task() such that it will be
163
+ * set before p is scheduled-in and cleared after p is scheduled-out, both
164
+ * under rq->lock. Non-zero indicates the task is running on its CPU.
165
+ *
166
+ * [ The astute reader will observe that it is possible for two tasks on one
167
+ * CPU to have ->on_cpu = 1 at the same time. ]
168
+ *
169
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
170
+ *
171
+ * - Don't call set_task_cpu() on a blocked task:
172
+ *
173
+ * We don't care what CPU we're not running on, this simplifies hotplug,
174
+ * the CPU assignment of blocked tasks isn't required to be valid.
175
+ *
176
+ * - for try_to_wake_up(), called under p->pi_lock:
177
+ *
178
+ * This allows try_to_wake_up() to only take one rq->lock, see its comment.
179
+ *
180
+ * - for migration called under rq->lock:
181
+ * [ see task_on_rq_migrating() in task_rq_lock() ]
182
+ *
183
+ * o move_queued_task()
184
+ * o detach_task()
185
+ *
186
+ * - for migration called under double_rq_lock():
187
+ *
188
+ * o __migrate_swap_task()
189
+ * o push_rt_task() / pull_rt_task()
190
+ * o push_dl_task() / pull_dl_task()
191
+ * o dl_task_offline_migration()
192
+ *
193
+ */
63194
64195 /*
65196 * __task_rq_lock - lock the rq @p resides on.
@@ -84,6 +215,7 @@
84215 cpu_relax();
85216 }
86217 }
218
+EXPORT_SYMBOL_GPL(__task_rq_lock);
87219
88220 /*
89221 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
@@ -126,6 +258,7 @@
126258 cpu_relax();
127259 }
128260 }
261
+EXPORT_SYMBOL_GPL(task_rq_lock);
129262
130263 /*
131264 * RQ-clock updating methods:
@@ -206,7 +339,15 @@
206339 rq->clock += delta;
207340 update_rq_clock_task(rq, delta);
208341 }
342
+EXPORT_SYMBOL_GPL(update_rq_clock);
209343
344
+static inline void
345
+rq_csd_init(struct rq *rq, struct __call_single_data *csd, smp_call_func_t func)
346
+{
347
+ csd->flags = 0;
348
+ csd->func = func;
349
+ csd->info = rq;
350
+}
210351
211352 #ifdef CONFIG_SCHED_HRTICK
212353 /*
@@ -243,8 +384,9 @@
243384 static void __hrtick_restart(struct rq *rq)
244385 {
245386 struct hrtimer *timer = &rq->hrtick_timer;
387
+ ktime_t time = rq->hrtick_time;
246388
247
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
389
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
248390 }
249391
250392 /*
@@ -257,7 +399,6 @@
257399
258400 rq_lock(rq, &rf);
259401 __hrtick_restart(rq);
260
- rq->hrtick_csd_pending = 0;
261402 rq_unlock(rq, &rf);
262403 }
263404
@@ -269,7 +410,6 @@
269410 void hrtick_start(struct rq *rq, u64 delay)
270411 {
271412 struct hrtimer *timer = &rq->hrtick_timer;
272
- ktime_t time;
273413 s64 delta;
274414
275415 /*
@@ -277,16 +417,12 @@
277417 * doesn't make sense and can cause timer DoS.
278418 */
279419 delta = max_t(s64, delay, 10000LL);
280
- time = ktime_add_ns(timer->base->get_time(), delta);
420
+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
281421
282
- hrtimer_set_expires(timer, time);
283
-
284
- if (rq == this_rq()) {
422
+ if (rq == this_rq())
285423 __hrtick_restart(rq);
286
- } else if (!rq->hrtick_csd_pending) {
424
+ else
287425 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
288
- rq->hrtick_csd_pending = 1;
289
- }
290426 }
291427
292428 #else
@@ -303,21 +439,17 @@
303439 */
304440 delay = max_t(u64, delay, 10000LL);
305441 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
306
- HRTIMER_MODE_REL_PINNED);
442
+ HRTIMER_MODE_REL_PINNED_HARD);
307443 }
444
+
308445 #endif /* CONFIG_SMP */
309446
310447 static void hrtick_rq_init(struct rq *rq)
311448 {
312449 #ifdef CONFIG_SMP
313
- rq->hrtick_csd_pending = 0;
314
-
315
- rq->hrtick_csd.flags = 0;
316
- rq->hrtick_csd.func = __hrtick_start;
317
- rq->hrtick_csd.info = rq;
450
+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
318451 #endif
319
-
320
- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
452
+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
321453 rq->hrtick_timer.function = hrtick;
322454 }
323455 #else /* CONFIG_SCHED_HRTICK */
@@ -399,9 +531,15 @@
399531 #endif
400532 #endif
401533
402
-void wake_q_add(struct wake_q_head *head, struct task_struct *task)
534
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task,
535
+ bool sleeper)
403536 {
404
- struct wake_q_node *node = &task->wake_q;
537
+ struct wake_q_node *node;
538
+
539
+ if (sleeper)
540
+ node = &task->wake_q_sleeper;
541
+ else
542
+ node = &task->wake_q;
405543
406544 /*
407545 * Atomically grab the task, if ->wake_q is !nil already it means
@@ -412,42 +550,96 @@
412550 * state, even in the failed case, an explicit smp_mb() must be used.
413551 */
414552 smp_mb__before_atomic();
415
- if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
416
- return;
417
-
418
- head->count++;
419
-
420
- get_task_struct(task);
553
+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
554
+ return false;
421555
422556 /*
423557 * The head is context local, there can be no concurrency.
424558 */
425559 *head->lastp = node;
426560 head->lastp = &node->next;
561
+ head->count++;
562
+ return true;
427563 }
428564
429
-static int
430
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
431
- int sibling_count_hint);
565
+/**
566
+ * wake_q_add() - queue a wakeup for 'later' waking.
567
+ * @head: the wake_q_head to add @task to
568
+ * @task: the task to queue for 'later' wakeup
569
+ *
570
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
571
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
572
+ * instantly.
573
+ *
574
+ * This function must be used as-if it were wake_up_process(); IOW the task
575
+ * must be ready to be woken at this location.
576
+ */
577
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
578
+{
579
+ if (__wake_q_add(head, task, false))
580
+ get_task_struct(task);
581
+}
432582
433
-void wake_up_q(struct wake_q_head *head)
583
+void wake_q_add_sleeper(struct wake_q_head *head, struct task_struct *task)
584
+{
585
+ if (__wake_q_add(head, task, true))
586
+ get_task_struct(task);
587
+}
588
+
589
+/**
590
+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
591
+ * @head: the wake_q_head to add @task to
592
+ * @task: the task to queue for 'later' wakeup
593
+ *
594
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
595
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
596
+ * instantly.
597
+ *
598
+ * This function must be used as-if it were wake_up_process(); IOW the task
599
+ * must be ready to be woken at this location.
600
+ *
601
+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
602
+ * that already hold reference to @task can call the 'safe' version and trust
603
+ * wake_q to do the right thing depending whether or not the @task is already
604
+ * queued for wakeup.
605
+ */
606
+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
607
+{
608
+ if (!__wake_q_add(head, task, false))
609
+ put_task_struct(task);
610
+}
611
+
612
+void __wake_up_q(struct wake_q_head *head, bool sleeper)
434613 {
435614 struct wake_q_node *node = head->first;
436615
437616 while (node != WAKE_Q_TAIL) {
438617 struct task_struct *task;
439618
440
- task = container_of(node, struct task_struct, wake_q);
619
+ if (sleeper)
620
+ task = container_of(node, struct task_struct, wake_q_sleeper);
621
+ else
622
+ task = container_of(node, struct task_struct, wake_q);
623
+
441624 BUG_ON(!task);
442625 /* Task can safely be re-inserted now: */
443626 node = node->next;
444
- task->wake_q.next = NULL;
627
+ task->wake_q_count = head->count;
628
+ if (sleeper)
629
+ task->wake_q_sleeper.next = NULL;
630
+ else
631
+ task->wake_q.next = NULL;
445632
446633 /*
447
- * try_to_wake_up() executes a full barrier, which pairs with
634
+ * wake_up_process() executes a full barrier, which pairs with
448635 * the queueing in wake_q_add() so as not to miss wakeups.
449636 */
450
- try_to_wake_up(task, TASK_NORMAL, 0, head->count);
637
+ if (sleeper)
638
+ wake_up_lock_sleeper(task);
639
+ else
640
+ wake_up_process(task);
641
+
642
+ task->wake_q_count = 0;
451643 put_task_struct(task);
452644 }
453645 }
@@ -477,15 +669,54 @@
477669 return;
478670 }
479671
480
-#ifdef CONFIG_PREEMPT
481672 if (set_nr_and_not_polling(curr))
482
-#else
483
- if (set_nr_and_not_polling(curr) && (rq->curr == rq->idle))
484
-#endif
485673 smp_send_reschedule(cpu);
486674 else
487675 trace_sched_wake_idle_without_ipi(cpu);
488676 }
677
+EXPORT_SYMBOL_GPL(resched_curr);
678
+
679
+#ifdef CONFIG_PREEMPT_LAZY
680
+
681
+static int tsk_is_polling(struct task_struct *p)
682
+{
683
+#ifdef TIF_POLLING_NRFLAG
684
+ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
685
+#else
686
+ return 0;
687
+#endif
688
+}
689
+
690
+void resched_curr_lazy(struct rq *rq)
691
+{
692
+ struct task_struct *curr = rq->curr;
693
+ int cpu;
694
+
695
+ if (!sched_feat(PREEMPT_LAZY)) {
696
+ resched_curr(rq);
697
+ return;
698
+ }
699
+
700
+ lockdep_assert_held(&rq->lock);
701
+
702
+ if (test_tsk_need_resched(curr))
703
+ return;
704
+
705
+ if (test_tsk_need_resched_lazy(curr))
706
+ return;
707
+
708
+ set_tsk_need_resched_lazy(curr);
709
+
710
+ cpu = cpu_of(rq);
711
+ if (cpu == smp_processor_id())
712
+ return;
713
+
714
+ /* NEED_RESCHED_LAZY must be visible before we test polling */
715
+ smp_mb();
716
+ if (!tsk_is_polling(curr))
717
+ smp_send_reschedule(cpu);
718
+}
719
+#endif
489720
490721 void resched_cpu(int cpu)
491722 {
@@ -510,27 +741,49 @@
510741 */
511742 int get_nohz_timer_target(void)
512743 {
513
- int i, cpu = smp_processor_id();
744
+ int i, cpu = smp_processor_id(), default_cpu = -1;
514745 struct sched_domain *sd;
515746
516
- if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
517
- return cpu;
747
+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
748
+ if (!idle_cpu(cpu))
749
+ return cpu;
750
+ default_cpu = cpu;
751
+ }
518752
519753 rcu_read_lock();
520754 for_each_domain(cpu, sd) {
521
- for_each_cpu(i, sched_domain_span(sd)) {
755
+ for_each_cpu_and(i, sched_domain_span(sd),
756
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
522757 if (cpu == i)
523758 continue;
524759
525
- if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
760
+ if (!idle_cpu(i)) {
526761 cpu = i;
527762 goto unlock;
528763 }
529764 }
530765 }
531766
532
- if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
533
- cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
767
+ if (default_cpu == -1) {
768
+ for_each_cpu_and(i, cpu_active_mask,
769
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
770
+ if (cpu == i)
771
+ continue;
772
+
773
+ if (!idle_cpu(i)) {
774
+ cpu = i;
775
+ goto unlock;
776
+ }
777
+ }
778
+
779
+ /* no active, not-idle, housekeeping CPU found. */
780
+ default_cpu = cpumask_any(cpu_active_mask);
781
+
782
+ if (unlikely(default_cpu >= nr_cpu_ids))
783
+ goto unlock;
784
+ }
785
+
786
+ cpu = default_cpu;
534787 unlock:
535788 rcu_read_unlock();
536789 return cpu;
@@ -590,29 +843,23 @@
590843 wake_up_idle_cpu(cpu);
591844 }
592845
593
-static inline bool got_nohz_idle_kick(void)
846
+static void nohz_csd_func(void *info)
594847 {
595
- int cpu = smp_processor_id();
596
-
597
- if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
598
- return false;
599
-
600
- if (idle_cpu(cpu) && !need_resched())
601
- return true;
848
+ struct rq *rq = info;
849
+ int cpu = cpu_of(rq);
850
+ unsigned int flags;
602851
603852 /*
604
- * We can't run Idle Load Balance on this CPU for this time so we
605
- * cancel it and clear NOHZ_BALANCE_KICK
853
+ * Release the rq::nohz_csd.
606854 */
607
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
608
- return false;
609
-}
855
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
856
+ WARN_ON(!(flags & NOHZ_KICK_MASK));
610857
611
-#else /* CONFIG_NO_HZ_COMMON */
612
-
613
-static inline bool got_nohz_idle_kick(void)
614
-{
615
- return false;
858
+ rq->idle_balance = idle_cpu(cpu);
859
+ if (rq->idle_balance && !need_resched()) {
860
+ rq->nohz_idle_balance = flags;
861
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
862
+ }
616863 }
617864
618865 #endif /* CONFIG_NO_HZ_COMMON */
@@ -703,18 +950,18 @@
703950 }
704951 #endif
705952
706
-static void set_load_weight(struct task_struct *p, bool update_load)
953
+static void set_load_weight(struct task_struct *p)
707954 {
955
+ bool update_load = !(READ_ONCE(p->state) & TASK_NEW);
708956 int prio = p->static_prio - MAX_RT_PRIO;
709957 struct load_weight *load = &p->se.load;
710958
711959 /*
712960 * SCHED_IDLE tasks get minimal weight:
713961 */
714
- if (idle_policy(p->policy)) {
962
+ if (task_has_idle_policy(p)) {
715963 load->weight = scale_load(WEIGHT_IDLEPRIO);
716964 load->inv_weight = WMULT_IDLEPRIO;
717
- p->se.runnable_weight = load->weight;
718965 return;
719966 }
720967
@@ -727,7 +974,6 @@
727974 } else {
728975 load->weight = scale_load(sched_prio_to_weight[prio]);
729976 load->inv_weight = sched_prio_to_wmult[prio];
730
- p->se.runnable_weight = load->weight;
731977 }
732978 }
733979
@@ -750,8 +996,46 @@
750996 /* Max allowed maximum utilization */
751997 unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
752998
999
+/*
1000
+ * By default RT tasks run at the maximum performance point/capacity of the
1001
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
1002
+ * SCHED_CAPACITY_SCALE.
1003
+ *
1004
+ * This knob allows admins to change the default behavior when uclamp is being
1005
+ * used. In battery powered devices, particularly, running at the maximum
1006
+ * capacity and frequency will increase energy consumption and shorten the
1007
+ * battery life.
1008
+ *
1009
+ * This knob only affects RT tasks that their uclamp_se->user_defined == false.
1010
+ *
1011
+ * This knob will not override the system default sched_util_clamp_min defined
1012
+ * above.
1013
+ */
1014
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
1015
+
7531016 /* All clamps are required to be less or equal than these values */
7541017 static struct uclamp_se uclamp_default[UCLAMP_CNT];
1018
+
1019
+/*
1020
+ * This static key is used to reduce the uclamp overhead in the fast path. It
1021
+ * primarily disables the call to uclamp_rq_{inc, dec}() in
1022
+ * enqueue/dequeue_task().
1023
+ *
1024
+ * This allows users to continue to enable uclamp in their kernel config with
1025
+ * minimum uclamp overhead in the fast path.
1026
+ *
1027
+ * As soon as userspace modifies any of the uclamp knobs, the static key is
1028
+ * enabled, since we have actual users that make use of uclamp
1029
+ * functionality.
1030
+ *
1031
+ * The knobs that would enable this static key are:
1032
+ *
1033
+ * * A task modifying its uclamp value with sched_setattr().
1034
+ * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
1035
+ * * An admin modifying the cgroup cpu.uclamp.{min, max}
1036
+ */
1037
+DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
1038
+EXPORT_SYMBOL_GPL(sched_uclamp_used);
7551039
7561040 /* Integer rounded range for each bucket */
7571041 #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
@@ -762,11 +1046,6 @@
7621046 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
7631047 {
7641048 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
765
-}
766
-
767
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
768
-{
769
- return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
7701049 }
7711050
7721051 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
@@ -832,12 +1111,79 @@
8321111 return uclamp_idle_value(rq, clamp_id, clamp_value);
8331112 }
8341113
1114
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1115
+{
1116
+ unsigned int default_util_min;
1117
+ struct uclamp_se *uc_se;
1118
+
1119
+ lockdep_assert_held(&p->pi_lock);
1120
+
1121
+ uc_se = &p->uclamp_req[UCLAMP_MIN];
1122
+
1123
+ /* Only sync if user didn't override the default */
1124
+ if (uc_se->user_defined)
1125
+ return;
1126
+
1127
+ default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1128
+ uclamp_se_set(uc_se, default_util_min, false);
1129
+}
1130
+
1131
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
1132
+{
1133
+ struct rq_flags rf;
1134
+ struct rq *rq;
1135
+
1136
+ if (!rt_task(p))
1137
+ return;
1138
+
1139
+ /* Protect updates to p->uclamp_* */
1140
+ rq = task_rq_lock(p, &rf);
1141
+ __uclamp_update_util_min_rt_default(p);
1142
+ task_rq_unlock(rq, p, &rf);
1143
+}
1144
+
1145
+static void uclamp_sync_util_min_rt_default(void)
1146
+{
1147
+ struct task_struct *g, *p;
1148
+
1149
+ /*
1150
+ * copy_process() sysctl_uclamp
1151
+ * uclamp_min_rt = X;
1152
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
1153
+ * // link thread smp_mb__after_spinlock()
1154
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
1155
+ * sched_post_fork() for_each_process_thread()
1156
+ * __uclamp_sync_rt() __uclamp_sync_rt()
1157
+ *
1158
+ * Ensures that either sched_post_fork() will observe the new
1159
+ * uclamp_min_rt or for_each_process_thread() will observe the new
1160
+ * task.
1161
+ */
1162
+ read_lock(&tasklist_lock);
1163
+ smp_mb__after_spinlock();
1164
+ read_unlock(&tasklist_lock);
1165
+
1166
+ rcu_read_lock();
1167
+ for_each_process_thread(g, p)
1168
+ uclamp_update_util_min_rt_default(p);
1169
+ rcu_read_unlock();
1170
+}
1171
+
1172
+#if IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)
1173
+void rockchip_perf_uclamp_sync_util_min_rt_default(void)
1174
+{
1175
+ uclamp_sync_util_min_rt_default();
1176
+}
1177
+EXPORT_SYMBOL(rockchip_perf_uclamp_sync_util_min_rt_default);
1178
+#endif
1179
+
8351180 static inline struct uclamp_se
8361181 uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
8371182 {
1183
+ /* Copy by value as we could modify it */
8381184 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
8391185 #ifdef CONFIG_UCLAMP_TASK_GROUP
840
- struct uclamp_se uc_max;
1186
+ unsigned int tg_min, tg_max, value;
8411187
8421188 /*
8431189 * Tasks in autogroups or root task group will be
@@ -848,9 +1194,11 @@
8481194 if (task_group(p) == &root_task_group)
8491195 return uc_req;
8501196
851
- uc_max = task_group(p)->uclamp[clamp_id];
852
- if (uc_req.value > uc_max.value || !uc_req.user_defined)
853
- return uc_max;
1197
+ tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1198
+ tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1199
+ value = uc_req.value;
1200
+ value = clamp(value, tg_min, tg_max);
1201
+ uclamp_se_set(&uc_req, value, false);
8541202 #endif
8551203
8561204 return uc_req;
@@ -869,6 +1217,12 @@
8691217 {
8701218 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
8711219 struct uclamp_se uc_max = uclamp_default[clamp_id];
1220
+ struct uclamp_se uc_eff;
1221
+ int ret = 0;
1222
+
1223
+ trace_android_rvh_uclamp_eff_get(p, clamp_id, &uc_max, &uc_eff, &ret);
1224
+ if (ret)
1225
+ return uc_eff;
8721226
8731227 /* System default restrictions always apply */
8741228 if (unlikely(uc_req.value > uc_max.value))
@@ -889,6 +1243,7 @@
8891243
8901244 return (unsigned long)uc_eff.value;
8911245 }
1246
+EXPORT_SYMBOL_GPL(uclamp_eff_value);
8921247
8931248 /*
8941249 * When a task is enqueued on a rq, the clamp bucket currently defined by the
@@ -949,10 +1304,38 @@
9491304
9501305 lockdep_assert_held(&rq->lock);
9511306
1307
+ /*
1308
+ * If sched_uclamp_used was enabled after task @p was enqueued,
1309
+ * we could end up with unbalanced call to uclamp_rq_dec_id().
1310
+ *
1311
+ * In this case the uc_se->active flag should be false since no uclamp
1312
+ * accounting was performed at enqueue time and we can just return
1313
+ * here.
1314
+ *
1315
+ * Need to be careful of the following enqeueue/dequeue ordering
1316
+ * problem too
1317
+ *
1318
+ * enqueue(taskA)
1319
+ * // sched_uclamp_used gets enabled
1320
+ * enqueue(taskB)
1321
+ * dequeue(taskA)
1322
+ * // Must not decrement bukcet->tasks here
1323
+ * dequeue(taskB)
1324
+ *
1325
+ * where we could end up with stale data in uc_se and
1326
+ * bucket[uc_se->bucket_id].
1327
+ *
1328
+ * The following check here eliminates the possibility of such race.
1329
+ */
1330
+ if (unlikely(!uc_se->active))
1331
+ return;
1332
+
9521333 bucket = &uc_rq->bucket[uc_se->bucket_id];
1334
+
9531335 SCHED_WARN_ON(!bucket->tasks);
9541336 if (likely(bucket->tasks))
9551337 bucket->tasks--;
1338
+
9561339 uc_se->active = false;
9571340
9581341 /*
@@ -980,6 +1363,15 @@
9801363 {
9811364 enum uclamp_id clamp_id;
9821365
1366
+ /*
1367
+ * Avoid any overhead until uclamp is actually used by the userspace.
1368
+ *
1369
+ * The condition is constructed such that a NOP is generated when
1370
+ * sched_uclamp_used is disabled.
1371
+ */
1372
+ if (!static_branch_unlikely(&sched_uclamp_used))
1373
+ return;
1374
+
9831375 if (unlikely(!p->sched_class->uclamp_enabled))
9841376 return;
9851377
@@ -995,6 +1387,15 @@
9951387 {
9961388 enum uclamp_id clamp_id;
9971389
1390
+ /*
1391
+ * Avoid any overhead until uclamp is actually used by the userspace.
1392
+ *
1393
+ * The condition is constructed such that a NOP is generated when
1394
+ * sched_uclamp_used is disabled.
1395
+ */
1396
+ if (!static_branch_unlikely(&sched_uclamp_used))
1397
+ return;
1398
+
9981399 if (unlikely(!p->sched_class->uclamp_enabled))
9991400 return;
10001401
@@ -1002,9 +1403,27 @@
10021403 uclamp_rq_dec_id(rq, p, clamp_id);
10031404 }
10041405
1005
-static inline void
1006
-uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
1406
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
1407
+ enum uclamp_id clamp_id)
10071408 {
1409
+ if (!p->uclamp[clamp_id].active)
1410
+ return;
1411
+
1412
+ uclamp_rq_dec_id(rq, p, clamp_id);
1413
+ uclamp_rq_inc_id(rq, p, clamp_id);
1414
+
1415
+ /*
1416
+ * Make sure to clear the idle flag if we've transiently reached 0
1417
+ * active tasks on rq.
1418
+ */
1419
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1420
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1421
+}
1422
+
1423
+static inline void
1424
+uclamp_update_active(struct task_struct *p)
1425
+{
1426
+ enum uclamp_id clamp_id;
10081427 struct rq_flags rf;
10091428 struct rq *rq;
10101429
@@ -1024,30 +1443,22 @@
10241443 * affecting a valid clamp bucket, the next time it's enqueued,
10251444 * it will already see the updated clamp bucket value.
10261445 */
1027
- if (p->uclamp[clamp_id].active) {
1028
- uclamp_rq_dec_id(rq, p, clamp_id);
1029
- uclamp_rq_inc_id(rq, p, clamp_id);
1030
- }
1446
+ for_each_clamp_id(clamp_id)
1447
+ uclamp_rq_reinc_id(rq, p, clamp_id);
10311448
10321449 task_rq_unlock(rq, p, &rf);
10331450 }
10341451
10351452 #ifdef CONFIG_UCLAMP_TASK_GROUP
10361453 static inline void
1037
-uclamp_update_active_tasks(struct cgroup_subsys_state *css,
1038
- unsigned int clamps)
1454
+uclamp_update_active_tasks(struct cgroup_subsys_state *css)
10391455 {
1040
- enum uclamp_id clamp_id;
10411456 struct css_task_iter it;
10421457 struct task_struct *p;
10431458
10441459 css_task_iter_start(css, 0, &it);
1045
- while ((p = css_task_iter_next(&it))) {
1046
- for_each_clamp_id(clamp_id) {
1047
- if ((0x1 << clamp_id) & clamps)
1048
- uclamp_update_active(p, clamp_id);
1049
- }
1050
- }
1460
+ while ((p = css_task_iter_next(&it)))
1461
+ uclamp_update_active(p);
10511462 css_task_iter_end(&it);
10521463 }
10531464
@@ -1070,16 +1481,16 @@
10701481 #endif
10711482
10721483 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1073
- void __user *buffer, size_t *lenp,
1074
- loff_t *ppos)
1484
+ void *buffer, size_t *lenp, loff_t *ppos)
10751485 {
10761486 bool update_root_tg = false;
1077
- int old_min, old_max;
1487
+ int old_min, old_max, old_min_rt;
10781488 int result;
10791489
10801490 mutex_lock(&uclamp_mutex);
10811491 old_min = sysctl_sched_uclamp_util_min;
10821492 old_max = sysctl_sched_uclamp_util_max;
1493
+ old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
10831494
10841495 result = proc_dointvec(table, write, buffer, lenp, ppos);
10851496 if (result)
@@ -1088,7 +1499,9 @@
10881499 goto done;
10891500
10901501 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1091
- sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
1502
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1503
+ sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1504
+
10921505 result = -EINVAL;
10931506 goto undo;
10941507 }
@@ -1104,8 +1517,15 @@
11041517 update_root_tg = true;
11051518 }
11061519
1107
- if (update_root_tg)
1520
+ if (update_root_tg) {
1521
+ static_branch_enable(&sched_uclamp_used);
11081522 uclamp_update_root_tg();
1523
+ }
1524
+
1525
+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1526
+ static_branch_enable(&sched_uclamp_used);
1527
+ uclamp_sync_util_min_rt_default();
1528
+ }
11091529
11101530 /*
11111531 * We update all RUNNABLE tasks only when task groups are in use.
@@ -1118,6 +1538,7 @@
11181538 undo:
11191539 sysctl_sched_uclamp_util_min = old_min;
11201540 sysctl_sched_uclamp_util_max = old_max;
1541
+ sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
11211542 done:
11221543 mutex_unlock(&uclamp_mutex);
11231544
@@ -1127,20 +1548,61 @@
11271548 static int uclamp_validate(struct task_struct *p,
11281549 const struct sched_attr *attr)
11291550 {
1130
- unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1131
- unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1551
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
1552
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
11321553
1133
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1134
- lower_bound = attr->sched_util_min;
1135
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1136
- upper_bound = attr->sched_util_max;
1554
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1555
+ util_min = attr->sched_util_min;
11371556
1138
- if (lower_bound > upper_bound)
1557
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1558
+ return -EINVAL;
1559
+ }
1560
+
1561
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1562
+ util_max = attr->sched_util_max;
1563
+
1564
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1565
+ return -EINVAL;
1566
+ }
1567
+
1568
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
11391569 return -EINVAL;
1140
- if (upper_bound > SCHED_CAPACITY_SCALE)
1141
- return -EINVAL;
1570
+
1571
+ /*
1572
+ * We have valid uclamp attributes; make sure uclamp is enabled.
1573
+ *
1574
+ * We need to do that here, because enabling static branches is a
1575
+ * blocking operation which obviously cannot be done while holding
1576
+ * scheduler locks.
1577
+ */
1578
+ static_branch_enable(&sched_uclamp_used);
11421579
11431580 return 0;
1581
+}
1582
+
1583
+static bool uclamp_reset(const struct sched_attr *attr,
1584
+ enum uclamp_id clamp_id,
1585
+ struct uclamp_se *uc_se)
1586
+{
1587
+ /* Reset on sched class change for a non user-defined clamp value. */
1588
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1589
+ !uc_se->user_defined)
1590
+ return true;
1591
+
1592
+ /* Reset on sched_util_{min,max} == -1. */
1593
+ if (clamp_id == UCLAMP_MIN &&
1594
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1595
+ attr->sched_util_min == -1) {
1596
+ return true;
1597
+ }
1598
+
1599
+ if (clamp_id == UCLAMP_MAX &&
1600
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1601
+ attr->sched_util_max == -1) {
1602
+ return true;
1603
+ }
1604
+
1605
+ return false;
11441606 }
11451607
11461608 static void __setscheduler_uclamp(struct task_struct *p,
@@ -1148,40 +1610,41 @@
11481610 {
11491611 enum uclamp_id clamp_id;
11501612
1151
- /*
1152
- * On scheduling class change, reset to default clamps for tasks
1153
- * without a task-specific value.
1154
- */
11551613 for_each_clamp_id(clamp_id) {
11561614 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1157
- unsigned int clamp_value = uclamp_none(clamp_id);
1615
+ unsigned int value;
11581616
1159
- /* Keep using defined clamps across class changes */
1160
- if (uc_se->user_defined)
1617
+ if (!uclamp_reset(attr, clamp_id, uc_se))
11611618 continue;
11621619
1163
- /* By default, RT tasks always get 100% boost */
1164
- if (sched_feat(SUGOV_RT_MAX_FREQ) &&
1165
- unlikely(rt_task(p) &&
1166
- clamp_id == UCLAMP_MIN)) {
1620
+ /*
1621
+ * RT by default have a 100% boost value that could be modified
1622
+ * at runtime.
1623
+ */
1624
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1625
+ value = sysctl_sched_uclamp_util_min_rt_default;
1626
+ else
1627
+ value = uclamp_none(clamp_id);
11671628
1168
- clamp_value = uclamp_none(UCLAMP_MAX);
1169
- }
1629
+ uclamp_se_set(uc_se, value, false);
11701630
1171
- uclamp_se_set(uc_se, clamp_value, false);
11721631 }
11731632
11741633 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
11751634 return;
11761635
1177
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1636
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1637
+ attr->sched_util_min != -1) {
11781638 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
11791639 attr->sched_util_min, true);
1640
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MIN, attr->sched_util_min);
11801641 }
11811642
1182
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1643
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1644
+ attr->sched_util_max != -1) {
11831645 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
11841646 attr->sched_util_max, true);
1647
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max);
11851648 }
11861649 }
11871650
@@ -1189,6 +1652,10 @@
11891652 {
11901653 enum uclamp_id clamp_id;
11911654
1655
+ /*
1656
+ * We don't need to hold task_rq_lock() when updating p->uclamp_* here
1657
+ * as the task is still at its early fork stages.
1658
+ */
11921659 for_each_clamp_id(clamp_id)
11931660 p->uclamp[clamp_id].active = false;
11941661
@@ -1201,39 +1668,24 @@
12011668 }
12021669 }
12031670
1204
-#ifdef CONFIG_SMP
1205
-unsigned int uclamp_task(struct task_struct *p)
1671
+static void uclamp_post_fork(struct task_struct *p)
12061672 {
1207
- unsigned long util;
1208
-
1209
- util = task_util_est(p);
1210
- util = max(util, uclamp_eff_value(p, UCLAMP_MIN));
1211
- util = min(util, uclamp_eff_value(p, UCLAMP_MAX));
1212
-
1213
- return util;
1673
+ uclamp_update_util_min_rt_default(p);
12141674 }
12151675
1216
-bool uclamp_boosted(struct task_struct *p)
1676
+static void __init init_uclamp_rq(struct rq *rq)
12171677 {
1218
- return uclamp_eff_value(p, UCLAMP_MIN) > 0;
1678
+ enum uclamp_id clamp_id;
1679
+ struct uclamp_rq *uc_rq = rq->uclamp;
1680
+
1681
+ for_each_clamp_id(clamp_id) {
1682
+ uc_rq[clamp_id] = (struct uclamp_rq) {
1683
+ .value = uclamp_none(clamp_id)
1684
+ };
1685
+ }
1686
+
1687
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
12191688 }
1220
-
1221
-bool uclamp_latency_sensitive(struct task_struct *p)
1222
-{
1223
-#ifdef CONFIG_UCLAMP_TASK_GROUP
1224
- struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id);
1225
- struct task_group *tg;
1226
-
1227
- if (!css)
1228
- return false;
1229
- tg = container_of(css, struct task_group, css);
1230
-
1231
- return tg->latency_sensitive;
1232
-#else
1233
- return false;
1234
-#endif
1235
-}
1236
-#endif /* CONFIG_SMP */
12371689
12381690 static void __init init_uclamp(void)
12391691 {
@@ -1241,13 +1693,8 @@
12411693 enum uclamp_id clamp_id;
12421694 int cpu;
12431695
1244
- mutex_init(&uclamp_mutex);
1245
-
1246
- for_each_possible_cpu(cpu) {
1247
- memset(&cpu_rq(cpu)->uclamp, 0,
1248
- sizeof(struct uclamp_rq)*UCLAMP_CNT);
1249
- cpu_rq(cpu)->uclamp_flags = 0;
1250
- }
1696
+ for_each_possible_cpu(cpu)
1697
+ init_uclamp_rq(cpu_rq(cpu));
12511698
12521699 for_each_clamp_id(clamp_id) {
12531700 uclamp_se_set(&init_task.uclamp_req[clamp_id],
@@ -1276,41 +1723,7 @@
12761723 static void __setscheduler_uclamp(struct task_struct *p,
12771724 const struct sched_attr *attr) { }
12781725 static inline void uclamp_fork(struct task_struct *p) { }
1279
-
1280
-long schedtune_task_margin(struct task_struct *task);
1281
-
1282
-#ifdef CONFIG_SMP
1283
-unsigned int uclamp_task(struct task_struct *p)
1284
-{
1285
- unsigned long util = task_util_est(p);
1286
-#ifdef CONFIG_SCHED_TUNE
1287
- long margin = schedtune_task_margin(p);
1288
-
1289
- trace_sched_boost_task(p, util, margin);
1290
-
1291
- util += margin;
1292
-#endif
1293
-
1294
- return util;
1295
-}
1296
-
1297
-bool uclamp_boosted(struct task_struct *p)
1298
-{
1299
-#ifdef CONFIG_SCHED_TUNE
1300
- return schedtune_task_boost(p) > 0;
1301
-#endif
1302
- return false;
1303
-}
1304
-
1305
-bool uclamp_latency_sensitive(struct task_struct *p)
1306
-{
1307
-#ifdef CONFIG_SCHED_TUNE
1308
- return schedtune_prefer_idle(p) != 0;
1309
-#endif
1310
- return false;
1311
-}
1312
-#endif /* CONFIG_SMP */
1313
-
1726
+static inline void uclamp_post_fork(struct task_struct *p) { }
13141727 static inline void init_uclamp(void) { }
13151728 #endif /* CONFIG_UCLAMP_TASK */
13161729
@@ -1325,7 +1738,9 @@
13251738 }
13261739
13271740 uclamp_rq_inc(rq, p);
1741
+ trace_android_rvh_enqueue_task(rq, p, flags);
13281742 p->sched_class->enqueue_task(rq, p, flags);
1743
+ trace_android_rvh_after_enqueue_task(rq, p);
13291744 }
13301745
13311746 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1339,31 +1754,39 @@
13391754 }
13401755
13411756 uclamp_rq_dec(rq, p);
1757
+ trace_android_rvh_dequeue_task(rq, p, flags);
13421758 p->sched_class->dequeue_task(rq, p, flags);
1759
+ trace_android_rvh_after_dequeue_task(rq, p);
13431760 }
13441761
13451762 void activate_task(struct rq *rq, struct task_struct *p, int flags)
13461763 {
1347
- if (task_contributes_to_load(p))
1348
- rq->nr_uninterruptible--;
1349
-
13501764 enqueue_task(rq, p, flags);
1765
+
1766
+ p->on_rq = TASK_ON_RQ_QUEUED;
13511767 }
1768
+EXPORT_SYMBOL_GPL(activate_task);
13521769
13531770 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
13541771 {
1355
- if (task_contributes_to_load(p))
1356
- rq->nr_uninterruptible++;
1772
+ p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
13571773
13581774 dequeue_task(rq, p, flags);
13591775 }
1776
+EXPORT_SYMBOL_GPL(deactivate_task);
13601777
1361
-/*
1362
- * __normal_prio - return the priority that is based on the static prio
1363
- */
1364
-static inline int __normal_prio(struct task_struct *p)
1778
+static inline int __normal_prio(int policy, int rt_prio, int nice)
13651779 {
1366
- return p->static_prio;
1780
+ int prio;
1781
+
1782
+ if (dl_policy(policy))
1783
+ prio = MAX_DL_PRIO - 1;
1784
+ else if (rt_policy(policy))
1785
+ prio = MAX_RT_PRIO - 1 - rt_prio;
1786
+ else
1787
+ prio = NICE_TO_PRIO(nice);
1788
+
1789
+ return prio;
13671790 }
13681791
13691792 /*
@@ -1375,15 +1798,7 @@
13751798 */
13761799 static inline int normal_prio(struct task_struct *p)
13771800 {
1378
- int prio;
1379
-
1380
- if (task_has_dl_policy(p))
1381
- prio = MAX_DL_PRIO-1;
1382
- else if (task_has_rt_policy(p))
1383
- prio = MAX_RT_PRIO-1 - p->rt_priority;
1384
- else
1385
- prio = __normal_prio(p);
1386
- return prio;
1801
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
13871802 }
13881803
13891804 /*
@@ -1439,20 +1854,10 @@
14391854
14401855 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
14411856 {
1442
- const struct sched_class *class;
1443
-
1444
- if (p->sched_class == rq->curr->sched_class) {
1857
+ if (p->sched_class == rq->curr->sched_class)
14451858 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1446
- } else {
1447
- for_each_class(class) {
1448
- if (class == rq->curr->sched_class)
1449
- break;
1450
- if (class == p->sched_class) {
1451
- resched_curr(rq);
1452
- break;
1453
- }
1454
- }
1455
- }
1859
+ else if (p->sched_class > rq->curr->sched_class)
1860
+ resched_curr(rq);
14561861
14571862 /*
14581863 * A queue event has occurred, and we're going to schedule. In
@@ -1461,33 +1866,102 @@
14611866 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
14621867 rq_clock_skip_update(rq);
14631868 }
1869
+EXPORT_SYMBOL_GPL(check_preempt_curr);
14641870
14651871 #ifdef CONFIG_SMP
14661872
1467
-static inline bool is_per_cpu_kthread(struct task_struct *p)
1873
+static void
1874
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
1875
+
1876
+static int __set_cpus_allowed_ptr(struct task_struct *p,
1877
+ const struct cpumask *new_mask,
1878
+ u32 flags);
1879
+
1880
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
14681881 {
1469
- if (!(p->flags & PF_KTHREAD))
1470
- return false;
1882
+ if (likely(!p->migration_disabled))
1883
+ return;
14711884
1472
- if (p->nr_cpus_allowed != 1)
1473
- return false;
1885
+ if (p->cpus_ptr != &p->cpus_mask)
1886
+ return;
14741887
1475
- return true;
1888
+ /*
1889
+ * Violates locking rules! see comment in __do_set_cpus_allowed().
1890
+ */
1891
+ __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
1892
+}
1893
+
1894
+void migrate_disable(void)
1895
+{
1896
+ struct task_struct *p = current;
1897
+
1898
+ if (p->migration_disabled) {
1899
+ p->migration_disabled++;
1900
+ return;
1901
+ }
1902
+
1903
+ trace_sched_migrate_disable_tp(p);
1904
+
1905
+ preempt_disable();
1906
+ this_rq()->nr_pinned++;
1907
+ p->migration_disabled = 1;
1908
+ preempt_lazy_disable();
1909
+ preempt_enable();
1910
+}
1911
+EXPORT_SYMBOL_GPL(migrate_disable);
1912
+
1913
+void migrate_enable(void)
1914
+{
1915
+ struct task_struct *p = current;
1916
+
1917
+ if (p->migration_disabled > 1) {
1918
+ p->migration_disabled--;
1919
+ return;
1920
+ }
1921
+
1922
+ /*
1923
+ * Ensure stop_task runs either before or after this, and that
1924
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
1925
+ */
1926
+ preempt_disable();
1927
+ if (p->cpus_ptr != &p->cpus_mask)
1928
+ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
1929
+ /*
1930
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
1931
+ * regular cpus_mask, otherwise things that race (eg.
1932
+ * select_fallback_rq) get confused.
1933
+ */
1934
+ barrier();
1935
+ p->migration_disabled = 0;
1936
+ this_rq()->nr_pinned--;
1937
+ preempt_lazy_enable();
1938
+ preempt_enable();
1939
+
1940
+ trace_sched_migrate_enable_tp(p);
1941
+}
1942
+EXPORT_SYMBOL_GPL(migrate_enable);
1943
+
1944
+static inline bool rq_has_pinned_tasks(struct rq *rq)
1945
+{
1946
+ return rq->nr_pinned;
14761947 }
14771948
14781949 /*
1479
- * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
1950
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
14801951 * __set_cpus_allowed_ptr() and select_fallback_rq().
14811952 */
14821953 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
14831954 {
1484
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
1955
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
14851956 return false;
14861957
1487
- if (is_per_cpu_kthread(p))
1958
+ if (is_per_cpu_kthread(p) || is_migration_disabled(p))
14881959 return cpu_online(cpu);
14891960
1490
- return cpu_active(cpu);
1961
+ if (!cpu_active(cpu))
1962
+ return false;
1963
+
1964
+ return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
14911965 }
14921966
14931967 /*
@@ -1512,27 +1986,50 @@
15121986 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
15131987 struct task_struct *p, int new_cpu)
15141988 {
1989
+ int detached = 0;
1990
+
15151991 lockdep_assert_held(&rq->lock);
15161992
1517
- WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
1518
- dequeue_task(rq, p, DEQUEUE_NOCLOCK);
1519
- set_task_cpu(p, new_cpu);
1520
- rq_unlock(rq, rf);
1993
+ /*
1994
+ * The vendor hook may drop the lock temporarily, so
1995
+ * pass the rq flags to unpin lock. We expect the
1996
+ * rq lock to be held after return.
1997
+ */
1998
+ trace_android_rvh_migrate_queued_task(rq, rf, p, new_cpu, &detached);
1999
+ if (detached)
2000
+ goto attach;
15212001
2002
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
2003
+ set_task_cpu(p, new_cpu);
2004
+
2005
+attach:
2006
+ rq_unlock(rq, rf);
15222007 rq = cpu_rq(new_cpu);
15232008
15242009 rq_lock(rq, rf);
15252010 BUG_ON(task_cpu(p) != new_cpu);
1526
- enqueue_task(rq, p, 0);
1527
- p->on_rq = TASK_ON_RQ_QUEUED;
2011
+ activate_task(rq, p, 0);
15282012 check_preempt_curr(rq, p, 0);
15292013
15302014 return rq;
15312015 }
15322016
15332017 struct migration_arg {
1534
- struct task_struct *task;
1535
- int dest_cpu;
2018
+ struct task_struct *task;
2019
+ int dest_cpu;
2020
+ struct set_affinity_pending *pending;
2021
+};
2022
+
2023
+/*
2024
+ * @refs: number of wait_for_completion()
2025
+ * @stop_pending: is @stop_work in use
2026
+ */
2027
+struct set_affinity_pending {
2028
+ refcount_t refs;
2029
+ unsigned int stop_pending;
2030
+ struct completion done;
2031
+ struct cpu_stop_work stop_work;
2032
+ struct migration_arg arg;
15362033 };
15372034
15382035 /*
@@ -1565,39 +2062,141 @@
15652062 static int migration_cpu_stop(void *data)
15662063 {
15672064 struct migration_arg *arg = data;
2065
+ struct set_affinity_pending *pending = arg->pending;
15682066 struct task_struct *p = arg->task;
15692067 struct rq *rq = this_rq();
2068
+ bool complete = false;
15702069 struct rq_flags rf;
15712070
15722071 /*
15732072 * The original target CPU might have gone down and we might
15742073 * be on another CPU but it doesn't matter.
15752074 */
1576
- local_irq_disable();
2075
+ local_irq_save(rf.flags);
15772076 /*
15782077 * We need to explicitly wake pending tasks before running
1579
- * __migrate_task() such that we will not miss enforcing cpus_allowed
2078
+ * __migrate_task() such that we will not miss enforcing cpus_ptr
15802079 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
15812080 */
1582
- sched_ttwu_pending();
2081
+ flush_smp_call_function_from_idle();
15832082
15842083 raw_spin_lock(&p->pi_lock);
15852084 rq_lock(rq, &rf);
2085
+
15862086 /*
15872087 * If task_rq(p) != rq, it cannot be migrated here, because we're
15882088 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
15892089 * we're holding p->pi_lock.
15902090 */
15912091 if (task_rq(p) == rq) {
2092
+ if (is_migration_disabled(p))
2093
+ goto out;
2094
+
2095
+ if (pending) {
2096
+ if (p->migration_pending == pending)
2097
+ p->migration_pending = NULL;
2098
+ complete = true;
2099
+
2100
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
2101
+ goto out;
2102
+ }
2103
+
15922104 if (task_on_rq_queued(p))
15932105 rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
15942106 else
15952107 p->wake_cpu = arg->dest_cpu;
1596
- }
1597
- rq_unlock(rq, &rf);
1598
- raw_spin_unlock(&p->pi_lock);
15992108
1600
- local_irq_enable();
2109
+ /*
2110
+ * XXX __migrate_task() can fail, at which point we might end
2111
+ * up running on a dodgy CPU, AFAICT this can only happen
2112
+ * during CPU hotplug, at which point we'll get pushed out
2113
+ * anyway, so it's probably not a big deal.
2114
+ */
2115
+
2116
+ } else if (pending) {
2117
+ /*
2118
+ * This happens when we get migrated between migrate_enable()'s
2119
+ * preempt_enable() and scheduling the stopper task. At that
2120
+ * point we're a regular task again and not current anymore.
2121
+ *
2122
+ * A !PREEMPT kernel has a giant hole here, which makes it far
2123
+ * more likely.
2124
+ */
2125
+
2126
+ /*
2127
+ * The task moved before the stopper got to run. We're holding
2128
+ * ->pi_lock, so the allowed mask is stable - if it got
2129
+ * somewhere allowed, we're done.
2130
+ */
2131
+ if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
2132
+ if (p->migration_pending == pending)
2133
+ p->migration_pending = NULL;
2134
+ complete = true;
2135
+ goto out;
2136
+ }
2137
+
2138
+ /*
2139
+ * When migrate_enable() hits a rq mis-match we can't reliably
2140
+ * determine is_migration_disabled() and so have to chase after
2141
+ * it.
2142
+ */
2143
+ WARN_ON_ONCE(!pending->stop_pending);
2144
+ task_rq_unlock(rq, p, &rf);
2145
+ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
2146
+ &pending->arg, &pending->stop_work);
2147
+ return 0;
2148
+ }
2149
+out:
2150
+ if (pending)
2151
+ pending->stop_pending = false;
2152
+ task_rq_unlock(rq, p, &rf);
2153
+
2154
+ if (complete)
2155
+ complete_all(&pending->done);
2156
+
2157
+ return 0;
2158
+}
2159
+
2160
+int push_cpu_stop(void *arg)
2161
+{
2162
+ struct rq *lowest_rq = NULL, *rq = this_rq();
2163
+ struct task_struct *p = arg;
2164
+
2165
+ raw_spin_lock_irq(&p->pi_lock);
2166
+ raw_spin_lock(&rq->lock);
2167
+
2168
+ if (task_rq(p) != rq)
2169
+ goto out_unlock;
2170
+
2171
+ if (is_migration_disabled(p)) {
2172
+ p->migration_flags |= MDF_PUSH;
2173
+ goto out_unlock;
2174
+ }
2175
+
2176
+ p->migration_flags &= ~MDF_PUSH;
2177
+
2178
+ if (p->sched_class->find_lock_rq)
2179
+ lowest_rq = p->sched_class->find_lock_rq(p, rq);
2180
+
2181
+ if (!lowest_rq)
2182
+ goto out_unlock;
2183
+
2184
+ // XXX validate p is still the highest prio task
2185
+ if (task_rq(p) == rq) {
2186
+ deactivate_task(rq, p, 0);
2187
+ set_task_cpu(p, lowest_rq->cpu);
2188
+ activate_task(lowest_rq, p, 0);
2189
+ resched_curr(lowest_rq);
2190
+ }
2191
+
2192
+ double_unlock_balance(rq, lowest_rq);
2193
+
2194
+out_unlock:
2195
+ rq->push_busy = false;
2196
+ raw_spin_unlock(&rq->lock);
2197
+ raw_spin_unlock_irq(&p->pi_lock);
2198
+
2199
+ put_task_struct(p);
16012200 return 0;
16022201 }
16032202
@@ -1605,18 +2204,40 @@
16052204 * sched_class::set_cpus_allowed must do the below, but is not required to
16062205 * actually call this function.
16072206 */
1608
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
2207
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
16092208 {
1610
- cpumask_copy(&p->cpus_allowed, new_mask);
2209
+ if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
2210
+ p->cpus_ptr = new_mask;
2211
+ return;
2212
+ }
2213
+
2214
+ cpumask_copy(&p->cpus_mask, new_mask);
16112215 p->nr_cpus_allowed = cpumask_weight(new_mask);
2216
+ trace_android_rvh_set_cpus_allowed_comm(p, new_mask);
16122217 }
16132218
1614
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2219
+static void
2220
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
16152221 {
16162222 struct rq *rq = task_rq(p);
16172223 bool queued, running;
16182224
1619
- lockdep_assert_held(&p->pi_lock);
2225
+ /*
2226
+ * This here violates the locking rules for affinity, since we're only
2227
+ * supposed to change these variables while holding both rq->lock and
2228
+ * p->pi_lock.
2229
+ *
2230
+ * HOWEVER, it magically works, because ttwu() is the only code that
2231
+ * accesses these variables under p->pi_lock and only does so after
2232
+ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
2233
+ * before finish_task().
2234
+ *
2235
+ * XXX do further audits, this smells like something putrid.
2236
+ */
2237
+ if (flags & SCA_MIGRATE_DISABLE)
2238
+ SCHED_WARN_ON(!p->on_cpu);
2239
+ else
2240
+ lockdep_assert_held(&p->pi_lock);
16202241
16212242 queued = task_on_rq_queued(p);
16222243 running = task_current(rq, p);
@@ -1632,12 +2253,312 @@
16322253 if (running)
16332254 put_prev_task(rq, p);
16342255
1635
- p->sched_class->set_cpus_allowed(p, new_mask);
2256
+ p->sched_class->set_cpus_allowed(p, new_mask, flags);
16362257
16372258 if (queued)
16382259 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
16392260 if (running)
1640
- set_curr_task(rq, p);
2261
+ set_next_task(rq, p);
2262
+}
2263
+
2264
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
2265
+ int dest_cpu, unsigned int flags);
2266
+/*
2267
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
2268
+ */
2269
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
2270
+ const struct cpumask *new_mask,
2271
+ u32 flags,
2272
+ struct rq *rq,
2273
+ struct rq_flags *rf)
2274
+{
2275
+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
2276
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
2277
+ unsigned int dest_cpu;
2278
+ int ret = 0;
2279
+
2280
+ update_rq_clock(rq);
2281
+
2282
+ if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
2283
+ /*
2284
+ * Kernel threads are allowed on online && !active CPUs.
2285
+ *
2286
+ * Specifically, migration_disabled() tasks must not fail the
2287
+ * cpumask_any_and_distribute() pick below, esp. so on
2288
+ * SCA_MIGRATE_ENABLE, otherwise we'll not call
2289
+ * set_cpus_allowed_common() and actually reset p->cpus_ptr.
2290
+ */
2291
+ cpu_valid_mask = cpu_online_mask;
2292
+ } else if (!cpumask_subset(new_mask, cpu_allowed_mask)) {
2293
+ ret = -EINVAL;
2294
+ goto out;
2295
+ }
2296
+
2297
+ /*
2298
+ * Must re-check here, to close a race against __kthread_bind(),
2299
+ * sched_setaffinity() is not guaranteed to observe the flag.
2300
+ */
2301
+ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
2302
+ ret = -EINVAL;
2303
+ goto out;
2304
+ }
2305
+
2306
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
2307
+ if (cpumask_equal(&p->cpus_mask, new_mask))
2308
+ goto out;
2309
+
2310
+ if (WARN_ON_ONCE(p == current &&
2311
+ is_migration_disabled(p) &&
2312
+ !cpumask_test_cpu(task_cpu(p), new_mask))) {
2313
+ ret = -EBUSY;
2314
+ goto out;
2315
+ }
2316
+ }
2317
+
2318
+ /*
2319
+ * Picking a ~random cpu helps in cases where we are changing affinity
2320
+ * for groups of tasks (ie. cpuset), so that load balancing is not
2321
+ * immediately required to distribute the tasks within their new mask.
2322
+ */
2323
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
2324
+ if (dest_cpu >= nr_cpu_ids) {
2325
+ ret = -EINVAL;
2326
+ goto out;
2327
+ }
2328
+
2329
+ __do_set_cpus_allowed(p, new_mask, flags);
2330
+
2331
+ if (p->flags & PF_KTHREAD) {
2332
+ /*
2333
+ * For kernel threads that do indeed end up on online &&
2334
+ * !active we want to ensure they are strict per-CPU threads.
2335
+ */
2336
+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
2337
+ !cpumask_intersects(new_mask, cpu_active_mask) &&
2338
+ p->nr_cpus_allowed != 1);
2339
+ }
2340
+
2341
+ return affine_move_task(rq, p, rf, dest_cpu, flags);
2342
+out:
2343
+ task_rq_unlock(rq, p, rf);
2344
+
2345
+ return ret;
2346
+}
2347
+
2348
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2349
+{
2350
+ __do_set_cpus_allowed(p, new_mask, 0);
2351
+}
2352
+
2353
+/*
2354
+ * This function is wildly self concurrent; here be dragons.
2355
+ *
2356
+ *
2357
+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the
2358
+ * designated task is enqueued on an allowed CPU. If that task is currently
2359
+ * running, we have to kick it out using the CPU stopper.
2360
+ *
2361
+ * Migrate-Disable comes along and tramples all over our nice sandcastle.
2362
+ * Consider:
2363
+ *
2364
+ * Initial conditions: P0->cpus_mask = [0, 1]
2365
+ *
2366
+ * P0@CPU0 P1
2367
+ *
2368
+ * migrate_disable();
2369
+ * <preempted>
2370
+ * set_cpus_allowed_ptr(P0, [1]);
2371
+ *
2372
+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
2373
+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
2374
+ * This means we need the following scheme:
2375
+ *
2376
+ * P0@CPU0 P1
2377
+ *
2378
+ * migrate_disable();
2379
+ * <preempted>
2380
+ * set_cpus_allowed_ptr(P0, [1]);
2381
+ * <blocks>
2382
+ * <resumes>
2383
+ * migrate_enable();
2384
+ * __set_cpus_allowed_ptr();
2385
+ * <wakes local stopper>
2386
+ * `--> <woken on migration completion>
2387
+ *
2388
+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
2389
+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
2390
+ * task p are serialized by p->pi_lock, which we can leverage: the one that
2391
+ * should come into effect at the end of the Migrate-Disable region is the last
2392
+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
2393
+ * but we still need to properly signal those waiting tasks at the appropriate
2394
+ * moment.
2395
+ *
2396
+ * This is implemented using struct set_affinity_pending. The first
2397
+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
2398
+ * set up an instance of that struct and install it on the targeted task_struct.
2399
+ * Any and all further callers will reuse that instance. Those then wait for
2400
+ * a completion signaled at the tail of the CPU stopper callback (1), triggered
2401
+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
2402
+ *
2403
+ *
2404
+ * (1) In the cases covered above. There is one more where the completion is
2405
+ * signaled within affine_move_task() itself: when a subsequent affinity request
2406
+ * cancels the need for an active migration. Consider:
2407
+ *
2408
+ * Initial conditions: P0->cpus_mask = [0, 1]
2409
+ *
2410
+ * P0@CPU0 P1 P2
2411
+ *
2412
+ * migrate_disable();
2413
+ * <preempted>
2414
+ * set_cpus_allowed_ptr(P0, [1]);
2415
+ * <blocks>
2416
+ * set_cpus_allowed_ptr(P0, [0, 1]);
2417
+ * <signal completion>
2418
+ * <awakes>
2419
+ *
2420
+ * Note that the above is safe vs a concurrent migrate_enable(), as any
2421
+ * pending affinity completion is preceded by an uninstallation of
2422
+ * p->migration_pending done with p->pi_lock held.
2423
+ */
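For orientation, the pending-affinity bookkeeping the comment above refers to has roughly this shape; the field list is a sketch inferred from how affine_move_task() below uses it, not the authoritative definition:

struct set_affinity_pending {
	refcount_t		refs;		/* waiters still interested in the outcome */
	unsigned int		stop_pending;	/* a migration_cpu_stop() is already queued */
	struct completion	done;		/* completed once the task sits on an allowed CPU */
	struct cpu_stop_work	stop_work;	/* work item handed to the CPU stopper */
	struct migration_arg	arg;		/* .task, .dest_cpu and a back-pointer to this pending */
};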
2424
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
2425
+ int dest_cpu, unsigned int flags)
2426
+{
2427
+ struct set_affinity_pending my_pending = { }, *pending = NULL;
2428
+ bool stop_pending, complete = false;
2429
+
2430
+ /* Can the task run on the task's current CPU? If so, we're done */
2431
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
2432
+ struct task_struct *push_task = NULL;
2433
+
2434
+ if ((flags & SCA_MIGRATE_ENABLE) &&
2435
+ (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
2436
+ rq->push_busy = true;
2437
+ push_task = get_task_struct(p);
2438
+ }
2439
+
2440
+ /*
2441
+ * If there are pending waiters, but no pending stop_work,
2442
+ * then complete now.
2443
+ */
2444
+ pending = p->migration_pending;
2445
+ if (pending && !pending->stop_pending) {
2446
+ p->migration_pending = NULL;
2447
+ complete = true;
2448
+ }
2449
+
2450
+ task_rq_unlock(rq, p, rf);
2451
+
2452
+ if (push_task) {
2453
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2454
+ p, &rq->push_work);
2455
+ }
2456
+
2457
+ if (complete)
2458
+ complete_all(&pending->done);
2459
+
2460
+ return 0;
2461
+ }
2462
+
2463
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
2464
+ /* serialized by p->pi_lock */
2465
+ if (!p->migration_pending) {
2466
+ /* Install the request */
2467
+ refcount_set(&my_pending.refs, 1);
2468
+ init_completion(&my_pending.done);
2469
+ my_pending.arg = (struct migration_arg) {
2470
+ .task = p,
2471
+ .dest_cpu = dest_cpu,
2472
+ .pending = &my_pending,
2473
+ };
2474
+
2475
+ p->migration_pending = &my_pending;
2476
+ } else {
2477
+ pending = p->migration_pending;
2478
+ refcount_inc(&pending->refs);
2479
+ /*
2480
+ * Affinity has changed, but we've already installed a
2481
+ * pending. migration_cpu_stop() *must* see this, else
2482
+ * we risk a completion of the pending despite having a
2483
+ * task on a disallowed CPU.
2484
+ *
2485
+ * Serialized by p->pi_lock, so this is safe.
2486
+ */
2487
+ pending->arg.dest_cpu = dest_cpu;
2488
+ }
2489
+ }
2490
+ pending = p->migration_pending;
2491
+ /*
2492
+ * - !MIGRATE_ENABLE:
2493
+ * we'll have installed a pending if there wasn't one already.
2494
+ *
2495
+ * - MIGRATE_ENABLE:
2496
+ * we're here because the current CPU isn't matching anymore,
2497
+ * the only way that can happen is because of a concurrent
2498
+ * set_cpus_allowed_ptr() call, which should then still be
2499
+ * pending completion.
2500
+ *
2501
+ * Either way, we really should have a @pending here.
2502
+ */
2503
+ if (WARN_ON_ONCE(!pending)) {
2504
+ task_rq_unlock(rq, p, rf);
2505
+ return -EINVAL;
2506
+ }
2507
+
2508
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
2509
+ /*
2510
+ * MIGRATE_ENABLE gets here because 'p == current', but for
2511
+ * anything else we cannot do is_migration_disabled(), punt
2512
+ * and have the stopper function handle it all race-free.
2513
+ */
2514
+ stop_pending = pending->stop_pending;
2515
+ if (!stop_pending)
2516
+ pending->stop_pending = true;
2517
+
2518
+ if (flags & SCA_MIGRATE_ENABLE)
2519
+ p->migration_flags &= ~MDF_PUSH;
2520
+
2521
+ task_rq_unlock(rq, p, rf);
2522
+
2523
+ if (!stop_pending) {
2524
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
2525
+ &pending->arg, &pending->stop_work);
2526
+ }
2527
+
2528
+ if (flags & SCA_MIGRATE_ENABLE)
2529
+ return 0;
2530
+ } else {
2531
+
2532
+ if (!is_migration_disabled(p)) {
2533
+ if (task_on_rq_queued(p))
2534
+ rq = move_queued_task(rq, rf, p, dest_cpu);
2535
+
2536
+ if (!pending->stop_pending) {
2537
+ p->migration_pending = NULL;
2538
+ complete = true;
2539
+ }
2540
+ }
2541
+ task_rq_unlock(rq, p, rf);
2542
+
2543
+ if (complete)
2544
+ complete_all(&pending->done);
2545
+ }
2546
+
2547
+ wait_for_completion(&pending->done);
2548
+
2549
+ if (refcount_dec_and_test(&pending->refs))
2550
+ wake_up_var(&pending->refs); /* No UaF, just an address */
2551
+
2552
+ /*
2553
+ * Block the original owner of &pending until all subsequent callers
2554
+ * have seen the completion and decremented the refcount
2555
+ */
2556
+ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
2557
+
2558
+ /* ARGH */
2559
+ WARN_ON_ONCE(my_pending.stop_pending);
2560
+
2561
+ return 0;
16412562 }
16422563
16432564 /*
....@@ -1650,83 +2571,89 @@
16502571 * call is not atomic; no spinlocks may be held.
16512572 */
16522573 static int __set_cpus_allowed_ptr(struct task_struct *p,
1653
- const struct cpumask *new_mask, bool check)
2574
+ const struct cpumask *new_mask,
2575
+ u32 flags)
16542576 {
1655
- const struct cpumask *cpu_valid_mask = cpu_active_mask;
1656
- unsigned int dest_cpu;
16572577 struct rq_flags rf;
16582578 struct rq *rq;
1659
- int ret = 0;
16602579
16612580 rq = task_rq_lock(p, &rf);
1662
- update_rq_clock(rq);
1663
-
1664
- if (p->flags & PF_KTHREAD) {
1665
- /*
1666
- * Kernel threads are allowed on online && !active CPUs
1667
- */
1668
- cpu_valid_mask = cpu_online_mask;
1669
- }
1670
-
1671
- /*
1672
- * Must re-check here, to close a race against __kthread_bind(),
1673
- * sched_setaffinity() is not guaranteed to observe the flag.
1674
- */
1675
- if (check && (p->flags & PF_NO_SETAFFINITY)) {
1676
- ret = -EINVAL;
1677
- goto out;
1678
- }
1679
-
1680
- if (cpumask_equal(&p->cpus_allowed, new_mask))
1681
- goto out;
1682
-
1683
- dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
1684
- if (dest_cpu >= nr_cpu_ids) {
1685
- ret = -EINVAL;
1686
- goto out;
1687
- }
1688
-
1689
- do_set_cpus_allowed(p, new_mask);
1690
-
1691
- if (p->flags & PF_KTHREAD) {
1692
- /*
1693
- * For kernel threads that do indeed end up on online &&
1694
- * !active we want to ensure they are strict per-CPU threads.
1695
- */
1696
- WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
1697
- !cpumask_intersects(new_mask, cpu_active_mask) &&
1698
- p->nr_cpus_allowed != 1);
1699
- }
1700
-
1701
- /* Can the task run on the task's current CPU? If so, we're done */
1702
- if (cpumask_test_cpu(task_cpu(p), new_mask))
1703
- goto out;
1704
-
1705
- if (task_running(rq, p) || p->state == TASK_WAKING) {
1706
- struct migration_arg arg = { p, dest_cpu };
1707
- /* Need help from migration thread: drop lock and wait. */
1708
- task_rq_unlock(rq, p, &rf);
1709
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1710
- tlb_migrate_finish(p->mm);
1711
- return 0;
1712
- } else if (task_on_rq_queued(p)) {
1713
- /*
1714
- * OK, since we're going to drop the lock immediately
1715
- * afterwards anyway.
1716
- */
1717
- rq = move_queued_task(rq, &rf, p, dest_cpu);
1718
- }
1719
-out:
1720
- task_rq_unlock(rq, p, &rf);
1721
-
1722
- return ret;
2581
+ return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
17232582 }
17242583
17252584 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
17262585 {
1727
- return __set_cpus_allowed_ptr(p, new_mask, false);
2586
+ return __set_cpus_allowed_ptr(p, new_mask, 0);
17282587 }
17292588 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
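For reference, a minimal (hypothetical) caller of the exported interface; the CPU numbers and the pin_worker() wrapper are made up for illustration:

/* Hypothetical example: pin a worker kthread to CPUs 2 and 3. */
static int pin_worker(struct task_struct *worker)
{
	cpumask_var_t mask;
	int ret;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(mask);
	cpumask_set_cpu(2, mask);
	cpumask_set_cpu(3, mask);

	/* Blocks until @worker is enqueued on (or running on) an allowed CPU. */
	ret = set_cpus_allowed_ptr(worker, mask);

	free_cpumask_var(mask);
	return ret;
}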
2589
+
2590
+/*
2591
+ * Change a given task's CPU affinity to the intersection of its current
2592
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
2593
+ * If the resulting mask is empty, leave the affinity unchanged and return
2594
+ * -EINVAL.
2595
+ */
2596
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
2597
+ struct cpumask *new_mask,
2598
+ const struct cpumask *subset_mask)
2599
+{
2600
+ struct rq_flags rf;
2601
+ struct rq *rq;
2602
+
2603
+ rq = task_rq_lock(p, &rf);
2604
+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
2605
+ task_rq_unlock(rq, p, &rf);
2606
+ return -EINVAL;
2607
+ }
2608
+
2609
+ return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
2610
+}
2611
+
2612
+/*
2613
+ * Restrict a given task's CPU affinity so that it is a subset of
2614
+ * task_cpu_possible_mask(). If the resulting mask is empty, we warn and
2615
+ * walk up the cpuset hierarchy until we find a suitable mask.
2616
+ */
2617
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
2618
+{
2619
+ cpumask_var_t new_mask;
2620
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
2621
+
2622
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
2623
+
2624
+ /*
2625
+ * __migrate_task() can fail silently in the face of concurrent
2626
+ * offlining of the chosen destination CPU, so take the hotplug
2627
+ * lock to ensure that the migration succeeds.
2628
+ */
2629
+ trace_android_rvh_force_compatible_pre(NULL);
2630
+ cpus_read_lock();
2631
+ if (!cpumask_available(new_mask))
2632
+ goto out_set_mask;
2633
+
2634
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
2635
+ goto out_free_mask;
2636
+
2637
+ /*
2638
+ * We failed to find a valid subset of the affinity mask for the
2639
+ * task, so override it based on its cpuset hierarchy.
2640
+ */
2641
+ cpuset_cpus_allowed(p, new_mask);
2642
+ override_mask = new_mask;
2643
+
2644
+out_set_mask:
2645
+ if (printk_ratelimit()) {
2646
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
2647
+ task_pid_nr(p), p->comm,
2648
+ cpumask_pr_args(override_mask));
2649
+ }
2650
+
2651
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
2652
+out_free_mask:
2653
+ cpus_read_unlock();
2654
+ trace_android_rvh_force_compatible_post(NULL);
2655
+ free_cpumask_var(new_mask);
2656
+}
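A hypothetical call site for the helper above; the wrapper and the trigger condition are illustrative only (the real callers live in architecture code):

/*
 * Hypothetical example: the task has entered an execution mode that only a
 * subset of CPUs supports, so clamp its affinity to task_cpu_possible_mask().
 */
static void clamp_task_affinity(struct task_struct *p)
{
	if (!cpumask_subset(p->cpus_ptr, task_cpu_possible_mask(p)))
		force_compatible_cpus_allowed_ptr(p);
}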
17302657
17312658 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
17322659 {
....@@ -1765,6 +2692,8 @@
17652692 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
17662693 */
17672694 WARN_ON_ONCE(!cpu_online(new_cpu));
2695
+
2696
+ WARN_ON_ONCE(is_migration_disabled(p));
17682697 #endif
17692698
17702699 trace_sched_migrate_task(p, new_cpu);
....@@ -1775,12 +2704,13 @@
17752704 p->se.nr_migrations++;
17762705 rseq_migrate(p);
17772706 perf_event_task_migrate(p);
2707
+ trace_android_rvh_set_task_cpu(p, new_cpu);
17782708 }
17792709
17802710 __set_task_cpu(p, new_cpu);
17812711 }
2712
+EXPORT_SYMBOL_GPL(set_task_cpu);
17822713
1783
-#ifdef CONFIG_NUMA_BALANCING
17842714 static void __migrate_swap_task(struct task_struct *p, int cpu)
17852715 {
17862716 if (task_on_rq_queued(p)) {
....@@ -1793,11 +2723,9 @@
17932723 rq_pin_lock(src_rq, &srf);
17942724 rq_pin_lock(dst_rq, &drf);
17952725
1796
- p->on_rq = TASK_ON_RQ_MIGRATING;
17972726 deactivate_task(src_rq, p, 0);
17982727 set_task_cpu(p, cpu);
17992728 activate_task(dst_rq, p, 0);
1800
- p->on_rq = TASK_ON_RQ_QUEUED;
18012729 check_preempt_curr(dst_rq, p, 0);
18022730
18032731 rq_unpin_lock(dst_rq, &drf);
....@@ -1840,10 +2768,10 @@
18402768 if (task_cpu(arg->src_task) != arg->src_cpu)
18412769 goto unlock;
18422770
1843
- if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
2771
+ if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
18442772 goto unlock;
18452773
1846
- if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
2774
+ if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
18472775 goto unlock;
18482776
18492777 __migrate_swap_task(arg->src_task, arg->dst_cpu);
....@@ -1885,10 +2813,10 @@
18852813 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
18862814 goto out;
18872815
1888
- if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
2816
+ if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
18892817 goto out;
18902818
1891
- if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
2819
+ if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
18922820 goto out;
18932821
18942822 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
....@@ -1897,7 +2825,19 @@
18972825 out:
18982826 return ret;
18992827 }
1900
-#endif /* CONFIG_NUMA_BALANCING */
2828
+EXPORT_SYMBOL_GPL(migrate_swap);
2829
+
2830
+static bool check_task_state(struct task_struct *p, long match_state)
2831
+{
2832
+ bool match = false;
2833
+
2834
+ raw_spin_lock_irq(&p->pi_lock);
2835
+ if (p->state == match_state || p->saved_state == match_state)
2836
+ match = true;
2837
+ raw_spin_unlock_irq(&p->pi_lock);
2838
+
2839
+ return match;
2840
+}
19012841
19022842 /*
19032843 * wait_task_inactive - wait for a thread to unschedule.
....@@ -1943,7 +2883,7 @@
19432883 * is actually now running somewhere else!
19442884 */
19452885 while (task_running(rq, p)) {
1946
- if (match_state && unlikely(p->state != match_state))
2886
+ if (match_state && !check_task_state(p, match_state))
19472887 return 0;
19482888 cpu_relax();
19492889 }
....@@ -1958,7 +2898,8 @@
19582898 running = task_running(rq, p);
19592899 queued = task_on_rq_queued(p);
19602900 ncsw = 0;
1961
- if (!match_state || p->state == match_state)
2901
+ if (!match_state || p->state == match_state ||
2902
+ p->saved_state == match_state)
19622903 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
19632904 task_rq_unlock(rq, p, &rf);
19642905
....@@ -1992,7 +2933,7 @@
19922933 ktime_t to = NSEC_PER_SEC / HZ;
19932934
19942935 set_current_state(TASK_UNINTERRUPTIBLE);
1995
- schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2936
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
19962937 continue;
19972938 }
19982939
....@@ -2033,7 +2974,7 @@
20332974 EXPORT_SYMBOL_GPL(kick_process);
20342975
20352976 /*
2036
- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2977
+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
20372978 *
20382979 * A few notes on cpu_active vs cpu_online:
20392980 *
....@@ -2059,7 +3000,11 @@
20593000 int nid = cpu_to_node(cpu);
20603001 const struct cpumask *nodemask = NULL;
20613002 enum { cpuset, possible, fail } state = cpuset;
2062
- int dest_cpu;
3003
+ int dest_cpu = -1;
3004
+
3005
+ trace_android_rvh_select_fallback_rq(cpu, p, &dest_cpu);
3006
+ if (dest_cpu >= 0)
3007
+ return dest_cpu;
20633008
20643009 /*
20653010 * If the node that the CPU is on has been offlined, cpu_to_node()
....@@ -2071,16 +3016,14 @@
20713016
20723017 /* Look for allowed, online CPU in same node. */
20733018 for_each_cpu(dest_cpu, nodemask) {
2074
- if (!cpu_active(dest_cpu))
2075
- continue;
2076
- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
3019
+ if (is_cpu_allowed(p, dest_cpu))
20773020 return dest_cpu;
20783021 }
20793022 }
20803023
20813024 for (;;) {
20823025 /* Any allowed, online CPU? */
2083
- for_each_cpu(dest_cpu, &p->cpus_allowed) {
3026
+ for_each_cpu(dest_cpu, p->cpus_ptr) {
20843027 if (!is_cpu_allowed(p, dest_cpu))
20853028 continue;
20863029
....@@ -2095,12 +3038,17 @@
20953038 state = possible;
20963039 break;
20973040 }
2098
- /* Fall-through */
3041
+ fallthrough;
20993042 case possible:
2100
- do_set_cpus_allowed(p, cpu_possible_mask);
3043
+ /*
3044
+ * XXX When called from select_task_rq() we only
3045
+ * hold p->pi_lock and again violate locking order.
3046
+ *
3047
+ * More yuck to audit.
3048
+ */
3049
+ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
21013050 state = fail;
21023051 break;
2103
-
21043052 case fail:
21053053 BUG();
21063054 break;
....@@ -2124,23 +3072,21 @@
21243072 }
21253073
21263074 /*
2127
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
3075
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
21283076 */
21293077 static inline
2130
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
2131
- int sibling_count_hint)
3078
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
21323079 {
21333080 lockdep_assert_held(&p->pi_lock);
21343081
2135
- if (p->nr_cpus_allowed > 1)
2136
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
2137
- sibling_count_hint);
3082
+ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
3083
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
21383084 else
2139
- cpu = cpumask_any(&p->cpus_allowed);
3085
+ cpu = cpumask_any(p->cpus_ptr);
21403086
21413087 /*
21423088 * In order not to call set_task_cpu() on a blocking task we need
2143
- * to rely on ttwu() to place the task on a valid ->cpus_allowed
3089
+ * to rely on ttwu() to place the task on a valid ->cpus_ptr
21443090 * CPU.
21453091 *
21463092 * Since this is common to all placement strategies, this lives here.
....@@ -2154,14 +3100,9 @@
21543100 return cpu;
21553101 }
21563102
2157
-static void update_avg(u64 *avg, u64 sample)
2158
-{
2159
- s64 diff = sample - *avg;
2160
- *avg += diff >> 3;
2161
-}
2162
-
21633103 void sched_set_stop_task(int cpu, struct task_struct *stop)
21643104 {
3105
+ static struct lock_class_key stop_pi_lock;
21653106 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
21663107 struct task_struct *old_stop = cpu_rq(cpu)->stop;
21673108
....@@ -2177,6 +3118,20 @@
21773118 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
21783119
21793120 stop->sched_class = &stop_sched_class;
3121
+
3122
+ /*
3123
+ * The PI code calls rt_mutex_setprio() with ->pi_lock held to
3124
+ * adjust the effective priority of a task. As a result,
3125
+ * rt_mutex_setprio() can trigger (RT) balancing operations,
3126
+ * which can then trigger wakeups of the stop thread to push
3127
+ * around the current task.
3128
+ *
3129
+ * The stop task itself will never be part of the PI-chain, it
3130
+ * never blocks, therefore that ->pi_lock recursion is safe.
3131
+ * Tell lockdep about this by placing the stop->pi_lock in its
3132
+ * own class.
3133
+ */
3134
+ lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
21803135 }
21813136
21823137 cpu_rq(cpu)->stop = stop;
....@@ -2190,15 +3145,23 @@
21903145 }
21913146 }
21923147
2193
-#else
3148
+#else /* CONFIG_SMP */
21943149
21953150 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
2196
- const struct cpumask *new_mask, bool check)
3151
+ const struct cpumask *new_mask,
3152
+ u32 flags)
21973153 {
21983154 return set_cpus_allowed_ptr(p, new_mask);
21993155 }
22003156
2201
-#endif /* CONFIG_SMP */
3157
+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
3158
+
3159
+static inline bool rq_has_pinned_tasks(struct rq *rq)
3160
+{
3161
+ return false;
3162
+}
3163
+
3164
+#endif /* !CONFIG_SMP */
22023165
22033166 static void
22043167 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
....@@ -2237,16 +3200,6 @@
22373200
22383201 if (wake_flags & WF_SYNC)
22393202 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
2240
-}
2241
-
2242
-static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2243
-{
2244
- activate_task(rq, p, en_flags);
2245
- p->on_rq = TASK_ON_RQ_QUEUED;
2246
-
2247
- /* If a worker is waking up, notify the workqueue: */
2248
- if (p->flags & PF_WQ_WORKER)
2249
- wq_worker_waking_up(p, cpu_of(rq));
22503203 }
22513204
22523205 /*
....@@ -2290,27 +3243,54 @@
22903243 {
22913244 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
22923245
3246
+ if (wake_flags & WF_SYNC)
3247
+ en_flags |= ENQUEUE_WAKEUP_SYNC;
3248
+
22933249 lockdep_assert_held(&rq->lock);
22943250
2295
-#ifdef CONFIG_SMP
22963251 if (p->sched_contributes_to_load)
22973252 rq->nr_uninterruptible--;
22983253
3254
+#ifdef CONFIG_SMP
22993255 if (wake_flags & WF_MIGRATED)
23003256 en_flags |= ENQUEUE_MIGRATED;
3257
+ else
23013258 #endif
3259
+ if (p->in_iowait) {
3260
+ delayacct_blkio_end(p);
3261
+ atomic_dec(&task_rq(p)->nr_iowait);
3262
+ }
23023263
2303
- ttwu_activate(rq, p, en_flags);
3264
+ activate_task(rq, p, en_flags);
23043265 ttwu_do_wakeup(rq, p, wake_flags, rf);
23053266 }
23063267
23073268 /*
2308
- * Called in case the task @p isn't fully descheduled from its runqueue,
2309
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2310
- * since all we need to do is flip p->state to TASK_RUNNING, since
2311
- * the task is still ->on_rq.
3269
+ * Consider @p being inside a wait loop:
3270
+ *
3271
+ * for (;;) {
3272
+ * set_current_state(TASK_UNINTERRUPTIBLE);
3273
+ *
3274
+ * if (CONDITION)
3275
+ * break;
3276
+ *
3277
+ * schedule();
3278
+ * }
3279
+ * __set_current_state(TASK_RUNNING);
3280
+ *
3281
+ * A wakeup that arrives between set_current_state() and schedule() finds @p still
3281
+ * runnable, so all that needs doing is to change p->state back to TASK_RUNNING in
3283
+ * an atomic manner.
3284
+ *
3285
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
3286
+ * then schedule() must still happen and p->state can be changed to
3287
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
3288
+ * need to do a full wakeup with enqueue.
3289
+ *
3290
+ * Returns: %true when the wakeup is done,
3291
+ * %false otherwise.
23123292 */
2313
-static int ttwu_remote(struct task_struct *p, int wake_flags)
3293
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
23143294 {
23153295 struct rq_flags rf;
23163296 struct rq *rq;
....@@ -2329,75 +3309,63 @@
23293309 }
23303310
23313311 #ifdef CONFIG_SMP
2332
-void sched_ttwu_pending(void)
3312
+void sched_ttwu_pending(void *arg)
23333313 {
3314
+ struct llist_node *llist = arg;
23343315 struct rq *rq = this_rq();
2335
- struct llist_node *llist = llist_del_all(&rq->wake_list);
23363316 struct task_struct *p, *t;
23373317 struct rq_flags rf;
23383318
23393319 if (!llist)
23403320 return;
23413321
3322
+ /*
3323
+ * rq::ttwu_pending is a racy indication of outstanding wakeups.
3324
+ * Races such that false-negatives are possible, since they
3325
+ * are shorter lived than false-positives would be.
3326
+ */
3327
+ WRITE_ONCE(rq->ttwu_pending, 0);
3328
+
23423329 rq_lock_irqsave(rq, &rf);
23433330 update_rq_clock(rq);
23443331
2345
- llist_for_each_entry_safe(p, t, llist, wake_entry)
3332
+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3333
+ if (WARN_ON_ONCE(p->on_cpu))
3334
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
3335
+
3336
+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3337
+ set_task_cpu(p, cpu_of(rq));
3338
+
23463339 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
3340
+ }
23473341
23483342 rq_unlock_irqrestore(rq, &rf);
23493343 }
23503344
2351
-void scheduler_ipi(void)
3345
+void send_call_function_single_ipi(int cpu)
23523346 {
2353
- /*
2354
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
2355
- * TIF_NEED_RESCHED remotely (for the first time) will also send
2356
- * this IPI.
2357
- */
2358
- preempt_fold_need_resched();
3347
+ struct rq *rq = cpu_rq(cpu);
23593348
2360
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2361
- return;
2362
-
2363
- /*
2364
- * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2365
- * traditionally all their work was done from the interrupt return
2366
- * path. Now that we actually do some work, we need to make sure
2367
- * we do call them.
2368
- *
2369
- * Some archs already do call them, luckily irq_enter/exit nest
2370
- * properly.
2371
- *
2372
- * Arguably we should visit all archs and update all handlers,
2373
- * however a fair share of IPIs are still resched only so this would
2374
- * somewhat pessimize the simple resched case.
2375
- */
2376
- irq_enter();
2377
- sched_ttwu_pending();
2378
-
2379
- /*
2380
- * Check if someone kicked us for doing the nohz idle load balance.
2381
- */
2382
- if (unlikely(got_nohz_idle_kick())) {
2383
- this_rq()->idle_balance = 1;
2384
- raise_softirq_irqoff(SCHED_SOFTIRQ);
2385
- }
2386
- irq_exit();
3349
+ if (!set_nr_if_polling(rq->idle))
3350
+ arch_send_call_function_single_ipi(cpu);
3351
+ else
3352
+ trace_sched_wake_idle_without_ipi(cpu);
23873353 }
23883354
2389
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
3355
+/*
3356
+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
3357
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
3358
+ * via sched_ttwu_pending() for activation so the wakee incurs the cost
3359
+ * of the wakeup instead of the waker.
3360
+ */
3361
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
23903362 {
23913363 struct rq *rq = cpu_rq(cpu);
23923364
23933365 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
23943366
2395
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
2396
- if (!set_nr_if_polling(rq->idle))
2397
- smp_send_reschedule(cpu);
2398
- else
2399
- trace_sched_wake_idle_without_ipi(cpu);
2400
- }
3367
+ WRITE_ONCE(rq->ttwu_pending, 1);
3368
+ __smp_call_single_queue(cpu, &p->wake_entry.llist);
24013369 }
24023370
24033371 void wake_up_if_idle(int cpu)
....@@ -2423,6 +3391,7 @@
24233391 out:
24243392 rcu_read_unlock();
24253393 }
3394
+EXPORT_SYMBOL_GPL(wake_up_if_idle);
24263395
24273396 bool cpus_share_cache(int this_cpu, int that_cpu)
24283397 {
....@@ -2431,6 +3400,58 @@
24313400
24323401 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
24333402 }
3403
+
3404
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
3405
+{
3406
+ /*
3407
+ * If the CPU does not share cache, then queue the task on the
3408
+ * remote rqs wakelist to avoid accessing remote data.
3409
+ */
3410
+ if (!cpus_share_cache(smp_processor_id(), cpu))
3411
+ return true;
3412
+
3413
+ /*
3414
+ * If the task is descheduling and the only running task on the
3415
+ * CPU then use the wakelist to offload the task activation to
3416
+ * the soon-to-be-idle CPU as the current CPU is likely busy.
3417
+ * nr_running is checked to avoid unnecessary task stacking.
3418
+ *
3419
+ * Note that we can only get here with (wakee) p->on_rq=0,
3420
+ * p->on_cpu can be whatever, we've done the dequeue, so
3421
+ * the wakee has been accounted out of ->nr_running.
3422
+ */
3423
+ if ((wake_flags & WF_ON_CPU) && !cpu_rq(cpu)->nr_running)
3424
+ return true;
3425
+
3426
+ return false;
3427
+}
3428
+
3429
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3430
+{
3431
+ bool cond = false;
3432
+
3433
+ trace_android_rvh_ttwu_cond(&cond);
3434
+
3435
+ if ((sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) ||
3436
+ cond) {
3437
+ if (WARN_ON_ONCE(cpu == smp_processor_id()))
3438
+ return false;
3439
+
3440
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
3441
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
3442
+ return true;
3443
+ }
3444
+
3445
+ return false;
3446
+}
3447
+
3448
+#else /* !CONFIG_SMP */
3449
+
3450
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3451
+{
3452
+ return false;
3453
+}
3454
+
24343455 #endif /* CONFIG_SMP */
24353456
24363457 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
....@@ -2438,13 +3459,8 @@
24383459 struct rq *rq = cpu_rq(cpu);
24393460 struct rq_flags rf;
24403461
2441
-#if defined(CONFIG_SMP)
2442
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
2443
- sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2444
- ttwu_queue_remote(p, cpu, wake_flags);
3462
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
24453463 return;
2446
- }
2447
-#endif
24483464
24493465 rq_lock(rq, &rf);
24503466 update_rq_clock(rq);
....@@ -2500,8 +3516,8 @@
25003516 * migration. However the means are completely different as there is no lock
25013517 * chain to provide order. Instead we do:
25023518 *
2503
- * 1) smp_store_release(X->on_cpu, 0)
2504
- * 2) smp_cond_load_acquire(!X->on_cpu)
3519
+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
3520
+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
25053521 *
25063522 * Example:
25073523 *
....@@ -2540,45 +3556,113 @@
25403556 * @p: the thread to be awakened
25413557 * @state: the mask of task states that can be woken
25423558 * @wake_flags: wake modifier flags (WF_*)
2543
- * @sibling_count_hint: A hint at the number of threads that are being woken up
2544
- * in this event.
25453559 *
2546
- * If (@state & @p->state) @p->state = TASK_RUNNING.
3560
+ * Conceptually does:
3561
+ *
3562
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
25473563 *
25483564 * If the task was not queued/runnable, also place it back on a runqueue.
25493565 *
2550
- * Atomic against schedule() which would dequeue a task, also see
2551
- * set_current_state().
3566
+ * This function is atomic against schedule() which would dequeue the task.
25523567 *
2553
- * This function executes a full memory barrier before accessing the task
2554
- * state; see set_current_state().
3568
+ * It issues a full memory barrier before accessing @p->state, see the comment
3569
+ * with set_current_state().
3570
+ *
3571
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
3572
+ *
3573
+ * Relies on p->pi_lock stabilizing:
3574
+ * - p->sched_class
3575
+ * - p->cpus_ptr
3576
+ * - p->sched_task_group
3577
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3578
+ *
3579
+ * Tries really hard to only take one task_rq(p)->lock for performance.
3580
+ * Takes rq->lock in:
3581
+ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
3582
+ * - ttwu_queue() -- new rq, for enqueue of the task;
3583
+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3584
+ *
3585
+ * As a consequence we race really badly with just about everything. See the
3586
+ * many memory barriers and their comments for details.
25553587 *
25563588 * Return: %true if @p->state changes (an actual wakeup was done),
25573589 * %false otherwise.
25583590 */
25593591 static int
2560
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
2561
- int sibling_count_hint)
3592
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
25623593 {
25633594 unsigned long flags;
25643595 int cpu, success = 0;
25653596
3597
+ preempt_disable();
3598
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && p == current) {
3599
+ /*
3600
+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
3601
+ * == smp_processor_id()'. Together this means we can special
3602
+ * case the whole 'p->on_rq && ttwu_runnable()' case below
3603
+ * without taking any locks.
3604
+ *
3605
+ * In particular:
3606
+ * - we rely on Program-Order guarantees for all the ordering,
3607
+ * - we're serialized against set_special_state() by virtue of
3608
+ * it disabling IRQs (this allows not taking ->pi_lock).
3609
+ */
3610
+ if (!(p->state & state))
3611
+ goto out;
3612
+
3613
+ success = 1;
3614
+ trace_sched_waking(p);
3615
+ p->state = TASK_RUNNING;
3616
+ trace_sched_wakeup(p);
3617
+ goto out;
3618
+ }
3619
+
25663620 /*
25673621 * If we are going to wake up a thread waiting for CONDITION we
25683622 * need to ensure that CONDITION=1 done by the caller can not be
2569
- * reordered with p->state check below. This pairs with mb() in
2570
- * set_current_state() the waiting thread does.
3623
+ * reordered with p->state check below. This pairs with smp_store_mb()
3624
+ * in set_current_state() that the waiting thread does.
25713625 */
25723626 raw_spin_lock_irqsave(&p->pi_lock, flags);
25733627 smp_mb__after_spinlock();
2574
- if (!(p->state & state))
2575
- goto out;
3628
+ if (!(p->state & state)) {
3629
+ /*
3630
+ * The task might be running due to a spinlock sleeper
3631
+ * wakeup. Check the saved state and set it to running
3632
+ * if the wakeup condition is true.
3633
+ */
3634
+ if (!(wake_flags & WF_LOCK_SLEEPER)) {
3635
+ if (p->saved_state & state) {
3636
+ p->saved_state = TASK_RUNNING;
3637
+ success = 1;
3638
+ }
3639
+ }
3640
+ goto unlock;
3641
+ }
3642
+ /*
3643
+ * If this is a regular wakeup, then we can unconditionally
3644
+ * clear the saved state of a "lock sleeper".
3645
+ */
3646
+ if (!(wake_flags & WF_LOCK_SLEEPER))
3647
+ p->saved_state = TASK_RUNNING;
3648
+
3649
+#ifdef CONFIG_FREEZER
3650
+ /*
3651
+ * If we're going to wake up a thread which may be frozen, then
3652
+ * we can only do so if we have an active CPU which is capable of
3653
+ * running it. This may not be the case when resuming from suspend,
3654
+ * as the secondary CPUs may not yet be back online. See __thaw_task()
3655
+ * for the actual wakeup.
3656
+ */
3657
+ if (unlikely(frozen_or_skipped(p)) &&
3658
+ !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p)))
3659
+ goto unlock;
3660
+#endif
25763661
25773662 trace_sched_waking(p);
25783663
25793664 /* We're going to change ->state: */
25803665 success = 1;
2581
- cpu = task_cpu(p);
25823666
25833667 /*
25843668 * Ensure we load p->on_rq _after_ p->state, otherwise it would
....@@ -2599,10 +3683,15 @@
25993683 *
26003684 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
26013685 * __schedule(). See the comment for smp_mb__after_spinlock().
3686
+ *
3687
+ * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
26023688 */
26033689 smp_rmb();
2604
- if (p->on_rq && ttwu_remote(p, wake_flags))
2605
- goto stat;
3690
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3691
+ goto unlock;
3692
+
3693
+ if (p->state & TASK_UNINTERRUPTIBLE)
3694
+ trace_sched_blocked_reason(p);
26063695
26073696 #ifdef CONFIG_SMP
26083697 /*
....@@ -2623,8 +3712,43 @@
26233712 *
26243713 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
26253714 * __schedule(). See the comment for smp_mb__after_spinlock().
3715
+ *
3716
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
3717
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
3718
+ * care about its own p->state. See the comment in __schedule().
26263719 */
2627
- smp_rmb();
3720
+ smp_acquire__after_ctrl_dep();
3721
+
3722
+ /*
3723
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
3724
+ * == 0), which means we need to do an enqueue, change p->state to
3725
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
3726
+ * enqueue, such as ttwu_queue_wakelist().
3727
+ */
3728
+ p->state = TASK_WAKING;
3729
+
3730
+ /*
3731
+ * If the owning (remote) CPU is still in the middle of schedule() with
3732
+ * this task as prev, considering queueing p on the remote CPUs wake_list
3733
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
3734
+ * let the waker make forward progress. This is safe because IRQs are
3735
+ * disabled and the IPI will deliver after on_cpu is cleared.
3736
+ *
3737
+ * Ensure we load task_cpu(p) after p->on_cpu:
3738
+ *
3739
+ * set_task_cpu(p, cpu);
3740
+ * STORE p->cpu = @cpu
3741
+ * __schedule() (switch to task 'p')
3742
+ * LOCK rq->lock
3743
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
3744
+ * STORE p->on_cpu = 1 LOAD p->cpu
3745
+ *
3746
+ * to ensure we observe the correct CPU on which the task is currently
3747
+ * scheduling.
3748
+ */
3749
+ if (smp_load_acquire(&p->on_cpu) &&
3750
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
3751
+ goto unlock;
26283752
26293753 /*
26303754 * If the owning (remote) CPU is still in the middle of schedule() with
....@@ -2637,88 +3761,79 @@
26373761 */
26383762 smp_cond_load_acquire(&p->on_cpu, !VAL);
26393763
2640
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
2641
- p->state = TASK_WAKING;
3764
+ trace_android_rvh_try_to_wake_up(p);
26423765
2643
- if (p->in_iowait) {
2644
- delayacct_blkio_end(p);
2645
- atomic_dec(&task_rq(p)->nr_iowait);
2646
- }
2647
-
2648
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
2649
- sibling_count_hint);
3766
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
26503767 if (task_cpu(p) != cpu) {
3768
+ if (p->in_iowait) {
3769
+ delayacct_blkio_end(p);
3770
+ atomic_dec(&task_rq(p)->nr_iowait);
3771
+ }
3772
+
26513773 wake_flags |= WF_MIGRATED;
26523774 psi_ttwu_dequeue(p);
26533775 set_task_cpu(p, cpu);
26543776 }
2655
-
2656
-#else /* CONFIG_SMP */
2657
-
2658
- if (p->in_iowait) {
2659
- delayacct_blkio_end(p);
2660
- atomic_dec(&task_rq(p)->nr_iowait);
2661
- }
2662
-
3777
+#else
3778
+ cpu = task_cpu(p);
26633779 #endif /* CONFIG_SMP */
26643780
26653781 ttwu_queue(p, cpu, wake_flags);
2666
-stat:
2667
- ttwu_stat(p, cpu, wake_flags);
2668
-out:
3782
+unlock:
26693783 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3784
+out:
3785
+ if (success) {
3786
+ trace_android_rvh_try_to_wake_up_success(p);
3787
+ ttwu_stat(p, task_cpu(p), wake_flags);
3788
+ }
3789
+ preempt_enable();
26703790
26713791 return success;
26723792 }
26733793
26743794 /**
2675
- * try_to_wake_up_local - try to wake up a local task with rq lock held
2676
- * @p: the thread to be awakened
2677
- * @rf: request-queue flags for pinning
3795
+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
3796
+ * @p: Process for which the function is to be invoked, can be @current.
3797
+ * @func: Function to invoke.
3798
+ * @arg: Argument to function.
26783799 *
2679
- * Put @p on the run-queue if it's not already there. The caller must
2680
- * ensure that this_rq() is locked, @p is bound to this_rq() and not
2681
- * the current task.
3800
+ * If the specified task can be quickly locked into a definite state
3801
+ * (either sleeping or on a given runqueue), arrange to keep it in that
3802
+ * state while invoking @func(@arg). This function can use ->on_rq and
3803
+ * task_curr() to work out what the state is, if required. Given that
3804
+ * @func can be invoked with a runqueue lock held, it had better be quite
3805
+ * lightweight.
3806
+ *
3807
+ * Returns:
3808
+ * @false if the task slipped out from under the locks.
3809
+ * @true if the task was locked onto a runqueue or is sleeping.
3810
+ * However, @func can override this by returning @false.
26823811 */
2683
-static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
3812
+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
26843813 {
2685
- struct rq *rq = task_rq(p);
3814
+ struct rq_flags rf;
3815
+ bool ret = false;
3816
+ struct rq *rq;
26863817
2687
- if (WARN_ON_ONCE(rq != this_rq()) ||
2688
- WARN_ON_ONCE(p == current))
2689
- return;
2690
-
2691
- lockdep_assert_held(&rq->lock);
2692
-
2693
- if (!raw_spin_trylock(&p->pi_lock)) {
2694
- /*
2695
- * This is OK, because current is on_cpu, which avoids it being
2696
- * picked for load-balance and preemption/IRQs are still
2697
- * disabled avoiding further scheduler activity on it and we've
2698
- * not yet picked a replacement task.
2699
- */
2700
- rq_unlock(rq, rf);
2701
- raw_spin_lock(&p->pi_lock);
2702
- rq_relock(rq, rf);
2703
- }
2704
-
2705
- if (!(p->state & TASK_NORMAL))
2706
- goto out;
2707
-
2708
- trace_sched_waking(p);
2709
-
2710
- if (!task_on_rq_queued(p)) {
2711
- if (p->in_iowait) {
2712
- delayacct_blkio_end(p);
2713
- atomic_dec(&rq->nr_iowait);
3818
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3819
+ if (p->on_rq) {
3820
+ rq = __task_rq_lock(p, &rf);
3821
+ if (task_rq(p) == rq)
3822
+ ret = func(p, arg);
3823
+ rq_unlock(rq, &rf);
3824
+ } else {
3825
+ switch (p->state) {
3826
+ case TASK_RUNNING:
3827
+ case TASK_WAKING:
3828
+ break;
3829
+ default:
3830
+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
3831
+ if (!p->on_rq)
3832
+ ret = func(p, arg);
27143833 }
2715
- ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
27163834 }
2717
-
2718
- ttwu_do_wakeup(rq, p, 0, rf);
2719
- ttwu_stat(p, smp_processor_id(), 0);
2720
-out:
2721
- raw_spin_unlock(&p->pi_lock);
3835
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3836
+ return ret;
27223837 }
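A minimal (hypothetical) use of the helper above; the callback and the reporting wrapper are illustrative only:

/* Hypothetical callback: record whether @t is currently queued on a runqueue. */
static bool note_queued(struct task_struct *t, void *arg)
{
	*(bool *)arg = !!t->on_rq;
	return true;	/* keep the "task was locked down" result */
}

static void report_task(struct task_struct *p)
{
	bool queued = false;

	if (try_invoke_on_locked_down_task(p, note_queued, &queued))
		pr_info("pid %d: queued=%d\n", task_pid_nr(p), queued);
}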
27233838
27243839 /**
....@@ -2734,13 +3849,25 @@
27343849 */
27353850 int wake_up_process(struct task_struct *p)
27363851 {
2737
- return try_to_wake_up(p, TASK_NORMAL, 0, 1);
3852
+ return try_to_wake_up(p, TASK_NORMAL, 0);
27383853 }
27393854 EXPORT_SYMBOL(wake_up_process);
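Paired with the wait-loop pattern documented above ttwu_runnable(), a typical producer/consumer use of wake_up_process() looks roughly like this; the queue type and its helpers are hypothetical:

/* Consumer (hypothetical kthread): sleep until there is work. */
static int consumer_fn(void *data)
{
	struct my_queue *q = data;			/* illustrative type */

	while (!kthread_should_stop()) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (my_queue_empty(q) && !kthread_should_stop())
			schedule();
		__set_current_state(TASK_RUNNING);
		my_queue_drain(q);			/* illustrative helper */
	}
	return 0;
}

/* Producer: publish the item first, then wake the consumer. */
static void producer_add(struct my_queue *q, struct item *it)
{
	my_queue_push(q, it);				/* illustrative helper */
	wake_up_process(q->consumer);			/* field name is illustrative */
}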
27403855
3856
+/**
3857
+ * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
3858
+ * @p: The process to be woken up.
3859
+ *
3860
+ * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
3861
+ * the nature of the wakeup.
3862
+ */
3863
+int wake_up_lock_sleeper(struct task_struct *p)
3864
+{
3865
+ return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER);
3866
+}
3867
+
27413868 int wake_up_state(struct task_struct *p, unsigned int state)
27423869 {
2743
- return try_to_wake_up(p, state, 0, 1);
3870
+ return try_to_wake_up(p, state, 0);
27443871 }
27453872
27463873 /*
....@@ -2765,6 +3892,8 @@
27653892 p->se.cfs_rq = NULL;
27663893 #endif
27673894
3895
+ trace_android_rvh_sched_fork_init(p);
3896
+
27683897 #ifdef CONFIG_SCHEDSTATS
27693898 /* Even if schedstat is disabled, there should not be garbage */
27703899 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
....@@ -2785,7 +3914,14 @@
27853914 INIT_HLIST_HEAD(&p->preempt_notifiers);
27863915 #endif
27873916
3917
+#ifdef CONFIG_COMPACTION
3918
+ p->capture_control = NULL;
3919
+#endif
27883920 init_numa_balancing(clone_flags, p);
3921
+#ifdef CONFIG_SMP
3922
+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
3923
+ p->migration_pending = NULL;
3924
+#endif
27893925 }
27903926
27913927 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
....@@ -2802,7 +3938,7 @@
28023938
28033939 #ifdef CONFIG_PROC_SYSCTL
28043940 int sysctl_numa_balancing(struct ctl_table *table, int write,
2805
- void __user *buffer, size_t *lenp, loff_t *ppos)
3941
+ void *buffer, size_t *lenp, loff_t *ppos)
28063942 {
28073943 struct ctl_table t;
28083944 int err;
....@@ -2876,8 +4012,8 @@
28764012 }
28774013
28784014 #ifdef CONFIG_PROC_SYSCTL
2879
-int sysctl_schedstats(struct ctl_table *table, int write,
2880
- void __user *buffer, size_t *lenp, loff_t *ppos)
4015
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
4016
+ size_t *lenp, loff_t *ppos)
28814017 {
28824018 struct ctl_table t;
28834019 int err;
....@@ -2905,7 +4041,7 @@
29054041 */
29064042 int sched_fork(unsigned long clone_flags, struct task_struct *p)
29074043 {
2908
- unsigned long flags;
4044
+ trace_android_rvh_sched_fork(p);
29094045
29104046 __sched_fork(clone_flags, p);
29114047 /*
....@@ -2919,6 +4055,7 @@
29194055 * Make sure we do not leak PI boosting priority to the child.
29204056 */
29214057 p->prio = current->normal_prio;
4058
+ trace_android_rvh_prepare_prio_fork(p);
29224059
29234060 uclamp_fork(p);
29244061
....@@ -2933,8 +4070,8 @@
29334070 } else if (PRIO_TO_NICE(p->static_prio) < 0)
29344071 p->static_prio = NICE_TO_PRIO(0);
29354072
2936
- p->prio = p->normal_prio = __normal_prio(p);
2937
- set_load_weight(p, false);
4073
+ p->prio = p->normal_prio = p->static_prio;
4074
+ set_load_weight(p);
29384075
29394076 /*
29404077 * We don't need the reset flag anymore after the fork. It has
....@@ -2951,24 +4088,8 @@
29514088 p->sched_class = &fair_sched_class;
29524089
29534090 init_entity_runnable_average(&p->se);
4091
+ trace_android_rvh_finish_prio_fork(p);
29544092
2955
- /*
2956
- * The child is not yet in the pid-hash so no cgroup attach races,
2957
- * and the cgroup is pinned to this child due to cgroup_fork()
2958
- * is ran before sched_fork().
2959
- *
2960
- * Silence PROVE_RCU.
2961
- */
2962
- raw_spin_lock_irqsave(&p->pi_lock, flags);
2963
- rseq_migrate(p);
2964
- /*
2965
- * We're setting the CPU for the first time, we don't migrate,
2966
- * so use __set_task_cpu().
2967
- */
2968
- __set_task_cpu(p, smp_processor_id());
2969
- if (p->sched_class->task_fork)
2970
- p->sched_class->task_fork(p);
2971
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
29724093
29734094 #ifdef CONFIG_SCHED_INFO
29744095 if (likely(sched_info_on()))
....@@ -2978,11 +4099,49 @@
29784099 p->on_cpu = 0;
29794100 #endif
29804101 init_task_preempt_count(p);
4102
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
4103
+ task_thread_info(p)->preempt_lazy_count = 0;
4104
+#endif
29814105 #ifdef CONFIG_SMP
29824106 plist_node_init(&p->pushable_tasks, MAX_PRIO);
29834107 RB_CLEAR_NODE(&p->pushable_dl_tasks);
29844108 #endif
29854109 return 0;
4110
+}
4111
+
4112
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
4113
+{
4114
+ unsigned long flags;
4115
+
4116
+ /*
4117
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
4118
+ * required yet, but lockdep gets upset if rules are violated.
4119
+ */
4120
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
4121
+#ifdef CONFIG_CGROUP_SCHED
4122
+ if (1) {
4123
+ struct task_group *tg;
4124
+
4125
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
4126
+ struct task_group, css);
4127
+ tg = autogroup_task_group(p, tg);
4128
+ p->sched_task_group = tg;
4129
+ }
4130
+#endif
4131
+ rseq_migrate(p);
4132
+ /*
4133
+ * We're setting the CPU for the first time, we don't migrate,
4134
+ * so use __set_task_cpu().
4135
+ */
4136
+ __set_task_cpu(p, smp_processor_id());
4137
+ if (p->sched_class->task_fork)
4138
+ p->sched_class->task_fork(p);
4139
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4140
+}
4141
+
4142
+void sched_post_fork(struct task_struct *p)
4143
+{
4144
+ uclamp_post_fork(p);
29864145 }
29874146
29884147 unsigned long to_ratio(u64 period, u64 runtime)
....@@ -3013,12 +4172,14 @@
30134172 struct rq_flags rf;
30144173 struct rq *rq;
30154174
4175
+ trace_android_rvh_wake_up_new_task(p);
4176
+
30164177 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
30174178 p->state = TASK_RUNNING;
30184179 #ifdef CONFIG_SMP
30194180 /*
30204181 * Fork balancing, do it here and not earlier because:
3021
- * - cpus_allowed can change in the fork path
4182
+ * - cpus_ptr can change in the fork path
30224183 * - any previously selected CPU might disappear through hotplug
30234184 *
30244185 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
....@@ -3026,14 +4187,14 @@
30264187 */
30274188 p->recent_used_cpu = task_cpu(p);
30284189 rseq_migrate(p);
3029
- __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
4190
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
30304191 #endif
30314192 rq = __task_rq_lock(p, &rf);
30324193 update_rq_clock(rq);
3033
- post_init_entity_util_avg(&p->se);
4194
+ post_init_entity_util_avg(p);
4195
+ trace_android_rvh_new_task_stats(p);
30344196
30354197 activate_task(rq, p, ENQUEUE_NOCLOCK);
3036
- p->on_rq = TASK_ON_RQ_QUEUED;
30374198 trace_sched_wakeup_new(p);
30384199 check_preempt_curr(rq, p, WF_FORK);
30394200 #ifdef CONFIG_SMP
....@@ -3143,8 +4304,10 @@
31434304 /*
31444305 * Claim the task as running, we do this before switching to it
31454306 * such that any running task will have this set.
4307
+ *
4308
+ * See the ttwu() WF_ON_CPU case and its ordering comment.
31464309 */
3147
- next->on_cpu = 1;
4310
+ WRITE_ONCE(next->on_cpu, 1);
31484311 #endif
31494312 }
31504313
....@@ -3152,8 +4315,9 @@
31524315 {
31534316 #ifdef CONFIG_SMP
31544317 /*
3155
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
3156
- * We must ensure this doesn't happen until the switch is completely
4318
+ * This must be the very last reference to @prev from this CPU. After
4319
+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
4320
+ * must ensure this doesn't happen until the switch is completely
31574321 * finished.
31584322 *
31594323 * In particular, the load of prev->state in finish_task_switch() must
....@@ -3165,6 +4329,90 @@
31654329 #endif
31664330 }
31674331
4332
+#ifdef CONFIG_SMP
4333
+
4334
+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
4335
+{
4336
+ void (*func)(struct rq *rq);
4337
+ struct callback_head *next;
4338
+
4339
+ lockdep_assert_held(&rq->lock);
4340
+
4341
+ while (head) {
4342
+ func = (void (*)(struct rq *))head->func;
4343
+ next = head->next;
4344
+ head->next = NULL;
4345
+ head = next;
4346
+
4347
+ func(rq);
4348
+ }
4349
+}
4350
+
4351
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4352
+{
4353
+ struct callback_head *head = rq->balance_callback;
4354
+
4355
+ lockdep_assert_held(&rq->lock);
4356
+ if (head) {
4357
+ rq->balance_callback = NULL;
4358
+ rq->balance_flags &= ~BALANCE_WORK;
4359
+ }
4360
+
4361
+ return head;
4362
+}
4363
+
4364
+static void __balance_callbacks(struct rq *rq)
4365
+{
4366
+ do_balance_callbacks(rq, splice_balance_callbacks(rq));
4367
+}
4368
+
4369
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4370
+{
4371
+ unsigned long flags;
4372
+
4373
+ if (unlikely(head)) {
4374
+ raw_spin_lock_irqsave(&rq->lock, flags);
4375
+ do_balance_callbacks(rq, head);
4376
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
4377
+ }
4378
+}
4379
+
4380
+static void balance_push(struct rq *rq);
4381
+
4382
+static inline void balance_switch(struct rq *rq)
4383
+{
4384
+ if (likely(!rq->balance_flags))
4385
+ return;
4386
+
4387
+ if (rq->balance_flags & BALANCE_PUSH) {
4388
+ balance_push(rq);
4389
+ return;
4390
+ }
4391
+
4392
+ __balance_callbacks(rq);
4393
+}
4394
+
4395
+#else
4396
+
4397
+static inline void __balance_callbacks(struct rq *rq)
4398
+{
4399
+}
4400
+
4401
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4402
+{
4403
+ return NULL;
4404
+}
4405
+
4406
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4407
+{
4408
+}
4409
+
4410
+static inline void balance_switch(struct rq *rq)
4411
+{
4412
+}
4413
+
4414
+#endif
4415
+
31684416 static inline void
31694417 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
31704418 {
....@@ -3175,7 +4423,7 @@
31754423 * do an early lockdep release here:
31764424 */
31774425 rq_unpin_lock(rq, rf);
3178
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4426
+ spin_release(&rq->lock.dep_map, _THIS_IP_);
31794427 #ifdef CONFIG_DEBUG_SPINLOCK
31804428 /* this is a valid case when another task releases the spinlock */
31814429 rq->lock.owner = next;
....@@ -3190,6 +4438,7 @@
31904438 * prev into current:
31914439 */
31924440 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
4441
+ balance_switch(rq);
31934442 raw_spin_unlock_irq(&rq->lock);
31944443 }
31954444
....@@ -3204,6 +4453,22 @@
32044453 #ifndef finish_arch_post_lock_switch
32054454 # define finish_arch_post_lock_switch() do { } while (0)
32064455 #endif
4456
+
4457
+static inline void kmap_local_sched_out(void)
4458
+{
4459
+#ifdef CONFIG_KMAP_LOCAL
4460
+ if (unlikely(current->kmap_ctrl.idx))
4461
+ __kmap_local_sched_out();
4462
+#endif
4463
+}
4464
+
4465
+static inline void kmap_local_sched_in(void)
4466
+{
4467
+#ifdef CONFIG_KMAP_LOCAL
4468
+ if (unlikely(current->kmap_ctrl.idx))
4469
+ __kmap_local_sched_in();
4470
+#endif
4471
+}
32074472
32084473 /**
32094474 * prepare_task_switch - prepare to switch tasks
....@@ -3227,6 +4492,7 @@
32274492 perf_event_task_sched_out(prev, next);
32284493 rseq_preempt(prev);
32294494 fire_sched_out_preempt_notifiers(prev, next);
4495
+ kmap_local_sched_out();
32304496 prepare_task(next);
32314497 prepare_arch_switch(next);
32324498 }
....@@ -3293,6 +4559,7 @@
32934559 finish_lock_switch(rq);
32944560 finish_arch_post_lock_switch();
32954561 kcov_finish_switch(current);
4562
+ kmap_local_sched_in();
32964563
32974564 fire_sched_in_preempt_notifiers(current);
32984565 /*
....@@ -3307,66 +4574,24 @@
33074574 * provided by mmdrop(),
33084575 * - a sync_core for SYNC_CORE.
33094576 */
4577
+ /*
4578
+ * We use mmdrop_delayed() here so we don't have to do the
4579
+ * full __mmdrop() when we are the last user.
4580
+ */
33104581 if (mm) {
33114582 membarrier_mm_sync_core_before_usermode(mm);
3312
- mmdrop(mm);
4583
+ mmdrop_delayed(mm);
33134584 }
33144585 if (unlikely(prev_state == TASK_DEAD)) {
33154586 if (prev->sched_class->task_dead)
33164587 prev->sched_class->task_dead(prev);
33174588
3318
- /*
3319
- * Remove function-return probe instances associated with this
3320
- * task and put them back on the free list.
3321
- */
3322
- kprobe_flush_task(prev);
3323
-
3324
- /* Task is done with its stack. */
3325
- put_task_stack(prev);
3326
-
3327
- put_task_struct(prev);
4589
+ put_task_struct_rcu_user(prev);
33284590 }
33294591
33304592 tick_nohz_task_switch();
33314593 return rq;
33324594 }
3333
-
3334
-#ifdef CONFIG_SMP
3335
-
3336
-/* rq->lock is NOT held, but preemption is disabled */
3337
-static void __balance_callback(struct rq *rq)
3338
-{
3339
- struct callback_head *head, *next;
3340
- void (*func)(struct rq *rq);
3341
- unsigned long flags;
3342
-
3343
- raw_spin_lock_irqsave(&rq->lock, flags);
3344
- head = rq->balance_callback;
3345
- rq->balance_callback = NULL;
3346
- while (head) {
3347
- func = (void (*)(struct rq *))head->func;
3348
- next = head->next;
3349
- head->next = NULL;
3350
- head = next;
3351
-
3352
- func(rq);
3353
- }
3354
- raw_spin_unlock_irqrestore(&rq->lock, flags);
3355
-}
3356
-
3357
-static inline void balance_callback(struct rq *rq)
3358
-{
3359
- if (unlikely(rq->balance_callback))
3360
- __balance_callback(rq);
3361
-}
3362
-
3363
-#else
3364
-
3365
-static inline void balance_callback(struct rq *rq)
3366
-{
3367
-}
3368
-
3369
-#endif
33704595
33714596 /**
33724597 * schedule_tail - first thing a freshly forked thread must call.
....@@ -3387,7 +4612,6 @@
33874612 */
33884613
33894614 rq = finish_task_switch(prev);
3390
- balance_callback(rq);
33914615 preempt_enable();
33924616
33934617 if (current->set_child_tid)
....@@ -3403,12 +4627,8 @@
34034627 context_switch(struct rq *rq, struct task_struct *prev,
34044628 struct task_struct *next, struct rq_flags *rf)
34054629 {
3406
- struct mm_struct *mm, *oldmm;
3407
-
34084630 prepare_task_switch(rq, prev, next);
34094631
3410
- mm = next->mm;
3411
- oldmm = prev->active_mm;
34124632 /*
34134633 * For paravirt, this is coupled with an exit in switch_to to
34144634 * combine the page table reload and the switch backend into
....@@ -3417,22 +4637,37 @@
34174637 arch_start_context_switch(prev);
34184638
34194639 /*
3420
- * If mm is non-NULL, we pass through switch_mm(). If mm is
3421
- * NULL, we will pass through mmdrop() in finish_task_switch().
3422
- * Both of these contain the full memory barrier required by
3423
- * membarrier after storing to rq->curr, before returning to
3424
- * user-space.
4640
+ * kernel -> kernel lazy + transfer active
4641
+ * user -> kernel lazy + mmgrab() active
4642
+ *
4643
+ * kernel -> user switch + mmdrop() active
4644
+ * user -> user switch
34254645 */
3426
- if (!mm) {
3427
- next->active_mm = oldmm;
3428
- mmgrab(oldmm);
3429
- enter_lazy_tlb(oldmm, next);
3430
- } else
3431
- switch_mm_irqs_off(oldmm, mm, next);
4646
+ if (!next->mm) { // to kernel
4647
+ enter_lazy_tlb(prev->active_mm, next);
34324648
3433
- if (!prev->mm) {
3434
- prev->active_mm = NULL;
3435
- rq->prev_mm = oldmm;
4649
+ next->active_mm = prev->active_mm;
4650
+ if (prev->mm) // from user
4651
+ mmgrab(prev->active_mm);
4652
+ else
4653
+ prev->active_mm = NULL;
4654
+ } else { // to user
4655
+ membarrier_switch_mm(rq, prev->active_mm, next->mm);
4656
+ /*
4657
+ * sys_membarrier() requires an smp_mb() between setting
4658
+ * rq->curr / membarrier_switch_mm() and returning to userspace.
4659
+ *
4660
+ * The below provides this either through switch_mm(), or in
4661
+ * case 'prev->active_mm == next->mm' through
4662
+ * finish_task_switch()'s mmdrop().
4663
+ */
4664
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
4665
+
4666
+ if (!prev->mm) { // from kernel
4667
+ /* will mmdrop() in finish_task_switch(). */
4668
+ rq->prev_mm = prev->active_mm;
4669
+ prev->active_mm = NULL;
4670
+ }
34364671 }
34374672
34384673 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
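/*
 * A stand-alone model of the four cases listed in the comment above: 'mm' is
 * the task's own address space (NULL for a kernel thread), 'active_mm' is
 * whatever it currently borrows, and 'users' stands in for mm_count.  All
 * names here are invented for the sketch, and the real kernel defers the
 * final drop to finish_task_switch() instead of doing it inline.
 */
#include <stddef.h>

struct mm   { int users; };
struct task { struct mm *mm, *active_mm; };

static void mm_grab(struct mm *mm) { mm->users++; }
static void mm_drop(struct mm *mm) { mm->users--; /* real mmdrop() frees at zero */ }

void switch_mm_refs(struct task *prev, struct task *next)
{
	if (!next->mm) {			/* to kernel */
		next->active_mm = prev->active_mm;
		if (prev->mm)			/* from user: take a reference */
			mm_grab(prev->active_mm);
		else				/* from kernel: transfer the reference */
			prev->active_mm = NULL;
	} else {				/* to user */
		next->active_mm = next->mm;
		if (!prev->mm) {		/* from kernel: give the reference back */
			mm_drop(prev->active_mm);
			prev->active_mm = NULL;
		}
	}
}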
....@@ -3469,7 +4704,7 @@
34694704 * preemption, thus the result might have a time-of-check-to-time-of-use
34704705 * race. The caller is responsible to use it correctly, for example:
34714706 *
3472
- * - from a non-preemptable section (of course)
4707
+ * - from a non-preemptible section (of course)
34734708 *
34744709 * - from a thread that is bound to a single CPU
34754710 *
....@@ -3490,6 +4725,18 @@
34904725 sum += cpu_rq(i)->nr_switches;
34914726
34924727 return sum;
4728
+}
4729
+
4730
+/*
4731
+ * Consumers of these two interfaces, like for example the cpuidle menu
4732
+ * governor, are using nonsensical data. Preferring shallow idle state selection
4733
+ * for a CPU that has IO-wait which might not even end up running the task when
4734
+ * it does become runnable.
4735
+ */
4736
+
4737
+unsigned long nr_iowait_cpu(int cpu)
4738
+{
4739
+ return atomic_read(&cpu_rq(cpu)->nr_iowait);
34934740 }
34944741
34954742 /*
....@@ -3527,29 +4774,9 @@
35274774 unsigned long i, sum = 0;
35284775
35294776 for_each_possible_cpu(i)
3530
- sum += atomic_read(&cpu_rq(i)->nr_iowait);
4777
+ sum += nr_iowait_cpu(i);
35314778
35324779 return sum;
3533
-}
3534
-
3535
-/*
3536
- * Consumers of these two interfaces, like for example the cpufreq menu
3537
- * governor are using nonsensical data. Boosting frequency for a CPU that has
3538
- * IO-wait which might not even end up running the task when it does become
3539
- * runnable.
3540
- */
3541
-
3542
-unsigned long nr_iowait_cpu(int cpu)
3543
-{
3544
- struct rq *this = cpu_rq(cpu);
3545
- return atomic_read(&this->nr_iowait);
3546
-}
3547
-
3548
-void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
3549
-{
3550
- struct rq *rq = this_rq();
3551
- *nr_waiters = atomic_read(&rq->nr_iowait);
3552
- *load = rq->load.weight;
35534780 }
35544781
35554782 #ifdef CONFIG_SMP
....@@ -3563,9 +4790,14 @@
35634790 struct task_struct *p = current;
35644791 unsigned long flags;
35654792 int dest_cpu;
4793
+ bool cond = false;
4794
+
4795
+ trace_android_rvh_sched_exec(&cond);
4796
+ if (cond)
4797
+ return;
35664798
35674799 raw_spin_lock_irqsave(&p->pi_lock, flags);
3568
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
4800
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
35694801 if (dest_cpu == smp_processor_id())
35704802 goto unlock;
35714803
....@@ -3648,6 +4880,7 @@
36484880
36494881 return ns;
36504882 }
4883
+EXPORT_SYMBOL_GPL(task_sched_runtime);
36514884
36524885 /*
36534886 * This function gets called by the timer code, with HZ frequency.
....@@ -3659,14 +4892,18 @@
36594892 struct rq *rq = cpu_rq(cpu);
36604893 struct task_struct *curr = rq->curr;
36614894 struct rq_flags rf;
4895
+ unsigned long thermal_pressure;
36624896
4897
+ arch_scale_freq_tick();
36634898 sched_clock_tick();
36644899
36654900 rq_lock(rq, &rf);
36664901
4902
+ trace_android_rvh_tick_entry(rq);
36674903 update_rq_clock(rq);
4904
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
4905
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
36684906 curr->sched_class->task_tick(rq, curr, 0);
3669
- cpu_load_update_active(rq);
36704907 calc_global_load_tick(rq);
36714908 psi_task_tick(rq);
36724909
....@@ -3678,6 +4915,8 @@
36784915 rq->idle_balance = idle_cpu(cpu);
36794916 trigger_load_balance(rq);
36804917 #endif
4918
+
4919
+ trace_android_vh_scheduler_tick(rq);
36814920 }
36824921
36834922 #ifdef CONFIG_NO_HZ_FULL
....@@ -3735,28 +4974,31 @@
37354974 * statistics and checks timeslices in a time-independent way, regardless
37364975 * of when exactly it is running.
37374976 */
3738
- if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
4977
+ if (!tick_nohz_tick_stopped_cpu(cpu))
37394978 goto out_requeue;
37404979
37414980 rq_lock_irq(rq, &rf);
37424981 curr = rq->curr;
3743
- if (is_idle_task(curr) || cpu_is_offline(cpu))
4982
+ if (cpu_is_offline(cpu))
37444983 goto out_unlock;
37454984
37464985 update_rq_clock(rq);
3747
- delta = rq_clock_task(rq) - curr->se.exec_start;
37484986
3749
- /*
3750
- * Make sure the next tick runs within a reasonable
3751
- * amount of time.
3752
- */
3753
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4987
+ if (!is_idle_task(curr)) {
4988
+ /*
4989
+ * Make sure the next tick runs within a reasonable
4990
+ * amount of time.
4991
+ */
4992
+ delta = rq_clock_task(rq) - curr->se.exec_start;
4993
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4994
+ }
37544995 curr->sched_class->task_tick(rq, curr, 0);
37554996
4997
+ calc_load_nohz_remote(rq);
37564998 out_unlock:
37574999 rq_unlock_irq(rq, &rf);
3758
-
37595000 out_requeue:
5001
+
37605002 /*
37615003 * Run the remote tick once per second (1Hz). This arbitrary
37625004 * frequency is large enough to avoid overload but short enough
....@@ -3820,7 +5062,7 @@
38205062 static inline void sched_tick_stop(int cpu) { }
38215063 #endif
38225064
3823
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
5065
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
38245066 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
38255067 /*
38265068 * If the value passed in is equal to the current preempt count
....@@ -3926,11 +5168,12 @@
39265168 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
39275169 && in_atomic_preempt_off()) {
39285170 pr_err("Preemption disabled at:");
3929
- print_ip_sym(preempt_disable_ip);
3930
- pr_cont("\n");
5171
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
39315172 }
39325173 if (panic_on_warn)
39335174 panic("scheduling while atomic\n");
5175
+
5176
+ trace_android_rvh_schedule_bug(prev);
39345177
39355178 dump_stack();
39365179 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
....@@ -3939,11 +5182,23 @@
39395182 /*
39405183 * Various schedule()-time debugging checks and statistics:
39415184 */
3942
-static inline void schedule_debug(struct task_struct *prev)
5185
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
39435186 {
39445187 #ifdef CONFIG_SCHED_STACK_END_CHECK
39455188 if (task_stack_end_corrupted(prev))
39465189 panic("corrupted stack end detected inside scheduler\n");
5190
+
5191
+ if (task_scs_end_corrupted(prev))
5192
+ panic("corrupted shadow stack detected inside scheduler\n");
5193
+#endif
5194
+
5195
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
5196
+ if (!preempt && prev->state && prev->non_block_count) {
5197
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
5198
+ prev->comm, prev->pid, prev->non_block_count);
5199
+ dump_stack();
5200
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5201
+ }
39475202 #endif
39485203
39495204 if (unlikely(in_atomic_preempt_off())) {
....@@ -3955,6 +5210,28 @@
39555210 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
39565211
39575212 schedstat_inc(this_rq()->sched_count);
5213
+}
5214
+
5215
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
5216
+ struct rq_flags *rf)
5217
+{
5218
+#ifdef CONFIG_SMP
5219
+ const struct sched_class *class;
5220
+ /*
5221
+ * We must do the balancing pass before put_prev_task(), such
5222
+ * that when we release the rq->lock the task is in the same
5223
+ * state as before we took rq->lock.
5224
+ *
5225
+ * We can terminate the balance pass as soon as we know there is
5226
+ * a runnable task of @class priority or higher.
5227
+ */
5228
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
5229
+ if (class->balance(rq, prev, rf))
5230
+ break;
5231
+ }
5232
+#endif
5233
+
5234
+ put_prev_task(rq, prev);
39585235 }
39595236
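/*
 * pick_next_task() below asks each scheduling class, from highest to lowest
 * priority, for a runnable task and takes the first one offered.  A
 * stand-alone sketch of that shape, using an invented 'struct klass' array
 * in place of the kernel's linked sched_class hierarchy:
 */
#include <stddef.h>

struct task;

struct klass {
	struct task *(*pick)(void);	/* returns NULL if nothing runnable */
};

struct task *pick_first(const struct klass *classes, size_t nr)
{
	size_t i;

	for (i = 0; i < nr; i++) {
		struct task *p = classes[i].pick();

		if (p)
			return p;
	}
	/* the kernel never gets here: the idle class always has a task */
	return NULL;
}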
39605237 /*
....@@ -3972,29 +5249,29 @@
39725249 * higher scheduling class, because otherwise those lose the
39735250 * opportunity to pull in more work from other CPUs.
39745251 */
3975
- if (likely((prev->sched_class == &idle_sched_class ||
3976
- prev->sched_class == &fair_sched_class) &&
5252
+ if (likely(prev->sched_class <= &fair_sched_class &&
39775253 rq->nr_running == rq->cfs.h_nr_running)) {
39785254
3979
- p = fair_sched_class.pick_next_task(rq, prev, rf);
5255
+ p = pick_next_task_fair(rq, prev, rf);
39805256 if (unlikely(p == RETRY_TASK))
3981
- goto again;
5257
+ goto restart;
39825258
39835259 /* Assumes fair_sched_class->next == idle_sched_class */
3984
- if (unlikely(!p))
3985
- p = idle_sched_class.pick_next_task(rq, prev, rf);
5260
+ if (!p) {
5261
+ put_prev_task(rq, prev);
5262
+ p = pick_next_task_idle(rq);
5263
+ }
39865264
39875265 return p;
39885266 }
39895267
3990
-again:
5268
+restart:
5269
+ put_prev_task_balance(rq, prev, rf);
5270
+
39915271 for_each_class(class) {
3992
- p = class->pick_next_task(rq, prev, rf);
3993
- if (p) {
3994
- if (unlikely(p == RETRY_TASK))
3995
- goto again;
5272
+ p = class->pick_next_task(rq);
5273
+ if (p)
39965274 return p;
3997
- }
39985275 }
39995276
40005277 /* The idle class should always have a runnable task: */
....@@ -4021,7 +5298,7 @@
40215298 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
40225299 * called on the nearest possible occasion:
40235300 *
4024
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
5301
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
40255302 *
40265303 * - in syscall or exception context, at the next outmost
40275304 * preempt_enable(). (this might be as soon as the wake_up()'s
....@@ -4030,7 +5307,7 @@
40305307 * - in IRQ context, return from interrupt-handler to
40315308 * preemptible context
40325309 *
4033
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
5310
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
40345311 * then at the next:
40355312 *
40365313 * - cond_resched() call
....@@ -4040,10 +5317,11 @@
40405317 *
40415318 * WARNING: must be called with preemption disabled!
40425319 */
4043
-static void __sched notrace __schedule(bool preempt)
5320
+static void __sched notrace __schedule(bool preempt, bool spinning_lock)
40445321 {
40455322 struct task_struct *prev, *next;
40465323 unsigned long *switch_count;
5324
+ unsigned long prev_state;
40475325 struct rq_flags rf;
40485326 struct rq *rq;
40495327 int cpu;
....@@ -4052,7 +5330,7 @@
40525330 rq = cpu_rq(cpu);
40535331 prev = rq->curr;
40545332
4055
- schedule_debug(prev);
5333
+ schedule_debug(prev, preempt);
40565334
40575335 if (sched_feat(HRTICK))
40585336 hrtick_clear(rq);
....@@ -4063,9 +5341,16 @@
40635341 /*
40645342 * Make sure that signal_pending_state()->signal_pending() below
40655343 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
4066
- * done by the caller to avoid the race with signal_wake_up().
5344
+ * done by the caller to avoid the race with signal_wake_up():
40675345 *
4068
- * The membarrier system call requires a full memory barrier
5346
+ * __set_current_state(@state) signal_wake_up()
5347
+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
5348
+ * wake_up_state(p, state)
5349
+ * LOCK rq->lock LOCK p->pi_lock
5350
+ * smp_mb__after_spinlock() smp_mb__after_spinlock()
5351
+ * if (signal_pending_state()) if (p->state & @state)
5352
+ *
5353
+ * Also, the membarrier system call requires a full memory barrier
40695354 * after coming from user-space, before storing to rq->curr.
40705355 */
40715356 rq_lock(rq, &rf);
....@@ -4076,29 +5361,43 @@
40765361 update_rq_clock(rq);
40775362
40785363 switch_count = &prev->nivcsw;
4079
- if (!preempt && prev->state) {
4080
- if (unlikely(signal_pending_state(prev->state, prev))) {
5364
+
5365
+ /*
5366
+ * We must load prev->state once (task_struct::state is volatile), such
5367
+ * that:
5368
+ *
5369
+ * - we form a control dependency vs deactivate_task() below.
5370
+ * - ptrace_{,un}freeze_traced() can change ->state underneath us.
5371
+ */
5372
+ prev_state = prev->state;
5373
+ if ((!preempt || spinning_lock) && prev_state) {
5374
+ if (signal_pending_state(prev_state, prev)) {
40815375 prev->state = TASK_RUNNING;
40825376 } else {
5377
+ prev->sched_contributes_to_load =
5378
+ (prev_state & TASK_UNINTERRUPTIBLE) &&
5379
+ !(prev_state & TASK_NOLOAD) &&
5380
+ !(prev->flags & PF_FROZEN);
5381
+
5382
+ if (prev->sched_contributes_to_load)
5383
+ rq->nr_uninterruptible++;
5384
+
5385
+ /*
5386
+ * __schedule() ttwu()
5387
+ * prev_state = prev->state; if (p->on_rq && ...)
5388
+ * if (prev_state) goto out;
5389
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
5390
+ * p->state = TASK_WAKING
5391
+ *
5392
+ * Where __schedule() and ttwu() have matching control dependencies.
5393
+ *
5394
+ * After this, schedule() must not care about p->state any more.
5395
+ */
40835396 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
4084
- prev->on_rq = 0;
40855397
40865398 if (prev->in_iowait) {
40875399 atomic_inc(&rq->nr_iowait);
40885400 delayacct_blkio_start();
4089
- }
4090
-
4091
- /*
4092
- * If a worker went to sleep, notify and ask workqueue
4093
- * whether it wants to wake up a task to maintain
4094
- * concurrency.
4095
- */
4096
- if (prev->flags & PF_WQ_WORKER) {
4097
- struct task_struct *to_wakeup;
4098
-
4099
- to_wakeup = wq_worker_sleeping(prev);
4100
- if (to_wakeup)
4101
- try_to_wake_up_local(to_wakeup, &rf);
41025401 }
41035402 }
41045403 switch_count = &prev->nvcsw;
....@@ -4106,11 +5405,17 @@
41065405
41075406 next = pick_next_task(rq, prev, &rf);
41085407 clear_tsk_need_resched(prev);
5408
+ clear_tsk_need_resched_lazy(prev);
41095409 clear_preempt_need_resched();
41105410
5411
+ trace_android_rvh_schedule(prev, next, rq);
41115412 if (likely(prev != next)) {
41125413 rq->nr_switches++;
4113
- rq->curr = next;
5414
+ /*
5415
+ * RCU users of rcu_dereference(rq->curr) may not see
5416
+ * changes to task_struct made by pick_next_task().
5417
+ */
5418
+ RCU_INIT_POINTER(rq->curr, next);
41145419 /*
41155420 * The membarrier system call requires each architecture
41165421 * to have a full memory barrier after updating
....@@ -4127,16 +5432,20 @@
41275432 */
41285433 ++*switch_count;
41295434
5435
+ migrate_disable_switch(rq, prev);
5436
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
5437
+
41305438 trace_sched_switch(preempt, prev, next);
41315439
41325440 /* Also unlocks the rq: */
41335441 rq = context_switch(rq, prev, next, &rf);
41345442 } else {
41355443 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
4136
- rq_unlock_irq(rq, &rf);
4137
- }
41385444
4139
- balance_callback(rq);
5445
+ rq_unpin_lock(rq, &rf);
5446
+ __balance_callbacks(rq);
5447
+ raw_spin_unlock_irq(&rq->lock);
5448
+ }
41405449 }
41415450
41425451 void __noreturn do_task_dead(void)
....@@ -4147,7 +5456,7 @@
41475456 /* Tell freezer to ignore us: */
41485457 current->flags |= PF_NOFREEZE;
41495458
4150
- __schedule(false);
5459
+ __schedule(false, false);
41515460 BUG();
41525461
41535462 /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
....@@ -4157,14 +5466,45 @@
41575466
41585467 static inline void sched_submit_work(struct task_struct *tsk)
41595468 {
4160
- if (!tsk->state || tsk_is_pi_blocked(tsk))
5469
+ unsigned int task_flags;
5470
+
5471
+ if (!tsk->state)
41615472 return;
5473
+
5474
+ task_flags = tsk->flags;
5475
+ /*
5476
+ * If a worker went to sleep, notify and ask workqueue whether
5477
+ * it wants to wake up a task to maintain concurrency.
5478
+ * As this function is called inside the schedule() context,
5479
+ * we disable preemption to avoid it calling schedule() again
5480
+ * in the possible wakeup of a kworker and because wq_worker_sleeping()
5481
+ * requires it.
5482
+ */
5483
+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5484
+ preempt_disable();
5485
+ if (task_flags & PF_WQ_WORKER)
5486
+ wq_worker_sleeping(tsk);
5487
+ else
5488
+ io_wq_worker_sleeping(tsk);
5489
+ preempt_enable_no_resched();
5490
+ }
5491
+
41625492 /*
41635493 * If we are going to sleep and we have plugged IO queued,
41645494 * make sure to submit it to avoid deadlocks.
41655495 */
41665496 if (blk_needs_flush_plug(tsk))
41675497 blk_schedule_flush_plug(tsk);
5498
+}
5499
+
5500
+static void sched_update_worker(struct task_struct *tsk)
5501
+{
5502
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5503
+ if (tsk->flags & PF_WQ_WORKER)
5504
+ wq_worker_running(tsk);
5505
+ else
5506
+ io_wq_worker_running(tsk);
5507
+ }
41685508 }
41695509
41705510 asmlinkage __visible void __sched schedule(void)
....@@ -4174,9 +5514,10 @@
41745514 sched_submit_work(tsk);
41755515 do {
41765516 preempt_disable();
4177
- __schedule(false);
5517
+ __schedule(false, false);
41785518 sched_preempt_enable_no_resched();
41795519 } while (need_resched());
5520
+ sched_update_worker(tsk);
41805521 }
41815522 EXPORT_SYMBOL(schedule);
41825523
....@@ -4201,7 +5542,7 @@
42015542 */
42025543 WARN_ON_ONCE(current->state);
42035544 do {
4204
- __schedule(false);
5545
+ __schedule(false, false);
42055546 } while (need_resched());
42065547 }
42075548
....@@ -4254,7 +5595,7 @@
42545595 */
42555596 preempt_disable_notrace();
42565597 preempt_latency_start(1);
4257
- __schedule(true);
5598
+ __schedule(true, false);
42585599 preempt_latency_stop(1);
42595600 preempt_enable_no_resched_notrace();
42605601
....@@ -4265,11 +5606,34 @@
42655606 } while (need_resched());
42665607 }
42675608
4268
-#ifdef CONFIG_PREEMPT
5609
+#ifdef CONFIG_PREEMPT_LAZY
42695610 /*
4270
- * this is the entry point to schedule() from in-kernel preemption
4271
- * off of preempt_enable. Kernel preemptions off return from interrupt
4272
- * occur there and call schedule directly.
5611
+ * If TIF_NEED_RESCHED is set then we allow being scheduled away, since it is
5612
+ * set by an RT task. Otherwise we try to avoid being scheduled out as long as
5613
+ * the preempt_lazy_count counter is > 0.
5614
+ */
5615
+static __always_inline int preemptible_lazy(void)
5616
+{
5617
+ if (test_thread_flag(TIF_NEED_RESCHED))
5618
+ return 1;
5619
+ if (current_thread_info()->preempt_lazy_count)
5620
+ return 0;
5621
+ return 1;
5622
+}
5623
+
5624
+#else
5625
+
5626
+static inline int preemptible_lazy(void)
5627
+{
5628
+ return 1;
5629
+}
5630
+
5631
+#endif
5632
+
5633
+#ifdef CONFIG_PREEMPTION
5634
+/*
5635
+ * This is the entry point to schedule() from in-kernel preemption
5636
+ * off of preempt_enable.
42735637 */
42745638 asmlinkage __visible void __sched notrace preempt_schedule(void)
42755639 {
....@@ -4279,11 +5643,25 @@
42795643 */
42805644 if (likely(!preemptible()))
42815645 return;
4282
-
5646
+ if (!preemptible_lazy())
5647
+ return;
42835648 preempt_schedule_common();
42845649 }
42855650 NOKPROBE_SYMBOL(preempt_schedule);
42865651 EXPORT_SYMBOL(preempt_schedule);
5652
+
5653
+#ifdef CONFIG_PREEMPT_RT
5654
+void __sched notrace preempt_schedule_lock(void)
5655
+{
5656
+ do {
5657
+ preempt_disable();
5658
+ __schedule(true, true);
5659
+ sched_preempt_enable_no_resched();
5660
+ } while (need_resched());
5661
+}
5662
+NOKPROBE_SYMBOL(preempt_schedule_lock);
5663
+EXPORT_SYMBOL(preempt_schedule_lock);
5664
+#endif
42875665
42885666 /**
42895667 * preempt_schedule_notrace - preempt_schedule called by tracing
....@@ -4304,6 +5682,9 @@
43045682 enum ctx_state prev_ctx;
43055683
43065684 if (likely(!preemptible()))
5685
+ return;
5686
+
5687
+ if (!preemptible_lazy())
43075688 return;
43085689
43095690 do {
....@@ -4328,7 +5709,7 @@
43285709 * an infinite recursion.
43295710 */
43305711 prev_ctx = exception_enter();
4331
- __schedule(true);
5712
+ __schedule(true, false);
43325713 exception_exit(prev_ctx);
43335714
43345715 preempt_latency_stop(1);
....@@ -4337,10 +5718,10 @@
43375718 }
43385719 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
43395720
4340
-#endif /* CONFIG_PREEMPT */
5721
+#endif /* CONFIG_PREEMPTION */
43415722
43425723 /*
4343
- * this is the entry point to schedule() from kernel preemption
5724
+ * This is the entry point to schedule() from kernel preemption
43445725 * off of irq context.
43455726 * Note, that this is called and return with irqs disabled. This will
43465727 * protect us against recursive calling from irq.
....@@ -4357,7 +5738,7 @@
43575738 do {
43585739 preempt_disable();
43595740 local_irq_enable();
4360
- __schedule(true);
5741
+ __schedule(true, false);
43615742 local_irq_disable();
43625743 sched_preempt_enable_no_resched();
43635744 } while (need_resched());
....@@ -4368,9 +5749,22 @@
43685749 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
43695750 void *key)
43705751 {
4371
- return try_to_wake_up(curr->private, mode, wake_flags, 1);
5752
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC | WF_ANDROID_VENDOR));
5753
+ return try_to_wake_up(curr->private, mode, wake_flags);
43725754 }
43735755 EXPORT_SYMBOL(default_wake_function);
5756
+
5757
+static void __setscheduler_prio(struct task_struct *p, int prio)
5758
+{
5759
+ if (dl_prio(prio))
5760
+ p->sched_class = &dl_sched_class;
5761
+ else if (rt_prio(prio))
5762
+ p->sched_class = &rt_sched_class;
5763
+ else
5764
+ p->sched_class = &fair_sched_class;
5765
+
5766
+ p->prio = prio;
5767
+}
43745768
43755769 #ifdef CONFIG_RT_MUTEXES
43765770
....@@ -4408,6 +5802,7 @@
44085802 struct rq_flags rf;
44095803 struct rq *rq;
44105804
5805
+ trace_android_rvh_rtmutex_prepare_setprio(p, pi_task);
44115806 /* XXX used to be waiter->prio, not waiter->task->prio */
44125807 prio = __rt_effective_prio(pi_task, p->normal_prio);
44135808
....@@ -4482,39 +5877,39 @@
44825877 if (!dl_prio(p->normal_prio) ||
44835878 (pi_task && dl_prio(pi_task->prio) &&
44845879 dl_entity_preempt(&pi_task->dl, &p->dl))) {
4485
- p->dl.dl_boosted = 1;
5880
+ p->dl.pi_se = pi_task->dl.pi_se;
44865881 queue_flag |= ENQUEUE_REPLENISH;
4487
- } else
4488
- p->dl.dl_boosted = 0;
4489
- p->sched_class = &dl_sched_class;
5882
+ } else {
5883
+ p->dl.pi_se = &p->dl;
5884
+ }
44905885 } else if (rt_prio(prio)) {
44915886 if (dl_prio(oldprio))
4492
- p->dl.dl_boosted = 0;
5887
+ p->dl.pi_se = &p->dl;
44935888 if (oldprio < prio)
44945889 queue_flag |= ENQUEUE_HEAD;
4495
- p->sched_class = &rt_sched_class;
44965890 } else {
44975891 if (dl_prio(oldprio))
4498
- p->dl.dl_boosted = 0;
5892
+ p->dl.pi_se = &p->dl;
44995893 if (rt_prio(oldprio))
45005894 p->rt.timeout = 0;
4501
- p->sched_class = &fair_sched_class;
45025895 }
45035896
4504
- p->prio = prio;
5897
+ __setscheduler_prio(p, prio);
45055898
45065899 if (queued)
45075900 enqueue_task(rq, p, queue_flag);
45085901 if (running)
4509
- set_curr_task(rq, p);
5902
+ set_next_task(rq, p);
45105903
45115904 check_class_changed(rq, p, prev_class, oldprio);
45125905 out_unlock:
45135906 /* Avoid rq from going away on us: */
45145907 preempt_disable();
4515
- __task_rq_unlock(rq, &rf);
45165908
4517
- balance_callback(rq);
5909
+ rq_unpin_lock(rq, &rf);
5910
+ __balance_callbacks(rq);
5911
+ raw_spin_unlock(&rq->lock);
5912
+
45185913 preempt_enable();
45195914 }
45205915 #else
....@@ -4526,12 +5921,13 @@
45265921
45275922 void set_user_nice(struct task_struct *p, long nice)
45285923 {
4529
- bool queued, running;
4530
- int old_prio, delta;
5924
+ bool queued, running, allowed = false;
5925
+ int old_prio;
45315926 struct rq_flags rf;
45325927 struct rq *rq;
45335928
4534
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
5929
+ trace_android_rvh_set_user_nice(p, &nice, &allowed);
5930
+ if ((task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) && !allowed)
45355931 return;
45365932 /*
45375933 * We have to be careful, if called from sys_setpriority(),
....@@ -4558,22 +5954,21 @@
45585954 put_prev_task(rq, p);
45595955
45605956 p->static_prio = NICE_TO_PRIO(nice);
4561
- set_load_weight(p, true);
5957
+ set_load_weight(p);
45625958 old_prio = p->prio;
45635959 p->prio = effective_prio(p);
4564
- delta = p->prio - old_prio;
45655960
4566
- if (queued) {
5961
+ if (queued)
45675962 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
4568
- /*
4569
- * If the task increased its priority or is running and
4570
- * lowered its priority, then reschedule its CPU:
4571
- */
4572
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
4573
- resched_curr(rq);
4574
- }
45755963 if (running)
4576
- set_curr_task(rq, p);
5964
+ set_next_task(rq, p);
5965
+
5966
+ /*
5967
+ * If the task increased its priority or is running and
5968
+ * lowered its priority, then reschedule its CPU:
5969
+ */
5970
+ p->sched_class->prio_changed(rq, p, old_prio);
5971
+
45775972 out_unlock:
45785973 task_rq_unlock(rq, p, &rf);
45795974 }
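/*
 * set_user_nice() above is what eventually runs when a task changes its own
 * nice value from user space.  A minimal example using the standard
 * setpriority()/getpriority() interface; raising the nice value (lowering
 * priority) needs no privilege, lowering it does:
 */
#include <sys/resource.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	if (setpriority(PRIO_PROCESS, 0, 5) == -1) {
		fprintf(stderr, "setpriority: %s\n", strerror(errno));
		return 1;
	}
	errno = 0;	/* getpriority() may legitimately return -1 */
	printf("nice is now %d\n", getpriority(PRIO_PROCESS, 0));
	return 0;
}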
....@@ -4658,7 +6053,7 @@
46586053 return 0;
46596054
46606055 #ifdef CONFIG_SMP
4661
- if (!llist_empty(&rq->wake_list))
6056
+ if (rq->ttwu_pending)
46626057 return 0;
46636058 #endif
46646059
....@@ -4681,6 +6076,7 @@
46816076
46826077 return 1;
46836078 }
6079
+EXPORT_SYMBOL_GPL(available_idle_cpu);
46846080
46856081 /**
46866082 * idle_task - return the idle task for a given CPU.
....@@ -4732,36 +6128,7 @@
47326128 */
47336129 p->rt_priority = attr->sched_priority;
47346130 p->normal_prio = normal_prio(p);
4735
- set_load_weight(p, true);
4736
-}
4737
-
4738
-/* Actually do priority change: must hold pi & rq lock. */
4739
-static void __setscheduler(struct rq *rq, struct task_struct *p,
4740
- const struct sched_attr *attr, bool keep_boost)
4741
-{
4742
- /*
4743
- * If params can't change scheduling class changes aren't allowed
4744
- * either.
4745
- */
4746
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
4747
- return;
4748
-
4749
- __setscheduler_params(p, attr);
4750
-
4751
- /*
4752
- * Keep a potential priority boosting if called from
4753
- * sched_setscheduler().
4754
- */
4755
- p->prio = normal_prio(p);
4756
- if (keep_boost)
4757
- p->prio = rt_effective_prio(p, p->prio);
4758
-
4759
- if (dl_prio(p->prio))
4760
- p->sched_class = &dl_sched_class;
4761
- else if (rt_prio(p->prio))
4762
- p->sched_class = &rt_sched_class;
4763
- else
4764
- p->sched_class = &fair_sched_class;
6131
+ set_load_weight(p);
47656132 }
47666133
47676134 /*
....@@ -4784,11 +6151,10 @@
47846151 const struct sched_attr *attr,
47856152 bool user, bool pi)
47866153 {
4787
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4788
- MAX_RT_PRIO - 1 - attr->sched_priority;
4789
- int retval, oldprio, oldpolicy = -1, queued, running;
4790
- int new_effective_prio, policy = attr->sched_policy;
6154
+ int oldpolicy = -1, policy = attr->sched_policy;
6155
+ int retval, oldprio, newprio, queued, running;
47916156 const struct sched_class *prev_class;
6157
+ struct callback_head *head;
47926158 struct rq_flags rf;
47936159 int reset_on_fork;
47946160 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
....@@ -4860,7 +6226,7 @@
48606226 * Treat SCHED_IDLE as nice 20. Only allow a switch to
48616227 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
48626228 */
4863
- if (idle_policy(p->policy) && !idle_policy(policy)) {
6229
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
48646230 if (!can_nice(p, task_nice(p)))
48656231 return -EPERM;
48666232 }
....@@ -4871,6 +6237,10 @@
48716237
48726238 /* Normal users shall not reset the sched_reset_on_fork flag: */
48736239 if (p->sched_reset_on_fork && !reset_on_fork)
6240
+ return -EPERM;
6241
+
6242
+ /* Can't change util-clamps */
6243
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
48746244 return -EPERM;
48756245 }
48766246
....@@ -4904,8 +6274,8 @@
49056275 * Changing the policy of the stop threads is a very bad idea:
49056275 */
49066276 if (p == rq->stop) {
4907
- task_rq_unlock(rq, p, &rf);
4908
- return -EINVAL;
6277
+ retval = -EINVAL;
6278
+ goto unlock;
49096279 }
49106280
49116281 /*
....@@ -4923,8 +6293,8 @@
49236293 goto change;
49246294
49256295 p->sched_reset_on_fork = reset_on_fork;
4926
- task_rq_unlock(rq, p, &rf);
4927
- return 0;
6296
+ retval = 0;
6297
+ goto unlock;
49286298 }
49296299 change:
49306300
....@@ -4937,8 +6307,8 @@
49376307 if (rt_bandwidth_enabled() && rt_policy(policy) &&
49386308 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
49396309 !task_group_is_autogroup(task_group(p))) {
4940
- task_rq_unlock(rq, p, &rf);
4941
- return -EPERM;
6310
+ retval = -EPERM;
6311
+ goto unlock;
49426312 }
49436313 #endif
49446314 #ifdef CONFIG_SMP
....@@ -4951,10 +6321,10 @@
49516321 * the entire root_domain to become SCHED_DEADLINE. We
49526322 * will also fail if there's no bandwidth available.
49536323 */
4954
- if (!cpumask_subset(span, &p->cpus_allowed) ||
6324
+ if (!cpumask_subset(span, p->cpus_ptr) ||
49556325 rq->rd->dl_bw.bw == 0) {
4956
- task_rq_unlock(rq, p, &rf);
4957
- return -EPERM;
6326
+ retval = -EPERM;
6327
+ goto unlock;
49586328 }
49596329 }
49606330 #endif
....@@ -4973,13 +6343,14 @@
49736343 * is available.
49746344 */
49756345 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
4976
- task_rq_unlock(rq, p, &rf);
4977
- return -EBUSY;
6346
+ retval = -EBUSY;
6347
+ goto unlock;
49786348 }
49796349
49806350 p->sched_reset_on_fork = reset_on_fork;
49816351 oldprio = p->prio;
49826352
6353
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
49836354 if (pi) {
49846355 /*
49856356 * Take priority boosted tasks into account. If the new
....@@ -4988,8 +6359,8 @@
49886359 * the runqueue. This will be done when the task deboost
49896360 * itself.
49906361 */
4991
- new_effective_prio = rt_effective_prio(p, newprio);
4992
- if (new_effective_prio == oldprio)
6362
+ newprio = rt_effective_prio(p, newprio);
6363
+ if (newprio == oldprio)
49936364 queue_flags &= ~DEQUEUE_MOVE;
49946365 }
49956366
....@@ -5002,7 +6373,11 @@
50026373
50036374 prev_class = p->sched_class;
50046375
5005
- __setscheduler(rq, p, attr, pi);
6376
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
6377
+ __setscheduler_params(p, attr);
6378
+ __setscheduler_prio(p, newprio);
6379
+ trace_android_rvh_setscheduler(p);
6380
+ }
50066381 __setscheduler_uclamp(p, attr);
50076382
50086383 if (queued) {
....@@ -5016,22 +6391,27 @@
50166391 enqueue_task(rq, p, queue_flags);
50176392 }
50186393 if (running)
5019
- set_curr_task(rq, p);
6394
+ set_next_task(rq, p);
50206395
50216396 check_class_changed(rq, p, prev_class, oldprio);
50226397
50236398 /* Avoid rq from going away on us: */
50246399 preempt_disable();
6400
+ head = splice_balance_callbacks(rq);
50256401 task_rq_unlock(rq, p, &rf);
50266402
50276403 if (pi)
50286404 rt_mutex_adjust_pi(p);
50296405
50306406 /* Run balance callbacks after we've adjusted the PI chain: */
5031
- balance_callback(rq);
6407
+ balance_callbacks(rq, head);
50326408 preempt_enable();
50336409
50346410 return 0;
6411
+
6412
+unlock:
6413
+ task_rq_unlock(rq, p, &rf);
6414
+ return retval;
50356415 }
50366416
50376417 static int _sched_setscheduler(struct task_struct *p, int policy,
....@@ -5043,6 +6423,14 @@
50436423 .sched_nice = PRIO_TO_NICE(p->static_prio),
50446424 };
50456425
6426
+ if (IS_ENABLED(CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO) &&
6427
+ ((policy == SCHED_FIFO) || (policy == SCHED_RR))) {
6428
+ attr.sched_priority /= 2;
6429
+ if (!check)
6430
+ attr.sched_priority += MAX_RT_PRIO / 2;
6431
+ if (!attr.sched_priority)
6432
+ attr.sched_priority = 1;
6433
+ }
50466434 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
50476435 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
50486436 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
....@@ -5057,6 +6445,8 @@
50576445 * @p: the task in question.
50586446 * @policy: new policy.
50596447 * @param: structure containing the new RT priority.
6448
+ *
6449
+ * Use sched_set_fifo(), read its comment.
50606450 *
50616451 * Return: 0 on success. An error code otherwise.
50626452 *
....@@ -5079,6 +6469,7 @@
50796469 {
50806470 return __sched_setscheduler(p, attr, false, true);
50816471 }
6472
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
50826473
50836474 /**
50846475 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
....@@ -5099,6 +6490,51 @@
50996490 return _sched_setscheduler(p, policy, param, false);
51006491 }
51016492 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
6493
+
6494
+/*
6495
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
6496
+ * incapable of resource management, which is the one thing an OS really should
6497
+ * be doing.
6498
+ *
6499
+ * This is of course the reason it is limited to privileged users only.
6500
+ *
6501
+ * Worse still; it is fundamentally impossible to compose static priority
6502
+ * workloads. You cannot take two correctly working static prio workloads
6503
+ * and smash them together and still expect them to work.
6504
+ *
6505
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
6506
+ *
6507
+ * MAX_RT_PRIO / 2
6508
+ *
6509
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
6510
+ * know enough information to make a sensible choice.
6511
+ */
6512
+void sched_set_fifo(struct task_struct *p)
6513
+{
6514
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
6515
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6516
+}
6517
+EXPORT_SYMBOL_GPL(sched_set_fifo);
6518
+
6519
+/*
6520
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
6521
+ */
6522
+void sched_set_fifo_low(struct task_struct *p)
6523
+{
6524
+ struct sched_param sp = { .sched_priority = 1 };
6525
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6526
+}
6527
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
6528
+
6529
+void sched_set_normal(struct task_struct *p, int nice)
6530
+{
6531
+ struct sched_attr attr = {
6532
+ .sched_policy = SCHED_NORMAL,
6533
+ .sched_nice = nice,
6534
+ };
6535
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
6536
+}
6537
+EXPORT_SYMBOL_GPL(sched_set_normal);
51026538
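/*
 * The user-space counterpart of the helpers above: sched_setscheduler(2)
 * with SCHED_FIFO.  Needs CAP_SYS_NICE (or an RLIMIT_RTPRIO allowance); the
 * priority 50 below is only an example value, roughly the "middle of the RT
 * range" convention the comment above describes for in-kernel users.
 */
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 50 };

	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
		fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
		return 1;
	}
	printf("running SCHED_FIFO at priority %d\n", sp.sched_priority);
	return 0;
}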
51036539 static int
51046540 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
....@@ -5130,9 +6566,6 @@
51306566 u32 size;
51316567 int ret;
51326568
5133
- if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
5134
- return -EFAULT;
5135
-
51366569 /* Zero the full structure, so that a short copy will be nice: */
51376570 memset(attr, 0, sizeof(*attr));
51386571
....@@ -5140,44 +6573,18 @@
51406573 if (ret)
51416574 return ret;
51426575
5143
- /* Bail out on silly large: */
5144
- if (size > PAGE_SIZE)
5145
- goto err_size;
5146
-
51476576 /* ABI compatibility quirk: */
51486577 if (!size)
51496578 size = SCHED_ATTR_SIZE_VER0;
5150
-
5151
- if (size < SCHED_ATTR_SIZE_VER0)
6579
+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
51526580 goto err_size;
51536581
5154
- /*
5155
- * If we're handed a bigger struct than we know of,
5156
- * ensure all the unknown bits are 0 - i.e. new
5157
- * user-space does not rely on any kernel feature
5158
- * extensions we dont know about yet.
5159
- */
5160
- if (size > sizeof(*attr)) {
5161
- unsigned char __user *addr;
5162
- unsigned char __user *end;
5163
- unsigned char val;
5164
-
5165
- addr = (void __user *)uattr + sizeof(*attr);
5166
- end = (void __user *)uattr + size;
5167
-
5168
- for (; addr < end; addr++) {
5169
- ret = get_user(val, addr);
5170
- if (ret)
5171
- return ret;
5172
- if (val)
5173
- goto err_size;
5174
- }
5175
- size = sizeof(*attr);
6582
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
6583
+ if (ret) {
6584
+ if (ret == -E2BIG)
6585
+ goto err_size;
6586
+ return ret;
51766587 }
5177
-
5178
- ret = copy_from_user(attr, uattr, size);
5179
- if (ret)
5180
- return -EFAULT;
51816588
51826589 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
51836590 size < SCHED_ATTR_SIZE_VER1)
....@@ -5194,6 +6601,16 @@
51946601 err_size:
51956602 put_user(sizeof(*attr), &uattr->size);
51966603 return -E2BIG;
6604
+}
6605
+
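/*
 * What sched_copy_attr() above parses: user space passes an extensible
 * struct whose first field is its own size, so old binaries and newer
 * kernels (and vice versa) can interoperate.  A sketch using the raw
 * syscall, since glibc has no wrapper; the struct layout below matches the
 * UAPI sched_attr up to SCHED_ATTR_SIZE_VER1, and SYS_sched_setattr is
 * assumed to be defined by the libc headers.  SCHED_NORMAL plus a positive
 * nice value needs no privilege.
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

struct sched_attr_example {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;	/* SCHED_ATTR_SIZE_VER1 fields */
	uint32_t sched_util_max;
};

int main(void)
{
	struct sched_attr_example attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = 0;		/* SCHED_NORMAL */
	attr.sched_nice = 5;

	if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1) {
		fprintf(stderr, "sched_setattr: %s\n", strerror(errno));
		return 1;
	}
	printf("policy set, nice %d\n", (int)attr.sched_nice);
	return 0;
}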
6606
+static void get_params(struct task_struct *p, struct sched_attr *attr)
6607
+{
6608
+ if (task_has_dl_policy(p))
6609
+ __getparam_dl(p, attr);
6610
+ else if (task_has_rt_policy(p))
6611
+ attr->sched_priority = p->rt_priority;
6612
+ else
6613
+ attr->sched_nice = task_nice(p);
51976614 }
51986615
51996616 /**
....@@ -5257,6 +6674,8 @@
52576674 rcu_read_unlock();
52586675
52596676 if (likely(p)) {
6677
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
6678
+ get_params(p, &attr);
52606679 retval = sched_setattr(p, &attr);
52616680 put_task_struct(p);
52626681 }
....@@ -5350,7 +6769,7 @@
53506769 {
53516770 unsigned int ksize = sizeof(*kattr);
53526771
5353
- if (!access_ok(VERIFY_WRITE, uattr, usize))
6772
+ if (!access_ok(uattr, usize))
53546773 return -EFAULT;
53556774
53566775 /*
....@@ -5378,7 +6797,7 @@
53786797 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
53796798 * @pid: the pid in question.
53806799 * @uattr: structure containing the extended parameters.
5381
- * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
6800
+ * @usize: sizeof(attr) for fwd/bwd comp.
53826801 * @flags: for future extension.
53836802 */
53846803 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
....@@ -5405,14 +6824,15 @@
54056824 kattr.sched_policy = p->policy;
54066825 if (p->sched_reset_on_fork)
54076826 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5408
- if (task_has_dl_policy(p))
5409
- __getparam_dl(p, &kattr);
5410
- else if (task_has_rt_policy(p))
5411
- kattr.sched_priority = p->rt_priority;
5412
- else
5413
- kattr.sched_nice = task_nice(p);
6827
+ get_params(p, &kattr);
6828
+ kattr.sched_flags &= SCHED_FLAG_ALL;
54146829
54156830 #ifdef CONFIG_UCLAMP_TASK
6831
+ /*
6832
+ * This could race with another potential updater, but this is fine
6833
+ * because it'll correctly read the old or the new value. We don't need
6834
+ * to guarantee who wins the race as long as it doesn't return garbage.
6835
+ */
54166836 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
54176837 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
54186838 #endif
....@@ -5431,6 +6851,7 @@
54316851 cpumask_var_t cpus_allowed, new_mask;
54326852 struct task_struct *p;
54336853 int retval;
6854
+ int skip = 0;
54346855
54356856 rcu_read_lock();
54366857
....@@ -5466,6 +6887,9 @@
54666887 rcu_read_unlock();
54676888 }
54686889
6890
+ trace_android_vh_sched_setaffinity_early(p, in_mask, &skip);
6891
+ if (skip)
6892
+ goto out_free_new_mask;
54696893 retval = security_task_setscheduler(p);
54706894 if (retval)
54716895 goto out_free_new_mask;
....@@ -5492,7 +6916,7 @@
54926916 }
54936917 #endif
54946918 again:
5495
- retval = __set_cpus_allowed_ptr(p, new_mask, true);
6919
+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
54966920
54976921 if (!retval) {
54986922 cpuset_cpus_allowed(p, cpus_allowed);
....@@ -5506,6 +6930,9 @@
55066930 goto again;
55076931 }
55086932 }
6933
+
6934
+ trace_android_rvh_sched_setaffinity(p, in_mask, &retval);
6935
+
55096936 out_free_new_mask:
55106937 free_cpumask_var(new_mask);
55116938 out_free_cpus_allowed:
....@@ -5514,7 +6941,6 @@
55146941 put_task_struct(p);
55156942 return retval;
55166943 }
5517
-EXPORT_SYMBOL_GPL(sched_setaffinity);
55186944
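/*
 * The user-space view of the kernel-side sched_setaffinity() above: pin the
 * calling thread to CPU 0 and read the mask back.  Uses the glibc wrappers
 * from <sched.h>, which need _GNU_SOURCE.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	if (sched_setaffinity(0, sizeof(set), &set) == -1) {
		fprintf(stderr, "sched_setaffinity: %s\n", strerror(errno));
		return 1;
	}

	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set) == 0)
		printf("now allowed on %d CPU(s)\n", CPU_COUNT(&set));
	return 0;
}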
55196945 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
55206946 struct cpumask *new_mask)
....@@ -5569,7 +6995,7 @@
55696995 goto out_unlock;
55706996
55716997 raw_spin_lock_irqsave(&p->pi_lock, flags);
5572
- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
6998
+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
55736999 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
55747000
55757001 out_unlock:
....@@ -5633,6 +7059,8 @@
56337059 schedstat_inc(rq->yld_count);
56347060 current->sched_class->yield_task(rq);
56357061
7062
+ trace_android_rvh_do_sched_yield(rq);
7063
+
56367064 preempt_disable();
56377065 rq_unlock_irq(rq, &rf);
56387066 sched_preempt_enable_no_resched();
....@@ -5646,7 +7074,7 @@
56467074 return 0;
56477075 }
56487076
5649
-#ifndef CONFIG_PREEMPT
7077
+#ifndef CONFIG_PREEMPTION
56507078 int __sched _cond_resched(void)
56517079 {
56527080 if (should_resched(0)) {
....@@ -5663,7 +7091,7 @@
56637091 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
56647092 * call schedule, and on return reacquire the lock.
56657093 *
5666
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
7094
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
56677095 * operations here to prevent schedule() from being called twice (once via
56687096 * spin_unlock(), once by hand).
56697097 */
....@@ -5767,7 +7195,7 @@
57677195 if (task_running(p_rq, p) || p->state)
57687196 goto out_unlock;
57697197
5770
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
7198
+ yielded = curr->sched_class->yield_to_task(rq, p);
57717199 if (yielded) {
57727200 schedstat_inc(rq->yld_count);
57737201 /*
....@@ -5933,7 +7361,7 @@
59337361 * an error code.
59347362 */
59357363 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5936
- struct timespec __user *, interval)
7364
+ struct __kernel_timespec __user *, interval)
59377365 {
59387366 struct timespec64 t;
59397367 int retval = sched_rr_get_interval(pid, &t);
....@@ -5944,16 +7372,15 @@
59447372 return retval;
59457373 }
59467374
5947
-#ifdef CONFIG_COMPAT
5948
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
5949
- compat_pid_t, pid,
5950
- struct compat_timespec __user *, interval)
7375
+#ifdef CONFIG_COMPAT_32BIT_TIME
7376
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
7377
+ struct old_timespec32 __user *, interval)
59517378 {
59527379 struct timespec64 t;
59537380 int retval = sched_rr_get_interval(pid, &t);
59547381
59557382 if (retval == 0)
5956
- retval = compat_put_timespec64(&t, interval);
7383
+ retval = put_old_timespec32(&t, interval);
59577384 return retval;
59587385 }
59597386 #endif
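/*
 * User-space consumer of the syscall above: query the round-robin timeslice
 * of the calling thread via the POSIX sched_rr_get_interval() wrapper from
 * <sched.h>.  The reported interval is 0 for SCHED_FIFO tasks, so treat the
 * value as informational.
 */
#include <sched.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == -1) {
		fprintf(stderr, "sched_rr_get_interval: %s\n", strerror(errno));
		return 1;
	}
	printf("round-robin interval: %ld.%09ld s\n",
	       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}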
....@@ -5966,10 +7393,10 @@
59667393 if (!try_get_task_stack(p))
59677394 return;
59687395
5969
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
7396
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
59707397
59717398 if (p->state == TASK_RUNNING)
5972
- printk(KERN_CONT " running task ");
7399
+ pr_cont(" running task ");
59737400 #ifdef CONFIG_DEBUG_STACK_USAGE
59747401 free = stack_not_used(p);
59757402 #endif
....@@ -5978,12 +7405,13 @@
59787405 if (pid_alive(p))
59797406 ppid = task_pid_nr(rcu_dereference(p->real_parent));
59807407 rcu_read_unlock();
5981
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5982
- task_pid_nr(p), ppid,
7408
+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
7409
+ free, task_pid_nr(p), ppid,
59837410 (unsigned long)task_thread_info(p)->flags);
59847411
59857412 print_worker_info(KERN_INFO, p);
5986
- show_stack(p, NULL);
7413
+ trace_android_vh_sched_show_task(p);
7414
+ show_stack(p, NULL, KERN_INFO);
59877415 put_task_stack(p);
59887416 }
59897417 EXPORT_SYMBOL_GPL(sched_show_task);
....@@ -6014,13 +7442,6 @@
60147442 {
60157443 struct task_struct *g, *p;
60167444
6017
-#if BITS_PER_LONG == 32
6018
- printk(KERN_INFO
6019
- " task PC stack pid father\n");
6020
-#else
6021
- printk(KERN_INFO
6022
- " task PC stack pid father\n");
6023
-#endif
60247445 rcu_read_lock();
60257446 for_each_process_thread(g, p) {
60267447 /*
....@@ -6056,7 +7477,7 @@
60567477 * NOTE: this function does not set the idle thread's NEED_RESCHED
60577478 * flag, to make booting more robust.
60587479 */
6059
-void init_idle(struct task_struct *idle, int cpu)
7480
+void __init init_idle(struct task_struct *idle, int cpu)
60607481 {
60617482 struct rq *rq = cpu_rq(cpu);
60627483 unsigned long flags;
....@@ -6070,9 +7491,6 @@
60707491 idle->se.exec_start = sched_clock();
60717492 idle->flags |= PF_IDLE;
60727493
6073
- scs_task_reset(idle);
6074
- kasan_unpoison_task_stack(idle);
6075
-
60767494 #ifdef CONFIG_SMP
60777495 /*
60777495 * It's possible that init_idle() gets called multiple times on a task,
....@@ -6080,7 +7498,7 @@
60807498 *
60817499 * And since this is boot we can forgo the serialization.
60827500 */
6083
- set_cpus_allowed_common(idle, cpumask_of(cpu));
7501
+ set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
60847502 #endif
60857503 /*
60867504 * We're having a chicken and egg problem, even though we are
....@@ -6096,7 +7514,8 @@
60967514 __set_task_cpu(idle, cpu);
60977515 rcu_read_unlock();
60987516
6099
- rq->curr = rq->idle = idle;
7517
+ rq->idle = idle;
7518
+ rcu_assign_pointer(rq->curr, idle);
61007519 idle->on_rq = TASK_ON_RQ_QUEUED;
61017520 #ifdef CONFIG_SMP
61027521 idle->on_cpu = 1;
....@@ -6106,7 +7525,9 @@
61067525
61077526 /* Set the preempt count _outside_ the spinlocks! */
61087527 init_idle_preempt_count(idle, cpu);
6109
-
7528
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
7529
+ task_thread_info(idle)->preempt_lazy_count = 0;
7530
+#endif
61107531 /*
61117532 * The idle tasks have their own, simple scheduling class:
61127533 */
....@@ -6134,7 +7555,7 @@
61347555 }
61357556
61367557 int task_can_attach(struct task_struct *p,
6137
- const struct cpumask *cs_cpus_allowed)
7558
+ const struct cpumask *cs_effective_cpus)
61387559 {
61397560 int ret = 0;
61407561
....@@ -6145,7 +7566,7 @@
61457566 * allowed nodes is unnecessary. Thus, cpusets are not
61467567 * applicable for such threads. This prevents checking for
61477568 * success of set_cpus_allowed_ptr() on all attached tasks
6148
- * before cpus_allowed may be changed.
7569
+ * before cpus_mask may be changed.
61497570 */
61507571 if (p->flags & PF_NO_SETAFFINITY) {
61517572 ret = -EINVAL;
....@@ -6153,8 +7574,13 @@
61537574 }
61547575
61557576 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
6156
- cs_cpus_allowed))
6157
- ret = dl_task_can_attach(p, cs_cpus_allowed);
7577
+ cs_effective_cpus)) {
7578
+ int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
7579
+
7580
+ if (unlikely(cpu >= nr_cpu_ids))
7581
+ return -EINVAL;
7582
+ ret = dl_cpu_busy(cpu, p);
7583
+ }
61587584
61597585 out:
61607586 return ret;
....@@ -6172,7 +7598,7 @@
61727598 if (curr_cpu == target_cpu)
61737599 return 0;
61747600
6175
- if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
7601
+ if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
61767602 return -EINVAL;
61777603
61787604 /* TODO: This is not properly updating schedstats */
....@@ -6205,12 +7631,13 @@
62057631 if (queued)
62067632 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
62077633 if (running)
6208
- set_curr_task(rq, p);
7634
+ set_next_task(rq, p);
62097635 task_rq_unlock(rq, p, &rf);
62107636 }
62117637 #endif /* CONFIG_NUMA_BALANCING */
62127638
62137639 #ifdef CONFIG_HOTPLUG_CPU
7640
+
62147641 /*
62157642 * Ensure that the idle task is using init_mm right before its CPU goes
62167643 * offline.
....@@ -6230,123 +7657,163 @@
62307657 /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
62317658 }
62327659
6233
-/*
6234
- * Since this CPU is going 'away' for a while, fold any nr_active delta
6235
- * we might have. Assumes we're called after migrate_tasks() so that the
6236
- * nr_active count is stable. We need to take the teardown thread which
6237
- * is calling this into account, so we hand in adjust = 1 to the load
6238
- * calculation.
6239
- *
6240
- * Also see the comment "Global load-average calculations".
6241
- */
6242
-static void calc_load_migrate(struct rq *rq)
7660
+static int __balance_push_cpu_stop(void *arg)
62437661 {
6244
- long delta = calc_load_fold_active(rq, 1);
6245
- if (delta)
6246
- atomic_long_add(delta, &calc_load_tasks);
6247
-}
7662
+ struct task_struct *p = arg;
7663
+ struct rq *rq = this_rq();
7664
+ struct rq_flags rf;
7665
+ int cpu;
62487666
6249
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
6250
-{
6251
-}
7667
+ raw_spin_lock_irq(&p->pi_lock);
7668
+ rq_lock(rq, &rf);
62527669
6253
-static const struct sched_class fake_sched_class = {
6254
- .put_prev_task = put_prev_task_fake,
6255
-};
6256
-
6257
-static struct task_struct fake_task = {
6258
- /*
6259
- * Avoid pull_{rt,dl}_task()
6260
- */
6261
- .prio = MAX_PRIO + 1,
6262
- .sched_class = &fake_sched_class,
6263
-};
6264
-
6265
-/*
6266
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
6267
- * try_to_wake_up()->select_task_rq().
6268
- *
6269
- * Called with rq->lock held even though we'er in stop_machine() and
6270
- * there's no concurrency possible, we hold the required locks anyway
6271
- * because of lock validation efforts.
6272
- */
6273
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
6274
-{
6275
- struct rq *rq = dead_rq;
6276
- struct task_struct *next, *stop = rq->stop;
6277
- struct rq_flags orf = *rf;
6278
- int dest_cpu;
6279
-
6280
- /*
6281
- * Fudge the rq selection such that the below task selection loop
6282
- * doesn't get stuck on the currently eligible stop task.
6283
- *
6284
- * We're currently inside stop_machine() and the rq is either stuck
6285
- * in the stop_machine_cpu_stop() loop, or we're executing this code,
6286
- * either way we should never end up calling schedule() until we're
6287
- * done here.
6288
- */
6289
- rq->stop = NULL;
6290
-
6291
- /*
6292
- * put_prev_task() and pick_next_task() sched
6293
- * class method both need to have an up-to-date
6294
- * value of rq->clock[_task]
6295
- */
62967670 update_rq_clock(rq);
62977671
6298
- for (;;) {
6299
- /*
6300
- * There's this thread running, bail when that's the only
6301
- * remaining thread:
6302
- */
6303
- if (rq->nr_running == 1)
6304
- break;
6305
-
6306
- /*
6307
- * pick_next_task() assumes pinned rq->lock:
6308
- */
6309
- next = pick_next_task(rq, &fake_task, rf);
6310
- BUG_ON(!next);
6311
- put_prev_task(rq, next);
6312
-
6313
- /*
6314
- * Rules for changing task_struct::cpus_allowed are holding
6315
- * both pi_lock and rq->lock, such that holding either
6316
- * stabilizes the mask.
6317
- *
6318
- * Drop rq->lock is not quite as disastrous as it usually is
6319
- * because !cpu_active at this point, which means load-balance
6320
- * will not interfere. Also, stop-machine.
6321
- */
6322
- rq_unlock(rq, rf);
6323
- raw_spin_lock(&next->pi_lock);
6324
- rq_relock(rq, rf);
6325
-
6326
- /*
6327
- * Since we're inside stop-machine, _nothing_ should have
6328
- * changed the task, WARN if weird stuff happened, because in
6329
- * that case the above rq->lock drop is a fail too.
6330
- */
6331
- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
6332
- raw_spin_unlock(&next->pi_lock);
6333
- continue;
6334
- }
6335
-
6336
- /* Find suitable destination for @next, with force if needed. */
6337
- dest_cpu = select_fallback_rq(dead_rq->cpu, next);
6338
- rq = __migrate_task(rq, rf, next, dest_cpu);
6339
- if (rq != dead_rq) {
6340
- rq_unlock(rq, rf);
6341
- rq = dead_rq;
6342
- *rf = orf;
6343
- rq_relock(rq, rf);
6344
- }
6345
- raw_spin_unlock(&next->pi_lock);
7672
+ if (task_rq(p) == rq && task_on_rq_queued(p)) {
7673
+ cpu = select_fallback_rq(rq->cpu, p);
7674
+ rq = __migrate_task(rq, &rf, p, cpu);
63467675 }
63477676
6348
- rq->stop = stop;
7677
+ rq_unlock(rq, &rf);
7678
+ raw_spin_unlock_irq(&p->pi_lock);
7679
+
7680
+ put_task_struct(p);
7681
+
7682
+ return 0;
63497683 }
7684
+
7685
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
7686
+
7687
+/*
7688
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
7689
+ */
7690
+
7691
+
7692
+static void balance_push(struct rq *rq)
7693
+{
7694
+ struct task_struct *push_task = rq->curr;
7695
+
7696
+ lockdep_assert_held(&rq->lock);
7697
+ SCHED_WARN_ON(rq->cpu != smp_processor_id());
7698
+
7699
+ /*
7700
+ * Both the cpu-hotplug and stop task are in this case and are
7701
+ * required to complete the hotplug process.
7702
+ */
7703
+ if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
7704
+ /*
7705
+ * If this is the idle task on the outgoing CPU try to wake
7706
+ * up the hotplug control thread which might wait for the
7707
+ * last task to vanish. The rcuwait_active() check is
7708
+ * accurate here because the waiter is pinned on this CPU
7709
+ * and can't obviously be running in parallel.
7710
+ *
7711
+ * On RT kernels this also has to check whether there are
7712
+ * pinned and scheduled out tasks on the runqueue. They
7713
+ * need to leave the migrate disabled section first.
7714
+ */
7715
+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
7716
+ rcuwait_active(&rq->hotplug_wait)) {
7717
+ raw_spin_unlock(&rq->lock);
7718
+ rcuwait_wake_up(&rq->hotplug_wait);
7719
+ raw_spin_lock(&rq->lock);
7720
+ }
7721
+ return;
7722
+ }
7723
+
7724
+ get_task_struct(push_task);
7725
+ /*
7726
+ * Temporarily drop rq->lock such that we can wake-up the stop task.
7727
+ * Both preemption and IRQs are still disabled.
7728
+ */
7729
+ raw_spin_unlock(&rq->lock);
7730
+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
7731
+ this_cpu_ptr(&push_work));
7732
+ /*
7733
+ * At this point need_resched() is true and we'll take the loop in
7734
+ * schedule(). The next pick is obviously going to be the stop task
7735
+ * which is_per_cpu_kthread() and will push this task away.
7736
+ */
7737
+ raw_spin_lock(&rq->lock);
7738
+}
7739
+
7740
+static void balance_push_set(int cpu, bool on)
7741
+{
7742
+ struct rq *rq = cpu_rq(cpu);
7743
+ struct rq_flags rf;
7744
+
7745
+ rq_lock_irqsave(rq, &rf);
7746
+ if (on)
7747
+ rq->balance_flags |= BALANCE_PUSH;
7748
+ else
7749
+ rq->balance_flags &= ~BALANCE_PUSH;
7750
+ rq_unlock_irqrestore(rq, &rf);
7751
+}
7752
+
7753
+/*
7754
+ * Invoked from a CPUs hotplug control thread after the CPU has been marked
7755
+ * inactive. All tasks which are not per CPU kernel threads are either
7756
+ * pushed off this CPU now via balance_push() or placed on a different CPU
7757
+ * during wakeup. Wait until the CPU is quiescent.
7758
+ */
7759
+static void balance_hotplug_wait(void)
7760
+{
7761
+ struct rq *rq = this_rq();
7762
+
7763
+ rcuwait_wait_event(&rq->hotplug_wait,
7764
+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
7765
+ TASK_UNINTERRUPTIBLE);
7766
+}
7767
+
7768
+static int drain_rq_cpu_stop(void *data)
7769
+{
7770
+#ifndef CONFIG_PREEMPT_RT
7771
+ struct rq *rq = this_rq();
7772
+ struct rq_flags rf;
7773
+
7774
+ rq_lock_irqsave(rq, &rf);
7775
+ migrate_tasks(rq, &rf, false);
7776
+ rq_unlock_irqrestore(rq, &rf);
7777
+#endif
7778
+ return 0;
7779
+}
7780
+
7781
+int sched_cpu_drain_rq(unsigned int cpu)
7782
+{
7783
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7784
+ struct cpu_stop_done *rq_drain_done = &(cpu_rq(cpu)->drain_done);
7785
+
7786
+ if (idle_cpu(cpu)) {
7787
+ rq_drain->done = NULL;
7788
+ return 0;
7789
+ }
7790
+
7791
+ return stop_one_cpu_async(cpu, drain_rq_cpu_stop, NULL, rq_drain,
7792
+ rq_drain_done);
7793
+}
7794
+
7795
+void sched_cpu_drain_rq_wait(unsigned int cpu)
7796
+{
7797
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7798
+
7799
+ if (rq_drain->done)
7800
+ cpu_stop_work_wait(rq_drain);
7801
+}
7802
+
7803
+#else
7804
+
7805
+static inline void balance_push(struct rq *rq)
7806
+{
7807
+}
7808
+
7809
+static inline void balance_push_set(int cpu, bool on)
7810
+{
7811
+}
7812
+
7813
+static inline void balance_hotplug_wait(void)
7814
+{
7815
+}
7816
+
63507817 #endif /* CONFIG_HOTPLUG_CPU */
63517818
63527819 void set_rq_online(struct rq *rq)
....@@ -6417,8 +7884,10 @@
64177884 static int cpuset_cpu_inactive(unsigned int cpu)
64187885 {
64197886 if (!cpuhp_tasks_frozen) {
6420
- if (dl_cpu_busy(cpu))
6421
- return -EBUSY;
7887
+ int ret = dl_cpu_busy(cpu, NULL);
7888
+
7889
+ if (ret)
7890
+ return ret;
64227891 cpuset_update_active_cpus();
64237892 } else {
64247893 num_cpus_frozen++;
....@@ -6431,6 +7900,8 @@
64317900 {
64327901 struct rq *rq = cpu_rq(cpu);
64337902 struct rq_flags rf;
7903
+
7904
+ balance_push_set(cpu, false);
64347905
64357906 #ifdef CONFIG_SCHED_SMT
64367907 /*
....@@ -6467,19 +7938,39 @@
64677938 return 0;
64687939 }
64697940
6470
-int sched_cpu_deactivate(unsigned int cpu)
7941
+int sched_cpus_activate(struct cpumask *cpus)
64717942 {
7943
+ unsigned int cpu;
7944
+
7945
+ for_each_cpu(cpu, cpus) {
7946
+ if (sched_cpu_activate(cpu)) {
7947
+ for_each_cpu_and(cpu, cpus, cpu_active_mask)
7948
+ sched_cpu_deactivate(cpu);
7949
+
7950
+ return -EBUSY;
7951
+ }
7952
+ }
7953
+
7954
+ return 0;
7955
+}
7956
+
7957
+int _sched_cpu_deactivate(unsigned int cpu)
7958
+{
7959
+ struct rq *rq = cpu_rq(cpu);
7960
+ struct rq_flags rf;
64727961 int ret;
64737962
64747963 set_cpu_active(cpu, false);
6475
- /*
6476
- * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
6477
- * users of this state to go away such that all new such users will
6478
- * observe it.
6479
- *
6480
- * Do sync before park smpboot threads to take care the rcu boost case.
6481
- */
6482
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
7964
+
7965
+ balance_push_set(cpu, true);
7966
+
7967
+ rq_lock_irqsave(rq, &rf);
7968
+ if (rq->rd) {
7969
+ update_rq_clock(rq);
7970
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7971
+ set_rq_offline(rq);
7972
+ }
7973
+ rq_unlock_irqrestore(rq, &rf);
64837974
64847975 #ifdef CONFIG_SCHED_SMT
64857976 /*
....@@ -6494,10 +7985,51 @@
64947985
64957986 ret = cpuset_cpu_inactive(cpu);
64967987 if (ret) {
7988
+ balance_push_set(cpu, false);
64977989 set_cpu_active(cpu, true);
64987990 return ret;
64997991 }
65007992 sched_domains_numa_masks_clear(cpu);
7993
+
7994
+ update_max_interval();
7995
+
7996
+ return 0;
7997
+}
7998
+
7999
+int sched_cpu_deactivate(unsigned int cpu)
8000
+{
8001
+ int ret = _sched_cpu_deactivate(cpu);
8002
+
8003
+ if (ret)
8004
+ return ret;
8005
+
8006
+ /*
8007
+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
8008
+ * users of this state to go away such that all new such users will
8009
+ * observe it.
8010
+ *
8011
+ * Do the sync before parking smpboot threads to take care of the RCU boost case.
8012
+ */
8013
+ synchronize_rcu();
8014
+
8015
+ return 0;
8016
+}
8017
+
8018
+int sched_cpus_deactivate_nosync(struct cpumask *cpus)
8019
+{
8020
+ unsigned int cpu;
8021
+
8022
+ for_each_cpu(cpu, cpus) {
8023
+ if (_sched_cpu_deactivate(cpu)) {
8024
+ for_each_cpu(cpu, cpus) {
8025
+ if (!cpu_active(cpu))
8026
+ sched_cpu_activate(cpu);
8027
+ }
8028
+
8029
+ return -EBUSY;
8030
+ }
8031
+ }
8032
+
65018033 return 0;
65028034 }
65038035
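
The _nosync variant deliberately leaves out the synchronize_rcu() that sched_cpu_deactivate() issues after clearing a CPU from cpu_active_mask, so a caller taking down a whole mask is expected to pay for one combined grace period itself. An illustrative pairing (caller name invented), mirroring the single-CPU path above:

static int take_down_cpus(struct cpumask *cpus)
{
	int ret = sched_cpus_deactivate_nosync(cpus);

	/* One grace period covers every CPU in the mask. */
	if (!ret)
		synchronize_rcu();

	return ret;
}
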
....@@ -6506,37 +8038,67 @@
65068038 struct rq *rq = cpu_rq(cpu);
65078039
65088040 rq->calc_load_update = calc_load_update;
6509
- update_max_interval();
65108041 }
65118042
65128043 int sched_cpu_starting(unsigned int cpu)
65138044 {
65148045 sched_rq_cpu_starting(cpu);
65158046 sched_tick_start(cpu);
8047
+ trace_android_rvh_sched_cpu_starting(cpu);
65168048 return 0;
65178049 }
65188050
65198051 #ifdef CONFIG_HOTPLUG_CPU
8052
+
8053
+/*
8054
+ * Invoked immediately before the stopper thread is invoked to bring the
8055
+ * CPU down completely. At this point all per CPU kthreads except the
8056
+ * hotplug thread (current) and the stopper thread (inactive) have been
8057
+ * either parked or have been unbound from the outgoing CPU. Ensure that
8058
+ * any of those which might be on the way out are gone.
8059
+ *
8060
+ * If after this point a bound task is being woken on this CPU then the
8061
+ * responsible hotplug callback has failed to do it's job.
8062
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
8063
+ */
8064
+int sched_cpu_wait_empty(unsigned int cpu)
8065
+{
8066
+ balance_hotplug_wait();
8067
+ return 0;
8068
+}
8069
+
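
sched_cpu_wait_empty() is a thin hotplug-state wrapper around balance_hotplug_wait(). Mainline wires it up as the teardown callback of a dedicated cpuhp state so it runs on the outgoing CPU after deactivation; the entry below is how upstream kernel/cpu.c does it and is shown only for orientation, this tree may differ:

	[CPUHP_AP_SCHED_WAIT_EMPTY] = {
		.name			= "sched:waitempty",
		.startup.single		= NULL,
		.teardown.single	= sched_cpu_wait_empty,
	},
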
8070
+/*
8071
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
8072
+ * might have. Called from the CPU stopper task after ensuring that the
8073
+ * stopper is the last running task on the CPU, so nr_active count is
8074
+ * stable. We need to take the teardown thread which is calling this into
8075
+ * account, so we hand in adjust = 1 to the load calculation.
8076
+ *
8077
+ * Also see the comment "Global load-average calculations".
8078
+ */
8079
+static void calc_load_migrate(struct rq *rq)
8080
+{
8081
+ long delta = calc_load_fold_active(rq, 1);
8082
+
8083
+ if (delta)
8084
+ atomic_long_add(delta, &calc_load_tasks);
8085
+}
8086
+
65208087 int sched_cpu_dying(unsigned int cpu)
65218088 {
65228089 struct rq *rq = cpu_rq(cpu);
65238090 struct rq_flags rf;
65248091
65258092 /* Handle pending wakeups and then migrate everything off */
6526
- sched_ttwu_pending();
65278093 sched_tick_stop(cpu);
65288094
65298095 rq_lock_irqsave(rq, &rf);
6530
- if (rq->rd) {
6531
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6532
- set_rq_offline(rq);
6533
- }
6534
- migrate_tasks(rq, &rf);
6535
- BUG_ON(rq->nr_running != 1);
8096
+ BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
65368097 rq_unlock_irqrestore(rq, &rf);
65378098
8099
+ trace_android_rvh_sched_cpu_dying(cpu);
8100
+
65388101 calc_load_migrate(rq);
6539
- update_max_interval();
65408102 nohz_balance_exit_idle(rq);
65418103 hrtick_clear(rq);
65428104 return 0;
....@@ -6550,18 +8112,16 @@
65508112 /*
65518113 * There's no userspace yet to cause hotplug operations; hence all the
65528114 * CPU masks are stable and all blatant races in the below code cannot
6553
- * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
6554
- * but there won't be any contention on it.
8115
+ * happen.
65558116 */
6556
- cpus_read_lock();
65578117 mutex_lock(&sched_domains_mutex);
65588118 sched_init_domains(cpu_active_mask);
65598119 mutex_unlock(&sched_domains_mutex);
6560
- cpus_read_unlock();
65618120
65628121 /* Move init over to a non-isolated CPU */
65638122 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
65648123 BUG();
8124
+
65658125 sched_init_granularity();
65668126
65678127 init_sched_rt_class();
....@@ -6572,7 +8132,7 @@
65728132
65738133 static int __init migration_init(void)
65748134 {
6575
- sched_rq_cpu_starting(smp_processor_id());
8135
+ sched_cpu_starting(smp_processor_id());
65768136 return 0;
65778137 }
65788138 early_initcall(migration_init);
....@@ -6597,7 +8157,9 @@
65978157 * Every task in system belongs to this group at bootup.
65988158 */
65998159 struct task_group root_task_group;
8160
+EXPORT_SYMBOL_GPL(root_task_group);
66008161 LIST_HEAD(task_groups);
8162
+EXPORT_SYMBOL_GPL(task_groups);
66018163
66028164 /* Cacheline aligned slab cache for task_group */
66038165 static struct kmem_cache *task_group_cache __read_mostly;
....@@ -6608,19 +8170,27 @@
66088170
66098171 void __init sched_init(void)
66108172 {
6611
- int i, j;
6612
- unsigned long alloc_size = 0, ptr;
8173
+ unsigned long ptr = 0;
8174
+ int i;
8175
+
8176
+ /* Make sure the linker didn't screw up */
8177
+ BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
8178
+ &fair_sched_class + 1 != &rt_sched_class ||
8179
+ &rt_sched_class + 1 != &dl_sched_class);
8180
+#ifdef CONFIG_SMP
8181
+ BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
8182
+#endif
66138183
66148184 wait_bit_init();
66158185
66168186 #ifdef CONFIG_FAIR_GROUP_SCHED
6617
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8187
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
66188188 #endif
66198189 #ifdef CONFIG_RT_GROUP_SCHED
6620
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8190
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
66218191 #endif
6622
- if (alloc_size) {
6623
- ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8192
+ if (ptr) {
8193
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
66248194
66258195 #ifdef CONFIG_FAIR_GROUP_SCHED
66268196 root_task_group.se = (struct sched_entity **)ptr;
....@@ -6629,6 +8199,8 @@
66298199 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
66308200 ptr += nr_cpu_ids * sizeof(void **);
66318201
8202
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8203
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
66328204 #endif /* CONFIG_FAIR_GROUP_SCHED */
66338205 #ifdef CONFIG_RT_GROUP_SCHED
66348206 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
....@@ -6681,7 +8253,6 @@
66818253 init_rt_rq(&rq->rt);
66828254 init_dl_rq(&rq->dl);
66838255 #ifdef CONFIG_FAIR_GROUP_SCHED
6684
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
66858256 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
66868257 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
66878258 /*
....@@ -6703,7 +8274,6 @@
67038274 * We achieve this by letting root_task_group's tasks sit
67048275 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
67058276 */
6706
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
67078277 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
67088278 #endif /* CONFIG_FAIR_GROUP_SCHED */
67098279
....@@ -6711,10 +8281,6 @@
67118281 #ifdef CONFIG_RT_GROUP_SCHED
67128282 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
67138283 #endif
6714
-
6715
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6716
- rq->cpu_load[j] = 0;
6717
-
67188284 #ifdef CONFIG_SMP
67198285 rq->sd = NULL;
67208286 rq->rd = NULL;
....@@ -6733,16 +8299,20 @@
67338299
67348300 rq_attach_root(rq, &def_root_domain);
67358301 #ifdef CONFIG_NO_HZ_COMMON
6736
- rq->last_load_update_tick = jiffies;
67378302 rq->last_blocked_load_update_tick = jiffies;
67388303 atomic_set(&rq->nohz_flags, 0);
8304
+
8305
+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
8306
+#endif
8307
+#ifdef CONFIG_HOTPLUG_CPU
8308
+ rcuwait_init(&rq->hotplug_wait);
67398309 #endif
67408310 #endif /* CONFIG_SMP */
67418311 hrtick_rq_init(rq);
67428312 atomic_set(&rq->nr_iowait, 0);
67438313 }
67448314
6745
- set_load_weight(&init_task, false);
8315
+ set_load_weight(&init_task);
67468316
67478317 /*
67488318 * The boot idle thread does lazy MMU switching as well:
....@@ -6777,7 +8347,7 @@
67778347 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
67788348 static inline int preempt_count_equals(int preempt_offset)
67798349 {
6780
- int nested = preempt_count() + rcu_preempt_depth();
8350
+ int nested = preempt_count() + sched_rcu_preempt_depth();
67818351
67828352 return (nested == preempt_offset);
67838353 }
....@@ -6811,7 +8381,7 @@
68118381 rcu_sleep_check();
68128382
68138383 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6814
- !is_idle_task(current)) ||
8384
+ !is_idle_task(current) && !current->non_block_count) ||
68158385 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
68168386 oops_in_progress)
68178387 return;
....@@ -6827,8 +8397,8 @@
68278397 "BUG: sleeping function called from invalid context at %s:%d\n",
68288398 file, line);
68298399 printk(KERN_ERR
6830
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6831
- in_atomic(), irqs_disabled(),
8400
+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
8401
+ in_atomic(), irqs_disabled(), current->non_block_count,
68328402 current->pid, current->comm);
68338403
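
The non_block_count check above makes ___might_sleep() complain inside regions that merely declare themselves non-blocking, even though preemption and interrupts stay enabled. An illustrative caller, assuming the non_block_start()/non_block_end() helpers that increment and decrement current->non_block_count (the ctx type and worker function are invented):

static void run_nonblocking_section(struct walk_ctx *ctx)
{
	non_block_start();
	/* Any might_sleep() reached from here now produces the splat above. */
	do_strictly_nonblocking_walk(ctx);
	non_block_end();
}
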
68348404 if (task_stack_end_corrupted(current))
....@@ -6840,13 +8410,76 @@
68408410 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
68418411 && !preempt_count_equals(preempt_offset)) {
68428412 pr_err("Preemption disabled at:");
6843
- print_ip_sym(preempt_disable_ip);
6844
- pr_cont("\n");
8413
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
68458414 }
8415
+
8416
+ trace_android_rvh_schedule_bug(NULL);
8417
+
68468418 dump_stack();
68478419 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
68488420 }
68498421 EXPORT_SYMBOL(___might_sleep);
8422
+
8423
+void __cant_sleep(const char *file, int line, int preempt_offset)
8424
+{
8425
+ static unsigned long prev_jiffy;
8426
+
8427
+ if (irqs_disabled())
8428
+ return;
8429
+
8430
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
8431
+ return;
8432
+
8433
+ if (preempt_count() > preempt_offset)
8434
+ return;
8435
+
8436
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8437
+ return;
8438
+ prev_jiffy = jiffies;
8439
+
8440
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
8441
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8442
+ in_atomic(), irqs_disabled(),
8443
+ current->pid, current->comm);
8444
+
8445
+ debug_show_held_locks(current);
8446
+ dump_stack();
8447
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8448
+}
8449
+EXPORT_SYMBOL_GPL(__cant_sleep);
8450
+
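
__cant_sleep() is the inverse assertion to might_sleep(): it complains when a path that is supposed to already be atomic is reached with preemption and interrupts enabled. An illustrative caller, assuming the cant_sleep() wrapper macro that passes __FILE__, __LINE__ and a zero offset (the snapshot type is invented):

static void collect_cpu_snapshot(void *info)
{
	struct cpu_snapshot *snap = info;

	cant_sleep();	/* e.g. this is only ever run from IPI context */
	snap->ts_ns = ktime_get_mono_fast_ns();
	/* ... fill in the rest without blocking ... */
}
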
8451
+#ifdef CONFIG_SMP
8452
+void __cant_migrate(const char *file, int line)
8453
+{
8454
+ static unsigned long prev_jiffy;
8455
+
8456
+ if (irqs_disabled())
8457
+ return;
8458
+
8459
+ if (is_migration_disabled(current))
8460
+ return;
8461
+
8462
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
8463
+ return;
8464
+
8465
+ if (preempt_count() > 0)
8466
+ return;
8467
+
8468
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8469
+ return;
8470
+ prev_jiffy = jiffies;
8471
+
8472
+ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
8473
+ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
8474
+ in_atomic(), irqs_disabled(), is_migration_disabled(current),
8475
+ current->pid, current->comm);
8476
+
8477
+ debug_show_held_locks(current);
8478
+ dump_stack();
8479
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8480
+}
8481
+EXPORT_SYMBOL_GPL(__cant_migrate);
8482
+#endif
68508483 #endif
68518484
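
__cant_migrate() accepts either a truly atomic context or an explicit migrate_disable() section, which is the weaker guarantee PREEMPT_RT code relies on when touching per-CPU data. An illustrative caller, assuming a cant_migrate() wrapper that expands to __cant_migrate(__FILE__, __LINE__) on SMP debug builds (the per-CPU stat type is invented):

static u64 read_local_window(struct window_stat __percpu *stats)
{
	u64 val;

	migrate_disable();
	cant_migrate();	/* documents and checks the stay-on-this-CPU requirement */
	val = this_cpu_ptr(stats)->last_sample;
	migrate_enable();

	return val;
}
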
68528485 #ifdef CONFIG_MAGIC_SYSRQ
....@@ -6915,7 +8548,7 @@
69158548
69168549 #ifdef CONFIG_IA64
69178550 /**
6918
- * set_curr_task - set the current task for a given CPU.
8551
+ * ia64_set_curr_task - set the current task for a given CPU.
69198552 * @cpu: the processor in question.
69208553 * @p: the task pointer to set.
69218554 *
....@@ -7081,8 +8714,15 @@
70818714
70828715 if (queued)
70838716 enqueue_task(rq, tsk, queue_flags);
7084
- if (running)
7085
- set_curr_task(rq, tsk);
8717
+ if (running) {
8718
+ set_next_task(rq, tsk);
8719
+ /*
8720
+ * After changing group, the running task may have joined a
8721
+ * throttled one but it's still the running task. Trigger a
8722
+ * resched to make sure that task can still run.
8723
+ */
8724
+ resched_curr(rq);
8725
+ }
70868726
70878727 task_rq_unlock(rq, tsk, &rf);
70888728 }
....@@ -7121,9 +8761,14 @@
71218761
71228762 #ifdef CONFIG_UCLAMP_TASK_GROUP
71238763 /* Propagate the effective uclamp value for the new group */
8764
+ mutex_lock(&uclamp_mutex);
8765
+ rcu_read_lock();
71248766 cpu_util_update_eff(css);
8767
+ rcu_read_unlock();
8768
+ mutex_unlock(&uclamp_mutex);
71258769 #endif
71268770
8771
+ trace_android_rvh_cpu_cgroup_online(css);
71278772 return 0;
71288773 }
71298774
....@@ -7189,6 +8834,9 @@
71898834 if (ret)
71908835 break;
71918836 }
8837
+
8838
+ trace_android_rvh_cpu_cgroup_can_attach(tset, &ret);
8839
+
71928840 return ret;
71938841 }
71948842
....@@ -7199,6 +8847,8 @@
71998847
72008848 cgroup_taskset_for_each(task, css, tset)
72018849 sched_move_task(task);
8850
+
8851
+ trace_android_rvh_cpu_cgroup_attach(tset);
72028852 }
72038853
72048854 #ifdef CONFIG_UCLAMP_TASK_GROUP
....@@ -7210,6 +8860,9 @@
72108860 unsigned int eff[UCLAMP_CNT];
72118861 enum uclamp_id clamp_id;
72128862 unsigned int clamps;
8863
+
8864
+ lockdep_assert_held(&uclamp_mutex);
8865
+ SCHED_WARN_ON(!rcu_read_lock_held());
72138866
72148867 css_for_each_descendant_pre(css, top_css) {
72158868 uc_parent = css_tg(css)->parent
....@@ -7243,7 +8896,7 @@
72438896 }
72448897
72458898 /* Immediately update descendants RUNNABLE tasks */
7246
- uclamp_update_active_tasks(css, clamps);
8899
+ uclamp_update_active_tasks(css);
72478900 }
72488901 }
72498902
....@@ -7300,6 +8953,8 @@
73008953 req = capacity_from_percent(buf);
73018954 if (req.ret)
73028955 return req.ret;
8956
+
8957
+ static_branch_enable(&sched_uclamp_used);
73038958
73048959 mutex_lock(&uclamp_mutex);
73058960 rcu_read_lock();
....@@ -7415,7 +9070,9 @@
74159070 static DEFINE_MUTEX(cfs_constraints_mutex);
74169071
74179072 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7418
-const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9073
+static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9074
+/* More than 203 days if BW_SHIFT equals 20. */
9075
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
74199076
74209077 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
74219078
....@@ -7441,6 +9098,12 @@
74419098 * feasibility.
74429099 */
74439100 if (period > max_cfs_quota_period)
9101
+ return -EINVAL;
9102
+
9103
+ /*
9104
+ * Bound quota to defend quota against overflow during bandwidth shift.
9105
+ */
9106
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
74449107 return -EINVAL;
74459108
74469109 /*
....@@ -7495,7 +9158,7 @@
74959158 return ret;
74969159 }
74979160
7498
-int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9161
+static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
74999162 {
75009163 u64 quota, period;
75019164
....@@ -7510,7 +9173,7 @@
75109173 return tg_set_cfs_bandwidth(tg, period, quota);
75119174 }
75129175
7513
-long tg_get_cfs_quota(struct task_group *tg)
9176
+static long tg_get_cfs_quota(struct task_group *tg)
75149177 {
75159178 u64 quota_us;
75169179
....@@ -7523,7 +9186,7 @@
75239186 return quota_us;
75249187 }
75259188
7526
-int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9189
+static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
75279190 {
75289191 u64 quota, period;
75299192
....@@ -7536,7 +9199,7 @@
75369199 return tg_set_cfs_bandwidth(tg, period, quota);
75379200 }
75389201
7539
-long tg_get_cfs_period(struct task_group *tg)
9202
+static long tg_get_cfs_period(struct task_group *tg)
75409203 {
75419204 u64 cfs_period_us;
75429205
....@@ -8013,4 +9676,7 @@
80139676 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
80149677 };
80159678
8016
-#undef CREATE_TRACE_POINTS
9679
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
9680
+{
9681
+ trace_sched_update_nr_running_tp(rq, count);
9682
+}
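
call_trace_sched_update_nr_running() is a plain wrapper that lets callers outside this file fire the bare sched_update_nr_running_tp tracepoint. A sketch of an external consumer, assuming the register/unregister helpers generated for the tracepoint are available to GPL modules (probe and module names are invented; struct rq is scheduler-private, so an out-of-tree probe should treat the pointer as opaque unless it carries the scheduler headers):

#include <linux/module.h>
#include <linux/tracepoint.h>
#include <trace/events/sched.h>

static void probe_nr_running(void *data, struct rq *rq, int change)
{
	/* Feed a load tracker here; probes must not sleep. */
}

static int __init nr_running_probe_init(void)
{
	return register_trace_sched_update_nr_running_tp(probe_nr_running, NULL);
}

static void __exit nr_running_probe_exit(void)
{
	unregister_trace_sched_update_nr_running_tp(probe_nr_running, NULL);
	tracepoint_synchronize_unregister();
}

module_init(nr_running_probe_init);
module_exit(nr_running_probe_exit);
MODULE_LICENSE("GPL");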