forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/kernel/sched/core.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * kernel/sched/core.c
34 *
....@@ -5,6 +6,10 @@
56 *
67 * Copyright (C) 1991-2002 Linus Torvalds
78 */
9
+#define CREATE_TRACE_POINTS
10
+#include <trace/events/sched.h>
11
+#undef CREATE_TRACE_POINTS
12
+
813 #include "sched.h"
914
1015 #include <linux/nospec.h>
....@@ -16,14 +21,41 @@
1621 #include <asm/tlb.h>
1722
1823 #include "../workqueue_internal.h"
24
+#include "../../io_uring/io-wq.h"
1925 #include "../smpboot.h"
2026
2127 #include "pelt.h"
28
+#include "smp.h"
2229
23
-#define CREATE_TRACE_POINTS
24
-#include <trace/events/sched.h>
30
+#include <trace/hooks/sched.h>
31
+#include <trace/hooks/dtask.h>
32
+
33
+/*
34
+ * Export tracepoints that act as a bare tracehook (ie: have no trace event
35
+ * associated with them) to allow external modules to probe them.
36
+ */
37
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
38
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
39
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
40
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
41
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
42
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
43
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
44
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
45
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
46
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
47
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
48
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_switch);
49
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
50
+#ifdef CONFIG_SCHEDSTATS
51
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
52
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
53
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
54
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
55
+#endif
2556
2657 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
58
+EXPORT_SYMBOL_GPL(runqueues);
2759
2860 #ifdef CONFIG_SCHED_DEBUG
2961 /*
....@@ -38,6 +70,7 @@
3870 const_debug unsigned int sysctl_sched_features =
3971 #include "features.h"
4072 0;
73
+EXPORT_SYMBOL_GPL(sysctl_sched_features);
4174 #undef SCHED_FEAT
4275 #endif
4376
....@@ -45,7 +78,7 @@
4578 * Number of tasks to iterate in a single balance run.
4679 * Limited because this is done with IRQs disabled.
4780 */
48
-#ifdef CONFIG_PREEMPT_RT_FULL
81
+#ifdef CONFIG_PREEMPT_RT
4982 const_debug unsigned int sysctl_sched_nr_migrate = 8;
5083 #else
5184 const_debug unsigned int sysctl_sched_nr_migrate = 32;
....@@ -64,6 +97,100 @@
6497 * default: 0.95s
6598 */
6699 int sysctl_sched_rt_runtime = 950000;
100
+
101
+
102
+/*
103
+ * Serialization rules:
104
+ *
105
+ * Lock order:
106
+ *
107
+ * p->pi_lock
108
+ * rq->lock
109
+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
110
+ *
111
+ * rq1->lock
112
+ * rq2->lock where: rq1 < rq2
113
+ *
114
+ * Regular state:
115
+ *
116
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
117
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
118
+ * always looks at the local rq data structures to find the most eligible task
119
+ * to run next.
120
+ *
121
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
122
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
123
+ * the local CPU to avoid bouncing the runqueue state around [ see
124
+ * ttwu_queue_wakelist() ]
125
+ *
126
+ * Task wakeup, specifically wakeups that involve migration, are horribly
127
+ * complicated to avoid having to take two rq->locks.
128
+ *
129
+ * Special state:
130
+ *
131
+ * System-calls and anything external will use task_rq_lock() which acquires
132
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
133
+ * stable while holding either lock:
134
+ *
135
+ * - sched_setaffinity()/
136
+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
137
+ * - set_user_nice(): p->se.load, p->*prio
138
+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
139
+ * p->se.load, p->rt_priority,
140
+ * p->dl.dl_{runtime, deadline, period, flags, bw, density}
141
+ * - sched_setnuma(): p->numa_preferred_nid
142
+ * - sched_move_task()/
143
+ * cpu_cgroup_fork(): p->sched_task_group
144
+ * - uclamp_update_active() p->uclamp*
145
+ *
146
+ * p->state <- TASK_*:
147
+ *
148
+ * is changed locklessly using set_current_state(), __set_current_state() or
149
+ * set_special_state(), see their respective comments, or by
150
+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against
151
+ * concurrent self.
152
+ *
153
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
154
+ *
155
+ * is set by activate_task() and cleared by deactivate_task(), under
156
+ * rq->lock. Non-zero indicates the task is runnable, the special
157
+ * ON_RQ_MIGRATING state is used for migration without holding both
158
+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
159
+ *
160
+ * p->on_cpu <- { 0, 1 }:
161
+ *
162
+ * is set by prepare_task() and cleared by finish_task() such that it will be
163
+ * set before p is scheduled-in and cleared after p is scheduled-out, both
164
+ * under rq->lock. Non-zero indicates the task is running on its CPU.
165
+ *
166
+ * [ The astute reader will observe that it is possible for two tasks on one
167
+ * CPU to have ->on_cpu = 1 at the same time. ]
168
+ *
169
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
170
+ *
171
+ * - Don't call set_task_cpu() on a blocked task:
172
+ *
173
+ * We don't care what CPU we're not running on, this simplifies hotplug,
174
+ * the CPU assignment of blocked tasks isn't required to be valid.
175
+ *
176
+ * - for try_to_wake_up(), called under p->pi_lock:
177
+ *
178
+ * This allows try_to_wake_up() to only take one rq->lock, see its comment.
179
+ *
180
+ * - for migration called under rq->lock:
181
+ * [ see task_on_rq_migrating() in task_rq_lock() ]
182
+ *
183
+ * o move_queued_task()
184
+ * o detach_task()
185
+ *
186
+ * - for migration called under double_rq_lock():
187
+ *
188
+ * o __migrate_swap_task()
189
+ * o push_rt_task() / pull_rt_task()
190
+ * o push_dl_task() / pull_dl_task()
191
+ * o dl_task_offline_migration()
192
+ *
193
+ */
67194
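
A minimal sketch of the nesting documented above (illustrative only, not part of this patch; the helper name is invented, and the real __task_rq_lock()/task_rq_lock() below additionally re-check task_rq(p) in a loop, since the task can migrate between the two acquisitions):

static struct rq *sketch_task_rq_lock(struct task_struct *p, unsigned long *flags)
{
	struct rq *rq;

	raw_spin_lock_irqsave(&p->pi_lock, *flags);	/* outer: serializes wakeups and affinity */
	rq = task_rq(p);
	raw_spin_lock(&rq->lock);			/* inner: serializes enqueue/dequeue/pick */

	/* With both locks held, p's scheduling state is stable. */
	return rq;
}
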
68195 /*
69196 * __task_rq_lock - lock the rq @p resides on.
....@@ -88,6 +215,7 @@
88215 cpu_relax();
89216 }
90217 }
218
+EXPORT_SYMBOL_GPL(__task_rq_lock);
91219
92220 /*
93221 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
....@@ -130,6 +258,7 @@
130258 cpu_relax();
131259 }
132260 }
261
+EXPORT_SYMBOL_GPL(task_rq_lock);
133262
134263 /*
135264 * RQ-clock updating methods:
....@@ -210,7 +339,15 @@
210339 rq->clock += delta;
211340 update_rq_clock_task(rq, delta);
212341 }
342
+EXPORT_SYMBOL_GPL(update_rq_clock);
213343
344
+static inline void
345
+rq_csd_init(struct rq *rq, struct __call_single_data *csd, smp_call_func_t func)
346
+{
347
+ csd->flags = 0;
348
+ csd->func = func;
349
+ csd->info = rq;
350
+}
214351
215352 #ifdef CONFIG_SCHED_HRTICK
216353 /*
....@@ -247,8 +384,9 @@
247384 static void __hrtick_restart(struct rq *rq)
248385 {
249386 struct hrtimer *timer = &rq->hrtick_timer;
387
+ ktime_t time = rq->hrtick_time;
250388
251
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
389
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
252390 }
253391
254392 /*
....@@ -261,7 +399,6 @@
261399
262400 rq_lock(rq, &rf);
263401 __hrtick_restart(rq);
264
- rq->hrtick_csd_pending = 0;
265402 rq_unlock(rq, &rf);
266403 }
267404
....@@ -273,7 +410,6 @@
273410 void hrtick_start(struct rq *rq, u64 delay)
274411 {
275412 struct hrtimer *timer = &rq->hrtick_timer;
276
- ktime_t time;
277413 s64 delta;
278414
279415 /*
....@@ -281,16 +417,12 @@
281417 * doesn't make sense and can cause timer DoS.
282418 */
283419 delta = max_t(s64, delay, 10000LL);
284
- time = ktime_add_ns(timer->base->get_time(), delta);
420
+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
285421
286
- hrtimer_set_expires(timer, time);
287
-
288
- if (rq == this_rq()) {
422
+ if (rq == this_rq())
289423 __hrtick_restart(rq);
290
- } else if (!rq->hrtick_csd_pending) {
424
+ else
291425 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
292
- rq->hrtick_csd_pending = 1;
293
- }
294426 }
295427
296428 #else
....@@ -307,20 +439,16 @@
307439 */
308440 delay = max_t(u64, delay, 10000LL);
309441 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
310
- HRTIMER_MODE_REL_PINNED);
442
+ HRTIMER_MODE_REL_PINNED_HARD);
311443 }
444
+
312445 #endif /* CONFIG_SMP */
313446
314447 static void hrtick_rq_init(struct rq *rq)
315448 {
316449 #ifdef CONFIG_SMP
317
- rq->hrtick_csd_pending = 0;
318
-
319
- rq->hrtick_csd.flags = 0;
320
- rq->hrtick_csd.func = __hrtick_start;
321
- rq->hrtick_csd.info = rq;
450
+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
322451 #endif
323
-
324452 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
325453 rq->hrtick_timer.function = hrtick;
326454 }
....@@ -403,8 +531,8 @@
403531 #endif
404532 #endif
405533
406
-void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
407
- bool sleeper)
534
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task,
535
+ bool sleeper)
408536 {
409537 struct wake_q_node *node;
410538
....@@ -422,23 +550,65 @@
422550 * state, even in the failed case, an explicit smp_mb() must be used.
423551 */
424552 smp_mb__before_atomic();
425
- if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
426
- return;
427
-
428
- head->count++;
429
-
430
- get_task_struct(task);
553
+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
554
+ return false;
431555
432556 /*
433557 * The head is context local, there can be no concurrency.
434558 */
435559 *head->lastp = node;
436560 head->lastp = &node->next;
561
+ head->count++;
562
+ return true;
437563 }
438564
439
-static int
440
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
441
- int sibling_count_hint);
565
+/**
566
+ * wake_q_add() - queue a wakeup for 'later' waking.
567
+ * @head: the wake_q_head to add @task to
568
+ * @task: the task to queue for 'later' wakeup
569
+ *
570
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
571
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
572
+ * instantly.
573
+ *
574
+ * This function must be used as-if it were wake_up_process(); IOW the task
575
+ * must be ready to be woken at this location.
576
+ */
577
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
578
+{
579
+ if (__wake_q_add(head, task, false))
580
+ get_task_struct(task);
581
+}
582
+
583
+void wake_q_add_sleeper(struct wake_q_head *head, struct task_struct *task)
584
+{
585
+ if (__wake_q_add(head, task, true))
586
+ get_task_struct(task);
587
+}
588
+
589
+/**
590
+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
591
+ * @head: the wake_q_head to add @task to
592
+ * @task: the task to queue for 'later' wakeup
593
+ *
594
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
595
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
596
+ * instantly.
597
+ *
598
+ * This function must be used as-if it were wake_up_process(); IOW the task
599
+ * must be ready to be woken at this location.
600
+ *
601
+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
602
+ * that already hold a reference to @task can call the 'safe' version and trust
603
+ * wake_q to do the right thing depending on whether or not the @task is already
604
+ * queued for wakeup.
605
+ */
606
+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
607
+{
608
+ if (!__wake_q_add(head, task, false))
609
+ put_task_struct(task);
610
+}
611
+
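
A minimal usage sketch for the wake_q API above (illustrative only; the waiter structure and its lock are hypothetical, while DEFINE_WAKE_Q(), wake_q_add() and wake_up_q() are the real interfaces from <linux/sched/wake_q.h>). Wakeups are queued while a lock is held and only issued after it has been dropped:

#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>
#include <linux/list.h>

struct my_waiter {				/* hypothetical waiter object */
	struct task_struct	*task;
	struct list_head	node;
};

static DEFINE_SPINLOCK(my_waiters_lock);	/* hypothetical lock for the list */

static void my_wake_all(struct list_head *waiters)
{
	DEFINE_WAKE_Q(wake_q);
	struct my_waiter *w;

	spin_lock(&my_waiters_lock);
	list_for_each_entry(w, waiters, node)
		wake_q_add(&wake_q, w->task);	/* takes a task reference if queued */
	spin_unlock(&my_waiters_lock);

	wake_up_q(&wake_q);			/* the actual wakeups, outside the lock */
}
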
442612 void __wake_up_q(struct wake_q_head *head, bool sleeper)
443613 {
444614 struct wake_q_node *node = head->first;
....@@ -450,13 +620,16 @@
450620 task = container_of(node, struct task_struct, wake_q_sleeper);
451621 else
452622 task = container_of(node, struct task_struct, wake_q);
623
+
453624 BUG_ON(!task);
454625 /* Task can safely be re-inserted now: */
455626 node = node->next;
627
+ task->wake_q_count = head->count;
456628 if (sleeper)
457629 task->wake_q_sleeper.next = NULL;
458630 else
459631 task->wake_q.next = NULL;
632
+
460633 /*
461634 * wake_up_process() executes a full barrier, which pairs with
462635 * the queueing in wake_q_add() so as not to miss wakeups.
....@@ -466,6 +639,7 @@
466639 else
467640 wake_up_process(task);
468641
642
+ task->wake_q_count = 0;
469643 put_task_struct(task);
470644 }
471645 }
....@@ -495,15 +669,12 @@
495669 return;
496670 }
497671
498
-#ifdef CONFIG_PREEMPT
499672 if (set_nr_and_not_polling(curr))
500
-#else
501
- if (set_nr_and_not_polling(curr) && (rq->curr == rq->idle))
502
-#endif
503673 smp_send_reschedule(cpu);
504674 else
505675 trace_sched_wake_idle_without_ipi(cpu);
506676 }
677
+EXPORT_SYMBOL_GPL(resched_curr);
507678
508679 #ifdef CONFIG_PREEMPT_LAZY
509680
....@@ -570,27 +741,49 @@
570741 */
571742 int get_nohz_timer_target(void)
572743 {
573
- int i, cpu = smp_processor_id();
744
+ int i, cpu = smp_processor_id(), default_cpu = -1;
574745 struct sched_domain *sd;
575746
576
- if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
577
- return cpu;
747
+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
748
+ if (!idle_cpu(cpu))
749
+ return cpu;
750
+ default_cpu = cpu;
751
+ }
578752
579753 rcu_read_lock();
580754 for_each_domain(cpu, sd) {
581
- for_each_cpu(i, sched_domain_span(sd)) {
755
+ for_each_cpu_and(i, sched_domain_span(sd),
756
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
582757 if (cpu == i)
583758 continue;
584759
585
- if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
760
+ if (!idle_cpu(i)) {
586761 cpu = i;
587762 goto unlock;
588763 }
589764 }
590765 }
591766
592
- if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
593
- cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
767
+ if (default_cpu == -1) {
768
+ for_each_cpu_and(i, cpu_active_mask,
769
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
770
+ if (cpu == i)
771
+ continue;
772
+
773
+ if (!idle_cpu(i)) {
774
+ cpu = i;
775
+ goto unlock;
776
+ }
777
+ }
778
+
779
+ /* no active, not-idle, housekeeping CPU found. */
780
+ default_cpu = cpumask_any(cpu_active_mask);
781
+
782
+ if (unlikely(default_cpu >= nr_cpu_ids))
783
+ goto unlock;
784
+ }
785
+
786
+ cpu = default_cpu;
594787 unlock:
595788 rcu_read_unlock();
596789 return cpu;
....@@ -650,29 +843,23 @@
650843 wake_up_idle_cpu(cpu);
651844 }
652845
653
-static inline bool got_nohz_idle_kick(void)
846
+static void nohz_csd_func(void *info)
654847 {
655
- int cpu = smp_processor_id();
656
-
657
- if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
658
- return false;
659
-
660
- if (idle_cpu(cpu) && !need_resched())
661
- return true;
848
+ struct rq *rq = info;
849
+ int cpu = cpu_of(rq);
850
+ unsigned int flags;
662851
663852 /*
664
- * We can't run Idle Load Balance on this CPU for this time so we
665
- * cancel it and clear NOHZ_BALANCE_KICK
853
+ * Release the rq::nohz_csd.
666854 */
667
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
668
- return false;
669
-}
855
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
856
+ WARN_ON(!(flags & NOHZ_KICK_MASK));
670857
671
-#else /* CONFIG_NO_HZ_COMMON */
672
-
673
-static inline bool got_nohz_idle_kick(void)
674
-{
675
- return false;
858
+ rq->idle_balance = idle_cpu(cpu);
859
+ if (rq->idle_balance && !need_resched()) {
860
+ rq->nohz_idle_balance = flags;
861
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
862
+ }
676863 }
677864
678865 #endif /* CONFIG_NO_HZ_COMMON */
....@@ -763,18 +950,18 @@
763950 }
764951 #endif
765952
766
-static void set_load_weight(struct task_struct *p, bool update_load)
953
+static void set_load_weight(struct task_struct *p)
767954 {
955
+ bool update_load = !(READ_ONCE(p->state) & TASK_NEW);
768956 int prio = p->static_prio - MAX_RT_PRIO;
769957 struct load_weight *load = &p->se.load;
770958
771959 /*
772960 * SCHED_IDLE tasks get minimal weight:
773961 */
774
- if (idle_policy(p->policy)) {
962
+ if (task_has_idle_policy(p)) {
775963 load->weight = scale_load(WEIGHT_IDLEPRIO);
776964 load->inv_weight = WMULT_IDLEPRIO;
777
- p->se.runnable_weight = load->weight;
778965 return;
779966 }
780967
....@@ -787,7 +974,6 @@
787974 } else {
788975 load->weight = scale_load(sched_prio_to_weight[prio]);
789976 load->inv_weight = sched_prio_to_wmult[prio];
790
- p->se.runnable_weight = load->weight;
791977 }
792978 }
793979
....@@ -810,8 +996,46 @@
810996 /* Max allowed maximum utilization */
811997 unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
812998
999
+/*
1000
+ * By default RT tasks run at the maximum performance point/capacity of the
1001
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
1002
+ * SCHED_CAPACITY_SCALE.
1003
+ *
1004
+ * This knob allows admins to change the default behavior when uclamp is being
1005
+ * used. In battery powered devices, particularly, running at the maximum
1006
+ * capacity and frequency will increase energy consumption and shorten the
1007
+ * battery life.
1008
+ *
1009
+ * This knob only affects RT tasks whose uclamp_se->user_defined == false.
1010
+ *
1011
+ * This knob will not override the system default sched_util_clamp_min defined
1012
+ * above.
1013
+ */
1014
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
1015
+
8131016 /* All clamps are required to be less or equal than these values */
8141017 static struct uclamp_se uclamp_default[UCLAMP_CNT];
1018
+
1019
+/*
1020
+ * This static key is used to reduce the uclamp overhead in the fast path. It
1021
+ * primarily disables the call to uclamp_rq_{inc, dec}() in
1022
+ * enqueue/dequeue_task().
1023
+ *
1024
+ * This allows users to continue to enable uclamp in their kernel config with
1025
+ * minimum uclamp overhead in the fast path.
1026
+ *
1027
+ * As soon as userspace modifies any of the uclamp knobs, the static key is
1028
+ * enabled, since we have actual users that make use of uclamp
1029
+ * functionality.
1030
+ *
1031
+ * The knobs that would enable this static key are:
1032
+ *
1033
+ * * A task modifying its uclamp value with sched_setattr().
1034
+ * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
1035
+ * * An admin modifying the cgroup cpu.uclamp.{min, max}
1036
+ */
1037
+DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
1038
+EXPORT_SYMBOL_GPL(sched_uclamp_used);
8151039
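
For context, the first knob listed above is reached from userspace roughly as sketched below (illustrative only: the struct mirrors the sched_setattr(2) uapi layout, there is no glibc wrapper so the raw syscall is used, and the helper name is invented). A successful call enables the sched_uclamp_used static key defined here; pid 0 means the calling thread, and clamp values range over 0..SCHED_CAPACITY_SCALE (1024).

#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#define SCHED_FLAG_KEEP_ALL		0x18	/* keep current policy and params */
#define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX	0x40

struct sched_attr {				/* mirrors uapi/linux/sched/types.h */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime, sched_deadline, sched_period;
	uint32_t sched_util_min, sched_util_max;
};

static int set_task_uclamp(pid_t pid, uint32_t umin, uint32_t umax)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size	    = sizeof(attr);
	attr.sched_flags    = SCHED_FLAG_KEEP_ALL |
			      SCHED_FLAG_UTIL_CLAMP_MIN | SCHED_FLAG_UTIL_CLAMP_MAX;
	attr.sched_util_min = umin;
	attr.sched_util_max = umax;

	return syscall(SYS_sched_setattr, pid, &attr, 0);
}
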
8161040 /* Integer rounded range for each bucket */
8171041 #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
....@@ -822,11 +1046,6 @@
8221046 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
8231047 {
8241048 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
825
-}
826
-
827
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
828
-{
829
- return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
8301049 }
8311050
8321051 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
....@@ -892,12 +1111,79 @@
8921111 return uclamp_idle_value(rq, clamp_id, clamp_value);
8931112 }
8941113
1114
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1115
+{
1116
+ unsigned int default_util_min;
1117
+ struct uclamp_se *uc_se;
1118
+
1119
+ lockdep_assert_held(&p->pi_lock);
1120
+
1121
+ uc_se = &p->uclamp_req[UCLAMP_MIN];
1122
+
1123
+ /* Only sync if user didn't override the default */
1124
+ if (uc_se->user_defined)
1125
+ return;
1126
+
1127
+ default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1128
+ uclamp_se_set(uc_se, default_util_min, false);
1129
+}
1130
+
1131
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
1132
+{
1133
+ struct rq_flags rf;
1134
+ struct rq *rq;
1135
+
1136
+ if (!rt_task(p))
1137
+ return;
1138
+
1139
+ /* Protect updates to p->uclamp_* */
1140
+ rq = task_rq_lock(p, &rf);
1141
+ __uclamp_update_util_min_rt_default(p);
1142
+ task_rq_unlock(rq, p, &rf);
1143
+}
1144
+
1145
+static void uclamp_sync_util_min_rt_default(void)
1146
+{
1147
+ struct task_struct *g, *p;
1148
+
1149
+ /*
1150
+ * copy_process() sysctl_uclamp
1151
+ * uclamp_min_rt = X;
1152
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
1153
+ * // link thread smp_mb__after_spinlock()
1154
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
1155
+ * sched_post_fork() for_each_process_thread()
1156
+ * __uclamp_sync_rt() __uclamp_sync_rt()
1157
+ *
1158
+ * Ensures that either sched_post_fork() will observe the new
1159
+ * uclamp_min_rt or for_each_process_thread() will observe the new
1160
+ * task.
1161
+ */
1162
+ read_lock(&tasklist_lock);
1163
+ smp_mb__after_spinlock();
1164
+ read_unlock(&tasklist_lock);
1165
+
1166
+ rcu_read_lock();
1167
+ for_each_process_thread(g, p)
1168
+ uclamp_update_util_min_rt_default(p);
1169
+ rcu_read_unlock();
1170
+}
1171
+
1172
+#if IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)
1173
+void rockchip_perf_uclamp_sync_util_min_rt_default(void)
1174
+{
1175
+ uclamp_sync_util_min_rt_default();
1176
+}
1177
+EXPORT_SYMBOL(rockchip_perf_uclamp_sync_util_min_rt_default);
1178
+#endif
1179
+
8951180 static inline struct uclamp_se
8961181 uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
8971182 {
1183
+ /* Copy by value as we could modify it */
8981184 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
8991185 #ifdef CONFIG_UCLAMP_TASK_GROUP
900
- struct uclamp_se uc_max;
1186
+ unsigned int tg_min, tg_max, value;
9011187
9021188 /*
9031189 * Tasks in autogroups or root task group will be
....@@ -908,9 +1194,11 @@
9081194 if (task_group(p) == &root_task_group)
9091195 return uc_req;
9101196
911
- uc_max = task_group(p)->uclamp[clamp_id];
912
- if (uc_req.value > uc_max.value || !uc_req.user_defined)
913
- return uc_max;
1197
+ tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1198
+ tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1199
+ value = uc_req.value;
1200
+ value = clamp(value, tg_min, tg_max);
1201
+ uclamp_se_set(&uc_req, value, false);
9141202 #endif
9151203
9161204 return uc_req;
....@@ -929,6 +1217,12 @@
9291217 {
9301218 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
9311219 struct uclamp_se uc_max = uclamp_default[clamp_id];
1220
+ struct uclamp_se uc_eff;
1221
+ int ret = 0;
1222
+
1223
+ trace_android_rvh_uclamp_eff_get(p, clamp_id, &uc_max, &uc_eff, &ret);
1224
+ if (ret)
1225
+ return uc_eff;
9321226
9331227 /* System default restrictions always apply */
9341228 if (unlikely(uc_req.value > uc_max.value))
....@@ -949,6 +1243,7 @@
9491243
9501244 return (unsigned long)uc_eff.value;
9511245 }
1246
+EXPORT_SYMBOL_GPL(uclamp_eff_value);
9521247
9531248 /*
9541249 * When a task is enqueued on a rq, the clamp bucket currently defined by the
....@@ -1009,10 +1304,38 @@
10091304
10101305 lockdep_assert_held(&rq->lock);
10111306
1307
+ /*
1308
+ * If sched_uclamp_used was enabled after task @p was enqueued,
1309
+ * we could end up with unbalanced call to uclamp_rq_dec_id().
1310
+ *
1311
+ * In this case the uc_se->active flag should be false since no uclamp
1312
+ * accounting was performed at enqueue time and we can just return
1313
+ * here.
1314
+ *
1315
+ * Need to be careful of the following enqueue/dequeue ordering
1316
+ * problem too
1317
+ *
1318
+ * enqueue(taskA)
1319
+ * // sched_uclamp_used gets enabled
1320
+ * enqueue(taskB)
1321
+ * dequeue(taskA)
1322
+ * // Must not decrement bucket->tasks here
1323
+ * dequeue(taskB)
1324
+ *
1325
+ * where we could end up with stale data in uc_se and
1326
+ * bucket[uc_se->bucket_id].
1327
+ *
1328
+ * The following check here eliminates the possibility of such race.
1329
+ */
1330
+ if (unlikely(!uc_se->active))
1331
+ return;
1332
+
10121333 bucket = &uc_rq->bucket[uc_se->bucket_id];
1334
+
10131335 SCHED_WARN_ON(!bucket->tasks);
10141336 if (likely(bucket->tasks))
10151337 bucket->tasks--;
1338
+
10161339 uc_se->active = false;
10171340
10181341 /*
....@@ -1040,6 +1363,15 @@
10401363 {
10411364 enum uclamp_id clamp_id;
10421365
1366
+ /*
1367
+ * Avoid any overhead until uclamp is actually used by the userspace.
1368
+ *
1369
+ * The condition is constructed such that a NOP is generated when
1370
+ * sched_uclamp_used is disabled.
1371
+ */
1372
+ if (!static_branch_unlikely(&sched_uclamp_used))
1373
+ return;
1374
+
10431375 if (unlikely(!p->sched_class->uclamp_enabled))
10441376 return;
10451377
....@@ -1055,6 +1387,15 @@
10551387 {
10561388 enum uclamp_id clamp_id;
10571389
1390
+ /*
1391
+ * Avoid any overhead until uclamp is actually used by the userspace.
1392
+ *
1393
+ * The condition is constructed such that a NOP is generated when
1394
+ * sched_uclamp_used is disabled.
1395
+ */
1396
+ if (!static_branch_unlikely(&sched_uclamp_used))
1397
+ return;
1398
+
10581399 if (unlikely(!p->sched_class->uclamp_enabled))
10591400 return;
10601401
....@@ -1062,9 +1403,27 @@
10621403 uclamp_rq_dec_id(rq, p, clamp_id);
10631404 }
10641405
1065
-static inline void
1066
-uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
1406
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
1407
+ enum uclamp_id clamp_id)
10671408 {
1409
+ if (!p->uclamp[clamp_id].active)
1410
+ return;
1411
+
1412
+ uclamp_rq_dec_id(rq, p, clamp_id);
1413
+ uclamp_rq_inc_id(rq, p, clamp_id);
1414
+
1415
+ /*
1416
+ * Make sure to clear the idle flag if we've transiently reached 0
1417
+ * active tasks on rq.
1418
+ */
1419
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1420
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1421
+}
1422
+
1423
+static inline void
1424
+uclamp_update_active(struct task_struct *p)
1425
+{
1426
+ enum uclamp_id clamp_id;
10681427 struct rq_flags rf;
10691428 struct rq *rq;
10701429
....@@ -1084,30 +1443,22 @@
10841443 * affecting a valid clamp bucket, the next time it's enqueued,
10851444 * it will already see the updated clamp bucket value.
10861445 */
1087
- if (p->uclamp[clamp_id].active) {
1088
- uclamp_rq_dec_id(rq, p, clamp_id);
1089
- uclamp_rq_inc_id(rq, p, clamp_id);
1090
- }
1446
+ for_each_clamp_id(clamp_id)
1447
+ uclamp_rq_reinc_id(rq, p, clamp_id);
10911448
10921449 task_rq_unlock(rq, p, &rf);
10931450 }
10941451
10951452 #ifdef CONFIG_UCLAMP_TASK_GROUP
10961453 static inline void
1097
-uclamp_update_active_tasks(struct cgroup_subsys_state *css,
1098
- unsigned int clamps)
1454
+uclamp_update_active_tasks(struct cgroup_subsys_state *css)
10991455 {
1100
- enum uclamp_id clamp_id;
11011456 struct css_task_iter it;
11021457 struct task_struct *p;
11031458
11041459 css_task_iter_start(css, 0, &it);
1105
- while ((p = css_task_iter_next(&it))) {
1106
- for_each_clamp_id(clamp_id) {
1107
- if ((0x1 << clamp_id) & clamps)
1108
- uclamp_update_active(p, clamp_id);
1109
- }
1110
- }
1460
+ while ((p = css_task_iter_next(&it)))
1461
+ uclamp_update_active(p);
11111462 css_task_iter_end(&it);
11121463 }
11131464
....@@ -1130,16 +1481,16 @@
11301481 #endif
11311482
11321483 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1133
- void __user *buffer, size_t *lenp,
1134
- loff_t *ppos)
1484
+ void *buffer, size_t *lenp, loff_t *ppos)
11351485 {
11361486 bool update_root_tg = false;
1137
- int old_min, old_max;
1487
+ int old_min, old_max, old_min_rt;
11381488 int result;
11391489
11401490 mutex_lock(&uclamp_mutex);
11411491 old_min = sysctl_sched_uclamp_util_min;
11421492 old_max = sysctl_sched_uclamp_util_max;
1493
+ old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
11431494
11441495 result = proc_dointvec(table, write, buffer, lenp, ppos);
11451496 if (result)
....@@ -1148,7 +1499,9 @@
11481499 goto done;
11491500
11501501 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1151
- sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
1502
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1503
+ sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1504
+
11521505 result = -EINVAL;
11531506 goto undo;
11541507 }
....@@ -1164,8 +1517,15 @@
11641517 update_root_tg = true;
11651518 }
11661519
1167
- if (update_root_tg)
1520
+ if (update_root_tg) {
1521
+ static_branch_enable(&sched_uclamp_used);
11681522 uclamp_update_root_tg();
1523
+ }
1524
+
1525
+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1526
+ static_branch_enable(&sched_uclamp_used);
1527
+ uclamp_sync_util_min_rt_default();
1528
+ }
11691529
11701530 /*
11711531 * We update all RUNNABLE tasks only when task groups are in use.
....@@ -1178,6 +1538,7 @@
11781538 undo:
11791539 sysctl_sched_uclamp_util_min = old_min;
11801540 sysctl_sched_uclamp_util_max = old_max;
1541
+ sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
11811542 done:
11821543 mutex_unlock(&uclamp_mutex);
11831544
....@@ -1187,20 +1548,61 @@
11871548 static int uclamp_validate(struct task_struct *p,
11881549 const struct sched_attr *attr)
11891550 {
1190
- unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1191
- unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1551
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
1552
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
11921553
1193
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1194
- lower_bound = attr->sched_util_min;
1195
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1196
- upper_bound = attr->sched_util_max;
1554
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1555
+ util_min = attr->sched_util_min;
11971556
1198
- if (lower_bound > upper_bound)
1557
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1558
+ return -EINVAL;
1559
+ }
1560
+
1561
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1562
+ util_max = attr->sched_util_max;
1563
+
1564
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1565
+ return -EINVAL;
1566
+ }
1567
+
1568
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
11991569 return -EINVAL;
1200
- if (upper_bound > SCHED_CAPACITY_SCALE)
1201
- return -EINVAL;
1570
+
1571
+ /*
1572
+ * We have valid uclamp attributes; make sure uclamp is enabled.
1573
+ *
1574
+ * We need to do that here, because enabling static branches is a
1575
+ * blocking operation which obviously cannot be done while holding
1576
+ * scheduler locks.
1577
+ */
1578
+ static_branch_enable(&sched_uclamp_used);
12021579
12031580 return 0;
1581
+}
1582
+
1583
+static bool uclamp_reset(const struct sched_attr *attr,
1584
+ enum uclamp_id clamp_id,
1585
+ struct uclamp_se *uc_se)
1586
+{
1587
+ /* Reset on sched class change for a non user-defined clamp value. */
1588
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1589
+ !uc_se->user_defined)
1590
+ return true;
1591
+
1592
+ /* Reset on sched_util_{min,max} == -1. */
1593
+ if (clamp_id == UCLAMP_MIN &&
1594
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1595
+ attr->sched_util_min == -1) {
1596
+ return true;
1597
+ }
1598
+
1599
+ if (clamp_id == UCLAMP_MAX &&
1600
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1601
+ attr->sched_util_max == -1) {
1602
+ return true;
1603
+ }
1604
+
1605
+ return false;
12041606 }
12051607
12061608 static void __setscheduler_uclamp(struct task_struct *p,
....@@ -1208,40 +1610,41 @@
12081610 {
12091611 enum uclamp_id clamp_id;
12101612
1211
- /*
1212
- * On scheduling class change, reset to default clamps for tasks
1213
- * without a task-specific value.
1214
- */
12151613 for_each_clamp_id(clamp_id) {
12161614 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1217
- unsigned int clamp_value = uclamp_none(clamp_id);
1615
+ unsigned int value;
12181616
1219
- /* Keep using defined clamps across class changes */
1220
- if (uc_se->user_defined)
1617
+ if (!uclamp_reset(attr, clamp_id, uc_se))
12211618 continue;
12221619
1223
- /* By default, RT tasks always get 100% boost */
1224
- if (sched_feat(SUGOV_RT_MAX_FREQ) &&
1225
- unlikely(rt_task(p) &&
1226
- clamp_id == UCLAMP_MIN)) {
1620
+ /*
1621
+ * RT by default have a 100% boost value that could be modified
1622
+ * at runtime.
1623
+ */
1624
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1625
+ value = sysctl_sched_uclamp_util_min_rt_default;
1626
+ else
1627
+ value = uclamp_none(clamp_id);
12271628
1228
- clamp_value = uclamp_none(UCLAMP_MAX);
1229
- }
1629
+ uclamp_se_set(uc_se, value, false);
12301630
1231
- uclamp_se_set(uc_se, clamp_value, false);
12321631 }
12331632
12341633 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
12351634 return;
12361635
1237
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1636
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1637
+ attr->sched_util_min != -1) {
12381638 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
12391639 attr->sched_util_min, true);
1640
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MIN, attr->sched_util_min);
12401641 }
12411642
1242
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1643
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1644
+ attr->sched_util_max != -1) {
12431645 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
12441646 attr->sched_util_max, true);
1647
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max);
12451648 }
12461649 }
12471650
....@@ -1249,6 +1652,10 @@
12491652 {
12501653 enum uclamp_id clamp_id;
12511654
1655
+ /*
1656
+ * We don't need to hold task_rq_lock() when updating p->uclamp_* here
1657
+ * as the task is still at its early fork stages.
1658
+ */
12521659 for_each_clamp_id(clamp_id)
12531660 p->uclamp[clamp_id].active = false;
12541661
....@@ -1261,39 +1668,24 @@
12611668 }
12621669 }
12631670
1264
-#ifdef CONFIG_SMP
1265
-unsigned int uclamp_task(struct task_struct *p)
1671
+static void uclamp_post_fork(struct task_struct *p)
12661672 {
1267
- unsigned long util;
1268
-
1269
- util = task_util_est(p);
1270
- util = max(util, uclamp_eff_value(p, UCLAMP_MIN));
1271
- util = min(util, uclamp_eff_value(p, UCLAMP_MAX));
1272
-
1273
- return util;
1673
+ uclamp_update_util_min_rt_default(p);
12741674 }
12751675
1276
-bool uclamp_boosted(struct task_struct *p)
1676
+static void __init init_uclamp_rq(struct rq *rq)
12771677 {
1278
- return uclamp_eff_value(p, UCLAMP_MIN) > 0;
1678
+ enum uclamp_id clamp_id;
1679
+ struct uclamp_rq *uc_rq = rq->uclamp;
1680
+
1681
+ for_each_clamp_id(clamp_id) {
1682
+ uc_rq[clamp_id] = (struct uclamp_rq) {
1683
+ .value = uclamp_none(clamp_id)
1684
+ };
1685
+ }
1686
+
1687
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
12791688 }
1280
-
1281
-bool uclamp_latency_sensitive(struct task_struct *p)
1282
-{
1283
-#ifdef CONFIG_UCLAMP_TASK_GROUP
1284
- struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id);
1285
- struct task_group *tg;
1286
-
1287
- if (!css)
1288
- return false;
1289
- tg = container_of(css, struct task_group, css);
1290
-
1291
- return tg->latency_sensitive;
1292
-#else
1293
- return false;
1294
-#endif
1295
-}
1296
-#endif /* CONFIG_SMP */
12971689
12981690 static void __init init_uclamp(void)
12991691 {
....@@ -1301,13 +1693,8 @@
13011693 enum uclamp_id clamp_id;
13021694 int cpu;
13031695
1304
- mutex_init(&uclamp_mutex);
1305
-
1306
- for_each_possible_cpu(cpu) {
1307
- memset(&cpu_rq(cpu)->uclamp, 0,
1308
- sizeof(struct uclamp_rq)*UCLAMP_CNT);
1309
- cpu_rq(cpu)->uclamp_flags = 0;
1310
- }
1696
+ for_each_possible_cpu(cpu)
1697
+ init_uclamp_rq(cpu_rq(cpu));
13111698
13121699 for_each_clamp_id(clamp_id) {
13131700 uclamp_se_set(&init_task.uclamp_req[clamp_id],
....@@ -1336,41 +1723,7 @@
13361723 static void __setscheduler_uclamp(struct task_struct *p,
13371724 const struct sched_attr *attr) { }
13381725 static inline void uclamp_fork(struct task_struct *p) { }
1339
-
1340
-long schedtune_task_margin(struct task_struct *task);
1341
-
1342
-#ifdef CONFIG_SMP
1343
-unsigned int uclamp_task(struct task_struct *p)
1344
-{
1345
- unsigned long util = task_util_est(p);
1346
-#ifdef CONFIG_SCHED_TUNE
1347
- long margin = schedtune_task_margin(p);
1348
-
1349
- trace_sched_boost_task(p, util, margin);
1350
-
1351
- util += margin;
1352
-#endif
1353
-
1354
- return util;
1355
-}
1356
-
1357
-bool uclamp_boosted(struct task_struct *p)
1358
-{
1359
-#ifdef CONFIG_SCHED_TUNE
1360
- return schedtune_task_boost(p) > 0;
1361
-#endif
1362
- return false;
1363
-}
1364
-
1365
-bool uclamp_latency_sensitive(struct task_struct *p)
1366
-{
1367
-#ifdef CONFIG_SCHED_TUNE
1368
- return schedtune_prefer_idle(p) != 0;
1369
-#endif
1370
- return false;
1371
-}
1372
-#endif /* CONFIG_SMP */
1373
-
1726
+static inline void uclamp_post_fork(struct task_struct *p) { }
13741727 static inline void init_uclamp(void) { }
13751728 #endif /* CONFIG_UCLAMP_TASK */
13761729
....@@ -1385,7 +1738,9 @@
13851738 }
13861739
13871740 uclamp_rq_inc(rq, p);
1741
+ trace_android_rvh_enqueue_task(rq, p, flags);
13881742 p->sched_class->enqueue_task(rq, p, flags);
1743
+ trace_android_rvh_after_enqueue_task(rq, p);
13891744 }
13901745
13911746 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
....@@ -1399,31 +1754,39 @@
13991754 }
14001755
14011756 uclamp_rq_dec(rq, p);
1757
+ trace_android_rvh_dequeue_task(rq, p, flags);
14021758 p->sched_class->dequeue_task(rq, p, flags);
1759
+ trace_android_rvh_after_dequeue_task(rq, p);
14031760 }
14041761
14051762 void activate_task(struct rq *rq, struct task_struct *p, int flags)
14061763 {
1407
- if (task_contributes_to_load(p))
1408
- rq->nr_uninterruptible--;
1409
-
14101764 enqueue_task(rq, p, flags);
1765
+
1766
+ p->on_rq = TASK_ON_RQ_QUEUED;
14111767 }
1768
+EXPORT_SYMBOL_GPL(activate_task);
14121769
14131770 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
14141771 {
1415
- if (task_contributes_to_load(p))
1416
- rq->nr_uninterruptible++;
1772
+ p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
14171773
14181774 dequeue_task(rq, p, flags);
14191775 }
1776
+EXPORT_SYMBOL_GPL(deactivate_task);
14201777
1421
-/*
1422
- * __normal_prio - return the priority that is based on the static prio
1423
- */
1424
-static inline int __normal_prio(struct task_struct *p)
1778
+static inline int __normal_prio(int policy, int rt_prio, int nice)
14251779 {
1426
- return p->static_prio;
1780
+ int prio;
1781
+
1782
+ if (dl_policy(policy))
1783
+ prio = MAX_DL_PRIO - 1;
1784
+ else if (rt_policy(policy))
1785
+ prio = MAX_RT_PRIO - 1 - rt_prio;
1786
+ else
1787
+ prio = NICE_TO_PRIO(nice);
1788
+
1789
+ return prio;
14271790 }
14281791
14291792 /*
....@@ -1435,15 +1798,7 @@
14351798 */
14361799 static inline int normal_prio(struct task_struct *p)
14371800 {
1438
- int prio;
1439
-
1440
- if (task_has_dl_policy(p))
1441
- prio = MAX_DL_PRIO-1;
1442
- else if (task_has_rt_policy(p))
1443
- prio = MAX_RT_PRIO-1 - p->rt_priority;
1444
- else
1445
- prio = __normal_prio(p);
1446
- return prio;
1801
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
14471802 }
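
Worked examples for the helper above, using the stock priority maps (MAX_DL_PRIO is 0, MAX_RT_PRIO is 100, and NICE_TO_PRIO(n) is 120 + n); illustrative only, not part of the patch:

	__normal_prio(SCHED_DEADLINE,  0,  0);	/* MAX_DL_PRIO - 1      = -1  */
	__normal_prio(SCHED_FIFO,     50,  0);	/* MAX_RT_PRIO - 1 - 50 = 49  */
	__normal_prio(SCHED_NORMAL,    0,  0);	/* NICE_TO_PRIO(0)      = 120 */
	__normal_prio(SCHED_NORMAL,    0, 10);	/* NICE_TO_PRIO(10)     = 130 */
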
14481803
14491804 /*
....@@ -1499,20 +1854,10 @@
14991854
15001855 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
15011856 {
1502
- const struct sched_class *class;
1503
-
1504
- if (p->sched_class == rq->curr->sched_class) {
1857
+ if (p->sched_class == rq->curr->sched_class)
15051858 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1506
- } else {
1507
- for_each_class(class) {
1508
- if (class == rq->curr->sched_class)
1509
- break;
1510
- if (class == p->sched_class) {
1511
- resched_curr(rq);
1512
- break;
1513
- }
1514
- }
1515
- }
1859
+ else if (p->sched_class > rq->curr->sched_class)
1860
+ resched_curr(rq);
15161861
15171862 /*
15181863 * A queue event has occurred, and we're going to schedule. In
....@@ -1521,22 +1866,88 @@
15211866 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
15221867 rq_clock_skip_update(rq);
15231868 }
1869
+EXPORT_SYMBOL_GPL(check_preempt_curr);
15241870
15251871 #ifdef CONFIG_SMP
15261872
1527
-static inline bool is_per_cpu_kthread(struct task_struct *p)
1873
+static void
1874
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
1875
+
1876
+static int __set_cpus_allowed_ptr(struct task_struct *p,
1877
+ const struct cpumask *new_mask,
1878
+ u32 flags);
1879
+
1880
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
15281881 {
1529
- if (!(p->flags & PF_KTHREAD))
1530
- return false;
1882
+ if (likely(!p->migration_disabled))
1883
+ return;
15311884
1532
- if (p->nr_cpus_allowed != 1)
1533
- return false;
1885
+ if (p->cpus_ptr != &p->cpus_mask)
1886
+ return;
15341887
1535
- return true;
1888
+ /*
1889
+ * Violates locking rules! see comment in __do_set_cpus_allowed().
1890
+ */
1891
+ __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
1892
+}
1893
+
1894
+void migrate_disable(void)
1895
+{
1896
+ struct task_struct *p = current;
1897
+
1898
+ if (p->migration_disabled) {
1899
+ p->migration_disabled++;
1900
+ return;
1901
+ }
1902
+
1903
+ trace_sched_migrate_disable_tp(p);
1904
+
1905
+ preempt_disable();
1906
+ this_rq()->nr_pinned++;
1907
+ p->migration_disabled = 1;
1908
+ preempt_lazy_disable();
1909
+ preempt_enable();
1910
+}
1911
+EXPORT_SYMBOL_GPL(migrate_disable);
1912
+
1913
+void migrate_enable(void)
1914
+{
1915
+ struct task_struct *p = current;
1916
+
1917
+ if (p->migration_disabled > 1) {
1918
+ p->migration_disabled--;
1919
+ return;
1920
+ }
1921
+
1922
+ /*
1923
+ * Ensure stop_task runs either before or after this, and that
1924
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
1925
+ */
1926
+ preempt_disable();
1927
+ if (p->cpus_ptr != &p->cpus_mask)
1928
+ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
1929
+ /*
1930
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
1931
+ * regular cpus_mask, otherwise things that race (eg.
1932
+ * select_fallback_rq) get confused.
1933
+ */
1934
+ barrier();
1935
+ p->migration_disabled = 0;
1936
+ this_rq()->nr_pinned--;
1937
+ preempt_lazy_enable();
1938
+ preempt_enable();
1939
+
1940
+ trace_sched_migrate_enable_tp(p);
1941
+}
1942
+EXPORT_SYMBOL_GPL(migrate_enable);
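
A minimal sketch of the calling pattern for the pair above (hypothetical caller, not part of the patch). Unlike a preempt_disable() section, the region may still be preempted; the task is only guaranteed not to change CPU, and the calls nest via p->migration_disabled:

static void example_migrate_disable_region(void)
{
	migrate_disable();	/* current may be preempted, but stays on this CPU */

	/*
	 * Safe to rely on staying on the same CPU here, e.g. for per-CPU
	 * state that is only touched from task context.
	 */

	migrate_disable();	/* nests: only the outermost enable unpins */
	migrate_enable();

	migrate_enable();	/* migration is allowed again from here on */
}
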
1943
+
1944
+static inline bool rq_has_pinned_tasks(struct rq *rq)
1945
+{
1946
+ return rq->nr_pinned;
15361947 }
15371948
15381949 /*
1539
- * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
1950
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
15401951 * __set_cpus_allowed_ptr() and select_fallback_rq().
15411952 */
15421953 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
....@@ -1544,10 +1955,13 @@
15441955 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
15451956 return false;
15461957
1547
- if (is_per_cpu_kthread(p) || __migrate_disabled(p))
1958
+ if (is_per_cpu_kthread(p) || is_migration_disabled(p))
15481959 return cpu_online(cpu);
15491960
1550
- return cpu_active(cpu);
1961
+ if (!cpu_active(cpu))
1962
+ return false;
1963
+
1964
+ return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
15511965 }
15521966
15531967 /*
....@@ -1572,28 +1986,50 @@
15721986 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
15731987 struct task_struct *p, int new_cpu)
15741988 {
1989
+ int detached = 0;
1990
+
15751991 lockdep_assert_held(&rq->lock);
15761992
1577
- WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
1578
- dequeue_task(rq, p, DEQUEUE_NOCLOCK);
1579
- set_task_cpu(p, new_cpu);
1580
- rq_unlock(rq, rf);
1993
+ /*
1994
+ * The vendor hook may drop the lock temporarily, so
1995
+ * pass the rq flags to unpin the lock. We expect the
1996
+ * rq lock to be held after return.
1997
+ */
1998
+ trace_android_rvh_migrate_queued_task(rq, rf, p, new_cpu, &detached);
1999
+ if (detached)
2000
+ goto attach;
15812001
2002
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
2003
+ set_task_cpu(p, new_cpu);
2004
+
2005
+attach:
2006
+ rq_unlock(rq, rf);
15822007 rq = cpu_rq(new_cpu);
15832008
15842009 rq_lock(rq, rf);
15852010 BUG_ON(task_cpu(p) != new_cpu);
1586
- enqueue_task(rq, p, 0);
1587
- p->on_rq = TASK_ON_RQ_QUEUED;
2011
+ activate_task(rq, p, 0);
15882012 check_preempt_curr(rq, p, 0);
15892013
15902014 return rq;
15912015 }
15922016
15932017 struct migration_arg {
1594
- struct task_struct *task;
1595
- int dest_cpu;
1596
- bool done;
2018
+ struct task_struct *task;
2019
+ int dest_cpu;
2020
+ struct set_affinity_pending *pending;
2021
+};
2022
+
2023
+/*
2024
+ * @refs: number of wait_for_completion()
2025
+ * @stop_pending: is @stop_work in use
2026
+ */
2027
+struct set_affinity_pending {
2028
+ refcount_t refs;
2029
+ unsigned int stop_pending;
2030
+ struct completion done;
2031
+ struct cpu_stop_work stop_work;
2032
+ struct migration_arg arg;
15972033 };
15982034
15992035 /*
....@@ -1626,44 +2062,141 @@
16262062 static int migration_cpu_stop(void *data)
16272063 {
16282064 struct migration_arg *arg = data;
2065
+ struct set_affinity_pending *pending = arg->pending;
16292066 struct task_struct *p = arg->task;
16302067 struct rq *rq = this_rq();
2068
+ bool complete = false;
16312069 struct rq_flags rf;
1632
- int dest_cpu = arg->dest_cpu;
1633
-
1634
- /* We don't look at arg after this point. */
1635
- smp_mb();
1636
- arg->done = true;
16372070
16382071 /*
16392072 * The original target CPU might have gone down and we might
16402073 * be on another CPU but it doesn't matter.
16412074 */
1642
- local_irq_disable();
2075
+ local_irq_save(rf.flags);
16432076 /*
16442077 * We need to explicitly wake pending tasks before running
16452078 * __migrate_task() such that we will not miss enforcing cpus_ptr
16462079 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
16472080 */
1648
- sched_ttwu_pending();
2081
+ flush_smp_call_function_from_idle();
16492082
16502083 raw_spin_lock(&p->pi_lock);
16512084 rq_lock(rq, &rf);
2085
+
16522086 /*
16532087 * If task_rq(p) != rq, it cannot be migrated here, because we're
16542088 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
16552089 * we're holding p->pi_lock.
16562090 */
16572091 if (task_rq(p) == rq) {
1658
- if (task_on_rq_queued(p))
1659
- rq = __migrate_task(rq, &rf, p, dest_cpu);
1660
- else
1661
- p->wake_cpu = dest_cpu;
1662
- }
1663
- rq_unlock(rq, &rf);
1664
- raw_spin_unlock(&p->pi_lock);
2092
+ if (is_migration_disabled(p))
2093
+ goto out;
16652094
1666
- local_irq_enable();
2095
+ if (pending) {
2096
+ if (p->migration_pending == pending)
2097
+ p->migration_pending = NULL;
2098
+ complete = true;
2099
+
2100
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
2101
+ goto out;
2102
+ }
2103
+
2104
+ if (task_on_rq_queued(p))
2105
+ rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
2106
+ else
2107
+ p->wake_cpu = arg->dest_cpu;
2108
+
2109
+ /*
2110
+ * XXX __migrate_task() can fail, at which point we might end
2111
+ * up running on a dodgy CPU, AFAICT this can only happen
2112
+ * during CPU hotplug, at which point we'll get pushed out
2113
+ * anyway, so it's probably not a big deal.
2114
+ */
2115
+
2116
+ } else if (pending) {
2117
+ /*
2118
+ * This happens when we get migrated between migrate_enable()'s
2119
+ * preempt_enable() and scheduling the stopper task. At that
2120
+ * point we're a regular task again and not current anymore.
2121
+ *
2122
+ * A !PREEMPT kernel has a giant hole here, which makes it far
2123
+ * more likely.
2124
+ */
2125
+
2126
+ /*
2127
+ * The task moved before the stopper got to run. We're holding
2128
+ * ->pi_lock, so the allowed mask is stable - if it got
2129
+ * somewhere allowed, we're done.
2130
+ */
2131
+ if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
2132
+ if (p->migration_pending == pending)
2133
+ p->migration_pending = NULL;
2134
+ complete = true;
2135
+ goto out;
2136
+ }
2137
+
2138
+ /*
2139
+ * When migrate_enable() hits a rq mis-match we can't reliably
2140
+ * determine is_migration_disabled() and so have to chase after
2141
+ * it.
2142
+ */
2143
+ WARN_ON_ONCE(!pending->stop_pending);
2144
+ task_rq_unlock(rq, p, &rf);
2145
+ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
2146
+ &pending->arg, &pending->stop_work);
2147
+ return 0;
2148
+ }
2149
+out:
2150
+ if (pending)
2151
+ pending->stop_pending = false;
2152
+ task_rq_unlock(rq, p, &rf);
2153
+
2154
+ if (complete)
2155
+ complete_all(&pending->done);
2156
+
2157
+ return 0;
2158
+}
2159
+
2160
+int push_cpu_stop(void *arg)
2161
+{
2162
+ struct rq *lowest_rq = NULL, *rq = this_rq();
2163
+ struct task_struct *p = arg;
2164
+
2165
+ raw_spin_lock_irq(&p->pi_lock);
2166
+ raw_spin_lock(&rq->lock);
2167
+
2168
+ if (task_rq(p) != rq)
2169
+ goto out_unlock;
2170
+
2171
+ if (is_migration_disabled(p)) {
2172
+ p->migration_flags |= MDF_PUSH;
2173
+ goto out_unlock;
2174
+ }
2175
+
2176
+ p->migration_flags &= ~MDF_PUSH;
2177
+
2178
+ if (p->sched_class->find_lock_rq)
2179
+ lowest_rq = p->sched_class->find_lock_rq(p, rq);
2180
+
2181
+ if (!lowest_rq)
2182
+ goto out_unlock;
2183
+
2184
+ // XXX validate p is still the highest prio task
2185
+ if (task_rq(p) == rq) {
2186
+ deactivate_task(rq, p, 0);
2187
+ set_task_cpu(p, lowest_rq->cpu);
2188
+ activate_task(lowest_rq, p, 0);
2189
+ resched_curr(lowest_rq);
2190
+ }
2191
+
2192
+ double_unlock_balance(rq, lowest_rq);
2193
+
2194
+out_unlock:
2195
+ rq->push_busy = false;
2196
+ raw_spin_unlock(&rq->lock);
2197
+ raw_spin_unlock_irq(&p->pi_lock);
2198
+
2199
+ put_task_struct(p);
16672200 return 0;
16682201 }
16692202
....@@ -1671,27 +2204,40 @@
16712204 * sched_class::set_cpus_allowed must do the below, but is not required to
16722205 * actually call this function.
16732206 */
1674
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
2207
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
16752208 {
2209
+ if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
2210
+ p->cpus_ptr = new_mask;
2211
+ return;
2212
+ }
2213
+
16762214 cpumask_copy(&p->cpus_mask, new_mask);
1677
- if (p->cpus_ptr == &p->cpus_mask)
1678
- p->nr_cpus_allowed = cpumask_weight(new_mask);
2215
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
2216
+ trace_android_rvh_set_cpus_allowed_comm(p, new_mask);
16792217 }
16802218
1681
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
1682
-int __migrate_disabled(struct task_struct *p)
1683
-{
1684
- return p->migrate_disable;
1685
-}
1686
-EXPORT_SYMBOL_GPL(__migrate_disabled);
1687
-#endif
1688
-
1689
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2219
+static void
2220
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
16902221 {
16912222 struct rq *rq = task_rq(p);
16922223 bool queued, running;
16932224
1694
- lockdep_assert_held(&p->pi_lock);
2225
+ /*
2226
+ * This here violates the locking rules for affinity, since we're only
2227
+ * supposed to change these variables while holding both rq->lock and
2228
+ * p->pi_lock.
2229
+ *
2230
+ * HOWEVER, it magically works, because ttwu() is the only code that
2231
+ * accesses these variables under p->pi_lock and only does so after
2232
+ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
2233
+ * before finish_task().
2234
+ *
2235
+ * XXX do further audits, this smells like something putrid.
2236
+ */
2237
+ if (flags & SCA_MIGRATE_DISABLE)
2238
+ SCHED_WARN_ON(!p->on_cpu);
2239
+ else
2240
+ lockdep_assert_held(&p->pi_lock);
16952241
16962242 queued = task_on_rq_queued(p);
16972243 running = task_current(rq, p);
....@@ -1707,12 +2253,312 @@
17072253 if (running)
17082254 put_prev_task(rq, p);
17092255
1710
- p->sched_class->set_cpus_allowed(p, new_mask);
2256
+ p->sched_class->set_cpus_allowed(p, new_mask, flags);
17112257
17122258 if (queued)
17132259 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
17142260 if (running)
1715
- set_curr_task(rq, p);
2261
+ set_next_task(rq, p);
2262
+}
2263
+
2264
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
2265
+ int dest_cpu, unsigned int flags);
2266
+/*
2267
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
2268
+ */
2269
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
2270
+ const struct cpumask *new_mask,
2271
+ u32 flags,
2272
+ struct rq *rq,
2273
+ struct rq_flags *rf)
2274
+{
2275
+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
2276
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
2277
+ unsigned int dest_cpu;
2278
+ int ret = 0;
2279
+
2280
+ update_rq_clock(rq);
2281
+
2282
+ if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
2283
+ /*
2284
+ * Kernel threads are allowed on online && !active CPUs.
2285
+ *
2286
+ * Specifically, migration_disabled() tasks must not fail the
2287
+ * cpumask_any_and_distribute() pick below, esp. so on
2288
+ * SCA_MIGRATE_ENABLE, otherwise we'll not call
2289
+ * set_cpus_allowed_common() and actually reset p->cpus_ptr.
2290
+ */
2291
+ cpu_valid_mask = cpu_online_mask;
2292
+ } else if (!cpumask_subset(new_mask, cpu_allowed_mask)) {
2293
+ ret = -EINVAL;
2294
+ goto out;
2295
+ }
2296
+
2297
+ /*
2298
+ * Must re-check here, to close a race against __kthread_bind(),
2299
+ * sched_setaffinity() is not guaranteed to observe the flag.
2300
+ */
2301
+ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
2302
+ ret = -EINVAL;
2303
+ goto out;
2304
+ }
2305
+
2306
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
2307
+ if (cpumask_equal(&p->cpus_mask, new_mask))
2308
+ goto out;
2309
+
2310
+ if (WARN_ON_ONCE(p == current &&
2311
+ is_migration_disabled(p) &&
2312
+ !cpumask_test_cpu(task_cpu(p), new_mask))) {
2313
+ ret = -EBUSY;
2314
+ goto out;
2315
+ }
2316
+ }
2317
+
2318
+ /*
2319
+ * Picking a ~random cpu helps in cases where we are changing affinity
2320
+ * for groups of tasks (ie. cpuset), so that load balancing is not
2321
+ * immediately required to distribute the tasks within their new mask.
2322
+ */
2323
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
2324
+ if (dest_cpu >= nr_cpu_ids) {
2325
+ ret = -EINVAL;
2326
+ goto out;
2327
+ }
2328
+
2329
+ __do_set_cpus_allowed(p, new_mask, flags);
2330
+
2331
+ if (p->flags & PF_KTHREAD) {
2332
+ /*
2333
+ * For kernel threads that do indeed end up on online &&
2334
+ * !active we want to ensure they are strict per-CPU threads.
2335
+ */
2336
+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
2337
+ !cpumask_intersects(new_mask, cpu_active_mask) &&
2338
+ p->nr_cpus_allowed != 1);
2339
+ }
2340
+
2341
+ return affine_move_task(rq, p, rf, dest_cpu, flags);
2342
+out:
2343
+ task_rq_unlock(rq, p, rf);
2344
+
2345
+ return ret;
2346
+}
2347
+
2348
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2349
+{
2350
+ __do_set_cpus_allowed(p, new_mask, 0);
2351
+}
2352
+
2353
+/*
2354
+ * This function is wildly self concurrent; here be dragons.
2355
+ *
2356
+ *
2357
+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the
2358
+ * designated task is enqueued on an allowed CPU. If that task is currently
2359
+ * running, we have to kick it out using the CPU stopper.
2360
+ *
2361
+ * Migrate-Disable comes along and tramples all over our nice sandcastle.
2362
+ * Consider:
2363
+ *
2364
+ * Initial conditions: P0->cpus_mask = [0, 1]
2365
+ *
2366
+ * P0@CPU0 P1
2367
+ *
2368
+ * migrate_disable();
2369
+ * <preempted>
2370
+ * set_cpus_allowed_ptr(P0, [1]);
2371
+ *
2372
+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
2373
+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
2374
+ * This means we need the following scheme:
2375
+ *
2376
+ * P0@CPU0 P1
2377
+ *
2378
+ * migrate_disable();
2379
+ * <preempted>
2380
+ * set_cpus_allowed_ptr(P0, [1]);
2381
+ * <blocks>
2382
+ * <resumes>
2383
+ * migrate_enable();
2384
+ * __set_cpus_allowed_ptr();
2385
+ * <wakes local stopper>
2386
+ * `--> <woken on migration completion>
2387
+ *
2388
+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
2389
+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
2390
+ * task p are serialized by p->pi_lock, which we can leverage: the one that
2391
+ * should come into effect at the end of the Migrate-Disable region is the last
2392
+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
2393
+ * but we still need to properly signal those waiting tasks at the appropriate
2394
+ * moment.
2395
+ *
2396
+ * This is implemented using struct set_affinity_pending. The first
2397
+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
2398
+ * setup an instance of that struct and install it on the targeted task_struct.
2399
+ * Any and all further callers will reuse that instance. Those then wait for
2400
+ * a completion signaled at the tail of the CPU stopper callback (1), triggered
2401
+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
2402
+ *
2403
+ *
2404
+ * (1) In the cases covered above. There is one more where the completion is
2405
+ * signaled within affine_move_task() itself: when a subsequent affinity request
2406
+ * cancels the need for an active migration. Consider:
2407
+ *
2408
+ * Initial conditions: P0->cpus_mask = [0, 1]
2409
+ *
2410
+ * P0@CPU0 P1 P2
2411
+ *
2412
+ * migrate_disable();
2413
+ * <preempted>
2414
+ * set_cpus_allowed_ptr(P0, [1]);
2415
+ * <blocks>
2416
+ * set_cpus_allowed_ptr(P0, [0, 1]);
2417
+ * <signal completion>
2418
+ * <awakes>
2419
+ *
2420
+ * Note that the above is safe vs a concurrent migrate_enable(), as any
2421
+ * pending affinity completion is preceded by an uninstallation of
2422
+ * p->migration_pending done with p->pi_lock held.
2423
+ */
2424
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
2425
+ int dest_cpu, unsigned int flags)
2426
+{
2427
+ struct set_affinity_pending my_pending = { }, *pending = NULL;
2428
+ bool stop_pending, complete = false;
2429
+
2430
+ /* Can the task run on the task's current CPU? If so, we're done */
2431
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
2432
+ struct task_struct *push_task = NULL;
2433
+
2434
+ if ((flags & SCA_MIGRATE_ENABLE) &&
2435
+ (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
2436
+ rq->push_busy = true;
2437
+ push_task = get_task_struct(p);
2438
+ }
2439
+
2440
+ /*
2441
+ * If there are pending waiters, but no pending stop_work,
2442
+ * then complete now.
2443
+ */
2444
+ pending = p->migration_pending;
2445
+ if (pending && !pending->stop_pending) {
2446
+ p->migration_pending = NULL;
2447
+ complete = true;
2448
+ }
2449
+
2450
+ task_rq_unlock(rq, p, rf);
2451
+
2452
+ if (push_task) {
2453
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2454
+ p, &rq->push_work);
2455
+ }
2456
+
2457
+ if (complete)
2458
+ complete_all(&pending->done);
2459
+
2460
+ return 0;
2461
+ }
2462
+
2463
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
2464
+ /* serialized by p->pi_lock */
2465
+ if (!p->migration_pending) {
2466
+ /* Install the request */
2467
+ refcount_set(&my_pending.refs, 1);
2468
+ init_completion(&my_pending.done);
2469
+ my_pending.arg = (struct migration_arg) {
2470
+ .task = p,
2471
+ .dest_cpu = dest_cpu,
2472
+ .pending = &my_pending,
2473
+ };
2474
+
2475
+ p->migration_pending = &my_pending;
2476
+ } else {
2477
+ pending = p->migration_pending;
2478
+ refcount_inc(&pending->refs);
2479
+ /*
2480
+ * Affinity has changed, but we've already installed a
2481
+ * pending. migration_cpu_stop() *must* see this, else
2482
+ * we risk a completion of the pending despite having a
2483
+ * task on a disallowed CPU.
2484
+ *
2485
+ * Serialized by p->pi_lock, so this is safe.
2486
+ */
2487
+ pending->arg.dest_cpu = dest_cpu;
2488
+ }
2489
+ }
2490
+ pending = p->migration_pending;
2491
+ /*
2492
+ * - !MIGRATE_ENABLE:
2493
+ * we'll have installed a pending if there wasn't one already.
2494
+ *
2495
+ * - MIGRATE_ENABLE:
2496
+ * we're here because the current CPU isn't matching anymore,
2497
+ * the only way that can happen is because of a concurrent
2498
+ * set_cpus_allowed_ptr() call, which should then still be
2499
+ * pending completion.
2500
+ *
2501
+ * Either way, we really should have a @pending here.
2502
+ */
2503
+ if (WARN_ON_ONCE(!pending)) {
2504
+ task_rq_unlock(rq, p, rf);
2505
+ return -EINVAL;
2506
+ }
2507
+
2508
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
2509
+ /*
2510
+ * MIGRATE_ENABLE gets here because 'p == current', but for
2511
+ * anything else we cannot do is_migration_disabled(), punt
2512
+ * and have the stopper function handle it all race-free.
2513
+ */
2514
+ stop_pending = pending->stop_pending;
2515
+ if (!stop_pending)
2516
+ pending->stop_pending = true;
2517
+
2518
+ if (flags & SCA_MIGRATE_ENABLE)
2519
+ p->migration_flags &= ~MDF_PUSH;
2520
+
2521
+ task_rq_unlock(rq, p, rf);
2522
+
2523
+ if (!stop_pending) {
2524
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
2525
+ &pending->arg, &pending->stop_work);
2526
+ }
2527
+
2528
+ if (flags & SCA_MIGRATE_ENABLE)
2529
+ return 0;
2530
+ } else {
2531
+
2532
+ if (!is_migration_disabled(p)) {
2533
+ if (task_on_rq_queued(p))
2534
+ rq = move_queued_task(rq, rf, p, dest_cpu);
2535
+
2536
+ if (!pending->stop_pending) {
2537
+ p->migration_pending = NULL;
2538
+ complete = true;
2539
+ }
2540
+ }
2541
+ task_rq_unlock(rq, p, rf);
2542
+
2543
+ if (complete)
2544
+ complete_all(&pending->done);
2545
+ }
2546
+
2547
+ wait_for_completion(&pending->done);
2548
+
2549
+ if (refcount_dec_and_test(&pending->refs))
2550
+ wake_up_var(&pending->refs); /* No UaF, just an address */
2551
+
2552
+ /*
2553
+ * Block the original owner of &pending until all subsequent callers
2554
+ * have seen the completion and decremented the refcount
2555
+ */
2556
+ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
2557
+
2558
+ /* ARGH */
2559
+ WARN_ON_ONCE(my_pending.stop_pending);
2560
+
2561
+ return 0;
17162562 }
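
The completion-based handshake above is only visible indirectly from the rest of the kernel: a migrate-disabled region delays any concurrent affinity change until the outermost migrate_enable(). A hedged sketch of the two sides, assuming the migrate_disable()/migrate_enable() API this series backports; the per-CPU counter, the demo_* names and the choice of CPU 1 are illustrative only:

#include <linux/preempt.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>

static DEFINE_PER_CPU(u64, demo_hits);

/* P0: touch per-CPU state without being migrated away mid-update. */
static void demo_migrate_disabled_region(void)
{
	migrate_disable();
	this_cpu_inc(demo_hits);	/* stays on this CPU until ...        */
	migrate_enable();		/* ... here, where a pending affinity
					 * change (and its stopper work) fires */
}

/* P1: does not return before P0 has left its migrate-disabled region. */
static int demo_move_to_cpu1(struct task_struct *p)
{
	return set_cpus_allowed_ptr(p, cpumask_of(1));
}
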
17172563
17182564 /*
....@@ -1725,84 +2571,89 @@
17252571 * call is not atomic; no spinlocks may be held.
17262572 */
17272573 static int __set_cpus_allowed_ptr(struct task_struct *p,
1728
- const struct cpumask *new_mask, bool check)
2574
+ const struct cpumask *new_mask,
2575
+ u32 flags)
17292576 {
1730
- const struct cpumask *cpu_valid_mask = cpu_active_mask;
1731
- unsigned int dest_cpu;
17322577 struct rq_flags rf;
17332578 struct rq *rq;
1734
- int ret = 0;
17352579
17362580 rq = task_rq_lock(p, &rf);
1737
- update_rq_clock(rq);
1738
-
1739
- if (p->flags & PF_KTHREAD) {
1740
- /*
1741
- * Kernel threads are allowed on online && !active CPUs
1742
- */
1743
- cpu_valid_mask = cpu_online_mask;
1744
- }
1745
-
1746
- /*
1747
- * Must re-check here, to close a race against __kthread_bind(),
1748
- * sched_setaffinity() is not guaranteed to observe the flag.
1749
- */
1750
- if (check && (p->flags & PF_NO_SETAFFINITY)) {
1751
- ret = -EINVAL;
1752
- goto out;
1753
- }
1754
-
1755
- if (cpumask_equal(&p->cpus_mask, new_mask))
1756
- goto out;
1757
-
1758
- dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
1759
- if (dest_cpu >= nr_cpu_ids) {
1760
- ret = -EINVAL;
1761
- goto out;
1762
- }
1763
-
1764
- do_set_cpus_allowed(p, new_mask);
1765
-
1766
- if (p->flags & PF_KTHREAD) {
1767
- /*
1768
- * For kernel threads that do indeed end up on online &&
1769
- * !active we want to ensure they are strict per-CPU threads.
1770
- */
1771
- WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
1772
- !cpumask_intersects(new_mask, cpu_active_mask) &&
1773
- p->nr_cpus_allowed != 1);
1774
- }
1775
-
1776
- /* Can the task run on the task's current CPU? If so, we're done */
1777
- if (cpumask_test_cpu(task_cpu(p), new_mask) ||
1778
- p->cpus_ptr != &p->cpus_mask)
1779
- goto out;
1780
-
1781
- if (task_running(rq, p) || p->state == TASK_WAKING) {
1782
- struct migration_arg arg = { p, dest_cpu };
1783
- /* Need help from migration thread: drop lock and wait. */
1784
- task_rq_unlock(rq, p, &rf);
1785
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1786
- tlb_migrate_finish(p->mm);
1787
- return 0;
1788
- } else if (task_on_rq_queued(p)) {
1789
- /*
1790
- * OK, since we're going to drop the lock immediately
1791
- * afterwards anyway.
1792
- */
1793
- rq = move_queued_task(rq, &rf, p, dest_cpu);
1794
- }
1795
-out:
1796
- task_rq_unlock(rq, p, &rf);
1797
-
1798
- return ret;
2581
+ return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
17992582 }
18002583
18012584 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
18022585 {
1803
- return __set_cpus_allowed_ptr(p, new_mask, false);
2586
+ return __set_cpus_allowed_ptr(p, new_mask, 0);
18042587 }
18052588 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
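
For completeness, the same request as seen from userspace: sched_setaffinity(2) funnels into this path with SCA_CHECK set, so an empty or disallowed mask comes back as -EINVAL. A small hedged userspace probe; the choice of CPU 1 is arbitrary:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(1, &set);		/* ask to run only on CPU 1 */

	if (sched_setaffinity(0, sizeof(set), &set))
		fprintf(stderr, "sched_setaffinity: %s\n", strerror(errno));
	else
		printf("now restricted to CPU 1\n");

	return 0;
}
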
2589
+
2590
+/*
2591
+ * Change a given task's CPU affinity to the intersection of its current
2592
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
2593
+ * If the resulting mask is empty, leave the affinity unchanged and return
2594
+ * -EINVAL.
2595
+ */
2596
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
2597
+ struct cpumask *new_mask,
2598
+ const struct cpumask *subset_mask)
2599
+{
2600
+ struct rq_flags rf;
2601
+ struct rq *rq;
2602
+
2603
+ rq = task_rq_lock(p, &rf);
2604
+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
2605
+ task_rq_unlock(rq, p, &rf);
2606
+ return -EINVAL;
2607
+ }
2608
+
2609
+ return __set_cpus_allowed_ptr_locked(p, new_mask, false, rq, &rf);
2610
+}
2611
+
2612
+/*
2613
+ * Restrict a given task's CPU affinity so that it is a subset of
2614
+ * task_cpu_possible_mask(). If the resulting mask is empty, we warn and
2615
+ * walk up the cpuset hierarchy until we find a suitable mask.
2616
+ */
2617
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
2618
+{
2619
+ cpumask_var_t new_mask;
2620
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
2621
+
2622
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
2623
+
2624
+ /*
2625
+ * __migrate_task() can fail silently in the face of concurrent
2626
+ * offlining of the chosen destination CPU, so take the hotplug
2627
+ * lock to ensure that the migration succeeds.
2628
+ */
2629
+ trace_android_rvh_force_compatible_pre(NULL);
2630
+ cpus_read_lock();
2631
+ if (!cpumask_available(new_mask))
2632
+ goto out_set_mask;
2633
+
2634
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
2635
+ goto out_free_mask;
2636
+
2637
+ /*
2638
+ * We failed to find a valid subset of the affinity mask for the
2639
+ * task, so override it based on its cpuset hierarchy.
2640
+ */
2641
+ cpuset_cpus_allowed(p, new_mask);
2642
+ override_mask = new_mask;
2643
+
2644
+out_set_mask:
2645
+ if (printk_ratelimit()) {
2646
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
2647
+ task_pid_nr(p), p->comm,
2648
+ cpumask_pr_args(override_mask));
2649
+ }
2650
+
2651
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
2652
+out_free_mask:
2653
+ cpus_read_unlock();
2654
+ trace_android_rvh_force_compatible_post(NULL);
2655
+ free_cpumask_var(new_mask);
2656
+}
18062657
18072658 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
18082659 {
....@@ -1841,6 +2692,8 @@
18412692 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
18422693 */
18432694 WARN_ON_ONCE(!cpu_online(new_cpu));
2695
+
2696
+ WARN_ON_ONCE(is_migration_disabled(p));
18442697 #endif
18452698
18462699 trace_sched_migrate_task(p, new_cpu);
....@@ -1851,12 +2704,13 @@
18512704 p->se.nr_migrations++;
18522705 rseq_migrate(p);
18532706 perf_event_task_migrate(p);
2707
+ trace_android_rvh_set_task_cpu(p, new_cpu);
18542708 }
18552709
18562710 __set_task_cpu(p, new_cpu);
18572711 }
2712
+EXPORT_SYMBOL_GPL(set_task_cpu);
18582713
1859
-#ifdef CONFIG_NUMA_BALANCING
18602714 static void __migrate_swap_task(struct task_struct *p, int cpu)
18612715 {
18622716 if (task_on_rq_queued(p)) {
....@@ -1869,11 +2723,9 @@
18692723 rq_pin_lock(src_rq, &srf);
18702724 rq_pin_lock(dst_rq, &drf);
18712725
1872
- p->on_rq = TASK_ON_RQ_MIGRATING;
18732726 deactivate_task(src_rq, p, 0);
18742727 set_task_cpu(p, cpu);
18752728 activate_task(dst_rq, p, 0);
1876
- p->on_rq = TASK_ON_RQ_QUEUED;
18772729 check_preempt_curr(dst_rq, p, 0);
18782730
18792731 rq_unpin_lock(dst_rq, &drf);
....@@ -1973,7 +2825,7 @@
19732825 out:
19742826 return ret;
19752827 }
1976
-#endif /* CONFIG_NUMA_BALANCING */
2828
+EXPORT_SYMBOL_GPL(migrate_swap);
19772829
19782830 static bool check_task_state(struct task_struct *p, long match_state)
19792831 {
....@@ -2081,7 +2933,7 @@
20812933 ktime_t to = NSEC_PER_SEC / HZ;
20822934
20832935 set_current_state(TASK_UNINTERRUPTIBLE);
2084
- schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2936
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
20852937 continue;
20862938 }
20872939
....@@ -2148,7 +3000,11 @@
21483000 int nid = cpu_to_node(cpu);
21493001 const struct cpumask *nodemask = NULL;
21503002 enum { cpuset, possible, fail } state = cpuset;
2151
- int dest_cpu;
3003
+ int dest_cpu = -1;
3004
+
3005
+ trace_android_rvh_select_fallback_rq(cpu, p, &dest_cpu);
3006
+ if (dest_cpu >= 0)
3007
+ return dest_cpu;
21523008
21533009 /*
21543010 * If the node that the CPU is on has been offlined, cpu_to_node()
....@@ -2160,9 +3016,7 @@
21603016
21613017 /* Look for allowed, online CPU in same node. */
21623018 for_each_cpu(dest_cpu, nodemask) {
2163
- if (!cpu_active(dest_cpu))
2164
- continue;
2165
- if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
3019
+ if (is_cpu_allowed(p, dest_cpu))
21663020 return dest_cpu;
21673021 }
21683022 }
....@@ -2184,12 +3038,17 @@
21843038 state = possible;
21853039 break;
21863040 }
2187
- /* Fall-through */
3041
+ fallthrough;
21883042 case possible:
2189
- do_set_cpus_allowed(p, cpu_possible_mask);
3043
+ /*
3044
+ * XXX When called from select_task_rq() we only
3045
+ * hold p->pi_lock and again violate locking order.
3046
+ *
3047
+ * More yuck to audit.
3048
+ */
3049
+ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
21903050 state = fail;
21913051 break;
2192
-
21933052 case fail:
21943053 BUG();
21953054 break;
....@@ -2216,14 +3075,12 @@
22163075 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
22173076 */
22183077 static inline
2219
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
2220
- int sibling_count_hint)
3078
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
22213079 {
22223080 lockdep_assert_held(&p->pi_lock);
22233081
2224
- if (p->nr_cpus_allowed > 1)
2225
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
2226
- sibling_count_hint);
3082
+ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
3083
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
22273084 else
22283085 cpu = cpumask_any(p->cpus_ptr);
22293086
....@@ -2243,14 +3100,9 @@
22433100 return cpu;
22443101 }
22453102
2246
-static void update_avg(u64 *avg, u64 sample)
2247
-{
2248
- s64 diff = sample - *avg;
2249
- *avg += diff >> 3;
2250
-}
2251
-
22523103 void sched_set_stop_task(int cpu, struct task_struct *stop)
22533104 {
3105
+ static struct lock_class_key stop_pi_lock;
22543106 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
22553107 struct task_struct *old_stop = cpu_rq(cpu)->stop;
22563108
....@@ -2266,6 +3118,20 @@
22663118 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
22673119
22683120 stop->sched_class = &stop_sched_class;
3121
+
3122
+ /*
3123
+ * The PI code calls rt_mutex_setprio() with ->pi_lock held to
3124
+ * adjust the effective priority of a task. As a result,
3125
+ * rt_mutex_setprio() can trigger (RT) balancing operations,
3126
+ * which can then trigger wakeups of the stop thread to push
3127
+ * around the current task.
3128
+ *
3129
+ * The stop task itself will never be part of the PI-chain, it
3130
+ * never blocks, therefore that ->pi_lock recursion is safe.
3131
+ * Tell lockdep about this by placing the stop->pi_lock in its
3132
+ * own class.
3133
+ */
3134
+ lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
22693135 }
22703136
22713137 cpu_rq(cpu)->stop = stop;
....@@ -2279,15 +3145,23 @@
22793145 }
22803146 }
22813147
2282
-#else
3148
+#else /* CONFIG_SMP */
22833149
22843150 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
2285
- const struct cpumask *new_mask, bool check)
3151
+ const struct cpumask *new_mask,
3152
+ u32 flags)
22863153 {
22873154 return set_cpus_allowed_ptr(p, new_mask);
22883155 }
22893156
2290
-#endif /* CONFIG_SMP */
3157
+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
3158
+
3159
+static inline bool rq_has_pinned_tasks(struct rq *rq)
3160
+{
3161
+ return false;
3162
+}
3163
+
3164
+#endif /* !CONFIG_SMP */
22913165
22923166 static void
22933167 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
....@@ -2326,12 +3200,6 @@
23263200
23273201 if (wake_flags & WF_SYNC)
23283202 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
2329
-}
2330
-
2331
-static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2332
-{
2333
- activate_task(rq, p, en_flags);
2334
- p->on_rq = TASK_ON_RQ_QUEUED;
23353203 }
23363204
23373205 /*
....@@ -2375,27 +3243,54 @@
23753243 {
23763244 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
23773245
3246
+ if (wake_flags & WF_SYNC)
3247
+ en_flags |= ENQUEUE_WAKEUP_SYNC;
3248
+
23783249 lockdep_assert_held(&rq->lock);
23793250
2380
-#ifdef CONFIG_SMP
23813251 if (p->sched_contributes_to_load)
23823252 rq->nr_uninterruptible--;
23833253
3254
+#ifdef CONFIG_SMP
23843255 if (wake_flags & WF_MIGRATED)
23853256 en_flags |= ENQUEUE_MIGRATED;
3257
+ else
23863258 #endif
3259
+ if (p->in_iowait) {
3260
+ delayacct_blkio_end(p);
3261
+ atomic_dec(&task_rq(p)->nr_iowait);
3262
+ }
23873263
2388
- ttwu_activate(rq, p, en_flags);
3264
+ activate_task(rq, p, en_flags);
23893265 ttwu_do_wakeup(rq, p, wake_flags, rf);
23903266 }
23913267
23923268 /*
2393
- * Called in case the task @p isn't fully descheduled from its runqueue,
2394
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2395
- * since all we need to do is flip p->state to TASK_RUNNING, since
2396
- * the task is still ->on_rq.
3269
+ * Consider @p being inside a wait loop:
3270
+ *
3271
+ * for (;;) {
3272
+ * set_current_state(TASK_UNINTERRUPTIBLE);
3273
+ *
3274
+ * if (CONDITION)
3275
+ * break;
3276
+ *
3277
+ * schedule();
3278
+ * }
3279
+ * __set_current_state(TASK_RUNNING);
3280
+ *
3281
+ * between set_current_state() and schedule(). In this case @p is still
3282
+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
3283
+ * an atomic manner.
3284
+ *
3285
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
3286
+ * then schedule() must still happen and p->state can be changed to
3287
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
3288
+ * need to do a full wakeup with enqueue.
3289
+ *
3290
+ * Returns: %true when the wakeup is done,
3291
+ * %false otherwise.
23973292 */
2398
-static int ttwu_remote(struct task_struct *p, int wake_flags)
3293
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
23993294 {
24003295 struct rq_flags rf;
24013296 struct rq *rq;
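
The wait-loop comment above describes the usual open-coded sleep/wake pairing that ttwu_runnable() has to race against. A minimal hedged sketch of both halves; the demo_* names and flag are invented, and real code would normally use wait_event()/wake_up() or a completion and keep a reference on the sleeping task:

#include <linux/err.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static bool demo_done;
static struct task_struct *demo_waiter;

static int demo_wait_fn(void *unused)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	while (!READ_ONCE(demo_done)) {
		schedule();			/* a racing wakeup lands in
						 * ttwu_runnable() right here */
		set_current_state(TASK_UNINTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static void demo_complete(void)
{
	WRITE_ONCE(demo_done, true);	/* CONDITION = 1 ...                  */
	wake_up_process(demo_waiter);	/* ... then try_to_wake_up(TASK_NORMAL) */
}

static int __init demo_init(void)
{
	demo_waiter = kthread_run(demo_wait_fn, NULL, "demo_waiter");
	return PTR_ERR_OR_ZERO(demo_waiter);
}
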
....@@ -2414,75 +3309,63 @@
24143309 }
24153310
24163311 #ifdef CONFIG_SMP
2417
-void sched_ttwu_pending(void)
3312
+void sched_ttwu_pending(void *arg)
24183313 {
3314
+ struct llist_node *llist = arg;
24193315 struct rq *rq = this_rq();
2420
- struct llist_node *llist = llist_del_all(&rq->wake_list);
24213316 struct task_struct *p, *t;
24223317 struct rq_flags rf;
24233318
24243319 if (!llist)
24253320 return;
24263321
3322
+ /*
3323
+ * rq::ttwu_pending racy indication of out-standing wakeups.
3324
+ * Races such that false-negatives are possible, since they
3325
+ * are shorter lived than false-positives would be.
3326
+ */
3327
+ WRITE_ONCE(rq->ttwu_pending, 0);
3328
+
24273329 rq_lock_irqsave(rq, &rf);
24283330 update_rq_clock(rq);
24293331
2430
- llist_for_each_entry_safe(p, t, llist, wake_entry)
3332
+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3333
+ if (WARN_ON_ONCE(p->on_cpu))
3334
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
3335
+
3336
+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3337
+ set_task_cpu(p, cpu_of(rq));
3338
+
24313339 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
3340
+ }
24323341
24333342 rq_unlock_irqrestore(rq, &rf);
24343343 }
24353344
2436
-void scheduler_ipi(void)
3345
+void send_call_function_single_ipi(int cpu)
24373346 {
2438
- /*
2439
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
2440
- * TIF_NEED_RESCHED remotely (for the first time) will also send
2441
- * this IPI.
2442
- */
2443
- preempt_fold_need_resched();
3347
+ struct rq *rq = cpu_rq(cpu);
24443348
2445
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2446
- return;
2447
-
2448
- /*
2449
- * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2450
- * traditionally all their work was done from the interrupt return
2451
- * path. Now that we actually do some work, we need to make sure
2452
- * we do call them.
2453
- *
2454
- * Some archs already do call them, luckily irq_enter/exit nest
2455
- * properly.
2456
- *
2457
- * Arguably we should visit all archs and update all handlers,
2458
- * however a fair share of IPIs are still resched only so this would
2459
- * somewhat pessimize the simple resched case.
2460
- */
2461
- irq_enter();
2462
- sched_ttwu_pending();
2463
-
2464
- /*
2465
- * Check if someone kicked us for doing the nohz idle load balance.
2466
- */
2467
- if (unlikely(got_nohz_idle_kick())) {
2468
- this_rq()->idle_balance = 1;
2469
- raise_softirq_irqoff(SCHED_SOFTIRQ);
2470
- }
2471
- irq_exit();
3349
+ if (!set_nr_if_polling(rq->idle))
3350
+ arch_send_call_function_single_ipi(cpu);
3351
+ else
3352
+ trace_sched_wake_idle_without_ipi(cpu);
24723353 }
24733354
2474
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
3355
+/*
3356
+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
3357
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
3358
+ * via sched_ttwu_pending() for activation so the wakee incurs the cost
3359
+ * of the wakeup instead of the waker.
3360
+ */
3361
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
24753362 {
24763363 struct rq *rq = cpu_rq(cpu);
24773364
24783365 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
24793366
2480
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
2481
- if (!set_nr_if_polling(rq->idle))
2482
- smp_send_reschedule(cpu);
2483
- else
2484
- trace_sched_wake_idle_without_ipi(cpu);
2485
- }
3367
+ WRITE_ONCE(rq->ttwu_pending, 1);
3368
+ __smp_call_single_queue(cpu, &p->wake_entry.llist);
24863369 }
24873370
24883371 void wake_up_if_idle(int cpu)
....@@ -2508,6 +3391,7 @@
25083391 out:
25093392 rcu_read_unlock();
25103393 }
3394
+EXPORT_SYMBOL_GPL(wake_up_if_idle);
25113395
25123396 bool cpus_share_cache(int this_cpu, int that_cpu)
25133397 {
....@@ -2516,6 +3400,58 @@
25163400
25173401 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
25183402 }
3403
+
3404
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
3405
+{
3406
+ /*
3407
+ * If the CPU does not share cache, then queue the task on the
3408
+ * remote rqs wakelist to avoid accessing remote data.
3409
+ */
3410
+ if (!cpus_share_cache(smp_processor_id(), cpu))
3411
+ return true;
3412
+
3413
+ /*
3414
+ * If the task is descheduling and the only running task on the
3415
+ * CPU then use the wakelist to offload the task activation to
3416
+ * the soon-to-be-idle CPU as the current CPU is likely busy.
3417
+ * nr_running is checked to avoid unnecessary task stacking.
3418
+ *
3419
+ * Note that we can only get here with (wakee) p->on_rq=0,
3420
+ * p->on_cpu can be whatever, we've done the dequeue, so
3421
+ * the wakee has been accounted out of ->nr_running.
3422
+ */
3423
+ if ((wake_flags & WF_ON_CPU) && !cpu_rq(cpu)->nr_running)
3424
+ return true;
3425
+
3426
+ return false;
3427
+}
3428
+
3429
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3430
+{
3431
+ bool cond = false;
3432
+
3433
+ trace_android_rvh_ttwu_cond(&cond);
3434
+
3435
+ if ((sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) ||
3436
+ cond) {
3437
+ if (WARN_ON_ONCE(cpu == smp_processor_id()))
3438
+ return false;
3439
+
3440
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
3441
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
3442
+ return true;
3443
+ }
3444
+
3445
+ return false;
3446
+}
3447
+
3448
+#else /* !CONFIG_SMP */
3449
+
3450
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3451
+{
3452
+ return false;
3453
+}
3454
+
25193455 #endif /* CONFIG_SMP */
25203456
25213457 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
....@@ -2523,13 +3459,8 @@
25233459 struct rq *rq = cpu_rq(cpu);
25243460 struct rq_flags rf;
25253461
2526
-#if defined(CONFIG_SMP)
2527
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
2528
- sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2529
- ttwu_queue_remote(p, cpu, wake_flags);
3462
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
25303463 return;
2531
- }
2532
-#endif
25333464
25343465 rq_lock(rq, &rf);
25353466 update_rq_clock(rq);
....@@ -2585,8 +3516,8 @@
25853516 * migration. However the means are completely different as there is no lock
25863517 * chain to provide order. Instead we do:
25873518 *
2588
- * 1) smp_store_release(X->on_cpu, 0)
2589
- * 2) smp_cond_load_acquire(!X->on_cpu)
3519
+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
3520
+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
25903521 *
25913522 * Example:
25923523 *
....@@ -2625,34 +3556,72 @@
26253556 * @p: the thread to be awakened
26263557 * @state: the mask of task states that can be woken
26273558 * @wake_flags: wake modifier flags (WF_*)
2628
- * @sibling_count_hint: A hint at the number of threads that are being woken up
2629
- * in this event.
26303559 *
2631
- * If (@state & @p->state) @p->state = TASK_RUNNING.
3560
+ * Conceptually does:
3561
+ *
3562
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
26323563 *
26333564 * If the task was not queued/runnable, also place it back on a runqueue.
26343565 *
2635
- * Atomic against schedule() which would dequeue a task, also see
2636
- * set_current_state().
3566
+ * This function is atomic against schedule() which would dequeue the task.
26373567 *
2638
- * This function executes a full memory barrier before accessing the task
2639
- * state; see set_current_state().
3568
+ * It issues a full memory barrier before accessing @p->state, see the comment
3569
+ * with set_current_state().
3570
+ *
3571
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
3572
+ *
3573
+ * Relies on p->pi_lock stabilizing:
3574
+ * - p->sched_class
3575
+ * - p->cpus_ptr
3576
+ * - p->sched_task_group
3577
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3578
+ *
3579
+ * Tries really hard to only take one task_rq(p)->lock for performance.
3580
+ * Takes rq->lock in:
3581
+ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
3582
+ * - ttwu_queue() -- new rq, for enqueue of the task;
3583
+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3584
+ *
3585
+ * As a consequence we race really badly with just about everything. See the
3586
+ * many memory barriers and their comments for details.
26403587 *
26413588 * Return: %true if @p->state changes (an actual wakeup was done),
26423589 * %false otherwise.
26433590 */
26443591 static int
2645
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
2646
- int sibling_count_hint)
3592
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
26473593 {
26483594 unsigned long flags;
26493595 int cpu, success = 0;
26503596
3597
+ preempt_disable();
3598
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && p == current) {
3599
+ /*
3600
+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
3601
+ * == smp_processor_id()'. Together this means we can special
3602
+ * case the whole 'p->on_rq && ttwu_runnable()' case below
3603
+ * without taking any locks.
3604
+ *
3605
+ * In particular:
3606
+ * - we rely on Program-Order guarantees for all the ordering,
3607
+ * - we're serialized against set_special_state() by virtue of
3608
+ * it disabling IRQs (this allows not taking ->pi_lock).
3609
+ */
3610
+ if (!(p->state & state))
3611
+ goto out;
3612
+
3613
+ success = 1;
3614
+ trace_sched_waking(p);
3615
+ p->state = TASK_RUNNING;
3616
+ trace_sched_wakeup(p);
3617
+ goto out;
3618
+ }
3619
+
26513620 /*
26523621 * If we are going to wake up a thread waiting for CONDITION we
26533622 * need to ensure that CONDITION=1 done by the caller can not be
2654
- * reordered with p->state check below. This pairs with mb() in
2655
- * set_current_state() the waiting thread does.
3623
+ * reordered with p->state check below. This pairs with smp_store_mb()
3624
+ * in set_current_state() that the waiting thread does.
26563625 */
26573626 raw_spin_lock_irqsave(&p->pi_lock, flags);
26583627 smp_mb__after_spinlock();
....@@ -2668,9 +3637,8 @@
26683637 success = 1;
26693638 }
26703639 }
2671
- goto out;
3640
+ goto unlock;
26723641 }
2673
-
26743642 /*
26753643 * If this is a regular wakeup, then we can unconditionally
26763644 * clear the saved state of a "lock sleeper".
....@@ -2678,11 +3646,23 @@
26783646 if (!(wake_flags & WF_LOCK_SLEEPER))
26793647 p->saved_state = TASK_RUNNING;
26803648
3649
+#ifdef CONFIG_FREEZER
3650
+ /*
3651
+ * If we're going to wake up a thread which may be frozen, then
3652
+ * we can only do so if we have an active CPU which is capable of
3653
+ * running it. This may not be the case when resuming from suspend,
3654
+ * as the secondary CPUs may not yet be back online. See __thaw_task()
3655
+ * for the actual wakeup.
3656
+ */
3657
+ if (unlikely(frozen_or_skipped(p)) &&
3658
+ !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p)))
3659
+ goto unlock;
3660
+#endif
3661
+
26813662 trace_sched_waking(p);
26823663
26833664 /* We're going to change ->state: */
26843665 success = 1;
2685
- cpu = task_cpu(p);
26863666
26873667 /*
26883668 * Ensure we load p->on_rq _after_ p->state, otherwise it would
....@@ -2703,10 +3683,15 @@
27033683 *
27043684 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
27053685 * __schedule(). See the comment for smp_mb__after_spinlock().
3686
+ *
3687
+ * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
27063688 */
27073689 smp_rmb();
2708
- if (p->on_rq && ttwu_remote(p, wake_flags))
2709
- goto stat;
3690
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3691
+ goto unlock;
3692
+
3693
+ if (p->state & TASK_UNINTERRUPTIBLE)
3694
+ trace_sched_blocked_reason(p);
27103695
27113696 #ifdef CONFIG_SMP
27123697 /*
....@@ -2727,8 +3712,43 @@
27273712 *
27283713 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
27293714 * __schedule(). See the comment for smp_mb__after_spinlock().
3715
+ *
3716
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
3717
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
3718
+ * care about its own p->state. See the comment in __schedule().
27303719 */
2731
- smp_rmb();
3720
+ smp_acquire__after_ctrl_dep();
3721
+
3722
+ /*
3723
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
3724
+ * == 0), which means we need to do an enqueue, change p->state to
3725
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
3726
+ * enqueue, such as ttwu_queue_wakelist().
3727
+ */
3728
+ p->state = TASK_WAKING;
3729
+
3730
+ /*
3731
+ * If the owning (remote) CPU is still in the middle of schedule() with
3732
+ * this task as prev, consider queueing p on the remote CPU's wake_list
3733
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
3734
+ * let the waker make forward progress. This is safe because IRQs are
3735
+ * disabled and the IPI will deliver after on_cpu is cleared.
3736
+ *
3737
+ * Ensure we load task_cpu(p) after p->on_cpu:
3738
+ *
3739
+ * set_task_cpu(p, cpu);
3740
+ * STORE p->cpu = @cpu
3741
+ * __schedule() (switch to task 'p')
3742
+ * LOCK rq->lock
3743
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
3744
+ * STORE p->on_cpu = 1 LOAD p->cpu
3745
+ *
3746
+ * to ensure we observe the correct CPU on which the task is currently
3747
+ * scheduling.
3748
+ */
3749
+ if (smp_load_acquire(&p->on_cpu) &&
3750
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
3751
+ goto unlock;
27323752
27333753 /*
27343754 * If the owning (remote) CPU is still in the middle of schedule() with
....@@ -2741,38 +3761,79 @@
27413761 */
27423762 smp_cond_load_acquire(&p->on_cpu, !VAL);
27433763
2744
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
2745
- p->state = TASK_WAKING;
3764
+ trace_android_rvh_try_to_wake_up(p);
27463765
2747
- if (p->in_iowait) {
2748
- delayacct_blkio_end(p);
2749
- atomic_dec(&task_rq(p)->nr_iowait);
2750
- }
2751
-
2752
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
2753
- sibling_count_hint);
3766
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
27543767 if (task_cpu(p) != cpu) {
3768
+ if (p->in_iowait) {
3769
+ delayacct_blkio_end(p);
3770
+ atomic_dec(&task_rq(p)->nr_iowait);
3771
+ }
3772
+
27553773 wake_flags |= WF_MIGRATED;
27563774 psi_ttwu_dequeue(p);
27573775 set_task_cpu(p, cpu);
27583776 }
2759
-
2760
-#else /* CONFIG_SMP */
2761
-
2762
- if (p->in_iowait) {
2763
- delayacct_blkio_end(p);
2764
- atomic_dec(&task_rq(p)->nr_iowait);
2765
- }
2766
-
3777
+#else
3778
+ cpu = task_cpu(p);
27673779 #endif /* CONFIG_SMP */
27683780
27693781 ttwu_queue(p, cpu, wake_flags);
2770
-stat:
2771
- ttwu_stat(p, cpu, wake_flags);
2772
-out:
3782
+unlock:
27733783 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3784
+out:
3785
+ if (success) {
3786
+ trace_android_rvh_try_to_wake_up_success(p);
3787
+ ttwu_stat(p, task_cpu(p), wake_flags);
3788
+ }
3789
+ preempt_enable();
27743790
27753791 return success;
3792
+}
3793
+
3794
+/**
3795
+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
3796
+ * @p: Process for which the function is to be invoked, can be @current.
3797
+ * @func: Function to invoke.
3798
+ * @arg: Argument to function.
3799
+ *
3800
+ * If the specified task can be quickly locked into a definite state
3801
+ * (either sleeping or on a given runqueue), arrange to keep it in that
3802
+ * state while invoking @func(@arg). This function can use ->on_rq and
3803
+ * task_curr() to work out what the state is, if required. Given that
3804
+ * @func can be invoked with a runqueue lock held, it had better be quite
3805
+ * lightweight.
3806
+ *
3807
+ * Returns:
3808
+ * @false if the task slipped out from under the locks.
3809
+ * @true if the task was locked onto a runqueue or is sleeping.
3810
+ * However, @func can override this by returning @false.
3811
+ */
3812
+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
3813
+{
3814
+ struct rq_flags rf;
3815
+ bool ret = false;
3816
+ struct rq *rq;
3817
+
3818
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3819
+ if (p->on_rq) {
3820
+ rq = __task_rq_lock(p, &rf);
3821
+ if (task_rq(p) == rq)
3822
+ ret = func(p, arg);
3823
+ rq_unlock(rq, &rf);
3824
+ } else {
3825
+ switch (p->state) {
3826
+ case TASK_RUNNING:
3827
+ case TASK_WAKING:
3828
+ break;
3829
+ default:
3830
+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
3831
+ if (!p->on_rq)
3832
+ ret = func(p, arg);
3833
+ }
3834
+ }
3835
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3836
+ return ret;
27763837 }
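
A hedged example of a caller of try_invoke_on_locked_down_task(): the callback only reads fields that are stable while the task is pinned, and returns true so the caller sees success. Everything beyond the documented signature (the demo_* names, the printed fields) is illustrative:

#include <linux/printk.h>
#include <linux/sched.h>

static bool demo_report_state(struct task_struct *t, void *arg)
{
	pr_info("%s/%d: state=%ld on_rq=%d\n",
		t->comm, t->pid, t->state, t->on_rq);
	return true;	/* report success to the caller */
}

static void demo_probe_task(struct task_struct *p)
{
	if (!try_invoke_on_locked_down_task(p, demo_report_state, NULL))
		pr_info("%s/%d slipped out from under the locks\n",
			p->comm, p->pid);
}
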
27773838
27783839 /**
....@@ -2788,7 +3849,7 @@
27883849 */
27893850 int wake_up_process(struct task_struct *p)
27903851 {
2791
- return try_to_wake_up(p, TASK_NORMAL, 0, 1);
3852
+ return try_to_wake_up(p, TASK_NORMAL, 0);
27923853 }
27933854 EXPORT_SYMBOL(wake_up_process);
27943855
....@@ -2801,12 +3862,12 @@
28013862 */
28023863 int wake_up_lock_sleeper(struct task_struct *p)
28033864 {
2804
- return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER, 1);
3865
+ return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER);
28053866 }
28063867
28073868 int wake_up_state(struct task_struct *p, unsigned int state)
28083869 {
2809
- return try_to_wake_up(p, state, 0, 1);
3870
+ return try_to_wake_up(p, state, 0);
28103871 }
28113872
28123873 /*
....@@ -2831,6 +3892,8 @@
28313892 p->se.cfs_rq = NULL;
28323893 #endif
28333894
3895
+ trace_android_rvh_sched_fork_init(p);
3896
+
28343897 #ifdef CONFIG_SCHEDSTATS
28353898 /* Even if schedstat is disabled, there should not be garbage */
28363899 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
....@@ -2851,7 +3914,14 @@
28513914 INIT_HLIST_HEAD(&p->preempt_notifiers);
28523915 #endif
28533916
3917
+#ifdef CONFIG_COMPACTION
3918
+ p->capture_control = NULL;
3919
+#endif
28543920 init_numa_balancing(clone_flags, p);
3921
+#ifdef CONFIG_SMP
3922
+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
3923
+ p->migration_pending = NULL;
3924
+#endif
28553925 }
28563926
28573927 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
....@@ -2868,7 +3938,7 @@
28683938
28693939 #ifdef CONFIG_PROC_SYSCTL
28703940 int sysctl_numa_balancing(struct ctl_table *table, int write,
2871
- void __user *buffer, size_t *lenp, loff_t *ppos)
3941
+ void *buffer, size_t *lenp, loff_t *ppos)
28723942 {
28733943 struct ctl_table t;
28743944 int err;
....@@ -2942,8 +4012,8 @@
29424012 }
29434013
29444014 #ifdef CONFIG_PROC_SYSCTL
2945
-int sysctl_schedstats(struct ctl_table *table, int write,
2946
- void __user *buffer, size_t *lenp, loff_t *ppos)
4015
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
4016
+ size_t *lenp, loff_t *ppos)
29474017 {
29484018 struct ctl_table t;
29494019 int err;
....@@ -2971,7 +4041,7 @@
29714041 */
29724042 int sched_fork(unsigned long clone_flags, struct task_struct *p)
29734043 {
2974
- unsigned long flags;
4044
+ trace_android_rvh_sched_fork(p);
29754045
29764046 __sched_fork(clone_flags, p);
29774047 /*
....@@ -2985,6 +4055,7 @@
29854055 * Make sure we do not leak PI boosting priority to the child.
29864056 */
29874057 p->prio = current->normal_prio;
4058
+ trace_android_rvh_prepare_prio_fork(p);
29884059
29894060 uclamp_fork(p);
29904061
....@@ -2999,8 +4070,8 @@
29994070 } else if (PRIO_TO_NICE(p->static_prio) < 0)
30004071 p->static_prio = NICE_TO_PRIO(0);
30014072
3002
- p->prio = p->normal_prio = __normal_prio(p);
3003
- set_load_weight(p, false);
4073
+ p->prio = p->normal_prio = p->static_prio;
4074
+ set_load_weight(p);
30044075
30054076 /*
30064077 * We don't need the reset flag anymore after the fork. It has
....@@ -3017,24 +4088,8 @@
30174088 p->sched_class = &fair_sched_class;
30184089
30194090 init_entity_runnable_average(&p->se);
4091
+ trace_android_rvh_finish_prio_fork(p);
30204092
3021
- /*
3022
- * The child is not yet in the pid-hash so no cgroup attach races,
3023
- * and the cgroup is pinned to this child due to cgroup_fork()
3024
- * is ran before sched_fork().
3025
- *
3026
- * Silence PROVE_RCU.
3027
- */
3028
- raw_spin_lock_irqsave(&p->pi_lock, flags);
3029
- rseq_migrate(p);
3030
- /*
3031
- * We're setting the CPU for the first time, we don't migrate,
3032
- * so use __set_task_cpu().
3033
- */
3034
- __set_task_cpu(p, smp_processor_id());
3035
- if (p->sched_class->task_fork)
3036
- p->sched_class->task_fork(p);
3037
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
30384093
30394094 #ifdef CONFIG_SCHED_INFO
30404095 if (likely(sched_info_on()))
....@@ -3052,6 +4107,41 @@
30524107 RB_CLEAR_NODE(&p->pushable_dl_tasks);
30534108 #endif
30544109 return 0;
4110
+}
4111
+
4112
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
4113
+{
4114
+ unsigned long flags;
4115
+
4116
+ /*
4117
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
4118
+ * required yet, but lockdep gets upset if rules are violated.
4119
+ */
4120
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
4121
+#ifdef CONFIG_CGROUP_SCHED
4122
+ if (1) {
4123
+ struct task_group *tg;
4124
+
4125
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
4126
+ struct task_group, css);
4127
+ tg = autogroup_task_group(p, tg);
4128
+ p->sched_task_group = tg;
4129
+ }
4130
+#endif
4131
+ rseq_migrate(p);
4132
+ /*
4133
+ * We're setting the CPU for the first time, we don't migrate,
4134
+ * so use __set_task_cpu().
4135
+ */
4136
+ __set_task_cpu(p, smp_processor_id());
4137
+ if (p->sched_class->task_fork)
4138
+ p->sched_class->task_fork(p);
4139
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4140
+}
4141
+
4142
+void sched_post_fork(struct task_struct *p)
4143
+{
4144
+ uclamp_post_fork(p);
30554145 }
30564146
30574147 unsigned long to_ratio(u64 period, u64 runtime)
....@@ -3082,6 +4172,8 @@
30824172 struct rq_flags rf;
30834173 struct rq *rq;
30844174
4175
+ trace_android_rvh_wake_up_new_task(p);
4176
+
30854177 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
30864178 p->state = TASK_RUNNING;
30874179 #ifdef CONFIG_SMP
....@@ -3095,14 +4187,14 @@
30954187 */
30964188 p->recent_used_cpu = task_cpu(p);
30974189 rseq_migrate(p);
3098
- __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
4190
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
30994191 #endif
31004192 rq = __task_rq_lock(p, &rf);
31014193 update_rq_clock(rq);
3102
- post_init_entity_util_avg(&p->se);
4194
+ post_init_entity_util_avg(p);
4195
+ trace_android_rvh_new_task_stats(p);
31034196
31044197 activate_task(rq, p, ENQUEUE_NOCLOCK);
3105
- p->on_rq = TASK_ON_RQ_QUEUED;
31064198 trace_sched_wakeup_new(p);
31074199 check_preempt_curr(rq, p, WF_FORK);
31084200 #ifdef CONFIG_SMP
....@@ -3212,8 +4304,10 @@
32124304 /*
32134305 * Claim the task as running, we do this before switching to it
32144306 * such that any running task will have this set.
4307
+ *
4308
+ * See the ttwu() WF_ON_CPU case and its ordering comment.
32154309 */
3216
- next->on_cpu = 1;
4310
+ WRITE_ONCE(next->on_cpu, 1);
32174311 #endif
32184312 }
32194313
....@@ -3221,8 +4315,9 @@
32214315 {
32224316 #ifdef CONFIG_SMP
32234317 /*
3224
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
3225
- * We must ensure this doesn't happen until the switch is completely
4318
+ * This must be the very last reference to @prev from this CPU. After
4319
+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
4320
+ * must ensure this doesn't happen until the switch is completely
32264321 * finished.
32274322 *
32284323 * In particular, the load of prev->state in finish_task_switch() must
....@@ -3234,6 +4329,90 @@
32344329 #endif
32354330 }
32364331
4332
+#ifdef CONFIG_SMP
4333
+
4334
+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
4335
+{
4336
+ void (*func)(struct rq *rq);
4337
+ struct callback_head *next;
4338
+
4339
+ lockdep_assert_held(&rq->lock);
4340
+
4341
+ while (head) {
4342
+ func = (void (*)(struct rq *))head->func;
4343
+ next = head->next;
4344
+ head->next = NULL;
4345
+ head = next;
4346
+
4347
+ func(rq);
4348
+ }
4349
+}
4350
+
4351
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4352
+{
4353
+ struct callback_head *head = rq->balance_callback;
4354
+
4355
+ lockdep_assert_held(&rq->lock);
4356
+ if (head) {
4357
+ rq->balance_callback = NULL;
4358
+ rq->balance_flags &= ~BALANCE_WORK;
4359
+ }
4360
+
4361
+ return head;
4362
+}
4363
+
4364
+static void __balance_callbacks(struct rq *rq)
4365
+{
4366
+ do_balance_callbacks(rq, splice_balance_callbacks(rq));
4367
+}
4368
+
4369
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4370
+{
4371
+ unsigned long flags;
4372
+
4373
+ if (unlikely(head)) {
4374
+ raw_spin_lock_irqsave(&rq->lock, flags);
4375
+ do_balance_callbacks(rq, head);
4376
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
4377
+ }
4378
+}
4379
+
4380
+static void balance_push(struct rq *rq);
4381
+
4382
+static inline void balance_switch(struct rq *rq)
4383
+{
4384
+ if (likely(!rq->balance_flags))
4385
+ return;
4386
+
4387
+ if (rq->balance_flags & BALANCE_PUSH) {
4388
+ balance_push(rq);
4389
+ return;
4390
+ }
4391
+
4392
+ __balance_callbacks(rq);
4393
+}
4394
+
4395
+#else
4396
+
4397
+static inline void __balance_callbacks(struct rq *rq)
4398
+{
4399
+}
4400
+
4401
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4402
+{
4403
+ return NULL;
4404
+}
4405
+
4406
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4407
+{
4408
+}
4409
+
4410
+static inline void balance_switch(struct rq *rq)
4411
+{
4412
+}
4413
+
4414
+#endif
4415
+
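
Balance callbacks are queued while rq->lock is held and run from __balance_callbacks()/balance_switch() once it is safe to drop the lock. A hedged sketch, assuming it lives in a scheduler-internal file so that struct rq, cpu_of() and queue_balance_callback() from kernel/sched/sched.h are visible; the per-CPU head and empty callback body are illustrative:

#include "sched.h"

static DEFINE_PER_CPU(struct callback_head, demo_balance_head);

static void demo_balance_fn(struct rq *rq)
{
	/* Runs via do_balance_callbacks() with rq->lock held. */
}

/* Caller must hold rq->lock, e.g. from a sched class hook. */
static void demo_queue_balance_work(struct rq *rq)
{
	queue_balance_callback(rq, &per_cpu(demo_balance_head, cpu_of(rq)),
			       demo_balance_fn);
}
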
32374416 static inline void
32384417 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
32394418 {
....@@ -3244,7 +4423,7 @@
32444423 * do an early lockdep release here:
32454424 */
32464425 rq_unpin_lock(rq, rf);
3247
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4426
+ spin_release(&rq->lock.dep_map, _THIS_IP_);
32484427 #ifdef CONFIG_DEBUG_SPINLOCK
32494428 /* this is a valid case when another task releases the spinlock */
32504429 rq->lock.owner = next;
....@@ -3259,6 +4438,7 @@
32594438 * prev into current:
32604439 */
32614440 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
4441
+ balance_switch(rq);
32624442 raw_spin_unlock_irq(&rq->lock);
32634443 }
32644444
....@@ -3273,6 +4453,22 @@
32734453 #ifndef finish_arch_post_lock_switch
32744454 # define finish_arch_post_lock_switch() do { } while (0)
32754455 #endif
4456
+
4457
+static inline void kmap_local_sched_out(void)
4458
+{
4459
+#ifdef CONFIG_KMAP_LOCAL
4460
+ if (unlikely(current->kmap_ctrl.idx))
4461
+ __kmap_local_sched_out();
4462
+#endif
4463
+}
4464
+
4465
+static inline void kmap_local_sched_in(void)
4466
+{
4467
+#ifdef CONFIG_KMAP_LOCAL
4468
+ if (unlikely(current->kmap_ctrl.idx))
4469
+ __kmap_local_sched_in();
4470
+#endif
4471
+}
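
These hooks exist so that kmap_local mappings survive a context switch: prepare_task_switch() tears the outgoing task's mappings down via kmap_local_sched_out() and finish_task_switch() restores them via kmap_local_sched_in(). A hedged sketch of the user-side pattern this supports, assuming this tree provides kmap_local_page()/kunmap_local() under CONFIG_KMAP_LOCAL; the demo_* name and parameters are illustrative:

#include <linux/highmem.h>
#include <linux/string.h>

static void demo_copy_from_page(struct page *page, void *dst, size_t len)
{
	void *src = kmap_local_page(page);

	/*
	 * The mapping is CPU-local but the task may be preempted here; the
	 * sched_out/sched_in hooks keep the mapping valid across the switch.
	 */
	memcpy(dst, src, len);
	kunmap_local(src);
}
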
32764472
32774473 /**
32784474 * prepare_task_switch - prepare to switch tasks
....@@ -3296,6 +4492,7 @@
32964492 perf_event_task_sched_out(prev, next);
32974493 rseq_preempt(prev);
32984494 fire_sched_out_preempt_notifiers(prev, next);
4495
+ kmap_local_sched_out();
32994496 prepare_task(next);
33004497 prepare_arch_switch(next);
33014498 }
....@@ -3362,6 +4559,7 @@
33624559 finish_lock_switch(rq);
33634560 finish_arch_post_lock_switch();
33644561 kcov_finish_switch(current);
4562
+ kmap_local_sched_in();
33654563
33664564 fire_sched_in_preempt_notifiers(current);
33674565 /*
....@@ -3388,49 +4586,12 @@
33884586 if (prev->sched_class->task_dead)
33894587 prev->sched_class->task_dead(prev);
33904588
3391
- put_task_struct(prev);
4589
+ put_task_struct_rcu_user(prev);
33924590 }
33934591
33944592 tick_nohz_task_switch();
33954593 return rq;
33964594 }
3397
-
3398
-#ifdef CONFIG_SMP
3399
-
3400
-/* rq->lock is NOT held, but preemption is disabled */
3401
-static void __balance_callback(struct rq *rq)
3402
-{
3403
- struct callback_head *head, *next;
3404
- void (*func)(struct rq *rq);
3405
- unsigned long flags;
3406
-
3407
- raw_spin_lock_irqsave(&rq->lock, flags);
3408
- head = rq->balance_callback;
3409
- rq->balance_callback = NULL;
3410
- while (head) {
3411
- func = (void (*)(struct rq *))head->func;
3412
- next = head->next;
3413
- head->next = NULL;
3414
- head = next;
3415
-
3416
- func(rq);
3417
- }
3418
- raw_spin_unlock_irqrestore(&rq->lock, flags);
3419
-}
3420
-
3421
-static inline void balance_callback(struct rq *rq)
3422
-{
3423
- if (unlikely(rq->balance_callback))
3424
- __balance_callback(rq);
3425
-}
3426
-
3427
-#else
3428
-
3429
-static inline void balance_callback(struct rq *rq)
3430
-{
3431
-}
3432
-
3433
-#endif
34344595
34354596 /**
34364597 * schedule_tail - first thing a freshly forked thread must call.
....@@ -3451,7 +4612,6 @@
34514612 */
34524613
34534614 rq = finish_task_switch(prev);
3454
- balance_callback(rq);
34554615 preempt_enable();
34564616
34574617 if (current->set_child_tid)
....@@ -3467,12 +4627,8 @@
34674627 context_switch(struct rq *rq, struct task_struct *prev,
34684628 struct task_struct *next, struct rq_flags *rf)
34694629 {
3470
- struct mm_struct *mm, *oldmm;
3471
-
34724630 prepare_task_switch(rq, prev, next);
34734631
3474
- mm = next->mm;
3475
- oldmm = prev->active_mm;
34764632 /*
34774633 * For paravirt, this is coupled with an exit in switch_to to
34784634 * combine the page table reload and the switch backend into
....@@ -3481,22 +4637,37 @@
34814637 arch_start_context_switch(prev);
34824638
34834639 /*
3484
- * If mm is non-NULL, we pass through switch_mm(). If mm is
3485
- * NULL, we will pass through mmdrop() in finish_task_switch().
3486
- * Both of these contain the full memory barrier required by
3487
- * membarrier after storing to rq->curr, before returning to
3488
- * user-space.
4640
+ * kernel -> kernel lazy + transfer active
4641
+ * user -> kernel lazy + mmgrab() active
4642
+ *
4643
+ * kernel -> user switch + mmdrop() active
4644
+ * user -> user switch
34894645 */
3490
- if (!mm) {
3491
- next->active_mm = oldmm;
3492
- mmgrab(oldmm);
3493
- enter_lazy_tlb(oldmm, next);
3494
- } else
3495
- switch_mm_irqs_off(oldmm, mm, next);
4646
+ if (!next->mm) { // to kernel
4647
+ enter_lazy_tlb(prev->active_mm, next);
34964648
3497
- if (!prev->mm) {
3498
- prev->active_mm = NULL;
3499
- rq->prev_mm = oldmm;
4649
+ next->active_mm = prev->active_mm;
4650
+ if (prev->mm) // from user
4651
+ mmgrab(prev->active_mm);
4652
+ else
4653
+ prev->active_mm = NULL;
4654
+ } else { // to user
4655
+ membarrier_switch_mm(rq, prev->active_mm, next->mm);
4656
+ /*
4657
+ * sys_membarrier() requires an smp_mb() between setting
4658
+ * rq->curr / membarrier_switch_mm() and returning to userspace.
4659
+ *
4660
+ * The below provides this either through switch_mm(), or in
4661
+ * case 'prev->active_mm == next->mm' through
4662
+ * finish_task_switch()'s mmdrop().
4663
+ */
4664
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
4665
+
4666
+ if (!prev->mm) { // from kernel
4667
+ /* will mmdrop() in finish_task_switch(). */
4668
+ rq->prev_mm = prev->active_mm;
4669
+ prev->active_mm = NULL;
4670
+ }
35004671 }
35014672
35024673 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
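
The membarrier_switch_mm()/smp_mb() requirement described above is driven by the membarrier(2) syscall. A hedged userspace sketch of the expedited private command that depends on this ordering; the raw syscall wrapper is only for illustration:

#define _GNU_SOURCE
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

static int membarrier(int cmd, unsigned int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
		perror("register");
	else if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
		perror("barrier");
	else
		printf("all threads of this process passed a full barrier\n");
	return 0;
}
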
....@@ -3533,7 +4704,7 @@
35334704 * preemption, thus the result might have a time-of-check-to-time-of-use
35344705 * race. The caller is responsible to use it correctly, for example:
35354706 *
3536
- * - from a non-preemptable section (of course)
4707
+ * - from a non-preemptible section (of course)
35374708 *
35384709 * - from a thread that is bound to a single CPU
35394710 *
....@@ -3554,6 +4725,18 @@
35544725 sum += cpu_rq(i)->nr_switches;
35554726
35564727 return sum;
4728
+}
4729
+
4730
+/*
4731
+ * Consumers of these two interfaces, like for example the cpuidle menu
4732
+ * governor, are using nonsensical data. They prefer shallow idle state selection
4733
+ * for a CPU that has IO-wait which might not even end up running the task when
4734
+ * it does become runnable.
4735
+ */
4736
+
4737
+unsigned long nr_iowait_cpu(int cpu)
4738
+{
4739
+ return atomic_read(&cpu_rq(cpu)->nr_iowait);
35574740 }
35584741
35594742 /*
....@@ -3591,29 +4774,9 @@
35914774 unsigned long i, sum = 0;
35924775
35934776 for_each_possible_cpu(i)
3594
- sum += atomic_read(&cpu_rq(i)->nr_iowait);
4777
+ sum += nr_iowait_cpu(i);
35954778
35964779 return sum;
3597
-}
3598
-
3599
-/*
3600
- * Consumers of these two interfaces, like for example the cpufreq menu
3601
- * governor are using nonsensical data. Boosting frequency for a CPU that has
3602
- * IO-wait which might not even end up running the task when it does become
3603
- * runnable.
3604
- */
3605
-
3606
-unsigned long nr_iowait_cpu(int cpu)
3607
-{
3608
- struct rq *this = cpu_rq(cpu);
3609
- return atomic_read(&this->nr_iowait);
3610
-}
3611
-
3612
-void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
3613
-{
3614
- struct rq *rq = this_rq();
3615
- *nr_waiters = atomic_read(&rq->nr_iowait);
3616
- *load = rq->load.weight;
36174780 }
36184781
36194782 #ifdef CONFIG_SMP
....@@ -3627,9 +4790,14 @@
36274790 struct task_struct *p = current;
36284791 unsigned long flags;
36294792 int dest_cpu;
4793
+ bool cond = false;
4794
+
4795
+ trace_android_rvh_sched_exec(&cond);
4796
+ if (cond)
4797
+ return;
36304798
36314799 raw_spin_lock_irqsave(&p->pi_lock, flags);
3632
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
4800
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
36334801 if (dest_cpu == smp_processor_id())
36344802 goto unlock;
36354803
....@@ -3712,6 +4880,7 @@
37124880
37134881 return ns;
37144882 }
4883
+EXPORT_SYMBOL_GPL(task_sched_runtime);
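
task_sched_runtime() ultimately backs the per-thread CPU-time clocks. A hedged userspace check that exercises that path through clock_gettime(2):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* CLOCK_THREAD_CPUTIME_ID reads the calling thread's accumulated runtime. */
	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)
		printf("thread cpu time: %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
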
37154884
37164885 /*
37174886 * This function gets called by the timer code, with HZ frequency.
....@@ -3723,14 +4892,18 @@
37234892 struct rq *rq = cpu_rq(cpu);
37244893 struct task_struct *curr = rq->curr;
37254894 struct rq_flags rf;
4895
+ unsigned long thermal_pressure;
37264896
4897
+ arch_scale_freq_tick();
37274898 sched_clock_tick();
37284899
37294900 rq_lock(rq, &rf);
37304901
4902
+ trace_android_rvh_tick_entry(rq);
37314903 update_rq_clock(rq);
4904
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
4905
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
37324906 curr->sched_class->task_tick(rq, curr, 0);
3733
- cpu_load_update_active(rq);
37344907 calc_global_load_tick(rq);
37354908 psi_task_tick(rq);
37364909
....@@ -3742,6 +4915,8 @@
37424915 rq->idle_balance = idle_cpu(cpu);
37434916 trigger_load_balance(rq);
37444917 #endif
4918
+
4919
+ trace_android_vh_scheduler_tick(rq);
37454920 }
37464921
37474922 #ifdef CONFIG_NO_HZ_FULL
....@@ -3799,28 +4974,31 @@
37994974 * statistics and checks timeslices in a time-independent way, regardless
38004975 * of when exactly it is running.
38014976 */
3802
- if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
4977
+ if (!tick_nohz_tick_stopped_cpu(cpu))
38034978 goto out_requeue;
38044979
38054980 rq_lock_irq(rq, &rf);
38064981 curr = rq->curr;
3807
- if (is_idle_task(curr) || cpu_is_offline(cpu))
4982
+ if (cpu_is_offline(cpu))
38084983 goto out_unlock;
38094984
38104985 update_rq_clock(rq);
3811
- delta = rq_clock_task(rq) - curr->se.exec_start;
38124986
3813
- /*
3814
- * Make sure the next tick runs within a reasonable
3815
- * amount of time.
3816
- */
3817
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4987
+ if (!is_idle_task(curr)) {
4988
+ /*
4989
+ * Make sure the next tick runs within a reasonable
4990
+ * amount of time.
4991
+ */
4992
+ delta = rq_clock_task(rq) - curr->se.exec_start;
4993
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4994
+ }
38184995 curr->sched_class->task_tick(rq, curr, 0);
38194996
4997
+ calc_load_nohz_remote(rq);
38204998 out_unlock:
38214999 rq_unlock_irq(rq, &rf);
3822
-
38235000 out_requeue:
5001
+
38245002 /*
38255003 * Run the remote tick once per second (1Hz). This arbitrary
38265004 * frequency is large enough to avoid overload but short enough
....@@ -3884,7 +5062,7 @@
38845062 static inline void sched_tick_stop(int cpu) { }
38855063 #endif
38865064
3887
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
5065
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
38885066 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
38895067 /*
38905068 * If the value passed in is equal to the current preempt count
....@@ -3990,11 +5168,12 @@
39905168 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
39915169 && in_atomic_preempt_off()) {
39925170 pr_err("Preemption disabled at:");
3993
- print_ip_sym(preempt_disable_ip);
3994
- pr_cont("\n");
5171
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
39955172 }
39965173 if (panic_on_warn)
39975174 panic("scheduling while atomic\n");
5175
+
5176
+ trace_android_rvh_schedule_bug(prev);
39985177
39995178 dump_stack();
40005179 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
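
__schedule_bug() is what produces the familiar "scheduling while atomic" splat. A hedged illustration of the kind of caller that trips it; the lock name and sleep length are arbitrary:

#include <linux/spinlock.h>
#include <linux/delay.h>

static DEFINE_SPINLOCK(demo_lock);

static void demo_bad_sleep(void)
{
	spin_lock(&demo_lock);		/* preempt_count() is now non-zero      */
	msleep(10);			/* sleeps -> __schedule_bug() fires     */
	spin_unlock(&demo_lock);
}
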
....@@ -4003,11 +5182,23 @@
40035182 /*
40045183 * Various schedule()-time debugging checks and statistics:
40055184 */
4006
-static inline void schedule_debug(struct task_struct *prev)
5185
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
40075186 {
40085187 #ifdef CONFIG_SCHED_STACK_END_CHECK
40095188 if (task_stack_end_corrupted(prev))
40105189 panic("corrupted stack end detected inside scheduler\n");
5190
+
5191
+ if (task_scs_end_corrupted(prev))
5192
+ panic("corrupted shadow stack detected inside scheduler\n");
5193
+#endif
5194
+
5195
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
5196
+ if (!preempt && prev->state && prev->non_block_count) {
5197
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
5198
+ prev->comm, prev->pid, prev->non_block_count);
5199
+ dump_stack();
5200
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5201
+ }
40115202 #endif
40125203
40135204 if (unlikely(in_atomic_preempt_off())) {
....@@ -4019,6 +5210,28 @@
40195210 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
40205211
40215212 schedstat_inc(this_rq()->sched_count);
5213
+}
5214
+
5215
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
5216
+ struct rq_flags *rf)
5217
+{
5218
+#ifdef CONFIG_SMP
5219
+ const struct sched_class *class;
5220
+ /*
5221
+ * We must do the balancing pass before put_prev_task(), such
5222
+ * that when we release the rq->lock the task is in the same
5223
+ * state as before we took rq->lock.
5224
+ *
5225
+ * We can terminate the balance pass as soon as we know there is
5226
+ * a runnable task of @class priority or higher.
5227
+ */
5228
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
5229
+ if (class->balance(rq, prev, rf))
5230
+ break;
5231
+ }
5232
+#endif
5233
+
5234
+ put_prev_task(rq, prev);
40225235 }
40235236
40245237 /*
....@@ -4036,36 +5249,34 @@
40365249	 * higher scheduling class, because otherwise those lose the
40375250 * opportunity to pull in more work from other CPUs.
40385251 */
4039
- if (likely((prev->sched_class == &idle_sched_class ||
4040
- prev->sched_class == &fair_sched_class) &&
5252
+ if (likely(prev->sched_class <= &fair_sched_class &&
40415253 rq->nr_running == rq->cfs.h_nr_running)) {
40425254
4043
- p = fair_sched_class.pick_next_task(rq, prev, rf);
5255
+ p = pick_next_task_fair(rq, prev, rf);
40445256 if (unlikely(p == RETRY_TASK))
4045
- goto again;
5257
+ goto restart;
40465258
40475259 /* Assumes fair_sched_class->next == idle_sched_class */
4048
- if (unlikely(!p))
4049
- p = idle_sched_class.pick_next_task(rq, prev, rf);
5260
+ if (!p) {
5261
+ put_prev_task(rq, prev);
5262
+ p = pick_next_task_idle(rq);
5263
+ }
40505264
40515265 return p;
40525266 }
40535267
4054
-again:
5268
+restart:
5269
+ put_prev_task_balance(rq, prev, rf);
5270
+
40555271 for_each_class(class) {
4056
- p = class->pick_next_task(rq, prev, rf);
4057
- if (p) {
4058
- if (unlikely(p == RETRY_TASK))
4059
- goto again;
5272
+ p = class->pick_next_task(rq);
5273
+ if (p)
40605274 return p;
4061
- }
40625275 }
40635276
40645277 /* The idle class should always have a runnable task: */
40655278 BUG();
40665279 }
4067
-
4068
-static void migrate_disabled_sched(struct task_struct *p);
40695280
40705281 /*
40715282 * __schedule() is the main scheduler function.
....@@ -4087,7 +5298,7 @@
40875298 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
40885299 * called on the nearest possible occasion:
40895300 *
4090
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
5301
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
40915302 *
40925303	 *       - in syscall or exception context, at the next outermost
40935304 * preempt_enable(). (this might be as soon as the wake_up()'s
....@@ -4096,7 +5307,7 @@
40965307 * - in IRQ context, return from interrupt-handler to
40975308 * preemptible context
40985309 *
4099
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
5310
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
41005311 * then at the next:
41015312 *
41025313 * - cond_resched() call
....@@ -4106,10 +5317,11 @@
41065317 *
41075318 * WARNING: must be called with preemption disabled!
41085319 */
4109
-static void __sched notrace __schedule(bool preempt)
5320
+static void __sched notrace __schedule(bool preempt, bool spinning_lock)
41105321 {
41115322 struct task_struct *prev, *next;
41125323 unsigned long *switch_count;
5324
+ unsigned long prev_state;
41135325 struct rq_flags rf;
41145326 struct rq *rq;
41155327 int cpu;
....@@ -4118,7 +5330,7 @@
41185330 rq = cpu_rq(cpu);
41195331 prev = rq->curr;
41205332
4121
- schedule_debug(prev);
5333
+ schedule_debug(prev, preempt);
41225334
41235335 if (sched_feat(HRTICK))
41245336 hrtick_clear(rq);
....@@ -4129,28 +5341,59 @@
41295341 /*
41305342 * Make sure that signal_pending_state()->signal_pending() below
41315343 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
4132
- * done by the caller to avoid the race with signal_wake_up().
5344
+ * done by the caller to avoid the race with signal_wake_up():
41335345 *
4134
- * The membarrier system call requires a full memory barrier
5346
+ * __set_current_state(@state) signal_wake_up()
5347
+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
5348
+ * wake_up_state(p, state)
5349
+ * LOCK rq->lock LOCK p->pi_state
5350
+ * smp_mb__after_spinlock() smp_mb__after_spinlock()
5351
+ * if (signal_pending_state()) if (p->state & @state)
5352
+ *
5353
+ * Also, the membarrier system call requires a full memory barrier
41355354 * after coming from user-space, before storing to rq->curr.
41365355 */
41375356 rq_lock(rq, &rf);
41385357 smp_mb__after_spinlock();
4139
-
4140
- if (__migrate_disabled(prev))
4141
- migrate_disabled_sched(prev);
41425358
41435359 /* Promote REQ to ACT */
41445360 rq->clock_update_flags <<= 1;
41455361 update_rq_clock(rq);
41465362
41475363 switch_count = &prev->nivcsw;
4148
- if (!preempt && prev->state) {
4149
- if (unlikely(signal_pending_state(prev->state, prev))) {
5364
+
5365
+ /*
5366
+ * We must load prev->state once (task_struct::state is volatile), such
5367
+ * that:
5368
+ *
5369
+ * - we form a control dependency vs deactivate_task() below.
5370
+ * - ptrace_{,un}freeze_traced() can change ->state underneath us.
5371
+ */
5372
+ prev_state = prev->state;
5373
+ if ((!preempt || spinning_lock) && prev_state) {
5374
+ if (signal_pending_state(prev_state, prev)) {
41505375 prev->state = TASK_RUNNING;
41515376 } else {
5377
+ prev->sched_contributes_to_load =
5378
+ (prev_state & TASK_UNINTERRUPTIBLE) &&
5379
+ !(prev_state & TASK_NOLOAD) &&
5380
+ !(prev->flags & PF_FROZEN);
5381
+
5382
+ if (prev->sched_contributes_to_load)
5383
+ rq->nr_uninterruptible++;
5384
+
5385
+ /*
5386
+ * __schedule() ttwu()
5387
+ * prev_state = prev->state; if (p->on_rq && ...)
5388
+ * if (prev_state) goto out;
5389
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
5390
+ * p->state = TASK_WAKING
5391
+ *
5392
+ * Where __schedule() and ttwu() have matching control dependencies.
5393
+ *
5394
+ * After this, schedule() must not care about p->state any more.
5395
+ */
41525396 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
4153
- prev->on_rq = 0;
41545397
41555398 if (prev->in_iowait) {
41565399 atomic_inc(&rq->nr_iowait);
....@@ -4165,9 +5408,14 @@
41655408 clear_tsk_need_resched_lazy(prev);
41665409 clear_preempt_need_resched();
41675410
5411
+ trace_android_rvh_schedule(prev, next, rq);
41685412 if (likely(prev != next)) {
41695413 rq->nr_switches++;
4170
- rq->curr = next;
5414
+ /*
5415
+ * RCU users of rcu_dereference(rq->curr) may not see
5416
+ * changes to task_struct made by pick_next_task().
5417
+ */
5418
+ RCU_INIT_POINTER(rq->curr, next);
41715419 /*
41725420 * The membarrier system call requires each architecture
41735421 * to have a full memory barrier after updating
....@@ -4184,16 +5432,20 @@
41845432 */
41855433 ++*switch_count;
41865434
5435
+ migrate_disable_switch(rq, prev);
5436
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
5437
+
41875438 trace_sched_switch(preempt, prev, next);
41885439
41895440 /* Also unlocks the rq: */
41905441 rq = context_switch(rq, prev, next, &rf);
41915442 } else {
41925443 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
4193
- rq_unlock_irq(rq, &rf);
4194
- }
41955444
4196
- balance_callback(rq);
5445
+ rq_unpin_lock(rq, &rf);
5446
+ __balance_callbacks(rq);
5447
+ raw_spin_unlock_irq(&rq->lock);
5448
+ }
41975449 }
41985450
41995451 void __noreturn do_task_dead(void)
....@@ -4204,7 +5456,7 @@
42045456 /* Tell freezer to ignore us: */
42055457 current->flags |= PF_NOFREEZE;
42065458
4207
- __schedule(false);
5459
+ __schedule(false, false);
42085460 BUG();
42095461
42105462 /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
....@@ -4214,24 +5466,28 @@
42145466
42155467 static inline void sched_submit_work(struct task_struct *tsk)
42165468 {
5469
+ unsigned int task_flags;
5470
+
42175471 if (!tsk->state)
42185472 return;
42195473
5474
+ task_flags = tsk->flags;
42205475 /*
42215476 * If a worker went to sleep, notify and ask workqueue whether
42225477 * it wants to wake up a task to maintain concurrency.
42235478 * As this function is called inside the schedule() context,
42245479 * we disable preemption to avoid it calling schedule() again
4225
- * in the possible wakeup of a kworker.
5480
+ * in the possible wakeup of a kworker and because wq_worker_sleeping()
5481
+ * requires it.
42265482 */
4227
- if (tsk->flags & PF_WQ_WORKER) {
5483
+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
42285484 preempt_disable();
4229
- wq_worker_sleeping(tsk);
5485
+ if (task_flags & PF_WQ_WORKER)
5486
+ wq_worker_sleeping(tsk);
5487
+ else
5488
+ io_wq_worker_sleeping(tsk);
42305489 preempt_enable_no_resched();
42315490 }
4232
-
4233
- if (tsk_is_pi_blocked(tsk))
4234
- return;
42355491
42365492 /*
42375493 * If we are going to sleep and we have plugged IO queued,
....@@ -4243,8 +5499,12 @@
42435499
42445500 static void sched_update_worker(struct task_struct *tsk)
42455501 {
4246
- if (tsk->flags & PF_WQ_WORKER)
4247
- wq_worker_running(tsk);
5502
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5503
+ if (tsk->flags & PF_WQ_WORKER)
5504
+ wq_worker_running(tsk);
5505
+ else
5506
+ io_wq_worker_running(tsk);
5507
+ }
42485508 }
42495509
42505510 asmlinkage __visible void __sched schedule(void)
....@@ -4254,7 +5514,7 @@
42545514 sched_submit_work(tsk);
42555515 do {
42565516 preempt_disable();
4257
- __schedule(false);
5517
+ __schedule(false, false);
42585518 sched_preempt_enable_no_resched();
42595519 } while (need_resched());
42605520 sched_update_worker(tsk);
....@@ -4282,7 +5542,7 @@
42825542 */
42835543 WARN_ON_ONCE(current->state);
42845544 do {
4285
- __schedule(false);
5545
+ __schedule(false, false);
42865546 } while (need_resched());
42875547 }
42885548
....@@ -4335,7 +5595,7 @@
43355595 */
43365596 preempt_disable_notrace();
43375597 preempt_latency_start(1);
4338
- __schedule(true);
5598
+ __schedule(true, false);
43395599 preempt_latency_stop(1);
43405600 preempt_enable_no_resched_notrace();
43415601
....@@ -4370,11 +5630,10 @@
43705630
43715631 #endif
43725632
4373
-#ifdef CONFIG_PREEMPT
5633
+#ifdef CONFIG_PREEMPTION
43745634 /*
4375
- * this is the entry point to schedule() from in-kernel preemption
4376
- * off of preempt_enable. Kernel preemptions off return from interrupt
4377
- * occur there and call schedule directly.
5635
+ * This is the entry point to schedule() from in-kernel preemption
5636
+ * off of preempt_enable.
43785637 */
43795638 asmlinkage __visible void __sched notrace preempt_schedule(void)
43805639 {
....@@ -4390,6 +5649,19 @@
43905649 }
43915650 NOKPROBE_SYMBOL(preempt_schedule);
43925651 EXPORT_SYMBOL(preempt_schedule);
5652
+
5653
+#ifdef CONFIG_PREEMPT_RT
5654
+void __sched notrace preempt_schedule_lock(void)
5655
+{
5656
+ do {
5657
+ preempt_disable();
5658
+ __schedule(true, true);
5659
+ sched_preempt_enable_no_resched();
5660
+ } while (need_resched());
5661
+}
5662
+NOKPROBE_SYMBOL(preempt_schedule_lock);
5663
+EXPORT_SYMBOL(preempt_schedule_lock);
5664
+#endif
43935665
43945666 /**
43955667 * preempt_schedule_notrace - preempt_schedule called by tracing
....@@ -4437,7 +5709,7 @@
44375709 * an infinite recursion.
44385710 */
44395711 prev_ctx = exception_enter();
4440
- __schedule(true);
5712
+ __schedule(true, false);
44415713 exception_exit(prev_ctx);
44425714
44435715 preempt_latency_stop(1);
....@@ -4446,10 +5718,10 @@
44465718 }
44475719 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
44485720
4449
-#endif /* CONFIG_PREEMPT */
5721
+#endif /* CONFIG_PREEMPTION */
44505722
44515723 /*
4452
- * this is the entry point to schedule() from kernel preemption
5724
+ * This is the entry point to schedule() from kernel preemption
44535725 * off of irq context.
44545726 * Note, that this is called and return with irqs disabled. This will
44555727 * protect us against recursive calling from irq.
....@@ -4466,7 +5738,7 @@
44665738 do {
44675739 preempt_disable();
44685740 local_irq_enable();
4469
- __schedule(true);
5741
+ __schedule(true, false);
44705742 local_irq_disable();
44715743 sched_preempt_enable_no_resched();
44725744 } while (need_resched());
....@@ -4477,9 +5749,22 @@
44775749 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
44785750 void *key)
44795751 {
4480
- return try_to_wake_up(curr->private, mode, wake_flags, 1);
5752
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC | WF_ANDROID_VENDOR));
5753
+ return try_to_wake_up(curr->private, mode, wake_flags);
44815754 }
44825755 EXPORT_SYMBOL(default_wake_function);
5756
+
5757
+static void __setscheduler_prio(struct task_struct *p, int prio)
5758
+{
5759
+ if (dl_prio(prio))
5760
+ p->sched_class = &dl_sched_class;
5761
+ else if (rt_prio(prio))
5762
+ p->sched_class = &rt_sched_class;
5763
+ else
5764
+ p->sched_class = &fair_sched_class;
5765
+
5766
+ p->prio = prio;
5767
+}
44835768
44845769 #ifdef CONFIG_RT_MUTEXES
44855770
....@@ -4517,6 +5802,7 @@
45175802 struct rq_flags rf;
45185803 struct rq *rq;
45195804
5805
+ trace_android_rvh_rtmutex_prepare_setprio(p, pi_task);
45205806 /* XXX used to be waiter->prio, not waiter->task->prio */
45215807 prio = __rt_effective_prio(pi_task, p->normal_prio);
45225808
....@@ -4591,39 +5877,39 @@
45915877 if (!dl_prio(p->normal_prio) ||
45925878 (pi_task && dl_prio(pi_task->prio) &&
45935879 dl_entity_preempt(&pi_task->dl, &p->dl))) {
4594
- p->dl.dl_boosted = 1;
5880
+ p->dl.pi_se = pi_task->dl.pi_se;
45955881 queue_flag |= ENQUEUE_REPLENISH;
4596
- } else
4597
- p->dl.dl_boosted = 0;
4598
- p->sched_class = &dl_sched_class;
5882
+ } else {
5883
+ p->dl.pi_se = &p->dl;
5884
+ }
45995885 } else if (rt_prio(prio)) {
46005886 if (dl_prio(oldprio))
4601
- p->dl.dl_boosted = 0;
5887
+ p->dl.pi_se = &p->dl;
46025888 if (oldprio < prio)
46035889 queue_flag |= ENQUEUE_HEAD;
4604
- p->sched_class = &rt_sched_class;
46055890 } else {
46065891 if (dl_prio(oldprio))
4607
- p->dl.dl_boosted = 0;
5892
+ p->dl.pi_se = &p->dl;
46085893 if (rt_prio(oldprio))
46095894 p->rt.timeout = 0;
4610
- p->sched_class = &fair_sched_class;
46115895 }
46125896
4613
- p->prio = prio;
5897
+ __setscheduler_prio(p, prio);
46145898
46155899 if (queued)
46165900 enqueue_task(rq, p, queue_flag);
46175901 if (running)
4618
- set_curr_task(rq, p);
5902
+ set_next_task(rq, p);
46195903
46205904 check_class_changed(rq, p, prev_class, oldprio);
46215905 out_unlock:
46225906 /* Avoid rq from going away on us: */
46235907 preempt_disable();
4624
- __task_rq_unlock(rq, &rf);
46255908
4626
- balance_callback(rq);
5909
+ rq_unpin_lock(rq, &rf);
5910
+ __balance_callbacks(rq);
5911
+ raw_spin_unlock(&rq->lock);
5912
+
46275913 preempt_enable();
46285914 }
46295915 #else
....@@ -4635,12 +5921,13 @@
46355921
46365922 void set_user_nice(struct task_struct *p, long nice)
46375923 {
4638
- bool queued, running;
4639
- int old_prio, delta;
5924
+ bool queued, running, allowed = false;
5925
+ int old_prio;
46405926 struct rq_flags rf;
46415927 struct rq *rq;
46425928
4643
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
5929
+ trace_android_rvh_set_user_nice(p, &nice, &allowed);
5930
+ if ((task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) && !allowed)
46445931 return;
46455932 /*
46465933 * We have to be careful, if called from sys_setpriority(),
....@@ -4667,22 +5954,21 @@
46675954 put_prev_task(rq, p);
46685955
46695956 p->static_prio = NICE_TO_PRIO(nice);
4670
- set_load_weight(p, true);
5957
+ set_load_weight(p);
46715958 old_prio = p->prio;
46725959 p->prio = effective_prio(p);
4673
- delta = p->prio - old_prio;
46745960
4675
- if (queued) {
5961
+ if (queued)
46765962 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
4677
- /*
4678
- * If the task increased its priority or is running and
4679
- * lowered its priority, then reschedule its CPU:
4680
- */
4681
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
4682
- resched_curr(rq);
4683
- }
46845963 if (running)
4685
- set_curr_task(rq, p);
5964
+ set_next_task(rq, p);
5965
+
5966
+ /*
5967
+ * If the task increased its priority or is running and
5968
+ * lowered its priority, then reschedule its CPU:
5969
+ */
5970
+ p->sched_class->prio_changed(rq, p, old_prio);
5971
+
46865972 out_unlock:
46875973 task_rq_unlock(rq, p, &rf);
46885974 }
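
A minimal userspace sketch of the request that eventually reaches set_user_nice() above, via setpriority(2); this is not part of the patch, and the nice value 10 is only an example:

#include <errno.h>
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	/* PRIO_PROCESS with pid 0 means the calling process. */
	if (setpriority(PRIO_PROCESS, 0, 10) == -1) {
		perror("setpriority");
		return 1;
	}

	/* getpriority() may legitimately return -1, so check errno. */
	errno = 0;
	int prio = getpriority(PRIO_PROCESS, 0);
	if (prio == -1 && errno != 0) {
		perror("getpriority");
		return 1;
	}
	printf("nice is now %d\n", prio);
	return 0;
}
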
....@@ -4767,7 +6053,7 @@
47676053 return 0;
47686054
47696055 #ifdef CONFIG_SMP
4770
- if (!llist_empty(&rq->wake_list))
6056
+ if (rq->ttwu_pending)
47716057 return 0;
47726058 #endif
47736059
....@@ -4790,6 +6076,7 @@
47906076
47916077 return 1;
47926078 }
6079
+EXPORT_SYMBOL_GPL(available_idle_cpu);
47936080
47946081 /**
47956082 * idle_task - return the idle task for a given CPU.
....@@ -4841,36 +6128,7 @@
48416128 */
48426129 p->rt_priority = attr->sched_priority;
48436130 p->normal_prio = normal_prio(p);
4844
- set_load_weight(p, true);
4845
-}
4846
-
4847
-/* Actually do priority change: must hold pi & rq lock. */
4848
-static void __setscheduler(struct rq *rq, struct task_struct *p,
4849
- const struct sched_attr *attr, bool keep_boost)
4850
-{
4851
- /*
4852
- * If params can't change scheduling class changes aren't allowed
4853
- * either.
4854
- */
4855
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
4856
- return;
4857
-
4858
- __setscheduler_params(p, attr);
4859
-
4860
- /*
4861
- * Keep a potential priority boosting if called from
4862
- * sched_setscheduler().
4863
- */
4864
- p->prio = normal_prio(p);
4865
- if (keep_boost)
4866
- p->prio = rt_effective_prio(p, p->prio);
4867
-
4868
- if (dl_prio(p->prio))
4869
- p->sched_class = &dl_sched_class;
4870
- else if (rt_prio(p->prio))
4871
- p->sched_class = &rt_sched_class;
4872
- else
4873
- p->sched_class = &fair_sched_class;
6131
+ set_load_weight(p);
48746132 }
48756133
48766134 /*
....@@ -4893,11 +6151,10 @@
48936151 const struct sched_attr *attr,
48946152 bool user, bool pi)
48956153 {
4896
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4897
- MAX_RT_PRIO - 1 - attr->sched_priority;
4898
- int retval, oldprio, oldpolicy = -1, queued, running;
4899
- int new_effective_prio, policy = attr->sched_policy;
6154
+ int oldpolicy = -1, policy = attr->sched_policy;
6155
+ int retval, oldprio, newprio, queued, running;
49006156 const struct sched_class *prev_class;
6157
+ struct callback_head *head;
49016158 struct rq_flags rf;
49026159 int reset_on_fork;
49036160 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
....@@ -4969,7 +6226,7 @@
49696226 * Treat SCHED_IDLE as nice 20. Only allow a switch to
49706227 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
49716228 */
4972
- if (idle_policy(p->policy) && !idle_policy(policy)) {
6229
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
49736230 if (!can_nice(p, task_nice(p)))
49746231 return -EPERM;
49756232 }
....@@ -4980,6 +6237,10 @@
49806237
49816238 /* Normal users shall not reset the sched_reset_on_fork flag: */
49826239 if (p->sched_reset_on_fork && !reset_on_fork)
6240
+ return -EPERM;
6241
+
6242
+ /* Can't change util-clamps */
6243
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
49836244 return -EPERM;
49846245 }
49856246
....@@ -5013,8 +6274,8 @@
50136274	 * Changing the policy of the stop threads is a very bad idea:
50146275 */
50156276 if (p == rq->stop) {
5016
- task_rq_unlock(rq, p, &rf);
5017
- return -EINVAL;
6277
+ retval = -EINVAL;
6278
+ goto unlock;
50186279 }
50196280
50206281 /*
....@@ -5032,8 +6293,8 @@
50326293 goto change;
50336294
50346295 p->sched_reset_on_fork = reset_on_fork;
5035
- task_rq_unlock(rq, p, &rf);
5036
- return 0;
6296
+ retval = 0;
6297
+ goto unlock;
50376298 }
50386299 change:
50396300
....@@ -5046,8 +6307,8 @@
50466307 if (rt_bandwidth_enabled() && rt_policy(policy) &&
50476308 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
50486309 !task_group_is_autogroup(task_group(p))) {
5049
- task_rq_unlock(rq, p, &rf);
5050
- return -EPERM;
6310
+ retval = -EPERM;
6311
+ goto unlock;
50516312 }
50526313 #endif
50536314 #ifdef CONFIG_SMP
....@@ -5062,8 +6323,8 @@
50626323 */
50636324 if (!cpumask_subset(span, p->cpus_ptr) ||
50646325 rq->rd->dl_bw.bw == 0) {
5065
- task_rq_unlock(rq, p, &rf);
5066
- return -EPERM;
6326
+ retval = -EPERM;
6327
+ goto unlock;
50676328 }
50686329 }
50696330 #endif
....@@ -5082,13 +6343,14 @@
50826343 * is available.
50836344 */
50846345 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
5085
- task_rq_unlock(rq, p, &rf);
5086
- return -EBUSY;
6346
+ retval = -EBUSY;
6347
+ goto unlock;
50876348 }
50886349
50896350 p->sched_reset_on_fork = reset_on_fork;
50906351 oldprio = p->prio;
50916352
6353
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
50926354 if (pi) {
50936355 /*
50946356 * Take priority boosted tasks into account. If the new
....@@ -5097,8 +6359,8 @@
50976359 * the runqueue. This will be done when the task deboost
50986360 * itself.
50996361 */
5100
- new_effective_prio = rt_effective_prio(p, newprio);
5101
- if (new_effective_prio == oldprio)
6362
+ newprio = rt_effective_prio(p, newprio);
6363
+ if (newprio == oldprio)
51026364 queue_flags &= ~DEQUEUE_MOVE;
51036365 }
51046366
....@@ -5111,7 +6373,11 @@
51116373
51126374 prev_class = p->sched_class;
51136375
5114
- __setscheduler(rq, p, attr, pi);
6376
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
6377
+ __setscheduler_params(p, attr);
6378
+ __setscheduler_prio(p, newprio);
6379
+ trace_android_rvh_setscheduler(p);
6380
+ }
51156381 __setscheduler_uclamp(p, attr);
51166382
51176383 if (queued) {
....@@ -5125,22 +6391,27 @@
51256391 enqueue_task(rq, p, queue_flags);
51266392 }
51276393 if (running)
5128
- set_curr_task(rq, p);
6394
+ set_next_task(rq, p);
51296395
51306396 check_class_changed(rq, p, prev_class, oldprio);
51316397
51326398 /* Avoid rq from going away on us: */
51336399 preempt_disable();
6400
+ head = splice_balance_callbacks(rq);
51346401 task_rq_unlock(rq, p, &rf);
51356402
51366403 if (pi)
51376404 rt_mutex_adjust_pi(p);
51386405
51396406 /* Run balance callbacks after we've adjusted the PI chain: */
5140
- balance_callback(rq);
6407
+ balance_callbacks(rq, head);
51416408 preempt_enable();
51426409
51436410 return 0;
6411
+
6412
+unlock:
6413
+ task_rq_unlock(rq, p, &rf);
6414
+ return retval;
51446415 }
51456416
51466417 static int _sched_setscheduler(struct task_struct *p, int policy,
....@@ -5152,6 +6423,14 @@
51526423 .sched_nice = PRIO_TO_NICE(p->static_prio),
51536424 };
51546425
6426
+ if (IS_ENABLED(CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO) &&
6427
+ ((policy == SCHED_FIFO) || (policy == SCHED_RR))) {
6428
+ attr.sched_priority /= 2;
6429
+ if (!check)
6430
+ attr.sched_priority += MAX_RT_PRIO / 2;
6431
+ if (!attr.sched_priority)
6432
+ attr.sched_priority = 1;
6433
+ }
51556434 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
51566435 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
51576436 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
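
The CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO block above compresses requested RT priorities into a smaller range. A standalone restatement of that mapping, for illustration only; the helper name is made up, and check mirrors the check argument of _sched_setscheduler() (true for the userspace-checked path):

#define MAX_RT_PRIO 100				/* value used by this tree */

static int rockchip_remap_rt_prio(int prio, int check)
{
	prio /= 2;				/* halve the requested priority */
	if (!check)
		prio += MAX_RT_PRIO / 2;	/* kernel callers land in the upper half */
	if (!prio)
		prio = 1;			/* SCHED_FIFO/RR priority 0 is not valid */
	return prio;
}

With MAX_RT_PRIO == 100, a checked (userspace) request of 90 maps to 45, while a nocheck (kernel-internal) request of 90 maps to 45 + 50 = 95.
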
....@@ -5166,6 +6445,8 @@
51666445 * @p: the task in question.
51676446 * @policy: new policy.
51686447 * @param: structure containing the new RT priority.
6448
+ *
6449
+ * Use sched_set_fifo(), read its comment.
51696450 *
51706451 * Return: 0 on success. An error code otherwise.
51716452 *
....@@ -5188,6 +6469,7 @@
51886469 {
51896470 return __sched_setscheduler(p, attr, false, true);
51906471 }
6472
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
51916473
51926474 /**
51936475 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
....@@ -5208,6 +6490,51 @@
52086490 return _sched_setscheduler(p, policy, param, false);
52096491 }
52106492 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
6493
+
6494
+/*
6495
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
6496
+ * incapable of resource management, which is the one thing an OS really should
6497
+ * be doing.
6498
+ *
6499
+ * This is of course the reason it is limited to privileged users only.
6500
+ *
6501
+ * Worse still; it is fundamentally impossible to compose static priority
6502
+ * workloads. You cannot take two correctly working static prio workloads
6503
+ * and smash them together and still expect them to work.
6504
+ *
6505
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
6506
+ *
6507
+ * MAX_RT_PRIO / 2
6508
+ *
6509
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
6510
+ * know enough information to make a sensible choice.
6511
+ */
6512
+void sched_set_fifo(struct task_struct *p)
6513
+{
6514
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
6515
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6516
+}
6517
+EXPORT_SYMBOL_GPL(sched_set_fifo);
6518
+
6519
+/*
6520
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
6521
+ */
6522
+void sched_set_fifo_low(struct task_struct *p)
6523
+{
6524
+ struct sched_param sp = { .sched_priority = 1 };
6525
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6526
+}
6527
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
6528
+
6529
+void sched_set_normal(struct task_struct *p, int nice)
6530
+{
6531
+ struct sched_attr attr = {
6532
+ .sched_policy = SCHED_NORMAL,
6533
+ .sched_nice = nice,
6534
+ };
6535
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
6536
+}
6537
+EXPORT_SYMBOL_GPL(sched_set_normal);
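
From userspace, the administrator-side counterpart of the comment above is sched_setscheduler(2). A hedged sketch that puts an existing task at SCHED_FIFO priority 50, roughly the MAX_RT_PRIO/2 midpoint used by the kernel helpers; target_pid is a placeholder and the call requires CAP_SYS_NICE:

#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

int main(void)
{
	pid_t target_pid = 1234;	/* placeholder: pid of the task to adjust */
	struct sched_param sp = { .sched_priority = 50 };

	if (sched_setscheduler(target_pid, SCHED_FIFO, &sp) == -1) {
		perror("sched_setscheduler");
		return 1;
	}
	return 0;
}
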
52116538
52126539 static int
52136540 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
....@@ -5239,9 +6566,6 @@
52396566 u32 size;
52406567 int ret;
52416568
5242
- if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
5243
- return -EFAULT;
5244
-
52456569 /* Zero the full structure, so that a short copy will be nice: */
52466570 memset(attr, 0, sizeof(*attr));
52476571
....@@ -5249,44 +6573,18 @@
52496573 if (ret)
52506574 return ret;
52516575
5252
- /* Bail out on silly large: */
5253
- if (size > PAGE_SIZE)
5254
- goto err_size;
5255
-
52566576 /* ABI compatibility quirk: */
52576577 if (!size)
52586578 size = SCHED_ATTR_SIZE_VER0;
5259
-
5260
- if (size < SCHED_ATTR_SIZE_VER0)
6579
+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
52616580 goto err_size;
52626581
5263
- /*
5264
- * If we're handed a bigger struct than we know of,
5265
- * ensure all the unknown bits are 0 - i.e. new
5266
- * user-space does not rely on any kernel feature
5267
- * extensions we dont know about yet.
5268
- */
5269
- if (size > sizeof(*attr)) {
5270
- unsigned char __user *addr;
5271
- unsigned char __user *end;
5272
- unsigned char val;
5273
-
5274
- addr = (void __user *)uattr + sizeof(*attr);
5275
- end = (void __user *)uattr + size;
5276
-
5277
- for (; addr < end; addr++) {
5278
- ret = get_user(val, addr);
5279
- if (ret)
5280
- return ret;
5281
- if (val)
5282
- goto err_size;
5283
- }
5284
- size = sizeof(*attr);
6582
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
6583
+ if (ret) {
6584
+ if (ret == -E2BIG)
6585
+ goto err_size;
6586
+ return ret;
52856587 }
5286
-
5287
- ret = copy_from_user(attr, uattr, size);
5288
- if (ret)
5289
- return -EFAULT;
52906588
52916589 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
52926590 size < SCHED_ATTR_SIZE_VER1)
....@@ -5303,6 +6601,16 @@
53036601 err_size:
53046602 put_user(sizeof(*attr), &uattr->size);
53056603 return -E2BIG;
6604
+}
6605
+
6606
+static void get_params(struct task_struct *p, struct sched_attr *attr)
6607
+{
6608
+ if (task_has_dl_policy(p))
6609
+ __getparam_dl(p, attr);
6610
+ else if (task_has_rt_policy(p))
6611
+ attr->sched_priority = p->rt_priority;
6612
+ else
6613
+ attr->sched_nice = task_nice(p);
53066614 }
53076615
53086616 /**
....@@ -5366,6 +6674,8 @@
53666674 rcu_read_unlock();
53676675
53686676 if (likely(p)) {
6677
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
6678
+ get_params(p, &attr);
53696679 retval = sched_setattr(p, &attr);
53706680 put_task_struct(p);
53716681 }
....@@ -5459,7 +6769,7 @@
54596769 {
54606770 unsigned int ksize = sizeof(*kattr);
54616771
5462
- if (!access_ok(VERIFY_WRITE, uattr, usize))
6772
+ if (!access_ok(uattr, usize))
54636773 return -EFAULT;
54646774
54656775 /*
....@@ -5487,7 +6797,7 @@
54876797 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
54886798 * @pid: the pid in question.
54896799 * @uattr: structure containing the extended parameters.
5490
- * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
6800
+ * @usize: sizeof(attr) for fwd/bwd comp.
54916801 * @flags: for future extension.
54926802 */
54936803 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
....@@ -5514,14 +6824,15 @@
55146824 kattr.sched_policy = p->policy;
55156825 if (p->sched_reset_on_fork)
55166826 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5517
- if (task_has_dl_policy(p))
5518
- __getparam_dl(p, &kattr);
5519
- else if (task_has_rt_policy(p))
5520
- kattr.sched_priority = p->rt_priority;
5521
- else
5522
- kattr.sched_nice = task_nice(p);
6827
+ get_params(p, &kattr);
6828
+ kattr.sched_flags &= SCHED_FLAG_ALL;
55236829
55246830 #ifdef CONFIG_UCLAMP_TASK
6831
+ /*
6832
+ * This could race with another potential updater, but this is fine
6833
+ * because it'll correctly read the old or the new value. We don't need
6834
+ * to guarantee who wins the race as long as it doesn't return garbage.
6835
+ */
55256836 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
55266837 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
55276838 #endif
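
There is no glibc wrapper for sched_getattr(2), so a userspace reader of these fields goes through syscall(2). A sketch, assuming __NR_sched_getattr is provided via <sys/syscall.h> and using the SCHED_ATTR_SIZE_VER1 layout documented in the sched_setattr(2) man page:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* SCHED_DEADLINE parameters */
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;	/* VER1: utilization clamps */
	uint32_t sched_util_max;
};

int main(void)
{
	struct sched_attr attr = { 0 };

	/* pid 0 queries the calling thread; the final 0 is the flags argument. */
	if (syscall(__NR_sched_getattr, 0, &attr, sizeof(attr), 0) == -1) {
		perror("sched_getattr");
		return 1;
	}
	printf("policy=%u nice=%d util_min=%u util_max=%u\n",
	       (unsigned)attr.sched_policy, (int)attr.sched_nice,
	       (unsigned)attr.sched_util_min, (unsigned)attr.sched_util_max);
	return 0;
}
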
....@@ -5540,6 +6851,7 @@
55406851 cpumask_var_t cpus_allowed, new_mask;
55416852 struct task_struct *p;
55426853 int retval;
6854
+ int skip = 0;
55436855
55446856 rcu_read_lock();
55456857
....@@ -5575,6 +6887,9 @@
55756887 rcu_read_unlock();
55766888 }
55776889
6890
+ trace_android_vh_sched_setaffinity_early(p, in_mask, &skip);
6891
+ if (skip)
6892
+ goto out_free_new_mask;
55786893 retval = security_task_setscheduler(p);
55796894 if (retval)
55806895 goto out_free_new_mask;
....@@ -5601,7 +6916,7 @@
56016916 }
56026917 #endif
56036918 again:
5604
- retval = __set_cpus_allowed_ptr(p, new_mask, true);
6919
+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
56056920
56066921 if (!retval) {
56076922 cpuset_cpus_allowed(p, cpus_allowed);
....@@ -5615,6 +6930,9 @@
56156930 goto again;
56166931 }
56176932 }
6933
+
6934
+ trace_android_rvh_sched_setaffinity(p, in_mask, &retval);
6935
+
56186936 out_free_new_mask:
56196937 free_cpumask_var(new_mask);
56206938 out_free_cpus_allowed:
....@@ -5623,7 +6941,6 @@
56236941 put_task_struct(p);
56246942 return retval;
56256943 }
5626
-EXPORT_SYMBOL_GPL(sched_setaffinity);
56276944
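
The user-visible entry point for this path is sched_setaffinity(2). A minimal sketch, not part of the patch, that restricts the calling thread to CPU 0:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);	/* allow CPU 0 only */

	/* pid 0 means the calling thread. */
	if (sched_setaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_setaffinity");
		return 1;
	}
	printf("now restricted to CPU 0\n");
	return 0;
}
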
56286945 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
56296946 struct cpumask *new_mask)
....@@ -5742,6 +7059,8 @@
57427059 schedstat_inc(rq->yld_count);
57437060 current->sched_class->yield_task(rq);
57447061
7062
+ trace_android_rvh_do_sched_yield(rq);
7063
+
57457064 preempt_disable();
57467065 rq_unlock_irq(rq, &rf);
57477066 sched_preempt_enable_no_resched();
....@@ -5755,7 +7074,7 @@
57557074 return 0;
57567075 }
57577076
5758
-#ifndef CONFIG_PREEMPT
7077
+#ifndef CONFIG_PREEMPTION
57597078 int __sched _cond_resched(void)
57607079 {
57617080 if (should_resched(0)) {
....@@ -5772,7 +7091,7 @@
57727091 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
57737092 * call schedule, and on return reacquire the lock.
57747093 *
5775
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
7094
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
57767095 * operations here to prevent schedule() from being called twice (once via
57777096 * spin_unlock(), once by hand).
57787097 */
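
A hedged in-kernel sketch of the usual calling pattern for this helper through cond_resched_lock(); my_lock, my_list and struct foo are illustrative names, not from this file:

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct foo {
	struct list_head node;
};

static LIST_HEAD(my_list);
static DEFINE_SPINLOCK(my_lock);

static void scan_all(void)
{
	struct foo *f;

	spin_lock(&my_lock);
	list_for_each_entry(f, &my_list, node) {
		/* ... per-entry work ... */

		/*
		 * If a reschedule is pending, cond_resched_lock() drops
		 * my_lock, calls schedule() and retakes the lock. Bail out
		 * here because the list may have changed in the meantime.
		 */
		if (cond_resched_lock(&my_lock))
			break;
	}
	spin_unlock(&my_lock);
}
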
....@@ -5876,7 +7195,7 @@
58767195 if (task_running(p_rq, p) || p->state)
58777196 goto out_unlock;
58787197
5879
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
7198
+ yielded = curr->sched_class->yield_to_task(rq, p);
58807199 if (yielded) {
58817200 schedstat_inc(rq->yld_count);
58827201 /*
....@@ -6042,7 +7361,7 @@
60427361 * an error code.
60437362 */
60447363 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6045
- struct timespec __user *, interval)
7364
+ struct __kernel_timespec __user *, interval)
60467365 {
60477366 struct timespec64 t;
60487367 int retval = sched_rr_get_interval(pid, &t);
....@@ -6053,16 +7372,15 @@
60537372 return retval;
60547373 }
60557374
6056
-#ifdef CONFIG_COMPAT
6057
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
6058
- compat_pid_t, pid,
6059
- struct compat_timespec __user *, interval)
7375
+#ifdef CONFIG_COMPAT_32BIT_TIME
7376
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
7377
+ struct old_timespec32 __user *, interval)
60607378 {
60617379 struct timespec64 t;
60627380 int retval = sched_rr_get_interval(pid, &t);
60637381
60647382 if (retval == 0)
6065
- retval = compat_put_timespec64(&t, interval);
7383
+ retval = put_old_timespec32(&t, interval);
60667384 return retval;
60677385 }
60687386 #endif
....@@ -6075,10 +7393,10 @@
60757393 if (!try_get_task_stack(p))
60767394 return;
60777395
6078
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
7396
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
60797397
60807398 if (p->state == TASK_RUNNING)
6081
- printk(KERN_CONT " running task ");
7399
+ pr_cont(" running task ");
60827400 #ifdef CONFIG_DEBUG_STACK_USAGE
60837401 free = stack_not_used(p);
60847402 #endif
....@@ -6087,12 +7405,13 @@
60877405 if (pid_alive(p))
60887406 ppid = task_pid_nr(rcu_dereference(p->real_parent));
60897407 rcu_read_unlock();
6090
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6091
- task_pid_nr(p), ppid,
7408
+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
7409
+ free, task_pid_nr(p), ppid,
60927410 (unsigned long)task_thread_info(p)->flags);
60937411
60947412 print_worker_info(KERN_INFO, p);
6095
- show_stack(p, NULL);
7413
+ trace_android_vh_sched_show_task(p);
7414
+ show_stack(p, NULL, KERN_INFO);
60967415 put_task_stack(p);
60977416 }
60987417 EXPORT_SYMBOL_GPL(sched_show_task);
....@@ -6123,13 +7442,6 @@
61237442 {
61247443 struct task_struct *g, *p;
61257444
6126
-#if BITS_PER_LONG == 32
6127
- printk(KERN_INFO
6128
- " task PC stack pid father\n");
6129
-#else
6130
- printk(KERN_INFO
6131
- " task PC stack pid father\n");
6132
-#endif
61337445 rcu_read_lock();
61347446 for_each_process_thread(g, p) {
61357447 /*
....@@ -6165,7 +7477,7 @@
61657477 * NOTE: this function does not set the idle thread's NEED_RESCHED
61667478 * flag, to make booting more robust.
61677479 */
6168
-void init_idle(struct task_struct *idle, int cpu)
7480
+void __init init_idle(struct task_struct *idle, int cpu)
61697481 {
61707482 struct rq *rq = cpu_rq(cpu);
61717483 unsigned long flags;
....@@ -6179,9 +7491,6 @@
61797491 idle->se.exec_start = sched_clock();
61807492 idle->flags |= PF_IDLE;
61817493
6182
- scs_task_reset(idle);
6183
- kasan_unpoison_task_stack(idle);
6184
-
61857494 #ifdef CONFIG_SMP
61867495 /*
61877496	 * It's possible that init_idle() gets called multiple times on a task,
....@@ -6189,7 +7498,7 @@
61897498 *
61907499 * And since this is boot we can forgo the serialization.
61917500 */
6192
- set_cpus_allowed_common(idle, cpumask_of(cpu));
7501
+ set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
61937502 #endif
61947503 /*
61957504 * We're having a chicken and egg problem, even though we are
....@@ -6205,7 +7514,8 @@
62057514 __set_task_cpu(idle, cpu);
62067515 rcu_read_unlock();
62077516
6208
- rq->curr = rq->idle = idle;
7517
+ rq->idle = idle;
7518
+ rcu_assign_pointer(rq->curr, idle);
62097519 idle->on_rq = TASK_ON_RQ_QUEUED;
62107520 #ifdef CONFIG_SMP
62117521 idle->on_cpu = 1;
....@@ -6245,7 +7555,7 @@
62457555 }
62467556
62477557 int task_can_attach(struct task_struct *p,
6248
- const struct cpumask *cs_cpus_allowed)
7558
+ const struct cpumask *cs_effective_cpus)
62497559 {
62507560 int ret = 0;
62517561
....@@ -6264,8 +7574,13 @@
62647574 }
62657575
62667576 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
6267
- cs_cpus_allowed))
6268
- ret = dl_task_can_attach(p, cs_cpus_allowed);
7577
+ cs_effective_cpus)) {
7578
+ int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
7579
+
7580
+ if (unlikely(cpu >= nr_cpu_ids))
7581
+ return -EINVAL;
7582
+ ret = dl_cpu_busy(cpu, p);
7583
+ }
62697584
62707585 out:
62717586 return ret;
....@@ -6316,7 +7631,7 @@
63167631 if (queued)
63177632 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
63187633 if (running)
6319
- set_curr_task(rq, p);
7634
+ set_next_task(rq, p);
63207635 task_rq_unlock(rq, p, &rf);
63217636 }
63227637 #endif /* CONFIG_NUMA_BALANCING */
....@@ -6342,125 +7657,163 @@
63427657 /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
63437658 }
63447659
6345
-/*
6346
- * Since this CPU is going 'away' for a while, fold any nr_active delta
6347
- * we might have. Assumes we're called after migrate_tasks() so that the
6348
- * nr_active count is stable. We need to take the teardown thread which
6349
- * is calling this into account, so we hand in adjust = 1 to the load
6350
- * calculation.
6351
- *
6352
- * Also see the comment "Global load-average calculations".
6353
- */
6354
-static void calc_load_migrate(struct rq *rq)
7660
+static int __balance_push_cpu_stop(void *arg)
63557661 {
6356
- long delta = calc_load_fold_active(rq, 1);
6357
- if (delta)
6358
- atomic_long_add(delta, &calc_load_tasks);
6359
-}
7662
+ struct task_struct *p = arg;
7663
+ struct rq *rq = this_rq();
7664
+ struct rq_flags rf;
7665
+ int cpu;
63607666
6361
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
6362
-{
6363
-}
7667
+ raw_spin_lock_irq(&p->pi_lock);
7668
+ rq_lock(rq, &rf);
63647669
6365
-static const struct sched_class fake_sched_class = {
6366
- .put_prev_task = put_prev_task_fake,
6367
-};
6368
-
6369
-static struct task_struct fake_task = {
6370
- /*
6371
- * Avoid pull_{rt,dl}_task()
6372
- */
6373
- .prio = MAX_PRIO + 1,
6374
- .sched_class = &fake_sched_class,
6375
-};
6376
-
6377
-/*
6378
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
6379
- * try_to_wake_up()->select_task_rq().
6380
- *
6381
- * Called with rq->lock held even though we'er in stop_machine() and
6382
- * there's no concurrency possible, we hold the required locks anyway
6383
- * because of lock validation efforts.
6384
- */
6385
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
6386
-{
6387
- struct rq *rq = dead_rq;
6388
- struct task_struct *next, *stop = rq->stop;
6389
- struct rq_flags orf = *rf;
6390
- int dest_cpu;
6391
-
6392
- /*
6393
- * Fudge the rq selection such that the below task selection loop
6394
- * doesn't get stuck on the currently eligible stop task.
6395
- *
6396
- * We're currently inside stop_machine() and the rq is either stuck
6397
- * in the stop_machine_cpu_stop() loop, or we're executing this code,
6398
- * either way we should never end up calling schedule() until we're
6399
- * done here.
6400
- */
6401
- rq->stop = NULL;
6402
-
6403
- /*
6404
- * put_prev_task() and pick_next_task() sched
6405
- * class method both need to have an up-to-date
6406
- * value of rq->clock[_task]
6407
- */
64087670 update_rq_clock(rq);
64097671
6410
- for (;;) {
6411
- /*
6412
- * There's this thread running, bail when that's the only
6413
- * remaining thread:
6414
- */
6415
- if (rq->nr_running == 1)
6416
- break;
6417
-
6418
- /*
6419
- * pick_next_task() assumes pinned rq->lock:
6420
- */
6421
- next = pick_next_task(rq, &fake_task, rf);
6422
- BUG_ON(!next);
6423
- put_prev_task(rq, next);
6424
-
6425
- WARN_ON_ONCE(__migrate_disabled(next));
6426
-
6427
- /*
6428
- * Rules for changing task_struct::cpus_mask are holding
6429
- * both pi_lock and rq->lock, such that holding either
6430
- * stabilizes the mask.
6431
- *
6432
- * Drop rq->lock is not quite as disastrous as it usually is
6433
- * because !cpu_active at this point, which means load-balance
6434
- * will not interfere. Also, stop-machine.
6435
- */
6436
- rq_unlock(rq, rf);
6437
- raw_spin_lock(&next->pi_lock);
6438
- rq_relock(rq, rf);
6439
-
6440
- /*
6441
- * Since we're inside stop-machine, _nothing_ should have
6442
- * changed the task, WARN if weird stuff happened, because in
6443
- * that case the above rq->lock drop is a fail too.
6444
- */
6445
- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
6446
- raw_spin_unlock(&next->pi_lock);
6447
- continue;
6448
- }
6449
-
6450
- /* Find suitable destination for @next, with force if needed. */
6451
- dest_cpu = select_fallback_rq(dead_rq->cpu, next);
6452
- rq = __migrate_task(rq, rf, next, dest_cpu);
6453
- if (rq != dead_rq) {
6454
- rq_unlock(rq, rf);
6455
- rq = dead_rq;
6456
- *rf = orf;
6457
- rq_relock(rq, rf);
6458
- }
6459
- raw_spin_unlock(&next->pi_lock);
7672
+ if (task_rq(p) == rq && task_on_rq_queued(p)) {
7673
+ cpu = select_fallback_rq(rq->cpu, p);
7674
+ rq = __migrate_task(rq, &rf, p, cpu);
64607675 }
64617676
6462
- rq->stop = stop;
7677
+ rq_unlock(rq, &rf);
7678
+ raw_spin_unlock_irq(&p->pi_lock);
7679
+
7680
+ put_task_struct(p);
7681
+
7682
+ return 0;
64637683 }
7684
+
7685
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
7686
+
7687
+/*
7688
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
7689
+ */
7690
+
7691
+
7692
+static void balance_push(struct rq *rq)
7693
+{
7694
+ struct task_struct *push_task = rq->curr;
7695
+
7696
+ lockdep_assert_held(&rq->lock);
7697
+ SCHED_WARN_ON(rq->cpu != smp_processor_id());
7698
+
7699
+ /*
7700
+ * Both the cpu-hotplug and stop task are in this case and are
7701
+ * required to complete the hotplug process.
7702
+ */
7703
+ if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
7704
+ /*
7705
+ * If this is the idle task on the outgoing CPU try to wake
7706
+ * up the hotplug control thread which might wait for the
7707
+ * last task to vanish. The rcuwait_active() check is
7708
+ * accurate here because the waiter is pinned on this CPU
7709
+ * and can't obviously be running in parallel.
7710
+ *
7711
+ * On RT kernels this also has to check whether there are
7712
+ * pinned and scheduled out tasks on the runqueue. They
7713
+ * need to leave the migrate disabled section first.
7714
+ */
7715
+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
7716
+ rcuwait_active(&rq->hotplug_wait)) {
7717
+ raw_spin_unlock(&rq->lock);
7718
+ rcuwait_wake_up(&rq->hotplug_wait);
7719
+ raw_spin_lock(&rq->lock);
7720
+ }
7721
+ return;
7722
+ }
7723
+
7724
+ get_task_struct(push_task);
7725
+ /*
7726
+ * Temporarily drop rq->lock such that we can wake-up the stop task.
7727
+ * Both preemption and IRQs are still disabled.
7728
+ */
7729
+ raw_spin_unlock(&rq->lock);
7730
+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
7731
+ this_cpu_ptr(&push_work));
7732
+ /*
7733
+ * At this point need_resched() is true and we'll take the loop in
7734
+ * schedule(). The next pick is obviously going to be the stop task
7735
+ * which is_per_cpu_kthread() and will push this task away.
7736
+ */
7737
+ raw_spin_lock(&rq->lock);
7738
+}
7739
+
7740
+static void balance_push_set(int cpu, bool on)
7741
+{
7742
+ struct rq *rq = cpu_rq(cpu);
7743
+ struct rq_flags rf;
7744
+
7745
+ rq_lock_irqsave(rq, &rf);
7746
+ if (on)
7747
+ rq->balance_flags |= BALANCE_PUSH;
7748
+ else
7749
+ rq->balance_flags &= ~BALANCE_PUSH;
7750
+ rq_unlock_irqrestore(rq, &rf);
7751
+}
7752
+
7753
+/*
7754
+ * Invoked from a CPU's hotplug control thread after the CPU has been marked
7755
+ * inactive. All tasks which are not per CPU kernel threads are either
7756
+ * pushed off this CPU now via balance_push() or placed on a different CPU
7757
+ * during wakeup. Wait until the CPU is quiescent.
7758
+ */
7759
+static void balance_hotplug_wait(void)
7760
+{
7761
+ struct rq *rq = this_rq();
7762
+
7763
+ rcuwait_wait_event(&rq->hotplug_wait,
7764
+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
7765
+ TASK_UNINTERRUPTIBLE);
7766
+}
7767
+
7768
+static int drain_rq_cpu_stop(void *data)
7769
+{
7770
+#ifndef CONFIG_PREEMPT_RT
7771
+ struct rq *rq = this_rq();
7772
+ struct rq_flags rf;
7773
+
7774
+ rq_lock_irqsave(rq, &rf);
7775
+ migrate_tasks(rq, &rf, false);
7776
+ rq_unlock_irqrestore(rq, &rf);
7777
+#endif
7778
+ return 0;
7779
+}
7780
+
7781
+int sched_cpu_drain_rq(unsigned int cpu)
7782
+{
7783
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7784
+ struct cpu_stop_done *rq_drain_done = &(cpu_rq(cpu)->drain_done);
7785
+
7786
+ if (idle_cpu(cpu)) {
7787
+ rq_drain->done = NULL;
7788
+ return 0;
7789
+ }
7790
+
7791
+ return stop_one_cpu_async(cpu, drain_rq_cpu_stop, NULL, rq_drain,
7792
+ rq_drain_done);
7793
+}
7794
+
7795
+void sched_cpu_drain_rq_wait(unsigned int cpu)
7796
+{
7797
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7798
+
7799
+ if (rq_drain->done)
7800
+ cpu_stop_work_wait(rq_drain);
7801
+}
7802
+
7803
+#else
7804
+
7805
+static inline void balance_push(struct rq *rq)
7806
+{
7807
+}
7808
+
7809
+static inline void balance_push_set(int cpu, bool on)
7810
+{
7811
+}
7812
+
7813
+static inline void balance_hotplug_wait(void)
7814
+{
7815
+}
7816
+
64647817 #endif /* CONFIG_HOTPLUG_CPU */
64657818
64667819 void set_rq_online(struct rq *rq)
....@@ -6531,8 +7884,10 @@
65317884 static int cpuset_cpu_inactive(unsigned int cpu)
65327885 {
65337886 if (!cpuhp_tasks_frozen) {
6534
- if (dl_cpu_busy(cpu))
6535
- return -EBUSY;
7887
+ int ret = dl_cpu_busy(cpu, NULL);
7888
+
7889
+ if (ret)
7890
+ return ret;
65367891 cpuset_update_active_cpus();
65377892 } else {
65387893 num_cpus_frozen++;
....@@ -6545,6 +7900,8 @@
65457900 {
65467901 struct rq *rq = cpu_rq(cpu);
65477902 struct rq_flags rf;
7903
+
7904
+ balance_push_set(cpu, false);
65487905
65497906 #ifdef CONFIG_SCHED_SMT
65507907 /*
....@@ -6581,19 +7938,39 @@
65817938 return 0;
65827939 }
65837940
6584
-int sched_cpu_deactivate(unsigned int cpu)
7941
+int sched_cpus_activate(struct cpumask *cpus)
65857942 {
7943
+ unsigned int cpu;
7944
+
7945
+ for_each_cpu(cpu, cpus) {
7946
+ if (sched_cpu_activate(cpu)) {
7947
+ for_each_cpu_and(cpu, cpus, cpu_active_mask)
7948
+ sched_cpu_deactivate(cpu);
7949
+
7950
+ return -EBUSY;
7951
+ }
7952
+ }
7953
+
7954
+ return 0;
7955
+}
7956
+
7957
+int _sched_cpu_deactivate(unsigned int cpu)
7958
+{
7959
+ struct rq *rq = cpu_rq(cpu);
7960
+ struct rq_flags rf;
65867961 int ret;
65877962
65887963 set_cpu_active(cpu, false);
6589
- /*
6590
- * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
6591
- * users of this state to go away such that all new such users will
6592
- * observe it.
6593
- *
6594
- * Do sync before park smpboot threads to take care the rcu boost case.
6595
- */
6596
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
7964
+
7965
+ balance_push_set(cpu, true);
7966
+
7967
+ rq_lock_irqsave(rq, &rf);
7968
+ if (rq->rd) {
7969
+ update_rq_clock(rq);
7970
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7971
+ set_rq_offline(rq);
7972
+ }
7973
+ rq_unlock_irqrestore(rq, &rf);
65977974
65987975 #ifdef CONFIG_SCHED_SMT
65997976 /*
....@@ -6608,10 +7985,51 @@
66087985
66097986 ret = cpuset_cpu_inactive(cpu);
66107987 if (ret) {
7988
+ balance_push_set(cpu, false);
66117989 set_cpu_active(cpu, true);
66127990 return ret;
66137991 }
66147992 sched_domains_numa_masks_clear(cpu);
7993
+
7994
+ update_max_interval();
7995
+
7996
+ return 0;
7997
+}
7998
+
7999
+int sched_cpu_deactivate(unsigned int cpu)
8000
+{
8001
+ int ret = _sched_cpu_deactivate(cpu);
8002
+
8003
+ if (ret)
8004
+ return ret;
8005
+
8006
+ /*
8007
+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
8008
+ * users of this state to go away such that all new such users will
8009
+ * observe it.
8010
+ *
8011
+ * Do sync before park smpboot threads to take care the rcu boost case.
8012
+ */
8013
+ synchronize_rcu();
8014
+
8015
+ return 0;
8016
+}
8017
+
8018
+int sched_cpus_deactivate_nosync(struct cpumask *cpus)
8019
+{
8020
+ unsigned int cpu;
8021
+
8022
+ for_each_cpu(cpu, cpus) {
8023
+ if (_sched_cpu_deactivate(cpu)) {
8024
+ for_each_cpu(cpu, cpus) {
8025
+ if (!cpu_active(cpu))
8026
+ sched_cpu_activate(cpu);
8027
+ }
8028
+
8029
+ return -EBUSY;
8030
+ }
8031
+ }
8032
+
66158033 return 0;
66168034 }
66178035
....@@ -6620,37 +8038,67 @@
66208038 struct rq *rq = cpu_rq(cpu);
66218039
66228040 rq->calc_load_update = calc_load_update;
6623
- update_max_interval();
66248041 }
66258042
66268043 int sched_cpu_starting(unsigned int cpu)
66278044 {
66288045 sched_rq_cpu_starting(cpu);
66298046 sched_tick_start(cpu);
8047
+ trace_android_rvh_sched_cpu_starting(cpu);
66308048 return 0;
66318049 }
66328050
66338051 #ifdef CONFIG_HOTPLUG_CPU
8052
+
8053
+/*
8054
+ * Invoked immediately before the stopper thread is invoked to bring the
8055
+ * CPU down completely. At this point all per CPU kthreads except the
8056
+ * hotplug thread (current) and the stopper thread (inactive) have been
8057
+ * either parked or have been unbound from the outgoing CPU. Ensure that
8058
+ * any of those which might be on the way out are gone.
8059
+ *
8060
+ * If after this point a bound task is being woken on this CPU then the
8061
+ * responsible hotplug callback has failed to do its job.
8062
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
8063
+ */
8064
+int sched_cpu_wait_empty(unsigned int cpu)
8065
+{
8066
+ balance_hotplug_wait();
8067
+ return 0;
8068
+}
8069
+
8070
+/*
8071
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
8072
+ * might have. Called from the CPU stopper task after ensuring that the
8073
+ * stopper is the last running task on the CPU, so nr_active count is
8074
+ * stable. We need to take the teardown thread which is calling this into
8075
+ * account, so we hand in adjust = 1 to the load calculation.
8076
+ *
8077
+ * Also see the comment "Global load-average calculations".
8078
+ */
8079
+static void calc_load_migrate(struct rq *rq)
8080
+{
8081
+ long delta = calc_load_fold_active(rq, 1);
8082
+
8083
+ if (delta)
8084
+ atomic_long_add(delta, &calc_load_tasks);
8085
+}
8086
+
66348087 int sched_cpu_dying(unsigned int cpu)
66358088 {
66368089 struct rq *rq = cpu_rq(cpu);
66378090 struct rq_flags rf;
66388091
66398092 /* Handle pending wakeups and then migrate everything off */
6640
- sched_ttwu_pending();
66418093 sched_tick_stop(cpu);
66428094
66438095 rq_lock_irqsave(rq, &rf);
6644
- if (rq->rd) {
6645
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6646
- set_rq_offline(rq);
6647
- }
6648
- migrate_tasks(rq, &rf);
6649
- BUG_ON(rq->nr_running != 1);
8096
+ BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
66508097 rq_unlock_irqrestore(rq, &rf);
66518098
8099
+ trace_android_rvh_sched_cpu_dying(cpu);
8100
+
66528101 calc_load_migrate(rq);
6653
- update_max_interval();
66548102 nohz_balance_exit_idle(rq);
66558103 hrtick_clear(rq);
66568104 return 0;
....@@ -6664,18 +8112,16 @@
66648112 /*
66658113 * There's no userspace yet to cause hotplug operations; hence all the
66668114 * CPU masks are stable and all blatant races in the below code cannot
6667
- * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
6668
- * but there won't be any contention on it.
8115
+ * happen.
66698116 */
6670
- cpus_read_lock();
66718117 mutex_lock(&sched_domains_mutex);
66728118 sched_init_domains(cpu_active_mask);
66738119 mutex_unlock(&sched_domains_mutex);
6674
- cpus_read_unlock();
66758120
66768121 /* Move init over to a non-isolated CPU */
66778122 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
66788123 BUG();
8124
+
66798125 sched_init_granularity();
66808126
66818127 init_sched_rt_class();
....@@ -6686,7 +8132,7 @@
66868132
66878133 static int __init migration_init(void)
66888134 {
6689
- sched_rq_cpu_starting(smp_processor_id());
8135
+ sched_cpu_starting(smp_processor_id());
66908136 return 0;
66918137 }
66928138 early_initcall(migration_init);
....@@ -6711,7 +8157,9 @@
67118157 * Every task in system belongs to this group at bootup.
67128158 */
67138159 struct task_group root_task_group;
8160
+EXPORT_SYMBOL_GPL(root_task_group);
67148161 LIST_HEAD(task_groups);
8162
+EXPORT_SYMBOL_GPL(task_groups);
67158163
67168164 /* Cacheline aligned slab cache for task_group */
67178165 static struct kmem_cache *task_group_cache __read_mostly;
....@@ -6722,19 +8170,27 @@
67228170
67238171 void __init sched_init(void)
67248172 {
6725
- int i, j;
6726
- unsigned long alloc_size = 0, ptr;
8173
+ unsigned long ptr = 0;
8174
+ int i;
8175
+
8176
+ /* Make sure the linker didn't screw up */
8177
+ BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
8178
+ &fair_sched_class + 1 != &rt_sched_class ||
8179
+ &rt_sched_class + 1 != &dl_sched_class);
8180
+#ifdef CONFIG_SMP
8181
+ BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
8182
+#endif
67278183
67288184 wait_bit_init();
67298185
67308186 #ifdef CONFIG_FAIR_GROUP_SCHED
6731
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8187
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
67328188 #endif
67338189 #ifdef CONFIG_RT_GROUP_SCHED
6734
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8190
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
67358191 #endif
6736
- if (alloc_size) {
6737
- ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8192
+ if (ptr) {
8193
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
67388194
67398195 #ifdef CONFIG_FAIR_GROUP_SCHED
67408196 root_task_group.se = (struct sched_entity **)ptr;
....@@ -6743,6 +8199,8 @@
67438199 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
67448200 ptr += nr_cpu_ids * sizeof(void **);
67458201
8202
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8203
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
67468204 #endif /* CONFIG_FAIR_GROUP_SCHED */
67478205 #ifdef CONFIG_RT_GROUP_SCHED
67488206 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
....@@ -6795,7 +8253,6 @@
67958253 init_rt_rq(&rq->rt);
67968254 init_dl_rq(&rq->dl);
67978255 #ifdef CONFIG_FAIR_GROUP_SCHED
6798
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
67998256 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
68008257 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
68018258 /*
....@@ -6817,7 +8274,6 @@
68178274 * We achieve this by letting root_task_group's tasks sit
68188275 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
68198276 */
6820
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
68218277 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
68228278 #endif /* CONFIG_FAIR_GROUP_SCHED */
68238279
....@@ -6825,10 +8281,6 @@
68258281 #ifdef CONFIG_RT_GROUP_SCHED
68268282 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
68278283 #endif
6828
-
6829
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6830
- rq->cpu_load[j] = 0;
6831
-
68328284 #ifdef CONFIG_SMP
68338285 rq->sd = NULL;
68348286 rq->rd = NULL;
....@@ -6847,16 +8299,20 @@
68478299
68488300 rq_attach_root(rq, &def_root_domain);
68498301 #ifdef CONFIG_NO_HZ_COMMON
6850
- rq->last_load_update_tick = jiffies;
68518302 rq->last_blocked_load_update_tick = jiffies;
68528303 atomic_set(&rq->nohz_flags, 0);
8304
+
8305
+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
8306
+#endif
8307
+#ifdef CONFIG_HOTPLUG_CPU
8308
+ rcuwait_init(&rq->hotplug_wait);
68538309 #endif
68548310 #endif /* CONFIG_SMP */
68558311 hrtick_rq_init(rq);
68568312 atomic_set(&rq->nr_iowait, 0);
68578313 }
68588314
6859
- set_load_weight(&init_task, false);
8315
+ set_load_weight(&init_task);
68608316
68618317 /*
68628318 * The boot idle thread does lazy MMU switching as well:
....@@ -6925,7 +8381,7 @@
69258381 rcu_sleep_check();
69268382
69278383 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6928
- !is_idle_task(current)) ||
8384
+ !is_idle_task(current) && !current->non_block_count) ||
69298385 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
69308386 oops_in_progress)
69318387 return;
....@@ -6941,8 +8397,8 @@
69418397 "BUG: sleeping function called from invalid context at %s:%d\n",
69428398 file, line);
69438399 printk(KERN_ERR
6944
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6945
- in_atomic(), irqs_disabled(),
8400
+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
8401
+ in_atomic(), irqs_disabled(), current->non_block_count,
69468402 current->pid, current->comm);
69478403
69488404 if (task_stack_end_corrupted(current))
....@@ -6954,13 +8410,76 @@
69548410 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
69558411 && !preempt_count_equals(preempt_offset)) {
69568412 pr_err("Preemption disabled at:");
6957
- print_ip_sym(preempt_disable_ip);
6958
- pr_cont("\n");
8413
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
69598414 }
8415
+
8416
+ trace_android_rvh_schedule_bug(NULL);
8417
+
69608418 dump_stack();
69618419 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
69628420 }
69638421 EXPORT_SYMBOL(___might_sleep);
8422
+
8423
+void __cant_sleep(const char *file, int line, int preempt_offset)
8424
+{
8425
+ static unsigned long prev_jiffy;
8426
+
8427
+ if (irqs_disabled())
8428
+ return;
8429
+
8430
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
8431
+ return;
8432
+
8433
+ if (preempt_count() > preempt_offset)
8434
+ return;
8435
+
8436
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8437
+ return;
8438
+ prev_jiffy = jiffies;
8439
+
8440
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
8441
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8442
+ in_atomic(), irqs_disabled(),
8443
+ current->pid, current->comm);
8444
+
8445
+ debug_show_held_locks(current);
8446
+ dump_stack();
8447
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8448
+}
8449
+EXPORT_SYMBOL_GPL(__cant_sleep);
8450
+
8451
+#ifdef CONFIG_SMP
8452
+void __cant_migrate(const char *file, int line)
8453
+{
8454
+ static unsigned long prev_jiffy;
8455
+
8456
+ if (irqs_disabled())
8457
+ return;
8458
+
8459
+ if (is_migration_disabled(current))
8460
+ return;
8461
+
8462
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
8463
+ return;
8464
+
8465
+ if (preempt_count() > 0)
8466
+ return;
8467
+
8468
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8469
+ return;
8470
+ prev_jiffy = jiffies;
8471
+
8472
+ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
8473
+ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
8474
+ in_atomic(), irqs_disabled(), is_migration_disabled(current),
8475
+ current->pid, current->comm);
8476
+
8477
+ debug_show_held_locks(current);
8478
+ dump_stack();
8479
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8480
+}
8481
+EXPORT_SYMBOL_GPL(__cant_migrate);
8482
+#endif
69648483 #endif
69658484
69668485 #ifdef CONFIG_MAGIC_SYSRQ
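The hunk above extends ___might_sleep() with a current->non_block_count check and adds the __cant_sleep()/__cant_migrate() assertions. A minimal caller-side sketch of how these checks are normally reached, assuming this tree also carries the usual CONFIG_DEBUG_ATOMIC_SLEEP wrappers from <linux/kernel.h> (cant_sleep(), cant_migrate(), non_block_start()/non_block_end()); those wrapper names are an assumption of this sketch, not something introduced by the diff:

#include <linux/kernel.h>
#include <linux/sched.h>

static void example_atomic_path(void)
{
	cant_sleep();	/* ends up in __cant_sleep(); warns if this context could in fact block */
	cant_migrate();	/* ends up in __cant_migrate(); warns if the task could still migrate */
}

static void example_non_blocking_region(void)
{
	non_block_start();	/* increments current->non_block_count ...              */
	/* ... so ___might_sleep() above now also fires for sleeps in this region */
	non_block_end();
}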
....@@ -7029,7 +8548,7 @@
70298548
70308549 #ifdef CONFIG_IA64
70318550 /**
7032
- * set_curr_task - set the current task for a given CPU.
8551
+ * ia64_set_curr_task - set the current task for a given CPU.
70338552 * @cpu: the processor in question.
70348553 * @p: the task pointer to set.
70358554 *
....@@ -7195,8 +8714,15 @@
71958714
71968715 if (queued)
71978716 enqueue_task(rq, tsk, queue_flags);
7198
- if (running)
7199
- set_curr_task(rq, tsk);
8717
+ if (running) {
8718
+ set_next_task(rq, tsk);
8719
+ /*
8720
+ * After changing group, the running task may have joined a
8721
+ * throttled one but it's still the running task. Trigger a
8722
+ * resched to make sure that task can still run.
8723
+ */
8724
+ resched_curr(rq);
8725
+ }
72008726
72018727 task_rq_unlock(rq, tsk, &rf);
72028728 }
....@@ -7235,9 +8761,14 @@
72358761
72368762 #ifdef CONFIG_UCLAMP_TASK_GROUP
72378763 /* Propagate the effective uclamp value for the new group */
8764
+ mutex_lock(&uclamp_mutex);
8765
+ rcu_read_lock();
72388766 cpu_util_update_eff(css);
8767
+ rcu_read_unlock();
8768
+ mutex_unlock(&uclamp_mutex);
72398769 #endif
72408770
8771
+ trace_android_rvh_cpu_cgroup_online(css);
72418772 return 0;
72428773 }
72438774
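For context on the locking added above (and asserted further down in cpu_util_update_eff()): the effective-uclamp propagation is expected to run with uclamp_mutex held and inside an RCU read-side section. A minimal sketch of that caller contract, mirroring the cpu_cgroup_css_online() hunk; illustrative only, since cpu_util_update_eff() is local to this file:

	mutex_lock(&uclamp_mutex);	/* serialize effective-uclamp updates */
	rcu_read_lock();		/* the update walks css descendants under RCU */
	cpu_util_update_eff(css);
	rcu_read_unlock();
	mutex_unlock(&uclamp_mutex);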
....@@ -7303,6 +8834,9 @@
73038834 if (ret)
73048835 break;
73058836 }
8837
+
8838
+ trace_android_rvh_cpu_cgroup_can_attach(tset, &ret);
8839
+
73068840 return ret;
73078841 }
73088842
....@@ -7313,6 +8847,8 @@
73138847
73148848 cgroup_taskset_for_each(task, css, tset)
73158849 sched_move_task(task);
8850
+
8851
+ trace_android_rvh_cpu_cgroup_attach(tset);
73168852 }
73178853
73188854 #ifdef CONFIG_UCLAMP_TASK_GROUP
....@@ -7324,6 +8860,9 @@
73248860 unsigned int eff[UCLAMP_CNT];
73258861 enum uclamp_id clamp_id;
73268862 unsigned int clamps;
8863
+
8864
+ lockdep_assert_held(&uclamp_mutex);
8865
+ SCHED_WARN_ON(!rcu_read_lock_held());
73278866
73288867 css_for_each_descendant_pre(css, top_css) {
73298868 uc_parent = css_tg(css)->parent
....@@ -7357,7 +8896,7 @@
73578896 }
73588897
73598898 /* Immediately update descendants RUNNABLE tasks */
7360
- uclamp_update_active_tasks(css, clamps);
8899
+ uclamp_update_active_tasks(css);
73618900 }
73628901 }
73638902
....@@ -7414,6 +8953,8 @@
74148953 req = capacity_from_percent(buf);
74158954 if (req.ret)
74168955 return req.ret;
8956
+
8957
+ static_branch_enable(&sched_uclamp_used);
74178958
74188959 mutex_lock(&uclamp_mutex);
74198960 rcu_read_lock();
....@@ -7529,7 +9070,9 @@
75299070 static DEFINE_MUTEX(cfs_constraints_mutex);
75309071
75319072 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7532
-const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9073
+static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9074
+/* More than 203 days if BW_SHIFT equals 20. */
9075
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
75339076
75349077 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
75359078
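For reference, the "more than 203 days" figure in the comment above follows from the bandwidth fixed-point limits, assuming MAX_BW is ((1ULL << (64 - BW_SHIFT)) - 1), i.e. 2^44 - 1 microseconds, as defined in kernel/sched/sched.h of this series:

	2^44 us  =  17,592,186,044,416 us
	         ~= 17,592,186 s
	         ~= 203.6 days

max_cfs_runtime is that same bound converted to nanoseconds via NSEC_PER_USEC, and the hunk below makes tg_set_cfs_bandwidth() reject any finite quota larger than it, keeping the quota small enough for the later BW_SHIFT fixed-point arithmetic.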
....@@ -7555,6 +9098,12 @@
75559098 * feasibility.
75569099 */
75579100 if (period > max_cfs_quota_period)
9101
+ return -EINVAL;
9102
+
9103
+ /*
9104
+ * Bound quota to defend quota against overflow during bandwidth shift.
9105
+ */
9106
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
75589107 return -EINVAL;
75599108
75609109 /*
....@@ -7609,7 +9158,7 @@
76099158 return ret;
76109159 }
76119160
7612
-int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9161
+static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
76139162 {
76149163 u64 quota, period;
76159164
....@@ -7624,7 +9173,7 @@
76249173 return tg_set_cfs_bandwidth(tg, period, quota);
76259174 }
76269175
7627
-long tg_get_cfs_quota(struct task_group *tg)
9176
+static long tg_get_cfs_quota(struct task_group *tg)
76289177 {
76299178 u64 quota_us;
76309179
....@@ -7637,7 +9186,7 @@
76379186 return quota_us;
76389187 }
76399188
7640
-int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9189
+static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
76419190 {
76429191 u64 quota, period;
76439192
....@@ -7650,7 +9199,7 @@
76509199 return tg_set_cfs_bandwidth(tg, period, quota);
76519200 }
76529201
7653
-long tg_get_cfs_period(struct task_group *tg)
9202
+static long tg_get_cfs_period(struct task_group *tg)
76549203 {
76559204 u64 cfs_period_us;
76569205
....@@ -8127,172 +9676,7 @@
81279676 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
81289677 };
81299678
8130
-#undef CREATE_TRACE_POINTS
8131
-
8132
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
8133
-
8134
-static inline void
8135
-update_nr_migratory(struct task_struct *p, long delta)
9679
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
81369680 {
8137
- if (unlikely((p->sched_class == &rt_sched_class ||
8138
- p->sched_class == &dl_sched_class) &&
8139
- p->nr_cpus_allowed > 1)) {
8140
- if (p->sched_class == &rt_sched_class)
8141
- task_rq(p)->rt.rt_nr_migratory += delta;
8142
- else
8143
- task_rq(p)->dl.dl_nr_migratory += delta;
8144
- }
9681
+ trace_sched_update_nr_running_tp(rq, count);
81459682 }
8146
-
8147
-static inline void
8148
-migrate_disable_update_cpus_allowed(struct task_struct *p)
8149
-{
8150
- p->cpus_ptr = cpumask_of(smp_processor_id());
8151
- update_nr_migratory(p, -1);
8152
- p->nr_cpus_allowed = 1;
8153
-}
8154
-
8155
-static inline void
8156
-migrate_enable_update_cpus_allowed(struct task_struct *p)
8157
-{
8158
- struct rq *rq;
8159
- struct rq_flags rf;
8160
-
8161
- rq = task_rq_lock(p, &rf);
8162
- p->cpus_ptr = &p->cpus_mask;
8163
- p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
8164
- update_nr_migratory(p, 1);
8165
- task_rq_unlock(rq, p, &rf);
8166
-}
8167
-
8168
-void migrate_disable(void)
8169
-{
8170
- preempt_disable();
8171
-
8172
- if (++current->migrate_disable == 1) {
8173
- this_rq()->nr_pinned++;
8174
- preempt_lazy_disable();
8175
-#ifdef CONFIG_SCHED_DEBUG
8176
- WARN_ON_ONCE(current->pinned_on_cpu >= 0);
8177
- current->pinned_on_cpu = smp_processor_id();
8178
-#endif
8179
- }
8180
-
8181
- preempt_enable();
8182
-}
8183
-EXPORT_SYMBOL(migrate_disable);
8184
-
8185
-static void migrate_disabled_sched(struct task_struct *p)
8186
-{
8187
- if (p->migrate_disable_scheduled)
8188
- return;
8189
-
8190
- migrate_disable_update_cpus_allowed(p);
8191
- p->migrate_disable_scheduled = 1;
8192
-}
8193
-
8194
-static DEFINE_PER_CPU(struct cpu_stop_work, migrate_work);
8195
-static DEFINE_PER_CPU(struct migration_arg, migrate_arg);
8196
-
8197
-void migrate_enable(void)
8198
-{
8199
- struct task_struct *p = current;
8200
- struct rq *rq = this_rq();
8201
- int cpu = task_cpu(p);
8202
-
8203
- WARN_ON_ONCE(p->migrate_disable <= 0);
8204
- if (p->migrate_disable > 1) {
8205
- p->migrate_disable--;
8206
- return;
8207
- }
8208
-
8209
- preempt_disable();
8210
-
8211
-#ifdef CONFIG_SCHED_DEBUG
8212
- WARN_ON_ONCE(current->pinned_on_cpu != cpu);
8213
- current->pinned_on_cpu = -1;
8214
-#endif
8215
-
8216
- WARN_ON_ONCE(rq->nr_pinned < 1);
8217
-
8218
- p->migrate_disable = 0;
8219
- rq->nr_pinned--;
8220
-#ifdef CONFIG_HOTPLUG_CPU
8221
- if (rq->nr_pinned == 0 && unlikely(!cpu_active(cpu)) &&
8222
- takedown_cpu_task)
8223
- wake_up_process(takedown_cpu_task);
8224
-#endif
8225
-
8226
- if (!p->migrate_disable_scheduled)
8227
- goto out;
8228
-
8229
- p->migrate_disable_scheduled = 0;
8230
-
8231
- migrate_enable_update_cpus_allowed(p);
8232
-
8233
- WARN_ON(smp_processor_id() != cpu);
8234
- if (!is_cpu_allowed(p, cpu)) {
8235
- struct migration_arg __percpu *arg;
8236
- struct cpu_stop_work __percpu *work;
8237
- struct rq_flags rf;
8238
-
8239
- work = this_cpu_ptr(&migrate_work);
8240
- arg = this_cpu_ptr(&migrate_arg);
8241
- WARN_ON_ONCE(!arg->done && !work->disabled && work->arg);
8242
-
8243
- arg->task = p;
8244
- arg->done = false;
8245
-
8246
- rq = task_rq_lock(p, &rf);
8247
- update_rq_clock(rq);
8248
- arg->dest_cpu = select_fallback_rq(cpu, p);
8249
- task_rq_unlock(rq, p, &rf);
8250
-
8251
- stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
8252
- arg, work);
8253
- tlb_migrate_finish(p->mm);
8254
- }
8255
-
8256
-out:
8257
- preempt_lazy_enable();
8258
- preempt_enable();
8259
-}
8260
-EXPORT_SYMBOL(migrate_enable);
8261
-
8262
-int cpu_nr_pinned(int cpu)
8263
-{
8264
- struct rq *rq = cpu_rq(cpu);
8265
-
8266
- return rq->nr_pinned;
8267
-}
8268
-
8269
-#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
8270
-static void migrate_disabled_sched(struct task_struct *p)
8271
-{
8272
-}
8273
-
8274
-void migrate_disable(void)
8275
-{
8276
-#ifdef CONFIG_SCHED_DEBUG
8277
- current->migrate_disable++;
8278
-#endif
8279
- barrier();
8280
-}
8281
-EXPORT_SYMBOL(migrate_disable);
8282
-
8283
-void migrate_enable(void)
8284
-{
8285
-#ifdef CONFIG_SCHED_DEBUG
8286
- struct task_struct *p = current;
8287
-
8288
- WARN_ON_ONCE(p->migrate_disable <= 0);
8289
- p->migrate_disable--;
8290
-#endif
8291
- barrier();
8292
-}
8293
-EXPORT_SYMBOL(migrate_enable);
8294
-#else
8295
-static void migrate_disabled_sched(struct task_struct *p)
8296
-{
8297
-}
8298
-#endif