2024-09-20 cf4ce59b3b70238352c7f1729f0f7223214828ad
kernel/kernel/sched/core.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * kernel/sched/core.c
34 *
....@@ -5,6 +6,10 @@
56 *
67 * Copyright (C) 1991-2002 Linus Torvalds
78 */
9
+#define CREATE_TRACE_POINTS
10
+#include <trace/events/sched.h>
11
+#undef CREATE_TRACE_POINTS
12
+
813 #include "sched.h"
914
1015 #include <linux/nospec.h>
....@@ -16,14 +21,41 @@
1621 #include <asm/tlb.h>
1722
1823 #include "../workqueue_internal.h"
24
+#include "../../io_uring/io-wq.h"
1925 #include "../smpboot.h"
2026
2127 #include "pelt.h"
28
+#include "smp.h"
2229
23
-#define CREATE_TRACE_POINTS
24
-#include <trace/events/sched.h>
30
+#include <trace/hooks/sched.h>
31
+#include <trace/hooks/dtask.h>
32
+
33
+/*
34
+ * Export tracepoints that act as a bare tracehook (ie: have no trace event
35
+ * associated with them) to allow external modules to probe them.
36
+ */
37
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
38
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
39
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
40
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
41
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
42
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
43
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
44
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
45
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
46
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
47
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
48
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_switch);
49
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
50
+#ifdef CONFIG_SCHEDSTATS
51
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
52
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
53
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
54
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
55
+#endif
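
For context, a minimal sketch of how an external module can attach to one of the bare tracepoints exported above. The probe prototype is assumed to match the DECLARE_TRACE() declaration in include/trace/events/sched.h (pelt_cfs_tp passes only the cfs_rq); struct cfs_rq stays opaque to modules that do not have access to kernel/sched/sched.h.

#include <linux/module.h>
#include <trace/events/sched.h>

static void probe_pelt_cfs(void *data, struct cfs_rq *cfs_rq)
{
	/* PELT signal for this cfs_rq was just updated; cfs_rq is opaque here. */
}

static int __init pelt_probe_init(void)
{
	return register_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
}

static void __exit pelt_probe_exit(void)
{
	unregister_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
}

module_init(pelt_probe_init);
module_exit(pelt_probe_exit);
MODULE_LICENSE("GPL");
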
2556
2657 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
58
+EXPORT_SYMBOL_GPL(runqueues);
2759
2860 #ifdef CONFIG_SCHED_DEBUG
2961 /*
....@@ -38,6 +70,7 @@
3870 const_debug unsigned int sysctl_sched_features =
3971 #include "features.h"
4072 0;
73
+EXPORT_SYMBOL_GPL(sysctl_sched_features);
4174 #undef SCHED_FEAT
4275 #endif
4376
....@@ -60,6 +93,100 @@
6093 * default: 0.95s
6194 */
6295 int sysctl_sched_rt_runtime = 950000;
96
+
97
+
98
+/*
99
+ * Serialization rules:
100
+ *
101
+ * Lock order:
102
+ *
103
+ * p->pi_lock
104
+ * rq->lock
105
+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
106
+ *
107
+ * rq1->lock
108
+ * rq2->lock where: rq1 < rq2
109
+ *
110
+ * Regular state:
111
+ *
112
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
113
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
114
+ * always looks at the local rq data structures to find the most eligible task
115
+ * to run next.
116
+ *
117
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
118
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
119
+ * the local CPU to avoid bouncing the runqueue state around [ see
120
+ * ttwu_queue_wakelist() ]
121
+ *
122
+ * Task wakeup, specifically wakeups that involve migration, are horribly
123
+ * complicated to avoid having to take two rq->locks.
124
+ *
125
+ * Special state:
126
+ *
127
+ * System-calls and anything external will use task_rq_lock() which acquires
128
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
129
+ * stable while holding either lock:
130
+ *
131
+ * - sched_setaffinity()/
132
+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
133
+ * - set_user_nice(): p->se.load, p->*prio
134
+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
135
+ * p->se.load, p->rt_priority,
136
+ * p->dl.dl_{runtime, deadline, period, flags, bw, density}
137
+ * - sched_setnuma(): p->numa_preferred_nid
138
+ * - sched_move_task()/
139
+ * cpu_cgroup_fork(): p->sched_task_group
140
+ * - uclamp_update_active() p->uclamp*
141
+ *
142
+ * p->state <- TASK_*:
143
+ *
144
+ * is changed locklessly using set_current_state(), __set_current_state() or
145
+ * set_special_state(), see their respective comments, or by
146
+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against
147
+ * concurrent self.
148
+ *
149
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
150
+ *
151
+ * is set by activate_task() and cleared by deactivate_task(), under
152
+ * rq->lock. Non-zero indicates the task is runnable, the special
153
+ * ON_RQ_MIGRATING state is used for migration without holding both
154
+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
155
+ *
156
+ * p->on_cpu <- { 0, 1 }:
157
+ *
158
+ * is set by prepare_task() and cleared by finish_task() such that it will be
159
+ * set before p is scheduled-in and cleared after p is scheduled-out, both
160
+ * under rq->lock. Non-zero indicates the task is running on its CPU.
161
+ *
162
+ * [ The astute reader will observe that it is possible for two tasks on one
163
+ * CPU to have ->on_cpu = 1 at the same time. ]
164
+ *
165
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
166
+ *
167
+ * - Don't call set_task_cpu() on a blocked task:
168
+ *
169
+ * We don't care what CPU we're not running on, this simplifies hotplug,
170
+ * the CPU assignment of blocked tasks isn't required to be valid.
171
+ *
172
+ * - for try_to_wake_up(), called under p->pi_lock:
173
+ *
174
+ * This allows try_to_wake_up() to only take one rq->lock, see its comment.
175
+ *
176
+ * - for migration called under rq->lock:
177
+ * [ see task_on_rq_migrating() in task_rq_lock() ]
178
+ *
179
+ * o move_queued_task()
180
+ * o detach_task()
181
+ *
182
+ * - for migration called under double_rq_lock():
183
+ *
184
+ * o __migrate_swap_task()
185
+ * o push_rt_task() / pull_rt_task()
186
+ * o push_dl_task() / pull_dl_task()
187
+ * o dl_task_offline_migration()
188
+ *
189
+ */
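
A minimal sketch of the "special state" rule above, assuming the task_rq_lock()/task_rq_unlock() helpers from kernel/sched/sched.h; the function name is illustrative only.

static void example_inspect_task(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;

	/* Takes p->pi_lock then rq->lock, following the documented lock order. */
	rq = task_rq_lock(p, &rf);

	/*
	 * p->policy, p->*prio, p->sched_class, p->cpus_ptr etc. are stable
	 * here because both locks are held.
	 */

	task_rq_unlock(rq, p, &rf);
}
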
63190
64191 /*
65192 * __task_rq_lock - lock the rq @p resides on.
....@@ -84,6 +211,7 @@
84211 cpu_relax();
85212 }
86213 }
214
+EXPORT_SYMBOL_GPL(__task_rq_lock);
87215
88216 /*
89217 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
....@@ -126,6 +254,7 @@
126254 cpu_relax();
127255 }
128256 }
257
+EXPORT_SYMBOL_GPL(task_rq_lock);
129258
130259 /*
131260 * RQ-clock updating methods:
....@@ -206,7 +335,15 @@
206335 rq->clock += delta;
207336 update_rq_clock_task(rq, delta);
208337 }
338
+EXPORT_SYMBOL_GPL(update_rq_clock);
209339
340
+static inline void
341
+rq_csd_init(struct rq *rq, struct __call_single_data *csd, smp_call_func_t func)
342
+{
343
+ csd->flags = 0;
344
+ csd->func = func;
345
+ csd->info = rq;
346
+}
210347
211348 #ifdef CONFIG_SCHED_HRTICK
212349 /*
....@@ -243,8 +380,9 @@
243380 static void __hrtick_restart(struct rq *rq)
244381 {
245382 struct hrtimer *timer = &rq->hrtick_timer;
383
+ ktime_t time = rq->hrtick_time;
246384
247
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
385
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
248386 }
249387
250388 /*
....@@ -257,7 +395,6 @@
257395
258396 rq_lock(rq, &rf);
259397 __hrtick_restart(rq);
260
- rq->hrtick_csd_pending = 0;
261398 rq_unlock(rq, &rf);
262399 }
263400
....@@ -269,7 +406,6 @@
269406 void hrtick_start(struct rq *rq, u64 delay)
270407 {
271408 struct hrtimer *timer = &rq->hrtick_timer;
272
- ktime_t time;
273409 s64 delta;
274410
275411 /*
....@@ -277,16 +413,12 @@
277413 * doesn't make sense and can cause timer DoS.
278414 */
279415 delta = max_t(s64, delay, 10000LL);
280
- time = ktime_add_ns(timer->base->get_time(), delta);
416
+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
281417
282
- hrtimer_set_expires(timer, time);
283
-
284
- if (rq == this_rq()) {
418
+ if (rq == this_rq())
285419 __hrtick_restart(rq);
286
- } else if (!rq->hrtick_csd_pending) {
420
+ else
287421 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
288
- rq->hrtick_csd_pending = 1;
289
- }
290422 }
291423
292424 #else
....@@ -303,21 +435,17 @@
303435 */
304436 delay = max_t(u64, delay, 10000LL);
305437 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
306
- HRTIMER_MODE_REL_PINNED);
438
+ HRTIMER_MODE_REL_PINNED_HARD);
307439 }
440
+
308441 #endif /* CONFIG_SMP */
309442
310443 static void hrtick_rq_init(struct rq *rq)
311444 {
312445 #ifdef CONFIG_SMP
313
- rq->hrtick_csd_pending = 0;
314
-
315
- rq->hrtick_csd.flags = 0;
316
- rq->hrtick_csd.func = __hrtick_start;
317
- rq->hrtick_csd.info = rq;
446
+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
318447 #endif
319
-
320
- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
448
+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
321449 rq->hrtick_timer.function = hrtick;
322450 }
323451 #else /* CONFIG_SCHED_HRTICK */
....@@ -399,7 +527,7 @@
399527 #endif
400528 #endif
401529
402
-void wake_q_add(struct wake_q_head *head, struct task_struct *task)
530
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
403531 {
404532 struct wake_q_node *node = &task->wake_q;
405533
....@@ -412,23 +540,58 @@
412540 * state, even in the failed case, an explicit smp_mb() must be used.
413541 */
414542 smp_mb__before_atomic();
415
- if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
416
- return;
417
-
418
- head->count++;
419
-
420
- get_task_struct(task);
543
+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
544
+ return false;
421545
422546 /*
423547 * The head is context local, there can be no concurrency.
424548 */
425549 *head->lastp = node;
426550 head->lastp = &node->next;
551
+ head->count++;
552
+ return true;
427553 }
428554
429
-static int
430
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
431
- int sibling_count_hint);
555
+/**
556
+ * wake_q_add() - queue a wakeup for 'later' waking.
557
+ * @head: the wake_q_head to add @task to
558
+ * @task: the task to queue for 'later' wakeup
559
+ *
560
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
561
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
562
+ * instantly.
563
+ *
564
+ * This function must be used as-if it were wake_up_process(); IOW the task
565
+ * must be ready to be woken at this location.
566
+ */
567
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
568
+{
569
+ if (__wake_q_add(head, task))
570
+ get_task_struct(task);
571
+}
572
+
573
+/**
574
+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
575
+ * @head: the wake_q_head to add @task to
576
+ * @task: the task to queue for 'later' wakeup
577
+ *
578
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
579
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
580
+ * instantly.
581
+ *
582
+ * This function must be used as-if it were wake_up_process(); IOW the task
583
+ * must be ready to be woken at this location.
584
+ *
585
+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
586
+ * that already hold a reference to @task can call the 'safe' version and trust
587
+ * wake_q to do the right thing depending on whether or not @task is already
588
+ * queued for wakeup.
589
+ */
590
+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
591
+{
592
+ if (!__wake_q_add(head, task))
593
+ put_task_struct(task);
594
+}
432595
433596 void wake_up_q(struct wake_q_head *head)
434597 {
....@@ -442,12 +605,14 @@
442605 /* Task can safely be re-inserted now: */
443606 node = node->next;
444607 task->wake_q.next = NULL;
608
+ task->wake_q_count = head->count;
445609
446610 /*
447
- * try_to_wake_up() executes a full barrier, which pairs with
611
+ * wake_up_process() executes a full barrier, which pairs with
448612 * the queueing in wake_q_add() so as not to miss wakeups.
449613 */
450
- try_to_wake_up(task, TASK_NORMAL, 0, head->count);
614
+ wake_up_process(task);
615
+ task->wake_q_count = 0;
451616 put_task_struct(task);
452617 }
453618 }
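
A short usage sketch of the wake_q API documented above, assuming <linux/sched/wake_q.h>; the waiter structure and list are illustrative. Wakeups are batched while a lock is held and issued after it is dropped, which is what the reference taken in wake_q_add() and dropped in wake_up_q() is for.

struct example_waiter {			/* hypothetical waiter bookkeeping */
	struct list_head	list;
	struct task_struct	*task;
};

static void example_wake_all(spinlock_t *lock, struct list_head *waiters)
{
	DEFINE_WAKE_Q(wake_q);
	struct example_waiter *w;

	spin_lock(lock);
	list_for_each_entry(w, waiters, list)
		wake_q_add(&wake_q, w->task);	/* takes a task reference */
	spin_unlock(lock);

	wake_up_q(&wake_q);			/* wakes tasks, drops references */
}
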
....@@ -477,15 +642,12 @@
477642 return;
478643 }
479644
480
-#ifdef CONFIG_PREEMPT
481645 if (set_nr_and_not_polling(curr))
482
-#else
483
- if (set_nr_and_not_polling(curr) && (rq->curr == rq->idle))
484
-#endif
485646 smp_send_reschedule(cpu);
486647 else
487648 trace_sched_wake_idle_without_ipi(cpu);
488649 }
650
+EXPORT_SYMBOL_GPL(resched_curr);
489651
490652 void resched_cpu(int cpu)
491653 {
....@@ -510,27 +672,49 @@
510672 */
511673 int get_nohz_timer_target(void)
512674 {
513
- int i, cpu = smp_processor_id();
675
+ int i, cpu = smp_processor_id(), default_cpu = -1;
514676 struct sched_domain *sd;
515677
516
- if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
517
- return cpu;
678
+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
679
+ if (!idle_cpu(cpu))
680
+ return cpu;
681
+ default_cpu = cpu;
682
+ }
518683
519684 rcu_read_lock();
520685 for_each_domain(cpu, sd) {
521
- for_each_cpu(i, sched_domain_span(sd)) {
686
+ for_each_cpu_and(i, sched_domain_span(sd),
687
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
522688 if (cpu == i)
523689 continue;
524690
525
- if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
691
+ if (!idle_cpu(i)) {
526692 cpu = i;
527693 goto unlock;
528694 }
529695 }
530696 }
531697
532
- if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
533
- cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
698
+ if (default_cpu == -1) {
699
+ for_each_cpu_and(i, cpu_active_mask,
700
+ housekeeping_cpumask(HK_FLAG_TIMER)) {
701
+ if (cpu == i)
702
+ continue;
703
+
704
+ if (!idle_cpu(i)) {
705
+ cpu = i;
706
+ goto unlock;
707
+ }
708
+ }
709
+
710
+ /* no active, not-idle, housekeeping CPU found. */
711
+ default_cpu = cpumask_any(cpu_active_mask);
712
+
713
+ if (unlikely(default_cpu >= nr_cpu_ids))
714
+ goto unlock;
715
+ }
716
+
717
+ cpu = default_cpu;
534718 unlock:
535719 rcu_read_unlock();
536720 return cpu;
....@@ -590,29 +774,23 @@
590774 wake_up_idle_cpu(cpu);
591775 }
592776
593
-static inline bool got_nohz_idle_kick(void)
777
+static void nohz_csd_func(void *info)
594778 {
595
- int cpu = smp_processor_id();
596
-
597
- if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
598
- return false;
599
-
600
- if (idle_cpu(cpu) && !need_resched())
601
- return true;
779
+ struct rq *rq = info;
780
+ int cpu = cpu_of(rq);
781
+ unsigned int flags;
602782
603783 /*
604
- * We can't run Idle Load Balance on this CPU for this time so we
605
- * cancel it and clear NOHZ_BALANCE_KICK
784
+ * Release the rq::nohz_csd.
606785 */
607
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
608
- return false;
609
-}
786
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
787
+ WARN_ON(!(flags & NOHZ_KICK_MASK));
610788
611
-#else /* CONFIG_NO_HZ_COMMON */
612
-
613
-static inline bool got_nohz_idle_kick(void)
614
-{
615
- return false;
789
+ rq->idle_balance = idle_cpu(cpu);
790
+ if (rq->idle_balance && !need_resched()) {
791
+ rq->nohz_idle_balance = flags;
792
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
793
+ }
616794 }
617795
618796 #endif /* CONFIG_NO_HZ_COMMON */
....@@ -703,18 +881,18 @@
703881 }
704882 #endif
705883
706
-static void set_load_weight(struct task_struct *p, bool update_load)
884
+static void set_load_weight(struct task_struct *p)
707885 {
886
+ bool update_load = !(READ_ONCE(p->state) & TASK_NEW);
708887 int prio = p->static_prio - MAX_RT_PRIO;
709888 struct load_weight *load = &p->se.load;
710889
711890 /*
712891 * SCHED_IDLE tasks get minimal weight:
713892 */
714
- if (idle_policy(p->policy)) {
893
+ if (task_has_idle_policy(p)) {
715894 load->weight = scale_load(WEIGHT_IDLEPRIO);
716895 load->inv_weight = WMULT_IDLEPRIO;
717
- p->se.runnable_weight = load->weight;
718896 return;
719897 }
720898
....@@ -727,7 +905,6 @@
727905 } else {
728906 load->weight = scale_load(sched_prio_to_weight[prio]);
729907 load->inv_weight = sched_prio_to_wmult[prio];
730
- p->se.runnable_weight = load->weight;
731908 }
732909 }
733910
....@@ -750,8 +927,46 @@
750927 /* Max allowed maximum utilization */
751928 unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
752929
930
+/*
931
+ * By default RT tasks run at the maximum performance point/capacity of the
932
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
933
+ * SCHED_CAPACITY_SCALE.
934
+ *
935
+ * This knob allows admins to change the default behavior when uclamp is being
936
+ * used. In battery powered devices, particularly, running at the maximum
937
+ * capacity and frequency will increase energy consumption and shorten the
938
+ * battery life.
939
+ *
940
+ * This knob only affects RT tasks whose uclamp_se->user_defined == false.
941
+ *
942
+ * This knob will not override the system default sched_util_clamp_min defined
943
+ * above.
944
+ */
945
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
946
+
753947 /* All clamps are required to be less or equal than these values */
754948 static struct uclamp_se uclamp_default[UCLAMP_CNT];
949
+
950
+/*
951
+ * This static key is used to reduce the uclamp overhead in the fast path. It
952
+ * primarily disables the call to uclamp_rq_{inc, dec}() in
953
+ * enqueue/dequeue_task().
954
+ *
955
+ * This allows users to continue to enable uclamp in their kernel config with
956
+ * minimum uclamp overhead in the fast path.
957
+ *
958
+ * As soon as userspace modifies any of the uclamp knobs, the static key is
959
+ * enabled, since we have actual users that make use of uclamp
960
+ * functionality.
961
+ *
962
+ * The knobs that would enable this static key are:
963
+ *
964
+ * * A task modifying its uclamp value with sched_setattr().
965
+ * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
966
+ * * An admin modifying the cgroup cpu.uclamp.{min, max}
967
+ */
968
+DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
969
+EXPORT_SYMBOL_GPL(sched_uclamp_used);
755970
756971 /* Integer rounded range for each bucket */
757972 #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
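
The pattern enabled by the sched_uclamp_used static key introduced above, sketched for illustration (the function name is hypothetical): fast-path callers bail out while the key is disabled, so per-bucket accounting costs nothing until userspace first touches a uclamp knob.

static inline void example_uclamp_fast_path(struct rq *rq, struct task_struct *p)
{
	/* Compiles to a NOP-patched branch while uclamp is unused. */
	if (!static_branch_unlikely(&sched_uclamp_used))
		return;

	/* Slow path: per-clamp_id, per-bucket accounting as in uclamp_rq_inc(). */
}
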
....@@ -762,11 +977,6 @@
762977 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
763978 {
764979 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
765
-}
766
-
767
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
768
-{
769
- return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
770980 }
771981
772982 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
....@@ -808,7 +1018,7 @@
8081018 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
8091019 return;
8101020
811
- WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
1021
+ uclamp_rq_set(rq, clamp_id, clamp_value);
8121022 }
8131023
8141024 static inline
....@@ -832,12 +1042,79 @@
8321042 return uclamp_idle_value(rq, clamp_id, clamp_value);
8331043 }
8341044
1045
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1046
+{
1047
+ unsigned int default_util_min;
1048
+ struct uclamp_se *uc_se;
1049
+
1050
+ lockdep_assert_held(&p->pi_lock);
1051
+
1052
+ uc_se = &p->uclamp_req[UCLAMP_MIN];
1053
+
1054
+ /* Only sync if user didn't override the default */
1055
+ if (uc_se->user_defined)
1056
+ return;
1057
+
1058
+ default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1059
+ uclamp_se_set(uc_se, default_util_min, false);
1060
+}
1061
+
1062
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
1063
+{
1064
+ struct rq_flags rf;
1065
+ struct rq *rq;
1066
+
1067
+ if (!rt_task(p))
1068
+ return;
1069
+
1070
+ /* Protect updates to p->uclamp_* */
1071
+ rq = task_rq_lock(p, &rf);
1072
+ __uclamp_update_util_min_rt_default(p);
1073
+ task_rq_unlock(rq, p, &rf);
1074
+}
1075
+
1076
+static void uclamp_sync_util_min_rt_default(void)
1077
+{
1078
+ struct task_struct *g, *p;
1079
+
1080
+ /*
1081
+ * copy_process() sysctl_uclamp
1082
+ * uclamp_min_rt = X;
1083
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
1084
+ * // link thread smp_mb__after_spinlock()
1085
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
1086
+ * sched_post_fork() for_each_process_thread()
1087
+ * __uclamp_sync_rt() __uclamp_sync_rt()
1088
+ *
1089
+ * Ensures that either sched_post_fork() will observe the new
1090
+ * uclamp_min_rt or for_each_process_thread() will observe the new
1091
+ * task.
1092
+ */
1093
+ read_lock(&tasklist_lock);
1094
+ smp_mb__after_spinlock();
1095
+ read_unlock(&tasklist_lock);
1096
+
1097
+ rcu_read_lock();
1098
+ for_each_process_thread(g, p)
1099
+ uclamp_update_util_min_rt_default(p);
1100
+ rcu_read_unlock();
1101
+}
1102
+
1103
+#if IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)
1104
+void rockchip_perf_uclamp_sync_util_min_rt_default(void)
1105
+{
1106
+ uclamp_sync_util_min_rt_default();
1107
+}
1108
+EXPORT_SYMBOL(rockchip_perf_uclamp_sync_util_min_rt_default);
1109
+#endif
1110
+
8351111 static inline struct uclamp_se
8361112 uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
8371113 {
1114
+ /* Copy by value as we could modify it */
8381115 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
8391116 #ifdef CONFIG_UCLAMP_TASK_GROUP
840
- struct uclamp_se uc_max;
1117
+ unsigned int tg_min, tg_max, value;
8411118
8421119 /*
8431120 * Tasks in autogroups or root task group will be
....@@ -848,9 +1125,11 @@
8481125 if (task_group(p) == &root_task_group)
8491126 return uc_req;
8501127
851
- uc_max = task_group(p)->uclamp[clamp_id];
852
- if (uc_req.value > uc_max.value || !uc_req.user_defined)
853
- return uc_max;
1128
+ tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1129
+ tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1130
+ value = uc_req.value;
1131
+ value = clamp(value, tg_min, tg_max);
1132
+ uclamp_se_set(&uc_req, value, false);
8541133 #endif
8551134
8561135 return uc_req;
....@@ -869,6 +1148,12 @@
8691148 {
8701149 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
8711150 struct uclamp_se uc_max = uclamp_default[clamp_id];
1151
+ struct uclamp_se uc_eff;
1152
+ int ret = 0;
1153
+
1154
+ trace_android_rvh_uclamp_eff_get(p, clamp_id, &uc_max, &uc_eff, &ret);
1155
+ if (ret)
1156
+ return uc_eff;
8721157
8731158 /* System default restrictions always apply */
8741159 if (unlikely(uc_req.value > uc_max.value))
....@@ -889,6 +1174,7 @@
8891174
8901175 return (unsigned long)uc_eff.value;
8911176 }
1177
+EXPORT_SYMBOL_GPL(uclamp_eff_value);
8921178
8931179 /*
8941180 * When a task is enqueued on a rq, the clamp bucket currently defined by the
....@@ -925,8 +1211,8 @@
9251211 if (bucket->tasks == 1 || uc_se->value > bucket->value)
9261212 bucket->value = uc_se->value;
9271213
928
- if (uc_se->value > READ_ONCE(uc_rq->value))
929
- WRITE_ONCE(uc_rq->value, uc_se->value);
1214
+ if (uc_se->value > uclamp_rq_get(rq, clamp_id))
1215
+ uclamp_rq_set(rq, clamp_id, uc_se->value);
9301216 }
9311217
9321218 /*
....@@ -949,10 +1235,38 @@
9491235
9501236 lockdep_assert_held(&rq->lock);
9511237
1238
+ /*
1239
+ * If sched_uclamp_used was enabled after task @p was enqueued,
1240
+ * we could end up with unbalanced call to uclamp_rq_dec_id().
1241
+ *
1242
+ * In this case the uc_se->active flag should be false since no uclamp
1243
+ * accounting was performed at enqueue time and we can just return
1244
+ * here.
1245
+ *
1246
+ * Need to be careful of the following enqeueue/dequeue ordering
1247
+ * problem too
1248
+ *
1249
+ * enqueue(taskA)
1250
+ * // sched_uclamp_used gets enabled
1251
+ * enqueue(taskB)
1252
+ * dequeue(taskA)
1253
+ * // Must not decrement bukcet->tasks here
1254
+ * dequeue(taskB)
1255
+ *
1256
+ * where we could end up with stale data in uc_se and
1257
+ * bucket[uc_se->bucket_id].
1258
+ *
1259
+ * The following check here eliminates the possibility of such race.
1260
+ */
1261
+ if (unlikely(!uc_se->active))
1262
+ return;
1263
+
9521264 bucket = &uc_rq->bucket[uc_se->bucket_id];
1265
+
9531266 SCHED_WARN_ON(!bucket->tasks);
9541267 if (likely(bucket->tasks))
9551268 bucket->tasks--;
1269
+
9561270 uc_se->active = false;
9571271
9581272 /*
....@@ -964,7 +1278,7 @@
9641278 if (likely(bucket->tasks))
9651279 return;
9661280
967
- rq_clamp = READ_ONCE(uc_rq->value);
1281
+ rq_clamp = uclamp_rq_get(rq, clamp_id);
9681282 /*
9691283 * Defensive programming: this should never happen. If it happens,
9701284 * e.g. due to future modification, warn and fixup the expected value.
....@@ -972,13 +1286,22 @@
9721286 SCHED_WARN_ON(bucket->value > rq_clamp);
9731287 if (bucket->value >= rq_clamp) {
9741288 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
975
- WRITE_ONCE(uc_rq->value, bkt_clamp);
1289
+ uclamp_rq_set(rq, clamp_id, bkt_clamp);
9761290 }
9771291 }
9781292
9791293 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
9801294 {
9811295 enum uclamp_id clamp_id;
1296
+
1297
+ /*
1298
+ * Avoid any overhead until uclamp is actually used by the userspace.
1299
+ *
1300
+ * The condition is constructed such that a NOP is generated when
1301
+ * sched_uclamp_used is disabled.
1302
+ */
1303
+ if (!static_branch_unlikely(&sched_uclamp_used))
1304
+ return;
9821305
9831306 if (unlikely(!p->sched_class->uclamp_enabled))
9841307 return;
....@@ -995,6 +1318,15 @@
9951318 {
9961319 enum uclamp_id clamp_id;
9971320
1321
+ /*
1322
+ * Avoid any overhead until uclamp is actually used by the userspace.
1323
+ *
1324
+ * The condition is constructed such that a NOP is generated when
1325
+ * sched_uclamp_used is disabled.
1326
+ */
1327
+ if (!static_branch_unlikely(&sched_uclamp_used))
1328
+ return;
1329
+
9981330 if (unlikely(!p->sched_class->uclamp_enabled))
9991331 return;
10001332
....@@ -1002,9 +1334,27 @@
10021334 uclamp_rq_dec_id(rq, p, clamp_id);
10031335 }
10041336
1005
-static inline void
1006
-uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
1337
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
1338
+ enum uclamp_id clamp_id)
10071339 {
1340
+ if (!p->uclamp[clamp_id].active)
1341
+ return;
1342
+
1343
+ uclamp_rq_dec_id(rq, p, clamp_id);
1344
+ uclamp_rq_inc_id(rq, p, clamp_id);
1345
+
1346
+ /*
1347
+ * Make sure to clear the idle flag if we've transiently reached 0
1348
+ * active tasks on rq.
1349
+ */
1350
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1351
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1352
+}
1353
+
1354
+static inline void
1355
+uclamp_update_active(struct task_struct *p)
1356
+{
1357
+ enum uclamp_id clamp_id;
10081358 struct rq_flags rf;
10091359 struct rq *rq;
10101360
....@@ -1024,30 +1374,22 @@
10241374 * affecting a valid clamp bucket, the next time it's enqueued,
10251375 * it will already see the updated clamp bucket value.
10261376 */
1027
- if (p->uclamp[clamp_id].active) {
1028
- uclamp_rq_dec_id(rq, p, clamp_id);
1029
- uclamp_rq_inc_id(rq, p, clamp_id);
1030
- }
1377
+ for_each_clamp_id(clamp_id)
1378
+ uclamp_rq_reinc_id(rq, p, clamp_id);
10311379
10321380 task_rq_unlock(rq, p, &rf);
10331381 }
10341382
10351383 #ifdef CONFIG_UCLAMP_TASK_GROUP
10361384 static inline void
1037
-uclamp_update_active_tasks(struct cgroup_subsys_state *css,
1038
- unsigned int clamps)
1385
+uclamp_update_active_tasks(struct cgroup_subsys_state *css)
10391386 {
1040
- enum uclamp_id clamp_id;
10411387 struct css_task_iter it;
10421388 struct task_struct *p;
10431389
10441390 css_task_iter_start(css, 0, &it);
1045
- while ((p = css_task_iter_next(&it))) {
1046
- for_each_clamp_id(clamp_id) {
1047
- if ((0x1 << clamp_id) & clamps)
1048
- uclamp_update_active(p, clamp_id);
1049
- }
1050
- }
1391
+ while ((p = css_task_iter_next(&it)))
1392
+ uclamp_update_active(p);
10511393 css_task_iter_end(&it);
10521394 }
10531395
....@@ -1070,16 +1412,16 @@
10701412 #endif
10711413
10721414 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1073
- void __user *buffer, size_t *lenp,
1074
- loff_t *ppos)
1415
+ void *buffer, size_t *lenp, loff_t *ppos)
10751416 {
10761417 bool update_root_tg = false;
1077
- int old_min, old_max;
1418
+ int old_min, old_max, old_min_rt;
10781419 int result;
10791420
10801421 mutex_lock(&uclamp_mutex);
10811422 old_min = sysctl_sched_uclamp_util_min;
10821423 old_max = sysctl_sched_uclamp_util_max;
1424
+ old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
10831425
10841426 result = proc_dointvec(table, write, buffer, lenp, ppos);
10851427 if (result)
....@@ -1088,7 +1430,9 @@
10881430 goto done;
10891431
10901432 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1091
- sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
1433
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1434
+ sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1435
+
10921436 result = -EINVAL;
10931437 goto undo;
10941438 }
....@@ -1104,8 +1448,15 @@
11041448 update_root_tg = true;
11051449 }
11061450
1107
- if (update_root_tg)
1451
+ if (update_root_tg) {
1452
+ static_branch_enable(&sched_uclamp_used);
11081453 uclamp_update_root_tg();
1454
+ }
1455
+
1456
+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1457
+ static_branch_enable(&sched_uclamp_used);
1458
+ uclamp_sync_util_min_rt_default();
1459
+ }
11091460
11101461 /*
11111462 * We update all RUNNABLE tasks only when task groups are in use.
....@@ -1118,6 +1469,7 @@
11181469 undo:
11191470 sysctl_sched_uclamp_util_min = old_min;
11201471 sysctl_sched_uclamp_util_max = old_max;
1472
+ sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
11211473 done:
11221474 mutex_unlock(&uclamp_mutex);
11231475
....@@ -1127,20 +1479,61 @@
11271479 static int uclamp_validate(struct task_struct *p,
11281480 const struct sched_attr *attr)
11291481 {
1130
- unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1131
- unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1482
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
1483
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
11321484
1133
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1134
- lower_bound = attr->sched_util_min;
1135
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1136
- upper_bound = attr->sched_util_max;
1485
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1486
+ util_min = attr->sched_util_min;
11371487
1138
- if (lower_bound > upper_bound)
1488
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1489
+ return -EINVAL;
1490
+ }
1491
+
1492
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1493
+ util_max = attr->sched_util_max;
1494
+
1495
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1496
+ return -EINVAL;
1497
+ }
1498
+
1499
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
11391500 return -EINVAL;
1140
- if (upper_bound > SCHED_CAPACITY_SCALE)
1141
- return -EINVAL;
1501
+
1502
+ /*
1503
+ * We have valid uclamp attributes; make sure uclamp is enabled.
1504
+ *
1505
+ * We need to do that here, because enabling static branches is a
1506
+ * blocking operation which obviously cannot be done while holding
1507
+ * scheduler locks.
1508
+ */
1509
+ static_branch_enable(&sched_uclamp_used);
11421510
11431511 return 0;
1512
+}
1513
+
1514
+static bool uclamp_reset(const struct sched_attr *attr,
1515
+ enum uclamp_id clamp_id,
1516
+ struct uclamp_se *uc_se)
1517
+{
1518
+ /* Reset on sched class change for a non user-defined clamp value. */
1519
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1520
+ !uc_se->user_defined)
1521
+ return true;
1522
+
1523
+ /* Reset on sched_util_{min,max} == -1. */
1524
+ if (clamp_id == UCLAMP_MIN &&
1525
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1526
+ attr->sched_util_min == -1) {
1527
+ return true;
1528
+ }
1529
+
1530
+ if (clamp_id == UCLAMP_MAX &&
1531
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1532
+ attr->sched_util_max == -1) {
1533
+ return true;
1534
+ }
1535
+
1536
+ return false;
11441537 }
11451538
11461539 static void __setscheduler_uclamp(struct task_struct *p,
....@@ -1148,40 +1541,41 @@
11481541 {
11491542 enum uclamp_id clamp_id;
11501543
1151
- /*
1152
- * On scheduling class change, reset to default clamps for tasks
1153
- * without a task-specific value.
1154
- */
11551544 for_each_clamp_id(clamp_id) {
11561545 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1157
- unsigned int clamp_value = uclamp_none(clamp_id);
1546
+ unsigned int value;
11581547
1159
- /* Keep using defined clamps across class changes */
1160
- if (uc_se->user_defined)
1548
+ if (!uclamp_reset(attr, clamp_id, uc_se))
11611549 continue;
11621550
1163
- /* By default, RT tasks always get 100% boost */
1164
- if (sched_feat(SUGOV_RT_MAX_FREQ) &&
1165
- unlikely(rt_task(p) &&
1166
- clamp_id == UCLAMP_MIN)) {
1551
+ /*
1552
+ * RT by default have a 100% boost value that could be modified
1553
+ * at runtime.
1554
+ */
1555
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1556
+ value = sysctl_sched_uclamp_util_min_rt_default;
1557
+ else
1558
+ value = uclamp_none(clamp_id);
11671559
1168
- clamp_value = uclamp_none(UCLAMP_MAX);
1169
- }
1560
+ uclamp_se_set(uc_se, value, false);
11701561
1171
- uclamp_se_set(uc_se, clamp_value, false);
11721562 }
11731563
11741564 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
11751565 return;
11761566
1177
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1567
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1568
+ attr->sched_util_min != -1) {
11781569 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
11791570 attr->sched_util_min, true);
1571
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MIN, attr->sched_util_min);
11801572 }
11811573
1182
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1574
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1575
+ attr->sched_util_max != -1) {
11831576 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
11841577 attr->sched_util_max, true);
1578
+ trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max);
11851579 }
11861580 }
11871581
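
For reference, a hedged userspace sketch of driving the per-task clamps handled by __setscheduler_uclamp() above. It assumes a kernel with CONFIG_UCLAMP_TASK and uapi headers that expose sched_util_{min,max}; glibc has no sched_setattr() wrapper, so the raw syscall is used. Per the reset logic above, a value of -1 restores the default for that clamp.

#include <linux/sched.h>	/* SCHED_FLAG_UTIL_CLAMP_{MIN,MAX}, SCHED_FLAG_KEEP_ALL */
#include <linux/sched/types.h>	/* struct sched_attr */
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static long set_task_uclamp(pid_t pid, int util_min, int util_max)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_flags = SCHED_FLAG_KEEP_ALL |
			   SCHED_FLAG_UTIL_CLAMP_MIN |
			   SCHED_FLAG_UTIL_CLAMP_MAX;
	attr.sched_util_min = util_min;	/* 0..1024, or -1 to reset */
	attr.sched_util_max = util_max;

	return syscall(SYS_sched_setattr, pid, &attr, 0);
}
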
....@@ -1189,6 +1583,10 @@
11891583 {
11901584 enum uclamp_id clamp_id;
11911585
1586
+ /*
1587
+ * We don't need to hold task_rq_lock() when updating p->uclamp_* here
1588
+ * as the task is still at its early fork stages.
1589
+ */
11921590 for_each_clamp_id(clamp_id)
11931591 p->uclamp[clamp_id].active = false;
11941592
....@@ -1201,39 +1599,24 @@
12011599 }
12021600 }
12031601
1204
-#ifdef CONFIG_SMP
1205
-unsigned int uclamp_task(struct task_struct *p)
1602
+static void uclamp_post_fork(struct task_struct *p)
12061603 {
1207
- unsigned long util;
1208
-
1209
- util = task_util_est(p);
1210
- util = max(util, uclamp_eff_value(p, UCLAMP_MIN));
1211
- util = min(util, uclamp_eff_value(p, UCLAMP_MAX));
1212
-
1213
- return util;
1604
+ uclamp_update_util_min_rt_default(p);
12141605 }
12151606
1216
-bool uclamp_boosted(struct task_struct *p)
1607
+static void __init init_uclamp_rq(struct rq *rq)
12171608 {
1218
- return uclamp_eff_value(p, UCLAMP_MIN) > 0;
1609
+ enum uclamp_id clamp_id;
1610
+ struct uclamp_rq *uc_rq = rq->uclamp;
1611
+
1612
+ for_each_clamp_id(clamp_id) {
1613
+ uc_rq[clamp_id] = (struct uclamp_rq) {
1614
+ .value = uclamp_none(clamp_id)
1615
+ };
1616
+ }
1617
+
1618
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
12191619 }
1220
-
1221
-bool uclamp_latency_sensitive(struct task_struct *p)
1222
-{
1223
-#ifdef CONFIG_UCLAMP_TASK_GROUP
1224
- struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id);
1225
- struct task_group *tg;
1226
-
1227
- if (!css)
1228
- return false;
1229
- tg = container_of(css, struct task_group, css);
1230
-
1231
- return tg->latency_sensitive;
1232
-#else
1233
- return false;
1234
-#endif
1235
-}
1236
-#endif /* CONFIG_SMP */
12371620
12381621 static void __init init_uclamp(void)
12391622 {
....@@ -1241,13 +1624,8 @@
12411624 enum uclamp_id clamp_id;
12421625 int cpu;
12431626
1244
- mutex_init(&uclamp_mutex);
1245
-
1246
- for_each_possible_cpu(cpu) {
1247
- memset(&cpu_rq(cpu)->uclamp, 0,
1248
- sizeof(struct uclamp_rq)*UCLAMP_CNT);
1249
- cpu_rq(cpu)->uclamp_flags = 0;
1250
- }
1627
+ for_each_possible_cpu(cpu)
1628
+ init_uclamp_rq(cpu_rq(cpu));
12511629
12521630 for_each_clamp_id(clamp_id) {
12531631 uclamp_se_set(&init_task.uclamp_req[clamp_id],
....@@ -1276,41 +1654,7 @@
12761654 static void __setscheduler_uclamp(struct task_struct *p,
12771655 const struct sched_attr *attr) { }
12781656 static inline void uclamp_fork(struct task_struct *p) { }
1279
-
1280
-long schedtune_task_margin(struct task_struct *task);
1281
-
1282
-#ifdef CONFIG_SMP
1283
-unsigned int uclamp_task(struct task_struct *p)
1284
-{
1285
- unsigned long util = task_util_est(p);
1286
-#ifdef CONFIG_SCHED_TUNE
1287
- long margin = schedtune_task_margin(p);
1288
-
1289
- trace_sched_boost_task(p, util, margin);
1290
-
1291
- util += margin;
1292
-#endif
1293
-
1294
- return util;
1295
-}
1296
-
1297
-bool uclamp_boosted(struct task_struct *p)
1298
-{
1299
-#ifdef CONFIG_SCHED_TUNE
1300
- return schedtune_task_boost(p) > 0;
1301
-#endif
1302
- return false;
1303
-}
1304
-
1305
-bool uclamp_latency_sensitive(struct task_struct *p)
1306
-{
1307
-#ifdef CONFIG_SCHED_TUNE
1308
- return schedtune_prefer_idle(p) != 0;
1309
-#endif
1310
- return false;
1311
-}
1312
-#endif /* CONFIG_SMP */
1313
-
1657
+static inline void uclamp_post_fork(struct task_struct *p) { }
13141658 static inline void init_uclamp(void) { }
13151659 #endif /* CONFIG_UCLAMP_TASK */
13161660
....@@ -1325,7 +1669,9 @@
13251669 }
13261670
13271671 uclamp_rq_inc(rq, p);
1672
+ trace_android_rvh_enqueue_task(rq, p, flags);
13281673 p->sched_class->enqueue_task(rq, p, flags);
1674
+ trace_android_rvh_after_enqueue_task(rq, p);
13291675 }
13301676
13311677 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
....@@ -1339,31 +1685,42 @@
13391685 }
13401686
13411687 uclamp_rq_dec(rq, p);
1688
+ trace_android_rvh_dequeue_task(rq, p, flags);
13421689 p->sched_class->dequeue_task(rq, p, flags);
1690
+ trace_android_rvh_after_dequeue_task(rq, p);
13431691 }
13441692
13451693 void activate_task(struct rq *rq, struct task_struct *p, int flags)
13461694 {
1347
- if (task_contributes_to_load(p))
1348
- rq->nr_uninterruptible--;
1695
+ if (task_on_rq_migrating(p))
1696
+ flags |= ENQUEUE_MIGRATED;
13491697
13501698 enqueue_task(rq, p, flags);
1699
+
1700
+ p->on_rq = TASK_ON_RQ_QUEUED;
13511701 }
1702
+EXPORT_SYMBOL_GPL(activate_task);
13521703
13531704 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
13541705 {
1355
- if (task_contributes_to_load(p))
1356
- rq->nr_uninterruptible++;
1706
+ p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
13571707
13581708 dequeue_task(rq, p, flags);
13591709 }
1710
+EXPORT_SYMBOL_GPL(deactivate_task);
13601711
1361
-/*
1362
- * __normal_prio - return the priority that is based on the static prio
1363
- */
1364
-static inline int __normal_prio(struct task_struct *p)
1712
+static inline int __normal_prio(int policy, int rt_prio, int nice)
13651713 {
1366
- return p->static_prio;
1714
+ int prio;
1715
+
1716
+ if (dl_policy(policy))
1717
+ prio = MAX_DL_PRIO - 1;
1718
+ else if (rt_policy(policy))
1719
+ prio = MAX_RT_PRIO - 1 - rt_prio;
1720
+ else
1721
+ prio = NICE_TO_PRIO(nice);
1722
+
1723
+ return prio;
13671724 }
13681725
13691726 /*
....@@ -1375,15 +1732,7 @@
13751732 */
13761733 static inline int normal_prio(struct task_struct *p)
13771734 {
1378
- int prio;
1379
-
1380
- if (task_has_dl_policy(p))
1381
- prio = MAX_DL_PRIO-1;
1382
- else if (task_has_rt_policy(p))
1383
- prio = MAX_RT_PRIO-1 - p->rt_priority;
1384
- else
1385
- prio = __normal_prio(p);
1386
- return prio;
1735
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
13871736 }
13881737
13891738 /*
....@@ -1439,20 +1788,10 @@
14391788
14401789 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
14411790 {
1442
- const struct sched_class *class;
1443
-
1444
- if (p->sched_class == rq->curr->sched_class) {
1791
+ if (p->sched_class == rq->curr->sched_class)
14451792 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1446
- } else {
1447
- for_each_class(class) {
1448
- if (class == rq->curr->sched_class)
1449
- break;
1450
- if (class == p->sched_class) {
1451
- resched_curr(rq);
1452
- break;
1453
- }
1454
- }
1455
- }
1793
+ else if (p->sched_class > rq->curr->sched_class)
1794
+ resched_curr(rq);
14561795
14571796 /*
14581797 * A queue event has occurred, and we're going to schedule. In
....@@ -1461,33 +1800,26 @@
14611800 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
14621801 rq_clock_skip_update(rq);
14631802 }
1803
+EXPORT_SYMBOL_GPL(check_preempt_curr);
14641804
14651805 #ifdef CONFIG_SMP
14661806
1467
-static inline bool is_per_cpu_kthread(struct task_struct *p)
1468
-{
1469
- if (!(p->flags & PF_KTHREAD))
1470
- return false;
1471
-
1472
- if (p->nr_cpus_allowed != 1)
1473
- return false;
1474
-
1475
- return true;
1476
-}
1477
-
14781807 /*
1479
- * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
1808
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
14801809 * __set_cpus_allowed_ptr() and select_fallback_rq().
14811810 */
14821811 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
14831812 {
1484
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
1813
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
14851814 return false;
14861815
14871816 if (is_per_cpu_kthread(p))
14881817 return cpu_online(cpu);
14891818
1490
- return cpu_active(cpu);
1819
+ if (!cpu_active(cpu))
1820
+ return false;
1821
+
1822
+ return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
14911823 }
14921824
14931825 /*
....@@ -1512,19 +1844,29 @@
15121844 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
15131845 struct task_struct *p, int new_cpu)
15141846 {
1847
+ int detached = 0;
1848
+
15151849 lockdep_assert_held(&rq->lock);
15161850
1517
- WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
1518
- dequeue_task(rq, p, DEQUEUE_NOCLOCK);
1519
- set_task_cpu(p, new_cpu);
1520
- rq_unlock(rq, rf);
1851
+ /*
1852
+ * The vendor hook may drop the lock temporarily, so
1853
+ * pass the rq flags to unpin lock. We expect the
1854
+ * rq lock to be held after return.
1855
+ */
1856
+ trace_android_rvh_migrate_queued_task(rq, rf, p, new_cpu, &detached);
1857
+ if (detached)
1858
+ goto attach;
15211859
1860
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
1861
+ set_task_cpu(p, new_cpu);
1862
+
1863
+attach:
1864
+ rq_unlock(rq, rf);
15221865 rq = cpu_rq(new_cpu);
15231866
15241867 rq_lock(rq, rf);
15251868 BUG_ON(task_cpu(p) != new_cpu);
1526
- enqueue_task(rq, p, 0);
1527
- p->on_rq = TASK_ON_RQ_QUEUED;
1869
+ activate_task(rq, p, 0);
15281870 check_preempt_curr(rq, p, 0);
15291871
15301872 return rq;
....@@ -1576,10 +1918,10 @@
15761918 local_irq_disable();
15771919 /*
15781920 * We need to explicitly wake pending tasks before running
1579
- * __migrate_task() such that we will not miss enforcing cpus_allowed
1921
+ * __migrate_task() such that we will not miss enforcing cpus_ptr
15801922 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
15811923 */
1582
- sched_ttwu_pending();
1924
+ flush_smp_call_function_from_idle();
15831925
15841926 raw_spin_lock(&p->pi_lock);
15851927 rq_lock(rq, &rf);
....@@ -1607,8 +1949,9 @@
16071949 */
16081950 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
16091951 {
1610
- cpumask_copy(&p->cpus_allowed, new_mask);
1952
+ cpumask_copy(&p->cpus_mask, new_mask);
16111953 p->nr_cpus_allowed = cpumask_weight(new_mask);
1954
+ trace_android_rvh_set_cpus_allowed_comm(p, new_mask);
16121955 }
16131956
16141957 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
....@@ -1637,28 +1980,23 @@
16371980 if (queued)
16381981 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
16391982 if (running)
1640
- set_curr_task(rq, p);
1983
+ set_next_task(rq, p);
16411984 }
16421985
16431986 /*
1644
- * Change a given task's CPU affinity. Migrate the thread to a
1645
- * proper CPU and schedule it away if the CPU it's executing on
1646
- * is removed from the allowed bitmask.
1647
- *
1648
- * NOTE: the caller must have a valid reference to the task, the
1649
- * task must not exit() & deallocate itself prematurely. The
1650
- * call is not atomic; no spinlocks may be held.
1987
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
16511988 */
1652
-static int __set_cpus_allowed_ptr(struct task_struct *p,
1653
- const struct cpumask *new_mask, bool check)
1989
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
1990
+ const struct cpumask *new_mask,
1991
+ bool check,
1992
+ struct rq *rq,
1993
+ struct rq_flags *rf)
16541994 {
16551995 const struct cpumask *cpu_valid_mask = cpu_active_mask;
1996
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
16561997 unsigned int dest_cpu;
1657
- struct rq_flags rf;
1658
- struct rq *rq;
16591998 int ret = 0;
16601999
1661
- rq = task_rq_lock(p, &rf);
16622000 update_rq_clock(rq);
16632001
16642002 if (p->flags & PF_KTHREAD) {
....@@ -1666,6 +2004,9 @@
16662004 * Kernel threads are allowed on online && !active CPUs
16672005 */
16682006 cpu_valid_mask = cpu_online_mask;
2007
+ } else if (!cpumask_subset(new_mask, cpu_allowed_mask)) {
2008
+ ret = -EINVAL;
2009
+ goto out;
16692010 }
16702011
16712012 /*
....@@ -1677,10 +2018,15 @@
16772018 goto out;
16782019 }
16792020
1680
- if (cpumask_equal(&p->cpus_allowed, new_mask))
2021
+ if (cpumask_equal(&p->cpus_mask, new_mask))
16812022 goto out;
16822023
1683
- dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
2024
+ /*
2025
+ * Picking a ~random cpu helps in cases where we are changing affinity
2026
+ * for groups of tasks (ie. cpuset), so that load balancing is not
2027
+ * immediately required to distribute the tasks within their new mask.
2028
+ */
2029
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
16842030 if (dest_cpu >= nr_cpu_ids) {
16852031 ret = -EINVAL;
16862032 goto out;
....@@ -1705,21 +2051,39 @@
17052051 if (task_running(rq, p) || p->state == TASK_WAKING) {
17062052 struct migration_arg arg = { p, dest_cpu };
17072053 /* Need help from migration thread: drop lock and wait. */
1708
- task_rq_unlock(rq, p, &rf);
2054
+ task_rq_unlock(rq, p, rf);
17092055 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1710
- tlb_migrate_finish(p->mm);
17112056 return 0;
17122057 } else if (task_on_rq_queued(p)) {
17132058 /*
17142059 * OK, since we're going to drop the lock immediately
17152060 * afterwards anyway.
17162061 */
1717
- rq = move_queued_task(rq, &rf, p, dest_cpu);
2062
+ rq = move_queued_task(rq, rf, p, dest_cpu);
17182063 }
17192064 out:
1720
- task_rq_unlock(rq, p, &rf);
2065
+ task_rq_unlock(rq, p, rf);
17212066
17222067 return ret;
2068
+}
2069
+
2070
+/*
2071
+ * Change a given task's CPU affinity. Migrate the thread to a
2072
+ * proper CPU and schedule it away if the CPU it's executing on
2073
+ * is removed from the allowed bitmask.
2074
+ *
2075
+ * NOTE: the caller must have a valid reference to the task, the
2076
+ * task must not exit() & deallocate itself prematurely. The
2077
+ * call is not atomic; no spinlocks may be held.
2078
+ */
2079
+static int __set_cpus_allowed_ptr(struct task_struct *p,
2080
+ const struct cpumask *new_mask, bool check)
2081
+{
2082
+ struct rq_flags rf;
2083
+ struct rq *rq;
2084
+
2085
+ rq = task_rq_lock(p, &rf);
2086
+ return __set_cpus_allowed_ptr_locked(p, new_mask, check, rq, &rf);
17232087 }
17242088
17252089 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
....@@ -1727,6 +2091,74 @@
17272091 return __set_cpus_allowed_ptr(p, new_mask, false);
17282092 }
17292093 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
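
A brief sketch of a typical in-kernel caller of set_cpus_allowed_ptr(), which the hunk above keeps exported; the names are illustrative and kthread_bind() would work equally well for the single-CPU case.

#include <linux/kthread.h>
#include <linux/sched.h>

static int example_thread_fn(void *data)
{
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static struct task_struct *example_spawn_pinned(int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create(example_thread_fn, NULL, "example/%d", cpu);
	if (!IS_ERR(tsk)) {
		/* Restrict the new thread to @cpu before its first wakeup. */
		set_cpus_allowed_ptr(tsk, cpumask_of(cpu));
		wake_up_process(tsk);
	}
	return tsk;
}
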
2094
+
2095
+/*
2096
+ * Change a given task's CPU affinity to the intersection of its current
2097
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
2098
+ * If the resulting mask is empty, leave the affinity unchanged and return
2099
+ * -EINVAL.
2100
+ */
2101
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
2102
+ struct cpumask *new_mask,
2103
+ const struct cpumask *subset_mask)
2104
+{
2105
+ struct rq_flags rf;
2106
+ struct rq *rq;
2107
+
2108
+ rq = task_rq_lock(p, &rf);
2109
+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
2110
+ task_rq_unlock(rq, p, &rf);
2111
+ return -EINVAL;
2112
+ }
2113
+
2114
+ return __set_cpus_allowed_ptr_locked(p, new_mask, false, rq, &rf);
2115
+}
2116
+
2117
+/*
2118
+ * Restrict a given task's CPU affinity so that it is a subset of
2119
+ * task_cpu_possible_mask(). If the resulting mask is empty, we warn and
2120
+ * walk up the cpuset hierarchy until we find a suitable mask.
2121
+ */
2122
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
2123
+{
2124
+ cpumask_var_t new_mask;
2125
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
2126
+
2127
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
2128
+
2129
+ /*
2130
+ * __migrate_task() can fail silently in the face of concurrent
2131
+ * offlining of the chosen destination CPU, so take the hotplug
2132
+ * lock to ensure that the migration succeeds.
2133
+ */
2134
+ trace_android_rvh_force_compatible_pre(NULL);
2135
+ cpus_read_lock();
2136
+ if (!cpumask_available(new_mask))
2137
+ goto out_set_mask;
2138
+
2139
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
2140
+ goto out_free_mask;
2141
+
2142
+ /*
2143
+ * We failed to find a valid subset of the affinity mask for the
2144
+ * task, so override it based on its cpuset hierarchy.
2145
+ */
2146
+ cpuset_cpus_allowed(p, new_mask);
2147
+ override_mask = new_mask;
2148
+
2149
+out_set_mask:
2150
+ if (printk_ratelimit()) {
2151
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
2152
+ task_pid_nr(p), p->comm,
2153
+ cpumask_pr_args(override_mask));
2154
+ }
2155
+
2156
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
2157
+out_free_mask:
2158
+ cpus_read_unlock();
2159
+ trace_android_rvh_force_compatible_post(NULL);
2160
+ free_cpumask_var(new_mask);
2161
+}
17302162
17312163 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
17322164 {
....@@ -1775,12 +2207,13 @@
17752207 p->se.nr_migrations++;
17762208 rseq_migrate(p);
17772209 perf_event_task_migrate(p);
2210
+ trace_android_rvh_set_task_cpu(p, new_cpu);
17782211 }
17792212
17802213 __set_task_cpu(p, new_cpu);
17812214 }
2215
+EXPORT_SYMBOL_GPL(set_task_cpu);
17822216
1783
-#ifdef CONFIG_NUMA_BALANCING
17842217 static void __migrate_swap_task(struct task_struct *p, int cpu)
17852218 {
17862219 if (task_on_rq_queued(p)) {
....@@ -1793,11 +2226,9 @@
17932226 rq_pin_lock(src_rq, &srf);
17942227 rq_pin_lock(dst_rq, &drf);
17952228
1796
- p->on_rq = TASK_ON_RQ_MIGRATING;
17972229 deactivate_task(src_rq, p, 0);
17982230 set_task_cpu(p, cpu);
17992231 activate_task(dst_rq, p, 0);
1800
- p->on_rq = TASK_ON_RQ_QUEUED;
18012232 check_preempt_curr(dst_rq, p, 0);
18022233
18032234 rq_unpin_lock(dst_rq, &drf);
....@@ -1840,10 +2271,10 @@
18402271 if (task_cpu(arg->src_task) != arg->src_cpu)
18412272 goto unlock;
18422273
1843
- if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
2274
+ if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
18442275 goto unlock;
18452276
1846
- if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
2277
+ if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
18472278 goto unlock;
18482279
18492280 __migrate_swap_task(arg->src_task, arg->dst_cpu);
....@@ -1885,10 +2316,10 @@
18852316 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
18862317 goto out;
18872318
1888
- if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
2319
+ if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
18892320 goto out;
18902321
1891
- if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
2322
+ if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
18922323 goto out;
18932324
18942325 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
....@@ -1897,7 +2328,7 @@
18972328 out:
18982329 return ret;
18992330 }
1900
-#endif /* CONFIG_NUMA_BALANCING */
2331
+EXPORT_SYMBOL_GPL(migrate_swap);
19012332
19022333 /*
19032334 * wait_task_inactive - wait for a thread to unschedule.
....@@ -2033,7 +2464,7 @@
20332464 EXPORT_SYMBOL_GPL(kick_process);
20342465
20352466 /*
2036
- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2467
+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
20372468 *
20382469 * A few notes on cpu_active vs cpu_online:
20392470 *
....@@ -2059,7 +2490,11 @@
20592490 int nid = cpu_to_node(cpu);
20602491 const struct cpumask *nodemask = NULL;
20612492 enum { cpuset, possible, fail } state = cpuset;
2062
- int dest_cpu;
2493
+ int dest_cpu = -1;
2494
+
2495
+ trace_android_rvh_select_fallback_rq(cpu, p, &dest_cpu);
2496
+ if (dest_cpu >= 0)
2497
+ return dest_cpu;
20632498
20642499 /*
20652500 * If the node that the CPU is on has been offlined, cpu_to_node()
....@@ -2071,16 +2506,14 @@
20712506
20722507 /* Look for allowed, online CPU in same node. */
20732508 for_each_cpu(dest_cpu, nodemask) {
2074
- if (!cpu_active(dest_cpu))
2075
- continue;
2076
- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2509
+ if (is_cpu_allowed(p, dest_cpu))
20772510 return dest_cpu;
20782511 }
20792512 }
20802513
20812514 for (;;) {
20822515 /* Any allowed, online CPU? */
2083
- for_each_cpu(dest_cpu, &p->cpus_allowed) {
2516
+ for_each_cpu(dest_cpu, p->cpus_ptr) {
20842517 if (!is_cpu_allowed(p, dest_cpu))
20852518 continue;
20862519
....@@ -2095,12 +2528,11 @@
20952528 state = possible;
20962529 break;
20972530 }
2098
- /* Fall-through */
2531
+ fallthrough;
20992532 case possible:
2100
- do_set_cpus_allowed(p, cpu_possible_mask);
2533
+ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
21012534 state = fail;
21022535 break;
2103
-
21042536 case fail:
21052537 BUG();
21062538 break;
....@@ -2124,23 +2556,21 @@
21242556 }
21252557
21262558 /*
2127
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2559
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
21282560 */
21292561 static inline
2130
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
2131
- int sibling_count_hint)
2562
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
21322563 {
21332564 lockdep_assert_held(&p->pi_lock);
21342565
21352566 if (p->nr_cpus_allowed > 1)
2136
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
2137
- sibling_count_hint);
2567
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
21382568 else
2139
- cpu = cpumask_any(&p->cpus_allowed);
2569
+ cpu = cpumask_any(p->cpus_ptr);
21402570
21412571 /*
21422572 * In order not to call set_task_cpu() on a blocking task we need
2143
- * to rely on ttwu() to place the task on a valid ->cpus_allowed
2573
+ * to rely on ttwu() to place the task on a valid ->cpus_ptr
21442574 * CPU.
21452575 *
21462576 * Since this is common to all placement strategies, this lives here.
....@@ -2152,12 +2582,6 @@
21522582 cpu = select_fallback_rq(task_cpu(p), p);
21532583
21542584 return cpu;
2155
-}
2156
-
2157
-static void update_avg(u64 *avg, u64 sample)
2158
-{
2159
- s64 diff = sample - *avg;
2160
- *avg += diff >> 3;
21612585 }
21622586
21632587 void sched_set_stop_task(int cpu, struct task_struct *stop)
....@@ -2239,16 +2663,6 @@
22392663 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
22402664 }
22412665
2242
-static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2243
-{
2244
- activate_task(rq, p, en_flags);
2245
- p->on_rq = TASK_ON_RQ_QUEUED;
2246
-
2247
- /* If a worker is waking up, notify the workqueue: */
2248
- if (p->flags & PF_WQ_WORKER)
2249
- wq_worker_waking_up(p, cpu_of(rq));
2250
-}
2251
-
22522666 /*
22532667 * Mark the task runnable and perform wakeup-preemption.
22542668 */
....@@ -2290,27 +2704,54 @@
22902704 {
22912705 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
22922706
2707
+ if (wake_flags & WF_SYNC)
2708
+ en_flags |= ENQUEUE_WAKEUP_SYNC;
2709
+
22932710 lockdep_assert_held(&rq->lock);
22942711
2295
-#ifdef CONFIG_SMP
22962712 if (p->sched_contributes_to_load)
22972713 rq->nr_uninterruptible--;
22982714
2715
+#ifdef CONFIG_SMP
22992716 if (wake_flags & WF_MIGRATED)
23002717 en_flags |= ENQUEUE_MIGRATED;
2718
+ else
23012719 #endif
2720
+ if (p->in_iowait) {
2721
+ delayacct_blkio_end(p);
2722
+ atomic_dec(&task_rq(p)->nr_iowait);
2723
+ }
23022724
2303
- ttwu_activate(rq, p, en_flags);
2725
+ activate_task(rq, p, en_flags);
23042726 ttwu_do_wakeup(rq, p, wake_flags, rf);
23052727 }
23062728
23072729 /*
2308
- * Called in case the task @p isn't fully descheduled from its runqueue,
2309
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2310
- * since all we need to do is flip p->state to TASK_RUNNING, since
2311
- * the task is still ->on_rq.
2730
+ * Consider @p being inside a wait loop:
2731
+ *
2732
+ * for (;;) {
2733
+ * set_current_state(TASK_UNINTERRUPTIBLE);
2734
+ *
2735
+ * if (CONDITION)
2736
+ * break;
2737
+ *
2738
+ * schedule();
2739
+ * }
2740
+ * __set_current_state(TASK_RUNNING);
2741
+ *
2742
+ * between set_current_state() and schedule(). In this case @p is still
2743
+ * runnable, so all that needs doing is to change p->state back to TASK_RUNNING in
2744
+ * an atomic manner.
2745
+ *
2746
+ * By taking task_rq(p)->lock we serialize against schedule(); if @p->on_rq
2747
+ * then schedule() must still happen and p->state can be changed to
2748
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
2749
+ * need to do a full wakeup with enqueue.
2750
+ *
2751
+ * Returns: %true when the wakeup is done,
2752
+ * %false otherwise.
23122753 */
2313
-static int ttwu_remote(struct task_struct *p, int wake_flags)
2754
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
23142755 {
23152756 struct rq_flags rf;
23162757 struct rq *rq;
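
Illustrative note: the wait-loop documented above only works when the waker publishes CONDITION before issuing the wakeup, pairing with set_current_state()'s barrier. A minimal kernel-style sketch of both sides; my_cond and my_waiter are hypothetical names, not part of this patch:

#include <linux/sched.h>

static int my_cond;
static struct task_struct *my_waiter;

static void my_wait_side(void)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(my_cond))		/* CONDITION */
			break;
		schedule();			/* may race with ttwu_runnable() */
	}
	__set_current_state(TASK_RUNNING);
}

static void my_wake_side(void)
{
	WRITE_ONCE(my_cond, 1);			/* CONDITION = 1, before the wakeup */
	wake_up_process(my_waiter);		/* ends up in try_to_wake_up() */
}
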
....@@ -2329,75 +2770,63 @@
23292770 }
23302771
23312772 #ifdef CONFIG_SMP
2332
-void sched_ttwu_pending(void)
2773
+void sched_ttwu_pending(void *arg)
23332774 {
2775
+ struct llist_node *llist = arg;
23342776 struct rq *rq = this_rq();
2335
- struct llist_node *llist = llist_del_all(&rq->wake_list);
23362777 struct task_struct *p, *t;
23372778 struct rq_flags rf;
23382779
23392780 if (!llist)
23402781 return;
23412782
2783
+ /*
2784
+ * rq::ttwu_pending is a racy indication of outstanding wakeups.
2785
+ * Races are such that false-negatives are possible, since they
2786
+ * are shorter lived than false-positives would be.
2787
+ */
2788
+ WRITE_ONCE(rq->ttwu_pending, 0);
2789
+
23422790 rq_lock_irqsave(rq, &rf);
23432791 update_rq_clock(rq);
23442792
2345
- llist_for_each_entry_safe(p, t, llist, wake_entry)
2793
+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
2794
+ if (WARN_ON_ONCE(p->on_cpu))
2795
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
2796
+
2797
+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
2798
+ set_task_cpu(p, cpu_of(rq));
2799
+
23462800 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
2801
+ }
23472802
23482803 rq_unlock_irqrestore(rq, &rf);
23492804 }
23502805
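
Illustrative note: sched_ttwu_pending() above drains a lock-free llist that remote CPUs push wakeups onto. The underlying <linux/llist.h> pattern, reduced to a self-contained sketch; struct my_item and both functions are hypothetical:

#include <linux/llist.h>
#include <linux/slab.h>

struct my_item {
	int payload;
	struct llist_node node;
};

static LLIST_HEAD(my_queue);

/* Producer side, e.g. a remote CPU queueing work: lock-free, multi-producer. */
static void my_produce(int v)
{
	struct my_item *it = kmalloc(sizeof(*it), GFP_ATOMIC);

	if (!it)
		return;
	it->payload = v;
	llist_add(&it->node, &my_queue);
}

/* Consumer side, e.g. an IPI handler: grab the whole list at once, then walk it. */
static void my_consume(void)
{
	struct llist_node *first = llist_del_all(&my_queue);
	struct my_item *it, *tmp;

	llist_for_each_entry_safe(it, tmp, first, node) {
		/* process it->payload; cf. ttwu_do_activate() per queued task */
		kfree(it);
	}
}
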
2351
-void scheduler_ipi(void)
2806
+void send_call_function_single_ipi(int cpu)
23522807 {
2353
- /*
2354
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
2355
- * TIF_NEED_RESCHED remotely (for the first time) will also send
2356
- * this IPI.
2357
- */
2358
- preempt_fold_need_resched();
2808
+ struct rq *rq = cpu_rq(cpu);
23592809
2360
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2361
- return;
2362
-
2363
- /*
2364
- * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2365
- * traditionally all their work was done from the interrupt return
2366
- * path. Now that we actually do some work, we need to make sure
2367
- * we do call them.
2368
- *
2369
- * Some archs already do call them, luckily irq_enter/exit nest
2370
- * properly.
2371
- *
2372
- * Arguably we should visit all archs and update all handlers,
2373
- * however a fair share of IPIs are still resched only so this would
2374
- * somewhat pessimize the simple resched case.
2375
- */
2376
- irq_enter();
2377
- sched_ttwu_pending();
2378
-
2379
- /*
2380
- * Check if someone kicked us for doing the nohz idle load balance.
2381
- */
2382
- if (unlikely(got_nohz_idle_kick())) {
2383
- this_rq()->idle_balance = 1;
2384
- raise_softirq_irqoff(SCHED_SOFTIRQ);
2385
- }
2386
- irq_exit();
2810
+ if (!set_nr_if_polling(rq->idle))
2811
+ arch_send_call_function_single_ipi(cpu);
2812
+ else
2813
+ trace_sched_wake_idle_without_ipi(cpu);
23872814 }
23882815
2389
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
2816
+/*
2817
+ * Queue a task on the target CPU's wake_list and wake the CPU via IPI if
2818
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
2819
+ * via sched_ttwu_pending() for activation so the wakee incurs the cost
2820
+ * of the wakeup instead of the waker.
2821
+ */
2822
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
23902823 {
23912824 struct rq *rq = cpu_rq(cpu);
23922825
23932826 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
23942827
2395
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
2396
- if (!set_nr_if_polling(rq->idle))
2397
- smp_send_reschedule(cpu);
2398
- else
2399
- trace_sched_wake_idle_without_ipi(cpu);
2400
- }
2828
+ WRITE_ONCE(rq->ttwu_pending, 1);
2829
+ __smp_call_single_queue(cpu, &p->wake_entry.llist);
24012830 }
24022831
24032832 void wake_up_if_idle(int cpu)
....@@ -2423,6 +2852,7 @@
24232852 out:
24242853 rcu_read_unlock();
24252854 }
2855
+EXPORT_SYMBOL_GPL(wake_up_if_idle);
24262856
24272857 bool cpus_share_cache(int this_cpu, int that_cpu)
24282858 {
....@@ -2431,6 +2861,58 @@
24312861
24322862 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
24332863 }
2864
+
2865
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
2866
+{
2867
+ /*
2868
+ * If the CPU does not share cache, then queue the task on the
2869
+ * remote rq's wakelist to avoid accessing remote data.
2870
+ */
2871
+ if (!cpus_share_cache(smp_processor_id(), cpu))
2872
+ return true;
2873
+
2874
+ /*
2875
+ * If the task is descheduling and is the only running task on the
2876
+ * CPU, then use the wakelist to offload the task activation to
2877
+ * the soon-to-be-idle CPU as the current CPU is likely busy.
2878
+ * nr_running is checked to avoid unnecessary task stacking.
2879
+ *
2880
+ * Note that we can only get here with (wakee) p->on_rq=0,
2881
+ * p->on_cpu can be whatever, we've done the dequeue, so
2882
+ * the wakee has been accounted out of ->nr_running.
2883
+ */
2884
+ if ((wake_flags & WF_ON_CPU) && !cpu_rq(cpu)->nr_running)
2885
+ return true;
2886
+
2887
+ return false;
2888
+}
2889
+
2890
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2891
+{
2892
+ bool cond = false;
2893
+
2894
+ trace_android_rvh_ttwu_cond(&cond);
2895
+
2896
+ if ((sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) ||
2897
+ cond) {
2898
+ if (WARN_ON_ONCE(cpu == smp_processor_id()))
2899
+ return false;
2900
+
2901
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2902
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
2903
+ return true;
2904
+ }
2905
+
2906
+ return false;
2907
+}
2908
+
2909
+#else /* !CONFIG_SMP */
2910
+
2911
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2912
+{
2913
+ return false;
2914
+}
2915
+
24342916 #endif /* CONFIG_SMP */
24352917
24362918 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
....@@ -2438,13 +2920,8 @@
24382920 struct rq *rq = cpu_rq(cpu);
24392921 struct rq_flags rf;
24402922
2441
-#if defined(CONFIG_SMP)
2442
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
2443
- sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2444
- ttwu_queue_remote(p, cpu, wake_flags);
2923
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
24452924 return;
2446
- }
2447
-#endif
24482925
24492926 rq_lock(rq, &rf);
24502927 update_rq_clock(rq);
....@@ -2500,8 +2977,8 @@
25002977 * migration. However the means are completely different as there is no lock
25012978 * chain to provide order. Instead we do:
25022979 *
2503
- * 1) smp_store_release(X->on_cpu, 0)
2504
- * 2) smp_cond_load_acquire(!X->on_cpu)
2980
+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
2981
+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
25052982 *
25062983 * Example:
25072984 *
....@@ -2540,45 +3017,95 @@
25403017 * @p: the thread to be awakened
25413018 * @state: the mask of task states that can be woken
25423019 * @wake_flags: wake modifier flags (WF_*)
2543
- * @sibling_count_hint: A hint at the number of threads that are being woken up
2544
- * in this event.
25453020 *
2546
- * If (@state & @p->state) @p->state = TASK_RUNNING.
3021
+ * Conceptually does:
3022
+ *
3023
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
25473024 *
25483025 * If the task was not queued/runnable, also place it back on a runqueue.
25493026 *
2550
- * Atomic against schedule() which would dequeue a task, also see
2551
- * set_current_state().
3027
+ * This function is atomic against schedule() which would dequeue the task.
25523028 *
2553
- * This function executes a full memory barrier before accessing the task
2554
- * state; see set_current_state().
3029
+ * It issues a full memory barrier before accessing @p->state, see the comment
3030
+ * with set_current_state().
3031
+ *
3032
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
3033
+ *
3034
+ * Relies on p->pi_lock stabilizing:
3035
+ * - p->sched_class
3036
+ * - p->cpus_ptr
3037
+ * - p->sched_task_group
3038
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3039
+ *
3040
+ * Tries really hard to only take one task_rq(p)->lock for performance.
3041
+ * Takes rq->lock in:
3042
+ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
3043
+ * - ttwu_queue() -- new rq, for enqueue of the task;
3044
+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3045
+ *
3046
+ * As a consequence we race really badly with just about everything. See the
3047
+ * many memory barriers and their comments for details.
25553048 *
25563049 * Return: %true if @p->state changes (an actual wakeup was done),
25573050 * %false otherwise.
25583051 */
25593052 static int
2560
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
2561
- int sibling_count_hint)
3053
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
25623054 {
25633055 unsigned long flags;
25643056 int cpu, success = 0;
25653057
3058
+ preempt_disable();
3059
+ if (p == current) {
3060
+ /*
3061
+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
3062
+ * == smp_processor_id()'. Together this means we can special
3063
+ * case the whole 'p->on_rq && ttwu_runnable()' case below
3064
+ * without taking any locks.
3065
+ *
3066
+ * In particular:
3067
+ * - we rely on Program-Order guarantees for all the ordering,
3068
+ * - we're serialized against set_special_state() by virtue of
3069
+ * it disabling IRQs (this allows not taking ->pi_lock).
3070
+ */
3071
+ if (!(p->state & state))
3072
+ goto out;
3073
+
3074
+ success = 1;
3075
+ trace_sched_waking(p);
3076
+ p->state = TASK_RUNNING;
3077
+ trace_sched_wakeup(p);
3078
+ goto out;
3079
+ }
3080
+
25663081 /*
25673082 * If we are going to wake up a thread waiting for CONDITION we
25683083 * need to ensure that CONDITION=1 done by the caller can not be
2569
- * reordered with p->state check below. This pairs with mb() in
2570
- * set_current_state() the waiting thread does.
3084
+ * reordered with p->state check below. This pairs with smp_store_mb()
3085
+ * in set_current_state() that the waiting thread does.
25713086 */
25723087 raw_spin_lock_irqsave(&p->pi_lock, flags);
25733088 smp_mb__after_spinlock();
25743089 if (!(p->state & state))
2575
- goto out;
3090
+ goto unlock;
3091
+
3092
+#ifdef CONFIG_FREEZER
3093
+ /*
3094
+ * If we're going to wake up a thread which may be frozen, then
3095
+ * we can only do so if we have an active CPU which is capable of
3096
+ * running it. This may not be the case when resuming from suspend,
3097
+ * as the secondary CPUs may not yet be back online. See __thaw_task()
3098
+ * for the actual wakeup.
3099
+ */
3100
+ if (unlikely(frozen_or_skipped(p)) &&
3101
+ !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p)))
3102
+ goto unlock;
3103
+#endif
25763104
25773105 trace_sched_waking(p);
25783106
25793107 /* We're going to change ->state: */
25803108 success = 1;
2581
- cpu = task_cpu(p);
25823109
25833110 /*
25843111 * Ensure we load p->on_rq _after_ p->state, otherwise it would
....@@ -2599,10 +3126,15 @@
25993126 *
26003127 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
26013128 * __schedule(). See the comment for smp_mb__after_spinlock().
3129
+ *
3130
+ * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
26023131 */
26033132 smp_rmb();
2604
- if (p->on_rq && ttwu_remote(p, wake_flags))
2605
- goto stat;
3133
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
3134
+ goto unlock;
3135
+
3136
+ if (p->state & TASK_UNINTERRUPTIBLE)
3137
+ trace_sched_blocked_reason(p);
26063138
26073139 #ifdef CONFIG_SMP
26083140 /*
....@@ -2623,8 +3155,43 @@
26233155 *
26243156 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
26253157 * __schedule(). See the comment for smp_mb__after_spinlock().
3158
+ *
3159
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
3160
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
3161
+ * care about its own p->state. See the comment in __schedule().
26263162 */
2627
- smp_rmb();
3163
+ smp_acquire__after_ctrl_dep();
3164
+
3165
+ /*
3166
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
3167
+ * == 0), which means we need to do an enqueue, change p->state to
3168
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
3169
+ * enqueue, such as ttwu_queue_wakelist().
3170
+ */
3171
+ p->state = TASK_WAKING;
3172
+
3173
+ /*
3174
+ * If the owning (remote) CPU is still in the middle of schedule() with
3175
+ * this task as prev, consider queueing p on the remote CPU's wake_list
3176
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
3177
+ * let the waker make forward progress. This is safe because IRQs are
3178
+ * disabled and the IPI will deliver after on_cpu is cleared.
3179
+ *
3180
+ * Ensure we load task_cpu(p) after p->on_cpu:
3181
+ *
3182
+ *   set_task_cpu(p, cpu);
3183
+ *     STORE p->cpu = @cpu
3184
+ *   __schedule() (switch to task 'p')
3185
+ *     LOCK rq->lock
3186
+ *     smp_mb__after_spin_lock()          smp_cond_load_acquire(&p->on_cpu)
3187
+ *     STORE p->on_cpu = 1                  LOAD p->cpu
3188
+ *
3189
+ * to ensure we observe the correct CPU on which the task is currently
3190
+ * scheduling.
3191
+ */
3192
+ if (smp_load_acquire(&p->on_cpu) &&
3193
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
3194
+ goto unlock;
26283195
26293196 /*
26303197 * If the owning (remote) CPU is still in the middle of schedule() with
....@@ -2637,88 +3204,79 @@
26373204 */
26383205 smp_cond_load_acquire(&p->on_cpu, !VAL);
26393206
2640
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
2641
- p->state = TASK_WAKING;
3207
+ trace_android_rvh_try_to_wake_up(p);
26423208
2643
- if (p->in_iowait) {
2644
- delayacct_blkio_end(p);
2645
- atomic_dec(&task_rq(p)->nr_iowait);
2646
- }
2647
-
2648
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
2649
- sibling_count_hint);
3209
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
26503210 if (task_cpu(p) != cpu) {
3211
+ if (p->in_iowait) {
3212
+ delayacct_blkio_end(p);
3213
+ atomic_dec(&task_rq(p)->nr_iowait);
3214
+ }
3215
+
26513216 wake_flags |= WF_MIGRATED;
26523217 psi_ttwu_dequeue(p);
26533218 set_task_cpu(p, cpu);
26543219 }
2655
-
2656
-#else /* CONFIG_SMP */
2657
-
2658
- if (p->in_iowait) {
2659
- delayacct_blkio_end(p);
2660
- atomic_dec(&task_rq(p)->nr_iowait);
2661
- }
2662
-
3220
+#else
3221
+ cpu = task_cpu(p);
26633222 #endif /* CONFIG_SMP */
26643223
26653224 ttwu_queue(p, cpu, wake_flags);
2666
-stat:
2667
- ttwu_stat(p, cpu, wake_flags);
2668
-out:
3225
+unlock:
26693226 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3227
+out:
3228
+ if (success) {
3229
+ trace_android_rvh_try_to_wake_up_success(p);
3230
+ ttwu_stat(p, task_cpu(p), wake_flags);
3231
+ }
3232
+ preempt_enable();
26703233
26713234 return success;
26723235 }
26733236
26743237 /**
2675
- * try_to_wake_up_local - try to wake up a local task with rq lock held
2676
- * @p: the thread to be awakened
2677
- * @rf: request-queue flags for pinning
3238
+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
3239
+ * @p: Process for which the function is to be invoked, can be @current.
3240
+ * @func: Function to invoke.
3241
+ * @arg: Argument to function.
26783242 *
2679
- * Put @p on the run-queue if it's not already there. The caller must
2680
- * ensure that this_rq() is locked, @p is bound to this_rq() and not
2681
- * the current task.
3243
+ * If the specified task can be quickly locked into a definite state
3244
+ * (either sleeping or on a given runqueue), arrange to keep it in that
3245
+ * state while invoking @func(@arg). This function can use ->on_rq and
3246
+ * task_curr() to work out what the state is, if required. Given that
3247
+ * @func can be invoked with a runqueue lock held, it had better be quite
3248
+ * lightweight.
3249
+ *
3250
+ * Returns:
3251
+ * @false if the task slipped out from under the locks.
3252
+ * @true if the task was locked onto a runqueue or is sleeping.
3253
+ * However, @func can override this by returning @false.
26823254 */
2683
-static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
3255
+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
26843256 {
2685
- struct rq *rq = task_rq(p);
3257
+ struct rq_flags rf;
3258
+ bool ret = false;
3259
+ struct rq *rq;
26863260
2687
- if (WARN_ON_ONCE(rq != this_rq()) ||
2688
- WARN_ON_ONCE(p == current))
2689
- return;
2690
-
2691
- lockdep_assert_held(&rq->lock);
2692
-
2693
- if (!raw_spin_trylock(&p->pi_lock)) {
2694
- /*
2695
- * This is OK, because current is on_cpu, which avoids it being
2696
- * picked for load-balance and preemption/IRQs are still
2697
- * disabled avoiding further scheduler activity on it and we've
2698
- * not yet picked a replacement task.
2699
- */
2700
- rq_unlock(rq, rf);
2701
- raw_spin_lock(&p->pi_lock);
2702
- rq_relock(rq, rf);
2703
- }
2704
-
2705
- if (!(p->state & TASK_NORMAL))
2706
- goto out;
2707
-
2708
- trace_sched_waking(p);
2709
-
2710
- if (!task_on_rq_queued(p)) {
2711
- if (p->in_iowait) {
2712
- delayacct_blkio_end(p);
2713
- atomic_dec(&rq->nr_iowait);
3261
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3262
+ if (p->on_rq) {
3263
+ rq = __task_rq_lock(p, &rf);
3264
+ if (task_rq(p) == rq)
3265
+ ret = func(p, arg);
3266
+ rq_unlock(rq, &rf);
3267
+ } else {
3268
+ switch (p->state) {
3269
+ case TASK_RUNNING:
3270
+ case TASK_WAKING:
3271
+ break;
3272
+ default:
3273
+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
3274
+ if (!p->on_rq)
3275
+ ret = func(p, arg);
27143276 }
2715
- ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
27163277 }
2717
-
2718
- ttwu_do_wakeup(rq, p, 0, rf);
2719
- ttwu_stat(p, smp_processor_id(), 0);
2720
-out:
2721
- raw_spin_unlock(&p->pi_lock);
3278
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3279
+ return ret;
27223280 }
27233281
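
Illustrative note: a sketch of how a caller might use try_invoke_on_locked_down_task(); the callback runs with @p pinned either sleeping or on its runqueue, so it must stay lightweight. my_report_cpu() and my_probe() are hypothetical:

#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/printk.h>

static bool my_report_cpu(struct task_struct *t, void *arg)
{
	*(int *)arg = task_cpu(t);	/* t is pinned: either on its rq or sleeping */
	return true;			/* report that a stable state was seen */
}

static void my_probe(struct task_struct *p)
{
	int cpu = -1;

	if (try_invoke_on_locked_down_task(p, my_report_cpu, &cpu))
		pr_info("%s/%d last ran on CPU %d\n", p->comm, p->pid, cpu);
}
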
27243282 /**
....@@ -2734,13 +3292,13 @@
27343292 */
27353293 int wake_up_process(struct task_struct *p)
27363294 {
2737
- return try_to_wake_up(p, TASK_NORMAL, 0, 1);
3295
+ return try_to_wake_up(p, TASK_NORMAL, 0);
27383296 }
27393297 EXPORT_SYMBOL(wake_up_process);
27403298
27413299 int wake_up_state(struct task_struct *p, unsigned int state)
27423300 {
2743
- return try_to_wake_up(p, state, 0, 1);
3301
+ return try_to_wake_up(p, state, 0);
27443302 }
27453303
27463304 /*
....@@ -2765,6 +3323,8 @@
27653323 p->se.cfs_rq = NULL;
27663324 #endif
27673325
3326
+ trace_android_rvh_sched_fork_init(p);
3327
+
27683328 #ifdef CONFIG_SCHEDSTATS
27693329 /* Even if schedstat is disabled, there should not be garbage */
27703330 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
....@@ -2785,7 +3345,13 @@
27853345 INIT_HLIST_HEAD(&p->preempt_notifiers);
27863346 #endif
27873347
3348
+#ifdef CONFIG_COMPACTION
3349
+ p->capture_control = NULL;
3350
+#endif
27883351 init_numa_balancing(clone_flags, p);
3352
+#ifdef CONFIG_SMP
3353
+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
3354
+#endif
27893355 }
27903356
27913357 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
....@@ -2802,7 +3368,7 @@
28023368
28033369 #ifdef CONFIG_PROC_SYSCTL
28043370 int sysctl_numa_balancing(struct ctl_table *table, int write,
2805
- void __user *buffer, size_t *lenp, loff_t *ppos)
3371
+ void *buffer, size_t *lenp, loff_t *ppos)
28063372 {
28073373 struct ctl_table t;
28083374 int err;
....@@ -2876,8 +3442,8 @@
28763442 }
28773443
28783444 #ifdef CONFIG_PROC_SYSCTL
2879
-int sysctl_schedstats(struct ctl_table *table, int write,
2880
- void __user *buffer, size_t *lenp, loff_t *ppos)
3445
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
3446
+ size_t *lenp, loff_t *ppos)
28813447 {
28823448 struct ctl_table t;
28833449 int err;
....@@ -2905,7 +3471,7 @@
29053471 */
29063472 int sched_fork(unsigned long clone_flags, struct task_struct *p)
29073473 {
2908
- unsigned long flags;
3474
+ trace_android_rvh_sched_fork(p);
29093475
29103476 __sched_fork(clone_flags, p);
29113477 /*
....@@ -2919,6 +3485,7 @@
29193485 * Make sure we do not leak PI boosting priority to the child.
29203486 */
29213487 p->prio = current->normal_prio;
3488
+ trace_android_rvh_prepare_prio_fork(p);
29223489
29233490 uclamp_fork(p);
29243491
....@@ -2933,8 +3500,8 @@
29333500 } else if (PRIO_TO_NICE(p->static_prio) < 0)
29343501 p->static_prio = NICE_TO_PRIO(0);
29353502
2936
- p->prio = p->normal_prio = __normal_prio(p);
2937
- set_load_weight(p, false);
3503
+ p->prio = p->normal_prio = p->static_prio;
3504
+ set_load_weight(p);
29383505
29393506 /*
29403507 * We don't need the reset flag anymore after the fork. It has
....@@ -2951,24 +3518,8 @@
29513518 p->sched_class = &fair_sched_class;
29523519
29533520 init_entity_runnable_average(&p->se);
3521
+ trace_android_rvh_finish_prio_fork(p);
29543522
2955
- /*
2956
- * The child is not yet in the pid-hash so no cgroup attach races,
2957
- * and the cgroup is pinned to this child due to cgroup_fork()
2958
- * is ran before sched_fork().
2959
- *
2960
- * Silence PROVE_RCU.
2961
- */
2962
- raw_spin_lock_irqsave(&p->pi_lock, flags);
2963
- rseq_migrate(p);
2964
- /*
2965
- * We're setting the CPU for the first time, we don't migrate,
2966
- * so use __set_task_cpu().
2967
- */
2968
- __set_task_cpu(p, smp_processor_id());
2969
- if (p->sched_class->task_fork)
2970
- p->sched_class->task_fork(p);
2971
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
29723523
29733524 #ifdef CONFIG_SCHED_INFO
29743525 if (likely(sched_info_on()))
....@@ -2983,6 +3534,41 @@
29833534 RB_CLEAR_NODE(&p->pushable_dl_tasks);
29843535 #endif
29853536 return 0;
3537
+}
3538
+
3539
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
3540
+{
3541
+ unsigned long flags;
3542
+
3543
+ /*
3544
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
3545
+ * required yet, but lockdep gets upset if rules are violated.
3546
+ */
3547
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
3548
+#ifdef CONFIG_CGROUP_SCHED
3549
+ if (1) {
3550
+ struct task_group *tg;
3551
+
3552
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
3553
+ struct task_group, css);
3554
+ tg = autogroup_task_group(p, tg);
3555
+ p->sched_task_group = tg;
3556
+ }
3557
+#endif
3558
+ rseq_migrate(p);
3559
+ /*
3560
+ * We're setting the CPU for the first time, we don't migrate,
3561
+ * so use __set_task_cpu().
3562
+ */
3563
+ __set_task_cpu(p, smp_processor_id());
3564
+ if (p->sched_class->task_fork)
3565
+ p->sched_class->task_fork(p);
3566
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3567
+}
3568
+
3569
+void sched_post_fork(struct task_struct *p)
3570
+{
3571
+ uclamp_post_fork(p);
29863572 }
29873573
29883574 unsigned long to_ratio(u64 period, u64 runtime)
....@@ -3013,12 +3599,14 @@
30133599 struct rq_flags rf;
30143600 struct rq *rq;
30153601
3602
+ trace_android_rvh_wake_up_new_task(p);
3603
+
30163604 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
30173605 p->state = TASK_RUNNING;
30183606 #ifdef CONFIG_SMP
30193607 /*
30203608 * Fork balancing, do it here and not earlier because:
3021
- * - cpus_allowed can change in the fork path
3609
+ * - cpus_ptr can change in the fork path
30223610 * - any previously selected CPU might disappear through hotplug
30233611 *
30243612 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
....@@ -3026,14 +3614,14 @@
30263614 */
30273615 p->recent_used_cpu = task_cpu(p);
30283616 rseq_migrate(p);
3029
- __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
3617
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
30303618 #endif
30313619 rq = __task_rq_lock(p, &rf);
30323620 update_rq_clock(rq);
3033
- post_init_entity_util_avg(&p->se);
3621
+ post_init_entity_util_avg(p);
3622
+ trace_android_rvh_new_task_stats(p);
30343623
30353624 activate_task(rq, p, ENQUEUE_NOCLOCK);
3036
- p->on_rq = TASK_ON_RQ_QUEUED;
30373625 trace_sched_wakeup_new(p);
30383626 check_preempt_curr(rq, p, WF_FORK);
30393627 #ifdef CONFIG_SMP
....@@ -3143,8 +3731,10 @@
31433731 /*
31443732 * Claim the task as running, we do this before switching to it
31453733 * such that any running task will have this set.
3734
+ *
3735
+ * See the ttwu() WF_ON_CPU case and its ordering comment.
31463736 */
3147
- next->on_cpu = 1;
3737
+ WRITE_ONCE(next->on_cpu, 1);
31483738 #endif
31493739 }
31503740
....@@ -3152,8 +3742,9 @@
31523742 {
31533743 #ifdef CONFIG_SMP
31543744 /*
3155
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
3156
- * We must ensure this doesn't happen until the switch is completely
3745
+ * This must be the very last reference to @prev from this CPU. After
3746
+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
3747
+ * must ensure this doesn't happen until the switch is completely
31573748 * finished.
31583749 *
31593750 * In particular, the load of prev->state in finish_task_switch() must
....@@ -3175,7 +3766,7 @@
31753766 * do an early lockdep release here:
31763767 */
31773768 rq_unpin_lock(rq, rf);
3178
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3769
+ spin_release(&rq->lock.dep_map, _THIS_IP_);
31793770 #ifdef CONFIG_DEBUG_SPINLOCK
31803771 /* this is a valid case when another task releases the spinlock */
31813772 rq->lock.owner = next;
....@@ -3320,11 +3911,12 @@
33203911 * task and put them back on the free list.
33213912 */
33223913 kprobe_flush_task(prev);
3914
+ trace_android_rvh_flush_task(prev);
33233915
33243916 /* Task is done with its stack. */
33253917 put_task_stack(prev);
33263918
3327
- put_task_struct(prev);
3919
+ put_task_struct_rcu_user(prev);
33283920 }
33293921
33303922 tick_nohz_task_switch();
....@@ -3403,12 +3995,8 @@
34033995 context_switch(struct rq *rq, struct task_struct *prev,
34043996 struct task_struct *next, struct rq_flags *rf)
34053997 {
3406
- struct mm_struct *mm, *oldmm;
3407
-
34083998 prepare_task_switch(rq, prev, next);
34093999
3410
- mm = next->mm;
3411
- oldmm = prev->active_mm;
34124000 /*
34134001 * For paravirt, this is coupled with an exit in switch_to to
34144002 * combine the page table reload and the switch backend into
....@@ -3417,22 +4005,37 @@
34174005 arch_start_context_switch(prev);
34184006
34194007 /*
3420
- * If mm is non-NULL, we pass through switch_mm(). If mm is
3421
- * NULL, we will pass through mmdrop() in finish_task_switch().
3422
- * Both of these contain the full memory barrier required by
3423
- * membarrier after storing to rq->curr, before returning to
3424
- * user-space.
4008
+ * kernel -> kernel lazy + transfer active
4009
+ * user -> kernel lazy + mmgrab() active
4010
+ *
4011
+ * kernel -> user switch + mmdrop() active
4012
+ * user -> user switch
34254013 */
3426
- if (!mm) {
3427
- next->active_mm = oldmm;
3428
- mmgrab(oldmm);
3429
- enter_lazy_tlb(oldmm, next);
3430
- } else
3431
- switch_mm_irqs_off(oldmm, mm, next);
4014
+ if (!next->mm) { // to kernel
4015
+ enter_lazy_tlb(prev->active_mm, next);
34324016
3433
- if (!prev->mm) {
3434
- prev->active_mm = NULL;
3435
- rq->prev_mm = oldmm;
4017
+ next->active_mm = prev->active_mm;
4018
+ if (prev->mm) // from user
4019
+ mmgrab(prev->active_mm);
4020
+ else
4021
+ prev->active_mm = NULL;
4022
+ } else { // to user
4023
+ membarrier_switch_mm(rq, prev->active_mm, next->mm);
4024
+ /*
4025
+ * sys_membarrier() requires an smp_mb() between setting
4026
+ * rq->curr / membarrier_switch_mm() and returning to userspace.
4027
+ *
4028
+ * The below provides this either through switch_mm(), or in
4029
+ * case 'prev->active_mm == next->mm' through
4030
+ * finish_task_switch()'s mmdrop().
4031
+ */
4032
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
4033
+
4034
+ if (!prev->mm) { // from kernel
4035
+ /* will mmdrop() in finish_task_switch(). */
4036
+ rq->prev_mm = prev->active_mm;
4037
+ prev->active_mm = NULL;
4038
+ }
34364039 }
34374040
34384041 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
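
Illustrative note: the lazy-mm cases above rely on the mmgrab()/mmdrop() reference rule: whoever keeps a borrowed mm as ->active_mm must hold an mm_struct reference until it is dropped. A minimal sketch; my_borrow_mm() is hypothetical:

#include <linux/sched/mm.h>

static void my_borrow_mm(struct mm_struct *mm)
{
	mmgrab(mm);	/* pins the mm_struct itself (unlike mmget(), which pins the address space) */
	/* ... mm may now be used as ->active_mm by a kernel thread ... */
	mmdrop(mm);	/* may free the mm_struct when the last reference goes away */
}
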
....@@ -3469,7 +4072,7 @@
34694072 * preemption, thus the result might have a time-of-check-to-time-of-use
34704073 * race. The caller is responsible to use it correctly, for example:
34714074 *
3472
- * - from a non-preemptable section (of course)
4075
+ * - from a non-preemptible section (of course)
34734076 *
34744077 * - from a thread that is bound to a single CPU
34754078 *
....@@ -3490,6 +4093,18 @@
34904093 sum += cpu_rq(i)->nr_switches;
34914094
34924095 return sum;
4096
+}
4097
+
4098
+/*
4099
+ * Consumers of these two interfaces, like for example the cpuidle menu
4100
+ * governor, are using nonsensical data: they prefer a shallow idle state for
4101
+ * a CPU that has IO-wait pending, even though the waiting task might not even
4102
+ * end up running on that CPU when it does become runnable.
4103
+ */
4104
+
4105
+unsigned long nr_iowait_cpu(int cpu)
4106
+{
4107
+ return atomic_read(&cpu_rq(cpu)->nr_iowait);
34934108 }
34944109
34954110 /*
....@@ -3527,29 +4142,9 @@
35274142 unsigned long i, sum = 0;
35284143
35294144 for_each_possible_cpu(i)
3530
- sum += atomic_read(&cpu_rq(i)->nr_iowait);
4145
+ sum += nr_iowait_cpu(i);
35314146
35324147 return sum;
3533
-}
3534
-
3535
-/*
3536
- * Consumers of these two interfaces, like for example the cpufreq menu
3537
- * governor are using nonsensical data. Boosting frequency for a CPU that has
3538
- * IO-wait which might not even end up running the task when it does become
3539
- * runnable.
3540
- */
3541
-
3542
-unsigned long nr_iowait_cpu(int cpu)
3543
-{
3544
- struct rq *this = cpu_rq(cpu);
3545
- return atomic_read(&this->nr_iowait);
3546
-}
3547
-
3548
-void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
3549
-{
3550
- struct rq *rq = this_rq();
3551
- *nr_waiters = atomic_read(&rq->nr_iowait);
3552
- *load = rq->load.weight;
35534148 }
35544149
35554150 #ifdef CONFIG_SMP
....@@ -3563,9 +4158,14 @@
35634158 struct task_struct *p = current;
35644159 unsigned long flags;
35654160 int dest_cpu;
4161
+ bool cond = false;
4162
+
4163
+ trace_android_rvh_sched_exec(&cond);
4164
+ if (cond)
4165
+ return;
35664166
35674167 raw_spin_lock_irqsave(&p->pi_lock, flags);
3568
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
4168
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
35694169 if (dest_cpu == smp_processor_id())
35704170 goto unlock;
35714171
....@@ -3648,6 +4248,7 @@
36484248
36494249 return ns;
36504250 }
4251
+EXPORT_SYMBOL_GPL(task_sched_runtime);
36514252
36524253 /*
36534254 * This function gets called by the timer code, with HZ frequency.
....@@ -3659,14 +4260,18 @@
36594260 struct rq *rq = cpu_rq(cpu);
36604261 struct task_struct *curr = rq->curr;
36614262 struct rq_flags rf;
4263
+ unsigned long thermal_pressure;
36624264
4265
+ arch_scale_freq_tick();
36634266 sched_clock_tick();
36644267
36654268 rq_lock(rq, &rf);
36664269
4270
+ trace_android_rvh_tick_entry(rq);
36674271 update_rq_clock(rq);
4272
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
4273
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
36684274 curr->sched_class->task_tick(rq, curr, 0);
3669
- cpu_load_update_active(rq);
36704275 calc_global_load_tick(rq);
36714276 psi_task_tick(rq);
36724277
....@@ -3678,6 +4283,8 @@
36784283 rq->idle_balance = idle_cpu(cpu);
36794284 trigger_load_balance(rq);
36804285 #endif
4286
+
4287
+ trace_android_vh_scheduler_tick(rq);
36814288 }
36824289
36834290 #ifdef CONFIG_NO_HZ_FULL
....@@ -3735,28 +4342,31 @@
37354342 * statistics and checks timeslices in a time-independent way, regardless
37364343 * of when exactly it is running.
37374344 */
3738
- if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
4345
+ if (!tick_nohz_tick_stopped_cpu(cpu))
37394346 goto out_requeue;
37404347
37414348 rq_lock_irq(rq, &rf);
37424349 curr = rq->curr;
3743
- if (is_idle_task(curr) || cpu_is_offline(cpu))
4350
+ if (cpu_is_offline(cpu))
37444351 goto out_unlock;
37454352
37464353 update_rq_clock(rq);
3747
- delta = rq_clock_task(rq) - curr->se.exec_start;
37484354
3749
- /*
3750
- * Make sure the next tick runs within a reasonable
3751
- * amount of time.
3752
- */
3753
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4355
+ if (!is_idle_task(curr)) {
4356
+ /*
4357
+ * Make sure the next tick runs within a reasonable
4358
+ * amount of time.
4359
+ */
4360
+ delta = rq_clock_task(rq) - curr->se.exec_start;
4361
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
4362
+ }
37544363 curr->sched_class->task_tick(rq, curr, 0);
37554364
4365
+ calc_load_nohz_remote(rq);
37564366 out_unlock:
37574367 rq_unlock_irq(rq, &rf);
3758
-
37594368 out_requeue:
4369
+
37604370 /*
37614371 * Run the remote tick once per second (1Hz). This arbitrary
37624372 * frequency is large enough to avoid overload but short enough
....@@ -3820,7 +4430,7 @@
38204430 static inline void sched_tick_stop(int cpu) { }
38214431 #endif
38224432
3823
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4433
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
38244434 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
38254435 /*
38264436 * If the value passed in is equal to the current preempt count
....@@ -3926,11 +4536,11 @@
39264536 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
39274537 && in_atomic_preempt_off()) {
39284538 pr_err("Preemption disabled at:");
3929
- print_ip_sym(preempt_disable_ip);
3930
- pr_cont("\n");
4539
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
39314540 }
3932
- if (panic_on_warn)
3933
- panic("scheduling while atomic\n");
4541
+ check_panic_on_warn("scheduling while atomic");
4542
+
4543
+ trace_android_rvh_schedule_bug(prev);
39344544
39354545 dump_stack();
39364546 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
....@@ -3939,11 +4549,23 @@
39394549 /*
39404550 * Various schedule()-time debugging checks and statistics:
39414551 */
3942
-static inline void schedule_debug(struct task_struct *prev)
4552
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
39434553 {
39444554 #ifdef CONFIG_SCHED_STACK_END_CHECK
39454555 if (task_stack_end_corrupted(prev))
39464556 panic("corrupted stack end detected inside scheduler\n");
4557
+
4558
+ if (task_scs_end_corrupted(prev))
4559
+ panic("corrupted shadow stack detected inside scheduler\n");
4560
+#endif
4561
+
4562
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
4563
+ if (!preempt && prev->state && prev->non_block_count) {
4564
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
4565
+ prev->comm, prev->pid, prev->non_block_count);
4566
+ dump_stack();
4567
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4568
+ }
39474569 #endif
39484570
39494571 if (unlikely(in_atomic_preempt_off())) {
....@@ -3955,6 +4577,28 @@
39554577 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
39564578
39574579 schedstat_inc(this_rq()->sched_count);
4580
+}
4581
+
4582
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
4583
+ struct rq_flags *rf)
4584
+{
4585
+#ifdef CONFIG_SMP
4586
+ const struct sched_class *class;
4587
+ /*
4588
+ * We must do the balancing pass before put_prev_task(), such
4589
+ * that when we release the rq->lock the task is in the same
4590
+ * state as before we took rq->lock.
4591
+ *
4592
+ * We can terminate the balance pass as soon as we know there is
4593
+ * a runnable task of @class priority or higher.
4594
+ */
4595
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
4596
+ if (class->balance(rq, prev, rf))
4597
+ break;
4598
+ }
4599
+#endif
4600
+
4601
+ put_prev_task(rq, prev);
39584602 }
39594603
39604604 /*
....@@ -3972,29 +4616,29 @@
39724616 * higher scheduling class, because otherwise those loose the
39734617 * opportunity to pull in more work from other CPUs.
39744618 */
3975
- if (likely((prev->sched_class == &idle_sched_class ||
3976
- prev->sched_class == &fair_sched_class) &&
4619
+ if (likely(prev->sched_class <= &fair_sched_class &&
39774620 rq->nr_running == rq->cfs.h_nr_running)) {
39784621
3979
- p = fair_sched_class.pick_next_task(rq, prev, rf);
4622
+ p = pick_next_task_fair(rq, prev, rf);
39804623 if (unlikely(p == RETRY_TASK))
3981
- goto again;
4624
+ goto restart;
39824625
39834626 /* Assumes fair_sched_class->next == idle_sched_class */
3984
- if (unlikely(!p))
3985
- p = idle_sched_class.pick_next_task(rq, prev, rf);
4627
+ if (!p) {
4628
+ put_prev_task(rq, prev);
4629
+ p = pick_next_task_idle(rq);
4630
+ }
39864631
39874632 return p;
39884633 }
39894634
3990
-again:
4635
+restart:
4636
+ put_prev_task_balance(rq, prev, rf);
4637
+
39914638 for_each_class(class) {
3992
- p = class->pick_next_task(rq, prev, rf);
3993
- if (p) {
3994
- if (unlikely(p == RETRY_TASK))
3995
- goto again;
4639
+ p = class->pick_next_task(rq);
4640
+ if (p)
39964641 return p;
3997
- }
39984642 }
39994643
40004644 /* The idle class should always have a runnable task: */
....@@ -4021,7 +4665,7 @@
40214665 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
40224666 * called on the nearest possible occasion:
40234667 *
4024
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
4668
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
40254669 *
40264670 * - in syscall or exception context, at the next outmost
40274671 * preempt_enable(). (this might be as soon as the wake_up()'s
....@@ -4030,7 +4674,7 @@
40304674 * - in IRQ context, return from interrupt-handler to
40314675 * preemptible context
40324676 *
4033
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
4677
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
40344678 * then at the next:
40354679 *
40364680 * - cond_resched() call
....@@ -4044,6 +4688,7 @@
40444688 {
40454689 struct task_struct *prev, *next;
40464690 unsigned long *switch_count;
4691
+ unsigned long prev_state;
40474692 struct rq_flags rf;
40484693 struct rq *rq;
40494694 int cpu;
....@@ -4052,7 +4697,7 @@
40524697 rq = cpu_rq(cpu);
40534698 prev = rq->curr;
40544699
4055
- schedule_debug(prev);
4700
+ schedule_debug(prev, preempt);
40564701
40574702 if (sched_feat(HRTICK))
40584703 hrtick_clear(rq);
....@@ -4063,9 +4708,16 @@
40634708 /*
40644709 * Make sure that signal_pending_state()->signal_pending() below
40654710 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
4066
- * done by the caller to avoid the race with signal_wake_up().
4711
+ * done by the caller to avoid the race with signal_wake_up():
40674712 *
4068
- * The membarrier system call requires a full memory barrier
4713
+ *   __set_current_state(@state)          signal_wake_up()
4714
+ *   schedule()                              set_tsk_thread_flag(p, TIF_SIGPENDING)
4715
+ *                                           wake_up_state(p, state)
4716
+ *   LOCK rq->lock                           LOCK p->pi_lock
4717
+ *   smp_mb__after_spinlock()                smp_mb__after_spinlock()
4718
+ *   if (signal_pending_state())             if (p->state & @state)
4719
+ *
4720
+ * Also, the membarrier system call requires a full memory barrier
40694721 * after coming from user-space, before storing to rq->curr.
40704722 */
40714723 rq_lock(rq, &rf);
....@@ -4076,29 +4728,43 @@
40764728 update_rq_clock(rq);
40774729
40784730 switch_count = &prev->nivcsw;
4079
- if (!preempt && prev->state) {
4080
- if (unlikely(signal_pending_state(prev->state, prev))) {
4731
+
4732
+ /*
4733
+ * We must load prev->state once (task_struct::state is volatile), such
4734
+ * that:
4735
+ *
4736
+ * - we form a control dependency vs deactivate_task() below.
4737
+ * - ptrace_{,un}freeze_traced() can change ->state underneath us.
4738
+ */
4739
+ prev_state = prev->state;
4740
+ if (!preempt && prev_state) {
4741
+ if (signal_pending_state(prev_state, prev)) {
40814742 prev->state = TASK_RUNNING;
40824743 } else {
4744
+ prev->sched_contributes_to_load =
4745
+ (prev_state & TASK_UNINTERRUPTIBLE) &&
4746
+ !(prev_state & TASK_NOLOAD) &&
4747
+ !(prev->flags & PF_FROZEN);
4748
+
4749
+ if (prev->sched_contributes_to_load)
4750
+ rq->nr_uninterruptible++;
4751
+
4752
+ /*
4753
+ *   __schedule()                     ttwu()
4754
+ *     prev_state = prev->state;        if (p->on_rq && ...)
4755
+ *     if (prev_state)                    goto out;
4756
+ *       p->on_rq = 0;                  smp_acquire__after_ctrl_dep();
4757
+ *                                      p->state = TASK_WAKING
4758
+ *
4759
+ * Where __schedule() and ttwu() have matching control dependencies.
4760
+ *
4761
+ * After this, schedule() must not care about p->state any more.
4762
+ */
40834763 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
4084
- prev->on_rq = 0;
40854764
40864765 if (prev->in_iowait) {
40874766 atomic_inc(&rq->nr_iowait);
40884767 delayacct_blkio_start();
4089
- }
4090
-
4091
- /*
4092
- * If a worker went to sleep, notify and ask workqueue
4093
- * whether it wants to wake up a task to maintain
4094
- * concurrency.
4095
- */
4096
- if (prev->flags & PF_WQ_WORKER) {
4097
- struct task_struct *to_wakeup;
4098
-
4099
- to_wakeup = wq_worker_sleeping(prev);
4100
- if (to_wakeup)
4101
- try_to_wake_up_local(to_wakeup, &rf);
41024768 }
41034769 }
41044770 switch_count = &prev->nvcsw;
....@@ -4108,9 +4774,14 @@
41084774 clear_tsk_need_resched(prev);
41094775 clear_preempt_need_resched();
41104776
4777
+ trace_android_rvh_schedule(prev, next, rq);
41114778 if (likely(prev != next)) {
41124779 rq->nr_switches++;
4113
- rq->curr = next;
4780
+ /*
4781
+ * RCU users of rcu_dereference(rq->curr) may not see
4782
+ * changes to task_struct made by pick_next_task().
4783
+ */
4784
+ RCU_INIT_POINTER(rq->curr, next);
41144785 /*
41154786 * The membarrier system call requires each architecture
41164787 * to have a full memory barrier after updating
....@@ -4126,6 +4797,8 @@
41264797 * is a RELEASE barrier),
41274798 */
41284799 ++*switch_count;
4800
+
4801
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev));
41294802
41304803 trace_sched_switch(preempt, prev, next);
41314804
....@@ -4157,14 +4830,48 @@
41574830
41584831 static inline void sched_submit_work(struct task_struct *tsk)
41594832 {
4160
- if (!tsk->state || tsk_is_pi_blocked(tsk))
4833
+ unsigned int task_flags;
4834
+
4835
+ if (!tsk->state)
41614836 return;
4837
+
4838
+ task_flags = tsk->flags;
4839
+ /*
4840
+ * If a worker went to sleep, notify and ask workqueue whether
4841
+ * it wants to wake up a task to maintain concurrency.
4842
+ * As this function is called inside the schedule() context,
4843
+ * we disable preemption to avoid a recursive call to schedule() from
4844
+ * the possible wakeup of a kworker, and because wq_worker_sleeping()
4845
+ * requires it.
4846
+ */
4847
+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
4848
+ preempt_disable();
4849
+ if (task_flags & PF_WQ_WORKER)
4850
+ wq_worker_sleeping(tsk);
4851
+ else
4852
+ io_wq_worker_sleeping(tsk);
4853
+ preempt_enable_no_resched();
4854
+ }
4855
+
4856
+ if (tsk_is_pi_blocked(tsk))
4857
+ return;
4858
+
41624859 /*
41634860 * If we are going to sleep and we have plugged IO queued,
41644861 * make sure to submit it to avoid deadlocks.
41654862 */
41664863 if (blk_needs_flush_plug(tsk))
41674864 blk_schedule_flush_plug(tsk);
4865
+}
4866
+
4867
+static void sched_update_worker(struct task_struct *tsk)
4868
+{
4869
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
4870
+ if (tsk->flags & PF_WQ_WORKER)
4871
+ wq_worker_running(tsk);
4872
+ else
4873
+ io_wq_worker_running(tsk);
4874
+ }
41684875 }
41694876
41704877 asmlinkage __visible void __sched schedule(void)
....@@ -4177,6 +4884,7 @@
41774884 __schedule(false);
41784885 sched_preempt_enable_no_resched();
41794886 } while (need_resched());
4887
+ sched_update_worker(tsk);
41804888 }
41814889 EXPORT_SYMBOL(schedule);
41824890
....@@ -4265,11 +4973,10 @@
42654973 } while (need_resched());
42664974 }
42674975
4268
-#ifdef CONFIG_PREEMPT
4976
+#ifdef CONFIG_PREEMPTION
42694977 /*
4270
- * this is the entry point to schedule() from in-kernel preemption
4271
- * off of preempt_enable. Kernel preemptions off return from interrupt
4272
- * occur there and call schedule directly.
4978
+ * This is the entry point to schedule() from in-kernel preemption
4979
+ * off of preempt_enable.
42734980 */
42744981 asmlinkage __visible void __sched notrace preempt_schedule(void)
42754982 {
....@@ -4337,10 +5044,10 @@
43375044 }
43385045 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
43395046
4340
-#endif /* CONFIG_PREEMPT */
5047
+#endif /* CONFIG_PREEMPTION */
43415048
43425049 /*
4343
- * this is the entry point to schedule() from kernel preemption
5050
+ * This is the entry point to schedule() from kernel preemption
43445051 * off of irq context.
43455052 * Note, that this is called and return with irqs disabled. This will
43465053 * protect us against recursive calling from irq.
....@@ -4368,9 +5075,22 @@
43685075 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
43695076 void *key)
43705077 {
4371
- return try_to_wake_up(curr->private, mode, wake_flags, 1);
5078
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC | WF_ANDROID_VENDOR));
5079
+ return try_to_wake_up(curr->private, mode, wake_flags);
43725080 }
43735081 EXPORT_SYMBOL(default_wake_function);
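
Illustrative note: default_wake_function() is what the generic waitqueue machinery funnels into. A self-contained sketch of a waiter/waker pair that reaches it via wake_up(); my_wq and my_done are hypothetical:

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static int my_done;

static int my_wait_for_done(void)
{
	/* Sleeps interruptibly until my_done becomes non-zero. */
	return wait_event_interruptible(my_wq, READ_ONCE(my_done));
}

static void my_complete(void)
{
	WRITE_ONCE(my_done, 1);
	wake_up(&my_wq);	/* walks the waitqueue -> default_wake_function() -> try_to_wake_up() */
}
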
5082
+
5083
+static void __setscheduler_prio(struct task_struct *p, int prio)
5084
+{
5085
+ if (dl_prio(prio))
5086
+ p->sched_class = &dl_sched_class;
5087
+ else if (rt_prio(prio))
5088
+ p->sched_class = &rt_sched_class;
5089
+ else
5090
+ p->sched_class = &fair_sched_class;
5091
+
5092
+ p->prio = prio;
5093
+}
43745094
43755095 #ifdef CONFIG_RT_MUTEXES
43765096
....@@ -4408,6 +5128,7 @@
44085128 struct rq_flags rf;
44095129 struct rq *rq;
44105130
5131
+ trace_android_rvh_rtmutex_prepare_setprio(p, pi_task);
44115132 /* XXX used to be waiter->prio, not waiter->task->prio */
44125133 prio = __rt_effective_prio(pi_task, p->normal_prio);
44135134
....@@ -4482,31 +5203,29 @@
44825203 if (!dl_prio(p->normal_prio) ||
44835204 (pi_task && dl_prio(pi_task->prio) &&
44845205 dl_entity_preempt(&pi_task->dl, &p->dl))) {
4485
- p->dl.dl_boosted = 1;
5206
+ p->dl.pi_se = pi_task->dl.pi_se;
44865207 queue_flag |= ENQUEUE_REPLENISH;
4487
- } else
4488
- p->dl.dl_boosted = 0;
4489
- p->sched_class = &dl_sched_class;
5208
+ } else {
5209
+ p->dl.pi_se = &p->dl;
5210
+ }
44905211 } else if (rt_prio(prio)) {
44915212 if (dl_prio(oldprio))
4492
- p->dl.dl_boosted = 0;
5213
+ p->dl.pi_se = &p->dl;
44935214 if (oldprio < prio)
44945215 queue_flag |= ENQUEUE_HEAD;
4495
- p->sched_class = &rt_sched_class;
44965216 } else {
44975217 if (dl_prio(oldprio))
4498
- p->dl.dl_boosted = 0;
5218
+ p->dl.pi_se = &p->dl;
44995219 if (rt_prio(oldprio))
45005220 p->rt.timeout = 0;
4501
- p->sched_class = &fair_sched_class;
45025221 }
45035222
4504
- p->prio = prio;
5223
+ __setscheduler_prio(p, prio);
45055224
45065225 if (queued)
45075226 enqueue_task(rq, p, queue_flag);
45085227 if (running)
4509
- set_curr_task(rq, p);
5228
+ set_next_task(rq, p);
45105229
45115230 check_class_changed(rq, p, prev_class, oldprio);
45125231 out_unlock:
....@@ -4526,12 +5245,13 @@
45265245
45275246 void set_user_nice(struct task_struct *p, long nice)
45285247 {
4529
- bool queued, running;
4530
- int old_prio, delta;
5248
+ bool queued, running, allowed = false;
5249
+ int old_prio;
45315250 struct rq_flags rf;
45325251 struct rq *rq;
45335252
4534
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
5253
+ trace_android_rvh_set_user_nice(p, &nice, &allowed);
5254
+ if ((task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) && !allowed)
45355255 return;
45365256 /*
45375257 * We have to be careful, if called from sys_setpriority(),
....@@ -4558,22 +5278,21 @@
45585278 put_prev_task(rq, p);
45595279
45605280 p->static_prio = NICE_TO_PRIO(nice);
4561
- set_load_weight(p, true);
5281
+ set_load_weight(p);
45625282 old_prio = p->prio;
45635283 p->prio = effective_prio(p);
4564
- delta = p->prio - old_prio;
45655284
4566
- if (queued) {
5285
+ if (queued)
45675286 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
4568
- /*
4569
- * If the task increased its priority or is running and
4570
- * lowered its priority, then reschedule its CPU:
4571
- */
4572
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
4573
- resched_curr(rq);
4574
- }
45755287 if (running)
4576
- set_curr_task(rq, p);
5288
+ set_next_task(rq, p);
5289
+
5290
+ /*
5291
+ * If the task increased its priority or is running and
5292
+ * lowered its priority, then reschedule its CPU:
5293
+ */
5294
+ p->sched_class->prio_changed(rq, p, old_prio);
5295
+
45775296 out_unlock:
45785297 task_rq_unlock(rq, p, &rf);
45795298 }
....@@ -4658,7 +5377,7 @@
46585377 return 0;
46595378
46605379 #ifdef CONFIG_SMP
4661
- if (!llist_empty(&rq->wake_list))
5380
+ if (rq->ttwu_pending)
46625381 return 0;
46635382 #endif
46645383
....@@ -4681,6 +5400,7 @@
46815400
46825401 return 1;
46835402 }
5403
+EXPORT_SYMBOL_GPL(available_idle_cpu);
46845404
46855405 /**
46865406 * idle_task - return the idle task for a given CPU.
....@@ -4732,36 +5452,7 @@
47325452 */
47335453 p->rt_priority = attr->sched_priority;
47345454 p->normal_prio = normal_prio(p);
4735
- set_load_weight(p, true);
4736
-}
4737
-
4738
-/* Actually do priority change: must hold pi & rq lock. */
4739
-static void __setscheduler(struct rq *rq, struct task_struct *p,
4740
- const struct sched_attr *attr, bool keep_boost)
4741
-{
4742
- /*
4743
- * If params can't change scheduling class changes aren't allowed
4744
- * either.
4745
- */
4746
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
4747
- return;
4748
-
4749
- __setscheduler_params(p, attr);
4750
-
4751
- /*
4752
- * Keep a potential priority boosting if called from
4753
- * sched_setscheduler().
4754
- */
4755
- p->prio = normal_prio(p);
4756
- if (keep_boost)
4757
- p->prio = rt_effective_prio(p, p->prio);
4758
-
4759
- if (dl_prio(p->prio))
4760
- p->sched_class = &dl_sched_class;
4761
- else if (rt_prio(p->prio))
4762
- p->sched_class = &rt_sched_class;
4763
- else
4764
- p->sched_class = &fair_sched_class;
5455
+ set_load_weight(p);
47655456 }
47665457
47675458 /*
....@@ -4784,15 +5475,14 @@
47845475 const struct sched_attr *attr,
47855476 bool user, bool pi)
47865477 {
4787
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4788
- MAX_RT_PRIO - 1 - attr->sched_priority;
4789
- int retval, oldprio, oldpolicy = -1, queued, running;
4790
- int new_effective_prio, policy = attr->sched_policy;
5478
+ int oldpolicy = -1, policy = attr->sched_policy;
5479
+ int retval, oldprio, newprio, queued, running;
47915480 const struct sched_class *prev_class;
47925481 struct rq_flags rf;
47935482 int reset_on_fork;
47945483 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
47955484 struct rq *rq;
5485
+ bool cpuset_locked = false;
47965486
47975487 /* The pi code expects interrupts enabled */
47985488 BUG_ON(pi && in_interrupt());
....@@ -4860,7 +5550,7 @@
48605550 * Treat SCHED_IDLE as nice 20. Only allow a switch to
48615551 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
48625552 */
4863
- if (idle_policy(p->policy) && !idle_policy(policy)) {
5553
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
48645554 if (!can_nice(p, task_nice(p)))
48655555 return -EPERM;
48665556 }
....@@ -4871,6 +5561,10 @@
48715561
48725562 /* Normal users shall not reset the sched_reset_on_fork flag: */
48735563 if (p->sched_reset_on_fork && !reset_on_fork)
5564
+ return -EPERM;
5565
+
5566
+ /* Can't change util-clamps */
5567
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
48745568 return -EPERM;
48755569 }
48765570
....@@ -4891,6 +5585,15 @@
48915585 }
48925586
48935587 /*
5588
+ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
5589
+ * information.
5590
+ */
5591
+ if (dl_policy(policy) || dl_policy(p->policy)) {
5592
+ cpuset_locked = true;
5593
+ cpuset_lock();
5594
+ }
5595
+
5596
+ /*
48945597 * Make sure no PI-waiters arrive (or leave) while we are
48955598 * changing the priority of the task:
48965599 *
....@@ -4904,8 +5607,8 @@
49045607 * Changing the policy of the stop threads its a very bad idea:
49055608 */
49065609 if (p == rq->stop) {
4907
- task_rq_unlock(rq, p, &rf);
4908
- return -EINVAL;
5610
+ retval = -EINVAL;
5611
+ goto unlock;
49095612 }
49105613
49115614 /*
....@@ -4923,8 +5626,8 @@
49235626 goto change;
49245627
49255628 p->sched_reset_on_fork = reset_on_fork;
4926
- task_rq_unlock(rq, p, &rf);
4927
- return 0;
5629
+ retval = 0;
5630
+ goto unlock;
49285631 }
49295632 change:
49305633
....@@ -4937,8 +5640,8 @@
49375640 if (rt_bandwidth_enabled() && rt_policy(policy) &&
49385641 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
49395642 !task_group_is_autogroup(task_group(p))) {
4940
- task_rq_unlock(rq, p, &rf);
4941
- return -EPERM;
5643
+ retval = -EPERM;
5644
+ goto unlock;
49425645 }
49435646 #endif
49445647 #ifdef CONFIG_SMP
....@@ -4951,10 +5654,10 @@
49515654 * the entire root_domain to become SCHED_DEADLINE. We
49525655 * will also fail if there's no bandwidth available.
49535656 */
4954
- if (!cpumask_subset(span, &p->cpus_allowed) ||
5657
+ if (!cpumask_subset(span, p->cpus_ptr) ||
49555658 rq->rd->dl_bw.bw == 0) {
4956
- task_rq_unlock(rq, p, &rf);
4957
- return -EPERM;
5659
+ retval = -EPERM;
5660
+ goto unlock;
49585661 }
49595662 }
49605663 #endif
....@@ -4964,6 +5667,8 @@
49645667 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
49655668 policy = oldpolicy = -1;
49665669 task_rq_unlock(rq, p, &rf);
5670
+ if (cpuset_locked)
5671
+ cpuset_unlock();
49675672 goto recheck;
49685673 }
49695674
....@@ -4973,13 +5678,14 @@
49735678 * is available.
49745679 */
49755680 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
4976
- task_rq_unlock(rq, p, &rf);
4977
- return -EBUSY;
5681
+ retval = -EBUSY;
5682
+ goto unlock;
49785683 }
49795684
49805685 p->sched_reset_on_fork = reset_on_fork;
49815686 oldprio = p->prio;
49825687
5688
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
49835689 if (pi) {
49845690 /*
49855691 * Take priority boosted tasks into account. If the new
....@@ -4988,8 +5694,8 @@
49885694 * the runqueue. This will be done when the task deboost
49895695 * itself.
49905696 */
4991
- new_effective_prio = rt_effective_prio(p, newprio);
4992
- if (new_effective_prio == oldprio)
5697
+ newprio = rt_effective_prio(p, newprio);
5698
+ if (newprio == oldprio)
49935699 queue_flags &= ~DEQUEUE_MOVE;
49945700 }
49955701
....@@ -5002,7 +5708,11 @@
50025708
50035709 prev_class = p->sched_class;
50045710
5005
- __setscheduler(rq, p, attr, pi);
5711
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
5712
+ __setscheduler_params(p, attr);
5713
+ __setscheduler_prio(p, newprio);
5714
+ trace_android_rvh_setscheduler(p);
5715
+ }
50065716 __setscheduler_uclamp(p, attr);
50075717
50085718 if (queued) {
....@@ -5016,7 +5726,7 @@
50165726 enqueue_task(rq, p, queue_flags);
50175727 }
50185728 if (running)
5019
- set_curr_task(rq, p);
5729
+ set_next_task(rq, p);
50205730
50215731 check_class_changed(rq, p, prev_class, oldprio);
50225732
....@@ -5024,14 +5734,23 @@
50245734 preempt_disable();
50255735 task_rq_unlock(rq, p, &rf);
50265736
5027
- if (pi)
5737
+ if (pi) {
5738
+ if (cpuset_locked)
5739
+ cpuset_unlock();
50285740 rt_mutex_adjust_pi(p);
5741
+ }
50295742
50305743 /* Run balance callbacks after we've adjusted the PI chain: */
50315744 balance_callback(rq);
50325745 preempt_enable();
50335746
50345747 return 0;
5748
+
5749
+unlock:
5750
+ task_rq_unlock(rq, p, &rf);
5751
+ if (cpuset_locked)
5752
+ cpuset_unlock();
5753
+ return retval;
50355754 }
50365755
50375756 static int _sched_setscheduler(struct task_struct *p, int policy,
....@@ -5043,6 +5762,14 @@
50435762 .sched_nice = PRIO_TO_NICE(p->static_prio),
50445763 };
50455764
5765
+ if (IS_ENABLED(CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO) &&
5766
+ ((policy == SCHED_FIFO) || (policy == SCHED_RR))) {
5767
+ attr.sched_priority /= 2;
5768
+ if (!check)
5769
+ attr.sched_priority += MAX_RT_PRIO / 2;
5770
+ if (!attr.sched_priority)
5771
+ attr.sched_priority = 1;
5772
+ }
50465773 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
50475774 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
50485775 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
....@@ -5057,6 +5784,8 @@
50575784 * @p: the task in question.
50585785 * @policy: new policy.
50595786 * @param: structure containing the new RT priority.
5787
+ *
5788
+ * Use sched_set_fifo(), read its comment.
50605789 *
50615790 * Return: 0 on success. An error code otherwise.
50625791 *
....@@ -5079,6 +5808,7 @@
50795808 {
50805809 return __sched_setscheduler(p, attr, false, true);
50815810 }
5811
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
50825812
50835813 /**
50845814 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
....@@ -5099,6 +5829,51 @@
50995829 return _sched_setscheduler(p, policy, param, false);
51005830 }
51015831 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
5832
+
5833
+/*
5834
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
5835
+ * incapable of resource management, which is the one thing an OS really should
5836
+ * be doing.
5837
+ *
5838
+ * This is of course the reason it is limited to privileged users only.
5839
+ *
5840
+ * Worse still; it is fundamentally impossible to compose static priority
5841
+ * workloads. You cannot take two correctly working static prio workloads
5842
+ * and smash them together and still expect them to work.
5843
+ *
5844
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
5845
+ *
5846
+ * MAX_RT_PRIO / 2
5847
+ *
5848
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
5849
+ * know enough information to make a sensible choice.
5850
+ */
5851
+void sched_set_fifo(struct task_struct *p)
5852
+{
5853
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
5854
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5855
+}
5856
+EXPORT_SYMBOL_GPL(sched_set_fifo);
5857
+
5858
+/*
5859
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
5860
+ */
5861
+void sched_set_fifo_low(struct task_struct *p)
5862
+{
5863
+ struct sched_param sp = { .sched_priority = 1 };
5864
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
5865
+}
5866
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
5867
+
5868
+void sched_set_normal(struct task_struct *p, int nice)
5869
+{
5870
+ struct sched_attr attr = {
5871
+ .sched_policy = SCHED_NORMAL,
5872
+ .sched_nice = nice,
5873
+ };
5874
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
5875
+}
5876
+EXPORT_SYMBOL_GPL(sched_set_normal);
51025877
51035878 static int
51045879 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
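The sched_set_fifo(), sched_set_fifo_low() and sched_set_normal() helpers added above exist so that kernel code stops open-coding sched_setscheduler_nocheck() with hand-picked RT priorities. A minimal in-kernel usage sketch, assuming a kthread-based driver; the thread function and all "example_" names are illustrative, not from this file:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int example_worker_fn(void *data)
{
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();		/* sleep until woken with work (illustrative) */
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static struct task_struct *example_start_worker(void)
{
	struct task_struct *tsk;

	tsk = kthread_create(example_worker_fn, NULL, "example-fifo");
	if (IS_ERR(tsk))
		return tsk;

	/*
	 * Take the generic in-kernel FIFO slot (MAX_RT_PRIO / 2) rather than
	 * inventing a special priority; the administrator can still
	 * re-prioritize the thread from userspace.
	 */
	sched_set_fifo(tsk);
	wake_up_process(tsk);
	return tsk;
}

sched_set_fifo_low() is the variant for threads that merely need to sit above SCHED_NORMAL, and sched_set_normal() undoes either of them.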
....@@ -5130,9 +5905,6 @@
51305905 u32 size;
51315906 int ret;
51325907
5133
- if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
5134
- return -EFAULT;
5135
-
51365908 /* Zero the full structure, so that a short copy will be nice: */
51375909 memset(attr, 0, sizeof(*attr));
51385910
....@@ -5140,44 +5912,18 @@
51405912 if (ret)
51415913 return ret;
51425914
5143
- /* Bail out on silly large: */
5144
- if (size > PAGE_SIZE)
5145
- goto err_size;
5146
-
51475915 /* ABI compatibility quirk: */
51485916 if (!size)
51495917 size = SCHED_ATTR_SIZE_VER0;
5150
-
5151
- if (size < SCHED_ATTR_SIZE_VER0)
5918
+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
51525919 goto err_size;
51535920
5154
- /*
5155
- * If we're handed a bigger struct than we know of,
5156
- * ensure all the unknown bits are 0 - i.e. new
5157
- * user-space does not rely on any kernel feature
5158
- * extensions we dont know about yet.
5159
- */
5160
- if (size > sizeof(*attr)) {
5161
- unsigned char __user *addr;
5162
- unsigned char __user *end;
5163
- unsigned char val;
5164
-
5165
- addr = (void __user *)uattr + sizeof(*attr);
5166
- end = (void __user *)uattr + size;
5167
-
5168
- for (; addr < end; addr++) {
5169
- ret = get_user(val, addr);
5170
- if (ret)
5171
- return ret;
5172
- if (val)
5173
- goto err_size;
5174
- }
5175
- size = sizeof(*attr);
5921
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
5922
+ if (ret) {
5923
+ if (ret == -E2BIG)
5924
+ goto err_size;
5925
+ return ret;
51765926 }
5177
-
5178
- ret = copy_from_user(attr, uattr, size);
5179
- if (ret)
5180
- return -EFAULT;
51815927
51825928 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
51835929 size < SCHED_ATTR_SIZE_VER1)
....@@ -5194,6 +5940,16 @@
51945940 err_size:
51955941 put_user(sizeof(*attr), &uattr->size);
51965942 return -E2BIG;
5943
+}
5944
+
5945
+static void get_params(struct task_struct *p, struct sched_attr *attr)
5946
+{
5947
+ if (task_has_dl_policy(p))
5948
+ __getparam_dl(p, attr);
5949
+ else if (task_has_rt_policy(p))
5950
+ attr->sched_priority = p->rt_priority;
5951
+ else
5952
+ attr->sched_nice = task_nice(p);
51975953 }
51985954
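The sched_copy_attr() rework above replaces the hand-rolled access_ok()/get_user() loop with copy_struct_from_user(), the generic helper for user-extensible structs. Its contract, modelled in plain C below (a sketch of the semantics only; the kernel helper naturally uses copy_from_user() and check_zeroed_user() instead of direct memory access):

#include <errno.h>
#include <stddef.h>
#include <string.h>

/*
 * Model of copy_struct_from_user(dst, ksize, src, usize):
 *  - newer userspace (usize > ksize): the unknown trailing bytes must all be
 *    zero, otherwise the call fails with -E2BIG;
 *  - older userspace (usize < ksize): the kernel-side tail is zero-filled;
 *  - finally min(ksize, usize) bytes are copied.
 */
static int copy_struct_model(void *dst, size_t ksize, const void *src, size_t usize)
{
	size_t size = usize < ksize ? usize : ksize;
	const unsigned char *rest = (const unsigned char *)src + ksize;
	size_t i;

	if (usize > ksize) {
		for (i = 0; i < usize - ksize; i++) {
			if (rest[i])
				return -E2BIG;
		}
	}
	if (usize < ksize)
		memset((unsigned char *)dst + usize, 0, ksize - usize);
	memcpy(dst, src, size);
	return 0;
}

In sched_copy_attr() the -E2BIG case is routed to err_size, so userspace additionally learns the kernel's sched_attr size through the put_user() there.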
51995955 /**
....@@ -5257,6 +6013,8 @@
52576013 rcu_read_unlock();
52586014
52596015 if (likely(p)) {
6016
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
6017
+ get_params(p, &attr);
52606018 retval = sched_setattr(p, &attr);
52616019 put_task_struct(p);
52626020 }
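With get_params() filling in the current scheduling parameters whenever SCHED_FLAG_KEEP_PARAMS is passed, userspace can change a single attribute, typically a utilization clamp, without respecifying policy, priority or nice value. A minimal userspace sketch using the raw syscall; it assumes the uapi headers providing struct sched_attr and the SCHED_FLAG_* bits (linux/sched/types.h and linux/sched.h) are installed:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sched.h>		/* SCHED_FLAG_KEEP_ALL, SCHED_FLAG_UTIL_CLAMP_MIN */
#include <linux/sched/types.h>		/* struct sched_attr */

/* Raise only the uclamp minimum of the calling task; keep policy and params. */
static int set_uclamp_min(unsigned int util_min)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_flags = SCHED_FLAG_KEEP_ALL | SCHED_FLAG_UTIL_CLAMP_MIN;
	attr.sched_util_min = util_min;

	return syscall(SYS_sched_setattr, 0 /* current task */, &attr, 0 /* flags */);
}

int main(void)
{
	if (set_uclamp_min(128))
		perror("sched_setattr");
	return 0;
}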
....@@ -5350,7 +6108,7 @@
53506108 {
53516109 unsigned int ksize = sizeof(*kattr);
53526110
5353
- if (!access_ok(VERIFY_WRITE, uattr, usize))
6111
+ if (!access_ok(uattr, usize))
53546112 return -EFAULT;
53556113
53566114 /*
....@@ -5378,7 +6136,7 @@
53786136 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
53796137 * @pid: the pid in question.
53806138 * @uattr: structure containing the extended parameters.
5381
- * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
6139
+ * @usize: sizeof(attr) for fwd/bwd comp.
53826140 * @flags: for future extension.
53836141 */
53846142 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
....@@ -5405,14 +6163,15 @@
54056163 kattr.sched_policy = p->policy;
54066164 if (p->sched_reset_on_fork)
54076165 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5408
- if (task_has_dl_policy(p))
5409
- __getparam_dl(p, &kattr);
5410
- else if (task_has_rt_policy(p))
5411
- kattr.sched_priority = p->rt_priority;
5412
- else
5413
- kattr.sched_nice = task_nice(p);
6166
+ get_params(p, &kattr);
6167
+ kattr.sched_flags &= SCHED_FLAG_ALL;
54146168
54156169 #ifdef CONFIG_UCLAMP_TASK
6170
+ /*
6171
+ * This could race with another potential updater, but this is fine
6172
+ * because it'll correctly read the old or the new value. We don't need
6173
+ * to guarantee who wins the race as long as it doesn't return garbage.
6174
+ */
54166175 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
54176176 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
54186177 #endif
....@@ -5431,6 +6190,7 @@
54316190 cpumask_var_t cpus_allowed, new_mask;
54326191 struct task_struct *p;
54336192 int retval;
6193
+ int skip = 0;
54346194
54356195 rcu_read_lock();
54366196
....@@ -5466,6 +6226,9 @@
54666226 rcu_read_unlock();
54676227 }
54686228
6229
+ trace_android_vh_sched_setaffinity_early(p, in_mask, &skip);
6230
+ if (skip)
6231
+ goto out_free_new_mask;
54696232 retval = security_task_setscheduler(p);
54706233 if (retval)
54716234 goto out_free_new_mask;
....@@ -5506,6 +6269,9 @@
55066269 goto again;
55076270 }
55086271 }
6272
+
6273
+ trace_android_rvh_sched_setaffinity(p, in_mask, &retval);
6274
+
55096275 out_free_new_mask:
55106276 free_cpumask_var(new_mask);
55116277 out_free_cpus_allowed:
....@@ -5514,7 +6280,6 @@
55146280 put_task_struct(p);
55156281 return retval;
55166282 }
5517
-EXPORT_SYMBOL_GPL(sched_setaffinity);
55186283
55196284 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
55206285 struct cpumask *new_mask)
....@@ -5569,7 +6334,7 @@
55696334 goto out_unlock;
55706335
55716336 raw_spin_lock_irqsave(&p->pi_lock, flags);
5572
- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
6337
+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
55736338 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
55746339
55756340 out_unlock:
....@@ -5598,14 +6363,14 @@
55986363 if (len & (sizeof(unsigned long)-1))
55996364 return -EINVAL;
56006365
5601
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
6366
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
56026367 return -ENOMEM;
56036368
56046369 ret = sched_getaffinity(pid, mask);
56056370 if (ret == 0) {
56066371 unsigned int retlen = min(len, cpumask_size());
56076372
5608
- if (copy_to_user(user_mask_ptr, mask, retlen))
6373
+ if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
56096374 ret = -EFAULT;
56106375 else
56116376 ret = retlen;
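Two details interact in the hunk above: the temporary mask is now zero-allocated and copied via cpumask_bits(), so the retlen bytes handed back to userspace cannot contain uninitialized data past nr_cpu_ids, and the raw syscall (unlike the glibc wrapper, which returns 0) reports that byte count to the caller. A small userspace sketch of the raw call:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	cpu_set_t set;
	long ret;

	CPU_ZERO(&set);
	/* Raw syscall: on success it returns the number of bytes written (retlen). */
	ret = syscall(SYS_sched_getaffinity, 0, sizeof(set), &set);
	if (ret < 0) {
		perror("sched_getaffinity");
		return 1;
	}
	printf("kernel wrote %ld bytes; %d CPUs allowed\n", ret, CPU_COUNT(&set));
	return 0;
}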
....@@ -5633,6 +6398,8 @@
56336398 schedstat_inc(rq->yld_count);
56346399 current->sched_class->yield_task(rq);
56356400
6401
+ trace_android_rvh_do_sched_yield(rq);
6402
+
56366403 preempt_disable();
56376404 rq_unlock_irq(rq, &rf);
56386405 sched_preempt_enable_no_resched();
....@@ -5646,7 +6413,7 @@
56466413 return 0;
56476414 }
56486415
5649
-#ifndef CONFIG_PREEMPT
6416
+#ifndef CONFIG_PREEMPTION
56506417 int __sched _cond_resched(void)
56516418 {
56526419 if (should_resched(0)) {
....@@ -5663,7 +6430,7 @@
56636430 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
56646431 * call schedule, and on return reacquire the lock.
56656432 *
5666
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6433
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
56676434 * operations here to prevent schedule() from being called twice (once via
56686435 * spin_unlock(), once by hand).
56696436 */
....@@ -5767,7 +6534,7 @@
57676534 if (task_running(p_rq, p) || p->state)
57686535 goto out_unlock;
57696536
5770
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
6537
+ yielded = curr->sched_class->yield_to_task(rq, p);
57716538 if (yielded) {
57726539 schedstat_inc(rq->yld_count);
57736540 /*
....@@ -5933,7 +6700,7 @@
59336700 * an error code.
59346701 */
59356702 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5936
- struct timespec __user *, interval)
6703
+ struct __kernel_timespec __user *, interval)
59376704 {
59386705 struct timespec64 t;
59396706 int retval = sched_rr_get_interval(pid, &t);
....@@ -5944,16 +6711,15 @@
59446711 return retval;
59456712 }
59466713
5947
-#ifdef CONFIG_COMPAT
5948
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
5949
- compat_pid_t, pid,
5950
- struct compat_timespec __user *, interval)
6714
+#ifdef CONFIG_COMPAT_32BIT_TIME
6715
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
6716
+ struct old_timespec32 __user *, interval)
59516717 {
59526718 struct timespec64 t;
59536719 int retval = sched_rr_get_interval(pid, &t);
59546720
59556721 if (retval == 0)
5956
- retval = compat_put_timespec64(&t, interval);
6722
+ retval = put_old_timespec32(&t, interval);
59576723 return retval;
59586724 }
59596725 #endif
....@@ -5966,10 +6732,10 @@
59666732 if (!try_get_task_stack(p))
59676733 return;
59686734
5969
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
6735
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
59706736
59716737 if (p->state == TASK_RUNNING)
5972
- printk(KERN_CONT " running task ");
6738
+ pr_cont(" running task ");
59736739 #ifdef CONFIG_DEBUG_STACK_USAGE
59746740 free = stack_not_used(p);
59756741 #endif
....@@ -5978,12 +6744,13 @@
59786744 if (pid_alive(p))
59796745 ppid = task_pid_nr(rcu_dereference(p->real_parent));
59806746 rcu_read_unlock();
5981
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5982
- task_pid_nr(p), ppid,
6747
+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
6748
+ free, task_pid_nr(p), ppid,
59836749 (unsigned long)task_thread_info(p)->flags);
59846750
59856751 print_worker_info(KERN_INFO, p);
5986
- show_stack(p, NULL);
6752
+ trace_android_vh_sched_show_task(p);
6753
+ show_stack(p, NULL, KERN_INFO);
59876754 put_task_stack(p);
59886755 }
59896756 EXPORT_SYMBOL_GPL(sched_show_task);
....@@ -6014,13 +6781,6 @@
60146781 {
60156782 struct task_struct *g, *p;
60166783
6017
-#if BITS_PER_LONG == 32
6018
- printk(KERN_INFO
6019
- " task PC stack pid father\n");
6020
-#else
6021
- printk(KERN_INFO
6022
- " task PC stack pid father\n");
6023
-#endif
60246784 rcu_read_lock();
60256785 for_each_process_thread(g, p) {
60266786 /*
....@@ -6056,7 +6816,7 @@
60566816 * NOTE: this function does not set the idle thread's NEED_RESCHED
60576817 * flag, to make booting more robust.
60586818 */
6059
-void init_idle(struct task_struct *idle, int cpu)
6819
+void __init init_idle(struct task_struct *idle, int cpu)
60606820 {
60616821 struct rq *rq = cpu_rq(cpu);
60626822 unsigned long flags;
....@@ -6069,9 +6829,6 @@
60696829 idle->state = TASK_RUNNING;
60706830 idle->se.exec_start = sched_clock();
60716831 idle->flags |= PF_IDLE;
6072
-
6073
- scs_task_reset(idle);
6074
- kasan_unpoison_task_stack(idle);
60756832
60766833 #ifdef CONFIG_SMP
60776834 /*
....@@ -6096,7 +6853,8 @@
60966853 __set_task_cpu(idle, cpu);
60976854 rcu_read_unlock();
60986855
6099
- rq->curr = rq->idle = idle;
6856
+ rq->idle = idle;
6857
+ rcu_assign_pointer(rq->curr, idle);
61006858 idle->on_rq = TASK_ON_RQ_QUEUED;
61016859 #ifdef CONFIG_SMP
61026860 idle->on_cpu = 1;
....@@ -6133,8 +6891,7 @@
61336891 return ret;
61346892 }
61356893
6136
-int task_can_attach(struct task_struct *p,
6137
- const struct cpumask *cs_cpus_allowed)
6894
+int task_can_attach(struct task_struct *p)
61386895 {
61396896 int ret = 0;
61406897
....@@ -6145,18 +6902,11 @@
61456902 * allowed nodes is unnecessary. Thus, cpusets are not
61466903 * applicable for such threads. This prevents checking for
61476904 * success of set_cpus_allowed_ptr() on all attached tasks
6148
- * before cpus_allowed may be changed.
6905
+ * before cpus_mask may be changed.
61496906 */
6150
- if (p->flags & PF_NO_SETAFFINITY) {
6907
+ if (p->flags & PF_NO_SETAFFINITY)
61516908 ret = -EINVAL;
6152
- goto out;
6153
- }
61546909
6155
- if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
6156
- cs_cpus_allowed))
6157
- ret = dl_task_can_attach(p, cs_cpus_allowed);
6158
-
6159
-out:
61606910 return ret;
61616911 }
61626912
....@@ -6172,7 +6922,7 @@
61726922 if (curr_cpu == target_cpu)
61736923 return 0;
61746924
6175
- if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
6925
+ if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
61766926 return -EINVAL;
61776927
61786928 /* TODO: This is not properly updating schedstats */
....@@ -6205,7 +6955,7 @@
62056955 if (queued)
62066956 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
62076957 if (running)
6208
- set_curr_task(rq, p);
6958
+ set_next_task(rq, p);
62096959 task_rq_unlock(rq, p, &rf);
62106960 }
62116961 #endif /* CONFIG_NUMA_BALANCING */
....@@ -6246,21 +6996,22 @@
62466996 atomic_long_add(delta, &calc_load_tasks);
62476997 }
62486998
6249
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
6999
+static struct task_struct *__pick_migrate_task(struct rq *rq)
62507000 {
7001
+ const struct sched_class *class;
7002
+ struct task_struct *next;
7003
+
7004
+ for_each_class(class) {
7005
+ next = class->pick_next_task(rq);
7006
+ if (next) {
7007
+ next->sched_class->put_prev_task(rq, next);
7008
+ return next;
7009
+ }
7010
+ }
7011
+
7012
+ /* The idle class should always have a runnable task */
7013
+ BUG();
62517014 }
6252
-
6253
-static const struct sched_class fake_sched_class = {
6254
- .put_prev_task = put_prev_task_fake,
6255
-};
6256
-
6257
-static struct task_struct fake_task = {
6258
- /*
6259
- * Avoid pull_{rt,dl}_task()
6260
- */
6261
- .prio = MAX_PRIO + 1,
6262
- .sched_class = &fake_sched_class,
6263
-};
62647015
62657016 /*
62667017 * Migrate all tasks from the rq, sleeping tasks will be migrated by
....@@ -6269,11 +7020,14 @@
62697020 * Called with rq->lock held even though we're in stop_machine() and
62707021 * there's no concurrency possible, we hold the required locks anyway
62717022 * because of lock validation efforts.
7023
+ *
7024
+ * force: if false, the function will skip CPU-pinned kthreads.
62727025 */
6273
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
7026
+static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, bool force)
62747027 {
62757028 struct rq *rq = dead_rq;
6276
- struct task_struct *next, *stop = rq->stop;
7029
+ struct task_struct *next, *tmp, *stop = rq->stop;
7030
+ LIST_HEAD(percpu_kthreads);
62777031 struct rq_flags orf = *rf;
62787032 int dest_cpu;
62797033
....@@ -6295,6 +7049,11 @@
62957049 */
62967050 update_rq_clock(rq);
62977051
7052
+#ifdef CONFIG_SCHED_DEBUG
7053
+ /* note the clock update in orf */
7054
+ orf.clock_update_flags |= RQCF_UPDATED;
7055
+#endif
7056
+
62987057 for (;;) {
62997058 /*
63007059 * There's this thread running, bail when that's the only
....@@ -6303,15 +7062,24 @@
63037062 if (rq->nr_running == 1)
63047063 break;
63057064
6306
- /*
6307
- * pick_next_task() assumes pinned rq->lock:
6308
- */
6309
- next = pick_next_task(rq, &fake_task, rf);
6310
- BUG_ON(!next);
6311
- put_prev_task(rq, next);
7065
+ next = __pick_migrate_task(rq);
63127066
63137067 /*
6314
- * Rules for changing task_struct::cpus_allowed are holding
7068
+ * Argh ... no iterator for tasks, we need to remove the
7069
+ * kthread from the run-queue to continue.
7070
+ */
7071
+ if (!force && is_per_cpu_kthread(next)) {
7072
+ INIT_LIST_HEAD(&next->percpu_kthread_node);
7073
+ list_add(&next->percpu_kthread_node, &percpu_kthreads);
7074
+
7075
+ /* DEQUEUE_SAVE not used due to move_entity in rt */
7076
+ deactivate_task(rq, next,
7077
+ DEQUEUE_NOCLOCK);
7078
+ continue;
7079
+ }
7080
+
7081
+ /*
7082
+ * Rules for changing task_struct::cpus_mask are holding
63157083 * both pi_lock and rq->lock, such that holding either
63167084 * stabilizes the mask.
63177085 *
....@@ -6328,7 +7096,14 @@
63287096 * changed the task, WARN if weird stuff happened, because in
63297097 * that case the above rq->lock drop is a fail too.
63307098 */
6331
- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
7099
+ if (task_rq(next) != rq || !task_on_rq_queued(next)) {
7100
+ /*
7101
+ * In the !force case, there is a hole between
7102
+ * rq_unlock() and rq_relock(), where another CPU might
7103
+ * not observe an up-to-date cpu_active_mask and try to
7104
+ * move tasks around.
7105
+ */
7106
+ WARN_ON(force);
63327107 raw_spin_unlock(&next->pi_lock);
63337108 continue;
63347109 }
....@@ -6345,7 +7120,49 @@
63457120 raw_spin_unlock(&next->pi_lock);
63467121 }
63477122
7123
+ list_for_each_entry_safe(next, tmp, &percpu_kthreads,
7124
+ percpu_kthread_node) {
7125
+
7126
+ /* ENQUEUE_RESTORE not used due to move_entity in rt */
7127
+ activate_task(rq, next, ENQUEUE_NOCLOCK);
7128
+ list_del(&next->percpu_kthread_node);
7129
+ }
7130
+
63487131 rq->stop = stop;
7132
+}
7133
+
7134
+static int drain_rq_cpu_stop(void *data)
7135
+{
7136
+ struct rq *rq = this_rq();
7137
+ struct rq_flags rf;
7138
+
7139
+ rq_lock_irqsave(rq, &rf);
7140
+ migrate_tasks(rq, &rf, false);
7141
+ rq_unlock_irqrestore(rq, &rf);
7142
+
7143
+ return 0;
7144
+}
7145
+
7146
+int sched_cpu_drain_rq(unsigned int cpu)
7147
+{
7148
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7149
+ struct cpu_stop_done *rq_drain_done = &(cpu_rq(cpu)->drain_done);
7150
+
7151
+ if (idle_cpu(cpu)) {
7152
+ rq_drain->done = NULL;
7153
+ return 0;
7154
+ }
7155
+
7156
+ return stop_one_cpu_async(cpu, drain_rq_cpu_stop, NULL, rq_drain,
7157
+ rq_drain_done);
7158
+}
7159
+
7160
+void sched_cpu_drain_rq_wait(unsigned int cpu)
7161
+{
7162
+ struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
7163
+
7164
+ if (rq_drain->done)
7165
+ cpu_stop_work_wait(rq_drain);
63497166 }
63507167 #endif /* CONFIG_HOTPLUG_CPU */
63517168
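sched_cpu_drain_rq() above only queues drain_rq_cpu_stop() on the target CPU through stop_one_cpu_async() and returns; sched_cpu_drain_rq_wait() later blocks until that stop work has run (idle CPUs are skipped outright, which is why the wait path checks rq_drain->done). A hypothetical caller in a CPU pause path would drain a set of CPUs in parallel and then wait for all of them; this caller is illustrative and not part of this tree:

#include <linux/cpumask.h>

/* Hypothetical pause-path caller: drain several runqueues in parallel. */
static int example_drain_cpus(const struct cpumask *cpus)
{
	unsigned int cpu;
	int ret;

	for_each_cpu(cpu, cpus) {
		ret = sched_cpu_drain_rq(cpu);	/* async: queues the stopper work */
		if (ret)
			return ret;
	}

	for_each_cpu(cpu, cpus)
		sched_cpu_drain_rq_wait(cpu);	/* wait for each drain to finish */

	return 0;
}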
....@@ -6417,8 +7234,10 @@
64177234 static int cpuset_cpu_inactive(unsigned int cpu)
64187235 {
64197236 if (!cpuhp_tasks_frozen) {
6420
- if (dl_cpu_busy(cpu))
6421
- return -EBUSY;
7237
+ int ret = dl_bw_check_overflow(cpu);
7238
+
7239
+ if (ret)
7240
+ return ret;
64227241 cpuset_update_active_cpus();
64237242 } else {
64247243 num_cpus_frozen++;
....@@ -6467,19 +7286,27 @@
64677286 return 0;
64687287 }
64697288
6470
-int sched_cpu_deactivate(unsigned int cpu)
7289
+int sched_cpus_activate(struct cpumask *cpus)
7290
+{
7291
+ unsigned int cpu;
7292
+
7293
+ for_each_cpu(cpu, cpus) {
7294
+ if (sched_cpu_activate(cpu)) {
7295
+ for_each_cpu_and(cpu, cpus, cpu_active_mask)
7296
+ sched_cpu_deactivate(cpu);
7297
+
7298
+ return -EBUSY;
7299
+ }
7300
+ }
7301
+
7302
+ return 0;
7303
+}
7304
+
7305
+int _sched_cpu_deactivate(unsigned int cpu)
64717306 {
64727307 int ret;
64737308
64747309 set_cpu_active(cpu, false);
6475
- /*
6476
- * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
6477
- * users of this state to go away such that all new such users will
6478
- * observe it.
6479
- *
6480
- * Do sync before park smpboot threads to take care the rcu boost case.
6481
- */
6482
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
64837310
64847311 #ifdef CONFIG_SCHED_SMT
64857312 /*
....@@ -6498,6 +7325,46 @@
64987325 return ret;
64997326 }
65007327 sched_domains_numa_masks_clear(cpu);
7328
+
7329
+ update_max_interval();
7330
+
7331
+ return 0;
7332
+}
7333
+
7334
+int sched_cpu_deactivate(unsigned int cpu)
7335
+{
7336
+ int ret = _sched_cpu_deactivate(cpu);
7337
+
7338
+ if (ret)
7339
+ return ret;
7340
+
7341
+ /*
7342
+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
7343
+ * users of this state to go away such that all new such users will
7344
+ * observe it.
7345
+ *
7346
+ * Do sync before park smpboot threads to take care the rcu boost case.
7347
+ */
7348
+ synchronize_rcu();
7349
+
7350
+ return 0;
7351
+}
7352
+
7353
+int sched_cpus_deactivate_nosync(struct cpumask *cpus)
7354
+{
7355
+ unsigned int cpu;
7356
+
7357
+ for_each_cpu(cpu, cpus) {
7358
+ if (_sched_cpu_deactivate(cpu)) {
7359
+ for_each_cpu(cpu, cpus) {
7360
+ if (!cpu_active(cpu))
7361
+ sched_cpu_activate(cpu);
7362
+ }
7363
+
7364
+ return -EBUSY;
7365
+ }
7366
+ }
7367
+
65017368 return 0;
65027369 }
65037370
....@@ -6506,13 +7373,13 @@
65067373 struct rq *rq = cpu_rq(cpu);
65077374
65087375 rq->calc_load_update = calc_load_update;
6509
- update_max_interval();
65107376 }
65117377
65127378 int sched_cpu_starting(unsigned int cpu)
65137379 {
65147380 sched_rq_cpu_starting(cpu);
65157381 sched_tick_start(cpu);
7382
+ trace_android_rvh_sched_cpu_starting(cpu);
65167383 return 0;
65177384 }
65187385
....@@ -6523,7 +7390,6 @@
65237390 struct rq_flags rf;
65247391
65257392 /* Handle pending wakeups and then migrate everything off */
6526
- sched_ttwu_pending();
65277393 sched_tick_stop(cpu);
65287394
65297395 rq_lock_irqsave(rq, &rf);
....@@ -6531,12 +7397,13 @@
65317397 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
65327398 set_rq_offline(rq);
65337399 }
6534
- migrate_tasks(rq, &rf);
7400
+ migrate_tasks(rq, &rf, true);
65357401 BUG_ON(rq->nr_running != 1);
65367402 rq_unlock_irqrestore(rq, &rf);
65377403
7404
+ trace_android_rvh_sched_cpu_dying(cpu);
7405
+
65387406 calc_load_migrate(rq);
6539
- update_max_interval();
65407407 nohz_balance_exit_idle(rq);
65417408 hrtick_clear(rq);
65427409 return 0;
....@@ -6550,18 +7417,16 @@
65507417 /*
65517418 * There's no userspace yet to cause hotplug operations; hence all the
65527419 * CPU masks are stable and all blatant races in the below code cannot
6553
- * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
6554
- * but there won't be any contention on it.
7420
+ * happen.
65557421 */
6556
- cpus_read_lock();
65577422 mutex_lock(&sched_domains_mutex);
65587423 sched_init_domains(cpu_active_mask);
65597424 mutex_unlock(&sched_domains_mutex);
6560
- cpus_read_unlock();
65617425
65627426 /* Move init over to a non-isolated CPU */
65637427 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
65647428 BUG();
7429
+
65657430 sched_init_granularity();
65667431
65677432 init_sched_rt_class();
....@@ -6572,7 +7437,7 @@
65727437
65737438 static int __init migration_init(void)
65747439 {
6575
- sched_rq_cpu_starting(smp_processor_id());
7440
+ sched_cpu_starting(smp_processor_id());
65767441 return 0;
65777442 }
65787443 early_initcall(migration_init);
....@@ -6597,7 +7462,9 @@
65977462 * Every task in system belongs to this group at bootup.
65987463 */
65997464 struct task_group root_task_group;
7465
+EXPORT_SYMBOL_GPL(root_task_group);
66007466 LIST_HEAD(task_groups);
7467
+EXPORT_SYMBOL_GPL(task_groups);
66017468
66027469 /* Cacheline aligned slab cache for task_group */
66037470 static struct kmem_cache *task_group_cache __read_mostly;
....@@ -6608,19 +7475,27 @@
66087475
66097476 void __init sched_init(void)
66107477 {
6611
- int i, j;
6612
- unsigned long alloc_size = 0, ptr;
7478
+ unsigned long ptr = 0;
7479
+ int i;
7480
+
7481
+ /* Make sure the linker didn't screw up */
7482
+ BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
7483
+ &fair_sched_class + 1 != &rt_sched_class ||
7484
+ &rt_sched_class + 1 != &dl_sched_class);
7485
+#ifdef CONFIG_SMP
7486
+ BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
7487
+#endif
66137488
66147489 wait_bit_init();
66157490
66167491 #ifdef CONFIG_FAIR_GROUP_SCHED
6617
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7492
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
66187493 #endif
66197494 #ifdef CONFIG_RT_GROUP_SCHED
6620
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7495
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
66217496 #endif
6622
- if (alloc_size) {
6623
- ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7497
+ if (ptr) {
7498
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
66247499
66257500 #ifdef CONFIG_FAIR_GROUP_SCHED
66267501 root_task_group.se = (struct sched_entity **)ptr;
....@@ -6629,6 +7504,8 @@
66297504 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
66307505 ptr += nr_cpu_ids * sizeof(void **);
66317506
7507
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
7508
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
66327509 #endif /* CONFIG_FAIR_GROUP_SCHED */
66337510 #ifdef CONFIG_RT_GROUP_SCHED
66347511 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
....@@ -6681,7 +7558,6 @@
66817558 init_rt_rq(&rq->rt);
66827559 init_dl_rq(&rq->dl);
66837560 #ifdef CONFIG_FAIR_GROUP_SCHED
6684
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
66857561 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
66867562 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
66877563 /*
....@@ -6703,7 +7579,6 @@
67037579 * We achieve this by letting root_task_group's tasks sit
67047580 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
67057581 */
6706
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
67077582 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
67087583 #endif /* CONFIG_FAIR_GROUP_SCHED */
67097584
....@@ -6711,10 +7586,6 @@
67117586 #ifdef CONFIG_RT_GROUP_SCHED
67127587 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
67137588 #endif
6714
-
6715
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6716
- rq->cpu_load[j] = 0;
6717
-
67187589 #ifdef CONFIG_SMP
67197590 rq->sd = NULL;
67207591 rq->rd = NULL;
....@@ -6733,16 +7604,17 @@
67337604
67347605 rq_attach_root(rq, &def_root_domain);
67357606 #ifdef CONFIG_NO_HZ_COMMON
6736
- rq->last_load_update_tick = jiffies;
67377607 rq->last_blocked_load_update_tick = jiffies;
67387608 atomic_set(&rq->nohz_flags, 0);
7609
+
7610
+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
67397611 #endif
67407612 #endif /* CONFIG_SMP */
67417613 hrtick_rq_init(rq);
67427614 atomic_set(&rq->nr_iowait, 0);
67437615 }
67447616
6745
- set_load_weight(&init_task, false);
7617
+ set_load_weight(&init_task);
67467618
67477619 /*
67487620 * The boot idle thread does lazy MMU switching as well:
....@@ -6811,7 +7683,7 @@
68117683 rcu_sleep_check();
68127684
68137685 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6814
- !is_idle_task(current)) ||
7686
+ !is_idle_task(current) && !current->non_block_count) ||
68157687 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
68167688 oops_in_progress)
68177689 return;
....@@ -6827,8 +7699,8 @@
68277699 "BUG: sleeping function called from invalid context at %s:%d\n",
68287700 file, line);
68297701 printk(KERN_ERR
6830
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6831
- in_atomic(), irqs_disabled(),
7702
+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
7703
+ in_atomic(), irqs_disabled(), current->non_block_count,
68327704 current->pid, current->comm);
68337705
68347706 if (task_stack_end_corrupted(current))
....@@ -6840,13 +7712,43 @@
68407712 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
68417713 && !preempt_count_equals(preempt_offset)) {
68427714 pr_err("Preemption disabled at:");
6843
- print_ip_sym(preempt_disable_ip);
6844
- pr_cont("\n");
7715
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
68457716 }
7717
+
7718
+ trace_android_rvh_schedule_bug(NULL);
7719
+
68467720 dump_stack();
68477721 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
68487722 }
68497723 EXPORT_SYMBOL(___might_sleep);
7724
+
7725
+void __cant_sleep(const char *file, int line, int preempt_offset)
7726
+{
7727
+ static unsigned long prev_jiffy;
7728
+
7729
+ if (irqs_disabled())
7730
+ return;
7731
+
7732
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
7733
+ return;
7734
+
7735
+ if (preempt_count() > preempt_offset)
7736
+ return;
7737
+
7738
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7739
+ return;
7740
+ prev_jiffy = jiffies;
7741
+
7742
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
7743
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7744
+ in_atomic(), irqs_disabled(),
7745
+ current->pid, current->comm);
7746
+
7747
+ debug_show_held_locks(current);
7748
+ dump_stack();
7749
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7750
+}
7751
+EXPORT_SYMBOL_GPL(__cant_sleep);
68507752 #endif
68517753
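__cant_sleep() added above is the inverse of ___might_sleep(): rather than flagging a sleep attempted in atomic context, it flags code that is supposed to run in atomic (non-preemptible) context but finds itself preemptible, printing the "assuming atomic context" splat. Callers normally reach it through the cant_sleep() wrapper from the debug-atomic-sleep machinery; a minimal sketch of such an annotation, with an illustrative per-CPU helper:

#include <linux/kernel.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(u64, example_counter);

/* Must be called with preemption (or interrupts) disabled. */
static void example_update_this_cpu(u64 delta)
{
	cant_sleep();	/* complains if this CPU could be preempted right here */
	__this_cpu_add(example_counter, delta);
}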
68527754 #ifdef CONFIG_MAGIC_SYSRQ
....@@ -6915,7 +7817,7 @@
69157817
69167818 #ifdef CONFIG_IA64
69177819 /**
6918
- * set_curr_task - set the current task for a given CPU.
7820
+ * ia64_set_curr_task - set the current task for a given CPU.
69197821 * @cpu: the processor in question.
69207822 * @p: the task pointer to set.
69217823 *
....@@ -7081,8 +7983,15 @@
70817983
70827984 if (queued)
70837985 enqueue_task(rq, tsk, queue_flags);
7084
- if (running)
7085
- set_curr_task(rq, tsk);
7986
+ if (running) {
7987
+ set_next_task(rq, tsk);
7988
+ /*
7989
+ * After changing group, the running task may have joined a
7990
+ * throttled one but it's still the running task. Trigger a
7991
+ * resched to make sure that task can still run.
7992
+ */
7993
+ resched_curr(rq);
7994
+ }
70867995
70877996 task_rq_unlock(rq, tsk, &rf);
70887997 }
....@@ -7121,9 +8030,14 @@
71218030
71228031 #ifdef CONFIG_UCLAMP_TASK_GROUP
71238032 /* Propagate the effective uclamp value for the new group */
8033
+ mutex_lock(&uclamp_mutex);
8034
+ rcu_read_lock();
71248035 cpu_util_update_eff(css);
8036
+ rcu_read_unlock();
8037
+ mutex_unlock(&uclamp_mutex);
71258038 #endif
71268039
8040
+ trace_android_rvh_cpu_cgroup_online(css);
71278041 return 0;
71288042 }
71298043
....@@ -7189,6 +8103,9 @@
71898103 if (ret)
71908104 break;
71918105 }
8106
+
8107
+ trace_android_rvh_cpu_cgroup_can_attach(tset, &ret);
8108
+
71928109 return ret;
71938110 }
71948111
....@@ -7199,6 +8116,8 @@
71998116
72008117 cgroup_taskset_for_each(task, css, tset)
72018118 sched_move_task(task);
8119
+
8120
+ trace_android_rvh_cpu_cgroup_attach(tset);
72028121 }
72038122
72048123 #ifdef CONFIG_UCLAMP_TASK_GROUP
....@@ -7210,6 +8129,9 @@
72108129 unsigned int eff[UCLAMP_CNT];
72118130 enum uclamp_id clamp_id;
72128131 unsigned int clamps;
8132
+
8133
+ lockdep_assert_held(&uclamp_mutex);
8134
+ SCHED_WARN_ON(!rcu_read_lock_held());
72138135
72148136 css_for_each_descendant_pre(css, top_css) {
72158137 uc_parent = css_tg(css)->parent
....@@ -7243,7 +8165,7 @@
72438165 }
72448166
72458167 /* Immediately update descendants RUNNABLE tasks */
7246
- uclamp_update_active_tasks(css, clamps);
8168
+ uclamp_update_active_tasks(css);
72478169 }
72488170 }
72498171
....@@ -7300,6 +8222,8 @@
73008222 req = capacity_from_percent(buf);
73018223 if (req.ret)
73028224 return req.ret;
8225
+
8226
+ static_branch_enable(&sched_uclamp_used);
73038227
73048228 mutex_lock(&uclamp_mutex);
73058229 rcu_read_lock();
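The hunk above arms the sched_uclamp_used static key the first time a clamp value is actually written, so the enqueue/dequeue fast path can skip the uclamp accounting until the feature is in use. The underlying jump-label pattern, as a generic sketch in which every name apart from the API is illustrative:

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(example_feature_used);

/* Hot path: a patched jump/NOP, no flag load, until the key is enabled. */
static inline void example_hot_path(void)
{
	if (!static_branch_unlikely(&example_feature_used))
		return;			/* feature never configured: skip the work */

	/* ... per-task accounting would go here ... */
}

/* Slow path: the first configuration request switches the key on, once. */
static void example_enable_feature(void)
{
	static_branch_enable(&example_feature_used);
}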
....@@ -7415,7 +8339,9 @@
74158339 static DEFINE_MUTEX(cfs_constraints_mutex);
74168340
74178341 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7418
-const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
8342
+static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
8343
+/* More than 203 days if BW_SHIFT equals 20. */
8344
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
74198345
74208346 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
74218347
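The 203 days in the new comment can be reconstructed from the bandwidth fixed-point limits, assuming kernel/sched/sched.h still defines BW_SHIFT as 20 and MAX_BW as (1ULL << (64 - BW_SHIFT)) - 1: MAX_BW = 2^44 - 1, read as microseconds of quota, is about 1.76 * 10^13 us, i.e. about 1.76 * 10^7 seconds or roughly 203.6 days, and max_cfs_runtime is that same bound converted to nanoseconds. Larger quotas would overflow 64-bit arithmetic once the bandwidth code scales runtimes by 2^BW_SHIFT in to_ratio(), which is what the new cutoff in tg_set_cfs_bandwidth() defends against.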
....@@ -7441,6 +8367,12 @@
74418367 * feasibility.
74428368 */
74438369 if (period > max_cfs_quota_period)
8370
+ return -EINVAL;
8371
+
8372
+ /*
8373
+ * Bound quota to defend quota against overflow during bandwidth shift.
8374
+ */
8375
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
74448376 return -EINVAL;
74458377
74468378 /*
....@@ -7495,7 +8427,7 @@
74958427 return ret;
74968428 }
74978429
7498
-int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
8430
+static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
74998431 {
75008432 u64 quota, period;
75018433
....@@ -7510,7 +8442,7 @@
75108442 return tg_set_cfs_bandwidth(tg, period, quota);
75118443 }
75128444
7513
-long tg_get_cfs_quota(struct task_group *tg)
8445
+static long tg_get_cfs_quota(struct task_group *tg)
75148446 {
75158447 u64 quota_us;
75168448
....@@ -7523,7 +8455,7 @@
75238455 return quota_us;
75248456 }
75258457
7526
-int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
8458
+static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
75278459 {
75288460 u64 quota, period;
75298461
....@@ -7536,7 +8468,7 @@
75368468 return tg_set_cfs_bandwidth(tg, period, quota);
75378469 }
75388470
7539
-long tg_get_cfs_period(struct task_group *tg)
8471
+static long tg_get_cfs_period(struct task_group *tg)
75408472 {
75418473 u64 cfs_period_us;
75428474
....@@ -8013,4 +8945,7 @@
80138945 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
80148946 };
80158947
8016
-#undef CREATE_TRACE_POINTS
8948
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
8949
+{
8950
+ trace_sched_update_nr_running_tp(rq, count);
8951
+}