From 61598093bbdd283a7edc367d900f223070ead8d2 Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Fri, 10 May 2024 07:43:03 +0000 Subject: [PATCH] add ax88772C AX88772C_eeprom_tools --- kernel/kernel/sched/core.c | 2555 ++++++++++++++++++++++++++++++++++++++++------------------ 1 files changed, 1,745 insertions(+), 810 deletions(-) diff --git a/kernel/kernel/sched/core.c b/kernel/kernel/sched/core.c index 3908b0e..30ab811 100644 --- a/kernel/kernel/sched/core.c +++ b/kernel/kernel/sched/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/core.c * @@ -5,6 +6,10 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ +#define CREATE_TRACE_POINTS +#include <trace/events/sched.h> +#undef CREATE_TRACE_POINTS + #include "sched.h" #include <linux/nospec.h> @@ -16,14 +21,41 @@ #include <asm/tlb.h> #include "../workqueue_internal.h" +#include "../../io_uring/io-wq.h" #include "../smpboot.h" #include "pelt.h" +#include "smp.h" -#define CREATE_TRACE_POINTS -#include <trace/events/sched.h> +#include <trace/hooks/sched.h> +#include <trace/hooks/dtask.h> + +/* + * Export tracepoints that act as a bare tracehook (ie: have no trace event + * associated with them) to allow external modules to probe them. + */ +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_switch); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking); +#ifdef CONFIG_SCHEDSTATS +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked); +#endif DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +EXPORT_SYMBOL_GPL(runqueues); #ifdef CONFIG_SCHED_DEBUG /* @@ -38,6 +70,7 @@ const_debug unsigned int sysctl_sched_features = #include "features.h" 0; +EXPORT_SYMBOL_GPL(sysctl_sched_features); #undef SCHED_FEAT #endif @@ -60,6 +93,100 @@ * default: 0.95s */ int sysctl_sched_rt_runtime = 950000; + + +/* + * Serialization rules: + * + * Lock order: + * + * p->pi_lock + * rq->lock + * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) + * + * rq1->lock + * rq2->lock where: rq1 < rq2 + * + * Regular state: + * + * Normal scheduling state is serialized by rq->lock. __schedule() takes the + * local CPU's rq->lock, it optionally removes the task from the runqueue and + * always looks at the local rq data structures to find the most elegible task + * to run next. + * + * Task enqueue is also under rq->lock, possibly taken from another CPU. + * Wakeups from another LLC domain might use an IPI to transfer the enqueue to + * the local CPU to avoid bouncing the runqueue state around [ see + * ttwu_queue_wakelist() ] + * + * Task wakeup, specifically wakeups that involve migration, are horribly + * complicated to avoid having to take two rq->locks. + * + * Special state: + * + * System-calls and anything external will use task_rq_lock() which acquires + * both p->pi_lock and rq->lock. As a consequence the state they change is + * stable while holding either lock: + * + * - sched_setaffinity()/ + * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed + * - set_user_nice(): p->se.load, p->*prio + * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, + * p->se.load, p->rt_priority, + * p->dl.dl_{runtime, deadline, period, flags, bw, density} + * - sched_setnuma(): p->numa_preferred_nid + * - sched_move_task()/ + * cpu_cgroup_fork(): p->sched_task_group + * - uclamp_update_active() p->uclamp* + * + * p->state <- TASK_*: + * + * is changed locklessly using set_current_state(), __set_current_state() or + * set_special_state(), see their respective comments, or by + * try_to_wake_up(). This latter uses p->pi_lock to serialize against + * concurrent self. + * + * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: + * + * is set by activate_task() and cleared by deactivate_task(), under + * rq->lock. Non-zero indicates the task is runnable, the special + * ON_RQ_MIGRATING state is used for migration without holding both + * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). + * + * p->on_cpu <- { 0, 1 }: + * + * is set by prepare_task() and cleared by finish_task() such that it will be + * set before p is scheduled-in and cleared after p is scheduled-out, both + * under rq->lock. Non-zero indicates the task is running on its CPU. + * + * [ The astute reader will observe that it is possible for two tasks on one + * CPU to have ->on_cpu = 1 at the same time. ] + * + * task_cpu(p): is changed by set_task_cpu(), the rules are: + * + * - Don't call set_task_cpu() on a blocked task: + * + * We don't care what CPU we're not running on, this simplifies hotplug, + * the CPU assignment of blocked tasks isn't required to be valid. + * + * - for try_to_wake_up(), called under p->pi_lock: + * + * This allows try_to_wake_up() to only take one rq->lock, see its comment. + * + * - for migration called under rq->lock: + * [ see task_on_rq_migrating() in task_rq_lock() ] + * + * o move_queued_task() + * o detach_task() + * + * - for migration called under double_rq_lock(): + * + * o __migrate_swap_task() + * o push_rt_task() / pull_rt_task() + * o push_dl_task() / pull_dl_task() + * o dl_task_offline_migration() + * + */ /* * __task_rq_lock - lock the rq @p resides on. @@ -84,6 +211,7 @@ cpu_relax(); } } +EXPORT_SYMBOL_GPL(__task_rq_lock); /* * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. @@ -126,6 +254,7 @@ cpu_relax(); } } +EXPORT_SYMBOL_GPL(task_rq_lock); /* * RQ-clock updating methods: @@ -206,7 +335,15 @@ rq->clock += delta; update_rq_clock_task(rq, delta); } +EXPORT_SYMBOL_GPL(update_rq_clock); +static inline void +rq_csd_init(struct rq *rq, struct __call_single_data *csd, smp_call_func_t func) +{ + csd->flags = 0; + csd->func = func; + csd->info = rq; +} #ifdef CONFIG_SCHED_HRTICK /* @@ -243,8 +380,9 @@ static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = rq->hrtick_time; - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); } /* @@ -257,7 +395,6 @@ rq_lock(rq, &rf); __hrtick_restart(rq); - rq->hrtick_csd_pending = 0; rq_unlock(rq, &rf); } @@ -269,7 +406,6 @@ void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time; s64 delta; /* @@ -277,16 +413,12 @@ * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - time = ktime_add_ns(timer->base->get_time(), delta); + rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); - hrtimer_set_expires(timer, time); - - if (rq == this_rq()) { + if (rq == this_rq()) __hrtick_restart(rq); - } else if (!rq->hrtick_csd_pending) { + else smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); - rq->hrtick_csd_pending = 1; - } } #else @@ -303,21 +435,17 @@ */ delay = max_t(u64, delay, 10000LL); hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); } + #endif /* CONFIG_SMP */ static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq->hrtick_csd_pending = 0; - - rq->hrtick_csd.flags = 0; - rq->hrtick_csd.func = __hrtick_start; - rq->hrtick_csd.info = rq; + rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); #endif - - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; } #else /* CONFIG_SCHED_HRTICK */ @@ -399,7 +527,7 @@ #endif #endif -void wake_q_add(struct wake_q_head *head, struct task_struct *task) +static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) { struct wake_q_node *node = &task->wake_q; @@ -412,23 +540,58 @@ * state, even in the failed case, an explicit smp_mb() must be used. */ smp_mb__before_atomic(); - if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)) - return; - - head->count++; - - get_task_struct(task); + if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) + return false; /* * The head is context local, there can be no concurrency. */ *head->lastp = node; head->lastp = &node->next; + head->count++; + return true; } -static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, - int sibling_count_hint); +/** + * wake_q_add() - queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. + * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. + */ +void wake_q_add(struct wake_q_head *head, struct task_struct *task) +{ + if (__wake_q_add(head, task)) + get_task_struct(task); +} + +/** + * wake_q_add_safe() - safely queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. + * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. + * + * This function is essentially a task-safe equivalent to wake_q_add(). Callers + * that already hold reference to @task can call the 'safe' version and trust + * wake_q to do the right thing depending whether or not the @task is already + * queued for wakeup. + */ +void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) +{ + if (!__wake_q_add(head, task)) + put_task_struct(task); +} void wake_up_q(struct wake_q_head *head) { @@ -442,12 +605,14 @@ /* Task can safely be re-inserted now: */ node = node->next; task->wake_q.next = NULL; + task->wake_q_count = head->count; /* - * try_to_wake_up() executes a full barrier, which pairs with + * wake_up_process() executes a full barrier, which pairs with * the queueing in wake_q_add() so as not to miss wakeups. */ - try_to_wake_up(task, TASK_NORMAL, 0, head->count); + wake_up_process(task); + task->wake_q_count = 0; put_task_struct(task); } } @@ -477,15 +642,12 @@ return; } -#ifdef CONFIG_PREEMPT if (set_nr_and_not_polling(curr)) -#else - if (set_nr_and_not_polling(curr) && (rq->curr == rq->idle)) -#endif smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); } +EXPORT_SYMBOL_GPL(resched_curr); void resched_cpu(int cpu) { @@ -510,27 +672,49 @@ */ int get_nohz_timer_target(void) { - int i, cpu = smp_processor_id(); + int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; - if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) - return cpu; + if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) { + if (!idle_cpu(cpu)) + return cpu; + default_cpu = cpu; + } rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu(i, sched_domain_span(sd)) { + for_each_cpu_and(i, sched_domain_span(sd), + housekeeping_cpumask(HK_FLAG_TIMER)) { if (cpu == i) continue; - if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { + if (!idle_cpu(i)) { cpu = i; goto unlock; } } } - if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) - cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + if (default_cpu == -1) { + for_each_cpu_and(i, cpu_active_mask, + housekeeping_cpumask(HK_FLAG_TIMER)) { + if (cpu == i) + continue; + + if (!idle_cpu(i)) { + cpu = i; + goto unlock; + } + } + + /* no active, not-idle, housekpeeing CPU found. */ + default_cpu = cpumask_any(cpu_active_mask); + + if (unlikely(default_cpu >= nr_cpu_ids)) + goto unlock; + } + + cpu = default_cpu; unlock: rcu_read_unlock(); return cpu; @@ -590,29 +774,23 @@ wake_up_idle_cpu(cpu); } -static inline bool got_nohz_idle_kick(void) +static void nohz_csd_func(void *info) { - int cpu = smp_processor_id(); - - if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) - return false; - - if (idle_cpu(cpu) && !need_resched()) - return true; + struct rq *rq = info; + int cpu = cpu_of(rq); + unsigned int flags; /* - * We can't run Idle Load Balance on this CPU for this time so we - * cancel it and clear NOHZ_BALANCE_KICK + * Release the rq::nohz_csd. */ - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); - return false; -} + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); + WARN_ON(!(flags & NOHZ_KICK_MASK)); -#else /* CONFIG_NO_HZ_COMMON */ - -static inline bool got_nohz_idle_kick(void) -{ - return false; + rq->idle_balance = idle_cpu(cpu); + if (rq->idle_balance && !need_resched()) { + rq->nohz_idle_balance = flags; + raise_softirq_irqoff(SCHED_SOFTIRQ); + } } #endif /* CONFIG_NO_HZ_COMMON */ @@ -703,18 +881,18 @@ } #endif -static void set_load_weight(struct task_struct *p, bool update_load) +static void set_load_weight(struct task_struct *p) { + bool update_load = !(READ_ONCE(p->state) & TASK_NEW); int prio = p->static_prio - MAX_RT_PRIO; struct load_weight *load = &p->se.load; /* * SCHED_IDLE tasks get minimal weight: */ - if (idle_policy(p->policy)) { + if (task_has_idle_policy(p)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; - p->se.runnable_weight = load->weight; return; } @@ -727,7 +905,6 @@ } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; - p->se.runnable_weight = load->weight; } } @@ -750,8 +927,46 @@ /* Max allowed maximum utilization */ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; +/* + * By default RT tasks run at the maximum performance point/capacity of the + * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to + * SCHED_CAPACITY_SCALE. + * + * This knob allows admins to change the default behavior when uclamp is being + * used. In battery powered devices, particularly, running at the maximum + * capacity and frequency will increase energy consumption and shorten the + * battery life. + * + * This knob only affects RT tasks that their uclamp_se->user_defined == false. + * + * This knob will not override the system default sched_util_clamp_min defined + * above. + */ +unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; + /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; + +/* + * This static key is used to reduce the uclamp overhead in the fast path. It + * primarily disables the call to uclamp_rq_{inc, dec}() in + * enqueue/dequeue_task(). + * + * This allows users to continue to enable uclamp in their kernel config with + * minimum uclamp overhead in the fast path. + * + * As soon as userspace modifies any of the uclamp knobs, the static key is + * enabled, since we have an actual users that make use of uclamp + * functionality. + * + * The knobs that would enable this static key are: + * + * * A task modifying its uclamp value with sched_setattr(). + * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs. + * * An admin modifying the cgroup cpu.uclamp.{min, max} + */ +DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); +EXPORT_SYMBOL_GPL(sched_uclamp_used); /* Integer rounded range for each bucket */ #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) @@ -762,11 +977,6 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) { return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); -} - -static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) -{ - return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); } static inline unsigned int uclamp_none(enum uclamp_id clamp_id) @@ -808,7 +1018,7 @@ if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) return; - WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value); + uclamp_rq_set(rq, clamp_id, clamp_value); } static inline @@ -832,12 +1042,79 @@ return uclamp_idle_value(rq, clamp_id, clamp_value); } +static void __uclamp_update_util_min_rt_default(struct task_struct *p) +{ + unsigned int default_util_min; + struct uclamp_se *uc_se; + + lockdep_assert_held(&p->pi_lock); + + uc_se = &p->uclamp_req[UCLAMP_MIN]; + + /* Only sync if user didn't override the default */ + if (uc_se->user_defined) + return; + + default_util_min = sysctl_sched_uclamp_util_min_rt_default; + uclamp_se_set(uc_se, default_util_min, false); +} + +static void uclamp_update_util_min_rt_default(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + + if (!rt_task(p)) + return; + + /* Protect updates to p->uclamp_* */ + rq = task_rq_lock(p, &rf); + __uclamp_update_util_min_rt_default(p); + task_rq_unlock(rq, p, &rf); +} + +static void uclamp_sync_util_min_rt_default(void) +{ + struct task_struct *g, *p; + + /* + * copy_process() sysctl_uclamp + * uclamp_min_rt = X; + * write_lock(&tasklist_lock) read_lock(&tasklist_lock) + * // link thread smp_mb__after_spinlock() + * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); + * sched_post_fork() for_each_process_thread() + * __uclamp_sync_rt() __uclamp_sync_rt() + * + * Ensures that either sched_post_fork() will observe the new + * uclamp_min_rt or for_each_process_thread() will observe the new + * task. + */ + read_lock(&tasklist_lock); + smp_mb__after_spinlock(); + read_unlock(&tasklist_lock); + + rcu_read_lock(); + for_each_process_thread(g, p) + uclamp_update_util_min_rt_default(p); + rcu_read_unlock(); +} + +#if IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE) +void rockchip_perf_uclamp_sync_util_min_rt_default(void) +{ + uclamp_sync_util_min_rt_default(); +} +EXPORT_SYMBOL(rockchip_perf_uclamp_sync_util_min_rt_default); +#endif + static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { + /* Copy by value as we could modify it */ struct uclamp_se uc_req = p->uclamp_req[clamp_id]; #ifdef CONFIG_UCLAMP_TASK_GROUP - struct uclamp_se uc_max; + unsigned int tg_min, tg_max, value; /* * Tasks in autogroups or root task group will be @@ -848,9 +1125,11 @@ if (task_group(p) == &root_task_group) return uc_req; - uc_max = task_group(p)->uclamp[clamp_id]; - if (uc_req.value > uc_max.value || !uc_req.user_defined) - return uc_max; + tg_min = task_group(p)->uclamp[UCLAMP_MIN].value; + tg_max = task_group(p)->uclamp[UCLAMP_MAX].value; + value = uc_req.value; + value = clamp(value, tg_min, tg_max); + uclamp_se_set(&uc_req, value, false); #endif return uc_req; @@ -869,6 +1148,12 @@ { struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id); struct uclamp_se uc_max = uclamp_default[clamp_id]; + struct uclamp_se uc_eff; + int ret = 0; + + trace_android_rvh_uclamp_eff_get(p, clamp_id, &uc_max, &uc_eff, &ret); + if (ret) + return uc_eff; /* System default restrictions always apply */ if (unlikely(uc_req.value > uc_max.value)) @@ -889,6 +1174,7 @@ return (unsigned long)uc_eff.value; } +EXPORT_SYMBOL_GPL(uclamp_eff_value); /* * When a task is enqueued on a rq, the clamp bucket currently defined by the @@ -925,8 +1211,8 @@ if (bucket->tasks == 1 || uc_se->value > bucket->value) bucket->value = uc_se->value; - if (uc_se->value > READ_ONCE(uc_rq->value)) - WRITE_ONCE(uc_rq->value, uc_se->value); + if (uc_se->value > uclamp_rq_get(rq, clamp_id)) + uclamp_rq_set(rq, clamp_id, uc_se->value); } /* @@ -949,10 +1235,38 @@ lockdep_assert_held(&rq->lock); + /* + * If sched_uclamp_used was enabled after task @p was enqueued, + * we could end up with unbalanced call to uclamp_rq_dec_id(). + * + * In this case the uc_se->active flag should be false since no uclamp + * accounting was performed at enqueue time and we can just return + * here. + * + * Need to be careful of the following enqeueue/dequeue ordering + * problem too + * + * enqueue(taskA) + * // sched_uclamp_used gets enabled + * enqueue(taskB) + * dequeue(taskA) + * // Must not decrement bukcet->tasks here + * dequeue(taskB) + * + * where we could end up with stale data in uc_se and + * bucket[uc_se->bucket_id]. + * + * The following check here eliminates the possibility of such race. + */ + if (unlikely(!uc_se->active)) + return; + bucket = &uc_rq->bucket[uc_se->bucket_id]; + SCHED_WARN_ON(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; + uc_se->active = false; /* @@ -964,7 +1278,7 @@ if (likely(bucket->tasks)) return; - rq_clamp = READ_ONCE(uc_rq->value); + rq_clamp = uclamp_rq_get(rq, clamp_id); /* * Defensive programming: this should never happen. If it happens, * e.g. due to future modification, warn and fixup the expected value. @@ -972,13 +1286,22 @@ SCHED_WARN_ON(bucket->value > rq_clamp); if (bucket->value >= rq_clamp) { bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); - WRITE_ONCE(uc_rq->value, bkt_clamp); + uclamp_rq_set(rq, clamp_id, bkt_clamp); } } static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -995,6 +1318,15 @@ { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1002,9 +1334,27 @@ uclamp_rq_dec_id(rq, p, clamp_id); } -static inline void -uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, + enum uclamp_id clamp_id) { + if (!p->uclamp[clamp_id].active) + return; + + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + + /* + * Make sure to clear the idle flag if we've transiently reached 0 + * active tasks on rq. + */ + if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; +} + +static inline void +uclamp_update_active(struct task_struct *p) +{ + enum uclamp_id clamp_id; struct rq_flags rf; struct rq *rq; @@ -1024,30 +1374,22 @@ * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. */ - if (p->uclamp[clamp_id].active) { - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); - } + for_each_clamp_id(clamp_id) + uclamp_rq_reinc_id(rq, p, clamp_id); task_rq_unlock(rq, p, &rf); } #ifdef CONFIG_UCLAMP_TASK_GROUP static inline void -uclamp_update_active_tasks(struct cgroup_subsys_state *css, - unsigned int clamps) +uclamp_update_active_tasks(struct cgroup_subsys_state *css) { - enum uclamp_id clamp_id; struct css_task_iter it; struct task_struct *p; css_task_iter_start(css, 0, &it); - while ((p = css_task_iter_next(&it))) { - for_each_clamp_id(clamp_id) { - if ((0x1 << clamp_id) & clamps) - uclamp_update_active(p, clamp_id); - } - } + while ((p = css_task_iter_next(&it))) + uclamp_update_active(p); css_task_iter_end(&it); } @@ -1070,16 +1412,16 @@ #endif int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; - int old_min, old_max; + int old_min, old_max, old_min_rt; int result; mutex_lock(&uclamp_mutex); old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; + old_min_rt = sysctl_sched_uclamp_util_min_rt_default; result = proc_dointvec(table, write, buffer, lenp, ppos); if (result) @@ -1088,7 +1430,9 @@ goto done; if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || - sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE || + sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) { + result = -EINVAL; goto undo; } @@ -1104,8 +1448,15 @@ update_root_tg = true; } - if (update_root_tg) + if (update_root_tg) { + static_branch_enable(&sched_uclamp_used); uclamp_update_root_tg(); + } + + if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { + static_branch_enable(&sched_uclamp_used); + uclamp_sync_util_min_rt_default(); + } /* * We update all RUNNABLE tasks only when task groups are in use. @@ -1118,6 +1469,7 @@ undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; + sysctl_sched_uclamp_util_min_rt_default = old_min_rt; done: mutex_unlock(&uclamp_mutex); @@ -1127,20 +1479,61 @@ static int uclamp_validate(struct task_struct *p, const struct sched_attr *attr) { - unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; - unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; + int util_min = p->uclamp_req[UCLAMP_MIN].value; + int util_max = p->uclamp_req[UCLAMP_MAX].value; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) - lower_bound = attr->sched_util_min; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) - upper_bound = attr->sched_util_max; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + util_min = attr->sched_util_min; - if (lower_bound > upper_bound) + if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + util_max = attr->sched_util_max; + + if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (util_min != -1 && util_max != -1 && util_min > util_max) return -EINVAL; - if (upper_bound > SCHED_CAPACITY_SCALE) - return -EINVAL; + + /* + * We have valid uclamp attributes; make sure uclamp is enabled. + * + * We need to do that here, because enabling static branches is a + * blocking operation which obviously cannot be done while holding + * scheduler locks. + */ + static_branch_enable(&sched_uclamp_used); return 0; +} + +static bool uclamp_reset(const struct sched_attr *attr, + enum uclamp_id clamp_id, + struct uclamp_se *uc_se) +{ + /* Reset on sched class change for a non user-defined clamp value. */ + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) && + !uc_se->user_defined) + return true; + + /* Reset on sched_util_{min,max} == -1. */ + if (clamp_id == UCLAMP_MIN && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min == -1) { + return true; + } + + if (clamp_id == UCLAMP_MAX && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max == -1) { + return true; + } + + return false; } static void __setscheduler_uclamp(struct task_struct *p, @@ -1148,40 +1541,41 @@ { enum uclamp_id clamp_id; - /* - * On scheduling class change, reset to default clamps for tasks - * without a task-specific value. - */ for_each_clamp_id(clamp_id) { struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; - unsigned int clamp_value = uclamp_none(clamp_id); + unsigned int value; - /* Keep using defined clamps across class changes */ - if (uc_se->user_defined) + if (!uclamp_reset(attr, clamp_id, uc_se)) continue; - /* By default, RT tasks always get 100% boost */ - if (sched_feat(SUGOV_RT_MAX_FREQ) && - unlikely(rt_task(p) && - clamp_id == UCLAMP_MIN)) { + /* + * RT by default have a 100% boost value that could be modified + * at runtime. + */ + if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) + value = sysctl_sched_uclamp_util_min_rt_default; + else + value = uclamp_none(clamp_id); - clamp_value = uclamp_none(UCLAMP_MAX); - } + uclamp_se_set(uc_se, value, false); - uclamp_se_set(uc_se, clamp_value, false); } if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) return; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], attr->sched_util_min, true); + trace_android_vh_setscheduler_uclamp(p, UCLAMP_MIN, attr->sched_util_min); } - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], attr->sched_util_max, true); + trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max); } } @@ -1189,6 +1583,10 @@ { enum uclamp_id clamp_id; + /* + * We don't need to hold task_rq_lock() when updating p->uclamp_* here + * as the task is still at its early fork stages. + */ for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; @@ -1201,39 +1599,24 @@ } } -#ifdef CONFIG_SMP -unsigned int uclamp_task(struct task_struct *p) +static void uclamp_post_fork(struct task_struct *p) { - unsigned long util; - - util = task_util_est(p); - util = max(util, uclamp_eff_value(p, UCLAMP_MIN)); - util = min(util, uclamp_eff_value(p, UCLAMP_MAX)); - - return util; + uclamp_update_util_min_rt_default(p); } -bool uclamp_boosted(struct task_struct *p) +static void __init init_uclamp_rq(struct rq *rq) { - return uclamp_eff_value(p, UCLAMP_MIN) > 0; + enum uclamp_id clamp_id; + struct uclamp_rq *uc_rq = rq->uclamp; + + for_each_clamp_id(clamp_id) { + uc_rq[clamp_id] = (struct uclamp_rq) { + .value = uclamp_none(clamp_id) + }; + } + + rq->uclamp_flags = UCLAMP_FLAG_IDLE; } - -bool uclamp_latency_sensitive(struct task_struct *p) -{ -#ifdef CONFIG_UCLAMP_TASK_GROUP - struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id); - struct task_group *tg; - - if (!css) - return false; - tg = container_of(css, struct task_group, css); - - return tg->latency_sensitive; -#else - return false; -#endif -} -#endif /* CONFIG_SMP */ static void __init init_uclamp(void) { @@ -1241,13 +1624,8 @@ enum uclamp_id clamp_id; int cpu; - mutex_init(&uclamp_mutex); - - for_each_possible_cpu(cpu) { - memset(&cpu_rq(cpu)->uclamp, 0, - sizeof(struct uclamp_rq)*UCLAMP_CNT); - cpu_rq(cpu)->uclamp_flags = 0; - } + for_each_possible_cpu(cpu) + init_uclamp_rq(cpu_rq(cpu)); for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], @@ -1276,41 +1654,7 @@ static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } - -long schedtune_task_margin(struct task_struct *task); - -#ifdef CONFIG_SMP -unsigned int uclamp_task(struct task_struct *p) -{ - unsigned long util = task_util_est(p); -#ifdef CONFIG_SCHED_TUNE - long margin = schedtune_task_margin(p); - - trace_sched_boost_task(p, util, margin); - - util += margin; -#endif - - return util; -} - -bool uclamp_boosted(struct task_struct *p) -{ -#ifdef CONFIG_SCHED_TUNE - return schedtune_task_boost(p) > 0; -#endif - return false; -} - -bool uclamp_latency_sensitive(struct task_struct *p) -{ -#ifdef CONFIG_SCHED_TUNE - return schedtune_prefer_idle(p) != 0; -#endif - return false; -} -#endif /* CONFIG_SMP */ - +static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -1325,7 +1669,9 @@ } uclamp_rq_inc(rq, p); + trace_android_rvh_enqueue_task(rq, p, flags); p->sched_class->enqueue_task(rq, p, flags); + trace_android_rvh_after_enqueue_task(rq, p); } static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) @@ -1339,31 +1685,42 @@ } uclamp_rq_dec(rq, p); + trace_android_rvh_dequeue_task(rq, p, flags); p->sched_class->dequeue_task(rq, p, flags); + trace_android_rvh_after_dequeue_task(rq, p); } void activate_task(struct rq *rq, struct task_struct *p, int flags) { - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; + if (task_on_rq_migrating(p)) + flags |= ENQUEUE_MIGRATED; enqueue_task(rq, p, flags); + + p->on_rq = TASK_ON_RQ_QUEUED; } +EXPORT_SYMBOL_GPL(activate_task); void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; + p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; dequeue_task(rq, p, flags); } +EXPORT_SYMBOL_GPL(deactivate_task); -/* - * __normal_prio - return the priority that is based on the static prio - */ -static inline int __normal_prio(struct task_struct *p) +static inline int __normal_prio(int policy, int rt_prio, int nice) { - return p->static_prio; + int prio; + + if (dl_policy(policy)) + prio = MAX_DL_PRIO - 1; + else if (rt_policy(policy)) + prio = MAX_RT_PRIO - 1 - rt_prio; + else + prio = NICE_TO_PRIO(nice); + + return prio; } /* @@ -1375,15 +1732,7 @@ */ static inline int normal_prio(struct task_struct *p) { - int prio; - - if (task_has_dl_policy(p)) - prio = MAX_DL_PRIO-1; - else if (task_has_rt_policy(p)) - prio = MAX_RT_PRIO-1 - p->rt_priority; - else - prio = __normal_prio(p); - return prio; + return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio)); } /* @@ -1439,20 +1788,10 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { - const struct sched_class *class; - - if (p->sched_class == rq->curr->sched_class) { + if (p->sched_class == rq->curr->sched_class) rq->curr->sched_class->check_preempt_curr(rq, p, flags); - } else { - for_each_class(class) { - if (class == rq->curr->sched_class) - break; - if (class == p->sched_class) { - resched_curr(rq); - break; - } - } - } + else if (p->sched_class > rq->curr->sched_class) + resched_curr(rq); /* * A queue event has occurred, and we're going to schedule. In @@ -1461,33 +1800,26 @@ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) rq_clock_skip_update(rq); } +EXPORT_SYMBOL_GPL(check_preempt_curr); #ifdef CONFIG_SMP -static inline bool is_per_cpu_kthread(struct task_struct *p) -{ - if (!(p->flags & PF_KTHREAD)) - return false; - - if (p->nr_cpus_allowed != 1) - return false; - - return true; -} - /* - * Per-CPU kthreads are allowed to run on !actie && online CPUs, see + * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). */ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; if (is_per_cpu_kthread(p)) return cpu_online(cpu); - return cpu_active(cpu); + if (!cpu_active(cpu)) + return false; + + return cpumask_test_cpu(cpu, task_cpu_possible_mask(p)); } /* @@ -1512,19 +1844,29 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int new_cpu) { + int detached = 0; + lockdep_assert_held(&rq->lock); - WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); - dequeue_task(rq, p, DEQUEUE_NOCLOCK); - set_task_cpu(p, new_cpu); - rq_unlock(rq, rf); + /* + * The vendor hook may drop the lock temporarily, so + * pass the rq flags to unpin lock. We expect the + * rq lock to be held after return. + */ + trace_android_rvh_migrate_queued_task(rq, rf, p, new_cpu, &detached); + if (detached) + goto attach; + deactivate_task(rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, new_cpu); + +attach: + rq_unlock(rq, rf); rq = cpu_rq(new_cpu); rq_lock(rq, rf); BUG_ON(task_cpu(p) != new_cpu); - enqueue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; + activate_task(rq, p, 0); check_preempt_curr(rq, p, 0); return rq; @@ -1576,10 +1918,10 @@ local_irq_disable(); /* * We need to explicitly wake pending tasks before running - * __migrate_task() such that we will not miss enforcing cpus_allowed + * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ - sched_ttwu_pending(); + flush_smp_call_function_from_idle(); raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); @@ -1607,8 +1949,9 @@ */ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) { - cpumask_copy(&p->cpus_allowed, new_mask); + cpumask_copy(&p->cpus_mask, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); + trace_android_rvh_set_cpus_allowed_comm(p, new_mask); } void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) @@ -1637,28 +1980,23 @@ if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); } /* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. + * Called with both p->pi_lock and rq->lock held; drops both before returning. */ -static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) +static int __set_cpus_allowed_ptr_locked(struct task_struct *p, + const struct cpumask *new_mask, + bool check, + struct rq *rq, + struct rq_flags *rf) { const struct cpumask *cpu_valid_mask = cpu_active_mask; + const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); unsigned int dest_cpu; - struct rq_flags rf; - struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &rf); update_rq_clock(rq); if (p->flags & PF_KTHREAD) { @@ -1666,6 +2004,9 @@ * Kernel threads are allowed on online && !active CPUs */ cpu_valid_mask = cpu_online_mask; + } else if (!cpumask_subset(new_mask, cpu_allowed_mask)) { + ret = -EINVAL; + goto out; } /* @@ -1677,10 +2018,15 @@ goto out; } - if (cpumask_equal(&p->cpus_allowed, new_mask)) + if (cpumask_equal(&p->cpus_mask, new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + /* + * Picking a ~random cpu helps in cases where we are changing affinity + * for groups of tasks (ie. cpuset), so that load balancing is not + * immediately required to distribute the tasks within their new mask. + */ + dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; @@ -1705,21 +2051,39 @@ if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &rf); + task_rq_unlock(rq, p, rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); return 0; } else if (task_on_rq_queued(p)) { /* * OK, since we're going to drop the lock immediately * afterwards anyway. */ - rq = move_queued_task(rq, &rf, p, dest_cpu); + rq = move_queued_task(rq, rf, p, dest_cpu); } out: - task_rq_unlock(rq, p, &rf); + task_rq_unlock(rq, p, rf); return ret; +} + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) +{ + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + return __set_cpus_allowed_ptr_locked(p, new_mask, check, rq, &rf); } int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) @@ -1727,6 +2091,74 @@ return __set_cpus_allowed_ptr(p, new_mask, false); } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + +/* + * Change a given task's CPU affinity to the intersection of its current + * affinity mask and @subset_mask, writing the resulting mask to @new_mask. + * If the resulting mask is empty, leave the affinity unchanged and return + * -EINVAL. + */ +static int restrict_cpus_allowed_ptr(struct task_struct *p, + struct cpumask *new_mask, + const struct cpumask *subset_mask) +{ + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { + task_rq_unlock(rq, p, &rf); + return -EINVAL; + } + + return __set_cpus_allowed_ptr_locked(p, new_mask, false, rq, &rf); +} + +/* + * Restrict a given task's CPU affinity so that it is a subset of + * task_cpu_possible_mask(). If the resulting mask is empty, we warn and + * walk up the cpuset hierarchy until we find a suitable mask. + */ +void force_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + cpumask_var_t new_mask; + const struct cpumask *override_mask = task_cpu_possible_mask(p); + + alloc_cpumask_var(&new_mask, GFP_KERNEL); + + /* + * __migrate_task() can fail silently in the face of concurrent + * offlining of the chosen destination CPU, so take the hotplug + * lock to ensure that the migration succeeds. + */ + trace_android_rvh_force_compatible_pre(NULL); + cpus_read_lock(); + if (!cpumask_available(new_mask)) + goto out_set_mask; + + if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) + goto out_free_mask; + + /* + * We failed to find a valid subset of the affinity mask for the + * task, so override it based on its cpuset hierarchy. + */ + cpuset_cpus_allowed(p, new_mask); + override_mask = new_mask; + +out_set_mask: + if (printk_ratelimit()) { + printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", + task_pid_nr(p), p->comm, + cpumask_pr_args(override_mask)); + } + + WARN_ON(set_cpus_allowed_ptr(p, override_mask)); +out_free_mask: + cpus_read_unlock(); + trace_android_rvh_force_compatible_post(NULL); + free_cpumask_var(new_mask); +} void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { @@ -1775,12 +2207,13 @@ p->se.nr_migrations++; rseq_migrate(p); perf_event_task_migrate(p); + trace_android_rvh_set_task_cpu(p, new_cpu); } __set_task_cpu(p, new_cpu); } +EXPORT_SYMBOL_GPL(set_task_cpu); -#ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { if (task_on_rq_queued(p)) { @@ -1793,11 +2226,9 @@ rq_pin_lock(src_rq, &srf); rq_pin_lock(dst_rq, &drf); - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); @@ -1840,10 +2271,10 @@ if (task_cpu(arg->src_task) != arg->src_cpu) goto unlock; - if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) + if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) goto unlock; - if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) + if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) goto unlock; __migrate_swap_task(arg->src_task, arg->dst_cpu); @@ -1885,10 +2316,10 @@ if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) goto out; - if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) + if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr)) goto out; - if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) + if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr)) goto out; trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); @@ -1897,7 +2328,7 @@ out: return ret; } -#endif /* CONFIG_NUMA_BALANCING */ +EXPORT_SYMBOL_GPL(migrate_swap); /* * wait_task_inactive - wait for a thread to unschedule. @@ -2033,7 +2464,7 @@ EXPORT_SYMBOL_GPL(kick_process); /* - * ->cpus_allowed is protected by both rq->lock and p->pi_lock + * ->cpus_ptr is protected by both rq->lock and p->pi_lock * * A few notes on cpu_active vs cpu_online: * @@ -2059,7 +2490,11 @@ int nid = cpu_to_node(cpu); const struct cpumask *nodemask = NULL; enum { cpuset, possible, fail } state = cpuset; - int dest_cpu; + int dest_cpu = -1; + + trace_android_rvh_select_fallback_rq(cpu, p, &dest_cpu); + if (dest_cpu >= 0) + return dest_cpu; /* * If the node that the CPU is on has been offlined, cpu_to_node() @@ -2071,16 +2506,14 @@ /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_active(dest_cpu)) - continue; - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (is_cpu_allowed(p, dest_cpu)) return dest_cpu; } } for (;;) { /* Any allowed, online CPU? */ - for_each_cpu(dest_cpu, &p->cpus_allowed) { + for_each_cpu(dest_cpu, p->cpus_ptr) { if (!is_cpu_allowed(p, dest_cpu)) continue; @@ -2095,12 +2528,11 @@ state = possible; break; } - /* Fall-through */ + fallthrough; case possible: - do_set_cpus_allowed(p, cpu_possible_mask); + do_set_cpus_allowed(p, task_cpu_possible_mask(p)); state = fail; break; - case fail: BUG(); break; @@ -2124,23 +2556,21 @@ } /* - * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. */ static inline -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags, - int sibling_count_hint) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags, - sibling_count_hint); + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); else - cpu = cpumask_any(&p->cpus_allowed); + cpu = cpumask_any(p->cpus_ptr); /* * In order not to call set_task_cpu() on a blocking task we need - * to rely on ttwu() to place the task on a valid ->cpus_allowed + * to rely on ttwu() to place the task on a valid ->cpus_ptr * CPU. * * Since this is common to all placement strategies, this lives here. @@ -2152,12 +2582,6 @@ cpu = select_fallback_rq(task_cpu(p), p); return cpu; -} - -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; } void sched_set_stop_task(int cpu, struct task_struct *stop) @@ -2239,16 +2663,6 @@ __schedstat_inc(p->se.statistics.nr_wakeups_sync); } -static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) -{ - activate_task(rq, p, en_flags); - p->on_rq = TASK_ON_RQ_QUEUED; - - /* If a worker is waking up, notify the workqueue: */ - if (p->flags & PF_WQ_WORKER) - wq_worker_waking_up(p, cpu_of(rq)); -} - /* * Mark the task runnable and perform wakeup-preemption. */ @@ -2290,27 +2704,54 @@ { int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; + if (wake_flags & WF_SYNC) + en_flags |= ENQUEUE_WAKEUP_SYNC; + lockdep_assert_held(&rq->lock); -#ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; +#ifdef CONFIG_SMP if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; + else #endif + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } - ttwu_activate(rq, p, en_flags); + activate_task(rq, p, en_flags); ttwu_do_wakeup(rq, p, wake_flags, rf); } /* - * Called in case the task @p isn't fully descheduled from its runqueue, - * in this case we must do a remote wakeup. Its a 'light' wakeup though, - * since all we need to do is flip p->state to TASK_RUNNING, since - * the task is still ->on_rq. + * Consider @p being inside a wait loop: + * + * for (;;) { + * set_current_state(TASK_UNINTERRUPTIBLE); + * + * if (CONDITION) + * break; + * + * schedule(); + * } + * __set_current_state(TASK_RUNNING); + * + * between set_current_state() and schedule(). In this case @p is still + * runnable, so all that needs doing is change p->state back to TASK_RUNNING in + * an atomic manner. + * + * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq + * then schedule() must still happen and p->state can be changed to + * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we + * need to do a full wakeup with enqueue. + * + * Returns: %true when the wakeup is done, + * %false otherwise. */ -static int ttwu_remote(struct task_struct *p, int wake_flags) +static int ttwu_runnable(struct task_struct *p, int wake_flags) { struct rq_flags rf; struct rq *rq; @@ -2329,75 +2770,63 @@ } #ifdef CONFIG_SMP -void sched_ttwu_pending(void) +void sched_ttwu_pending(void *arg) { + struct llist_node *llist = arg; struct rq *rq = this_rq(); - struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p, *t; struct rq_flags rf; if (!llist) return; + /* + * rq::ttwu_pending racy indication of out-standing wakeups. + * Races such that false-negatives are possible, since they + * are shorter lived that false-positives would be. + */ + WRITE_ONCE(rq->ttwu_pending, 0); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - llist_for_each_entry_safe(p, t, llist, wake_entry) + llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { + if (WARN_ON_ONCE(p->on_cpu)) + smp_cond_load_acquire(&p->on_cpu, !VAL); + + if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) + set_task_cpu(p, cpu_of(rq)); + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); + } rq_unlock_irqrestore(rq, &rf); } -void scheduler_ipi(void) +void send_call_function_single_ipi(int cpu) { - /* - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting - * TIF_NEED_RESCHED remotely (for the first time) will also send - * this IPI. - */ - preempt_fold_need_resched(); + struct rq *rq = cpu_rq(cpu); - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) - return; - - /* - * Not all reschedule IPI handlers call irq_enter/irq_exit, since - * traditionally all their work was done from the interrupt return - * path. Now that we actually do some work, we need to make sure - * we do call them. - * - * Some archs already do call them, luckily irq_enter/exit nest - * properly. - * - * Arguably we should visit all archs and update all handlers, - * however a fair share of IPIs are still resched only so this would - * somewhat pessimize the simple resched case. - */ - irq_enter(); - sched_ttwu_pending(); - - /* - * Check if someone kicked us for doing the nohz idle load balance. - */ - if (unlikely(got_nohz_idle_kick())) { - this_rq()->idle_balance = 1; - raise_softirq_irqoff(SCHED_SOFTIRQ); - } - irq_exit(); + if (!set_nr_if_polling(rq->idle)) + arch_send_call_function_single_ipi(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } -static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) +/* + * Queue a task on the target CPUs wake_list and wake the CPU via IPI if + * necessary. The wakee CPU on receipt of the IPI will queue the task + * via sched_ttwu_wakeup() for activation so the wakee incurs the cost + * of the wakeup instead of the waker. + */ +static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { - if (!set_nr_if_polling(rq->idle)) - smp_send_reschedule(cpu); - else - trace_sched_wake_idle_without_ipi(cpu); - } + WRITE_ONCE(rq->ttwu_pending, 1); + __smp_call_single_queue(cpu, &p->wake_entry.llist); } void wake_up_if_idle(int cpu) @@ -2423,6 +2852,7 @@ out: rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(wake_up_if_idle); bool cpus_share_cache(int this_cpu, int that_cpu) { @@ -2431,6 +2861,58 @@ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } + +static inline bool ttwu_queue_cond(int cpu, int wake_flags) +{ + /* + * If the CPU does not share cache, then queue the task on the + * remote rqs wakelist to avoid accessing remote data. + */ + if (!cpus_share_cache(smp_processor_id(), cpu)) + return true; + + /* + * If the task is descheduling and the only running task on the + * CPU then use the wakelist to offload the task activation to + * the soon-to-be-idle CPU as the current CPU is likely busy. + * nr_running is checked to avoid unnecessary task stacking. + * + * Note that we can only get here with (wakee) p->on_rq=0, + * p->on_cpu can be whatever, we've done the dequeue, so + * the wakee has been accounted out of ->nr_running. + */ + if ((wake_flags & WF_ON_CPU) && !cpu_rq(cpu)->nr_running) + return true; + + return false; +} + +static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) +{ + bool cond = false; + + trace_android_rvh_ttwu_cond(&cond); + + if ((sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) || + cond) { + if (WARN_ON_ONCE(cpu == smp_processor_id())) + return false; + + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ + __ttwu_queue_wakelist(p, cpu, wake_flags); + return true; + } + + return false; +} + +#else /* !CONFIG_SMP */ + +static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) +{ + return false; +} + #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) @@ -2438,13 +2920,8 @@ struct rq *rq = cpu_rq(cpu); struct rq_flags rf; -#if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { - sched_clock_cpu(cpu); /* Sync clocks across CPUs */ - ttwu_queue_remote(p, cpu, wake_flags); + if (ttwu_queue_wakelist(p, cpu, wake_flags)) return; - } -#endif rq_lock(rq, &rf); update_rq_clock(rq); @@ -2500,8 +2977,8 @@ * migration. However the means are completely different as there is no lock * chain to provide order. Instead we do: * - * 1) smp_store_release(X->on_cpu, 0) - * 2) smp_cond_load_acquire(!X->on_cpu) + * 1) smp_store_release(X->on_cpu, 0) -- finish_task() + * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() * * Example: * @@ -2540,45 +3017,95 @@ * @p: the thread to be awakened * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) - * @sibling_count_hint: A hint at the number of threads that are being woken up - * in this event. * - * If (@state & @p->state) @p->state = TASK_RUNNING. + * Conceptually does: + * + * If (@state & @p->state) @p->state = TASK_RUNNING. * * If the task was not queued/runnable, also place it back on a runqueue. * - * Atomic against schedule() which would dequeue a task, also see - * set_current_state(). + * This function is atomic against schedule() which would dequeue the task. * - * This function executes a full memory barrier before accessing the task - * state; see set_current_state(). + * It issues a full memory barrier before accessing @p->state, see the comment + * with set_current_state(). + * + * Uses p->pi_lock to serialize against concurrent wake-ups. + * + * Relies on p->pi_lock stabilizing: + * - p->sched_class + * - p->cpus_ptr + * - p->sched_task_group + * in order to do migration, see its use of select_task_rq()/set_task_cpu(). + * + * Tries really hard to only take one task_rq(p)->lock for performance. + * Takes rq->lock in: + * - ttwu_runnable() -- old rq, unavoidable, see comment there; + * - ttwu_queue() -- new rq, for enqueue of the task; + * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. + * + * As a consequence we race really badly with just about everything. See the + * many memory barriers and their comments for details. * * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. */ static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, - int sibling_count_hint) +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int cpu, success = 0; + preempt_disable(); + if (p == current) { + /* + * We're waking current, this means 'p->on_rq' and 'task_cpu(p) + * == smp_processor_id()'. Together this means we can special + * case the whole 'p->on_rq && ttwu_runnable()' case below + * without taking any locks. + * + * In particular: + * - we rely on Program-Order guarantees for all the ordering, + * - we're serialized against set_special_state() by virtue of + * it disabling IRQs (this allows not taking ->pi_lock). + */ + if (!(p->state & state)) + goto out; + + success = 1; + trace_sched_waking(p); + p->state = TASK_RUNNING; + trace_sched_wakeup(p); + goto out; + } + /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be - * reordered with p->state check below. This pairs with mb() in - * set_current_state() the waiting thread does. + * reordered with p->state check below. This pairs with smp_store_mb() + * in set_current_state() that the waiting thread does. */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); if (!(p->state & state)) - goto out; + goto unlock; + +#ifdef CONFIG_FREEZER + /* + * If we're going to wake up a thread which may be frozen, then + * we can only do so if we have an active CPU which is capable of + * running it. This may not be the case when resuming from suspend, + * as the secondary CPUs may not yet be back online. See __thaw_task() + * for the actual wakeup. + */ + if (unlikely(frozen_or_skipped(p)) && + !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p))) + goto unlock; +#endif trace_sched_waking(p); /* We're going to change ->state: */ success = 1; - cpu = task_cpu(p); /* * Ensure we load p->on_rq _after_ p->state, otherwise it would @@ -2599,10 +3126,15 @@ * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); - if (p->on_rq && ttwu_remote(p, wake_flags)) - goto stat; + if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) + goto unlock; + + if (p->state & TASK_UNINTERRUPTIBLE) + trace_sched_blocked_reason(p); #ifdef CONFIG_SMP /* @@ -2623,8 +3155,43 @@ * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure + * schedule()'s deactivate_task() has 'happened' and p will no longer + * care about it's own p->state. See the comment in __schedule(). */ - smp_rmb(); + smp_acquire__after_ctrl_dep(); + + /* + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq + * == 0), which means we need to do an enqueue, change p->state to + * TASK_WAKING such that we can unlock p->pi_lock before doing the + * enqueue, such as ttwu_queue_wakelist(). + */ + p->state = TASK_WAKING; + + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, considering queueing p on the remote CPUs wake_list + * which potentially sends an IPI instead of spinning on p->on_cpu to + * let the waker make forward progress. This is safe because IRQs are + * disabled and the IPI will deliver after on_cpu is cleared. + * + * Ensure we load task_cpu(p) after p->on_cpu: + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) + * STORE p->on_cpu = 1 LOAD p->cpu + * + * to ensure we observe the correct CPU on which the task is currently + * scheduling. + */ + if (smp_load_acquire(&p->on_cpu) && + ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) + goto unlock; /* * If the owning (remote) CPU is still in the middle of schedule() with @@ -2637,88 +3204,79 @@ */ smp_cond_load_acquire(&p->on_cpu, !VAL); - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; + trace_android_rvh_try_to_wake_up(p); - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags, - sibling_count_hint); + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } - -#else /* CONFIG_SMP */ - - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - +#else + cpu = task_cpu(p); #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); -stat: - ttwu_stat(p, cpu, wake_flags); -out: +unlock: raw_spin_unlock_irqrestore(&p->pi_lock, flags); +out: + if (success) { + trace_android_rvh_try_to_wake_up_success(p); + ttwu_stat(p, task_cpu(p), wake_flags); + } + preempt_enable(); return success; } /** - * try_to_wake_up_local - try to wake up a local task with rq lock held - * @p: the thread to be awakened - * @rf: request-queue flags for pinning + * try_invoke_on_locked_down_task - Invoke a function on task in fixed state + * @p: Process for which the function is to be invoked, can be @current. + * @func: Function to invoke. + * @arg: Argument to function. * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. + * If the specified task can be quickly locked into a definite state + * (either sleeping or on a given runqueue), arrange to keep it in that + * state while invoking @func(@arg). This function can use ->on_rq and + * task_curr() to work out what the state is, if required. Given that + * @func can be invoked with a runqueue lock held, it had better be quite + * lightweight. + * + * Returns: + * @false if the task slipped out from under the locks. + * @true if the task was locked onto a runqueue or is sleeping. + * However, @func can override this by returning @false. */ -static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) +bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) { - struct rq *rq = task_rq(p); + struct rq_flags rf; + bool ret = false; + struct rq *rq; - if (WARN_ON_ONCE(rq != this_rq()) || - WARN_ON_ONCE(p == current)) - return; - - lockdep_assert_held(&rq->lock); - - if (!raw_spin_trylock(&p->pi_lock)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we've - * not yet picked a replacement task. - */ - rq_unlock(rq, rf); - raw_spin_lock(&p->pi_lock); - rq_relock(rq, rf); - } - - if (!(p->state & TASK_NORMAL)) - goto out; - - trace_sched_waking(p); - - if (!task_on_rq_queued(p)) { - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&rq->nr_iowait); + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + if (p->on_rq) { + rq = __task_rq_lock(p, &rf); + if (task_rq(p) == rq) + ret = func(p, arg); + rq_unlock(rq, &rf); + } else { + switch (p->state) { + case TASK_RUNNING: + case TASK_WAKING: + break; + default: + smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). + if (!p->on_rq) + ret = func(p, arg); } - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); } - - ttwu_do_wakeup(rq, p, 0, rf); - ttwu_stat(p, smp_processor_id(), 0); -out: - raw_spin_unlock(&p->pi_lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); + return ret; } /** @@ -2734,13 +3292,13 @@ */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_NORMAL, 0, 1); + return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); int wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state, 0, 1); + return try_to_wake_up(p, state, 0); } /* @@ -2765,6 +3323,8 @@ p->se.cfs_rq = NULL; #endif + trace_android_rvh_sched_fork_init(p); + #ifdef CONFIG_SCHEDSTATS /* Even if schedstat is disabled, there should not be garbage */ memset(&p->se.statistics, 0, sizeof(p->se.statistics)); @@ -2785,7 +3345,13 @@ INIT_HLIST_HEAD(&p->preempt_notifiers); #endif +#ifdef CONFIG_COMPACTION + p->capture_control = NULL; +#endif init_numa_balancing(clone_flags, p); +#ifdef CONFIG_SMP + p->wake_entry.u_flags = CSD_TYPE_TTWU; +#endif } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -2802,7 +3368,7 @@ #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; @@ -2876,8 +3442,8 @@ } #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; @@ -2905,7 +3471,7 @@ */ int sched_fork(unsigned long clone_flags, struct task_struct *p) { - unsigned long flags; + trace_android_rvh_sched_fork(p); __sched_fork(clone_flags, p); /* @@ -2919,6 +3485,7 @@ * Make sure we do not leak PI boosting priority to the child. */ p->prio = current->normal_prio; + trace_android_rvh_prepare_prio_fork(p); uclamp_fork(p); @@ -2933,8 +3500,8 @@ } else if (PRIO_TO_NICE(p->static_prio) < 0) p->static_prio = NICE_TO_PRIO(0); - p->prio = p->normal_prio = __normal_prio(p); - set_load_weight(p, false); + p->prio = p->normal_prio = p->static_prio; + set_load_weight(p); /* * We don't need the reset flag anymore after the fork. It has @@ -2951,24 +3518,8 @@ p->sched_class = &fair_sched_class; init_entity_runnable_average(&p->se); + trace_android_rvh_finish_prio_fork(p); - /* - * The child is not yet in the pid-hash so no cgroup attach races, - * and the cgroup is pinned to this child due to cgroup_fork() - * is ran before sched_fork(). - * - * Silence PROVE_RCU. - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - rseq_migrate(p); - /* - * We're setting the CPU for the first time, we don't migrate, - * so use __set_task_cpu(). - */ - __set_task_cpu(p, smp_processor_id()); - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) @@ -2983,6 +3534,41 @@ RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif return 0; +} + +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +{ + unsigned long flags; + + /* + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly + * required yet, but lockdep gets upset if rules are violated. + */ + raw_spin_lock_irqsave(&p->pi_lock, flags); +#ifdef CONFIG_CGROUP_SCHED + if (1) { + struct task_group *tg; + + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + tg = autogroup_task_group(p, tg); + p->sched_task_group = tg; + } +#endif + rseq_migrate(p); + /* + * We're setting the CPU for the first time, we don't migrate, + * so use __set_task_cpu(). + */ + __set_task_cpu(p, smp_processor_id()); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} + +void sched_post_fork(struct task_struct *p) +{ + uclamp_post_fork(p); } unsigned long to_ratio(u64 period, u64 runtime) @@ -3013,12 +3599,14 @@ struct rq_flags rf; struct rq *rq; + trace_android_rvh_wake_up_new_task(p); + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); p->state = TASK_RUNNING; #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: - * - cpus_allowed can change in the fork path + * - cpus_ptr can change in the fork path * - any previously selected CPU might disappear through hotplug * * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, @@ -3026,14 +3614,14 @@ */ p->recent_used_cpu = task_cpu(p); rseq_migrate(p); - __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); - post_init_entity_util_avg(&p->se); + post_init_entity_util_avg(p); + trace_android_rvh_new_task_stats(p); activate_task(rq, p, ENQUEUE_NOCLOCK); - p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -3143,8 +3731,10 @@ /* * Claim the task as running, we do this before switching to it * such that any running task will have this set. + * + * See the ttwu() WF_ON_CPU case and its ordering comment. */ - next->on_cpu = 1; + WRITE_ONCE(next->on_cpu, 1); #endif } @@ -3152,8 +3742,9 @@ { #ifdef CONFIG_SMP /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely + * This must be the very last reference to @prev from this CPU. After + * p->on_cpu is cleared, the task can be moved to a different CPU. We + * must ensure this doesn't happen until the switch is completely * finished. * * In particular, the load of prev->state in finish_task_switch() must @@ -3175,7 +3766,7 @@ * do an early lockdep release here: */ rq_unpin_lock(rq, rf); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + spin_release(&rq->lock.dep_map, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = next; @@ -3320,11 +3911,12 @@ * task and put them back on the free list. */ kprobe_flush_task(prev); + trace_android_rvh_flush_task(prev); /* Task is done with its stack. */ put_task_stack(prev); - put_task_struct(prev); + put_task_struct_rcu_user(prev); } tick_nohz_task_switch(); @@ -3403,12 +3995,8 @@ context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next, struct rq_flags *rf) { - struct mm_struct *mm, *oldmm; - prepare_task_switch(rq, prev, next); - mm = next->mm; - oldmm = prev->active_mm; /* * For paravirt, this is coupled with an exit in switch_to to * combine the page table reload and the switch backend into @@ -3417,22 +4005,37 @@ arch_start_context_switch(prev); /* - * If mm is non-NULL, we pass through switch_mm(). If mm is - * NULL, we will pass through mmdrop() in finish_task_switch(). - * Both of these contain the full memory barrier required by - * membarrier after storing to rq->curr, before returning to - * user-space. + * kernel -> kernel lazy + transfer active + * user -> kernel lazy + mmgrab() active + * + * kernel -> user switch + mmdrop() active + * user -> user switch */ - if (!mm) { - next->active_mm = oldmm; - mmgrab(oldmm); - enter_lazy_tlb(oldmm, next); - } else - switch_mm_irqs_off(oldmm, mm, next); + if (!next->mm) { // to kernel + enter_lazy_tlb(prev->active_mm, next); - if (!prev->mm) { - prev->active_mm = NULL; - rq->prev_mm = oldmm; + next->active_mm = prev->active_mm; + if (prev->mm) // from user + mmgrab(prev->active_mm); + else + prev->active_mm = NULL; + } else { // to user + membarrier_switch_mm(rq, prev->active_mm, next->mm); + /* + * sys_membarrier() requires an smp_mb() between setting + * rq->curr / membarrier_switch_mm() and returning to userspace. + * + * The below provides this either through switch_mm(), or in + * case 'prev->active_mm == next->mm' through + * finish_task_switch()'s mmdrop(). + */ + switch_mm_irqs_off(prev->active_mm, next->mm, next); + + if (!prev->mm) { // from kernel + /* will mmdrop() in finish_task_switch(). */ + rq->prev_mm = prev->active_mm; + prev->active_mm = NULL; + } } rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); @@ -3469,7 +4072,7 @@ * preemption, thus the result might have a time-of-check-to-time-of-use * race. The caller is responsible to use it correctly, for example: * - * - from a non-preemptable section (of course) + * - from a non-preemptible section (of course) * * - from a thread that is bound to a single CPU * @@ -3490,6 +4093,18 @@ sum += cpu_rq(i)->nr_switches; return sum; +} + +/* + * Consumers of these two interfaces, like for example the cpuidle menu + * governor, are using nonsensical data. Preferring shallow idle state selection + * for a CPU that has IO-wait which might not even end up running the task when + * it does become runnable. + */ + +unsigned long nr_iowait_cpu(int cpu) +{ + return atomic_read(&cpu_rq(cpu)->nr_iowait); } /* @@ -3527,29 +4142,9 @@ unsigned long i, sum = 0; for_each_possible_cpu(i) - sum += atomic_read(&cpu_rq(i)->nr_iowait); + sum += nr_iowait_cpu(i); return sum; -} - -/* - * Consumers of these two interfaces, like for example the cpufreq menu - * governor are using nonsensical data. Boosting frequency for a CPU that has - * IO-wait which might not even end up running the task when it does become - * runnable. - */ - -unsigned long nr_iowait_cpu(int cpu) -{ - struct rq *this = cpu_rq(cpu); - return atomic_read(&this->nr_iowait); -} - -void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) -{ - struct rq *rq = this_rq(); - *nr_waiters = atomic_read(&rq->nr_iowait); - *load = rq->load.weight; } #ifdef CONFIG_SMP @@ -3563,9 +4158,14 @@ struct task_struct *p = current; unsigned long flags; int dest_cpu; + bool cond = false; + + trace_android_rvh_sched_exec(&cond); + if (cond) + return; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; @@ -3648,6 +4248,7 @@ return ns; } +EXPORT_SYMBOL_GPL(task_sched_runtime); /* * This function gets called by the timer code, with HZ frequency. @@ -3659,14 +4260,18 @@ struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct rq_flags rf; + unsigned long thermal_pressure; + arch_scale_freq_tick(); sched_clock_tick(); rq_lock(rq, &rf); + trace_android_rvh_tick_entry(rq); update_rq_clock(rq); + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); curr->sched_class->task_tick(rq, curr, 0); - cpu_load_update_active(rq); calc_global_load_tick(rq); psi_task_tick(rq); @@ -3678,6 +4283,8 @@ rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); #endif + + trace_android_vh_scheduler_tick(rq); } #ifdef CONFIG_NO_HZ_FULL @@ -3735,28 +4342,31 @@ * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) + if (!tick_nohz_tick_stopped_cpu(cpu)) goto out_requeue; rq_lock_irq(rq, &rf); curr = rq->curr; - if (is_idle_task(curr) || cpu_is_offline(cpu)) + if (cpu_is_offline(cpu)) goto out_unlock; update_rq_clock(rq); - delta = rq_clock_task(rq) - curr->se.exec_start; - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + if (!is_idle_task(curr)) { + /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + delta = rq_clock_task(rq) - curr->se.exec_start; + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + } curr->sched_class->task_tick(rq, curr, 0); + calc_load_nohz_remote(rq); out_unlock: rq_unlock_irq(rq, &rf); - out_requeue: + /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough @@ -3820,7 +4430,7 @@ static inline void sched_tick_stop(int cpu) { } #endif -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ +#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) /* * If the value passed in is equal to the current preempt count @@ -3926,11 +4536,11 @@ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } - if (panic_on_warn) - panic("scheduling while atomic\n"); + check_panic_on_warn("scheduling while atomic"); + + trace_android_rvh_schedule_bug(prev); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); @@ -3939,11 +4549,23 @@ /* * Various schedule()-time debugging checks and statistics: */ -static inline void schedule_debug(struct task_struct *prev) +static inline void schedule_debug(struct task_struct *prev, bool preempt) { #ifdef CONFIG_SCHED_STACK_END_CHECK if (task_stack_end_corrupted(prev)) panic("corrupted stack end detected inside scheduler\n"); + + if (task_scs_end_corrupted(prev)) + panic("corrupted shadow stack detected inside scheduler\n"); +#endif + +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + if (!preempt && prev->state && prev->non_block_count) { + printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", + prev->comm, prev->pid, prev->non_block_count); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); + } #endif if (unlikely(in_atomic_preempt_off())) { @@ -3955,6 +4577,28 @@ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); schedstat_inc(this_rq()->sched_count); +} + +static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + const struct sched_class *class; + /* + * We must do the balancing pass before put_prev_task(), such + * that when we release the rq->lock the task is in the same + * state as before we took rq->lock. + * + * We can terminate the balance pass as soon as we know there is + * a runnable task of @class priority or higher. + */ + for_class_range(class, prev->sched_class, &idle_sched_class) { + if (class->balance(rq, prev, rf)) + break; + } +#endif + + put_prev_task(rq, prev); } /* @@ -3972,29 +4616,29 @@ * higher scheduling class, because otherwise those loose the * opportunity to pull in more work from other CPUs. */ - if (likely((prev->sched_class == &idle_sched_class || - prev->sched_class == &fair_sched_class) && + if (likely(prev->sched_class <= &fair_sched_class && rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev, rf); + p = pick_next_task_fair(rq, prev, rf); if (unlikely(p == RETRY_TASK)) - goto again; + goto restart; /* Assumes fair_sched_class->next == idle_sched_class */ - if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev, rf); + if (!p) { + put_prev_task(rq, prev); + p = pick_next_task_idle(rq); + } return p; } -again: +restart: + put_prev_task_balance(rq, prev, rf); + for_each_class(class) { - p = class->pick_next_task(rq, prev, rf); - if (p) { - if (unlikely(p == RETRY_TASK)) - goto again; + p = class->pick_next_task(rq); + if (p) return p; - } } /* The idle class should always have a runnable task: */ @@ -4021,7 +4665,7 @@ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets * called on the nearest possible occasion: * - * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * - If the kernel is preemptible (CONFIG_PREEMPTION=y): * * - in syscall or exception context, at the next outmost * preempt_enable(). (this might be as soon as the wake_up()'s @@ -4030,7 +4674,7 @@ * - in IRQ context, return from interrupt-handler to * preemptible context * - * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) * then at the next: * * - cond_resched() call @@ -4044,6 +4688,7 @@ { struct task_struct *prev, *next; unsigned long *switch_count; + unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; @@ -4052,7 +4697,7 @@ rq = cpu_rq(cpu); prev = rq->curr; - schedule_debug(prev); + schedule_debug(prev, preempt); if (sched_feat(HRTICK)) hrtick_clear(rq); @@ -4063,9 +4708,16 @@ /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) - * done by the caller to avoid the race with signal_wake_up(). + * done by the caller to avoid the race with signal_wake_up(): * - * The membarrier system call requires a full memory barrier + * __set_current_state(@state) signal_wake_up() + * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) + * wake_up_state(p, state) + * LOCK rq->lock LOCK p->pi_state + * smp_mb__after_spinlock() smp_mb__after_spinlock() + * if (signal_pending_state()) if (p->state & @state) + * + * Also, the membarrier system call requires a full memory barrier * after coming from user-space, before storing to rq->curr. */ rq_lock(rq, &rf); @@ -4076,29 +4728,43 @@ update_rq_clock(rq); switch_count = &prev->nivcsw; - if (!preempt && prev->state) { - if (unlikely(signal_pending_state(prev->state, prev))) { + + /* + * We must load prev->state once (task_struct::state is volatile), such + * that: + * + * - we form a control dependency vs deactivate_task() below. + * - ptrace_{,un}freeze_traced() can change ->state underneath us. + */ + prev_state = prev->state; + if (!preempt && prev_state) { + if (signal_pending_state(prev_state, prev)) { prev->state = TASK_RUNNING; } else { + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev->flags & PF_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); - prev->on_rq = 0; if (prev->in_iowait) { atomic_inc(&rq->nr_iowait); delayacct_blkio_start(); - } - - /* - * If a worker went to sleep, notify and ask workqueue - * whether it wants to wake up a task to maintain - * concurrency. - */ - if (prev->flags & PF_WQ_WORKER) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev); - if (to_wakeup) - try_to_wake_up_local(to_wakeup, &rf); } } switch_count = &prev->nvcsw; @@ -4108,9 +4774,14 @@ clear_tsk_need_resched(prev); clear_preempt_need_resched(); + trace_android_rvh_schedule(prev, next, rq); if (likely(prev != next)) { rq->nr_switches++; - rq->curr = next; + /* + * RCU users of rcu_dereference(rq->curr) may not see + * changes to task_struct made by pick_next_task(). + */ + RCU_INIT_POINTER(rq->curr, next); /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -4126,6 +4797,8 @@ * is a RELEASE barrier), */ ++*switch_count; + + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); trace_sched_switch(preempt, prev, next); @@ -4157,14 +4830,48 @@ static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state || tsk_is_pi_blocked(tsk)) + unsigned int task_flags; + + if (!tsk->state) return; + + task_flags = tsk->flags; + /* + * If a worker went to sleep, notify and ask workqueue whether + * it wants to wake up a task to maintain concurrency. + * As this function is called inside the schedule() context, + * we disable preemption to avoid it calling schedule() again + * in the possible wakeup of a kworker and because wq_worker_sleeping() + * requires it. + */ + if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + preempt_disable(); + if (task_flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + else + io_wq_worker_sleeping(tsk); + preempt_enable_no_resched(); + } + + if (tsk_is_pi_blocked(tsk)) + return; + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. */ if (blk_needs_flush_plug(tsk)) blk_schedule_flush_plug(tsk); +} + +static void sched_update_worker(struct task_struct *tsk) +{ + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); + else + io_wq_worker_running(tsk); + } } asmlinkage __visible void __sched schedule(void) @@ -4177,6 +4884,7 @@ __schedule(false); sched_preempt_enable_no_resched(); } while (need_resched()); + sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); @@ -4265,11 +4973,10 @@ } while (need_resched()); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. + * This is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. */ asmlinkage __visible void __sched notrace preempt_schedule(void) { @@ -4337,10 +5044,10 @@ } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ /* - * this is the entry point to schedule() from kernel preemption + * This is the entry point to schedule() from kernel preemption * off of irq context. * Note, that this is called and return with irqs disabled. This will * protect us against recursive calling from irq. @@ -4368,9 +5075,22 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - return try_to_wake_up(curr->private, mode, wake_flags, 1); + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC | WF_ANDROID_VENDOR)); + return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); + +static void __setscheduler_prio(struct task_struct *p, int prio) +{ + if (dl_prio(prio)) + p->sched_class = &dl_sched_class; + else if (rt_prio(prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + + p->prio = prio; +} #ifdef CONFIG_RT_MUTEXES @@ -4408,6 +5128,7 @@ struct rq_flags rf; struct rq *rq; + trace_android_rvh_rtmutex_prepare_setprio(p, pi_task); /* XXX used to be waiter->prio, not waiter->task->prio */ prio = __rt_effective_prio(pi_task, p->normal_prio); @@ -4482,31 +5203,29 @@ if (!dl_prio(p->normal_prio) || (pi_task && dl_prio(pi_task->prio) && dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.dl_boosted = 1; + p->dl.pi_se = pi_task->dl.pi_se; queue_flag |= ENQUEUE_REPLENISH; - } else - p->dl.dl_boosted = 0; - p->sched_class = &dl_sched_class; + } else { + p->dl.pi_se = &p->dl; + } } else if (rt_prio(prio)) { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (oldprio < prio) queue_flag |= ENQUEUE_HEAD; - p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (rt_prio(oldprio)) p->rt.timeout = 0; - p->sched_class = &fair_sched_class; } - p->prio = prio; + __setscheduler_prio(p, prio); if (queued) enqueue_task(rq, p, queue_flag); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -4526,12 +5245,13 @@ void set_user_nice(struct task_struct *p, long nice) { - bool queued, running; - int old_prio, delta; + bool queued, running, allowed = false; + int old_prio; struct rq_flags rf; struct rq *rq; - if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) + trace_android_rvh_set_user_nice(p, &nice, &allowed); + if ((task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) && !allowed) return; /* * We have to be careful, if called from sys_setpriority(), @@ -4558,22 +5278,21 @@ put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); + set_load_weight(p); old_prio = p->prio; p->prio = effective_prio(p); - delta = p->prio - old_prio; - if (queued) { + if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_curr(rq); - } if (running) - set_curr_task(rq, p); + set_next_task(rq, p); + + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + p->sched_class->prio_changed(rq, p, old_prio); + out_unlock: task_rq_unlock(rq, p, &rf); } @@ -4658,7 +5377,7 @@ return 0; #ifdef CONFIG_SMP - if (!llist_empty(&rq->wake_list)) + if (rq->ttwu_pending) return 0; #endif @@ -4681,6 +5400,7 @@ return 1; } +EXPORT_SYMBOL_GPL(available_idle_cpu); /** * idle_task - return the idle task for a given CPU. @@ -4732,36 +5452,7 @@ */ p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - set_load_weight(p, true); -} - -/* Actually do priority change: must hold pi & rq lock. */ -static void __setscheduler(struct rq *rq, struct task_struct *p, - const struct sched_attr *attr, bool keep_boost) -{ - /* - * If params can't change scheduling class changes aren't allowed - * either. - */ - if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) - return; - - __setscheduler_params(p, attr); - - /* - * Keep a potential priority boosting if called from - * sched_setscheduler(). - */ - p->prio = normal_prio(p); - if (keep_boost) - p->prio = rt_effective_prio(p, p->prio); - - if (dl_prio(p->prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(p->prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; + set_load_weight(p); } /* @@ -4784,15 +5475,14 @@ const struct sched_attr *attr, bool user, bool pi) { - int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : - MAX_RT_PRIO - 1 - attr->sched_priority; - int retval, oldprio, oldpolicy = -1, queued, running; - int new_effective_prio, policy = attr->sched_policy; + int oldpolicy = -1, policy = attr->sched_policy; + int retval, oldprio, newprio, queued, running; const struct sched_class *prev_class; struct rq_flags rf; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; + bool cpuset_locked = false; /* The pi code expects interrupts enabled */ BUG_ON(pi && in_interrupt()); @@ -4860,7 +5550,7 @@ * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ - if (idle_policy(p->policy) && !idle_policy(policy)) { + if (task_has_idle_policy(p) && !idle_policy(policy)) { if (!can_nice(p, task_nice(p))) return -EPERM; } @@ -4871,6 +5561,10 @@ /* Normal users shall not reset the sched_reset_on_fork flag: */ if (p->sched_reset_on_fork && !reset_on_fork) + return -EPERM; + + /* Can't change util-clamps */ + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) return -EPERM; } @@ -4891,6 +5585,15 @@ } /* + * SCHED_DEADLINE bandwidth accounting relies on stable cpusets + * information. + */ + if (dl_policy(policy) || dl_policy(p->policy)) { + cpuset_locked = true; + cpuset_lock(); + } + + /* * Make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: * @@ -4904,8 +5607,8 @@ * Changing the policy of the stop threads its a very bad idea: */ if (p == rq->stop) { - task_rq_unlock(rq, p, &rf); - return -EINVAL; + retval = -EINVAL; + goto unlock; } /* @@ -4923,8 +5626,8 @@ goto change; p->sched_reset_on_fork = reset_on_fork; - task_rq_unlock(rq, p, &rf); - return 0; + retval = 0; + goto unlock; } change: @@ -4937,8 +5640,8 @@ if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - task_rq_unlock(rq, p, &rf); - return -EPERM; + retval = -EPERM; + goto unlock; } #endif #ifdef CONFIG_SMP @@ -4951,10 +5654,10 @@ * the entire root_domain to become SCHED_DEADLINE. We * will also fail if there's no bandwidth available. */ - if (!cpumask_subset(span, &p->cpus_allowed) || + if (!cpumask_subset(span, p->cpus_ptr) || rq->rd->dl_bw.bw == 0) { - task_rq_unlock(rq, p, &rf); - return -EPERM; + retval = -EPERM; + goto unlock; } } #endif @@ -4964,6 +5667,8 @@ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; task_rq_unlock(rq, p, &rf); + if (cpuset_locked) + cpuset_unlock(); goto recheck; } @@ -4973,13 +5678,14 @@ * is available. */ if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { - task_rq_unlock(rq, p, &rf); - return -EBUSY; + retval = -EBUSY; + goto unlock; } p->sched_reset_on_fork = reset_on_fork; oldprio = p->prio; + newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice); if (pi) { /* * Take priority boosted tasks into account. If the new @@ -4988,8 +5694,8 @@ * the runqueue. This will be done when the task deboost * itself. */ - new_effective_prio = rt_effective_prio(p, newprio); - if (new_effective_prio == oldprio) + newprio = rt_effective_prio(p, newprio); + if (newprio == oldprio) queue_flags &= ~DEQUEUE_MOVE; } @@ -5002,7 +5708,11 @@ prev_class = p->sched_class; - __setscheduler(rq, p, attr, pi); + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + __setscheduler_prio(p, newprio); + trace_android_rvh_setscheduler(p); + } __setscheduler_uclamp(p, attr); if (queued) { @@ -5016,7 +5726,7 @@ enqueue_task(rq, p, queue_flags); } if (running) - set_curr_task(rq, p); + set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); @@ -5024,14 +5734,23 @@ preempt_disable(); task_rq_unlock(rq, p, &rf); - if (pi) + if (pi) { + if (cpuset_locked) + cpuset_unlock(); rt_mutex_adjust_pi(p); + } /* Run balance callbacks after we've adjusted the PI chain: */ balance_callback(rq); preempt_enable(); return 0; + +unlock: + task_rq_unlock(rq, p, &rf); + if (cpuset_locked) + cpuset_unlock(); + return retval; } static int _sched_setscheduler(struct task_struct *p, int policy, @@ -5043,6 +5762,14 @@ .sched_nice = PRIO_TO_NICE(p->static_prio), }; + if (IS_ENABLED(CONFIG_ROCKCHIP_OPTIMIZE_RT_PRIO) && + ((policy == SCHED_FIFO) || (policy == SCHED_RR))) { + attr.sched_priority /= 2; + if (!check) + attr.sched_priority += MAX_RT_PRIO / 2; + if (!attr.sched_priority) + attr.sched_priority = 1; + } /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; @@ -5057,6 +5784,8 @@ * @p: the task in question. * @policy: new policy. * @param: structure containing the new RT priority. + * + * Use sched_set_fifo(), read its comment. * * Return: 0 on success. An error code otherwise. * @@ -5079,6 +5808,7 @@ { return __sched_setscheduler(p, attr, false, true); } +EXPORT_SYMBOL_GPL(sched_setattr_nocheck); /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. @@ -5099,6 +5829,51 @@ return _sched_setscheduler(p, policy, param, false); } EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); + +/* + * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally + * incapable of resource management, which is the one thing an OS really should + * be doing. + * + * This is of course the reason it is limited to privileged users only. + * + * Worse still; it is fundamentally impossible to compose static priority + * workloads. You cannot take two correctly working static prio workloads + * and smash them together and still expect them to work. + * + * For this reason 'all' FIFO tasks the kernel creates are basically at: + * + * MAX_RT_PRIO / 2 + * + * The administrator _MUST_ configure the system, the kernel simply doesn't + * know enough information to make a sensible choice. + */ +void sched_set_fifo(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo); + +/* + * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. + */ +void sched_set_fifo_low(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = 1 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo_low); + +void sched_set_normal(struct task_struct *p, int nice) +{ + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + .sched_nice = nice, + }; + WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_normal); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) @@ -5130,9 +5905,6 @@ u32 size; int ret; - if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) - return -EFAULT; - /* Zero the full structure, so that a short copy will be nice: */ memset(attr, 0, sizeof(*attr)); @@ -5140,44 +5912,18 @@ if (ret) return ret; - /* Bail out on silly large: */ - if (size > PAGE_SIZE) - goto err_size; - /* ABI compatibility quirk: */ if (!size) size = SCHED_ATTR_SIZE_VER0; - - if (size < SCHED_ATTR_SIZE_VER0) + if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; } - - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && size < SCHED_ATTR_SIZE_VER1) @@ -5194,6 +5940,16 @@ err_size: put_user(sizeof(*attr), &uattr->size); return -E2BIG; +} + +static void get_params(struct task_struct *p, struct sched_attr *attr) +{ + if (task_has_dl_policy(p)) + __getparam_dl(p, attr); + else if (task_has_rt_policy(p)) + attr->sched_priority = p->rt_priority; + else + attr->sched_nice = task_nice(p); } /** @@ -5257,6 +6013,8 @@ rcu_read_unlock(); if (likely(p)) { + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); retval = sched_setattr(p, &attr); put_task_struct(p); } @@ -5350,7 +6108,7 @@ { unsigned int ksize = sizeof(*kattr); - if (!access_ok(VERIFY_WRITE, uattr, usize)) + if (!access_ok(uattr, usize)) return -EFAULT; /* @@ -5378,7 +6136,7 @@ * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. * @uattr: structure containing the extended parameters. - * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility. + * @usize: sizeof(attr) for fwd/bwd comp. * @flags: for future extension. */ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, @@ -5405,14 +6163,15 @@ kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - if (task_has_dl_policy(p)) - __getparam_dl(p, &kattr); - else if (task_has_rt_policy(p)) - kattr.sched_priority = p->rt_priority; - else - kattr.sched_nice = task_nice(p); + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. + */ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif @@ -5431,6 +6190,7 @@ cpumask_var_t cpus_allowed, new_mask; struct task_struct *p; int retval; + int skip = 0; rcu_read_lock(); @@ -5466,6 +6226,9 @@ rcu_read_unlock(); } + trace_android_vh_sched_setaffinity_early(p, in_mask, &skip); + if (skip) + goto out_free_new_mask; retval = security_task_setscheduler(p); if (retval) goto out_free_new_mask; @@ -5506,6 +6269,9 @@ goto again; } } + + trace_android_rvh_sched_setaffinity(p, in_mask, &retval); + out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: @@ -5514,7 +6280,6 @@ put_task_struct(p); return retval; } -EXPORT_SYMBOL_GPL(sched_setaffinity); static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, struct cpumask *new_mask) @@ -5569,7 +6334,7 @@ goto out_unlock; raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); + cpumask_and(mask, &p->cpus_mask, cpu_active_mask); raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: @@ -5598,14 +6363,14 @@ if (len & (sizeof(unsigned long)-1)) return -EINVAL; - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; ret = sched_getaffinity(pid, mask); if (ret == 0) { unsigned int retlen = min(len, cpumask_size()); - if (copy_to_user(user_mask_ptr, mask, retlen)) + if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen)) ret = -EFAULT; else ret = retlen; @@ -5633,6 +6398,8 @@ schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); + trace_android_rvh_do_sched_yield(rq); + preempt_disable(); rq_unlock_irq(rq, &rf); sched_preempt_enable_no_resched(); @@ -5646,7 +6413,7 @@ return 0; } -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION int __sched _cond_resched(void) { if (should_resched(0)) { @@ -5663,7 +6430,7 @@ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, * call schedule, and on return reacquire the lock. * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ @@ -5767,7 +6534,7 @@ if (task_running(p_rq, p) || p->state) goto out_unlock; - yielded = curr->sched_class->yield_to_task(rq, p, preempt); + yielded = curr->sched_class->yield_to_task(rq, p); if (yielded) { schedstat_inc(rq->yld_count); /* @@ -5933,7 +6700,7 @@ * an error code. */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) + struct __kernel_timespec __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); @@ -5944,16 +6711,15 @@ return retval; } -#ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, - compat_pid_t, pid, - struct compat_timespec __user *, interval) +#ifdef CONFIG_COMPAT_32BIT_TIME +SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, + struct old_timespec32 __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); if (retval == 0) - retval = compat_put_timespec64(&t, interval); + retval = put_old_timespec32(&t, interval); return retval; } #endif @@ -5966,10 +6732,10 @@ if (!try_get_task_stack(p)) return; - printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); + pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); if (p->state == TASK_RUNNING) - printk(KERN_CONT " running task "); + pr_cont(" running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif @@ -5978,12 +6744,13 @@ if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), ppid, + pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", + free, task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); - show_stack(p, NULL); + trace_android_vh_sched_show_task(p); + show_stack(p, NULL, KERN_INFO); put_task_stack(p); } EXPORT_SYMBOL_GPL(sched_show_task); @@ -6014,13 +6781,6 @@ { struct task_struct *g, *p; -#if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); -#else - printk(KERN_INFO - " task PC stack pid father\n"); -#endif rcu_read_lock(); for_each_process_thread(g, p) { /* @@ -6056,7 +6816,7 @@ * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void init_idle(struct task_struct *idle, int cpu) +void __init init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -6069,9 +6829,6 @@ idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; - - scs_task_reset(idle); - kasan_unpoison_task_stack(idle); #ifdef CONFIG_SMP /* @@ -6096,7 +6853,8 @@ __set_task_cpu(idle, cpu); rcu_read_unlock(); - rq->curr = rq->idle = idle; + rq->idle = idle; + rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP idle->on_cpu = 1; @@ -6133,8 +6891,7 @@ return ret; } -int task_can_attach(struct task_struct *p, - const struct cpumask *cs_cpus_allowed) +int task_can_attach(struct task_struct *p) { int ret = 0; @@ -6145,18 +6902,11 @@ * allowed nodes is unnecessary. Thus, cpusets are not * applicable for such threads. This prevents checking for * success of set_cpus_allowed_ptr() on all attached tasks - * before cpus_allowed may be changed. + * before cpus_mask may be changed. */ - if (p->flags & PF_NO_SETAFFINITY) { + if (p->flags & PF_NO_SETAFFINITY) ret = -EINVAL; - goto out; - } - if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_cpus_allowed)) - ret = dl_task_can_attach(p, cs_cpus_allowed); - -out: return ret; } @@ -6172,7 +6922,7 @@ if (curr_cpu == target_cpu) return 0; - if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) return -EINVAL; /* TODO: This is not properly updating schedstats */ @@ -6205,7 +6955,7 @@ if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -6246,21 +6996,22 @@ atomic_long_add(delta, &calc_load_tasks); } -static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) +static struct task_struct *__pick_migrate_task(struct rq *rq) { + const struct sched_class *class; + struct task_struct *next; + + for_each_class(class) { + next = class->pick_next_task(rq); + if (next) { + next->sched_class->put_prev_task(rq, next); + return next; + } + } + + /* The idle class should always have a runnable task */ + BUG(); } - -static const struct sched_class fake_sched_class = { - .put_prev_task = put_prev_task_fake, -}; - -static struct task_struct fake_task = { - /* - * Avoid pull_{rt,dl}_task() - */ - .prio = MAX_PRIO + 1, - .sched_class = &fake_sched_class, -}; /* * Migrate all tasks from the rq, sleeping tasks will be migrated by @@ -6269,11 +7020,14 @@ * Called with rq->lock held even though we'er in stop_machine() and * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. + * + * force: if false, the function will skip CPU pinned kthreads. */ -static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) +static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, bool force) { struct rq *rq = dead_rq; - struct task_struct *next, *stop = rq->stop; + struct task_struct *next, *tmp, *stop = rq->stop; + LIST_HEAD(percpu_kthreads); struct rq_flags orf = *rf; int dest_cpu; @@ -6295,6 +7049,11 @@ */ update_rq_clock(rq); +#ifdef CONFIG_SCHED_DEBUG + /* note the clock update in orf */ + orf.clock_update_flags |= RQCF_UPDATED; +#endif + for (;;) { /* * There's this thread running, bail when that's the only @@ -6303,15 +7062,24 @@ if (rq->nr_running == 1) break; - /* - * pick_next_task() assumes pinned rq->lock: - */ - next = pick_next_task(rq, &fake_task, rf); - BUG_ON(!next); - put_prev_task(rq, next); + next = __pick_migrate_task(rq); /* - * Rules for changing task_struct::cpus_allowed are holding + * Argh ... no iterator for tasks, we need to remove the + * kthread from the run-queue to continue. + */ + if (!force && is_per_cpu_kthread(next)) { + INIT_LIST_HEAD(&next->percpu_kthread_node); + list_add(&next->percpu_kthread_node, &percpu_kthreads); + + /* DEQUEUE_SAVE not used due to move_entity in rt */ + deactivate_task(rq, next, + DEQUEUE_NOCLOCK); + continue; + } + + /* + * Rules for changing task_struct::cpus_mask are holding * both pi_lock and rq->lock, such that holding either * stabilizes the mask. * @@ -6328,7 +7096,14 @@ * changed the task, WARN if weird stuff happened, because in * that case the above rq->lock drop is a fail too. */ - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { + if (task_rq(next) != rq || !task_on_rq_queued(next)) { + /* + * In the !force case, there is a hole between + * rq_unlock() and rq_relock(), where another CPU might + * not observe an up to date cpu_active_mask and try to + * move tasks around. + */ + WARN_ON(force); raw_spin_unlock(&next->pi_lock); continue; } @@ -6345,7 +7120,49 @@ raw_spin_unlock(&next->pi_lock); } + list_for_each_entry_safe(next, tmp, &percpu_kthreads, + percpu_kthread_node) { + + /* ENQUEUE_RESTORE not used due to move_entity in rt */ + activate_task(rq, next, ENQUEUE_NOCLOCK); + list_del(&next->percpu_kthread_node); + } + rq->stop = stop; +} + +static int drain_rq_cpu_stop(void *data) +{ + struct rq *rq = this_rq(); + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + migrate_tasks(rq, &rf, false); + rq_unlock_irqrestore(rq, &rf); + + return 0; +} + +int sched_cpu_drain_rq(unsigned int cpu) +{ + struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain); + struct cpu_stop_done *rq_drain_done = &(cpu_rq(cpu)->drain_done); + + if (idle_cpu(cpu)) { + rq_drain->done = NULL; + return 0; + } + + return stop_one_cpu_async(cpu, drain_rq_cpu_stop, NULL, rq_drain, + rq_drain_done); +} + +void sched_cpu_drain_rq_wait(unsigned int cpu) +{ + struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain); + + if (rq_drain->done) + cpu_stop_work_wait(rq_drain); } #endif /* CONFIG_HOTPLUG_CPU */ @@ -6417,8 +7234,10 @@ static int cpuset_cpu_inactive(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - if (dl_cpu_busy(cpu)) - return -EBUSY; + int ret = dl_bw_check_overflow(cpu); + + if (ret) + return ret; cpuset_update_active_cpus(); } else { num_cpus_frozen++; @@ -6467,19 +7286,27 @@ return 0; } -int sched_cpu_deactivate(unsigned int cpu) +int sched_cpus_activate(struct cpumask *cpus) +{ + unsigned int cpu; + + for_each_cpu(cpu, cpus) { + if (sched_cpu_activate(cpu)) { + for_each_cpu_and(cpu, cpus, cpu_active_mask) + sched_cpu_deactivate(cpu); + + return -EBUSY; + } + } + + return 0; +} + +int _sched_cpu_deactivate(unsigned int cpu) { int ret; set_cpu_active(cpu, false); - /* - * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU - * users of this state to go away such that all new such users will - * observe it. - * - * Do sync before park smpboot threads to take care the rcu boost case. - */ - synchronize_rcu_mult(call_rcu, call_rcu_sched); #ifdef CONFIG_SCHED_SMT /* @@ -6498,6 +7325,46 @@ return ret; } sched_domains_numa_masks_clear(cpu); + + update_max_interval(); + + return 0; +} + +int sched_cpu_deactivate(unsigned int cpu) +{ + int ret = _sched_cpu_deactivate(cpu); + + if (ret) + return ret; + + /* + * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU + * users of this state to go away such that all new such users will + * observe it. + * + * Do sync before park smpboot threads to take care the rcu boost case. + */ + synchronize_rcu(); + + return 0; +} + +int sched_cpus_deactivate_nosync(struct cpumask *cpus) +{ + unsigned int cpu; + + for_each_cpu(cpu, cpus) { + if (_sched_cpu_deactivate(cpu)) { + for_each_cpu(cpu, cpus) { + if (!cpu_active(cpu)) + sched_cpu_activate(cpu); + } + + return -EBUSY; + } + } + return 0; } @@ -6506,13 +7373,13 @@ struct rq *rq = cpu_rq(cpu); rq->calc_load_update = calc_load_update; - update_max_interval(); } int sched_cpu_starting(unsigned int cpu) { sched_rq_cpu_starting(cpu); sched_tick_start(cpu); + trace_android_rvh_sched_cpu_starting(cpu); return 0; } @@ -6523,7 +7390,6 @@ struct rq_flags rf; /* Handle pending wakeups and then migrate everything off */ - sched_ttwu_pending(); sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); @@ -6531,12 +7397,13 @@ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq, &rf); + migrate_tasks(rq, &rf, true); BUG_ON(rq->nr_running != 1); rq_unlock_irqrestore(rq, &rf); + trace_android_rvh_sched_cpu_dying(cpu); + calc_load_migrate(rq); - update_max_interval(); nohz_balance_exit_idle(rq); hrtick_clear(rq); return 0; @@ -6550,18 +7417,16 @@ /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot - * happen. The hotplug lock is nevertheless taken to satisfy lockdep, - * but there won't be any contention on it. + * happen. */ - cpus_read_lock(); mutex_lock(&sched_domains_mutex); sched_init_domains(cpu_active_mask); mutex_unlock(&sched_domains_mutex); - cpus_read_unlock(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) BUG(); + sched_init_granularity(); init_sched_rt_class(); @@ -6572,7 +7437,7 @@ static int __init migration_init(void) { - sched_rq_cpu_starting(smp_processor_id()); + sched_cpu_starting(smp_processor_id()); return 0; } early_initcall(migration_init); @@ -6597,7 +7462,9 @@ * Every task in system belongs to this group at bootup. */ struct task_group root_task_group; +EXPORT_SYMBOL_GPL(root_task_group); LIST_HEAD(task_groups); +EXPORT_SYMBOL_GPL(task_groups); /* Cacheline aligned slab cache for task_group */ static struct kmem_cache *task_group_cache __read_mostly; @@ -6608,19 +7475,27 @@ void __init sched_init(void) { - int i, j; - unsigned long alloc_size = 0, ptr; + unsigned long ptr = 0; + int i; + + /* Make sure the linker didn't screw up */ + BUG_ON(&idle_sched_class + 1 != &fair_sched_class || + &fair_sched_class + 1 != &rt_sched_class || + &rt_sched_class + 1 != &dl_sched_class); +#ifdef CONFIG_SMP + BUG_ON(&dl_sched_class + 1 != &stop_sched_class); +#endif wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); + ptr += 2 * nr_cpu_ids * sizeof(void **); #endif #ifdef CONFIG_RT_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); + ptr += 2 * nr_cpu_ids * sizeof(void **); #endif - if (alloc_size) { - ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); + if (ptr) { + ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.se = (struct sched_entity **)ptr; @@ -6629,6 +7504,8 @@ root_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + init_cfs_bandwidth(&root_task_group.cfs_bandwidth); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -6681,7 +7558,6 @@ init_rt_rq(&rq->rt); init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* @@ -6703,7 +7579,6 @@ * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -6711,10 +7586,6 @@ #ifdef CONFIG_RT_GROUP_SCHED init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif - - for (j = 0; j < CPU_LOAD_IDX_MAX; j++) - rq->cpu_load[j] = 0; - #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@ -6733,16 +7604,17 @@ rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON - rq->last_load_update_tick = jiffies; rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); + + rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); #endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); } - set_load_weight(&init_task, false); + set_load_weight(&init_task); /* * The boot idle thread does lazy MMU switching as well: @@ -6811,7 +7683,7 @@ rcu_sleep_check(); if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && - !is_idle_task(current)) || + !is_idle_task(current) && !current->non_block_count) || system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || oops_in_progress) return; @@ -6827,8 +7699,8 @@ "BUG: sleeping function called from invalid context at %s:%d\n", file, line); printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), + "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), current->non_block_count, current->pid, current->comm); if (task_stack_end_corrupted(current)) @@ -6840,13 +7712,43 @@ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } + + trace_android_rvh_schedule_bug(NULL); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL(___might_sleep); + +void __cant_sleep(const char *file, int line, int preempt_offset) +{ + static unsigned long prev_jiffy; + + if (irqs_disabled()) + return; + + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + if (preempt_count() > preempt_offset) + return; + + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); + printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +} +EXPORT_SYMBOL_GPL(__cant_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -6915,7 +7817,7 @@ #ifdef CONFIG_IA64 /** - * set_curr_task - set the current task for a given CPU. + * ia64_set_curr_task - set the current task for a given CPU. * @cpu: the processor in question. * @p: the task pointer to set. * @@ -7081,8 +7983,15 @@ if (queued) enqueue_task(rq, tsk, queue_flags); - if (running) - set_curr_task(rq, tsk); + if (running) { + set_next_task(rq, tsk); + /* + * After changing group, the running task may have joined a + * throttled one but it's still the running task. Trigger a + * resched to make sure that task can still run. + */ + resched_curr(rq); + } task_rq_unlock(rq, tsk, &rf); } @@ -7121,9 +8030,14 @@ #ifdef CONFIG_UCLAMP_TASK_GROUP /* Propagate the effective uclamp value for the new group */ + mutex_lock(&uclamp_mutex); + rcu_read_lock(); cpu_util_update_eff(css); + rcu_read_unlock(); + mutex_unlock(&uclamp_mutex); #endif + trace_android_rvh_cpu_cgroup_online(css); return 0; } @@ -7189,6 +8103,9 @@ if (ret) break; } + + trace_android_rvh_cpu_cgroup_can_attach(tset, &ret); + return ret; } @@ -7199,6 +8116,8 @@ cgroup_taskset_for_each(task, css, tset) sched_move_task(task); + + trace_android_rvh_cpu_cgroup_attach(tset); } #ifdef CONFIG_UCLAMP_TASK_GROUP @@ -7210,6 +8129,9 @@ unsigned int eff[UCLAMP_CNT]; enum uclamp_id clamp_id; unsigned int clamps; + + lockdep_assert_held(&uclamp_mutex); + SCHED_WARN_ON(!rcu_read_lock_held()); css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent @@ -7243,7 +8165,7 @@ } /* Immediately update descendants RUNNABLE tasks */ - uclamp_update_active_tasks(css, clamps); + uclamp_update_active_tasks(css); } } @@ -7300,6 +8222,8 @@ req = capacity_from_percent(buf); if (req.ret) return req.ret; + + static_branch_enable(&sched_uclamp_used); mutex_lock(&uclamp_mutex); rcu_read_lock(); @@ -7415,7 +8339,9 @@ static DEFINE_MUTEX(cfs_constraints_mutex); const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ -const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +/* More than 203 days if BW_SHIFT equals 20. */ +static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); @@ -7441,6 +8367,12 @@ * feasibility. */ if (period > max_cfs_quota_period) + return -EINVAL; + + /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (quota != RUNTIME_INF && quota > max_cfs_runtime) return -EINVAL; /* @@ -7495,7 +8427,7 @@ return ret; } -int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { u64 quota, period; @@ -7510,7 +8442,7 @@ return tg_set_cfs_bandwidth(tg, period, quota); } -long tg_get_cfs_quota(struct task_group *tg) +static long tg_get_cfs_quota(struct task_group *tg) { u64 quota_us; @@ -7523,7 +8455,7 @@ return quota_us; } -int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { u64 quota, period; @@ -7536,7 +8468,7 @@ return tg_set_cfs_bandwidth(tg, period, quota); } -long tg_get_cfs_period(struct task_group *tg) +static long tg_get_cfs_period(struct task_group *tg) { u64 cfs_period_us; @@ -8013,4 +8945,7 @@ /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -#undef CREATE_TRACE_POINTS +void call_trace_sched_update_nr_running(struct rq *rq, int count) +{ + trace_sched_update_nr_running_tp(rq, count); +} -- Gitblit v1.6.2