From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/kernel/sched/fair.c | 5267 ++++++++++++++++++++++++++++++++--------------------------- 1 files changed, 2,857 insertions(+), 2,410 deletions(-) diff --git a/kernel/kernel/sched/fair.c b/kernel/kernel/sched/fair.c index 15cb544..8c74c2f 100644 --- a/kernel/kernel/sched/fair.c +++ b/kernel/kernel/sched/fair.c @@ -20,12 +20,11 @@ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS -#include <linux/cpufreq.h> -#endif #include "sched.h" -#include <trace/events/sched.h> +#include <trace/hooks/sched.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_runtime); /* * Targeted preemption latency for CPU-bound tasks: @@ -41,17 +40,8 @@ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_latency = 6000000ULL; -unsigned int normalized_sysctl_sched_latency = 6000000ULL; - -/* - * Enable/disable honoring sync flag in energy-aware wakeups. - */ -unsigned int sysctl_sched_sync_hint_enable = 1; - -/* - * Enable/disable using cstate knowledge in idle sibling selection - */ -unsigned int sysctl_sched_cstate_aware = 1; +EXPORT_SYMBOL_GPL(sysctl_sched_latency); +static unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* * The initial- and re-scaling of tunables is configurable @@ -71,8 +61,9 @@ * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 750000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +unsigned int sysctl_sched_min_granularity = 750000ULL; +EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity); +static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity @@ -94,10 +85,23 @@ * * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + +int sched_thermal_decay_shift; +static int __init setup_sched_thermal_decay_shift(char *str) +{ + int _shift = 0; + + if (kstrtoint(str, 0, &_shift)) + pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); + + sched_thermal_decay_shift = clamp(_shift, 0, 10); + return 1; +} +__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); #ifdef CONFIG_SMP /* @@ -107,6 +111,14 @@ { return -cpu; } + +/* + * The margin used when comparing utilization with CPU capacity. + * + * (default: ~20%) + */ +#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) + #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -122,18 +134,6 @@ */ unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif - -#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS -unsigned int sysctl_sched_performance_bias = 1; -#endif - -/* - * The margin used when comparing utilization with CPU capacity: - * util * margin < capacity * 1024 - * - * (default: ~20%) - */ -unsigned int capacity_margin = 1280; static inline void update_load_add(struct load_weight *lw, unsigned long inc) { @@ -195,7 +195,7 @@ #undef SET_SYSCTL } -void sched_init_granularity(void) +void __init sched_init_granularity(void) { update_sysctl(); } @@ -246,8 +246,7 @@ } } - /* hint to use a 32x32->64 mul */ - fact = (u64)(u32)fact * lw->inv_weight; + fact = mul_u32_u32(fact, lw->inv_weight); while (fact >> 32) { fact >>= 1; @@ -290,6 +289,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) { return grp->my_q; +} + +static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) +{ + if (!path) + return; + + if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) + autogroup_path(cfs_rq->tg, path, len); + else if (cfs_rq && cfs_rq->tg->css.cgroup) + cgroup_path(cfs_rq->tg->css.cgroup, path, len); + else + strlcpy(path, "(null)", len); } static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) @@ -466,6 +478,12 @@ return NULL; } +static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) +{ + if (path) + strlcpy(path, "(null)", len); +} + static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { return true; @@ -567,6 +585,7 @@ struct sched_entity *entry; bool leftmost = true; + trace_android_rvh_enqueue_entity(cfs_rq, se); /* * Find the right place in the rbtree: */ @@ -592,6 +611,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { + trace_android_rvh_dequeue_entity(cfs_rq, se); rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); } @@ -631,8 +651,7 @@ */ int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); @@ -689,7 +708,13 @@ */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); + unsigned int nr_running = cfs_rq->nr_running; + u64 slice; + + if (sched_feat(ALT_PERIOD)) + nr_running = rq_of(cfs_rq)->cfs.h_nr_running; + + slice = __sched_period(nr_running + !se->on_rq); for_each_sched_entity(se) { struct load_weight *load; @@ -706,6 +731,10 @@ } slice = __calc_delta(slice, se->load.weight, load); } + + if (sched_feat(BASE_SLICE)) + slice = max(slice, (u64)sysctl_sched_min_granularity); + return slice; } @@ -734,26 +763,17 @@ memset(sa, 0, sizeof(*sa)); /* - * Tasks are intialized with full load to be seen as heavy tasks until + * Tasks are initialized with full load to be seen as heavy tasks until * they get a chance to stabilize to their real load level. - * Group entities are intialized with zero load to reflect the fact that + * Group entities are initialized with zero load to reflect the fact that * nothing has been attached to the task group yet. */ if (entity_is_task(se)) - sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); + sa->load_avg = scale_load_down(se->load.weight); - se->runnable_weight = se->load.weight; - -#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS - if (sysctl_sched_performance_bias) { - sa->util_avg = SCHED_CAPACITY_SCALE >> 1; - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; - } -#endif /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); static void attach_entity_cfs_rq(struct sched_entity *se); /* @@ -782,18 +802,15 @@ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) * if util_avg > util_avg_cap. */ -void post_init_entity_util_avg(struct sched_entity *se) +void post_init_entity_util_avg(struct task_struct *p) { + struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; - long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; -#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS - if (!sysctl_sched_performance_bias && (cap > 0)) { -#else if (cap > 0) { -#endif if (cfs_rq->avg.util_avg != 0) { sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; sa->util_avg /= (cfs_rq->avg.load_avg + 1); @@ -805,24 +822,25 @@ } } - if (entity_is_task(se)) { - struct task_struct *p = task_of(se); - if (p->sched_class != &fair_sched_class) { - /* - * For !fair tasks do: - * - update_cfs_rq_load_avg(now, cfs_rq); - attach_entity_load_avg(cfs_rq, se, 0); - switched_from_fair(rq, p); - * - * such that the next switched_to_fair() has the - * expected state. - */ - se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq); - return; - } + sa->runnable_avg = sa->util_avg; + + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq); + return; } + /* Hook before this se's util is attached to cfs_rq's util */ + trace_android_rvh_post_init_entity_util_avg(se); attach_entity_cfs_rq(se); } @@ -830,10 +848,10 @@ void init_entity_runnable_average(struct sched_entity *se) { } -void post_init_entity_util_avg(struct sched_entity *se) +void post_init_entity_util_avg(struct task_struct *p) { } -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +static void update_tg_load_avg(struct cfs_rq *cfs_rq) { } #endif /* CONFIG_SMP */ @@ -983,7 +1001,6 @@ } trace_sched_stat_blocked(tsk, delta); - trace_sched_blocked_reason(tsk); /* * Blocking time is in units of nanosecs, so shift by @@ -1078,7 +1095,7 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000; struct numa_group { - atomic_t refcount; + refcount_t refcount; spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; @@ -1094,7 +1111,7 @@ * more by CPU use than by memory faults. */ unsigned long *faults_cpu; - unsigned long faults[0]; + unsigned long faults[]; }; /* @@ -1164,7 +1181,7 @@ unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); - period *= atomic_read(&ng->refcount); + period *= refcount_read(&ng->refcount); period *= shared + 1; period /= private + shared + 1; } @@ -1189,7 +1206,7 @@ unsigned long private = group_faults_priv(ng); unsigned long period = smax; - period *= atomic_read(&ng->refcount); + period *= refcount_read(&ng->refcount); period *= shared + 1; period /= private + shared + 1; @@ -1199,56 +1216,15 @@ return max(smin, smax); } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) -{ - int mm_users = 0; - struct mm_struct *mm = p->mm; - - if (mm) { - mm_users = atomic_read(&mm->mm_users); - if (mm_users == 1) { - mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); - mm->numa_scan_seq = 0; - } - } - p->node_stamp = 0; - p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - RCU_INIT_POINTER(p->numa_group, NULL); - p->last_task_numa_placement = 0; - p->last_sum_exec_runtime = 0; - - /* New address space, reset the preferred nid */ - if (!(clone_flags & CLONE_VM)) { - p->numa_preferred_nid = -1; - return; - } - - /* - * New thread, keep existing numa_preferred_nid which should be copied - * already by arch_dup_task_struct but stagger when scans start. - */ - if (mm) { - unsigned int delay; - - delay = min_t(unsigned int, task_scan_max(current), - current->numa_scan_period * mm_users * NSEC_PER_MSEC); - delay += 2 * TICK_NSEC; - p->node_stamp = delay; - } -} - static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running += (p->numa_preferred_nid != -1); + rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); } static void account_numa_dequeue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running -= (p->numa_preferred_nid != -1); + rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } @@ -1474,7 +1450,7 @@ * two full passes of the "multi-stage node selection" test that is * executed below. */ - if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) && (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) return true; @@ -1527,55 +1503,52 @@ group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long weighted_cpuload(struct rq *rq); -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); +/* + * 'numa_type' describes the node at the moment of load balancing. + */ +enum numa_type { + /* The node has spare capacity that can be used to run more tasks. */ + node_has_spare = 0, + /* + * The node is fully used and the tasks don't compete for more CPU + * cycles. Nevertheless, some tasks might wait before running. + */ + node_fully_busy, + /* + * The node is overloaded and can't provide expected CPU cycles to all + * tasks. + */ + node_overloaded +}; /* Cached statistics for all CPUs within a node */ struct numa_stats { unsigned long load; - + unsigned long runnable; + unsigned long util; /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; - unsigned int nr_running; + unsigned int weight; + enum numa_type node_type; + int idle_cpu; }; -/* - * XXX borrowed from update_sg_lb_stats - */ -static void update_numa_stats(struct numa_stats *ns, int nid) +static inline bool is_core_idle(int cpu) { - int smt, cpu, cpus = 0; - unsigned long capacity; +#ifdef CONFIG_SCHED_SMT + int sibling; - memset(ns, 0, sizeof(*ns)); - for_each_cpu(cpu, cpumask_of_node(nid)) { - struct rq *rq = cpu_rq(cpu); + for_each_cpu(sibling, cpu_smt_mask(cpu)) { + if (cpu == sibling) + continue; - ns->nr_running += rq->nr_running; - ns->load += weighted_cpuload(rq); - ns->compute_capacity += capacity_of(cpu); - - cpus++; + if (!idle_cpu(sibling)) + return false; } +#endif - /* - * If we raced with hotplug and there are no CPUs left in our mask - * the @ns structure is NULL'ed and task_numa_compare() will - * not find this node attractive. - * - * We'll detect a huge imbalance and bail there. - */ - if (!cpus) - return; - - /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); - capacity = cpus / smt; /* cores */ - - capacity = min_t(unsigned, capacity, - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); + return true; } struct task_numa_env { @@ -1594,20 +1567,132 @@ int best_cpu; }; +static unsigned long cpu_load(struct rq *rq); +static unsigned long cpu_runnable(struct rq *rq); +static unsigned long cpu_util(int cpu); +static inline long adjust_numa_imbalance(int imbalance, int nr_running); + +static inline enum +numa_type numa_classify(unsigned int imbalance_pct, + struct numa_stats *ns) +{ + if ((ns->nr_running > ns->weight) && + (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) || + ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100)))) + return node_overloaded; + + if ((ns->nr_running < ns->weight) || + (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) && + ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100)))) + return node_has_spare; + + return node_fully_busy; +} + +#ifdef CONFIG_SCHED_SMT +/* Forward declarations of select_idle_sibling helpers */ +static inline bool test_idle_cores(int cpu, bool def); +static inline int numa_idle_core(int idle_core, int cpu) +{ + if (!static_branch_likely(&sched_smt_present) || + idle_core >= 0 || !test_idle_cores(cpu, false)) + return idle_core; + + /* + * Prefer cores instead of packing HT siblings + * and triggering future load balancing. + */ + if (is_core_idle(cpu)) + idle_core = cpu; + + return idle_core; +} +#else +static inline int numa_idle_core(int idle_core, int cpu) +{ + return idle_core; +} +#endif + +/* + * Gather all necessary information to make NUMA balancing placement + * decisions that are compatible with standard load balancer. This + * borrows code and logic from update_sg_lb_stats but sharing a + * common implementation is impractical. + */ +static void update_numa_stats(struct task_numa_env *env, + struct numa_stats *ns, int nid, + bool find_idle) +{ + int cpu, idle_core = -1; + + memset(ns, 0, sizeof(*ns)); + ns->idle_cpu = -1; + + rcu_read_lock(); + for_each_cpu(cpu, cpumask_of_node(nid)) { + struct rq *rq = cpu_rq(cpu); + + ns->load += cpu_load(rq); + ns->runnable += cpu_runnable(rq); + ns->util += cpu_util(cpu); + ns->nr_running += rq->cfs.h_nr_running; + ns->compute_capacity += capacity_of(cpu); + + if (find_idle && !rq->nr_running && idle_cpu(cpu)) { + if (READ_ONCE(rq->numa_migrate_on) || + !cpumask_test_cpu(cpu, env->p->cpus_ptr)) + continue; + + if (ns->idle_cpu == -1) + ns->idle_cpu = cpu; + + idle_core = numa_idle_core(idle_core, cpu); + } + } + rcu_read_unlock(); + + ns->weight = cpumask_weight(cpumask_of_node(nid)); + + ns->node_type = numa_classify(env->imbalance_pct, ns); + + if (idle_core >= 0) + ns->idle_cpu = idle_core; +} + static void task_numa_assign(struct task_numa_env *env, struct task_struct *p, long imp) { struct rq *rq = cpu_rq(env->dst_cpu); - /* Bail out if run-queue part of active NUMA balance. */ - if (xchg(&rq->numa_migrate_on, 1)) - return; + /* Check if run-queue part of active NUMA balance. */ + if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) { + int cpu; + int start = env->dst_cpu; + /* Find alternative idle CPU. */ + for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) { + if (cpu == env->best_cpu || !idle_cpu(cpu) || + !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { + continue; + } + + env->dst_cpu = cpu; + rq = cpu_rq(env->dst_cpu); + if (!xchg(&rq->numa_migrate_on, 1)) + goto assign; + } + + /* Failed to find an alternative idle CPU */ + return; + } + +assign: /* * Clear previous best_cpu/rq numa-migrate flag, since task now * found a better CPU to move/swap. */ - if (env->best_cpu != -1) { + if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) { rq = cpu_rq(env->best_cpu); WRITE_ONCE(rq->numa_migrate_on, 0); } @@ -1663,7 +1748,7 @@ * into account that it might be best if task running on the dst_cpu should * be exchanged with the source task */ -static void task_numa_compare(struct task_numa_env *env, +static bool task_numa_compare(struct task_numa_env *env, long taskimp, long groupimp, bool maymove) { struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); @@ -1674,12 +1759,13 @@ int dist = env->dist; long moveimp = imp; long load; + bool stopsearch = false; if (READ_ONCE(dst_rq->numa_migrate_on)) - return; + return false; rcu_read_lock(); - cur = task_rcu_dereference(&dst_rq->curr); + cur = rcu_dereference(dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; @@ -1687,8 +1773,10 @@ * Because we have preemption enabled we can get migrated around and * end try selecting ourselves (current == env->p) as a swap candidate. */ - if (cur == env->p) + if (cur == env->p) { + stopsearch = true; goto unlock; + } if (!cur) { if (maymove && moveimp >= env->best_imp) @@ -1697,18 +1785,27 @@ goto unlock; } + /* Skip this swap candidate if cannot move to the source cpu. */ + if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) + goto unlock; + + /* + * Skip this swap candidate if it is not moving to its preferred + * node and the best task is. + */ + if (env->best_task && + env->best_task->numa_preferred_nid == env->src_nid && + cur->numa_preferred_nid != env->src_nid) { + goto unlock; + } + /* * "imp" is the fault differential for the source task between the * source and destination node. Calculate the total differential for * the source task and potential destination task. The more negative * the value is, the more remote accesses that would be expected to * be incurred if the tasks were swapped. - */ - /* Skip this swap candidate if cannot move to the source cpu */ - if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) - goto unlock; - - /* + * * If dst and source tasks are in the same NUMA group, or not * in any group then look only at task weights. */ @@ -1735,9 +1832,31 @@ task_weight(cur, env->dst_nid, dist); } + /* Discourage picking a task already on its preferred node */ + if (cur->numa_preferred_nid == env->dst_nid) + imp -= imp / 16; + + /* + * Encourage picking a task that moves to its preferred node. + * This potentially makes imp larger than it's maximum of + * 1998 (see SMALLIMP and task_weight for why) but in this + * case, it does not matter. + */ + if (cur->numa_preferred_nid == env->src_nid) + imp += imp / 8; + if (maymove && moveimp > imp && moveimp > env->best_imp) { imp = moveimp; cur = NULL; + goto assign; + } + + /* + * Prefer swapping with a task moving to its preferred node over a + * task that is not. + */ + if (env->best_task && cur->numa_preferred_nid == env->src_nid && + env->best_task->numa_preferred_nid != env->src_nid) { goto assign; } @@ -1764,50 +1883,104 @@ goto unlock; assign: - /* - * One idle CPU per node is evaluated for a task numa move. - * Call select_idle_sibling to maybe find a better one. - */ + /* Evaluate an idle CPU for a task numa move. */ if (!cur) { + int cpu = env->dst_stats.idle_cpu; + + /* Nothing cached so current CPU went idle since the search. */ + if (cpu < 0) + cpu = env->dst_cpu; + /* - * select_idle_siblings() uses an per-CPU cpumask that - * can be used from IRQ context. + * If the CPU is no longer truly idle and the previous best CPU + * is, keep using it. */ - local_irq_disable(); - env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, - env->dst_cpu); - local_irq_enable(); + if (!idle_cpu(cpu) && env->best_cpu >= 0 && + idle_cpu(env->best_cpu)) { + cpu = env->best_cpu; + } + + env->dst_cpu = cpu; } task_numa_assign(env, cur, imp); + + /* + * If a move to idle is allowed because there is capacity or load + * balance improves then stop the search. While a better swap + * candidate may exist, a search is not free. + */ + if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu)) + stopsearch = true; + + /* + * If a swap candidate must be identified and the current best task + * moves its preferred node then stop the search. + */ + if (!maymove && env->best_task && + env->best_task->numa_preferred_nid == env->src_nid) { + stopsearch = true; + } unlock: rcu_read_unlock(); + + return stopsearch; } static void task_numa_find_cpu(struct task_numa_env *env, long taskimp, long groupimp) { - long src_load, dst_load, load; bool maymove = false; int cpu; - load = task_h_load(env->p); - dst_load = env->dst_stats.load + load; - src_load = env->src_stats.load - load; - /* - * If the improvement from just moving env->p direction is better - * than swapping tasks around, check if a move is possible. + * If dst node has spare capacity, then check if there is an + * imbalance that would be overruled by the load balancer. */ - maymove = !load_too_imbalanced(src_load, dst_load, env); + if (env->dst_stats.node_type == node_has_spare) { + unsigned int imbalance; + int src_running, dst_running; + + /* + * Would movement cause an imbalance? Note that if src has + * more running tasks that the imbalance is ignored as the + * move improves the imbalance from the perspective of the + * CPU load balancer. + * */ + src_running = env->src_stats.nr_running - 1; + dst_running = env->dst_stats.nr_running + 1; + imbalance = max(0, dst_running - src_running); + imbalance = adjust_numa_imbalance(imbalance, dst_running); + + /* Use idle CPU if there is no imbalance */ + if (!imbalance) { + maymove = true; + if (env->dst_stats.idle_cpu >= 0) { + env->dst_cpu = env->dst_stats.idle_cpu; + task_numa_assign(env, NULL, 0); + return; + } + } + } else { + long src_load, dst_load, load; + /* + * If the improvement from just moving env->p direction is better + * than swapping tasks around, check if a move is possible. + */ + load = task_h_load(env->p); + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; + maymove = !load_too_imbalanced(src_load, dst_load, env); + } for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ - if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) continue; env->dst_cpu = cpu; - task_numa_compare(env, taskimp, groupimp, maymove); + if (task_numa_compare(env, taskimp, groupimp, maymove)) + break; } } @@ -1861,10 +2034,10 @@ dist = env.dist = node_distance(env.src_nid, env.dst_nid); taskweight = task_weight(p, env.src_nid, dist); groupweight = group_weight(p, env.src_nid, dist); - update_numa_stats(&env.src_stats, env.src_nid); + update_numa_stats(&env, &env.src_stats, env.src_nid, false); taskimp = task_weight(p, env.dst_nid, dist) - taskweight; groupimp = group_weight(p, env.dst_nid, dist) - groupweight; - update_numa_stats(&env.dst_stats, env.dst_nid); + update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); /* Try to find a spot on the preferred nid. */ task_numa_find_cpu(&env, taskimp, groupimp); @@ -1897,7 +2070,7 @@ env.dist = dist; env.dst_nid = nid; - update_numa_stats(&env.dst_stats, env.dst_nid); + update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); task_numa_find_cpu(&env, taskimp, groupimp); } } @@ -1921,15 +2094,17 @@ } /* No better CPU than the current one was found. */ - if (env.best_cpu == -1) + if (env.best_cpu == -1) { + trace_sched_stick_numa(p, env.src_cpu, NULL, -1); return -EAGAIN; + } best_rq = cpu_rq(env.best_cpu); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) - trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); + trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu); return ret; } @@ -1937,7 +2112,7 @@ WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) - trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); + trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu); put_task_struct(env.best_task); return ret; } @@ -1948,7 +2123,7 @@ unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -2199,7 +2374,7 @@ static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1; + int seq, nid, max_nid = NUMA_NO_NODE; unsigned long max_faults = 0; unsigned long fault_types[2] = { 0, 0 }; unsigned long total_faults; @@ -2309,12 +2484,12 @@ static inline int get_numa_group(struct numa_group *grp) { - return atomic_inc_not_zero(&grp->refcount); + return refcount_inc_not_zero(&grp->refcount); } static inline void put_numa_group(struct numa_group *grp) { - if (atomic_dec_and_test(&grp->refcount)) + if (refcount_dec_and_test(&grp->refcount)) kfree_rcu(grp, rcu); } @@ -2335,7 +2510,7 @@ if (!grp) return; - atomic_set(&grp->refcount, 1); + refcount_set(&grp->refcount, 1); grp->active_nodes = 1; grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); @@ -2522,8 +2697,8 @@ local = 1; /* - * Retry task to preferred node migration periodically, in case it - * case it previously failed, or the scheduler moved us. + * Retry to migrate task to preferred node periodically, in case it + * previously failed, or the scheduler moved us. */ if (time_after(jiffies, p->numa_migrate_retry)) { task_numa_placement(p); @@ -2558,7 +2733,7 @@ * The expensive part of numa migration is done from task_work context. * Triggered from task_tick_numa(). */ -void task_numa_work(struct callback_head *work) +static void task_numa_work(struct callback_head *work) { unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; @@ -2571,7 +2746,7 @@ SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); - work->next = work; /* protect against double add */ + work->next = work; /* * Who cares about NUMA placement when they're dying. * @@ -2618,7 +2793,7 @@ return; - if (!down_read_trylock(&mm->mmap_sem)) + if (!mmap_read_trylock(mm)) return; vma = find_vma(mm, start); if (!vma) { @@ -2646,7 +2821,7 @@ * Skip inaccessible VMAs to avoid any confusion between * PROT_NONE and NUMA hinting ptes */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + if (!vma_is_accessible(vma)) continue; do { @@ -2686,7 +2861,7 @@ mm->numa_scan_offset = start; else reset_ptenuma_scan(p); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* * Make sure tasks use at least 32x as much time to run other code @@ -2700,10 +2875,54 @@ } } +void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ + int mm_users = 0; + struct mm_struct *mm = p->mm; + + if (mm) { + mm_users = atomic_read(&mm->mm_users); + if (mm_users == 1) { + mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + mm->numa_scan_seq = 0; + } + } + p->node_stamp = 0; + p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + /* Protect against double add, see task_tick_numa and task_numa_work */ + p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; + RCU_INIT_POINTER(p->numa_group, NULL); + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; + + init_task_work(&p->numa_work, task_numa_work); + + /* New address space, reset the preferred nid */ + if (!(clone_flags & CLONE_VM)) { + p->numa_preferred_nid = NUMA_NO_NODE; + return; + } + + /* + * New thread, keep existing numa_preferred_nid which should be copied + * already by arch_dup_task_struct but stagger when scans start. + */ + if (mm) { + unsigned int delay; + + delay = min_t(unsigned int, task_scan_max(current), + current->numa_scan_period * mm_users * NSEC_PER_MSEC); + delay += 2 * TICK_NSEC; + p->node_stamp = delay; + } +} + /* * Drive the periodic memory faults.. */ -void task_tick_numa(struct rq *rq, struct task_struct *curr) +static void task_tick_numa(struct rq *rq, struct task_struct *curr) { struct callback_head *work = &curr->numa_work; u64 period, now; @@ -2728,10 +2947,8 @@ curr->numa_scan_period = task_scan_start(curr); curr->node_stamp += period; - if (!time_before(jiffies, curr->mm->numa_next_scan)) { - init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ - task_work_add(curr, work, true); - } + if (!time_before(jiffies, curr->mm->numa_next_scan)) + task_work_add(curr, work, TWA_RESUME); } } @@ -2761,7 +2978,8 @@ * the preferred node. */ if (dst_nid == p->numa_preferred_nid || - (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + (p->numa_preferred_nid != NUMA_NO_NODE && + src_nid != p->numa_preferred_nid)) return; } @@ -2791,8 +3009,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) { struct rq *rq = rq_of(cfs_rq); @@ -2808,8 +3024,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); @@ -2856,26 +3070,18 @@ WRITE_ONCE(*ptr, res); \ } while (0) +/* + * Remove and clamp on negative, from a local variable. + * + * A variant of sub_positive(), which does not use explicit load-store + * and is thus optimized for local variable updates. + */ +#define lsub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + *ptr -= min_t(typeof(*ptr), *ptr, _val); \ +} while (0) + #ifdef CONFIG_SMP -static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->runnable_weight += se->runnable_weight; - - cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; - cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; -} - -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->runnable_weight -= se->runnable_weight; - - sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); - sub_positive(&cfs_rq->avg.runnable_load_sum, - se_runnable(se) * se->avg.runnable_load_sum); -} - static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -2891,45 +3097,36 @@ } #else static inline void -enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void -dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight, unsigned long runnable) + unsigned long weight) { if (se->on_rq) { /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); - account_entity_dequeue(cfs_rq, se); - dequeue_runnable_load_avg(cfs_rq, se); + update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); - se->runnable_weight = runnable; update_load_set(&se->load, weight); #ifdef CONFIG_SMP do { - u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; + u32 divider = get_pelt_divider(&se->avg); se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); - se->avg.runnable_load_avg = - div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); } while (0); #endif enqueue_load_avg(cfs_rq, se); - if (se->on_rq) { - account_entity_enqueue(cfs_rq, se); - enqueue_runnable_load_avg(cfs_rq, se); - } + if (se->on_rq) + update_load_add(&cfs_rq->load, se->load.weight); + } void reweight_task(struct task_struct *p, int prio) @@ -2939,7 +3136,7 @@ struct load_weight *load = &se->load; unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight, weight); + reweight_entity(cfs_rq, se, weight); load->inv_weight = sched_prio_to_wmult[prio]; } @@ -3051,50 +3248,6 @@ */ return clamp_t(long, shares, MIN_SHARES, tg_shares); } - -/* - * This calculates the effective runnable weight for a group entity based on - * the group entity weight calculated above. - * - * Because of the above approximation (2), our group entity weight is - * an load_avg based ratio (3). This means that it includes blocked load and - * does not represent the runnable weight. - * - * Approximate the group entity's runnable weight per ratio from the group - * runqueue: - * - * grq->avg.runnable_load_avg - * ge->runnable_weight = ge->load.weight * -------------------------- (7) - * grq->avg.load_avg - * - * However, analogous to above, since the avg numbers are slow, this leads to - * transients in the from-idle case. Instead we use: - * - * ge->runnable_weight = ge->load.weight * - * - * max(grq->avg.runnable_load_avg, grq->runnable_weight) - * ----------------------------------------------------- (8) - * max(grq->avg.load_avg, grq->load.weight) - * - * Where these max() serve both to use the 'instant' values to fix the slow - * from-idle and avoid the /0 on to-idle, similar to (6). - */ -static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) -{ - long runnable, load_avg; - - load_avg = max(cfs_rq->avg.load_avg, - scale_load_down(cfs_rq->load.weight)); - - runnable = max(cfs_rq->avg.runnable_load_avg, - scale_load_down(cfs_rq->runnable_weight)); - - runnable *= shares; - if (load_avg) - runnable /= load_avg; - - return clamp_t(long, runnable, MIN_SHARES, shares); -} #endif /* CONFIG_SMP */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); @@ -3106,7 +3259,7 @@ static void update_cfs_group(struct sched_entity *se) { struct cfs_rq *gcfs_rq = group_cfs_rq(se); - long shares, runnable; + long shares; if (!gcfs_rq) return; @@ -3115,16 +3268,15 @@ return; #ifndef CONFIG_SMP - runnable = shares = READ_ONCE(gcfs_rq->tg->shares); + shares = READ_ONCE(gcfs_rq->tg->shares); if (likely(se->load.weight == shares)) return; #else shares = calc_group_shares(gcfs_rq); - runnable = calc_group_runnable(gcfs_rq, shares); #endif - reweight_entity(cfs_rq_of(se), se, shares, runnable); + reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -3137,7 +3289,7 @@ { struct rq *rq = rq_of(cfs_rq); - if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) { + if (&rq->cfs == cfs_rq) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -3161,7 +3313,6 @@ /** * update_tg_load_avg - update the tg's load avg * @cfs_rq: the cfs_rq whose avg changed - * @force: update regardless of how small the difference * * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. * However, because tg->load_avg is a global value there are performance @@ -3173,7 +3324,7 @@ * * Updating tg's load_avg is necessary before update_cfs_share(). */ -static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; @@ -3183,11 +3334,9 @@ if (cfs_rq->tg == &root_task_group) return; - if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { + if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; - - trace_sched_load_tg(cfs_rq); } } @@ -3240,7 +3389,6 @@ se->avg.last_update_time = n_last_update_time; } - /* * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to * propagate its contribution. The key to this propagation is the invariant @@ -3251,11 +3399,11 @@ * _IFF_ we look at the pure running and runnable sums. Because they * represent the very same entity, just at different points in the hierarchy. * - * Per the above update_tg_cfs_util() is trivial and simply copies the running - * sum over (but still wrong, because the group entity and group rq do not have - * their PELT windows aligned). + * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial + * and simply copies the running/runnable sum over (but still wrong, because + * the group entity and group rq do not have their PELT windows aligned). * - * However, update_tg_cfs_runnable() is more complex. So we have: + * However, update_tg_cfs_load() is more complex. So we have: * * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) * @@ -3308,45 +3456,75 @@ * XXX: only do this for the part of runnable > running ? * */ - static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + u32 divider; /* Nothing to update */ if (!delta) return; /* - * The relation between sum and avg is: - * - * LOAD_AVG_MAX - 1024 + sa->period_contrib - * - * however, the PELT windows are not aligned between grq and gse. + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. */ + divider = get_pelt_divider(&cfs_rq->avg); /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + se->avg.util_sum = se->avg.util_avg * divider; /* Update parent cfs_rq utilization */ add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; } static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { + long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + u32 divider; + + /* Nothing to update */ + if (!delta) + return; + + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = get_pelt_divider(&cfs_rq->avg); + + /* Set new sched_entity's runnable */ + se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; + se->avg.runnable_sum = se->avg.runnable_avg * divider; + + /* Update parent cfs_rq runnable */ + add_positive(&cfs_rq->avg.runnable_avg, delta); + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; +} + +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) +{ long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; - unsigned long runnable_load_avg, load_avg; - u64 runnable_load_sum, load_sum = 0; + unsigned long load_avg; + u64 load_sum = 0; s64 delta_sum; + u32 divider; if (!runnable_sum) return; gcfs_rq->prop_runnable_sum = 0; + + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = get_pelt_divider(&cfs_rq->avg); if (runnable_sum >= 0) { /* @@ -3354,7 +3532,7 @@ * the CPU is saturated running == runnable. */ runnable_sum += se->avg.load_sum; - runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX); + runnable_sum = min_t(long, runnable_sum, divider); } else { /* * Estimate the new unweighted runnable_sum of the gcfs_rq by @@ -3379,7 +3557,7 @@ runnable_sum = max(runnable_sum, running_sum); load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, LOAD_AVG_MAX); + load_avg = div_s64(load_sum, divider); delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; delta_avg = load_avg - se->avg.load_avg; @@ -3388,19 +3566,6 @@ se->avg.load_avg = load_avg; add_positive(&cfs_rq->avg.load_avg, delta_avg); add_positive(&cfs_rq->avg.load_sum, delta_sum); - - runnable_load_sum = (s64)se_runnable(se) * runnable_sum; - runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); - delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum; - delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - - se->avg.runnable_load_sum = runnable_sum; - se->avg.runnable_load_avg = runnable_load_avg; - - if (se->on_rq) { - add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); - add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); - } } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3429,9 +3594,10 @@ update_tg_cfs_util(cfs_rq, se, gcfs_rq); update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); + update_tg_cfs_load(cfs_rq, se, gcfs_rq); - trace_sched_load_cfs_rq(cfs_rq); - trace_sched_load_se(se); + trace_pelt_cfs_tp(cfs_rq); + trace_pelt_se_tp(se); return 1; } @@ -3468,7 +3634,7 @@ #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} static inline int propagate_entity_load_avg(struct sched_entity *se) { @@ -3498,18 +3664,18 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0; + unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; struct sched_avg *sa = &cfs_rq->avg; int decayed = 0; if (cfs_rq->removed.nr) { unsigned long r; - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(&cfs_rq->avg); raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); swap(cfs_rq->removed.load_avg, removed_load); - swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); + swap(cfs_rq->removed.runnable_avg, removed_runnable); cfs_rq->removed.nr = 0; raw_spin_unlock(&cfs_rq->removed.lock); @@ -3520,8 +3686,29 @@ r = removed_util; sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * divider); + /* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * Check that util_sum is still above its lower bound for the new + * util_avg. Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ + sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); - add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); + r = removed_runnable; + sub_positive(&sa->runnable_avg, r); + sub_positive(&sa->runnable_sum, r * divider); + + /* + * removed_runnable is the unweighted version of removed_load so we + * can use it to estimate removed_load_sum. + */ + add_tg_cfs_propagate(cfs_rq, + -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT); decayed = 1; } @@ -3533,9 +3720,6 @@ cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (decayed) - cfs_rq_util_change(cfs_rq, 0); - return decayed; } @@ -3543,14 +3727,17 @@ * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach - * @flags: migration hints * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. */ -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + u32 divider = get_pelt_divider(&cfs_rq->avg); /* * When we attach the @se to the @cfs_rq, we must align the decay @@ -3570,23 +3757,25 @@ */ se->avg.util_sum = se->avg.util_avg * divider; - se->avg.load_sum = divider; - if (se_weight(se)) { - se->avg.load_sum = - div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); - } + se->avg.runnable_sum = se->avg.runnable_avg * divider; - se->avg.runnable_load_sum = se->avg.load_sum; + se->avg.load_sum = se->avg.load_avg * divider; + if (se_weight(se) < se->avg.load_sum) + se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); + else + se->avg.load_sum = 1; enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + cfs_rq->avg.runnable_avg += se->avg.runnable_avg; + cfs_rq->avg.runnable_sum += se->avg.runnable_sum; add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); - cfs_rq_util_change(cfs_rq, flags); + cfs_rq_util_change(cfs_rq, 0); - trace_sched_load_cfs_rq(cfs_rq); + trace_pelt_cfs_tp(cfs_rq); } /** @@ -3602,12 +3791,14 @@ dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); + sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); cfs_rq_util_change(cfs_rq, 0); - trace_sched_load_cfs_rq(cfs_rq); + trace_pelt_cfs_tp(cfs_rq); } /* @@ -3623,12 +3814,15 @@ u64 now = cfs_rq_clock_pelt(cfs_rq); int decayed; + trace_android_vh_prepare_update_load_avg_se(se, flags); /* * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) __update_load_avg_se(now, cfs_rq, se); + + trace_android_vh_finish_update_load_avg_se(se, flags); decayed = update_cfs_rq_load_avg(now, cfs_rq); decayed |= propagate_entity_load_avg(se); @@ -3642,11 +3836,15 @@ * * IOW we're enqueueing a task on a new CPU. */ - attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); - update_tg_load_avg(cfs_rq, 0); + attach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq); - } else if (decayed && (flags & UPDATE_TG)) - update_tg_load_avg(cfs_rq, 0); + } else if (decayed) { + cfs_rq_util_change(cfs_rq, 0); + + if (flags & UPDATE_TG) + update_tg_load_avg(cfs_rq); + } } #ifndef CONFIG_64BIT @@ -3674,20 +3872,22 @@ * Synchronize entity load avg of dequeued entity without locking * the previous rq. */ -void sync_entity_load_avg(struct sched_entity *se) +static void sync_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 last_update_time; last_update_time = cfs_rq_last_update_time(cfs_rq); + trace_android_vh_prepare_update_load_avg_se(se, 0); __update_load_avg_blocked_se(last_update_time, se); + trace_android_vh_finish_update_load_avg_se(se, 0); } /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). */ -void remove_entity_load_avg(struct sched_entity *se) +static void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); unsigned long flags; @@ -3696,10 +3896,6 @@ * tasks cannot exit without having gone through wake_up_new_task() -> * post_init_entity_util_avg() which will have added things to the * cfs_rq, so we can remove unconditionally. - * - * Similarly for groups, they will have passed through - * post_init_entity_util_avg() before unregister_sched_fair_group() - * calls this. */ sync_entity_load_avg(se); @@ -3708,13 +3904,13 @@ ++cfs_rq->removed.nr; cfs_rq->removed.util_avg += se->avg.util_avg; cfs_rq->removed.load_avg += se->avg.load_avg; - cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */ + cfs_rq->removed.runnable_avg += se->avg.runnable_avg; raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) +static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) { - return cfs_rq->avg.runnable_load_avg; + return cfs_rq->avg.runnable_avg; } static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) @@ -3722,7 +3918,7 @@ return cfs_rq->avg.load_avg; } -static int idle_balance(struct rq *this_rq, struct rq_flags *rf); +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); static inline unsigned long task_util(struct task_struct *p) { @@ -3733,23 +3929,25 @@ { struct util_est ue = READ_ONCE(p->se.avg.util_est); - return max(ue.ewma, ue.enqueued); + return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)); } -unsigned long task_util_est(struct task_struct *p) +static inline unsigned long task_util_est(struct task_struct *p) { return max(task_util(p), _task_util_est(p)); } #ifdef CONFIG_UCLAMP_TASK -static inline unsigned long uclamp_task_util(struct task_struct *p) +static inline unsigned long uclamp_task_util(struct task_struct *p, + unsigned long uclamp_min, + unsigned long uclamp_max) { - return clamp(task_util_est(p), - uclamp_eff_value(p, UCLAMP_MIN), - uclamp_eff_value(p, UCLAMP_MAX)); + return clamp(task_util_est(p), uclamp_min, uclamp_max); } #else -static inline unsigned long uclamp_task_util(struct task_struct *p) +static inline unsigned long uclamp_task_util(struct task_struct *p, + unsigned long uclamp_min, + unsigned long uclamp_max) { return task_util_est(p); } @@ -3765,13 +3963,29 @@ /* Update root cfs_rq's estimated utilization */ enqueued = cfs_rq->avg.util_est.enqueued; - enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED); + enqueued += _task_util_est(p); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); - /* Update plots for Task and CPU estimated utilization */ - trace_sched_util_est_task(p, &p->se.avg); - trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + trace_sched_util_est_cfs_tp(cfs_rq); } + +static inline void util_est_dequeue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); +} + +#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) /* * Check if a (signed) value is within a specified (unsigned) margin, @@ -3786,24 +4000,20 @@ return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); } -static void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) +static inline void util_est_update(struct cfs_rq *cfs_rq, + struct task_struct *p, + bool task_sleep) { - long last_ewma_diff; + long last_ewma_diff, last_enqueued_diff; struct util_est ue; - int cpu; + int ret = 0; + + trace_android_rvh_util_est_update(cfs_rq, p, task_sleep, &ret); + if (ret) + return; if (!sched_feat(UTIL_EST)) return; - - /* Update root cfs_rq's estimated utilization */ - ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, - (_task_util_est(p) | UTIL_AVG_UNCHANGED)); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); - - /* Update plots for CPU's estimated utilization */ - trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); /* * Skip update of task's estimated utilization when the task has not @@ -3820,11 +4030,13 @@ if (ue.enqueued & UTIL_AVG_UNCHANGED) return; + last_enqueued_diff = ue.enqueued; + /* * Reset EWMA on utilization increases, the moving average is used only * to smooth utilization decreases. */ - ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); + ue.enqueued = task_util(p); if (sched_feat(UTIL_EST_FASTUP)) { if (ue.ewma < ue.enqueued) { ue.ewma = ue.enqueued; @@ -3833,19 +4045,23 @@ } /* - * Skip update of task's estimated utilization when its EWMA is + * Skip update of task's estimated utilization when its members are * already ~1% close to its last activation value. */ last_ewma_diff = ue.enqueued - ue.ewma; - if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) + last_enqueued_diff -= ue.enqueued; + if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) { + if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN)) + goto done; + return; + } /* * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. */ - cpu = cpu_of(rq_of(cfs_rq)); - if (task_util(p) > capacity_orig_of(cpu)) + if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq)))) return; /* @@ -3869,49 +4085,166 @@ ue.ewma += last_ewma_diff; ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; done: + ue.enqueued |= UTIL_AVG_UNCHANGED; WRITE_ONCE(p->se.avg.util_est, ue); - /* Update plots for Task's estimated utilization */ - trace_sched_util_est_task(p, &p->se.avg); + trace_sched_util_est_se_tp(&p->se); } -static inline int task_fits_capacity(struct task_struct *p, long capacity) +static inline int util_fits_cpu(unsigned long util, + unsigned long uclamp_min, + unsigned long uclamp_max, + int cpu) { - return capacity * 1024 > uclamp_task_util(p) * capacity_margin; -} - -#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS -static inline bool task_fits_max(struct task_struct *p, int cpu) -{ + unsigned long capacity_orig, capacity_orig_thermal; unsigned long capacity = capacity_of(cpu); - unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val; + bool fits, uclamp_max_fits; - if (capacity == max_capacity) - return true; + /* + * Check if the real util fits without any uclamp boost/cap applied. + */ + fits = fits_capacity(util, capacity); - if (capacity * capacity_margin > max_capacity * 1024) - return true; + if (!uclamp_is_used()) + return fits; - return task_fits_capacity(p, capacity); + /* + * We must use capacity_orig_of() for comparing against uclamp_min and + * uclamp_max. We only care about capacity pressure (by using + * capacity_of()) for comparing against the real util. + * + * If a task is boosted to 1024 for example, we don't want a tiny + * pressure to skew the check whether it fits a CPU or not. + * + * Similarly if a task is capped to capacity_orig_of(little_cpu), it + * should fit a little cpu even if there's some pressure. + * + * Only exception is for thermal pressure since it has a direct impact + * on available OPP of the system. + * + * We honour it for uclamp_min only as a drop in performance level + * could result in not getting the requested minimum performance level. + * + * For uclamp_max, we can tolerate a drop in performance level as the + * goal is to cap the task. So it's okay if it's getting less. + * + * In case of capacity inversion, which is not handled yet, we should + * honour the inverted capacity for both uclamp_min and uclamp_max all + * the time. + */ + capacity_orig = capacity_orig_of(cpu); + capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); + + /* + * We want to force a task to fit a cpu as implied by uclamp_max. + * But we do have some corner cases to cater for.. + * + * + * C=z + * | ___ + * | C=y | | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max + * | C=x | | | | + * | ___ | | | | + * | | | | | | | (util somewhere in this region) + * | | | | | | | + * | | | | | | | + * +---------------------------------------- + * cpu0 cpu1 cpu2 + * + * In the above example if a task is capped to a specific performance + * point, y, then when: + * + * * util = 80% of x then it does not fit on cpu0 and should migrate + * to cpu1 + * * util = 80% of y then it is forced to fit on cpu1 to honour + * uclamp_max request. + * + * which is what we're enforcing here. A task always fits if + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig, + * the normal upmigration rules should withhold still. + * + * Only exception is when we are on max capacity, then we need to be + * careful not to block overutilized state. This is so because: + * + * 1. There's no concept of capping at max_capacity! We can't go + * beyond this performance level anyway. + * 2. The system is being saturated when we're operating near + * max capacity, it doesn't make sense to block overutilized. + */ + uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE); + uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig); + fits = fits || uclamp_max_fits; + + /* + * + * C=z + * | ___ (region a, capped, util >= uclamp_max) + * | C=y | | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max + * | C=x | | | | + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max) + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min + * | | | | | | | + * | | | | | | | (region c, boosted, util < uclamp_min) + * +---------------------------------------- + * cpu0 cpu1 cpu2 + * + * a) If util > uclamp_max, then we're capped, we don't care about + * actual fitness value here. We only care if uclamp_max fits + * capacity without taking margin/pressure into account. + * See comment above. + * + * b) If uclamp_min <= util <= uclamp_max, then the normal + * fits_capacity() rules apply. Except we need to ensure that we + * enforce we remain within uclamp_max, see comment above. + * + * c) If util < uclamp_min, then we are boosted. Same as (b) but we + * need to take into account the boosted value fits the CPU without + * taking margin/pressure into account. + * + * Cases (a) and (b) are handled in the 'fits' variable already. We + * just need to consider an extra check for case (c) after ensuring we + * handle the case uclamp_min > uclamp_max. + */ + uclamp_min = min(uclamp_min, uclamp_max); + if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE) + fits = fits && (uclamp_min <= capacity_orig_thermal); + + return fits; } -#endif + +static inline int task_fits_cpu(struct task_struct *p, int cpu) +{ + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); + unsigned long util = task_util_est(p); + return util_fits_cpu(util, uclamp_min, uclamp_max, cpu); +} static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { - if (!static_branch_unlikely(&sched_asym_cpucapacity)) + bool need_update = true; + + trace_android_rvh_update_misfit_status(p, rq, &need_update); + if (!static_branch_unlikely(&sched_asym_cpucapacity) || !need_update) return; - if (!p) { + if (!p || p->nr_cpus_allowed == 1) { rq->misfit_task_load = 0; return; } - if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + if (task_fits_cpu(p, cpu_of(rq))) { rq->misfit_task_load = 0; return; } - rq->misfit_task_load = task_h_load(p); + /* + * Make sure that misfit_task_load will not be null even if + * task_h_load() returns 0. + */ + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } #else /* CONFIG_SMP */ @@ -3928,11 +4261,11 @@ static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int idle_balance(struct rq *rq, struct rq_flags *rf) +static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -3941,8 +4274,11 @@ util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -3958,6 +4294,29 @@ if (d > 3*sysctl_sched_latency) schedstat_inc(cfs_rq->nr_spread_over); #endif +} + +static inline bool entity_is_long_sleeper(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + u64 sleep_time; + + if (se->exec_start == 0) + return false; + + cfs_rq = cfs_rq_of(se); + + sleep_time = rq_clock_task(rq_of(cfs_rq)); + + /* Happen while migrating because of clock task divergence */ + if (sleep_time <= se->exec_start) + return false; + + sleep_time -= se->exec_start; + if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) + return true; + + return false; } static void @@ -3988,8 +4347,30 @@ vruntime -= thresh; } - /* ensure we never gain time by being placed backwards. */ - se->vruntime = max_vruntime(se->vruntime, vruntime); + /* + * Pull vruntime of the entity being placed to the base level of + * cfs_rq, to prevent boosting it if placed backwards. + * However, min_vruntime can advance much faster than real time, with + * the extreme being when an entity with the minimal weight always runs + * on the cfs_rq. If the waking entity slept for a long time, its + * vruntime difference from min_vruntime may overflow s64 and their + * comparison may get inversed, so ignore the entity's original + * vruntime in that case. + * The maximal vruntime speedup is given by the ratio of normal to + * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES. + * When placing a migrated waking entity, its exec_start has been set + * from a different rq. In order to take into account a possible + * divergence between new and prev rq's clocks task because of irq and + * stolen time, we take an additional margin. + * So, cutting off on the sleep time of + * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days + * should be safe. + */ + if (entity_is_long_sleeper(se)) + se->vruntime = vruntime; + else + se->vruntime = max_vruntime(se->vruntime, vruntime); + trace_android_rvh_place_entity(cfs_rq, se, initial, vruntime); } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); @@ -4014,6 +4395,7 @@ #endif } +static inline bool cfs_bandwidth_used(void); /* * MIGRATION @@ -4078,12 +4460,15 @@ * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); + se_update_runnable(se); update_cfs_group(se); - enqueue_runnable_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); + /* Entity has migrated, no longer consider this task hot */ + if (flags & ENQUEUE_MIGRATED) + se->exec_start = 0; check_schedstat_required(); update_stats_enqueue(cfs_rq, se, flags); @@ -4092,10 +4477,16 @@ __enqueue_entity(cfs_rq, se); se->on_rq = 1; - if (cfs_rq->nr_running == 1) { + /* + * When bandwidth control is enabled, cfs might have been removed + * because of a parent been throttled but cfs->nr_running > 1. Try to + * add it unconditionnally. + */ + if (cfs_rq->nr_running == 1 || cfs_bandwidth_used()) list_add_leaf_cfs_rq(cfs_rq); + + if (cfs_rq->nr_running == 1) check_enqueue_throttle(cfs_rq); - } } static void __clear_buddies_last(struct sched_entity *se) @@ -4156,13 +4547,13 @@ /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Substract its load from the cfs_rq->runnable_avg. - * - Substract its previous weight from cfs_rq->load.weight. + * - Subtract its load from the cfs_rq->runnable_avg. + * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ update_load_avg(cfs_rq, se, UPDATE_TG); - dequeue_runnable_load_avg(cfs_rq, se); + se_update_runnable(se); update_stats_dequeue(cfs_rq, se, flags); @@ -4206,9 +4597,14 @@ unsigned long ideal_runtime, delta_exec; struct sched_entity *se; s64 delta; + bool skip_preempt = false; ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + trace_android_rvh_check_preempt_tick(current, &ideal_runtime, &skip_preempt, + delta_exec, cfs_rq, curr, sysctl_sched_min_granularity); + if (skip_preempt) + return; if (delta_exec > ideal_runtime) { resched_curr(rq_of(cfs_rq)); /* @@ -4237,8 +4633,7 @@ resched_curr(rq_of(cfs_rq)); } -static void -set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* 'current' is not kept within the tree. */ if (se->on_rq) { @@ -4260,7 +4655,8 @@ * least twice that of our own weight (i.e. dont track it * when there are only lesser-weight tasks around): */ - if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + if (schedstat_enabled() && + rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { schedstat_set(se->statistics.slice_max, max((u64)schedstat_val(se->statistics.slice_max), se->sum_exec_runtime - se->prev_sum_exec_runtime)); @@ -4268,6 +4664,8 @@ se->prev_sum_exec_runtime = se->sum_exec_runtime; } +EXPORT_SYMBOL_GPL(set_next_entity); + static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); @@ -4283,7 +4681,11 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { struct sched_entity *left = __pick_first_entity(cfs_rq); - struct sched_entity *se; + struct sched_entity *se = NULL; + + trace_android_rvh_pick_next_entity(cfs_rq, curr, &se); + if (se) + goto done; /* * If curr is set we have to see if its left of the leftmost entity @@ -4313,18 +4715,19 @@ se = second; } - /* - * Prefer last buddy, try to return the CPU to a preempted task. - */ - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) - se = cfs_rq->last; - - /* - * Someone really wants this to run. If it's not unfair, run it. - */ - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { + /* + * Someone really wants this to run. If it's not unfair, run it. + */ se = cfs_rq->next; + } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { + /* + * Prefer last buddy, try to return the CPU to a preempted task. + */ + se = cfs_rq->last; + } +done: clear_buddies(cfs_rq, se); return se; @@ -4457,26 +4860,17 @@ return &tg->cfs_bandwidth; } -/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; - - return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; -} - /* returns 0 on failure to allocate runtime */ -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, + struct cfs_rq *cfs_rq, u64 target_runtime) { - struct task_group *tg = cfs_rq->tg; - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount; + u64 min_amount, amount = 0; + + lockdep_assert_held(&cfs_b->lock); /* note: this is a positive sum as runtime_remaining <= 0 */ - min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; + min_amount = target_runtime - cfs_rq->runtime_remaining; - raw_spin_lock(&cfs_b->lock); if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { @@ -4488,11 +4882,23 @@ cfs_b->idle = 0; } } - raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; return cfs_rq->runtime_remaining > 0; +} + +/* returns 0 on failure to allocate runtime */ +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + int ret; + + raw_spin_lock(&cfs_b->lock); + ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); + raw_spin_unlock(&cfs_b->lock); + + return ret; } static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) @@ -4557,9 +4963,8 @@ cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - /* adjust cfs_rq_clock_task() */ - cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - - cfs_rq->throttled_clock_task; + cfs_rq->throttled_clock_pelt_time += rq_clock_task_mult(rq) - + cfs_rq->throttled_clock_pelt; /* Add cfs_rq with already running entity in the list */ if (cfs_rq->nr_running >= 1) @@ -4576,7 +4981,7 @@ /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task = rq_clock_task(rq); + cfs_rq->throttled_clock_pelt = rq_clock_task_mult(rq); list_del_leaf_cfs_rq(cfs_rq); } cfs_rq->throttle_count++; @@ -4584,13 +4989,33 @@ return 0; } -static void throttle_cfs_rq(struct cfs_rq *cfs_rq) +static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; - bool empty; + long task_delta, idle_task_delta, dequeue = 1; + + raw_spin_lock(&cfs_b->lock); + /* This will start the period timer if necessary */ + if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) { + /* + * We have raced with bandwidth becoming available, and if we + * actually throttled the timer might not unthrottle us for an + * entire period. We additionally needed to make sure that any + * subsequent check_cfs_rq_runtime calls agree not to throttle + * us, as we may commit to do cfs put_prev+pick_next, so we ask + * for 1ns of runtime rather than just check cfs_b. + */ + dequeue = 0; + } else { + list_add_tail_rcu(&cfs_rq->throttled_list, + &cfs_b->throttled_cfs_rq); + } + raw_spin_unlock(&cfs_b->lock); + + if (!dequeue) + return false; /* Throttle no longer required. */ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4600,15 +5025,22 @@ rcu_read_unlock(); task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) break; - if (dequeue) + if (dequeue) { dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + } else { + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); + } + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; if (qcfs_rq->load.weight) dequeue = 0; @@ -4617,29 +5049,13 @@ if (!se) sub_nr_running(rq, task_delta); + /* + * Note: distribution will already see us throttled via the + * throttled-list. rq->lock protects completion. + */ cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); - raw_spin_lock(&cfs_b->lock); - empty = list_empty(&cfs_b->throttled_cfs_rq); - - /* - * Add to the _head_ of the list, so that an already-started - * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is - * not running add to the tail so that later runqueues don't get starved. - */ - if (cfs_b->distribute_running) - list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - else - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - - /* - * If we're the first throttled task, make sure the bandwidth - * timer is running. - */ - if (empty) - start_cfs_bandwidth(cfs_b); - - raw_spin_unlock(&cfs_b->lock); + return true; } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) @@ -4647,8 +5063,7 @@ struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - int enqueue = 1; - long task_delta; + long task_delta, idle_task_delta; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4668,34 +5083,70 @@ return; task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { if (se->on_rq) - enqueue = 0; - + break; cfs_rq = cfs_rq_of(se); - if (enqueue) - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); - cfs_rq->h_nr_running += task_delta; + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) + goto unthrottle_throttle; + } + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); + + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto unthrottle_throttle; + + /* + * One parent has been throttled and cfs_rq removed from the + * list. Add it back to not break the leaf list. + */ + if (throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); + } + + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, task_delta); + +unthrottle_throttle: + /* + * The cfs_rq_throttled() breaks in the above iteration can result in + * incomplete leaf list maintenance, resulting in triggering the + * assertion below. + */ + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (list_add_leaf_cfs_rq(cfs_rq)) break; } assert_list_leaf_cfs_rq(rq); - - if (!se) - add_nr_running(rq, task_delta); /* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) resched_curr(rq); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) +static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) { struct cfs_rq *cfs_rq; - u64 runtime; - u64 starting_runtime = remaining; + u64 runtime, remaining = 1; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -4703,17 +5154,20 @@ struct rq *rq = rq_of(cfs_rq); struct rq_flags rf; - rq_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; /* By the above check, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + raw_spin_lock(&cfs_b->lock); runtime = -cfs_rq->runtime_remaining + 1; - if (runtime > remaining) - runtime = remaining; - remaining -= runtime; + if (runtime > cfs_b->runtime) + runtime = cfs_b->runtime; + cfs_b->runtime -= runtime; + remaining = cfs_b->runtime; + raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += runtime; @@ -4722,14 +5176,12 @@ unthrottle_cfs_rq(cfs_rq); next: - rq_unlock(rq, &rf); + rq_unlock_irqrestore(rq, &rf); if (!remaining) break; } rcu_read_unlock(); - - return starting_runtime - remaining; } /* @@ -4738,9 +5190,8 @@ * period the timer is deactivated until scheduling resumes; cfs_b->idle is * used to track this state. */ -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { - u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -4769,24 +5220,15 @@ cfs_b->nr_throttled += overrun; /* - * This check is repeated as we are holding onto the new bandwidth while - * we unthrottle. This can potentially race with an unthrottled group - * trying to acquire new bandwidth from the global pool. This can result - * in us over-using our runtime if it is all used during this loop, but - * only by limited amounts in that extreme case. + * This check is repeated as we release cfs_b->lock while we unthrottle. */ - while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { - runtime = cfs_b->runtime; - cfs_b->distribute_running = 1; - raw_spin_unlock(&cfs_b->lock); + while (throttled && cfs_b->runtime > 0) { + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime); - raw_spin_lock(&cfs_b->lock); + distribute_cfs_runtime(cfs_b); + raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - - cfs_b->runtime -= min(runtime, cfs_b->runtime); } /* @@ -4842,6 +5284,11 @@ if (runtime_refresh_within(cfs_b, min_left)) return; + /* don't push forwards an existing deferred unthrottle */ + if (cfs_b->slack_started) + return; + cfs_b->slack_started = true; + hrtimer_start(&cfs_b->slack_timer, ns_to_ktime(cfs_bandwidth_slack_period), HRTIMER_MODE_REL); @@ -4889,42 +5336,35 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); + unsigned long flags; /* confirm we're still not at a refresh boundary */ - raw_spin_lock(&cfs_b->lock); - if (cfs_b->distribute_running) { - raw_spin_unlock(&cfs_b->lock); - return; - } + raw_spin_lock_irqsave(&cfs_b->lock, flags); + cfs_b->slack_started = false; if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return; } if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - if (runtime) - cfs_b->distribute_running = 1; - - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime); + distribute_cfs_runtime(cfs_b); - raw_spin_lock(&cfs_b->lock); - cfs_b->runtime -= min(runtime, cfs_b->runtime); - cfs_b->distribute_running = 0; - raw_spin_unlock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } /* * When a group wakes up we want to make sure that its quota is not already * expired/exceeded, otherwise it may be allowed to steal additional ticks of - * runtime as update_curr() throttling can not not trigger until it's on-rq. + * runtime as update_curr() throttling can not trigger until it's on-rq. */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { @@ -4959,7 +5399,7 @@ pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_pelt = rq_clock_task_mult(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ @@ -4978,8 +5418,7 @@ if (cfs_rq_throttled(cfs_rq)) return true; - throttle_cfs_rq(cfs_rq); - return true; + return throttle_cfs_rq(cfs_rq); } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) @@ -4998,15 +5437,18 @@ { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); + unsigned long flags; int overrun; int idle = 0; int count = 0; - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); for (;;) { overrun = hrtimer_forward_now(timer, cfs_b->period); if (!overrun) break; + + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); @@ -5037,12 +5479,10 @@ /* reset count so we don't come right back in here */ count = 0; } - - idle = do_sched_cfs_period_timer(cfs_b, overrun); } if (idle) cfs_b->period_active = 0; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -5059,7 +5499,7 @@ cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; - cfs_b->distribute_running = 0; + cfs_b->slack_started = false; } static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) @@ -5154,11 +5594,6 @@ return false; } -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) -{ - return rq_clock_task(rq_of(cfs_rq)); -} - static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} @@ -5251,22 +5686,43 @@ #ifdef CONFIG_SMP static inline unsigned long cpu_util(int cpu); -static unsigned long capacity_of(int cpu); static inline bool cpu_overutilized(int cpu) { - return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); + unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + int overutilized = -1; + + trace_android_rvh_cpu_overutilized(cpu, &overutilized); + if (overutilized != -1) + return overutilized; + + return !util_fits_cpu(cpu_util(cpu), rq_util_min, rq_util_max, cpu); } static inline void update_overutilized_status(struct rq *rq) { if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); - trace_sched_overutilized(1); + trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); } } #else static inline void update_overutilized_status(struct rq *rq) { } +#endif + +/* Runqueue only has SCHED_IDLE tasks enqueued */ +static int sched_idle_rq(struct rq *rq) +{ + return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + rq->nr_running); +} + +#ifdef CONFIG_SMP +static int sched_idle_cpu(int cpu) +{ + return sched_idle_rq(cpu_rq(cpu)); +} #endif /* @@ -5279,12 +5735,9 @@ { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int idle_h_nr_running = task_has_idle_policy(p); int task_new = !(flags & ENQUEUE_WAKEUP); - -#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS - if (sysctl_sched_performance_bias) - cpufreq_task_boost(rq->cpu, task_util_est(p)); -#endif + int should_iowait_boost; /* * The code below (indirectly) updates schedutil which looks at @@ -5295,29 +5748,13 @@ util_est_enqueue(&rq->cfs, p); /* - * The code below (indirectly) updates schedutil which looks at - * the cfs_rq utilization to select a frequency. - * Let's update schedtune here to ensure the boost value of the - * current task is accounted for in the selection of the OPP. - * - * We do it also in the case where we enqueue a throttled task; - * we could argue that a throttled task should not boost a CPU, - * however: - * a) properly implementing CPU boosting considering throttled - * tasks will increase a lot the complexity of the solution - * b) it's not easy to quantify the benefits introduced by - * such a more complex solution. - * Thus, for the time being we go for the simple solution and boost - * also for throttled RQs. - */ - schedtune_enqueue_task(p, cpu_of(rq)); - - /* * If in_iowait is set, the code below may not trigger any cpufreq * utilization updates, so do it here explicitly with the IOWAIT flag * passed. */ - if (p->in_iowait) + should_iowait_boost = p->in_iowait; + trace_android_rvh_set_iowait(p, &should_iowait_boost); + if (should_iowait_boost) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); for_each_sched_entity(se) { @@ -5326,51 +5763,60 @@ cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, flags); - /* - * end evaluation on encountering a throttled cfs_rq - * - * note: in the case of encountering a throttled cfs_rq we will - * post the final h_nr_running increment below. - */ - if (cfs_rq_throttled(cfs_rq)) - break; cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto enqueue_throttle; flags = ENQUEUE_WAKEUP; } + trace_android_rvh_enqueue_task_fair(rq, p, flags); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - cfs_rq->h_nr_running++; - - if (cfs_rq_throttled(cfs_rq)) - break; update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); update_cfs_group(se); + + cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto enqueue_throttle; + + /* + * One parent has been throttled and cfs_rq removed from the + * list. Add it back to not break the leaf list. + */ + if (throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); } - if (!se) { - add_nr_running(rq, 1); - /* - * Since new tasks are assigned an initial util_avg equal to - * half of the spare capacity of their CPU, tiny tasks have the - * ability to cross the overutilized threshold, which will - * result in the load balancer ruining all the task placement - * done by EAS. As a way to mitigate that effect, do not account - * for the first enqueue operation of new tasks during the - * overutilized flag detection. - * - * A better way of solving this problem would be to wait for - * the PELT signals of tasks to converge before taking them - * into account, but that is not straightforward to implement, - * and the following generally works well enough in practice. - */ - if (!task_new) - update_overutilized_status(rq); + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, 1); - } + /* + * Since new tasks are assigned an initial util_avg equal to + * half of the spare capacity of their CPU, tiny tasks have the + * ability to cross the overutilized threshold, which will + * result in the load balancer ruining all the task placement + * done by EAS. As a way to mitigate that effect, do not account + * for the first enqueue operation of new tasks during the + * overutilized flag detection. + * + * A better way of solving this problem would be to wait for + * the PELT signals of tasks to converge before taking them + * into account, but that is not straightforward to implement, + * and the following generally works well enough in practice. + */ + if (!task_new) + update_overutilized_status(rq); +enqueue_throttle: if (cfs_bandwidth_used()) { /* * When bandwidth control is enabled; the cfs_rq_throttled() @@ -5403,28 +5849,21 @@ struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + int idle_h_nr_running = task_has_idle_policy(p); + bool was_sched_idle = sched_idle_rq(rq); - /* - * The code below (indirectly) updates schedutil which looks at - * the cfs_rq utilization to select a frequency. - * Let's update schedtune here to ensure the boost value of the - * current task is not more accounted for in the selection of the OPP. - */ - schedtune_dequeue_task(p, cpu_of(rq)); + util_est_dequeue(&rq->cfs, p); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); - /* - * end evaluation on encountering a throttled cfs_rq - * - * note: in the case of encountering a throttled cfs_rq we will - * post the final h_nr_running decrement below. - */ - if (cfs_rq_throttled(cfs_rq)) - break; cfs_rq->h_nr_running--; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto dequeue_throttle; /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -5441,21 +5880,32 @@ flags |= DEQUEUE_SLEEP; } + trace_android_rvh_dequeue_task_fair(rq, p, flags); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - cfs_rq->h_nr_running--; - - if (cfs_rq_throttled(cfs_rq)) - break; update_load_avg(cfs_rq, se, UPDATE_TG); + se_update_runnable(se); update_cfs_group(se); + + cfs_rq->h_nr_running--; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto dequeue_throttle; + } - if (!se) - sub_nr_running(rq, 1); + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, 1); - util_est_dequeue(&rq->cfs, p, task_sleep); + /* balance early to pull high priority tasks */ + if (unlikely(!was_sched_idle && sched_idle_rq(rq))) + rq->next_balance = jiffies; + +dequeue_throttle: + util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -5466,71 +5916,6 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); #ifdef CONFIG_NO_HZ_COMMON -/* - * per rq 'load' arrray crap; XXX kill this. - */ - -/* - * The exact cpuload calculated at every tick would be: - * - * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load - * - * If a CPU misses updates for n ticks (as it was idle) and update gets - * called on the n+1-th tick when CPU may be busy, then we have: - * - * load_n = (1 - 1/2^i)^n * load_0 - * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load - * - * decay_load_missed() below does efficient calculation of - * - * load' = (1 - 1/2^i)^n * load - * - * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. - * This allows us to precompute the above in said factors, thereby allowing the - * reduction of an arbitrary n in O(log_2 n) steps. (See also - * fixed_power_int()) - * - * The calculation is approximated on a 128 point scale. - */ -#define DEGRADE_SHIFT 7 - -static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - { 0, 0, 0, 0, 0, 0, 0, 0 }, - { 64, 32, 8, 0, 0, 0, 0, 0 }, - { 96, 72, 40, 12, 1, 0, 0, 0 }, - { 112, 98, 75, 43, 15, 1, 0, 0 }, - { 120, 112, 98, 76, 45, 16, 2, 0 } -}; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} static struct { cpumask_var_t idle_cpus_mask; @@ -5542,249 +5927,68 @@ #endif /* CONFIG_NO_HZ_COMMON */ -/** - * __cpu_load_update - update the rq->cpu_load[] statistics - * @this_rq: The rq to update statistics for - * @this_load: The current load - * @pending_updates: The number of missed updates - * - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). - * - * This function computes a decaying average: - * - * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load - * - * Because of NOHZ it might not get called on every tick which gives need for - * the @pending_updates argument. - * - * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 - * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load - * = A * (A * load[i]_n-2 + B) + B - * = A * (A * (A * load[i]_n-3 + B) + B) + B - * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B - * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B - * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B - * = (1 - 1/2^i)^n * (load[i]_0 - load) + load - * - * In the above we've assumed load_n := load, which is true for NOHZ_FULL as - * any change in load would have resulted in the tick being turned back on. - * - * For regular NOHZ, this reduces to: - * - * load[i]_n = (1 - 1/2^i)^n * load[i]_0 - * - * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra - * term. - */ -static void cpu_load_update(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) +static unsigned long cpu_load(struct rq *rq) { - unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0]; - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; -#ifdef CONFIG_NO_HZ_COMMON - old_load = decay_load_missed(old_load, pending_updates - 1, i); - if (tickless_load) { - old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); - /* - * old_load can never be a negative value because a - * decayed tickless_load cannot be greater than the - * original tickless_load. - */ - old_load += tickless_load; - } -#endif - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } -} - -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(struct rq *rq) -{ - return cfs_rq_runnable_load_avg(&rq->cfs); -} - -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we need to avoid the delta approach from the regular tick when - * possible since that would seriously skew the load calculation. This is why we - * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on - * jiffies deltas for updates happening while in nohz mode (idle ticks, idle - * loop exit, nohz_idle_balance, nohz full exit...) - * - * This means we might still be one tick off for nohz periods. - */ - -static void cpu_load_update_nohz(struct rq *this_rq, - unsigned long curr_jiffies, - unsigned long load) -{ - unsigned long pending_updates; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * In the regular NOHZ case, we were idle, this means load 0. - * In the NOHZ_FULL case, we were non-idle, we should consider - * its weighted load. - */ - cpu_load_update(this_rq, load, pending_updates); - } + return cfs_rq_load_avg(&rq->cfs); } /* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -static void cpu_load_update_idle(struct rq *this_rq) -{ - /* - * bail if there's load or we're actually up-to-date. - */ - if (weighted_cpuload(this_rq)) - return; - - cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); -} - -/* - * Record CPU load on nohz entry so we know the tickless load to account - * on nohz exit. cpu_load[0] happens then to be updated more frequently - * than other cpu_load[idx] but it should be fine as cpu_load readers - * shouldn't rely into synchronized cpu_load[*] updates. - */ -void cpu_load_update_nohz_start(void) -{ - struct rq *this_rq = this_rq(); - - /* - * This is all lockless but should be fine. If weighted_cpuload changes - * concurrently we'll exit nohz. And cpu_load write can race with - * cpu_load_update_idle() but both updater would be writing the same. - */ - this_rq->cpu_load[0] = weighted_cpuload(this_rq); -} - -/* - * Account the tickless load in the end of a nohz frame. - */ -void cpu_load_update_nohz_stop(void) -{ - unsigned long curr_jiffies = READ_ONCE(jiffies); - struct rq *this_rq = this_rq(); - unsigned long load; - struct rq_flags rf; - - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - load = weighted_cpuload(this_rq); - rq_lock(this_rq, &rf); - update_rq_clock(this_rq); - cpu_load_update_nohz(this_rq, curr_jiffies, load); - rq_unlock(this_rq, &rf); -} -#else /* !CONFIG_NO_HZ_COMMON */ -static inline void cpu_load_update_nohz(struct rq *this_rq, - unsigned long curr_jiffies, - unsigned long load) { } -#endif /* CONFIG_NO_HZ_COMMON */ - -static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) -{ -#ifdef CONFIG_NO_HZ_COMMON - /* See the mess around cpu_load_update_nohz(). */ - this_rq->last_load_update_tick = READ_ONCE(jiffies); -#endif - cpu_load_update(this_rq, load, 1); -} - -/* - * Called from scheduler_tick() - */ -void cpu_load_update_active(struct rq *this_rq) -{ - unsigned long load = weighted_cpuload(this_rq); - - if (tick_nohz_tick_stopped()) - cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); - else - cpu_load_update_periodic(this_rq, load); -} - -/* - * Return a low guess at the load of a migration-source CPU weighted - * according to the scheduling class and "nice" value. + * cpu_load_without - compute CPU load without any contributions from *p + * @cpu: the CPU which load is requested + * @p: the task which load should be discounted * - * We want to under-estimate the load of migration sources, to - * balance conservatively. + * The load of a CPU is defined by the load of tasks currently enqueued on that + * CPU as well as tasks which are currently sleeping after an execution on that + * CPU. + * + * This method returns the load of the specified CPU by discounting the load of + * the specified task, whenever the task is currently contributing to the CPU + * load. */ -static unsigned long source_load(int cpu, int type) +static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p) { - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(rq); + struct cfs_rq *cfs_rq; + unsigned int load; - if (type == 0 || !sched_feat(LB_BIAS)) - return total; + /* Task has no contribution or is new */ + if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_load(rq); - return min(rq->cpu_load[type-1], total); + cfs_rq = &rq->cfs; + load = READ_ONCE(cfs_rq->avg.load_avg); + + /* Discount task's util from CPU's util */ + lsub_positive(&load, task_h_load(p)); + + return load; } -/* - * Return a high guess at the load of a migration-target CPU weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) +static unsigned long cpu_runnable(struct rq *rq) { - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(rq); + return cfs_rq_runnable_avg(&rq->cfs); +} - if (type == 0 || !sched_feat(LB_BIAS)) - return total; +static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p) +{ + struct cfs_rq *cfs_rq; + unsigned int runnable; - return max(rq->cpu_load[type-1], total); + /* Task has no contribution or is new */ + if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_runnable(rq); + + cfs_rq = &rq->cfs; + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); + + /* Discount task's runnable from CPU's runnable */ + lsub_positive(&runnable, p->se.avg.runnable_avg); + + return runnable; } static unsigned long capacity_of(int cpu) { return cpu_rq(cpu)->cpu_capacity; -} - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = weighted_cpuload(rq); - - if (nr_running) - return load_avg / nr_running; - - return 0; } static void record_wakee(struct task_struct *p) @@ -5821,18 +6025,15 @@ * whatever is irrelevant, spread criteria is apparent partner count exceeds * socket size. */ -static int wake_wide(struct task_struct *p, int sibling_count_hint) +static int wake_wide(struct task_struct *p) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int llc_size = this_cpu_read(sd_llc_size); - - if (sibling_count_hint >= llc_size) - return 1; + int factor = __this_cpu_read(sd_llc_size); if (master < slave) swap(master, slave); - if (slave < llc_size || master < slave * llc_size) + if (slave < factor || master < slave * factor) return 0; return 1; } @@ -5880,7 +6081,7 @@ s64 this_eff_load, prev_eff_load; unsigned long task_load; - this_eff_load = target_load(this_cpu, sd->wake_idx); + this_eff_load = cpu_load(cpu_rq(this_cpu)); if (sync) { unsigned long current_load = task_h_load(current); @@ -5898,7 +6099,7 @@ this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); - prev_eff_load = source_load(prev_cpu, sd->wake_idx); + prev_eff_load = cpu_load(cpu_rq(prev_cpu)); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; @@ -5936,242 +6137,8 @@ return target; } -#ifdef CONFIG_SCHED_TUNE -struct reciprocal_value schedtune_spc_rdiv; - -static long -schedtune_margin(unsigned long signal, long boost) -{ - long long margin = 0; - - /* - * Signal proportional compensation (SPC) - * - * The Boost (B) value is used to compute a Margin (M) which is - * proportional to the complement of the original Signal (S): - * M = B * (SCHED_CAPACITY_SCALE - S) - * The obtained M could be used by the caller to "boost" S. - */ - if (boost >= 0) { - margin = SCHED_CAPACITY_SCALE - signal; - margin *= boost; - } else - margin = -signal * boost; - - margin = reciprocal_divide(margin, schedtune_spc_rdiv); - - if (boost < 0) - margin *= -1; - return margin; -} - -inline long -schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p) -{ - int boost = schedtune_cpu_boost_with(cpu, p); - long margin; - - if (boost == 0) - margin = 0; - else - margin = schedtune_margin(util, boost); - - trace_sched_boost_cpu(cpu, util, margin); - - return margin; -} - -long schedtune_task_margin(struct task_struct *task) -{ - int boost = schedtune_task_boost(task); - unsigned long util; - long margin; - - if (boost == 0) - return 0; - - util = task_util_est(task); - margin = schedtune_margin(util, boost); - - return margin; -} - -#else /* CONFIG_SCHED_TUNE */ - -inline long -schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p) -{ - return 0; -} - -#endif /* CONFIG_SCHED_TUNE */ - -static unsigned long cpu_util_without(int cpu, struct task_struct *p); - -static unsigned long capacity_spare_without(int cpu, struct task_struct *p) -{ - return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0); -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - * - * Assumes p is allowed on at least one CPU in sd. - */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) -{ - struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *most_spare_sg = NULL; - unsigned long min_runnable_load = ULONG_MAX; - unsigned long this_runnable_load = ULONG_MAX; - unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; - unsigned long most_spare = 0, this_spare = 0; - int load_idx = sd->forkexec_idx; - int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; - unsigned long imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - - if (sd_flag & SD_BALANCE_WAKE) - load_idx = sd->wake_idx; - - do { - unsigned long load, avg_load, runnable_load; - unsigned long spare_cap, max_spare_cap; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_span(group), - &p->cpus_allowed)) - continue; - -#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS - if (sysctl_sched_performance_bias) { - if (!task_fits_max(p, group_first_cpu(group))) - continue; - } -#endif - - local_group = cpumask_test_cpu(this_cpu, - sched_group_span(group)); - - /* - * Tally up the load of all CPUs in the group and find - * the group containing the CPU with most spare capacity. - */ - avg_load = 0; - runnable_load = 0; - max_spare_cap = 0; - - for_each_cpu(i, sched_group_span(group)) { - /* Bias balancing toward CPUs of our domain */ - if (local_group) - load = source_load(i, load_idx); - else - load = target_load(i, load_idx); - - runnable_load += load; - - avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); - - spare_cap = capacity_spare_without(i, p); - - if (spare_cap > max_spare_cap) - max_spare_cap = spare_cap; - } - - /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - - if (local_group) { - this_runnable_load = runnable_load; - this_avg_load = avg_load; - this_spare = max_spare_cap; - } else { - if (min_runnable_load > (runnable_load + imbalance)) { - /* - * The runnable load is significantly smaller - * so we can pick this new CPU: - */ - min_runnable_load = runnable_load; - min_avg_load = avg_load; - idlest = group; - } else if ((runnable_load < (min_runnable_load + imbalance)) && - (100*min_avg_load > imbalance_scale*avg_load)) { - /* - * The runnable loads are close so take the - * blocked load into account through avg_load: - */ - min_avg_load = avg_load; - idlest = group; - } - - if (most_spare < max_spare_cap) { - most_spare = max_spare_cap; - most_spare_sg = group; - } - } - } while (group = group->next, group != sd->groups); - - /* - * The cross-over point between using spare capacity or least load - * is too conservative for high utilization tasks on partially - * utilized systems if we require spare_capacity > task_util(p), - * so we allow for some task stuffing by using - * spare_capacity > task_util(p)/2. - * - * Spare capacity can't be used for fork because the utilization has - * not been set yet, we must first select a rq to compute the initial - * utilization. - */ - if (sd_flag & SD_BALANCE_FORK) - goto skip_spare; - - if (this_spare > task_util(p) / 2 && - imbalance_scale*this_spare > 100*most_spare) - return NULL; - - if (most_spare > task_util(p) / 2) - return most_spare_sg; - -skip_spare: - if (!idlest) - return NULL; - -#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS - if (sysctl_sched_performance_bias) { - if ((this_runnable_load == ULONG_MAX) || (this_avg_load == ULONG_MAX)) - return idlest; - } -#endif - - /* - * When comparing groups across NUMA domains, it's possible for the - * local domain to be very lightly loaded relative to the remote - * domains but "imbalance" skews the comparison making remote CPUs - * look much more favourable. When considering cross-domain, add - * imbalance to the runnable load on the remote node and consider - * staying local. - */ - if ((sd->flags & SD_NUMA) && - min_runnable_load + imbalance >= this_runnable_load) - return NULL; - - if (min_runnable_load > (this_runnable_load + imbalance)) - return NULL; - - if ((this_runnable_load < (min_runnable_load + imbalance)) && - (100*this_avg_load < imbalance_scale*min_avg_load)) - return NULL; - - return idlest; -} +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. @@ -6191,7 +6158,10 @@ return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + if (sched_idle_cpu(i)) + return i; + if (available_idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -6215,7 +6185,7 @@ shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { - load = weighted_cpuload(cpu_rq(i)); + load = cpu_load(cpu_rq(i)); if (load < min_load) { min_load = load; least_loaded_cpu = i; @@ -6231,11 +6201,11 @@ { int new_cpu = cpu; - if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) return prev_cpu; /* - * We need task's util for capacity_spare_without, sync it up to + * We need task's util for cpu_util_without, sync it up to * prev_cpu's last_update_time. */ if (!(sd_flag & SD_BALANCE_FORK)) @@ -6251,7 +6221,7 @@ continue; } - group = find_idlest_group(sd, p, cpu, sd_flag); + group = find_idlest_group(sd, p, cpu); if (!group) { sd = sd->child; continue; @@ -6348,16 +6318,18 @@ if (!test_idle_cores(target, false)) return -1; - cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); for_each_cpu_wrap(core, cpus, target) { bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { - cpumask_clear_cpu(cpu, cpus); - if (!available_idle_cpu(cpu)) + if (!available_idle_cpu(cpu)) { idle = false; + break; + } } + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); if (idle) return core; @@ -6382,9 +6354,10 @@ return -1; for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr) || + !cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; - if (available_idle_cpu(cpu)) + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; } @@ -6415,8 +6388,8 @@ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); struct sched_domain *this_sd; u64 avg_cost, avg_idle; - u64 time, cost; - s64 delta; + u64 time; + int this = smp_processor_id(); int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); @@ -6441,23 +6414,68 @@ nr = 4; } - time = local_clock(); + time = cpu_clock(this); - cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) return -1; - if (available_idle_cpu(cpu)) + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) break; } - time = local_clock() - time; - cost = this_sd->avg_scan_cost; - delta = (s64)(time - cost) / 8; - this_sd->avg_scan_cost += delta; + time = cpu_clock(this) - time; + update_avg(&this_sd->avg_scan_cost, time); return cpu; +} + +/* + * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which + * the task fits. If no CPU is big enough, but there are idle ones, try to + * maximize capacity. + */ +static int +select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) +{ + unsigned long task_util, util_min, util_max, best_cap = 0; + int cpu, best_cpu = -1; + struct cpumask *cpus; + + cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + + task_util = task_util_est(p); + util_min = uclamp_eff_value(p, UCLAMP_MIN); + util_max = uclamp_eff_value(p, UCLAMP_MAX); + + for_each_cpu_wrap(cpu, cpus, target) { + unsigned long cpu_cap = capacity_of(cpu); + + if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + continue; + if (util_fits_cpu(task_util, util_min, util_max, cpu)) + return cpu; + + if (cpu_cap > best_cap) { + best_cap = cpu_cap; + best_cpu = cpu; + } + } + + return best_cpu; +} + +static inline bool asym_fits_cpu(unsigned long util, + unsigned long util_min, + unsigned long util_max, + int cpu) +{ + if (static_branch_unlikely(&sched_asym_cpucapacity)) + return util_fits_cpu(util, util_min, util_max, cpu); + + return true; } /* @@ -6466,24 +6484,56 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; + unsigned long task_util, util_min, util_max; int i, recent_used_cpu; - if (available_idle_cpu(target)) + /* + * On asymmetric system, update task utilization because we will check + * that the task fits with cpu's capacity. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + sync_entity_load_avg(&p->se); + task_util = task_util_est(p); + util_min = uclamp_eff_value(p, UCLAMP_MIN); + util_max = uclamp_eff_value(p, UCLAMP_MAX); + } + + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + asym_fits_cpu(task_util, util_min, util_max, target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ - if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev)) + if (prev != target && cpus_share_cache(prev, target) && + (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + asym_fits_cpu(task_util, util_min, util_max, prev)) return prev; + + /* + * Allow a per-cpu kthread to stack with the wakee if the + * kworker thread and the tasks previous CPUs are the same. + * The assumption is that the wakee queued work for the + * per-cpu kthread that is now complete and the wakeup is + * essentially a sync wakeup. An obvious example of this + * pattern is IO completions. + */ + if (is_per_cpu_kthread(current) && + in_task() && + prev == smp_processor_id() && + this_rq()->nr_running <= 1 && + asym_fits_cpu(task_util, util_min, util_max, prev)) { + return prev; + } /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - available_idle_cpu(recent_used_cpu) && - cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { + (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && + cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && + asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { /* * Replace recent_used_cpu with prev as it is a potential * candidate for the next wake: @@ -6492,6 +6542,32 @@ return recent_used_cpu; } + if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) { + if (rockchip_perf_get_level() == ROCKCHIP_PERFORMANCE_HIGH) + goto sd_llc; + } + + /* + * For asymmetric CPU capacity systems, our domain of interest is + * sd_asym_cpucapacity rather than sd_llc. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + /* + * On an asymmetric CPU capacity system where an exclusive + * cpuset defines a symmetric island (i.e. one unique + * capacity_orig value through the cpuset), the key will be set + * but the CPUs within that cpuset will not have a domain with + * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric + * capacity path. + */ + if (sd) { + i = select_idle_capacity(p, sd, target); + return ((unsigned)i < nr_cpumask_bits) ? i : target; + } + } + +sd_llc: sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target; @@ -6589,7 +6665,7 @@ util = READ_ONCE(cfs_rq->avg.util_avg); /* Discount task's util from CPU's util */ - util -= min_t(unsigned int, util, task_util(p)); + lsub_positive(&util, task_util(p)); /* * Covered cases: @@ -6638,10 +6714,9 @@ * properly fix the execl regression and it helps in further * reducing the chances for the above race. */ - if (unlikely(task_on_rq_queued(p) || current == p)) { - estimated -= min_t(unsigned int, estimated, - (_task_util_est(p) | UTIL_AVG_UNCHANGED)); - } + if (unlikely(task_on_rq_queued(p) || current == p)) + lsub_positive(&estimated, _task_util_est(p)); + util = max(util, estimated); } @@ -6651,350 +6726,6 @@ * the cpu_util call. */ return min_t(unsigned long, util, capacity_orig_of(cpu)); -} - -/* - * Returns the current capacity of cpu after applying both - * cpu and freq scaling. - */ -unsigned long capacity_curr_of(int cpu) -{ - unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig; - unsigned long scale_freq = arch_scale_freq_capacity(cpu); - - return cap_scale(max_cap, scale_freq); -} - -static void find_best_target(struct sched_domain *sd, cpumask_t *cpus, - struct task_struct *p) -{ - unsigned long min_util = uclamp_task(p); - unsigned long target_capacity = ULONG_MAX; - unsigned long min_wake_util = ULONG_MAX; - unsigned long target_max_spare_cap = 0; - unsigned long target_util = ULONG_MAX; - /* Initialise with deepest possible cstate (INT_MAX) */ - int shallowest_idle_cstate = INT_MAX; - struct sched_group *sg; - int best_active_cpu = -1; - int best_idle_cpu = -1; - int target_cpu = -1; - int backup_cpu = -1; - bool prefer_idle; - bool boosted; - int i; - - /* - * In most cases, target_capacity tracks capacity_orig of the most - * energy efficient CPU candidate, thus requiring to minimise - * target_capacity. For these cases target_capacity is already - * initialized to ULONG_MAX. - * However, for prefer_idle and boosted tasks we look for a high - * performance CPU, thus requiring to maximise target_capacity. In this - * case we initialise target_capacity to 0. - */ - prefer_idle = uclamp_latency_sensitive(p); - boosted = uclamp_boosted(p); - if (prefer_idle && boosted) - target_capacity = 0; - - /* Scan CPUs in all SDs */ - sg = sd->groups; - do { - for_each_cpu_and(i, &p->cpus_allowed, sched_group_span(sg)) { - unsigned long capacity_curr = capacity_curr_of(i); - unsigned long capacity_orig = capacity_orig_of(i); - unsigned long wake_util, new_util; - long spare_cap; - int idle_idx = INT_MAX; - - if (!cpu_online(i)) - continue; - - /* - * p's blocked utilization is still accounted for on prev_cpu - * so prev_cpu will receive a negative bias due to the double - * accounting. However, the blocked utilization may be zero. - */ - wake_util = cpu_util_without(i, p); - new_util = wake_util + task_util_est(p); - - /* - * Ensure minimum capacity to grant the required boost. - * The target CPU can be already at a capacity level higher - * than the one required to boost the task. - */ - new_util = max(min_util, new_util); - if (new_util > capacity_orig) - continue; - - /* - * Pre-compute the maximum possible capacity we expect - * to have available on this CPU once the task is - * enqueued here. - */ - spare_cap = capacity_orig - new_util; - - if (idle_cpu(i)) - idle_idx = idle_get_state_idx(cpu_rq(i)); - - - /* - * Case A) Latency sensitive tasks - * - * Unconditionally favoring tasks that prefer idle CPU to - * improve latency. - * - * Looking for: - * - an idle CPU, whatever its idle_state is, since - * the first CPUs we explore are more likely to be - * reserved for latency sensitive tasks. - * - a non idle CPU where the task fits in its current - * capacity and has the maximum spare capacity. - * - a non idle CPU with lower contention from other - * tasks and running at the lowest possible OPP. - * - * The last two goals tries to favor a non idle CPU - * where the task can run as if it is "almost alone". - * A maximum spare capacity CPU is favoured since - * the task already fits into that CPU's capacity - * without waiting for an OPP chance. - * - * The following code path is the only one in the CPUs - * exploration loop which is always used by - * prefer_idle tasks. It exits the loop with wither a - * best_active_cpu or a target_cpu which should - * represent an optimal choice for latency sensitive - * tasks. - */ - if (prefer_idle) { - - /* - * Case A.1: IDLE CPU - * Return the best IDLE CPU we find: - * - for boosted tasks: the CPU with the highest - * performance (i.e. biggest capacity_orig) - * - for !boosted tasks: the most energy - * efficient CPU (i.e. smallest capacity_orig) - */ - if (idle_cpu(i)) { - if (boosted && - capacity_orig < target_capacity) - continue; - if (!boosted && - capacity_orig > target_capacity) - continue; - /* - * Minimise value of idle state: skip - * deeper idle states and pick the - * shallowest. - */ - if (capacity_orig == target_capacity && - sysctl_sched_cstate_aware && - idle_idx >= shallowest_idle_cstate) - continue; - - target_capacity = capacity_orig; - shallowest_idle_cstate = idle_idx; - best_idle_cpu = i; - continue; - } - if (best_idle_cpu != -1) - continue; - - /* - * Case A.2: Target ACTIVE CPU - * Favor CPUs with max spare capacity. - */ - if (capacity_curr > new_util && - spare_cap > target_max_spare_cap) { - target_max_spare_cap = spare_cap; - target_cpu = i; - continue; - } - if (target_cpu != -1) - continue; - - - /* - * Case A.3: Backup ACTIVE CPU - * Favor CPUs with: - * - lower utilization due to other tasks - * - lower utilization with the task in - */ - if (wake_util > min_wake_util) - continue; - min_wake_util = wake_util; - best_active_cpu = i; - continue; - } - - /* - * Enforce EAS mode - * - * For non latency sensitive tasks, skip CPUs that - * will be overutilized by moving the task there. - * - * The goal here is to remain in EAS mode as long as - * possible at least for !prefer_idle tasks. - */ - if ((new_util * capacity_margin) > - (capacity_orig * SCHED_CAPACITY_SCALE)) - continue; - - /* - * Favor CPUs with smaller capacity for non latency - * sensitive tasks. - */ - if (capacity_orig > target_capacity) - continue; - - /* - * Case B) Non latency sensitive tasks on IDLE CPUs. - * - * Find an optimal backup IDLE CPU for non latency - * sensitive tasks. - * - * Looking for: - * - minimizing the capacity_orig, - * i.e. preferring LITTLE CPUs - * - favoring shallowest idle states - * i.e. avoid to wakeup deep-idle CPUs - * - * The following code path is used by non latency - * sensitive tasks if IDLE CPUs are available. If at - * least one of such CPUs are available it sets the - * best_idle_cpu to the most suitable idle CPU to be - * selected. - * - * If idle CPUs are available, favour these CPUs to - * improve performances by spreading tasks. - * Indeed, the energy_diff() computed by the caller - * will take care to ensure the minimization of energy - * consumptions without affecting performance. - */ - if (idle_cpu(i)) { - /* - * Skip CPUs in deeper idle state, but only - * if they are also less energy efficient. - * IOW, prefer a deep IDLE LITTLE CPU vs a - * shallow idle big CPU. - */ - if (capacity_orig == target_capacity && - sysctl_sched_cstate_aware && - idle_idx >= shallowest_idle_cstate) - continue; - - target_capacity = capacity_orig; - shallowest_idle_cstate = idle_idx; - best_idle_cpu = i; - continue; - } - - /* - * Case C) Non latency sensitive tasks on ACTIVE CPUs. - * - * Pack tasks in the most energy efficient capacities. - * - * This task packing strategy prefers more energy - * efficient CPUs (i.e. pack on smaller maximum - * capacity CPUs) while also trying to spread tasks to - * run them all at the lower OPP. - * - * This assumes for example that it's more energy - * efficient to run two tasks on two CPUs at a lower - * OPP than packing both on a single CPU but running - * that CPU at an higher OPP. - * - * Thus, this case keep track of the CPU with the - * smallest maximum capacity and highest spare maximum - * capacity. - */ - - /* Favor CPUs with maximum spare capacity */ - if (capacity_orig == target_capacity && - spare_cap < target_max_spare_cap) - continue; - - target_max_spare_cap = spare_cap; - target_capacity = capacity_orig; - target_util = new_util; - target_cpu = i; - } - - } while (sg = sg->next, sg != sd->groups); - - /* - * For non latency sensitive tasks, cases B and C in the previous loop, - * we pick the best IDLE CPU only if we was not able to find a target - * ACTIVE CPU. - * - * Policies priorities: - * - * - prefer_idle tasks: - * - * a) IDLE CPU available: best_idle_cpu - * b) ACTIVE CPU where task fits and has the bigger maximum spare - * capacity (i.e. target_cpu) - * c) ACTIVE CPU with less contention due to other tasks - * (i.e. best_active_cpu) - * - * - NON prefer_idle tasks: - * - * a) ACTIVE CPU: target_cpu - * b) IDLE CPU: best_idle_cpu - */ - - if (prefer_idle && (best_idle_cpu != -1)) { - target_cpu = best_idle_cpu; - goto target; - } - - if (target_cpu == -1) - target_cpu = prefer_idle - ? best_active_cpu - : best_idle_cpu; - else - backup_cpu = prefer_idle - ? best_active_cpu - : best_idle_cpu; - - if (backup_cpu >= 0) - cpumask_set_cpu(backup_cpu, cpus); - if (target_cpu >= 0) { -target: - cpumask_set_cpu(target_cpu, cpus); - } - - trace_sched_find_best_target(p, prefer_idle, min_util, best_idle_cpu, - best_active_cpu, target_cpu, backup_cpu); -} - -/* - * Disable WAKE_AFFINE in the case where task @p doesn't fit in the - * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. - * - * In that case WAKE_AFFINE doesn't make sense and we'll let - * BALANCE_WAKE sort things out. - */ -static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) -{ - long min_cap, max_cap; - - if (!static_branch_unlikely(&sched_asym_cpucapacity)) - return 0; - - min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); - max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val; - - /* Minimum capacity is close to max, no need to abort wake_affine */ - if (max_cap - min_cap < max_cap >> 3) - return 0; - - /* Bring task utilization in sync with prev_cpu */ - sync_entity_load_avg(&p->se); - - return !task_fits_capacity(p, min_cap); } /* @@ -7036,154 +6767,61 @@ } /* - * compute_energy(): Estimates the energy that would be consumed if @p was + * compute_energy(): Estimates the energy that @pd would consume if @p was * migrated to @dst_cpu. compute_energy() predicts what will be the utilization - * landscape of the * CPUs after the task migration, and uses the Energy Model + * landscape of @pd's CPUs after the task migration, and uses the Energy Model * to compute what would be the energy if we decided to actually migrate that * task. */ static long compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) { - unsigned int max_util, util_cfs, cpu_util, cpu_cap; - unsigned long sum_util, energy = 0; - struct task_struct *tsk; + struct cpumask *pd_mask = perf_domain_span(pd); + unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); + unsigned long max_util = 0, sum_util = 0; + unsigned long energy = 0; int cpu; - for (; pd; pd = pd->next) { - struct cpumask *pd_mask = perf_domain_span(pd); + /* + * The capacity state of CPUs of the current rd can be driven by CPUs + * of another rd if they belong to the same pd. So, account for the + * utilization of these CPUs too by masking pd with cpu_online_mask + * instead of the rd span. + * + * If an entire pd is outside of the current rd, it will not appear in + * its pd list and will not be accounted by compute_energy(). + */ + for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { + unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu); + struct task_struct *tsk = cpu == dst_cpu ? p : NULL; /* - * The energy model mandates all the CPUs of a performance - * domain have the same capacity. + * Busy time computation: utilization clamping is not + * required since the ratio (sum_util / cpu_capacity) + * is already enough to scale the EM reported power + * consumption at the (eventually clamped) cpu_capacity. */ - cpu_cap = arch_scale_cpu_capacity(NULL, cpumask_first(pd_mask)); - max_util = sum_util = 0; + sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, + ENERGY_UTIL, NULL); /* - * The capacity state of CPUs of the current rd can be driven by - * CPUs of another rd if they belong to the same performance - * domain. So, account for the utilization of these CPUs too - * by masking pd with cpu_online_mask instead of the rd span. - * - * If an entire performance domain is outside of the current rd, - * it will not appear in its pd list and will not be accounted - * by compute_energy(). + * Performance domain frequency: utilization clamping + * must be considered since it affects the selection + * of the performance domain frequency. + * NOTE: in case RT tasks are running, by default the + * FREQUENCY_UTIL's utilization can be max OPP. */ - for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { - util_cfs = cpu_util_next(cpu, p, dst_cpu); - - /* - * Busy time computation: utilization clamping is not - * required since the ratio (sum_util / cpu_capacity) - * is already enough to scale the EM reported power - * consumption at the (eventually clamped) cpu_capacity. - */ - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, - ENERGY_UTIL, NULL); - - /* - * Performance domain frequency: utilization clamping - * must be considered since it affects the selection - * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the - * FREQUENCY_UTIL's utilization can be max OPP. - */ - tsk = cpu == dst_cpu ? p : NULL; - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, - FREQUENCY_UTIL, tsk); - max_util = max(max_util, cpu_util); - } - - energy += em_pd_energy(pd->em_pd, max_util, sum_util); + cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, + FREQUENCY_UTIL, tsk); + max_util = max(max_util, cpu_util); } + + trace_android_vh_em_cpu_energy(pd->em_pd, max_util, sum_util, &energy); + if (!energy) + energy = em_cpu_energy(pd->em_pd, max_util, sum_util); return energy; } - -static void select_cpu_candidates(struct sched_domain *sd, cpumask_t *cpus, - struct perf_domain *pd, struct task_struct *p, int prev_cpu) -{ - int highest_spare_cap_cpu = prev_cpu, best_idle_cpu = -1; - unsigned long spare_cap, max_spare_cap, util, cpu_cap; - bool prefer_idle = uclamp_latency_sensitive(p); - bool boosted = uclamp_boosted(p); - unsigned long target_cap = boosted ? 0 : ULONG_MAX; - unsigned long highest_spare_cap = 0; - unsigned int min_exit_lat = UINT_MAX; - int cpu, max_spare_cap_cpu; - struct cpuidle_state *idle; - - for (; pd; pd = pd->next) { - max_spare_cap_cpu = -1; - max_spare_cap = 0; - - for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) - continue; - - util = cpu_util_next(cpu, p, cpu); - cpu_cap = capacity_of(cpu); - spare_cap = cpu_cap - util; - - /* - * Skip CPUs that cannot satisfy the capacity request. - * IOW, placing the task there would make the CPU - * overutilized. Take uclamp into account to see how - * much capacity we can get out of the CPU; this is - * aligned with schedutil_cpu_util(). - */ - util = uclamp_rq_util_with(cpu_rq(cpu), util, p); - if (cpu_cap * 1024 < util * capacity_margin) - continue; - - /* - * Find the CPU with the maximum spare capacity in - * the performance domain - */ - if (spare_cap > max_spare_cap) { - max_spare_cap = spare_cap; - max_spare_cap_cpu = cpu; - } - - if (!prefer_idle) - continue; - - if (idle_cpu(cpu)) { - cpu_cap = capacity_orig_of(cpu); - if (boosted && cpu_cap < target_cap) - continue; - if (!boosted && cpu_cap > target_cap) - continue; - idle = idle_get_state(cpu_rq(cpu)); - if (idle && idle->exit_latency > min_exit_lat && - cpu_cap == target_cap) - continue; - - if (idle) - min_exit_lat = idle->exit_latency; - target_cap = cpu_cap; - best_idle_cpu = cpu; - } else if (spare_cap > highest_spare_cap) { - highest_spare_cap = spare_cap; - highest_spare_cap_cpu = cpu; - } - } - - if (!prefer_idle && max_spare_cap_cpu >= 0) - cpumask_set_cpu(max_spare_cap_cpu, cpus); - } - - if (!prefer_idle) - return; - - if (best_idle_cpu >= 0) - cpumask_set_cpu(best_idle_cpu, cpus); - else - cpumask_set_cpu(highest_spare_cap_cpu, cpus); -} - -static DEFINE_PER_CPU(cpumask_t, energy_cpus); /* * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the @@ -7224,27 +6862,41 @@ * other use-cases too. So, until someone finds a better way to solve this, * let's keep things simple by re-using the existing slow path. */ - static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync) { - unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX; + unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; + unsigned long best_delta2 = ULONG_MAX; + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0; + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024; struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - int weight, cpu, best_energy_cpu = prev_cpu; - unsigned long cur_energy; - struct perf_domain *pd; + int max_spare_cap_cpu_ls = prev_cpu, best_idle_cpu = -1; + unsigned long max_spare_cap_ls = 0, target_cap; + unsigned long cpu_cap, util, base_energy = 0; + bool boosted, latency_sensitive = false; + unsigned int min_exit_lat = UINT_MAX; + int cpu, best_energy_cpu = prev_cpu; + struct cpuidle_state *idle; struct sched_domain *sd; - cpumask_t *candidates; + struct perf_domain *pd; + int new_cpu = INT_MAX; - if (sysctl_sched_sync_hint_enable && sync) { - cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, &p->cpus_allowed)) - return cpu; - } + sync_entity_load_avg(&p->se); + trace_android_rvh_find_energy_efficient_cpu(p, prev_cpu, sync, &new_cpu); + if (new_cpu != INT_MAX) + return new_cpu; rcu_read_lock(); pd = rcu_dereference(rd->pd); if (!pd || READ_ONCE(rd->overutilized)) goto fail; + + cpu = smp_processor_id(); + if (sync && cpu_rq(cpu)->nr_running == 1 && + cpumask_test_cpu(cpu, p->cpus_ptr) && + task_fits_cpu(p, cpu)) { + rcu_read_unlock(); + return cpu; + } /* * Energy-aware wake-up happens on the lowest sched_domain starting @@ -7256,59 +6908,169 @@ if (!sd) goto fail; - sync_entity_load_avg(&p->se); - if (!task_util_est(p)) + if (!uclamp_task_util(p, p_util_min, p_util_max)) goto unlock; - /* Pre-select a set of candidate CPUs. */ - candidates = this_cpu_ptr(&energy_cpus); - cpumask_clear(candidates); + latency_sensitive = uclamp_latency_sensitive(p); + boosted = uclamp_boosted(p); + target_cap = boosted ? 0 : ULONG_MAX; - if (sched_feat(FIND_BEST_TARGET)) - find_best_target(sd, candidates, p); - else - select_cpu_candidates(sd, candidates, pd, p, prev_cpu); + for (; pd; pd = pd->next) { + unsigned long cur_delta, spare_cap, max_spare_cap = 0; + unsigned long rq_util_min, rq_util_max; + unsigned long util_min, util_max; + unsigned long base_energy_pd; + int max_spare_cap_cpu = -1; - /* Bail out if no candidate was found. */ - weight = cpumask_weight(candidates); - if (!weight) - goto unlock; + /* Compute the 'base' energy of the pd, without @p */ + base_energy_pd = compute_energy(p, -1, pd); + base_energy += base_energy_pd; - /* If there is only one sensible candidate, select it now. */ - cpu = cpumask_first(candidates); - if (weight == 1 && ((uclamp_latency_sensitive(p) && idle_cpu(cpu)) || - (cpu == prev_cpu))) { - best_energy_cpu = cpu; - goto unlock; - } + for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + continue; - if (cpumask_test_cpu(prev_cpu, &p->cpus_allowed)) - prev_energy = best_energy = compute_energy(p, prev_cpu, pd); - else - prev_energy = best_energy = ULONG_MAX; + util = cpu_util_next(cpu, p, cpu); + cpu_cap = capacity_of(cpu); + spare_cap = cpu_cap; + lsub_positive(&spare_cap, util); - /* Select the best candidate energy-wise. */ - for_each_cpu(cpu, candidates) { - if (cpu == prev_cpu) - continue; - cur_energy = compute_energy(p, cpu, pd); - if (cur_energy < best_energy) { - best_energy = cur_energy; - best_energy_cpu = cpu; + /* + * Skip CPUs that cannot satisfy the capacity request. + * IOW, placing the task there would make the CPU + * overutilized. Take uclamp into account to see how + * much capacity we can get out of the CPU; this is + * aligned with schedutil_cpu_util(). + */ + if (uclamp_is_used()) { + if (uclamp_rq_is_idle(cpu_rq(cpu))) { + util_min = p_util_min; + util_max = p_util_max; + } else { + /* + * Open code uclamp_rq_util_with() except for + * the clamp() part. Ie: apply max aggregation + * only. util_fits_cpu() logic requires to + * operate on non clamped util but must use the + * max-aggregated uclamp_{min, max}. + */ + rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + + util_min = max(rq_util_min, p_util_min); + util_max = max(rq_util_max, p_util_max); + } + } + if (!util_fits_cpu(util, util_min, util_max, cpu)) + continue; + + /* Always use prev_cpu as a candidate. */ + if (!latency_sensitive && cpu == prev_cpu) { + prev_delta = compute_energy(p, prev_cpu, pd); + prev_delta -= base_energy_pd; + best_delta = min(best_delta, prev_delta); + if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) { + if (prev_delta == best_delta) + best_energy_cpu = prev_cpu; + } + } + + /* + * Find the CPU with the maximum spare capacity in + * the performance domain + */ + if (spare_cap > max_spare_cap) { + max_spare_cap = spare_cap; + max_spare_cap_cpu = cpu; + } + + if (!IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) { + if (!latency_sensitive) + continue; + } + + if (idle_cpu(cpu)) { + cpu_cap = capacity_orig_of(cpu); + if (boosted && cpu_cap < target_cap) + continue; + if (!boosted && cpu_cap > target_cap) + continue; + idle = idle_get_state(cpu_rq(cpu)); + if (idle && idle->exit_latency > min_exit_lat && + cpu_cap == target_cap) + continue; + + if (idle) + min_exit_lat = idle->exit_latency; + target_cap = cpu_cap; + best_idle_cpu = cpu; + if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) { + best_delta2 = compute_energy(p, cpu, pd); + best_delta2 -= base_energy_pd; + } + } else if (spare_cap > max_spare_cap_ls) { + max_spare_cap_ls = spare_cap; + max_spare_cap_cpu_ls = cpu; + if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) { + if (best_idle_cpu == -1) { + best_delta2 = compute_energy(p, cpu, pd); + best_delta2 -= base_energy_pd; + } + } + } + } + + /* Evaluate the energy impact of using this CPU. */ + if (!latency_sensitive && max_spare_cap_cpu >= 0 && + max_spare_cap_cpu != prev_cpu) { + cur_delta = compute_energy(p, max_spare_cap_cpu, pd); + cur_delta -= base_energy_pd; + if (cur_delta < best_delta) { + best_delta = cur_delta; + best_energy_cpu = max_spare_cap_cpu; + } } } unlock: rcu_read_unlock(); + if (latency_sensitive) + return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls; + /* * Pick the best CPU if prev_cpu cannot be used, or if it saves at * least 6% of the energy used by prev_cpu. */ - if (prev_energy == ULONG_MAX) + if (prev_delta == ULONG_MAX) return best_energy_cpu; - if ((prev_energy - best_energy) > (prev_energy >> 4)) + if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) return best_energy_cpu; + + if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) { + struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask(); + struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask(); + int level = rockchip_perf_get_level(); + + /* + * when select ROCKCHIP_PERFORMANCE_LOW: + * Pick best_energy_cpu if prev_cpu is big cpu and best_energy_cpu + * is little cpu, so that tasks can migrate from big cpu to little + * cpu easier to save power. + */ + if ((level == ROCKCHIP_PERFORMANCE_LOW) && cpul_mask && + cpub_mask && cpumask_test_cpu(prev_cpu, cpub_mask) && + cpumask_test_cpu(best_energy_cpu, cpul_mask)) { + return best_energy_cpu; + } + + /* + * Pick the idlest cpu if it is a little power increased(<3.1%). + */ + if ((best_delta2 <= prev_delta) || + ((best_delta2 - prev_delta) < ((prev_delta + base_energy) >> 5))) + return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls; + } return prev_cpu; @@ -7331,39 +7093,44 @@ * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags, - int sibling_count_hint) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) { struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); + int target_cpu = -1; + + if (trace_android_rvh_select_task_rq_fair_enabled() && + !(sd_flag & SD_BALANCE_FORK)) + sync_entity_load_avg(&p->se); + trace_android_rvh_select_task_rq_fair(p, prev_cpu, sd_flag, + wake_flags, &target_cpu); + if (target_cpu >= 0) + return target_cpu; if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - if (static_branch_unlikely(&sched_energy_present)) { - if (uclamp_latency_sensitive(p) && !sched_feat(EAS_PREFER_IDLE) && !sync) - goto sd_loop; + if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) { + if (rockchip_perf_get_level() == ROCKCHIP_PERFORMANCE_HIGH) + goto no_eas; + } + if (sched_energy_enabled()) { new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync); if (new_cpu >= 0) return new_cpu; new_cpu = prev_cpu; } - want_affine = !wake_wide(p, sibling_count_hint) && - !wake_cap(p, cpu, prev_cpu) && - cpumask_test_cpu(cpu, &p->cpus_allowed); +no_eas: + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); } -sd_loop: rcu_read_lock(); for_each_domain(cpu, tmp) { - if (!(tmp->flags & SD_LOAD_BALANCE)) - break; - /* * If both 'cpu' and 'prev_cpu' are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. @@ -7390,6 +7157,23 @@ /* Fast path */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + + if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) { + struct root_domain *rd = cpu_rq(cpu)->rd; + struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask(); + struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask(); + int level = rockchip_perf_get_level(); + + if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) && + cpul_mask && cpub_mask && cpumask_intersects(p->cpus_ptr, cpub_mask) && + cpumask_test_cpu(new_cpu, cpul_mask)) { + for_each_domain(cpu, tmp) { + sd = tmp; + } + if (sd) + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); + } + } if (want_affine) current->recent_used_cpu = cpu; @@ -7457,15 +7241,21 @@ /* Tell new CPU we are migrated */ p->se.avg.last_update_time = 0; - /* We have migrated, no longer consider this task hot */ - p->se.exec_start = 0; - update_scan_period(p, new_cpu); } static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); +} + +static int +balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + if (rq->nr_running) + return 1; + + return newidle_balance(rq, rf) != 0; } #endif /* CONFIG_SMP */ @@ -7520,7 +7310,7 @@ static void set_last_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) return; for_each_sched_entity(se) { @@ -7532,7 +7322,7 @@ static void set_next_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) return; for_each_sched_entity(se) { @@ -7558,6 +7348,7 @@ struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; + bool preempt = false, nopreempt = false; if (unlikely(se == pse)) return; @@ -7590,8 +7381,8 @@ return; /* Idle tasks are by definition preempted by non-idle tasks. */ - if (unlikely(curr->policy == SCHED_IDLE) && - likely(p->policy != SCHED_IDLE)) + if (unlikely(task_has_idle_policy(curr)) && + likely(!task_has_idle_policy(p))) goto preempt; /* @@ -7603,6 +7394,12 @@ find_matching_se(&se, &pse); update_curr(cfs_rq_of(se)); + trace_android_rvh_check_preempt_wakeup(rq, p, &preempt, &nopreempt, + wake_flags, se, pse, next_buddy_marked, sysctl_sched_wakeup_granularity); + if (preempt) + goto preempt; + if (nopreempt) + return; BUG_ON(!pse); if (wakeup_preempt_entity(se, pse) == 1) { /* @@ -7634,20 +7431,21 @@ set_last_buddy(se); } -static struct task_struct * +struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct cfs_rq *cfs_rq = &rq->cfs; - struct sched_entity *se; - struct task_struct *p; + struct sched_entity *se = NULL; + struct task_struct *p = NULL; int new_tasks; + bool repick = false; again: - if (!cfs_rq->nr_running) + if (!sched_fair_runnable(rq)) goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED - if (prev->sched_class != &fair_sched_class) + if (!prev || prev->sched_class != &fair_sched_class) goto simple; /* @@ -7694,7 +7492,7 @@ } while (cfs_rq); p = task_of(se); - + trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, false, prev); /* * Since we haven't yet done put_prev_entity and if the selected task * is a different task than we started out with, try and touch the @@ -7724,8 +7522,15 @@ goto done; simple: #endif + if (prev) + put_prev_task(rq, prev); - put_prev_task(rq, prev); + trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, true, prev); + if (repick) { + for_each_sched_entity(se) + set_next_entity(cfs_rq_of(se), se); + goto done; + } do { se = pick_next_entity(cfs_rq, NULL); @@ -7753,11 +7558,13 @@ return p; idle: - update_misfit_status(NULL, rq); - new_tasks = idle_balance(rq, rf); + if (!rf) + return NULL; + + new_tasks = newidle_balance(rq, rf); /* - * Because idle_balance() releases (and re-acquires) rq->lock, it is + * Because newidle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ @@ -7774,6 +7581,11 @@ update_idle_rq_clock_pelt(rq); return NULL; +} + +static struct task_struct *__pick_next_task_fair(struct rq *rq) +{ + return pick_next_task_fair(rq, NULL, NULL); } /* @@ -7826,7 +7638,7 @@ set_skip_buddy(se); } -static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -7961,15 +7773,54 @@ * rewrite all of this once again.] */ -static unsigned long __read_mostly max_load_balance_interval = HZ/10; +unsigned long __read_mostly max_load_balance_interval = HZ/10; +EXPORT_SYMBOL_GPL(max_load_balance_interval); enum fbq_type { regular, remote, all }; +/* + * 'group_type' describes the group of CPUs at the moment of load balancing. + * + * The enum is ordered by pulling priority, with the group with lowest priority + * first so the group_type can simply be compared when selecting the busiest + * group. See update_sd_pick_busiest(). + */ enum group_type { - group_other = 0, + /* The group has spare capacity that can be used to run more tasks. */ + group_has_spare = 0, + /* + * The group is fully used and the tasks don't compete for more CPU + * cycles. Nevertheless, some tasks might wait before running. + */ + group_fully_busy, + /* + * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity + * and must be migrated to a more powerful CPU. + */ group_misfit_task, + /* + * SD_ASYM_PACKING only: One local CPU with higher capacity is available, + * and the task should be migrated to it instead of running on the + * current CPU. + */ + group_asym_packing, + /* + * The tasks' affinity constraints previously prevented the scheduler + * from balancing the load across the system. + */ group_imbalanced, - group_overloaded, + /* + * The CPU is overloaded and can't provide expected CPU cycles to all + * tasks. + */ + group_overloaded +}; + +enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, + migrate_misfit }; #define LBF_ALL_PINNED 0x01 @@ -7992,7 +7843,6 @@ int new_dst_cpu; enum cpu_idle_type idle; long imbalance; - unsigned int src_grp_nr_running; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; @@ -8003,8 +7853,9 @@ unsigned int loop_max; enum fbq_type fbq_type; - enum group_type src_grp_type; + enum migration_type migration_type; struct list_head tasks; + struct rq_flags *src_rq_rf; }; /* @@ -8019,7 +7870,11 @@ if (p->sched_class != &fair_sched_class) return 0; - if (unlikely(p->policy == SCHED_IDLE)) + if (unlikely(task_has_idle_policy(p))) + return 0; + + /* SMT siblings share cache */ + if (env->sd->flags & SD_SHARE_CPUCAPACITY) return 0; /* @@ -8107,20 +7962,29 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot; + int can_migrate = 1; lockdep_assert_held(&env->src_rq->lock); + + trace_android_rvh_can_migrate_task(p, env->dst_cpu, &can_migrate); + if (!can_migrate) + return 0; /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or - * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 2) cannot be migrated to this CPU due to cpus_ptr, or * 3) running (obviously), or * 4) are cache-hot on their current CPU. */ if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; - if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { + /* Disregard pcpu kthreads; they are where they need to be. */ + if (kthread_is_per_cpu(p)) + return 0; + + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; schedstat_inc(p->se.statistics.nr_failed_migrations_affine); @@ -8140,7 +8004,7 @@ /* Prevent to re-select dst_cpu via env's CPUs: */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) { env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; @@ -8186,9 +8050,20 @@ */ static void detach_task(struct task_struct *p, struct lb_env *env) { + int detached = 0; + lockdep_assert_held(&env->src_rq->lock); - p->on_rq = TASK_ON_RQ_MIGRATING; + /* + * The vendor hook may drop the lock temporarily, so + * pass the rq flags to unpin lock. We expect the + * rq lock to be held after return. + */ + trace_android_rvh_migrate_queued_task(env->src_rq, env->src_rq_rf, p, + env->dst_cpu, &detached); + if (detached) + return; + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -8227,7 +8102,7 @@ static const unsigned int sched_nr_migrate_break = 32; /* - * detach_tasks() -- tries to detach up to imbalance weighted load from + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". * * Returns number of detached tasks if successful and 0 otherwise. @@ -8235,8 +8110,8 @@ static int detach_tasks(struct lb_env *env) { struct list_head *tasks = &env->src_rq->cfs_tasks; + unsigned long util, load; struct task_struct *p; - unsigned long load; int detached = 0; lockdep_assert_held(&env->src_rq->lock); @@ -8266,39 +8141,64 @@ break; } -#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS - if (sysctl_sched_performance_bias) { - if ((env->idle == CPU_NOT_IDLE) && (!task_fits_max(p, env->dst_cpu))) - goto next; - } -#endif - if (!can_migrate_task(p, env)) goto next; - /* - * Depending of the number of CPUs and tasks and the - * cgroup hierarchy, task_h_load() can return a null - * value. Make sure that env->imbalance decreases - * otherwise detach_tasks() will stop only after - * detaching up to loop_max tasks. - */ - load = max_t(unsigned long, task_h_load(p), 1); + switch (env->migration_type) { + case migrate_load: + /* + * Depending of the number of CPUs and tasks and the + * cgroup hierarchy, task_h_load() can return a null + * value. Make sure that env->imbalance decreases + * otherwise detach_tasks() will stop only after + * detaching up to loop_max tasks. + */ + load = max_t(unsigned long, task_h_load(p), 1); + if (sched_feat(LB_MIN) && + load < 16 && !env->sd->nr_balance_failed) + goto next; - if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) - goto next; + /* + * Make sure that we don't migrate too much load. + * Nevertheless, let relax the constraint if + * scheduler fails to find a good waiting task to + * migrate. + */ + if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance) + goto next; - if ((load / 2) > env->imbalance) - goto next; + env->imbalance -= load; + break; + + case migrate_util: + util = task_util_est(p); + + if (util > env->imbalance) + goto next; + + env->imbalance -= util; + break; + + case migrate_task: + env->imbalance--; + break; + + case migrate_misfit: + /* This is not a misfit task */ + if (task_fits_cpu(p, env->src_cpu)) + goto next; + + env->imbalance = 0; + break; + } detach_task(p, env); list_add(&p->se.group_node, &env->tasks); detached++; - env->imbalance -= load; -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is detached to minimize @@ -8310,7 +8210,7 @@ /* * We only want to steal up to the prescribed amount of - * weighted load. + * load/util/tasks. */ if (env->imbalance <= 0) break; @@ -8339,7 +8239,6 @@ BUG_ON(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); - p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -8380,6 +8279,7 @@ rq_unlock(env->dst_rq, &rf); } +#ifdef CONFIG_NO_HZ_COMMON static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { if (cfs_rq->avg.load_avg) @@ -8399,12 +8299,54 @@ if (READ_ONCE(rq->avg_dl.util_avg)) return true; + if (thermal_load_avg(rq)) + return true; + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ if (READ_ONCE(rq->avg_irq.util_avg)) return true; #endif return false; +} + +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +{ + rq->last_blocked_load_update_tick = jiffies; + + if (!has_blocked) + rq->has_blocked_load = 0; +} +#else +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } +static inline bool others_have_blocked(struct rq *rq) { return false; } +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} +#endif + +static bool __update_blocked_others(struct rq *rq, bool *done) +{ + const struct sched_class *curr_class; + u64 now = rq_clock_pelt(rq); + unsigned long thermal_pressure; + bool decayed; + + /* + * update_load_avg() can call cpufreq_update_util(). Make sure that RT, + * DL and IRQ signals have been updated before updating CFS. + */ + curr_class = rq->curr->sched_class; + + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + + decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | + update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | + update_irq_load_avg(rq, 0); + + if (others_have_blocked(rq)) + *done = false; + + return decayed; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -8420,22 +8362,17 @@ if (cfs_rq->avg.util_sum) return false; - if (cfs_rq->avg.runnable_load_sum) + if (cfs_rq->avg.runnable_sum) return false; return true; } -static void update_blocked_averages(int cpu) +static bool __update_blocked_fair(struct rq *rq, bool *done) { - struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq, *pos; - const struct sched_class *curr_class; - struct rq_flags rf; - bool done = true; - - rq_lock_irqsave(rq, &rf); - update_rq_clock(rq); + bool decayed = false; + int cpu = cpu_of(rq); /* * Iterates the task_group tree in a bottom up fashion, see @@ -8444,8 +8381,12 @@ for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; - if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) - update_tg_load_avg(cfs_rq, 0); + if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { + update_tg_load_avg(cfs_rq); + + if (cfs_rq == &rq->cfs) + decayed = true; + } /* Propagate pending load changes to the parent, if any: */ se = cfs_rq->tg->se[cpu]; @@ -8461,23 +8402,10 @@ /* Don't need periodic decay once load/util_avg are null */ if (cfs_rq_has_blocked(cfs_rq)) - done = false; + *done = false; } - curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); - update_irq_load_avg(rq, 0); - /* Don't need periodic decay once load/util_avg are null */ - if (others_have_blocked(rq)) - done = false; - -#ifdef CONFIG_NO_HZ_COMMON - rq->last_blocked_load_update_tick = jiffies; - if (done) - rq->has_blocked_load = 0; -#endif - rq_unlock_irqrestore(rq, &rf); + return decayed; } /* @@ -8527,27 +8455,16 @@ cfs_rq_load_avg(cfs_rq) + 1); } #else -static inline void update_blocked_averages(int cpu) +static bool __update_blocked_fair(struct rq *rq, bool *done) { - struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; - const struct sched_class *curr_class; - struct rq_flags rf; + bool decayed; - rq_lock_irqsave(rq, &rf); - update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + if (cfs_rq_has_blocked(cfs_rq)) + *done = false; - curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); - update_irq_load_avg(rq, 0); -#ifdef CONFIG_NO_HZ_COMMON - rq->last_blocked_load_update_tick = jiffies; - if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) - rq->has_blocked_load = 0; -#endif - rq_unlock_irqrestore(rq, &rf); + return decayed; } static unsigned long task_h_load(struct task_struct *p) @@ -8555,6 +8472,24 @@ return p->se.avg.load_avg; } #endif + +static void update_blocked_averages(int cpu) +{ + bool decayed = false, done = true; + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + + decayed |= __update_blocked_others(rq, &done); + decayed |= __update_blocked_fair(rq, &done); + + update_blocked_load_status(rq, !done); + if (decayed) + cpufreq_update_util(rq, 0); + rq_unlock_irqrestore(rq, &rf); +} /********** Helpers for find_busiest_group ************************/ @@ -8564,15 +8499,15 @@ struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long sum_weighted_load; /* Weighted load of group's tasks */ - unsigned long load_per_task; unsigned long group_capacity; - unsigned long group_util; /* Total utilization of the group */ - unsigned int sum_nr_running; /* Nr tasks running in the group */ + unsigned long group_util; /* Total utilization over the CPUs of the group */ + unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ + unsigned int sum_nr_running; /* Nr of tasks running in the group */ + unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; - int group_no_capacity; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -8587,10 +8522,10 @@ struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ - unsigned long total_running; unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ + unsigned int prefer_sibling; /* tasks should go to sibling first */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ struct sg_lb_stats local_stat; /* Statistics of the local group */ @@ -8601,54 +8536,26 @@ /* * Skimp on the clearing to avoid duplicate work. We can avoid clearing * local_stat because update_sg_lb_stats() does a full clear/assignment. - * We must however clear busiest_stat::avg_load because - * update_sd_pick_busiest() reads this before assignment. + * We must however set busiest_stat::group_type and + * busiest_stat::idle_cpus to the worst busiest group because + * update_sd_pick_busiest() reads these before assignment. */ *sds = (struct sd_lb_stats){ .busiest = NULL, .local = NULL, - .total_running = 0UL, .total_load = 0UL, .total_capacity = 0UL, .busiest_stat = { - .avg_load = 0UL, - .sum_nr_running = 0, - .group_type = group_other, + .idle_cpus = UINT_MAX, + .group_type = group_has_spare, }, }; } -/** - * get_sd_load_idx - Obtain the load index for a given sched domain. - * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The idle status of the CPU for whose sd load_idx is obtained. - * - * Return: The load index. - */ -static inline int get_sd_load_idx(struct sched_domain *sd, - enum cpu_idle_type idle) -{ - int load_idx; - - switch (idle) { - case CPU_NOT_IDLE: - load_idx = sd->busy_idx; - break; - - case CPU_NEWLY_IDLE: - load_idx = sd->newidle_idx; - break; - default: - load_idx = sd->idle_idx; - break; - } - - return load_idx; -} - -static unsigned long scale_rt_capacity(int cpu, unsigned long max) +static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long max = arch_scale_cpu_capacity(cpu); unsigned long used, free; unsigned long irq; @@ -8657,8 +8564,15 @@ if (unlikely(irq >= max)) return 1; + /* + * avg_rt.util_avg and avg_dl.util_avg track binary signals + * (running and not running) with weights 0 and 1024 respectively. + * avg_thermal.load_avg tracks thermal pressure and the weighted + * average uses the actual delta max capacity(load). + */ used = READ_ONCE(rq->avg_rt.util_avg); used += READ_ONCE(rq->avg_dl.util_avg); + used += thermal_load_avg(rq); if (unlikely(used >= max)) return 1; @@ -8668,52 +8582,20 @@ return scale_irq_capacity(free, irq, max); } -void init_max_cpu_capacity(struct max_cpu_capacity *mcc) { - raw_spin_lock_init(&mcc->lock); - mcc->val = 0; - mcc->cpu = -1; -} - static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); + unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - struct max_cpu_capacity *mcc; - unsigned long max_capacity; - int max_cap_cpu; - unsigned long flags; - cpu_rq(cpu)->cpu_capacity_orig = capacity; - - capacity *= arch_scale_max_freq_capacity(sd, cpu); - capacity >>= SCHED_CAPACITY_SHIFT; - - mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; - - raw_spin_lock_irqsave(&mcc->lock, flags); - max_capacity = mcc->val; - max_cap_cpu = mcc->cpu; - - if ((max_capacity > capacity && max_cap_cpu == cpu) || - (max_capacity < capacity)) { - mcc->val = capacity; - mcc->cpu = cpu; -#ifdef CONFIG_SCHED_DEBUG - raw_spin_unlock_irqrestore(&mcc->lock, flags); - //printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n", - // cpu, capacity); - goto skip_unlock; -#endif - } - raw_spin_unlock_irqrestore(&mcc->lock, flags); - -skip_unlock: __attribute__ ((unused)); - capacity = scale_rt_capacity(cpu, capacity); + cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); if (!capacity) capacity = 1; + trace_android_rvh_update_cpu_capacity(cpu, &capacity); cpu_rq(cpu)->cpu_capacity = capacity; + trace_sched_cpu_capacity_tp(cpu_rq(cpu)); + sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; sdg->sgc->max_capacity = capacity; @@ -8746,29 +8628,11 @@ */ for_each_cpu(cpu, sched_group_span(sdg)) { - struct sched_group_capacity *sgc; - struct rq *rq = cpu_rq(cpu); + unsigned long cpu_cap = capacity_of(cpu); - /* - * build_sched_domains() -> init_sched_groups_capacity() - * gets here before we've attached the domains to the - * runqueues. - * - * Use capacity_of(), which is set irrespective of domains - * in update_cpu_capacity(). - * - * This avoids capacity from being 0 and - * causing divide-by-zero issues on boot. - */ - if (unlikely(!rq->sd)) { - capacity += capacity_of(cpu); - } else { - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; - } - - min_capacity = min(capacity, min_capacity); - max_capacity = max(capacity, max_capacity); + capacity += cpu_cap; + min_capacity = min(cpu_cap, min_capacity); + max_capacity = max(cpu_cap, max_capacity); } } else { /* @@ -8805,8 +8669,20 @@ } /* + * Check whether a rq has a misfit task and if it looks like we can actually + * help that task: we can migrate the task to a CPU of higher capacity, or + * the task's current CPU is heavily pressured. + */ +static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +{ + return rq->misfit_task_load && + (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || + check_cpu_capacity(rq, sd)); +} + +/* * Group imbalance indicates (and tries to solve) the problem where balancing - * groups is inadequate due to ->cpus_allowed constraints. + * groups is inadequate due to ->cpus_ptr constraints. * * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. @@ -8851,13 +8727,17 @@ * any benefit for the load balance. */ static inline bool -group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) +group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { if (sgs->sum_nr_running < sgs->group_weight) return true; + if ((sgs->group_capacity * imbalance_pct) < + (sgs->group_runnable * 100)) + return false; + if ((sgs->group_capacity * 100) > - (sgs->group_util * env->sd->imbalance_pct)) + (sgs->group_util * imbalance_pct)) return true; return false; @@ -8872,13 +8752,17 @@ * false. */ static inline bool -group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) +group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { if (sgs->sum_nr_running <= sgs->group_weight) return false; if ((sgs->group_capacity * 100) < - (sgs->group_util * env->sd->imbalance_pct)) + (sgs->group_util * imbalance_pct)) + return true; + + if ((sgs->group_capacity * imbalance_pct) < + (sgs->group_runnable * 100)) return true; return false; @@ -8891,8 +8775,7 @@ static inline bool group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { - return sg->sgc->min_capacity * capacity_margin < - ref->sgc->min_capacity * 1024; + return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity); } /* @@ -8902,24 +8785,30 @@ static inline bool group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { - return sg->sgc->max_capacity * capacity_margin < - ref->sgc->max_capacity * 1024; + return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity); } static inline enum -group_type group_classify(struct sched_group *group, +group_type group_classify(unsigned int imbalance_pct, + struct sched_group *group, struct sg_lb_stats *sgs) { - if (sgs->group_no_capacity) + if (group_is_overloaded(imbalance_pct, sgs)) return group_overloaded; if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_asym_packing) + return group_asym_packing; + if (sgs->group_misfit_task_load) return group_misfit_task; - return group_other; + if (!group_has_capacity(imbalance_pct, sgs)) + return group_fully_busy; + + return group_has_spare; } static bool update_nohz_stats(struct rq *rq, bool force) @@ -8956,12 +8845,11 @@ struct sg_lb_stats *sgs, int *sg_status) { - int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); - int load_idx = get_sd_load_idx(env->sd, env->idle); - unsigned long load; - int i, nr_running; + int i, nr_running, local_group; memset(sgs, 0, sizeof(*sgs)); + + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); @@ -8969,17 +8857,14 @@ if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; - /* Bias balancing toward CPUs of our domain: */ - if (local_group) - load = target_load(i, load_idx); - else - load = source_load(i, load_idx); - - sgs->group_load += load; + sgs->group_load += cpu_load(rq); sgs->group_util += cpu_util(i); - sgs->sum_nr_running += rq->cfs.h_nr_running; + sgs->group_runnable += cpu_runnable(rq); + sgs->sum_h_nr_running += rq->cfs.h_nr_running; nr_running = rq->nr_running; + sgs->sum_nr_running += nr_running; + if (nr_running > 1) *sg_status |= SG_OVERLOAD; @@ -8990,13 +8875,19 @@ sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; #endif - sgs->sum_weighted_load += weighted_cpuload(rq); /* * No need to call idle_cpu() if nr_running is not 0 */ - if (!nr_running && idle_cpu(i)) + if (!nr_running && idle_cpu(i)) { sgs->idle_cpus++; + /* Idle cpu can't have misfit task */ + continue; + } + if (local_group) + continue; + + /* Check for a misfit task on the cpu */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; @@ -9004,17 +8895,24 @@ } } - /* Adjust by relative CPU capacity of the group */ - sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; + /* Check if dst CPU is idle and preferred to this group */ + if (env->sd->flags & SD_ASYM_PACKING && + env->idle != CPU_NOT_IDLE && + sgs->sum_h_nr_running && + sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { + sgs->group_asym_packing = 1; + } - if (sgs->sum_nr_running) - sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + sgs->group_capacity = group->sgc->capacity; sgs->group_weight = group->group_weight; - sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(group, sgs); + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / + sgs->group_capacity; } /** @@ -9037,6 +8935,10 @@ { struct sg_lb_stats *busiest = &sds->busiest_stat; + /* Make sure that there is at least one task to pull */ + if (!sgs->sum_h_nr_running) + return false; + /* * Don't try to pull misfit tasks we can't help. * We can use max_capacity here as reduction in capacity on some @@ -9045,7 +8947,7 @@ */ if (sgs->group_type == group_misfit_task && (!group_smaller_max_cpu_capacity(sg, sds->local) || - !group_has_capacity(env, &sds->local_stat))) + sds->local_stat.group_type != group_has_spare)) return false; if (sgs->group_type > busiest->group_type) @@ -9054,62 +8956,92 @@ if (sgs->group_type < busiest->group_type) return false; - if (sgs->avg_load <= busiest->avg_load) + /* + * The candidate and the current busiest group are the same type of + * group. Let check which one is the busiest according to the type. + */ + + switch (sgs->group_type) { + case group_overloaded: + /* Select the overloaded group with highest avg_load. */ + if (sgs->avg_load <= busiest->avg_load) + return false; + break; + + case group_imbalanced: + /* + * Select the 1st imbalanced group as we don't have any way to + * choose one more than another. + */ return false; - if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) - goto asym_packing; - - /* - * Candidate sg has no more than one task per CPU and - * has higher per-CPU capacity. Migrating tasks to less - * capable CPUs may harm throughput. Maximize throughput, - * power/energy consequences are not considered. - */ - if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_min_cpu_capacity(sds->local, sg)) - return false; - - /* - * If we have more than one misfit sg go with the biggest misfit. - */ - if (sgs->group_type == group_misfit_task && - sgs->group_misfit_task_load < busiest->group_misfit_task_load) - return false; - -asym_packing: - /* This is the busiest node in its class. */ - if (!(env->sd->flags & SD_ASYM_PACKING)) - return true; - - /* No ASYM_PACKING if target CPU is already busy */ - if (env->idle == CPU_NOT_IDLE) - return true; - /* - * ASYM_PACKING needs to move all the work to the highest - * prority CPUs in the group, therefore mark all groups - * of lower priority than ourself as busy. - */ - if (sgs->sum_nr_running && - sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { - if (!sds->busiest) - return true; - + case group_asym_packing: /* Prefer to move from lowest priority CPU's work */ - if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, - sg->asym_prefer_cpu)) - return true; + if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu)) + return false; + break; + + case group_misfit_task: + /* + * If we have more than one misfit sg go with the biggest + * misfit. + */ + if (sgs->group_misfit_task_load < busiest->group_misfit_task_load) + return false; + break; + + case group_fully_busy: + /* + * Select the fully busy group with highest avg_load. In + * theory, there is no need to pull task from such kind of + * group because tasks have all compute capacity that they need + * but we can still improve the overall throughput by reducing + * contention when accessing shared HW resources. + * + * XXX for now avg_load is not computed and always 0 so we + * select the 1st one. + */ + if (sgs->avg_load <= busiest->avg_load) + return false; + break; + + case group_has_spare: + /* + * Select not overloaded group with lowest number of idle cpus + * and highest number of running tasks. We could also compare + * the spare capacity which is more stable but it can end up + * that the group has less spare capacity but finally more idle + * CPUs which means less opportunity to pull tasks. + */ + if (sgs->idle_cpus > busiest->idle_cpus) + return false; + else if ((sgs->idle_cpus == busiest->idle_cpus) && + (sgs->sum_nr_running <= busiest->sum_nr_running)) + return false; + + break; } - return false; + /* + * Candidate sg has no more than one task per CPU and has higher + * per-CPU capacity. Migrating tasks to less capable CPUs may harm + * throughput. Maximize throughput, power/energy consequences are not + * considered. + */ + if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && + (sgs->group_type <= group_fully_busy) && + (group_smaller_min_cpu_capacity(sds->local, sg))) + return false; + + return true; } #ifdef CONFIG_NUMA_BALANCING static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running > sgs->nr_numa_running) + if (sgs->sum_h_nr_running > sgs->nr_numa_running) return regular; - if (sgs->sum_nr_running > sgs->nr_preferred_running) + if (sgs->sum_h_nr_running > sgs->nr_preferred_running) return remote; return all; } @@ -9134,18 +9066,338 @@ } #endif /* CONFIG_NUMA_BALANCING */ + +struct sg_lb_stats; + +/* + * task_running_on_cpu - return 1 if @p is running on @cpu. + */ + +static unsigned int task_running_on_cpu(int cpu, struct task_struct *p) +{ + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return 0; + + if (task_on_rq_queued(p)) + return 1; + + return 0; +} + +/** + * idle_cpu_without - would a given CPU be idle without p ? + * @cpu: the processor on which idleness is tested. + * @p: task which should be ignored. + * + * Return: 1 if the CPU would be idle. 0 otherwise. + */ +static int idle_cpu_without(int cpu, struct task_struct *p) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->curr != rq->idle && rq->curr != p) + return 0; + + /* + * rq->nr_running can't be used but an updated version without the + * impact of p on cpu must be used instead. The updated nr_running + * be computed and tested before calling idle_cpu_without(). + */ + +#ifdef CONFIG_SMP + if (rq->ttwu_pending) + return 0; +#endif + + return 1; +} + +/* + * update_sg_wakeup_stats - Update sched_group's statistics for wakeup. + * @sd: The sched_domain level to look for idlest group. + * @group: sched_group whose statistics are to be updated. + * @sgs: variable to hold the statistics for this group. + * @p: The task for which we look for the idlest group/CPU. + */ +static inline void update_sg_wakeup_stats(struct sched_domain *sd, + struct sched_group *group, + struct sg_lb_stats *sgs, + struct task_struct *p) +{ + int i, nr_running; + + memset(sgs, 0, sizeof(*sgs)); + + /* Assume that task can't fit any CPU of the group */ + if (sd->flags & SD_ASYM_CPUCAPACITY) + sgs->group_misfit_task_load = 1; + + for_each_cpu(i, sched_group_span(group)) { + struct rq *rq = cpu_rq(i); + unsigned int local; + + sgs->group_load += cpu_load_without(rq, p); + sgs->group_util += cpu_util_without(i, p); + sgs->group_runnable += cpu_runnable_without(rq, p); + local = task_running_on_cpu(i, p); + sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; + + nr_running = rq->nr_running - local; + sgs->sum_nr_running += nr_running; + + /* + * No need to call idle_cpu_without() if nr_running is not 0 + */ + if (!nr_running && idle_cpu_without(i, p)) + sgs->idle_cpus++; + + /* Check if task fits in the CPU */ + if (sd->flags & SD_ASYM_CPUCAPACITY && + sgs->group_misfit_task_load && + task_fits_cpu(p, i)) + sgs->group_misfit_task_load = 0; + + } + + sgs->group_capacity = group->sgc->capacity; + + sgs->group_weight = group->group_weight; + + sgs->group_type = group_classify(sd->imbalance_pct, group, sgs); + + /* + * Computing avg_load makes sense only when group is fully busy or + * overloaded + */ + if (sgs->group_type == group_fully_busy || + sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / + sgs->group_capacity; +} + +static bool update_pick_idlest(struct sched_group *idlest, + struct sg_lb_stats *idlest_sgs, + struct sched_group *group, + struct sg_lb_stats *sgs) +{ + if (sgs->group_type < idlest_sgs->group_type) + return true; + + if (sgs->group_type > idlest_sgs->group_type) + return false; + + /* + * The candidate and the current idlest group are the same type of + * group. Let check which one is the idlest according to the type. + */ + + switch (sgs->group_type) { + case group_overloaded: + case group_fully_busy: + /* Select the group with lowest avg_load. */ + if (idlest_sgs->avg_load <= sgs->avg_load) + return false; + break; + + case group_imbalanced: + case group_asym_packing: + /* Those types are not used in the slow wakeup path */ + return false; + + case group_misfit_task: + /* Select group with the highest max capacity */ + if (idlest->sgc->max_capacity >= group->sgc->max_capacity) + return false; + break; + + case group_has_spare: + /* Select group with most idle CPUs */ + if (idlest_sgs->idle_cpus > sgs->idle_cpus) + return false; + + /* Select group with lowest group_util */ + if (idlest_sgs->idle_cpus == sgs->idle_cpus && + idlest_sgs->group_util <= sgs->group_util) + return false; + + break; + } + + return true; +} + +/* + * find_idlest_group() finds and returns the least busy CPU group within the + * domain. + * + * Assumes p is allowed on at least one CPU in sd. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +{ + struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; + struct sg_lb_stats local_sgs, tmp_sgs; + struct sg_lb_stats *sgs; + unsigned long imbalance; + struct sg_lb_stats idlest_sgs = { + .avg_load = UINT_MAX, + .group_type = group_overloaded, + }; + + imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + + do { + int local_group; + + if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) { + struct root_domain *rd = cpu_rq(this_cpu)->rd; + struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask(); + int level = rockchip_perf_get_level(); + + if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) && + cpub_mask && cpumask_intersects(p->cpus_ptr, cpub_mask) && + !cpumask_intersects(sched_group_span(group), cpub_mask)) + continue; + } + + /* Skip over this group if it has no CPUs allowed */ + if (!cpumask_intersects(sched_group_span(group), + p->cpus_ptr)) + continue; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_span(group)); + + if (local_group) { + sgs = &local_sgs; + local = group; + } else { + sgs = &tmp_sgs; + } + + update_sg_wakeup_stats(sd, group, sgs, p); + + if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) { + idlest = group; + idlest_sgs = *sgs; + } + + } while (group = group->next, group != sd->groups); + + + /* There is no idlest group to push tasks to */ + if (!idlest) + return NULL; + + /* The local group has been skipped because of CPU affinity */ + if (!local) + return idlest; + + /* + * If the local group is idler than the selected idlest group + * don't try and push the task. + */ + if (local_sgs.group_type < idlest_sgs.group_type) + return NULL; + + /* + * If the local group is busier than the selected idlest group + * try and push the task. + */ + if (local_sgs.group_type > idlest_sgs.group_type) + return idlest; + + switch (local_sgs.group_type) { + case group_overloaded: + case group_fully_busy: + /* + * When comparing groups across NUMA domains, it's possible for + * the local domain to be very lightly loaded relative to the + * remote domains but "imbalance" skews the comparison making + * remote CPUs look much more favourable. When considering + * cross-domain, add imbalance to the load on the remote node + * and consider staying local. + */ + + if ((sd->flags & SD_NUMA) && + ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load)) + return NULL; + + /* + * If the local group is less loaded than the selected + * idlest group don't try and push any tasks. + */ + if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance)) + return NULL; + + if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load) + return NULL; + break; + + case group_imbalanced: + case group_asym_packing: + /* Those type are not used in the slow wakeup path */ + return NULL; + + case group_misfit_task: + /* Select group with the highest max capacity */ + if (local->sgc->max_capacity >= idlest->sgc->max_capacity) + return NULL; + break; + + case group_has_spare: + if (sd->flags & SD_NUMA) { +#ifdef CONFIG_NUMA_BALANCING + int idlest_cpu; + /* + * If there is spare capacity at NUMA, try to select + * the preferred node + */ + if (cpu_to_node(this_cpu) == p->numa_preferred_nid) + return NULL; + + idlest_cpu = cpumask_first(sched_group_span(idlest)); + if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) + return idlest; +#endif + /* + * Otherwise, keep the task on this node to stay close + * its wakeup source and improve locality. If there is + * a real need of migration, periodic load balance will + * take care of it. + */ + if (local_sgs.idle_cpus) + return NULL; + } + + /* + * Select group with highest number of idle CPUs. We could also + * compare the utilization which is more stable but it can end + * up that the group has less spare capacity but finally more + * idle CPUs which means more opportunity to run task. + */ + if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus) + return NULL; + break; + } + + return idlest; +} + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. * @sds: variable to hold the statistics for this sched_domain. */ + static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; int sg_status = 0; #ifdef CONFIG_NO_HZ_COMMON @@ -9172,22 +9424,6 @@ if (local_group) goto next_group; - /* - * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity so that we'll try - * and move all the excess tasks away. We lower the capacity - * of a group only if the local group has the capacity to fit - * these excess tasks. The extra check prevents the case where - * you always pull from the heaviest group when it is already - * under-utilized (possible with a large weight task outweighs - * the tasks on the system). - */ - if (prefer_sibling && sds->local && - group_has_capacity(env, local) && - (sgs->sum_nr_running > local->sum_nr_running + 1)) { - sgs->group_no_capacity = 1; - sgs->group_type = group_classify(sg, sgs); - } if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -9196,12 +9432,14 @@ next_group: /* Now, start updating sd_lb_stats */ - sds->total_running += sgs->sum_nr_running; sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; sg = sg->next; } while (sg != env->sd->groups); + + /* Tag domain that child domain prefers tasks go to siblings first */ + sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; #ifdef CONFIG_NO_HZ_COMMON if ((env->flags & LBF_NOHZ_AGAIN) && @@ -9215,8 +9453,6 @@ if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); - env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; - if (!env->sd->parent) { struct root_domain *rd = env->dst_rq->rd; @@ -9225,144 +9461,28 @@ /* Update over-utilization (tipping point, U >= 0) indicator */ WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); - trace_sched_overutilized(!!(sg_status & SG_OVERUTILIZED)); + trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED); } else if (sg_status & SG_OVERUTILIZED) { - WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); - trace_sched_overutilized(1); - } + struct root_domain *rd = env->dst_rq->rd; + WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); + trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); + } } -/** - * check_asym_packing - Check to see if the group is packed into the - * sched domain. - * - * This is primarily intended to used at the sibling level. Some - * cores like POWER7 prefer to use lower numbered SMT threads. In the - * case of POWER7, it can move to lower SMT modes only when higher - * threads are idle. When in lower SMT modes, the threads will - * perform better since they share less core resources. Hence when we - * have idle threads, we want them to be the higher ones. - * - * This packing function is run on idle threads. It checks to see if - * the busiest CPU in this domain (core in the P7 case) has a higher - * CPU number than the packing function is being run on. Here we are - * assuming lower CPU number will be equivalent to lower a SMT thread - * number. - * - * Return: 1 when packing is required and a task should be moved to - * this CPU. The amount of the imbalance is returned in env->imbalance. - * - * @env: The load balancing environment. - * @sds: Statistics of the sched_domain which is to be packed - */ -static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) +static inline long adjust_numa_imbalance(int imbalance, int nr_running) { - int busiest_cpu; - - if (!(env->sd->flags & SD_ASYM_PACKING)) - return 0; - - if (env->idle == CPU_NOT_IDLE) - return 0; - - if (!sds->busiest) - return 0; - - busiest_cpu = sds->busiest->asym_prefer_cpu; - if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) - return 0; - - env->imbalance = DIV_ROUND_CLOSEST( - sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, - SCHED_CAPACITY_SCALE); - - return 1; -} - -/** - * fix_small_imbalance - Calculate the minor imbalance that exists - * amongst the groups of a sched_domain, during - * load balancing. - * @env: The load balancing environment. - * @sds: Statistics of the sched_domain whose imbalance is to be calculated. - */ -static inline -void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) -{ - unsigned long tmp, capa_now = 0, capa_move = 0; - unsigned int imbn = 2; - unsigned long scaled_busy_load_per_task; - struct sg_lb_stats *local, *busiest; - - local = &sds->local_stat; - busiest = &sds->busiest_stat; - - if (!local->sum_nr_running) - local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); - else if (busiest->load_per_task > local->load_per_task) - imbn = 1; - - scaled_busy_load_per_task = - (busiest->load_per_task * SCHED_CAPACITY_SCALE) / - busiest->group_capacity; - - if (busiest->avg_load + scaled_busy_load_per_task >= - local->avg_load + (scaled_busy_load_per_task * imbn)) { - env->imbalance = busiest->load_per_task; - return; - } + unsigned int imbalance_min; /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU capacity used by - * moving them. + * Allow a small imbalance based on a simple pair of communicating + * tasks that remain local when the source domain is almost idle. */ + imbalance_min = 2; + if (nr_running <= imbalance_min) + return 0; - capa_now += busiest->group_capacity * - min(busiest->load_per_task, busiest->avg_load); - capa_now += local->group_capacity * - min(local->load_per_task, local->avg_load); - capa_now /= SCHED_CAPACITY_SCALE; - - /* Amount of load we'd subtract */ - if (busiest->avg_load > scaled_busy_load_per_task) { - capa_move += busiest->group_capacity * - min(busiest->load_per_task, - busiest->avg_load - scaled_busy_load_per_task); - } - - /* Amount of load we'd add */ - if (busiest->avg_load * busiest->group_capacity < - busiest->load_per_task * SCHED_CAPACITY_SCALE) { - tmp = (busiest->avg_load * busiest->group_capacity) / - local->group_capacity; - } else { - tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / - local->group_capacity; - } - capa_move += local->group_capacity * - min(local->load_per_task, local->avg_load + tmp); - capa_move /= SCHED_CAPACITY_SCALE; - - /* Move if we gain throughput */ - if (capa_move > capa_now) { - env->imbalance = busiest->load_per_task; - return; - } - - /* We can't see throughput improvement with the load-based - * method, but it is possible depending upon group size and - * capacity range that there might still be an underutilized - * cpu available in an asymmetric capacity system. Do one last - * check just in case. - */ - if (env->sd->flags & SD_ASYM_CPUCAPACITY && - busiest->group_type == group_overloaded && - busiest->sum_nr_running > busiest->group_weight && - local->sum_nr_running < local->group_weight && - local->group_capacity < busiest->group_capacity) - env->imbalance = busiest->load_per_task; + return imbalance; } /** @@ -9373,96 +9493,180 @@ */ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long max_pull, load_above_capacity = ~0UL; struct sg_lb_stats *local, *busiest; local = &sds->local_stat; busiest = &sds->busiest_stat; + if (busiest->group_type == group_misfit_task) { + /* Set imbalance to allow misfit tasks to be balanced. */ + env->migration_type = migrate_misfit; + env->imbalance = 1; + return; + } + + if (busiest->group_type == group_asym_packing) { + /* + * In case of asym capacity, we will try to migrate all load to + * the preferred CPU. + */ + env->migration_type = migrate_task; + env->imbalance = busiest->sum_h_nr_running; + return; + } + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages - * to ensure CPU-load equilibrium, look at wider averages. XXX + * to ensure CPU-load equilibrium, try to move any task to fix + * the imbalance. The next load balance will take care of + * balancing back the system. */ - busiest->load_per_task = - min(busiest->load_per_task, sds->avg_load); + env->migration_type = migrate_task; + env->imbalance = 1; + return; } /* - * Avg load of busiest sg can be less and avg load of local sg can - * be greater than avg load across all sgs of sd because avg load - * factors in sg capacity and sgs with smaller group_type are - * skipped when updating the busiest sg: + * Try to use spare capacity of local group without overloading it or + * emptying busiest. */ - if (busiest->group_type != group_misfit_task && - (busiest->avg_load <= sds->avg_load || - local->avg_load >= sds->avg_load)) { - env->imbalance = 0; - return fix_small_imbalance(env, sds); + if (local->group_type == group_has_spare) { + if ((busiest->group_type > group_fully_busy) && + !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) { + /* + * If busiest is overloaded, try to fill spare + * capacity. This might end up creating spare capacity + * in busiest or busiest still being overloaded but + * there is no simple way to directly compute the + * amount of load to migrate in order to balance the + * system. + */ + env->migration_type = migrate_util; + env->imbalance = max(local->group_capacity, local->group_util) - + local->group_util; + + /* + * In some cases, the group's utilization is max or even + * higher than capacity because of migrations but the + * local CPU is (newly) idle. There is at least one + * waiting task in this overloaded busiest group. Let's + * try to pull it. + */ + if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) { + env->migration_type = migrate_task; + env->imbalance = 1; + } + + return; + } + + if (busiest->group_weight == 1 || sds->prefer_sibling) { + unsigned int nr_diff = busiest->sum_nr_running; + /* + * When prefer sibling, evenly spread running tasks on + * groups. + */ + env->migration_type = migrate_task; + lsub_positive(&nr_diff, local->sum_nr_running); + env->imbalance = nr_diff >> 1; + } else { + + /* + * If there is no overload, we just want to even the number of + * idle cpus. + */ + env->migration_type = migrate_task; + env->imbalance = max_t(long, 0, (local->idle_cpus - + busiest->idle_cpus) >> 1); + } + + /* Consider allowing a small imbalance between NUMA groups */ + if (env->sd->flags & SD_NUMA) + env->imbalance = adjust_numa_imbalance(env->imbalance, + busiest->sum_nr_running); + + return; } /* - * If there aren't any idle CPUs, avoid creating some. + * Local is fully busy but has to take more load to relieve the + * busiest group */ - if (busiest->group_type == group_overloaded && - local->group_type == group_overloaded) { - load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; - if (load_above_capacity > busiest->group_capacity) { - load_above_capacity -= busiest->group_capacity; - load_above_capacity *= scale_load_down(NICE_0_LOAD); - load_above_capacity /= busiest->group_capacity; - } else - load_above_capacity = ~0UL; + if (local->group_type < group_overloaded) { + /* + * Local will become overloaded so the avg_load metrics are + * finally needed. + */ + + local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / + local->group_capacity; + + /* + * If the local group is more loaded than the selected + * busiest group don't try to pull any tasks. + */ + if (local->avg_load >= busiest->avg_load) { + env->imbalance = 0; + return; + } + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; + + /* + * If the local group is more loaded than the average system + * load, don't try to pull any tasks. + */ + if (local->avg_load >= sds->avg_load) { + env->imbalance = 0; + return; + } + } /* - * We're trying to get all the CPUs to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded CPU below the average load. At the same time, - * we also don't want to reduce the group load below the group - * capacity. Thus we look for the minimum possible imbalance. + * Both group are or will become overloaded and we're trying to get all + * the CPUs to the average_load, so we don't want to push ourselves + * above the average load, nor do we wish to reduce the max loaded CPU + * below the average load. At the same time, we also don't want to + * reduce the group load below the group capacity. Thus we look for + * the minimum possible imbalance. */ - max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); - - /* How much load to actually move to equalise the imbalance */ + env->migration_type = migrate_load; env->imbalance = min( - max_pull * busiest->group_capacity, + (busiest->avg_load - sds->avg_load) * busiest->group_capacity, (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; - - /* Boost imbalance to allow misfit task to be balanced. - * Always do this if we are doing a NEWLY_IDLE balance - * on the assumption that any tasks we have must not be - * long-running (and hence we cannot rely upon load). - * However if we are not idle, we should assume the tasks - * we have are longer running and not override load-based - * calculations above unless we are sure that the local - * group is underutilized. - */ - if (busiest->group_type == group_misfit_task && - (env->idle == CPU_NEWLY_IDLE || - local->sum_nr_running < local->group_weight)) { - env->imbalance = max_t(long, env->imbalance, - busiest->group_misfit_task_load); - } - - /* - * if *imbalance is less than the average load per runnable task - * there is no guarantee that any tasks will be moved so we'll have - * a think about bumping its value to force at least one task to be - * moved - */ - if (env->imbalance < busiest->load_per_task) - return fix_small_imbalance(env, sds); } /******* find_busiest_group() helpers end here *********************/ + +/* + * Decision matrix according to the local and busiest group type: + * + * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded + * has_spare nr_idle balanced N/A N/A balanced balanced + * fully_busy nr_idle nr_idle N/A N/A balanced balanced + * misfit_task force N/A N/A N/A force force + * asym_packing force force N/A N/A force force + * imbalanced force force N/A N/A force force + * overloaded force force N/A N/A force avg_load + * + * N/A : Not Applicable because already filtered while updating + * statistics. + * balanced : The system is balanced for these 2 groups. + * force : Calculate the imbalance as load migration is probably needed. + * avg_load : Only if imbalance is significant enough. + * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite + * different in groups. + */ /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. * - * Also calculates the amount of weighted load which should be moved + * Also calculates the amount of runnable load which should be moved * to restore balance. * * @env: The load balancing environment. @@ -9477,91 +9681,120 @@ init_sd_lb_stats(&sds); /* - * Compute the various statistics relavent for load balancing at + * Compute the various statistics relevant for load balancing at * this level. */ update_sd_lb_stats(env, &sds); - if (static_branch_unlikely(&sched_energy_present)) { + if (sched_energy_enabled()) { struct root_domain *rd = env->dst_rq->rd; + int out_balance = 1; - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) + trace_android_rvh_find_busiest_group(sds.busiest, env->dst_rq, + &out_balance); + if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized) + && out_balance) goto out_balanced; } local = &sds.local_stat; busiest = &sds.busiest_stat; - /* ASYM feature bypasses nice load balance check */ - if (check_asym_packing(env, &sds)) - return sds.busiest; - /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest || busiest->sum_nr_running == 0) + if (!sds.busiest) goto out_balanced; - /* XXX broken for overlapping NUMA groups */ - sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) - / sds.total_capacity; + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + + /* ASYM feature bypasses nice load balance check */ + if (busiest->group_type == group_asym_packing) + goto force_balance; /* * If the busiest group is imbalanced the below checks don't * work because they assume all things are equal, which typically - * isn't true due to cpus_allowed constraints and the like. + * isn't true due to cpus_ptr constraints and the like. */ if (busiest->group_type == group_imbalanced) - goto force_balance; - - /* - * When dst_cpu is idle, prevent SMP nice and/or asymmetric group - * capacities from resulting in underutilization due to avg_load. - */ - if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && - busiest->group_no_capacity) - goto force_balance; - - /* Misfit tasks should be dealt with regardless of the avg load */ - if (busiest->group_type == group_misfit_task) goto force_balance; /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. */ - if (local->avg_load >= busiest->avg_load) + if (local->group_type > busiest->group_type) goto out_balanced; /* - * Don't pull any tasks if this group is already above the domain - * average load. + * When groups are overloaded, use the avg_load to ensure fairness + * between tasks. */ - if (local->avg_load >= sds.avg_load) - goto out_balanced; - - if (env->idle == CPU_IDLE) { + if (local->group_type == group_overloaded) { /* - * This CPU is idle. If the busiest group is not overloaded - * and there is no imbalance between this and busiest group - * wrt idle CPUs, it is balanced. The imbalance becomes - * significant if the diff is greater than 1 otherwise we - * might end up to just move the imbalance on another group + * If the local group is more loaded than the selected + * busiest group don't try to pull any tasks. */ - if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + if (local->avg_load >= busiest->avg_load) goto out_balanced; - } else { + + /* XXX broken for overlapping NUMA groups */ + sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) / + sds.total_capacity; + /* - * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use - * imbalance_pct to be conservative. + * Don't pull any tasks if this group is already above the + * domain average load. + */ + if (local->avg_load >= sds.avg_load) + goto out_balanced; + + /* + * If the busiest group is more loaded, use imbalance_pct to be + * conservative. */ if (100 * busiest->avg_load <= env->sd->imbalance_pct * local->avg_load) goto out_balanced; } + /* Try to move all excess tasks to child's sibling domain */ + if (sds.prefer_sibling && local->group_type == group_has_spare && + busiest->sum_nr_running > local->sum_nr_running + 1) + goto force_balance; + + if (busiest->group_type != group_overloaded) { + if (env->idle == CPU_NOT_IDLE) + /* + * If the busiest group is not overloaded (and as a + * result the local one too) but this CPU is already + * busy, let another idle CPU try to pull task. + */ + goto out_balanced; + + if (busiest->group_weight > 1 && + local->idle_cpus <= (busiest->idle_cpus + 1)) + /* + * If the busiest group is not overloaded + * and there is no imbalance between this and busiest + * group wrt idle CPUs, it is balanced. The imbalance + * becomes significant if the diff is greater than 1 + * otherwise we might end up to just move the imbalance + * on another group. Of course this applies only if + * there is more than 1 CPU per group. + */ + goto out_balanced; + + if (busiest->sum_h_nr_running == 1) + /* + * busiest doesn't have any tasks waiting to run + */ + goto out_balanced; + } + force_balance: /* Looks like there is an imbalance. Compute it */ - env->src_grp_type = busiest->group_type; calculate_imbalance(env, &sds); return env->imbalance ? sds.busiest : NULL; @@ -9577,11 +9810,18 @@ struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long busiest_load = 0, busiest_capacity = 1; - int i; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; + int i, done = 0; + + trace_android_rvh_find_busiest_queue(env->dst_cpu, group, env->cpus, + &busiest, &done); + if (done) + return busiest; for_each_cpu_and(i, sched_group_span(group), env->cpus) { - unsigned long capacity, wl; + unsigned long capacity, load, util; + unsigned int nr_running; enum fbq_type rt; rq = cpu_rq(i); @@ -9609,20 +9849,8 @@ if (rt > env->fbq_type) continue; - /* - * For ASYM_CPUCAPACITY domains with misfit tasks we simply - * seek the "biggest" misfit task. - */ - if (env->src_grp_type == group_misfit_task) { - if (rq->misfit_task_load > busiest_load) { - busiest_load = rq->misfit_task_load; - busiest = rq; - } - - continue; - } - capacity = capacity_of(i); + nr_running = rq->cfs.h_nr_running; /* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could @@ -9632,35 +9860,77 @@ */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && capacity_of(env->dst_cpu) < capacity && - rq->nr_running == 1) + nr_running == 1) continue; - wl = weighted_cpuload(rq); + switch (env->migration_type) { + case migrate_load: + /* + * When comparing with load imbalance, use cpu_load() + * which is not scaled with the CPU capacity. + */ + load = cpu_load(rq); - /* - * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the CPU capacity. - */ + if (nr_running == 1 && load > env->imbalance && + !check_cpu_capacity(rq, env->sd)) + break; - if (rq->nr_running == 1 && wl > env->imbalance && - !check_cpu_capacity(rq, env->sd)) - continue; + /* + * For the load comparisons with the other CPUs, + * consider the cpu_load() scaled with the CPU + * capacity, so that the load can be moved away + * from the CPU that is potentially running at a + * lower capacity. + * + * Thus we're looking for max(load_i / capacity_i), + * crosswise multiplication to rid ourselves of the + * division works out to: + * load_i * capacity_j > load_j * capacity_i; + * where j is our previous maximum. + */ + if (load * busiest_capacity > busiest_load * capacity) { + busiest_load = load; + busiest_capacity = capacity; + busiest = rq; + } + break; - /* - * For the load comparisons with the other CPU's, consider - * the weighted_cpuload() scaled with the CPU capacity, so - * that the load can be moved away from the CPU that is - * potentially running at a lower capacity. - * - * Thus we're looking for max(wl_i / capacity_i), crosswise - * multiplication to rid ourselves of the division works out - * to: wl_i * capacity_j > wl_j * capacity_i; where j is - * our previous maximum. - */ - if (wl * busiest_capacity > busiest_load * capacity) { - busiest_load = wl; - busiest_capacity = capacity; - busiest = rq; + case migrate_util: + util = cpu_util(cpu_of(rq)); + + /* + * Don't try to pull utilization from a CPU with one + * running task. Whatever its utilization, we will fail + * detach the task. + */ + if (nr_running <= 1) + continue; + + if (busiest_util < util) { + busiest_util = util; + busiest = rq; + } + break; + + case migrate_task: + if (busiest_nr < nr_running) { + busiest_nr = nr_running; + busiest = rq; + } + break; + + case migrate_misfit: + /* + * For ASYM_CPUCAPACITY domains with misfit tasks we + * simply seek the "biggest" misfit task. + */ + if (rq->misfit_task_load > busiest_load) { + busiest_load = rq->misfit_task_load; + busiest = rq; + } + + break; + } } @@ -9673,21 +9943,25 @@ */ #define MAX_PINNED_INTERVAL 512 -static int need_active_balance(struct lb_env *env) +static inline bool +asym_active_balance(struct lb_env *env) +{ + /* + * ASYM_PACKING needs to force migrate tasks from busy but + * lower priority CPUs in order to pack all tasks in the + * highest priority CPUs. + */ + return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(env->dst_cpu, env->src_cpu); +} + +static inline bool +voluntary_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; - if (env->idle == CPU_NEWLY_IDLE) { - - /* - * ASYM_PACKING needs to force migrate tasks from busy but - * lower priority CPUs in order to pack all tasks in the - * highest priority CPUs. - */ - if ((sd->flags & SD_ASYM_PACKING) && - sched_asym_prefer(env->dst_cpu, env->src_cpu)) - return 1; - } + if (asym_active_balance(env)) + return 1; /* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. @@ -9702,19 +9976,18 @@ return 1; } - if (env->src_grp_type == group_misfit_task) + if (env->migration_type == migrate_misfit) return 1; - if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && - env->src_rq->cfs.h_nr_running == 1 && - cpu_overutilized(env->src_cpu) && - !cpu_overutilized(env->dst_cpu)) { - return 1; - } + return 0; +} - if (env->src_grp_type == group_overloaded && env->src_rq->misfit_task_load) - return 1; +static int need_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + if (voluntary_active_balance(env)) + return 1; return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -9724,7 +9997,17 @@ static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - int cpu, balance_cpu = -1; + int cpu; + + if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) { + struct root_domain *rd = env->dst_rq->rd; + struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask(); + int level = rockchip_perf_get_level(); + + if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) && + cpul_mask && cpumask_test_cpu(env->dst_cpu, cpul_mask)) + return 0; + } /* * Ensure the balancing environment is consistent; can happen @@ -9745,18 +10028,12 @@ if (!idle_cpu(cpu)) continue; - balance_cpu = cpu; - break; + /* Are we the first idle CPU? */ + return cpu == env->dst_cpu; } - if (balance_cpu == -1) - balance_cpu = group_balance_cpu(sg); - - /* - * First idle CPU or the first CPU(busiest) in this sched group - * is eligible for doing load balancing at this and above domains. - */ - return balance_cpu == env->dst_cpu; + /* Are we the first CPU of this group ? */ + return group_balance_cpu(sg) == env->dst_cpu; } /* @@ -9778,7 +10055,7 @@ .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, - .dst_grpmask = sched_group_span(sd->groups), + .dst_grpmask = group_balance_mask(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, @@ -9828,6 +10105,7 @@ more_balance: rq_lock_irqsave(busiest, &rf); + env.src_rq_rf = &rf; update_rq_clock(busiest); /* @@ -9880,7 +10158,7 @@ if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { /* Prevent to re-select dst_cpu via env's CPUs */ - cpumask_clear_cpu(env.dst_cpu, env.cpus); + __cpumask_clear_cpu(env.dst_cpu, env.cpus); env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; @@ -9907,7 +10185,7 @@ /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { - cpumask_clear_cpu(cpu_of(busiest), cpus); + __cpumask_clear_cpu(cpu_of(busiest), cpus); /* * Attempting to continue load balancing at the current * sched_domain level only makes sense if there are @@ -9934,8 +10212,7 @@ * excessive cache_hot migrations and active balances. */ if (idle != CPU_NEWLY_IDLE) - if (env.src_grp_nr_running > 1) - sd->nr_balance_failed++; + sd->nr_balance_failed++; if (need_active_balance(&env)) { unsigned long flags; @@ -9947,7 +10224,7 @@ * if the curr task on busiest CPU can't be * moved to this_cpu: */ - if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { raw_spin_unlock_irqrestore(&busiest->lock, flags); env.flags |= LBF_ALL_PINNED; @@ -9978,7 +10255,7 @@ } else sd->nr_balance_failed = 0; - if (likely(!active_balance)) { + if (likely(!active_balance) || voluntary_active_balance(&env)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; } else { @@ -10021,18 +10298,18 @@ ld_moved = 0; /* - * idle_balance() disregards balance intervals, so we could repeatedly - * reach this code, which would lead to balance_interval skyrocketting - * in a short amount of time. Skip the balance_interval increase logic - * to avoid that. + * newidle_balance() disregards balance intervals, so we could + * repeatedly reach this code, which would lead to balance_interval + * skyrocketting in a short amount of time. Skip the balance_interval + * increase logic to avoid that. */ if (env.idle == CPU_NEWLY_IDLE) goto out; /* tune up the balancing interval */ - if (((env.flags & LBF_ALL_PINNED) && - sd->balance_interval < MAX_PINNED_INTERVAL) || - (sd->balance_interval < sd->max_interval)) + if ((env.flags & LBF_ALL_PINNED && + sd->balance_interval < MAX_PINNED_INTERVAL) || + sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; out: return ld_moved; @@ -10048,6 +10325,15 @@ /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); + + /* + * Reduce likelihood of busy balancing at higher domains racing with + * balancing at lower domains by preventing their balancing periods + * from being multiples of each other. + */ + if (cpu_busy) + interval -= 1; + interval = clamp(interval, 1UL, max_load_balance_interval); return interval; @@ -10110,9 +10396,8 @@ /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; + if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; } if (likely(sd)) { @@ -10130,6 +10415,7 @@ * about DST_PINNED. */ .flags = LBF_DST_PINNED, + .src_rq_rf = &rf, }; schedstat_inc(sd->alb_count); @@ -10165,7 +10451,7 @@ */ void update_max_interval(void) { - max_load_balance_interval = HZ*num_online_cpus()/10; + max_load_balance_interval = HZ*num_active_cpus()/10; } /* @@ -10178,6 +10464,7 @@ { int continue_balancing = 1; int cpu = rq->cpu; + int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ @@ -10185,6 +10472,10 @@ int update_next_balance = 0; int need_serialize, need_decay = 0; u64 max_cost = 0; + + trace_android_rvh_sched_rebalance_domains(rq, &continue_balancing); + if (!continue_balancing) + return; rcu_read_lock(); for_each_domain(cpu, sd) { @@ -10200,9 +10491,6 @@ } max_cost += sd->max_newidle_lb_cost; - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - /* * Stop the load balance at this level. There is another * CPU in our sched group which is doing load balancing more @@ -10214,7 +10502,7 @@ break; } - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { @@ -10230,9 +10518,10 @@ * state even if we migrated tasks. Update it. */ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; + busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; - interval = get_sd_balance_interval(sd, idle != CPU_IDLE); + interval = get_sd_balance_interval(sd, busy); } if (need_serialize) spin_unlock(&balancing); @@ -10292,7 +10581,11 @@ static inline int find_new_ilb(void) { - int ilb; + int ilb = -1; + + trace_android_rvh_find_new_ilb(nohz.idle_cpus_mask, &ilb); + if (ilb >= 0) + return ilb; for_each_cpu_and(ilb, nohz.idle_cpus_mask, housekeeping_cpumask(HK_FLAG_MISC)) { @@ -10323,29 +10616,25 @@ if (ilb_cpu >= nr_cpu_ids) return; + /* + * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets + * the first flag owns it; cleared by nohz_csd_func(). + */ flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); if (flags & NOHZ_KICK_MASK) return; /* - * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target CPU which + * This way we generate an IPI on the target CPU which * is idle. And the softirq performing nohz idle load balance * will be run before returning from the IPI. */ - smp_send_reschedule(ilb_cpu); + smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); } /* - * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu in the system. - * - This rq has more than one task. - * - This rq has at least one CFS task and the capacity of the CPU is - * significantly reduced because of RT tasks or IRQs. - * - At parent of LLC scheduler domain level, this cpu's scheduler group has - * multiple busy cpu. - * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler - * domain span are idle. + * Current decision point for kicking the idle load balancer in the presence + * of idle CPUs in the system. */ static void nohz_balancer_kick(struct rq *rq) { @@ -10354,6 +10643,7 @@ struct sched_domain *sd; int nr_busy, i, cpu = rq->cpu; unsigned int flags = 0; + int done = 0; if (unlikely(rq->idle_balance)) return; @@ -10378,30 +10668,25 @@ if (time_before(now, nohz.next_balance)) goto out; - if (rq->nr_running >= 2 || rq->misfit_task_load) { + trace_android_rvh_sched_nohz_balancer_kick(rq, &flags, &done); + if (done) + goto out; + + if (rq->nr_running >= 2) { flags = NOHZ_KICK_MASK; goto out; } rcu_read_lock(); - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - if (sds) { - /* - * XXX: write a coherent comment on why we do this. - * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com - */ - nr_busy = atomic_read(&sds->nr_busy_cpus); - if (nr_busy > 1) { - flags = NOHZ_KICK_MASK; - goto unlock; - } - - } sd = rcu_dereference(rq->sd); if (sd) { - if ((rq->cfs.h_nr_running >= 1) && - check_cpu_capacity(rq, sd)) { + /* + * If there's a CFS task and the current CPU has reduced + * capacity; kick the ILB to see if there's a better CPU to run + * on. + */ + if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_KICK_MASK; goto unlock; } @@ -10409,15 +10694,55 @@ sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); if (sd) { - for_each_cpu(i, sched_domain_span(sd)) { - if (i == cpu || - !cpumask_test_cpu(i, nohz.idle_cpus_mask)) - continue; - + /* + * When ASYM_PACKING; see if there's a more preferred CPU + * currently idle; in which case, kick the ILB to move tasks + * around. + */ + for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { if (sched_asym_prefer(i, cpu)) { flags = NOHZ_KICK_MASK; goto unlock; } + } + } + + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); + if (sd) { + /* + * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU + * to run the misfit task on. + */ + if (check_misfit_status(rq, sd)) { + flags = NOHZ_KICK_MASK; + goto unlock; + } + + /* + * For asymmetric systems, we do not want to nicely balance + * cache use, instead we want to embrace asymmetry and only + * ensure tasks have enough CPU capacity. + * + * Skip the LLC logic because it's not relevant in that case. + */ + goto unlock; + } + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) { + /* + * If there is an imbalance between LLC domains (IOW we could + * increase the overall cache use), we need some less-loaded LLC + * domain to pull some load. Likewise, we may need to spread + * load within the current LLC domain (e.g. packed SMT cores but + * other CPUs are idle). We can't really know from here how busy + * the others are - so just get a nohz balance going if it looks + * like this LLC domain has tasks we could move. + */ + nr_busy = atomic_read(&sds->nr_busy_cpus); + if (nr_busy > 1) { + flags = NOHZ_KICK_MASK; + goto unlock; } } unlock: @@ -10483,9 +10808,20 @@ SCHED_WARN_ON(cpu != smp_processor_id()); - /* If this CPU is going down, then nothing needs to be done: */ - if (!cpu_active(cpu)) + if (!cpu_active(cpu)) { + /* + * A CPU can be paused while it is idle with it's tick + * stopped. nohz_balance_exit_idle() should be called + * from the local CPU, so it can't be called during + * pause. This results in paused CPU participating in + * the nohz idle balance, which should be avoided. + * + * When the paused CPU exits idle and enters again, + * exempt the paused CPU from nohz_balance_exit_idle. + */ + nohz_balance_exit_idle(rq); return; + } /* Spare idle load balancing on CPUs that don't want to be disturbed: */ if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) @@ -10598,7 +10934,6 @@ rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - cpu_load_update_idle(rq); rq_unlock_irqrestore(rq, &rf); if (flags & NOHZ_BALANCE_KICK) @@ -10648,22 +10983,14 @@ */ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { - int this_cpu = this_rq->cpu; - unsigned int flags; + unsigned int flags = this_rq->nohz_idle_balance; - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) + if (!flags) return false; - if (idle != CPU_IDLE) { - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - return false; - } + this_rq->nohz_idle_balance = 0; - /* - * barrier, pairs with nohz_balance_enter_idle(), ensures ... - */ - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - if (!(flags & NOHZ_KICK_MASK)) + if (idle != CPU_IDLE) return false; _nohz_idle_balance(this_rq, flags, idle); @@ -10717,15 +11044,26 @@ /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. + * + * Returns: + * < 0 - we released the lock and there are !fair tasks present + * 0 - failed, no new tasks + * > 0 - success, new (fair) tasks present */ -static int idle_balance(struct rq *this_rq, struct rq_flags *rf) +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; struct sched_domain *sd; int pulled_task = 0; u64 curr_cost = 0; + int done = 0; + trace_android_rvh_sched_newidle_balance(this_rq, rf, &pulled_task, &done); + if (done) + return pulled_task; + + update_misfit_status(NULL, this_rq); /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -10767,9 +11105,6 @@ for_each_domain(this_cpu, sd) { int continue_balancing = 1; u64 t0, domain_cost; - - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); @@ -10960,6 +11295,9 @@ if (!task_on_rq_queued(p)) return; + if (rq->cfs.nr_running == 1) + return; + /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -11038,7 +11376,7 @@ /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } @@ -11056,8 +11394,8 @@ /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); - attach_entity_load_avg(cfs_rq, se, 0); - update_tg_load_avg(cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } @@ -11116,9 +11454,19 @@ * This routine is mostly called to set cfs_rq->curr field when a task * migrates between groups/classes. */ -static void set_curr_task_fair(struct rq *rq) +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { - struct sched_entity *se = &rq->curr->se; + struct sched_entity *se = &p->se; + +#ifdef CONFIG_SMP + if (task_on_rq_queued(p)) { + /* + * Move the next running task to the front of the list, so our + * cfs_tasks list becomes MRU one. + */ + list_move(&se->group_node, &rq->cfs_tasks); + } +#endif for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -11379,8 +11727,8 @@ /* * All the scheduling class methods: */ -const struct sched_class fair_sched_class = { - .next = &idle_sched_class, +const struct sched_class fair_sched_class + __section("__fair_sched_class") = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, @@ -11388,10 +11736,12 @@ .check_preempt_curr = check_preempt_wakeup, - .pick_next_task = pick_next_task_fair, + .pick_next_task = __pick_next_task_fair, .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, #ifdef CONFIG_SMP + .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -11402,7 +11752,6 @@ .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_fork = task_fork_fair, @@ -11472,3 +11821,101 @@ #endif /* SMP */ } + +/* + * Helper functions to facilitate extracting info from tracepoints. + */ + +const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) +{ +#ifdef CONFIG_SMP + return cfs_rq ? &cfs_rq->avg : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); + +char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) +{ + if (!cfs_rq) { + if (str) + strlcpy(str, "(null)", len); + else + return NULL; + } + + cfs_rq_tg_path(cfs_rq, str, len); + return str; +} +EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); + +int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) +{ + return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); + +const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq ? &rq->avg_rt : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); + +const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq ? &rq->avg_dl : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); + +const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) + return rq ? &rq->avg_irq : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); + +int sched_trace_rq_cpu(struct rq *rq) +{ + return rq ? cpu_of(rq) : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); + +int sched_trace_rq_cpu_capacity(struct rq *rq) +{ + return rq ? +#ifdef CONFIG_SMP + rq->cpu_capacity +#else + SCHED_CAPACITY_SCALE +#endif + : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity); + +const struct cpumask *sched_trace_rd_span(struct root_domain *rd) +{ +#ifdef CONFIG_SMP + return rd ? rd->span : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rd_span); + +int sched_trace_rq_nr_running(struct rq *rq) +{ + return rq ? rq->nr_running : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running); -- Gitblit v1.6.2