2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/kernel/sched/fair.c
....@@ -20,12 +20,11 @@
2020 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
2121 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
2222 */
23
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
24
-#include <linux/cpufreq.h>
25
-#endif
2623 #include "sched.h"
2724
28
-#include <trace/events/sched.h>
25
+#include <trace/hooks/sched.h>
26
+
27
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_runtime);
2928
3029 /*
3130 * Targeted preemption latency for CPU-bound tasks:
....@@ -41,17 +40,8 @@
4140 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
4241 */
4342 unsigned int sysctl_sched_latency = 6000000ULL;
44
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
45
-
46
-/*
47
- * Enable/disable honoring sync flag in energy-aware wakeups.
48
- */
49
-unsigned int sysctl_sched_sync_hint_enable = 1;
50
-
51
-/*
52
- * Enable/disable using cstate knowledge in idle sibling selection
53
- */
54
-unsigned int sysctl_sched_cstate_aware = 1;
43
+EXPORT_SYMBOL_GPL(sysctl_sched_latency);
44
+static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
5545
5646 /*
5747 * The initial- and re-scaling of tunables is configurable
....@@ -71,8 +61,9 @@
7161 *
7262 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
7363 */
74
-unsigned int sysctl_sched_min_granularity = 750000ULL;
75
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
64
+unsigned int sysctl_sched_min_granularity = 750000ULL;
65
+EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity);
66
+static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
7667
7768 /*
7869 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
....@@ -94,10 +85,23 @@
9485 *
9586 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
9687 */
97
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
98
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
88
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
89
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
9990
10091 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
92
+
93
+int sched_thermal_decay_shift;
94
+static int __init setup_sched_thermal_decay_shift(char *str)
95
+{
96
+ int _shift = 0;
97
+
98
+ if (kstrtoint(str, 0, &_shift))
99
+ pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
100
+
101
+ sched_thermal_decay_shift = clamp(_shift, 0, 10);
102
+ return 1;
103
+}
104
+__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
101105
102106 #ifdef CONFIG_SMP
103107 /*
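Reviewer note: the setup_sched_thermal_decay_shift() handler added above parses the sched_thermal_decay_shift= boot argument with kstrtoint() and clamps the result to [0, 10], warning and keeping 0 when parsing fails. A minimal userspace sketch of the same parse-and-clamp flow, using strtol() in place of the kernel helpers (clamp_int() below is an illustrative stand-in for clamp()):

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for the kernel's clamp() macro. */
static int clamp_int(int v, int lo, int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	const char *arg = "3";   /* as if booted with sched_thermal_decay_shift=3 */
	char *end;
	int shift = (int)strtol(arg, &end, 0);

	if (end == arg) {
		fprintf(stderr, "Unable to parse thermal pressure decay shift\n");
		shift = 0;       /* same effect as the kernel path: warn, keep 0 */
	}

	printf("sched_thermal_decay_shift = %d\n", clamp_int(shift, 0, 10));
	return 0;
}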
....@@ -107,6 +111,14 @@
107111 {
108112 return -cpu;
109113 }
114
+
115
+/*
116
+ * The margin used when comparing utilization with CPU capacity.
117
+ *
118
+ * (default: ~20%)
119
+ */
120
+#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
121
+
110122 #endif
111123
112124 #ifdef CONFIG_CFS_BANDWIDTH
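The fits_capacity() macro introduced above encodes the ~20% headroom purely in integer math: utilization fits a CPU only while util * 1280 < capacity * 1024, i.e. util stays below capacity / 1.25. A self-contained demo of where the cut-off lands for a full-size CPU (the utilization values are made up):

#include <stdio.h>

/* Same margin as the hunk above: util must stay below ~80% of capacity. */
#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)

int main(void)
{
	unsigned long capacity = 1024;  /* a full-capacity CPU */

	/* 800 * 1280 = 1024000 <  1048576 -> fits   */
	printf("util 800 fits: %d\n", fits_capacity(800UL, capacity));
	/* 820 * 1280 = 1049600 >= 1048576 -> too big */
	printf("util 820 fits: %d\n", fits_capacity(820UL, capacity));
	return 0;
}

The break-even point is 1048576 / 1280 = 819.2, so anything at or above 820 is treated as not fitting.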
....@@ -122,18 +134,6 @@
122134 */
123135 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
124136 #endif
125
-
126
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
127
-unsigned int sysctl_sched_performance_bias = 1;
128
-#endif
129
-
130
-/*
131
- * The margin used when comparing utilization with CPU capacity:
132
- * util * margin < capacity * 1024
133
- *
134
- * (default: ~20%)
135
- */
136
-unsigned int capacity_margin = 1280;
137137
138138 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
139139 {
....@@ -195,7 +195,7 @@
195195 #undef SET_SYSCTL
196196 }
197197
198
-void sched_init_granularity(void)
198
+void __init sched_init_granularity(void)
199199 {
200200 update_sysctl();
201201 }
....@@ -246,8 +246,7 @@
246246 }
247247 }
248248
249
- /* hint to use a 32x32->64 mul */
250
- fact = (u64)(u32)fact * lw->inv_weight;
249
+ fact = mul_u32_u32(fact, lw->inv_weight);
251250
252251 while (fact >> 32) {
253252 fact >>= 1;
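The replaced line spelled the 32x32->64 multiplication out as a cast; mul_u32_u32() names the intent and lets architectures supply an optimized version. A plain-C equivalent of what the generic helper amounts to, applied to the same kind of fact * inv_weight operands:

#include <stdint.h>
#include <stdio.h>

/* Both operands are 32-bit, the result is 64-bit, so the product cannot
 * overflow and 32-bit targets can use a single widening multiply. */
static inline uint64_t mul_u32_u32(uint32_t a, uint32_t b)
{
	return (uint64_t)a * b;
}

int main(void)
{
	uint32_t fact = 0xffffffffu;        /* worst-case scale factor */
	uint32_t inv_weight = 0xffffffffu;  /* worst-case inverse weight */

	printf("%llu\n", (unsigned long long)mul_u32_u32(fact, inv_weight));
	return 0;
}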
....@@ -290,6 +289,19 @@
290289 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
291290 {
292291 return grp->my_q;
292
+}
293
+
294
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
295
+{
296
+ if (!path)
297
+ return;
298
+
299
+ if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
300
+ autogroup_path(cfs_rq->tg, path, len);
301
+ else if (cfs_rq && cfs_rq->tg->css.cgroup)
302
+ cgroup_path(cfs_rq->tg->css.cgroup, path, len);
303
+ else
304
+ strlcpy(path, "(null)", len);
293305 }
294306
295307 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
....@@ -466,6 +478,12 @@
466478 return NULL;
467479 }
468480
481
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
482
+{
483
+ if (path)
484
+ strlcpy(path, "(null)", len);
485
+}
486
+
469487 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
470488 {
471489 return true;
....@@ -567,6 +585,7 @@
567585 struct sched_entity *entry;
568586 bool leftmost = true;
569587
588
+ trace_android_rvh_enqueue_entity(cfs_rq, se);
570589 /*
571590 * Find the right place in the rbtree:
572591 */
....@@ -592,6 +611,7 @@
592611
593612 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
594613 {
614
+ trace_android_rvh_dequeue_entity(cfs_rq, se);
595615 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
596616 }
597617
....@@ -631,8 +651,7 @@
631651 */
632652
633653 int sched_proc_update_handler(struct ctl_table *table, int write,
634
- void __user *buffer, size_t *lenp,
635
- loff_t *ppos)
654
+ void *buffer, size_t *lenp, loff_t *ppos)
636655 {
637656 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
638657 unsigned int factor = get_update_sysctl_factor();
....@@ -689,7 +708,13 @@
689708 */
690709 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
691710 {
692
- u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
711
+ unsigned int nr_running = cfs_rq->nr_running;
712
+ u64 slice;
713
+
714
+ if (sched_feat(ALT_PERIOD))
715
+ nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
716
+
717
+ slice = __sched_period(nr_running + !se->on_rq);
693718
694719 for_each_sched_entity(se) {
695720 struct load_weight *load;
....@@ -706,6 +731,10 @@
706731 }
707732 slice = __calc_delta(slice, se->load.weight, load);
708733 }
734
+
735
+ if (sched_feat(BASE_SLICE))
736
+ slice = max(slice, (u64)sysctl_sched_min_granularity);
737
+
709738 return slice;
710739 }
711740
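With the two hunks above, sched_slice() can size the period from the root runqueue's h_nr_running instead of the local cfs_rq (ALT_PERIOD) and refuses to hand out a slice shorter than sysctl_sched_min_granularity (BASE_SLICE). A single-level sketch of that arithmetic, skipping the for_each_sched_entity() walk; sched_period() below mirrors what __sched_period() is commonly defined as, and the constants follow the defaults shown earlier in this file:

#include <stdio.h>

typedef unsigned long long u64;

static const u64 sysctl_sched_latency         = 6000000ULL;  /* 6 ms    */
static const u64 sysctl_sched_min_granularity = 750000ULL;   /* 0.75 ms */
static const unsigned int sched_nr_latency    = 8;           /* latency / min_gran */

/* Stretch the period once more tasks are runnable than fit in the
 * default latency target. */
static u64 sched_period(unsigned int nr_running)
{
	if (nr_running > sched_nr_latency)
		return (u64)nr_running * sysctl_sched_min_granularity;
	return sysctl_sched_latency;
}

/* One entity's weight share of the period, with the BASE_SLICE floor. */
static u64 slice(unsigned int nr_running, unsigned long weight,
		 unsigned long total_weight, int base_slice)
{
	u64 s = sched_period(nr_running) * weight / total_weight;

	if (base_slice && s < sysctl_sched_min_granularity)
		s = sysctl_sched_min_granularity;
	return s;
}

int main(void)
{
	/* 16 runnable tasks and a task holding 1/32 of the weight: the raw
	 * share (375 us) is lifted to the 750 us floor by BASE_SLICE. */
	printf("%llu ns\n", slice(16, 1024, 32 * 1024, 1));
	return 0;
}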
....@@ -734,26 +763,17 @@
734763 memset(sa, 0, sizeof(*sa));
735764
736765 /*
737
- * Tasks are intialized with full load to be seen as heavy tasks until
766
+ * Tasks are initialized with full load to be seen as heavy tasks until
738767 * they get a chance to stabilize to their real load level.
739
- * Group entities are intialized with zero load to reflect the fact that
768
+ * Group entities are initialized with zero load to reflect the fact that
740769 * nothing has been attached to the task group yet.
741770 */
742771 if (entity_is_task(se))
743
- sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
772
+ sa->load_avg = scale_load_down(se->load.weight);
744773
745
- se->runnable_weight = se->load.weight;
746
-
747
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
748
- if (sysctl_sched_performance_bias) {
749
- sa->util_avg = SCHED_CAPACITY_SCALE >> 1;
750
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
751
- }
752
-#endif
753774 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
754775 }
755776
756
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
757777 static void attach_entity_cfs_rq(struct sched_entity *se);
758778
759779 /*
....@@ -782,18 +802,15 @@
782802 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
783803 * if util_avg > util_avg_cap.
784804 */
785
-void post_init_entity_util_avg(struct sched_entity *se)
805
+void post_init_entity_util_avg(struct task_struct *p)
786806 {
807
+ struct sched_entity *se = &p->se;
787808 struct cfs_rq *cfs_rq = cfs_rq_of(se);
788809 struct sched_avg *sa = &se->avg;
789
- long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
810
+ long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
790811 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
791812
792
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
793
- if (!sysctl_sched_performance_bias && (cap > 0)) {
794
-#else
795813 if (cap > 0) {
796
-#endif
797814 if (cfs_rq->avg.util_avg != 0) {
798815 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
799816 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
....@@ -805,24 +822,25 @@
805822 }
806823 }
807824
808
- if (entity_is_task(se)) {
809
- struct task_struct *p = task_of(se);
810
- if (p->sched_class != &fair_sched_class) {
811
- /*
812
- * For !fair tasks do:
813
- *
814
- update_cfs_rq_load_avg(now, cfs_rq);
815
- attach_entity_load_avg(cfs_rq, se, 0);
816
- switched_from_fair(rq, p);
817
- *
818
- * such that the next switched_to_fair() has the
819
- * expected state.
820
- */
821
- se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
822
- return;
823
- }
825
+ sa->runnable_avg = sa->util_avg;
826
+
827
+ if (p->sched_class != &fair_sched_class) {
828
+ /*
829
+ * For !fair tasks do:
830
+ *
831
+ update_cfs_rq_load_avg(now, cfs_rq);
832
+ attach_entity_load_avg(cfs_rq, se);
833
+ switched_from_fair(rq, p);
834
+ *
835
+ * such that the next switched_to_fair() has the
836
+ * expected state.
837
+ */
838
+ se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
839
+ return;
824840 }
825841
842
+ /* Hook before this se's util is attached to cfs_rq's util */
843
+ trace_android_rvh_post_init_entity_util_avg(se);
826844 attach_entity_cfs_rq(se);
827845 }
828846
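The path above seeds a freshly created task's util_avg from its cfs_rq: half of the CPU's remaining capacity acts as the cap, and when the queue already carries utilization the seed is the task's weight share of it. A worked numeric example of that arithmetic (all inputs invented):

#include <stdio.h>

int main(void)
{
	long cpu_scale    = 1024;  /* arch_scale_cpu_capacity() of this CPU */
	long cfs_util_avg = 400;   /* utilization already on the queue      */
	long cfs_load_avg = 800;   /* load already on the queue             */
	long se_weight    = 1024;  /* new task at nice 0                    */

	/* cap = half of the spare capacity left on the CPU */
	long cap = (cpu_scale - cfs_util_avg) / 2;                      /* 312 */

	/* seed proportional to the task's share of the queue load */
	long util_avg = cfs_util_avg * se_weight / (cfs_load_avg + 1);  /* 511 */

	if (util_avg > cap)
		util_avg = cap;                         /* clamped down to 312 */

	printf("initial util_avg = %ld\n", util_avg);
	return 0;
}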
....@@ -830,10 +848,10 @@
830848 void init_entity_runnable_average(struct sched_entity *se)
831849 {
832850 }
833
-void post_init_entity_util_avg(struct sched_entity *se)
851
+void post_init_entity_util_avg(struct task_struct *p)
834852 {
835853 }
836
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
854
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
837855 {
838856 }
839857 #endif /* CONFIG_SMP */
....@@ -983,7 +1001,6 @@
9831001 }
9841002
9851003 trace_sched_stat_blocked(tsk, delta);
986
- trace_sched_blocked_reason(tsk);
9871004
9881005 /*
9891006 * Blocking time is in units of nanosecs, so shift by
....@@ -1078,7 +1095,7 @@
10781095 unsigned int sysctl_numa_balancing_scan_delay = 1000;
10791096
10801097 struct numa_group {
1081
- atomic_t refcount;
1098
+ refcount_t refcount;
10821099
10831100 spinlock_t lock; /* nr_tasks, tasks */
10841101 int nr_tasks;
....@@ -1094,7 +1111,7 @@
10941111 * more by CPU use than by memory faults.
10951112 */
10961113 unsigned long *faults_cpu;
1097
- unsigned long faults[0];
1114
+ unsigned long faults[];
10981115 };
10991116
11001117 /*
....@@ -1164,7 +1181,7 @@
11641181 unsigned long shared = group_faults_shared(ng);
11651182 unsigned long private = group_faults_priv(ng);
11661183
1167
- period *= atomic_read(&ng->refcount);
1184
+ period *= refcount_read(&ng->refcount);
11681185 period *= shared + 1;
11691186 period /= private + shared + 1;
11701187 }
....@@ -1189,7 +1206,7 @@
11891206 unsigned long private = group_faults_priv(ng);
11901207 unsigned long period = smax;
11911208
1192
- period *= atomic_read(&ng->refcount);
1209
+ period *= refcount_read(&ng->refcount);
11931210 period *= shared + 1;
11941211 period /= private + shared + 1;
11951212
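Both scan-period helpers above stretch the period by how the numa_group's faults split between shared and private pages, scaled by how many tasks hold a reference on the group. A quick numeric illustration with invented fault counts:

#include <stdio.h>

int main(void)
{
	unsigned long base    = 1000;  /* base scan period in ms (smin/smax in the kernel) */
	unsigned long refs    = 4;     /* tasks holding a reference on the numa_group      */
	unsigned long shared  = 300;   /* faults on pages other tasks also touch           */
	unsigned long private = 100;   /* faults on pages only this task touches           */

	unsigned long period = base;
	period *= refs;                    /* more tasks in the group -> scan less often each */
	period *= shared + 1;
	period /= private + shared + 1;    /* mostly shared faults -> longer period           */

	printf("scan period = %lu ms\n", period);   /* 4000 * 301 / 401 = 3002 */
	return 0;
}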
....@@ -1199,56 +1216,15 @@
11991216 return max(smin, smax);
12001217 }
12011218
1202
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
1203
-{
1204
- int mm_users = 0;
1205
- struct mm_struct *mm = p->mm;
1206
-
1207
- if (mm) {
1208
- mm_users = atomic_read(&mm->mm_users);
1209
- if (mm_users == 1) {
1210
- mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1211
- mm->numa_scan_seq = 0;
1212
- }
1213
- }
1214
- p->node_stamp = 0;
1215
- p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
1216
- p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1217
- p->numa_work.next = &p->numa_work;
1218
- p->numa_faults = NULL;
1219
- RCU_INIT_POINTER(p->numa_group, NULL);
1220
- p->last_task_numa_placement = 0;
1221
- p->last_sum_exec_runtime = 0;
1222
-
1223
- /* New address space, reset the preferred nid */
1224
- if (!(clone_flags & CLONE_VM)) {
1225
- p->numa_preferred_nid = -1;
1226
- return;
1227
- }
1228
-
1229
- /*
1230
- * New thread, keep existing numa_preferred_nid which should be copied
1231
- * already by arch_dup_task_struct but stagger when scans start.
1232
- */
1233
- if (mm) {
1234
- unsigned int delay;
1235
-
1236
- delay = min_t(unsigned int, task_scan_max(current),
1237
- current->numa_scan_period * mm_users * NSEC_PER_MSEC);
1238
- delay += 2 * TICK_NSEC;
1239
- p->node_stamp = delay;
1240
- }
1241
-}
1242
-
12431219 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
12441220 {
1245
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
1221
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
12461222 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
12471223 }
12481224
12491225 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
12501226 {
1251
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1227
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
12521228 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
12531229 }
12541230
....@@ -1474,7 +1450,7 @@
14741450 * two full passes of the "multi-stage node selection" test that is
14751451 * executed below.
14761452 */
1477
- if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
1453
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
14781454 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
14791455 return true;
14801456
....@@ -1527,55 +1503,52 @@
15271503 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
15281504 }
15291505
1530
-static unsigned long weighted_cpuload(struct rq *rq);
1531
-static unsigned long source_load(int cpu, int type);
1532
-static unsigned long target_load(int cpu, int type);
1506
+/*
1507
+ * 'numa_type' describes the node at the moment of load balancing.
1508
+ */
1509
+enum numa_type {
1510
+ /* The node has spare capacity that can be used to run more tasks. */
1511
+ node_has_spare = 0,
1512
+ /*
1513
+ * The node is fully used and the tasks don't compete for more CPU
1514
+ * cycles. Nevertheless, some tasks might wait before running.
1515
+ */
1516
+ node_fully_busy,
1517
+ /*
1518
+ * The node is overloaded and can't provide expected CPU cycles to all
1519
+ * tasks.
1520
+ */
1521
+ node_overloaded
1522
+};
15331523
15341524 /* Cached statistics for all CPUs within a node */
15351525 struct numa_stats {
15361526 unsigned long load;
1537
-
1527
+ unsigned long runnable;
1528
+ unsigned long util;
15381529 /* Total compute capacity of CPUs on a node */
15391530 unsigned long compute_capacity;
1540
-
15411531 unsigned int nr_running;
1532
+ unsigned int weight;
1533
+ enum numa_type node_type;
1534
+ int idle_cpu;
15421535 };
15431536
1544
-/*
1545
- * XXX borrowed from update_sg_lb_stats
1546
- */
1547
-static void update_numa_stats(struct numa_stats *ns, int nid)
1537
+static inline bool is_core_idle(int cpu)
15481538 {
1549
- int smt, cpu, cpus = 0;
1550
- unsigned long capacity;
1539
+#ifdef CONFIG_SCHED_SMT
1540
+ int sibling;
15511541
1552
- memset(ns, 0, sizeof(*ns));
1553
- for_each_cpu(cpu, cpumask_of_node(nid)) {
1554
- struct rq *rq = cpu_rq(cpu);
1542
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1543
+ if (cpu == sibling)
1544
+ continue;
15551545
1556
- ns->nr_running += rq->nr_running;
1557
- ns->load += weighted_cpuload(rq);
1558
- ns->compute_capacity += capacity_of(cpu);
1559
-
1560
- cpus++;
1546
+ if (!idle_cpu(sibling))
1547
+ return false;
15611548 }
1549
+#endif
15621550
1563
- /*
1564
- * If we raced with hotplug and there are no CPUs left in our mask
1565
- * the @ns structure is NULL'ed and task_numa_compare() will
1566
- * not find this node attractive.
1567
- *
1568
- * We'll detect a huge imbalance and bail there.
1569
- */
1570
- if (!cpus)
1571
- return;
1572
-
1573
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1574
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1575
- capacity = cpus / smt; /* cores */
1576
-
1577
- capacity = min_t(unsigned, capacity,
1578
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1551
+ return true;
15791552 }
15801553
15811554 struct task_numa_env {
....@@ -1594,20 +1567,132 @@
15941567 int best_cpu;
15951568 };
15961569
1570
+static unsigned long cpu_load(struct rq *rq);
1571
+static unsigned long cpu_runnable(struct rq *rq);
1572
+static unsigned long cpu_util(int cpu);
1573
+static inline long adjust_numa_imbalance(int imbalance, int nr_running);
1574
+
1575
+static inline enum
1576
+numa_type numa_classify(unsigned int imbalance_pct,
1577
+ struct numa_stats *ns)
1578
+{
1579
+ if ((ns->nr_running > ns->weight) &&
1580
+ (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1581
+ ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
1582
+ return node_overloaded;
1583
+
1584
+ if ((ns->nr_running < ns->weight) ||
1585
+ (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1586
+ ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
1587
+ return node_has_spare;
1588
+
1589
+ return node_fully_busy;
1590
+}
1591
+
1592
+#ifdef CONFIG_SCHED_SMT
1593
+/* Forward declarations of select_idle_sibling helpers */
1594
+static inline bool test_idle_cores(int cpu, bool def);
1595
+static inline int numa_idle_core(int idle_core, int cpu)
1596
+{
1597
+ if (!static_branch_likely(&sched_smt_present) ||
1598
+ idle_core >= 0 || !test_idle_cores(cpu, false))
1599
+ return idle_core;
1600
+
1601
+ /*
1602
+ * Prefer cores instead of packing HT siblings
1603
+ * and triggering future load balancing.
1604
+ */
1605
+ if (is_core_idle(cpu))
1606
+ idle_core = cpu;
1607
+
1608
+ return idle_core;
1609
+}
1610
+#else
1611
+static inline int numa_idle_core(int idle_core, int cpu)
1612
+{
1613
+ return idle_core;
1614
+}
1615
+#endif
1616
+
1617
+/*
1618
+ * Gather all necessary information to make NUMA balancing placement
1619
+ * decisions that are compatible with standard load balancer. This
1620
+ * borrows code and logic from update_sg_lb_stats but sharing a
1621
+ * common implementation is impractical.
1622
+ */
1623
+static void update_numa_stats(struct task_numa_env *env,
1624
+ struct numa_stats *ns, int nid,
1625
+ bool find_idle)
1626
+{
1627
+ int cpu, idle_core = -1;
1628
+
1629
+ memset(ns, 0, sizeof(*ns));
1630
+ ns->idle_cpu = -1;
1631
+
1632
+ rcu_read_lock();
1633
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
1634
+ struct rq *rq = cpu_rq(cpu);
1635
+
1636
+ ns->load += cpu_load(rq);
1637
+ ns->runnable += cpu_runnable(rq);
1638
+ ns->util += cpu_util(cpu);
1639
+ ns->nr_running += rq->cfs.h_nr_running;
1640
+ ns->compute_capacity += capacity_of(cpu);
1641
+
1642
+ if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
1643
+ if (READ_ONCE(rq->numa_migrate_on) ||
1644
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr))
1645
+ continue;
1646
+
1647
+ if (ns->idle_cpu == -1)
1648
+ ns->idle_cpu = cpu;
1649
+
1650
+ idle_core = numa_idle_core(idle_core, cpu);
1651
+ }
1652
+ }
1653
+ rcu_read_unlock();
1654
+
1655
+ ns->weight = cpumask_weight(cpumask_of_node(nid));
1656
+
1657
+ ns->node_type = numa_classify(env->imbalance_pct, ns);
1658
+
1659
+ if (idle_core >= 0)
1660
+ ns->idle_cpu = idle_core;
1661
+}
1662
+
15971663 static void task_numa_assign(struct task_numa_env *env,
15981664 struct task_struct *p, long imp)
15991665 {
16001666 struct rq *rq = cpu_rq(env->dst_cpu);
16011667
1602
- /* Bail out if run-queue part of active NUMA balance. */
1603
- if (xchg(&rq->numa_migrate_on, 1))
1604
- return;
1668
+ /* Check if run-queue part of active NUMA balance. */
1669
+ if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
1670
+ int cpu;
1671
+ int start = env->dst_cpu;
16051672
1673
+ /* Find alternative idle CPU. */
1674
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
1675
+ if (cpu == env->best_cpu || !idle_cpu(cpu) ||
1676
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
1677
+ continue;
1678
+ }
1679
+
1680
+ env->dst_cpu = cpu;
1681
+ rq = cpu_rq(env->dst_cpu);
1682
+ if (!xchg(&rq->numa_migrate_on, 1))
1683
+ goto assign;
1684
+ }
1685
+
1686
+ /* Failed to find an alternative idle CPU */
1687
+ return;
1688
+ }
1689
+
1690
+assign:
16061691 /*
16071692 * Clear previous best_cpu/rq numa-migrate flag, since task now
16081693 * found a better CPU to move/swap.
16091694 */
1610
- if (env->best_cpu != -1) {
1695
+ if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
16111696 rq = cpu_rq(env->best_cpu);
16121697 WRITE_ONCE(rq->numa_migrate_on, 0);
16131698 }
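The block above replaces the old borrowed update_sg_lb_stats() logic: update_numa_stats() now sums cpu_load()/cpu_runnable()/cpu_util() over the node, remembers an idle CPU (preferring a fully idle SMT core), and numa_classify() buckets the node as has-spare, fully-busy, or overloaded using the load balancer's imbalance_pct. A reduced sketch of just the classification rule; the struct layout and the imbalance_pct value of 115 are illustrative, not taken from this hunk:

#include <stdio.h>

enum numa_type { node_has_spare, node_fully_busy, node_overloaded };

struct numa_stats {
	unsigned long runnable, util, compute_capacity;
	unsigned int nr_running, weight;   /* weight == number of CPUs on the node */
};

static enum numa_type numa_classify(unsigned int imbalance_pct,
				    const struct numa_stats *ns)
{
	/* More tasks than CPUs and the capacity headroom gone: overloaded. */
	if (ns->nr_running > ns->weight &&
	    ((ns->compute_capacity * 100 < ns->util * imbalance_pct) ||
	     (ns->compute_capacity * imbalance_pct < ns->runnable * 100)))
		return node_overloaded;

	/* Fewer tasks than CPUs, or plenty of headroom left: spare capacity. */
	if (ns->nr_running < ns->weight ||
	    ((ns->compute_capacity * 100 > ns->util * imbalance_pct) &&
	     (ns->compute_capacity * imbalance_pct > ns->runnable * 100)))
		return node_has_spare;

	return node_fully_busy;
}

int main(void)
{
	struct numa_stats ns = {
		.runnable = 3000, .util = 3900, .compute_capacity = 4096,
		.nr_running = 6, .weight = 4,
	};

	/* 6 tasks on 4 CPUs at ~95% utilization: prints 2 (node_overloaded). */
	printf("%d\n", numa_classify(115, &ns));
	return 0;
}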
....@@ -1663,7 +1748,7 @@
16631748 * into account that it might be best if task running on the dst_cpu should
16641749 * be exchanged with the source task
16651750 */
1666
-static void task_numa_compare(struct task_numa_env *env,
1751
+static bool task_numa_compare(struct task_numa_env *env,
16671752 long taskimp, long groupimp, bool maymove)
16681753 {
16691754 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
....@@ -1674,12 +1759,13 @@
16741759 int dist = env->dist;
16751760 long moveimp = imp;
16761761 long load;
1762
+ bool stopsearch = false;
16771763
16781764 if (READ_ONCE(dst_rq->numa_migrate_on))
1679
- return;
1765
+ return false;
16801766
16811767 rcu_read_lock();
1682
- cur = task_rcu_dereference(&dst_rq->curr);
1768
+ cur = rcu_dereference(dst_rq->curr);
16831769 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
16841770 cur = NULL;
16851771
....@@ -1687,8 +1773,10 @@
16871773 * Because we have preemption enabled we can get migrated around and
16881774 * end try selecting ourselves (current == env->p) as a swap candidate.
16891775 */
1690
- if (cur == env->p)
1776
+ if (cur == env->p) {
1777
+ stopsearch = true;
16911778 goto unlock;
1779
+ }
16921780
16931781 if (!cur) {
16941782 if (maymove && moveimp >= env->best_imp)
....@@ -1697,18 +1785,27 @@
16971785 goto unlock;
16981786 }
16991787
1788
+ /* Skip this swap candidate if cannot move to the source cpu. */
1789
+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1790
+ goto unlock;
1791
+
1792
+ /*
1793
+ * Skip this swap candidate if it is not moving to its preferred
1794
+ * node and the best task is.
1795
+ */
1796
+ if (env->best_task &&
1797
+ env->best_task->numa_preferred_nid == env->src_nid &&
1798
+ cur->numa_preferred_nid != env->src_nid) {
1799
+ goto unlock;
1800
+ }
1801
+
17001802 /*
17011803 * "imp" is the fault differential for the source task between the
17021804 * source and destination node. Calculate the total differential for
17031805 * the source task and potential destination task. The more negative
17041806 * the value is, the more remote accesses that would be expected to
17051807 * be incurred if the tasks were swapped.
1706
- */
1707
- /* Skip this swap candidate if cannot move to the source cpu */
1708
- if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1709
- goto unlock;
1710
-
1711
- /*
1808
+ *
17121809 * If dst and source tasks are in the same NUMA group, or not
17131810 * in any group then look only at task weights.
17141811 */
....@@ -1735,9 +1832,31 @@
17351832 task_weight(cur, env->dst_nid, dist);
17361833 }
17371834
1835
+ /* Discourage picking a task already on its preferred node */
1836
+ if (cur->numa_preferred_nid == env->dst_nid)
1837
+ imp -= imp / 16;
1838
+
1839
+ /*
1840
+ * Encourage picking a task that moves to its preferred node.
1841
+ * This potentially makes imp larger than it's maximum of
1842
+ * 1998 (see SMALLIMP and task_weight for why) but in this
1843
+ * case, it does not matter.
1844
+ */
1845
+ if (cur->numa_preferred_nid == env->src_nid)
1846
+ imp += imp / 8;
1847
+
17381848 if (maymove && moveimp > imp && moveimp > env->best_imp) {
17391849 imp = moveimp;
17401850 cur = NULL;
1851
+ goto assign;
1852
+ }
1853
+
1854
+ /*
1855
+ * Prefer swapping with a task moving to its preferred node over a
1856
+ * task that is not.
1857
+ */
1858
+ if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
1859
+ env->best_task->numa_preferred_nid != env->src_nid) {
17411860 goto assign;
17421861 }
17431862
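The two adjustments above bias swap-candidate selection with plain fault-score arithmetic: a candidate already sitting on its preferred node loses imp/16 (about 6%), while one that would be moved onto its preferred node gains imp/8 (12.5%). A tiny demonstration of the effect on a made-up score:

#include <stdio.h>

int main(void)
{
	long imp = 1000;                    /* made-up fault-score differential */

	long discouraged = imp - imp / 16;  /* already on preferred dst node: 938    */
	long encouraged  = imp + imp / 8;   /* would move to preferred src node: 1125 */

	printf("discouraged: %ld, encouraged: %ld\n", discouraged, encouraged);
	return 0;
}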
....@@ -1764,42 +1883,95 @@
17641883 goto unlock;
17651884
17661885 assign:
1767
- /*
1768
- * One idle CPU per node is evaluated for a task numa move.
1769
- * Call select_idle_sibling to maybe find a better one.
1770
- */
1886
+ /* Evaluate an idle CPU for a task numa move. */
17711887 if (!cur) {
1888
+ int cpu = env->dst_stats.idle_cpu;
1889
+
1890
+ /* Nothing cached so current CPU went idle since the search. */
1891
+ if (cpu < 0)
1892
+ cpu = env->dst_cpu;
1893
+
17721894 /*
1773
- * select_idle_siblings() uses an per-CPU cpumask that
1774
- * can be used from IRQ context.
1895
+ * If the CPU is no longer truly idle and the previous best CPU
1896
+ * is, keep using it.
17751897 */
1776
- local_irq_disable();
1777
- env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1778
- env->dst_cpu);
1779
- local_irq_enable();
1898
+ if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
1899
+ idle_cpu(env->best_cpu)) {
1900
+ cpu = env->best_cpu;
1901
+ }
1902
+
1903
+ env->dst_cpu = cpu;
17801904 }
17811905
17821906 task_numa_assign(env, cur, imp);
1907
+
1908
+ /*
1909
+ * If a move to idle is allowed because there is capacity or load
1910
+ * balance improves then stop the search. While a better swap
1911
+ * candidate may exist, a search is not free.
1912
+ */
1913
+ if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
1914
+ stopsearch = true;
1915
+
1916
+ /*
1917
+ * If a swap candidate must be identified and the current best task
1918
+ * moves its preferred node then stop the search.
1919
+ */
1920
+ if (!maymove && env->best_task &&
1921
+ env->best_task->numa_preferred_nid == env->src_nid) {
1922
+ stopsearch = true;
1923
+ }
17831924 unlock:
17841925 rcu_read_unlock();
1926
+
1927
+ return stopsearch;
17851928 }
17861929
17871930 static void task_numa_find_cpu(struct task_numa_env *env,
17881931 long taskimp, long groupimp)
17891932 {
1790
- long src_load, dst_load, load;
17911933 bool maymove = false;
17921934 int cpu;
17931935
1794
- load = task_h_load(env->p);
1795
- dst_load = env->dst_stats.load + load;
1796
- src_load = env->src_stats.load - load;
1797
-
17981936 /*
1799
- * If the improvement from just moving env->p direction is better
1800
- * than swapping tasks around, check if a move is possible.
1937
+ * If dst node has spare capacity, then check if there is an
1938
+ * imbalance that would be overruled by the load balancer.
18011939 */
1802
- maymove = !load_too_imbalanced(src_load, dst_load, env);
1940
+ if (env->dst_stats.node_type == node_has_spare) {
1941
+ unsigned int imbalance;
1942
+ int src_running, dst_running;
1943
+
1944
+ /*
1945
+ * Would movement cause an imbalance? Note that if src has
1946
+ * more running tasks that the imbalance is ignored as the
1947
+ * move improves the imbalance from the perspective of the
1948
+ * CPU load balancer.
1949
+ * */
1950
+ src_running = env->src_stats.nr_running - 1;
1951
+ dst_running = env->dst_stats.nr_running + 1;
1952
+ imbalance = max(0, dst_running - src_running);
1953
+ imbalance = adjust_numa_imbalance(imbalance, dst_running);
1954
+
1955
+ /* Use idle CPU if there is no imbalance */
1956
+ if (!imbalance) {
1957
+ maymove = true;
1958
+ if (env->dst_stats.idle_cpu >= 0) {
1959
+ env->dst_cpu = env->dst_stats.idle_cpu;
1960
+ task_numa_assign(env, NULL, 0);
1961
+ return;
1962
+ }
1963
+ }
1964
+ } else {
1965
+ long src_load, dst_load, load;
1966
+ /*
1967
+ * If the improvement from just moving env->p direction is better
1968
+ * than swapping tasks around, check if a move is possible.
1969
+ */
1970
+ load = task_h_load(env->p);
1971
+ dst_load = env->dst_stats.load + load;
1972
+ src_load = env->src_stats.load - load;
1973
+ maymove = !load_too_imbalanced(src_load, dst_load, env);
1974
+ }
18031975
18041976 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
18051977 /* Skip this CPU if the source task cannot migrate */
....@@ -1807,7 +1979,8 @@
18071979 continue;
18081980
18091981 env->dst_cpu = cpu;
1810
- task_numa_compare(env, taskimp, groupimp, maymove);
1982
+ if (task_numa_compare(env, taskimp, groupimp, maymove))
1983
+ break;
18111984 }
18121985 }
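task_numa_find_cpu() now forks on the destination node's classification: with spare capacity it only checks whether moving one task would create an imbalance the load balancer would immediately undo, and the old load_too_imbalanced() arithmetic is kept for the busy case. The sketch below walks the spare-capacity branch; adjust_numa_imbalance() is not shown in this hunk, so the tolerance rule here is an explicit stand-in, not the kernel's definition:

#include <stdio.h>

/* Stand-in policy: tolerate a small imbalance while the destination still
 * runs only a couple of tasks (the real helper lives elsewhere in fair.c). */
static int adjust_numa_imbalance_sketch(int imbalance, int dst_running)
{
	return (dst_running <= 2) ? 0 : imbalance;
}

int main(void)
{
	int src_running = 5, dst_running = 1;

	/* Pretend env->p moves: the source loses a task, the destination gains one. */
	int imbalance = (dst_running + 1) - (src_running - 1);
	if (imbalance < 0)
		imbalance = 0;

	imbalance = adjust_numa_imbalance_sketch(imbalance, dst_running + 1);

	printf(imbalance ? "look for a swap candidate\n"
			 : "plain move to an idle CPU is fine\n");
	return 0;
}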
18131986
....@@ -1861,10 +2034,10 @@
18612034 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
18622035 taskweight = task_weight(p, env.src_nid, dist);
18632036 groupweight = group_weight(p, env.src_nid, dist);
1864
- update_numa_stats(&env.src_stats, env.src_nid);
2037
+ update_numa_stats(&env, &env.src_stats, env.src_nid, false);
18652038 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
18662039 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1867
- update_numa_stats(&env.dst_stats, env.dst_nid);
2040
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
18682041
18692042 /* Try to find a spot on the preferred nid. */
18702043 task_numa_find_cpu(&env, taskimp, groupimp);
....@@ -1897,7 +2070,7 @@
18972070
18982071 env.dist = dist;
18992072 env.dst_nid = nid;
1900
- update_numa_stats(&env.dst_stats, env.dst_nid);
2073
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
19012074 task_numa_find_cpu(&env, taskimp, groupimp);
19022075 }
19032076 }
....@@ -1921,15 +2094,17 @@
19212094 }
19222095
19232096 /* No better CPU than the current one was found. */
1924
- if (env.best_cpu == -1)
2097
+ if (env.best_cpu == -1) {
2098
+ trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
19252099 return -EAGAIN;
2100
+ }
19262101
19272102 best_rq = cpu_rq(env.best_cpu);
19282103 if (env.best_task == NULL) {
19292104 ret = migrate_task_to(p, env.best_cpu);
19302105 WRITE_ONCE(best_rq->numa_migrate_on, 0);
19312106 if (ret != 0)
1932
- trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
2107
+ trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
19332108 return ret;
19342109 }
19352110
....@@ -1937,7 +2112,7 @@
19372112 WRITE_ONCE(best_rq->numa_migrate_on, 0);
19382113
19392114 if (ret != 0)
1940
- trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
2115
+ trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
19412116 put_task_struct(env.best_task);
19422117 return ret;
19432118 }
....@@ -1948,7 +2123,7 @@
19482123 unsigned long interval = HZ;
19492124
19502125 /* This task has no NUMA fault statistics yet */
1951
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
2126
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
19522127 return;
19532128
19542129 /* Periodically retry migrating the task to the preferred node */
....@@ -2199,7 +2374,7 @@
21992374
22002375 static void task_numa_placement(struct task_struct *p)
22012376 {
2202
- int seq, nid, max_nid = -1;
2377
+ int seq, nid, max_nid = NUMA_NO_NODE;
22032378 unsigned long max_faults = 0;
22042379 unsigned long fault_types[2] = { 0, 0 };
22052380 unsigned long total_faults;
....@@ -2309,12 +2484,12 @@
23092484
23102485 static inline int get_numa_group(struct numa_group *grp)
23112486 {
2312
- return atomic_inc_not_zero(&grp->refcount);
2487
+ return refcount_inc_not_zero(&grp->refcount);
23132488 }
23142489
23152490 static inline void put_numa_group(struct numa_group *grp)
23162491 {
2317
- if (atomic_dec_and_test(&grp->refcount))
2492
+ if (refcount_dec_and_test(&grp->refcount))
23182493 kfree_rcu(grp, rcu);
23192494 }
23202495
....@@ -2335,7 +2510,7 @@
23352510 if (!grp)
23362511 return;
23372512
2338
- atomic_set(&grp->refcount, 1);
2513
+ refcount_set(&grp->refcount, 1);
23392514 grp->active_nodes = 1;
23402515 grp->max_faults_cpu = 0;
23412516 spin_lock_init(&grp->lock);
....@@ -2522,8 +2697,8 @@
25222697 local = 1;
25232698
25242699 /*
2525
- * Retry task to preferred node migration periodically, in case it
2526
- * case it previously failed, or the scheduler moved us.
2700
+ * Retry to migrate task to preferred node periodically, in case it
2701
+ * previously failed, or the scheduler moved us.
25272702 */
25282703 if (time_after(jiffies, p->numa_migrate_retry)) {
25292704 task_numa_placement(p);
....@@ -2558,7 +2733,7 @@
25582733 * The expensive part of numa migration is done from task_work context.
25592734 * Triggered from task_tick_numa().
25602735 */
2561
-void task_numa_work(struct callback_head *work)
2736
+static void task_numa_work(struct callback_head *work)
25622737 {
25632738 unsigned long migrate, next_scan, now = jiffies;
25642739 struct task_struct *p = current;
....@@ -2571,7 +2746,7 @@
25712746
25722747 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
25732748
2574
- work->next = work; /* protect against double add */
2749
+ work->next = work;
25752750 /*
25762751 * Who cares about NUMA placement when they're dying.
25772752 *
....@@ -2618,7 +2793,7 @@
26182793 return;
26192794
26202795
2621
- if (!down_read_trylock(&mm->mmap_sem))
2796
+ if (!mmap_read_trylock(mm))
26222797 return;
26232798 vma = find_vma(mm, start);
26242799 if (!vma) {
....@@ -2646,7 +2821,7 @@
26462821 * Skip inaccessible VMAs to avoid any confusion between
26472822 * PROT_NONE and NUMA hinting ptes
26482823 */
2649
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2824
+ if (!vma_is_accessible(vma))
26502825 continue;
26512826
26522827 do {
....@@ -2686,7 +2861,7 @@
26862861 mm->numa_scan_offset = start;
26872862 else
26882863 reset_ptenuma_scan(p);
2689
- up_read(&mm->mmap_sem);
2864
+ mmap_read_unlock(mm);
26902865
26912866 /*
26922867 * Make sure tasks use at least 32x as much time to run other code
....@@ -2700,10 +2875,54 @@
27002875 }
27012876 }
27022877
2878
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2879
+{
2880
+ int mm_users = 0;
2881
+ struct mm_struct *mm = p->mm;
2882
+
2883
+ if (mm) {
2884
+ mm_users = atomic_read(&mm->mm_users);
2885
+ if (mm_users == 1) {
2886
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2887
+ mm->numa_scan_seq = 0;
2888
+ }
2889
+ }
2890
+ p->node_stamp = 0;
2891
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2892
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2893
+ /* Protect against double add, see task_tick_numa and task_numa_work */
2894
+ p->numa_work.next = &p->numa_work;
2895
+ p->numa_faults = NULL;
2896
+ RCU_INIT_POINTER(p->numa_group, NULL);
2897
+ p->last_task_numa_placement = 0;
2898
+ p->last_sum_exec_runtime = 0;
2899
+
2900
+ init_task_work(&p->numa_work, task_numa_work);
2901
+
2902
+ /* New address space, reset the preferred nid */
2903
+ if (!(clone_flags & CLONE_VM)) {
2904
+ p->numa_preferred_nid = NUMA_NO_NODE;
2905
+ return;
2906
+ }
2907
+
2908
+ /*
2909
+ * New thread, keep existing numa_preferred_nid which should be copied
2910
+ * already by arch_dup_task_struct but stagger when scans start.
2911
+ */
2912
+ if (mm) {
2913
+ unsigned int delay;
2914
+
2915
+ delay = min_t(unsigned int, task_scan_max(current),
2916
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2917
+ delay += 2 * TICK_NSEC;
2918
+ p->node_stamp = delay;
2919
+ }
2920
+}
2921
+
27032922 /*
27042923 * Drive the periodic memory faults..
27052924 */
2706
-void task_tick_numa(struct rq *rq, struct task_struct *curr)
2925
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
27072926 {
27082927 struct callback_head *work = &curr->numa_work;
27092928 u64 period, now;
....@@ -2728,10 +2947,8 @@
27282947 curr->numa_scan_period = task_scan_start(curr);
27292948 curr->node_stamp += period;
27302949
2731
- if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2732
- init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2733
- task_work_add(curr, work, true);
2734
- }
2950
+ if (!time_before(jiffies, curr->mm->numa_next_scan))
2951
+ task_work_add(curr, work, TWA_RESUME);
27352952 }
27362953 }
27372954
....@@ -2761,7 +2978,8 @@
27612978 * the preferred node.
27622979 */
27632980 if (dst_nid == p->numa_preferred_nid ||
2764
- (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
2981
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
2982
+ src_nid != p->numa_preferred_nid))
27652983 return;
27662984 }
27672985
....@@ -2791,8 +3009,6 @@
27913009 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
27923010 {
27933011 update_load_add(&cfs_rq->load, se->load.weight);
2794
- if (!parent_entity(se))
2795
- update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
27963012 #ifdef CONFIG_SMP
27973013 if (entity_is_task(se)) {
27983014 struct rq *rq = rq_of(cfs_rq);
....@@ -2808,8 +3024,6 @@
28083024 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
28093025 {
28103026 update_load_sub(&cfs_rq->load, se->load.weight);
2811
- if (!parent_entity(se))
2812
- update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
28133027 #ifdef CONFIG_SMP
28143028 if (entity_is_task(se)) {
28153029 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
....@@ -2856,26 +3070,18 @@
28563070 WRITE_ONCE(*ptr, res); \
28573071 } while (0)
28583072
3073
+/*
3074
+ * Remove and clamp on negative, from a local variable.
3075
+ *
3076
+ * A variant of sub_positive(), which does not use explicit load-store
3077
+ * and is thus optimized for local variable updates.
3078
+ */
3079
+#define lsub_positive(_ptr, _val) do { \
3080
+ typeof(_ptr) ptr = (_ptr); \
3081
+ *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3082
+} while (0)
3083
+
28593084 #ifdef CONFIG_SMP
2860
-static inline void
2861
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2862
-{
2863
- cfs_rq->runnable_weight += se->runnable_weight;
2864
-
2865
- cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2866
- cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2867
-}
2868
-
2869
-static inline void
2870
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2871
-{
2872
- cfs_rq->runnable_weight -= se->runnable_weight;
2873
-
2874
- sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2875
- sub_positive(&cfs_rq->avg.runnable_load_sum,
2876
- se_runnable(se) * se->avg.runnable_load_sum);
2877
-}
2878
-
28793085 static inline void
28803086 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
28813087 {
....@@ -2891,45 +3097,36 @@
28913097 }
28923098 #else
28933099 static inline void
2894
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2895
-static inline void
2896
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2897
-static inline void
28983100 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
28993101 static inline void
29003102 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
29013103 #endif
29023104
29033105 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2904
- unsigned long weight, unsigned long runnable)
3106
+ unsigned long weight)
29053107 {
29063108 if (se->on_rq) {
29073109 /* commit outstanding execution time */
29083110 if (cfs_rq->curr == se)
29093111 update_curr(cfs_rq);
2910
- account_entity_dequeue(cfs_rq, se);
2911
- dequeue_runnable_load_avg(cfs_rq, se);
3112
+ update_load_sub(&cfs_rq->load, se->load.weight);
29123113 }
29133114 dequeue_load_avg(cfs_rq, se);
29143115
2915
- se->runnable_weight = runnable;
29163116 update_load_set(&se->load, weight);
29173117
29183118 #ifdef CONFIG_SMP
29193119 do {
2920
- u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
3120
+ u32 divider = get_pelt_divider(&se->avg);
29213121
29223122 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2923
- se->avg.runnable_load_avg =
2924
- div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
29253123 } while (0);
29263124 #endif
29273125
29283126 enqueue_load_avg(cfs_rq, se);
2929
- if (se->on_rq) {
2930
- account_entity_enqueue(cfs_rq, se);
2931
- enqueue_runnable_load_avg(cfs_rq, se);
2932
- }
3127
+ if (se->on_rq)
3128
+ update_load_add(&cfs_rq->load, se->load.weight);
3129
+
29333130 }
29343131
29353132 void reweight_task(struct task_struct *p, int prio)
....@@ -2939,7 +3136,7 @@
29393136 struct load_weight *load = &se->load;
29403137 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
29413138
2942
- reweight_entity(cfs_rq, se, weight, weight);
3139
+ reweight_entity(cfs_rq, se, weight);
29433140 load->inv_weight = sched_prio_to_wmult[prio];
29443141 }
29453142
....@@ -3051,50 +3248,6 @@
30513248 */
30523249 return clamp_t(long, shares, MIN_SHARES, tg_shares);
30533250 }
3054
-
3055
-/*
3056
- * This calculates the effective runnable weight for a group entity based on
3057
- * the group entity weight calculated above.
3058
- *
3059
- * Because of the above approximation (2), our group entity weight is
3060
- * an load_avg based ratio (3). This means that it includes blocked load and
3061
- * does not represent the runnable weight.
3062
- *
3063
- * Approximate the group entity's runnable weight per ratio from the group
3064
- * runqueue:
3065
- *
3066
- * grq->avg.runnable_load_avg
3067
- * ge->runnable_weight = ge->load.weight * -------------------------- (7)
3068
- * grq->avg.load_avg
3069
- *
3070
- * However, analogous to above, since the avg numbers are slow, this leads to
3071
- * transients in the from-idle case. Instead we use:
3072
- *
3073
- * ge->runnable_weight = ge->load.weight *
3074
- *
3075
- * max(grq->avg.runnable_load_avg, grq->runnable_weight)
3076
- * ----------------------------------------------------- (8)
3077
- * max(grq->avg.load_avg, grq->load.weight)
3078
- *
3079
- * Where these max() serve both to use the 'instant' values to fix the slow
3080
- * from-idle and avoid the /0 on to-idle, similar to (6).
3081
- */
3082
-static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
3083
-{
3084
- long runnable, load_avg;
3085
-
3086
- load_avg = max(cfs_rq->avg.load_avg,
3087
- scale_load_down(cfs_rq->load.weight));
3088
-
3089
- runnable = max(cfs_rq->avg.runnable_load_avg,
3090
- scale_load_down(cfs_rq->runnable_weight));
3091
-
3092
- runnable *= shares;
3093
- if (load_avg)
3094
- runnable /= load_avg;
3095
-
3096
- return clamp_t(long, runnable, MIN_SHARES, shares);
3097
-}
30983251 #endif /* CONFIG_SMP */
30993252
31003253 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
....@@ -3106,7 +3259,7 @@
31063259 static void update_cfs_group(struct sched_entity *se)
31073260 {
31083261 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3109
- long shares, runnable;
3262
+ long shares;
31103263
31113264 if (!gcfs_rq)
31123265 return;
....@@ -3115,16 +3268,15 @@
31153268 return;
31163269
31173270 #ifndef CONFIG_SMP
3118
- runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
3271
+ shares = READ_ONCE(gcfs_rq->tg->shares);
31193272
31203273 if (likely(se->load.weight == shares))
31213274 return;
31223275 #else
31233276 shares = calc_group_shares(gcfs_rq);
3124
- runnable = calc_group_runnable(gcfs_rq, shares);
31253277 #endif
31263278
3127
- reweight_entity(cfs_rq_of(se), se, shares, runnable);
3279
+ reweight_entity(cfs_rq_of(se), se, shares);
31283280 }
31293281
31303282 #else /* CONFIG_FAIR_GROUP_SCHED */
....@@ -3137,7 +3289,7 @@
31373289 {
31383290 struct rq *rq = rq_of(cfs_rq);
31393291
3140
- if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
3292
+ if (&rq->cfs == cfs_rq) {
31413293 /*
31423294 * There are a few boundary cases this might miss but it should
31433295 * get called often enough that that should (hopefully) not be
....@@ -3161,7 +3313,6 @@
31613313 /**
31623314 * update_tg_load_avg - update the tg's load avg
31633315 * @cfs_rq: the cfs_rq whose avg changed
3164
- * @force: update regardless of how small the difference
31653316 *
31663317 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
31673318 * However, because tg->load_avg is a global value there are performance
....@@ -3173,7 +3324,7 @@
31733324 *
31743325 * Updating tg's load_avg is necessary before update_cfs_share().
31753326 */
3176
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3327
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
31773328 {
31783329 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
31793330
....@@ -3183,11 +3334,9 @@
31833334 if (cfs_rq->tg == &root_task_group)
31843335 return;
31853336
3186
- if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3337
+ if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
31873338 atomic_long_add(delta, &cfs_rq->tg->load_avg);
31883339 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3189
-
3190
- trace_sched_load_tg(cfs_rq);
31913340 }
31923341 }
31933342
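With the force flag gone, update_tg_load_avg() only touches the shared atomic tg->load_avg once the local contribution has drifted by more than 1/64 (about 1.6%) of what was last published, which keeps the cross-CPU cacheline traffic bounded. A small numeric illustration of the filter:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	long contrib = 6400;   /* tg_load_avg_contrib last pushed to the tg */
	long samples[] = { 6450, 6499, 6520 };

	for (int i = 0; i < 3; i++) {
		long delta = samples[i] - contrib;
		/* same filter as the hunk above: skip changes under contrib/64 */
		int publish = labs(delta) > contrib / 64;

		printf("load_avg %ld -> %s\n", samples[i],
		       publish ? "update tg->load_avg" : "skip");
	}
	return 0;
}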
....@@ -3240,7 +3389,6 @@
32403389 se->avg.last_update_time = n_last_update_time;
32413390 }
32423391
3243
-
32443392 /*
32453393 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
32463394 * propagate its contribution. The key to this propagation is the invariant
....@@ -3251,11 +3399,11 @@
32513399 * _IFF_ we look at the pure running and runnable sums. Because they
32523400 * represent the very same entity, just at different points in the hierarchy.
32533401 *
3254
- * Per the above update_tg_cfs_util() is trivial and simply copies the running
3255
- * sum over (but still wrong, because the group entity and group rq do not have
3256
- * their PELT windows aligned).
3402
+ * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
3403
+ * and simply copies the running/runnable sum over (but still wrong, because
3404
+ * the group entity and group rq do not have their PELT windows aligned).
32573405 *
3258
- * However, update_tg_cfs_runnable() is more complex. So we have:
3406
+ * However, update_tg_cfs_load() is more complex. So we have:
32593407 *
32603408 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
32613409 *
....@@ -3308,45 +3456,75 @@
33083456 * XXX: only do this for the part of runnable > running ?
33093457 *
33103458 */
3311
-
33123459 static inline void
33133460 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
33143461 {
33153462 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3463
+ u32 divider;
33163464
33173465 /* Nothing to update */
33183466 if (!delta)
33193467 return;
33203468
33213469 /*
3322
- * The relation between sum and avg is:
3323
- *
3324
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
3325
- *
3326
- * however, the PELT windows are not aligned between grq and gse.
3470
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3471
+ * See ___update_load_avg() for details.
33273472 */
3473
+ divider = get_pelt_divider(&cfs_rq->avg);
33283474
33293475 /* Set new sched_entity's utilization */
33303476 se->avg.util_avg = gcfs_rq->avg.util_avg;
3331
- se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3477
+ se->avg.util_sum = se->avg.util_avg * divider;
33323478
33333479 /* Update parent cfs_rq utilization */
33343480 add_positive(&cfs_rq->avg.util_avg, delta);
3335
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3481
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
33363482 }
33373483
33383484 static inline void
33393485 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
33403486 {
3487
+ long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3488
+ u32 divider;
3489
+
3490
+ /* Nothing to update */
3491
+ if (!delta)
3492
+ return;
3493
+
3494
+ /*
3495
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3496
+ * See ___update_load_avg() for details.
3497
+ */
3498
+ divider = get_pelt_divider(&cfs_rq->avg);
3499
+
3500
+ /* Set new sched_entity's runnable */
3501
+ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
3502
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
3503
+
3504
+ /* Update parent cfs_rq runnable */
3505
+ add_positive(&cfs_rq->avg.runnable_avg, delta);
3506
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3507
+}
3508
+
3509
+static inline void
3510
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3511
+{
33413512 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3342
- unsigned long runnable_load_avg, load_avg;
3343
- u64 runnable_load_sum, load_sum = 0;
3513
+ unsigned long load_avg;
3514
+ u64 load_sum = 0;
33443515 s64 delta_sum;
3516
+ u32 divider;
33453517
33463518 if (!runnable_sum)
33473519 return;
33483520
33493521 gcfs_rq->prop_runnable_sum = 0;
3522
+
3523
+ /*
3524
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3525
+ * See ___update_load_avg() for details.
3526
+ */
3527
+ divider = get_pelt_divider(&cfs_rq->avg);
33503528
33513529 if (runnable_sum >= 0) {
33523530 /*
....@@ -3354,7 +3532,7 @@
33543532 * the CPU is saturated running == runnable.
33553533 */
33563534 runnable_sum += se->avg.load_sum;
3357
- runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3535
+ runnable_sum = min_t(long, runnable_sum, divider);
33583536 } else {
33593537 /*
33603538 * Estimate the new unweighted runnable_sum of the gcfs_rq by
....@@ -3379,7 +3557,7 @@
33793557 runnable_sum = max(runnable_sum, running_sum);
33803558
33813559 load_sum = (s64)se_weight(se) * runnable_sum;
3382
- load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3560
+ load_avg = div_s64(load_sum, divider);
33833561
33843562 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
33853563 delta_avg = load_avg - se->avg.load_avg;
....@@ -3388,19 +3566,6 @@
33883566 se->avg.load_avg = load_avg;
33893567 add_positive(&cfs_rq->avg.load_avg, delta_avg);
33903568 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3391
-
3392
- runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3393
- runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3394
- delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3395
- delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3396
-
3397
- se->avg.runnable_load_sum = runnable_sum;
3398
- se->avg.runnable_load_avg = runnable_load_avg;
3399
-
3400
- if (se->on_rq) {
3401
- add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3402
- add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3403
- }
34043569 }
34053570
34063571 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
....@@ -3429,9 +3594,10 @@
34293594
34303595 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
34313596 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3597
+ update_tg_cfs_load(cfs_rq, se, gcfs_rq);
34323598
3433
- trace_sched_load_cfs_rq(cfs_rq);
3434
- trace_sched_load_se(se);
3599
+ trace_pelt_cfs_tp(cfs_rq);
3600
+ trace_pelt_se_tp(se);
34353601
34363602 return 1;
34373603 }
....@@ -3468,7 +3634,7 @@
34683634
34693635 #else /* CONFIG_FAIR_GROUP_SCHED */
34703636
3471
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3637
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
34723638
34733639 static inline int propagate_entity_load_avg(struct sched_entity *se)
34743640 {
....@@ -3498,18 +3664,18 @@
34983664 static inline int
34993665 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
35003666 {
3501
- unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3667
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
35023668 struct sched_avg *sa = &cfs_rq->avg;
35033669 int decayed = 0;
35043670
35053671 if (cfs_rq->removed.nr) {
35063672 unsigned long r;
3507
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3673
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
35083674
35093675 raw_spin_lock(&cfs_rq->removed.lock);
35103676 swap(cfs_rq->removed.util_avg, removed_util);
35113677 swap(cfs_rq->removed.load_avg, removed_load);
3512
- swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3678
+ swap(cfs_rq->removed.runnable_avg, removed_runnable);
35133679 cfs_rq->removed.nr = 0;
35143680 raw_spin_unlock(&cfs_rq->removed.lock);
35153681
....@@ -3520,8 +3686,29 @@
35203686 r = removed_util;
35213687 sub_positive(&sa->util_avg, r);
35223688 sub_positive(&sa->util_sum, r * divider);
3689
+ /*
3690
+ * Because of rounding, se->util_sum might ends up being +1 more than
3691
+ * cfs->util_sum. Although this is not a problem by itself, detaching
3692
+ * a lot of tasks with the rounding problem between 2 updates of
3693
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
3694
+ * cfs_util_avg is not.
3695
+ * Check that util_sum is still above its lower bound for the new
3696
+ * util_avg. Given that period_contrib might have moved since the last
3697
+ * sync, we are only sure that util_sum must be above or equal to
3698
+ * util_avg * minimum possible divider
3699
+ */
3700
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
35233701
3524
- add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3702
+ r = removed_runnable;
3703
+ sub_positive(&sa->runnable_avg, r);
3704
+ sub_positive(&sa->runnable_sum, r * divider);
3705
+
3706
+ /*
3707
+ * removed_runnable is the unweighted version of removed_load so we
3708
+ * can use it to estimate removed_load_sum.
3709
+ */
3710
+ add_tg_cfs_propagate(cfs_rq,
3711
+ -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
35253712
35263713 decayed = 1;
35273714 }
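The new clamp above protects against rounding drift: after enough detaches, util_sum could hit zero while util_avg is still non-zero, so the sum is floored at util_avg * PELT_MIN_DIVIDER, the smallest divider any entity can see. Treating PELT_MIN_DIVIDER as LOAD_AVG_MAX - 1024 is an assumption consistent with the divider formula sketched earlier; a compact illustration:

#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_MAX     47742
#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)   /* assumed definition */

static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }

int main(void)
{
	unsigned long util_avg = 37;
	uint32_t util_sum = 1500;   /* drifted far too low after repeated detaches */

	/* Same lower bound as the hunk: keep the sum consistent with the avg
	 * even though period_contrib may have moved between syncs. */
	util_sum = max_u32(util_sum, util_avg * PELT_MIN_DIVIDER);

	printf("util_sum floored to %u\n", util_sum);   /* 37 * 46718 = 1728566 */
	return 0;
}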
....@@ -3533,9 +3720,6 @@
35333720 cfs_rq->load_last_update_time_copy = sa->last_update_time;
35343721 #endif
35353722
3536
- if (decayed)
3537
- cfs_rq_util_change(cfs_rq, 0);
3538
-
35393723 return decayed;
35403724 }
35413725
....@@ -3543,14 +3727,17 @@
35433727 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
35443728 * @cfs_rq: cfs_rq to attach to
35453729 * @se: sched_entity to attach
3546
- * @flags: migration hints
35473730 *
35483731 * Must call update_cfs_rq_load_avg() before this, since we rely on
35493732 * cfs_rq->avg.last_update_time being current.
35503733 */
3551
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3734
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
35523735 {
3553
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3736
+ /*
3737
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3738
+ * See ___update_load_avg() for details.
3739
+ */
3740
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
35543741
35553742 /*
35563743 * When we attach the @se to the @cfs_rq, we must align the decay
....@@ -3570,23 +3757,25 @@
35703757 */
35713758 se->avg.util_sum = se->avg.util_avg * divider;
35723759
3573
- se->avg.load_sum = divider;
3574
- if (se_weight(se)) {
3575
- se->avg.load_sum =
3576
- div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3577
- }
3760
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
35783761
3579
- se->avg.runnable_load_sum = se->avg.load_sum;
3762
+ se->avg.load_sum = se->avg.load_avg * divider;
3763
+ if (se_weight(se) < se->avg.load_sum)
3764
+ se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
3765
+ else
3766
+ se->avg.load_sum = 1;
35803767
35813768 enqueue_load_avg(cfs_rq, se);
35823769 cfs_rq->avg.util_avg += se->avg.util_avg;
35833770 cfs_rq->avg.util_sum += se->avg.util_sum;
3771
+ cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
3772
+ cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
35843773
35853774 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
35863775
3587
- cfs_rq_util_change(cfs_rq, flags);
3776
+ cfs_rq_util_change(cfs_rq, 0);
35883777
3589
- trace_sched_load_cfs_rq(cfs_rq);
3778
+ trace_pelt_cfs_tp(cfs_rq);
35903779 }
35913780
35923781 /**
....@@ -3602,12 +3791,14 @@
36023791 dequeue_load_avg(cfs_rq, se);
36033792 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
36043793 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3794
+ sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
3795
+ sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
36053796
36063797 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
36073798
36083799 cfs_rq_util_change(cfs_rq, 0);
36093800
3610
- trace_sched_load_cfs_rq(cfs_rq);
3801
+ trace_pelt_cfs_tp(cfs_rq);
36113802 }
36123803
36133804 /*
....@@ -3623,12 +3814,15 @@
36233814 u64 now = cfs_rq_clock_pelt(cfs_rq);
36243815 int decayed;
36253816
3817
+ trace_android_vh_prepare_update_load_avg_se(se, flags);
36263818 /*
36273819 * Track task load average for carrying it to new CPU after migrated, and
36283820 * track group sched_entity load average for task_h_load calc in migration
36293821 */
36303822 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
36313823 __update_load_avg_se(now, cfs_rq, se);
3824
+
3825
+ trace_android_vh_finish_update_load_avg_se(se, flags);
36323826
36333827 decayed = update_cfs_rq_load_avg(now, cfs_rq);
36343828 decayed |= propagate_entity_load_avg(se);
....@@ -3642,11 +3836,15 @@
36423836 *
36433837 * IOW we're enqueueing a task on a new CPU.
36443838 */
3645
- attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3646
- update_tg_load_avg(cfs_rq, 0);
3839
+ attach_entity_load_avg(cfs_rq, se);
3840
+ update_tg_load_avg(cfs_rq);
36473841
3648
- } else if (decayed && (flags & UPDATE_TG))
3649
- update_tg_load_avg(cfs_rq, 0);
3842
+ } else if (decayed) {
3843
+ cfs_rq_util_change(cfs_rq, 0);
3844
+
3845
+ if (flags & UPDATE_TG)
3846
+ update_tg_load_avg(cfs_rq);
3847
+ }
36503848 }
36513849
36523850 #ifndef CONFIG_64BIT
....@@ -3674,20 +3872,22 @@
36743872 * Synchronize entity load avg of dequeued entity without locking
36753873 * the previous rq.
36763874 */
3677
-void sync_entity_load_avg(struct sched_entity *se)
3875
+static void sync_entity_load_avg(struct sched_entity *se)
36783876 {
36793877 struct cfs_rq *cfs_rq = cfs_rq_of(se);
36803878 u64 last_update_time;
36813879
36823880 last_update_time = cfs_rq_last_update_time(cfs_rq);
3881
+ trace_android_vh_prepare_update_load_avg_se(se, 0);
36833882 __update_load_avg_blocked_se(last_update_time, se);
3883
+ trace_android_vh_finish_update_load_avg_se(se, 0);
36843884 }
36853885
36863886 /*
36873887 * Task first catches up with cfs_rq, and then subtract
36883888 * itself from the cfs_rq (task must be off the queue now).
36893889 */
3690
-void remove_entity_load_avg(struct sched_entity *se)
3890
+static void remove_entity_load_avg(struct sched_entity *se)
36913891 {
36923892 struct cfs_rq *cfs_rq = cfs_rq_of(se);
36933893 unsigned long flags;
....@@ -3696,10 +3896,6 @@
36963896 * tasks cannot exit without having gone through wake_up_new_task() ->
36973897 * post_init_entity_util_avg() which will have added things to the
36983898 * cfs_rq, so we can remove unconditionally.
3699
- *
3700
- * Similarly for groups, they will have passed through
3701
- * post_init_entity_util_avg() before unregister_sched_fair_group()
3702
- * calls this.
37033899 */
37043900
37053901 sync_entity_load_avg(se);
....@@ -3708,13 +3904,13 @@
37083904 ++cfs_rq->removed.nr;
37093905 cfs_rq->removed.util_avg += se->avg.util_avg;
37103906 cfs_rq->removed.load_avg += se->avg.load_avg;
3711
- cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
3907
+ cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
37123908 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
37133909 }
37143910
3715
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3911
+static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
37163912 {
3717
- return cfs_rq->avg.runnable_load_avg;
3913
+ return cfs_rq->avg.runnable_avg;
37183914 }
37193915
37203916 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
....@@ -3722,7 +3918,7 @@
37223918 return cfs_rq->avg.load_avg;
37233919 }
37243920
3725
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3921
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
37263922
37273923 static inline unsigned long task_util(struct task_struct *p)
37283924 {
....@@ -3733,23 +3929,25 @@
37333929 {
37343930 struct util_est ue = READ_ONCE(p->se.avg.util_est);
37353931
3736
- return max(ue.ewma, ue.enqueued);
3932
+ return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
37373933 }
37383934
3739
-unsigned long task_util_est(struct task_struct *p)
3935
+static inline unsigned long task_util_est(struct task_struct *p)
37403936 {
37413937 return max(task_util(p), _task_util_est(p));
37423938 }
37433939
37443940 #ifdef CONFIG_UCLAMP_TASK
3745
-static inline unsigned long uclamp_task_util(struct task_struct *p)
3941
+static inline unsigned long uclamp_task_util(struct task_struct *p,
3942
+ unsigned long uclamp_min,
3943
+ unsigned long uclamp_max)
37463944 {
3747
- return clamp(task_util_est(p),
3748
- uclamp_eff_value(p, UCLAMP_MIN),
3749
- uclamp_eff_value(p, UCLAMP_MAX));
3945
+ return clamp(task_util_est(p), uclamp_min, uclamp_max);
37503946 }
37513947 #else
3752
-static inline unsigned long uclamp_task_util(struct task_struct *p)
3948
+static inline unsigned long uclamp_task_util(struct task_struct *p,
3949
+ unsigned long uclamp_min,
3950
+ unsigned long uclamp_max)
37533951 {
37543952 return task_util_est(p);
37553953 }
....@@ -3765,13 +3963,29 @@
37653963
37663964 /* Update root cfs_rq's estimated utilization */
37673965 enqueued = cfs_rq->avg.util_est.enqueued;
3768
- enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
3966
+ enqueued += _task_util_est(p);
37693967 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
37703968
3771
- /* Update plots for Task and CPU estimated utilization */
3772
- trace_sched_util_est_task(p, &p->se.avg);
3773
- trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
3969
+ trace_sched_util_est_cfs_tp(cfs_rq);
37743970 }
3971
+
3972
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
3973
+ struct task_struct *p)
3974
+{
3975
+ unsigned int enqueued;
3976
+
3977
+ if (!sched_feat(UTIL_EST))
3978
+ return;
3979
+
3980
+ /* Update root cfs_rq's estimated utilization */
3981
+ enqueued = cfs_rq->avg.util_est.enqueued;
3982
+ enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
3983
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3984
+
3985
+ trace_sched_util_est_cfs_tp(cfs_rq);
3986
+}
3987
+
3988
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
37753989
37763990 /*
37773991 * Check if a (signed) value is within a specified (unsigned) margin,
....@@ -3786,24 +4000,20 @@
37864000 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
37874001 }
37884002
3789
-static void
3790
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
4003
+static inline void util_est_update(struct cfs_rq *cfs_rq,
4004
+ struct task_struct *p,
4005
+ bool task_sleep)
37914006 {
3792
- long last_ewma_diff;
4007
+ long last_ewma_diff, last_enqueued_diff;
37934008 struct util_est ue;
3794
- int cpu;
4009
+ int ret = 0;
4010
+
4011
+ trace_android_rvh_util_est_update(cfs_rq, p, task_sleep, &ret);
4012
+ if (ret)
4013
+ return;
37954014
37964015 if (!sched_feat(UTIL_EST))
37974016 return;
3798
-
3799
- /* Update root cfs_rq's estimated utilization */
3800
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
3801
- ue.enqueued -= min_t(unsigned int, ue.enqueued,
3802
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
3803
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3804
-
3805
- /* Update plots for CPU's estimated utilization */
3806
- trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
38074017
38084018 /*
38094019 * Skip update of task's estimated utilization when the task has not
....@@ -3820,11 +4030,13 @@
38204030 if (ue.enqueued & UTIL_AVG_UNCHANGED)
38214031 return;
38224032
4033
+ last_enqueued_diff = ue.enqueued;
4034
+
38234035 /*
38244036 * Reset EWMA on utilization increases, the moving average is used only
38254037 * to smooth utilization decreases.
38264038 */
3827
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
4039
+ ue.enqueued = task_util(p);
38284040 if (sched_feat(UTIL_EST_FASTUP)) {
38294041 if (ue.ewma < ue.enqueued) {
38304042 ue.ewma = ue.enqueued;
....@@ -3833,19 +4045,23 @@
38334045 }
38344046
38354047 /*
3836
- * Skip update of task's estimated utilization when its EWMA is
4048
+ * Skip update of task's estimated utilization when its members are
38374049 * already ~1% close to its last activation value.
38384050 */
38394051 last_ewma_diff = ue.enqueued - ue.ewma;
3840
- if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
4052
+ last_enqueued_diff -= ue.enqueued;
4053
+ if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
4054
+ if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
4055
+ goto done;
4056
+
38414057 return;
4058
+ }
38424059
38434060 /*
38444061 * To avoid overestimation of actual task utilization, skip updates if
38454062 * we cannot grant there is idle time in this CPU.
38464063 */
3847
- cpu = cpu_of(rq_of(cfs_rq));
3848
- if (task_util(p) > capacity_orig_of(cpu))
4064
+ if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
38494065 return;
38504066
38514067 /*
....@@ -3869,49 +4085,166 @@
38694085 ue.ewma += last_ewma_diff;
38704086 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
38714087 done:
4088
+ ue.enqueued |= UTIL_AVG_UNCHANGED;
38724089 WRITE_ONCE(p->se.avg.util_est, ue);
38734090
3874
- /* Update plots for Task's estimated utilization */
3875
- trace_sched_util_est_task(p, &p->se.avg);
4091
+ trace_sched_util_est_se_tp(&p->se);
38764092 }
38774093
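For reference, the estimator above reduces to a small amount of fixed-point arithmetic. A minimal model, assuming UTIL_EST_WEIGHT_SHIFT is 2 (new samples weighted 1/4) and UTIL_AVG_UNCHANGED occupies bit 0 of the enqueued value; the helper and struct names are invented for the sketch.

#include <stdio.h>

#define UTIL_AVG_UNCHANGED    0x1  /* assumed: flag kept in bit 0 of enqueued */
#define UTIL_EST_WEIGHT_SHIFT 2    /* assumed: new sample weight w = 1/4 */

struct util_est { unsigned int enqueued, ewma; };

static void util_est_model(struct util_est *ue, unsigned int task_util)
{
    ue->enqueued = task_util;
    if (ue->ewma < ue->enqueued) {
        /* UTIL_EST_FASTUP behaviour: jump straight up on increases */
        ue->ewma = ue->enqueued;
    } else {
        /* ewma' = ewma + (sample - ewma) / 2^shift, smoothing decreases */
        long diff = (long)ue->enqueued - (long)ue->ewma;
        long scaled = ((long)ue->ewma << UTIL_EST_WEIGHT_SHIFT) + diff;

        ue->ewma = (unsigned int)(scaled >> UTIL_EST_WEIGHT_SHIFT);
    }
    ue->enqueued |= UTIL_AVG_UNCHANGED;  /* no new sample until next dequeue */
}

int main(void)
{
    struct util_est ue = { .enqueued = 0, .ewma = 400 };

    util_est_model(&ue, 200);  /* decrease: smoothed, ewma becomes 350 */
    printf("ewma=%u enqueued(masked)=%u\n",
           ue.ewma, ue.enqueued & ~UTIL_AVG_UNCHANGED);

    util_est_model(&ue, 500);  /* increase: ewma jumps straight to 500 */
    printf("ewma=%u\n", ue.ewma);
    return 0;
}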
3878
-static inline int task_fits_capacity(struct task_struct *p, long capacity)
4094
+static inline int util_fits_cpu(unsigned long util,
4095
+ unsigned long uclamp_min,
4096
+ unsigned long uclamp_max,
4097
+ int cpu)
38794098 {
3880
- return capacity * 1024 > uclamp_task_util(p) * capacity_margin;
3881
-}
3882
-
3883
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
3884
-static inline bool task_fits_max(struct task_struct *p, int cpu)
3885
-{
4099
+ unsigned long capacity_orig, capacity_orig_thermal;
38864100 unsigned long capacity = capacity_of(cpu);
3887
- unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
4101
+ bool fits, uclamp_max_fits;
38884102
3889
- if (capacity == max_capacity)
3890
- return true;
4103
+ /*
4104
+ * Check if the real util fits without any uclamp boost/cap applied.
4105
+ */
4106
+ fits = fits_capacity(util, capacity);
38914107
3892
- if (capacity * capacity_margin > max_capacity * 1024)
3893
- return true;
4108
+ if (!uclamp_is_used())
4109
+ return fits;
38944110
3895
- return task_fits_capacity(p, capacity);
4111
+ /*
4112
+ * We must use capacity_orig_of() for comparing against uclamp_min and
4113
+ * uclamp_max. We only care about capacity pressure (by using
4114
+ * capacity_of()) for comparing against the real util.
4115
+ *
4116
+ * If a task is boosted to 1024 for example, we don't want a tiny
4117
+ * pressure to skew the check whether it fits a CPU or not.
4118
+ *
4119
+ * Similarly if a task is capped to capacity_orig_of(little_cpu), it
4120
+ * should fit a little cpu even if there's some pressure.
4121
+ *
4122
+ * Only exception is for thermal pressure since it has a direct impact
4123
+ * on available OPP of the system.
4124
+ *
4125
+ * We honour it for uclamp_min only as a drop in performance level
4126
+ * could result in not getting the requested minimum performance level.
4127
+ *
4128
+ * For uclamp_max, we can tolerate a drop in performance level as the
4129
+ * goal is to cap the task. So it's okay if it's getting less.
4130
+ *
4131
+ * In case of capacity inversion, which is not handled yet, we should
4132
+ * honour the inverted capacity for both uclamp_min and uclamp_max all
4133
+ * the time.
4134
+ */
4135
+ capacity_orig = capacity_orig_of(cpu);
4136
+ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
4137
+
4138
+ /*
4139
+ * We want to force a task to fit a cpu as implied by uclamp_max.
4140
+ * But we do have some corner cases to cater for..
4141
+ *
4142
+ *
4143
+ * C=z
4144
+ * | ___
4145
+ * | C=y | |
4146
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
4147
+ * | C=x | | | |
4148
+ * | ___ | | | |
4149
+ * | | | | | | | (util somewhere in this region)
4150
+ * | | | | | | |
4151
+ * | | | | | | |
4152
+ * +----------------------------------------
4153
+ * cpu0 cpu1 cpu2
4154
+ *
4155
+ * In the above example if a task is capped to a specific performance
4156
+ * point, y, then when:
4157
+ *
4158
+ * * util = 80% of x then it does not fit on cpu0 and should migrate
4159
+ * to cpu1
4160
+ * * util = 80% of y then it is forced to fit on cpu1 to honour
4161
+ * uclamp_max request.
4162
+ *
4163
+ * which is what we're enforcing here. A task always fits if
4164
+ * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
4165
+ * the normal upmigration rules should still hold.
4166
+ *
4167
+ * Only exception is when we are on max capacity, then we need to be
4168
+ * careful not to block overutilized state. This is so because:
4169
+ *
4170
+ * 1. There's no concept of capping at max_capacity! We can't go
4171
+ * beyond this performance level anyway.
4172
+ * 2. The system is being saturated when we're operating near
4173
+ * max capacity, it doesn't make sense to block overutilized.
4174
+ */
4175
+ uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
4176
+ uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
4177
+ fits = fits || uclamp_max_fits;
4178
+
4179
+ /*
4180
+ *
4181
+ * C=z
4182
+ * | ___ (region a, capped, util >= uclamp_max)
4183
+ * | C=y | |
4184
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
4185
+ * | C=x | | | |
4186
+ * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
4187
+ * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
4188
+ * | | | | | | |
4189
+ * | | | | | | | (region c, boosted, util < uclamp_min)
4190
+ * +----------------------------------------
4191
+ * cpu0 cpu1 cpu2
4192
+ *
4193
+ * a) If util > uclamp_max, then we're capped, we don't care about
4194
+ * actual fitness value here. We only care if uclamp_max fits
4195
+ * capacity without taking margin/pressure into account.
4196
+ * See comment above.
4197
+ *
4198
+ * b) If uclamp_min <= util <= uclamp_max, then the normal
4199
+ * fits_capacity() rules apply. Except we need to ensure that we
4200
+ * remain within uclamp_max, see comment above.
4201
+ *
4202
+ * c) If util < uclamp_min, then we are boosted. Same as (b) but we
4203
+ * need to take into account the boosted value fits the CPU without
4204
+ * taking margin/pressure into account.
4205
+ *
4206
+ * Cases (a) and (b) are handled in the 'fits' variable already. We
4207
+ * just need to consider an extra check for case (c) after ensuring we
4208
+ * handle the case uclamp_min > uclamp_max.
4209
+ */
4210
+ uclamp_min = min(uclamp_min, uclamp_max);
4211
+ if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
4212
+ fits = fits && (uclamp_min <= capacity_orig_thermal);
4213
+
4214
+ return fits;
38964215 }
3897
-#endif
4216
+
4217
+static inline int task_fits_cpu(struct task_struct *p, int cpu)
4218
+{
4219
+ unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
4220
+ unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
4221
+ unsigned long util = task_util_est(p);
4222
+ return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
4223
+}
38984224
38994225 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
39004226 {
3901
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
4227
+ bool need_update = true;
4228
+
4229
+ trace_android_rvh_update_misfit_status(p, rq, &need_update);
4230
+ if (!static_branch_unlikely(&sched_asym_cpucapacity) || !need_update)
39024231 return;
39034232
3904
- if (!p) {
4233
+ if (!p || p->nr_cpus_allowed == 1) {
39054234 rq->misfit_task_load = 0;
39064235 return;
39074236 }
39084237
3909
- if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
4238
+ if (task_fits_cpu(p, cpu_of(rq))) {
39104239 rq->misfit_task_load = 0;
39114240 return;
39124241 }
39134242
3914
- rq->misfit_task_load = task_h_load(p);
4243
+ /*
4244
+ * Make sure that misfit_task_load will not be null even if
4245
+ * task_h_load() returns 0.
4246
+ */
4247
+ rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
39154248 }
39164249
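The fitting rules spelled out in the comments above can be exercised outside the kernel. A sketch under the assumptions that fits_capacity() keeps its usual ~20% headroom form (util * 1280 < capacity * 1024), that SCHED_CAPACITY_SCALE is 1024, and that thermal pressure is passed in directly instead of via arch_scale_thermal_pressure(); all names are local to the sketch.

#include <stdbool.h>
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024
/* assumed ~20% margin, matching the fits_capacity() use above */
#define fits_capacity(util, cap) ((util) * 1280 < (cap) * 1024)
#define min(a, b)                ((a) < (b) ? (a) : (b))

static bool util_fits_cpu_model(unsigned long util,
                                unsigned long uclamp_min, unsigned long uclamp_max,
                                unsigned long capacity_orig, unsigned long capacity,
                                unsigned long thermal_pressure)
{
    unsigned long capacity_orig_thermal = capacity_orig - thermal_pressure;
    bool fits = fits_capacity(util, capacity);
    bool uclamp_max_fits;

    /*
     * (a)/(b): a capped task is forced to fit when uclamp_max <= capacity_orig,
     * except on a full-size CPU, where overutilized must not be masked.
     */
    uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) &&
                      (uclamp_max == SCHED_CAPACITY_SCALE);
    uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
    fits = fits || uclamp_max_fits;

    /*
     * (c): a boosted task additionally needs uclamp_min to fit the
     * thermally reduced original capacity.
     */
    uclamp_min = min(uclamp_min, uclamp_max);
    if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
        fits = fits && (uclamp_min <= capacity_orig_thermal);

    return fits;
}

int main(void)
{
    /* little CPU of capacity 512: a task capped to 384 fits even at util 500 */
    printf("capped:  %d\n", util_fits_cpu_model(500, 0, 384, 512, 512, 0));
    /* same CPU: a task boosted to 768 does not fit even at util 100 */
    printf("boosted: %d\n", util_fits_cpu_model(100, 768, 1024, 512, 512, 0));
    return 0;
}

With a little CPU of capacity 512, the capped task is reported as fitting (case a/b) while the boosted one is rejected (case c), matching the rules documented above.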
39174250 #else /* CONFIG_SMP */
....@@ -3928,11 +4261,11 @@
39284261 static inline void remove_entity_load_avg(struct sched_entity *se) {}
39294262
39304263 static inline void
3931
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
4264
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
39324265 static inline void
39334266 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
39344267
3935
-static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
4268
+static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
39364269 {
39374270 return 0;
39384271 }
....@@ -3941,8 +4274,11 @@
39414274 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
39424275
39434276 static inline void
3944
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
3945
- bool task_sleep) {}
4277
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4278
+
4279
+static inline void
4280
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
4281
+ bool task_sleep) {}
39464282 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
39474283
39484284 #endif /* CONFIG_SMP */
....@@ -3958,6 +4294,29 @@
39584294 if (d > 3*sysctl_sched_latency)
39594295 schedstat_inc(cfs_rq->nr_spread_over);
39604296 #endif
4297
+}
4298
+
4299
+static inline bool entity_is_long_sleeper(struct sched_entity *se)
4300
+{
4301
+ struct cfs_rq *cfs_rq;
4302
+ u64 sleep_time;
4303
+
4304
+ if (se->exec_start == 0)
4305
+ return false;
4306
+
4307
+ cfs_rq = cfs_rq_of(se);
4308
+
4309
+ sleep_time = rq_clock_task(rq_of(cfs_rq));
4310
+
4311
+ /* Can happen while migrating because of clock task divergence */
4312
+ if (sleep_time <= se->exec_start)
4313
+ return false;
4314
+
4315
+ sleep_time -= se->exec_start;
4316
+ if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
4317
+ return true;
4318
+
4319
+ return false;
39614320 }
39624321
39634322 static void
....@@ -3988,8 +4347,30 @@
39884347 vruntime -= thresh;
39894348 }
39904349
3991
- /* ensure we never gain time by being placed backwards. */
3992
- se->vruntime = max_vruntime(se->vruntime, vruntime);
4350
+ /*
4351
+ * Pull vruntime of the entity being placed to the base level of
4352
+ * cfs_rq, to prevent boosting it if placed backwards.
4353
+ * However, min_vruntime can advance much faster than real time, with
4354
+ * the extreme being when an entity with the minimal weight always runs
4355
+ * on the cfs_rq. If the waking entity slept for a long time, its
4356
+ * vruntime difference from min_vruntime may overflow s64 and their
4357
+ * comparison may get inverted, so ignore the entity's original
4358
+ * vruntime in that case.
4359
+ * The maximal vruntime speedup is given by the ratio of normal to
4360
+ * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
4361
+ * When placing a migrated waking entity, its exec_start has been set
4362
+ * from a different rq. In order to take into account a possible
4363
+ * divergence between new and prev rq's clocks task because of irq and
4364
+ * stolen time, we take an additional margin.
4365
+ * So, cutting off on the sleep time of
4366
+ * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
4367
+ * should be safe.
4368
+ */
4369
+ if (entity_is_long_sleeper(se))
4370
+ se->vruntime = vruntime;
4371
+ else
4372
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
4373
+ trace_android_rvh_place_entity(cfs_rq, se, initial, vruntime);
39934374 }
39944375
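A quick check of the cutoff quoted in the comment above, taking scale_load_down(NICE_0_LOAD) to be its usual value of 1024:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t nice_0_weight = 1024;  /* assumed scale_load_down(NICE_0_LOAD) */
    uint64_t cutoff_ns = (1ULL << 63) / nice_0_weight;
    double days = (double)cutoff_ns / 1e9 / 3600.0 / 24.0;

    /*
     * ~9007199254740992 ns, about 104.2 days: a sleep longer than this could
     * let vruntime - min_vruntime overflow s64, so place_entity() drops the
     * stale vruntime instead of feeding it to max_vruntime().
     */
    printf("cutoff = %llu ns (~%.1f days)\n",
           (unsigned long long)cutoff_ns, days);
    return 0;
}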
39954376 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
....@@ -4014,6 +4395,7 @@
40144395 #endif
40154396 }
40164397
4398
+static inline bool cfs_bandwidth_used(void);
40174399
40184400 /*
40194401 * MIGRATION
....@@ -4078,12 +4460,15 @@
40784460 * - Add its new weight to cfs_rq->load.weight
40794461 */
40804462 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4463
+ se_update_runnable(se);
40814464 update_cfs_group(se);
4082
- enqueue_runnable_load_avg(cfs_rq, se);
40834465 account_entity_enqueue(cfs_rq, se);
40844466
40854467 if (flags & ENQUEUE_WAKEUP)
40864468 place_entity(cfs_rq, se, 0);
4469
+ /* Entity has migrated, no longer consider this task hot */
4470
+ if (flags & ENQUEUE_MIGRATED)
4471
+ se->exec_start = 0;
40874472
40884473 check_schedstat_required();
40894474 update_stats_enqueue(cfs_rq, se, flags);
....@@ -4092,10 +4477,16 @@
40924477 __enqueue_entity(cfs_rq, se);
40934478 se->on_rq = 1;
40944479
4095
- if (cfs_rq->nr_running == 1) {
4480
+ /*
4481
+ * When bandwidth control is enabled, cfs might have been removed
4482
+ * because of a parent being throttled but cfs->nr_running > 1. Try to
4483
+ * add it unconditionally.
4484
+ */
4485
+ if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
40964486 list_add_leaf_cfs_rq(cfs_rq);
4487
+
4488
+ if (cfs_rq->nr_running == 1)
40974489 check_enqueue_throttle(cfs_rq);
4098
- }
40994490 }
41004491
41014492 static void __clear_buddies_last(struct sched_entity *se)
....@@ -4156,13 +4547,13 @@
41564547 /*
41574548 * When dequeuing a sched_entity, we must:
41584549 * - Update loads to have both entity and cfs_rq synced with now.
4159
- * - Substract its load from the cfs_rq->runnable_avg.
4160
- * - Substract its previous weight from cfs_rq->load.weight.
4550
+ * - Subtract its load from the cfs_rq->runnable_avg.
4551
+ * - Subtract its previous weight from cfs_rq->load.weight.
41614552 * - For group entity, update its weight to reflect the new share
41624553 * of its group cfs_rq.
41634554 */
41644555 update_load_avg(cfs_rq, se, UPDATE_TG);
4165
- dequeue_runnable_load_avg(cfs_rq, se);
4556
+ se_update_runnable(se);
41664557
41674558 update_stats_dequeue(cfs_rq, se, flags);
41684559
....@@ -4206,11 +4597,16 @@
42064597 unsigned long ideal_runtime, delta_exec;
42074598 struct sched_entity *se;
42084599 s64 delta;
4600
+ bool skip_preempt = false;
42094601
42104602 ideal_runtime = sched_slice(cfs_rq, curr);
42114603 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4604
+ trace_android_rvh_check_preempt_tick(current, &ideal_runtime, &skip_preempt,
4605
+ delta_exec, cfs_rq, curr, sysctl_sched_min_granularity);
4606
+ if (skip_preempt)
4607
+ return;
42124608 if (delta_exec > ideal_runtime) {
4213
- resched_curr_lazy(rq_of(cfs_rq));
4609
+ resched_curr(rq_of(cfs_rq));
42144610 /*
42154611 * The current task ran long enough, ensure it doesn't get
42164612 * re-elected due to buddy favours.
....@@ -4234,11 +4630,10 @@
42344630 return;
42354631
42364632 if (delta > ideal_runtime)
4237
- resched_curr_lazy(rq_of(cfs_rq));
4633
+ resched_curr(rq_of(cfs_rq));
42384634 }
42394635
4240
-static void
4241
-set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4636
+void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
42424637 {
42434638 /* 'current' is not kept within the tree. */
42444639 if (se->on_rq) {
....@@ -4260,7 +4655,8 @@
42604655 * least twice that of our own weight (i.e. dont track it
42614656 * when there are only lesser-weight tasks around):
42624657 */
4263
- if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4658
+ if (schedstat_enabled() &&
4659
+ rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
42644660 schedstat_set(se->statistics.slice_max,
42654661 max((u64)schedstat_val(se->statistics.slice_max),
42664662 se->sum_exec_runtime - se->prev_sum_exec_runtime));
....@@ -4268,6 +4664,8 @@
42684664
42694665 se->prev_sum_exec_runtime = se->sum_exec_runtime;
42704666 }
4667
+EXPORT_SYMBOL_GPL(set_next_entity);
4668
+
42714669
42724670 static int
42734671 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
....@@ -4283,7 +4681,11 @@
42834681 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
42844682 {
42854683 struct sched_entity *left = __pick_first_entity(cfs_rq);
4286
- struct sched_entity *se;
4684
+ struct sched_entity *se = NULL;
4685
+
4686
+ trace_android_rvh_pick_next_entity(cfs_rq, curr, &se);
4687
+ if (se)
4688
+ goto done;
42874689
42884690 /*
42894691 * If curr is set we have to see if its left of the leftmost entity
....@@ -4313,18 +4715,19 @@
43134715 se = second;
43144716 }
43154717
4316
- /*
4317
- * Prefer last buddy, try to return the CPU to a preempted task.
4318
- */
4319
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4320
- se = cfs_rq->last;
4321
-
4322
- /*
4323
- * Someone really wants this to run. If it's not unfair, run it.
4324
- */
4325
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4718
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
4719
+ /*
4720
+ * Someone really wants this to run. If it's not unfair, run it.
4721
+ */
43264722 se = cfs_rq->next;
4723
+ } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
4724
+ /*
4725
+ * Prefer last buddy, try to return the CPU to a preempted task.
4726
+ */
4727
+ se = cfs_rq->last;
4728
+ }
43274729
4730
+done:
43284731 clear_buddies(cfs_rq, se);
43294732
43304733 return se;
....@@ -4376,7 +4779,7 @@
43764779 * validating it and just reschedule.
43774780 */
43784781 if (queued) {
4379
- resched_curr_lazy(rq_of(cfs_rq));
4782
+ resched_curr(rq_of(cfs_rq));
43804783 return;
43814784 }
43824785 /*
....@@ -4457,26 +4860,17 @@
44574860 return &tg->cfs_bandwidth;
44584861 }
44594862
4460
-/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
4461
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4462
-{
4463
- if (unlikely(cfs_rq->throttle_count))
4464
- return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
4465
-
4466
- return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
4467
-}
4468
-
44694863 /* returns 0 on failure to allocate runtime */
4470
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4864
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
4865
+ struct cfs_rq *cfs_rq, u64 target_runtime)
44714866 {
4472
- struct task_group *tg = cfs_rq->tg;
4473
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4474
- u64 amount = 0, min_amount;
4867
+ u64 min_amount, amount = 0;
4868
+
4869
+ lockdep_assert_held(&cfs_b->lock);
44754870
44764871 /* note: this is a positive sum as runtime_remaining <= 0 */
4477
- min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4872
+ min_amount = target_runtime - cfs_rq->runtime_remaining;
44784873
4479
- raw_spin_lock(&cfs_b->lock);
44804874 if (cfs_b->quota == RUNTIME_INF)
44814875 amount = min_amount;
44824876 else {
....@@ -4488,11 +4882,23 @@
44884882 cfs_b->idle = 0;
44894883 }
44904884 }
4491
- raw_spin_unlock(&cfs_b->lock);
44924885
44934886 cfs_rq->runtime_remaining += amount;
44944887
44954888 return cfs_rq->runtime_remaining > 0;
4889
+}
4890
+
4891
+/* returns 0 on failure to allocate runtime */
4892
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4893
+{
4894
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4895
+ int ret;
4896
+
4897
+ raw_spin_lock(&cfs_b->lock);
4898
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
4899
+ raw_spin_unlock(&cfs_b->lock);
4900
+
4901
+ return ret;
44964902 }
44974903
44984904 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
....@@ -4510,7 +4916,7 @@
45104916 * hierarchy can be throttled
45114917 */
45124918 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4513
- resched_curr_lazy(rq_of(cfs_rq));
4919
+ resched_curr(rq_of(cfs_rq));
45144920 }
45154921
45164922 static __always_inline
....@@ -4557,9 +4963,8 @@
45574963
45584964 cfs_rq->throttle_count--;
45594965 if (!cfs_rq->throttle_count) {
4560
- /* adjust cfs_rq_clock_task() */
4561
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4562
- cfs_rq->throttled_clock_task;
4966
+ cfs_rq->throttled_clock_pelt_time += rq_clock_task_mult(rq) -
4967
+ cfs_rq->throttled_clock_pelt;
45634968
45644969 /* Add cfs_rq with already running entity in the list */
45654970 if (cfs_rq->nr_running >= 1)
....@@ -4576,7 +4981,7 @@
45764981
45774982 /* group is entering throttled state, stop time */
45784983 if (!cfs_rq->throttle_count) {
4579
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
4984
+ cfs_rq->throttled_clock_pelt = rq_clock_task_mult(rq);
45804985 list_del_leaf_cfs_rq(cfs_rq);
45814986 }
45824987 cfs_rq->throttle_count++;
....@@ -4584,13 +4989,33 @@
45844989 return 0;
45854990 }
45864991
4587
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4992
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
45884993 {
45894994 struct rq *rq = rq_of(cfs_rq);
45904995 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
45914996 struct sched_entity *se;
4592
- long task_delta, dequeue = 1;
4593
- bool empty;
4997
+ long task_delta, idle_task_delta, dequeue = 1;
4998
+
4999
+ raw_spin_lock(&cfs_b->lock);
5000
+ /* This will start the period timer if necessary */
5001
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
5002
+ /*
5003
+ * We have raced with bandwidth becoming available, and if we
5004
+ * actually throttled the timer might not unthrottle us for an
5005
+ * entire period. We additionally needed to make sure that any
5006
+ * subsequent check_cfs_rq_runtime calls agree not to throttle
5007
+ * us, as we may commit to do cfs put_prev+pick_next, so we ask
5008
+ * for 1ns of runtime rather than just check cfs_b.
5009
+ */
5010
+ dequeue = 0;
5011
+ } else {
5012
+ list_add_tail_rcu(&cfs_rq->throttled_list,
5013
+ &cfs_b->throttled_cfs_rq);
5014
+ }
5015
+ raw_spin_unlock(&cfs_b->lock);
5016
+
5017
+ if (!dequeue)
5018
+ return false; /* Throttle no longer required. */
45945019
45955020 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
45965021
....@@ -4600,15 +5025,22 @@
46005025 rcu_read_unlock();
46015026
46025027 task_delta = cfs_rq->h_nr_running;
5028
+ idle_task_delta = cfs_rq->idle_h_nr_running;
46035029 for_each_sched_entity(se) {
46045030 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
46055031 /* throttled entity or throttle-on-deactivate */
46065032 if (!se->on_rq)
46075033 break;
46085034
4609
- if (dequeue)
5035
+ if (dequeue) {
46105036 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
5037
+ } else {
5038
+ update_load_avg(qcfs_rq, se, 0);
5039
+ se_update_runnable(se);
5040
+ }
5041
+
46115042 qcfs_rq->h_nr_running -= task_delta;
5043
+ qcfs_rq->idle_h_nr_running -= idle_task_delta;
46125044
46135045 if (qcfs_rq->load.weight)
46145046 dequeue = 0;
....@@ -4617,29 +5049,13 @@
46175049 if (!se)
46185050 sub_nr_running(rq, task_delta);
46195051
5052
+ /*
5053
+ * Note: distribution will already see us throttled via the
5054
+ * throttled-list. rq->lock protects completion.
5055
+ */
46205056 cfs_rq->throttled = 1;
46215057 cfs_rq->throttled_clock = rq_clock(rq);
4622
- raw_spin_lock(&cfs_b->lock);
4623
- empty = list_empty(&cfs_b->throttled_cfs_rq);
4624
-
4625
- /*
4626
- * Add to the _head_ of the list, so that an already-started
4627
- * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
4628
- * not running add to the tail so that later runqueues don't get starved.
4629
- */
4630
- if (cfs_b->distribute_running)
4631
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4632
- else
4633
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4634
-
4635
- /*
4636
- * If we're the first throttled task, make sure the bandwidth
4637
- * timer is running.
4638
- */
4639
- if (empty)
4640
- start_cfs_bandwidth(cfs_b);
4641
-
4642
- raw_spin_unlock(&cfs_b->lock);
5058
+ return true;
46435059 }
46445060
46455061 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
....@@ -4647,8 +5063,7 @@
46475063 struct rq *rq = rq_of(cfs_rq);
46485064 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
46495065 struct sched_entity *se;
4650
- int enqueue = 1;
4651
- long task_delta;
5066
+ long task_delta, idle_task_delta;
46525067
46535068 se = cfs_rq->tg->se[cpu_of(rq)];
46545069
....@@ -4668,34 +5083,70 @@
46685083 return;
46695084
46705085 task_delta = cfs_rq->h_nr_running;
5086
+ idle_task_delta = cfs_rq->idle_h_nr_running;
46715087 for_each_sched_entity(se) {
46725088 if (se->on_rq)
4673
- enqueue = 0;
4674
-
5089
+ break;
46755090 cfs_rq = cfs_rq_of(se);
4676
- if (enqueue)
4677
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4678
- cfs_rq->h_nr_running += task_delta;
5091
+ enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
46795092
5093
+ cfs_rq->h_nr_running += task_delta;
5094
+ cfs_rq->idle_h_nr_running += idle_task_delta;
5095
+
5096
+ /* end evaluation on encountering a throttled cfs_rq */
46805097 if (cfs_rq_throttled(cfs_rq))
5098
+ goto unthrottle_throttle;
5099
+ }
5100
+
5101
+ for_each_sched_entity(se) {
5102
+ cfs_rq = cfs_rq_of(se);
5103
+
5104
+ update_load_avg(cfs_rq, se, UPDATE_TG);
5105
+ se_update_runnable(se);
5106
+
5107
+ cfs_rq->h_nr_running += task_delta;
5108
+ cfs_rq->idle_h_nr_running += idle_task_delta;
5109
+
5110
+
5111
+ /* end evaluation on encountering a throttled cfs_rq */
5112
+ if (cfs_rq_throttled(cfs_rq))
5113
+ goto unthrottle_throttle;
5114
+
5115
+ /*
5116
+ * One parent has been throttled and cfs_rq removed from the
5117
+ * list. Add it back to not break the leaf list.
5118
+ */
5119
+ if (throttled_hierarchy(cfs_rq))
5120
+ list_add_leaf_cfs_rq(cfs_rq);
5121
+ }
5122
+
5123
+ /* At this point se is NULL and we are at root level*/
5124
+ add_nr_running(rq, task_delta);
5125
+
5126
+unthrottle_throttle:
5127
+ /*
5128
+ * The cfs_rq_throttled() breaks in the above iteration can result in
5129
+ * incomplete leaf list maintenance, resulting in triggering the
5130
+ * assertion below.
5131
+ */
5132
+ for_each_sched_entity(se) {
5133
+ cfs_rq = cfs_rq_of(se);
5134
+
5135
+ if (list_add_leaf_cfs_rq(cfs_rq))
46815136 break;
46825137 }
46835138
46845139 assert_list_leaf_cfs_rq(rq);
4685
-
4686
- if (!se)
4687
- add_nr_running(rq, task_delta);
46885140
46895141 /* Determine whether we need to wake up potentially idle CPU: */
46905142 if (rq->curr == rq->idle && rq->cfs.nr_running)
46915143 resched_curr(rq);
46925144 }
46935145
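Both throttle_cfs_rq() and unthrottle_cfs_rq() above carry the same task_delta/idle_task_delta pair into every ancestor's h_nr_running and idle_h_nr_running, stopping early at a boundary level. A toy walk of just that counter bookkeeping, with the enqueue/dequeue and PELT work omitted, the struct invented for the sketch, and the stop condition simplified to an already-throttled ancestor:

#include <stdio.h>

struct toy_cfs_rq {
    struct toy_cfs_rq *parent;  /* NULL at the root */
    int h_nr_running;           /* all hierarchical tasks */
    int idle_h_nr_running;      /* hierarchical SCHED_IDLE tasks */
    int throttled;
};

/* propagate a group's task counts up to the root, stopping early if an
 * ancestor is itself throttled (it already excludes this subtree) */
static void toy_propagate(struct toy_cfs_rq *cfs_rq, int task_delta, int idle_delta)
{
    for (; cfs_rq; cfs_rq = cfs_rq->parent) {
        cfs_rq->h_nr_running += task_delta;
        cfs_rq->idle_h_nr_running += idle_delta;
        if (cfs_rq->throttled)
            break;
    }
}

int main(void)
{
    struct toy_cfs_rq root  = { .parent = NULL,  .h_nr_running = 7, .idle_h_nr_running = 1 };
    struct toy_cfs_rq group = { .parent = &root, .h_nr_running = 3, .idle_h_nr_running = 1 };

    /* throttling "group" removes its 3 tasks (1 SCHED_IDLE) from every level above */
    toy_propagate(group.parent, -group.h_nr_running, -group.idle_h_nr_running);
    printf("root: h_nr_running=%d idle_h_nr_running=%d\n",
           root.h_nr_running, root.idle_h_nr_running);
    return 0;
}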
4694
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
5146
+static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
46955147 {
46965148 struct cfs_rq *cfs_rq;
4697
- u64 runtime;
4698
- u64 starting_runtime = remaining;
5149
+ u64 runtime, remaining = 1;
46995150
47005151 rcu_read_lock();
47015152 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
....@@ -4710,10 +5161,13 @@
47105161 /* By the above check, this should never be true */
47115162 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
47125163
5164
+ raw_spin_lock(&cfs_b->lock);
47135165 runtime = -cfs_rq->runtime_remaining + 1;
4714
- if (runtime > remaining)
4715
- runtime = remaining;
4716
- remaining -= runtime;
5166
+ if (runtime > cfs_b->runtime)
5167
+ runtime = cfs_b->runtime;
5168
+ cfs_b->runtime -= runtime;
5169
+ remaining = cfs_b->runtime;
5170
+ raw_spin_unlock(&cfs_b->lock);
47175171
47185172 cfs_rq->runtime_remaining += runtime;
47195173
....@@ -4728,8 +5182,6 @@
47285182 break;
47295183 }
47305184 rcu_read_unlock();
4731
-
4732
- return starting_runtime - remaining;
47335185 }
47345186
47355187 /*
....@@ -4740,7 +5192,6 @@
47405192 */
47415193 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
47425194 {
4743
- u64 runtime;
47445195 int throttled;
47455196
47465197 /* no need to continue the timer with no bandwidth constraint */
....@@ -4769,24 +5220,15 @@
47695220 cfs_b->nr_throttled += overrun;
47705221
47715222 /*
4772
- * This check is repeated as we are holding onto the new bandwidth while
4773
- * we unthrottle. This can potentially race with an unthrottled group
4774
- * trying to acquire new bandwidth from the global pool. This can result
4775
- * in us over-using our runtime if it is all used during this loop, but
4776
- * only by limited amounts in that extreme case.
5223
+ * This check is repeated as we release cfs_b->lock while we unthrottle.
47775224 */
4778
- while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4779
- runtime = cfs_b->runtime;
4780
- cfs_b->distribute_running = 1;
5225
+ while (throttled && cfs_b->runtime > 0) {
47815226 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
47825227 /* we can't nest cfs_b->lock while distributing bandwidth */
4783
- runtime = distribute_cfs_runtime(cfs_b, runtime);
5228
+ distribute_cfs_runtime(cfs_b);
47845229 raw_spin_lock_irqsave(&cfs_b->lock, flags);
47855230
4786
- cfs_b->distribute_running = 0;
47875231 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4788
-
4789
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
47905232 }
47915233
47925234 /*
....@@ -4842,6 +5284,11 @@
48425284 if (runtime_refresh_within(cfs_b, min_left))
48435285 return;
48445286
5287
+ /* don't push forwards an existing deferred unthrottle */
5288
+ if (cfs_b->slack_started)
5289
+ return;
5290
+ cfs_b->slack_started = true;
5291
+
48455292 hrtimer_start(&cfs_b->slack_timer,
48465293 ns_to_ktime(cfs_bandwidth_slack_period),
48475294 HRTIMER_MODE_REL);
....@@ -4893,10 +5340,7 @@
48935340
48945341 /* confirm we're still not at a refresh boundary */
48955342 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4896
- if (cfs_b->distribute_running) {
4897
- raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4898
- return;
4899
- }
5343
+ cfs_b->slack_started = false;
49005344
49015345 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
49025346 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
....@@ -4906,26 +5350,21 @@
49065350 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
49075351 runtime = cfs_b->runtime;
49085352
4909
- if (runtime)
4910
- cfs_b->distribute_running = 1;
4911
-
49125353 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49135354
49145355 if (!runtime)
49155356 return;
49165357
4917
- runtime = distribute_cfs_runtime(cfs_b, runtime);
5358
+ distribute_cfs_runtime(cfs_b);
49185359
49195360 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4920
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
4921
- cfs_b->distribute_running = 0;
49225361 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49235362 }
49245363
49255364 /*
49265365 * When a group wakes up we want to make sure that its quota is not already
49275366 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4928
- * runtime as update_curr() throttling can not not trigger until it's on-rq.
5367
+ * runtime as update_curr() throttling can not trigger until it's on-rq.
49295368 */
49305369 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
49315370 {
....@@ -4960,7 +5399,7 @@
49605399 pcfs_rq = tg->parent->cfs_rq[cpu];
49615400
49625401 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4963
- cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
5402
+ cfs_rq->throttled_clock_pelt = rq_clock_task_mult(cpu_rq(cpu));
49645403 }
49655404
49665405 /* conditionally throttle active cfs_rq's from put_prev_entity() */
....@@ -4979,8 +5418,7 @@
49795418 if (cfs_rq_throttled(cfs_rq))
49805419 return true;
49815420
4982
- throttle_cfs_rq(cfs_rq);
4983
- return true;
5421
+ return throttle_cfs_rq(cfs_rq);
49845422 }
49855423
49865424 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
....@@ -5009,6 +5447,8 @@
50095447 overrun = hrtimer_forward_now(timer, cfs_b->period);
50105448 if (!overrun)
50115449 break;
5450
+
5451
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
50125452
50135453 if (++count > 3) {
50145454 u64 new, old = ktime_to_ns(cfs_b->period);
....@@ -5039,8 +5479,6 @@
50395479 /* reset count so we don't come right back in here */
50405480 count = 0;
50415481 }
5042
-
5043
- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
50445482 }
50455483 if (idle)
50465484 cfs_b->period_active = 0;
....@@ -5061,7 +5499,7 @@
50615499 cfs_b->period_timer.function = sched_cfs_period_timer;
50625500 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
50635501 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5064
- cfs_b->distribute_running = 0;
5502
+ cfs_b->slack_started = false;
50655503 }
50665504
50675505 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
....@@ -5156,11 +5594,6 @@
51565594 return false;
51575595 }
51585596
5159
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5160
-{
5161
- return rq_clock_task(rq_of(cfs_rq));
5162
-}
5163
-
51645597 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
51655598 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
51665599 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
....@@ -5218,7 +5651,7 @@
52185651
52195652 if (delta < 0) {
52205653 if (rq->curr == p)
5221
- resched_curr_lazy(rq);
5654
+ resched_curr(rq);
52225655 return;
52235656 }
52245657 hrtick_start(rq, delta);
....@@ -5253,22 +5686,43 @@
52535686
52545687 #ifdef CONFIG_SMP
52555688 static inline unsigned long cpu_util(int cpu);
5256
-static unsigned long capacity_of(int cpu);
52575689
52585690 static inline bool cpu_overutilized(int cpu)
52595691 {
5260
- return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
5692
+ unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
5693
+ unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
5694
+ int overutilized = -1;
5695
+
5696
+ trace_android_rvh_cpu_overutilized(cpu, &overutilized);
5697
+ if (overutilized != -1)
5698
+ return overutilized;
5699
+
5700
+ return !util_fits_cpu(cpu_util(cpu), rq_util_min, rq_util_max, cpu);
52615701 }
52625702
52635703 static inline void update_overutilized_status(struct rq *rq)
52645704 {
52655705 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
52665706 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5267
- trace_sched_overutilized(1);
5707
+ trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
52685708 }
52695709 }
52705710 #else
52715711 static inline void update_overutilized_status(struct rq *rq) { }
5712
+#endif
5713
+
5714
+/* Runqueue only has SCHED_IDLE tasks enqueued */
5715
+static int sched_idle_rq(struct rq *rq)
5716
+{
5717
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5718
+ rq->nr_running);
5719
+}
5720
+
5721
+#ifdef CONFIG_SMP
5722
+static int sched_idle_cpu(int cpu)
5723
+{
5724
+ return sched_idle_rq(cpu_rq(cpu));
5725
+}
52725726 #endif
52735727
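sched_idle_rq() answers "does this runqueue hold only SCHED_IDLE tasks?" in O(1) because idle_h_nr_running is maintained on every enqueue and dequeue, and dequeue_task_fair() below uses the transition into that state to pull the next balance forward. A minimal stand-alone model with invented types:

#include <stdbool.h>
#include <stdio.h>

struct toy_rq {
    int nr_running;            /* every runnable task on the CPU */
    int idle_h_nr_running;     /* of those, the SCHED_IDLE ones (hierarchical) */
    unsigned long next_balance, jiffies;
};

/* non-empty and made up exclusively of SCHED_IDLE tasks */
static bool toy_sched_idle_rq(const struct toy_rq *rq)
{
    return rq->nr_running && rq->nr_running == rq->idle_h_nr_running;
}

static void toy_dequeue(struct toy_rq *rq, bool dequeued_task_is_idle)
{
    bool was_sched_idle = toy_sched_idle_rq(rq);

    rq->nr_running--;
    if (dequeued_task_is_idle)
        rq->idle_h_nr_running--;

    /* became SCHED_IDLE-only: ask for an immediate balance to pull real work */
    if (!was_sched_idle && toy_sched_idle_rq(rq))
        rq->next_balance = rq->jiffies;
}

int main(void)
{
    struct toy_rq rq = { .nr_running = 2, .idle_h_nr_running = 1,
                         .next_balance = 1000, .jiffies = 500 };

    toy_dequeue(&rq, false);  /* the last non-idle task leaves */
    printf("sched_idle=%d next_balance=%lu\n",
           toy_sched_idle_rq(&rq), rq.next_balance);
    return 0;
}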
52745728 /*
....@@ -5281,12 +5735,9 @@
52815735 {
52825736 struct cfs_rq *cfs_rq;
52835737 struct sched_entity *se = &p->se;
5738
+ int idle_h_nr_running = task_has_idle_policy(p);
52845739 int task_new = !(flags & ENQUEUE_WAKEUP);
5285
-
5286
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
5287
- if (sysctl_sched_performance_bias)
5288
- cpufreq_task_boost(rq->cpu, task_util_est(p));
5289
-#endif
5740
+ int should_iowait_boost;
52905741
52915742 /*
52925743 * The code below (indirectly) updates schedutil which looks at
....@@ -5297,29 +5748,13 @@
52975748 util_est_enqueue(&rq->cfs, p);
52985749
52995750 /*
5300
- * The code below (indirectly) updates schedutil which looks at
5301
- * the cfs_rq utilization to select a frequency.
5302
- * Let's update schedtune here to ensure the boost value of the
5303
- * current task is accounted for in the selection of the OPP.
5304
- *
5305
- * We do it also in the case where we enqueue a throttled task;
5306
- * we could argue that a throttled task should not boost a CPU,
5307
- * however:
5308
- * a) properly implementing CPU boosting considering throttled
5309
- * tasks will increase a lot the complexity of the solution
5310
- * b) it's not easy to quantify the benefits introduced by
5311
- * such a more complex solution.
5312
- * Thus, for the time being we go for the simple solution and boost
5313
- * also for throttled RQs.
5314
- */
5315
- schedtune_enqueue_task(p, cpu_of(rq));
5316
-
5317
- /*
53185751 * If in_iowait is set, the code below may not trigger any cpufreq
53195752 * utilization updates, so do it here explicitly with the IOWAIT flag
53205753 * passed.
53215754 */
5322
- if (p->in_iowait)
5755
+ should_iowait_boost = p->in_iowait;
5756
+ trace_android_rvh_set_iowait(p, &should_iowait_boost);
5757
+ if (should_iowait_boost)
53235758 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
53245759
53255760 for_each_sched_entity(se) {
....@@ -5328,51 +5763,60 @@
53285763 cfs_rq = cfs_rq_of(se);
53295764 enqueue_entity(cfs_rq, se, flags);
53305765
5331
- /*
5332
- * end evaluation on encountering a throttled cfs_rq
5333
- *
5334
- * note: in the case of encountering a throttled cfs_rq we will
5335
- * post the final h_nr_running increment below.
5336
- */
5337
- if (cfs_rq_throttled(cfs_rq))
5338
- break;
53395766 cfs_rq->h_nr_running++;
5767
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
5768
+
5769
+ /* end evaluation on encountering a throttled cfs_rq */
5770
+ if (cfs_rq_throttled(cfs_rq))
5771
+ goto enqueue_throttle;
53405772
53415773 flags = ENQUEUE_WAKEUP;
53425774 }
53435775
5776
+ trace_android_rvh_enqueue_task_fair(rq, p, flags);
53445777 for_each_sched_entity(se) {
53455778 cfs_rq = cfs_rq_of(se);
5346
- cfs_rq->h_nr_running++;
5347
-
5348
- if (cfs_rq_throttled(cfs_rq))
5349
- break;
53505779
53515780 update_load_avg(cfs_rq, se, UPDATE_TG);
5781
+ se_update_runnable(se);
53525782 update_cfs_group(se);
5783
+
5784
+ cfs_rq->h_nr_running++;
5785
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
5786
+
5787
+ /* end evaluation on encountering a throttled cfs_rq */
5788
+ if (cfs_rq_throttled(cfs_rq))
5789
+ goto enqueue_throttle;
5790
+
5791
+ /*
5792
+ * One parent has been throttled and cfs_rq removed from the
5793
+ * list. Add it back to not break the leaf list.
5794
+ */
5795
+ if (throttled_hierarchy(cfs_rq))
5796
+ list_add_leaf_cfs_rq(cfs_rq);
53535797 }
53545798
5355
- if (!se) {
5356
- add_nr_running(rq, 1);
5357
- /*
5358
- * Since new tasks are assigned an initial util_avg equal to
5359
- * half of the spare capacity of their CPU, tiny tasks have the
5360
- * ability to cross the overutilized threshold, which will
5361
- * result in the load balancer ruining all the task placement
5362
- * done by EAS. As a way to mitigate that effect, do not account
5363
- * for the first enqueue operation of new tasks during the
5364
- * overutilized flag detection.
5365
- *
5366
- * A better way of solving this problem would be to wait for
5367
- * the PELT signals of tasks to converge before taking them
5368
- * into account, but that is not straightforward to implement,
5369
- * and the following generally works well enough in practice.
5370
- */
5371
- if (!task_new)
5372
- update_overutilized_status(rq);
5799
+ /* At this point se is NULL and we are at root level*/
5800
+ add_nr_running(rq, 1);
53735801
5374
- }
5802
+ /*
5803
+ * Since new tasks are assigned an initial util_avg equal to
5804
+ * half of the spare capacity of their CPU, tiny tasks have the
5805
+ * ability to cross the overutilized threshold, which will
5806
+ * result in the load balancer ruining all the task placement
5807
+ * done by EAS. As a way to mitigate that effect, do not account
5808
+ * for the first enqueue operation of new tasks during the
5809
+ * overutilized flag detection.
5810
+ *
5811
+ * A better way of solving this problem would be to wait for
5812
+ * the PELT signals of tasks to converge before taking them
5813
+ * into account, but that is not straightforward to implement,
5814
+ * and the following generally works well enough in practice.
5815
+ */
5816
+ if (!task_new)
5817
+ update_overutilized_status(rq);
53755818
5819
+enqueue_throttle:
53765820 if (cfs_bandwidth_used()) {
53775821 /*
53785822 * When bandwidth control is enabled; the cfs_rq_throttled()
....@@ -5405,28 +5849,21 @@
54055849 struct cfs_rq *cfs_rq;
54065850 struct sched_entity *se = &p->se;
54075851 int task_sleep = flags & DEQUEUE_SLEEP;
5852
+ int idle_h_nr_running = task_has_idle_policy(p);
5853
+ bool was_sched_idle = sched_idle_rq(rq);
54085854
5409
- /*
5410
- * The code below (indirectly) updates schedutil which looks at
5411
- * the cfs_rq utilization to select a frequency.
5412
- * Let's update schedtune here to ensure the boost value of the
5413
- * current task is not more accounted for in the selection of the OPP.
5414
- */
5415
- schedtune_dequeue_task(p, cpu_of(rq));
5855
+ util_est_dequeue(&rq->cfs, p);
54165856
54175857 for_each_sched_entity(se) {
54185858 cfs_rq = cfs_rq_of(se);
54195859 dequeue_entity(cfs_rq, se, flags);
54205860
5421
- /*
5422
- * end evaluation on encountering a throttled cfs_rq
5423
- *
5424
- * note: in the case of encountering a throttled cfs_rq we will
5425
- * post the final h_nr_running decrement below.
5426
- */
5427
- if (cfs_rq_throttled(cfs_rq))
5428
- break;
54295861 cfs_rq->h_nr_running--;
5862
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5863
+
5864
+ /* end evaluation on encountering a throttled cfs_rq */
5865
+ if (cfs_rq_throttled(cfs_rq))
5866
+ goto dequeue_throttle;
54305867
54315868 /* Don't dequeue parent if it has other entities besides us */
54325869 if (cfs_rq->load.weight) {
....@@ -5443,21 +5880,32 @@
54435880 flags |= DEQUEUE_SLEEP;
54445881 }
54455882
5883
+ trace_android_rvh_dequeue_task_fair(rq, p, flags);
54465884 for_each_sched_entity(se) {
54475885 cfs_rq = cfs_rq_of(se);
5448
- cfs_rq->h_nr_running--;
5449
-
5450
- if (cfs_rq_throttled(cfs_rq))
5451
- break;
54525886
54535887 update_load_avg(cfs_rq, se, UPDATE_TG);
5888
+ se_update_runnable(se);
54545889 update_cfs_group(se);
5890
+
5891
+ cfs_rq->h_nr_running--;
5892
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5893
+
5894
+ /* end evaluation on encountering a throttled cfs_rq */
5895
+ if (cfs_rq_throttled(cfs_rq))
5896
+ goto dequeue_throttle;
5897
+
54555898 }
54565899
5457
- if (!se)
5458
- sub_nr_running(rq, 1);
5900
+ /* At this point se is NULL and we are at root level*/
5901
+ sub_nr_running(rq, 1);
54595902
5460
- util_est_dequeue(&rq->cfs, p, task_sleep);
5903
+ /* balance early to pull high priority tasks */
5904
+ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
5905
+ rq->next_balance = jiffies;
5906
+
5907
+dequeue_throttle:
5908
+ util_est_update(&rq->cfs, p, task_sleep);
54615909 hrtick_update(rq);
54625910 }
54635911
....@@ -5468,71 +5916,6 @@
54685916 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
54695917
54705918 #ifdef CONFIG_NO_HZ_COMMON
5471
-/*
5472
- * per rq 'load' arrray crap; XXX kill this.
5473
- */
5474
-
5475
-/*
5476
- * The exact cpuload calculated at every tick would be:
5477
- *
5478
- * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5479
- *
5480
- * If a CPU misses updates for n ticks (as it was idle) and update gets
5481
- * called on the n+1-th tick when CPU may be busy, then we have:
5482
- *
5483
- * load_n = (1 - 1/2^i)^n * load_0
5484
- * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
5485
- *
5486
- * decay_load_missed() below does efficient calculation of
5487
- *
5488
- * load' = (1 - 1/2^i)^n * load
5489
- *
5490
- * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5491
- * This allows us to precompute the above in said factors, thereby allowing the
5492
- * reduction of an arbitrary n in O(log_2 n) steps. (See also
5493
- * fixed_power_int())
5494
- *
5495
- * The calculation is approximated on a 128 point scale.
5496
- */
5497
-#define DEGRADE_SHIFT 7
5498
-
5499
-static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5500
-static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5501
- { 0, 0, 0, 0, 0, 0, 0, 0 },
5502
- { 64, 32, 8, 0, 0, 0, 0, 0 },
5503
- { 96, 72, 40, 12, 1, 0, 0, 0 },
5504
- { 112, 98, 75, 43, 15, 1, 0, 0 },
5505
- { 120, 112, 98, 76, 45, 16, 2, 0 }
5506
-};
5507
-
5508
-/*
5509
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
5510
- * would be when CPU is idle and so we just decay the old load without
5511
- * adding any new load.
5512
- */
5513
-static unsigned long
5514
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5515
-{
5516
- int j = 0;
5517
-
5518
- if (!missed_updates)
5519
- return load;
5520
-
5521
- if (missed_updates >= degrade_zero_ticks[idx])
5522
- return 0;
5523
-
5524
- if (idx == 1)
5525
- return load >> missed_updates;
5526
-
5527
- while (missed_updates) {
5528
- if (missed_updates % 2)
5529
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5530
-
5531
- missed_updates >>= 1;
5532
- j++;
5533
- }
5534
- return load;
5535
-}
55365919
55375920 static struct {
55385921 cpumask_var_t idle_cpus_mask;
....@@ -5544,249 +5927,68 @@
55445927
55455928 #endif /* CONFIG_NO_HZ_COMMON */
55465929
5547
-/**
5548
- * __cpu_load_update - update the rq->cpu_load[] statistics
5549
- * @this_rq: The rq to update statistics for
5550
- * @this_load: The current load
5551
- * @pending_updates: The number of missed updates
5552
- *
5553
- * Update rq->cpu_load[] statistics. This function is usually called every
5554
- * scheduler tick (TICK_NSEC).
5555
- *
5556
- * This function computes a decaying average:
5557
- *
5558
- * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5559
- *
5560
- * Because of NOHZ it might not get called on every tick which gives need for
5561
- * the @pending_updates argument.
5562
- *
5563
- * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5564
- * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5565
- * = A * (A * load[i]_n-2 + B) + B
5566
- * = A * (A * (A * load[i]_n-3 + B) + B) + B
5567
- * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5568
- * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5569
- * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5570
- * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5571
- *
5572
- * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5573
- * any change in load would have resulted in the tick being turned back on.
5574
- *
5575
- * For regular NOHZ, this reduces to:
5576
- *
5577
- * load[i]_n = (1 - 1/2^i)^n * load[i]_0
5578
- *
5579
- * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
5580
- * term.
5581
- */
5582
-static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5583
- unsigned long pending_updates)
5930
+static unsigned long cpu_load(struct rq *rq)
55845931 {
5585
- unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5586
- int i, scale;
5587
-
5588
- this_rq->nr_load_updates++;
5589
-
5590
- /* Update our load: */
5591
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5592
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5593
- unsigned long old_load, new_load;
5594
-
5595
- /* scale is effectively 1 << i now, and >> i divides by scale */
5596
-
5597
- old_load = this_rq->cpu_load[i];
5598
-#ifdef CONFIG_NO_HZ_COMMON
5599
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
5600
- if (tickless_load) {
5601
- old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5602
- /*
5603
- * old_load can never be a negative value because a
5604
- * decayed tickless_load cannot be greater than the
5605
- * original tickless_load.
5606
- */
5607
- old_load += tickless_load;
5608
- }
5609
-#endif
5610
- new_load = this_load;
5611
- /*
5612
- * Round up the averaging division if load is increasing. This
5613
- * prevents us from getting stuck on 9 if the load is 10, for
5614
- * example.
5615
- */
5616
- if (new_load > old_load)
5617
- new_load += scale - 1;
5618
-
5619
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5620
- }
5621
-}
5622
-
5623
-/* Used instead of source_load when we know the type == 0 */
5624
-static unsigned long weighted_cpuload(struct rq *rq)
5625
-{
5626
- return cfs_rq_runnable_load_avg(&rq->cfs);
5627
-}
5628
-
5629
-#ifdef CONFIG_NO_HZ_COMMON
5630
-/*
5631
- * There is no sane way to deal with nohz on smp when using jiffies because the
5632
- * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5633
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5634
- *
5635
- * Therefore we need to avoid the delta approach from the regular tick when
5636
- * possible since that would seriously skew the load calculation. This is why we
5637
- * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5638
- * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5639
- * loop exit, nohz_idle_balance, nohz full exit...)
5640
- *
5641
- * This means we might still be one tick off for nohz periods.
5642
- */
5643
-
5644
-static void cpu_load_update_nohz(struct rq *this_rq,
5645
- unsigned long curr_jiffies,
5646
- unsigned long load)
5647
-{
5648
- unsigned long pending_updates;
5649
-
5650
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5651
- if (pending_updates) {
5652
- this_rq->last_load_update_tick = curr_jiffies;
5653
- /*
5654
- * In the regular NOHZ case, we were idle, this means load 0.
5655
- * In the NOHZ_FULL case, we were non-idle, we should consider
5656
- * its weighted load.
5657
- */
5658
- cpu_load_update(this_rq, load, pending_updates);
5659
- }
5932
+ return cfs_rq_load_avg(&rq->cfs);
56605933 }
56615934
56625935 /*
5663
- * Called from nohz_idle_balance() to update the load ratings before doing the
5664
- * idle balance.
5665
- */
5666
-static void cpu_load_update_idle(struct rq *this_rq)
5667
-{
5668
- /*
5669
- * bail if there's load or we're actually up-to-date.
5670
- */
5671
- if (weighted_cpuload(this_rq))
5672
- return;
5673
-
5674
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5675
-}
5676
-
5677
-/*
5678
- * Record CPU load on nohz entry so we know the tickless load to account
5679
- * on nohz exit. cpu_load[0] then happens to be updated more frequently
5680
- * than other cpu_load[idx] but it should be fine as cpu_load readers
5681
- * shouldn't rely on synchronized cpu_load[*] updates.
5682
- */
5683
-void cpu_load_update_nohz_start(void)
5684
-{
5685
- struct rq *this_rq = this_rq();
5686
-
5687
- /*
5688
- * This is all lockless but should be fine. If weighted_cpuload changes
5689
- * concurrently we'll exit nohz. And cpu_load write can race with
5690
- * cpu_load_update_idle(), but both updaters would be writing the same value.
5691
- */
5692
- this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5693
-}
5694
-
5695
-/*
5696
- * Account the tickless load in the end of a nohz frame.
5697
- */
5698
-void cpu_load_update_nohz_stop(void)
5699
-{
5700
- unsigned long curr_jiffies = READ_ONCE(jiffies);
5701
- struct rq *this_rq = this_rq();
5702
- unsigned long load;
5703
- struct rq_flags rf;
5704
-
5705
- if (curr_jiffies == this_rq->last_load_update_tick)
5706
- return;
5707
-
5708
- load = weighted_cpuload(this_rq);
5709
- rq_lock(this_rq, &rf);
5710
- update_rq_clock(this_rq);
5711
- cpu_load_update_nohz(this_rq, curr_jiffies, load);
5712
- rq_unlock(this_rq, &rf);
5713
-}
5714
-#else /* !CONFIG_NO_HZ_COMMON */
5715
-static inline void cpu_load_update_nohz(struct rq *this_rq,
5716
- unsigned long curr_jiffies,
5717
- unsigned long load) { }
5718
-#endif /* CONFIG_NO_HZ_COMMON */
5719
-
5720
-static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5721
-{
5722
-#ifdef CONFIG_NO_HZ_COMMON
5723
- /* See the mess around cpu_load_update_nohz(). */
5724
- this_rq->last_load_update_tick = READ_ONCE(jiffies);
5725
-#endif
5726
- cpu_load_update(this_rq, load, 1);
5727
-}
5728
-
5729
-/*
5730
- * Called from scheduler_tick()
5731
- */
5732
-void cpu_load_update_active(struct rq *this_rq)
5733
-{
5734
- unsigned long load = weighted_cpuload(this_rq);
5735
-
5736
- if (tick_nohz_tick_stopped())
5737
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5738
- else
5739
- cpu_load_update_periodic(this_rq, load);
5740
-}
5741
-
5742
-/*
5743
- * Return a low guess at the load of a migration-source CPU weighted
5744
- * according to the scheduling class and "nice" value.
5936
+ * cpu_load_without - compute CPU load without any contributions from *p
5937
+ * @cpu: the CPU whose load is requested
5938
+ * @p: the task whose load should be discounted
57455939 *
5746
- * We want to under-estimate the load of migration sources, to
5747
- * balance conservatively.
5940
+ * The load of a CPU is defined by the load of tasks currently enqueued on that
5941
+ * CPU as well as tasks which are currently sleeping after an execution on that
5942
+ * CPU.
5943
+ *
5944
+ * This method returns the load of the specified CPU by discounting the load of
5945
+ * the specified task, whenever the task is currently contributing to the CPU
5946
+ * load.
57485947 */
5749
-static unsigned long source_load(int cpu, int type)
5948
+static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
57505949 {
5751
- struct rq *rq = cpu_rq(cpu);
5752
- unsigned long total = weighted_cpuload(rq);
5950
+ struct cfs_rq *cfs_rq;
5951
+ unsigned int load;
57535952
5754
- if (type == 0 || !sched_feat(LB_BIAS))
5755
- return total;
5953
+ /* Task has no contribution or is new */
5954
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5955
+ return cpu_load(rq);
57565956
5757
- return min(rq->cpu_load[type-1], total);
5957
+ cfs_rq = &rq->cfs;
5958
+ load = READ_ONCE(cfs_rq->avg.load_avg);
5959
+
5960
+ /* Discount task's load from CPU's load */
5961
+ lsub_positive(&load, task_h_load(p));
5962
+
5963
+ return load;
57585964 }
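lsub_positive() (defined elsewhere in this file) is an underflow-safe subtraction on a local variable; conceptually it behaves like the sketch below, so a task contributing more than the tracked load_avg drives the result to zero instead of wrapping the unsigned value:

        /* Conceptual equivalent of the clamped subtraction used above. */
        static inline unsigned long sub_clamped(unsigned long acc, unsigned long val)
        {
                return acc > val ? acc - val : 0;
        }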
57595965
5760
-/*
5761
- * Return a high guess at the load of a migration-target CPU weighted
5762
- * according to the scheduling class and "nice" value.
5763
- */
5764
-static unsigned long target_load(int cpu, int type)
5966
+static unsigned long cpu_runnable(struct rq *rq)
57655967 {
5766
- struct rq *rq = cpu_rq(cpu);
5767
- unsigned long total = weighted_cpuload(rq);
5968
+ return cfs_rq_runnable_avg(&rq->cfs);
5969
+}
57685970
5769
- if (type == 0 || !sched_feat(LB_BIAS))
5770
- return total;
5971
+static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
5972
+{
5973
+ struct cfs_rq *cfs_rq;
5974
+ unsigned int runnable;
57715975
5772
- return max(rq->cpu_load[type-1], total);
5976
+ /* Task has no contribution or is new */
5977
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5978
+ return cpu_runnable(rq);
5979
+
5980
+ cfs_rq = &rq->cfs;
5981
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
5982
+
5983
+ /* Discount task's runnable from CPU's runnable */
5984
+ lsub_positive(&runnable, p->se.avg.runnable_avg);
5985
+
5986
+ return runnable;
57735987 }
57745988
57755989 static unsigned long capacity_of(int cpu)
57765990 {
57775991 return cpu_rq(cpu)->cpu_capacity;
5778
-}
5779
-
5780
-static unsigned long cpu_avg_load_per_task(int cpu)
5781
-{
5782
- struct rq *rq = cpu_rq(cpu);
5783
- unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5784
- unsigned long load_avg = weighted_cpuload(rq);
5785
-
5786
- if (nr_running)
5787
- return load_avg / nr_running;
5788
-
5789
- return 0;
57905992 }
57915993
57925994 static void record_wakee(struct task_struct *p)
....@@ -5823,18 +6025,15 @@
58236025 * whatever is irrelevant, spread criteria is apparent partner count exceeds
58246026 * socket size.
58256027 */
5826
-static int wake_wide(struct task_struct *p, int sibling_count_hint)
6028
+static int wake_wide(struct task_struct *p)
58276029 {
58286030 unsigned int master = current->wakee_flips;
58296031 unsigned int slave = p->wakee_flips;
5830
- int llc_size = this_cpu_read(sd_llc_size);
5831
-
5832
- if (sibling_count_hint >= llc_size)
5833
- return 1;
6032
+ int factor = __this_cpu_read(sd_llc_size);
58346033
58356034 if (master < slave)
58366035 swap(master, slave);
5837
- if (slave < llc_size || master < slave * llc_size)
6036
+ if (slave < factor || master < slave * factor)
58386037 return 0;
58396038 return 1;
58406039 }
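A standalone restatement of the decision above, with hypothetical flip counts (factor is the LLC size read just before):

        /* Illustration only: mirrors the wake_wide() test on plain integers. */
        static int wake_wide_example(unsigned int master, unsigned int slave,
                                     unsigned int factor)
        {
                if (master < slave) {
                        unsigned int tmp = master;

                        master = slave;
                        slave = tmp;
                }
                /* Spread only when both flip counts look large for this LLC. */
                return !(slave < factor || master < slave * factor);
        }

With factor = 8, (master, slave) = (100, 10) returns 1 and the wakeup is spread, while (100, 4) returns 0 and the affine fast path is still considered.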
....@@ -5882,7 +6081,7 @@
58826081 s64 this_eff_load, prev_eff_load;
58836082 unsigned long task_load;
58846083
5885
- this_eff_load = target_load(this_cpu, sd->wake_idx);
6084
+ this_eff_load = cpu_load(cpu_rq(this_cpu));
58866085
58876086 if (sync) {
58886087 unsigned long current_load = task_h_load(current);
....@@ -5900,7 +6099,7 @@
59006099 this_eff_load *= 100;
59016100 this_eff_load *= capacity_of(prev_cpu);
59026101
5903
- prev_eff_load = source_load(prev_cpu, sd->wake_idx);
6102
+ prev_eff_load = cpu_load(cpu_rq(prev_cpu));
59046103 prev_eff_load -= task_load;
59056104 if (sched_feat(WA_BIAS))
59066105 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
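As a worked example of the WA_BIAS weighting with a hypothetical sd->imbalance_pct of 117: the waking CPU's side is scaled by 100 while prev_cpu's side is scaled by 100 + (117 - 100) / 2 = 108, so prev_cpu's effective load is inflated by roughly 8% and the comparison is biased toward pulling the task to the waking CPU.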
....@@ -5938,242 +6137,8 @@
59386137 return target;
59396138 }
59406139
5941
-#ifdef CONFIG_SCHED_TUNE
5942
-struct reciprocal_value schedtune_spc_rdiv;
5943
-
5944
-static long
5945
-schedtune_margin(unsigned long signal, long boost)
5946
-{
5947
- long long margin = 0;
5948
-
5949
- /*
5950
- * Signal proportional compensation (SPC)
5951
- *
5952
- * The Boost (B) value is used to compute a Margin (M) which is
5953
- * proportional to the complement of the original Signal (S):
5954
- * M = B * (SCHED_CAPACITY_SCALE - S)
5955
- * The obtained M could be used by the caller to "boost" S.
5956
- */
5957
- if (boost >= 0) {
5958
- margin = SCHED_CAPACITY_SCALE - signal;
5959
- margin *= boost;
5960
- } else
5961
- margin = -signal * boost;
5962
-
5963
- margin = reciprocal_divide(margin, schedtune_spc_rdiv);
5964
-
5965
- if (boost < 0)
5966
- margin *= -1;
5967
- return margin;
5968
-}
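Assuming boost is a percentage and schedtune_spc_rdiv encodes a reciprocal divide by 100, a worked case of the SPC rule above: with S = 256 and B = 50, M = 50 * (1024 - 256) / 100 = 384 and the boosted signal becomes 256 + 384 = 640; a negative boost of -50 instead yields M = -(256 * 50) / 100 = -128.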
5969
-
5970
-inline long
5971
-schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p)
5972
-{
5973
- int boost = schedtune_cpu_boost_with(cpu, p);
5974
- long margin;
5975
-
5976
- if (boost == 0)
5977
- margin = 0;
5978
- else
5979
- margin = schedtune_margin(util, boost);
5980
-
5981
- trace_sched_boost_cpu(cpu, util, margin);
5982
-
5983
- return margin;
5984
-}
5985
-
5986
-long schedtune_task_margin(struct task_struct *task)
5987
-{
5988
- int boost = schedtune_task_boost(task);
5989
- unsigned long util;
5990
- long margin;
5991
-
5992
- if (boost == 0)
5993
- return 0;
5994
-
5995
- util = task_util_est(task);
5996
- margin = schedtune_margin(util, boost);
5997
-
5998
- return margin;
5999
-}
6000
-
6001
-#else /* CONFIG_SCHED_TUNE */
6002
-
6003
-inline long
6004
-schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p)
6005
-{
6006
- return 0;
6007
-}
6008
-
6009
-#endif /* CONFIG_SCHED_TUNE */
6010
-
6011
-static unsigned long cpu_util_without(int cpu, struct task_struct *p);
6012
-
6013
-static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
6014
-{
6015
- return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
6016
-}
6017
-
6018
-/*
6019
- * find_idlest_group finds and returns the least busy CPU group within the
6020
- * domain.
6021
- *
6022
- * Assumes p is allowed on at least one CPU in sd.
6023
- */
60246140 static struct sched_group *
6025
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
6026
- int this_cpu, int sd_flag)
6027
-{
6028
- struct sched_group *idlest = NULL, *group = sd->groups;
6029
- struct sched_group *most_spare_sg = NULL;
6030
- unsigned long min_runnable_load = ULONG_MAX;
6031
- unsigned long this_runnable_load = ULONG_MAX;
6032
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
6033
- unsigned long most_spare = 0, this_spare = 0;
6034
- int load_idx = sd->forkexec_idx;
6035
- int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
6036
- unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
6037
- (sd->imbalance_pct-100) / 100;
6038
-
6039
- if (sd_flag & SD_BALANCE_WAKE)
6040
- load_idx = sd->wake_idx;
6041
-
6042
- do {
6043
- unsigned long load, avg_load, runnable_load;
6044
- unsigned long spare_cap, max_spare_cap;
6045
- int local_group;
6046
- int i;
6047
-
6048
- /* Skip over this group if it has no CPUs allowed */
6049
- if (!cpumask_intersects(sched_group_span(group),
6050
- p->cpus_ptr))
6051
- continue;
6052
-
6053
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
6054
- if (sysctl_sched_performance_bias) {
6055
- if (!task_fits_max(p, group_first_cpu(group)))
6056
- continue;
6057
- }
6058
-#endif
6059
-
6060
- local_group = cpumask_test_cpu(this_cpu,
6061
- sched_group_span(group));
6062
-
6063
- /*
6064
- * Tally up the load of all CPUs in the group and find
6065
- * the group containing the CPU with most spare capacity.
6066
- */
6067
- avg_load = 0;
6068
- runnable_load = 0;
6069
- max_spare_cap = 0;
6070
-
6071
- for_each_cpu(i, sched_group_span(group)) {
6072
- /* Bias balancing toward CPUs of our domain */
6073
- if (local_group)
6074
- load = source_load(i, load_idx);
6075
- else
6076
- load = target_load(i, load_idx);
6077
-
6078
- runnable_load += load;
6079
-
6080
- avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
6081
-
6082
- spare_cap = capacity_spare_without(i, p);
6083
-
6084
- if (spare_cap > max_spare_cap)
6085
- max_spare_cap = spare_cap;
6086
- }
6087
-
6088
- /* Adjust by relative CPU capacity of the group */
6089
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
6090
- group->sgc->capacity;
6091
- runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
6092
- group->sgc->capacity;
6093
-
6094
- if (local_group) {
6095
- this_runnable_load = runnable_load;
6096
- this_avg_load = avg_load;
6097
- this_spare = max_spare_cap;
6098
- } else {
6099
- if (min_runnable_load > (runnable_load + imbalance)) {
6100
- /*
6101
- * The runnable load is significantly smaller
6102
- * so we can pick this new CPU:
6103
- */
6104
- min_runnable_load = runnable_load;
6105
- min_avg_load = avg_load;
6106
- idlest = group;
6107
- } else if ((runnable_load < (min_runnable_load + imbalance)) &&
6108
- (100*min_avg_load > imbalance_scale*avg_load)) {
6109
- /*
6110
- * The runnable loads are close so take the
6111
- * blocked load into account through avg_load:
6112
- */
6113
- min_avg_load = avg_load;
6114
- idlest = group;
6115
- }
6116
-
6117
- if (most_spare < max_spare_cap) {
6118
- most_spare = max_spare_cap;
6119
- most_spare_sg = group;
6120
- }
6121
- }
6122
- } while (group = group->next, group != sd->groups);
6123
-
6124
- /*
6125
- * The cross-over point between using spare capacity or least load
6126
- * is too conservative for high utilization tasks on partially
6127
- * utilized systems if we require spare_capacity > task_util(p),
6128
- * so we allow for some task stuffing by using
6129
- * spare_capacity > task_util(p)/2.
6130
- *
6131
- * Spare capacity can't be used for fork because the utilization has
6132
- * not been set yet, we must first select a rq to compute the initial
6133
- * utilization.
6134
- */
6135
- if (sd_flag & SD_BALANCE_FORK)
6136
- goto skip_spare;
6137
-
6138
- if (this_spare > task_util(p) / 2 &&
6139
- imbalance_scale*this_spare > 100*most_spare)
6140
- return NULL;
6141
-
6142
- if (most_spare > task_util(p) / 2)
6143
- return most_spare_sg;
6144
-
6145
-skip_spare:
6146
- if (!idlest)
6147
- return NULL;
6148
-
6149
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
6150
- if (sysctl_sched_performance_bias) {
6151
- if ((this_runnable_load == ULONG_MAX) || (this_avg_load == ULONG_MAX))
6152
- return idlest;
6153
- }
6154
-#endif
6155
-
6156
- /*
6157
- * When comparing groups across NUMA domains, it's possible for the
6158
- * local domain to be very lightly loaded relative to the remote
6159
- * domains but "imbalance" skews the comparison making remote CPUs
6160
- * look much more favourable. When considering cross-domain, add
6161
- * imbalance to the runnable load on the remote node and consider
6162
- * staying local.
6163
- */
6164
- if ((sd->flags & SD_NUMA) &&
6165
- min_runnable_load + imbalance >= this_runnable_load)
6166
- return NULL;
6167
-
6168
- if (min_runnable_load > (this_runnable_load + imbalance))
6169
- return NULL;
6170
-
6171
- if ((this_runnable_load < (min_runnable_load + imbalance)) &&
6172
- (100*this_avg_load < imbalance_scale*min_avg_load))
6173
- return NULL;
6174
-
6175
- return idlest;
6176
-}
6141
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
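To make the spare-capacity cross-over in the removed code concrete (hypothetical numbers): for a waking task with task_util(p) = 400, a group could win on spare capacity alone once most_spare > 400 / 2 = 200, and the local group kept the task when this_spare > 200 and imbalance_scale * this_spare > 100 * most_spare; forks skipped this path entirely and always fell back to the load comparison because a new task has no utilization yet.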
61776142
61786143 /*
61796144 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
....@@ -6194,6 +6159,9 @@
61946159
61956160 /* Traverse only the allowed CPUs */
61966161 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
6162
+ if (sched_idle_cpu(i))
6163
+ return i;
6164
+
61976165 if (available_idle_cpu(i)) {
61986166 struct rq *rq = cpu_rq(i);
61996167 struct cpuidle_state *idle = idle_get_state(rq);
....@@ -6217,7 +6185,7 @@
62176185 shallowest_idle_cpu = i;
62186186 }
62196187 } else if (shallowest_idle_cpu == -1) {
6220
- load = weighted_cpuload(cpu_rq(i));
6188
+ load = cpu_load(cpu_rq(i));
62216189 if (load < min_load) {
62226190 min_load = load;
62236191 least_loaded_cpu = i;
....@@ -6237,7 +6205,7 @@
62376205 return prev_cpu;
62386206
62396207 /*
6240
- * We need task's util for capacity_spare_without, sync it up to
6208
+ * We need task's util for cpu_util_without, sync it up to
62416209 * prev_cpu's last_update_time.
62426210 */
62436211 if (!(sd_flag & SD_BALANCE_FORK))
....@@ -6253,7 +6221,7 @@
62536221 continue;
62546222 }
62556223
6256
- group = find_idlest_group(sd, p, cpu, sd_flag);
6224
+ group = find_idlest_group(sd, p, cpu);
62576225 if (!group) {
62586226 sd = sd->child;
62596227 continue;
....@@ -6356,10 +6324,12 @@
63566324 bool idle = true;
63576325
63586326 for_each_cpu(cpu, cpu_smt_mask(core)) {
6359
- cpumask_clear_cpu(cpu, cpus);
6360
- if (!available_idle_cpu(cpu))
6327
+ if (!available_idle_cpu(cpu)) {
63616328 idle = false;
6329
+ break;
6330
+ }
63626331 }
6332
+ cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
63636333
63646334 if (idle)
63656335 return core;
....@@ -6384,9 +6354,10 @@
63846354 return -1;
63856355
63866356 for_each_cpu(cpu, cpu_smt_mask(target)) {
6387
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6357
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
6358
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
63886359 continue;
6389
- if (available_idle_cpu(cpu))
6360
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
63906361 return cpu;
63916362 }
63926363
....@@ -6417,8 +6388,8 @@
64176388 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
64186389 struct sched_domain *this_sd;
64196390 u64 avg_cost, avg_idle;
6420
- u64 time, cost;
6421
- s64 delta;
6391
+ u64 time;
6392
+ int this = smp_processor_id();
64226393 int cpu, nr = INT_MAX;
64236394
64246395 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
....@@ -6443,23 +6414,68 @@
64436414 nr = 4;
64446415 }
64456416
6446
- time = local_clock();
6417
+ time = cpu_clock(this);
64476418
64486419 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
64496420
64506421 for_each_cpu_wrap(cpu, cpus, target) {
64516422 if (!--nr)
64526423 return -1;
6453
- if (available_idle_cpu(cpu))
6424
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
64546425 break;
64556426 }
64566427
6457
- time = local_clock() - time;
6458
- cost = this_sd->avg_scan_cost;
6459
- delta = (s64)(time - cost) / 8;
6460
- this_sd->avg_scan_cost += delta;
6428
+ time = cpu_clock(this) - time;
6429
+ update_avg(&this_sd->avg_scan_cost, time);
64616430
64626431 return cpu;
6432
+}
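update_avg() keeps the same 1/8-weight running average that the removed lines computed by hand; assuming its usual definition elsewhere in the scheduler code, it amounts to:

        /* Running average with a 1/8 weight for the newest scan-cost sample. */
        static inline void scan_cost_avg_example(u64 *avg, u64 sample)
        {
                s64 diff = (s64)(sample - *avg);

                *avg += diff / 8;
        }

Starting from avg = 800ns, a 1600ns scan moves the average to 800 + (1600 - 800) / 8 = 900ns.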
6433
+
6434
+/*
6435
+ * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
6436
+ * the task fits. If no CPU is big enough, but there are idle ones, try to
6437
+ * maximize capacity.
6438
+ */
6439
+static int
6440
+select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
6441
+{
6442
+ unsigned long task_util, util_min, util_max, best_cap = 0;
6443
+ int cpu, best_cpu = -1;
6444
+ struct cpumask *cpus;
6445
+
6446
+ cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6447
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6448
+
6449
+ task_util = task_util_est(p);
6450
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
6451
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
6452
+
6453
+ for_each_cpu_wrap(cpu, cpus, target) {
6454
+ unsigned long cpu_cap = capacity_of(cpu);
6455
+
6456
+ if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
6457
+ continue;
6458
+ if (util_fits_cpu(task_util, util_min, util_max, cpu))
6459
+ return cpu;
6460
+
6461
+ if (cpu_cap > best_cap) {
6462
+ best_cap = cpu_cap;
6463
+ best_cpu = cpu;
6464
+ }
6465
+ }
6466
+
6467
+ return best_cpu;
6468
+}
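For example, with hypothetical capacities of 430 for the LITTLE CPUs and 1024 for the big CPUs: a task whose clamped utilization needs around 600 skips every idle LITTLE because util_fits_cpu() fails there, returns the first idle big CPU that fits, and only when nothing fits falls back to best_cpu, the idle CPU with the largest capacity seen during the scan.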
6469
+
6470
+static inline bool asym_fits_cpu(unsigned long util,
6471
+ unsigned long util_min,
6472
+ unsigned long util_max,
6473
+ int cpu)
6474
+{
6475
+ if (static_branch_unlikely(&sched_asym_cpucapacity))
6476
+ return util_fits_cpu(util, util_min, util_max, cpu);
6477
+
6478
+ return true;
64636479 }
64646480
64656481 /*
....@@ -6468,24 +6484,56 @@
64686484 static int select_idle_sibling(struct task_struct *p, int prev, int target)
64696485 {
64706486 struct sched_domain *sd;
6487
+ unsigned long task_util, util_min, util_max;
64716488 int i, recent_used_cpu;
64726489
6473
- if (available_idle_cpu(target))
6490
+ /*
6491
+ * On asymmetric system, update task utilization because we will check
6492
+ * that the task fits with cpu's capacity.
6493
+ */
6494
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6495
+ sync_entity_load_avg(&p->se);
6496
+ task_util = task_util_est(p);
6497
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
6498
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
6499
+ }
6500
+
6501
+ if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
6502
+ asym_fits_cpu(task_util, util_min, util_max, target))
64746503 return target;
64756504
64766505 /*
64776506 * If the previous CPU is cache affine and idle, don't be stupid:
64786507 */
6479
- if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
6508
+ if (prev != target && cpus_share_cache(prev, target) &&
6509
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
6510
+ asym_fits_cpu(task_util, util_min, util_max, prev))
64806511 return prev;
6512
+
6513
+ /*
6514
+ * Allow a per-cpu kthread to stack with the wakee if the
6515
+ * kworker thread and the task's previous CPU are the same.
6516
+ * The assumption is that the wakee queued work for the
6517
+ * per-cpu kthread that is now complete and the wakeup is
6518
+ * essentially a sync wakeup. An obvious example of this
6519
+ * pattern is IO completions.
6520
+ */
6521
+ if (is_per_cpu_kthread(current) &&
6522
+ in_task() &&
6523
+ prev == smp_processor_id() &&
6524
+ this_rq()->nr_running <= 1 &&
6525
+ asym_fits_cpu(task_util, util_min, util_max, prev)) {
6526
+ return prev;
6527
+ }
64816528
64826529 /* Check a recently used CPU as a potential idle candidate: */
64836530 recent_used_cpu = p->recent_used_cpu;
64846531 if (recent_used_cpu != prev &&
64856532 recent_used_cpu != target &&
64866533 cpus_share_cache(recent_used_cpu, target) &&
6487
- available_idle_cpu(recent_used_cpu) &&
6488
- cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
6534
+ (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
6535
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
6536
+ asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
64896537 /*
64906538 * Replace recent_used_cpu with prev as it is a potential
64916539 * candidate for the next wake:
....@@ -6494,6 +6542,32 @@
64946542 return recent_used_cpu;
64956543 }
64966544
6545
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6546
+ if (rockchip_perf_get_level() == ROCKCHIP_PERFORMANCE_HIGH)
6547
+ goto sd_llc;
6548
+ }
6549
+
6550
+ /*
6551
+ * For asymmetric CPU capacity systems, our domain of interest is
6552
+ * sd_asym_cpucapacity rather than sd_llc.
6553
+ */
6554
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6555
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
6556
+ /*
6557
+ * On an asymmetric CPU capacity system where an exclusive
6558
+ * cpuset defines a symmetric island (i.e. one unique
6559
+ * capacity_orig value through the cpuset), the key will be set
6560
+ * but the CPUs within that cpuset will not have a domain with
6561
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
6562
+ * capacity path.
6563
+ */
6564
+ if (sd) {
6565
+ i = select_idle_capacity(p, sd, target);
6566
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
6567
+ }
6568
+ }
6569
+
6570
+sd_llc:
64976571 sd = rcu_dereference(per_cpu(sd_llc, target));
64986572 if (!sd)
64996573 return target;
....@@ -6591,7 +6665,7 @@
65916665 util = READ_ONCE(cfs_rq->avg.util_avg);
65926666
65936667 /* Discount task's util from CPU's util */
6594
- util -= min_t(unsigned int, util, task_util(p));
6668
+ lsub_positive(&util, task_util(p));
65956669
65966670 /*
65976671 * Covered cases:
....@@ -6640,10 +6714,9 @@
66406714 * properly fix the execl regression and it helps in further
66416715 * reducing the chances for the above race.
66426716 */
6643
- if (unlikely(task_on_rq_queued(p) || current == p)) {
6644
- estimated -= min_t(unsigned int, estimated,
6645
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
6646
- }
6717
+ if (unlikely(task_on_rq_queued(p) || current == p))
6718
+ lsub_positive(&estimated, _task_util_est(p));
6719
+
66476720 util = max(util, estimated);
66486721 }
66496722
....@@ -6653,350 +6726,6 @@
66536726 * the cpu_util call.
66546727 */
66556728 return min_t(unsigned long, util, capacity_orig_of(cpu));
6656
-}
6657
-
6658
-/*
6659
- * Returns the current capacity of cpu after applying both
6660
- * cpu and freq scaling.
6661
- */
6662
-unsigned long capacity_curr_of(int cpu)
6663
-{
6664
- unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig;
6665
- unsigned long scale_freq = arch_scale_freq_capacity(cpu);
6666
-
6667
- return cap_scale(max_cap, scale_freq);
6668
-}
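cap_scale() is the usual (value * scale) >> SCHED_CAPACITY_SHIFT scaling, so with a hypothetical cpu_capacity_orig of 1024 and the CPU running at 60% of its maximum frequency (arch_scale_freq_capacity() of about 614), the removed helper reported a current capacity of roughly (1024 * 614) >> 10 = 614.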
6669
-
6670
-static void find_best_target(struct sched_domain *sd, cpumask_t *cpus,
6671
- struct task_struct *p)
6672
-{
6673
- unsigned long min_util = uclamp_task(p);
6674
- unsigned long target_capacity = ULONG_MAX;
6675
- unsigned long min_wake_util = ULONG_MAX;
6676
- unsigned long target_max_spare_cap = 0;
6677
- unsigned long target_util = ULONG_MAX;
6678
- /* Initialise with deepest possible cstate (INT_MAX) */
6679
- int shallowest_idle_cstate = INT_MAX;
6680
- struct sched_group *sg;
6681
- int best_active_cpu = -1;
6682
- int best_idle_cpu = -1;
6683
- int target_cpu = -1;
6684
- int backup_cpu = -1;
6685
- bool prefer_idle;
6686
- bool boosted;
6687
- int i;
6688
-
6689
- /*
6690
- * In most cases, target_capacity tracks capacity_orig of the most
6691
- * energy efficient CPU candidate, thus requiring to minimise
6692
- * target_capacity. For these cases target_capacity is already
6693
- * initialized to ULONG_MAX.
6694
- * However, for prefer_idle and boosted tasks we look for a high
6695
- * performance CPU, thus requiring to maximise target_capacity. In this
6696
- * case we initialise target_capacity to 0.
6697
- */
6698
- prefer_idle = uclamp_latency_sensitive(p);
6699
- boosted = uclamp_boosted(p);
6700
- if (prefer_idle && boosted)
6701
- target_capacity = 0;
6702
-
6703
- /* Scan CPUs in all SDs */
6704
- sg = sd->groups;
6705
- do {
6706
- for_each_cpu_and(i, p->cpus_ptr, sched_group_span(sg)) {
6707
- unsigned long capacity_curr = capacity_curr_of(i);
6708
- unsigned long capacity_orig = capacity_orig_of(i);
6709
- unsigned long wake_util, new_util;
6710
- long spare_cap;
6711
- int idle_idx = INT_MAX;
6712
-
6713
- if (!cpu_online(i))
6714
- continue;
6715
-
6716
- /*
6717
- * p's blocked utilization is still accounted for on prev_cpu
6718
- * so prev_cpu will receive a negative bias due to the double
6719
- * accounting. However, the blocked utilization may be zero.
6720
- */
6721
- wake_util = cpu_util_without(i, p);
6722
- new_util = wake_util + task_util_est(p);
6723
-
6724
- /*
6725
- * Ensure minimum capacity to grant the required boost.
6726
- * The target CPU can be already at a capacity level higher
6727
- * than the one required to boost the task.
6728
- */
6729
- new_util = max(min_util, new_util);
6730
- if (new_util > capacity_orig)
6731
- continue;
6732
-
6733
- /*
6734
- * Pre-compute the maximum possible capacity we expect
6735
- * to have available on this CPU once the task is
6736
- * enqueued here.
6737
- */
6738
- spare_cap = capacity_orig - new_util;
6739
-
6740
- if (idle_cpu(i))
6741
- idle_idx = idle_get_state_idx(cpu_rq(i));
6742
-
6743
-
6744
- /*
6745
- * Case A) Latency sensitive tasks
6746
- *
6747
- * Unconditionally favoring tasks that prefer idle CPU to
6748
- * improve latency.
6749
- *
6750
- * Looking for:
6751
- * - an idle CPU, whatever its idle_state is, since
6752
- * the first CPUs we explore are more likely to be
6753
- * reserved for latency sensitive tasks.
6754
- * - a non idle CPU where the task fits in its current
6755
- * capacity and has the maximum spare capacity.
6756
- * - a non idle CPU with lower contention from other
6757
- * tasks and running at the lowest possible OPP.
6758
- *
6759
- * The last two goals try to favor a non idle CPU
6760
- * where the task can run as if it is "almost alone".
6761
- * A maximum spare capacity CPU is favoured since
6762
- * the task already fits into that CPU's capacity
6763
- * without waiting for an OPP chance.
6764
- *
6765
- * The following code path is the only one in the CPUs
6766
- * exploration loop which is always used by
6767
- * prefer_idle tasks. It exits the loop with either a
6768
- * best_active_cpu or a target_cpu which should
6769
- * represent an optimal choice for latency sensitive
6770
- * tasks.
6771
- */
6772
- if (prefer_idle) {
6773
-
6774
- /*
6775
- * Case A.1: IDLE CPU
6776
- * Return the best IDLE CPU we find:
6777
- * - for boosted tasks: the CPU with the highest
6778
- * performance (i.e. biggest capacity_orig)
6779
- * - for !boosted tasks: the most energy
6780
- * efficient CPU (i.e. smallest capacity_orig)
6781
- */
6782
- if (idle_cpu(i)) {
6783
- if (boosted &&
6784
- capacity_orig < target_capacity)
6785
- continue;
6786
- if (!boosted &&
6787
- capacity_orig > target_capacity)
6788
- continue;
6789
- /*
6790
- * Minimise value of idle state: skip
6791
- * deeper idle states and pick the
6792
- * shallowest.
6793
- */
6794
- if (capacity_orig == target_capacity &&
6795
- sysctl_sched_cstate_aware &&
6796
- idle_idx >= shallowest_idle_cstate)
6797
- continue;
6798
-
6799
- target_capacity = capacity_orig;
6800
- shallowest_idle_cstate = idle_idx;
6801
- best_idle_cpu = i;
6802
- continue;
6803
- }
6804
- if (best_idle_cpu != -1)
6805
- continue;
6806
-
6807
- /*
6808
- * Case A.2: Target ACTIVE CPU
6809
- * Favor CPUs with max spare capacity.
6810
- */
6811
- if (capacity_curr > new_util &&
6812
- spare_cap > target_max_spare_cap) {
6813
- target_max_spare_cap = spare_cap;
6814
- target_cpu = i;
6815
- continue;
6816
- }
6817
- if (target_cpu != -1)
6818
- continue;
6819
-
6820
-
6821
- /*
6822
- * Case A.3: Backup ACTIVE CPU
6823
- * Favor CPUs with:
6824
- * - lower utilization due to other tasks
6825
- * - lower utilization with the task in
6826
- */
6827
- if (wake_util > min_wake_util)
6828
- continue;
6829
- min_wake_util = wake_util;
6830
- best_active_cpu = i;
6831
- continue;
6832
- }
6833
-
6834
- /*
6835
- * Enforce EAS mode
6836
- *
6837
- * For non latency sensitive tasks, skip CPUs that
6838
- * will be overutilized by moving the task there.
6839
- *
6840
- * The goal here is to remain in EAS mode as long as
6841
- * possible at least for !prefer_idle tasks.
6842
- */
6843
- if ((new_util * capacity_margin) >
6844
- (capacity_orig * SCHED_CAPACITY_SCALE))
6845
- continue;
6846
-
6847
- /*
6848
- * Favor CPUs with smaller capacity for non latency
6849
- * sensitive tasks.
6850
- */
6851
- if (capacity_orig > target_capacity)
6852
- continue;
6853
-
6854
- /*
6855
- * Case B) Non latency sensitive tasks on IDLE CPUs.
6856
- *
6857
- * Find an optimal backup IDLE CPU for non latency
6858
- * sensitive tasks.
6859
- *
6860
- * Looking for:
6861
- * - minimizing the capacity_orig,
6862
- * i.e. preferring LITTLE CPUs
6863
- * - favoring shallowest idle states
6864
- * i.e. avoid to wakeup deep-idle CPUs
6865
- *
6866
- * The following code path is used by non latency
6867
- * sensitive tasks if IDLE CPUs are available. If at
6868
- * least one of such CPUs are available it sets the
6869
- * best_idle_cpu to the most suitable idle CPU to be
6870
- * selected.
6871
- *
6872
- * If idle CPUs are available, favour these CPUs to
6873
- * improve performances by spreading tasks.
6874
- * Indeed, the energy_diff() computed by the caller
6875
- * will take care to ensure the minimization of energy
6876
- * consumptions without affecting performance.
6877
- */
6878
- if (idle_cpu(i)) {
6879
- /*
6880
- * Skip CPUs in deeper idle state, but only
6881
- * if they are also less energy efficient.
6882
- * IOW, prefer a deep IDLE LITTLE CPU vs a
6883
- * shallow idle big CPU.
6884
- */
6885
- if (capacity_orig == target_capacity &&
6886
- sysctl_sched_cstate_aware &&
6887
- idle_idx >= shallowest_idle_cstate)
6888
- continue;
6889
-
6890
- target_capacity = capacity_orig;
6891
- shallowest_idle_cstate = idle_idx;
6892
- best_idle_cpu = i;
6893
- continue;
6894
- }
6895
-
6896
- /*
6897
- * Case C) Non latency sensitive tasks on ACTIVE CPUs.
6898
- *
6899
- * Pack tasks in the most energy efficient capacities.
6900
- *
6901
- * This task packing strategy prefers more energy
6902
- * efficient CPUs (i.e. pack on smaller maximum
6903
- * capacity CPUs) while also trying to spread tasks to
6904
- * run them all at the lower OPP.
6905
- *
6906
- * This assumes for example that it's more energy
6907
- * efficient to run two tasks on two CPUs at a lower
6908
- * OPP than packing both on a single CPU but running
6909
- * that CPU at an higher OPP.
6910
- *
6911
- * Thus, this case keep track of the CPU with the
6912
- * smallest maximum capacity and highest spare maximum
6913
- * capacity.
6914
- */
6915
-
6916
- /* Favor CPUs with maximum spare capacity */
6917
- if (capacity_orig == target_capacity &&
6918
- spare_cap < target_max_spare_cap)
6919
- continue;
6920
-
6921
- target_max_spare_cap = spare_cap;
6922
- target_capacity = capacity_orig;
6923
- target_util = new_util;
6924
- target_cpu = i;
6925
- }
6926
-
6927
- } while (sg = sg->next, sg != sd->groups);
6928
-
6929
- /*
6930
- * For non latency sensitive tasks, cases B and C in the previous loop,
6931
- * we pick the best IDLE CPU only if we were not able to find a target
6932
- * ACTIVE CPU.
6933
- *
6934
- * Policies priorities:
6935
- *
6936
- * - prefer_idle tasks:
6937
- *
6938
- * a) IDLE CPU available: best_idle_cpu
6939
- * b) ACTIVE CPU where task fits and has the bigger maximum spare
6940
- * capacity (i.e. target_cpu)
6941
- * c) ACTIVE CPU with less contention due to other tasks
6942
- * (i.e. best_active_cpu)
6943
- *
6944
- * - NON prefer_idle tasks:
6945
- *
6946
- * a) ACTIVE CPU: target_cpu
6947
- * b) IDLE CPU: best_idle_cpu
6948
- */
6949
-
6950
- if (prefer_idle && (best_idle_cpu != -1)) {
6951
- target_cpu = best_idle_cpu;
6952
- goto target;
6953
- }
6954
-
6955
- if (target_cpu == -1)
6956
- target_cpu = prefer_idle
6957
- ? best_active_cpu
6958
- : best_idle_cpu;
6959
- else
6960
- backup_cpu = prefer_idle
6961
- ? best_active_cpu
6962
- : best_idle_cpu;
6963
-
6964
- if (backup_cpu >= 0)
6965
- cpumask_set_cpu(backup_cpu, cpus);
6966
- if (target_cpu >= 0) {
6967
-target:
6968
- cpumask_set_cpu(target_cpu, cpus);
6969
- }
6970
-
6971
- trace_sched_find_best_target(p, prefer_idle, min_util, best_idle_cpu,
6972
- best_active_cpu, target_cpu, backup_cpu);
6973
-}
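One way to read the "Enforce EAS mode" filter in the removed loop: with capacity_margin = 1280, the test new_util * 1280 > capacity_orig * 1024 rejects a CPU once the task would push its utilization above capacity_orig * 1024 / 1280, i.e. above 80% of the CPU's original capacity (819 out of 1024 on the largest CPUs).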
6974
-
6975
-/*
6976
- * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
6977
- * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
6978
- *
6979
- * In that case WAKE_AFFINE doesn't make sense and we'll let
6980
- * BALANCE_WAKE sort things out.
6981
- */
6982
-static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6983
-{
6984
- long min_cap, max_cap;
6985
-
6986
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
6987
- return 0;
6988
-
6989
- min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6990
- max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
6991
-
6992
- /* Minimum capacity is close to max, no need to abort wake_affine */
6993
- if (max_cap - min_cap < max_cap >> 3)
6994
- return 0;
6995
-
6996
- /* Bring task utilization in sync with prev_cpu */
6997
- sync_entity_load_avg(&p->se);
6998
-
6999
- return !task_fits_capacity(p, min_cap);
70006729 }
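A worked case for the removed capacity check, with hypothetical capacities: for max_cap = 1024 the margin max_cap >> 3 is 128, so a 1024/930 pairing (difference 94 < 128) is treated as close enough to keep WAKE_AFFINE, while a 1024/430 pairing (difference 594) goes on to the task_fits_capacity() test.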
70016730
70026731 /*
....@@ -7038,154 +6767,61 @@
70386767 }
70396768
70406769 /*
7041
- * compute_energy(): Estimates the energy that would be consumed if @p was
6770
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
70426771 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
7043
- * landscape of the * CPUs after the task migration, and uses the Energy Model
6772
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
70446773 * to compute what would be the energy if we decided to actually migrate that
70456774 * task.
70466775 */
70476776 static long
70486777 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
70496778 {
7050
- unsigned int max_util, util_cfs, cpu_util, cpu_cap;
7051
- unsigned long sum_util, energy = 0;
7052
- struct task_struct *tsk;
6779
+ struct cpumask *pd_mask = perf_domain_span(pd);
6780
+ unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6781
+ unsigned long max_util = 0, sum_util = 0;
6782
+ unsigned long energy = 0;
70536783 int cpu;
70546784
7055
- for (; pd; pd = pd->next) {
7056
- struct cpumask *pd_mask = perf_domain_span(pd);
6785
+ /*
6786
+ * The capacity state of CPUs of the current rd can be driven by CPUs
6787
+ * of another rd if they belong to the same pd. So, account for the
6788
+ * utilization of these CPUs too by masking pd with cpu_online_mask
6789
+ * instead of the rd span.
6790
+ *
6791
+ * If an entire pd is outside of the current rd, it will not appear in
6792
+ * its pd list and will not be accounted by compute_energy().
6793
+ */
6794
+ for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6795
+ unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
6796
+ struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
70576797
70586798 /*
7059
- * The energy model mandates all the CPUs of a performance
7060
- * domain have the same capacity.
6799
+ * Busy time computation: utilization clamping is not
6800
+ * required since the ratio (sum_util / cpu_capacity)
6801
+ * is already enough to scale the EM reported power
6802
+ * consumption at the (eventually clamped) cpu_capacity.
70616803 */
7062
- cpu_cap = arch_scale_cpu_capacity(NULL, cpumask_first(pd_mask));
7063
- max_util = sum_util = 0;
6804
+ sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6805
+ ENERGY_UTIL, NULL);
70646806
70656807 /*
7066
- * The capacity state of CPUs of the current rd can be driven by
7067
- * CPUs of another rd if they belong to the same performance
7068
- * domain. So, account for the utilization of these CPUs too
7069
- * by masking pd with cpu_online_mask instead of the rd span.
7070
- *
7071
- * If an entire performance domain is outside of the current rd,
7072
- * it will not appear in its pd list and will not be accounted
7073
- * by compute_energy().
6808
+ * Performance domain frequency: utilization clamping
6809
+ * must be considered since it affects the selection
6810
+ * of the performance domain frequency.
6811
+ * NOTE: in case RT tasks are running, by default the
6812
+ * FREQUENCY_UTIL's utilization can be max OPP.
70746813 */
7075
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
7076
- util_cfs = cpu_util_next(cpu, p, dst_cpu);
7077
-
7078
- /*
7079
- * Busy time computation: utilization clamping is not
7080
- * required since the ratio (sum_util / cpu_capacity)
7081
- * is already enough to scale the EM reported power
7082
- * consumption at the (eventually clamped) cpu_capacity.
7083
- */
7084
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
7085
- ENERGY_UTIL, NULL);
7086
-
7087
- /*
7088
- * Performance domain frequency: utilization clamping
7089
- * must be considered since it affects the selection
7090
- * of the performance domain frequency.
7091
- * NOTE: in case RT tasks are running, by default the
7092
- * FREQUENCY_UTIL's utilization can be max OPP.
7093
- */
7094
- tsk = cpu == dst_cpu ? p : NULL;
7095
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
7096
- FREQUENCY_UTIL, tsk);
7097
- max_util = max(max_util, cpu_util);
7098
- }
7099
-
7100
- energy += em_pd_energy(pd->em_pd, max_util, sum_util);
6814
+ cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6815
+ FREQUENCY_UTIL, tsk);
6816
+ max_util = max(max_util, cpu_util);
71016817 }
6818
+
6819
+ trace_android_vh_em_cpu_energy(pd->em_pd, max_util, sum_util, &energy);
6820
+ if (!energy)
6821
+ energy = em_cpu_energy(pd->em_pd, max_util, sum_util);
71026822
71036823 return energy;
71046824 }
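The energy figure returned here comes from the Energy Model: choose the lowest performance state of the domain able to serve max_util, then charge the domain's busy time sum_util at that state's cost. A simplified sketch of that model (the structure and helper below are illustrative, not the real em_cpu_energy() implementation):

        struct ps_example {
                unsigned long capacity; /* capacity delivered at this state */
                unsigned long cost;     /* power * max_freq / freq          */
        };

        /* States are assumed sorted by increasing capacity. */
        static unsigned long pd_energy_example(const struct ps_example *ps, int nr,
                                               unsigned long max_util,
                                               unsigned long sum_util,
                                               unsigned long scale_cpu)
        {
                int i;

                for (i = 0; i < nr - 1; i++) {
                        if (ps[i].capacity >= max_util)
                                break;
                }
                return ps[i].cost * sum_util / scale_cpu;
        }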
7105
-
7106
-static void select_cpu_candidates(struct sched_domain *sd, cpumask_t *cpus,
7107
- struct perf_domain *pd, struct task_struct *p, int prev_cpu)
7108
-{
7109
- int highest_spare_cap_cpu = prev_cpu, best_idle_cpu = -1;
7110
- unsigned long spare_cap, max_spare_cap, util, cpu_cap;
7111
- bool prefer_idle = uclamp_latency_sensitive(p);
7112
- bool boosted = uclamp_boosted(p);
7113
- unsigned long target_cap = boosted ? 0 : ULONG_MAX;
7114
- unsigned long highest_spare_cap = 0;
7115
- unsigned int min_exit_lat = UINT_MAX;
7116
- int cpu, max_spare_cap_cpu;
7117
- struct cpuidle_state *idle;
7118
-
7119
- for (; pd; pd = pd->next) {
7120
- max_spare_cap_cpu = -1;
7121
- max_spare_cap = 0;
7122
-
7123
- for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
7124
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
7125
- continue;
7126
-
7127
- util = cpu_util_next(cpu, p, cpu);
7128
- cpu_cap = capacity_of(cpu);
7129
- spare_cap = cpu_cap - util;
7130
-
7131
- /*
7132
- * Skip CPUs that cannot satisfy the capacity request.
7133
- * IOW, placing the task there would make the CPU
7134
- * overutilized. Take uclamp into account to see how
7135
- * much capacity we can get out of the CPU; this is
7136
- * aligned with schedutil_cpu_util().
7137
- */
7138
- util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
7139
- if (cpu_cap * 1024 < util * capacity_margin)
7140
- continue;
7141
-
7142
- /*
7143
- * Find the CPU with the maximum spare capacity in
7144
- * the performance domain
7145
- */
7146
- if (spare_cap > max_spare_cap) {
7147
- max_spare_cap = spare_cap;
7148
- max_spare_cap_cpu = cpu;
7149
- }
7150
-
7151
- if (!prefer_idle)
7152
- continue;
7153
-
7154
- if (idle_cpu(cpu)) {
7155
- cpu_cap = capacity_orig_of(cpu);
7156
- if (boosted && cpu_cap < target_cap)
7157
- continue;
7158
- if (!boosted && cpu_cap > target_cap)
7159
- continue;
7160
- idle = idle_get_state(cpu_rq(cpu));
7161
- if (idle && idle->exit_latency > min_exit_lat &&
7162
- cpu_cap == target_cap)
7163
- continue;
7164
-
7165
- if (idle)
7166
- min_exit_lat = idle->exit_latency;
7167
- target_cap = cpu_cap;
7168
- best_idle_cpu = cpu;
7169
- } else if (spare_cap > highest_spare_cap) {
7170
- highest_spare_cap = spare_cap;
7171
- highest_spare_cap_cpu = cpu;
7172
- }
7173
- }
7174
-
7175
- if (!prefer_idle && max_spare_cap_cpu >= 0)
7176
- cpumask_set_cpu(max_spare_cap_cpu, cpus);
7177
- }
7178
-
7179
- if (!prefer_idle)
7180
- return;
7181
-
7182
- if (best_idle_cpu >= 0)
7183
- cpumask_set_cpu(best_idle_cpu, cpus);
7184
- else
7185
- cpumask_set_cpu(highest_spare_cap_cpu, cpus);
7186
-}
7187
-
7188
-static DEFINE_PER_CPU(cpumask_t, energy_cpus);
71896825
71906826 /*
71916827 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
....@@ -7226,27 +6862,41 @@
72266862 * other use-cases too. So, until someone finds a better way to solve this,
72276863 * let's keep things simple by re-using the existing slow path.
72286864 */
7229
-
72306865 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync)
72316866 {
7232
- unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
6867
+ unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6868
+ unsigned long best_delta2 = ULONG_MAX;
6869
+ unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
6870
+ unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
72336871 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
7234
- int weight, cpu, best_energy_cpu = prev_cpu;
7235
- unsigned long cur_energy;
7236
- struct perf_domain *pd;
6872
+ int max_spare_cap_cpu_ls = prev_cpu, best_idle_cpu = -1;
6873
+ unsigned long max_spare_cap_ls = 0, target_cap;
6874
+ unsigned long cpu_cap, util, base_energy = 0;
6875
+ bool boosted, latency_sensitive = false;
6876
+ unsigned int min_exit_lat = UINT_MAX;
6877
+ int cpu, best_energy_cpu = prev_cpu;
6878
+ struct cpuidle_state *idle;
72376879 struct sched_domain *sd;
7238
- cpumask_t *candidates;
6880
+ struct perf_domain *pd;
6881
+ int new_cpu = INT_MAX;
72396882
7240
- if (sysctl_sched_sync_hint_enable && sync) {
7241
- cpu = smp_processor_id();
7242
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
7243
- return cpu;
7244
- }
6883
+ sync_entity_load_avg(&p->se);
6884
+ trace_android_rvh_find_energy_efficient_cpu(p, prev_cpu, sync, &new_cpu);
6885
+ if (new_cpu != INT_MAX)
6886
+ return new_cpu;
72456887
72466888 rcu_read_lock();
72476889 pd = rcu_dereference(rd->pd);
72486890 if (!pd || READ_ONCE(rd->overutilized))
72496891 goto fail;
6892
+
6893
+ cpu = smp_processor_id();
6894
+ if (sync && cpu_rq(cpu)->nr_running == 1 &&
6895
+ cpumask_test_cpu(cpu, p->cpus_ptr) &&
6896
+ task_fits_cpu(p, cpu)) {
6897
+ rcu_read_unlock();
6898
+ return cpu;
6899
+ }
72506900
72516901 /*
72526902 * Energy-aware wake-up happens on the lowest sched_domain starting
....@@ -7258,59 +6908,169 @@
72586908 if (!sd)
72596909 goto fail;
72606910
7261
- sync_entity_load_avg(&p->se);
7262
- if (!task_util_est(p))
6911
+ if (!uclamp_task_util(p, p_util_min, p_util_max))
72636912 goto unlock;
72646913
7265
- /* Pre-select a set of candidate CPUs. */
7266
- candidates = this_cpu_ptr(&energy_cpus);
7267
- cpumask_clear(candidates);
6914
+ latency_sensitive = uclamp_latency_sensitive(p);
6915
+ boosted = uclamp_boosted(p);
6916
+ target_cap = boosted ? 0 : ULONG_MAX;
72686917
7269
- if (sched_feat(FIND_BEST_TARGET))
7270
- find_best_target(sd, candidates, p);
7271
- else
7272
- select_cpu_candidates(sd, candidates, pd, p, prev_cpu);
6918
+ for (; pd; pd = pd->next) {
6919
+ unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6920
+ unsigned long rq_util_min, rq_util_max;
6921
+ unsigned long util_min, util_max;
6922
+ unsigned long base_energy_pd;
6923
+ int max_spare_cap_cpu = -1;
72736924
7274
- /* Bail out if no candidate was found. */
7275
- weight = cpumask_weight(candidates);
7276
- if (!weight)
7277
- goto unlock;
6925
+ /* Compute the 'base' energy of the pd, without @p */
6926
+ base_energy_pd = compute_energy(p, -1, pd);
6927
+ base_energy += base_energy_pd;
72786928
7279
- /* If there is only one sensible candidate, select it now. */
7280
- cpu = cpumask_first(candidates);
7281
- if (weight == 1 && ((uclamp_latency_sensitive(p) && idle_cpu(cpu)) ||
7282
- (cpu == prev_cpu))) {
7283
- best_energy_cpu = cpu;
7284
- goto unlock;
7285
- }
6929
+ for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6930
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6931
+ continue;
72866932
7287
- if (cpumask_test_cpu(prev_cpu, p->cpus_ptr))
7288
- prev_energy = best_energy = compute_energy(p, prev_cpu, pd);
7289
- else
7290
- prev_energy = best_energy = ULONG_MAX;
6933
+ util = cpu_util_next(cpu, p, cpu);
6934
+ cpu_cap = capacity_of(cpu);
6935
+ spare_cap = cpu_cap;
6936
+ lsub_positive(&spare_cap, util);
72916937
7292
- /* Select the best candidate energy-wise. */
7293
- for_each_cpu(cpu, candidates) {
7294
- if (cpu == prev_cpu)
7295
- continue;
7296
- cur_energy = compute_energy(p, cpu, pd);
7297
- if (cur_energy < best_energy) {
7298
- best_energy = cur_energy;
7299
- best_energy_cpu = cpu;
6938
+ /*
6939
+ * Skip CPUs that cannot satisfy the capacity request.
6940
+ * IOW, placing the task there would make the CPU
6941
+ * overutilized. Take uclamp into account to see how
6942
+ * much capacity we can get out of the CPU; this is
6943
+ * aligned with schedutil_cpu_util().
6944
+ */
6945
+ if (uclamp_is_used()) {
6946
+ if (uclamp_rq_is_idle(cpu_rq(cpu))) {
6947
+ util_min = p_util_min;
6948
+ util_max = p_util_max;
6949
+ } else {
6950
+ /*
6951
+ * Open code uclamp_rq_util_with() except for
6952
+ * the clamp() part. Ie: apply max aggregation
6953
+ * only. util_fits_cpu() logic requires to
6954
+ * operate on non clamped util but must use the
6955
+ * max-aggregated uclamp_{min, max}.
6956
+ */
6957
+ rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
6958
+ rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
6959
+
6960
+ util_min = max(rq_util_min, p_util_min);
6961
+ util_max = max(rq_util_max, p_util_max);
6962
+ }
6963
+ }
6964
+ if (!util_fits_cpu(util, util_min, util_max, cpu))
6965
+ continue;
6966
+
6967
+ /* Always use prev_cpu as a candidate. */
6968
+ if (!latency_sensitive && cpu == prev_cpu) {
6969
+ prev_delta = compute_energy(p, prev_cpu, pd);
6970
+ prev_delta -= base_energy_pd;
6971
+ best_delta = min(best_delta, prev_delta);
6972
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6973
+ if (prev_delta == best_delta)
6974
+ best_energy_cpu = prev_cpu;
6975
+ }
6976
+ }
6977
+
6978
+ /*
6979
+ * Find the CPU with the maximum spare capacity in
6980
+ * the performance domain
6981
+ */
6982
+ if (spare_cap > max_spare_cap) {
6983
+ max_spare_cap = spare_cap;
6984
+ max_spare_cap_cpu = cpu;
6985
+ }
6986
+
6987
+ if (!IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6988
+ if (!latency_sensitive)
6989
+ continue;
6990
+ }
6991
+
6992
+ if (idle_cpu(cpu)) {
6993
+ cpu_cap = capacity_orig_of(cpu);
6994
+ if (boosted && cpu_cap < target_cap)
6995
+ continue;
6996
+ if (!boosted && cpu_cap > target_cap)
6997
+ continue;
6998
+ idle = idle_get_state(cpu_rq(cpu));
6999
+ if (idle && idle->exit_latency > min_exit_lat &&
7000
+ cpu_cap == target_cap)
7001
+ continue;
7002
+
7003
+ if (idle)
7004
+ min_exit_lat = idle->exit_latency;
7005
+ target_cap = cpu_cap;
7006
+ best_idle_cpu = cpu;
7007
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
7008
+ best_delta2 = compute_energy(p, cpu, pd);
7009
+ best_delta2 -= base_energy_pd;
7010
+ }
7011
+ } else if (spare_cap > max_spare_cap_ls) {
7012
+ max_spare_cap_ls = spare_cap;
7013
+ max_spare_cap_cpu_ls = cpu;
7014
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
7015
+ if (best_idle_cpu == -1) {
7016
+ best_delta2 = compute_energy(p, cpu, pd);
7017
+ best_delta2 -= base_energy_pd;
7018
+ }
7019
+ }
7020
+ }
7021
+ }
7022
+
7023
+ /* Evaluate the energy impact of using this CPU. */
7024
+ if (!latency_sensitive && max_spare_cap_cpu >= 0 &&
7025
+ max_spare_cap_cpu != prev_cpu) {
7026
+ cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
7027
+ cur_delta -= base_energy_pd;
7028
+ if (cur_delta < best_delta) {
7029
+ best_delta = cur_delta;
7030
+ best_energy_cpu = max_spare_cap_cpu;
7031
+ }
73007032 }
73017033 }
73027034 unlock:
73037035 rcu_read_unlock();
73047036
7037
+ if (latency_sensitive)
7038
+ return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
7039
+
73057040 /*
73067041 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
73077042 * least 6% of the energy used by prev_cpu.
73087043 */
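The right shifts below are where those percentages come from: >> 4 divides by 16, so best_energy_cpu is only preferred when the estimated saving prev_delta - best_delta exceeds (prev_delta + base_energy) / 16, about 6.25% of the total, and the Rockchip-specific >> 5 check further down tolerates an increase of up to (prev_delta + base_energy) / 32, about 3.1%.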
7309
- if (prev_energy == ULONG_MAX)
7044
+ if (prev_delta == ULONG_MAX)
73107045 return best_energy_cpu;
73117046
7312
- if ((prev_energy - best_energy) > (prev_energy >> 4))
7047
+ if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
73137048 return best_energy_cpu;
7049
+
7050
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
7051
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
7052
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
7053
+ int level = rockchip_perf_get_level();
7054
+
7055
+ /*
7056
+ * When ROCKCHIP_PERFORMANCE_LOW is selected:
7057
+ * pick best_energy_cpu if prev_cpu is a big CPU and best_energy_cpu
7058
+ * is a little CPU, so that tasks can migrate from big CPUs to little
7059
+ * CPUs more easily to save power.
7060
+ */
7061
+ if ((level == ROCKCHIP_PERFORMANCE_LOW) && cpul_mask &&
7062
+ cpub_mask && cpumask_test_cpu(prev_cpu, cpub_mask) &&
7063
+ cpumask_test_cpu(best_energy_cpu, cpul_mask)) {
7064
+ return best_energy_cpu;
7065
+ }
7066
+
7067
+ /*
7068
+ * Pick the idlest CPU if it only increases power slightly (< 3.1%).
7069
+ */
7070
+ if ((best_delta2 <= prev_delta) ||
7071
+ ((best_delta2 - prev_delta) < ((prev_delta + base_energy) >> 5)))
7072
+ return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
7073
+ }
73147074
73157075 return prev_cpu;
73167076
....@@ -7333,39 +7093,44 @@
73337093 * preempt must be disabled.
73347094 */
73357095 static int
7336
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
7337
- int sibling_count_hint)
7096
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
73387097 {
73397098 struct sched_domain *tmp, *sd = NULL;
73407099 int cpu = smp_processor_id();
73417100 int new_cpu = prev_cpu;
73427101 int want_affine = 0;
73437102 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
7103
+ int target_cpu = -1;
7104
+
7105
+ if (trace_android_rvh_select_task_rq_fair_enabled() &&
7106
+ !(sd_flag & SD_BALANCE_FORK))
7107
+ sync_entity_load_avg(&p->se);
7108
+ trace_android_rvh_select_task_rq_fair(p, prev_cpu, sd_flag,
7109
+ wake_flags, &target_cpu);
7110
+ if (target_cpu >= 0)
7111
+ return target_cpu;
73447112
73457113 if (sd_flag & SD_BALANCE_WAKE) {
73467114 record_wakee(p);
73477115
7348
- if (static_branch_unlikely(&sched_energy_present)) {
7349
- if (uclamp_latency_sensitive(p) && !sched_feat(EAS_PREFER_IDLE) && !sync)
7350
- goto sd_loop;
7116
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
7117
+ if (rockchip_perf_get_level() == ROCKCHIP_PERFORMANCE_HIGH)
7118
+ goto no_eas;
7119
+ }
73517120
7121
+ if (sched_energy_enabled()) {
73527122 new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync);
73537123 if (new_cpu >= 0)
73547124 return new_cpu;
73557125 new_cpu = prev_cpu;
73567126 }
73577127
7358
- want_affine = !wake_wide(p, sibling_count_hint) &&
7359
- !wake_cap(p, cpu, prev_cpu) &&
7360
- cpumask_test_cpu(cpu, p->cpus_ptr);
7128
+no_eas:
7129
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
73617130 }
73627131
7363
-sd_loop:
73647132 rcu_read_lock();
73657133 for_each_domain(cpu, tmp) {
7366
- if (!(tmp->flags & SD_LOAD_BALANCE))
7367
- break;
7368
-
73697134 /*
73707135 * If both 'cpu' and 'prev_cpu' are part of this domain,
73717136 * cpu is a valid SD_WAKE_AFFINE target.
....@@ -7392,6 +7157,23 @@
73927157 /* Fast path */
73937158
73947159 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
7160
+
7161
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
7162
+ struct root_domain *rd = cpu_rq(cpu)->rd;
7163
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
7164
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
7165
+ int level = rockchip_perf_get_level();
7166
+
7167
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
7168
+ cpul_mask && cpub_mask && cpumask_intersects(p->cpus_ptr, cpub_mask) &&
7169
+ cpumask_test_cpu(new_cpu, cpul_mask)) {
7170
+ for_each_domain(cpu, tmp) {
7171
+ sd = tmp;
7172
+ }
7173
+ if (sd)
7174
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
7175
+ }
7176
+ }
73957177
73967178 if (want_affine)
73977179 current->recent_used_cpu = cpu;
....@@ -7459,15 +7241,21 @@
74597241 /* Tell new CPU we are migrated */
74607242 p->se.avg.last_update_time = 0;
74617243
7462
- /* We have migrated, no longer consider this task hot */
7463
- p->se.exec_start = 0;
7464
-
74657244 update_scan_period(p, new_cpu);
74667245 }
74677246
74687247 static void task_dead_fair(struct task_struct *p)
74697248 {
74707249 remove_entity_load_avg(&p->se);
7250
+}
7251
+
7252
+static int
7253
+balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
7254
+{
7255
+ if (rq->nr_running)
7256
+ return 1;
7257
+
7258
+ return newidle_balance(rq, rf) != 0;
74717259 }
74727260 #endif /* CONFIG_SMP */
74737261
....@@ -7522,7 +7310,7 @@
75227310
75237311 static void set_last_buddy(struct sched_entity *se)
75247312 {
7525
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7313
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
75267314 return;
75277315
75287316 for_each_sched_entity(se) {
....@@ -7534,7 +7322,7 @@
75347322
75357323 static void set_next_buddy(struct sched_entity *se)
75367324 {
7537
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7325
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
75387326 return;
75397327
75407328 for_each_sched_entity(se) {
....@@ -7560,6 +7348,7 @@
75607348 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
75617349 int scale = cfs_rq->nr_running >= sched_nr_latency;
75627350 int next_buddy_marked = 0;
7351
+ bool preempt = false, nopreempt = false;
75637352
75647353 if (unlikely(se == pse))
75657354 return;
....@@ -7592,8 +7381,8 @@
75927381 return;
75937382
75947383 /* Idle tasks are by definition preempted by non-idle tasks. */
7595
- if (unlikely(curr->policy == SCHED_IDLE) &&
7596
- likely(p->policy != SCHED_IDLE))
7384
+ if (unlikely(task_has_idle_policy(curr)) &&
7385
+ likely(!task_has_idle_policy(p)))
75977386 goto preempt;
75987387
75997388 /*
....@@ -7605,6 +7394,12 @@
76057394
76067395 find_matching_se(&se, &pse);
76077396 update_curr(cfs_rq_of(se));
7397
+ trace_android_rvh_check_preempt_wakeup(rq, p, &preempt, &nopreempt,
7398
+ wake_flags, se, pse, next_buddy_marked, sysctl_sched_wakeup_granularity);
7399
+ if (preempt)
7400
+ goto preempt;
7401
+ if (nopreempt)
7402
+ return;
76087403 BUG_ON(!pse);
76097404 if (wakeup_preempt_entity(se, pse) == 1) {
76107405 /*
....@@ -7619,7 +7414,7 @@
76197414 return;
76207415
76217416 preempt:
7622
- resched_curr_lazy(rq);
7417
+ resched_curr(rq);
76237418 /*
76247419 * Only set the backward buddy when the current task is still
76257420 * on the rq. This can happen when a wakeup gets interleaved
....@@ -7636,20 +7431,21 @@
76367431 set_last_buddy(se);
76377432 }
76387433
7639
-static struct task_struct *
7434
+struct task_struct *
76407435 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
76417436 {
76427437 struct cfs_rq *cfs_rq = &rq->cfs;
7643
- struct sched_entity *se;
7644
- struct task_struct *p;
7438
+ struct sched_entity *se = NULL;
7439
+ struct task_struct *p = NULL;
76457440 int new_tasks;
7441
+ bool repick = false;
76467442
76477443 again:
7648
- if (!cfs_rq->nr_running)
7444
+ if (!sched_fair_runnable(rq))
76497445 goto idle;
76507446
76517447 #ifdef CONFIG_FAIR_GROUP_SCHED
7652
- if (prev->sched_class != &fair_sched_class)
7448
+ if (!prev || prev->sched_class != &fair_sched_class)
76537449 goto simple;
76547450
76557451 /*
....@@ -7696,7 +7492,7 @@
76967492 } while (cfs_rq);
76977493
76987494 p = task_of(se);
7699
-
7495
+ trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, false, prev);
77007496 /*
77017497 * Since we haven't yet done put_prev_entity and if the selected task
77027498 * is a different task than we started out with, try and touch the
....@@ -7726,8 +7522,15 @@
77267522 goto done;
77277523 simple:
77287524 #endif
7525
+ if (prev)
7526
+ put_prev_task(rq, prev);
77297527
7730
- put_prev_task(rq, prev);
7528
+ trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, true, prev);
7529
+ if (repick) {
7530
+ for_each_sched_entity(se)
7531
+ set_next_entity(cfs_rq_of(se), se);
7532
+ goto done;
7533
+ }
77317534
77327535 do {
77337536 se = pick_next_entity(cfs_rq, NULL);
....@@ -7755,11 +7558,13 @@
77557558 return p;
77567559
77577560 idle:
7758
- update_misfit_status(NULL, rq);
7759
- new_tasks = idle_balance(rq, rf);
7561
+ if (!rf)
7562
+ return NULL;
7563
+
7564
+ new_tasks = newidle_balance(rq, rf);
77607565
77617566 /*
7762
- * Because idle_balance() releases (and re-acquires) rq->lock, it is
7567
+ * Because newidle_balance() releases (and re-acquires) rq->lock, it is
77637568 * possible for any higher priority task to appear. In that case we
77647569 * must re-start the pick_next_entity() loop.
77657570 */
....@@ -7776,6 +7581,11 @@
77767581 update_idle_rq_clock_pelt(rq);
77777582
77787583 return NULL;
7584
+}
7585
+
7586
+static struct task_struct *__pick_next_task_fair(struct rq *rq)
7587
+{
7588
+ return pick_next_task_fair(rq, NULL, NULL);
77797589 }
77807590
77817591 /*
....@@ -7828,7 +7638,7 @@
78287638 set_skip_buddy(se);
78297639 }
78307640
7831
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
7641
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
78327642 {
78337643 struct sched_entity *se = &p->se;
78347644
....@@ -7963,15 +7773,54 @@
79637773 * rewrite all of this once again.]
79647774 */
79657775
7966
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7776
+unsigned long __read_mostly max_load_balance_interval = HZ/10;
7777
+EXPORT_SYMBOL_GPL(max_load_balance_interval);
79677778
79687779 enum fbq_type { regular, remote, all };
79697780
7781
+/*
7782
+ * 'group_type' describes the group of CPUs at the moment of load balancing.
7783
+ *
7784
+ * The enum is ordered by pulling priority, with the group with lowest priority
7785
+ * first so the group_type can simply be compared when selecting the busiest
7786
+ * group. See update_sd_pick_busiest().
7787
+ */
79707788 enum group_type {
7971
- group_other = 0,
7789
+ /* The group has spare capacity that can be used to run more tasks. */
7790
+ group_has_spare = 0,
7791
+ /*
7792
+ * The group is fully used and the tasks don't compete for more CPU
7793
+ * cycles. Nevertheless, some tasks might wait before running.
7794
+ */
7795
+ group_fully_busy,
7796
+ /*
7797
+ * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
7798
+ * and must be migrated to a more powerful CPU.
7799
+ */
79727800 group_misfit_task,
7801
+ /*
7802
+ * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
7803
+ * and the task should be migrated to it instead of running on the
7804
+ * current CPU.
7805
+ */
7806
+ group_asym_packing,
7807
+ /*
7808
+ * The tasks' affinity constraints previously prevented the scheduler
7809
+ * from balancing the load across the system.
7810
+ */
79737811 group_imbalanced,
7974
- group_overloaded,
7812
+ /*
7813
+ * The CPU is overloaded and can't provide expected CPU cycles to all
7814
+ * tasks.
7815
+ */
7816
+ group_overloaded
7817
+};
7818
+
7819
+enum migration_type {
7820
+ migrate_load = 0,
7821
+ migrate_util,
7822
+ migrate_task,
7823
+ migrate_misfit
79757824 };
79767825
79777826 #define LBF_ALL_PINNED 0x01
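A minimal standalone illustration of why the ordering documented above matters: the enum is re-declared locally in the same order, so "is this candidate busier than the current busiest?" becomes a plain integer comparison before any per-type tie-break (sketch only).

#include <stdio.h>

/* Same ordering as the enum introduced above: lowest pulling priority first. */
enum group_type {
	group_has_spare = 0,
	group_fully_busy,
	group_misfit_task,
	group_asym_packing,
	group_imbalanced,
	group_overloaded
};

/* Thanks to the ordering, the first-level busiest-group test is just '>'. */
static int busier_by_type(enum group_type candidate, enum group_type busiest)
{
	return candidate > busiest;
}

int main(void)
{
	printf("%d\n", busier_by_type(group_overloaded, group_fully_busy)); /* 1 */
	printf("%d\n", busier_by_type(group_has_spare, group_misfit_task)); /* 0 */
	return 0;
}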
....@@ -7994,7 +7843,6 @@
79947843 int new_dst_cpu;
79957844 enum cpu_idle_type idle;
79967845 long imbalance;
7997
- unsigned int src_grp_nr_running;
79987846 /* The set of CPUs under consideration for load-balancing */
79997847 struct cpumask *cpus;
80007848
....@@ -8005,8 +7853,9 @@
80057853 unsigned int loop_max;
80067854
80077855 enum fbq_type fbq_type;
8008
- enum group_type src_grp_type;
7856
+ enum migration_type migration_type;
80097857 struct list_head tasks;
7858
+ struct rq_flags *src_rq_rf;
80107859 };
80117860
80127861 /*
....@@ -8021,7 +7870,11 @@
80217870 if (p->sched_class != &fair_sched_class)
80227871 return 0;
80237872
8024
- if (unlikely(p->policy == SCHED_IDLE))
7873
+ if (unlikely(task_has_idle_policy(p)))
7874
+ return 0;
7875
+
7876
+ /* SMT siblings share cache */
7877
+ if (env->sd->flags & SD_SHARE_CPUCAPACITY)
80257878 return 0;
80267879
80277880 /*
....@@ -8109,8 +7962,13 @@
81097962 int can_migrate_task(struct task_struct *p, struct lb_env *env)
81107963 {
81117964 int tsk_cache_hot;
7965
+ int can_migrate = 1;
81127966
81137967 lockdep_assert_held(&env->src_rq->lock);
7968
+
7969
+ trace_android_rvh_can_migrate_task(p, env->dst_cpu, &can_migrate);
7970
+ if (!can_migrate)
7971
+ return 0;
81147972
81157973 /*
81167974 * We do not migrate tasks that are:
....@@ -8120,6 +7978,10 @@
81207978 * 4) are cache-hot on their current CPU.
81217979 */
81227980 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7981
+ return 0;
7982
+
7983
+ /* Disregard pcpu kthreads; they are where they need to be. */
7984
+ if (kthread_is_per_cpu(p))
81237985 return 0;
81247986
81257987 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
....@@ -8188,9 +8050,20 @@
81888050 */
81898051 static void detach_task(struct task_struct *p, struct lb_env *env)
81908052 {
8053
+ int detached = 0;
8054
+
81918055 lockdep_assert_held(&env->src_rq->lock);
81928056
8193
- p->on_rq = TASK_ON_RQ_MIGRATING;
8057
+ /*
8058
+ * The vendor hook may drop the lock temporarily, so
8059
+ * pass the rq flags to unpin lock. We expect the
8060
+ * rq lock to be held after return.
8061
+ */
8062
+ trace_android_rvh_migrate_queued_task(env->src_rq, env->src_rq_rf, p,
8063
+ env->dst_cpu, &detached);
8064
+ if (detached)
8065
+ return;
8066
+
81948067 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
81958068 set_task_cpu(p, env->dst_cpu);
81968069 }
....@@ -8229,7 +8102,7 @@
82298102 static const unsigned int sched_nr_migrate_break = 32;
82308103
82318104 /*
8232
- * detach_tasks() -- tries to detach up to imbalance weighted load from
8105
+ * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
82338106 * busiest_rq, as part of a balancing operation within domain "sd".
82348107 *
82358108 * Returns number of detached tasks if successful and 0 otherwise.
....@@ -8237,8 +8110,8 @@
82378110 static int detach_tasks(struct lb_env *env)
82388111 {
82398112 struct list_head *tasks = &env->src_rq->cfs_tasks;
8113
+ unsigned long util, load;
82408114 struct task_struct *p;
8241
- unsigned long load;
82428115 int detached = 0;
82438116
82448117 lockdep_assert_held(&env->src_rq->lock);
....@@ -8268,39 +8141,64 @@
82688141 break;
82698142 }
82708143
8271
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
8272
- if (sysctl_sched_performance_bias) {
8273
- if ((env->idle == CPU_NOT_IDLE) && (!task_fits_max(p, env->dst_cpu)))
8274
- goto next;
8275
- }
8276
-#endif
8277
-
82788144 if (!can_migrate_task(p, env))
82798145 goto next;
82808146
8281
- /*
8282
- * Depending of the number of CPUs and tasks and the
8283
- * cgroup hierarchy, task_h_load() can return a null
8284
- * value. Make sure that env->imbalance decreases
8285
- * otherwise detach_tasks() will stop only after
8286
- * detaching up to loop_max tasks.
8287
- */
8288
- load = max_t(unsigned long, task_h_load(p), 1);
8147
+ switch (env->migration_type) {
8148
+ case migrate_load:
8149
+ /*
8150
+ * Depending on the number of CPUs and tasks and the
8151
+ * cgroup hierarchy, task_h_load() can return a null
8152
+ * value. Make sure that env->imbalance decreases
8153
+ * otherwise detach_tasks() will stop only after
8154
+ * detaching up to loop_max tasks.
8155
+ */
8156
+ load = max_t(unsigned long, task_h_load(p), 1);
82898157
8158
+ if (sched_feat(LB_MIN) &&
8159
+ load < 16 && !env->sd->nr_balance_failed)
8160
+ goto next;
82908161
8291
- if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
8292
- goto next;
8162
+ /*
8163
+ * Make sure that we don't migrate too much load.
8164
+ * Nevertheless, let's relax the constraint if
8165
+ * the scheduler fails to find a good waiting task to
8166
+ * migrate.
8167
+ */
8168
+ if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
8169
+ goto next;
82938170
8294
- if ((load / 2) > env->imbalance)
8295
- goto next;
8171
+ env->imbalance -= load;
8172
+ break;
8173
+
8174
+ case migrate_util:
8175
+ util = task_util_est(p);
8176
+
8177
+ if (util > env->imbalance)
8178
+ goto next;
8179
+
8180
+ env->imbalance -= util;
8181
+ break;
8182
+
8183
+ case migrate_task:
8184
+ env->imbalance--;
8185
+ break;
8186
+
8187
+ case migrate_misfit:
8188
+ /* This is not a misfit task */
8189
+ if (task_fits_cpu(p, env->src_cpu))
8190
+ goto next;
8191
+
8192
+ env->imbalance = 0;
8193
+ break;
8194
+ }
82968195
82978196 detach_task(p, env);
82988197 list_add(&p->se.group_node, &env->tasks);
82998198
83008199 detached++;
8301
- env->imbalance -= load;
83028200
8303
-#ifdef CONFIG_PREEMPT
8201
+#ifdef CONFIG_PREEMPTION
83048202 /*
83058203 * NEWIDLE balancing is a source of latency, so preemptible
83068204 * kernels will stop after the first task is detached to minimize
....@@ -8312,7 +8210,7 @@
83128210
83138211 /*
83148212 * We only want to steal up to the prescribed amount of
8315
- * weighted load.
8213
+ * load/util/tasks.
83168214 */
83178215 if (env->imbalance <= 0)
83188216 break;
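A self-contained sketch of how the new switch above decrements env->imbalance per migration_type; shr_bound() is approximated from its apparent meaning (a shift clamped to the type width) and all values are invented.

#include <stdio.h>

enum migration_type { migrate_load, migrate_util, migrate_task, migrate_misfit };

/* Approximation of the kernel's shr_bound(): shift clamped to the type width. */
static long shr_bound(long val, unsigned int shift)
{
	unsigned int max_shift = sizeof(val) * 8 - 1;

	return val >> (shift < max_shift ? shift : max_shift);
}

/*
 * Returns 1 when the task (given its load/util) is detached and the remaining
 * imbalance updated, 0 when it should be skipped ("goto next" above).
 */
static int take_task(enum migration_type type, long *imbalance,
		     long load, long util, unsigned int nr_balance_failed)
{
	switch (type) {
	case migrate_load:
		/* the longer balancing keeps failing, the more load we accept */
		if (shr_bound(load, nr_balance_failed) > *imbalance)
			return 0;
		*imbalance -= load;
		return 1;
	case migrate_util:
		if (util > *imbalance)
			return 0;
		*imbalance -= util;
		return 1;
	case migrate_task:
		(*imbalance)--;
		return 1;
	case migrate_misfit:
		*imbalance = 0;
		return 1;
	}
	return 0;
}

int main(void)
{
	long imb = 300;

	/* a 512-load task is too big on the first try ... */
	printf("%d imb=%ld\n", take_task(migrate_load, &imb, 512, 0, 0), imb);
	/* ... but accepted once a balance attempt has failed (512 >> 1 = 256);
	 * the now non-positive imbalance ends the detach loop. */
	printf("%d imb=%ld\n", take_task(migrate_load, &imb, 512, 0, 1), imb);
	return 0;
}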
....@@ -8341,7 +8239,6 @@
83418239
83428240 BUG_ON(task_rq(p) != rq);
83438241 activate_task(rq, p, ENQUEUE_NOCLOCK);
8344
- p->on_rq = TASK_ON_RQ_QUEUED;
83458242 check_preempt_curr(rq, p, 0);
83468243 }
83478244
....@@ -8382,6 +8279,7 @@
83828279 rq_unlock(env->dst_rq, &rf);
83838280 }
83848281
8282
+#ifdef CONFIG_NO_HZ_COMMON
83858283 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
83868284 {
83878285 if (cfs_rq->avg.load_avg)
....@@ -8401,12 +8299,54 @@
84018299 if (READ_ONCE(rq->avg_dl.util_avg))
84028300 return true;
84038301
8302
+ if (thermal_load_avg(rq))
8303
+ return true;
8304
+
84048305 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
84058306 if (READ_ONCE(rq->avg_irq.util_avg))
84068307 return true;
84078308 #endif
84088309
84098310 return false;
8311
+}
8312
+
8313
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
8314
+{
8315
+ rq->last_blocked_load_update_tick = jiffies;
8316
+
8317
+ if (!has_blocked)
8318
+ rq->has_blocked_load = 0;
8319
+}
8320
+#else
8321
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
8322
+static inline bool others_have_blocked(struct rq *rq) { return false; }
8323
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
8324
+#endif
8325
+
8326
+static bool __update_blocked_others(struct rq *rq, bool *done)
8327
+{
8328
+ const struct sched_class *curr_class;
8329
+ u64 now = rq_clock_pelt(rq);
8330
+ unsigned long thermal_pressure;
8331
+ bool decayed;
8332
+
8333
+ /*
8334
+ * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
8335
+ * DL and IRQ signals have been updated before updating CFS.
8336
+ */
8337
+ curr_class = rq->curr->sched_class;
8338
+
8339
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
8340
+
8341
+ decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
8342
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
8343
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
8344
+ update_irq_load_avg(rq, 0);
8345
+
8346
+ if (others_have_blocked(rq))
8347
+ *done = false;
8348
+
8349
+ return decayed;
84108350 }
84118351
84128352 #ifdef CONFIG_FAIR_GROUP_SCHED
....@@ -8422,22 +8362,17 @@
84228362 if (cfs_rq->avg.util_sum)
84238363 return false;
84248364
8425
- if (cfs_rq->avg.runnable_load_sum)
8365
+ if (cfs_rq->avg.runnable_sum)
84268366 return false;
84278367
84288368 return true;
84298369 }
84308370
8431
-static void update_blocked_averages(int cpu)
8371
+static bool __update_blocked_fair(struct rq *rq, bool *done)
84328372 {
8433
- struct rq *rq = cpu_rq(cpu);
84348373 struct cfs_rq *cfs_rq, *pos;
8435
- const struct sched_class *curr_class;
8436
- struct rq_flags rf;
8437
- bool done = true;
8438
-
8439
- rq_lock_irqsave(rq, &rf);
8440
- update_rq_clock(rq);
8374
+ bool decayed = false;
8375
+ int cpu = cpu_of(rq);
84418376
84428377 /*
84438378 * Iterates the task_group tree in a bottom up fashion, see
....@@ -8446,8 +8381,12 @@
84468381 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
84478382 struct sched_entity *se;
84488383
8449
- if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
8450
- update_tg_load_avg(cfs_rq, 0);
8384
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
8385
+ update_tg_load_avg(cfs_rq);
8386
+
8387
+ if (cfs_rq == &rq->cfs)
8388
+ decayed = true;
8389
+ }
84518390
84528391 /* Propagate pending load changes to the parent, if any: */
84538392 se = cfs_rq->tg->se[cpu];
....@@ -8463,23 +8402,10 @@
84638402
84648403 /* Don't need periodic decay once load/util_avg are null */
84658404 if (cfs_rq_has_blocked(cfs_rq))
8466
- done = false;
8405
+ *done = false;
84678406 }
84688407
8469
- curr_class = rq->curr->sched_class;
8470
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
8471
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
8472
- update_irq_load_avg(rq, 0);
8473
- /* Don't need periodic decay once load/util_avg are null */
8474
- if (others_have_blocked(rq))
8475
- done = false;
8476
-
8477
-#ifdef CONFIG_NO_HZ_COMMON
8478
- rq->last_blocked_load_update_tick = jiffies;
8479
- if (done)
8480
- rq->has_blocked_load = 0;
8481
-#endif
8482
- rq_unlock_irqrestore(rq, &rf);
8408
+ return decayed;
84838409 }
84848410
84858411 /*
....@@ -8529,27 +8455,16 @@
85298455 cfs_rq_load_avg(cfs_rq) + 1);
85308456 }
85318457 #else
8532
-static inline void update_blocked_averages(int cpu)
8458
+static bool __update_blocked_fair(struct rq *rq, bool *done)
85338459 {
8534
- struct rq *rq = cpu_rq(cpu);
85358460 struct cfs_rq *cfs_rq = &rq->cfs;
8536
- const struct sched_class *curr_class;
8537
- struct rq_flags rf;
8461
+ bool decayed;
85388462
8539
- rq_lock_irqsave(rq, &rf);
8540
- update_rq_clock(rq);
8541
- update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8463
+ decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8464
+ if (cfs_rq_has_blocked(cfs_rq))
8465
+ *done = false;
85428466
8543
- curr_class = rq->curr->sched_class;
8544
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
8545
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
8546
- update_irq_load_avg(rq, 0);
8547
-#ifdef CONFIG_NO_HZ_COMMON
8548
- rq->last_blocked_load_update_tick = jiffies;
8549
- if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
8550
- rq->has_blocked_load = 0;
8551
-#endif
8552
- rq_unlock_irqrestore(rq, &rf);
8467
+ return decayed;
85538468 }
85548469
85558470 static unsigned long task_h_load(struct task_struct *p)
....@@ -8557,6 +8472,24 @@
85578472 return p->se.avg.load_avg;
85588473 }
85598474 #endif
8475
+
8476
+static void update_blocked_averages(int cpu)
8477
+{
8478
+ bool decayed = false, done = true;
8479
+ struct rq *rq = cpu_rq(cpu);
8480
+ struct rq_flags rf;
8481
+
8482
+ rq_lock_irqsave(rq, &rf);
8483
+ update_rq_clock(rq);
8484
+
8485
+ decayed |= __update_blocked_others(rq, &done);
8486
+ decayed |= __update_blocked_fair(rq, &done);
8487
+
8488
+ update_blocked_load_status(rq, !done);
8489
+ if (decayed)
8490
+ cpufreq_update_util(rq, 0);
8491
+ rq_unlock_irqrestore(rq, &rf);
8492
+}
85608493
85618494 /********** Helpers for find_busiest_group ************************/
85628495
....@@ -8566,15 +8499,15 @@
85668499 struct sg_lb_stats {
85678500 unsigned long avg_load; /*Avg load across the CPUs of the group */
85688501 unsigned long group_load; /* Total load over the CPUs of the group */
8569
- unsigned long sum_weighted_load; /* Weighted load of group's tasks */
8570
- unsigned long load_per_task;
85718502 unsigned long group_capacity;
8572
- unsigned long group_util; /* Total utilization of the group */
8573
- unsigned int sum_nr_running; /* Nr tasks running in the group */
8503
+ unsigned long group_util; /* Total utilization over the CPUs of the group */
8504
+ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
8505
+ unsigned int sum_nr_running; /* Nr of tasks running in the group */
8506
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
85748507 unsigned int idle_cpus;
85758508 unsigned int group_weight;
85768509 enum group_type group_type;
8577
- int group_no_capacity;
8510
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
85788511 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
85798512 #ifdef CONFIG_NUMA_BALANCING
85808513 unsigned int nr_numa_running;
....@@ -8589,10 +8522,10 @@
85898522 struct sd_lb_stats {
85908523 struct sched_group *busiest; /* Busiest group in this sd */
85918524 struct sched_group *local; /* Local group in this sd */
8592
- unsigned long total_running;
85938525 unsigned long total_load; /* Total load of all groups in sd */
85948526 unsigned long total_capacity; /* Total capacity of all groups in sd */
85958527 unsigned long avg_load; /* Average load across all groups in sd */
8528
+ unsigned int prefer_sibling; /* tasks should go to sibling first */
85968529
85978530 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
85988531 struct sg_lb_stats local_stat; /* Statistics of the local group */
....@@ -8603,54 +8536,26 @@
86038536 /*
86048537 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
86058538 * local_stat because update_sg_lb_stats() does a full clear/assignment.
8606
- * We must however clear busiest_stat::avg_load because
8607
- * update_sd_pick_busiest() reads this before assignment.
8539
+ * We must however set busiest_stat::group_type and
8540
+ * busiest_stat::idle_cpus to the worst busiest group because
8541
+ * update_sd_pick_busiest() reads these before assignment.
86088542 */
86098543 *sds = (struct sd_lb_stats){
86108544 .busiest = NULL,
86118545 .local = NULL,
8612
- .total_running = 0UL,
86138546 .total_load = 0UL,
86148547 .total_capacity = 0UL,
86158548 .busiest_stat = {
8616
- .avg_load = 0UL,
8617
- .sum_nr_running = 0,
8618
- .group_type = group_other,
8549
+ .idle_cpus = UINT_MAX,
8550
+ .group_type = group_has_spare,
86198551 },
86208552 };
86218553 }
86228554
8623
-/**
8624
- * get_sd_load_idx - Obtain the load index for a given sched domain.
8625
- * @sd: The sched_domain whose load_idx is to be obtained.
8626
- * @idle: The idle status of the CPU for whose sd load_idx is obtained.
8627
- *
8628
- * Return: The load index.
8629
- */
8630
-static inline int get_sd_load_idx(struct sched_domain *sd,
8631
- enum cpu_idle_type idle)
8632
-{
8633
- int load_idx;
8634
-
8635
- switch (idle) {
8636
- case CPU_NOT_IDLE:
8637
- load_idx = sd->busy_idx;
8638
- break;
8639
-
8640
- case CPU_NEWLY_IDLE:
8641
- load_idx = sd->newidle_idx;
8642
- break;
8643
- default:
8644
- load_idx = sd->idle_idx;
8645
- break;
8646
- }
8647
-
8648
- return load_idx;
8649
-}
8650
-
8651
-static unsigned long scale_rt_capacity(int cpu, unsigned long max)
8555
+static unsigned long scale_rt_capacity(int cpu)
86528556 {
86538557 struct rq *rq = cpu_rq(cpu);
8558
+ unsigned long max = arch_scale_cpu_capacity(cpu);
86548559 unsigned long used, free;
86558560 unsigned long irq;
86568561
....@@ -8659,8 +8564,15 @@
86598564 if (unlikely(irq >= max))
86608565 return 1;
86618566
8567
+ /*
8568
+ * avg_rt.util_avg and avg_dl.util_avg track binary signals
8569
+ * (running and not running) with weights 0 and 1024 respectively.
8570
+ * avg_thermal.load_avg tracks thermal pressure and the weighted
8571
+ * average uses the actual delta of max capacity (load).
8572
+ */
86628573 used = READ_ONCE(rq->avg_rt.util_avg);
86638574 used += READ_ONCE(rq->avg_dl.util_avg);
8575
+ used += thermal_load_avg(rq);
86648576
86658577 if (unlikely(used >= max))
86668578 return 1;
....@@ -8670,52 +8582,20 @@
86708582 return scale_irq_capacity(free, irq, max);
86718583 }
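A toy recomputation of the capacity math above: RT, DL and thermal pressure are subtracted from the CPU's original capacity, then the remainder is scaled by the non-IRQ fraction of time. scale_irq_capacity() is reproduced here from its apparent definition (free * (max - irq) / max) and should be read as an assumption; the numbers are made up.

#include <stdio.h>

/* Assumed shape of the kernel helper: scale 'free' by the non-IRQ fraction. */
static unsigned long scale_irq_capacity(unsigned long free, unsigned long irq,
					unsigned long max)
{
	return free * (max - irq) / max;
}

static unsigned long scale_rt_capacity_toy(unsigned long max,
					   unsigned long irq,
					   unsigned long rt_util,
					   unsigned long dl_util,
					   unsigned long thermal)
{
	unsigned long used, free;

	if (irq >= max)
		return 1;

	used = rt_util + dl_util + thermal;	/* pressure from non-CFS work */
	if (used >= max)
		return 1;

	free = max - used;
	return scale_irq_capacity(free, irq, max);
}

int main(void)
{
	/* 1024-capacity CPU, ~5% IRQ time, some RT and thermal pressure */
	printf("cfs capacity ~= %lu\n",
	       scale_rt_capacity_toy(1024, 51, 100, 0, 80)); /* prints 801 */
	return 0;
}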
86728584
8673
-void init_max_cpu_capacity(struct max_cpu_capacity *mcc) {
8674
- raw_spin_lock_init(&mcc->lock);
8675
- mcc->val = 0;
8676
- mcc->cpu = -1;
8677
-}
8678
-
86798585 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
86808586 {
8681
- unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
8587
+ unsigned long capacity = scale_rt_capacity(cpu);
86828588 struct sched_group *sdg = sd->groups;
8683
- struct max_cpu_capacity *mcc;
8684
- unsigned long max_capacity;
8685
- int max_cap_cpu;
8686
- unsigned long flags;
86878589
8688
- cpu_rq(cpu)->cpu_capacity_orig = capacity;
8689
-
8690
- capacity *= arch_scale_max_freq_capacity(sd, cpu);
8691
- capacity >>= SCHED_CAPACITY_SHIFT;
8692
-
8693
- mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
8694
-
8695
- raw_spin_lock_irqsave(&mcc->lock, flags);
8696
- max_capacity = mcc->val;
8697
- max_cap_cpu = mcc->cpu;
8698
-
8699
- if ((max_capacity > capacity && max_cap_cpu == cpu) ||
8700
- (max_capacity < capacity)) {
8701
- mcc->val = capacity;
8702
- mcc->cpu = cpu;
8703
-#ifdef CONFIG_SCHED_DEBUG
8704
- raw_spin_unlock_irqrestore(&mcc->lock, flags);
8705
- //printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
8706
- // cpu, capacity);
8707
- goto skip_unlock;
8708
-#endif
8709
- }
8710
- raw_spin_unlock_irqrestore(&mcc->lock, flags);
8711
-
8712
-skip_unlock: __attribute__ ((unused));
8713
- capacity = scale_rt_capacity(cpu, capacity);
8590
+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
87148591
87158592 if (!capacity)
87168593 capacity = 1;
87178594
8595
+ trace_android_rvh_update_cpu_capacity(cpu, &capacity);
87188596 cpu_rq(cpu)->cpu_capacity = capacity;
8597
+ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
8598
+
87198599 sdg->sgc->capacity = capacity;
87208600 sdg->sgc->min_capacity = capacity;
87218601 sdg->sgc->max_capacity = capacity;
....@@ -8748,29 +8628,11 @@
87488628 */
87498629
87508630 for_each_cpu(cpu, sched_group_span(sdg)) {
8751
- struct sched_group_capacity *sgc;
8752
- struct rq *rq = cpu_rq(cpu);
8631
+ unsigned long cpu_cap = capacity_of(cpu);
87538632
8754
- /*
8755
- * build_sched_domains() -> init_sched_groups_capacity()
8756
- * gets here before we've attached the domains to the
8757
- * runqueues.
8758
- *
8759
- * Use capacity_of(), which is set irrespective of domains
8760
- * in update_cpu_capacity().
8761
- *
8762
- * This avoids capacity from being 0 and
8763
- * causing divide-by-zero issues on boot.
8764
- */
8765
- if (unlikely(!rq->sd)) {
8766
- capacity += capacity_of(cpu);
8767
- } else {
8768
- sgc = rq->sd->groups->sgc;
8769
- capacity += sgc->capacity;
8770
- }
8771
-
8772
- min_capacity = min(capacity, min_capacity);
8773
- max_capacity = max(capacity, max_capacity);
8633
+ capacity += cpu_cap;
8634
+ min_capacity = min(cpu_cap, min_capacity);
8635
+ max_capacity = max(cpu_cap, max_capacity);
87748636 }
87758637 } else {
87768638 /*
....@@ -8804,6 +8666,18 @@
88048666 {
88058667 return ((rq->cpu_capacity * sd->imbalance_pct) <
88068668 (rq->cpu_capacity_orig * 100));
8669
+}
8670
+
8671
+/*
8672
+ * Check whether a rq has a misfit task and if it looks like we can actually
8673
+ * help that task: we can migrate the task to a CPU of higher capacity, or
8674
+ * the task's current CPU is heavily pressured.
8675
+ */
8676
+static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8677
+{
8678
+ return rq->misfit_task_load &&
8679
+ (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8680
+ check_cpu_capacity(rq, sd));
88078681 }
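The two predicates above reduce to simple ratio tests; a hedged standalone restatement with invented numbers (imbalance_pct = 117 is a typical value, not taken from this patch).

#include <stdio.h>
#include <stdbool.h>

/* cpu_capacity is "pressured" once it drops below orig scaled by imbalance_pct. */
static bool check_cpu_capacity_toy(unsigned long cap, unsigned long cap_orig,
				   unsigned int imbalance_pct)
{
	return cap * imbalance_pct < cap_orig * 100;
}

/* A misfit task is worth helping if a bigger CPU exists in the root domain,
 * or the task's current CPU is itself capacity-pressured. */
static bool check_misfit_status_toy(unsigned long misfit_load,
				    unsigned long cap, unsigned long cap_orig,
				    unsigned long rd_max_capacity,
				    unsigned int imbalance_pct)
{
	return misfit_load &&
	       (cap_orig < rd_max_capacity ||
		check_cpu_capacity_toy(cap, cap_orig, imbalance_pct));
}

int main(void)
{
	/* little CPU (orig 430) in a system whose biggest CPU has 1024 */
	printf("%d\n", check_misfit_status_toy(300, 430, 430, 1024, 117));   /* 1 */
	/* big unpressured CPU: nothing to gain from moving the task */
	printf("%d\n", check_misfit_status_toy(300, 1024, 1024, 1024, 117)); /* 0 */
	return 0;
}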
88088682
88098683 /*
....@@ -8853,13 +8727,17 @@
88538727 * any benefit for the load balance.
88548728 */
88558729 static inline bool
8856
-group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
8730
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
88578731 {
88588732 if (sgs->sum_nr_running < sgs->group_weight)
88598733 return true;
88608734
8735
+ if ((sgs->group_capacity * imbalance_pct) <
8736
+ (sgs->group_runnable * 100))
8737
+ return false;
8738
+
88618739 if ((sgs->group_capacity * 100) >
8862
- (sgs->group_util * env->sd->imbalance_pct))
8740
+ (sgs->group_util * imbalance_pct))
88638741 return true;
88648742
88658743 return false;
....@@ -8874,13 +8752,17 @@
88748752 * false.
88758753 */
88768754 static inline bool
8877
-group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
8755
+group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
88788756 {
88798757 if (sgs->sum_nr_running <= sgs->group_weight)
88808758 return false;
88818759
88828760 if ((sgs->group_capacity * 100) <
8883
- (sgs->group_util * env->sd->imbalance_pct))
8761
+ (sgs->group_util * imbalance_pct))
8762
+ return true;
8763
+
8764
+ if ((sgs->group_capacity * imbalance_pct) <
8765
+ (sgs->group_runnable * 100))
88848766 return true;
88858767
88868768 return false;
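With the group_runnable term added above, both predicates compare capacity against utilization and runnable pressure scaled by imbalance_pct. A standalone restatement with invented figures (imbalance_pct = 117 is an assumption):

#include <stdio.h>
#include <stdbool.h>

struct sg_stats {
	unsigned long group_capacity;
	unsigned long group_util;
	unsigned long group_runnable;
	unsigned int  sum_nr_running;
	unsigned int  group_weight;
};

static bool group_has_capacity_toy(unsigned int imbalance_pct,
				   const struct sg_stats *s)
{
	if (s->sum_nr_running < s->group_weight)
		return true;
	if (s->group_capacity * imbalance_pct < s->group_runnable * 100)
		return false;			/* runnable pressure too high */
	if (s->group_capacity * 100 > s->group_util * imbalance_pct)
		return true;			/* util still leaves headroom */
	return false;
}

static bool group_is_overloaded_toy(unsigned int imbalance_pct,
				    const struct sg_stats *s)
{
	if (s->sum_nr_running <= s->group_weight)
		return false;
	if (s->group_capacity * 100 < s->group_util * imbalance_pct)
		return true;
	if (s->group_capacity * imbalance_pct < s->group_runnable * 100)
		return true;
	return false;
}

int main(void)
{
	/* 2 CPUs, capacity 2048, 5 runnable tasks, heavy utilization */
	struct sg_stats s = { 2048, 1900, 2300, 5, 2 };

	printf("has_capacity=%d overloaded=%d\n",
	       group_has_capacity_toy(117, &s),
	       group_is_overloaded_toy(117, &s));	/* 0 and 1 */
	return 0;
}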
....@@ -8893,8 +8775,7 @@
88938775 static inline bool
88948776 group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
88958777 {
8896
- return sg->sgc->min_capacity * capacity_margin <
8897
- ref->sgc->min_capacity * 1024;
8778
+ return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
88988779 }
88998780
89008781 /*
....@@ -8904,24 +8785,30 @@
89048785 static inline bool
89058786 group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
89068787 {
8907
- return sg->sgc->max_capacity * capacity_margin <
8908
- ref->sgc->max_capacity * 1024;
8788
+ return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
89098789 }
89108790
89118791 static inline enum
8912
-group_type group_classify(struct sched_group *group,
8792
+group_type group_classify(unsigned int imbalance_pct,
8793
+ struct sched_group *group,
89138794 struct sg_lb_stats *sgs)
89148795 {
8915
- if (sgs->group_no_capacity)
8796
+ if (group_is_overloaded(imbalance_pct, sgs))
89168797 return group_overloaded;
89178798
89188799 if (sg_imbalanced(group))
89198800 return group_imbalanced;
89208801
8802
+ if (sgs->group_asym_packing)
8803
+ return group_asym_packing;
8804
+
89218805 if (sgs->group_misfit_task_load)
89228806 return group_misfit_task;
89238807
8924
- return group_other;
8808
+ if (!group_has_capacity(imbalance_pct, sgs))
8809
+ return group_fully_busy;
8810
+
8811
+ return group_has_spare;
89258812 }
89268813
89278814 static bool update_nohz_stats(struct rq *rq, bool force)
....@@ -8958,12 +8845,11 @@
89588845 struct sg_lb_stats *sgs,
89598846 int *sg_status)
89608847 {
8961
- int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8962
- int load_idx = get_sd_load_idx(env->sd, env->idle);
8963
- unsigned long load;
8964
- int i, nr_running;
8848
+ int i, nr_running, local_group;
89658849
89668850 memset(sgs, 0, sizeof(*sgs));
8851
+
8852
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
89678853
89688854 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
89698855 struct rq *rq = cpu_rq(i);
....@@ -8971,17 +8857,14 @@
89718857 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
89728858 env->flags |= LBF_NOHZ_AGAIN;
89738859
8974
- /* Bias balancing toward CPUs of our domain: */
8975
- if (local_group)
8976
- load = target_load(i, load_idx);
8977
- else
8978
- load = source_load(i, load_idx);
8979
-
8980
- sgs->group_load += load;
8860
+ sgs->group_load += cpu_load(rq);
89818861 sgs->group_util += cpu_util(i);
8982
- sgs->sum_nr_running += rq->cfs.h_nr_running;
8862
+ sgs->group_runnable += cpu_runnable(rq);
8863
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running;
89838864
89848865 nr_running = rq->nr_running;
8866
+ sgs->sum_nr_running += nr_running;
8867
+
89858868 if (nr_running > 1)
89868869 *sg_status |= SG_OVERLOAD;
89878870
....@@ -8992,13 +8875,19 @@
89928875 sgs->nr_numa_running += rq->nr_numa_running;
89938876 sgs->nr_preferred_running += rq->nr_preferred_running;
89948877 #endif
8995
- sgs->sum_weighted_load += weighted_cpuload(rq);
89968878 /*
89978879 * No need to call idle_cpu() if nr_running is not 0
89988880 */
8999
- if (!nr_running && idle_cpu(i))
8881
+ if (!nr_running && idle_cpu(i)) {
90008882 sgs->idle_cpus++;
8883
+ /* Idle cpu can't have misfit task */
8884
+ continue;
8885
+ }
90018886
8887
+ if (local_group)
8888
+ continue;
8889
+
8890
+ /* Check for a misfit task on the cpu */
90028891 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
90038892 sgs->group_misfit_task_load < rq->misfit_task_load) {
90048893 sgs->group_misfit_task_load = rq->misfit_task_load;
....@@ -9006,17 +8895,24 @@
90068895 }
90078896 }
90088897
9009
- /* Adjust by relative CPU capacity of the group */
9010
- sgs->group_capacity = group->sgc->capacity;
9011
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8898
+ /* Check if dst CPU is idle and preferred to this group */
8899
+ if (env->sd->flags & SD_ASYM_PACKING &&
8900
+ env->idle != CPU_NOT_IDLE &&
8901
+ sgs->sum_h_nr_running &&
8902
+ sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
8903
+ sgs->group_asym_packing = 1;
8904
+ }
90128905
9013
- if (sgs->sum_nr_running)
9014
- sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
8906
+ sgs->group_capacity = group->sgc->capacity;
90158907
90168908 sgs->group_weight = group->group_weight;
90178909
9018
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
9019
- sgs->group_type = group_classify(group, sgs);
8910
+ sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
8911
+
8912
+ /* Computing avg_load makes sense only when group is overloaded */
8913
+ if (sgs->group_type == group_overloaded)
8914
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8915
+ sgs->group_capacity;
90208916 }
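A small worked example of the avg_load expression that is now computed only for overloaded groups (SCHED_CAPACITY_SCALE is 1024; the group figures are invented): normalising by capacity lets big and little groups be compared on equal terms.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/* load normalised by group capacity */
static unsigned long sg_avg_load(unsigned long group_load,
				 unsigned long group_capacity)
{
	return group_load * SCHED_CAPACITY_SCALE / group_capacity;
}

int main(void)
{
	/* 4 little CPUs (4 * 430) vs 2 big CPUs (2 * 1024), same raw load */
	printf("little: %lu  big: %lu\n",
	       sg_avg_load(3000, 4 * 430),	/* 1786: relatively busier */
	       sg_avg_load(3000, 2 * 1024));	/* 1500 */
	return 0;
}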
90218917
90228918 /**
....@@ -9039,6 +8935,10 @@
90398935 {
90408936 struct sg_lb_stats *busiest = &sds->busiest_stat;
90418937
8938
+ /* Make sure that there is at least one task to pull */
8939
+ if (!sgs->sum_h_nr_running)
8940
+ return false;
8941
+
90428942 /*
90438943 * Don't try to pull misfit tasks we can't help.
90448944 * We can use max_capacity here as reduction in capacity on some
....@@ -9047,7 +8947,7 @@
90478947 */
90488948 if (sgs->group_type == group_misfit_task &&
90498949 (!group_smaller_max_cpu_capacity(sg, sds->local) ||
9050
- !group_has_capacity(env, &sds->local_stat)))
8950
+ sds->local_stat.group_type != group_has_spare))
90518951 return false;
90528952
90538953 if (sgs->group_type > busiest->group_type)
....@@ -9056,62 +8956,92 @@
90568956 if (sgs->group_type < busiest->group_type)
90578957 return false;
90588958
9059
- if (sgs->avg_load <= busiest->avg_load)
8959
+ /*
8960
+ * The candidate and the current busiest group are the same type of
8961
+ * group. Let's check which one is the busiest according to the type.
8962
+ */
8963
+
8964
+ switch (sgs->group_type) {
8965
+ case group_overloaded:
8966
+ /* Select the overloaded group with highest avg_load. */
8967
+ if (sgs->avg_load <= busiest->avg_load)
8968
+ return false;
8969
+ break;
8970
+
8971
+ case group_imbalanced:
8972
+ /*
8973
+ * Select the 1st imbalanced group as we don't have any way to
8974
+ * choose one more than another.
8975
+ */
90608976 return false;
90618977
9062
- if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
9063
- goto asym_packing;
9064
-
9065
- /*
9066
- * Candidate sg has no more than one task per CPU and
9067
- * has higher per-CPU capacity. Migrating tasks to less
9068
- * capable CPUs may harm throughput. Maximize throughput,
9069
- * power/energy consequences are not considered.
9070
- */
9071
- if (sgs->sum_nr_running <= sgs->group_weight &&
9072
- group_smaller_min_cpu_capacity(sds->local, sg))
9073
- return false;
9074
-
9075
- /*
9076
- * If we have more than one misfit sg go with the biggest misfit.
9077
- */
9078
- if (sgs->group_type == group_misfit_task &&
9079
- sgs->group_misfit_task_load < busiest->group_misfit_task_load)
9080
- return false;
9081
-
9082
-asym_packing:
9083
- /* This is the busiest node in its class. */
9084
- if (!(env->sd->flags & SD_ASYM_PACKING))
9085
- return true;
9086
-
9087
- /* No ASYM_PACKING if target CPU is already busy */
9088
- if (env->idle == CPU_NOT_IDLE)
9089
- return true;
9090
- /*
9091
- * ASYM_PACKING needs to move all the work to the highest
9092
- * prority CPUs in the group, therefore mark all groups
9093
- * of lower priority than ourself as busy.
9094
- */
9095
- if (sgs->sum_nr_running &&
9096
- sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
9097
- if (!sds->busiest)
9098
- return true;
9099
-
8978
+ case group_asym_packing:
91008979 /* Prefer to move from lowest priority CPU's work */
9101
- if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
9102
- sg->asym_prefer_cpu))
9103
- return true;
8980
+ if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
8981
+ return false;
8982
+ break;
8983
+
8984
+ case group_misfit_task:
8985
+ /*
8986
+ * If we have more than one misfit sg go with the biggest
8987
+ * misfit.
8988
+ */
8989
+ if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8990
+ return false;
8991
+ break;
8992
+
8993
+ case group_fully_busy:
8994
+ /*
8995
+ * Select the fully busy group with highest avg_load. In
8996
+ * theory, there is no need to pull task from such kind of
8997
+ * group because tasks have all compute capacity that they need
8998
+ * but we can still improve the overall throughput by reducing
8999
+ * contention when accessing shared HW resources.
9000
+ *
9001
+ * XXX for now avg_load is not computed and always 0 so we
9002
+ * select the 1st one.
9003
+ */
9004
+ if (sgs->avg_load <= busiest->avg_load)
9005
+ return false;
9006
+ break;
9007
+
9008
+ case group_has_spare:
9009
+ /*
9010
+ * Select not overloaded group with lowest number of idle cpus
9011
+ * and highest number of running tasks. We could also compare
9012
+ * the spare capacity which is more stable but it can end up
9013
+ * that the group has less spare capacity but finally more idle
9014
+ * CPUs which means less opportunity to pull tasks.
9015
+ */
9016
+ if (sgs->idle_cpus > busiest->idle_cpus)
9017
+ return false;
9018
+ else if ((sgs->idle_cpus == busiest->idle_cpus) &&
9019
+ (sgs->sum_nr_running <= busiest->sum_nr_running))
9020
+ return false;
9021
+
9022
+ break;
91049023 }
91059024
9106
- return false;
9025
+ /*
9026
+ * Candidate sg has no more than one task per CPU and has higher
9027
+ * per-CPU capacity. Migrating tasks to less capable CPUs may harm
9028
+ * throughput. Maximize throughput, power/energy consequences are not
9029
+ * considered.
9030
+ */
9031
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
9032
+ (sgs->group_type <= group_fully_busy) &&
9033
+ (group_smaller_min_cpu_capacity(sds->local, sg)))
9034
+ return false;
9035
+
9036
+ return true;
91079037 }
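A compact sketch of the group_has_spare tie-break added above — the candidate with fewer idle CPUs wins, and on a tie the one with more running tasks; the struct is a local stand-in for sg_lb_stats, not the kernel type.

#include <stdio.h>
#include <stdbool.h>

struct spare_stats {
	unsigned int idle_cpus;
	unsigned int sum_nr_running;
};

/* true if 'cand' should replace 'busiest' when both merely have spare capacity */
static bool busier_has_spare(const struct spare_stats *cand,
			     const struct spare_stats *busiest)
{
	if (cand->idle_cpus > busiest->idle_cpus)
		return false;
	if (cand->idle_cpus == busiest->idle_cpus &&
	    cand->sum_nr_running <= busiest->sum_nr_running)
		return false;
	return true;
}

int main(void)
{
	struct spare_stats a = { .idle_cpus = 1, .sum_nr_running = 5 };
	struct spare_stats b = { .idle_cpus = 2, .sum_nr_running = 7 };

	printf("%d\n", busier_has_spare(&a, &b)); /* 1: fewer idle CPUs wins */
	printf("%d\n", busier_has_spare(&b, &a)); /* 0 */
	return 0;
}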
91089038
91099039 #ifdef CONFIG_NUMA_BALANCING
91109040 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
91119041 {
9112
- if (sgs->sum_nr_running > sgs->nr_numa_running)
9042
+ if (sgs->sum_h_nr_running > sgs->nr_numa_running)
91139043 return regular;
9114
- if (sgs->sum_nr_running > sgs->nr_preferred_running)
9044
+ if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
91159045 return remote;
91169046 return all;
91179047 }
....@@ -9136,18 +9066,338 @@
91369066 }
91379067 #endif /* CONFIG_NUMA_BALANCING */
91389068
9069
+
9070
+struct sg_lb_stats;
9071
+
9072
+/*
9073
+ * task_running_on_cpu - return 1 if @p is running on @cpu.
9074
+ */
9075
+
9076
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
9077
+{
9078
+ /* Task has no contribution or is new */
9079
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
9080
+ return 0;
9081
+
9082
+ if (task_on_rq_queued(p))
9083
+ return 1;
9084
+
9085
+ return 0;
9086
+}
9087
+
9088
+/**
9089
+ * idle_cpu_without - would a given CPU be idle without p ?
9090
+ * @cpu: the processor on which idleness is tested.
9091
+ * @p: task which should be ignored.
9092
+ *
9093
+ * Return: 1 if the CPU would be idle. 0 otherwise.
9094
+ */
9095
+static int idle_cpu_without(int cpu, struct task_struct *p)
9096
+{
9097
+ struct rq *rq = cpu_rq(cpu);
9098
+
9099
+ if (rq->curr != rq->idle && rq->curr != p)
9100
+ return 0;
9101
+
9102
+ /*
9103
+ * rq->nr_running can't be used but an updated version without the
9104
+ * impact of p on cpu must be used instead. The updated nr_running
9105
+ * must be computed and tested before calling idle_cpu_without().
9106
+ */
9107
+
9108
+#ifdef CONFIG_SMP
9109
+ if (rq->ttwu_pending)
9110
+ return 0;
9111
+#endif
9112
+
9113
+ return 1;
9114
+}
9115
+
9116
+/*
9117
+ * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
9118
+ * @sd: The sched_domain level to look for idlest group.
9119
+ * @group: sched_group whose statistics are to be updated.
9120
+ * @sgs: variable to hold the statistics for this group.
9121
+ * @p: The task for which we look for the idlest group/CPU.
9122
+ */
9123
+static inline void update_sg_wakeup_stats(struct sched_domain *sd,
9124
+ struct sched_group *group,
9125
+ struct sg_lb_stats *sgs,
9126
+ struct task_struct *p)
9127
+{
9128
+ int i, nr_running;
9129
+
9130
+ memset(sgs, 0, sizeof(*sgs));
9131
+
9132
+ /* Assume that task can't fit any CPU of the group */
9133
+ if (sd->flags & SD_ASYM_CPUCAPACITY)
9134
+ sgs->group_misfit_task_load = 1;
9135
+
9136
+ for_each_cpu(i, sched_group_span(group)) {
9137
+ struct rq *rq = cpu_rq(i);
9138
+ unsigned int local;
9139
+
9140
+ sgs->group_load += cpu_load_without(rq, p);
9141
+ sgs->group_util += cpu_util_without(i, p);
9142
+ sgs->group_runnable += cpu_runnable_without(rq, p);
9143
+ local = task_running_on_cpu(i, p);
9144
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
9145
+
9146
+ nr_running = rq->nr_running - local;
9147
+ sgs->sum_nr_running += nr_running;
9148
+
9149
+ /*
9150
+ * No need to call idle_cpu_without() if nr_running is not 0
9151
+ */
9152
+ if (!nr_running && idle_cpu_without(i, p))
9153
+ sgs->idle_cpus++;
9154
+
9155
+ /* Check if task fits in the CPU */
9156
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
9157
+ sgs->group_misfit_task_load &&
9158
+ task_fits_cpu(p, i))
9159
+ sgs->group_misfit_task_load = 0;
9160
+
9161
+ }
9162
+
9163
+ sgs->group_capacity = group->sgc->capacity;
9164
+
9165
+ sgs->group_weight = group->group_weight;
9166
+
9167
+ sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
9168
+
9169
+ /*
9170
+ * Computing avg_load makes sense only when group is fully busy or
9171
+ * overloaded
9172
+ */
9173
+ if (sgs->group_type == group_fully_busy ||
9174
+ sgs->group_type == group_overloaded)
9175
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
9176
+ sgs->group_capacity;
9177
+}
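The *_without() accounting above removes the waking task's own contribution from the CPU it is still counted on; a toy restatement of the nr_running adjustment and the idle test it feeds (all values invented).

#include <stdio.h>
#include <stdbool.h>

/* Does @cpu still count task p in its nr_running? (p queued on that CPU) */
static unsigned int task_running_on_cpu_toy(int cpu, int task_cpu, bool queued)
{
	return (cpu == task_cpu && queued) ? 1 : 0;
}

int main(void)
{
	int cpu = 3, task_cpu = 3;
	bool queued = true;
	unsigned int rq_nr_running = 1;

	unsigned int local = task_running_on_cpu_toy(cpu, task_cpu, queued);
	unsigned int nr_running = rq_nr_running - local;

	/* with its only task ignored, CPU 3 would be idle for this wakeup */
	printf("nr_running without p = %u, idle candidate = %d\n",
	       nr_running, nr_running == 0);
	return 0;
}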
9178
+
9179
+static bool update_pick_idlest(struct sched_group *idlest,
9180
+ struct sg_lb_stats *idlest_sgs,
9181
+ struct sched_group *group,
9182
+ struct sg_lb_stats *sgs)
9183
+{
9184
+ if (sgs->group_type < idlest_sgs->group_type)
9185
+ return true;
9186
+
9187
+ if (sgs->group_type > idlest_sgs->group_type)
9188
+ return false;
9189
+
9190
+ /*
9191
+ * The candidate and the current idlest group are the same type of
9192
+ * group. Let's check which one is the idlest according to the type.
9193
+ */
9194
+
9195
+ switch (sgs->group_type) {
9196
+ case group_overloaded:
9197
+ case group_fully_busy:
9198
+ /* Select the group with lowest avg_load. */
9199
+ if (idlest_sgs->avg_load <= sgs->avg_load)
9200
+ return false;
9201
+ break;
9202
+
9203
+ case group_imbalanced:
9204
+ case group_asym_packing:
9205
+ /* Those types are not used in the slow wakeup path */
9206
+ return false;
9207
+
9208
+ case group_misfit_task:
9209
+ /* Select group with the highest max capacity */
9210
+ if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
9211
+ return false;
9212
+ break;
9213
+
9214
+ case group_has_spare:
9215
+ /* Select group with most idle CPUs */
9216
+ if (idlest_sgs->idle_cpus > sgs->idle_cpus)
9217
+ return false;
9218
+
9219
+ /* Select group with lowest group_util */
9220
+ if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
9221
+ idlest_sgs->group_util <= sgs->group_util)
9222
+ return false;
9223
+
9224
+ break;
9225
+ }
9226
+
9227
+ return true;
9228
+}
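update_pick_idlest() mirrors the busiest-group pick: a lower group_type always wins, and for group_has_spare the tie-break is most idle CPUs, then lowest utilization. A minimal sketch of just that tie-break (other group types would also need avg_load, which is omitted here).

#include <stdio.h>
#include <stdbool.h>

struct idlest_stats {
	int group_type;			/* lower value = more idle */
	unsigned int idle_cpus;
	unsigned long group_util;
};

/* Only the group_has_spare tie-break is modelled for equal types. */
static bool pick_idlest(const struct idlest_stats *cand,
			const struct idlest_stats *idlest)
{
	if (cand->group_type < idlest->group_type)
		return true;
	if (cand->group_type > idlest->group_type)
		return false;
	if (idlest->idle_cpus > cand->idle_cpus)
		return false;
	if (idlest->idle_cpus == cand->idle_cpus &&
	    idlest->group_util <= cand->group_util)
		return false;
	return true;
}

int main(void)
{
	struct idlest_stats cur  = { 0, 2, 400 };	/* has spare, 2 idle CPUs */
	struct idlest_stats cand = { 0, 2, 250 };	/* same, but less util    */

	printf("%d\n", pick_idlest(&cand, &cur)); /* 1: lower util wins the tie */
	return 0;
}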
9229
+
9230
+/*
9231
+ * find_idlest_group() finds and returns the least busy CPU group within the
9232
+ * domain.
9233
+ *
9234
+ * Assumes p is allowed on at least one CPU in sd.
9235
+ */
9236
+static struct sched_group *
9237
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
9238
+{
9239
+ struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
9240
+ struct sg_lb_stats local_sgs, tmp_sgs;
9241
+ struct sg_lb_stats *sgs;
9242
+ unsigned long imbalance;
9243
+ struct sg_lb_stats idlest_sgs = {
9244
+ .avg_load = UINT_MAX,
9245
+ .group_type = group_overloaded,
9246
+ };
9247
+
9248
+ imbalance = scale_load_down(NICE_0_LOAD) *
9249
+ (sd->imbalance_pct-100) / 100;
9250
+
9251
+ do {
9252
+ int local_group;
9253
+
9254
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
9255
+ struct root_domain *rd = cpu_rq(this_cpu)->rd;
9256
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
9257
+ int level = rockchip_perf_get_level();
9258
+
9259
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
9260
+ cpub_mask && cpumask_intersects(p->cpus_ptr, cpub_mask) &&
9261
+ !cpumask_intersects(sched_group_span(group), cpub_mask))
9262
+ continue;
9263
+ }
9264
+
9265
+ /* Skip over this group if it has no CPUs allowed */
9266
+ if (!cpumask_intersects(sched_group_span(group),
9267
+ p->cpus_ptr))
9268
+ continue;
9269
+
9270
+ local_group = cpumask_test_cpu(this_cpu,
9271
+ sched_group_span(group));
9272
+
9273
+ if (local_group) {
9274
+ sgs = &local_sgs;
9275
+ local = group;
9276
+ } else {
9277
+ sgs = &tmp_sgs;
9278
+ }
9279
+
9280
+ update_sg_wakeup_stats(sd, group, sgs, p);
9281
+
9282
+ if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
9283
+ idlest = group;
9284
+ idlest_sgs = *sgs;
9285
+ }
9286
+
9287
+ } while (group = group->next, group != sd->groups);
9288
+
9289
+
9290
+ /* There is no idlest group to push tasks to */
9291
+ if (!idlest)
9292
+ return NULL;
9293
+
9294
+ /* The local group has been skipped because of CPU affinity */
9295
+ if (!local)
9296
+ return idlest;
9297
+
9298
+ /*
9299
+ * If the local group is idler than the selected idlest group
9300
+ * don't try and push the task.
9301
+ */
9302
+ if (local_sgs.group_type < idlest_sgs.group_type)
9303
+ return NULL;
9304
+
9305
+ /*
9306
+ * If the local group is busier than the selected idlest group
9307
+ * try and push the task.
9308
+ */
9309
+ if (local_sgs.group_type > idlest_sgs.group_type)
9310
+ return idlest;
9311
+
9312
+ switch (local_sgs.group_type) {
9313
+ case group_overloaded:
9314
+ case group_fully_busy:
9315
+ /*
9316
+ * When comparing groups across NUMA domains, it's possible for
9317
+ * the local domain to be very lightly loaded relative to the
9318
+ * remote domains but "imbalance" skews the comparison making
9319
+ * remote CPUs look much more favourable. When considering
9320
+ * cross-domain, add imbalance to the load on the remote node
9321
+ * and consider staying local.
9322
+ */
9323
+
9324
+ if ((sd->flags & SD_NUMA) &&
9325
+ ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
9326
+ return NULL;
9327
+
9328
+ /*
9329
+ * If the local group is less loaded than the selected
9330
+ * idlest group don't try and push any tasks.
9331
+ */
9332
+ if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
9333
+ return NULL;
9334
+
9335
+ if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
9336
+ return NULL;
9337
+ break;
9338
+
9339
+ case group_imbalanced:
9340
+ case group_asym_packing:
9341
+ /* Those types are not used in the slow wakeup path */
9342
+ return NULL;
9343
+
9344
+ case group_misfit_task:
9345
+ /* Select group with the highest max capacity */
9346
+ if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
9347
+ return NULL;
9348
+ break;
9349
+
9350
+ case group_has_spare:
9351
+ if (sd->flags & SD_NUMA) {
9352
+#ifdef CONFIG_NUMA_BALANCING
9353
+ int idlest_cpu;
9354
+ /*
9355
+ * If there is spare capacity at NUMA, try to select
9356
+ * the preferred node
9357
+ */
9358
+ if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
9359
+ return NULL;
9360
+
9361
+ idlest_cpu = cpumask_first(sched_group_span(idlest));
9362
+ if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
9363
+ return idlest;
9364
+#endif
9365
+ /*
9366
+ * Otherwise, keep the task on this node to stay close
9367
+ * to its wakeup source and improve locality. If there is
9368
+ * a real need of migration, periodic load balance will
9369
+ * take care of it.
9370
+ */
9371
+ if (local_sgs.idle_cpus)
9372
+ return NULL;
9373
+ }
9374
+
9375
+ /*
9376
+ * Select group with highest number of idle CPUs. We could also
9377
+ * compare the utilization which is more stable but it can end
9378
+ * up that the group has less spare capacity but finally more
9379
+ * idle CPUs which means more opportunity to run tasks.
9380
+ */
9381
+ if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
9382
+ return NULL;
9383
+ break;
9384
+ }
9385
+
9386
+ return idlest;
9387
+}
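For the overloaded and fully-busy cases above, the task is only pushed away when the idlest group is clearly lighter than the local one. A standalone restatement of those avg_load checks, with the margin computed the same way (NICE_0_LOAD taken as 1024 and imbalance_pct as 117 are assumptions):

#include <stdio.h>
#include <stdbool.h>

#define NICE_0_LOAD 1024UL

/* true: keep the task local (the "return NULL" paths in the code above) */
static bool stay_local(unsigned long local_avg_load,
		       unsigned long idlest_avg_load,
		       unsigned int imbalance_pct, bool numa)
{
	unsigned long margin = NICE_0_LOAD * (imbalance_pct - 100) / 100;

	if (numa && idlest_avg_load + margin >= local_avg_load)
		return true;	/* cross-node move not clearly worth it */
	if (idlest_avg_load >= local_avg_load + margin)
		return true;	/* idlest not lighter by a full margin */
	if (100 * local_avg_load <= imbalance_pct * idlest_avg_load)
		return true;	/* relative difference below imbalance_pct */
	return false;
}

int main(void)
{
	/* margin = 1024 * 17 / 100 = 174 */
	printf("%d\n", stay_local(1500, 1400, 117, false)); /* 1: too close */
	printf("%d\n", stay_local(1500, 1100, 117, false)); /* 0: push away */
	return 0;
}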
9388
+
91399389 /**
91409390 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
91419391 * @env: The load balancing environment.
91429392 * @sds: variable to hold the statistics for this sched_domain.
91439393 */
9394
+
91449395 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
91459396 {
91469397 struct sched_domain *child = env->sd->child;
91479398 struct sched_group *sg = env->sd->groups;
91489399 struct sg_lb_stats *local = &sds->local_stat;
91499400 struct sg_lb_stats tmp_sgs;
9150
- bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
91519401 int sg_status = 0;
91529402
91539403 #ifdef CONFIG_NO_HZ_COMMON
....@@ -9174,22 +9424,6 @@
91749424 if (local_group)
91759425 goto next_group;
91769426
9177
- /*
9178
- * In case the child domain prefers tasks go to siblings
9179
- * first, lower the sg capacity so that we'll try
9180
- * and move all the excess tasks away. We lower the capacity
9181
- * of a group only if the local group has the capacity to fit
9182
- * these excess tasks. The extra check prevents the case where
9183
- * you always pull from the heaviest group when it is already
9184
- * under-utilized (possible with a large weight task outweighs
9185
- * the tasks on the system).
9186
- */
9187
- if (prefer_sibling && sds->local &&
9188
- group_has_capacity(env, local) &&
9189
- (sgs->sum_nr_running > local->sum_nr_running + 1)) {
9190
- sgs->group_no_capacity = 1;
9191
- sgs->group_type = group_classify(sg, sgs);
9192
- }
91939427
91949428 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
91959429 sds->busiest = sg;
....@@ -9198,12 +9432,14 @@
91989432
91999433 next_group:
92009434 /* Now, start updating sd_lb_stats */
9201
- sds->total_running += sgs->sum_nr_running;
92029435 sds->total_load += sgs->group_load;
92039436 sds->total_capacity += sgs->group_capacity;
92049437
92059438 sg = sg->next;
92069439 } while (sg != env->sd->groups);
9440
+
9441
+ /* Tag domain that child domain prefers tasks go to siblings first */
9442
+ sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
92079443
92089444 #ifdef CONFIG_NO_HZ_COMMON
92099445 if ((env->flags & LBF_NOHZ_AGAIN) &&
....@@ -9217,8 +9453,6 @@
92179453 if (env->sd->flags & SD_NUMA)
92189454 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
92199455
9220
- env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
9221
-
92229456 if (!env->sd->parent) {
92239457 struct root_domain *rd = env->dst_rq->rd;
92249458
....@@ -9227,144 +9461,28 @@
92279461
92289462 /* Update over-utilization (tipping point, U >= 0) indicator */
92299463 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
9230
- trace_sched_overutilized(!!(sg_status & SG_OVERUTILIZED));
9464
+ trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
92319465 } else if (sg_status & SG_OVERUTILIZED) {
9232
- WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
9233
- trace_sched_overutilized(1);
9234
- }
9466
+ struct root_domain *rd = env->dst_rq->rd;
92359467
9468
+ WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
9469
+ trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
9470
+ }
92369471 }
92379472
9238
-/**
9239
- * check_asym_packing - Check to see if the group is packed into the
9240
- * sched domain.
9241
- *
9242
- * This is primarily intended to used at the sibling level. Some
9243
- * cores like POWER7 prefer to use lower numbered SMT threads. In the
9244
- * case of POWER7, it can move to lower SMT modes only when higher
9245
- * threads are idle. When in lower SMT modes, the threads will
9246
- * perform better since they share less core resources. Hence when we
9247
- * have idle threads, we want them to be the higher ones.
9248
- *
9249
- * This packing function is run on idle threads. It checks to see if
9250
- * the busiest CPU in this domain (core in the P7 case) has a higher
9251
- * CPU number than the packing function is being run on. Here we are
9252
- * assuming lower CPU number will be equivalent to lower a SMT thread
9253
- * number.
9254
- *
9255
- * Return: 1 when packing is required and a task should be moved to
9256
- * this CPU. The amount of the imbalance is returned in env->imbalance.
9257
- *
9258
- * @env: The load balancing environment.
9259
- * @sds: Statistics of the sched_domain which is to be packed
9260
- */
9261
-static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
9473
+static inline long adjust_numa_imbalance(int imbalance, int nr_running)
92629474 {
9263
- int busiest_cpu;
9264
-
9265
- if (!(env->sd->flags & SD_ASYM_PACKING))
9266
- return 0;
9267
-
9268
- if (env->idle == CPU_NOT_IDLE)
9269
- return 0;
9270
-
9271
- if (!sds->busiest)
9272
- return 0;
9273
-
9274
- busiest_cpu = sds->busiest->asym_prefer_cpu;
9275
- if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
9276
- return 0;
9277
-
9278
- env->imbalance = DIV_ROUND_CLOSEST(
9279
- sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
9280
- SCHED_CAPACITY_SCALE);
9281
-
9282
- return 1;
9283
-}
9284
-
9285
-/**
9286
- * fix_small_imbalance - Calculate the minor imbalance that exists
9287
- * amongst the groups of a sched_domain, during
9288
- * load balancing.
9289
- * @env: The load balancing environment.
9290
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
9291
- */
9292
-static inline
9293
-void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9294
-{
9295
- unsigned long tmp, capa_now = 0, capa_move = 0;
9296
- unsigned int imbn = 2;
9297
- unsigned long scaled_busy_load_per_task;
9298
- struct sg_lb_stats *local, *busiest;
9299
-
9300
- local = &sds->local_stat;
9301
- busiest = &sds->busiest_stat;
9302
-
9303
- if (!local->sum_nr_running)
9304
- local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
9305
- else if (busiest->load_per_task > local->load_per_task)
9306
- imbn = 1;
9307
-
9308
- scaled_busy_load_per_task =
9309
- (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9310
- busiest->group_capacity;
9311
-
9312
- if (busiest->avg_load + scaled_busy_load_per_task >=
9313
- local->avg_load + (scaled_busy_load_per_task * imbn)) {
9314
- env->imbalance = busiest->load_per_task;
9315
- return;
9316
- }
9475
+ unsigned int imbalance_min;
93179476
93189477 /*
9319
- * OK, we don't have enough imbalance to justify moving tasks,
9320
- * however we may be able to increase total CPU capacity used by
9321
- * moving them.
9478
+ * Allow a small imbalance based on a simple pair of communicating
9479
+ * tasks that remain local when the source domain is almost idle.
93229480 */
9481
+ imbalance_min = 2;
9482
+ if (nr_running <= imbalance_min)
9483
+ return 0;
93239484
9324
- capa_now += busiest->group_capacity *
9325
- min(busiest->load_per_task, busiest->avg_load);
9326
- capa_now += local->group_capacity *
9327
- min(local->load_per_task, local->avg_load);
9328
- capa_now /= SCHED_CAPACITY_SCALE;
9329
-
9330
- /* Amount of load we'd subtract */
9331
- if (busiest->avg_load > scaled_busy_load_per_task) {
9332
- capa_move += busiest->group_capacity *
9333
- min(busiest->load_per_task,
9334
- busiest->avg_load - scaled_busy_load_per_task);
9335
- }
9336
-
9337
- /* Amount of load we'd add */
9338
- if (busiest->avg_load * busiest->group_capacity <
9339
- busiest->load_per_task * SCHED_CAPACITY_SCALE) {
9340
- tmp = (busiest->avg_load * busiest->group_capacity) /
9341
- local->group_capacity;
9342
- } else {
9343
- tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9344
- local->group_capacity;
9345
- }
9346
- capa_move += local->group_capacity *
9347
- min(local->load_per_task, local->avg_load + tmp);
9348
- capa_move /= SCHED_CAPACITY_SCALE;
9349
-
9350
- /* Move if we gain throughput */
9351
- if (capa_move > capa_now) {
9352
- env->imbalance = busiest->load_per_task;
9353
- return;
9354
- }
9355
-
9356
- /* We can't see throughput improvement with the load-based
9357
- * method, but it is possible depending upon group size and
9358
- * capacity range that there might still be an underutilized
9359
- * cpu available in an asymmetric capacity system. Do one last
9360
- * check just in case.
9361
- */
9362
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
9363
- busiest->group_type == group_overloaded &&
9364
- busiest->sum_nr_running > busiest->group_weight &&
9365
- local->sum_nr_running < local->group_weight &&
9366
- local->group_capacity < busiest->group_capacity)
9367
- env->imbalance = busiest->load_per_task;
9485
+ return imbalance;
93689486 }
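adjust_numa_imbalance() above simply forgives a small imbalance when the source node is nearly idle, so a communicating pair of tasks is not split across nodes. A trivial restatement with example values:

#include <stdio.h>

/* Keep up to a pair of tasks local on an almost-idle source node. */
static long adjust_numa_imbalance_toy(long imbalance, int src_nr_running)
{
	const int imbalance_min = 2;

	if (src_nr_running <= imbalance_min)
		return 0;
	return imbalance;
}

int main(void)
{
	printf("%ld\n", adjust_numa_imbalance_toy(3, 2)); /* 0: leave them be     */
	printf("%ld\n", adjust_numa_imbalance_toy(3, 6)); /* 3: balance as usual  */
	return 0;
}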
93699487
93709488 /**
....@@ -9375,96 +9493,180 @@
93759493 */
93769494 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
93779495 {
9378
- unsigned long max_pull, load_above_capacity = ~0UL;
93799496 struct sg_lb_stats *local, *busiest;
93809497
93819498 local = &sds->local_stat;
93829499 busiest = &sds->busiest_stat;
93839500
9501
+ if (busiest->group_type == group_misfit_task) {
9502
+ /* Set imbalance to allow misfit tasks to be balanced. */
9503
+ env->migration_type = migrate_misfit;
9504
+ env->imbalance = 1;
9505
+ return;
9506
+ }
9507
+
9508
+ if (busiest->group_type == group_asym_packing) {
9509
+ /*
9510
+ * In case of asym capacity, we will try to migrate all load to
9511
+ * the preferred CPU.
9512
+ */
9513
+ env->migration_type = migrate_task;
9514
+ env->imbalance = busiest->sum_h_nr_running;
9515
+ return;
9516
+ }
9517
+
93849518 if (busiest->group_type == group_imbalanced) {
93859519 /*
93869520 * In the group_imb case we cannot rely on group-wide averages
9387
- * to ensure CPU-load equilibrium, look at wider averages. XXX
9521
+ * to ensure CPU-load equilibrium, try to move any task to fix
9522
+ * the imbalance. The next load balance will take care of
9523
+ * balancing back the system.
93889524 */
9389
- busiest->load_per_task =
9390
- min(busiest->load_per_task, sds->avg_load);
9525
+ env->migration_type = migrate_task;
9526
+ env->imbalance = 1;
9527
+ return;
93919528 }
93929529
93939530 /*
9394
- * Avg load of busiest sg can be less and avg load of local sg can
9395
- * be greater than avg load across all sgs of sd because avg load
9396
- * factors in sg capacity and sgs with smaller group_type are
9397
- * skipped when updating the busiest sg:
9531
+ * Try to use spare capacity of local group without overloading it or
9532
+ * emptying busiest.
93989533 */
9399
- if (busiest->group_type != group_misfit_task &&
9400
- (busiest->avg_load <= sds->avg_load ||
9401
- local->avg_load >= sds->avg_load)) {
9402
- env->imbalance = 0;
9403
- return fix_small_imbalance(env, sds);
9534
+ if (local->group_type == group_has_spare) {
9535
+ if ((busiest->group_type > group_fully_busy) &&
9536
+ !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
9537
+ /*
9538
+ * If busiest is overloaded, try to fill spare
9539
+ * capacity. This might end up creating spare capacity
9540
+ * in busiest or busiest still being overloaded but
9541
+ * there is no simple way to directly compute the
9542
+ * amount of load to migrate in order to balance the
9543
+ * system.
9544
+ */
9545
+ env->migration_type = migrate_util;
9546
+ env->imbalance = max(local->group_capacity, local->group_util) -
9547
+ local->group_util;
9548
+
9549
+ /*
9550
+ * In some cases, the group's utilization is max or even
9551
+ * higher than capacity because of migrations but the
9552
+ * local CPU is (newly) idle. There is at least one
9553
+ * waiting task in this overloaded busiest group. Let's
9554
+ * try to pull it.
9555
+ */
9556
+ if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
9557
+ env->migration_type = migrate_task;
9558
+ env->imbalance = 1;
9559
+ }
9560
+
9561
+ return;
9562
+ }
9563
+
9564
+ if (busiest->group_weight == 1 || sds->prefer_sibling) {
9565
+ unsigned int nr_diff = busiest->sum_nr_running;
9566
+ /*
9567
+ * When prefer sibling, evenly spread running tasks on
9568
+ * groups.
9569
+ */
9570
+ env->migration_type = migrate_task;
9571
+ lsub_positive(&nr_diff, local->sum_nr_running);
9572
+ env->imbalance = nr_diff >> 1;
9573
+ } else {
9574
+
9575
+ /*
9576
+ * If there is no overload, we just want to even the number of
9577
+ * idle cpus.
9578
+ */
9579
+ env->migration_type = migrate_task;
9580
+ env->imbalance = max_t(long, 0, (local->idle_cpus -
9581
+ busiest->idle_cpus) >> 1);
9582
+ }
9583
+
9584
+ /* Consider allowing a small imbalance between NUMA groups */
9585
+ if (env->sd->flags & SD_NUMA)
9586
+ env->imbalance = adjust_numa_imbalance(env->imbalance,
9587
+ busiest->sum_nr_running);
9588
+
9589
+ return;
94049590 }
94059591
94069592 /*
9407
- * If there aren't any idle CPUs, avoid creating some.
9593
+ * Local is fully busy but has to take more load to relieve the
9594
+ * busiest group
94089595 */
9409
- if (busiest->group_type == group_overloaded &&
9410
- local->group_type == group_overloaded) {
9411
- load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
9412
- if (load_above_capacity > busiest->group_capacity) {
9413
- load_above_capacity -= busiest->group_capacity;
9414
- load_above_capacity *= scale_load_down(NICE_0_LOAD);
9415
- load_above_capacity /= busiest->group_capacity;
9416
- } else
9417
- load_above_capacity = ~0UL;
9596
+ if (local->group_type < group_overloaded) {
9597
+ /*
9598
+ * Local will become overloaded so the avg_load metrics are
9599
+ * finally needed.
9600
+ */
9601
+
9602
+ local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
9603
+ local->group_capacity;
9604
+
9605
+ /*
9606
+ * If the local group is more loaded than the selected
9607
+ * busiest group don't try to pull any tasks.
9608
+ */
9609
+ if (local->avg_load >= busiest->avg_load) {
9610
+ env->imbalance = 0;
9611
+ return;
9612
+ }
9613
+
9614
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
9615
+ sds->total_capacity;
9616
+
9617
+ /*
9618
+ * If the local group is more loaded than the average system
9619
+ * load, don't try to pull any tasks.
9620
+ */
9621
+ if (local->avg_load >= sds->avg_load) {
9622
+ env->imbalance = 0;
9623
+ return;
9624
+ }
9625
+
94189626 }
94199627
94209628 /*
9421
- * We're trying to get all the CPUs to the average_load, so we don't
9422
- * want to push ourselves above the average load, nor do we wish to
9423
- * reduce the max loaded CPU below the average load. At the same time,
9424
- * we also don't want to reduce the group load below the group
9425
- * capacity. Thus we look for the minimum possible imbalance.
9629
+ * Both groups are or will become overloaded and we're trying to get all
9630
+ * the CPUs to the average_load, so we don't want to push ourselves
9631
+ * above the average load, nor do we wish to reduce the max loaded CPU
9632
+ * below the average load. At the same time, we also don't want to
9633
+ * reduce the group load below the group capacity. Thus we look for
9634
+ * the minimum possible imbalance.
94269635 */
9427
- max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
9428
-
9429
- /* How much load to actually move to equalise the imbalance */
9636
+ env->migration_type = migrate_load;
94309637 env->imbalance = min(
9431
- max_pull * busiest->group_capacity,
9638
+ (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
94329639 (sds->avg_load - local->avg_load) * local->group_capacity
94339640 ) / SCHED_CAPACITY_SCALE;
9434
-
9435
- /* Boost imbalance to allow misfit task to be balanced.
9436
- * Always do this if we are doing a NEWLY_IDLE balance
9437
- * on the assumption that any tasks we have must not be
9438
- * long-running (and hence we cannot rely upon load).
9439
- * However if we are not idle, we should assume the tasks
9440
- * we have are longer running and not override load-based
9441
- * calculations above unless we are sure that the local
9442
- * group is underutilized.
9443
- */
9444
- if (busiest->group_type == group_misfit_task &&
9445
- (env->idle == CPU_NEWLY_IDLE ||
9446
- local->sum_nr_running < local->group_weight)) {
9447
- env->imbalance = max_t(long, env->imbalance,
9448
- busiest->group_misfit_task_load);
9449
- }
9450
-
9451
- /*
9452
- * if *imbalance is less than the average load per runnable task
9453
- * there is no guarantee that any tasks will be moved so we'll have
9454
- * a think about bumping its value to force at least one task to be
9455
- * moved
9456
- */
9457
- if (env->imbalance < busiest->load_per_task)
9458
- return fix_small_imbalance(env, sds);
94599641 }
94609642
94619643 /******* find_busiest_group() helpers end here *********************/
9644
+
9645
+/*
9646
+ * Decision matrix according to the local and busiest group type:
9647
+ *
9648
+ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
9649
+ * has_spare nr_idle balanced N/A N/A balanced balanced
9650
+ * fully_busy nr_idle nr_idle N/A N/A balanced balanced
9651
+ * misfit_task force N/A N/A N/A force force
9652
+ * asym_packing force force N/A N/A force force
9653
+ * imbalanced force force N/A N/A force force
9654
+ * overloaded force force N/A N/A force avg_load
9655
+ *
9656
+ * N/A : Not Applicable because already filtered while updating
9657
+ * statistics.
9658
+ * balanced : The system is balanced for these 2 groups.
9659
+ * force : Calculate the imbalance as load migration is probably needed.
9660
+ * avg_load : Only if imbalance is significant enough.
9661
+ * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
9662
+ * different in groups.
9663
+ */
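The matrix reads as a straight lookup: pick the row for the busiest group, the column for the local group. An illustrative encoding as a C table follows; the enum labels mirror the names in the comment but are not the kernel's group_type values.

#include <stdio.h>

/* Illustrative encoding of the decision matrix above. */
enum gtype { HAS_SPARE, FULLY_BUSY, MISFIT, ASYM, IMBALANCED, OVERLOADED, NR_GTYPE };

static const char * const decision[NR_GTYPE][NR_GTYPE] = {
	/* busiest rows, local columns: has_spare, fully_busy, misfit, asym, imbalanced, overloaded */
	[HAS_SPARE]  = { "nr_idle", "balanced", "N/A", "N/A", "balanced", "balanced" },
	[FULLY_BUSY] = { "nr_idle", "nr_idle",  "N/A", "N/A", "balanced", "balanced" },
	[MISFIT]     = { "force",   "N/A",      "N/A", "N/A", "force",    "force"    },
	[ASYM]       = { "force",   "force",    "N/A", "N/A", "force",    "force"    },
	[IMBALANCED] = { "force",   "force",    "N/A", "N/A", "force",    "force"    },
	[OVERLOADED] = { "force",   "force",    "N/A", "N/A", "force",    "avg_load" },
};

int main(void)
{
	printf("%s\n", decision[OVERLOADED][OVERLOADED]);	/* avg_load */
	printf("%s\n", decision[MISFIT][HAS_SPARE]);		/* force    */
	return 0;
}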
94629664
94639665 /**
94649666 * find_busiest_group - Returns the busiest group within the sched_domain
94659667 * if there is an imbalance.
94669668 *
9467
- * Also calculates the amount of weighted load which should be moved
9669
+ * Also calculates the amount of runnable load which should be moved
94689670 * to restore balance.
94699671 *
94709672 * @env: The load balancing environment.
....@@ -9479,32 +9681,36 @@
94799681 init_sd_lb_stats(&sds);
94809682
94819683 /*
9482
- * Compute the various statistics relavent for load balancing at
9684
+ * Compute the various statistics relevant for load balancing at
94839685 * this level.
94849686 */
94859687 update_sd_lb_stats(env, &sds);
94869688
9487
- if (static_branch_unlikely(&sched_energy_present)) {
9689
+ if (sched_energy_enabled()) {
94889690 struct root_domain *rd = env->dst_rq->rd;
9691
+ int out_balance = 1;
94899692
9490
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
9693
+ trace_android_rvh_find_busiest_group(sds.busiest, env->dst_rq,
9694
+ &out_balance);
9695
+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)
9696
+ && out_balance)
94919697 goto out_balanced;
94929698 }
94939699
94949700 local = &sds.local_stat;
94959701 busiest = &sds.busiest_stat;
94969702
9497
- /* ASYM feature bypasses nice load balance check */
9498
- if (check_asym_packing(env, &sds))
9499
- return sds.busiest;
9500
-
95019703 /* There is no busy sibling group to pull tasks from */
9502
- if (!sds.busiest || busiest->sum_nr_running == 0)
9704
+ if (!sds.busiest)
95039705 goto out_balanced;
95049706
9505
- /* XXX broken for overlapping NUMA groups */
9506
- sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
9507
- / sds.total_capacity;
9707
+ /* Misfit tasks should be dealt with regardless of the avg load */
9708
+ if (busiest->group_type == group_misfit_task)
9709
+ goto force_balance;
9710
+
9711
+ /* ASYM feature bypasses nice load balance check */
9712
+ if (busiest->group_type == group_asym_packing)
9713
+ goto force_balance;
95089714
95099715 /*
95109716 * If the busiest group is imbalanced the below checks don't
....@@ -9515,55 +9721,80 @@
95159721 goto force_balance;
95169722
95179723 /*
9518
- * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
9519
- * capacities from resulting in underutilization due to avg_load.
9520
- */
9521
- if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
9522
- busiest->group_no_capacity)
9523
- goto force_balance;
9524
-
9525
- /* Misfit tasks should be dealt with regardless of the avg load */
9526
- if (busiest->group_type == group_misfit_task)
9527
- goto force_balance;
9528
-
9529
- /*
95309724 * If the local group is busier than the selected busiest group
95319725 * don't try and pull any tasks.
95329726 */
9533
- if (local->avg_load >= busiest->avg_load)
9727
+ if (local->group_type > busiest->group_type)
95349728 goto out_balanced;
95359729
95369730 /*
9537
- * Don't pull any tasks if this group is already above the domain
9538
- * average load.
9731
+ * When groups are overloaded, use the avg_load to ensure fairness
9732
+ * between tasks.
95399733 */
9540
- if (local->avg_load >= sds.avg_load)
9541
- goto out_balanced;
9542
-
9543
- if (env->idle == CPU_IDLE) {
9734
+ if (local->group_type == group_overloaded) {
95449735 /*
9545
- * This CPU is idle. If the busiest group is not overloaded
9546
- * and there is no imbalance between this and busiest group
9547
- * wrt idle CPUs, it is balanced. The imbalance becomes
9548
- * significant if the diff is greater than 1 otherwise we
9549
- * might end up to just move the imbalance on another group
9736
+ * If the local group is more loaded than the selected
9737
+ * busiest group don't try to pull any tasks.
95509738 */
9551
- if ((busiest->group_type != group_overloaded) &&
9552
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
9739
+ if (local->avg_load >= busiest->avg_load)
95539740 goto out_balanced;
9554
- } else {
9741
+
9742
+ /* XXX broken for overlapping NUMA groups */
9743
+ sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
9744
+ sds.total_capacity;
9745
+
95559746 /*
9556
- * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
9557
- * imbalance_pct to be conservative.
9747
+ * Don't pull any tasks if this group is already above the
9748
+ * domain average load.
9749
+ */
9750
+ if (local->avg_load >= sds.avg_load)
9751
+ goto out_balanced;
9752
+
9753
+ /*
9754
+ * If the busiest group is more loaded, use imbalance_pct to be
9755
+ * conservative.
95589756 */
95599757 if (100 * busiest->avg_load <=
95609758 env->sd->imbalance_pct * local->avg_load)
95619759 goto out_balanced;
95629760 }
95639761
9762
+ /* Try to move all excess tasks to child's sibling domain */
9763
+ if (sds.prefer_sibling && local->group_type == group_has_spare &&
9764
+ busiest->sum_nr_running > local->sum_nr_running + 1)
9765
+ goto force_balance;
9766
+
9767
+ if (busiest->group_type != group_overloaded) {
9768
+ if (env->idle == CPU_NOT_IDLE)
9769
+ /*
9770
+ * If the busiest group is not overloaded (and as a
9771
+ * result the local one too) but this CPU is already
9772
+ * busy, let another idle CPU try to pull task.
9773
+ */
9774
+ goto out_balanced;
9775
+
9776
+ if (busiest->group_weight > 1 &&
9777
+ local->idle_cpus <= (busiest->idle_cpus + 1))
9778
+ /*
9779
+ * If the busiest group is not overloaded
9780
+ * and there is no imbalance between this and busiest
9781
+ * group wrt idle CPUs, it is balanced. The imbalance
9782
+ * becomes significant if the diff is greater than 1
9783
+ * otherwise we might end up just moving the imbalance
9784
+ * onto another group. Of course this applies only if
9785
+ * there is more than 1 CPU per group.
9786
+ */
9787
+ goto out_balanced;
9788
+
9789
+ if (busiest->sum_h_nr_running == 1)
9790
+ /*
9791
+ * busiest doesn't have any tasks waiting to run
9792
+ */
9793
+ goto out_balanced;
9794
+ }
9795
+
95649796 force_balance:
95659797 /* Looks like there is an imbalance. Compute it */
9566
- env->src_grp_type = busiest->group_type;
95679798 calculate_imbalance(env, &sds);
95689799 return env->imbalance ? sds.busiest : NULL;
95699800
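In the overloaded case above, the "conservative" test requires busiest to exceed local by the domain's imbalance_pct margin before any pull happens. A small sketch of that comparison, using a hypothetical imbalance_pct of 117; real domains carry their own value.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model of the "use imbalance_pct to be conservative"
 * test above: balance only if busiest exceeds local by the margin. */
static bool worth_pulling(unsigned long busiest_avg, unsigned long local_avg,
			  unsigned int imbalance_pct)
{
	return 100 * busiest_avg > imbalance_pct * local_avg;
}

int main(void)
{
	printf("%d\n", worth_pulling(1100, 1000, 117));	/* 0: within the margin */
	printf("%d\n", worth_pulling(1200, 1000, 117));	/* 1: imbalance is real */
	return 0;
}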
....@@ -9579,11 +9810,18 @@
95799810 struct sched_group *group)
95809811 {
95819812 struct rq *busiest = NULL, *rq;
9582
- unsigned long busiest_load = 0, busiest_capacity = 1;
9583
- int i;
9813
+ unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
9814
+ unsigned int busiest_nr = 0;
9815
+ int i, done = 0;
9816
+
9817
+ trace_android_rvh_find_busiest_queue(env->dst_cpu, group, env->cpus,
9818
+ &busiest, &done);
9819
+ if (done)
9820
+ return busiest;
95849821
95859822 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
9586
- unsigned long capacity, wl;
9823
+ unsigned long capacity, load, util;
9824
+ unsigned int nr_running;
95879825 enum fbq_type rt;
95889826
95899827 rq = cpu_rq(i);
....@@ -9611,20 +9849,8 @@
96119849 if (rt > env->fbq_type)
96129850 continue;
96139851
9614
- /*
9615
- * For ASYM_CPUCAPACITY domains with misfit tasks we simply
9616
- * seek the "biggest" misfit task.
9617
- */
9618
- if (env->src_grp_type == group_misfit_task) {
9619
- if (rq->misfit_task_load > busiest_load) {
9620
- busiest_load = rq->misfit_task_load;
9621
- busiest = rq;
9622
- }
9623
-
9624
- continue;
9625
- }
9626
-
96279852 capacity = capacity_of(i);
9853
+ nr_running = rq->cfs.h_nr_running;
96289854
96299855 /*
96309856 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
....@@ -9634,35 +9860,77 @@
96349860 */
96359861 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
96369862 capacity_of(env->dst_cpu) < capacity &&
9637
- rq->nr_running == 1)
9863
+ nr_running == 1)
96389864 continue;
96399865
9640
- wl = weighted_cpuload(rq);
9866
+ switch (env->migration_type) {
9867
+ case migrate_load:
9868
+ /*
9869
+ * When comparing with load imbalance, use cpu_load()
9870
+ * which is not scaled with the CPU capacity.
9871
+ */
9872
+ load = cpu_load(rq);
96419873
9642
- /*
9643
- * When comparing with imbalance, use weighted_cpuload()
9644
- * which is not scaled with the CPU capacity.
9645
- */
9874
+ if (nr_running == 1 && load > env->imbalance &&
9875
+ !check_cpu_capacity(rq, env->sd))
9876
+ break;
96469877
9647
- if (rq->nr_running == 1 && wl > env->imbalance &&
9648
- !check_cpu_capacity(rq, env->sd))
9649
- continue;
9878
+ /*
9879
+ * For the load comparisons with the other CPUs,
9880
+ * consider the cpu_load() scaled with the CPU
9881
+ * capacity, so that the load can be moved away
9882
+ * from the CPU that is potentially running at a
9883
+ * lower capacity.
9884
+ *
9885
+ * Thus we're looking for max(load_i / capacity_i),
9886
+ * crosswise multiplication to rid ourselves of the
9887
+ * division works out to:
9888
+ * load_i * capacity_j > load_j * capacity_i;
9889
+ * where j is our previous maximum.
9890
+ */
9891
+ if (load * busiest_capacity > busiest_load * capacity) {
9892
+ busiest_load = load;
9893
+ busiest_capacity = capacity;
9894
+ busiest = rq;
9895
+ }
9896
+ break;
96509897
9651
- /*
9652
- * For the load comparisons with the other CPU's, consider
9653
- * the weighted_cpuload() scaled with the CPU capacity, so
9654
- * that the load can be moved away from the CPU that is
9655
- * potentially running at a lower capacity.
9656
- *
9657
- * Thus we're looking for max(wl_i / capacity_i), crosswise
9658
- * multiplication to rid ourselves of the division works out
9659
- * to: wl_i * capacity_j > wl_j * capacity_i; where j is
9660
- * our previous maximum.
9661
- */
9662
- if (wl * busiest_capacity > busiest_load * capacity) {
9663
- busiest_load = wl;
9664
- busiest_capacity = capacity;
9665
- busiest = rq;
9898
+ case migrate_util:
9899
+ util = cpu_util(cpu_of(rq));
9900
+
9901
+ /*
9902
+ * Don't try to pull utilization from a CPU with one
9903
+ * running task. Whatever its utilization, we will fail
9904
+ * to detach the task.
9905
+ */
9906
+ if (nr_running <= 1)
9907
+ continue;
9908
+
9909
+ if (busiest_util < util) {
9910
+ busiest_util = util;
9911
+ busiest = rq;
9912
+ }
9913
+ break;
9914
+
9915
+ case migrate_task:
9916
+ if (busiest_nr < nr_running) {
9917
+ busiest_nr = nr_running;
9918
+ busiest = rq;
9919
+ }
9920
+ break;
9921
+
9922
+ case migrate_misfit:
9923
+ /*
9924
+ * For ASYM_CPUCAPACITY domains with misfit tasks we
9925
+ * simply seek the "biggest" misfit task.
9926
+ */
9927
+ if (rq->misfit_task_load > busiest_load) {
9928
+ busiest_load = rq->misfit_task_load;
9929
+ busiest = rq;
9930
+ }
9931
+
9932
+ break;
9933
+
96669934 }
96679935 }
96689936
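The migrate_load case keeps the runqueue with the highest load-to-capacity ratio, and the comment's cross-multiplication avoids the integer division entirely. The same comparison on plain numbers (values hypothetical):

#include <stdio.h>

/* Illustrative scan for max(load_i / capacity_i) using the same
 * cross-multiplication as the migrate_load case above, so no
 * integer division (and no rounding loss) is involved. */
int main(void)
{
	unsigned long load[]     = { 800, 600, 900 };	/* hypothetical values */
	unsigned long capacity[] = { 1024, 512, 1024 };
	unsigned long best_load = 0, best_cap = 1, best = 0;

	for (unsigned long i = 0; i < 3; i++) {
		/* load[i]/capacity[i] > best_load/best_cap, without dividing */
		if (load[i] * best_cap > best_load * capacity[i]) {
			best_load = load[i];
			best_cap = capacity[i];
			best = i;
		}
	}
	printf("busiest index: %lu\n", best);	/* 1: 600/512 is the highest ratio */
	return 0;
}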
....@@ -9675,21 +9943,25 @@
96759943 */
96769944 #define MAX_PINNED_INTERVAL 512
96779945
9678
-static int need_active_balance(struct lb_env *env)
9946
+static inline bool
9947
+asym_active_balance(struct lb_env *env)
9948
+{
9949
+ /*
9950
+ * ASYM_PACKING needs to force migrate tasks from busy but
9951
+ * lower priority CPUs in order to pack all tasks in the
9952
+ * highest priority CPUs.
9953
+ */
9954
+ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
9955
+ sched_asym_prefer(env->dst_cpu, env->src_cpu);
9956
+}
9957
+
9958
+static inline bool
9959
+voluntary_active_balance(struct lb_env *env)
96799960 {
96809961 struct sched_domain *sd = env->sd;
96819962
9682
- if (env->idle == CPU_NEWLY_IDLE) {
9683
-
9684
- /*
9685
- * ASYM_PACKING needs to force migrate tasks from busy but
9686
- * lower priority CPUs in order to pack all tasks in the
9687
- * highest priority CPUs.
9688
- */
9689
- if ((sd->flags & SD_ASYM_PACKING) &&
9690
- sched_asym_prefer(env->dst_cpu, env->src_cpu))
9691
- return 1;
9692
- }
9963
+ if (asym_active_balance(env))
9964
+ return 1;
96939965
96949966 /*
96959967 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
....@@ -9704,19 +9976,18 @@
97049976 return 1;
97059977 }
97069978
9707
- if (env->src_grp_type == group_misfit_task)
9979
+ if (env->migration_type == migrate_misfit)
97089980 return 1;
97099981
9710
- if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
9711
- env->src_rq->cfs.h_nr_running == 1 &&
9712
- cpu_overutilized(env->src_cpu) &&
9713
- !cpu_overutilized(env->dst_cpu)) {
9714
- return 1;
9715
- }
9982
+ return 0;
9983
+}
97169984
9717
- if (env->src_grp_type == group_overloaded && env->src_rq->misfit_task_load)
9718
- return 1;
9985
+static int need_active_balance(struct lb_env *env)
9986
+{
9987
+ struct sched_domain *sd = env->sd;
97199988
9989
+ if (voluntary_active_balance(env))
9990
+ return 1;
97209991
97219992 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
97229993 }
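After the refactor, active balancing triggers either for one of the "voluntary" reasons split out above or as an escalation once regular balancing has failed repeatedly. A compact user-space model of that decision; the struct and field names here are illustrative only.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model of the refactored decision above: active balance
 * is either "voluntary" (asym packing, reduced capacity, misfit) or an
 * escalation after repeated failed regular attempts. */
struct model_env {
	bool voluntary_reason;		/* any of the checks split out above */
	unsigned int nr_balance_failed;
	unsigned int cache_nice_tries;
};

static bool model_need_active_balance(const struct model_env *env)
{
	if (env->voluntary_reason)
		return true;
	return env->nr_balance_failed > env->cache_nice_tries + 2;
}

int main(void)
{
	struct model_env quiet = { false, 1, 1 };
	struct model_env stuck = { false, 4, 1 };

	printf("%d %d\n", model_need_active_balance(&quiet),
			  model_need_active_balance(&stuck));	/* 0 1 */
	return 0;
}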
....@@ -9726,7 +9997,17 @@
97269997 static int should_we_balance(struct lb_env *env)
97279998 {
97289999 struct sched_group *sg = env->sd->groups;
9729
- int cpu, balance_cpu = -1;
10000
+ int cpu;
10001
+
10002
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
10003
+ struct root_domain *rd = env->dst_rq->rd;
10004
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
10005
+ int level = rockchip_perf_get_level();
10006
+
10007
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
10008
+ cpul_mask && cpumask_test_cpu(env->dst_cpu, cpul_mask))
10009
+ return 0;
10010
+ }
973010011
973110012 /*
973210013 * Ensure the balancing environment is consistent; can happen
....@@ -9747,18 +10028,12 @@
974710028 if (!idle_cpu(cpu))
974810029 continue;
974910030
9750
- balance_cpu = cpu;
9751
- break;
10031
+ /* Are we the first idle CPU? */
10032
+ return cpu == env->dst_cpu;
975210033 }
975310034
9754
- if (balance_cpu == -1)
9755
- balance_cpu = group_balance_cpu(sg);
9756
-
9757
- /*
9758
- * First idle CPU or the first CPU(busiest) in this sched group
9759
- * is eligible for doing load balancing at this and above domains.
9760
- */
9761
- return balance_cpu == env->dst_cpu;
10035
+ /* Are we the first CPU of this group ? */
10036
+ return group_balance_cpu(sg) == env->dst_cpu;
976210037 }
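The rewrite above lets the first idle CPU of the group claim the balancing work and falls back to the group's nominal balance CPU only when nobody is idle. A sketch of the same selection over a small array; the data layout is hypothetical.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model of the rewritten selection above: the first idle
 * CPU in the group wins; with no idle CPU, a designated fallback does. */
static bool model_should_we_balance(int dst_cpu, const bool *idle, int ncpus,
				    int group_balance_cpu)
{
	for (int cpu = 0; cpu < ncpus; cpu++) {
		if (!idle[cpu])
			continue;
		return cpu == dst_cpu;		/* are we the first idle CPU? */
	}
	return group_balance_cpu == dst_cpu;	/* no idle CPU in the group   */
}

int main(void)
{
	bool idle[4] = { false, true, true, false };

	printf("%d\n", model_should_we_balance(1, idle, 4, 0));	/* 1: CPU1 is first idle */
	printf("%d\n", model_should_we_balance(2, idle, 4, 0));	/* 0: defer to CPU1      */
	return 0;
}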
976310038
976410039 /*
....@@ -9780,7 +10055,7 @@
978010055 .sd = sd,
978110056 .dst_cpu = this_cpu,
978210057 .dst_rq = this_rq,
9783
- .dst_grpmask = sched_group_span(sd->groups),
10058
+ .dst_grpmask = group_balance_mask(sd->groups),
978410059 .idle = idle,
978510060 .loop_break = sched_nr_migrate_break,
978610061 .cpus = cpus,
....@@ -9830,6 +10105,7 @@
983010105
983110106 more_balance:
983210107 rq_lock_irqsave(busiest, &rf);
10108
+ env.src_rq_rf = &rf;
983310109 update_rq_clock(busiest);
983410110
983510111 /*
....@@ -9882,7 +10158,7 @@
988210158 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
988310159
988410160 /* Prevent to re-select dst_cpu via env's CPUs */
9885
- cpumask_clear_cpu(env.dst_cpu, env.cpus);
10161
+ __cpumask_clear_cpu(env.dst_cpu, env.cpus);
988610162
988710163 env.dst_rq = cpu_rq(env.new_dst_cpu);
988810164 env.dst_cpu = env.new_dst_cpu;
....@@ -9909,7 +10185,7 @@
990910185
991010186 /* All tasks on this runqueue were pinned by CPU affinity */
991110187 if (unlikely(env.flags & LBF_ALL_PINNED)) {
9912
- cpumask_clear_cpu(cpu_of(busiest), cpus);
10188
+ __cpumask_clear_cpu(cpu_of(busiest), cpus);
991310189 /*
991410190 * Attempting to continue load balancing at the current
991510191 * sched_domain level only makes sense if there are
....@@ -9936,8 +10212,7 @@
993610212 * excessive cache_hot migrations and active balances.
993710213 */
993810214 if (idle != CPU_NEWLY_IDLE)
9939
- if (env.src_grp_nr_running > 1)
9940
- sd->nr_balance_failed++;
10215
+ sd->nr_balance_failed++;
994110216
994210217 if (need_active_balance(&env)) {
994310218 unsigned long flags;
....@@ -9980,7 +10255,7 @@
998010255 } else
998110256 sd->nr_balance_failed = 0;
998210257
9983
- if (likely(!active_balance)) {
10258
+ if (likely(!active_balance) || voluntary_active_balance(&env)) {
998410259 /* We were unbalanced, so reset the balancing interval */
998510260 sd->balance_interval = sd->min_interval;
998610261 } else {
....@@ -10023,18 +10298,18 @@
1002310298 ld_moved = 0;
1002410299
1002510300 /*
10026
- * idle_balance() disregards balance intervals, so we could repeatedly
10027
- * reach this code, which would lead to balance_interval skyrocketting
10028
- * in a short amount of time. Skip the balance_interval increase logic
10029
- * to avoid that.
10301
+ * newidle_balance() disregards balance intervals, so we could
10302
+ * repeatedly reach this code, which would lead to balance_interval
10303
+ * skyrocketing in a short amount of time. Skip the balance_interval
10304
+ * increase logic to avoid that.
1003010305 */
1003110306 if (env.idle == CPU_NEWLY_IDLE)
1003210307 goto out;
1003310308
1003410309 /* tune up the balancing interval */
10035
- if (((env.flags & LBF_ALL_PINNED) &&
10036
- sd->balance_interval < MAX_PINNED_INTERVAL) ||
10037
- (sd->balance_interval < sd->max_interval))
10310
+ if ((env.flags & LBF_ALL_PINNED &&
10311
+ sd->balance_interval < MAX_PINNED_INTERVAL) ||
10312
+ sd->balance_interval < sd->max_interval)
1003810313 sd->balance_interval *= 2;
1003910314 out:
1004010315 return ld_moved;
....@@ -10050,6 +10325,15 @@
1005010325
1005110326 /* scale ms to jiffies */
1005210327 interval = msecs_to_jiffies(interval);
10328
+
10329
+ /*
10330
+ * Reduce likelihood of busy balancing at higher domains racing with
10331
+ * balancing at lower domains by preventing their balancing periods
10332
+ * from being multiples of each other.
10333
+ */
10334
+ if (cpu_busy)
10335
+ interval -= 1;
10336
+
1005310337 interval = clamp(interval, 1UL, max_load_balance_interval);
1005410338
1005510339 return interval;
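The added lines shave one jiffy off the interval on busy CPUs so that balance periods of nested domains stop being exact multiples of each other. A standalone model of the ms-to-jiffies scaling, the busy offset and the clamp; the HZ value is hypothetical.

#include <stdio.h>

#define MODEL_HZ 250UL	/* hypothetical CONFIG_HZ */

/* Illustrative model of the interval handling above: scale ms to
 * jiffies, knock one jiffy off the busy case so nested domain periods
 * are no longer exact multiples of each other, then clamp. */
static unsigned long model_balance_interval(unsigned long interval_ms, int cpu_busy,
					    unsigned long max_interval)
{
	unsigned long interval = interval_ms * MODEL_HZ / 1000;	/* ms -> jiffies */

	if (cpu_busy)
		interval -= 1;

	if (interval < 1)
		interval = 1;
	if (interval > max_interval)
		interval = max_interval;
	return interval;
}

int main(void)
{
	printf("%lu %lu\n", model_balance_interval(8, 0, 200),
			    model_balance_interval(8, 1, 200));	/* 2 1 */
	return 0;
}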
....@@ -10112,9 +10396,8 @@
1011210396 /* Search for an sd spanning us and the target CPU. */
1011310397 rcu_read_lock();
1011410398 for_each_domain(target_cpu, sd) {
10115
- if ((sd->flags & SD_LOAD_BALANCE) &&
10116
- cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10117
- break;
10399
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10400
+ break;
1011810401 }
1011910402
1012010403 if (likely(sd)) {
....@@ -10132,6 +10415,7 @@
1013210415 * about DST_PINNED.
1013310416 */
1013410417 .flags = LBF_DST_PINNED,
10418
+ .src_rq_rf = &rf,
1013510419 };
1013610420
1013710421 schedstat_inc(sd->alb_count);
....@@ -10167,7 +10451,7 @@
1016710451 */
1016810452 void update_max_interval(void)
1016910453 {
10170
- max_load_balance_interval = HZ*num_online_cpus()/10;
10454
+ max_load_balance_interval = HZ*num_active_cpus()/10;
1017110455 }
1017210456
1017310457 /*
....@@ -10180,6 +10464,7 @@
1018010464 {
1018110465 int continue_balancing = 1;
1018210466 int cpu = rq->cpu;
10467
+ int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
1018310468 unsigned long interval;
1018410469 struct sched_domain *sd;
1018510470 /* Earliest time when we have to do rebalance again */
....@@ -10187,6 +10472,10 @@
1018710472 int update_next_balance = 0;
1018810473 int need_serialize, need_decay = 0;
1018910474 u64 max_cost = 0;
10475
+
10476
+ trace_android_rvh_sched_rebalance_domains(rq, &continue_balancing);
10477
+ if (!continue_balancing)
10478
+ return;
1019010479
1019110480 rcu_read_lock();
1019210481 for_each_domain(cpu, sd) {
....@@ -10202,9 +10491,6 @@
1020210491 }
1020310492 max_cost += sd->max_newidle_lb_cost;
1020410493
10205
- if (!(sd->flags & SD_LOAD_BALANCE))
10206
- continue;
10207
-
1020810494 /*
1020910495 * Stop the load balance at this level. There is another
1021010496 * CPU in our sched group which is doing load balancing more
....@@ -10216,7 +10502,7 @@
1021610502 break;
1021710503 }
1021810504
10219
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
10505
+ interval = get_sd_balance_interval(sd, busy);
1022010506
1022110507 need_serialize = sd->flags & SD_SERIALIZE;
1022210508 if (need_serialize) {
....@@ -10232,9 +10518,10 @@
1023210518 * state even if we migrated tasks. Update it.
1023310519 */
1023410520 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
10521
+ busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
1023510522 }
1023610523 sd->last_balance = jiffies;
10237
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
10524
+ interval = get_sd_balance_interval(sd, busy);
1023810525 }
1023910526 if (need_serialize)
1024010527 spin_unlock(&balancing);
....@@ -10294,7 +10581,11 @@
1029410581
1029510582 static inline int find_new_ilb(void)
1029610583 {
10297
- int ilb;
10584
+ int ilb = -1;
10585
+
10586
+ trace_android_rvh_find_new_ilb(nohz.idle_cpus_mask, &ilb);
10587
+ if (ilb >= 0)
10588
+ return ilb;
1029810589
1029910590 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
1030010591 housekeeping_cpumask(HK_FLAG_MISC)) {
....@@ -10325,29 +10616,25 @@
1032510616 if (ilb_cpu >= nr_cpu_ids)
1032610617 return;
1032710618
10619
+ /*
10620
+ * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
10621
+ * the first flag owns it; cleared by nohz_csd_func().
10622
+ */
1032810623 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
1032910624 if (flags & NOHZ_KICK_MASK)
1033010625 return;
1033110626
1033210627 /*
10333
- * Use smp_send_reschedule() instead of resched_cpu().
10334
- * This way we generate a sched IPI on the target CPU which
10628
+ * This way we generate an IPI on the target CPU which
1033510629 * is idle. And the softirq performing nohz idle load balance
1033610630 * will be run before returning from the IPI.
1033710631 */
10338
- smp_send_reschedule(ilb_cpu);
10632
+ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
1033910633 }
1034010634
1034110635 /*
10342
- * Current heuristic for kicking the idle load balancer in the presence
10343
- * of an idle cpu in the system.
10344
- * - This rq has more than one task.
10345
- * - This rq has at least one CFS task and the capacity of the CPU is
10346
- * significantly reduced because of RT tasks or IRQs.
10347
- * - At parent of LLC scheduler domain level, this cpu's scheduler group has
10348
- * multiple busy cpu.
10349
- * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
10350
- * domain span are idle.
10636
+ * Current decision point for kicking the idle load balancer in the presence
10637
+ * of idle CPUs in the system.
1035110638 */
1035210639 static void nohz_balancer_kick(struct rq *rq)
1035310640 {
....@@ -10356,6 +10643,7 @@
1035610643 struct sched_domain *sd;
1035710644 int nr_busy, i, cpu = rq->cpu;
1035810645 unsigned int flags = 0;
10646
+ int done = 0;
1035910647
1036010648 if (unlikely(rq->idle_balance))
1036110649 return;
....@@ -10380,30 +10668,25 @@
1038010668 if (time_before(now, nohz.next_balance))
1038110669 goto out;
1038210670
10383
- if (rq->nr_running >= 2 || rq->misfit_task_load) {
10671
+ trace_android_rvh_sched_nohz_balancer_kick(rq, &flags, &done);
10672
+ if (done)
10673
+ goto out;
10674
+
10675
+ if (rq->nr_running >= 2) {
1038410676 flags = NOHZ_KICK_MASK;
1038510677 goto out;
1038610678 }
1038710679
1038810680 rcu_read_lock();
10389
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10390
- if (sds) {
10391
- /*
10392
- * XXX: write a coherent comment on why we do this.
10393
- * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
10394
- */
10395
- nr_busy = atomic_read(&sds->nr_busy_cpus);
10396
- if (nr_busy > 1) {
10397
- flags = NOHZ_KICK_MASK;
10398
- goto unlock;
10399
- }
10400
-
10401
- }
1040210681
1040310682 sd = rcu_dereference(rq->sd);
1040410683 if (sd) {
10405
- if ((rq->cfs.h_nr_running >= 1) &&
10406
- check_cpu_capacity(rq, sd)) {
10684
+ /*
10685
+ * If there's a CFS task and the current CPU has reduced
10686
+ * capacity; kick the ILB to see if there's a better CPU to run
10687
+ * on.
10688
+ */
10689
+ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
1040710690 flags = NOHZ_KICK_MASK;
1040810691 goto unlock;
1040910692 }
....@@ -10411,15 +10694,55 @@
1041110694
1041210695 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
1041310696 if (sd) {
10414
- for_each_cpu(i, sched_domain_span(sd)) {
10415
- if (i == cpu ||
10416
- !cpumask_test_cpu(i, nohz.idle_cpus_mask))
10417
- continue;
10418
-
10697
+ /*
10698
+ * When ASYM_PACKING; see if there's a more preferred CPU
10699
+ * currently idle; in which case, kick the ILB to move tasks
10700
+ * around.
10701
+ */
10702
+ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
1041910703 if (sched_asym_prefer(i, cpu)) {
1042010704 flags = NOHZ_KICK_MASK;
1042110705 goto unlock;
1042210706 }
10707
+ }
10708
+ }
10709
+
10710
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
10711
+ if (sd) {
10712
+ /*
10713
+ * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
10714
+ * to run the misfit task on.
10715
+ */
10716
+ if (check_misfit_status(rq, sd)) {
10717
+ flags = NOHZ_KICK_MASK;
10718
+ goto unlock;
10719
+ }
10720
+
10721
+ /*
10722
+ * For asymmetric systems, we do not want to nicely balance
10723
+ * cache use, instead we want to embrace asymmetry and only
10724
+ * ensure tasks have enough CPU capacity.
10725
+ *
10726
+ * Skip the LLC logic because it's not relevant in that case.
10727
+ */
10728
+ goto unlock;
10729
+ }
10730
+
10731
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10732
+ if (sds) {
10733
+ /*
10734
+ * If there is an imbalance between LLC domains (IOW we could
10735
+ * increase the overall cache use), we need some less-loaded LLC
10736
+ * domain to pull some load. Likewise, we may need to spread
10737
+ * load within the current LLC domain (e.g. packed SMT cores but
10738
+ * other CPUs are idle). We can't really know from here how busy
10739
+ * the others are - so just get a nohz balance going if it looks
10740
+ * like this LLC domain has tasks we could move.
10741
+ */
10742
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
10743
+ if (nr_busy > 1) {
10744
+ flags = NOHZ_KICK_MASK;
10745
+ goto unlock;
1042310746 }
1042410747 }
1042510748 unlock:
....@@ -10485,9 +10808,20 @@
1048510808
1048610809 SCHED_WARN_ON(cpu != smp_processor_id());
1048710810
10488
- /* If this CPU is going down, then nothing needs to be done: */
10489
- if (!cpu_active(cpu))
10811
+ if (!cpu_active(cpu)) {
10812
+ /*
10813
+ * A CPU can be paused while it is idle with its tick
10814
+ * stopped. nohz_balance_exit_idle() should be called
10815
+ * from the local CPU, so it can't be called during
10816
+ * pause. This results in paused CPU participating in
10817
+ * the nohz idle balance, which should be avoided.
10818
+ *
10819
+ * When the paused CPU exits idle and enters again,
10820
+ * exempt the paused CPU from nohz_balance_exit_idle.
10821
+ */
10822
+ nohz_balance_exit_idle(rq);
1049010823 return;
10824
+ }
1049110825
1049210826 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
1049310827 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
....@@ -10600,7 +10934,6 @@
1060010934
1060110935 rq_lock_irqsave(rq, &rf);
1060210936 update_rq_clock(rq);
10603
- cpu_load_update_idle(rq);
1060410937 rq_unlock_irqrestore(rq, &rf);
1060510938
1060610939 if (flags & NOHZ_BALANCE_KICK)
....@@ -10650,22 +10983,14 @@
1065010983 */
1065110984 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
1065210985 {
10653
- int this_cpu = this_rq->cpu;
10654
- unsigned int flags;
10986
+ unsigned int flags = this_rq->nohz_idle_balance;
1065510987
10656
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
10988
+ if (!flags)
1065710989 return false;
1065810990
10659
- if (idle != CPU_IDLE) {
10660
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
10661
- return false;
10662
- }
10991
+ this_rq->nohz_idle_balance = 0;
1066310992
10664
- /*
10665
- * barrier, pairs with nohz_balance_enter_idle(), ensures ...
10666
- */
10667
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
10668
- if (!(flags & NOHZ_KICK_MASK))
10993
+ if (idle != CPU_IDLE)
1066910994 return false;
1067010995
1067110996 _nohz_idle_balance(this_rq, flags, idle);
....@@ -10719,15 +11044,26 @@
1071911044 /*
1072011045 * idle_balance is called by schedule() if this_cpu is about to become
1072111046 * idle. Attempts to pull tasks from other CPUs.
11047
+ *
11048
+ * Returns:
11049
+ * < 0 - we released the lock and there are !fair tasks present
11050
+ * 0 - failed, no new tasks
11051
+ * > 0 - success, new (fair) tasks present
1072211052 */
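A hypothetical caller would fold the three documented ranges as below; this is a sketch of the convention only, and the real in-kernel call site is not part of this hunk.

#include <stdio.h>

/* Editorial sketch (not part of the patch): how a caller would fold the
 * three return ranges documented above. Names are illustrative only. */
enum pick_result { PICK_RETRY, PICK_NONE, PICK_AGAIN };

static enum pick_result model_handle_newidle(int new_tasks)
{
	if (new_tasks < 0)
		return PICK_RETRY;	/* lock was dropped, !fair tasks appeared */
	if (new_tasks > 0)
		return PICK_AGAIN;	/* fair tasks were pulled, pick again     */
	return PICK_NONE;		/* nothing new to run                     */
}

int main(void)
{
	printf("%d %d %d\n", model_handle_newidle(-1),
			     model_handle_newidle(0),
			     model_handle_newidle(2));	/* 0 1 2 */
	return 0;
}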
10723
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
11053
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
1072411054 {
1072511055 unsigned long next_balance = jiffies + HZ;
1072611056 int this_cpu = this_rq->cpu;
1072711057 struct sched_domain *sd;
1072811058 int pulled_task = 0;
1072911059 u64 curr_cost = 0;
11060
+ int done = 0;
1073011061
11062
+ trace_android_rvh_sched_newidle_balance(this_rq, rf, &pulled_task, &done);
11063
+ if (done)
11064
+ return pulled_task;
11065
+
11066
+ update_misfit_status(NULL, this_rq);
1073111067 /*
1073211068 * We must set idle_stamp _before_ calling idle_balance(), such that we
1073311069 * measure the duration of idle_balance() as idle time.
....@@ -10769,9 +11105,6 @@
1076911105 for_each_domain(this_cpu, sd) {
1077011106 int continue_balancing = 1;
1077111107 u64 t0, domain_cost;
10772
-
10773
- if (!(sd->flags & SD_LOAD_BALANCE))
10774
- continue;
1077511108
1077611109 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
1077711110 update_next_balance(sd, &next_balance);
....@@ -10945,7 +11278,7 @@
1094511278 * 'current' within the tree based on its new key value.
1094611279 */
1094711280 swap(curr->vruntime, se->vruntime);
10948
- resched_curr_lazy(rq);
11281
+ resched_curr(rq);
1094911282 }
1095011283
1095111284 se->vruntime -= cfs_rq->min_vruntime;
....@@ -10962,6 +11295,9 @@
1096211295 if (!task_on_rq_queued(p))
1096311296 return;
1096411297
11298
+ if (rq->cfs.nr_running == 1)
11299
+ return;
11300
+
1096511301 /*
1096611302 * Reschedule if we are currently running on this runqueue and
1096711303 * our priority decreased, or if we are not currently running on
....@@ -10969,7 +11305,7 @@
1096911305 */
1097011306 if (rq->curr == p) {
1097111307 if (p->prio > oldprio)
10972
- resched_curr_lazy(rq);
11308
+ resched_curr(rq);
1097311309 } else
1097411310 check_preempt_curr(rq, p, 0);
1097511311 }
....@@ -11040,7 +11376,7 @@
1104011376 /* Catch up with the cfs_rq and remove our load when we leave */
1104111377 update_load_avg(cfs_rq, se, 0);
1104211378 detach_entity_load_avg(cfs_rq, se);
11043
- update_tg_load_avg(cfs_rq, false);
11379
+ update_tg_load_avg(cfs_rq);
1104411380 propagate_entity_cfs_rq(se);
1104511381 }
1104611382
....@@ -11058,8 +11394,8 @@
1105811394
1105911395 /* Synchronize entity with its cfs_rq */
1106011396 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
11061
- attach_entity_load_avg(cfs_rq, se, 0);
11062
- update_tg_load_avg(cfs_rq, false);
11397
+ attach_entity_load_avg(cfs_rq, se);
11398
+ update_tg_load_avg(cfs_rq);
1106311399 propagate_entity_cfs_rq(se);
1106411400 }
1106511401
....@@ -11118,9 +11454,19 @@
1111811454 * This routine is mostly called to set cfs_rq->curr field when a task
1111911455 * migrates between groups/classes.
1112011456 */
11121
-static void set_curr_task_fair(struct rq *rq)
11457
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
1112211458 {
11123
- struct sched_entity *se = &rq->curr->se;
11459
+ struct sched_entity *se = &p->se;
11460
+
11461
+#ifdef CONFIG_SMP
11462
+ if (task_on_rq_queued(p)) {
11463
+ /*
11464
+ * Move the next running task to the front of the list, so our
11465
+ * cfs_tasks list becomes MRU one.
11466
+ */
11467
+ list_move(&se->group_node, &rq->cfs_tasks);
11468
+ }
11469
+#endif
1112411470
1112511471 for_each_sched_entity(se) {
1112611472 struct cfs_rq *cfs_rq = cfs_rq_of(se);
....@@ -11381,8 +11727,8 @@
1138111727 /*
1138211728 * All the scheduling class methods:
1138311729 */
11384
-const struct sched_class fair_sched_class = {
11385
- .next = &idle_sched_class,
11730
+const struct sched_class fair_sched_class
11731
+ __section("__fair_sched_class") = {
1138611732 .enqueue_task = enqueue_task_fair,
1138711733 .dequeue_task = dequeue_task_fair,
1138811734 .yield_task = yield_task_fair,
....@@ -11390,10 +11736,12 @@
1139011736
1139111737 .check_preempt_curr = check_preempt_wakeup,
1139211738
11393
- .pick_next_task = pick_next_task_fair,
11739
+ .pick_next_task = __pick_next_task_fair,
1139411740 .put_prev_task = put_prev_task_fair,
11741
+ .set_next_task = set_next_task_fair,
1139511742
1139611743 #ifdef CONFIG_SMP
11744
+ .balance = balance_fair,
1139711745 .select_task_rq = select_task_rq_fair,
1139811746 .migrate_task_rq = migrate_task_rq_fair,
1139911747
....@@ -11404,7 +11752,6 @@
1140411752 .set_cpus_allowed = set_cpus_allowed_common,
1140511753 #endif
1140611754
11407
- .set_curr_task = set_curr_task_fair,
1140811755 .task_tick = task_tick_fair,
1140911756 .task_fork = task_fork_fair,
1141011757
....@@ -11474,3 +11821,101 @@
1147411821 #endif /* SMP */
1147511822
1147611823 }
11824
+
11825
+/*
11826
+ * Helper functions to facilitate extracting info from tracepoints.
11827
+ */
11828
+
11829
+const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
11830
+{
11831
+#ifdef CONFIG_SMP
11832
+ return cfs_rq ? &cfs_rq->avg : NULL;
11833
+#else
11834
+ return NULL;
11835
+#endif
11836
+}
11837
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
11838
+
11839
+char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
11840
+{
11841
+ if (!cfs_rq) {
11842
+ if (str)
11843
+ strlcpy(str, "(null)", len);
11844
+ else
11845
+ return NULL;
11846
+ }
11847
+
11848
+ cfs_rq_tg_path(cfs_rq, str, len);
11849
+ return str;
11850
+}
11851
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
11852
+
11853
+int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
11854
+{
11855
+ return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
11856
+}
11857
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
11858
+
11859
+const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
11860
+{
11861
+#ifdef CONFIG_SMP
11862
+ return rq ? &rq->avg_rt : NULL;
11863
+#else
11864
+ return NULL;
11865
+#endif
11866
+}
11867
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
11868
+
11869
+const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
11870
+{
11871
+#ifdef CONFIG_SMP
11872
+ return rq ? &rq->avg_dl : NULL;
11873
+#else
11874
+ return NULL;
11875
+#endif
11876
+}
11877
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
11878
+
11879
+const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
11880
+{
11881
+#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
11882
+ return rq ? &rq->avg_irq : NULL;
11883
+#else
11884
+ return NULL;
11885
+#endif
11886
+}
11887
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
11888
+
11889
+int sched_trace_rq_cpu(struct rq *rq)
11890
+{
11891
+ return rq ? cpu_of(rq) : -1;
11892
+}
11893
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
11894
+
11895
+int sched_trace_rq_cpu_capacity(struct rq *rq)
11896
+{
11897
+ return rq ?
11898
+#ifdef CONFIG_SMP
11899
+ rq->cpu_capacity
11900
+#else
11901
+ SCHED_CAPACITY_SCALE
11902
+#endif
11903
+ : -1;
11904
+}
11905
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
11906
+
11907
+const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
11908
+{
11909
+#ifdef CONFIG_SMP
11910
+ return rd ? rd->span : NULL;
11911
+#else
11912
+ return NULL;
11913
+#endif
11914
+}
11915
+EXPORT_SYMBOL_GPL(sched_trace_rd_span);
11916
+
11917
+int sched_trace_rq_nr_running(struct rq *rq)
11918
+{
11919
+ return rq ? rq->nr_running : -1;
11920
+}
11921
+EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
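All of the helpers above follow one pattern: NULL-check the object, compile the field access in or out with the configuration, and fall back to a sentinel or a fixed scale. A user-space sketch of that accessor pattern; the types and names are invented for illustration.

#include <stdio.h>

/* Editorial sketch (not part of the patch): the accessor pattern used by
 * the helpers above - NULL-check the object, fall back to a sentinel,
 * and let a config switch compile the real field access in or out. */
#define MODEL_HAVE_CAPACITY 1

struct model_rq {
	int cpu;
	unsigned long cpu_capacity;
};

static int model_rq_cpu(const struct model_rq *rq)
{
	return rq ? rq->cpu : -1;
}

static long model_rq_capacity(const struct model_rq *rq)
{
#if MODEL_HAVE_CAPACITY
	return rq ? (long)rq->cpu_capacity : -1;
#else
	return rq ? 1024 : -1;	/* fixed scale when the field is compiled out */
#endif
}

int main(void)
{
	struct model_rq rq = { .cpu = 3, .cpu_capacity = 512 };

	printf("%d %ld\n", model_rq_cpu(&rq), model_rq_capacity(&rq));	/* 3 512 */
	printf("%d %ld\n", model_rq_cpu(NULL), model_rq_capacity(NULL));/* -1 -1 */
	return 0;
}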