2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/kernel/sched/fair.c
@@ -20,12 +20,11 @@
2020 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
2121 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
2222 */
23
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
24
-#include <linux/cpufreq.h>
25
-#endif
2623 #include "sched.h"
2724
28
-#include <trace/events/sched.h>
25
+#include <trace/hooks/sched.h>
26
+
27
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_runtime);
2928
3029 /*
3130 * Targeted preemption latency for CPU-bound tasks:
@@ -41,17 +40,8 @@
4140 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
4241 */
4342 unsigned int sysctl_sched_latency = 6000000ULL;
44
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
45
-
46
-/*
47
- * Enable/disable honoring sync flag in energy-aware wakeups.
48
- */
49
-unsigned int sysctl_sched_sync_hint_enable = 1;
50
-
51
-/*
52
- * Enable/disable using cstate knowledge in idle sibling selection
53
- */
54
-unsigned int sysctl_sched_cstate_aware = 1;
43
+EXPORT_SYMBOL_GPL(sysctl_sched_latency);
44
+static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
5545
5646 /*
5747 * The initial- and re-scaling of tunables is configurable
@@ -71,8 +61,9 @@
7161 *
7262 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
7363 */
74
-unsigned int sysctl_sched_min_granularity = 750000ULL;
75
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
64
+unsigned int sysctl_sched_min_granularity = 750000ULL;
65
+EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity);
66
+static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
7667
7768 /*
7869 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
@@ -94,10 +85,23 @@
9485 *
9586 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
9687 */
97
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
98
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
88
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
89
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
9990
10091 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
92
+
93
+int sched_thermal_decay_shift;
94
+static int __init setup_sched_thermal_decay_shift(char *str)
95
+{
96
+ int _shift = 0;
97
+
98
+ if (kstrtoint(str, 0, &_shift))
99
+ pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
100
+
101
+ sched_thermal_decay_shift = clamp(_shift, 0, 10);
102
+ return 1;
103
+}
104
+__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
101105
102106 #ifdef CONFIG_SMP
103107 /*
@@ -107,6 +111,14 @@
107111 {
108112 return -cpu;
109113 }
114
+
115
+/*
116
+ * The margin used when comparing utilization with CPU capacity.
117
+ *
118
+ * (default: ~20%)
119
+ */
120
+#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
121
+
110122 #endif
111123
112124 #ifdef CONFIG_CFS_BANDWIDTH
@@ -122,18 +134,6 @@
122134 */
123135 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
124136 #endif
125
-
126
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
127
-unsigned int sysctl_sched_performance_bias = 1;
128
-#endif
129
-
130
-/*
131
- * The margin used when comparing utilization with CPU capacity:
132
- * util * margin < capacity * 1024
133
- *
134
- * (default: ~20%)
135
- */
136
-unsigned int capacity_margin = 1280;
137137
138138 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
139139 {
@@ -195,7 +195,7 @@
195195 #undef SET_SYSCTL
196196 }
197197
198
-void sched_init_granularity(void)
198
+void __init sched_init_granularity(void)
199199 {
200200 update_sysctl();
201201 }
@@ -246,8 +246,7 @@
246246 }
247247 }
248248
249
- /* hint to use a 32x32->64 mul */
250
- fact = (u64)(u32)fact * lw->inv_weight;
249
+ fact = mul_u32_u32(fact, lw->inv_weight);
251250
252251 while (fact >> 32) {
253252 fact >>= 1;
@@ -290,6 +289,19 @@
290289 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
291290 {
292291 return grp->my_q;
292
+}
293
+
294
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
295
+{
296
+ if (!path)
297
+ return;
298
+
299
+ if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
300
+ autogroup_path(cfs_rq->tg, path, len);
301
+ else if (cfs_rq && cfs_rq->tg->css.cgroup)
302
+ cgroup_path(cfs_rq->tg->css.cgroup, path, len);
303
+ else
304
+ strlcpy(path, "(null)", len);
293305 }
294306
295307 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
@@ -466,6 +478,12 @@
466478 return NULL;
467479 }
468480
481
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
482
+{
483
+ if (path)
484
+ strlcpy(path, "(null)", len);
485
+}
486
+
469487 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
470488 {
471489 return true;
@@ -567,6 +585,7 @@
567585 struct sched_entity *entry;
568586 bool leftmost = true;
569587
588
+ trace_android_rvh_enqueue_entity(cfs_rq, se);
570589 /*
571590 * Find the right place in the rbtree:
572591 */
@@ -592,6 +611,7 @@
592611
593612 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
594613 {
614
+ trace_android_rvh_dequeue_entity(cfs_rq, se);
595615 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
596616 }
597617
@@ -631,8 +651,7 @@
631651 */
632652
633653 int sched_proc_update_handler(struct ctl_table *table, int write,
634
- void __user *buffer, size_t *lenp,
635
- loff_t *ppos)
654
+ void *buffer, size_t *lenp, loff_t *ppos)
636655 {
637656 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
638657 unsigned int factor = get_update_sysctl_factor();
@@ -689,7 +708,13 @@
689708 */
690709 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
691710 {
692
- u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
711
+ unsigned int nr_running = cfs_rq->nr_running;
712
+ u64 slice;
713
+
714
+ if (sched_feat(ALT_PERIOD))
715
+ nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
716
+
717
+ slice = __sched_period(nr_running + !se->on_rq);
693718
694719 for_each_sched_entity(se) {
695720 struct load_weight *load;
@@ -706,6 +731,10 @@
706731 }
707732 slice = __calc_delta(slice, se->load.weight, load);
708733 }
734
+
735
+ if (sched_feat(BASE_SLICE))
736
+ slice = max(slice, (u64)sysctl_sched_min_granularity);
737
+
709738 return slice;
710739 }
711740
@@ -734,26 +763,17 @@
734763 memset(sa, 0, sizeof(*sa));
735764
736765 /*
737
- * Tasks are intialized with full load to be seen as heavy tasks until
766
+ * Tasks are initialized with full load to be seen as heavy tasks until
738767 * they get a chance to stabilize to their real load level.
739
- * Group entities are intialized with zero load to reflect the fact that
768
+ * Group entities are initialized with zero load to reflect the fact that
740769 * nothing has been attached to the task group yet.
741770 */
742771 if (entity_is_task(se))
743
- sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
772
+ sa->load_avg = scale_load_down(se->load.weight);
744773
745
- se->runnable_weight = se->load.weight;
746
-
747
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
748
- if (sysctl_sched_performance_bias) {
749
- sa->util_avg = SCHED_CAPACITY_SCALE >> 1;
750
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
751
- }
752
-#endif
753774 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
754775 }
755776
756
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
757777 static void attach_entity_cfs_rq(struct sched_entity *se);
758778
759779 /*
@@ -782,18 +802,15 @@
782802 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
783803 * if util_avg > util_avg_cap.
784804 */
785
-void post_init_entity_util_avg(struct sched_entity *se)
805
+void post_init_entity_util_avg(struct task_struct *p)
786806 {
807
+ struct sched_entity *se = &p->se;
787808 struct cfs_rq *cfs_rq = cfs_rq_of(se);
788809 struct sched_avg *sa = &se->avg;
789
- long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
810
+ long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
790811 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
791812
792
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
793
- if (!sysctl_sched_performance_bias && (cap > 0)) {
794
-#else
795813 if (cap > 0) {
796
-#endif
797814 if (cfs_rq->avg.util_avg != 0) {
798815 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
799816 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
@@ -805,24 +822,25 @@
805822 }
806823 }
807824
808
- if (entity_is_task(se)) {
809
- struct task_struct *p = task_of(se);
810
- if (p->sched_class != &fair_sched_class) {
811
- /*
812
- * For !fair tasks do:
813
- *
814
- update_cfs_rq_load_avg(now, cfs_rq);
815
- attach_entity_load_avg(cfs_rq, se, 0);
816
- switched_from_fair(rq, p);
817
- *
818
- * such that the next switched_to_fair() has the
819
- * expected state.
820
- */
821
- se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
822
- return;
823
- }
825
+ sa->runnable_avg = sa->util_avg;
826
+
827
+ if (p->sched_class != &fair_sched_class) {
828
+ /*
829
+ * For !fair tasks do:
830
+ *
831
+ update_cfs_rq_load_avg(now, cfs_rq);
832
+ attach_entity_load_avg(cfs_rq, se);
833
+ switched_from_fair(rq, p);
834
+ *
835
+ * such that the next switched_to_fair() has the
836
+ * expected state.
837
+ */
838
+ se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
839
+ return;
824840 }
825841
842
+ /* Hook before this se's util is attached to cfs_rq's util */
843
+ trace_android_rvh_post_init_entity_util_avg(se);
826844 attach_entity_cfs_rq(se);
827845 }
828846
@@ -830,10 +848,10 @@
830848 void init_entity_runnable_average(struct sched_entity *se)
831849 {
832850 }
833
-void post_init_entity_util_avg(struct sched_entity *se)
851
+void post_init_entity_util_avg(struct task_struct *p)
834852 {
835853 }
836
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
854
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
837855 {
838856 }
839857 #endif /* CONFIG_SMP */
@@ -983,7 +1001,6 @@
9831001 }
9841002
9851003 trace_sched_stat_blocked(tsk, delta);
986
- trace_sched_blocked_reason(tsk);
9871004
9881005 /*
9891006 * Blocking time is in units of nanosecs, so shift by
@@ -1078,7 +1095,7 @@
10781095 unsigned int sysctl_numa_balancing_scan_delay = 1000;
10791096
10801097 struct numa_group {
1081
- atomic_t refcount;
1098
+ refcount_t refcount;
10821099
10831100 spinlock_t lock; /* nr_tasks, tasks */
10841101 int nr_tasks;
@@ -1094,7 +1111,7 @@
10941111 * more by CPU use than by memory faults.
10951112 */
10961113 unsigned long *faults_cpu;
1097
- unsigned long faults[0];
1114
+ unsigned long faults[];
10981115 };
10991116
11001117 /*
@@ -1164,7 +1181,7 @@
11641181 unsigned long shared = group_faults_shared(ng);
11651182 unsigned long private = group_faults_priv(ng);
11661183
1167
- period *= atomic_read(&ng->refcount);
1184
+ period *= refcount_read(&ng->refcount);
11681185 period *= shared + 1;
11691186 period /= private + shared + 1;
11701187 }
@@ -1189,7 +1206,7 @@
11891206 unsigned long private = group_faults_priv(ng);
11901207 unsigned long period = smax;
11911208
1192
- period *= atomic_read(&ng->refcount);
1209
+ period *= refcount_read(&ng->refcount);
11931210 period *= shared + 1;
11941211 period /= private + shared + 1;
11951212
@@ -1199,56 +1216,15 @@
11991216 return max(smin, smax);
12001217 }
12011218
1202
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
1203
-{
1204
- int mm_users = 0;
1205
- struct mm_struct *mm = p->mm;
1206
-
1207
- if (mm) {
1208
- mm_users = atomic_read(&mm->mm_users);
1209
- if (mm_users == 1) {
1210
- mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1211
- mm->numa_scan_seq = 0;
1212
- }
1213
- }
1214
- p->node_stamp = 0;
1215
- p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
1216
- p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1217
- p->numa_work.next = &p->numa_work;
1218
- p->numa_faults = NULL;
1219
- RCU_INIT_POINTER(p->numa_group, NULL);
1220
- p->last_task_numa_placement = 0;
1221
- p->last_sum_exec_runtime = 0;
1222
-
1223
- /* New address space, reset the preferred nid */
1224
- if (!(clone_flags & CLONE_VM)) {
1225
- p->numa_preferred_nid = -1;
1226
- return;
1227
- }
1228
-
1229
- /*
1230
- * New thread, keep existing numa_preferred_nid which should be copied
1231
- * already by arch_dup_task_struct but stagger when scans start.
1232
- */
1233
- if (mm) {
1234
- unsigned int delay;
1235
-
1236
- delay = min_t(unsigned int, task_scan_max(current),
1237
- current->numa_scan_period * mm_users * NSEC_PER_MSEC);
1238
- delay += 2 * TICK_NSEC;
1239
- p->node_stamp = delay;
1240
- }
1241
-}
1242
-
12431219 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
12441220 {
1245
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
1221
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
12461222 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
12471223 }
12481224
12491225 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
12501226 {
1251
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1227
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
12521228 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
12531229 }
12541230
@@ -1474,7 +1450,7 @@
14741450 * two full passes of the "multi-stage node selection" test that is
14751451 * executed below.
14761452 */
1477
- if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
1453
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
14781454 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
14791455 return true;
14801456
@@ -1527,55 +1503,52 @@
15271503 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
15281504 }
15291505
1530
-static unsigned long weighted_cpuload(struct rq *rq);
1531
-static unsigned long source_load(int cpu, int type);
1532
-static unsigned long target_load(int cpu, int type);
1506
+/*
1507
+ * 'numa_type' describes the node at the moment of load balancing.
1508
+ */
1509
+enum numa_type {
1510
+ /* The node has spare capacity that can be used to run more tasks. */
1511
+ node_has_spare = 0,
1512
+ /*
1513
+ * The node is fully used and the tasks don't compete for more CPU
1514
+ * cycles. Nevertheless, some tasks might wait before running.
1515
+ */
1516
+ node_fully_busy,
1517
+ /*
1518
+ * The node is overloaded and can't provide expected CPU cycles to all
1519
+ * tasks.
1520
+ */
1521
+ node_overloaded
1522
+};
15331523
15341524 /* Cached statistics for all CPUs within a node */
15351525 struct numa_stats {
15361526 unsigned long load;
1537
-
1527
+ unsigned long runnable;
1528
+ unsigned long util;
15381529 /* Total compute capacity of CPUs on a node */
15391530 unsigned long compute_capacity;
1540
-
15411531 unsigned int nr_running;
1532
+ unsigned int weight;
1533
+ enum numa_type node_type;
1534
+ int idle_cpu;
15421535 };
15431536
1544
-/*
1545
- * XXX borrowed from update_sg_lb_stats
1546
- */
1547
-static void update_numa_stats(struct numa_stats *ns, int nid)
1537
+static inline bool is_core_idle(int cpu)
15481538 {
1549
- int smt, cpu, cpus = 0;
1550
- unsigned long capacity;
1539
+#ifdef CONFIG_SCHED_SMT
1540
+ int sibling;
15511541
1552
- memset(ns, 0, sizeof(*ns));
1553
- for_each_cpu(cpu, cpumask_of_node(nid)) {
1554
- struct rq *rq = cpu_rq(cpu);
1542
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1543
+ if (cpu == sibling)
1544
+ continue;
15551545
1556
- ns->nr_running += rq->nr_running;
1557
- ns->load += weighted_cpuload(rq);
1558
- ns->compute_capacity += capacity_of(cpu);
1559
-
1560
- cpus++;
1546
+ if (!idle_cpu(sibling))
1547
+ return false;
15611548 }
1549
+#endif
15621550
1563
- /*
1564
- * If we raced with hotplug and there are no CPUs left in our mask
1565
- * the @ns structure is NULL'ed and task_numa_compare() will
1566
- * not find this node attractive.
1567
- *
1568
- * We'll detect a huge imbalance and bail there.
1569
- */
1570
- if (!cpus)
1571
- return;
1572
-
1573
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1574
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1575
- capacity = cpus / smt; /* cores */
1576
-
1577
- capacity = min_t(unsigned, capacity,
1578
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1551
+ return true;
15791552 }
15801553
15811554 struct task_numa_env {
@@ -1594,20 +1567,132 @@
15941567 int best_cpu;
15951568 };
15961569
1570
+static unsigned long cpu_load(struct rq *rq);
1571
+static unsigned long cpu_runnable(struct rq *rq);
1572
+static unsigned long cpu_util(int cpu);
1573
+static inline long adjust_numa_imbalance(int imbalance, int nr_running);
1574
+
1575
+static inline enum
1576
+numa_type numa_classify(unsigned int imbalance_pct,
1577
+ struct numa_stats *ns)
1578
+{
1579
+ if ((ns->nr_running > ns->weight) &&
1580
+ (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1581
+ ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
1582
+ return node_overloaded;
1583
+
1584
+ if ((ns->nr_running < ns->weight) ||
1585
+ (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1586
+ ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
1587
+ return node_has_spare;
1588
+
1589
+ return node_fully_busy;
1590
+}
1591
+
1592
+#ifdef CONFIG_SCHED_SMT
1593
+/* Forward declarations of select_idle_sibling helpers */
1594
+static inline bool test_idle_cores(int cpu, bool def);
1595
+static inline int numa_idle_core(int idle_core, int cpu)
1596
+{
1597
+ if (!static_branch_likely(&sched_smt_present) ||
1598
+ idle_core >= 0 || !test_idle_cores(cpu, false))
1599
+ return idle_core;
1600
+
1601
+ /*
1602
+ * Prefer cores instead of packing HT siblings
1603
+ * and triggering future load balancing.
1604
+ */
1605
+ if (is_core_idle(cpu))
1606
+ idle_core = cpu;
1607
+
1608
+ return idle_core;
1609
+}
1610
+#else
1611
+static inline int numa_idle_core(int idle_core, int cpu)
1612
+{
1613
+ return idle_core;
1614
+}
1615
+#endif
1616
+
1617
+/*
1618
+ * Gather all necessary information to make NUMA balancing placement
1619
+ * decisions that are compatible with standard load balancer. This
1620
+ * borrows code and logic from update_sg_lb_stats but sharing a
1621
+ * common implementation is impractical.
1622
+ */
1623
+static void update_numa_stats(struct task_numa_env *env,
1624
+ struct numa_stats *ns, int nid,
1625
+ bool find_idle)
1626
+{
1627
+ int cpu, idle_core = -1;
1628
+
1629
+ memset(ns, 0, sizeof(*ns));
1630
+ ns->idle_cpu = -1;
1631
+
1632
+ rcu_read_lock();
1633
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
1634
+ struct rq *rq = cpu_rq(cpu);
1635
+
1636
+ ns->load += cpu_load(rq);
1637
+ ns->runnable += cpu_runnable(rq);
1638
+ ns->util += cpu_util(cpu);
1639
+ ns->nr_running += rq->cfs.h_nr_running;
1640
+ ns->compute_capacity += capacity_of(cpu);
1641
+
1642
+ if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
1643
+ if (READ_ONCE(rq->numa_migrate_on) ||
1644
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr))
1645
+ continue;
1646
+
1647
+ if (ns->idle_cpu == -1)
1648
+ ns->idle_cpu = cpu;
1649
+
1650
+ idle_core = numa_idle_core(idle_core, cpu);
1651
+ }
1652
+ }
1653
+ rcu_read_unlock();
1654
+
1655
+ ns->weight = cpumask_weight(cpumask_of_node(nid));
1656
+
1657
+ ns->node_type = numa_classify(env->imbalance_pct, ns);
1658
+
1659
+ if (idle_core >= 0)
1660
+ ns->idle_cpu = idle_core;
1661
+}
1662
+
15971663 static void task_numa_assign(struct task_numa_env *env,
15981664 struct task_struct *p, long imp)
15991665 {
16001666 struct rq *rq = cpu_rq(env->dst_cpu);
16011667
1602
- /* Bail out if run-queue part of active NUMA balance. */
1603
- if (xchg(&rq->numa_migrate_on, 1))
1604
- return;
1668
+ /* Check if run-queue part of active NUMA balance. */
1669
+ if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
1670
+ int cpu;
1671
+ int start = env->dst_cpu;
16051672
1673
+ /* Find alternative idle CPU. */
1674
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
1675
+ if (cpu == env->best_cpu || !idle_cpu(cpu) ||
1676
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
1677
+ continue;
1678
+ }
1679
+
1680
+ env->dst_cpu = cpu;
1681
+ rq = cpu_rq(env->dst_cpu);
1682
+ if (!xchg(&rq->numa_migrate_on, 1))
1683
+ goto assign;
1684
+ }
1685
+
1686
+ /* Failed to find an alternative idle CPU */
1687
+ return;
1688
+ }
1689
+
1690
+assign:
16061691 /*
16071692 * Clear previous best_cpu/rq numa-migrate flag, since task now
16081693 * found a better CPU to move/swap.
16091694 */
1610
- if (env->best_cpu != -1) {
1695
+ if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
16111696 rq = cpu_rq(env->best_cpu);
16121697 WRITE_ONCE(rq->numa_migrate_on, 0);
16131698 }
@@ -1663,7 +1748,7 @@
16631748 * into account that it might be best if task running on the dst_cpu should
16641749 * be exchanged with the source task
16651750 */
1666
-static void task_numa_compare(struct task_numa_env *env,
1751
+static bool task_numa_compare(struct task_numa_env *env,
16671752 long taskimp, long groupimp, bool maymove)
16681753 {
16691754 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
@@ -1674,12 +1759,13 @@
16741759 int dist = env->dist;
16751760 long moveimp = imp;
16761761 long load;
1762
+ bool stopsearch = false;
16771763
16781764 if (READ_ONCE(dst_rq->numa_migrate_on))
1679
- return;
1765
+ return false;
16801766
16811767 rcu_read_lock();
1682
- cur = task_rcu_dereference(&dst_rq->curr);
1768
+ cur = rcu_dereference(dst_rq->curr);
16831769 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
16841770 cur = NULL;
16851771
@@ -1687,8 +1773,10 @@
16871773 * Because we have preemption enabled we can get migrated around and
16881774 * end try selecting ourselves (current == env->p) as a swap candidate.
16891775 */
1690
- if (cur == env->p)
1776
+ if (cur == env->p) {
1777
+ stopsearch = true;
16911778 goto unlock;
1779
+ }
16921780
16931781 if (!cur) {
16941782 if (maymove && moveimp >= env->best_imp)
@@ -1697,18 +1785,27 @@
16971785 goto unlock;
16981786 }
16991787
1788
+ /* Skip this swap candidate if cannot move to the source cpu. */
1789
+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1790
+ goto unlock;
1791
+
1792
+ /*
1793
+ * Skip this swap candidate if it is not moving to its preferred
1794
+ * node and the best task is.
1795
+ */
1796
+ if (env->best_task &&
1797
+ env->best_task->numa_preferred_nid == env->src_nid &&
1798
+ cur->numa_preferred_nid != env->src_nid) {
1799
+ goto unlock;
1800
+ }
1801
+
17001802 /*
17011803 * "imp" is the fault differential for the source task between the
17021804 * source and destination node. Calculate the total differential for
17031805 * the source task and potential destination task. The more negative
17041806 * the value is, the more remote accesses that would be expected to
17051807 * be incurred if the tasks were swapped.
1706
- */
1707
- /* Skip this swap candidate if cannot move to the source cpu */
1708
- if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1709
- goto unlock;
1710
-
1711
- /*
1808
+ *
17121809 * If dst and source tasks are in the same NUMA group, or not
17131810 * in any group then look only at task weights.
17141811 */
@@ -1735,9 +1832,31 @@
17351832 task_weight(cur, env->dst_nid, dist);
17361833 }
17371834
1835
+ /* Discourage picking a task already on its preferred node */
1836
+ if (cur->numa_preferred_nid == env->dst_nid)
1837
+ imp -= imp / 16;
1838
+
1839
+ /*
1840
+ * Encourage picking a task that moves to its preferred node.
1841
+ * This potentially makes imp larger than it's maximum of
1842
+ * 1998 (see SMALLIMP and task_weight for why) but in this
1843
+ * case, it does not matter.
1844
+ */
1845
+ if (cur->numa_preferred_nid == env->src_nid)
1846
+ imp += imp / 8;
1847
+
17381848 if (maymove && moveimp > imp && moveimp > env->best_imp) {
17391849 imp = moveimp;
17401850 cur = NULL;
1851
+ goto assign;
1852
+ }
1853
+
1854
+ /*
1855
+ * Prefer swapping with a task moving to its preferred node over a
1856
+ * task that is not.
1857
+ */
1858
+ if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
1859
+ env->best_task->numa_preferred_nid != env->src_nid) {
17411860 goto assign;
17421861 }
17431862
@@ -1764,42 +1883,95 @@
17641883 goto unlock;
17651884
17661885 assign:
1767
- /*
1768
- * One idle CPU per node is evaluated for a task numa move.
1769
- * Call select_idle_sibling to maybe find a better one.
1770
- */
1886
+ /* Evaluate an idle CPU for a task numa move. */
17711887 if (!cur) {
1888
+ int cpu = env->dst_stats.idle_cpu;
1889
+
1890
+ /* Nothing cached so current CPU went idle since the search. */
1891
+ if (cpu < 0)
1892
+ cpu = env->dst_cpu;
1893
+
17721894 /*
1773
- * select_idle_siblings() uses an per-CPU cpumask that
1774
- * can be used from IRQ context.
1895
+ * If the CPU is no longer truly idle and the previous best CPU
1896
+ * is, keep using it.
17751897 */
1776
- local_irq_disable();
1777
- env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1778
- env->dst_cpu);
1779
- local_irq_enable();
1898
+ if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
1899
+ idle_cpu(env->best_cpu)) {
1900
+ cpu = env->best_cpu;
1901
+ }
1902
+
1903
+ env->dst_cpu = cpu;
17801904 }
17811905
17821906 task_numa_assign(env, cur, imp);
1907
+
1908
+ /*
1909
+ * If a move to idle is allowed because there is capacity or load
1910
+ * balance improves then stop the search. While a better swap
1911
+ * candidate may exist, a search is not free.
1912
+ */
1913
+ if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
1914
+ stopsearch = true;
1915
+
1916
+ /*
1917
+ * If a swap candidate must be identified and the current best task
1918
+ * moves its preferred node then stop the search.
1919
+ */
1920
+ if (!maymove && env->best_task &&
1921
+ env->best_task->numa_preferred_nid == env->src_nid) {
1922
+ stopsearch = true;
1923
+ }
17831924 unlock:
17841925 rcu_read_unlock();
1926
+
1927
+ return stopsearch;
17851928 }
17861929
17871930 static void task_numa_find_cpu(struct task_numa_env *env,
17881931 long taskimp, long groupimp)
17891932 {
1790
- long src_load, dst_load, load;
17911933 bool maymove = false;
17921934 int cpu;
17931935
1794
- load = task_h_load(env->p);
1795
- dst_load = env->dst_stats.load + load;
1796
- src_load = env->src_stats.load - load;
1797
-
17981936 /*
1799
- * If the improvement from just moving env->p direction is better
1800
- * than swapping tasks around, check if a move is possible.
1937
+ * If dst node has spare capacity, then check if there is an
1938
+ * imbalance that would be overruled by the load balancer.
18011939 */
1802
- maymove = !load_too_imbalanced(src_load, dst_load, env);
1940
+ if (env->dst_stats.node_type == node_has_spare) {
1941
+ unsigned int imbalance;
1942
+ int src_running, dst_running;
1943
+
1944
+ /*
1945
+ * Would movement cause an imbalance? Note that if src has
1946
+ * more running tasks that the imbalance is ignored as the
1947
+ * move improves the imbalance from the perspective of the
1948
+ * CPU load balancer.
1949
+ * */
1950
+ src_running = env->src_stats.nr_running - 1;
1951
+ dst_running = env->dst_stats.nr_running + 1;
1952
+ imbalance = max(0, dst_running - src_running);
1953
+ imbalance = adjust_numa_imbalance(imbalance, dst_running);
1954
+
1955
+ /* Use idle CPU if there is no imbalance */
1956
+ if (!imbalance) {
1957
+ maymove = true;
1958
+ if (env->dst_stats.idle_cpu >= 0) {
1959
+ env->dst_cpu = env->dst_stats.idle_cpu;
1960
+ task_numa_assign(env, NULL, 0);
1961
+ return;
1962
+ }
1963
+ }
1964
+ } else {
1965
+ long src_load, dst_load, load;
1966
+ /*
1967
+ * If the improvement from just moving env->p direction is better
1968
+ * than swapping tasks around, check if a move is possible.
1969
+ */
1970
+ load = task_h_load(env->p);
1971
+ dst_load = env->dst_stats.load + load;
1972
+ src_load = env->src_stats.load - load;
1973
+ maymove = !load_too_imbalanced(src_load, dst_load, env);
1974
+ }
18031975
18041976 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
18051977 /* Skip this CPU if the source task cannot migrate */
@@ -1807,7 +1979,8 @@
18071979 continue;
18081980
18091981 env->dst_cpu = cpu;
1810
- task_numa_compare(env, taskimp, groupimp, maymove);
1982
+ if (task_numa_compare(env, taskimp, groupimp, maymove))
1983
+ break;
18111984 }
18121985 }
18131986
@@ -1861,10 +2034,10 @@
18612034 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
18622035 taskweight = task_weight(p, env.src_nid, dist);
18632036 groupweight = group_weight(p, env.src_nid, dist);
1864
- update_numa_stats(&env.src_stats, env.src_nid);
2037
+ update_numa_stats(&env, &env.src_stats, env.src_nid, false);
18652038 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
18662039 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1867
- update_numa_stats(&env.dst_stats, env.dst_nid);
2040
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
18682041
18692042 /* Try to find a spot on the preferred nid. */
18702043 task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1897,7 +2070,7 @@
18972070
18982071 env.dist = dist;
18992072 env.dst_nid = nid;
1900
- update_numa_stats(&env.dst_stats, env.dst_nid);
2073
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
19012074 task_numa_find_cpu(&env, taskimp, groupimp);
19022075 }
19032076 }
@@ -1921,15 +2094,17 @@
19212094 }
19222095
19232096 /* No better CPU than the current one was found. */
1924
- if (env.best_cpu == -1)
2097
+ if (env.best_cpu == -1) {
2098
+ trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
19252099 return -EAGAIN;
2100
+ }
19262101
19272102 best_rq = cpu_rq(env.best_cpu);
19282103 if (env.best_task == NULL) {
19292104 ret = migrate_task_to(p, env.best_cpu);
19302105 WRITE_ONCE(best_rq->numa_migrate_on, 0);
19312106 if (ret != 0)
1932
- trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
2107
+ trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
19332108 return ret;
19342109 }
19352110
@@ -1937,7 +2112,7 @@
19372112 WRITE_ONCE(best_rq->numa_migrate_on, 0);
19382113
19392114 if (ret != 0)
1940
- trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
2115
+ trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
19412116 put_task_struct(env.best_task);
19422117 return ret;
19432118 }
@@ -1948,7 +2123,7 @@
19482123 unsigned long interval = HZ;
19492124
19502125 /* This task has no NUMA fault statistics yet */
1951
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
2126
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
19522127 return;
19532128
19542129 /* Periodically retry migrating the task to the preferred node */
@@ -2199,7 +2374,7 @@
21992374
22002375 static void task_numa_placement(struct task_struct *p)
22012376 {
2202
- int seq, nid, max_nid = -1;
2377
+ int seq, nid, max_nid = NUMA_NO_NODE;
22032378 unsigned long max_faults = 0;
22042379 unsigned long fault_types[2] = { 0, 0 };
22052380 unsigned long total_faults;
@@ -2309,12 +2484,12 @@
23092484
23102485 static inline int get_numa_group(struct numa_group *grp)
23112486 {
2312
- return atomic_inc_not_zero(&grp->refcount);
2487
+ return refcount_inc_not_zero(&grp->refcount);
23132488 }
23142489
23152490 static inline void put_numa_group(struct numa_group *grp)
23162491 {
2317
- if (atomic_dec_and_test(&grp->refcount))
2492
+ if (refcount_dec_and_test(&grp->refcount))
23182493 kfree_rcu(grp, rcu);
23192494 }
23202495
@@ -2335,7 +2510,7 @@
23352510 if (!grp)
23362511 return;
23372512
2338
- atomic_set(&grp->refcount, 1);
2513
+ refcount_set(&grp->refcount, 1);
23392514 grp->active_nodes = 1;
23402515 grp->max_faults_cpu = 0;
23412516 spin_lock_init(&grp->lock);
@@ -2522,8 +2697,8 @@
25222697 local = 1;
25232698
25242699 /*
2525
- * Retry task to preferred node migration periodically, in case it
2526
- * case it previously failed, or the scheduler moved us.
2700
+ * Retry to migrate task to preferred node periodically, in case it
2701
+ * previously failed, or the scheduler moved us.
25272702 */
25282703 if (time_after(jiffies, p->numa_migrate_retry)) {
25292704 task_numa_placement(p);
@@ -2558,7 +2733,7 @@
25582733 * The expensive part of numa migration is done from task_work context.
25592734 * Triggered from task_tick_numa().
25602735 */
2561
-void task_numa_work(struct callback_head *work)
2736
+static void task_numa_work(struct callback_head *work)
25622737 {
25632738 unsigned long migrate, next_scan, now = jiffies;
25642739 struct task_struct *p = current;
@@ -2571,7 +2746,7 @@
25712746
25722747 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
25732748
2574
- work->next = work; /* protect against double add */
2749
+ work->next = work;
25752750 /*
25762751 * Who cares about NUMA placement when they're dying.
25772752 *
@@ -2618,7 +2793,7 @@
26182793 return;
26192794
26202795
2621
- if (!down_read_trylock(&mm->mmap_sem))
2796
+ if (!mmap_read_trylock(mm))
26222797 return;
26232798 vma = find_vma(mm, start);
26242799 if (!vma) {
@@ -2646,7 +2821,7 @@
26462821 * Skip inaccessible VMAs to avoid any confusion between
26472822 * PROT_NONE and NUMA hinting ptes
26482823 */
2649
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2824
+ if (!vma_is_accessible(vma))
26502825 continue;
26512826
26522827 do {
@@ -2686,7 +2861,7 @@
26862861 mm->numa_scan_offset = start;
26872862 else
26882863 reset_ptenuma_scan(p);
2689
- up_read(&mm->mmap_sem);
2864
+ mmap_read_unlock(mm);
26902865
26912866 /*
26922867 * Make sure tasks use at least 32x as much time to run other code
@@ -2700,10 +2875,54 @@
27002875 }
27012876 }
27022877
2878
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2879
+{
2880
+ int mm_users = 0;
2881
+ struct mm_struct *mm = p->mm;
2882
+
2883
+ if (mm) {
2884
+ mm_users = atomic_read(&mm->mm_users);
2885
+ if (mm_users == 1) {
2886
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2887
+ mm->numa_scan_seq = 0;
2888
+ }
2889
+ }
2890
+ p->node_stamp = 0;
2891
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2892
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2893
+ /* Protect against double add, see task_tick_numa and task_numa_work */
2894
+ p->numa_work.next = &p->numa_work;
2895
+ p->numa_faults = NULL;
2896
+ RCU_INIT_POINTER(p->numa_group, NULL);
2897
+ p->last_task_numa_placement = 0;
2898
+ p->last_sum_exec_runtime = 0;
2899
+
2900
+ init_task_work(&p->numa_work, task_numa_work);
2901
+
2902
+ /* New address space, reset the preferred nid */
2903
+ if (!(clone_flags & CLONE_VM)) {
2904
+ p->numa_preferred_nid = NUMA_NO_NODE;
2905
+ return;
2906
+ }
2907
+
2908
+ /*
2909
+ * New thread, keep existing numa_preferred_nid which should be copied
2910
+ * already by arch_dup_task_struct but stagger when scans start.
2911
+ */
2912
+ if (mm) {
2913
+ unsigned int delay;
2914
+
2915
+ delay = min_t(unsigned int, task_scan_max(current),
2916
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2917
+ delay += 2 * TICK_NSEC;
2918
+ p->node_stamp = delay;
2919
+ }
2920
+}
2921
+
27032922 /*
27042923 * Drive the periodic memory faults..
27052924 */
2706
-void task_tick_numa(struct rq *rq, struct task_struct *curr)
2925
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
27072926 {
27082927 struct callback_head *work = &curr->numa_work;
27092928 u64 period, now;
@@ -2728,10 +2947,8 @@
27282947 curr->numa_scan_period = task_scan_start(curr);
27292948 curr->node_stamp += period;
27302949
2731
- if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2732
- init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2733
- task_work_add(curr, work, true);
2734
- }
2950
+ if (!time_before(jiffies, curr->mm->numa_next_scan))
2951
+ task_work_add(curr, work, TWA_RESUME);
27352952 }
27362953 }
27372954
@@ -2761,7 +2978,8 @@
27612978 * the preferred node.
27622979 */
27632980 if (dst_nid == p->numa_preferred_nid ||
2764
- (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
2981
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
2982
+ src_nid != p->numa_preferred_nid))
27652983 return;
27662984 }
27672985
@@ -2791,8 +3009,6 @@
27913009 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
27923010 {
27933011 update_load_add(&cfs_rq->load, se->load.weight);
2794
- if (!parent_entity(se))
2795
- update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
27963012 #ifdef CONFIG_SMP
27973013 if (entity_is_task(se)) {
27983014 struct rq *rq = rq_of(cfs_rq);
@@ -2808,8 +3024,6 @@
28083024 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
28093025 {
28103026 update_load_sub(&cfs_rq->load, se->load.weight);
2811
- if (!parent_entity(se))
2812
- update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
28133027 #ifdef CONFIG_SMP
28143028 if (entity_is_task(se)) {
28153029 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
@@ -2856,26 +3070,18 @@
28563070 WRITE_ONCE(*ptr, res); \
28573071 } while (0)
28583072
3073
+/*
3074
+ * Remove and clamp on negative, from a local variable.
3075
+ *
3076
+ * A variant of sub_positive(), which does not use explicit load-store
3077
+ * and is thus optimized for local variable updates.
3078
+ */
3079
+#define lsub_positive(_ptr, _val) do { \
3080
+ typeof(_ptr) ptr = (_ptr); \
3081
+ *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3082
+} while (0)
3083
+
28593084 #ifdef CONFIG_SMP
2860
-static inline void
2861
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2862
-{
2863
- cfs_rq->runnable_weight += se->runnable_weight;
2864
-
2865
- cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2866
- cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2867
-}
2868
-
2869
-static inline void
2870
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2871
-{
2872
- cfs_rq->runnable_weight -= se->runnable_weight;
2873
-
2874
- sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2875
- sub_positive(&cfs_rq->avg.runnable_load_sum,
2876
- se_runnable(se) * se->avg.runnable_load_sum);
2877
-}
2878
-
28793085 static inline void
28803086 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
28813087 {
@@ -2891,45 +3097,36 @@
28913097 }
28923098 #else
28933099 static inline void
2894
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2895
-static inline void
2896
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2897
-static inline void
28983100 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
28993101 static inline void
29003102 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
29013103 #endif
29023104
29033105 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2904
- unsigned long weight, unsigned long runnable)
3106
+ unsigned long weight)
29053107 {
29063108 if (se->on_rq) {
29073109 /* commit outstanding execution time */
29083110 if (cfs_rq->curr == se)
29093111 update_curr(cfs_rq);
2910
- account_entity_dequeue(cfs_rq, se);
2911
- dequeue_runnable_load_avg(cfs_rq, se);
3112
+ update_load_sub(&cfs_rq->load, se->load.weight);
29123113 }
29133114 dequeue_load_avg(cfs_rq, se);
29143115
2915
- se->runnable_weight = runnable;
29163116 update_load_set(&se->load, weight);
29173117
29183118 #ifdef CONFIG_SMP
29193119 do {
2920
- u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
3120
+ u32 divider = get_pelt_divider(&se->avg);
29213121
29223122 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2923
- se->avg.runnable_load_avg =
2924
- div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
29253123 } while (0);
29263124 #endif
29273125
29283126 enqueue_load_avg(cfs_rq, se);
2929
- if (se->on_rq) {
2930
- account_entity_enqueue(cfs_rq, se);
2931
- enqueue_runnable_load_avg(cfs_rq, se);
2932
- }
3127
+ if (se->on_rq)
3128
+ update_load_add(&cfs_rq->load, se->load.weight);
3129
+
29333130 }
29343131
29353132 void reweight_task(struct task_struct *p, int prio)
@@ -2939,7 +3136,7 @@
29393136 struct load_weight *load = &se->load;
29403137 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
29413138
2942
- reweight_entity(cfs_rq, se, weight, weight);
3139
+ reweight_entity(cfs_rq, se, weight);
29433140 load->inv_weight = sched_prio_to_wmult[prio];
29443141 }
29453142
@@ -3051,50 +3248,6 @@
30513248 */
30523249 return clamp_t(long, shares, MIN_SHARES, tg_shares);
30533250 }
3054
-
3055
-/*
3056
- * This calculates the effective runnable weight for a group entity based on
3057
- * the group entity weight calculated above.
3058
- *
3059
- * Because of the above approximation (2), our group entity weight is
3060
- * an load_avg based ratio (3). This means that it includes blocked load and
3061
- * does not represent the runnable weight.
3062
- *
3063
- * Approximate the group entity's runnable weight per ratio from the group
3064
- * runqueue:
3065
- *
3066
- * grq->avg.runnable_load_avg
3067
- * ge->runnable_weight = ge->load.weight * -------------------------- (7)
3068
- * grq->avg.load_avg
3069
- *
3070
- * However, analogous to above, since the avg numbers are slow, this leads to
3071
- * transients in the from-idle case. Instead we use:
3072
- *
3073
- * ge->runnable_weight = ge->load.weight *
3074
- *
3075
- * max(grq->avg.runnable_load_avg, grq->runnable_weight)
3076
- * ----------------------------------------------------- (8)
3077
- * max(grq->avg.load_avg, grq->load.weight)
3078
- *
3079
- * Where these max() serve both to use the 'instant' values to fix the slow
3080
- * from-idle and avoid the /0 on to-idle, similar to (6).
3081
- */
3082
-static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
3083
-{
3084
- long runnable, load_avg;
3085
-
3086
- load_avg = max(cfs_rq->avg.load_avg,
3087
- scale_load_down(cfs_rq->load.weight));
3088
-
3089
- runnable = max(cfs_rq->avg.runnable_load_avg,
3090
- scale_load_down(cfs_rq->runnable_weight));
3091
-
3092
- runnable *= shares;
3093
- if (load_avg)
3094
- runnable /= load_avg;
3095
-
3096
- return clamp_t(long, runnable, MIN_SHARES, shares);
3097
-}
30983251 #endif /* CONFIG_SMP */
30993252
31003253 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
@@ -3106,7 +3259,7 @@
31063259 static void update_cfs_group(struct sched_entity *se)
31073260 {
31083261 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3109
- long shares, runnable;
3262
+ long shares;
31103263
31113264 if (!gcfs_rq)
31123265 return;
@@ -3115,16 +3268,15 @@
31153268 return;
31163269
31173270 #ifndef CONFIG_SMP
3118
- runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
3271
+ shares = READ_ONCE(gcfs_rq->tg->shares);
31193272
31203273 if (likely(se->load.weight == shares))
31213274 return;
31223275 #else
31233276 shares = calc_group_shares(gcfs_rq);
3124
- runnable = calc_group_runnable(gcfs_rq, shares);
31253277 #endif
31263278
3127
- reweight_entity(cfs_rq_of(se), se, shares, runnable);
3279
+ reweight_entity(cfs_rq_of(se), se, shares);
31283280 }
31293281
31303282 #else /* CONFIG_FAIR_GROUP_SCHED */
@@ -3137,7 +3289,7 @@
31373289 {
31383290 struct rq *rq = rq_of(cfs_rq);
31393291
3140
- if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
3292
+ if (&rq->cfs == cfs_rq) {
31413293 /*
31423294 * There are a few boundary cases this might miss but it should
31433295 * get called often enough that that should (hopefully) not be
@@ -3161,7 +3313,6 @@
31613313 /**
31623314 * update_tg_load_avg - update the tg's load avg
31633315 * @cfs_rq: the cfs_rq whose avg changed
3164
- * @force: update regardless of how small the difference
31653316 *
31663317 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
31673318 * However, because tg->load_avg is a global value there are performance
@@ -3173,7 +3324,7 @@
31733324 *
31743325 * Updating tg's load_avg is necessary before update_cfs_share().
31753326 */
3176
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3327
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
31773328 {
31783329 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
31793330
@@ -3183,11 +3334,9 @@
31833334 if (cfs_rq->tg == &root_task_group)
31843335 return;
31853336
3186
- if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3337
+ if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
31873338 atomic_long_add(delta, &cfs_rq->tg->load_avg);
31883339 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3189
-
3190
- trace_sched_load_tg(cfs_rq);
31913340 }
31923341 }
31933342
@@ -3240,7 +3389,6 @@
32403389 se->avg.last_update_time = n_last_update_time;
32413390 }
32423391
3243
-
32443392 /*
32453393 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
32463394 * propagate its contribution. The key to this propagation is the invariant
@@ -3251,11 +3399,11 @@
32513399 * _IFF_ we look at the pure running and runnable sums. Because they
32523400 * represent the very same entity, just at different points in the hierarchy.
32533401 *
3254
- * Per the above update_tg_cfs_util() is trivial and simply copies the running
3255
- * sum over (but still wrong, because the group entity and group rq do not have
3256
- * their PELT windows aligned).
3402
+ * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
3403
+ * and simply copies the running/runnable sum over (but still wrong, because
3404
+ * the group entity and group rq do not have their PELT windows aligned).
32573405 *
3258
- * However, update_tg_cfs_runnable() is more complex. So we have:
3406
+ * However, update_tg_cfs_load() is more complex. So we have:
32593407 *
32603408 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
32613409 *
@@ -3308,45 +3456,75 @@
33083456 * XXX: only do this for the part of runnable > running ?
33093457 *
33103458 */
3311
-
33123459 static inline void
33133460 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
33143461 {
33153462 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3463
+ u32 divider;
33163464
33173465 /* Nothing to update */
33183466 if (!delta)
33193467 return;
33203468
33213469 /*
3322
- * The relation between sum and avg is:
3323
- *
3324
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
3325
- *
3326
- * however, the PELT windows are not aligned between grq and gse.
3470
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3471
+ * See ___update_load_avg() for details.
33273472 */
3473
+ divider = get_pelt_divider(&cfs_rq->avg);
33283474
33293475 /* Set new sched_entity's utilization */
33303476 se->avg.util_avg = gcfs_rq->avg.util_avg;
3331
- se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3477
+ se->avg.util_sum = se->avg.util_avg * divider;
33323478
33333479 /* Update parent cfs_rq utilization */
33343480 add_positive(&cfs_rq->avg.util_avg, delta);
3335
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3481
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
33363482 }
33373483
33383484 static inline void
33393485 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
33403486 {
3487
+ long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3488
+ u32 divider;
3489
+
3490
+ /* Nothing to update */
3491
+ if (!delta)
3492
+ return;
3493
+
3494
+ /*
3495
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3496
+ * See ___update_load_avg() for details.
3497
+ */
3498
+ divider = get_pelt_divider(&cfs_rq->avg);
3499
+
3500
+ /* Set new sched_entity's runnable */
3501
+ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
3502
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
3503
+
3504
+ /* Update parent cfs_rq runnable */
3505
+ add_positive(&cfs_rq->avg.runnable_avg, delta);
3506
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3507
+}
3508
+
3509
+static inline void
3510
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3511
+{
33413512 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3342
- unsigned long runnable_load_avg, load_avg;
3343
- u64 runnable_load_sum, load_sum = 0;
3513
+ unsigned long load_avg;
3514
+ u64 load_sum = 0;
33443515 s64 delta_sum;
3516
+ u32 divider;
33453517
33463518 if (!runnable_sum)
33473519 return;
33483520
33493521 gcfs_rq->prop_runnable_sum = 0;
3522
+
3523
+ /*
3524
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3525
+ * See ___update_load_avg() for details.
3526
+ */
3527
+ divider = get_pelt_divider(&cfs_rq->avg);
33503528
33513529 if (runnable_sum >= 0) {
33523530 /*
@@ -3354,7 +3532,7 @@
33543532 * the CPU is saturated running == runnable.
33553533 */
33563534 runnable_sum += se->avg.load_sum;
3357
- runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3535
+ runnable_sum = min_t(long, runnable_sum, divider);
33583536 } else {
33593537 /*
33603538 * Estimate the new unweighted runnable_sum of the gcfs_rq by
@@ -3379,7 +3557,7 @@
33793557 runnable_sum = max(runnable_sum, running_sum);
33803558
33813559 load_sum = (s64)se_weight(se) * runnable_sum;
3382
- load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3560
+ load_avg = div_s64(load_sum, divider);
33833561
33843562 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
33853563 delta_avg = load_avg - se->avg.load_avg;
@@ -3388,19 +3566,6 @@
33883566 se->avg.load_avg = load_avg;
33893567 add_positive(&cfs_rq->avg.load_avg, delta_avg);
33903568 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3391
-
3392
- runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3393
- runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3394
- delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3395
- delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3396
-
3397
- se->avg.runnable_load_sum = runnable_sum;
3398
- se->avg.runnable_load_avg = runnable_load_avg;
3399
-
3400
- if (se->on_rq) {
3401
- add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3402
- add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3403
- }
34043569 }
34053570
34063571 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3429,9 +3594,10 @@
34293594
34303595 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
34313596 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3597
+ update_tg_cfs_load(cfs_rq, se, gcfs_rq);
34323598
3433
- trace_sched_load_cfs_rq(cfs_rq);
3434
- trace_sched_load_se(se);
3599
+ trace_pelt_cfs_tp(cfs_rq);
3600
+ trace_pelt_se_tp(se);
34353601
34363602 return 1;
34373603 }
@@ -3468,7 +3634,7 @@
34683634
34693635 #else /* CONFIG_FAIR_GROUP_SCHED */
34703636
3471
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3637
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
34723638
34733639 static inline int propagate_entity_load_avg(struct sched_entity *se)
34743640 {
@@ -3498,18 +3664,18 @@
34983664 static inline int
34993665 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
35003666 {
3501
- unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3667
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
35023668 struct sched_avg *sa = &cfs_rq->avg;
35033669 int decayed = 0;
35043670
35053671 if (cfs_rq->removed.nr) {
35063672 unsigned long r;
3507
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3673
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
35083674
35093675 raw_spin_lock(&cfs_rq->removed.lock);
35103676 swap(cfs_rq->removed.util_avg, removed_util);
35113677 swap(cfs_rq->removed.load_avg, removed_load);
3512
- swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3678
+ swap(cfs_rq->removed.runnable_avg, removed_runnable);
35133679 cfs_rq->removed.nr = 0;
35143680 raw_spin_unlock(&cfs_rq->removed.lock);
35153681
@@ -3520,8 +3686,29 @@
35203686 r = removed_util;
35213687 sub_positive(&sa->util_avg, r);
35223688 sub_positive(&sa->util_sum, r * divider);
3689
+ /*
3690
+ * Because of rounding, se->util_sum might ends up being +1 more than
3691
+ * cfs->util_sum. Although this is not a problem by itself, detaching
3692
+ * a lot of tasks with the rounding problem between 2 updates of
3693
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
3694
+ * cfs_util_avg is not.
3695
+ * Check that util_sum is still above its lower bound for the new
3696
+ * util_avg. Given that period_contrib might have moved since the last
3697
+ * sync, we are only sure that util_sum must be above or equal to
3698
+ * util_avg * minimum possible divider
3699
+ */
3700
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
35233701
3524
- add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3702
+ r = removed_runnable;
3703
+ sub_positive(&sa->runnable_avg, r);
3704
+ sub_positive(&sa->runnable_sum, r * divider);
3705
+
3706
+ /*
3707
+ * removed_runnable is the unweighted version of removed_load so we
3708
+ * can use it to estimate removed_load_sum.
3709
+ */
3710
+ add_tg_cfs_propagate(cfs_rq,
3711
+ -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
35253712
35263713 decayed = 1;
35273714 }
@@ -3533,9 +3720,6 @@
35333720 cfs_rq->load_last_update_time_copy = sa->last_update_time;
35343721 #endif
35353722
3536
- if (decayed)
3537
- cfs_rq_util_change(cfs_rq, 0);
3538
-
35393723 return decayed;
35403724 }
35413725
@@ -3543,14 +3727,17 @@
35433727 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
35443728 * @cfs_rq: cfs_rq to attach to
35453729 * @se: sched_entity to attach
3546
- * @flags: migration hints
35473730 *
35483731 * Must call update_cfs_rq_load_avg() before this, since we rely on
35493732 * cfs_rq->avg.last_update_time being current.
35503733 */
3551
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3734
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
35523735 {
3553
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3736
+ /*
3737
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3738
+ * See ___update_load_avg() for details.
3739
+ */
3740
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
35543741
35553742 /*
35563743 * When we attach the @se to the @cfs_rq, we must align the decay
@@ -3570,23 +3757,25 @@
35703757 */
35713758 se->avg.util_sum = se->avg.util_avg * divider;
35723759
3573
- se->avg.load_sum = divider;
3574
- if (se_weight(se)) {
3575
- se->avg.load_sum =
3576
- div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3577
- }
3760
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
35783761
3579
- se->avg.runnable_load_sum = se->avg.load_sum;
3762
+ se->avg.load_sum = se->avg.load_avg * divider;
3763
+ if (se_weight(se) < se->avg.load_sum)
3764
+ se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
3765
+ else
3766
+ se->avg.load_sum = 1;
35803767
35813768 enqueue_load_avg(cfs_rq, se);
35823769 cfs_rq->avg.util_avg += se->avg.util_avg;
35833770 cfs_rq->avg.util_sum += se->avg.util_sum;
3771
+ cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
3772
+ cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
35843773
35853774 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
35863775
3587
- cfs_rq_util_change(cfs_rq, flags);
3776
+ cfs_rq_util_change(cfs_rq, 0);
35883777
3589
- trace_sched_load_cfs_rq(cfs_rq);
3778
+ trace_pelt_cfs_tp(cfs_rq);
35903779 }
35913780
35923781 /**
@@ -3602,12 +3791,14 @@
36023791 dequeue_load_avg(cfs_rq, se);
36033792 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
36043793 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3794
+ sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
3795
+ sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
36053796
36063797 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
36073798
36083799 cfs_rq_util_change(cfs_rq, 0);
36093800
3610
- trace_sched_load_cfs_rq(cfs_rq);
3801
+ trace_pelt_cfs_tp(cfs_rq);
36113802 }
36123803
36133804 /*
@@ -3623,12 +3814,15 @@
36233814 u64 now = cfs_rq_clock_pelt(cfs_rq);
36243815 int decayed;
36253816
3817
+ trace_android_vh_prepare_update_load_avg_se(se, flags);
36263818 /*
36273819 * Track task load average for carrying it to new CPU after migrated, and
36283820 * track group sched_entity load average for task_h_load calc in migration
36293821 */
36303822 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
36313823 __update_load_avg_se(now, cfs_rq, se);
3824
+
3825
+ trace_android_vh_finish_update_load_avg_se(se, flags);
36323826
36333827 decayed = update_cfs_rq_load_avg(now, cfs_rq);
36343828 decayed |= propagate_entity_load_avg(se);
....@@ -3642,11 +3836,15 @@
36423836 *
36433837 * IOW we're enqueueing a task on a new CPU.
36443838 */
3645
- attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3646
- update_tg_load_avg(cfs_rq, 0);
3839
+ attach_entity_load_avg(cfs_rq, se);
3840
+ update_tg_load_avg(cfs_rq);
36473841
3648
- } else if (decayed && (flags & UPDATE_TG))
3649
- update_tg_load_avg(cfs_rq, 0);
3842
+ } else if (decayed) {
3843
+ cfs_rq_util_change(cfs_rq, 0);
3844
+
3845
+ if (flags & UPDATE_TG)
3846
+ update_tg_load_avg(cfs_rq);
3847
+ }
36503848 }
36513849
36523850 #ifndef CONFIG_64BIT
....@@ -3674,20 +3872,22 @@
36743872 * Synchronize entity load avg of dequeued entity without locking
36753873 * the previous rq.
36763874 */
3677
-void sync_entity_load_avg(struct sched_entity *se)
3875
+static void sync_entity_load_avg(struct sched_entity *se)
36783876 {
36793877 struct cfs_rq *cfs_rq = cfs_rq_of(se);
36803878 u64 last_update_time;
36813879
36823880 last_update_time = cfs_rq_last_update_time(cfs_rq);
3881
+ trace_android_vh_prepare_update_load_avg_se(se, 0);
36833882 __update_load_avg_blocked_se(last_update_time, se);
3883
+ trace_android_vh_finish_update_load_avg_se(se, 0);
36843884 }
36853885
36863886 /*
36873887 * Task first catches up with cfs_rq, and then subtract
36883888 * itself from the cfs_rq (task must be off the queue now).
36893889 */
3690
-void remove_entity_load_avg(struct sched_entity *se)
3890
+static void remove_entity_load_avg(struct sched_entity *se)
36913891 {
36923892 struct cfs_rq *cfs_rq = cfs_rq_of(se);
36933893 unsigned long flags;
....@@ -3696,10 +3896,6 @@
36963896 * tasks cannot exit without having gone through wake_up_new_task() ->
36973897 * post_init_entity_util_avg() which will have added things to the
36983898 * cfs_rq, so we can remove unconditionally.
3699
- *
3700
- * Similarly for groups, they will have passed through
3701
- * post_init_entity_util_avg() before unregister_sched_fair_group()
3702
- * calls this.
37033899 */
37043900
37053901 sync_entity_load_avg(se);
....@@ -3708,13 +3904,13 @@
37083904 ++cfs_rq->removed.nr;
37093905 cfs_rq->removed.util_avg += se->avg.util_avg;
37103906 cfs_rq->removed.load_avg += se->avg.load_avg;
3711
- cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
3907
+ cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
37123908 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
37133909 }
37143910
3715
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3911
+static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
37163912 {
3717
- return cfs_rq->avg.runnable_load_avg;
3913
+ return cfs_rq->avg.runnable_avg;
37183914 }
37193915
37203916 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
....@@ -3722,7 +3918,7 @@
37223918 return cfs_rq->avg.load_avg;
37233919 }
37243920
3725
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3921
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
37263922
37273923 static inline unsigned long task_util(struct task_struct *p)
37283924 {
....@@ -3733,10 +3929,10 @@
37333929 {
37343930 struct util_est ue = READ_ONCE(p->se.avg.util_est);
37353931
3736
- return max(ue.ewma, ue.enqueued);
3932
+ return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
37373933 }
37383934
3739
-unsigned long task_util_est(struct task_struct *p)
3935
+static inline unsigned long task_util_est(struct task_struct *p)
37403936 {
37413937 return max(task_util(p), _task_util_est(p));
37423938 }
....@@ -3765,13 +3961,29 @@
37653961
37663962 /* Update root cfs_rq's estimated utilization */
37673963 enqueued = cfs_rq->avg.util_est.enqueued;
3768
- enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
3964
+ enqueued += _task_util_est(p);
37693965 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
37703966
3771
- /* Update plots for Task and CPU estimated utilization */
3772
- trace_sched_util_est_task(p, &p->se.avg);
3773
- trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
3967
+ trace_sched_util_est_cfs_tp(cfs_rq);
37743968 }
3969
+
3970
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
3971
+ struct task_struct *p)
3972
+{
3973
+ unsigned int enqueued;
3974
+
3975
+ if (!sched_feat(UTIL_EST))
3976
+ return;
3977
+
3978
+ /* Update root cfs_rq's estimated utilization */
3979
+ enqueued = cfs_rq->avg.util_est.enqueued;
3980
+ enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
3981
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3982
+
3983
+ trace_sched_util_est_cfs_tp(cfs_rq);
3984
+}
3985
+
3986
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
37753987
37763988 /*
37773989 * Check if a (signed) value is within a specified (unsigned) margin,
....@@ -3786,24 +3998,20 @@
37863998 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
37873999 }
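within_margin() turns a two-sided range check into a single unsigned compare: adding (margin - 1) shifts the open interval (-margin, margin) onto [0, 2*margin - 2], and anything outside wraps around to a huge unsigned value that fails the < test. A quick self-check of the boundaries, with arbitrary numbers:

#include <assert.h>

/* same trick as within_margin(): true iff -margin < value < margin */
static int within_margin(long value, unsigned int margin)
{
	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}

int main(void)
{
	unsigned int m = 1024 / 100;	/* ~1% of SCHED_CAPACITY_SCALE */

	assert( within_margin(0, m));
	assert( within_margin((long)m - 1, m));		/* just inside   */
	assert(!within_margin((long)m, m));		/* edge excluded */
	assert( within_margin(-(long)m + 1, m));
	assert(!within_margin(-(long)m, m));
	return 0;
}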
37884000
3789
-static void
3790
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
4001
+static inline void util_est_update(struct cfs_rq *cfs_rq,
4002
+ struct task_struct *p,
4003
+ bool task_sleep)
37914004 {
3792
- long last_ewma_diff;
4005
+ long last_ewma_diff, last_enqueued_diff;
37934006 struct util_est ue;
3794
- int cpu;
4007
+ int ret = 0;
4008
+
4009
+ trace_android_rvh_util_est_update(cfs_rq, p, task_sleep, &ret);
4010
+ if (ret)
4011
+ return;
37954012
37964013 if (!sched_feat(UTIL_EST))
37974014 return;
3798
-
3799
- /* Update root cfs_rq's estimated utilization */
3800
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
3801
- ue.enqueued -= min_t(unsigned int, ue.enqueued,
3802
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
3803
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3804
-
3805
- /* Update plots for CPU's estimated utilization */
3806
- trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
38074015
38084016 /*
38094017 * Skip update of task's estimated utilization when the task has not
....@@ -3820,11 +4028,13 @@
38204028 if (ue.enqueued & UTIL_AVG_UNCHANGED)
38214029 return;
38224030
4031
+ last_enqueued_diff = ue.enqueued;
4032
+
38234033 /*
38244034 * Reset EWMA on utilization increases, the moving average is used only
38254035 * to smooth utilization decreases.
38264036 */
3827
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
4037
+ ue.enqueued = task_util(p);
38284038 if (sched_feat(UTIL_EST_FASTUP)) {
38294039 if (ue.ewma < ue.enqueued) {
38304040 ue.ewma = ue.enqueued;
....@@ -3833,19 +4043,23 @@
38334043 }
38344044
38354045 /*
3836
- * Skip update of task's estimated utilization when its EWMA is
4046
+ * Skip update of task's estimated utilization when its members are
38374047 * already ~1% close to its last activation value.
38384048 */
38394049 last_ewma_diff = ue.enqueued - ue.ewma;
3840
- if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
4050
+ last_enqueued_diff -= ue.enqueued;
4051
+ if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
4052
+ if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
4053
+ goto done;
4054
+
38414055 return;
4056
+ }
38424057
38434058 /*
38444059 * To avoid overestimation of actual task utilization, skip updates if
38454060 * we cannot guarantee there is idle time on this CPU.
38464061 */
3847
- cpu = cpu_of(rq_of(cfs_rq));
3848
- if (task_util(p) > capacity_orig_of(cpu))
4062
+ if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
38494063 return;
38504064
38514065 /*
....@@ -3869,39 +4083,26 @@
38694083 ue.ewma += last_ewma_diff;
38704084 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
38714085 done:
4086
+ ue.enqueued |= UTIL_AVG_UNCHANGED;
38724087 WRITE_ONCE(p->se.avg.util_est, ue);
38734088
3874
- /* Update plots for Task's estimated utilization */
3875
- trace_sched_util_est_task(p, &p->se.avg);
4089
+ trace_sched_util_est_se_tp(&p->se);
38764090 }
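The EWMA step at the end of util_est_update() gives the new sample one quarter of the weight (assuming the kernel's UTIL_EST_WEIGHT_SHIFT of 2): ewma' = ewma + (enqueued - ewma) / 4, computed with shifts only. A standalone sketch of that arithmetic, including the UTIL_EST_FASTUP reset on utilization increases:

#include <stdio.h>

#define UTIL_EST_WEIGHT_SHIFT	2	/* new sample gets 1/4 weight */

/* mirrors the ewma half of util_est_update(); resets on increases (FASTUP) */
static unsigned int ewma_update(unsigned int ewma, unsigned int enqueued)
{
	long diff;

	if (enqueued >= ewma)
		return enqueued;		/* UTIL_EST_FASTUP behaviour */

	diff = (long)enqueued - (long)ewma;	/* negative here */
	ewma <<= UTIL_EST_WEIGHT_SHIFT;		/* ewma * 4      */
	ewma += diff;				/* + (new - old) */
	ewma >>= UTIL_EST_WEIGHT_SHIFT;		/* / 4           */
	return ewma;
}

int main(void)
{
	unsigned int ewma = 400;

	/* utilization dropped to 100: the estimate decays by quarters */
	for (int i = 0; i < 4; i++) {
		ewma = ewma_update(ewma, 100);
		printf("step %d: ewma=%u\n", i, ewma);
	}
	return 0;
}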
38774091
38784092 static inline int task_fits_capacity(struct task_struct *p, long capacity)
38794093 {
3880
- return capacity * 1024 > uclamp_task_util(p) * capacity_margin;
4094
+ return fits_capacity(uclamp_task_util(p), capacity);
38814095 }
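fits_capacity() enforces roughly 20% headroom: a utilization fits a CPU only while util * 1280 < capacity * 1024, i.e. util stays below 80% of the capacity. A minimal check of where the cut-off lands, assuming the same 1280/1024 constants used by the macro elsewhere in this file:

#include <stdio.h>

/* same ~20% margin as the kernel's fits_capacity() macro */
#define fits_capacity(cap, max)	((cap) * 1280UL < (max) * 1024UL)

int main(void)
{
	unsigned long capacity = 446;	/* e.g. a little CPU */

	/* the largest utilization that still fits is just under 80% of capacity */
	for (unsigned long util = 350; util <= 360; util++) {
		if (!fits_capacity(util, capacity)) {
			printf("first misfit at util=%lu (80%% of %lu is ~%lu)\n",
			       util, capacity, capacity * 4 / 5);
			break;
		}
	}
	return 0;
}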
3882
-
3883
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
3884
-static inline bool task_fits_max(struct task_struct *p, int cpu)
3885
-{
3886
- unsigned long capacity = capacity_of(cpu);
3887
- unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
3888
-
3889
- if (capacity == max_capacity)
3890
- return true;
3891
-
3892
- if (capacity * capacity_margin > max_capacity * 1024)
3893
- return true;
3894
-
3895
- return task_fits_capacity(p, capacity);
3896
-}
3897
-#endif
38984096
38994097 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
39004098 {
3901
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
4099
+ bool need_update = true;
4100
+
4101
+ trace_android_rvh_update_misfit_status(p, rq, &need_update);
4102
+ if (!static_branch_unlikely(&sched_asym_cpucapacity) || !need_update)
39024103 return;
39034104
3904
- if (!p) {
4105
+ if (!p || p->nr_cpus_allowed == 1) {
39054106 rq->misfit_task_load = 0;
39064107 return;
39074108 }
....@@ -3911,7 +4112,11 @@
39114112 return;
39124113 }
39134114
3914
- rq->misfit_task_load = task_h_load(p);
4115
+ /*
4116
+ * Make sure that misfit_task_load will not be null even if
4117
+ * task_h_load() returns 0.
4118
+ */
4119
+ rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
39154120 }
39164121
39174122 #else /* CONFIG_SMP */
....@@ -3928,11 +4133,11 @@
39284133 static inline void remove_entity_load_avg(struct sched_entity *se) {}
39294134
39304135 static inline void
3931
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
4136
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
39324137 static inline void
39334138 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
39344139
3935
-static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
4140
+static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
39364141 {
39374142 return 0;
39384143 }
....@@ -3941,8 +4146,11 @@
39414146 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
39424147
39434148 static inline void
3944
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
3945
- bool task_sleep) {}
4149
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4150
+
4151
+static inline void
4152
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
4153
+ bool task_sleep) {}
39464154 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
39474155
39484156 #endif /* CONFIG_SMP */
....@@ -3990,6 +4198,7 @@
39904198
39914199 /* ensure we never gain time by being placed backwards. */
39924200 se->vruntime = max_vruntime(se->vruntime, vruntime);
4201
+ trace_android_rvh_place_entity(cfs_rq, se, initial, vruntime);
39934202 }
39944203
39954204 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
....@@ -4014,6 +4223,7 @@
40144223 #endif
40154224 }
40164225
4226
+static inline bool cfs_bandwidth_used(void);
40174227
40184228 /*
40194229 * MIGRATION
....@@ -4078,8 +4288,8 @@
40784288 * - Add its new weight to cfs_rq->load.weight
40794289 */
40804290 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4291
+ se_update_runnable(se);
40814292 update_cfs_group(se);
4082
- enqueue_runnable_load_avg(cfs_rq, se);
40834293 account_entity_enqueue(cfs_rq, se);
40844294
40854295 if (flags & ENQUEUE_WAKEUP)
....@@ -4092,10 +4302,16 @@
40924302 __enqueue_entity(cfs_rq, se);
40934303 se->on_rq = 1;
40944304
4095
- if (cfs_rq->nr_running == 1) {
4305
+ /*
4306
+ * When bandwidth control is enabled, cfs might have been removed
4307
+ * because of a parent being throttled but cfs->nr_running > 1. Try to
4308
+ * add it unconditionally.
4309
+ */
4310
+ if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
40964311 list_add_leaf_cfs_rq(cfs_rq);
4312
+
4313
+ if (cfs_rq->nr_running == 1)
40974314 check_enqueue_throttle(cfs_rq);
4098
- }
40994315 }
41004316
41014317 static void __clear_buddies_last(struct sched_entity *se)
....@@ -4156,13 +4372,13 @@
41564372 /*
41574373 * When dequeuing a sched_entity, we must:
41584374 * - Update loads to have both entity and cfs_rq synced with now.
4159
- * - Substract its load from the cfs_rq->runnable_avg.
4160
- * - Substract its previous weight from cfs_rq->load.weight.
4375
+ * - Subtract its load from the cfs_rq->runnable_avg.
4376
+ * - Subtract its previous weight from cfs_rq->load.weight.
41614377 * - For group entity, update its weight to reflect the new share
41624378 * of its group cfs_rq.
41634379 */
41644380 update_load_avg(cfs_rq, se, UPDATE_TG);
4165
- dequeue_runnable_load_avg(cfs_rq, se);
4381
+ se_update_runnable(se);
41664382
41674383 update_stats_dequeue(cfs_rq, se, flags);
41684384
....@@ -4206,9 +4422,14 @@
42064422 unsigned long ideal_runtime, delta_exec;
42074423 struct sched_entity *se;
42084424 s64 delta;
4425
+ bool skip_preempt = false;
42094426
42104427 ideal_runtime = sched_slice(cfs_rq, curr);
42114428 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4429
+ trace_android_rvh_check_preempt_tick(current, &ideal_runtime, &skip_preempt,
4430
+ delta_exec, cfs_rq, curr, sysctl_sched_min_granularity);
4431
+ if (skip_preempt)
4432
+ return;
42124433 if (delta_exec > ideal_runtime) {
42134434 resched_curr_lazy(rq_of(cfs_rq));
42144435 /*
....@@ -4237,8 +4458,7 @@
42374458 resched_curr_lazy(rq_of(cfs_rq));
42384459 }
42394460
4240
-static void
4241
-set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4461
+void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
42424462 {
42434463 /* 'current' is not kept within the tree. */
42444464 if (se->on_rq) {
....@@ -4260,7 +4480,8 @@
42604480 * least twice that of our own weight (i.e. dont track it
42614481 * when there are only lesser-weight tasks around):
42624482 */
4263
- if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4483
+ if (schedstat_enabled() &&
4484
+ rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
42644485 schedstat_set(se->statistics.slice_max,
42654486 max((u64)schedstat_val(se->statistics.slice_max),
42664487 se->sum_exec_runtime - se->prev_sum_exec_runtime));
....@@ -4268,6 +4489,8 @@
42684489
42694490 se->prev_sum_exec_runtime = se->sum_exec_runtime;
42704491 }
4492
+EXPORT_SYMBOL_GPL(set_next_entity);
4493
+
42714494
42724495 static int
42734496 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
....@@ -4283,7 +4506,11 @@
42834506 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
42844507 {
42854508 struct sched_entity *left = __pick_first_entity(cfs_rq);
4286
- struct sched_entity *se;
4509
+ struct sched_entity *se = NULL;
4510
+
4511
+ trace_android_rvh_pick_next_entity(cfs_rq, curr, &se);
4512
+ if (se)
4513
+ goto done;
42874514
42884515 /*
42894516 * If curr is set we have to see if its left of the leftmost entity
....@@ -4313,18 +4540,19 @@
43134540 se = second;
43144541 }
43154542
4316
- /*
4317
- * Prefer last buddy, try to return the CPU to a preempted task.
4318
- */
4319
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4320
- se = cfs_rq->last;
4321
-
4322
- /*
4323
- * Someone really wants this to run. If it's not unfair, run it.
4324
- */
4325
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4543
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
4544
+ /*
4545
+ * Someone really wants this to run. If it's not unfair, run it.
4546
+ */
43264547 se = cfs_rq->next;
4548
+ } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
4549
+ /*
4550
+ * Prefer last buddy, try to return the CPU to a preempted task.
4551
+ */
4552
+ se = cfs_rq->last;
4553
+ }
43274554
4555
+done:
43284556 clear_buddies(cfs_rq, se);
43294557
43304558 return se;
....@@ -4457,26 +4685,17 @@
44574685 return &tg->cfs_bandwidth;
44584686 }
44594687
4460
-/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
4461
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4462
-{
4463
- if (unlikely(cfs_rq->throttle_count))
4464
- return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
4465
-
4466
- return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
4467
-}
4468
-
44694688 /* returns 0 on failure to allocate runtime */
4470
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4689
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
4690
+ struct cfs_rq *cfs_rq, u64 target_runtime)
44714691 {
4472
- struct task_group *tg = cfs_rq->tg;
4473
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4474
- u64 amount = 0, min_amount;
4692
+ u64 min_amount, amount = 0;
4693
+
4694
+ lockdep_assert_held(&cfs_b->lock);
44754695
44764696 /* note: this is a positive sum as runtime_remaining <= 0 */
4477
- min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4697
+ min_amount = target_runtime - cfs_rq->runtime_remaining;
44784698
4479
- raw_spin_lock(&cfs_b->lock);
44804699 if (cfs_b->quota == RUNTIME_INF)
44814700 amount = min_amount;
44824701 else {
....@@ -4488,11 +4707,23 @@
44884707 cfs_b->idle = 0;
44894708 }
44904709 }
4491
- raw_spin_unlock(&cfs_b->lock);
44924710
44934711 cfs_rq->runtime_remaining += amount;
44944712
44954713 return cfs_rq->runtime_remaining > 0;
4714
+}
4715
+
4716
+/* returns 0 on failure to allocate runtime */
4717
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4718
+{
4719
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4720
+ int ret;
4721
+
4722
+ raw_spin_lock(&cfs_b->lock);
4723
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
4724
+ raw_spin_unlock(&cfs_b->lock);
4725
+
4726
+ return ret;
44964727 }
44974728
44984729 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
....@@ -4557,9 +4788,8 @@
45574788
45584789 cfs_rq->throttle_count--;
45594790 if (!cfs_rq->throttle_count) {
4560
- /* adjust cfs_rq_clock_task() */
4561
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4562
- cfs_rq->throttled_clock_task;
4791
+ cfs_rq->throttled_clock_pelt_time += rq_clock_task_mult(rq) -
4792
+ cfs_rq->throttled_clock_pelt;
45634793
45644794 /* Add cfs_rq with already running entity in the list */
45654795 if (cfs_rq->nr_running >= 1)
....@@ -4576,7 +4806,7 @@
45764806
45774807 /* group is entering throttled state, stop time */
45784808 if (!cfs_rq->throttle_count) {
4579
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
4809
+ cfs_rq->throttled_clock_pelt = rq_clock_task_mult(rq);
45804810 list_del_leaf_cfs_rq(cfs_rq);
45814811 }
45824812 cfs_rq->throttle_count++;
....@@ -4584,13 +4814,33 @@
45844814 return 0;
45854815 }
45864816
4587
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4817
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
45884818 {
45894819 struct rq *rq = rq_of(cfs_rq);
45904820 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
45914821 struct sched_entity *se;
4592
- long task_delta, dequeue = 1;
4593
- bool empty;
4822
+ long task_delta, idle_task_delta, dequeue = 1;
4823
+
4824
+ raw_spin_lock(&cfs_b->lock);
4825
+ /* This will start the period timer if necessary */
4826
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
4827
+ /*
4828
+ * We have raced with bandwidth becoming available, and if we
4829
+ * actually throttled the timer might not unthrottle us for an
4830
+ * entire period. We additionally needed to make sure that any
4831
+ * subsequent check_cfs_rq_runtime calls agree not to throttle
4832
+ * us, as we may commit to do cfs put_prev+pick_next, so we ask
4833
+ * for 1ns of runtime rather than just check cfs_b.
4834
+ */
4835
+ dequeue = 0;
4836
+ } else {
4837
+ list_add_tail_rcu(&cfs_rq->throttled_list,
4838
+ &cfs_b->throttled_cfs_rq);
4839
+ }
4840
+ raw_spin_unlock(&cfs_b->lock);
4841
+
4842
+ if (!dequeue)
4843
+ return false; /* Throttle no longer required. */
45944844
45954845 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
45964846
....@@ -4600,15 +4850,22 @@
46004850 rcu_read_unlock();
46014851
46024852 task_delta = cfs_rq->h_nr_running;
4853
+ idle_task_delta = cfs_rq->idle_h_nr_running;
46034854 for_each_sched_entity(se) {
46044855 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
46054856 /* throttled entity or throttle-on-deactivate */
46064857 if (!se->on_rq)
46074858 break;
46084859
4609
- if (dequeue)
4860
+ if (dequeue) {
46104861 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4862
+ } else {
4863
+ update_load_avg(qcfs_rq, se, 0);
4864
+ se_update_runnable(se);
4865
+ }
4866
+
46114867 qcfs_rq->h_nr_running -= task_delta;
4868
+ qcfs_rq->idle_h_nr_running -= idle_task_delta;
46124869
46134870 if (qcfs_rq->load.weight)
46144871 dequeue = 0;
....@@ -4617,29 +4874,13 @@
46174874 if (!se)
46184875 sub_nr_running(rq, task_delta);
46194876
4877
+ /*
4878
+ * Note: distribution will already see us throttled via the
4879
+ * throttled-list. rq->lock protects completion.
4880
+ */
46204881 cfs_rq->throttled = 1;
46214882 cfs_rq->throttled_clock = rq_clock(rq);
4622
- raw_spin_lock(&cfs_b->lock);
4623
- empty = list_empty(&cfs_b->throttled_cfs_rq);
4624
-
4625
- /*
4626
- * Add to the _head_ of the list, so that an already-started
4627
- * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
4628
- * not running add to the tail so that later runqueues don't get starved.
4629
- */
4630
- if (cfs_b->distribute_running)
4631
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4632
- else
4633
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4634
-
4635
- /*
4636
- * If we're the first throttled task, make sure the bandwidth
4637
- * timer is running.
4638
- */
4639
- if (empty)
4640
- start_cfs_bandwidth(cfs_b);
4641
-
4642
- raw_spin_unlock(&cfs_b->lock);
4883
+ return true;
46434884 }
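The reworked throttle path above closes a race by asking the global pool for a token 1 ns of runtime while holding cfs_b->lock: if even that tiny request succeeds, bandwidth was just replenished and the throttle is abandoned; only on failure is the cfs_rq queued on the throttled list, under the same lock. A simplified pthread-based sketch of that check-and-queue-under-one-lock pattern, with made-up names and none of the kernel's period or timer handling:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct pool {
	pthread_mutex_t lock;
	long long runtime;	/* ns left in the current period */
};

/* grab at least 'want' ns from the pool; loosely modelled on __assign_cfs_rq_runtime() */
static bool assign_runtime(struct pool *p, long long *local, long long want)
{
	bool ok = false;

	pthread_mutex_lock(&p->lock);
	if (p->runtime >= want) {
		p->runtime -= want;
		*local += want;
		ok = true;
	}
	pthread_mutex_unlock(&p->lock);
	return ok;
}

int main(void)
{
	struct pool p = { .lock = PTHREAD_MUTEX_INITIALIZER, .runtime = 5 };
	long long local = 0;

	/* ask for a token 1 ns before committing to throttle */
	if (assign_runtime(&p, &local, 1))
		printf("raced with a refill: do not throttle\n");
	else
		printf("really out of runtime: queue on the throttled list\n");
	return 0;
}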
46444885
46454886 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
....@@ -4647,8 +4888,7 @@
46474888 struct rq *rq = rq_of(cfs_rq);
46484889 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
46494890 struct sched_entity *se;
4650
- int enqueue = 1;
4651
- long task_delta;
4891
+ long task_delta, idle_task_delta;
46524892
46534893 se = cfs_rq->tg->se[cpu_of(rq)];
46544894
....@@ -4668,34 +4908,70 @@
46684908 return;
46694909
46704910 task_delta = cfs_rq->h_nr_running;
4911
+ idle_task_delta = cfs_rq->idle_h_nr_running;
46714912 for_each_sched_entity(se) {
46724913 if (se->on_rq)
4673
- enqueue = 0;
4674
-
4914
+ break;
46754915 cfs_rq = cfs_rq_of(se);
4676
- if (enqueue)
4677
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4678
- cfs_rq->h_nr_running += task_delta;
4916
+ enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
46794917
4918
+ cfs_rq->h_nr_running += task_delta;
4919
+ cfs_rq->idle_h_nr_running += idle_task_delta;
4920
+
4921
+ /* end evaluation on encountering a throttled cfs_rq */
46804922 if (cfs_rq_throttled(cfs_rq))
4923
+ goto unthrottle_throttle;
4924
+ }
4925
+
4926
+ for_each_sched_entity(se) {
4927
+ cfs_rq = cfs_rq_of(se);
4928
+
4929
+ update_load_avg(cfs_rq, se, UPDATE_TG);
4930
+ se_update_runnable(se);
4931
+
4932
+ cfs_rq->h_nr_running += task_delta;
4933
+ cfs_rq->idle_h_nr_running += idle_task_delta;
4934
+
4935
+
4936
+ /* end evaluation on encountering a throttled cfs_rq */
4937
+ if (cfs_rq_throttled(cfs_rq))
4938
+ goto unthrottle_throttle;
4939
+
4940
+ /*
4941
+ * One parent has been throttled and cfs_rq removed from the
4942
+ * list. Add it back to not break the leaf list.
4943
+ */
4944
+ if (throttled_hierarchy(cfs_rq))
4945
+ list_add_leaf_cfs_rq(cfs_rq);
4946
+ }
4947
+
4948
+ /* At this point se is NULL and we are at root level*/
4949
+ add_nr_running(rq, task_delta);
4950
+
4951
+unthrottle_throttle:
4952
+ /*
4953
+ * The cfs_rq_throttled() breaks in the above iteration can result in
4954
+ * incomplete leaf list maintenance, resulting in triggering the
4955
+ * assertion below.
4956
+ */
4957
+ for_each_sched_entity(se) {
4958
+ cfs_rq = cfs_rq_of(se);
4959
+
4960
+ if (list_add_leaf_cfs_rq(cfs_rq))
46814961 break;
46824962 }
46834963
46844964 assert_list_leaf_cfs_rq(rq);
4685
-
4686
- if (!se)
4687
- add_nr_running(rq, task_delta);
46884965
46894966 /* Determine whether we need to wake up potentially idle CPU: */
46904967 if (rq->curr == rq->idle && rq->cfs.nr_running)
46914968 resched_curr(rq);
46924969 }
46934970
4694
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
4971
+static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
46954972 {
46964973 struct cfs_rq *cfs_rq;
4697
- u64 runtime;
4698
- u64 starting_runtime = remaining;
4974
+ u64 runtime, remaining = 1;
46994975
47004976 rcu_read_lock();
47014977 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
....@@ -4710,10 +4986,13 @@
47104986 /* By the above check, this should never be true */
47114987 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
47124988
4989
+ raw_spin_lock(&cfs_b->lock);
47134990 runtime = -cfs_rq->runtime_remaining + 1;
4714
- if (runtime > remaining)
4715
- runtime = remaining;
4716
- remaining -= runtime;
4991
+ if (runtime > cfs_b->runtime)
4992
+ runtime = cfs_b->runtime;
4993
+ cfs_b->runtime -= runtime;
4994
+ remaining = cfs_b->runtime;
4995
+ raw_spin_unlock(&cfs_b->lock);
47174996
47184997 cfs_rq->runtime_remaining += runtime;
47194998
....@@ -4728,8 +5007,6 @@
47285007 break;
47295008 }
47305009 rcu_read_unlock();
4731
-
4732
- return starting_runtime - remaining;
47335010 }
47345011
47355012 /*
....@@ -4740,7 +5017,6 @@
47405017 */
47415018 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
47425019 {
4743
- u64 runtime;
47445020 int throttled;
47455021
47465022 /* no need to continue the timer with no bandwidth constraint */
....@@ -4769,24 +5045,15 @@
47695045 cfs_b->nr_throttled += overrun;
47705046
47715047 /*
4772
- * This check is repeated as we are holding onto the new bandwidth while
4773
- * we unthrottle. This can potentially race with an unthrottled group
4774
- * trying to acquire new bandwidth from the global pool. This can result
4775
- * in us over-using our runtime if it is all used during this loop, but
4776
- * only by limited amounts in that extreme case.
5048
+ * This check is repeated as we release cfs_b->lock while we unthrottle.
47775049 */
4778
- while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4779
- runtime = cfs_b->runtime;
4780
- cfs_b->distribute_running = 1;
5050
+ while (throttled && cfs_b->runtime > 0) {
47815051 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
47825052 /* we can't nest cfs_b->lock while distributing bandwidth */
4783
- runtime = distribute_cfs_runtime(cfs_b, runtime);
5053
+ distribute_cfs_runtime(cfs_b);
47845054 raw_spin_lock_irqsave(&cfs_b->lock, flags);
47855055
4786
- cfs_b->distribute_running = 0;
47875056 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4788
-
4789
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
47905057 }
47915058
47925059 /*
....@@ -4842,6 +5109,11 @@
48425109 if (runtime_refresh_within(cfs_b, min_left))
48435110 return;
48445111
5112
+ /* don't push forwards an existing deferred unthrottle */
5113
+ if (cfs_b->slack_started)
5114
+ return;
5115
+ cfs_b->slack_started = true;
5116
+
48455117 hrtimer_start(&cfs_b->slack_timer,
48465118 ns_to_ktime(cfs_bandwidth_slack_period),
48475119 HRTIMER_MODE_REL);
....@@ -4893,10 +5165,7 @@
48935165
48945166 /* confirm we're still not at a refresh boundary */
48955167 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4896
- if (cfs_b->distribute_running) {
4897
- raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4898
- return;
4899
- }
5168
+ cfs_b->slack_started = false;
49005169
49015170 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
49025171 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
....@@ -4906,26 +5175,21 @@
49065175 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
49075176 runtime = cfs_b->runtime;
49085177
4909
- if (runtime)
4910
- cfs_b->distribute_running = 1;
4911
-
49125178 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49135179
49145180 if (!runtime)
49155181 return;
49165182
4917
- runtime = distribute_cfs_runtime(cfs_b, runtime);
5183
+ distribute_cfs_runtime(cfs_b);
49185184
49195185 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4920
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
4921
- cfs_b->distribute_running = 0;
49225186 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49235187 }
49245188
49255189 /*
49265190 * When a group wakes up we want to make sure that its quota is not already
49275191 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4928
- * runtime as update_curr() throttling can not not trigger until it's on-rq.
5192
+ * runtime as update_curr() throttling can not trigger until it's on-rq.
49295193 */
49305194 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
49315195 {
....@@ -4960,7 +5224,7 @@
49605224 pcfs_rq = tg->parent->cfs_rq[cpu];
49615225
49625226 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4963
- cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
5227
+ cfs_rq->throttled_clock_pelt = rq_clock_task_mult(cpu_rq(cpu));
49645228 }
49655229
49665230 /* conditionally throttle active cfs_rq's from put_prev_entity() */
....@@ -4979,8 +5243,7 @@
49795243 if (cfs_rq_throttled(cfs_rq))
49805244 return true;
49815245
4982
- throttle_cfs_rq(cfs_rq);
4983
- return true;
5246
+ return throttle_cfs_rq(cfs_rq);
49845247 }
49855248
49865249 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
....@@ -5009,6 +5272,8 @@
50095272 overrun = hrtimer_forward_now(timer, cfs_b->period);
50105273 if (!overrun)
50115274 break;
5275
+
5276
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
50125277
50135278 if (++count > 3) {
50145279 u64 new, old = ktime_to_ns(cfs_b->period);
....@@ -5039,8 +5304,6 @@
50395304 /* reset count so we don't come right back in here */
50405305 count = 0;
50415306 }
5042
-
5043
- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
50445307 }
50455308 if (idle)
50465309 cfs_b->period_active = 0;
....@@ -5061,7 +5324,7 @@
50615324 cfs_b->period_timer.function = sched_cfs_period_timer;
50625325 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
50635326 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5064
- cfs_b->distribute_running = 0;
5327
+ cfs_b->slack_started = false;
50655328 }
50665329
50675330 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
....@@ -5156,11 +5419,6 @@
51565419 return false;
51575420 }
51585421
5159
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5160
-{
5161
- return rq_clock_task(rq_of(cfs_rq));
5162
-}
5163
-
51645422 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
51655423 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
51665424 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
....@@ -5253,22 +5511,41 @@
52535511
52545512 #ifdef CONFIG_SMP
52555513 static inline unsigned long cpu_util(int cpu);
5256
-static unsigned long capacity_of(int cpu);
52575514
52585515 static inline bool cpu_overutilized(int cpu)
52595516 {
5260
- return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
5517
+ int overutilized = -1;
5518
+
5519
+ trace_android_rvh_cpu_overutilized(cpu, &overutilized);
5520
+ if (overutilized != -1)
5521
+ return overutilized;
5522
+
5523
+ return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
52615524 }
52625525
52635526 static inline void update_overutilized_status(struct rq *rq)
52645527 {
52655528 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
52665529 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5267
- trace_sched_overutilized(1);
5530
+ trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
52685531 }
52695532 }
52705533 #else
52715534 static inline void update_overutilized_status(struct rq *rq) { }
5535
+#endif
5536
+
5537
+/* Runqueue only has SCHED_IDLE tasks enqueued */
5538
+static int sched_idle_rq(struct rq *rq)
5539
+{
5540
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5541
+ rq->nr_running);
5542
+}
5543
+
5544
+#ifdef CONFIG_SMP
5545
+static int sched_idle_cpu(int cpu)
5546
+{
5547
+ return sched_idle_rq(cpu_rq(cpu));
5548
+}
52725549 #endif
52735550
52745551 /*
....@@ -5281,12 +5558,9 @@
52815558 {
52825559 struct cfs_rq *cfs_rq;
52835560 struct sched_entity *se = &p->se;
5561
+ int idle_h_nr_running = task_has_idle_policy(p);
52845562 int task_new = !(flags & ENQUEUE_WAKEUP);
5285
-
5286
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
5287
- if (sysctl_sched_performance_bias)
5288
- cpufreq_task_boost(rq->cpu, task_util_est(p));
5289
-#endif
5563
+ int should_iowait_boost;
52905564
52915565 /*
52925566 * The code below (indirectly) updates schedutil which looks at
....@@ -5297,29 +5571,13 @@
52975571 util_est_enqueue(&rq->cfs, p);
52985572
52995573 /*
5300
- * The code below (indirectly) updates schedutil which looks at
5301
- * the cfs_rq utilization to select a frequency.
5302
- * Let's update schedtune here to ensure the boost value of the
5303
- * current task is accounted for in the selection of the OPP.
5304
- *
5305
- * We do it also in the case where we enqueue a throttled task;
5306
- * we could argue that a throttled task should not boost a CPU,
5307
- * however:
5308
- * a) properly implementing CPU boosting considering throttled
5309
- * tasks will increase a lot the complexity of the solution
5310
- * b) it's not easy to quantify the benefits introduced by
5311
- * such a more complex solution.
5312
- * Thus, for the time being we go for the simple solution and boost
5313
- * also for throttled RQs.
5314
- */
5315
- schedtune_enqueue_task(p, cpu_of(rq));
5316
-
5317
- /*
53185574 * If in_iowait is set, the code below may not trigger any cpufreq
53195575 * utilization updates, so do it here explicitly with the IOWAIT flag
53205576 * passed.
53215577 */
5322
- if (p->in_iowait)
5578
+ should_iowait_boost = p->in_iowait;
5579
+ trace_android_rvh_set_iowait(p, &should_iowait_boost);
5580
+ if (should_iowait_boost)
53235581 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
53245582
53255583 for_each_sched_entity(se) {
....@@ -5328,51 +5586,60 @@
53285586 cfs_rq = cfs_rq_of(se);
53295587 enqueue_entity(cfs_rq, se, flags);
53305588
5331
- /*
5332
- * end evaluation on encountering a throttled cfs_rq
5333
- *
5334
- * note: in the case of encountering a throttled cfs_rq we will
5335
- * post the final h_nr_running increment below.
5336
- */
5337
- if (cfs_rq_throttled(cfs_rq))
5338
- break;
53395589 cfs_rq->h_nr_running++;
5590
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
5591
+
5592
+ /* end evaluation on encountering a throttled cfs_rq */
5593
+ if (cfs_rq_throttled(cfs_rq))
5594
+ goto enqueue_throttle;
53405595
53415596 flags = ENQUEUE_WAKEUP;
53425597 }
53435598
5599
+ trace_android_rvh_enqueue_task_fair(rq, p, flags);
53445600 for_each_sched_entity(se) {
53455601 cfs_rq = cfs_rq_of(se);
5346
- cfs_rq->h_nr_running++;
5347
-
5348
- if (cfs_rq_throttled(cfs_rq))
5349
- break;
53505602
53515603 update_load_avg(cfs_rq, se, UPDATE_TG);
5604
+ se_update_runnable(se);
53525605 update_cfs_group(se);
5606
+
5607
+ cfs_rq->h_nr_running++;
5608
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
5609
+
5610
+ /* end evaluation on encountering a throttled cfs_rq */
5611
+ if (cfs_rq_throttled(cfs_rq))
5612
+ goto enqueue_throttle;
5613
+
5614
+ /*
5615
+ * One parent has been throttled and cfs_rq removed from the
5616
+ * list. Add it back to not break the leaf list.
5617
+ */
5618
+ if (throttled_hierarchy(cfs_rq))
5619
+ list_add_leaf_cfs_rq(cfs_rq);
53535620 }
53545621
5355
- if (!se) {
5356
- add_nr_running(rq, 1);
5357
- /*
5358
- * Since new tasks are assigned an initial util_avg equal to
5359
- * half of the spare capacity of their CPU, tiny tasks have the
5360
- * ability to cross the overutilized threshold, which will
5361
- * result in the load balancer ruining all the task placement
5362
- * done by EAS. As a way to mitigate that effect, do not account
5363
- * for the first enqueue operation of new tasks during the
5364
- * overutilized flag detection.
5365
- *
5366
- * A better way of solving this problem would be to wait for
5367
- * the PELT signals of tasks to converge before taking them
5368
- * into account, but that is not straightforward to implement,
5369
- * and the following generally works well enough in practice.
5370
- */
5371
- if (!task_new)
5372
- update_overutilized_status(rq);
5622
+ /* At this point se is NULL and we are at root level*/
5623
+ add_nr_running(rq, 1);
53735624
5374
- }
5625
+ /*
5626
+ * Since new tasks are assigned an initial util_avg equal to
5627
+ * half of the spare capacity of their CPU, tiny tasks have the
5628
+ * ability to cross the overutilized threshold, which will
5629
+ * result in the load balancer ruining all the task placement
5630
+ * done by EAS. As a way to mitigate that effect, do not account
5631
+ * for the first enqueue operation of new tasks during the
5632
+ * overutilized flag detection.
5633
+ *
5634
+ * A better way of solving this problem would be to wait for
5635
+ * the PELT signals of tasks to converge before taking them
5636
+ * into account, but that is not straightforward to implement,
5637
+ * and the following generally works well enough in practice.
5638
+ */
5639
+ if (!task_new)
5640
+ update_overutilized_status(rq);
53755641
5642
+enqueue_throttle:
53765643 if (cfs_bandwidth_used()) {
53775644 /*
53785645 * When bandwidth control is enabled; the cfs_rq_throttled()
....@@ -5405,28 +5672,21 @@
54055672 struct cfs_rq *cfs_rq;
54065673 struct sched_entity *se = &p->se;
54075674 int task_sleep = flags & DEQUEUE_SLEEP;
5675
+ int idle_h_nr_running = task_has_idle_policy(p);
5676
+ bool was_sched_idle = sched_idle_rq(rq);
54085677
5409
- /*
5410
- * The code below (indirectly) updates schedutil which looks at
5411
- * the cfs_rq utilization to select a frequency.
5412
- * Let's update schedtune here to ensure the boost value of the
5413
- * current task is not more accounted for in the selection of the OPP.
5414
- */
5415
- schedtune_dequeue_task(p, cpu_of(rq));
5678
+ util_est_dequeue(&rq->cfs, p);
54165679
54175680 for_each_sched_entity(se) {
54185681 cfs_rq = cfs_rq_of(se);
54195682 dequeue_entity(cfs_rq, se, flags);
54205683
5421
- /*
5422
- * end evaluation on encountering a throttled cfs_rq
5423
- *
5424
- * note: in the case of encountering a throttled cfs_rq we will
5425
- * post the final h_nr_running decrement below.
5426
- */
5427
- if (cfs_rq_throttled(cfs_rq))
5428
- break;
54295684 cfs_rq->h_nr_running--;
5685
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5686
+
5687
+ /* end evaluation on encountering a throttled cfs_rq */
5688
+ if (cfs_rq_throttled(cfs_rq))
5689
+ goto dequeue_throttle;
54305690
54315691 /* Don't dequeue parent if it has other entities besides us */
54325692 if (cfs_rq->load.weight) {
....@@ -5443,21 +5703,32 @@
54435703 flags |= DEQUEUE_SLEEP;
54445704 }
54455705
5706
+ trace_android_rvh_dequeue_task_fair(rq, p, flags);
54465707 for_each_sched_entity(se) {
54475708 cfs_rq = cfs_rq_of(se);
5448
- cfs_rq->h_nr_running--;
5449
-
5450
- if (cfs_rq_throttled(cfs_rq))
5451
- break;
54525709
54535710 update_load_avg(cfs_rq, se, UPDATE_TG);
5711
+ se_update_runnable(se);
54545712 update_cfs_group(se);
5713
+
5714
+ cfs_rq->h_nr_running--;
5715
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5716
+
5717
+ /* end evaluation on encountering a throttled cfs_rq */
5718
+ if (cfs_rq_throttled(cfs_rq))
5719
+ goto dequeue_throttle;
5720
+
54555721 }
54565722
5457
- if (!se)
5458
- sub_nr_running(rq, 1);
5723
+ /* At this point se is NULL and we are at root level*/
5724
+ sub_nr_running(rq, 1);
54595725
5460
- util_est_dequeue(&rq->cfs, p, task_sleep);
5726
+ /* balance early to pull high priority tasks */
5727
+ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
5728
+ rq->next_balance = jiffies;
5729
+
5730
+dequeue_throttle:
5731
+ util_est_update(&rq->cfs, p, task_sleep);
54615732 hrtick_update(rq);
54625733 }
54635734
....@@ -5468,71 +5739,6 @@
54685739 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
54695740
54705741 #ifdef CONFIG_NO_HZ_COMMON
5471
-/*
5472
- * per rq 'load' arrray crap; XXX kill this.
5473
- */
5474
-
5475
-/*
5476
- * The exact cpuload calculated at every tick would be:
5477
- *
5478
- * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5479
- *
5480
- * If a CPU misses updates for n ticks (as it was idle) and update gets
5481
- * called on the n+1-th tick when CPU may be busy, then we have:
5482
- *
5483
- * load_n = (1 - 1/2^i)^n * load_0
5484
- * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
5485
- *
5486
- * decay_load_missed() below does efficient calculation of
5487
- *
5488
- * load' = (1 - 1/2^i)^n * load
5489
- *
5490
- * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5491
- * This allows us to precompute the above in said factors, thereby allowing the
5492
- * reduction of an arbitrary n in O(log_2 n) steps. (See also
5493
- * fixed_power_int())
5494
- *
5495
- * The calculation is approximated on a 128 point scale.
5496
- */
5497
-#define DEGRADE_SHIFT 7
5498
-
5499
-static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5500
-static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5501
- { 0, 0, 0, 0, 0, 0, 0, 0 },
5502
- { 64, 32, 8, 0, 0, 0, 0, 0 },
5503
- { 96, 72, 40, 12, 1, 0, 0, 0 },
5504
- { 112, 98, 75, 43, 15, 1, 0, 0 },
5505
- { 120, 112, 98, 76, 45, 16, 2, 0 }
5506
-};
5507
-
5508
-/*
5509
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
5510
- * would be when CPU is idle and so we just decay the old load without
5511
- * adding any new load.
5512
- */
5513
-static unsigned long
5514
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5515
-{
5516
- int j = 0;
5517
-
5518
- if (!missed_updates)
5519
- return load;
5520
-
5521
- if (missed_updates >= degrade_zero_ticks[idx])
5522
- return 0;
5523
-
5524
- if (idx == 1)
5525
- return load >> missed_updates;
5526
-
5527
- while (missed_updates) {
5528
- if (missed_updates % 2)
5529
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5530
-
5531
- missed_updates >>= 1;
5532
- j++;
5533
- }
5534
- return load;
5535
-}
55365742
55375743 static struct {
55385744 cpumask_var_t idle_cpus_mask;
....@@ -5544,249 +5750,68 @@
55445750
55455751 #endif /* CONFIG_NO_HZ_COMMON */
55465752
5547
-/**
5548
- * __cpu_load_update - update the rq->cpu_load[] statistics
5549
- * @this_rq: The rq to update statistics for
5550
- * @this_load: The current load
5551
- * @pending_updates: The number of missed updates
5552
- *
5553
- * Update rq->cpu_load[] statistics. This function is usually called every
5554
- * scheduler tick (TICK_NSEC).
5555
- *
5556
- * This function computes a decaying average:
5557
- *
5558
- * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5559
- *
5560
- * Because of NOHZ it might not get called on every tick which gives need for
5561
- * the @pending_updates argument.
5562
- *
5563
- * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5564
- * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5565
- * = A * (A * load[i]_n-2 + B) + B
5566
- * = A * (A * (A * load[i]_n-3 + B) + B) + B
5567
- * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5568
- * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5569
- * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5570
- * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5571
- *
5572
- * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5573
- * any change in load would have resulted in the tick being turned back on.
5574
- *
5575
- * For regular NOHZ, this reduces to:
5576
- *
5577
- * load[i]_n = (1 - 1/2^i)^n * load[i]_0
5578
- *
5579
- * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
5580
- * term.
5581
- */
5582
-static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5583
- unsigned long pending_updates)
5753
+static unsigned long cpu_load(struct rq *rq)
55845754 {
5585
- unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5586
- int i, scale;
5587
-
5588
- this_rq->nr_load_updates++;
5589
-
5590
- /* Update our load: */
5591
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5592
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5593
- unsigned long old_load, new_load;
5594
-
5595
- /* scale is effectively 1 << i now, and >> i divides by scale */
5596
-
5597
- old_load = this_rq->cpu_load[i];
5598
-#ifdef CONFIG_NO_HZ_COMMON
5599
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
5600
- if (tickless_load) {
5601
- old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5602
- /*
5603
- * old_load can never be a negative value because a
5604
- * decayed tickless_load cannot be greater than the
5605
- * original tickless_load.
5606
- */
5607
- old_load += tickless_load;
5608
- }
5609
-#endif
5610
- new_load = this_load;
5611
- /*
5612
- * Round up the averaging division if load is increasing. This
5613
- * prevents us from getting stuck on 9 if the load is 10, for
5614
- * example.
5615
- */
5616
- if (new_load > old_load)
5617
- new_load += scale - 1;
5618
-
5619
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5620
- }
5621
-}
5622
-
5623
-/* Used instead of source_load when we know the type == 0 */
5624
-static unsigned long weighted_cpuload(struct rq *rq)
5625
-{
5626
- return cfs_rq_runnable_load_avg(&rq->cfs);
5627
-}
5628
-
5629
-#ifdef CONFIG_NO_HZ_COMMON
5630
-/*
5631
- * There is no sane way to deal with nohz on smp when using jiffies because the
5632
- * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5633
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5634
- *
5635
- * Therefore we need to avoid the delta approach from the regular tick when
5636
- * possible since that would seriously skew the load calculation. This is why we
5637
- * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5638
- * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5639
- * loop exit, nohz_idle_balance, nohz full exit...)
5640
- *
5641
- * This means we might still be one tick off for nohz periods.
5642
- */
5643
-
5644
-static void cpu_load_update_nohz(struct rq *this_rq,
5645
- unsigned long curr_jiffies,
5646
- unsigned long load)
5647
-{
5648
- unsigned long pending_updates;
5649
-
5650
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5651
- if (pending_updates) {
5652
- this_rq->last_load_update_tick = curr_jiffies;
5653
- /*
5654
- * In the regular NOHZ case, we were idle, this means load 0.
5655
- * In the NOHZ_FULL case, we were non-idle, we should consider
5656
- * its weighted load.
5657
- */
5658
- cpu_load_update(this_rq, load, pending_updates);
5659
- }
5755
+ return cfs_rq_load_avg(&rq->cfs);
56605756 }
56615757
56625758 /*
5663
- * Called from nohz_idle_balance() to update the load ratings before doing the
5664
- * idle balance.
5665
- */
5666
-static void cpu_load_update_idle(struct rq *this_rq)
5667
-{
5668
- /*
5669
- * bail if there's load or we're actually up-to-date.
5670
- */
5671
- if (weighted_cpuload(this_rq))
5672
- return;
5673
-
5674
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5675
-}
5676
-
5677
-/*
5678
- * Record CPU load on nohz entry so we know the tickless load to account
5679
- * on nohz exit. cpu_load[0] happens then to be updated more frequently
5680
- * than other cpu_load[idx] but it should be fine as cpu_load readers
5681
- * shouldn't rely into synchronized cpu_load[*] updates.
5682
- */
5683
-void cpu_load_update_nohz_start(void)
5684
-{
5685
- struct rq *this_rq = this_rq();
5686
-
5687
- /*
5688
- * This is all lockless but should be fine. If weighted_cpuload changes
5689
- * concurrently we'll exit nohz. And cpu_load write can race with
5690
- * cpu_load_update_idle() but both updater would be writing the same.
5691
- */
5692
- this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5693
-}
5694
-
5695
-/*
5696
- * Account the tickless load in the end of a nohz frame.
5697
- */
5698
-void cpu_load_update_nohz_stop(void)
5699
-{
5700
- unsigned long curr_jiffies = READ_ONCE(jiffies);
5701
- struct rq *this_rq = this_rq();
5702
- unsigned long load;
5703
- struct rq_flags rf;
5704
-
5705
- if (curr_jiffies == this_rq->last_load_update_tick)
5706
- return;
5707
-
5708
- load = weighted_cpuload(this_rq);
5709
- rq_lock(this_rq, &rf);
5710
- update_rq_clock(this_rq);
5711
- cpu_load_update_nohz(this_rq, curr_jiffies, load);
5712
- rq_unlock(this_rq, &rf);
5713
-}
5714
-#else /* !CONFIG_NO_HZ_COMMON */
5715
-static inline void cpu_load_update_nohz(struct rq *this_rq,
5716
- unsigned long curr_jiffies,
5717
- unsigned long load) { }
5718
-#endif /* CONFIG_NO_HZ_COMMON */
5719
-
5720
-static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5721
-{
5722
-#ifdef CONFIG_NO_HZ_COMMON
5723
- /* See the mess around cpu_load_update_nohz(). */
5724
- this_rq->last_load_update_tick = READ_ONCE(jiffies);
5725
-#endif
5726
- cpu_load_update(this_rq, load, 1);
5727
-}
5728
-
5729
-/*
5730
- * Called from scheduler_tick()
5731
- */
5732
-void cpu_load_update_active(struct rq *this_rq)
5733
-{
5734
- unsigned long load = weighted_cpuload(this_rq);
5735
-
5736
- if (tick_nohz_tick_stopped())
5737
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5738
- else
5739
- cpu_load_update_periodic(this_rq, load);
5740
-}
5741
-
5742
-/*
5743
- * Return a low guess at the load of a migration-source CPU weighted
5744
- * according to the scheduling class and "nice" value.
5759
+ * cpu_load_without - compute CPU load without any contributions from *p
5760
+ * @cpu: the CPU which load is requested
5761
+ * @p: the task which load should be discounted
57455762 *
5746
- * We want to under-estimate the load of migration sources, to
5747
- * balance conservatively.
5763
+ * The load of a CPU is defined by the load of tasks currently enqueued on that
5764
+ * CPU as well as tasks which are currently sleeping after an execution on that
5765
+ * CPU.
5766
+ *
5767
+ * This method returns the load of the specified CPU by discounting the load of
5768
+ * the specified task, whenever the task is currently contributing to the CPU
5769
+ * load.
57485770 */
5749
-static unsigned long source_load(int cpu, int type)
5771
+static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
57505772 {
5751
- struct rq *rq = cpu_rq(cpu);
5752
- unsigned long total = weighted_cpuload(rq);
5773
+ struct cfs_rq *cfs_rq;
5774
+ unsigned int load;
57535775
5754
- if (type == 0 || !sched_feat(LB_BIAS))
5755
- return total;
5776
+ /* Task has no contribution or is new */
5777
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5778
+ return cpu_load(rq);
57565779
5757
- return min(rq->cpu_load[type-1], total);
5780
+ cfs_rq = &rq->cfs;
5781
+ load = READ_ONCE(cfs_rq->avg.load_avg);
5782
+
5783
+ /* Discount task's util from CPU's util */
5784
+ lsub_positive(&load, task_h_load(p));
5785
+
5786
+ return load;
57585787 }
57595788
5760
-/*
5761
- * Return a high guess at the load of a migration-target CPU weighted
5762
- * according to the scheduling class and "nice" value.
5763
- */
5764
-static unsigned long target_load(int cpu, int type)
5789
+static unsigned long cpu_runnable(struct rq *rq)
57655790 {
5766
- struct rq *rq = cpu_rq(cpu);
5767
- unsigned long total = weighted_cpuload(rq);
5791
+ return cfs_rq_runnable_avg(&rq->cfs);
5792
+}
57685793
5769
- if (type == 0 || !sched_feat(LB_BIAS))
5770
- return total;
5794
+static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
5795
+{
5796
+ struct cfs_rq *cfs_rq;
5797
+ unsigned int runnable;
57715798
5772
- return max(rq->cpu_load[type-1], total);
5799
+ /* Task has no contribution or is new */
5800
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5801
+ return cpu_runnable(rq);
5802
+
5803
+ cfs_rq = &rq->cfs;
5804
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
5805
+
5806
+ /* Discount task's runnable from CPU's runnable */
5807
+ lsub_positive(&runnable, p->se.avg.runnable_avg);
5808
+
5809
+ return runnable;
57735810 }
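cpu_load_without() and cpu_runnable_without() subtract the waking task's own contribution from the CPU-level signal, but only when that contribution is actually accounted on this CPU, and the subtraction saturates at zero the way lsub_positive() does. A small sketch of the discounting, with lsub_positive open-coded since only its semantics matter here:

#include <stdio.h>

/* saturating subtract, matching the semantics of lsub_positive() */
static void lsub_positive(unsigned long *x, unsigned long y)
{
	*x = (*x > y) ? *x - y : 0;
}

/*
 * Return the rq-level signal with the task's contribution removed, but only
 * if the task is currently accounted on this CPU (as cpu_load_without() does).
 */
static unsigned long load_without(unsigned long rq_load, unsigned long contrib,
				  int task_on_this_cpu)
{
	if (!task_on_this_cpu)
		return rq_load;

	lsub_positive(&rq_load, contrib);
	return rq_load;
}

int main(void)
{
	printf("%lu\n", load_without(900, 300, 1));	/* -> 600              */
	printf("%lu\n", load_without(900, 300, 0));	/* task elsewhere: 900 */
	printf("%lu\n", load_without(200, 300, 1));	/* saturates to 0      */
	return 0;
}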
57745811
57755812 static unsigned long capacity_of(int cpu)
57765813 {
57775814 return cpu_rq(cpu)->cpu_capacity;
5778
-}
5779
-
5780
-static unsigned long cpu_avg_load_per_task(int cpu)
5781
-{
5782
- struct rq *rq = cpu_rq(cpu);
5783
- unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5784
- unsigned long load_avg = weighted_cpuload(rq);
5785
-
5786
- if (nr_running)
5787
- return load_avg / nr_running;
5788
-
5789
- return 0;
57905815 }
57915816
57925817 static void record_wakee(struct task_struct *p)
....@@ -5823,18 +5848,15 @@
58235848 * whatever is irrelevant, spread criteria is apparent partner count exceeds
58245849 * socket size.
58255850 */
5826
-static int wake_wide(struct task_struct *p, int sibling_count_hint)
5851
+static int wake_wide(struct task_struct *p)
58275852 {
58285853 unsigned int master = current->wakee_flips;
58295854 unsigned int slave = p->wakee_flips;
5830
- int llc_size = this_cpu_read(sd_llc_size);
5831
-
5832
- if (sibling_count_hint >= llc_size)
5833
- return 1;
5855
+ int factor = __this_cpu_read(sd_llc_size);
58345856
58355857 if (master < slave)
58365858 swap(master, slave);
5837
- if (slave < llc_size || master < slave * llc_size)
5859
+ if (slave < factor || master < slave * factor)
58385860 return 0;
58395861 return 1;
58405862 }
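wake_wide() compares the waker/wakee flip counts against the LLC size to decide whether the pair has outgrown a single cache domain: it only reports "wide" when the smaller flip count reaches the LLC factor and the larger one reaches smaller * factor. A worked example with hypothetical numbers:

#include <stdio.h>

/* same decision rule as wake_wide(), with the flip counts passed in */
static int wake_wide(unsigned int master, unsigned int slave, unsigned int factor)
{
	if (master < slave) {		/* swap() */
		unsigned int tmp = master;

		master = slave;
		slave = tmp;
	}
	if (slave < factor || master < slave * factor)
		return 0;		/* keep the wakeup affine   */
	return 1;			/* spread across the domain */
}

int main(void)
{
	unsigned int llc = 4;	/* hypothetical LLC span */

	/* 1:1 waker/wakee pair: stay affine */
	printf("%d\n", wake_wide(3, 2, llc));	/* -> 0 */
	/* one server waking many distinct clients: go wide */
	printf("%d\n", wake_wide(64, 5, llc));	/* -> 1 */
	return 0;
}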
....@@ -5882,7 +5904,7 @@
58825904 s64 this_eff_load, prev_eff_load;
58835905 unsigned long task_load;
58845906
5885
- this_eff_load = target_load(this_cpu, sd->wake_idx);
5907
+ this_eff_load = cpu_load(cpu_rq(this_cpu));
58865908
58875909 if (sync) {
58885910 unsigned long current_load = task_h_load(current);
....@@ -5900,7 +5922,7 @@
59005922 this_eff_load *= 100;
59015923 this_eff_load *= capacity_of(prev_cpu);
59025924
5903
- prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5925
+ prev_eff_load = cpu_load(cpu_rq(prev_cpu));
59045926 prev_eff_load -= task_load;
59055927 if (sched_feat(WA_BIAS))
59065928 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
....@@ -5938,242 +5960,8 @@
59385960 return target;
59395961 }
59405962
5941
-#ifdef CONFIG_SCHED_TUNE
5942
-struct reciprocal_value schedtune_spc_rdiv;
5943
-
5944
-static long
5945
-schedtune_margin(unsigned long signal, long boost)
5946
-{
5947
- long long margin = 0;
5948
-
5949
- /*
5950
- * Signal proportional compensation (SPC)
5951
- *
5952
- * The Boost (B) value is used to compute a Margin (M) which is
5953
- * proportional to the complement of the original Signal (S):
5954
- * M = B * (SCHED_CAPACITY_SCALE - S)
5955
- * The obtained M could be used by the caller to "boost" S.
5956
- */
5957
- if (boost >= 0) {
5958
- margin = SCHED_CAPACITY_SCALE - signal;
5959
- margin *= boost;
5960
- } else
5961
- margin = -signal * boost;
5962
-
5963
- margin = reciprocal_divide(margin, schedtune_spc_rdiv);
5964
-
5965
- if (boost < 0)
5966
- margin *= -1;
5967
- return margin;
5968
-}
5969
-
5970
-inline long
5971
-schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p)
5972
-{
5973
- int boost = schedtune_cpu_boost_with(cpu, p);
5974
- long margin;
5975
-
5976
- if (boost == 0)
5977
- margin = 0;
5978
- else
5979
- margin = schedtune_margin(util, boost);
5980
-
5981
- trace_sched_boost_cpu(cpu, util, margin);
5982
-
5983
- return margin;
5984
-}
5985
-
5986
-long schedtune_task_margin(struct task_struct *task)
5987
-{
5988
- int boost = schedtune_task_boost(task);
5989
- unsigned long util;
5990
- long margin;
5991
-
5992
- if (boost == 0)
5993
- return 0;
5994
-
5995
- util = task_util_est(task);
5996
- margin = schedtune_margin(util, boost);
5997
-
5998
- return margin;
5999
-}
6000
-
6001
-#else /* CONFIG_SCHED_TUNE */
6002
-
6003
-inline long
6004
-schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p)
6005
-{
6006
- return 0;
6007
-}
6008
-
6009
-#endif /* CONFIG_SCHED_TUNE */
6010
-
6011
-static unsigned long cpu_util_without(int cpu, struct task_struct *p);
6012
-
6013
-static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
6014
-{
6015
- return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
6016
-}
6017
-
6018
-/*
6019
- * find_idlest_group finds and returns the least busy CPU group within the
6020
- * domain.
6021
- *
6022
- * Assumes p is allowed on at least one CPU in sd.
6023
- */
60245963 static struct sched_group *
6025
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
6026
- int this_cpu, int sd_flag)
6027
-{
6028
- struct sched_group *idlest = NULL, *group = sd->groups;
6029
- struct sched_group *most_spare_sg = NULL;
6030
- unsigned long min_runnable_load = ULONG_MAX;
6031
- unsigned long this_runnable_load = ULONG_MAX;
6032
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
6033
- unsigned long most_spare = 0, this_spare = 0;
6034
- int load_idx = sd->forkexec_idx;
6035
- int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
6036
- unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
6037
- (sd->imbalance_pct-100) / 100;
6038
-
6039
- if (sd_flag & SD_BALANCE_WAKE)
6040
- load_idx = sd->wake_idx;
6041
-
6042
- do {
6043
- unsigned long load, avg_load, runnable_load;
6044
- unsigned long spare_cap, max_spare_cap;
6045
- int local_group;
6046
- int i;
6047
-
6048
- /* Skip over this group if it has no CPUs allowed */
6049
- if (!cpumask_intersects(sched_group_span(group),
6050
- p->cpus_ptr))
6051
- continue;
6052
-
6053
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
6054
- if (sysctl_sched_performance_bias) {
6055
- if (!task_fits_max(p, group_first_cpu(group)))
6056
- continue;
6057
- }
6058
-#endif
6059
-
6060
- local_group = cpumask_test_cpu(this_cpu,
6061
- sched_group_span(group));
6062
-
6063
- /*
6064
- * Tally up the load of all CPUs in the group and find
6065
- * the group containing the CPU with most spare capacity.
6066
- */
6067
- avg_load = 0;
6068
- runnable_load = 0;
6069
- max_spare_cap = 0;
6070
-
6071
- for_each_cpu(i, sched_group_span(group)) {
6072
- /* Bias balancing toward CPUs of our domain */
6073
- if (local_group)
6074
- load = source_load(i, load_idx);
6075
- else
6076
- load = target_load(i, load_idx);
6077
-
6078
- runnable_load += load;
6079
-
6080
- avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
6081
-
6082
- spare_cap = capacity_spare_without(i, p);
6083
-
6084
- if (spare_cap > max_spare_cap)
6085
- max_spare_cap = spare_cap;
6086
- }
6087
-
6088
- /* Adjust by relative CPU capacity of the group */
6089
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
6090
- group->sgc->capacity;
6091
- runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
6092
- group->sgc->capacity;
6093
-
6094
- if (local_group) {
6095
- this_runnable_load = runnable_load;
6096
- this_avg_load = avg_load;
6097
- this_spare = max_spare_cap;
6098
- } else {
6099
- if (min_runnable_load > (runnable_load + imbalance)) {
6100
- /*
6101
- * The runnable load is significantly smaller
6102
- * so we can pick this new CPU:
6103
- */
6104
- min_runnable_load = runnable_load;
6105
- min_avg_load = avg_load;
6106
- idlest = group;
6107
- } else if ((runnable_load < (min_runnable_load + imbalance)) &&
6108
- (100*min_avg_load > imbalance_scale*avg_load)) {
6109
- /*
6110
- * The runnable loads are close so take the
6111
- * blocked load into account through avg_load:
6112
- */
6113
- min_avg_load = avg_load;
6114
- idlest = group;
6115
- }
6116
-
6117
- if (most_spare < max_spare_cap) {
6118
- most_spare = max_spare_cap;
6119
- most_spare_sg = group;
6120
- }
6121
- }
6122
- } while (group = group->next, group != sd->groups);
6123
-
6124
- /*
6125
- * The cross-over point between using spare capacity or least load
6126
- * is too conservative for high utilization tasks on partially
6127
- * utilized systems if we require spare_capacity > task_util(p),
6128
- * so we allow for some task stuffing by using
6129
- * spare_capacity > task_util(p)/2.
6130
- *
6131
- * Spare capacity can't be used for fork because the utilization has
6132
- * not been set yet, we must first select a rq to compute the initial
6133
- * utilization.
6134
- */
6135
- if (sd_flag & SD_BALANCE_FORK)
6136
- goto skip_spare;
6137
-
6138
- if (this_spare > task_util(p) / 2 &&
6139
- imbalance_scale*this_spare > 100*most_spare)
6140
- return NULL;
6141
-
6142
- if (most_spare > task_util(p) / 2)
6143
- return most_spare_sg;
6144
-
6145
-skip_spare:
6146
- if (!idlest)
6147
- return NULL;
6148
-
6149
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
6150
- if (sysctl_sched_performance_bias) {
6151
- if ((this_runnable_load == ULONG_MAX) || (this_avg_load == ULONG_MAX))
6152
- return idlest;
6153
- }
6154
-#endif
6155
-
6156
- /*
6157
- * When comparing groups across NUMA domains, it's possible for the
6158
- * local domain to be very lightly loaded relative to the remote
6159
- * domains but "imbalance" skews the comparison making remote CPUs
6160
- * look much more favourable. When considering cross-domain, add
6161
- * imbalance to the runnable load on the remote node and consider
6162
- * staying local.
6163
- */
6164
- if ((sd->flags & SD_NUMA) &&
6165
- min_runnable_load + imbalance >= this_runnable_load)
6166
- return NULL;
6167
-
6168
- if (min_runnable_load > (this_runnable_load + imbalance))
6169
- return NULL;
6170
-
6171
- if ((this_runnable_load < (min_runnable_load + imbalance)) &&
6172
- (100*this_avg_load < imbalance_scale*min_avg_load))
6173
- return NULL;
6174
-
6175
- return idlest;
6176
-}
5964
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
61775965
61785966 /*
61795967 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
....@@ -6194,6 +5982,9 @@
61945982
61955983 /* Traverse only the allowed CPUs */
61965984 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5985
+ if (sched_idle_cpu(i))
5986
+ return i;
5987
+
61975988 if (available_idle_cpu(i)) {
61985989 struct rq *rq = cpu_rq(i);
61995990 struct cpuidle_state *idle = idle_get_state(rq);
....@@ -6217,7 +6008,7 @@
62176008 shallowest_idle_cpu = i;
62186009 }
62196010 } else if (shallowest_idle_cpu == -1) {
6220
- load = weighted_cpuload(cpu_rq(i));
6011
+ load = cpu_load(cpu_rq(i));
62216012 if (load < min_load) {
62226013 min_load = load;
62236014 least_loaded_cpu = i;
....@@ -6237,7 +6028,7 @@
62376028 return prev_cpu;
62386029
62396030 /*
6240
- * We need task's util for capacity_spare_without, sync it up to
6031
+ * We need task's util for cpu_util_without, sync it up to
62416032 * prev_cpu's last_update_time.
62426033 */
62436034 if (!(sd_flag & SD_BALANCE_FORK))
....@@ -6253,7 +6044,7 @@
62536044 continue;
62546045 }
62556046
6256
- group = find_idlest_group(sd, p, cpu, sd_flag);
6047
+ group = find_idlest_group(sd, p, cpu);
62576048 if (!group) {
62586049 sd = sd->child;
62596050 continue;
....@@ -6356,10 +6147,12 @@
63566147 bool idle = true;
63576148
63586149 for_each_cpu(cpu, cpu_smt_mask(core)) {
6359
- cpumask_clear_cpu(cpu, cpus);
6360
- if (!available_idle_cpu(cpu))
6150
+ if (!available_idle_cpu(cpu)) {
63616151 idle = false;
6152
+ break;
6153
+ }
63626154 }
6155
+ cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
63636156
63646157 if (idle)
63656158 return core;
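
The reworked loop above gives up on a core as soon as one busy sibling is found and then
removes the whole SMT mask from the scan set with a single cpumask_andnot() instead of
clearing one CPU per iteration. A tiny bitmap sketch of that pruning, with an invented
eight-CPU layout:

#include <stdio.h>

int main(void)
{
	unsigned int scan_mask = 0xff;		/* CPUs 0-7 still to be scanned */
	unsigned int smt_mask_core0 = 0x03;	/* CPUs 0 and 1 share core 0 */

	/* analogue of cpumask_andnot(cpus, cpus, cpu_smt_mask(core)) */
	scan_mask &= ~smt_mask_core0;

	printf("remaining scan mask: 0x%02x\n", scan_mask);	/* 0xfc */
	return 0;
}
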
....@@ -6384,9 +6177,10 @@
63846177 return -1;
63856178
63866179 for_each_cpu(cpu, cpu_smt_mask(target)) {
6387
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6180
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
6181
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
63886182 continue;
6389
- if (available_idle_cpu(cpu))
6183
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
63906184 return cpu;
63916185 }
63926186
....@@ -6417,8 +6211,8 @@
64176211 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
64186212 struct sched_domain *this_sd;
64196213 u64 avg_cost, avg_idle;
6420
- u64 time, cost;
6421
- s64 delta;
6214
+ u64 time;
6215
+ int this = smp_processor_id();
64226216 int cpu, nr = INT_MAX;
64236217
64246218 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
....@@ -6443,23 +6237,63 @@
64436237 nr = 4;
64446238 }
64456239
6446
- time = local_clock();
6240
+ time = cpu_clock(this);
64476241
64486242 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
64496243
64506244 for_each_cpu_wrap(cpu, cpus, target) {
64516245 if (!--nr)
64526246 return -1;
6453
- if (available_idle_cpu(cpu))
6247
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
64546248 break;
64556249 }
64566250
6457
- time = local_clock() - time;
6458
- cost = this_sd->avg_scan_cost;
6459
- delta = (s64)(time - cost) / 8;
6460
- this_sd->avg_scan_cost += delta;
6251
+ time = cpu_clock(this) - time;
6252
+ update_avg(&this_sd->avg_scan_cost, time);
64616253
64626254 return cpu;
6255
+}
6256
+
6257
+/*
6258
+ * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
6259
+ * the task fits. If no CPU is big enough, but there are idle ones, try to
6260
+ * maximize capacity.
6261
+ */
6262
+static int
6263
+select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
6264
+{
6265
+ unsigned long task_util, best_cap = 0;
6266
+ int cpu, best_cpu = -1;
6267
+ struct cpumask *cpus;
6268
+
6269
+ cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6270
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6271
+
6272
+ task_util = uclamp_task_util(p);
6273
+
6274
+ for_each_cpu_wrap(cpu, cpus, target) {
6275
+ unsigned long cpu_cap = capacity_of(cpu);
6276
+
6277
+ if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
6278
+ continue;
6279
+ if (fits_capacity(task_util, cpu_cap))
6280
+ return cpu;
6281
+
6282
+ if (cpu_cap > best_cap) {
6283
+ best_cap = cpu_cap;
6284
+ best_cpu = cpu;
6285
+ }
6286
+ }
6287
+
6288
+ return best_cpu;
6289
+}
6290
+
6291
+static inline bool asym_fits_capacity(int task_util, int cpu)
6292
+{
6293
+ if (static_branch_unlikely(&sched_asym_cpucapacity))
6294
+ return fits_capacity(task_util, capacity_of(cpu));
6295
+
6296
+ return true;
64636297 }
64646298
64656299 /*
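
Besides switching the timestamps to cpu_clock(), the hunk above replaces the open-coded
scan-cost averaging with update_avg(), i.e. an exponential moving average with a weight of
1/8. A stand-alone sketch of that averaging follows; the sample scan times are made up.

#include <stdio.h>

/* Model of update_avg(): avg += (sample - avg) / 8. */
static void update_avg_model(long long *avg, long long sample)
{
	*avg += (sample - *avg) / 8;
}

int main(void)
{
	long long avg_scan_cost = 0;
	long long samples[] = { 8000, 4000, 12000, 6000 };	/* ns, invented */

	for (int i = 0; i < 4; i++) {
		update_avg_model(&avg_scan_cost, samples[i]);
		printf("after a %lld ns scan: avg = %lld ns\n",
		       samples[i], avg_scan_cost);
	}
	return 0;
}
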
....@@ -6468,24 +6302,54 @@
64686302 static int select_idle_sibling(struct task_struct *p, int prev, int target)
64696303 {
64706304 struct sched_domain *sd;
6305
+ unsigned long task_util;
64716306 int i, recent_used_cpu;
64726307
6473
- if (available_idle_cpu(target))
6308
+ /*
6309
+ * On asymmetric system, update task utilization because we will check
6310
+ * that the task fits with cpu's capacity.
6311
+ */
6312
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6313
+ sync_entity_load_avg(&p->se);
6314
+ task_util = uclamp_task_util(p);
6315
+ }
6316
+
6317
+ if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
6318
+ asym_fits_capacity(task_util, target))
64746319 return target;
64756320
64766321 /*
64776322 * If the previous CPU is cache affine and idle, don't be stupid:
64786323 */
6479
- if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
6324
+ if (prev != target && cpus_share_cache(prev, target) &&
6325
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
6326
+ asym_fits_capacity(task_util, prev))
64806327 return prev;
6328
+
6329
+ /*
6330
+ * Allow a per-cpu kthread to stack with the wakee if the
6331
+ * kworker thread and the tasks previous CPUs are the same.
6332
+ * The assumption is that the wakee queued work for the
6333
+ * per-cpu kthread that is now complete and the wakeup is
6334
+ * essentially a sync wakeup. An obvious example of this
6335
+ * pattern is IO completions.
6336
+ */
6337
+ if (is_per_cpu_kthread(current) &&
6338
+ in_task() &&
6339
+ prev == smp_processor_id() &&
6340
+ this_rq()->nr_running <= 1 &&
6341
+ asym_fits_capacity(task_util, prev)) {
6342
+ return prev;
6343
+ }
64816344
64826345 /* Check a recently used CPU as a potential idle candidate: */
64836346 recent_used_cpu = p->recent_used_cpu;
64846347 if (recent_used_cpu != prev &&
64856348 recent_used_cpu != target &&
64866349 cpus_share_cache(recent_used_cpu, target) &&
6487
- available_idle_cpu(recent_used_cpu) &&
6488
- cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
6350
+ (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
6351
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
6352
+ asym_fits_capacity(task_util, recent_used_cpu)) {
64896353 /*
64906354 * Replace recent_used_cpu with prev as it is a potential
64916355 * candidate for the next wake:
....@@ -6494,6 +6358,32 @@
64946358 return recent_used_cpu;
64956359 }
64966360
6361
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6362
+ if (rockchip_perf_get_level() == ROCKCHIP_PERFORMANCE_HIGH)
6363
+ goto sd_llc;
6364
+ }
6365
+
6366
+ /*
6367
+ * For asymmetric CPU capacity systems, our domain of interest is
6368
+ * sd_asym_cpucapacity rather than sd_llc.
6369
+ */
6370
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6371
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
6372
+ /*
6373
+ * On an asymmetric CPU capacity system where an exclusive
6374
+ * cpuset defines a symmetric island (i.e. one unique
6375
+ * capacity_orig value through the cpuset), the key will be set
6376
+ * but the CPUs within that cpuset will not have a domain with
6377
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
6378
+ * capacity path.
6379
+ */
6380
+ if (sd) {
6381
+ i = select_idle_capacity(p, sd, target);
6382
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
6383
+ }
6384
+ }
6385
+
6386
+sd_llc:
64976387 sd = rcu_dereference(per_cpu(sd_llc, target));
64986388 if (!sd)
64996389 return target;
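
With asym_fits_capacity() in place, every fast-path candidate on an asymmetric system must
also pass the capacity-fitting check. A stand-alone model of that check is sketched below,
using the kernel's usual ~20% headroom margin; the task utilization and the CPU capacities
are invented values.

#include <stdio.h>
#include <stdbool.h>

/* A task "fits" a CPU only if it leaves roughly 20% headroom, i.e.
 * util * 1.25 < capacity, expressed in integer math. */
static bool fits_capacity_model(unsigned long util, unsigned long cap)
{
	return util * 1280 < cap * 1024;
}

int main(void)
{
	unsigned long task_util = 400;

	printf("fits a little CPU (cap 430): %d\n",
	       fits_capacity_model(task_util, 430));	/* 0 */
	printf("fits a big CPU (cap 1024):   %d\n",
	       fits_capacity_model(task_util, 1024));	/* 1 */
	return 0;
}
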
....@@ -6591,7 +6481,7 @@
65916481 util = READ_ONCE(cfs_rq->avg.util_avg);
65926482
65936483 /* Discount task's util from CPU's util */
6594
- util -= min_t(unsigned int, util, task_util(p));
6484
+ lsub_positive(&util, task_util(p));
65956485
65966486 /*
65976487 * Covered cases:
....@@ -6640,10 +6530,9 @@
66406530 * properly fix the execl regression and it helps in further
66416531 * reducing the chances for the above race.
66426532 */
6643
- if (unlikely(task_on_rq_queued(p) || current == p)) {
6644
- estimated -= min_t(unsigned int, estimated,
6645
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
6646
- }
6533
+ if (unlikely(task_on_rq_queued(p) || current == p))
6534
+ lsub_positive(&estimated, _task_util_est(p));
6535
+
66476536 util = max(util, estimated);
66486537 }
66496538
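
The two hunks above replace the open-coded "subtract at most the current value" pattern with
lsub_positive(). A stand-alone model of that helper:

#include <stdio.h>

/* Subtract a value from an unsigned quantity without letting it wrap
 * below zero, mirroring the "x -= min(x, y)" code being replaced. */
static void lsub_positive_model(unsigned long *val, unsigned long dec)
{
	*val -= (dec < *val) ? dec : *val;
}

int main(void)
{
	unsigned long util = 300;

	lsub_positive_model(&util, 120);	/* 300 - 120 = 180 */
	printf("util = %lu\n", util);
	lsub_positive_model(&util, 500);	/* clamps at 0, no underflow */
	printf("util = %lu\n", util);
	return 0;
}
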
....@@ -6653,350 +6542,6 @@
66536542 * the cpu_util call.
66546543 */
66556544 return min_t(unsigned long, util, capacity_orig_of(cpu));
6656
-}
6657
-
6658
-/*
6659
- * Returns the current capacity of cpu after applying both
6660
- * cpu and freq scaling.
6661
- */
6662
-unsigned long capacity_curr_of(int cpu)
6663
-{
6664
- unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig;
6665
- unsigned long scale_freq = arch_scale_freq_capacity(cpu);
6666
-
6667
- return cap_scale(max_cap, scale_freq);
6668
-}
6669
-
6670
-static void find_best_target(struct sched_domain *sd, cpumask_t *cpus,
6671
- struct task_struct *p)
6672
-{
6673
- unsigned long min_util = uclamp_task(p);
6674
- unsigned long target_capacity = ULONG_MAX;
6675
- unsigned long min_wake_util = ULONG_MAX;
6676
- unsigned long target_max_spare_cap = 0;
6677
- unsigned long target_util = ULONG_MAX;
6678
- /* Initialise with deepest possible cstate (INT_MAX) */
6679
- int shallowest_idle_cstate = INT_MAX;
6680
- struct sched_group *sg;
6681
- int best_active_cpu = -1;
6682
- int best_idle_cpu = -1;
6683
- int target_cpu = -1;
6684
- int backup_cpu = -1;
6685
- bool prefer_idle;
6686
- bool boosted;
6687
- int i;
6688
-
6689
- /*
6690
- * In most cases, target_capacity tracks capacity_orig of the most
6691
- * energy efficient CPU candidate, thus requiring to minimise
6692
- * target_capacity. For these cases target_capacity is already
6693
- * initialized to ULONG_MAX.
6694
- * However, for prefer_idle and boosted tasks we look for a high
6695
- * performance CPU, thus requiring to maximise target_capacity. In this
6696
- * case we initialise target_capacity to 0.
6697
- */
6698
- prefer_idle = uclamp_latency_sensitive(p);
6699
- boosted = uclamp_boosted(p);
6700
- if (prefer_idle && boosted)
6701
- target_capacity = 0;
6702
-
6703
- /* Scan CPUs in all SDs */
6704
- sg = sd->groups;
6705
- do {
6706
- for_each_cpu_and(i, p->cpus_ptr, sched_group_span(sg)) {
6707
- unsigned long capacity_curr = capacity_curr_of(i);
6708
- unsigned long capacity_orig = capacity_orig_of(i);
6709
- unsigned long wake_util, new_util;
6710
- long spare_cap;
6711
- int idle_idx = INT_MAX;
6712
-
6713
- if (!cpu_online(i))
6714
- continue;
6715
-
6716
- /*
6717
- * p's blocked utilization is still accounted for on prev_cpu
6718
- * so prev_cpu will receive a negative bias due to the double
6719
- * accounting. However, the blocked utilization may be zero.
6720
- */
6721
- wake_util = cpu_util_without(i, p);
6722
- new_util = wake_util + task_util_est(p);
6723
-
6724
- /*
6725
- * Ensure minimum capacity to grant the required boost.
6726
- * The target CPU can be already at a capacity level higher
6727
- * than the one required to boost the task.
6728
- */
6729
- new_util = max(min_util, new_util);
6730
- if (new_util > capacity_orig)
6731
- continue;
6732
-
6733
- /*
6734
- * Pre-compute the maximum possible capacity we expect
6735
- * to have available on this CPU once the task is
6736
- * enqueued here.
6737
- */
6738
- spare_cap = capacity_orig - new_util;
6739
-
6740
- if (idle_cpu(i))
6741
- idle_idx = idle_get_state_idx(cpu_rq(i));
6742
-
6743
-
6744
- /*
6745
- * Case A) Latency sensitive tasks
6746
- *
6747
- * Unconditionally favoring tasks that prefer idle CPU to
6748
- * improve latency.
6749
- *
6750
- * Looking for:
6751
- * - an idle CPU, whatever its idle_state is, since
6752
- * the first CPUs we explore are more likely to be
6753
- * reserved for latency sensitive tasks.
6754
- * - a non idle CPU where the task fits in its current
6755
- * capacity and has the maximum spare capacity.
6756
- * - a non idle CPU with lower contention from other
6757
- * tasks and running at the lowest possible OPP.
6758
- *
6759
- * The last two goals tries to favor a non idle CPU
6760
- * where the task can run as if it is "almost alone".
6761
- * A maximum spare capacity CPU is favoured since
6762
- * the task already fits into that CPU's capacity
6763
- * without waiting for an OPP chance.
6764
- *
6765
- * The following code path is the only one in the CPUs
6766
- * exploration loop which is always used by
6767
- * prefer_idle tasks. It exits the loop with wither a
6768
- * best_active_cpu or a target_cpu which should
6769
- * represent an optimal choice for latency sensitive
6770
- * tasks.
6771
- */
6772
- if (prefer_idle) {
6773
-
6774
- /*
6775
- * Case A.1: IDLE CPU
6776
- * Return the best IDLE CPU we find:
6777
- * - for boosted tasks: the CPU with the highest
6778
- * performance (i.e. biggest capacity_orig)
6779
- * - for !boosted tasks: the most energy
6780
- * efficient CPU (i.e. smallest capacity_orig)
6781
- */
6782
- if (idle_cpu(i)) {
6783
- if (boosted &&
6784
- capacity_orig < target_capacity)
6785
- continue;
6786
- if (!boosted &&
6787
- capacity_orig > target_capacity)
6788
- continue;
6789
- /*
6790
- * Minimise value of idle state: skip
6791
- * deeper idle states and pick the
6792
- * shallowest.
6793
- */
6794
- if (capacity_orig == target_capacity &&
6795
- sysctl_sched_cstate_aware &&
6796
- idle_idx >= shallowest_idle_cstate)
6797
- continue;
6798
-
6799
- target_capacity = capacity_orig;
6800
- shallowest_idle_cstate = idle_idx;
6801
- best_idle_cpu = i;
6802
- continue;
6803
- }
6804
- if (best_idle_cpu != -1)
6805
- continue;
6806
-
6807
- /*
6808
- * Case A.2: Target ACTIVE CPU
6809
- * Favor CPUs with max spare capacity.
6810
- */
6811
- if (capacity_curr > new_util &&
6812
- spare_cap > target_max_spare_cap) {
6813
- target_max_spare_cap = spare_cap;
6814
- target_cpu = i;
6815
- continue;
6816
- }
6817
- if (target_cpu != -1)
6818
- continue;
6819
-
6820
-
6821
- /*
6822
- * Case A.3: Backup ACTIVE CPU
6823
- * Favor CPUs with:
6824
- * - lower utilization due to other tasks
6825
- * - lower utilization with the task in
6826
- */
6827
- if (wake_util > min_wake_util)
6828
- continue;
6829
- min_wake_util = wake_util;
6830
- best_active_cpu = i;
6831
- continue;
6832
- }
6833
-
6834
- /*
6835
- * Enforce EAS mode
6836
- *
6837
- * For non latency sensitive tasks, skip CPUs that
6838
- * will be overutilized by moving the task there.
6839
- *
6840
- * The goal here is to remain in EAS mode as long as
6841
- * possible at least for !prefer_idle tasks.
6842
- */
6843
- if ((new_util * capacity_margin) >
6844
- (capacity_orig * SCHED_CAPACITY_SCALE))
6845
- continue;
6846
-
6847
- /*
6848
- * Favor CPUs with smaller capacity for non latency
6849
- * sensitive tasks.
6850
- */
6851
- if (capacity_orig > target_capacity)
6852
- continue;
6853
-
6854
- /*
6855
- * Case B) Non latency sensitive tasks on IDLE CPUs.
6856
- *
6857
- * Find an optimal backup IDLE CPU for non latency
6858
- * sensitive tasks.
6859
- *
6860
- * Looking for:
6861
- * - minimizing the capacity_orig,
6862
- * i.e. preferring LITTLE CPUs
6863
- * - favoring shallowest idle states
6864
- * i.e. avoid to wakeup deep-idle CPUs
6865
- *
6866
- * The following code path is used by non latency
6867
- * sensitive tasks if IDLE CPUs are available. If at
6868
- * least one of such CPUs are available it sets the
6869
- * best_idle_cpu to the most suitable idle CPU to be
6870
- * selected.
6871
- *
6872
- * If idle CPUs are available, favour these CPUs to
6873
- * improve performances by spreading tasks.
6874
- * Indeed, the energy_diff() computed by the caller
6875
- * will take care to ensure the minimization of energy
6876
- * consumptions without affecting performance.
6877
- */
6878
- if (idle_cpu(i)) {
6879
- /*
6880
- * Skip CPUs in deeper idle state, but only
6881
- * if they are also less energy efficient.
6882
- * IOW, prefer a deep IDLE LITTLE CPU vs a
6883
- * shallow idle big CPU.
6884
- */
6885
- if (capacity_orig == target_capacity &&
6886
- sysctl_sched_cstate_aware &&
6887
- idle_idx >= shallowest_idle_cstate)
6888
- continue;
6889
-
6890
- target_capacity = capacity_orig;
6891
- shallowest_idle_cstate = idle_idx;
6892
- best_idle_cpu = i;
6893
- continue;
6894
- }
6895
-
6896
- /*
6897
- * Case C) Non latency sensitive tasks on ACTIVE CPUs.
6898
- *
6899
- * Pack tasks in the most energy efficient capacities.
6900
- *
6901
- * This task packing strategy prefers more energy
6902
- * efficient CPUs (i.e. pack on smaller maximum
6903
- * capacity CPUs) while also trying to spread tasks to
6904
- * run them all at the lower OPP.
6905
- *
6906
- * This assumes for example that it's more energy
6907
- * efficient to run two tasks on two CPUs at a lower
6908
- * OPP than packing both on a single CPU but running
6909
- * that CPU at an higher OPP.
6910
- *
6911
- * Thus, this case keep track of the CPU with the
6912
- * smallest maximum capacity and highest spare maximum
6913
- * capacity.
6914
- */
6915
-
6916
- /* Favor CPUs with maximum spare capacity */
6917
- if (capacity_orig == target_capacity &&
6918
- spare_cap < target_max_spare_cap)
6919
- continue;
6920
-
6921
- target_max_spare_cap = spare_cap;
6922
- target_capacity = capacity_orig;
6923
- target_util = new_util;
6924
- target_cpu = i;
6925
- }
6926
-
6927
- } while (sg = sg->next, sg != sd->groups);
6928
-
6929
- /*
6930
- * For non latency sensitive tasks, cases B and C in the previous loop,
6931
- * we pick the best IDLE CPU only if we was not able to find a target
6932
- * ACTIVE CPU.
6933
- *
6934
- * Policies priorities:
6935
- *
6936
- * - prefer_idle tasks:
6937
- *
6938
- * a) IDLE CPU available: best_idle_cpu
6939
- * b) ACTIVE CPU where task fits and has the bigger maximum spare
6940
- * capacity (i.e. target_cpu)
6941
- * c) ACTIVE CPU with less contention due to other tasks
6942
- * (i.e. best_active_cpu)
6943
- *
6944
- * - NON prefer_idle tasks:
6945
- *
6946
- * a) ACTIVE CPU: target_cpu
6947
- * b) IDLE CPU: best_idle_cpu
6948
- */
6949
-
6950
- if (prefer_idle && (best_idle_cpu != -1)) {
6951
- target_cpu = best_idle_cpu;
6952
- goto target;
6953
- }
6954
-
6955
- if (target_cpu == -1)
6956
- target_cpu = prefer_idle
6957
- ? best_active_cpu
6958
- : best_idle_cpu;
6959
- else
6960
- backup_cpu = prefer_idle
6961
- ? best_active_cpu
6962
- : best_idle_cpu;
6963
-
6964
- if (backup_cpu >= 0)
6965
- cpumask_set_cpu(backup_cpu, cpus);
6966
- if (target_cpu >= 0) {
6967
-target:
6968
- cpumask_set_cpu(target_cpu, cpus);
6969
- }
6970
-
6971
- trace_sched_find_best_target(p, prefer_idle, min_util, best_idle_cpu,
6972
- best_active_cpu, target_cpu, backup_cpu);
6973
-}
6974
-
6975
-/*
6976
- * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
6977
- * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
6978
- *
6979
- * In that case WAKE_AFFINE doesn't make sense and we'll let
6980
- * BALANCE_WAKE sort things out.
6981
- */
6982
-static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6983
-{
6984
- long min_cap, max_cap;
6985
-
6986
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
6987
- return 0;
6988
-
6989
- min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6990
- max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
6991
-
6992
- /* Minimum capacity is close to max, no need to abort wake_affine */
6993
- if (max_cap - min_cap < max_cap >> 3)
6994
- return 0;
6995
-
6996
- /* Bring task utilization in sync with prev_cpu */
6997
- sync_entity_load_avg(&p->se);
6998
-
6999
- return !task_fits_capacity(p, min_cap);
70006545 }
70016546
70026547 /*
....@@ -7038,154 +6583,61 @@
70386583 }
70396584
70406585 /*
7041
- * compute_energy(): Estimates the energy that would be consumed if @p was
6586
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
70426587 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
7043
- * landscape of the * CPUs after the task migration, and uses the Energy Model
6588
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
70446589 * to compute what would be the energy if we decided to actually migrate that
70456590 * task.
70466591 */
70476592 static long
70486593 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
70496594 {
7050
- unsigned int max_util, util_cfs, cpu_util, cpu_cap;
7051
- unsigned long sum_util, energy = 0;
7052
- struct task_struct *tsk;
6595
+ struct cpumask *pd_mask = perf_domain_span(pd);
6596
+ unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6597
+ unsigned long max_util = 0, sum_util = 0;
6598
+ unsigned long energy = 0;
70536599 int cpu;
70546600
7055
- for (; pd; pd = pd->next) {
7056
- struct cpumask *pd_mask = perf_domain_span(pd);
6601
+ /*
6602
+ * The capacity state of CPUs of the current rd can be driven by CPUs
6603
+ * of another rd if they belong to the same pd. So, account for the
6604
+ * utilization of these CPUs too by masking pd with cpu_online_mask
6605
+ * instead of the rd span.
6606
+ *
6607
+ * If an entire pd is outside of the current rd, it will not appear in
6608
+ * its pd list and will not be accounted by compute_energy().
6609
+ */
6610
+ for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6611
+ unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
6612
+ struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
70576613
70586614 /*
7059
- * The energy model mandates all the CPUs of a performance
7060
- * domain have the same capacity.
6615
+ * Busy time computation: utilization clamping is not
6616
+ * required since the ratio (sum_util / cpu_capacity)
6617
+ * is already enough to scale the EM reported power
6618
+ * consumption at the (eventually clamped) cpu_capacity.
70616619 */
7062
- cpu_cap = arch_scale_cpu_capacity(NULL, cpumask_first(pd_mask));
7063
- max_util = sum_util = 0;
6620
+ sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6621
+ ENERGY_UTIL, NULL);
70646622
70656623 /*
7066
- * The capacity state of CPUs of the current rd can be driven by
7067
- * CPUs of another rd if they belong to the same performance
7068
- * domain. So, account for the utilization of these CPUs too
7069
- * by masking pd with cpu_online_mask instead of the rd span.
7070
- *
7071
- * If an entire performance domain is outside of the current rd,
7072
- * it will not appear in its pd list and will not be accounted
7073
- * by compute_energy().
6624
+ * Performance domain frequency: utilization clamping
6625
+ * must be considered since it affects the selection
6626
+ * of the performance domain frequency.
6627
+ * NOTE: in case RT tasks are running, by default the
6628
+ * FREQUENCY_UTIL's utilization can be max OPP.
70746629 */
7075
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
7076
- util_cfs = cpu_util_next(cpu, p, dst_cpu);
7077
-
7078
- /*
7079
- * Busy time computation: utilization clamping is not
7080
- * required since the ratio (sum_util / cpu_capacity)
7081
- * is already enough to scale the EM reported power
7082
- * consumption at the (eventually clamped) cpu_capacity.
7083
- */
7084
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
7085
- ENERGY_UTIL, NULL);
7086
-
7087
- /*
7088
- * Performance domain frequency: utilization clamping
7089
- * must be considered since it affects the selection
7090
- * of the performance domain frequency.
7091
- * NOTE: in case RT tasks are running, by default the
7092
- * FREQUENCY_UTIL's utilization can be max OPP.
7093
- */
7094
- tsk = cpu == dst_cpu ? p : NULL;
7095
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
7096
- FREQUENCY_UTIL, tsk);
7097
- max_util = max(max_util, cpu_util);
7098
- }
7099
-
7100
- energy += em_pd_energy(pd->em_pd, max_util, sum_util);
6630
+ cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6631
+ FREQUENCY_UTIL, tsk);
6632
+ max_util = max(max_util, cpu_util);
71016633 }
6634
+
6635
+ trace_android_vh_em_cpu_energy(pd->em_pd, max_util, sum_util, &energy);
6636
+ if (!energy)
6637
+ energy = em_cpu_energy(pd->em_pd, max_util, sum_util);
71026638
71036639 return energy;
71046640 }
7105
-
7106
-static void select_cpu_candidates(struct sched_domain *sd, cpumask_t *cpus,
7107
- struct perf_domain *pd, struct task_struct *p, int prev_cpu)
7108
-{
7109
- int highest_spare_cap_cpu = prev_cpu, best_idle_cpu = -1;
7110
- unsigned long spare_cap, max_spare_cap, util, cpu_cap;
7111
- bool prefer_idle = uclamp_latency_sensitive(p);
7112
- bool boosted = uclamp_boosted(p);
7113
- unsigned long target_cap = boosted ? 0 : ULONG_MAX;
7114
- unsigned long highest_spare_cap = 0;
7115
- unsigned int min_exit_lat = UINT_MAX;
7116
- int cpu, max_spare_cap_cpu;
7117
- struct cpuidle_state *idle;
7118
-
7119
- for (; pd; pd = pd->next) {
7120
- max_spare_cap_cpu = -1;
7121
- max_spare_cap = 0;
7122
-
7123
- for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
7124
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
7125
- continue;
7126
-
7127
- util = cpu_util_next(cpu, p, cpu);
7128
- cpu_cap = capacity_of(cpu);
7129
- spare_cap = cpu_cap - util;
7130
-
7131
- /*
7132
- * Skip CPUs that cannot satisfy the capacity request.
7133
- * IOW, placing the task there would make the CPU
7134
- * overutilized. Take uclamp into account to see how
7135
- * much capacity we can get out of the CPU; this is
7136
- * aligned with schedutil_cpu_util().
7137
- */
7138
- util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
7139
- if (cpu_cap * 1024 < util * capacity_margin)
7140
- continue;
7141
-
7142
- /*
7143
- * Find the CPU with the maximum spare capacity in
7144
- * the performance domain
7145
- */
7146
- if (spare_cap > max_spare_cap) {
7147
- max_spare_cap = spare_cap;
7148
- max_spare_cap_cpu = cpu;
7149
- }
7150
-
7151
- if (!prefer_idle)
7152
- continue;
7153
-
7154
- if (idle_cpu(cpu)) {
7155
- cpu_cap = capacity_orig_of(cpu);
7156
- if (boosted && cpu_cap < target_cap)
7157
- continue;
7158
- if (!boosted && cpu_cap > target_cap)
7159
- continue;
7160
- idle = idle_get_state(cpu_rq(cpu));
7161
- if (idle && idle->exit_latency > min_exit_lat &&
7162
- cpu_cap == target_cap)
7163
- continue;
7164
-
7165
- if (idle)
7166
- min_exit_lat = idle->exit_latency;
7167
- target_cap = cpu_cap;
7168
- best_idle_cpu = cpu;
7169
- } else if (spare_cap > highest_spare_cap) {
7170
- highest_spare_cap = spare_cap;
7171
- highest_spare_cap_cpu = cpu;
7172
- }
7173
- }
7174
-
7175
- if (!prefer_idle && max_spare_cap_cpu >= 0)
7176
- cpumask_set_cpu(max_spare_cap_cpu, cpus);
7177
- }
7178
-
7179
- if (!prefer_idle)
7180
- return;
7181
-
7182
- if (best_idle_cpu >= 0)
7183
- cpumask_set_cpu(best_idle_cpu, cpus);
7184
- else
7185
- cpumask_set_cpu(highest_spare_cap_cpu, cpus);
7186
-}
7187
-
7188
-static DEFINE_PER_CPU(cpumask_t, energy_cpus);
71896641
71906642 /*
71916643 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
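
The reworked compute_energy() above handles a single performance domain (the walk over
pd->next moves to the caller) and derives two aggregates per domain: a busy-time term, the
sum of the CPUs' utilizations, and a frequency term, their maximum. The sketch below mirrors
that aggregation with an invented quadratic cost function standing in for em_cpu_energy();
the utilization numbers and the capacity are made-up examples.

#include <stdio.h>

/* Toy stand-in for the energy model: charge a cost that grows with the
 * OPP implied by max_util, scaled by how busy the domain is overall. */
static unsigned long toy_em_energy(unsigned long max_util,
				   unsigned long sum_util,
				   unsigned long cap)
{
	unsigned long cost = (max_util * max_util) / cap + 1;

	return cost * sum_util / cap;
}

int main(void)
{
	unsigned long util[] = { 100, 250, 400, 50 };	/* per-CPU, invented */
	unsigned long cap = 1024, sum_util = 0, max_util = 0;

	for (int i = 0; i < 4; i++) {
		sum_util += util[i];
		if (util[i] > max_util)
			max_util = util[i];
	}
	printf("sum_util=%lu max_util=%lu energy=%lu\n",
	       sum_util, max_util, toy_em_energy(max_util, sum_util, cap));
	return 0;
}
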
....@@ -7226,27 +6678,39 @@
72266678 * other use-cases too. So, until someone finds a better way to solve this,
72276679 * let's keep things simple by re-using the existing slow path.
72286680 */
7229
-
72306681 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync)
72316682 {
7232
- unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
6683
+ unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6684
+ unsigned long best_delta2 = ULONG_MAX;
72336685 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
7234
- int weight, cpu, best_energy_cpu = prev_cpu;
7235
- unsigned long cur_energy;
7236
- struct perf_domain *pd;
6686
+ int max_spare_cap_cpu_ls = prev_cpu, best_idle_cpu = -1;
6687
+ unsigned long max_spare_cap_ls = 0, target_cap;
6688
+ unsigned long cpu_cap, util, base_energy = 0;
6689
+ bool boosted, latency_sensitive = false;
6690
+ unsigned int min_exit_lat = UINT_MAX;
6691
+ int cpu, best_energy_cpu = prev_cpu;
6692
+ struct cpuidle_state *idle;
72376693 struct sched_domain *sd;
7238
- cpumask_t *candidates;
6694
+ struct perf_domain *pd;
6695
+ int new_cpu = INT_MAX;
72396696
7240
- if (sysctl_sched_sync_hint_enable && sync) {
7241
- cpu = smp_processor_id();
7242
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
7243
- return cpu;
7244
- }
6697
+ sync_entity_load_avg(&p->se);
6698
+ trace_android_rvh_find_energy_efficient_cpu(p, prev_cpu, sync, &new_cpu);
6699
+ if (new_cpu != INT_MAX)
6700
+ return new_cpu;
72456701
72466702 rcu_read_lock();
72476703 pd = rcu_dereference(rd->pd);
72486704 if (!pd || READ_ONCE(rd->overutilized))
72496705 goto fail;
6706
+
6707
+ cpu = smp_processor_id();
6708
+ if (sync && cpu_rq(cpu)->nr_running == 1 &&
6709
+ cpumask_test_cpu(cpu, p->cpus_ptr) &&
6710
+ task_fits_capacity(p, capacity_of(cpu))) {
6711
+ rcu_read_unlock();
6712
+ return cpu;
6713
+ }
72506714
72516715 /*
72526716 * Energy-aware wake-up happens on the lowest sched_domain starting
....@@ -7258,59 +6722,149 @@
72586722 if (!sd)
72596723 goto fail;
72606724
7261
- sync_entity_load_avg(&p->se);
72626725 if (!task_util_est(p))
72636726 goto unlock;
72646727
7265
- /* Pre-select a set of candidate CPUs. */
7266
- candidates = this_cpu_ptr(&energy_cpus);
7267
- cpumask_clear(candidates);
6728
+ latency_sensitive = uclamp_latency_sensitive(p);
6729
+ boosted = uclamp_boosted(p);
6730
+ target_cap = boosted ? 0 : ULONG_MAX;
72686731
7269
- if (sched_feat(FIND_BEST_TARGET))
7270
- find_best_target(sd, candidates, p);
7271
- else
7272
- select_cpu_candidates(sd, candidates, pd, p, prev_cpu);
6732
+ for (; pd; pd = pd->next) {
6733
+ unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6734
+ unsigned long base_energy_pd;
6735
+ int max_spare_cap_cpu = -1;
72736736
7274
- /* Bail out if no candidate was found. */
7275
- weight = cpumask_weight(candidates);
7276
- if (!weight)
7277
- goto unlock;
6737
+ /* Compute the 'base' energy of the pd, without @p */
6738
+ base_energy_pd = compute_energy(p, -1, pd);
6739
+ base_energy += base_energy_pd;
72786740
7279
- /* If there is only one sensible candidate, select it now. */
7280
- cpu = cpumask_first(candidates);
7281
- if (weight == 1 && ((uclamp_latency_sensitive(p) && idle_cpu(cpu)) ||
7282
- (cpu == prev_cpu))) {
7283
- best_energy_cpu = cpu;
7284
- goto unlock;
7285
- }
6741
+ for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6742
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6743
+ continue;
72866744
7287
- if (cpumask_test_cpu(prev_cpu, p->cpus_ptr))
7288
- prev_energy = best_energy = compute_energy(p, prev_cpu, pd);
7289
- else
7290
- prev_energy = best_energy = ULONG_MAX;
6745
+ util = cpu_util_next(cpu, p, cpu);
6746
+ cpu_cap = capacity_of(cpu);
6747
+ spare_cap = cpu_cap;
6748
+ lsub_positive(&spare_cap, util);
72916749
7292
- /* Select the best candidate energy-wise. */
7293
- for_each_cpu(cpu, candidates) {
7294
- if (cpu == prev_cpu)
7295
- continue;
7296
- cur_energy = compute_energy(p, cpu, pd);
7297
- if (cur_energy < best_energy) {
7298
- best_energy = cur_energy;
7299
- best_energy_cpu = cpu;
6750
+ /*
6751
+ * Skip CPUs that cannot satisfy the capacity request.
6752
+ * IOW, placing the task there would make the CPU
6753
+ * overutilized. Take uclamp into account to see how
6754
+ * much capacity we can get out of the CPU; this is
6755
+ * aligned with schedutil_cpu_util().
6756
+ */
6757
+ util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
6758
+ if (!fits_capacity(util, cpu_cap))
6759
+ continue;
6760
+
6761
+ /* Always use prev_cpu as a candidate. */
6762
+ if (!latency_sensitive && cpu == prev_cpu) {
6763
+ prev_delta = compute_energy(p, prev_cpu, pd);
6764
+ prev_delta -= base_energy_pd;
6765
+ best_delta = min(best_delta, prev_delta);
6766
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6767
+ if (prev_delta == best_delta)
6768
+ best_energy_cpu = prev_cpu;
6769
+ }
6770
+ }
6771
+
6772
+ /*
6773
+ * Find the CPU with the maximum spare capacity in
6774
+ * the performance domain
6775
+ */
6776
+ if (spare_cap > max_spare_cap) {
6777
+ max_spare_cap = spare_cap;
6778
+ max_spare_cap_cpu = cpu;
6779
+ }
6780
+
6781
+ if (!IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6782
+ if (!latency_sensitive)
6783
+ continue;
6784
+ }
6785
+
6786
+ if (idle_cpu(cpu)) {
6787
+ cpu_cap = capacity_orig_of(cpu);
6788
+ if (boosted && cpu_cap < target_cap)
6789
+ continue;
6790
+ if (!boosted && cpu_cap > target_cap)
6791
+ continue;
6792
+ idle = idle_get_state(cpu_rq(cpu));
6793
+ if (idle && idle->exit_latency > min_exit_lat &&
6794
+ cpu_cap == target_cap)
6795
+ continue;
6796
+
6797
+ if (idle)
6798
+ min_exit_lat = idle->exit_latency;
6799
+ target_cap = cpu_cap;
6800
+ best_idle_cpu = cpu;
6801
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6802
+ best_delta2 = compute_energy(p, cpu, pd);
6803
+ best_delta2 -= base_energy_pd;
6804
+ }
6805
+ } else if (spare_cap > max_spare_cap_ls) {
6806
+ max_spare_cap_ls = spare_cap;
6807
+ max_spare_cap_cpu_ls = cpu;
6808
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6809
+ if (best_idle_cpu == -1) {
6810
+ best_delta2 = compute_energy(p, cpu, pd);
6811
+ best_delta2 -= base_energy_pd;
6812
+ }
6813
+ }
6814
+ }
6815
+ }
6816
+
6817
+ /* Evaluate the energy impact of using this CPU. */
6818
+ if (!latency_sensitive && max_spare_cap_cpu >= 0 &&
6819
+ max_spare_cap_cpu != prev_cpu) {
6820
+ cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
6821
+ cur_delta -= base_energy_pd;
6822
+ if (cur_delta < best_delta) {
6823
+ best_delta = cur_delta;
6824
+ best_energy_cpu = max_spare_cap_cpu;
6825
+ }
73006826 }
73016827 }
73026828 unlock:
73036829 rcu_read_unlock();
73046830
6831
+ if (latency_sensitive)
6832
+ return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
6833
+
73056834 /*
73066835 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
73076836 * least 6% of the energy used by prev_cpu.
73086837 */
7309
- if (prev_energy == ULONG_MAX)
6838
+ if (prev_delta == ULONG_MAX)
73106839 return best_energy_cpu;
73116840
7312
- if ((prev_energy - best_energy) > (prev_energy >> 4))
6841
+ if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
73136842 return best_energy_cpu;
6843
+
6844
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6845
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
6846
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
6847
+ int level = rockchip_perf_get_level();
6848
+
6849
+ /*
6850
+ * When ROCKCHIP_PERFORMANCE_LOW is selected:
6851
+ * Pick best_energy_cpu if prev_cpu is a big CPU and best_energy_cpu
6852
+ * is a little CPU, so that tasks can migrate from the big CPUs to the
6853
+ * little CPUs more easily and save power.
6854
+ */
6855
+ if ((level == ROCKCHIP_PERFORMANCE_LOW) && cpul_mask &&
6856
+ cpub_mask && cpumask_test_cpu(prev_cpu, cpub_mask) &&
6857
+ cpumask_test_cpu(best_energy_cpu, cpul_mask)) {
6858
+ return best_energy_cpu;
6859
+ }
6860
+
6861
+ /*
6862
+ * Pick the idlest CPU if the extra power cost is small (< 3.1%).
6863
+ */
6864
+ if ((best_delta2 <= prev_delta) ||
6865
+ ((best_delta2 - prev_delta) < ((prev_delta + base_energy) >> 5)))
6866
+ return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
6867
+ }
73146868
73156869 return prev_cpu;
73166870
....@@ -7333,39 +6887,44 @@
73336887 * preempt must be disabled.
73346888 */
73356889 static int
7336
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
7337
- int sibling_count_hint)
6890
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
73386891 {
73396892 struct sched_domain *tmp, *sd = NULL;
73406893 int cpu = smp_processor_id();
73416894 int new_cpu = prev_cpu;
73426895 int want_affine = 0;
73436896 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6897
+ int target_cpu = -1;
6898
+
6899
+ if (trace_android_rvh_select_task_rq_fair_enabled() &&
6900
+ !(sd_flag & SD_BALANCE_FORK))
6901
+ sync_entity_load_avg(&p->se);
6902
+ trace_android_rvh_select_task_rq_fair(p, prev_cpu, sd_flag,
6903
+ wake_flags, &target_cpu);
6904
+ if (target_cpu >= 0)
6905
+ return target_cpu;
73446906
73456907 if (sd_flag & SD_BALANCE_WAKE) {
73466908 record_wakee(p);
73476909
7348
- if (static_branch_unlikely(&sched_energy_present)) {
7349
- if (uclamp_latency_sensitive(p) && !sched_feat(EAS_PREFER_IDLE) && !sync)
7350
- goto sd_loop;
6910
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6911
+ if (rockchip_perf_get_level() == ROCKCHIP_PERFORMANCE_HIGH)
6912
+ goto no_eas;
6913
+ }
73516914
6915
+ if (sched_energy_enabled()) {
73526916 new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync);
73536917 if (new_cpu >= 0)
73546918 return new_cpu;
73556919 new_cpu = prev_cpu;
73566920 }
73576921
7358
- want_affine = !wake_wide(p, sibling_count_hint) &&
7359
- !wake_cap(p, cpu, prev_cpu) &&
7360
- cpumask_test_cpu(cpu, p->cpus_ptr);
6922
+no_eas:
6923
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
73616924 }
73626925
7363
-sd_loop:
73646926 rcu_read_lock();
73656927 for_each_domain(cpu, tmp) {
7366
- if (!(tmp->flags & SD_LOAD_BALANCE))
7367
- break;
7368
-
73696928 /*
73706929 * If both 'cpu' and 'prev_cpu' are part of this domain,
73716930 * cpu is a valid SD_WAKE_AFFINE target.
....@@ -7392,6 +6951,23 @@
73926951 /* Fast path */
73936952
73946953 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6954
+
6955
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6956
+ struct root_domain *rd = cpu_rq(cpu)->rd;
6957
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
6958
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
6959
+ int level = rockchip_perf_get_level();
6960
+
6961
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
6962
+ cpul_mask && cpub_mask && cpumask_intersects(p->cpus_ptr, cpub_mask) &&
6963
+ cpumask_test_cpu(new_cpu, cpul_mask)) {
6964
+ for_each_domain(cpu, tmp) {
6965
+ sd = tmp;
6966
+ }
6967
+ if (sd)
6968
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6969
+ }
6970
+ }
73956971
73966972 if (want_affine)
73976973 current->recent_used_cpu = cpu;
....@@ -7469,6 +7045,15 @@
74697045 {
74707046 remove_entity_load_avg(&p->se);
74717047 }
7048
+
7049
+static int
7050
+balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
7051
+{
7052
+ if (rq->nr_running)
7053
+ return 1;
7054
+
7055
+ return newidle_balance(rq, rf) != 0;
7056
+}
74727057 #endif /* CONFIG_SMP */
74737058
74747059 static unsigned long wakeup_gran(struct sched_entity *se)
....@@ -7522,7 +7107,7 @@
75227107
75237108 static void set_last_buddy(struct sched_entity *se)
75247109 {
7525
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7110
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
75267111 return;
75277112
75287113 for_each_sched_entity(se) {
....@@ -7534,7 +7119,7 @@
75347119
75357120 static void set_next_buddy(struct sched_entity *se)
75367121 {
7537
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7122
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
75387123 return;
75397124
75407125 for_each_sched_entity(se) {
....@@ -7560,6 +7145,7 @@
75607145 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
75617146 int scale = cfs_rq->nr_running >= sched_nr_latency;
75627147 int next_buddy_marked = 0;
7148
+ bool preempt = false, nopreempt = false;
75637149
75647150 if (unlikely(se == pse))
75657151 return;
....@@ -7592,8 +7178,8 @@
75927178 return;
75937179
75947180 /* Idle tasks are by definition preempted by non-idle tasks. */
7595
- if (unlikely(curr->policy == SCHED_IDLE) &&
7596
- likely(p->policy != SCHED_IDLE))
7181
+ if (unlikely(task_has_idle_policy(curr)) &&
7182
+ likely(!task_has_idle_policy(p)))
75977183 goto preempt;
75987184
75997185 /*
....@@ -7605,6 +7191,12 @@
76057191
76067192 find_matching_se(&se, &pse);
76077193 update_curr(cfs_rq_of(se));
7194
+ trace_android_rvh_check_preempt_wakeup(rq, p, &preempt, &nopreempt,
7195
+ wake_flags, se, pse, next_buddy_marked, sysctl_sched_wakeup_granularity);
7196
+ if (preempt)
7197
+ goto preempt;
7198
+ if (nopreempt)
7199
+ return;
76087200 BUG_ON(!pse);
76097201 if (wakeup_preempt_entity(se, pse) == 1) {
76107202 /*
....@@ -7636,20 +7228,21 @@
76367228 set_last_buddy(se);
76377229 }
76387230
7639
-static struct task_struct *
7231
+struct task_struct *
76407232 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
76417233 {
76427234 struct cfs_rq *cfs_rq = &rq->cfs;
7643
- struct sched_entity *se;
7644
- struct task_struct *p;
7235
+ struct sched_entity *se = NULL;
7236
+ struct task_struct *p = NULL;
76457237 int new_tasks;
7238
+ bool repick = false;
76467239
76477240 again:
7648
- if (!cfs_rq->nr_running)
7241
+ if (!sched_fair_runnable(rq))
76497242 goto idle;
76507243
76517244 #ifdef CONFIG_FAIR_GROUP_SCHED
7652
- if (prev->sched_class != &fair_sched_class)
7245
+ if (!prev || prev->sched_class != &fair_sched_class)
76537246 goto simple;
76547247
76557248 /*
....@@ -7696,7 +7289,7 @@
76967289 } while (cfs_rq);
76977290
76987291 p = task_of(se);
7699
-
7292
+ trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, false, prev);
77007293 /*
77017294 * Since we haven't yet done put_prev_entity and if the selected task
77027295 * is a different task than we started out with, try and touch the
....@@ -7726,8 +7319,15 @@
77267319 goto done;
77277320 simple:
77287321 #endif
7322
+ if (prev)
7323
+ put_prev_task(rq, prev);
77297324
7730
- put_prev_task(rq, prev);
7325
+ trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, true, prev);
7326
+ if (repick) {
7327
+ for_each_sched_entity(se)
7328
+ set_next_entity(cfs_rq_of(se), se);
7329
+ goto done;
7330
+ }
77317331
77327332 do {
77337333 se = pick_next_entity(cfs_rq, NULL);
....@@ -7755,11 +7355,13 @@
77557355 return p;
77567356
77577357 idle:
7758
- update_misfit_status(NULL, rq);
7759
- new_tasks = idle_balance(rq, rf);
7358
+ if (!rf)
7359
+ return NULL;
7360
+
7361
+ new_tasks = newidle_balance(rq, rf);
77607362
77617363 /*
7762
- * Because idle_balance() releases (and re-acquires) rq->lock, it is
7364
+ * Because newidle_balance() releases (and re-acquires) rq->lock, it is
77637365 * possible for any higher priority task to appear. In that case we
77647366 * must re-start the pick_next_entity() loop.
77657367 */
....@@ -7776,6 +7378,11 @@
77767378 update_idle_rq_clock_pelt(rq);
77777379
77787380 return NULL;
7381
+}
7382
+
7383
+static struct task_struct *__pick_next_task_fair(struct rq *rq)
7384
+{
7385
+ return pick_next_task_fair(rq, NULL, NULL);
77797386 }
77807387
77817388 /*
....@@ -7828,7 +7435,7 @@
78287435 set_skip_buddy(se);
78297436 }
78307437
7831
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
7438
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
78327439 {
78337440 struct sched_entity *se = &p->se;
78347441
....@@ -7963,15 +7570,54 @@
79637570 * rewrite all of this once again.]
79647571 */
79657572
7966
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7573
+unsigned long __read_mostly max_load_balance_interval = HZ/10;
7574
+EXPORT_SYMBOL_GPL(max_load_balance_interval);
79677575
79687576 enum fbq_type { regular, remote, all };
79697577
7578
+/*
7579
+ * 'group_type' describes the group of CPUs at the moment of load balancing.
7580
+ *
7581
+ * The enum is ordered by pulling priority, with the group with lowest priority
7582
+ * first so the group_type can simply be compared when selecting the busiest
7583
+ * group. See update_sd_pick_busiest().
7584
+ */
79707585 enum group_type {
7971
- group_other = 0,
7586
+ /* The group has spare capacity that can be used to run more tasks. */
7587
+ group_has_spare = 0,
7588
+ /*
7589
+ * The group is fully used and the tasks don't compete for more CPU
7590
+ * cycles. Nevertheless, some tasks might wait before running.
7591
+ */
7592
+ group_fully_busy,
7593
+ /*
7594
+ * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
7595
+ * and must be migrated to a more powerful CPU.
7596
+ */
79727597 group_misfit_task,
7598
+ /*
7599
+ * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
7600
+ * and the task should be migrated to it instead of running on the
7601
+ * current CPU.
7602
+ */
7603
+ group_asym_packing,
7604
+ /*
7605
+ * The tasks' affinity constraints previously prevented the scheduler
7606
+ * from balancing the load across the system.
7607
+ */
79737608 group_imbalanced,
7974
- group_overloaded,
7609
+ /*
7610
+ * The CPU is overloaded and can't provide expected CPU cycles to all
7611
+ * tasks.
7612
+ */
7613
+ group_overloaded
7614
+};
7615
+
7616
+enum migration_type {
7617
+ migrate_load = 0,
7618
+ migrate_util,
7619
+ migrate_task,
7620
+ migrate_misfit
79757621 };
79767622
79777623 #define LBF_ALL_PINNED 0x01
....@@ -7994,7 +7640,6 @@
79947640 int new_dst_cpu;
79957641 enum cpu_idle_type idle;
79967642 long imbalance;
7997
- unsigned int src_grp_nr_running;
79987643 /* The set of CPUs under consideration for load-balancing */
79997644 struct cpumask *cpus;
80007645
....@@ -8005,8 +7650,9 @@
80057650 unsigned int loop_max;
80067651
80077652 enum fbq_type fbq_type;
8008
- enum group_type src_grp_type;
7653
+ enum migration_type migration_type;
80097654 struct list_head tasks;
7655
+ struct rq_flags *src_rq_rf;
80107656 };
80117657
80127658 /*
....@@ -8021,7 +7667,11 @@
80217667 if (p->sched_class != &fair_sched_class)
80227668 return 0;
80237669
8024
- if (unlikely(p->policy == SCHED_IDLE))
7670
+ if (unlikely(task_has_idle_policy(p)))
7671
+ return 0;
7672
+
7673
+ /* SMT siblings share cache */
7674
+ if (env->sd->flags & SD_SHARE_CPUCAPACITY)
80257675 return 0;
80267676
80277677 /*
....@@ -8109,8 +7759,13 @@
81097759 int can_migrate_task(struct task_struct *p, struct lb_env *env)
81107760 {
81117761 int tsk_cache_hot;
7762
+ int can_migrate = 1;
81127763
81137764 lockdep_assert_held(&env->src_rq->lock);
7765
+
7766
+ trace_android_rvh_can_migrate_task(p, env->dst_cpu, &can_migrate);
7767
+ if (!can_migrate)
7768
+ return 0;
81147769
81157770 /*
81167771 * We do not migrate tasks that are:
....@@ -8120,6 +7775,10 @@
81207775 * 4) are cache-hot on their current CPU.
81217776 */
81227777 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7778
+ return 0;
7779
+
7780
+ /* Disregard pcpu kthreads; they are where they need to be. */
7781
+ if (kthread_is_per_cpu(p))
81237782 return 0;
81247783
81257784 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
....@@ -8188,9 +7847,20 @@
81887847 */
81897848 static void detach_task(struct task_struct *p, struct lb_env *env)
81907849 {
7850
+ int detached = 0;
7851
+
81917852 lockdep_assert_held(&env->src_rq->lock);
81927853
8193
- p->on_rq = TASK_ON_RQ_MIGRATING;
7854
+ /*
7855
+ * The vendor hook may drop the lock temporarily, so
7856
+ * pass the rq flags to unpin lock. We expect the
7857
+ * rq lock to be held after return.
7858
+ */
7859
+ trace_android_rvh_migrate_queued_task(env->src_rq, env->src_rq_rf, p,
7860
+ env->dst_cpu, &detached);
7861
+ if (detached)
7862
+ return;
7863
+
81947864 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
81957865 set_task_cpu(p, env->dst_cpu);
81967866 }
....@@ -8229,7 +7899,7 @@
82297899 static const unsigned int sched_nr_migrate_break = 32;
82307900
82317901 /*
8232
- * detach_tasks() -- tries to detach up to imbalance weighted load from
7902
+ * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
82337903 * busiest_rq, as part of a balancing operation within domain "sd".
82347904 *
82357905 * Returns number of detached tasks if successful and 0 otherwise.
....@@ -8237,8 +7907,8 @@
82377907 static int detach_tasks(struct lb_env *env)
82387908 {
82397909 struct list_head *tasks = &env->src_rq->cfs_tasks;
7910
+ unsigned long util, load;
82407911 struct task_struct *p;
8241
- unsigned long load;
82427912 int detached = 0;
82437913
82447914 lockdep_assert_held(&env->src_rq->lock);
....@@ -8268,39 +7938,64 @@
82687938 break;
82697939 }
82707940
8271
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
8272
- if (sysctl_sched_performance_bias) {
8273
- if ((env->idle == CPU_NOT_IDLE) && (!task_fits_max(p, env->dst_cpu)))
8274
- goto next;
8275
- }
8276
-#endif
8277
-
82787941 if (!can_migrate_task(p, env))
82797942 goto next;
82807943
8281
- /*
8282
- * Depending of the number of CPUs and tasks and the
8283
- * cgroup hierarchy, task_h_load() can return a null
8284
- * value. Make sure that env->imbalance decreases
8285
- * otherwise detach_tasks() will stop only after
8286
- * detaching up to loop_max tasks.
8287
- */
8288
- load = max_t(unsigned long, task_h_load(p), 1);
7944
+ switch (env->migration_type) {
7945
+ case migrate_load:
7946
+ /*
7947
+ * Depending of the number of CPUs and tasks and the
7948
+ * cgroup hierarchy, task_h_load() can return a null
7949
+ * value. Make sure that env->imbalance decreases
7950
+ * otherwise detach_tasks() will stop only after
7951
+ * detaching up to loop_max tasks.
7952
+ */
7953
+ load = max_t(unsigned long, task_h_load(p), 1);
82897954
7955
+ if (sched_feat(LB_MIN) &&
7956
+ load < 16 && !env->sd->nr_balance_failed)
7957
+ goto next;
82907958
8291
- if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
8292
- goto next;
7959
+ /*
7960
+ * Make sure that we don't migrate too much load.
7961
+ * Nevertheless, let relax the constraint if
7962
+ * scheduler fails to find a good waiting task to
7963
+ * migrate.
7964
+ */
7965
+ if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
7966
+ goto next;
82937967
8294
- if ((load / 2) > env->imbalance)
8295
- goto next;
7968
+ env->imbalance -= load;
7969
+ break;
7970
+
7971
+ case migrate_util:
7972
+ util = task_util_est(p);
7973
+
7974
+ if (util > env->imbalance)
7975
+ goto next;
7976
+
7977
+ env->imbalance -= util;
7978
+ break;
7979
+
7980
+ case migrate_task:
7981
+ env->imbalance--;
7982
+ break;
7983
+
7984
+ case migrate_misfit:
7985
+ /* This is not a misfit task */
7986
+ if (task_fits_capacity(p, capacity_of(env->src_cpu)))
7987
+ goto next;
7988
+
7989
+ env->imbalance = 0;
7990
+ break;
7991
+ }
82967992
82977993 detach_task(p, env);
82987994 list_add(&p->se.group_node, &env->tasks);
82997995
83007996 detached++;
8301
- env->imbalance -= load;
83027997
8303
-#ifdef CONFIG_PREEMPT
7998
+#ifdef CONFIG_PREEMPTION
83047999 /*
83058000 * NEWIDLE balancing is a source of latency, so preemptible
83068001 * kernels will stop after the first task is detached to minimize
....@@ -8312,7 +8007,7 @@
83128007
83138008 /*
83148009 * We only want to steal up to the prescribed amount of
8315
- * weighted load.
8010
+ * load/util/tasks.
83168011 */
83178012 if (env->imbalance <= 0)
83188013 break;
....@@ -8341,7 +8036,6 @@
83418036
83428037 BUG_ON(task_rq(p) != rq);
83438038 activate_task(rq, p, ENQUEUE_NOCLOCK);
8344
- p->on_rq = TASK_ON_RQ_QUEUED;
83458039 check_preempt_curr(rq, p, 0);
83468040 }
83478041
....@@ -8382,6 +8076,7 @@
83828076 rq_unlock(env->dst_rq, &rf);
83838077 }
83848078
8079
+#ifdef CONFIG_NO_HZ_COMMON
83858080 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
83868081 {
83878082 if (cfs_rq->avg.load_avg)
....@@ -8401,12 +8096,54 @@
84018096 if (READ_ONCE(rq->avg_dl.util_avg))
84028097 return true;
84038098
8099
+ if (thermal_load_avg(rq))
8100
+ return true;
8101
+
84048102 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
84058103 if (READ_ONCE(rq->avg_irq.util_avg))
84068104 return true;
84078105 #endif
84088106
84098107 return false;
8108
+}
8109
+
8110
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
8111
+{
8112
+ rq->last_blocked_load_update_tick = jiffies;
8113
+
8114
+ if (!has_blocked)
8115
+ rq->has_blocked_load = 0;
8116
+}
8117
+#else
8118
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
8119
+static inline bool others_have_blocked(struct rq *rq) { return false; }
8120
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
8121
+#endif
8122
+
8123
+static bool __update_blocked_others(struct rq *rq, bool *done)
8124
+{
8125
+ const struct sched_class *curr_class;
8126
+ u64 now = rq_clock_pelt(rq);
8127
+ unsigned long thermal_pressure;
8128
+ bool decayed;
8129
+
8130
+ /*
8131
+ * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
8132
+ * DL and IRQ signals have been updated before updating CFS.
8133
+ */
8134
+ curr_class = rq->curr->sched_class;
8135
+
8136
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
8137
+
8138
+ decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
8139
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
8140
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
8141
+ update_irq_load_avg(rq, 0);
8142
+
8143
+ if (others_have_blocked(rq))
8144
+ *done = false;
8145
+
8146
+ return decayed;
84108147 }
84118148
84128149 #ifdef CONFIG_FAIR_GROUP_SCHED
....@@ -8422,22 +8159,17 @@
84228159 if (cfs_rq->avg.util_sum)
84238160 return false;
84248161
8425
- if (cfs_rq->avg.runnable_load_sum)
8162
+ if (cfs_rq->avg.runnable_sum)
84268163 return false;
84278164
84288165 return true;
84298166 }
84308167
8431
-static void update_blocked_averages(int cpu)
8168
+static bool __update_blocked_fair(struct rq *rq, bool *done)
84328169 {
8433
- struct rq *rq = cpu_rq(cpu);
84348170 struct cfs_rq *cfs_rq, *pos;
8435
- const struct sched_class *curr_class;
8436
- struct rq_flags rf;
8437
- bool done = true;
8438
-
8439
- rq_lock_irqsave(rq, &rf);
8440
- update_rq_clock(rq);
8171
+ bool decayed = false;
8172
+ int cpu = cpu_of(rq);
84418173
84428174 /*
84438175 * Iterates the task_group tree in a bottom up fashion, see
....@@ -8446,8 +8178,12 @@
84468178 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
84478179 struct sched_entity *se;
84488180
8449
- if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
8450
- update_tg_load_avg(cfs_rq, 0);
8181
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
8182
+ update_tg_load_avg(cfs_rq);
8183
+
8184
+ if (cfs_rq == &rq->cfs)
8185
+ decayed = true;
8186
+ }
84518187
84528188 /* Propagate pending load changes to the parent, if any: */
84538189 se = cfs_rq->tg->se[cpu];
....@@ -8463,23 +8199,10 @@
84638199
84648200 /* Don't need periodic decay once load/util_avg are null */
84658201 if (cfs_rq_has_blocked(cfs_rq))
8466
- done = false;
8202
+ *done = false;
84678203 }
84688204
8469
- curr_class = rq->curr->sched_class;
8470
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
8471
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
8472
- update_irq_load_avg(rq, 0);
8473
- /* Don't need periodic decay once load/util_avg are null */
8474
- if (others_have_blocked(rq))
8475
- done = false;
8476
-
8477
-#ifdef CONFIG_NO_HZ_COMMON
8478
- rq->last_blocked_load_update_tick = jiffies;
8479
- if (done)
8480
- rq->has_blocked_load = 0;
8481
-#endif
8482
- rq_unlock_irqrestore(rq, &rf);
8205
+ return decayed;
84838206 }
84848207
84858208 /*
....@@ -8529,27 +8252,16 @@
85298252 cfs_rq_load_avg(cfs_rq) + 1);
85308253 }
85318254 #else
8532
-static inline void update_blocked_averages(int cpu)
8255
+static bool __update_blocked_fair(struct rq *rq, bool *done)
85338256 {
8534
- struct rq *rq = cpu_rq(cpu);
85358257 struct cfs_rq *cfs_rq = &rq->cfs;
8536
- const struct sched_class *curr_class;
8537
- struct rq_flags rf;
8258
+ bool decayed;
85388259
8539
- rq_lock_irqsave(rq, &rf);
8540
- update_rq_clock(rq);
8541
- update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8260
+ decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8261
+ if (cfs_rq_has_blocked(cfs_rq))
8262
+ *done = false;
85428263
8543
- curr_class = rq->curr->sched_class;
8544
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
8545
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
8546
- update_irq_load_avg(rq, 0);
8547
-#ifdef CONFIG_NO_HZ_COMMON
8548
- rq->last_blocked_load_update_tick = jiffies;
8549
- if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
8550
- rq->has_blocked_load = 0;
8551
-#endif
8552
- rq_unlock_irqrestore(rq, &rf);
8264
+ return decayed;
85538265 }
85548266
85558267 static unsigned long task_h_load(struct task_struct *p)
....@@ -8557,6 +8269,24 @@
85578269 return p->se.avg.load_avg;
85588270 }
85598271 #endif
8272
+
8273
+static void update_blocked_averages(int cpu)
8274
+{
8275
+ bool decayed = false, done = true;
8276
+ struct rq *rq = cpu_rq(cpu);
8277
+ struct rq_flags rf;
8278
+
8279
+ rq_lock_irqsave(rq, &rf);
8280
+ update_rq_clock(rq);
8281
+
8282
+ decayed |= __update_blocked_others(rq, &done);
8283
+ decayed |= __update_blocked_fair(rq, &done);
8284
+
8285
+ update_blocked_load_status(rq, !done);
8286
+ if (decayed)
8287
+ cpufreq_update_util(rq, 0);
8288
+ rq_unlock_irqrestore(rq, &rf);
8289
+}
85608290
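
update_blocked_averages() now just combines two helpers: __update_blocked_others() folds the RT, DL, IRQ and thermal PELT signals, __update_blocked_fair() walks the cfs_rq hierarchy, and their return values are OR-ed so cpufreq is poked at most once per call. A rough sketch of how the decayed/done flags interact; everything here is a stand-in, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the per-class PELT updates; each reports whether anything decayed. */
static bool others_decayed, fair_decayed;
static bool others_blocked, fair_blocked;

static bool update_blocked_others_sketch(bool *done)
{
	if (others_blocked)
		*done = false;          /* RT/DL/IRQ/thermal load still pending */
	return others_decayed;
}

static bool update_blocked_fair_sketch(bool *done)
{
	if (fair_blocked)
		*done = false;          /* some cfs_rq still carries blocked load */
	return fair_decayed;
}

static void update_blocked_averages_sketch(void)
{
	bool decayed = false, done = true;

	decayed |= update_blocked_others_sketch(&done);
	decayed |= update_blocked_fair_sketch(&done);

	/* corresponds to update_blocked_load_status(rq, !done) */
	printf("has_blocked_load=%d\n", !done);
	if (decayed)
		printf("cpufreq_update_util() would run once, after both passes\n");
}

int main(void)
{
	fair_decayed = true;
	fair_blocked = true;
	update_blocked_averages_sketch();
	return 0;
}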
85618291 /********** Helpers for find_busiest_group ************************/
85628292
....@@ -8566,15 +8296,15 @@
85668296 struct sg_lb_stats {
85678297 unsigned long avg_load; /*Avg load across the CPUs of the group */
85688298 unsigned long group_load; /* Total load over the CPUs of the group */
8569
- unsigned long sum_weighted_load; /* Weighted load of group's tasks */
8570
- unsigned long load_per_task;
85718299 unsigned long group_capacity;
8572
- unsigned long group_util; /* Total utilization of the group */
8573
- unsigned int sum_nr_running; /* Nr tasks running in the group */
8300
+ unsigned long group_util; /* Total utilization over the CPUs of the group */
8301
+ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
8302
+ unsigned int sum_nr_running; /* Nr of tasks running in the group */
8303
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
85748304 unsigned int idle_cpus;
85758305 unsigned int group_weight;
85768306 enum group_type group_type;
8577
- int group_no_capacity;
8307
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
85788308 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
85798309 #ifdef CONFIG_NUMA_BALANCING
85808310 unsigned int nr_numa_running;
....@@ -8589,10 +8319,10 @@
85898319 struct sd_lb_stats {
85908320 struct sched_group *busiest; /* Busiest group in this sd */
85918321 struct sched_group *local; /* Local group in this sd */
8592
- unsigned long total_running;
85938322 unsigned long total_load; /* Total load of all groups in sd */
85948323 unsigned long total_capacity; /* Total capacity of all groups in sd */
85958324 unsigned long avg_load; /* Average load across all groups in sd */
8325
+ unsigned int prefer_sibling; /* tasks should go to sibling first */
85968326
85978327 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
85988328 struct sg_lb_stats local_stat; /* Statistics of the local group */
....@@ -8603,54 +8333,26 @@
86038333 /*
86048334 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
86058335 * local_stat because update_sg_lb_stats() does a full clear/assignment.
8606
- * We must however clear busiest_stat::avg_load because
8607
- * update_sd_pick_busiest() reads this before assignment.
8336
+ * We must however set busiest_stat::group_type and
8337
+ * busiest_stat::idle_cpus to the worst busiest group because
8338
+ * update_sd_pick_busiest() reads these before assignment.
86088339 */
86098340 *sds = (struct sd_lb_stats){
86108341 .busiest = NULL,
86118342 .local = NULL,
8612
- .total_running = 0UL,
86138343 .total_load = 0UL,
86148344 .total_capacity = 0UL,
86158345 .busiest_stat = {
8616
- .avg_load = 0UL,
8617
- .sum_nr_running = 0,
8618
- .group_type = group_other,
8346
+ .idle_cpus = UINT_MAX,
8347
+ .group_type = group_has_spare,
86198348 },
86208349 };
86218350 }
86228351
8623
-/**
8624
- * get_sd_load_idx - Obtain the load index for a given sched domain.
8625
- * @sd: The sched_domain whose load_idx is to be obtained.
8626
- * @idle: The idle status of the CPU for whose sd load_idx is obtained.
8627
- *
8628
- * Return: The load index.
8629
- */
8630
-static inline int get_sd_load_idx(struct sched_domain *sd,
8631
- enum cpu_idle_type idle)
8632
-{
8633
- int load_idx;
8634
-
8635
- switch (idle) {
8636
- case CPU_NOT_IDLE:
8637
- load_idx = sd->busy_idx;
8638
- break;
8639
-
8640
- case CPU_NEWLY_IDLE:
8641
- load_idx = sd->newidle_idx;
8642
- break;
8643
- default:
8644
- load_idx = sd->idle_idx;
8645
- break;
8646
- }
8647
-
8648
- return load_idx;
8649
-}
8650
-
8651
-static unsigned long scale_rt_capacity(int cpu, unsigned long max)
8352
+static unsigned long scale_rt_capacity(int cpu)
86528353 {
86538354 struct rq *rq = cpu_rq(cpu);
8355
+ unsigned long max = arch_scale_cpu_capacity(cpu);
86548356 unsigned long used, free;
86558357 unsigned long irq;
86568358
....@@ -8659,8 +8361,15 @@
86598361 if (unlikely(irq >= max))
86608362 return 1;
86618363
8364
+ /*
8365
+ * avg_rt.util_avg and avg_dl.util_avg track binary signals
8366
+ * (running and not running) with weights 0 and 1024 respectively.
8367
+ * avg_thermal.load_avg tracks thermal pressure and the weighted
8368
+ * average uses the actual delta max capacity(load).
8369
+ */
86628370 used = READ_ONCE(rq->avg_rt.util_avg);
86638371 used += READ_ONCE(rq->avg_dl.util_avg);
8372
+ used += thermal_load_avg(rq);
86648373
86658374 if (unlikely(used >= max))
86668375 return 1;
....@@ -8670,52 +8379,20 @@
86708379 return scale_irq_capacity(free, irq, max);
86718380 }
86728381
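
scale_rt_capacity() now derives the per-CPU maximum from arch_scale_cpu_capacity() itself and subtracts thermal pressure alongside the RT and DL averages before the IRQ scaling. A sketch of the arithmetic; the (max - irq) / max step is an assumption about scale_irq_capacity(), which is not part of this hunk:

#include <stdio.h>

/* Capacity left for CFS after RT, DL, thermal and IRQ pressure (sketch). */
static unsigned long cfs_capacity(unsigned long max, unsigned long irq,
				  unsigned long rt, unsigned long dl,
				  unsigned long thermal)
{
	unsigned long used = rt + dl + thermal;
	unsigned long free;

	if (irq >= max || used >= max)
		return 1;

	free = max - used;
	return free * (max - irq) / max;    /* assumed scale_irq_capacity() behaviour */
}

int main(void)
{
	/* 1024-capacity CPU, 100 units of RT, 50 DL, 74 thermal, 24 IRQ -> ~781 */
	printf("%lu\n", cfs_capacity(1024, 24, 100, 50, 74));
	return 0;
}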
8673
-void init_max_cpu_capacity(struct max_cpu_capacity *mcc) {
8674
- raw_spin_lock_init(&mcc->lock);
8675
- mcc->val = 0;
8676
- mcc->cpu = -1;
8677
-}
8678
-
86798382 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
86808383 {
8681
- unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
8384
+ unsigned long capacity = scale_rt_capacity(cpu);
86828385 struct sched_group *sdg = sd->groups;
8683
- struct max_cpu_capacity *mcc;
8684
- unsigned long max_capacity;
8685
- int max_cap_cpu;
8686
- unsigned long flags;
86878386
8688
- cpu_rq(cpu)->cpu_capacity_orig = capacity;
8689
-
8690
- capacity *= arch_scale_max_freq_capacity(sd, cpu);
8691
- capacity >>= SCHED_CAPACITY_SHIFT;
8692
-
8693
- mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
8694
-
8695
- raw_spin_lock_irqsave(&mcc->lock, flags);
8696
- max_capacity = mcc->val;
8697
- max_cap_cpu = mcc->cpu;
8698
-
8699
- if ((max_capacity > capacity && max_cap_cpu == cpu) ||
8700
- (max_capacity < capacity)) {
8701
- mcc->val = capacity;
8702
- mcc->cpu = cpu;
8703
-#ifdef CONFIG_SCHED_DEBUG
8704
- raw_spin_unlock_irqrestore(&mcc->lock, flags);
8705
- //printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
8706
- // cpu, capacity);
8707
- goto skip_unlock;
8708
-#endif
8709
- }
8710
- raw_spin_unlock_irqrestore(&mcc->lock, flags);
8711
-
8712
-skip_unlock: __attribute__ ((unused));
8713
- capacity = scale_rt_capacity(cpu, capacity);
8387
+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
87148388
87158389 if (!capacity)
87168390 capacity = 1;
87178391
8392
+ trace_android_rvh_update_cpu_capacity(cpu, &capacity);
87188393 cpu_rq(cpu)->cpu_capacity = capacity;
8394
+ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
8395
+
87198396 sdg->sgc->capacity = capacity;
87208397 sdg->sgc->min_capacity = capacity;
87218398 sdg->sgc->max_capacity = capacity;
....@@ -8748,29 +8425,11 @@
87488425 */
87498426
87508427 for_each_cpu(cpu, sched_group_span(sdg)) {
8751
- struct sched_group_capacity *sgc;
8752
- struct rq *rq = cpu_rq(cpu);
8428
+ unsigned long cpu_cap = capacity_of(cpu);
87538429
8754
- /*
8755
- * build_sched_domains() -> init_sched_groups_capacity()
8756
- * gets here before we've attached the domains to the
8757
- * runqueues.
8758
- *
8759
- * Use capacity_of(), which is set irrespective of domains
8760
- * in update_cpu_capacity().
8761
- *
8762
- * This avoids capacity from being 0 and
8763
- * causing divide-by-zero issues on boot.
8764
- */
8765
- if (unlikely(!rq->sd)) {
8766
- capacity += capacity_of(cpu);
8767
- } else {
8768
- sgc = rq->sd->groups->sgc;
8769
- capacity += sgc->capacity;
8770
- }
8771
-
8772
- min_capacity = min(capacity, min_capacity);
8773
- max_capacity = max(capacity, max_capacity);
8430
+ capacity += cpu_cap;
8431
+ min_capacity = min(cpu_cap, min_capacity);
8432
+ max_capacity = max(cpu_cap, max_capacity);
87748433 }
87758434 } else {
87768435 /*
....@@ -8804,6 +8463,18 @@
88048463 {
88058464 return ((rq->cpu_capacity * sd->imbalance_pct) <
88068465 (rq->cpu_capacity_orig * 100));
8466
+}
8467
+
8468
+/*
8469
+ * Check whether a rq has a misfit task and if it looks like we can actually
8470
+ * help that task: we can migrate the task to a CPU of higher capacity, or
8471
+ * the task's current CPU is heavily pressured.
8472
+ */
8473
+static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8474
+{
8475
+ return rq->misfit_task_load &&
8476
+ (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8477
+ check_cpu_capacity(rq, sd));
88078478 }
88088479
88098480 /*
....@@ -8853,13 +8524,17 @@
88538524 * any benefit for the load balance.
88548525 */
88558526 static inline bool
8856
-group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
8527
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
88578528 {
88588529 if (sgs->sum_nr_running < sgs->group_weight)
88598530 return true;
88608531
8532
+ if ((sgs->group_capacity * imbalance_pct) <
8533
+ (sgs->group_runnable * 100))
8534
+ return false;
8535
+
88618536 if ((sgs->group_capacity * 100) >
8862
- (sgs->group_util * env->sd->imbalance_pct))
8537
+ (sgs->group_util * imbalance_pct))
88638538 return true;
88648539
88658540 return false;
....@@ -8874,13 +8549,17 @@
88748549 * false.
88758550 */
88768551 static inline bool
8877
-group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
8552
+group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
88788553 {
88798554 if (sgs->sum_nr_running <= sgs->group_weight)
88808555 return false;
88818556
88828557 if ((sgs->group_capacity * 100) <
8883
- (sgs->group_util * env->sd->imbalance_pct))
8558
+ (sgs->group_util * imbalance_pct))
8559
+ return true;
8560
+
8561
+ if ((sgs->group_capacity * imbalance_pct) <
8562
+ (sgs->group_runnable * 100))
88848563 return true;
88858564
88868565 return false;
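
Both predicates now weigh group_runnable as well as group_util against group_capacity, using the domain's imbalance_pct as the margin. With group_capacity = 1024 and imbalance_pct = 117, for example, a group_util above about 875 (1024 * 100 / 117) already defeats the capacity check. A trimmed-down sketch of the two tests; the struct is a stand-in for sg_lb_stats:

#include <stdbool.h>

struct sg_stats {
	unsigned long group_capacity;
	unsigned long group_util;
	unsigned long group_runnable;
	unsigned int  sum_nr_running;
	unsigned int  group_weight;
};

static bool group_has_capacity_sketch(unsigned int imbalance_pct, const struct sg_stats *sgs)
{
	if (sgs->sum_nr_running < sgs->group_weight)
		return true;                            /* fewer tasks than CPUs */

	if (sgs->group_capacity * imbalance_pct < sgs->group_runnable * 100)
		return false;                           /* runnable pressure too high */

	if (sgs->group_capacity * 100 > sgs->group_util * imbalance_pct)
		return true;                            /* utilization leaves headroom */

	return false;
}

static bool group_is_overloaded_sketch(unsigned int imbalance_pct, const struct sg_stats *sgs)
{
	if (sgs->sum_nr_running <= sgs->group_weight)
		return false;

	if (sgs->group_capacity * 100 < sgs->group_util * imbalance_pct)
		return true;

	if (sgs->group_capacity * imbalance_pct < sgs->group_runnable * 100)
		return true;

	return false;
}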
....@@ -8893,8 +8572,7 @@
88938572 static inline bool
88948573 group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
88958574 {
8896
- return sg->sgc->min_capacity * capacity_margin <
8897
- ref->sgc->min_capacity * 1024;
8575
+ return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
88988576 }
88998577
89008578 /*
....@@ -8904,24 +8582,30 @@
89048582 static inline bool
89058583 group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
89068584 {
8907
- return sg->sgc->max_capacity * capacity_margin <
8908
- ref->sgc->max_capacity * 1024;
8585
+ return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
89098586 }
89108587
89118588 static inline enum
8912
-group_type group_classify(struct sched_group *group,
8589
+group_type group_classify(unsigned int imbalance_pct,
8590
+ struct sched_group *group,
89138591 struct sg_lb_stats *sgs)
89148592 {
8915
- if (sgs->group_no_capacity)
8593
+ if (group_is_overloaded(imbalance_pct, sgs))
89168594 return group_overloaded;
89178595
89188596 if (sg_imbalanced(group))
89198597 return group_imbalanced;
89208598
8599
+ if (sgs->group_asym_packing)
8600
+ return group_asym_packing;
8601
+
89218602 if (sgs->group_misfit_task_load)
89228603 return group_misfit_task;
89238604
8924
- return group_other;
8605
+ if (!group_has_capacity(imbalance_pct, sgs))
8606
+ return group_fully_busy;
8607
+
8608
+ return group_has_spare;
89258609 }
89268610
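
group_classify() resolves the types in strict precedence: overloaded, then imbalanced, then asym_packing, then misfit, then fully_busy, and has_spare as the default. The enum is ordered so that a numerically larger value means a busier group, which is what init_sd_lb_stats() and update_sd_pick_busiest() rely on when they compare group_type directly. A condensed sketch of that ordering; the enum names and the flag struct are stand-ins:

#include <stdbool.h>

/* Ordered so that a numerically larger value means a busier group. */
enum group_type_sketch {
	GROUP_HAS_SPARE,
	GROUP_FULLY_BUSY,
	GROUP_MISFIT_TASK,
	GROUP_ASYM_PACKING,
	GROUP_IMBALANCED,
	GROUP_OVERLOADED,
};

struct group_flags {
	bool overloaded, imbalanced, asym_packing, misfit, has_capacity;
};

static enum group_type_sketch classify_sketch(const struct group_flags *g)
{
	if (g->overloaded)
		return GROUP_OVERLOADED;
	if (g->imbalanced)
		return GROUP_IMBALANCED;
	if (g->asym_packing)
		return GROUP_ASYM_PACKING;
	if (g->misfit)
		return GROUP_MISFIT_TASK;
	if (!g->has_capacity)
		return GROUP_FULLY_BUSY;
	return GROUP_HAS_SPARE;
}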
89278611 static bool update_nohz_stats(struct rq *rq, bool force)
....@@ -8958,12 +8642,11 @@
89588642 struct sg_lb_stats *sgs,
89598643 int *sg_status)
89608644 {
8961
- int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8962
- int load_idx = get_sd_load_idx(env->sd, env->idle);
8963
- unsigned long load;
8964
- int i, nr_running;
8645
+ int i, nr_running, local_group;
89658646
89668647 memset(sgs, 0, sizeof(*sgs));
8648
+
8649
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
89678650
89688651 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
89698652 struct rq *rq = cpu_rq(i);
....@@ -8971,17 +8654,14 @@
89718654 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
89728655 env->flags |= LBF_NOHZ_AGAIN;
89738656
8974
- /* Bias balancing toward CPUs of our domain: */
8975
- if (local_group)
8976
- load = target_load(i, load_idx);
8977
- else
8978
- load = source_load(i, load_idx);
8979
-
8980
- sgs->group_load += load;
8657
+ sgs->group_load += cpu_load(rq);
89818658 sgs->group_util += cpu_util(i);
8982
- sgs->sum_nr_running += rq->cfs.h_nr_running;
8659
+ sgs->group_runnable += cpu_runnable(rq);
8660
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running;
89838661
89848662 nr_running = rq->nr_running;
8663
+ sgs->sum_nr_running += nr_running;
8664
+
89858665 if (nr_running > 1)
89868666 *sg_status |= SG_OVERLOAD;
89878667
....@@ -8992,13 +8672,19 @@
89928672 sgs->nr_numa_running += rq->nr_numa_running;
89938673 sgs->nr_preferred_running += rq->nr_preferred_running;
89948674 #endif
8995
- sgs->sum_weighted_load += weighted_cpuload(rq);
89968675 /*
89978676 * No need to call idle_cpu() if nr_running is not 0
89988677 */
8999
- if (!nr_running && idle_cpu(i))
8678
+ if (!nr_running && idle_cpu(i)) {
90008679 sgs->idle_cpus++;
8680
+ /* Idle cpu can't have misfit task */
8681
+ continue;
8682
+ }
90018683
8684
+ if (local_group)
8685
+ continue;
8686
+
8687
+ /* Check for a misfit task on the cpu */
90028688 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
90038689 sgs->group_misfit_task_load < rq->misfit_task_load) {
90048690 sgs->group_misfit_task_load = rq->misfit_task_load;
....@@ -9006,17 +8692,24 @@
90068692 }
90078693 }
90088694
9009
- /* Adjust by relative CPU capacity of the group */
9010
- sgs->group_capacity = group->sgc->capacity;
9011
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8695
+ /* Check if dst CPU is idle and preferred to this group */
8696
+ if (env->sd->flags & SD_ASYM_PACKING &&
8697
+ env->idle != CPU_NOT_IDLE &&
8698
+ sgs->sum_h_nr_running &&
8699
+ sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
8700
+ sgs->group_asym_packing = 1;
8701
+ }
90128702
9013
- if (sgs->sum_nr_running)
9014
- sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
8703
+ sgs->group_capacity = group->sgc->capacity;
90158704
90168705 sgs->group_weight = group->group_weight;
90178706
9018
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
9019
- sgs->group_type = group_classify(group, sgs);
8707
+ sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
8708
+
8709
+ /* Computing avg_load makes sense only when group is overloaded */
8710
+ if (sgs->group_type == group_overloaded)
8711
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8712
+ sgs->group_capacity;
90208713 }
90218714
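
update_sg_lb_stats() now sums cpu_load(), cpu_util() and cpu_runnable() per CPU, counts both total (sum_nr_running) and CFS-only (sum_h_nr_running) tasks, and only computes avg_load when the group ends up classified as overloaded. A compact sketch of the accumulation loop, driven by fabricated per-CPU samples (the overload test below is a crude stand-in for group_is_overloaded()):

#include <stdio.h>

#define NR_CPUS_SKETCH 4
#define SCHED_CAPACITY_SCALE 1024UL

struct sg_stats_sketch {
	unsigned long group_load, group_util, group_runnable, avg_load;
	unsigned int  sum_nr_running, sum_h_nr_running, idle_cpus;
	unsigned long group_capacity;
	int overloaded;
};

int main(void)
{
	/* Fabricated per-CPU samples */
	unsigned long load[NR_CPUS_SKETCH]     = { 600, 0, 300, 900 };
	unsigned long util[NR_CPUS_SKETCH]     = { 500, 0, 250, 800 };
	unsigned long runnable[NR_CPUS_SKETCH] = { 650, 0, 300, 950 };
	unsigned int  nr[NR_CPUS_SKETCH]       = { 2, 0, 1, 3 };
	unsigned int  cfs_nr[NR_CPUS_SKETCH]   = { 2, 0, 1, 2 };

	struct sg_stats_sketch sgs = { .group_capacity = 4 * SCHED_CAPACITY_SCALE };

	for (int i = 0; i < NR_CPUS_SKETCH; i++) {
		sgs.group_load       += load[i];
		sgs.group_util       += util[i];
		sgs.group_runnable   += runnable[i];
		sgs.sum_nr_running   += nr[i];
		sgs.sum_h_nr_running += cfs_nr[i];
		if (!nr[i])
			sgs.idle_cpus++;    /* an idle CPU cannot host a misfit task */
	}

	sgs.overloaded = sgs.sum_nr_running > NR_CPUS_SKETCH;  /* crude stand-in */
	if (sgs.overloaded)                 /* avg_load only when overloaded */
		sgs.avg_load = sgs.group_load * SCHED_CAPACITY_SCALE / sgs.group_capacity;

	printf("load=%lu util=%lu idle=%u avg_load=%lu\n",
	       sgs.group_load, sgs.group_util, sgs.idle_cpus, sgs.avg_load);
	return 0;
}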
90228715 /**
....@@ -9039,6 +8732,10 @@
90398732 {
90408733 struct sg_lb_stats *busiest = &sds->busiest_stat;
90418734
8735
+ /* Make sure that there is at least one task to pull */
8736
+ if (!sgs->sum_h_nr_running)
8737
+ return false;
8738
+
90428739 /*
90438740 * Don't try to pull misfit tasks we can't help.
90448741 * We can use max_capacity here as reduction in capacity on some
....@@ -9047,7 +8744,7 @@
90478744 */
90488745 if (sgs->group_type == group_misfit_task &&
90498746 (!group_smaller_max_cpu_capacity(sg, sds->local) ||
9050
- !group_has_capacity(env, &sds->local_stat)))
8747
+ sds->local_stat.group_type != group_has_spare))
90518748 return false;
90528749
90538750 if (sgs->group_type > busiest->group_type)
....@@ -9056,62 +8753,92 @@
90568753 if (sgs->group_type < busiest->group_type)
90578754 return false;
90588755
9059
- if (sgs->avg_load <= busiest->avg_load)
8756
+ /*
8757
+ * The candidate and the current busiest group are the same type of
8758
+ * group. Let's check which one is the busiest according to the type.
8759
+ */
8760
+
8761
+ switch (sgs->group_type) {
8762
+ case group_overloaded:
8763
+ /* Select the overloaded group with highest avg_load. */
8764
+ if (sgs->avg_load <= busiest->avg_load)
8765
+ return false;
8766
+ break;
8767
+
8768
+ case group_imbalanced:
8769
+ /*
8770
+ * Select the 1st imbalanced group as we don't have any way to
8771
+ * choose one more than another.
8772
+ */
90608773 return false;
90618774
9062
- if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
9063
- goto asym_packing;
9064
-
9065
- /*
9066
- * Candidate sg has no more than one task per CPU and
9067
- * has higher per-CPU capacity. Migrating tasks to less
9068
- * capable CPUs may harm throughput. Maximize throughput,
9069
- * power/energy consequences are not considered.
9070
- */
9071
- if (sgs->sum_nr_running <= sgs->group_weight &&
9072
- group_smaller_min_cpu_capacity(sds->local, sg))
9073
- return false;
9074
-
9075
- /*
9076
- * If we have more than one misfit sg go with the biggest misfit.
9077
- */
9078
- if (sgs->group_type == group_misfit_task &&
9079
- sgs->group_misfit_task_load < busiest->group_misfit_task_load)
9080
- return false;
9081
-
9082
-asym_packing:
9083
- /* This is the busiest node in its class. */
9084
- if (!(env->sd->flags & SD_ASYM_PACKING))
9085
- return true;
9086
-
9087
- /* No ASYM_PACKING if target CPU is already busy */
9088
- if (env->idle == CPU_NOT_IDLE)
9089
- return true;
9090
- /*
9091
- * ASYM_PACKING needs to move all the work to the highest
9092
- * prority CPUs in the group, therefore mark all groups
9093
- * of lower priority than ourself as busy.
9094
- */
9095
- if (sgs->sum_nr_running &&
9096
- sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
9097
- if (!sds->busiest)
9098
- return true;
9099
-
8775
+ case group_asym_packing:
91008776 /* Prefer to move from lowest priority CPU's work */
9101
- if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
9102
- sg->asym_prefer_cpu))
9103
- return true;
8777
+ if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
8778
+ return false;
8779
+ break;
8780
+
8781
+ case group_misfit_task:
8782
+ /*
8783
+ * If we have more than one misfit sg go with the biggest
8784
+ * misfit.
8785
+ */
8786
+ if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8787
+ return false;
8788
+ break;
8789
+
8790
+ case group_fully_busy:
8791
+ /*
8792
+ * Select the fully busy group with highest avg_load. In
8793
+ * theory, there is no need to pull task from such kind of
8794
+ * group because tasks have all compute capacity that they need
8795
+ * but we can still improve the overall throughput by reducing
8796
+ * contention when accessing shared HW resources.
8797
+ *
8798
+ * XXX for now avg_load is not computed and always 0 so we
8799
+ * select the 1st one.
8800
+ */
8801
+ if (sgs->avg_load <= busiest->avg_load)
8802
+ return false;
8803
+ break;
8804
+
8805
+ case group_has_spare:
8806
+ /*
8807
+ * Select not overloaded group with lowest number of idle cpus
8808
+ * and highest number of running tasks. We could also compare
8809
+ * the spare capacity which is more stable but it can end up
8810
+ * that the group has less spare capacity but finally more idle
8811
+ * CPUs which means less opportunity to pull tasks.
8812
+ */
8813
+ if (sgs->idle_cpus > busiest->idle_cpus)
8814
+ return false;
8815
+ else if ((sgs->idle_cpus == busiest->idle_cpus) &&
8816
+ (sgs->sum_nr_running <= busiest->sum_nr_running))
8817
+ return false;
8818
+
8819
+ break;
91048820 }
91058821
9106
- return false;
8822
+ /*
8823
+ * Candidate sg has no more than one task per CPU and has higher
8824
+ * per-CPU capacity. Migrating tasks to less capable CPUs may harm
8825
+ * throughput. Maximize throughput, power/energy consequences are not
8826
+ * considered.
8827
+ */
8828
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
8829
+ (sgs->group_type <= group_fully_busy) &&
8830
+ (group_smaller_min_cpu_capacity(sds->local, sg)))
8831
+ return false;
8832
+
8833
+ return true;
91078834 }
91088835
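
For two group_has_spare candidates the switch above prefers the group with fewer idle CPUs and, on a tie, the one with more running tasks, since that is where pulling work is most likely to pay off. That comparison in isolation, with the struct trimmed down as an assumption:

#include <stdbool.h>

struct spare_stats {
	unsigned int idle_cpus;
	unsigned int sum_nr_running;
};

/*
 * Mirror of the group_has_spare leg: fewer idle CPUs wins, ties are broken
 * by the larger number of running tasks.
 */
static bool candidate_is_busier(const struct spare_stats *sgs,
				const struct spare_stats *busiest)
{
	if (sgs->idle_cpus > busiest->idle_cpus)
		return false;
	if (sgs->idle_cpus == busiest->idle_cpus &&
	    sgs->sum_nr_running <= busiest->sum_nr_running)
		return false;
	return true;
}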
91098836 #ifdef CONFIG_NUMA_BALANCING
91108837 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
91118838 {
9112
- if (sgs->sum_nr_running > sgs->nr_numa_running)
8839
+ if (sgs->sum_h_nr_running > sgs->nr_numa_running)
91138840 return regular;
9114
- if (sgs->sum_nr_running > sgs->nr_preferred_running)
8841
+ if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
91158842 return remote;
91168843 return all;
91178844 }
....@@ -9136,18 +8863,334 @@
91368863 }
91378864 #endif /* CONFIG_NUMA_BALANCING */
91388865
8866
+
8867
+struct sg_lb_stats;
8868
+
8869
+/*
8870
+ * task_running_on_cpu - return 1 if @p is running on @cpu.
8871
+ */
8872
+
8873
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
8874
+{
8875
+ /* Task has no contribution or is new */
8876
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
8877
+ return 0;
8878
+
8879
+ if (task_on_rq_queued(p))
8880
+ return 1;
8881
+
8882
+ return 0;
8883
+}
8884
+
8885
+/**
8886
+ * idle_cpu_without - would a given CPU be idle without p ?
8887
+ * @cpu: the processor on which idleness is tested.
8888
+ * @p: task which should be ignored.
8889
+ *
8890
+ * Return: 1 if the CPU would be idle. 0 otherwise.
8891
+ */
8892
+static int idle_cpu_without(int cpu, struct task_struct *p)
8893
+{
8894
+ struct rq *rq = cpu_rq(cpu);
8895
+
8896
+ if (rq->curr != rq->idle && rq->curr != p)
8897
+ return 0;
8898
+
8899
+ /*
8900
+ * rq->nr_running can't be used but an updated version without the
8901
+ * impact of p on cpu must be used instead. The updated nr_running
8902
+ * must be computed and tested before calling idle_cpu_without().
8903
+ */
8904
+
8905
+#ifdef CONFIG_SMP
8906
+ if (rq->ttwu_pending)
8907
+ return 0;
8908
+#endif
8909
+
8910
+ return 1;
8911
+}
8912
+
8913
+/*
8914
+ * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
8915
+ * @sd: The sched_domain level to look for idlest group.
8916
+ * @group: sched_group whose statistics are to be updated.
8917
+ * @sgs: variable to hold the statistics for this group.
8918
+ * @p: The task for which we look for the idlest group/CPU.
8919
+ */
8920
+static inline void update_sg_wakeup_stats(struct sched_domain *sd,
8921
+ struct sched_group *group,
8922
+ struct sg_lb_stats *sgs,
8923
+ struct task_struct *p)
8924
+{
8925
+ int i, nr_running;
8926
+
8927
+ memset(sgs, 0, sizeof(*sgs));
8928
+
8929
+ for_each_cpu(i, sched_group_span(group)) {
8930
+ struct rq *rq = cpu_rq(i);
8931
+ unsigned int local;
8932
+
8933
+ sgs->group_load += cpu_load_without(rq, p);
8934
+ sgs->group_util += cpu_util_without(i, p);
8935
+ sgs->group_runnable += cpu_runnable_without(rq, p);
8936
+ local = task_running_on_cpu(i, p);
8937
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
8938
+
8939
+ nr_running = rq->nr_running - local;
8940
+ sgs->sum_nr_running += nr_running;
8941
+
8942
+ /*
8943
+ * No need to call idle_cpu_without() if nr_running is not 0
8944
+ */
8945
+ if (!nr_running && idle_cpu_without(i, p))
8946
+ sgs->idle_cpus++;
8947
+
8948
+ }
8949
+
8950
+ /* Check if task fits in the group */
8951
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
8952
+ !task_fits_capacity(p, group->sgc->max_capacity)) {
8953
+ sgs->group_misfit_task_load = 1;
8954
+ }
8955
+
8956
+ sgs->group_capacity = group->sgc->capacity;
8957
+
8958
+ sgs->group_weight = group->group_weight;
8959
+
8960
+ sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
8961
+
8962
+ /*
8963
+ * Computing avg_load makes sense only when group is fully busy or
8964
+ * overloaded
8965
+ */
8966
+ if (sgs->group_type == group_fully_busy ||
8967
+ sgs->group_type == group_overloaded)
8968
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8969
+ sgs->group_capacity;
8970
+}
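
update_sg_wakeup_stats() computes the same statistics as the load-balance path, but with @p's own contribution removed: the *_without() helpers subtract p's PELT signal, and task_running_on_cpu() removes p from the run counts before idle_cpu_without() is consulted. A fabricated two-CPU walk-through of that accounting (all values and the struct are assumptions):

#include <stdio.h>

struct cpu_sample {
	unsigned long util_without_p;   /* cpu_util_without(cpu, p) */
	unsigned int  nr_running, cfs_nr_running;
	int p_is_here;                  /* task_running_on_cpu(cpu, p) */
};

int main(void)
{
	struct cpu_sample cpus[2] = {
		{ 250, 2, 2, 1 },       /* p is currently queued here */
		{   0, 0, 0, 0 },       /* genuinely idle CPU */
	};
	unsigned int sum_nr = 0, sum_h_nr = 0, idle_cpus = 0;
	unsigned long group_util = 0;

	for (int i = 0; i < 2; i++) {
		unsigned int nr = cpus[i].nr_running - cpus[i].p_is_here;

		sum_nr     += nr;
		sum_h_nr   += cpus[i].cfs_nr_running - cpus[i].p_is_here;
		group_util += cpus[i].util_without_p;
		if (!nr)                /* idle_cpu_without() would also be checked */
			idle_cpus++;
	}
	printf("nr=%u h_nr=%u util=%lu idle=%u\n",
	       sum_nr, sum_h_nr, group_util, idle_cpus);
	return 0;
}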
8971
+
8972
+static bool update_pick_idlest(struct sched_group *idlest,
8973
+ struct sg_lb_stats *idlest_sgs,
8974
+ struct sched_group *group,
8975
+ struct sg_lb_stats *sgs)
8976
+{
8977
+ if (sgs->group_type < idlest_sgs->group_type)
8978
+ return true;
8979
+
8980
+ if (sgs->group_type > idlest_sgs->group_type)
8981
+ return false;
8982
+
8983
+ /*
8984
+ * The candidate and the current idlest group are the same type of
8985
+ * group. Let's check which one is the idlest according to the type.
8986
+ */
8987
+
8988
+ switch (sgs->group_type) {
8989
+ case group_overloaded:
8990
+ case group_fully_busy:
8991
+ /* Select the group with lowest avg_load. */
8992
+ if (idlest_sgs->avg_load <= sgs->avg_load)
8993
+ return false;
8994
+ break;
8995
+
8996
+ case group_imbalanced:
8997
+ case group_asym_packing:
8998
+ /* Those types are not used in the slow wakeup path */
8999
+ return false;
9000
+
9001
+ case group_misfit_task:
9002
+ /* Select group with the highest max capacity */
9003
+ if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
9004
+ return false;
9005
+ break;
9006
+
9007
+ case group_has_spare:
9008
+ /* Select group with most idle CPUs */
9009
+ if (idlest_sgs->idle_cpus > sgs->idle_cpus)
9010
+ return false;
9011
+
9012
+ /* Select group with lowest group_util */
9013
+ if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
9014
+ idlest_sgs->group_util <= sgs->group_util)
9015
+ return false;
9016
+
9017
+ break;
9018
+ }
9019
+
9020
+ return true;
9021
+}
9022
+
9023
+/*
9024
+ * find_idlest_group() finds and returns the least busy CPU group within the
9025
+ * domain.
9026
+ *
9027
+ * Assumes p is allowed on at least one CPU in sd.
9028
+ */
9029
+static struct sched_group *
9030
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
9031
+{
9032
+ struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
9033
+ struct sg_lb_stats local_sgs, tmp_sgs;
9034
+ struct sg_lb_stats *sgs;
9035
+ unsigned long imbalance;
9036
+ struct sg_lb_stats idlest_sgs = {
9037
+ .avg_load = UINT_MAX,
9038
+ .group_type = group_overloaded,
9039
+ };
9040
+
9041
+ imbalance = scale_load_down(NICE_0_LOAD) *
9042
+ (sd->imbalance_pct-100) / 100;
9043
+
9044
+ do {
9045
+ int local_group;
9046
+
9047
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
9048
+ struct root_domain *rd = cpu_rq(this_cpu)->rd;
9049
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
9050
+ int level = rockchip_perf_get_level();
9051
+
9052
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
9053
+ cpub_mask && cpumask_intersects(p->cpus_ptr, cpub_mask) &&
9054
+ !cpumask_intersects(sched_group_span(group), cpub_mask))
9055
+ continue;
9056
+ }
9057
+
9058
+ /* Skip over this group if it has no CPUs allowed */
9059
+ if (!cpumask_intersects(sched_group_span(group),
9060
+ p->cpus_ptr))
9061
+ continue;
9062
+
9063
+ local_group = cpumask_test_cpu(this_cpu,
9064
+ sched_group_span(group));
9065
+
9066
+ if (local_group) {
9067
+ sgs = &local_sgs;
9068
+ local = group;
9069
+ } else {
9070
+ sgs = &tmp_sgs;
9071
+ }
9072
+
9073
+ update_sg_wakeup_stats(sd, group, sgs, p);
9074
+
9075
+ if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
9076
+ idlest = group;
9077
+ idlest_sgs = *sgs;
9078
+ }
9079
+
9080
+ } while (group = group->next, group != sd->groups);
9081
+
9082
+
9083
+ /* There is no idlest group to push tasks to */
9084
+ if (!idlest)
9085
+ return NULL;
9086
+
9087
+ /* The local group has been skipped because of CPU affinity */
9088
+ if (!local)
9089
+ return idlest;
9090
+
9091
+ /*
9092
+ * If the local group is idler than the selected idlest group
9093
+ * don't try and push the task.
9094
+ */
9095
+ if (local_sgs.group_type < idlest_sgs.group_type)
9096
+ return NULL;
9097
+
9098
+ /*
9099
+ * If the local group is busier than the selected idlest group
9100
+ * try and push the task.
9101
+ */
9102
+ if (local_sgs.group_type > idlest_sgs.group_type)
9103
+ return idlest;
9104
+
9105
+ switch (local_sgs.group_type) {
9106
+ case group_overloaded:
9107
+ case group_fully_busy:
9108
+ /*
9109
+ * When comparing groups across NUMA domains, it's possible for
9110
+ * the local domain to be very lightly loaded relative to the
9111
+ * remote domains but "imbalance" skews the comparison making
9112
+ * remote CPUs look much more favourable. When considering
9113
+ * cross-domain, add imbalance to the load on the remote node
9114
+ * and consider staying local.
9115
+ */
9116
+
9117
+ if ((sd->flags & SD_NUMA) &&
9118
+ ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
9119
+ return NULL;
9120
+
9121
+ /*
9122
+ * If the local group is less loaded than the selected
9123
+ * idlest group don't try and push any tasks.
9124
+ */
9125
+ if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
9126
+ return NULL;
9127
+
9128
+ if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
9129
+ return NULL;
9130
+ break;
9131
+
9132
+ case group_imbalanced:
9133
+ case group_asym_packing:
9134
+ /* Those types are not used in the slow wakeup path */
9135
+ return NULL;
9136
+
9137
+ case group_misfit_task:
9138
+ /* Select group with the highest max capacity */
9139
+ if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
9140
+ return NULL;
9141
+ break;
9142
+
9143
+ case group_has_spare:
9144
+ if (sd->flags & SD_NUMA) {
9145
+#ifdef CONFIG_NUMA_BALANCING
9146
+ int idlest_cpu;
9147
+ /*
9148
+ * If there is spare capacity at NUMA, try to select
9149
+ * the preferred node
9150
+ */
9151
+ if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
9152
+ return NULL;
9153
+
9154
+ idlest_cpu = cpumask_first(sched_group_span(idlest));
9155
+ if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
9156
+ return idlest;
9157
+#endif
9158
+ /*
9159
+ * Otherwise, keep the task on this node to stay close
9160
+ * to its wakeup source and improve locality. If there is
9161
+ * a real need of migration, periodic load balance will
9162
+ * take care of it.
9163
+ */
9164
+ if (local_sgs.idle_cpus)
9165
+ return NULL;
9166
+ }
9167
+
9168
+ /*
9169
+ * Select group with highest number of idle CPUs. We could also
9170
+ * compare the utilization which is more stable but it can end
9171
+ * up that the group has less spare capacity but finally more
9172
+ * idle CPUs which means more opportunity to run task.
9173
+ */
9174
+ if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
9175
+ return NULL;
9176
+ break;
9177
+ }
9178
+
9179
+ return idlest;
9180
+}
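
For overloaded or fully busy groups, the slow wakeup path above biases toward staying local: the candidate has to beat the local group by more than imbalance = scale_load_down(NICE_0_LOAD) * (imbalance_pct - 100) / 100 and by more than the imbalance_pct ratio. With imbalance_pct = 117 that bias is 1024 * 17 / 100 = 174. A sketch of those three checks; NICE_0_LOAD_SKETCH and the flattened argument list are assumptions:

#include <stdio.h>

#define NICE_0_LOAD_SKETCH 1024UL

/* Returns 1 only when the idlest group is clearly lighter than the local one. */
static int push_to_idlest(unsigned long local_avg, unsigned long idlest_avg,
			  unsigned int imbalance_pct, int numa)
{
	unsigned long imbalance = NICE_0_LOAD_SKETCH * (imbalance_pct - 100) / 100;

	if (numa && idlest_avg + imbalance >= local_avg)
		return 0;                       /* remote node not clearly better */
	if (idlest_avg >= local_avg + imbalance)
		return 0;                       /* idlest not lighter enough */
	if (100 * local_avg <= imbalance_pct * idlest_avg)
		return 0;                       /* within the imbalance_pct margin */
	return 1;
}

int main(void)
{
	/* imbalance_pct=117 -> bias of 174; local=900 vs idlest=600 still pushes */
	printf("%d\n", push_to_idlest(900, 600, 117, 0));
	return 0;
}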
9181
+
91399182 /**
91409183 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
91419184 * @env: The load balancing environment.
91429185 * @sds: variable to hold the statistics for this sched_domain.
91439186 */
9187
+
91449188 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
91459189 {
91469190 struct sched_domain *child = env->sd->child;
91479191 struct sched_group *sg = env->sd->groups;
91489192 struct sg_lb_stats *local = &sds->local_stat;
91499193 struct sg_lb_stats tmp_sgs;
9150
- bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
91519194 int sg_status = 0;
91529195
91539196 #ifdef CONFIG_NO_HZ_COMMON
....@@ -9174,22 +9217,6 @@
91749217 if (local_group)
91759218 goto next_group;
91769219
9177
- /*
9178
- * In case the child domain prefers tasks go to siblings
9179
- * first, lower the sg capacity so that we'll try
9180
- * and move all the excess tasks away. We lower the capacity
9181
- * of a group only if the local group has the capacity to fit
9182
- * these excess tasks. The extra check prevents the case where
9183
- * you always pull from the heaviest group when it is already
9184
- * under-utilized (possible with a large weight task outweighs
9185
- * the tasks on the system).
9186
- */
9187
- if (prefer_sibling && sds->local &&
9188
- group_has_capacity(env, local) &&
9189
- (sgs->sum_nr_running > local->sum_nr_running + 1)) {
9190
- sgs->group_no_capacity = 1;
9191
- sgs->group_type = group_classify(sg, sgs);
9192
- }
91939220
91949221 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
91959222 sds->busiest = sg;
....@@ -9198,12 +9225,14 @@
91989225
91999226 next_group:
92009227 /* Now, start updating sd_lb_stats */
9201
- sds->total_running += sgs->sum_nr_running;
92029228 sds->total_load += sgs->group_load;
92039229 sds->total_capacity += sgs->group_capacity;
92049230
92059231 sg = sg->next;
92069232 } while (sg != env->sd->groups);
9233
+
9234
+ /* Tag domain that child domain prefers tasks go to siblings first */
9235
+ sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
92079236
92089237 #ifdef CONFIG_NO_HZ_COMMON
92099238 if ((env->flags & LBF_NOHZ_AGAIN) &&
....@@ -9217,8 +9246,6 @@
92179246 if (env->sd->flags & SD_NUMA)
92189247 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
92199248
9220
- env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
9221
-
92229249 if (!env->sd->parent) {
92239250 struct root_domain *rd = env->dst_rq->rd;
92249251
....@@ -9227,144 +9254,28 @@
92279254
92289255 /* Update over-utilization (tipping point, U >= 0) indicator */
92299256 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
9230
- trace_sched_overutilized(!!(sg_status & SG_OVERUTILIZED));
9257
+ trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
92319258 } else if (sg_status & SG_OVERUTILIZED) {
9232
- WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
9233
- trace_sched_overutilized(1);
9234
- }
9259
+ struct root_domain *rd = env->dst_rq->rd;
92359260
9261
+ WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
9262
+ trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
9263
+ }
92369264 }
92379265
9238
-/**
9239
- * check_asym_packing - Check to see if the group is packed into the
9240
- * sched domain.
9241
- *
9242
- * This is primarily intended to used at the sibling level. Some
9243
- * cores like POWER7 prefer to use lower numbered SMT threads. In the
9244
- * case of POWER7, it can move to lower SMT modes only when higher
9245
- * threads are idle. When in lower SMT modes, the threads will
9246
- * perform better since they share less core resources. Hence when we
9247
- * have idle threads, we want them to be the higher ones.
9248
- *
9249
- * This packing function is run on idle threads. It checks to see if
9250
- * the busiest CPU in this domain (core in the P7 case) has a higher
9251
- * CPU number than the packing function is being run on. Here we are
9252
- * assuming lower CPU number will be equivalent to lower a SMT thread
9253
- * number.
9254
- *
9255
- * Return: 1 when packing is required and a task should be moved to
9256
- * this CPU. The amount of the imbalance is returned in env->imbalance.
9257
- *
9258
- * @env: The load balancing environment.
9259
- * @sds: Statistics of the sched_domain which is to be packed
9260
- */
9261
-static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
9266
+static inline long adjust_numa_imbalance(int imbalance, int nr_running)
92629267 {
9263
- int busiest_cpu;
9264
-
9265
- if (!(env->sd->flags & SD_ASYM_PACKING))
9266
- return 0;
9267
-
9268
- if (env->idle == CPU_NOT_IDLE)
9269
- return 0;
9270
-
9271
- if (!sds->busiest)
9272
- return 0;
9273
-
9274
- busiest_cpu = sds->busiest->asym_prefer_cpu;
9275
- if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
9276
- return 0;
9277
-
9278
- env->imbalance = DIV_ROUND_CLOSEST(
9279
- sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
9280
- SCHED_CAPACITY_SCALE);
9281
-
9282
- return 1;
9283
-}
9284
-
9285
-/**
9286
- * fix_small_imbalance - Calculate the minor imbalance that exists
9287
- * amongst the groups of a sched_domain, during
9288
- * load balancing.
9289
- * @env: The load balancing environment.
9290
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
9291
- */
9292
-static inline
9293
-void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9294
-{
9295
- unsigned long tmp, capa_now = 0, capa_move = 0;
9296
- unsigned int imbn = 2;
9297
- unsigned long scaled_busy_load_per_task;
9298
- struct sg_lb_stats *local, *busiest;
9299
-
9300
- local = &sds->local_stat;
9301
- busiest = &sds->busiest_stat;
9302
-
9303
- if (!local->sum_nr_running)
9304
- local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
9305
- else if (busiest->load_per_task > local->load_per_task)
9306
- imbn = 1;
9307
-
9308
- scaled_busy_load_per_task =
9309
- (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9310
- busiest->group_capacity;
9311
-
9312
- if (busiest->avg_load + scaled_busy_load_per_task >=
9313
- local->avg_load + (scaled_busy_load_per_task * imbn)) {
9314
- env->imbalance = busiest->load_per_task;
9315
- return;
9316
- }
9268
+ unsigned int imbalance_min;
93179269
93189270 /*
9319
- * OK, we don't have enough imbalance to justify moving tasks,
9320
- * however we may be able to increase total CPU capacity used by
9321
- * moving them.
9271
+ * Allow a small imbalance based on a simple pair of communicating
9272
+ * tasks that remain local when the source domain is almost idle.
93229273 */
9274
+ imbalance_min = 2;
9275
+ if (nr_running <= imbalance_min)
9276
+ return 0;
93239277
9324
- capa_now += busiest->group_capacity *
9325
- min(busiest->load_per_task, busiest->avg_load);
9326
- capa_now += local->group_capacity *
9327
- min(local->load_per_task, local->avg_load);
9328
- capa_now /= SCHED_CAPACITY_SCALE;
9329
-
9330
- /* Amount of load we'd subtract */
9331
- if (busiest->avg_load > scaled_busy_load_per_task) {
9332
- capa_move += busiest->group_capacity *
9333
- min(busiest->load_per_task,
9334
- busiest->avg_load - scaled_busy_load_per_task);
9335
- }
9336
-
9337
- /* Amount of load we'd add */
9338
- if (busiest->avg_load * busiest->group_capacity <
9339
- busiest->load_per_task * SCHED_CAPACITY_SCALE) {
9340
- tmp = (busiest->avg_load * busiest->group_capacity) /
9341
- local->group_capacity;
9342
- } else {
9343
- tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9344
- local->group_capacity;
9345
- }
9346
- capa_move += local->group_capacity *
9347
- min(local->load_per_task, local->avg_load + tmp);
9348
- capa_move /= SCHED_CAPACITY_SCALE;
9349
-
9350
- /* Move if we gain throughput */
9351
- if (capa_move > capa_now) {
9352
- env->imbalance = busiest->load_per_task;
9353
- return;
9354
- }
9355
-
9356
- /* We can't see throughput improvement with the load-based
9357
- * method, but it is possible depending upon group size and
9358
- * capacity range that there might still be an underutilized
9359
- * cpu available in an asymmetric capacity system. Do one last
9360
- * check just in case.
9361
- */
9362
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
9363
- busiest->group_type == group_overloaded &&
9364
- busiest->sum_nr_running > busiest->group_weight &&
9365
- local->sum_nr_running < local->group_weight &&
9366
- local->group_capacity < busiest->group_capacity)
9367
- env->imbalance = busiest->load_per_task;
9278
+ return imbalance;
93689279 }
93699280
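
adjust_numa_imbalance() deliberately tolerates a small imbalance across NUMA nodes: when the source node runs at most two tasks (typically a communicating pair) the computed imbalance is dropped to zero so the pair stays local. The rule in isolation, as a sketch:

/* Keep a communicating pair on its node; otherwise honour the imbalance. */
static long adjust_numa_imbalance_sketch(long imbalance, int src_nr_running)
{
	const int imbalance_min = 2;

	return src_nr_running <= imbalance_min ? 0 : imbalance;
}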
93709281 /**
....@@ -9375,96 +9286,169 @@
93759286 */
93769287 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
93779288 {
9378
- unsigned long max_pull, load_above_capacity = ~0UL;
93799289 struct sg_lb_stats *local, *busiest;
93809290
93819291 local = &sds->local_stat;
93829292 busiest = &sds->busiest_stat;
93839293
9294
+ if (busiest->group_type == group_misfit_task) {
9295
+ /* Set imbalance to allow misfit tasks to be balanced. */
9296
+ env->migration_type = migrate_misfit;
9297
+ env->imbalance = 1;
9298
+ return;
9299
+ }
9300
+
9301
+ if (busiest->group_type == group_asym_packing) {
9302
+ /*
9303
+ * In case of asym capacity, we will try to migrate all load to
9304
+ * the preferred CPU.
9305
+ */
9306
+ env->migration_type = migrate_task;
9307
+ env->imbalance = busiest->sum_h_nr_running;
9308
+ return;
9309
+ }
9310
+
93849311 if (busiest->group_type == group_imbalanced) {
93859312 /*
93869313 * In the group_imb case we cannot rely on group-wide averages
9387
- * to ensure CPU-load equilibrium, look at wider averages. XXX
9314
+ * to ensure CPU-load equilibrium, try to move any task to fix
9315
+ * the imbalance. The next load balance will take care of
9316
+ * balancing back the system.
93889317 */
9389
- busiest->load_per_task =
9390
- min(busiest->load_per_task, sds->avg_load);
9318
+ env->migration_type = migrate_task;
9319
+ env->imbalance = 1;
9320
+ return;
93919321 }
93929322
93939323 /*
9394
- * Avg load of busiest sg can be less and avg load of local sg can
9395
- * be greater than avg load across all sgs of sd because avg load
9396
- * factors in sg capacity and sgs with smaller group_type are
9397
- * skipped when updating the busiest sg:
9324
+ * Try to use spare capacity of local group without overloading it or
9325
+ * emptying busiest.
93989326 */
9399
- if (busiest->group_type != group_misfit_task &&
9400
- (busiest->avg_load <= sds->avg_load ||
9401
- local->avg_load >= sds->avg_load)) {
9402
- env->imbalance = 0;
9403
- return fix_small_imbalance(env, sds);
9327
+ if (local->group_type == group_has_spare) {
9328
+ if ((busiest->group_type > group_fully_busy) &&
9329
+ !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
9330
+ /*
9331
+ * If busiest is overloaded, try to fill spare
9332
+ * capacity. This might end up creating spare capacity
9333
+ * in busiest or busiest still being overloaded but
9334
+ * there is no simple way to directly compute the
9335
+ * amount of load to migrate in order to balance the
9336
+ * system.
9337
+ */
9338
+ env->migration_type = migrate_util;
9339
+ env->imbalance = max(local->group_capacity, local->group_util) -
9340
+ local->group_util;
9341
+
9342
+ /*
9343
+ * In some cases, the group's utilization is max or even
9344
+ * higher than capacity because of migrations but the
9345
+ * local CPU is (newly) idle. There is at least one
9346
+ * waiting task in this overloaded busiest group. Let's
9347
+ * try to pull it.
9348
+ */
9349
+ if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
9350
+ env->migration_type = migrate_task;
9351
+ env->imbalance = 1;
9352
+ }
9353
+
9354
+ return;
9355
+ }
9356
+
9357
+ if (busiest->group_weight == 1 || sds->prefer_sibling) {
9358
+ unsigned int nr_diff = busiest->sum_nr_running;
9359
+ /*
9360
+ * When prefer sibling, evenly spread running tasks on
9361
+ * groups.
9362
+ */
9363
+ env->migration_type = migrate_task;
9364
+ lsub_positive(&nr_diff, local->sum_nr_running);
9365
+ env->imbalance = nr_diff >> 1;
9366
+ } else {
9367
+
9368
+ /*
9369
+ * If there is no overload, we just want to even the number of
9370
+ * idle cpus.
9371
+ */
9372
+ env->migration_type = migrate_task;
9373
+ env->imbalance = max_t(long, 0, (local->idle_cpus -
9374
+ busiest->idle_cpus) >> 1);
9375
+ }
9376
+
9377
+ /* Consider allowing a small imbalance between NUMA groups */
9378
+ if (env->sd->flags & SD_NUMA)
9379
+ env->imbalance = adjust_numa_imbalance(env->imbalance,
9380
+ busiest->sum_nr_running);
9381
+
9382
+ return;
94049383 }
94059384
94069385 /*
9407
- * If there aren't any idle CPUs, avoid creating some.
9386
+ * Local is fully busy but has to take more load to relieve the
9387
+ * busiest group
94089388 */
9409
- if (busiest->group_type == group_overloaded &&
9410
- local->group_type == group_overloaded) {
9411
- load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
9412
- if (load_above_capacity > busiest->group_capacity) {
9413
- load_above_capacity -= busiest->group_capacity;
9414
- load_above_capacity *= scale_load_down(NICE_0_LOAD);
9415
- load_above_capacity /= busiest->group_capacity;
9416
- } else
9417
- load_above_capacity = ~0UL;
9389
+ if (local->group_type < group_overloaded) {
9390
+ /*
9391
+ * Local will become overloaded so the avg_load metrics are
9392
+ * finally needed.
9393
+ */
9394
+
9395
+ local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
9396
+ local->group_capacity;
9397
+
9398
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
9399
+ sds->total_capacity;
9400
+ /*
9401
+ * If the local group is more loaded than the selected
9402
+ * busiest group don't try to pull any tasks.
9403
+ */
9404
+ if (local->avg_load >= busiest->avg_load) {
9405
+ env->imbalance = 0;
9406
+ return;
9407
+ }
94189408 }
94199409
94209410 /*
9421
- * We're trying to get all the CPUs to the average_load, so we don't
9422
- * want to push ourselves above the average load, nor do we wish to
9423
- * reduce the max loaded CPU below the average load. At the same time,
9424
- * we also don't want to reduce the group load below the group
9425
- * capacity. Thus we look for the minimum possible imbalance.
9411
+ * Both group are or will become overloaded and we're trying to get all
9412
+ * the CPUs to the average_load, so we don't want to push ourselves
9413
+ * above the average load, nor do we wish to reduce the max loaded CPU
9414
+ * below the average load. At the same time, we also don't want to
9415
+ * reduce the group load below the group capacity. Thus we look for
9416
+ * the minimum possible imbalance.
94269417 */
9427
- max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
9428
-
9429
- /* How much load to actually move to equalise the imbalance */
9418
+ env->migration_type = migrate_load;
94309419 env->imbalance = min(
9431
- max_pull * busiest->group_capacity,
9420
+ (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
94329421 (sds->avg_load - local->avg_load) * local->group_capacity
94339422 ) / SCHED_CAPACITY_SCALE;
9434
-
9435
- /* Boost imbalance to allow misfit task to be balanced.
9436
- * Always do this if we are doing a NEWLY_IDLE balance
9437
- * on the assumption that any tasks we have must not be
9438
- * long-running (and hence we cannot rely upon load).
9439
- * However if we are not idle, we should assume the tasks
9440
- * we have are longer running and not override load-based
9441
- * calculations above unless we are sure that the local
9442
- * group is underutilized.
9443
- */
9444
- if (busiest->group_type == group_misfit_task &&
9445
- (env->idle == CPU_NEWLY_IDLE ||
9446
- local->sum_nr_running < local->group_weight)) {
9447
- env->imbalance = max_t(long, env->imbalance,
9448
- busiest->group_misfit_task_load);
9449
- }
9450
-
9451
- /*
9452
- * if *imbalance is less than the average load per runnable task
9453
- * there is no guarantee that any tasks will be moved so we'll have
9454
- * a think about bumping its value to force at least one task to be
9455
- * moved
9456
- */
9457
- if (env->imbalance < busiest->load_per_task)
9458
- return fix_small_imbalance(env, sds);
94599423 }
94609424
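
For the overloaded/overloaded case the imbalance is the smaller of "how far busiest sits above the domain average" and "how far local sits below it", each weighted by the group's capacity, so neither side is pushed past the average. Worked through with made-up numbers (busiest 900, local 500, domain average 700, both capacities 1024) this yields 200:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/* Sketch of the migrate_load leg of calculate_imbalance(). */
static unsigned long load_imbalance(unsigned long busiest_avg, unsigned long local_avg,
				    unsigned long sd_avg,
				    unsigned long busiest_cap, unsigned long local_cap)
{
	unsigned long a = (busiest_avg - sd_avg) * busiest_cap;
	unsigned long b = (sd_avg - local_avg) * local_cap;

	return (a < b ? a : b) / SCHED_CAPACITY_SCALE;
}

int main(void)
{
	/* busiest=900, local=500, domain average=700, both groups 1024 capacity */
	printf("%lu\n", load_imbalance(900, 500, 700, 1024, 1024)); /* prints 200 */
	return 0;
}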
94619425 /******* find_busiest_group() helpers end here *********************/
9426
+
9427
+/*
9428
+ * Decision matrix according to the local and busiest group type:
9429
+ *
9430
+ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
9431
+ * has_spare nr_idle balanced N/A N/A balanced balanced
9432
+ * fully_busy nr_idle nr_idle N/A N/A balanced balanced
9433
+ * misfit_task force N/A N/A N/A force force
9434
+ * asym_packing force force N/A N/A force force
9435
+ * imbalanced force force N/A N/A force force
9436
+ * overloaded force force N/A N/A force avg_load
9437
+ *
9438
+ * N/A : Not Applicable because already filtered while updating
9439
+ * statistics.
9440
+ * balanced : The system is balanced for these 2 groups.
9441
+ * force : Calculate the imbalance as load migration is probably needed.
9442
+ * avg_load : Only if imbalance is significant enough.
9443
+ * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
9444
+ * different in groups.
9445
+ */
94629446
94639447 /**
94649448 * find_busiest_group - Returns the busiest group within the sched_domain
94659449 * if there is an imbalance.
94669450 *
9467
- * Also calculates the amount of weighted load which should be moved
9451
+ * Also calculates the amount of runnable load which should be moved
94689452 * to restore balance.
94699453 *
94709454 * @env: The load balancing environment.
....@@ -9479,32 +9463,36 @@
94799463 init_sd_lb_stats(&sds);
94809464
94819465 /*
9482
- * Compute the various statistics relavent for load balancing at
9466
+ * Compute the various statistics relevant for load balancing at
94839467 * this level.
94849468 */
94859469 update_sd_lb_stats(env, &sds);
94869470
9487
- if (static_branch_unlikely(&sched_energy_present)) {
9471
+ if (sched_energy_enabled()) {
94889472 struct root_domain *rd = env->dst_rq->rd;
9473
+ int out_balance = 1;
94899474
9490
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
9475
+ trace_android_rvh_find_busiest_group(sds.busiest, env->dst_rq,
9476
+ &out_balance);
9477
+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)
9478
+ && out_balance)
94919479 goto out_balanced;
94929480 }
94939481
94949482 local = &sds.local_stat;
94959483 busiest = &sds.busiest_stat;
94969484
9497
- /* ASYM feature bypasses nice load balance check */
9498
- if (check_asym_packing(env, &sds))
9499
- return sds.busiest;
9500
-
95019485 /* There is no busy sibling group to pull tasks from */
9502
- if (!sds.busiest || busiest->sum_nr_running == 0)
9486
+ if (!sds.busiest)
95039487 goto out_balanced;
95049488
9505
- /* XXX broken for overlapping NUMA groups */
9506
- sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
9507
- / sds.total_capacity;
9489
+ /* Misfit tasks should be dealt with regardless of the avg load */
9490
+ if (busiest->group_type == group_misfit_task)
9491
+ goto force_balance;
9492
+
9493
+ /* ASYM feature bypasses nice load balance check */
9494
+ if (busiest->group_type == group_asym_packing)
9495
+ goto force_balance;
95089496
95099497 /*
95109498 * If the busiest group is imbalanced the below checks don't
....@@ -9515,55 +9503,80 @@
95159503 goto force_balance;
95169504
95179505 /*
9518
- * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
9519
- * capacities from resulting in underutilization due to avg_load.
9520
- */
9521
- if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
9522
- busiest->group_no_capacity)
9523
- goto force_balance;
9524
-
9525
- /* Misfit tasks should be dealt with regardless of the avg load */
9526
- if (busiest->group_type == group_misfit_task)
9527
- goto force_balance;
9528
-
9529
- /*
95309506 * If the local group is busier than the selected busiest group
95319507 * don't try and pull any tasks.
95329508 */
9533
- if (local->avg_load >= busiest->avg_load)
9509
+ if (local->group_type > busiest->group_type)
95349510 goto out_balanced;
95359511
95369512 /*
9537
- * Don't pull any tasks if this group is already above the domain
9538
- * average load.
9513
+ * When groups are overloaded, use the avg_load to ensure fairness
9514
+ * between tasks.
95399515 */
9540
- if (local->avg_load >= sds.avg_load)
9541
- goto out_balanced;
9542
-
9543
- if (env->idle == CPU_IDLE) {
9516
+ if (local->group_type == group_overloaded) {
95449517 /*
9545
- * This CPU is idle. If the busiest group is not overloaded
9546
- * and there is no imbalance between this and busiest group
9547
- * wrt idle CPUs, it is balanced. The imbalance becomes
9548
- * significant if the diff is greater than 1 otherwise we
9549
- * might end up to just move the imbalance on another group
9518
+ * If the local group is more loaded than the selected
9519
+ * busiest group don't try to pull any tasks.
95509520 */
9551
- if ((busiest->group_type != group_overloaded) &&
9552
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
9521
+ if (local->avg_load >= busiest->avg_load)
95539522 goto out_balanced;
9554
- } else {
9523
+
9524
+ /* XXX broken for overlapping NUMA groups */
9525
+ sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
9526
+ sds.total_capacity;
9527
+
95559528 /*
9556
- * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
9557
- * imbalance_pct to be conservative.
9529
+ * Don't pull any tasks if this group is already above the
9530
+ * domain average load.
9531
+ */
9532
+ if (local->avg_load >= sds.avg_load)
9533
+ goto out_balanced;
9534
+
9535
+ /*
9536
+ * If the busiest group is more loaded, use imbalance_pct to be
9537
+ * conservative.
95589538 */
95599539 if (100 * busiest->avg_load <=
95609540 env->sd->imbalance_pct * local->avg_load)
95619541 goto out_balanced;
95629542 }
95639543
9544
+ /* Try to move all excess tasks to child's sibling domain */
9545
+ if (sds.prefer_sibling && local->group_type == group_has_spare &&
9546
+ busiest->sum_nr_running > local->sum_nr_running + 1)
9547
+ goto force_balance;
9548
+
9549
+ if (busiest->group_type != group_overloaded) {
9550
+ if (env->idle == CPU_NOT_IDLE)
9551
+ /*
9552
+ * If the busiest group is not overloaded (and as a
9553
+ * result the local one too) but this CPU is already
9554
+ * busy, let another idle CPU try to pull a task.
9555
+ */
9556
+ goto out_balanced;
9557
+
9558
+ if (busiest->group_weight > 1 &&
9559
+ local->idle_cpus <= (busiest->idle_cpus + 1))
9560
+ /*
9561
+ * If the busiest group is not overloaded
9562
+ * and there is no imbalance between this and busiest
9563
+ * group wrt idle CPUs, it is balanced. The imbalance
9564
+ * becomes significant if the diff is greater than 1
9565
+ * otherwise we might end up just moving the imbalance
9566
+ * to another group. Of course this applies only if
9567
+ * there is more than 1 CPU per group.
9568
+ */
9569
+ goto out_balanced;
9570
+
9571
+ if (busiest->sum_h_nr_running == 1)
9572
+ /*
9573
+ * busiest doesn't have any tasks waiting to run
9574
+ */
9575
+ goto out_balanced;
9576
+ }
9577
+
95649578 force_balance:
95659579 /* Looks like there is an imbalance. Compute it */
9566
- env->src_grp_type = busiest->group_type;
95679580 calculate_imbalance(env, &sds);
95689581 return env->imbalance ? sds.busiest : NULL;
95699582
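
The overloaded-vs-overloaded path above ends with the imbalance_pct guard: the busiest group must carry more load per unit of capacity than the local group by a configurable margin before an imbalance is even computed. A self-contained sketch of that arithmetic (the helper name and the 125 value are assumptions for illustration, not taken from this patch):

#include <stdbool.h>

/* Mirrors: 100 * busiest->avg_load <= env->sd->imbalance_pct * local->avg_load */
static bool groups_look_balanced(unsigned long local_avg_load,
				 unsigned long busiest_avg_load,
				 unsigned int imbalance_pct)
{
	return 100UL * busiest_avg_load <= imbalance_pct * local_avg_load;
}

/*
 * With a typical imbalance_pct of 125:
 *   local = 1000, busiest = 1100 -> 110000 <= 125000: treated as balanced.
 *   local = 1000, busiest = 1300 -> 130000 >  125000: an imbalance is computed.
 * The busiest group therefore needs roughly 25% more load per unit of
 * capacity before any tasks are pulled.
 */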
....@@ -9579,11 +9592,18 @@
95799592 struct sched_group *group)
95809593 {
95819594 struct rq *busiest = NULL, *rq;
9582
- unsigned long busiest_load = 0, busiest_capacity = 1;
9583
- int i;
9595
+ unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
9596
+ unsigned int busiest_nr = 0;
9597
+ int i, done = 0;
9598
+
9599
+ trace_android_rvh_find_busiest_queue(env->dst_cpu, group, env->cpus,
9600
+ &busiest, &done);
9601
+ if (done)
9602
+ return busiest;
95849603
95859604 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
9586
- unsigned long capacity, wl;
9605
+ unsigned long capacity, load, util;
9606
+ unsigned int nr_running;
95879607 enum fbq_type rt;
95889608
95899609 rq = cpu_rq(i);
....@@ -9611,20 +9631,8 @@
96119631 if (rt > env->fbq_type)
96129632 continue;
96139633
9614
- /*
9615
- * For ASYM_CPUCAPACITY domains with misfit tasks we simply
9616
- * seek the "biggest" misfit task.
9617
- */
9618
- if (env->src_grp_type == group_misfit_task) {
9619
- if (rq->misfit_task_load > busiest_load) {
9620
- busiest_load = rq->misfit_task_load;
9621
- busiest = rq;
9622
- }
9623
-
9624
- continue;
9625
- }
9626
-
96279634 capacity = capacity_of(i);
9635
+ nr_running = rq->cfs.h_nr_running;
96289636
96299637 /*
96309638 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
....@@ -9634,35 +9642,77 @@
96349642 */
96359643 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
96369644 capacity_of(env->dst_cpu) < capacity &&
9637
- rq->nr_running == 1)
9645
+ nr_running == 1)
96389646 continue;
96399647
9640
- wl = weighted_cpuload(rq);
9648
+ switch (env->migration_type) {
9649
+ case migrate_load:
9650
+ /*
9651
+ * When comparing with load imbalance, use cpu_load()
9652
+ * which is not scaled with the CPU capacity.
9653
+ */
9654
+ load = cpu_load(rq);
96419655
9642
- /*
9643
- * When comparing with imbalance, use weighted_cpuload()
9644
- * which is not scaled with the CPU capacity.
9645
- */
9656
+ if (nr_running == 1 && load > env->imbalance &&
9657
+ !check_cpu_capacity(rq, env->sd))
9658
+ break;
96469659
9647
- if (rq->nr_running == 1 && wl > env->imbalance &&
9648
- !check_cpu_capacity(rq, env->sd))
9649
- continue;
9660
+ /*
9661
+ * For the load comparisons with the other CPUs,
9662
+ * consider the cpu_load() scaled with the CPU
9663
+ * capacity, so that the load can be moved away
9664
+ * from the CPU that is potentially running at a
9665
+ * lower capacity.
9666
+ *
9667
+ * Thus we're looking for max(load_i / capacity_i),
9668
+ * crosswise multiplication to rid ourselves of the
9669
+ * division works out to:
9670
+ * load_i * capacity_j > load_j * capacity_i;
9671
+ * where j is our previous maximum.
9672
+ */
9673
+ if (load * busiest_capacity > busiest_load * capacity) {
9674
+ busiest_load = load;
9675
+ busiest_capacity = capacity;
9676
+ busiest = rq;
9677
+ }
9678
+ break;
96509679
9651
- /*
9652
- * For the load comparisons with the other CPU's, consider
9653
- * the weighted_cpuload() scaled with the CPU capacity, so
9654
- * that the load can be moved away from the CPU that is
9655
- * potentially running at a lower capacity.
9656
- *
9657
- * Thus we're looking for max(wl_i / capacity_i), crosswise
9658
- * multiplication to rid ourselves of the division works out
9659
- * to: wl_i * capacity_j > wl_j * capacity_i; where j is
9660
- * our previous maximum.
9661
- */
9662
- if (wl * busiest_capacity > busiest_load * capacity) {
9663
- busiest_load = wl;
9664
- busiest_capacity = capacity;
9665
- busiest = rq;
9680
+ case migrate_util:
9681
+ util = cpu_util(cpu_of(rq));
9682
+
9683
+ /*
9684
+ * Don't try to pull utilization from a CPU with one
9685
+ * running task. Whatever its utilization, we will fail to
9686
+ * detach the task.
9687
+ */
9688
+ if (nr_running <= 1)
9689
+ continue;
9690
+
9691
+ if (busiest_util < util) {
9692
+ busiest_util = util;
9693
+ busiest = rq;
9694
+ }
9695
+ break;
9696
+
9697
+ case migrate_task:
9698
+ if (busiest_nr < nr_running) {
9699
+ busiest_nr = nr_running;
9700
+ busiest = rq;
9701
+ }
9702
+ break;
9703
+
9704
+ case migrate_misfit:
9705
+ /*
9706
+ * For ASYM_CPUCAPACITY domains with misfit tasks we
9707
+ * simply seek the "biggest" misfit task.
9708
+ */
9709
+ if (rq->misfit_task_load > busiest_load) {
9710
+ busiest_load = rq->misfit_task_load;
9711
+ busiest = rq;
9712
+ }
9713
+
9714
+ break;
9715
+
96669716 }
96679717 }
96689718
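
The migrate_load case above picks max(load_i / capacity_i) without ever dividing, by cross-multiplying each candidate against the current maximum. A standalone sketch of that comparison (struct and function names are made up for illustration):

struct cpu_sample {
	unsigned long load;		/* cpu_load()-style value */
	unsigned long capacity;		/* capacity_of()-style value */
};

/* Returns the index with the highest load/capacity ratio, or -1 if all loads are zero. */
static int most_loaded_for_its_capacity(const struct cpu_sample *s, int nr)
{
	unsigned long busiest_load = 0, busiest_capacity = 1;
	int i, busiest = -1;

	for (i = 0; i < nr; i++) {
		/* load_i * capacity_j > load_j * capacity_i, where j is the current maximum */
		if (s[i].load * busiest_capacity > busiest_load * s[i].capacity) {
			busiest_load = s[i].load;
			busiest_capacity = s[i].capacity;
			busiest = i;
		}
	}
	return busiest;
}

Keeping the comparison in integer math trades a theoretical overflow for precision; with load and capacity both far below 2^32, the product fits comfortably in an unsigned long on 64-bit targets.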
....@@ -9675,21 +9725,25 @@
96759725 */
96769726 #define MAX_PINNED_INTERVAL 512
96779727
9678
-static int need_active_balance(struct lb_env *env)
9728
+static inline bool
9729
+asym_active_balance(struct lb_env *env)
9730
+{
9731
+ /*
9732
+ * ASYM_PACKING needs to force migrate tasks from busy but
9733
+ * lower priority CPUs in order to pack all tasks in the
9734
+ * highest priority CPUs.
9735
+ */
9736
+ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
9737
+ sched_asym_prefer(env->dst_cpu, env->src_cpu);
9738
+}
9739
+
9740
+static inline bool
9741
+voluntary_active_balance(struct lb_env *env)
96799742 {
96809743 struct sched_domain *sd = env->sd;
96819744
9682
- if (env->idle == CPU_NEWLY_IDLE) {
9683
-
9684
- /*
9685
- * ASYM_PACKING needs to force migrate tasks from busy but
9686
- * lower priority CPUs in order to pack all tasks in the
9687
- * highest priority CPUs.
9688
- */
9689
- if ((sd->flags & SD_ASYM_PACKING) &&
9690
- sched_asym_prefer(env->dst_cpu, env->src_cpu))
9691
- return 1;
9692
- }
9745
+ if (asym_active_balance(env))
9746
+ return 1;
96939747
96949748 /*
96959749 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
....@@ -9704,19 +9758,18 @@
97049758 return 1;
97059759 }
97069760
9707
- if (env->src_grp_type == group_misfit_task)
9761
+ if (env->migration_type == migrate_misfit)
97089762 return 1;
97099763
9710
- if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
9711
- env->src_rq->cfs.h_nr_running == 1 &&
9712
- cpu_overutilized(env->src_cpu) &&
9713
- !cpu_overutilized(env->dst_cpu)) {
9714
- return 1;
9715
- }
9764
+ return 0;
9765
+}
97169766
9717
- if (env->src_grp_type == group_overloaded && env->src_rq->misfit_task_load)
9718
- return 1;
9767
+static int need_active_balance(struct lb_env *env)
9768
+{
9769
+ struct sched_domain *sd = env->sd;
97199770
9771
+ if (voluntary_active_balance(env))
9772
+ return 1;
97209773
97219774 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
97229775 }
....@@ -9726,7 +9779,17 @@
97269779 static int should_we_balance(struct lb_env *env)
97279780 {
97289781 struct sched_group *sg = env->sd->groups;
9729
- int cpu, balance_cpu = -1;
9782
+ int cpu;
9783
+
9784
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
9785
+ struct root_domain *rd = env->dst_rq->rd;
9786
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
9787
+ int level = rockchip_perf_get_level();
9788
+
9789
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
9790
+ cpul_mask && cpumask_test_cpu(env->dst_cpu, cpul_mask))
9791
+ return 0;
9792
+ }
97309793
97319794 /*
97329795 * Ensure the balancing environment is consistent; can happen
....@@ -9747,18 +9810,12 @@
97479810 if (!idle_cpu(cpu))
97489811 continue;
97499812
9750
- balance_cpu = cpu;
9751
- break;
9813
+ /* Are we the first idle CPU? */
9814
+ return cpu == env->dst_cpu;
97529815 }
97539816
9754
- if (balance_cpu == -1)
9755
- balance_cpu = group_balance_cpu(sg);
9756
-
9757
- /*
9758
- * First idle CPU or the first CPU(busiest) in this sched group
9759
- * is eligible for doing load balancing at this and above domains.
9760
- */
9761
- return balance_cpu == env->dst_cpu;
9817
+ /* Are we the first CPU of this group ? */
9818
+ return group_balance_cpu(sg) == env->dst_cpu;
97629819 }
97639820
97649821 /*
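
Condensed model of the rewritten should_we_balance() election above (hypothetical helper signatures, not kernel APIs): exactly one CPU per group runs the balance pass, preferring the first idle CPU and falling back to the group's designated balance CPU, so the work is not repeated on every CPU of the group.

#include <stdbool.h>

static bool elected_to_balance(int this_cpu, const int *group_cpus, int nr,
			       bool (*cpu_is_idle)(int cpu), int group_balance_cpu)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (!cpu_is_idle(group_cpus[i]))
			continue;
		/* Are we the first idle CPU of the group? */
		return group_cpus[i] == this_cpu;
	}

	/* No idle CPU in the group: only the designated balance CPU proceeds. */
	return group_balance_cpu == this_cpu;
}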
....@@ -9830,6 +9887,7 @@
98309887
98319888 more_balance:
98329889 rq_lock_irqsave(busiest, &rf);
9890
+ env.src_rq_rf = &rf;
98339891 update_rq_clock(busiest);
98349892
98359893 /*
....@@ -9882,7 +9940,7 @@
98829940 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
98839941
98849942 /* Prevent to re-select dst_cpu via env's CPUs */
9885
- cpumask_clear_cpu(env.dst_cpu, env.cpus);
9943
+ __cpumask_clear_cpu(env.dst_cpu, env.cpus);
98869944
98879945 env.dst_rq = cpu_rq(env.new_dst_cpu);
98889946 env.dst_cpu = env.new_dst_cpu;
....@@ -9909,7 +9967,7 @@
99099967
99109968 /* All tasks on this runqueue were pinned by CPU affinity */
99119969 if (unlikely(env.flags & LBF_ALL_PINNED)) {
9912
- cpumask_clear_cpu(cpu_of(busiest), cpus);
9970
+ __cpumask_clear_cpu(cpu_of(busiest), cpus);
99139971 /*
99149972 * Attempting to continue load balancing at the current
99159973 * sched_domain level only makes sense if there are
....@@ -9936,8 +9994,7 @@
99369994 * excessive cache_hot migrations and active balances.
99379995 */
99389996 if (idle != CPU_NEWLY_IDLE)
9939
- if (env.src_grp_nr_running > 1)
9940
- sd->nr_balance_failed++;
9997
+ sd->nr_balance_failed++;
99419998
99429999 if (need_active_balance(&env)) {
994310000 unsigned long flags;
....@@ -9980,7 +10037,7 @@
998010037 } else
998110038 sd->nr_balance_failed = 0;
998210039
9983
- if (likely(!active_balance)) {
10040
+ if (likely(!active_balance) || voluntary_active_balance(&env)) {
998410041 /* We were unbalanced, so reset the balancing interval */
998510042 sd->balance_interval = sd->min_interval;
998610043 } else {
....@@ -10023,18 +10080,18 @@
1002310080 ld_moved = 0;
1002410081
1002510082 /*
10026
- * idle_balance() disregards balance intervals, so we could repeatedly
10027
- * reach this code, which would lead to balance_interval skyrocketting
10028
- * in a short amount of time. Skip the balance_interval increase logic
10029
- * to avoid that.
10083
+ * newidle_balance() disregards balance intervals, so we could
10084
+ * repeatedly reach this code, which would lead to balance_interval
10085
+ * skyrocketing in a short amount of time. Skip the balance_interval
10086
+ * increase logic to avoid that.
1003010087 */
1003110088 if (env.idle == CPU_NEWLY_IDLE)
1003210089 goto out;
1003310090
1003410091 /* tune up the balancing interval */
10035
- if (((env.flags & LBF_ALL_PINNED) &&
10036
- sd->balance_interval < MAX_PINNED_INTERVAL) ||
10037
- (sd->balance_interval < sd->max_interval))
10092
+ if ((env.flags & LBF_ALL_PINNED &&
10093
+ sd->balance_interval < MAX_PINNED_INTERVAL) ||
10094
+ sd->balance_interval < sd->max_interval)
1003810095 sd->balance_interval *= 2;
1003910096 out:
1004010097 return ld_moved;
....@@ -10050,6 +10107,15 @@
1005010107
1005110108 /* scale ms to jiffies */
1005210109 interval = msecs_to_jiffies(interval);
10110
+
10111
+ /*
10112
+ * Reduce likelihood of busy balancing at higher domains racing with
10113
+ * balancing at lower domains by preventing their balancing periods
10114
+ * from being multiples of each other.
10115
+ */
10116
+ if (cpu_busy)
10117
+ interval -= 1;
10118
+
1005310119 interval = clamp(interval, 1UL, max_load_balance_interval);
1005410120
1005510121 return interval;
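
The "interval -= 1" added above keeps the busy balance period of a domain from being an exact multiple of its parent's, so the two levels rarely fire on the same jiffy. A toy calculation (the 256/512 jiffy values are made up, not defaults) shows the effect:

#include <stdio.h>

static unsigned long gcd(unsigned long a, unsigned long b)
{
	while (b) {
		unsigned long t = a % b;
		a = b;
		b = t;
	}
	return a;
}

static unsigned long lcm(unsigned long a, unsigned long b)
{
	return a / gcd(a, b) * b;
}

int main(void)
{
	unsigned long child = 256, parent = 512;	/* busy intervals in jiffies */

	printf("without -1: aligned every %lu jiffies\n", lcm(child, parent));
	printf("with    -1: aligned every %lu jiffies\n", lcm(child - 1, parent - 1));
	return 0;
}

/* Prints 512 and 130305: the child and parent levels now collide roughly 250x less often. */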
....@@ -10112,9 +10178,8 @@
1011210178 /* Search for an sd spanning us and the target CPU. */
1011310179 rcu_read_lock();
1011410180 for_each_domain(target_cpu, sd) {
10115
- if ((sd->flags & SD_LOAD_BALANCE) &&
10116
- cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10117
- break;
10181
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10182
+ break;
1011810183 }
1011910184
1012010185 if (likely(sd)) {
....@@ -10132,6 +10197,7 @@
1013210197 * about DST_PINNED.
1013310198 */
1013410199 .flags = LBF_DST_PINNED,
10200
+ .src_rq_rf = &rf,
1013510201 };
1013610202
1013710203 schedstat_inc(sd->alb_count);
....@@ -10167,7 +10233,7 @@
1016710233 */
1016810234 void update_max_interval(void)
1016910235 {
10170
- max_load_balance_interval = HZ*num_online_cpus()/10;
10236
+ max_load_balance_interval = HZ*num_active_cpus()/10;
1017110237 }
1017210238
1017310239 /*
....@@ -10180,6 +10246,7 @@
1018010246 {
1018110247 int continue_balancing = 1;
1018210248 int cpu = rq->cpu;
10249
+ int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
1018310250 unsigned long interval;
1018410251 struct sched_domain *sd;
1018510252 /* Earliest time when we have to do rebalance again */
....@@ -10187,6 +10254,10 @@
1018710254 int update_next_balance = 0;
1018810255 int need_serialize, need_decay = 0;
1018910256 u64 max_cost = 0;
10257
+
10258
+ trace_android_rvh_sched_rebalance_domains(rq, &continue_balancing);
10259
+ if (!continue_balancing)
10260
+ return;
1019010261
1019110262 rcu_read_lock();
1019210263 for_each_domain(cpu, sd) {
....@@ -10202,9 +10273,6 @@
1020210273 }
1020310274 max_cost += sd->max_newidle_lb_cost;
1020410275
10205
- if (!(sd->flags & SD_LOAD_BALANCE))
10206
- continue;
10207
-
1020810276 /*
1020910277 * Stop the load balance at this level. There is another
1021010278 * CPU in our sched group which is doing load balancing more
....@@ -10216,7 +10284,7 @@
1021610284 break;
1021710285 }
1021810286
10219
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
10287
+ interval = get_sd_balance_interval(sd, busy);
1022010288
1022110289 need_serialize = sd->flags & SD_SERIALIZE;
1022210290 if (need_serialize) {
....@@ -10232,9 +10300,10 @@
1023210300 * state even if we migrated tasks. Update it.
1023310301 */
1023410302 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
10303
+ busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
1023510304 }
1023610305 sd->last_balance = jiffies;
10237
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
10306
+ interval = get_sd_balance_interval(sd, busy);
1023810307 }
1023910308 if (need_serialize)
1024010309 spin_unlock(&balancing);
....@@ -10294,7 +10363,11 @@
1029410363
1029510364 static inline int find_new_ilb(void)
1029610365 {
10297
- int ilb;
10366
+ int ilb = -1;
10367
+
10368
+ trace_android_rvh_find_new_ilb(nohz.idle_cpus_mask, &ilb);
10369
+ if (ilb >= 0)
10370
+ return ilb;
1029810371
1029910372 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
1030010373 housekeeping_cpumask(HK_FLAG_MISC)) {
....@@ -10325,29 +10398,25 @@
1032510398 if (ilb_cpu >= nr_cpu_ids)
1032610399 return;
1032710400
10401
+ /*
10402
+ * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
10403
+ * the first flag owns it; cleared by nohz_csd_func().
10404
+ */
1032810405 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
1032910406 if (flags & NOHZ_KICK_MASK)
1033010407 return;
1033110408
1033210409 /*
10333
- * Use smp_send_reschedule() instead of resched_cpu().
10334
- * This way we generate a sched IPI on the target CPU which
10410
+ * This way we generate an IPI on the target CPU which
1033510411 * is idle. And the softirq performing nohz idle load balance
1033610412 * will be run before returning from the IPI.
1033710413 */
10338
- smp_send_reschedule(ilb_cpu);
10414
+ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
1033910415 }
1034010416
1034110417 /*
10342
- * Current heuristic for kicking the idle load balancer in the presence
10343
- * of an idle cpu in the system.
10344
- * - This rq has more than one task.
10345
- * - This rq has at least one CFS task and the capacity of the CPU is
10346
- * significantly reduced because of RT tasks or IRQs.
10347
- * - At parent of LLC scheduler domain level, this cpu's scheduler group has
10348
- * multiple busy cpu.
10349
- * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
10350
- * domain span are idle.
10418
+ * Current decision point for kicking the idle load balancer in the presence
10419
+ * of idle CPUs in the system.
1035110420 */
1035210421 static void nohz_balancer_kick(struct rq *rq)
1035310422 {
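
kick_ilb() above relies on atomic_fetch_or() returning the previous flag word: only the caller that transitions it from empty to non-empty owns the kick and sends the single IPI. A generic sketch of that ownership pattern (standalone state, not the kernel's nohz bookkeeping), shown here before the nohz_balancer_kick() body that follows:

#include <linux/atomic.h>
#include <linux/types.h>

static atomic_t kick_flags;

/* Returns true only for the caller that must actually raise the IPI. */
static bool claim_kick(unsigned int flags, unsigned int kick_mask)
{
	unsigned int old = atomic_fetch_or(flags, &kick_flags);

	/* Someone already owns a pending kick; they will fire the IPI. */
	return !(old & kick_mask);
}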
....@@ -10356,6 +10425,7 @@
1035610425 struct sched_domain *sd;
1035710426 int nr_busy, i, cpu = rq->cpu;
1035810427 unsigned int flags = 0;
10428
+ int done = 0;
1035910429
1036010430 if (unlikely(rq->idle_balance))
1036110431 return;
....@@ -10380,30 +10450,25 @@
1038010450 if (time_before(now, nohz.next_balance))
1038110451 goto out;
1038210452
10383
- if (rq->nr_running >= 2 || rq->misfit_task_load) {
10453
+ trace_android_rvh_sched_nohz_balancer_kick(rq, &flags, &done);
10454
+ if (done)
10455
+ goto out;
10456
+
10457
+ if (rq->nr_running >= 2) {
1038410458 flags = NOHZ_KICK_MASK;
1038510459 goto out;
1038610460 }
1038710461
1038810462 rcu_read_lock();
10389
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10390
- if (sds) {
10391
- /*
10392
- * XXX: write a coherent comment on why we do this.
10393
- * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
10394
- */
10395
- nr_busy = atomic_read(&sds->nr_busy_cpus);
10396
- if (nr_busy > 1) {
10397
- flags = NOHZ_KICK_MASK;
10398
- goto unlock;
10399
- }
10400
-
10401
- }
1040210463
1040310464 sd = rcu_dereference(rq->sd);
1040410465 if (sd) {
10405
- if ((rq->cfs.h_nr_running >= 1) &&
10406
- check_cpu_capacity(rq, sd)) {
10466
+ /*
10467
+ * If there's a CFS task and the current CPU has reduced
10468
+ * capacity; kick the ILB to see if there's a better CPU to run
10469
+ * on.
10470
+ */
10471
+ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
1040710472 flags = NOHZ_KICK_MASK;
1040810473 goto unlock;
1040910474 }
....@@ -10411,15 +10476,55 @@
1041110476
1041210477 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
1041310478 if (sd) {
10414
- for_each_cpu(i, sched_domain_span(sd)) {
10415
- if (i == cpu ||
10416
- !cpumask_test_cpu(i, nohz.idle_cpus_mask))
10417
- continue;
10418
-
10479
+ /*
10480
+ * When ASYM_PACKING; see if there's a more preferred CPU
10481
+ * currently idle; in which case, kick the ILB to move tasks
10482
+ * around.
10483
+ */
10484
+ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
1041910485 if (sched_asym_prefer(i, cpu)) {
1042010486 flags = NOHZ_KICK_MASK;
1042110487 goto unlock;
1042210488 }
10489
+ }
10490
+ }
10491
+
10492
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
10493
+ if (sd) {
10494
+ /*
10495
+ * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
10496
+ * to run the misfit task on.
10497
+ */
10498
+ if (check_misfit_status(rq, sd)) {
10499
+ flags = NOHZ_KICK_MASK;
10500
+ goto unlock;
10501
+ }
10502
+
10503
+ /*
10504
+ * For asymmetric systems, we do not want to nicely balance
10505
+ * cache use, instead we want to embrace asymmetry and only
10506
+ * ensure tasks have enough CPU capacity.
10507
+ *
10508
+ * Skip the LLC logic because it's not relevant in that case.
10509
+ */
10510
+ goto unlock;
10511
+ }
10512
+
10513
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10514
+ if (sds) {
10515
+ /*
10516
+ * If there is an imbalance between LLC domains (IOW we could
10517
+ * increase the overall cache use), we need some less-loaded LLC
10518
+ * domain to pull some load. Likewise, we may need to spread
10519
+ * load within the current LLC domain (e.g. packed SMT cores but
10520
+ * other CPUs are idle). We can't really know from here how busy
10521
+ * the others are - so just get a nohz balance going if it looks
10522
+ * like this LLC domain has tasks we could move.
10523
+ */
10524
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
10525
+ if (nr_busy > 1) {
10526
+ flags = NOHZ_KICK_MASK;
10527
+ goto unlock;
1042310528 }
1042410529 }
1042510530 unlock:
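
Condensed view of the kick conditions spelled out in the comments above; the struct and helper below are placeholders used only to summarise the decision order, not kernel interfaces:

#include <stdbool.h>

struct kick_inputs {
	unsigned int nr_running;	/* runnable tasks on this rq */
	bool has_cfs_task;		/* rq->cfs.h_nr_running >= 1 */
	bool capacity_reduced;		/* RT/IRQ pressure shrank this CPU */
	bool asym_packing, preferred_cpu_idle;
	bool asym_capacity, misfit_task;
	int llc_busy_cpus;		/* nr_busy_cpus of the LLC shared domain */
};

static bool want_nohz_kick(const struct kick_inputs *in)
{
	if (in->nr_running >= 2)
		return true;			/* more work than one CPU can run */
	if (in->has_cfs_task && in->capacity_reduced)
		return true;			/* a less pressured CPU may run it better */
	if (in->asym_packing && in->preferred_cpu_idle)
		return true;			/* pack onto the higher priority CPU */
	if (in->asym_capacity)
		return in->misfit_task;		/* skip the LLC logic entirely */
	return in->llc_busy_cpus > 1;		/* spread load between/within LLCs */
}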
....@@ -10485,9 +10590,20 @@
1048510590
1048610591 SCHED_WARN_ON(cpu != smp_processor_id());
1048710592
10488
- /* If this CPU is going down, then nothing needs to be done: */
10489
- if (!cpu_active(cpu))
10593
+ if (!cpu_active(cpu)) {
10594
+ /*
10595
+ * A CPU can be paused while it is idle with its tick
10596
+ * stopped. nohz_balance_exit_idle() should be called
10597
+ * from the local CPU, so it can't be called during
10598
+ * pause. This results in paused CPU participating in
10599
+ * the nohz idle balance, which should be avoided.
10600
+ *
10601
+ * When the paused CPU exits idle and enters again,
10602
+ * exempt the paused CPU from nohz_balance_exit_idle.
10603
+ */
10604
+ nohz_balance_exit_idle(rq);
1049010605 return;
10606
+ }
1049110607
1049210608 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
1049310609 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
....@@ -10600,7 +10716,6 @@
1060010716
1060110717 rq_lock_irqsave(rq, &rf);
1060210718 update_rq_clock(rq);
10603
- cpu_load_update_idle(rq);
1060410719 rq_unlock_irqrestore(rq, &rf);
1060510720
1060610721 if (flags & NOHZ_BALANCE_KICK)
....@@ -10650,22 +10765,14 @@
1065010765 */
1065110766 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
1065210767 {
10653
- int this_cpu = this_rq->cpu;
10654
- unsigned int flags;
10768
+ unsigned int flags = this_rq->nohz_idle_balance;
1065510769
10656
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
10770
+ if (!flags)
1065710771 return false;
1065810772
10659
- if (idle != CPU_IDLE) {
10660
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
10661
- return false;
10662
- }
10773
+ this_rq->nohz_idle_balance = 0;
1066310774
10664
- /*
10665
- * barrier, pairs with nohz_balance_enter_idle(), ensures ...
10666
- */
10667
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
10668
- if (!(flags & NOHZ_KICK_MASK))
10775
+ if (idle != CPU_IDLE)
1066910776 return false;
1067010777
1067110778 _nohz_idle_balance(this_rq, flags, idle);
....@@ -10719,15 +10826,26 @@
1071910826 /*
1072010827 * idle_balance is called by schedule() if this_cpu is about to become
1072110828 * idle. Attempts to pull tasks from other CPUs.
10829
+ *
10830
+ * Returns:
10831
+ * < 0 - we released the lock and there are !fair tasks present
10832
+ * 0 - failed, no new tasks
10833
+ * > 0 - success, new (fair) tasks present
1072210834 */
10723
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
10835
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
1072410836 {
1072510837 unsigned long next_balance = jiffies + HZ;
1072610838 int this_cpu = this_rq->cpu;
1072710839 struct sched_domain *sd;
1072810840 int pulled_task = 0;
1072910841 u64 curr_cost = 0;
10842
+ int done = 0;
1073010843
10844
+ trace_android_rvh_sched_newidle_balance(this_rq, rf, &pulled_task, &done);
10845
+ if (done)
10846
+ return pulled_task;
10847
+
10848
+ update_misfit_status(NULL, this_rq);
1073110849 /*
1073210850 * We must set idle_stamp _before_ calling idle_balance(), such that we
1073310851 * measure the duration of idle_balance() as idle time.
....@@ -10769,9 +10887,6 @@
1076910887 for_each_domain(this_cpu, sd) {
1077010888 int continue_balancing = 1;
1077110889 u64 t0, domain_cost;
10772
-
10773
- if (!(sd->flags & SD_LOAD_BALANCE))
10774
- continue;
1077510890
1077610891 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
1077710892 update_next_balance(sd, &next_balance);
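
For reference, a sketch of how a caller is expected to interpret the newidle_balance() return values documented above; the enum is illustrative, while the real pick_next_task path retries the class walk on a negative result and re-runs the CFS pick on a positive one:

enum newidle_outcome { RETRY_CLASS_WALK, PICK_CFS_AGAIN, GO_IDLE };

/* new_tasks is the value returned by newidle_balance(). */
static enum newidle_outcome interpret_newidle(int new_tasks)
{
	if (new_tasks < 0)
		return RETRY_CLASS_WALK;	/* rq->lock was dropped and !fair tasks appeared */
	if (new_tasks > 0)
		return PICK_CFS_AGAIN;		/* fair tasks were pulled: pick again */
	return GO_IDLE;				/* nothing pulled */
}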
....@@ -10962,6 +11077,9 @@
1096211077 if (!task_on_rq_queued(p))
1096311078 return;
1096411079
11080
+ if (rq->cfs.nr_running == 1)
11081
+ return;
11082
+
1096511083 /*
1096611084 * Reschedule if we are currently running on this runqueue and
1096711085 * our priority decreased, or if we are not currently running on
....@@ -11040,7 +11158,7 @@
1104011158 /* Catch up with the cfs_rq and remove our load when we leave */
1104111159 update_load_avg(cfs_rq, se, 0);
1104211160 detach_entity_load_avg(cfs_rq, se);
11043
- update_tg_load_avg(cfs_rq, false);
11161
+ update_tg_load_avg(cfs_rq);
1104411162 propagate_entity_cfs_rq(se);
1104511163 }
1104611164
....@@ -11058,8 +11176,8 @@
1105811176
1105911177 /* Synchronize entity with its cfs_rq */
1106011178 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
11061
- attach_entity_load_avg(cfs_rq, se, 0);
11062
- update_tg_load_avg(cfs_rq, false);
11179
+ attach_entity_load_avg(cfs_rq, se);
11180
+ update_tg_load_avg(cfs_rq);
1106311181 propagate_entity_cfs_rq(se);
1106411182 }
1106511183
....@@ -11118,9 +11236,19 @@
1111811236 * This routine is mostly called to set cfs_rq->curr field when a task
1111911237 * migrates between groups/classes.
1112011238 */
11121
-static void set_curr_task_fair(struct rq *rq)
11239
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
1112211240 {
11123
- struct sched_entity *se = &rq->curr->se;
11241
+ struct sched_entity *se = &p->se;
11242
+
11243
+#ifdef CONFIG_SMP
11244
+ if (task_on_rq_queued(p)) {
11245
+ /*
11246
+ * Move the next running task to the front of the list, so our
11247
+ * cfs_tasks list becomes MRU one.
11248
+ */
11249
+ list_move(&se->group_node, &rq->cfs_tasks);
11250
+ }
11251
+#endif
1112411252
1112511253 for_each_sched_entity(se) {
1112611254 struct cfs_rq *cfs_rq = cfs_rq_of(se);
....@@ -11381,8 +11509,8 @@
1138111509 /*
1138211510 * All the scheduling class methods:
1138311511 */
11384
-const struct sched_class fair_sched_class = {
11385
- .next = &idle_sched_class,
11512
+const struct sched_class fair_sched_class
11513
+ __section("__fair_sched_class") = {
1138611514 .enqueue_task = enqueue_task_fair,
1138711515 .dequeue_task = dequeue_task_fair,
1138811516 .yield_task = yield_task_fair,
....@@ -11390,10 +11518,12 @@
1139011518
1139111519 .check_preempt_curr = check_preempt_wakeup,
1139211520
11393
- .pick_next_task = pick_next_task_fair,
11521
+ .pick_next_task = __pick_next_task_fair,
1139411522 .put_prev_task = put_prev_task_fair,
11523
+ .set_next_task = set_next_task_fair,
1139511524
1139611525 #ifdef CONFIG_SMP
11526
+ .balance = balance_fair,
1139711527 .select_task_rq = select_task_rq_fair,
1139811528 .migrate_task_rq = migrate_task_rq_fair,
1139911529
....@@ -11404,7 +11534,6 @@
1140411534 .set_cpus_allowed = set_cpus_allowed_common,
1140511535 #endif
1140611536
11407
- .set_curr_task = set_curr_task_fair,
1140811537 .task_tick = task_tick_fair,
1140911538 .task_fork = task_fork_fair,
1141011539
....@@ -11474,3 +11603,101 @@
1147411603 #endif /* SMP */
1147511604
1147611605 }
11606
+
11607
+/*
11608
+ * Helper functions to facilitate extracting info from tracepoints.
11609
+ */
11610
+
11611
+const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
11612
+{
11613
+#ifdef CONFIG_SMP
11614
+ return cfs_rq ? &cfs_rq->avg : NULL;
11615
+#else
11616
+ return NULL;
11617
+#endif
11618
+}
11619
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
11620
+
11621
+char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
11622
+{
11623
+ if (!cfs_rq) {
11624
+ if (str)
11625
+ strlcpy(str, "(null)", len);
11626
+ else
11627
+ return NULL;
11628
+ }
11629
+
11630
+ cfs_rq_tg_path(cfs_rq, str, len);
11631
+ return str;
11632
+}
11633
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
11634
+
11635
+int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
11636
+{
11637
+ return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
11638
+}
11639
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
11640
+
11641
+const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
11642
+{
11643
+#ifdef CONFIG_SMP
11644
+ return rq ? &rq->avg_rt : NULL;
11645
+#else
11646
+ return NULL;
11647
+#endif
11648
+}
11649
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
11650
+
11651
+const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
11652
+{
11653
+#ifdef CONFIG_SMP
11654
+ return rq ? &rq->avg_dl : NULL;
11655
+#else
11656
+ return NULL;
11657
+#endif
11658
+}
11659
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
11660
+
11661
+const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
11662
+{
11663
+#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
11664
+ return rq ? &rq->avg_irq : NULL;
11665
+#else
11666
+ return NULL;
11667
+#endif
11668
+}
11669
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
11670
+
11671
+int sched_trace_rq_cpu(struct rq *rq)
11672
+{
11673
+ return rq ? cpu_of(rq) : -1;
11674
+}
11675
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
11676
+
11677
+int sched_trace_rq_cpu_capacity(struct rq *rq)
11678
+{
11679
+ return rq ?
11680
+#ifdef CONFIG_SMP
11681
+ rq->cpu_capacity
11682
+#else
11683
+ SCHED_CAPACITY_SCALE
11684
+#endif
11685
+ : -1;
11686
+}
11687
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
11688
+
11689
+const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
11690
+{
11691
+#ifdef CONFIG_SMP
11692
+ return rd ? rd->span : NULL;
11693
+#else
11694
+ return NULL;
11695
+#endif
11696
+}
11697
+EXPORT_SYMBOL_GPL(sched_trace_rd_span);
11698
+
11699
+int sched_trace_rq_nr_running(struct rq *rq)
11700
+{
11701
+ return rq ? rq->nr_running : -1;
11702
+}
11703
+EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
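
These helpers exist so that modules (for example Android vendor hooks or tracing modules) can read PELT state from the bare scheduler tracepoints without reaching into private scheduler structures. A minimal sketch of such a consumer, assuming the upstream pelt_cfs_tp tracepoint and its (void *data, struct cfs_rq *cfs_rq) probe signature; verify both against the kernel actually being built:

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/tracepoint.h>
#include <trace/events/sched.h>

static void probe_pelt_cfs(void *data, struct cfs_rq *cfs_rq)
{
	const struct sched_avg *avg = sched_trace_cfs_rq_avg(cfs_rq);
	char path[64];

	if (!avg)
		return;

	sched_trace_cfs_rq_path(cfs_rq, path, sizeof(path));
	pr_debug("cpu%d %s util_avg=%lu\n",
		 sched_trace_cfs_rq_cpu(cfs_rq), path, avg->util_avg);
}

static int __init pelt_probe_init(void)
{
	return register_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
}

static void __exit pelt_probe_exit(void)
{
	unregister_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
	tracepoint_synchronize_unregister();
}

module_init(pelt_probe_init);
module_exit(pelt_probe_exit);
MODULE_LICENSE("GPL");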