2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/kernel/sched/fair.c
....@@ -20,12 +20,11 @@
2020 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
2121 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
2222 */
23
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
24
-#include <linux/cpufreq.h>
25
-#endif
2623 #include "sched.h"
2724
28
-#include <trace/events/sched.h>
25
+#include <trace/hooks/sched.h>
26
+
27
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_runtime);
2928
3029 /*
3130 * Targeted preemption latency for CPU-bound tasks:
....@@ -41,17 +40,8 @@
4140 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
4241 */
4342 unsigned int sysctl_sched_latency = 6000000ULL;
44
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
45
-
46
-/*
47
- * Enable/disable honoring sync flag in energy-aware wakeups.
48
- */
49
-unsigned int sysctl_sched_sync_hint_enable = 1;
50
-
51
-/*
52
- * Enable/disable using cstate knowledge in idle sibling selection
53
- */
54
-unsigned int sysctl_sched_cstate_aware = 1;
43
+EXPORT_SYMBOL_GPL(sysctl_sched_latency);
44
+static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
5545
5646 /*
5747 * The initial- and re-scaling of tunables is configurable
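
[Editor's note] The 6 ms value above is the boot-time normalized default; update_sysctl() further down in this file multiplies the normalized_sysctl_* values by a factor derived from the online CPU count. The userspace sketch below shows that arithmetic, assuming the default logarithmic scaling mode and the cap of 8 CPUs used for this calculation (both taken from my reading of get_update_sysctl_factor(), not from this hunk):

/*
 * Userspace sketch of how the normalized scheduler tunables are scaled
 * at boot.  The 8-CPU cap and the log2 rule are assumptions based on
 * get_update_sysctl_factor(); the names below are illustrative only.
 */
#include <stdio.h>

static unsigned int ilog2_u(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned long long normalized_latency = 6000000ULL;	/* 6 ms */
	unsigned int ncpus[] = { 1, 2, 4, 8, 64 };

	for (unsigned int i = 0; i < sizeof(ncpus) / sizeof(ncpus[0]); i++) {
		unsigned int cpus = ncpus[i] < 8 ? ncpus[i] : 8;
		unsigned int factor = 1 + ilog2_u(cpus);	/* SCHED_TUNABLESCALING_LOG */

		printf("%2u CPUs -> factor %u -> sched_latency %llu ns\n",
		       ncpus[i], factor, normalized_latency * factor);
	}
	return 0;
}

So a single-CPU system keeps the 6 ms default, while anything with 8 or more CPUs ends up at 24 ms.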
....@@ -71,8 +61,9 @@
7161 *
7262 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
7363 */
74
-unsigned int sysctl_sched_min_granularity = 750000ULL;
75
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
64
+unsigned int sysctl_sched_min_granularity = 750000ULL;
65
+EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity);
66
+static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
7667
7768 /*
7869 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
....@@ -94,10 +85,23 @@
9485 *
9586 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
9687 */
97
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
98
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
88
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
89
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
9990
10091 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
92
+
93
+int sched_thermal_decay_shift;
94
+static int __init setup_sched_thermal_decay_shift(char *str)
95
+{
96
+ int _shift = 0;
97
+
98
+ if (kstrtoint(str, 0, &_shift))
99
+ pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
100
+
101
+ sched_thermal_decay_shift = clamp(_shift, 0, 10);
102
+ return 1;
103
+}
104
+__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
101105
102106 #ifdef CONFIG_SMP
103107 /*
....@@ -107,6 +111,14 @@
107111 {
108112 return -cpu;
109113 }
114
+
115
+/*
116
+ * The margin used when comparing utilization with CPU capacity.
117
+ *
118
+ * (default: ~20%)
119
+ */
120
+#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
121
+
110122 #endif
111123
112124 #ifdef CONFIG_CFS_BANDWIDTH
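
[Editor's note] For context on the fits_capacity() macro introduced in this hunk: utilization only "fits" a CPU when it stays below roughly 80% of the capacity (1024/1280), which is the ~20% margin the comment mentions. A standalone check of that arithmetic (userspace, not kernel code):

/* Standalone check of the ~20% margin encoded by fits_capacity() above. */
#include <assert.h>
#include <stdio.h>

#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)

int main(void)
{
	/* 800/1024 (~78% utilization) still fits a capacity-1024 CPU ... */
	assert(fits_capacity(800UL, 1024UL));
	/* ... but 820/1024 (~80%) no longer does: 820*1280 = 1049600 > 1048576. */
	assert(!fits_capacity(820UL, 1024UL));
	/* On a little CPU of capacity 430 the cutoff is around 344 (~80%). */
	assert(fits_capacity(343UL, 430UL));
	assert(!fits_capacity(344UL, 430UL));

	printf("fits_capacity margin checks passed\n");
	return 0;
}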
....@@ -122,18 +134,6 @@
122134 */
123135 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
124136 #endif
125
-
126
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
127
-unsigned int sysctl_sched_performance_bias = 1;
128
-#endif
129
-
130
-/*
131
- * The margin used when comparing utilization with CPU capacity:
132
- * util * margin < capacity * 1024
133
- *
134
- * (default: ~20%)
135
- */
136
-unsigned int capacity_margin = 1280;
137137
138138 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
139139 {
....@@ -195,7 +195,7 @@
195195 #undef SET_SYSCTL
196196 }
197197
198
-void sched_init_granularity(void)
198
+void __init sched_init_granularity(void)
199199 {
200200 update_sysctl();
201201 }
....@@ -246,8 +246,7 @@
246246 }
247247 }
248248
249
- /* hint to use a 32x32->64 mul */
250
- fact = (u64)(u32)fact * lw->inv_weight;
249
+ fact = mul_u32_u32(fact, lw->inv_weight);
251250
252251 while (fact >> 32) {
253252 fact >>= 1;
....@@ -290,6 +289,19 @@
290289 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
291290 {
292291 return grp->my_q;
292
+}
293
+
294
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
295
+{
296
+ if (!path)
297
+ return;
298
+
299
+ if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
300
+ autogroup_path(cfs_rq->tg, path, len);
301
+ else if (cfs_rq && cfs_rq->tg->css.cgroup)
302
+ cgroup_path(cfs_rq->tg->css.cgroup, path, len);
303
+ else
304
+ strlcpy(path, "(null)", len);
293305 }
294306
295307 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
....@@ -466,6 +478,12 @@
466478 return NULL;
467479 }
468480
481
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
482
+{
483
+ if (path)
484
+ strlcpy(path, "(null)", len);
485
+}
486
+
469487 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
470488 {
471489 return true;
....@@ -567,6 +585,7 @@
567585 struct sched_entity *entry;
568586 bool leftmost = true;
569587
588
+ trace_android_rvh_enqueue_entity(cfs_rq, se);
570589 /*
571590 * Find the right place in the rbtree:
572591 */
....@@ -592,6 +611,7 @@
592611
593612 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
594613 {
614
+ trace_android_rvh_dequeue_entity(cfs_rq, se);
595615 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
596616 }
597617
....@@ -631,8 +651,7 @@
631651 */
632652
633653 int sched_proc_update_handler(struct ctl_table *table, int write,
634
- void __user *buffer, size_t *lenp,
635
- loff_t *ppos)
654
+ void *buffer, size_t *lenp, loff_t *ppos)
636655 {
637656 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
638657 unsigned int factor = get_update_sysctl_factor();
....@@ -689,7 +708,13 @@
689708 */
690709 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
691710 {
692
- u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
711
+ unsigned int nr_running = cfs_rq->nr_running;
712
+ u64 slice;
713
+
714
+ if (sched_feat(ALT_PERIOD))
715
+ nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
716
+
717
+ slice = __sched_period(nr_running + !se->on_rq);
693718
694719 for_each_sched_entity(se) {
695720 struct load_weight *load;
....@@ -706,6 +731,10 @@
706731 }
707732 slice = __calc_delta(slice, se->load.weight, load);
708733 }
734
+
735
+ if (sched_feat(BASE_SLICE))
736
+ slice = max(slice, (u64)sysctl_sched_min_granularity);
737
+
709738 return slice;
710739 }
711740
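
[Editor's note] To make the slice arithmetic above concrete: for equally weighted tasks the slice is __sched_period(nr_running) / nr_running, the new ALT_PERIOD feature bases the count on the root cfs_rq's h_nr_running instead of the local cfs_rq, and BASE_SLICE floors the result at sysctl_sched_min_granularity. The userspace model below uses the default tunables from this file and assumes the usual sched_nr_latency threshold of 8 (latency / min_granularity), which this hunk does not spell out:

/*
 * Userspace model of period/slice for equally weighted tasks, using the
 * default tunables above.  sched_nr_latency = 8 is an assumption.
 */
#include <stdio.h>

static const unsigned long long latency = 6000000ULL;		/* 6 ms    */
static const unsigned long long min_gran = 750000ULL;		/* 0.75 ms */
static const unsigned int nr_latency = 8;

static unsigned long long period(unsigned int nr_running)
{
	if (nr_running > nr_latency)
		return nr_running * min_gran;
	return latency;
}

int main(void)
{
	unsigned int nr[] = { 1, 4, 8, 16 };

	for (unsigned int i = 0; i < 4; i++) {
		unsigned long long slice = period(nr[i]) / nr[i];

		/* BASE_SLICE additionally floors the slice at min_gran. */
		if (slice < min_gran)
			slice = min_gran;
		printf("%2u tasks: period %llu ns, slice %llu ns\n",
		       nr[i], period(nr[i]), slice);
	}
	return 0;
}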
....@@ -734,26 +763,17 @@
734763 memset(sa, 0, sizeof(*sa));
735764
736765 /*
737
- * Tasks are intialized with full load to be seen as heavy tasks until
766
+ * Tasks are initialized with full load to be seen as heavy tasks until
738767 * they get a chance to stabilize to their real load level.
739
- * Group entities are intialized with zero load to reflect the fact that
768
+ * Group entities are initialized with zero load to reflect the fact that
740769 * nothing has been attached to the task group yet.
741770 */
742771 if (entity_is_task(se))
743
- sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
772
+ sa->load_avg = scale_load_down(se->load.weight);
744773
745
- se->runnable_weight = se->load.weight;
746
-
747
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
748
- if (sysctl_sched_performance_bias) {
749
- sa->util_avg = SCHED_CAPACITY_SCALE >> 1;
750
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
751
- }
752
-#endif
753774 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
754775 }
755776
756
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
757777 static void attach_entity_cfs_rq(struct sched_entity *se);
758778
759779 /*
....@@ -782,18 +802,15 @@
782802 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
783803 * if util_avg > util_avg_cap.
784804 */
785
-void post_init_entity_util_avg(struct sched_entity *se)
805
+void post_init_entity_util_avg(struct task_struct *p)
786806 {
807
+ struct sched_entity *se = &p->se;
787808 struct cfs_rq *cfs_rq = cfs_rq_of(se);
788809 struct sched_avg *sa = &se->avg;
789
- long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
810
+ long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
790811 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
791812
792
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
793
- if (!sysctl_sched_performance_bias && (cap > 0)) {
794
-#else
795813 if (cap > 0) {
796
-#endif
797814 if (cfs_rq->avg.util_avg != 0) {
798815 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
799816 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
....@@ -805,24 +822,25 @@
805822 }
806823 }
807824
808
- if (entity_is_task(se)) {
809
- struct task_struct *p = task_of(se);
810
- if (p->sched_class != &fair_sched_class) {
811
- /*
812
- * For !fair tasks do:
813
- *
814
- update_cfs_rq_load_avg(now, cfs_rq);
815
- attach_entity_load_avg(cfs_rq, se, 0);
816
- switched_from_fair(rq, p);
817
- *
818
- * such that the next switched_to_fair() has the
819
- * expected state.
820
- */
821
- se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
822
- return;
823
- }
825
+ sa->runnable_avg = sa->util_avg;
826
+
827
+ if (p->sched_class != &fair_sched_class) {
828
+ /*
829
+ * For !fair tasks do:
830
+ *
831
+ update_cfs_rq_load_avg(now, cfs_rq);
832
+ attach_entity_load_avg(cfs_rq, se);
833
+ switched_from_fair(rq, p);
834
+ *
835
+ * such that the next switched_to_fair() has the
836
+ * expected state.
837
+ */
838
+ se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
839
+ return;
824840 }
825841
842
+ /* Hook before this se's util is attached to cfs_rq's util */
843
+ trace_android_rvh_post_init_entity_util_avg(se);
826844 attach_entity_cfs_rq(se);
827845 }
828846
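
[Editor's note] A userspace model of the new-task seeding performed above: util_avg starts at half of the CPU's spare capacity, or at a share of the cfs_rq's current util_avg proportional to the task's weight, whichever is smaller. The clamp and the else branch follow my reading of the function body that this hunk partly elides, and weight/load_avg are treated in one common scale for simplicity, so take the details as an approximation:

/*
 * Userspace model of post_init_entity_util_avg() seeding.  Numbers are
 * illustrative; 1024 is the usual SCHED_CAPACITY_SCALE of a big CPU.
 */
#include <stdio.h>

static long post_init_util(long cpu_scale, long cfs_util_avg,
			   long cfs_load_avg, long se_weight)
{
	long cap = (cpu_scale - cfs_util_avg) / 2;
	long util;

	if (cap <= 0)
		return 0;	/* CPU already looks fully utilized */

	if (cfs_util_avg != 0) {
		util = cfs_util_avg * se_weight / (cfs_load_avg + 1);
		if (util > cap)
			util = cap;	/* assumed clamp, elided by the hunk */
	} else {
		util = cap;
	}
	return util;
}

int main(void)
{
	/* Idle CPU: a new task starts at half the remaining capacity (512). */
	printf("idle cpu: %ld\n", post_init_util(1024, 0, 0, 1024));
	/* Busy CPU (util 600, load 2048): the estimate is scaled, then capped at 212. */
	printf("busy cpu: %ld\n", post_init_util(1024, 600, 2048, 1024));
	return 0;
}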
....@@ -830,10 +848,10 @@
830848 void init_entity_runnable_average(struct sched_entity *se)
831849 {
832850 }
833
-void post_init_entity_util_avg(struct sched_entity *se)
851
+void post_init_entity_util_avg(struct task_struct *p)
834852 {
835853 }
836
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
854
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
837855 {
838856 }
839857 #endif /* CONFIG_SMP */
....@@ -983,7 +1001,6 @@
9831001 }
9841002
9851003 trace_sched_stat_blocked(tsk, delta);
986
- trace_sched_blocked_reason(tsk);
9871004
9881005 /*
9891006 * Blocking time is in units of nanosecs, so shift by
....@@ -1078,7 +1095,7 @@
10781095 unsigned int sysctl_numa_balancing_scan_delay = 1000;
10791096
10801097 struct numa_group {
1081
- atomic_t refcount;
1098
+ refcount_t refcount;
10821099
10831100 spinlock_t lock; /* nr_tasks, tasks */
10841101 int nr_tasks;
....@@ -1094,7 +1111,7 @@
10941111 * more by CPU use than by memory faults.
10951112 */
10961113 unsigned long *faults_cpu;
1097
- unsigned long faults[0];
1114
+ unsigned long faults[];
10981115 };
10991116
11001117 /*
....@@ -1164,7 +1181,7 @@
11641181 unsigned long shared = group_faults_shared(ng);
11651182 unsigned long private = group_faults_priv(ng);
11661183
1167
- period *= atomic_read(&ng->refcount);
1184
+ period *= refcount_read(&ng->refcount);
11681185 period *= shared + 1;
11691186 period /= private + shared + 1;
11701187 }
....@@ -1189,7 +1206,7 @@
11891206 unsigned long private = group_faults_priv(ng);
11901207 unsigned long period = smax;
11911208
1192
- period *= atomic_read(&ng->refcount);
1209
+ period *= refcount_read(&ng->refcount);
11931210 period *= shared + 1;
11941211 period /= private + shared + 1;
11951212
....@@ -1199,56 +1216,15 @@
11991216 return max(smin, smax);
12001217 }
12011218
1202
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
1203
-{
1204
- int mm_users = 0;
1205
- struct mm_struct *mm = p->mm;
1206
-
1207
- if (mm) {
1208
- mm_users = atomic_read(&mm->mm_users);
1209
- if (mm_users == 1) {
1210
- mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1211
- mm->numa_scan_seq = 0;
1212
- }
1213
- }
1214
- p->node_stamp = 0;
1215
- p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
1216
- p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1217
- p->numa_work.next = &p->numa_work;
1218
- p->numa_faults = NULL;
1219
- RCU_INIT_POINTER(p->numa_group, NULL);
1220
- p->last_task_numa_placement = 0;
1221
- p->last_sum_exec_runtime = 0;
1222
-
1223
- /* New address space, reset the preferred nid */
1224
- if (!(clone_flags & CLONE_VM)) {
1225
- p->numa_preferred_nid = -1;
1226
- return;
1227
- }
1228
-
1229
- /*
1230
- * New thread, keep existing numa_preferred_nid which should be copied
1231
- * already by arch_dup_task_struct but stagger when scans start.
1232
- */
1233
- if (mm) {
1234
- unsigned int delay;
1235
-
1236
- delay = min_t(unsigned int, task_scan_max(current),
1237
- current->numa_scan_period * mm_users * NSEC_PER_MSEC);
1238
- delay += 2 * TICK_NSEC;
1239
- p->node_stamp = delay;
1240
- }
1241
-}
1242
-
12431219 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
12441220 {
1245
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
1221
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
12461222 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
12471223 }
12481224
12491225 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
12501226 {
1251
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1227
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
12521228 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
12531229 }
12541230
....@@ -1474,7 +1450,7 @@
14741450 * two full passes of the "multi-stage node selection" test that is
14751451 * executed below.
14761452 */
1477
- if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
1453
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
14781454 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
14791455 return true;
14801456
....@@ -1527,55 +1503,52 @@
15271503 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
15281504 }
15291505
1530
-static unsigned long weighted_cpuload(struct rq *rq);
1531
-static unsigned long source_load(int cpu, int type);
1532
-static unsigned long target_load(int cpu, int type);
1506
+/*
1507
+ * 'numa_type' describes the node at the moment of load balancing.
1508
+ */
1509
+enum numa_type {
1510
+ /* The node has spare capacity that can be used to run more tasks. */
1511
+ node_has_spare = 0,
1512
+ /*
1513
+ * The node is fully used and the tasks don't compete for more CPU
1514
+ * cycles. Nevertheless, some tasks might wait before running.
1515
+ */
1516
+ node_fully_busy,
1517
+ /*
1518
+ * The node is overloaded and can't provide expected CPU cycles to all
1519
+ * tasks.
1520
+ */
1521
+ node_overloaded
1522
+};
15331523
15341524 /* Cached statistics for all CPUs within a node */
15351525 struct numa_stats {
15361526 unsigned long load;
1537
-
1527
+ unsigned long runnable;
1528
+ unsigned long util;
15381529 /* Total compute capacity of CPUs on a node */
15391530 unsigned long compute_capacity;
1540
-
15411531 unsigned int nr_running;
1532
+ unsigned int weight;
1533
+ enum numa_type node_type;
1534
+ int idle_cpu;
15421535 };
15431536
1544
-/*
1545
- * XXX borrowed from update_sg_lb_stats
1546
- */
1547
-static void update_numa_stats(struct numa_stats *ns, int nid)
1537
+static inline bool is_core_idle(int cpu)
15481538 {
1549
- int smt, cpu, cpus = 0;
1550
- unsigned long capacity;
1539
+#ifdef CONFIG_SCHED_SMT
1540
+ int sibling;
15511541
1552
- memset(ns, 0, sizeof(*ns));
1553
- for_each_cpu(cpu, cpumask_of_node(nid)) {
1554
- struct rq *rq = cpu_rq(cpu);
1542
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1543
+ if (cpu == sibling)
1544
+ continue;
15551545
1556
- ns->nr_running += rq->nr_running;
1557
- ns->load += weighted_cpuload(rq);
1558
- ns->compute_capacity += capacity_of(cpu);
1559
-
1560
- cpus++;
1546
+ if (!idle_cpu(sibling))
1547
+ return false;
15611548 }
1549
+#endif
15621550
1563
- /*
1564
- * If we raced with hotplug and there are no CPUs left in our mask
1565
- * the @ns structure is NULL'ed and task_numa_compare() will
1566
- * not find this node attractive.
1567
- *
1568
- * We'll detect a huge imbalance and bail there.
1569
- */
1570
- if (!cpus)
1571
- return;
1572
-
1573
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1574
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1575
- capacity = cpus / smt; /* cores */
1576
-
1577
- capacity = min_t(unsigned, capacity,
1578
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1551
+ return true;
15791552 }
15801553
15811554 struct task_numa_env {
....@@ -1594,20 +1567,132 @@
15941567 int best_cpu;
15951568 };
15961569
1570
+static unsigned long cpu_load(struct rq *rq);
1571
+static unsigned long cpu_runnable(struct rq *rq);
1572
+static unsigned long cpu_util(int cpu);
1573
+static inline long adjust_numa_imbalance(int imbalance, int nr_running);
1574
+
1575
+static inline enum
1576
+numa_type numa_classify(unsigned int imbalance_pct,
1577
+ struct numa_stats *ns)
1578
+{
1579
+ if ((ns->nr_running > ns->weight) &&
1580
+ (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1581
+ ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
1582
+ return node_overloaded;
1583
+
1584
+ if ((ns->nr_running < ns->weight) ||
1585
+ (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1586
+ ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
1587
+ return node_has_spare;
1588
+
1589
+ return node_fully_busy;
1590
+}
1591
+
1592
+#ifdef CONFIG_SCHED_SMT
1593
+/* Forward declarations of select_idle_sibling helpers */
1594
+static inline bool test_idle_cores(int cpu, bool def);
1595
+static inline int numa_idle_core(int idle_core, int cpu)
1596
+{
1597
+ if (!static_branch_likely(&sched_smt_present) ||
1598
+ idle_core >= 0 || !test_idle_cores(cpu, false))
1599
+ return idle_core;
1600
+
1601
+ /*
1602
+ * Prefer cores instead of packing HT siblings
1603
+ * and triggering future load balancing.
1604
+ */
1605
+ if (is_core_idle(cpu))
1606
+ idle_core = cpu;
1607
+
1608
+ return idle_core;
1609
+}
1610
+#else
1611
+static inline int numa_idle_core(int idle_core, int cpu)
1612
+{
1613
+ return idle_core;
1614
+}
1615
+#endif
1616
+
1617
+/*
1618
+ * Gather all necessary information to make NUMA balancing placement
1619
+ * decisions that are compatible with standard load balancer. This
1620
+ * borrows code and logic from update_sg_lb_stats but sharing a
1621
+ * common implementation is impractical.
1622
+ */
1623
+static void update_numa_stats(struct task_numa_env *env,
1624
+ struct numa_stats *ns, int nid,
1625
+ bool find_idle)
1626
+{
1627
+ int cpu, idle_core = -1;
1628
+
1629
+ memset(ns, 0, sizeof(*ns));
1630
+ ns->idle_cpu = -1;
1631
+
1632
+ rcu_read_lock();
1633
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
1634
+ struct rq *rq = cpu_rq(cpu);
1635
+
1636
+ ns->load += cpu_load(rq);
1637
+ ns->runnable += cpu_runnable(rq);
1638
+ ns->util += cpu_util(cpu);
1639
+ ns->nr_running += rq->cfs.h_nr_running;
1640
+ ns->compute_capacity += capacity_of(cpu);
1641
+
1642
+ if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
1643
+ if (READ_ONCE(rq->numa_migrate_on) ||
1644
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr))
1645
+ continue;
1646
+
1647
+ if (ns->idle_cpu == -1)
1648
+ ns->idle_cpu = cpu;
1649
+
1650
+ idle_core = numa_idle_core(idle_core, cpu);
1651
+ }
1652
+ }
1653
+ rcu_read_unlock();
1654
+
1655
+ ns->weight = cpumask_weight(cpumask_of_node(nid));
1656
+
1657
+ ns->node_type = numa_classify(env->imbalance_pct, ns);
1658
+
1659
+ if (idle_core >= 0)
1660
+ ns->idle_cpu = idle_core;
1661
+}
1662
+
15971663 static void task_numa_assign(struct task_numa_env *env,
15981664 struct task_struct *p, long imp)
15991665 {
16001666 struct rq *rq = cpu_rq(env->dst_cpu);
16011667
1602
- /* Bail out if run-queue part of active NUMA balance. */
1603
- if (xchg(&rq->numa_migrate_on, 1))
1604
- return;
1668
+ /* Check if run-queue part of active NUMA balance. */
1669
+ if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
1670
+ int cpu;
1671
+ int start = env->dst_cpu;
16051672
1673
+ /* Find alternative idle CPU. */
1674
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
1675
+ if (cpu == env->best_cpu || !idle_cpu(cpu) ||
1676
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
1677
+ continue;
1678
+ }
1679
+
1680
+ env->dst_cpu = cpu;
1681
+ rq = cpu_rq(env->dst_cpu);
1682
+ if (!xchg(&rq->numa_migrate_on, 1))
1683
+ goto assign;
1684
+ }
1685
+
1686
+ /* Failed to find an alternative idle CPU */
1687
+ return;
1688
+ }
1689
+
1690
+assign:
16061691 /*
16071692 * Clear previous best_cpu/rq numa-migrate flag, since task now
16081693 * found a better CPU to move/swap.
16091694 */
1610
- if (env->best_cpu != -1) {
1695
+ if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
16111696 rq = cpu_rq(env->best_cpu);
16121697 WRITE_ONCE(rq->numa_migrate_on, 0);
16131698 }
....@@ -1663,7 +1748,7 @@
16631748 * into account that it might be best if task running on the dst_cpu should
16641749 * be exchanged with the source task
16651750 */
1666
-static void task_numa_compare(struct task_numa_env *env,
1751
+static bool task_numa_compare(struct task_numa_env *env,
16671752 long taskimp, long groupimp, bool maymove)
16681753 {
16691754 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
....@@ -1674,12 +1759,13 @@
16741759 int dist = env->dist;
16751760 long moveimp = imp;
16761761 long load;
1762
+ bool stopsearch = false;
16771763
16781764 if (READ_ONCE(dst_rq->numa_migrate_on))
1679
- return;
1765
+ return false;
16801766
16811767 rcu_read_lock();
1682
- cur = task_rcu_dereference(&dst_rq->curr);
1768
+ cur = rcu_dereference(dst_rq->curr);
16831769 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
16841770 cur = NULL;
16851771
....@@ -1687,8 +1773,10 @@
16871773 * Because we have preemption enabled we can get migrated around and
16881774 * end try selecting ourselves (current == env->p) as a swap candidate.
16891775 */
1690
- if (cur == env->p)
1776
+ if (cur == env->p) {
1777
+ stopsearch = true;
16911778 goto unlock;
1779
+ }
16921780
16931781 if (!cur) {
16941782 if (maymove && moveimp >= env->best_imp)
....@@ -1697,18 +1785,27 @@
16971785 goto unlock;
16981786 }
16991787
1788
+ /* Skip this swap candidate if cannot move to the source cpu. */
1789
+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1790
+ goto unlock;
1791
+
1792
+ /*
1793
+ * Skip this swap candidate if it is not moving to its preferred
1794
+ * node and the best task is.
1795
+ */
1796
+ if (env->best_task &&
1797
+ env->best_task->numa_preferred_nid == env->src_nid &&
1798
+ cur->numa_preferred_nid != env->src_nid) {
1799
+ goto unlock;
1800
+ }
1801
+
17001802 /*
17011803 * "imp" is the fault differential for the source task between the
17021804 * source and destination node. Calculate the total differential for
17031805 * the source task and potential destination task. The more negative
17041806 * the value is, the more remote accesses that would be expected to
17051807 * be incurred if the tasks were swapped.
1706
- */
1707
- /* Skip this swap candidate if cannot move to the source cpu */
1708
- if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
1709
- goto unlock;
1710
-
1711
- /*
1808
+ *
17121809 * If dst and source tasks are in the same NUMA group, or not
17131810 * in any group then look only at task weights.
17141811 */
....@@ -1735,9 +1832,31 @@
17351832 task_weight(cur, env->dst_nid, dist);
17361833 }
17371834
1835
+ /* Discourage picking a task already on its preferred node */
1836
+ if (cur->numa_preferred_nid == env->dst_nid)
1837
+ imp -= imp / 16;
1838
+
1839
+ /*
1840
+ * Encourage picking a task that moves to its preferred node.
1841
+ * This potentially makes imp larger than it's maximum of
1842
+ * 1998 (see SMALLIMP and task_weight for why) but in this
1843
+ * case, it does not matter.
1844
+ */
1845
+ if (cur->numa_preferred_nid == env->src_nid)
1846
+ imp += imp / 8;
1847
+
17381848 if (maymove && moveimp > imp && moveimp > env->best_imp) {
17391849 imp = moveimp;
17401850 cur = NULL;
1851
+ goto assign;
1852
+ }
1853
+
1854
+ /*
1855
+ * Prefer swapping with a task moving to its preferred node over a
1856
+ * task that is not.
1857
+ */
1858
+ if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
1859
+ env->best_task->numa_preferred_nid != env->src_nid) {
17411860 goto assign;
17421861 }
17431862
....@@ -1764,50 +1883,104 @@
17641883 goto unlock;
17651884
17661885 assign:
1767
- /*
1768
- * One idle CPU per node is evaluated for a task numa move.
1769
- * Call select_idle_sibling to maybe find a better one.
1770
- */
1886
+ /* Evaluate an idle CPU for a task numa move. */
17711887 if (!cur) {
1888
+ int cpu = env->dst_stats.idle_cpu;
1889
+
1890
+ /* Nothing cached so current CPU went idle since the search. */
1891
+ if (cpu < 0)
1892
+ cpu = env->dst_cpu;
1893
+
17721894 /*
1773
- * select_idle_siblings() uses an per-CPU cpumask that
1774
- * can be used from IRQ context.
1895
+ * If the CPU is no longer truly idle and the previous best CPU
1896
+ * is, keep using it.
17751897 */
1776
- local_irq_disable();
1777
- env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1778
- env->dst_cpu);
1779
- local_irq_enable();
1898
+ if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
1899
+ idle_cpu(env->best_cpu)) {
1900
+ cpu = env->best_cpu;
1901
+ }
1902
+
1903
+ env->dst_cpu = cpu;
17801904 }
17811905
17821906 task_numa_assign(env, cur, imp);
1907
+
1908
+ /*
1909
+ * If a move to idle is allowed because there is capacity or load
1910
+ * balance improves then stop the search. While a better swap
1911
+ * candidate may exist, a search is not free.
1912
+ */
1913
+ if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
1914
+ stopsearch = true;
1915
+
1916
+ /*
1917
+ * If a swap candidate must be identified and the current best task
1918
+ * moves its preferred node then stop the search.
1919
+ */
1920
+ if (!maymove && env->best_task &&
1921
+ env->best_task->numa_preferred_nid == env->src_nid) {
1922
+ stopsearch = true;
1923
+ }
17831924 unlock:
17841925 rcu_read_unlock();
1926
+
1927
+ return stopsearch;
17851928 }
17861929
17871930 static void task_numa_find_cpu(struct task_numa_env *env,
17881931 long taskimp, long groupimp)
17891932 {
1790
- long src_load, dst_load, load;
17911933 bool maymove = false;
17921934 int cpu;
17931935
1794
- load = task_h_load(env->p);
1795
- dst_load = env->dst_stats.load + load;
1796
- src_load = env->src_stats.load - load;
1797
-
17981936 /*
1799
- * If the improvement from just moving env->p direction is better
1800
- * than swapping tasks around, check if a move is possible.
1937
+ * If dst node has spare capacity, then check if there is an
1938
+ * imbalance that would be overruled by the load balancer.
18011939 */
1802
- maymove = !load_too_imbalanced(src_load, dst_load, env);
1940
+ if (env->dst_stats.node_type == node_has_spare) {
1941
+ unsigned int imbalance;
1942
+ int src_running, dst_running;
1943
+
1944
+ /*
1945
+ * Would movement cause an imbalance? Note that if src has
1946
+ * more running tasks that the imbalance is ignored as the
1947
+ * move improves the imbalance from the perspective of the
1948
+ * CPU load balancer.
1949
+ * */
1950
+ src_running = env->src_stats.nr_running - 1;
1951
+ dst_running = env->dst_stats.nr_running + 1;
1952
+ imbalance = max(0, dst_running - src_running);
1953
+ imbalance = adjust_numa_imbalance(imbalance, dst_running);
1954
+
1955
+ /* Use idle CPU if there is no imbalance */
1956
+ if (!imbalance) {
1957
+ maymove = true;
1958
+ if (env->dst_stats.idle_cpu >= 0) {
1959
+ env->dst_cpu = env->dst_stats.idle_cpu;
1960
+ task_numa_assign(env, NULL, 0);
1961
+ return;
1962
+ }
1963
+ }
1964
+ } else {
1965
+ long src_load, dst_load, load;
1966
+ /*
1967
+ * If the improvement from just moving env->p direction is better
1968
+ * than swapping tasks around, check if a move is possible.
1969
+ */
1970
+ load = task_h_load(env->p);
1971
+ dst_load = env->dst_stats.load + load;
1972
+ src_load = env->src_stats.load - load;
1973
+ maymove = !load_too_imbalanced(src_load, dst_load, env);
1974
+ }
18031975
18041976 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
18051977 /* Skip this CPU if the source task cannot migrate */
1806
- if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
1978
+ if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
18071979 continue;
18081980
18091981 env->dst_cpu = cpu;
1810
- task_numa_compare(env, taskimp, groupimp, maymove);
1982
+ if (task_numa_compare(env, taskimp, groupimp, maymove))
1983
+ break;
18111984 }
18121985 }
18131986
....@@ -1861,10 +2034,10 @@
18612034 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
18622035 taskweight = task_weight(p, env.src_nid, dist);
18632036 groupweight = group_weight(p, env.src_nid, dist);
1864
- update_numa_stats(&env.src_stats, env.src_nid);
2037
+ update_numa_stats(&env, &env.src_stats, env.src_nid, false);
18652038 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
18662039 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1867
- update_numa_stats(&env.dst_stats, env.dst_nid);
2040
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
18682041
18692042 /* Try to find a spot on the preferred nid. */
18702043 task_numa_find_cpu(&env, taskimp, groupimp);
....@@ -1897,7 +2070,7 @@
18972070
18982071 env.dist = dist;
18992072 env.dst_nid = nid;
1900
- update_numa_stats(&env.dst_stats, env.dst_nid);
2073
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
19012074 task_numa_find_cpu(&env, taskimp, groupimp);
19022075 }
19032076 }
....@@ -1921,15 +2094,17 @@
19212094 }
19222095
19232096 /* No better CPU than the current one was found. */
1924
- if (env.best_cpu == -1)
2097
+ if (env.best_cpu == -1) {
2098
+ trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
19252099 return -EAGAIN;
2100
+ }
19262101
19272102 best_rq = cpu_rq(env.best_cpu);
19282103 if (env.best_task == NULL) {
19292104 ret = migrate_task_to(p, env.best_cpu);
19302105 WRITE_ONCE(best_rq->numa_migrate_on, 0);
19312106 if (ret != 0)
1932
- trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
2107
+ trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
19332108 return ret;
19342109 }
19352110
....@@ -1937,7 +2112,7 @@
19372112 WRITE_ONCE(best_rq->numa_migrate_on, 0);
19382113
19392114 if (ret != 0)
1940
- trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
2115
+ trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
19412116 put_task_struct(env.best_task);
19422117 return ret;
19432118 }
....@@ -1948,7 +2123,7 @@
19482123 unsigned long interval = HZ;
19492124
19502125 /* This task has no NUMA fault statistics yet */
1951
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
2126
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
19522127 return;
19532128
19542129 /* Periodically retry migrating the task to the preferred node */
....@@ -2199,7 +2374,7 @@
21992374
22002375 static void task_numa_placement(struct task_struct *p)
22012376 {
2202
- int seq, nid, max_nid = -1;
2377
+ int seq, nid, max_nid = NUMA_NO_NODE;
22032378 unsigned long max_faults = 0;
22042379 unsigned long fault_types[2] = { 0, 0 };
22052380 unsigned long total_faults;
....@@ -2309,12 +2484,12 @@
23092484
23102485 static inline int get_numa_group(struct numa_group *grp)
23112486 {
2312
- return atomic_inc_not_zero(&grp->refcount);
2487
+ return refcount_inc_not_zero(&grp->refcount);
23132488 }
23142489
23152490 static inline void put_numa_group(struct numa_group *grp)
23162491 {
2317
- if (atomic_dec_and_test(&grp->refcount))
2492
+ if (refcount_dec_and_test(&grp->refcount))
23182493 kfree_rcu(grp, rcu);
23192494 }
23202495
....@@ -2335,7 +2510,7 @@
23352510 if (!grp)
23362511 return;
23372512
2338
- atomic_set(&grp->refcount, 1);
2513
+ refcount_set(&grp->refcount, 1);
23392514 grp->active_nodes = 1;
23402515 grp->max_faults_cpu = 0;
23412516 spin_lock_init(&grp->lock);
....@@ -2522,8 +2697,8 @@
25222697 local = 1;
25232698
25242699 /*
2525
- * Retry task to preferred node migration periodically, in case it
2526
- * case it previously failed, or the scheduler moved us.
2700
+ * Retry to migrate task to preferred node periodically, in case it
2701
+ * previously failed, or the scheduler moved us.
25272702 */
25282703 if (time_after(jiffies, p->numa_migrate_retry)) {
25292704 task_numa_placement(p);
....@@ -2558,7 +2733,7 @@
25582733 * The expensive part of numa migration is done from task_work context.
25592734 * Triggered from task_tick_numa().
25602735 */
2561
-void task_numa_work(struct callback_head *work)
2736
+static void task_numa_work(struct callback_head *work)
25622737 {
25632738 unsigned long migrate, next_scan, now = jiffies;
25642739 struct task_struct *p = current;
....@@ -2571,7 +2746,7 @@
25712746
25722747 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
25732748
2574
- work->next = work; /* protect against double add */
2749
+ work->next = work;
25752750 /*
25762751 * Who cares about NUMA placement when they're dying.
25772752 *
....@@ -2618,7 +2793,7 @@
26182793 return;
26192794
26202795
2621
- if (!down_read_trylock(&mm->mmap_sem))
2796
+ if (!mmap_read_trylock(mm))
26222797 return;
26232798 vma = find_vma(mm, start);
26242799 if (!vma) {
....@@ -2646,7 +2821,7 @@
26462821 * Skip inaccessible VMAs to avoid any confusion between
26472822 * PROT_NONE and NUMA hinting ptes
26482823 */
2649
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2824
+ if (!vma_is_accessible(vma))
26502825 continue;
26512826
26522827 do {
....@@ -2686,7 +2861,7 @@
26862861 mm->numa_scan_offset = start;
26872862 else
26882863 reset_ptenuma_scan(p);
2689
- up_read(&mm->mmap_sem);
2864
+ mmap_read_unlock(mm);
26902865
26912866 /*
26922867 * Make sure tasks use at least 32x as much time to run other code
....@@ -2700,10 +2875,54 @@
27002875 }
27012876 }
27022877
2878
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2879
+{
2880
+ int mm_users = 0;
2881
+ struct mm_struct *mm = p->mm;
2882
+
2883
+ if (mm) {
2884
+ mm_users = atomic_read(&mm->mm_users);
2885
+ if (mm_users == 1) {
2886
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2887
+ mm->numa_scan_seq = 0;
2888
+ }
2889
+ }
2890
+ p->node_stamp = 0;
2891
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2892
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2893
+ /* Protect against double add, see task_tick_numa and task_numa_work */
2894
+ p->numa_work.next = &p->numa_work;
2895
+ p->numa_faults = NULL;
2896
+ RCU_INIT_POINTER(p->numa_group, NULL);
2897
+ p->last_task_numa_placement = 0;
2898
+ p->last_sum_exec_runtime = 0;
2899
+
2900
+ init_task_work(&p->numa_work, task_numa_work);
2901
+
2902
+ /* New address space, reset the preferred nid */
2903
+ if (!(clone_flags & CLONE_VM)) {
2904
+ p->numa_preferred_nid = NUMA_NO_NODE;
2905
+ return;
2906
+ }
2907
+
2908
+ /*
2909
+ * New thread, keep existing numa_preferred_nid which should be copied
2910
+ * already by arch_dup_task_struct but stagger when scans start.
2911
+ */
2912
+ if (mm) {
2913
+ unsigned int delay;
2914
+
2915
+ delay = min_t(unsigned int, task_scan_max(current),
2916
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2917
+ delay += 2 * TICK_NSEC;
2918
+ p->node_stamp = delay;
2919
+ }
2920
+}
2921
+
27032922 /*
27042923 * Drive the periodic memory faults..
27052924 */
2706
-void task_tick_numa(struct rq *rq, struct task_struct *curr)
2925
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
27072926 {
27082927 struct callback_head *work = &curr->numa_work;
27092928 u64 period, now;
....@@ -2728,10 +2947,8 @@
27282947 curr->numa_scan_period = task_scan_start(curr);
27292948 curr->node_stamp += period;
27302949
2731
- if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2732
- init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2733
- task_work_add(curr, work, true);
2734
- }
2950
+ if (!time_before(jiffies, curr->mm->numa_next_scan))
2951
+ task_work_add(curr, work, TWA_RESUME);
27352952 }
27362953 }
27372954
....@@ -2761,7 +2978,8 @@
27612978 * the preferred node.
27622979 */
27632980 if (dst_nid == p->numa_preferred_nid ||
2764
- (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
2981
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
2982
+ src_nid != p->numa_preferred_nid))
27652983 return;
27662984 }
27672985
....@@ -2791,8 +3009,6 @@
27913009 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
27923010 {
27933011 update_load_add(&cfs_rq->load, se->load.weight);
2794
- if (!parent_entity(se))
2795
- update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
27963012 #ifdef CONFIG_SMP
27973013 if (entity_is_task(se)) {
27983014 struct rq *rq = rq_of(cfs_rq);
....@@ -2808,8 +3024,6 @@
28083024 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
28093025 {
28103026 update_load_sub(&cfs_rq->load, se->load.weight);
2811
- if (!parent_entity(se))
2812
- update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
28133027 #ifdef CONFIG_SMP
28143028 if (entity_is_task(se)) {
28153029 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
....@@ -2856,26 +3070,18 @@
28563070 WRITE_ONCE(*ptr, res); \
28573071 } while (0)
28583072
3073
+/*
3074
+ * Remove and clamp on negative, from a local variable.
3075
+ *
3076
+ * A variant of sub_positive(), which does not use explicit load-store
3077
+ * and is thus optimized for local variable updates.
3078
+ */
3079
+#define lsub_positive(_ptr, _val) do { \
3080
+ typeof(_ptr) ptr = (_ptr); \
3081
+ *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3082
+} while (0)
3083
+
28593084 #ifdef CONFIG_SMP
2860
-static inline void
2861
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2862
-{
2863
- cfs_rq->runnable_weight += se->runnable_weight;
2864
-
2865
- cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2866
- cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2867
-}
2868
-
2869
-static inline void
2870
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2871
-{
2872
- cfs_rq->runnable_weight -= se->runnable_weight;
2873
-
2874
- sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2875
- sub_positive(&cfs_rq->avg.runnable_load_sum,
2876
- se_runnable(se) * se->avg.runnable_load_sum);
2877
-}
2878
-
28793085 static inline void
28803086 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
28813087 {
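
[Editor's note] Usage note for the lsub_positive() macro added in the hunk above: like sub_positive() it clamps at zero instead of letting an unsigned value wrap, but it operates on a plain local variable, so it skips the explicit load/store that sub_positive() uses for values shared with lockless readers. A standalone illustration (GNU C, for typeof):

/* Standalone illustration of the clamp-at-zero behaviour of lsub_positive(). */
#include <assert.h>
#include <stdio.h>

#define min_t(type, a, b)	((type)(a) < (type)(b) ? (type)(a) : (type)(b))

#define lsub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	*ptr -= min_t(typeof(*ptr), *ptr, _val);		\
} while (0)

int main(void)
{
	unsigned long util = 300;

	lsub_positive(&util, 100);	/* ordinary subtraction: 300 - 100 */
	assert(util == 200);

	lsub_positive(&util, 1000);	/* would underflow, clamps to 0 */
	assert(util == 0);

	printf("lsub_positive clamps instead of wrapping around\n");
	return 0;
}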
....@@ -2891,45 +3097,36 @@
28913097 }
28923098 #else
28933099 static inline void
2894
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2895
-static inline void
2896
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2897
-static inline void
28983100 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
28993101 static inline void
29003102 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
29013103 #endif
29023104
29033105 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2904
- unsigned long weight, unsigned long runnable)
3106
+ unsigned long weight)
29053107 {
29063108 if (se->on_rq) {
29073109 /* commit outstanding execution time */
29083110 if (cfs_rq->curr == se)
29093111 update_curr(cfs_rq);
2910
- account_entity_dequeue(cfs_rq, se);
2911
- dequeue_runnable_load_avg(cfs_rq, se);
3112
+ update_load_sub(&cfs_rq->load, se->load.weight);
29123113 }
29133114 dequeue_load_avg(cfs_rq, se);
29143115
2915
- se->runnable_weight = runnable;
29163116 update_load_set(&se->load, weight);
29173117
29183118 #ifdef CONFIG_SMP
29193119 do {
2920
- u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
3120
+ u32 divider = get_pelt_divider(&se->avg);
29213121
29223122 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2923
- se->avg.runnable_load_avg =
2924
- div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
29253123 } while (0);
29263124 #endif
29273125
29283126 enqueue_load_avg(cfs_rq, se);
2929
- if (se->on_rq) {
2930
- account_entity_enqueue(cfs_rq, se);
2931
- enqueue_runnable_load_avg(cfs_rq, se);
2932
- }
3127
+ if (se->on_rq)
3128
+ update_load_add(&cfs_rq->load, se->load.weight);
3129
+
29333130 }
29343131
29353132 void reweight_task(struct task_struct *p, int prio)
....@@ -2939,7 +3136,7 @@
29393136 struct load_weight *load = &se->load;
29403137 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
29413138
2942
- reweight_entity(cfs_rq, se, weight, weight);
3139
+ reweight_entity(cfs_rq, se, weight);
29433140 load->inv_weight = sched_prio_to_wmult[prio];
29443141 }
29453142
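
[Editor's note] For reference alongside reweight_task() above: the prio index it receives maps nice levels onto the kernel's sched_prio_to_weight[] table (nice 0 is 1024, each step is roughly a 1.25x change so one nice level shifts the CPU share by about 10% between competing tasks). The entries below are quoted from that table as I recall it, so verify against kernel/sched/core.c before relying on them:

/* A few assumed entries of sched_prio_to_weight[], indexed by nice + 20. */
#include <stdio.h>

int main(void)
{
	const struct { int nice; unsigned int weight; } w[] = {
		{ -20, 88761 }, { -1, 1277 }, { 0, 1024 }, { 1, 820 }, { 19, 15 },
	};

	for (unsigned int i = 0; i < sizeof(w) / sizeof(w[0]); i++)
		printf("nice %3d -> weight %5u (%.2fx of nice 0)\n",
		       w[i].nice, w[i].weight, w[i].weight / 1024.0);
	return 0;
}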
....@@ -3051,50 +3248,6 @@
30513248 */
30523249 return clamp_t(long, shares, MIN_SHARES, tg_shares);
30533250 }
3054
-
3055
-/*
3056
- * This calculates the effective runnable weight for a group entity based on
3057
- * the group entity weight calculated above.
3058
- *
3059
- * Because of the above approximation (2), our group entity weight is
3060
- * an load_avg based ratio (3). This means that it includes blocked load and
3061
- * does not represent the runnable weight.
3062
- *
3063
- * Approximate the group entity's runnable weight per ratio from the group
3064
- * runqueue:
3065
- *
3066
- * grq->avg.runnable_load_avg
3067
- * ge->runnable_weight = ge->load.weight * -------------------------- (7)
3068
- * grq->avg.load_avg
3069
- *
3070
- * However, analogous to above, since the avg numbers are slow, this leads to
3071
- * transients in the from-idle case. Instead we use:
3072
- *
3073
- * ge->runnable_weight = ge->load.weight *
3074
- *
3075
- * max(grq->avg.runnable_load_avg, grq->runnable_weight)
3076
- * ----------------------------------------------------- (8)
3077
- * max(grq->avg.load_avg, grq->load.weight)
3078
- *
3079
- * Where these max() serve both to use the 'instant' values to fix the slow
3080
- * from-idle and avoid the /0 on to-idle, similar to (6).
3081
- */
3082
-static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
3083
-{
3084
- long runnable, load_avg;
3085
-
3086
- load_avg = max(cfs_rq->avg.load_avg,
3087
- scale_load_down(cfs_rq->load.weight));
3088
-
3089
- runnable = max(cfs_rq->avg.runnable_load_avg,
3090
- scale_load_down(cfs_rq->runnable_weight));
3091
-
3092
- runnable *= shares;
3093
- if (load_avg)
3094
- runnable /= load_avg;
3095
-
3096
- return clamp_t(long, runnable, MIN_SHARES, shares);
3097
-}
30983251 #endif /* CONFIG_SMP */
30993252
31003253 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
....@@ -3106,7 +3259,7 @@
31063259 static void update_cfs_group(struct sched_entity *se)
31073260 {
31083261 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3109
- long shares, runnable;
3262
+ long shares;
31103263
31113264 if (!gcfs_rq)
31123265 return;
....@@ -3115,16 +3268,15 @@
31153268 return;
31163269
31173270 #ifndef CONFIG_SMP
3118
- runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
3271
+ shares = READ_ONCE(gcfs_rq->tg->shares);
31193272
31203273 if (likely(se->load.weight == shares))
31213274 return;
31223275 #else
31233276 shares = calc_group_shares(gcfs_rq);
3124
- runnable = calc_group_runnable(gcfs_rq, shares);
31253277 #endif
31263278
3127
- reweight_entity(cfs_rq_of(se), se, shares, runnable);
3279
+ reweight_entity(cfs_rq_of(se), se, shares);
31283280 }
31293281
31303282 #else /* CONFIG_FAIR_GROUP_SCHED */
....@@ -3137,7 +3289,7 @@
31373289 {
31383290 struct rq *rq = rq_of(cfs_rq);
31393291
3140
- if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
3292
+ if (&rq->cfs == cfs_rq) {
31413293 /*
31423294 * There are a few boundary cases this might miss but it should
31433295 * get called often enough that that should (hopefully) not be
....@@ -3161,7 +3313,6 @@
31613313 /**
31623314 * update_tg_load_avg - update the tg's load avg
31633315 * @cfs_rq: the cfs_rq whose avg changed
3164
- * @force: update regardless of how small the difference
31653316 *
31663317 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
31673318 * However, because tg->load_avg is a global value there are performance
....@@ -3173,7 +3324,7 @@
31733324 *
31743325 * Updating tg's load_avg is necessary before update_cfs_share().
31753326 */
3176
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3327
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
31773328 {
31783329 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
31793330
....@@ -3183,11 +3334,9 @@
31833334 if (cfs_rq->tg == &root_task_group)
31843335 return;
31853336
3186
- if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3337
+ if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
31873338 atomic_long_add(delta, &cfs_rq->tg->load_avg);
31883339 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3189
-
3190
- trace_sched_load_tg(cfs_rq);
31913340 }
31923341 }
31933342
....@@ -3240,7 +3389,6 @@
32403389 se->avg.last_update_time = n_last_update_time;
32413390 }
32423391
3243
-
32443392 /*
32453393 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
32463394 * propagate its contribution. The key to this propagation is the invariant
....@@ -3251,11 +3399,11 @@
32513399 * _IFF_ we look at the pure running and runnable sums. Because they
32523400 * represent the very same entity, just at different points in the hierarchy.
32533401 *
3254
- * Per the above update_tg_cfs_util() is trivial and simply copies the running
3255
- * sum over (but still wrong, because the group entity and group rq do not have
3256
- * their PELT windows aligned).
3402
+ * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
3403
+ * and simply copies the running/runnable sum over (but still wrong, because
3404
+ * the group entity and group rq do not have their PELT windows aligned).
32573405 *
3258
- * However, update_tg_cfs_runnable() is more complex. So we have:
3406
+ * However, update_tg_cfs_load() is more complex. So we have:
32593407 *
32603408 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
32613409 *
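
[Editor's note] The hunks that follow repeatedly replace the open-coded "LOAD_AVG_MAX - 1024 + sa->period_contrib" divider with get_pelt_divider(). That helper lives in kernel/sched/pelt.h, not in this diff; the sketch below is my reading of it (including LOAD_AVG_MAX = 47742) and should be treated as an assumption:

/* Userspace sketch of the PELT divider used by the hunks below. */
#include <stdio.h>

#define LOAD_AVG_MAX		47742	/* assumed: saturated PELT geometric series */
#define PELT_MIN_DIVIDER	(LOAD_AVG_MAX - 1024)

struct sched_avg { unsigned int period_contrib; };

static inline unsigned int get_pelt_divider(const struct sched_avg *avg)
{
	/* Maximum attainable *_sum given how far we are into the current window. */
	return PELT_MIN_DIVIDER + avg->period_contrib;
}

int main(void)
{
	struct sched_avg a = { .period_contrib = 512 };

	printf("divider mid-window: %u (minimum %u)\n",
	       get_pelt_divider(&a), PELT_MIN_DIVIDER);
	return 0;
}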
....@@ -3308,45 +3456,75 @@
33083456 * XXX: only do this for the part of runnable > running ?
33093457 *
33103458 */
3311
-
33123459 static inline void
33133460 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
33143461 {
33153462 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3463
+ u32 divider;
33163464
33173465 /* Nothing to update */
33183466 if (!delta)
33193467 return;
33203468
33213469 /*
3322
- * The relation between sum and avg is:
3323
- *
3324
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
3325
- *
3326
- * however, the PELT windows are not aligned between grq and gse.
3470
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3471
+ * See ___update_load_avg() for details.
33273472 */
3473
+ divider = get_pelt_divider(&cfs_rq->avg);
33283474
33293475 /* Set new sched_entity's utilization */
33303476 se->avg.util_avg = gcfs_rq->avg.util_avg;
3331
- se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3477
+ se->avg.util_sum = se->avg.util_avg * divider;
33323478
33333479 /* Update parent cfs_rq utilization */
33343480 add_positive(&cfs_rq->avg.util_avg, delta);
3335
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3481
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
33363482 }
33373483
33383484 static inline void
33393485 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
33403486 {
3487
+ long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3488
+ u32 divider;
3489
+
3490
+ /* Nothing to update */
3491
+ if (!delta)
3492
+ return;
3493
+
3494
+ /*
3495
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3496
+ * See ___update_load_avg() for details.
3497
+ */
3498
+ divider = get_pelt_divider(&cfs_rq->avg);
3499
+
3500
+ /* Set new sched_entity's runnable */
3501
+ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
3502
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
3503
+
3504
+ /* Update parent cfs_rq runnable */
3505
+ add_positive(&cfs_rq->avg.runnable_avg, delta);
3506
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3507
+}
3508
+
3509
+static inline void
3510
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3511
+{
33413512 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3342
- unsigned long runnable_load_avg, load_avg;
3343
- u64 runnable_load_sum, load_sum = 0;
3513
+ unsigned long load_avg;
3514
+ u64 load_sum = 0;
33443515 s64 delta_sum;
3516
+ u32 divider;
33453517
33463518 if (!runnable_sum)
33473519 return;
33483520
33493521 gcfs_rq->prop_runnable_sum = 0;
3522
+
3523
+ /*
3524
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3525
+ * See ___update_load_avg() for details.
3526
+ */
3527
+ divider = get_pelt_divider(&cfs_rq->avg);
33503528
33513529 if (runnable_sum >= 0) {
33523530 /*
....@@ -3354,7 +3532,7 @@
33543532 * the CPU is saturated running == runnable.
33553533 */
33563534 runnable_sum += se->avg.load_sum;
3357
- runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3535
+ runnable_sum = min_t(long, runnable_sum, divider);
33583536 } else {
33593537 /*
33603538 * Estimate the new unweighted runnable_sum of the gcfs_rq by
....@@ -3379,7 +3557,7 @@
33793557 runnable_sum = max(runnable_sum, running_sum);
33803558
33813559 load_sum = (s64)se_weight(se) * runnable_sum;
3382
- load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3560
+ load_avg = div_s64(load_sum, divider);
33833561
33843562 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
33853563 delta_avg = load_avg - se->avg.load_avg;
....@@ -3388,19 +3566,6 @@
33883566 se->avg.load_avg = load_avg;
33893567 add_positive(&cfs_rq->avg.load_avg, delta_avg);
33903568 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3391
-
3392
- runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3393
- runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3394
- delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3395
- delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3396
-
3397
- se->avg.runnable_load_sum = runnable_sum;
3398
- se->avg.runnable_load_avg = runnable_load_avg;
3399
-
3400
- if (se->on_rq) {
3401
- add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3402
- add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3403
- }
34043569 }
34053570
34063571 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
....@@ -3429,9 +3594,10 @@
34293594
34303595 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
34313596 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3597
+ update_tg_cfs_load(cfs_rq, se, gcfs_rq);
34323598
3433
- trace_sched_load_cfs_rq(cfs_rq);
3434
- trace_sched_load_se(se);
3599
+ trace_pelt_cfs_tp(cfs_rq);
3600
+ trace_pelt_se_tp(se);
34353601
34363602 return 1;
34373603 }
....@@ -3468,7 +3634,7 @@
34683634
34693635 #else /* CONFIG_FAIR_GROUP_SCHED */
34703636
3471
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3637
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
34723638
34733639 static inline int propagate_entity_load_avg(struct sched_entity *se)
34743640 {
....@@ -3498,18 +3664,18 @@
34983664 static inline int
34993665 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
35003666 {
3501
- unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3667
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
35023668 struct sched_avg *sa = &cfs_rq->avg;
35033669 int decayed = 0;
35043670
35053671 if (cfs_rq->removed.nr) {
35063672 unsigned long r;
3507
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3673
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
35083674
35093675 raw_spin_lock(&cfs_rq->removed.lock);
35103676 swap(cfs_rq->removed.util_avg, removed_util);
35113677 swap(cfs_rq->removed.load_avg, removed_load);
3512
- swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3678
+ swap(cfs_rq->removed.runnable_avg, removed_runnable);
35133679 cfs_rq->removed.nr = 0;
35143680 raw_spin_unlock(&cfs_rq->removed.lock);
35153681
....@@ -3520,8 +3686,29 @@
35203686 r = removed_util;
35213687 sub_positive(&sa->util_avg, r);
35223688 sub_positive(&sa->util_sum, r * divider);
3689
+ /*
3690
+ * Because of rounding, se->util_sum might ends up being +1 more than
3691
+ * cfs->util_sum. Although this is not a problem by itself, detaching
3692
+ * a lot of tasks with the rounding problem between 2 updates of
3693
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
3694
+ * cfs_util_avg is not.
3695
+ * Check that util_sum is still above its lower bound for the new
3696
+ * util_avg. Given that period_contrib might have moved since the last
3697
+ * sync, we are only sure that util_sum must be above or equal to
3698
+ * util_avg * minimum possible divider
3699
+ */
3700
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
35233701
3524
- add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3702
+ r = removed_runnable;
3703
+ sub_positive(&sa->runnable_avg, r);
3704
+ sub_positive(&sa->runnable_sum, r * divider);
3705
+
3706
+ /*
3707
+ * removed_runnable is the unweighted version of removed_load so we
3708
+ * can use it to estimate removed_load_sum.
3709
+ */
3710
+ add_tg_cfs_propagate(cfs_rq,
3711
+ -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
35253712
35263713 decayed = 1;
35273714 }
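
[Editor's note] One way to read the new lower bound above: util_avg is maintained as roughly util_sum / divider, and the divider is never smaller than PELT_MIN_DIVIDER, so after subtracting the slightly rounded-up sums of detached entities, util_sum must not be allowed to drop below util_avg * PELT_MIN_DIVIDER. A standalone restatement of that clamp (LOAD_AVG_MAX = 47742 is assumed, as above):

/* Standalone restatement of the util_sum lower-bound clamp added above. */
#include <stdio.h>

#define LOAD_AVG_MAX		47742
#define PELT_MIN_DIVIDER	(LOAD_AVG_MAX - 1024)

int main(void)
{
	unsigned int util_avg = 100;
	/* Pretend accumulated +1 rounding from many detaches left util_sum low. */
	unsigned int util_sum = util_avg * PELT_MIN_DIVIDER - 300;

	/* The max_t() in the hunk restores the invariant. */
	if (util_sum < util_avg * PELT_MIN_DIVIDER)
		util_sum = util_avg * PELT_MIN_DIVIDER;

	printf("util_avg %u keeps util_sum >= %u\n", util_avg, util_sum);
	return 0;
}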
....@@ -3533,9 +3720,6 @@
35333720 cfs_rq->load_last_update_time_copy = sa->last_update_time;
35343721 #endif
35353722
3536
- if (decayed)
3537
- cfs_rq_util_change(cfs_rq, 0);
3538
-
35393723 return decayed;
35403724 }
35413725
....@@ -3543,14 +3727,17 @@
35433727 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
35443728 * @cfs_rq: cfs_rq to attach to
35453729 * @se: sched_entity to attach
3546
- * @flags: migration hints
35473730 *
35483731 * Must call update_cfs_rq_load_avg() before this, since we rely on
35493732 * cfs_rq->avg.last_update_time being current.
35503733 */
3551
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3734
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
35523735 {
3553
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3736
+ /*
3737
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3738
+ * See ___update_load_avg() for details.
3739
+ */
3740
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
35543741
35553742 /*
35563743 * When we attach the @se to the @cfs_rq, we must align the decay
....@@ -3570,23 +3757,25 @@
35703757 */
35713758 se->avg.util_sum = se->avg.util_avg * divider;
35723759
3573
- se->avg.load_sum = divider;
3574
- if (se_weight(se)) {
3575
- se->avg.load_sum =
3576
- div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3577
- }
3760
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
35783761
3579
- se->avg.runnable_load_sum = se->avg.load_sum;
3762
+ se->avg.load_sum = se->avg.load_avg * divider;
3763
+ if (se_weight(se) < se->avg.load_sum)
3764
+ se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
3765
+ else
3766
+ se->avg.load_sum = 1;
35803767
35813768 enqueue_load_avg(cfs_rq, se);
35823769 cfs_rq->avg.util_avg += se->avg.util_avg;
35833770 cfs_rq->avg.util_sum += se->avg.util_sum;
3771
+ cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
3772
+ cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
35843773
35853774 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
35863775
3587
- cfs_rq_util_change(cfs_rq, flags);
3776
+ cfs_rq_util_change(cfs_rq, 0);
35883777
3589
- trace_sched_load_cfs_rq(cfs_rq);
3778
+ trace_pelt_cfs_tp(cfs_rq);
35903779 }
35913780
35923781 /**
....@@ -3602,12 +3791,14 @@
36023791 dequeue_load_avg(cfs_rq, se);
36033792 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
36043793 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3794
+ sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
3795
+ sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
36053796
36063797 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
36073798
36083799 cfs_rq_util_change(cfs_rq, 0);
36093800
3610
- trace_sched_load_cfs_rq(cfs_rq);
3801
+ trace_pelt_cfs_tp(cfs_rq);
36113802 }
36123803
36133804 /*
....@@ -3623,12 +3814,15 @@
36233814 u64 now = cfs_rq_clock_pelt(cfs_rq);
36243815 int decayed;
36253816
3817
+ trace_android_vh_prepare_update_load_avg_se(se, flags);
36263818 /*
36273819 * Track task load average for carrying it to new CPU after migrated, and
36283820 * track group sched_entity load average for task_h_load calc in migration
36293821 */
36303822 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
36313823 __update_load_avg_se(now, cfs_rq, se);
3824
+
3825
+ trace_android_vh_finish_update_load_avg_se(se, flags);
36323826
36333827 decayed = update_cfs_rq_load_avg(now, cfs_rq);
36343828 decayed |= propagate_entity_load_avg(se);
....@@ -3642,11 +3836,15 @@
36423836 *
36433837 * IOW we're enqueueing a task on a new CPU.
36443838 */
3645
- attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3646
- update_tg_load_avg(cfs_rq, 0);
3839
+ attach_entity_load_avg(cfs_rq, se);
3840
+ update_tg_load_avg(cfs_rq);
36473841
3648
- } else if (decayed && (flags & UPDATE_TG))
3649
- update_tg_load_avg(cfs_rq, 0);
3842
+ } else if (decayed) {
3843
+ cfs_rq_util_change(cfs_rq, 0);
3844
+
3845
+ if (flags & UPDATE_TG)
3846
+ update_tg_load_avg(cfs_rq);
3847
+ }
36503848 }
36513849
36523850 #ifndef CONFIG_64BIT
....@@ -3674,20 +3872,22 @@
36743872 * Synchronize entity load avg of dequeued entity without locking
36753873 * the previous rq.
36763874 */
3677
-void sync_entity_load_avg(struct sched_entity *se)
3875
+static void sync_entity_load_avg(struct sched_entity *se)
36783876 {
36793877 struct cfs_rq *cfs_rq = cfs_rq_of(se);
36803878 u64 last_update_time;
36813879
36823880 last_update_time = cfs_rq_last_update_time(cfs_rq);
3881
+ trace_android_vh_prepare_update_load_avg_se(se, 0);
36833882 __update_load_avg_blocked_se(last_update_time, se);
3883
+ trace_android_vh_finish_update_load_avg_se(se, 0);
36843884 }
36853885
36863886 /*
36873887 * Task first catches up with cfs_rq, and then subtract
36883888 * itself from the cfs_rq (task must be off the queue now).
36893889 */
3690
-void remove_entity_load_avg(struct sched_entity *se)
3890
+static void remove_entity_load_avg(struct sched_entity *se)
36913891 {
36923892 struct cfs_rq *cfs_rq = cfs_rq_of(se);
36933893 unsigned long flags;
....@@ -3696,10 +3896,6 @@
36963896 * tasks cannot exit without having gone through wake_up_new_task() ->
36973897 * post_init_entity_util_avg() which will have added things to the
36983898 * cfs_rq, so we can remove unconditionally.
3699
- *
3700
- * Similarly for groups, they will have passed through
3701
- * post_init_entity_util_avg() before unregister_sched_fair_group()
3702
- * calls this.
37033899 */
37043900
37053901 sync_entity_load_avg(se);
....@@ -3708,13 +3904,13 @@
37083904 ++cfs_rq->removed.nr;
37093905 cfs_rq->removed.util_avg += se->avg.util_avg;
37103906 cfs_rq->removed.load_avg += se->avg.load_avg;
3711
- cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
3907
+ cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
37123908 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
37133909 }
37143910
3715
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3911
+static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
37163912 {
3717
- return cfs_rq->avg.runnable_load_avg;
3913
+ return cfs_rq->avg.runnable_avg;
37183914 }
37193915
37203916 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
....@@ -3722,7 +3918,7 @@
37223918 return cfs_rq->avg.load_avg;
37233919 }
37243920
3725
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3921
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
37263922
37273923 static inline unsigned long task_util(struct task_struct *p)
37283924 {
....@@ -3733,10 +3929,10 @@
37333929 {
37343930 struct util_est ue = READ_ONCE(p->se.avg.util_est);
37353931
3736
- return max(ue.ewma, ue.enqueued);
3932
+ return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
37373933 }
37383934
3739
-unsigned long task_util_est(struct task_struct *p)
3935
+static inline unsigned long task_util_est(struct task_struct *p)
37403936 {
37413937 return max(task_util(p), _task_util_est(p));
37423938 }
....@@ -3765,13 +3961,29 @@
37653961
37663962 /* Update root cfs_rq's estimated utilization */
37673963 enqueued = cfs_rq->avg.util_est.enqueued;
3768
- enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
3964
+ enqueued += _task_util_est(p);
37693965 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
37703966
3771
- /* Update plots for Task and CPU estimated utilization */
3772
- trace_sched_util_est_task(p, &p->se.avg);
3773
- trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
3967
+ trace_sched_util_est_cfs_tp(cfs_rq);
37743968 }
3969
+
3970
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
3971
+ struct task_struct *p)
3972
+{
3973
+ unsigned int enqueued;
3974
+
3975
+ if (!sched_feat(UTIL_EST))
3976
+ return;
3977
+
3978
+ /* Update root cfs_rq's estimated utilization */
3979
+ enqueued = cfs_rq->avg.util_est.enqueued;
3980
+ enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
3981
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3982
+
3983
+ trace_sched_util_est_cfs_tp(cfs_rq);
3984
+}
3985
+
3986
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
37753987
37763988 /*
37773989 * Check if a (signed) value is within a specified (unsigned) margin,
....@@ -3786,24 +3998,20 @@
37863998 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
37873999 }
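within_margin() above tests |value| < margin with a single unsigned comparison: adding (margin - 1) maps the open interval (-margin, margin) onto [0, 2*margin - 2], so anything outside it wraps to a large unsigned number and fails the compare. A small self-contained check of that property; the margin of 10 is just the ~1% of SCHED_CAPACITY_SCALE used by UTIL_EST_MARGIN above:

#include <assert.h>
#include <stdlib.h>

/* Same trick as within_margin() above: true iff -margin < value < margin. */
static int within_margin(int value, int margin)
{
	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}

int main(void)
{
	int margin = 10;	/* ~1% of SCHED_CAPACITY_SCALE (1024) */
	int v;

	/* The shifted unsigned compare agrees with the naive |v| < margin test. */
	for (v = -50; v <= 50; v++)
		assert(within_margin(v, margin) == (abs(v) < margin));

	return 0;
}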
37884000
3789
-static void
3790
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
4001
+static inline void util_est_update(struct cfs_rq *cfs_rq,
4002
+ struct task_struct *p,
4003
+ bool task_sleep)
37914004 {
3792
- long last_ewma_diff;
4005
+ long last_ewma_diff, last_enqueued_diff;
37934006 struct util_est ue;
3794
- int cpu;
4007
+ int ret = 0;
4008
+
4009
+ trace_android_rvh_util_est_update(cfs_rq, p, task_sleep, &ret);
4010
+ if (ret)
4011
+ return;
37954012
37964013 if (!sched_feat(UTIL_EST))
37974014 return;
3798
-
3799
- /* Update root cfs_rq's estimated utilization */
3800
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
3801
- ue.enqueued -= min_t(unsigned int, ue.enqueued,
3802
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
3803
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3804
-
3805
- /* Update plots for CPU's estimated utilization */
3806
- trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
38074015
38084016 /*
38094017 * Skip update of task's estimated utilization when the task has not
....@@ -3820,11 +4028,13 @@
38204028 if (ue.enqueued & UTIL_AVG_UNCHANGED)
38214029 return;
38224030
4031
+ last_enqueued_diff = ue.enqueued;
4032
+
38234033 /*
38244034 * Reset EWMA on utilization increases, the moving average is used only
38254035 * to smooth utilization decreases.
38264036 */
3827
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
4037
+ ue.enqueued = task_util(p);
38284038 if (sched_feat(UTIL_EST_FASTUP)) {
38294039 if (ue.ewma < ue.enqueued) {
38304040 ue.ewma = ue.enqueued;
....@@ -3833,19 +4043,23 @@
38334043 }
38344044
38354045 /*
3836
- * Skip update of task's estimated utilization when its EWMA is
4046
+ * Skip update of task's estimated utilization when its members are
38374047 * already ~1% close to its last activation value.
38384048 */
38394049 last_ewma_diff = ue.enqueued - ue.ewma;
3840
- if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
4050
+ last_enqueued_diff -= ue.enqueued;
4051
+ if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
4052
+ if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
4053
+ goto done;
4054
+
38414055 return;
4056
+ }
38424057
38434058 /*
38444059 * To avoid overestimation of actual task utilization, skip updates if
38454060 * we cannot guarantee there is idle time on this CPU.
38464061 */
3847
- cpu = cpu_of(rq_of(cfs_rq));
3848
- if (task_util(p) > capacity_orig_of(cpu))
4062
+ if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
38494063 return;
38504064
38514065 /*
....@@ -3869,39 +4083,26 @@
38694083 ue.ewma += last_ewma_diff;
38704084 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
38714085 done:
4086
+ ue.enqueued |= UTIL_AVG_UNCHANGED;
38724087 WRITE_ONCE(p->se.avg.util_est, ue);
38734088
3874
- /* Update plots for Task's estimated utilization */
3875
- trace_sched_util_est_task(p, &p->se.avg);
4089
+ trace_sched_util_est_se_tp(&p->se);
38764090 }
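The shift/add sequence feeding the done: label above is the usual exponentially weighted moving average, ewma' = (1 - w) * ewma + w * enqueued with w = 1/2^UTIL_EST_WEIGHT_SHIFT. A hedged standalone sketch; the shift value of 2 (w = 1/4) is an assumption about this tree's sched.h:

#include <stdio.h>

#define UTIL_EST_WEIGHT_SHIFT	2	/* assumed: w = 1/4 */

/* Same arithmetic as the EWMA update above, written out in one place. */
static unsigned int ewma_update(unsigned int ewma, unsigned int enqueued)
{
	int last_ewma_diff = (int)enqueued - (int)ewma;
	int new_ewma = ((int)ewma << UTIL_EST_WEIGHT_SHIFT) + last_ewma_diff;

	/* Equivalent to (3 * old_ewma + enqueued) / 4 for shift == 2. */
	return (unsigned int)(new_ewma >> UTIL_EST_WEIGHT_SHIFT);
}

int main(void)
{
	printf("%u\n", ewma_update(400, 200));	/* -> 350 */
	return 0;
}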
38774091
38784092 static inline int task_fits_capacity(struct task_struct *p, long capacity)
38794093 {
3880
- return capacity * 1024 > uclamp_task_util(p) * capacity_margin;
4094
+ return fits_capacity(uclamp_task_util(p), capacity);
38814095 }
3882
-
3883
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
3884
-static inline bool task_fits_max(struct task_struct *p, int cpu)
3885
-{
3886
- unsigned long capacity = capacity_of(cpu);
3887
- unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
3888
-
3889
- if (capacity == max_capacity)
3890
- return true;
3891
-
3892
- if (capacity * capacity_margin > max_capacity * 1024)
3893
- return true;
3894
-
3895
- return task_fits_capacity(p, capacity);
3896
-}
3897
-#endif
38984096
38994097 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
39004098 {
3901
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
4099
+ bool need_update = true;
4100
+
4101
+ trace_android_rvh_update_misfit_status(p, rq, &need_update);
4102
+ if (!static_branch_unlikely(&sched_asym_cpucapacity) || !need_update)
39024103 return;
39034104
3904
- if (!p) {
4105
+ if (!p || p->nr_cpus_allowed == 1) {
39054106 rq->misfit_task_load = 0;
39064107 return;
39074108 }
....@@ -3911,7 +4112,11 @@
39114112 return;
39124113 }
39134114
3914
- rq->misfit_task_load = task_h_load(p);
4115
+ /*
4116
+ * Make sure that misfit_task_load will not be null even if
4117
+ * task_h_load() returns 0.
4118
+ */
4119
+ rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
39154120 }
39164121
39174122 #else /* CONFIG_SMP */
....@@ -3928,11 +4133,11 @@
39284133 static inline void remove_entity_load_avg(struct sched_entity *se) {}
39294134
39304135 static inline void
3931
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
4136
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
39324137 static inline void
39334138 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
39344139
3935
-static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
4140
+static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
39364141 {
39374142 return 0;
39384143 }
....@@ -3941,8 +4146,11 @@
39414146 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
39424147
39434148 static inline void
3944
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
3945
- bool task_sleep) {}
4149
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4150
+
4151
+static inline void
4152
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
4153
+ bool task_sleep) {}
39464154 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
39474155
39484156 #endif /* CONFIG_SMP */
....@@ -3990,6 +4198,7 @@
39904198
39914199 /* ensure we never gain time by being placed backwards. */
39924200 se->vruntime = max_vruntime(se->vruntime, vruntime);
4201
+ trace_android_rvh_place_entity(cfs_rq, se, initial, vruntime);
39934202 }
39944203
39954204 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
....@@ -4014,6 +4223,7 @@
40144223 #endif
40154224 }
40164225
4226
+static inline bool cfs_bandwidth_used(void);
40174227
40184228 /*
40194229 * MIGRATION
....@@ -4078,8 +4288,8 @@
40784288 * - Add its new weight to cfs_rq->load.weight
40794289 */
40804290 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4291
+ se_update_runnable(se);
40814292 update_cfs_group(se);
4082
- enqueue_runnable_load_avg(cfs_rq, se);
40834293 account_entity_enqueue(cfs_rq, se);
40844294
40854295 if (flags & ENQUEUE_WAKEUP)
....@@ -4092,10 +4302,16 @@
40924302 __enqueue_entity(cfs_rq, se);
40934303 se->on_rq = 1;
40944304
4095
- if (cfs_rq->nr_running == 1) {
4305
+ /*
4306
+ * When bandwidth control is enabled, cfs might have been removed
4307
+ * because a parent has been throttled while cfs->nr_running > 1. Try to
4308
+ * add it unconditionally.

4309
+ */
4310
+ if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
40964311 list_add_leaf_cfs_rq(cfs_rq);
4312
+
4313
+ if (cfs_rq->nr_running == 1)
40974314 check_enqueue_throttle(cfs_rq);
4098
- }
40994315 }
41004316
41014317 static void __clear_buddies_last(struct sched_entity *se)
....@@ -4156,13 +4372,13 @@
41564372 /*
41574373 * When dequeuing a sched_entity, we must:
41584374 * - Update loads to have both entity and cfs_rq synced with now.
4159
- * - Substract its load from the cfs_rq->runnable_avg.
4160
- * - Substract its previous weight from cfs_rq->load.weight.
4375
+ * - Subtract its load from the cfs_rq->runnable_avg.
4376
+ * - Subtract its previous weight from cfs_rq->load.weight.
41614377 * - For group entity, update its weight to reflect the new share
41624378 * of its group cfs_rq.
41634379 */
41644380 update_load_avg(cfs_rq, se, UPDATE_TG);
4165
- dequeue_runnable_load_avg(cfs_rq, se);
4381
+ se_update_runnable(se);
41664382
41674383 update_stats_dequeue(cfs_rq, se, flags);
41684384
....@@ -4206,11 +4422,16 @@
42064422 unsigned long ideal_runtime, delta_exec;
42074423 struct sched_entity *se;
42084424 s64 delta;
4425
+ bool skip_preempt = false;
42094426
42104427 ideal_runtime = sched_slice(cfs_rq, curr);
42114428 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4429
+ trace_android_rvh_check_preempt_tick(current, &ideal_runtime, &skip_preempt,
4430
+ delta_exec, cfs_rq, curr, sysctl_sched_min_granularity);
4431
+ if (skip_preempt)
4432
+ return;
42124433 if (delta_exec > ideal_runtime) {
4213
- resched_curr(rq_of(cfs_rq));
4434
+ resched_curr_lazy(rq_of(cfs_rq));
42144435 /*
42154436 * The current task ran long enough, ensure it doesn't get
42164437 * re-elected due to buddy favours.
....@@ -4234,11 +4455,10 @@
42344455 return;
42354456
42364457 if (delta > ideal_runtime)
4237
- resched_curr(rq_of(cfs_rq));
4458
+ resched_curr_lazy(rq_of(cfs_rq));
42384459 }
42394460
4240
-static void
4241
-set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4461
+void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
42424462 {
42434463 /* 'current' is not kept within the tree. */
42444464 if (se->on_rq) {
....@@ -4260,7 +4480,8 @@
42604480 * least twice that of our own weight (i.e. dont track it
42614481 * when there are only lesser-weight tasks around):
42624482 */
4263
- if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4483
+ if (schedstat_enabled() &&
4484
+ rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
42644485 schedstat_set(se->statistics.slice_max,
42654486 max((u64)schedstat_val(se->statistics.slice_max),
42664487 se->sum_exec_runtime - se->prev_sum_exec_runtime));
....@@ -4268,6 +4489,8 @@
42684489
42694490 se->prev_sum_exec_runtime = se->sum_exec_runtime;
42704491 }
4492
+EXPORT_SYMBOL_GPL(set_next_entity);
4493
+
42714494
42724495 static int
42734496 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
....@@ -4283,7 +4506,11 @@
42834506 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
42844507 {
42854508 struct sched_entity *left = __pick_first_entity(cfs_rq);
4286
- struct sched_entity *se;
4509
+ struct sched_entity *se = NULL;
4510
+
4511
+ trace_android_rvh_pick_next_entity(cfs_rq, curr, &se);
4512
+ if (se)
4513
+ goto done;
42874514
42884515 /*
42894516 * If curr is set we have to see if its left of the leftmost entity
....@@ -4313,18 +4540,19 @@
43134540 se = second;
43144541 }
43154542
4316
- /*
4317
- * Prefer last buddy, try to return the CPU to a preempted task.
4318
- */
4319
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4320
- se = cfs_rq->last;
4321
-
4322
- /*
4323
- * Someone really wants this to run. If it's not unfair, run it.
4324
- */
4325
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4543
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
4544
+ /*
4545
+ * Someone really wants this to run. If it's not unfair, run it.
4546
+ */
43264547 se = cfs_rq->next;
4548
+ } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
4549
+ /*
4550
+ * Prefer last buddy, try to return the CPU to a preempted task.
4551
+ */
4552
+ se = cfs_rq->last;
4553
+ }
43274554
4555
+done:
43284556 clear_buddies(cfs_rq, se);
43294557
43304558 return se;
....@@ -4376,7 +4604,7 @@
43764604 * validating it and just reschedule.
43774605 */
43784606 if (queued) {
4379
- resched_curr(rq_of(cfs_rq));
4607
+ resched_curr_lazy(rq_of(cfs_rq));
43804608 return;
43814609 }
43824610 /*
....@@ -4457,26 +4685,17 @@
44574685 return &tg->cfs_bandwidth;
44584686 }
44594687
4460
-/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
4461
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4462
-{
4463
- if (unlikely(cfs_rq->throttle_count))
4464
- return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
4465
-
4466
- return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
4467
-}
4468
-
44694688 /* returns 0 on failure to allocate runtime */
4470
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4689
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
4690
+ struct cfs_rq *cfs_rq, u64 target_runtime)
44714691 {
4472
- struct task_group *tg = cfs_rq->tg;
4473
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4474
- u64 amount = 0, min_amount;
4692
+ u64 min_amount, amount = 0;
4693
+
4694
+ lockdep_assert_held(&cfs_b->lock);
44754695
44764696 /* note: this is a positive sum as runtime_remaining <= 0 */
4477
- min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4697
+ min_amount = target_runtime - cfs_rq->runtime_remaining;
44784698
4479
- raw_spin_lock(&cfs_b->lock);
44804699 if (cfs_b->quota == RUNTIME_INF)
44814700 amount = min_amount;
44824701 else {
....@@ -4488,11 +4707,23 @@
44884707 cfs_b->idle = 0;
44894708 }
44904709 }
4491
- raw_spin_unlock(&cfs_b->lock);
44924710
44934711 cfs_rq->runtime_remaining += amount;
44944712
44954713 return cfs_rq->runtime_remaining > 0;
4714
+}
4715
+
4716
+/* returns 0 on failure to allocate runtime */
4717
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4718
+{
4719
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4720
+ int ret;
4721
+
4722
+ raw_spin_lock(&cfs_b->lock);
4723
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
4724
+ raw_spin_unlock(&cfs_b->lock);
4725
+
4726
+ return ret;
44964727 }
44974728
44984729 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
....@@ -4510,7 +4741,7 @@
45104741 * hierarchy can be throttled
45114742 */
45124743 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4513
- resched_curr(rq_of(cfs_rq));
4744
+ resched_curr_lazy(rq_of(cfs_rq));
45144745 }
45154746
45164747 static __always_inline
....@@ -4557,9 +4788,8 @@
45574788
45584789 cfs_rq->throttle_count--;
45594790 if (!cfs_rq->throttle_count) {
4560
- /* adjust cfs_rq_clock_task() */
4561
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4562
- cfs_rq->throttled_clock_task;
4791
+ cfs_rq->throttled_clock_pelt_time += rq_clock_task_mult(rq) -
4792
+ cfs_rq->throttled_clock_pelt;
45634793
45644794 /* Add cfs_rq with already running entity in the list */
45654795 if (cfs_rq->nr_running >= 1)
....@@ -4576,7 +4806,7 @@
45764806
45774807 /* group is entering throttled state, stop time */
45784808 if (!cfs_rq->throttle_count) {
4579
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
4809
+ cfs_rq->throttled_clock_pelt = rq_clock_task_mult(rq);
45804810 list_del_leaf_cfs_rq(cfs_rq);
45814811 }
45824812 cfs_rq->throttle_count++;
....@@ -4584,13 +4814,33 @@
45844814 return 0;
45854815 }
45864816
4587
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4817
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
45884818 {
45894819 struct rq *rq = rq_of(cfs_rq);
45904820 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
45914821 struct sched_entity *se;
4592
- long task_delta, dequeue = 1;
4593
- bool empty;
4822
+ long task_delta, idle_task_delta, dequeue = 1;
4823
+
4824
+ raw_spin_lock(&cfs_b->lock);
4825
+ /* This will start the period timer if necessary */
4826
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
4827
+ /*
4828
+ * We have raced with bandwidth becoming available, and if we
4829
+ * actually throttled the timer might not unthrottle us for an
4830
+ * entire period. We additionally needed to make sure that any
4831
+ * subsequent check_cfs_rq_runtime calls agree not to throttle
4832
+ * us, as we may commit to do cfs put_prev+pick_next, so we ask
4833
+ * for 1ns of runtime rather than just check cfs_b.
4834
+ */
4835
+ dequeue = 0;
4836
+ } else {
4837
+ list_add_tail_rcu(&cfs_rq->throttled_list,
4838
+ &cfs_b->throttled_cfs_rq);
4839
+ }
4840
+ raw_spin_unlock(&cfs_b->lock);
4841
+
4842
+ if (!dequeue)
4843
+ return false; /* Throttle no longer required. */
45944844
45954845 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
45964846
....@@ -4600,15 +4850,22 @@
46004850 rcu_read_unlock();
46014851
46024852 task_delta = cfs_rq->h_nr_running;
4853
+ idle_task_delta = cfs_rq->idle_h_nr_running;
46034854 for_each_sched_entity(se) {
46044855 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
46054856 /* throttled entity or throttle-on-deactivate */
46064857 if (!se->on_rq)
46074858 break;
46084859
4609
- if (dequeue)
4860
+ if (dequeue) {
46104861 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4862
+ } else {
4863
+ update_load_avg(qcfs_rq, se, 0);
4864
+ se_update_runnable(se);
4865
+ }
4866
+
46114867 qcfs_rq->h_nr_running -= task_delta;
4868
+ qcfs_rq->idle_h_nr_running -= idle_task_delta;
46124869
46134870 if (qcfs_rq->load.weight)
46144871 dequeue = 0;
....@@ -4617,29 +4874,13 @@
46174874 if (!se)
46184875 sub_nr_running(rq, task_delta);
46194876
4877
+ /*
4878
+ * Note: distribution will already see us throttled via the
4879
+ * throttled-list. rq->lock protects completion.
4880
+ */
46204881 cfs_rq->throttled = 1;
46214882 cfs_rq->throttled_clock = rq_clock(rq);
4622
- raw_spin_lock(&cfs_b->lock);
4623
- empty = list_empty(&cfs_b->throttled_cfs_rq);
4624
-
4625
- /*
4626
- * Add to the _head_ of the list, so that an already-started
4627
- * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
4628
- * not running add to the tail so that later runqueues don't get starved.
4629
- */
4630
- if (cfs_b->distribute_running)
4631
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4632
- else
4633
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4634
-
4635
- /*
4636
- * If we're the first throttled task, make sure the bandwidth
4637
- * timer is running.
4638
- */
4639
- if (empty)
4640
- start_cfs_bandwidth(cfs_b);
4641
-
4642
- raw_spin_unlock(&cfs_b->lock);
4883
+ return true;
46434884 }
46444885
46454886 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
....@@ -4647,8 +4888,7 @@
46474888 struct rq *rq = rq_of(cfs_rq);
46484889 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
46494890 struct sched_entity *se;
4650
- int enqueue = 1;
4651
- long task_delta;
4891
+ long task_delta, idle_task_delta;
46524892
46534893 se = cfs_rq->tg->se[cpu_of(rq)];
46544894
....@@ -4668,34 +4908,70 @@
46684908 return;
46694909
46704910 task_delta = cfs_rq->h_nr_running;
4911
+ idle_task_delta = cfs_rq->idle_h_nr_running;
46714912 for_each_sched_entity(se) {
46724913 if (se->on_rq)
4673
- enqueue = 0;
4674
-
4914
+ break;
46754915 cfs_rq = cfs_rq_of(se);
4676
- if (enqueue)
4677
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4678
- cfs_rq->h_nr_running += task_delta;
4916
+ enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
46794917
4918
+ cfs_rq->h_nr_running += task_delta;
4919
+ cfs_rq->idle_h_nr_running += idle_task_delta;
4920
+
4921
+ /* end evaluation on encountering a throttled cfs_rq */
46804922 if (cfs_rq_throttled(cfs_rq))
4923
+ goto unthrottle_throttle;
4924
+ }
4925
+
4926
+ for_each_sched_entity(se) {
4927
+ cfs_rq = cfs_rq_of(se);
4928
+
4929
+ update_load_avg(cfs_rq, se, UPDATE_TG);
4930
+ se_update_runnable(se);
4931
+
4932
+ cfs_rq->h_nr_running += task_delta;
4933
+ cfs_rq->idle_h_nr_running += idle_task_delta;
4934
+
4935
+
4936
+ /* end evaluation on encountering a throttled cfs_rq */
4937
+ if (cfs_rq_throttled(cfs_rq))
4938
+ goto unthrottle_throttle;
4939
+
4940
+ /*
4941
+ * One parent has been throttled and cfs_rq removed from the
4942
+ * list. Add it back to not break the leaf list.
4943
+ */
4944
+ if (throttled_hierarchy(cfs_rq))
4945
+ list_add_leaf_cfs_rq(cfs_rq);
4946
+ }
4947
+
4948
+ /* At this point se is NULL and we are at root level */
4949
+ add_nr_running(rq, task_delta);
4950
+
4951
+unthrottle_throttle:
4952
+ /*
4953
+ * The cfs_rq_throttled() breaks in the above iteration can result in
4954
+ * incomplete leaf list maintenance, which can then trigger the
4955
+ * assertion below.
4956
+ */
4957
+ for_each_sched_entity(se) {
4958
+ cfs_rq = cfs_rq_of(se);
4959
+
4960
+ if (list_add_leaf_cfs_rq(cfs_rq))
46814961 break;
46824962 }
46834963
46844964 assert_list_leaf_cfs_rq(rq);
4685
-
4686
- if (!se)
4687
- add_nr_running(rq, task_delta);
46884965
46894966 /* Determine whether we need to wake up potentially idle CPU: */
46904967 if (rq->curr == rq->idle && rq->cfs.nr_running)
46914968 resched_curr(rq);
46924969 }
46934970
4694
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
4971
+static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
46954972 {
46964973 struct cfs_rq *cfs_rq;
4697
- u64 runtime;
4698
- u64 starting_runtime = remaining;
4974
+ u64 runtime, remaining = 1;
46994975
47004976 rcu_read_lock();
47014977 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
....@@ -4703,17 +4979,20 @@
47034979 struct rq *rq = rq_of(cfs_rq);
47044980 struct rq_flags rf;
47054981
4706
- rq_lock(rq, &rf);
4982
+ rq_lock_irqsave(rq, &rf);
47074983 if (!cfs_rq_throttled(cfs_rq))
47084984 goto next;
47094985
47104986 /* By the above check, this should never be true */
47114987 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
47124988
4989
+ raw_spin_lock(&cfs_b->lock);
47134990 runtime = -cfs_rq->runtime_remaining + 1;
4714
- if (runtime > remaining)
4715
- runtime = remaining;
4716
- remaining -= runtime;
4991
+ if (runtime > cfs_b->runtime)
4992
+ runtime = cfs_b->runtime;
4993
+ cfs_b->runtime -= runtime;
4994
+ remaining = cfs_b->runtime;
4995
+ raw_spin_unlock(&cfs_b->lock);
47174996
47184997 cfs_rq->runtime_remaining += runtime;
47194998
....@@ -4722,14 +5001,12 @@
47225001 unthrottle_cfs_rq(cfs_rq);
47235002
47245003 next:
4725
- rq_unlock(rq, &rf);
5004
+ rq_unlock_irqrestore(rq, &rf);
47265005
47275006 if (!remaining)
47285007 break;
47295008 }
47305009 rcu_read_unlock();
4731
-
4732
- return starting_runtime - remaining;
47335010 }
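The reworked distribute_cfs_runtime() above no longer hands out a pre-grabbed batch: under cfs_b->lock it tops each throttled cfs_rq up to exactly +1ns of runtime_remaining, capped by whatever is left in the global pool, and stops once the pool is empty. A hedged numeric walk-through of that accounting; all of the values below are invented:

#include <stdio.h>

int main(void)
{
	/* Invented example: global pool (ns) and three throttled runqueues
	 * whose runtime_remaining has gone negative. */
	long long cfs_b_runtime = 5000;
	long long runtime_remaining[3] = { -1500, -2500, -4000 };
	int i;

	for (i = 0; i < 3 && cfs_b_runtime > 0; i++) {
		/* Just enough to reach +1ns, as in the loop above. */
		long long runtime = -runtime_remaining[i] + 1;

		if (runtime > cfs_b_runtime)
			runtime = cfs_b_runtime;
		cfs_b_runtime -= runtime;
		runtime_remaining[i] += runtime;

		/* Only a cfs_rq that ends up > 0 would be unthrottled. */
		printf("rq%d gets %lld, runtime_remaining=%lld, pool=%lld\n",
		       i, runtime, runtime_remaining[i], cfs_b_runtime);
	}
	return 0;
}

With these numbers the first two runqueues reach +1ns and would be unthrottled; the third drains the pool, stays negative, and waits for the next period, which is exactly why the caller re-checks the throttled list while bandwidth remains.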
47345011
47355012 /*
....@@ -4738,9 +5015,8 @@
47385015 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
47395016 * used to track this state.
47405017 */
4741
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
5018
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
47425019 {
4743
- u64 runtime;
47445020 int throttled;
47455021
47465022 /* no need to continue the timer with no bandwidth constraint */
....@@ -4769,24 +5045,15 @@
47695045 cfs_b->nr_throttled += overrun;
47705046
47715047 /*
4772
- * This check is repeated as we are holding onto the new bandwidth while
4773
- * we unthrottle. This can potentially race with an unthrottled group
4774
- * trying to acquire new bandwidth from the global pool. This can result
4775
- * in us over-using our runtime if it is all used during this loop, but
4776
- * only by limited amounts in that extreme case.
5048
+ * This check is repeated as we release cfs_b->lock while we unthrottle.
47775049 */
4778
- while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4779
- runtime = cfs_b->runtime;
4780
- cfs_b->distribute_running = 1;
4781
- raw_spin_unlock(&cfs_b->lock);
5050
+ while (throttled && cfs_b->runtime > 0) {
5051
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
47825052 /* we can't nest cfs_b->lock while distributing bandwidth */
4783
- runtime = distribute_cfs_runtime(cfs_b, runtime);
4784
- raw_spin_lock(&cfs_b->lock);
5053
+ distribute_cfs_runtime(cfs_b);
5054
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
47855055
4786
- cfs_b->distribute_running = 0;
47875056 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4788
-
4789
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
47905057 }
47915058
47925059 /*
....@@ -4842,6 +5109,11 @@
48425109 if (runtime_refresh_within(cfs_b, min_left))
48435110 return;
48445111
5112
+ /* don't push forwards an existing deferred unthrottle */
5113
+ if (cfs_b->slack_started)
5114
+ return;
5115
+ cfs_b->slack_started = true;
5116
+
48455117 hrtimer_start(&cfs_b->slack_timer,
48465118 ns_to_ktime(cfs_bandwidth_slack_period),
48475119 HRTIMER_MODE_REL);
....@@ -4889,42 +5161,35 @@
48895161 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
48905162 {
48915163 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
5164
+ unsigned long flags;
48925165
48935166 /* confirm we're still not at a refresh boundary */
4894
- raw_spin_lock(&cfs_b->lock);
4895
- if (cfs_b->distribute_running) {
4896
- raw_spin_unlock(&cfs_b->lock);
4897
- return;
4898
- }
5167
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
5168
+ cfs_b->slack_started = false;
48995169
49005170 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4901
- raw_spin_unlock(&cfs_b->lock);
5171
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49025172 return;
49035173 }
49045174
49055175 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
49065176 runtime = cfs_b->runtime;
49075177
4908
- if (runtime)
4909
- cfs_b->distribute_running = 1;
4910
-
4911
- raw_spin_unlock(&cfs_b->lock);
5178
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49125179
49135180 if (!runtime)
49145181 return;
49155182
4916
- runtime = distribute_cfs_runtime(cfs_b, runtime);
5183
+ distribute_cfs_runtime(cfs_b);
49175184
4918
- raw_spin_lock(&cfs_b->lock);
4919
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
4920
- cfs_b->distribute_running = 0;
4921
- raw_spin_unlock(&cfs_b->lock);
5185
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
5186
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49225187 }
49235188
49245189 /*
49255190 * When a group wakes up we want to make sure that its quota is not already
49265191 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4927
- * runtime as update_curr() throttling can not not trigger until it's on-rq.
5192
+ * runtime as update_curr() throttling can not trigger until it's on-rq.
49285193 */
49295194 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
49305195 {
....@@ -4959,7 +5224,7 @@
49595224 pcfs_rq = tg->parent->cfs_rq[cpu];
49605225
49615226 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4962
- cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
5227
+ cfs_rq->throttled_clock_pelt = rq_clock_task_mult(cpu_rq(cpu));
49635228 }
49645229
49655230 /* conditionally throttle active cfs_rq's from put_prev_entity() */
....@@ -4978,8 +5243,7 @@
49785243 if (cfs_rq_throttled(cfs_rq))
49795244 return true;
49805245
4981
- throttle_cfs_rq(cfs_rq);
4982
- return true;
5246
+ return throttle_cfs_rq(cfs_rq);
49835247 }
49845248
49855249 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
....@@ -4998,15 +5262,18 @@
49985262 {
49995263 struct cfs_bandwidth *cfs_b =
50005264 container_of(timer, struct cfs_bandwidth, period_timer);
5265
+ unsigned long flags;
50015266 int overrun;
50025267 int idle = 0;
50035268 int count = 0;
50045269
5005
- raw_spin_lock(&cfs_b->lock);
5270
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
50065271 for (;;) {
50075272 overrun = hrtimer_forward_now(timer, cfs_b->period);
50085273 if (!overrun)
50095274 break;
5275
+
5276
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
50105277
50115278 if (++count > 3) {
50125279 u64 new, old = ktime_to_ns(cfs_b->period);
....@@ -5037,12 +5304,10 @@
50375304 /* reset count so we don't come right back in here */
50385305 count = 0;
50395306 }
5040
-
5041
- idle = do_sched_cfs_period_timer(cfs_b, overrun);
50425307 }
50435308 if (idle)
50445309 cfs_b->period_active = 0;
5045
- raw_spin_unlock(&cfs_b->lock);
5310
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
50465311
50475312 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
50485313 }
....@@ -5059,7 +5324,7 @@
50595324 cfs_b->period_timer.function = sched_cfs_period_timer;
50605325 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
50615326 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5062
- cfs_b->distribute_running = 0;
5327
+ cfs_b->slack_started = false;
50635328 }
50645329
50655330 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
....@@ -5154,11 +5419,6 @@
51545419 return false;
51555420 }
51565421
5157
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5158
-{
5159
- return rq_clock_task(rq_of(cfs_rq));
5160
-}
5161
-
51625422 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
51635423 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
51645424 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
....@@ -5216,7 +5476,7 @@
52165476
52175477 if (delta < 0) {
52185478 if (rq->curr == p)
5219
- resched_curr(rq);
5479
+ resched_curr_lazy(rq);
52205480 return;
52215481 }
52225482 hrtick_start(rq, delta);
....@@ -5251,22 +5511,41 @@
52515511
52525512 #ifdef CONFIG_SMP
52535513 static inline unsigned long cpu_util(int cpu);
5254
-static unsigned long capacity_of(int cpu);
52555514
52565515 static inline bool cpu_overutilized(int cpu)
52575516 {
5258
- return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
5517
+ int overutilized = -1;
5518
+
5519
+ trace_android_rvh_cpu_overutilized(cpu, &overutilized);
5520
+ if (overutilized != -1)
5521
+ return overutilized;
5522
+
5523
+ return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
52595524 }
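Unless the vendor hook overrides the decision, the check above marks a CPU overutilized once its utilization no longer fits its capacity with headroom. A minimal sketch under the assumption that fits_capacity() keeps roughly 20% slack (the 1280/1024 ratio below is taken from the old capacity_margin comparison removed in this same patch and should be treated as an assumption):

#include <stdbool.h>
#include <stdio.h>

/* Assumed ~20% headroom check: util must stay below ~80% of capacity. */
static bool fits_capacity(unsigned long util, unsigned long capacity)
{
	return util * 1280 < capacity * 1024;
}

int main(void)
{
	unsigned long capacity = 1024;	/* a big core at full capacity */

	printf("util=700 overutilized? %d\n", !fits_capacity(700, capacity)); /* 0 */
	printf("util=900 overutilized? %d\n", !fits_capacity(900, capacity)); /* 1 */
	return 0;
}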
52605525
52615526 static inline void update_overutilized_status(struct rq *rq)
52625527 {
52635528 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
52645529 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5265
- trace_sched_overutilized(1);
5530
+ trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
52665531 }
52675532 }
52685533 #else
52695534 static inline void update_overutilized_status(struct rq *rq) { }
5535
+#endif
5536
+
5537
+/* Runqueue only has SCHED_IDLE tasks enqueued */
5538
+static int sched_idle_rq(struct rq *rq)
5539
+{
5540
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5541
+ rq->nr_running);
5542
+}
5543
+
5544
+#ifdef CONFIG_SMP
5545
+static int sched_idle_cpu(int cpu)
5546
+{
5547
+ return sched_idle_rq(cpu_rq(cpu));
5548
+}
52705549 #endif
52715550
52725551 /*
....@@ -5279,12 +5558,9 @@
52795558 {
52805559 struct cfs_rq *cfs_rq;
52815560 struct sched_entity *se = &p->se;
5561
+ int idle_h_nr_running = task_has_idle_policy(p);
52825562 int task_new = !(flags & ENQUEUE_WAKEUP);
5283
-
5284
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
5285
- if (sysctl_sched_performance_bias)
5286
- cpufreq_task_boost(rq->cpu, task_util_est(p));
5287
-#endif
5563
+ int should_iowait_boost;
52885564
52895565 /*
52905566 * The code below (indirectly) updates schedutil which looks at
....@@ -5295,29 +5571,13 @@
52955571 util_est_enqueue(&rq->cfs, p);
52965572
52975573 /*
5298
- * The code below (indirectly) updates schedutil which looks at
5299
- * the cfs_rq utilization to select a frequency.
5300
- * Let's update schedtune here to ensure the boost value of the
5301
- * current task is accounted for in the selection of the OPP.
5302
- *
5303
- * We do it also in the case where we enqueue a throttled task;
5304
- * we could argue that a throttled task should not boost a CPU,
5305
- * however:
5306
- * a) properly implementing CPU boosting considering throttled
5307
- * tasks will increase a lot the complexity of the solution
5308
- * b) it's not easy to quantify the benefits introduced by
5309
- * such a more complex solution.
5310
- * Thus, for the time being we go for the simple solution and boost
5311
- * also for throttled RQs.
5312
- */
5313
- schedtune_enqueue_task(p, cpu_of(rq));
5314
-
5315
- /*
53165574 * If in_iowait is set, the code below may not trigger any cpufreq
53175575 * utilization updates, so do it here explicitly with the IOWAIT flag
53185576 * passed.
53195577 */
5320
- if (p->in_iowait)
5578
+ should_iowait_boost = p->in_iowait;
5579
+ trace_android_rvh_set_iowait(p, &should_iowait_boost);
5580
+ if (should_iowait_boost)
53215581 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
53225582
53235583 for_each_sched_entity(se) {
....@@ -5326,51 +5586,60 @@
53265586 cfs_rq = cfs_rq_of(se);
53275587 enqueue_entity(cfs_rq, se, flags);
53285588
5329
- /*
5330
- * end evaluation on encountering a throttled cfs_rq
5331
- *
5332
- * note: in the case of encountering a throttled cfs_rq we will
5333
- * post the final h_nr_running increment below.
5334
- */
5335
- if (cfs_rq_throttled(cfs_rq))
5336
- break;
53375589 cfs_rq->h_nr_running++;
5590
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
5591
+
5592
+ /* end evaluation on encountering a throttled cfs_rq */
5593
+ if (cfs_rq_throttled(cfs_rq))
5594
+ goto enqueue_throttle;
53385595
53395596 flags = ENQUEUE_WAKEUP;
53405597 }
53415598
5599
+ trace_android_rvh_enqueue_task_fair(rq, p, flags);
53425600 for_each_sched_entity(se) {
53435601 cfs_rq = cfs_rq_of(se);
5344
- cfs_rq->h_nr_running++;
5345
-
5346
- if (cfs_rq_throttled(cfs_rq))
5347
- break;
53485602
53495603 update_load_avg(cfs_rq, se, UPDATE_TG);
5604
+ se_update_runnable(se);
53505605 update_cfs_group(se);
5606
+
5607
+ cfs_rq->h_nr_running++;
5608
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
5609
+
5610
+ /* end evaluation on encountering a throttled cfs_rq */
5611
+ if (cfs_rq_throttled(cfs_rq))
5612
+ goto enqueue_throttle;
5613
+
5614
+ /*
5615
+ * One parent has been throttled and cfs_rq removed from the
5616
+ * list. Add it back to not break the leaf list.
5617
+ */
5618
+ if (throttled_hierarchy(cfs_rq))
5619
+ list_add_leaf_cfs_rq(cfs_rq);
53515620 }
53525621
5353
- if (!se) {
5354
- add_nr_running(rq, 1);
5355
- /*
5356
- * Since new tasks are assigned an initial util_avg equal to
5357
- * half of the spare capacity of their CPU, tiny tasks have the
5358
- * ability to cross the overutilized threshold, which will
5359
- * result in the load balancer ruining all the task placement
5360
- * done by EAS. As a way to mitigate that effect, do not account
5361
- * for the first enqueue operation of new tasks during the
5362
- * overutilized flag detection.
5363
- *
5364
- * A better way of solving this problem would be to wait for
5365
- * the PELT signals of tasks to converge before taking them
5366
- * into account, but that is not straightforward to implement,
5367
- * and the following generally works well enough in practice.
5368
- */
5369
- if (!task_new)
5370
- update_overutilized_status(rq);
5622
+ /* At this point se is NULL and we are at root level */
5623
+ add_nr_running(rq, 1);
53715624
5372
- }
5625
+ /*
5626
+ * Since new tasks are assigned an initial util_avg equal to
5627
+ * half of the spare capacity of their CPU, tiny tasks have the
5628
+ * ability to cross the overutilized threshold, which will
5629
+ * result in the load balancer ruining all the task placement
5630
+ * done by EAS. As a way to mitigate that effect, do not account
5631
+ * for the first enqueue operation of new tasks during the
5632
+ * overutilized flag detection.
5633
+ *
5634
+ * A better way of solving this problem would be to wait for
5635
+ * the PELT signals of tasks to converge before taking them
5636
+ * into account, but that is not straightforward to implement,
5637
+ * and the following generally works well enough in practice.
5638
+ */
5639
+ if (!task_new)
5640
+ update_overutilized_status(rq);
53735641
5642
+enqueue_throttle:
53745643 if (cfs_bandwidth_used()) {
53755644 /*
53765645 * When bandwidth control is enabled; the cfs_rq_throttled()
....@@ -5403,28 +5672,21 @@
54035672 struct cfs_rq *cfs_rq;
54045673 struct sched_entity *se = &p->se;
54055674 int task_sleep = flags & DEQUEUE_SLEEP;
5675
+ int idle_h_nr_running = task_has_idle_policy(p);
5676
+ bool was_sched_idle = sched_idle_rq(rq);
54065677
5407
- /*
5408
- * The code below (indirectly) updates schedutil which looks at
5409
- * the cfs_rq utilization to select a frequency.
5410
- * Let's update schedtune here to ensure the boost value of the
5411
- * current task is not more accounted for in the selection of the OPP.
5412
- */
5413
- schedtune_dequeue_task(p, cpu_of(rq));
5678
+ util_est_dequeue(&rq->cfs, p);
54145679
54155680 for_each_sched_entity(se) {
54165681 cfs_rq = cfs_rq_of(se);
54175682 dequeue_entity(cfs_rq, se, flags);
54185683
5419
- /*
5420
- * end evaluation on encountering a throttled cfs_rq
5421
- *
5422
- * note: in the case of encountering a throttled cfs_rq we will
5423
- * post the final h_nr_running decrement below.
5424
- */
5425
- if (cfs_rq_throttled(cfs_rq))
5426
- break;
54275684 cfs_rq->h_nr_running--;
5685
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5686
+
5687
+ /* end evaluation on encountering a throttled cfs_rq */
5688
+ if (cfs_rq_throttled(cfs_rq))
5689
+ goto dequeue_throttle;
54285690
54295691 /* Don't dequeue parent if it has other entities besides us */
54305692 if (cfs_rq->load.weight) {
....@@ -5441,21 +5703,32 @@
54415703 flags |= DEQUEUE_SLEEP;
54425704 }
54435705
5706
+ trace_android_rvh_dequeue_task_fair(rq, p, flags);
54445707 for_each_sched_entity(se) {
54455708 cfs_rq = cfs_rq_of(se);
5446
- cfs_rq->h_nr_running--;
5447
-
5448
- if (cfs_rq_throttled(cfs_rq))
5449
- break;
54505709
54515710 update_load_avg(cfs_rq, se, UPDATE_TG);
5711
+ se_update_runnable(se);
54525712 update_cfs_group(se);
5713
+
5714
+ cfs_rq->h_nr_running--;
5715
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5716
+
5717
+ /* end evaluation on encountering a throttled cfs_rq */
5718
+ if (cfs_rq_throttled(cfs_rq))
5719
+ goto dequeue_throttle;
5720
+
54535721 }
54545722
5455
- if (!se)
5456
- sub_nr_running(rq, 1);
5723
+ /* At this point se is NULL and we are at root level */
5724
+ sub_nr_running(rq, 1);
54575725
5458
- util_est_dequeue(&rq->cfs, p, task_sleep);
5726
+ /* balance early to pull high priority tasks */
5727
+ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
5728
+ rq->next_balance = jiffies;
5729
+
5730
+dequeue_throttle:
5731
+ util_est_update(&rq->cfs, p, task_sleep);
54595732 hrtick_update(rq);
54605733 }
54615734
....@@ -5466,71 +5739,6 @@
54665739 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
54675740
54685741 #ifdef CONFIG_NO_HZ_COMMON
5469
-/*
5470
- * per rq 'load' arrray crap; XXX kill this.
5471
- */
5472
-
5473
-/*
5474
- * The exact cpuload calculated at every tick would be:
5475
- *
5476
- * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5477
- *
5478
- * If a CPU misses updates for n ticks (as it was idle) and update gets
5479
- * called on the n+1-th tick when CPU may be busy, then we have:
5480
- *
5481
- * load_n = (1 - 1/2^i)^n * load_0
5482
- * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
5483
- *
5484
- * decay_load_missed() below does efficient calculation of
5485
- *
5486
- * load' = (1 - 1/2^i)^n * load
5487
- *
5488
- * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5489
- * This allows us to precompute the above in said factors, thereby allowing the
5490
- * reduction of an arbitrary n in O(log_2 n) steps. (See also
5491
- * fixed_power_int())
5492
- *
5493
- * The calculation is approximated on a 128 point scale.
5494
- */
5495
-#define DEGRADE_SHIFT 7
5496
-
5497
-static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5498
-static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5499
- { 0, 0, 0, 0, 0, 0, 0, 0 },
5500
- { 64, 32, 8, 0, 0, 0, 0, 0 },
5501
- { 96, 72, 40, 12, 1, 0, 0, 0 },
5502
- { 112, 98, 75, 43, 15, 1, 0, 0 },
5503
- { 120, 112, 98, 76, 45, 16, 2, 0 }
5504
-};
5505
-
5506
-/*
5507
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
5508
- * would be when CPU is idle and so we just decay the old load without
5509
- * adding any new load.
5510
- */
5511
-static unsigned long
5512
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5513
-{
5514
- int j = 0;
5515
-
5516
- if (!missed_updates)
5517
- return load;
5518
-
5519
- if (missed_updates >= degrade_zero_ticks[idx])
5520
- return 0;
5521
-
5522
- if (idx == 1)
5523
- return load >> missed_updates;
5524
-
5525
- while (missed_updates) {
5526
- if (missed_updates % 2)
5527
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5528
-
5529
- missed_updates >>= 1;
5530
- j++;
5531
- }
5532
- return load;
5533
-}
55345742
55355743 static struct {
55365744 cpumask_var_t idle_cpus_mask;
....@@ -5542,249 +5750,68 @@
55425750
55435751 #endif /* CONFIG_NO_HZ_COMMON */
55445752
5545
-/**
5546
- * __cpu_load_update - update the rq->cpu_load[] statistics
5547
- * @this_rq: The rq to update statistics for
5548
- * @this_load: The current load
5549
- * @pending_updates: The number of missed updates
5550
- *
5551
- * Update rq->cpu_load[] statistics. This function is usually called every
5552
- * scheduler tick (TICK_NSEC).
5553
- *
5554
- * This function computes a decaying average:
5555
- *
5556
- * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5557
- *
5558
- * Because of NOHZ it might not get called on every tick which gives need for
5559
- * the @pending_updates argument.
5560
- *
5561
- * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5562
- * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5563
- * = A * (A * load[i]_n-2 + B) + B
5564
- * = A * (A * (A * load[i]_n-3 + B) + B) + B
5565
- * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5566
- * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5567
- * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5568
- * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5569
- *
5570
- * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5571
- * any change in load would have resulted in the tick being turned back on.
5572
- *
5573
- * For regular NOHZ, this reduces to:
5574
- *
5575
- * load[i]_n = (1 - 1/2^i)^n * load[i]_0
5576
- *
5577
- * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
5578
- * term.
5579
- */
5580
-static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5581
- unsigned long pending_updates)
5753
+static unsigned long cpu_load(struct rq *rq)
55825754 {
5583
- unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5584
- int i, scale;
5585
-
5586
- this_rq->nr_load_updates++;
5587
-
5588
- /* Update our load: */
5589
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5590
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5591
- unsigned long old_load, new_load;
5592
-
5593
- /* scale is effectively 1 << i now, and >> i divides by scale */
5594
-
5595
- old_load = this_rq->cpu_load[i];
5596
-#ifdef CONFIG_NO_HZ_COMMON
5597
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
5598
- if (tickless_load) {
5599
- old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5600
- /*
5601
- * old_load can never be a negative value because a
5602
- * decayed tickless_load cannot be greater than the
5603
- * original tickless_load.
5604
- */
5605
- old_load += tickless_load;
5606
- }
5607
-#endif
5608
- new_load = this_load;
5609
- /*
5610
- * Round up the averaging division if load is increasing. This
5611
- * prevents us from getting stuck on 9 if the load is 10, for
5612
- * example.
5613
- */
5614
- if (new_load > old_load)
5615
- new_load += scale - 1;
5616
-
5617
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5618
- }
5619
-}
5620
-
5621
-/* Used instead of source_load when we know the type == 0 */
5622
-static unsigned long weighted_cpuload(struct rq *rq)
5623
-{
5624
- return cfs_rq_runnable_load_avg(&rq->cfs);
5625
-}
5626
-
5627
-#ifdef CONFIG_NO_HZ_COMMON
5628
-/*
5629
- * There is no sane way to deal with nohz on smp when using jiffies because the
5630
- * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5631
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5632
- *
5633
- * Therefore we need to avoid the delta approach from the regular tick when
5634
- * possible since that would seriously skew the load calculation. This is why we
5635
- * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5636
- * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5637
- * loop exit, nohz_idle_balance, nohz full exit...)
5638
- *
5639
- * This means we might still be one tick off for nohz periods.
5640
- */
5641
-
5642
-static void cpu_load_update_nohz(struct rq *this_rq,
5643
- unsigned long curr_jiffies,
5644
- unsigned long load)
5645
-{
5646
- unsigned long pending_updates;
5647
-
5648
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5649
- if (pending_updates) {
5650
- this_rq->last_load_update_tick = curr_jiffies;
5651
- /*
5652
- * In the regular NOHZ case, we were idle, this means load 0.
5653
- * In the NOHZ_FULL case, we were non-idle, we should consider
5654
- * its weighted load.
5655
- */
5656
- cpu_load_update(this_rq, load, pending_updates);
5657
- }
5755
+ return cfs_rq_load_avg(&rq->cfs);
56585756 }
56595757
56605758 /*
5661
- * Called from nohz_idle_balance() to update the load ratings before doing the
5662
- * idle balance.
5663
- */
5664
-static void cpu_load_update_idle(struct rq *this_rq)
5665
-{
5666
- /*
5667
- * bail if there's load or we're actually up-to-date.
5668
- */
5669
- if (weighted_cpuload(this_rq))
5670
- return;
5671
-
5672
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5673
-}
5674
-
5675
-/*
5676
- * Record CPU load on nohz entry so we know the tickless load to account
5677
- * on nohz exit. cpu_load[0] happens then to be updated more frequently
5678
- * than other cpu_load[idx] but it should be fine as cpu_load readers
5679
- * shouldn't rely into synchronized cpu_load[*] updates.
5680
- */
5681
-void cpu_load_update_nohz_start(void)
5682
-{
5683
- struct rq *this_rq = this_rq();
5684
-
5685
- /*
5686
- * This is all lockless but should be fine. If weighted_cpuload changes
5687
- * concurrently we'll exit nohz. And cpu_load write can race with
5688
- * cpu_load_update_idle() but both updater would be writing the same.
5689
- */
5690
- this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5691
-}
5692
-
5693
-/*
5694
- * Account the tickless load in the end of a nohz frame.
5695
- */
5696
-void cpu_load_update_nohz_stop(void)
5697
-{
5698
- unsigned long curr_jiffies = READ_ONCE(jiffies);
5699
- struct rq *this_rq = this_rq();
5700
- unsigned long load;
5701
- struct rq_flags rf;
5702
-
5703
- if (curr_jiffies == this_rq->last_load_update_tick)
5704
- return;
5705
-
5706
- load = weighted_cpuload(this_rq);
5707
- rq_lock(this_rq, &rf);
5708
- update_rq_clock(this_rq);
5709
- cpu_load_update_nohz(this_rq, curr_jiffies, load);
5710
- rq_unlock(this_rq, &rf);
5711
-}
5712
-#else /* !CONFIG_NO_HZ_COMMON */
5713
-static inline void cpu_load_update_nohz(struct rq *this_rq,
5714
- unsigned long curr_jiffies,
5715
- unsigned long load) { }
5716
-#endif /* CONFIG_NO_HZ_COMMON */
5717
-
5718
-static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5719
-{
5720
-#ifdef CONFIG_NO_HZ_COMMON
5721
- /* See the mess around cpu_load_update_nohz(). */
5722
- this_rq->last_load_update_tick = READ_ONCE(jiffies);
5723
-#endif
5724
- cpu_load_update(this_rq, load, 1);
5725
-}
5726
-
5727
-/*
5728
- * Called from scheduler_tick()
5729
- */
5730
-void cpu_load_update_active(struct rq *this_rq)
5731
-{
5732
- unsigned long load = weighted_cpuload(this_rq);
5733
-
5734
- if (tick_nohz_tick_stopped())
5735
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5736
- else
5737
- cpu_load_update_periodic(this_rq, load);
5738
-}
5739
-
5740
-/*
5741
- * Return a low guess at the load of a migration-source CPU weighted
5742
- * according to the scheduling class and "nice" value.
5759
+ * cpu_load_without - compute CPU load without any contributions from *p
5760
+ * @cpu: the CPU whose load is requested
5761
+ * @p: the task whose load should be discounted
57435762 *
5744
- * We want to under-estimate the load of migration sources, to
5745
- * balance conservatively.
5763
+ * The load of a CPU is defined by the load of tasks currently enqueued on that
5764
+ * CPU as well as tasks which are currently sleeping after an execution on that
5765
+ * CPU.
5766
+ *
5767
+ * This method returns the load of the specified CPU by discounting the load of
5768
+ * the specified task, whenever the task is currently contributing to the CPU
5769
+ * load.
57465770 */
5747
-static unsigned long source_load(int cpu, int type)
5771
+static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
57485772 {
5749
- struct rq *rq = cpu_rq(cpu);
5750
- unsigned long total = weighted_cpuload(rq);
5773
+ struct cfs_rq *cfs_rq;
5774
+ unsigned int load;
57515775
5752
- if (type == 0 || !sched_feat(LB_BIAS))
5753
- return total;
5776
+ /* Task has no contribution or is new */
5777
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5778
+ return cpu_load(rq);
57545779
5755
- return min(rq->cpu_load[type-1], total);
5780
+ cfs_rq = &rq->cfs;
5781
+ load = READ_ONCE(cfs_rq->avg.load_avg);
5782
+
5783
+ /* Discount task's load from CPU's load */
5784
+ lsub_positive(&load, task_h_load(p));
5785
+
5786
+ return load;
57565787 }
57575788
5758
-/*
5759
- * Return a high guess at the load of a migration-target CPU weighted
5760
- * according to the scheduling class and "nice" value.
5761
- */
5762
-static unsigned long target_load(int cpu, int type)
5789
+static unsigned long cpu_runnable(struct rq *rq)
57635790 {
5764
- struct rq *rq = cpu_rq(cpu);
5765
- unsigned long total = weighted_cpuload(rq);
5791
+ return cfs_rq_runnable_avg(&rq->cfs);
5792
+}
57665793
5767
- if (type == 0 || !sched_feat(LB_BIAS))
5768
- return total;
5794
+static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
5795
+{
5796
+ struct cfs_rq *cfs_rq;
5797
+ unsigned int runnable;
57695798
5770
- return max(rq->cpu_load[type-1], total);
5799
+ /* Task has no contribution or is new */
5800
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5801
+ return cpu_runnable(rq);
5802
+
5803
+ cfs_rq = &rq->cfs;
5804
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
5805
+
5806
+ /* Discount task's runnable from CPU's runnable */
5807
+ lsub_positive(&runnable, p->se.avg.runnable_avg);
5808
+
5809
+ return runnable;
57715810 }
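Both *_without() helpers above subtract the waking task's own contribution from the runqueue-wide signal, clamping at zero so a stale or larger task signal cannot underflow the unsigned average. A hedged sketch of that pattern; sub_clamped() is only a stand-in for the kernel's lsub_positive(), which operates on a pointer:

#include <stdio.h>

/* Stand-in for the clamping subtraction used above: never go below zero. */
static unsigned int sub_clamped(unsigned int a, unsigned int b)
{
	return a > b ? a - b : 0;
}

int main(void)
{
	unsigned int cpu_load = 300;	/* invented rq-wide load_avg */
	unsigned int task_load = 450;	/* stale task contribution, larger than the rq signal */

	/* Without the clamp this would wrap to a huge unsigned value. */
	printf("load without task = %u\n", sub_clamped(cpu_load, task_load));
	return 0;
}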
57725811
57735812 static unsigned long capacity_of(int cpu)
57745813 {
57755814 return cpu_rq(cpu)->cpu_capacity;
5776
-}
5777
-
5778
-static unsigned long cpu_avg_load_per_task(int cpu)
5779
-{
5780
- struct rq *rq = cpu_rq(cpu);
5781
- unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5782
- unsigned long load_avg = weighted_cpuload(rq);
5783
-
5784
- if (nr_running)
5785
- return load_avg / nr_running;
5786
-
5787
- return 0;
57885815 }
57895816
57905817 static void record_wakee(struct task_struct *p)
....@@ -5821,18 +5848,15 @@
58215848 * whatever is irrelevant, spread criteria is apparent partner count exceeds
58225849 * socket size.
58235850 */
5824
-static int wake_wide(struct task_struct *p, int sibling_count_hint)
5851
+static int wake_wide(struct task_struct *p)
58255852 {
58265853 unsigned int master = current->wakee_flips;
58275854 unsigned int slave = p->wakee_flips;
5828
- int llc_size = this_cpu_read(sd_llc_size);
5829
-
5830
- if (sibling_count_hint >= llc_size)
5831
- return 1;
5855
+ int factor = __this_cpu_read(sd_llc_size);
58325856
58335857 if (master < slave)
58345858 swap(master, slave);
5835
- if (slave < llc_size || master < slave * llc_size)
5859
+ if (slave < factor || master < slave * factor)
58365860 return 0;
58375861 return 1;
58385862 }
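wake_wide() above chooses to spread waker and wakee apart only when the flip counters suggest a 1:N wakeup pattern wider than the LLC: with factor = LLC size, it returns 1 once slave >= factor and master >= slave * factor. A short worked example with invented counters; the factor of 4 is an assumed LLC size, not something read from a topology:

#include <stdio.h>

/* Same decision as wake_wide() above, with the LLC size passed in. */
static int wake_wide(unsigned int master, unsigned int slave, unsigned int factor)
{
	if (master < slave) {
		unsigned int tmp = master;
		master = slave;
		slave = tmp;
	}
	if (slave < factor || master < slave * factor)
		return 0;
	return 1;
}

int main(void)
{
	unsigned int factor = 4;	/* assumed LLC size */

	printf("%d\n", wake_wide(40, 5, factor));	/* 1: wide 1:N pattern, spread */
	printf("%d\n", wake_wide(6, 5, factor));	/* 0: keep waker and wakee close */
	return 0;
}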
....@@ -5880,7 +5904,7 @@
58805904 s64 this_eff_load, prev_eff_load;
58815905 unsigned long task_load;
58825906
5883
- this_eff_load = target_load(this_cpu, sd->wake_idx);
5907
+ this_eff_load = cpu_load(cpu_rq(this_cpu));
58845908
58855909 if (sync) {
58865910 unsigned long current_load = task_h_load(current);
....@@ -5898,7 +5922,7 @@
58985922 this_eff_load *= 100;
58995923 this_eff_load *= capacity_of(prev_cpu);
59005924
5901
- prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5925
+ prev_eff_load = cpu_load(cpu_rq(prev_cpu));
59025926 prev_eff_load -= task_load;
59035927 if (sched_feat(WA_BIAS))
59045928 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
....@@ -5936,242 +5960,8 @@
59365960 return target;
59375961 }
59385962
5939
-#ifdef CONFIG_SCHED_TUNE
5940
-struct reciprocal_value schedtune_spc_rdiv;
5941
-
5942
-static long
5943
-schedtune_margin(unsigned long signal, long boost)
5944
-{
5945
- long long margin = 0;
5946
-
5947
- /*
5948
- * Signal proportional compensation (SPC)
5949
- *
5950
- * The Boost (B) value is used to compute a Margin (M) which is
5951
- * proportional to the complement of the original Signal (S):
5952
- * M = B * (SCHED_CAPACITY_SCALE - S)
5953
- * The obtained M could be used by the caller to "boost" S.
5954
- */
5955
- if (boost >= 0) {
5956
- margin = SCHED_CAPACITY_SCALE - signal;
5957
- margin *= boost;
5958
- } else
5959
- margin = -signal * boost;
5960
-
5961
- margin = reciprocal_divide(margin, schedtune_spc_rdiv);
5962
-
5963
- if (boost < 0)
5964
- margin *= -1;
5965
- return margin;
5966
-}
5967
-
5968
-inline long
5969
-schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p)
5970
-{
5971
- int boost = schedtune_cpu_boost_with(cpu, p);
5972
- long margin;
5973
-
5974
- if (boost == 0)
5975
- margin = 0;
5976
- else
5977
- margin = schedtune_margin(util, boost);
5978
-
5979
- trace_sched_boost_cpu(cpu, util, margin);
5980
-
5981
- return margin;
5982
-}
5983
-
5984
-long schedtune_task_margin(struct task_struct *task)
5985
-{
5986
- int boost = schedtune_task_boost(task);
5987
- unsigned long util;
5988
- long margin;
5989
-
5990
- if (boost == 0)
5991
- return 0;
5992
-
5993
- util = task_util_est(task);
5994
- margin = schedtune_margin(util, boost);
5995
-
5996
- return margin;
5997
-}
5998
-
5999
-#else /* CONFIG_SCHED_TUNE */
6000
-
6001
-inline long
6002
-schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p)
6003
-{
6004
- return 0;
6005
-}
6006
-
6007
-#endif /* CONFIG_SCHED_TUNE */
6008
-
6009
-static unsigned long cpu_util_without(int cpu, struct task_struct *p);
6010
-
6011
-static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
6012
-{
6013
- return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
6014
-}
6015
-
6016
-/*
6017
- * find_idlest_group finds and returns the least busy CPU group within the
6018
- * domain.
6019
- *
6020
- * Assumes p is allowed on at least one CPU in sd.
6021
- */
60225963 static struct sched_group *
6023
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
6024
- int this_cpu, int sd_flag)
6025
-{
6026
- struct sched_group *idlest = NULL, *group = sd->groups;
6027
- struct sched_group *most_spare_sg = NULL;
6028
- unsigned long min_runnable_load = ULONG_MAX;
6029
- unsigned long this_runnable_load = ULONG_MAX;
6030
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
6031
- unsigned long most_spare = 0, this_spare = 0;
6032
- int load_idx = sd->forkexec_idx;
6033
- int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
6034
- unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
6035
- (sd->imbalance_pct-100) / 100;
6036
-
6037
- if (sd_flag & SD_BALANCE_WAKE)
6038
- load_idx = sd->wake_idx;
6039
-
6040
- do {
6041
- unsigned long load, avg_load, runnable_load;
6042
- unsigned long spare_cap, max_spare_cap;
6043
- int local_group;
6044
- int i;
6045
-
6046
- /* Skip over this group if it has no CPUs allowed */
6047
- if (!cpumask_intersects(sched_group_span(group),
6048
- &p->cpus_allowed))
6049
- continue;
6050
-
6051
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
6052
- if (sysctl_sched_performance_bias) {
6053
- if (!task_fits_max(p, group_first_cpu(group)))
6054
- continue;
6055
- }
6056
-#endif
6057
-
6058
- local_group = cpumask_test_cpu(this_cpu,
6059
- sched_group_span(group));
6060
-
6061
- /*
6062
- * Tally up the load of all CPUs in the group and find
6063
- * the group containing the CPU with most spare capacity.
6064
- */
6065
- avg_load = 0;
6066
- runnable_load = 0;
6067
- max_spare_cap = 0;
6068
-
6069
- for_each_cpu(i, sched_group_span(group)) {
6070
- /* Bias balancing toward CPUs of our domain */
6071
- if (local_group)
6072
- load = source_load(i, load_idx);
6073
- else
6074
- load = target_load(i, load_idx);
6075
-
6076
- runnable_load += load;
6077
-
6078
- avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
6079
-
6080
- spare_cap = capacity_spare_without(i, p);
6081
-
6082
- if (spare_cap > max_spare_cap)
6083
- max_spare_cap = spare_cap;
6084
- }
6085
-
6086
- /* Adjust by relative CPU capacity of the group */
6087
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
6088
- group->sgc->capacity;
6089
- runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
6090
- group->sgc->capacity;
6091
-
6092
- if (local_group) {
6093
- this_runnable_load = runnable_load;
6094
- this_avg_load = avg_load;
6095
- this_spare = max_spare_cap;
6096
- } else {
6097
- if (min_runnable_load > (runnable_load + imbalance)) {
6098
- /*
6099
- * The runnable load is significantly smaller
6100
- * so we can pick this new CPU:
6101
- */
6102
- min_runnable_load = runnable_load;
6103
- min_avg_load = avg_load;
6104
- idlest = group;
6105
- } else if ((runnable_load < (min_runnable_load + imbalance)) &&
6106
- (100*min_avg_load > imbalance_scale*avg_load)) {
6107
- /*
6108
- * The runnable loads are close so take the
6109
- * blocked load into account through avg_load:
6110
- */
6111
- min_avg_load = avg_load;
6112
- idlest = group;
6113
- }
6114
-
6115
- if (most_spare < max_spare_cap) {
6116
- most_spare = max_spare_cap;
6117
- most_spare_sg = group;
6118
- }
6119
- }
6120
- } while (group = group->next, group != sd->groups);
6121
-
6122
- /*
6123
- * The cross-over point between using spare capacity or least load
6124
- * is too conservative for high utilization tasks on partially
6125
- * utilized systems if we require spare_capacity > task_util(p),
6126
- * so we allow for some task stuffing by using
6127
- * spare_capacity > task_util(p)/2.
6128
- *
6129
- * Spare capacity can't be used for fork because the utilization has
6130
- * not been set yet, we must first select a rq to compute the initial
6131
- * utilization.
6132
- */
6133
- if (sd_flag & SD_BALANCE_FORK)
6134
- goto skip_spare;
6135
-
6136
- if (this_spare > task_util(p) / 2 &&
6137
- imbalance_scale*this_spare > 100*most_spare)
6138
- return NULL;
6139
-
6140
- if (most_spare > task_util(p) / 2)
6141
- return most_spare_sg;
6142
-
6143
-skip_spare:
6144
- if (!idlest)
6145
- return NULL;
6146
-
6147
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
6148
- if (sysctl_sched_performance_bias) {
6149
- if ((this_runnable_load == ULONG_MAX) || (this_avg_load == ULONG_MAX))
6150
- return idlest;
6151
- }
6152
-#endif
6153
-
6154
- /*
6155
- * When comparing groups across NUMA domains, it's possible for the
6156
- * local domain to be very lightly loaded relative to the remote
6157
- * domains but "imbalance" skews the comparison making remote CPUs
6158
- * look much more favourable. When considering cross-domain, add
6159
- * imbalance to the runnable load on the remote node and consider
6160
- * staying local.
6161
- */
6162
- if ((sd->flags & SD_NUMA) &&
6163
- min_runnable_load + imbalance >= this_runnable_load)
6164
- return NULL;
6165
-
6166
- if (min_runnable_load > (this_runnable_load + imbalance))
6167
- return NULL;
6168
-
6169
- if ((this_runnable_load < (min_runnable_load + imbalance)) &&
6170
- (100*this_avg_load < imbalance_scale*min_avg_load))
6171
- return NULL;
6172
-
6173
- return idlest;
6174
-}
5964
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
61755965
61765966 /*
61775967 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
....@@ -6191,7 +5981,10 @@
61915981 return cpumask_first(sched_group_span(group));
61925982
61935983 /* Traverse only the allowed CPUs */
6194
- for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
5984
+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5985
+ if (sched_idle_cpu(i))
5986
+ return i;
5987
+
61955988 if (available_idle_cpu(i)) {
61965989 struct rq *rq = cpu_rq(i);
61975990 struct cpuidle_state *idle = idle_get_state(rq);
....@@ -6215,7 +6008,7 @@
62156008 shallowest_idle_cpu = i;
62166009 }
62176010 } else if (shallowest_idle_cpu == -1) {
6218
- load = weighted_cpuload(cpu_rq(i));
6011
+ load = cpu_load(cpu_rq(i));
62196012 if (load < min_load) {
62206013 min_load = load;
62216014 least_loaded_cpu = i;
....@@ -6231,11 +6024,11 @@
62316024 {
62326025 int new_cpu = cpu;
62336026
6234
- if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
6027
+ if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
62356028 return prev_cpu;
62366029
62376030 /*
6238
- * We need task's util for capacity_spare_without, sync it up to
6031
+ * We need task's util for cpu_util_without, sync it up to
62396032 * prev_cpu's last_update_time.
62406033 */
62416034 if (!(sd_flag & SD_BALANCE_FORK))
....@@ -6251,7 +6044,7 @@
62516044 continue;
62526045 }
62536046
6254
- group = find_idlest_group(sd, p, cpu, sd_flag);
6047
+ group = find_idlest_group(sd, p, cpu);
62556048 if (!group) {
62566049 sd = sd->child;
62576050 continue;
....@@ -6348,16 +6141,18 @@
63486141 if (!test_idle_cores(target, false))
63496142 return -1;
63506143
6351
- cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
6144
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
63526145
63536146 for_each_cpu_wrap(core, cpus, target) {
63546147 bool idle = true;
63556148
63566149 for_each_cpu(cpu, cpu_smt_mask(core)) {
6357
- cpumask_clear_cpu(cpu, cpus);
6358
- if (!available_idle_cpu(cpu))
6150
+ if (!available_idle_cpu(cpu)) {
63596151 idle = false;
6152
+ break;
6153
+ }
63606154 }
6155
+ cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
63616156
63626157 if (idle)
63636158 return core;
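
The rewritten inner loop in select_idle_core() stops at the first busy sibling and then removes the whole SMT core from the candidate set with a single cpumask_andnot(), instead of clearing siblings one at a time. A bitmask sketch of that "drop the whole core at once" step, using a plain unsigned long in place of struct cpumask:

#include <stdio.h>

/* Remove an entire SMT sibling group from the candidate set in one step. */
static unsigned long drop_core(unsigned long candidates, unsigned long smt_mask)
{
	return candidates & ~smt_mask;		/* cpumask_andnot() analogue */
}

int main(void)
{
	unsigned long candidates = 0xffUL;	/* CPUs 0-7 still to scan */
	unsigned long core0_smt  = 0x03UL;	/* CPUs 0 and 1 share a core */

	/* One busy sibling disqualifies the core: both CPUs drop out. */
	candidates = drop_core(candidates, core0_smt);
	printf("remaining candidates: 0x%lx\n", candidates);	/* 0xfc */
	return 0;
}
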
....@@ -6382,9 +6177,10 @@
63826177 return -1;
63836178
63846179 for_each_cpu(cpu, cpu_smt_mask(target)) {
6385
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
6180
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
6181
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
63866182 continue;
6387
- if (available_idle_cpu(cpu))
6183
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
63886184 return cpu;
63896185 }
63906186
....@@ -6415,8 +6211,8 @@
64156211 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
64166212 struct sched_domain *this_sd;
64176213 u64 avg_cost, avg_idle;
6418
- u64 time, cost;
6419
- s64 delta;
6214
+ u64 time;
6215
+ int this = smp_processor_id();
64206216 int cpu, nr = INT_MAX;
64216217
64226218 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
....@@ -6441,23 +6237,63 @@
64416237 nr = 4;
64426238 }
64436239
6444
- time = local_clock();
6240
+ time = cpu_clock(this);
64456241
6446
- cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
6242
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
64476243
64486244 for_each_cpu_wrap(cpu, cpus, target) {
64496245 if (!--nr)
64506246 return -1;
6451
- if (available_idle_cpu(cpu))
6247
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
64526248 break;
64536249 }
64546250
6455
- time = local_clock() - time;
6456
- cost = this_sd->avg_scan_cost;
6457
- delta = (s64)(time - cost) / 8;
6458
- this_sd->avg_scan_cost += delta;
6251
+ time = cpu_clock(this) - time;
6252
+ update_avg(&this_sd->avg_scan_cost, time);
64596253
64606254 return cpu;
6255
+}
6256
+
6257
+/*
6258
+ * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
6259
+ * the task fits. If no CPU is big enough, but there are idle ones, try to
6260
+ * maximize capacity.
6261
+ */
6262
+static int
6263
+select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
6264
+{
6265
+ unsigned long task_util, best_cap = 0;
6266
+ int cpu, best_cpu = -1;
6267
+ struct cpumask *cpus;
6268
+
6269
+ cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6270
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6271
+
6272
+ task_util = uclamp_task_util(p);
6273
+
6274
+ for_each_cpu_wrap(cpu, cpus, target) {
6275
+ unsigned long cpu_cap = capacity_of(cpu);
6276
+
6277
+ if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
6278
+ continue;
6279
+ if (fits_capacity(task_util, cpu_cap))
6280
+ return cpu;
6281
+
6282
+ if (cpu_cap > best_cap) {
6283
+ best_cap = cpu_cap;
6284
+ best_cpu = cpu;
6285
+ }
6286
+ }
6287
+
6288
+ return best_cpu;
6289
+}
6290
+
6291
+static inline bool asym_fits_capacity(int task_util, int cpu)
6292
+{
6293
+ if (static_branch_unlikely(&sched_asym_cpucapacity))
6294
+ return fits_capacity(task_util, capacity_of(cpu));
6295
+
6296
+ return true;
64616297 }
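
asym_fits_capacity() only filters candidates when the static key for asymmetric CPU capacities is enabled; otherwise it returns true and the idle checks alone decide. The underlying fit test compares the task's clamped utilization against the CPU's capacity with roughly 20% headroom. A hedged sketch of that margin check, assuming the conventional 1280/1024 ratio rather than quoting the kernel macro:

#include <stdio.h>
#include <stdbool.h>

/*
 * "util fits cap" with ~20% headroom: util must stay below ~80% of the
 * CPU's capacity (assumed ratio: util * 1280 < cap * 1024).
 */
static bool fits_capacity_sketch(unsigned long util, unsigned long cap)
{
	return util * 1280 < cap * 1024;
}

static bool asym_fits_capacity_sketch(unsigned long task_util,
				      unsigned long cpu_cap, bool asym)
{
	if (asym)
		return fits_capacity_sketch(task_util, cpu_cap);
	/* Symmetric system: every CPU has the same capacity, no filter. */
	return true;
}

int main(void)
{
	/* util 300 on a little CPU of capacity 400 still leaves headroom. */
	printf("%d\n", asym_fits_capacity_sketch(300, 400, true));	/* 1 */
	/* util 350 no longer leaves ~20% headroom on that CPU. */
	printf("%d\n", asym_fits_capacity_sketch(350, 400, true));	/* 0 */
	printf("%d\n", asym_fits_capacity_sketch(350, 400, false));	/* 1 */
	return 0;
}
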
64626298
64636299 /*
....@@ -6466,24 +6302,54 @@
64666302 static int select_idle_sibling(struct task_struct *p, int prev, int target)
64676303 {
64686304 struct sched_domain *sd;
6305
+ unsigned long task_util;
64696306 int i, recent_used_cpu;
64706307
6471
- if (available_idle_cpu(target))
6308
+ /*
6309
+ * On asymmetric system, update task utilization because we will check
6310
+ * that the task fits with cpu's capacity.
6311
+ */
6312
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6313
+ sync_entity_load_avg(&p->se);
6314
+ task_util = uclamp_task_util(p);
6315
+ }
6316
+
6317
+ if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
6318
+ asym_fits_capacity(task_util, target))
64726319 return target;
64736320
64746321 /*
64756322 * If the previous CPU is cache affine and idle, don't be stupid:
64766323 */
6477
- if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
6324
+ if (prev != target && cpus_share_cache(prev, target) &&
6325
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
6326
+ asym_fits_capacity(task_util, prev))
64786327 return prev;
6328
+
6329
+ /*
6330
+ * Allow a per-cpu kthread to stack with the wakee if the
6331
+ * kworker thread and the task's previous CPU are the same.
6332
+ * The assumption is that the wakee queued work for the
6333
+ * per-cpu kthread that is now complete and the wakeup is
6334
+ * essentially a sync wakeup. An obvious example of this
6335
+ * pattern is IO completions.
6336
+ */
6337
+ if (is_per_cpu_kthread(current) &&
6338
+ in_task() &&
6339
+ prev == smp_processor_id() &&
6340
+ this_rq()->nr_running <= 1 &&
6341
+ asym_fits_capacity(task_util, prev)) {
6342
+ return prev;
6343
+ }
64796344
64806345 /* Check a recently used CPU as a potential idle candidate: */
64816346 recent_used_cpu = p->recent_used_cpu;
64826347 if (recent_used_cpu != prev &&
64836348 recent_used_cpu != target &&
64846349 cpus_share_cache(recent_used_cpu, target) &&
6485
- available_idle_cpu(recent_used_cpu) &&
6486
- cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
6350
+ (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
6351
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
6352
+ asym_fits_capacity(task_util, recent_used_cpu)) {
64876353 /*
64886354 * Replace recent_used_cpu with prev as it is a potential
64896355 * candidate for the next wake:
....@@ -6492,6 +6358,32 @@
64926358 return recent_used_cpu;
64936359 }
64946360
6361
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6362
+ if (rockchip_perf_get_level() == ROCKCHIP_PERFORMANCE_HIGH)
6363
+ goto sd_llc;
6364
+ }
6365
+
6366
+ /*
6367
+ * For asymmetric CPU capacity systems, our domain of interest is
6368
+ * sd_asym_cpucapacity rather than sd_llc.
6369
+ */
6370
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6371
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
6372
+ /*
6373
+ * On an asymmetric CPU capacity system where an exclusive
6374
+ * cpuset defines a symmetric island (i.e. one unique
6375
+ * capacity_orig value through the cpuset), the key will be set
6376
+ * but the CPUs within that cpuset will not have a domain with
6377
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
6378
+ * capacity path.
6379
+ */
6380
+ if (sd) {
6381
+ i = select_idle_capacity(p, sd, target);
6382
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
6383
+ }
6384
+ }
6385
+
6386
+sd_llc:
64956387 sd = rcu_dereference(per_cpu(sd_llc, target));
64966388 if (!sd)
64976389 return target;
....@@ -6589,7 +6481,7 @@
65896481 util = READ_ONCE(cfs_rq->avg.util_avg);
65906482
65916483 /* Discount task's util from CPU's util */
6592
- util -= min_t(unsigned int, util, task_util(p));
6484
+ lsub_positive(&util, task_util(p));
65936485
65946486 /*
65956487 * Covered cases:
....@@ -6638,10 +6530,9 @@
66386530 * properly fix the execl regression and it helps in further
66396531 * reducing the chances for the above race.
66406532 */
6641
- if (unlikely(task_on_rq_queued(p) || current == p)) {
6642
- estimated -= min_t(unsigned int, estimated,
6643
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
6644
- }
6533
+ if (unlikely(task_on_rq_queued(p) || current == p))
6534
+ lsub_positive(&estimated, _task_util_est(p));
6535
+
66456536 util = max(util, estimated);
66466537 }
66476538
....@@ -6651,350 +6542,6 @@
66516542 * the cpu_util call.
66526543 */
66536544 return min_t(unsigned long, util, capacity_orig_of(cpu));
6654
-}
6655
-
6656
-/*
6657
- * Returns the current capacity of cpu after applying both
6658
- * cpu and freq scaling.
6659
- */
6660
-unsigned long capacity_curr_of(int cpu)
6661
-{
6662
- unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig;
6663
- unsigned long scale_freq = arch_scale_freq_capacity(cpu);
6664
-
6665
- return cap_scale(max_cap, scale_freq);
6666
-}
6667
-
6668
-static void find_best_target(struct sched_domain *sd, cpumask_t *cpus,
6669
- struct task_struct *p)
6670
-{
6671
- unsigned long min_util = uclamp_task(p);
6672
- unsigned long target_capacity = ULONG_MAX;
6673
- unsigned long min_wake_util = ULONG_MAX;
6674
- unsigned long target_max_spare_cap = 0;
6675
- unsigned long target_util = ULONG_MAX;
6676
- /* Initialise with deepest possible cstate (INT_MAX) */
6677
- int shallowest_idle_cstate = INT_MAX;
6678
- struct sched_group *sg;
6679
- int best_active_cpu = -1;
6680
- int best_idle_cpu = -1;
6681
- int target_cpu = -1;
6682
- int backup_cpu = -1;
6683
- bool prefer_idle;
6684
- bool boosted;
6685
- int i;
6686
-
6687
- /*
6688
- * In most cases, target_capacity tracks capacity_orig of the most
6689
- * energy efficient CPU candidate, thus requiring to minimise
6690
- * target_capacity. For these cases target_capacity is already
6691
- * initialized to ULONG_MAX.
6692
- * However, for prefer_idle and boosted tasks we look for a high
6693
- * performance CPU, thus requiring to maximise target_capacity. In this
6694
- * case we initialise target_capacity to 0.
6695
- */
6696
- prefer_idle = uclamp_latency_sensitive(p);
6697
- boosted = uclamp_boosted(p);
6698
- if (prefer_idle && boosted)
6699
- target_capacity = 0;
6700
-
6701
- /* Scan CPUs in all SDs */
6702
- sg = sd->groups;
6703
- do {
6704
- for_each_cpu_and(i, &p->cpus_allowed, sched_group_span(sg)) {
6705
- unsigned long capacity_curr = capacity_curr_of(i);
6706
- unsigned long capacity_orig = capacity_orig_of(i);
6707
- unsigned long wake_util, new_util;
6708
- long spare_cap;
6709
- int idle_idx = INT_MAX;
6710
-
6711
- if (!cpu_online(i))
6712
- continue;
6713
-
6714
- /*
6715
- * p's blocked utilization is still accounted for on prev_cpu
6716
- * so prev_cpu will receive a negative bias due to the double
6717
- * accounting. However, the blocked utilization may be zero.
6718
- */
6719
- wake_util = cpu_util_without(i, p);
6720
- new_util = wake_util + task_util_est(p);
6721
-
6722
- /*
6723
- * Ensure minimum capacity to grant the required boost.
6724
- * The target CPU can be already at a capacity level higher
6725
- * than the one required to boost the task.
6726
- */
6727
- new_util = max(min_util, new_util);
6728
- if (new_util > capacity_orig)
6729
- continue;
6730
-
6731
- /*
6732
- * Pre-compute the maximum possible capacity we expect
6733
- * to have available on this CPU once the task is
6734
- * enqueued here.
6735
- */
6736
- spare_cap = capacity_orig - new_util;
6737
-
6738
- if (idle_cpu(i))
6739
- idle_idx = idle_get_state_idx(cpu_rq(i));
6740
-
6741
-
6742
- /*
6743
- * Case A) Latency sensitive tasks
6744
- *
6745
- * Unconditionally favoring tasks that prefer idle CPU to
6746
- * improve latency.
6747
- *
6748
- * Looking for:
6749
- * - an idle CPU, whatever its idle_state is, since
6750
- * the first CPUs we explore are more likely to be
6751
- * reserved for latency sensitive tasks.
6752
- * - a non idle CPU where the task fits in its current
6753
- * capacity and has the maximum spare capacity.
6754
- * - a non idle CPU with lower contention from other
6755
- * tasks and running at the lowest possible OPP.
6756
- *
6757
- * The last two goals tries to favor a non idle CPU
6758
- * where the task can run as if it is "almost alone".
6759
- * A maximum spare capacity CPU is favoured since
6760
- * the task already fits into that CPU's capacity
6761
- * without waiting for an OPP chance.
6762
- *
6763
- * The following code path is the only one in the CPUs
6764
- * exploration loop which is always used by
6765
- * prefer_idle tasks. It exits the loop with wither a
6766
- * best_active_cpu or a target_cpu which should
6767
- * represent an optimal choice for latency sensitive
6768
- * tasks.
6769
- */
6770
- if (prefer_idle) {
6771
-
6772
- /*
6773
- * Case A.1: IDLE CPU
6774
- * Return the best IDLE CPU we find:
6775
- * - for boosted tasks: the CPU with the highest
6776
- * performance (i.e. biggest capacity_orig)
6777
- * - for !boosted tasks: the most energy
6778
- * efficient CPU (i.e. smallest capacity_orig)
6779
- */
6780
- if (idle_cpu(i)) {
6781
- if (boosted &&
6782
- capacity_orig < target_capacity)
6783
- continue;
6784
- if (!boosted &&
6785
- capacity_orig > target_capacity)
6786
- continue;
6787
- /*
6788
- * Minimise value of idle state: skip
6789
- * deeper idle states and pick the
6790
- * shallowest.
6791
- */
6792
- if (capacity_orig == target_capacity &&
6793
- sysctl_sched_cstate_aware &&
6794
- idle_idx >= shallowest_idle_cstate)
6795
- continue;
6796
-
6797
- target_capacity = capacity_orig;
6798
- shallowest_idle_cstate = idle_idx;
6799
- best_idle_cpu = i;
6800
- continue;
6801
- }
6802
- if (best_idle_cpu != -1)
6803
- continue;
6804
-
6805
- /*
6806
- * Case A.2: Target ACTIVE CPU
6807
- * Favor CPUs with max spare capacity.
6808
- */
6809
- if (capacity_curr > new_util &&
6810
- spare_cap > target_max_spare_cap) {
6811
- target_max_spare_cap = spare_cap;
6812
- target_cpu = i;
6813
- continue;
6814
- }
6815
- if (target_cpu != -1)
6816
- continue;
6817
-
6818
-
6819
- /*
6820
- * Case A.3: Backup ACTIVE CPU
6821
- * Favor CPUs with:
6822
- * - lower utilization due to other tasks
6823
- * - lower utilization with the task in
6824
- */
6825
- if (wake_util > min_wake_util)
6826
- continue;
6827
- min_wake_util = wake_util;
6828
- best_active_cpu = i;
6829
- continue;
6830
- }
6831
-
6832
- /*
6833
- * Enforce EAS mode
6834
- *
6835
- * For non latency sensitive tasks, skip CPUs that
6836
- * will be overutilized by moving the task there.
6837
- *
6838
- * The goal here is to remain in EAS mode as long as
6839
- * possible at least for !prefer_idle tasks.
6840
- */
6841
- if ((new_util * capacity_margin) >
6842
- (capacity_orig * SCHED_CAPACITY_SCALE))
6843
- continue;
6844
-
6845
- /*
6846
- * Favor CPUs with smaller capacity for non latency
6847
- * sensitive tasks.
6848
- */
6849
- if (capacity_orig > target_capacity)
6850
- continue;
6851
-
6852
- /*
6853
- * Case B) Non latency sensitive tasks on IDLE CPUs.
6854
- *
6855
- * Find an optimal backup IDLE CPU for non latency
6856
- * sensitive tasks.
6857
- *
6858
- * Looking for:
6859
- * - minimizing the capacity_orig,
6860
- * i.e. preferring LITTLE CPUs
6861
- * - favoring shallowest idle states
6862
- * i.e. avoid to wakeup deep-idle CPUs
6863
- *
6864
- * The following code path is used by non latency
6865
- * sensitive tasks if IDLE CPUs are available. If at
6866
- * least one of such CPUs are available it sets the
6867
- * best_idle_cpu to the most suitable idle CPU to be
6868
- * selected.
6869
- *
6870
- * If idle CPUs are available, favour these CPUs to
6871
- * improve performances by spreading tasks.
6872
- * Indeed, the energy_diff() computed by the caller
6873
- * will take care to ensure the minimization of energy
6874
- * consumptions without affecting performance.
6875
- */
6876
- if (idle_cpu(i)) {
6877
- /*
6878
- * Skip CPUs in deeper idle state, but only
6879
- * if they are also less energy efficient.
6880
- * IOW, prefer a deep IDLE LITTLE CPU vs a
6881
- * shallow idle big CPU.
6882
- */
6883
- if (capacity_orig == target_capacity &&
6884
- sysctl_sched_cstate_aware &&
6885
- idle_idx >= shallowest_idle_cstate)
6886
- continue;
6887
-
6888
- target_capacity = capacity_orig;
6889
- shallowest_idle_cstate = idle_idx;
6890
- best_idle_cpu = i;
6891
- continue;
6892
- }
6893
-
6894
- /*
6895
- * Case C) Non latency sensitive tasks on ACTIVE CPUs.
6896
- *
6897
- * Pack tasks in the most energy efficient capacities.
6898
- *
6899
- * This task packing strategy prefers more energy
6900
- * efficient CPUs (i.e. pack on smaller maximum
6901
- * capacity CPUs) while also trying to spread tasks to
6902
- * run them all at the lower OPP.
6903
- *
6904
- * This assumes for example that it's more energy
6905
- * efficient to run two tasks on two CPUs at a lower
6906
- * OPP than packing both on a single CPU but running
6907
- * that CPU at an higher OPP.
6908
- *
6909
- * Thus, this case keep track of the CPU with the
6910
- * smallest maximum capacity and highest spare maximum
6911
- * capacity.
6912
- */
6913
-
6914
- /* Favor CPUs with maximum spare capacity */
6915
- if (capacity_orig == target_capacity &&
6916
- spare_cap < target_max_spare_cap)
6917
- continue;
6918
-
6919
- target_max_spare_cap = spare_cap;
6920
- target_capacity = capacity_orig;
6921
- target_util = new_util;
6922
- target_cpu = i;
6923
- }
6924
-
6925
- } while (sg = sg->next, sg != sd->groups);
6926
-
6927
- /*
6928
- * For non latency sensitive tasks, cases B and C in the previous loop,
6929
- * we pick the best IDLE CPU only if we was not able to find a target
6930
- * ACTIVE CPU.
6931
- *
6932
- * Policies priorities:
6933
- *
6934
- * - prefer_idle tasks:
6935
- *
6936
- * a) IDLE CPU available: best_idle_cpu
6937
- * b) ACTIVE CPU where task fits and has the bigger maximum spare
6938
- * capacity (i.e. target_cpu)
6939
- * c) ACTIVE CPU with less contention due to other tasks
6940
- * (i.e. best_active_cpu)
6941
- *
6942
- * - NON prefer_idle tasks:
6943
- *
6944
- * a) ACTIVE CPU: target_cpu
6945
- * b) IDLE CPU: best_idle_cpu
6946
- */
6947
-
6948
- if (prefer_idle && (best_idle_cpu != -1)) {
6949
- target_cpu = best_idle_cpu;
6950
- goto target;
6951
- }
6952
-
6953
- if (target_cpu == -1)
6954
- target_cpu = prefer_idle
6955
- ? best_active_cpu
6956
- : best_idle_cpu;
6957
- else
6958
- backup_cpu = prefer_idle
6959
- ? best_active_cpu
6960
- : best_idle_cpu;
6961
-
6962
- if (backup_cpu >= 0)
6963
- cpumask_set_cpu(backup_cpu, cpus);
6964
- if (target_cpu >= 0) {
6965
-target:
6966
- cpumask_set_cpu(target_cpu, cpus);
6967
- }
6968
-
6969
- trace_sched_find_best_target(p, prefer_idle, min_util, best_idle_cpu,
6970
- best_active_cpu, target_cpu, backup_cpu);
6971
-}
6972
-
6973
-/*
6974
- * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
6975
- * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
6976
- *
6977
- * In that case WAKE_AFFINE doesn't make sense and we'll let
6978
- * BALANCE_WAKE sort things out.
6979
- */
6980
-static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6981
-{
6982
- long min_cap, max_cap;
6983
-
6984
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
6985
- return 0;
6986
-
6987
- min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6988
- max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
6989
-
6990
- /* Minimum capacity is close to max, no need to abort wake_affine */
6991
- if (max_cap - min_cap < max_cap >> 3)
6992
- return 0;
6993
-
6994
- /* Bring task utilization in sync with prev_cpu */
6995
- sync_entity_load_avg(&p->se);
6996
-
6997
- return !task_fits_capacity(p, min_cap);
69986545 }
69996546
70006547 /*
....@@ -7036,154 +6583,61 @@
70366583 }
70376584
70386585 /*
7039
- * compute_energy(): Estimates the energy that would be consumed if @p was
6586
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
70406587 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
7041
- * landscape of the * CPUs after the task migration, and uses the Energy Model
6588
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
70426589 * to compute what would be the energy if we decided to actually migrate that
70436590 * task.
70446591 */
70456592 static long
70466593 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
70476594 {
7048
- unsigned int max_util, util_cfs, cpu_util, cpu_cap;
7049
- unsigned long sum_util, energy = 0;
7050
- struct task_struct *tsk;
6595
+ struct cpumask *pd_mask = perf_domain_span(pd);
6596
+ unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6597
+ unsigned long max_util = 0, sum_util = 0;
6598
+ unsigned long energy = 0;
70516599 int cpu;
70526600
7053
- for (; pd; pd = pd->next) {
7054
- struct cpumask *pd_mask = perf_domain_span(pd);
6601
+ /*
6602
+ * The capacity state of CPUs of the current rd can be driven by CPUs
6603
+ * of another rd if they belong to the same pd. So, account for the
6604
+ * utilization of these CPUs too by masking pd with cpu_online_mask
6605
+ * instead of the rd span.
6606
+ *
6607
+ * If an entire pd is outside of the current rd, it will not appear in
6608
+ * its pd list and will not be accounted by compute_energy().
6609
+ */
6610
+ for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6611
+ unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
6612
+ struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
70556613
70566614 /*
7057
- * The energy model mandates all the CPUs of a performance
7058
- * domain have the same capacity.
6615
+ * Busy time computation: utilization clamping is not
6616
+ * required since the ratio (sum_util / cpu_capacity)
6617
+ * is already enough to scale the EM reported power
6618
+ * consumption at the (eventually clamped) cpu_capacity.
70596619 */
7060
- cpu_cap = arch_scale_cpu_capacity(NULL, cpumask_first(pd_mask));
7061
- max_util = sum_util = 0;
6620
+ sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6621
+ ENERGY_UTIL, NULL);
70626622
70636623 /*
7064
- * The capacity state of CPUs of the current rd can be driven by
7065
- * CPUs of another rd if they belong to the same performance
7066
- * domain. So, account for the utilization of these CPUs too
7067
- * by masking pd with cpu_online_mask instead of the rd span.
7068
- *
7069
- * If an entire performance domain is outside of the current rd,
7070
- * it will not appear in its pd list and will not be accounted
7071
- * by compute_energy().
6624
+ * Performance domain frequency: utilization clamping
6625
+ * must be considered since it affects the selection
6626
+ * of the performance domain frequency.
6627
+ * NOTE: in case RT tasks are running, by default the
6628
+ * FREQUENCY_UTIL's utilization can be max OPP.
70726629 */
7073
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
7074
- util_cfs = cpu_util_next(cpu, p, dst_cpu);
7075
-
7076
- /*
7077
- * Busy time computation: utilization clamping is not
7078
- * required since the ratio (sum_util / cpu_capacity)
7079
- * is already enough to scale the EM reported power
7080
- * consumption at the (eventually clamped) cpu_capacity.
7081
- */
7082
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
7083
- ENERGY_UTIL, NULL);
7084
-
7085
- /*
7086
- * Performance domain frequency: utilization clamping
7087
- * must be considered since it affects the selection
7088
- * of the performance domain frequency.
7089
- * NOTE: in case RT tasks are running, by default the
7090
- * FREQUENCY_UTIL's utilization can be max OPP.
7091
- */
7092
- tsk = cpu == dst_cpu ? p : NULL;
7093
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
7094
- FREQUENCY_UTIL, tsk);
7095
- max_util = max(max_util, cpu_util);
7096
- }
7097
-
7098
- energy += em_pd_energy(pd->em_pd, max_util, sum_util);
6630
+ cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6631
+ FREQUENCY_UTIL, tsk);
6632
+ max_util = max(max_util, cpu_util);
70996633 }
6634
+
6635
+ trace_android_vh_em_cpu_energy(pd->em_pd, max_util, sum_util, &energy);
6636
+ if (!energy)
6637
+ energy = em_cpu_energy(pd->em_pd, max_util, sum_util);
71006638
71016639 return energy;
71026640 }
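
compute_energy() now works on a single performance domain: for every online CPU in the pd it accumulates the post-migration busy time into sum_util and tracks the highest clamp-aware utilization in max_util, which determines the frequency the whole domain must run at; both are then handed to em_cpu_energy() (or the vendor hook). A toy illustration of that two-signal aggregation, using a made-up cost table in place of the real Energy Model:

#include <stdio.h>

#define NR_PD_CPUS	4
#define CPU_CAP		1024UL

/* Made-up per-OPP cost table standing in for the Energy Model. */
static unsigned long toy_cost_for(unsigned long max_util)
{
	if (max_util <= 256)
		return 100;	/* low OPP */
	if (max_util <= 512)
		return 250;	/* mid OPP */
	return 600;		/* high OPP */
}

/*
 * Toy pd energy: the highest utilization picks the shared OPP (and its
 * cost), the summed utilization scales how long the domain stays busy.
 */
static unsigned long toy_pd_energy(const unsigned long util[], int n)
{
	unsigned long max_util = 0, sum_util = 0;

	for (int i = 0; i < n; i++) {
		sum_util += util[i];
		if (util[i] > max_util)
			max_util = util[i];
	}
	return toy_cost_for(max_util) * sum_util / CPU_CAP;
}

int main(void)
{
	unsigned long without[NR_PD_CPUS] = { 100, 150, 200, 250 };
	unsigned long with[NR_PD_CPUS]    = { 100, 150, 200, 550 };	/* task on CPU 3 */

	printf("pd energy without task: %lu\n", toy_pd_energy(without, NR_PD_CPUS));
	printf("pd energy with task:    %lu\n", toy_pd_energy(with, NR_PD_CPUS));
	return 0;
}
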
7103
-
7104
-static void select_cpu_candidates(struct sched_domain *sd, cpumask_t *cpus,
7105
- struct perf_domain *pd, struct task_struct *p, int prev_cpu)
7106
-{
7107
- int highest_spare_cap_cpu = prev_cpu, best_idle_cpu = -1;
7108
- unsigned long spare_cap, max_spare_cap, util, cpu_cap;
7109
- bool prefer_idle = uclamp_latency_sensitive(p);
7110
- bool boosted = uclamp_boosted(p);
7111
- unsigned long target_cap = boosted ? 0 : ULONG_MAX;
7112
- unsigned long highest_spare_cap = 0;
7113
- unsigned int min_exit_lat = UINT_MAX;
7114
- int cpu, max_spare_cap_cpu;
7115
- struct cpuidle_state *idle;
7116
-
7117
- for (; pd; pd = pd->next) {
7118
- max_spare_cap_cpu = -1;
7119
- max_spare_cap = 0;
7120
-
7121
- for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
7122
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
7123
- continue;
7124
-
7125
- util = cpu_util_next(cpu, p, cpu);
7126
- cpu_cap = capacity_of(cpu);
7127
- spare_cap = cpu_cap - util;
7128
-
7129
- /*
7130
- * Skip CPUs that cannot satisfy the capacity request.
7131
- * IOW, placing the task there would make the CPU
7132
- * overutilized. Take uclamp into account to see how
7133
- * much capacity we can get out of the CPU; this is
7134
- * aligned with schedutil_cpu_util().
7135
- */
7136
- util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
7137
- if (cpu_cap * 1024 < util * capacity_margin)
7138
- continue;
7139
-
7140
- /*
7141
- * Find the CPU with the maximum spare capacity in
7142
- * the performance domain
7143
- */
7144
- if (spare_cap > max_spare_cap) {
7145
- max_spare_cap = spare_cap;
7146
- max_spare_cap_cpu = cpu;
7147
- }
7148
-
7149
- if (!prefer_idle)
7150
- continue;
7151
-
7152
- if (idle_cpu(cpu)) {
7153
- cpu_cap = capacity_orig_of(cpu);
7154
- if (boosted && cpu_cap < target_cap)
7155
- continue;
7156
- if (!boosted && cpu_cap > target_cap)
7157
- continue;
7158
- idle = idle_get_state(cpu_rq(cpu));
7159
- if (idle && idle->exit_latency > min_exit_lat &&
7160
- cpu_cap == target_cap)
7161
- continue;
7162
-
7163
- if (idle)
7164
- min_exit_lat = idle->exit_latency;
7165
- target_cap = cpu_cap;
7166
- best_idle_cpu = cpu;
7167
- } else if (spare_cap > highest_spare_cap) {
7168
- highest_spare_cap = spare_cap;
7169
- highest_spare_cap_cpu = cpu;
7170
- }
7171
- }
7172
-
7173
- if (!prefer_idle && max_spare_cap_cpu >= 0)
7174
- cpumask_set_cpu(max_spare_cap_cpu, cpus);
7175
- }
7176
-
7177
- if (!prefer_idle)
7178
- return;
7179
-
7180
- if (best_idle_cpu >= 0)
7181
- cpumask_set_cpu(best_idle_cpu, cpus);
7182
- else
7183
- cpumask_set_cpu(highest_spare_cap_cpu, cpus);
7184
-}
7185
-
7186
-static DEFINE_PER_CPU(cpumask_t, energy_cpus);
71876641
71886642 /*
71896643 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
....@@ -7224,27 +6678,39 @@
72246678 * other use-cases too. So, until someone finds a better way to solve this,
72256679 * let's keep things simple by re-using the existing slow path.
72266680 */
7227
-
72286681 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync)
72296682 {
7230
- unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
6683
+ unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6684
+ unsigned long best_delta2 = ULONG_MAX;
72316685 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
7232
- int weight, cpu, best_energy_cpu = prev_cpu;
7233
- unsigned long cur_energy;
7234
- struct perf_domain *pd;
6686
+ int max_spare_cap_cpu_ls = prev_cpu, best_idle_cpu = -1;
6687
+ unsigned long max_spare_cap_ls = 0, target_cap;
6688
+ unsigned long cpu_cap, util, base_energy = 0;
6689
+ bool boosted, latency_sensitive = false;
6690
+ unsigned int min_exit_lat = UINT_MAX;
6691
+ int cpu, best_energy_cpu = prev_cpu;
6692
+ struct cpuidle_state *idle;
72356693 struct sched_domain *sd;
7236
- cpumask_t *candidates;
6694
+ struct perf_domain *pd;
6695
+ int new_cpu = INT_MAX;
72376696
7238
- if (sysctl_sched_sync_hint_enable && sync) {
7239
- cpu = smp_processor_id();
7240
- if (cpumask_test_cpu(cpu, &p->cpus_allowed))
7241
- return cpu;
7242
- }
6697
+ sync_entity_load_avg(&p->se);
6698
+ trace_android_rvh_find_energy_efficient_cpu(p, prev_cpu, sync, &new_cpu);
6699
+ if (new_cpu != INT_MAX)
6700
+ return new_cpu;
72436701
72446702 rcu_read_lock();
72456703 pd = rcu_dereference(rd->pd);
72466704 if (!pd || READ_ONCE(rd->overutilized))
72476705 goto fail;
6706
+
6707
+ cpu = smp_processor_id();
6708
+ if (sync && cpu_rq(cpu)->nr_running == 1 &&
6709
+ cpumask_test_cpu(cpu, p->cpus_ptr) &&
6710
+ task_fits_capacity(p, capacity_of(cpu))) {
6711
+ rcu_read_unlock();
6712
+ return cpu;
6713
+ }
72486714
72496715 /*
72506716 * Energy-aware wake-up happens on the lowest sched_domain starting
....@@ -7256,59 +6722,149 @@
72566722 if (!sd)
72576723 goto fail;
72586724
7259
- sync_entity_load_avg(&p->se);
72606725 if (!task_util_est(p))
72616726 goto unlock;
72626727
7263
- /* Pre-select a set of candidate CPUs. */
7264
- candidates = this_cpu_ptr(&energy_cpus);
7265
- cpumask_clear(candidates);
6728
+ latency_sensitive = uclamp_latency_sensitive(p);
6729
+ boosted = uclamp_boosted(p);
6730
+ target_cap = boosted ? 0 : ULONG_MAX;
72666731
7267
- if (sched_feat(FIND_BEST_TARGET))
7268
- find_best_target(sd, candidates, p);
7269
- else
7270
- select_cpu_candidates(sd, candidates, pd, p, prev_cpu);
6732
+ for (; pd; pd = pd->next) {
6733
+ unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6734
+ unsigned long base_energy_pd;
6735
+ int max_spare_cap_cpu = -1;
72716736
7272
- /* Bail out if no candidate was found. */
7273
- weight = cpumask_weight(candidates);
7274
- if (!weight)
7275
- goto unlock;
6737
+ /* Compute the 'base' energy of the pd, without @p */
6738
+ base_energy_pd = compute_energy(p, -1, pd);
6739
+ base_energy += base_energy_pd;
72766740
7277
- /* If there is only one sensible candidate, select it now. */
7278
- cpu = cpumask_first(candidates);
7279
- if (weight == 1 && ((uclamp_latency_sensitive(p) && idle_cpu(cpu)) ||
7280
- (cpu == prev_cpu))) {
7281
- best_energy_cpu = cpu;
7282
- goto unlock;
7283
- }
6741
+ for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6742
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6743
+ continue;
72846744
7285
- if (cpumask_test_cpu(prev_cpu, &p->cpus_allowed))
7286
- prev_energy = best_energy = compute_energy(p, prev_cpu, pd);
7287
- else
7288
- prev_energy = best_energy = ULONG_MAX;
6745
+ util = cpu_util_next(cpu, p, cpu);
6746
+ cpu_cap = capacity_of(cpu);
6747
+ spare_cap = cpu_cap;
6748
+ lsub_positive(&spare_cap, util);
72896749
7290
- /* Select the best candidate energy-wise. */
7291
- for_each_cpu(cpu, candidates) {
7292
- if (cpu == prev_cpu)
7293
- continue;
7294
- cur_energy = compute_energy(p, cpu, pd);
7295
- if (cur_energy < best_energy) {
7296
- best_energy = cur_energy;
7297
- best_energy_cpu = cpu;
6750
+ /*
6751
+ * Skip CPUs that cannot satisfy the capacity request.
6752
+ * IOW, placing the task there would make the CPU
6753
+ * overutilized. Take uclamp into account to see how
6754
+ * much capacity we can get out of the CPU; this is
6755
+ * aligned with schedutil_cpu_util().
6756
+ */
6757
+ util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
6758
+ if (!fits_capacity(util, cpu_cap))
6759
+ continue;
6760
+
6761
+ /* Always use prev_cpu as a candidate. */
6762
+ if (!latency_sensitive && cpu == prev_cpu) {
6763
+ prev_delta = compute_energy(p, prev_cpu, pd);
6764
+ prev_delta -= base_energy_pd;
6765
+ best_delta = min(best_delta, prev_delta);
6766
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6767
+ if (prev_delta == best_delta)
6768
+ best_energy_cpu = prev_cpu;
6769
+ }
6770
+ }
6771
+
6772
+ /*
6773
+ * Find the CPU with the maximum spare capacity in
6774
+ * the performance domain
6775
+ */
6776
+ if (spare_cap > max_spare_cap) {
6777
+ max_spare_cap = spare_cap;
6778
+ max_spare_cap_cpu = cpu;
6779
+ }
6780
+
6781
+ if (!IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6782
+ if (!latency_sensitive)
6783
+ continue;
6784
+ }
6785
+
6786
+ if (idle_cpu(cpu)) {
6787
+ cpu_cap = capacity_orig_of(cpu);
6788
+ if (boosted && cpu_cap < target_cap)
6789
+ continue;
6790
+ if (!boosted && cpu_cap > target_cap)
6791
+ continue;
6792
+ idle = idle_get_state(cpu_rq(cpu));
6793
+ if (idle && idle->exit_latency > min_exit_lat &&
6794
+ cpu_cap == target_cap)
6795
+ continue;
6796
+
6797
+ if (idle)
6798
+ min_exit_lat = idle->exit_latency;
6799
+ target_cap = cpu_cap;
6800
+ best_idle_cpu = cpu;
6801
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6802
+ best_delta2 = compute_energy(p, cpu, pd);
6803
+ best_delta2 -= base_energy_pd;
6804
+ }
6805
+ } else if (spare_cap > max_spare_cap_ls) {
6806
+ max_spare_cap_ls = spare_cap;
6807
+ max_spare_cap_cpu_ls = cpu;
6808
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6809
+ if (best_idle_cpu == -1) {
6810
+ best_delta2 = compute_energy(p, cpu, pd);
6811
+ best_delta2 -= base_energy_pd;
6812
+ }
6813
+ }
6814
+ }
6815
+ }
6816
+
6817
+ /* Evaluate the energy impact of using this CPU. */
6818
+ if (!latency_sensitive && max_spare_cap_cpu >= 0 &&
6819
+ max_spare_cap_cpu != prev_cpu) {
6820
+ cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
6821
+ cur_delta -= base_energy_pd;
6822
+ if (cur_delta < best_delta) {
6823
+ best_delta = cur_delta;
6824
+ best_energy_cpu = max_spare_cap_cpu;
6825
+ }
72986826 }
72996827 }
73006828 unlock:
73016829 rcu_read_unlock();
73026830
6831
+ if (latency_sensitive)
6832
+ return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
6833
+
73036834 /*
73046835 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
73056836 * least 6% of the energy used by prev_cpu.
73066837 */
7307
- if (prev_energy == ULONG_MAX)
6838
+ if (prev_delta == ULONG_MAX)
73086839 return best_energy_cpu;
73096840
7310
- if ((prev_energy - best_energy) > (prev_energy >> 4))
6841
+ if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
73116842 return best_energy_cpu;
6843
+
6844
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6845
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
6846
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
6847
+ int level = rockchip_perf_get_level();
6848
+
6849
+ /*
6850
+ * when select ROCKCHIP_PERFORMANCE_LOW:
6851
+ * Pick best_energy_cpu if prev_cpu is big cpu and best_energy_cpu
6852
+ * is little cpu, so that tasks can migrate from big cpu to little
6853
+ * cpu easier to save power.
6854
+ */
6855
+ if ((level == ROCKCHIP_PERFORMANCE_LOW) && cpul_mask &&
6856
+ cpub_mask && cpumask_test_cpu(prev_cpu, cpub_mask) &&
6857
+ cpumask_test_cpu(best_energy_cpu, cpul_mask)) {
6858
+ return best_energy_cpu;
6859
+ }
6860
+
6861
+ /*
6862
+ * Pick the idlest CPU if the power increase is small (<3.1%).
6863
+ */
6864
+ if ((best_delta2 <= prev_delta) ||
6865
+ ((best_delta2 - prev_delta) < ((prev_delta + base_energy) >> 5)))
6866
+ return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
6867
+ }
73126868
73136869 return prev_cpu;
73146870
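
The final decision keeps prev_cpu unless the best candidate saves enough energy: the generic check requires (prev_delta - best_delta) > (prev_delta + base_energy) >> 4, i.e. at least ~6.25% of the estimated total, while the Rockchip branch accepts the idle candidate at a looser >> 5 (~3.1%) threshold. A small sketch of that shift-based percentage test:

#include <stdio.h>
#include <stdbool.h>

/* True when switching away from prev saves at least total / 2^shift energy. */
static bool saves_enough(unsigned long prev_delta, unsigned long best_delta,
			 unsigned long base_energy, unsigned int shift)
{
	if (best_delta >= prev_delta)
		return false;
	return (prev_delta - best_delta) > ((prev_delta + base_energy) >> shift);
}

int main(void)
{
	unsigned long base_energy = 1000, prev_delta = 200;

	/* shift 4: threshold is (1200 >> 4) = 75 units, about 6.25%. */
	printf("%d\n", saves_enough(prev_delta, 120, base_energy, 4));	/* 80 > 75: 1 */
	printf("%d\n", saves_enough(prev_delta, 130, base_energy, 4));	/* 70 > 75: 0 */
	/* shift 5: threshold drops to 37 units, about 3.1%. */
	printf("%d\n", saves_enough(prev_delta, 130, base_energy, 5));	/* 70 > 37: 1 */
	return 0;
}
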
....@@ -7331,39 +6887,44 @@
73316887 * preempt must be disabled.
73326888 */
73336889 static int
7334
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
7335
- int sibling_count_hint)
6890
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
73366891 {
73376892 struct sched_domain *tmp, *sd = NULL;
73386893 int cpu = smp_processor_id();
73396894 int new_cpu = prev_cpu;
73406895 int want_affine = 0;
73416896 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6897
+ int target_cpu = -1;
6898
+
6899
+ if (trace_android_rvh_select_task_rq_fair_enabled() &&
6900
+ !(sd_flag & SD_BALANCE_FORK))
6901
+ sync_entity_load_avg(&p->se);
6902
+ trace_android_rvh_select_task_rq_fair(p, prev_cpu, sd_flag,
6903
+ wake_flags, &target_cpu);
6904
+ if (target_cpu >= 0)
6905
+ return target_cpu;
73426906
73436907 if (sd_flag & SD_BALANCE_WAKE) {
73446908 record_wakee(p);
73456909
7346
- if (static_branch_unlikely(&sched_energy_present)) {
7347
- if (uclamp_latency_sensitive(p) && !sched_feat(EAS_PREFER_IDLE) && !sync)
7348
- goto sd_loop;
6910
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6911
+ if (rockchip_perf_get_level() == ROCKCHIP_PERFORMANCE_HIGH)
6912
+ goto no_eas;
6913
+ }
73496914
6915
+ if (sched_energy_enabled()) {
73506916 new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync);
73516917 if (new_cpu >= 0)
73526918 return new_cpu;
73536919 new_cpu = prev_cpu;
73546920 }
73556921
7356
- want_affine = !wake_wide(p, sibling_count_hint) &&
7357
- !wake_cap(p, cpu, prev_cpu) &&
7358
- cpumask_test_cpu(cpu, &p->cpus_allowed);
6922
+no_eas:
6923
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
73596924 }
73606925
7361
-sd_loop:
73626926 rcu_read_lock();
73636927 for_each_domain(cpu, tmp) {
7364
- if (!(tmp->flags & SD_LOAD_BALANCE))
7365
- break;
7366
-
73676928 /*
73686929 * If both 'cpu' and 'prev_cpu' are part of this domain,
73696930 * cpu is a valid SD_WAKE_AFFINE target.
....@@ -7390,6 +6951,23 @@
73906951 /* Fast path */
73916952
73926953 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6954
+
6955
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6956
+ struct root_domain *rd = cpu_rq(cpu)->rd;
6957
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
6958
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
6959
+ int level = rockchip_perf_get_level();
6960
+
6961
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
6962
+ cpul_mask && cpub_mask && cpumask_intersects(p->cpus_ptr, cpub_mask) &&
6963
+ cpumask_test_cpu(new_cpu, cpul_mask)) {
6964
+ for_each_domain(cpu, tmp) {
6965
+ sd = tmp;
6966
+ }
6967
+ if (sd)
6968
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6969
+ }
6970
+ }
73936971
73946972 if (want_affine)
73956973 current->recent_used_cpu = cpu;
....@@ -7467,6 +7045,15 @@
74677045 {
74687046 remove_entity_load_avg(&p->se);
74697047 }
7048
+
7049
+static int
7050
+balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
7051
+{
7052
+ if (rq->nr_running)
7053
+ return 1;
7054
+
7055
+ return newidle_balance(rq, rf) != 0;
7056
+}
74707057 #endif /* CONFIG_SMP */
74717058
74727059 static unsigned long wakeup_gran(struct sched_entity *se)
....@@ -7520,7 +7107,7 @@
75207107
75217108 static void set_last_buddy(struct sched_entity *se)
75227109 {
7523
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7110
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
75247111 return;
75257112
75267113 for_each_sched_entity(se) {
....@@ -7532,7 +7119,7 @@
75327119
75337120 static void set_next_buddy(struct sched_entity *se)
75347121 {
7535
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7122
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
75367123 return;
75377124
75387125 for_each_sched_entity(se) {
....@@ -7558,6 +7145,7 @@
75587145 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
75597146 int scale = cfs_rq->nr_running >= sched_nr_latency;
75607147 int next_buddy_marked = 0;
7148
+ bool preempt = false, nopreempt = false;
75617149
75627150 if (unlikely(se == pse))
75637151 return;
....@@ -7590,8 +7178,8 @@
75907178 return;
75917179
75927180 /* Idle tasks are by definition preempted by non-idle tasks. */
7593
- if (unlikely(curr->policy == SCHED_IDLE) &&
7594
- likely(p->policy != SCHED_IDLE))
7181
+ if (unlikely(task_has_idle_policy(curr)) &&
7182
+ likely(!task_has_idle_policy(p)))
75957183 goto preempt;
75967184
75977185 /*
....@@ -7603,6 +7191,12 @@
76037191
76047192 find_matching_se(&se, &pse);
76057193 update_curr(cfs_rq_of(se));
7194
+ trace_android_rvh_check_preempt_wakeup(rq, p, &preempt, &nopreempt,
7195
+ wake_flags, se, pse, next_buddy_marked, sysctl_sched_wakeup_granularity);
7196
+ if (preempt)
7197
+ goto preempt;
7198
+ if (nopreempt)
7199
+ return;
76067200 BUG_ON(!pse);
76077201 if (wakeup_preempt_entity(se, pse) == 1) {
76087202 /*
....@@ -7617,7 +7211,7 @@
76177211 return;
76187212
76197213 preempt:
7620
- resched_curr(rq);
7214
+ resched_curr_lazy(rq);
76217215 /*
76227216 * Only set the backward buddy when the current task is still
76237217 * on the rq. This can happen when a wakeup gets interleaved
....@@ -7634,20 +7228,21 @@
76347228 set_last_buddy(se);
76357229 }
76367230
7637
-static struct task_struct *
7231
+struct task_struct *
76387232 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
76397233 {
76407234 struct cfs_rq *cfs_rq = &rq->cfs;
7641
- struct sched_entity *se;
7642
- struct task_struct *p;
7235
+ struct sched_entity *se = NULL;
7236
+ struct task_struct *p = NULL;
76437237 int new_tasks;
7238
+ bool repick = false;
76447239
76457240 again:
7646
- if (!cfs_rq->nr_running)
7241
+ if (!sched_fair_runnable(rq))
76477242 goto idle;
76487243
76497244 #ifdef CONFIG_FAIR_GROUP_SCHED
7650
- if (prev->sched_class != &fair_sched_class)
7245
+ if (!prev || prev->sched_class != &fair_sched_class)
76517246 goto simple;
76527247
76537248 /*
....@@ -7694,7 +7289,7 @@
76947289 } while (cfs_rq);
76957290
76967291 p = task_of(se);
7697
-
7292
+ trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, false, prev);
76987293 /*
76997294 * Since we haven't yet done put_prev_entity and if the selected task
77007295 * is a different task than we started out with, try and touch the
....@@ -7724,8 +7319,15 @@
77247319 goto done;
77257320 simple:
77267321 #endif
7322
+ if (prev)
7323
+ put_prev_task(rq, prev);
77277324
7728
- put_prev_task(rq, prev);
7325
+ trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, true, prev);
7326
+ if (repick) {
7327
+ for_each_sched_entity(se)
7328
+ set_next_entity(cfs_rq_of(se), se);
7329
+ goto done;
7330
+ }
77297331
77307332 do {
77317333 se = pick_next_entity(cfs_rq, NULL);
....@@ -7753,11 +7355,13 @@
77537355 return p;
77547356
77557357 idle:
7756
- update_misfit_status(NULL, rq);
7757
- new_tasks = idle_balance(rq, rf);
7358
+ if (!rf)
7359
+ return NULL;
7360
+
7361
+ new_tasks = newidle_balance(rq, rf);
77587362
77597363 /*
7760
- * Because idle_balance() releases (and re-acquires) rq->lock, it is
7364
+ * Because newidle_balance() releases (and re-acquires) rq->lock, it is
77617365 * possible for any higher priority task to appear. In that case we
77627366 * must re-start the pick_next_entity() loop.
77637367 */
....@@ -7774,6 +7378,11 @@
77747378 update_idle_rq_clock_pelt(rq);
77757379
77767380 return NULL;
7381
+}
7382
+
7383
+static struct task_struct *__pick_next_task_fair(struct rq *rq)
7384
+{
7385
+ return pick_next_task_fair(rq, NULL, NULL);
77777386 }
77787387
77797388 /*
....@@ -7826,7 +7435,7 @@
78267435 set_skip_buddy(se);
78277436 }
78287437
7829
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
7438
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
78307439 {
78317440 struct sched_entity *se = &p->se;
78327441
....@@ -7961,15 +7570,54 @@
79617570 * rewrite all of this once again.]
79627571 */
79637572
7964
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7573
+unsigned long __read_mostly max_load_balance_interval = HZ/10;
7574
+EXPORT_SYMBOL_GPL(max_load_balance_interval);
79657575
79667576 enum fbq_type { regular, remote, all };
79677577
7578
+/*
7579
+ * 'group_type' describes the group of CPUs at the moment of load balancing.
7580
+ *
7581
+ * The enum is ordered by pulling priority, with the group with lowest priority
7582
+ * first so the group_type can simply be compared when selecting the busiest
7583
+ * group. See update_sd_pick_busiest().
7584
+ */
79687585 enum group_type {
7969
- group_other = 0,
7586
+ /* The group has spare capacity that can be used to run more tasks. */
7587
+ group_has_spare = 0,
7588
+ /*
7589
+ * The group is fully used and the tasks don't compete for more CPU
7590
+ * cycles. Nevertheless, some tasks might wait before running.
7591
+ */
7592
+ group_fully_busy,
7593
+ /*
7594
+ * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
7595
+ * and must be migrated to a more powerful CPU.
7596
+ */
79707597 group_misfit_task,
7598
+ /*
7599
+ * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
7600
+ * and the task should be migrated to it instead of running on the
7601
+ * current CPU.
7602
+ */
7603
+ group_asym_packing,
7604
+ /*
7605
+ * The tasks' affinity constraints previously prevented the scheduler
7606
+ * from balancing the load across the system.
7607
+ */
79717608 group_imbalanced,
7972
- group_overloaded,
7609
+ /*
7610
+ * The CPU is overloaded and can't provide expected CPU cycles to all
7611
+ * tasks.
7612
+ */
7613
+ group_overloaded
7614
+};
7615
+
7616
+enum migration_type {
7617
+ migrate_load = 0,
7618
+ migrate_util,
7619
+ migrate_task,
7620
+ migrate_misfit
79737621 };
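
Because group_type is declared in strictly increasing pulling priority, update_sd_pick_busiest() can rank two candidate groups with a plain integer comparison, and migration_type then tells detach_tasks() which unit (load, utilization, task count, or misfit) the computed imbalance is expressed in. A small sketch of why the ordering alone is enough to pick the busier group (the enum values mirror the ones added above, the helper names are illustrative):

#include <stdio.h>

/* Ordered by pulling priority, lowest first, like the enum above. */
enum group_type_sketch {
	HAS_SPARE = 0,
	FULLY_BUSY,
	MISFIT_TASK,
	ASYM_PACKING,
	IMBALANCED,
	OVERLOADED,
};

struct sg_stats_sketch {
	const char *name;
	enum group_type_sketch type;
};

/* Picking the busier of two groups reduces to comparing enum values. */
static const struct sg_stats_sketch *
pick_busiest_sketch(const struct sg_stats_sketch *a,
		    const struct sg_stats_sketch *b)
{
	return a->type >= b->type ? a : b;
}

int main(void)
{
	struct sg_stats_sketch g1 = { "g1", FULLY_BUSY };
	struct sg_stats_sketch g2 = { "g2", OVERLOADED };
	struct sg_stats_sketch g3 = { "g3", MISFIT_TASK };

	printf("%s\n", pick_busiest_sketch(&g1, &g2)->name);	/* g2 */
	printf("%s\n", pick_busiest_sketch(&g3, &g1)->name);	/* g3 */
	return 0;
}
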
79747622
79757623 #define LBF_ALL_PINNED 0x01
....@@ -7992,7 +7640,6 @@
79927640 int new_dst_cpu;
79937641 enum cpu_idle_type idle;
79947642 long imbalance;
7995
- unsigned int src_grp_nr_running;
79967643 /* The set of CPUs under consideration for load-balancing */
79977644 struct cpumask *cpus;
79987645
....@@ -8003,8 +7650,9 @@
80037650 unsigned int loop_max;
80047651
80057652 enum fbq_type fbq_type;
8006
- enum group_type src_grp_type;
7653
+ enum migration_type migration_type;
80077654 struct list_head tasks;
7655
+ struct rq_flags *src_rq_rf;
80087656 };
80097657
80107658 /*
....@@ -8019,7 +7667,11 @@
80197667 if (p->sched_class != &fair_sched_class)
80207668 return 0;
80217669
8022
- if (unlikely(p->policy == SCHED_IDLE))
7670
+ if (unlikely(task_has_idle_policy(p)))
7671
+ return 0;
7672
+
7673
+ /* SMT siblings share cache */
7674
+ if (env->sd->flags & SD_SHARE_CPUCAPACITY)
80237675 return 0;
80247676
80257677 /*
....@@ -8107,20 +7759,29 @@
81077759 int can_migrate_task(struct task_struct *p, struct lb_env *env)
81087760 {
81097761 int tsk_cache_hot;
7762
+ int can_migrate = 1;
81107763
81117764 lockdep_assert_held(&env->src_rq->lock);
7765
+
7766
+ trace_android_rvh_can_migrate_task(p, env->dst_cpu, &can_migrate);
7767
+ if (!can_migrate)
7768
+ return 0;
81127769
81137770 /*
81147771 * We do not migrate tasks that are:
81157772 * 1) throttled_lb_pair, or
8116
- * 2) cannot be migrated to this CPU due to cpus_allowed, or
7773
+ * 2) cannot be migrated to this CPU due to cpus_ptr, or
81177774 * 3) running (obviously), or
81187775 * 4) are cache-hot on their current CPU.
81197776 */
81207777 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
81217778 return 0;
81227779
8123
- if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
7780
+ /* Disregard pcpu kthreads; they are where they need to be. */
7781
+ if (kthread_is_per_cpu(p))
7782
+ return 0;
7783
+
7784
+ if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
81247785 int cpu;
81257786
81267787 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
....@@ -8140,7 +7801,7 @@
81407801
81417802 /* Prevent to re-select dst_cpu via env's CPUs: */
81427803 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
8143
- if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
7804
+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
81447805 env->flags |= LBF_DST_PINNED;
81457806 env->new_dst_cpu = cpu;
81467807 break;
....@@ -8186,9 +7847,20 @@
81867847 */
81877848 static void detach_task(struct task_struct *p, struct lb_env *env)
81887849 {
7850
+ int detached = 0;
7851
+
81897852 lockdep_assert_held(&env->src_rq->lock);
81907853
8191
- p->on_rq = TASK_ON_RQ_MIGRATING;
7854
+ /*
7855
+ * The vendor hook may drop the lock temporarily, so
7856
+ * pass the rq flags to unpin lock. We expect the
7857
+ * rq lock to be held after return.
7858
+ */
7859
+ trace_android_rvh_migrate_queued_task(env->src_rq, env->src_rq_rf, p,
7860
+ env->dst_cpu, &detached);
7861
+ if (detached)
7862
+ return;
7863
+
81927864 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
81937865 set_task_cpu(p, env->dst_cpu);
81947866 }
....@@ -8227,7 +7899,7 @@
82277899 static const unsigned int sched_nr_migrate_break = 32;
82287900
82297901 /*
8230
- * detach_tasks() -- tries to detach up to imbalance weighted load from
7902
+ * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
82317903 * busiest_rq, as part of a balancing operation within domain "sd".
82327904 *
82337905 * Returns number of detached tasks if successful and 0 otherwise.
....@@ -8235,8 +7907,8 @@
82357907 static int detach_tasks(struct lb_env *env)
82367908 {
82377909 struct list_head *tasks = &env->src_rq->cfs_tasks;
7910
+ unsigned long util, load;
82387911 struct task_struct *p;
8239
- unsigned long load;
82407912 int detached = 0;
82417913
82427914 lockdep_assert_held(&env->src_rq->lock);
....@@ -8266,39 +7938,64 @@
82667938 break;
82677939 }
82687940
8269
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
8270
- if (sysctl_sched_performance_bias) {
8271
- if ((env->idle == CPU_NOT_IDLE) && (!task_fits_max(p, env->dst_cpu)))
8272
- goto next;
8273
- }
8274
-#endif
8275
-
82767941 if (!can_migrate_task(p, env))
82777942 goto next;
82787943
8279
- /*
8280
- * Depending of the number of CPUs and tasks and the
8281
- * cgroup hierarchy, task_h_load() can return a null
8282
- * value. Make sure that env->imbalance decreases
8283
- * otherwise detach_tasks() will stop only after
8284
- * detaching up to loop_max tasks.
8285
- */
8286
- load = max_t(unsigned long, task_h_load(p), 1);
7944
+ switch (env->migration_type) {
7945
+ case migrate_load:
7946
+ /*
7947
+ * Depending on the number of CPUs and tasks and the
7948
+ * cgroup hierarchy, task_h_load() can return a null
7949
+ * value. Make sure that env->imbalance decreases
7950
+ * otherwise detach_tasks() will stop only after
7951
+ * detaching up to loop_max tasks.
7952
+ */
7953
+ load = max_t(unsigned long, task_h_load(p), 1);
82877954
7955
+ if (sched_feat(LB_MIN) &&
7956
+ load < 16 && !env->sd->nr_balance_failed)
7957
+ goto next;
82887958
8289
- if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
8290
- goto next;
7959
+ /*
7960
+ * Make sure that we don't migrate too much load.
7961
+ * Nevertheless, let's relax the constraint if the
7962
+ * scheduler fails to find a good waiting task to
7963
+ * migrate.
7964
+ */
7965
+ if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
7966
+ goto next;
82917967
8292
- if ((load / 2) > env->imbalance)
8293
- goto next;
7968
+ env->imbalance -= load;
7969
+ break;
7970
+
7971
+ case migrate_util:
7972
+ util = task_util_est(p);
7973
+
7974
+ if (util > env->imbalance)
7975
+ goto next;
7976
+
7977
+ env->imbalance -= util;
7978
+ break;
7979
+
7980
+ case migrate_task:
7981
+ env->imbalance--;
7982
+ break;
7983
+
7984
+ case migrate_misfit:
7985
+ /* This is not a misfit task */
7986
+ if (task_fits_capacity(p, capacity_of(env->src_cpu)))
7987
+ goto next;
7988
+
7989
+ env->imbalance = 0;
7990
+ break;
7991
+ }
82947992
82957993 detach_task(p, env);
82967994 list_add(&p->se.group_node, &env->tasks);
82977995
82987996 detached++;
8299
- env->imbalance -= load;
83007997
8301
-#ifdef CONFIG_PREEMPT
7998
+#ifdef CONFIG_PREEMPTION
83027999 /*
83038000 * NEWIDLE balancing is a source of latency, so preemptible
83048001 * kernels will stop after the first task is detached to minimize
....@@ -8310,7 +8007,7 @@
83108007
83118008 /*
83128009 * We only want to steal up to the prescribed amount of
8313
- * weighted load.
8010
+ * load/util/tasks.
83148011 */
83158012 if (env->imbalance <= 0)
83168013 break;
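
As a worked example of the migrate_load check above: shr_bound() right-shifts the task load by nr_balance_failed, so each failed balance round halves the effective threshold and a task that initially looks too heavy is eventually accepted. This is a minimal user-space sketch; shr_bound() is mirrored here and the load/imbalance values are invented:

#include <stdio.h>

/* Mirror of the kernel's shr_bound(): shift right, bounded to the type width. */
static unsigned long shr_bound(unsigned long val, unsigned int shift)
{
        unsigned int max_shift = sizeof(val) * 8 - 1;

        return val >> (shift < max_shift ? shift : max_shift);
}

int main(void)
{
        unsigned long load = 800, imbalance = 300;
        unsigned int fails;

        for (fails = 0; fails < 4; fails++)
                printf("nr_balance_failed=%u: %s\n", fails,
                       shr_bound(load, fails) > imbalance ? "skip" : "detach");
        /* 800 and 400 are skipped; 200 and 100 are detached */
        return 0;
}
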
....@@ -8339,7 +8036,6 @@
83398036
83408037 BUG_ON(task_rq(p) != rq);
83418038 activate_task(rq, p, ENQUEUE_NOCLOCK);
8342
- p->on_rq = TASK_ON_RQ_QUEUED;
83438039 check_preempt_curr(rq, p, 0);
83448040 }
83458041
....@@ -8380,6 +8076,7 @@
83808076 rq_unlock(env->dst_rq, &rf);
83818077 }
83828078
8079
+#ifdef CONFIG_NO_HZ_COMMON
83838080 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
83848081 {
83858082 if (cfs_rq->avg.load_avg)
....@@ -8399,12 +8096,54 @@
83998096 if (READ_ONCE(rq->avg_dl.util_avg))
84008097 return true;
84018098
8099
+ if (thermal_load_avg(rq))
8100
+ return true;
8101
+
84028102 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
84038103 if (READ_ONCE(rq->avg_irq.util_avg))
84048104 return true;
84058105 #endif
84068106
84078107 return false;
8108
+}
8109
+
8110
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
8111
+{
8112
+ rq->last_blocked_load_update_tick = jiffies;
8113
+
8114
+ if (!has_blocked)
8115
+ rq->has_blocked_load = 0;
8116
+}
8117
+#else
8118
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
8119
+static inline bool others_have_blocked(struct rq *rq) { return false; }
8120
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
8121
+#endif
8122
+
8123
+static bool __update_blocked_others(struct rq *rq, bool *done)
8124
+{
8125
+ const struct sched_class *curr_class;
8126
+ u64 now = rq_clock_pelt(rq);
8127
+ unsigned long thermal_pressure;
8128
+ bool decayed;
8129
+
8130
+ /*
8131
+ * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
8132
+ * DL and IRQ signals have been updated before updating CFS.
8133
+ */
8134
+ curr_class = rq->curr->sched_class;
8135
+
8136
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
8137
+
8138
+ decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
8139
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
8140
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
8141
+ update_irq_load_avg(rq, 0);
8142
+
8143
+ if (others_have_blocked(rq))
8144
+ *done = false;
8145
+
8146
+ return decayed;
84088147 }
84098148
84108149 #ifdef CONFIG_FAIR_GROUP_SCHED
....@@ -8420,22 +8159,17 @@
84208159 if (cfs_rq->avg.util_sum)
84218160 return false;
84228161
8423
- if (cfs_rq->avg.runnable_load_sum)
8162
+ if (cfs_rq->avg.runnable_sum)
84248163 return false;
84258164
84268165 return true;
84278166 }
84288167
8429
-static void update_blocked_averages(int cpu)
8168
+static bool __update_blocked_fair(struct rq *rq, bool *done)
84308169 {
8431
- struct rq *rq = cpu_rq(cpu);
84328170 struct cfs_rq *cfs_rq, *pos;
8433
- const struct sched_class *curr_class;
8434
- struct rq_flags rf;
8435
- bool done = true;
8436
-
8437
- rq_lock_irqsave(rq, &rf);
8438
- update_rq_clock(rq);
8171
+ bool decayed = false;
8172
+ int cpu = cpu_of(rq);
84398173
84408174 /*
84418175 * Iterates the task_group tree in a bottom up fashion, see
....@@ -8444,8 +8178,12 @@
84448178 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
84458179 struct sched_entity *se;
84468180
8447
- if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
8448
- update_tg_load_avg(cfs_rq, 0);
8181
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
8182
+ update_tg_load_avg(cfs_rq);
8183
+
8184
+ if (cfs_rq == &rq->cfs)
8185
+ decayed = true;
8186
+ }
84498187
84508188 /* Propagate pending load changes to the parent, if any: */
84518189 se = cfs_rq->tg->se[cpu];
....@@ -8461,23 +8199,10 @@
84618199
84628200 /* Don't need periodic decay once load/util_avg are null */
84638201 if (cfs_rq_has_blocked(cfs_rq))
8464
- done = false;
8202
+ *done = false;
84658203 }
84668204
8467
- curr_class = rq->curr->sched_class;
8468
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
8469
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
8470
- update_irq_load_avg(rq, 0);
8471
- /* Don't need periodic decay once load/util_avg are null */
8472
- if (others_have_blocked(rq))
8473
- done = false;
8474
-
8475
-#ifdef CONFIG_NO_HZ_COMMON
8476
- rq->last_blocked_load_update_tick = jiffies;
8477
- if (done)
8478
- rq->has_blocked_load = 0;
8479
-#endif
8480
- rq_unlock_irqrestore(rq, &rf);
8205
+ return decayed;
84818206 }
84828207
84838208 /*
....@@ -8527,27 +8252,16 @@
85278252 cfs_rq_load_avg(cfs_rq) + 1);
85288253 }
85298254 #else
8530
-static inline void update_blocked_averages(int cpu)
8255
+static bool __update_blocked_fair(struct rq *rq, bool *done)
85318256 {
8532
- struct rq *rq = cpu_rq(cpu);
85338257 struct cfs_rq *cfs_rq = &rq->cfs;
8534
- const struct sched_class *curr_class;
8535
- struct rq_flags rf;
8258
+ bool decayed;
85368259
8537
- rq_lock_irqsave(rq, &rf);
8538
- update_rq_clock(rq);
8539
- update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8260
+ decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8261
+ if (cfs_rq_has_blocked(cfs_rq))
8262
+ *done = false;
85408263
8541
- curr_class = rq->curr->sched_class;
8542
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
8543
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
8544
- update_irq_load_avg(rq, 0);
8545
-#ifdef CONFIG_NO_HZ_COMMON
8546
- rq->last_blocked_load_update_tick = jiffies;
8547
- if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
8548
- rq->has_blocked_load = 0;
8549
-#endif
8550
- rq_unlock_irqrestore(rq, &rf);
8264
+ return decayed;
85518265 }
85528266
85538267 static unsigned long task_h_load(struct task_struct *p)
....@@ -8555,6 +8269,24 @@
85558269 return p->se.avg.load_avg;
85568270 }
85578271 #endif
8272
+
8273
+static void update_blocked_averages(int cpu)
8274
+{
8275
+ bool decayed = false, done = true;
8276
+ struct rq *rq = cpu_rq(cpu);
8277
+ struct rq_flags rf;
8278
+
8279
+ rq_lock_irqsave(rq, &rf);
8280
+ update_rq_clock(rq);
8281
+
8282
+ decayed |= __update_blocked_others(rq, &done);
8283
+ decayed |= __update_blocked_fair(rq, &done);
8284
+
8285
+ update_blocked_load_status(rq, !done);
8286
+ if (decayed)
8287
+ cpufreq_update_util(rq, 0);
8288
+ rq_unlock_irqrestore(rq, &rf);
8289
+}
85588290
85598291 /********** Helpers for find_busiest_group ************************/
85608292
....@@ -8564,15 +8296,15 @@
85648296 struct sg_lb_stats {
85658297 unsigned long avg_load; /*Avg load across the CPUs of the group */
85668298 unsigned long group_load; /* Total load over the CPUs of the group */
8567
- unsigned long sum_weighted_load; /* Weighted load of group's tasks */
8568
- unsigned long load_per_task;
85698299 unsigned long group_capacity;
8570
- unsigned long group_util; /* Total utilization of the group */
8571
- unsigned int sum_nr_running; /* Nr tasks running in the group */
8300
+ unsigned long group_util; /* Total utilization over the CPUs of the group */
8301
+ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
8302
+ unsigned int sum_nr_running; /* Nr of tasks running in the group */
8303
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
85728304 unsigned int idle_cpus;
85738305 unsigned int group_weight;
85748306 enum group_type group_type;
8575
- int group_no_capacity;
8307
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
85768308 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
85778309 #ifdef CONFIG_NUMA_BALANCING
85788310 unsigned int nr_numa_running;
....@@ -8587,10 +8319,10 @@
85878319 struct sd_lb_stats {
85888320 struct sched_group *busiest; /* Busiest group in this sd */
85898321 struct sched_group *local; /* Local group in this sd */
8590
- unsigned long total_running;
85918322 unsigned long total_load; /* Total load of all groups in sd */
85928323 unsigned long total_capacity; /* Total capacity of all groups in sd */
85938324 unsigned long avg_load; /* Average load across all groups in sd */
8325
+ unsigned int prefer_sibling; /* tasks should go to sibling first */
85948326
85958327 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
85968328 struct sg_lb_stats local_stat; /* Statistics of the local group */
....@@ -8601,54 +8333,26 @@
86018333 /*
86028334 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
86038335 * local_stat because update_sg_lb_stats() does a full clear/assignment.
8604
- * We must however clear busiest_stat::avg_load because
8605
- * update_sd_pick_busiest() reads this before assignment.
8336
+ * We must however set busiest_stat::group_type and
8337
+ * busiest_stat::idle_cpus to the worst busiest group because
8338
+ * update_sd_pick_busiest() reads these before assignment.
86068339 */
86078340 *sds = (struct sd_lb_stats){
86088341 .busiest = NULL,
86098342 .local = NULL,
8610
- .total_running = 0UL,
86118343 .total_load = 0UL,
86128344 .total_capacity = 0UL,
86138345 .busiest_stat = {
8614
- .avg_load = 0UL,
8615
- .sum_nr_running = 0,
8616
- .group_type = group_other,
8346
+ .idle_cpus = UINT_MAX,
8347
+ .group_type = group_has_spare,
86178348 },
86188349 };
86198350 }
86208351
8621
-/**
8622
- * get_sd_load_idx - Obtain the load index for a given sched domain.
8623
- * @sd: The sched_domain whose load_idx is to be obtained.
8624
- * @idle: The idle status of the CPU for whose sd load_idx is obtained.
8625
- *
8626
- * Return: The load index.
8627
- */
8628
-static inline int get_sd_load_idx(struct sched_domain *sd,
8629
- enum cpu_idle_type idle)
8630
-{
8631
- int load_idx;
8632
-
8633
- switch (idle) {
8634
- case CPU_NOT_IDLE:
8635
- load_idx = sd->busy_idx;
8636
- break;
8637
-
8638
- case CPU_NEWLY_IDLE:
8639
- load_idx = sd->newidle_idx;
8640
- break;
8641
- default:
8642
- load_idx = sd->idle_idx;
8643
- break;
8644
- }
8645
-
8646
- return load_idx;
8647
-}
8648
-
8649
-static unsigned long scale_rt_capacity(int cpu, unsigned long max)
8352
+static unsigned long scale_rt_capacity(int cpu)
86508353 {
86518354 struct rq *rq = cpu_rq(cpu);
8355
+ unsigned long max = arch_scale_cpu_capacity(cpu);
86528356 unsigned long used, free;
86538357 unsigned long irq;
86548358
....@@ -8657,8 +8361,15 @@
86578361 if (unlikely(irq >= max))
86588362 return 1;
86598363
8364
+ /*
8365
+ * avg_rt.util_avg and avg_dl.util_avg track binary signals
8366
+ * (running and not running) with weights 0 and 1024 respectively.
8367
+ * avg_thermal.load_avg tracks thermal pressure and the weighted
8368
+ * average uses the actual delta max capacity(load).
8369
+ */
86608370 used = READ_ONCE(rq->avg_rt.util_avg);
86618371 used += READ_ONCE(rq->avg_dl.util_avg);
8372
+ used += thermal_load_avg(rq);
86628373
86638374 if (unlikely(used >= max))
86648375 return 1;
....@@ -8668,52 +8379,20 @@
86688379 return scale_irq_capacity(free, irq, max);
86698380 }
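
A worked example of the calculation above, with invented per-class pressure values. scale_irq_capacity() is defined in the scheduler headers; its effect (free * (max - irq) / max) is inlined here purely for illustration:

#include <stdio.h>

int main(void)
{
        unsigned long max = 1024, irq = 64;                     /* invented */
        unsigned long used = 100 /* rt */ + 50 /* dl */ + 70 /* thermal */;
        unsigned long free = max - used;                        /* 804 */

        /* scale_irq_capacity(free, irq, max): free * (max - irq) / max */
        printf("capacity left for CFS: %lu\n", free * (max - irq) / max); /* 753 */
        return 0;
}
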
86708381
8671
-void init_max_cpu_capacity(struct max_cpu_capacity *mcc) {
8672
- raw_spin_lock_init(&mcc->lock);
8673
- mcc->val = 0;
8674
- mcc->cpu = -1;
8675
-}
8676
-
86778382 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
86788383 {
8679
- unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
8384
+ unsigned long capacity = scale_rt_capacity(cpu);
86808385 struct sched_group *sdg = sd->groups;
8681
- struct max_cpu_capacity *mcc;
8682
- unsigned long max_capacity;
8683
- int max_cap_cpu;
8684
- unsigned long flags;
86858386
8686
- cpu_rq(cpu)->cpu_capacity_orig = capacity;
8687
-
8688
- capacity *= arch_scale_max_freq_capacity(sd, cpu);
8689
- capacity >>= SCHED_CAPACITY_SHIFT;
8690
-
8691
- mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
8692
-
8693
- raw_spin_lock_irqsave(&mcc->lock, flags);
8694
- max_capacity = mcc->val;
8695
- max_cap_cpu = mcc->cpu;
8696
-
8697
- if ((max_capacity > capacity && max_cap_cpu == cpu) ||
8698
- (max_capacity < capacity)) {
8699
- mcc->val = capacity;
8700
- mcc->cpu = cpu;
8701
-#ifdef CONFIG_SCHED_DEBUG
8702
- raw_spin_unlock_irqrestore(&mcc->lock, flags);
8703
- //printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
8704
- // cpu, capacity);
8705
- goto skip_unlock;
8706
-#endif
8707
- }
8708
- raw_spin_unlock_irqrestore(&mcc->lock, flags);
8709
-
8710
-skip_unlock: __attribute__ ((unused));
8711
- capacity = scale_rt_capacity(cpu, capacity);
8387
+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
87128388
87138389 if (!capacity)
87148390 capacity = 1;
87158391
8392
+ trace_android_rvh_update_cpu_capacity(cpu, &capacity);
87168393 cpu_rq(cpu)->cpu_capacity = capacity;
8394
+ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
8395
+
87178396 sdg->sgc->capacity = capacity;
87188397 sdg->sgc->min_capacity = capacity;
87198398 sdg->sgc->max_capacity = capacity;
....@@ -8746,29 +8425,11 @@
87468425 */
87478426
87488427 for_each_cpu(cpu, sched_group_span(sdg)) {
8749
- struct sched_group_capacity *sgc;
8750
- struct rq *rq = cpu_rq(cpu);
8428
+ unsigned long cpu_cap = capacity_of(cpu);
87518429
8752
- /*
8753
- * build_sched_domains() -> init_sched_groups_capacity()
8754
- * gets here before we've attached the domains to the
8755
- * runqueues.
8756
- *
8757
- * Use capacity_of(), which is set irrespective of domains
8758
- * in update_cpu_capacity().
8759
- *
8760
- * This avoids capacity from being 0 and
8761
- * causing divide-by-zero issues on boot.
8762
- */
8763
- if (unlikely(!rq->sd)) {
8764
- capacity += capacity_of(cpu);
8765
- } else {
8766
- sgc = rq->sd->groups->sgc;
8767
- capacity += sgc->capacity;
8768
- }
8769
-
8770
- min_capacity = min(capacity, min_capacity);
8771
- max_capacity = max(capacity, max_capacity);
8430
+ capacity += cpu_cap;
8431
+ min_capacity = min(cpu_cap, min_capacity);
8432
+ max_capacity = max(cpu_cap, max_capacity);
87728433 }
87738434 } else {
87748435 /*
....@@ -8805,8 +8466,20 @@
88058466 }
88068467
88078468 /*
8469
+ * Check whether a rq has a misfit task and if it looks like we can actually
8470
+ * help that task: we can migrate the task to a CPU of higher capacity, or
8471
+ * the task's current CPU is heavily pressured.
8472
+ */
8473
+static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8474
+{
8475
+ return rq->misfit_task_load &&
8476
+ (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8477
+ check_cpu_capacity(rq, sd));
8478
+}
8479
+
8480
+/*
88088481 * Group imbalance indicates (and tries to solve) the problem where balancing
8809
- * groups is inadequate due to ->cpus_allowed constraints.
8482
+ * groups is inadequate due to ->cpus_ptr constraints.
88108483 *
88118484 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
88128485 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
....@@ -8851,13 +8524,17 @@
88518524 * any benefit for the load balance.
88528525 */
88538526 static inline bool
8854
-group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
8527
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
88558528 {
88568529 if (sgs->sum_nr_running < sgs->group_weight)
88578530 return true;
88588531
8532
+ if ((sgs->group_capacity * imbalance_pct) <
8533
+ (sgs->group_runnable * 100))
8534
+ return false;
8535
+
88598536 if ((sgs->group_capacity * 100) >
8860
- (sgs->group_util * env->sd->imbalance_pct))
8537
+ (sgs->group_util * imbalance_pct))
88618538 return true;
88628539
88638540 return false;
....@@ -8872,13 +8549,17 @@
88728549 * false.
88738550 */
88748551 static inline bool
8875
-group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
8552
+group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
88768553 {
88778554 if (sgs->sum_nr_running <= sgs->group_weight)
88788555 return false;
88798556
88808557 if ((sgs->group_capacity * 100) <
8881
- (sgs->group_util * env->sd->imbalance_pct))
8558
+ (sgs->group_util * imbalance_pct))
8559
+ return true;
8560
+
8561
+ if ((sgs->group_capacity * imbalance_pct) <
8562
+ (sgs->group_runnable * 100))
88828563 return true;
88838564
88848565 return false;
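
A worked example of the imbalance_pct comparison shared by group_has_capacity() and group_is_overloaded() above, assuming an imbalance_pct of 117 (a common default) and invented utilization numbers:

#include <stdbool.h>
#include <stdio.h>

static bool util_leaves_spare(unsigned long capacity, unsigned long util,
                              unsigned int imbalance_pct)
{
        /* same test as group_has_capacity(): capacity * 100 > util * pct */
        return capacity * 100 > util * imbalance_pct;
}

int main(void)
{
        /* with pct = 117, a 1024-capacity group has spare room below ~875 util */
        printf("%d\n", util_leaves_spare(1024, 800, 117)); /* 1: 102400 > 93600  */
        printf("%d\n", util_leaves_spare(1024, 900, 117)); /* 0: 102400 < 105300 */
        return 0;
}
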
....@@ -8891,8 +8572,7 @@
88918572 static inline bool
88928573 group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
88938574 {
8894
- return sg->sgc->min_capacity * capacity_margin <
8895
- ref->sgc->min_capacity * 1024;
8575
+ return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
88968576 }
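
The fits_capacity() helper introduced earlier in this patch encodes the ~20% margin as cap * 1280 < max * 1024, i.e. a capacity only "fits" while it stays below roughly 80% of the reference. A user-space mirror with invented values:

#include <stdbool.h>
#include <stdio.h>

/* Mirror of the fits_capacity() macro used above (illustrative only). */
static bool fits_capacity(unsigned long cap, unsigned long max)
{
        return cap * 1280 < max * 1024;
}

int main(void)
{
        printf("%d\n", fits_capacity(810, 1024)); /* 1: 1036800 < 1048576 */
        printf("%d\n", fits_capacity(820, 1024)); /* 0: 1049600 > 1048576 */
        return 0;
}
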
88978577
88988578 /*
....@@ -8902,24 +8582,30 @@
89028582 static inline bool
89038583 group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
89048584 {
8905
- return sg->sgc->max_capacity * capacity_margin <
8906
- ref->sgc->max_capacity * 1024;
8585
+ return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
89078586 }
89088587
89098588 static inline enum
8910
-group_type group_classify(struct sched_group *group,
8589
+group_type group_classify(unsigned int imbalance_pct,
8590
+ struct sched_group *group,
89118591 struct sg_lb_stats *sgs)
89128592 {
8913
- if (sgs->group_no_capacity)
8593
+ if (group_is_overloaded(imbalance_pct, sgs))
89148594 return group_overloaded;
89158595
89168596 if (sg_imbalanced(group))
89178597 return group_imbalanced;
89188598
8599
+ if (sgs->group_asym_packing)
8600
+ return group_asym_packing;
8601
+
89198602 if (sgs->group_misfit_task_load)
89208603 return group_misfit_task;
89218604
8922
- return group_other;
8605
+ if (!group_has_capacity(imbalance_pct, sgs))
8606
+ return group_fully_busy;
8607
+
8608
+ return group_has_spare;
89238609 }
89248610
89258611 static bool update_nohz_stats(struct rq *rq, bool force)
....@@ -8956,12 +8642,11 @@
89568642 struct sg_lb_stats *sgs,
89578643 int *sg_status)
89588644 {
8959
- int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8960
- int load_idx = get_sd_load_idx(env->sd, env->idle);
8961
- unsigned long load;
8962
- int i, nr_running;
8645
+ int i, nr_running, local_group;
89638646
89648647 memset(sgs, 0, sizeof(*sgs));
8648
+
8649
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
89658650
89668651 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
89678652 struct rq *rq = cpu_rq(i);
....@@ -8969,17 +8654,14 @@
89698654 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
89708655 env->flags |= LBF_NOHZ_AGAIN;
89718656
8972
- /* Bias balancing toward CPUs of our domain: */
8973
- if (local_group)
8974
- load = target_load(i, load_idx);
8975
- else
8976
- load = source_load(i, load_idx);
8977
-
8978
- sgs->group_load += load;
8657
+ sgs->group_load += cpu_load(rq);
89798658 sgs->group_util += cpu_util(i);
8980
- sgs->sum_nr_running += rq->cfs.h_nr_running;
8659
+ sgs->group_runnable += cpu_runnable(rq);
8660
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running;
89818661
89828662 nr_running = rq->nr_running;
8663
+ sgs->sum_nr_running += nr_running;
8664
+
89838665 if (nr_running > 1)
89848666 *sg_status |= SG_OVERLOAD;
89858667
....@@ -8990,13 +8672,19 @@
89908672 sgs->nr_numa_running += rq->nr_numa_running;
89918673 sgs->nr_preferred_running += rq->nr_preferred_running;
89928674 #endif
8993
- sgs->sum_weighted_load += weighted_cpuload(rq);
89948675 /*
89958676 * No need to call idle_cpu() if nr_running is not 0
89968677 */
8997
- if (!nr_running && idle_cpu(i))
8678
+ if (!nr_running && idle_cpu(i)) {
89988679 sgs->idle_cpus++;
8680
+ /* Idle cpu can't have misfit task */
8681
+ continue;
8682
+ }
89998683
8684
+ if (local_group)
8685
+ continue;
8686
+
8687
+ /* Check for a misfit task on the cpu */
90008688 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
90018689 sgs->group_misfit_task_load < rq->misfit_task_load) {
90028690 sgs->group_misfit_task_load = rq->misfit_task_load;
....@@ -9004,17 +8692,24 @@
90048692 }
90058693 }
90068694
9007
- /* Adjust by relative CPU capacity of the group */
9008
- sgs->group_capacity = group->sgc->capacity;
9009
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8695
+ /* Check if dst CPU is idle and preferred to this group */
8696
+ if (env->sd->flags & SD_ASYM_PACKING &&
8697
+ env->idle != CPU_NOT_IDLE &&
8698
+ sgs->sum_h_nr_running &&
8699
+ sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
8700
+ sgs->group_asym_packing = 1;
8701
+ }
90108702
9011
- if (sgs->sum_nr_running)
9012
- sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
8703
+ sgs->group_capacity = group->sgc->capacity;
90138704
90148705 sgs->group_weight = group->group_weight;
90158706
9016
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
9017
- sgs->group_type = group_classify(group, sgs);
8707
+ sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
8708
+
8709
+ /* Computing avg_load makes sense only when group is overloaded */
8710
+ if (sgs->group_type == group_overloaded)
8711
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8712
+ sgs->group_capacity;
90188713 }
90198714
90208715 /**
....@@ -9037,6 +8732,10 @@
90378732 {
90388733 struct sg_lb_stats *busiest = &sds->busiest_stat;
90398734
8735
+ /* Make sure that there is at least one task to pull */
8736
+ if (!sgs->sum_h_nr_running)
8737
+ return false;
8738
+
90408739 /*
90418740 * Don't try to pull misfit tasks we can't help.
90428741 * We can use max_capacity here as reduction in capacity on some
....@@ -9045,7 +8744,7 @@
90458744 */
90468745 if (sgs->group_type == group_misfit_task &&
90478746 (!group_smaller_max_cpu_capacity(sg, sds->local) ||
9048
- !group_has_capacity(env, &sds->local_stat)))
8747
+ sds->local_stat.group_type != group_has_spare))
90498748 return false;
90508749
90518750 if (sgs->group_type > busiest->group_type)
....@@ -9054,62 +8753,92 @@
90548753 if (sgs->group_type < busiest->group_type)
90558754 return false;
90568755
9057
- if (sgs->avg_load <= busiest->avg_load)
8756
+ /*
8757
+ * The candidate and the current busiest group are the same type of
8758
+ * group. Let's check which one is the busiest according to the type.
8759
+ */
8760
+
8761
+ switch (sgs->group_type) {
8762
+ case group_overloaded:
8763
+ /* Select the overloaded group with highest avg_load. */
8764
+ if (sgs->avg_load <= busiest->avg_load)
8765
+ return false;
8766
+ break;
8767
+
8768
+ case group_imbalanced:
8769
+ /*
8770
+ * Select the 1st imbalanced group as we don't have any way to
8771
+ * choose one more than another.
8772
+ */
90588773 return false;
90598774
9060
- if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
9061
- goto asym_packing;
9062
-
9063
- /*
9064
- * Candidate sg has no more than one task per CPU and
9065
- * has higher per-CPU capacity. Migrating tasks to less
9066
- * capable CPUs may harm throughput. Maximize throughput,
9067
- * power/energy consequences are not considered.
9068
- */
9069
- if (sgs->sum_nr_running <= sgs->group_weight &&
9070
- group_smaller_min_cpu_capacity(sds->local, sg))
9071
- return false;
9072
-
9073
- /*
9074
- * If we have more than one misfit sg go with the biggest misfit.
9075
- */
9076
- if (sgs->group_type == group_misfit_task &&
9077
- sgs->group_misfit_task_load < busiest->group_misfit_task_load)
9078
- return false;
9079
-
9080
-asym_packing:
9081
- /* This is the busiest node in its class. */
9082
- if (!(env->sd->flags & SD_ASYM_PACKING))
9083
- return true;
9084
-
9085
- /* No ASYM_PACKING if target CPU is already busy */
9086
- if (env->idle == CPU_NOT_IDLE)
9087
- return true;
9088
- /*
9089
- * ASYM_PACKING needs to move all the work to the highest
9090
- * prority CPUs in the group, therefore mark all groups
9091
- * of lower priority than ourself as busy.
9092
- */
9093
- if (sgs->sum_nr_running &&
9094
- sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
9095
- if (!sds->busiest)
9096
- return true;
9097
-
8775
+ case group_asym_packing:
90988776 /* Prefer to move from lowest priority CPU's work */
9099
- if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
9100
- sg->asym_prefer_cpu))
9101
- return true;
8777
+ if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
8778
+ return false;
8779
+ break;
8780
+
8781
+ case group_misfit_task:
8782
+ /*
8783
+ * If we have more than one misfit sg go with the biggest
8784
+ * misfit.
8785
+ */
8786
+ if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8787
+ return false;
8788
+ break;
8789
+
8790
+ case group_fully_busy:
8791
+ /*
8792
+ * Select the fully busy group with highest avg_load. In
8793
+ * theory, there is no need to pull task from such kind of
8794
+ * group because tasks have all compute capacity that they need
8795
+ * but we can still improve the overall throughput by reducing
8796
+ * contention when accessing shared HW resources.
8797
+ *
8798
+ * XXX for now avg_load is not computed and always 0 so we
8799
+ * select the 1st one.
8800
+ */
8801
+ if (sgs->avg_load <= busiest->avg_load)
8802
+ return false;
8803
+ break;
8804
+
8805
+ case group_has_spare:
8806
+ /*
8807
+ * Select not overloaded group with lowest number of idle cpus
8808
+ * and highest number of running tasks. We could also compare
8809
+ * the spare capacity which is more stable but it can end up
8810
+ * that the group has less spare capacity but finally more idle
8811
+ * CPUs which means less opportunity to pull tasks.
8812
+ */
8813
+ if (sgs->idle_cpus > busiest->idle_cpus)
8814
+ return false;
8815
+ else if ((sgs->idle_cpus == busiest->idle_cpus) &&
8816
+ (sgs->sum_nr_running <= busiest->sum_nr_running))
8817
+ return false;
8818
+
8819
+ break;
91028820 }
91038821
9104
- return false;
8822
+ /*
8823
+ * Candidate sg has no more than one task per CPU and has higher
8824
+ * per-CPU capacity. Migrating tasks to less capable CPUs may harm
8825
+ * throughput. Maximize throughput, power/energy consequences are not
8826
+ * considered.
8827
+ */
8828
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
8829
+ (sgs->group_type <= group_fully_busy) &&
8830
+ (group_smaller_min_cpu_capacity(sds->local, sg)))
8831
+ return false;
8832
+
8833
+ return true;
91058834 }
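
For the group_has_spare case above, the tie-break can be read as a two-key comparison: fewer idle CPUs wins, then more running tasks. A standalone sketch of that ordering, with names simplified and not taken from the patch:

#include <stdbool.h>
#include <stdio.h>

struct spare_stats {
        unsigned int idle_cpus;
        unsigned int sum_nr_running;
};

/* true if candidate "a" should replace "b" as the busiest has-spare group */
static bool busier_has_spare(const struct spare_stats *a,
                             const struct spare_stats *b)
{
        if (a->idle_cpus != b->idle_cpus)
                return a->idle_cpus < b->idle_cpus;
        return a->sum_nr_running > b->sum_nr_running;
}

int main(void)
{
        struct spare_stats cand = { .idle_cpus = 1, .sum_nr_running = 3 };
        struct spare_stats cur  = { .idle_cpus = 2, .sum_nr_running = 5 };

        printf("%d\n", busier_has_spare(&cand, &cur)); /* 1: fewer idle CPUs */
        return 0;
}
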
91068835
91078836 #ifdef CONFIG_NUMA_BALANCING
91088837 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
91098838 {
9110
- if (sgs->sum_nr_running > sgs->nr_numa_running)
8839
+ if (sgs->sum_h_nr_running > sgs->nr_numa_running)
91118840 return regular;
9112
- if (sgs->sum_nr_running > sgs->nr_preferred_running)
8841
+ if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
91138842 return remote;
91148843 return all;
91158844 }
....@@ -9134,18 +8863,334 @@
91348863 }
91358864 #endif /* CONFIG_NUMA_BALANCING */
91368865
8866
+
8867
+struct sg_lb_stats;
8868
+
8869
+/*
8870
+ * task_running_on_cpu - return 1 if @p is running on @cpu.
8871
+ */
8872
+
8873
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
8874
+{
8875
+ /* Task has no contribution or is new */
8876
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
8877
+ return 0;
8878
+
8879
+ if (task_on_rq_queued(p))
8880
+ return 1;
8881
+
8882
+ return 0;
8883
+}
8884
+
8885
+/**
8886
+ * idle_cpu_without - would a given CPU be idle without p ?
8887
+ * @cpu: the processor on which idleness is tested.
8888
+ * @p: task which should be ignored.
8889
+ *
8890
+ * Return: 1 if the CPU would be idle. 0 otherwise.
8891
+ */
8892
+static int idle_cpu_without(int cpu, struct task_struct *p)
8893
+{
8894
+ struct rq *rq = cpu_rq(cpu);
8895
+
8896
+ if (rq->curr != rq->idle && rq->curr != p)
8897
+ return 0;
8898
+
8899
+ /*
8900
+ * rq->nr_running can't be used but an updated version without the
8901
+ * impact of p on cpu must be used instead. The updated nr_running
8902
+ * must be computed and tested before calling idle_cpu_without().
8903
+ */
8904
+
8905
+#ifdef CONFIG_SMP
8906
+ if (rq->ttwu_pending)
8907
+ return 0;
8908
+#endif
8909
+
8910
+ return 1;
8911
+}
8912
+
8913
+/*
8914
+ * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
8915
+ * @sd: The sched_domain level to look for idlest group.
8916
+ * @group: sched_group whose statistics are to be updated.
8917
+ * @sgs: variable to hold the statistics for this group.
8918
+ * @p: The task for which we look for the idlest group/CPU.
8919
+ */
8920
+static inline void update_sg_wakeup_stats(struct sched_domain *sd,
8921
+ struct sched_group *group,
8922
+ struct sg_lb_stats *sgs,
8923
+ struct task_struct *p)
8924
+{
8925
+ int i, nr_running;
8926
+
8927
+ memset(sgs, 0, sizeof(*sgs));
8928
+
8929
+ for_each_cpu(i, sched_group_span(group)) {
8930
+ struct rq *rq = cpu_rq(i);
8931
+ unsigned int local;
8932
+
8933
+ sgs->group_load += cpu_load_without(rq, p);
8934
+ sgs->group_util += cpu_util_without(i, p);
8935
+ sgs->group_runnable += cpu_runnable_without(rq, p);
8936
+ local = task_running_on_cpu(i, p);
8937
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
8938
+
8939
+ nr_running = rq->nr_running - local;
8940
+ sgs->sum_nr_running += nr_running;
8941
+
8942
+ /*
8943
+ * No need to call idle_cpu_without() if nr_running is not 0
8944
+ */
8945
+ if (!nr_running && idle_cpu_without(i, p))
8946
+ sgs->idle_cpus++;
8947
+
8948
+ }
8949
+
8950
+ /* Check if task fits in the group */
8951
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
8952
+ !task_fits_capacity(p, group->sgc->max_capacity)) {
8953
+ sgs->group_misfit_task_load = 1;
8954
+ }
8955
+
8956
+ sgs->group_capacity = group->sgc->capacity;
8957
+
8958
+ sgs->group_weight = group->group_weight;
8959
+
8960
+ sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
8961
+
8962
+ /*
8963
+ * Computing avg_load makes sense only when group is fully busy or
8964
+ * overloaded
8965
+ */
8966
+ if (sgs->group_type == group_fully_busy ||
8967
+ sgs->group_type == group_overloaded)
8968
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8969
+ sgs->group_capacity;
8970
+}
8971
+
8972
+static bool update_pick_idlest(struct sched_group *idlest,
8973
+ struct sg_lb_stats *idlest_sgs,
8974
+ struct sched_group *group,
8975
+ struct sg_lb_stats *sgs)
8976
+{
8977
+ if (sgs->group_type < idlest_sgs->group_type)
8978
+ return true;
8979
+
8980
+ if (sgs->group_type > idlest_sgs->group_type)
8981
+ return false;
8982
+
8983
+ /*
8984
+ * The candidate and the current idlest group are the same type of
8985
+ * group. Let's check which one is the idlest according to the type.
8986
+ */
8987
+
8988
+ switch (sgs->group_type) {
8989
+ case group_overloaded:
8990
+ case group_fully_busy:
8991
+ /* Select the group with lowest avg_load. */
8992
+ if (idlest_sgs->avg_load <= sgs->avg_load)
8993
+ return false;
8994
+ break;
8995
+
8996
+ case group_imbalanced:
8997
+ case group_asym_packing:
8998
+ /* Those types are not used in the slow wakeup path */
8999
+ return false;
9000
+
9001
+ case group_misfit_task:
9002
+ /* Select group with the highest max capacity */
9003
+ if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
9004
+ return false;
9005
+ break;
9006
+
9007
+ case group_has_spare:
9008
+ /* Select group with most idle CPUs */
9009
+ if (idlest_sgs->idle_cpus > sgs->idle_cpus)
9010
+ return false;
9011
+
9012
+ /* Select group with lowest group_util */
9013
+ if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
9014
+ idlest_sgs->group_util <= sgs->group_util)
9015
+ return false;
9016
+
9017
+ break;
9018
+ }
9019
+
9020
+ return true;
9021
+}
9022
+
9023
+/*
9024
+ * find_idlest_group() finds and returns the least busy CPU group within the
9025
+ * domain.
9026
+ *
9027
+ * Assumes p is allowed on at least one CPU in sd.
9028
+ */
9029
+static struct sched_group *
9030
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
9031
+{
9032
+ struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
9033
+ struct sg_lb_stats local_sgs, tmp_sgs;
9034
+ struct sg_lb_stats *sgs;
9035
+ unsigned long imbalance;
9036
+ struct sg_lb_stats idlest_sgs = {
9037
+ .avg_load = UINT_MAX,
9038
+ .group_type = group_overloaded,
9039
+ };
9040
+
9041
+ imbalance = scale_load_down(NICE_0_LOAD) *
9042
+ (sd->imbalance_pct-100) / 100;
9043
+
9044
+ do {
9045
+ int local_group;
9046
+
9047
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
9048
+ struct root_domain *rd = cpu_rq(this_cpu)->rd;
9049
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
9050
+ int level = rockchip_perf_get_level();
9051
+
9052
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
9053
+ cpub_mask && cpumask_intersects(p->cpus_ptr, cpub_mask) &&
9054
+ !cpumask_intersects(sched_group_span(group), cpub_mask))
9055
+ continue;
9056
+ }
9057
+
9058
+ /* Skip over this group if it has no CPUs allowed */
9059
+ if (!cpumask_intersects(sched_group_span(group),
9060
+ p->cpus_ptr))
9061
+ continue;
9062
+
9063
+ local_group = cpumask_test_cpu(this_cpu,
9064
+ sched_group_span(group));
9065
+
9066
+ if (local_group) {
9067
+ sgs = &local_sgs;
9068
+ local = group;
9069
+ } else {
9070
+ sgs = &tmp_sgs;
9071
+ }
9072
+
9073
+ update_sg_wakeup_stats(sd, group, sgs, p);
9074
+
9075
+ if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
9076
+ idlest = group;
9077
+ idlest_sgs = *sgs;
9078
+ }
9079
+
9080
+ } while (group = group->next, group != sd->groups);
9081
+
9082
+
9083
+ /* There is no idlest group to push tasks to */
9084
+ if (!idlest)
9085
+ return NULL;
9086
+
9087
+ /* The local group has been skipped because of CPU affinity */
9088
+ if (!local)
9089
+ return idlest;
9090
+
9091
+ /*
9092
+ * If the local group is idler than the selected idlest group
9093
+ * don't try and push the task.
9094
+ */
9095
+ if (local_sgs.group_type < idlest_sgs.group_type)
9096
+ return NULL;
9097
+
9098
+ /*
9099
+ * If the local group is busier than the selected idlest group
9100
+ * try and push the task.
9101
+ */
9102
+ if (local_sgs.group_type > idlest_sgs.group_type)
9103
+ return idlest;
9104
+
9105
+ switch (local_sgs.group_type) {
9106
+ case group_overloaded:
9107
+ case group_fully_busy:
9108
+ /*
9109
+ * When comparing groups across NUMA domains, it's possible for
9110
+ * the local domain to be very lightly loaded relative to the
9111
+ * remote domains but "imbalance" skews the comparison making
9112
+ * remote CPUs look much more favourable. When considering
9113
+ * cross-domain, add imbalance to the load on the remote node
9114
+ * and consider staying local.
9115
+ */
9116
+
9117
+ if ((sd->flags & SD_NUMA) &&
9118
+ ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
9119
+ return NULL;
9120
+
9121
+ /*
9122
+ * If the local group is less loaded than the selected
9123
+ * idlest group don't try and push any tasks.
9124
+ */
9125
+ if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
9126
+ return NULL;
9127
+
9128
+ if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
9129
+ return NULL;
9130
+ break;
9131
+
9132
+ case group_imbalanced:
9133
+ case group_asym_packing:
9134
+ /* Those types are not used in the slow wakeup path */
9135
+ return NULL;
9136
+
9137
+ case group_misfit_task:
9138
+ /* Select group with the highest max capacity */
9139
+ if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
9140
+ return NULL;
9141
+ break;
9142
+
9143
+ case group_has_spare:
9144
+ if (sd->flags & SD_NUMA) {
9145
+#ifdef CONFIG_NUMA_BALANCING
9146
+ int idlest_cpu;
9147
+ /*
9148
+ * If there is spare capacity at NUMA, try to select
9149
+ * the preferred node
9150
+ */
9151
+ if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
9152
+ return NULL;
9153
+
9154
+ idlest_cpu = cpumask_first(sched_group_span(idlest));
9155
+ if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
9156
+ return idlest;
9157
+#endif
9158
+ /*
9159
+ * Otherwise, keep the task on this node to stay close
9160
+ * to its wakeup source and improve locality. If there is
9161
+ * a real need of migration, periodic load balance will
9162
+ * take care of it.
9163
+ */
9164
+ if (local_sgs.idle_cpus)
9165
+ return NULL;
9166
+ }
9167
+
9168
+ /*
9169
+ * Select group with highest number of idle CPUs. We could also
9170
+ * compare the utilization which is more stable but it can end
9171
+ * up that the group has less spare capacity but finally more
9172
+ * idle CPUs which means more opportunity to run task.
9173
+ */
9174
+ if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
9175
+ return NULL;
9176
+ break;
9177
+ }
9178
+
9179
+ return idlest;
9180
+}
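
A worked example of the slack used in the overloaded/fully_busy comparison above: with scale_load_down(NICE_0_LOAD) at 1024 and a typical imbalance_pct of 117, the allowance comes to 1024 * 17 / 100 = 174, so a remote group must be at least that much lighter in avg_load before the task is pushed away. The numbers below are invented:

#include <stdio.h>

int main(void)
{
        unsigned long nice_0_load = 1024;       /* scale_load_down(NICE_0_LOAD) */
        unsigned int imbalance_pct = 117;       /* typical default */
        unsigned long imbalance = nice_0_load * (imbalance_pct - 100) / 100;
        unsigned long local_avg = 600, idlest_avg = 450;        /* invented */

        printf("allowance = %lu\n", imbalance);                 /* 174 */
        /* same shape as the test above: stay local unless idlest + slack < local */
        printf("%s\n", idlest_avg + imbalance >= local_avg ?
               "stay local" : "push to idlest");                /* stay local */
        return 0;
}
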
9181
+
91379182 /**
91389183 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
91399184 * @env: The load balancing environment.
91409185 * @sds: variable to hold the statistics for this sched_domain.
91419186 */
9187
+
91429188 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
91439189 {
91449190 struct sched_domain *child = env->sd->child;
91459191 struct sched_group *sg = env->sd->groups;
91469192 struct sg_lb_stats *local = &sds->local_stat;
91479193 struct sg_lb_stats tmp_sgs;
9148
- bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
91499194 int sg_status = 0;
91509195
91519196 #ifdef CONFIG_NO_HZ_COMMON
....@@ -9172,22 +9217,6 @@
91729217 if (local_group)
91739218 goto next_group;
91749219
9175
- /*
9176
- * In case the child domain prefers tasks go to siblings
9177
- * first, lower the sg capacity so that we'll try
9178
- * and move all the excess tasks away. We lower the capacity
9179
- * of a group only if the local group has the capacity to fit
9180
- * these excess tasks. The extra check prevents the case where
9181
- * you always pull from the heaviest group when it is already
9182
- * under-utilized (possible with a large weight task outweighs
9183
- * the tasks on the system).
9184
- */
9185
- if (prefer_sibling && sds->local &&
9186
- group_has_capacity(env, local) &&
9187
- (sgs->sum_nr_running > local->sum_nr_running + 1)) {
9188
- sgs->group_no_capacity = 1;
9189
- sgs->group_type = group_classify(sg, sgs);
9190
- }
91919220
91929221 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
91939222 sds->busiest = sg;
....@@ -9196,12 +9225,14 @@
91969225
91979226 next_group:
91989227 /* Now, start updating sd_lb_stats */
9199
- sds->total_running += sgs->sum_nr_running;
92009228 sds->total_load += sgs->group_load;
92019229 sds->total_capacity += sgs->group_capacity;
92029230
92039231 sg = sg->next;
92049232 } while (sg != env->sd->groups);
9233
+
9234
+ /* Tag domain that child domain prefers tasks go to siblings first */
9235
+ sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
92059236
92069237 #ifdef CONFIG_NO_HZ_COMMON
92079238 if ((env->flags & LBF_NOHZ_AGAIN) &&
....@@ -9215,8 +9246,6 @@
92159246 if (env->sd->flags & SD_NUMA)
92169247 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
92179248
9218
- env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
9219
-
92209249 if (!env->sd->parent) {
92219250 struct root_domain *rd = env->dst_rq->rd;
92229251
....@@ -9225,144 +9254,28 @@
92259254
92269255 /* Update over-utilization (tipping point, U >= 0) indicator */
92279256 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
9228
- trace_sched_overutilized(!!(sg_status & SG_OVERUTILIZED));
9257
+ trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
92299258 } else if (sg_status & SG_OVERUTILIZED) {
9230
- WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
9231
- trace_sched_overutilized(1);
9232
- }
9259
+ struct root_domain *rd = env->dst_rq->rd;
92339260
9261
+ WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
9262
+ trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
9263
+ }
92349264 }
92359265
9236
-/**
9237
- * check_asym_packing - Check to see if the group is packed into the
9238
- * sched domain.
9239
- *
9240
- * This is primarily intended to used at the sibling level. Some
9241
- * cores like POWER7 prefer to use lower numbered SMT threads. In the
9242
- * case of POWER7, it can move to lower SMT modes only when higher
9243
- * threads are idle. When in lower SMT modes, the threads will
9244
- * perform better since they share less core resources. Hence when we
9245
- * have idle threads, we want them to be the higher ones.
9246
- *
9247
- * This packing function is run on idle threads. It checks to see if
9248
- * the busiest CPU in this domain (core in the P7 case) has a higher
9249
- * CPU number than the packing function is being run on. Here we are
9250
- * assuming lower CPU number will be equivalent to lower a SMT thread
9251
- * number.
9252
- *
9253
- * Return: 1 when packing is required and a task should be moved to
9254
- * this CPU. The amount of the imbalance is returned in env->imbalance.
9255
- *
9256
- * @env: The load balancing environment.
9257
- * @sds: Statistics of the sched_domain which is to be packed
9258
- */
9259
-static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
9266
+static inline long adjust_numa_imbalance(int imbalance, int nr_running)
92609267 {
9261
- int busiest_cpu;
9262
-
9263
- if (!(env->sd->flags & SD_ASYM_PACKING))
9264
- return 0;
9265
-
9266
- if (env->idle == CPU_NOT_IDLE)
9267
- return 0;
9268
-
9269
- if (!sds->busiest)
9270
- return 0;
9271
-
9272
- busiest_cpu = sds->busiest->asym_prefer_cpu;
9273
- if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
9274
- return 0;
9275
-
9276
- env->imbalance = DIV_ROUND_CLOSEST(
9277
- sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
9278
- SCHED_CAPACITY_SCALE);
9279
-
9280
- return 1;
9281
-}
9282
-
9283
-/**
9284
- * fix_small_imbalance - Calculate the minor imbalance that exists
9285
- * amongst the groups of a sched_domain, during
9286
- * load balancing.
9287
- * @env: The load balancing environment.
9288
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
9289
- */
9290
-static inline
9291
-void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9292
-{
9293
- unsigned long tmp, capa_now = 0, capa_move = 0;
9294
- unsigned int imbn = 2;
9295
- unsigned long scaled_busy_load_per_task;
9296
- struct sg_lb_stats *local, *busiest;
9297
-
9298
- local = &sds->local_stat;
9299
- busiest = &sds->busiest_stat;
9300
-
9301
- if (!local->sum_nr_running)
9302
- local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
9303
- else if (busiest->load_per_task > local->load_per_task)
9304
- imbn = 1;
9305
-
9306
- scaled_busy_load_per_task =
9307
- (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9308
- busiest->group_capacity;
9309
-
9310
- if (busiest->avg_load + scaled_busy_load_per_task >=
9311
- local->avg_load + (scaled_busy_load_per_task * imbn)) {
9312
- env->imbalance = busiest->load_per_task;
9313
- return;
9314
- }
9268
+ unsigned int imbalance_min;
93159269
93169270 /*
9317
- * OK, we don't have enough imbalance to justify moving tasks,
9318
- * however we may be able to increase total CPU capacity used by
9319
- * moving them.
9271
+ * Allow a small imbalance based on a simple pair of communicating
9272
+ * tasks that remain local when the source domain is almost idle.
93209273 */
9274
+ imbalance_min = 2;
9275
+ if (nr_running <= imbalance_min)
9276
+ return 0;
93219277
9322
- capa_now += busiest->group_capacity *
9323
- min(busiest->load_per_task, busiest->avg_load);
9324
- capa_now += local->group_capacity *
9325
- min(local->load_per_task, local->avg_load);
9326
- capa_now /= SCHED_CAPACITY_SCALE;
9327
-
9328
- /* Amount of load we'd subtract */
9329
- if (busiest->avg_load > scaled_busy_load_per_task) {
9330
- capa_move += busiest->group_capacity *
9331
- min(busiest->load_per_task,
9332
- busiest->avg_load - scaled_busy_load_per_task);
9333
- }
9334
-
9335
- /* Amount of load we'd add */
9336
- if (busiest->avg_load * busiest->group_capacity <
9337
- busiest->load_per_task * SCHED_CAPACITY_SCALE) {
9338
- tmp = (busiest->avg_load * busiest->group_capacity) /
9339
- local->group_capacity;
9340
- } else {
9341
- tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9342
- local->group_capacity;
9343
- }
9344
- capa_move += local->group_capacity *
9345
- min(local->load_per_task, local->avg_load + tmp);
9346
- capa_move /= SCHED_CAPACITY_SCALE;
9347
-
9348
- /* Move if we gain throughput */
9349
- if (capa_move > capa_now) {
9350
- env->imbalance = busiest->load_per_task;
9351
- return;
9352
- }
9353
-
9354
- /* We can't see throughput improvement with the load-based
9355
- * method, but it is possible depending upon group size and
9356
- * capacity range that there might still be an underutilized
9357
- * cpu available in an asymmetric capacity system. Do one last
9358
- * check just in case.
9359
- */
9360
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
9361
- busiest->group_type == group_overloaded &&
9362
- busiest->sum_nr_running > busiest->group_weight &&
9363
- local->sum_nr_running < local->group_weight &&
9364
- local->group_capacity < busiest->group_capacity)
9365
- env->imbalance = busiest->load_per_task;
9278
+ return imbalance;
93669279 }
93679280
93689281 /**
....@@ -9373,96 +9286,169 @@
93739286 */
93749287 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
93759288 {
9376
- unsigned long max_pull, load_above_capacity = ~0UL;
93779289 struct sg_lb_stats *local, *busiest;
93789290
93799291 local = &sds->local_stat;
93809292 busiest = &sds->busiest_stat;
93819293
9294
+ if (busiest->group_type == group_misfit_task) {
9295
+ /* Set imbalance to allow misfit tasks to be balanced. */
9296
+ env->migration_type = migrate_misfit;
9297
+ env->imbalance = 1;
9298
+ return;
9299
+ }
9300
+
9301
+ if (busiest->group_type == group_asym_packing) {
9302
+ /*
9303
+ * In case of asym capacity, we will try to migrate all load to
9304
+ * the preferred CPU.
9305
+ */
9306
+ env->migration_type = migrate_task;
9307
+ env->imbalance = busiest->sum_h_nr_running;
9308
+ return;
9309
+ }
9310
+
93829311 if (busiest->group_type == group_imbalanced) {
93839312 /*
93849313 * In the group_imb case we cannot rely on group-wide averages
9385
- * to ensure CPU-load equilibrium, look at wider averages. XXX
9314
+ * to ensure CPU-load equilibrium, try to move any task to fix
9315
+ * the imbalance. The next load balance will take care of
9316
+ * balancing back the system.
93869317 */
9387
- busiest->load_per_task =
9388
- min(busiest->load_per_task, sds->avg_load);
9318
+ env->migration_type = migrate_task;
9319
+ env->imbalance = 1;
9320
+ return;
93899321 }
93909322
93919323 /*
9392
- * Avg load of busiest sg can be less and avg load of local sg can
9393
- * be greater than avg load across all sgs of sd because avg load
9394
- * factors in sg capacity and sgs with smaller group_type are
9395
- * skipped when updating the busiest sg:
9324
+ * Try to use spare capacity of local group without overloading it or
9325
+ * emptying busiest.
93969326 */
9397
- if (busiest->group_type != group_misfit_task &&
9398
- (busiest->avg_load <= sds->avg_load ||
9399
- local->avg_load >= sds->avg_load)) {
9400
- env->imbalance = 0;
9401
- return fix_small_imbalance(env, sds);
9327
+ if (local->group_type == group_has_spare) {
9328
+ if ((busiest->group_type > group_fully_busy) &&
9329
+ !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
9330
+ /*
9331
+ * If busiest is overloaded, try to fill spare
9332
+ * capacity. This might end up creating spare capacity
9333
+ * in busiest or busiest still being overloaded but
9334
+ * there is no simple way to directly compute the
9335
+ * amount of load to migrate in order to balance the
9336
+ * system.
9337
+ */
9338
+ env->migration_type = migrate_util;
9339
+ env->imbalance = max(local->group_capacity, local->group_util) -
9340
+ local->group_util;
9341
+
9342
+ /*
9343
+ * In some cases, the group's utilization is max or even
9344
+ * higher than capacity because of migrations but the
9345
+ * local CPU is (newly) idle. There is at least one
9346
+ * waiting task in this overloaded busiest group. Let's
9347
+ * try to pull it.
9348
+ */
9349
+ if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
9350
+ env->migration_type = migrate_task;
9351
+ env->imbalance = 1;
9352
+ }
9353
+
9354
+ return;
9355
+ }
9356
+
9357
+ if (busiest->group_weight == 1 || sds->prefer_sibling) {
9358
+ unsigned int nr_diff = busiest->sum_nr_running;
9359
+ /*
9360
+ * When prefer sibling, evenly spread running tasks on
9361
+ * groups.
9362
+ */
9363
+ env->migration_type = migrate_task;
9364
+ lsub_positive(&nr_diff, local->sum_nr_running);
9365
+ env->imbalance = nr_diff >> 1;
9366
+ } else {
9367
+
9368
+ /*
9369
+ * If there is no overload, we just want to even the number of
9370
+ * idle cpus.
9371
+ */
9372
+ env->migration_type = migrate_task;
9373
+ env->imbalance = max_t(long, 0, (local->idle_cpus -
9374
+ busiest->idle_cpus) >> 1);
9375
+ }
9376
+
9377
+ /* Consider allowing a small imbalance between NUMA groups */
9378
+ if (env->sd->flags & SD_NUMA)
9379
+ env->imbalance = adjust_numa_imbalance(env->imbalance,
9380
+ busiest->sum_nr_running);
9381
+
9382
+ return;
94029383 }
94039384
94049385 /*
9405
- * If there aren't any idle CPUs, avoid creating some.
9386
+ * Local is fully busy but has to take more load to relieve the
9387
+ * busiest group
94069388 */
9407
- if (busiest->group_type == group_overloaded &&
9408
- local->group_type == group_overloaded) {
9409
- load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
9410
- if (load_above_capacity > busiest->group_capacity) {
9411
- load_above_capacity -= busiest->group_capacity;
9412
- load_above_capacity *= scale_load_down(NICE_0_LOAD);
9413
- load_above_capacity /= busiest->group_capacity;
9414
- } else
9415
- load_above_capacity = ~0UL;
9389
+ if (local->group_type < group_overloaded) {
9390
+ /*
9391
+ * Local will become overloaded so the avg_load metrics are
9392
+ * finally needed.
9393
+ */
9394
+
9395
+ local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
9396
+ local->group_capacity;
9397
+
9398
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
9399
+ sds->total_capacity;
9400
+ /*
9401
+ * If the local group is more loaded than the selected
9402
+ * busiest group don't try to pull any tasks.
9403
+ */
9404
+ if (local->avg_load >= busiest->avg_load) {
9405
+ env->imbalance = 0;
9406
+ return;
9407
+ }
94169408 }
94179409
94189410 /*
9419
- * We're trying to get all the CPUs to the average_load, so we don't
9420
- * want to push ourselves above the average load, nor do we wish to
9421
- * reduce the max loaded CPU below the average load. At the same time,
9422
- * we also don't want to reduce the group load below the group
9423
- * capacity. Thus we look for the minimum possible imbalance.
9411
+ * Both groups are or will become overloaded and we're trying to get all
9412
+ * the CPUs to the average_load, so we don't want to push ourselves
9413
+ * above the average load, nor do we wish to reduce the max loaded CPU
9414
+ * below the average load. At the same time, we also don't want to
9415
+ * reduce the group load below the group capacity. Thus we look for
9416
+ * the minimum possible imbalance.
94249417 */
9425
- max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
9426
-
9427
- /* How much load to actually move to equalise the imbalance */
9418
+ env->migration_type = migrate_load;
94289419 env->imbalance = min(
9429
- max_pull * busiest->group_capacity,
9420
+ (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
94309421 (sds->avg_load - local->avg_load) * local->group_capacity
94319422 ) / SCHED_CAPACITY_SCALE;
9432
-
9433
- /* Boost imbalance to allow misfit task to be balanced.
9434
- * Always do this if we are doing a NEWLY_IDLE balance
9435
- * on the assumption that any tasks we have must not be
9436
- * long-running (and hence we cannot rely upon load).
9437
- * However if we are not idle, we should assume the tasks
9438
- * we have are longer running and not override load-based
9439
- * calculations above unless we are sure that the local
9440
- * group is underutilized.
9441
- */
9442
- if (busiest->group_type == group_misfit_task &&
9443
- (env->idle == CPU_NEWLY_IDLE ||
9444
- local->sum_nr_running < local->group_weight)) {
9445
- env->imbalance = max_t(long, env->imbalance,
9446
- busiest->group_misfit_task_load);
9447
- }
9448
-
9449
- /*
9450
- * if *imbalance is less than the average load per runnable task
9451
- * there is no guarantee that any tasks will be moved so we'll have
9452
- * a think about bumping its value to force at least one task to be
9453
- * moved
9454
- */
9455
- if (env->imbalance < busiest->load_per_task)
9456
- return fix_small_imbalance(env, sds);
94579423 }
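
As a worked example of the migrate_task branches above (task and idle-CPU counts invented): moving half of the difference converges over repeated balance rounds instead of ping-ponging tasks between the two groups.

#include <stdio.h>

int main(void)
{
        unsigned int busiest_nr = 7, local_nr = 1;
        unsigned int nr_diff = busiest_nr - local_nr;   /* as lsub_positive() would compute */
        unsigned int local_idle = 4, busiest_idle = 0;

        printf("imbalance = %u task(s)\n", nr_diff >> 1);                     /* 3 */
        /* the no-overload variant evens out idle CPUs the same way */
        printf("imbalance = %u task(s)\n", (local_idle - busiest_idle) >> 1); /* 2 */
        return 0;
}
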
94589424
94599425 /******* find_busiest_group() helpers end here *********************/
9426
+
9427
+/*
9428
+ * Decision matrix according to the local and busiest group type:
9429
+ *
9430
+ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
9431
+ * has_spare nr_idle balanced N/A N/A balanced balanced
9432
+ * fully_busy nr_idle nr_idle N/A N/A balanced balanced
9433
+ * misfit_task force N/A N/A N/A force force
9434
+ * asym_packing force force N/A N/A force force
9435
+ * imbalanced force force N/A N/A force force
9436
+ * overloaded force force N/A N/A force avg_load
9437
+ *
9438
+ * N/A : Not Applicable because already filtered while updating
9439
+ * statistics.
9440
+ * balanced : The system is balanced for these 2 groups.
9441
+ * force : Calculate the imbalance as load migration is probably needed.
9442
+ * avg_load : Only if imbalance is significant enough.
9443
+ * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
9444
+ * different between the groups.
9445
+ */
94609446
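The matrix above is documentation only; the decisions themselves are spread across find_busiest_group() below. Read as data, though, it is simply a lookup from (busiest group type, local group type) to an action. A hypothetical transcription, useful for checking a given pair at a glance (the enum names are shortened for the example and are not the kernel's group_type values):

#include <stdio.h>

/* Group types in increasing order of "busyness", as in the table above. */
enum grp_type { has_spare, fully_busy, misfit, asym, imbalanced, overloaded, NR_TYPES };

enum lb_decision { BALANCED, NR_IDLE, FORCE, AVG_LOAD, NA };

/* decision[busiest][local], transcribed row by row from the comment. */
static const enum lb_decision decision[NR_TYPES][NR_TYPES] = {
	/* has_spare  */ { NR_IDLE, BALANCED, NA, NA, BALANCED, BALANCED },
	/* fully_busy */ { NR_IDLE, NR_IDLE,  NA, NA, BALANCED, BALANCED },
	/* misfit     */ { FORCE,   NA,       NA, NA, FORCE,    FORCE    },
	/* asym       */ { FORCE,   FORCE,    NA, NA, FORCE,    FORCE    },
	/* imbalanced */ { FORCE,   FORCE,    NA, NA, FORCE,    FORCE    },
	/* overloaded */ { FORCE,   FORCE,    NA, NA, FORCE,    AVG_LOAD },
};

int main(void)
{
	static const char *names[] = { "balanced", "nr_idle", "force", "avg_load", "N/A" };

	/* e.g. busiest group overloaded while the local group has spare capacity */
	printf("%s\n", names[decision[overloaded][has_spare]]); /* "force" */
	return 0;
}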
94619447 /**
94629448 * find_busiest_group - Returns the busiest group within the sched_domain
94639449 * if there is an imbalance.
94649450 *
9465
- * Also calculates the amount of weighted load which should be moved
9451
+ * Also calculates the amount of runnable load which should be moved
94669452 * to restore balance.
94679453 *
94689454 * @env: The load balancing environment.
....@@ -9477,91 +9463,120 @@
94779463 init_sd_lb_stats(&sds);
94789464
94799465 /*
9480
- * Compute the various statistics relavent for load balancing at
9466
+ * Compute the various statistics relevant for load balancing at
94819467 * this level.
94829468 */
94839469 update_sd_lb_stats(env, &sds);
94849470
9485
- if (static_branch_unlikely(&sched_energy_present)) {
9471
+ if (sched_energy_enabled()) {
94869472 struct root_domain *rd = env->dst_rq->rd;
9473
+ int out_balance = 1;
94879474
9488
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
9475
+ trace_android_rvh_find_busiest_group(sds.busiest, env->dst_rq,
9476
+ &out_balance);
9477
+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)
9478
+ && out_balance)
94899479 goto out_balanced;
94909480 }
94919481
94929482 local = &sds.local_stat;
94939483 busiest = &sds.busiest_stat;
94949484
9495
- /* ASYM feature bypasses nice load balance check */
9496
- if (check_asym_packing(env, &sds))
9497
- return sds.busiest;
9498
-
94999485 /* There is no busy sibling group to pull tasks from */
9500
- if (!sds.busiest || busiest->sum_nr_running == 0)
9486
+ if (!sds.busiest)
95019487 goto out_balanced;
95029488
9503
- /* XXX broken for overlapping NUMA groups */
9504
- sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
9505
- / sds.total_capacity;
9489
+ /* Misfit tasks should be dealt with regardless of the avg load */
9490
+ if (busiest->group_type == group_misfit_task)
9491
+ goto force_balance;
9492
+
9493
+ /* ASYM feature bypasses nice load balance check */
9494
+ if (busiest->group_type == group_asym_packing)
9495
+ goto force_balance;
95069496
95079497 /*
95089498 * If the busiest group is imbalanced the below checks don't
95099499 * work because they assume all things are equal, which typically
9510
- * isn't true due to cpus_allowed constraints and the like.
9500
+ * isn't true due to cpus_ptr constraints and the like.
95119501 */
95129502 if (busiest->group_type == group_imbalanced)
9513
- goto force_balance;
9514
-
9515
- /*
9516
- * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
9517
- * capacities from resulting in underutilization due to avg_load.
9518
- */
9519
- if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
9520
- busiest->group_no_capacity)
9521
- goto force_balance;
9522
-
9523
- /* Misfit tasks should be dealt with regardless of the avg load */
9524
- if (busiest->group_type == group_misfit_task)
95259503 goto force_balance;
95269504
95279505 /*
95289506 * If the local group is busier than the selected busiest group
95299507 * don't try and pull any tasks.
95309508 */
9531
- if (local->avg_load >= busiest->avg_load)
9509
+ if (local->group_type > busiest->group_type)
95329510 goto out_balanced;
95339511
95349512 /*
9535
- * Don't pull any tasks if this group is already above the domain
9536
- * average load.
9513
+ * When groups are overloaded, use the avg_load to ensure fairness
9514
+ * between tasks.
95379515 */
9538
- if (local->avg_load >= sds.avg_load)
9539
- goto out_balanced;
9540
-
9541
- if (env->idle == CPU_IDLE) {
9516
+ if (local->group_type == group_overloaded) {
95429517 /*
9543
- * This CPU is idle. If the busiest group is not overloaded
9544
- * and there is no imbalance between this and busiest group
9545
- * wrt idle CPUs, it is balanced. The imbalance becomes
9546
- * significant if the diff is greater than 1 otherwise we
9547
- * might end up to just move the imbalance on another group
9518
+ * If the local group is more loaded than the selected
9519
+ * busiest group don't try to pull any tasks.
95489520 */
9549
- if ((busiest->group_type != group_overloaded) &&
9550
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
9521
+ if (local->avg_load >= busiest->avg_load)
95519522 goto out_balanced;
9552
- } else {
9523
+
9524
+ /* XXX broken for overlapping NUMA groups */
9525
+ sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
9526
+ sds.total_capacity;
9527
+
95539528 /*
9554
- * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
9555
- * imbalance_pct to be conservative.
9529
+ * Don't pull any tasks if this group is already above the
9530
+ * domain average load.
9531
+ */
9532
+ if (local->avg_load >= sds.avg_load)
9533
+ goto out_balanced;
9534
+
9535
+ /*
9536
+ * If the busiest group is more loaded, use imbalance_pct to be
9537
+ * conservative.
95569538 */
95579539 if (100 * busiest->avg_load <=
95589540 env->sd->imbalance_pct * local->avg_load)
95599541 goto out_balanced;
95609542 }
95619543
9544
+ /* Try to move all excess tasks to child's sibling domain */
9545
+ if (sds.prefer_sibling && local->group_type == group_has_spare &&
9546
+ busiest->sum_nr_running > local->sum_nr_running + 1)
9547
+ goto force_balance;
9548
+
9549
+ if (busiest->group_type != group_overloaded) {
9550
+ if (env->idle == CPU_NOT_IDLE)
9551
+ /*
9552
+ * If the busiest group is not overloaded (and as a
9553
+ * result the local one too) but this CPU is already
9554
+ * busy, let another idle CPU try to pull tasks.
9555
+ */
9556
+ goto out_balanced;
9557
+
9558
+ if (busiest->group_weight > 1 &&
9559
+ local->idle_cpus <= (busiest->idle_cpus + 1))
9560
+ /*
9561
+ * If the busiest group is not overloaded
9562
+ * and there is no imbalance between this and busiest
9563
+ * group wrt idle CPUs, it is balanced. The imbalance
9564
+ * becomes significant if the diff is greater than 1
9565
+ * otherwise we might end up just moving the imbalance
9566
+ * to another group. Of course this applies only if
9567
+ * there is more than 1 CPU per group.
9568
+ */
9569
+ goto out_balanced;
9570
+
9571
+ if (busiest->sum_h_nr_running == 1)
9572
+ /*
9573
+ * busiest doesn't have any tasks waiting to run
9574
+ */
9575
+ goto out_balanced;
9576
+ }
9577
+
95629578 force_balance:
95639579 /* Looks like there is an imbalance. Compute it */
9564
- env->src_grp_type = busiest->group_type;
95659580 calculate_imbalance(env, &sds);
95669581 return env->imbalance ? sds.busiest : NULL;
95679582
....@@ -9577,11 +9592,18 @@
95779592 struct sched_group *group)
95789593 {
95799594 struct rq *busiest = NULL, *rq;
9580
- unsigned long busiest_load = 0, busiest_capacity = 1;
9581
- int i;
9595
+ unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
9596
+ unsigned int busiest_nr = 0;
9597
+ int i, done = 0;
9598
+
9599
+ trace_android_rvh_find_busiest_queue(env->dst_cpu, group, env->cpus,
9600
+ &busiest, &done);
9601
+ if (done)
9602
+ return busiest;
95829603
95839604 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
9584
- unsigned long capacity, wl;
9605
+ unsigned long capacity, load, util;
9606
+ unsigned int nr_running;
95859607 enum fbq_type rt;
95869608
95879609 rq = cpu_rq(i);
....@@ -9609,20 +9631,8 @@
96099631 if (rt > env->fbq_type)
96109632 continue;
96119633
9612
- /*
9613
- * For ASYM_CPUCAPACITY domains with misfit tasks we simply
9614
- * seek the "biggest" misfit task.
9615
- */
9616
- if (env->src_grp_type == group_misfit_task) {
9617
- if (rq->misfit_task_load > busiest_load) {
9618
- busiest_load = rq->misfit_task_load;
9619
- busiest = rq;
9620
- }
9621
-
9622
- continue;
9623
- }
9624
-
96259634 capacity = capacity_of(i);
9635
+ nr_running = rq->cfs.h_nr_running;
96269636
96279637 /*
96289638 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
....@@ -9632,35 +9642,77 @@
96329642 */
96339643 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
96349644 capacity_of(env->dst_cpu) < capacity &&
9635
- rq->nr_running == 1)
9645
+ nr_running == 1)
96369646 continue;
96379647
9638
- wl = weighted_cpuload(rq);
9648
+ switch (env->migration_type) {
9649
+ case migrate_load:
9650
+ /*
9651
+ * When comparing with load imbalance, use cpu_load()
9652
+ * which is not scaled with the CPU capacity.
9653
+ */
9654
+ load = cpu_load(rq);
96399655
9640
- /*
9641
- * When comparing with imbalance, use weighted_cpuload()
9642
- * which is not scaled with the CPU capacity.
9643
- */
9656
+ if (nr_running == 1 && load > env->imbalance &&
9657
+ !check_cpu_capacity(rq, env->sd))
9658
+ break;
96449659
9645
- if (rq->nr_running == 1 && wl > env->imbalance &&
9646
- !check_cpu_capacity(rq, env->sd))
9647
- continue;
9660
+ /*
9661
+ * For the load comparisons with the other CPUs,
9662
+ * consider the cpu_load() scaled with the CPU
9663
+ * capacity, so that the load can be moved away
9664
+ * from the CPU that is potentially running at a
9665
+ * lower capacity.
9666
+ *
9667
+ * Thus we're looking for max(load_i / capacity_i),
9668
+ * crosswise multiplication to rid ourselves of the
9669
+ * division works out to:
9670
+ * load_i * capacity_j > load_j * capacity_i;
9671
+ * where j is our previous maximum.
9672
+ */
9673
+ if (load * busiest_capacity > busiest_load * capacity) {
9674
+ busiest_load = load;
9675
+ busiest_capacity = capacity;
9676
+ busiest = rq;
9677
+ }
9678
+ break;
96489679
9649
- /*
9650
- * For the load comparisons with the other CPU's, consider
9651
- * the weighted_cpuload() scaled with the CPU capacity, so
9652
- * that the load can be moved away from the CPU that is
9653
- * potentially running at a lower capacity.
9654
- *
9655
- * Thus we're looking for max(wl_i / capacity_i), crosswise
9656
- * multiplication to rid ourselves of the division works out
9657
- * to: wl_i * capacity_j > wl_j * capacity_i; where j is
9658
- * our previous maximum.
9659
- */
9660
- if (wl * busiest_capacity > busiest_load * capacity) {
9661
- busiest_load = wl;
9662
- busiest_capacity = capacity;
9663
- busiest = rq;
9680
+ case migrate_util:
9681
+ util = cpu_util(cpu_of(rq));
9682
+
9683
+ /*
9684
+ * Don't try to pull utilization from a CPU with one
9685
+ * running task. Whatever its utilization, we will fail
9686
+ * detach the task.
9687
+ */
9688
+ if (nr_running <= 1)
9689
+ continue;
9690
+
9691
+ if (busiest_util < util) {
9692
+ busiest_util = util;
9693
+ busiest = rq;
9694
+ }
9695
+ break;
9696
+
9697
+ case migrate_task:
9698
+ if (busiest_nr < nr_running) {
9699
+ busiest_nr = nr_running;
9700
+ busiest = rq;
9701
+ }
9702
+ break;
9703
+
9704
+ case migrate_misfit:
9705
+ /*
9706
+ * For ASYM_CPUCAPACITY domains with misfit tasks we
9707
+ * simply seek the "biggest" misfit task.
9708
+ */
9709
+ if (rq->misfit_task_load > busiest_load) {
9710
+ busiest_load = rq->misfit_task_load;
9711
+ busiest = rq;
9712
+ }
9713
+
9714
+ break;
9715
+
96649716 }
96659717 }
96669718
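The migrate_load case keeps the max(load_i / capacity_i) search in integer math by cross-multiplying instead of dividing, exactly as the comment above spells out. A small self-contained sketch of just that comparison loop; the per-CPU numbers are invented for illustration:

#include <stdio.h>

struct cpu_stat {
	unsigned long load;	/* cpu_load(rq) equivalent */
	unsigned long capacity;	/* capacity_of(cpu) equivalent */
};

int main(void)
{
	/* Made-up per-CPU numbers: CPU1 runs at half capacity. */
	struct cpu_stat cpus[] = {
		{ .load = 400, .capacity = 1024 },
		{ .load = 300, .capacity =  512 },
		{ .load = 500, .capacity = 1024 },
	};
	unsigned long busiest_load = 0, busiest_capacity = 1;
	int i, busiest = -1;

	for (i = 0; i < 3; i++) {
		/*
		 * Looking for max(load_i / capacity_i); cross-multiplying
		 * keeps everything in integer arithmetic:
		 *   load_i * capacity_j > load_j * capacity_i
		 */
		if (cpus[i].load * busiest_capacity >
		    busiest_load * cpus[i].capacity) {
			busiest_load = cpus[i].load;
			busiest_capacity = cpus[i].capacity;
			busiest = i;
		}
	}

	printf("busiest CPU: %d\n", busiest); /* CPU 1: 300/512 > 500/1024 */
	return 0;
}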
....@@ -9673,21 +9725,25 @@
96739725 */
96749726 #define MAX_PINNED_INTERVAL 512
96759727
9676
-static int need_active_balance(struct lb_env *env)
9728
+static inline bool
9729
+asym_active_balance(struct lb_env *env)
9730
+{
9731
+ /*
9732
+ * ASYM_PACKING needs to force migrate tasks from busy but
9733
+ * lower priority CPUs in order to pack all tasks in the
9734
+ * highest priority CPUs.
9735
+ */
9736
+ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
9737
+ sched_asym_prefer(env->dst_cpu, env->src_cpu);
9738
+}
9739
+
9740
+static inline bool
9741
+voluntary_active_balance(struct lb_env *env)
96779742 {
96789743 struct sched_domain *sd = env->sd;
96799744
9680
- if (env->idle == CPU_NEWLY_IDLE) {
9681
-
9682
- /*
9683
- * ASYM_PACKING needs to force migrate tasks from busy but
9684
- * lower priority CPUs in order to pack all tasks in the
9685
- * highest priority CPUs.
9686
- */
9687
- if ((sd->flags & SD_ASYM_PACKING) &&
9688
- sched_asym_prefer(env->dst_cpu, env->src_cpu))
9689
- return 1;
9690
- }
9745
+ if (asym_active_balance(env))
9746
+ return 1;
96919747
96929748 /*
96939749 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
....@@ -9702,19 +9758,18 @@
97029758 return 1;
97039759 }
97049760
9705
- if (env->src_grp_type == group_misfit_task)
9761
+ if (env->migration_type == migrate_misfit)
97069762 return 1;
97079763
9708
- if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
9709
- env->src_rq->cfs.h_nr_running == 1 &&
9710
- cpu_overutilized(env->src_cpu) &&
9711
- !cpu_overutilized(env->dst_cpu)) {
9712
- return 1;
9713
- }
9764
+ return 0;
9765
+}
97149766
9715
- if (env->src_grp_type == group_overloaded && env->src_rq->misfit_task_load)
9716
- return 1;
9767
+static int need_active_balance(struct lb_env *env)
9768
+{
9769
+ struct sched_domain *sd = env->sd;
97179770
9771
+ if (voluntary_active_balance(env))
9772
+ return 1;
97189773
97199774 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
97209775 }
....@@ -9724,7 +9779,17 @@
97249779 static int should_we_balance(struct lb_env *env)
97259780 {
97269781 struct sched_group *sg = env->sd->groups;
9727
- int cpu, balance_cpu = -1;
9782
+ int cpu;
9783
+
9784
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
9785
+ struct root_domain *rd = env->dst_rq->rd;
9786
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
9787
+ int level = rockchip_perf_get_level();
9788
+
9789
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
9790
+ cpul_mask && cpumask_test_cpu(env->dst_cpu, cpul_mask))
9791
+ return 0;
9792
+ }
97289793
97299794 /*
97309795 * Ensure the balancing environment is consistent; can happen
....@@ -9745,18 +9810,12 @@
97459810 if (!idle_cpu(cpu))
97469811 continue;
97479812
9748
- balance_cpu = cpu;
9749
- break;
9813
+ /* Are we the first idle CPU? */
9814
+ return cpu == env->dst_cpu;
97509815 }
97519816
9752
- if (balance_cpu == -1)
9753
- balance_cpu = group_balance_cpu(sg);
9754
-
9755
- /*
9756
- * First idle CPU or the first CPU(busiest) in this sched group
9757
- * is eligible for doing load balancing at this and above domains.
9758
- */
9759
- return balance_cpu == env->dst_cpu;
9817
+ /* Are we the first CPU of this group? */
9818
+ return group_balance_cpu(sg) == env->dst_cpu;
97609819 }
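After the rewrite above, eligibility is simply: the first idle CPU in the group may balance, and only if no CPU is idle does the designated group balance CPU get the job. A hedged userspace sketch of just that scan, ignoring the cpumask filtering and the NEWLY_IDLE early return that the kernel also performs:

#include <stdio.h>
#include <stdbool.h>

static bool should_balance(int dst_cpu, const bool *cpu_idle, int nr_cpus,
			   int group_balance_cpu)
{
	int cpu;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (!cpu_idle[cpu])
			continue;
		/* Are we the first idle CPU? */
		return cpu == dst_cpu;
	}

	/* Nobody is idle: fall back to the group's balance CPU. */
	return group_balance_cpu == dst_cpu;
}

int main(void)
{
	bool idle[4] = { false, true, true, false };

	printf("%d\n", should_balance(1, idle, 4, 0)); /* 1: CPU1 is the first idle CPU */
	printf("%d\n", should_balance(2, idle, 4, 0)); /* 0: CPU1 takes precedence */
	return 0;
}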
97619820
97629821 /*
....@@ -9828,6 +9887,7 @@
98289887
98299888 more_balance:
98309889 rq_lock_irqsave(busiest, &rf);
9890
+ env.src_rq_rf = &rf;
98319891 update_rq_clock(busiest);
98329892
98339893 /*
....@@ -9880,7 +9940,7 @@
98809940 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
98819941
98829942 /* Prevent to re-select dst_cpu via env's CPUs */
9883
- cpumask_clear_cpu(env.dst_cpu, env.cpus);
9943
+ __cpumask_clear_cpu(env.dst_cpu, env.cpus);
98849944
98859945 env.dst_rq = cpu_rq(env.new_dst_cpu);
98869946 env.dst_cpu = env.new_dst_cpu;
....@@ -9907,7 +9967,7 @@
99079967
99089968 /* All tasks on this runqueue were pinned by CPU affinity */
99099969 if (unlikely(env.flags & LBF_ALL_PINNED)) {
9910
- cpumask_clear_cpu(cpu_of(busiest), cpus);
9970
+ __cpumask_clear_cpu(cpu_of(busiest), cpus);
99119971 /*
99129972 * Attempting to continue load balancing at the current
99139973 * sched_domain level only makes sense if there are
....@@ -9934,8 +9994,7 @@
99349994 * excessive cache_hot migrations and active balances.
99359995 */
99369996 if (idle != CPU_NEWLY_IDLE)
9937
- if (env.src_grp_nr_running > 1)
9938
- sd->nr_balance_failed++;
9997
+ sd->nr_balance_failed++;
99399998
99409999 if (need_active_balance(&env)) {
994110000 unsigned long flags;
....@@ -9947,7 +10006,7 @@
994710006 * if the curr task on busiest CPU can't be
994810007 * moved to this_cpu:
994910008 */
9950
- if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
10009
+ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
995110010 raw_spin_unlock_irqrestore(&busiest->lock,
995210011 flags);
995310012 env.flags |= LBF_ALL_PINNED;
....@@ -9978,7 +10037,7 @@
997810037 } else
997910038 sd->nr_balance_failed = 0;
998010039
9981
- if (likely(!active_balance)) {
10040
+ if (likely(!active_balance) || voluntary_active_balance(&env)) {
998210041 /* We were unbalanced, so reset the balancing interval */
998310042 sd->balance_interval = sd->min_interval;
998410043 } else {
....@@ -10021,18 +10080,18 @@
1002110080 ld_moved = 0;
1002210081
1002310082 /*
10024
- * idle_balance() disregards balance intervals, so we could repeatedly
10025
- * reach this code, which would lead to balance_interval skyrocketting
10026
- * in a short amount of time. Skip the balance_interval increase logic
10027
- * to avoid that.
10083
+ * newidle_balance() disregards balance intervals, so we could
10084
+ * repeatedly reach this code, which would lead to balance_interval
10085
+ * skyrocketting in a short amount of time. Skip the balance_interval
10086
+ * increase logic to avoid that.
1002810087 */
1002910088 if (env.idle == CPU_NEWLY_IDLE)
1003010089 goto out;
1003110090
1003210091 /* tune up the balancing interval */
10033
- if (((env.flags & LBF_ALL_PINNED) &&
10034
- sd->balance_interval < MAX_PINNED_INTERVAL) ||
10035
- (sd->balance_interval < sd->max_interval))
10092
+ if ((env.flags & LBF_ALL_PINNED &&
10093
+ sd->balance_interval < MAX_PINNED_INTERVAL) ||
10094
+ sd->balance_interval < sd->max_interval)
1003610095 sd->balance_interval *= 2;
1003710096 out:
1003810097 return ld_moved;
....@@ -10048,6 +10107,15 @@
1004810107
1004910108 /* scale ms to jiffies */
1005010109 interval = msecs_to_jiffies(interval);
10110
+
10111
+ /*
10112
+ * Reduce likelihood of busy balancing at higher domains racing with
10113
+ * balancing at lower domains by preventing their balancing periods
10114
+ * from being multiples of each other.
10115
+ */
10116
+ if (cpu_busy)
10117
+ interval -= 1;
10118
+
1005110119 interval = clamp(interval, 1UL, max_load_balance_interval);
1005210120
1005310121 return interval;
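The one-jiffy offset keeps the busy-balance period of a domain from being an exact multiple of its child's period, so nested domains are less likely to keep balancing on the same tick. A self-contained sketch of just the tail shown in this hunk; msecs_to_jiffies() is approximated and HZ is assumed to be 250 purely for illustration:

#include <stdio.h>

#define HZ 250UL /* assumed tick rate for the example */

static unsigned long clampul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Roughly the jiffies conversion, busy offset and clamp shown above. */
static unsigned long balance_interval(unsigned long interval_ms, int cpu_busy,
				      unsigned long max_interval)
{
	unsigned long interval = interval_ms * HZ / 1000; /* ~msecs_to_jiffies() */

	/* Busy balancing: shift by one jiffy so nested domains don't align. */
	if (cpu_busy)
		interval -= 1;

	return clampul(interval, 1UL, max_interval);
}

int main(void)
{
	/* e.g. an 8 ms domain interval, max_load_balance_interval = HZ*8/10 */
	printf("idle: %lu jiffies\n", balance_interval(8, 0, HZ * 8 / 10)); /* 2 */
	printf("busy: %lu jiffies\n", balance_interval(8, 1, HZ * 8 / 10)); /* 1 */
	return 0;
}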
....@@ -10110,9 +10178,8 @@
1011010178 /* Search for an sd spanning us and the target CPU. */
1011110179 rcu_read_lock();
1011210180 for_each_domain(target_cpu, sd) {
10113
- if ((sd->flags & SD_LOAD_BALANCE) &&
10114
- cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10115
- break;
10181
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10182
+ break;
1011610183 }
1011710184
1011810185 if (likely(sd)) {
....@@ -10130,6 +10197,7 @@
1013010197 * about DST_PINNED.
1013110198 */
1013210199 .flags = LBF_DST_PINNED,
10200
+ .src_rq_rf = &rf,
1013310201 };
1013410202
1013510203 schedstat_inc(sd->alb_count);
....@@ -10165,7 +10233,7 @@
1016510233 */
1016610234 void update_max_interval(void)
1016710235 {
10168
- max_load_balance_interval = HZ*num_online_cpus()/10;
10236
+ max_load_balance_interval = HZ*num_active_cpus()/10;
1016910237 }
1017010238
1017110239 /*
....@@ -10178,6 +10246,7 @@
1017810246 {
1017910247 int continue_balancing = 1;
1018010248 int cpu = rq->cpu;
10249
+ int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
1018110250 unsigned long interval;
1018210251 struct sched_domain *sd;
1018310252 /* Earliest time when we have to do rebalance again */
....@@ -10185,6 +10254,10 @@
1018510254 int update_next_balance = 0;
1018610255 int need_serialize, need_decay = 0;
1018710256 u64 max_cost = 0;
10257
+
10258
+ trace_android_rvh_sched_rebalance_domains(rq, &continue_balancing);
10259
+ if (!continue_balancing)
10260
+ return;
1018810261
1018910262 rcu_read_lock();
1019010263 for_each_domain(cpu, sd) {
....@@ -10200,9 +10273,6 @@
1020010273 }
1020110274 max_cost += sd->max_newidle_lb_cost;
1020210275
10203
- if (!(sd->flags & SD_LOAD_BALANCE))
10204
- continue;
10205
-
1020610276 /*
1020710277 * Stop the load balance at this level. There is another
1020810278 * CPU in our sched group which is doing load balancing more
....@@ -10214,7 +10284,7 @@
1021410284 break;
1021510285 }
1021610286
10217
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
10287
+ interval = get_sd_balance_interval(sd, busy);
1021810288
1021910289 need_serialize = sd->flags & SD_SERIALIZE;
1022010290 if (need_serialize) {
....@@ -10230,9 +10300,10 @@
1023010300 * state even if we migrated tasks. Update it.
1023110301 */
1023210302 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
10303
+ busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
1023310304 }
1023410305 sd->last_balance = jiffies;
10235
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
10306
+ interval = get_sd_balance_interval(sd, busy);
1023610307 }
1023710308 if (need_serialize)
1023810309 spin_unlock(&balancing);
....@@ -10292,7 +10363,11 @@
1029210363
1029310364 static inline int find_new_ilb(void)
1029410365 {
10295
- int ilb;
10366
+ int ilb = -1;
10367
+
10368
+ trace_android_rvh_find_new_ilb(nohz.idle_cpus_mask, &ilb);
10369
+ if (ilb >= 0)
10370
+ return ilb;
1029610371
1029710372 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
1029810373 housekeeping_cpumask(HK_FLAG_MISC)) {
....@@ -10323,29 +10398,25 @@
1032310398 if (ilb_cpu >= nr_cpu_ids)
1032410399 return;
1032510400
10401
+ /*
10402
+ * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
10403
+ * the first flag owns it; cleared by nohz_csd_func().
10404
+ */
1032610405 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
1032710406 if (flags & NOHZ_KICK_MASK)
1032810407 return;
1032910408
1033010409 /*
10331
- * Use smp_send_reschedule() instead of resched_cpu().
10332
- * This way we generate a sched IPI on the target CPU which
10410
+ * This way we generate an IPI on the target CPU which
1033310411 * is idle. And the softirq performing nohz idle load balance
1033410412 * will be run before returning from the IPI.
1033510413 */
10336
- smp_send_reschedule(ilb_cpu);
10414
+ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
1033710415 }
1033810416
1033910417 /*
10340
- * Current heuristic for kicking the idle load balancer in the presence
10341
- * of an idle cpu in the system.
10342
- * - This rq has more than one task.
10343
- * - This rq has at least one CFS task and the capacity of the CPU is
10344
- * significantly reduced because of RT tasks or IRQs.
10345
- * - At parent of LLC scheduler domain level, this cpu's scheduler group has
10346
- * multiple busy cpu.
10347
- * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
10348
- * domain span are idle.
10418
+ * Current decision point for kicking the idle load balancer in the presence
10419
+ * of idle CPUs in the system.
1034910420 */
1035010421 static void nohz_balancer_kick(struct rq *rq)
1035110422 {
....@@ -10354,6 +10425,7 @@
1035410425 struct sched_domain *sd;
1035510426 int nr_busy, i, cpu = rq->cpu;
1035610427 unsigned int flags = 0;
10428
+ int done = 0;
1035710429
1035810430 if (unlikely(rq->idle_balance))
1035910431 return;
....@@ -10378,30 +10450,25 @@
1037810450 if (time_before(now, nohz.next_balance))
1037910451 goto out;
1038010452
10381
- if (rq->nr_running >= 2 || rq->misfit_task_load) {
10453
+ trace_android_rvh_sched_nohz_balancer_kick(rq, &flags, &done);
10454
+ if (done)
10455
+ goto out;
10456
+
10457
+ if (rq->nr_running >= 2) {
1038210458 flags = NOHZ_KICK_MASK;
1038310459 goto out;
1038410460 }
1038510461
1038610462 rcu_read_lock();
10387
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10388
- if (sds) {
10389
- /*
10390
- * XXX: write a coherent comment on why we do this.
10391
- * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
10392
- */
10393
- nr_busy = atomic_read(&sds->nr_busy_cpus);
10394
- if (nr_busy > 1) {
10395
- flags = NOHZ_KICK_MASK;
10396
- goto unlock;
10397
- }
10398
-
10399
- }
1040010463
1040110464 sd = rcu_dereference(rq->sd);
1040210465 if (sd) {
10403
- if ((rq->cfs.h_nr_running >= 1) &&
10404
- check_cpu_capacity(rq, sd)) {
10466
+ /*
10467
+ * If there's a CFS task and the current CPU has reduced
10468
+ * capacity; kick the ILB to see if there's a better CPU to run
10469
+ * on.
10470
+ */
10471
+ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
1040510472 flags = NOHZ_KICK_MASK;
1040610473 goto unlock;
1040710474 }
....@@ -10409,15 +10476,55 @@
1040910476
1041010477 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
1041110478 if (sd) {
10412
- for_each_cpu(i, sched_domain_span(sd)) {
10413
- if (i == cpu ||
10414
- !cpumask_test_cpu(i, nohz.idle_cpus_mask))
10415
- continue;
10416
-
10479
+ /*
10480
+ * When ASYM_PACKING; see if there's a more preferred CPU
10481
+ * currently idle; in which case, kick the ILB to move tasks
10482
+ * around.
10483
+ */
10484
+ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
1041710485 if (sched_asym_prefer(i, cpu)) {
1041810486 flags = NOHZ_KICK_MASK;
1041910487 goto unlock;
1042010488 }
10489
+ }
10490
+ }
10491
+
10492
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
10493
+ if (sd) {
10494
+ /*
10495
+ * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
10496
+ * to run the misfit task on.
10497
+ */
10498
+ if (check_misfit_status(rq, sd)) {
10499
+ flags = NOHZ_KICK_MASK;
10500
+ goto unlock;
10501
+ }
10502
+
10503
+ /*
10504
+ * For asymmetric systems, we do not want to nicely balance
10505
+ * cache use, instead we want to embrace asymmetry and only
10506
+ * ensure tasks have enough CPU capacity.
10507
+ *
10508
+ * Skip the LLC logic because it's not relevant in that case.
10509
+ */
10510
+ goto unlock;
10511
+ }
10512
+
10513
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10514
+ if (sds) {
10515
+ /*
10516
+ * If there is an imbalance between LLC domains (IOW we could
10517
+ * increase the overall cache use), we need some less-loaded LLC
10518
+ * domain to pull some load. Likewise, we may need to spread
10519
+ * load within the current LLC domain (e.g. packed SMT cores but
10520
+ * other CPUs are idle). We can't really know from here how busy
10521
+ * the others are - so just get a nohz balance going if it looks
10522
+ * like this LLC domain has tasks we could move.
10523
+ */
10524
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
10525
+ if (nr_busy > 1) {
10526
+ flags = NOHZ_KICK_MASK;
10527
+ goto unlock;
1042110528 }
1042210529 }
1042310530 unlock:
....@@ -10483,9 +10590,20 @@
1048310590
1048410591 SCHED_WARN_ON(cpu != smp_processor_id());
1048510592
10486
- /* If this CPU is going down, then nothing needs to be done: */
10487
- if (!cpu_active(cpu))
10593
+ if (!cpu_active(cpu)) {
10594
+ /*
10595
+ * A CPU can be paused while it is idle with its tick
10596
+ * stopped. nohz_balance_exit_idle() should be called
10597
+ * from the local CPU, so it can't be called during
10598
+ * pause. This results in the paused CPU participating in
10599
+ * the nohz idle balance, which should be avoided.
10600
+ *
10601
+ * When the paused CPU exits idle and enters again,
10602
+ * exempt the paused CPU from nohz_balance_exit_idle.
10603
+ */
10604
+ nohz_balance_exit_idle(rq);
1048810605 return;
10606
+ }
1048910607
1049010608 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
1049110609 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
....@@ -10598,7 +10716,6 @@
1059810716
1059910717 rq_lock_irqsave(rq, &rf);
1060010718 update_rq_clock(rq);
10601
- cpu_load_update_idle(rq);
1060210719 rq_unlock_irqrestore(rq, &rf);
1060310720
1060410721 if (flags & NOHZ_BALANCE_KICK)
....@@ -10648,22 +10765,14 @@
1064810765 */
1064910766 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
1065010767 {
10651
- int this_cpu = this_rq->cpu;
10652
- unsigned int flags;
10768
+ unsigned int flags = this_rq->nohz_idle_balance;
1065310769
10654
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
10770
+ if (!flags)
1065510771 return false;
1065610772
10657
- if (idle != CPU_IDLE) {
10658
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
10659
- return false;
10660
- }
10773
+ this_rq->nohz_idle_balance = 0;
1066110774
10662
- /*
10663
- * barrier, pairs with nohz_balance_enter_idle(), ensures ...
10664
- */
10665
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
10666
- if (!(flags & NOHZ_KICK_MASK))
10775
+ if (idle != CPU_IDLE)
1066710776 return false;
1066810777
1066910778 _nohz_idle_balance(this_rq, flags, idle);
....@@ -10717,15 +10826,26 @@
1071710826 /*
1071810827 * idle_balance is called by schedule() if this_cpu is about to become
1071910828 * idle. Attempts to pull tasks from other CPUs.
10829
+ *
10830
+ * Returns:
10831
+ * < 0 - we released the lock and there are !fair tasks present
10832
+ * 0 - failed, no new tasks
10833
+ * > 0 - success, new (fair) tasks present
1072010834 */
10721
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
10835
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
1072210836 {
1072310837 unsigned long next_balance = jiffies + HZ;
1072410838 int this_cpu = this_rq->cpu;
1072510839 struct sched_domain *sd;
1072610840 int pulled_task = 0;
1072710841 u64 curr_cost = 0;
10842
+ int done = 0;
1072810843
10844
+ trace_android_rvh_sched_newidle_balance(this_rq, rf, &pulled_task, &done);
10845
+ if (done)
10846
+ return pulled_task;
10847
+
10848
+ update_misfit_status(NULL, this_rq);
1072910849 /*
1073010850 * We must set idle_stamp _before_ calling idle_balance(), such that we
1073110851 * measure the duration of idle_balance() as idle time.
....@@ -10767,9 +10887,6 @@
1076710887 for_each_domain(this_cpu, sd) {
1076810888 int continue_balancing = 1;
1076910889 u64 t0, domain_cost;
10770
-
10771
- if (!(sd->flags & SD_LOAD_BALANCE))
10772
- continue;
1077310890
1077410891 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
1077510892 update_next_balance(sd, &next_balance);
....@@ -10943,7 +11060,7 @@
1094311060 * 'current' within the tree based on its new key value.
1094411061 */
1094511062 swap(curr->vruntime, se->vruntime);
10946
- resched_curr(rq);
11063
+ resched_curr_lazy(rq);
1094711064 }
1094811065
1094911066 se->vruntime -= cfs_rq->min_vruntime;
....@@ -10960,6 +11077,9 @@
1096011077 if (!task_on_rq_queued(p))
1096111078 return;
1096211079
11080
+ if (rq->cfs.nr_running == 1)
11081
+ return;
11082
+
1096311083 /*
1096411084 * Reschedule if we are currently running on this runqueue and
1096511085 * our priority decreased, or if we are not currently running on
....@@ -10967,7 +11087,7 @@
1096711087 */
1096811088 if (rq->curr == p) {
1096911089 if (p->prio > oldprio)
10970
- resched_curr(rq);
11090
+ resched_curr_lazy(rq);
1097111091 } else
1097211092 check_preempt_curr(rq, p, 0);
1097311093 }
....@@ -11038,7 +11158,7 @@
1103811158 /* Catch up with the cfs_rq and remove our load when we leave */
1103911159 update_load_avg(cfs_rq, se, 0);
1104011160 detach_entity_load_avg(cfs_rq, se);
11041
- update_tg_load_avg(cfs_rq, false);
11161
+ update_tg_load_avg(cfs_rq);
1104211162 propagate_entity_cfs_rq(se);
1104311163 }
1104411164
....@@ -11056,8 +11176,8 @@
1105611176
1105711177 /* Synchronize entity with its cfs_rq */
1105811178 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
11059
- attach_entity_load_avg(cfs_rq, se, 0);
11060
- update_tg_load_avg(cfs_rq, false);
11179
+ attach_entity_load_avg(cfs_rq, se);
11180
+ update_tg_load_avg(cfs_rq);
1106111181 propagate_entity_cfs_rq(se);
1106211182 }
1106311183
....@@ -11116,9 +11236,19 @@
1111611236 * This routine is mostly called to set cfs_rq->curr field when a task
1111711237 * migrates between groups/classes.
1111811238 */
11119
-static void set_curr_task_fair(struct rq *rq)
11239
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
1112011240 {
11121
- struct sched_entity *se = &rq->curr->se;
11241
+ struct sched_entity *se = &p->se;
11242
+
11243
+#ifdef CONFIG_SMP
11244
+ if (task_on_rq_queued(p)) {
11245
+ /*
11246
+ * Move the next running task to the front of the list, so our
11247
+ * cfs_tasks list becomes MRU one.
11248
+ */
11249
+ list_move(&se->group_node, &rq->cfs_tasks);
11250
+ }
11251
+#endif
1112211252
1112311253 for_each_sched_entity(se) {
1112411254 struct cfs_rq *cfs_rq = cfs_rq_of(se);
....@@ -11379,8 +11509,8 @@
1137911509 /*
1138011510 * All the scheduling class methods:
1138111511 */
11382
-const struct sched_class fair_sched_class = {
11383
- .next = &idle_sched_class,
11512
+const struct sched_class fair_sched_class
11513
+ __section("__fair_sched_class") = {
1138411514 .enqueue_task = enqueue_task_fair,
1138511515 .dequeue_task = dequeue_task_fair,
1138611516 .yield_task = yield_task_fair,
....@@ -11388,10 +11518,12 @@
1138811518
1138911519 .check_preempt_curr = check_preempt_wakeup,
1139011520
11391
- .pick_next_task = pick_next_task_fair,
11521
+ .pick_next_task = __pick_next_task_fair,
1139211522 .put_prev_task = put_prev_task_fair,
11523
+ .set_next_task = set_next_task_fair,
1139311524
1139411525 #ifdef CONFIG_SMP
11526
+ .balance = balance_fair,
1139511527 .select_task_rq = select_task_rq_fair,
1139611528 .migrate_task_rq = migrate_task_rq_fair,
1139711529
....@@ -11402,7 +11534,6 @@
1140211534 .set_cpus_allowed = set_cpus_allowed_common,
1140311535 #endif
1140411536
11405
- .set_curr_task = set_curr_task_fair,
1140611537 .task_tick = task_tick_fair,
1140711538 .task_fork = task_fork_fair,
1140811539
....@@ -11472,3 +11603,101 @@
1147211603 #endif /* SMP */
1147311604
1147411605 }
11606
+
11607
+/*
11608
+ * Helper functions to facilitate extracting info from tracepoints.
11609
+ */
11610
+
11611
+const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
11612
+{
11613
+#ifdef CONFIG_SMP
11614
+ return cfs_rq ? &cfs_rq->avg : NULL;
11615
+#else
11616
+ return NULL;
11617
+#endif
11618
+}
11619
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
11620
+
11621
+char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
11622
+{
11623
+ if (!cfs_rq) {
11624
+ if (str)
11625
+ strlcpy(str, "(null)", len);
11626
+ else
11627
+ return NULL;
11628
+ }
11629
+
11630
+ cfs_rq_tg_path(cfs_rq, str, len);
11631
+ return str;
11632
+}
11633
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
11634
+
11635
+int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
11636
+{
11637
+ return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
11638
+}
11639
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
11640
+
11641
+const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
11642
+{
11643
+#ifdef CONFIG_SMP
11644
+ return rq ? &rq->avg_rt : NULL;
11645
+#else
11646
+ return NULL;
11647
+#endif
11648
+}
11649
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
11650
+
11651
+const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
11652
+{
11653
+#ifdef CONFIG_SMP
11654
+ return rq ? &rq->avg_dl : NULL;
11655
+#else
11656
+ return NULL;
11657
+#endif
11658
+}
11659
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
11660
+
11661
+const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
11662
+{
11663
+#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
11664
+ return rq ? &rq->avg_irq : NULL;
11665
+#else
11666
+ return NULL;
11667
+#endif
11668
+}
11669
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
11670
+
11671
+int sched_trace_rq_cpu(struct rq *rq)
11672
+{
11673
+ return rq ? cpu_of(rq) : -1;
11674
+}
11675
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
11676
+
11677
+int sched_trace_rq_cpu_capacity(struct rq *rq)
11678
+{
11679
+ return rq ?
11680
+#ifdef CONFIG_SMP
11681
+ rq->cpu_capacity
11682
+#else
11683
+ SCHED_CAPACITY_SCALE
11684
+#endif
11685
+ : -1;
11686
+}
11687
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
11688
+
11689
+const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
11690
+{
11691
+#ifdef CONFIG_SMP
11692
+ return rd ? rd->span : NULL;
11693
+#else
11694
+ return NULL;
11695
+#endif
11696
+}
11697
+EXPORT_SYMBOL_GPL(sched_trace_rd_span);
11698
+
11699
+int sched_trace_rq_nr_running(struct rq *rq)
11700
+{
11701
+ return rq ? rq->nr_running : -1;
11702
+}
11703
+EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
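These helpers exist so that modules attached to the bare scheduler tracepoints can read PELT state without knowing the layout of struct rq or struct cfs_rq. A hypothetical module sketch of that usage; it assumes the pelt_cfs tracepoint and the helper declarations are visible through <trace/events/sched.h> and <linux/sched.h>, which can vary between kernel versions:

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/tracepoint.h>
#include <trace/events/sched.h>

/* Called every time the CFS PELT signal of a cfs_rq is updated. */
static void probe_pelt_cfs(void *data, struct cfs_rq *cfs_rq)
{
	const struct sched_avg *avg = sched_trace_cfs_rq_avg(cfs_rq);
	char path[64];

	if (!avg)
		return;

	sched_trace_cfs_rq_path(cfs_rq, path, sizeof(path));
	pr_debug("cpu%d %s util_avg=%lu\n",
		 sched_trace_cfs_rq_cpu(cfs_rq), path, avg->util_avg);
}

static int __init pelt_probe_init(void)
{
	return register_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
}

static void __exit pelt_probe_exit(void)
{
	unregister_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
	tracepoint_synchronize_unregister();
}

module_init(pelt_probe_init);
module_exit(pelt_probe_exit);
MODULE_LICENSE("GPL");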