2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/kernel/sched/rt.c
@@ -7,8 +7,12 @@
 
 #include "pelt.h"
 
+#include <trace/hooks/sched.h>
+
 int sched_rr_timeslice = RR_TIMESLICE;
 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+/* More than 4 hours if BW_SHIFT equals 20. */
+static const u64 max_rt_runtime = MAX_BW;
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
@@ -45,8 +49,8 @@
 
 	raw_spin_lock_init(&rt_b->rt_runtime_lock);
 
-	hrtimer_init(&rt_b->rt_period_timer,
-			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_REL_HARD);
 	rt_b->rt_period_timer.function = sched_rt_period_timer;
 }
 
@@ -64,7 +68,8 @@
 		 * to update the period.
 		 */
 		hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
-		hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+		hrtimer_start_expires(&rt_b->rt_period_timer,
+				      HRTIMER_MODE_ABS_PINNED_HARD);
 	}
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -267,7 +272,7 @@
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
-	return rq->rt.highest_prio.curr > prev->prio;
+	return rq->online && rq->rt.highest_prio.curr > prev->prio;
 }
 
 static inline int rt_overloaded(struct rq *rq)
@@ -434,7 +439,7 @@
 #endif /* CONFIG_SMP */
 
 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
-static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
 
 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 {
@@ -555,7 +560,7 @@
 	rt_se = rt_rq->tg->rt_se[cpu];
 
 	if (!rt_se) {
-		dequeue_top_rt_rq(rt_rq);
+		dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
 		/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
 		cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
 	}
@@ -641,7 +646,7 @@
 
 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
-	dequeue_top_rt_rq(rt_rq);
+	dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
 }
 
 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -973,6 +978,13 @@
 		if (likely(rt_b->rt_runtime)) {
 			rt_rq->rt_throttled = 1;
 			printk_deferred_once("sched: RT throttling activated\n");
+
+			trace_android_vh_dump_throttled_rt_tasks(
+				raw_smp_processor_id(),
+				rq_clock(rq_of_rt_rq(rt_rq)),
+				sched_rt_period(rt_rq),
+				runtime,
+				hrtimer_get_expires_ns(&rt_b->rt_period_timer));
 		} else {
 			/*
 			 * In case we did anyway, make it go away,
@@ -1019,6 +1031,8 @@
 	curr->se.exec_start = now;
 	cgroup_account_cputime(curr, delta_exec);
 
+	trace_android_vh_sched_stat_runtime_rt(curr, delta_exec);
+
 	if (!rt_bandwidth_enabled())
 		return;
 
@@ -1040,7 +1054,7 @@
 }
 
 static void
-dequeue_top_rt_rq(struct rt_rq *rt_rq)
+dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
 {
 	struct rq *rq = rq_of_rt_rq(rt_rq);
 
@@ -1051,7 +1065,7 @@
 
 	BUG_ON(!rq->nr_running);
 
-	sub_nr_running(rq, rt_rq->rt_nr_running);
+	sub_nr_running(rq, count);
 	rt_rq->rt_queued = 0;
 
 }
@@ -1330,18 +1344,21 @@
 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct sched_rt_entity *back = NULL;
+	unsigned int rt_nr_running;
 
 	for_each_sched_rt_entity(rt_se) {
 		rt_se->back = back;
 		back = rt_se;
 	}
 
-	dequeue_top_rt_rq(rt_rq_of_se(back));
+	rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
 
 	for (rt_se = back; rt_se; rt_se = rt_se->back) {
 		if (on_rt_rq(rt_se))
 			__dequeue_rt_entity(rt_se, flags);
 	}
+
+	dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
 }
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
@@ -1369,6 +1386,27 @@
 	enqueue_top_rt_rq(&rq->rt);
 }
 
+#ifdef CONFIG_SMP
+static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
+					bool sync)
+{
+	/*
+	 * If the waker is CFS, then an RT sync wakeup would preempt the waker
+	 * and force it to run for a likely small time after the RT wakee is
+	 * done. So, only honor RT sync wakeups from RT wakers.
+	 */
+	return sync && task_has_rt_policy(rq->curr) &&
+	       p->prio <= rq->rt.highest_prio.next &&
+	       rq->rt.rt_nr_running <= 2;
+}
+#else
+static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
+					bool sync)
+{
+	return 0;
+}
+#endif
+
 /*
  * Adding/removing a task to/from a priority array:
  */
@@ -1376,23 +1414,21 @@
 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
-
-	schedtune_enqueue_task(p, cpu_of(rq));
+	bool sync = !!(flags & ENQUEUE_WAKEUP_SYNC);
 
 	if (flags & ENQUEUE_WAKEUP)
 		rt_se->timeout = 0;
 
 	enqueue_rt_entity(rt_se, flags);
 
-	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+	if (!task_current(rq, p) && p->nr_cpus_allowed > 1 &&
+	    !should_honor_rt_sync(rq, p, sync))
 		enqueue_pushable_task(rq, p);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
-
-	schedtune_dequeue_task(p, cpu_of(rq));
 
 	update_curr_rt(rq);
 	dequeue_rt_entity(rt_se, flags);
@@ -1437,13 +1473,43 @@
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
+#ifdef CONFIG_RT_SOFTINT_OPTIMIZATION
+/*
+ * Return whether the task on the given cpu is currently non-preemptible
+ * while handling a potentially long softint, or if the task is likely
+ * to block preemptions soon because it is a ksoftirq thread that is
+ * handling slow softints.
+ */
+bool
+task_may_not_preempt(struct task_struct *task, int cpu)
+{
+	__u32 softirqs = per_cpu(active_softirqs, cpu) |
+			 __IRQ_STAT(cpu, __softirq_pending);
+
+	struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu);
+	return ((softirqs & LONG_SOFTIRQ_MASK) &&
+		(task == cpu_ksoftirqd ||
+		 task_thread_info(task)->preempt_count & SOFTIRQ_MASK));
+}
+EXPORT_SYMBOL_GPL(task_may_not_preempt);
+#endif /* CONFIG_RT_SOFTINT_OPTIMIZATION */
+
 static int
-select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
-		  int sibling_count_hint)
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
 	struct task_struct *curr;
 	struct rq *rq;
+	struct rq *this_cpu_rq;
 	bool test;
+	int target_cpu = -1;
+	bool may_not_preempt;
+	bool sync = !!(flags & WF_SYNC);
+	int this_cpu;
+
+	trace_android_rvh_select_task_rq_rt(p, cpu, sd_flag,
+					    flags, &target_cpu);
+	if (target_cpu >= 0)
+		return target_cpu;
 
 	/* For anything but wake ups, just return the task_cpu */
 	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
@@ -1453,9 +1519,16 @@
 
 	rcu_read_lock();
 	curr = READ_ONCE(rq->curr); /* unlocked access */
+	this_cpu = smp_processor_id();
+	this_cpu_rq = cpu_rq(this_cpu);
 
 	/*
-	 * If the current task on @p's runqueue is an RT task, then
+	 * If the current task on @p's runqueue is a softirq task,
+	 * it may run without preemption for a time that is
+	 * ill-suited for a waiting RT task. Therefore, try to
+	 * wake this RT task on another runqueue.
+	 *
+	 * Also, if the current task on @p's runqueue is an RT task, then
 	 * try to see if we can wake this RT task up on another
 	 * runqueue. Otherwise simply start this RT task
 	 * on its current runqueue.
@@ -1480,9 +1553,21 @@
 	 * requirement of the task - which is only important on heterogeneous
 	 * systems like big.LITTLE.
 	 */
-	test = curr &&
-	       unlikely(rt_task(curr)) &&
-	       (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
+	may_not_preempt = task_may_not_preempt(curr, cpu);
+	test = (curr && (may_not_preempt ||
+		(unlikely(rt_task(curr)) &&
+		 (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio))));
+
+	if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE))
+		test |= rockchip_perf_misfit_rt(cpu);
+	/*
+	 * Respect the sync flag as long as the task can run on this CPU.
+	 */
+	if (should_honor_rt_sync(this_cpu_rq, p, sync) &&
+	    cpumask_test_cpu(this_cpu, p->cpus_ptr)) {
+		cpu = this_cpu;
+		goto out_unlock;
+	}
 
 	if (test || !rt_task_fits_capacity(p, cpu)) {
 		int target = find_lowest_rq(p);
@@ -1495,11 +1580,14 @@
 			goto out_unlock;
 
 		/*
-		 * Don't bother moving it if the destination CPU is
+		 * If cpu is non-preemptible, prefer remote cpu
+		 * even if it's running a higher-prio task.
+		 * Otherwise: Don't bother moving it if the destination CPU is
 		 * not running a lower priority task.
 		 */
 		if (target != -1 &&
-		    p->prio < cpu_rq(target)->rt.highest_prio.curr)
+		    (may_not_preempt ||
+		     p->prio < cpu_rq(target)->rt.highest_prio.curr))
 			cpu = target;
 	}
 
@@ -1537,6 +1625,26 @@
 	resched_curr(rq);
 }
 
+static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+{
+	if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
+		int done = 0;
+
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we've
+		 * not yet started the picking loop.
+		 */
+		rq_unpin_lock(rq, rf);
+		trace_android_rvh_sched_balance_rt(rq, p, &done);
+		if (!done)
+			pull_rt_task(rq);
+		rq_repin_lock(rq, rf);
+	}
+
+	return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
+}
 #endif /* CONFIG_SMP */
 
 /*
@@ -1567,6 +1675,27 @@
 #endif
 }
 
+static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
+{
+	p->se.exec_start = rq_clock_task(rq);
+
+	/* The running task is never eligible for pushing */
+	dequeue_pushable_task(rq, p);
+
+	if (!first)
+		return;
+
+	/*
+	 * If prev task was rt, put_prev_task() has already updated the
+	 * utilization. We only care of the case where we start to schedule a
+	 * rt task
+	 */
+	if (rq->curr->sched_class != &rt_sched_class)
+		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+	rt_queue_push_tasks(rq);
+}
+
 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 						   struct rt_rq *rt_rq)
 {
@@ -1587,7 +1716,6 @@
 static struct task_struct *_pick_next_task_rt(struct rq *rq)
 {
 	struct sched_rt_entity *rt_se;
-	struct task_struct *p;
 	struct rt_rq *rt_rq = &rq->rt;
 
 	do {
@@ -1596,65 +1724,18 @@
 		rt_rq = group_rt_rq(rt_se);
 	} while (rt_rq);
 
-	p = rt_task_of(rt_se);
-	p->se.exec_start = rq_clock_task(rq);
-
-	return p;
+	return rt_task_of(rt_se);
 }
 
-static struct task_struct *
-pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_rt(struct rq *rq)
 {
 	struct task_struct *p;
-	struct rt_rq *rt_rq = &rq->rt;
 
-	if (need_pull_rt_task(rq, prev)) {
-		/*
-		 * This is OK, because current is on_cpu, which avoids it being
-		 * picked for load-balance and preemption/IRQs are still
-		 * disabled avoiding further scheduler activity on it and we're
-		 * being very careful to re-start the picking loop.
-		 */
-		rq_unpin_lock(rq, rf);
-		pull_rt_task(rq);
-		rq_repin_lock(rq, rf);
-		/*
-		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
-		 * means a dl or stop task can slip in, in which case we need
-		 * to re-start task selection.
-		 */
-		if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
-			     rq->dl.dl_nr_running))
-			return RETRY_TASK;
-	}
-
-	/*
-	 * We may dequeue prev's rt_rq in put_prev_task().
-	 * So, we update time before rt_nr_running check.
-	 */
-	if (prev->sched_class == &rt_sched_class)
-		update_curr_rt(rq);
-
-	if (!rt_rq->rt_queued)
+	if (!sched_rt_runnable(rq))
 		return NULL;
 
-	put_prev_task(rq, prev);
-
 	p = _pick_next_task_rt(rq);
-
-	/* The running task is never eligible for pushing */
-	dequeue_pushable_task(rq, p);
-
-	rt_queue_push_tasks(rq);
-
-	/*
-	 * If prev task was rt, put_prev_task() has already updated the
-	 * utilization. We only care of the case where we start to schedule a
-	 * rt task
-	 */
-	if (rq->curr->sched_class != &rt_sched_class)
-		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
-
+	set_next_task_rt(rq, p, true);
 	return p;
 }
 
@@ -1680,7 +1761,7 @@
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, &p->cpus_allowed))
+	    cpumask_test_cpu(cpu, &p->cpus_mask))
 		return 1;
 
 	return 0;
@@ -1690,7 +1771,7 @@
  * Return the highest pushable rq's task, which is suitable to be executed
  * on the CPU, NULL otherwise
  */
-static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
+struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
 {
 	struct plist_head *head = &rq->rt.pushable_tasks;
 	struct task_struct *p;
@@ -1705,6 +1786,7 @@
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(pick_highest_pushable_task);
 
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
@@ -1713,7 +1795,7 @@
 	struct sched_domain *sd;
 	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
 	int this_cpu = smp_processor_id();
-	int cpu = task_cpu(task);
+	int cpu = -1;
 	int ret;
 
 	/* Make sure the mask is initialized first */
@@ -1738,9 +1820,17 @@
 				  task, lowest_mask);
 	}
 
+	trace_android_rvh_find_lowest_rq(task, lowest_mask, ret, &cpu);
+	if (cpu >= 0)
+		return cpu;
+
 	if (!ret)
 		return -1; /* No targets found */
 
+	cpu = task_cpu(task);
+
+	if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE))
+		cpu = rockchip_perf_select_rt_cpu(cpu, lowest_mask);
 	/*
 	 * At this point we have built a mask of CPUs representing the
 	 * lowest priority tasks in the system. Now we want to elect
@@ -1774,8 +1864,8 @@
 				return this_cpu;
 			}
 
-			best_cpu = cpumask_first_and(lowest_mask,
-						     sched_domain_span(sd));
+			best_cpu = cpumask_any_and_distribute(lowest_mask,
+							      sched_domain_span(sd));
 			if (best_cpu < nr_cpu_ids) {
 				rcu_read_unlock();
 				return best_cpu;
@@ -1792,7 +1882,7 @@
 	if (this_cpu != -1)
 		return this_cpu;
 
-	cpu = cpumask_any(lowest_mask);
+	cpu = cpumask_any_distribute(lowest_mask);
 	if (cpu < nr_cpu_ids)
 		return cpu;
 
@@ -1833,7 +1923,7 @@
 			 * Also make sure that it wasn't scheduled on its rq.
 			 */
 			if (unlikely(task_rq(task) != rq ||
-				     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
+				     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
 				     task_running(rq, task) ||
 				     !rt_task(task) ||
 				     !task_on_rq_queued(task))) {
@@ -1881,7 +1971,7 @@
  * running task can migrate over to a CPU that is running a task
  * of lesser priority.
  */
-static int push_rt_task(struct rq *rq)
+static int push_rt_task(struct rq *rq, bool pull)
 {
 	struct task_struct *next_task;
 	struct rq *lowest_rq;
@@ -1895,10 +1985,41 @@
 		return 0;
 
 retry:
-	if (unlikely(next_task == rq->curr)) {
-		WARN_ON(1);
+	if (is_migration_disabled(next_task)) {
+		struct task_struct *push_task = NULL;
+		int cpu;
+
+		if (!pull)
+			return 0;
+
+		trace_sched_migrate_pull_tp(next_task);
+
+		if (rq->push_busy)
+			return 0;
+
+		cpu = find_lowest_rq(rq->curr);
+		if (cpu == -1 || cpu == rq->cpu)
+			return 0;
+
+		/*
+		 * Given we found a CPU with lower priority than @next_task,
+		 * therefore it should be running. However we cannot migrate it
+		 * to this other CPU, instead attempt to push the current
+		 * running task on this CPU away.
+		 */
+		push_task = get_push_task(rq);
+		if (push_task) {
+			raw_spin_unlock(&rq->lock);
+			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+					    push_task, &rq->push_work);
+			raw_spin_lock(&rq->lock);
+		}
+
 		return 0;
 	}
+
+	if (WARN_ON(next_task == rq->curr))
+		return 0;
 
 	/*
 	 * It's possible that the next_task slipped in of
@@ -1951,12 +2072,10 @@
 	deactivate_task(rq, next_task, 0);
 	set_task_cpu(next_task, lowest_rq->cpu);
 	activate_task(lowest_rq, next_task, 0);
+	resched_curr(lowest_rq);
 	ret = 1;
 
-	resched_curr(lowest_rq);
-
 	double_unlock_balance(rq, lowest_rq);
-
 out:
 	put_task_struct(next_task);
 
@@ -1966,7 +2085,7 @@
 static void push_rt_tasks(struct rq *rq)
 {
 	/* push_rt_task will return true if it moved an RT */
-	while (push_rt_task(rq))
+	while (push_rt_task(rq, false))
 		;
 }
 
@@ -2119,7 +2238,8 @@
 	 */
 	if (has_pushable_tasks(rq)) {
 		raw_spin_lock(&rq->lock);
-		push_rt_tasks(rq);
+		while (push_rt_task(rq, true))
+			;
 		raw_spin_unlock(&rq->lock);
 	}
 
@@ -2144,7 +2264,7 @@
 {
 	int this_cpu = this_rq->cpu, cpu;
 	bool resched = false;
-	struct task_struct *p;
+	struct task_struct *p, *push_task;
 	struct rq *src_rq;
 	int rt_overload_count = rt_overloaded(this_rq);
 
@@ -2191,6 +2311,7 @@
 		 * double_lock_balance, and another CPU could
 		 * alter this_rq
 		 */
+		push_task = NULL;
 		double_lock_balance(this_rq, src_rq);
 
 		/*
@@ -2218,11 +2339,15 @@
 			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
-			resched = true;
-
-			deactivate_task(src_rq, p, 0);
-			set_task_cpu(p, this_cpu);
-			activate_task(this_rq, p, 0);
+			if (is_migration_disabled(p)) {
+				trace_sched_migrate_pull_tp(p);
+				push_task = get_push_task(src_rq);
+			} else {
+				deactivate_task(src_rq, p, 0);
+				set_task_cpu(p, this_cpu);
+				activate_task(this_rq, p, 0);
+				resched = true;
+			}
 			/*
 			 * We continue with the search, just in
 			 * case there's an even higher prio task
@@ -2232,6 +2357,13 @@
 		}
 skip:
 		double_unlock_balance(this_rq, src_rq);
+
+		if (push_task) {
+			raw_spin_unlock(&this_rq->lock);
+			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+					    push_task, &src_rq->push_work);
+			raw_spin_lock(&this_rq->lock);
+		}
 	}
 
 	if (resched)
@@ -2315,13 +2447,20 @@
 static void switched_to_rt(struct rq *rq, struct task_struct *p)
 {
 	/*
-	 * If we are already running, then there's nothing
-	 * that needs to be done. But if we are not running
-	 * we may need to preempt the current running task.
-	 * If that current running task is also an RT task
+	 * If we are running, update the avg_rt tracking, as the running time
+	 * will now on be accounted into the latter.
+	 */
+	if (task_current(rq, p)) {
+		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+		return;
+	}
+
+	/*
+	 * If we are not running we may need to preempt the current
+	 * running task. If that current running task is also an RT task
 	 * then see if we can move to another run queue.
 	 */
-	if (task_on_rq_queued(p) && rq->curr != p) {
+	if (task_on_rq_queued(p)) {
 #ifdef CONFIG_SMP
 		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
 			rt_queue_push_tasks(rq);
@@ -2390,8 +2529,10 @@
 		}
 
 		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
-		if (p->rt.timeout > next)
-			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
+		if (p->rt.timeout > next) {
+			posix_cputimers_rt_watchdog(&p->posix_cputimers,
+						    p->se.sum_exec_runtime);
+		}
 	}
 }
 #else
@@ -2440,16 +2581,6 @@
 	}
 }
 
-static void set_curr_task_rt(struct rq *rq)
-{
-	struct task_struct *p = rq->curr;
-
-	p->se.exec_start = rq_clock_task(rq);
-
-	/* The running task is never eligible for pushing */
-	dequeue_pushable_task(rq, p);
-}
-
 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 {
 	/*
@@ -2461,8 +2592,8 @@
 	return 0;
 }
 
-const struct sched_class rt_sched_class = {
-	.next			= &fair_sched_class,
+const struct sched_class rt_sched_class
+	__section("__rt_sched_class") = {
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
@@ -2471,18 +2602,19 @@
 
 	.pick_next_task		= pick_next_task_rt,
 	.put_prev_task		= put_prev_task_rt,
+	.set_next_task		= set_next_task_rt,
 
 #ifdef CONFIG_SMP
+	.balance		= balance_rt,
 	.select_task_rq		= select_task_rq_rt,
-
 	.set_cpus_allowed	= set_cpus_allowed_common,
 	.rq_online		= rq_online_rt,
 	.rq_offline		= rq_offline_rt,
 	.task_woken		= task_woken_rt,
 	.switched_from		= switched_from_rt,
+	.find_lock_rq		= find_lock_lowest_rq,
 #endif
 
-	.set_curr_task		= set_curr_task_rt,
 	.task_tick		= task_tick_rt,
 
 	.get_rr_interval	= get_rr_interval_rt,
@@ -2503,10 +2635,11 @@
  */
 static DEFINE_MUTEX(rt_constraints_mutex);
 
-/* Must be called with tasklist_lock held */
 static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-	struct task_struct *g, *p;
+	struct task_struct *task;
+	struct css_task_iter it;
+	int ret = 0;
 
 	/*
 	 * Autogroups do not have RT tasks; see autogroup_create().
@@ -2514,12 +2647,12 @@
 	if (task_group_is_autogroup(tg))
 		return 0;
 
-	for_each_process_thread(g, p) {
-		if (rt_task(p) && task_group(p) == tg)
-			return 1;
-	}
+	css_task_iter_start(&tg->css, 0, &it);
+	while (!ret && (task = css_task_iter_next(&it)))
+		ret |= rt_task(task);
+	css_task_iter_end(&it);
 
-	return 0;
+	return ret;
 }
 
 struct rt_schedulable_data {
@@ -2550,9 +2683,10 @@
 		return -EINVAL;
 
 	/*
-	 * Ensure we don't starve existing RT tasks.
+	 * Ensure we don't starve existing RT tasks if runtime turns zero.
 	 */
-	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+	if (rt_bandwidth_enabled() && !runtime &&
+	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
 		return -EBUSY;
 
 	total = to_ratio(period, runtime);
@@ -2617,8 +2751,13 @@
 	if (rt_period == 0)
 		return -EINVAL;
 
+	/*
+	 * Bound quota to defend quota against overflow during bandwidth shift.
+	 */
+	if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
+		return -EINVAL;
+
 	mutex_lock(&rt_constraints_mutex);
-	read_lock(&tasklist_lock);
 	err = __rt_schedulable(tg, rt_period, rt_runtime);
 	if (err)
 		goto unlock;
@@ -2636,7 +2775,6 @@
 	}
 	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 unlock:
-	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return err;
26952833 int ret = 0;
26962834
26972835 mutex_lock(&rt_constraints_mutex);
2698
- read_lock(&tasklist_lock);
26992836 ret = __rt_schedulable(NULL, 0, 0);
2700
- read_unlock(&tasklist_lock);
27012837 mutex_unlock(&rt_constraints_mutex);
27022838
27032839 return ret;
@@ -2738,7 +2874,9 @@
 		return -EINVAL;
 
 	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
-		(sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+		((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
+		 ((u64)sysctl_sched_rt_runtime *
+			NSEC_PER_USEC > max_rt_runtime)))
 		return -EINVAL;
 
 	return 0;
@@ -2754,9 +2892,8 @@
 	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
 }
 
-int sched_rt_handler(struct ctl_table *table, int write,
-		     void __user *buffer, size_t *lenp,
-		     loff_t *ppos)
+int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+		size_t *lenp, loff_t *ppos)
 {
 	int old_period, old_runtime;
 	static DEFINE_MUTEX(mutex);
@@ -2794,9 +2931,8 @@
 	return ret;
 }
 
-int sched_rr_handler(struct ctl_table *table, int write,
-		     void __user *buffer, size_t *lenp,
-		     loff_t *ppos)
+int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+		size_t *lenp, loff_t *ppos)
 {
 	int ret;
 	static DEFINE_MUTEX(mutex);