hc
2024-05-10 61598093bbdd283a7edc367d900f223070ead8d2
kernel/kernel/sched/deadline.c
@@ -17,6 +17,7 @@
  */
 #include "sched.h"
 #include "pelt.h"
+#include <linux/cpuset.h>
 
 struct dl_bandwidth def_dl_bandwidth;
 
@@ -43,6 +44,28 @@
 	return !RB_EMPTY_NODE(&dl_se->rb_node);
 }
 
+#ifdef CONFIG_RT_MUTEXES
+static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
+{
+	return dl_se->pi_se;
+}
+
+static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
+{
+	return pi_of(dl_se) != dl_se;
+}
+#else
+static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
+{
+	return dl_se;
+}
+
+static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
+{
+	return false;
+}
+#endif
+
 #ifdef CONFIG_SMP
 static inline struct dl_bw *dl_bw_of(int i)
 {
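A minimal usage sketch (illustrative only, not part of the patch): replenishment paths read the inherited parameters through pi_of(); with CONFIG_RT_MUTEXES=n the helper degenerates to the entity itself, so the same code falls back to the task's own parameters.

/* Sketch: refill dl_se from the (possibly PI-donated) parameters. */
static void sketch_refill(struct sched_dl_entity *dl_se, u64 now)
{
	dl_se->deadline = now + pi_of(dl_se)->dl_deadline;
	dl_se->runtime  = pi_of(dl_se)->dl_runtime;
}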
@@ -54,14 +77,48 @@
 static inline int dl_bw_cpus(int i)
 {
 	struct root_domain *rd = cpu_rq(i)->rd;
-	int cpus = 0;
+	int cpus;
 
 	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
			 "sched RCU must be held");
+
+	if (cpumask_subset(rd->span, cpu_active_mask))
+		return cpumask_weight(rd->span);
+
+	cpus = 0;
+
 	for_each_cpu_and(i, rd->span, cpu_active_mask)
 		cpus++;
 
 	return cpus;
+}
+
+static inline unsigned long __dl_bw_capacity(int i)
+{
+	struct root_domain *rd = cpu_rq(i)->rd;
+	unsigned long cap = 0;
+
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+			 "sched RCU must be held");
+
+	for_each_cpu_and(i, rd->span, cpu_active_mask)
+		cap += capacity_orig_of(i);
+
+	return cap;
+}
+
+/*
+ * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity
+ * of the CPU the task is running on rather rd's \Sum CPU capacity.
+ */
+static inline unsigned long dl_bw_capacity(int i)
+{
+	if (!static_branch_unlikely(&sched_asym_cpucapacity) &&
+	    capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
+		return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
+	} else {
+		return __dl_bw_capacity(i);
+	}
 }
 #else
 static inline struct dl_bw *dl_bw_of(int i)
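A worked example to make the fast path concrete (assumes a symmetric 4-CPU root domain with every CPU at SCHED_CAPACITY_SCALE == 1024; not part of the patch):

/* Sketch: on such a system both branches of dl_bw_capacity() agree:
 *   fast path: dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT = 4 << 10 = 4096
 *   slow path: __dl_bw_capacity(i) = 1024 + 1024 + 1024 + 1024 = 4096
 * On asymmetric (big.LITTLE) systems only the per-CPU sum is correct,
 * which is why admission control now uses capacity instead of a CPU count. */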
@@ -72,6 +129,11 @@
 static inline int dl_bw_cpus(int i)
 {
 	return 1;
+}
+
+static inline unsigned long dl_bw_capacity(int i)
+{
+	return SCHED_CAPACITY_SCALE;
 }
 #endif
 
@@ -153,7 +215,7 @@
 	__sub_running_bw(dl_se->dl_bw, dl_rq);
 }
 
-void dl_change_utilization(struct task_struct *p, u64 new_bw)
+static void dl_change_utilization(struct task_struct *p, u64 new_bw)
 {
 	struct rq *rq;
 
@@ -287,7 +349,7 @@
 
 	dl_se->dl_non_contending = 1;
 	get_task_struct(p);
-	hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
+	hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
 }
 
 static void task_contending(struct sched_dl_entity *dl_se, int flags)
@@ -333,6 +395,8 @@
 
 	return dl_rq->root.rb_leftmost == &dl_se->rb_node;
 }
+
+static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
 
 void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
 {
@@ -539,7 +603,7 @@
 		 * If we cannot preempt any rq, fall back to pick any
 		 * online CPU:
 		 */
-		cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
+		cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
 		if (cpu >= nr_cpu_ids) {
 			/*
 			 * Failed to find any suitable CPU.
@@ -657,7 +721,7 @@
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
 
-	WARN_ON(dl_se->dl_boosted);
+	WARN_ON(is_dl_boosted(dl_se));
 	WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
 
 	/*
@@ -695,21 +759,20 @@
  * could happen are, typically, a entity voluntarily trying to overcome its
  * runtime, or it just underestimated it during sched_setattr().
  */
-static void replenish_dl_entity(struct sched_dl_entity *dl_se,
-				struct sched_dl_entity *pi_se)
+static void replenish_dl_entity(struct sched_dl_entity *dl_se)
 {
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
 
-	BUG_ON(pi_se->dl_runtime <= 0);
+	BUG_ON(pi_of(dl_se)->dl_runtime <= 0);
 
 	/*
 	 * This could be the case for a !-dl task that is boosted.
 	 * Just go with full inherited parameters.
 	 */
 	if (dl_se->dl_deadline == 0) {
-		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
-		dl_se->runtime = pi_se->dl_runtime;
+		dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+		dl_se->runtime = pi_of(dl_se)->dl_runtime;
 	}
 
 	if (dl_se->dl_yielded && dl_se->runtime > 0)
@@ -722,8 +785,8 @@
 	 * arbitrary large.
 	 */
 	while (dl_se->runtime <= 0) {
-		dl_se->deadline += pi_se->dl_period;
-		dl_se->runtime += pi_se->dl_runtime;
+		dl_se->deadline += pi_of(dl_se)->dl_period;
+		dl_se->runtime += pi_of(dl_se)->dl_runtime;
 	}
 
 	/*
@@ -737,8 +800,8 @@
 	 */
 	if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
 		printk_deferred_once("sched: DL replenish lagged too much\n");
-		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
-		dl_se->runtime = pi_se->dl_runtime;
+		dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+		dl_se->runtime = pi_of(dl_se)->dl_runtime;
 	}
 
 	if (dl_se->dl_yielded)
@@ -759,7 +822,7 @@
  * refill the runtime and set the deadline a period in the future,
  * because keeping the current (absolute) deadline of the task would
  * result in breaking guarantees promised to other tasks (refer to
- * Documentation/scheduler/sched-deadline.txt for more informations).
+ * Documentation/scheduler/sched-deadline.rst for more information).
  *
  * This function returns true if:
 *
@@ -771,8 +834,7 @@
  * task with deadline equal to period this is the same of using
  * dl_period instead of dl_deadline in the equation above.
  */
-static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
-			       struct sched_dl_entity *pi_se, u64 t)
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)
 {
 	u64 left, right;
 
@@ -794,9 +856,9 @@
 	 * of anything below microseconds resolution is actually fiction
 	 * (but still we want to give the user that illusion >;).
 	 */
-	left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+	left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
 	right = ((dl_se->deadline - t) >> DL_SCALE) *
-		(pi_se->dl_runtime >> DL_SCALE);
+		(pi_of(dl_se)->dl_runtime >> DL_SCALE);
 
 	return dl_time_before(right, left);
 }
@@ -881,24 +943,23 @@
  * Please refer to the comments update_dl_revised_wakeup() function to find
  * more about the Revised CBS rule.
  */
-static void update_dl_entity(struct sched_dl_entity *dl_se,
-			     struct sched_dl_entity *pi_se)
+static void update_dl_entity(struct sched_dl_entity *dl_se)
 {
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
 
 	if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
-	    dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+	    dl_entity_overflow(dl_se, rq_clock(rq))) {
 
 		if (unlikely(!dl_is_implicit(dl_se) &&
			     !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
-			     !dl_se->dl_boosted)){
+			     !is_dl_boosted(dl_se))) {
 			update_dl_revised_wakeup(dl_se, rq);
 			return;
 		}
 
-		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
-		dl_se->runtime = pi_se->dl_runtime;
+		dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+		dl_se->runtime = pi_of(dl_se)->dl_runtime;
 	}
 }
 
@@ -956,7 +1017,7 @@
 	 */
 	if (!hrtimer_is_queued(timer)) {
 		get_task_struct(p);
-		hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+		hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
 	}
 
 	return 1;
@@ -997,7 +1058,7 @@
 	 * The task might have been boosted by someone else and might be in the
 	 * boosting/deboosting path, its not throttled.
 	 */
-	if (dl_se->dl_boosted)
+	if (is_dl_boosted(dl_se))
 		goto unlock;
 
 	/*
@@ -1025,7 +1086,7 @@
 	 * but do not enqueue -- wait for our wakeup to do that.
 	 */
 	if (!task_on_rq_queued(p)) {
-		replenish_dl_entity(dl_se, dl_se);
+		replenish_dl_entity(dl_se);
 		goto unlock;
 	}
 
@@ -1086,7 +1147,7 @@
 {
 	struct hrtimer *timer = &dl_se->dl_timer;
 
-	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 	timer->function = dl_task_timer;
 }
 
@@ -1096,7 +1157,7 @@
  * cannot use the runtime, and so it replenishes the task. This rule
  * works fine for implicit deadline tasks (deadline == period), and the
  * CBS was designed for implicit deadline tasks. However, a task with
- * constrained deadline (deadine < period) might be awakened after the
+ * constrained deadline (deadline < period) might be awakened after the
 * deadline, but before the next period. In this case, replenishing the
 * task would allow it to run for runtime / deadline. As in this case
 * deadline < period, CBS enables a task to run for more than the
@@ -1115,7 +1176,7 @@
 
 	if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
	    dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
-		if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
 			return;
 		dl_se->dl_throttled = 1;
 		if (dl_se->runtime > 0)
@@ -1228,7 +1289,7 @@
						 &curr->dl);
 	} else {
 		unsigned long scale_freq = arch_scale_freq_capacity(cpu);
-		unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+		unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
 
 		scaled_delta_exec = cap_scale(delta_exec, scale_freq);
 		scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
@@ -1246,7 +1307,7 @@
 			dl_se->dl_overrun = 1;
 
 		__dequeue_task_dl(rq, curr, 0);
-		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
+		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
 			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
 
 		if (!is_leftmost(curr, &rq->dl))
@@ -1325,7 +1386,7 @@
 {
 	struct hrtimer *timer = &dl_se->inactive_timer;
 
-	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 	timer->function = inactive_task_timer;
 }
 
@@ -1440,8 +1501,7 @@
 }
 
 static void
-enqueue_dl_entity(struct sched_dl_entity *dl_se,
-		  struct sched_dl_entity *pi_se, int flags)
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 {
 	BUG_ON(on_dl_rq(dl_se));
 
@@ -1452,9 +1512,9 @@
 	 */
 	if (flags & ENQUEUE_WAKEUP) {
 		task_contending(dl_se, flags);
-		update_dl_entity(dl_se, pi_se);
+		update_dl_entity(dl_se);
 	} else if (flags & ENQUEUE_REPLENISH) {
-		replenish_dl_entity(dl_se, pi_se);
+		replenish_dl_entity(dl_se);
 	} else if ((flags & ENQUEUE_RESTORE) &&
		  dl_time_before(dl_se->deadline,
				 rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
@@ -1471,28 +1531,43 @@
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
-	struct task_struct *pi_task = rt_mutex_get_top_task(p);
-	struct sched_dl_entity *pi_se = &p->dl;
-
-	/*
-	 * Use the scheduling parameters of the top pi-waiter task if:
-	 * - we have a top pi-waiter which is a SCHED_DEADLINE task AND
-	 * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
-	 *   smaller than our deadline OR we are a !SCHED_DEADLINE task getting
-	 *   boosted due to a SCHED_DEADLINE pi-waiter).
-	 * Otherwise we keep our runtime and deadline.
-	 */
-	if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
-		pi_se = &pi_task->dl;
+	if (is_dl_boosted(&p->dl)) {
+		/*
+		 * Because of delays in the detection of the overrun of a
+		 * thread's runtime, it might be the case that a thread
+		 * goes to sleep in a rt mutex with negative runtime. As
+		 * a consequence, the thread will be throttled.
+		 *
+		 * While waiting for the mutex, this thread can also be
+		 * boosted via PI, resulting in a thread that is throttled
+		 * and boosted at the same time.
+		 *
+		 * In this case, the boost overrides the throttle.
+		 */
+		if (p->dl.dl_throttled) {
+			/*
+			 * The replenish timer needs to be canceled. No
+			 * problem if it fires concurrently: boosted threads
+			 * are ignored in dl_task_timer().
+			 */
+			hrtimer_try_to_cancel(&p->dl.dl_timer);
+			p->dl.dl_throttled = 0;
+		}
 	} else if (!dl_prio(p->normal_prio)) {
 		/*
-		 * Special case in which we have a !SCHED_DEADLINE task
-		 * that is going to be deboosted, but exceeds its
-		 * runtime while doing so. No point in replenishing
-		 * it, as it's going to return back to its original
-		 * scheduling class after this.
+		 * Special case in which we have a !SCHED_DEADLINE task that is going
+		 * to be deboosted, but exceeds its runtime while doing so. No point in
+		 * replenishing it, as it's going to return back to its original
+		 * scheduling class after this. If it has been throttled, we need to
+		 * clear the flag, otherwise the task may wake up as throttled after
+		 * being boosted again with no means to replenish the runtime and clear
+		 * the throttle.
		 */
-		BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+		p->dl.dl_throttled = 0;
+		if (!(flags & ENQUEUE_REPLENISH))
+			printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n",
+					     task_pid_nr(p));
+
 		return;
 	}
 
@@ -1529,7 +1604,7 @@
 		return;
 	}
 
-	enqueue_dl_entity(&p->dl, pi_se, flags);
+	enqueue_dl_entity(&p->dl, flags);
 
 	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_dl_task(rq, p);
@@ -1599,10 +1674,10 @@
 static int find_later_rq(struct task_struct *task);
 
 static int
-select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags,
-		  int sibling_count_hint)
+select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
 	struct task_struct *curr;
+	bool select_rq;
 	struct rq *rq;
 
 	if (sd_flag != SD_BALANCE_WAKE)
@@ -1622,10 +1697,19 @@
 	 * other hand, if it has a shorter deadline, we
 	 * try to make it stay here, it might be important.
 	 */
-	if (unlikely(dl_task(curr)) &&
-	    (curr->nr_cpus_allowed < 2 ||
-	     !dl_entity_preempt(&p->dl, &curr->dl)) &&
-	    (p->nr_cpus_allowed > 1)) {
+	select_rq = unlikely(dl_task(curr)) &&
+		    (curr->nr_cpus_allowed < 2 ||
+		     !dl_entity_preempt(&p->dl, &curr->dl)) &&
+		    p->nr_cpus_allowed > 1;
+
+	/*
+	 * Take the capacity of the CPU into account to
+	 * ensure it fits the requirement of the task.
+	 */
+	if (static_branch_unlikely(&sched_asym_cpucapacity))
+		select_rq |= !dl_task_fits_capacity(p, cpu);
+
+	if (select_rq) {
 		int target = find_later_rq(p);
 
 		if (target != -1 &&
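dl_task_fits_capacity() comes from the capacity-awareness work and lives outside this file; roughly, a task fits a CPU when its bandwidth still holds after scaling the deadline by that CPU's capacity. A sketch of the check under that assumption (not copied from this patch):

/* Sketch: cap_scale(x, cap) ~= x * cap / SCHED_CAPACITY_SCALE, so a CPU at
 * half capacity must satisfy deadline/2 >= runtime for the task to "fit". */
static bool sketch_dl_fits(struct task_struct *p, int cpu)
{
	unsigned long cap = arch_scale_cpu_capacity(cpu);

	return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime;
}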
@@ -1693,6 +1777,22 @@
 	resched_curr(rq);
 }
 
+static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+{
+	if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we've
+		 * not yet started the picking loop.
+		 */
+		rq_unpin_lock(rq, rf);
+		pull_dl_task(rq);
+		rq_repin_lock(rq, rf);
+	}
+
+	return sched_stop_runnable(rq) || sched_dl_runnable(rq);
+}
 #endif /* CONFIG_SMP */
 
 /*
@@ -1729,8 +1829,26 @@
 }
 #endif
 
-static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
-						   struct dl_rq *dl_rq)
+static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
+{
+	p->se.exec_start = rq_clock_task(rq);
+
+	/* You can't push away the running task */
+	dequeue_pushable_dl_task(rq, p);
+
+	if (!first)
+		return;
+
+	if (hrtick_enabled(rq))
+		start_hrtick_dl(rq, p);
+
+	if (rq->curr->sched_class != &dl_sched_class)
+		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+	deadline_queue_push_tasks(rq);
+}
+
+static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
 {
 	struct rb_node *left = rb_first_cached(&dl_rq->root);
 
@@ -1740,63 +1858,19 @@
 	return rb_entry(left, struct sched_dl_entity, rb_node);
 }
 
-static struct task_struct *
-pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_dl(struct rq *rq)
 {
 	struct sched_dl_entity *dl_se;
+	struct dl_rq *dl_rq = &rq->dl;
 	struct task_struct *p;
-	struct dl_rq *dl_rq;
 
-	dl_rq = &rq->dl;
-
-	if (need_pull_dl_task(rq, prev)) {
-		/*
-		 * This is OK, because current is on_cpu, which avoids it being
-		 * picked for load-balance and preemption/IRQs are still
-		 * disabled avoiding further scheduler activity on it and we're
-		 * being very careful to re-start the picking loop.
-		 */
-		rq_unpin_lock(rq, rf);
-		pull_dl_task(rq);
-		rq_repin_lock(rq, rf);
-		/*
-		 * pull_dl_task() can drop (and re-acquire) rq->lock; this
-		 * means a stop task can slip in, in which case we need to
-		 * re-start task selection.
-		 */
-		if (rq->stop && task_on_rq_queued(rq->stop))
-			return RETRY_TASK;
-	}
-
-	/*
-	 * When prev is DL, we may throttle it in put_prev_task().
-	 * So, we update time before we check for dl_nr_running.
-	 */
-	if (prev->sched_class == &dl_sched_class)
-		update_curr_dl(rq);
-
-	if (unlikely(!dl_rq->dl_nr_running))
+	if (!sched_dl_runnable(rq))
 		return NULL;
 
-	put_prev_task(rq, prev);
-
-	dl_se = pick_next_dl_entity(rq, dl_rq);
+	dl_se = pick_next_dl_entity(dl_rq);
 	BUG_ON(!dl_se);
-
 	p = dl_task_of(dl_se);
-	p->se.exec_start = rq_clock_task(rq);
-
-	/* Running task will never be pushed. */
-	dequeue_pushable_dl_task(rq, p);
-
-	if (hrtick_enabled(rq))
-		start_hrtick_dl(rq, p);
-
-	deadline_queue_push_tasks(rq);
-
-	if (rq->curr->sched_class != &dl_sched_class)
-		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
-
+	set_next_task_dl(rq, p, true);
 	return p;
 }
 
@@ -1840,16 +1914,6 @@
 	 */
 }
 
-static void set_curr_task_dl(struct rq *rq)
-{
-	struct task_struct *p = rq->curr;
-
-	p->se.exec_start = rq_clock_task(rq);
-
-	/* You can't push away the running task */
-	dequeue_pushable_dl_task(rq, p);
-}
-
 #ifdef CONFIG_SMP
 
 /* Only try algorithms three times */
@@ -1858,7 +1922,7 @@
 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    cpumask_test_cpu(cpu, &p->cpus_allowed))
+	    cpumask_test_cpu(cpu, p->cpus_ptr))
 		return 1;
 	return 0;
 }
@@ -2008,7 +2072,7 @@
 		/* Retry if something changed. */
 		if (double_lock_balance(rq, later_rq)) {
 			if (unlikely(task_rq(task) != rq ||
-				     !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
+				     !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
				     task_running(rq, task) ||
				     !dl_task(task) ||
				     !task_on_rq_queued(task))) {
@@ -2075,10 +2139,8 @@
 		return 0;
 
 retry:
-	if (unlikely(next_task == rq->curr)) {
-		WARN_ON(1);
+	if (WARN_ON(next_task == rq->curr))
 		return 0;
-	}
 
 	/*
 	 * If next_task preempts rq->curr, and rq->curr
@@ -2124,17 +2186,13 @@
 	}
 
 	deactivate_task(rq, next_task, 0);
-	sub_running_bw(&next_task->dl, &rq->dl);
-	sub_rq_bw(&next_task->dl, &rq->dl);
 	set_task_cpu(next_task, later_rq->cpu);
-	add_rq_bw(&next_task->dl, &later_rq->dl);
 
 	/*
 	 * Update the later_rq clock here, because the clock is used
 	 * by the cpufreq_update_util() inside __add_running_bw().
 	 */
 	update_rq_clock(later_rq);
-	add_running_bw(&next_task->dl, &later_rq->dl);
 	activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
 	ret = 1;
 
@@ -2222,11 +2280,7 @@
 			resched = true;
 
 			deactivate_task(src_rq, p, 0);
-			sub_running_bw(&p->dl, &src_rq->dl);
-			sub_rq_bw(&p->dl, &src_rq->dl);
 			set_task_cpu(p, this_cpu);
-			add_rq_bw(&p->dl, &this_rq->dl);
-			add_running_bw(&p->dl, &this_rq->dl);
 			activate_task(this_rq, p, 0);
 			dmin = p->dl.deadline;
 
@@ -2319,6 +2373,39 @@
					GFP_KERNEL, cpu_to_node(i));
 }
 
+void dl_add_task_root_domain(struct task_struct *p)
+{
+	struct rq_flags rf;
+	struct rq *rq;
+	struct dl_bw *dl_b;
+
+	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+	if (!dl_task(p)) {
+		raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
+		return;
+	}
+
+	rq = __task_rq_lock(p, &rf);
+
+	dl_b = &rq->rd->dl_bw;
+	raw_spin_lock(&dl_b->lock);
+
+	__dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+
+	raw_spin_unlock(&dl_b->lock);
+
+	task_rq_unlock(rq, p, &rf);
+}
+
+void dl_clear_root_domain(struct root_domain *rd)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
+	rd->dl_bw.total_bw = 0;
+	raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
+}
+
 #endif /* CONFIG_SMP */
 
 static void switched_from_dl(struct rq *rq, struct task_struct *p)
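A hedged sketch of the intended use of the two helpers above (the function name and task list below are hypothetical; the real caller is the root-domain rebuild path in the topology/cpuset code):

/* Hypothetical sketch: re-derive rd->dl_bw.total_bw after a rebuild by
 * wiping the stale total and re-accounting each DEADLINE task that ends
 * up attached to this root domain. */
static void sketch_rebuild_dl_accounting(struct root_domain *rd,
					 struct task_struct **dl_tasks, int nr)
{
	int i;

	dl_clear_root_domain(rd);
	for (i = 0; i < nr; i++)
		dl_add_task_root_domain(dl_tasks[i]);
}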
@@ -2333,6 +2420,12 @@
 	 */
 	if (task_on_rq_queued(p) && p->dl.dl_runtime)
 		task_non_contending(p);
+
+	/*
+	 * In case a task is setscheduled out from SCHED_DEADLINE we need to
+	 * keep track of that on its cpuset (for correct bandwidth tracking).
+	 */
+	dec_dl_tasks_cs(p);
 
 	if (!task_on_rq_queued(p)) {
 		/*
@@ -2374,6 +2467,12 @@
 	if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
 		put_task_struct(p);
 
+	/*
+	 * In case a task is setscheduled to SCHED_DEADLINE we need to keep
+	 * track of that on its cpuset (for correct bandwidth tracking).
+	 */
+	inc_dl_tasks_cs(p);
+
 	/* If p is not queued we will update its parameters at next wakeup. */
 	if (!task_on_rq_queued(p)) {
 		add_rq_bw(&p->dl, &rq->dl);
@@ -2390,6 +2489,8 @@
 			check_preempt_curr_dl(rq, p, 0);
 		else
 			resched_curr(rq);
+	} else {
+		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 	}
 }
 
@@ -2429,8 +2530,8 @@
 	}
 }
 
-const struct sched_class dl_sched_class = {
-	.next			= &rt_sched_class,
+const struct sched_class dl_sched_class
+	__section("__dl_sched_class") = {
 	.enqueue_task		= enqueue_task_dl,
 	.dequeue_task		= dequeue_task_dl,
 	.yield_task		= yield_task_dl,
@@ -2439,8 +2540,10 @@
 
 	.pick_next_task		= pick_next_task_dl,
 	.put_prev_task		= put_prev_task_dl,
+	.set_next_task		= set_next_task_dl,
 
 #ifdef CONFIG_SMP
+	.balance		= balance_dl,
 	.select_task_rq		= select_task_rq_dl,
 	.migrate_task_rq	= migrate_task_rq_dl,
 	.set_cpus_allowed	= set_cpus_allowed_dl,
@@ -2449,7 +2552,6 @@
 	.task_woken		= task_woken_dl,
 #endif
 
-	.set_curr_task		= set_curr_task_dl,
 	.task_tick		= task_tick_dl,
 	.task_fork		= task_fork_dl,
 
@@ -2497,7 +2599,7 @@
 	return ret;
 }
 
-void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
+static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
 {
 	if (global_rt_runtime() == RUNTIME_INF) {
 		dl_rq->bw_ratio = 1 << RATIO_SHIFT;
@@ -2550,11 +2652,12 @@
 int sched_dl_overflow(struct task_struct *p, int policy,
		      const struct sched_attr *attr)
 {
-	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 	u64 period = attr->sched_period ?: attr->sched_deadline;
 	u64 runtime = attr->sched_runtime;
 	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
-	int cpus, err = -1;
+	int cpus, err = -1, cpu = task_cpu(p);
+	struct dl_bw *dl_b = dl_bw_of(cpu);
+	unsigned long cap;
 
 	if (attr->sched_flags & SCHED_FLAG_SUGOV)
 		return 0;
@@ -2569,15 +2672,17 @@
 	 * allocated bandwidth of the container.
 	 */
 	raw_spin_lock(&dl_b->lock);
-	cpus = dl_bw_cpus(task_cpu(p));
+	cpus = dl_bw_cpus(cpu);
+	cap = dl_bw_capacity(cpu);
+
 	if (dl_policy(policy) && !task_has_dl_policy(p) &&
-	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+	    !__dl_overflow(dl_b, cap, 0, new_bw)) {
 		if (hrtimer_active(&p->dl.inactive_timer))
 			__dl_sub(dl_b, p->dl.dl_bw, cpus);
 		__dl_add(dl_b, new_bw, cpus);
 		err = 0;
 	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
-		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+		   !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) {
 		/*
 		 * XXX this is slightly incorrect: when the task
 		 * utilization decreases, we should delay the total
@@ -2635,6 +2740,14 @@
 }
 
 /*
+ * Default limits for DL period; on the top end we guard against small util
+ * tasks still getting ridiculous long effective runtimes, on the bottom end we
+ * guard against timer DoS.
+ */
+unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
+unsigned int sysctl_sched_dl_period_min = 100;     /* 100 us */
+
+/*
 * This function validates the new parameters of a -deadline task.
 * We ask for the deadline not being zero, and greater or equal
 * than the runtime, as well as the period of being zero or
@@ -2646,6 +2759,8 @@
  */
 bool __checkparam_dl(const struct sched_attr *attr)
 {
+	u64 period, max, min;
+
 	/* special dl tasks don't actually use any parameter */
 	if (attr->sched_flags & SCHED_FLAG_SUGOV)
 		return true;
@@ -2669,10 +2784,19 @@
	    attr->sched_period & (1ULL << 63))
 		return false;
 
+	period = attr->sched_period;
+	if (!period)
+		period = attr->sched_deadline;
+
 	/* runtime <= deadline <= period (if period != 0) */
-	if ((attr->sched_period != 0 &&
-	     attr->sched_period < attr->sched_deadline) ||
+	if (period < attr->sched_deadline ||
	    attr->sched_deadline < attr->sched_runtime)
+		return false;
+
+	max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC;
+	min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC;
+
+	if (period < min || period > max)
 		return false;
 
 	return true;
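To make the new bounds concrete, a short worked example using the defaults introduced earlier in this patch (the sysctls are expressed in microseconds):

/* Sketch, not part of the patch:
 *   max = (1 << 22) * NSEC_PER_USEC = 4194304 us ~= 4.19 s
 *   min = 100 * NSEC_PER_USEC       = 100 us
 * so a sched_period of 50 us is rejected (timer-DoS guard) and a
 * sched_period of 10 s is rejected (tiny-utilization guard). */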
@@ -2692,11 +2816,14 @@
 	dl_se->dl_bw = 0;
 	dl_se->dl_density = 0;
 
-	dl_se->dl_boosted = 0;
 	dl_se->dl_throttled = 0;
 	dl_se->dl_yielded = 0;
 	dl_se->dl_non_contending = 0;
 	dl_se->dl_overrun = 0;
+
+#ifdef CONFIG_RT_MUTEXES
+	dl_se->pi_se = dl_se;
+#endif
 }
 
 bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
@@ -2713,39 +2840,6 @@
 }
 
 #ifdef CONFIG_SMP
-int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
-{
-	unsigned int dest_cpu;
-	struct dl_bw *dl_b;
-	bool overflow;
-	int cpus, ret;
-	unsigned long flags;
-
-	dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
-
-	rcu_read_lock_sched();
-	dl_b = dl_bw_of(dest_cpu);
-	raw_spin_lock_irqsave(&dl_b->lock, flags);
-	cpus = dl_bw_cpus(dest_cpu);
-	overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
-	if (overflow) {
-		ret = -EBUSY;
-	} else {
-		/*
-		 * We reserve space for this task in the destination
-		 * root_domain, as we can't fail after this point.
-		 * We will free resources in the source root_domain
-		 * later on (see set_cpus_allowed_dl()).
-		 */
-		__dl_add(dl_b, p->dl.dl_bw, cpus);
-		ret = 0;
-	}
-	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-	rcu_read_unlock_sched();
-
-	return ret;
-}
-
 int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
				 const struct cpumask *trial)
 {
@@ -2767,22 +2861,59 @@
 	return ret;
 }
 
-bool dl_cpu_busy(unsigned int cpu)
+enum dl_bw_request {
+	dl_bw_req_check_overflow = 0,
+	dl_bw_req_alloc,
+	dl_bw_req_free
+};
+
+static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
 {
 	unsigned long flags;
 	struct dl_bw *dl_b;
-	bool overflow;
-	int cpus;
+	bool overflow = 0;
 
 	rcu_read_lock_sched();
 	dl_b = dl_bw_of(cpu);
 	raw_spin_lock_irqsave(&dl_b->lock, flags);
-	cpus = dl_bw_cpus(cpu);
-	overflow = __dl_overflow(dl_b, cpus, 0, 0);
+
+	if (req == dl_bw_req_free) {
+		__dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu));
+	} else {
+		unsigned long cap = dl_bw_capacity(cpu);
+
+		overflow = __dl_overflow(dl_b, cap, 0, dl_bw);
+
+		if (req == dl_bw_req_alloc && !overflow) {
+			/*
+			 * We reserve space in the destination
+			 * root_domain, as we can't fail after this point.
+			 * We will free resources in the source root_domain
+			 * later on (see set_cpus_allowed_dl()).
+			 */
+			__dl_add(dl_b, dl_bw, dl_bw_cpus(cpu));
+		}
+	}
+
 	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 	rcu_read_unlock_sched();
 
-	return overflow;
+	return overflow ? -EBUSY : 0;
+}
+
+int dl_bw_check_overflow(int cpu)
+{
+	return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0);
+}
+
+int dl_bw_alloc(int cpu, u64 dl_bw)
+{
+	return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw);
+}
+
+void dl_bw_free(int cpu, u64 dl_bw)
+{
+	dl_bw_manage(dl_bw_req_free, cpu, dl_bw);
 }
 #endif
 
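A hedged sketch of how the new interface is meant to be consumed (the helper name below is hypothetical; the real callers are the cpuset attach/cancel paths):

/* Hypothetical sketch: reserve DEADLINE bandwidth for @p on @dest_cpu
 * before a cpuset move, and hand it back if the move is later aborted. */
static int sketch_reserve_dl_bw(struct task_struct *p, int dest_cpu)
{
	int ret = dl_bw_alloc(dest_cpu, p->dl.dl_bw);

	if (ret)
		return ret;	/* -EBUSY: admission control would overflow */

	/* ... if the move fails afterwards: dl_bw_free(dest_cpu, p->dl.dl_bw); */
	return 0;
}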