2024-05-10 61598093bbdd283a7edc367d900f223070ead8d2
kernel/kernel/sched/rt.c
@@ -7,8 +7,12 @@
 
 #include "pelt.h"
 
+#include <trace/hooks/sched.h>
+
 int sched_rr_timeslice = RR_TIMESLICE;
 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+/* More than 4 hours if BW_SHIFT equals 20. */
+static const u64 max_rt_runtime = MAX_BW;
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
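A quick standalone check of the "more than 4 hours" comment above. It assumes MAX_BW is ((1ULL << (64 - BW_SHIFT)) - 1) nanoseconds with BW_SHIFT == 20, which is how kernel/sched/sched.h of this era defines it (an assumption, not something shown in this patch); the bound matters because to_ratio() computes runtime << BW_SHIFT, which must not overflow a u64.

/* Sanity check for max_rt_runtime, assuming MAX_BW == (1ULL << (64 - 20)) - 1 ns. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const unsigned int bw_shift = 20;                         /* BW_SHIFT (assumed) */
        const uint64_t max_bw_ns = (1ULL << (64 - bw_shift)) - 1; /* MAX_BW (assumed) */

        printf("max_rt_runtime = %llu ns = %.2f hours\n",
               (unsigned long long)max_bw_ns,
               (double)max_bw_ns / 1e9 / 3600.0);                 /* ~4.89 hours */
        return 0;
}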
@@ -45,8 +49,8 @@
 
         raw_spin_lock_init(&rt_b->rt_runtime_lock);
 
-        hrtimer_init(&rt_b->rt_period_timer,
-                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+        hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
+                     HRTIMER_MODE_REL_HARD);
         rt_b->rt_period_timer.function = sched_rt_period_timer;
 }
 
@@ -64,7 +68,8 @@
                  * to update the period.
                  */
                 hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
-                hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+                hrtimer_start_expires(&rt_b->rt_period_timer,
+                                      HRTIMER_MODE_ABS_PINNED_HARD);
         }
         raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -434,7 +439,7 @@
 #endif /* CONFIG_SMP */
 
 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
-static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
 
 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 {
@@ -555,7 +560,7 @@
         rt_se = rt_rq->tg->rt_se[cpu];
 
         if (!rt_se) {
-                dequeue_top_rt_rq(rt_rq);
+                dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
                 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
                 cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
         }
@@ -641,7 +646,7 @@
 
 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
-        dequeue_top_rt_rq(rt_rq);
+        dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
 }
 
 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -973,6 +978,13 @@
                 if (likely(rt_b->rt_runtime)) {
                         rt_rq->rt_throttled = 1;
                         printk_deferred_once("sched: RT throttling activated\n");
+
+                        trace_android_vh_dump_throttled_rt_tasks(
+                                raw_smp_processor_id(),
+                                rq_clock(rq_of_rt_rq(rt_rq)),
+                                sched_rt_period(rt_rq),
+                                runtime,
+                                hrtimer_get_expires_ns(&rt_b->rt_period_timer));
                 } else {
                         /*
                          * In case we did anyway, make it go away,
@@ -1019,6 +1031,8 @@
         curr->se.exec_start = now;
         cgroup_account_cputime(curr, delta_exec);
 
+        trace_android_vh_sched_stat_runtime_rt(curr, delta_exec);
+
         if (!rt_bandwidth_enabled())
                 return;
 
@@ -1040,7 +1054,7 @@
 }
 
 static void
-dequeue_top_rt_rq(struct rt_rq *rt_rq)
+dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
 {
         struct rq *rq = rq_of_rt_rq(rt_rq);
 
@@ -1051,7 +1065,7 @@
 
         BUG_ON(!rq->nr_running);
 
-        sub_nr_running(rq, rt_rq->rt_nr_running);
+        sub_nr_running(rq, count);
         rt_rq->rt_queued = 0;
 
 }
@@ -1330,18 +1344,21 @@
 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
 {
         struct sched_rt_entity *back = NULL;
+        unsigned int rt_nr_running;
 
         for_each_sched_rt_entity(rt_se) {
                 rt_se->back = back;
                 back = rt_se;
         }
 
-        dequeue_top_rt_rq(rt_rq_of_se(back));
+        rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
 
         for (rt_se = back; rt_se; rt_se = rt_se->back) {
                 if (on_rt_rq(rt_se))
                         __dequeue_rt_entity(rt_se, flags);
         }
+
+        dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
 }
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
@@ -1369,6 +1386,27 @@
         enqueue_top_rt_rq(&rq->rt);
 }
 
+#ifdef CONFIG_SMP
+static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
+                                        bool sync)
+{
+        /*
+         * If the waker is CFS, then an RT sync wakeup would preempt the waker
+         * and force it to run for a likely small time after the RT wakee is
+         * done. So, only honor RT sync wakeups from RT wakers.
+         */
+        return sync && task_has_rt_policy(rq->curr) &&
+                p->prio <= rq->rt.highest_prio.next &&
+                rq->rt.rt_nr_running <= 2;
+}
+#else
+static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
+                                        bool sync)
+{
+        return 0;
+}
+#endif
+
 /*
  * Adding/removing a task to/from a priority array:
  */
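A standalone toy model of the predicate above, to make its three conditions concrete (types, fields, and numbers are illustrative only, not kernel code): the sync hint is honored only when the waker itself has an RT policy, the wakee is at least as high-priority as the next queued RT task, and the waker's runqueue holds at most two RT tasks.

#include <stdbool.h>
#include <stdio.h>

struct toy_rq {
        bool curr_is_rt;        /* stands in for task_has_rt_policy(rq->curr) */
        int  highest_prio_next; /* stands in for rq->rt.highest_prio.next */
        int  rt_nr_running;     /* stands in for rq->rt.rt_nr_running */
};

static bool toy_should_honor_rt_sync(const struct toy_rq *rq, int wakee_prio,
                                     bool sync)
{
        /* Lower prio value means higher RT priority, as in the kernel. */
        return sync && rq->curr_is_rt &&
               wakee_prio <= rq->highest_prio_next &&
               rq->rt_nr_running <= 2;
}

int main(void)
{
        struct toy_rq rt_waker  = { true,  99, 1 }; /* lone RT waker on the rq */
        struct toy_rq cfs_waker = { false, 99, 0 }; /* CFS waker: never honored */

        printf("%d %d\n",
               toy_should_honor_rt_sync(&rt_waker, 50, true),   /* 1 */
               toy_should_honor_rt_sync(&cfs_waker, 50, true)); /* 0 */
        return 0;
}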
@@ -1376,23 +1414,21 @@
 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
         struct sched_rt_entity *rt_se = &p->rt;
-
-        schedtune_enqueue_task(p, cpu_of(rq));
+        bool sync = !!(flags & ENQUEUE_WAKEUP_SYNC);
 
         if (flags & ENQUEUE_WAKEUP)
                 rt_se->timeout = 0;
 
         enqueue_rt_entity(rt_se, flags);
 
-        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+        if (!task_current(rq, p) && p->nr_cpus_allowed > 1 &&
+            !should_honor_rt_sync(rq, p, sync))
                 enqueue_pushable_task(rq, p);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
         struct sched_rt_entity *rt_se = &p->rt;
-
-        schedtune_dequeue_task(p, cpu_of(rq));
 
         update_curr_rt(rq);
         dequeue_rt_entity(rt_se, flags);
@@ -1437,13 +1473,43 @@
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
+#ifdef CONFIG_RT_SOFTINT_OPTIMIZATION
+/*
+ * Return whether the task on the given cpu is currently non-preemptible
+ * while handling a potentially long softint, or if the task is likely
+ * to block preemptions soon because it is a ksoftirq thread that is
+ * handling slow softints.
+ */
+bool
+task_may_not_preempt(struct task_struct *task, int cpu)
+{
+        __u32 softirqs = per_cpu(active_softirqs, cpu) |
+                         __IRQ_STAT(cpu, __softirq_pending);
+
+        struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu);
+        return ((softirqs & LONG_SOFTIRQ_MASK) &&
+                (task == cpu_ksoftirqd ||
+                 task_thread_info(task)->preempt_count & SOFTIRQ_MASK));
+}
+EXPORT_SYMBOL_GPL(task_may_not_preempt);
+#endif /* CONFIG_RT_SOFTINT_OPTIMIZATION */
+
 static int
-select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
-                  int sibling_count_hint)
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
         struct task_struct *curr;
         struct rq *rq;
+        struct rq *this_cpu_rq;
         bool test;
+        int target_cpu = -1;
+        bool may_not_preempt;
+        bool sync = !!(flags & WF_SYNC);
+        int this_cpu;
+
+        trace_android_rvh_select_task_rq_rt(p, cpu, sd_flag,
+                                            flags, &target_cpu);
+        if (target_cpu >= 0)
+                return target_cpu;
 
         /* For anything but wake ups, just return the task_cpu */
         if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
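For illustration, a userspace toy of the task_may_not_preempt() decision above (the bit positions and the mask are made up here; the real LONG_SOFTIRQ_MASK and SOFTIRQ_MASK come from kernel headers): the CPU is avoided when a "long" softirq is active or pending there and the candidate task is either that CPU's ksoftirqd or already has softirq processing accounted in its preempt count.

#include <stdbool.h>
#include <stdio.h>

#define TOY_NET_RX      (1u << 3)       /* illustrative softirq bits */
#define TOY_BLOCK       (1u << 4)
#define TOY_LONG_MASK   (TOY_NET_RX | TOY_BLOCK)

static bool toy_may_not_preempt(unsigned int active, unsigned int pending,
                                bool is_cpu_ksoftirqd, bool in_softirq)
{
        /* Combine softirqs being serviced with those still pending. */
        unsigned int softirqs = active | pending;

        return (softirqs & TOY_LONG_MASK) &&
               (is_cpu_ksoftirqd || in_softirq);
}

int main(void)
{
        /* ksoftirqd with a long NET_RX softirq pending: avoid this CPU -> 1 */
        printf("%d\n", toy_may_not_preempt(0, TOY_NET_RX, true, false));
        /* ordinary task, only a short softirq active: CPU is fine -> 0 */
        printf("%d\n", toy_may_not_preempt(1u << 0, 0, false, false));
        return 0;
}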
@@ -1453,9 +1519,16 @@
 
         rcu_read_lock();
         curr = READ_ONCE(rq->curr); /* unlocked access */
+        this_cpu = smp_processor_id();
+        this_cpu_rq = cpu_rq(this_cpu);
 
         /*
-         * If the current task on @p's runqueue is an RT task, then
+         * If the current task on @p's runqueue is a softirq task,
+         * it may run without preemption for a time that is
+         * ill-suited for a waiting RT task. Therefore, try to
+         * wake this RT task on another runqueue.
+         *
+         * Also, if the current task on @p's runqueue is an RT task, then
          * try to see if we can wake this RT task up on another
          * runqueue. Otherwise simply start this RT task
          * on its current runqueue.
@@ -1480,9 +1553,21 @@
          * requirement of the task - which is only important on heterogeneous
          * systems like big.LITTLE.
          */
-        test = curr &&
-               unlikely(rt_task(curr)) &&
-               (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
+        may_not_preempt = task_may_not_preempt(curr, cpu);
+        test = (curr && (may_not_preempt ||
+                (unlikely(rt_task(curr)) &&
+                 (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio))));
+
+        if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE))
+                test |= rockchip_perf_misfit_rt(cpu);
+        /*
+         * Respect the sync flag as long as the task can run on this CPU.
+         */
+        if (should_honor_rt_sync(this_cpu_rq, p, sync) &&
+            cpumask_test_cpu(this_cpu, p->cpus_ptr)) {
+                cpu = this_cpu;
+                goto out_unlock;
+        }
 
         if (test || !rt_task_fits_capacity(p, cpu)) {
                 int target = find_lowest_rq(p);
@@ -1495,11 +1580,14 @@
                         goto out_unlock;
 
                 /*
-                 * Don't bother moving it if the destination CPU is
+                 * If cpu is non-preemptible, prefer remote cpu
+                 * even if it's running a higher-prio task.
+                 * Otherwise: Don't bother moving it if the destination CPU is
                  * not running a lower priority task.
                  */
                 if (target != -1 &&
-                    p->prio < cpu_rq(target)->rt.highest_prio.curr)
+                    (may_not_preempt ||
+                     p->prio < cpu_rq(target)->rt.highest_prio.curr))
                         cpu = target;
         }
 
@@ -1537,6 +1625,26 @@
                 resched_curr(rq);
 }
 
+static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+{
+        if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
+                int done = 0;
+
+                /*
+                 * This is OK, because current is on_cpu, which avoids it being
+                 * picked for load-balance and preemption/IRQs are still
+                 * disabled avoiding further scheduler activity on it and we've
+                 * not yet started the picking loop.
+                 */
+                rq_unpin_lock(rq, rf);
+                trace_android_rvh_sched_balance_rt(rq, p, &done);
+                if (!done)
+                        pull_rt_task(rq);
+                rq_repin_lock(rq, rf);
+        }
+
+        return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
+}
 #endif /* CONFIG_SMP */
 
 /*
@@ -1567,8 +1675,28 @@
 #endif
 }
 
-static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
-                                                   struct rt_rq *rt_rq)
+static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
+{
+        p->se.exec_start = rq_clock_task(rq);
+
+        /* The running task is never eligible for pushing */
+        dequeue_pushable_task(rq, p);
+
+        if (!first)
+                return;
+
+        /*
+         * If prev task was rt, put_prev_task() has already updated the
+         * utilization. We only care of the case where we start to schedule a
+         * rt task
+         */
+        if (rq->curr->sched_class != &rt_sched_class)
+                update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+        rt_queue_push_tasks(rq);
+}
+
+static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
 {
         struct rt_prio_array *array = &rt_rq->active;
         struct sched_rt_entity *next = NULL;
@@ -1579,6 +1707,8 @@
         BUG_ON(idx >= MAX_RT_PRIO);
 
         queue = array->queue + idx;
+        if (SCHED_WARN_ON(list_empty(queue)))
+                return NULL;
         next = list_entry(queue->next, struct sched_rt_entity, run_list);
 
         return next;
@@ -1587,74 +1717,27 @@
 static struct task_struct *_pick_next_task_rt(struct rq *rq)
 {
         struct sched_rt_entity *rt_se;
-        struct task_struct *p;
         struct rt_rq *rt_rq = &rq->rt;
 
         do {
-                rt_se = pick_next_rt_entity(rq, rt_rq);
-                BUG_ON(!rt_se);
+                rt_se = pick_next_rt_entity(rt_rq);
+                if (unlikely(!rt_se))
+                        return NULL;
                 rt_rq = group_rt_rq(rt_se);
         } while (rt_rq);
 
-        p = rt_task_of(rt_se);
-        p->se.exec_start = rq_clock_task(rq);
-
-        return p;
+        return rt_task_of(rt_se);
 }
 
-static struct task_struct *
-pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_rt(struct rq *rq)
 {
         struct task_struct *p;
-        struct rt_rq *rt_rq = &rq->rt;
 
-        if (need_pull_rt_task(rq, prev)) {
-                /*
-                 * This is OK, because current is on_cpu, which avoids it being
-                 * picked for load-balance and preemption/IRQs are still
-                 * disabled avoiding further scheduler activity on it and we're
-                 * being very careful to re-start the picking loop.
-                 */
-                rq_unpin_lock(rq, rf);
-                pull_rt_task(rq);
-                rq_repin_lock(rq, rf);
-                /*
-                 * pull_rt_task() can drop (and re-acquire) rq->lock; this
-                 * means a dl or stop task can slip in, in which case we need
-                 * to re-start task selection.
-                 */
-                if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
-                             rq->dl.dl_nr_running))
-                        return RETRY_TASK;
-        }
-
-        /*
-         * We may dequeue prev's rt_rq in put_prev_task().
-         * So, we update time before rt_nr_running check.
-         */
-        if (prev->sched_class == &rt_sched_class)
-                update_curr_rt(rq);
-
-        if (!rt_rq->rt_queued)
+        if (!sched_rt_runnable(rq))
                 return NULL;
 
-        put_prev_task(rq, prev);
-
         p = _pick_next_task_rt(rq);
-
-        /* The running task is never eligible for pushing */
-        dequeue_pushable_task(rq, p);
-
-        rt_queue_push_tasks(rq);
-
-        /*
-         * If prev task was rt, put_prev_task() has already updated the
-         * utilization. We only care of the case where we start to schedule a
-         * rt task
-         */
-        if (rq->curr->sched_class != &rt_sched_class)
-                update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
-
+        set_next_task_rt(rq, p, true);
         return p;
 }
 
@@ -1680,7 +1763,7 @@
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
         if (!task_running(rq, p) &&
-            cpumask_test_cpu(cpu, &p->cpus_allowed))
+            cpumask_test_cpu(cpu, p->cpus_ptr))
                 return 1;
 
         return 0;
@@ -1690,7 +1773,7 @@
  * Return the highest pushable rq's task, which is suitable to be executed
  * on the CPU, NULL otherwise
  */
-static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
+struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
 {
         struct plist_head *head = &rq->rt.pushable_tasks;
         struct task_struct *p;
@@ -1705,6 +1788,7 @@
 
         return NULL;
 }
+EXPORT_SYMBOL_GPL(pick_highest_pushable_task);
 
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
@@ -1713,7 +1797,7 @@
         struct sched_domain *sd;
         struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
         int this_cpu = smp_processor_id();
-        int cpu = task_cpu(task);
+        int cpu = -1;
         int ret;
 
         /* Make sure the mask is initialized first */
@@ -1738,9 +1822,17 @@
                                   task, lowest_mask);
         }
 
+        trace_android_rvh_find_lowest_rq(task, lowest_mask, ret, &cpu);
+        if (cpu >= 0)
+                return cpu;
+
         if (!ret)
                 return -1; /* No targets found */
 
+        cpu = task_cpu(task);
+
+        if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE))
+                cpu = rockchip_perf_select_rt_cpu(cpu, lowest_mask);
         /*
          * At this point we have built a mask of CPUs representing the
          * lowest priority tasks in the system. Now we want to elect
@@ -1833,7 +1925,7 @@
                  * Also make sure that it wasn't scheduled on its rq.
                  */
                 if (unlikely(task_rq(task) != rq ||
-                             !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
+                             !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
                              task_running(rq, task) ||
                              !rt_task(task) ||
                              !task_on_rq_queued(task))) {
@@ -1895,10 +1987,8 @@
                 return 0;
 
 retry:
-        if (unlikely(next_task == rq->curr)) {
-                WARN_ON(1);
+        if (WARN_ON(next_task == rq->curr))
                 return 0;
-        }
 
         /*
          * It's possible that the next_task slipped in of
@@ -2315,13 +2405,20 @@
 static void switched_to_rt(struct rq *rq, struct task_struct *p)
 {
         /*
-         * If we are already running, then there's nothing
-         * that needs to be done. But if we are not running
-         * we may need to preempt the current running task.
-         * If that current running task is also an RT task
+         * If we are running, update the avg_rt tracking, as the running time
+         * will now on be accounted into the latter.
+         */
+        if (task_current(rq, p)) {
+                update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+                return;
+        }
+
+        /*
+         * If we are not running we may need to preempt the current
+         * running task. If that current running task is also an RT task
          * then see if we can move to another run queue.
          */
-        if (task_on_rq_queued(p) && rq->curr != p) {
+        if (task_on_rq_queued(p)) {
 #ifdef CONFIG_SMP
                 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
                         rt_queue_push_tasks(rq);
@@ -2390,8 +2487,10 @@
                 }
 
                 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
-                if (p->rt.timeout > next)
-                        p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
+                if (p->rt.timeout > next) {
+                        posix_cputimers_rt_watchdog(&p->posix_cputimers,
+                                                    p->se.sum_exec_runtime);
+                }
         }
 }
 #else
@@ -2440,16 +2539,6 @@
         }
 }
 
-static void set_curr_task_rt(struct rq *rq)
-{
-        struct task_struct *p = rq->curr;
-
-        p->se.exec_start = rq_clock_task(rq);
-
-        /* The running task is never eligible for pushing */
-        dequeue_pushable_task(rq, p);
-}
-
 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 {
         /*
@@ -2461,8 +2550,8 @@
         return 0;
 }
 
-const struct sched_class rt_sched_class = {
-        .next                   = &fair_sched_class,
+const struct sched_class rt_sched_class
+        __section("__rt_sched_class") = {
         .enqueue_task           = enqueue_task_rt,
         .dequeue_task           = dequeue_task_rt,
         .yield_task             = yield_task_rt,
@@ -2471,10 +2560,11 @@
 
         .pick_next_task         = pick_next_task_rt,
         .put_prev_task          = put_prev_task_rt,
+        .set_next_task          = set_next_task_rt,
 
 #ifdef CONFIG_SMP
+        .balance                = balance_rt,
         .select_task_rq         = select_task_rq_rt,
-
         .set_cpus_allowed       = set_cpus_allowed_common,
         .rq_online              = rq_online_rt,
         .rq_offline             = rq_offline_rt,
@@ -2482,7 +2572,6 @@
         .switched_from          = switched_from_rt,
 #endif
 
-        .set_curr_task          = set_curr_task_rt,
         .task_tick              = task_tick_rt,
 
         .get_rr_interval        = get_rr_interval_rt,
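The rt_sched_class hunks above drop the old .next = &fair_sched_class chaining and place the class in a dedicated __rt_sched_class linker section, presumably so class precedence comes from link-time placement rather than explicit pointers. A generic userspace illustration of that mechanism (GCC/Clang with GNU ld; toy names, not the kernel's actual section macros):

#include <stdio.h>

struct toy_class { const char *name; };

/* Objects dropped into one named section are laid out in declaration/link
 * order, and GNU ld provides __start_/__stop_ symbols to walk them. */
#define DEFINE_TOY_CLASS(var, n)                                        \
        static const struct toy_class var                               \
        __attribute__((used, section("toy_classes"))) = { .name = n }

DEFINE_TOY_CLASS(toy_stop, "stop");     /* listed first = highest precedence */
DEFINE_TOY_CLASS(toy_rt,   "rt");
DEFINE_TOY_CLASS(toy_fair, "fair");

extern const struct toy_class __start_toy_classes[];
extern const struct toy_class __stop_toy_classes[];

int main(void)
{
        for (const struct toy_class *c = __start_toy_classes;
             c < __stop_toy_classes; c++)
                printf("%s\n", c->name);        /* prints: stop, rt, fair */
        return 0;
}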
@@ -2503,10 +2592,11 @@
  */
 static DEFINE_MUTEX(rt_constraints_mutex);
 
-/* Must be called with tasklist_lock held */
 static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-        struct task_struct *g, *p;
+        struct task_struct *task;
+        struct css_task_iter it;
+        int ret = 0;
 
         /*
          * Autogroups do not have RT tasks; see autogroup_create().
@@ -2514,12 +2604,12 @@
         if (task_group_is_autogroup(tg))
                 return 0;
 
-        for_each_process_thread(g, p) {
-                if (rt_task(p) && task_group(p) == tg)
-                        return 1;
-        }
+        css_task_iter_start(&tg->css, 0, &it);
+        while (!ret && (task = css_task_iter_next(&it)))
+                ret |= rt_task(task);
+        css_task_iter_end(&it);
 
-        return 0;
+        return ret;
 }
 
 struct rt_schedulable_data {
@@ -2550,9 +2640,10 @@
                 return -EINVAL;
 
         /*
-         * Ensure we don't starve existing RT tasks.
+         * Ensure we don't starve existing RT tasks if runtime turns zero.
          */
-        if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+        if (rt_bandwidth_enabled() && !runtime &&
+            tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
                 return -EBUSY;
 
         total = to_ratio(period, runtime);
@@ -2617,8 +2708,13 @@
         if (rt_period == 0)
                 return -EINVAL;
 
+        /*
+         * Bound quota to defend quota against overflow during bandwidth shift.
+         */
+        if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
+                return -EINVAL;
+
         mutex_lock(&rt_constraints_mutex);
-        read_lock(&tasklist_lock);
         err = __rt_schedulable(tg, rt_period, rt_runtime);
         if (err)
                 goto unlock;
@@ -2636,7 +2732,6 @@
         }
         raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 unlock:
-        read_unlock(&tasklist_lock);
         mutex_unlock(&rt_constraints_mutex);
 
         return err;
@@ -2695,9 +2790,7 @@
         int ret = 0;
 
         mutex_lock(&rt_constraints_mutex);
-        read_lock(&tasklist_lock);
         ret = __rt_schedulable(NULL, 0, 0);
-        read_unlock(&tasklist_lock);
         mutex_unlock(&rt_constraints_mutex);
 
         return ret;
@@ -2738,7 +2831,9 @@
                 return -EINVAL;
 
         if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
-            (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+            ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
+             ((u64)sysctl_sched_rt_runtime *
+                        NSEC_PER_USEC > max_rt_runtime)))
                 return -EINVAL;
 
         return 0;
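The sysctl value is specified in microseconds, so the new clause converts with NSEC_PER_USEC before comparing against max_rt_runtime, which is in nanoseconds. Under the same MAX_BW assumption as the sketch near the top of this file, the largest accepted sysctl_sched_rt_runtime works out as follows:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const uint64_t max_rt_runtime_ns = (1ULL << 44) - 1; /* assumed MAX_BW */
        const uint64_t nsec_per_usec = 1000;                 /* NSEC_PER_USEC */

        /* ~17,592,186,044 us: the same ~4.89 hour ceiling, in microseconds */
        printf("max sysctl_sched_rt_runtime = %llu us\n",
               (unsigned long long)(max_rt_runtime_ns / nsec_per_usec));
        return 0;
}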
@@ -2754,9 +2849,8 @@
         raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
 }
 
-int sched_rt_handler(struct ctl_table *table, int write,
-                     void __user *buffer, size_t *lenp,
-                     loff_t *ppos)
+int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+                     size_t *lenp, loff_t *ppos)
 {
         int old_period, old_runtime;
         static DEFINE_MUTEX(mutex);
@@ -2794,9 +2888,8 @@
         return ret;
 }
 
-int sched_rr_handler(struct ctl_table *table, int write,
-                     void __user *buffer, size_t *lenp,
-                     loff_t *ppos)
+int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+                     size_t *lenp, loff_t *ppos)
 {
         int ret;
         static DEFINE_MUTEX(mutex);