~hc/RK356X_SDK_RELEASE.git

..	..	@@ -3,10 +3,8 @@
3	3	* Implement CPU time clocks for the POSIX clock interface.
4	4	*/
5	5
6		-#include <uapi/linux/sched/types.h>
7	6	#include <linux/sched/signal.h>
8	7	#include <linux/sched/cputime.h>
9		-#include <linux/sched/rt.h>
10	8	#include <linux/posix-timers.h>
11	9	#include <linux/errno.h>
12	10	#include <linux/math64.h>
..	..	@@ -17,17 +15,25 @@
17	15	#include <linux/workqueue.h>
18	16	#include <linux/compat.h>
19	17	#include <linux/sched/deadline.h>
20		-#include <linux/smpboot.h>
21	18
22	19	#include "posix-timers.h"
23	20
24	21	static void posix_cpu_timer_rearm(struct k_itimer *timer);
25	22
	23	+void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
	24	+{
	25	+ posix_cputimers_init(pct);
	26	+ if (cpu_limit != RLIM_INFINITY) {
	27	+ pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC;
	28	+ pct->timers_active = true;
	29	+ }
	30	+}
	31	+
26	32	/*
27	33	* Called after updating RLIMIT_CPU to run cpu timer and update
28		- * tsk->signal->cputime_expires expiration cache if necessary. Needs
29		- * siglock protection since other code may update expiration cache as
30		- * well.
	34	+ * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
	35	+ * necessary. Needs siglock protection since other code may update the
	36	+ * expiration cache as well.
31	37	*/
32	38	void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
33	39	{
..	..	@@ -38,46 +44,87 @@
38	44	spin_unlock_irq(&task->sighand->siglock);
39	45	}
40	46
41		-static int check_clock(const clockid_t which_clock)
	47	+/*
	48	+ * Functions for validating access to tasks.
	49	+ */
	50	+static struct pid *pid_for_clock(const clockid_t clock, bool gettime)
42	51	{
43		- int error = 0;
44		- struct task_struct *p;
45		- const pid_t pid = CPUCLOCK_PID(which_clock);
	52	+ const bool thread = !!CPUCLOCK_PERTHREAD(clock);
	53	+ const pid_t upid = CPUCLOCK_PID(clock);
	54	+ struct pid *pid;
46	55
47		- if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
48		- return -EINVAL;
	56	+ if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX)
	57	+ return NULL;
49	58
50		- if (pid == 0)
51		- return 0;
	59	+ /*
	60	+ * If the encoded PID is 0, then the timer is targeted at current
	61	+ * or the process to which current belongs.
	62	+ */
	63	+ if (upid == 0)
	64	+ return thread ? task_pid(current) : task_tgid(current);
	65	+
	66	+ pid = find_vpid(upid);
	67	+ if (!pid)
	68	+ return NULL;
	69	+
	70	+ if (thread) {
	71	+ struct task_struct *tsk = pid_task(pid, PIDTYPE_PID);
	72	+ return (tsk && same_thread_group(tsk, current)) ? pid : NULL;
	73	+ }
	74	+
	75	+ /*
	76	+ * For clock_gettime(PROCESS) allow finding the process by
	77	+ * the pid of the current task. The code needs the tgid
	78	+ * of the process so that pid_task(pid, PIDTYPE_TGID) can be
	79	+ * used to find the process.
	80	+ */
	81	+ if (gettime && (pid == task_pid(current)))
	82	+ return task_tgid(current);
	83	+
	84	+ /*
	85	+ * For processes require that pid identifies a process.
	86	+ */
	87	+ return pid_has_task(pid, PIDTYPE_TGID) ? pid : NULL;
	88	+}
	89	+
	90	+static inline int validate_clock_permissions(const clockid_t clock)
	91	+{
	92	+ int ret;
52	93
53	94	rcu_read_lock();
54		- p = find_task_by_vpid(pid);
55		- if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
56		- same_thread_group(p, current) : has_group_leader_pid(p))) {
57		- error = -EINVAL;
58		- }
	95	+ ret = pid_for_clock(clock, false) ? 0 : -EINVAL;
59	96	rcu_read_unlock();
60	97
61		- return error;
	98	+ return ret;
	99	+}
	100	+
	101	+static inline enum pid_type clock_pid_type(const clockid_t clock)
	102	+{
	103	+ return CPUCLOCK_PERTHREAD(clock) ? PIDTYPE_PID : PIDTYPE_TGID;
	104	+}
	105	+
	106	+static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer)
	107	+{
	108	+ return pid_task(timer->it.cpu.pid, clock_pid_type(timer->it_clock));
62	109	}
63	110
64	111	/*
65	112	* Update expiry time from increment, and increase overrun count,
66	113	* given the current clock sample.
67	114	*/
68		-static void bump_cpu_timer(struct k_itimer *timer, u64 now)
	115	+static u64 bump_cpu_timer(struct k_itimer *timer, u64 now)
69	116	{
	117	+ u64 delta, incr, expires = timer->it.cpu.node.expires;
70	118	int i;
71		- u64 delta, incr;
72	119
73		- if (timer->it.cpu.incr == 0)
74		- return;
	120	+ if (!timer->it_interval)
	121	+ return expires;
75	122
76		- if (now < timer->it.cpu.expires)
77		- return;
	123	+ if (now < expires)
	124	+ return expires;
78	125
79		- incr = timer->it.cpu.incr;
80		- delta = now + incr - timer->it.cpu.expires;
	126	+ incr = timer->it_interval;
	127	+ delta = now + incr - expires;
81	128
82	129	/* Don't use (incr2 < delta), incr2 might overflow. */
83	130	for (i = 0; incr < delta - incr; i++)
..	..	@@ -87,48 +134,26 @@
87	134	if (delta < incr)
88	135	continue;
89	136
90		- timer->it.cpu.expires += incr;
	137	+ timer->it.cpu.node.expires += incr;
91	138	timer->it_overrun += 1LL << i;
92	139	delta -= incr;
93	140	}
	141	+ return timer->it.cpu.node.expires;
94	142	}
95	143
96		-/**
97		- * task_cputime_zero - Check a task_cputime struct for all zero fields.
98		- *
99		- * @cputime: The struct to compare.
100		- *
101		- * Checks @cputime to see if all fields are zero. Returns true if all fields
102		- * are zero, false if any field is nonzero.
103		- */
104		-static inline int task_cputime_zero(const struct task_cputime *cputime)
	144	+/* Check whether all cache entries contain U64_MAX, i.e. eternal expiry time */
	145	+static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct)
105	146	{
106		- if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
107		- return 1;
108		- return 0;
109		-}
110		-
111		-static inline u64 prof_ticks(struct task_struct *p)
112		-{
113		- u64 utime, stime;
114		-
115		- task_cputime(p, &utime, &stime);
116		-
117		- return utime + stime;
118		-}
119		-static inline u64 virt_ticks(struct task_struct *p)
120		-{
121		- u64 utime, stime;
122		-
123		- task_cputime(p, &utime, &stime);
124		-
125		- return utime;
	147	+ return !(~pct->bases[CPUCLOCK_PROF].nextevt |
	148	+ ~pct->bases[CPUCLOCK_VIRT].nextevt |
	149	+ ~pct->bases[CPUCLOCK_SCHED].nextevt);
126	150	}
127	151
128	152	static int
129	153	posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
130	154	{
131		- int error = check_clock(which_clock);
	155	+ int error = validate_clock_permissions(which_clock);
	156	+
132	157	if (!error) {
133	158	tp->tv_sec = 0;
134	159	tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
..	..	@@ -145,40 +170,64 @@
145	170	}
146	171
147	172	static int
148		-posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp)
	173	+posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp)
149	174	{
	175	+ int error = validate_clock_permissions(clock);
	176	+
150	177	/*
151	178	* You can never reset a CPU clock, but we check for other errors
152	179	* in the call before failing with EPERM.
153	180	*/
154		- int error = check_clock(which_clock);
155		- if (error == 0) {
156		- error = -EPERM;
157		- }
158		- return error;
	181	+ return error ? : -EPERM;
159	182	}
160	183
161		-
162	184	/*
163		- * Sample a per-thread clock for the given task.
	185	+ * Sample a per-thread clock for the given task. clkid is validated.
164	186	*/
165		-static int cpu_clock_sample(const clockid_t which_clock,
166		- struct task_struct *p, u64 *sample)
	187	+static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p)
167	188	{
168		- switch (CPUCLOCK_WHICH(which_clock)) {
169		- default:
170		- return -EINVAL;
	189	+ u64 utime, stime;
	190	+
	191	+ if (clkid == CPUCLOCK_SCHED)
	192	+ return task_sched_runtime(p);
	193	+
	194	+ task_cputime(p, &utime, &stime);
	195	+
	196	+ switch (clkid) {
171	197	case CPUCLOCK_PROF:
172		- *sample = prof_ticks(p);
173		- break;
	198	+ return utime + stime;
174	199	case CPUCLOCK_VIRT:
175		- *sample = virt_ticks(p);
176		- break;
177		- case CPUCLOCK_SCHED:
178		- *sample = task_sched_runtime(p);
179		- break;
	200	+ return utime;
	201	+ default:
	202	+ WARN_ON_ONCE(1);
180	203	}
181	204	return 0;
	205	+}
	206	+
	207	+static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime)
	208	+{
	209	+ samples[CPUCLOCK_PROF] = stime + utime;
	210	+ samples[CPUCLOCK_VIRT] = utime;
	211	+ samples[CPUCLOCK_SCHED] = rtime;
	212	+}
	213	+
	214	+static void task_sample_cputime(struct task_struct *p, u64 *samples)
	215	+{
	216	+ u64 stime, utime;
	217	+
	218	+ task_cputime(p, &utime, &stime);
	219	+ store_samples(samples, stime, utime, p->se.sum_exec_runtime);
	220	+}
	221	+
	222	+static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
	223	+ u64 *samples)
	224	+{
	225	+ u64 stime, utime, rtime;
	226	+
	227	+ utime = atomic64_read(&at->utime);
	228	+ stime = atomic64_read(&at->stime);
	229	+ rtime = atomic64_read(&at->sum_exec_runtime);
	230	+ store_samples(samples, stime, utime, rtime);
182	231	}
183	232
184	233	/*
..	..	@@ -196,29 +245,56 @@
196	245	}
197	246	}
198	247
199		-static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
	248	+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic,
	249	+ struct task_cputime *sum)
200	250	{
201	251	__update_gt_cputime(&cputime_atomic->utime, sum->utime);
202	252	__update_gt_cputime(&cputime_atomic->stime, sum->stime);
203	253	__update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
204	254	}
205	255
206		-/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
207		-static inline void sample_cputime_atomic(struct task_cputime *times,
208		- struct task_cputime_atomic *atomic_times)
209		-{
210		- times->utime = atomic64_read(&atomic_times->utime);
211		- times->stime = atomic64_read(&atomic_times->stime);
212		- times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
213		-}
214		-
215		-void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
	256	+/**
	257	+ * thread_group_sample_cputime - Sample cputime for a given task
	258	+ * @tsk: Task for which cputime needs to be sampled
	259	+ * @samples: Storage for time samples
	260	+ *
	261	+ * Called from sys_getitimer() to calculate the expiry time of an active
	262	+ * timer. That means group cputime accounting is already active. Called
	263	+ * with task sighand lock held.
	264	+ *
	265	+ * Updates @samples with an up-to-date sample of the thread group cputimes.
	266	+ */
	267	+void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples)
216	268	{
217	269	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
218		- struct task_cputime sum;
	270	+ struct posix_cputimers *pct = &tsk->signal->posix_cputimers;
	271	+
	272	+ WARN_ON_ONCE(!pct->timers_active);
	273	+
	274	+ proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
	275	+}
	276	+
	277	+/**
	278	+ * thread_group_start_cputime - Start cputime and return a sample
	279	+ * @tsk: Task for which cputime needs to be started
	280	+ * @samples: Storage for time samples
	281	+ *
	282	+ * The thread group cputime accounting is avoided when there are no posix
	283	+ * CPU timers armed. Before starting a timer it's required to check whether
	284	+ * the time accounting is active. If not, a full update of the atomic
	285	+ * accounting store needs to be done and the accounting enabled.
	286	+ *
	287	+ * Updates @samples with an up-to-date sample of the thread group cputimes.
	288	+ */
	289	+static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples)
	290	+{
	291	+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
	292	+ struct posix_cputimers *pct = &tsk->signal->posix_cputimers;
219	293
220	294	/* Check if cputimer isn't running. This is accessed without locking. */
221		- if (!READ_ONCE(cputimer->running)) {
	295	+ if (!READ_ONCE(pct->timers_active)) {
	296	+ struct task_cputime sum;
	297	+
222	298	/*
223	299	* The POSIX timer interface allows for absolute time expiry
224	300	* values through the TIMER_ABSTIME flag, therefore we have
..	..	@@ -228,94 +304,70 @@
228	304	update_gt_cputime(&cputimer->cputime_atomic, &sum);
229	305
230	306	/*
231		- * We're setting cputimer->running without a lock. Ensure
232		- * this only gets written to in one operation. We set
233		- * running after update_gt_cputime() as a small optimization,
234		- * but barriers are not required because update_gt_cputime()
	307	+ * We're setting timers_active without a lock. Ensure this
	308	+ * only gets written to in one operation. We set it after
	309	+ * update_gt_cputime() as a small optimization, but
	310	+ * barriers are not required because update_gt_cputime()
235	311	* can handle concurrent updates.
236	312	*/
237		- WRITE_ONCE(cputimer->running, true);
	313	+ WRITE_ONCE(pct->timers_active, true);
238	314	}
239		- sample_cputime_atomic(times, &cputimer->cputime_atomic);
	315	+ proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
	316	+}
	317	+
	318	+static void __thread_group_cputime(struct task_struct *tsk, u64 *samples)
	319	+{
	320	+ struct task_cputime ct;
	321	+
	322	+ thread_group_cputime(tsk, &ct);
	323	+ store_samples(samples, ct.stime, ct.utime, ct.sum_exec_runtime);
240	324	}
241	325
242	326	/*
243		- * Sample a process (thread group) clock for the given group_leader task.
244		- * Must be called with task sighand lock held for safe while_each_thread()
245		- * traversal.
	327	+ * Sample a process (thread group) clock for the given task clkid. If the
	328	+ * group's cputime accounting is already enabled, read the atomic
	329	+ * store. Otherwise a full update is required. clkid is already validated.
246	330	*/
247		-static int cpu_clock_sample_group(const clockid_t which_clock,
248		- struct task_struct *p,
249		- u64 *sample)
	331	+static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p,
	332	+ bool start)
250	333	{
251		- struct task_cputime cputime;
	334	+ struct thread_group_cputimer *cputimer = &p->signal->cputimer;
	335	+ struct posix_cputimers *pct = &p->signal->posix_cputimers;
	336	+ u64 samples[CPUCLOCK_MAX];
252	337
253		- switch (CPUCLOCK_WHICH(which_clock)) {
254		- default:
255		- return -EINVAL;
256		- case CPUCLOCK_PROF:
257		- thread_group_cputime(p, &cputime);
258		- *sample = cputime.utime + cputime.stime;
259		- break;
260		- case CPUCLOCK_VIRT:
261		- thread_group_cputime(p, &cputime);
262		- *sample = cputime.utime;
263		- break;
264		- case CPUCLOCK_SCHED:
265		- thread_group_cputime(p, &cputime);
266		- *sample = cputime.sum_exec_runtime;
267		- break;
268		- }
269		- return 0;
270		-}
271		-
272		-static int posix_cpu_clock_get_task(struct task_struct *tsk,
273		- const clockid_t which_clock,
274		- struct timespec64 *tp)
275		-{
276		- int err = -EINVAL;
277		- u64 rtn;
278		-
279		- if (CPUCLOCK_PERTHREAD(which_clock)) {
280		- if (same_thread_group(tsk, current))
281		- err = cpu_clock_sample(which_clock, tsk, &rtn);
	338	+ if (!READ_ONCE(pct->timers_active)) {
	339	+ if (start)
	340	+ thread_group_start_cputime(p, samples);
	341	+ else
	342	+ __thread_group_cputime(p, samples);
282	343	} else {
283		- if (tsk == current || thread_group_leader(tsk))
284		- err = cpu_clock_sample_group(which_clock, tsk, &rtn);
	344	+ proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
285	345	}
286	346
287		- if (!err)
288		- *tp = ns_to_timespec64(rtn);
289		-
290		- return err;
	347	+ return samples[clkid];
291	348	}
292	349
293		-
294		-static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp)
	350	+static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
295	351	{
296		- const pid_t pid = CPUCLOCK_PID(which_clock);
297		- int err = -EINVAL;
	352	+ const clockid_t clkid = CPUCLOCK_WHICH(clock);
	353	+ struct task_struct *tsk;
	354	+ u64 t;
298	355
299		- if (pid == 0) {
300		- /*
301		- * Special case constant value for our own clocks.
302		- * We don't have to do any lookup to find ourselves.
303		- */
304		- err = posix_cpu_clock_get_task(current, which_clock, tp);
305		- } else {
306		- /*
307		- * Find the given PID, and validate that the caller
308		- * should be able to see it.
309		- */
310		- struct task_struct *p;
311		- rcu_read_lock();
312		- p = find_task_by_vpid(pid);
313		- if (p)
314		- err = posix_cpu_clock_get_task(p, which_clock, tp);
	356	+ rcu_read_lock();
	357	+ tsk = pid_task(pid_for_clock(clock, true), clock_pid_type(clock));
	358	+ if (!tsk) {
315	359	rcu_read_unlock();
	360	+ return -EINVAL;
316	361	}
317	362
318		- return err;
	363	+ if (CPUCLOCK_PERTHREAD(clock))
	364	+ t = cpu_clock_sample(clkid, tsk);
	365	+ else
	366	+ t = cpu_clock_sample_group(clkid, tsk, false);
	367	+ rcu_read_unlock();
	368	+
	369	+ *tp = ns_to_timespec64(t);
	370	+ return 0;
319	371	}
320	372
321	373	/*
..	..	@@ -325,44 +377,32 @@
325	377	*/
326	378	static int posix_cpu_timer_create(struct k_itimer *new_timer)
327	379	{
328		- int ret = 0;
329		- const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
330		- struct task_struct *p;
331		-
332		- if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
333		- return -EINVAL;
334		-
335		- new_timer->kclock = &clock_posix_cpu;
336		-
337		- INIT_LIST_HEAD(&new_timer->it.cpu.entry);
	380	+ static struct lock_class_key posix_cpu_timers_key;
	381	+ struct pid *pid;
338	382
339	383	rcu_read_lock();
340		- if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
341		- if (pid == 0) {
342		- p = current;
343		- } else {
344		- p = find_task_by_vpid(pid);
345		- if (p && !same_thread_group(p, current))
346		- p = NULL;
347		- }
348		- } else {
349		- if (pid == 0) {
350		- p = current->group_leader;
351		- } else {
352		- p = find_task_by_vpid(pid);
353		- if (p && !has_group_leader_pid(p))
354		- p = NULL;
355		- }
	384	+ pid = pid_for_clock(new_timer->it_clock, false);
	385	+ if (!pid) {
	386	+ rcu_read_unlock();
	387	+ return -EINVAL;
356	388	}
357		- new_timer->it.cpu.task = p;
358		- if (p) {
359		- get_task_struct(p);
360		- } else {
361		- ret = -EINVAL;
362		- }
363		- rcu_read_unlock();
364	389
365		- return ret;
	390	+ /*
	391	+ * If posix timer expiry is handled in task work context then
	392	+ * timer::it_lock can be taken without disabling interrupts as all
	393	+ * other locking happens in task context. This requires a separate
	394	+ * lock class key otherwise regular posix timer expiry would record
	395	+ * the lock class being taken in interrupt context and generate a
	396	+ * false positive warning.
	397	+ */
	398	+ if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK))
	399	+ lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key);
	400	+
	401	+ new_timer->kclock = &clock_posix_cpu;
	402	+ timerqueue_init(&new_timer->it.cpu.node);
	403	+ new_timer->it.cpu.pid = get_pid(pid);
	404	+ rcu_read_unlock();
	405	+ return 0;
366	406	}
367	407
368	408	/*
..	..	@@ -373,13 +413,16 @@
373	413	*/
374	414	static int posix_cpu_timer_del(struct k_itimer *timer)
375	415	{
376		- int ret = 0;
377		- unsigned long flags;
	416	+ struct cpu_timer *ctmr = &timer->it.cpu;
378	417	struct sighand_struct *sighand;
379		- struct task_struct *p = timer->it.cpu.task;
	418	+ struct task_struct *p;
	419	+ unsigned long flags;
	420	+ int ret = 0;
380	421
381		- if (WARN_ON_ONCE(!p))
382		- return -EINVAL;
	422	+ rcu_read_lock();
	423	+ p = cpu_timer_task_rcu(timer);
	424	+ if (!p)
	425	+ goto out;
383	426
384	427	/*
385	428	* Protect against sighand release/switch in exit/exec and process/
..	..	@@ -388,44 +431,51 @@
388	431	sighand = lock_task_sighand(p, &flags);
389	432	if (unlikely(sighand == NULL)) {
390	433	/*
391		- * We raced with the reaping of the task.
392		- * The deletion should have cleared us off the list.
	434	+ * This raced with the reaping of the task. The exit cleanup
	435	+ * should have removed this timer from the timer queue.
393	436	*/
394		- WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry));
	437	+ WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node));
395	438	} else {
396	439	if (timer->it.cpu.firing)
397	440	ret = TIMER_RETRY;
398	441	else
399		- list_del(&timer->it.cpu.entry);
	442	+ cpu_timer_dequeue(ctmr);
400	443
401	444	unlock_task_sighand(p, &flags);
402	445	}
403	446
	447	+out:
	448	+ rcu_read_unlock();
404	449	if (!ret)
405		- put_task_struct(p);
	450	+ put_pid(ctmr->pid);
406	451
407	452	return ret;
408	453	}
409	454
410		-static void cleanup_timers_list(struct list_head *head)
	455	+static void cleanup_timerqueue(struct timerqueue_head *head)
411	456	{
412		- struct cpu_timer_list *timer, *next;
	457	+ struct timerqueue_node *node;
	458	+ struct cpu_timer *ctmr;
413	459
414		- list_for_each_entry_safe(timer, next, head, entry)
415		- list_del_init(&timer->entry);
	460	+ while ((node = timerqueue_getnext(head))) {
	461	+ timerqueue_del(head, node);
	462	+ ctmr = container_of(node, struct cpu_timer, node);
	463	+ ctmr->head = NULL;
	464	+ }
416	465	}
417	466
418	467	/*
419		- * Clean out CPU timers still ticking when a thread exited. The task
420		- * pointer is cleared, and the expiry time is replaced with the residual
421		- * time for later timer_gettime calls to return.
	468	+ * Clean out CPU timers which are still armed when a thread exits. The
	469	+ * timers are only removed from the list. No other updates are done. The
	470	+ * corresponding posix timers are still accessible, but cannot be rearmed.
	471	+ *
422	472	* This must be called with the siglock held.
423	473	*/
424		-static void cleanup_timers(struct list_head *head)
	474	+static void cleanup_timers(struct posix_cputimers *pct)
425	475	{
426		- cleanup_timers_list(head);
427		- cleanup_timers_list(++head);
428		- cleanup_timers_list(++head);
	476	+ cleanup_timerqueue(&pct->bases[CPUCLOCK_PROF].tqhead);
	477	+ cleanup_timerqueue(&pct->bases[CPUCLOCK_VIRT].tqhead);
	478	+ cleanup_timerqueue(&pct->bases[CPUCLOCK_SCHED].tqhead);
429	479	}
430	480
431	481	/*
..	..	@@ -435,76 +485,45 @@
435	485	*/
436	486	void posix_cpu_timers_exit(struct task_struct *tsk)
437	487	{
438		- cleanup_timers(tsk->cpu_timers);
	488	+ cleanup_timers(&tsk->posix_cputimers);
439	489	}
440	490	void posix_cpu_timers_exit_group(struct task_struct *tsk)
441	491	{
442		- cleanup_timers(tsk->signal->cpu_timers);
443		-}
444		-
445		-static inline int expires_gt(u64 expires, u64 new_exp)
446		-{
447		- return expires == 0 || expires > new_exp;
	492	+ cleanup_timers(&tsk->signal->posix_cputimers);
448	493	}
449	494
450	495	/*
451	496	* Insert the timer on the appropriate list before any timers that
452	497	* expire later. This must be called with the sighand lock held.
453	498	*/
454		-static void arm_timer(struct k_itimer *timer)
	499	+static void arm_timer(struct k_itimer *timer, struct task_struct *p)
455	500	{
456		- struct task_struct *p = timer->it.cpu.task;
457		- struct list_head *head, *listpos;
458		- struct task_cputime *cputime_expires;
459		- struct cpu_timer_list *const nt = &timer->it.cpu;
460		- struct cpu_timer_list *next;
	501	+ int clkidx = CPUCLOCK_WHICH(timer->it_clock);
	502	+ struct cpu_timer *ctmr = &timer->it.cpu;
	503	+ u64 newexp = cpu_timer_getexpires(ctmr);
	504	+ struct posix_cputimer_base *base;
461	505
462		- if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
463		- head = p->cpu_timers;
464		- cputime_expires = &p->cputime_expires;
465		- } else {
466		- head = p->signal->cpu_timers;
467		- cputime_expires = &p->signal->cputime_expires;
468		- }
469		- head += CPUCLOCK_WHICH(timer->it_clock);
	506	+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
	507	+ base = p->posix_cputimers.bases + clkidx;
	508	+ else
	509	+ base = p->signal->posix_cputimers.bases + clkidx;
470	510
471		- listpos = head;
472		- list_for_each_entry(next, head, entry) {
473		- if (nt->expires < next->expires)
474		- break;
475		- listpos = &next->entry;
476		- }
477		- list_add(&nt->entry, listpos);
	511	+ if (!cpu_timer_enqueue(&base->tqhead, ctmr))
	512	+ return;
478	513
479		- if (listpos == head) {
480		- u64 exp = nt->expires;
	514	+ /*
	515	+ * We are the new earliest-expiring POSIX 1.b timer, hence
	516	+ * need to update expiration cache. Take into account that
	517	+ * for process timers we share expiration cache with itimers
	518	+ * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
	519	+ */
	520	+ if (newexp < base->nextevt)
	521	+ base->nextevt = newexp;
481	522
482		- /*
483		- * We are the new earliest-expiring POSIX 1.b timer, hence
484		- * need to update expiration cache. Take into account that
485		- * for process timers we share expiration cache with itimers
486		- * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
487		- */
488		-
489		- switch (CPUCLOCK_WHICH(timer->it_clock)) {
490		- case CPUCLOCK_PROF:
491		- if (expires_gt(cputime_expires->prof_exp, exp))
492		- cputime_expires->prof_exp = exp;
493		- break;
494		- case CPUCLOCK_VIRT:
495		- if (expires_gt(cputime_expires->virt_exp, exp))
496		- cputime_expires->virt_exp = exp;
497		- break;
498		- case CPUCLOCK_SCHED:
499		- if (expires_gt(cputime_expires->sched_exp, exp))
500		- cputime_expires->sched_exp = exp;
501		- break;
502		- }
503		- if (CPUCLOCK_PERTHREAD(timer->it_clock))
504		- tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
505		- else
506		- tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
507		- }
	523	+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
	524	+ tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
	525	+ else
	526	+ tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
508	527	}
509	528
510	529	/*
..	..	@@ -512,24 +531,26 @@
512	531	*/
513	532	static void cpu_timer_fire(struct k_itimer *timer)
514	533	{
	534	+ struct cpu_timer *ctmr = &timer->it.cpu;
	535	+
515	536	if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
516	537	/*
517	538	* The user doesn't want any signal.
518	539	*/
519		- timer->it.cpu.expires = 0;
	540	+ cpu_timer_setexpires(ctmr, 0);
520	541	} else if (unlikely(timer->sigq == NULL)) {
521	542	/*
522	543	* This is a special case for clock_nanosleep,
523	544	* not a normal timer from sys_timer_create.
524	545	*/
525	546	wake_up_process(timer->it_process);
526		- timer->it.cpu.expires = 0;
527		- } else if (timer->it.cpu.incr == 0) {
	547	+ cpu_timer_setexpires(ctmr, 0);
	548	+ } else if (!timer->it_interval) {
528	549	/*
529	550	* One-shot timer. Clear it as soon as it's fired.
530	551	*/
531	552	posix_timer_event(timer, 0);
532		- timer->it.cpu.expires = 0;
	553	+ cpu_timer_setexpires(ctmr, 0);
533	554	} else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
534	555	/*
535	556	* The signal did not get queued because the signal
..	..	@@ -543,33 +564,6 @@
543	564	}
544	565
545	566	/*
546		- * Sample a process (thread group) timer for the given group_leader task.
547		- * Must be called with task sighand lock held for safe while_each_thread()
548		- * traversal.
549		- */
550		-static int cpu_timer_sample_group(const clockid_t which_clock,
551		- struct task_struct *p, u64 *sample)
552		-{
553		- struct task_cputime cputime;
554		-
555		- thread_group_cputimer(p, &cputime);
556		- switch (CPUCLOCK_WHICH(which_clock)) {
557		- default:
558		- return -EINVAL;
559		- case CPUCLOCK_PROF:
560		- *sample = cputime.utime + cputime.stime;
561		- break;
562		- case CPUCLOCK_VIRT:
563		- *sample = cputime.utime;
564		- break;
565		- case CPUCLOCK_SCHED:
566		- *sample = cputime.sum_exec_runtime;
567		- break;
568		- }
569		- return 0;
570		-}
571		-
572		-/*
573	567	* Guts of sys_timer_settime for CPU timers.
574	568	* This is called with the timer locked and interrupts disabled.
575	569	* If we return TIMER_RETRY, it's necessary to release the timer's lock
..	..	@@ -578,14 +572,24 @@
578	572	static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
579	573	struct itimerspec64 *new, struct itimerspec64 *old)
580	574	{
581		- unsigned long flags;
582		- struct sighand_struct *sighand;
583		- struct task_struct *p = timer->it.cpu.task;
	575	+ clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
584	576	u64 old_expires, new_expires, old_incr, val;
585		- int ret;
	577	+ struct cpu_timer *ctmr = &timer->it.cpu;
	578	+ struct sighand_struct *sighand;
	579	+ struct task_struct *p;
	580	+ unsigned long flags;
	581	+ int ret = 0;
586	582
587		- if (WARN_ON_ONCE(!p))
588		- return -EINVAL;
	583	+ rcu_read_lock();
	584	+ p = cpu_timer_task_rcu(timer);
	585	+ if (!p) {
	586	+ /*
	587	+ * If p has just been reaped, we can no
	588	+ * longer get any information about it at all.
	589	+ */
	590	+ rcu_read_unlock();
	591	+ return -ESRCH;
	592	+ }
589	593
590	594	/*
591	595	* Use the to_ktime conversion because that clamps the maximum
..	..	@@ -603,21 +607,22 @@
603	607	* longer get any information about it at all.
604	608	*/
605	609	if (unlikely(sighand == NULL)) {
	610	+ rcu_read_unlock();
606	611	return -ESRCH;
607	612	}
608	613
609	614	/*
610	615	* Disarm any old timer after extracting its expiry time.
611	616	*/
	617	+ old_incr = timer->it_interval;
	618	+ old_expires = cpu_timer_getexpires(ctmr);
612	619
613		- ret = 0;
614		- old_incr = timer->it.cpu.incr;
615		- old_expires = timer->it.cpu.expires;
616	620	if (unlikely(timer->it.cpu.firing)) {
617	621	timer->it.cpu.firing = -1;
618	622	ret = TIMER_RETRY;
619		- } else
620		- list_del_init(&timer->it.cpu.entry);
	623	+ } else {
	624	+ cpu_timer_dequeue(ctmr);
	625	+ }
621	626
622	627	/*
623	628	* We need to sample the current value to convert the new
..	..	@@ -627,11 +632,10 @@
627	632	* times (in arm_timer). With an absolute time, we must
628	633	* check if it's already passed. In short, we need a sample.
629	634	*/
630		- if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
631		- cpu_clock_sample(timer->it_clock, p, &val);
632		- } else {
633		- cpu_timer_sample_group(timer->it_clock, p, &val);
634		- }
	635	+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
	636	+ val = cpu_clock_sample(clkid, p);
	637	+ else
	638	+ val = cpu_clock_sample_group(clkid, p, true);
635	639
636	640	if (old) {
637	641	if (old_expires == 0) {
..	..	@@ -639,18 +643,16 @@
639	643	old->it_value.tv_nsec = 0;
640	644	} else {
641	645	/*
642		- * Update the timer in case it has
643		- * overrun already. If it has,
644		- * we'll report it as having overrun
645		- * and with the next reloaded timer
646		- * already ticking, though we are
647		- * swallowing that pending
648		- * notification here to install the
649		- * new setting.
	646	+ * Update the timer in case it has overrun already.
	647	+ * If it has, we'll report it as having overrun and
	648	+ * with the next reloaded timer already ticking,
	649	+ * though we are swallowing that pending
	650	+ * notification here to install the new setting.
650	651	*/
651		- bump_cpu_timer(timer, val);
652		- if (val < timer->it.cpu.expires) {
653		- old_expires = timer->it.cpu.expires - val;
	652	+ u64 exp = bump_cpu_timer(timer, val);
	653	+
	654	+ if (val < exp) {
	655	+ old_expires = exp - val;
654	656	old->it_value = ns_to_timespec64(old_expires);
655	657	} else {
656	658	old->it_value.tv_nsec = 1;
..	..	@@ -679,9 +681,9 @@
679	681	* For a timer with no notification action, we don't actually
680	682	* arm the timer (we'll just fake it for timer_gettime).
681	683	*/
682		- timer->it.cpu.expires = new_expires;
	684	+ cpu_timer_setexpires(ctmr, new_expires);
683	685	if (new_expires != 0 && val < new_expires) {
684		- arm_timer(timer);
	686	+ arm_timer(timer, p);
685	687	}
686	688
687	689	unlock_task_sighand(p, &flags);
..	..	@@ -689,8 +691,7 @@
689	691	* Install the new reload setting, and
690	692	* set up the signal and overrun bookkeeping.
691	693	*/
692		- timer->it.cpu.incr = timespec64_to_ns(&new->it_interval);
693		- timer->it_interval = ns_to_ktime(timer->it.cpu.incr);
	694	+ timer->it_interval = timespec64_to_ktime(new->it_interval);
694	695
695	696	/*
696	697	* This acts as a modification timestamp for the timer,
..	..	@@ -713,6 +714,7 @@
713	714
714	715	ret = 0;
715	716	out:
	717	+ rcu_read_unlock();
716	718	if (old)
717	719	old->it_interval = ns_to_timespec64(old_incr);
718	720
..	..	@@ -721,51 +723,34 @@
721	723
722	724	static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
723	725	{
724		- struct task_struct *p = timer->it.cpu.task;
725		- u64 now;
	726	+ clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
	727	+ struct cpu_timer *ctmr = &timer->it.cpu;
	728	+ u64 now, expires = cpu_timer_getexpires(ctmr);
	729	+ struct task_struct *p;
726	730
727		- if (WARN_ON_ONCE(!p))
728		- return;
	731	+ rcu_read_lock();
	732	+ p = cpu_timer_task_rcu(timer);
	733	+ if (!p)
	734	+ goto out;
729	735
730	736	/*
731	737	* Easy part: convert the reload time.
732	738	*/
733		- itp->it_interval = ns_to_timespec64(timer->it.cpu.incr);
	739	+ itp->it_interval = ktime_to_timespec64(timer->it_interval);
734	740
735		- if (!timer->it.cpu.expires)
736		- return;
	741	+ if (!expires)
	742	+ goto out;
737	743
738	744	/*
739	745	* Sample the clock to take the difference with the expiry time.
740	746	*/
741		- if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
742		- cpu_clock_sample(timer->it_clock, p, &now);
743		- } else {
744		- struct sighand_struct *sighand;
745		- unsigned long flags;
	747	+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
	748	+ now = cpu_clock_sample(clkid, p);
	749	+ else
	750	+ now = cpu_clock_sample_group(clkid, p, false);
746	751
747		- /*
748		- * Protect against sighand release/switch in exit/exec and
749		- * also make timer sampling safe if it ends up calling
750		- * thread_group_cputime().
751		- */
752		- sighand = lock_task_sighand(p, &flags);
753		- if (unlikely(sighand == NULL)) {
754		- /*
755		- * The process has been reaped.
756		- * We can't even collect a sample any more.
757		- * Call the timer disarmed, nothing else to do.
758		- */
759		- timer->it.cpu.expires = 0;
760		- return;
761		- } else {
762		- cpu_timer_sample_group(timer->it_clock, p, &now);
763		- unlock_task_sighand(p, &flags);
764		- }
765		- }
766		-
767		- if (now < timer->it.cpu.expires) {
768		- itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now);
	752	+ if (now < expires) {
	753	+ itp->it_value = ns_to_timespec64(expires - now);
769	754	} else {
770	755	/*
771	756	* The timer should have expired already, but the firing
..	..	@@ -774,29 +759,48 @@
774	759	itp->it_value.tv_nsec = 1;
775	760	itp->it_value.tv_sec = 0;
776	761	}
	762	+out:
	763	+ rcu_read_unlock();
777	764	}
778	765
779		-static unsigned long long
780		-check_timers_list(struct list_head *timers,
781		- struct list_head *firing,
782		- unsigned long long curr)
	766	+#define MAX_COLLECTED 20
	767	+
	768	+static u64 collect_timerqueue(struct timerqueue_head *head,
	769	+ struct list_head *firing, u64 now)
783	770	{
784		- int maxfire = 20;
	771	+ struct timerqueue_node *next;
	772	+ int i = 0;
785	773
786		- while (!list_empty(timers)) {
787		- struct cpu_timer_list *t;
	774	+ while ((next = timerqueue_getnext(head))) {
	775	+ struct cpu_timer *ctmr;
	776	+ u64 expires;
788	777
789		- t = list_first_entry(timers, struct cpu_timer_list, entry);
	778	+ ctmr = container_of(next, struct cpu_timer, node);
	779	+ expires = cpu_timer_getexpires(ctmr);
	780	+ /* Limit the number of timers to expire at once */
	781	+ if (++i == MAX_COLLECTED || now < expires)
	782	+ return expires;
790	783
791		- if (!--maxfire || curr < t->expires)
792		- return t->expires;
793		-
794		- t->firing = 1;
795		- t->firing_cpu = smp_processor_id();
796		- list_move_tail(&t->entry, firing);
	784	+ ctmr->firing = 1;
	785	+ /* See posix_cpu_timer_wait_running() */
	786	+ rcu_assign_pointer(ctmr->handling, current);
	787	+ cpu_timer_dequeue(ctmr);
	788	+ list_add_tail(&ctmr->elist, firing);
797	789	}
798	790
799		- return 0;
	791	+ return U64_MAX;
	792	+}
	793	+
	794	+static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples,
	795	+ struct list_head *firing)
	796	+{
	797	+ struct posix_cputimer_base *base = pct->bases;
	798	+ int i;
	799	+
	800	+ for (i = 0; i < CPUCLOCK_MAX; i++, base++) {
	801	+ base->nextevt = collect_timerqueue(&base->tqhead, firing,
	802	+ samples[i]);
	803	+ }
800	804	}
801	805
802	806	static inline void check_dl_overrun(struct task_struct *tsk)
..	..	@@ -807,6 +811,20 @@
807	811	}
808	812	}
809	813
	814	+static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
	815	+{
	816	+ if (time < limit)
	817	+ return false;
	818	+
	819	+ if (print_fatal_signals) {
	820	+ pr_info("%s Watchdog Timeout (%s): %s[%d]\n",
	821	+ rt ? "RT" : "CPU", hard ? "hard" : "soft",
	822	+ current->comm, task_pid_nr(current));
	823	+ }
	824	+ __group_send_sig_info(signo, SEND_SIG_PRIV, current);
	825	+ return true;
	826	+}
	827	+
810	828	/*
811	829	* Check for any per-thread CPU timers that have fired and move them off
812	830	* the tsk->cpu_timers[N] list onto the firing list. Here we update the
..	..	@@ -815,76 +833,50 @@
815	833	static void check_thread_timers(struct task_struct *tsk,
816	834	struct list_head *firing)
817	835	{
818		- struct list_head *timers = tsk->cpu_timers;
819		- struct task_cputime *tsk_expires = &tsk->cputime_expires;
820		- u64 expires;
	836	+ struct posix_cputimers *pct = &tsk->posix_cputimers;
	837	+ u64 samples[CPUCLOCK_MAX];
821	838	unsigned long soft;
822	839
823	840	if (dl_task(tsk))
824	841	check_dl_overrun(tsk);
825	842
826		- /*
827		- * If cputime_expires is zero, then there are no active
828		- * per thread CPU timers.
829		- */
830		- if (task_cputime_zero(&tsk->cputime_expires))
	843	+ if (expiry_cache_is_inactive(pct))
831	844	return;
832	845
833		- expires = check_timers_list(timers, firing, prof_ticks(tsk));
834		- tsk_expires->prof_exp = expires;
835		-
836		- expires = check_timers_list(++timers, firing, virt_ticks(tsk));
837		- tsk_expires->virt_exp = expires;
838		-
839		- tsk_expires->sched_exp = check_timers_list(++timers, firing,
840		- tsk->se.sum_exec_runtime);
	846	+ task_sample_cputime(tsk, samples);
	847	+ collect_posix_cputimers(pct, samples, firing);
841	848
842	849	/*
843	850	* Check for the special case thread timers.
844	851	*/
845	852	soft = task_rlimit(tsk, RLIMIT_RTTIME);
846	853	if (soft != RLIM_INFINITY) {
	854	+ /* Task RT timeout is accounted in jiffies. RTTIME is usec */
	855	+ unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ);
847	856	unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME);
848	857
	858	+ /* At the hard limit, send SIGKILL. No further action. */
849	859	if (hard != RLIM_INFINITY &&
850		- tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
851		- /*
852		- * At the hard limit, we just die.
853		- * No need to calculate anything else now.
854		- */
855		- if (print_fatal_signals) {
856		- pr_info("CPU Watchdog Timeout (hard): %s[%d]\n",
857		- tsk->comm, task_pid_nr(tsk));
858		- }
859		- __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
	860	+ check_rlimit(rttime, hard, SIGKILL, true, true))
860	861	return;
861		- }
862		- if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
863		- /*
864		- * At the soft limit, send a SIGXCPU every second.
865		- */
866		- if (soft < hard) {
867		- soft += USEC_PER_SEC;
868		- tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur =
869		- soft;
870		- }
871		- if (print_fatal_signals) {
872		- pr_info("RT Watchdog Timeout (soft): %s[%d]\n",
873		- tsk->comm, task_pid_nr(tsk));
874		- }
875		- __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
	862	+
	863	+ /* At the soft limit, send a SIGXCPU every second */
	864	+ if (check_rlimit(rttime, soft, SIGXCPU, true, false)) {
	865	+ soft += USEC_PER_SEC;
	866	+ tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft;
876	867	}
877	868	}
878		- if (task_cputime_zero(tsk_expires))
	869	+
	870	+ if (expiry_cache_is_inactive(pct))
879	871	tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
880	872	}
881	873
882	874	static inline void stop_process_timers(struct signal_struct *sig)
883	875	{
884		- struct thread_group_cputimer *cputimer = &sig->cputimer;
	876	+ struct posix_cputimers *pct = &sig->posix_cputimers;
885	877
886		- /* Turn off cputimer->running. This is done without locking. */
887		- WRITE_ONCE(cputimer->running, false);
	878	+ /* Turn off the active flag. This is done without locking. */
	879	+ WRITE_ONCE(pct->timers_active, false);
888	880	tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);
889	881	}
890	882
..	..	@@ -906,7 +898,7 @@
906	898	__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
907	899	}
908	900
909		- if (it->expires && (!*expires || it->expires < *expires))
	901	+ if (it->expires && it->expires < *expires)
910	902	*expires = it->expires;
911	903	}
912	904
..	..	@@ -919,90 +911,69 @@
919	911	struct list_head *firing)
920	912	{
921	913	struct signal_struct *const sig = tsk->signal;
922		- u64 utime, ptime, virt_expires, prof_expires;
923		- u64 sum_sched_runtime, sched_expires;
924		- struct list_head *timers = sig->cpu_timers;
925		- struct task_cputime cputime;
	914	+ struct posix_cputimers *pct = &sig->posix_cputimers;
	915	+ u64 samples[CPUCLOCK_MAX];
926	916	unsigned long soft;
927	917
928		- if (dl_task(tsk))
929		- check_dl_overrun(tsk);
930		-
931	918	/*
932		- * If cputimer is not running, then there are no active
933		- * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU).
	919	+ * If there are no active process wide timers (POSIX 1.b, itimers,
	920	+ * RLIMIT_CPU) nothing to check. Also skip the process wide timer
	921	+ * processing when there is already another task handling them.
934	922	*/
935		- if (!READ_ONCE(tsk->signal->cputimer.running))
	923	+ if (!READ_ONCE(pct->timers_active) || pct->expiry_active)
936	924	return;
937	925
938		- /*
	926	+ /*
939	927	* Signify that a thread is checking for process timers.
940	928	* Write access to this field is protected by the sighand lock.
941	929	*/
942		- sig->cputimer.checking_timer = true;
	930	+ pct->expiry_active = true;
943	931
944	932	/*
945		- * Collect the current process totals.
	933	+ * Collect the current process totals. Group accounting is active
	934	+ * so the sample can be taken directly.
946	935	*/
947		- thread_group_cputimer(tsk, &cputime);
948		- utime = cputime.utime;
949		- ptime = utime + cputime.stime;
950		- sum_sched_runtime = cputime.sum_exec_runtime;
951		-
952		- prof_expires = check_timers_list(timers, firing, ptime);
953		- virt_expires = check_timers_list(++timers, firing, utime);
954		- sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);
	936	+ proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples);
	937	+ collect_posix_cputimers(pct, samples, firing);
955	938
956	939	/*
957	940	* Check for the special case process timers.
958	941	*/
959		- check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
960		- SIGPROF);
961		- check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
962		- SIGVTALRM);
	942	+ check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF],
	943	+ &pct->bases[CPUCLOCK_PROF].nextevt,
	944	+ samples[CPUCLOCK_PROF], SIGPROF);
	945	+ check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT],
	946	+ &pct->bases[CPUCLOCK_VIRT].nextevt,
	947	+ samples[CPUCLOCK_VIRT], SIGVTALRM);
	948	+
963	949	soft = task_rlimit(tsk, RLIMIT_CPU);
964	950	if (soft != RLIM_INFINITY) {
965		- unsigned long psecs = div_u64(ptime, NSEC_PER_SEC);
	951	+ /* RLIMIT_CPU is in seconds. Samples are nanoseconds */
966	952	unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU);
967		- u64 x;
968		- if (psecs >= hard) {
969		- /*
970		- * At the hard limit, we just die.
971		- * No need to calculate anything else now.
972		- */
973		- if (print_fatal_signals) {
974		- pr_info("RT Watchdog Timeout (hard): %s[%d]\n",
975		- tsk->comm, task_pid_nr(tsk));
976		- }
977		- __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
	953	+ u64 ptime = samples[CPUCLOCK_PROF];
	954	+ u64 softns = (u64)soft * NSEC_PER_SEC;
	955	+ u64 hardns = (u64)hard * NSEC_PER_SEC;
	956	+
	957	+ /* At the hard limit, send SIGKILL. No further action. */
	958	+ if (hard != RLIM_INFINITY &&
	959	+ check_rlimit(ptime, hardns, SIGKILL, false, true))
978	960	return;
	961	+
	962	+ /* At the soft limit, send a SIGXCPU every second */
	963	+ if (check_rlimit(ptime, softns, SIGXCPU, false, false)) {
	964	+ sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1;
	965	+ softns += NSEC_PER_SEC;
979	966	}
980		- if (psecs >= soft) {
981		- /*
982		- * At the soft limit, send a SIGXCPU every second.
983		- */
984		- if (print_fatal_signals) {
985		- pr_info("CPU Watchdog Timeout (soft): %s[%d]\n",
986		- tsk->comm, task_pid_nr(tsk));
987		- }
988		- __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
989		- if (soft < hard) {
990		- soft++;
991		- sig->rlim[RLIMIT_CPU].rlim_cur = soft;
992		- }
993		- }
994		- x = soft * NSEC_PER_SEC;
995		- if (!prof_expires || x < prof_expires)
996		- prof_expires = x;
	967	+
	968	+ /* Update the expiry cache */
	969	+ if (softns < pct->bases[CPUCLOCK_PROF].nextevt)
	970	+ pct->bases[CPUCLOCK_PROF].nextevt = softns;
997	971	}
998	972
999		- sig->cputime_expires.prof_exp = prof_expires;
1000		- sig->cputime_expires.virt_exp = virt_expires;
1001		- sig->cputime_expires.sched_exp = sched_expires;
1002		- if (task_cputime_zero(&sig->cputime_expires))
	973	+ if (expiry_cache_is_inactive(pct))
1003	974	stop_process_timers(sig);
1004	975
1005		- sig->cputimer.checking_timer = false;
	976	+ pct->expiry_active = false;
1006	977	}
1007	978
1008	979	/*
..	..	@@ -1011,78 +982,60 @@
1011	982	*/
1012	983	static void posix_cpu_timer_rearm(struct k_itimer *timer)
1013	984	{
1014		- struct task_struct *p = timer->it.cpu.task;
	985	+ clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
	986	+ struct task_struct *p;
1015	987	struct sighand_struct *sighand;
1016	988	unsigned long flags;
1017	989	u64 now;
1018	990
1019		- if (WARN_ON_ONCE(!p))
1020		- return;
	991	+ rcu_read_lock();
	992	+ p = cpu_timer_task_rcu(timer);
	993	+ if (!p)
	994	+ goto out;
	995	+
	996	+ /* Protect timer list r/w in arm_timer() */
	997	+ sighand = lock_task_sighand(p, &flags);
	998	+ if (unlikely(sighand == NULL))
	999	+ goto out;
1021	1000
1022	1001	/*
1023	1002	* Fetch the current sample and update the timer's expiry time.
1024	1003	*/
1025		- if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
1026		- cpu_clock_sample(timer->it_clock, p, &now);
1027		- bump_cpu_timer(timer, now);
1028		- if (unlikely(p->exit_state))
1029		- return;
	1004	+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
	1005	+ now = cpu_clock_sample(clkid, p);
	1006	+ else
	1007	+ now = cpu_clock_sample_group(clkid, p, true);
1030	1008
1031		- /* Protect timer list r/w in arm_timer() */
1032		- sighand = lock_task_sighand(p, &flags);
1033		- if (!sighand)
1034		- return;
1035		- } else {
1036		- /*
1037		- * Protect arm_timer() and timer sampling in case of call to
1038		- * thread_group_cputime().
1039		- */
1040		- sighand = lock_task_sighand(p, &flags);
1041		- if (unlikely(sighand == NULL)) {
1042		- /*
1043		- * The process has been reaped.
1044		- * We can't even collect a sample any more.
1045		- */
1046		- timer->it.cpu.expires = 0;
1047		- return;
1048		- } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1049		- /* If the process is dying, no need to rearm */
1050		- goto unlock;
1051		- }
1052		- cpu_timer_sample_group(timer->it_clock, p, &now);
1053		- bump_cpu_timer(timer, now);
1054		- /* Leave the sighand locked for the call below. */
1055		- }
	1009	+ bump_cpu_timer(timer, now);
1056	1010
1057	1011	/*
1058	1012	* Now re-arm for the new expiry time.
1059	1013	*/
1060		- arm_timer(timer);
1061		-unlock:
	1014	+ arm_timer(timer, p);
1062	1015	unlock_task_sighand(p, &flags);
	1016	+out:
	1017	+ rcu_read_unlock();
1063	1018	}
1064	1019
1065	1020	/**
1066		- * task_cputime_expired - Compare two task_cputime entities.
	1021	+ * task_cputimers_expired - Check whether posix CPU timers are expired
1067	1022	*
1068		- * @sample: The task_cputime structure to be checked for expiration.
1069		- * @expires: Expiration times, against which @sample will be checked.
	1023	+ * @samples: Array of current samples for the CPUCLOCK clocks
	1024	+ * @pct: Pointer to a posix_cputimers container
1070	1025	*
1071		- * Checks @sample against @expires to see if any field of @sample has expired.
1072		- * Returns true if any field of the former is greater than the corresponding
1073		- * field of the latter if the latter field is set. Otherwise returns false.
	1026	+ * Returns true if any member of @samples is greater than the corresponding
	1027	+ * member of @pct->bases[CLK].nextevt. False otherwise
1074	1028	*/
1075		-static inline int task_cputime_expired(const struct task_cputime *sample,
1076		- const struct task_cputime *expires)
	1029	+static inline bool
	1030	+task_cputimers_expired(const u64 *samples, struct posix_cputimers *pct)
1077	1031	{
1078		- if (expires->utime && sample->utime >= expires->utime)
1079		- return 1;
1080		- if (expires->stime && sample->utime + sample->stime >= expires->stime)
1081		- return 1;
1082		- if (expires->sum_exec_runtime != 0 &&
1083		- sample->sum_exec_runtime >= expires->sum_exec_runtime)
1084		- return 1;
1085		- return 0;
	1032	+ int i;
	1033	+
	1034	+ for (i = 0; i < CPUCLOCK_MAX; i++) {
	1035	+ if (samples[i] >= pct->bases[i].nextevt)
	1036	+ return true;
	1037	+ }
	1038	+ return false;
1086	1039	}
1087	1040
1088	1041	/**
..	..	@@ -1095,101 +1048,279 @@
1095	1048	* timers and compare them with the corresponding expiration times. Return
1096	1049	* true if a timer has expired, else return false.
1097	1050	*/
1098		-static inline int fastpath_timer_check(struct task_struct *tsk)
	1051	+static inline bool fastpath_timer_check(struct task_struct *tsk)
1099	1052	{
	1053	+ struct posix_cputimers *pct = &tsk->posix_cputimers;
1100	1054	struct signal_struct *sig;
1101	1055
1102		- if (!task_cputime_zero(&tsk->cputime_expires)) {
1103		- struct task_cputime task_sample;
	1056	+ if (!expiry_cache_is_inactive(pct)) {
	1057	+ u64 samples[CPUCLOCK_MAX];
1104	1058
1105		- task_cputime(tsk, &task_sample.utime, &task_sample.stime);
1106		- task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime;
1107		- if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1108		- return 1;
	1059	+ task_sample_cputime(tsk, samples);
	1060	+ if (task_cputimers_expired(samples, pct))
	1061	+ return true;
1109	1062	}
1110	1063
1111	1064	sig = tsk->signal;
	1065	+ pct = &sig->posix_cputimers;
1112	1066	/*
1113		- * Check if thread group timers expired when the cputimer is
1114		- * running and no other thread in the group is already checking
1115		- * for thread group cputimers. These fields are read without the
1116		- * sighand lock. However, this is fine because this is meant to
1117		- * be a fastpath heuristic to determine whether we should try to
1118		- * acquire the sighand lock to check/handle timers.
	1067	+ * Check if thread group timers expired when timers are active and
	1068	+ * no other thread in the group is already handling expiry for
	1069	+ * thread group cputimers. These fields are read without the
	1070	+ * sighand lock. However, this is fine because this is meant to be
	1071	+ * a fastpath heuristic to determine whether we should try to
	1072	+ * acquire the sighand lock to handle timer expiry.
1119	1073	*
1120		- * In the worst case scenario, if 'running' or 'checking_timer' gets
1121		- * set but the current thread doesn't see the change yet, we'll wait
1122		- * until the next thread in the group gets a scheduler interrupt to
1123		- * handle the timer. This isn't an issue in practice because these
1124		- * types of delays with signals actually getting sent are expected.
	1074	+ * In the worst case scenario, if concurrently timers_active is set
	1075	+ * or expiry_active is cleared, but the current thread doesn't see
	1076	+ * the change yet, the timer checks are delayed until the next
	1077	+ * thread in the group gets a scheduler interrupt to handle the
	1078	+ * timer. This isn't an issue in practice because these types of
	1079	+ * delays with signals actually getting sent are expected.
1125	1080	*/
1126		- if (READ_ONCE(sig->cputimer.running) &&
1127		- !READ_ONCE(sig->cputimer.checking_timer)) {
1128		- struct task_cputime group_sample;
	1081	+ if (READ_ONCE(pct->timers_active) && !READ_ONCE(pct->expiry_active)) {
	1082	+ u64 samples[CPUCLOCK_MAX];
1129	1083
1130		- sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
	1084	+ proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic,
	1085	+ samples);
1131	1086
1132		- if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1133		- return 1;
	1087	+ if (task_cputimers_expired(samples, pct))
	1088	+ return true;
1134	1089	}
1135	1090
1136	1091	if (dl_task(tsk) && tsk->dl.dl_overrun)
1137		- return 1;
	1092	+ return true;
1138	1093
1139		- return 0;
	1094	+ return false;
1140	1095	}
1141	1096
1142		-static DEFINE_PER_CPU(spinlock_t, cpu_timer_expiry_lock) = __SPIN_LOCK_UNLOCKED(cpu_timer_expiry_lock);
	1097	+static void handle_posix_cpu_timers(struct task_struct *tsk);
1143	1098
1144		-void cpu_timers_grab_expiry_lock(struct k_itimer *timer)
	1099	+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
	1100	+static void posix_cpu_timers_work(struct callback_head *work)
1145	1101	{
1146		- int cpu = timer->it.cpu.firing_cpu;
	1102	+ struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);
1147	1103
1148		- if (cpu >= 0) {
1149		- spinlock_t *expiry_lock = per_cpu_ptr(&cpu_timer_expiry_lock, cpu);
1150		-
1151		- spin_lock_irq(expiry_lock);
1152		- spin_unlock_irq(expiry_lock);
1153		- }
	1104	+ mutex_lock(&cw->mutex);
	1105	+ handle_posix_cpu_timers(current);
	1106	+ mutex_unlock(&cw->mutex);
1154	1107	}
1155	1108
1156	1109	/*
1157		- * This is called from the timer interrupt handler. The irq handler has
1158		- * already updated our counts. We need to check if any timers fire now.
1159		- * Interrupts are disabled.
	1110	+ * Invoked from the posix-timer core when a cancel operation failed because
	1111	+ * the timer is marked firing. The caller holds rcu_read_lock(), which
	1112	+ * protects the timer and the task which is expiring it from being freed.
1160	1113	*/
1161		-static void __run_posix_cpu_timers(struct task_struct *tsk)
	1114	+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
1162	1115	{
1163		- LIST_HEAD(firing);
1164		- struct k_itimer *timer, *next;
1165		- unsigned long flags;
1166		- spinlock_t *expiry_lock;
	1116	+ struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);
	1117	+
	1118	+ /* Has the handling task completed expiry already? */
	1119	+ if (!tsk)
	1120	+ return;
	1121	+
	1122	+ /* Ensure that the task cannot go away */
	1123	+ get_task_struct(tsk);
	1124	+ /* Now drop the RCU protection so the mutex can be locked */
	1125	+ rcu_read_unlock();
	1126	+ /* Wait on the expiry mutex */
	1127	+ mutex_lock(&tsk->posix_cputimers_work.mutex);
	1128	+ /* Release it immediately again. */
	1129	+ mutex_unlock(&tsk->posix_cputimers_work.mutex);
	1130	+ /* Drop the task reference. */
	1131	+ put_task_struct(tsk);
	1132	+ /* Relock RCU so the callsite is balanced */
	1133	+ rcu_read_lock();
	1134	+}
	1135	+
	1136	+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
	1137	+{
	1138	+ /* Ensure that timr->it.cpu.handling task cannot go away */
	1139	+ rcu_read_lock();
	1140	+ spin_unlock_irq(&timr->it_lock);
	1141	+ posix_cpu_timer_wait_running(timr);
	1142	+ rcu_read_unlock();
	1143	+ /* @timr is on stack and is valid */
	1144	+ spin_lock_irq(&timr->it_lock);
	1145	+}
	1146	+
	1147	+/*
	1148	+ * Clear existing posix CPU timers task work.
	1149	+ */
	1150	+void clear_posix_cputimers_work(struct task_struct *p)
	1151	+{
	1152	+ /*
	1153	+ * A copied work entry from the old task is not meaningful, clear it.
	1154	+ * N.B. init_task_work will not do this.
	1155	+ */
	1156	+ memset(&p->posix_cputimers_work.work, 0,
	1157	+ sizeof(p->posix_cputimers_work.work));
	1158	+ init_task_work(&p->posix_cputimers_work.work,
	1159	+ posix_cpu_timers_work);
	1160	+ mutex_init(&p->posix_cputimers_work.mutex);
	1161	+ p->posix_cputimers_work.scheduled = false;
	1162	+}
	1163	+
	1164	+/*
	1165	+ * Initialize posix CPU timers task work in init task. Out of line to
	1166	+ * keep the callback static and to avoid header recursion hell.
	1167	+ */
	1168	+void __init posix_cputimers_init_work(void)
	1169	+{
	1170	+ clear_posix_cputimers_work(current);
	1171	+}
	1172	+
	1173	+/*
	1174	+ * Note: All operations on tsk->posix_cputimers_work.scheduled happen either
	1175	+ * in hard interrupt context or in task context with interrupts
	1176	+ * disabled. Aside of that the writer/reader interaction is always in the
	1177	+ * context of the current task, which means they are strict per CPU.
	1178	+ */
	1179	+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
	1180	+{
	1181	+ return tsk->posix_cputimers_work.scheduled;
	1182	+}
	1183	+
	1184	+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
	1185	+{
	1186	+ if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled))
	1187	+ return;
	1188	+
	1189	+ /* Schedule task work to actually expire the timers */
	1190	+ tsk->posix_cputimers_work.scheduled = true;
	1191	+ task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
	1192	+}
	1193	+
	1194	+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
	1195	+ unsigned long start)
	1196	+{
	1197	+ bool ret = true;
1167	1198
1168	1199	/*
1169		- * The fast path checks that there are no expired thread or thread
1170		- * group timers. If that's so, just return.
	1200	+ * On !RT kernels interrupts are disabled while collecting expired
	1201	+ * timers, so no tick can happen and the fast path check can be
	1202	+ * reenabled without further checks.
1171	1203	*/
1172		- if (!fastpath_timer_check(tsk))
1173		- return;
1174		-
1175		- expiry_lock = this_cpu_ptr(&cpu_timer_expiry_lock);
1176		- spin_lock(expiry_lock);
1177		-
1178		- if (!lock_task_sighand(tsk, &flags)) {
1179		- spin_unlock(expiry_lock);
1180		- return;
	1204	+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
	1205	+ tsk->posix_cputimers_work.scheduled = false;
	1206	+ return true;
1181	1207	}
	1208	+
1182	1209	/*
1183		- * Here we take off tsk->signal->cpu_timers[N] and
1184		- * tsk->cpu_timers[N] all the timers that are firing, and
1185		- * put them on the firing list.
	1210	+ * On RT enabled kernels ticks can happen while the expired timers
	1211	+ * are collected under sighand lock. But any tick which observes
	1212	+ * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath
	1213	+ * checks. So reenabling the tick work has to be done carefully:
	1214	+ *
	1215	+ * Disable interrupts and run the fast path check if jiffies have
	1216	+ * advanced since the collecting of expired timers started. If
	1217	+ * jiffies have not advanced or the fast path check did not find
	1218	+ * newly expired timers, reenable the fast path check in the timer
	1219	+ * interrupt. If there are newly expired timers, return false and
	1220	+ * let the collection loop repeat.
1186	1221	*/
1187		- check_thread_timers(tsk, &firing);
	1222	+ local_irq_disable();
	1223	+ if (start != jiffies && fastpath_timer_check(tsk))
	1224	+ ret = false;
	1225	+ else
	1226	+ tsk->posix_cputimers_work.scheduled = false;
	1227	+ local_irq_enable();
1188	1228
1189		- check_process_timers(tsk, &firing);
	1229	+ return ret;
	1230	+}
	1231	+#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
	1232	+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
	1233	+{
	1234	+ lockdep_posixtimer_enter();
	1235	+ handle_posix_cpu_timers(tsk);
	1236	+ lockdep_posixtimer_exit();
	1237	+}
	1238	+
	1239	+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
	1240	+{
	1241	+ cpu_relax();
	1242	+}
	1243	+
	1244	+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
	1245	+{
	1246	+ spin_unlock_irq(&timr->it_lock);
	1247	+ cpu_relax();
	1248	+ spin_lock_irq(&timr->it_lock);
	1249	+}
	1250	+
	1251	+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
	1252	+{
	1253	+ return false;
	1254	+}
	1255	+
	1256	+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
	1257	+ unsigned long start)
	1258	+{
	1259	+ return true;
	1260	+}
	1261	+#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
	1262	+
	1263	+static void handle_posix_cpu_timers(struct task_struct *tsk)
	1264	+{
	1265	+ struct k_itimer *timer, *next;
	1266	+ unsigned long flags, start;
	1267	+ LIST_HEAD(firing);
	1268	+
	1269	+ if (!lock_task_sighand(tsk, &flags))
	1270	+ return;
	1271	+
	1272	+ do {
	1273	+ /*
	1274	+ * On RT locking sighand lock does not disable interrupts,
	1275	+ * so this needs to be careful vs. ticks. Store the current
	1276	+ * jiffies value.
	1277	+ */
	1278	+ start = READ_ONCE(jiffies);
	1279	+ barrier();
	1280	+
	1281	+ /*
	1282	+ * Here we take off tsk->signal->cpu_timers[N] and
	1283	+ * tsk->cpu_timers[N] all the timers that are firing, and
	1284	+ * put them on the firing list.
	1285	+ */
	1286	+ check_thread_timers(tsk, &firing);
	1287	+
	1288	+ check_process_timers(tsk, &firing);
	1289	+
	1290	+ /*
	1291	+ * The above timer checks have updated the expiry cache and
	1292	+ * because nothing can have queued or modified timers after
	1293	+ * sighand lock was taken above it is guaranteed to be
	1294	+ * consistent. So the next timer interrupt fastpath check
	1295	+ * will find valid data.
	1296	+ *
	1297	+ * If timer expiry runs in the timer interrupt context then
	1298	+ * the loop is not relevant as timers will be directly
	1299	+ * expired in interrupt context. The stub function below
	1300	+ * returns always true which allows the compiler to
	1301	+ * optimize the loop out.
	1302	+ *
	1303	+ * If timer expiry is deferred to task work context then
	1304	+ * the following rules apply:
	1305	+ *
	1306	+ * - On !RT kernels no tick can have happened on this CPU
	1307	+ * after sighand lock was acquired because interrupts are
	1308	+ * disabled. So reenabling task work before dropping
	1309	+ * sighand lock and reenabling interrupts is race free.
	1310	+ *
	1311	+ * - On RT kernels ticks might have happened but the tick
	1312	+ * work ignored posix CPU timer handling because the
	1313	+ * CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work
	1314	+ * must be done very carefully including a check whether
	1315	+ * ticks have happened since the start of the timer
	1316	+ * expiry checks. posix_cpu_timers_enable_work() takes
	1317	+ * care of that and eventually lets the expiry checks
	1318	+ * run again.
	1319	+ */
	1320	+ } while (!posix_cpu_timers_enable_work(tsk, start));
1190	1321
1191	1322	/*
1192		- * We must release these locks before taking any timer's lock.
	1323	+ * We must release sighand lock before taking any timer's lock.
1193	1324	* There is a potential race with timer deletion here, as the
1194	1325	* siglock now protects our private firing list. We have set
1195	1326	* the firing flag in each timer, so that a deletion attempt
..	..	@@ -1204,14 +1335,20 @@
1204	1335	* each timer's lock before clearing its firing flag, so no
1205	1336	* timer call will interfere.
1206	1337	*/
1207		- list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
	1338	+ list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
1208	1339	int cpu_firing;
1209	1340
	1341	+ /*
	1342	+ * spin_lock() is sufficient here even independent of the
	1343	+ * expiry context. If expiry happens in hard interrupt
	1344	+ * context it's obvious. For task work context it's safe
	1345	+ * because all other operations on timer::it_lock happen in
	1346	+ * task context (syscall or exit).
	1347	+ */
1210	1348	spin_lock(&timer->it_lock);
1211		- list_del_init(&timer->it.cpu.entry);
	1349	+ list_del_init(&timer->it.cpu.elist);
1212	1350	cpu_firing = timer->it.cpu.firing;
1213	1351	timer->it.cpu.firing = 0;
1214		- timer->it.cpu.firing_cpu = -1;
1215	1352	/*
1216	1353	* The firing flag is -1 if we collided with a reset
1217	1354	* of the timer, which already reported this
..	..	@@ -1219,174 +1356,56 @@
1219	1356	*/
1220	1357	if (likely(cpu_firing >= 0))
1221	1358	cpu_timer_fire(timer);
	1359	+ /* See posix_cpu_timer_wait_running() */
	1360	+ rcu_assign_pointer(timer->it.cpu.handling, NULL);
1222	1361	spin_unlock(&timer->it_lock);
1223	1362	}
1224		- spin_unlock(expiry_lock);
1225	1363	}
1226	1364
1227		-#ifdef CONFIG_PREEMPT_RT_BASE
1228		-#include <linux/kthread.h>
1229		-#include <linux/cpu.h>
1230		-DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
1231		-DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
1232		-DEFINE_PER_CPU(bool, posix_timer_th_active);
1233		-
1234		-static void posix_cpu_kthread_fn(unsigned int cpu)
	1365	+/*
	1366	+ * This is called from the timer interrupt handler. The irq handler has
	1367	+ * already updated our counts. We need to check if any timers fire now.
	1368	+ * Interrupts are disabled.
	1369	+ */
	1370	+void run_posix_cpu_timers(void)
1235	1371	{
1236		- struct task_struct *tsk = NULL;
1237		- struct task_struct *next = NULL;
	1372	+ struct task_struct *tsk = current;
1238	1373
1239		- BUG_ON(per_cpu(posix_timer_task, cpu) != current);
1240		-
1241		- /* grab task list */
1242		- raw_local_irq_disable();
1243		- tsk = per_cpu(posix_timer_tasklist, cpu);
1244		- per_cpu(posix_timer_tasklist, cpu) = NULL;
1245		- raw_local_irq_enable();
1246		-
1247		- /* its possible the list is empty, just return */
1248		- if (!tsk)
1249		- return;
1250		-
1251		- /* Process task list */
1252		- while (1) {
1253		- /* save next */
1254		- next = tsk->posix_timer_list;
1255		-
1256		- /* run the task timers, clear its ptr and
1257		- * unreference it
1258		- */
1259		- __run_posix_cpu_timers(tsk);
1260		- tsk->posix_timer_list = NULL;
1261		- put_task_struct(tsk);
1262		-
1263		- /* check if this is the last on the list */
1264		- if (next == tsk)
1265		- break;
1266		- tsk = next;
1267		- }
1268		-}
1269		-
1270		-static inline int __fastpath_timer_check(struct task_struct *tsk)
1271		-{
1272		- /* tsk == current, ensure it is safe to use ->signal/sighand */
1273		- if (unlikely(tsk->exit_state))
1274		- return 0;
1275		-
1276		- if (!task_cputime_zero(&tsk->cputime_expires))
1277		- return 1;
1278		-
1279		- if (!task_cputime_zero(&tsk->signal->cputime_expires))
1280		- return 1;
1281		-
1282		- return 0;
1283		-}
1284		-
1285		-void run_posix_cpu_timers(struct task_struct *tsk)
1286		-{
1287		- unsigned int cpu = smp_processor_id();
1288		- struct task_struct *tasklist;
1289		-
1290		- BUG_ON(!irqs_disabled());
1291		-
1292		- if (per_cpu(posix_timer_th_active, cpu) != true)
1293		- return;
1294		-
1295		- /* get per-cpu references */
1296		- tasklist = per_cpu(posix_timer_tasklist, cpu);
1297		-
1298		- /* check to see if we're already queued */
1299		- if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
1300		- get_task_struct(tsk);
1301		- if (tasklist) {
1302		- tsk->posix_timer_list = tasklist;
1303		- } else {
1304		- /*
1305		- * The list is terminated by a self-pointing
1306		- * task_struct
1307		- */
1308		- tsk->posix_timer_list = tsk;
1309		- }
1310		- per_cpu(posix_timer_tasklist, cpu) = tsk;
1311		-
1312		- wake_up_process(per_cpu(posix_timer_task, cpu));
1313		- }
1314		-}
1315		-
1316		-static int posix_cpu_kthread_should_run(unsigned int cpu)
1317		-{
1318		- return __this_cpu_read(posix_timer_tasklist) != NULL;
1319		-}
1320		-
1321		-static void posix_cpu_kthread_park(unsigned int cpu)
1322		-{
1323		- this_cpu_write(posix_timer_th_active, false);
1324		-}
1325		-
1326		-static void posix_cpu_kthread_unpark(unsigned int cpu)
1327		-{
1328		- this_cpu_write(posix_timer_th_active, true);
1329		-}
1330		-
1331		-static void posix_cpu_kthread_setup(unsigned int cpu)
1332		-{
1333		- struct sched_param sp;
1334		-
1335		- sp.sched_priority = MAX_RT_PRIO - 1;
1336		- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1337		- posix_cpu_kthread_unpark(cpu);
1338		-}
1339		-
1340		-static struct smp_hotplug_thread posix_cpu_thread = {
1341		- .store = &posix_timer_task,
1342		- .thread_should_run = posix_cpu_kthread_should_run,
1343		- .thread_fn = posix_cpu_kthread_fn,
1344		- .thread_comm = "posixcputmr/%u",
1345		- .setup = posix_cpu_kthread_setup,
1346		- .park = posix_cpu_kthread_park,
1347		- .unpark = posix_cpu_kthread_unpark,
1348		-};
1349		-
1350		-static int __init posix_cpu_thread_init(void)
1351		-{
1352		- /* Start one for boot CPU. */
1353		- unsigned long cpu;
1354		- int ret;
1355		-
1356		- /* init the per-cpu posix_timer_tasklets */
1357		- for_each_possible_cpu(cpu)
1358		- per_cpu(posix_timer_tasklist, cpu) = NULL;
1359		-
1360		- ret = smpboot_register_percpu_thread(&posix_cpu_thread);
1361		- WARN_ON(ret);
1362		-
1363		- return 0;
1364		-}
1365		-early_initcall(posix_cpu_thread_init);
1366		-#else /* CONFIG_PREEMPT_RT_BASE */
1367		-void run_posix_cpu_timers(struct task_struct *tsk)
1368		-{
1369	1374	lockdep_assert_irqs_disabled();
	1375	+
	1376	+ /*
	1377	+ * If the actual expiry is deferred to task work context and the
	1378	+ * work is already scheduled there is no point to do anything here.
	1379	+ */
	1380	+ if (posix_cpu_timers_work_scheduled(tsk))
	1381	+ return;
	1382	+
	1383	+ /*
	1384	+ * The fast path checks that there are no expired thread or thread
	1385	+ * group timers. If that's so, just return.
	1386	+ */
	1387	+ if (!fastpath_timer_check(tsk))
	1388	+ return;
	1389	+
1370	1390	__run_posix_cpu_timers(tsk);
1371	1391	}
1372		-#endif /* CONFIG_PREEMPT_RT_BASE */
1373	1392
1374	1393	/*
1375	1394	* Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1376	1395	* The tsk->sighand->siglock must be held by the caller.
1377	1396	*/
1378		-void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
	1397	+void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
1379	1398	u64 *newval, u64 *oldval)
1380	1399	{
1381		- u64 now;
1382		- int ret;
	1400	+ u64 now, *nextevt;
1383	1401
1384		- if (WARN_ON_ONCE(clock_idx >= CPUCLOCK_SCHED))
	1402	+ if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED))
1385	1403	return;
1386	1404
1387		- ret = cpu_timer_sample_group(clock_idx, tsk, &now);
	1405	+ nextevt = &tsk->signal->posix_cputimers.bases[clkid].nextevt;
	1406	+ now = cpu_clock_sample_group(clkid, tsk, true);
1388	1407
1389		- if (oldval && ret != -EINVAL) {
	1408	+ if (oldval) {
1390	1409	/*
1391	1410	* We are setting itimer. The *oldval is absolute and we update
1392	1411	* it to be relative, *newval argument is relative and we update
..	..	@@ -1407,19 +1426,11 @@
1407	1426	}
1408	1427
1409	1428	/*
1410		- * Update expiration cache if we are the earliest timer, or eventually
1411		- * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire.
	1429	+ * Update expiration cache if this is the earliest timer. CPUCLOCK_PROF
	1430	+ * expiry cache is also used by RLIMIT_CPU!
1412	1431	*/
1413		- switch (clock_idx) {
1414		- case CPUCLOCK_PROF:
1415		- if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
1416		- tsk->signal->cputime_expires.prof_exp = *newval;
1417		- break;
1418		- case CPUCLOCK_VIRT:
1419		- if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
1420		- tsk->signal->cputime_expires.virt_exp = *newval;
1421		- break;
1422		- }
	1432	+ if (*newval < *nextevt)
	1433	+ *nextevt = *newval;
1423	1434
1424	1435	tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
1425	1436	}
..	..	@@ -1441,6 +1452,7 @@
1441	1452	timer.it_overrun = -1;
1442	1453	error = posix_cpu_timer_create(&timer);
1443	1454	timer.it_process = current;
	1455	+
1444	1456	if (!error) {
1445	1457	static struct itimerspec64 zero_it;
1446	1458	struct restart_block *restart;
..	..	@@ -1456,7 +1468,7 @@
1456	1468	}
1457	1469
1458	1470	while (!signal_pending(current)) {
1459		- if (timer.it.cpu.expires == 0) {
	1471	+ if (!cpu_timer_getexpires(&timer.it.cpu)) {
1460	1472	/*
1461	1473	* Our timer fired and was reset, below
1462	1474	* deletion can not fail.
..	..	@@ -1478,28 +1490,19 @@
1478	1490	/*
1479	1491	* We were interrupted by a signal.
1480	1492	*/
1481		- expires = timer.it.cpu.expires;
	1493	+ expires = cpu_timer_getexpires(&timer.it.cpu);
1482	1494	error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
1483	1495	if (!error) {
1484		- /*
1485		- * Timer is now unarmed, deletion can not fail.
1486		- */
	1496	+ /* Timer is now unarmed, deletion can not fail. */
1487	1497	posix_cpu_timer_del(&timer);
	1498	+ } else {
	1499	+ while (error == TIMER_RETRY) {
	1500	+ posix_cpu_timer_wait_running_nsleep(&timer);
	1501	+ error = posix_cpu_timer_del(&timer);
	1502	+ }
1488	1503	}
	1504	+
1489	1505	spin_unlock_irq(&timer.it_lock);
1490		-
1491		- while (error == TIMER_RETRY) {
1492		-
1493		- cpu_timers_grab_expiry_lock(&timer);
1494		- /*
1495		- * We need to handle case when timer was or is in the
1496		- * middle of firing. In other cases we already freed
1497		- * resources.
1498		- */
1499		- spin_lock_irq(&timer.it_lock);
1500		- error = posix_cpu_timer_del(&timer);
1501		- spin_unlock_irq(&timer.it_lock);
1502		- }
1503	1506
1504	1507	if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
1505	1508	/*
..	..	@@ -1600,26 +1603,27 @@
1600	1603	}
1601	1604
1602	1605	const struct k_clock clock_posix_cpu = {
1603		- .clock_getres = posix_cpu_clock_getres,
1604		- .clock_set = posix_cpu_clock_set,
1605		- .clock_get = posix_cpu_clock_get,
1606		- .timer_create = posix_cpu_timer_create,
1607		- .nsleep = posix_cpu_nsleep,
1608		- .timer_set = posix_cpu_timer_set,
1609		- .timer_del = posix_cpu_timer_del,
1610		- .timer_get = posix_cpu_timer_get,
1611		- .timer_rearm = posix_cpu_timer_rearm,
	1606	+ .clock_getres = posix_cpu_clock_getres,
	1607	+ .clock_set = posix_cpu_clock_set,
	1608	+ .clock_get_timespec = posix_cpu_clock_get,
	1609	+ .timer_create = posix_cpu_timer_create,
	1610	+ .nsleep = posix_cpu_nsleep,
	1611	+ .timer_set = posix_cpu_timer_set,
	1612	+ .timer_del = posix_cpu_timer_del,
	1613	+ .timer_get = posix_cpu_timer_get,
	1614	+ .timer_rearm = posix_cpu_timer_rearm,
	1615	+ .timer_wait_running = posix_cpu_timer_wait_running,
1612	1616	};
1613	1617
1614	1618	const struct k_clock clock_process = {
1615		- .clock_getres = process_cpu_clock_getres,
1616		- .clock_get = process_cpu_clock_get,
1617		- .timer_create = process_cpu_timer_create,
1618		- .nsleep = process_cpu_nsleep,
	1619	+ .clock_getres = process_cpu_clock_getres,
	1620	+ .clock_get_timespec = process_cpu_clock_get,
	1621	+ .timer_create = process_cpu_timer_create,
	1622	+ .nsleep = process_cpu_nsleep,
1619	1623	};
1620	1624
1621	1625	const struct k_clock clock_thread = {
1622		- .clock_getres = thread_cpu_clock_getres,
1623		- .clock_get = thread_cpu_clock_get,
1624		- .timer_create = thread_cpu_timer_create,
	1626	+ .clock_getres = thread_cpu_clock_getres,
	1627	+ .clock_get_timespec = thread_cpu_clock_get,
	1628	+ .timer_create = thread_cpu_timer_create,
1625	1629	};