hc
2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/kernel/time/timer.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
- * linux/kernel/timer.c
- *
  * Kernel internal timers
  *
  * Copyright (C) 1991, 1992 Linus Torvalds
@@ -56,6 +55,11 @@
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/timer.h>
+#undef CREATE_TRACE_POINTS
+#include <trace/hooks/timer.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(hrtimer_expire_entry);
+EXPORT_TRACEPOINT_SYMBOL_GPL(hrtimer_expire_exit);
 
 __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
 
@@ -158,7 +162,8 @@
 
 /*
  * The time start value for each level to select the bucket at enqueue
- * time.
+ * time. We start from the last possible delta of the previous level
+ * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
  */
 #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
 
@@ -198,11 +203,16 @@
 struct timer_base {
        raw_spinlock_t lock;
        struct timer_list *running_timer;
+#ifdef CONFIG_PREEMPT_RT
+       spinlock_t expiry_lock;
+       atomic_t timer_waiters;
+#endif
        unsigned long clk;
        unsigned long next_expiry;
        unsigned int cpu;
+       bool next_expiry_recalc;
        bool is_idle;
-       bool must_forward_clk;
+       bool timers_pending;
        DECLARE_BITMAP(pending_map, WHEEL_SIZE);
        struct hlist_head vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
@@ -247,8 +257,7 @@
 }
 
 int timer_migration_handler(struct ctl_table *table, int write,
-                           void __user *buffer, size_t *lenp,
-                           loff_t *ppos)
+                           void *buffer, size_t *lenp, loff_t *ppos)
 {
        int ret;
 
@@ -486,35 +495,49 @@
  * Helper function to calculate the array index for a given expiry
  * time.
  */
-static inline unsigned calc_index(unsigned expires, unsigned lvl)
+static inline unsigned calc_index(unsigned long expires, unsigned lvl,
+                                 unsigned long *bucket_expiry)
 {
+
+       /*
+        * The timer wheel has to guarantee that a timer does not fire
+        * early. Early expiry can happen due to:
+        * - Timer is armed at the edge of a tick
+        * - Truncation of the expiry time in the outer wheel levels
+        *
+        * Round up with level granularity to prevent this.
+        */
+       trace_android_vh_timer_calc_index(lvl, &expires);
        expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
+       *bucket_expiry = expires << LVL_SHIFT(lvl);
        return LVL_OFFS(lvl) + (expires & LVL_MASK);
 }
 
-static int calc_wheel_index(unsigned long expires, unsigned long clk)
+static int calc_wheel_index(unsigned long expires, unsigned long clk,
+                           unsigned long *bucket_expiry)
 {
        unsigned long delta = expires - clk;
        unsigned int idx;
 
        if (delta < LVL_START(1)) {
-               idx = calc_index(expires, 0);
+               idx = calc_index(expires, 0, bucket_expiry);
        } else if (delta < LVL_START(2)) {
-               idx = calc_index(expires, 1);
+               idx = calc_index(expires, 1, bucket_expiry);
        } else if (delta < LVL_START(3)) {
-               idx = calc_index(expires, 2);
+               idx = calc_index(expires, 2, bucket_expiry);
        } else if (delta < LVL_START(4)) {
-               idx = calc_index(expires, 3);
+               idx = calc_index(expires, 3, bucket_expiry);
        } else if (delta < LVL_START(5)) {
-               idx = calc_index(expires, 4);
+               idx = calc_index(expires, 4, bucket_expiry);
        } else if (delta < LVL_START(6)) {
-               idx = calc_index(expires, 5);
+               idx = calc_index(expires, 5, bucket_expiry);
        } else if (delta < LVL_START(7)) {
-               idx = calc_index(expires, 6);
+               idx = calc_index(expires, 6, bucket_expiry);
        } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
-               idx = calc_index(expires, 7);
+               idx = calc_index(expires, 7, bucket_expiry);
        } else if ((long) delta < 0) {
                idx = clk & LVL_MASK;
+               *bucket_expiry = clk;
        } else {
                /*
                 * Force expire obscene large timeouts to expire at the
@@ -523,30 +546,9 @@
                if (delta >= WHEEL_TIMEOUT_CUTOFF)
                        expires = clk + WHEEL_TIMEOUT_MAX;
 
-               idx = calc_index(expires, LVL_DEPTH - 1);
+               idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
        }
        return idx;
-}
-
-/*
- * Enqueue the timer into the hash bucket, mark it pending in
- * the bitmap and store the index in the timer flags.
- */
-static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
-                         unsigned int idx)
-{
-       hlist_add_head(&timer->entry, base->vectors + idx);
-       __set_bit(idx, base->pending_map);
-       timer_set_idx(timer, idx);
-}
-
-static void
-__internal_add_timer(struct timer_base *base, struct timer_list *timer)
-{
-       unsigned int idx;
-
-       idx = calc_wheel_index(timer->expires, base->clk);
-       enqueue_timer(base, timer, idx);
 }
 
 static void
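A worked example of the arithmetic the reworked calc_index() performs may be useful here. The sketch below is standalone user-space C, not part of the patch; it assumes the in-tree wheel constants (LVL_CLK_SHIFT = 3, LVL_BITS = 6, hence LVL_SIZE = 64) and mirrors only the rounding and bucket_expiry bookkeeping introduced above.

#include <stdio.h>

#define LVL_CLK_SHIFT   3
#define LVL_BITS        6
#define LVL_SIZE        (1UL << LVL_BITS)
#define LVL_MASK        (LVL_SIZE - 1)
#define LVL_SHIFT(n)    ((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)     (1UL << LVL_SHIFT(n))
#define LVL_OFFS(n)     ((n) * LVL_SIZE)

/* Same rounding as the patched calc_index(): a timer may fire late, never early. */
static unsigned int calc_index(unsigned long expires, unsigned int lvl,
                               unsigned long *bucket_expiry)
{
        expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
        *bucket_expiry = expires << LVL_SHIFT(lvl);
        return LVL_OFFS(lvl) + (expires & LVL_MASK);
}

int main(void)
{
        unsigned long bucket_expiry;
        unsigned int idx;

        /* A timer armed for jiffy 1000 that lands in level 1 (granularity 8 jiffies). */
        idx = calc_index(1000, 1, &bucket_expiry);

        /* Prints idx=126 bucket_expiry=1008: the bucket fires at jiffy 1008, never before 1000. */
        printf("idx=%u bucket_expiry=%lu\n", idx, bucket_expiry);
        return 0;
}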
@@ -570,39 +572,54 @@
         * timer is not deferrable. If the other CPU is on the way to idle
         * then it can't set base->is_idle as we hold the base lock:
         */
-       if (!base->is_idle)
-               return;
-
-       /* Check whether this is the new first expiring timer: */
-       if (time_after_eq(timer->expires, base->next_expiry))
-               return;
-
-       /*
-        * Set the next expiry time and kick the CPU so it can reevaluate the
-        * wheel:
-        */
-       if (time_before(timer->expires, base->clk)) {
-               /*
-                * Prevent from forward_timer_base() moving the base->clk
-                * backward
-                */
-               base->next_expiry = base->clk;
-       } else {
-               base->next_expiry = timer->expires;
-       }
-       wake_up_nohz_cpu(base->cpu);
+       if (base->is_idle)
+               wake_up_nohz_cpu(base->cpu);
 }
 
-static void
-internal_add_timer(struct timer_base *base, struct timer_list *timer)
+/*
+ * Enqueue the timer into the hash bucket, mark it pending in
+ * the bitmap, store the index in the timer flags then wake up
+ * the target CPU if needed.
+ */
+static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
+                         unsigned int idx, unsigned long bucket_expiry)
 {
-       __internal_add_timer(base, timer);
-       trigger_dyntick_cpu(base, timer);
+
+       hlist_add_head(&timer->entry, base->vectors + idx);
+       __set_bit(idx, base->pending_map);
+       timer_set_idx(timer, idx);
+
+       trace_timer_start(timer, timer->expires, timer->flags);
+
+       /*
+        * Check whether this is the new first expiring timer. The
+        * effective expiry time of the timer is required here
+        * (bucket_expiry) instead of timer->expires.
+        */
+       if (time_before(bucket_expiry, base->next_expiry)) {
+               /*
+                * Set the next expiry time and kick the CPU so it
+                * can reevaluate the wheel:
+                */
+               base->next_expiry = bucket_expiry;
+               base->timers_pending = true;
+               base->next_expiry_recalc = false;
+               trigger_dyntick_cpu(base, timer);
+       }
+}
+
+static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
+{
+       unsigned long bucket_expiry;
+       unsigned int idx;
+
+       idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
+       enqueue_timer(base, timer, idx, bucket_expiry);
 }
 
 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
 
-static struct debug_obj_descr timer_debug_descr;
+static const struct debug_obj_descr timer_debug_descr;
 
 static void *timer_debug_hint(void *addr)
 {
@@ -657,7 +674,7 @@
 
        case ODEBUG_STATE_ACTIVE:
                WARN_ON(1);
-
+               fallthrough;
        default:
                return false;
        }
@@ -698,7 +715,7 @@
        }
 }
 
-static struct debug_obj_descr timer_debug_descr = {
+static const struct debug_obj_descr timer_debug_descr = {
        .name = "timer_list",
        .debug_hint = timer_debug_hint,
        .is_static_object = timer_is_static_object,
@@ -721,11 +738,6 @@
 static inline void debug_timer_deactivate(struct timer_list *timer)
 {
        debug_object_deactivate(timer, &timer_debug_descr);
-}
-
-static inline void debug_timer_free(struct timer_list *timer)
-{
-       debug_object_free(timer, &timer_debug_descr);
 }
 
 static inline void debug_timer_assert_init(struct timer_list *timer)
@@ -767,13 +779,6 @@
        trace_timer_init(timer);
 }
 
-static inline void
-debug_activate(struct timer_list *timer, unsigned long expires)
-{
-       debug_timer_activate(timer);
-       trace_timer_start(timer, expires, timer->flags);
-}
-
 static inline void debug_deactivate(struct timer_list *timer)
 {
        debug_timer_deactivate(timer);
@@ -792,6 +797,8 @@
 {
        timer->entry.pprev = NULL;
        timer->function = func;
+       if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
+               flags &= TIMER_INIT_FLAGS;
        timer->flags = flags | raw_smp_processor_id();
        lockdep_init_map(&timer->lockdep_map, name, key, 0);
 }
@@ -837,8 +844,10 @@
        if (!timer_pending(timer))
                return 0;
 
-       if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
+       if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
                __clear_bit(idx, base->pending_map);
+               base->next_expiry_recalc = true;
+       }
 
        detach_timer(timer, clear_pending);
        return 1;
@@ -888,20 +897,14 @@
 
 static inline void forward_timer_base(struct timer_base *base)
 {
-#ifdef CONFIG_NO_HZ_COMMON
-       unsigned long jnow;
+       unsigned long jnow = READ_ONCE(jiffies);
 
        /*
-        * We only forward the base when we are idle or have just come out of
-        * idle (must_forward_clk logic), and have a delta between base clock
-        * and jiffies. In the common case, run_timers will take care of it.
+        * No need to forward if we are close enough below jiffies.
+        * Also while executing timers, base->clk is 1 offset ahead
+        * of jiffies to avoid endless requeuing to current jiffies.
         */
-       if (likely(!base->must_forward_clk))
-               return;
-
-       jnow = READ_ONCE(jiffies);
-       base->must_forward_clk = base->is_idle;
-       if ((long)(jnow - base->clk) < 2)
+       if ((long)(jnow - base->clk) < 1)
                return;
 
        /*
@@ -915,7 +918,6 @@
                        return;
                base->clk = base->next_expiry;
        }
-#endif
 }
 
 
@@ -958,13 +960,14 @@
 
 #define MOD_TIMER_PENDING_ONLY 0x01
 #define MOD_TIMER_REDUCE 0x02
+#define MOD_TIMER_NOTPENDING 0x04
 
 static inline int
 __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
 {
+       unsigned long clk = 0, flags, bucket_expiry;
        struct timer_base *base, *new_base;
        unsigned int idx = UINT_MAX;
-       unsigned long clk = 0, flags;
        int ret = 0;
 
        BUG_ON(!timer->function);
@@ -974,7 +977,7 @@
         * the timer is re-modified to have the same timeout or ends up in the
         * same array bucket then just return:
         */
-       if (timer_pending(timer)) {
+       if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
                /*
                 * The downside of this optimization is that it can result in
                 * larger granularity than you would get from adding a new
@@ -1003,7 +1006,7 @@
                }
 
                clk = base->clk;
-               idx = calc_wheel_index(expires, clk);
+               idx = calc_wheel_index(expires, clk, &bucket_expiry);
 
                /*
                 * Retrieve and compare the array index of the pending
@@ -1050,22 +1053,19 @@
                }
        }
 
-       debug_activate(timer, expires);
+       debug_timer_activate(timer);
 
        timer->expires = expires;
        /*
         * If 'idx' was calculated above and the base time did not advance
         * between calculating 'idx' and possibly switching the base, only
-        * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
-        * we need to (re)calculate the wheel index via
-        * internal_add_timer().
+        * enqueue_timer() is required. Otherwise we need to (re)calculate
+        * the wheel index via internal_add_timer().
         */
-       if (idx != UINT_MAX && clk == base->clk) {
-               enqueue_timer(base, timer, idx);
-               trigger_dyntick_cpu(base, timer);
-       } else {
+       if (idx != UINT_MAX && clk == base->clk)
+               enqueue_timer(base, timer, idx, bucket_expiry);
+       else
                internal_add_timer(base, timer);
-       }
 
 out_unlock:
        raw_spin_unlock_irqrestore(&base->lock, flags);
@@ -1147,7 +1147,7 @@
 void add_timer(struct timer_list *timer)
 {
        BUG_ON(timer_pending(timer));
-       mod_timer(timer, timer->expires);
+       __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
 }
 EXPORT_SYMBOL(add_timer);
 
@@ -1184,7 +1184,7 @@
        }
        forward_timer_base(base);
 
-       debug_activate(timer, timer->expires);
+       debug_timer_activate(timer);
        internal_add_timer(base, timer);
        raw_spin_unlock_irqrestore(&base->lock, flags);
 }
@@ -1245,7 +1245,80 @@
 }
 EXPORT_SYMBOL(try_to_del_timer_sync);
 
-#ifdef CONFIG_SMP
+#ifdef CONFIG_PREEMPT_RT
+static __init void timer_base_init_expiry_lock(struct timer_base *base)
+{
+       spin_lock_init(&base->expiry_lock);
+}
+
+static inline void timer_base_lock_expiry(struct timer_base *base)
+{
+       spin_lock(&base->expiry_lock);
+}
+
+static inline void timer_base_unlock_expiry(struct timer_base *base)
+{
+       spin_unlock(&base->expiry_lock);
+}
+
+/*
+ * The counterpart to del_timer_wait_running().
+ *
+ * If there is a waiter for base->expiry_lock, then it was waiting for the
+ * timer callback to finish. Drop expiry_lock and reacquire it. That allows
+ * the waiter to acquire the lock and make progress.
+ */
+static void timer_sync_wait_running(struct timer_base *base)
+{
+       if (atomic_read(&base->timer_waiters)) {
+               raw_spin_unlock_irq(&base->lock);
+               spin_unlock(&base->expiry_lock);
+               spin_lock(&base->expiry_lock);
+               raw_spin_lock_irq(&base->lock);
+       }
+}
+
+/*
+ * This function is called on PREEMPT_RT kernels when the fast path
+ * deletion of a timer failed because the timer callback function was
+ * running.
+ *
+ * This prevents priority inversion, if the softirq thread on a remote CPU
+ * got preempted, and it prevents a live lock when the task which tries to
+ * delete a timer preempted the softirq thread running the timer callback
+ * function.
+ */
+static void del_timer_wait_running(struct timer_list *timer)
+{
+       u32 tf;
+
+       tf = READ_ONCE(timer->flags);
+       if (!(tf & TIMER_MIGRATING)) {
+               struct timer_base *base = get_timer_base(tf);
+
+               /*
+                * Mark the base as contended and grab the expiry lock,
+                * which is held by the softirq across the timer
+                * callback. Drop the lock immediately so the softirq can
+                * expire the next timer. In theory the timer could already
+                * be running again, but that's more than unlikely and just
+                * causes another wait loop.
+                */
+               atomic_inc(&base->timer_waiters);
+               spin_lock_bh(&base->expiry_lock);
+               atomic_dec(&base->timer_waiters);
+               spin_unlock_bh(&base->expiry_lock);
+       }
+}
+#else
+static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
+static inline void timer_base_lock_expiry(struct timer_base *base) { }
+static inline void timer_base_unlock_expiry(struct timer_base *base) { }
+static inline void timer_sync_wait_running(struct timer_base *base) { }
+static inline void del_timer_wait_running(struct timer_list *timer) { }
+#endif
+
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
 /**
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
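The caller-visible guarantee these PREEMPT_RT helpers preserve is the usual del_timer_sync() contract: once it returns, the callback is no longer running anywhere. A minimal, hypothetical driver-side sketch (example_dev, its watchdog timer and its stats pointer are invented for illustration):

#include <linux/timer.h>
#include <linux/slab.h>

/* Hypothetical device structure, for illustration only. */
struct example_dev {
        struct timer_list watchdog;
        void *stats;
};

static void example_teardown(struct example_dev *dev)
{
        /*
         * del_timer_sync() returns only after the callback has finished on
         * every CPU, so freeing data the callback touches is safe here. On
         * PREEMPT_RT the expiry_lock handoff above keeps this wait free of
         * priority inversion against the softirq thread.
         */
        del_timer_sync(&dev->watchdog);
        kfree(dev->stats);
}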
@@ -1284,6 +1357,8 @@
  */
 int del_timer_sync(struct timer_list *timer)
 {
+       int ret;
+
 #ifdef CONFIG_LOCKDEP
        unsigned long flags;
 
@@ -1301,17 +1376,24 @@
         * could lead to deadlock.
         */
        WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
-       for (;;) {
-               int ret = try_to_del_timer_sync(timer);
-               if (ret >= 0)
-                       return ret;
-               cpu_relax();
-       }
+
+       do {
+               ret = try_to_del_timer_sync(timer);
+
+               if (unlikely(ret < 0)) {
+                       del_timer_wait_running(timer);
+                       cpu_relax();
+               }
+       } while (ret < 0);
+
+       return ret;
 }
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
-static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *))
+static void call_timer_fn(struct timer_list *timer,
+                         void (*fn)(struct timer_list *),
+                         unsigned long baseclk)
 {
        int count = preempt_count();
 
@@ -1334,14 +1416,14 @@
         */
        lock_map_acquire(&lockdep_map);
 
-       trace_timer_expire_entry(timer);
+       trace_timer_expire_entry(timer, baseclk);
        fn(timer);
        trace_timer_expire_exit(timer);
 
        lock_map_release(&lockdep_map);
 
        if (count != preempt_count()) {
-               WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
+               WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
                          fn, count, preempt_count());
                /*
                 * Restore the preempt count. That gives us a decent
@@ -1355,6 +1437,13 @@
 
 static void expire_timers(struct timer_base *base, struct hlist_head *head)
 {
+       /*
+        * This value is required only for tracing. base->clk was
+        * incremented directly before expire_timers was called. But expiry
+        * is related to the old base->clk value.
+        */
+       unsigned long baseclk = base->clk - 1;
+
        while (!hlist_empty(head)) {
                struct timer_list *timer;
                void (*fn)(struct timer_list *);
@@ -1368,20 +1457,23 @@
 
                if (timer->flags & TIMER_IRQSAFE) {
                        raw_spin_unlock(&base->lock);
-                       call_timer_fn(timer, fn);
+                       call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock(&base->lock);
+                       base->running_timer = NULL;
                } else {
                        raw_spin_unlock_irq(&base->lock);
-                       call_timer_fn(timer, fn);
+                       call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock_irq(&base->lock);
+                       base->running_timer = NULL;
+                       timer_sync_wait_running(base);
                }
        }
 }
 
-static int __collect_expired_timers(struct timer_base *base,
-                                   struct hlist_head *heads)
+static int collect_expired_timers(struct timer_base *base,
+                                 struct hlist_head *heads)
 {
-       unsigned long clk = base->clk;
+       unsigned long clk = base->clk = base->next_expiry;
        struct hlist_head *vec;
        int i, levels = 0;
        unsigned int idx;
@@ -1403,7 +1495,6 @@
        return levels;
 }
 
-#ifdef CONFIG_NO_HZ_COMMON
 /*
  * Find the next pending bucket of a level. Search from level start (@offset)
  * + @clk upwards and if nothing there, search from start of the level
@@ -1436,6 +1527,7 @@
        clk = base->clk;
        for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
                int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
+               unsigned long lvl_clk = clk & LVL_CLK_MASK;
 
                if (pos >= 0) {
                        unsigned long tmp = clk + (unsigned long) pos;
@@ -1443,6 +1535,13 @@
                        tmp <<= LVL_SHIFT(lvl);
                        if (time_before(tmp, next))
                                next = tmp;
+
+                       /*
+                        * If the next expiration happens before we reach
+                        * the next level, no need to check further.
+                        */
+                       if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
+                               break;
                }
                /*
                 * Clock for the next level. If the current level clock lower
@@ -1480,13 +1579,18 @@
                 * So the simple check whether the lower bits of the current
                 * level are 0 or not is sufficient for all cases.
                 */
-               adj = clk & LVL_CLK_MASK ? 1 : 0;
+               adj = lvl_clk ? 1 : 0;
                clk >>= LVL_CLK_SHIFT;
                clk += adj;
        }
+
+       base->next_expiry_recalc = false;
+       base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
+
        return next;
 }
 
+#ifdef CONFIG_NO_HZ_COMMON
 /*
  * Check, if the next hrtimer event is before the next timer wheel
  * event:
@@ -1533,7 +1637,6 @@
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
        u64 expires = KTIME_MAX;
        unsigned long nextevt;
-       bool is_max_delta;
 
        /*
         * Pretend that there is no timer pending if the cpu is offline.
@@ -1543,9 +1646,10 @@
                return expires;
 
        raw_spin_lock(&base->lock);
-       nextevt = __next_timer_interrupt(base);
-       is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
-       base->next_expiry = nextevt;
+       if (base->next_expiry_recalc)
+               base->next_expiry = __next_timer_interrupt(base);
+       nextevt = base->next_expiry;
+
        /*
         * We have a fresh next event. Check whether we can forward the
         * base. We can only do that when @basej is past base->clk
@@ -1562,7 +1666,7 @@
                expires = basem;
                base->is_idle = false;
        } else {
-               if (!is_max_delta)
+               if (base->timers_pending)
                        expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
                /*
                 * If we expect to sleep more than a tick, mark the base idle.
@@ -1571,10 +1675,8 @@
                 * logic is only maintained for the BASE_STD base, deferrable
                 * timers may still see large granularity skew (by design).
                 */
-               if ((expires - basem) > TICK_NSEC) {
-                       base->must_forward_clk = true;
+               if ((expires - basem) > TICK_NSEC)
                        base->is_idle = true;
-               }
        }
        raw_spin_unlock(&base->lock);
 
@@ -1598,42 +1700,6 @@
         */
        base->is_idle = false;
 }
-
-static int collect_expired_timers(struct timer_base *base,
-                                 struct hlist_head *heads)
-{
-       unsigned long now = READ_ONCE(jiffies);
-
-       /*
-        * NOHZ optimization. After a long idle sleep we need to forward the
-        * base to current jiffies. Avoid a loop by searching the bitfield for
-        * the next expiring timer.
-        */
-       if ((long)(now - base->clk) > 2) {
-               unsigned long next = __next_timer_interrupt(base);
-
-               /*
-                * If the next timer is ahead of time forward to current
-                * jiffies, otherwise forward to the next expiry time:
-                */
-               if (time_after(next, now)) {
-                       /*
-                        * The call site will increment base->clk and then
-                        * terminate the expiry loop immediately.
-                        */
-                       base->clk = now;
-                       return 0;
-               }
-               base->clk = next;
-       }
-       return __collect_expired_timers(base, heads);
-}
-#else
-static inline int collect_expired_timers(struct timer_base *base,
-                                        struct hlist_head *heads)
-{
-       return __collect_expired_timers(base, heads);
-}
 #endif
 
 /*
@@ -1644,17 +1710,19 @@
 {
        struct task_struct *p = current;
 
+       PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0);
+
        /* Note: this timer irq context must be accounted for as well. */
        account_process_tick(p, user_tick);
        run_local_timers();
-       rcu_check_callbacks(user_tick);
+       rcu_sched_clock_irq(user_tick);
 #ifdef CONFIG_IRQ_WORK
        if (in_irq())
                irq_work_tick();
 #endif
        scheduler_tick();
        if (IS_ENABLED(CONFIG_POSIX_TIMERS))
-               run_posix_cpu_timers(p);
+               run_posix_cpu_timers();
 }
 
 /**
@@ -1666,37 +1734,32 @@
        struct hlist_head heads[LVL_DEPTH];
        int levels;
 
-       if (!time_after_eq(jiffies, base->clk))
+       if (time_before(jiffies, base->next_expiry))
                return;
 
+       timer_base_lock_expiry(base);
        raw_spin_lock_irq(&base->lock);
 
-       /*
-        * timer_base::must_forward_clk must be cleared before running
-        * timers so that any timer functions that call mod_timer() will
-        * not try to forward the base. Idle tracking / clock forwarding
-        * logic is only used with BASE_STD timers.
-        *
-        * The must_forward_clk flag is cleared unconditionally also for
-        * the deferrable base. The deferrable base is not affected by idle
-        * tracking and never forwarded, so clearing the flag is a NOOP.
-        *
-        * The fact that the deferrable base is never forwarded can cause
-        * large variations in granularity for deferrable timers, but they
-        * can be deferred for long periods due to idle anyway.
-        */
-       base->must_forward_clk = false;
-
-       while (time_after_eq(jiffies, base->clk)) {
-
+       while (time_after_eq(jiffies, base->clk) &&
+              time_after_eq(jiffies, base->next_expiry)) {
                levels = collect_expired_timers(base, heads);
+               /*
+                * The two possible reasons for not finding any expired
+                * timer at this clk are that all matching timers have been
+                * dequeued or no timer has been queued since
+                * base::next_expiry was set to base::clk +
+                * NEXT_TIMER_MAX_DELTA.
+                */
+               WARN_ON_ONCE(!levels && !base->next_expiry_recalc
+                            && base->timers_pending);
                base->clk++;
+               base->next_expiry = __next_timer_interrupt(base);
 
                while (levels--)
                        expire_timers(base, heads + levels);
        }
-       base->running_timer = NULL;
        raw_spin_unlock_irq(&base->lock);
+       timer_base_unlock_expiry(base);
 }
 
 /*
@@ -1720,12 +1783,12 @@
 
        hrtimer_run_queues();
        /* Raise the softirq only if required. */
-       if (time_before(jiffies, base->clk)) {
+       if (time_before(jiffies, base->next_expiry)) {
                if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
                        return;
                /* CPU is awake, so check the deferrable base. */
                base++;
-               if (time_before(jiffies, base->clk))
+               if (time_before(jiffies, base->next_expiry))
                        return;
        }
        raise_softirq(TIMER_SOFTIRQ);
@@ -1751,21 +1814,23 @@
  * schedule_timeout - sleep until timeout
  * @timeout: timeout value in jiffies
  *
- * Make the current task sleep until @timeout jiffies have
- * elapsed. The routine will return immediately unless
- * the current task state has been set (see set_current_state()).
+ * Make the current task sleep until @timeout jiffies have elapsed.
+ * The function behavior depends on the current task state
+ * (see also set_current_state() description):
  *
- * You can set the task state as follows -
+ * %TASK_RUNNING - the scheduler is called, but the task does not sleep
+ * at all. That happens because sched_submit_work() does nothing for
+ * tasks in %TASK_RUNNING state.
  *
  * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
  * pass before the routine returns unless the current task is explicitly
- * woken up, (e.g. by wake_up_process())".
+ * woken up, (e.g. by wake_up_process()).
  *
  * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
  * delivered to the current task or the current task is explicitly woken
 * up.
 *
- * The current task state is guaranteed to be TASK_RUNNING when this
+ * The current task state is guaranteed to be %TASK_RUNNING when this
 * routine returns.
 *
 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
@@ -1773,7 +1838,7 @@
 * value will be %MAX_SCHEDULE_TIMEOUT.
 *
 * Returns 0 when the timer has expired otherwise the remaining time in
- * jiffies will be returned.  In all cases the return value is guaranteed
+ * jiffies will be returned. In all cases the return value is guaranteed
 * to be non-negative.
 */
 signed long __sched schedule_timeout(signed long timeout)
@@ -1814,7 +1879,7 @@
 
        timer.task = current;
        timer_setup_on_stack(&timer.timer, process_timeout, 0);
-       __mod_timer(&timer.timer, expire, 0);
+       __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
        schedule();
        del_singleshot_timer_sync(&timer.timer);
 
@@ -1887,8 +1952,8 @@
                base = per_cpu_ptr(&timer_bases[b], cpu);
                base->clk = jiffies;
                base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+               base->timers_pending = false;
                base->is_idle = false;
-               base->must_forward_clk = true;
        }
        return 0;
 }
@@ -1941,6 +2006,8 @@
                base->cpu = cpu;
                raw_spin_lock_init(&base->lock);
                base->clk = jiffies;
+               base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+               timer_base_init_expiry_lock(base);
        }
 }
 
@@ -1955,6 +2022,7 @@
 void __init init_timers(void)
 {
        init_timer_cpus();
+       posix_cputimers_init_work();
        open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
 }
 
@@ -1988,6 +2056,32 @@
 EXPORT_SYMBOL(msleep_interruptible);
 
 /**
+ * usleep_range_state - Sleep for an approximate time in a given state
+ * @min: Minimum time in usecs to sleep
+ * @max: Maximum time in usecs to sleep
+ * @state: State the current task will be in while sleeping
+ *
+ * In non-atomic context where the exact wakeup time is flexible, use
+ * usleep_range_state() instead of udelay(). The sleep improves responsiveness
+ * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
+ * power usage by allowing hrtimers to take advantage of an already-
+ * scheduled interrupt instead of scheduling a new one just for this sleep.
+ */
+void __sched usleep_range_state(unsigned long min, unsigned long max,
+                               unsigned int state)
+{
+       ktime_t exp = ktime_add_us(ktime_get(), min);
+       u64 delta = (u64)(max - min) * NSEC_PER_USEC;
+
+       for (;;) {
+               __set_current_state(state);
+               /* Do not return before the requested sleep time has elapsed */
+               if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
+                       break;
+       }
+}
+
+/**
  * usleep_range - Sleep for an approximate time
  * @min: Minimum time in usecs to sleep
  * @max: Maximum time in usecs to sleep
@@ -2000,14 +2094,6 @@
  */
 void __sched usleep_range(unsigned long min, unsigned long max)
 {
-       ktime_t exp = ktime_add_us(ktime_get(), min);
-       u64 delta = (u64)(max - min) * NSEC_PER_USEC;
-
-       for (;;) {
-               __set_current_state(TASK_UNINTERRUPTIBLE);
-               /* Do not return before the requested sleep time has elapsed */
-               if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
-                       break;
-       }
+       usleep_range_state(min, max, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(usleep_range);
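A small, hypothetical caller showing the intended use of usleep_range(): readiness polling where the wakeup can be coalesced with an already scheduled hrtimer event. The register layout, ready bit and helper name are invented for illustration.

#include <linux/delay.h>
#include <linux/io.h>
#include <linux/bits.h>
#include <linux/errno.h>

/* Poll a (hypothetical) ready bit, sleeping 100-200 us between reads. */
static int example_wait_ready(void __iomem *reg)
{
        int i;

        for (i = 0; i < 50; i++) {
                if (readl(reg) & BIT(0))
                        return 0;
                usleep_range(100, 200);
        }
        return -ETIMEDOUT;
}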