2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/kernel/time/timer.c
....@@ -1,6 +1,5 @@
1
+// SPDX-License-Identifier: GPL-2.0
12 /*
2
- * linux/kernel/timer.c
3
- *
43 * Kernel internal timers
54 *
65 * Copyright (C) 1991, 1992 Linus Torvalds
....@@ -56,6 +55,11 @@
5655
5756 #define CREATE_TRACE_POINTS
5857 #include <trace/events/timer.h>
58
+#undef CREATE_TRACE_POINTS
59
+#include <trace/hooks/timer.h>
60
+
61
+EXPORT_TRACEPOINT_SYMBOL_GPL(hrtimer_expire_entry);
62
+EXPORT_TRACEPOINT_SYMBOL_GPL(hrtimer_expire_exit);
5963
6064 __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
6165
....@@ -158,7 +162,8 @@
158162
159163 /*
160164 * The time start value for each level to select the bucket at enqueue
161
- * time.
165
+ * time. We start from the last possible delta of the previous level
166
+ * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
162167 */
163168 #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
164169
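The hunk above, together with the reworked calc_index() further down, decides which wheel level a timer lands in and rounds its expiry up to that level's granularity so the timer can never fire early. A standalone illustration of that arithmetic follows (not from this patch; it reuses the upstream constants LVL_CLK_SHIFT = 3 and LVL_BITS = 6, and the sample clk/expires values are made up):

#include <stdio.h>

#define LVL_CLK_SHIFT   3
#define LVL_SHIFT(n)    ((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)     (1UL << LVL_SHIFT(n))
#define LVL_BITS        6
#define LVL_SIZE        (1UL << LVL_BITS)
#define LVL_MASK        (LVL_SIZE - 1)
#define LVL_OFFS(n)     ((n) * LVL_SIZE)
#define LVL_START(n)    ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))

int main(void)
{
        unsigned long clk = 1000, expires = clk + 100;  /* delta = 100 jiffies */
        unsigned int lvl = 1;   /* LVL_START(1) = 63 <= 100 < LVL_START(2) = 504 */
        unsigned long rounded = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
        unsigned long bucket_expiry = rounded << LVL_SHIFT(lvl);
        unsigned long idx = LVL_OFFS(lvl) + (rounded & LVL_MASK);

        /* Rounded up to the 8-jiffy granularity of level 1: never fires early. */
        printf("expires=%lu bucket_expiry=%lu idx=%lu\n", expires, bucket_expiry, idx);
        return 0;
}

With these inputs the sketch prints bucket_expiry=1104 and idx=74: the timer is queued at most LVL_GRAN(1) = 8 jiffies late and never before the requested expiry, which is exactly the guarantee the new bucket_expiry plumbing exposes to enqueue_timer().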
....@@ -198,12 +203,16 @@
198203 struct timer_base {
199204 raw_spinlock_t lock;
200205 struct timer_list *running_timer;
206
+#ifdef CONFIG_PREEMPT_RT
201207 spinlock_t expiry_lock;
208
+ atomic_t timer_waiters;
209
+#endif
202210 unsigned long clk;
203211 unsigned long next_expiry;
204212 unsigned int cpu;
213
+ bool next_expiry_recalc;
205214 bool is_idle;
206
- bool must_forward_clk;
215
+ bool timers_pending;
207216 DECLARE_BITMAP(pending_map, WHEEL_SIZE);
208217 struct hlist_head vectors[WHEEL_SIZE];
209218 } ____cacheline_aligned;
....@@ -215,7 +224,8 @@
215224 static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
216225 static DEFINE_MUTEX(timer_keys_mutex);
217226
218
-static struct swork_event timer_update_swork;
227
+static void timer_update_keys(struct work_struct *work);
228
+static DECLARE_WORK(timer_update_work, timer_update_keys);
219229
220230 #ifdef CONFIG_SMP
221231 unsigned int sysctl_timer_migration = 1;
....@@ -233,7 +243,7 @@
233243 static inline void timers_update_migration(void) { }
234244 #endif /* !CONFIG_SMP */
235245
236
-static void timer_update_keys(struct swork_event *event)
246
+static void timer_update_keys(struct work_struct *work)
237247 {
238248 mutex_lock(&timer_keys_mutex);
239249 timers_update_migration();
....@@ -243,20 +253,11 @@
243253
244254 void timers_update_nohz(void)
245255 {
246
- swork_queue(&timer_update_swork);
256
+ schedule_work(&timer_update_work);
247257 }
248
-
249
-static __init int hrtimer_init_thread(void)
250
-{
251
- WARN_ON(swork_get());
252
- INIT_SWORK(&timer_update_swork, timer_update_keys);
253
- return 0;
254
-}
255
-early_initcall(hrtimer_init_thread);
256258
257259 int timer_migration_handler(struct ctl_table *table, int write,
258
- void __user *buffer, size_t *lenp,
259
- loff_t *ppos)
260
+ void *buffer, size_t *lenp, loff_t *ppos)
260261 {
261262 int ret;
262263
....@@ -494,35 +495,49 @@
494495 * Helper function to calculate the array index for a given expiry
495496 * time.
496497 */
497
-static inline unsigned calc_index(unsigned expires, unsigned lvl)
498
+static inline unsigned calc_index(unsigned long expires, unsigned lvl,
499
+ unsigned long *bucket_expiry)
498500 {
501
+
502
+ /*
503
+ * The timer wheel has to guarantee that a timer does not fire
504
+ * early. Early expiry can happen due to:
505
+ * - Timer is armed at the edge of a tick
506
+ * - Truncation of the expiry time in the outer wheel levels
507
+ *
508
+ * Round up with level granularity to prevent this.
509
+ */
510
+ trace_android_vh_timer_calc_index(lvl, &expires);
499511 expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
512
+ *bucket_expiry = expires << LVL_SHIFT(lvl);
500513 return LVL_OFFS(lvl) + (expires & LVL_MASK);
501514 }
502515
503
-static int calc_wheel_index(unsigned long expires, unsigned long clk)
516
+static int calc_wheel_index(unsigned long expires, unsigned long clk,
517
+ unsigned long *bucket_expiry)
504518 {
505519 unsigned long delta = expires - clk;
506520 unsigned int idx;
507521
508522 if (delta < LVL_START(1)) {
509
- idx = calc_index(expires, 0);
523
+ idx = calc_index(expires, 0, bucket_expiry);
510524 } else if (delta < LVL_START(2)) {
511
- idx = calc_index(expires, 1);
525
+ idx = calc_index(expires, 1, bucket_expiry);
512526 } else if (delta < LVL_START(3)) {
513
- idx = calc_index(expires, 2);
527
+ idx = calc_index(expires, 2, bucket_expiry);
514528 } else if (delta < LVL_START(4)) {
515
- idx = calc_index(expires, 3);
529
+ idx = calc_index(expires, 3, bucket_expiry);
516530 } else if (delta < LVL_START(5)) {
517
- idx = calc_index(expires, 4);
531
+ idx = calc_index(expires, 4, bucket_expiry);
518532 } else if (delta < LVL_START(6)) {
519
- idx = calc_index(expires, 5);
533
+ idx = calc_index(expires, 5, bucket_expiry);
520534 } else if (delta < LVL_START(7)) {
521
- idx = calc_index(expires, 6);
535
+ idx = calc_index(expires, 6, bucket_expiry);
522536 } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
523
- idx = calc_index(expires, 7);
537
+ idx = calc_index(expires, 7, bucket_expiry);
524538 } else if ((long) delta < 0) {
525539 idx = clk & LVL_MASK;
540
+ *bucket_expiry = clk;
526541 } else {
527542 /*
528543 * Force expire obscene large timeouts to expire at the
....@@ -531,30 +546,9 @@
531546 if (delta >= WHEEL_TIMEOUT_CUTOFF)
532547 expires = clk + WHEEL_TIMEOUT_MAX;
533548
534
- idx = calc_index(expires, LVL_DEPTH - 1);
549
+ idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
535550 }
536551 return idx;
537
-}
538
-
539
-/*
540
- * Enqueue the timer into the hash bucket, mark it pending in
541
- * the bitmap and store the index in the timer flags.
542
- */
543
-static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
544
- unsigned int idx)
545
-{
546
- hlist_add_head(&timer->entry, base->vectors + idx);
547
- __set_bit(idx, base->pending_map);
548
- timer_set_idx(timer, idx);
549
-}
550
-
551
-static void
552
-__internal_add_timer(struct timer_base *base, struct timer_list *timer)
553
-{
554
- unsigned int idx;
555
-
556
- idx = calc_wheel_index(timer->expires, base->clk);
557
- enqueue_timer(base, timer, idx);
558552 }
559553
560554 static void
....@@ -578,39 +572,54 @@
578572 * timer is not deferrable. If the other CPU is on the way to idle
579573 * then it can't set base->is_idle as we hold the base lock:
580574 */
581
- if (!base->is_idle)
582
- return;
583
-
584
- /* Check whether this is the new first expiring timer: */
585
- if (time_after_eq(timer->expires, base->next_expiry))
586
- return;
587
-
588
- /*
589
- * Set the next expiry time and kick the CPU so it can reevaluate the
590
- * wheel:
591
- */
592
- if (time_before(timer->expires, base->clk)) {
593
- /*
594
- * Prevent from forward_timer_base() moving the base->clk
595
- * backward
596
- */
597
- base->next_expiry = base->clk;
598
- } else {
599
- base->next_expiry = timer->expires;
600
- }
601
- wake_up_nohz_cpu(base->cpu);
575
+ if (base->is_idle)
576
+ wake_up_nohz_cpu(base->cpu);
602577 }
603578
604
-static void
605
-internal_add_timer(struct timer_base *base, struct timer_list *timer)
579
+/*
580
+ * Enqueue the timer into the hash bucket, mark it pending in
581
+ * the bitmap, store the index in the timer flags then wake up
582
+ * the target CPU if needed.
583
+ */
584
+static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
585
+ unsigned int idx, unsigned long bucket_expiry)
606586 {
607
- __internal_add_timer(base, timer);
608
- trigger_dyntick_cpu(base, timer);
587
+
588
+ hlist_add_head(&timer->entry, base->vectors + idx);
589
+ __set_bit(idx, base->pending_map);
590
+ timer_set_idx(timer, idx);
591
+
592
+ trace_timer_start(timer, timer->expires, timer->flags);
593
+
594
+ /*
595
+ * Check whether this is the new first expiring timer. The
596
+ * effective expiry time of the timer is required here
597
+ * (bucket_expiry) instead of timer->expires.
598
+ */
599
+ if (time_before(bucket_expiry, base->next_expiry)) {
600
+ /*
601
+ * Set the next expiry time and kick the CPU so it
602
+ * can reevaluate the wheel:
603
+ */
604
+ base->next_expiry = bucket_expiry;
605
+ base->timers_pending = true;
606
+ base->next_expiry_recalc = false;
607
+ trigger_dyntick_cpu(base, timer);
608
+ }
609
+}
610
+
611
+static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
612
+{
613
+ unsigned long bucket_expiry;
614
+ unsigned int idx;
615
+
616
+ idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
617
+ enqueue_timer(base, timer, idx, bucket_expiry);
609618 }
610619
611620 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
612621
613
-static struct debug_obj_descr timer_debug_descr;
622
+static const struct debug_obj_descr timer_debug_descr;
614623
615624 static void *timer_debug_hint(void *addr)
616625 {
....@@ -665,7 +674,7 @@
665674
666675 case ODEBUG_STATE_ACTIVE:
667676 WARN_ON(1);
668
-
677
+ fallthrough;
669678 default:
670679 return false;
671680 }
....@@ -706,7 +715,7 @@
706715 }
707716 }
708717
709
-static struct debug_obj_descr timer_debug_descr = {
718
+static const struct debug_obj_descr timer_debug_descr = {
710719 .name = "timer_list",
711720 .debug_hint = timer_debug_hint,
712721 .is_static_object = timer_is_static_object,
....@@ -729,11 +738,6 @@
729738 static inline void debug_timer_deactivate(struct timer_list *timer)
730739 {
731740 debug_object_deactivate(timer, &timer_debug_descr);
732
-}
733
-
734
-static inline void debug_timer_free(struct timer_list *timer)
735
-{
736
- debug_object_free(timer, &timer_debug_descr);
737741 }
738742
739743 static inline void debug_timer_assert_init(struct timer_list *timer)
....@@ -775,13 +779,6 @@
775779 trace_timer_init(timer);
776780 }
777781
778
-static inline void
779
-debug_activate(struct timer_list *timer, unsigned long expires)
780
-{
781
- debug_timer_activate(timer);
782
- trace_timer_start(timer, expires, timer->flags);
783
-}
784
-
785782 static inline void debug_deactivate(struct timer_list *timer)
786783 {
787784 debug_timer_deactivate(timer);
....@@ -800,6 +797,8 @@
800797 {
801798 timer->entry.pprev = NULL;
802799 timer->function = func;
800
+ if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
801
+ flags &= TIMER_INIT_FLAGS;
803802 timer->flags = flags | raw_smp_processor_id();
804803 lockdep_init_map(&timer->lockdep_map, name, key, 0);
805804 }
....@@ -845,8 +844,10 @@
845844 if (!timer_pending(timer))
846845 return 0;
847846
848
- if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
847
+ if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
849848 __clear_bit(idx, base->pending_map);
849
+ base->next_expiry_recalc = true;
850
+ }
850851
851852 detach_timer(timer, clear_pending);
852853 return 1;
....@@ -896,20 +897,14 @@
896897
897898 static inline void forward_timer_base(struct timer_base *base)
898899 {
899
-#ifdef CONFIG_NO_HZ_COMMON
900
- unsigned long jnow;
900
+ unsigned long jnow = READ_ONCE(jiffies);
901901
902902 /*
903
- * We only forward the base when we are idle or have just come out of
904
- * idle (must_forward_clk logic), and have a delta between base clock
905
- * and jiffies. In the common case, run_timers will take care of it.
903
+ * No need to forward if we are close enough below jiffies.
904
+ * Also while executing timers, base->clk is 1 offset ahead
905
+ * of jiffies to avoid endless requeuing to current jffies.
906906 */
907
- if (likely(!base->must_forward_clk))
908
- return;
909
-
910
- jnow = READ_ONCE(jiffies);
911
- base->must_forward_clk = base->is_idle;
912
- if ((long)(jnow - base->clk) < 2)
907
+ if ((long)(jnow - base->clk) < 1)
913908 return;
914909
915910 /*
....@@ -923,7 +918,6 @@
923918 return;
924919 base->clk = base->next_expiry;
925920 }
926
-#endif
927921 }
928922
929923
....@@ -966,13 +960,14 @@
966960
967961 #define MOD_TIMER_PENDING_ONLY 0x01
968962 #define MOD_TIMER_REDUCE 0x02
963
+#define MOD_TIMER_NOTPENDING 0x04
969964
970965 static inline int
971966 __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
972967 {
968
+ unsigned long clk = 0, flags, bucket_expiry;
973969 struct timer_base *base, *new_base;
974970 unsigned int idx = UINT_MAX;
975
- unsigned long clk = 0, flags;
976971 int ret = 0;
977972
978973 BUG_ON(!timer->function);
....@@ -982,7 +977,7 @@
982977 * the timer is re-modified to have the same timeout or ends up in the
983978 * same array bucket then just return:
984979 */
985
- if (timer_pending(timer)) {
980
+ if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
986981 /*
987982 * The downside of this optimization is that it can result in
988983 * larger granularity than you would get from adding a new
....@@ -1011,7 +1006,7 @@
10111006 }
10121007
10131008 clk = base->clk;
1014
- idx = calc_wheel_index(expires, clk);
1009
+ idx = calc_wheel_index(expires, clk, &bucket_expiry);
10151010
10161011 /*
10171012 * Retrieve and compare the array index of the pending
....@@ -1058,22 +1053,19 @@
10581053 }
10591054 }
10601055
1061
- debug_activate(timer, expires);
1056
+ debug_timer_activate(timer);
10621057
10631058 timer->expires = expires;
10641059 /*
10651060 * If 'idx' was calculated above and the base time did not advance
10661061 * between calculating 'idx' and possibly switching the base, only
1067
- * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
1068
- * we need to (re)calculate the wheel index via
1069
- * internal_add_timer().
1062
+ * enqueue_timer() is required. Otherwise we need to (re)calculate
1063
+ * the wheel index via internal_add_timer().
10701064 */
1071
- if (idx != UINT_MAX && clk == base->clk) {
1072
- enqueue_timer(base, timer, idx);
1073
- trigger_dyntick_cpu(base, timer);
1074
- } else {
1065
+ if (idx != UINT_MAX && clk == base->clk)
1066
+ enqueue_timer(base, timer, idx, bucket_expiry);
1067
+ else
10751068 internal_add_timer(base, timer);
1076
- }
10771069
10781070 out_unlock:
10791071 raw_spin_unlock_irqrestore(&base->lock, flags);
....@@ -1155,7 +1147,7 @@
11551147 void add_timer(struct timer_list *timer)
11561148 {
11571149 BUG_ON(timer_pending(timer));
1158
- mod_timer(timer, timer->expires);
1150
+ __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
11591151 }
11601152 EXPORT_SYMBOL(add_timer);
11611153
....@@ -1192,7 +1184,7 @@
11921184 }
11931185 forward_timer_base(base);
11941186
1195
- debug_activate(timer, timer->expires);
1187
+ debug_timer_activate(timer);
11961188 internal_add_timer(base, timer);
11971189 raw_spin_unlock_irqrestore(&base->lock, flags);
11981190 }
....@@ -1227,25 +1219,6 @@
12271219 }
12281220 EXPORT_SYMBOL(del_timer);
12291221
1230
-static int __try_to_del_timer_sync(struct timer_list *timer,
1231
- struct timer_base **basep)
1232
-{
1233
- struct timer_base *base;
1234
- unsigned long flags;
1235
- int ret = -1;
1236
-
1237
- debug_assert_init(timer);
1238
-
1239
- *basep = base = lock_timer_base(timer, &flags);
1240
-
1241
- if (base->running_timer != timer)
1242
- ret = detach_if_pending(timer, base, true);
1243
-
1244
- raw_spin_unlock_irqrestore(&base->lock, flags);
1245
-
1246
- return ret;
1247
-}
1248
-
12491222 /**
12501223 * try_to_del_timer_sync - Try to deactivate a timer
12511224 * @timer: timer to delete
....@@ -1256,31 +1229,96 @@
12561229 int try_to_del_timer_sync(struct timer_list *timer)
12571230 {
12581231 struct timer_base *base;
1232
+ unsigned long flags;
1233
+ int ret = -1;
12591234
1260
- return __try_to_del_timer_sync(timer, &base);
1235
+ debug_assert_init(timer);
1236
+
1237
+ base = lock_timer_base(timer, &flags);
1238
+
1239
+ if (base->running_timer != timer)
1240
+ ret = detach_if_pending(timer, base, true);
1241
+
1242
+ raw_spin_unlock_irqrestore(&base->lock, flags);
1243
+
1244
+ return ret;
12611245 }
12621246 EXPORT_SYMBOL(try_to_del_timer_sync);
12631247
1264
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
1265
-static int __del_timer_sync(struct timer_list *timer)
1248
+#ifdef CONFIG_PREEMPT_RT
1249
+static __init void timer_base_init_expiry_lock(struct timer_base *base)
12661250 {
1267
- struct timer_base *base;
1268
- int ret;
1251
+ spin_lock_init(&base->expiry_lock);
1252
+}
12691253
1270
- for (;;) {
1271
- ret = __try_to_del_timer_sync(timer, &base);
1272
- if (ret >= 0)
1273
- return ret;
1254
+static inline void timer_base_lock_expiry(struct timer_base *base)
1255
+{
1256
+ spin_lock(&base->expiry_lock);
1257
+}
12741258
1275
- /*
1276
- * When accessing the lock, timers of base are no longer expired
1277
- * and so timer is no longer running.
1278
- */
1279
- spin_lock(&base->expiry_lock);
1259
+static inline void timer_base_unlock_expiry(struct timer_base *base)
1260
+{
1261
+ spin_unlock(&base->expiry_lock);
1262
+}
1263
+
1264
+/*
1265
+ * The counterpart to del_timer_wait_running().
1266
+ *
1267
+ * If there is a waiter for base->expiry_lock, then it was waiting for the
1268
+ * timer callback to finish. Drop expiry_lock and reacquire it. That allows
1269
+ * the waiter to acquire the lock and make progress.
1270
+ */
1271
+static void timer_sync_wait_running(struct timer_base *base)
1272
+{
1273
+ if (atomic_read(&base->timer_waiters)) {
1274
+ raw_spin_unlock_irq(&base->lock);
12801275 spin_unlock(&base->expiry_lock);
1276
+ spin_lock(&base->expiry_lock);
1277
+ raw_spin_lock_irq(&base->lock);
12811278 }
12821279 }
12831280
1281
+/*
1282
+ * This function is called on PREEMPT_RT kernels when the fast path
1283
+ * deletion of a timer failed because the timer callback function was
1284
+ * running.
1285
+ *
1286
+ * This prevents priority inversion, if the softirq thread on a remote CPU
1287
+ * got preempted, and it prevents a life lock when the task which tries to
1288
+ * delete a timer preempted the softirq thread running the timer callback
1289
+ * function.
1290
+ */
1291
+static void del_timer_wait_running(struct timer_list *timer)
1292
+{
1293
+ u32 tf;
1294
+
1295
+ tf = READ_ONCE(timer->flags);
1296
+ if (!(tf & TIMER_MIGRATING)) {
1297
+ struct timer_base *base = get_timer_base(tf);
1298
+
1299
+ /*
1300
+ * Mark the base as contended and grab the expiry lock,
1301
+ * which is held by the softirq across the timer
1302
+ * callback. Drop the lock immediately so the softirq can
1303
+ * expire the next timer. In theory the timer could already
1304
+ * be running again, but that's more than unlikely and just
1305
+ * causes another wait loop.
1306
+ */
1307
+ atomic_inc(&base->timer_waiters);
1308
+ spin_lock_bh(&base->expiry_lock);
1309
+ atomic_dec(&base->timer_waiters);
1310
+ spin_unlock_bh(&base->expiry_lock);
1311
+ }
1312
+}
1313
+#else
1314
+static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
1315
+static inline void timer_base_lock_expiry(struct timer_base *base) { }
1316
+static inline void timer_base_unlock_expiry(struct timer_base *base) { }
1317
+static inline void timer_sync_wait_running(struct timer_base *base) { }
1318
+static inline void del_timer_wait_running(struct timer_list *timer) { }
1319
+#endif
1320
+
1321
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
12841322 /**
12851323 * del_timer_sync - deactivate a timer and wait for the handler to finish.
12861324 * @timer: the timer to be deactivated
....@@ -1319,6 +1357,8 @@
13191357 */
13201358 int del_timer_sync(struct timer_list *timer)
13211359 {
1360
+ int ret;
1361
+
13221362 #ifdef CONFIG_LOCKDEP
13231363 unsigned long flags;
13241364
....@@ -1337,12 +1377,23 @@
13371377 */
13381378 WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
13391379
1340
- return __del_timer_sync(timer);
1380
+ do {
1381
+ ret = try_to_del_timer_sync(timer);
1382
+
1383
+ if (unlikely(ret < 0)) {
1384
+ del_timer_wait_running(timer);
1385
+ cpu_relax();
1386
+ }
1387
+ } while (ret < 0);
1388
+
1389
+ return ret;
13411390 }
13421391 EXPORT_SYMBOL(del_timer_sync);
13431392 #endif
13441393
1345
-static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *))
1394
+static void call_timer_fn(struct timer_list *timer,
1395
+ void (*fn)(struct timer_list *),
1396
+ unsigned long baseclk)
13461397 {
13471398 int count = preempt_count();
13481399
....@@ -1365,14 +1416,14 @@
13651416 */
13661417 lock_map_acquire(&lockdep_map);
13671418
1368
- trace_timer_expire_entry(timer);
1419
+ trace_timer_expire_entry(timer, baseclk);
13691420 fn(timer);
13701421 trace_timer_expire_exit(timer);
13711422
13721423 lock_map_release(&lockdep_map);
13731424
13741425 if (count != preempt_count()) {
1375
- WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1426
+ WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
13761427 fn, count, preempt_count());
13771428 /*
13781429 * Restore the preempt count. That gives us a decent
....@@ -1386,6 +1437,13 @@
13861437
13871438 static void expire_timers(struct timer_base *base, struct hlist_head *head)
13881439 {
1440
+ /*
1441
+ * This value is required only for tracing. base->clk was
1442
+ * incremented directly before expire_timers was called. But expiry
1443
+ * is related to the old base->clk value.
1444
+ */
1445
+ unsigned long baseclk = base->clk - 1;
1446
+
13891447 while (!hlist_empty(head)) {
13901448 struct timer_list *timer;
13911449 void (*fn)(struct timer_list *);
....@@ -1399,26 +1457,23 @@
13991457
14001458 if (timer->flags & TIMER_IRQSAFE) {
14011459 raw_spin_unlock(&base->lock);
1402
- call_timer_fn(timer, fn);
1403
- base->running_timer = NULL;
1404
- spin_unlock(&base->expiry_lock);
1405
- spin_lock(&base->expiry_lock);
1460
+ call_timer_fn(timer, fn, baseclk);
14061461 raw_spin_lock(&base->lock);
1462
+ base->running_timer = NULL;
14071463 } else {
14081464 raw_spin_unlock_irq(&base->lock);
1409
- call_timer_fn(timer, fn);
1410
- base->running_timer = NULL;
1411
- spin_unlock(&base->expiry_lock);
1412
- spin_lock(&base->expiry_lock);
1465
+ call_timer_fn(timer, fn, baseclk);
14131466 raw_spin_lock_irq(&base->lock);
1467
+ base->running_timer = NULL;
1468
+ timer_sync_wait_running(base);
14141469 }
14151470 }
14161471 }
14171472
1418
-static int __collect_expired_timers(struct timer_base *base,
1419
- struct hlist_head *heads)
1473
+static int collect_expired_timers(struct timer_base *base,
1474
+ struct hlist_head *heads)
14201475 {
1421
- unsigned long clk = base->clk;
1476
+ unsigned long clk = base->clk = base->next_expiry;
14221477 struct hlist_head *vec;
14231478 int i, levels = 0;
14241479 unsigned int idx;
....@@ -1440,7 +1495,6 @@
14401495 return levels;
14411496 }
14421497
1443
-#ifdef CONFIG_NO_HZ_COMMON
14441498 /*
14451499 * Find the next pending bucket of a level. Search from level start (@offset)
14461500 * + @clk upwards and if nothing there, search from start of the level
....@@ -1473,6 +1527,7 @@
14731527 clk = base->clk;
14741528 for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
14751529 int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
1530
+ unsigned long lvl_clk = clk & LVL_CLK_MASK;
14761531
14771532 if (pos >= 0) {
14781533 unsigned long tmp = clk + (unsigned long) pos;
....@@ -1480,6 +1535,13 @@
14801535 tmp <<= LVL_SHIFT(lvl);
14811536 if (time_before(tmp, next))
14821537 next = tmp;
1538
+
1539
+ /*
1540
+ * If the next expiration happens before we reach
1541
+ * the next level, no need to check further.
1542
+ */
1543
+ if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
1544
+ break;
14831545 }
14841546 /*
14851547 * Clock for the next level. If the current level clock lower
....@@ -1517,13 +1579,18 @@
15171579 * So the simple check whether the lower bits of the current
15181580 * level are 0 or not is sufficient for all cases.
15191581 */
1520
- adj = clk & LVL_CLK_MASK ? 1 : 0;
1582
+ adj = lvl_clk ? 1 : 0;
15211583 clk >>= LVL_CLK_SHIFT;
15221584 clk += adj;
15231585 }
1586
+
1587
+ base->next_expiry_recalc = false;
1588
+ base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
1589
+
15241590 return next;
15251591 }
15261592
1593
+#ifdef CONFIG_NO_HZ_COMMON
15271594 /*
15281595 * Check, if the next hrtimer event is before the next timer wheel
15291596 * event:
....@@ -1570,7 +1637,6 @@
15701637 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
15711638 u64 expires = KTIME_MAX;
15721639 unsigned long nextevt;
1573
- bool is_max_delta;
15741640
15751641 /*
15761642 * Pretend that there is no timer pending if the cpu is offline.
....@@ -1580,9 +1646,10 @@
15801646 return expires;
15811647
15821648 raw_spin_lock(&base->lock);
1583
- nextevt = __next_timer_interrupt(base);
1584
- is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
1585
- base->next_expiry = nextevt;
1649
+ if (base->next_expiry_recalc)
1650
+ base->next_expiry = __next_timer_interrupt(base);
1651
+ nextevt = base->next_expiry;
1652
+
15861653 /*
15871654 * We have a fresh next event. Check whether we can forward the
15881655 * base. We can only do that when @basej is past base->clk
....@@ -1599,7 +1666,7 @@
15991666 expires = basem;
16001667 base->is_idle = false;
16011668 } else {
1602
- if (!is_max_delta)
1669
+ if (base->timers_pending)
16031670 expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
16041671 /*
16051672 * If we expect to sleep more than a tick, mark the base idle.
....@@ -1608,10 +1675,8 @@
16081675 * logic is only maintained for the BASE_STD base, deferrable
16091676 * timers may still see large granularity skew (by design).
16101677 */
1611
- if ((expires - basem) > TICK_NSEC) {
1612
- base->must_forward_clk = true;
1678
+ if ((expires - basem) > TICK_NSEC)
16131679 base->is_idle = true;
1614
- }
16151680 }
16161681 raw_spin_unlock(&base->lock);
16171682
....@@ -1635,42 +1700,6 @@
16351700 */
16361701 base->is_idle = false;
16371702 }
1638
-
1639
-static int collect_expired_timers(struct timer_base *base,
1640
- struct hlist_head *heads)
1641
-{
1642
- unsigned long now = READ_ONCE(jiffies);
1643
-
1644
- /*
1645
- * NOHZ optimization. After a long idle sleep we need to forward the
1646
- * base to current jiffies. Avoid a loop by searching the bitfield for
1647
- * the next expiring timer.
1648
- */
1649
- if ((long)(now - base->clk) > 2) {
1650
- unsigned long next = __next_timer_interrupt(base);
1651
-
1652
- /*
1653
- * If the next timer is ahead of time forward to current
1654
- * jiffies, otherwise forward to the next expiry time:
1655
- */
1656
- if (time_after(next, now)) {
1657
- /*
1658
- * The call site will increment base->clk and then
1659
- * terminate the expiry loop immediately.
1660
- */
1661
- base->clk = now;
1662
- return 0;
1663
- }
1664
- base->clk = next;
1665
- }
1666
- return __collect_expired_timers(base, heads);
1667
-}
1668
-#else
1669
-static inline int collect_expired_timers(struct timer_base *base,
1670
- struct hlist_head *heads)
1671
-{
1672
- return __collect_expired_timers(base, heads);
1673
-}
16741703 #endif
16751704
16761705 /*
....@@ -1681,17 +1710,19 @@
16811710 {
16821711 struct task_struct *p = current;
16831712
1713
+ PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0);
1714
+
16841715 /* Note: this timer irq context must be accounted for as well. */
16851716 account_process_tick(p, user_tick);
16861717 run_local_timers();
1687
- rcu_check_callbacks(user_tick);
1718
+ rcu_sched_clock_irq(user_tick);
16881719 #ifdef CONFIG_IRQ_WORK
16891720 if (in_irq())
16901721 irq_work_tick();
16911722 #endif
16921723 scheduler_tick();
16931724 if (IS_ENABLED(CONFIG_POSIX_TIMERS))
1694
- run_posix_cpu_timers(p);
1725
+ run_posix_cpu_timers();
16951726 }
16961727
16971728 /**
....@@ -1703,38 +1734,32 @@
17031734 struct hlist_head heads[LVL_DEPTH];
17041735 int levels;
17051736
1706
- if (!time_after_eq(jiffies, base->clk))
1737
+ if (time_before(jiffies, base->next_expiry))
17071738 return;
17081739
1709
- spin_lock(&base->expiry_lock);
1740
+ timer_base_lock_expiry(base);
17101741 raw_spin_lock_irq(&base->lock);
17111742
1712
- /*
1713
- * timer_base::must_forward_clk must be cleared before running
1714
- * timers so that any timer functions that call mod_timer() will
1715
- * not try to forward the base. Idle tracking / clock forwarding
1716
- * logic is only used with BASE_STD timers.
1717
- *
1718
- * The must_forward_clk flag is cleared unconditionally also for
1719
- * the deferrable base. The deferrable base is not affected by idle
1720
- * tracking and never forwarded, so clearing the flag is a NOOP.
1721
- *
1722
- * The fact that the deferrable base is never forwarded can cause
1723
- * large variations in granularity for deferrable timers, but they
1724
- * can be deferred for long periods due to idle anyway.
1725
- */
1726
- base->must_forward_clk = false;
1727
-
1728
- while (time_after_eq(jiffies, base->clk)) {
1729
-
1743
+ while (time_after_eq(jiffies, base->clk) &&
1744
+ time_after_eq(jiffies, base->next_expiry)) {
17301745 levels = collect_expired_timers(base, heads);
1746
+ /*
1747
+ * The two possible reasons for not finding any expired
1748
+ * timer at this clk are that all matching timers have been
1749
+ * dequeued or no timer has been queued since
1750
+ * base::next_expiry was set to base::clk +
1751
+ * NEXT_TIMER_MAX_DELTA.
1752
+ */
1753
+ WARN_ON_ONCE(!levels && !base->next_expiry_recalc
1754
+ && base->timers_pending);
17311755 base->clk++;
1756
+ base->next_expiry = __next_timer_interrupt(base);
17321757
17331758 while (levels--)
17341759 expire_timers(base, heads + levels);
17351760 }
17361761 raw_spin_unlock_irq(&base->lock);
1737
- spin_unlock(&base->expiry_lock);
1762
+ timer_base_unlock_expiry(base);
17381763 }
17391764
17401765 /*
....@@ -1743,8 +1768,6 @@
17431768 static __latent_entropy void run_timer_softirq(struct softirq_action *h)
17441769 {
17451770 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
1746
-
1747
- irq_work_tick_soft();
17481771
17491772 __run_timers(base);
17501773 if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
....@@ -1760,12 +1783,12 @@
17601783
17611784 hrtimer_run_queues();
17621785 /* Raise the softirq only if required. */
1763
- if (time_before(jiffies, base->clk)) {
1786
+ if (time_before(jiffies, base->next_expiry)) {
17641787 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
17651788 return;
17661789 /* CPU is awake, so check the deferrable base. */
17671790 base++;
1768
- if (time_before(jiffies, base->clk))
1791
+ if (time_before(jiffies, base->next_expiry))
17691792 return;
17701793 }
17711794 raise_softirq(TIMER_SOFTIRQ);
....@@ -1791,21 +1814,23 @@
17911814 * schedule_timeout - sleep until timeout
17921815 * @timeout: timeout value in jiffies
17931816 *
1794
- * Make the current task sleep until @timeout jiffies have
1795
- * elapsed. The routine will return immediately unless
1796
- * the current task state has been set (see set_current_state()).
1817
+ * Make the current task sleep until @timeout jiffies have elapsed.
1818
+ * The function behavior depends on the current task state
1819
+ * (see also set_current_state() description):
17971820 *
1798
- * You can set the task state as follows -
1821
+ * %TASK_RUNNING - the scheduler is called, but the task does not sleep
1822
+ * at all. That happens because sched_submit_work() does nothing for
1823
+ * tasks in %TASK_RUNNING state.
17991824 *
18001825 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
18011826 * pass before the routine returns unless the current task is explicitly
1802
- * woken up, (e.g. by wake_up_process())".
1827
+ * woken up, (e.g. by wake_up_process()).
18031828 *
18041829 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
18051830 * delivered to the current task or the current task is explicitly woken
18061831 * up.
18071832 *
1808
- * The current task state is guaranteed to be TASK_RUNNING when this
1833
+ * The current task state is guaranteed to be %TASK_RUNNING when this
18091834 * routine returns.
18101835 *
18111836 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
....@@ -1813,7 +1838,7 @@
18131838 * value will be %MAX_SCHEDULE_TIMEOUT.
18141839 *
18151840 * Returns 0 when the timer has expired otherwise the remaining time in
1816
- * jiffies will be returned. In all cases the return value is guaranteed
1841
+ * jiffies will be returned. In all cases the return value is guaranteed
18171842 * to be non-negative.
18181843 */
18191844 signed long __sched schedule_timeout(signed long timeout)
....@@ -1854,7 +1879,7 @@
18541879
18551880 timer.task = current;
18561881 timer_setup_on_stack(&timer.timer, process_timeout, 0);
1857
- __mod_timer(&timer.timer, expire, 0);
1882
+ __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
18581883 schedule();
18591884 del_singleshot_timer_sync(&timer.timer);
18601885
....@@ -1927,8 +1952,8 @@
19271952 base = per_cpu_ptr(&timer_bases[b], cpu);
19281953 base->clk = jiffies;
19291954 base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
1955
+ base->timers_pending = false;
19301956 base->is_idle = false;
1931
- base->must_forward_clk = true;
19321957 }
19331958 return 0;
19341959 }
....@@ -1981,7 +2006,8 @@
19812006 base->cpu = cpu;
19822007 raw_spin_lock_init(&base->lock);
19832008 base->clk = jiffies;
1984
- spin_lock_init(&base->expiry_lock);
2009
+ base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
2010
+ timer_base_init_expiry_lock(base);
19852011 }
19862012 }
19872013
....@@ -1996,6 +2022,7 @@
19962022 void __init init_timers(void)
19972023 {
19982024 init_timer_cpus();
2025
+ posix_cputimers_init_work();
19992026 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
20002027 }
20012028
....@@ -2029,6 +2056,32 @@
20292056 EXPORT_SYMBOL(msleep_interruptible);
20302057
20312058 /**
2059
+ * usleep_range_state - Sleep for an approximate time in a given state
2060
+ * @min: Minimum time in usecs to sleep
2061
+ * @max: Maximum time in usecs to sleep
2062
+ * @state: State the current task will be in while sleeping
2063
+ *
2064
+ * In non-atomic context where the exact wakeup time is flexible, use
2065
+ * usleep_range_state() instead of udelay(). The sleep improves responsiveness
2066
+ * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
2067
+ * power usage by allowing hrtimers to take advantage of an already-
2068
+ * scheduled interrupt instead of scheduling a new one just for this sleep.
2069
+ */
2070
+void __sched usleep_range_state(unsigned long min, unsigned long max,
2071
+ unsigned int state)
2072
+{
2073
+ ktime_t exp = ktime_add_us(ktime_get(), min);
2074
+ u64 delta = (u64)(max - min) * NSEC_PER_USEC;
2075
+
2076
+ for (;;) {
2077
+ __set_current_state(state);
2078
+ /* Do not return before the requested sleep time has elapsed */
2079
+ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
2080
+ break;
2081
+ }
2082
+}
2083
+
2084
+/**
20322085 * usleep_range - Sleep for an approximate time
20332086 * @min: Minimum time in usecs to sleep
20342087 * @max: Maximum time in usecs to sleep
....@@ -2041,14 +2094,6 @@
20412094 */
20422095 void __sched usleep_range(unsigned long min, unsigned long max)
20432096 {
2044
- ktime_t exp = ktime_add_us(ktime_get(), min);
2045
- u64 delta = (u64)(max - min) * NSEC_PER_USEC;
2046
-
2047
- for (;;) {
2048
- __set_current_state(TASK_UNINTERRUPTIBLE);
2049
- /* Do not return before the requested sleep time has elapsed */
2050
- if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
2051
- break;
2052
- }
2097
+ usleep_range_state(min, max, TASK_UNINTERRUPTIBLE);
20532098 }
20542099 EXPORT_SYMBOL(usleep_range);
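The final hunk turns usleep_range() into a wrapper around the new usleep_range_state(). A minimal, hypothetical caller sketch follows (not from this patch; the register offset, ready bit, and helper name are invented) showing the intended use: a bounded, non-atomic poll loop that sleeps between reads instead of busy-waiting with udelay().

#include <linux/bits.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/io.h>

/*
 * Hypothetical polling helper. usleep_range() keeps the wait off the CPU and
 * gives hrtimers slack to coalesce wakeups; a caller that wants the wait to
 * be interruptible could use usleep_range_state(100, 200, TASK_INTERRUPTIBLE)
 * instead of the TASK_UNINTERRUPTIBLE behaviour of usleep_range().
 */
static int example_wait_ready(void __iomem *regs)
{
        int tries = 100;

        while (tries--) {
                if (readl(regs + 0x04) & BIT(0))        /* hypothetical STATUS_READY bit */
                        return 0;
                usleep_range(100, 200);                 /* sleep 100-200 us between polls */
        }
        return -ETIMEDOUT;
}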