hc
2024-01-03 2f7c68cb55ecb7331f2381deb497c27155f32faf
kernel/kernel/time/tick-sched.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
- * linux/kernel/time/tick-sched.c
- *
  * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
  * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
  * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
@@ -8,8 +7,6 @@
  * No idle tick implementation for low and high resolution timers
  *
  * Started by: Thomas Gleixner and Ingo Molnar
- *
- * Distribute under GPLv2.
  */
 #include <linux/cpu.h>
 #include <linux/err.h>
@@ -26,9 +23,9 @@
 #include <linux/module.h>
 #include <linux/irq_work.h>
 #include <linux/posix-timers.h>
-#include <linux/timer.h>
 #include <linux/context_tracking.h>
 #include <linux/mm.h>
+#include <trace/hooks/sched.h>
 
 #include <asm/irq_regs.h>
 
@@ -57,49 +54,67 @@
  */
 static void tick_do_update_jiffies64(ktime_t now)
 {
-        unsigned long ticks = 0;
+        unsigned long ticks = 1;
         ktime_t delta;
 
         /*
-         * Do a quick check without holding jiffies_lock:
-         * The READ_ONCE() pairs with two updates done later in this function.
+         * Do a quick check without holding jiffies_lock. The READ_ONCE()
+         * pairs with the update done later in this function.
+         *
+         * This is also an intentional data race which is even safe on
+         * 32bit in theory. If there is a concurrent update then the check
+         * might give a random answer. It does not matter because if it
+         * returns then the concurrent update is already taking care, if it
+         * falls through then it will pointlessly contend on jiffies_lock.
+         *
+         * Though there is one nasty case on 32bit due to store tearing of
+         * the 64bit value. If the first 32bit store makes the quick check
+         * return on all other CPUs and the writing CPU context gets
+         * delayed to complete the second store (scheduled out on virt)
+         * then jiffies can become stale for up to ~2^32 nanoseconds
+         * without noticing. After that point all CPUs will wait for
+         * jiffies lock.
+         *
+         * OTOH, this is not any different than the situation with NOHZ=off
+         * where one CPU is responsible for updating jiffies and
+         * timekeeping. If that CPU goes out for lunch then all other CPUs
+         * will operate on stale jiffies until it decides to come back.
         */
-        delta = ktime_sub(now, READ_ONCE(last_jiffies_update));
-        if (delta < tick_period)
+        if (ktime_before(now, READ_ONCE(tick_next_period)))
                 return;
 
         /* Reevaluate with jiffies_lock held */
         raw_spin_lock(&jiffies_lock);
-        write_seqcount_begin(&jiffies_seq);
-
-        delta = ktime_sub(now, last_jiffies_update);
-        if (delta >= tick_period) {
-
-                delta = ktime_sub(delta, tick_period);
-                /* Pairs with the lockless read in this function. */
-                WRITE_ONCE(last_jiffies_update,
-                           ktime_add(last_jiffies_update, tick_period));
-
-                /* Slow path for long timeouts */
-                if (unlikely(delta >= tick_period)) {
-                        s64 incr = ktime_to_ns(tick_period);
-
-                        ticks = ktime_divns(delta, incr);
-
-                        /* Pairs with the lockless read in this function. */
-                        WRITE_ONCE(last_jiffies_update,
-                                   ktime_add_ns(last_jiffies_update,
-                                                incr * ticks));
-                }
-                do_timer(++ticks);
-
-                /* Keep the tick_next_period variable up to date */
-                tick_next_period = ktime_add(last_jiffies_update, tick_period);
-        } else {
-                write_seqcount_end(&jiffies_seq);
+        if (ktime_before(now, tick_next_period)) {
                 raw_spin_unlock(&jiffies_lock);
                 return;
         }
+
+        write_seqcount_begin(&jiffies_seq);
+
+        delta = ktime_sub(now, tick_next_period);
+        if (unlikely(delta >= TICK_NSEC)) {
+                /* Slow path for long idle sleep times */
+                s64 incr = TICK_NSEC;
+
+                ticks += ktime_divns(delta, incr);
+
+                last_jiffies_update = ktime_add_ns(last_jiffies_update,
+                                                   incr * ticks);
+        } else {
+                last_jiffies_update = ktime_add_ns(last_jiffies_update,
+                                                   TICK_NSEC);
+        }
+
+        do_timer(ticks);
+
+        /*
+         * Keep the tick_next_period variable up to date. WRITE_ONCE()
+         * pairs with the READ_ONCE() in the lockless quick check above.
+         */
+        WRITE_ONCE(tick_next_period,
+                   ktime_add_ns(last_jiffies_update, TICK_NSEC));
+
         write_seqcount_end(&jiffies_seq);
         raw_spin_unlock(&jiffies_lock);
         update_wall_time();
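Note: a minimal userspace sketch (not kernel code) of the catch-up arithmetic introduced above. TICK_NSEC's value, the stand-in variables and the single-threaded model are assumptions; the real function additionally serializes writers with jiffies_lock/jiffies_seq and uses READ_ONCE()/WRITE_ONCE() for the lockless quick check.

        #include <stdint.h>
        #include <stdio.h>

        #define TICK_NSEC 1000000ULL    /* assumed 1 ms tick, i.e. HZ=1000 */

        static uint64_t last_jiffies_update, tick_next_period, jiffies64;

        static void update_jiffies64(uint64_t now)
        {
                uint64_t ticks = 1, delta;

                if (now < tick_next_period)     /* quick check, lockless in the kernel */
                        return;

                delta = now - tick_next_period;
                if (delta >= TICK_NSEC)         /* slow path: long idle sleep */
                        ticks += delta / TICK_NSEC;

                last_jiffies_update += ticks * TICK_NSEC;
                jiffies64 += ticks;             /* stands in for do_timer(ticks) */
                tick_next_period = last_jiffies_update + TICK_NSEC;
        }

        int main(void)
        {
                tick_next_period = TICK_NSEC;
                update_jiffies64(5 * TICK_NSEC + 123);  /* woke up ~5 ticks late */
                printf("jiffies64=%llu\n", (unsigned long long)jiffies64); /* prints 5 */
                return 0;
        }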
@@ -115,13 +130,26 @@
         raw_spin_lock(&jiffies_lock);
         write_seqcount_begin(&jiffies_seq);
         /* Did we start the jiffies update yet ? */
-        if (last_jiffies_update == 0)
+        if (last_jiffies_update == 0) {
+                u32 rem;
+
+                /*
+                 * Ensure that the tick is aligned to a multiple of
+                 * TICK_NSEC.
+                 */
+                div_u64_rem(tick_next_period, TICK_NSEC, &rem);
+                if (rem)
+                        tick_next_period += TICK_NSEC - rem;
+
                 last_jiffies_update = tick_next_period;
+        }
         period = last_jiffies_update;
         write_seqcount_end(&jiffies_seq);
         raw_spin_unlock(&jiffies_lock);
         return period;
 }
+
+#define MAX_STALLED_JIFFIES 5
 
 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
 {
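Note: a small worked example (plain C, assuming HZ=250 so TICK_NSEC is 4 ms) of the rounding the div_u64_rem() hunk above performs, pushing an unaligned tick_next_period up to the next multiple of TICK_NSEC.

        #include <stdint.h>
        #include <stdio.h>

        #define TICK_NSEC 4000000ULL    /* assumed 4 ms tick (HZ=250) */

        int main(void)
        {
                uint64_t next = 1234567891ULL;          /* arbitrary unaligned time */
                uint32_t rem = next % TICK_NSEC;        /* div_u64_rem() in the kernel */

                if (rem)
                        next += TICK_NSEC - rem;

                /* prints "1234567891 -> 1236000000", the next TICK_NSEC multiple */
                printf("%llu -> %llu\n", 1234567891ULL, (unsigned long long)next);
                return 0;
        }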
@@ -134,15 +162,38 @@
          * into a long sleep. If two CPUs happen to assign themselves to
          * this duty, then the jiffies update is still serialized by
          * jiffies_lock.
+         *
+         * If nohz_full is enabled, this should not happen because the
+         * tick_do_timer_cpu never relinquishes.
         */
-        if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
-            && !tick_nohz_full_cpu(cpu))
+        if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
+#ifdef CONFIG_NO_HZ_FULL
+                WARN_ON_ONCE(tick_nohz_full_running);
+#endif
                 tick_do_timer_cpu = cpu;
+        }
 #endif
 
         /* Check, if the jiffies need an update */
-        if (tick_do_timer_cpu == cpu)
+        if (tick_do_timer_cpu == cpu) {
                 tick_do_update_jiffies64(now);
+                trace_android_vh_jiffies_update(NULL);
+        }
+
+        /*
+         * If jiffies update stalled for too long (timekeeper in stop_machine()
+         * or VMEXIT'ed for several msecs), force an update.
+         */
+        if (ts->last_tick_jiffies != jiffies) {
+                ts->stalled_jiffies = 0;
+                ts->last_tick_jiffies = READ_ONCE(jiffies);
+        } else {
+                if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
+                        tick_do_update_jiffies64(now);
+                        ts->stalled_jiffies = 0;
+                        ts->last_tick_jiffies = READ_ONCE(jiffies);
+                }
+        }
 
         if (ts->inidle)
                 ts->got_idle_tick = 1;
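Note: a self-contained sketch of the stalled-jiffies fallback added above. The stand-in force_jiffies_update() and the frozen-jiffies loop are assumptions used to model a stalled timekeeper; this is not kernel code.

        #include <stdio.h>

        #define MAX_STALLED_JIFFIES 5

        static unsigned long jiffies;
        static unsigned long last_tick_jiffies;
        static int stalled_jiffies;

        static void force_jiffies_update(void) { jiffies++; }  /* tick_do_update_jiffies64() stand-in */

        static void on_tick(void)
        {
                if (last_tick_jiffies != jiffies) {
                        stalled_jiffies = 0;
                        last_tick_jiffies = jiffies;
                } else if (++stalled_jiffies == MAX_STALLED_JIFFIES) {
                        force_jiffies_update();         /* nobody else advanced jiffies */
                        stalled_jiffies = 0;
                        last_tick_jiffies = jiffies;
                }
        }

        int main(void)
        {
                for (int i = 0; i < 12; i++)    /* timekeeper stalled: jiffies frozen */
                        on_tick();
                printf("forced updates: %lu\n", jiffies);       /* prints 2 */
                return 0;
        }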
@@ -179,6 +230,7 @@
 #ifdef CONFIG_NO_HZ_FULL
 cpumask_var_t tick_nohz_full_mask;
 bool tick_nohz_full_running;
+EXPORT_SYMBOL_GPL(tick_nohz_full_running);
 static atomic_t tick_dep_mask;
 
 static bool check_tick_dependency(atomic_t *dep)
@@ -202,6 +254,16 @@
 
         if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) {
                 trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
+                return true;
+        }
+
+        if (val & TICK_DEP_MASK_RCU) {
+                trace_tick_stop(0, TICK_DEP_MASK_RCU);
+                return true;
+        }
+
+        if (val & TICK_DEP_MASK_RCU_EXP) {
+                trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP);
                 return true;
         }
 
@@ -237,7 +299,7 @@
 
 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
         .func = nohz_full_kick_func,
-        .flags = IRQ_WORK_HARD_IRQ,
+        .flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ),
 };
 
 /*
@@ -332,6 +394,7 @@
                 preempt_enable();
         }
 }
+EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);
 
 void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
 {
@@ -339,24 +402,35 @@
 
         atomic_andnot(BIT(bit), &ts->tick_dep_mask);
 }
+EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
 
 /*
- * Set a per-task tick dependency. Posix CPU timers need this in order to elapse
- * per task timers.
+ * Set a per-task tick dependency. RCU need this. Also posix CPU timers
+ * in order to elapse per task timers.
  */
 void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
 {
-        /*
-         * We could optimize this with just kicking the target running the task
-         * if that noise matters for nohz full users.
-         */
-        tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
+        if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) {
+                if (tsk == current) {
+                        preempt_disable();
+                        tick_nohz_full_kick();
+                        preempt_enable();
+                } else {
+                        /*
+                         * Some future tick_nohz_full_kick_task()
+                         * should optimize this.
+                         */
+                        tick_nohz_full_kick_all();
+                }
+        }
 }
+EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
 
 void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
 {
         atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
 }
+EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);
 
 /*
  * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
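Note: a plain-C model of the atomic_fetch_or() pattern used by the new tick_nohz_dep_set_task() above: only the caller that takes the dependency mask from zero to non-zero pays for the kick. The C11 atomics and the kick counter are stand-ins, not kernel APIs.

        #include <stdatomic.h>
        #include <stdio.h>

        static atomic_uint tick_dep_mask;       /* models tsk->tick_dep_mask */
        static int kicks;

        static void kick_target(void) { kicks++; }      /* tick_nohz_full_kick*() stand-in */

        static void dep_set(unsigned int bit)
        {
                /* Kick only if the whole mask was previously zero, as in the kernel code. */
                if (!atomic_fetch_or(&tick_dep_mask, 1u << bit))
                        kick_target();
        }

        int main(void)
        {
                dep_set(2);     /* mask was 0: kick */
                dep_set(2);     /* bit already set: no kick */
                dep_set(3);     /* mask already non-zero: no kick either */
                printf("kicks=%d mask=%#x\n", kicks,
                       (unsigned int)atomic_load(&tick_dep_mask));      /* kicks=1 mask=0xc */
                return 0;
        }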
@@ -406,16 +480,21 @@
         tick_nohz_full_running = true;
 }
 
-static int tick_nohz_cpu_down(unsigned int cpu)
+bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
 {
         /*
-         * The boot CPU handles housekeeping duty (unbound timers,
-         * workqueues, timekeeping, ...) on behalf of full dynticks
+         * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
+         * timers, workqueues, timekeeping, ...) on behalf of full dynticks
          * CPUs. It must remain online when nohz full is enabled.
         */
         if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
-                return -EBUSY;
-        return 0;
+                return false;
+        return true;
+}
+
+static int tick_nohz_cpu_down(unsigned int cpu)
+{
+        return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY;
 }
 
 void __init tick_nohz_init(void)
@@ -437,12 +516,15 @@
                 return;
         }
 
-        cpu = smp_processor_id();
+        if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
+            !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
+                cpu = smp_processor_id();
 
-        if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
-                pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n",
-                        cpu);
-                cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+                if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
+                        pr_warn("NO_HZ: Clearing %d from nohz_full range "
+                                "for timekeeping\n", cpu);
+                        cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+                }
         }
 
         for_each_cpu(cpu, tick_nohz_full_mask)
@@ -637,12 +719,14 @@
         hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
 
         /* Forward the time to expire in the future */
-        hrtimer_forward(&ts->sched_timer, now, tick_period);
+        hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
 
-        if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
-                hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
-        else
+        if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+                hrtimer_start_expires(&ts->sched_timer,
+                                      HRTIMER_MODE_ABS_PINNED_HARD);
+        } else {
                 tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+        }
 
         /*
          * Reset to make sure next tick stop doesn't get fooled by past
@@ -659,7 +743,8 @@
 static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
 {
         u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
-        unsigned long seq, basejiff;
+        unsigned long basejiff;
+        unsigned int seq;
 
         /* Read jiffies and the time when jiffies were updated last */
         do {
@@ -786,7 +871,6 @@
         */
         if (!ts->tick_stopped) {
                 calc_load_nohz_start();
-                cpu_load_update_nohz_start();
                 quiet_vmstat();
 
                 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -803,11 +887,14 @@
         if (unlikely(expires == KTIME_MAX)) {
                 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
                         hrtimer_cancel(&ts->sched_timer);
+                else
+                        tick_program_event(KTIME_MAX, 1);
                 return;
         }
 
         if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-                hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
+                hrtimer_start(&ts->sched_timer, tick,
+                              HRTIMER_MODE_ABS_PINNED_HARD);
         } else {
                 hrtimer_set_expires(&ts->sched_timer, tick);
                 tick_program_event(tick, 1);
@@ -833,7 +920,6 @@
 {
         /* Update jiffies first */
         tick_do_update_jiffies64(now);
-        cpu_load_update_nohz_stop();
 
         /*
          * Clear the timer idle flag, so we avoid IPIs on remote queueing and
@@ -896,8 +982,15 @@
         if (need_resched())
                 return false;
 
-        if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
-                softirq_check_pending_idle();
+        if (unlikely(local_softirq_pending())) {
+                static int ratelimit;
+
+                if (ratelimit < 10 &&
+                    (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
+                        pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n",
+                                (unsigned int) local_softirq_pending());
+                        ratelimit++;
+                }
                 return false;
         }
 
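Note: a trivial userspace model of the ratelimited warning added above; at most ten messages are printed no matter how often the pending check fails. The names and the 0x08 pending value are illustrative only.

        #include <stdio.h>

        static int can_stop_tick(unsigned int pending)
        {
                static int ratelimit;

                if (pending) {
                        if (ratelimit < 10) {
                                fprintf(stderr,
                                        "tick-stop error: softirq work pending, handler #%02x\n",
                                        pending);
                                ratelimit++;
                        }
                        return 0;       /* refuse to stop the tick */
                }
                return 1;
        }

        int main(void)
        {
                for (int i = 0; i < 20; i++)
                        can_stop_tick(0x08);    /* only the first 10 iterations warn */
                return 0;
        }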
@@ -908,11 +1001,9 @@
                 */
                 if (tick_do_timer_cpu == cpu)
                         return false;
-                /*
-                 * Boot safety: make sure the timekeeping duty has been
-                 * assigned before entering dyntick-idle mode,
-                 */
-                if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+
+                /* Should not happen for nohz-full */
+                if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
                         return false;
         }
 
@@ -1030,6 +1121,18 @@
 }
 
 /**
+ * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
+ * or the tick, whatever that expires first. Note that, if the tick has been
+ * stopped, it returns the next hrtimer.
+ *
+ * Called from power state control code with interrupts disabled
+ */
+ktime_t tick_nohz_get_next_hrtimer(void)
+{
+        return __this_cpu_read(tick_cpu_device.evtdev)->next_event;
+}
+
+/**
  * tick_nohz_get_sleep_length - return the expected length of the current sleep
  * @delta_next: duration until the next event if the tick cannot be stopped
  *
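Note: a hypothetical caller sketch for the new tick_nohz_get_next_hrtimer() helper, in the style of the "power state control code" its kerneldoc mentions. This is not actual cpuidle governor code; the function name and the residency comparison are assumptions.

        /* Kernel-style sketch only: would an idle state's target residency fit
         * before the next programmed event on this CPU? */
        static bool worth_entering_deep_idle(s64 target_residency_ns, ktime_t now)
        {
                ktime_t next = tick_nohz_get_next_hrtimer();

                /* next is the earlier of the next hrtimer and the tick */
                return ktime_to_ns(ktime_sub(next, now)) > target_residency_ns;
        }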
@@ -1081,6 +1184,7 @@
 
         return ts->idle_calls;
 }
+EXPORT_SYMBOL_GPL(tick_nohz_get_idle_calls_cpu);
 
 /**
  * tick_nohz_get_idle_calls - return the current idle calls counter value
@@ -1099,7 +1203,7 @@
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
         unsigned long ticks;
 
-        if (vtime_accounting_cpu_enabled())
+        if (vtime_accounting_enabled_this_cpu())
                 return;
         /*
          * We stopped the tick in idle. Update process times would miss the
@@ -1177,11 +1281,17 @@
         tick_sched_do_timer(ts, now);
         tick_sched_handle(ts, regs);
 
-        /* No need to reprogram if we are running tickless */
-        if (unlikely(ts->tick_stopped))
+        if (unlikely(ts->tick_stopped)) {
+                /*
+                 * The clockevent device is not reprogrammed, so change the
+                 * clock event device to ONESHOT_STOPPED to avoid spurious
+                 * interrupts on devices which might not be truly one shot.
+                 */
+                tick_program_event(KTIME_MAX, 1);
                 return;
+        }
 
-        hrtimer_forward(&ts->sched_timer, now, tick_period);
+        hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
         tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
 }
 
@@ -1213,12 +1323,12 @@
          * Recycle the hrtimer in ts, so we can share the
          * hrtimer_forward with the highres code.
         */
-        hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+        hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
         /* Get the next period */
         next = tick_init_jiffy_update();
 
         hrtimer_set_expires(&ts->sched_timer, next);
-        hrtimer_forward_now(&ts->sched_timer, tick_period);
+        hrtimer_forward_now(&ts->sched_timer, TICK_NSEC);
         tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
         tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
 }
@@ -1258,18 +1368,6 @@
  * High resolution timer specific code
  */
 #ifdef CONFIG_HIGH_RES_TIMERS
-
-static void (*wake_callback)(void);
-
-void register_tick_sched_wakeup_callback(void (*cb)(void))
-{
-        if (!wake_callback)
-                wake_callback = cb;
-        else
-                pr_warn("tick-sched wake cb already exists; skipping.\n");
-}
-EXPORT_SYMBOL_GPL(register_tick_sched_wakeup_callback);
-
 /*
  * We rearm the timer until we get disabled by the idle code.
  * Called with interrupts disabled.
@@ -1287,15 +1385,8 @@
          * Do not call, when we are not in irq context and have
          * no valid regs pointer
         */
-        if (regs) {
+        if (regs)
                 tick_sched_handle(ts, regs);
-                if (wake_callback && tick_do_timer_cpu == smp_processor_id()) {
-                        /*
-                         * wakeup user if needed
-                         */
-                        wake_callback();
-                }
-        }
         else
                 ts->next_tick = 0;
 
@@ -1303,7 +1394,7 @@
         if (unlikely(ts->tick_stopped))
                 return HRTIMER_NORESTART;
 
-        hrtimer_forward(timer, now, tick_period);
+        hrtimer_forward(timer, now, TICK_NSEC);
 
         return HRTIMER_RESTART;
 }
@@ -1337,14 +1428,14 @@
 
         /* Offset the tick to avert jiffies_lock contention. */
         if (sched_skew_tick) {
-                u64 offset = ktime_to_ns(tick_period) >> 1;
+                u64 offset = TICK_NSEC >> 1;
                 do_div(offset, num_possible_cpus());
                 offset *= smp_processor_id();
                 hrtimer_add_expires_ns(&ts->sched_timer, offset);
         }
 
-        hrtimer_forward(&ts->sched_timer, now, tick_period);
-        hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+        hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
+        hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
         tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
 }
 #endif /* HIGH_RES_TIMERS */
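Note: a worked example of the sched_skew_tick offset computed above, assuming HZ=250 (TICK_NSEC = 4 ms) and 8 possible CPUs; each CPU's tick is skewed by cpu * 250 us so the per-CPU tick handlers do not all contend on jiffies_lock at the same instant.

        #include <stdint.h>
        #include <stdio.h>

        #define TICK_NSEC 4000000ULL    /* assumed 4 ms tick (HZ=250) */

        int main(void)
        {
                unsigned int num_possible_cpus = 8;     /* assumed CPU count */

                for (unsigned int cpu = 0; cpu < num_possible_cpus; cpu++) {
                        uint64_t offset = TICK_NSEC >> 1;       /* half a tick: 2 ms */

                        offset /= num_possible_cpus;            /* do_div() in the kernel */
                        offset *= cpu;                          /* this CPU's skew */
                        printf("cpu%u: +%llu ns\n", cpu, (unsigned long long)offset);
                }
                return 0;
        }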
@@ -1411,9 +1502,3 @@
         tick_nohz_switch_to_nohz();
         return 0;
 }
-
-ktime_t *get_next_event_cpu(unsigned int cpu)
-{
-        return &(per_cpu(tick_cpu_device, cpu).evtdev->next_event);
-}
-EXPORT_SYMBOL_GPL(get_next_event_cpu);