2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/arch/x86/kernel/tsc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/kernel.h>
@@ -40,6 +41,7 @@
  * TSC can be unstable due to cpufreq or due to unsynced TSCs
  */
 static int __read_mostly tsc_unstable;
+static unsigned int __initdata tsc_early_khz;
 
 static DEFINE_STATIC_KEY_FALSE(__use_tsc);
 
@@ -52,30 +54,36 @@
 
 struct cyc2ns {
 	struct cyc2ns_data data[2];	/* 0 + 2*16 = 32 */
-	seqcount_t	   seq;		/* 32 + 4 = 36 */
+	seqcount_latch_t   seq;		/* 32 + 4 = 36 */
 
 }; /* fits one cacheline */
 
 static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
 
-void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data)
+static int __init tsc_early_khz_setup(char *buf)
+{
+	return kstrtouint(buf, 0, &tsc_early_khz);
+}
+early_param("tsc_early_khz", tsc_early_khz_setup);
+
+__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
 {
 	int seq, idx;
 
 	preempt_disable_notrace();
 
 	do {
-		seq = this_cpu_read(cyc2ns.seq.sequence);
+		seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
 		idx = seq & 1;
 
 		data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
 		data->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
 		data->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
 
-	} while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
+	} while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
 }
 
-void __always_inline cyc2ns_read_end(void)
+__always_inline void cyc2ns_read_end(void)
 {
 	preempt_enable_notrace();
 }
@@ -178,15 +186,14 @@
 {
 	struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
 
-	seqcount_init(&c2n->seq);
+	seqcount_latch_init(&c2n->seq);
 	__set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc());
 }
 
 /*
  * Secondary CPUs do not run through tsc_init(), so set up
  * all the scale factors for all CPUs, assuming the same
- * speed as the bootup CPU. (cpufreq notifiers will fix this
- * up if their speed diverges)
+ * speed as the bootup CPU.
  */
 static void __init cyc2ns_init_secondary_cpus(void)
 {
@@ -196,7 +203,7 @@
 
 	for_each_possible_cpu(cpu) {
 		if (cpu != this_cpu) {
-			seqcount_init(&c2n->seq);
+			seqcount_latch_init(&c2n->seq);
 			c2n = per_cpu_ptr(&cyc2ns, cpu);
 			c2n->data[0] = data[0];
 			c2n->data[1] = data[1];
@@ -247,7 +254,7 @@
 
 bool using_native_sched_clock(void)
 {
-	return pv_time_ops.sched_clock == native_sched_clock;
+	return pv_ops.time.sched_clock == native_sched_clock;
 }
 #else
 unsigned long long
@@ -283,6 +290,7 @@
 __setup("notsc", notsc_setup);
 
 static int no_sched_irq_time;
+static int no_tsc_watchdog;
 
 static int __init tsc_setup(char *str)
 {
@@ -292,20 +300,23 @@
 		no_sched_irq_time = 1;
 	if (!strcmp(str, "unstable"))
 		mark_tsc_unstable("boot parameter");
+	if (!strcmp(str, "nowatchdog"))
+		no_tsc_watchdog = 1;
 	return 1;
 }
 
 __setup("tsc=", tsc_setup);
 
-#define MAX_RETRIES     5
-#define SMI_TRESHOLD    50000
+#define MAX_RETRIES		5
+#define TSC_DEFAULT_THRESHOLD	0x20000
 
 /*
- * Read TSC and the reference counters. Take care of SMI disturbance
+ * Read TSC and the reference counters. Take care of any disturbances
  */
 static u64 tsc_read_refs(u64 *p, int hpet)
 {
 	u64 t1, t2;
+	u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD;
 	int i;
 
 	for (i = 0; i < MAX_RETRIES; i++) {
@@ -315,7 +326,7 @@
 		else
 			*p = acpi_pm_read_early();
 		t2 = get_cycles();
-		if ((t2 - t1) < SMI_TRESHOLD)
+		if ((t2 - t1) < thresh)
 			return t2;
 	}
 	return ULLONG_MAX;
@@ -473,7 +484,7 @@
  * transition from one expected value to another with a fairly
  * high accuracy, and we didn't miss any events. We can thus
  * use the TSC value at the transitions to calculate a pretty
- * good value for the TSC frequencty.
+ * good value for the TSC frequency.
  */
 static inline int pit_verify_msb(unsigned char val)
 {
@@ -628,31 +639,38 @@
 
 	crystal_khz = ecx_hz / 1000;
 
-	if (crystal_khz == 0) {
-		switch (boot_cpu_data.x86_model) {
-		case INTEL_FAM6_SKYLAKE_MOBILE:
-		case INTEL_FAM6_SKYLAKE_DESKTOP:
-		case INTEL_FAM6_KABYLAKE_MOBILE:
-		case INTEL_FAM6_KABYLAKE_DESKTOP:
-			crystal_khz = 24000;	/* 24.0 MHz */
-			break;
-		case INTEL_FAM6_ATOM_GOLDMONT_X:
-			crystal_khz = 25000;	/* 25.0 MHz */
-			break;
-		case INTEL_FAM6_ATOM_GOLDMONT:
-			crystal_khz = 19200;	/* 19.2 MHz */
-			break;
-		}
+	/*
+	 * Denverton SoCs don't report crystal clock, and also don't support
+	 * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal
+	 * clock.
+	 */
+	if (crystal_khz == 0 &&
+			boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D)
+		crystal_khz = 25000;
+
+	/*
+	 * TSC frequency reported directly by CPUID is a "hardware reported"
+	 * frequency and is the most accurate one so far we have. This
+	 * is considered a known frequency.
+	 */
+	if (crystal_khz != 0)
+		setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+	/*
+	 * Some Intel SoCs like Skylake and Kabylake don't report the crystal
+	 * clock, but we can easily calculate it to a high degree of accuracy
+	 * by considering the crystal ratio and the CPU speed.
+	 */
+	if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) {
+		unsigned int eax_base_mhz, ebx, ecx, edx;
+
+		cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx);
+		crystal_khz = eax_base_mhz * 1000 *
+			eax_denominator / ebx_numerator;
 	}
 
 	if (crystal_khz == 0)
 		return 0;
-	/*
-	 * TSC frequency determined by CPUID is a "hardware reported"
-	 * frequency and is the most accurate one so far we have. This
-	 * is considered a known frequency.
-	 */
-	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
 
 	/*
 	 * For Atom SoCs TSC is the only reliable clocksource.
@@ -660,6 +678,16 @@
 	 */
 	if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
 		setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	/*
+	 * The local APIC appears to be fed by the core crystal clock
+	 * (which sounds entirely sensible). We can set the global
+	 * lapic_timer_period here to avoid having to calibrate the APIC
+	 * timer later.
+	 */
+	lapic_timer_period = crystal_khz * 1000 / HZ;
+#endif
 
 	return crystal_khz * ebx_numerator / eax_denominator;
 }
@@ -703,15 +731,15 @@
 	 * zero. In each wait loop iteration we read the TSC and check
 	 * the delta to the previous read. We keep track of the min
 	 * and max values of that delta. The delta is mostly defined
-	 * by the IO time of the PIT access, so we can detect when a
-	 * SMI/SMM disturbance happened between the two reads. If the
+	 * by the IO time of the PIT access, so we can detect when
+	 * any disturbance happened between the two reads. If the
 	 * maximum time is significantly larger than the minimum time,
 	 * then we discard the result and have another try.
 	 *
 	 * 2) Reference counter. If available we use the HPET or the
 	 * PMTIMER as a reference to check the sanity of that value.
 	 * We use separate TSC readouts and check inside of the
-	 * reference read for a SMI/SMM disturbance. We dicard
+	 * reference read for any possible disturbance. We dicard
 	 * disturbed values here as well. We do that around the PIT
 	 * calibration delay loop as we have to wait for a certain
 	 * amount of time anyway.
@@ -744,7 +772,7 @@
 		if (ref1 == ref2)
 			continue;
 
-		/* Check, whether the sampling was disturbed by an SMI */
+		/* Check, whether the sampling was disturbed */
 		if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
 			continue;
 
@@ -936,12 +964,12 @@
 }
 
 #ifdef CONFIG_CPU_FREQ
-/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
+/*
+ * Frequency scaling support. Adjust the TSC based timer when the CPU frequency
  * changes.
  *
- * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
- * not that important because current Opteron setups do not support
- * scaling on SMP anyroads.
+ * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC
+ * as unstable and give up in those cases.
  *
  * Should fix up last_tsc too. Currently gettimeofday in the
  * first tick after the change will be slightly wrong.
@@ -955,28 +983,28 @@
 				void *data)
 {
 	struct cpufreq_freqs *freq = data;
-	unsigned long *lpj;
 
-	lpj = &boot_cpu_data.loops_per_jiffy;
-#ifdef CONFIG_SMP
-	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
-#endif
+	if (num_online_cpus() > 1) {
+		mark_tsc_unstable("cpufreq changes on SMP");
+		return 0;
+	}
 
 	if (!ref_freq) {
 		ref_freq = freq->old;
-		loops_per_jiffy_ref = *lpj;
+		loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
 		tsc_khz_ref = tsc_khz;
 	}
+
 	if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
-			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
-		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
+		boot_cpu_data.loops_per_jiffy =
+			cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
 
 		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
 		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
 			mark_tsc_unstable("cpufreq changes");
 
-		set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
+		set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc());
 	}
 
 	return 0;
@@ -1087,17 +1115,24 @@
 	sched_clock_tick_stable();
 }
 
+static int tsc_cs_enable(struct clocksource *cs)
+{
+	vclocks_set_used(VDSO_CLOCKMODE_TSC);
+	return 0;
+}
+
 /*
  * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
  */
 static struct clocksource clocksource_tsc_early = {
-	.name                   = "tsc-early",
-	.rating                 = 299,
-	.read                   = read_tsc,
-	.mask                   = CLOCKSOURCE_MASK(64),
-	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
+	.name			= "tsc-early",
+	.rating			= 299,
+	.read			= read_tsc,
+	.mask			= CLOCKSOURCE_MASK(64),
+	.flags			= CLOCK_SOURCE_IS_CONTINUOUS |
 				  CLOCK_SOURCE_MUST_VERIFY,
-	.archdata               = { .vclock_mode = VCLOCK_TSC },
+	.vdso_clock_mode	= VDSO_CLOCKMODE_TSC,
+	.enable			= tsc_cs_enable,
 	.resume			= tsc_resume,
 	.mark_unstable		= tsc_cs_mark_unstable,
 	.tick_stable		= tsc_cs_tick_stable,
@@ -1110,14 +1145,16 @@
  * been found good.
  */
 static struct clocksource clocksource_tsc = {
-	.name                   = "tsc",
-	.rating                 = 300,
-	.read                   = read_tsc,
-	.mask                   = CLOCKSOURCE_MASK(64),
-	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
+	.name			= "tsc",
+	.rating			= 300,
+	.read			= read_tsc,
+	.mask			= CLOCKSOURCE_MASK(64),
+	.flags			= CLOCK_SOURCE_IS_CONTINUOUS |
 				  CLOCK_SOURCE_VALID_FOR_HRES |
-				  CLOCK_SOURCE_MUST_VERIFY,
-	.archdata               = { .vclock_mode = VCLOCK_TSC },
+				  CLOCK_SOURCE_MUST_VERIFY |
+				  CLOCK_SOURCE_VERIFY_PERCPU,
+	.vdso_clock_mode	= VDSO_CLOCKMODE_TSC,
+	.enable			= tsc_cs_enable,
 	.resume			= tsc_resume,
 	.mark_unstable		= tsc_cs_mark_unstable,
 	.tick_stable		= tsc_cs_tick_stable,
@@ -1141,6 +1178,12 @@
 
 EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 
+static void __init tsc_disable_clocksource_watchdog(void)
+{
+	clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+	clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+
 static void __init check_system_tsc_reliable(void)
 {
 #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1157,6 +1200,23 @@
 #endif
 	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
 		tsc_clocksource_reliable = 1;
+
+	/*
+	 * Disable the clocksource watchdog when the system has:
+	 *  - TSC running at constant frequency
+	 *  - TSC which does not stop in C-States
+	 *  - the TSC_ADJUST register which allows to detect even minimal
+	 *    modifications
+	 *  - not more than two sockets. As the number of sockets cannot be
+	 *    evaluated at the early boot stage where this has to be
+	 *    invoked, check the number of online memory nodes as a
+	 *    fallback solution which is an reasonable estimate.
+	 */
+	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+	    boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+	    boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+	    nr_online_nodes <= 2)
+		tsc_disable_clocksource_watchdog();
 }
 
 /*
@@ -1268,7 +1328,7 @@
  */
 static void tsc_refine_calibration_work(struct work_struct *work)
 {
-	static u64 tsc_start = -1, ref_start;
+	static u64 tsc_start = ULLONG_MAX, ref_start;
 	static int hpet;
 	u64 tsc_stop, ref_stop, delta;
 	unsigned long freq;
@@ -1283,14 +1343,15 @@
 	 * delayed the first time we expire. So set the workqueue
 	 * again once we know timers are working.
 	 */
-	if (tsc_start == -1) {
+	if (tsc_start == ULLONG_MAX) {
+restart:
 		/*
 		 * Only set hpet once, to avoid mixing hardware
 		 * if the hpet becomes enabled later.
 		 */
 		hpet = is_hpet_enabled();
-		schedule_delayed_work(&tsc_irqwork, HZ);
 		tsc_start = tsc_read_refs(&ref_start, hpet);
+		schedule_delayed_work(&tsc_irqwork, HZ);
 		return;
 	}
 
@@ -1300,9 +1361,9 @@
 	if (ref_start == ref_stop)
 		goto out;
 
-	/* Check, whether the sampling was disturbed by an SMI */
-	if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
-		goto out;
+	/* Check, whether the sampling was disturbed */
+	if (tsc_stop == ULLONG_MAX)
+		goto restart;
 
 	delta = tsc_stop - tsc_start;
 	delta *= 1000000LL;
@@ -1347,9 +1408,6 @@
 	if (tsc_unstable)
 		goto unreg;
 
-	if (tsc_clocksource_reliable)
-		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-
 	if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
 		clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
 
@@ -1382,7 +1440,10 @@
 
 	if (early) {
 		cpu_khz = x86_platform.calibrate_cpu();
-		tsc_khz = x86_platform.calibrate_tsc();
+		if (tsc_early_khz)
+			tsc_khz = tsc_early_khz;
+		else
+			tsc_khz = x86_platform.calibrate_tsc();
 	} else {
 		/* We should not be here with non-native cpu calibration */
 		WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
@@ -1483,6 +1544,9 @@
 		return;
 	}
 
+	if (tsc_clocksource_reliable || no_tsc_watchdog)
+		tsc_disable_clocksource_watchdog();
+
 	clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
 	detect_art();
 }