hc
2024-05-10 23fa18eaa71266feff7ba8d83022d9e1cc83c65a
kernel/drivers/cpuidle/cpuidle-pseries.c
....@@ -19,9 +19,11 @@
1919 #include <asm/machdep.h>
2020 #include <asm/firmware.h>
2121 #include <asm/runlatch.h>
22
+#include <asm/idle.h>
2223 #include <asm/plpar_wrappers.h>
24
+#include <asm/rtas.h>
2325
24
-struct cpuidle_driver pseries_idle_driver = {
26
+static struct cpuidle_driver pseries_idle_driver = {
2527 .name = "pseries_idle",
2628 .owner = THIS_MODULE,
2729 };
....@@ -31,39 +33,15 @@
3133 static u64 snooze_timeout __read_mostly;
3234 static bool snooze_timeout_en __read_mostly;
3335
34
-static inline void idle_loop_prolog(unsigned long *in_purr)
35
-{
36
- ppc64_runlatch_off();
37
- *in_purr = mfspr(SPRN_PURR);
38
- /*
39
- * Indicate to the HV that we are idle. Now would be
40
- * a good time to find other work to dispatch.
41
- */
42
- get_lppaca()->idle = 1;
43
-}
44
-
45
-static inline void idle_loop_epilog(unsigned long in_purr)
46
-{
47
- u64 wait_cycles;
48
-
49
- wait_cycles = be64_to_cpu(get_lppaca()->wait_state_cycles);
50
- wait_cycles += mfspr(SPRN_PURR) - in_purr;
51
- get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles);
52
- get_lppaca()->idle = 0;
53
-
54
- ppc64_runlatch_on();
55
-}
56
-
5736 static int snooze_loop(struct cpuidle_device *dev,
5837 struct cpuidle_driver *drv,
5938 int index)
6039 {
61
- unsigned long in_purr;
6240 u64 snooze_exit_time;
6341
6442 set_thread_flag(TIF_POLLING_NRFLAG);
6543
66
- idle_loop_prolog(&in_purr);
44
+ pseries_idle_prolog();
6745 local_irq_enable();
6846 snooze_exit_time = get_tb() + snooze_timeout;
6947
....@@ -87,7 +65,7 @@
8765
8866 local_irq_disable();
8967
90
- idle_loop_epilog(in_purr);
68
+ pseries_idle_epilog();
9169
9270 return index;
9371 }
....@@ -109,22 +87,152 @@
10987 }
11088 }
11189
90
+/*
91
+ * XCEDE: Extended CEDE states discovered through the
92
+ * "ibm,get-system-parameter" RTAS call with the token
93
+ * CEDE_LATENCY_TOKEN
94
+ */
95
+
96
+/*
97
+ * Section 7.3.16 System Parameters Option of PAPR version 2.8.1 has a
98
+ * table with all the parameters to ibm,get-system-parameter.
99
+ * CEDE_LATENCY_TOKEN corresponds to the token value for Cede Latency
100
+ * Settings Information.
101
+ */
102
+#define CEDE_LATENCY_TOKEN 45
103
+
104
+/*
105
+ * If the platform supports the cede latency settings information system
106
+ * parameter it must provide the following information in the NULL terminated
107
+ * parameter string:
108
+ *
109
+ * a. The first byte is the length “N” of each cede latency setting record minus
110
+ * one (zero indicates a length of 1 byte).
111
+ *
112
+ * b. For each supported cede latency setting a cede latency setting record
113
+ * consisting of the first “N” bytes as per the following table.
114
+ *
115
+ * -----------------------------
116
+ * | Field | Field |
117
+ * | Name | Length |
118
+ * -----------------------------
119
+ * | Cede Latency | 1 Byte |
120
+ * | Specifier Value | |
121
+ * -----------------------------
122
+ * | Maximum wakeup | |
123
+ * | latency in | 8 Bytes |
124
+ * | tb-ticks | |
125
+ * -----------------------------
126
+ * | Responsive to | |
127
+ * | external | 1 Byte |
128
+ * | interrupts | |
129
+ * -----------------------------
130
+ *
131
+ * This version has cede latency record size = 10.
132
+ *
133
+ * The structure xcede_latency_payload represents a) and b) with
134
+ * xcede_latency_record representing the table in b).
135
+ *
136
+ * xcede_latency_parameter is what gets returned by
137
+ * ibm,get-system-parameter RTAS call when made with
138
+ * CEDE_LATENCY_TOKEN.
139
+ *
140
+ * These structures are only used to represent the data obtained by the RTAS
141
+ * call. The data is in big-endian.
142
+ */
143
+struct xcede_latency_record {
144
+ u8 hint;
145
+ __be64 latency_ticks;
146
+ u8 wake_on_irqs;
147
+} __packed;
148
+
149
+// Make space for 16 records, which "should be enough".
150
+struct xcede_latency_payload {
151
+ u8 record_size;
152
+ struct xcede_latency_record records[16];
153
+} __packed;
154
+
155
+struct xcede_latency_parameter {
156
+ __be16 payload_size;
157
+ struct xcede_latency_payload payload;
158
+ u8 null_char;
159
+} __packed;
160
+
161
+static unsigned int nr_xcede_records;
162
+static struct xcede_latency_parameter xcede_latency_parameter __initdata;
163
+
164
+static int __init parse_cede_parameters(void)
165
+{
166
+ struct xcede_latency_payload *payload;
167
+ u32 total_xcede_records_size;
168
+ u8 xcede_record_size;
169
+ u16 payload_size;
170
+ int ret, i;
171
+
172
+ ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
173
+ NULL, CEDE_LATENCY_TOKEN, __pa(&xcede_latency_parameter),
174
+ sizeof(xcede_latency_parameter));
175
+ if (ret) {
176
+ pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n");
177
+ return ret;
178
+ }
179
+
180
+ payload_size = be16_to_cpu(xcede_latency_parameter.payload_size);
181
+ payload = &xcede_latency_parameter.payload;
182
+
183
+ xcede_record_size = payload->record_size + 1;
184
+
185
+ if (xcede_record_size != sizeof(struct xcede_latency_record)) {
186
+ pr_err("xcede: Expected record-size %lu. Observed size %u.\n",
187
+ sizeof(struct xcede_latency_record), xcede_record_size);
188
+ return -EINVAL;
189
+ }
190
+
191
+ pr_info("xcede: xcede_record_size = %d\n", xcede_record_size);
192
+
193
+ /*
194
+ * Since the payload_size includes the last NULL byte and the
195
+ * xcede_record_size, the remaining bytes correspond to array of all
196
+ * cede_latency settings.
197
+ */
198
+ total_xcede_records_size = payload_size - 2;
199
+ nr_xcede_records = total_xcede_records_size / xcede_record_size;
200
+
201
+ for (i = 0; i < nr_xcede_records; i++) {
202
+ struct xcede_latency_record *record = &payload->records[i];
203
+ u64 latency_ticks = be64_to_cpu(record->latency_ticks);
204
+ u8 wake_on_irqs = record->wake_on_irqs;
205
+ u8 hint = record->hint;
206
+
207
+ pr_info("xcede: Record %d : hint = %u, latency = 0x%llx tb ticks, Wake-on-irq = %u\n",
208
+ i, hint, latency_ticks, wake_on_irqs);
209
+ }
210
+
211
+ return 0;
212
+}
213
+
214
+#define NR_DEDICATED_STATES 2 /* snooze, CEDE */
215
+static u8 cede_latency_hint[NR_DEDICATED_STATES];
216
+
112217 static int dedicated_cede_loop(struct cpuidle_device *dev,
113218 struct cpuidle_driver *drv,
114219 int index)
115220 {
116
- unsigned long in_purr;
221
+ u8 old_latency_hint;
117222
118
- idle_loop_prolog(&in_purr);
223
+ pseries_idle_prolog();
119224 get_lppaca()->donate_dedicated_cpu = 1;
225
+ old_latency_hint = get_lppaca()->cede_latency_hint;
226
+ get_lppaca()->cede_latency_hint = cede_latency_hint[index];
120227
121228 HMT_medium();
122229 check_and_cede_processor();
123230
124231 local_irq_disable();
125232 get_lppaca()->donate_dedicated_cpu = 0;
233
+ get_lppaca()->cede_latency_hint = old_latency_hint;
126234
127
- idle_loop_epilog(in_purr);
235
+ pseries_idle_epilog();
128236
129237 return index;
130238 }
....@@ -133,9 +241,8 @@
133241 struct cpuidle_driver *drv,
134242 int index)
135243 {
136
- unsigned long in_purr;
137244
138
- idle_loop_prolog(&in_purr);
245
+ pseries_idle_prolog();
139246
140247 /*
141248 * Yield the processor to the hypervisor. We return if
....@@ -147,7 +254,7 @@
147254 check_and_cede_processor();
148255
149256 local_irq_disable();
150
- idle_loop_epilog(in_purr);
257
+ pseries_idle_epilog();
151258
152259 return index;
153260 }
....@@ -155,7 +262,7 @@
155262 /*
156263 * States for dedicated partition case.
157264 */
158
-static struct cpuidle_state dedicated_states[] = {
265
+static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = {
159266 { /* Snooze */
160267 .name = "snooze",
161268 .desc = "snooze",
....@@ -236,29 +343,93 @@
236343 return 0;
237344 }
238345
346
+static void __init fixup_cede0_latency(void)
347
+{
348
+ struct xcede_latency_payload *payload;
349
+ u64 min_latency_us;
350
+ int i;
351
+
352
+ min_latency_us = dedicated_states[1].exit_latency; // CEDE latency
353
+
354
+ if (parse_cede_parameters())
355
+ return;
356
+
357
+ pr_info("cpuidle: Skipping the %d Extended CEDE idle states\n",
358
+ nr_xcede_records);
359
+
360
+ payload = &xcede_latency_parameter.payload;
361
+ for (i = 0; i < nr_xcede_records; i++) {
362
+ struct xcede_latency_record *record = &payload->records[i];
363
+ u64 latency_tb = be64_to_cpu(record->latency_ticks);
364
+ u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);
365
+
366
+ if (latency_us == 0)
367
+ pr_warn("cpuidle: xcede record %d has an unrealistic latency of 0us.\n", i);
368
+
369
+ if (latency_us < min_latency_us)
370
+ min_latency_us = latency_us;
371
+ }
372
+
373
+ /*
374
+ * By default, we assume that CEDE(0) has exit latency 10us,
375
+ * since there is no way for us to query from the platform.
376
+ *
377
+ * However, if the wakeup latency of an Extended CEDE state is
378
+ * smaller than 10us, then we can be sure that CEDE(0)
379
+ * requires no more than that.
380
+ *
381
+ * Perform the fix-up.
382
+ */
383
+ if (min_latency_us < dedicated_states[1].exit_latency) {
384
+ /*
385
+ * We set a minimum of 1us wakeup latency for cede0 to
386
+ * distinguish it from snooze
387
+ */
388
+ u64 cede0_latency = 1;
389
+
390
+ if (min_latency_us > cede0_latency)
391
+ cede0_latency = min_latency_us - 1;
392
+
393
+ dedicated_states[1].exit_latency = cede0_latency;
394
+ dedicated_states[1].target_residency = 10 * (cede0_latency);
395
+ pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n",
396
+ cede0_latency);
397
+ }
398
+
399
+}
400
+
239401 /*
240402 * pseries_idle_probe()
241403 * Choose state table for shared versus dedicated partition
242404 */
243
-static int pseries_idle_probe(void)
405
+static int __init pseries_idle_probe(void)
244406 {
245407
246408 if (cpuidle_disable != IDLE_NO_OVERRIDE)
247409 return -ENODEV;
248410
249411 if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
250
- /*
251
- * Use local_paca instead of get_lppaca() since
252
- * preemption is not disabled, and it is not required in
253
- * fact, since lppaca_ptr does not need to be the value
254
- * associated to the current CPU, it can be from any CPU.
255
- */
256
- if (lppaca_shared_proc(local_paca->lppaca_ptr)) {
412
+ if (lppaca_shared_proc()) {
257413 cpuidle_state_table = shared_states;
258414 max_idle_state = ARRAY_SIZE(shared_states);
259415 } else {
416
+ /*
417
+ * Use firmware provided latency values
418
+ * starting with POWER10 platforms. In the
419
+ * case that we are running on a POWER10
420
+ * platform but in an earlier compat mode, we
421
+ * can still use the firmware provided values.
422
+ *
423
+ * However, on platforms prior to POWER10, we
424
+ * cannot rely on the accuracy of the firmware
425
+ * provided latency values. On such platforms,
426
+ * go with the conservative default estimate
427
+ * of 10us.
428
+ */
429
+ if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10))
430
+ fixup_cede0_latency();
260431 cpuidle_state_table = dedicated_states;
261
- max_idle_state = ARRAY_SIZE(dedicated_states);
432
+ max_idle_state = NR_DEDICATED_STATES;
262433 }
263434 } else
264435 return -ENODEV;