2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/arch/x86/kernel/traps.c
@@ -37,34 +37,32 @@
3737 #include <linux/mm.h>
3838 #include <linux/smp.h>
3939 #include <linux/io.h>
40
-
41
-#if defined(CONFIG_EDAC)
42
-#include <linux/edac.h>
43
-#endif
40
+#include <linux/hardirq.h>
41
+#include <linux/atomic.h>
4442
4543 #include <asm/stacktrace.h>
4644 #include <asm/processor.h>
4745 #include <asm/debugreg.h>
48
-#include <linux/atomic.h>
46
+#include <asm/realmode.h>
4947 #include <asm/text-patching.h>
5048 #include <asm/ftrace.h>
5149 #include <asm/traps.h>
5250 #include <asm/desc.h>
5351 #include <asm/fpu/internal.h>
52
+#include <asm/cpu.h>
5453 #include <asm/cpu_entry_area.h>
5554 #include <asm/mce.h>
5655 #include <asm/fixmap.h>
5756 #include <asm/mach_traps.h>
5857 #include <asm/alternative.h>
5958 #include <asm/fpu/xstate.h>
60
-#include <asm/trace/mpx.h>
61
-#include <asm/mpx.h>
6259 #include <asm/vm86.h>
6360 #include <asm/umip.h>
61
+#include <asm/insn.h>
62
+#include <asm/insn-eval.h>
6463
6564 #ifdef CONFIG_X86_64
6665 #include <asm/x86_init.h>
67
-#include <asm/pgalloc.h>
6866 #include <asm/proto.h>
6967 #else
7068 #include <asm/processor-flags.h>
@@ -86,110 +84,20 @@
8684 local_irq_disable();
8785 }
8886
89
-/*
90
- * In IST context, we explicitly disable preemption. This serves two
91
- * purposes: it makes it much less likely that we would accidentally
92
- * schedule in IST context and it will force a warning if we somehow
93
- * manage to schedule by accident.
94
- */
95
-void ist_enter(struct pt_regs *regs)
87
+__always_inline int is_valid_bugaddr(unsigned long addr)
9688 {
97
- if (user_mode(regs)) {
98
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
99
- } else {
100
- /*
101
- * We might have interrupted pretty much anything. In
102
- * fact, if we're a machine check, we can even interrupt
103
- * NMI processing. We don't want in_nmi() to return true,
104
- * but we need to notify RCU.
105
- */
106
- rcu_nmi_enter();
107
- }
108
-
109
- preempt_disable();
110
-
111
- /* This code is a bit fragile. Test it. */
112
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
113
-}
114
-
115
-void ist_exit(struct pt_regs *regs)
116
-{
117
- preempt_enable_no_resched();
118
-
119
- if (!user_mode(regs))
120
- rcu_nmi_exit();
121
-}
122
-
123
-/**
124
- * ist_begin_non_atomic() - begin a non-atomic section in an IST exception
125
- * @regs: regs passed to the IST exception handler
126
- *
127
- * IST exception handlers normally cannot schedule. As a special
128
- * exception, if the exception interrupted userspace code (i.e.
129
- * user_mode(regs) would return true) and the exception was not
130
- * a double fault, it can be safe to schedule. ist_begin_non_atomic()
131
- * begins a non-atomic section within an ist_enter()/ist_exit() region.
132
- * Callers are responsible for enabling interrupts themselves inside
133
- * the non-atomic section, and callers must call ist_end_non_atomic()
134
- * before ist_exit().
135
- */
136
-void ist_begin_non_atomic(struct pt_regs *regs)
137
-{
138
- BUG_ON(!user_mode(regs));
139
-
140
- /*
141
- * Sanity check: we need to be on the normal thread stack. This
142
- * will catch asm bugs and any attempt to use ist_preempt_enable
143
- * from double_fault.
144
- */
145
- BUG_ON(!on_thread_stack());
146
-
147
- preempt_enable_no_resched();
148
-}
149
-
150
-/**
151
- * ist_end_non_atomic() - begin a non-atomic section in an IST exception
152
- *
153
- * Ends a non-atomic section started with ist_begin_non_atomic().
154
- */
155
-void ist_end_non_atomic(void)
156
-{
157
- preempt_disable();
158
-}
159
-
160
-int is_valid_bugaddr(unsigned long addr)
161
-{
162
- unsigned short ud;
163
-
16489 if (addr < TASK_SIZE_MAX)
16590 return 0;
16691
167
- if (probe_kernel_address((unsigned short *)addr, ud))
168
- return 0;
169
-
170
- return ud == INSN_UD0 || ud == INSN_UD2;
171
-}
172
-
173
-int fixup_bug(struct pt_regs *regs, int trapnr)
174
-{
175
- if (trapnr != X86_TRAP_UD)
176
- return 0;
177
-
178
- switch (report_bug(regs->ip, regs)) {
179
- case BUG_TRAP_TYPE_NONE:
180
- case BUG_TRAP_TYPE_BUG:
181
- break;
182
-
183
- case BUG_TRAP_TYPE_WARN:
184
- regs->ip += LEN_UD2;
185
- return 1;
186
- }
187
-
188
- return 0;
92
+ /*
93
+ * We got #UD, if the text isn't readable we'd have gotten
94
+ * a different exception.
95
+ */
96
+ return *(unsigned short *)addr == INSN_UD2;
18997 }
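Note on the simplified is_valid_bugaddr() above: since the CPU already raised #UD at this address, the text must have been mapped and readable, so the old probe_kernel_address() step is dropped and the bytes are compared directly against INSN_UD2. A minimal stand-alone sketch of that compare (user-space, little-endian x86 assumed; INSN_UD2 is 0x0b0f in the kernel headers as far as I recall):

/* Stand-alone sketch: the UD2 opcode bytes (0x0F 0x0B) read as a
 * little-endian 16-bit value match INSN_UD2 (0x0b0f). */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
        const unsigned char text[] = { 0x0f, 0x0b };    /* a UD2 instruction */
        uint16_t opcode;

        memcpy(&opcode, text, sizeof(opcode));          /* what *(unsigned short *)addr does */
        printf("opcode=%#06x -> %s\n", opcode,
               opcode == 0x0b0f ? "UD2" : "not UD2");
        return 0;
}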
19098
19199 static nokprobe_inline int
192
-do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
100
+do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
193101 struct pt_regs *regs, long error_code)
194102 {
195103 if (v8086_mode(regs)) {
@@ -202,11 +110,8 @@
202110 error_code, trapnr))
203111 return 0;
204112 }
205
- return -1;
206
- }
207
-
208
- if (!user_mode(regs)) {
209
- if (fixup_exception(regs, trapnr))
113
+ } else if (!user_mode(regs)) {
114
+ if (fixup_exception(regs, trapnr, error_code, 0))
210115 return 0;
211116
212117 tsk->thread.error_code = error_code;
@@ -214,49 +119,6 @@
214119 die(str, regs, error_code);
215120 }
216121
217
- return -1;
218
-}
219
-
220
-static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr,
221
- siginfo_t *info)
222
-{
223
- unsigned long siaddr;
224
- int sicode;
225
-
226
- switch (trapnr) {
227
- default:
228
- return SEND_SIG_PRIV;
229
-
230
- case X86_TRAP_DE:
231
- sicode = FPE_INTDIV;
232
- siaddr = uprobe_get_trap_addr(regs);
233
- break;
234
- case X86_TRAP_UD:
235
- sicode = ILL_ILLOPN;
236
- siaddr = uprobe_get_trap_addr(regs);
237
- break;
238
- case X86_TRAP_AC:
239
- sicode = BUS_ADRALN;
240
- siaddr = 0;
241
- break;
242
- }
243
-
244
- info->si_signo = signr;
245
- info->si_errno = 0;
246
- info->si_code = sicode;
247
- info->si_addr = (void __user *)siaddr;
248
- return info;
249
-}
250
-
251
-static void
252
-do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
253
- long error_code, siginfo_t *info)
254
-{
255
- struct task_struct *tsk = current;
256
-
257
-
258
- if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
259
- return;
260122 /*
261123 * We want error_code and trap_nr set for userspace faults and
262124 * kernelspace faults which result in die(), but not
@@ -264,61 +126,187 @@
264126 * process no chance to handle the signal and notice the
265127 * kernel fault information, so that won't result in polluting
266128 * the information about previously queued, but not yet
267
- * delivered, faults. See also do_general_protection below.
129
+ * delivered, faults. See also exc_general_protection below.
268130 */
269131 tsk->thread.error_code = error_code;
270132 tsk->thread.trap_nr = trapnr;
271133
134
+ return -1;
135
+}
136
+
137
+static void show_signal(struct task_struct *tsk, int signr,
138
+ const char *type, const char *desc,
139
+ struct pt_regs *regs, long error_code)
140
+{
272141 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
273142 printk_ratelimit()) {
274
- pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
275
- tsk->comm, tsk->pid, str,
143
+ pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx",
144
+ tsk->comm, task_pid_nr(tsk), type, desc,
276145 regs->ip, regs->sp, error_code);
277146 print_vma_addr(KERN_CONT " in ", regs->ip);
278147 pr_cont("\n");
279148 }
149
+}
280150
281
- force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
151
+static void
152
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
153
+ long error_code, int sicode, void __user *addr)
154
+{
155
+ struct task_struct *tsk = current;
156
+
157
+ if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
158
+ return;
159
+
160
+ show_signal(tsk, signr, "trap ", str, regs, error_code);
161
+
162
+ if (!sicode)
163
+ force_sig(signr);
164
+ else
165
+ force_sig_fault(signr, sicode, addr);
282166 }
283167 NOKPROBE_SYMBOL(do_trap);
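The reworked do_trap() above replaces the old fill_trap_info()/siginfo_t plumbing: it either sends a bare signal via force_sig() or a fault signal with an explicit si_code and address via force_sig_fault(). What user space observes from the force_sig_fault(SIGFPE, FPE_INTDIV, ip) leg can be seen with this hedged stand-alone demo (x86 Linux; the division traps with #DE and comes back as SIGFPE):

/* Stand-alone demo: the SIGFPE/FPE_INTDIV delivered for a division by zero,
 * as seen from a SA_SIGINFO handler. */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void fpe_handler(int sig, siginfo_t *info, void *ctx)
{
        (void)ctx;
        printf("sig=%d si_code=%d (FPE_INTDIV=%d) si_addr=%p\n",
               sig, info->si_code, FPE_INTDIV, info->si_addr);
        _exit(0);                       /* don't re-run the faulting instruction */
}

int main(void)
{
        struct sigaction sa = {
                .sa_sigaction = fpe_handler,
                .sa_flags = SA_SIGINFO,
        };
        volatile int zero = 0;

        sigaction(SIGFPE, &sa, NULL);
        return 1 / zero;                /* #DE -> SIGFPE with si_addr = trapping ip */
}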
284168
285169 static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
286
- unsigned long trapnr, int signr)
170
+ unsigned long trapnr, int signr, int sicode, void __user *addr)
287171 {
288
- siginfo_t info;
289
-
290172 RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
291
-
292
- /*
293
- * WARN*()s end up here; fix them up before we call the
294
- * notifier chain.
295
- */
296
- if (!user_mode(regs) && fixup_bug(regs, trapnr))
297
- return;
298173
299174 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
300175 NOTIFY_STOP) {
301176 cond_local_irq_enable(regs);
302
- clear_siginfo(&info);
303
- do_trap(trapnr, signr, str, regs, error_code,
304
- fill_trap_info(regs, signr, trapnr, &info));
177
+ do_trap(trapnr, signr, str, regs, error_code, sicode, addr);
178
+ cond_local_irq_disable(regs);
305179 }
306180 }
307181
308
-#define DO_ERROR(trapnr, signr, str, name) \
309
-dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
310
-{ \
311
- do_error_trap(regs, error_code, str, trapnr, signr); \
182
+/*
183
+ * Posix requires to provide the address of the faulting instruction for
184
+ * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t.
185
+ *
186
+ * This address is usually regs->ip, but when an uprobe moved the code out
187
+ * of line then regs->ip points to the XOL code which would confuse
188
+ * anything which analyzes the fault address vs. the unmodified binary. If
189
+ * a trap happened in XOL code then uprobe maps regs->ip back to the
190
+ * original instruction address.
191
+ */
192
+static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs)
193
+{
194
+ return (void __user *)uprobe_get_trap_addr(regs);
312195 }
313196
314
-DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error)
315
-DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
316
-DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op)
317
-DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun)
318
-DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
319
-DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
320
-DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
321
-DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
197
+DEFINE_IDTENTRY(exc_divide_error)
198
+{
199
+ do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE,
200
+ FPE_INTDIV, error_get_trap_addr(regs));
201
+}
202
+
203
+DEFINE_IDTENTRY(exc_overflow)
204
+{
205
+ do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL);
206
+}
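The DEFINE_IDTENTRY() conversions above and below replace the old dotraplinkage do_*() handlers: the macro emits the asm-callable entry wrapper (irqentry_enter(), instrumentation_begin()/end(), irqentry_exit()) and leaves only the handler body to be written, which is the same sequence DEFINE_IDTENTRY_RAW(exc_invalid_op) spells out by hand further down. The following is a rough user-space mock of that pattern with stubbed types, not the real macro (which lives in arch/x86/include/asm/idtentry.h and has errorcode/RAW/IST variants):

/* Simplified, stand-alone mock of the DEFINE_IDTENTRY shape. */
#include <stdio.h>

struct pt_regs { unsigned long ip; };
typedef struct { int exit_rcu; } irqentry_state_t;

static irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
        (void)regs;
        puts("irqentry_enter");         /* RCU/context-tracking entry work */
        return (irqentry_state_t){ 0 };
}

static void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
        (void)regs; (void)state;
        puts("irqentry_exit");          /* exit work: preemption, signals, RCU */
}

static void instrumentation_begin(void) { }
static void instrumentation_end(void) { }

#define DEFINE_IDTENTRY(func)                                           \
static void __##func(struct pt_regs *regs);                             \
                                                                        \
void func(struct pt_regs *regs)                                         \
{                                                                       \
        irqentry_state_t state = irqentry_enter(regs);                  \
                                                                        \
        instrumentation_begin();                                        \
        __##func(regs);                                                 \
        instrumentation_end();                                          \
        irqentry_exit(regs, state);                                     \
}                                                                       \
                                                                        \
static void __##func(struct pt_regs *regs)

DEFINE_IDTENTRY(exc_example)
{
        printf("handler body, ip=%#lx\n", regs->ip);
}

int main(void)
{
        struct pt_regs regs = { .ip = 0xffffffff81000000UL };

        exc_example(&regs);
        return 0;
}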
207
+
208
+#ifdef CONFIG_X86_F00F_BUG
209
+void handle_invalid_op(struct pt_regs *regs)
210
+#else
211
+static inline void handle_invalid_op(struct pt_regs *regs)
212
+#endif
213
+{
214
+ do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL,
215
+ ILL_ILLOPN, error_get_trap_addr(regs));
216
+}
217
+
218
+static noinstr bool handle_bug(struct pt_regs *regs)
219
+{
220
+ bool handled = false;
221
+
222
+ if (!is_valid_bugaddr(regs->ip))
223
+ return handled;
224
+
225
+ /*
226
+ * All lies, just get the WARN/BUG out.
227
+ */
228
+ instrumentation_begin();
229
+ /*
230
+ * Since we're emulating a CALL with exceptions, restore the interrupt
231
+ * state to what it was at the exception site.
232
+ */
233
+ if (regs->flags & X86_EFLAGS_IF)
234
+ raw_local_irq_enable();
235
+ if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
236
+ regs->ip += LEN_UD2;
237
+ handled = true;
238
+ }
239
+ if (regs->flags & X86_EFLAGS_IF)
240
+ raw_local_irq_disable();
241
+ instrumentation_end();
242
+
243
+ return handled;
244
+}
245
+
246
+DEFINE_IDTENTRY_RAW(exc_invalid_op)
247
+{
248
+ irqentry_state_t state;
249
+
250
+ /*
251
+ * We use UD2 as a short encoding for 'CALL __WARN', as such
252
+ * handle it before exception entry to avoid recursive WARN
253
+ * in case exception entry is the one triggering WARNs.
254
+ */
255
+ if (!user_mode(regs) && handle_bug(regs))
256
+ return;
257
+
258
+ state = irqentry_enter(regs);
259
+ instrumentation_begin();
260
+ handle_invalid_op(regs);
261
+ instrumentation_end();
262
+ irqentry_exit(regs, state);
263
+}
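handle_bug() above implements the "UD2 is a short encoding for 'CALL __WARN'" idea: for a WARN the trapping UD2 is simply skipped (regs->ip += LEN_UD2, i.e. two bytes) and execution resumes. The same skip-the-UD2 trick can be demonstrated from user space with a SIGILL handler (hedged analogue, x86-64 Linux/glibc only; REG_RIP needs _GNU_SOURCE):

/* Stand-alone x86-64 demo: treat a UD2 hit as a WARN-style trap and resume
 * two bytes further on, mirroring the regs->ip += LEN_UD2 fixup above. */
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <ucontext.h>

static void ill_handler(int sig, siginfo_t *info, void *uc_void)
{
        ucontext_t *uc = uc_void;
        uint8_t *ip = (uint8_t *)uc->uc_mcontext.gregs[REG_RIP];

        (void)sig; (void)info;
        if (ip[0] == 0x0f && ip[1] == 0x0b) {           /* UD2 */
                fprintf(stderr, "WARN-like trap at %p, skipping UD2\n", (void *)ip);
                uc->uc_mcontext.gregs[REG_RIP] += 2;    /* LEN_UD2 */
                return;
        }
        _exit(1);                                       /* a real BUG: do not continue */
}

int main(void)
{
        struct sigaction sa = {
                .sa_sigaction = ill_handler,
                .sa_flags = SA_SIGINFO,
        };

        sigaction(SIGILL, &sa, NULL);
        asm volatile("ud2");                            /* stand-in for WARN() */
        puts("resumed after the UD2");
        return 0;
}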
264
+
265
+DEFINE_IDTENTRY(exc_coproc_segment_overrun)
266
+{
267
+ do_error_trap(regs, 0, "coprocessor segment overrun",
268
+ X86_TRAP_OLD_MF, SIGFPE, 0, NULL);
269
+}
270
+
271
+DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss)
272
+{
273
+ do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV,
274
+ 0, NULL);
275
+}
276
+
277
+DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present)
278
+{
279
+ do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP,
280
+ SIGBUS, 0, NULL);
281
+}
282
+
283
+DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment)
284
+{
285
+ do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS,
286
+ 0, NULL);
287
+}
288
+
289
+DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
290
+{
291
+ char *str = "alignment check";
292
+
293
+ if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP)
294
+ return;
295
+
296
+ if (!user_mode(regs))
297
+ die("Split lock detected\n", regs, error_code);
298
+
299
+ local_irq_enable();
300
+
301
+ if (handle_user_split_lock(regs, error_code))
302
+ goto out;
303
+
304
+ do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs,
305
+ error_code, BUS_ADRALN, NULL);
306
+
307
+out:
308
+ local_irq_disable();
309
+}
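exc_alignment_check() above now serves double duty: a kernel-mode #AC is reported as a split-lock detection event, while a user-mode #AC that handle_user_split_lock() does not consume still becomes SIGBUS/BUS_ADRALN. The classic user-mode #AC can be provoked by setting EFLAGS.AC and doing a misaligned access (hedged demo; it relies on the kernel keeping CR0.AM set, which Linux normally does on x86, and the misaligned pointer is deliberate):

/* Stand-alone x86-64 demo of the SIGBUS/BUS_ADRALN path: with EFLAGS.AC set
 * in user mode, a misaligned store raises #AC. */
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void bus_handler(int sig, siginfo_t *info, void *ctx)
{
        (void)ctx;
        printf("sig=%d si_code=%d (BUS_ADRALN=%d) addr=%p\n",
               sig, info->si_code, BUS_ADRALN, info->si_addr);
        _exit(0);
}

int main(void)
{
        struct sigaction sa = {
                .sa_sigaction = bus_handler,
                .sa_flags = SA_SIGINFO,
        };
        static char buf[16];
        volatile int *misaligned = (volatile int *)(buf + 1);

        sigaction(SIGBUS, &sa, NULL);

        /* Set the Alignment Check flag (bit 18) in EFLAGS. */
        asm volatile("pushfq; orq $0x40000, (%rsp); popfq");

        *misaligned = 1;        /* #AC -> exc_alignment_check() -> SIGBUS */
        return 1;
}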
322310
323311 #ifdef CONFIG_VMAP_STACK
324312 __visible void __noreturn handle_stack_overflow(const char *message,
@@ -331,18 +319,36 @@
331319 die(message, regs, 0);
332320
333321 /* Be absolutely certain we don't return. */
334
- panic(message);
322
+ panic("%s", message);
335323 }
336324 #endif
337325
338
-#ifdef CONFIG_X86_64
339
-/* Runs on IST stack */
340
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
326
+/*
327
+ * Runs on an IST stack for x86_64 and on a special task stack for x86_32.
328
+ *
329
+ * On x86_64, this is more or less a normal kernel entry. Notwithstanding the
330
+ * SDM's warnings about double faults being unrecoverable, returning works as
331
+ * expected. Presumably what the SDM actually means is that the CPU may get
332
+ * the register state wrong on entry, so returning could be a bad idea.
333
+ *
334
+ * Various CPU engineers have promised that double faults due to an IRET fault
335
+ * while the stack is read-only are, in fact, recoverable.
336
+ *
337
+ * On x86_32, this is entered through a task gate, and regs are synthesized
338
+ * from the TSS. Returning is, in principle, okay, but changes to regs will
339
+ * be lost. If, for some reason, we need to return to a context with modified
340
+ * regs, the shim code could be adjusted to synchronize the registers.
341
+ *
342
+ * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs
343
+ * to be read before doing anything else.
344
+ */
345
+DEFINE_IDTENTRY_DF(exc_double_fault)
341346 {
342347 static const char str[] = "double fault";
343348 struct task_struct *tsk = current;
349
+
344350 #ifdef CONFIG_VMAP_STACK
345
- unsigned long cr2;
351
+ unsigned long address = read_cr2();
346352 #endif
347353
348354 #ifdef CONFIG_X86_ESPFIX64
@@ -360,13 +366,14 @@
360366 * The net result is that our #GP handler will think that we
361367 * entered from usermode with the bad user context.
362368 *
363
- * No need for ist_enter here because we don't use RCU.
369
+ * No need for nmi_enter() here because we don't use RCU.
364370 */
365371 if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
366372 regs->cs == __KERNEL_CS &&
367373 regs->ip == (unsigned long)native_irq_return_iret)
368374 {
369375 struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
376
+ unsigned long *p = (unsigned long *)regs->sp;
370377
371378 /*
372379 * regs->sp points to the failing IRET frame on the
@@ -374,7 +381,11 @@
374381 * in gpregs->ss through gpregs->ip.
375382 *
376383 */
377
- memmove(&gpregs->ip, (void *)regs->sp, 5*8);
384
+ gpregs->ip = p[0];
385
+ gpregs->cs = p[1];
386
+ gpregs->flags = p[2];
387
+ gpregs->sp = p[3];
388
+ gpregs->ss = p[4];
378389 gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
379390
380391 /*
@@ -383,15 +394,20 @@
383394 * we won't enable interupts or schedule before we invoke
384395 * general_protection, so nothing will clobber the stack
385396 * frame we just set up.
397
+ *
398
+ * We will enter general_protection with kernel GSBASE,
399
+ * which is what the stub expects, given that the faulting
400
+ * RIP will be the IRET instruction.
386401 */
387
- regs->ip = (unsigned long)general_protection;
402
+ regs->ip = (unsigned long)asm_exc_general_protection;
388403 regs->sp = (unsigned long)&gpregs->orig_ax;
389404
390405 return;
391406 }
392407 #endif
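The ESPFIX fixup above carves a pt_regs frame just below the stack top taken from cpu_tss_rw.x86_tss.sp0 ("(struct pt_regs *)sp0 - 1") and fills its hardware IRET tail field by field from the five 8-byte words the failed IRET was consuming, replacing the earlier memmove(). A stand-alone sketch of that pointer arithmetic and frame layout (hedged; the struct here is abbreviated, not the kernel's pt_regs):

/* Stand-alone sketch: carve a frame below a stack top and fill the five-word
 * hardware IRET tail (ip, cs, flags, sp, ss) the way the fixup above does. */
#include <stdio.h>
#include <stdint.h>

struct fake_pt_regs {
        /* general purpose registers elided */
        uint64_t orig_ax;
        uint64_t ip, cs, flags, sp, ss; /* hardware frame, in push order */
};

int main(void)
{
        uint64_t stack[64];
        void *sp0 = &stack[64];                                 /* top of stack */
        struct fake_pt_regs *gpregs = (struct fake_pt_regs *)sp0 - 1;
        const uint64_t p[5] = { 0x401000, 0x10, 0x246, 0x7ffd0000, 0x18 };

        gpregs->ip      = p[0];
        gpregs->cs      = p[1];
        gpregs->flags   = p[2];
        gpregs->sp      = p[3];
        gpregs->ss      = p[4];
        gpregs->orig_ax = 0;                                    /* lost #GP error code */

        printf("frame at %p: ip=%#llx cs=%#llx ss=%#llx\n", (void *)gpregs,
               (unsigned long long)gpregs->ip,
               (unsigned long long)gpregs->cs,
               (unsigned long long)gpregs->ss);
        return 0;
}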
393408
394
- ist_enter(regs);
409
+ irqentry_nmi_enter(regs);
410
+ instrumentation_begin();
395411 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
396412
397413 tsk->thread.error_code = error_code;
@@ -435,195 +451,251 @@
435451 * stack even if the actual trigger for the double fault was
436452 * something else.
437453 */
438
- cr2 = read_cr2();
439
- if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
440
- handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
454
+ if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
455
+ handle_stack_overflow("kernel stack overflow (double-fault)",
456
+ regs, address);
457
+ }
441458 #endif
442459
443
-#ifdef CONFIG_DOUBLEFAULT
444
- df_debug(regs, error_code);
445
-#endif
446
- /*
447
- * This is always a kernel trap and never fixable (and thus must
448
- * never return).
449
- */
450
- for (;;)
451
- die(str, regs, error_code);
460
+ pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
461
+ die("double fault", regs, error_code);
462
+ panic("Machine halted.");
463
+ instrumentation_end();
452464 }
453
-#endif
454465
455
-dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
466
+DEFINE_IDTENTRY(exc_bounds)
456467 {
457
- const struct mpx_bndcsr *bndcsr;
458
- siginfo_t *info;
459
-
460
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
461
- if (notify_die(DIE_TRAP, "bounds", regs, error_code,
468
+ if (notify_die(DIE_TRAP, "bounds", regs, 0,
462469 X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
463470 return;
464471 cond_local_irq_enable(regs);
465472
466473 if (!user_mode(regs))
467
- die("bounds", regs, error_code);
474
+ die("bounds", regs, 0);
468475
469
- if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
470
- /* The exception is not from Intel MPX */
471
- goto exit_trap;
472
- }
476
+ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL);
473477
474
- /*
475
- * We need to look at BNDSTATUS to resolve this exception.
476
- * A NULL here might mean that it is in its 'init state',
477
- * which is all zeros which indicates MPX was not
478
- * responsible for the exception.
479
- */
480
- bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
481
- if (!bndcsr)
482
- goto exit_trap;
483
-
484
- trace_bounds_exception_mpx(bndcsr);
485
- /*
486
- * The error code field of the BNDSTATUS register communicates status
487
- * information of a bound range exception #BR or operation involving
488
- * bound directory.
489
- */
490
- switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
491
- case 2: /* Bound directory has invalid entry. */
492
- if (mpx_handle_bd_fault())
493
- goto exit_trap;
494
- break; /* Success, it was handled */
495
- case 1: /* Bound violation. */
496
- info = mpx_generate_siginfo(regs);
497
- if (IS_ERR(info)) {
498
- /*
499
- * We failed to decode the MPX instruction. Act as if
500
- * the exception was not caused by MPX.
501
- */
502
- goto exit_trap;
503
- }
504
- /*
505
- * Success, we decoded the instruction and retrieved
506
- * an 'info' containing the address being accessed
507
- * which caused the exception. This information
508
- * allows and application to possibly handle the
509
- * #BR exception itself.
510
- */
511
- do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info);
512
- kfree(info);
513
- break;
514
- case 0: /* No exception caused by Intel MPX operations. */
515
- goto exit_trap;
516
- default:
517
- die("bounds", regs, error_code);
518
- }
519
-
520
- return;
521
-
522
-exit_trap:
523
- /*
524
- * This path out is for all the cases where we could not
525
- * handle the exception in some way (like allocating a
526
- * table or telling userspace about it. We will also end
527
- * up here if the kernel has MPX turned off at compile
528
- * time..
529
- */
530
- do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
478
+ cond_local_irq_disable(regs);
531479 }
532480
533
-dotraplinkage void
534
-do_general_protection(struct pt_regs *regs, long error_code)
535
-{
536
- struct task_struct *tsk;
481
+enum kernel_gp_hint {
482
+ GP_NO_HINT,
483
+ GP_NON_CANONICAL,
484
+ GP_CANONICAL
485
+};
537486
538
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
487
+/*
488
+ * When an uncaught #GP occurs, try to determine the memory address accessed by
489
+ * the instruction and return that address to the caller. Also, try to figure
490
+ * out whether any part of the access to that address was non-canonical.
491
+ */
492
+static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
493
+ unsigned long *addr)
494
+{
495
+ u8 insn_buf[MAX_INSN_SIZE];
496
+ struct insn insn;
497
+
498
+ if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip,
499
+ MAX_INSN_SIZE))
500
+ return GP_NO_HINT;
501
+
502
+ kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
503
+ insn_get_modrm(&insn);
504
+ insn_get_sib(&insn);
505
+
506
+ *addr = (unsigned long)insn_get_addr_ref(&insn, regs);
507
+ if (*addr == -1UL)
508
+ return GP_NO_HINT;
509
+
510
+#ifdef CONFIG_X86_64
511
+ /*
512
+ * Check that:
513
+ * - the operand is not in the kernel half
514
+ * - the last byte of the operand is not in the user canonical half
515
+ */
516
+ if (*addr < ~__VIRTUAL_MASK &&
517
+ *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK)
518
+ return GP_NON_CANONICAL;
519
+#endif
520
+
521
+ return GP_CANONICAL;
522
+}
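get_kernel_gp_address() above decodes the faulting instruction with the in-kernel x86 decoder and then flags the access as likely non-canonical when the operand starts below the kernel half but its last byte crosses the user canonical limit. The range test on its own, as a hedged stand-alone sketch (assuming 4-level paging where __VIRTUAL_MASK is (1UL << 47) - 1; LA57 kernels use a wider mask):

/* Stand-alone sketch of the non-canonical hint test used above. */
#include <stdio.h>
#include <stdint.h>

#define VIRTUAL_MASK    (((uint64_t)1 << 47) - 1)

/* Operand starts below the kernel half but its last byte lies above the user
 * canonical limit, i.e. the access reaches into the canonical hole. */
static int is_non_canonical(uint64_t addr, unsigned int opnd_bytes)
{
        return addr < ~VIRTUAL_MASK && addr + opnd_bytes - 1 > VIRTUAL_MASK;
}

int main(void)
{
        printf("%d\n", is_non_canonical(0x00007ffffffffff8ULL, 16)); /* 1: crosses the hole */
        printf("%d\n", is_non_canonical(0x0000100000000000ULL, 8));  /* 0: canonical user address */
        printf("%d\n", is_non_canonical(0xffff888000000000ULL, 8));  /* 0: kernel half */
        return 0;
}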
523
+
524
+#define GPFSTR "general protection fault"
525
+
526
+static bool fixup_iopl_exception(struct pt_regs *regs)
527
+{
528
+ struct thread_struct *t = &current->thread;
529
+ unsigned char byte;
530
+ unsigned long ip;
531
+
532
+ if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3)
533
+ return false;
534
+
535
+ ip = insn_get_effective_ip(regs);
536
+ if (!ip)
537
+ return false;
538
+
539
+ if (get_user(byte, (const char __user *)ip))
540
+ return false;
541
+
542
+ if (byte != 0xfa && byte != 0xfb)
543
+ return false;
544
+
545
+ if (!t->iopl_warn && printk_ratelimit()) {
546
+ pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx",
547
+ current->comm, task_pid_nr(current), ip);
548
+ print_vma_addr(KERN_CONT " in ", ip);
549
+ pr_cont("\n");
550
+ t->iopl_warn = 1;
551
+ }
552
+
553
+ regs->ip += 1;
554
+ return true;
555
+}
556
+
557
+DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
558
+{
559
+ char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
560
+ enum kernel_gp_hint hint = GP_NO_HINT;
561
+ struct task_struct *tsk;
562
+ unsigned long gp_addr;
563
+ int ret;
564
+
539565 cond_local_irq_enable(regs);
540566
541567 if (static_cpu_has(X86_FEATURE_UMIP)) {
542568 if (user_mode(regs) && fixup_umip_exception(regs))
543
- return;
569
+ goto exit;
544570 }
545571
546572 if (v8086_mode(regs)) {
547573 local_irq_enable();
548574 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
575
+ local_irq_disable();
549576 return;
550577 }
551578
552579 tsk = current;
553
- if (!user_mode(regs)) {
554
- if (fixup_exception(regs, X86_TRAP_GP))
555
- return;
580
+
581
+ if (user_mode(regs)) {
582
+ if (fixup_iopl_exception(regs))
583
+ goto exit;
556584
557585 tsk->thread.error_code = error_code;
558586 tsk->thread.trap_nr = X86_TRAP_GP;
559
- if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
560
- X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
561
- die("general protection fault", regs, error_code);
562
- return;
587
+
588
+ show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
589
+ force_sig(SIGSEGV);
590
+ goto exit;
563591 }
592
+
593
+ if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
594
+ goto exit;
564595
565596 tsk->thread.error_code = error_code;
566597 tsk->thread.trap_nr = X86_TRAP_GP;
567598
568
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
569
- printk_ratelimit()) {
570
- pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
571
- tsk->comm, task_pid_nr(tsk),
572
- regs->ip, regs->sp, error_code);
573
- print_vma_addr(KERN_CONT " in ", regs->ip);
574
- pr_cont("\n");
575
- }
576
-
577
- force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
578
-}
579
-NOKPROBE_SYMBOL(do_general_protection);
580
-
581
-dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
582
-{
583
-#ifdef CONFIG_DYNAMIC_FTRACE
584599 /*
585
- * ftrace must be first, everything else may cause a recursive crash.
586
- * See note by declaration of modifying_ftrace_code in ftrace.c
600
+ * To be potentially processing a kprobe fault and to trust the result
601
+ * from kprobe_running(), we have to be non-preemptible.
587602 */
588
- if (unlikely(atomic_read(&modifying_ftrace_code)) &&
589
- ftrace_int3_handler(regs))
590
- return;
591
-#endif
592
- if (poke_int3_handler(regs))
593
- return;
594
-
595
- /*
596
- * Use ist_enter despite the fact that we don't use an IST stack.
597
- * We can be called from a kprobe in non-CONTEXT_KERNEL kernel
598
- * mode or even during context tracking state changes.
599
- *
600
- * This means that we can't schedule. That's okay.
601
- */
602
- ist_enter(regs);
603
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
604
-#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
605
- if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
606
- SIGTRAP) == NOTIFY_STOP)
603
+ if (!preemptible() &&
604
+ kprobe_running() &&
605
+ kprobe_fault_handler(regs, X86_TRAP_GP))
607606 goto exit;
607
+
608
+ ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
609
+ if (ret == NOTIFY_STOP)
610
+ goto exit;
611
+
612
+ if (error_code)
613
+ snprintf(desc, sizeof(desc), "segment-related " GPFSTR);
614
+ else
615
+ hint = get_kernel_gp_address(regs, &gp_addr);
616
+
617
+ if (hint != GP_NO_HINT)
618
+ snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx",
619
+ (hint == GP_NON_CANONICAL) ? "probably for non-canonical address"
620
+ : "maybe for address",
621
+ gp_addr);
622
+
623
+ /*
624
+ * KASAN is interested only in the non-canonical case, clear it
625
+ * otherwise.
626
+ */
627
+ if (hint != GP_NON_CANONICAL)
628
+ gp_addr = 0;
629
+
630
+ die_addr(desc, regs, error_code, gp_addr);
631
+
632
+exit:
633
+ cond_local_irq_disable(regs);
634
+}
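For user-mode faults the handler above no longer goes through notify_die()/die(); it records the trap and sends a plain SIGSEGV. One visible consequence (hedged, from my understanding of force_sig()): a #GP raised from user space, for example by touching a non-canonical address, which never reaches the page-fault path, arrives as SIGSEGV with si_code SI_KERNEL and no fault address, unlike an ordinary page fault with SEGV_MAPERR/SEGV_ACCERR. A stand-alone x86-64 demo:

/* Stand-alone x86-64 demo: a non-canonical load raises #GP (not #PF). */
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void segv_handler(int sig, siginfo_t *info, void *ctx)
{
        (void)ctx;
        printf("sig=%d si_code=%d (SI_KERNEL=%d, SEGV_MAPERR=%d) addr=%p\n",
               sig, info->si_code, SI_KERNEL, SEGV_MAPERR, info->si_addr);
        _exit(0);
}

int main(void)
{
        struct sigaction sa = {
                .sa_sigaction = segv_handler,
                .sa_flags = SA_SIGINFO,
        };
        /* Bit 60 set, upper bits not sign-extended: non-canonical. */
        volatile long *bad = (volatile long *)0x1000000000000000UL;

        sigaction(SIGSEGV, &sa, NULL);
        return (int)*bad;       /* #GP -> exc_general_protection() -> SIGSEGV */
}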
635
+
636
+static bool do_int3(struct pt_regs *regs)
637
+{
638
+ int res;
639
+
640
+#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
641
+ if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
642
+ SIGTRAP) == NOTIFY_STOP)
643
+ return true;
608644 #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
609645
610646 #ifdef CONFIG_KPROBES
611647 if (kprobe_int3_handler(regs))
612
- goto exit;
648
+ return true;
613649 #endif
650
+ res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP);
614651
615
- if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
616
- SIGTRAP) == NOTIFY_STOP)
617
- goto exit;
618
-
619
- cond_local_irq_enable(regs);
620
- do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
621
- cond_local_irq_disable(regs);
622
-
623
-exit:
624
- ist_exit(regs);
652
+ return res == NOTIFY_STOP;
625653 }
626654 NOKPROBE_SYMBOL(do_int3);
655
+
656
+static void do_int3_user(struct pt_regs *regs)
657
+{
658
+ if (do_int3(regs))
659
+ return;
660
+
661
+ cond_local_irq_enable(regs);
662
+ do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL);
663
+ cond_local_irq_disable(regs);
664
+}
665
+
666
+DEFINE_IDTENTRY_RAW(exc_int3)
667
+{
668
+ /*
669
+ * poke_int3_handler() is completely self contained code; it does (and
670
+ * must) *NOT* call out to anything, lest it hits upon yet another
671
+ * INT3.
672
+ */
673
+ if (poke_int3_handler(regs))
674
+ return;
675
+
676
+ /*
677
+ * irqentry_enter_from_user_mode() uses static_branch_{,un}likely()
678
+ * and therefore can trigger INT3, hence poke_int3_handler() must
679
+ * be done before. If the entry came from kernel mode, then use
680
+ * nmi_enter() because the INT3 could have been hit in any context
681
+ * including NMI.
682
+ */
683
+ if (user_mode(regs)) {
684
+ irqentry_enter_from_user_mode(regs);
685
+ instrumentation_begin();
686
+ do_int3_user(regs);
687
+ instrumentation_end();
688
+ irqentry_exit_to_user_mode(regs);
689
+ } else {
690
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
691
+
692
+ instrumentation_begin();
693
+ if (!do_int3(regs))
694
+ die("int3", regs, 0);
695
+ instrumentation_end();
696
+ irqentry_nmi_exit(regs, irq_state);
697
+ }
698
+}
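exc_int3() above keeps poke_int3_handler() strictly first, because even the entry work can hit a freshly patched INT3, and only a user-mode INT3 that no notifier consumes ends up as SIGTRAP via do_int3_user()/do_trap(). That final leg is easy to observe from user space (hedged demo; run it outside a debugger, since a ptracer would intercept the SIGTRAP):

/* Stand-alone demo: an int3 executed in user mode comes back as SIGTRAP. */
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void trap_handler(int sig, siginfo_t *info, void *ctx)
{
        (void)ctx;
        printf("sig=%d si_code=%d si_addr=%p\n", sig, info->si_code, info->si_addr);
}

int main(void)
{
        struct sigaction sa = {
                .sa_sigaction = trap_handler,
                .sa_flags = SA_SIGINFO,
        };

        sigaction(SIGTRAP, &sa, NULL);
        asm volatile("int3");           /* #BP */
        puts("back from the breakpoint");
        return 0;
}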
627699
628700 #ifdef CONFIG_X86_64
629701 /*
@@ -631,21 +703,63 @@
631703 * to switch to the normal thread stack if the interrupted code was in
632704 * user mode. The actual stack switch is done in entry_64.S
633705 */
634
-asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
706
+asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
635707 {
636708 struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
637709 if (regs != eregs)
638710 *regs = *eregs;
639711 return regs;
640712 }
641
-NOKPROBE_SYMBOL(sync_regs);
713
+
714
+#ifdef CONFIG_AMD_MEM_ENCRYPT
715
+asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs)
716
+{
717
+ unsigned long sp, *stack;
718
+ struct stack_info info;
719
+ struct pt_regs *regs_ret;
720
+
721
+ /*
722
+ * In the SYSCALL entry path the RSP value comes from user-space - don't
723
+ * trust it and switch to the current kernel stack
724
+ */
725
+ if (ip_within_syscall_gap(regs)) {
726
+ sp = this_cpu_read(cpu_current_top_of_stack);
727
+ goto sync;
728
+ }
729
+
730
+ /*
731
+ * From here on the RSP value is trusted. Now check whether entry
732
+ * happened from a safe stack. Not safe are the entry or unknown stacks,
733
+ * use the fall-back stack instead in this case.
734
+ */
735
+ sp = regs->sp;
736
+ stack = (unsigned long *)sp;
737
+
738
+ if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
739
+ info.type > STACK_TYPE_EXCEPTION_LAST)
740
+ sp = __this_cpu_ist_top_va(VC2);
741
+
742
+sync:
743
+ /*
744
+ * Found a safe stack - switch to it as if the entry didn't happen via
745
+ * IST stack. The code below only copies pt_regs, the real switch happens
746
+ * in assembly code.
747
+ */
748
+ sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret);
749
+
750
+ regs_ret = (struct pt_regs *)sp;
751
+ *regs_ret = *regs;
752
+
753
+ return regs_ret;
754
+}
755
+#endif
642756
643757 struct bad_iret_stack {
644758 void *error_entry_ret;
645759 struct pt_regs regs;
646760 };
647761
648
-asmlinkage __visible notrace
762
+asmlinkage __visible noinstr
649763 struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
650764 {
651765 /*
@@ -656,19 +770,21 @@
656770 * just below the IRET frame) and we want to pretend that the
657771 * exception came from the IRET target.
658772 */
659
- struct bad_iret_stack *new_stack =
660
- (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
773
+ struct bad_iret_stack tmp, *new_stack =
774
+ (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
661775
662
- /* Copy the IRET target to the new stack. */
663
- memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
776
+ /* Copy the IRET target to the temporary storage. */
777
+ __memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8);
664778
665779 /* Copy the remainder of the stack from the current stack. */
666
- memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
780
+ __memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip));
781
+
782
+ /* Update the entry stack */
783
+ __memcpy(new_stack, &tmp, sizeof(tmp));
667784
668785 BUG_ON(!user_mode(&new_stack->regs));
669786 return new_stack;
670787 }
671
-NOKPROBE_SYMBOL(fixup_bad_iret);
672788 #endif
673789
674790 static bool is_sysenter_singlestep(struct pt_regs *regs)
@@ -692,6 +808,28 @@
692808 #else
693809 return false;
694810 #endif
811
+}
812
+
813
+static __always_inline unsigned long debug_read_clear_dr6(void)
814
+{
815
+ unsigned long dr6;
816
+
817
+ /*
818
+ * The Intel SDM says:
819
+ *
820
+ * Certain debug exceptions may clear bits 0-3. The remaining
821
+ * contents of the DR6 register are never cleared by the
822
+ * processor. To avoid confusion in identifying debug
823
+ * exceptions, debug handlers should clear the register before
824
+ * returning to the interrupted task.
825
+ *
826
+ * Keep it simple: clear DR6 immediately.
827
+ */
828
+ get_debugreg(dr6, 6);
829
+ set_debugreg(DR6_RESERVED, 6);
830
+ dr6 ^= DR6_RESERVED; /* Flip to positive polarity */
831
+
832
+ return dr6;
695833 }
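debug_read_clear_dr6() above resets DR6 to DR6_RESERVED rather than to 0 and XORs the value it read with the same mask, so that reserved bits which read back as 1 drop out and every remaining bit means "this condition actually happened" (the "positive polarity" in the comment). A quick stand-alone illustration of the XOR (hedged; 0xFFFF0FF0 is the DR6_RESERVED value I recall from the kernel headers):

/* Stand-alone sketch of the polarity flip: a raw DR6 of 0xFFFF0FF1 (DR0 hit,
 * reserved bits reading as 1) becomes just 0x1 after XOR with DR6_RESERVED. */
#include <stdio.h>
#include <stdint.h>

#define DR6_RESERVED    0xFFFF0FF0u

int main(void)
{
        uint32_t raw = 0xFFFF0FF1;              /* as read with "mov %dr6, ..." */
        uint32_t dr6 = raw ^ DR6_RESERVED;      /* -> 0x00000001: only real cause bits */

        printf("raw=%#010x cooked=%#010x\n", raw, dr6);
        return 0;
}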
696834
697835 /*
@@ -718,136 +856,216 @@
718856 *
719857 * May run on IST stack.
720858 */
721
-dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
859
+
860
+static bool notify_debug(struct pt_regs *regs, unsigned long *dr6)
722861 {
723
- struct task_struct *tsk = current;
724
- int user_icebp = 0;
725
- unsigned long dr6;
726
- int si_code;
727
-
728
- ist_enter(regs);
729
-
730
- get_debugreg(dr6, 6);
731862 /*
732
- * The Intel SDM says:
863
+ * Notifiers will clear bits in @dr6 to indicate the event has been
864
+ * consumed - hw_breakpoint_handler(), single_stop_cont().
733865 *
734
- * Certain debug exceptions may clear bits 0-3. The remaining
735
- * contents of the DR6 register are never cleared by the
736
- * processor. To avoid confusion in identifying debug
737
- * exceptions, debug handlers should clear the register before
738
- * returning to the interrupted task.
739
- *
740
- * Keep it simple: clear DR6 immediately.
866
+ * Notifiers will set bits in @virtual_dr6 to indicate the desire
867
+ * for signals - ptrace_triggered(), kgdb_hw_overflow_handler().
741868 */
742
- set_debugreg(0, 6);
869
+ if (notify_die(DIE_DEBUG, "debug", regs, (long)dr6, 0, SIGTRAP) == NOTIFY_STOP)
870
+ return true;
743871
744
- /* Filter out all the reserved bits which are preset to 1 */
745
- dr6 &= ~DR6_RESERVED;
872
+ return false;
873
+}
874
+
875
+static __always_inline void exc_debug_kernel(struct pt_regs *regs,
876
+ unsigned long dr6)
877
+{
878
+ /*
879
+ * Disable breakpoints during exception handling; recursive exceptions
880
+ * are exceedingly 'fun'.
881
+ *
882
+ * Since this function is NOKPROBE, and that also applies to
883
+ * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
884
+ * HW_BREAKPOINT_W on our stack)
885
+ *
886
+ * Entry text is excluded for HW_BP_X and cpu_entry_area, which
887
+ * includes the entry stack is excluded for everything.
888
+ */
889
+ unsigned long dr7 = local_db_save();
890
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
891
+ instrumentation_begin();
892
+
893
+ /*
894
+ * If something gets miswired and we end up here for a user mode
895
+ * #DB, we will malfunction.
896
+ */
897
+ WARN_ON_ONCE(user_mode(regs));
898
+
899
+ if (test_thread_flag(TIF_BLOCKSTEP)) {
900
+ /*
901
+ * The SDM says "The processor clears the BTF flag when it
902
+ * generates a debug exception." but PTRACE_BLOCKSTEP requested
903
+ * it for userspace, but we just took a kernel #DB, so re-set
904
+ * BTF.
905
+ */
906
+ unsigned long debugctl;
907
+
908
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
909
+ debugctl |= DEBUGCTLMSR_BTF;
910
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
911
+ }
912
+
913
+ /*
914
+ * Catch SYSENTER with TF set and clear DR_STEP. If this hit a
915
+ * watchpoint at the same time then that will still be handled.
916
+ */
917
+ if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
918
+ dr6 &= ~DR_STEP;
919
+
920
+ /*
921
+ * The kernel doesn't use INT1
922
+ */
923
+ if (!dr6)
924
+ goto out;
925
+
926
+ if (notify_debug(regs, &dr6))
927
+ goto out;
928
+
929
+ /*
930
+ * The kernel doesn't use TF single-step outside of:
931
+ *
932
+ * - Kprobes, consumed through kprobe_debug_handler()
933
+ * - KGDB, consumed through notify_debug()
934
+ *
935
+ * So if we get here with DR_STEP set, something is wonky.
936
+ *
937
+ * A known way to trigger this is through QEMU's GDB stub,
938
+ * which leaks #DB into the guest and causes IST recursion.
939
+ */
940
+ if (WARN_ON_ONCE(dr6 & DR_STEP))
941
+ regs->flags &= ~X86_EFLAGS_TF;
942
+out:
943
+ instrumentation_end();
944
+ irqentry_nmi_exit(regs, irq_state);
945
+
946
+ local_db_restore(dr7);
947
+}
948
+
949
+static __always_inline void exc_debug_user(struct pt_regs *regs,
950
+ unsigned long dr6)
951
+{
952
+ bool icebp;
953
+
954
+ /*
955
+ * If something gets miswired and we end up here for a kernel mode
956
+ * #DB, we will malfunction.
957
+ */
958
+ WARN_ON_ONCE(!user_mode(regs));
959
+
960
+ /*
961
+ * NB: We can't easily clear DR7 here because
962
+ * irqentry_exit_to_usermode() can invoke ptrace, schedule, access
963
+ * user memory, etc. This means that a recursive #DB is possible. If
964
+ * this happens, that #DB will hit exc_debug_kernel() and clear DR7.
965
+ * Since we're not on the IST stack right now, everything will be
966
+ * fine.
967
+ */
968
+
969
+ irqentry_enter_from_user_mode(regs);
970
+ instrumentation_begin();
971
+
972
+ /*
973
+ * Start the virtual/ptrace DR6 value with just the DR_STEP mask
974
+ * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits.
975
+ *
976
+ * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6)
977
+ * even if it is not the result of PTRACE_SINGLESTEP.
978
+ */
979
+ current->thread.virtual_dr6 = (dr6 & DR_STEP);
746980
747981 /*
748982 * The SDM says "The processor clears the BTF flag when it
749983 * generates a debug exception." Clear TIF_BLOCKSTEP to keep
750984 * TIF_BLOCKSTEP in sync with the hardware BTF flag.
751985 */
752
- clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
753
-
754
- if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
755
- is_sysenter_singlestep(regs))) {
756
- dr6 &= ~DR_STEP;
757
- if (!dr6)
758
- goto exit;
759
- /*
760
- * else we might have gotten a single-step trap and hit a
761
- * watchpoint at the same time, in which case we should fall
762
- * through and handle the watchpoint.
763
- */
764
- }
986
+ clear_thread_flag(TIF_BLOCKSTEP);
765987
766988 /*
767989 * If dr6 has no reason to give us about the origin of this trap,
768990 * then it's very likely the result of an icebp/int01 trap.
769991 * User wants a sigtrap for that.
770992 */
771
- if (!dr6 && user_mode(regs))
772
- user_icebp = 1;
993
+ icebp = !dr6;
773994
774
- /* Store the virtualized DR6 value */
775
- tsk->thread.debugreg6 = dr6;
776
-
777
-#ifdef CONFIG_KPROBES
778
- if (kprobe_debug_handler(regs))
779
- goto exit;
780
-#endif
781
-
782
- if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
783
- SIGTRAP) == NOTIFY_STOP)
784
- goto exit;
785
-
786
- /*
787
- * Let others (NMI) know that the debug stack is in use
788
- * as we may switch to the interrupt stack.
789
- */
790
- debug_stack_usage_inc();
995
+ if (notify_debug(regs, &dr6))
996
+ goto out;
791997
792998 /* It's safe to allow irq's after DR6 has been saved */
793
- cond_local_irq_enable(regs);
999
+ local_irq_enable();
7941000
7951001 if (v8086_mode(regs)) {
796
- handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
797
- X86_TRAP_DB);
798
- cond_local_irq_disable(regs);
799
- debug_stack_usage_dec();
800
- goto exit;
1002
+ handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB);
1003
+ goto out_irq;
8011004 }
8021005
803
- if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
804
- /*
805
- * Historical junk that used to handle SYSENTER single-stepping.
806
- * This should be unreachable now. If we survive for a while
807
- * without anyone hitting this warning, we'll turn this into
808
- * an oops.
809
- */
810
- tsk->thread.debugreg6 &= ~DR_STEP;
811
- set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
812
- regs->flags &= ~X86_EFLAGS_TF;
813
- }
814
- si_code = get_si_code(tsk->thread.debugreg6);
815
- if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
816
- send_sigtrap(tsk, regs, error_code, si_code);
817
- cond_local_irq_disable(regs);
818
- debug_stack_usage_dec();
1006
+ /* Add the virtual_dr6 bits for signals. */
1007
+ dr6 |= current->thread.virtual_dr6;
1008
+ if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp)
1009
+ send_sigtrap(regs, 0, get_si_code(dr6));
8191010
820
-exit:
821
- ist_exit(regs);
1011
+out_irq:
1012
+ local_irq_disable();
1013
+out:
1014
+ instrumentation_end();
1015
+ irqentry_exit_to_user_mode(regs);
8221016 }
823
-NOKPROBE_SYMBOL(do_debug);
1017
+
1018
+#ifdef CONFIG_X86_64
1019
+/* IST stack entry */
1020
+DEFINE_IDTENTRY_DEBUG(exc_debug)
1021
+{
1022
+ exc_debug_kernel(regs, debug_read_clear_dr6());
1023
+}
1024
+
1025
+/* User entry, runs on regular task stack */
1026
+DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
1027
+{
1028
+ exc_debug_user(regs, debug_read_clear_dr6());
1029
+}
1030
+#else
1031
+/* 32 bit does not have separate entry points. */
1032
+DEFINE_IDTENTRY_RAW(exc_debug)
1033
+{
1034
+ unsigned long dr6 = debug_read_clear_dr6();
1035
+
1036
+ if (user_mode(regs))
1037
+ exc_debug_user(regs, dr6);
1038
+ else
1039
+ exc_debug_kernel(regs, dr6);
1040
+}
1041
+#endif
8241042
8251043 /*
8261044 * Note that we play around with the 'TS' bit in an attempt to get
8271045 * the correct behaviour even in the presence of the asynchronous
8281046 * IRQ13 behaviour
8291047 */
830
-static void math_error(struct pt_regs *regs, int error_code, int trapnr)
1048
+static void math_error(struct pt_regs *regs, int trapnr)
8311049 {
8321050 struct task_struct *task = current;
8331051 struct fpu *fpu = &task->thread.fpu;
834
- siginfo_t info;
1052
+ int si_code;
8351053 char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
8361054 "simd exception";
8371055
8381056 cond_local_irq_enable(regs);
8391057
8401058 if (!user_mode(regs)) {
841
- if (fixup_exception(regs, trapnr))
842
- return;
1059
+ if (fixup_exception(regs, trapnr, 0, 0))
1060
+ goto exit;
8431061
844
- task->thread.error_code = error_code;
1062
+ task->thread.error_code = 0;
8451063 task->thread.trap_nr = trapnr;
8461064
847
- if (notify_die(DIE_TRAP, str, regs, error_code,
848
- trapnr, SIGFPE) != NOTIFY_STOP)
849
- die(str, regs, error_code);
850
- return;
1065
+ if (notify_die(DIE_TRAP, str, regs, 0, trapnr,
1066
+ SIGFPE) != NOTIFY_STOP)
1067
+ die(str, regs, 0);
1068
+ goto exit;
8511069 }
8521070
8531071 /*
@@ -856,61 +1074,78 @@
8561074 fpu__save(fpu);
8571075
8581076 task->thread.trap_nr = trapnr;
859
- task->thread.error_code = error_code;
860
- clear_siginfo(&info);
861
- info.si_signo = SIGFPE;
862
- info.si_errno = 0;
863
- info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
1077
+ task->thread.error_code = 0;
8641078
865
- info.si_code = fpu__exception_code(fpu, trapnr);
866
-
1079
+ si_code = fpu__exception_code(fpu, trapnr);
8671080 /* Retry when we get spurious exceptions: */
868
- if (!info.si_code)
869
- return;
1081
+ if (!si_code)
1082
+ goto exit;
8701083
871
- force_sig_info(SIGFPE, &info, task);
1084
+ force_sig_fault(SIGFPE, si_code,
1085
+ (void __user *)uprobe_get_trap_addr(regs));
1086
+exit:
1087
+ cond_local_irq_disable(regs);
8721088 }
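math_error() above now derives the si_code from fpu__exception_code() and delivers it with force_sig_fault() directly, instead of building a siginfo_t by hand. The usual way to see this path from user space is to unmask an FPU exception and trigger it (hedged demo; feenableexcept() is a glibc extension, compile with -lm; the division raises #XF with SSE math or #MF with x87, both funnelling into math_error()):

/* Stand-alone demo: unmasked divide-by-zero -> SIGFPE with FPE_FLTDIV. */
#define _GNU_SOURCE
#include <fenv.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void fpe_handler(int sig, siginfo_t *info, void *ctx)
{
        (void)sig; (void)ctx;
        printf("SIGFPE si_code=%d (FPE_FLTDIV=%d) at %p\n",
               info->si_code, FPE_FLTDIV, info->si_addr);
        _exit(0);               /* the faulting instruction would otherwise re-trap */
}

int main(void)
{
        struct sigaction sa = {
                .sa_sigaction = fpe_handler,
                .sa_flags = SA_SIGINFO,
        };
        volatile double zero = 0.0;
        volatile double r;

        sigaction(SIGFPE, &sa, NULL);
        feenableexcept(FE_DIVBYZERO);   /* unmask the divide-by-zero exception */

        r = 1.0 / zero;                 /* #XF/#MF -> math_error() -> SIGFPE */
        (void)r;
        return 1;
}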
8731089
874
-dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
1090
+DEFINE_IDTENTRY(exc_coprocessor_error)
8751091 {
876
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
877
- math_error(regs, error_code, X86_TRAP_MF);
1092
+ math_error(regs, X86_TRAP_MF);
8781093 }
8791094
880
-dotraplinkage void
881
-do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
1095
+DEFINE_IDTENTRY(exc_simd_coprocessor_error)
8821096 {
883
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
884
- math_error(regs, error_code, X86_TRAP_XF);
1097
+ if (IS_ENABLED(CONFIG_X86_INVD_BUG)) {
1098
+ /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */
1099
+ if (!static_cpu_has(X86_FEATURE_XMM)) {
1100
+ __exc_general_protection(regs, 0);
1101
+ return;
1102
+ }
1103
+ }
1104
+ math_error(regs, X86_TRAP_XF);
8851105 }
8861106
887
-dotraplinkage void
888
-do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
1107
+DEFINE_IDTENTRY(exc_spurious_interrupt_bug)
8891108 {
890
- cond_local_irq_enable(regs);
1109
+ /*
1110
+ * This addresses a Pentium Pro Erratum:
1111
+ *
1112
+ * PROBLEM: If the APIC subsystem is configured in mixed mode with
1113
+ * Virtual Wire mode implemented through the local APIC, an
1114
+ * interrupt vector of 0Fh (Intel reserved encoding) may be
1115
+ * generated by the local APIC (Int 15). This vector may be
1116
+ * generated upon receipt of a spurious interrupt (an interrupt
1117
+ * which is removed before the system receives the INTA sequence)
1118
+ * instead of the programmed 8259 spurious interrupt vector.
1119
+ *
1120
+ * IMPLICATION: The spurious interrupt vector programmed in the
1121
+ * 8259 is normally handled by an operating system's spurious
1122
+ * interrupt handler. However, a vector of 0Fh is unknown to some
1123
+ * operating systems, which would crash if this erratum occurred.
1124
+ *
1125
+ * In theory this could be limited to 32bit, but the handler is not
1126
+ * hurting and who knows which other CPUs suffer from this.
1127
+ */
8911128 }
8921129
893
-dotraplinkage void
894
-do_device_not_available(struct pt_regs *regs, long error_code)
1130
+DEFINE_IDTENTRY(exc_device_not_available)
8951131 {
896
- unsigned long cr0;
897
-
898
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
1132
+ unsigned long cr0 = read_cr0();
8991133
9001134 #ifdef CONFIG_MATH_EMULATION
901
- if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) {
1135
+ if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
9021136 struct math_emu_info info = { };
9031137
9041138 cond_local_irq_enable(regs);
9051139
9061140 info.regs = regs;
9071141 math_emulate(&info);
1142
+
1143
+ cond_local_irq_disable(regs);
9081144 return;
9091145 }
9101146 #endif
9111147
9121148 /* This should not happen. */
913
- cr0 = read_cr0();
9141149 if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
9151150 /* Try to fix it up and carry on. */
9161151 write_cr0(cr0 & ~X86_CR0_TS);
@@ -920,29 +1155,20 @@
9201155 * to kill the task than getting stuck in a never-ending
9211156 * loop of #NM faults.
9221157 */
923
- die("unexpected #NM exception", regs, error_code);
1158
+ die("unexpected #NM exception", regs, 0);
9241159 }
9251160 }
926
-NOKPROBE_SYMBOL(do_device_not_available);
9271161
9281162 #ifdef CONFIG_X86_32
929
-dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
1163
+DEFINE_IDTENTRY_SW(iret_error)
9301164 {
931
- siginfo_t info;
932
-
933
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
9341165 local_irq_enable();
935
-
936
- clear_siginfo(&info);
937
- info.si_signo = SIGILL;
938
- info.si_errno = 0;
939
- info.si_code = ILL_BADSTK;
940
- info.si_addr = NULL;
941
- if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
1166
+ if (notify_die(DIE_TRAP, "iret exception", regs, 0,
9421167 X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
943
- do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
944
- &info);
1168
+ do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0,
1169
+ ILL_BADSTK, (void __user *)NULL);
9451170 }
1171
+ local_irq_disable();
9461172 }
9471173 #endif
9481174
@@ -951,25 +1177,13 @@
9511177 /* Init cpu_entry_area before IST entries are set up */
9521178 setup_cpu_entry_areas();
9531179
1180
+ /* Init GHCB memory pages when running as an SEV-ES guest */
1181
+ sev_es_init_vc_handling();
1182
+
9541183 idt_setup_traps();
9551184
956
- /*
957
- * Set the IDT descriptor to a fixed read-only location, so that the
958
- * "sidt" instruction will not leak the location of the kernel, and
959
- * to defend the IDT against arbitrary memory write vulnerabilities.
960
- * It will be reloaded in cpu_init() */
961
- cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
962
- PAGE_KERNEL_RO);
963
- idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
964
-
965
- /*
966
- * Should be a barrier for any external CPU state:
967
- */
1185
+ cpu_init_exception_handling();
9681186 cpu_init();
9691187
9701188 idt_setup_ist_traps();
971
-
972
- x86_init.irqs.trap_init();
973
-
974
- idt_setup_debugidt_traps();
9751189 }