hc
2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/arch/x86/entry/common.c
....@@ -1,7 +1,7 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * common.c - C code for kernel entry and exit
34 * Copyright (c) 2015 Andrew Lutomirski
4
- * GPL v2
55 *
66 * Based on asm and ptrace code by many authors. The code here originated
77 * in ptrace.c and signal.c.
....@@ -10,365 +10,149 @@
1010 #include <linux/kernel.h>
1111 #include <linux/sched.h>
1212 #include <linux/sched/task_stack.h>
13
+#include <linux/entry-common.h>
1314 #include <linux/mm.h>
1415 #include <linux/smp.h>
1516 #include <linux/errno.h>
1617 #include <linux/ptrace.h>
17
-#include <linux/tracehook.h>
18
-#include <linux/audit.h>
19
-#include <linux/seccomp.h>
20
-#include <linux/signal.h>
2118 #include <linux/export.h>
22
-#include <linux/context_tracking.h>
23
-#include <linux/user-return-notifier.h>
2419 #include <linux/nospec.h>
25
-#include <linux/uprobes.h>
26
-#include <linux/livepatch.h>
2720 #include <linux/syscalls.h>
21
+#include <linux/uaccess.h>
22
+
23
+#ifdef CONFIG_XEN_PV
24
+#include <xen/xen-ops.h>
25
+#include <xen/events.h>
26
+#endif
2827
2928 #include <asm/desc.h>
3029 #include <asm/traps.h>
3130 #include <asm/vdso.h>
32
-#include <linux/uaccess.h>
3331 #include <asm/cpufeature.h>
32
+#include <asm/fpu/api.h>
3433 #include <asm/nospec-branch.h>
35
-
36
-#define CREATE_TRACE_POINTS
37
-#include <trace/events/syscalls.h>
38
-
39
-#ifdef CONFIG_CONTEXT_TRACKING
40
-/* Called on entry from user mode with IRQs off. */
41
-__visible inline void enter_from_user_mode(void)
42
-{
43
- CT_WARN_ON(ct_state() != CONTEXT_USER);
44
- user_exit_irqoff();
45
-}
46
-#else
47
-static inline void enter_from_user_mode(void) {}
48
-#endif
49
-
50
-static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
51
-{
52
-#ifdef CONFIG_X86_64
53
- if (arch == AUDIT_ARCH_X86_64) {
54
- audit_syscall_entry(regs->orig_ax, regs->di,
55
- regs->si, regs->dx, regs->r10);
56
- } else
57
-#endif
58
- {
59
- audit_syscall_entry(regs->orig_ax, regs->bx,
60
- regs->cx, regs->dx, regs->si);
61
- }
62
-}
63
-
64
-/*
65
- * Returns the syscall nr to run (which should match regs->orig_ax) or -1
66
- * to skip the syscall.
67
- */
68
-static long syscall_trace_enter(struct pt_regs *regs)
69
-{
70
- u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
71
-
72
- struct thread_info *ti = current_thread_info();
73
- unsigned long ret = 0;
74
- bool emulated = false;
75
- u32 work;
76
-
77
- if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
78
- BUG_ON(regs != task_pt_regs(current));
79
-
80
- work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
81
-
82
- if (unlikely(work & _TIF_SYSCALL_EMU))
83
- emulated = true;
84
-
85
- if ((emulated || (work & _TIF_SYSCALL_TRACE)) &&
86
- tracehook_report_syscall_entry(regs))
87
- return -1L;
88
-
89
- if (emulated)
90
- return -1L;
91
-
92
-#ifdef CONFIG_SECCOMP
93
- /*
94
- * Do seccomp after ptrace, to catch any tracer changes.
95
- */
96
- if (work & _TIF_SECCOMP) {
97
- struct seccomp_data sd;
98
-
99
- sd.arch = arch;
100
- sd.nr = regs->orig_ax;
101
- sd.instruction_pointer = regs->ip;
102
-#ifdef CONFIG_X86_64
103
- if (arch == AUDIT_ARCH_X86_64) {
104
- sd.args[0] = regs->di;
105
- sd.args[1] = regs->si;
106
- sd.args[2] = regs->dx;
107
- sd.args[3] = regs->r10;
108
- sd.args[4] = regs->r8;
109
- sd.args[5] = regs->r9;
110
- } else
111
-#endif
112
- {
113
- sd.args[0] = regs->bx;
114
- sd.args[1] = regs->cx;
115
- sd.args[2] = regs->dx;
116
- sd.args[3] = regs->si;
117
- sd.args[4] = regs->di;
118
- sd.args[5] = regs->bp;
119
- }
120
-
121
- ret = __secure_computing(&sd);
122
- if (ret == -1)
123
- return ret;
124
- }
125
-#endif
126
-
127
- if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
128
- trace_sys_enter(regs, regs->orig_ax);
129
-
130
- do_audit_syscall_entry(regs, arch);
131
-
132
- return ret ?: regs->orig_ax;
133
-}
134
-
135
-#define EXIT_TO_USERMODE_LOOP_FLAGS \
136
- (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
137
- _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
138
-
139
-static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
140
-{
141
- /*
142
- * In order to return to user mode, we need to have IRQs off with
143
- * none of EXIT_TO_USERMODE_LOOP_FLAGS set. Several of these flags
144
- * can be set at any time on preemptable kernels if we have IRQs on,
145
- * so we need to loop. Disabling preemption wouldn't help: doing the
146
- * work to clear some of the flags can sleep.
147
- */
148
- while (true) {
149
- /* We have work to do. */
150
- local_irq_enable();
151
-
152
- if (cached_flags & _TIF_NEED_RESCHED_MASK)
153
- schedule();
154
-
155
-#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
156
- if (unlikely(current->forced_info.si_signo)) {
157
- struct task_struct *t = current;
158
- force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
159
- t->forced_info.si_signo = 0;
160
- }
161
-#endif
162
- if (cached_flags & _TIF_UPROBE)
163
- uprobe_notify_resume(regs);
164
-
165
- if (cached_flags & _TIF_PATCH_PENDING)
166
- klp_update_patch_state(current);
167
-
168
- /* deal with pending signal delivery */
169
- if (cached_flags & _TIF_SIGPENDING)
170
- do_signal(regs);
171
-
172
- if (cached_flags & _TIF_NOTIFY_RESUME) {
173
- clear_thread_flag(TIF_NOTIFY_RESUME);
174
- tracehook_notify_resume(regs);
175
- rseq_handle_notify_resume(NULL, regs);
176
- }
177
-
178
- if (cached_flags & _TIF_USER_RETURN_NOTIFY)
179
- fire_user_return_notifiers();
180
-
181
- /* Disable IRQs and retry */
182
- local_irq_disable();
183
-
184
- cached_flags = READ_ONCE(current_thread_info()->flags);
185
-
186
- if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
187
- break;
188
- }
189
-}
190
-
191
-/* Called with IRQs disabled. */
192
-__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
193
-{
194
- struct thread_info *ti = current_thread_info();
195
- u32 cached_flags;
196
-
197
- addr_limit_user_check();
198
-
199
- lockdep_assert_irqs_disabled();
200
- lockdep_sys_exit();
201
-
202
- cached_flags = READ_ONCE(ti->flags);
203
-
204
- if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
205
- exit_to_usermode_loop(regs, cached_flags);
206
-
207
-#ifdef CONFIG_COMPAT
208
- /*
209
- * Compat syscalls set TS_COMPAT. Make sure we clear it before
210
- * returning to user mode. We need to clear it *after* signal
211
- * handling, because syscall restart has a fixup for compat
212
- * syscalls. The fixup is exercised by the ptrace_syscall_32
213
- * selftest.
214
- *
215
- * We also need to clear TS_I386_REGS_POKED: the 32-bit tracer
216
- * special case only applies after poking regs and before the
217
- * very next return to user mode.
218
- */
219
- ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
220
-#endif
221
-
222
- user_enter_irqoff();
223
-
224
- mds_user_clear_cpu_buffers();
225
-}
226
-
227
-#define SYSCALL_EXIT_WORK_FLAGS \
228
- (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
229
- _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
230
-
231
-static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
232
-{
233
- bool step;
234
-
235
- audit_syscall_exit(regs);
236
-
237
- if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
238
- trace_sys_exit(regs, regs->ax);
239
-
240
- /*
241
- * If TIF_SYSCALL_EMU is set, we only get here because of
242
- * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
243
- * We already reported this syscall instruction in
244
- * syscall_trace_enter().
245
- */
246
- step = unlikely(
247
- (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
248
- == _TIF_SINGLESTEP);
249
- if (step || cached_flags & _TIF_SYSCALL_TRACE)
250
- tracehook_report_syscall_exit(regs, step);
251
-}
252
-
253
-/*
254
- * Called with IRQs on and fully valid regs. Returns with IRQs off in a
255
- * state such that we can immediately switch to user mode.
256
- */
257
-__visible inline void syscall_return_slowpath(struct pt_regs *regs)
258
-{
259
- struct thread_info *ti = current_thread_info();
260
- u32 cached_flags = READ_ONCE(ti->flags);
261
-
262
- CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
263
-
264
- if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
265
- WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
266
- local_irq_enable();
267
-
268
- rseq_syscall(regs);
269
-
270
- /*
271
- * First do one-time work. If these work items are enabled, we
272
- * want to run them exactly once per syscall exit with IRQs on.
273
- */
274
- if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
275
- syscall_slow_exit_work(regs, cached_flags);
276
-
277
- local_irq_disable();
278
- prepare_exit_to_usermode(regs);
279
-}
34
+#include <asm/io_bitmap.h>
35
+#include <asm/syscall.h>
36
+#include <asm/irq_stack.h>
28037
28138 #ifdef CONFIG_X86_64
282
-__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
39
+__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
28340 {
284
- struct thread_info *ti;
41
+ nr = syscall_enter_from_user_mode(regs, nr);
28542
286
- enter_from_user_mode();
287
- local_irq_enable();
288
- ti = current_thread_info();
289
- if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
290
- nr = syscall_trace_enter(regs);
291
-
292
- /*
293
- * NB: Native and x32 syscalls are dispatched from the same
294
- * table. The only functional difference is the x32 bit in
295
- * regs->orig_ax, which changes the behavior of some syscalls.
296
- */
297
- nr &= __SYSCALL_MASK;
43
+ instrumentation_begin();
29844 if (likely(nr < NR_syscalls)) {
29945 nr = array_index_nospec(nr, NR_syscalls);
30046 regs->ax = sys_call_table[nr](regs);
47
+#ifdef CONFIG_X86_X32_ABI
48
+ } else if (likely((nr & __X32_SYSCALL_BIT) &&
49
+ (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
50
+ nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
51
+ X32_NR_syscalls);
52
+ regs->ax = x32_sys_call_table[nr](regs);
53
+#endif
30154 }
302
-
303
- syscall_return_slowpath(regs);
55
+ instrumentation_end();
56
+ syscall_exit_to_user_mode(regs);
30457 }
30558 #endif
30659
30760 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
308
-/*
309
- * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
310
- * all entry and exit work and returns with IRQs off. This function is
311
- * extremely hot in workloads that use it, and it's usually called from
312
- * do_fast_syscall_32, so forcibly inline it to improve performance.
313
- */
314
-static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
61
+static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
31562 {
316
- struct thread_info *ti = current_thread_info();
317
- unsigned int nr = (unsigned int)regs->orig_ax;
63
+ if (IS_ENABLED(CONFIG_IA32_EMULATION))
64
+ current_thread_info()->status |= TS_COMPAT;
31865
319
-#ifdef CONFIG_IA32_EMULATION
320
- ti->status |= TS_COMPAT;
321
-#endif
66
+ return (unsigned int)regs->orig_ax;
67
+}
32268
323
- if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
324
- /*
325
- * Subtlety here: if ptrace pokes something larger than
326
- * 2^32-1 into orig_ax, this truncates it. This may or
327
- * may not be necessary, but it matches the old asm
328
- * behavior.
329
- */
330
- nr = syscall_trace_enter(regs);
331
- }
332
-
69
+/*
70
+ * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
71
+ */
72
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs,
73
+ unsigned int nr)
74
+{
33375 if (likely(nr < IA32_NR_syscalls)) {
33476 nr = array_index_nospec(nr, IA32_NR_syscalls);
335
-#ifdef CONFIG_IA32_EMULATION
33677 regs->ax = ia32_sys_call_table[nr](regs);
337
-#else
338
- /*
339
- * It's possible that a 32-bit syscall implementation
340
- * takes a 64-bit parameter but nonetheless assumes that
341
- * the high bits are zero. Make sure we zero-extend all
342
- * of the args.
343
- */
344
- regs->ax = ia32_sys_call_table[nr](
345
- (unsigned int)regs->bx, (unsigned int)regs->cx,
346
- (unsigned int)regs->dx, (unsigned int)regs->si,
347
- (unsigned int)regs->di, (unsigned int)regs->bp);
348
-#endif /* CONFIG_IA32_EMULATION */
34978 }
350
-
351
- syscall_return_slowpath(regs);
35279 }
35380
35481 /* Handles int $0x80 */
355
-__visible void do_int80_syscall_32(struct pt_regs *regs)
82
+__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
35683 {
357
- enter_from_user_mode();
358
- local_irq_enable();
359
- do_syscall_32_irqs_on(regs);
84
+ unsigned int nr = syscall_32_enter(regs);
85
+
86
+ /*
87
+ * Subtlety here: if ptrace pokes something larger than 2^32-1 into
88
+ * orig_ax, the unsigned int return value truncates it. This may
89
+ * or may not be necessary, but it matches the old asm behavior.
90
+ */
91
+ nr = (unsigned int)syscall_enter_from_user_mode(regs, nr);
92
+ instrumentation_begin();
93
+
94
+ do_syscall_32_irqs_on(regs, nr);
95
+
96
+ instrumentation_end();
97
+ syscall_exit_to_user_mode(regs);
98
+}
99
+
100
+static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
101
+{
102
+ unsigned int nr = syscall_32_enter(regs);
103
+ int res;
104
+
105
+ /*
106
+ * This cannot use syscall_enter_from_user_mode() as it has to
107
+ * fetch EBP before invoking any of the syscall entry work
108
+ * functions.
109
+ */
110
+ syscall_enter_from_user_mode_prepare(regs);
111
+
112
+ instrumentation_begin();
113
+ /* Fetch EBP from where the vDSO stashed it. */
114
+ if (IS_ENABLED(CONFIG_X86_64)) {
115
+ /*
116
+ * Micro-optimization: the pointer we're following is
117
+ * explicitly 32 bits, so it can't be out of range.
118
+ */
119
+ res = __get_user(*(u32 *)&regs->bp,
120
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
121
+ } else {
122
+ res = get_user(*(u32 *)&regs->bp,
123
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
124
+ }
125
+
126
+ if (res) {
127
+ /* User code screwed up. */
128
+ regs->ax = -EFAULT;
129
+
130
+ local_irq_disable();
131
+ instrumentation_end();
132
+ irqentry_exit_to_user_mode(regs);
133
+ return false;
134
+ }
135
+
136
+ /* The cast truncates any ptrace-induced syscall nr > 2^32 - 1 */
137
+ nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);
138
+
139
+ /* Now this is just like a normal syscall. */
140
+ do_syscall_32_irqs_on(regs, nr);
141
+
142
+ instrumentation_end();
143
+ syscall_exit_to_user_mode(regs);
144
+ return true;
360145 }
361146
362147 /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
363
-__visible long do_fast_syscall_32(struct pt_regs *regs)
148
+__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
364149 {
365150 /*
366151 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
367152 * convention. Adjust regs so it looks like we entered using int80.
368153 */
369
-
370154 unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
371
- vdso_image_32.sym_int80_landing_pad;
155
+ vdso_image_32.sym_int80_landing_pad;
372156
373157 /*
374158 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
....@@ -377,34 +161,9 @@
377161 */
378162 regs->ip = landing_pad;
379163
380
- enter_from_user_mode();
381
-
382
- local_irq_enable();
383
-
384
- /* Fetch EBP from where the vDSO stashed it. */
385
- if (
386
-#ifdef CONFIG_X86_64
387
- /*
388
- * Micro-optimization: the pointer we're following is explicitly
389
- * 32 bits, so it can't be out of range.
390
- */
391
- __get_user(*(u32 *)&regs->bp,
392
- (u32 __user __force *)(unsigned long)(u32)regs->sp)
393
-#else
394
- get_user(*(u32 *)&regs->bp,
395
- (u32 __user __force *)(unsigned long)(u32)regs->sp)
396
-#endif
397
- ) {
398
-
399
- /* User code screwed up. */
400
- local_irq_disable();
401
- regs->ax = -EFAULT;
402
- prepare_exit_to_usermode(regs);
403
- return 0; /* Keep it simple: use IRET. */
404
- }
405
-
406
- /* Now this is just like a normal syscall. */
407
- do_syscall_32_irqs_on(regs);
164
+ /* Invoke the syscall. If it failed, keep it simple: use IRET. */
165
+ if (!__do_fast_syscall_32(regs))
166
+ return 0;
408167
409168 #ifdef CONFIG_X86_64
410169 /*
....@@ -436,4 +195,94 @@
436195 (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
437196 #endif
438197 }
198
+
199
+/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
200
+__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
201
+{
202
+ /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
203
+ regs->sp = regs->bp;
204
+
205
+ /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
206
+ regs->flags |= X86_EFLAGS_IF;
207
+
208
+ return do_fast_syscall_32(regs);
209
+}
439210 #endif
211
+
212
+SYSCALL_DEFINE0(ni_syscall)
213
+{
214
+ return -ENOSYS;
215
+}
216
+
217
+#ifdef CONFIG_XEN_PV
218
+#ifndef CONFIG_PREEMPTION
219
+/*
220
+ * Some hypercalls issued by the toolstack can take many tens of
221
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
222
+ * be voluntarily preempted even if full kernel preemption is
223
+ * disabled.
224
+ *
225
+ * Such preemptible hypercalls are bracketed by
226
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
227
+ * calls.
228
+ */
229
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
230
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
231
+
232
+/*
233
+ * In case of scheduling the flag must be cleared and restored after
234
+ * returning from schedule as the task might move to a different CPU.
235
+ */
236
+static __always_inline bool get_and_clear_inhcall(void)
237
+{
238
+ bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
239
+
240
+ __this_cpu_write(xen_in_preemptible_hcall, false);
241
+ return inhcall;
242
+}
243
+
244
+static __always_inline void restore_inhcall(bool inhcall)
245
+{
246
+ __this_cpu_write(xen_in_preemptible_hcall, inhcall);
247
+}
248
+#else
249
+static __always_inline bool get_and_clear_inhcall(void) { return false; }
250
+static __always_inline void restore_inhcall(bool inhcall) { }
251
+#endif
252
+
253
+static void __xen_pv_evtchn_do_upcall(void)
254
+{
255
+ irq_enter_rcu();
256
+ inc_irq_stat(irq_hv_callback_count);
257
+
258
+ xen_hvm_evtchn_do_upcall();
259
+
260
+ irq_exit_rcu();
261
+}
262
+
263
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
264
+{
265
+ struct pt_regs *old_regs;
266
+ bool inhcall;
267
+ irqentry_state_t state;
268
+
269
+ state = irqentry_enter(regs);
270
+ old_regs = set_irq_regs(regs);
271
+
272
+ instrumentation_begin();
273
+ run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
274
+ instrumentation_end();
275
+
276
+ set_irq_regs(old_regs);
277
+
278
+ inhcall = get_and_clear_inhcall();
279
+ if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
280
+ instrumentation_begin();
281
+ irqentry_exit_cond_resched();
282
+ instrumentation_end();
283
+ restore_inhcall(inhcall);
284
+ } else {
285
+ irqentry_exit(regs, state);
286
+ }
287
+}
288
+#endif /* CONFIG_XEN_PV */