hc
2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/arch/x86/entry/common.c
....@@ -1,7 +1,7 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * common.c - C code for kernel entry and exit
34 * Copyright (c) 2015 Andrew Lutomirski
4
- * GPL v2
55 *
66 * Based on asm and ptrace code by many authors. The code here originated
77 * in ptrace.c and signal.c.
....@@ -10,358 +10,149 @@
1010 #include <linux/kernel.h>
1111 #include <linux/sched.h>
1212 #include <linux/sched/task_stack.h>
13
+#include <linux/entry-common.h>
1314 #include <linux/mm.h>
1415 #include <linux/smp.h>
1516 #include <linux/errno.h>
1617 #include <linux/ptrace.h>
17
-#include <linux/tracehook.h>
18
-#include <linux/audit.h>
19
-#include <linux/seccomp.h>
20
-#include <linux/signal.h>
2118 #include <linux/export.h>
22
-#include <linux/context_tracking.h>
23
-#include <linux/user-return-notifier.h>
2419 #include <linux/nospec.h>
25
-#include <linux/uprobes.h>
26
-#include <linux/livepatch.h>
2720 #include <linux/syscalls.h>
21
+#include <linux/uaccess.h>
22
+
23
+#ifdef CONFIG_XEN_PV
24
+#include <xen/xen-ops.h>
25
+#include <xen/events.h>
26
+#endif
2827
2928 #include <asm/desc.h>
3029 #include <asm/traps.h>
3130 #include <asm/vdso.h>
32
-#include <linux/uaccess.h>
3331 #include <asm/cpufeature.h>
32
+#include <asm/fpu/api.h>
3433 #include <asm/nospec-branch.h>
35
-
36
-#define CREATE_TRACE_POINTS
37
-#include <trace/events/syscalls.h>
38
-
39
-#ifdef CONFIG_CONTEXT_TRACKING
40
-/* Called on entry from user mode with IRQs off. */
41
-__visible inline void enter_from_user_mode(void)
42
-{
43
- CT_WARN_ON(ct_state() != CONTEXT_USER);
44
- user_exit_irqoff();
45
-}
46
-#else
47
-static inline void enter_from_user_mode(void) {}
48
-#endif
49
-
50
-static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
51
-{
52
-#ifdef CONFIG_X86_64
53
- if (arch == AUDIT_ARCH_X86_64) {
54
- audit_syscall_entry(regs->orig_ax, regs->di,
55
- regs->si, regs->dx, regs->r10);
56
- } else
57
-#endif
58
- {
59
- audit_syscall_entry(regs->orig_ax, regs->bx,
60
- regs->cx, regs->dx, regs->si);
61
- }
62
-}
63
-
64
-/*
65
- * Returns the syscall nr to run (which should match regs->orig_ax) or -1
66
- * to skip the syscall.
67
- */
68
-static long syscall_trace_enter(struct pt_regs *regs)
69
-{
70
- u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
71
-
72
- struct thread_info *ti = current_thread_info();
73
- unsigned long ret = 0;
74
- bool emulated = false;
75
- u32 work;
76
-
77
- if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
78
- BUG_ON(regs != task_pt_regs(current));
79
-
80
- work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
81
-
82
- if (unlikely(work & _TIF_SYSCALL_EMU))
83
- emulated = true;
84
-
85
- if ((emulated || (work & _TIF_SYSCALL_TRACE)) &&
86
- tracehook_report_syscall_entry(regs))
87
- return -1L;
88
-
89
- if (emulated)
90
- return -1L;
91
-
92
-#ifdef CONFIG_SECCOMP
93
- /*
94
- * Do seccomp after ptrace, to catch any tracer changes.
95
- */
96
- if (work & _TIF_SECCOMP) {
97
- struct seccomp_data sd;
98
-
99
- sd.arch = arch;
100
- sd.nr = regs->orig_ax;
101
- sd.instruction_pointer = regs->ip;
102
-#ifdef CONFIG_X86_64
103
- if (arch == AUDIT_ARCH_X86_64) {
104
- sd.args[0] = regs->di;
105
- sd.args[1] = regs->si;
106
- sd.args[2] = regs->dx;
107
- sd.args[3] = regs->r10;
108
- sd.args[4] = regs->r8;
109
- sd.args[5] = regs->r9;
110
- } else
111
-#endif
112
- {
113
- sd.args[0] = regs->bx;
114
- sd.args[1] = regs->cx;
115
- sd.args[2] = regs->dx;
116
- sd.args[3] = regs->si;
117
- sd.args[4] = regs->di;
118
- sd.args[5] = regs->bp;
119
- }
120
-
121
- ret = __secure_computing(&sd);
122
- if (ret == -1)
123
- return ret;
124
- }
125
-#endif
126
-
127
- if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
128
- trace_sys_enter(regs, regs->orig_ax);
129
-
130
- do_audit_syscall_entry(regs, arch);
131
-
132
- return ret ?: regs->orig_ax;
133
-}
134
-
135
-#define EXIT_TO_USERMODE_LOOP_FLAGS \
136
- (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
137
- _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
138
-
139
-static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
140
-{
141
- /*
142
- * In order to return to user mode, we need to have IRQs off with
143
- * none of EXIT_TO_USERMODE_LOOP_FLAGS set. Several of these flags
144
- * can be set at any time on preemptable kernels if we have IRQs on,
145
- * so we need to loop. Disabling preemption wouldn't help: doing the
146
- * work to clear some of the flags can sleep.
147
- */
148
- while (true) {
149
- /* We have work to do. */
150
- local_irq_enable();
151
-
152
- if (cached_flags & _TIF_NEED_RESCHED)
153
- schedule();
154
-
155
- if (cached_flags & _TIF_UPROBE)
156
- uprobe_notify_resume(regs);
157
-
158
- if (cached_flags & _TIF_PATCH_PENDING)
159
- klp_update_patch_state(current);
160
-
161
- /* deal with pending signal delivery */
162
- if (cached_flags & _TIF_SIGPENDING)
163
- do_signal(regs);
164
-
165
- if (cached_flags & _TIF_NOTIFY_RESUME) {
166
- clear_thread_flag(TIF_NOTIFY_RESUME);
167
- tracehook_notify_resume(regs);
168
- rseq_handle_notify_resume(NULL, regs);
169
- }
170
-
171
- if (cached_flags & _TIF_USER_RETURN_NOTIFY)
172
- fire_user_return_notifiers();
173
-
174
- /* Disable IRQs and retry */
175
- local_irq_disable();
176
-
177
- cached_flags = READ_ONCE(current_thread_info()->flags);
178
-
179
- if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
180
- break;
181
- }
182
-}
183
-
184
-/* Called with IRQs disabled. */
185
-__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
186
-{
187
- struct thread_info *ti = current_thread_info();
188
- u32 cached_flags;
189
-
190
- addr_limit_user_check();
191
-
192
- lockdep_assert_irqs_disabled();
193
- lockdep_sys_exit();
194
-
195
- cached_flags = READ_ONCE(ti->flags);
196
-
197
- if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
198
- exit_to_usermode_loop(regs, cached_flags);
199
-
200
-#ifdef CONFIG_COMPAT
201
- /*
202
- * Compat syscalls set TS_COMPAT. Make sure we clear it before
203
- * returning to user mode. We need to clear it *after* signal
204
- * handling, because syscall restart has a fixup for compat
205
- * syscalls. The fixup is exercised by the ptrace_syscall_32
206
- * selftest.
207
- *
208
- * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer
209
- * special case only applies after poking regs and before the
210
- * very next return to user mode.
211
- */
212
- ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
213
-#endif
214
-
215
- user_enter_irqoff();
216
-
217
- mds_user_clear_cpu_buffers();
218
-}
219
-
220
-#define SYSCALL_EXIT_WORK_FLAGS \
221
- (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
222
- _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
223
-
224
-static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
225
-{
226
- bool step;
227
-
228
- audit_syscall_exit(regs);
229
-
230
- if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
231
- trace_sys_exit(regs, regs->ax);
232
-
233
- /*
234
- * If TIF_SYSCALL_EMU is set, we only get here because of
235
- * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
236
- * We already reported this syscall instruction in
237
- * syscall_trace_enter().
238
- */
239
- step = unlikely(
240
- (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
241
- == _TIF_SINGLESTEP);
242
- if (step || cached_flags & _TIF_SYSCALL_TRACE)
243
- tracehook_report_syscall_exit(regs, step);
244
-}
245
-
246
-/*
247
- * Called with IRQs on and fully valid regs. Returns with IRQs off in a
248
- * state such that we can immediately switch to user mode.
249
- */
250
-__visible inline void syscall_return_slowpath(struct pt_regs *regs)
251
-{
252
- struct thread_info *ti = current_thread_info();
253
- u32 cached_flags = READ_ONCE(ti->flags);
254
-
255
- CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
256
-
257
- if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
258
- WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
259
- local_irq_enable();
260
-
261
- rseq_syscall(regs);
262
-
263
- /*
264
- * First do one-time work. If these work items are enabled, we
265
- * want to run them exactly once per syscall exit with IRQs on.
266
- */
267
- if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
268
- syscall_slow_exit_work(regs, cached_flags);
269
-
270
- local_irq_disable();
271
- prepare_exit_to_usermode(regs);
272
-}
34
+#include <asm/io_bitmap.h>
35
+#include <asm/syscall.h>
36
+#include <asm/irq_stack.h>
27337
27438 #ifdef CONFIG_X86_64
275
-__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
39
+__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
27640 {
277
- struct thread_info *ti;
41
+ nr = syscall_enter_from_user_mode(regs, nr);
27842
279
- enter_from_user_mode();
280
- local_irq_enable();
281
- ti = current_thread_info();
282
- if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
283
- nr = syscall_trace_enter(regs);
284
-
285
- /*
286
- * NB: Native and x32 syscalls are dispatched from the same
287
- * table. The only functional difference is the x32 bit in
288
- * regs->orig_ax, which changes the behavior of some syscalls.
289
- */
290
- nr &= __SYSCALL_MASK;
43
+ instrumentation_begin();
29144 if (likely(nr < NR_syscalls)) {
29245 nr = array_index_nospec(nr, NR_syscalls);
29346 regs->ax = sys_call_table[nr](regs);
47
+#ifdef CONFIG_X86_X32_ABI
48
+ } else if (likely((nr & __X32_SYSCALL_BIT) &&
49
+ (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
50
+ nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
51
+ X32_NR_syscalls);
52
+ regs->ax = x32_sys_call_table[nr](regs);
53
+#endif
29454 }
295
-
296
- syscall_return_slowpath(regs);
55
+ instrumentation_end();
56
+ syscall_exit_to_user_mode(regs);
29757 }
29858 #endif
29959
30060 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
301
-/*
302
- * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
303
- * all entry and exit work and returns with IRQs off. This function is
304
- * extremely hot in workloads that use it, and it's usually called from
305
- * do_fast_syscall_32, so forcibly inline it to improve performance.
306
- */
307
-static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
61
+static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
30862 {
309
- struct thread_info *ti = current_thread_info();
310
- unsigned int nr = (unsigned int)regs->orig_ax;
63
+ if (IS_ENABLED(CONFIG_IA32_EMULATION))
64
+ current_thread_info()->status |= TS_COMPAT;
31165
312
-#ifdef CONFIG_IA32_EMULATION
313
- ti->status |= TS_COMPAT;
314
-#endif
66
+ return (unsigned int)regs->orig_ax;
67
+}
31568
316
- if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
317
- /*
318
- * Subtlety here: if ptrace pokes something larger than
319
- * 2^32-1 into orig_ax, this truncates it. This may or
320
- * may not be necessary, but it matches the old asm
321
- * behavior.
322
- */
323
- nr = syscall_trace_enter(regs);
324
- }
325
-
69
+/*
70
+ * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
71
+ */
72
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs,
73
+ unsigned int nr)
74
+{
32675 if (likely(nr < IA32_NR_syscalls)) {
32776 nr = array_index_nospec(nr, IA32_NR_syscalls);
328
-#ifdef CONFIG_IA32_EMULATION
32977 regs->ax = ia32_sys_call_table[nr](regs);
330
-#else
331
- /*
332
- * It's possible that a 32-bit syscall implementation
333
- * takes a 64-bit parameter but nonetheless assumes that
334
- * the high bits are zero. Make sure we zero-extend all
335
- * of the args.
336
- */
337
- regs->ax = ia32_sys_call_table[nr](
338
- (unsigned int)regs->bx, (unsigned int)regs->cx,
339
- (unsigned int)regs->dx, (unsigned int)regs->si,
340
- (unsigned int)regs->di, (unsigned int)regs->bp);
341
-#endif /* CONFIG_IA32_EMULATION */
34278 }
343
-
344
- syscall_return_slowpath(regs);
34579 }
34680
34781 /* Handles int $0x80 */
348
-__visible void do_int80_syscall_32(struct pt_regs *regs)
82
+__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
34983 {
350
- enter_from_user_mode();
351
- local_irq_enable();
352
- do_syscall_32_irqs_on(regs);
84
+ unsigned int nr = syscall_32_enter(regs);
85
+
86
+ /*
87
+ * Subtlety here: if ptrace pokes something larger than 2^32-1 into
88
+ * orig_ax, the unsigned int return value truncates it. This may
89
+ * or may not be necessary, but it matches the old asm behavior.
90
+ */
91
+ nr = (unsigned int)syscall_enter_from_user_mode(regs, nr);
92
+ instrumentation_begin();
93
+
94
+ do_syscall_32_irqs_on(regs, nr);
95
+
96
+ instrumentation_end();
97
+ syscall_exit_to_user_mode(regs);
98
+}
99
+
100
+static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
101
+{
102
+ unsigned int nr = syscall_32_enter(regs);
103
+ int res;
104
+
105
+ /*
106
+ * This cannot use syscall_enter_from_user_mode() as it has to
107
+ * fetch EBP before invoking any of the syscall entry work
108
+ * functions.
109
+ */
110
+ syscall_enter_from_user_mode_prepare(regs);
111
+
112
+ instrumentation_begin();
113
+ /* Fetch EBP from where the vDSO stashed it. */
114
+ if (IS_ENABLED(CONFIG_X86_64)) {
115
+ /*
116
+ * Micro-optimization: the pointer we're following is
117
+ * explicitly 32 bits, so it can't be out of range.
118
+ */
119
+ res = __get_user(*(u32 *)&regs->bp,
120
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
121
+ } else {
122
+ res = get_user(*(u32 *)&regs->bp,
123
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
124
+ }
125
+
126
+ if (res) {
127
+ /* User code screwed up. */
128
+ regs->ax = -EFAULT;
129
+
130
+ local_irq_disable();
131
+ instrumentation_end();
132
+ irqentry_exit_to_user_mode(regs);
133
+ return false;
134
+ }
135
+
136
+ /* The cast truncates any ptrace-induced syscall nr > 2^32 - 1 */
137
+ nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);
138
+
139
+ /* Now this is just like a normal syscall. */
140
+ do_syscall_32_irqs_on(regs, nr);
141
+
142
+ instrumentation_end();
143
+ syscall_exit_to_user_mode(regs);
144
+ return true;
353145 }
354146
355147 /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
356
-__visible long do_fast_syscall_32(struct pt_regs *regs)
148
+__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
357149 {
358150 /*
359151 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
360152 * convention. Adjust regs so it looks like we entered using int80.
361153 */
362
-
363154 unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
364
- vdso_image_32.sym_int80_landing_pad;
155
+ vdso_image_32.sym_int80_landing_pad;
365156
366157 /*
367158 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
....@@ -370,34 +161,9 @@
370161 */
371162 regs->ip = landing_pad;
372163
373
- enter_from_user_mode();
374
-
375
- local_irq_enable();
376
-
377
- /* Fetch EBP from where the vDSO stashed it. */
378
- if (
379
-#ifdef CONFIG_X86_64
380
- /*
381
- * Micro-optimization: the pointer we're following is explicitly
382
- * 32 bits, so it can't be out of range.
383
- */
384
- __get_user(*(u32 *)&regs->bp,
385
- (u32 __user __force *)(unsigned long)(u32)regs->sp)
386
-#else
387
- get_user(*(u32 *)&regs->bp,
388
- (u32 __user __force *)(unsigned long)(u32)regs->sp)
389
-#endif
390
- ) {
391
-
392
- /* User code screwed up. */
393
- local_irq_disable();
394
- regs->ax = -EFAULT;
395
- prepare_exit_to_usermode(regs);
396
- return 0; /* Keep it simple: use IRET. */
397
- }
398
-
399
- /* Now this is just like a normal syscall. */
400
- do_syscall_32_irqs_on(regs);
164
+ /* Invoke the syscall. If it failed, keep it simple: use IRET. */
165
+ if (!__do_fast_syscall_32(regs))
166
+ return 0;
401167
402168 #ifdef CONFIG_X86_64
403169 /*
....@@ -429,4 +195,94 @@
429195 (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
430196 #endif
431197 }
198
+
199
+/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
200
+__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
201
+{
202
+ /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
203
+ regs->sp = regs->bp;
204
+
205
+ /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
206
+ regs->flags |= X86_EFLAGS_IF;
207
+
208
+ return do_fast_syscall_32(regs);
209
+}
432210 #endif
211
+
212
+SYSCALL_DEFINE0(ni_syscall)
213
+{
214
+ return -ENOSYS;
215
+}
216
+
217
+#ifdef CONFIG_XEN_PV
218
+#ifndef CONFIG_PREEMPTION
219
+/*
220
+ * Some hypercalls issued by the toolstack can take many tens of
221
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
222
+ * be voluntarily preempted even if full kernel preemption is
223
+ * disabled.
224
+ *
225
+ * Such preemptible hypercalls are bracketed by
226
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
227
+ * calls.
228
+ */
229
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
230
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
231
+
232
+/*
233
+ * In case of scheduling the flag must be cleared and restored after
234
+ * returning from schedule as the task might move to a different CPU.
235
+ */
236
+static __always_inline bool get_and_clear_inhcall(void)
237
+{
238
+ bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
239
+
240
+ __this_cpu_write(xen_in_preemptible_hcall, false);
241
+ return inhcall;
242
+}
243
+
244
+static __always_inline void restore_inhcall(bool inhcall)
245
+{
246
+ __this_cpu_write(xen_in_preemptible_hcall, inhcall);
247
+}
248
+#else
249
+static __always_inline bool get_and_clear_inhcall(void) { return false; }
250
+static __always_inline void restore_inhcall(bool inhcall) { }
251
+#endif
252
+
253
+static void __xen_pv_evtchn_do_upcall(void)
254
+{
255
+ irq_enter_rcu();
256
+ inc_irq_stat(irq_hv_callback_count);
257
+
258
+ xen_hvm_evtchn_do_upcall();
259
+
260
+ irq_exit_rcu();
261
+}
262
+
263
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
264
+{
265
+ struct pt_regs *old_regs;
266
+ bool inhcall;
267
+ irqentry_state_t state;
268
+
269
+ state = irqentry_enter(regs);
270
+ old_regs = set_irq_regs(regs);
271
+
272
+ instrumentation_begin();
273
+ run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
274
+ instrumentation_end();
275
+
276
+ set_irq_regs(old_regs);
277
+
278
+ inhcall = get_and_clear_inhcall();
279
+ if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
280
+ instrumentation_begin();
281
+ irqentry_exit_cond_resched();
282
+ instrumentation_end();
283
+ restore_inhcall(inhcall);
284
+ } else {
285
+ irqentry_exit(regs, state);
286
+ }
287
+}
288
+#endif /* CONFIG_XEN_PV */