2024-02-19 1c055e55a242a33e574e48be530e06770a210dcd
kernel/arch/x86/kernel/process.c
@@ -22,11 +22,12 @@
 #include <linux/utsname.h>
 #include <linux/stackprotector.h>
 #include <linux/cpuidle.h>
+#include <linux/acpi.h>
+#include <linux/elf-randomize.h>
 #include <trace/events/power.h>
 #include <linux/hw_breakpoint.h>
 #include <asm/cpu.h>
 #include <asm/apic.h>
-#include <asm/syscalls.h>
 #include <linux/uaccess.h>
 #include <asm/mwait.h>
 #include <asm/fpu/internal.h>
@@ -39,6 +40,9 @@
 #include <asm/desc.h>
 #include <asm/prctl.h>
 #include <asm/spec-ctrl.h>
+#include <asm/io_bitmap.h>
+#include <asm/proto.h>
+#include <asm/frame.h>
 
 #include "process.h"
 
@@ -69,18 +73,9 @@
 #ifdef CONFIG_X86_32
 		.ss0 = __KERNEL_DS,
 		.ss1 = __KERNEL_CS,
-		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
 #endif
+		.io_bitmap_base	= IO_BITMAP_OFFSET_INVALID,
 	},
-#ifdef CONFIG_X86_32
-	/*
-	 * Note that the .io_bitmap member must be extra-big. This is because
-	 * the CPU will access an additional byte beyond the end of the IO
-	 * permission bitmap. The extra byte must be all 1 bits, and must
-	 * be within the limit.
-	 */
-	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
-#endif
 };
 EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
 
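
Note: the deleted initializer above encodes a real hardware rule. On an I/O port access the CPU may consult the byte following the last bitmap byte it needs, so the map must be trailed by an all-ones terminator byte inside the TSS limit — hence the extra element in the old `[0 ... IO_BITMAP_LONGS]` initializer. A rough userspace model of the permission check (illustrative only; the real check is done by the CPU, and `io_access_denied` is an invented name):

	#include <stdint.h>
	#include <stdio.h>

	/* One bit per port, bit set => access denied. A multi-byte access
	 * (e.g. outw) tests every covered port bit, which can reach into the
	 * byte after the nominal 8192-byte map - that is why the terminator
	 * byte must be all ones. */
	static int io_access_denied(const uint8_t *bitmap, unsigned int port,
				    unsigned int width)
	{
		unsigned int bit;

		for (bit = port; bit < port + width; bit++) {
			if (bitmap[bit / 8] & (1u << (bit % 8)))
				return 1;
		}
		return 0;
	}

	int main(void)
	{
		uint8_t map[8193];	/* 8192-byte map + all-ones terminator */
		unsigned int i;

		for (i = 0; i < sizeof(map); i++)
			map[i] = 0xff;			/* deny everything */
		map[0x70 / 8] &= ~(3u << (0x70 % 8));	/* allow ports 0x70-0x71 */

		printf("word access at 0x70 allowed: %d\n",
		       !io_access_denied(map, 0x70, 2));
		return 0;
	}
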
@@ -98,35 +93,116 @@
 	dst->thread.vm86 = NULL;
 #endif
 
-	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
+	return fpu__copy(dst, src);
 }
 
 /*
- * Free current thread data structures etc..
+ * Free thread data structures etc..
  */
 void exit_thread(struct task_struct *tsk)
 {
 	struct thread_struct *t = &tsk->thread;
-	unsigned long *bp = t->io_bitmap_ptr;
 	struct fpu *fpu = &t->fpu;
 
-	if (bp) {
-		struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
-
-		t->io_bitmap_ptr = NULL;
-		clear_thread_flag(TIF_IO_BITMAP);
-		/*
-		 * Careful, clear this in the TSS too:
-		 */
-		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
-		t->io_bitmap_max = 0;
-		put_cpu();
-		kfree(bp);
-	}
+	if (test_thread_flag(TIF_IO_BITMAP))
+		io_bitmap_exit(tsk);
 
 	free_vm86(t);
 
 	fpu__drop(fpu);
+}
+
+static int set_new_tls(struct task_struct *p, unsigned long tls)
+{
+	struct user_desc __user *utls = (struct user_desc __user *)tls;
+
+	if (in_ia32_syscall())
+		return do_set_thread_area(p, -1, utls, 0);
+	else
+		return do_set_thread_area_64(p, ARCH_SET_FS, tls);
+}
+
+int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
+		struct task_struct *p, unsigned long tls)
+{
+	struct inactive_task_frame *frame;
+	struct fork_frame *fork_frame;
+	struct pt_regs *childregs;
+	int ret = 0;
+
+	childregs = task_pt_regs(p);
+	fork_frame = container_of(childregs, struct fork_frame, regs);
+	frame = &fork_frame->frame;
+
+	frame->bp = encode_frame_pointer(childregs);
+	frame->ret_addr = (unsigned long) ret_from_fork;
+	p->thread.sp = (unsigned long) fork_frame;
+	p->thread.io_bitmap = NULL;
+	p->thread.iopl_warn = 0;
+	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
+#ifdef CONFIG_X86_64
+	current_save_fsgs();
+	p->thread.fsindex = current->thread.fsindex;
+	p->thread.fsbase = current->thread.fsbase;
+	p->thread.gsindex = current->thread.gsindex;
+	p->thread.gsbase = current->thread.gsbase;
+
+	savesegment(es, p->thread.es);
+	savesegment(ds, p->thread.ds);
+#else
+	p->thread.sp0 = (unsigned long) (childregs + 1);
+	/*
+	 * Clear all status flags including IF and set fixed bit. 64bit
+	 * does not have this initialization as the frame does not contain
+	 * flags. The flags consistency (especially vs. AC) is there
+	 * ensured via objtool, which lacks 32bit support.
+	 */
+	frame->flags = X86_EFLAGS_FIXED;
+#endif
+
+	/* Kernel thread ? */
+	if (unlikely(p->flags & PF_KTHREAD)) {
+		memset(childregs, 0, sizeof(struct pt_regs));
+		kthread_frame_init(frame, sp, arg);
+		return 0;
+	}
+
+	frame->bx = 0;
+	*childregs = *current_pt_regs();
+	childregs->ax = 0;
+	if (sp)
+		childregs->sp = sp;
+
+#ifdef CONFIG_X86_32
+	task_user_gs(p) = get_user_gs(current_pt_regs());
+#endif
+
+	if (unlikely(p->flags & PF_IO_WORKER)) {
+		/*
+		 * An IO thread is a user space thread, but it doesn't
+		 * return to ret_after_fork().
+		 *
+		 * In order to indicate that to tools like gdb,
+		 * we reset the stack and instruction pointers.
+		 *
+		 * It does the same kernel frame setup to return to a kernel
+		 * function that a kernel thread does.
+		 */
+		childregs->sp = 0;
+		childregs->ip = 0;
+		kthread_frame_init(frame, sp, arg);
+		return 0;
+	}
+
+	/* Set a new TLS for the child thread? */
+	if (clone_flags & CLONE_SETTLS)
+		ret = set_new_tls(p, tls);
+
+	if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
+		io_bitmap_share(p);
+
+	return ret;
 }
 
 void flush_thread(void)
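
Note: the container_of() recovery in copy_thread() works because the stack layout is fixed — struct fork_frame places the inactive_task_frame immediately before the child's pt_regs. A simplified, userspace-compilable model of that recovery (the struct fields are trimmed stubs, not the kernel's full definitions):

	#include <stddef.h>
	#include <stdio.h>

	struct pt_regs { unsigned long ax, sp, ip; };
	struct inactive_task_frame { unsigned long bp, ret_addr, bx; };
	struct fork_frame {
		struct inactive_task_frame frame;
		struct pt_regs regs;
	};

	/* Same trick as the kernel macro: step back from a member pointer
	 * to the enclosing structure. */
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	int main(void)
	{
		struct fork_frame ff;
		struct pt_regs *childregs = &ff.regs;
		struct fork_frame *recovered =
			container_of(childregs, struct fork_frame, regs);

		printf("recovered == &ff: %d\n", recovered == &ff);
		return 0;
	}
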
@@ -136,7 +212,7 @@
 	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 
-	fpu__clear(&tsk->thread.fpu);
+	fpu__clear_all(&tsk->thread.fpu);
 }
 
 void disable_TSC(void)
@@ -233,7 +309,7 @@
 
 static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled)
 {
-	if (!static_cpu_has(X86_FEATURE_CPUID_FAULT))
+	if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT))
 		return -ENODEV;
 
 	if (cpuid_enabled)
@@ -252,33 +328,96 @@
 	/* If cpuid was previously disabled for this task, re-enable it. */
 	if (test_thread_flag(TIF_NOCPUID))
 		enable_cpuid();
-}
 
-static inline void switch_to_bitmap(struct thread_struct *prev,
-				    struct thread_struct *next,
-				    unsigned long tifp, unsigned long tifn)
-{
-	struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
-
-	if (tifn & _TIF_IO_BITMAP) {
-		/*
-		 * Copy the relevant range of the IO bitmap.
-		 * Normally this is 128 bytes or less:
-		 */
-		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
-		       max(prev->io_bitmap_max, next->io_bitmap_max));
-		/*
-		 * Make sure that the TSS limit is correct for the CPU
-		 * to notice the IO bitmap.
-		 */
-		refresh_tss_limit();
-	} else if (tifp & _TIF_IO_BITMAP) {
-		/*
-		 * Clear any possible leftover bits:
-		 */
-		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+	/*
+	 * Don't inherit TIF_SSBD across exec boundary when
+	 * PR_SPEC_DISABLE_NOEXEC is used.
+	 */
+	if (test_thread_flag(TIF_SSBD) &&
+	    task_spec_ssb_noexec(current)) {
+		clear_thread_flag(TIF_SSBD);
+		task_clear_spec_ssb_disable(current);
+		task_clear_spec_ssb_noexec(current);
+		speculation_ctrl_update(task_thread_info(current)->flags);
 	}
 }
+
+#ifdef CONFIG_X86_IOPL_IOPERM
+static inline void switch_to_bitmap(unsigned long tifp)
+{
+	/*
+	 * Invalidate I/O bitmap if the previous task used it. This prevents
+	 * any possible leakage of an active I/O bitmap.
+	 *
+	 * If the next task has an I/O bitmap it will handle it on exit to
+	 * user mode.
+	 */
+	if (tifp & _TIF_IO_BITMAP)
+		tss_invalidate_io_bitmap();
+}
+
+static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
+{
+	/*
+	 * Copy at least the byte range of the incoming tasks bitmap which
+	 * covers the permitted I/O ports.
+	 *
+	 * If the previous task which used an I/O bitmap had more bits
+	 * permitted, then the copy needs to cover those as well so they
+	 * get turned off.
+	 */
+	memcpy(tss->io_bitmap.bitmap, iobm->bitmap,
+	       max(tss->io_bitmap.prev_max, iobm->max));
+
+	/*
+	 * Store the new max and the sequence number of this bitmap
+	 * and a pointer to the bitmap itself.
+	 */
+	tss->io_bitmap.prev_max = iobm->max;
+	tss->io_bitmap.prev_sequence = iobm->sequence;
+}
+
+/**
+ * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode
+ */
+void native_tss_update_io_bitmap(void)
+{
+	struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+	struct thread_struct *t = &current->thread;
+	u16 *base = &tss->x86_tss.io_bitmap_base;
+
+	if (!test_thread_flag(TIF_IO_BITMAP)) {
+		native_tss_invalidate_io_bitmap();
+		return;
+	}
+
+	if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
+		*base = IO_BITMAP_OFFSET_VALID_ALL;
+	} else {
+		struct io_bitmap *iobm = t->io_bitmap;
+
+		/*
+		 * Only copy bitmap data when the sequence number differs. The
+		 * update time is accounted to the incoming task.
+		 */
+		if (tss->io_bitmap.prev_sequence != iobm->sequence)
+			tss_copy_io_bitmap(tss, iobm);
+
+		/* Enable the bitmap */
+		*base = IO_BITMAP_OFFSET_VALID_MAP;
+	}
+
+	/*
+	 * Make sure that the TSS limit is covering the IO bitmap. It might have
+	 * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
+	 * access from user space to trigger a #GP because the bitmap is outside
+	 * the TSS limit.
+	 */
+	refresh_tss_limit();
+}
+#else /* CONFIG_X86_IOPL_IOPERM */
+static inline void switch_to_bitmap(unsigned long tifp) { }
+#endif
 
 #ifdef CONFIG_SMP
 
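
Note: the path above is driven from userspace by ioperm(2), which installs the task's io_bitmap and sets TIF_IO_BITMAP; native_tss_update_io_bitmap() then publishes the map into the TSS lazily, on the next exit to user mode. A minimal sketch of a caller (x86 only, needs CAP_SYS_RAWIO; the port pair is the conventional CMOS/RTC index/data pair, used here purely as an example):

	#include <stdio.h>
	#include <sys/io.h>

	int main(void)
	{
		/* Grant this task access to I/O ports 0x70-0x71. */
		if (ioperm(0x70, 2, 1)) {
			perror("ioperm");
			return 1;
		}
		outb(0x0a, 0x70);		/* select RTC status register A */
		printf("RTC status A: 0x%02x\n", inb(0x71));
		ioperm(0x70, 2, 0);		/* drop the permission again */
		return 0;
	}
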
@@ -434,7 +573,7 @@
 	}
 
 	if (updmsr)
-		wrmsrl(MSR_IA32_SPEC_CTRL, msr);
+		update_spec_ctrl_cond(msr);
 }
 
 static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
@@ -472,17 +611,25 @@
 	preempt_enable();
 }
 
+static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
+{
+	unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
+
+	newval = cr4 ^ mask;
+	if (newval != cr4) {
+		this_cpu_write(cpu_tlbstate.cr4, newval);
+		__write_cr4(newval);
+	}
+}
+
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
 {
-	struct thread_struct *prev, *next;
 	unsigned long tifp, tifn;
-
-	prev = &prev_p->thread;
-	next = &next_p->thread;
 
 	tifn = READ_ONCE(task_thread_info(next_p)->flags);
 	tifp = READ_ONCE(task_thread_info(prev_p)->flags);
-	switch_to_bitmap(prev, next, tifp, tifn);
+
+	switch_to_bitmap(tifp);
 
 	propagate_user_return_notify(prev_p, next_p);
 
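
Note: cr4_toggle_bits_irqsoff() works against the kernel's per-CPU shadow of CR4 (cpu_tlbstate.cr4) instead of reading the register, and the XOR flips exactly the masked bits, so the costly __write_cr4() only happens when the value actually changes. A standalone model of the idiom (the values are invented for illustration):

	#include <stdio.h>

	int main(void)
	{
		unsigned long cr4 = 0x40;	/* pretend shadow value */
		unsigned long mask = 0x04;	/* pretend CR4 feature bit */
		unsigned long newval = cr4 ^ mask;

		/* Skip the expensive register write when nothing changed. */
		if (newval != cr4)
			printf("CR4 write: %#lx -> %#lx\n", cr4, newval);

		newval ^= mask;			/* toggling twice restores */
		printf("restored: %#lx\n", newval);
		return 0;
	}
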
@@ -512,6 +659,9 @@
 		/* Enforce MSR update to ensure consistent state */
 		__speculation_ctrl_update(~tifn, tifn);
 	}
+
+	if ((tifp ^ tifn) & _TIF_SLD)
+		switch_to_sld(tifn);
 }
 
 /*
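
Note: the new _TIF_SLD handling follows the established pattern in __switch_to_xtra(): XORing the previous and next tasks' flag words yields exactly the flags that differ, so per-flag switch work runs only on a real transition. A small model of that dispatch (the bit positions are invented for the example):

	#include <stdio.h>

	#define TIF_A	(1UL << 1)	/* hypothetical flag bits */
	#define TIF_B	(1UL << 2)

	int main(void)
	{
		unsigned long tifp = TIF_A;		/* previous task */
		unsigned long tifn = TIF_A | TIF_B;	/* next task */
		unsigned long changed = tifp ^ tifn;

		if (changed & TIF_B)
			printf("TIF_B differs: run its switch work\n");
		if (changed & TIF_A)
			printf("never printed: both tasks agree on TIF_A\n");
		return 0;
	}
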
@@ -553,11 +703,9 @@
  */
 void __cpuidle default_idle(void)
 {
-	trace_cpu_idle_rcuidle(1, smp_processor_id());
-	safe_halt();
-	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+	raw_safe_halt();
 }
-#ifdef CONFIG_APM_MODULE
+#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
 EXPORT_SYMBOL(default_idle);
 #endif
 
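
Note: raw_safe_halt() is the tracepoint-free variant, used now that the idle tracepoints are emitted by generic code. On native x86 it boils down to the classic STI;HLT pair. A sketch of that expansion (kernel-mode only, since HLT is privileged; the real kernel routes this through paravirt indirection, so this is a model rather than the literal implementation):

	/* STI has a one-instruction "interrupt shadow": interrupts become
	 * deliverable only once the following HLT has started, so no wakeup
	 * can slip into the window between enabling IRQs and halting. */
	static inline void safe_halt_sketch(void)
	{
		asm volatile("sti; hlt" ::: "memory");
	}
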
@@ -572,7 +720,7 @@
 }
 #endif
 
-void stop_this_cpu(void *dummy)
+void __noreturn stop_this_cpu(void *dummy)
 {
 	local_irq_disable();
 	/*
@@ -606,6 +754,8 @@
 /*
  * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
  * states (local apic timer and TSC stop).
+ *
+ * XXX this function is completely buggered vs RCU and tracing.
  */
 static void amd_e400_idle(void)
 {
@@ -627,9 +777,9 @@
 	 * The switch back from broadcast mode needs to be called with
 	 * interrupts disabled.
 	 */
-	local_irq_disable();
+	raw_local_irq_disable();
 	tick_broadcast_exit();
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 /*
635785 /*
....@@ -644,10 +794,14 @@
644794 */
645795 static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
646796 {
797
+ /* User has disallowed the use of MWAIT. Fallback to HALT */
798
+ if (boot_option_idle_override == IDLE_NOMWAIT)
799
+ return 0;
800
+
647801 if (c->x86_vendor != X86_VENDOR_INTEL)
648802 return 0;
649803
650
- if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR))
804
+ if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR))
651805 return 0;
652806
653807 return 1;
@@ -661,7 +815,6 @@
 static __cpuidle void mwait_idle(void)
 {
 	if (!current_set_polling_and_test()) {
-		trace_cpu_idle_rcuidle(1, smp_processor_id());
 		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
 			mb(); /* quirk */
 			clflush((void *)&current_thread_info()->flags);
@@ -672,10 +825,9 @@
 		if (!need_resched())
 			__sti_mwait(0, 0);
 		else
-			local_irq_enable();
-		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+			raw_local_irq_enable();
 	} else {
-		local_irq_enable();
+		raw_local_irq_enable();
 	}
 	__current_clr_polling();
 }
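
Note: mwait_idle() is the standard MONITOR/MWAIT handshake — arm the monitor on the thread-flags cacheline, re-check for pending work, and only then MWAIT, so a wakeup that sets TIF_NEED_RESCHED after the check still terminates the MWAIT through the armed monitor. A sketch using the SSE3 intrinsics (MONITOR/MWAIT execute only at CPL0, so this models the kernel path and is not runnable from userspace; mwait_idle_sketch is an invented name):

	#include <pmmintrin.h>

	static void mwait_idle_sketch(volatile unsigned long *flags,
				      int (*need_resched)(void))
	{
		/* Arm the address monitor on the flags word first ... */
		_mm_monitor((const void *)flags, 0, 0);
		/* ... then re-check: a write to *flags between these two
		 * points keeps MWAIT from sleeping, which closes the
		 * lost-wakeup race. */
		if (!need_resched())
			_mm_mwait(0, 0);
	}
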
@@ -754,9 +906,8 @@
 	} else if (!strcmp(str, "nomwait")) {
 		/*
 		 * If the boot option of "idle=nomwait" is added,
-		 * it means that mwait will be disabled for CPU C2/C3
-		 * states. In such case it won't touch the variable
-		 * of boot_option_idle_override.
+		 * it means that mwait will be disabled for CPU C1/C2/C3
+		 * states.
 		 */
 		boot_option_idle_override = IDLE_NOMWAIT;
 	} else
@@ -789,7 +940,7 @@
 	unsigned long start, bottom, top, sp, fp, ip, ret = 0;
 	int count = 0;
 
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (p == current || p->state == TASK_RUNNING)
 		return 0;
 
 	if (!try_get_task_stack(p))