2024-10-12 a5969cabbb4660eab42b6ef0412cbbd1200cf14d
kernel/arch/x86/include/asm/tlbflush.h
@@ -13,156 +13,51 @@
 #include <asm/pti.h>
 #include <asm/processor-flags.h>

-/*
- * The x86 feature is called PCID (Process Context IDentifier). It is similar
- * to what is traditionally called ASID on the RISC processors.
- *
- * We don't use the traditional ASID implementation, where each process/mm gets
- * its own ASID and flush/restart when we run out of ASID space.
- *
- * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
- * that came by on this CPU, allowing cheaper switch_mm between processes on
- * this CPU.
- *
- * We end up with different spaces for different things. To avoid confusion we
- * use different names for each of them:
- *
- * ASID  - [0, TLB_NR_DYN_ASIDS-1]
- *         the canonical identifier for an mm
- *
- * kPCID - [1, TLB_NR_DYN_ASIDS]
- *         the value we write into the PCID part of CR3; corresponds to the
- *         ASID+1, because PCID 0 is special.
- *
- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
- *         for KPTI each mm has two address spaces and thus needs two
- *         PCID values, but we can still do with a single ASID denomination
- *         for each mm. Corresponds to kPCID + 2048.
- *
- */
+void __flush_tlb_all(void);

-/* There are 12 bits of space for ASIDS in CR3 */
-#define CR3_HW_ASID_BITS 12
+#define TLB_FLUSH_ALL -1UL

-/*
- * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
- * user/kernel switches
- */
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
-# define PTI_CONSUMED_PCID_BITS 1
-#else
-# define PTI_CONSUMED_PCID_BITS 0
-#endif
+void cr4_update_irqsoff(unsigned long set, unsigned long clear);
+unsigned long cr4_read_shadow(void);

-#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
+/* Set in this cpu's CR4. */
+static inline void cr4_set_bits_irqsoff(unsigned long mask)
+{
+	cr4_update_irqsoff(mask, 0);
+}

-/*
- * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
- * for them being zero-based. Another -1 is because PCID 0 is reserved for
- * use by non-PCID-aware users.
- */
-#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear_bits_irqsoff(unsigned long mask)
+{
+	cr4_update_irqsoff(0, mask);
+}

+/* Set in this cpu's CR4. */
+static inline void cr4_set_bits(unsigned long mask)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	cr4_set_bits_irqsoff(mask);
+	local_irq_restore(flags);
+}
+
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear_bits(unsigned long mask)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	cr4_clear_bits_irqsoff(mask);
+	local_irq_restore(flags);
+}
+
+#ifndef MODULE
 /*
  * 6 because 6 should be plenty and struct tlb_state will fit in two cache
  * lines.
  */
 #define TLB_NR_DYN_ASIDS 6
-
-/*
- * Given @asid, compute kPCID
- */
-static inline u16 kern_pcid(u16 asid)
-{
-	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
-
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
-	/*
-	 * Make sure that the dynamic ASID space does not confict with the
-	 * bit we are using to switch between user and kernel ASIDs.
-	 */
-	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
-
-	/*
-	 * The ASID being passed in here should have respected the
-	 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
-	 */
-	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
-#endif
-	/*
-	 * The dynamically-assigned ASIDs that get passed in are small
-	 * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
-	 * so do not bother to clear it.
-	 *
-	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
-	 * PCID bits. This serves two purposes. It prevents a nasty
-	 * situation in which PCID-unaware code saves CR3, loads some other
-	 * value (with PCID == 0), and then restores CR3, thus corrupting
-	 * the TLB for ASID 0 if the saved ASID was nonzero. It also means
-	 * that any bugs involving loading a PCID-enabled CR3 with
-	 * CR4.PCIDE off will trigger deterministically.
-	 */
-	return asid + 1;
-}
-
-/*
- * Given @asid, compute uPCID
- */
-static inline u16 user_pcid(u16 asid)
-{
-	u16 ret = kern_pcid(asid);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
-	ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
-#endif
-	return ret;
-}
-
-struct pgd_t;
-static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
-{
-	if (static_cpu_has(X86_FEATURE_PCID)) {
-		return __sme_pa(pgd) | kern_pcid(asid);
-	} else {
-		VM_WARN_ON_ONCE(asid != 0);
-		return __sme_pa(pgd);
-	}
-}
-
-static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
-{
-	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
-	/*
-	 * Use boot_cpu_has() instead of this_cpu_has() as this function
-	 * might be called during early boot. This should work even after
-	 * boot because all CPU's the have same capabilities:
-	 */
-	VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
-	return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
-}
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define __flush_tlb() __native_flush_tlb()
-#define __flush_tlb_global() __native_flush_tlb_global()
-#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
-#endif
-
-static inline bool tlb_defer_switch_to_init_mm(void)
-{
-	/*
-	 * If we have PCID, then switching to init_mm is reasonably
-	 * fast. If we don't have PCID, then switching to init_mm is
-	 * quite slow, so we try to defer it in the hopes that we can
-	 * avoid it entirely. The latter approach runs the risk of
-	 * receiving otherwise unnecessary IPIs.
-	 *
-	 * This choice is just a heuristic. The tlb code can handle this
-	 * function returning true or false regardless of whether we have
-	 * PCID.
-	 */
-	return !static_cpu_has(X86_FEATURE_PCID);
-}

 struct tlb_context {
 	u64 ctx_id;
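The comment block removed above defines the ASID -> kPCID -> uPCID mapping that the (now out-of-line) helpers keep using: kPCID is ASID+1, because PCID 0 stays reserved for PCID-unaware code, and uPCID is the same value with the PTI user bit set, i.e. kPCID + 2048. A minimal user-space sketch of just that arithmetic, with hypothetical stand-in constants rather than the kernel's own kern_pcid()/user_pcid():

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel constants referenced in the hunk. */
#define SKETCH_TLB_NR_DYN_ASIDS   6	/* per-CPU dynamic ASID slots */
#define SKETCH_PTI_PCID_USER_BIT  11	/* bit 11 selects the user half => +2048 */

/* kPCID: ASID + 1, because PCID 0 is reserved for non-PCID-aware code. */
static uint16_t sketch_kern_pcid(uint16_t asid)
{
	assert(asid < SKETCH_TLB_NR_DYN_ASIDS);
	return asid + 1;
}

/* uPCID: the same kPCID with the PTI user bit set, i.e. kPCID + 2048. */
static uint16_t sketch_user_pcid(uint16_t asid)
{
	return sketch_kern_pcid(asid) | (1u << SKETCH_PTI_PCID_USER_BIT);
}

int main(void)
{
	for (uint16_t asid = 0; asid < SKETCH_TLB_NR_DYN_ASIDS; asid++)
		printf("ASID %u -> kPCID %u, uPCID %u\n",
		       (unsigned)asid,
		       (unsigned)sketch_kern_pcid(asid),
		       (unsigned)sketch_user_pcid(asid));
	return 0;
}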
@@ -183,7 +78,7 @@
 	 */
 	struct mm_struct *loaded_mm;

-#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
+#define LOADED_MM_SWITCHING ((struct mm_struct *)1UL)

 	/* Last user mm for optimizing IBPB */
 	union {
@@ -258,37 +153,8 @@
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);

-/*
- * Blindly accessing user memory from NMI context can be dangerous
- * if we're in the middle of switching the current user task or
- * switching the loaded mm. It can also be dangerous if we
- * interrupted some kernel code that was temporarily using a
- * different mm.
- */
-static inline bool nmi_uaccess_okay(void)
-{
-	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
-	struct mm_struct *current_mm = current->mm;
-
-	VM_WARN_ON_ONCE(!loaded_mm);
-
-	/*
-	 * The condition we want to check is
-	 * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
-	 * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
-	 * is supposed to be reasonably fast.
-	 *
-	 * Instead, we check the almost equivalent but somewhat conservative
-	 * condition below, and we rely on the fact that switch_mm_irqs_off()
-	 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
-	 */
-	if (loaded_mm != current_mm)
-		return false;
-
-	VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
-
-	return true;
-}
+bool nmi_uaccess_okay(void);
+#define nmi_uaccess_okay nmi_uaccess_okay

 /* Initialize cr4 shadow for this CPU. */
 static inline void cr4_init_shadow(void)
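The added pair `bool nmi_uaccess_okay(void);` plus `#define nmi_uaccess_okay nmi_uaccess_okay` uses the define-the-symbol-to-its-own-name convention: generic code that tests `#ifndef nmi_uaccess_okay` then skips its fallback definition. A compilable sketch of the pattern, where the "generic side" and the toy implementation are illustrative stand-ins, not the actual generic header or the real implementation in arch/x86/mm/tlb.c:

#include <stdbool.h>
#include <stdio.h>

/* "Arch" side (sketch): declare an out-of-line implementation and define a
 * same-named marker macro so generic code knows not to supply a fallback. */
bool nmi_uaccess_okay(void);
#define nmi_uaccess_okay nmi_uaccess_okay

/* "Generic" side (sketch): this fallback compiles out because the marker
 * macro is already defined above. */
#ifndef nmi_uaccess_okay
static inline bool nmi_uaccess_okay(void) { return true; }
#endif

/* Toy out-of-line implementation standing in for the arch one. */
bool nmi_uaccess_okay(void)
{
	return true;
}

int main(void)
{
	printf("nmi_uaccess_okay() = %d\n", nmi_uaccess_okay());
	return 0;
}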
@@ -296,232 +162,10 @@
 	this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
 }

-static inline void __cr4_set(unsigned long cr4)
-{
-	lockdep_assert_irqs_disabled();
-	this_cpu_write(cpu_tlbstate.cr4, cr4);
-	__write_cr4(cr4);
-}
-
-/* Set in this cpu's CR4. */
-static inline void cr4_set_bits(unsigned long mask)
-{
-	unsigned long cr4, flags;
-
-	local_irq_save(flags);
-	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-	if ((cr4 | mask) != cr4)
-		__cr4_set(cr4 | mask);
-	local_irq_restore(flags);
-}
-
-/* Clear in this cpu's CR4. */
-static inline void cr4_clear_bits(unsigned long mask)
-{
-	unsigned long cr4, flags;
-
-	local_irq_save(flags);
-	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-	if ((cr4 & ~mask) != cr4)
-		__cr4_set(cr4 & ~mask);
-	local_irq_restore(flags);
-}
-
-static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
-{
-	unsigned long cr4;
-
-	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-	__cr4_set(cr4 ^ mask);
-}
-
-/* Read the CR4 shadow. */
-static inline unsigned long cr4_read_shadow(void)
-{
-	return this_cpu_read(cpu_tlbstate.cr4);
-}
-
-/*
- * Mark all other ASIDs as invalid, preserves the current.
- */
-static inline void invalidate_other_asid(void)
-{
-	this_cpu_write(cpu_tlbstate.invalidate_other, true);
-}
-
-/*
- * Save some of cr4 feature set we're using (e.g. Pentium 4MB
- * enable and PPro Global page enable), so that any CPU's that boot
- * up after us can get the correct flags. This should only be used
- * during boot on the boot cpu.
- */
 extern unsigned long mmu_cr4_features;
 extern u32 *trampoline_cr4_features;

-static inline void cr4_set_bits_and_update_boot(unsigned long mask)
-{
-	mmu_cr4_features |= mask;
-	if (trampoline_cr4_features)
-		*trampoline_cr4_features = mmu_cr4_features;
-	cr4_set_bits(mask);
-}
-
 extern void initialize_tlbstate_and_flush(void);
-
-/*
- * Given an ASID, flush the corresponding user ASID. We can delay this
- * until the next time we switch to it.
- *
- * See SWITCH_TO_USER_CR3.
- */
-static inline void invalidate_user_asid(u16 asid)
-{
-	/* There is no user ASID if address space separation is off */
-	if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
-		return;
-
-	/*
-	 * We only have a single ASID if PCID is off and the CR3
-	 * write will have flushed it.
-	 */
-	if (!cpu_feature_enabled(X86_FEATURE_PCID))
-		return;
-
-	if (!static_cpu_has(X86_FEATURE_PTI))
-		return;
-
-	__set_bit(kern_pcid(asid),
-		  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
-}
-
-/*
- * flush the entire current user mapping
- */
-static inline void __native_flush_tlb(void)
-{
-	/*
-	 * Preemption or interrupts must be disabled to protect the access
-	 * to the per CPU variable and to prevent being preempted between
-	 * read_cr3() and write_cr3().
-	 */
-	WARN_ON_ONCE(preemptible());
-
-	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
-
-	/* If current->mm == NULL then the read_cr3() "borrows" an mm */
-	native_write_cr3(__native_read_cr3());
-}
-
-/*
- * flush everything
- */
-static inline void __native_flush_tlb_global(void)
-{
-	unsigned long cr4, flags;
-
-	if (static_cpu_has(X86_FEATURE_INVPCID)) {
-		/*
-		 * Using INVPCID is considerably faster than a pair of writes
-		 * to CR4 sandwiched inside an IRQ flag save/restore.
-		 *
-		 * Note, this works with CR4.PCIDE=0 or 1.
-		 */
-		invpcid_flush_all();
-		return;
-	}
-
-	/*
-	 * Read-modify-write to CR4 - protect it from preemption and
-	 * from interrupts. (Use the raw variant because this code can
-	 * be called from deep inside debugging code.)
-	 */
-	raw_local_irq_save(flags);
-
-	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-	/* toggle PGE */
-	native_write_cr4(cr4 ^ X86_CR4_PGE);
-	/* write old PGE again and flush TLBs */
-	native_write_cr4(cr4);
-
-	raw_local_irq_restore(flags);
-}
-
-/*
- * flush one page in the user mapping
- */
-static inline void __native_flush_tlb_one_user(unsigned long addr)
-{
-	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
-
-	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
-
-	if (!static_cpu_has(X86_FEATURE_PTI))
-		return;
-
-	/*
-	 * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
-	 * Just use invalidate_user_asid() in case we are called early.
-	 */
-	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
-		invalidate_user_asid(loaded_mm_asid);
-	else
-		invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
-}
-
-/*
- * flush everything
- */
-static inline void __flush_tlb_all(void)
-{
-	/*
-	 * This is to catch users with enabled preemption and the PGE feature
-	 * and don't trigger the warning in __native_flush_tlb().
-	 */
-	VM_WARN_ON_ONCE(preemptible());
-
-	if (boot_cpu_has(X86_FEATURE_PGE)) {
-		__flush_tlb_global();
-	} else {
-		/*
-		 * !PGE -> !PCID (setup_pcid()), thus every flush is total.
-		 */
-		__flush_tlb();
-	}
-}
-
-/*
- * flush one page in the kernel mapping
- */
-static inline void __flush_tlb_one_kernel(unsigned long addr)
-{
-	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
-
-	/*
-	 * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
-	 * paravirt equivalent. Even with PCID, this is sufficient: we only
-	 * use PCID if we also use global PTEs for the kernel mapping, and
-	 * INVLPG flushes global translations across all address spaces.
-	 *
-	 * If PTI is on, then the kernel is mapped with non-global PTEs, and
-	 * __flush_tlb_one_user() will flush the given address for the current
-	 * kernel address space and for its usermode counterpart, but it does
-	 * not flush it for other address spaces.
-	 */
-	__flush_tlb_one_user(addr);
-
-	if (!static_cpu_has(X86_FEATURE_PTI))
-		return;
-
-	/*
-	 * See above. We need to propagate the flush to all other address
-	 * spaces. In principle, we only need to propagate it to kernelmode
-	 * address spaces, but the extra bookkeeping we would need is not
-	 * worth it.
-	 */
-	invalidate_other_asid();
-}
-
-#define TLB_FLUSH_ALL -1UL

 /*
  * TLB flushing:
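The removed `__flush_tlb_all()` and `__native_flush_tlb_global()` bodies encode a fixed preference order for a full flush: INVPCID when available, otherwise a CR4.PGE double-write, and a plain CR3 reload when global pages are off (since !PGE implies !PCID, that reload is already a total flush). A rough user-space sketch of just that selection order, with a hypothetical feature-flag struct standing in for the kernel's cpu-feature checks; the real logic now lives out of line behind `void __flush_tlb_all(void);`:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical CPU-feature snapshot; the kernel reads these via *_cpu_has(). */
struct cpu_features {
	bool invpcid;	/* X86_FEATURE_INVPCID */
	bool pge;	/* X86_FEATURE_PGE */
};

/* Mirror of the selection order in the removed inlines: without PGE a plain
 * CR3 reload suffices; with PGE, prefer INVPCID (all-context flush) and fall
 * back to toggling CR4.PGE twice with interrupts disabled. */
static const char *flush_all_method(const struct cpu_features *f)
{
	if (!f->pge)
		return "reload CR3 (non-global flush is already total)";
	if (f->invpcid)
		return "INVPCID all-context flush (includes globals)";
	return "toggle CR4.PGE twice under raw_local_irq_save()";
}

int main(void)
{
	struct cpu_features combos[] = {
		{ .invpcid = true,  .pge = true  },
		{ .invpcid = false, .pge = true  },
		{ .invpcid = false, .pge = false },
	};

	for (unsigned int i = 0; i < sizeof(combos) / sizeof(combos[0]); i++)
		printf("invpcid=%d pge=%d -> %s\n", combos[i].invpcid,
		       combos[i].pge, flush_all_method(&combos[i]));
	return 0;
}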
@@ -557,27 +201,39 @@
 	unsigned long start;
 	unsigned long end;
 	u64 new_tlb_gen;
+	unsigned int stride_shift;
+	bool freed_tables;
 };

-#define local_flush_tlb() __flush_tlb()
+void flush_tlb_local(void);
+void flush_tlb_one_user(unsigned long addr);
+void flush_tlb_one_kernel(unsigned long addr);
+void flush_tlb_others(const struct cpumask *cpumask,
+		      const struct flush_tlb_info *info);

-#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif

-#define flush_tlb_range(vma, start, end) \
-	flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
+#define flush_tlb_mm(mm) \
+	flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
+
+#define flush_tlb_range(vma, start, end) \
+	flush_tlb_mm_range((vma)->vm_mm, start, end, \
+			   ((vma)->vm_flags & VM_HUGETLB) \
+				? huge_page_shift(hstate_vma(vma)) \
+				: PAGE_SHIFT, false)

 extern void flush_tlb_all(void);
 extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
-				unsigned long end, unsigned long vmflag);
+				unsigned long end, unsigned int stride_shift,
+				bool freed_tables);
 extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);

 static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 {
-	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
+	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
 }
-
-void native_flush_tlb_others(const struct cpumask *cpumask,
-			     const struct flush_tlb_info *info);

 static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 {
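The new `stride_shift` parameter lets flush_tlb_range() describe the mapping granularity, which is why the hugetlb branch passes huge_page_shift() instead of PAGE_SHIFT: a large-stride flush needs far fewer single-address invalidations than a 4 KiB walk of the same range. A small arithmetic sketch of that effect (hypothetical helper, not the kernel's flush path):

#include <stdio.h>

/* Number of single-address flush operations needed to cover [start, end)
 * when flushing at a granularity of (1 << stride_shift) bytes. */
static unsigned long flushes_needed(unsigned long start, unsigned long end,
				    unsigned int stride_shift)
{
	unsigned long stride = 1UL << stride_shift;

	return (end - start + stride - 1) / stride;
}

int main(void)
{
	unsigned long start = 0, end = 4UL << 20;	/* a 4 MiB range */

	/* PAGE_SHIFT = 12 (4 KiB pages) vs. a 2 MiB hugetlb stride (shift 21). */
	printf("4 KiB stride: %lu flushes\n", flushes_needed(start, end, 12));
	printf("2 MiB stride: %lu flushes\n", flushes_needed(start, end, 21));
	return 0;
}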
@@ -599,12 +255,6 @@

 extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);

-#ifndef CONFIG_PARAVIRT
-#define flush_tlb_others(mask, info) \
-	native_flush_tlb_others(mask, info)
-
-#define paravirt_tlb_remove_table(tlb, page) \
-	tlb_remove_page(tlb, (void *)(page))
-#endif
+#endif /* !MODULE */

 #endif /* _ASM_X86_TLBFLUSH_H */