forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/arch/x86/mm/tlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 #include <linux/init.h>
 
 #include <linux/mm.h>
@@ -13,7 +14,18 @@
 #include <asm/nospec-branch.h>
 #include <asm/cache.h>
 #include <asm/apic.h>
-#include <asm/uv/uv.h>
+
+#include "mm_internal.h"
+
+#ifdef CONFIG_PARAVIRT
+# define STATIC_NOPV
+#else
+# define STATIC_NOPV			static
+# define __flush_tlb_local		native_flush_tlb_local
+# define __flush_tlb_global		native_flush_tlb_global
+# define __flush_tlb_one_user(addr)	native_flush_tlb_one_user(addr)
+# define __flush_tlb_others(msk, info)	native_flush_tlb_others(msk, info)
+#endif
 
 /*
  * TLB flushing, formerly SMP-only
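The STATIC_NOPV block added above is the usual paravirt indirection: with CONFIG_PARAVIRT the __flush_tlb_*() names are supplied by the paravirt layer and the native_*() implementations must stay global so paravirt can point at them, while without CONFIG_PARAVIRT the macros bind the __flush_tlb_*() names directly to the native functions, which can then be made static. A minimal standalone sketch of the same idea follows; do_op/native_do_op are made-up names, not from this patch.

	/* Illustrative sketch of the STATIC_NOPV pattern (hypothetical names). */
	#ifdef CONFIG_PARAVIRT
	# define STATIC_NOPV			/* keep native_do_op global for the pv layer */
	#else
	# define STATIC_NOPV	static		/* no paravirt layer: keep it file-local */
	# define __do_op	native_do_op	/* callers bind straight to the native copy */
	#endif

	STATIC_NOPV void native_do_op(void)
	{
		/* the hardware-level implementation goes here */
	}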
@@ -34,6 +46,126 @@
  * stored in cpu_tlb_state.last_user_mm_ibpb.
  */
 #define LAST_USER_MM_IBPB	0x1UL
+
+/*
+ * The x86 feature is called PCID (Process Context IDentifier). It is similar
+ * to what is traditionally called ASID on the RISC processors.
+ *
+ * We don't use the traditional ASID implementation, where each process/mm gets
+ * its own ASID and flush/restart when we run out of ASID space.
+ *
+ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
+ * that came by on this CPU, allowing cheaper switch_mm between processes on
+ * this CPU.
+ *
+ * We end up with different spaces for different things. To avoid confusion we
+ * use different names for each of them:
+ *
+ * ASID  - [0, TLB_NR_DYN_ASIDS-1]
+ *         the canonical identifier for an mm
+ *
+ * kPCID - [1, TLB_NR_DYN_ASIDS]
+ *         the value we write into the PCID part of CR3; corresponds to the
+ *         ASID+1, because PCID 0 is special.
+ *
+ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ *         for KPTI each mm has two address spaces and thus needs two
+ *         PCID values, but we can still do with a single ASID denomination
+ *         for each mm. Corresponds to kPCID + 2048.
+ *
+ */
+
+/* There are 12 bits of space for ASIDS in CR3 */
+#define CR3_HW_ASID_BITS	12
+
+/*
+ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * user/kernel switches
+ */
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS	1
+#else
+# define PTI_CONSUMED_PCID_BITS	0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
+
+/*
+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
+ * for them being zero-based. Another -1 is because PCID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+
+/*
+ * Given @asid, compute kPCID
+ */
+static inline u16 kern_pcid(u16 asid)
+{
+	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	/*
+	 * Make sure that the dynamic ASID space does not confict with the
+	 * bit we are using to switch between user and kernel ASIDs.
+	 */
+	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
+
+	/*
+	 * The ASID being passed in here should have respected the
+	 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+	 */
+	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
+#endif
+	/*
+	 * The dynamically-assigned ASIDs that get passed in are small
+	 * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
+	 * so do not bother to clear it.
+	 *
+	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
+	 * PCID bits. This serves two purposes. It prevents a nasty
+	 * situation in which PCID-unaware code saves CR3, loads some other
+	 * value (with PCID == 0), and then restores CR3, thus corrupting
+	 * the TLB for ASID 0 if the saved ASID was nonzero. It also means
+	 * that any bugs involving loading a PCID-enabled CR3 with
+	 * CR4.PCIDE off will trigger deterministically.
+	 */
+	return asid + 1;
+}
+
+/*
+ * Given @asid, compute uPCID
+ */
+static inline u16 user_pcid(u16 asid)
+{
+	u16 ret = kern_pcid(asid);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
+#endif
+	return ret;
+}
+
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+{
+	if (static_cpu_has(X86_FEATURE_PCID)) {
+		return __sme_pa(pgd) | kern_pcid(asid);
+	} else {
+		VM_WARN_ON_ONCE(asid != 0);
+		return __sme_pa(pgd);
+	}
+}
+
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+{
+	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+	/*
+	 * Use boot_cpu_has() instead of this_cpu_has() as this function
+	 * might be called during early boot. This should work even after
+	 * boot because all CPU's the have same capabilities:
+	 */
+	VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
+	return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+}
 
 /*
  * We get here when we do something requiring a TLB invalidation
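As a concrete reading of the ASID/kPCID/uPCID comment and helpers above: kern_pcid() is simply ASID+1, and user_pcid() additionally sets the PTI user bit, which corresponds to the documented "+ 2048". The small standalone check below mirrors that arithmetic; it assumes X86_CR3_PTI_PCID_USER_BIT is bit 11 (hence 2048), which is not spelled out in this hunk.

	/* Illustrative check of the ASID -> kPCID / uPCID mapping (userspace C). */
	#include <assert.h>
	#include <stdint.h>

	#define PTI_PCID_USER_BIT 11	/* assumed value of X86_CR3_PTI_PCID_USER_BIT */

	static uint16_t kern_pcid_example(uint16_t asid) { return asid + 1; }

	static uint16_t user_pcid_example(uint16_t asid)
	{
		return kern_pcid_example(asid) | (1u << PTI_PCID_USER_BIT);
	}

	int main(void)
	{
		assert(kern_pcid_example(0) == 1);		/* ASID 0 -> kPCID 1; PCID 0 stays reserved */
		assert(kern_pcid_example(3) == 4);		/* ASID 3 -> kPCID 4 */
		assert(user_pcid_example(3) == 2048 + 4);	/* uPCID = kPCID + 2048 */
		return 0;
	}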
@@ -107,6 +239,32 @@
 	*need_flush = true;
 }
 
+/*
+ * Given an ASID, flush the corresponding user ASID. We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
+{
+	/* There is no user ASID if address space separation is off */
+	if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		return;
+
+	/*
+	 * We only have a single ASID if PCID is off and the CR3
+	 * write will have flushed it.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_PCID))
+		return;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	__set_bit(kern_pcid(asid),
+		  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+}
+
 static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
 {
 	unsigned long new_mm_cr3;
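Note that invalidate_user_asid() above performs no TLB work at the call site: it only records the user PCID as stale in the per-CPU user_pcid_flush_mask, and the actual flush is deferred to the next kernel-to-user CR3 switch (SWITCH_TO_USER_CR3). A minimal sketch of that defer-then-flush idea, using a plain bitmask and invented names instead of the per-CPU machinery:

	/* Sketch of deferred ASID invalidation: mark now, flush on the next switch. */
	#include <stdbool.h>
	#include <stdint.h>

	static uint32_t stale_user_asids;	/* stand-in for user_pcid_flush_mask */

	static void mark_user_asid_stale(unsigned int asid)
	{
		stale_user_asids |= 1u << asid;		/* cheap: no TLB work here */
	}

	static bool switch_to_user_needs_flush(unsigned int asid)
	{
		bool need_flush = stale_user_asids & (1u << asid);

		stale_user_asids &= ~(1u << asid);	/* consume the pending flush */
		return need_flush;	/* real code would pick a flushing CR3 value here */
	}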
@@ -156,34 +314,6 @@
 	local_irq_save(flags);
 	switch_mm_irqs_off(prev, next, tsk);
 	local_irq_restore(flags);
-}
-
-static void sync_current_stack_to_mm(struct mm_struct *mm)
-{
-	unsigned long sp = current_stack_pointer;
-	pgd_t *pgd = pgd_offset(mm, sp);
-
-	if (pgtable_l5_enabled()) {
-		if (unlikely(pgd_none(*pgd))) {
-			pgd_t *pgd_ref = pgd_offset_k(sp);
-
-			set_pgd(pgd, *pgd_ref);
-		}
-	} else {
-		/*
-		 * "pgd" is faked. The top level entries are "p4d"s, so sync
-		 * the p4d. This compiles to approximately the same code as
-		 * the 5-level case.
-		 */
-		p4d_t *p4d = p4d_offset(pgd, sp);
-
-		if (unlikely(p4d_none(*p4d))) {
-			pgd_t *pgd_ref = pgd_offset_k(sp);
-			p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);
-
-			set_p4d(p4d, *p4d_ref);
-		}
-	}
 }
 
 static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
@@ -269,13 +399,36 @@
 	}
 }
 
+#ifdef CONFIG_PERF_EVENTS
+static inline void cr4_update_pce_mm(struct mm_struct *mm)
+{
+	if (static_branch_unlikely(&rdpmc_always_available_key) ||
+	    (!static_branch_unlikely(&rdpmc_never_available_key) &&
+	     atomic_read(&mm->context.perf_rdpmc_allowed)))
+		cr4_set_bits_irqsoff(X86_CR4_PCE);
+	else
+		cr4_clear_bits_irqsoff(X86_CR4_PCE);
+}
+
+void cr4_update_pce(void *ignored)
+{
+	cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
+}
+
+#else
+static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
+#endif
+
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			struct task_struct *tsk)
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
+	bool need_flush;
+	u16 new_asid;
 
 	/*
 	 * NB: The scheduler will call us with prev == next when switching
@@ -335,36 +488,47 @@
 			   next->context.ctx_id);
 
 		/*
-		 * We don't currently support having a real mm loaded without
-		 * our cpu set in mm_cpumask(). We have all the bookkeeping
-		 * in place to figure out whether we would need to flush
-		 * if our cpu were cleared in mm_cpumask(), but we don't
-		 * currently use it.
+		 * Even in lazy TLB mode, the CPU should stay set in the
+		 * mm_cpumask. The TLB shootdown code can figure out from
+		 * from cpu_tlbstate.is_lazy whether or not to send an IPI.
 		 */
 		if (WARN_ON_ONCE(real_prev != &init_mm &&
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		return;
-	} else {
-		u16 new_asid;
-		bool need_flush;
+		/*
+		 * If the CPU is not in lazy TLB mode, we are just switching
+		 * from one thread in a process to another thread in the same
+		 * process. No TLB flush required.
+		 */
+		if (!was_lazy)
+			return;
 
+		/*
+		 * Read the tlb_gen to check whether a flush is needed.
+		 * If the TLB is up to date, just use it.
+		 * The barrier synchronizes with the tlb_gen increment in
+		 * the TLB shootdown code.
+		 */
+		smp_mb();
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
				next_tlb_gen)
+			return;
+
+		/*
+		 * TLB contents went out of date while we were in lazy
+		 * mode. Fall through to the TLB switching code below.
+		 */
+		new_asid = prev_asid;
+		need_flush = true;
+	} else {
 		/*
 		 * Avoid user/user BTB poisoning by flushing the branch
 		 * predictor when switching between processes. This stops
 		 * one process from doing Spectre-v2 attacks on another.
 		 */
 		cond_ibpb(tsk);
-
-		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
-			/*
-			 * If our current stack is in vmalloc space and isn't
-			 * mapped in the new pgd, we'll double-fault. Forcibly
-			 * map it.
-			 */
-			sync_current_stack_to_mm(next);
-		}
 
 		/*
 		 * Stop remote flushes for the previous mm.
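The lazy-TLB path added above reduces to a generation check: flush requests bump the mm's tlb_gen, each CPU remembers the tlb_gen its cached translations correspond to, and a CPU leaving lazy mode for the same mm only flushes when the two differ. A compact sketch of that comparison, with hypothetical types and none of the memory-ordering details (the real code needs the smp_mb() shown above):

	/* Sketch of the tlb_gen comparison deciding whether a lazy CPU must flush. */
	#include <stdbool.h>
	#include <stdint.h>

	struct example_mm  { uint64_t tlb_gen; };	/* bumped on every flush request */
	struct example_cpu { uint64_t seen_tlb_gen; };	/* what this CPU's TLB reflects */

	static bool lazy_wakeup_needs_flush(const struct example_cpu *cpu,
					    const struct example_mm *mm)
	{
		/* equal generations: nothing changed while we were lazy, keep the TLB */
		return cpu->seen_tlb_gen != mm->tlb_gen;
	}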
@@ -389,38 +553,31 @@
 		/* Let nmi_uaccess_okay() know that we're changing CR3. */
 		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
 		barrier();
-
-		if (need_flush) {
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			load_new_mm_cr3(next->pgd, new_asid, true);
-
-			/*
-			 * NB: This gets called via leave_mm() in the idle path
-			 * where RCU functions differently. Tracing normally
-			 * uses RCU, so we need to use the _rcuidle variant.
-			 *
-			 * (There is no good reason for this. The idle code should
-			 * be rearranged to call this before rcu_idle_enter().)
-			 */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-		} else {
-			/* The new ASID is already up to date. */
-			load_new_mm_cr3(next->pgd, new_asid, false);
-
-			/* See above wrt _rcuidle. */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
-		}
-
-		/* Make sure we write CR3 before loaded_mm. */
-		barrier();
-
-		this_cpu_write(cpu_tlbstate.loaded_mm, next);
-		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 	}
 
-	load_mm_cr4(next);
-	switch_ldt(real_prev, next);
+	if (need_flush) {
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+		load_new_mm_cr3(next->pgd, new_asid, true);
+
+		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+	} else {
+		/* The new ASID is already up to date. */
+		load_new_mm_cr3(next->pgd, new_asid, false);
+
+		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+	}
+
+	/* Make sure we write CR3 before loaded_mm. */
+	barrier();
+
+	this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
+	if (next != real_prev) {
+		cr4_update_pce_mm(next);
+		switch_ldt(real_prev, next);
+	}
 }
 
 /*
@@ -441,20 +598,7 @@
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
 		return;
 
-	if (tlb_defer_switch_to_init_mm()) {
-		/*
-		 * There's a significant optimization that may be possible
-		 * here. We have accurate enough TLB flush tracking that we
-		 * don't need to maintain coherence of TLB per se when we're
-		 * lazy. We do, however, need to maintain coherence of
-		 * paging-structure caches. We could, in principle, leave our
-		 * old mm loaded and only switch to init_mm when
-		 * tlb_remove_page() happens.
-		 */
-		this_cpu_write(cpu_tlbstate.is_lazy, true);
-	} else {
-		switch_mm(NULL, &init_mm, NULL);
-	}
+	this_cpu_write(cpu_tlbstate.is_lazy, true);
 }
 
 /*
@@ -541,6 +685,9 @@
 		 * paging-structure cache to avoid speculatively reading
 		 * garbage into our TLB. Since switching to init_mm is barely
 		 * slower than a minimal flush, just switch to init_mm.
+		 *
+		 * This should be rare, with native_flush_tlb_others skipping
+		 * IPIs to lazy TLB mode CPUs.
 		 */
 		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
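The added comment refers to the IPI policy introduced later in this patch for native_flush_tlb_others(): when no page tables were freed, remote CPUs in lazy TLB mode are skipped (they catch up through the tlb_gen check on their next switch), but when page tables were freed every CPU in the mask must be interrupted so nothing keeps walking freed tables speculatively. A sketch of that decision with invented names:

	/* Sketch of the "skip lazy CPUs unless page tables were freed" policy. */
	#include <stdbool.h>

	struct flush_request {
		bool freed_tables;	/* page-table pages are being released */
	};

	static bool must_send_ipi(const struct flush_request *req, bool cpu_is_lazy)
	{
		if (req->freed_tables)
			return true;	/* lazy or not, it must stop using the old tables now */

		return !cpu_is_lazy;	/* lazy CPUs defer the flush to their next switch */
	}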
@@ -601,20 +748,19 @@
 	    f->new_tlb_gen == local_tlb_gen + 1 &&
 	    f->new_tlb_gen == mm_tlb_gen) {
 		/* Partial flush */
-		unsigned long addr;
-		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+		unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
+		unsigned long addr = f->start;
 
-		addr = f->start;
 		while (addr < f->end) {
-			__flush_tlb_one_user(addr);
-			addr += PAGE_SIZE;
+			flush_tlb_one_user(addr);
+			addr += 1UL << f->stride_shift;
 		}
 		if (local)
-			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
-		trace_tlb_flush(reason, nr_pages);
+			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
+		trace_tlb_flush(reason, nr_invalidate);
 	} else {
 		/* Full flush. */
-		local_flush_tlb();
+		flush_tlb_local();
 		if (local)
 			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 		trace_tlb_flush(reason, TLB_FLUSH_ALL);
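With the change above, the partial-flush loop steps by 1UL << f->stride_shift instead of a fixed PAGE_SIZE, so a range backed by 2 MB pages (stride_shift of 21, assuming 4 KB base pages) takes one invalidation per huge page rather than 512. A quick arithmetic check of that effect:

	/* Invalidation counts for a 4 MB range at two stride shifts (userspace C). */
	#include <stdio.h>

	int main(void)
	{
		unsigned long start = 0, end = 4UL << 20;	/* a 4 MB range */

		printf("%lu\n", (end - start) >> 12);	/* 4 KB stride: 1024 invalidations */
		printf("%lu\n", (end - start) >> 21);	/* 2 MB stride: 2 invalidations */
		return 0;
	}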
@@ -624,7 +770,7 @@
 	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
 }
 
-static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
+static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
 {
 	const struct flush_tlb_info *f = info;
 
@@ -644,8 +790,13 @@
 	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
 }
 
-void native_flush_tlb_others(const struct cpumask *cpumask,
-			     const struct flush_tlb_info *info)
+static bool tlb_is_not_lazy(int cpu, void *data)
+{
+	return !per_cpu(cpu_tlbstate.is_lazy, cpu);
+}
+
+STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
+					 const struct flush_tlb_info *info)
 {
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (info->end == TLB_FLUSH_ALL)
@@ -654,34 +805,32 @@
 		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
 				(info->end - info->start) >> PAGE_SHIFT);
 
-	if (is_uv_system()) {
-		/*
-		 * This whole special case is confused. UV has a "Broadcast
-		 * Assist Unit", which seems to be a fancy way to send IPIs.
-		 * Back when x86 used an explicit TLB flush IPI, UV was
-		 * optimized to use its own mechanism. These days, x86 uses
-		 * smp_call_function_many(), but UV still uses a manual IPI,
-		 * and that IPI's action is out of date -- it does a manual
-		 * flush instead of calling flush_tlb_func_remote(). This
-		 * means that the percpu tlb_gen variables won't be updated
-		 * and we'll do pointless flushes on future context switches.
-		 *
-		 * Rather than hooking native_flush_tlb_others() here, I think
-		 * that UV should be updated so that smp_call_function_many(),
-		 * etc, are optimal on UV.
-		 */
-		cpumask = uv_flush_tlb_others(cpumask, info);
-		if (cpumask)
-			smp_call_function_many(cpumask, flush_tlb_func_remote,
-					       (void *)info, 1);
-		return;
-	}
-	smp_call_function_many(cpumask, flush_tlb_func_remote,
+	/*
+	 * If no page tables were freed, we can skip sending IPIs to
+	 * CPUs in lazy TLB mode. They will flush the CPU themselves
+	 * at the next context switch.
+	 *
+	 * However, if page tables are getting freed, we need to send the
+	 * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
+	 * up on the new contents of what used to be page tables, while
+	 * doing a speculative memory access.
+	 */
+	if (info->freed_tables)
+		smp_call_function_many(cpumask, flush_tlb_func_remote,
			       (void *)info, 1);
+	else
+		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
+				      (void *)info, 1, cpumask);
+}
+
+void flush_tlb_others(const struct cpumask *cpumask,
+		      const struct flush_tlb_info *info)
+{
+	__flush_tlb_others(cpumask, info);
 }
 
 /*
- * See Documentation/x86/tlb.txt for details. We choose 33
+ * See Documentation/x86/tlb.rst for details. We choose 33
  * because it is large enough to cover the vast majority (at
  * least 95%) of allocations, and is small enough that we are
  * confident it will not cause too much overhead. Each single
@@ -690,43 +839,83 @@
  *
  * This is in units of pages.
  */
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
+
+#ifdef CONFIG_DEBUG_VM
+static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
+#endif
+
+static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
+			unsigned long start, unsigned long end,
+			unsigned int stride_shift, bool freed_tables,
+			u64 new_tlb_gen)
+{
+	struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
+
+#ifdef CONFIG_DEBUG_VM
+	/*
+	 * Ensure that the following code is non-reentrant and flush_tlb_info
+	 * is not overwritten. This means no TLB flushing is initiated by
+	 * interrupt handlers and machine-check exception handlers.
+	 */
+	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
+#endif
+
+	info->start		= start;
+	info->end		= end;
+	info->mm		= mm;
+	info->stride_shift	= stride_shift;
+	info->freed_tables	= freed_tables;
+	info->new_tlb_gen	= new_tlb_gen;
+
+	return info;
+}
+
+static inline void put_flush_tlb_info(void)
+{
+#ifdef CONFIG_DEBUG_VM
+	/* Complete reentrency prevention checks */
+	barrier();
+	this_cpu_dec(flush_tlb_info_idx);
+#endif
+}
 
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
-				unsigned long end, unsigned long vmflag)
+				unsigned long end, unsigned int stride_shift,
+				bool freed_tables)
 {
+	struct flush_tlb_info *info;
+	u64 new_tlb_gen;
 	int cpu;
-
-	struct flush_tlb_info info = {
-		.mm = mm,
-	};
 
 	cpu = get_cpu();
 
-	/* This is also a barrier that synchronizes with switch_mm(). */
-	info.new_tlb_gen = inc_mm_tlb_gen(mm);
-
 	/* Should we flush just the requested range? */
-	if ((end != TLB_FLUSH_ALL) &&
-	    !(vmflag & VM_HUGETLB) &&
-	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
-		info.start = start;
-		info.end = end;
-	} else {
-		info.start = 0UL;
-		info.end = TLB_FLUSH_ALL;
+	if ((end == TLB_FLUSH_ALL) ||
+	    ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
+		start = 0;
+		end = TLB_FLUSH_ALL;
 	}
 
+	/* This is also a barrier that synchronizes with switch_mm(). */
+	new_tlb_gen = inc_mm_tlb_gen(mm);
+
+	info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
+				  new_tlb_gen);
+
 	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
-		VM_WARN_ON(irqs_disabled());
+		lockdep_assert_irqs_enabled();
 		local_irq_disable();
-		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
+		flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
 		local_irq_enable();
 	}
 
 	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
-		flush_tlb_others(mm_cpumask(mm), &info);
+		flush_tlb_others(mm_cpumask(mm), info);
 
+	put_flush_tlb_info();
 	put_cpu();
 }
 
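After this hunk, flush_tlb_mm_range() keeps a ranged flush only when the number of stride-sized entries is at or below tlb_single_page_flush_ceiling (33); a larger range, or TLB_FLUSH_ALL, is widened to a full flush before the request is packed into the per-CPU flush_tlb_info. A small sketch of that decision with stand-in names:

	/* Sketch of the ranged-vs-full decision keyed off the 33-entry ceiling. */
	#include <stdbool.h>

	#define EXAMPLE_FLUSH_ALL	(~0UL)
	#define EXAMPLE_CEILING		33UL	/* tlb_single_page_flush_ceiling */

	static bool use_full_flush(unsigned long start, unsigned long end,
				   unsigned int stride_shift)
	{
		if (end == EXAMPLE_FLUSH_ALL)
			return true;

		/* e.g. 40 entries at a 4 KB stride (shift 12) exceeds 33 -> full flush */
		return ((end - start) >> stride_shift) > EXAMPLE_CEILING;
	}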
@@ -750,49 +939,249 @@
 
 	/* flush range by one by one 'invlpg' */
 	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
-		__flush_tlb_one_kernel(addr);
+		flush_tlb_one_kernel(addr);
 }
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-
 	/* Balance as user space task's flush, a bit conservative */
 	if (end == TLB_FLUSH_ALL ||
 	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
 		on_each_cpu(do_flush_tlb_all, NULL, 1);
 	} else {
-		struct flush_tlb_info info;
-		info.start = start;
-		info.end = end;
-		on_each_cpu(do_kernel_range_flush, &info, 1);
+		struct flush_tlb_info *info;
+
+		preempt_disable();
+		info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
+
+		on_each_cpu(do_kernel_range_flush, info, 1);
+
+		put_flush_tlb_info();
+		preempt_enable();
 	}
 }
 
+/*
+ * This can be used from process context to figure out what the value of
+ * CR3 is without needing to do a (slow) __read_cr3().
+ *
+ * It's intended to be used for code like KVM that sneakily changes CR3
+ * and needs to restore it. It needs to be used very carefully.
+ */
+unsigned long __get_current_cr3_fast(void)
+{
+	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+		this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+	/* For now, be very restrictive about when this can be called. */
+	VM_WARN_ON(in_nmi() || preemptible());
+
+	VM_BUG_ON(cr3 != __read_cr3());
+	return cr3;
+}
+EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
+
+/*
+ * Flush one page in the kernel mapping
+ */
+void flush_tlb_one_kernel(unsigned long addr)
+{
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+
+	/*
+	 * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
+	 * paravirt equivalent. Even with PCID, this is sufficient: we only
+	 * use PCID if we also use global PTEs for the kernel mapping, and
+	 * INVLPG flushes global translations across all address spaces.
+	 *
+	 * If PTI is on, then the kernel is mapped with non-global PTEs, and
+	 * __flush_tlb_one_user() will flush the given address for the current
+	 * kernel address space and for its usermode counterpart, but it does
+	 * not flush it for other address spaces.
+	 */
+	flush_tlb_one_user(addr);
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	/*
+	 * See above. We need to propagate the flush to all other address
+	 * spaces. In principle, we only need to propagate it to kernelmode
+	 * address spaces, but the extra bookkeeping we would need is not
+	 * worth it.
+	 */
+	this_cpu_write(cpu_tlbstate.invalidate_other, true);
+}
+
+/*
+ * Flush one page in the user mapping
+ */
+STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
+{
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
+	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	/*
+	 * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
+	 * Just use invalidate_user_asid() in case we are called early.
+	 */
+	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
+		invalidate_user_asid(loaded_mm_asid);
+	else
+		invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
+}
+
+void flush_tlb_one_user(unsigned long addr)
+{
+	__flush_tlb_one_user(addr);
+}
+
+/*
+ * Flush everything
+ */
+STATIC_NOPV void native_flush_tlb_global(void)
+{
+	unsigned long cr4, flags;
+
+	if (static_cpu_has(X86_FEATURE_INVPCID)) {
+		/*
+		 * Using INVPCID is considerably faster than a pair of writes
+		 * to CR4 sandwiched inside an IRQ flag save/restore.
+		 *
+		 * Note, this works with CR4.PCIDE=0 or 1.
+		 */
+		invpcid_flush_all();
+		return;
+	}
+
+	/*
+	 * Read-modify-write to CR4 - protect it from preemption and
+	 * from interrupts. (Use the raw variant because this code can
+	 * be called from deep inside debugging code.)
+	 */
+	raw_local_irq_save(flags);
+
+	cr4 = this_cpu_read(cpu_tlbstate.cr4);
+	/* toggle PGE */
+	native_write_cr4(cr4 ^ X86_CR4_PGE);
+	/* write old PGE again and flush TLBs */
+	native_write_cr4(cr4);
+
+	raw_local_irq_restore(flags);
+}
+
+/*
+ * Flush the entire current user mapping
+ */
+STATIC_NOPV void native_flush_tlb_local(void)
+{
+	/*
+	 * Preemption or interrupts must be disabled to protect the access
+	 * to the per CPU variable and to prevent being preempted between
+	 * read_cr3() and write_cr3().
+	 */
+	WARN_ON_ONCE(preemptible());
+
+	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+	/* If current->mm == NULL then the read_cr3() "borrows" an mm */
+	native_write_cr3(__native_read_cr3());
+}
+
+void flush_tlb_local(void)
+{
+	__flush_tlb_local();
+}
+
+/*
+ * Flush everything
+ */
+void __flush_tlb_all(void)
+{
+	/*
+	 * This is to catch users with enabled preemption and the PGE feature
+	 * and don't trigger the warning in __native_flush_tlb().
+	 */
+	VM_WARN_ON_ONCE(preemptible());
+
+	if (boot_cpu_has(X86_FEATURE_PGE)) {
+		__flush_tlb_global();
+	} else {
+		/*
+		 * !PGE -> !PCID (setup_pcid()), thus every flush is total.
+		 */
+		flush_tlb_local();
+	}
+}
+EXPORT_SYMBOL_GPL(__flush_tlb_all);
+
+/*
+ * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
+ * This means that the 'struct flush_tlb_info' that describes which mappings to
+ * flush is actually fixed. We therefore set a single fixed struct and use it in
+ * arch_tlbbatch_flush().
+ */
+static const struct flush_tlb_info full_flush_tlb_info = {
+	.mm = NULL,
+	.start = 0,
+	.end = TLB_FLUSH_ALL,
+};
+
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
-	struct flush_tlb_info info = {
-		.mm = NULL,
-		.start = 0UL,
-		.end = TLB_FLUSH_ALL,
-	};
-
 	int cpu = get_cpu();
 
 	if (cpumask_test_cpu(cpu, &batch->cpumask)) {
-		VM_WARN_ON(irqs_disabled());
+		lockdep_assert_irqs_enabled();
 		local_irq_disable();
-		flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
+		flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN);
 		local_irq_enable();
 	}
 
 	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
-		flush_tlb_others(&batch->cpumask, &info);
+		flush_tlb_others(&batch->cpumask, &full_flush_tlb_info);
 
 	cpumask_clear(&batch->cpumask);
 
 	put_cpu();
 }
 
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm. It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+bool nmi_uaccess_okay(void)
+{
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	struct mm_struct *current_mm = current->mm;
+
+	VM_WARN_ON_ONCE(!loaded_mm);
+
+	/*
+	 * The condition we want to check is
+	 * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
+	 * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+	 * is supposed to be reasonably fast.
+	 *
+	 * Instead, we check the almost equivalent but somewhat conservative
+	 * condition below, and we rely on the fact that switch_mm_irqs_off()
+	 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+	 */
+	if (loaded_mm != current_mm)
+		return false;
+
+	VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+
+	return true;
+}
+
 static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
 				  size_t count, loff_t *ppos)
 {
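One more note on the full-flush helpers added in the final hunk: native_flush_tlb_global() prefers INVPCID when the CPU has it and otherwise flushes by toggling CR4.PGE twice with interrupts off, while __flush_tlb_all() only takes the global path when PGE exists at all (no PGE implies no PCID, so a plain CR3 reload is already a total flush). A compact sketch of that selection, with stand-in feature flags:

	/* Sketch of how a full flush picks its implementation. */
	#include <stdbool.h>

	enum flush_kind { FLUSH_INVPCID_ALL, FLUSH_TOGGLE_CR4_PGE, FLUSH_RELOAD_CR3 };

	static enum flush_kind pick_full_flush(bool has_pge, bool has_invpcid)
	{
		if (!has_pge)
			return FLUSH_RELOAD_CR3;	/* !PGE -> !PCID: a CR3 write flushes everything */
		if (has_invpcid)
			return FLUSH_INVPCID_ALL;	/* cheapest way to drop global entries too */
		return FLUSH_TOGGLE_CR4_PGE;		/* two CR4 writes inside irq save/restore */
	}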