From 95099d4622f8cb224d94e314c7a8e0df60b13f87 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Sat, 09 Dec 2023 08:38:01 +0000
Subject: [PATCH] enable docker ppp
---
kernel/arch/x86/mm/tlb.c | 711 +++++++++++++++++++++++++++++++++++++++++++++-------------
1 files changed, 550 insertions(+), 161 deletions(-)
diff --git a/kernel/arch/x86/mm/tlb.c b/kernel/arch/x86/mm/tlb.c
index 2f41a34..569ac1d 100644
--- a/kernel/arch/x86/mm/tlb.c
+++ b/kernel/arch/x86/mm/tlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/init.h>
#include <linux/mm.h>
@@ -13,7 +14,18 @@
#include <asm/nospec-branch.h>
#include <asm/cache.h>
#include <asm/apic.h>
-#include <asm/uv/uv.h>
+
+#include "mm_internal.h"
+
+#ifdef CONFIG_PARAVIRT
+# define STATIC_NOPV
+#else
+# define STATIC_NOPV static
+# define __flush_tlb_local native_flush_tlb_local
+# define __flush_tlb_global native_flush_tlb_global
+# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr)
+# define __flush_tlb_others(msk, info) native_flush_tlb_others(msk, info)
+#endif
/*
* TLB flushing, formerly SMP-only
@@ -34,6 +46,126 @@
* stored in cpu_tlb_state.last_user_mm_ibpb.
*/
#define LAST_USER_MM_IBPB 0x1UL
+
+/*
+ * The x86 feature is called PCID (Process Context IDentifier). It is similar
+ * to what is traditionally called ASID on the RISC processors.
+ *
+ * We don't use the traditional ASID implementation, where each process/mm gets
+ * its own ASID and flush/restart when we run out of ASID space.
+ *
+ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
+ * that came by on this CPU, allowing cheaper switch_mm between processes on
+ * this CPU.
+ *
+ * We end up with different spaces for different things. To avoid confusion we
+ * use different names for each of them:
+ *
+ * ASID - [0, TLB_NR_DYN_ASIDS-1]
+ * the canonical identifier for an mm
+ *
+ * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * the value we write into the PCID part of CR3; corresponds to the
+ * ASID+1, because PCID 0 is special.
+ *
+ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * for KPTI each mm has two address spaces and thus needs two
+ * PCID values, but we can still do with a single ASID denomination
+ * for each mm. Corresponds to kPCID + 2048.
+ *
+ */
+
+/* There are 12 bits of space for ASIDS in CR3 */
+#define CR3_HW_ASID_BITS 12
+
+/*
+ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * user/kernel switches
+ */
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS 1
+#else
+# define PTI_CONSUMED_PCID_BITS 0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
+
+/*
+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
+ * for them being zero-based. Another -1 is because PCID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+
+/*
+ * Given @asid, compute kPCID
+ */
+static inline u16 kern_pcid(u16 asid)
+{
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
+ * Make sure that the dynamic ASID space does not confict with the
+ * bit we are using to switch between user and kernel ASIDs.
+ */
+ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
+
+ /*
+ * The ASID being passed in here should have respected the
+ * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+ */
+ VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
+#endif
+ /*
+ * The dynamically-assigned ASIDs that get passed in are small
+ * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
+ * so do not bother to clear it.
+ *
+ * If PCID is on, ASID-aware code paths put the ASID+1 into the
+ * PCID bits. This serves two purposes. It prevents a nasty
+ * situation in which PCID-unaware code saves CR3, loads some other
+ * value (with PCID == 0), and then restores CR3, thus corrupting
+ * the TLB for ASID 0 if the saved ASID was nonzero. It also means
+ * that any bugs involving loading a PCID-enabled CR3 with
+ * CR4.PCIDE off will trigger deterministically.
+ */
+ return asid + 1;
+}
+
+/*
+ * Given @asid, compute uPCID
+ */
+static inline u16 user_pcid(u16 asid)
+{
+ u16 ret = kern_pcid(asid);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
+#endif
+ return ret;
+}
+
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+{
+ if (static_cpu_has(X86_FEATURE_PCID)) {
+ return __sme_pa(pgd) | kern_pcid(asid);
+ } else {
+ VM_WARN_ON_ONCE(asid != 0);
+ return __sme_pa(pgd);
+ }
+}
+
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+{
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+ /*
+ * Use boot_cpu_has() instead of this_cpu_has() as this function
+ * might be called during early boot. This should work even after
+ * boot because all CPU's the have same capabilities:
+ */
+ VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
+ return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+}
/*
* We get here when we do something requiring a TLB invalidation
@@ -107,6 +239,32 @@
*need_flush = true;
}
+/*
+ * Given an ASID, flush the corresponding user ASID. We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
+{
+ /* There is no user ASID if address space separation is off */
+ if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ return;
+
+ /*
+ * We only have a single ASID if PCID is off and the CR3
+ * write will have flushed it.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_PCID))
+ return;
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ __set_bit(kern_pcid(asid),
+ (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+}
+
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
{
unsigned long new_mm_cr3;
@@ -156,34 +314,6 @@
local_irq_save(flags);
switch_mm_irqs_off(prev, next, tsk);
local_irq_restore(flags);
-}
-
-static void sync_current_stack_to_mm(struct mm_struct *mm)
-{
- unsigned long sp = current_stack_pointer;
- pgd_t *pgd = pgd_offset(mm, sp);
-
- if (pgtable_l5_enabled()) {
- if (unlikely(pgd_none(*pgd))) {
- pgd_t *pgd_ref = pgd_offset_k(sp);
-
- set_pgd(pgd, *pgd_ref);
- }
- } else {
- /*
- * "pgd" is faked. The top level entries are "p4d"s, so sync
- * the p4d. This compiles to approximately the same code as
- * the 5-level case.
- */
- p4d_t *p4d = p4d_offset(pgd, sp);
-
- if (unlikely(p4d_none(*p4d))) {
- pgd_t *pgd_ref = pgd_offset_k(sp);
- p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);
-
- set_p4d(p4d, *p4d_ref);
- }
- }
}
static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
@@ -269,13 +399,36 @@
}
}
+#ifdef CONFIG_PERF_EVENTS
+static inline void cr4_update_pce_mm(struct mm_struct *mm)
+{
+ if (static_branch_unlikely(&rdpmc_always_available_key) ||
+ (!static_branch_unlikely(&rdpmc_never_available_key) &&
+ atomic_read(&mm->context.perf_rdpmc_allowed)))
+ cr4_set_bits_irqsoff(X86_CR4_PCE);
+ else
+ cr4_clear_bits_irqsoff(X86_CR4_PCE);
+}
+
+void cr4_update_pce(void *ignored)
+{
+ cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
+}
+
+#else
+static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
+#endif
+
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
+ bool need_flush;
+ u16 new_asid;
/*
* NB: The scheduler will call us with prev == next when switching
@@ -335,36 +488,47 @@
next->context.ctx_id);
/*
- * We don't currently support having a real mm loaded without
- * our cpu set in mm_cpumask(). We have all the bookkeeping
- * in place to figure out whether we would need to flush
- * if our cpu were cleared in mm_cpumask(), but we don't
- * currently use it.
+ * Even in lazy TLB mode, the CPU should stay set in the
+ * mm_cpumask. The TLB shootdown code can figure out from
+ * from cpu_tlbstate.is_lazy whether or not to send an IPI.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
- return;
- } else {
- u16 new_asid;
- bool need_flush;
+ /*
+ * If the CPU is not in lazy TLB mode, we are just switching
+ * from one thread in a process to another thread in the same
+ * process. No TLB flush required.
+ */
+ if (!was_lazy)
+ return;
+ /*
+ * Read the tlb_gen to check whether a flush is needed.
+ * If the TLB is up to date, just use it.
+ * The barrier synchronizes with the tlb_gen increment in
+ * the TLB shootdown code.
+ */
+ smp_mb();
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+ if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+ next_tlb_gen)
+ return;
+
+ /*
+ * TLB contents went out of date while we were in lazy
+ * mode. Fall through to the TLB switching code below.
+ */
+ new_asid = prev_asid;
+ need_flush = true;
+ } else {
/*
* Avoid user/user BTB poisoning by flushing the branch
* predictor when switching between processes. This stops
* one process from doing Spectre-v2 attacks on another.
*/
cond_ibpb(tsk);
-
- if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- /*
- * If our current stack is in vmalloc space and isn't
- * mapped in the new pgd, we'll double-fault. Forcibly
- * map it.
- */
- sync_current_stack_to_mm(next);
- }
/*
* Stop remote flushes for the previous mm.
@@ -389,38 +553,31 @@
/* Let nmi_uaccess_okay() know that we're changing CR3. */
this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
barrier();
-
- if (need_flush) {
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- load_new_mm_cr3(next->pgd, new_asid, true);
-
- /*
- * NB: This gets called via leave_mm() in the idle path
- * where RCU functions differently. Tracing normally
- * uses RCU, so we need to use the _rcuidle variant.
- *
- * (There is no good reason for this. The idle code should
- * be rearranged to call this before rcu_idle_enter().)
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- } else {
- /* The new ASID is already up to date. */
- load_new_mm_cr3(next->pgd, new_asid, false);
-
- /* See above wrt _rcuidle. */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
- }
-
- /* Make sure we write CR3 before loaded_mm. */
- barrier();
-
- this_cpu_write(cpu_tlbstate.loaded_mm, next);
- this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
}
- load_mm_cr4(next);
- switch_ldt(real_prev, next);
+ if (need_flush) {
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+ load_new_mm_cr3(next->pgd, new_asid, true);
+
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ } else {
+ /* The new ASID is already up to date. */
+ load_new_mm_cr3(next->pgd, new_asid, false);
+
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+ }
+
+ /* Make sure we write CR3 before loaded_mm. */
+ barrier();
+
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
+ if (next != real_prev) {
+ cr4_update_pce_mm(next);
+ switch_ldt(real_prev, next);
+ }
}
/*
@@ -441,20 +598,7 @@
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
- if (tlb_defer_switch_to_init_mm()) {
- /*
- * There's a significant optimization that may be possible
- * here. We have accurate enough TLB flush tracking that we
- * don't need to maintain coherence of TLB per se when we're
- * lazy. We do, however, need to maintain coherence of
- * paging-structure caches. We could, in principle, leave our
- * old mm loaded and only switch to init_mm when
- * tlb_remove_page() happens.
- */
- this_cpu_write(cpu_tlbstate.is_lazy, true);
- } else {
- switch_mm(NULL, &init_mm, NULL);
- }
+ this_cpu_write(cpu_tlbstate.is_lazy, true);
}
/*
@@ -541,6 +685,9 @@
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
+ *
+ * This should be rare, with native_flush_tlb_others skipping
+ * IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
@@ -601,20 +748,19 @@
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
- unsigned long addr;
- unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+ unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
+ unsigned long addr = f->start;
- addr = f->start;
while (addr < f->end) {
- __flush_tlb_one_user(addr);
- addr += PAGE_SIZE;
+ flush_tlb_one_user(addr);
+ addr += 1UL << f->stride_shift;
}
if (local)
- count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
- trace_tlb_flush(reason, nr_pages);
+ count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
+ trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
- local_flush_tlb();
+ flush_tlb_local();
if (local)
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
trace_tlb_flush(reason, TLB_FLUSH_ALL);
@@ -624,7 +770,7 @@
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}
-static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
+static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
{
const struct flush_tlb_info *f = info;
@@ -644,8 +790,13 @@
flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}
-void native_flush_tlb_others(const struct cpumask *cpumask,
- const struct flush_tlb_info *info)
+static bool tlb_is_not_lazy(int cpu, void *data)
+{
+ return !per_cpu(cpu_tlbstate.is_lazy, cpu);
+}
+
+STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
@@ -654,34 +805,32 @@
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
(info->end - info->start) >> PAGE_SHIFT);
- if (is_uv_system()) {
- /*
- * This whole special case is confused. UV has a "Broadcast
- * Assist Unit", which seems to be a fancy way to send IPIs.
- * Back when x86 used an explicit TLB flush IPI, UV was
- * optimized to use its own mechanism. These days, x86 uses
- * smp_call_function_many(), but UV still uses a manual IPI,
- * and that IPI's action is out of date -- it does a manual
- * flush instead of calling flush_tlb_func_remote(). This
- * means that the percpu tlb_gen variables won't be updated
- * and we'll do pointless flushes on future context switches.
- *
- * Rather than hooking native_flush_tlb_others() here, I think
- * that UV should be updated so that smp_call_function_many(),
- * etc, are optimal on UV.
- */
- cpumask = uv_flush_tlb_others(cpumask, info);
- if (cpumask)
- smp_call_function_many(cpumask, flush_tlb_func_remote,
- (void *)info, 1);
- return;
- }
- smp_call_function_many(cpumask, flush_tlb_func_remote,
+ /*
+ * If no page tables were freed, we can skip sending IPIs to
+ * CPUs in lazy TLB mode. They will flush the CPU themselves
+ * at the next context switch.
+ *
+ * However, if page tables are getting freed, we need to send the
+ * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
+ * up on the new contents of what used to be page tables, while
+ * doing a speculative memory access.
+ */
+ if (info->freed_tables)
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
+ else
+ on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
+ (void *)info, 1, cpumask);
+}
+
+void flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ __flush_tlb_others(cpumask, info);
}
/*
- * See Documentation/x86/tlb.txt for details. We choose 33
+ * See Documentation/x86/tlb.rst for details. We choose 33
* because it is large enough to cover the vast majority (at
* least 95%) of allocations, and is small enough that we are
* confident it will not cause too much overhead. Each single
@@ -690,43 +839,83 @@
*
* This is in units of pages.
*/
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
+
+#ifdef CONFIG_DEBUG_VM
+static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
+#endif
+
+static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ unsigned int stride_shift, bool freed_tables,
+ u64 new_tlb_gen)
+{
+ struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
+
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * Ensure that the following code is non-reentrant and flush_tlb_info
+ * is not overwritten. This means no TLB flushing is initiated by
+ * interrupt handlers and machine-check exception handlers.
+ */
+ BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
+#endif
+
+ info->start = start;
+ info->end = end;
+ info->mm = mm;
+ info->stride_shift = stride_shift;
+ info->freed_tables = freed_tables;
+ info->new_tlb_gen = new_tlb_gen;
+
+ return info;
+}
+
+static inline void put_flush_tlb_info(void)
+{
+#ifdef CONFIG_DEBUG_VM
+ /* Complete reentrency prevention checks */
+ barrier();
+ this_cpu_dec(flush_tlb_info_idx);
+#endif
+}
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned long vmflag)
+ unsigned long end, unsigned int stride_shift,
+ bool freed_tables)
{
+ struct flush_tlb_info *info;
+ u64 new_tlb_gen;
int cpu;
-
- struct flush_tlb_info info = {
- .mm = mm,
- };
cpu = get_cpu();
- /* This is also a barrier that synchronizes with switch_mm(). */
- info.new_tlb_gen = inc_mm_tlb_gen(mm);
-
/* Should we flush just the requested range? */
- if ((end != TLB_FLUSH_ALL) &&
- !(vmflag & VM_HUGETLB) &&
- ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
- info.start = start;
- info.end = end;
- } else {
- info.start = 0UL;
- info.end = TLB_FLUSH_ALL;
+ if ((end == TLB_FLUSH_ALL) ||
+ ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
+ start = 0;
+ end = TLB_FLUSH_ALL;
}
+ /* This is also a barrier that synchronizes with switch_mm(). */
+ new_tlb_gen = inc_mm_tlb_gen(mm);
+
+ info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
+ new_tlb_gen);
+
if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
- VM_WARN_ON(irqs_disabled());
+ lockdep_assert_irqs_enabled();
local_irq_disable();
- flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
+ flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
local_irq_enable();
}
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), &info);
+ flush_tlb_others(mm_cpumask(mm), info);
+ put_flush_tlb_info();
put_cpu();
}
@@ -750,49 +939,249 @@
/* flush range by one by one 'invlpg' */
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
- __flush_tlb_one_kernel(addr);
+ flush_tlb_one_kernel(addr);
}
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
-
/* Balance as user space task's flush, a bit conservative */
if (end == TLB_FLUSH_ALL ||
(end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
on_each_cpu(do_flush_tlb_all, NULL, 1);
} else {
- struct flush_tlb_info info;
- info.start = start;
- info.end = end;
- on_each_cpu(do_kernel_range_flush, &info, 1);
+ struct flush_tlb_info *info;
+
+ preempt_disable();
+ info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
+
+ on_each_cpu(do_kernel_range_flush, info, 1);
+
+ put_flush_tlb_info();
+ preempt_enable();
}
}
+/*
+ * This can be used from process context to figure out what the value of
+ * CR3 is without needing to do a (slow) __read_cr3().
+ *
+ * It's intended to be used for code like KVM that sneakily changes CR3
+ * and needs to restore it. It needs to be used very carefully.
+ */
+unsigned long __get_current_cr3_fast(void)
+{
+ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+ this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+ /* For now, be very restrictive about when this can be called. */
+ VM_WARN_ON(in_nmi() || preemptible());
+
+ VM_BUG_ON(cr3 != __read_cr3());
+ return cr3;
+}
+EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
+
+/*
+ * Flush one page in the kernel mapping
+ */
+void flush_tlb_one_kernel(unsigned long addr)
+{
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+
+ /*
+ * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
+ * paravirt equivalent. Even with PCID, this is sufficient: we only
+ * use PCID if we also use global PTEs for the kernel mapping, and
+ * INVLPG flushes global translations across all address spaces.
+ *
+ * If PTI is on, then the kernel is mapped with non-global PTEs, and
+ * __flush_tlb_one_user() will flush the given address for the current
+ * kernel address space and for its usermode counterpart, but it does
+ * not flush it for other address spaces.
+ */
+ flush_tlb_one_user(addr);
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * See above. We need to propagate the flush to all other address
+ * spaces. In principle, we only need to propagate it to kernelmode
+ * address spaces, but the extra bookkeeping we would need is not
+ * worth it.
+ */
+ this_cpu_write(cpu_tlbstate.invalidate_other, true);
+}
+
+/*
+ * Flush one page in the user mapping
+ */
+STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
+{
+ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
+ * Just use invalidate_user_asid() in case we are called early.
+ */
+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
+ invalidate_user_asid(loaded_mm_asid);
+ else
+ invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
+}
+
+void flush_tlb_one_user(unsigned long addr)
+{
+ __flush_tlb_one_user(addr);
+}
+
+/*
+ * Flush everything
+ */
+STATIC_NOPV void native_flush_tlb_global(void)
+{
+ unsigned long cr4, flags;
+
+ if (static_cpu_has(X86_FEATURE_INVPCID)) {
+ /*
+ * Using INVPCID is considerably faster than a pair of writes
+ * to CR4 sandwiched inside an IRQ flag save/restore.
+ *
+ * Note, this works with CR4.PCIDE=0 or 1.
+ */
+ invpcid_flush_all();
+ return;
+ }
+
+ /*
+ * Read-modify-write to CR4 - protect it from preemption and
+ * from interrupts. (Use the raw variant because this code can
+ * be called from deep inside debugging code.)
+ */
+ raw_local_irq_save(flags);
+
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ /* toggle PGE */
+ native_write_cr4(cr4 ^ X86_CR4_PGE);
+ /* write old PGE again and flush TLBs */
+ native_write_cr4(cr4);
+
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * Flush the entire current user mapping
+ */
+STATIC_NOPV void native_flush_tlb_local(void)
+{
+ /*
+ * Preemption or interrupts must be disabled to protect the access
+ * to the per CPU variable and to prevent being preempted between
+ * read_cr3() and write_cr3().
+ */
+ WARN_ON_ONCE(preemptible());
+
+ invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+ /* If current->mm == NULL then the read_cr3() "borrows" an mm */
+ native_write_cr3(__native_read_cr3());
+}
+
+void flush_tlb_local(void)
+{
+ __flush_tlb_local();
+}
+
+/*
+ * Flush everything
+ */
+void __flush_tlb_all(void)
+{
+ /*
+ * This is to catch users with enabled preemption and the PGE feature
+ * and don't trigger the warning in __native_flush_tlb().
+ */
+ VM_WARN_ON_ONCE(preemptible());
+
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ __flush_tlb_global();
+ } else {
+ /*
+ * !PGE -> !PCID (setup_pcid()), thus every flush is total.
+ */
+ flush_tlb_local();
+ }
+}
+EXPORT_SYMBOL_GPL(__flush_tlb_all);
+
+/*
+ * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
+ * This means that the 'struct flush_tlb_info' that describes which mappings to
+ * flush is actually fixed. We therefore set a single fixed struct and use it in
+ * arch_tlbbatch_flush().
+ */
+static const struct flush_tlb_info full_flush_tlb_info = {
+ .mm = NULL,
+ .start = 0,
+ .end = TLB_FLUSH_ALL,
+};
+
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
- struct flush_tlb_info info = {
- .mm = NULL,
- .start = 0UL,
- .end = TLB_FLUSH_ALL,
- };
-
int cpu = get_cpu();
if (cpumask_test_cpu(cpu, &batch->cpumask)) {
- VM_WARN_ON(irqs_disabled());
+ lockdep_assert_irqs_enabled();
local_irq_disable();
- flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
+ flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN);
local_irq_enable();
}
if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
- flush_tlb_others(&batch->cpumask, &info);
+ flush_tlb_others(&batch->cpumask, &full_flush_tlb_info);
cpumask_clear(&batch->cpumask);
put_cpu();
}
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm. It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+bool nmi_uaccess_okay(void)
+{
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ struct mm_struct *current_mm = current->mm;
+
+ VM_WARN_ON_ONCE(!loaded_mm);
+
+ /*
+ * The condition we want to check is
+ * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
+ * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+ * is supposed to be reasonably fast.
+ *
+ * Instead, we check the almost equivalent but somewhat conservative
+ * condition below, and we rely on the fact that switch_mm_irqs_off()
+ * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+ */
+ if (loaded_mm != current_mm)
+ return false;
+
+ VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+
+ return true;
+}
+
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
--
Gitblit v1.6.2