.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | #include <linux/init.h> |
---|
2 | 3 | |
---|
3 | 4 | #include <linux/mm.h> |
---|
.. | .. |
---|
13 | 14 | #include <asm/nospec-branch.h> |
---|
14 | 15 | #include <asm/cache.h> |
---|
15 | 16 | #include <asm/apic.h> |
---|
16 | | -#include <asm/uv/uv.h> |
---|
| 17 | + |
---|
| 18 | +#include "mm_internal.h" |
---|
| 19 | + |
---|
| 20 | +#ifdef CONFIG_PARAVIRT |
---|
| 21 | +# define STATIC_NOPV |
---|
| 22 | +#else |
---|
| 23 | +# define STATIC_NOPV static |
---|
| 24 | +# define __flush_tlb_local native_flush_tlb_local |
---|
| 25 | +# define __flush_tlb_global native_flush_tlb_global |
---|
| 26 | +# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr) |
---|
| 27 | +# define __flush_tlb_others(msk, info) native_flush_tlb_others(msk, info) |
---|
| 28 | +#endif |
---|
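The STATIC_NOPV scheme lets one source file serve both configurations: with CONFIG_PARAVIRT the native_flush_tlb_*() helpers stay global so the paravirt layer can route __flush_tlb_*() through pv_ops, and without it they become static and the __flush_tlb_*() names resolve to the native functions at compile time. A stripped-down standalone model of the same pattern (all demo_* names are illustrative, not the kernel's):

```c
#include <stdio.h>

/* Toggle this to mimic CONFIG_PARAVIRT being enabled or disabled. */
/* #define DEMO_PARAVIRT */

#ifdef DEMO_PARAVIRT
# define STATIC_NOPV                      /* keep the native function global */
static void (*demo_pv_flush)(void);       /* stand-in for a pv_ops hook */
# define demo_flush() demo_pv_flush()
#else
# define STATIC_NOPV static               /* native function can be static */
# define demo_flush demo_native_flush     /* wrapper resolves at compile time */
#endif

STATIC_NOPV void demo_native_flush(void)
{
	puts("native flush");
}

int main(void)
{
#ifdef DEMO_PARAVIRT
	demo_pv_flush = demo_native_flush;   /* a hypervisor could override this */
#endif
	demo_flush();
	return 0;
}
```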
17 | 29 | |
---|
18 | 30 | /* |
---|
19 | 31 | * TLB flushing, formerly SMP-only |
---|
.. | .. |
---|
34 | 46 | * stored in cpu_tlb_state.last_user_mm_ibpb. |
---|
35 | 47 | */ |
---|
36 | 48 | #define LAST_USER_MM_IBPB 0x1UL |
---|
| 49 | + |
---|
| 50 | +/* |
---|
| 51 | + * The x86 feature is called PCID (Process Context IDentifier). It is similar |
---|
| 52 | + * to what is traditionally called ASID on the RISC processors. |
---|
| 53 | + * |
---|
| 54 | + * We don't use the traditional ASID implementation, where each process/mm gets |
---|
| 55 | + * its own ASID and flush/restart when we run out of ASID space. |
---|
| 56 | + * |
---|
| 57 | + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's |
---|
| 58 | + * that came by on this CPU, allowing cheaper switch_mm between processes on |
---|
| 59 | + * this CPU. |
---|
| 60 | + * |
---|
| 61 | + * We end up with different spaces for different things. To avoid confusion we |
---|
| 62 | + * use different names for each of them: |
---|
| 63 | + * |
---|
| 64 | + * ASID - [0, TLB_NR_DYN_ASIDS-1] |
---|
| 65 | + * the canonical identifier for an mm |
---|
| 66 | + * |
---|
| 67 | + * kPCID - [1, TLB_NR_DYN_ASIDS] |
---|
| 68 | + * the value we write into the PCID part of CR3; corresponds to the |
---|
| 69 | + * ASID+1, because PCID 0 is special. |
---|
| 70 | + * |
---|
| 71 | + * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] |
---|
| 72 | + * for KPTI each mm has two address spaces and thus needs two |
---|
| 73 | + * PCID values, but we can still do with a single ASID denomination |
---|
| 74 | + * for each mm. Corresponds to kPCID + 2048. |
---|
| 75 | + * |
---|
| 76 | + */ |
---|
| 77 | + |
---|
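The numbering above fully determines the kPCID and uPCID values. As a rough standalone illustration (userspace C, not kernel code; TLB_NR_DYN_ASIDS is assumed to be 6 here and the 2048 offset is taken from the comment):

```c
#include <stdio.h>

#define TLB_NR_DYN_ASIDS     6      /* assumed value, for illustration only */
#define PTI_USER_PCID_OFFSET 2048   /* bit 11 of the PCID field, per the comment */

static unsigned int demo_kern_pcid(unsigned int asid)
{
	return asid + 1;                /* PCID 0 is special, so kPCID = ASID + 1 */
}

static unsigned int demo_user_pcid(unsigned int asid)
{
	return demo_kern_pcid(asid) + PTI_USER_PCID_OFFSET;
}

int main(void)
{
	for (unsigned int asid = 0; asid < TLB_NR_DYN_ASIDS; asid++)
		printf("ASID %u -> kPCID %u, uPCID %u\n",
		       asid, demo_kern_pcid(asid), demo_user_pcid(asid));
	return 0;
}
```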
| 78 | +/* There are 12 bits of space for ASIDs in CR3 */ |
---|
| 79 | +#define CR3_HW_ASID_BITS 12 |
---|
| 80 | + |
---|
| 81 | +/* |
---|
| 82 | + * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for |
---|
| 83 | + * user/kernel switches |
---|
| 84 | + */ |
---|
| 85 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
---|
| 86 | +# define PTI_CONSUMED_PCID_BITS 1 |
---|
| 87 | +#else |
---|
| 88 | +# define PTI_CONSUMED_PCID_BITS 0 |
---|
| 89 | +#endif |
---|
| 90 | + |
---|
| 91 | +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) |
---|
| 92 | + |
---|
| 93 | +/* |
---|
| 94 | + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account |
---|
| 95 | + * for them being zero-based. Another -1 is because PCID 0 is reserved for |
---|
| 96 | + * use by non-PCID-aware users. |
---|
| 97 | + */ |
---|
| 98 | +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) |
---|
| 99 | + |
---|
| 100 | +/* |
---|
| 101 | + * Given @asid, compute kPCID |
---|
| 102 | + */ |
---|
| 103 | +static inline u16 kern_pcid(u16 asid) |
---|
| 104 | +{ |
---|
| 105 | + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); |
---|
| 106 | + |
---|
| 107 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
---|
| 108 | + /* |
---|
| 109 | + * Make sure that the dynamic ASID space does not conflict with the |
---|
| 110 | + * bit we are using to switch between user and kernel ASIDs. |
---|
| 111 | + */ |
---|
| 112 | + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); |
---|
| 113 | + |
---|
| 114 | + /* |
---|
| 115 | + * The ASID being passed in here should have respected the |
---|
| 116 | + * MAX_ASID_AVAILABLE and thus never have the switch bit set. |
---|
| 117 | + */ |
---|
| 118 | + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); |
---|
| 119 | +#endif |
---|
| 120 | + /* |
---|
| 121 | + * The dynamically-assigned ASIDs that get passed in are small |
---|
| 122 | + * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set, |
---|
| 123 | + * so do not bother to clear it. |
---|
| 124 | + * |
---|
| 125 | + * If PCID is on, ASID-aware code paths put the ASID+1 into the |
---|
| 126 | + * PCID bits. This serves two purposes. It prevents a nasty |
---|
| 127 | + * situation in which PCID-unaware code saves CR3, loads some other |
---|
| 128 | + * value (with PCID == 0), and then restores CR3, thus corrupting |
---|
| 129 | + * the TLB for ASID 0 if the saved ASID was nonzero. It also means |
---|
| 130 | + * that any bugs involving loading a PCID-enabled CR3 with |
---|
| 131 | + * CR4.PCIDE off will trigger deterministically. |
---|
| 132 | + */ |
---|
| 133 | + return asid + 1; |
---|
| 134 | +} |
---|
| 135 | + |
---|
| 136 | +/* |
---|
| 137 | + * Given @asid, compute uPCID |
---|
| 138 | + */ |
---|
| 139 | +static inline u16 user_pcid(u16 asid) |
---|
| 140 | +{ |
---|
| 141 | + u16 ret = kern_pcid(asid); |
---|
| 142 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
---|
| 143 | + ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; |
---|
| 144 | +#endif |
---|
| 145 | + return ret; |
---|
| 146 | +} |
---|
| 147 | + |
---|
| 148 | +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) |
---|
| 149 | +{ |
---|
| 150 | + if (static_cpu_has(X86_FEATURE_PCID)) { |
---|
| 151 | + return __sme_pa(pgd) | kern_pcid(asid); |
---|
| 152 | + } else { |
---|
| 153 | + VM_WARN_ON_ONCE(asid != 0); |
---|
| 154 | + return __sme_pa(pgd); |
---|
| 155 | + } |
---|
| 156 | +} |
---|
| 157 | + |
---|
| 158 | +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) |
---|
| 159 | +{ |
---|
| 160 | + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); |
---|
| 161 | + /* |
---|
| 162 | + * Use boot_cpu_has() instead of this_cpu_has() as this function |
---|
| 163 | + * might be called during early boot. This should work even after |
---|
| 164 | + * boot because all CPUs have the same capabilities. |
---|
| 165 | + */ |
---|
| 166 | + VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID)); |
---|
| 167 | + return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; |
---|
| 168 | +} |
---|
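Putting the pieces together, build_cr3() drops the kPCID into the low 12 bits of the PGD's physical address and build_cr3_noflush() additionally sets the CR3 "no flush" hint. A minimal userspace sketch of that composition, assuming the PCID lives in bits 11:0 and the no-flush hint in bit 63 (the demo_* names and the sample address are made up for illustration):

```c
#include <stdint.h>
#include <stdio.h>

#define DEMO_CR3_PCID_MASK  0xfffULL        /* bits 11:0 hold the PCID */
#define DEMO_CR3_NOFLUSH    (1ULL << 63)    /* "don't flush on load" hint */

static uint64_t demo_build_cr3(uint64_t pgd_pa, unsigned int kpcid, int noflush)
{
	uint64_t cr3 = (pgd_pa & ~DEMO_CR3_PCID_MASK) | (kpcid & DEMO_CR3_PCID_MASK);

	if (noflush)
		cr3 |= DEMO_CR3_NOFLUSH;
	return cr3;
}

int main(void)
{
	/* hypothetical page-aligned PGD physical address */
	uint64_t pgd_pa = 0x1234000ULL;

	printf("flushing CR3: %#llx\n",
	       (unsigned long long)demo_build_cr3(pgd_pa, 3, 0));
	printf("no-flush CR3: %#llx\n",
	       (unsigned long long)demo_build_cr3(pgd_pa, 3, 1));
	return 0;
}
```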
37 | 169 | |
---|
38 | 170 | /* |
---|
39 | 171 | * We get here when we do something requiring a TLB invalidation |
---|
.. | .. |
---|
107 | 239 | *need_flush = true; |
---|
108 | 240 | } |
---|
109 | 241 | |
---|
| 242 | +/* |
---|
| 243 | + * Given an ASID, flush the corresponding user ASID. We can delay this |
---|
| 244 | + * until the next time we switch to it. |
---|
| 245 | + * |
---|
| 246 | + * See SWITCH_TO_USER_CR3. |
---|
| 247 | + */ |
---|
| 248 | +static inline void invalidate_user_asid(u16 asid) |
---|
| 249 | +{ |
---|
| 250 | + /* There is no user ASID if address space separation is off */ |
---|
| 251 | + if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) |
---|
| 252 | + return; |
---|
| 253 | + |
---|
| 254 | + /* |
---|
| 255 | + * We only have a single ASID if PCID is off and the CR3 |
---|
| 256 | + * write will have flushed it. |
---|
| 257 | + */ |
---|
| 258 | + if (!cpu_feature_enabled(X86_FEATURE_PCID)) |
---|
| 259 | + return; |
---|
| 260 | + |
---|
| 261 | + if (!static_cpu_has(X86_FEATURE_PTI)) |
---|
| 262 | + return; |
---|
| 263 | + |
---|
| 264 | + __set_bit(kern_pcid(asid), |
---|
| 265 | + (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); |
---|
| 266 | +} |
---|
| 267 | + |
---|
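invalidate_user_asid() only records the stale user ASID; the actual flush happens when that user address space is next entered. A toy single-CPU model of the same defer-and-clear pattern, with a plain bitmask standing in for user_pcid_flush_mask and hypothetical helper names:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint16_t pending_user_flush;   /* stands in for user_pcid_flush_mask */

static void demo_invalidate_user_asid(unsigned int asid)
{
	pending_user_flush |= 1u << (asid + 1);   /* record the kPCID, flush later */
}

/* Called when we next switch to the user page tables for @asid. */
static bool demo_need_user_flush(unsigned int asid)
{
	uint16_t bit = 1u << (asid + 1);
	bool need = pending_user_flush & bit;

	pending_user_flush &= ~bit;               /* test and clear */
	return need;
}

int main(void)
{
	demo_invalidate_user_asid(2);
	printf("flush on return to ASID 2? %d\n", demo_need_user_flush(2));
	printf("flush again?               %d\n", demo_need_user_flush(2));
	return 0;
}
```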
110 | 268 | static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) |
---|
111 | 269 | { |
---|
112 | 270 | unsigned long new_mm_cr3; |
---|
.. | .. |
---|
156 | 314 | local_irq_save(flags); |
---|
157 | 315 | switch_mm_irqs_off(prev, next, tsk); |
---|
158 | 316 | local_irq_restore(flags); |
---|
159 | | -} |
---|
160 | | - |
---|
161 | | -static void sync_current_stack_to_mm(struct mm_struct *mm) |
---|
162 | | -{ |
---|
163 | | - unsigned long sp = current_stack_pointer; |
---|
164 | | - pgd_t *pgd = pgd_offset(mm, sp); |
---|
165 | | - |
---|
166 | | - if (pgtable_l5_enabled()) { |
---|
167 | | - if (unlikely(pgd_none(*pgd))) { |
---|
168 | | - pgd_t *pgd_ref = pgd_offset_k(sp); |
---|
169 | | - |
---|
170 | | - set_pgd(pgd, *pgd_ref); |
---|
171 | | - } |
---|
172 | | - } else { |
---|
173 | | - /* |
---|
174 | | - * "pgd" is faked. The top level entries are "p4d"s, so sync |
---|
175 | | - * the p4d. This compiles to approximately the same code as |
---|
176 | | - * the 5-level case. |
---|
177 | | - */ |
---|
178 | | - p4d_t *p4d = p4d_offset(pgd, sp); |
---|
179 | | - |
---|
180 | | - if (unlikely(p4d_none(*p4d))) { |
---|
181 | | - pgd_t *pgd_ref = pgd_offset_k(sp); |
---|
182 | | - p4d_t *p4d_ref = p4d_offset(pgd_ref, sp); |
---|
183 | | - |
---|
184 | | - set_p4d(p4d, *p4d_ref); |
---|
185 | | - } |
---|
186 | | - } |
---|
187 | 317 | } |
---|
188 | 318 | |
---|
189 | 319 | static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) |
---|
.. | .. |
---|
269 | 399 | } |
---|
270 | 400 | } |
---|
271 | 401 | |
---|
| 402 | +#ifdef CONFIG_PERF_EVENTS |
---|
| 403 | +static inline void cr4_update_pce_mm(struct mm_struct *mm) |
---|
| 404 | +{ |
---|
| 405 | + if (static_branch_unlikely(&rdpmc_always_available_key) || |
---|
| 406 | + (!static_branch_unlikely(&rdpmc_never_available_key) && |
---|
| 407 | + atomic_read(&mm->context.perf_rdpmc_allowed))) |
---|
| 408 | + cr4_set_bits_irqsoff(X86_CR4_PCE); |
---|
| 409 | + else |
---|
| 410 | + cr4_clear_bits_irqsoff(X86_CR4_PCE); |
---|
| 411 | +} |
---|
| 412 | + |
---|
| 413 | +void cr4_update_pce(void *ignored) |
---|
| 414 | +{ |
---|
| 415 | + cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm)); |
---|
| 416 | +} |
---|
| 417 | + |
---|
| 418 | +#else |
---|
| 419 | +static inline void cr4_update_pce_mm(struct mm_struct *mm) { } |
---|
| 420 | +#endif |
---|
| 421 | + |
---|
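The CR4.PCE update above boils down to a three-way policy: RDPMC always allowed, never allowed, or allowed only for mms that perf has opted in. A tiny standalone model of just that decision, with plain booleans standing in for the static keys (names are illustrative):

```c
#include <stdbool.h>
#include <stdio.h>

static bool rdpmc_always_available;   /* stand-ins for the static keys */
static bool rdpmc_never_available;

/* Should CR4.PCE be set while this mm is loaded? */
static bool demo_want_pce(int mm_perf_rdpmc_allowed)
{
	return rdpmc_always_available ||
	       (!rdpmc_never_available && mm_perf_rdpmc_allowed > 0);
}

int main(void)
{
	printf("default, perf users in mm:    %d\n", demo_want_pce(1));
	printf("default, no perf users in mm: %d\n", demo_want_pce(0));
	rdpmc_never_available = true;
	printf("rdpmc disabled globally:      %d\n", demo_want_pce(1));
	return 0;
}
```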
272 | 422 | void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, |
---|
273 | 423 | struct task_struct *tsk) |
---|
274 | 424 | { |
---|
275 | 425 | struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); |
---|
276 | 426 | u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); |
---|
| 427 | + bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); |
---|
277 | 428 | unsigned cpu = smp_processor_id(); |
---|
278 | 429 | u64 next_tlb_gen; |
---|
| 430 | + bool need_flush; |
---|
| 431 | + u16 new_asid; |
---|
279 | 432 | |
---|
280 | 433 | /* |
---|
281 | 434 | * NB: The scheduler will call us with prev == next when switching |
---|
.. | .. |
---|
335 | 488 | next->context.ctx_id); |
---|
336 | 489 | |
---|
337 | 490 | /* |
---|
338 | | - * We don't currently support having a real mm loaded without |
---|
339 | | - * our cpu set in mm_cpumask(). We have all the bookkeeping |
---|
340 | | - * in place to figure out whether we would need to flush |
---|
341 | | - * if our cpu were cleared in mm_cpumask(), but we don't |
---|
342 | | - * currently use it. |
---|
| 491 | + * Even in lazy TLB mode, the CPU should stay set in the |
---|
| 492 | + * mm_cpumask. The TLB shootdown code can figure out from |
---|
| 493 | + * cpu_tlbstate.is_lazy whether or not to send an IPI. |
---|
343 | 494 | */ |
---|
344 | 495 | if (WARN_ON_ONCE(real_prev != &init_mm && |
---|
345 | 496 | !cpumask_test_cpu(cpu, mm_cpumask(next)))) |
---|
346 | 497 | cpumask_set_cpu(cpu, mm_cpumask(next)); |
---|
347 | 498 | |
---|
348 | | - return; |
---|
349 | | - } else { |
---|
350 | | - u16 new_asid; |
---|
351 | | - bool need_flush; |
---|
| 499 | + /* |
---|
| 500 | + * If the CPU is not in lazy TLB mode, we are just switching |
---|
| 501 | + * from one thread in a process to another thread in the same |
---|
| 502 | + * process. No TLB flush required. |
---|
| 503 | + */ |
---|
| 504 | + if (!was_lazy) |
---|
| 505 | + return; |
---|
352 | 506 | |
---|
| 507 | + /* |
---|
| 508 | + * Read the tlb_gen to check whether a flush is needed. |
---|
| 509 | + * If the TLB is up to date, just use it. |
---|
| 510 | + * The barrier synchronizes with the tlb_gen increment in |
---|
| 511 | + * the TLB shootdown code. |
---|
| 512 | + */ |
---|
| 513 | + smp_mb(); |
---|
| 514 | + next_tlb_gen = atomic64_read(&next->context.tlb_gen); |
---|
| 515 | + if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == |
---|
| 516 | + next_tlb_gen) |
---|
| 517 | + return; |
---|
| 518 | + |
---|
| 519 | + /* |
---|
| 520 | + * TLB contents went out of date while we were in lazy |
---|
| 521 | + * mode. Fall through to the TLB switching code below. |
---|
| 522 | + */ |
---|
| 523 | + new_asid = prev_asid; |
---|
| 524 | + need_flush = true; |
---|
| 525 | + } else { |
---|
353 | 526 | /* |
---|
354 | 527 | * Avoid user/user BTB poisoning by flushing the branch |
---|
355 | 528 | * predictor when switching between processes. This stops |
---|
356 | 529 | * one process from doing Spectre-v2 attacks on another. |
---|
357 | 530 | */ |
---|
358 | 531 | cond_ibpb(tsk); |
---|
359 | | - |
---|
360 | | - if (IS_ENABLED(CONFIG_VMAP_STACK)) { |
---|
361 | | - /* |
---|
362 | | - * If our current stack is in vmalloc space and isn't |
---|
363 | | - * mapped in the new pgd, we'll double-fault. Forcibly |
---|
364 | | - * map it. |
---|
365 | | - */ |
---|
366 | | - sync_current_stack_to_mm(next); |
---|
367 | | - } |
---|
368 | 532 | |
---|
369 | 533 | /* |
---|
370 | 534 | * Stop remote flushes for the previous mm. |
---|
.. | .. |
---|
389 | 553 | /* Let nmi_uaccess_okay() know that we're changing CR3. */ |
---|
390 | 554 | this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); |
---|
391 | 555 | barrier(); |
---|
392 | | - |
---|
393 | | - if (need_flush) { |
---|
394 | | - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); |
---|
395 | | - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); |
---|
396 | | - load_new_mm_cr3(next->pgd, new_asid, true); |
---|
397 | | - |
---|
398 | | - /* |
---|
399 | | - * NB: This gets called via leave_mm() in the idle path |
---|
400 | | - * where RCU functions differently. Tracing normally |
---|
401 | | - * uses RCU, so we need to use the _rcuidle variant. |
---|
402 | | - * |
---|
403 | | - * (There is no good reason for this. The idle code should |
---|
404 | | - * be rearranged to call this before rcu_idle_enter().) |
---|
405 | | - */ |
---|
406 | | - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); |
---|
407 | | - } else { |
---|
408 | | - /* The new ASID is already up to date. */ |
---|
409 | | - load_new_mm_cr3(next->pgd, new_asid, false); |
---|
410 | | - |
---|
411 | | - /* See above wrt _rcuidle. */ |
---|
412 | | - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); |
---|
413 | | - } |
---|
414 | | - |
---|
415 | | - /* Make sure we write CR3 before loaded_mm. */ |
---|
416 | | - barrier(); |
---|
417 | | - |
---|
418 | | - this_cpu_write(cpu_tlbstate.loaded_mm, next); |
---|
419 | | - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); |
---|
420 | 556 | } |
---|
421 | 557 | |
---|
422 | | - load_mm_cr4(next); |
---|
423 | | - switch_ldt(real_prev, next); |
---|
| 558 | + if (need_flush) { |
---|
| 559 | + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); |
---|
| 560 | + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); |
---|
| 561 | + load_new_mm_cr3(next->pgd, new_asid, true); |
---|
| 562 | + |
---|
| 563 | + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); |
---|
| 564 | + } else { |
---|
| 565 | + /* The new ASID is already up to date. */ |
---|
| 566 | + load_new_mm_cr3(next->pgd, new_asid, false); |
---|
| 567 | + |
---|
| 568 | + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); |
---|
| 569 | + } |
---|
| 570 | + |
---|
| 571 | + /* Make sure we write CR3 before loaded_mm. */ |
---|
| 572 | + barrier(); |
---|
| 573 | + |
---|
| 574 | + this_cpu_write(cpu_tlbstate.loaded_mm, next); |
---|
| 575 | + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); |
---|
| 576 | + |
---|
| 577 | + if (next != real_prev) { |
---|
| 578 | + cr4_update_pce_mm(next); |
---|
| 579 | + switch_ldt(real_prev, next); |
---|
| 580 | + } |
---|
424 | 581 | } |
---|
425 | 582 | |
---|
426 | 583 | /* |
---|
.. | .. |
---|
441 | 598 | if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) |
---|
442 | 599 | return; |
---|
443 | 600 | |
---|
444 | | - if (tlb_defer_switch_to_init_mm()) { |
---|
445 | | - /* |
---|
446 | | - * There's a significant optimization that may be possible |
---|
447 | | - * here. We have accurate enough TLB flush tracking that we |
---|
448 | | - * don't need to maintain coherence of TLB per se when we're |
---|
449 | | - * lazy. We do, however, need to maintain coherence of |
---|
450 | | - * paging-structure caches. We could, in principle, leave our |
---|
451 | | - * old mm loaded and only switch to init_mm when |
---|
452 | | - * tlb_remove_page() happens. |
---|
453 | | - */ |
---|
454 | | - this_cpu_write(cpu_tlbstate.is_lazy, true); |
---|
455 | | - } else { |
---|
456 | | - switch_mm(NULL, &init_mm, NULL); |
---|
457 | | - } |
---|
| 601 | + this_cpu_write(cpu_tlbstate.is_lazy, true); |
---|
458 | 602 | } |
---|
459 | 603 | |
---|
460 | 604 | /* |
---|
.. | .. |
---|
541 | 685 | * paging-structure cache to avoid speculatively reading |
---|
542 | 686 | * garbage into our TLB. Since switching to init_mm is barely |
---|
543 | 687 | * slower than a minimal flush, just switch to init_mm. |
---|
| 688 | + * |
---|
| 689 | + * This should be rare, with native_flush_tlb_others skipping |
---|
| 690 | + * IPIs to lazy TLB mode CPUs. |
---|
544 | 691 | */ |
---|
545 | 692 | switch_mm_irqs_off(NULL, &init_mm, NULL); |
---|
546 | 693 | return; |
---|
.. | .. |
---|
601 | 748 | f->new_tlb_gen == local_tlb_gen + 1 && |
---|
602 | 749 | f->new_tlb_gen == mm_tlb_gen) { |
---|
603 | 750 | /* Partial flush */ |
---|
604 | | - unsigned long addr; |
---|
605 | | - unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; |
---|
| 751 | + unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift; |
---|
| 752 | + unsigned long addr = f->start; |
---|
606 | 753 | |
---|
607 | | - addr = f->start; |
---|
608 | 754 | while (addr < f->end) { |
---|
609 | | - __flush_tlb_one_user(addr); |
---|
610 | | - addr += PAGE_SIZE; |
---|
| 755 | + flush_tlb_one_user(addr); |
---|
| 756 | + addr += 1UL << f->stride_shift; |
---|
611 | 757 | } |
---|
612 | 758 | if (local) |
---|
613 | | - count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); |
---|
614 | | - trace_tlb_flush(reason, nr_pages); |
---|
| 759 | + count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate); |
---|
| 760 | + trace_tlb_flush(reason, nr_invalidate); |
---|
615 | 761 | } else { |
---|
616 | 762 | /* Full flush. */ |
---|
617 | | - local_flush_tlb(); |
---|
| 763 | + flush_tlb_local(); |
---|
618 | 764 | if (local) |
---|
619 | 765 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); |
---|
620 | 766 | trace_tlb_flush(reason, TLB_FLUSH_ALL); |
---|
.. | .. |
---|
624 | 770 | this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); |
---|
625 | 771 | } |
---|
626 | 772 | |
---|
627 | | -static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) |
---|
| 773 | +static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason) |
---|
628 | 774 | { |
---|
629 | 775 | const struct flush_tlb_info *f = info; |
---|
630 | 776 | |
---|
.. | .. |
---|
644 | 790 | flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); |
---|
645 | 791 | } |
---|
646 | 792 | |
---|
647 | | -void native_flush_tlb_others(const struct cpumask *cpumask, |
---|
648 | | - const struct flush_tlb_info *info) |
---|
| 793 | +static bool tlb_is_not_lazy(int cpu, void *data) |
---|
| 794 | +{ |
---|
| 795 | + return !per_cpu(cpu_tlbstate.is_lazy, cpu); |
---|
| 796 | +} |
---|
| 797 | + |
---|
| 798 | +STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask, |
---|
| 799 | + const struct flush_tlb_info *info) |
---|
649 | 800 | { |
---|
650 | 801 | count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); |
---|
651 | 802 | if (info->end == TLB_FLUSH_ALL) |
---|
.. | .. |
---|
654 | 805 | trace_tlb_flush(TLB_REMOTE_SEND_IPI, |
---|
655 | 806 | (info->end - info->start) >> PAGE_SHIFT); |
---|
656 | 807 | |
---|
657 | | - if (is_uv_system()) { |
---|
658 | | - /* |
---|
659 | | - * This whole special case is confused. UV has a "Broadcast |
---|
660 | | - * Assist Unit", which seems to be a fancy way to send IPIs. |
---|
661 | | - * Back when x86 used an explicit TLB flush IPI, UV was |
---|
662 | | - * optimized to use its own mechanism. These days, x86 uses |
---|
663 | | - * smp_call_function_many(), but UV still uses a manual IPI, |
---|
664 | | - * and that IPI's action is out of date -- it does a manual |
---|
665 | | - * flush instead of calling flush_tlb_func_remote(). This |
---|
666 | | - * means that the percpu tlb_gen variables won't be updated |
---|
667 | | - * and we'll do pointless flushes on future context switches. |
---|
668 | | - * |
---|
669 | | - * Rather than hooking native_flush_tlb_others() here, I think |
---|
670 | | - * that UV should be updated so that smp_call_function_many(), |
---|
671 | | - * etc, are optimal on UV. |
---|
672 | | - */ |
---|
673 | | - cpumask = uv_flush_tlb_others(cpumask, info); |
---|
674 | | - if (cpumask) |
---|
675 | | - smp_call_function_many(cpumask, flush_tlb_func_remote, |
---|
676 | | - (void *)info, 1); |
---|
677 | | - return; |
---|
678 | | - } |
---|
679 | | - smp_call_function_many(cpumask, flush_tlb_func_remote, |
---|
| 808 | + /* |
---|
| 809 | + * If no page tables were freed, we can skip sending IPIs to |
---|
| 810 | + * CPUs in lazy TLB mode. They will flush their TLBs themselves |
---|
| 811 | + * at the next context switch. |
---|
| 812 | + * |
---|
| 813 | + * However, if page tables are getting freed, we need to send the |
---|
| 814 | + * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping |
---|
| 815 | + * up on the new contents of what used to be page tables, while |
---|
| 816 | + * doing a speculative memory access. |
---|
| 817 | + */ |
---|
| 818 | + if (info->freed_tables) |
---|
| 819 | + smp_call_function_many(cpumask, flush_tlb_func_remote, |
---|
680 | 820 | (void *)info, 1); |
---|
| 821 | + else |
---|
| 822 | + on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote, |
---|
| 823 | + (void *)info, 1, cpumask); |
---|
| 824 | +} |
---|
| 825 | + |
---|
| 826 | +void flush_tlb_others(const struct cpumask *cpumask, |
---|
| 827 | + const struct flush_tlb_info *info) |
---|
| 828 | +{ |
---|
| 829 | + __flush_tlb_others(cpumask, info); |
---|
681 | 830 | } |
---|
682 | 831 | |
---|
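The IPI-filtering policy implemented by native_flush_tlb_others() can be summarized as: freeing page tables forces an IPI to every CPU in the mask, otherwise CPUs in lazy TLB mode are skipped and catch up at their next context switch. A standalone model of that decision (arrays of booleans stand in for cpumasks and per-CPU state; all names are illustrative):

```c
#include <stdbool.h>
#include <stdio.h>

#define DEMO_NR_CPUS 4

static bool demo_is_lazy[DEMO_NR_CPUS] = { false, true, false, true };

/* Decide, per CPU in @mask, whether a flush IPI must be sent. */
static void demo_flush_others(const bool mask[DEMO_NR_CPUS], bool freed_tables)
{
	for (int cpu = 0; cpu < DEMO_NR_CPUS; cpu++) {
		if (!mask[cpu])
			continue;
		if (freed_tables || !demo_is_lazy[cpu])
			printf("send flush IPI to CPU %d\n", cpu);
		else
			printf("skip lazy CPU %d (flushes at next switch)\n", cpu);
	}
}

int main(void)
{
	bool mask[DEMO_NR_CPUS] = { true, true, true, true };

	puts("no page tables freed:");
	demo_flush_others(mask, false);
	puts("page tables freed:");
	demo_flush_others(mask, true);
	return 0;
}
```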
683 | 832 | /* |
---|
684 | | - * See Documentation/x86/tlb.txt for details. We choose 33 |
---|
| 833 | + * See Documentation/x86/tlb.rst for details. We choose 33 |
---|
685 | 834 | * because it is large enough to cover the vast majority (at |
---|
686 | 835 | * least 95%) of allocations, and is small enough that we are |
---|
687 | 836 | * confident it will not cause too much overhead. Each single |
---|
.. | .. |
---|
690 | 839 | * |
---|
691 | 840 | * This is in units of pages. |
---|
692 | 841 | */ |
---|
693 | | -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; |
---|
| 842 | +unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; |
---|
| 843 | + |
---|
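In other words, the ceiling picks between per-page INVLPG and a full TLB flush. A compact standalone model of the decision that flush_tlb_mm_range() makes further down (demo_* names are made up; 33 mirrors the tunable above):

```c
#include <stdbool.h>
#include <stdio.h>

#define DEMO_FLUSH_ALL (~0UL)
static unsigned long demo_ceiling = 33;   /* in pages, mirrors the tunable above */

/* Return true for a ranged (per-page) flush, false for a full TLB flush. */
static bool demo_use_ranged_flush(unsigned long start, unsigned long end,
				  unsigned int stride_shift)
{
	if (end == DEMO_FLUSH_ALL)
		return false;
	return ((end - start) >> stride_shift) <= demo_ceiling;
}

int main(void)
{
	/* 16 4KiB pages: cheap enough to INVLPG one by one */
	printf("%d\n", demo_use_ranged_flush(0x400000, 0x410000, 12));
	/* 256 4KiB pages: above the ceiling, flush everything */
	printf("%d\n", demo_use_ranged_flush(0x400000, 0x500000, 12));
	return 0;
}
```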
| 844 | +static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info); |
---|
| 845 | + |
---|
| 846 | +#ifdef CONFIG_DEBUG_VM |
---|
| 847 | +static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx); |
---|
| 848 | +#endif |
---|
| 849 | + |
---|
| 850 | +static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, |
---|
| 851 | + unsigned long start, unsigned long end, |
---|
| 852 | + unsigned int stride_shift, bool freed_tables, |
---|
| 853 | + u64 new_tlb_gen) |
---|
| 854 | +{ |
---|
| 855 | + struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info); |
---|
| 856 | + |
---|
| 857 | +#ifdef CONFIG_DEBUG_VM |
---|
| 858 | + /* |
---|
| 859 | + * Ensure that the following code is non-reentrant and flush_tlb_info |
---|
| 860 | + * is not overwritten. This means no TLB flushing is initiated by |
---|
| 861 | + * interrupt handlers and machine-check exception handlers. |
---|
| 862 | + */ |
---|
| 863 | + BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); |
---|
| 864 | +#endif |
---|
| 865 | + |
---|
| 866 | + info->start = start; |
---|
| 867 | + info->end = end; |
---|
| 868 | + info->mm = mm; |
---|
| 869 | + info->stride_shift = stride_shift; |
---|
| 870 | + info->freed_tables = freed_tables; |
---|
| 871 | + info->new_tlb_gen = new_tlb_gen; |
---|
| 872 | + |
---|
| 873 | + return info; |
---|
| 874 | +} |
---|
| 875 | + |
---|
| 876 | +static inline void put_flush_tlb_info(void) |
---|
| 877 | +{ |
---|
| 878 | +#ifdef CONFIG_DEBUG_VM |
---|
| 879 | + /* Complete reentrancy prevention checks */ |
---|
| 880 | + barrier(); |
---|
| 881 | + this_cpu_dec(flush_tlb_info_idx); |
---|
| 882 | +#endif |
---|
| 883 | +} |
---|
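get_flush_tlb_info()/put_flush_tlb_info() implement a "borrow the per-CPU scratch object, assert nobody else is holding it" pattern. A stripped-down single-threaded model of that reentrancy guard (illustrative only; the real code uses per-CPU variables and only checks under CONFIG_DEBUG_VM):

```c
#include <assert.h>
#include <stdio.h>

struct demo_flush_info {
	unsigned long start, end;
};

static struct demo_flush_info demo_scratch;   /* stands in for the per-CPU object */
static unsigned int demo_scratch_idx;         /* reentrancy guard */

static struct demo_flush_info *demo_get_info(unsigned long start, unsigned long end)
{
	demo_scratch_idx++;
	assert(demo_scratch_idx == 1);            /* no nesting allowed */
	demo_scratch.start = start;
	demo_scratch.end = end;
	return &demo_scratch;
}

static void demo_put_info(void)
{
	demo_scratch_idx--;
}

int main(void)
{
	struct demo_flush_info *info = demo_get_info(0x1000, 0x5000);

	printf("flushing [%#lx, %#lx)\n", info->start, info->end);
	demo_put_info();
	return 0;
}
```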
694 | 884 | |
---|
695 | 885 | void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, |
---|
696 | | - unsigned long end, unsigned long vmflag) |
---|
| 886 | + unsigned long end, unsigned int stride_shift, |
---|
| 887 | + bool freed_tables) |
---|
697 | 888 | { |
---|
| 889 | + struct flush_tlb_info *info; |
---|
| 890 | + u64 new_tlb_gen; |
---|
698 | 891 | int cpu; |
---|
699 | | - |
---|
700 | | - struct flush_tlb_info info = { |
---|
701 | | - .mm = mm, |
---|
702 | | - }; |
---|
703 | 892 | |
---|
704 | 893 | cpu = get_cpu(); |
---|
705 | 894 | |
---|
706 | | - /* This is also a barrier that synchronizes with switch_mm(). */ |
---|
707 | | - info.new_tlb_gen = inc_mm_tlb_gen(mm); |
---|
708 | | - |
---|
709 | 895 | /* Should we flush just the requested range? */ |
---|
710 | | - if ((end != TLB_FLUSH_ALL) && |
---|
711 | | - !(vmflag & VM_HUGETLB) && |
---|
712 | | - ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) { |
---|
713 | | - info.start = start; |
---|
714 | | - info.end = end; |
---|
715 | | - } else { |
---|
716 | | - info.start = 0UL; |
---|
717 | | - info.end = TLB_FLUSH_ALL; |
---|
| 896 | + if ((end == TLB_FLUSH_ALL) || |
---|
| 897 | + ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { |
---|
| 898 | + start = 0; |
---|
| 899 | + end = TLB_FLUSH_ALL; |
---|
718 | 900 | } |
---|
719 | 901 | |
---|
| 902 | + /* This is also a barrier that synchronizes with switch_mm(). */ |
---|
| 903 | + new_tlb_gen = inc_mm_tlb_gen(mm); |
---|
| 904 | + |
---|
| 905 | + info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables, |
---|
| 906 | + new_tlb_gen); |
---|
| 907 | + |
---|
720 | 908 | if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { |
---|
721 | | - VM_WARN_ON(irqs_disabled()); |
---|
| 909 | + lockdep_assert_irqs_enabled(); |
---|
722 | 910 | local_irq_disable(); |
---|
723 | | - flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN); |
---|
| 911 | + flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN); |
---|
724 | 912 | local_irq_enable(); |
---|
725 | 913 | } |
---|
726 | 914 | |
---|
727 | 915 | if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) |
---|
728 | | - flush_tlb_others(mm_cpumask(mm), &info); |
---|
| 916 | + flush_tlb_others(mm_cpumask(mm), info); |
---|
729 | 917 | |
---|
| 918 | + put_flush_tlb_info(); |
---|
730 | 919 | put_cpu(); |
---|
731 | 920 | } |
---|
732 | 921 | |
---|
.. | .. |
---|
750 | 939 | |
---|
751 | 940 | /* flush the range one page at a time with 'invlpg' */ |
---|
752 | 941 | for (addr = f->start; addr < f->end; addr += PAGE_SIZE) |
---|
753 | | - __flush_tlb_one_kernel(addr); |
---|
| 942 | + flush_tlb_one_kernel(addr); |
---|
754 | 943 | } |
---|
755 | 944 | |
---|
756 | 945 | void flush_tlb_kernel_range(unsigned long start, unsigned long end) |
---|
757 | 946 | { |
---|
758 | | - |
---|
759 | 947 | /* Balance as user space task's flush, a bit conservative */ |
---|
760 | 948 | if (end == TLB_FLUSH_ALL || |
---|
761 | 949 | (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { |
---|
762 | 950 | on_each_cpu(do_flush_tlb_all, NULL, 1); |
---|
763 | 951 | } else { |
---|
764 | | - struct flush_tlb_info info; |
---|
765 | | - info.start = start; |
---|
766 | | - info.end = end; |
---|
767 | | - on_each_cpu(do_kernel_range_flush, &info, 1); |
---|
| 952 | + struct flush_tlb_info *info; |
---|
| 953 | + |
---|
| 954 | + preempt_disable(); |
---|
| 955 | + info = get_flush_tlb_info(NULL, start, end, 0, false, 0); |
---|
| 956 | + |
---|
| 957 | + on_each_cpu(do_kernel_range_flush, info, 1); |
---|
| 958 | + |
---|
| 959 | + put_flush_tlb_info(); |
---|
| 960 | + preempt_enable(); |
---|
768 | 961 | } |
---|
769 | 962 | } |
---|
770 | 963 | |
---|
| 964 | +/* |
---|
| 965 | + * This can be used from process context to figure out what the value of |
---|
| 966 | + * CR3 is without needing to do a (slow) __read_cr3(). |
---|
| 967 | + * |
---|
| 968 | + * It's intended to be used for code like KVM that sneakily changes CR3 |
---|
| 969 | + * and needs to restore it. It needs to be used very carefully. |
---|
| 970 | + */ |
---|
| 971 | +unsigned long __get_current_cr3_fast(void) |
---|
| 972 | +{ |
---|
| 973 | + unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, |
---|
| 974 | + this_cpu_read(cpu_tlbstate.loaded_mm_asid)); |
---|
| 975 | + |
---|
| 976 | + /* For now, be very restrictive about when this can be called. */ |
---|
| 977 | + VM_WARN_ON(in_nmi() || preemptible()); |
---|
| 978 | + |
---|
| 979 | + VM_BUG_ON(cr3 != __read_cr3()); |
---|
| 980 | + return cr3; |
---|
| 981 | +} |
---|
| 982 | +EXPORT_SYMBOL_GPL(__get_current_cr3_fast); |
---|
| 983 | + |
---|
| 984 | +/* |
---|
| 985 | + * Flush one page in the kernel mapping |
---|
| 986 | + */ |
---|
| 987 | +void flush_tlb_one_kernel(unsigned long addr) |
---|
| 988 | +{ |
---|
| 989 | + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); |
---|
| 990 | + |
---|
| 991 | + /* |
---|
| 992 | + * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its |
---|
| 993 | + * paravirt equivalent. Even with PCID, this is sufficient: we only |
---|
| 994 | + * use PCID if we also use global PTEs for the kernel mapping, and |
---|
| 995 | + * INVLPG flushes global translations across all address spaces. |
---|
| 996 | + * |
---|
| 997 | + * If PTI is on, then the kernel is mapped with non-global PTEs, and |
---|
| 998 | + * __flush_tlb_one_user() will flush the given address for the current |
---|
| 999 | + * kernel address space and for its usermode counterpart, but it does |
---|
| 1000 | + * not flush it for other address spaces. |
---|
| 1001 | + */ |
---|
| 1002 | + flush_tlb_one_user(addr); |
---|
| 1003 | + |
---|
| 1004 | + if (!static_cpu_has(X86_FEATURE_PTI)) |
---|
| 1005 | + return; |
---|
| 1006 | + |
---|
| 1007 | + /* |
---|
| 1008 | + * See above. We need to propagate the flush to all other address |
---|
| 1009 | + * spaces. In principle, we only need to propagate it to kernelmode |
---|
| 1010 | + * address spaces, but the extra bookkeeping we would need is not |
---|
| 1011 | + * worth it. |
---|
| 1012 | + */ |
---|
| 1013 | + this_cpu_write(cpu_tlbstate.invalidate_other, true); |
---|
| 1014 | +} |
---|
| 1015 | + |
---|
| 1016 | +/* |
---|
| 1017 | + * Flush one page in the user mapping |
---|
| 1018 | + */ |
---|
| 1019 | +STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr) |
---|
| 1020 | +{ |
---|
| 1021 | + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); |
---|
| 1022 | + |
---|
| 1023 | + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); |
---|
| 1024 | + |
---|
| 1025 | + if (!static_cpu_has(X86_FEATURE_PTI)) |
---|
| 1026 | + return; |
---|
| 1027 | + |
---|
| 1028 | + /* |
---|
| 1029 | + * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. |
---|
| 1030 | + * Just use invalidate_user_asid() in case we are called early. |
---|
| 1031 | + */ |
---|
| 1032 | + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) |
---|
| 1033 | + invalidate_user_asid(loaded_mm_asid); |
---|
| 1034 | + else |
---|
| 1035 | + invpcid_flush_one(user_pcid(loaded_mm_asid), addr); |
---|
| 1036 | +} |
---|
| 1037 | + |
---|
| 1038 | +void flush_tlb_one_user(unsigned long addr) |
---|
| 1039 | +{ |
---|
| 1040 | + __flush_tlb_one_user(addr); |
---|
| 1041 | +} |
---|
| 1042 | + |
---|
| 1043 | +/* |
---|
| 1044 | + * Flush everything |
---|
| 1045 | + */ |
---|
| 1046 | +STATIC_NOPV void native_flush_tlb_global(void) |
---|
| 1047 | +{ |
---|
| 1048 | + unsigned long cr4, flags; |
---|
| 1049 | + |
---|
| 1050 | + if (static_cpu_has(X86_FEATURE_INVPCID)) { |
---|
| 1051 | + /* |
---|
| 1052 | + * Using INVPCID is considerably faster than a pair of writes |
---|
| 1053 | + * to CR4 sandwiched inside an IRQ flag save/restore. |
---|
| 1054 | + * |
---|
| 1055 | + * Note, this works with CR4.PCIDE=0 or 1. |
---|
| 1056 | + */ |
---|
| 1057 | + invpcid_flush_all(); |
---|
| 1058 | + return; |
---|
| 1059 | + } |
---|
| 1060 | + |
---|
| 1061 | + /* |
---|
| 1062 | + * Read-modify-write to CR4 - protect it from preemption and |
---|
| 1063 | + * from interrupts. (Use the raw variant because this code can |
---|
| 1064 | + * be called from deep inside debugging code.) |
---|
| 1065 | + */ |
---|
| 1066 | + raw_local_irq_save(flags); |
---|
| 1067 | + |
---|
| 1068 | + cr4 = this_cpu_read(cpu_tlbstate.cr4); |
---|
| 1069 | + /* toggle PGE */ |
---|
| 1070 | + native_write_cr4(cr4 ^ X86_CR4_PGE); |
---|
| 1071 | + /* write old PGE again and flush TLBs */ |
---|
| 1072 | + native_write_cr4(cr4); |
---|
| 1073 | + |
---|
| 1074 | + raw_local_irq_restore(flags); |
---|
| 1075 | +} |
---|
| 1076 | + |
---|
| 1077 | +/* |
---|
| 1078 | + * Flush the entire current user mapping |
---|
| 1079 | + */ |
---|
| 1080 | +STATIC_NOPV void native_flush_tlb_local(void) |
---|
| 1081 | +{ |
---|
| 1082 | + /* |
---|
| 1083 | + * Preemption or interrupts must be disabled to protect the access |
---|
| 1084 | + * to the per CPU variable and to prevent being preempted between |
---|
| 1085 | + * read_cr3() and write_cr3(). |
---|
| 1086 | + */ |
---|
| 1087 | + WARN_ON_ONCE(preemptible()); |
---|
| 1088 | + |
---|
| 1089 | + invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); |
---|
| 1090 | + |
---|
| 1091 | + /* If current->mm == NULL then the read_cr3() "borrows" an mm */ |
---|
| 1092 | + native_write_cr3(__native_read_cr3()); |
---|
| 1093 | +} |
---|
| 1094 | + |
---|
| 1095 | +void flush_tlb_local(void) |
---|
| 1096 | +{ |
---|
| 1097 | + __flush_tlb_local(); |
---|
| 1098 | +} |
---|
| 1099 | + |
---|
| 1100 | +/* |
---|
| 1101 | + * Flush everything |
---|
| 1102 | + */ |
---|
| 1103 | +void __flush_tlb_all(void) |
---|
| 1104 | +{ |
---|
| 1105 | + /* |
---|
| 1106 | + * This catches callers with preemption enabled and the PGE feature, |
---|
| 1107 | + * so we don't trigger the warning in native_flush_tlb_local(). |
---|
| 1108 | + */ |
---|
| 1109 | + VM_WARN_ON_ONCE(preemptible()); |
---|
| 1110 | + |
---|
| 1111 | + if (boot_cpu_has(X86_FEATURE_PGE)) { |
---|
| 1112 | + __flush_tlb_global(); |
---|
| 1113 | + } else { |
---|
| 1114 | + /* |
---|
| 1115 | + * !PGE -> !PCID (setup_pcid()), thus every flush is total. |
---|
| 1116 | + */ |
---|
| 1117 | + flush_tlb_local(); |
---|
| 1118 | + } |
---|
| 1119 | +} |
---|
| 1120 | +EXPORT_SYMBOL_GPL(__flush_tlb_all); |
---|
| 1121 | + |
---|
| 1122 | +/* |
---|
| 1123 | + * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm. |
---|
| 1124 | + * This means that the 'struct flush_tlb_info' that describes which mappings to |
---|
| 1125 | + * flush is actually fixed. We therefore set a single fixed struct and use it in |
---|
| 1126 | + * arch_tlbbatch_flush(). |
---|
| 1127 | + */ |
---|
| 1128 | +static const struct flush_tlb_info full_flush_tlb_info = { |
---|
| 1129 | + .mm = NULL, |
---|
| 1130 | + .start = 0, |
---|
| 1131 | + .end = TLB_FLUSH_ALL, |
---|
| 1132 | +}; |
---|
| 1133 | + |
---|
771 | 1134 | void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) |
---|
772 | 1135 | { |
---|
773 | | - struct flush_tlb_info info = { |
---|
774 | | - .mm = NULL, |
---|
775 | | - .start = 0UL, |
---|
776 | | - .end = TLB_FLUSH_ALL, |
---|
777 | | - }; |
---|
778 | | - |
---|
779 | 1136 | int cpu = get_cpu(); |
---|
780 | 1137 | |
---|
781 | 1138 | if (cpumask_test_cpu(cpu, &batch->cpumask)) { |
---|
782 | | - VM_WARN_ON(irqs_disabled()); |
---|
| 1139 | + lockdep_assert_irqs_enabled(); |
---|
783 | 1140 | local_irq_disable(); |
---|
784 | | - flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN); |
---|
| 1141 | + flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN); |
---|
785 | 1142 | local_irq_enable(); |
---|
786 | 1143 | } |
---|
787 | 1144 | |
---|
788 | 1145 | if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) |
---|
789 | | - flush_tlb_others(&batch->cpumask, &info); |
---|
| 1146 | + flush_tlb_others(&batch->cpumask, &full_flush_tlb_info); |
---|
790 | 1147 | |
---|
791 | 1148 | cpumask_clear(&batch->cpumask); |
---|
792 | 1149 | |
---|
793 | 1150 | put_cpu(); |
---|
794 | 1151 | } |
---|
795 | 1152 | |
---|
| 1153 | +/* |
---|
| 1154 | + * Blindly accessing user memory from NMI context can be dangerous |
---|
| 1155 | + * if we're in the middle of switching the current user task or |
---|
| 1156 | + * switching the loaded mm. It can also be dangerous if we |
---|
| 1157 | + * interrupted some kernel code that was temporarily using a |
---|
| 1158 | + * different mm. |
---|
| 1159 | + */ |
---|
| 1160 | +bool nmi_uaccess_okay(void) |
---|
| 1161 | +{ |
---|
| 1162 | + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); |
---|
| 1163 | + struct mm_struct *current_mm = current->mm; |
---|
| 1164 | + |
---|
| 1165 | + VM_WARN_ON_ONCE(!loaded_mm); |
---|
| 1166 | + |
---|
| 1167 | + /* |
---|
| 1168 | + * The condition we want to check is |
---|
| 1169 | + * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, |
---|
| 1170 | + * if we're running in a VM with shadow paging, and nmi_uaccess_okay() |
---|
| 1171 | + * is supposed to be reasonably fast. |
---|
| 1172 | + * |
---|
| 1173 | + * Instead, we check the almost equivalent but somewhat conservative |
---|
| 1174 | + * condition below, and we rely on the fact that switch_mm_irqs_off() |
---|
| 1175 | + * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3. |
---|
| 1176 | + */ |
---|
| 1177 | + if (loaded_mm != current_mm) |
---|
| 1178 | + return false; |
---|
| 1179 | + |
---|
| 1180 | + VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); |
---|
| 1181 | + |
---|
| 1182 | + return true; |
---|
| 1183 | +} |
---|
| 1184 | + |
---|
796 | 1185 | static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, |
---|
797 | 1186 | size_t count, loff_t *ppos) |
---|
798 | 1187 | { |
---|