forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/arch/x86/mm/tlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 #include <linux/init.h>
 
 #include <linux/mm.h>
@@ -13,7 +14,18 @@
 #include <asm/nospec-branch.h>
 #include <asm/cache.h>
 #include <asm/apic.h>
-#include <asm/uv/uv.h>
+
+#include "mm_internal.h"
+
+#ifdef CONFIG_PARAVIRT
+# define STATIC_NOPV
+#else
+# define STATIC_NOPV			static
+# define __flush_tlb_local		native_flush_tlb_local
+# define __flush_tlb_global		native_flush_tlb_global
+# define __flush_tlb_one_user(addr)	native_flush_tlb_one_user(addr)
+# define __flush_tlb_others(msk, info)	native_flush_tlb_others(msk, info)
+#endif
 
 /*
  * TLB flushing, formerly SMP-only
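The STATIC_NOPV block added above is the usual paravirt indirection: with CONFIG_PARAVIRT the __flush_tlb_*() names are supplied by the paravirt layer and the native_*() implementations must stay global so paravirt can point at them, while without CONFIG_PARAVIRT the macros bind the __flush_tlb_*() names directly to the native functions, which can then be made static. A minimal standalone sketch of the same idea follows; do_op/native_do_op are made-up names, not from this patch.

	/* Illustrative sketch of the STATIC_NOPV pattern (hypothetical names). */
	#ifdef CONFIG_PARAVIRT
	# define STATIC_NOPV			/* keep native_do_op global for the pv layer */
	#else
	# define STATIC_NOPV	static		/* no paravirt layer: keep it file-local */
	# define __do_op	native_do_op	/* callers bind straight to the native copy */
	#endif

	STATIC_NOPV void native_do_op(void)
	{
		/* the hardware-level implementation goes here */
	}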
@@ -34,6 +46,126 @@
  * stored in cpu_tlb_state.last_user_mm_ibpb.
  */
 #define LAST_USER_MM_IBPB	0x1UL
+
+/*
+ * The x86 feature is called PCID (Process Context IDentifier). It is similar
+ * to what is traditionally called ASID on the RISC processors.
+ *
+ * We don't use the traditional ASID implementation, where each process/mm gets
+ * its own ASID and flush/restart when we run out of ASID space.
+ *
+ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
+ * that came by on this CPU, allowing cheaper switch_mm between processes on
+ * this CPU.
+ *
+ * We end up with different spaces for different things. To avoid confusion we
+ * use different names for each of them:
+ *
+ * ASID  - [0, TLB_NR_DYN_ASIDS-1]
+ *         the canonical identifier for an mm
+ *
+ * kPCID - [1, TLB_NR_DYN_ASIDS]
+ *         the value we write into the PCID part of CR3; corresponds to the
+ *         ASID+1, because PCID 0 is special.
+ *
+ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ *         for KPTI each mm has two address spaces and thus needs two
+ *         PCID values, but we can still do with a single ASID denomination
+ *         for each mm. Corresponds to kPCID + 2048.
+ *
+ */
+
+/* There are 12 bits of space for ASIDS in CR3 */
+#define CR3_HW_ASID_BITS	12
+
+/*
+ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * user/kernel switches
+ */
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS	1
+#else
+# define PTI_CONSUMED_PCID_BITS	0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
+
+/*
+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
+ * for them being zero-based. Another -1 is because PCID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+
+/*
+ * Given @asid, compute kPCID
+ */
+static inline u16 kern_pcid(u16 asid)
+{
+	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	/*
+	 * Make sure that the dynamic ASID space does not confict with the
+	 * bit we are using to switch between user and kernel ASIDs.
+	 */
+	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
+
+	/*
+	 * The ASID being passed in here should have respected the
+	 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+	 */
+	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
+#endif
+	/*
+	 * The dynamically-assigned ASIDs that get passed in are small
+	 * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
+	 * so do not bother to clear it.
+	 *
+	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
+	 * PCID bits. This serves two purposes. It prevents a nasty
+	 * situation in which PCID-unaware code saves CR3, loads some other
+	 * value (with PCID == 0), and then restores CR3, thus corrupting
+	 * the TLB for ASID 0 if the saved ASID was nonzero. It also means
+	 * that any bugs involving loading a PCID-enabled CR3 with
+	 * CR4.PCIDE off will trigger deterministically.
+	 */
+	return asid + 1;
+}
+
+/*
+ * Given @asid, compute uPCID
+ */
+static inline u16 user_pcid(u16 asid)
+{
+	u16 ret = kern_pcid(asid);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
+#endif
+	return ret;
+}
+
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+{
+	if (static_cpu_has(X86_FEATURE_PCID)) {
+		return __sme_pa(pgd) | kern_pcid(asid);
+	} else {
+		VM_WARN_ON_ONCE(asid != 0);
+		return __sme_pa(pgd);
+	}
+}
+
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+{
+	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+	/*
+	 * Use boot_cpu_has() instead of this_cpu_has() as this function
+	 * might be called during early boot. This should work even after
+	 * boot because all CPU's the have same capabilities:
+	 */
+	VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
+	return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+}
 
 /*
  * We get here when we do something requiring a TLB invalidation
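As a concrete reading of the ASID/kPCID/uPCID comment and helpers above: kern_pcid() is simply ASID+1, and user_pcid() additionally sets the PTI user bit, which corresponds to the documented "+ 2048". The small standalone check below mirrors that arithmetic; it assumes X86_CR3_PTI_PCID_USER_BIT is bit 11 (hence 2048), which is not spelled out in this hunk.

	/* Illustrative check of the ASID -> kPCID / uPCID mapping (userspace C). */
	#include <assert.h>
	#include <stdint.h>

	#define PTI_PCID_USER_BIT 11	/* assumed value of X86_CR3_PTI_PCID_USER_BIT */

	static uint16_t kern_pcid_example(uint16_t asid) { return asid + 1; }

	static uint16_t user_pcid_example(uint16_t asid)
	{
		return kern_pcid_example(asid) | (1u << PTI_PCID_USER_BIT);
	}

	int main(void)
	{
		assert(kern_pcid_example(0) == 1);		/* ASID 0 -> kPCID 1; PCID 0 stays reserved */
		assert(kern_pcid_example(3) == 4);		/* ASID 3 -> kPCID 4 */
		assert(user_pcid_example(3) == 2048 + 4);	/* uPCID = kPCID + 2048 */
		return 0;
	}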
@@ -107,6 +239,32 @@
 	*need_flush = true;
 }
 
+/*
+ * Given an ASID, flush the corresponding user ASID. We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
+{
+	/* There is no user ASID if address space separation is off */
+	if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		return;
+
+	/*
+	 * We only have a single ASID if PCID is off and the CR3
+	 * write will have flushed it.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_PCID))
+		return;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	__set_bit(kern_pcid(asid),
+		  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+}
+
 static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
 {
 	unsigned long new_mm_cr3;
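Note that invalidate_user_asid() above performs no TLB work at the call site: it only records the user PCID as stale in the per-CPU user_pcid_flush_mask, and the actual flush is deferred to the next kernel-to-user CR3 switch (SWITCH_TO_USER_CR3). A minimal sketch of that defer-then-flush idea, using a plain bitmask and invented names instead of the per-CPU machinery:

	/* Sketch of deferred ASID invalidation: mark now, flush on the next switch. */
	#include <stdbool.h>
	#include <stdint.h>

	static uint32_t stale_user_asids;	/* stand-in for user_pcid_flush_mask */

	static void mark_user_asid_stale(unsigned int asid)
	{
		stale_user_asids |= 1u << asid;		/* cheap: no TLB work here */
	}

	static bool switch_to_user_needs_flush(unsigned int asid)
	{
		bool need_flush = stale_user_asids & (1u << asid);

		stale_user_asids &= ~(1u << asid);	/* consume the pending flush */
		return need_flush;	/* real code would pick a flushing CR3 value here */
	}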
@@ -156,34 +314,6 @@
 	local_irq_save(flags);
 	switch_mm_irqs_off(prev, next, tsk);
 	local_irq_restore(flags);
-}
-
-static void sync_current_stack_to_mm(struct mm_struct *mm)
-{
-	unsigned long sp = current_stack_pointer;
-	pgd_t *pgd = pgd_offset(mm, sp);
-
-	if (pgtable_l5_enabled()) {
-		if (unlikely(pgd_none(*pgd))) {
-			pgd_t *pgd_ref = pgd_offset_k(sp);
-
-			set_pgd(pgd, *pgd_ref);
-		}
-	} else {
-		/*
-		 * "pgd" is faked. The top level entries are "p4d"s, so sync
-		 * the p4d. This compiles to approximately the same code as
-		 * the 5-level case.
-		 */
-		p4d_t *p4d = p4d_offset(pgd, sp);
-
-		if (unlikely(p4d_none(*p4d))) {
-			pgd_t *pgd_ref = pgd_offset_k(sp);
-			p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);
-
-			set_p4d(p4d, *p4d_ref);
-		}
-	}
 }
 
 static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
@@ -269,13 +399,36 @@
 	}
 }
 
+#ifdef CONFIG_PERF_EVENTS
+static inline void cr4_update_pce_mm(struct mm_struct *mm)
+{
+	if (static_branch_unlikely(&rdpmc_always_available_key) ||
+	    (!static_branch_unlikely(&rdpmc_never_available_key) &&
+	     atomic_read(&mm->context.perf_rdpmc_allowed)))
+		cr4_set_bits_irqsoff(X86_CR4_PCE);
+	else
+		cr4_clear_bits_irqsoff(X86_CR4_PCE);
+}
+
+void cr4_update_pce(void *ignored)
+{
+	cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
+}
+
+#else
+static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
+#endif
+
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			struct task_struct *tsk)
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
+	bool need_flush;
+	u16 new_asid;
 
 	/*
 	 * NB: The scheduler will call us with prev == next when switching
@@ -335,36 +488,47 @@
 			   next->context.ctx_id);
 
 		/*
-		 * We don't currently support having a real mm loaded without
-		 * our cpu set in mm_cpumask(). We have all the bookkeeping
-		 * in place to figure out whether we would need to flush
-		 * if our cpu were cleared in mm_cpumask(), but we don't
-		 * currently use it.
+		 * Even in lazy TLB mode, the CPU should stay set in the
+		 * mm_cpumask. The TLB shootdown code can figure out from
+		 * from cpu_tlbstate.is_lazy whether or not to send an IPI.
 		 */
 		if (WARN_ON_ONCE(real_prev != &init_mm &&
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		return;
-	} else {
-		u16 new_asid;
-		bool need_flush;
+		/*
+		 * If the CPU is not in lazy TLB mode, we are just switching
+		 * from one thread in a process to another thread in the same
+		 * process. No TLB flush required.
+		 */
+		if (!was_lazy)
+			return;
 
+		/*
+		 * Read the tlb_gen to check whether a flush is needed.
+		 * If the TLB is up to date, just use it.
+		 * The barrier synchronizes with the tlb_gen increment in
+		 * the TLB shootdown code.
+		 */
+		smp_mb();
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
				next_tlb_gen)
+			return;
+
+		/*
+		 * TLB contents went out of date while we were in lazy
+		 * mode. Fall through to the TLB switching code below.
+		 */
+		new_asid = prev_asid;
+		need_flush = true;
+	} else {
 		/*
 		 * Avoid user/user BTB poisoning by flushing the branch
 		 * predictor when switching between processes. This stops
 		 * one process from doing Spectre-v2 attacks on another.
 		 */
 		cond_ibpb(tsk);
-
-		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
-			/*
-			 * If our current stack is in vmalloc space and isn't
-			 * mapped in the new pgd, we'll double-fault. Forcibly
-			 * map it.
-			 */
-			sync_current_stack_to_mm(next);
-		}
 
 		/*
 		 * Stop remote flushes for the previous mm.
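The lazy-TLB path added above reduces to a generation check: flush requests bump the mm's tlb_gen, each CPU remembers the tlb_gen its cached translations correspond to, and a CPU leaving lazy mode for the same mm only flushes when the two differ. A compact sketch of that comparison, with hypothetical types and none of the memory-ordering details (the real code needs the smp_mb() shown above):

	/* Sketch of the tlb_gen comparison deciding whether a lazy CPU must flush. */
	#include <stdbool.h>
	#include <stdint.h>

	struct example_mm  { uint64_t tlb_gen; };	/* bumped on every flush request */
	struct example_cpu { uint64_t seen_tlb_gen; };	/* what this CPU's TLB reflects */

	static bool lazy_wakeup_needs_flush(const struct example_cpu *cpu,
					    const struct example_mm *mm)
	{
		/* equal generations: nothing changed while we were lazy, keep the TLB */
		return cpu->seen_tlb_gen != mm->tlb_gen;
	}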
@@ -389,38 +553,31 @@
 		/* Let nmi_uaccess_okay() know that we're changing CR3. */
 		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
 		barrier();
-
-		if (need_flush) {
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			load_new_mm_cr3(next->pgd, new_asid, true);
-
-			/*
-			 * NB: This gets called via leave_mm() in the idle path
-			 * where RCU functions differently. Tracing normally
-			 * uses RCU, so we need to use the _rcuidle variant.
-			 *
-			 * (There is no good reason for this. The idle code should
-			 * be rearranged to call this before rcu_idle_enter().)
-			 */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-		} else {
-			/* The new ASID is already up to date. */
-			load_new_mm_cr3(next->pgd, new_asid, false);
-
-			/* See above wrt _rcuidle. */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
-		}
-
-		/* Make sure we write CR3 before loaded_mm. */
-		barrier();
-
-		this_cpu_write(cpu_tlbstate.loaded_mm, next);
-		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 	}
 
-	load_mm_cr4(next);
-	switch_ldt(real_prev, next);
+	if (need_flush) {
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+		load_new_mm_cr3(next->pgd, new_asid, true);
+
+		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+	} else {
+		/* The new ASID is already up to date. */
+		load_new_mm_cr3(next->pgd, new_asid, false);
+
+		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+	}
+
+	/* Make sure we write CR3 before loaded_mm. */
+	barrier();
+
+	this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
+	if (next != real_prev) {
+		cr4_update_pce_mm(next);
+		switch_ldt(real_prev, next);
+	}
 }
 
 /*
@@ -441,20 +598,7 @@
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
 		return;
 
-	if (tlb_defer_switch_to_init_mm()) {
-		/*
-		 * There's a significant optimization that may be possible
-		 * here. We have accurate enough TLB flush tracking that we
-		 * don't need to maintain coherence of TLB per se when we're
-		 * lazy. We do, however, need to maintain coherence of
-		 * paging-structure caches. We could, in principle, leave our
-		 * old mm loaded and only switch to init_mm when
-		 * tlb_remove_page() happens.
-		 */
-		this_cpu_write(cpu_tlbstate.is_lazy, true);
-	} else {
-		switch_mm(NULL, &init_mm, NULL);
-	}
+	this_cpu_write(cpu_tlbstate.is_lazy, true);
 }
 
 /*
@@ -541,6 +685,9 @@
 		 * paging-structure cache to avoid speculatively reading
 		 * garbage into our TLB. Since switching to init_mm is barely
 		 * slower than a minimal flush, just switch to init_mm.
+		 *
+		 * This should be rare, with native_flush_tlb_others skipping
+		 * IPIs to lazy TLB mode CPUs.
 		 */
 		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
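The added comment refers to the IPI policy introduced later in this patch for native_flush_tlb_others(): when no page tables were freed, remote CPUs in lazy TLB mode are skipped (they catch up through the tlb_gen check on their next switch), but when page tables were freed every CPU in the mask must be interrupted so nothing keeps walking freed tables speculatively. A sketch of that decision with invented names:

	/* Sketch of the "skip lazy CPUs unless page tables were freed" policy. */
	#include <stdbool.h>

	struct flush_request {
		bool freed_tables;	/* page-table pages are being released */
	};

	static bool must_send_ipi(const struct flush_request *req, bool cpu_is_lazy)
	{
		if (req->freed_tables)
			return true;	/* lazy or not, it must stop using the old tables now */

		return !cpu_is_lazy;	/* lazy CPUs defer the flush to their next switch */
	}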
@@ -601,20 +748,19 @@
 	    f->new_tlb_gen == local_tlb_gen + 1 &&
 	    f->new_tlb_gen == mm_tlb_gen) {
 		/* Partial flush */
-		unsigned long addr;
-		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+		unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
+		unsigned long addr = f->start;
 
-		addr = f->start;
 		while (addr < f->end) {
-			__flush_tlb_one_user(addr);
-			addr += PAGE_SIZE;
+			flush_tlb_one_user(addr);
+			addr += 1UL << f->stride_shift;
 		}
 		if (local)
-			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
-		trace_tlb_flush(reason, nr_pages);
+			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
+		trace_tlb_flush(reason, nr_invalidate);
 	} else {
 		/* Full flush. */
-		local_flush_tlb();
+		flush_tlb_local();
 		if (local)
 			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 		trace_tlb_flush(reason, TLB_FLUSH_ALL);
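With the change above, the partial-flush loop steps by 1UL << f->stride_shift instead of a fixed PAGE_SIZE, so a range backed by 2 MB pages (stride_shift of 21, assuming 4 KB base pages) takes one invalidation per huge page rather than 512. A quick arithmetic check of that effect:

	/* Invalidation counts for a 4 MB range at two stride shifts (userspace C). */
	#include <stdio.h>

	int main(void)
	{
		unsigned long start = 0, end = 4UL << 20;	/* a 4 MB range */

		printf("%lu\n", (end - start) >> 12);	/* 4 KB stride: 1024 invalidations */
		printf("%lu\n", (end - start) >> 21);	/* 2 MB stride: 2 invalidations */
		return 0;
	}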
@@ -624,7 +770,7 @@
 	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
 }
 
-static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
+static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
 {
 	const struct flush_tlb_info *f = info;
 
@@ -644,8 +790,13 @@
 	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
 }
 
-void native_flush_tlb_others(const struct cpumask *cpumask,
-			     const struct flush_tlb_info *info)
+static bool tlb_is_not_lazy(int cpu, void *data)
+{
+	return !per_cpu(cpu_tlbstate.is_lazy, cpu);
+}
+
+STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
+					 const struct flush_tlb_info *info)
 {
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (info->end == TLB_FLUSH_ALL)
@@ -654,34 +805,32 @@
 		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
 				(info->end - info->start) >> PAGE_SHIFT);
 
-	if (is_uv_system()) {
-		/*
-		 * This whole special case is confused. UV has a "Broadcast
-		 * Assist Unit", which seems to be a fancy way to send IPIs.
-		 * Back when x86 used an explicit TLB flush IPI, UV was
-		 * optimized to use its own mechanism. These days, x86 uses
-		 * smp_call_function_many(), but UV still uses a manual IPI,
-		 * and that IPI's action is out of date -- it does a manual
-		 * flush instead of calling flush_tlb_func_remote(). This
-		 * means that the percpu tlb_gen variables won't be updated
-		 * and we'll do pointless flushes on future context switches.
-		 *
-		 * Rather than hooking native_flush_tlb_others() here, I think
-		 * that UV should be updated so that smp_call_function_many(),
-		 * etc, are optimal on UV.
-		 */
-		cpumask = uv_flush_tlb_others(cpumask, info);
-		if (cpumask)
-			smp_call_function_many(cpumask, flush_tlb_func_remote,
-					       (void *)info, 1);
-		return;
-	}
-	smp_call_function_many(cpumask, flush_tlb_func_remote,
+	/*
+	 * If no page tables were freed, we can skip sending IPIs to
+	 * CPUs in lazy TLB mode. They will flush the CPU themselves
+	 * at the next context switch.
+	 *
+	 * However, if page tables are getting freed, we need to send the
+	 * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
+	 * up on the new contents of what used to be page tables, while
+	 * doing a speculative memory access.
+	 */
+	if (info->freed_tables)
+		smp_call_function_many(cpumask, flush_tlb_func_remote,
			       (void *)info, 1);
+	else
+		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
+				      (void *)info, 1, cpumask);
+}
+
+void flush_tlb_others(const struct cpumask *cpumask,
+		      const struct flush_tlb_info *info)
+{
+	__flush_tlb_others(cpumask, info);
 }
 
 /*
- * See Documentation/x86/tlb.txt for details. We choose 33
+ * See Documentation/x86/tlb.rst for details. We choose 33
  * because it is large enough to cover the vast majority (at
  * least 95%) of allocations, and is small enough that we are
  * confident it will not cause too much overhead. Each single
@@ -690,43 +839,83 @@
  *
  * This is in units of pages.
  */
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
+
+#ifdef CONFIG_DEBUG_VM
+static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
+#endif
+
+static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
+			unsigned long start, unsigned long end,
+			unsigned int stride_shift, bool freed_tables,
+			u64 new_tlb_gen)
+{
+	struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
+
+#ifdef CONFIG_DEBUG_VM
+	/*
+	 * Ensure that the following code is non-reentrant and flush_tlb_info
+	 * is not overwritten. This means no TLB flushing is initiated by
+	 * interrupt handlers and machine-check exception handlers.
+	 */
+	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
+#endif
+
+	info->start		= start;
+	info->end		= end;
+	info->mm		= mm;
+	info->stride_shift	= stride_shift;
+	info->freed_tables	= freed_tables;
+	info->new_tlb_gen	= new_tlb_gen;
+
+	return info;
+}
+
+static inline void put_flush_tlb_info(void)
+{
+#ifdef CONFIG_DEBUG_VM
+	/* Complete reentrency prevention checks */
+	barrier();
+	this_cpu_dec(flush_tlb_info_idx);
+#endif
+}
 
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
-				unsigned long end, unsigned long vmflag)
+				unsigned long end, unsigned int stride_shift,
+				bool freed_tables)
 {
+	struct flush_tlb_info *info;
+	u64 new_tlb_gen;
 	int cpu;
-
-	struct flush_tlb_info info = {
-		.mm = mm,
-	};
 
 	cpu = get_cpu();
 
-	/* This is also a barrier that synchronizes with switch_mm(). */
-	info.new_tlb_gen = inc_mm_tlb_gen(mm);
-
 	/* Should we flush just the requested range? */
-	if ((end != TLB_FLUSH_ALL) &&
-	    !(vmflag & VM_HUGETLB) &&
-	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
-		info.start = start;
-		info.end = end;
-	} else {
-		info.start = 0UL;
-		info.end = TLB_FLUSH_ALL;
+	if ((end == TLB_FLUSH_ALL) ||
+	    ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
+		start = 0;
+		end = TLB_FLUSH_ALL;
 	}
 
+	/* This is also a barrier that synchronizes with switch_mm(). */
+	new_tlb_gen = inc_mm_tlb_gen(mm);
+
+	info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
+				  new_tlb_gen);
+
 	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
-		VM_WARN_ON(irqs_disabled());
+		lockdep_assert_irqs_enabled();
 		local_irq_disable();
-		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
+		flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
 		local_irq_enable();
 	}
 
 	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
-		flush_tlb_others(mm_cpumask(mm), &info);
+		flush_tlb_others(mm_cpumask(mm), info);
 
+	put_flush_tlb_info();
 	put_cpu();
 }
 
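After this hunk, flush_tlb_mm_range() keeps a ranged flush only when the number of stride-sized entries is at or below tlb_single_page_flush_ceiling (33); a larger range, or TLB_FLUSH_ALL, is widened to a full flush before the request is packed into the per-CPU flush_tlb_info. A small sketch of that decision with stand-in names:

	/* Sketch of the ranged-vs-full decision keyed off the 33-entry ceiling. */
	#include <stdbool.h>

	#define EXAMPLE_FLUSH_ALL	(~0UL)
	#define EXAMPLE_CEILING		33UL	/* tlb_single_page_flush_ceiling */

	static bool use_full_flush(unsigned long start, unsigned long end,
				   unsigned int stride_shift)
	{
		if (end == EXAMPLE_FLUSH_ALL)
			return true;

		/* e.g. 40 entries at a 4 KB stride (shift 12) exceeds 33 -> full flush */
		return ((end - start) >> stride_shift) > EXAMPLE_CEILING;
	}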
@@ -750,49 +939,249 @@
 
 	/* flush range by one by one 'invlpg' */
 	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
-		__flush_tlb_one_kernel(addr);
+		flush_tlb_one_kernel(addr);
 }
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-
 	/* Balance as user space task's flush, a bit conservative */
 	if (end == TLB_FLUSH_ALL ||
 	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
 		on_each_cpu(do_flush_tlb_all, NULL, 1);
 	} else {
-		struct flush_tlb_info info;
-		info.start = start;
-		info.end = end;
-		on_each_cpu(do_kernel_range_flush, &info, 1);
+		struct flush_tlb_info *info;
+
+		preempt_disable();
+		info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
+
+		on_each_cpu(do_kernel_range_flush, info, 1);
+
+		put_flush_tlb_info();
+		preempt_enable();
 	}
 }
 
+/*
+ * This can be used from process context to figure out what the value of
+ * CR3 is without needing to do a (slow) __read_cr3().
+ *
+ * It's intended to be used for code like KVM that sneakily changes CR3
+ * and needs to restore it. It needs to be used very carefully.
+ */
+unsigned long __get_current_cr3_fast(void)
+{
+	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+		this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+	/* For now, be very restrictive about when this can be called. */
+	VM_WARN_ON(in_nmi() || preemptible());
+
+	VM_BUG_ON(cr3 != __read_cr3());
+	return cr3;
+}
+EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
+
+/*
+ * Flush one page in the kernel mapping
+ */
+void flush_tlb_one_kernel(unsigned long addr)
+{
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+
+	/*
+	 * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
+	 * paravirt equivalent. Even with PCID, this is sufficient: we only
+	 * use PCID if we also use global PTEs for the kernel mapping, and
+	 * INVLPG flushes global translations across all address spaces.
+	 *
+	 * If PTI is on, then the kernel is mapped with non-global PTEs, and
+	 * __flush_tlb_one_user() will flush the given address for the current
+	 * kernel address space and for its usermode counterpart, but it does
+	 * not flush it for other address spaces.
+	 */
+	flush_tlb_one_user(addr);
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	/*
+	 * See above. We need to propagate the flush to all other address
+	 * spaces. In principle, we only need to propagate it to kernelmode
+	 * address spaces, but the extra bookkeeping we would need is not
+	 * worth it.
+	 */
+	this_cpu_write(cpu_tlbstate.invalidate_other, true);
+}
+
+/*
+ * Flush one page in the user mapping
+ */
+STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
+{
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
+	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	/*
+	 * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
+	 * Just use invalidate_user_asid() in case we are called early.
+	 */
+	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
+		invalidate_user_asid(loaded_mm_asid);
+	else
+		invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
+}
+
+void flush_tlb_one_user(unsigned long addr)
+{
+	__flush_tlb_one_user(addr);
+}
+
+/*
+ * Flush everything
+ */
+STATIC_NOPV void native_flush_tlb_global(void)
+{
+	unsigned long cr4, flags;
+
+	if (static_cpu_has(X86_FEATURE_INVPCID)) {
+		/*
+		 * Using INVPCID is considerably faster than a pair of writes
+		 * to CR4 sandwiched inside an IRQ flag save/restore.
+		 *
+		 * Note, this works with CR4.PCIDE=0 or 1.
+		 */
+		invpcid_flush_all();
+		return;
+	}
+
+	/*
+	 * Read-modify-write to CR4 - protect it from preemption and
+	 * from interrupts. (Use the raw variant because this code can
+	 * be called from deep inside debugging code.)
+	 */
+	raw_local_irq_save(flags);
+
+	cr4 = this_cpu_read(cpu_tlbstate.cr4);
+	/* toggle PGE */
+	native_write_cr4(cr4 ^ X86_CR4_PGE);
+	/* write old PGE again and flush TLBs */
+	native_write_cr4(cr4);
+
+	raw_local_irq_restore(flags);
+}
+
+/*
+ * Flush the entire current user mapping
+ */
+STATIC_NOPV void native_flush_tlb_local(void)
+{
+	/*
+	 * Preemption or interrupts must be disabled to protect the access
+	 * to the per CPU variable and to prevent being preempted between
+	 * read_cr3() and write_cr3().
+	 */
+	WARN_ON_ONCE(preemptible());
+
+	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+	/* If current->mm == NULL then the read_cr3() "borrows" an mm */
+	native_write_cr3(__native_read_cr3());
+}
+
+void flush_tlb_local(void)
+{
+	__flush_tlb_local();
+}
+
+/*
+ * Flush everything
+ */
+void __flush_tlb_all(void)
+{
+	/*
+	 * This is to catch users with enabled preemption and the PGE feature
+	 * and don't trigger the warning in __native_flush_tlb().
+	 */
+	VM_WARN_ON_ONCE(preemptible());
+
+	if (boot_cpu_has(X86_FEATURE_PGE)) {
+		__flush_tlb_global();
+	} else {
+		/*
+		 * !PGE -> !PCID (setup_pcid()), thus every flush is total.
+		 */
+		flush_tlb_local();
+	}
+}
+EXPORT_SYMBOL_GPL(__flush_tlb_all);
+
+/*
+ * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
+ * This means that the 'struct flush_tlb_info' that describes which mappings to
+ * flush is actually fixed. We therefore set a single fixed struct and use it in
+ * arch_tlbbatch_flush().
+ */
+static const struct flush_tlb_info full_flush_tlb_info = {
+	.mm = NULL,
+	.start = 0,
+	.end = TLB_FLUSH_ALL,
+};
+
 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
-	struct flush_tlb_info info = {
-		.mm = NULL,
-		.start = 0UL,
-		.end = TLB_FLUSH_ALL,
-	};
-
 	int cpu = get_cpu();
 
 	if (cpumask_test_cpu(cpu, &batch->cpumask)) {
-		VM_WARN_ON(irqs_disabled());
+		lockdep_assert_irqs_enabled();
 		local_irq_disable();
-		flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
+		flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN);
 		local_irq_enable();
 	}
 
 	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
-		flush_tlb_others(&batch->cpumask, &info);
+		flush_tlb_others(&batch->cpumask, &full_flush_tlb_info);
 
 	cpumask_clear(&batch->cpumask);
 
 	put_cpu();
 }
 
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm. It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+bool nmi_uaccess_okay(void)
+{
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	struct mm_struct *current_mm = current->mm;
+
+	VM_WARN_ON_ONCE(!loaded_mm);
+
+	/*
+	 * The condition we want to check is
+	 * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
+	 * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+	 * is supposed to be reasonably fast.
+	 *
+	 * Instead, we check the almost equivalent but somewhat conservative
+	 * condition below, and we rely on the fact that switch_mm_irqs_off()
+	 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+	 */
+	if (loaded_mm != current_mm)
+		return false;
+
+	VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+
+	return true;
+}
+
 static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
 				  size_t count, loff_t *ppos)
 {
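One more note on the full-flush helpers added in the final hunk: native_flush_tlb_global() prefers INVPCID when the CPU has it and otherwise flushes by toggling CR4.PGE twice with interrupts off, while __flush_tlb_all() only takes the global path when PGE exists at all (no PGE implies no PCID, so a plain CR3 reload is already a total flush). A compact sketch of that selection, with stand-in feature flags:

	/* Sketch of how a full flush picks its implementation. */
	#include <stdbool.h>

	enum flush_kind { FLUSH_INVPCID_ALL, FLUSH_TOGGLE_CR4_PGE, FLUSH_RELOAD_CR3 };

	static enum flush_kind pick_full_flush(bool has_pge, bool has_invpcid)
	{
		if (!has_pge)
			return FLUSH_RELOAD_CR3;	/* !PGE -> !PCID: a CR3 write flushes everything */
		if (has_invpcid)
			return FLUSH_INVPCID_ALL;	/* cheapest way to drop global entries too */
		return FLUSH_TOGGLE_CR4_PGE;		/* two CR4 writes inside irq save/restore */
	}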