2024-05-11 04dd17822334871b23ea2862f7798fb0e0007777
kernel/mm/memory.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/mm/memory.c
  *
@@ -64,11 +65,15 @@
 #include <linux/gfp.h>
 #include <linux/migrate.h>
 #include <linux/string.h>
-#include <linux/dma-debug.h>
 #include <linux/debugfs.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/dax.h>
 #include <linux/oom.h>
+#include <linux/numa.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
+#include <linux/vmalloc.h>
+#include <trace/hooks/mm.h>

 #include <trace/events/kmem.h>

@@ -78,9 +83,13 @@
 #include <linux/uaccess.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
-#include <asm/pgtable.h>

+#include "pgalloc-track.h"
 #include "internal.h"
+#include <trace/hooks/mm.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/pagefault.h>

 #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
@@ -127,6 +136,18 @@
	 * will be hit on old pte.
	 */
	return true;
+}
+#endif
+
+#ifndef arch_wants_old_prefaulted_pte
+static inline bool arch_wants_old_prefaulted_pte(void)
+{
+	/*
+	 * Transitioning a PTE from 'old' to 'young' can be expensive on
+	 * some architectures, even if it's performed in hardware. By
+	 * default, "false" means prefaulted entries will be 'young'.
+	 */
+	return false;
 }
 #endif

@@ -217,263 +238,6 @@

 #endif /* SPLIT_RSS_COUNTING */

-#ifdef HAVE_GENERIC_MMU_GATHER
-
-static bool tlb_next_batch(struct mmu_gather *tlb)
-{
-	struct mmu_gather_batch *batch;
-
-	batch = tlb->active;
-	if (batch->next) {
-		tlb->active = batch->next;
-		return true;
-	}
-
-	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
-		return false;
-
-	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
-	if (!batch)
-		return false;
-
-	tlb->batch_count++;
-	batch->next = NULL;
-	batch->nr = 0;
-	batch->max = MAX_GATHER_BATCH;
-
-	tlb->active->next = batch;
-	tlb->active = batch;
-
-	return true;
-}
-
-void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
-			unsigned long start, unsigned long end)
-{
-	tlb->mm = mm;
-
-	/* Is it from 0 to ~0? */
-	tlb->fullmm = !(start | (end+1));
-	tlb->need_flush_all = 0;
-	tlb->local.next = NULL;
-	tlb->local.nr = 0;
-	tlb->local.max = ARRAY_SIZE(tlb->__pages);
-	tlb->active = &tlb->local;
-	tlb->batch_count = 0;
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-	tlb->batch = NULL;
-#endif
-	tlb->page_size = 0;
-
-	__tlb_reset_range(tlb);
-}
-
-static void tlb_flush_mmu_free(struct mmu_gather *tlb)
-{
-	struct mmu_gather_batch *batch;
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-	tlb_table_flush(tlb);
-#endif
-	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
-		free_pages_and_swap_cache(batch->pages, batch->nr);
-		batch->nr = 0;
-	}
-	tlb->active = &tlb->local;
-}
-
-void tlb_flush_mmu(struct mmu_gather *tlb)
-{
-	tlb_flush_mmu_tlbonly(tlb);
-	tlb_flush_mmu_free(tlb);
-}
-
-/* tlb_finish_mmu
- *	Called at the end of the shootdown operation to free up any resources
- *	that were required.
- */
-void arch_tlb_finish_mmu(struct mmu_gather *tlb,
-		unsigned long start, unsigned long end, bool force)
-{
-	struct mmu_gather_batch *batch, *next;
-
-	if (force)
-		__tlb_adjust_range(tlb, start, end - start);
-
-	tlb_flush_mmu(tlb);
-
-	/* keep the page table cache within bounds */
-	check_pgt_cache();
-
-	for (batch = tlb->local.next; batch; batch = next) {
-		next = batch->next;
-		free_pages((unsigned long)batch, 0);
-	}
-	tlb->local.next = NULL;
-}
-
-/* __tlb_remove_page
- *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
- *	handling the additional races in SMP caused by other CPUs caching valid
- *	mappings in their TLBs. Returns the number of free page slots left.
- *	When out of page slots we must call tlb_flush_mmu().
- *returns true if the caller should flush.
- */
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
-{
-	struct mmu_gather_batch *batch;
-
-	VM_BUG_ON(!tlb->end);
-	VM_WARN_ON(tlb->page_size != page_size);
-
-	batch = tlb->active;
-	/*
-	 * Add the page and check if we are full. If so
-	 * force a flush.
-	 */
-	batch->pages[batch->nr++] = page;
-	if (batch->nr == batch->max) {
-		if (!tlb_next_batch(tlb))
-			return true;
-		batch = tlb->active;
-	}
-	VM_BUG_ON_PAGE(batch->nr > batch->max, page);
-
-	return false;
-}
-
-void tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address,
-			unsigned long size)
-{
-	if (tlb->page_size != 0 && tlb->page_size != PMD_SIZE)
-		tlb_flush_mmu(tlb);
-
-	tlb->page_size = PMD_SIZE;
-	tlb->start = min(tlb->start, address);
-	tlb->end = max(tlb->end, address + size);
-}
-#endif /* HAVE_GENERIC_MMU_GATHER */
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-
-/*
- * See the comment near struct mmu_table_batch.
- */
-
-/*
- * If we want tlb_remove_table() to imply TLB invalidates.
- */
-static inline void tlb_table_invalidate(struct mmu_gather *tlb)
-{
-#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
-	/*
-	 * Invalidate page-table caches used by hardware walkers. Then we still
-	 * need to RCU-sched wait while freeing the pages because software
-	 * walkers can still be in-flight.
-	 */
-	tlb_flush_mmu_tlbonly(tlb);
-#endif
-}
-
-static void tlb_remove_table_smp_sync(void *arg)
-{
-	/* Simply deliver the interrupt */
-}
-
-static void tlb_remove_table_one(void *table)
-{
-	/*
-	 * This isn't an RCU grace period and hence the page-tables cannot be
-	 * assumed to be actually RCU-freed.
-	 *
-	 * It is however sufficient for software page-table walkers that rely on
-	 * IRQ disabling. See the comment near struct mmu_table_batch.
-	 */
-	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
-	__tlb_remove_table(table);
-}
-
-static void tlb_remove_table_rcu(struct rcu_head *head)
-{
-	struct mmu_table_batch *batch;
-	int i;
-
-	batch = container_of(head, struct mmu_table_batch, rcu);
-
-	for (i = 0; i < batch->nr; i++)
-		__tlb_remove_table(batch->tables[i]);
-
-	free_page((unsigned long)batch);
-}
-
-void tlb_table_flush(struct mmu_gather *tlb)
-{
-	struct mmu_table_batch **batch = &tlb->batch;
-
-	if (*batch) {
-		tlb_table_invalidate(tlb);
-		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
-		*batch = NULL;
-	}
-}
-
-void tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
-	struct mmu_table_batch **batch = &tlb->batch;
-
-	if (*batch == NULL) {
-		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
-		if (*batch == NULL) {
-			tlb_table_invalidate(tlb);
-			tlb_remove_table_one(table);
-			return;
-		}
-		(*batch)->nr = 0;
-	}
-
-	(*batch)->tables[(*batch)->nr++] = table;
-	if ((*batch)->nr == MAX_TABLE_BATCH)
-		tlb_table_flush(tlb);
-}
-
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-
-/**
- * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
- * @tlb: the mmu_gather structure to initialize
- * @mm: the mm_struct of the target address space
- * @start: start of the region that will be removed from the page-table
- * @end: end of the region that will be removed from the page-table
- *
- * Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm. The @start and @end are set to 0 and -1
- * respectively when @mm is without users and we're going to destroy
- * the full address space (exit/execve).
- */
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
-			unsigned long start, unsigned long end)
-{
-	arch_tlb_gather_mmu(tlb, mm, start, end);
-	inc_tlb_flush_pending(tlb->mm);
-}
-
-void tlb_finish_mmu(struct mmu_gather *tlb,
-		unsigned long start, unsigned long end)
-{
-	/*
-	 * If there are parallel threads are doing PTE changes on same range
-	 * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
-	 * flush by batching, a thread has stable TLB entry can fail to flush
-	 * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
-	 * forcefully if we detect parallel PTE batching threads.
-	 */
-	bool force = mm_tlb_flush_nested(tlb->mm);
-
-	arch_tlb_finish_mmu(tlb, start, end, force);
-	dec_tlb_flush_pending(tlb->mm);
-}
-
 /*
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
@@ -482,6 +246,16 @@
			   unsigned long addr)
 {
	pgtable_t token = pmd_pgtable(*pmd);
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	/*
+	 * Ensure page table destruction is blocked if __pte_map_lock managed
+	 * to take this lock. Without this barrier tlb_remove_table_rcu can
+	 * destroy ptl after __pte_map_lock locked it and during unlock would
+	 * cause a use-after-free.
+	 */
+	spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
+	spin_unlock(ptl);
+#endif
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
@@ -643,7 +417,7 @@
	 * We add page table cache pages with PAGE_SIZE,
	 * (see pte_free_tlb()), flush the tlb if we need
	 */
-	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+	tlb_change_page_size(tlb, PAGE_SIZE);
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
@@ -664,7 +438,9 @@
	 * Hide vma from rmap and truncate_pagecache before freeing
	 * pgtables
	 */
+	vm_write_begin(vma);
	unlink_anon_vmas(vma);
+	vm_write_end(vma);
	unlink_file_vma(vma);

	if (is_vm_hugetlb_page(vma)) {
@@ -678,7 +454,9 @@
		       && !is_vm_hugetlb_page(next)) {
			vma = next;
			next = vma->vm_next;
+			vm_write_begin(vma);
			unlink_anon_vmas(vma);
+			vm_write_end(vma);
			unlink_file_vma(vma);
		}
		free_pgd_range(tlb, addr, vma->vm_end,
@@ -688,10 +466,10 @@
	}
 }

-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
 {
	spinlock_t *ptl;
-	pgtable_t new = pte_alloc_one(mm, address);
+	pgtable_t new = pte_alloc_one(mm);
	if (!new)
		return -ENOMEM;

@@ -706,7 +484,7 @@
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
-	 * smp_read_barrier_depends() barriers in page table walking code.
+	 * smp_rmb() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

@@ -722,9 +500,9 @@
	return 0;
 }

-int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
+int __pte_alloc_kernel(pmd_t *pmd)
 {
-	pte_t *new = pte_alloc_one_kernel(&init_mm, address);
+	pte_t *new = pte_alloc_one_kernel(&init_mm);
	if (!new)
		return -ENOMEM;

@@ -804,9 +582,9 @@
		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
		dump_page(page, "bad pte");
-	pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
-		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
-	pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
+	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
+		 (void *)addr, READ_ONCE(vma->vm_flags), vma->anon_vma, mapping, index);
+	pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
		 vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->fault : NULL,
		 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
816594 }
817595
818596 /*
819
- * vm_normal_page -- This function gets the "struct page" associated with a pte.
597
+ * __vm_normal_page -- This function gets the "struct page" associated with
598
+ * a pte.
820599 *
821600 * "Special" mappings do not wish to be associated with a "struct page" (either
822601 * it doesn't exist, or it exists but they don't want to touch it). In this
@@ -858,7 +637,7 @@
  *
  */
 struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-			     pte_t pte, bool with_public_device)
+			     pte_t pte, unsigned long vma_flags)
 {
	unsigned long pfn = pte_pfn(pte);

@@ -867,33 +646,10 @@
			goto check_pfn;
		if (vma->vm_ops && vma->vm_ops->find_special_page)
			return vma->vm_ops->find_special_page(vma, addr);
-		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+		if (vma_flags & (VM_PFNMAP | VM_MIXEDMAP))
			return NULL;
		if (is_zero_pfn(pfn))
			return NULL;
-
-		/*
-		 * Device public pages are special pages (they are ZONE_DEVICE
-		 * pages but different from persistent memory). They behave
-		 * allmost like normal pages. The difference is that they are
-		 * not on the lru and thus should never be involve with any-
-		 * thing that involve lru manipulation (mlock, numa balancing,
-		 * ...).
-		 *
-		 * This is why we still want to return NULL for such page from
-		 * vm_normal_page() so that we do not have to special case all
-		 * call site of vm_normal_page().
-		 */
-		if (likely(pfn <= highest_memmap_pfn)) {
-			struct page *page = pfn_to_page(pfn);
-
-			if (is_device_public_page(page)) {
-				if (with_public_device)
-					return page;
-				return NULL;
-			}
-		}
-
		if (pte_devmap(pte))
			return NULL;

@@ -902,9 +658,13 @@
	}

	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
+	/*
+	 * This part should never get called when CONFIG_SPECULATIVE_PAGE_FAULT
+	 * is set. This is mainly because we can't rely on vm_start.
+	 */

-	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
-		if (vma->vm_flags & VM_MIXEDMAP) {
+	if (unlikely(vma_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+		if (vma_flags & VM_MIXEDMAP) {
			if (!pfn_valid(pfn))
				return NULL;
			goto out;
@@ -963,7 +723,7 @@

	if (pmd_devmap(pmd))
		return NULL;
-	if (is_zero_pfn(pfn))
+	if (is_huge_zero_pmd(pmd))
		return NULL;
	if (unlikely(pfn > highest_memmap_pfn))
		return NULL;
@@ -983,80 +743,197 @@
  * covered by this vma.
  */

-static inline unsigned long
-copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
-		unsigned long addr, int *rss)
+static unsigned long
+copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
+		struct vm_area_struct *src_vma, unsigned long addr, int *rss)
 {
-	unsigned long vm_flags = vma->vm_flags;
+	unsigned long vm_flags = dst_vma->vm_flags;
+	pte_t pte = *src_pte;
+	struct page *page;
+	swp_entry_t entry = pte_to_swp_entry(pte);
+
+	if (likely(!non_swap_entry(entry))) {
+		if (swap_duplicate(entry) < 0)
+			return entry.val;
+
+		/* make sure dst_mm is on swapoff's mmlist. */
+		if (unlikely(list_empty(&dst_mm->mmlist))) {
+			spin_lock(&mmlist_lock);
+			if (list_empty(&dst_mm->mmlist))
+				list_add(&dst_mm->mmlist,
+						&src_mm->mmlist);
+			spin_unlock(&mmlist_lock);
+		}
+		rss[MM_SWAPENTS]++;
+	} else if (is_migration_entry(entry)) {
+		page = migration_entry_to_page(entry);
+
+		rss[mm_counter(page)]++;
+
+		if (is_write_migration_entry(entry) &&
+				is_cow_mapping(vm_flags)) {
+			/*
+			 * COW mappings require pages in both
+			 * parent and child to be set to read.
+			 */
+			make_migration_entry_read(&entry);
+			pte = swp_entry_to_pte(entry);
+			if (pte_swp_soft_dirty(*src_pte))
+				pte = pte_swp_mksoft_dirty(pte);
+			if (pte_swp_uffd_wp(*src_pte))
+				pte = pte_swp_mkuffd_wp(pte);
+			set_pte_at(src_mm, addr, src_pte, pte);
+		}
+	} else if (is_device_private_entry(entry)) {
+		page = device_private_entry_to_page(entry);
+
+		/*
+		 * Update rss count even for unaddressable pages, as
+		 * they should treated just like normal pages in this
+		 * respect.
+		 *
+		 * We will likely want to have some new rss counters
+		 * for unaddressable pages, at some point. But for now
+		 * keep things as they are.
+		 */
+		get_page(page);
+		rss[mm_counter(page)]++;
+		page_dup_rmap(page, false);
+
+		/*
+		 * We do not preserve soft-dirty information, because so
+		 * far, checkpoint/restore is the only feature that
+		 * requires that. And checkpoint/restore does not work
+		 * when a device driver is involved (you cannot easily
+		 * save and restore device driver state).
+		 */
+		if (is_write_device_private_entry(entry) &&
+				is_cow_mapping(vm_flags)) {
+			make_device_private_entry_read(&entry);
+			pte = swp_entry_to_pte(entry);
+			if (pte_swp_uffd_wp(*src_pte))
+				pte = pte_swp_mkuffd_wp(pte);
+			set_pte_at(src_mm, addr, src_pte, pte);
+		}
+	}
+	if (!userfaultfd_wp(dst_vma))
+		pte = pte_swp_clear_uffd_wp(pte);
+	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
+}
+
+/*
+ * Copy a present and normal page if necessary.
+ *
+ * NOTE! The usual case is that this doesn't need to do
+ * anything, and can just return a positive value. That
+ * will let the caller know that it can just increase
+ * the page refcount and re-use the pte the traditional
+ * way.
+ *
+ * But _if_ we need to copy it because it needs to be
+ * pinned in the parent (and the child should get its own
+ * copy rather than just a reference to the same page),
+ * we'll do that here and return zero to let the caller
+ * know we're done.
+ *
+ * And if we need a pre-allocated page but don't yet have
+ * one, return a negative error to let the preallocation
+ * code know so that it can do so outside the page table
+ * lock.
+ */
+static inline int
+copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+		  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
+		  struct page **prealloc, pte_t pte, struct page *page)
+{
+	struct mm_struct *src_mm = src_vma->vm_mm;
+	struct page *new_page;
+
+	if (!is_cow_mapping(src_vma->vm_flags))
+		return 1;
+
+	/*
+	 * What we want to do is to check whether this page may
+	 * have been pinned by the parent process. If so,
+	 * instead of wrprotect the pte on both sides, we copy
+	 * the page immediately so that we'll always guarantee
+	 * the pinned page won't be randomly replaced in the
+	 * future.
+	 *
+	 * The page pinning checks are just "has this mm ever
+	 * seen pinning", along with the (inexact) check of
+	 * the page count. That might give false positives for
+	 * for pinning, but it will work correctly.
+	 */
+	if (likely(!atomic_read(&src_mm->has_pinned)))
+		return 1;
+	if (likely(!page_maybe_dma_pinned(page)))
+		return 1;
+
+	/*
+	 * The vma->anon_vma of the child process may be NULL
+	 * because the entire vma does not contain anonymous pages.
+	 * A BUG will occur when the copy_present_page() passes
+	 * a copy of a non-anonymous page of that vma to the
+	 * page_add_new_anon_rmap() to set up new anonymous rmap.
+	 * Return 1 if the page is not an anonymous page.
+	 */
+	if (!PageAnon(page))
+		return 1;
+
+	new_page = *prealloc;
+	if (!new_page)
+		return -EAGAIN;
+
+	/*
+	 * We have a prealloc page, all good!  Take it
+	 * over and copy the page & arm it.
+	 */
+	*prealloc = NULL;
+	copy_user_highpage(new_page, page, addr, src_vma);
+	__SetPageUptodate(new_page);
+	page_add_new_anon_rmap(new_page, dst_vma, addr, false);
+	lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
+	rss[mm_counter(new_page)]++;
+
+	/* All done, just insert the new page copy in the child */
+	pte = mk_pte(new_page, dst_vma->vm_page_prot);
+	pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma->vm_flags);
+	if (userfaultfd_pte_wp(dst_vma, *src_pte))
+		/* Uffd-wp needs to be delivered to dest pte as well */
+		pte = pte_wrprotect(pte_mkuffd_wp(pte));
+	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
+	return 0;
+}
+
+/*
+ * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
+ * is required to copy this pte.
+ */
+static inline int
+copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+		 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
+		 struct page **prealloc)
+{
+	struct mm_struct *src_mm = src_vma->vm_mm;
+	unsigned long vm_flags = src_vma->vm_flags;
	pte_t pte = *src_pte;
	struct page *page;

-	/* pte contains position in swap or file, so copy. */
-	if (unlikely(!pte_present(pte))) {
-		swp_entry_t entry = pte_to_swp_entry(pte);
+	page = vm_normal_page(src_vma, addr, pte);
+	if (page) {
+		int retval;

-		if (likely(!non_swap_entry(entry))) {
-			if (swap_duplicate(entry) < 0)
-				return entry.val;
+		retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+					   addr, rss, prealloc, pte, page);
+		if (retval <= 0)
+			return retval;

-			/* make sure dst_mm is on swapoff's mmlist. */
-			if (unlikely(list_empty(&dst_mm->mmlist))) {
-				spin_lock(&mmlist_lock);
-				if (list_empty(&dst_mm->mmlist))
-					list_add(&dst_mm->mmlist,
-							&src_mm->mmlist);
-				spin_unlock(&mmlist_lock);
-			}
-			rss[MM_SWAPENTS]++;
-		} else if (is_migration_entry(entry)) {
-			page = migration_entry_to_page(entry);
-
-			rss[mm_counter(page)]++;
-
-			if (is_write_migration_entry(entry) &&
-					is_cow_mapping(vm_flags)) {
-				/*
-				 * COW mappings require pages in both
-				 * parent and child to be set to read.
-				 */
-				make_migration_entry_read(&entry);
-				pte = swp_entry_to_pte(entry);
-				if (pte_swp_soft_dirty(*src_pte))
-					pte = pte_swp_mksoft_dirty(pte);
-				set_pte_at(src_mm, addr, src_pte, pte);
-			}
-		} else if (is_device_private_entry(entry)) {
-			page = device_private_entry_to_page(entry);
-
-			/*
-			 * Update rss count even for unaddressable pages, as
-			 * they should treated just like normal pages in this
-			 * respect.
-			 *
-			 * We will likely want to have some new rss counters
-			 * for unaddressable pages, at some point. But for now
-			 * keep things as they are.
-			 */
-			get_page(page);
-			rss[mm_counter(page)]++;
-			page_dup_rmap(page, false);
-
-			/*
-			 * We do not preserve soft-dirty information, because so
-			 * far, checkpoint/restore is the only feature that
-			 * requires that. And checkpoint/restore does not work
-			 * when a device driver is involved (you cannot easily
-			 * save and restore device driver state).
-			 */
-			if (is_write_device_private_entry(entry) &&
-					is_cow_mapping(vm_flags)) {
-				make_device_private_entry_read(&entry);
-				pte = swp_entry_to_pte(entry);
-				set_pte_at(src_mm, addr, src_pte, pte);
-			}
-		}
-		goto out_set_pte;
+		get_page(page);
+		page_dup_rmap(page, false);
+		rss[mm_counter(page)]++;
	}

	/*
@@ -1076,48 +953,56 @@
	pte = pte_mkclean(pte);
	pte = pte_mkold(pte);

-	page = vm_normal_page(vma, addr, pte);
-	if (page) {
-		get_page(page);
-		page_dup_rmap(page, false);
-		rss[mm_counter(page)]++;
-	} else if (pte_devmap(pte)) {
-		page = pte_page(pte);
+	if (!userfaultfd_wp(dst_vma))
+		pte = pte_clear_uffd_wp(pte);

-		/*
-		 * Cache coherent device memory behave like regular page and
-		 * not like persistent memory page. For more informations see
-		 * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
-		 */
-		if (is_device_public_page(page)) {
-			get_page(page);
-			page_dup_rmap(page, false);
-			rss[mm_counter(page)]++;
-		}
-	}
-
-out_set_pte:
-	set_pte_at(dst_mm, addr, dst_pte, pte);
+	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
	return 0;
 }

-static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end)
+static inline struct page *
+page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
+		   unsigned long addr)
 {
+	struct page *new_page;
+
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
+	if (!new_page)
+		return NULL;
+
+	if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
+		put_page(new_page);
+		return NULL;
+	}
+	cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+
+	return new_page;
+}
+
+static int
+copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+	       pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+	       unsigned long end)
+{
+	struct mm_struct *dst_mm = dst_vma->vm_mm;
+	struct mm_struct *src_mm = src_vma->vm_mm;
	pte_t *orig_src_pte, *orig_dst_pte;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;
-	int progress = 0;
+	int progress, ret = 0;
	int rss[NR_MM_COUNTERS];
	swp_entry_t entry = (swp_entry_t){0};
+	struct page *prealloc = NULL;

 again:
+	progress = 0;
	init_rss_vec(rss);

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
-	if (!dst_pte)
-		return -ENOMEM;
+	if (!dst_pte) {
+		ret = -ENOMEM;
+		goto out;
+	}
	src_pte = pte_offset_map(src_pmd, addr);
	src_ptl = pte_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -1140,10 +1025,35 @@
			progress++;
			continue;
		}
-		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
-							vma, addr, rss);
-		if (entry.val)
+		if (unlikely(!pte_present(*src_pte))) {
+			entry.val = copy_nonpresent_pte(dst_mm, src_mm,
+							dst_pte, src_pte,
+							dst_vma, src_vma,
+							addr, rss);
+			if (entry.val)
+				break;
+			progress += 8;
+			continue;
+		}
+		/* copy_present_pte() will clear `*prealloc' if consumed */
+		ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
+				       addr, rss, &prealloc);
+		/*
+		 * If we need a pre-allocated page for this pte, drop the
+		 * locks, allocate, and try again.
+		 */
+		if (unlikely(ret == -EAGAIN))
			break;
+		if (unlikely(prealloc)) {
+			/*
+			 * pre-alloc page cannot be reused by next time so as
+			 * to strictly follow mempolicy (e.g., alloc_page_vma()
+			 * will allocate page according to address).  This
+			 * could only happen if one pinned pte changed.
+			 */
+			put_page(prealloc);
+			prealloc = NULL;
+		}
		progress += 8;
	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

@@ -1155,19 +1065,34 @@
	cond_resched();

	if (entry.val) {
-		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		entry.val = 0;
+	} else if (ret) {
+		WARN_ON_ONCE(ret != -EAGAIN);
+		prealloc = page_copy_prealloc(src_mm, src_vma, addr);
+		if (!prealloc)
			return -ENOMEM;
-		progress = 0;
+		/* We've captured and resolved the error. Reset, try again. */
+		ret = 0;
	}
	if (addr != end)
		goto again;
-	return 0;
+out:
+	if (unlikely(prealloc))
+		put_page(prealloc);
+	return ret;
 }

-static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end)
+static inline int
+copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+	       pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
+	       unsigned long end)
 {
+	struct mm_struct *dst_mm = dst_vma->vm_mm;
+	struct mm_struct *src_mm = src_vma->vm_mm;
	pmd_t *src_pmd, *dst_pmd;
	unsigned long next;

@@ -1180,9 +1105,9 @@
		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
			|| pmd_devmap(*src_pmd)) {
			int err;
-			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
-			err = copy_huge_pmd(dst_mm, src_mm,
-					    dst_pmd, src_pmd, addr, vma);
+			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
+			err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+					    addr, dst_vma, src_vma);
			if (err == -ENOMEM)
				return -ENOMEM;
			if (!err)
11911116 }
11921117 if (pmd_none_or_clear_bad(src_pmd))
11931118 continue;
1194
- if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
1195
- vma, addr, next))
1119
+ if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
1120
+ addr, next))
11961121 return -ENOMEM;
11971122 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
11981123 return 0;
11991124 }
12001125
1201
-static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1202
- p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
1203
- unsigned long addr, unsigned long end)
1126
+static inline int
1127
+copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1128
+ p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
1129
+ unsigned long end)
12041130 {
1131
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
1132
+ struct mm_struct *src_mm = src_vma->vm_mm;
12051133 pud_t *src_pud, *dst_pud;
12061134 unsigned long next;
12071135
....@@ -1214,9 +1142,9 @@
12141142 if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
12151143 int err;
12161144
1217
- VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
1145
+ VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
12181146 err = copy_huge_pud(dst_mm, src_mm,
1219
- dst_pud, src_pud, addr, vma);
1147
+ dst_pud, src_pud, addr, src_vma);
12201148 if (err == -ENOMEM)
12211149 return -ENOMEM;
12221150 if (!err)
....@@ -1225,17 +1153,19 @@
12251153 }
12261154 if (pud_none_or_clear_bad(src_pud))
12271155 continue;
1228
- if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1229
- vma, addr, next))
1156
+ if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
1157
+ addr, next))
12301158 return -ENOMEM;
12311159 } while (dst_pud++, src_pud++, addr = next, addr != end);
12321160 return 0;
12331161 }
12341162
1235
-static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1236
- pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
1237
- unsigned long addr, unsigned long end)
1163
+static inline int
1164
+copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
1165
+ pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
1166
+ unsigned long end)
12381167 {
1168
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
12391169 p4d_t *src_p4d, *dst_p4d;
12401170 unsigned long next;
12411171
@@ -1247,22 +1177,23 @@
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(src_p4d))
			continue;
-		if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
-				   vma, addr, next))
+		if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
+				   addr, next))
			return -ENOMEM;
	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
	return 0;
 }

-int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		struct vm_area_struct *vma)
+int
+copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 {
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
+	unsigned long addr = src_vma->vm_start;
+	unsigned long end = src_vma->vm_end;
+	struct mm_struct *dst_mm = dst_vma->vm_mm;
+	struct mm_struct *src_mm = src_vma->vm_mm;
+	struct mmu_notifier_range range;
	bool is_cow;
	int ret;

@@ -1272,19 +1203,19 @@
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
-	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
-			!vma->anon_vma)
+	if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
+	    !src_vma->anon_vma)
		return 0;

-	if (is_vm_hugetlb_page(vma))
-		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+	if (is_vm_hugetlb_page(src_vma))
+		return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);

-	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
+	if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
-		ret = track_pfn_copy(vma);
+		ret = track_pfn_copy(src_vma);
		if (ret)
			return ret;
	}
@@ -1295,12 +1226,22 @@
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
-	is_cow = is_cow_mapping(vma->vm_flags);
-	mmun_start = addr;
-	mmun_end = end;
-	if (is_cow)
-		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
-						    mmun_end);
+	is_cow = is_cow_mapping(src_vma->vm_flags);
+
+	if (is_cow) {
+		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+					0, src_vma, src_mm, addr, end);
+		mmu_notifier_invalidate_range_start(&range);
+		/*
+		 * Disabling preemption is not needed for the write side, as
+		 * the read side doesn't spin, but goes to the mmap_lock.
+		 *
+		 * Use the raw variant of the seqcount_t write API to avoid
+		 * lockdep complaining about preemptibility.
+		 */
+		mmap_assert_write_locked(src_mm);
+		raw_write_seqcount_begin(&src_mm->write_protect_seq);
+	}

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
13091250 next = pgd_addr_end(addr, end);
13101251 if (pgd_none_or_clear_bad(src_pgd))
13111252 continue;
1312
- if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
1313
- vma, addr, next))) {
1253
+ if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
1254
+ addr, next))) {
13141255 ret = -ENOMEM;
13151256 break;
13161257 }
13171258 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
13181259
1319
- if (is_cow)
1320
- mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1260
+ if (is_cow) {
1261
+ raw_write_seqcount_end(&src_mm->write_protect_seq);
1262
+ mmu_notifier_invalidate_range_end(&range);
1263
+ }
13211264 return ret;
1265
+}
1266
+
1267
+/* Whether we should zap all COWed (private) pages too */
1268
+static inline bool should_zap_cows(struct zap_details *details)
1269
+{
1270
+ /* By default, zap all pages */
1271
+ if (!details)
1272
+ return true;
1273
+
1274
+ /* Or, we zap COWed pages only if the caller wants to */
1275
+ return !details->check_mapping;
13221276 }
13231277
13241278 static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1334,7 +1288,7 @@
	pte_t *pte;
	swp_entry_t entry;

-	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+	tlb_change_page_size(tlb, PAGE_SIZE);
 again:
	init_rss_vec(rss);
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1346,10 +1300,13 @@
		if (pte_none(ptent))
			continue;

+		if (need_resched())
+			break;
+
		if (pte_present(ptent)) {
			struct page *page;

-			page = _vm_normal_page(vma, addr, ptent, true);
+			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
13791336 page_remove_rmap(page, false);
13801337 if (unlikely(page_mapcount(page) < 0))
13811338 print_bad_pte(vma, addr, ptent, page);
1382
- if (unlikely(__tlb_remove_page(tlb, page))) {
1339
+ if (unlikely(__tlb_remove_page(tlb, page)) ||
1340
+ lru_cache_disabled()) {
13831341 force_flush = 1;
13841342 addr += PAGE_SIZE;
13851343 break;
....@@ -1388,7 +1346,7 @@
13881346 }
13891347
13901348 entry = pte_to_swp_entry(ptent);
1391
- if (non_swap_entry(entry) && is_device_private_entry(entry)) {
1349
+ if (is_device_private_entry(entry)) {
13921350 struct page *page = device_private_entry_to_page(entry);
13931351
13941352 if (unlikely(details && details->check_mapping)) {
....@@ -1409,17 +1367,18 @@
14091367 continue;
14101368 }
14111369
1412
- /* If details->check_mapping, we leave swap entries. */
1413
- if (unlikely(details))
1414
- continue;
1415
-
1416
- entry = pte_to_swp_entry(ptent);
1417
- if (!non_swap_entry(entry))
1370
+ if (!non_swap_entry(entry)) {
1371
+ /* Genuine swap entry, hence a private anon page */
1372
+ if (!should_zap_cows(details))
1373
+ continue;
14181374 rss[MM_SWAPENTS]--;
1419
- else if (is_migration_entry(entry)) {
1375
+ } else if (is_migration_entry(entry)) {
14201376 struct page *page;
14211377
14221378 page = migration_entry_to_page(entry);
1379
+ if (details && details->check_mapping &&
1380
+ details->check_mapping != page_rmapping(page))
1381
+ continue;
14231382 rss[mm_counter(page)]--;
14241383 }
14251384 if (unlikely(!free_swap_and_cache(entry)))
....@@ -1443,9 +1402,12 @@
14431402 */
14441403 if (force_flush) {
14451404 force_flush = 0;
1446
- tlb_flush_mmu_free(tlb);
1447
- if (addr != end)
1448
- goto again;
1405
+ tlb_flush_mmu(tlb);
1406
+ }
1407
+
1408
+ if (addr != end) {
1409
+ cond_resched();
1410
+ goto again;
14491411 }
14501412
14511413 return addr;
....@@ -1484,7 +1446,7 @@
14841446 * Here there can be other concurrent MADV_DONTNEED or
14851447 * trans huge page faults running, and if the pmd is
14861448 * none or trans huge it can change under us. This is
1487
- * because MADV_DONTNEED holds the mmap_sem in read
1449
+ * because MADV_DONTNEED holds the mmap_lock in read
14881450 * mode.
14891451 */
14901452 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
....@@ -1510,7 +1472,7 @@
15101472 next = pud_addr_end(addr, end);
15111473 if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
15121474 if (next - addr != HPAGE_PUD_SIZE) {
1513
- VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
1475
+ mmap_assert_locked(tlb->mm);
15141476 split_huge_pud(vma, pud, addr);
15151477 } else if (zap_huge_pud(tlb, vma, pud, addr))
15161478 goto next;
....@@ -1631,12 +1593,14 @@
16311593 struct vm_area_struct *vma, unsigned long start_addr,
16321594 unsigned long end_addr)
16331595 {
1634
- struct mm_struct *mm = vma->vm_mm;
1596
+ struct mmu_notifier_range range;
16351597
1636
- mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1598
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
1599
+ start_addr, end_addr);
1600
+ mmu_notifier_invalidate_range_start(&range);
16371601 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
16381602 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1639
- mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1603
+ mmu_notifier_invalidate_range_end(&range);
16401604 }
16411605
16421606 /**
@@ -1650,18 +1614,19 @@
 void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size)
 {
-	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_notifier_range range;
	struct mmu_gather tlb;
-	unsigned long end = start + size;

	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm, start, end);
-	update_hiwater_rss(mm);
-	mmu_notifier_invalidate_range_start(mm, start, end);
-	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
-		unmap_single_vma(&tlb, vma, start, end, NULL);
-	mmu_notifier_invalidate_range_end(mm, start, end);
-	tlb_finish_mmu(&tlb, start, end);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+				start, start + size);
+	tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
+	update_hiwater_rss(vma->vm_mm);
+	mmu_notifier_invalidate_range_start(&range);
+	for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
+		unmap_single_vma(&tlb, vma, start, range.end, NULL);
+	mmu_notifier_invalidate_range_end(&range);
+	tlb_finish_mmu(&tlb, start, range.end);
 }

 /**
@@ -1676,17 +1641,18 @@
 static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
 {
-	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_notifier_range range;
	struct mmu_gather tlb;
-	unsigned long end = address + size;

	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm, address, end);
-	update_hiwater_rss(mm);
-	mmu_notifier_invalidate_range_start(mm, address, end);
-	unmap_single_vma(&tlb, vma, address, end, details);
-	mmu_notifier_invalidate_range_end(mm, address, end);
-	tlb_finish_mmu(&tlb, address, end);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+				address, address + size);
+	tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
+	update_hiwater_rss(vma->vm_mm);
+	mmu_notifier_invalidate_range_start(&range);
+	unmap_single_vma(&tlb, vma, address, range.end, details);
+	mmu_notifier_invalidate_range_end(&range);
+	tlb_finish_mmu(&tlb, address, range.end);
 }

 /**
@@ -1711,8 +1677,7 @@
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);

-pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
-			spinlock_t **ptl)
+static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
 {
	pgd_t *pgd;
	p4d_t *p4d;
@@ -1731,7 +1696,38 @@
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));
+	return pmd;
+}
+
+pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
+			spinlock_t **ptl)
+{
+	pmd_t *pmd = walk_to_pmd(mm, addr);
+
+	if (!pmd)
+		return NULL;
	return pte_alloc_map_lock(mm, pmd, addr, ptl);
+}
+
+static int validate_page_before_insert(struct page *page)
+{
+	if (PageAnon(page) || PageSlab(page) || page_has_type(page))
+		return -EINVAL;
+	flush_dcache_page(page);
+	return 0;
+}
+
+static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
+			unsigned long addr, struct page *page, pgprot_t prot)
+{
+	if (!pte_none(*pte))
+		return -EBUSY;
+	/* Ok, finally just insert the thing.. */
+	get_page(page);
+	inc_mm_counter_fast(mm, mm_counter_file(page));
+	page_add_file_rmap(page, false);
+	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	return 0;
 }

 /*
@@ -1749,32 +1745,135 @@
	pte_t *pte;
	spinlock_t *ptl;

-	retval = -EINVAL;
-	if (PageAnon(page))
+	retval = validate_page_before_insert(page);
+	if (retval)
		goto out;
	retval = -ENOMEM;
-	flush_dcache_page(page);
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
-	retval = -EBUSY;
-	if (!pte_none(*pte))
-		goto out_unlock;
-
-	/* Ok, finally just insert the thing.. */
-	get_page(page);
-	inc_mm_counter_fast(mm, mm_counter_file(page));
-	page_add_file_rmap(page, false);
-	set_pte_at(mm, addr, pte, mk_pte(page, prot));
-
-	retval = 0;
-	pte_unmap_unlock(pte, ptl);
-	return retval;
-out_unlock:
+	retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
	pte_unmap_unlock(pte, ptl);
 out:
	return retval;
 }
+
+#ifdef pte_index
+static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
+			unsigned long addr, struct page *page, pgprot_t prot)
+{
+	int err;
+
+	if (!page_count(page))
+		return -EINVAL;
+	err = validate_page_before_insert(page);
+	if (err)
+		return err;
+	return insert_page_into_pte_locked(mm, pte, addr, page, prot);
+}
+
+/* insert_pages() amortizes the cost of spinlock operations
+ * when inserting pages in a loop. Arch *must* define pte_index.
+ */
+static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
+			struct page **pages, unsigned long *num, pgprot_t prot)
+{
+	pmd_t *pmd = NULL;
+	pte_t *start_pte, *pte;
+	spinlock_t *pte_lock;
+	struct mm_struct *const mm = vma->vm_mm;
+	unsigned long curr_page_idx = 0;
+	unsigned long remaining_pages_total = *num;
+	unsigned long pages_to_write_in_pmd;
+	int ret;
+more:
+	ret = -EFAULT;
+	pmd = walk_to_pmd(mm, addr);
+	if (!pmd)
+		goto out;
+
+	pages_to_write_in_pmd = min_t(unsigned long,
+		remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
+
+	/* Allocate the PTE if necessary; takes PMD lock once only. */
+	ret = -ENOMEM;
+	if (pte_alloc(mm, pmd))
+		goto out;
+
+	while (pages_to_write_in_pmd) {
+		int pte_idx = 0;
+		const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
+
+		start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
+		for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
+			int err = insert_page_in_batch_locked(mm, pte,
+				addr, pages[curr_page_idx], prot);
+			if (unlikely(err)) {
+				pte_unmap_unlock(start_pte, pte_lock);
+				ret = err;
+				remaining_pages_total -= pte_idx;
+				goto out;
+			}
+			addr += PAGE_SIZE;
+			++curr_page_idx;
+		}
+		pte_unmap_unlock(start_pte, pte_lock);
+		pages_to_write_in_pmd -= batch_size;
+		remaining_pages_total -= batch_size;
+	}
+	if (remaining_pages_total)
+		goto more;
+	ret = 0;
+out:
+	*num = remaining_pages_total;
+	return ret;
+}
+#endif /* ifdef pte_index */
+
+/**
+ * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
+ * @vma: user vma to map to
+ * @addr: target start user address of these pages
+ * @pages: source kernel pages
+ * @num: in: number of pages to map. out: number of pages that were *not*
+ * mapped. (0 means all pages were successfully mapped).
+ *
+ * Preferred over vm_insert_page() when inserting multiple pages.
+ *
+ * In case of error, we may have mapped a subset of the provided
+ * pages. It is the caller's responsibility to account for this case.
+ *
+ * The same restrictions apply as in vm_insert_page().
+ */
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+			struct page **pages, unsigned long *num)
+{
+#ifdef pte_index
+	const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
+
+	if (addr < vma->vm_start || end_addr >= vma->vm_end)
+		return -EFAULT;
+	if (!(vma->vm_flags & VM_MIXEDMAP)) {
+		BUG_ON(mmap_read_trylock(vma->vm_mm));
+		BUG_ON(vma->vm_flags & VM_PFNMAP);
+		vma->vm_flags |= VM_MIXEDMAP;
+	}
+	/* Defer page refcount checking till we're about to map that page. */
+	return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
+#else
+	unsigned long idx = 0, pgcount = *num;
+	int err = -EINVAL;
+
+	for (; idx < pgcount; ++idx) {
+		err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
+		if (err)
+			break;
+	}
+	*num = pgcount - idx;
+	return err;
+#endif /* ifdef pte_index */
+}
+EXPORT_SYMBOL(vm_insert_pages);

 /**
  * vm_insert_page - insert single page into user vma
@@ -1799,9 +1898,11 @@
  * The page does not need to be reserved.
  *
  * Usually this function is called from f_op->mmap() handler
- * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
+ * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
  * Caller must set VM_MIXEDMAP on vma if it wants to call this
  * function from other places, for example from page-fault handler.
+ *
+ * Return: %0 on success, negative error code otherwise.
  */
 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
@@ -1811,7 +1912,7 @@
	if (!page_count(page))
		return -EINVAL;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
-		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+		BUG_ON(mmap_read_trylock(vma->vm_mm));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}
@@ -1819,19 +1920,97 @@
 }
 EXPORT_SYMBOL(vm_insert_page);

-static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+/*
+ * __vm_map_pages - maps range of kernel pages into user vma
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ * @offset: user's requested vm_pgoff
+ *
+ * This allows drivers to map range of kernel pages into a user vma.
+ *
+ * Return: 0 on success and error code otherwise.
+ */
+static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+				unsigned long num, unsigned long offset)
+{
+	unsigned long count = vma_pages(vma);
+	unsigned long uaddr = vma->vm_start;
+	int ret, i;
+
+	/* Fail if the user requested offset is beyond the end of the object */
+	if (offset >= num)
+		return -ENXIO;
+
+	/* Fail if the user requested size exceeds available object size */
+	if (count > num - offset)
+		return -ENXIO;
+
+	for (i = 0; i < count; i++) {
+		ret = vm_insert_page(vma, uaddr, pages[offset + i]);
+		if (ret < 0)
+			return ret;
+		uaddr += PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+/**
+ * vm_map_pages - maps range of kernel pages starts with non zero offset
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ *
+ * Maps an object consisting of @num pages, catering for the user's
+ * requested vm_pgoff
+ *
+ * If we fail to insert any page into the vma, the function will return
+ * immediately leaving any previously inserted pages present.  Callers
+ * from the mmap handler may immediately return the error as their caller
+ * will destroy the vma, removing any successfully inserted pages. Other
+ * callers should make their own arrangements for calling unmap_region().
+ *
+ * Context: Process context. Called by mmap handlers.
+ * Return: 0 on success and error code otherwise.
+ */
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+				unsigned long num)
+{
+	return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
+}
+EXPORT_SYMBOL(vm_map_pages);
+
+/**
+ * vm_map_pages_zero - map range of kernel pages starts with zero offset
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ *
+ * Similar to vm_map_pages(), except that it explicitly sets the offset
+ * to 0. This function is intended for the drivers that did not consider
+ * vm_pgoff.
+ *
+ * Context: Process context. Called by mmap handlers.
+ * Return: 0 on success and error code otherwise.
+ */
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+				unsigned long num)
+{
+	return __vm_map_pages(vma, pages, num, 0);
+}
+EXPORT_SYMBOL(vm_map_pages_zero);
+
+static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn, pgprot_t prot, bool mkwrite)
 {
	struct mm_struct *mm = vma->vm_mm;
-	int retval;
	pte_t *pte, entry;
	spinlock_t *ptl;

-	retval = -ENOMEM;
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
-		goto out;
-	retval = -EBUSY;
+		return VM_FAULT_OOM;
	if (!pte_none(*pte)) {
		if (mkwrite) {
			/*
@@ -1849,7 +2028,8 @@
			goto out_unlock;
		}
		entry = pte_mkyoung(*pte);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		entry = maybe_mkwrite(pte_mkdirty(entry),
+					vma->vm_flags);
		if (ptep_set_access_flags(vma, addr, pte, entry, 1))
			update_mmu_cache(vma, addr, pte);
	}
@@ -1864,62 +2044,41 @@

	if (mkwrite) {
		entry = pte_mkyoung(entry);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma->vm_flags);
	}

	set_pte_at(mm, addr, pte, entry);
	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */

-	retval = 0;
 out_unlock:
	pte_unmap_unlock(pte, ptl);
-out:
-	return retval;
+	return VM_FAULT_NOPAGE;
 }

 /**
- * vm_insert_pfn - insert single pfn into user vma
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- *
- * Similar to vm_insert_page, this allows drivers to insert individual pages
- * they've allocated into a user vma. Same comments apply.
- *
- * This function should only be called from a vm_ops->fault handler, and
- * in that case the handler should return NULL.
- *
- * vma cannot be a COW mapping.
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- */
-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
-			unsigned long pfn)
-{
-	return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
-}
-EXPORT_SYMBOL(vm_insert_pfn);
-
-/**
- * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
+ * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
  * @vma: user vma to map to
  * @addr: target user address of this page
  * @pfn: source kernel pfn
  * @pgprot: pgprot flags for the inserted page
  *
- * This is exactly like vm_insert_pfn, except that it allows drivers to
+ * This is exactly like vmf_insert_pfn(), except that it allows drivers
  * to override pgprot on a per-page basis.
  *
  * This only makes sense for IO mappings, and it makes no sense for
- * cow mappings.  In general, using multiple vmas is preferable;
- * vm_insert_pfn_prot should only be used if using multiple VMAs is
+ * COW mappings.  In general, using multiple vmas is preferable;
+ * vmf_insert_pfn_prot should only be used if using multiple VMAs is
  * impractical.
+ *
+ * See vmf_insert_mixed_prot() for a discussion of the implication of using
+ * a value of @pgprot different from that of @vma->vm_page_prot.
+ *
+ * Context: Process context.  May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
  */
-int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t pgprot)
 {
-	int ret;
	/*
	 * Technically, architectures with pte_special can avoid all these
	 * restrictions (same for remap_pfn_range). However we would like
....@@ -1933,19 +2092,44 @@
19332092 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
19342093
19352094 if (addr < vma->vm_start || addr >= vma->vm_end)
1936
- return -EFAULT;
2095
+ return VM_FAULT_SIGBUS;
19372096
19382097 if (!pfn_modify_allowed(pfn, pgprot))
1939
- return -EACCES;
2098
+ return VM_FAULT_SIGBUS;
19402099
19412100 track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
19422101
1943
- ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
2102
+ return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
19442103 false);
1945
-
1946
- return ret;
19472104 }
1948
-EXPORT_SYMBOL(vm_insert_pfn_prot);
2105
+EXPORT_SYMBOL(vmf_insert_pfn_prot);
2106
+
2107
+/**
2108
+ * vmf_insert_pfn - insert single pfn into user vma
2109
+ * @vma: user vma to map to
2110
+ * @addr: target user address of this page
2111
+ * @pfn: source kernel pfn
2112
+ *
2113
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
2114
+ * they've allocated into a user vma. Same comments apply.
2115
+ *
2116
+ * This function should only be called from a vm_ops->fault handler, and
2117
+ * in that case the handler should return the result of this function.
2118
+ *
2119
+ * vma cannot be a COW mapping.
2120
+ *
2121
+ * As this is called only for pages that do not currently exist, we
2122
+ * do not need to flush old virtual caches or the TLB.
2123
+ *
2124
+ * Context: Process context. May allocate using %GFP_KERNEL.
2125
+ * Return: vm_fault_t value.
2126
+ */
2127
+vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2128
+ unsigned long pfn)
2129
+{
2130
+ return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
2131
+}
2132
+EXPORT_SYMBOL(vmf_insert_pfn);
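With insert_pfn() reporting a vm_fault_t, a fault handler built on vmf_insert_pfn() no longer translates errnos by hand; a hedged sketch (my_dev and its base_pfn field are assumptions):

	static vm_fault_t my_dev_fault(struct vm_fault *vmf)
	{
		struct my_dev *dev = vmf->vma->vm_private_data;	/* hypothetical */

		/* VM_FAULT_NOPAGE, VM_FAULT_OOM or VM_FAULT_SIGBUS propagate as-is. */
		return vmf_insert_pfn(vmf->vma, vmf->address,
				      dev->base_pfn + vmf->pgoff);
	}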
19492133
19502134 static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
19512135 {
....@@ -1961,20 +2145,21 @@
19612145 return false;
19622146 }
19632147
1964
-static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1965
- pfn_t pfn, bool mkwrite)
2148
+static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
2149
+ unsigned long addr, pfn_t pfn, pgprot_t pgprot,
2150
+ bool mkwrite)
19662151 {
1967
- pgprot_t pgprot = vma->vm_page_prot;
2152
+ int err;
19682153
19692154 BUG_ON(!vm_mixed_ok(vma, pfn));
19702155
19712156 if (addr < vma->vm_start || addr >= vma->vm_end)
1972
- return -EFAULT;
2157
+ return VM_FAULT_SIGBUS;
19732158
19742159 track_pfn_insert(vma, &pgprot, pfn);
19752160
19762161 if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
1977
- return -EACCES;
2162
+ return VM_FAULT_SIGBUS;
19782163
19792164 /*
19802165 * If we don't have pte special, then we have to use the pfn_valid()
....@@ -1993,36 +2178,68 @@
19932178 * result in pfn_t_has_page() == false.
19942179 */
19952180 page = pfn_to_page(pfn_t_to_pfn(pfn));
1996
- return insert_page(vma, addr, page, pgprot);
2181
+ err = insert_page(vma, addr, page, pgprot);
2182
+ } else {
2183
+ return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
19972184 }
1998
- return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
2185
+
2186
+ if (err == -ENOMEM)
2187
+ return VM_FAULT_OOM;
2188
+ if (err < 0 && err != -EBUSY)
2189
+ return VM_FAULT_SIGBUS;
2190
+
2191
+ return VM_FAULT_NOPAGE;
19992192 }
20002193
2001
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2002
- pfn_t pfn)
2194
+/**
2195
+ * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
2196
+ * @vma: user vma to map to
2197
+ * @addr: target user address of this page
2198
+ * @pfn: source kernel pfn
2199
+ * @pgprot: pgprot flags for the inserted page
2200
+ *
2201
+ * This is exactly like vmf_insert_mixed(), except that it allows drivers
2202
+ * to override pgprot on a per-page basis.
2203
+ *
2204
+ * Typically this function should be used by drivers to set caching- and
2205
+ * encryption bits different than those of @vma->vm_page_prot, because
2206
+ * the caching- or encryption mode may not be known at mmap() time.
2207
+ * This is ok as long as @vma->vm_page_prot is not used by the core vm
2208
+ * to set caching and encryption bits for those vmas (except for COW pages).
2209
+ * This is ensured by core vm only modifying these page table entries using
2210
+ * functions that don't touch caching- or encryption bits, using pte_modify()
2211
+ * if needed. (See for example mprotect()).
2212
+ * Also when new page-table entries are created, this is only done using the
2213
+ * fault() callback, and never using the value of vma->vm_page_prot,
2214
+ * except for page-table entries that point to anonymous pages as the result
2215
+ * of COW.
2216
+ *
2217
+ * Context: Process context. May allocate using %GFP_KERNEL.
2218
+ * Return: vm_fault_t value.
2219
+ */
2220
+vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
2221
+ pfn_t pfn, pgprot_t pgprot)
20032222 {
2004
- return __vm_insert_mixed(vma, addr, pfn, false);
2005
-
2223
+ return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
20062224 }
2007
-EXPORT_SYMBOL(vm_insert_mixed);
2225
+EXPORT_SYMBOL(vmf_insert_mixed_prot);
2226
+
2227
+vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2228
+ pfn_t pfn)
2229
+{
2230
+ return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
2231
+}
2232
+EXPORT_SYMBOL(vmf_insert_mixed);
20082233
20092234 /*
20102235 * If the insertion of PTE failed because someone else already added a
20112236 * different entry in the mean time, we treat that as success as we assume
20122237 * the same entry was actually inserted.
20132238 */
2014
-
20152239 vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
20162240 unsigned long addr, pfn_t pfn)
20172241 {
2018
- int err;
2019
-
2020
- err = __vm_insert_mixed(vma, addr, pfn, true);
2021
- if (err == -ENOMEM)
2022
- return VM_FAULT_OOM;
2023
- if (err < 0 && err != -EBUSY)
2024
- return VM_FAULT_SIGBUS;
2025
- return VM_FAULT_NOPAGE;
2242
+ return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
20262243 }
20272244 EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
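The per-page pgprot override discussed above might be used along these lines; the helpers that decide the caching mode and compute the pfn are hypothetical:

	static vm_fault_t my_fb_fault(struct vm_fault *vmf)
	{
		pgprot_t prot = vmf->vma->vm_page_prot;

		/* The caching mode was not known at mmap() time. */
		if (my_page_wants_wc(vmf->pgoff))	/* hypothetical helper */
			prot = pgprot_writecombine(prot);

		return vmf_insert_mixed_prot(vmf->vma, vmf->address,
					     __pfn_to_pfn_t(my_pfn(vmf->pgoff), PFN_DEV),
					     prot);
	}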
20282245
....@@ -2127,12 +2344,14 @@
21272344 /**
21282345 * remap_pfn_range - remap kernel memory to userspace
21292346 * @vma: user vma to map to
2130
- * @addr: target user address to start at
2131
- * @pfn: physical address of kernel memory
2132
- * @size: size of map area
2347
+ * @addr: target page-aligned user address to start at
2348
+ * @pfn: page frame number of kernel physical memory address
2349
+ * @size: size of mapping area
21332350 * @prot: page protection flags for this mapping
21342351 *
2135
- * Note: this is only safe if the mm semaphore is held when called.
2352
+ * Note: this is only safe if the mm semaphore is held when called.
2353
+ *
2354
+ * Return: %0 on success, negative error code otherwise.
21362355 */
21372356 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
21382357 unsigned long pfn, unsigned long size, pgprot_t prot)
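A typical caller is a driver mmap handler mapping a physical window; there vma->vm_start is page aligned by construction, so the new alignment check mainly catches ad-hoc callers. A sketch, with MY_DEV_PHYS_BASE an assumed constant:

	static int my_mmap(struct file *file, struct vm_area_struct *vma)
	{
		unsigned long pfn = (MY_DEV_PHYS_BASE >> PAGE_SHIFT) + vma->vm_pgoff;

		return remap_pfn_range(vma, vma->vm_start, pfn,
				       vma->vm_end - vma->vm_start,
				       vma->vm_page_prot);
	}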
....@@ -2143,6 +2362,9 @@
21432362 struct mm_struct *mm = vma->vm_mm;
21442363 unsigned long remap_pfn = pfn;
21452364 int err;
2365
+
2366
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
2367
+ return -EINVAL;
21462368
21472369 /*
21482370 * Physically remapped pages are special. Tell the
....@@ -2196,7 +2418,7 @@
21962418 /**
21972419 * vm_iomap_memory - remap memory to userspace
21982420 * @vma: user vma to map to
2199
- * @start: start of area
2421
+ * @start: start of the physical memory to be mapped
22002422 * @len: size of area
22012423 *
22022424 * This is a simplified io_remap_pfn_range() for common driver use. The
....@@ -2205,6 +2427,8 @@
22052427 *
22062428 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
22072429 * whatever write-combining details or similar.
2430
+ *
2431
+ * Return: %0 on success, negative error code otherwise.
22082432 */
22092433 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
22102434 {
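The same mapping through the simplified helper, which derives the pfn and bounds checks from the vma itself (a sketch reusing the assumed MY_DEV_PHYS_BASE plus an assumed region length):

	static int my_mmap(struct file *file, struct vm_area_struct *vma)
	{
		/* Per the note above: tweak vm_page_prot first if needed. */
		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

		return vm_iomap_memory(vma, MY_DEV_PHYS_BASE, MY_DEV_REGION_LEN);
	}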
....@@ -2242,30 +2466,39 @@
22422466
22432467 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
22442468 unsigned long addr, unsigned long end,
2245
- pte_fn_t fn, void *data)
2469
+ pte_fn_t fn, void *data, bool create,
2470
+ pgtbl_mod_mask *mask)
22462471 {
22472472 pte_t *pte;
2248
- int err;
2249
- pgtable_t token;
2250
- spinlock_t *uninitialized_var(ptl);
2473
+ int err = 0;
2474
+ spinlock_t *ptl;
22512475
2252
- pte = (mm == &init_mm) ?
2253
- pte_alloc_kernel(pmd, addr) :
2254
- pte_alloc_map_lock(mm, pmd, addr, &ptl);
2255
- if (!pte)
2256
- return -ENOMEM;
2476
+ if (create) {
2477
+ pte = (mm == &init_mm) ?
2478
+ pte_alloc_kernel_track(pmd, addr, mask) :
2479
+ pte_alloc_map_lock(mm, pmd, addr, &ptl);
2480
+ if (!pte)
2481
+ return -ENOMEM;
2482
+ } else {
2483
+ pte = (mm == &init_mm) ?
2484
+ pte_offset_kernel(pmd, addr) :
2485
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
2486
+ }
22572487
22582488 BUG_ON(pmd_huge(*pmd));
22592489
22602490 arch_enter_lazy_mmu_mode();
22612491
2262
- token = pmd_pgtable(*pmd);
2263
-
2264
- do {
2265
- err = fn(pte++, token, addr, data);
2266
- if (err)
2267
- break;
2268
- } while (addr += PAGE_SIZE, addr != end);
2492
+ if (fn) {
2493
+ do {
2494
+ if (create || !pte_none(*pte)) {
2495
+ err = fn(pte++, addr, data);
2496
+ if (err)
2497
+ break;
2498
+ }
2499
+ } while (addr += PAGE_SIZE, addr != end);
2500
+ }
2501
+ *mask |= PGTBL_PTE_MODIFIED;
22692502
22702503 arch_leave_lazy_mmu_mode();
22712504
....@@ -2276,63 +2509,116 @@
22762509
22772510 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
22782511 unsigned long addr, unsigned long end,
2279
- pte_fn_t fn, void *data)
2512
+ pte_fn_t fn, void *data, bool create,
2513
+ pgtbl_mod_mask *mask)
22802514 {
22812515 pmd_t *pmd;
22822516 unsigned long next;
2283
- int err;
2517
+ int err = 0;
22842518
22852519 BUG_ON(pud_huge(*pud));
22862520
2287
- pmd = pmd_alloc(mm, pud, addr);
2288
- if (!pmd)
2289
- return -ENOMEM;
2521
+ if (create) {
2522
+ pmd = pmd_alloc_track(mm, pud, addr, mask);
2523
+ if (!pmd)
2524
+ return -ENOMEM;
2525
+ } else {
2526
+ pmd = pmd_offset(pud, addr);
2527
+ }
22902528 do {
22912529 next = pmd_addr_end(addr, end);
2292
- err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2293
- if (err)
2294
- break;
2530
+ if (create || !pmd_none_or_clear_bad(pmd)) {
2531
+ err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
2532
+ create, mask);
2533
+ if (err)
2534
+ break;
2535
+ }
22952536 } while (pmd++, addr = next, addr != end);
22962537 return err;
22972538 }
22982539
22992540 static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
23002541 unsigned long addr, unsigned long end,
2301
- pte_fn_t fn, void *data)
2542
+ pte_fn_t fn, void *data, bool create,
2543
+ pgtbl_mod_mask *mask)
23022544 {
23032545 pud_t *pud;
23042546 unsigned long next;
2305
- int err;
2547
+ int err = 0;
23062548
2307
- pud = pud_alloc(mm, p4d, addr);
2308
- if (!pud)
2309
- return -ENOMEM;
2549
+ if (create) {
2550
+ pud = pud_alloc_track(mm, p4d, addr, mask);
2551
+ if (!pud)
2552
+ return -ENOMEM;
2553
+ } else {
2554
+ pud = pud_offset(p4d, addr);
2555
+ }
23102556 do {
23112557 next = pud_addr_end(addr, end);
2312
- err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2313
- if (err)
2314
- break;
2558
+ if (create || !pud_none_or_clear_bad(pud)) {
2559
+ err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
2560
+ create, mask);
2561
+ if (err)
2562
+ break;
2563
+ }
23152564 } while (pud++, addr = next, addr != end);
23162565 return err;
23172566 }
23182567
23192568 static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
23202569 unsigned long addr, unsigned long end,
2321
- pte_fn_t fn, void *data)
2570
+ pte_fn_t fn, void *data, bool create,
2571
+ pgtbl_mod_mask *mask)
23222572 {
23232573 p4d_t *p4d;
23242574 unsigned long next;
2325
- int err;
2575
+ int err = 0;
23262576
2327
- p4d = p4d_alloc(mm, pgd, addr);
2328
- if (!p4d)
2329
- return -ENOMEM;
2577
+ if (create) {
2578
+ p4d = p4d_alloc_track(mm, pgd, addr, mask);
2579
+ if (!p4d)
2580
+ return -ENOMEM;
2581
+ } else {
2582
+ p4d = p4d_offset(pgd, addr);
2583
+ }
23302584 do {
23312585 next = p4d_addr_end(addr, end);
2332
- err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
2586
+ if (create || !p4d_none_or_clear_bad(p4d)) {
2587
+ err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
2588
+ create, mask);
2589
+ if (err)
2590
+ break;
2591
+ }
2592
+ } while (p4d++, addr = next, addr != end);
2593
+ return err;
2594
+}
2595
+
2596
+static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2597
+ unsigned long size, pte_fn_t fn,
2598
+ void *data, bool create)
2599
+{
2600
+ pgd_t *pgd;
2601
+ unsigned long start = addr, next;
2602
+ unsigned long end = addr + size;
2603
+ pgtbl_mod_mask mask = 0;
2604
+ int err = 0;
2605
+
2606
+ if (WARN_ON(addr >= end))
2607
+ return -EINVAL;
2608
+
2609
+ pgd = pgd_offset(mm, addr);
2610
+ do {
2611
+ next = pgd_addr_end(addr, end);
2612
+ if (!create && pgd_none_or_clear_bad(pgd))
2613
+ continue;
2614
+ err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
23332615 if (err)
23342616 break;
2335
- } while (p4d++, addr = next, addr != end);
2617
+ } while (pgd++, addr = next, addr != end);
2618
+
2619
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
2620
+ arch_sync_kernel_mappings(start, start + size);
2621
+
23362622 return err;
23372623 }
23382624
....@@ -2343,25 +2629,242 @@
23432629 int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
23442630 unsigned long size, pte_fn_t fn, void *data)
23452631 {
2346
- pgd_t *pgd;
2347
- unsigned long next;
2348
- unsigned long end = addr + size;
2349
- int err;
2350
-
2351
- if (WARN_ON(addr >= end))
2352
- return -EINVAL;
2353
-
2354
- pgd = pgd_offset(mm, addr);
2355
- do {
2356
- next = pgd_addr_end(addr, end);
2357
- err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
2358
- if (err)
2359
- break;
2360
- } while (pgd++, addr = next, addr != end);
2361
-
2362
- return err;
2632
+ return __apply_to_page_range(mm, addr, size, fn, data, true);
23632633 }
23642634 EXPORT_SYMBOL_GPL(apply_to_page_range);
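Note the reworked pte_fn_t: callbacks now receive just the pte, the address and the opaque cookie (the old pgtable_t token is gone). A hedged sketch of a caller; the counting callback and the start/size variables are illustrative:

	static int count_present_pte(pte_t *pte, unsigned long addr, void *data)
	{
		unsigned long *count = data;

		if (pte_present(*pte))
			(*count)++;
		return 0;
	}

	unsigned long count = 0;

	/* create == true: page tables missing from the range get allocated. */
	apply_to_page_range(&init_mm, start, size, count_present_pte, &count);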
2635
+
2636
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
2637
+static bool pte_spinlock(struct vm_fault *vmf)
2638
+{
2639
+ bool ret = false;
2640
+ pmd_t pmdval;
2641
+
2642
+ /* Check if vma is still valid */
2643
+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
2644
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
2645
+ spin_lock(vmf->ptl);
2646
+ return true;
2647
+ }
2648
+
2649
+ local_irq_disable();
2650
+ if (vma_has_changed(vmf)) {
2651
+ trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
2652
+ goto out;
2653
+ }
2654
+
2655
+ /*
2656
+ * We check if the pmd value is still the same to ensure that there
2657
+ * is no huge collapse operation in progress behind our back.
2658
+ * It also ensures that the pmd was not cleared by pmd_clear in
2659
+ * free_pte_range and that the ptl is still valid.
2660
+ */
2661
+ pmdval = READ_ONCE(*vmf->pmd);
2662
+ if (!pmd_same(pmdval, vmf->orig_pmd)) {
2663
+ trace_spf_pmd_changed(_RET_IP_, vmf->vma, vmf->address);
2664
+ goto out;
2665
+ }
2666
+
2667
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, &pmdval);
2668
+ if (unlikely(!spin_trylock(vmf->ptl))) {
2669
+ trace_spf_pte_lock(_RET_IP_, vmf->vma, vmf->address);
2670
+ goto out;
2671
+ }
2672
+
2673
+ /*
2674
+ * The check below will fail if pte_spinlock passed its ptl barrier
2675
+ * before we took the ptl lock.
2676
+ */
2677
+ if (vma_has_changed(vmf)) {
2678
+ spin_unlock(vmf->ptl);
2679
+ trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
2680
+ goto out;
2681
+ }
2682
+
2683
+ ret = true;
2684
+out:
2685
+ local_irq_enable();
2686
+ return ret;
2687
+}
2688
+
2689
+static bool __pte_map_lock_speculative(struct vm_fault *vmf, unsigned long addr)
2690
+{
2691
+ bool ret = false;
2692
+ pte_t *pte;
2693
+ spinlock_t *ptl;
2694
+ pmd_t pmdval;
2695
+
2696
+ /*
2697
+ * The first vma_has_changed() guarantees the page-tables are still
2698
+ * valid, having IRQs disabled ensures they stay around, hence the
2699
+ * second vma_has_changed() to make sure they are still valid once
2700
+ * we've got the lock. After that a concurrent zap_pte_range() will
2701
+ * block on the PTL and thus we're safe.
2702
+ */
2703
+ local_irq_disable();
2704
+ if (vma_has_changed(vmf)) {
2705
+ trace_spf_vma_changed(_RET_IP_, vmf->vma, addr);
2706
+ goto out;
2707
+ }
2708
+
2709
+ /*
2710
+ * We check if the pmd value is still the same to ensure that there
2711
+ * is no huge collapse operation in progress behind our back.
2712
+ */
2713
+ pmdval = READ_ONCE(*vmf->pmd);
2714
+ if (!pmd_same(pmdval, vmf->orig_pmd)) {
2715
+ trace_spf_pmd_changed(_RET_IP_, vmf->vma, addr);
2716
+ goto out;
2717
+ }
2718
+
2719
+ /*
2720
+ * Same as pte_offset_map_lock() except that we call
2721
+ * spin_trylock() in place of spin_lock() to avoid race with
2722
+ * unmap path which may have the lock and wait for this CPU
2723
+ * to invalidate TLB but this CPU has irq disabled.
2724
+ * Since we are on a speculative path, accept that it could fail
2725
+ */
2726
+ ptl = pte_lockptr(vmf->vma->vm_mm, &pmdval);
2727
+ pte = pte_offset_map(&pmdval, addr);
2728
+ if (unlikely(!spin_trylock(ptl))) {
2729
+ pte_unmap(pte);
2730
+ trace_spf_pte_lock(_RET_IP_, vmf->vma, addr);
2731
+ goto out;
2732
+ }
2733
+
2734
+ /*
2735
+ * The check below will fail if __pte_map_lock_speculative passed its ptl
2736
+ * barrier before we took the ptl lock.
2737
+ */
2738
+ if (vma_has_changed(vmf)) {
2739
+ pte_unmap_unlock(pte, ptl);
2740
+ trace_spf_vma_changed(_RET_IP_, vmf->vma, addr);
2741
+ goto out;
2742
+ }
2743
+
2744
+ vmf->pte = pte;
2745
+ vmf->ptl = ptl;
2746
+ ret = true;
2747
+out:
2748
+ local_irq_enable();
2749
+ return ret;
2750
+}
2751
+
2752
+static bool pte_map_lock(struct vm_fault *vmf)
2753
+{
2754
+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
2755
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
2756
+ vmf->address, &vmf->ptl);
2757
+ return true;
2758
+ }
2759
+
2760
+ return __pte_map_lock_speculative(vmf, vmf->address);
2761
+}
2762
+
2763
+bool pte_map_lock_addr(struct vm_fault *vmf, unsigned long addr)
2764
+{
2765
+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
2766
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
2767
+ addr, &vmf->ptl);
2768
+ return true;
2769
+ }
2770
+
2771
+ return __pte_map_lock_speculative(vmf, addr);
2772
+}
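A false return from either helper means the lock could not be taken speculatively; callers bail out and let the fault be retried under mmap_lock, the pattern the handlers below all follow:

	if (!pte_map_lock(vmf))
		return VM_FAULT_RETRY;	/* redo the fault non-speculatively */
	/* ... vmf->pte and vmf->ptl are valid from here on ... */
	pte_unmap_unlock(vmf->pte, vmf->ptl);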
2773
+
2774
+static bool __read_mostly allow_file_spec_access;
2775
+static int __init allow_file_spec_access_setup(char *str)
2776
+{
2777
+ allow_file_spec_access = true;
2778
+ return 1;
2779
+}
2780
+__setup("allow_file_spec_access", allow_file_spec_access_setup);
2781
+
2782
+static bool vmf_allows_speculation(struct vm_fault *vmf)
2783
+{
2784
+ if (vma_is_anonymous(vmf->vma)) {
2785
+ /*
2786
+ * __anon_vma_prepare() requires the mmap_sem to be held
2787
+ * because vm_next and vm_prev must be safe. This can't be
2788
+ * guaranteed in the speculative path.
2789
+ */
2790
+ if (!vmf->vma->anon_vma) {
2791
+ trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
2792
+ return false;
2793
+ }
2794
+ return true;
2795
+ }
2796
+
2797
+ if (!allow_file_spec_access) {
2798
+ /*
2799
+ * Can't call the vm_ops services as we don't know what they would
2800
+ * do with the VMA.
2801
+ * This includes huge pages from hugetlbfs.
2802
+ */
2803
+ trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
2804
+ return false;
2805
+ }
2806
+
2807
+ if (!(vmf->vma->vm_flags & VM_SHARED) &&
2808
+ (vmf->flags & FAULT_FLAG_WRITE) &&
2809
+ !vmf->vma->anon_vma) {
2810
+ /*
2811
+ * non-anonymous private COW without anon_vma.
2812
+ * See above.
2813
+ */
2814
+ trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
2815
+ return false;
2816
+ }
2817
+
2818
+ if (vmf->vma->vm_ops->allow_speculation &&
2819
+ vmf->vma->vm_ops->allow_speculation()) {
2820
+ return true;
2821
+ }
2822
+
2823
+ trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
2824
+ return false;
2825
+}
2826
+
2827
+#else
2828
+static inline bool pte_spinlock(struct vm_fault *vmf)
2829
+{
2830
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
2831
+ spin_lock(vmf->ptl);
2832
+ return true;
2833
+}
2834
+
2835
+static inline bool pte_map_lock(struct vm_fault *vmf)
2836
+{
2837
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
2838
+ vmf->address, &vmf->ptl);
2839
+ return true;
2840
+}
2841
+
2842
+inline bool pte_map_lock_addr(struct vm_fault *vmf, unsigned long addr)
2843
+{
2844
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
2845
+ addr, &vmf->ptl);
2846
+ return true;
2847
+}
2848
+
2849
+static inline bool vmf_allows_speculation(struct vm_fault *vmf)
2850
+{
2851
+ return false;
2852
+}
2853
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
2854
+
2855
+/*
2856
+ * Scan a region of virtual memory, calling a provided function on
2857
+ * each leaf page table where it exists.
2858
+ *
2859
+ * Unlike apply_to_page_range, this does _not_ fill in page tables
2860
+ * where they are absent.
2861
+ */
2862
+int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
2863
+ unsigned long size, pte_fn_t fn, void *data)
2864
+{
2865
+ return __apply_to_page_range(mm, addr, size, fn, data, false);
2866
+}
2867
+EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
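Reusing the counting callback sketched after apply_to_page_range() above, the existing-only variant performs the same walk without ever allocating page tables:

	/* Absent p4d/pud/pmd/pte levels are skipped, never filled in. */
	apply_to_existing_page_range(&init_mm, start, size,
				     count_present_pte, &count);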
23652868
23662869 /*
23672870 * handle_pte_fault chooses page fault handler according to an entry which was
....@@ -2370,21 +2873,29 @@
23702873 * parts, do_swap_page must check under lock before unmapping the pte and
23712874 * proceeding (but do_wp_page is only called after already making such a check;
23722875 * and do_anonymous_page can safely check later on).
2876
+ *
2877
+ * pte_unmap_same() returns:
2878
+ * 0 if the PTEs are the same
2879
+ * VM_FAULT_PTNOTSAME if the PTEs are different
2880
+ * VM_FAULT_RETRY if the VMA has changed behind our back during
2881
+ * speculative page fault handling.
23732882 */
2374
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2375
- pte_t *page_table, pte_t orig_pte)
2883
+static inline int pte_unmap_same(struct vm_fault *vmf)
23762884 {
2377
- int same = 1;
2885
+ int ret = 0;
2886
+
23782887 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
23792888 if (sizeof(pte_t) > sizeof(unsigned long)) {
2380
- spinlock_t *ptl = pte_lockptr(mm, pmd);
2381
- spin_lock(ptl);
2382
- same = pte_same(*page_table, orig_pte);
2383
- spin_unlock(ptl);
2889
+ if (pte_spinlock(vmf)) {
2890
+ if (!pte_same(*vmf->pte, vmf->orig_pte))
2891
+ ret = VM_FAULT_PTNOTSAME;
2892
+ spin_unlock(vmf->ptl);
2893
+ } else
2894
+ ret = VM_FAULT_RETRY;
23842895 }
23852896 #endif
2386
- pte_unmap(page_table);
2387
- return same;
2897
+ pte_unmap(vmf->pte);
2898
+ return ret;
23882899 }
23892900
23902901 static inline bool cow_user_page(struct page *dst, struct page *src,
....@@ -2397,8 +2908,6 @@
23972908 struct vm_area_struct *vma = vmf->vma;
23982909 struct mm_struct *mm = vma->vm_mm;
23992910 unsigned long addr = vmf->address;
2400
-
2401
- debug_dma_assert_idle(src);
24022911
24032912 if (likely(src)) {
24042913 copy_user_highpage(dst, src, addr, vma);
....@@ -2426,10 +2935,9 @@
24262935 if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
24272936 /*
24282937 * Other thread has already handled the fault
2429
- * and we don't need to do anything. If it's
2430
- * not the case, the fault will be triggered
2431
- * again on the same address.
2938
+ * and we only need to update the local TLB
24322939 */
2940
+ update_mmu_tlb(vma, addr, vmf->pte);
24332941 ret = false;
24342942 goto pte_unlock;
24352943 }
....@@ -2453,13 +2961,14 @@
24532961 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
24542962 locked = true;
24552963 if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2456
- /* The PTE changed under us. Retry page fault. */
2964
+ /* The PTE changed under us; update the local TLB */
2965
+ update_mmu_tlb(vma, addr, vmf->pte);
24572966 ret = false;
24582967 goto pte_unlock;
24592968 }
24602969
24612970 /*
2462
- * The same page can be mapped back since last copy attampt.
2971
+ * The same page can be mapped back since last copy attempt.
24632972 * Try to copy again under PTL.
24642973 */
24652974 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
....@@ -2538,10 +3047,11 @@
25383047 *
25393048 * The function expects the page to be locked and unlocks it.
25403049 */
2541
-static void fault_dirty_shared_page(struct vm_area_struct *vma,
2542
- struct page *page)
3050
+static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
25433051 {
3052
+ struct vm_area_struct *vma = vmf->vma;
25443053 struct address_space *mapping;
3054
+ struct page *page = vmf->page;
25453055 bool dirtied;
25463056 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
25473057
....@@ -2556,16 +3066,30 @@
25563066 mapping = page_rmapping(page);
25573067 unlock_page(page);
25583068
2559
- if ((dirtied || page_mkwrite) && mapping) {
2560
- /*
2561
- * Some device drivers do not set page.mapping
2562
- * but still dirty their pages
2563
- */
2564
- balance_dirty_pages_ratelimited(mapping);
2565
- }
2566
-
25673069 if (!page_mkwrite)
25683070 file_update_time(vma->vm_file);
3071
+
3072
+ /*
3073
+ * Throttle page dirtying rate down to writeback speed.
3074
+ *
3075
+ * mapping may be NULL here because some device drivers do not
3076
+ * set page.mapping but still dirty their pages
3077
+ *
3078
+ * Drop the mmap_lock before waiting on IO, if we can. The file
3079
+ * is pinning the mapping, as per above.
3080
+ */
3081
+ if ((dirtied || page_mkwrite) && mapping) {
3082
+ struct file *fpin;
3083
+
3084
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
3085
+ balance_dirty_pages_ratelimited(mapping);
3086
+ if (fpin) {
3087
+ fput(fpin);
3088
+ return VM_FAULT_RETRY;
3089
+ }
3090
+ }
3091
+
3092
+ return 0;
25693093 }
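Callers must now fold the result into their own fault status instead of ignoring it, since the helper may drop the mmap_lock and require a retry; wp_page_shared() below does exactly that:

	ret |= fault_dirty_shared_page(vmf);	/* may add VM_FAULT_RETRY */
	put_page(vmf->page);
	return ret;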
25703094
25713095 /*
....@@ -2592,16 +3116,17 @@
25923116
25933117 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
25943118 entry = pte_mkyoung(vmf->orig_pte);
2595
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3119
+ entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
25963120 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
25973121 update_mmu_cache(vma, vmf->address, vmf->pte);
25983122 pte_unmap_unlock(vmf->pte, vmf->ptl);
3123
+ count_vm_event(PGREUSE);
25993124 }
26003125
26013126 /*
26023127 * Handle the case of a page which we actually need to copy to a new page.
26033128 *
2604
- * Called with mmap_sem locked and the old page referenced, but
3129
+ * Called with mmap_lock locked and the old page referenced, but
26053130 * without the ptl held.
26063131 *
26073132 * High level logic flow:
....@@ -2622,23 +3147,22 @@
26223147 struct page *new_page = NULL;
26233148 pte_t entry;
26243149 int page_copied = 0;
2625
- const unsigned long mmun_start = vmf->address & PAGE_MASK;
2626
- const unsigned long mmun_end = mmun_start + PAGE_SIZE;
2627
- struct mem_cgroup *memcg;
3150
+ struct mmu_notifier_range range;
3151
+ vm_fault_t ret = VM_FAULT_OOM;
26283152
26293153 if (unlikely(anon_vma_prepare(vma)))
2630
- goto oom;
3154
+ goto out;
26313155
26323156 if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
26333157 new_page = alloc_zeroed_user_highpage_movable(vma,
26343158 vmf->address);
26353159 if (!new_page)
2636
- goto oom;
3160
+ goto out;
26373161 } else {
26383162 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
26393163 vmf->address);
26403164 if (!new_page)
2641
- goto oom;
3165
+ goto out;
26423166
26433167 if (!cow_user_page(new_page, old_page, vmf)) {
26443168 /*
....@@ -2652,19 +3176,27 @@
26523176 put_page(old_page);
26533177 return 0;
26543178 }
3179
+ trace_android_vh_cow_user_page(vmf, new_page);
26553180 }
26563181
2657
- if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
2658
- goto oom_free_new;
3182
+ if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
3183
+ goto out_free_new;
3184
+ cgroup_throttle_swaprate(new_page, GFP_KERNEL);
26593185
26603186 __SetPageUptodate(new_page);
26613187
2662
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3188
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
3189
+ vmf->address & PAGE_MASK,
3190
+ (vmf->address & PAGE_MASK) + PAGE_SIZE);
3191
+ mmu_notifier_invalidate_range_start(&range);
26633192
26643193 /*
26653194 * Re-check the pte - we dropped the lock
26663195 */
2667
- vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
3196
+ if (!pte_map_lock(vmf)) {
3197
+ ret = VM_FAULT_RETRY;
3198
+ goto out_invalidate_end;
3199
+ }
26683200 if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
26693201 if (old_page) {
26703202 if (!PageAnon(old_page)) {
....@@ -2676,8 +3208,9 @@
26763208 inc_mm_counter_fast(mm, MM_ANONPAGES);
26773209 }
26783210 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2679
- entry = mk_pte(new_page, vma->vm_page_prot);
2680
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3211
+ entry = mk_pte(new_page, vmf->vma_page_prot);
3212
+ entry = pte_sw_mkyoung(entry);
3213
+ entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
26813214 /*
26823215 * Clear the pte entry and flush it first, before updating the
26833216 * pte with the new entry. This will avoid a race condition
....@@ -2685,9 +3218,8 @@
26853218 * thread doing COW.
26863219 */
26873220 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
2688
- page_add_new_anon_rmap(new_page, vma, vmf->address, false);
2689
- mem_cgroup_commit_charge(new_page, memcg, false, false);
2690
- lru_cache_add_active_or_unevictable(new_page, vma);
3221
+ __page_add_new_anon_rmap(new_page, vma, vmf->address, false);
3222
+ __lru_cache_add_inactive_or_unevictable(new_page, vmf->vma_flags);
26913223 /*
26923224 * We call the notify macro here because, when using secondary
26933225 * mmu page tables (such as kvm shadow page tables), we want the
....@@ -2725,7 +3257,7 @@
27253257 new_page = old_page;
27263258 page_copied = 1;
27273259 } else {
2728
- mem_cgroup_cancel_charge(new_page, memcg, false);
3260
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
27293261 }
27303262
27313263 if (new_page)
....@@ -2736,13 +3268,13 @@
27363268 * No need to double call mmu_notifier->invalidate_range() callback as
27373269 * the above ptep_clear_flush_notify() did already call it.
27383270 */
2739
- mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
3271
+ mmu_notifier_invalidate_range_only_end(&range);
27403272 if (old_page) {
27413273 /*
27423274 * Don't let another task, with possibly unlocked vma,
27433275 * keep the mlocked page.
27443276 */
2745
- if (page_copied && (vma->vm_flags & VM_LOCKED)) {
3277
+ if (page_copied && (vmf->vma_flags & VM_LOCKED)) {
27463278 lock_page(old_page); /* LRU manipulation */
27473279 if (PageMlocked(old_page))
27483280 munlock_vma_page(old_page);
....@@ -2751,12 +3283,14 @@
27513283 put_page(old_page);
27523284 }
27533285 return page_copied ? VM_FAULT_WRITE : 0;
2754
-oom_free_new:
3286
+out_invalidate_end:
3287
+ mmu_notifier_invalidate_range_only_end(&range);
3288
+out_free_new:
27553289 put_page(new_page);
2756
-oom:
3290
+out:
27573291 if (old_page)
27583292 put_page(old_page);
2759
- return VM_FAULT_OOM;
3293
+ return ret;
27603294 }
27613295
27623296 /**
....@@ -2767,23 +3301,25 @@
27673301 *
27683302 * This function handles all that is needed to finish a write page fault in a
27693303 * shared mapping due to PTE being read-only once the mapped page is prepared.
2770
- * It handles locking of PTE and modifying it. The function returns
2771
- * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
2772
- * lock.
3304
+ * It handles locking of PTE and modifying it.
27733305 *
27743306 * The function expects the page to be locked or other protection against
27753307 * concurrent faults / writeback (such as DAX radix tree locks).
3308
+ *
3309
+ * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
3310
+ * we acquired PTE lock.
27763311 */
27773312 vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
27783313 {
2779
- WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
2780
- vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
2781
- &vmf->ptl);
3314
+ WARN_ON_ONCE(!(vmf->vma_flags & VM_SHARED));
3315
+ if (!pte_map_lock(vmf))
3316
+ return VM_FAULT_RETRY;
27823317 /*
27833318 * We might have raced with another page fault while we released the
27843319 * pte_offset_map_lock.
27853320 */
27863321 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
3322
+ update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
27873323 pte_unmap_unlock(vmf->pte, vmf->ptl);
27883324 return VM_FAULT_NOPAGE;
27893325 }
....@@ -2817,6 +3353,7 @@
28173353 __releases(vmf->ptl)
28183354 {
28193355 struct vm_area_struct *vma = vmf->vma;
3356
+ vm_fault_t ret = VM_FAULT_WRITE;
28203357
28213358 get_page(vmf->page);
28223359
....@@ -2840,10 +3377,10 @@
28403377 wp_page_reuse(vmf);
28413378 lock_page(vmf->page);
28423379 }
2843
- fault_dirty_shared_page(vma, vmf->page);
3380
+ ret |= fault_dirty_shared_page(vmf);
28443381 put_page(vmf->page);
28453382
2846
- return VM_FAULT_WRITE;
3383
+ return ret;
28473384 }
28483385
28493386 /*
....@@ -2860,16 +3397,32 @@
28603397 * change only once the write actually happens. This avoids a few races,
28613398 * and potentially makes it more efficient.
28623399 *
2863
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
3400
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
28643401 * but allow concurrent faults), with pte both mapped and locked.
2865
- * We return with mmap_sem still held, but pte unmapped and unlocked.
3402
+ * We return with mmap_lock still held, but pte unmapped and unlocked.
28663403 */
28673404 static vm_fault_t do_wp_page(struct vm_fault *vmf)
28683405 __releases(vmf->ptl)
28693406 {
28703407 struct vm_area_struct *vma = vmf->vma;
28713408
2872
- vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
3409
+ if (userfaultfd_pte_wp(vma, *vmf->pte)) {
3410
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
3411
+ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
3412
+ return VM_FAULT_RETRY;
3413
+ return handle_userfault(vmf, VM_UFFD_WP);
3414
+ }
3415
+
3416
+ /*
3417
+ * Userfaultfd write-protect can defer flushes. Ensure the TLB
3418
+ * is flushed in this case before copying.
3419
+ */
3420
+ if (unlikely(userfaultfd_wp(vmf->vma) &&
3421
+ mm_tlb_flush_pending(vmf->vma->vm_mm)))
3422
+ flush_tlb_page(vmf->vma, vmf->address);
3423
+
3424
+ vmf->page = _vm_normal_page(vma, vmf->address, vmf->orig_pte,
3425
+ vmf->vma_flags);
28733426 if (!vmf->page) {
28743427 /*
28753428 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
....@@ -2878,7 +3431,7 @@
28783431 * We should not cow pages in a shared writeable mapping.
28793432 * Just mark the pages writable and/or call ops->pfn_mkwrite.
28803433 */
2881
- if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
3434
+ if ((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
28823435 (VM_WRITE|VM_SHARED))
28833436 return wp_pfn_shared(vmf);
28843437
....@@ -2890,43 +3443,31 @@
28903443 * Take out anonymous pages first, anonymous shared vmas are
28913444 * not dirty accountable.
28923445 */
2893
- if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
2894
- int total_map_swapcount;
2895
- if (!trylock_page(vmf->page)) {
2896
- get_page(vmf->page);
2897
- pte_unmap_unlock(vmf->pte, vmf->ptl);
2898
- lock_page(vmf->page);
2899
- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2900
- vmf->address, &vmf->ptl);
2901
- if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2902
- unlock_page(vmf->page);
2903
- pte_unmap_unlock(vmf->pte, vmf->ptl);
2904
- put_page(vmf->page);
2905
- return 0;
2906
- }
2907
- put_page(vmf->page);
3446
+ if (PageAnon(vmf->page)) {
3447
+ struct page *page = vmf->page;
3448
+
3449
+ /* PageKsm() doesn't necessarily raise the page refcount */
3450
+ if (PageKsm(page) || page_count(page) != 1)
3451
+ goto copy;
3452
+ if (!trylock_page(page))
3453
+ goto copy;
3454
+ if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
3455
+ unlock_page(page);
3456
+ goto copy;
29083457 }
2909
- if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
2910
- if (total_map_swapcount == 1) {
2911
- /*
2912
- * The page is all ours. Move it to
2913
- * our anon_vma so the rmap code will
2914
- * not search our parent or siblings.
2915
- * Protected against the rmap code by
2916
- * the page lock.
2917
- */
2918
- page_move_anon_rmap(vmf->page, vma);
2919
- }
2920
- unlock_page(vmf->page);
2921
- wp_page_reuse(vmf);
2922
- return VM_FAULT_WRITE;
2923
- }
2924
- unlock_page(vmf->page);
2925
- } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
3458
+ /*
3459
+ * Ok, we've got the only map reference, and the only
3460
+ * page count reference, and the page is locked,
3461
+ * it's dark out, and we're wearing sunglasses. Hit it.
3462
+ */
3463
+ unlock_page(page);
3464
+ wp_page_reuse(vmf);
3465
+ return VM_FAULT_WRITE;
3466
+ } else if (unlikely((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
29263467 (VM_WRITE|VM_SHARED))) {
29273468 return wp_page_shared(vmf);
29283469 }
2929
-
3470
+copy:
29303471 /*
29313472 * Ok, we need to copy. Oh, well..
29323473 */
....@@ -2989,7 +3530,7 @@
29893530
29903531 details.check_mapping = mapping;
29913532 details.first_index = page->index;
2992
- details.last_index = page->index + hpage_nr_pages(page) - 1;
3533
+ details.last_index = page->index + thp_nr_pages(page) - 1;
29933534 details.single_page = page;
29943535
29953536 i_mmap_lock_write(mapping);
....@@ -3063,26 +3604,40 @@
30633604 EXPORT_SYMBOL(unmap_mapping_range);
30643605
30653606 /*
3066
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
3607
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
30673608 * but allow concurrent faults), and pte mapped but not yet locked.
30683609 * We return with pte unmapped and unlocked.
30693610 *
3070
- * We return with the mmap_sem locked or unlocked in the same cases
3611
+ * We return with the mmap_lock locked or unlocked in the same cases
30713612 * as does filemap_fault().
30723613 */
30733614 vm_fault_t do_swap_page(struct vm_fault *vmf)
30743615 {
30753616 struct vm_area_struct *vma = vmf->vma;
30763617 struct page *page = NULL, *swapcache;
3077
- struct mem_cgroup *memcg;
30783618 swp_entry_t entry;
30793619 pte_t pte;
30803620 int locked;
30813621 int exclusive = 0;
3082
- vm_fault_t ret = 0;
3622
+ vm_fault_t ret;
3623
+ void *shadow = NULL;
30833624
3084
- if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
3625
+ if (vmf->flags & FAULT_FLAG_SPECULATIVE) {
3626
+ pte_unmap(vmf->pte);
3627
+ return VM_FAULT_RETRY;
3628
+ }
3629
+
3630
+ ret = pte_unmap_same(vmf);
3631
+ if (ret) {
3632
+ /*
3633
+ * If pte != orig_pte, this means another thread did the
3634
+ * swap operation behind our back.
3635
+ * So nothing else to do.
3636
+ */
3637
+ if (ret == VM_FAULT_PTNOTSAME)
3638
+ ret = 0;
30853639 goto out;
3640
+ }
30863641
30873642 entry = pte_to_swp_entry(vmf->orig_pte);
30883643 if (unlikely(non_swap_entry(entry))) {
....@@ -3090,13 +3645,8 @@
30903645 migration_entry_wait(vma->vm_mm, vmf->pmd,
30913646 vmf->address);
30923647 } else if (is_device_private_entry(entry)) {
3093
- /*
3094
- * For un-addressable device memory we call the pgmap
3095
- * fault handler callback. The callback must migrate
3096
- * the page back to some CPU accessible page.
3097
- */
3098
- ret = device_private_entry_fault(vma, vmf->address, entry,
3099
- vmf->flags, vmf->pmd);
3648
+ vmf->page = device_private_entry_to_page(entry);
3649
+ ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
31003650 } else if (is_hwpoison_entry(entry)) {
31013651 ret = VM_FAULT_HWPOISON;
31023652 } else {
....@@ -3113,19 +3663,51 @@
31133663
31143664 if (!page) {
31153665 struct swap_info_struct *si = swp_swap_info(entry);
3666
+ bool skip_swapcache = false;
31163667
3117
- if (si->flags & SWP_SYNCHRONOUS_IO &&
3118
- __swap_count(si, entry) == 1) {
3668
+ trace_android_vh_skip_swapcache(entry, &skip_swapcache);
3669
+ if ((data_race(si->flags & SWP_SYNCHRONOUS_IO) || skip_swapcache) &&
3670
+ __swap_count(entry) == 1) {
31193671 /* skip swapcache */
3120
- page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
3121
- vmf->address);
3672
+ gfp_t flags = GFP_HIGHUSER_MOVABLE;
3673
+
3674
+ trace_android_rvh_set_skip_swapcache_flags(&flags);
3675
+ page = alloc_page_vma(flags, vma, vmf->address);
31223676 if (page) {
3677
+ int err;
3678
+
31233679 __SetPageLocked(page);
31243680 __SetPageSwapBacked(page);
31253681 set_page_private(page, entry.val);
3126
- lru_cache_add_anon(page);
3682
+
3683
+ /* Tell memcg to use swap ownership records */
3684
+ SetPageSwapCache(page);
3685
+ err = mem_cgroup_charge(page, vma->vm_mm,
3686
+ GFP_KERNEL);
3687
+ ClearPageSwapCache(page);
3688
+ if (err) {
3689
+ ret = VM_FAULT_OOM;
3690
+ goto out_page;
3691
+ }
3692
+
3693
+ shadow = get_shadow_from_swap_cache(entry);
3694
+ if (shadow)
3695
+ workingset_refault(page, shadow);
3696
+
3697
+ lru_cache_add(page);
31273698 swap_readpage(page, true);
31283699 }
3700
+ } else if (vmf->flags & FAULT_FLAG_SPECULATIVE) {
3701
+ /*
3702
+ * Don't try readahead during a speculative page fault
3703
+ * as the VMA's boundaries may change in our back.
3704
+ * If the page is not in the swap cache and synchronous
3705
+ * read is disabled, fall back to the regular page fault
3706
+ * mechanism.
3707
+ */
3708
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3709
+ ret = VM_FAULT_RETRY;
3710
+ goto out;
31293711 } else {
31303712 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
31313713 vmf);
....@@ -3134,11 +3716,16 @@
31343716
31353717 if (!page) {
31363718 /*
3137
- * Back out if somebody else faulted in this pte
3138
- * while we released the pte lock.
3719
+ * Back out if the VMA has changed behind our back during
3720
+ * a speculative page fault or if somebody else
3721
+ * faulted in this pte while we released the pte lock.
31393722 */
3140
- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3141
- vmf->address, &vmf->ptl);
3723
+ if (!pte_map_lock(vmf)) {
3724
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3725
+ ret = VM_FAULT_RETRY;
3726
+ goto out;
3727
+ }
3728
+
31423729 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
31433730 ret = VM_FAULT_OOM;
31443731 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
....@@ -3184,17 +3771,16 @@
31843771 goto out_page;
31853772 }
31863773
3187
- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
3188
- &memcg, false)) {
3189
- ret = VM_FAULT_OOM;
3190
- goto out_page;
3191
- }
3774
+ cgroup_throttle_swaprate(page, GFP_KERNEL);
31923775
31933776 /*
3194
- * Back out if somebody else already faulted in this pte.
3777
+ * Back out if the VMA has changed behind our back during a speculative
3778
+ * page fault or if somebody else already faulted in this pte.
31953779 */
3196
- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3197
- &vmf->ptl);
3780
+ if (!pte_map_lock(vmf)) {
3781
+ ret = VM_FAULT_RETRY;
3782
+ goto out_page;
3783
+ }
31983784 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
31993785 goto out_nomap;
32003786
....@@ -3215,9 +3801,9 @@
32153801
32163802 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
32173803 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
3218
- pte = mk_pte(page, vma->vm_page_prot);
3804
+ pte = mk_pte(page, vmf->vma_page_prot);
32193805 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
3220
- pte = maybe_mkwrite(pte_mkdirty(pte), vma);
3806
+ pte = maybe_mkwrite(pte_mkdirty(pte), vmf->vma_flags);
32213807 vmf->flags &= ~FAULT_FLAG_WRITE;
32223808 ret |= VM_FAULT_WRITE;
32233809 exclusive = RMAP_EXCLUSIVE;
....@@ -3225,24 +3811,26 @@
32253811 flush_icache_page(vma, page);
32263812 if (pte_swp_soft_dirty(vmf->orig_pte))
32273813 pte = pte_mksoft_dirty(pte);
3814
+ if (pte_swp_uffd_wp(vmf->orig_pte)) {
3815
+ pte = pte_mkuffd_wp(pte);
3816
+ pte = pte_wrprotect(pte);
3817
+ }
32283818 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
32293819 arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
32303820 vmf->orig_pte = pte;
32313821
32323822 /* ksm created a completely new copy */
32333823 if (unlikely(page != swapcache && swapcache)) {
3234
- page_add_new_anon_rmap(page, vma, vmf->address, false);
3235
- mem_cgroup_commit_charge(page, memcg, false, false);
3236
- lru_cache_add_active_or_unevictable(page, vma);
3824
+ __page_add_new_anon_rmap(page, vma, vmf->address, false);
3825
+ __lru_cache_add_inactive_or_unevictable(page, vmf->vma_flags);
32373826 } else {
32383827 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
3239
- mem_cgroup_commit_charge(page, memcg, true, false);
3240
- activate_page(page);
32413828 }
32423829
3830
+ trace_android_vh_swapin_add_anon_rmap(vmf, page);
32433831 swap_free(entry);
32443832 if (mem_cgroup_swap_full(page) ||
3245
- (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3833
+ (vmf->vma_flags & VM_LOCKED) || PageMlocked(page))
32463834 try_to_free_swap(page);
32473835 unlock_page(page);
32483836 if (page != swapcache && swapcache) {
....@@ -3272,7 +3860,6 @@
32723860 out:
32733861 return ret;
32743862 out_nomap:
3275
- mem_cgroup_cancel_charge(page, memcg, false);
32763863 pte_unmap_unlock(vmf->pte, vmf->ptl);
32773864 out_page:
32783865 unlock_page(page);
....@@ -3286,51 +3873,65 @@
32863873 }
32873874
32883875 /*
3289
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
3876
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
32903877 * but allow concurrent faults), and pte mapped but not yet locked.
3291
- * We return with mmap_sem still held, but pte unmapped and unlocked.
3878
+ * We return with mmap_lock still held, but pte unmapped and unlocked.
32923879 */
32933880 static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
32943881 {
32953882 struct vm_area_struct *vma = vmf->vma;
3296
- struct mem_cgroup *memcg;
32973883 struct page *page;
32983884 vm_fault_t ret = 0;
32993885 pte_t entry;
33003886
33013887 /* File mapping without ->vm_ops ? */
3302
- if (vma->vm_flags & VM_SHARED)
3888
+ if (vmf->vma_flags & VM_SHARED)
33033889 return VM_FAULT_SIGBUS;
3890
+
3891
+ /* Do not check an unstable pmd; if it changed we will retry later */
3892
+ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
3893
+ goto skip_pmd_checks;
33043894
33053895 /*
33063896 * Use pte_alloc() instead of pte_alloc_map(). We can't run
33073897 * pte_offset_map() on pmds where a huge pmd might be created
33083898 * from a different thread.
33093899 *
3310
- * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
3900
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
33113901 * parallel threads are excluded by other means.
33123902 *
3313
- * Here we only have down_read(mmap_sem).
3903
+ * Here we only have mmap_read_lock(mm).
33143904 */
3315
- if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
3905
+ if (pte_alloc(vma->vm_mm, vmf->pmd))
33163906 return VM_FAULT_OOM;
33173907
3318
- /* See the comment in pte_alloc_one_map() */
3908
+ /* See comment in handle_pte_fault() */
33193909 if (unlikely(pmd_trans_unstable(vmf->pmd)))
33203910 return 0;
33213911
3912
+skip_pmd_checks:
33223913 /* Use the zero-page for reads */
33233914 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
33243915 !mm_forbids_zeropage(vma->vm_mm)) {
33253916 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
3326
- vma->vm_page_prot));
3327
- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3328
- vmf->address, &vmf->ptl);
3329
- if (!pte_none(*vmf->pte))
3917
+ vmf->vma_page_prot));
3918
+ if (!pte_map_lock(vmf))
3919
+ return VM_FAULT_RETRY;
3920
+ if (!pte_none(*vmf->pte)) {
3921
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
33303922 goto unlock;
3923
+ }
33313924 ret = check_stable_address_space(vma->vm_mm);
33323925 if (ret)
33333926 goto unlock;
3927
+ /*
3928
+ * Don't call userfaultfd on the speculative path.
3929
+ * We already checked that the VMA is not managed through
3930
+ * userfaultfd, but that may have changed behind our back once
3931
+ * we hold the pte lock. In such a case we can ignore it this time.
3932
+ */
3933
+ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
3934
+ goto setpte;
33343935 /* Deliver the page fault to userland, check inside PT lock */
33353936 if (userfaultfd_missing(vma)) {
33363937 pte_unmap_unlock(vmf->pte, vmf->ptl);
....@@ -3346,42 +3947,47 @@
33463947 if (!page)
33473948 goto oom;
33483949
3349
- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
3350
- false))
3950
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
33513951 goto oom_free_page;
3952
+ cgroup_throttle_swaprate(page, GFP_KERNEL);
33523953
33533954 /*
33543955 * The memory barrier inside __SetPageUptodate makes sure that
3355
- * preceeding stores to the page contents become visible before
3956
+ * preceding stores to the page contents become visible before
33563957 * the set_pte_at() write.
33573958 */
33583959 __SetPageUptodate(page);
33593960
3360
- entry = mk_pte(page, vma->vm_page_prot);
3361
- if (vma->vm_flags & VM_WRITE)
3961
+ entry = mk_pte(page, vmf->vma_page_prot);
3962
+ entry = pte_sw_mkyoung(entry);
3963
+ if (vmf->vma_flags & VM_WRITE)
33623964 entry = pte_mkwrite(pte_mkdirty(entry));
33633965
3364
- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3365
- &vmf->ptl);
3366
- if (!pte_none(*vmf->pte))
3966
+ if (!pte_map_lock(vmf)) {
3967
+ ret = VM_FAULT_RETRY;
33673968 goto release;
3969
+ }
3970
+
3971
+ if (!pte_none(*vmf->pte)) {
3972
+ update_mmu_cache(vma, vmf->address, vmf->pte);
3973
+ goto unlock_and_release;
3974
+ }
33683975
33693976 ret = check_stable_address_space(vma->vm_mm);
33703977 if (ret)
3371
- goto release;
3978
+ goto unlock_and_release;
33723979
33733980 /* Deliver the page fault to userland, check inside PT lock */
3374
- if (userfaultfd_missing(vma)) {
3981
+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) &&
3982
+ userfaultfd_missing(vma)) {
33753983 pte_unmap_unlock(vmf->pte, vmf->ptl);
3376
- mem_cgroup_cancel_charge(page, memcg, false);
33773984 put_page(page);
33783985 return handle_userfault(vmf, VM_UFFD_MISSING);
33793986 }
33803987
33813988 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3382
- page_add_new_anon_rmap(page, vma, vmf->address, false);
3383
- mem_cgroup_commit_charge(page, memcg, false, false);
3384
- lru_cache_add_active_or_unevictable(page, vma);
3989
+ __page_add_new_anon_rmap(page, vma, vmf->address, false);
3990
+ __lru_cache_add_inactive_or_unevictable(page, vmf->vma_flags);
33853991 setpte:
33863992 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
33873993
....@@ -3390,10 +3996,11 @@
33903996 unlock:
33913997 pte_unmap_unlock(vmf->pte, vmf->ptl);
33923998 return ret;
3999
+unlock_and_release:
4000
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
33934001 release:
3394
- mem_cgroup_cancel_charge(page, memcg, false);
33954002 put_page(page);
3396
- goto unlock;
4003
+ return ret;
33974004 oom_free_page:
33984005 put_page(page);
33994006 oom:
....@@ -3401,7 +4008,7 @@
34014008 }
34024009
34034010 /*
3404
- * The mmap_sem must have been held on entry, and may have been
4011
+ * The mmap_lock must have been held on entry, and may have been
34054012 * released depending on flags and vma->vm_ops->fault() return value.
34064013 * See filemap_fault() and __lock_page_retry().
34074014 */
....@@ -3409,6 +4016,10 @@
34094016 {
34104017 struct vm_area_struct *vma = vmf->vma;
34114018 vm_fault_t ret;
4019
+
4020
+ /* Do not check an unstable pmd; if it changed we will retry later */
4021
+ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
4022
+ goto skip_pmd_checks;
34124023
34134024 /*
34144025 * Preallocate pte before we take page_lock because this might lead to
....@@ -3418,7 +4029,7 @@
34184029 * unlock_page(A)
34194030 * lock_page(B)
34204031 * lock_page(B)
3421
- * pte_alloc_pne
4032
+ * pte_alloc_one
34224033 * shrink_page_list
34234034 * wait_on_page_writeback(A)
34244035 * SetPageWriteback(B)
....@@ -3426,24 +4037,33 @@
34264037 * # flush A, B to clear the writeback
34274038 */
34284039 if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
3429
- vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
3430
- vmf->address);
4040
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
34314041 if (!vmf->prealloc_pte)
34324042 return VM_FAULT_OOM;
34334043 smp_wmb(); /* See comment in __pte_alloc() */
34344044 }
34354045
4046
+skip_pmd_checks:
34364047 ret = vma->vm_ops->fault(vmf);
34374048 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
34384049 VM_FAULT_DONE_COW)))
34394050 return ret;
34404051
34414052 if (unlikely(PageHWPoison(vmf->page))) {
3442
- if (ret & VM_FAULT_LOCKED)
3443
- unlock_page(vmf->page);
3444
- put_page(vmf->page);
4053
+ struct page *page = vmf->page;
4054
+ vm_fault_t poisonret = VM_FAULT_HWPOISON;
4055
+ if (ret & VM_FAULT_LOCKED) {
4056
+ if (page_mapped(page))
4057
+ unmap_mapping_pages(page_mapping(page),
4058
+ page->index, 1, false);
4059
+ /* Retry if a clean page was removed from the cache. */
4060
+ if (invalidate_inode_page(page))
4061
+ poisonret = VM_FAULT_NOPAGE;
4062
+ unlock_page(page);
4063
+ }
4064
+ put_page(page);
34454065 vmf->page = NULL;
3446
- return VM_FAULT_HWPOISON;
4066
+ return poisonret;
34474067 }
34484068
34494069 if (unlikely(!(ret & VM_FAULT_LOCKED)))
....@@ -3454,80 +4074,7 @@
34544074 return ret;
34554075 }
34564076
3457
-/*
3458
- * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
3459
- * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
3460
- * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
3461
- * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
3462
- */
3463
-static int pmd_devmap_trans_unstable(pmd_t *pmd)
3464
-{
3465
- return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
3466
-}
3467
-
3468
-static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
3469
-{
3470
- struct vm_area_struct *vma = vmf->vma;
3471
-
3472
- if (!pmd_none(*vmf->pmd))
3473
- goto map_pte;
3474
- if (vmf->prealloc_pte) {
3475
- vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3476
- if (unlikely(!pmd_none(*vmf->pmd))) {
3477
- spin_unlock(vmf->ptl);
3478
- goto map_pte;
3479
- }
3480
-
3481
- mm_inc_nr_ptes(vma->vm_mm);
3482
- pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3483
- spin_unlock(vmf->ptl);
3484
- vmf->prealloc_pte = NULL;
3485
- } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
3486
- return VM_FAULT_OOM;
3487
- }
3488
-map_pte:
3489
- /*
3490
- * If a huge pmd materialized under us just retry later. Use
3491
- * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
3492
- * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
3493
- * under us and then back to pmd_none, as a result of MADV_DONTNEED
3494
- * running immediately after a huge pmd fault in a different thread of
3495
- * this mm, in turn leading to a misleading pmd_trans_huge() retval.
3496
- * All we have to ensure is that it is a regular pmd that we can walk
3497
- * with pte_offset_map() and we can do that through an atomic read in
3498
- * C, which is what pmd_trans_unstable() provides.
3499
- */
3500
- if (pmd_devmap_trans_unstable(vmf->pmd))
3501
- return VM_FAULT_NOPAGE;
3502
-
3503
- /*
3504
- * At this point we know that our vmf->pmd points to a page of ptes
3505
- * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
3506
- * for the duration of the fault. If a racing MADV_DONTNEED runs and
3507
- * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
3508
- * be valid and we will re-check to make sure the vmf->pte isn't
3509
- * pte_none() under vmf->ptl protection when we return to
3510
- * alloc_set_pte().
3511
- */
3512
- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3513
- &vmf->ptl);
3514
- return 0;
3515
-}
3516
-
3517
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3518
-
3519
-#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
3520
-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
3521
- unsigned long haddr)
3522
-{
3523
- if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
3524
- (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
3525
- return false;
3526
- if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
3527
- return false;
3528
- return true;
3529
-}
3530
-
4077
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
35314078 static void deposit_prealloc_pte(struct vm_fault *vmf)
35324079 {
35334080 struct vm_area_struct *vma = vmf->vma;
....@@ -3541,27 +4088,28 @@
35414088 vmf->prealloc_pte = NULL;
35424089 }
35434090
3544
-static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
4091
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
35454092 {
35464093 struct vm_area_struct *vma = vmf->vma;
35474094 bool write = vmf->flags & FAULT_FLAG_WRITE;
35484095 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
35494096 pmd_t entry;
35504097 int i;
3551
- vm_fault_t ret;
4098
+ vm_fault_t ret = VM_FAULT_FALLBACK;
35524099
35534100 if (!transhuge_vma_suitable(vma, haddr))
3554
- return VM_FAULT_FALLBACK;
4101
+ return ret;
35554102
3556
- ret = VM_FAULT_FALLBACK;
35574103 page = compound_head(page);
4104
+ if (compound_order(page) != HPAGE_PMD_ORDER)
4105
+ return ret;
35584106
35594107 /*
35604108 * Archs like ppc64 need additional space to store information
35614109 * related to pte entry. Use the preallocated table for that.
35624110 */
35634111 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3564
- vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
4112
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
35654113 if (!vmf->prealloc_pte)
35664114 return VM_FAULT_OOM;
35674115 smp_wmb(); /* See comment in __pte_alloc() */
....@@ -3574,7 +4122,7 @@
35744122 for (i = 0; i < HPAGE_PMD_NR; i++)
35754123 flush_icache_page(vma, page + i);
35764124
3577
- entry = mk_huge_pmd(page, vma->vm_page_prot);
4125
+ entry = mk_huge_pmd(page, vmf->vma_page_prot);
35784126 if (write)
35794127 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
35804128
....@@ -3598,77 +4146,40 @@
35984146 return ret;
35994147 }
36004148 #else
3601
-static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
4149
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
36024150 {
3603
- BUILD_BUG();
3604
- return 0;
4151
+ return VM_FAULT_FALLBACK;
36054152 }
36064153 #endif
36074154
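Under CONFIG_TRANSPARENT_HUGEPAGE, do_set_pmd() now also refuses pages whose compound order is not exactly PMD order, on top of the existing VMA suitability check. A rough standalone model of two of the gating checks (compound order and VMA fit), assuming 4 KiB base pages and a 2 MiB PMD, i.e. HPAGE_PMD_ORDER of 9; all names below are local to the sketch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define HPAGE_PMD_ORDER 9                      /* 2 MiB huge page on 4 KiB base pages */
#define HPAGE_PMD_SIZE  (1UL << (PAGE_SHIFT + HPAGE_PMD_ORDER))

/* Model of the gating in do_set_pmd(): the compound page must be exactly
 * PMD-order and the PMD-aligned address must fit inside the VMA. */
static bool can_map_pmd(unsigned int compound_order, uintptr_t haddr,
			uintptr_t vm_start, uintptr_t vm_end)
{
	if (compound_order != HPAGE_PMD_ORDER)
		return false;
	if (haddr < vm_start || haddr + HPAGE_PMD_SIZE > vm_end)
		return false;
	return true;
}

int main(void)
{
	uintptr_t vm_start = 0x200000, vm_end = 0x800000;

	printf("%d\n", can_map_pmd(9, 0x400000, vm_start, vm_end)); /* 1: fits */
	printf("%d\n", can_map_pmd(4, 0x400000, vm_start, vm_end)); /* 0: not PMD order */
	return 0;
}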
3608
-/**
3609
- * alloc_set_pte - setup new PTE entry for given page and add reverse page
3610
- * mapping. If needed, the fucntion allocates page table or use pre-allocated.
3611
- *
3612
- * @vmf: fault environment
3613
- * @memcg: memcg to charge page (only for private mappings)
3614
- * @page: page to map
3615
- *
3616
- * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
3617
- * return.
3618
- *
3619
- * Target users are page handler itself and implementations of
3620
- * vm_ops->map_pages.
3621
- */
3622
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
3623
- struct page *page)
4155
+void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
36244156 {
36254157 struct vm_area_struct *vma = vmf->vma;
36264158 bool write = vmf->flags & FAULT_FLAG_WRITE;
4159
+ bool prefault = vmf->address != addr;
36274160 pte_t entry;
3628
- vm_fault_t ret;
3629
-
3630
- if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
3631
- IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
3632
- /* THP on COW? */
3633
- VM_BUG_ON_PAGE(memcg, page);
3634
-
3635
- ret = do_set_pmd(vmf, page);
3636
- if (ret != VM_FAULT_FALLBACK)
3637
- return ret;
3638
- }
3639
-
3640
- if (!vmf->pte) {
3641
- ret = pte_alloc_one_map(vmf);
3642
- if (ret)
3643
- return ret;
3644
- }
3645
-
3646
- /* Re-check under ptl */
3647
- if (unlikely(!pte_none(*vmf->pte)))
3648
- return VM_FAULT_NOPAGE;
36494161
36504162 flush_icache_page(vma, page);
3651
- entry = mk_pte(page, vma->vm_page_prot);
4163
+ entry = mk_pte(page, vmf->vma_page_prot);
4164
+
4165
+ if (prefault && arch_wants_old_prefaulted_pte())
4166
+ entry = pte_mkold(entry);
4167
+ else
4168
+ entry = pte_sw_mkyoung(entry);
4169
+
36524170 if (write)
3653
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
4171
+ entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
36544172 /* copy-on-write page */
3655
- if (write && !(vma->vm_flags & VM_SHARED)) {
4173
+ if (write && !(vmf->vma_flags & VM_SHARED)) {
36564174 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3657
- page_add_new_anon_rmap(page, vma, vmf->address, false);
3658
- mem_cgroup_commit_charge(page, memcg, false, false);
3659
- lru_cache_add_active_or_unevictable(page, vma);
4175
+ __page_add_new_anon_rmap(page, vma, addr, false);
4176
+ __lru_cache_add_inactive_or_unevictable(page, vmf->vma_flags);
36604177 } else {
36614178 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
36624179 page_add_file_rmap(page, false);
36634180 }
3664
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3665
-
3666
- /* no need to invalidate: a not-present page won't be cached */
3667
- update_mmu_cache(vma, vmf->address, vmf->pte);
3668
-
3669
- return 0;
4181
+ set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
36704182 }
3671
-
36724183
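do_set_pte() now distinguishes the address that actually faulted from neighbouring prefaulted addresses: only the former is guaranteed a young PTE, and a write fault additionally gets the dirty/write bits. A compact standalone model of that flag selection; the PTE_* bits and the arch_wants_old_prefaulted() knob are stand-ins for the kernel helpers, not their real definitions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PTE_YOUNG  (1u << 0)
#define PTE_DIRTY  (1u << 1)
#define PTE_WRITE  (1u << 2)

/* Stand-in for arch_wants_old_prefaulted_pte(): some arches prefer
 * prefaulted entries to start out 'old' because hardware sets the
 * access bit cheaply on first use. */
static bool arch_wants_old_prefaulted(void) { return true; }

/* Model of the flag selection in do_set_pte(): only the address that
 * really faulted is forced young; a write fault also gets dirty+write. */
static unsigned int make_pte_flags(uintptr_t fault_addr, uintptr_t map_addr, bool write)
{
	bool prefault = (map_addr != fault_addr);
	unsigned int flags = 0;

	if (!(prefault && arch_wants_old_prefaulted()))
		flags |= PTE_YOUNG;
	if (write)
		flags |= PTE_DIRTY | PTE_WRITE;
	return flags;
}

int main(void)
{
	printf("faulting addr: %#x\n", make_pte_flags(0x1000, 0x1000, false)); /* young */
	printf("prefaulted:    %#x\n", make_pte_flags(0x1000, 0x2000, false)); /* old */
	return 0;
}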
36734184 /**
36744185 * finish_fault - finish page fault once we have prepared the page to fault
....@@ -3678,20 +4189,22 @@
36784189 * This function handles all that is needed to finish a page fault once the
36794190 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
36804191 * given page, adds reverse page mapping, handles memcg charges and LRU
3681
- * addition. The function returns 0 on success, VM_FAULT_ code in case of
3682
- * error.
4192
+ * addition.
36834193 *
36844194 * The function expects the page to be locked and on success it consumes a
36854195 * reference of a page being mapped (for the PTE which maps it).
4196
+ *
4197
+ * Return: %0 on success, %VM_FAULT_ code in case of error.
36864198 */
36874199 vm_fault_t finish_fault(struct vm_fault *vmf)
36884200 {
4201
+ struct vm_area_struct *vma = vmf->vma;
36894202 struct page *page;
3690
- vm_fault_t ret = 0;
4203
+ vm_fault_t ret;
36914204
36924205 /* Did we COW the page? */
36934206 if ((vmf->flags & FAULT_FLAG_WRITE) &&
3694
- !(vmf->vma->vm_flags & VM_SHARED))
4207
+ !(vmf->vma_flags & VM_SHARED))
36954208 page = vmf->cow_page;
36964209 else
36974210 page = vmf->page;
....@@ -3700,12 +4213,56 @@
37004213 * check even for read faults because we might have lost our CoWed
37014214 * page
37024215 */
3703
- if (!(vmf->vma->vm_flags & VM_SHARED))
3704
- ret = check_stable_address_space(vmf->vma->vm_mm);
3705
- if (!ret)
3706
- ret = alloc_set_pte(vmf, vmf->memcg, page);
3707
- if (vmf->pte)
3708
- pte_unmap_unlock(vmf->pte, vmf->ptl);
4216
+ if (!(vma->vm_flags & VM_SHARED)) {
4217
+ ret = check_stable_address_space(vma->vm_mm);
4218
+ if (ret)
4219
+ return ret;
4220
+ }
4221
+
4222
+ /* Do not check an unstable pmd; if it has changed we will retry later */
4223
+ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
4224
+ goto skip_pmd_checks;
4225
+
4226
+ if (pmd_none(*vmf->pmd)) {
4227
+ if (PageTransCompound(page)) {
4228
+ ret = do_set_pmd(vmf, page);
4229
+ if (ret != VM_FAULT_FALLBACK)
4230
+ return ret;
4231
+ }
4232
+
4233
+ if (vmf->prealloc_pte) {
4234
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
4235
+ if (likely(pmd_none(*vmf->pmd))) {
4236
+ mm_inc_nr_ptes(vma->vm_mm);
4237
+ pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
4238
+ vmf->prealloc_pte = NULL;
4239
+ }
4240
+ spin_unlock(vmf->ptl);
4241
+ } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
4242
+ return VM_FAULT_OOM;
4243
+ }
4244
+ }
4245
+
4246
+ /*
4247
+ * See comment in handle_pte_fault() for how this scenario happens, we
4248
+ * need to return NOPAGE so that we drop this page.
4249
+ */
4250
+ if (pmd_devmap_trans_unstable(vmf->pmd))
4251
+ return VM_FAULT_NOPAGE;
4252
+
4253
+skip_pmd_checks:
4254
+ if (!pte_map_lock(vmf))
4255
+ return VM_FAULT_RETRY;
4256
+
4257
+ ret = 0;
4258
+ /* Re-check under ptl */
4259
+ if (likely(pte_none(*vmf->pte)))
4260
+ do_set_pte(vmf, page, vmf->address);
4261
+ else
4262
+ ret = VM_FAULT_NOPAGE;
4263
+
4264
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
4265
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
37094266 return ret;
37104267 }
37114268
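finish_fault() now installs a missing page table itself, using the usual check / lock / re-check pattern so two racing faulters consume the preallocated table exactly once. A userspace analogue of that pattern, with a pthread mutex standing in for the pmd lock and malloc'd buffers for the preallocated table; purely illustrative, since the kernel keeps an unconsumed table in vmf->prealloc_pte rather than freeing it on the spot:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t pmd_lock = PTHREAD_MUTEX_INITIALIZER;
static void *pmd_entry;		/* NULL plays the role of pmd_none() */

/* Install a preallocated table only if nobody else beat us to it: the
 * unlocked check avoids taking the lock in the common case, the locked
 * re-check closes the race window. */
static void populate_once(void *prealloc)
{
	if (!pmd_entry) {		/* cheap unlocked check */
		pthread_mutex_lock(&pmd_lock);
		if (!pmd_entry) {	/* re-check under the lock */
			pmd_entry = prealloc;
			prealloc = NULL;	/* consumed */
		}
		pthread_mutex_unlock(&pmd_lock);
	}
	free(prealloc);			/* not consumed: drop the spare table */
}

int main(void)
{
	populate_once(malloc(64));
	populate_once(malloc(64));	/* loser of the race just frees its spare */
	printf("installed: %p\n", pmd_entry);
	free(pmd_entry);
	return 0;
}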
....@@ -3738,12 +4295,8 @@
37384295
37394296 static int __init fault_around_debugfs(void)
37404297 {
3741
- void *ret;
3742
-
3743
- ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
3744
- &fault_around_bytes_fops);
3745
- if (!ret)
3746
- pr_warn("Failed to create fault_around_bytes in debugfs");
4298
+ debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
4299
+ &fault_around_bytes_fops);
37474300 return 0;
37484301 }
37494302 late_initcall(fault_around_debugfs);
....@@ -3779,13 +4332,12 @@
37794332 pgoff_t start_pgoff = vmf->pgoff;
37804333 pgoff_t end_pgoff;
37814334 int off;
3782
- vm_fault_t ret = 0;
37834335
37844336 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
37854337 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
37864338
3787
- vmf->address = max(address & mask, vmf->vma->vm_start);
3788
- off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
4339
+ address = max(address & mask, vmf->vma->vm_start);
4340
+ off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
37894341 start_pgoff -= off;
37904342
37914343 /*
....@@ -3793,40 +4345,20 @@
37934345 * the vma or nr_pages from start_pgoff, depending what is nearest.
37944346 */
37954347 end_pgoff = start_pgoff -
3796
- ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
4348
+ ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
37974349 PTRS_PER_PTE - 1;
37984350 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
37994351 start_pgoff + nr_pages - 1);
38004352
3801
- if (pmd_none(*vmf->pmd)) {
3802
- vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
3803
- vmf->address);
4353
+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) &&
4354
+ pmd_none(*vmf->pmd)) {
4355
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
38044356 if (!vmf->prealloc_pte)
3805
- goto out;
4357
+ return VM_FAULT_OOM;
38064358 smp_wmb(); /* See comment in __pte_alloc() */
38074359 }
38084360
3809
- vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3810
-
3811
- /* Huge page is mapped? Page fault is solved */
3812
- if (pmd_trans_huge(*vmf->pmd)) {
3813
- ret = VM_FAULT_NOPAGE;
3814
- goto out;
3815
- }
3816
-
3817
- /* ->map_pages() haven't done anything useful. Cold page cache? */
3818
- if (!vmf->pte)
3819
- goto out;
3820
-
3821
- /* check if the page fault is solved */
3822
- vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3823
- if (!pte_none(*vmf->pte))
3824
- ret = VM_FAULT_NOPAGE;
3825
- pte_unmap_unlock(vmf->pte, vmf->ptl);
3826
-out:
3827
- vmf->address = address;
3828
- vmf->pte = NULL;
3829
- return ret;
4361
+ return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
38304362 }
38314363
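do_fault_around() clamps the prefault window three ways: to fault_around_bytes, to the page table that contains the faulting PTE, and to the VMA. The arithmetic can be reproduced in a standalone program; the constants below assume 4 KiB pages and 512-entry page tables, and the helper names are local to the sketch:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT    12
#define PAGE_SIZE     (1UL << PAGE_SHIFT)
#define PAGE_MASK     (~(PAGE_SIZE - 1))
#define PTRS_PER_PTE  512

static unsigned long min3ul(unsigned long a, unsigned long b, unsigned long c)
{
	unsigned long m = a < b ? a : b;

	return m < c ? m : c;
}

/* Reproduces the window computation in do_fault_around(): round the
 * faulting address down to the fault-around block, then clamp the last
 * prefaulted offset to the same page table and to the VMA. */
static void fault_around_window(unsigned long fault_addr, unsigned long fault_pgoff,
				unsigned long vm_start, unsigned long vm_pgoff,
				unsigned long vma_pages, unsigned long around_bytes)
{
	unsigned long nr_pages = around_bytes >> PAGE_SHIFT;
	unsigned long mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
	unsigned long address = fault_addr & mask;
	unsigned long start_pgoff, end_pgoff, off;

	if (address < vm_start)
		address = vm_start;

	off = ((fault_addr - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	start_pgoff = fault_pgoff - off;

	end_pgoff = start_pgoff
		  - ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
		  + PTRS_PER_PTE - 1;
	end_pgoff = min3ul(end_pgoff, vma_pages + vm_pgoff - 1,
			   start_pgoff + nr_pages - 1);

	printf("map_pages over pgoff [%lu, %lu]\n", start_pgoff, end_pgoff);
}

int main(void)
{
	/* Fault at 0x403000 in a VMA starting at 0x400000 (pgoff 0),
	 * 256 pages long, with the default 64 KiB fault-around window. */
	fault_around_window(0x403000, 3, 0x400000, 0, 256, 65536);
	return 0;
}

With the values shown, the window comes out as offsets 0 through 15, i.e. the 64 KiB block surrounding the fault, clipped neither by the VMA nor by the page table.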
38324364 static vm_fault_t do_read_fault(struct vm_fault *vmf)
....@@ -3840,9 +4372,11 @@
38404372 * something).
38414373 */
38424374 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3843
- ret = do_fault_around(vmf);
3844
- if (ret)
3845
- return ret;
4375
+ if (likely(!userfaultfd_minor(vmf->vma))) {
4376
+ ret = do_fault_around(vmf);
4377
+ if (ret)
4378
+ return ret;
4379
+ }
38464380 }
38474381
38484382 ret = __do_fault(vmf);
....@@ -3868,11 +4402,11 @@
38684402 if (!vmf->cow_page)
38694403 return VM_FAULT_OOM;
38704404
3871
- if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
3872
- &vmf->memcg, false)) {
4405
+ if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
38734406 put_page(vmf->cow_page);
38744407 return VM_FAULT_OOM;
38754408 }
4409
+ cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
38764410
38774411 ret = __do_fault(vmf);
38784412 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
....@@ -3890,7 +4424,6 @@
38904424 goto uncharge_out;
38914425 return ret;
38924426 uncharge_out:
3893
- mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
38944427 put_page(vmf->cow_page);
38954428 return ret;
38964429 }
....@@ -3926,16 +4459,16 @@
39264459 return ret;
39274460 }
39284461
3929
- fault_dirty_shared_page(vma, vmf->page);
4462
+ ret |= fault_dirty_shared_page(vmf);
39304463 return ret;
39314464 }
39324465
39334466 /*
3934
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
4467
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
39354468 * but allow concurrent faults).
3936
- * The mmap_sem may have been released depending on flags and our
4469
+ * The mmap_lock may have been released depending on flags and our
39374470 * return value. See filemap_fault() and __lock_page_or_retry().
3938
- * If mmap_sem is released, vma may become invalid (for example
4471
+ * If mmap_lock is released, vma may become invalid (for example
39394472 * by other thread calling munmap()).
39404473 */
39414474 static vm_fault_t do_fault(struct vm_fault *vmf)
....@@ -3975,7 +4508,7 @@
39754508 }
39764509 } else if (!(vmf->flags & FAULT_FLAG_WRITE))
39774510 ret = do_read_fault(vmf);
3978
- else if (!(vma->vm_flags & VM_SHARED))
4511
+ else if (!(vmf->vma_flags & VM_SHARED))
39794512 ret = do_cow_fault(vmf);
39804513 else
39814514 ret = do_shared_fault(vmf);
....@@ -4007,11 +4540,11 @@
40074540 {
40084541 struct vm_area_struct *vma = vmf->vma;
40094542 struct page *page = NULL;
4010
- int page_nid = -1;
4543
+ int page_nid = NUMA_NO_NODE;
40114544 int last_cpupid;
40124545 int target_nid;
40134546 bool migrated = false;
4014
- pte_t pte;
4547
+ pte_t pte, old_pte;
40154548 bool was_writable = pte_savedwrite(vmf->orig_pte);
40164549 int flags = 0;
40174550
....@@ -4020,8 +4553,8 @@
40204553 * validation through pte_unmap_same(). It's of NUMA type but
40214554 * the pfn may be screwed if the read is non atomic.
40224555 */
4023
- vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
4024
- spin_lock(vmf->ptl);
4556
+ if (!pte_spinlock(vmf))
4557
+ return VM_FAULT_RETRY;
40254558 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
40264559 pte_unmap_unlock(vmf->pte, vmf->ptl);
40274560 goto out;
....@@ -4031,15 +4564,15 @@
40314564 * Make it present again. Depending on how the arch implements non-
40324565 * accessible ptes, some can allow access by kernel mode.
40334566 */
4034
- pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
4035
- pte = pte_modify(pte, vma->vm_page_prot);
4567
+ old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
4568
+ pte = pte_modify(old_pte, vmf->vma_page_prot);
40364569 pte = pte_mkyoung(pte);
40374570 if (was_writable)
40384571 pte = pte_mkwrite(pte);
4039
- ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
4572
+ ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
40404573 update_mmu_cache(vma, vmf->address, vmf->pte);
40414574
4042
- page = vm_normal_page(vma, vmf->address, pte);
4575
+ page = _vm_normal_page(vma, vmf->address, pte, vmf->vma_flags);
40434576 if (!page) {
40444577 pte_unmap_unlock(vmf->pte, vmf->ptl);
40454578 return 0;
....@@ -4066,7 +4599,7 @@
40664599 * Flag if the page is shared between multiple address spaces. This
40674600 * is later used when determining whether to group tasks together
40684601 */
4069
- if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
4602
+ if (page_mapcount(page) > 1 && (vmf->vma_flags & VM_SHARED))
40704603 flags |= TNF_SHARED;
40714604
40724605 last_cpupid = page_cpupid_last(page);
....@@ -4074,13 +4607,13 @@
40744607 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
40754608 &flags);
40764609 pte_unmap_unlock(vmf->pte, vmf->ptl);
4077
- if (target_nid == -1) {
4610
+ if (target_nid == NUMA_NO_NODE) {
40784611 put_page(page);
40794612 goto out;
40804613 }
40814614
40824615 /* Migrate to the requested node */
4083
- migrated = migrate_misplaced_page(page, vma, target_nid);
4616
+ migrated = migrate_misplaced_page(page, vmf, target_nid);
40844617 if (migrated) {
40854618 page_nid = target_nid;
40864619 flags |= TNF_MIGRATED;
....@@ -4088,7 +4621,7 @@
40884621 flags |= TNF_MIGRATE_FAIL;
40894622
40904623 out:
4091
- if (page_nid != -1)
4624
+ if (page_nid != NUMA_NO_NODE)
40924625 task_numa_fault(last_cpupid, page_nid, 1, flags);
40934626 return 0;
40944627 }
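do_numa_page() turns a protnone hinting fault into an optional migration: locality flags are collected and the page is only moved when numa_migrate_prep() nominates a different node. A toy restatement of the outcome selection; the TNF_* values are invented for the sketch and only the branching mirrors the function above:

#include <stdio.h>

#define NUMA_NO_NODE	 (-1)
#define TNF_MIGRATED	 0x01
#define TNF_MIGRATE_FAIL 0x02
#define TNF_SHARED	 0x04

/* Decide which node the fault is accounted to: the target node when a
 * migration succeeds, otherwise the node the page already lives on. */
static int numa_fault_outcome(int page_nid, int target_nid, int migrate_ok,
			      int shared, int *flags)
{
	*flags = shared ? TNF_SHARED : 0;

	if (target_nid == NUMA_NO_NODE)
		return page_nid;		/* stay put, just account the fault */
	if (migrate_ok) {
		*flags |= TNF_MIGRATED;
		return target_nid;
	}
	*flags |= TNF_MIGRATE_FAIL;
	return page_nid;
}

int main(void)
{
	int flags;

	printf("node %d flags %#x\n", numa_fault_outcome(0, 1, 1, 0, &flags), flags);
	printf("node %d flags %#x\n", numa_fault_outcome(0, NUMA_NO_NODE, 0, 1, &flags), flags);
	return 0;
}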
....@@ -4105,26 +4638,28 @@
41054638 /* `inline' is required to avoid gcc 4.1.2 build error */
41064639 static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
41074640 {
4108
- if (vma_is_anonymous(vmf->vma))
4641
+ if (vma_is_anonymous(vmf->vma)) {
4642
+ if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
4643
+ return handle_userfault(vmf, VM_UFFD_WP);
41094644 return do_huge_pmd_wp_page(vmf, orig_pmd);
4110
- if (vmf->vma->vm_ops->huge_fault)
4111
- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4645
+ }
4646
+ if (vmf->vma->vm_ops->huge_fault) {
4647
+ vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
41124648
4113
- /* COW handled on pte level: split pmd */
4114
- VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
4649
+ if (!(ret & VM_FAULT_FALLBACK))
4650
+ return ret;
4651
+ }
4652
+
4653
+ /* COW or write-notify handled on pte level: split pmd. */
41154654 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
41164655
41174656 return VM_FAULT_FALLBACK;
41184657 }
41194658
4120
-static inline bool vma_is_accessible(struct vm_area_struct *vma)
4121
-{
4122
- return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
4123
-}
4124
-
41254659 static vm_fault_t create_huge_pud(struct vm_fault *vmf)
41264660 {
4127
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4661
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
4662
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
41284663 /* No support for anonymous transparent PUD pages yet */
41294664 if (vma_is_anonymous(vmf->vma))
41304665 return VM_FAULT_FALLBACK;
....@@ -4136,13 +4671,21 @@
41364671
41374672 static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
41384673 {
4139
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4674
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
4675
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
41404676 /* No support for anonymous transparent PUD pages yet */
41414677 if (vma_is_anonymous(vmf->vma))
4142
- return VM_FAULT_FALLBACK;
4143
- if (vmf->vma->vm_ops->huge_fault)
4144
- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4145
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
4678
+ goto split;
4679
+ if (vmf->vma->vm_ops->huge_fault) {
4680
+ vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4681
+
4682
+ if (!(ret & VM_FAULT_FALLBACK))
4683
+ return ret;
4684
+ }
4685
+split:
4686
+ /* COW or write-notify not handled on PUD level: split pud. */
4687
+ __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
4688
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
41464689 return VM_FAULT_FALLBACK;
41474690 }
41484691
....@@ -4155,15 +4698,20 @@
41554698 * with external mmu caches can use to update those (ie the Sparc or
41564699 * PowerPC hashed page tables that act as extended TLBs).
41574700 *
4158
- * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
4701
+ * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
41594702 * concurrent faults).
41604703 *
4161
- * The mmap_sem may have been released depending on flags and our return value.
4704
+ * The mmap_lock may have been released depending on flags and our return value.
41624705 * See filemap_fault() and __lock_page_or_retry().
41634706 */
41644707 static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
41654708 {
41664709 pte_t entry;
4710
+ vm_fault_t ret = 0;
4711
+
4712
+ /* Do not check an unstable pmd; if it has changed we will retry later */
4713
+ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
4714
+ goto skip_pmd_checks;
41674715
41684716 if (unlikely(pmd_none(*vmf->pmd))) {
41694717 /*
....@@ -4174,14 +4722,28 @@
41744722 */
41754723 vmf->pte = NULL;
41764724 } else {
4177
- /* See comment in pte_alloc_one_map() */
4725
+ /*
4726
+ * If a huge pmd materialized under us just retry later. Use
4727
+ * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
4728
+ * of pmd_trans_huge() to ensure the pmd didn't become
4729
+ * pmd_trans_huge under us and then back to pmd_none, as a
4730
+ * result of MADV_DONTNEED running immediately after a huge pmd
4731
+ * fault in a different thread of this mm, in turn leading to a
4732
+ * misleading pmd_trans_huge() retval. All we have to ensure is
4733
+ * that it is a regular pmd that we can walk with
4734
+ * pte_offset_map() and we can do that through an atomic read
4735
+ * in C, which is what pmd_trans_unstable() provides.
4736
+ */
41784737 if (pmd_devmap_trans_unstable(vmf->pmd))
41794738 return 0;
41804739 /*
41814740 * A regular pmd is established and it can't morph into a huge
41824741 * pmd from under us anymore at this point because we hold the
4183
- * mmap_sem read mode and khugepaged takes it in write mode.
4742
+ * mmap_lock read mode and khugepaged takes it in write mode.
41844743 * So now it's safe to run pte_offset_map().
4744
+ * This is not applicable to the speculative page fault handler
4745
+ * but in that case, the pte is fetched earlier in
4746
+ * handle_speculative_fault().
41854747 */
41864748 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
41874749 vmf->orig_pte = *vmf->pte;
....@@ -4201,9 +4763,13 @@
42014763 }
42024764 }
42034765
4766
+skip_pmd_checks:
42044767 if (!vmf->pte) {
42054768 if (vma_is_anonymous(vmf->vma))
42064769 return do_anonymous_page(vmf);
4770
+ else if ((vmf->flags & FAULT_FLAG_SPECULATIVE) &&
4771
+ !vmf_allows_speculation(vmf))
4772
+ return VM_FAULT_RETRY;
42074773 else
42084774 return do_fault(vmf);
42094775 }
....@@ -4214,14 +4780,27 @@
42144780 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
42154781 return do_numa_page(vmf);
42164782
4217
- vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
4218
- spin_lock(vmf->ptl);
4783
+ if (!pte_spinlock(vmf))
4784
+ return VM_FAULT_RETRY;
42194785 entry = vmf->orig_pte;
4220
- if (unlikely(!pte_same(*vmf->pte, entry)))
4786
+ if (unlikely(!pte_same(*vmf->pte, entry))) {
4787
+ update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
42214788 goto unlock;
4789
+ }
42224790 if (vmf->flags & FAULT_FLAG_WRITE) {
4223
- if (!pte_write(entry))
4224
- return do_wp_page(vmf);
4791
+ if (!pte_write(entry)) {
4792
+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE))
4793
+ return do_wp_page(vmf);
4794
+
4795
+ if (!mmu_notifier_trylock(vmf->vma->vm_mm)) {
4796
+ ret = VM_FAULT_RETRY;
4797
+ goto unlock;
4798
+ }
4799
+
4800
+ ret = do_wp_page(vmf);
4801
+ mmu_notifier_unlock(vmf->vma->vm_mm);
4802
+ return ret;
4803
+ }
42254804 entry = pte_mkdirty(entry);
42264805 }
42274806 entry = pte_mkyoung(entry);
....@@ -4229,6 +4808,11 @@
42294808 vmf->flags & FAULT_FLAG_WRITE)) {
42304809 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
42314810 } else {
4811
+ /* Skip spurious TLB flush for retried page fault */
4812
+ if (vmf->flags & FAULT_FLAG_TRIED)
4813
+ goto unlock;
4814
+ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
4815
+ ret = VM_FAULT_RETRY;
42324816 /*
42334817 * This is needed only for protection faults but the arch code
42344818 * is not yet telling us if this is a protection fault or not.
....@@ -4238,15 +4822,17 @@
42384822 if (vmf->flags & FAULT_FLAG_WRITE)
42394823 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
42404824 }
4825
+ trace_android_rvh_handle_pte_fault_end(vmf, highest_memmap_pfn);
4826
+ trace_android_vh_handle_pte_fault_end(vmf, highest_memmap_pfn);
42414827 unlock:
42424828 pte_unmap_unlock(vmf->pte, vmf->ptl);
4243
- return 0;
4829
+ return ret;
42444830 }
42454831
42464832 /*
42474833 * By the time we get here, we already hold the mm semaphore
42484834 *
4249
- * The mmap_sem may have been released depending on flags and our
4835
+ * The mmap_lock may have been released depending on flags and our
42504836 * return value. See filemap_fault() and __lock_page_or_retry().
42514837 */
42524838 static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
....@@ -4258,6 +4844,8 @@
42584844 .flags = flags,
42594845 .pgoff = linear_page_index(vma, address),
42604846 .gfp_mask = __get_fault_gfp_mask(vma),
4847
+ .vma_flags = vma->vm_flags,
4848
+ .vma_page_prot = vma->vm_page_prot,
42614849 };
42624850 unsigned int dirty = flags & FAULT_FLAG_WRITE;
42634851 struct mm_struct *mm = vma->vm_mm;
....@@ -4273,6 +4861,7 @@
42734861 vmf.pud = pud_alloc(mm, p4d, address);
42744862 if (!vmf.pud)
42754863 return VM_FAULT_OOM;
4864
+retry_pud:
42764865 if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
42774866 ret = create_huge_pud(&vmf);
42784867 if (!(ret & VM_FAULT_FALLBACK))
....@@ -4299,6 +4888,14 @@
42994888 vmf.pmd = pmd_alloc(mm, vmf.pud, address);
43004889 if (!vmf.pmd)
43014890 return VM_FAULT_OOM;
4891
+
4892
+ /* Huge pud page fault raced with pmd_alloc? */
4893
+ if (pud_trans_unstable(vmf.pud))
4894
+ goto retry_pud;
4895
+
4896
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
4897
+ vmf.sequence = raw_read_seqcount(&vma->vm_sequence);
4898
+#endif
43024899 if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
43034900 ret = create_huge_pmd(&vmf);
43044901 if (!(ret & VM_FAULT_FALLBACK))
....@@ -4332,14 +4929,342 @@
43324929 return handle_pte_fault(&vmf);
43334930 }
43344931
4932
+/**
4933
+ * mm_account_fault - Do page fault accountings
4934
+ *
4935
+ * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
4936
+ * of perf event counters, but we'll still do the per-task accounting to
4937
+ * the task that triggered this page fault.
4938
+ * @address: the faulted address.
4939
+ * @flags: the fault flags.
4940
+ * @ret: the fault retcode.
4941
+ *
4942
+ * This will take care of most of the page fault accountings. Meanwhile, it
4943
+ * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
4944
+ * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
4945
+ * still be in per-arch page fault handlers at the entry of page fault.
4946
+ */
4947
+static inline void mm_account_fault(struct pt_regs *regs,
4948
+ unsigned long address, unsigned int flags,
4949
+ vm_fault_t ret)
4950
+{
4951
+ bool major;
4952
+
4953
+ /*
4954
+ * We don't do accounting for some specific faults:
4955
+ *
4956
+ * - Unsuccessful faults (e.g. when the address wasn't valid). That
4957
+ * includes arch_vma_access_permitted() failing before reaching here.
4958
+ * So this is not a "this many hardware page faults" counter. We
4959
+ * should use the hw profiling for that.
4960
+ *
4961
+ * - Incomplete faults (VM_FAULT_RETRY). They will only be counted
4962
+ * once they're completed.
4963
+ */
4964
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
4965
+ return;
4966
+
4967
+ /*
4968
+ * We define the fault as a major fault when the final successful fault
4969
+ * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
4970
+ * handle it immediately previously).
4971
+ */
4972
+ major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
4973
+
4974
+ if (major)
4975
+ current->maj_flt++;
4976
+ else
4977
+ current->min_flt++;
4978
+
4979
+ /*
4980
+ * If the fault is done for GUP, regs will be NULL. We only do the
4981
+ * accounting for the per thread fault counters who triggered the
4982
+ * fault, and we skip the perf event updates.
4983
+ */
4984
+ if (!regs)
4985
+ return;
4986
+
4987
+ if (major)
4988
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
4989
+ else
4990
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
4991
+}
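The new mm_account_fault() helper centralises fault accounting; the interesting part is the classification. A minimal standalone restatement of that decision table (the flag and retcode bits are made up for the sketch; only the logic follows the function above):

#include <stdio.h>

/* Illustrative flag/retcode bits, not the kernel's actual values. */
#define FAULT_FLAG_TRIED  (1u << 0)
#define VM_FAULT_MAJOR    (1u << 1)
#define VM_FAULT_RETRY    (1u << 2)
#define VM_FAULT_ERROR    (1u << 3)

enum fault_account { ACCOUNT_NONE, ACCOUNT_MINOR, ACCOUNT_MAJOR };

/* Mirrors mm_account_fault(): failed or still-retrying faults are not
 * counted; a fault is major if it reported MAJOR or needed a retry. */
static enum fault_account classify_fault(unsigned int flags, unsigned int ret)
{
	if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
		return ACCOUNT_NONE;
	if ((ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED))
		return ACCOUNT_MAJOR;
	return ACCOUNT_MINOR;
}

int main(void)
{
	printf("%d\n", classify_fault(0, 0));                  /* minor */
	printf("%d\n", classify_fault(FAULT_FLAG_TRIED, 0));   /* major */
	printf("%d\n", classify_fault(0, VM_FAULT_RETRY));     /* not counted yet */
	return 0;
}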
4992
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
4993
+
4994
+#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
4995
+/* This is required by vm_normal_page() */
4996
+#error "Speculative page fault handler requires CONFIG_ARCH_HAS_PTE_SPECIAL"
4997
+#endif
4998
+/*
4999
+ * vm_normal_page() adds some processing which should be done while
5000
+ * holding the mmap_sem.
5001
+ */
5002
+
5003
+/*
5004
+ * Tries to handle the page fault in a speculative way, without grabbing the
5005
+ * mmap_sem.
5006
+ * When VM_FAULT_RETRY is returned, the vma pointer is valid and this vma must
5007
+ * be checked later when the mmap_sem has been grabbed by calling
5008
+ * can_reuse_spf_vma().
5009
+ * This is needed as the returned vma is kept in memory until the call to
5010
+ * can_reuse_spf_vma() is made.
5011
+ */
5012
+static vm_fault_t ___handle_speculative_fault(struct mm_struct *mm,
5013
+ unsigned long address, unsigned int flags,
5014
+ struct vm_area_struct *vma)
5015
+{
5016
+ struct vm_fault vmf = {
5017
+ .address = address,
5018
+ .pgoff = linear_page_index(vma, address),
5019
+ .vma = vma,
5020
+ .gfp_mask = __get_fault_gfp_mask(vma),
5021
+ .flags = flags,
5022
+ };
5023
+#ifdef CONFIG_NUMA
5024
+ struct mempolicy *pol;
5025
+#endif
5026
+ pgd_t *pgd, pgdval;
5027
+ p4d_t *p4d, p4dval;
5028
+ pud_t pudval;
5029
+ int seq;
5030
+ vm_fault_t ret;
5031
+
5032
+ /* Clear flags that may lead to releasing the mmap_sem to retry */
5033
+ flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
5034
+ flags |= FAULT_FLAG_SPECULATIVE;
5035
+
5036
+ /* rmb <-> seqlock,vma_rb_erase() */
5037
+ seq = raw_read_seqcount(&vmf.vma->vm_sequence);
5038
+ if (seq & 1) {
5039
+ trace_spf_vma_changed(_RET_IP_, vmf.vma, address);
5040
+ return VM_FAULT_RETRY;
5041
+ }
5042
+
5043
+ if (!vmf_allows_speculation(&vmf))
5044
+ return VM_FAULT_RETRY;
5045
+
5046
+ vmf.vma_flags = READ_ONCE(vmf.vma->vm_flags);
5047
+ vmf.vma_page_prot = READ_ONCE(vmf.vma->vm_page_prot);
5048
+
5049
+#ifdef CONFIG_USERFAULTFD
5050
+ /* Can't call userland page fault handler in the speculative path */
5051
+ if (unlikely(vmf.vma_flags & __VM_UFFD_FLAGS)) {
5052
+ trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
5053
+ return VM_FAULT_RETRY;
5054
+ }
5055
+#endif
5056
+
5057
+ if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP) {
5058
+ /*
5059
+ * This could be detected by the check address against VMA's
5060
+ * boundaries but we want to trace it as not supported instead
5061
+ * of changed.
5062
+ */
5063
+ trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
5064
+ return VM_FAULT_RETRY;
5065
+ }
5066
+
5067
+ if (address < READ_ONCE(vmf.vma->vm_start)
5068
+ || READ_ONCE(vmf.vma->vm_end) <= address) {
5069
+ trace_spf_vma_changed(_RET_IP_, vmf.vma, address);
5070
+ return VM_FAULT_RETRY;
5071
+ }
5072
+
5073
+ if (!arch_vma_access_permitted(vmf.vma, flags & FAULT_FLAG_WRITE,
5074
+ flags & FAULT_FLAG_INSTRUCTION,
5075
+ flags & FAULT_FLAG_REMOTE))
5076
+ goto out_segv;
5077
+
5078
+ /* This one is required to check that the VMA has write access set */
5079
+ if (flags & FAULT_FLAG_WRITE) {
5080
+ if (unlikely(!(vmf.vma_flags & VM_WRITE)))
5081
+ goto out_segv;
5082
+ } else if (unlikely(!(vmf.vma_flags & (VM_READ|VM_EXEC|VM_WRITE))))
5083
+ goto out_segv;
5084
+
5085
+#ifdef CONFIG_NUMA
5086
+ /*
5087
+ * MPOL_INTERLEAVE implies additional checks in
5088
+ * mpol_misplaced() which are not compatible with the
5089
+ * speculative page fault processing.
5090
+ */
5091
+ pol = __get_vma_policy(vmf.vma, address);
5092
+ if (!pol)
5093
+ pol = get_task_policy(current);
5094
+ if (pol && pol->mode == MPOL_INTERLEAVE) {
5095
+ trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
5096
+ return VM_FAULT_RETRY;
5097
+ }
5098
+#endif
5099
+
5100
+ /*
5101
+ * Do a speculative lookup of the PTE entry.
5102
+ */
5103
+ local_irq_disable();
5104
+ pgd = pgd_offset(mm, address);
5105
+ pgdval = READ_ONCE(*pgd);
5106
+ if (pgd_none(pgdval) || unlikely(pgd_bad(pgdval)))
5107
+ goto out_walk;
5108
+
5109
+ p4d = p4d_offset(pgd, address);
5110
+ if (pgd_val(READ_ONCE(*pgd)) != pgd_val(pgdval))
5111
+ goto out_walk;
5112
+ p4dval = READ_ONCE(*p4d);
5113
+ if (p4d_none(p4dval) || unlikely(p4d_bad(p4dval)))
5114
+ goto out_walk;
5115
+
5116
+ vmf.pud = pud_offset(p4d, address);
5117
+ if (p4d_val(READ_ONCE(*p4d)) != p4d_val(p4dval))
5118
+ goto out_walk;
5119
+ pudval = READ_ONCE(*vmf.pud);
5120
+ if (pud_none(pudval) || unlikely(pud_bad(pudval)))
5121
+ goto out_walk;
5122
+
5123
+ /* Huge pages at PUD level are not supported. */
5124
+ if (unlikely(pud_trans_huge(pudval)))
5125
+ goto out_walk;
5126
+
5127
+ vmf.pmd = pmd_offset(vmf.pud, address);
5128
+ if (pud_val(READ_ONCE(*vmf.pud)) != pud_val(pudval))
5129
+ goto out_walk;
5130
+ vmf.orig_pmd = READ_ONCE(*vmf.pmd);
5131
+ /*
5132
+ * pmd_none could mean that a hugepage collapse is in progress
5133
+ * behind our back, as collapse_huge_page() marks it before
5134
+ * invalidating the pte (which is done once the IPI is caught
5135
+ * by all CPUs and we have interrupts disabled).
5136
+ * For this reason we cannot handle THP in a speculative way since we
5137
+ * can't safely identify an in-progress collapse operation done behind our
5138
+ * back on that PMD.
5139
+ * Regarding the order of the following checks, see comment in
5140
+ * pmd_devmap_trans_unstable()
5141
+ */
5142
+ if (unlikely(pmd_devmap(vmf.orig_pmd) ||
5143
+ pmd_none(vmf.orig_pmd) || pmd_trans_huge(vmf.orig_pmd) ||
5144
+ is_swap_pmd(vmf.orig_pmd)))
5145
+ goto out_walk;
5146
+
5147
+ /*
5148
+ * The above does not allocate/instantiate page-tables because doing so
5149
+ * would lead to the possibility of instantiating page-tables after
5150
+ * free_pgtables() -- and consequently leaking them.
5151
+ *
5152
+ * The result is that we take at least one !speculative fault per PMD
5153
+ * in order to instantiate it.
5154
+ */
5155
+
5156
+ vmf.pte = pte_offset_map(vmf.pmd, address);
5157
+ if (pmd_val(READ_ONCE(*vmf.pmd)) != pmd_val(vmf.orig_pmd)) {
5158
+ pte_unmap(vmf.pte);
5159
+ vmf.pte = NULL;
5160
+ goto out_walk;
5161
+ }
5162
+ vmf.orig_pte = READ_ONCE(*vmf.pte);
5163
+ barrier(); /* See comment in handle_pte_fault() */
5164
+ if (pte_none(vmf.orig_pte)) {
5165
+ pte_unmap(vmf.pte);
5166
+ vmf.pte = NULL;
5167
+ }
5168
+
5169
+ vmf.sequence = seq;
5170
+ vmf.flags = flags;
5171
+
5172
+ local_irq_enable();
5173
+
5174
+ /*
5175
+ * We need to re-validate the VMA after checking the bounds, otherwise
5176
+ * we might have a false positive on the bounds.
5177
+ */
5178
+ if (read_seqcount_retry(&vmf.vma->vm_sequence, seq)) {
5179
+ trace_spf_vma_changed(_RET_IP_, vmf.vma, address);
5180
+ return VM_FAULT_RETRY;
5181
+ }
5182
+
5183
+ mem_cgroup_enter_user_fault();
5184
+ ret = handle_pte_fault(&vmf);
5185
+ mem_cgroup_exit_user_fault();
5186
+
5187
+ if (ret != VM_FAULT_RETRY) {
5188
+ if (vma_is_anonymous(vmf.vma))
5189
+ count_vm_event(SPECULATIVE_PGFAULT_ANON);
5190
+ else
5191
+ count_vm_event(SPECULATIVE_PGFAULT_FILE);
5192
+ }
5193
+
5194
+ /*
5195
+ * The task may have entered a memcg OOM situation but
5196
+ * if the allocation error was handled gracefully (no
5197
+ * VM_FAULT_OOM), there is no need to kill anything.
5198
+ * Just clean up the OOM state peacefully.
5199
+ */
5200
+ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
5201
+ mem_cgroup_oom_synchronize(false);
5202
+ return ret;
5203
+
5204
+out_walk:
5205
+ trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
5206
+ local_irq_enable();
5207
+ return VM_FAULT_RETRY;
5208
+
5209
+out_segv:
5210
+ trace_spf_vma_access(_RET_IP_, vmf.vma, address);
5211
+ return VM_FAULT_SIGSEGV;
5212
+}
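Everything in the speculative path hinges on the VMA sequence count: an odd value means a writer is active, and a value that changed across the walk means the VMA was modified, so the fault falls back to the locked path. A userspace sketch of that read-side pattern using C11 atomics; vma_model and its fields are inventions of the sketch, not the kernel's seqcount API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct vma_model {
	atomic_uint vm_sequence;	/* even: stable, odd: writer in progress */
	unsigned long vm_start, vm_end;
};

/* Read side of the seqcount pattern used by the speculative fault path:
 * sample the counter, bail out if a writer is active, do the speculative
 * work, then make sure the counter did not move underneath us. */
static bool speculative_lookup(struct vma_model *vma, unsigned long addr)
{
	unsigned int seq = atomic_load_explicit(&vma->vm_sequence, memory_order_acquire);

	if (seq & 1)					/* writer in progress */
		return false;				/* caller retries under the lock */

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return false;

	/* ... the speculative page-table walk would go here ... */

	return atomic_load_explicit(&vma->vm_sequence, memory_order_acquire) == seq;
}

int main(void)
{
	struct vma_model vma = { .vm_start = 0x1000, .vm_end = 0x9000 };

	atomic_init(&vma.vm_sequence, 0);
	printf("%d\n", speculative_lookup(&vma, 0x2000));	/* 1: nothing changed */
	atomic_fetch_add(&vma.vm_sequence, 1);			/* writer enters */
	printf("%d\n", speculative_lookup(&vma, 0x2000));	/* 0: retry path */
	return 0;
}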
5213
+
5214
+vm_fault_t __handle_speculative_fault(struct mm_struct *mm,
5215
+ unsigned long address, unsigned int flags,
5216
+ struct vm_area_struct **vma,
5217
+ struct pt_regs *regs)
5218
+{
5219
+ vm_fault_t ret;
5220
+
5221
+ check_sync_rss_stat(current);
5222
+
5223
+ *vma = get_vma(mm, address);
5224
+ if (!*vma)
5225
+ return VM_FAULT_RETRY;
5226
+
5227
+ ret = ___handle_speculative_fault(mm, address, flags, *vma);
5228
+
5229
+ /*
5230
+ * If there is no need to retry, don't return the vma to the caller.
5231
+ */
5232
+ if (ret != VM_FAULT_RETRY) {
5233
+ put_vma(*vma);
5234
+ *vma = NULL;
5235
+ mm_account_fault(regs, address, flags, ret);
5236
+ }
5237
+
5238
+ return ret;
5239
+}
5240
+
5241
+/*
5242
+ * This is used to know if the vma fetched in the speculative page fault handler
5243
+ * is still valid when trying the regular fault path while holding the
5244
+ * mmap_sem.
5245
+ * The call to put_vma(vma) must be made after checking the vma's fields, as
5246
+ * the vma may be freed by put_vma(). In such a case it is expected that false
5247
+ * is returned.
5248
+ */
5249
+bool can_reuse_spf_vma(struct vm_area_struct *vma, unsigned long address)
5250
+{
5251
+ bool ret;
5252
+
5253
+ ret = !RB_EMPTY_NODE(&vma->vm_rb) &&
5254
+ vma->vm_start <= address && address < vma->vm_end;
5255
+ put_vma(vma);
5256
+ return ret;
5257
+}
5258
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
5259
+
43355260 /*
43365261 * By the time we get here, we already hold the mm semaphore
43375262 *
4338
- * The mmap_sem may have been released depending on flags and our
5263
+ * The mmap_lock may have been released depending on flags and our
43395264 * return value. See filemap_fault() and __lock_page_or_retry().
43405265 */
43415266 vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
4342
- unsigned int flags)
5267
+ unsigned int flags, struct pt_regs *regs)
43435268 {
43445269 vm_fault_t ret;
43455270
....@@ -4379,6 +5304,8 @@
43795304 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
43805305 mem_cgroup_oom_synchronize(false);
43815306 }
5307
+
5308
+ mm_account_fault(regs, address, flags, ret);
43825309
43835310 return ret;
43845311 }
....@@ -4421,19 +5348,11 @@
44215348 smp_wmb(); /* See comment in __pte_alloc */
44225349
44235350 spin_lock(&mm->page_table_lock);
4424
-#ifndef __ARCH_HAS_5LEVEL_HACK
44255351 if (!p4d_present(*p4d)) {
44265352 mm_inc_nr_puds(mm);
44275353 p4d_populate(mm, p4d, new);
44285354 } else /* Another has populated it */
44295355 pud_free(mm, new);
4430
-#else
4431
- if (!pgd_present(*p4d)) {
4432
- mm_inc_nr_puds(mm);
4433
- pgd_populate(mm, p4d, new);
4434
- } else /* Another has populated it */
4435
- pud_free(mm, new);
4436
-#endif /* __ARCH_HAS_5LEVEL_HACK */
44375356 spin_unlock(&mm->page_table_lock);
44385357 return 0;
44395358 }
....@@ -4454,27 +5373,19 @@
44545373 smp_wmb(); /* See comment in __pte_alloc */
44555374
44565375 ptl = pud_lock(mm, pud);
4457
-#ifndef __ARCH_HAS_4LEVEL_HACK
44585376 if (!pud_present(*pud)) {
44595377 mm_inc_nr_pmds(mm);
44605378 pud_populate(mm, pud, new);
44615379 } else /* Another has populated it */
44625380 pmd_free(mm, new);
4463
-#else
4464
- if (!pgd_present(*pud)) {
4465
- mm_inc_nr_pmds(mm);
4466
- pgd_populate(mm, pud, new);
4467
- } else /* Another has populated it */
4468
- pmd_free(mm, new);
4469
-#endif /* __ARCH_HAS_4LEVEL_HACK */
44705381 spin_unlock(ptl);
44715382 return 0;
44725383 }
44735384 #endif /* __PAGETABLE_PMD_FOLDED */
44745385
4475
-static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4476
- unsigned long *start, unsigned long *end,
4477
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
5386
+int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
5387
+ struct mmu_notifier_range *range, pte_t **ptepp,
5388
+ pmd_t **pmdpp, spinlock_t **ptlp)
44785389 {
44795390 pgd_t *pgd;
44805391 p4d_t *p4d;
....@@ -4501,10 +5412,11 @@
45015412 if (!pmdpp)
45025413 goto out;
45035414
4504
- if (start && end) {
4505
- *start = address & PMD_MASK;
4506
- *end = *start + PMD_SIZE;
4507
- mmu_notifier_invalidate_range_start(mm, *start, *end);
5415
+ if (range) {
5416
+ mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
5417
+ NULL, mm, address & PMD_MASK,
5418
+ (address & PMD_MASK) + PMD_SIZE);
5419
+ mmu_notifier_invalidate_range_start(range);
45085420 }
45095421 *ptlp = pmd_lock(mm, pmd);
45105422 if (pmd_huge(*pmd)) {
....@@ -4512,17 +5424,18 @@
45125424 return 0;
45135425 }
45145426 spin_unlock(*ptlp);
4515
- if (start && end)
4516
- mmu_notifier_invalidate_range_end(mm, *start, *end);
5427
+ if (range)
5428
+ mmu_notifier_invalidate_range_end(range);
45175429 }
45185430
45195431 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
45205432 goto out;
45215433
4522
- if (start && end) {
4523
- *start = address & PAGE_MASK;
4524
- *end = *start + PAGE_SIZE;
4525
- mmu_notifier_invalidate_range_start(mm, *start, *end);
5434
+ if (range) {
5435
+ mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
5436
+ address & PAGE_MASK,
5437
+ (address & PAGE_MASK) + PAGE_SIZE);
5438
+ mmu_notifier_invalidate_range_start(range);
45265439 }
45275440 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
45285441 if (!pte_present(*ptep))
....@@ -4531,37 +5444,39 @@
45315444 return 0;
45325445 unlock:
45335446 pte_unmap_unlock(ptep, *ptlp);
4534
- if (start && end)
4535
- mmu_notifier_invalidate_range_end(mm, *start, *end);
5447
+ if (range)
5448
+ mmu_notifier_invalidate_range_end(range);
45365449 out:
45375450 return -EINVAL;
45385451 }
45395452
4540
-static inline int follow_pte(struct mm_struct *mm, unsigned long address,
4541
- pte_t **ptepp, spinlock_t **ptlp)
5453
+/**
5454
+ * follow_pte - look up PTE at a user virtual address
5455
+ * @mm: the mm_struct of the target address space
5456
+ * @address: user virtual address
5457
+ * @ptepp: location to store found PTE
5458
+ * @ptlp: location to store the lock for the PTE
5459
+ *
5460
+ * On a successful return, the pointer to the PTE is stored in @ptepp;
5461
+ * the corresponding lock is taken and its location is stored in @ptlp.
5462
+ * The contents of the PTE are only stable until @ptlp is released;
5463
+ * any further use, if any, must be protected against invalidation
5464
+ * with MMU notifiers.
5465
+ *
5466
+ * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
5467
+ * should be taken for read.
5468
+ *
5469
+ * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
5470
+ * it is not a good general-purpose API.
5471
+ *
5472
+ * Return: zero on success, -ve otherwise.
5473
+ */
5474
+int follow_pte(struct mm_struct *mm, unsigned long address,
5475
+ pte_t **ptepp, spinlock_t **ptlp)
45425476 {
4543
- int res;
4544
-
4545
- /* (void) is needed to make gcc happy */
4546
- (void) __cond_lock(*ptlp,
4547
- !(res = __follow_pte_pmd(mm, address, NULL, NULL,
4548
- ptepp, NULL, ptlp)));
4549
- return res;
5477
+ return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
45505478 }
4551
-
4552
-int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4553
- unsigned long *start, unsigned long *end,
4554
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4555
-{
4556
- int res;
4557
-
4558
- /* (void) is needed to make gcc happy */
4559
- (void) __cond_lock(*ptlp,
4560
- !(res = __follow_pte_pmd(mm, address, start, end,
4561
- ptepp, pmdpp, ptlp)));
4562
- return res;
4563
-}
4564
-EXPORT_SYMBOL(follow_pte_pmd);
5479
+EXPORT_SYMBOL_GPL(follow_pte);
45655480
45665481 /**
45675482 * follow_pfn - look up PFN at a user virtual address
....@@ -4571,7 +5486,10 @@
45715486 *
45725487 * Only IO mappings and raw PFN mappings are allowed.
45735488 *
4574
- * Returns zero and the pfn at @pfn on success, -ve otherwise.
5489
+ * This function does not allow the caller to read the permissions
5490
+ * of the PTE. Do not use it.
5491
+ *
5492
+ * Return: zero and the pfn at @pfn on success, -ve otherwise.
45755493 */
45765494 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
45775495 unsigned long *pfn)
....@@ -4658,7 +5576,7 @@
46585576 void *old_buf = buf;
46595577 int write = gup_flags & FOLL_WRITE;
46605578
4661
- if (down_read_killable(&mm->mmap_sem))
5579
+ if (mmap_read_lock_killable(mm))
46625580 return 0;
46635581
46645582 /* ignore errors, just check how much was successfully transferred */
....@@ -4667,7 +5585,7 @@
46675585 void *maddr;
46685586 struct page *page = NULL;
46695587
4670
- ret = get_user_pages_remote(tsk, mm, addr, 1,
5588
+ ret = get_user_pages_remote(mm, addr, 1,
46715589 gup_flags, &page, &vma, NULL);
46725590 if (ret <= 0) {
46735591 #ifndef CONFIG_HAVE_IOREMAP_PROT
....@@ -4703,13 +5621,13 @@
47035621 buf, maddr + offset, bytes);
47045622 }
47055623 kunmap(page);
4706
- put_page(page);
5624
+ put_user_page(page);
47075625 }
47085626 len -= bytes;
47095627 buf += bytes;
47105628 addr += bytes;
47115629 }
4712
- up_read(&mm->mmap_sem);
5630
+ mmap_read_unlock(mm);
47135631
47145632 return buf - old_buf;
47155633 }
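__access_remote_vm() is the kernel side of cross-address-space memory access; the closest userspace analogue is the process_vm_readv() syscall, which likewise copies byte ranges between processes without sharing mappings. A small hedged example (error handling trimmed; the target pid and address come from the command line and must be ptrace-permitted):

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <unistd.h>

/* Read 'len' bytes at 'remote_addr' in process 'pid' into 'buf', much as
 * a debugger would; returns bytes copied or -1 on error. */
static ssize_t read_remote(pid_t pid, void *remote_addr, void *buf, size_t len)
{
	struct iovec local = { .iov_base = buf, .iov_len = len };
	struct iovec remote = { .iov_base = remote_addr, .iov_len = len };

	return process_vm_readv(pid, &local, 1, &remote, 1, 0);
}

int main(int argc, char **argv)
{
	char buf[64] = { 0 };

	if (argc < 3)
		return 1;
	if (read_remote((pid_t)atol(argv[1]),
			(void *)strtoul(argv[2], NULL, 0),
			buf, sizeof(buf) - 1) < 0) {
		perror("process_vm_readv");
		return 1;
	}
	printf("%.63s\n", buf);
	return 0;
}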
....@@ -4723,6 +5641,8 @@
47235641 * @gup_flags: flags modifying lookup behaviour
47245642 *
47255643 * The caller must hold a reference on @mm.
5644
+ *
5645
+ * Return: number of bytes copied from source to destination.
47265646 */
47275647 int access_remote_vm(struct mm_struct *mm, unsigned long addr,
47285648 void *buf, int len, unsigned int gup_flags)
....@@ -4764,7 +5684,7 @@
47645684 /*
47655685 * we might be running from an atomic context so we cannot sleep
47665686 */
4767
- if (!down_read_trylock(&mm->mmap_sem))
5687
+ if (!mmap_read_trylock(mm))
47685688 return;
47695689
47705690 vma = find_vma(mm, ip);
....@@ -4783,7 +5703,7 @@
47835703 free_page((unsigned long)buf);
47845704 }
47855705 }
4786
- up_read(&mm->mmap_sem);
5706
+ mmap_read_unlock(mm);
47875707 }
47885708
47895709 #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
....@@ -4791,7 +5711,7 @@
47915711 {
47925712 /*
47935713 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
4794
- * holding the mmap_sem, this is safe because kernel memory doesn't
5714
+ * holding the mmap_lock, this is safe because kernel memory doesn't
47955715 * get paged out, therefore we'll never actually fault, and the
47965716 * below annotations will generate false positives.
47975717 */
....@@ -4802,7 +5722,7 @@
48025722 __might_sleep(file, line, 0);
48035723 #if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
48045724 if (current->mm)
4805
- might_lock_read(&current->mm->mmap_sem);
5725
+ might_lock_read(&current->mm->mmap_lock);
48065726 #endif
48075727 }
48085728 EXPORT_SYMBOL(__might_fault);
....@@ -4979,6 +5899,8 @@
49795899 if (rc)
49805900 break;
49815901
5902
+ flush_dcache_page(subpage);
5903
+
49825904 cond_resched();
49835905 }
49845906 return ret_val;