~hc/RK356X_SDK_RELEASE.git

..	..	@@ -1,3 +1,4 @@
	1	+// SPDX-License-Identifier: GPL-2.0-only
1	2	/*
2	3	* linux/mm/memory.c
3	4	*
..	..	@@ -64,11 +65,15 @@
64	65	#include <linux/gfp.h>
65	66	#include <linux/migrate.h>
66	67	#include <linux/string.h>
67		-#include <linux/dma-debug.h>
68	68	#include <linux/debugfs.h>
69	69	#include <linux/userfaultfd_k.h>
70	70	#include <linux/dax.h>
71	71	#include <linux/oom.h>
	72	+#include <linux/numa.h>
	73	+#include <linux/perf_event.h>
	74	+#include <linux/ptrace.h>
	75	+#include <linux/vmalloc.h>
	76	+#include <trace/hooks/mm.h>
72	77
73	78	#include <trace/events/kmem.h>
74	79
..	..	@@ -78,9 +83,13 @@
78	83	#include <linux/uaccess.h>
79	84	#include <asm/tlb.h>
80	85	#include <asm/tlbflush.h>
81		-#include <asm/pgtable.h>
82	86
	87	+#include "pgalloc-track.h"
83	88	#include "internal.h"
	89	+#include <trace/hooks/mm.h>
	90	+
	91	+#define CREATE_TRACE_POINTS
	92	+#include <trace/events/pagefault.h>
84	93
85	94	#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
86	95	#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
..	..	@@ -127,6 +136,18 @@
127	136	* will be hit on old pte.
128	137	*/
129	138	return true;
	139	+}
	140	+#endif
	141	+
	142	+#ifndef arch_wants_old_prefaulted_pte
	143	+static inline bool arch_wants_old_prefaulted_pte(void)
	144	+{
	145	+ /*
	146	+ * Transitioning a PTE from 'old' to 'young' can be expensive on
	147	+ * some architectures, even if it's performed in hardware. By
	148	+ * default, "false" means prefaulted entries will be 'young'.
	149	+ */
	150	+ return false;
130	151	}
131	152	#endif
132	153
..	..	@@ -216,263 +237,6 @@
216	237	}
217	238
218	239	#endif /* SPLIT_RSS_COUNTING */
219		-
220		-#ifdef HAVE_GENERIC_MMU_GATHER
221		-
222		-static bool tlb_next_batch(struct mmu_gather *tlb)
223		-{
224		- struct mmu_gather_batch *batch;
225		-
226		- batch = tlb->active;
227		- if (batch->next) {
228		- tlb->active = batch->next;
229		- return true;
230		- }
231		-
232		- if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
233		- return false;
234		-
235		- batch = (void *)__get_free_pages(GFP_NOWAIT \| __GFP_NOWARN, 0);
236		- if (!batch)
237		- return false;
238		-
239		- tlb->batch_count++;
240		- batch->next = NULL;
241		- batch->nr = 0;
242		- batch->max = MAX_GATHER_BATCH;
243		-
244		- tlb->active->next = batch;
245		- tlb->active = batch;
246		-
247		- return true;
248		-}
249		-
250		-void arch_tlb_gather_mmu(struct mmu_gather tlb, struct mm_struct mm,
251		- unsigned long start, unsigned long end)
252		-{
253		- tlb->mm = mm;
254		-
255		- /* Is it from 0 to ~0? */
256		- tlb->fullmm = !(start \| (end+1));
257		- tlb->need_flush_all = 0;
258		- tlb->local.next = NULL;
259		- tlb->local.nr = 0;
260		- tlb->local.max = ARRAY_SIZE(tlb->__pages);
261		- tlb->active = &tlb->local;
262		- tlb->batch_count = 0;
263		-
264		-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
265		- tlb->batch = NULL;
266		-#endif
267		- tlb->page_size = 0;
268		-
269		- __tlb_reset_range(tlb);
270		-}
271		-
272		-static void tlb_flush_mmu_free(struct mmu_gather *tlb)
273		-{
274		- struct mmu_gather_batch *batch;
275		-
276		-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
277		- tlb_table_flush(tlb);
278		-#endif
279		- for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
280		- free_pages_and_swap_cache(batch->pages, batch->nr);
281		- batch->nr = 0;
282		- }
283		- tlb->active = &tlb->local;
284		-}
285		-
286		-void tlb_flush_mmu(struct mmu_gather *tlb)
287		-{
288		- tlb_flush_mmu_tlbonly(tlb);
289		- tlb_flush_mmu_free(tlb);
290		-}
291		-
292		-/* tlb_finish_mmu
293		- * Called at the end of the shootdown operation to free up any resources
294		- * that were required.
295		- */
296		-void arch_tlb_finish_mmu(struct mmu_gather *tlb,
297		- unsigned long start, unsigned long end, bool force)
298		-{
299		- struct mmu_gather_batch batch, next;
300		-
301		- if (force)
302		- __tlb_adjust_range(tlb, start, end - start);
303		-
304		- tlb_flush_mmu(tlb);
305		-
306		- /* keep the page table cache within bounds */
307		- check_pgt_cache();
308		-
309		- for (batch = tlb->local.next; batch; batch = next) {
310		- next = batch->next;
311		- free_pages((unsigned long)batch, 0);
312		- }
313		- tlb->local.next = NULL;
314		-}
315		-
316		-/* __tlb_remove_page
317		- * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
318		- * handling the additional races in SMP caused by other CPUs caching valid
319		- * mappings in their TLBs. Returns the number of free page slots left.
320		- * When out of page slots we must call tlb_flush_mmu().
321		- *returns true if the caller should flush.
322		- */
323		-bool __tlb_remove_page_size(struct mmu_gather tlb, struct page page, int page_size)
324		-{
325		- struct mmu_gather_batch *batch;
326		-
327		- VM_BUG_ON(!tlb->end);
328		- VM_WARN_ON(tlb->page_size != page_size);
329		-
330		- batch = tlb->active;
331		- /*
332		- * Add the page and check if we are full. If so
333		- * force a flush.
334		- */
335		- batch->pages[batch->nr++] = page;
336		- if (batch->nr == batch->max) {
337		- if (!tlb_next_batch(tlb))
338		- return true;
339		- batch = tlb->active;
340		- }
341		- VM_BUG_ON_PAGE(batch->nr > batch->max, page);
342		-
343		- return false;
344		-}
345		-
346		-void tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address,
347		- unsigned long size)
348		-{
349		- if (tlb->page_size != 0 && tlb->page_size != PMD_SIZE)
350		- tlb_flush_mmu(tlb);
351		-
352		- tlb->page_size = PMD_SIZE;
353		- tlb->start = min(tlb->start, address);
354		- tlb->end = max(tlb->end, address + size);
355		-}
356		-#endif /* HAVE_GENERIC_MMU_GATHER */
357		-
358		-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
359		-
360		-/*
361		- * See the comment near struct mmu_table_batch.
362		- */
363		-
364		-/*
365		- * If we want tlb_remove_table() to imply TLB invalidates.
366		- */
367		-static inline void tlb_table_invalidate(struct mmu_gather *tlb)
368		-{
369		-#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
370		- /*
371		- * Invalidate page-table caches used by hardware walkers. Then we still
372		- * need to RCU-sched wait while freeing the pages because software
373		- * walkers can still be in-flight.
374		- */
375		- tlb_flush_mmu_tlbonly(tlb);
376		-#endif
377		-}
378		-
379		-static void tlb_remove_table_smp_sync(void *arg)
380		-{
381		- /* Simply deliver the interrupt */
382		-}
383		-
384		-static void tlb_remove_table_one(void *table)
385		-{
386		- /*
387		- * This isn't an RCU grace period and hence the page-tables cannot be
388		- * assumed to be actually RCU-freed.
389		- *
390		- * It is however sufficient for software page-table walkers that rely on
391		- * IRQ disabling. See the comment near struct mmu_table_batch.
392		- */
393		- smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
394		- __tlb_remove_table(table);
395		-}
396		-
397		-static void tlb_remove_table_rcu(struct rcu_head *head)
398		-{
399		- struct mmu_table_batch *batch;
400		- int i;
401		-
402		- batch = container_of(head, struct mmu_table_batch, rcu);
403		-
404		- for (i = 0; i < batch->nr; i++)
405		- __tlb_remove_table(batch->tables[i]);
406		-
407		- free_page((unsigned long)batch);
408		-}
409		-
410		-void tlb_table_flush(struct mmu_gather *tlb)
411		-{
412		- struct mmu_table_batch **batch = &tlb->batch;
413		-
414		- if (*batch) {
415		- tlb_table_invalidate(tlb);
416		- call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
417		- *batch = NULL;
418		- }
419		-}
420		-
421		-void tlb_remove_table(struct mmu_gather tlb, void table)
422		-{
423		- struct mmu_table_batch **batch = &tlb->batch;
424		-
425		- if (*batch == NULL) {
426		- batch = (struct mmu_table_batch )__get_free_page(GFP_NOWAIT \| __GFP_NOWARN);
427		- if (*batch == NULL) {
428		- tlb_table_invalidate(tlb);
429		- tlb_remove_table_one(table);
430		- return;
431		- }
432		- (*batch)->nr = 0;
433		- }
434		-
435		- (batch)->tables[(batch)->nr++] = table;
436		- if ((*batch)->nr == MAX_TABLE_BATCH)
437		- tlb_table_flush(tlb);
438		-}
439		-
440		-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
441		-
442		-/**
443		- * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
444		- * @tlb: the mmu_gather structure to initialize
445		- * @mm: the mm_struct of the target address space
446		- * @start: start of the region that will be removed from the page-table
447		- * @end: end of the region that will be removed from the page-table
448		- *
449		- * Called to initialize an (on-stack) mmu_gather structure for page-table
450		- * tear-down from @mm. The @start and @end are set to 0 and -1
451		- * respectively when @mm is without users and we're going to destroy
452		- * the full address space (exit/execve).
453		- */
454		-void tlb_gather_mmu(struct mmu_gather tlb, struct mm_struct mm,
455		- unsigned long start, unsigned long end)
456		-{
457		- arch_tlb_gather_mmu(tlb, mm, start, end);
458		- inc_tlb_flush_pending(tlb->mm);
459		-}
460		-
461		-void tlb_finish_mmu(struct mmu_gather *tlb,
462		- unsigned long start, unsigned long end)
463		-{
464		- /*
465		- * If there are parallel threads are doing PTE changes on same range
466		- * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
467		- * flush by batching, a thread has stable TLB entry can fail to flush
468		- * the TLB by observing pte_none\|!pte_dirty, for example so flush TLB
469		- * forcefully if we detect parallel PTE batching threads.
470		- */
471		- bool force = mm_tlb_flush_nested(tlb->mm);
472		-
473		- arch_tlb_finish_mmu(tlb, start, end, force);
474		- dec_tlb_flush_pending(tlb->mm);
475		-}
476	240
477	241	/*
478	242	* Note: this doesn't free the actual pages themselves. That
..	..	@@ -643,7 +407,7 @@
643	407	* We add page table cache pages with PAGE_SIZE,
644	408	* (see pte_free_tlb()), flush the tlb if we need
645	409	*/
646		- tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
	410	+ tlb_change_page_size(tlb, PAGE_SIZE);
647	411	pgd = pgd_offset(tlb->mm, addr);
648	412	do {
649	413	next = pgd_addr_end(addr, end);
..	..	@@ -664,7 +428,9 @@
664	428	* Hide vma from rmap and truncate_pagecache before freeing
665	429	* pgtables
666	430	*/
	431	+ vm_write_begin(vma);
667	432	unlink_anon_vmas(vma);
	433	+ vm_write_end(vma);
668	434	unlink_file_vma(vma);
669	435
670	436	if (is_vm_hugetlb_page(vma)) {
..	..	@@ -678,7 +444,9 @@
678	444	&& !is_vm_hugetlb_page(next)) {
679	445	vma = next;
680	446	next = vma->vm_next;
	447	+ vm_write_begin(vma);
681	448	unlink_anon_vmas(vma);
	449	+ vm_write_end(vma);
682	450	unlink_file_vma(vma);
683	451	}
684	452	free_pgd_range(tlb, addr, vma->vm_end,
..	..	@@ -688,10 +456,10 @@
688	456	}
689	457	}
690	458
691		-int __pte_alloc(struct mm_struct mm, pmd_t pmd, unsigned long address)
	459	+int __pte_alloc(struct mm_struct mm, pmd_t pmd)
692	460	{
693	461	spinlock_t *ptl;
694		- pgtable_t new = pte_alloc_one(mm, address);
	462	+ pgtable_t new = pte_alloc_one(mm);
695	463	if (!new)
696	464	return -ENOMEM;
697	465
..	..	@@ -706,7 +474,7 @@
706	474	* of a chain of data-dependent loads, meaning most CPUs (alpha
707	475	* being the notable exception) will already guarantee loads are
708	476	* seen in-order. See the alpha page table accessors for the
709		- * smp_read_barrier_depends() barriers in page table walking code.
	477	+ * smp_rmb() barriers in page table walking code.
710	478	*/
711	479	smp_wmb(); /* Could be smp_wmb__xxx(before\|after)_spin_lock */
712	480
..	..	@@ -722,9 +490,9 @@
722	490	return 0;
723	491	}
724	492
725		-int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
	493	+int __pte_alloc_kernel(pmd_t *pmd)
726	494	{
727		- pte_t *new = pte_alloc_one_kernel(&init_mm, address);
	495	+ pte_t *new = pte_alloc_one_kernel(&init_mm);
728	496	if (!new)
729	497	return -ENOMEM;
730	498
..	..	@@ -804,9 +572,9 @@
804	572	(long long)pte_val(pte), (long long)pmd_val(*pmd));
805	573	if (page)
806	574	dump_page(page, "bad pte");
807		- pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
808		- (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
809		- pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
	575	+ pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
	576	+ (void *)addr, READ_ONCE(vma->vm_flags), vma->anon_vma, mapping, index);
	577	+ pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
810	578	vma->vm_file,
811	579	vma->vm_ops ? vma->vm_ops->fault : NULL,
812	580	vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
..	..	@@ -816,7 +584,8 @@
816	584	}
817	585
818	586	/*
819		- * vm_normal_page -- This function gets the "struct page" associated with a pte.
	587	+ * __vm_normal_page -- This function gets the "struct page" associated with
	588	+ * a pte.
820	589	*
821	590	* "Special" mappings do not wish to be associated with a "struct page" (either
822	591	* it doesn't exist, or it exists but they don't want to touch it). In this
..	..	@@ -858,7 +627,7 @@
858	627	*
859	628	*/
860	629	struct page _vm_normal_page(struct vm_area_struct vma, unsigned long addr,
861		- pte_t pte, bool with_public_device)
	630	+ pte_t pte, unsigned long vma_flags)
862	631	{
863	632	unsigned long pfn = pte_pfn(pte);
864	633
..	..	@@ -867,33 +636,10 @@
867	636	goto check_pfn;
868	637	if (vma->vm_ops && vma->vm_ops->find_special_page)
869	638	return vma->vm_ops->find_special_page(vma, addr);
870		- if (vma->vm_flags & (VM_PFNMAP \| VM_MIXEDMAP))
	639	+ if (vma_flags & (VM_PFNMAP \| VM_MIXEDMAP))
871	640	return NULL;
872	641	if (is_zero_pfn(pfn))
873	642	return NULL;
874		-
875		- /*
876		- * Device public pages are special pages (they are ZONE_DEVICE
877		- * pages but different from persistent memory). They behave
878		- * allmost like normal pages. The difference is that they are
879		- * not on the lru and thus should never be involve with any-
880		- * thing that involve lru manipulation (mlock, numa balancing,
881		- * ...).
882		- *
883		- * This is why we still want to return NULL for such page from
884		- * vm_normal_page() so that we do not have to special case all
885		- * call site of vm_normal_page().
886		- */
887		- if (likely(pfn <= highest_memmap_pfn)) {
888		- struct page *page = pfn_to_page(pfn);
889		-
890		- if (is_device_public_page(page)) {
891		- if (with_public_device)
892		- return page;
893		- return NULL;
894		- }
895		- }
896		-
897	643	if (pte_devmap(pte))
898	644	return NULL;
899	645
..	..	@@ -902,9 +648,13 @@
902	648	}
903	649
904	650	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
	651	+ /*
	652	+ * This part should never get called when CONFIG_SPECULATIVE_PAGE_FAULT
	653	+ * is set. This is mainly because we can't rely on vm_start.
	654	+ */
905	655
906		- if (unlikely(vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP))) {
907		- if (vma->vm_flags & VM_MIXEDMAP) {
	656	+ if (unlikely(vma_flags & (VM_PFNMAP\|VM_MIXEDMAP))) {
	657	+ if (vma_flags & VM_MIXEDMAP) {
908	658	if (!pfn_valid(pfn))
909	659	return NULL;
910	660	goto out;
..	..	@@ -913,7 +663,7 @@
913	663	off = (addr - vma->vm_start) >> PAGE_SHIFT;
914	664	if (pfn == vma->vm_pgoff + off)
915	665	return NULL;
916		- if (!is_cow_mapping(vma->vm_flags))
	666	+ if (!is_cow_mapping(vma_flags))
917	667	return NULL;
918	668	}
919	669	}
..	..	@@ -963,7 +713,7 @@
963	713
964	714	if (pmd_devmap(pmd))
965	715	return NULL;
966		- if (is_zero_pfn(pfn))
	716	+ if (is_huge_zero_pmd(pmd))
967	717	return NULL;
968	718	if (unlikely(pfn > highest_memmap_pfn))
969	719	return NULL;
..	..	@@ -983,80 +733,197 @@
983	733	* covered by this vma.
984	734	*/
985	735
986		-static inline unsigned long
987		-copy_one_pte(struct mm_struct dst_mm, struct mm_struct src_mm,
988		- pte_t dst_pte, pte_t src_pte, struct vm_area_struct *vma,
989		- unsigned long addr, int *rss)
	736	+static unsigned long
	737	+copy_nonpresent_pte(struct mm_struct dst_mm, struct mm_struct src_mm,
	738	+ pte_t dst_pte, pte_t src_pte, struct vm_area_struct *dst_vma,
	739	+ struct vm_area_struct src_vma, unsigned long addr, int rss)
990	740	{
991		- unsigned long vm_flags = vma->vm_flags;
	741	+ unsigned long vm_flags = dst_vma->vm_flags;
	742	+ pte_t pte = *src_pte;
	743	+ struct page *page;
	744	+ swp_entry_t entry = pte_to_swp_entry(pte);
	745	+
	746	+ if (likely(!non_swap_entry(entry))) {
	747	+ if (swap_duplicate(entry) < 0)
	748	+ return entry.val;
	749	+
	750	+ /* make sure dst_mm is on swapoff's mmlist. */
	751	+ if (unlikely(list_empty(&dst_mm->mmlist))) {
	752	+ spin_lock(&mmlist_lock);
	753	+ if (list_empty(&dst_mm->mmlist))
	754	+ list_add(&dst_mm->mmlist,
	755	+ &src_mm->mmlist);
	756	+ spin_unlock(&mmlist_lock);
	757	+ }
	758	+ rss[MM_SWAPENTS]++;
	759	+ } else if (is_migration_entry(entry)) {
	760	+ page = migration_entry_to_page(entry);
	761	+
	762	+ rss[mm_counter(page)]++;
	763	+
	764	+ if (is_write_migration_entry(entry) &&
	765	+ is_cow_mapping(vm_flags)) {
	766	+ /*
	767	+ * COW mappings require pages in both
	768	+ * parent and child to be set to read.
	769	+ */
	770	+ make_migration_entry_read(&entry);
	771	+ pte = swp_entry_to_pte(entry);
	772	+ if (pte_swp_soft_dirty(*src_pte))
	773	+ pte = pte_swp_mksoft_dirty(pte);
	774	+ if (pte_swp_uffd_wp(*src_pte))
	775	+ pte = pte_swp_mkuffd_wp(pte);
	776	+ set_pte_at(src_mm, addr, src_pte, pte);
	777	+ }
	778	+ } else if (is_device_private_entry(entry)) {
	779	+ page = device_private_entry_to_page(entry);
	780	+
	781	+ /*
	782	+ * Update rss count even for unaddressable pages, as
	783	+ * they should be treated just like normal pages in this
	784	+ * respect.
	785	+ *
	786	+ * We will likely want to have some new rss counters
	787	+ * for unaddressable pages, at some point. But for now
	788	+ * keep things as they are.
	789	+ */
	790	+ get_page(page);
	791	+ rss[mm_counter(page)]++;
	792	+ page_dup_rmap(page, false);
	793	+
	794	+ /*
	795	+ * We do not preserve soft-dirty information, because so
	796	+ * far, checkpoint/restore is the only feature that
	797	+ * requires that. And checkpoint/restore does not work
	798	+ * when a device driver is involved (you cannot easily
	799	+ * save and restore device driver state).
	800	+ */
	801	+ if (is_write_device_private_entry(entry) &&
	802	+ is_cow_mapping(vm_flags)) {
	803	+ make_device_private_entry_read(&entry);
	804	+ pte = swp_entry_to_pte(entry);
	805	+ if (pte_swp_uffd_wp(*src_pte))
	806	+ pte = pte_swp_mkuffd_wp(pte);
	807	+ set_pte_at(src_mm, addr, src_pte, pte);
	808	+ }
	809	+ }
	810	+ if (!userfaultfd_wp(dst_vma))
	811	+ pte = pte_swp_clear_uffd_wp(pte);
	812	+ set_pte_at(dst_mm, addr, dst_pte, pte);
	813	+ return 0;
	814	+}
	815	+
	816	+/*
	817	+ * Copy a present and normal page if necessary.
	818	+ *
	819	+ * NOTE! The usual case is that this doesn't need to do
	820	+ * anything, and can just return a positive value. That
	821	+ * will let the caller know that it can just increase
	822	+ * the page refcount and re-use the pte the traditional
	823	+ * way.
	824	+ *
	825	+ * But _if_ we need to copy it because it needs to be
	826	+ * pinned in the parent (and the child should get its own
	827	+ * copy rather than just a reference to the same page),
	828	+ * we'll do that here and return zero to let the caller
	829	+ * know we're done.
	830	+ *
	831	+ * And if we need a pre-allocated page but don't yet have
	832	+ * one, return a negative error to let the preallocation
	833	+ * code know so that it can do so outside the page table
	834	+ * lock.
	835	+ */
	836	+static inline int
	837	+copy_present_page(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
	838	+ pte_t dst_pte, pte_t src_pte, unsigned long addr, int *rss,
	839	+ struct page *prealloc, pte_t pte, struct page page)
	840	+{
	841	+ struct mm_struct *src_mm = src_vma->vm_mm;
	842	+ struct page *new_page;
	843	+
	844	+ if (!is_cow_mapping(src_vma->vm_flags))
	845	+ return 1;
	846	+
	847	+ /*
	848	+ * What we want to do is to check whether this page may
	849	+ * have been pinned by the parent process. If so,
	850	+ * instead of wrprotect the pte on both sides, we copy
	851	+ * the page immediately so that we'll always guarantee
	852	+ * the pinned page won't be randomly replaced in the
	853	+ * future.
	854	+ *
	855	+ * The page pinning checks are just "has this mm ever
	856	+ * seen pinning", along with the (inexact) check of
	857	+ * the page count. That might give false positives for
	858	+ * pinning, but it will work correctly.
	859	+ */
	860	+ if (likely(!atomic_read(&src_mm->has_pinned)))
	861	+ return 1;
	862	+ if (likely(!page_maybe_dma_pinned(page)))
	863	+ return 1;
	864	+
	865	+ /*
	866	+ * The vma->anon_vma of the child process may be NULL
	867	+ * because the entire vma does not contain anonymous pages.
	868	+ * A BUG will occur when the copy_present_page() passes
	869	+ * a copy of a non-anonymous page of that vma to the
	870	+ * page_add_new_anon_rmap() to set up new anonymous rmap.
	871	+ * Return 1 if the page is not an anonymous page.
	872	+ */
	873	+ if (!PageAnon(page))
	874	+ return 1;
	875	+
	876	+ new_page = *prealloc;
	877	+ if (!new_page)
	878	+ return -EAGAIN;
	879	+
	880	+ /*
	881	+ * We have a prealloc page, all good! Take it
	882	+ * over and copy the page & arm it.
	883	+ */
	884	+ *prealloc = NULL;
	885	+ copy_user_highpage(new_page, page, addr, src_vma);
	886	+ __SetPageUptodate(new_page);
	887	+ page_add_new_anon_rmap(new_page, dst_vma, addr, false);
	888	+ lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
	889	+ rss[mm_counter(new_page)]++;
	890	+
	891	+ /* All done, just insert the new page copy in the child */
	892	+ pte = mk_pte(new_page, dst_vma->vm_page_prot);
	893	+ pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma->vm_flags);
	894	+ if (userfaultfd_pte_wp(dst_vma, *src_pte))
	895	+ /* Uffd-wp needs to be delivered to dest pte as well */
	896	+ pte = pte_wrprotect(pte_mkuffd_wp(pte));
	897	+ set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
	898	+ return 0;
	899	+}
	900	+
	901	+/*
	902	+ * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
	903	+ * is required to copy this pte.
	904	+ */
	905	+static inline int
	906	+copy_present_pte(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
	907	+ pte_t dst_pte, pte_t src_pte, unsigned long addr, int *rss,
	908	+ struct page **prealloc)
	909	+{
	910	+ struct mm_struct *src_mm = src_vma->vm_mm;
	911	+ unsigned long vm_flags = src_vma->vm_flags;
992	912	pte_t pte = *src_pte;
993	913	struct page *page;
994	914
995		- /* pte contains position in swap or file, so copy. */
996		- if (unlikely(!pte_present(pte))) {
997		- swp_entry_t entry = pte_to_swp_entry(pte);
	915	+ page = vm_normal_page(src_vma, addr, pte);
	916	+ if (page) {
	917	+ int retval;
998	918
999		- if (likely(!non_swap_entry(entry))) {
1000		- if (swap_duplicate(entry) < 0)
1001		- return entry.val;
	919	+ retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
	920	+ addr, rss, prealloc, pte, page);
	921	+ if (retval <= 0)
	922	+ return retval;
1002	923
1003		- /* make sure dst_mm is on swapoff's mmlist. */
1004		- if (unlikely(list_empty(&dst_mm->mmlist))) {
1005		- spin_lock(&mmlist_lock);
1006		- if (list_empty(&dst_mm->mmlist))
1007		- list_add(&dst_mm->mmlist,
1008		- &src_mm->mmlist);
1009		- spin_unlock(&mmlist_lock);
1010		- }
1011		- rss[MM_SWAPENTS]++;
1012		- } else if (is_migration_entry(entry)) {
1013		- page = migration_entry_to_page(entry);
1014		-
1015		- rss[mm_counter(page)]++;
1016		-
1017		- if (is_write_migration_entry(entry) &&
1018		- is_cow_mapping(vm_flags)) {
1019		- /*
1020		- * COW mappings require pages in both
1021		- * parent and child to be set to read.
1022		- */
1023		- make_migration_entry_read(&entry);
1024		- pte = swp_entry_to_pte(entry);
1025		- if (pte_swp_soft_dirty(*src_pte))
1026		- pte = pte_swp_mksoft_dirty(pte);
1027		- set_pte_at(src_mm, addr, src_pte, pte);
1028		- }
1029		- } else if (is_device_private_entry(entry)) {
1030		- page = device_private_entry_to_page(entry);
1031		-
1032		- /*
1033		- * Update rss count even for unaddressable pages, as
1034		- * they should treated just like normal pages in this
1035		- * respect.
1036		- *
1037		- * We will likely want to have some new rss counters
1038		- * for unaddressable pages, at some point. But for now
1039		- * keep things as they are.
1040		- */
1041		- get_page(page);
1042		- rss[mm_counter(page)]++;
1043		- page_dup_rmap(page, false);
1044		-
1045		- /*
1046		- * We do not preserve soft-dirty information, because so
1047		- * far, checkpoint/restore is the only feature that
1048		- * requires that. And checkpoint/restore does not work
1049		- * when a device driver is involved (you cannot easily
1050		- * save and restore device driver state).
1051		- */
1052		- if (is_write_device_private_entry(entry) &&
1053		- is_cow_mapping(vm_flags)) {
1054		- make_device_private_entry_read(&entry);
1055		- pte = swp_entry_to_pte(entry);
1056		- set_pte_at(src_mm, addr, src_pte, pte);
1057		- }
1058		- }
1059		- goto out_set_pte;
	924	+ get_page(page);
	925	+ page_dup_rmap(page, false);
	926	+ rss[mm_counter(page)]++;
1060	927	}
1061	928
1062	929	/*
..	..	@@ -1076,48 +943,56 @@
1076	943	pte = pte_mkclean(pte);
1077	944	pte = pte_mkold(pte);
1078	945
1079		- page = vm_normal_page(vma, addr, pte);
1080		- if (page) {
1081		- get_page(page);
1082		- page_dup_rmap(page, false);
1083		- rss[mm_counter(page)]++;
1084		- } else if (pte_devmap(pte)) {
1085		- page = pte_page(pte);
	946	+ if (!userfaultfd_wp(dst_vma))
	947	+ pte = pte_clear_uffd_wp(pte);
1086	948
1087		- /*
1088		- * Cache coherent device memory behave like regular page and
1089		- * not like persistent memory page. For more informations see
1090		- * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
1091		- */
1092		- if (is_device_public_page(page)) {
1093		- get_page(page);
1094		- page_dup_rmap(page, false);
1095		- rss[mm_counter(page)]++;
1096		- }
1097		- }
1098		-
1099		-out_set_pte:
1100		- set_pte_at(dst_mm, addr, dst_pte, pte);
	949	+ set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
1101	950	return 0;
1102	951	}
1103	952
1104		-static int copy_pte_range(struct mm_struct dst_mm, struct mm_struct src_mm,
1105		- pmd_t dst_pmd, pmd_t src_pmd, struct vm_area_struct *vma,
1106		- unsigned long addr, unsigned long end)
	953	+static inline struct page *
	954	+page_copy_prealloc(struct mm_struct src_mm, struct vm_area_struct vma,
	955	+ unsigned long addr)
1107	956	{
	957	+ struct page *new_page;
	958	+
	959	+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
	960	+ if (!new_page)
	961	+ return NULL;
	962	+
	963	+ if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
	964	+ put_page(new_page);
	965	+ return NULL;
	966	+ }
	967	+ cgroup_throttle_swaprate(new_page, GFP_KERNEL);
	968	+
	969	+ return new_page;
	970	+}
	971	+
	972	+static int
	973	+copy_pte_range(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
	974	+ pmd_t dst_pmd, pmd_t src_pmd, unsigned long addr,
	975	+ unsigned long end)
	976	+{
	977	+ struct mm_struct *dst_mm = dst_vma->vm_mm;
	978	+ struct mm_struct *src_mm = src_vma->vm_mm;
1108	979	pte_t orig_src_pte, orig_dst_pte;
1109	980	pte_t src_pte, dst_pte;
1110	981	spinlock_t src_ptl, dst_ptl;
1111		- int progress = 0;
	982	+ int progress, ret = 0;
1112	983	int rss[NR_MM_COUNTERS];
1113	984	swp_entry_t entry = (swp_entry_t){0};
	985	+ struct page *prealloc = NULL;
1114	986
1115	987	again:
	988	+ progress = 0;
1116	989	init_rss_vec(rss);
1117	990
1118	991	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
1119		- if (!dst_pte)
1120		- return -ENOMEM;
	992	+ if (!dst_pte) {
	993	+ ret = -ENOMEM;
	994	+ goto out;
	995	+ }
1121	996	src_pte = pte_offset_map(src_pmd, addr);
1122	997	src_ptl = pte_lockptr(src_mm, src_pmd);
1123	998	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
..	..	@@ -1140,10 +1015,35 @@
1140	1015	progress++;
1141	1016	continue;
1142	1017	}
1143		- entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
1144		- vma, addr, rss);
1145		- if (entry.val)
	1018	+ if (unlikely(!pte_present(*src_pte))) {
	1019	+ entry.val = copy_nonpresent_pte(dst_mm, src_mm,
	1020	+ dst_pte, src_pte,
	1021	+ dst_vma, src_vma,
	1022	+ addr, rss);
	1023	+ if (entry.val)
	1024	+ break;
	1025	+ progress += 8;
	1026	+ continue;
	1027	+ }
	1028	+ /* copy_present_pte() will clear `prealloc' if consumed */
	1029	+ ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
	1030	+ addr, rss, &prealloc);
	1031	+ /*
	1032	+ * If we need a pre-allocated page for this pte, drop the
	1033	+ * locks, allocate, and try again.
	1034	+ */
	1035	+ if (unlikely(ret == -EAGAIN))
1146	1036	break;
	1037	+ if (unlikely(prealloc)) {
	1038	+ /*
	1039	+ * pre-alloc page cannot be reused by next time so as
	1040	+ * to strictly follow mempolicy (e.g., alloc_page_vma()
	1041	+ * will allocate page according to address). This
	1042	+ * could only happen if one pinned pte changed.
	1043	+ */
	1044	+ put_page(prealloc);
	1045	+ prealloc = NULL;
	1046	+ }
1147	1047	progress += 8;
1148	1048	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
1149	1049
..	..	@@ -1155,19 +1055,34 @@
1155	1055	cond_resched();
1156	1056
1157	1057	if (entry.val) {
1158		- if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
	1058	+ if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
	1059	+ ret = -ENOMEM;
	1060	+ goto out;
	1061	+ }
	1062	+ entry.val = 0;
	1063	+ } else if (ret) {
	1064	+ WARN_ON_ONCE(ret != -EAGAIN);
	1065	+ prealloc = page_copy_prealloc(src_mm, src_vma, addr);
	1066	+ if (!prealloc)
1159	1067	return -ENOMEM;
1160		- progress = 0;
	1068	+ /* We've captured and resolved the error. Reset, try again. */
	1069	+ ret = 0;
1161	1070	}
1162	1071	if (addr != end)
1163	1072	goto again;
1164		- return 0;
	1073	+out:
	1074	+ if (unlikely(prealloc))
	1075	+ put_page(prealloc);
	1076	+ return ret;
1165	1077	}
1166	1078
1167		-static inline int copy_pmd_range(struct mm_struct dst_mm, struct mm_struct src_mm,
1168		- pud_t dst_pud, pud_t src_pud, struct vm_area_struct *vma,
1169		- unsigned long addr, unsigned long end)
	1079	+static inline int
	1080	+copy_pmd_range(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
	1081	+ pud_t dst_pud, pud_t src_pud, unsigned long addr,
	1082	+ unsigned long end)
1170	1083	{
	1084	+ struct mm_struct *dst_mm = dst_vma->vm_mm;
	1085	+ struct mm_struct *src_mm = src_vma->vm_mm;
1171	1086	pmd_t src_pmd, dst_pmd;
1172	1087	unsigned long next;
1173	1088
..	..	@@ -1180,9 +1095,9 @@
1180	1095	if (is_swap_pmd(src_pmd) \|\| pmd_trans_huge(src_pmd)
1181	1096	\|\| pmd_devmap(*src_pmd)) {
1182	1097	int err;
1183		- VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
1184		- err = copy_huge_pmd(dst_mm, src_mm,
1185		- dst_pmd, src_pmd, addr, vma);
	1098	+ VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
	1099	+ err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
	1100	+ addr, dst_vma, src_vma);
1186	1101	if (err == -ENOMEM)
1187	1102	return -ENOMEM;
1188	1103	if (!err)
..	..	@@ -1191,17 +1106,20 @@
1191	1106	}
1192	1107	if (pmd_none_or_clear_bad(src_pmd))
1193	1108	continue;
1194		- if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
1195		- vma, addr, next))
	1109	+ if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
	1110	+ addr, next))
1196	1111	return -ENOMEM;
1197	1112	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
1198	1113	return 0;
1199	1114	}
1200	1115
1201		-static inline int copy_pud_range(struct mm_struct dst_mm, struct mm_struct src_mm,
1202		- p4d_t dst_p4d, p4d_t src_p4d, struct vm_area_struct *vma,
1203		- unsigned long addr, unsigned long end)
	1116	+static inline int
	1117	+copy_pud_range(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
	1118	+ p4d_t dst_p4d, p4d_t src_p4d, unsigned long addr,
	1119	+ unsigned long end)
1204	1120	{
	1121	+ struct mm_struct *dst_mm = dst_vma->vm_mm;
	1122	+ struct mm_struct *src_mm = src_vma->vm_mm;
1205	1123	pud_t src_pud, dst_pud;
1206	1124	unsigned long next;
1207	1125
..	..	@@ -1214,9 +1132,9 @@
1214	1132	if (pud_trans_huge(src_pud) \|\| pud_devmap(src_pud)) {
1215	1133	int err;
1216	1134
1217		- VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
	1135	+ VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
1218	1136	err = copy_huge_pud(dst_mm, src_mm,
1219		- dst_pud, src_pud, addr, vma);
	1137	+ dst_pud, src_pud, addr, src_vma);
1220	1138	if (err == -ENOMEM)
1221	1139	return -ENOMEM;
1222	1140	if (!err)
..	..	@@ -1225,17 +1143,19 @@
1225	1143	}
1226	1144	if (pud_none_or_clear_bad(src_pud))
1227	1145	continue;
1228		- if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
1229		- vma, addr, next))
	1146	+ if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
	1147	+ addr, next))
1230	1148	return -ENOMEM;
1231	1149	} while (dst_pud++, src_pud++, addr = next, addr != end);
1232	1150	return 0;
1233	1151	}
1234	1152
1235		-static inline int copy_p4d_range(struct mm_struct dst_mm, struct mm_struct src_mm,
1236		- pgd_t dst_pgd, pgd_t src_pgd, struct vm_area_struct *vma,
1237		- unsigned long addr, unsigned long end)
	1153	+static inline int
	1154	+copy_p4d_range(struct vm_area_struct dst_vma, struct vm_area_struct src_vma,
	1155	+ pgd_t dst_pgd, pgd_t src_pgd, unsigned long addr,
	1156	+ unsigned long end)
1238	1157	{
	1158	+ struct mm_struct *dst_mm = dst_vma->vm_mm;
1239	1159	p4d_t src_p4d, dst_p4d;
1240	1160	unsigned long next;
1241	1161
..	..	@@ -1247,22 +1167,23 @@
1247	1167	next = p4d_addr_end(addr, end);
1248	1168	if (p4d_none_or_clear_bad(src_p4d))
1249	1169	continue;
1250		- if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
1251		- vma, addr, next))
	1170	+ if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
	1171	+ addr, next))
1252	1172	return -ENOMEM;
1253	1173	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
1254	1174	return 0;
1255	1175	}
1256	1176
1257		-int copy_page_range(struct mm_struct dst_mm, struct mm_struct src_mm,
1258		- struct vm_area_struct *vma)
	1177	+int
	1178	+copy_page_range(struct vm_area_struct dst_vma, struct vm_area_struct src_vma)
1259	1179	{
1260	1180	pgd_t src_pgd, dst_pgd;
1261	1181	unsigned long next;
1262		- unsigned long addr = vma->vm_start;
1263		- unsigned long end = vma->vm_end;
1264		- unsigned long mmun_start; /* For mmu_notifiers */
1265		- unsigned long mmun_end; /* For mmu_notifiers */
	1182	+ unsigned long addr = src_vma->vm_start;
	1183	+ unsigned long end = src_vma->vm_end;
	1184	+ struct mm_struct *dst_mm = dst_vma->vm_mm;
	1185	+ struct mm_struct *src_mm = src_vma->vm_mm;
	1186	+ struct mmu_notifier_range range;
1266	1187	bool is_cow;
1267	1188	int ret;
1268	1189
..	..	@@ -1272,19 +1193,19 @@
1272	1193	* readonly mappings. The tradeoff is that copy_page_range is more
1273	1194	* efficient than faulting.
1274	1195	*/
1275		- if (!(vma->vm_flags & (VM_HUGETLB \| VM_PFNMAP \| VM_MIXEDMAP)) &&
1276		- !vma->anon_vma)
	1196	+ if (!(src_vma->vm_flags & (VM_HUGETLB \| VM_PFNMAP \| VM_MIXEDMAP)) &&
	1197	+ !src_vma->anon_vma)
1277	1198	return 0;
1278	1199
1279		- if (is_vm_hugetlb_page(vma))
1280		- return copy_hugetlb_page_range(dst_mm, src_mm, vma);
	1200	+ if (is_vm_hugetlb_page(src_vma))
	1201	+ return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
1281	1202
1282		- if (unlikely(vma->vm_flags & VM_PFNMAP)) {
	1203	+ if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
1283	1204	/*
1284	1205	* We do not free on error cases below as remove_vma
1285	1206	* gets called on error from higher level routine
1286	1207	*/
1287		- ret = track_pfn_copy(vma);
	1208	+ ret = track_pfn_copy(src_vma);
1288	1209	if (ret)
1289	1210	return ret;
1290	1211	}
..	..	@@ -1295,12 +1216,22 @@
1295	1216	* parent mm. And a permission downgrade will only happen if
1296	1217	* is_cow_mapping() returns true.
1297	1218	*/
1298		- is_cow = is_cow_mapping(vma->vm_flags);
1299		- mmun_start = addr;
1300		- mmun_end = end;
1301		- if (is_cow)
1302		- mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1303		- mmun_end);
	1219	+ is_cow = is_cow_mapping(src_vma->vm_flags);
	1220	+
	1221	+ if (is_cow) {
	1222	+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
	1223	+ 0, src_vma, src_mm, addr, end);
	1224	+ mmu_notifier_invalidate_range_start(&range);
	1225	+ /*
	1226	+ * Disabling preemption is not needed for the write side, as
	1227	+ * the read side doesn't spin, but goes to the mmap_lock.
	1228	+ *
	1229	+ * Use the raw variant of the seqcount_t write API to avoid
	1230	+ * lockdep complaining about preemptibility.
	1231	+ */
	1232	+ mmap_assert_write_locked(src_mm);
	1233	+ raw_write_seqcount_begin(&src_mm->write_protect_seq);
	1234	+ }
1304	1235
1305	1236	ret = 0;
1306	1237	dst_pgd = pgd_offset(dst_mm, addr);
..	..	@@ -1309,16 +1240,29 @@
1309	1240	next = pgd_addr_end(addr, end);
1310	1241	if (pgd_none_or_clear_bad(src_pgd))
1311	1242	continue;
1312		- if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
1313		- vma, addr, next))) {
	1243	+ if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
	1244	+ addr, next))) {
1314	1245	ret = -ENOMEM;
1315	1246	break;
1316	1247	}
1317	1248	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
1318	1249
1319		- if (is_cow)
1320		- mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
	1250	+ if (is_cow) {
	1251	+ raw_write_seqcount_end(&src_mm->write_protect_seq);
	1252	+ mmu_notifier_invalidate_range_end(&range);
	1253	+ }
1321	1254	return ret;
	1255	+}
	1256	+
	1257	+/* Whether we should zap all COWed (private) pages too */
	1258	+static inline bool should_zap_cows(struct zap_details *details)
	1259	+{
	1260	+ /* By default, zap all pages */
	1261	+ if (!details)
	1262	+ return true;
	1263	+
	1264	+ /* Or, we zap COWed pages only if the caller wants to */
	1265	+ return !details->check_mapping;
1322	1266	}
1323	1267
1324	1268	static unsigned long zap_pte_range(struct mmu_gather *tlb,
..	..	@@ -1334,7 +1278,7 @@
1334	1278	pte_t *pte;
1335	1279	swp_entry_t entry;
1336	1280
1337		- tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
	1281	+ tlb_change_page_size(tlb, PAGE_SIZE);
1338	1282	again:
1339	1283	init_rss_vec(rss);
1340	1284	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
..	..	@@ -1346,10 +1290,13 @@
1346	1290	if (pte_none(ptent))
1347	1291	continue;
1348	1292
	1293	+ if (need_resched())
	1294	+ break;
	1295	+
1349	1296	if (pte_present(ptent)) {
1350	1297	struct page *page;
1351	1298
1352		- page = _vm_normal_page(vma, addr, ptent, true);
	1299	+ page = vm_normal_page(vma, addr, ptent);
1353	1300	if (unlikely(details) && page) {
1354	1301	/*
1355	1302	* unmap_shared_mapping_pages() wants to
..	..	@@ -1379,7 +1326,8 @@
1379	1326	page_remove_rmap(page, false);
1380	1327	if (unlikely(page_mapcount(page) < 0))
1381	1328	print_bad_pte(vma, addr, ptent, page);
1382		- if (unlikely(__tlb_remove_page(tlb, page))) {
	1329	+ if (unlikely(__tlb_remove_page(tlb, page)) \|\|
	1330	+ lru_cache_disabled()) {
1383	1331	force_flush = 1;
1384	1332	addr += PAGE_SIZE;
1385	1333	break;
..	..	@@ -1388,7 +1336,7 @@
1388	1336	}
1389	1337
1390	1338	entry = pte_to_swp_entry(ptent);
1391		- if (non_swap_entry(entry) && is_device_private_entry(entry)) {
	1339	+ if (is_device_private_entry(entry)) {
1392	1340	struct page *page = device_private_entry_to_page(entry);
1393	1341
1394	1342	if (unlikely(details && details->check_mapping)) {
..	..	@@ -1409,17 +1357,18 @@
1409	1357	continue;
1410	1358	}
1411	1359
1412		- /* If details->check_mapping, we leave swap entries. */
1413		- if (unlikely(details))
1414		- continue;
1415		-
1416		- entry = pte_to_swp_entry(ptent);
1417		- if (!non_swap_entry(entry))
	1360	+ if (!non_swap_entry(entry)) {
	1361	+ /* Genuine swap entry, hence a private anon page */
	1362	+ if (!should_zap_cows(details))
	1363	+ continue;
1418	1364	rss[MM_SWAPENTS]--;
1419		- else if (is_migration_entry(entry)) {
	1365	+ } else if (is_migration_entry(entry)) {
1420	1366	struct page *page;
1421	1367
1422	1368	page = migration_entry_to_page(entry);
	1369	+ if (details && details->check_mapping &&
	1370	+ details->check_mapping != page_rmapping(page))
	1371	+ continue;
1423	1372	rss[mm_counter(page)]--;
1424	1373	}
1425	1374	if (unlikely(!free_swap_and_cache(entry)))
..	..	@@ -1443,9 +1392,12 @@
1443	1392	*/
1444	1393	if (force_flush) {
1445	1394	force_flush = 0;
1446		- tlb_flush_mmu_free(tlb);
1447		- if (addr != end)
1448		- goto again;
	1395	+ tlb_flush_mmu(tlb);
	1396	+ }
	1397	+
	1398	+ if (addr != end) {
	1399	+ cond_resched();
	1400	+ goto again;
1449	1401	}
1450	1402
1451	1403	return addr;
..	..	@@ -1484,7 +1436,7 @@
1484	1436	* Here there can be other concurrent MADV_DONTNEED or
1485	1437	* trans huge page faults running, and if the pmd is
1486	1438	* none or trans huge it can change under us. This is
1487		- * because MADV_DONTNEED holds the mmap_sem in read
	1439	+ * because MADV_DONTNEED holds the mmap_lock in read
1488	1440	* mode.
1489	1441	*/
1490	1442	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
..	..	@@ -1510,7 +1462,7 @@
1510	1462	next = pud_addr_end(addr, end);
1511	1463	if (pud_trans_huge(pud) \|\| pud_devmap(pud)) {
1512	1464	if (next - addr != HPAGE_PUD_SIZE) {
1513		- VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
	1465	+ mmap_assert_locked(tlb->mm);
1514	1466	split_huge_pud(vma, pud, addr);
1515	1467	} else if (zap_huge_pud(tlb, vma, pud, addr))
1516	1468	goto next;
..	..	@@ -1631,12 +1583,14 @@
1631	1583	struct vm_area_struct *vma, unsigned long start_addr,
1632	1584	unsigned long end_addr)
1633	1585	{
1634		- struct mm_struct *mm = vma->vm_mm;
	1586	+ struct mmu_notifier_range range;
1635	1587
1636		- mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	1588	+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
	1589	+ start_addr, end_addr);
	1590	+ mmu_notifier_invalidate_range_start(&range);
1637	1591	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1638	1592	unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1639		- mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
	1593	+ mmu_notifier_invalidate_range_end(&range);
1640	1594	}
1641	1595
1642	1596	/**
..	..	@@ -1650,18 +1604,19 @@
1650	1604	void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1651	1605	unsigned long size)
1652	1606	{
1653		- struct mm_struct *mm = vma->vm_mm;
	1607	+ struct mmu_notifier_range range;
1654	1608	struct mmu_gather tlb;
1655		- unsigned long end = start + size;
1656	1609
1657	1610	lru_add_drain();
1658		- tlb_gather_mmu(&tlb, mm, start, end);
1659		- update_hiwater_rss(mm);
1660		- mmu_notifier_invalidate_range_start(mm, start, end);
1661		- for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
1662		- unmap_single_vma(&tlb, vma, start, end, NULL);
1663		- mmu_notifier_invalidate_range_end(mm, start, end);
1664		- tlb_finish_mmu(&tlb, start, end);
	1611	+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
	1612	+ start, start + size);
	1613	+ tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
	1614	+ update_hiwater_rss(vma->vm_mm);
	1615	+ mmu_notifier_invalidate_range_start(&range);
	1616	+ for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
	1617	+ unmap_single_vma(&tlb, vma, start, range.end, NULL);
	1618	+ mmu_notifier_invalidate_range_end(&range);
	1619	+ tlb_finish_mmu(&tlb, start, range.end);
1665	1620	}
1666	1621
1667	1622	/**
..	..	@@ -1676,17 +1631,18 @@
1676	1631	static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
1677	1632	unsigned long size, struct zap_details *details)
1678	1633	{
1679		- struct mm_struct *mm = vma->vm_mm;
	1634	+ struct mmu_notifier_range range;
1680	1635	struct mmu_gather tlb;
1681		- unsigned long end = address + size;
1682	1636
1683	1637	lru_add_drain();
1684		- tlb_gather_mmu(&tlb, mm, address, end);
1685		- update_hiwater_rss(mm);
1686		- mmu_notifier_invalidate_range_start(mm, address, end);
1687		- unmap_single_vma(&tlb, vma, address, end, details);
1688		- mmu_notifier_invalidate_range_end(mm, address, end);
1689		- tlb_finish_mmu(&tlb, address, end);
	1638	+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
	1639	+ address, address + size);
	1640	+ tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
	1641	+ update_hiwater_rss(vma->vm_mm);
	1642	+ mmu_notifier_invalidate_range_start(&range);
	1643	+ unmap_single_vma(&tlb, vma, address, range.end, details);
	1644	+ mmu_notifier_invalidate_range_end(&range);
	1645	+ tlb_finish_mmu(&tlb, address, range.end);
1690	1646	}
1691	1647
1692	1648	/**
..	..	@@ -1711,8 +1667,7 @@
1711	1667	}
1712	1668	EXPORT_SYMBOL_GPL(zap_vma_ptes);
1713	1669
1714		-pte_t __get_locked_pte(struct mm_struct mm, unsigned long addr,
1715		- spinlock_t **ptl)
	1670	+static pmd_t walk_to_pmd(struct mm_struct mm, unsigned long addr)
1716	1671	{
1717	1672	pgd_t *pgd;
1718	1673	p4d_t *p4d;
..	..	@@ -1731,7 +1686,38 @@
1731	1686	return NULL;
1732	1687
1733	1688	VM_BUG_ON(pmd_trans_huge(*pmd));
	1689	+ return pmd;
	1690	+}
	1691	+
	1692	+pte_t __get_locked_pte(struct mm_struct mm, unsigned long addr,
	1693	+ spinlock_t **ptl)
	1694	+{
	1695	+ pmd_t *pmd = walk_to_pmd(mm, addr);
	1696	+
	1697	+ if (!pmd)
	1698	+ return NULL;
1734	1699	return pte_alloc_map_lock(mm, pmd, addr, ptl);
	1700	+}
	1701	+
	1702	+static int validate_page_before_insert(struct page *page)
	1703	+{
	1704	+ if (PageAnon(page) \|\| PageSlab(page) \|\| page_has_type(page))
	1705	+ return -EINVAL;
	1706	+ flush_dcache_page(page);
	1707	+ return 0;
	1708	+}
	1709	+
	1710	+static int insert_page_into_pte_locked(struct mm_struct mm, pte_t pte,
	1711	+ unsigned long addr, struct page *page, pgprot_t prot)
	1712	+{
	1713	+ if (!pte_none(*pte))
	1714	+ return -EBUSY;
	1715	+ /* Ok, finally just insert the thing.. */
	1716	+ get_page(page);
	1717	+ inc_mm_counter_fast(mm, mm_counter_file(page));
	1718	+ page_add_file_rmap(page, false);
	1719	+ set_pte_at(mm, addr, pte, mk_pte(page, prot));
	1720	+ return 0;
1735	1721	}
1736	1722
1737	1723	/*
..	..	@@ -1749,32 +1735,135 @@
1749	1735	pte_t *pte;
1750	1736	spinlock_t *ptl;
1751	1737
1752		- retval = -EINVAL;
1753		- if (PageAnon(page))
	1738	+ retval = validate_page_before_insert(page);
	1739	+ if (retval)
1754	1740	goto out;
1755	1741	retval = -ENOMEM;
1756		- flush_dcache_page(page);
1757	1742	pte = get_locked_pte(mm, addr, &ptl);
1758	1743	if (!pte)
1759	1744	goto out;
1760		- retval = -EBUSY;
1761		- if (!pte_none(*pte))
1762		- goto out_unlock;
1763		-
1764		- /* Ok, finally just insert the thing.. */
1765		- get_page(page);
1766		- inc_mm_counter_fast(mm, mm_counter_file(page));
1767		- page_add_file_rmap(page, false);
1768		- set_pte_at(mm, addr, pte, mk_pte(page, prot));
1769		-
1770		- retval = 0;
1771		- pte_unmap_unlock(pte, ptl);
1772		- return retval;
1773		-out_unlock:
	1745	+ retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
1774	1746	pte_unmap_unlock(pte, ptl);
1775	1747	out:
1776	1748	return retval;
1777	1749	}
	1750	+
	1751	+#ifdef pte_index
	1752	+static int insert_page_in_batch_locked(struct mm_struct mm, pte_t pte,
	1753	+ unsigned long addr, struct page *page, pgprot_t prot)
	1754	+{
	1755	+ int err;
	1756	+
	1757	+ if (!page_count(page))
	1758	+ return -EINVAL;
	1759	+ err = validate_page_before_insert(page);
	1760	+ if (err)
	1761	+ return err;
	1762	+ return insert_page_into_pte_locked(mm, pte, addr, page, prot);
	1763	+}
	1764	+
	1765	+/* insert_pages() amortizes the cost of spinlock operations
	1766	+ * when inserting pages in a loop. Arch must define pte_index.
	1767	+ */
	1768	+static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
	1769	+ struct page *pages, unsigned long num, pgprot_t prot)
	1770	+{
	1771	+ pmd_t *pmd = NULL;
	1772	+ pte_t start_pte, pte;
	1773	+ spinlock_t *pte_lock;
	1774	+ struct mm_struct *const mm = vma->vm_mm;
	1775	+ unsigned long curr_page_idx = 0;
	1776	+ unsigned long remaining_pages_total = *num;
	1777	+ unsigned long pages_to_write_in_pmd;
	1778	+ int ret;
	1779	+more:
	1780	+ ret = -EFAULT;
	1781	+ pmd = walk_to_pmd(mm, addr);
	1782	+ if (!pmd)
	1783	+ goto out;
	1784	+
	1785	+ pages_to_write_in_pmd = min_t(unsigned long,
	1786	+ remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
	1787	+
	1788	+ /* Allocate the PTE if necessary; takes PMD lock once only. */
	1789	+ ret = -ENOMEM;
	1790	+ if (pte_alloc(mm, pmd))
	1791	+ goto out;
	1792	+
	1793	+ while (pages_to_write_in_pmd) {
	1794	+ int pte_idx = 0;
	1795	+ const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
	1796	+
	1797	+ start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
	1798	+ for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
	1799	+ int err = insert_page_in_batch_locked(mm, pte,
	1800	+ addr, pages[curr_page_idx], prot);
	1801	+ if (unlikely(err)) {
	1802	+ pte_unmap_unlock(start_pte, pte_lock);
	1803	+ ret = err;
	1804	+ remaining_pages_total -= pte_idx;
	1805	+ goto out;
	1806	+ }
	1807	+ addr += PAGE_SIZE;
	1808	+ ++curr_page_idx;
	1809	+ }
	1810	+ pte_unmap_unlock(start_pte, pte_lock);
	1811	+ pages_to_write_in_pmd -= batch_size;
	1812	+ remaining_pages_total -= batch_size;
	1813	+ }
	1814	+ if (remaining_pages_total)
	1815	+ goto more;
	1816	+ ret = 0;
	1817	+out:
	1818	+ *num = remaining_pages_total;
	1819	+ return ret;
	1820	+}
	1821	+#endif /* ifdef pte_index */
	1822	+
	1823	+/**
	1824	+ * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
	1825	+ * @vma: user vma to map to
	1826	+ * @addr: target start user address of these pages
	1827	+ * @pages: source kernel pages
	1828	+ * @num: in: number of pages to map. out: number of pages that were not
	1829	+ * mapped. (0 means all pages were successfully mapped).
	1830	+ *
	1831	+ * Preferred over vm_insert_page() when inserting multiple pages.
	1832	+ *
	1833	+ * In case of error, we may have mapped a subset of the provided
	1834	+ * pages. It is the caller's responsibility to account for this case.
	1835	+ *
	1836	+ * The same restrictions apply as in vm_insert_page().
	1837	+ */
	1838	+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
	1839	+ struct page *pages, unsigned long num)
	1840	+{
	1841	+#ifdef pte_index
	1842	+ const unsigned long end_addr = addr + (num PAGE_SIZE) - 1;
	1843	+
	1844	+ if (addr < vma->vm_start \|\| end_addr >= vma->vm_end)
	1845	+ return -EFAULT;
	1846	+ if (!(vma->vm_flags & VM_MIXEDMAP)) {
	1847	+ BUG_ON(mmap_read_trylock(vma->vm_mm));
	1848	+ BUG_ON(vma->vm_flags & VM_PFNMAP);
	1849	+ vma->vm_flags \|= VM_MIXEDMAP;
	1850	+ }
	1851	+ /* Defer page refcount checking till we're about to map that page. */
	1852	+ return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
	1853	+#else
	1854	+ unsigned long idx = 0, pgcount = *num;
	1855	+ int err = -EINVAL;
	1856	+
	1857	+ for (; idx < pgcount; ++idx) {
	1858	+ err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
	1859	+ if (err)
	1860	+ break;
	1861	+ }
	1862	+ *num = pgcount - idx;
	1863	+ return err;
	1864	+#endif /* ifdef pte_index */
	1865	+}
	1866	+EXPORT_SYMBOL(vm_insert_pages);
1778	1867
1779	1868	/**
1780	1869	* vm_insert_page - insert single page into user vma
..	..	@@ -1799,9 +1888,11 @@
1799	1888	* The page does not need to be reserved.
1800	1889	*
1801	1890	* Usually this function is called from f_op->mmap() handler
1802		- * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
	1891	+ * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
1803	1892	* Caller must set VM_MIXEDMAP on vma if it wants to call this
1804	1893	* function from other places, for example from page-fault handler.
	1894	+ *
	1895	+ * Return: %0 on success, negative error code otherwise.
1805	1896	*/
1806	1897	int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1807	1898	struct page *page)
..	..	@@ -1811,7 +1902,7 @@
1811	1902	if (!page_count(page))
1812	1903	return -EINVAL;
1813	1904	if (!(vma->vm_flags & VM_MIXEDMAP)) {
1814		- BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
	1905	+ BUG_ON(mmap_read_trylock(vma->vm_mm));
1815	1906	BUG_ON(vma->vm_flags & VM_PFNMAP);
1816	1907	vma->vm_flags \|= VM_MIXEDMAP;
1817	1908	}
..	..	@@ -1819,19 +1910,97 @@
1819	1910	}
1820	1911	EXPORT_SYMBOL(vm_insert_page);
1821	1912
1822		-static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
	1913	+/*
	1914	+ * __vm_map_pages - maps range of kernel pages into user vma
	1915	+ * @vma: user vma to map to
	1916	+ * @pages: pointer to array of source kernel pages
	1917	+ * @num: number of pages in page array
	1918	+ * @offset: user's requested vm_pgoff
	1919	+ *
	1920	+ * This allows drivers to map range of kernel pages into a user vma.
	1921	+ *
	1922	+ * Return: 0 on success and error code otherwise.
	1923	+ */
	1924	+static int __vm_map_pages(struct vm_area_struct vma, struct page *pages,
	1925	+ unsigned long num, unsigned long offset)
	1926	+{
	1927	+ unsigned long count = vma_pages(vma);
	1928	+ unsigned long uaddr = vma->vm_start;
	1929	+ int ret, i;
	1930	+
	1931	+ /* Fail if the user requested offset is beyond the end of the object */
	1932	+ if (offset >= num)
	1933	+ return -ENXIO;
	1934	+
	1935	+ /* Fail if the user requested size exceeds available object size */
	1936	+ if (count > num - offset)
	1937	+ return -ENXIO;
	1938	+
	1939	+ for (i = 0; i < count; i++) {
	1940	+ ret = vm_insert_page(vma, uaddr, pages[offset + i]);
	1941	+ if (ret < 0)
	1942	+ return ret;
	1943	+ uaddr += PAGE_SIZE;
	1944	+ }
	1945	+
	1946	+ return 0;
	1947	+}
	1948	+
	1949	+/**
	1950	+ * vm_map_pages - maps range of kernel pages starts with non zero offset
	1951	+ * @vma: user vma to map to
	1952	+ * @pages: pointer to array of source kernel pages
	1953	+ * @num: number of pages in page array
	1954	+ *
	1955	+ * Maps an object consisting of @num pages, catering for the user's
	1956	+ * requested vm_pgoff
	1957	+ *
	1958	+ * If we fail to insert any page into the vma, the function will return
	1959	+ * immediately leaving any previously inserted pages present. Callers
	1960	+ * from the mmap handler may immediately return the error as their caller
	1961	+ * will destroy the vma, removing any successfully inserted pages. Other
	1962	+ * callers should make their own arrangements for calling unmap_region().
	1963	+ *
	1964	+ * Context: Process context. Called by mmap handlers.
	1965	+ * Return: 0 on success and error code otherwise.
	1966	+ */
	1967	+int vm_map_pages(struct vm_area_struct vma, struct page *pages,
	1968	+ unsigned long num)
	1969	+{
	1970	+ return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
	1971	+}
	1972	+EXPORT_SYMBOL(vm_map_pages);
	1973	+
	1974	+/**
	1975	+ * vm_map_pages_zero - map range of kernel pages starts with zero offset
	1976	+ * @vma: user vma to map to
	1977	+ * @pages: pointer to array of source kernel pages
	1978	+ * @num: number of pages in page array
	1979	+ *
	1980	+ * Similar to vm_map_pages(), except that it explicitly sets the offset
	1981	+ * to 0. This function is intended for the drivers that did not consider
	1982	+ * vm_pgoff.
	1983	+ *
	1984	+ * Context: Process context. Called by mmap handlers.
	1985	+ * Return: 0 on success and error code otherwise.
	1986	+ */
	1987	+int vm_map_pages_zero(struct vm_area_struct vma, struct page *pages,
	1988	+ unsigned long num)
	1989	+{
	1990	+ return __vm_map_pages(vma, pages, num, 0);
	1991	+}
	1992	+EXPORT_SYMBOL(vm_map_pages_zero);
	1993	+
	1994	+static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1823	1995	pfn_t pfn, pgprot_t prot, bool mkwrite)
1824	1996	{
1825	1997	struct mm_struct *mm = vma->vm_mm;
1826		- int retval;
1827	1998	pte_t *pte, entry;
1828	1999	spinlock_t *ptl;
1829	2000
1830		- retval = -ENOMEM;
1831	2001	pte = get_locked_pte(mm, addr, &ptl);
1832	2002	if (!pte)
1833		- goto out;
1834		- retval = -EBUSY;
	2003	+ return VM_FAULT_OOM;
1835	2004	if (!pte_none(*pte)) {
1836	2005	if (mkwrite) {
1837	2006	/*
..	..	@@ -1849,7 +2018,8 @@
1849	2018	goto out_unlock;
1850	2019	}
1851	2020	entry = pte_mkyoung(*pte);
1852		- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	2021	+ entry = maybe_mkwrite(pte_mkdirty(entry),
	2022	+ vma->vm_flags);
1853	2023	if (ptep_set_access_flags(vma, addr, pte, entry, 1))
1854	2024	update_mmu_cache(vma, addr, pte);
1855	2025	}
..	..	@@ -1864,62 +2034,41 @@
1864	2034
1865	2035	if (mkwrite) {
1866	2036	entry = pte_mkyoung(entry);
1867		- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	2037	+ entry = maybe_mkwrite(pte_mkdirty(entry), vma->vm_flags);
1868	2038	}
1869	2039
1870	2040	set_pte_at(mm, addr, pte, entry);
1871	2041	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1872	2042
1873		- retval = 0;
1874	2043	out_unlock:
1875	2044	pte_unmap_unlock(pte, ptl);
1876		-out:
1877		- return retval;
	2045	+ return VM_FAULT_NOPAGE;
1878	2046	}
1879	2047
1880	2048	/**
1881		- * vm_insert_pfn - insert single pfn into user vma
1882		- * @vma: user vma to map to
1883		- * @addr: target user address of this page
1884		- * @pfn: source kernel pfn
1885		- *
1886		- * Similar to vm_insert_page, this allows drivers to insert individual pages
1887		- * they've allocated into a user vma. Same comments apply.
1888		- *
1889		- * This function should only be called from a vm_ops->fault handler, and
1890		- * in that case the handler should return NULL.
1891		- *
1892		- * vma cannot be a COW mapping.
1893		- *
1894		- * As this is called only for pages that do not currently exist, we
1895		- * do not need to flush old virtual caches or the TLB.
1896		- */
1897		-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1898		- unsigned long pfn)
1899		-{
1900		- return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
1901		-}
1902		-EXPORT_SYMBOL(vm_insert_pfn);
1903		-
1904		-/**
1905		- * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
	2049	+ * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
1906	2050	* @vma: user vma to map to
1907	2051	* @addr: target user address of this page
1908	2052	* @pfn: source kernel pfn
1909	2053	* @pgprot: pgprot flags for the inserted page
1910	2054	*
1911		- * This is exactly like vm_insert_pfn, except that it allows drivers to
	2055	+ * This is exactly like vmf_insert_pfn(), except that it allows drivers
1912	2056	* to override pgprot on a per-page basis.
1913	2057	*
1914	2058	* This only makes sense for IO mappings, and it makes no sense for
1915		- * cow mappings. In general, using multiple vmas is preferable;
1916		- * vm_insert_pfn_prot should only be used if using multiple VMAs is
	2059	+ * COW mappings. In general, using multiple vmas is preferable;
	2060	+ * vmf_insert_pfn_prot should only be used if using multiple VMAs is
1917	2061	* impractical.
	2062	+ *
	2063	+ * See vmf_insert_mixed_prot() for a discussion of the implication of using
	2064	+ * a value of @pgprot different from that of @vma->vm_page_prot.
	2065	+ *
	2066	+ * Context: Process context. May allocate using %GFP_KERNEL.
	2067	+ * Return: vm_fault_t value.
1918	2068	*/
1919		-int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
	2069	+vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
1920	2070	unsigned long pfn, pgprot_t pgprot)
1921	2071	{
1922		- int ret;
1923	2072	/*
1924	2073	* Technically, architectures with pte_special can avoid all these
1925	2074	* restrictions (same for remap_pfn_range). However we would like
..	..	@@ -1933,19 +2082,44 @@
1933	2082	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1934	2083
1935	2084	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
1936		- return -EFAULT;
	2085	+ return VM_FAULT_SIGBUS;
1937	2086
1938	2087	if (!pfn_modify_allowed(pfn, pgprot))
1939		- return -EACCES;
	2088	+ return VM_FAULT_SIGBUS;
1940	2089
1941	2090	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
1942	2091
1943		- ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
	2092	+ return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
1944	2093	false);
1945		-
1946		- return ret;
1947	2094	}
1948		-EXPORT_SYMBOL(vm_insert_pfn_prot);
	2095	+EXPORT_SYMBOL(vmf_insert_pfn_prot);
	2096	+
	2097	+/**
	2098	+ * vmf_insert_pfn - insert single pfn into user vma
	2099	+ * @vma: user vma to map to
	2100	+ * @addr: target user address of this page
	2101	+ * @pfn: source kernel pfn
	2102	+ *
	2103	+ * Similar to vm_insert_page, this allows drivers to insert individual pages
	2104	+ * they've allocated into a user vma. Same comments apply.
	2105	+ *
	2106	+ * This function should only be called from a vm_ops->fault handler, and
	2107	+ * in that case the handler should return the result of this function.
	2108	+ *
	2109	+ * vma cannot be a COW mapping.
	2110	+ *
	2111	+ * As this is called only for pages that do not currently exist, we
	2112	+ * do not need to flush old virtual caches or the TLB.
	2113	+ *
	2114	+ * Context: Process context. May allocate using %GFP_KERNEL.
	2115	+ * Return: vm_fault_t value.
	2116	+ */
	2117	+vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
	2118	+ unsigned long pfn)
	2119	+{
	2120	+ return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
	2121	+}
	2122	+EXPORT_SYMBOL(vmf_insert_pfn);
1949	2123
1950	2124	static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
1951	2125	{
..	..	@@ -1961,20 +2135,21 @@
1961	2135	return false;
1962	2136	}
1963	2137
1964		-static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1965		- pfn_t pfn, bool mkwrite)
	2138	+static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
	2139	+ unsigned long addr, pfn_t pfn, pgprot_t pgprot,
	2140	+ bool mkwrite)
1966	2141	{
1967		- pgprot_t pgprot = vma->vm_page_prot;
	2142	+ int err;
1968	2143
1969	2144	BUG_ON(!vm_mixed_ok(vma, pfn));
1970	2145
1971	2146	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
1972		- return -EFAULT;
	2147	+ return VM_FAULT_SIGBUS;
1973	2148
1974	2149	track_pfn_insert(vma, &pgprot, pfn);
1975	2150
1976	2151	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
1977		- return -EACCES;
	2152	+ return VM_FAULT_SIGBUS;
1978	2153
1979	2154	/*
1980	2155	* If we don't have pte special, then we have to use the pfn_valid()
..	..	@@ -1993,36 +2168,68 @@
1993	2168	* result in pfn_t_has_page() == false.
1994	2169	*/
1995	2170	page = pfn_to_page(pfn_t_to_pfn(pfn));
1996		- return insert_page(vma, addr, page, pgprot);
	2171	+ err = insert_page(vma, addr, page, pgprot);
	2172	+ } else {
	2173	+ return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
1997	2174	}
1998		- return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
	2175	+
	2176	+ if (err == -ENOMEM)
	2177	+ return VM_FAULT_OOM;
	2178	+ if (err < 0 && err != -EBUSY)
	2179	+ return VM_FAULT_SIGBUS;
	2180	+
	2181	+ return VM_FAULT_NOPAGE;
1999	2182	}
2000	2183
2001		-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
2002		- pfn_t pfn)
	2184	+/**
	2185	+ * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
	2186	+ * @vma: user vma to map to
	2187	+ * @addr: target user address of this page
	2188	+ * @pfn: source kernel pfn
	2189	+ * @pgprot: pgprot flags for the inserted page
	2190	+ *
	2191	+ * This is exactly like vmf_insert_mixed(), except that it allows drivers
	2192	+ * to override pgprot on a per-page basis.
	2193	+ *
	2194	+ * Typically this function should be used by drivers to set caching- and
	2195	+ * encryption bits different than those of @vma->vm_page_prot, because
	2196	+ * the caching- or encryption mode may not be known at mmap() time.
	2197	+ * This is ok as long as @vma->vm_page_prot is not used by the core vm
	2198	+ * to set caching and encryption bits for those vmas (except for COW pages).
	2199	+ * This is ensured by core vm only modifying these page table entries using
	2200	+ * functions that don't touch caching- or encryption bits, using pte_modify()
	2201	+ * if needed. (See for example mprotect()).
	2202	+ * Also when new page-table entries are created, this is only done using the
	2203	+ * fault() callback, and never using the value of vma->vm_page_prot,
	2204	+ * except for page-table entries that point to anonymous pages as the result
	2205	+ * of COW.
	2206	+ *
	2207	+ * Context: Process context. May allocate using %GFP_KERNEL.
	2208	+ * Return: vm_fault_t value.
	2209	+ */
	2210	+vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
	2211	+ pfn_t pfn, pgprot_t pgprot)
2003	2212	{
2004		- return __vm_insert_mixed(vma, addr, pfn, false);
2005		-
	2213	+ return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
2006	2214	}
2007		-EXPORT_SYMBOL(vm_insert_mixed);
	2215	+EXPORT_SYMBOL(vmf_insert_mixed_prot);
	2216	+
	2217	+vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
	2218	+ pfn_t pfn)
	2219	+{
	2220	+ return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
	2221	+}
	2222	+EXPORT_SYMBOL(vmf_insert_mixed);
2008	2223
2009	2224	/*
2010	2225	* If the insertion of PTE failed because someone else already added a
2011	2226	* different entry in the mean time, we treat that as success as we assume
2012	2227	* the same entry was actually inserted.
2013	2228	*/
2014		-
2015	2229	vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
2016	2230	unsigned long addr, pfn_t pfn)
2017	2231	{
2018		- int err;
2019		-
2020		- err = __vm_insert_mixed(vma, addr, pfn, true);
2021		- if (err == -ENOMEM)
2022		- return VM_FAULT_OOM;
2023		- if (err < 0 && err != -EBUSY)
2024		- return VM_FAULT_SIGBUS;
2025		- return VM_FAULT_NOPAGE;
	2232	+ return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
2026	2233	}
2027	2234	EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
2028	2235
..	..	@@ -2127,12 +2334,14 @@
2127	2334	/**
2128	2335	* remap_pfn_range - remap kernel memory to userspace
2129	2336	* @vma: user vma to map to
2130		- * @addr: target user address to start at
2131		- * @pfn: physical address of kernel memory
2132		- * @size: size of map area
	2337	+ * @addr: target page aligned user address to start at
	2338	+ * @pfn: page frame number of kernel physical memory address
	2339	+ * @size: size of mapping area
2133	2340	* @prot: page protection flags for this mapping
2134	2341	*
2135		- * Note: this is only safe if the mm semaphore is held when called.
	2342	+ * Note: this is only safe if the mm semaphore is held when called.
	2343	+ *
	2344	+ * Return: %0 on success, negative error code otherwise.
2136	2345	*/
2137	2346	int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2138	2347	unsigned long pfn, unsigned long size, pgprot_t prot)
..	..	@@ -2143,6 +2352,9 @@
2143	2352	struct mm_struct *mm = vma->vm_mm;
2144	2353	unsigned long remap_pfn = pfn;
2145	2354	int err;
	2355	+
	2356	+ if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
	2357	+ return -EINVAL;
2146	2358
2147	2359	/*
2148	2360	* Physically remapped pages are special. Tell the
..	..	@@ -2196,7 +2408,7 @@
2196	2408	/**
2197	2409	* vm_iomap_memory - remap memory to userspace
2198	2410	* @vma: user vma to map to
2199		- * @start: start of area
	2411	+ * @start: start of the physical memory to be mapped
2200	2412	* @len: size of area
2201	2413	*
2202	2414	* This is a simplified io_remap_pfn_range() for common driver use. The
..	..	@@ -2205,6 +2417,8 @@
2205	2417	*
2206	2418	* NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
2207	2419	* whatever write-combining details or similar.
	2420	+ *
	2421	+ * Return: %0 on success, negative error code otherwise.
2208	2422	*/
2209	2423	int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
2210	2424	{
..	..	@@ -2242,30 +2456,39 @@
2242	2456
2243	2457	static int apply_to_pte_range(struct mm_struct mm, pmd_t pmd,
2244	2458	unsigned long addr, unsigned long end,
2245		- pte_fn_t fn, void *data)
	2459	+ pte_fn_t fn, void *data, bool create,
	2460	+ pgtbl_mod_mask *mask)
2246	2461	{
2247	2462	pte_t *pte;
2248		- int err;
2249		- pgtable_t token;
2250		- spinlock_t *uninitialized_var(ptl);
	2463	+ int err = 0;
	2464	+ spinlock_t *ptl;
2251	2465
2252		- pte = (mm == &init_mm) ?
2253		- pte_alloc_kernel(pmd, addr) :
2254		- pte_alloc_map_lock(mm, pmd, addr, &ptl);
2255		- if (!pte)
2256		- return -ENOMEM;
	2466	+ if (create) {
	2467	+ pte = (mm == &init_mm) ?
	2468	+ pte_alloc_kernel_track(pmd, addr, mask) :
	2469	+ pte_alloc_map_lock(mm, pmd, addr, &ptl);
	2470	+ if (!pte)
	2471	+ return -ENOMEM;
	2472	+ } else {
	2473	+ pte = (mm == &init_mm) ?
	2474	+ pte_offset_kernel(pmd, addr) :
	2475	+ pte_offset_map_lock(mm, pmd, addr, &ptl);
	2476	+ }
2257	2477
2258	2478	BUG_ON(pmd_huge(*pmd));
2259	2479
2260	2480	arch_enter_lazy_mmu_mode();
2261	2481
2262		- token = pmd_pgtable(*pmd);
2263		-
2264		- do {
2265		- err = fn(pte++, token, addr, data);
2266		- if (err)
2267		- break;
2268		- } while (addr += PAGE_SIZE, addr != end);
	2482	+ if (fn) {
	2483	+ do {
	2484	+ if (create \|\| !pte_none(*pte)) {
	2485	+ err = fn(pte++, addr, data);
	2486	+ if (err)
	2487	+ break;
	2488	+ }
	2489	+ } while (addr += PAGE_SIZE, addr != end);
	2490	+ }
	2491	+ *mask \|= PGTBL_PTE_MODIFIED;
2269	2492
2270	2493	arch_leave_lazy_mmu_mode();
2271	2494
..	..	@@ -2276,63 +2499,116 @@
2276	2499
2277	2500	static int apply_to_pmd_range(struct mm_struct mm, pud_t pud,
2278	2501	unsigned long addr, unsigned long end,
2279		- pte_fn_t fn, void *data)
	2502	+ pte_fn_t fn, void *data, bool create,
	2503	+ pgtbl_mod_mask *mask)
2280	2504	{
2281	2505	pmd_t *pmd;
2282	2506	unsigned long next;
2283		- int err;
	2507	+ int err = 0;
2284	2508
2285	2509	BUG_ON(pud_huge(*pud));
2286	2510
2287		- pmd = pmd_alloc(mm, pud, addr);
2288		- if (!pmd)
2289		- return -ENOMEM;
	2511	+ if (create) {
	2512	+ pmd = pmd_alloc_track(mm, pud, addr, mask);
	2513	+ if (!pmd)
	2514	+ return -ENOMEM;
	2515	+ } else {
	2516	+ pmd = pmd_offset(pud, addr);
	2517	+ }
2290	2518	do {
2291	2519	next = pmd_addr_end(addr, end);
2292		- err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2293		- if (err)
2294		- break;
	2520	+ if (create \|\| !pmd_none_or_clear_bad(pmd)) {
	2521	+ err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
	2522	+ create, mask);
	2523	+ if (err)
	2524	+ break;
	2525	+ }
2295	2526	} while (pmd++, addr = next, addr != end);
2296	2527	return err;
2297	2528	}
2298	2529
2299	2530	static int apply_to_pud_range(struct mm_struct mm, p4d_t p4d,
2300	2531	unsigned long addr, unsigned long end,
2301		- pte_fn_t fn, void *data)
	2532	+ pte_fn_t fn, void *data, bool create,
	2533	+ pgtbl_mod_mask *mask)
2302	2534	{
2303	2535	pud_t *pud;
2304	2536	unsigned long next;
2305		- int err;
	2537	+ int err = 0;
2306	2538
2307		- pud = pud_alloc(mm, p4d, addr);
2308		- if (!pud)
2309		- return -ENOMEM;
	2539	+ if (create) {
	2540	+ pud = pud_alloc_track(mm, p4d, addr, mask);
	2541	+ if (!pud)
	2542	+ return -ENOMEM;
	2543	+ } else {
	2544	+ pud = pud_offset(p4d, addr);
	2545	+ }
2310	2546	do {
2311	2547	next = pud_addr_end(addr, end);
2312		- err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2313		- if (err)
2314		- break;
	2548	+ if (create \|\| !pud_none_or_clear_bad(pud)) {
	2549	+ err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
	2550	+ create, mask);
	2551	+ if (err)
	2552	+ break;
	2553	+ }
2315	2554	} while (pud++, addr = next, addr != end);
2316	2555	return err;
2317	2556	}
2318	2557
2319	2558	static int apply_to_p4d_range(struct mm_struct mm, pgd_t pgd,
2320	2559	unsigned long addr, unsigned long end,
2321		- pte_fn_t fn, void *data)
	2560	+ pte_fn_t fn, void *data, bool create,
	2561	+ pgtbl_mod_mask *mask)
2322	2562	{
2323	2563	p4d_t *p4d;
2324	2564	unsigned long next;
2325		- int err;
	2565	+ int err = 0;
2326	2566
2327		- p4d = p4d_alloc(mm, pgd, addr);
2328		- if (!p4d)
2329		- return -ENOMEM;
	2567	+ if (create) {
	2568	+ p4d = p4d_alloc_track(mm, pgd, addr, mask);
	2569	+ if (!p4d)
	2570	+ return -ENOMEM;
	2571	+ } else {
	2572	+ p4d = p4d_offset(pgd, addr);
	2573	+ }
2330	2574	do {
2331	2575	next = p4d_addr_end(addr, end);
2332		- err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
	2576	+ if (create \|\| !p4d_none_or_clear_bad(p4d)) {
	2577	+ err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
	2578	+ create, mask);
	2579	+ if (err)
	2580	+ break;
	2581	+ }
	2582	+ } while (p4d++, addr = next, addr != end);
	2583	+ return err;
	2584	+}
	2585	+
	2586	+static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
	2587	+ unsigned long size, pte_fn_t fn,
	2588	+ void *data, bool create)
	2589	+{
	2590	+ pgd_t *pgd;
	2591	+ unsigned long start = addr, next;
	2592	+ unsigned long end = addr + size;
	2593	+ pgtbl_mod_mask mask = 0;
	2594	+ int err = 0;
	2595	+
	2596	+ if (WARN_ON(addr >= end))
	2597	+ return -EINVAL;
	2598	+
	2599	+ pgd = pgd_offset(mm, addr);
	2600	+ do {
	2601	+ next = pgd_addr_end(addr, end);
	2602	+ if (!create && pgd_none_or_clear_bad(pgd))
	2603	+ continue;
	2604	+ err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
2333	2605	if (err)
2334	2606	break;
2335		- } while (p4d++, addr = next, addr != end);
	2607	+ } while (pgd++, addr = next, addr != end);
	2608	+
	2609	+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
	2610	+ arch_sync_kernel_mappings(start, start + size);
	2611	+
2336	2612	return err;
2337	2613	}
2338	2614
..	..	@@ -2343,25 +2619,240 @@
2343	2619	int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2344	2620	unsigned long size, pte_fn_t fn, void *data)
2345	2621	{
2346		- pgd_t *pgd;
2347		- unsigned long next;
2348		- unsigned long end = addr + size;
2349		- int err;
2350		-
2351		- if (WARN_ON(addr >= end))
2352		- return -EINVAL;
2353		-
2354		- pgd = pgd_offset(mm, addr);
2355		- do {
2356		- next = pgd_addr_end(addr, end);
2357		- err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
2358		- if (err)
2359		- break;
2360		- } while (pgd++, addr = next, addr != end);
2361		-
2362		- return err;
	2622	+ return __apply_to_page_range(mm, addr, size, fn, data, true);
2363	2623	}
2364	2624	EXPORT_SYMBOL_GPL(apply_to_page_range);
	2625	+
	2626	+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
	2627	+static bool pte_spinlock(struct vm_fault *vmf)
	2628	+{
	2629	+ bool ret = false;
	2630	+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	2631	+ pmd_t pmdval;
	2632	+#endif
	2633	+
	2634	+ /* Check if vma is still valid */
	2635	+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
	2636	+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	2637	+ spin_lock(vmf->ptl);
	2638	+ return true;
	2639	+ }
	2640	+
	2641	+ local_irq_disable();
	2642	+ if (vma_has_changed(vmf)) {
	2643	+ trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
	2644	+ goto out;
	2645	+ }
	2646	+
	2647	+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	2648	+ /*
	2649	+ * We check if the pmd value is still the same to ensure that there
	2650	+ * is no huge page collapse operation in progress behind our back.
	2651	+ */
	2652	+ pmdval = READ_ONCE(*vmf->pmd);
	2653	+ if (!pmd_same(pmdval, vmf->orig_pmd)) {
	2654	+ trace_spf_pmd_changed(_RET_IP_, vmf->vma, vmf->address);
	2655	+ goto out;
	2656	+ }
	2657	+#endif
	2658	+
	2659	+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	2660	+ if (unlikely(!spin_trylock(vmf->ptl))) {
	2661	+ trace_spf_pte_lock(_RET_IP_, vmf->vma, vmf->address);
	2662	+ goto out;
	2663	+ }
	2664	+
	2665	+ if (vma_has_changed(vmf)) {
	2666	+ spin_unlock(vmf->ptl);
	2667	+ trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
	2668	+ goto out;
	2669	+ }
	2670	+
	2671	+ ret = true;
	2672	+out:
	2673	+ local_irq_enable();
	2674	+ return ret;
	2675	+}
	2676	+
	2677	+static bool __pte_map_lock_speculative(struct vm_fault *vmf, unsigned long addr)
	2678	+{
	2679	+ bool ret = false;
	2680	+ pte_t *pte;
	2681	+ spinlock_t *ptl;
	2682	+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	2683	+ pmd_t pmdval;
	2684	+#endif
	2685	+
	2686	+ /*
	2687	+ * The first vma_has_changed() guarantees the page-tables are still
	2688	+ * valid, having IRQs disabled ensures they stay around, hence the
	2689	+ * second vma_has_changed() to make sure they are still valid once
	2690	+ * we've got the lock. After that a concurrent zap_pte_range() will
	2691	+ * block on the PTL and thus we're safe.
	2692	+ */
	2693	+ local_irq_disable();
	2694	+ if (vma_has_changed(vmf)) {
	2695	+ trace_spf_vma_changed(_RET_IP_, vmf->vma, addr);
	2696	+ goto out;
	2697	+ }
	2698	+
	2699	+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	2700	+ /*
	2701	+ * We check if the pmd value is still the same to ensure that there
	2702	+ * is no huge page collapse operation in progress behind our back.
	2703	+ */
	2704	+ pmdval = READ_ONCE(*vmf->pmd);
	2705	+ if (!pmd_same(pmdval, vmf->orig_pmd)) {
	2706	+ trace_spf_pmd_changed(_RET_IP_, vmf->vma, addr);
	2707	+ goto out;
	2708	+ }
	2709	+#endif
	2710	+
	2711	+ /*
	2712	+ * Same as pte_offset_map_lock() except that we call
	2713	+ * spin_trylock() in place of spin_lock() to avoid a race with the
	2714	+ * unmap path, which may hold the lock and wait for this CPU
	2715	+ * to invalidate the TLB while this CPU has IRQs disabled.
	2716	+ * Since we are in a speculative path, accept that it could fail.
	2717	+ */
	2718	+ ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	2719	+ pte = pte_offset_map(vmf->pmd, addr);
	2720	+ if (unlikely(!spin_trylock(ptl))) {
	2721	+ pte_unmap(pte);
	2722	+ trace_spf_pte_lock(_RET_IP_, vmf->vma, addr);
	2723	+ goto out;
	2724	+ }
	2725	+
	2726	+ if (vma_has_changed(vmf)) {
	2727	+ pte_unmap_unlock(pte, ptl);
	2728	+ trace_spf_vma_changed(_RET_IP_, vmf->vma, addr);
	2729	+ goto out;
	2730	+ }
	2731	+
	2732	+ vmf->pte = pte;
	2733	+ vmf->ptl = ptl;
	2734	+ ret = true;
	2735	+out:
	2736	+ local_irq_enable();
	2737	+ return ret;
	2738	+}
	2739	+
	2740	+static bool pte_map_lock(struct vm_fault *vmf)
	2741	+{
	2742	+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
	2743	+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
	2744	+ vmf->address, &vmf->ptl);
	2745	+ return true;
	2746	+ }
	2747	+
	2748	+ return __pte_map_lock_speculative(vmf, vmf->address);
	2749	+}
	2750	+
	2751	+bool pte_map_lock_addr(struct vm_fault *vmf, unsigned long addr)
	2752	+{
	2753	+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
	2754	+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
	2755	+ addr, &vmf->ptl);
	2756	+ return true;
	2757	+ }
	2758	+
	2759	+ return __pte_map_lock_speculative(vmf, addr);
	2760	+}
	2761	+
	2762	+static bool __read_mostly allow_file_spec_access;
	2763	+static int __init allow_file_spec_access_setup(char *str)
	2764	+{
	2765	+ allow_file_spec_access = true;
	2766	+ return 1;
	2767	+}
	2768	+__setup("allow_file_spec_access", allow_file_spec_access_setup);
	2769	+
	2770	+static bool vmf_allows_speculation(struct vm_fault *vmf)
	2771	+{
	2772	+ if (vma_is_anonymous(vmf->vma)) {
	2773	+ /*
	2774	+ * __anon_vma_prepare() requires the mmap_sem to be held
	2775	+ * because vm_next and vm_prev must be safe. This can't be
	2776	+ * guaranteed in the speculative path.
	2777	+ */
	2778	+ if (!vmf->vma->anon_vma) {
	2779	+ trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
	2780	+ return false;
	2781	+ }
	2782	+ return true;
	2783	+ }
	2784	+
	2785	+ if (!allow_file_spec_access) {
	2786	+ /*
	2787	+ * Can't call vm_ops services as we don't know what they would
	2788	+ * do with the VMA.
	2789	+ * This includes huge pages from hugetlbfs.
	2790	+ */
	2791	+ trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
	2792	+ return false;
	2793	+ }
	2794	+
	2795	+ if (!(vmf->vma->vm_flags & VM_SHARED) &&
	2796	+ (vmf->flags & FAULT_FLAG_WRITE) &&
	2797	+ !vmf->vma->anon_vma) {
	2798	+ /*
	2799	+ * non-anonymous private COW without anon_vma.
	2800	+ * See above.
	2801	+ */
	2802	+ trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
	2803	+ return false;
	2804	+ }
	2805	+
	2806	+ if (vmf->vma->vm_ops->allow_speculation &&
	2807	+ vmf->vma->vm_ops->allow_speculation()) {
	2808	+ return true;
	2809	+ }
	2810	+
	2811	+ trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
	2812	+ return false;
	2813	+}
	2814	+
	2815	+#else
	2816	+static inline bool pte_spinlock(struct vm_fault *vmf)
	2817	+{
	2818	+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	2819	+ spin_lock(vmf->ptl);
	2820	+ return true;
	2821	+}
	2822	+
	2823	+static inline bool pte_map_lock(struct vm_fault *vmf)
	2824	+{
	2825	+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
	2826	+ vmf->address, &vmf->ptl);
	2827	+ return true;
	2828	+}
	2829	+
	2830	+inline bool pte_map_lock_addr(struct vm_fault *vmf, unsigned long addr)
	2831	+{
	2832	+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
	2833	+ addr, &vmf->ptl);
	2834	+ return true;
	2835	+}
	2836	+
	2837	+static inline bool vmf_allows_speculation(struct vm_fault *vmf)
	2838	+{
	2839	+ return false;
	2840	+}
	2841	+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
	2842	+
	2843	+/*
	2844	+ * Scan a region of virtual memory, calling a provided function on
	2845	+ * each leaf page table where it exists.
	2846	+ *
	2847	+ * Unlike apply_to_page_range, this does _not_ fill in page tables
	2848	+ * where they are absent.
	2849	+ */
	2850	+int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
	2851	+ unsigned long size, pte_fn_t fn, void *data)
	2852	+{
	2853	+ return __apply_to_page_range(mm, addr, size, fn, data, false);
	2854	+}
	2855	+EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
2365	2856
2366	2857	/*
2367	2858	* handle_pte_fault chooses page fault handler according to an entry which was
..	..	@@ -2370,21 +2861,29 @@
2370	2861	* parts, do_swap_page must check under lock before unmapping the pte and
2371	2862	* proceeding (but do_wp_page is only called after already making such a check;
2372	2863	* and do_anonymous_page can safely check later on).
	2864	+ *
	2865	+ * pte_unmap_same() returns:
	2866	+ * 0 if the PTEs are the same
	2867	+ * VM_FAULT_PTNOTSAME if the PTEs are different
	2868	+ * VM_FAULT_RETRY if the VMA has changed behind our back during
	2869	+ * speculative page fault handling.
2373	2870	*/
2374		-static inline int pte_unmap_same(struct mm_struct mm, pmd_t pmd,
2375		- pte_t *page_table, pte_t orig_pte)
	2871	+static inline int pte_unmap_same(struct vm_fault *vmf)
2376	2872	{
2377		- int same = 1;
	2873	+ int ret = 0;
	2874	+
2378	2875	#if defined(CONFIG_SMP) \|\| defined(CONFIG_PREEMPT)
2379	2876	if (sizeof(pte_t) > sizeof(unsigned long)) {
2380		- spinlock_t *ptl = pte_lockptr(mm, pmd);
2381		- spin_lock(ptl);
2382		- same = pte_same(*page_table, orig_pte);
2383		- spin_unlock(ptl);
	2877	+ if (pte_spinlock(vmf)) {
	2878	+ if (!pte_same(*vmf->pte, vmf->orig_pte))
	2879	+ ret = VM_FAULT_PTNOTSAME;
	2880	+ spin_unlock(vmf->ptl);
	2881	+ } else
	2882	+ ret = VM_FAULT_RETRY;
2384	2883	}
2385	2884	#endif
2386		- pte_unmap(page_table);
2387		- return same;
	2885	+ pte_unmap(vmf->pte);
	2886	+ return ret;
2388	2887	}
2389	2888
2390	2889	static inline bool cow_user_page(struct page dst, struct page src,
..	..	@@ -2397,8 +2896,6 @@
2397	2896	struct vm_area_struct *vma = vmf->vma;
2398	2897	struct mm_struct *mm = vma->vm_mm;
2399	2898	unsigned long addr = vmf->address;
2400		-
2401		- debug_dma_assert_idle(src);
2402	2899
2403	2900	if (likely(src)) {
2404	2901	copy_user_highpage(dst, src, addr, vma);
..	..	@@ -2426,10 +2923,9 @@
2426	2923	if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2427	2924	/*
2428	2925	* Other thread has already handled the fault
2429		- * and we don't need to do anything. If it's
2430		- * not the case, the fault will be triggered
2431		- * again on the same address.
	2926	+ * so we only need to update the local tlb
2432	2927	*/
	2928	+ update_mmu_tlb(vma, addr, vmf->pte);
2433	2929	ret = false;
2434	2930	goto pte_unlock;
2435	2931	}
..	..	@@ -2453,13 +2949,14 @@
2453	2949	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
2454	2950	locked = true;
2455	2951	if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2456		- /* The PTE changed under us. Retry page fault. */
	2952	+ /* The PTE changed under us, update local tlb */
	2953	+ update_mmu_tlb(vma, addr, vmf->pte);
2457	2954	ret = false;
2458	2955	goto pte_unlock;
2459	2956	}
2460	2957
2461	2958	/*
2462		- * The same page can be mapped back since last copy attampt.
	2959	+ * The same page can be mapped back since last copy attempt.
2463	2960	* Try to copy again under PTL.
2464	2961	*/
2465	2962	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
..	..	@@ -2538,10 +3035,11 @@
2538	3035	*
2539	3036	* The function expects the page to be locked and unlocks it.
2540	3037	*/
2541		-static void fault_dirty_shared_page(struct vm_area_struct *vma,
2542		- struct page *page)
	3038	+static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
2543	3039	{
	3040	+ struct vm_area_struct *vma = vmf->vma;
2544	3041	struct address_space *mapping;
	3042	+ struct page *page = vmf->page;
2545	3043	bool dirtied;
2546	3044	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
2547	3045
..	..	@@ -2556,16 +3054,30 @@
2556	3054	mapping = page_rmapping(page);
2557	3055	unlock_page(page);
2558	3056
2559		- if ((dirtied \|\| page_mkwrite) && mapping) {
2560		- /*
2561		- * Some device drivers do not set page.mapping
2562		- * but still dirty their pages
2563		- */
2564		- balance_dirty_pages_ratelimited(mapping);
2565		- }
2566		-
2567	3057	if (!page_mkwrite)
2568	3058	file_update_time(vma->vm_file);
	3059	+
	3060	+ /*
	3061	+ * Throttle page dirtying rate down to writeback speed.
	3062	+ *
	3063	+ * mapping may be NULL here because some device drivers do not
	3064	+ * set page.mapping but still dirty their pages
	3065	+ *
	3066	+ * Drop the mmap_lock before waiting on IO, if we can. The file
	3067	+ * is pinning the mapping, as per above.
	3068	+ */
	3069	+ if ((dirtied \|\| page_mkwrite) && mapping) {
	3070	+ struct file *fpin;
	3071	+
	3072	+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
	3073	+ balance_dirty_pages_ratelimited(mapping);
	3074	+ if (fpin) {
	3075	+ fput(fpin);
	3076	+ return VM_FAULT_RETRY;
	3077	+ }
	3078	+ }
	3079	+
	3080	+ return 0;
2569	3081	}
2570	3082
2571	3083	/*
..	..	@@ -2592,16 +3104,17 @@
2592	3104
2593	3105	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2594	3106	entry = pte_mkyoung(vmf->orig_pte);
2595		- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	3107	+ entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
2596	3108	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
2597	3109	update_mmu_cache(vma, vmf->address, vmf->pte);
2598	3110	pte_unmap_unlock(vmf->pte, vmf->ptl);
	3111	+ count_vm_event(PGREUSE);
2599	3112	}
2600	3113
2601	3114	/*
2602	3115	* Handle the case of a page which we actually need to copy to a new page.
2603	3116	*
2604		- * Called with mmap_sem locked and the old page referenced, but
	3117	+ * Called with mmap_lock locked and the old page referenced, but
2605	3118	* without the ptl held.
2606	3119	*
2607	3120	* High level logic flow:
..	..	@@ -2622,23 +3135,22 @@
2622	3135	struct page *new_page = NULL;
2623	3136	pte_t entry;
2624	3137	int page_copied = 0;
2625		- const unsigned long mmun_start = vmf->address & PAGE_MASK;
2626		- const unsigned long mmun_end = mmun_start + PAGE_SIZE;
2627		- struct mem_cgroup *memcg;
	3138	+ struct mmu_notifier_range range;
	3139	+ vm_fault_t ret = VM_FAULT_OOM;
2628	3140
2629	3141	if (unlikely(anon_vma_prepare(vma)))
2630		- goto oom;
	3142	+ goto out;
2631	3143
2632	3144	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
2633	3145	new_page = alloc_zeroed_user_highpage_movable(vma,
2634	3146	vmf->address);
2635	3147	if (!new_page)
2636		- goto oom;
	3148	+ goto out;
2637	3149	} else {
2638	3150	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2639	3151	vmf->address);
2640	3152	if (!new_page)
2641		- goto oom;
	3153	+ goto out;
2642	3154
2643	3155	if (!cow_user_page(new_page, old_page, vmf)) {
2644	3156	/*
..	..	@@ -2652,19 +3164,27 @@
2652	3164	put_page(old_page);
2653	3165	return 0;
2654	3166	}
	3167	+ trace_android_vh_cow_user_page(vmf, new_page);
2655	3168	}
2656	3169
2657		- if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
2658		- goto oom_free_new;
	3170	+ if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
	3171	+ goto out_free_new;
	3172	+ cgroup_throttle_swaprate(new_page, GFP_KERNEL);
2659	3173
2660	3174	__SetPageUptodate(new_page);
2661	3175
2662		- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
	3176	+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
	3177	+ vmf->address & PAGE_MASK,
	3178	+ (vmf->address & PAGE_MASK) + PAGE_SIZE);
	3179	+ mmu_notifier_invalidate_range_start(&range);
2663	3180
2664	3181	/*
2665	3182	* Re-check the pte - we dropped the lock
2666	3183	*/
2667		- vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
	3184	+ if (!pte_map_lock(vmf)) {
	3185	+ ret = VM_FAULT_RETRY;
	3186	+ goto out_invalidate_end;
	3187	+ }
2668	3188	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2669	3189	if (old_page) {
2670	3190	if (!PageAnon(old_page)) {
..	..	@@ -2676,8 +3196,9 @@
2676	3196	inc_mm_counter_fast(mm, MM_ANONPAGES);
2677	3197	}
2678	3198	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2679		- entry = mk_pte(new_page, vma->vm_page_prot);
2680		- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	3199	+ entry = mk_pte(new_page, vmf->vma_page_prot);
	3200	+ entry = pte_sw_mkyoung(entry);
	3201	+ entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
2681	3202	/*
2682	3203	* Clear the pte entry and flush it first, before updating the
2683	3204	* pte with the new entry. This will avoid a race condition
..	..	@@ -2685,9 +3206,8 @@
2685	3206	* thread doing COW.
2686	3207	*/
2687	3208	ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
2688		- page_add_new_anon_rmap(new_page, vma, vmf->address, false);
2689		- mem_cgroup_commit_charge(new_page, memcg, false, false);
2690		- lru_cache_add_active_or_unevictable(new_page, vma);
	3209	+ __page_add_new_anon_rmap(new_page, vma, vmf->address, false);
	3210	+ __lru_cache_add_inactive_or_unevictable(new_page, vmf->vma_flags);
2691	3211	/*
2692	3212	* We call the notify macro here because, when using secondary
2693	3213	* mmu page tables (such as kvm shadow page tables), we want the
..	..	@@ -2725,7 +3245,7 @@
2725	3245	new_page = old_page;
2726	3246	page_copied = 1;
2727	3247	} else {
2728		- mem_cgroup_cancel_charge(new_page, memcg, false);
	3248	+ update_mmu_tlb(vma, vmf->address, vmf->pte);
2729	3249	}
2730	3250
2731	3251	if (new_page)
..	..	@@ -2736,13 +3256,13 @@
2736	3256	* No need to double call mmu_notifier->invalidate_range() callback as
2737	3257	* the above ptep_clear_flush_notify() did already call it.
2738	3258	*/
2739		- mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
	3259	+ mmu_notifier_invalidate_range_only_end(&range);
2740	3260	if (old_page) {
2741	3261	/*
2742	3262	* Don't let another task, with possibly unlocked vma,
2743	3263	* keep the mlocked page.
2744	3264	*/
2745		- if (page_copied && (vma->vm_flags & VM_LOCKED)) {
	3265	+ if (page_copied && (vmf->vma_flags & VM_LOCKED)) {
2746	3266	lock_page(old_page); /* LRU manipulation */
2747	3267	if (PageMlocked(old_page))
2748	3268	munlock_vma_page(old_page);
..	..	@@ -2751,12 +3271,14 @@
2751	3271	put_page(old_page);
2752	3272	}
2753	3273	return page_copied ? VM_FAULT_WRITE : 0;
2754		-oom_free_new:
	3274	+out_invalidate_end:
	3275	+ mmu_notifier_invalidate_range_only_end(&range);
	3276	+out_free_new:
2755	3277	put_page(new_page);
2756		-oom:
	3278	+out:
2757	3279	if (old_page)
2758	3280	put_page(old_page);
2759		- return VM_FAULT_OOM;
	3281	+ return ret;
2760	3282	}
2761	3283
2762	3284	/**
..	..	@@ -2767,23 +3289,25 @@
2767	3289	*
2768	3290	* This function handles all that is needed to finish a write page fault in a
2769	3291	* shared mapping due to PTE being read-only once the mapped page is prepared.
2770		- * It handles locking of PTE and modifying it. The function returns
2771		- * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
2772		- * lock.
	3292	+ * It handles locking of PTE and modifying it.
2773	3293	*
2774	3294	* The function expects the page to be locked or other protection against
2775	3295	* concurrent faults / writeback (such as DAX radix tree locks).
	3296	+ *
	3297	+ * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
	3298	+ * we acquired PTE lock.
2776	3299	*/
2777	3300	vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
2778	3301	{
2779		- WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
2780		- vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
2781		- &vmf->ptl);
	3302	+ WARN_ON_ONCE(!(vmf->vma_flags & VM_SHARED));
	3303	+ if (!pte_map_lock(vmf))
	3304	+ return VM_FAULT_RETRY;
2782	3305	/*
2783	3306	* We might have raced with another page fault while we released the
2784	3307	* pte_offset_map_lock.
2785	3308	*/
2786	3309	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
	3310	+ update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
2787	3311	pte_unmap_unlock(vmf->pte, vmf->ptl);
2788	3312	return VM_FAULT_NOPAGE;
2789	3313	}
..	..	@@ -2817,6 +3341,7 @@
2817	3341	__releases(vmf->ptl)
2818	3342	{
2819	3343	struct vm_area_struct *vma = vmf->vma;
	3344	+ vm_fault_t ret = VM_FAULT_WRITE;
2820	3345
2821	3346	get_page(vmf->page);
2822	3347
..	..	@@ -2840,10 +3365,10 @@
2840	3365	wp_page_reuse(vmf);
2841	3366	lock_page(vmf->page);
2842	3367	}
2843		- fault_dirty_shared_page(vma, vmf->page);
	3368	+ ret \|= fault_dirty_shared_page(vmf);
2844	3369	put_page(vmf->page);
2845	3370
2846		- return VM_FAULT_WRITE;
	3371	+ return ret;
2847	3372	}
2848	3373
2849	3374	/*
..	..	@@ -2860,16 +3385,32 @@
2860	3385	* change only once the write actually happens. This avoids a few races,
2861	3386	* and potentially makes it more efficient.
2862	3387	*
2863		- * We enter with non-exclusive mmap_sem (to exclude vma changes,
	3388	+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
2864	3389	* but allow concurrent faults), with pte both mapped and locked.
2865		- * We return with mmap_sem still held, but pte unmapped and unlocked.
	3390	+ * We return with mmap_lock still held, but pte unmapped and unlocked.
2866	3391	*/
2867	3392	static vm_fault_t do_wp_page(struct vm_fault *vmf)
2868	3393	__releases(vmf->ptl)
2869	3394	{
2870	3395	struct vm_area_struct *vma = vmf->vma;
2871	3396
2872		- vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
	3397	+ if (userfaultfd_pte_wp(vma, *vmf->pte)) {
	3398	+ pte_unmap_unlock(vmf->pte, vmf->ptl);
	3399	+ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
	3400	+ return VM_FAULT_RETRY;
	3401	+ return handle_userfault(vmf, VM_UFFD_WP);
	3402	+ }
	3403	+
	3404	+ /*
	3405	+ * Userfaultfd write-protect can defer flushes. Ensure the TLB
	3406	+ * is flushed in this case before copying.
	3407	+ */
	3408	+ if (unlikely(userfaultfd_wp(vmf->vma) &&
	3409	+ mm_tlb_flush_pending(vmf->vma->vm_mm)))
	3410	+ flush_tlb_page(vmf->vma, vmf->address);
	3411	+
	3412	+ vmf->page = _vm_normal_page(vma, vmf->address, vmf->orig_pte,
	3413	+ vmf->vma_flags);
2873	3414	if (!vmf->page) {
2874	3415	/*
2875	3416	* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
..	..	@@ -2878,7 +3419,7 @@
2878	3419	* We should not cow pages in a shared writeable mapping.
2879	3420	* Just mark the pages writable and/or call ops->pfn_mkwrite.
2880	3421	*/
2881		- if ((vma->vm_flags & (VM_WRITE\|VM_SHARED)) ==
	3422	+ if ((vmf->vma_flags & (VM_WRITE\|VM_SHARED)) ==
2882	3423	(VM_WRITE\|VM_SHARED))
2883	3424	return wp_pfn_shared(vmf);
2884	3425
..	..	@@ -2890,43 +3431,31 @@
2890	3431	* Take out anonymous pages first, anonymous shared vmas are
2891	3432	* not dirty accountable.
2892	3433	*/
2893		- if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
2894		- int total_map_swapcount;
2895		- if (!trylock_page(vmf->page)) {
2896		- get_page(vmf->page);
2897		- pte_unmap_unlock(vmf->pte, vmf->ptl);
2898		- lock_page(vmf->page);
2899		- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2900		- vmf->address, &vmf->ptl);
2901		- if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2902		- unlock_page(vmf->page);
2903		- pte_unmap_unlock(vmf->pte, vmf->ptl);
2904		- put_page(vmf->page);
2905		- return 0;
2906		- }
2907		- put_page(vmf->page);
	3434	+ if (PageAnon(vmf->page)) {
	3435	+ struct page *page = vmf->page;
	3436	+
	3437	+ /* PageKsm() doesn't necessarily raise the page refcount */
	3438	+ if (PageKsm(page) || page_count(page) != 1)
	3439	+ goto copy;
	3440	+ if (!trylock_page(page))
	3441	+ goto copy;
	3442	+ if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
	3443	+ unlock_page(page);
	3444	+ goto copy;
2908	3445	}
2909		- if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
2910		- if (total_map_swapcount == 1) {
2911		- /*
2912		- * The page is all ours. Move it to
2913		- * our anon_vma so the rmap code will
2914		- * not search our parent or siblings.
2915		- * Protected against the rmap code by
2916		- * the page lock.
2917		- */
2918		- page_move_anon_rmap(vmf->page, vma);
2919		- }
2920		- unlock_page(vmf->page);
2921		- wp_page_reuse(vmf);
2922		- return VM_FAULT_WRITE;
2923		- }
2924		- unlock_page(vmf->page);
2925		- } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
	3446	+ /*
	3447	+ * Ok, we've got the only map reference, and the only
	3448	+ * page count reference, and the page is locked,
	3449	+ * it's dark out, and we're wearing sunglasses. Hit it.
	3450	+ */
	3451	+ unlock_page(page);
	3452	+ wp_page_reuse(vmf);
	3453	+ return VM_FAULT_WRITE;
	3454	+ } else if (unlikely((vmf->vma_flags & (VM_WRITE|VM_SHARED)) ==
2926	3455	(VM_WRITE|VM_SHARED))) {
2927	3456	return wp_page_shared(vmf);
2928	3457	}
2929		-
	3458	+copy:
2930	3459	/*
2931	3460	* Ok, we need to copy. Oh, well..
2932	3461	*/
..	..	@@ -2989,7 +3518,7 @@
2989	3518
2990	3519	details.check_mapping = mapping;
2991	3520	details.first_index = page->index;
2992		- details.last_index = page->index + hpage_nr_pages(page) - 1;
	3521	+ details.last_index = page->index + thp_nr_pages(page) - 1;
2993	3522	details.single_page = page;
2994	3523
2995	3524	i_mmap_lock_write(mapping);
..	..	@@ -3063,26 +3592,40 @@
3063	3592	EXPORT_SYMBOL(unmap_mapping_range);
3064	3593
3065	3594	/*
3066		- * We enter with non-exclusive mmap_sem (to exclude vma changes,
	3595	+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
3067	3596	* but allow concurrent faults), and pte mapped but not yet locked.
3068	3597	* We return with pte unmapped and unlocked.
3069	3598	*
3070		- * We return with the mmap_sem locked or unlocked in the same cases
	3599	+ * We return with the mmap_lock locked or unlocked in the same cases
3071	3600	* as does filemap_fault().
3072	3601	*/
3073	3602	vm_fault_t do_swap_page(struct vm_fault *vmf)
3074	3603	{
3075	3604	struct vm_area_struct *vma = vmf->vma;
3076	3605	struct page *page = NULL, *swapcache;
3077		- struct mem_cgroup *memcg;
3078	3606	swp_entry_t entry;
3079	3607	pte_t pte;
3080	3608	int locked;
3081	3609	int exclusive = 0;
3082		- vm_fault_t ret = 0;
	3610	+ vm_fault_t ret;
	3611	+ void *shadow = NULL;
3083	3612
3084		- if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
	3613	+ if (vmf->flags & FAULT_FLAG_SPECULATIVE) {
	3614	+ pte_unmap(vmf->pte);
	3615	+ return VM_FAULT_RETRY;
	3616	+ }
	3617	+
	3618	+ ret = pte_unmap_same(vmf);
	3619	+ if (ret) {
	3620	+ /*
	3621	+ * If pte != orig_pte, this means another thread did the
	3622	+ * swap operation in our back.
	3623	+ * So nothing else to do.
	3624	+ */
	3625	+ if (ret == VM_FAULT_PTNOTSAME)
	3626	+ ret = 0;
3085	3627	goto out;
	3628	+ }
3086	3629
3087	3630	entry = pte_to_swp_entry(vmf->orig_pte);
3088	3631	if (unlikely(non_swap_entry(entry))) {
..	..	@@ -3090,13 +3633,8 @@
3090	3633	migration_entry_wait(vma->vm_mm, vmf->pmd,
3091	3634	vmf->address);
3092	3635	} else if (is_device_private_entry(entry)) {
3093		- /*
3094		- * For un-addressable device memory we call the pgmap
3095		- * fault handler callback. The callback must migrate
3096		- * the page back to some CPU accessible page.
3097		- */
3098		- ret = device_private_entry_fault(vma, vmf->address, entry,
3099		- vmf->flags, vmf->pmd);
	3636	+ vmf->page = device_private_entry_to_page(entry);
	3637	+ ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
3100	3638	} else if (is_hwpoison_entry(entry)) {
3101	3639	ret = VM_FAULT_HWPOISON;
3102	3640	} else {
..	..	@@ -3114,18 +3652,48 @@
3114	3652	if (!page) {
3115	3653	struct swap_info_struct *si = swp_swap_info(entry);
3116	3654
3117		- if (si->flags & SWP_SYNCHRONOUS_IO &&
3118		- __swap_count(si, entry) == 1) {
	3655	+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
	3656	+ __swap_count(entry) == 1) {
3119	3657	/* skip swapcache */
3120		- page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
3121		- vmf->address);
	3658	+ gfp_t flags = GFP_HIGHUSER_MOVABLE;
	3659	+
	3660	+ trace_android_rvh_set_skip_swapcache_flags(&flags);
	3661	+ page = alloc_page_vma(flags, vma, vmf->address);
3122	3662	if (page) {
	3663	+ int err;
	3664	+
3123	3665	__SetPageLocked(page);
3124	3666	__SetPageSwapBacked(page);
3125	3667	set_page_private(page, entry.val);
3126		- lru_cache_add_anon(page);
	3668	+
	3669	+ /* Tell memcg to use swap ownership records */
	3670	+ SetPageSwapCache(page);
	3671	+ err = mem_cgroup_charge(page, vma->vm_mm,
	3672	+ GFP_KERNEL);
	3673	+ ClearPageSwapCache(page);
	3674	+ if (err) {
	3675	+ ret = VM_FAULT_OOM;
	3676	+ goto out_page;
	3677	+ }
	3678	+
	3679	+ shadow = get_shadow_from_swap_cache(entry);
	3680	+ if (shadow)
	3681	+ workingset_refault(page, shadow);
	3682	+
	3683	+ lru_cache_add(page);
3127	3684	swap_readpage(page, true);
3128	3685	}
	3686	+ } else if (vmf->flags & FAULT_FLAG_SPECULATIVE) {
	3687	+ /*
	3688	+ * Don't try readahead during a speculative page fault
	3689	+ * as the VMA's boundaries may change in our back.
	3690	+ * If the page is not in the swap cache and synchronous
	3691	+ * read is disabled, fall back to the regular page fault
	3692	+ * mechanism.
	3693	+ */
	3694	+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	3695	+ ret = VM_FAULT_RETRY;
	3696	+ goto out;
3129	3697	} else {
3130	3698	page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
3131	3699	vmf);
..	..	@@ -3134,11 +3702,16 @@
3134	3702
3135	3703	if (!page) {
3136	3704	/*
3137		- * Back out if somebody else faulted in this pte
3138		- * while we released the pte lock.
	3705	+ * Back out if the VMA has changed in our back during
	3706	+ * a speculative page fault or if somebody else
	3707	+ * faulted in this pte while we released the pte lock.
3139	3708	*/
3140		- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3141		- vmf->address, &vmf->ptl);
	3709	+ if (!pte_map_lock(vmf)) {
	3710	+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	3711	+ ret = VM_FAULT_RETRY;
	3712	+ goto out;
	3713	+ }
	3714	+
3142	3715	if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
3143	3716	ret = VM_FAULT_OOM;
3144	3717	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
..	..	@@ -3184,17 +3757,16 @@
3184	3757	goto out_page;
3185	3758	}
3186	3759
3187		- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
3188		- &memcg, false)) {
3189		- ret = VM_FAULT_OOM;
3190		- goto out_page;
3191		- }
	3760	+ cgroup_throttle_swaprate(page, GFP_KERNEL);
3192	3761
3193	3762	/*
3194		- * Back out if somebody else already faulted in this pte.
	3763	+ * Back out if the VMA has changed in our back during a speculative
	3764	+ * page fault or if somebody else already faulted in this pte.
3195	3765	*/
3196		- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3197		- &vmf->ptl);
	3766	+ if (!pte_map_lock(vmf)) {
	3767	+ ret = VM_FAULT_RETRY;
	3768	+ goto out_page;
	3769	+ }
3198	3770	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
3199	3771	goto out_nomap;
3200	3772
..	..	@@ -3215,9 +3787,9 @@
3215	3787
3216	3788	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3217	3789	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
3218		- pte = mk_pte(page, vma->vm_page_prot);
	3790	+ pte = mk_pte(page, vmf->vma_page_prot);
3219	3791	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
3220		- pte = maybe_mkwrite(pte_mkdirty(pte), vma);
	3792	+ pte = maybe_mkwrite(pte_mkdirty(pte), vmf->vma_flags);
3221	3793	vmf->flags &= ~FAULT_FLAG_WRITE;
3222	3794	ret |= VM_FAULT_WRITE;
3223	3795	exclusive = RMAP_EXCLUSIVE;
..	..	@@ -3225,24 +3797,26 @@
3225	3797	flush_icache_page(vma, page);
3226	3798	if (pte_swp_soft_dirty(vmf->orig_pte))
3227	3799	pte = pte_mksoft_dirty(pte);
	3800	+ if (pte_swp_uffd_wp(vmf->orig_pte)) {
	3801	+ pte = pte_mkuffd_wp(pte);
	3802	+ pte = pte_wrprotect(pte);
	3803	+ }
3228	3804	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
3229	3805	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
3230	3806	vmf->orig_pte = pte;
3231	3807
3232	3808	/* ksm created a completely new copy */
3233	3809	if (unlikely(page != swapcache && swapcache)) {
3234		- page_add_new_anon_rmap(page, vma, vmf->address, false);
3235		- mem_cgroup_commit_charge(page, memcg, false, false);
3236		- lru_cache_add_active_or_unevictable(page, vma);
	3810	+ __page_add_new_anon_rmap(page, vma, vmf->address, false);
	3811	+ __lru_cache_add_inactive_or_unevictable(page, vmf->vma_flags);
3237	3812	} else {
3238	3813	do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
3239		- mem_cgroup_commit_charge(page, memcg, true, false);
3240		- activate_page(page);
3241	3814	}
3242	3815
	3816	+ trace_android_vh_swapin_add_anon_rmap(vmf, page);
3243	3817	swap_free(entry);
3244	3818	if (mem_cgroup_swap_full(page) ||
3245		- (vma->vm_flags & VM_LOCKED) \|\| PageMlocked(page))
	3819	+ (vmf->vma_flags & VM_LOCKED) \|\| PageMlocked(page))
3246	3820	try_to_free_swap(page);
3247	3821	unlock_page(page);
3248	3822	if (page != swapcache && swapcache) {
..	..	@@ -3272,7 +3846,6 @@
3272	3846	out:
3273	3847	return ret;
3274	3848	out_nomap:
3275		- mem_cgroup_cancel_charge(page, memcg, false);
3276	3849	pte_unmap_unlock(vmf->pte, vmf->ptl);
3277	3850	out_page:
3278	3851	unlock_page(page);
..	..	@@ -3286,51 +3859,65 @@
3286	3859	}
3287	3860
3288	3861	/*
3289		- * We enter with non-exclusive mmap_sem (to exclude vma changes,
	3862	+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
3290	3863	* but allow concurrent faults), and pte mapped but not yet locked.
3291		- * We return with mmap_sem still held, but pte unmapped and unlocked.
	3864	+ * We return with mmap_lock still held, but pte unmapped and unlocked.
3292	3865	*/
3293	3866	static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
3294	3867	{
3295	3868	struct vm_area_struct *vma = vmf->vma;
3296		- struct mem_cgroup *memcg;
3297	3869	struct page *page;
3298	3870	vm_fault_t ret = 0;
3299	3871	pte_t entry;
3300	3872
3301	3873	/* File mapping without ->vm_ops ? */
3302		- if (vma->vm_flags & VM_SHARED)
	3874	+ if (vmf->vma_flags & VM_SHARED)
3303	3875	return VM_FAULT_SIGBUS;
	3876	+
	3877	+ /* Do not check unstable pmd, if it's changed will retry later */
	3878	+ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
	3879	+ goto skip_pmd_checks;
3304	3880
3305	3881	/*
3306	3882	* Use pte_alloc() instead of pte_alloc_map(). We can't run
3307	3883	* pte_offset_map() on pmds where a huge pmd might be created
3308	3884	* from a different thread.
3309	3885	*
3310		- * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
	3886	+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
3311	3887	* parallel threads are excluded by other means.
3312	3888	*
3313		- * Here we only have down_read(mmap_sem).
	3889	+ * Here we only have mmap_read_lock(mm).
3314	3890	*/
3315		- if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
	3891	+ if (pte_alloc(vma->vm_mm, vmf->pmd))
3316	3892	return VM_FAULT_OOM;
3317	3893
3318		- /* See the comment in pte_alloc_one_map() */
	3894	+ /* See comment in handle_pte_fault() */
3319	3895	if (unlikely(pmd_trans_unstable(vmf->pmd)))
3320	3896	return 0;
3321	3897
	3898	+skip_pmd_checks:
3322	3899	/* Use the zero-page for reads */
3323	3900	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
3324	3901	!mm_forbids_zeropage(vma->vm_mm)) {
3325	3902	entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
3326		- vma->vm_page_prot));
3327		- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
3328		- vmf->address, &vmf->ptl);
3329		- if (!pte_none(*vmf->pte))
	3903	+ vmf->vma_page_prot));
	3904	+ if (!pte_map_lock(vmf))
	3905	+ return VM_FAULT_RETRY;
	3906	+ if (!pte_none(*vmf->pte)) {
	3907	+ update_mmu_tlb(vma, vmf->address, vmf->pte);
3330	3908	goto unlock;
	3909	+ }
3331	3910	ret = check_stable_address_space(vma->vm_mm);
3332	3911	if (ret)
3333	3912	goto unlock;
	3913	+ /*
	3914	+ * Don't call the userfaultfd during the speculative path.
	3915	+ * We already checked for the VMA to not be managed through
	3916	+ * userfaultfd, but it may be set in our back once we have lock
	3917	+ * the pte. In such a case we can ignore it this time.
	3918	+ */
	3919	+ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
	3920	+ goto setpte;
3334	3921	/* Deliver the page fault to userland, check inside PT lock */
3335	3922	if (userfaultfd_missing(vma)) {
3336	3923	pte_unmap_unlock(vmf->pte, vmf->ptl);
..	..	@@ -3346,42 +3933,47 @@
3346	3933	if (!page)
3347	3934	goto oom;
3348	3935
3349		- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
3350		- false))
	3936	+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
3351	3937	goto oom_free_page;
	3938	+ cgroup_throttle_swaprate(page, GFP_KERNEL);
3352	3939
3353	3940	/*
3354	3941	* The memory barrier inside __SetPageUptodate makes sure that
3355		- * preceeding stores to the page contents become visible before
	3942	+ * preceding stores to the page contents become visible before
3356	3943	* the set_pte_at() write.
3357	3944	*/
3358	3945	__SetPageUptodate(page);
3359	3946
3360		- entry = mk_pte(page, vma->vm_page_prot);
3361		- if (vma->vm_flags & VM_WRITE)
	3947	+ entry = mk_pte(page, vmf->vma_page_prot);
	3948	+ entry = pte_sw_mkyoung(entry);
	3949	+ if (vmf->vma_flags & VM_WRITE)
3362	3950	entry = pte_mkwrite(pte_mkdirty(entry));
3363	3951
3364		- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3365		- &vmf->ptl);
3366		- if (!pte_none(*vmf->pte))
	3952	+ if (!pte_map_lock(vmf)) {
	3953	+ ret = VM_FAULT_RETRY;
3367	3954	goto release;
	3955	+ }
	3956	+
	3957	+ if (!pte_none(*vmf->pte)) {
	3958	+ update_mmu_cache(vma, vmf->address, vmf->pte);
	3959	+ goto unlock_and_release;
	3960	+ }
3368	3961
3369	3962	ret = check_stable_address_space(vma->vm_mm);
3370	3963	if (ret)
3371		- goto release;
	3964	+ goto unlock_and_release;
3372	3965
3373	3966	/* Deliver the page fault to userland, check inside PT lock */
3374		- if (userfaultfd_missing(vma)) {
	3967	+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) &&
	3968	+ userfaultfd_missing(vma)) {
3375	3969	pte_unmap_unlock(vmf->pte, vmf->ptl);
3376		- mem_cgroup_cancel_charge(page, memcg, false);
3377	3970	put_page(page);
3378	3971	return handle_userfault(vmf, VM_UFFD_MISSING);
3379	3972	}
3380	3973
3381	3974	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3382		- page_add_new_anon_rmap(page, vma, vmf->address, false);
3383		- mem_cgroup_commit_charge(page, memcg, false, false);
3384		- lru_cache_add_active_or_unevictable(page, vma);
	3975	+ __page_add_new_anon_rmap(page, vma, vmf->address, false);
	3976	+ __lru_cache_add_inactive_or_unevictable(page, vmf->vma_flags);
3385	3977	setpte:
3386	3978	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3387	3979
..	..	@@ -3390,10 +3982,11 @@
3390	3982	unlock:
3391	3983	pte_unmap_unlock(vmf->pte, vmf->ptl);
3392	3984	return ret;
	3985	+unlock_and_release:
	3986	+ pte_unmap_unlock(vmf->pte, vmf->ptl);
3393	3987	release:
3394		- mem_cgroup_cancel_charge(page, memcg, false);
3395	3988	put_page(page);
3396		- goto unlock;
	3989	+ return ret;
3397	3990	oom_free_page:
3398	3991	put_page(page);
3399	3992	oom:
..	..	@@ -3401,7 +3994,7 @@
3401	3994	}
3402	3995
3403	3996	/*
3404		- * The mmap_sem must have been held on entry, and may have been
	3997	+ * The mmap_lock must have been held on entry, and may have been
3405	3998	* released depending on flags and vma->vm_ops->fault() return value.
3406	3999	* See filemap_fault() and __lock_page_retry().
3407	4000	*/
..	..	@@ -3409,6 +4002,10 @@
3409	4002	{
3410	4003	struct vm_area_struct *vma = vmf->vma;
3411	4004	vm_fault_t ret;
	4005	+
	4006	+ /* Do not check unstable pmd, if it's changed will retry later */
	4007	+ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
	4008	+ goto skip_pmd_checks;
3412	4009
3413	4010	/*
3414	4011	* Preallocate pte before we take page_lock because this might lead to
..	..	@@ -3418,7 +4015,7 @@
3418	4015	* unlock_page(A)
3419	4016	* lock_page(B)
3420	4017	* lock_page(B)
3421		- * pte_alloc_pne
	4018	+ * pte_alloc_one
3422	4019	* shrink_page_list
3423	4020	* wait_on_page_writeback(A)
3424	4021	* SetPageWriteback(B)
..	..	@@ -3426,24 +4023,33 @@
3426	4023	* # flush A, B to clear the writeback
3427	4024	*/
3428	4025	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
3429		- vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
3430		- vmf->address);
	4026	+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3431	4027	if (!vmf->prealloc_pte)
3432	4028	return VM_FAULT_OOM;
3433	4029	smp_wmb(); /* See comment in __pte_alloc() */
3434	4030	}
3435	4031
	4032	+skip_pmd_checks:
3436	4033	ret = vma->vm_ops->fault(vmf);
3437	4034	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
3438	4035	VM_FAULT_DONE_COW)))
3439	4036	return ret;
3440	4037
3441	4038	if (unlikely(PageHWPoison(vmf->page))) {
3442		- if (ret & VM_FAULT_LOCKED)
3443		- unlock_page(vmf->page);
3444		- put_page(vmf->page);
	4039	+ struct page *page = vmf->page;
	4040	+ vm_fault_t poisonret = VM_FAULT_HWPOISON;
	4041	+ if (ret & VM_FAULT_LOCKED) {
	4042	+ if (page_mapped(page))
	4043	+ unmap_mapping_pages(page_mapping(page),
	4044	+ page->index, 1, false);
	4045	+ /* Retry if a clean page was removed from the cache. */
	4046	+ if (invalidate_inode_page(page))
	4047	+ poisonret = VM_FAULT_NOPAGE;
	4048	+ unlock_page(page);
	4049	+ }
	4050	+ put_page(page);
3445	4051	vmf->page = NULL;
3446		- return VM_FAULT_HWPOISON;
	4052	+ return poisonret;
3447	4053	}
3448	4054
3449	4055	if (unlikely(!(ret & VM_FAULT_LOCKED)))
..	..	@@ -3454,80 +4060,7 @@
3454	4060	return ret;
3455	4061	}
3456	4062
3457		-/*
3458		- * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
3459		- * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
3460		- * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
3461		- * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
3462		- */
3463		-static int pmd_devmap_trans_unstable(pmd_t *pmd)
3464		-{
3465		- return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
3466		-}
3467		-
3468		-static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
3469		-{
3470		- struct vm_area_struct *vma = vmf->vma;
3471		-
3472		- if (!pmd_none(*vmf->pmd))
3473		- goto map_pte;
3474		- if (vmf->prealloc_pte) {
3475		- vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
3476		- if (unlikely(!pmd_none(*vmf->pmd))) {
3477		- spin_unlock(vmf->ptl);
3478		- goto map_pte;
3479		- }
3480		-
3481		- mm_inc_nr_ptes(vma->vm_mm);
3482		- pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
3483		- spin_unlock(vmf->ptl);
3484		- vmf->prealloc_pte = NULL;
3485		- } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
3486		- return VM_FAULT_OOM;
3487		- }
3488		-map_pte:
3489		- /*
3490		- * If a huge pmd materialized under us just retry later. Use
3491		- * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
3492		- * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
3493		- * under us and then back to pmd_none, as a result of MADV_DONTNEED
3494		- * running immediately after a huge pmd fault in a different thread of
3495		- * this mm, in turn leading to a misleading pmd_trans_huge() retval.
3496		- * All we have to ensure is that it is a regular pmd that we can walk
3497		- * with pte_offset_map() and we can do that through an atomic read in
3498		- * C, which is what pmd_trans_unstable() provides.
3499		- */
3500		- if (pmd_devmap_trans_unstable(vmf->pmd))
3501		- return VM_FAULT_NOPAGE;
3502		-
3503		- /*
3504		- * At this point we know that our vmf->pmd points to a page of ptes
3505		- * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
3506		- * for the duration of the fault. If a racing MADV_DONTNEED runs and
3507		- * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
3508		- * be valid and we will re-check to make sure the vmf->pte isn't
3509		- * pte_none() under vmf->ptl protection when we return to
3510		- * alloc_set_pte().
3511		- */
3512		- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
3513		- &vmf->ptl);
3514		- return 0;
3515		-}
3516		-
3517		-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3518		-
3519		-#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
3520		-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
3521		- unsigned long haddr)
3522		-{
3523		- if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
3524		- (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
3525		- return false;
3526		- if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
3527		- return false;
3528		- return true;
3529		-}
3530		-
	4063	+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3531	4064	static void deposit_prealloc_pte(struct vm_fault *vmf)
3532	4065	{
3533	4066	struct vm_area_struct *vma = vmf->vma;
..	..	@@ -3541,27 +4074,28 @@
3541	4074	vmf->prealloc_pte = NULL;
3542	4075	}
3543	4076
3544		-static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
	4077	+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3545	4078	{
3546	4079	struct vm_area_struct *vma = vmf->vma;
3547	4080	bool write = vmf->flags & FAULT_FLAG_WRITE;
3548	4081	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
3549	4082	pmd_t entry;
3550	4083	int i;
3551		- vm_fault_t ret;
	4084	+ vm_fault_t ret = VM_FAULT_FALLBACK;
3552	4085
3553	4086	if (!transhuge_vma_suitable(vma, haddr))
3554		- return VM_FAULT_FALLBACK;
	4087	+ return ret;
3555	4088
3556		- ret = VM_FAULT_FALLBACK;
3557	4089	page = compound_head(page);
	4090	+ if (compound_order(page) != HPAGE_PMD_ORDER)
	4091	+ return ret;
3558	4092
3559	4093	/*
3560	4094	* Archs like ppc64 need additonal space to store information
3561	4095	* related to pte entry. Use the preallocated table for that.
3562	4096	*/
3563	4097	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
3564		- vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
	4098	+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
3565	4099	if (!vmf->prealloc_pte)
3566	4100	return VM_FAULT_OOM;
3567	4101	smp_wmb(); /* See comment in __pte_alloc() */
..	..	@@ -3574,7 +4108,7 @@
3574	4108	for (i = 0; i < HPAGE_PMD_NR; i++)
3575	4109	flush_icache_page(vma, page + i);
3576	4110
3577		- entry = mk_huge_pmd(page, vma->vm_page_prot);
	4111	+ entry = mk_huge_pmd(page, vmf->vma_page_prot);
3578	4112	if (write)
3579	4113	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
3580	4114
..	..	@@ -3598,77 +4132,40 @@
3598	4132	return ret;
3599	4133	}
3600	4134	#else
3601		-static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
	4135	+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
3602	4136	{
3603		- BUILD_BUG();
3604		- return 0;
	4137	+ return VM_FAULT_FALLBACK;
3605	4138	}
3606	4139	#endif
3607	4140
3608		-/**
3609		- * alloc_set_pte - setup new PTE entry for given page and add reverse page
3610		- * mapping. If needed, the fucntion allocates page table or use pre-allocated.
3611		- *
3612		- * @vmf: fault environment
3613		- * @memcg: memcg to charge page (only for private mappings)
3614		- * @page: page to map
3615		- *
3616		- * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
3617		- * return.
3618		- *
3619		- * Target users are page handler itself and implementations of
3620		- * vm_ops->map_pages.
3621		- */
3622		-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
3623		- struct page *page)
	4141	+void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
3624	4142	{
3625	4143	struct vm_area_struct *vma = vmf->vma;
3626	4144	bool write = vmf->flags & FAULT_FLAG_WRITE;
	4145	+ bool prefault = vmf->address != addr;
3627	4146	pte_t entry;
3628		- vm_fault_t ret;
3629		-
3630		- if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
3631		- IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
3632		- /* THP on COW? */
3633		- VM_BUG_ON_PAGE(memcg, page);
3634		-
3635		- ret = do_set_pmd(vmf, page);
3636		- if (ret != VM_FAULT_FALLBACK)
3637		- return ret;
3638		- }
3639		-
3640		- if (!vmf->pte) {
3641		- ret = pte_alloc_one_map(vmf);
3642		- if (ret)
3643		- return ret;
3644		- }
3645		-
3646		- /* Re-check under ptl */
3647		- if (unlikely(!pte_none(*vmf->pte)))
3648		- return VM_FAULT_NOPAGE;
3649	4147
3650	4148	flush_icache_page(vma, page);
3651		- entry = mk_pte(page, vma->vm_page_prot);
	4149	+ entry = mk_pte(page, vmf->vma_page_prot);
	4150	+
	4151	+ if (prefault && arch_wants_old_prefaulted_pte())
	4152	+ entry = pte_mkold(entry);
	4153	+ else
	4154	+ entry = pte_sw_mkyoung(entry);
	4155	+
3652	4156	if (write)
3653		- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	4157	+ entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
3654	4158	/* copy-on-write page */
3655		- if (write && !(vma->vm_flags & VM_SHARED)) {
	4159	+ if (write && !(vmf->vma_flags & VM_SHARED)) {
3656	4160	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3657		- page_add_new_anon_rmap(page, vma, vmf->address, false);
3658		- mem_cgroup_commit_charge(page, memcg, false, false);
3659		- lru_cache_add_active_or_unevictable(page, vma);
	4161	+ __page_add_new_anon_rmap(page, vma, addr, false);
	4162	+ __lru_cache_add_inactive_or_unevictable(page, vmf->vma_flags);
3660	4163	} else {
3661	4164	inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3662	4165	page_add_file_rmap(page, false);
3663	4166	}
3664		- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3665		-
3666		- /* no need to invalidate: a not-present page won't be cached */
3667		- update_mmu_cache(vma, vmf->address, vmf->pte);
3668		-
3669		- return 0;
	4167	+ set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
3670	4168	}
3671		-
3672	4169
3673	4170	/**
3674	4171	* finish_fault - finish page fault once we have prepared the page to fault
..	..	@@ -3678,20 +4175,22 @@
3678	4175	* This function handles all that is needed to finish a page fault once the
3679	4176	* page to fault in is prepared. It handles locking of PTEs, inserts PTE for
3680	4177	* given page, adds reverse page mapping, handles memcg charges and LRU
3681		- * addition. The function returns 0 on success, VM_FAULT_ code in case of
3682		- * error.
	4178	+ * addition.
3683	4179	*
3684	4180	* The function expects the page to be locked and on success it consumes a
3685	4181	* reference of a page being mapped (for the PTE which maps it).
	4182	+ *
	4183	+ * Return: %0 on success, %VM_FAULT_ code in case of error.
3686	4184	*/
3687	4185	vm_fault_t finish_fault(struct vm_fault *vmf)
3688	4186	{
	4187	+ struct vm_area_struct *vma = vmf->vma;
3689	4188	struct page *page;
3690		- vm_fault_t ret = 0;
	4189	+ vm_fault_t ret;
3691	4190
3692	4191	/* Did we COW the page? */
3693	4192	if ((vmf->flags & FAULT_FLAG_WRITE) &&
3694		- !(vmf->vma->vm_flags & VM_SHARED))
	4193	+ !(vmf->vma_flags & VM_SHARED))
3695	4194	page = vmf->cow_page;
3696	4195	else
3697	4196	page = vmf->page;
..	..	@@ -3700,12 +4199,56 @@
3700	4199	* check even for read faults because we might have lost our CoWed
3701	4200	* page
3702	4201	*/
3703		- if (!(vmf->vma->vm_flags & VM_SHARED))
3704		- ret = check_stable_address_space(vmf->vma->vm_mm);
3705		- if (!ret)
3706		- ret = alloc_set_pte(vmf, vmf->memcg, page);
3707		- if (vmf->pte)
3708		- pte_unmap_unlock(vmf->pte, vmf->ptl);
	4202	+ if (!(vma->vm_flags & VM_SHARED)) {
	4203	+ ret = check_stable_address_space(vma->vm_mm);
	4204	+ if (ret)
	4205	+ return ret;
	4206	+ }
	4207	+
	4208	+ /* Do not check unstable pmd, if it's changed will retry later */
	4209	+ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
	4210	+ goto skip_pmd_checks;
	4211	+
	4212	+ if (pmd_none(*vmf->pmd)) {
	4213	+ if (PageTransCompound(page)) {
	4214	+ ret = do_set_pmd(vmf, page);
	4215	+ if (ret != VM_FAULT_FALLBACK)
	4216	+ return ret;
	4217	+ }
	4218	+
	4219	+ if (vmf->prealloc_pte) {
	4220	+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	4221	+ if (likely(pmd_none(*vmf->pmd))) {
	4222	+ mm_inc_nr_ptes(vma->vm_mm);
	4223	+ pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
	4224	+ vmf->prealloc_pte = NULL;
	4225	+ }
	4226	+ spin_unlock(vmf->ptl);
	4227	+ } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
	4228	+ return VM_FAULT_OOM;
	4229	+ }
	4230	+ }
	4231	+
	4232	+ /*
	4233	+ * See comment in handle_pte_fault() for how this scenario happens, we
	4234	+ * need to return NOPAGE so that we drop this page.
	4235	+ */
	4236	+ if (pmd_devmap_trans_unstable(vmf->pmd))
	4237	+ return VM_FAULT_NOPAGE;
	4238	+
	4239	+skip_pmd_checks:
	4240	+ if (!pte_map_lock(vmf))
	4241	+ return VM_FAULT_RETRY;
	4242	+
	4243	+ ret = 0;
	4244	+ /* Re-check under ptl */
	4245	+ if (likely(pte_none(*vmf->pte)))
	4246	+ do_set_pte(vmf, page, vmf->address);
	4247	+ else
	4248	+ ret = VM_FAULT_NOPAGE;
	4249	+
	4250	+ update_mmu_tlb(vma, vmf->address, vmf->pte);
	4251	+ pte_unmap_unlock(vmf->pte, vmf->ptl);
3709	4252	return ret;
3710	4253	}
3711	4254
..	..	@@ -3738,12 +4281,8 @@
3738	4281
3739	4282	static int __init fault_around_debugfs(void)
3740	4283	{
3741		- void *ret;
3742		-
3743		- ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
3744		- &fault_around_bytes_fops);
3745		- if (!ret)
3746		- pr_warn("Failed to create fault_around_bytes in debugfs");
	4284	+ debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
	4285	+ &fault_around_bytes_fops);
3747	4286	return 0;
3748	4287	}
3749	4288	late_initcall(fault_around_debugfs);
..	..	@@ -3779,13 +4318,12 @@
3779	4318	pgoff_t start_pgoff = vmf->pgoff;
3780	4319	pgoff_t end_pgoff;
3781	4320	int off;
3782		- vm_fault_t ret = 0;
3783	4321
3784	4322	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3785	4323	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
3786	4324
3787		- vmf->address = max(address & mask, vmf->vma->vm_start);
3788		- off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	4325	+ address = max(address & mask, vmf->vma->vm_start);
	4326	+ off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3789	4327	start_pgoff -= off;
3790	4328
3791	4329	/*
..	..	@@ -3793,40 +4331,20 @@
3793	4331	* the vma or nr_pages from start_pgoff, depending what is nearest.
3794	4332	*/
3795	4333	end_pgoff = start_pgoff -
3796		- ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
	4334	+ ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3797	4335	PTRS_PER_PTE - 1;
3798	4336	end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3799	4337	start_pgoff + nr_pages - 1);
3800	4338
3801		- if (pmd_none(*vmf->pmd)) {
3802		- vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
3803		- vmf->address);
	4339	+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) &&
	4340	+ pmd_none(*vmf->pmd)) {
	4341	+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
3804	4342	if (!vmf->prealloc_pte)
3805		- goto out;
	4343	+ return VM_FAULT_OOM;
3806	4344	smp_wmb(); /* See comment in __pte_alloc() */
3807	4345	}
3808	4346
3809		- vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3810		-
3811		- /* Huge page is mapped? Page fault is solved */
3812		- if (pmd_trans_huge(*vmf->pmd)) {
3813		- ret = VM_FAULT_NOPAGE;
3814		- goto out;
3815		- }
3816		-
3817		- /* ->map_pages() haven't done anything useful. Cold page cache? */
3818		- if (!vmf->pte)
3819		- goto out;
3820		-
3821		- /* check if the page fault is solved */
3822		- vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3823		- if (!pte_none(*vmf->pte))
3824		- ret = VM_FAULT_NOPAGE;
3825		- pte_unmap_unlock(vmf->pte, vmf->ptl);
3826		-out:
3827		- vmf->address = address;
3828		- vmf->pte = NULL;
3829		- return ret;
	4347	+ return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3830	4348	}
3831	4349
3832	4350	static vm_fault_t do_read_fault(struct vm_fault *vmf)
..	..	@@ -3840,9 +4358,11 @@
3840	4358	* something).
3841	4359	*/
3842	4360	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3843		- ret = do_fault_around(vmf);
3844		- if (ret)
3845		- return ret;
	4361	+ if (likely(!userfaultfd_minor(vmf->vma))) {
	4362	+ ret = do_fault_around(vmf);
	4363	+ if (ret)
	4364	+ return ret;
	4365	+ }
3846	4366	}
3847	4367
3848	4368	ret = __do_fault(vmf);
..	..	@@ -3868,11 +4388,11 @@
3868	4388	if (!vmf->cow_page)
3869	4389	return VM_FAULT_OOM;
3870	4390
3871		- if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
3872		- &vmf->memcg, false)) {
	4391	+ if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
3873	4392	put_page(vmf->cow_page);
3874	4393	return VM_FAULT_OOM;
3875	4394	}
	4395	+ cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
3876	4396
3877	4397	ret = __do_fault(vmf);
3878	4398	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
..	..	@@ -3890,7 +4410,6 @@
3890	4410	goto uncharge_out;
3891	4411	return ret;
3892	4412	uncharge_out:
3893		- mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
3894	4413	put_page(vmf->cow_page);
3895	4414	return ret;
3896	4415	}
..	..	@@ -3926,16 +4445,16 @@
3926	4445	return ret;
3927	4446	}
3928	4447
3929		- fault_dirty_shared_page(vma, vmf->page);
	4448	+ ret \|= fault_dirty_shared_page(vmf);
3930	4449	return ret;
3931	4450	}
3932	4451
3933	4452	/*
3934		- * We enter with non-exclusive mmap_sem (to exclude vma changes,
	4453	+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
3935	4454	* but allow concurrent faults).
3936		- * The mmap_sem may have been released depending on flags and our
	4455	+ * The mmap_lock may have been released depending on flags and our
3937	4456	* return value. See filemap_fault() and __lock_page_or_retry().
3938		- * If mmap_sem is released, vma may become invalid (for example
	4457	+ * If mmap_lock is released, vma may become invalid (for example
3939	4458	* by other thread calling munmap()).
3940	4459	*/
3941	4460	static vm_fault_t do_fault(struct vm_fault *vmf)
..	..	@@ -3975,7 +4494,7 @@
3975	4494	}
3976	4495	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
3977	4496	ret = do_read_fault(vmf);
3978		- else if (!(vma->vm_flags & VM_SHARED))
	4497	+ else if (!(vmf->vma_flags & VM_SHARED))
3979	4498	ret = do_cow_fault(vmf);
3980	4499	else
3981	4500	ret = do_shared_fault(vmf);
..	..	@@ -4007,11 +4526,11 @@
4007	4526	{
4008	4527	struct vm_area_struct *vma = vmf->vma;
4009	4528	struct page *page = NULL;
4010		- int page_nid = -1;
	4529	+ int page_nid = NUMA_NO_NODE;
4011	4530	int last_cpupid;
4012	4531	int target_nid;
4013	4532	bool migrated = false;
4014		- pte_t pte;
	4533	+ pte_t pte, old_pte;
4015	4534	bool was_writable = pte_savedwrite(vmf->orig_pte);
4016	4535	int flags = 0;
4017	4536
..	..	@@ -4020,8 +4539,8 @@
4020	4539	* validation through pte_unmap_same(). It's of NUMA type but
4021	4540	* the pfn may be screwed if the read is non atomic.
4022	4541	*/
4023		- vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
4024		- spin_lock(vmf->ptl);
	4542	+ if (!pte_spinlock(vmf))
	4543	+ return VM_FAULT_RETRY;
4025	4544	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
4026	4545	pte_unmap_unlock(vmf->pte, vmf->ptl);
4027	4546	goto out;
..	..	@@ -4031,15 +4550,15 @@
4031	4550	* Make it present again, Depending on how arch implementes non
4032	4551	* accessible ptes, some can allow access by kernel mode.
4033	4552	*/
4034		- pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
4035		- pte = pte_modify(pte, vma->vm_page_prot);
	4553	+ old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
	4554	+ pte = pte_modify(old_pte, vmf->vma_page_prot);
4036	4555	pte = pte_mkyoung(pte);
4037	4556	if (was_writable)
4038	4557	pte = pte_mkwrite(pte);
4039		- ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
	4558	+ ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
4040	4559	update_mmu_cache(vma, vmf->address, vmf->pte);
4041	4560
4042		- page = vm_normal_page(vma, vmf->address, pte);
	4561	+ page = _vm_normal_page(vma, vmf->address, pte, vmf->vma_flags);
4043	4562	if (!page) {
4044	4563	pte_unmap_unlock(vmf->pte, vmf->ptl);
4045	4564	return 0;
..	..	@@ -4066,7 +4585,7 @@
4066	4585	* Flag if the page is shared between multiple address spaces. This
4067	4586	* is later used when determining whether to group tasks together
4068	4587	*/
4069		- if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
	4588	+ if (page_mapcount(page) > 1 && (vmf->vma_flags & VM_SHARED))
4070	4589	flags \|= TNF_SHARED;
4071	4590
4072	4591	last_cpupid = page_cpupid_last(page);
..	..	@@ -4074,13 +4593,13 @@
4074	4593	target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
4075	4594	&flags);
4076	4595	pte_unmap_unlock(vmf->pte, vmf->ptl);
4077		- if (target_nid == -1) {
	4596	+ if (target_nid == NUMA_NO_NODE) {
4078	4597	put_page(page);
4079	4598	goto out;
4080	4599	}
4081	4600
4082	4601	/* Migrate to the requested node */
4083		- migrated = migrate_misplaced_page(page, vma, target_nid);
	4602	+ migrated = migrate_misplaced_page(page, vmf, target_nid);
4084	4603	if (migrated) {
4085	4604	page_nid = target_nid;
4086	4605	flags \|= TNF_MIGRATED;
..	..	@@ -4088,7 +4607,7 @@
4088	4607	flags \|= TNF_MIGRATE_FAIL;
4089	4608
4090	4609	out:
4091		- if (page_nid != -1)
	4610	+ if (page_nid != NUMA_NO_NODE)
4092	4611	task_numa_fault(last_cpupid, page_nid, 1, flags);
4093	4612	return 0;
4094	4613	}
..	..	@@ -4105,26 +4624,28 @@
4105	4624	/* `inline' is required to avoid gcc 4.1.2 build error */
4106	4625	static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
4107	4626	{
4108		- if (vma_is_anonymous(vmf->vma))
	4627	+ if (vma_is_anonymous(vmf->vma)) {
	4628	+ if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
	4629	+ return handle_userfault(vmf, VM_UFFD_WP);
4109	4630	return do_huge_pmd_wp_page(vmf, orig_pmd);
4110		- if (vmf->vma->vm_ops->huge_fault)
4111		- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
	4631	+ }
	4632	+ if (vmf->vma->vm_ops->huge_fault) {
	4633	+ vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
4112	4634
4113		- /* COW handled on pte level: split pmd */
4114		- VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
	4635	+ if (!(ret & VM_FAULT_FALLBACK))
	4636	+ return ret;
	4637	+ }
	4638	+
	4639	+ /* COW or write-notify handled on pte level: split pmd. */
4115	4640	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
4116	4641
4117	4642	return VM_FAULT_FALLBACK;
4118	4643	}
4119	4644
4120		-static inline bool vma_is_accessible(struct vm_area_struct *vma)
4121		-{
4122		- return vma->vm_flags & (VM_READ \| VM_EXEC \| VM_WRITE);
4123		-}
4124		-
4125	4645	static vm_fault_t create_huge_pud(struct vm_fault *vmf)
4126	4646	{
4127		-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	4647	+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
	4648	+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
4128	4649	/* No support for anonymous transparent PUD pages yet */
4129	4650	if (vma_is_anonymous(vmf->vma))
4130	4651	return VM_FAULT_FALLBACK;
..	..	@@ -4136,13 +4657,21 @@
4136	4657
4137	4658	static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
4138	4659	{
4139		-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	4660	+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
	4661	+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
4140	4662	/* No support for anonymous transparent PUD pages yet */
4141	4663	if (vma_is_anonymous(vmf->vma))
4142		- return VM_FAULT_FALLBACK;
4143		- if (vmf->vma->vm_ops->huge_fault)
4144		- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
4145		-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
	4664	+ goto split;
	4665	+ if (vmf->vma->vm_ops->huge_fault) {
	4666	+ vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
	4667	+
	4668	+ if (!(ret & VM_FAULT_FALLBACK))
	4669	+ return ret;
	4670	+ }
	4671	+split:
	4672	+ /* COW or write-notify not handled on PUD level: split pud.*/
	4673	+ __split_huge_pud(vmf->vma, vmf->pud, vmf->address);
	4674	+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
4146	4675	return VM_FAULT_FALLBACK;
4147	4676	}
4148	4677
..	..	@@ -4155,15 +4684,20 @@
4155	4684	* with external mmu caches can use to update those (ie the Sparc or
4156	4685	* PowerPC hashed page tables that act as extended TLBs).
4157	4686	*
4158		- * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
	4687	+ * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
4159	4688	* concurrent faults).
4160	4689	*
4161		- * The mmap_sem may have been released depending on flags and our return value.
	4690	+ * The mmap_lock may have been released depending on flags and our return value.
4162	4691	* See filemap_fault() and __lock_page_or_retry().
4163	4692	*/
4164	4693	static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
4165	4694	{
4166	4695	pte_t entry;
	4696	+ vm_fault_t ret = 0;
	4697	+
	4698	+ /* Do not check unstable pmd, if it's changed will retry later */
	4699	+ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
	4700	+ goto skip_pmd_checks;
4167	4701
4168	4702	if (unlikely(pmd_none(*vmf->pmd))) {
4169	4703	/*
..	..	@@ -4174,14 +4708,28 @@
4174	4708	*/
4175	4709	vmf->pte = NULL;
4176	4710	} else {
4177		- /* See comment in pte_alloc_one_map() */
	4711	+ /*
	4712	+ * If a huge pmd materialized under us just retry later. Use
	4713	+ * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
	4714	+ * of pmd_trans_huge() to ensure the pmd didn't become
	4715	+ * pmd_trans_huge under us and then back to pmd_none, as a
	4716	+ * result of MADV_DONTNEED running immediately after a huge pmd
	4717	+ * fault in a different thread of this mm, in turn leading to a
	4718	+ * misleading pmd_trans_huge() retval. All we have to ensure is
	4719	+ * that it is a regular pmd that we can walk with
	4720	+ * pte_offset_map() and we can do that through an atomic read
	4721	+ * in C, which is what pmd_trans_unstable() provides.
	4722	+ */
4178	4723	if (pmd_devmap_trans_unstable(vmf->pmd))
4179	4724	return 0;
4180	4725	/*
4181	4726	* A regular pmd is established and it can't morph into a huge
4182	4727	* pmd from under us anymore at this point because we hold the
4183		- * mmap_sem read mode and khugepaged takes it in write mode.
	4728	+ * mmap_lock read mode and khugepaged takes it in write mode.
4184	4729	* So now it's safe to run pte_offset_map().
	4730	+ * This is not applicable to the speculative page fault handler
	4731	+ * but in that case, the pte is fetched earlier in
	4732	+ * handle_speculative_fault().
4185	4733	*/
4186	4734	vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
4187	4735	vmf->orig_pte = *vmf->pte;
..	..	@@ -4201,9 +4749,13 @@
4201	4749	}
4202	4750	}
4203	4751
	4752	+skip_pmd_checks:
4204	4753	if (!vmf->pte) {
4205	4754	if (vma_is_anonymous(vmf->vma))
4206	4755	return do_anonymous_page(vmf);
	4756	+ else if ((vmf->flags & FAULT_FLAG_SPECULATIVE) &&
	4757	+ !vmf_allows_speculation(vmf))
	4758	+ return VM_FAULT_RETRY;
4207	4759	else
4208	4760	return do_fault(vmf);
4209	4761	}
..	..	@@ -4214,14 +4766,27 @@
4214	4766	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
4215	4767	return do_numa_page(vmf);
4216	4768
4217		- vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
4218		- spin_lock(vmf->ptl);
	4769	+ if (!pte_spinlock(vmf))
	4770	+ return VM_FAULT_RETRY;
4219	4771	entry = vmf->orig_pte;
4220		- if (unlikely(!pte_same(*vmf->pte, entry)))
	4772	+ if (unlikely(!pte_same(*vmf->pte, entry))) {
	4773	+ update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
4221	4774	goto unlock;
	4775	+ }
4222	4776	if (vmf->flags & FAULT_FLAG_WRITE) {
4223		- if (!pte_write(entry))
4224		- return do_wp_page(vmf);
	4777	+ if (!pte_write(entry)) {
	4778	+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE))
	4779	+ return do_wp_page(vmf);
	4780	+
	4781	+ if (!mmu_notifier_trylock(vmf->vma->vm_mm)) {
	4782	+ ret = VM_FAULT_RETRY;
	4783	+ goto unlock;
	4784	+ }
	4785	+
	4786	+ ret = do_wp_page(vmf);
	4787	+ mmu_notifier_unlock(vmf->vma->vm_mm);
	4788	+ return ret;
	4789	+ }
4225	4790	entry = pte_mkdirty(entry);
4226	4791	}
4227	4792	entry = pte_mkyoung(entry);
..	..	@@ -4229,6 +4794,11 @@
4229	4794	vmf->flags & FAULT_FLAG_WRITE)) {
4230	4795	update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
4231	4796	} else {
	4797	+ /* Skip spurious TLB flush for retried page fault */
	4798	+ if (vmf->flags & FAULT_FLAG_TRIED)
	4799	+ goto unlock;
	4800	+ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
	4801	+ ret = VM_FAULT_RETRY;
4232	4802	/*
4233	4803	* This is needed only for protection faults but the arch code
4234	4804	* is not yet telling us if this is a protection fault or not.
..	..	@@ -4238,15 +4808,17 @@
4238	4808	if (vmf->flags & FAULT_FLAG_WRITE)
4239	4809	flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
4240	4810	}
	4811	+ trace_android_rvh_handle_pte_fault_end(vmf, highest_memmap_pfn);
	4812	+ trace_android_vh_handle_pte_fault_end(vmf, highest_memmap_pfn);
4241	4813	unlock:
4242	4814	pte_unmap_unlock(vmf->pte, vmf->ptl);
4243		- return 0;
	4815	+ return ret;
4244	4816	}
4245	4817
4246	4818	/*
4247	4819	* By the time we get here, we already hold the mm semaphore
4248	4820	*
4249		- * The mmap_sem may have been released depending on flags and our
	4821	+ * The mmap_lock may have been released depending on flags and our
4250	4822	* return value. See filemap_fault() and __lock_page_or_retry().
4251	4823	*/
4252	4824	static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
..	..	@@ -4258,6 +4830,8 @@
4258	4830	.flags = flags,
4259	4831	.pgoff = linear_page_index(vma, address),
4260	4832	.gfp_mask = __get_fault_gfp_mask(vma),
	4833	+ .vma_flags = vma->vm_flags,
	4834	+ .vma_page_prot = vma->vm_page_prot,
4261	4835	};
4262	4836	unsigned int dirty = flags & FAULT_FLAG_WRITE;
4263	4837	struct mm_struct *mm = vma->vm_mm;
..	..	@@ -4273,6 +4847,7 @@
4273	4847	vmf.pud = pud_alloc(mm, p4d, address);
4274	4848	if (!vmf.pud)
4275	4849	return VM_FAULT_OOM;
	4850	+retry_pud:
4276	4851	if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
4277	4852	ret = create_huge_pud(&vmf);
4278	4853	if (!(ret & VM_FAULT_FALLBACK))
..	..	@@ -4299,6 +4874,14 @@
4299	4874	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
4300	4875	if (!vmf.pmd)
4301	4876	return VM_FAULT_OOM;
	4877	+
	4878	+ /* Huge pud page fault raced with pmd_alloc? */
	4879	+ if (pud_trans_unstable(vmf.pud))
	4880	+ goto retry_pud;
	4881	+
	4882	+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
	4883	+ vmf.sequence = raw_read_seqcount(&vma->vm_sequence);
	4884	+#endif
4302	4885	if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
4303	4886	ret = create_huge_pmd(&vmf);
4304	4887	if (!(ret & VM_FAULT_FALLBACK))
..	..	@@ -4332,14 +4915,342 @@
4332	4915	return handle_pte_fault(&vmf);
4333	4916	}
4334	4917
	4918	+/**
	4919	+ * mm_account_fault - Do page fault accountings
	4920	+ *
	4921	+ * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
	4922	+ * of perf event counters, but we'll still do the per-task accounting to
	4923	+ * the task who triggered this page fault.
	4924	+ * @address: the faulted address.
	4925	+ * @flags: the fault flags.
	4926	+ * @ret: the fault retcode.
	4927	+ *
	4928	+ * This will take care of most of the page fault accountings. Meanwhile, it
	4929	+ * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ\|MIN] perf counter
	4930	+ * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
	4931	+ * still be in per-arch page fault handlers at the entry of page fault.
	4932	+ */
	4933	+static inline void mm_account_fault(struct pt_regs *regs,
	4934	+ unsigned long address, unsigned int flags,
	4935	+ vm_fault_t ret)
	4936	+{
	4937	+ bool major;
	4938	+
	4939	+ /*
	4940	+ * We don't do accounting for some specific faults:
	4941	+ *
	4942	+ * - Unsuccessful faults (e.g. when the address wasn't valid). That
	4943	+ * includes arch_vma_access_permitted() failing before reaching here.
	4944	+ * So this is not a "this many hardware page faults" counter. We
	4945	+ * should use the hw profiling for that.
	4946	+ *
	4947	+ * - Incomplete faults (VM_FAULT_RETRY). They will only be counted
	4948	+ * once they're completed.
	4949	+ */
	4950	+ if (ret & (VM_FAULT_ERROR \| VM_FAULT_RETRY))
	4951	+ return;
	4952	+
	4953	+ /*
	4954	+ * We define the fault as a major fault when the final successful fault
	4955	+ * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
	4956	+ * handle it immediately previously).
	4957	+ */
	4958	+ major = (ret & VM_FAULT_MAJOR) \|\| (flags & FAULT_FLAG_TRIED);
	4959	+
	4960	+ if (major)
	4961	+ current->maj_flt++;
	4962	+ else
	4963	+ current->min_flt++;
	4964	+
	4965	+ /*
	4966	+ * If the fault is done for GUP, regs will be NULL. We only do the
	4967	+ * accounting for the per thread fault counters who triggered the
	4968	+ * fault, and we skip the perf event updates.
	4969	+ */
	4970	+ if (!regs)
	4971	+ return;
	4972	+
	4973	+ if (major)
	4974	+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
	4975	+ else
	4976	+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
	4977	+}
	4978	+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
	4979	+
	4980	+#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
	4981	+/* This is required by vm_normal_page() */
	4982	+#error "Speculative page fault handler requires CONFIG_ARCH_HAS_PTE_SPECIAL"
	4983	+#endif
	4984	+/*
	4985	+ * vm_normal_page() adds some processing which should be done while
	4986	+ * holding the mmap_sem.
	4987	+ */
	4988	+
	4989	+/*
	4990	+ * Tries to handle the page fault in a speculative way, without grabbing the
	4991	+ * mmap_sem.
	4992	+ * When VM_FAULT_RETRY is returned, the vma pointer is valid and this vma must
	4993	+ * be checked later when the mmap_sem has been grabbed by calling
	4994	+ * can_reuse_spf_vma().
	4995	+ * This is needed as the returned vma is kept in memory until the call to
	4996	+ * can_reuse_spf_vma() is made.
	4997	+ */
	4998	+static vm_fault_t ___handle_speculative_fault(struct mm_struct *mm,
	4999	+ unsigned long address, unsigned int flags,
	5000	+ struct vm_area_struct *vma)
	5001	+{
	5002	+ struct vm_fault vmf = {
	5003	+ .address = address,
	5004	+ .pgoff = linear_page_index(vma, address),
	5005	+ .vma = vma,
	5006	+ .gfp_mask = __get_fault_gfp_mask(vma),
	5007	+ .flags = flags,
	5008	+ };
	5009	+#ifdef CONFIG_NUMA
	5010	+ struct mempolicy *pol;
	5011	+#endif
	5012	+ pgd_t *pgd, pgdval;
	5013	+ p4d_t *p4d, p4dval;
	5014	+ pud_t pudval;
	5015	+ int seq;
	5016	+ vm_fault_t ret;
	5017	+
	5018	+ /* Clear flags that may lead to release the mmap_sem to retry */
	5019	+ flags &= ~(FAULT_FLAG_ALLOW_RETRY\|FAULT_FLAG_KILLABLE);
	5020	+ flags \|= FAULT_FLAG_SPECULATIVE;
	5021	+
	5022	+ /* rmb <-> seqlock,vma_rb_erase() */
	5023	+ seq = raw_read_seqcount(&vmf.vma->vm_sequence);
	5024	+ if (seq & 1) {
	5025	+ trace_spf_vma_changed(_RET_IP_, vmf.vma, address);
	5026	+ return VM_FAULT_RETRY;
	5027	+ }
	5028	+
	5029	+ if (!vmf_allows_speculation(&vmf))
	5030	+ return VM_FAULT_RETRY;
	5031	+
	5032	+ vmf.vma_flags = READ_ONCE(vmf.vma->vm_flags);
	5033	+ vmf.vma_page_prot = READ_ONCE(vmf.vma->vm_page_prot);
	5034	+
	5035	+#ifdef CONFIG_USERFAULTFD
	5036	+ /* Can't call userland page fault handler in the speculative path */
	5037	+ if (unlikely(vmf.vma_flags & __VM_UFFD_FLAGS)) {
	5038	+ trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
	5039	+ return VM_FAULT_RETRY;
	5040	+ }
	5041	+#endif
	5042	+
	5043	+ if (vmf.vma_flags & VM_GROWSDOWN \|\| vmf.vma_flags & VM_GROWSUP) {
	5044	+ /*
	5045	+ * This could be detected by the check address against VMA's
	5046	+ * boundaries but we want to trace it as not supported instead
	5047	+ * of changed.
	5048	+ */
	5049	+ trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
	5050	+ return VM_FAULT_RETRY;
	5051	+ }
	5052	+
	5053	+ if (address < READ_ONCE(vmf.vma->vm_start)
	5054	+ \|\| READ_ONCE(vmf.vma->vm_end) <= address) {
	5055	+ trace_spf_vma_changed(_RET_IP_, vmf.vma, address);
	5056	+ return VM_FAULT_RETRY;
	5057	+ }
	5058	+
	5059	+ if (!arch_vma_access_permitted(vmf.vma, flags & FAULT_FLAG_WRITE,
	5060	+ flags & FAULT_FLAG_INSTRUCTION,
	5061	+ flags & FAULT_FLAG_REMOTE))
	5062	+ goto out_segv;
	5063	+
	5064	+ /* This one is required to check that the VMA has write access set */
	5065	+ if (flags & FAULT_FLAG_WRITE) {
	5066	+ if (unlikely(!(vmf.vma_flags & VM_WRITE)))
	5067	+ goto out_segv;
	5068	+ } else if (unlikely(!(vmf.vma_flags & (VM_READ\|VM_EXEC\|VM_WRITE))))
	5069	+ goto out_segv;
	5070	+
	5071	+#ifdef CONFIG_NUMA
	5072	+ /*
	5073	+ * MPOL_INTERLEAVE implies additional checks in
	5074	+ * mpol_misplaced() which are not compatible with the
	5075	+ *speculative page fault processing.
	5076	+ */
	5077	+ pol = __get_vma_policy(vmf.vma, address);
	5078	+ if (!pol)
	5079	+ pol = get_task_policy(current);
	5080	+ if (pol && pol->mode == MPOL_INTERLEAVE) {
	5081	+ trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
	5082	+ return VM_FAULT_RETRY;
	5083	+ }
	5084	+#endif
	5085	+
	5086	+ /*
	5087	+ * Do a speculative lookup of the PTE entry.
	5088	+ */
	5089	+ local_irq_disable();
	5090	+ pgd = pgd_offset(mm, address);
	5091	+ pgdval = READ_ONCE(*pgd);
	5092	+ if (pgd_none(pgdval) \|\| unlikely(pgd_bad(pgdval)))
	5093	+ goto out_walk;
	5094	+
	5095	+ p4d = p4d_offset(pgd, address);
	5096	+ if (pgd_val(READ_ONCE(*pgd)) != pgd_val(pgdval))
	5097	+ goto out_walk;
	5098	+ p4dval = READ_ONCE(*p4d);
	5099	+ if (p4d_none(p4dval) \|\| unlikely(p4d_bad(p4dval)))
	5100	+ goto out_walk;
	5101	+
	5102	+ vmf.pud = pud_offset(p4d, address);
	5103	+ if (p4d_val(READ_ONCE(*p4d)) != p4d_val(p4dval))
	5104	+ goto out_walk;
	5105	+ pudval = READ_ONCE(*vmf.pud);
	5106	+ if (pud_none(pudval) \|\| unlikely(pud_bad(pudval)))
	5107	+ goto out_walk;
	5108	+
	5109	+ /* Huge pages at PUD level are not supported. */
	5110	+ if (unlikely(pud_trans_huge(pudval)))
	5111	+ goto out_walk;
	5112	+
	5113	+ vmf.pmd = pmd_offset(vmf.pud, address);
	5114	+ if (pud_val(READ_ONCE(*vmf.pud)) != pud_val(pudval))
	5115	+ goto out_walk;
	5116	+ vmf.orig_pmd = READ_ONCE(*vmf.pmd);
	5117	+ /*
	5118	+ * pmd_none could mean that a hugepage collapse is in progress
	5119	+ * in our back as collapse_huge_page() mark it before
	5120	+ * invalidating the pte (which is done once the IPI is caught
	5121	+ * by all CPU and we have interrupt disabled).
	5122	+ * For this reason we cannot handle THP in a speculative way since we
	5123	+ * can't safely indentify an in progress collapse operation done in our
	5124	+ * back on that PMD.
	5125	+ * Regarding the order of the following checks, see comment in
	5126	+ * pmd_devmap_trans_unstable()
	5127	+ */
	5128	+ if (unlikely(pmd_devmap(vmf.orig_pmd) \|\|
	5129	+ pmd_none(vmf.orig_pmd) \|\| pmd_trans_huge(vmf.orig_pmd) \|\|
	5130	+ is_swap_pmd(vmf.orig_pmd)))
	5131	+ goto out_walk;
	5132	+
	5133	+ /*
	5134	+ * The above does not allocate/instantiate page-tables because doing so
	5135	+ * would lead to the possibility of instantiating page-tables after
	5136	+ * free_pgtables() -- and consequently leaking them.
	5137	+ *
	5138	+ * The result is that we take at least one !speculative fault per PMD
	5139	+ * in order to instantiate it.
	5140	+ */
	5141	+
	5142	+ vmf.pte = pte_offset_map(vmf.pmd, address);
	5143	+ if (pmd_val(READ_ONCE(*vmf.pmd)) != pmd_val(vmf.orig_pmd)) {
	5144	+ pte_unmap(vmf.pte);
	5145	+ vmf.pte = NULL;
	5146	+ goto out_walk;
	5147	+ }
	5148	+ vmf.orig_pte = READ_ONCE(*vmf.pte);
	5149	+ barrier(); /* See comment in handle_pte_fault() */
	5150	+ if (pte_none(vmf.orig_pte)) {
	5151	+ pte_unmap(vmf.pte);
	5152	+ vmf.pte = NULL;
	5153	+ }
	5154	+
	5155	+ vmf.sequence = seq;
	5156	+ vmf.flags = flags;
	5157	+
	5158	+ local_irq_enable();
	5159	+
	5160	+ /*
	5161	+ * We need to re-validate the VMA after checking the bounds, otherwise
	5162	+ * we might have a false positive on the bounds.
	5163	+ */
	5164	+ if (read_seqcount_retry(&vmf.vma->vm_sequence, seq)) {
	5165	+ trace_spf_vma_changed(_RET_IP_, vmf.vma, address);
	5166	+ return VM_FAULT_RETRY;
	5167	+ }
	5168	+
	5169	+ mem_cgroup_enter_user_fault();
	5170	+ ret = handle_pte_fault(&vmf);
	5171	+ mem_cgroup_exit_user_fault();
	5172	+
	5173	+ if (ret != VM_FAULT_RETRY) {
	5174	+ if (vma_is_anonymous(vmf.vma))
	5175	+ count_vm_event(SPECULATIVE_PGFAULT_ANON);
	5176	+ else
	5177	+ count_vm_event(SPECULATIVE_PGFAULT_FILE);
	5178	+ }
	5179	+
	5180	+ /*
	5181	+ * The task may have entered a memcg OOM situation but
	5182	+ * if the allocation error was handled gracefully (no
	5183	+ * VM_FAULT_OOM), there is no need to kill anything.
	5184	+ * Just clean up the OOM state peacefully.
	5185	+ */
	5186	+ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
	5187	+ mem_cgroup_oom_synchronize(false);
	5188	+ return ret;
	5189	+
	5190	+out_walk:
	5191	+ trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
	5192	+ local_irq_enable();
	5193	+ return VM_FAULT_RETRY;
	5194	+
	5195	+out_segv:
	5196	+ trace_spf_vma_access(_RET_IP_, vmf.vma, address);
	5197	+ return VM_FAULT_SIGSEGV;
	5198	+}
	5199	+
	5200	+vm_fault_t __handle_speculative_fault(struct mm_struct *mm,
	5201	+ unsigned long address, unsigned int flags,
	5202	+ struct vm_area_struct **vma,
	5203	+ struct pt_regs *regs)
	5204	+{
	5205	+ vm_fault_t ret;
	5206	+
	5207	+ check_sync_rss_stat(current);
	5208	+
	5209	+ *vma = get_vma(mm, address);
	5210	+ if (!*vma)
	5211	+ return VM_FAULT_RETRY;
	5212	+
	5213	+ ret = ___handle_speculative_fault(mm, address, flags, *vma);
	5214	+
	5215	+ /*
	5216	+ * If there is no need to retry, don't return the vma to the caller.
	5217	+ */
	5218	+ if (ret != VM_FAULT_RETRY) {
	5219	+ put_vma(*vma);
	5220	+ *vma = NULL;
	5221	+ mm_account_fault(regs, address, flags, ret);
	5222	+ }
	5223	+
	5224	+ return ret;
	5225	+}
	5226	+
	5227	+/*
	5228	+ * This is used to know if the vma fetch in the speculative page fault handler
	5229	+ * is still valid when trying the regular fault path while holding the
	5230	+ * mmap_sem.
	5231	+ * The call to put_vma(vma) must be made after checking the vma's fields, as
	5232	+ * the vma may be freed by put_vma(). In such a case it is expected that false
	5233	+ * is returned.
	5234	+ */
	5235	+bool can_reuse_spf_vma(struct vm_area_struct *vma, unsigned long address)
	5236	+{
	5237	+ bool ret;
	5238	+
	5239	+ ret = !RB_EMPTY_NODE(&vma->vm_rb) &&
	5240	+ vma->vm_start <= address && address < vma->vm_end;
	5241	+ put_vma(vma);
	5242	+ return ret;
	5243	+}
	5244	+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
	5245	+
4335	5246	/*
4336	5247	* By the time we get here, we already hold the mm semaphore
4337	5248	*
4338		- * The mmap_sem may have been released depending on flags and our
	5249	+ * The mmap_lock may have been released depending on flags and our
4339	5250	* return value. See filemap_fault() and __lock_page_or_retry().
4340	5251	*/
4341	5252	vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
4342		- unsigned int flags)
	5253	+ unsigned int flags, struct pt_regs *regs)
4343	5254	{
4344	5255	vm_fault_t ret;
4345	5256
..	..	@@ -4379,6 +5290,8 @@
4379	5290	if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
4380	5291	mem_cgroup_oom_synchronize(false);
4381	5292	}
	5293	+
	5294	+ mm_account_fault(regs, address, flags, ret);
4382	5295
4383	5296	return ret;
4384	5297	}
..	..	@@ -4421,19 +5334,11 @@
4421	5334	smp_wmb(); /* See comment in __pte_alloc */
4422	5335
4423	5336	spin_lock(&mm->page_table_lock);
4424		-#ifndef __ARCH_HAS_5LEVEL_HACK
4425	5337	if (!p4d_present(*p4d)) {
4426	5338	mm_inc_nr_puds(mm);
4427	5339	p4d_populate(mm, p4d, new);
4428	5340	} else /* Another has populated it */
4429	5341	pud_free(mm, new);
4430		-#else
4431		- if (!pgd_present(*p4d)) {
4432		- mm_inc_nr_puds(mm);
4433		- pgd_populate(mm, p4d, new);
4434		- } else /* Another has populated it */
4435		- pud_free(mm, new);
4436		-#endif /* __ARCH_HAS_5LEVEL_HACK */
4437	5342	spin_unlock(&mm->page_table_lock);
4438	5343	return 0;
4439	5344	}
..	..	@@ -4454,27 +5359,19 @@
4454	5359	smp_wmb(); /* See comment in __pte_alloc */
4455	5360
4456	5361	ptl = pud_lock(mm, pud);
4457		-#ifndef __ARCH_HAS_4LEVEL_HACK
4458	5362	if (!pud_present(*pud)) {
4459	5363	mm_inc_nr_pmds(mm);
4460	5364	pud_populate(mm, pud, new);
4461	5365	} else /* Another has populated it */
4462	5366	pmd_free(mm, new);
4463		-#else
4464		- if (!pgd_present(*pud)) {
4465		- mm_inc_nr_pmds(mm);
4466		- pgd_populate(mm, pud, new);
4467		- } else /* Another has populated it */
4468		- pmd_free(mm, new);
4469		-#endif /* __ARCH_HAS_4LEVEL_HACK */
4470	5367	spin_unlock(ptl);
4471	5368	return 0;
4472	5369	}
4473	5370	#endif /* __PAGETABLE_PMD_FOLDED */
4474	5371
4475		-static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4476		- unsigned long *start, unsigned long *end,
4477		- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
	5372	+int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
	5373	+ struct mmu_notifier_range *range, pte_t **ptepp,
	5374	+ pmd_t **pmdpp, spinlock_t **ptlp)
4478	5375	{
4479	5376	pgd_t *pgd;
4480	5377	p4d_t *p4d;
..	..	@@ -4501,10 +5398,11 @@
4501	5398	if (!pmdpp)
4502	5399	goto out;
4503	5400
4504		- if (start && end) {
4505		- *start = address & PMD_MASK;
4506		- *end = *start + PMD_SIZE;
4507		- mmu_notifier_invalidate_range_start(mm, *start, *end);
	5401	+ if (range) {
	5402	+ mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
	5403	+ NULL, mm, address & PMD_MASK,
	5404	+ (address & PMD_MASK) + PMD_SIZE);
	5405	+ mmu_notifier_invalidate_range_start(range);
4508	5406	}
4509	5407	*ptlp = pmd_lock(mm, pmd);
4510	5408	if (pmd_huge(*pmd)) {
..	..	@@ -4512,17 +5410,18 @@
4512	5410	return 0;
4513	5411	}
4514	5412	spin_unlock(*ptlp);
4515		- if (start && end)
4516		- mmu_notifier_invalidate_range_end(mm, *start, *end);
	5413	+ if (range)
	5414	+ mmu_notifier_invalidate_range_end(range);
4517	5415	}
4518	5416
4519	5417	if (pmd_none(*pmd) \|\| unlikely(pmd_bad(*pmd)))
4520	5418	goto out;
4521	5419
4522		- if (start && end) {
4523		- *start = address & PAGE_MASK;
4524		- *end = *start + PAGE_SIZE;
4525		- mmu_notifier_invalidate_range_start(mm, *start, *end);
	5420	+ if (range) {
	5421	+ mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
	5422	+ address & PAGE_MASK,
	5423	+ (address & PAGE_MASK) + PAGE_SIZE);
	5424	+ mmu_notifier_invalidate_range_start(range);
4526	5425	}
4527	5426	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
4528	5427	if (!pte_present(*ptep))
..	..	@@ -4531,37 +5430,39 @@
4531	5430	return 0;
4532	5431	unlock:
4533	5432	pte_unmap_unlock(ptep, *ptlp);
4534		- if (start && end)
4535		- mmu_notifier_invalidate_range_end(mm, *start, *end);
	5433	+ if (range)
	5434	+ mmu_notifier_invalidate_range_end(range);
4536	5435	out:
4537	5436	return -EINVAL;
4538	5437	}
4539	5438
4540		-static inline int follow_pte(struct mm_struct *mm, unsigned long address,
4541		- pte_t **ptepp, spinlock_t **ptlp)
	5439	+/**
	5440	+ * follow_pte - look up PTE at a user virtual address
	5441	+ * @mm: the mm_struct of the target address space
	5442	+ * @address: user virtual address
	5443	+ * @ptepp: location to store found PTE
	5444	+ * @ptlp: location to store the lock for the PTE
	5445	+ *
	5446	+ * On a successful return, the pointer to the PTE is stored in @ptepp;
	5447	+ * the corresponding lock is taken and its location is stored in @ptlp.
	5448	+ * The contents of the PTE are only stable until @ptlp is released;
	5449	+ * any further use, if any, must be protected against invalidation
	5450	+ * with MMU notifiers.
	5451	+ *
	5452	+ * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
	5453	+ * should be taken for read.
	5454	+ *
	5455	+ * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
	5456	+ * it is not a good general-purpose API.
	5457	+ *
	5458	+ * Return: zero on success, -ve otherwise.
	5459	+ */
	5460	+int follow_pte(struct mm_struct *mm, unsigned long address,
	5461	+ pte_t **ptepp, spinlock_t **ptlp)
4542	5462	{
4543		- int res;
4544		-
4545		- /* (void) is needed to make gcc happy */
4546		- (void) __cond_lock(*ptlp,
4547		- !(res = __follow_pte_pmd(mm, address, NULL, NULL,
4548		- ptepp, NULL, ptlp)));
4549		- return res;
	5463	+ return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
4550	5464	}
4551		-
4552		-int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
4553		- unsigned long *start, unsigned long *end,
4554		- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
4555		-{
4556		- int res;
4557		-
4558		- /* (void) is needed to make gcc happy */
4559		- (void) __cond_lock(*ptlp,
4560		- !(res = __follow_pte_pmd(mm, address, start, end,
4561		- ptepp, pmdpp, ptlp)));
4562		- return res;
4563		-}
4564		-EXPORT_SYMBOL(follow_pte_pmd);
	5465	+EXPORT_SYMBOL_GPL(follow_pte);
4565	5466
4566	5467	/**
4567	5468	* follow_pfn - look up PFN at a user virtual address
..	..	@@ -4571,7 +5472,10 @@
4571	5472	*
4572	5473	* Only IO mappings and raw PFN mappings are allowed.
4573	5474	*
4574		- * Returns zero and the pfn at @pfn on success, -ve otherwise.
	5475	+ * This function does not allow the caller to read the permissions
	5476	+ * of the PTE. Do not use it.
	5477	+ *
	5478	+ * Return: zero and the pfn at @pfn on success, -ve otherwise.
4575	5479	*/
4576	5480	int follow_pfn(struct vm_area_struct *vma, unsigned long address,
4577	5481	unsigned long *pfn)
..	..	@@ -4658,7 +5562,7 @@
4658	5562	void *old_buf = buf;
4659	5563	int write = gup_flags & FOLL_WRITE;
4660	5564
4661		- if (down_read_killable(&mm->mmap_sem))
	5565	+ if (mmap_read_lock_killable(mm))
4662	5566	return 0;
4663	5567
4664	5568	/* ignore errors, just check how much was successfully transferred */
..	..	@@ -4667,7 +5571,7 @@
4667	5571	void *maddr;
4668	5572	struct page *page = NULL;
4669	5573
4670		- ret = get_user_pages_remote(tsk, mm, addr, 1,
	5574	+ ret = get_user_pages_remote(mm, addr, 1,
4671	5575	gup_flags, &page, &vma, NULL);
4672	5576	if (ret <= 0) {
4673	5577	#ifndef CONFIG_HAVE_IOREMAP_PROT
..	..	@@ -4703,13 +5607,13 @@
4703	5607	buf, maddr + offset, bytes);
4704	5608	}
4705	5609	kunmap(page);
4706		- put_page(page);
	5610	+ put_user_page(page);
4707	5611	}
4708	5612	len -= bytes;
4709	5613	buf += bytes;
4710	5614	addr += bytes;
4711	5615	}
4712		- up_read(&mm->mmap_sem);
	5616	+ mmap_read_unlock(mm);
4713	5617
4714	5618	return buf - old_buf;
4715	5619	}
..	..	@@ -4723,6 +5627,8 @@
4723	5627	* @gup_flags: flags modifying lookup behaviour
4724	5628	*
4725	5629	* The caller must hold a reference on @mm.
	5630	+ *
	5631	+ * Return: number of bytes copied from source to destination.
4726	5632	*/
4727	5633	int access_remote_vm(struct mm_struct *mm, unsigned long addr,
4728	5634	void *buf, int len, unsigned int gup_flags)
..	..	@@ -4764,7 +5670,7 @@
4764	5670	/*
4765	5671	* we might be running from an atomic context so we cannot sleep
4766	5672	*/
4767		- if (!down_read_trylock(&mm->mmap_sem))
	5673	+ if (!mmap_read_trylock(mm))
4768	5674	return;
4769	5675
4770	5676	vma = find_vma(mm, ip);
..	..	@@ -4783,7 +5689,7 @@
4783	5689	free_page((unsigned long)buf);
4784	5690	}
4785	5691	}
4786		- up_read(&mm->mmap_sem);
	5692	+ mmap_read_unlock(mm);
4787	5693	}
4788	5694
4789	5695	#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
..	..	@@ -4791,7 +5697,7 @@
4791	5697	{
4792	5698	/*
4793	5699	* Some code (nfs/sunrpc) uses socket ops on kernel memory while
4794		- * holding the mmap_sem, this is safe because kernel memory doesn't
	5700	+ * holding the mmap_lock, this is safe because kernel memory doesn't
4795	5701	* get paged out, therefore we'll never actually fault, and the
4796	5702	* below annotations will generate false positives.
4797	5703	*/
..	..	@@ -4802,7 +5708,7 @@
4802	5708	__might_sleep(file, line, 0);
4803	5709	#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4804	5710	if (current->mm)
4805		- might_lock_read(&current->mm->mmap_sem);
	5711	+ might_lock_read(&current->mm->mmap_lock);
4806	5712	#endif
4807	5713	}
4808	5714	EXPORT_SYMBOL(__might_fault);
..	..	@@ -4979,6 +5885,8 @@
4979	5885	if (rc)
4980	5886	break;
4981	5887
	5888	+ flush_dcache_page(subpage);
	5889	+
4982	5890	cond_resched();
4983	5891	}
4984	5892	return ret_val;