hc
2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/gup.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 #include <linux/kernel.h>
23 #include <linux/errno.h>
34 #include <linux/err.h>
....@@ -13,13 +14,363 @@
1314 #include <linux/sched/signal.h>
1415 #include <linux/rwsem.h>
1516 #include <linux/hugetlb.h>
17
+#include <linux/migrate.h>
18
+#include <linux/mm_inline.h>
19
+#include <linux/sched/mm.h>
20
+
21
+#include <linux/page_pinner.h>
1622
1723 #include <asm/mmu_context.h>
18
-#include <asm/pgtable.h>
1924 #include <asm/tlbflush.h>
2025
2126 #include "internal.h"
2227
28
+struct follow_page_context {
29
+ struct dev_pagemap *pgmap;
30
+ unsigned int page_mask;
31
+};
32
+
33
+static void hpage_pincount_add(struct page *page, int refs)
34
+{
35
+ VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
36
+ VM_BUG_ON_PAGE(page != compound_head(page), page);
37
+
38
+ atomic_add(refs, compound_pincount_ptr(page));
39
+}
40
+
41
+static void hpage_pincount_sub(struct page *page, int refs)
42
+{
43
+ VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
44
+ VM_BUG_ON_PAGE(page != compound_head(page), page);
45
+
46
+ atomic_sub(refs, compound_pincount_ptr(page));
47
+}
48
+
49
+/* Equivalent to calling put_page() @refs times. */
50
+static void put_page_refs(struct page *page, int refs)
51
+{
52
+#ifdef CONFIG_DEBUG_VM
53
+ if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page))
54
+ return;
55
+#endif
56
+
57
+ /*
58
+ * Calling put_page() for each ref is unnecessarily slow. Only the last
59
+ * ref needs a put_page().
60
+ */
61
+ if (refs > 1)
62
+ page_ref_sub(page, refs - 1);
63
+ put_page(page);
64
+}
65
+
66
+/*
67
+ * Return the compound head page with ref appropriately incremented,
68
+ * or NULL if that failed.
69
+ */
70
+static inline struct page *try_get_compound_head(struct page *page, int refs)
71
+{
72
+ struct page *head = compound_head(page);
73
+
74
+ if (WARN_ON_ONCE(page_ref_count(head) < 0))
75
+ return NULL;
76
+ if (unlikely(!page_cache_add_speculative(head, refs)))
77
+ return NULL;
78
+
79
+ /*
80
+ * At this point we have a stable reference to the head page; but it
81
+ * could be that between the compound_head() lookup and the refcount
82
+ * increment, the compound page was split, in which case we'd end up
83
+ * holding a reference on a page that has nothing to do with the page
84
+ * we were given anymore.
85
+ * So now that the head page is stable, recheck that the pages still
86
+ * belong together.
87
+ */
88
+ if (unlikely(compound_head(page) != head)) {
89
+ put_page_refs(head, refs);
90
+ return NULL;
91
+ }
92
+
93
+ return head;
94
+}
95
+
96
+/*
97
+ * try_grab_compound_head() - attempt to elevate a page's refcount, by a
98
+ * flags-dependent amount.
99
+ *
100
+ * "grab" names in this file mean, "look at flags to decide whether to use
101
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount".
102
+ *
103
+ * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
104
+ * same time. (That's true throughout the get_user_pages*() and
105
+ * pin_user_pages*() APIs.) Cases:
106
+ *
107
+ * FOLL_GET: page's refcount will be incremented by 1.
108
+ * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
109
+ *
110
+ * Return: head page (with refcount appropriately incremented) for success, or
111
+ * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
112
+ * considered failure, and furthermore, a likely bug in the caller, so a warning
113
+ * is also emitted.
114
+ */
115
+static __maybe_unused struct page *try_grab_compound_head(struct page *page,
116
+ int refs,
117
+ unsigned int flags)
118
+{
119
+ if (flags & FOLL_GET) {
120
+ struct page *head = try_get_compound_head(page, refs);
121
+ if (head)
122
+ set_page_pinner(head, compound_order(head));
123
+ return head;
124
+ } else if (flags & FOLL_PIN) {
125
+ int orig_refs = refs;
126
+
127
+ /*
128
+ * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
129
+ * path, so fail and let the caller fall back to the slow path.
130
+ */
131
+ if (unlikely(flags & FOLL_LONGTERM) &&
132
+ is_migrate_cma_page(page))
133
+ return NULL;
134
+
135
+ /*
136
+ * CAUTION: Don't use compound_head() on the page before this
137
+ * point, the result won't be stable.
138
+ */
139
+ page = try_get_compound_head(page, refs);
140
+ if (!page)
141
+ return NULL;
142
+
143
+ /*
144
+ * When pinning a compound page of order > 1 (which is what
145
+ * hpage_pincount_available() checks for), use an exact count to
146
+ * track it, via hpage_pincount_add/_sub().
147
+ *
148
+ * However, be sure to *also* increment the normal page refcount
149
+ * field at least once, so that the page really is pinned.
150
+ */
151
+ if (hpage_pincount_available(page))
152
+ hpage_pincount_add(page, refs);
153
+ else
154
+ page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1));
155
+
156
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
157
+ orig_refs);
158
+
159
+ return page;
160
+ }
161
+
162
+ WARN_ON_ONCE(1);
163
+ return NULL;
164
+}
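To make the accounting above concrete, here is a minimal sketch (not part of this patch; the helper name is invented) of the net page_ref_count() change a successful call produces, assuming GUP_PIN_COUNTING_BIAS is 1024, its value in include/linux/mm.h at the time of this change:

/* Illustrative only: refcount delta of try_grab_compound_head(page, refs, flags). */
static int demo_grab_refcount_delta(unsigned int flags, bool has_pincount, int refs)
{
	if (flags & FOLL_GET)
		return refs;			/* plain get: one ref each */
	if (flags & FOLL_PIN) {
		if (has_pincount)		/* compound page, order >= 2 */
			return refs;		/* plus compound_pincount += refs */
		return refs * 1024;		/* GUP_PIN_COUNTING_BIAS per ref */
	}
	return 0;				/* neither flag: WARN and fail */
}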
165
+
166
+static void put_compound_head(struct page *page, int refs, unsigned int flags)
167
+{
168
+ if (flags & FOLL_PIN) {
169
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
170
+ refs);
171
+
172
+ if (hpage_pincount_available(page))
173
+ hpage_pincount_sub(page, refs);
174
+ else
175
+ refs *= GUP_PIN_COUNTING_BIAS;
176
+ }
177
+
178
+ if (flags & FOLL_GET)
179
+ reset_page_pinner(page, compound_order(page));
180
+ put_page_refs(page, refs);
181
+}
182
+
183
+/**
184
+ * try_grab_page() - elevate a page's refcount by a flag-dependent amount
185
+ *
186
+ * This might not do anything at all, depending on the flags argument.
187
+ *
188
+ * "grab" names in this file mean, "look at flags to decide whether to use
189
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount".
190
+ *
191
+ * @page: pointer to page to be grabbed
192
+ * @flags: gup flags: these are the FOLL_* flag values.
193
+ *
194
+ * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
195
+ * time. Cases:
196
+ *
197
+ * FOLL_GET: page's refcount will be incremented by 1.
198
+ * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
199
+ *
200
+ * Return: true for success, or if no action was required (if neither FOLL_PIN
201
+ * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
202
+ * FOLL_PIN was set, but the page could not be grabbed.
203
+ */
204
+bool __must_check try_grab_page(struct page *page, unsigned int flags)
205
+{
206
+ WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
207
+
208
+ if (flags & FOLL_GET) {
209
+ bool ret = try_get_page(page);
210
+
211
+ if (ret) {
212
+ page = compound_head(page);
213
+ set_page_pinner(page, compound_order(page));
214
+ }
215
+ return ret;
216
+ } else if (flags & FOLL_PIN) {
217
+ int refs = 1;
218
+
219
+ page = compound_head(page);
220
+
221
+ if (WARN_ON_ONCE(page_ref_count(page) <= 0))
222
+ return false;
223
+
224
+ if (hpage_pincount_available(page))
225
+ hpage_pincount_add(page, 1);
226
+ else
227
+ refs = GUP_PIN_COUNTING_BIAS;
228
+
229
+ /*
230
+ * Similar to try_grab_compound_head(): even if using the
231
+ * hpage_pincount_add/_sub() routines, be sure to
232
+ * *also* increment the normal page refcount field at least
233
+ * once, so that the page really is pinned.
234
+ */
235
+ page_ref_add(page, refs);
236
+
237
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
238
+ }
239
+
240
+ return true;
241
+}
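A hedged sketch of the grab/release pairing implied by the documentation above; the helper is invented for illustration and is not part of this patch.

/* Illustrative only: try_grab_page() fails only when FOLL_GET or FOLL_PIN
 * was requested; release with the matching call for the flag used. */
static int demo_grab_then_release(struct page *page, unsigned int flags)
{
	if (!try_grab_page(page, flags))
		return -ENOMEM;

	/* ... use the page ... */

	if (flags & FOLL_PIN)
		unpin_user_page(page);
	else if (flags & FOLL_GET)
		put_page(page);	/* this tree also offers put_user_page(), below */
	return 0;
}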
242
+
243
+/**
244
+ * unpin_user_page() - release a dma-pinned page
245
+ * @page: pointer to page to be released
246
+ *
247
+ * Pages that were pinned via pin_user_pages*() must be released via either
248
+ * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
249
+ * that such pages can be separately tracked and uniquely handled. In
250
+ * particular, interactions with RDMA and filesystems need special handling.
251
+ */
252
+void unpin_user_page(struct page *page)
253
+{
254
+ put_compound_head(compound_head(page), 1, FOLL_PIN);
255
+}
256
+EXPORT_SYMBOL(unpin_user_page);
257
+
258
+/*
259
+ * put_user_page() - release a page obtained using get_user_pages() or
260
+ * follow_page(FOLL_GET)
261
+ * @page: pointer to page to be released
262
+ *
263
+ * Pages that were obtained via get_user_pages()/follow_page(FOLL_GET) must be
264
+ * released via put_user_page().
265
+ * Note: if it's not a page from GUP or follow_page(FOLL_GET), this is harmless.
266
+ */
267
+void put_user_page(struct page *page)
268
+{
269
+ struct page *head = compound_head(page);
270
+
271
+ reset_page_pinner(head, compound_order(head));
272
+ put_page(page);
273
+}
274
+EXPORT_SYMBOL(put_user_page);
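A minimal usage sketch for the FOLL_GET path described above, assuming the caller already holds the mmap lock for the VMA's mm; the function is invented for illustration and is not part of this patch.

/* Illustrative only: look up a page with FOLL_GET, then release it so the
 * page_pinner record is cleared along with the reference. */
static void demo_inspect_user_page(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		return;
	/* ... examine the page ... */
	put_user_page(page);
}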
275
+
276
+/**
277
+ * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
278
+ * @pages: array of pages to be maybe marked dirty, and definitely released.
279
+ * @npages: number of pages in the @pages array.
280
+ * @make_dirty: whether to mark the pages dirty
281
+ *
282
+ * "gup-pinned page" refers to a page that has had one of the get_user_pages()
283
+ * variants called on that page.
284
+ *
285
+ * For each page in the @pages array, make that page (or its head page, if a
286
+ * compound page) dirty, if @make_dirty is true, and if the page was previously
287
+ * listed as clean. In any case, releases all pages using unpin_user_page(),
288
+ * possibly via unpin_user_pages(), for the non-dirty case.
289
+ *
290
+ * Please see the unpin_user_page() documentation for details.
291
+ *
292
+ * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
293
+ * required, then the caller should a) verify that this is really correct,
294
+ * because _lock() is usually required, and b) hand code it:
295
+ * set_page_dirty(), unpin_user_page().
296
+ *
297
+ */
298
+void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
299
+ bool make_dirty)
300
+{
301
+ unsigned long index;
302
+
303
+ /*
304
+ * TODO: this can be optimized for huge pages: if a series of pages is
305
+ * physically contiguous and part of the same compound page, then a
306
+ * single operation to the head page should suffice.
307
+ */
308
+
309
+ if (!make_dirty) {
310
+ unpin_user_pages(pages, npages);
311
+ return;
312
+ }
313
+
314
+ for (index = 0; index < npages; index++) {
315
+ struct page *page = compound_head(pages[index]);
316
+ /*
317
+ * Checking PageDirty at this point may race with
318
+ * clear_page_dirty_for_io(), but that's OK. Two key
319
+ * cases:
320
+ *
321
+ * 1) This code sees the page as already dirty, so it
322
+ * skips the call to set_page_dirty(). That could happen
323
+ * because clear_page_dirty_for_io() called
324
+ * page_mkclean(), followed by set_page_dirty().
325
+ * However, now the page is going to get written back,
326
+ * which meets the original intention of setting it
327
+ * dirty, so all is well: clear_page_dirty_for_io() goes
328
+ * on to call TestClearPageDirty(), and write the page
329
+ * back.
330
+ *
331
+ * 2) This code sees the page as clean, so it calls
332
+ * set_page_dirty(). The page stays dirty, despite being
333
+ * written back, so it gets written back again in the
334
+ * next writeback cycle. This is harmless.
335
+ */
336
+ if (!PageDirty(page))
337
+ set_page_dirty_lock(page);
338
+ unpin_user_page(page);
339
+ }
340
+}
341
+EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
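A minimal sketch of the caller pattern this helper is meant for, e.g. a driver that lets a device DMA into user memory; pin_user_pages_fast() is assumed as the acquisition side and the function name is invented for illustration.

/* Illustrative only: pin, let the device write, then dirty + unpin in one call. */
static int demo_dma_from_device(unsigned long uaddr, int nr_pages,
				struct page **pages)
{
	int pinned;

	pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... program the device to write into @pages and wait for it ... */

	unpin_user_pages_dirty_lock(pages, pinned, true);
	return 0;
}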
342
+
343
+/**
344
+ * unpin_user_pages() - release an array of gup-pinned pages.
345
+ * @pages: array of pages to be marked dirty and released.
346
+ * @npages: number of pages in the @pages array.
347
+ *
348
+ * For each page in the @pages array, release the page using unpin_user_page().
349
+ *
350
+ * Please see the unpin_user_page() documentation for details.
351
+ */
352
+void unpin_user_pages(struct page **pages, unsigned long npages)
353
+{
354
+ unsigned long index;
355
+
356
+ /*
357
+ * If this WARN_ON() fires, then the system *might* be leaking pages (by
358
+ * leaving them pinned), but probably not. More likely, gup/pup returned
359
+ * a hard -ERRNO error to the caller, who erroneously passed it here.
360
+ */
361
+ if (WARN_ON(IS_ERR_VALUE(npages)))
362
+ return;
363
+ /*
364
+ * TODO: this can be optimized for huge pages: if a series of pages is
365
+ * physically contiguous and part of the same compound page, then a
366
+ * single operation to the head page should suffice.
367
+ */
368
+ for (index = 0; index < npages; index++)
369
+ unpin_user_page(pages[index]);
370
+}
371
+EXPORT_SYMBOL(unpin_user_pages);
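The caller bug that the IS_ERR_VALUE() check above watches for is passing a gup/pup error return straight back in as a count; a hedged sketch of the defensive pattern, with an invented helper name:

/* Illustrative only: only a positive value is a page count; negative is -errno. */
static void demo_safe_release(struct page **pages, long nr_or_errno)
{
	if (nr_or_errno > 0)
		unpin_user_pages(pages, nr_or_errno);
}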
372
+
373
+#ifdef CONFIG_MMU
23374 static struct page *no_page_table(struct vm_area_struct *vma,
24375 unsigned int flags)
25376 {
....@@ -31,7 +382,8 @@
31382 * But we can only make this optimization where a hole would surely
32383 * be zero-filled if handle_mm_fault() actually did handle it.
33384 */
34
- if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
385
+ if ((flags & FOLL_DUMP) &&
386
+ (vma_is_anonymous(vma) || !vma->vm_ops->fault))
35387 return ERR_PTR(-EFAULT);
36388 return NULL;
37389 }
....@@ -61,32 +413,40 @@
61413 }
62414
63415 /*
64
- * FOLL_FORCE or a forced COW break can write even to unwritable pte's,
65
- * but only after we've gone through a COW cycle and they are dirty.
416
+ * FOLL_FORCE can write to even unwritable pte's, but only
417
+ * after we've gone through a COW cycle and they are dirty.
66418 */
67419 static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
68420 {
69
- return pte_write(pte) || ((flags & FOLL_COW) && pte_dirty(pte));
70
-}
71
-
72
-/*
73
- * A (separate) COW fault might break the page the other way and
74
- * get_user_pages() would return the page from what is now the wrong
75
- * VM. So we need to force a COW break at GUP time even for reads.
76
- */
77
-static inline bool should_force_cow_break(struct vm_area_struct *vma, unsigned int flags)
78
-{
79
- return is_cow_mapping(vma->vm_flags) && (flags & FOLL_GET);
421
+ return pte_write(pte) ||
422
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
80423 }
81424
82425 static struct page *follow_page_pte(struct vm_area_struct *vma,
83
- unsigned long address, pmd_t *pmd, unsigned int flags)
426
+ unsigned long address, pmd_t *pmd, unsigned int flags,
427
+ struct dev_pagemap **pgmap)
84428 {
85429 struct mm_struct *mm = vma->vm_mm;
86
- struct dev_pagemap *pgmap = NULL;
87430 struct page *page;
88431 spinlock_t *ptl;
89432 pte_t *ptep, pte;
433
+ int ret;
434
+
435
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
436
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
437
+ (FOLL_PIN | FOLL_GET)))
438
+ return ERR_PTR(-EINVAL);
439
+
440
+ /*
441
+ * Considering PTE level hugetlb, like continuous-PTE hugetlb on
442
+ * ARM64 architecture.
443
+ */
444
+ if (is_vm_hugetlb_page(vma)) {
445
+ page = follow_huge_pmd_pte(vma, address, flags);
446
+ if (page)
447
+ return page;
448
+ return no_page_table(vma, flags);
449
+ }
90450
91451 retry:
92452 if (unlikely(pmd_bad(*pmd)))
....@@ -120,13 +480,14 @@
120480 }
121481
122482 page = vm_normal_page(vma, address, pte);
123
- if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
483
+ if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
124484 /*
125
- * Only return device mapping pages in the FOLL_GET case since
126
- * they are only valid while holding the pgmap reference.
485
+ * Only return device mapping pages in the FOLL_GET or FOLL_PIN
486
+ * case since they are only valid while holding the pgmap
487
+ * reference.
127488 */
128
- pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
129
- if (pgmap)
489
+ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
490
+ if (*pgmap)
130491 page = pte_page(pte);
131492 else
132493 goto no_page;
....@@ -140,8 +501,6 @@
140501 if (is_zero_pfn(pte_pfn(pte))) {
141502 page = pte_page(pte);
142503 } else {
143
- int ret;
144
-
145504 ret = follow_pfn_pte(vma, address, ptep, flags);
146505 page = ERR_PTR(ret);
147506 goto out;
....@@ -149,7 +508,6 @@
149508 }
150509
151510 if (flags & FOLL_SPLIT && PageTransCompound(page)) {
152
- int ret;
153511 get_page(page);
154512 pte_unmap_unlock(ptep, ptl);
155513 lock_page(page);
....@@ -161,16 +519,22 @@
161519 goto retry;
162520 }
163521
164
- if (flags & FOLL_GET) {
165
- if (unlikely(!try_get_page(page))) {
166
- page = ERR_PTR(-ENOMEM);
522
+ /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
523
+ if (unlikely(!try_grab_page(page, flags))) {
524
+ page = ERR_PTR(-ENOMEM);
525
+ goto out;
526
+ }
527
+ /*
528
+ * We need to make the page accessible if and only if we are going
529
+ * to access its content (the FOLL_PIN case). Please see
530
+ * Documentation/core-api/pin_user_pages.rst for details.
531
+ */
532
+ if (flags & FOLL_PIN) {
533
+ ret = arch_make_page_accessible(page);
534
+ if (ret) {
535
+ unpin_user_page(page);
536
+ page = ERR_PTR(ret);
167537 goto out;
168
- }
169
-
170
- /* drop the pgmap reference now that we hold the page */
171
- if (pgmap) {
172
- put_dev_pagemap(pgmap);
173
- pgmap = NULL;
174538 }
175539 }
176540 if (flags & FOLL_TOUCH) {
....@@ -222,7 +586,8 @@
222586
223587 static struct page *follow_pmd_mask(struct vm_area_struct *vma,
224588 unsigned long address, pud_t *pudp,
225
- unsigned int flags, unsigned int *page_mask)
589
+ unsigned int flags,
590
+ struct follow_page_context *ctx)
226591 {
227592 pmd_t *pmd, pmdval;
228593 spinlock_t *ptl;
....@@ -237,8 +602,8 @@
237602 pmdval = READ_ONCE(*pmd);
238603 if (pmd_none(pmdval))
239604 return no_page_table(vma, flags);
240
- if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
241
- page = follow_huge_pmd(mm, address, pmd, flags);
605
+ if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
606
+ page = follow_huge_pmd_pte(vma, address, flags);
242607 if (page)
243608 return page;
244609 return no_page_table(vma, flags);
....@@ -262,7 +627,7 @@
262627 pmdval = READ_ONCE(*pmd);
263628 /*
264629 * MADV_DONTNEED may convert the pmd to null because
265
- * mmap_sem is held in read mode
630
+ * mmap_lock is held in read mode
266631 */
267632 if (pmd_none(pmdval))
268633 return no_page_table(vma, flags);
....@@ -270,13 +635,13 @@
270635 }
271636 if (pmd_devmap(pmdval)) {
272637 ptl = pmd_lock(mm, pmd);
273
- page = follow_devmap_pmd(vma, address, pmd, flags);
638
+ page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
274639 spin_unlock(ptl);
275640 if (page)
276641 return page;
277642 }
278643 if (likely(!pmd_trans_huge(pmdval)))
279
- return follow_page_pte(vma, address, pmd, flags);
644
+ return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
280645
281646 if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
282647 return no_page_table(vma, flags);
....@@ -296,9 +661,9 @@
296661 }
297662 if (unlikely(!pmd_trans_huge(*pmd))) {
298663 spin_unlock(ptl);
299
- return follow_page_pte(vma, address, pmd, flags);
664
+ return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
300665 }
301
- if (flags & FOLL_SPLIT) {
666
+ if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
302667 int ret;
303668 page = pmd_page(*pmd);
304669 if (is_huge_zero_page(page)) {
....@@ -307,7 +672,7 @@
307672 split_huge_pmd(vma, pmd, address);
308673 if (pmd_trans_unstable(pmd))
309674 ret = -EBUSY;
310
- } else {
675
+ } else if (flags & FOLL_SPLIT) {
311676 if (unlikely(!try_get_page(page))) {
312677 spin_unlock(ptl);
313678 return ERR_PTR(-ENOMEM);
....@@ -319,21 +684,25 @@
319684 put_page(page);
320685 if (pmd_none(*pmd))
321686 return no_page_table(vma, flags);
687
+ } else { /* flags & FOLL_SPLIT_PMD */
688
+ spin_unlock(ptl);
689
+ split_huge_pmd(vma, pmd, address);
690
+ ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
322691 }
323692
324693 return ret ? ERR_PTR(ret) :
325
- follow_page_pte(vma, address, pmd, flags);
694
+ follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
326695 }
327696 page = follow_trans_huge_pmd(vma, address, pmd, flags);
328697 spin_unlock(ptl);
329
- *page_mask = HPAGE_PMD_NR - 1;
698
+ ctx->page_mask = HPAGE_PMD_NR - 1;
330699 return page;
331700 }
332701
333
-
334702 static struct page *follow_pud_mask(struct vm_area_struct *vma,
335703 unsigned long address, p4d_t *p4dp,
336
- unsigned int flags, unsigned int *page_mask)
704
+ unsigned int flags,
705
+ struct follow_page_context *ctx)
337706 {
338707 pud_t *pud;
339708 spinlock_t *ptl;
....@@ -343,7 +712,7 @@
343712 pud = pud_offset(p4dp, address);
344713 if (pud_none(*pud))
345714 return no_page_table(vma, flags);
346
- if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
715
+ if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
347716 page = follow_huge_pud(mm, address, pud, flags);
348717 if (page)
349718 return page;
....@@ -359,7 +728,7 @@
359728 }
360729 if (pud_devmap(*pud)) {
361730 ptl = pud_lock(mm, pud);
362
- page = follow_devmap_pud(vma, address, pud, flags);
731
+ page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
363732 spin_unlock(ptl);
364733 if (page)
365734 return page;
....@@ -367,13 +736,13 @@
367736 if (unlikely(pud_bad(*pud)))
368737 return no_page_table(vma, flags);
369738
370
- return follow_pmd_mask(vma, address, pud, flags, page_mask);
739
+ return follow_pmd_mask(vma, address, pud, flags, ctx);
371740 }
372
-
373741
374742 static struct page *follow_p4d_mask(struct vm_area_struct *vma,
375743 unsigned long address, pgd_t *pgdp,
376
- unsigned int flags, unsigned int *page_mask)
744
+ unsigned int flags,
745
+ struct follow_page_context *ctx)
377746 {
378747 p4d_t *p4d;
379748 struct page *page;
....@@ -393,7 +762,7 @@
393762 return page;
394763 return no_page_table(vma, flags);
395764 }
396
- return follow_pud_mask(vma, address, p4d, flags, page_mask);
765
+ return follow_pud_mask(vma, address, p4d, flags, ctx);
397766 }
398767
399768 /**
....@@ -401,28 +770,34 @@
401770 * @vma: vm_area_struct mapping @address
402771 * @address: virtual address to look up
403772 * @flags: flags modifying lookup behaviour
404
- * @page_mask: on output, *page_mask is set according to the size of the page
773
+ * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
774
+ * pointer to output page_mask
405775 *
406776 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
407777 *
408
- * Returns the mapped (struct page *), %NULL if no mapping exists, or
778
+ * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
779
+ * the device's dev_pagemap metadata to avoid repeating expensive lookups.
780
+ *
781
+ * On output, the @ctx->page_mask is set according to the size of the page.
782
+ *
783
+ * Return: the mapped (struct page *), %NULL if no mapping exists, or
409784 * an error pointer if there is a mapping to something not represented
410785 * by a page descriptor (see also vm_normal_page()).
411786 */
412
-struct page *follow_page_mask(struct vm_area_struct *vma,
787
+static struct page *follow_page_mask(struct vm_area_struct *vma,
413788 unsigned long address, unsigned int flags,
414
- unsigned int *page_mask)
789
+ struct follow_page_context *ctx)
415790 {
416791 pgd_t *pgd;
417792 struct page *page;
418793 struct mm_struct *mm = vma->vm_mm;
419794
420
- *page_mask = 0;
795
+ ctx->page_mask = 0;
421796
422797 /* make this handle hugepd */
423798 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
424799 if (!IS_ERR(page)) {
425
- BUG_ON(flags & FOLL_GET);
800
+ WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
426801 return page;
427802 }
428803
....@@ -446,7 +821,19 @@
446821 return no_page_table(vma, flags);
447822 }
448823
449
- return follow_p4d_mask(vma, address, pgd, flags, page_mask);
824
+ return follow_p4d_mask(vma, address, pgd, flags, ctx);
825
+}
826
+
827
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
828
+ unsigned int foll_flags)
829
+{
830
+ struct follow_page_context ctx = { NULL };
831
+ struct page *page;
832
+
833
+ page = follow_page_mask(vma, address, foll_flags, &ctx);
834
+ if (ctx.pgmap)
835
+ put_dev_pagemap(ctx.pgmap);
836
+ return page;
450837 }
451838
452839 static int get_gate_page(struct mm_struct *mm, unsigned long address,
....@@ -490,15 +877,8 @@
490877 if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
491878 goto unmap;
492879 *page = pte_page(*pte);
493
-
494
- /*
495
- * This should never happen (a device public page in the gate
496
- * area).
497
- */
498
- if (is_device_public_page(*page))
499
- goto unmap;
500880 }
501
- if (unlikely(!try_get_page(*page))) {
881
+ if (unlikely(!try_grab_page(*page, gup_flags))) {
502882 ret = -ENOMEM;
503883 goto unmap;
504884 }
....@@ -510,12 +890,12 @@
510890 }
511891
512892 /*
513
- * mmap_sem must be held on entry. If @nonblocking != NULL and
514
- * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
515
- * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
893
+ * mmap_lock must be held on entry. If @locked != NULL and *@flags
894
+ * does not include FOLL_NOWAIT, the mmap_lock may be released. If it
895
+ * is, *@locked will be set to 0 and -EBUSY returned.
516896 */
517
-static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
518
- unsigned long address, unsigned int *flags, int *nonblocking)
897
+static int faultin_page(struct vm_area_struct *vma,
898
+ unsigned long address, unsigned int *flags, int *locked)
519899 {
520900 unsigned int fault_flags = 0;
521901 vm_fault_t ret;
....@@ -527,16 +907,19 @@
527907 fault_flags |= FAULT_FLAG_WRITE;
528908 if (*flags & FOLL_REMOTE)
529909 fault_flags |= FAULT_FLAG_REMOTE;
530
- if (nonblocking)
531
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
910
+ if (locked)
911
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
532912 if (*flags & FOLL_NOWAIT)
533913 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
534914 if (*flags & FOLL_TRIED) {
535
- VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
915
+ /*
916
+ * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
917
+ * can co-exist
918
+ */
536919 fault_flags |= FAULT_FLAG_TRIED;
537920 }
538921
539
- ret = handle_mm_fault(vma, address, fault_flags);
922
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
540923 if (ret & VM_FAULT_ERROR) {
541924 int err = vm_fault_to_errno(ret, *flags);
542925
....@@ -545,16 +928,9 @@
545928 BUG();
546929 }
547930
548
- if (tsk) {
549
- if (ret & VM_FAULT_MAJOR)
550
- tsk->maj_flt++;
551
- else
552
- tsk->min_flt++;
553
- }
554
-
555931 if (ret & VM_FAULT_RETRY) {
556
- if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
557
- *nonblocking = 0;
932
+ if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
933
+ *locked = 0;
558934 return -EBUSY;
559935 }
560936
....@@ -583,6 +959,9 @@
583959
584960 if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
585961 return -EFAULT;
962
+
963
+ if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
964
+ return -EOPNOTSUPP;
586965
587966 if (write) {
588967 if (!(vm_flags & VM_WRITE)) {
....@@ -621,7 +1000,6 @@
6211000
6221001 /**
6231002 * __get_user_pages() - pin user pages in memory
624
- * @tsk: task_struct of target task
6251003 * @mm: mm_struct of target mm
6261004 * @start: starting user address
6271005 * @nr_pages: number of pages from start to pin
....@@ -631,15 +1009,22 @@
6311009 * only intends to ensure the pages are faulted in.
6321010 * @vmas: array of pointers to vmas corresponding to each page.
6331011 * Or NULL if the caller does not require them.
634
- * @nonblocking: whether waiting for disk IO or mmap_sem contention
1012
+ * @locked: whether we're still with the mmap_lock held
6351013 *
636
- * Returns number of pages pinned. This may be fewer than the number
637
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
638
- * were pinned, returns -errno. Each page returned must be released
639
- * with a put_page() call when it is finished with. vmas will only
640
- * remain valid while mmap_sem is held.
1014
+ * Returns either number of pages pinned (which may be less than the
1015
+ * number requested), or an error. Details about the return value:
6411016 *
642
- * Must be called with mmap_sem held. It may be released. See below.
1017
+ * -- If nr_pages is 0, returns 0.
1018
+ * -- If nr_pages is >0, but no pages were pinned, returns -errno.
1019
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of
1020
+ * pages pinned. Again, this may be less than nr_pages.
1021
+ * -- 0 return value is possible when the fault would need to be retried.
1022
+ *
1023
+ * The caller is responsible for releasing returned @pages, via put_page().
1024
+ *
1025
+ * @vmas are valid only as long as mmap_lock is held.
1026
+ *
1027
+ * Must be called with mmap_lock held. It may be released. See below.
6431028 *
6441029 * __get_user_pages walks a process's page tables and takes a reference to
6451030 * each struct page that each user address corresponds to at a given
....@@ -660,14 +1045,12 @@
6601045 * appropriate) must be called after the page is finished with, and
6611046 * before put_page is called.
6621047 *
663
- * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
664
- * or mmap_sem contention, and if waiting is needed to pin all pages,
665
- * *@nonblocking will be set to 0. Further, if @gup_flags does not
666
- * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
667
- * this case.
1048
+ * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
1049
+ * released by an up_read(). That can happen if @gup_flags does not
1050
+ * have FOLL_NOWAIT.
6681051 *
669
- * A caller using such a combination of @nonblocking and @gup_flags
670
- * must therefore hold the mmap_sem for reading only, and recognize
1052
+ * A caller using such a combination of @locked and @gup_flags
1053
+ * must therefore hold the mmap_lock for reading only, and recognize
6711054 * when it's been released. Otherwise, it must be held for either
6721055 * reading or writing and will not be released.
6731056 *
....@@ -675,21 +1058,21 @@
6751058 * instead of __get_user_pages. __get_user_pages should be used only if
6761059 * you need some special @gup_flags.
6771060 */
678
-static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1061
+static long __get_user_pages(struct mm_struct *mm,
6791062 unsigned long start, unsigned long nr_pages,
6801063 unsigned int gup_flags, struct page **pages,
681
- struct vm_area_struct **vmas, int *nonblocking)
1064
+ struct vm_area_struct **vmas, int *locked)
6821065 {
683
- long i = 0;
684
- unsigned int page_mask;
1066
+ long ret = 0, i = 0;
6851067 struct vm_area_struct *vma = NULL;
1068
+ struct follow_page_context ctx = { NULL };
6861069
6871070 if (!nr_pages)
6881071 return 0;
6891072
6901073 start = untagged_addr(start);
6911074
692
- VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1075
+ VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
6931076
6941077 /*
6951078 * If FOLL_FORCE is set then do not force a full fault as the hinting
....@@ -708,53 +1091,64 @@
7081091 if (!vma || start >= vma->vm_end) {
7091092 vma = find_extend_vma(mm, start);
7101093 if (!vma && in_gate_area(mm, start)) {
711
- int ret;
7121094 ret = get_gate_page(mm, start & PAGE_MASK,
7131095 gup_flags, &vma,
7141096 pages ? &pages[i] : NULL);
7151097 if (ret)
716
- return i ? : ret;
717
- page_mask = 0;
1098
+ goto out;
1099
+ ctx.page_mask = 0;
7181100 goto next_page;
7191101 }
7201102
721
- if (!vma || check_vma_flags(vma, gup_flags))
722
- return i ? : -EFAULT;
1103
+ if (!vma) {
1104
+ ret = -EFAULT;
1105
+ goto out;
1106
+ }
1107
+ ret = check_vma_flags(vma, gup_flags);
1108
+ if (ret)
1109
+ goto out;
1110
+
7231111 if (is_vm_hugetlb_page(vma)) {
724
- if (should_force_cow_break(vma, foll_flags))
725
- foll_flags |= FOLL_WRITE;
7261112 i = follow_hugetlb_page(mm, vma, pages, vmas,
7271113 &start, &nr_pages, i,
728
- foll_flags, nonblocking);
1114
+ gup_flags, locked);
1115
+ if (locked && *locked == 0) {
1116
+ /*
1117
+ * We've got a VM_FAULT_RETRY
1118
+ * and we've lost mmap_lock.
1119
+ * We must stop here.
1120
+ */
1121
+ BUG_ON(gup_flags & FOLL_NOWAIT);
1122
+ BUG_ON(ret != 0);
1123
+ goto out;
1124
+ }
7291125 continue;
7301126 }
7311127 }
732
-
733
- if (should_force_cow_break(vma, foll_flags))
734
- foll_flags |= FOLL_WRITE;
735
-
7361128 retry:
7371129 /*
7381130 * If we have a pending SIGKILL, don't keep faulting pages and
7391131 * potentially allocating memory.
7401132 */
741
- if (unlikely(fatal_signal_pending(current)))
742
- return i ? i : -ERESTARTSYS;
1133
+ if (fatal_signal_pending(current)) {
1134
+ ret = -EINTR;
1135
+ goto out;
1136
+ }
7431137 cond_resched();
744
- page = follow_page_mask(vma, start, foll_flags, &page_mask);
1138
+
1139
+ page = follow_page_mask(vma, start, foll_flags, &ctx);
7451140 if (!page) {
746
- int ret;
747
- ret = faultin_page(tsk, vma, start, &foll_flags,
748
- nonblocking);
1141
+ ret = faultin_page(vma, start, &foll_flags, locked);
7491142 switch (ret) {
7501143 case 0:
7511144 goto retry;
1145
+ case -EBUSY:
1146
+ ret = 0;
1147
+ fallthrough;
7521148 case -EFAULT:
7531149 case -ENOMEM:
7541150 case -EHWPOISON:
755
- return i ? i : ret;
756
- case -EBUSY:
757
- return i;
1151
+ goto out;
7581152 case -ENOENT:
7591153 goto next_page;
7601154 }
....@@ -766,27 +1160,31 @@
7661160 */
7671161 goto next_page;
7681162 } else if (IS_ERR(page)) {
769
- return i ? i : PTR_ERR(page);
1163
+ ret = PTR_ERR(page);
1164
+ goto out;
7701165 }
7711166 if (pages) {
7721167 pages[i] = page;
7731168 flush_anon_page(vma, page, start);
7741169 flush_dcache_page(page);
775
- page_mask = 0;
1170
+ ctx.page_mask = 0;
7761171 }
7771172 next_page:
7781173 if (vmas) {
7791174 vmas[i] = vma;
780
- page_mask = 0;
1175
+ ctx.page_mask = 0;
7811176 }
782
- page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
1177
+ page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
7831178 if (page_increm > nr_pages)
7841179 page_increm = nr_pages;
7851180 i += page_increm;
7861181 start += page_increm * PAGE_SIZE;
7871182 nr_pages -= page_increm;
7881183 } while (nr_pages);
789
- return i;
1184
+out:
1185
+ if (ctx.pgmap)
1186
+ put_dev_pagemap(ctx.pgmap);
1187
+ return i ? i : ret;
7901188 }
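The release obligation stated in the comment block above ("the caller is responsible for releasing returned @pages, via put_page()"), as a caller of the public get_user_pages() wrappers would discharge it; a minimal sketch, not part of this patch.

/* Illustrative only: release whatever was actually pinned. */
static void demo_release_gup_pages(struct page **pages, long nr_pinned)
{
	long i;

	/* nr_pinned may be less than requested; a negative value is an -errno. */
	for (i = 0; i < nr_pinned; i++)
		put_page(pages[i]);
}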
7911189
7921190 static bool vma_permits_fault(struct vm_area_struct *vma,
....@@ -812,15 +1210,14 @@
8121210 return true;
8131211 }
8141212
815
-/*
1213
+/**
8161214 * fixup_user_fault() - manually resolve a user page fault
817
- * @tsk: the task_struct to use for page fault accounting, or
818
- * NULL if faults are not to be recorded.
8191215 * @mm: mm_struct of target mm
8201216 * @address: user address
8211217 * @fault_flags:flags to pass down to handle_mm_fault()
822
- * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller
823
- * does not allow retry
1218
+ * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller
1219
+ * does not allow retry. If NULL, the caller must guarantee
1220
+ * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
8241221 *
8251222 * This is meant to be called in the specific scenario where for locking reasons
8261223 * we try to access user memory in atomic context (within a pagefault_disable()
....@@ -839,10 +1236,10 @@
8391236 * such architectures, gup() will not be enough to make a subsequent access
8401237 * succeed.
8411238 *
842
- * This function will not return with an unlocked mmap_sem. So it has not the
843
- * same semantics wrt the @mm->mmap_sem as does filemap_fault().
1239
+ * This function will not return with an unlocked mmap_lock. So it has not the
1240
+ * same semantics wrt the @mm->mmap_lock as does filemap_fault().
8441241 */
845
-int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1242
+int fixup_user_fault(struct mm_struct *mm,
8461243 unsigned long address, unsigned int fault_flags,
8471244 bool *unlocked)
8481245 {
....@@ -852,7 +1249,7 @@
8521249 address = untagged_addr(address);
8531250
8541251 if (unlocked)
855
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1252
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
8561253
8571254 retry:
8581255 vma = find_extend_vma(mm, address);
....@@ -862,7 +1259,11 @@
8621259 if (!vma_permits_fault(vma, fault_flags))
8631260 return -EFAULT;
8641261
865
- ret = handle_mm_fault(vma, address, fault_flags);
1262
+ if ((fault_flags & FAULT_FLAG_KILLABLE) &&
1263
+ fatal_signal_pending(current))
1264
+ return -EINTR;
1265
+
1266
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
8661267 major |= ret & VM_FAULT_MAJOR;
8671268 if (ret & VM_FAULT_ERROR) {
8681269 int err = vm_fault_to_errno(ret, 0);
....@@ -873,27 +1274,21 @@
8731274 }
8741275
8751276 if (ret & VM_FAULT_RETRY) {
876
- down_read(&mm->mmap_sem);
877
- if (!(fault_flags & FAULT_FLAG_TRIED)) {
878
- *unlocked = true;
879
- fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
880
- fault_flags |= FAULT_FLAG_TRIED;
881
- goto retry;
882
- }
1277
+ mmap_read_lock(mm);
1278
+ *unlocked = true;
1279
+ fault_flags |= FAULT_FLAG_TRIED;
1280
+ goto retry;
8831281 }
8841282
885
- if (tsk) {
886
- if (major)
887
- tsk->maj_flt++;
888
- else
889
- tsk->min_flt++;
890
- }
8911283 return 0;
8921284 }
8931285 EXPORT_SYMBOL_GPL(fixup_user_fault);
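A minimal sketch of how callers such as the futex code use this helper: an atomic user access failed, so the fault is resolved under mmap_lock and the access is then retried by the caller. The function name is invented for illustration.

/* Illustrative only: fault in a user address for writing, then let the
 * caller retry its pagefault_disable()'d access. */
static int demo_fault_in_user_writeable(unsigned long uaddr)
{
	struct mm_struct *mm = current->mm;
	bool unlocked = false;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, uaddr, FAULT_FLAG_WRITE, &unlocked);
	mmap_read_unlock(mm);

	return ret;	/* 0 on success */
}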
8941286
895
-static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
896
- struct mm_struct *mm,
1287
+/*
1288
+ * Please note that this function, unlike __get_user_pages will not
1289
+ * return 0 for nr_pages > 0 without FOLL_NOWAIT
1290
+ */
1291
+static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
8971292 unsigned long start,
8981293 unsigned long nr_pages,
8991294 struct page **pages,
....@@ -911,13 +1306,25 @@
9111306 BUG_ON(*locked != 1);
9121307 }
9131308
914
- if (pages)
1309
+ if (flags & FOLL_PIN)
1310
+ atomic_set(&mm->has_pinned, 1);
1311
+
1312
+ /*
1313
+ * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
1314
+ * is to set FOLL_GET if the caller wants pages[] filled in (but has
1315
+ * carelessly failed to specify FOLL_GET), so keep doing that, but only
1316
+ * for FOLL_GET, not for the newer FOLL_PIN.
1317
+ *
1318
+ * FOLL_PIN always expects pages to be non-null, but no need to assert
1319
+ * that here, as any failures will be obvious enough.
1320
+ */
1321
+ if (pages && !(flags & FOLL_PIN))
9151322 flags |= FOLL_GET;
9161323
9171324 pages_done = 0;
9181325 lock_dropped = false;
9191326 for (;;) {
920
- ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
1327
+ ret = __get_user_pages(mm, start, nr_pages, flags, pages,
9211328 vmas, locked);
9221329 if (!locked)
9231330 /* VM_FAULT_RETRY couldn't trigger, bypass */
....@@ -928,10 +1335,6 @@
9281335 BUG_ON(ret < 0);
9291336 BUG_ON(ret >= nr_pages);
9301337 }
931
-
932
- if (!pages)
933
- /* If it's a prefault don't insist harder */
934
- return ret;
9351338
9361339 if (ret > 0) {
9371340 nr_pages -= ret;
....@@ -948,20 +1351,46 @@
9481351 pages_done = ret;
9491352 break;
9501353 }
951
- /* VM_FAULT_RETRY triggered, so seek to the faulting offset */
952
- pages += ret;
1354
+ /*
1355
+ * VM_FAULT_RETRY triggered, so seek to the faulting offset.
1356
+ * For the prefault case (!pages) we only update counts.
1357
+ */
1358
+ if (likely(pages))
1359
+ pages += ret;
9531360 start += ret << PAGE_SHIFT;
1361
+ lock_dropped = true;
9541362
1363
+retry:
9551364 /*
9561365 * Repeat on the address that fired VM_FAULT_RETRY
957
- * without FAULT_FLAG_ALLOW_RETRY but with
958
- * FAULT_FLAG_TRIED.
1366
+ * with both FAULT_FLAG_ALLOW_RETRY and
1367
+ * FAULT_FLAG_TRIED. Note that GUP can be interrupted
1368
+ * by fatal signals, so we need to check it before we
1369
+ * start trying again, otherwise it can loop forever.
9591370 */
1371
+
1372
+ if (fatal_signal_pending(current)) {
1373
+ if (!pages_done)
1374
+ pages_done = -EINTR;
1375
+ break;
1376
+ }
1377
+
1378
+ ret = mmap_read_lock_killable(mm);
1379
+ if (ret) {
1380
+ BUG_ON(ret > 0);
1381
+ if (!pages_done)
1382
+ pages_done = ret;
1383
+ break;
1384
+ }
1385
+
9601386 *locked = 1;
961
- lock_dropped = true;
962
- down_read(&mm->mmap_sem);
963
- ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
964
- pages, NULL, NULL);
1387
+ ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
1388
+ pages, NULL, locked);
1389
+ if (!*locked) {
1390
+ /* Continue to retry until we succeeded */
1391
+ BUG_ON(ret != 0);
1392
+ goto retry;
1393
+ }
9651394 if (ret != 1) {
9661395 BUG_ON(ret > 1);
9671396 if (!pages_done)
....@@ -972,7 +1401,8 @@
9721401 pages_done++;
9731402 if (!nr_pages)
9741403 break;
975
- pages++;
1404
+ if (likely(pages))
1405
+ pages++;
9761406 start += PAGE_SIZE;
9771407 }
9781408 if (lock_dropped && *locked) {
....@@ -980,243 +1410,34 @@
9801410 * We must let the caller know we temporarily dropped the lock
9811411 * and so the critical section protected by it was lost.
9821412 */
983
- up_read(&mm->mmap_sem);
1413
+ mmap_read_unlock(mm);
9841414 *locked = 0;
9851415 }
9861416 return pages_done;
9871417 }
988
-
989
-/*
990
- * We can leverage the VM_FAULT_RETRY functionality in the page fault
991
- * paths better by using either get_user_pages_locked() or
992
- * get_user_pages_unlocked().
993
- *
994
- * get_user_pages_locked() is suitable to replace the form:
995
- *
996
- * down_read(&mm->mmap_sem);
997
- * do_something()
998
- * get_user_pages(tsk, mm, ..., pages, NULL);
999
- * up_read(&mm->mmap_sem);
1000
- *
1001
- * to:
1002
- *
1003
- * int locked = 1;
1004
- * down_read(&mm->mmap_sem);
1005
- * do_something()
1006
- * get_user_pages_locked(tsk, mm, ..., pages, &locked);
1007
- * if (locked)
1008
- * up_read(&mm->mmap_sem);
1009
- */
1010
-long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
1011
- unsigned int gup_flags, struct page **pages,
1012
- int *locked)
1013
-{
1014
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
1015
- pages, NULL, locked,
1016
- gup_flags | FOLL_TOUCH);
1017
-}
1018
-EXPORT_SYMBOL(get_user_pages_locked);
1019
-
1020
-/*
1021
- * get_user_pages_unlocked() is suitable to replace the form:
1022
- *
1023
- * down_read(&mm->mmap_sem);
1024
- * get_user_pages(tsk, mm, ..., pages, NULL);
1025
- * up_read(&mm->mmap_sem);
1026
- *
1027
- * with:
1028
- *
1029
- * get_user_pages_unlocked(tsk, mm, ..., pages);
1030
- *
1031
- * It is functionally equivalent to get_user_pages_fast so
1032
- * get_user_pages_fast should be used instead if specific gup_flags
1033
- * (e.g. FOLL_FORCE) are not required.
1034
- */
1035
-long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
1036
- struct page **pages, unsigned int gup_flags)
1037
-{
1038
- struct mm_struct *mm = current->mm;
1039
- int locked = 1;
1040
- long ret;
1041
-
1042
- down_read(&mm->mmap_sem);
1043
- ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
1044
- &locked, gup_flags | FOLL_TOUCH);
1045
- if (locked)
1046
- up_read(&mm->mmap_sem);
1047
- return ret;
1048
-}
1049
-EXPORT_SYMBOL(get_user_pages_unlocked);
1050
-
1051
-/*
1052
- * get_user_pages_remote() - pin user pages in memory
1053
- * @tsk: the task_struct to use for page fault accounting, or
1054
- * NULL if faults are not to be recorded.
1055
- * @mm: mm_struct of target mm
1056
- * @start: starting user address
1057
- * @nr_pages: number of pages from start to pin
1058
- * @gup_flags: flags modifying lookup behaviour
1059
- * @pages: array that receives pointers to the pages pinned.
1060
- * Should be at least nr_pages long. Or NULL, if caller
1061
- * only intends to ensure the pages are faulted in.
1062
- * @vmas: array of pointers to vmas corresponding to each page.
1063
- * Or NULL if the caller does not require them.
1064
- * @locked: pointer to lock flag indicating whether lock is held and
1065
- * subsequently whether VM_FAULT_RETRY functionality can be
1066
- * utilised. Lock must initially be held.
1067
- *
1068
- * Returns number of pages pinned. This may be fewer than the number
1069
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
1070
- * were pinned, returns -errno. Each page returned must be released
1071
- * with a put_page() call when it is finished with. vmas will only
1072
- * remain valid while mmap_sem is held.
1073
- *
1074
- * Must be called with mmap_sem held for read or write.
1075
- *
1076
- * get_user_pages walks a process's page tables and takes a reference to
1077
- * each struct page that each user address corresponds to at a given
1078
- * instant. That is, it takes the page that would be accessed if a user
1079
- * thread accesses the given user virtual address at that instant.
1080
- *
1081
- * This does not guarantee that the page exists in the user mappings when
1082
- * get_user_pages returns, and there may even be a completely different
1083
- * page there in some cases (eg. if mmapped pagecache has been invalidated
1084
- * and subsequently re faulted). However it does guarantee that the page
1085
- * won't be freed completely. And mostly callers simply care that the page
1086
- * contains data that was valid *at some point in time*. Typically, an IO
1087
- * or similar operation cannot guarantee anything stronger anyway because
1088
- * locks can't be held over the syscall boundary.
1089
- *
1090
- * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
1091
- * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
1092
- * be called after the page is finished with, and before put_page is called.
1093
- *
1094
- * get_user_pages is typically used for fewer-copy IO operations, to get a
1095
- * handle on the memory by some means other than accesses via the user virtual
1096
- * addresses. The pages may be submitted for DMA to devices or accessed via
1097
- * their kernel linear mapping (via the kmap APIs). Care should be taken to
1098
- * use the correct cache flushing APIs.
1099
- *
1100
- * See also get_user_pages_fast, for performance critical applications.
1101
- *
1102
- * get_user_pages should be phased out in favor of
1103
- * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
1104
- * should use get_user_pages because it cannot pass
1105
- * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
1106
- */
1107
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
1108
- unsigned long start, unsigned long nr_pages,
1109
- unsigned int gup_flags, struct page **pages,
1110
- struct vm_area_struct **vmas, int *locked)
1111
-{
1112
- return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
1113
- locked,
1114
- gup_flags | FOLL_TOUCH | FOLL_REMOTE);
1115
-}
1116
-EXPORT_SYMBOL(get_user_pages_remote);
1117
-
1118
-/*
1119
- * This is the same as get_user_pages_remote(), just with a
1120
- * less-flexible calling convention where we assume that the task
1121
- * and mm being operated on are the current task's and don't allow
1122
- * passing of a locked parameter. We also obviously don't pass
1123
- * FOLL_REMOTE in here.
1124
- */
1125
-long get_user_pages(unsigned long start, unsigned long nr_pages,
1126
- unsigned int gup_flags, struct page **pages,
1127
- struct vm_area_struct **vmas)
1128
-{
1129
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
1130
- pages, vmas, NULL,
1131
- gup_flags | FOLL_TOUCH);
1132
-}
1133
-EXPORT_SYMBOL(get_user_pages);
1134
-
1135
-#ifdef CONFIG_FS_DAX
1136
-/*
1137
- * This is the same as get_user_pages() in that it assumes we are
1138
- * operating on the current task's mm, but it goes further to validate
1139
- * that the vmas associated with the address range are suitable for
1140
- * longterm elevated page reference counts. For example, filesystem-dax
1141
- * mappings are subject to the lifetime enforced by the filesystem and
1142
- * we need guarantees that longterm users like RDMA and V4L2 only
1143
- * establish mappings that have a kernel enforced revocation mechanism.
1144
- *
1145
- * "longterm" == userspace controlled elevated page count lifetime.
1146
- * Contrast this to iov_iter_get_pages() usages which are transient.
1147
- */
1148
-long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
1149
- unsigned int gup_flags, struct page **pages,
1150
- struct vm_area_struct **vmas_arg)
1151
-{
1152
- struct vm_area_struct **vmas = vmas_arg;
1153
- struct vm_area_struct *vma_prev = NULL;
1154
- long rc, i;
1155
-
1156
- if (!pages)
1157
- return -EINVAL;
1158
-
1159
- if (!vmas) {
1160
- vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
1161
- GFP_KERNEL);
1162
- if (!vmas)
1163
- return -ENOMEM;
1164
- }
1165
-
1166
- rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
1167
-
1168
- for (i = 0; i < rc; i++) {
1169
- struct vm_area_struct *vma = vmas[i];
1170
-
1171
- if (vma == vma_prev)
1172
- continue;
1173
-
1174
- vma_prev = vma;
1175
-
1176
- if (vma_is_fsdax(vma))
1177
- break;
1178
- }
1179
-
1180
- /*
1181
- * Either get_user_pages() failed, or the vma validation
1182
- * succeeded, in either case we don't need to put_page() before
1183
- * returning.
1184
- */
1185
- if (i >= rc)
1186
- goto out;
1187
-
1188
- for (i = 0; i < rc; i++)
1189
- put_page(pages[i]);
1190
- rc = -EOPNOTSUPP;
1191
-out:
1192
- if (vmas != vmas_arg)
1193
- kfree(vmas);
1194
- return rc;
1195
-}
1196
-EXPORT_SYMBOL(get_user_pages_longterm);
1197
-#endif /* CONFIG_FS_DAX */
11981418
11991419 /**
12001420 * populate_vma_page_range() - populate a range of pages in the vma.
12011421 * @vma: target vma
12021422 * @start: start address
12031423 * @end: end address
1204
- * @nonblocking:
1424
+ * @locked: whether the mmap_lock is still held
12051425 *
12061426 * This takes care of mlocking the pages too if VM_LOCKED is set.
12071427 *
1208
- * return 0 on success, negative error code on error.
1428
+ * Return either number of pages pinned in the vma, or a negative error
1429
+ * code on error.
12091430 *
1210
- * vma->vm_mm->mmap_sem must be held.
1431
+ * vma->vm_mm->mmap_lock must be held.
12111432 *
1212
- * If @nonblocking is NULL, it may be held for read or write and will
1433
+ * If @locked is NULL, it may be held for read or write and will
12131434 * be unperturbed.
12141435 *
1215
- * If @nonblocking is non-NULL, it must held for read only and may be
1216
- * released. If it's released, *@nonblocking will be set to 0.
1436
+ * If @locked is non-NULL, it must be held for read only and may be
1437
+ * released. If it's released, *@locked will be set to 0.
12171438 */
12181439 long populate_vma_page_range(struct vm_area_struct *vma,
1219
- unsigned long start, unsigned long end, int *nonblocking)
1440
+ unsigned long start, unsigned long end, int *locked)
12201441 {
12211442 struct mm_struct *mm = vma->vm_mm;
12221443 unsigned long nr_pages = (end - start) / PAGE_SIZE;
....@@ -1226,7 +1447,7 @@
12261447 VM_BUG_ON(end & ~PAGE_MASK);
12271448 VM_BUG_ON_VMA(start < vma->vm_start, vma);
12281449 VM_BUG_ON_VMA(end > vma->vm_end, vma);
1229
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
1450
+ mmap_assert_locked(mm);
12301451
12311452 gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
12321453 if (vma->vm_flags & VM_LOCKONFAULT)
....@@ -1243,15 +1464,15 @@
12431464 * We want mlock to succeed for regions that have any permissions
12441465 * other than PROT_NONE.
12451466 */
1246
- if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
1467
+ if (vma_is_accessible(vma))
12471468 gup_flags |= FOLL_FORCE;
12481469
12491470 /*
12501471 * We made sure addr is within a VMA, so the following will
12511472 * not result in a stack expansion that recurses back here.
12521473 */
1253
- return __get_user_pages(current, mm, start, nr_pages, gup_flags,
1254
- NULL, NULL, nonblocking);
1474
+ return __get_user_pages(mm, start, nr_pages, gup_flags,
1475
+ NULL, NULL, locked);
12551476 }
12561477
12571478 /*
....@@ -1259,7 +1480,7 @@
12591480 *
12601481 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
12611482 * flags. VMAs must be already marked with the desired vm_flags, and
1262
- * mmap_sem must not be held.
1483
+ * mmap_lock must not be held.
12631484 */
12641485 int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
12651486 {
....@@ -1278,7 +1499,7 @@
12781499 */
12791500 if (!locked) {
12801501 locked = 1;
1281
- down_read(&mm->mmap_sem);
1502
+ mmap_read_lock(mm);
12821503 vma = find_vma(mm, nstart);
12831504 } else if (nstart >= vma->vm_end)
12841505 vma = vma->vm_next;
....@@ -1310,9 +1531,53 @@
13101531 ret = 0;
13111532 }
13121533 if (locked)
1313
- up_read(&mm->mmap_sem);
1534
+ mmap_read_unlock(mm);
13141535 return ret; /* 0 or negative error code */
13151536 }
1537
+#else /* CONFIG_MMU */
1538
+static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
1539
+ unsigned long nr_pages, struct page **pages,
1540
+ struct vm_area_struct **vmas, int *locked,
1541
+ unsigned int foll_flags)
1542
+{
1543
+ struct vm_area_struct *vma;
1544
+ unsigned long vm_flags;
1545
+ int i;
1546
+
1547
+ /* calculate required read or write permissions.
1548
+ * If FOLL_FORCE is set, we only require the "MAY" flags.
1549
+ */
1550
+ vm_flags = (foll_flags & FOLL_WRITE) ?
1551
+ (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1552
+ vm_flags &= (foll_flags & FOLL_FORCE) ?
1553
+ (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1554
+
1555
+ for (i = 0; i < nr_pages; i++) {
1556
+ vma = find_vma(mm, start);
1557
+ if (!vma)
1558
+ goto finish_or_fault;
1559
+
1560
+ /* protect what we can, including chardevs */
1561
+ if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1562
+ !(vm_flags & vma->vm_flags))
1563
+ goto finish_or_fault;
1564
+
1565
+ if (pages) {
1566
+ pages[i] = virt_to_page(start);
1567
+ if (pages[i])
1568
+ get_page(pages[i]);
1569
+ }
1570
+ if (vmas)
1571
+ vmas[i] = vma;
1572
+ start = (start + PAGE_SIZE) & PAGE_MASK;
1573
+ }
1574
+
1575
+ return i;
1576
+
1577
+finish_or_fault:
1578
+ return i ? : -EFAULT;
1579
+}
1580
+#endif /* !CONFIG_MMU */
13161581
13171582 /**
13181583 * get_dump_page() - pin user page in memory while writing it to core dump
....@@ -1326,25 +1591,429 @@
13261591 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
13271592 * allowing a hole to be left in the corefile to save diskspace.
13281593 *
1329
- * Called without mmap_sem, but after all other threads have been killed.
1594
+ * Called without mmap_lock (takes and releases the mmap_lock by itself).
13301595 */
13311596 #ifdef CONFIG_ELF_CORE
13321597 struct page *get_dump_page(unsigned long addr)
13331598 {
1334
- struct vm_area_struct *vma;
1599
+ struct mm_struct *mm = current->mm;
13351600 struct page *page;
1601
+ int locked = 1;
1602
+ int ret;
13361603
1337
- if (__get_user_pages(current, current->mm, addr, 1,
1338
- FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1339
- NULL) < 1)
1604
+ if (mmap_read_lock_killable(mm))
13401605 return NULL;
1341
- flush_cache_page(vma, addr, page_to_pfn(page));
1342
- return page;
1606
+ ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
1607
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET);
1608
+ if (locked)
1609
+ mmap_read_unlock(mm);
1610
+ return (ret == 1) ? page : NULL;
13431611 }
13441612 #endif /* CONFIG_ELF_CORE */
13451613
1614
+#ifdef CONFIG_CMA
1615
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
1616
+ unsigned long start,
1617
+ unsigned long nr_pages,
1618
+ struct page **pages,
1619
+ struct vm_area_struct **vmas,
1620
+ unsigned int gup_flags)
1621
+{
1622
+ unsigned long i, isolation_error_count;
1623
+ bool drain_allow;
1624
+ LIST_HEAD(cma_page_list);
1625
+ long ret = nr_pages;
1626
+ struct page *prev_head, *head;
1627
+ struct migration_target_control mtc = {
1628
+ .nid = NUMA_NO_NODE,
1629
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
1630
+ };
1631
+
1632
+check_again:
1633
+ prev_head = NULL;
1634
+ isolation_error_count = 0;
1635
+ drain_allow = true;
1636
+ for (i = 0; i < nr_pages; i++) {
1637
+ head = compound_head(pages[i]);
1638
+ if (head == prev_head)
1639
+ continue;
1640
+ prev_head = head;
1641
+ /*
1642
+ * If we get a page from the CMA zone, since we are going to
1643
+ * be pinning these entries, we might as well move them out
1644
+ * of the CMA zone if possible.
1645
+ */
1646
+ if (is_migrate_cma_page(head)) {
1647
+ if (PageHuge(head)) {
1648
+ if (isolate_hugetlb(head, &cma_page_list))
1649
+ isolation_error_count++;
1650
+ } else {
1651
+ if (!PageLRU(head) && drain_allow) {
1652
+ lru_add_drain_all();
1653
+ drain_allow = false;
1654
+ }
1655
+
1656
+ if (isolate_lru_page(head)) {
1657
+ isolation_error_count++;
1658
+ continue;
1659
+ }
1660
+ list_add_tail(&head->lru, &cma_page_list);
1661
+ mod_node_page_state(page_pgdat(head),
1662
+ NR_ISOLATED_ANON +
1663
+ page_is_file_lru(head),
1664
+ thp_nr_pages(head));
1665
+ }
1666
+ }
1667
+ }
1668
+
1669
+ /*
1670
+	 * If the list is empty and there were no isolation errors, all pages are
1671
+ * in the correct zone.
1672
+ */
1673
+ if (list_empty(&cma_page_list) && !isolation_error_count)
1674
+ return ret;
1675
+
1676
+ if (!list_empty(&cma_page_list)) {
1677
+ /*
1678
+ * drop the above get_user_pages reference.
1679
+ */
1680
+ if (gup_flags & FOLL_PIN)
1681
+ unpin_user_pages(pages, nr_pages);
1682
+ else
1683
+ for (i = 0; i < nr_pages; i++)
1684
+ put_page(pages[i]);
1685
+
1686
+ ret = migrate_pages(&cma_page_list, alloc_migration_target,
1687
+ NULL, (unsigned long)&mtc, MIGRATE_SYNC,
1688
+ MR_CONTIG_RANGE);
1689
+ if (ret) {
1690
+ if (!list_empty(&cma_page_list))
1691
+ putback_movable_pages(&cma_page_list);
1692
+ return ret > 0 ? -ENOMEM : ret;
1693
+ }
1694
+
1695
+ /* We unpinned pages before migration, pin them again */
1696
+ ret = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
1697
+ NULL, gup_flags);
1698
+ if (ret <= 0)
1699
+ return ret;
1700
+ nr_pages = ret;
1701
+ }
1702
+
1703
+ /*
1704
+ * check again because pages were unpinned, and we also might have
1705
+ * had isolation errors and need more pages to migrate.
1706
+ */
1707
+ goto check_again;
1708
+}
1709
+#else
1710
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
1711
+ unsigned long start,
1712
+ unsigned long nr_pages,
1713
+ struct page **pages,
1714
+ struct vm_area_struct **vmas,
1715
+ unsigned int gup_flags)
1716
+{
1717
+ return nr_pages;
1718
+}
1719
+#endif /* CONFIG_CMA */
1720
+
13461721 /*
1347
- * Generic Fast GUP
1722
+ * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
1723
+ * allows us to process the FOLL_LONGTERM flag.
1724
+ */
1725
+static long __gup_longterm_locked(struct mm_struct *mm,
1726
+ unsigned long start,
1727
+ unsigned long nr_pages,
1728
+ struct page **pages,
1729
+ struct vm_area_struct **vmas,
1730
+ unsigned int gup_flags)
1731
+{
1732
+ unsigned long flags = 0;
1733
+ long rc;
1734
+
1735
+ if (gup_flags & FOLL_LONGTERM)
1736
+ flags = memalloc_nocma_save();
1737
+
1738
+ rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL,
1739
+ gup_flags);
1740
+
1741
+ if (gup_flags & FOLL_LONGTERM) {
1742
+ if (rc > 0)
1743
+ rc = check_and_migrate_cma_pages(mm, start, rc, pages,
1744
+ vmas, gup_flags);
1745
+ memalloc_nocma_restore(flags);
1746
+ }
1747
+ return rc;
1748
+}
1749
+
1750
+static bool is_valid_gup_flags(unsigned int gup_flags)
1751
+{
1752
+ /*
1753
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
1754
+ * never directly by the caller, so enforce that with an assertion:
1755
+ */
1756
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
1757
+ return false;
1758
+ /*
1759
+ * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
1760
+ * that is, FOLL_LONGTERM is a specific case, more restrictive case of
1761
+ * FOLL_PIN.
1762
+ */
1763
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1764
+ return false;
1765
+
1766
+ return true;
1767
+}
1768
+
1769
+#ifdef CONFIG_MMU
1770
+static long __get_user_pages_remote(struct mm_struct *mm,
1771
+ unsigned long start, unsigned long nr_pages,
1772
+ unsigned int gup_flags, struct page **pages,
1773
+ struct vm_area_struct **vmas, int *locked)
1774
+{
1775
+ /*
1776
+ * Parts of FOLL_LONGTERM behavior are incompatible with
1777
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1778
+ * vmas. However, this only comes up if locked is set, and there are
1779
+ * callers that do request FOLL_LONGTERM, but do not set locked. So,
1780
+ * allow what we can.
1781
+ */
1782
+ if (gup_flags & FOLL_LONGTERM) {
1783
+ if (WARN_ON_ONCE(locked))
1784
+ return -EINVAL;
1785
+ /*
1786
+ * This will check the vmas (even if our vmas arg is NULL)
1787
+ * and return -ENOTSUPP if DAX isn't allowed in this case:
1788
+ */
1789
+ return __gup_longterm_locked(mm, start, nr_pages, pages,
1790
+ vmas, gup_flags | FOLL_TOUCH |
1791
+ FOLL_REMOTE);
1792
+ }
1793
+
1794
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
1795
+ locked,
1796
+ gup_flags | FOLL_TOUCH | FOLL_REMOTE);
1797
+}
1798
+
1799
+/**
1800
+ * get_user_pages_remote() - pin user pages in memory
1801
+ * @mm: mm_struct of target mm
1802
+ * @start: starting user address
1803
+ * @nr_pages: number of pages from start to pin
1804
+ * @gup_flags: flags modifying lookup behaviour
1805
+ * @pages: array that receives pointers to the pages pinned.
1806
+ * Should be at least nr_pages long. Or NULL, if caller
1807
+ * only intends to ensure the pages are faulted in.
1808
+ * @vmas: array of pointers to vmas corresponding to each page.
1809
+ * Or NULL if the caller does not require them.
1810
+ * @locked: pointer to lock flag indicating whether lock is held and
1811
+ * subsequently whether VM_FAULT_RETRY functionality can be
1812
+ * utilised. Lock must initially be held.
1813
+ *
1814
+ * Returns either number of pages pinned (which may be less than the
1815
+ * number requested), or an error. Details about the return value:
1816
+ *
1817
+ * -- If nr_pages is 0, returns 0.
1818
+ * -- If nr_pages is >0, but no pages were pinned, returns -errno.
1819
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of
1820
+ * pages pinned. Again, this may be less than nr_pages.
1821
+ *
1822
+ * The caller is responsible for releasing returned @pages, via put_page().
1823
+ *
1824
+ * @vmas are valid only as long as mmap_lock is held.
1825
+ *
1826
+ * Must be called with mmap_lock held for read or write.
1827
+ *
1828
+ * get_user_pages_remote walks a process's page tables and takes a reference
1829
+ * to each struct page that each user address corresponds to at a given
1830
+ * instant. That is, it takes the page that would be accessed if a user
1831
+ * thread accesses the given user virtual address at that instant.
1832
+ *
1833
+ * This does not guarantee that the page exists in the user mappings when
1834
+ * get_user_pages_remote returns, and there may even be a completely different
1835
+ * page there in some cases (e.g. if mmapped pagecache has been invalidated
1836
+ * and subsequently re-faulted). However, it does guarantee that the page
1837
+ * won't be freed completely. Mostly, callers simply care that the page
1838
+ * contains data that was valid *at some point in time*. Typically, an IO
1839
+ * or similar operation cannot guarantee anything stronger anyway because
1840
+ * locks can't be held over the syscall boundary.
1841
+ *
1842
+ * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
1843
+ * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
1844
+ * be called after the page is finished with, and before put_page is called.
1845
+ *
1846
+ * get_user_pages_remote is typically used for fewer-copy IO operations,
1847
+ * to get a handle on the memory by some means other than accesses
1848
+ * via the user virtual addresses. The pages may be submitted for
1849
+ * DMA to devices or accessed via their kernel linear mapping (via the
1850
+ * kmap APIs). Care should be taken to use the correct cache flushing APIs.
1851
+ *
1852
+ * See also get_user_pages_fast, for performance critical applications.
1853
+ *
1854
+ * get_user_pages_remote should be phased out in favor of
1855
+ * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
1856
+ * should use get_user_pages_remote because it cannot pass
1857
+ * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
1858
+ */
1859
+long get_user_pages_remote(struct mm_struct *mm,
1860
+ unsigned long start, unsigned long nr_pages,
1861
+ unsigned int gup_flags, struct page **pages,
1862
+ struct vm_area_struct **vmas, int *locked)
1863
+{
1864
+ if (!is_valid_gup_flags(gup_flags))
1865
+ return -EINVAL;
1866
+
1867
+ return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
1868
+ pages, vmas, locked);
1869
+}
1870
+EXPORT_SYMBOL(get_user_pages_remote);
1871
+
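For context, a minimal sketch of the calling convention documented above (mmap_lock held, @locked handling, dirty + release on the FOLL_GET path). The example_touch_remote_page() helper is hypothetical; it assumes the caller already holds a reference on @mm, e.g. obtained via get_task_mm().

#include <linux/mm.h>
#include <linux/sched/mm.h>

/* Illustrative sketch only: pin and dirty one page of another process. */
static int example_touch_remote_page(struct mm_struct *mm, unsigned long addr)
{
	struct page *page;
	int locked = 1;
	long ret;

	mmap_read_lock(mm);
	ret = get_user_pages_remote(mm, addr, 1, FOLL_WRITE, &page, NULL,
				    &locked);
	if (locked)
		mmap_read_unlock(mm);	/* gup may already have dropped the lock */
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... write to the page via kmap() or DMA ... */

	set_page_dirty_lock(page);	/* required after writing the page */
	put_page(page);			/* release the FOLL_GET reference */
	return 0;
}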
1872
+#else /* CONFIG_MMU */
1873
+long get_user_pages_remote(struct mm_struct *mm,
1874
+ unsigned long start, unsigned long nr_pages,
1875
+ unsigned int gup_flags, struct page **pages,
1876
+ struct vm_area_struct **vmas, int *locked)
1877
+{
1878
+ return 0;
1879
+}
1880
+
1881
+static long __get_user_pages_remote(struct mm_struct *mm,
1882
+ unsigned long start, unsigned long nr_pages,
1883
+ unsigned int gup_flags, struct page **pages,
1884
+ struct vm_area_struct **vmas, int *locked)
1885
+{
1886
+ return 0;
1887
+}
1888
+#endif /* !CONFIG_MMU */
1889
+
1890
+/**
1891
+ * get_user_pages() - pin user pages in memory
1892
+ * @start: starting user address
1893
+ * @nr_pages: number of pages from start to pin
1894
+ * @gup_flags: flags modifying lookup behaviour
1895
+ * @pages: array that receives pointers to the pages pinned.
1896
+ * Should be at least nr_pages long. Or NULL, if caller
1897
+ * only intends to ensure the pages are faulted in.
1898
+ * @vmas: array of pointers to vmas corresponding to each page.
1899
+ * Or NULL if the caller does not require them.
1900
+ *
1901
+ * This is the same as get_user_pages_remote(), just with a less-flexible
1902
+ * calling convention where we assume that the mm being operated on belongs to
1903
+ * the current task, and doesn't allow passing of a locked parameter. We also
1904
+ * obviously don't pass FOLL_REMOTE in here.
1905
+ */
1906
+long get_user_pages(unsigned long start, unsigned long nr_pages,
1907
+ unsigned int gup_flags, struct page **pages,
1908
+ struct vm_area_struct **vmas)
1909
+{
1910
+ if (!is_valid_gup_flags(gup_flags))
1911
+ return -EINVAL;
1912
+
1913
+ return __gup_longterm_locked(current->mm, start, nr_pages,
1914
+ pages, vmas, gup_flags | FOLL_TOUCH);
1915
+}
1916
+EXPORT_SYMBOL(get_user_pages);
1917
+
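For context, a minimal sketch of pinning the pages behind a read-only user buffer with get_user_pages(); as with get_user_pages_remote(), the caller holds mmap_lock and may receive fewer pages than requested. The example_pin_user_buffer() helper is hypothetical.

#include <linux/mm.h>
#include <linux/slab.h>

/* Illustrative sketch only: pin the pages backing a user buffer for reading. */
static long example_pin_user_buffer(unsigned long uaddr, size_t len,
				    struct page ***pagesp)
{
	unsigned long first, last, nr_pages;
	struct page **pages;
	long pinned;

	if (!len)
		return -EINVAL;

	first = uaddr >> PAGE_SHIFT;
	last = (uaddr + len - 1) >> PAGE_SHIFT;
	nr_pages = last - first + 1;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	mmap_read_lock(current->mm);
	pinned = get_user_pages(uaddr & PAGE_MASK, nr_pages, 0, pages, NULL);
	mmap_read_unlock(current->mm);

	if (pinned <= 0) {
		kfree(pages);
		return pinned ? pinned : -EFAULT;
	}

	/* May be fewer than nr_pages; caller releases each with put_page(). */
	*pagesp = pages;
	return pinned;
}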
1918
+/**
1919
+ * get_user_pages_locked() is suitable to replace the form:
1920
+ *
1921
+ * mmap_read_lock(mm);
1922
+ * do_something()
1923
+ * get_user_pages(..., pages, NULL);
1924
+ * mmap_read_unlock(mm);
1925
+ *
1926
+ * to:
1927
+ *
1928
+ * int locked = 1;
1929
+ * mmap_read_lock(mm);
1930
+ * do_something()
1931
+ * get_user_pages_locked(..., pages, &locked);
1932
+ * if (locked)
1933
+ * mmap_read_unlock(mm);
1934
+ *
1935
+ * @start: starting user address
1936
+ * @nr_pages: number of pages from start to pin
1937
+ * @gup_flags: flags modifying lookup behaviour
1938
+ * @pages: array that receives pointers to the pages pinned.
1939
+ * Should be at least nr_pages long. Or NULL, if caller
1940
+ * only intends to ensure the pages are faulted in.
1941
+ * @locked: pointer to lock flag indicating whether lock is held and
1942
+ * subsequently whether VM_FAULT_RETRY functionality can be
1943
+ * utilised. Lock must initially be held.
1944
+ *
1945
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
1946
+ * paths better by using either get_user_pages_locked() or
1947
+ * get_user_pages_unlocked().
1948
+ *
1949
+ */
1950
+long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
1951
+ unsigned int gup_flags, struct page **pages,
1952
+ int *locked)
1953
+{
1954
+ /*
1955
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
1956
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1957
+ * vmas. As there are no users of this flag in this call we simply
1958
+ * disallow this option for now.
1959
+ */
1960
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1961
+ return -EINVAL;
1962
+ /*
1963
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
1964
+ * never directly by the caller, so enforce that:
1965
+ */
1966
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
1967
+ return -EINVAL;
1968
+
1969
+ return __get_user_pages_locked(current->mm, start, nr_pages,
1970
+ pages, NULL, locked,
1971
+ gup_flags | FOLL_TOUCH);
1972
+}
1973
+EXPORT_SYMBOL(get_user_pages_locked);
1974
+
1975
+/*
1976
+ * get_user_pages_unlocked() is suitable to replace the form:
1977
+ *
1978
+ * mmap_read_lock(mm);
1979
+ * get_user_pages(..., pages, NULL);
1980
+ * mmap_read_unlock(mm);
1981
+ *
1982
+ * with:
1983
+ *
1984
+ * get_user_pages_unlocked(..., pages);
1985
+ *
1986
+ * It is functionally equivalent to get_user_pages_fast so
1987
+ * get_user_pages_fast should be used instead if specific gup_flags
1988
+ * (e.g. FOLL_FORCE) are not required.
1989
+ */
1990
+long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
1991
+ struct page **pages, unsigned int gup_flags)
1992
+{
1993
+ struct mm_struct *mm = current->mm;
1994
+ int locked = 1;
1995
+ long ret;
1996
+
1997
+ /*
1998
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
1999
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
2000
+ * vmas. As there are no users of this flag in this call we simply
2001
+ * disallow this option for now.
2002
+ */
2003
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
2004
+ return -EINVAL;
2005
+
2006
+ mmap_read_lock(mm);
2007
+ ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
2008
+ &locked, gup_flags | FOLL_TOUCH);
2009
+ if (locked)
2010
+ mmap_read_unlock(mm);
2011
+ return ret;
2012
+}
2013
+EXPORT_SYMBOL(get_user_pages_unlocked);
2014
+
2015
+/*
2016
+ * Fast GUP
13482017 *
13492018 * get_user_pages_fast attempts to pin user pages by walking the page
13502019 * tables directly and avoids taking locks. Thus the walker needs to be
....@@ -1365,7 +2034,7 @@
13652034 * Before activating this code, please be aware that the following assumptions
13662035 * are currently made:
13672036 *
1368
- * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
2037
+ * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
13692038 * free pages containing page tables or TLB flushing requires IPI broadcast.
13702039 *
13712040 * *) ptes can be read atomically by the architecture.
....@@ -1376,47 +2045,101 @@
13762045 *
13772046 * This code is based heavily on the PowerPC implementation by Nick Piggin.
13782047 */
1379
-#ifdef CONFIG_HAVE_GENERIC_GUP
2048
+#ifdef CONFIG_HAVE_FAST_GUP
2049
+#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
13802050
1381
-#ifndef gup_get_pte
13822051 /*
1383
- * We assume that the PTE can be read atomically. If this is not the case for
1384
- * your architecture, please provide the helper.
2052
+ * WARNING: only to be used in the get_user_pages_fast() implementation.
2053
+ *
2054
+ * With get_user_pages_fast(), we walk down the pagetables without taking any
2055
+ * locks. For this we would like to load the pointers atomically, but sometimes
2056
+ * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What
2057
+ * we do have is the guarantee that a PTE will only either go from not present
2058
+ * to present, or present to not present or both -- it will not switch to a
2059
+ * completely different present page without a TLB flush in between; something
2060
+ * that we are blocking by holding interrupts off.
2061
+ *
2062
+ * Setting ptes from not present to present goes:
2063
+ *
2064
+ * ptep->pte_high = h;
2065
+ * smp_wmb();
2066
+ * ptep->pte_low = l;
2067
+ *
2068
+ * And present to not present goes:
2069
+ *
2070
+ * ptep->pte_low = 0;
2071
+ * smp_wmb();
2072
+ * ptep->pte_high = 0;
2073
+ *
2074
+ * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
2075
+ * We load pte_high *after* loading pte_low, which ensures we don't see an older
2076
+ * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't
2077
+ * picked up a changed pte high. We might have gotten rubbish values from
2078
+ * pte_low and pte_high, but we are guaranteed that pte_low will not have the
2079
+ * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
2080
+ * operates on present ptes, we're safe.
13852081 */
13862082 static inline pte_t gup_get_pte(pte_t *ptep)
13872083 {
1388
- return READ_ONCE(*ptep);
2084
+ pte_t pte;
2085
+
2086
+ do {
2087
+ pte.pte_low = ptep->pte_low;
2088
+ smp_rmb();
2089
+ pte.pte_high = ptep->pte_high;
2090
+ smp_rmb();
2091
+ } while (unlikely(pte.pte_low != ptep->pte_low));
2092
+
2093
+ return pte;
13892094 }
1390
-#endif
2095
+#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
2096
+/*
2097
+ * We require that the PTE can be read atomically.
2098
+ */
2099
+static inline pte_t gup_get_pte(pte_t *ptep)
2100
+{
2101
+ return ptep_get(ptep);
2102
+}
2103
+#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
13912104
13922105 static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
2106
+ unsigned int flags,
13932107 struct page **pages)
13942108 {
13952109 while ((*nr) - nr_start) {
13962110 struct page *page = pages[--(*nr)];
13972111
13982112 ClearPageReferenced(page);
1399
- put_page(page);
2113
+ if (flags & FOLL_PIN)
2114
+ unpin_user_page(page);
2115
+ else
2116
+ put_page(page);
14002117 }
14012118 }
14022119
1403
-/*
1404
- * Return the compund head page with ref appropriately incremented,
1405
- * or NULL if that failed.
1406
- */
1407
-static inline struct page *try_get_compound_head(struct page *page, int refs)
1408
-{
1409
- struct page *head = compound_head(page);
1410
- if (WARN_ON_ONCE(page_ref_count(head) < 0))
1411
- return NULL;
1412
- if (unlikely(!page_cache_add_speculative(head, refs)))
1413
- return NULL;
1414
- return head;
1415
-}
1416
-
14172120 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
1418
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1419
- int write, struct page **pages, int *nr)
2121
+/*
2122
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
2123
+ * operations.
2124
+ *
2125
+ * To pin the page, fast-gup needs to do the following, in order:
2126
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
2127
+ *
2128
+ * For the rest of pgtable operations where pgtable updates can be racy
2129
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
2130
+ * is pinned.
2131
+ *
2132
+ * The above works for all pte-level operations, including THP split.
2133
+ *
2134
+ * For THP collapse, it's a bit more complicated because fast-gup may be
2135
+ * walking a pgtable page that is being freed (pte is still valid but pmd
2136
+ * can be cleared already). To avoid racing in that case, we need to
2137
+ * also check pmd here to make sure pmd doesn't change (corresponds to
2138
+ * pmdp_collapse_flush() in the THP collapse code path).
2139
+ */
2140
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
2141
+ unsigned long end, unsigned int flags,
2142
+ struct page **pages, int *nr)
14202143 {
14212144 struct dev_pagemap *pgmap = NULL;
14222145 int nr_start = *nr, ret = 0;
....@@ -1434,13 +2157,16 @@
14342157 if (pte_protnone(pte))
14352158 goto pte_unmap;
14362159
1437
- if (!pte_access_permitted(pte, write))
2160
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
14382161 goto pte_unmap;
14392162
14402163 if (pte_devmap(pte)) {
2164
+ if (unlikely(flags & FOLL_LONGTERM))
2165
+ goto pte_unmap;
2166
+
14412167 pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
14422168 if (unlikely(!pgmap)) {
1443
- undo_dev_pagemap(nr, nr_start, pages);
2169
+ undo_dev_pagemap(nr, nr_start, flags, pages);
14442170 goto pte_unmap;
14452171 }
14462172 } else if (pte_special(pte))
....@@ -1449,17 +2175,31 @@
14492175 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
14502176 page = pte_page(pte);
14512177
1452
- head = try_get_compound_head(page, 1);
2178
+ head = try_grab_compound_head(page, 1, flags);
14532179 if (!head)
14542180 goto pte_unmap;
14552181
1456
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1457
- put_page(head);
2182
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
2183
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
2184
+ put_compound_head(head, 1, flags);
14582185 goto pte_unmap;
14592186 }
14602187
14612188 VM_BUG_ON_PAGE(compound_head(page) != head, page);
14622189
2190
+ /*
2191
+ * We need to make the page accessible if and only if we are
2192
+ * going to access its content (the FOLL_PIN case). Please
2193
+ * see Documentation/core-api/pin_user_pages.rst for
2194
+ * details.
2195
+ */
2196
+ if (flags & FOLL_PIN) {
2197
+ ret = arch_make_page_accessible(page);
2198
+ if (ret) {
2199
+ unpin_user_page(page);
2200
+ goto pte_unmap;
2201
+ }
2202
+ }
14632203 SetPageReferenced(page);
14642204 pages[*nr] = page;
14652205 (*nr)++;
....@@ -1482,19 +2222,21 @@
14822222 * to be special.
14832223 *
14842224 * For a futex to be placed on a THP tail page, get_futex_key requires a
1485
- * __get_user_pages_fast implementation that can pin pages. Thus it's still
2225
+ * get_user_pages_fast_only implementation that can pin pages. Thus it's still
14862226 * useful to have gup_huge_pmd even if we can't operate on ptes.
14872227 */
1488
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1489
- int write, struct page **pages, int *nr)
2228
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
2229
+ unsigned long end, unsigned int flags,
2230
+ struct page **pages, int *nr)
14902231 {
14912232 return 0;
14922233 }
14932234 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
14942235
1495
-#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
2236
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
14962237 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
1497
- unsigned long end, struct page **pages, int *nr)
2238
+ unsigned long end, unsigned int flags,
2239
+ struct page **pages, int *nr)
14982240 {
14992241 int nr_start = *nr;
15002242 struct dev_pagemap *pgmap = NULL;
....@@ -1504,12 +2246,15 @@
15042246
15052247 pgmap = get_dev_pagemap(pfn, pgmap);
15062248 if (unlikely(!pgmap)) {
1507
- undo_dev_pagemap(nr, nr_start, pages);
2249
+ undo_dev_pagemap(nr, nr_start, flags, pages);
15082250 return 0;
15092251 }
15102252 SetPageReferenced(page);
15112253 pages[*nr] = page;
1512
- get_page(page);
2254
+ if (unlikely(!try_grab_page(page, flags))) {
2255
+ undo_dev_pagemap(nr, nr_start, flags, pages);
2256
+ return 0;
2257
+ }
15132258 (*nr)++;
15142259 pfn++;
15152260 } while (addr += PAGE_SIZE, addr != end);
....@@ -1520,174 +2265,246 @@
15202265 }
15212266
15222267 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1523
- unsigned long end, struct page **pages, int *nr)
2268
+ unsigned long end, unsigned int flags,
2269
+ struct page **pages, int *nr)
15242270 {
15252271 unsigned long fault_pfn;
15262272 int nr_start = *nr;
15272273
15282274 fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1529
- if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
2275
+ if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
15302276 return 0;
15312277
15322278 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
1533
- undo_dev_pagemap(nr, nr_start, pages);
2279
+ undo_dev_pagemap(nr, nr_start, flags, pages);
15342280 return 0;
15352281 }
15362282 return 1;
15372283 }
15382284
15392285 static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1540
- unsigned long end, struct page **pages, int *nr)
2286
+ unsigned long end, unsigned int flags,
2287
+ struct page **pages, int *nr)
15412288 {
15422289 unsigned long fault_pfn;
15432290 int nr_start = *nr;
15442291
15452292 fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1546
- if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
2293
+ if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
15472294 return 0;
15482295
15492296 if (unlikely(pud_val(orig) != pud_val(*pudp))) {
1550
- undo_dev_pagemap(nr, nr_start, pages);
2297
+ undo_dev_pagemap(nr, nr_start, flags, pages);
15512298 return 0;
15522299 }
15532300 return 1;
15542301 }
15552302 #else
15562303 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1557
- unsigned long end, struct page **pages, int *nr)
2304
+ unsigned long end, unsigned int flags,
2305
+ struct page **pages, int *nr)
15582306 {
15592307 BUILD_BUG();
15602308 return 0;
15612309 }
15622310
15632311 static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
1564
- unsigned long end, struct page **pages, int *nr)
2312
+ unsigned long end, unsigned int flags,
2313
+ struct page **pages, int *nr)
15652314 {
15662315 BUILD_BUG();
15672316 return 0;
15682317 }
15692318 #endif
15702319
2320
+static int record_subpages(struct page *page, unsigned long addr,
2321
+ unsigned long end, struct page **pages)
2322
+{
2323
+ int nr;
2324
+
2325
+ for (nr = 0; addr != end; addr += PAGE_SIZE)
2326
+ pages[nr++] = page++;
2327
+
2328
+ return nr;
2329
+}
2330
+
2331
+#ifdef CONFIG_ARCH_HAS_HUGEPD
2332
+static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
2333
+ unsigned long sz)
2334
+{
2335
+ unsigned long __boundary = (addr + sz) & ~(sz-1);
2336
+ return (__boundary - 1 < end - 1) ? __boundary : end;
2337
+}
2338
+
2339
+static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
2340
+ unsigned long end, unsigned int flags,
2341
+ struct page **pages, int *nr)
2342
+{
2343
+ unsigned long pte_end;
2344
+ struct page *head, *page;
2345
+ pte_t pte;
2346
+ int refs;
2347
+
2348
+ pte_end = (addr + sz) & ~(sz-1);
2349
+ if (pte_end < end)
2350
+ end = pte_end;
2351
+
2352
+ pte = huge_ptep_get(ptep);
2353
+
2354
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
2355
+ return 0;
2356
+
2357
+ /* hugepages are never "special" */
2358
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2359
+
2360
+ head = pte_page(pte);
2361
+ page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
2362
+ refs = record_subpages(page, addr, end, pages + *nr);
2363
+
2364
+ head = try_grab_compound_head(head, refs, flags);
2365
+ if (!head)
2366
+ return 0;
2367
+
2368
+ if (unlikely(pte_val(pte) != pte_val(*ptep))) {
2369
+ put_compound_head(head, refs, flags);
2370
+ return 0;
2371
+ }
2372
+
2373
+ *nr += refs;
2374
+ SetPageReferenced(head);
2375
+ return 1;
2376
+}
2377
+
2378
+static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
2379
+ unsigned int pdshift, unsigned long end, unsigned int flags,
2380
+ struct page **pages, int *nr)
2381
+{
2382
+ pte_t *ptep;
2383
+ unsigned long sz = 1UL << hugepd_shift(hugepd);
2384
+ unsigned long next;
2385
+
2386
+ ptep = hugepte_offset(hugepd, addr, pdshift);
2387
+ do {
2388
+ next = hugepte_addr_end(addr, end, sz);
2389
+ if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
2390
+ return 0;
2391
+ } while (ptep++, addr = next, addr != end);
2392
+
2393
+ return 1;
2394
+}
2395
+#else
2396
+static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
2397
+ unsigned int pdshift, unsigned long end, unsigned int flags,
2398
+ struct page **pages, int *nr)
2399
+{
2400
+ return 0;
2401
+}
2402
+#endif /* CONFIG_ARCH_HAS_HUGEPD */
2403
+
15712404 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1572
- unsigned long end, int write, struct page **pages, int *nr)
2405
+ unsigned long end, unsigned int flags,
2406
+ struct page **pages, int *nr)
15732407 {
15742408 struct page *head, *page;
15752409 int refs;
15762410
1577
- if (!pmd_access_permitted(orig, write))
2411
+ if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
15782412 return 0;
15792413
1580
- if (pmd_devmap(orig))
1581
- return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
1582
-
1583
- refs = 0;
1584
- page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1585
- do {
1586
- pages[*nr] = page;
1587
- (*nr)++;
1588
- page++;
1589
- refs++;
1590
- } while (addr += PAGE_SIZE, addr != end);
1591
-
1592
- head = try_get_compound_head(pmd_page(orig), refs);
1593
- if (!head) {
1594
- *nr -= refs;
1595
- return 0;
2414
+ if (pmd_devmap(orig)) {
2415
+ if (unlikely(flags & FOLL_LONGTERM))
2416
+ return 0;
2417
+ return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
2418
+ pages, nr);
15962419 }
2420
+
2421
+ page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
2422
+ refs = record_subpages(page, addr, end, pages + *nr);
2423
+
2424
+ head = try_grab_compound_head(pmd_page(orig), refs, flags);
2425
+ if (!head)
2426
+ return 0;
15972427
15982428 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
1599
- *nr -= refs;
1600
- while (refs--)
1601
- put_page(head);
2429
+ put_compound_head(head, refs, flags);
16022430 return 0;
16032431 }
16042432
2433
+ *nr += refs;
16052434 SetPageReferenced(head);
16062435 return 1;
16072436 }
16082437
16092438 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1610
- unsigned long end, int write, struct page **pages, int *nr)
2439
+ unsigned long end, unsigned int flags,
2440
+ struct page **pages, int *nr)
16112441 {
16122442 struct page *head, *page;
16132443 int refs;
16142444
1615
- if (!pud_access_permitted(orig, write))
2445
+ if (!pud_access_permitted(orig, flags & FOLL_WRITE))
16162446 return 0;
16172447
1618
- if (pud_devmap(orig))
1619
- return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
1620
-
1621
- refs = 0;
1622
- page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1623
- do {
1624
- pages[*nr] = page;
1625
- (*nr)++;
1626
- page++;
1627
- refs++;
1628
- } while (addr += PAGE_SIZE, addr != end);
1629
-
1630
- head = try_get_compound_head(pud_page(orig), refs);
1631
- if (!head) {
1632
- *nr -= refs;
1633
- return 0;
2448
+ if (pud_devmap(orig)) {
2449
+ if (unlikely(flags & FOLL_LONGTERM))
2450
+ return 0;
2451
+ return __gup_device_huge_pud(orig, pudp, addr, end, flags,
2452
+ pages, nr);
16342453 }
2454
+
2455
+ page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
2456
+ refs = record_subpages(page, addr, end, pages + *nr);
2457
+
2458
+ head = try_grab_compound_head(pud_page(orig), refs, flags);
2459
+ if (!head)
2460
+ return 0;
16352461
16362462 if (unlikely(pud_val(orig) != pud_val(*pudp))) {
1637
- *nr -= refs;
1638
- while (refs--)
1639
- put_page(head);
2463
+ put_compound_head(head, refs, flags);
16402464 return 0;
16412465 }
16422466
2467
+ *nr += refs;
16432468 SetPageReferenced(head);
16442469 return 1;
16452470 }
16462471
16472472 static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1648
- unsigned long end, int write,
2473
+ unsigned long end, unsigned int flags,
16492474 struct page **pages, int *nr)
16502475 {
16512476 int refs;
16522477 struct page *head, *page;
16532478
1654
- if (!pgd_access_permitted(orig, write))
2479
+ if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
16552480 return 0;
16562481
16572482 BUILD_BUG_ON(pgd_devmap(orig));
1658
- refs = 0;
1659
- page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
1660
- do {
1661
- pages[*nr] = page;
1662
- (*nr)++;
1663
- page++;
1664
- refs++;
1665
- } while (addr += PAGE_SIZE, addr != end);
16662483
1667
- head = try_get_compound_head(pgd_page(orig), refs);
1668
- if (!head) {
1669
- *nr -= refs;
2484
+ page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
2485
+ refs = record_subpages(page, addr, end, pages + *nr);
2486
+
2487
+ head = try_grab_compound_head(pgd_page(orig), refs, flags);
2488
+ if (!head)
16702489 return 0;
1671
- }
16722490
16732491 if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
1674
- *nr -= refs;
1675
- while (refs--)
1676
- put_page(head);
2492
+ put_compound_head(head, refs, flags);
16772493 return 0;
16782494 }
16792495
2496
+ *nr += refs;
16802497 SetPageReferenced(head);
16812498 return 1;
16822499 }
16832500
1684
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
1685
- int write, struct page **pages, int *nr)
2501
+static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
2502
+ unsigned int flags, struct page **pages, int *nr)
16862503 {
16872504 unsigned long next;
16882505 pmd_t *pmdp;
16892506
1690
- pmdp = pmd_offset(&pud, addr);
2507
+ pmdp = pmd_offset_lockless(pudp, pud, addr);
16912508 do {
16922509 pmd_t pmd = READ_ONCE(*pmdp);
16932510
....@@ -1705,7 +2522,7 @@
17052522 if (pmd_protnone(pmd))
17062523 return 0;
17072524
1708
- if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
2525
+ if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
17092526 pages, nr))
17102527 return 0;
17112528
....@@ -1715,50 +2532,50 @@
17152532 * pmd format and THP pmd format
17162533 */
17172534 if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
1718
- PMD_SHIFT, next, write, pages, nr))
2535
+ PMD_SHIFT, next, flags, pages, nr))
17192536 return 0;
1720
- } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
2537
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
17212538 return 0;
17222539 } while (pmdp++, addr = next, addr != end);
17232540
17242541 return 1;
17252542 }
17262543
1727
-static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
1728
- int write, struct page **pages, int *nr)
2544
+static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
2545
+ unsigned int flags, struct page **pages, int *nr)
17292546 {
17302547 unsigned long next;
17312548 pud_t *pudp;
17322549
1733
- pudp = pud_offset(&p4d, addr);
2550
+ pudp = pud_offset_lockless(p4dp, p4d, addr);
17342551 do {
17352552 pud_t pud = READ_ONCE(*pudp);
17362553
17372554 next = pud_addr_end(addr, end);
1738
- if (pud_none(pud))
2555
+ if (unlikely(!pud_present(pud)))
17392556 return 0;
1740
- if (unlikely(pud_huge(pud))) {
1741
- if (!gup_huge_pud(pud, pudp, addr, next, write,
2557
+ if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
2558
+ if (!gup_huge_pud(pud, pudp, addr, next, flags,
17422559 pages, nr))
17432560 return 0;
17442561 } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
17452562 if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
1746
- PUD_SHIFT, next, write, pages, nr))
2563
+ PUD_SHIFT, next, flags, pages, nr))
17472564 return 0;
1748
- } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
2565
+ } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
17492566 return 0;
17502567 } while (pudp++, addr = next, addr != end);
17512568
17522569 return 1;
17532570 }
17542571
1755
-static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
1756
- int write, struct page **pages, int *nr)
2572
+static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
2573
+ unsigned int flags, struct page **pages, int *nr)
17572574 {
17582575 unsigned long next;
17592576 p4d_t *p4dp;
17602577
1761
- p4dp = p4d_offset(&pgd, addr);
2578
+ p4dp = p4d_offset_lockless(pgdp, pgd, addr);
17622579 do {
17632580 p4d_t p4d = READ_ONCE(*p4dp);
17642581
....@@ -1768,9 +2585,9 @@
17682585 BUILD_BUG_ON(p4d_huge(p4d));
17692586 if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
17702587 if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
1771
- P4D_SHIFT, next, write, pages, nr))
2588
+ P4D_SHIFT, next, flags, pages, nr))
17722589 return 0;
1773
- } else if (!gup_pud_range(p4d, addr, next, write, pages, nr))
2590
+ } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
17742591 return 0;
17752592 } while (p4dp++, addr = next, addr != end);
17762593
....@@ -1778,7 +2595,7 @@
17782595 }
17792596
17802597 static void gup_pgd_range(unsigned long addr, unsigned long end,
1781
- int write, struct page **pages, int *nr)
2598
+ unsigned int flags, struct page **pages, int *nr)
17822599 {
17832600 unsigned long next;
17842601 pgd_t *pgdp;
....@@ -1791,152 +2608,411 @@
17912608 if (pgd_none(pgd))
17922609 return;
17932610 if (unlikely(pgd_huge(pgd))) {
1794
- if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
2611
+ if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
17952612 pages, nr))
17962613 return;
17972614 } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
17982615 if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
1799
- PGDIR_SHIFT, next, write, pages, nr))
2616
+ PGDIR_SHIFT, next, flags, pages, nr))
18002617 return;
1801
- } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
2618
+ } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
18022619 return;
18032620 } while (pgdp++, addr = next, addr != end);
18042621 }
2622
+#else
2623
+static inline void gup_pgd_range(unsigned long addr, unsigned long end,
2624
+ unsigned int flags, struct page **pages, int *nr)
2625
+{
2626
+}
2627
+#endif /* CONFIG_HAVE_FAST_GUP */
18052628
18062629 #ifndef gup_fast_permitted
18072630 /*
1808
- * Check if it's allowed to use __get_user_pages_fast() for the range, or
2631
+ * Check if it's allowed to use get_user_pages_fast_only() for the range, or
18092632 * we need to fall back to the slow version:
18102633 */
1811
-bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
2634
+static bool gup_fast_permitted(unsigned long start, unsigned long end)
18122635 {
1813
- unsigned long len, end;
1814
-
1815
- len = (unsigned long) nr_pages << PAGE_SHIFT;
1816
- end = start + len;
1817
- return end >= start;
2636
+ return true;
18182637 }
18192638 #endif
18202639
1821
-/*
1822
- * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
1823
- * the regular GUP.
1824
- * Note a difference with get_user_pages_fast: this always returns the
1825
- * number of pages pinned, 0 if no pages were pinned.
1826
- *
1827
- * Careful, careful! COW breaking can go either way, so a non-write
1828
- * access can get ambiguous page results. If you call this function without
1829
- * 'write' set, you'd better be sure that you're ok with that ambiguity.
1830
- */
1831
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1832
- struct page **pages)
2640
+static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
2641
+ unsigned int gup_flags, struct page **pages)
18332642 {
1834
- unsigned long addr, len, end;
1835
- unsigned long flags;
1836
- int nr = 0;
1837
-
1838
- start &= PAGE_MASK;
1839
- addr = start;
1840
- len = (unsigned long) nr_pages << PAGE_SHIFT;
1841
- end = start + len;
1842
-
1843
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
1844
- (void __user *)start, len)))
1845
- return 0;
2643
+ int ret;
18462644
18472645 /*
1848
- * Disable interrupts. We use the nested form as we can already have
1849
- * interrupts disabled by get_futex_key.
1850
- *
1851
- * With interrupts disabled, we block page table pages from being
1852
- * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
1853
- * for more details.
1854
- *
1855
- * We do not adopt an rcu_read_lock(.) here as we also want to
1856
- * block IPIs that come from THPs splitting.
1857
- *
1858
- * NOTE! We allow read-only gup_fast() here, but you'd better be
1859
- * careful about possible COW pages. You'll get _a_ COW page, but
1860
- * not necessarily the one you intended to get depending on what
1861
- * COW event happens after this. COW may break the page copy in a
1862
- * random direction.
2646
+ * FIXME: FOLL_LONGTERM does not work with
2647
+ * get_user_pages_unlocked() (see comments in that function)
18632648 */
1864
-
1865
- if (gup_fast_permitted(start, nr_pages, write)) {
1866
- local_irq_save(flags);
1867
- gup_pgd_range(addr, end, write, pages, &nr);
1868
- local_irq_restore(flags);
1869
- }
1870
-
1871
- return nr;
1872
-}
1873
-
1874
-/**
1875
- * get_user_pages_fast() - pin user pages in memory
1876
- * @start: starting user address
1877
- * @nr_pages: number of pages from start to pin
1878
- * @write: whether pages will be written to
1879
- * @pages: array that receives pointers to the pages pinned.
1880
- * Should be at least nr_pages long.
1881
- *
1882
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
1883
- * If not successful, it will fall back to taking the lock and
1884
- * calling get_user_pages().
1885
- *
1886
- * Returns number of pages pinned. This may be fewer than the number
1887
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
1888
- * were pinned, returns -errno.
1889
- */
1890
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1891
- struct page **pages)
1892
-{
1893
- unsigned long addr, len, end;
1894
- int nr = 0, ret = 0;
1895
-
1896
- start &= PAGE_MASK;
1897
- addr = start;
1898
- len = (unsigned long) nr_pages << PAGE_SHIFT;
1899
- end = start + len;
1900
-
1901
- if (nr_pages <= 0)
1902
- return 0;
1903
-
1904
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
1905
- (void __user *)start, len)))
1906
- return -EFAULT;
1907
-
1908
- /*
1909
- * The FAST_GUP case requires FOLL_WRITE even for pure reads,
1910
- * because get_user_pages() may need to cause an early COW in
1911
- * order to avoid confusing the normal COW routines. So only
1912
- * targets that are already writable are safe to do by just
1913
- * looking at the page tables.
1914
- */
1915
- if (gup_fast_permitted(start, nr_pages, write)) {
1916
- local_irq_disable();
1917
- gup_pgd_range(addr, end, 1, pages, &nr);
1918
- local_irq_enable();
1919
- ret = nr;
1920
- }
1921
-
1922
- if (nr < nr_pages) {
1923
- /* Try to get the remaining pages with get_user_pages */
1924
- start += nr << PAGE_SHIFT;
1925
- pages += nr;
1926
-
1927
- ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
1928
- write ? FOLL_WRITE : 0);
1929
-
1930
- /* Have to be a bit careful with return values */
1931
- if (nr > 0) {
1932
- if (ret < 0)
1933
- ret = nr;
1934
- else
1935
- ret += nr;
1936
- }
2649
+ if (gup_flags & FOLL_LONGTERM) {
2650
+ mmap_read_lock(current->mm);
2651
+ ret = __gup_longterm_locked(current->mm,
2652
+ start, nr_pages,
2653
+ pages, NULL, gup_flags);
2654
+ mmap_read_unlock(current->mm);
2655
+ } else {
2656
+ ret = get_user_pages_unlocked(start, nr_pages,
2657
+ pages, gup_flags);
19372658 }
19382659
19392660 return ret;
19402661 }
19412662
1942
-#endif /* CONFIG_HAVE_GENERIC_GUP */
2663
+static unsigned long lockless_pages_from_mm(unsigned long start,
2664
+ unsigned long end,
2665
+ unsigned int gup_flags,
2666
+ struct page **pages)
2667
+{
2668
+ unsigned long flags;
2669
+ int nr_pinned = 0;
2670
+ unsigned seq;
2671
+
2672
+ if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
2673
+ !gup_fast_permitted(start, end))
2674
+ return 0;
2675
+
2676
+ if (gup_flags & FOLL_PIN) {
2677
+ seq = raw_read_seqcount(&current->mm->write_protect_seq);
2678
+ if (seq & 1)
2679
+ return 0;
2680
+ }
2681
+
2682
+ /*
2683
+ * Disable interrupts. The nested form is used, in order to allow full,
2684
+ * general purpose use of this routine.
2685
+ *
2686
+ * With interrupts disabled, we block page table pages from being freed
2687
+ * from under us. See struct mmu_table_batch comments in
2688
+ * include/asm-generic/tlb.h for more details.
2689
+ *
2690
+ * We do not adopt an rcu_read_lock() here as we also want to block IPIs
2691
+ * that come from THPs splitting.
2692
+ */
2693
+ local_irq_save(flags);
2694
+ gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
2695
+ local_irq_restore(flags);
2696
+
2697
+ /*
2698
+ * When pinning pages for DMA there could be a concurrent write protect
2699
+	 * from fork() via copy_page_range(); in that case, always fail fast GUP.
2700
+ */
2701
+ if (gup_flags & FOLL_PIN) {
2702
+ if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
2703
+ unpin_user_pages(pages, nr_pinned);
2704
+ return 0;
2705
+ }
2706
+ }
2707
+ return nr_pinned;
2708
+}
2709
+
2710
+static int internal_get_user_pages_fast(unsigned long start,
2711
+ unsigned long nr_pages,
2712
+ unsigned int gup_flags,
2713
+ struct page **pages)
2714
+{
2715
+ unsigned long len, end;
2716
+ unsigned long nr_pinned;
2717
+ int ret;
2718
+
2719
+ if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
2720
+ FOLL_FORCE | FOLL_PIN | FOLL_GET |
2721
+ FOLL_FAST_ONLY)))
2722
+ return -EINVAL;
2723
+
2724
+ if (gup_flags & FOLL_PIN)
2725
+ atomic_set(&current->mm->has_pinned, 1);
2726
+
2727
+ if (!(gup_flags & FOLL_FAST_ONLY))
2728
+ might_lock_read(&current->mm->mmap_lock);
2729
+
2730
+ start = untagged_addr(start) & PAGE_MASK;
2731
+ len = nr_pages << PAGE_SHIFT;
2732
+ if (check_add_overflow(start, len, &end))
2733
+ return 0;
2734
+ if (unlikely(!access_ok((void __user *)start, len)))
2735
+ return -EFAULT;
2736
+
2737
+ nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
2738
+ if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
2739
+ return nr_pinned;
2740
+
2741
+ /* Slow path: try to get the remaining pages with get_user_pages */
2742
+ start += nr_pinned << PAGE_SHIFT;
2743
+ pages += nr_pinned;
2744
+ ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
2745
+ pages);
2746
+ if (ret < 0) {
2747
+ /*
2748
+		 * The caller has to unpin the pages we already pinned, so
2749
+		 * returning -errno is not an option.
2750
+ */
2751
+ if (nr_pinned)
2752
+ return nr_pinned;
2753
+ return ret;
2754
+ }
2755
+ return ret + nr_pinned;
2756
+}
2757
+
2758
+/**
2759
+ * get_user_pages_fast_only() - pin user pages in memory
2760
+ * @start: starting user address
2761
+ * @nr_pages: number of pages from start to pin
2762
+ * @gup_flags: flags modifying pin behaviour
2763
+ * @pages: array that receives pointers to the pages pinned.
2764
+ * Should be at least nr_pages long.
2765
+ *
2766
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
2767
+ * the regular GUP.
2768
+ * Note a difference from get_user_pages_fast(): this always returns the
2769
+ * number of pages pinned, or 0 if no pages were pinned.
2770
+ *
2771
+ * If the architecture does not support this function, simply return with no
2772
+ * pages pinned.
2773
+ *
2774
+ * Careful, careful! COW breaking can go either way, so a non-write
2775
+ * access can get ambiguous page results. If you call this function without
2776
+ * 'write' set, you'd better be sure that you're ok with that ambiguity.
2777
+ */
2778
+int get_user_pages_fast_only(unsigned long start, int nr_pages,
2779
+ unsigned int gup_flags, struct page **pages)
2780
+{
2781
+ int nr_pinned;
2782
+ /*
2783
+ * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
2784
+ * because gup fast is always a "pin with a +1 page refcount" request.
2785
+ *
2786
+ * FOLL_FAST_ONLY is required in order to match the API description of
2787
+ * this routine: no fall back to regular ("slow") GUP.
2788
+ */
2789
+ gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
2790
+
2791
+ nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
2792
+ pages);
2793
+
2794
+ /*
2795
+ * As specified in the API description above, this routine is not
2796
+ * allowed to return negative values. However, the common core
2797
+ * routine internal_get_user_pages_fast() *can* return -errno.
2798
+ * Therefore, correct for that here:
2799
+ */
2800
+ if (nr_pinned < 0)
2801
+ nr_pinned = 0;
2802
+
2803
+ return nr_pinned;
2804
+}
2805
+EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
2806
+
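For context, a minimal sketch of the intended use: an opportunistic attempt that never takes mmap_lock and never returns an error. The example_try_fast_only() helper is hypothetical; a real caller would fall back to a sleepable GUP path when it returns NULL.

#include <linux/mm.h>

/* Illustrative sketch only: try to grab one page without ever sleeping. */
static struct page *example_try_fast_only(unsigned long uaddr, bool write)
{
	unsigned int flags = write ? FOLL_WRITE : 0;
	struct page *page;

	/* Returns the number of pages pinned (0 or 1 here), never -errno. */
	if (get_user_pages_fast_only(uaddr & PAGE_MASK, 1, flags, &page) != 1)
		return NULL;

	return page;	/* release with put_page() when done */
}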
2807
+/**
2808
+ * get_user_pages_fast() - pin user pages in memory
2809
+ * @start: starting user address
2810
+ * @nr_pages: number of pages from start to pin
2811
+ * @gup_flags: flags modifying pin behaviour
2812
+ * @pages: array that receives pointers to the pages pinned.
2813
+ * Should be at least nr_pages long.
2814
+ *
2815
+ * Attempt to pin user pages in memory without taking mm->mmap_lock.
2816
+ * If not successful, it will fall back to taking the lock and
2817
+ * calling get_user_pages().
2818
+ *
2819
+ * Returns number of pages pinned. This may be fewer than the number requested.
2820
+ * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
2821
+ * -errno.
2822
+ */
2823
+int get_user_pages_fast(unsigned long start, int nr_pages,
2824
+ unsigned int gup_flags, struct page **pages)
2825
+{
2826
+ if (!is_valid_gup_flags(gup_flags))
2827
+ return -EINVAL;
2828
+
2829
+ /*
2830
+ * The caller may or may not have explicitly set FOLL_GET; either way is
2831
+ * OK. However, internally (within mm/gup.c), gup fast variants must set
2832
+ * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
2833
+ * request.
2834
+ */
2835
+ gup_flags |= FOLL_GET;
2836
+ return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
2837
+}
2838
+EXPORT_SYMBOL_GPL(get_user_pages_fast);
2839
+
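For context, a minimal sketch of handling the "fewer pages than requested" return documented above by treating a short pin as failure. The example_pin_all_or_nothing() helper is hypothetical.

#include <linux/mm.h>

/* Illustrative sketch only: pin exactly @nr_pages or nothing at all. */
static int example_pin_all_or_nothing(unsigned long uaddr, int nr_pages,
				      struct page **pages)
{
	int pinned, i;

	pinned = get_user_pages_fast(uaddr & PAGE_MASK, nr_pages, FOLL_WRITE,
				     pages);
	if (pinned == nr_pages)
		return 0;

	/* Short pin or -errno: drop whatever was pinned and report failure. */
	for (i = 0; i < pinned; i++)
		put_page(pages[i]);
	return pinned < 0 ? pinned : -EFAULT;
}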
2840
+/**
2841
+ * pin_user_pages_fast() - pin user pages in memory without taking locks
2842
+ *
2843
+ * @start: starting user address
2844
+ * @nr_pages: number of pages from start to pin
2845
+ * @gup_flags: flags modifying pin behaviour
2846
+ * @pages: array that receives pointers to the pages pinned.
2847
+ * Should be at least nr_pages long.
2848
+ *
2849
+ * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
2850
+ * get_user_pages_fast() for documentation on the function arguments, because
2851
+ * the arguments here are identical.
2852
+ *
2853
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
2854
+ * see Documentation/core-api/pin_user_pages.rst for further details.
2855
+ */
2856
+int pin_user_pages_fast(unsigned long start, int nr_pages,
2857
+ unsigned int gup_flags, struct page **pages)
2858
+{
2859
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2860
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2861
+ return -EINVAL;
2862
+
2863
+ gup_flags |= FOLL_PIN;
2864
+ return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
2865
+}
2866
+EXPORT_SYMBOL_GPL(pin_user_pages_fast);
2867
+
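For context, a minimal sketch of the FOLL_PIN lifecycle for a long-lived DMA buffer: pages pinned this way are released with the unpin_user_page*() family, never put_page(). The example_pin_for_dma() helper is hypothetical, and FOLL_LONGTERM is assumed here because the device holds the pages across a long-running operation.

#include <linux/mm.h>

/* Illustrative sketch only: pin user pages for device DMA, then unpin. */
static long example_pin_for_dma(unsigned long uaddr, int nr_pages,
				struct page **pages)
{
	long pinned;

	pinned = pin_user_pages_fast(uaddr & PAGE_MASK, nr_pages,
				     FOLL_WRITE | FOLL_LONGTERM, pages);
	if (pinned < 0)
		return pinned;
	if (pinned != nr_pages) {
		unpin_user_pages(pages, pinned);
		return -EFAULT;
	}

	/* ... map @pages for DMA and let the device write into them ... */

	/* The device dirtied the pages, so mark them dirty while unpinning. */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	return 0;
}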
2868
+/*
2869
+ * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
2870
+ * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
2871
+ *
2872
+ * The API rules are the same, too: no negative values may be returned.
2873
+ */
2874
+int pin_user_pages_fast_only(unsigned long start, int nr_pages,
2875
+ unsigned int gup_flags, struct page **pages)
2876
+{
2877
+ int nr_pinned;
2878
+
2879
+ /*
2880
+ * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
2881
+ * rules require returning 0, rather than -errno:
2882
+ */
2883
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2884
+ return 0;
2885
+ /*
2886
+ * FOLL_FAST_ONLY is required in order to match the API description of
2887
+ * this routine: no fall back to regular ("slow") GUP.
2888
+ */
2889
+ gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
2890
+ nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
2891
+ pages);
2892
+ /*
2893
+ * This routine is not allowed to return negative values. However,
2894
+ * internal_get_user_pages_fast() *can* return -errno. Therefore,
2895
+ * correct for that here:
2896
+ */
2897
+ if (nr_pinned < 0)
2898
+ nr_pinned = 0;
2899
+
2900
+ return nr_pinned;
2901
+}
2902
+EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
2903
+
2904
+/**
2905
+ * pin_user_pages_remote() - pin pages of a remote process
2906
+ *
2907
+ * @mm: mm_struct of target mm
2908
+ * @start: starting user address
2909
+ * @nr_pages: number of pages from start to pin
2910
+ * @gup_flags: flags modifying lookup behaviour
2911
+ * @pages: array that receives pointers to the pages pinned.
2912
+ * Should be at least nr_pages long. Or NULL, if caller
2913
+ * only intends to ensure the pages are faulted in.
2914
+ * @vmas: array of pointers to vmas corresponding to each page.
2915
+ * Or NULL if the caller does not require them.
2916
+ * @locked: pointer to lock flag indicating whether lock is held and
2917
+ * subsequently whether VM_FAULT_RETRY functionality can be
2918
+ * utilised. Lock must initially be held.
2919
+ *
2920
+ * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
2921
+ * get_user_pages_remote() for documentation on the function arguments, because
2922
+ * the arguments here are identical.
2923
+ *
2924
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
2925
+ * see Documentation/core-api/pin_user_pages.rst for details.
2926
+ */
2927
+long pin_user_pages_remote(struct mm_struct *mm,
2928
+ unsigned long start, unsigned long nr_pages,
2929
+ unsigned int gup_flags, struct page **pages,
2930
+ struct vm_area_struct **vmas, int *locked)
2931
+{
2932
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2933
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2934
+ return -EINVAL;
2935
+
2936
+ gup_flags |= FOLL_PIN;
2937
+ return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
2938
+ pages, vmas, locked);
2939
+}
2940
+EXPORT_SYMBOL(pin_user_pages_remote);
2941
+
2942
+/**
2943
+ * pin_user_pages() - pin user pages in memory for use by other devices
2944
+ *
2945
+ * @start: starting user address
2946
+ * @nr_pages: number of pages from start to pin
2947
+ * @gup_flags: flags modifying lookup behaviour
2948
+ * @pages: array that receives pointers to the pages pinned.
2949
+ * Should be at least nr_pages long. Or NULL, if caller
2950
+ * only intends to ensure the pages are faulted in.
2951
+ * @vmas: array of pointers to vmas corresponding to each page.
2952
+ * Or NULL if the caller does not require them.
2953
+ *
2954
+ * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
2955
+ * FOLL_PIN is set.
2956
+ *
2957
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
2958
+ * see Documentation/core-api/pin_user_pages.rst for details.
2959
+ */
2960
+long pin_user_pages(unsigned long start, unsigned long nr_pages,
2961
+ unsigned int gup_flags, struct page **pages,
2962
+ struct vm_area_struct **vmas)
2963
+{
2964
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2965
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2966
+ return -EINVAL;
2967
+
2968
+ gup_flags |= FOLL_PIN;
2969
+ return __gup_longterm_locked(current->mm, start, nr_pages,
2970
+ pages, vmas, gup_flags);
2971
+}
2972
+EXPORT_SYMBOL(pin_user_pages);
2973
+
2974
+/*
2975
+ * pin_user_pages_unlocked() is the FOLL_PIN variant of
2976
+ * get_user_pages_unlocked(). Behavior is the same, except that this one sets
2977
+ * FOLL_PIN and rejects FOLL_GET.
2978
+ */
2979
+long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
2980
+ struct page **pages, unsigned int gup_flags)
2981
+{
2982
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2983
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2984
+ return -EINVAL;
2985
+
2986
+ gup_flags |= FOLL_PIN;
2987
+ return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
2988
+}
2989
+EXPORT_SYMBOL(pin_user_pages_unlocked);
2990
+
2991
+/*
2992
+ * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
2993
+ * Behavior is the same, except that this one sets FOLL_PIN and rejects
2994
+ * FOLL_GET.
2995
+ */
2996
+long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
2997
+ unsigned int gup_flags, struct page **pages,
2998
+ int *locked)
2999
+{
3000
+ /*
3001
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
3002
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
3003
+ * vmas. As there are no users of this flag in this call we simply
3004
+ * disallow this option for now.
3005
+ */
3006
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
3007
+ return -EINVAL;
3008
+
3009
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
3010
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
3011
+ return -EINVAL;
3012
+
3013
+ gup_flags |= FOLL_PIN;
3014
+ return __get_user_pages_locked(current->mm, start, nr_pages,
3015
+ pages, NULL, locked,
3016
+ gup_flags | FOLL_TOUCH);
3017
+}
3018
+EXPORT_SYMBOL(pin_user_pages_locked);