hc
2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/gup.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 #include <linux/kernel.h>
23 #include <linux/errno.h>
34 #include <linux/err.h>
....@@ -13,13 +14,363 @@
1314 #include <linux/sched/signal.h>
1415 #include <linux/rwsem.h>
1516 #include <linux/hugetlb.h>
17
+#include <linux/migrate.h>
18
+#include <linux/mm_inline.h>
19
+#include <linux/sched/mm.h>
20
+
21
+#include <linux/page_pinner.h>
1622
1723 #include <asm/mmu_context.h>
18
-#include <asm/pgtable.h>
1924 #include <asm/tlbflush.h>
2025
2126 #include "internal.h"
2227
28
+struct follow_page_context {
29
+ struct dev_pagemap *pgmap;
30
+ unsigned int page_mask;
31
+};
32
+
33
+static void hpage_pincount_add(struct page *page, int refs)
34
+{
35
+ VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
36
+ VM_BUG_ON_PAGE(page != compound_head(page), page);
37
+
38
+ atomic_add(refs, compound_pincount_ptr(page));
39
+}
40
+
41
+static void hpage_pincount_sub(struct page *page, int refs)
42
+{
43
+ VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
44
+ VM_BUG_ON_PAGE(page != compound_head(page), page);
45
+
46
+ atomic_sub(refs, compound_pincount_ptr(page));
47
+}
48
+
49
+/* Equivalent to calling put_page() @refs times. */
50
+static void put_page_refs(struct page *page, int refs)
51
+{
52
+#ifdef CONFIG_DEBUG_VM
53
+ if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page))
54
+ return;
55
+#endif
56
+
57
+ /*
58
+ * Calling put_page() for each ref is unnecessarily slow. Only the last
59
+ * ref needs a put_page().
60
+ */
61
+ if (refs > 1)
62
+ page_ref_sub(page, refs - 1);
63
+ put_page(page);
64
+}
65
+
66
+/*
67
+ * Return the compound head page with ref appropriately incremented,
68
+ * or NULL if that failed.
69
+ */
70
+static inline struct page *try_get_compound_head(struct page *page, int refs)
71
+{
72
+ struct page *head = compound_head(page);
73
+
74
+ if (WARN_ON_ONCE(page_ref_count(head) < 0))
75
+ return NULL;
76
+ if (unlikely(!page_cache_add_speculative(head, refs)))
77
+ return NULL;
78
+
79
+ /*
80
+ * At this point we have a stable reference to the head page; but it
81
+ * could be that between the compound_head() lookup and the refcount
82
+ * increment, the compound page was split, in which case we'd end up
83
+ * holding a reference on a page that has nothing to do with the page
84
+ * we were given anymore.
85
+ * So now that the head page is stable, recheck that the pages still
86
+ * belong together.
87
+ */
88
+ if (unlikely(compound_head(page) != head)) {
89
+ put_page_refs(head, refs);
90
+ return NULL;
91
+ }
92
+
93
+ return head;
94
+}
95
+
96
+/*
97
+ * try_grab_compound_head() - attempt to elevate a page's refcount, by a
98
+ * flags-dependent amount.
99
+ *
100
+ * "grab" names in this file mean, "look at flags to decide whether to use
101
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount".
102
+ *
103
+ * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
104
+ * same time. (That's true throughout the get_user_pages*() and
105
+ * pin_user_pages*() APIs.) Cases:
106
+ *
107
+ * FOLL_GET: page's refcount will be incremented by 1.
108
+ * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
109
+ *
110
+ * Return: head page (with refcount appropriately incremented) for success, or
111
+ * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
112
+ * considered failure, and furthermore, a likely bug in the caller, so a warning
113
+ * is also emitted.
114
+ */
115
+static __maybe_unused struct page *try_grab_compound_head(struct page *page,
116
+ int refs,
117
+ unsigned int flags)
118
+{
119
+ if (flags & FOLL_GET) {
120
+ struct page *head = try_get_compound_head(page, refs);
121
+ if (head)
122
+ set_page_pinner(head, compound_order(head));
123
+ return head;
124
+ } else if (flags & FOLL_PIN) {
125
+ int orig_refs = refs;
126
+
127
+ /*
128
+ * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
129
+ * path, so fail and let the caller fall back to the slow path.
130
+ */
131
+ if (unlikely(flags & FOLL_LONGTERM) &&
132
+ is_migrate_cma_page(page))
133
+ return NULL;
134
+
135
+ /*
136
+ * CAUTION: Don't use compound_head() on the page before this
137
+ * point, the result won't be stable.
138
+ */
139
+ page = try_get_compound_head(page, refs);
140
+ if (!page)
141
+ return NULL;
142
+
143
+ /*
144
+ * When pinning a compound page of order > 1 (which is what
145
+ * hpage_pincount_available() checks for), use an exact count to
146
+ * track it, via hpage_pincount_add/_sub().
147
+ *
148
+ * However, be sure to *also* increment the normal page refcount
149
+ * field at least once, so that the page really is pinned.
150
+ */
151
+ if (hpage_pincount_available(page))
152
+ hpage_pincount_add(page, refs);
153
+ else
154
+ page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1));
155
+
156
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
157
+ orig_refs);
158
+
159
+ return page;
160
+ }
161
+
162
+ WARN_ON_ONCE(1);
163
+ return NULL;
164
+}
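To make the accounting above concrete, here is a minimal sketch (not part of this patch; the helper name is invented) of the net page_ref_count() change a successful call produces, assuming GUP_PIN_COUNTING_BIAS is 1024, its value in include/linux/mm.h at the time of this change:

/* Illustrative only: refcount delta of try_grab_compound_head(page, refs, flags). */
static int demo_grab_refcount_delta(unsigned int flags, bool has_pincount, int refs)
{
	if (flags & FOLL_GET)
		return refs;			/* plain get: one ref each */
	if (flags & FOLL_PIN) {
		if (has_pincount)		/* compound page, order >= 2 */
			return refs;		/* plus compound_pincount += refs */
		return refs * 1024;		/* GUP_PIN_COUNTING_BIAS per ref */
	}
	return 0;				/* neither flag: WARN and fail */
}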
165
+
166
+static void put_compound_head(struct page *page, int refs, unsigned int flags)
167
+{
168
+ if (flags & FOLL_PIN) {
169
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
170
+ refs);
171
+
172
+ if (hpage_pincount_available(page))
173
+ hpage_pincount_sub(page, refs);
174
+ else
175
+ refs *= GUP_PIN_COUNTING_BIAS;
176
+ }
177
+
178
+ if (flags & FOLL_GET)
179
+ reset_page_pinner(page, compound_order(page));
180
+ put_page_refs(page, refs);
181
+}
182
+
183
+/**
184
+ * try_grab_page() - elevate a page's refcount by a flag-dependent amount
185
+ *
186
+ * This might not do anything at all, depending on the flags argument.
187
+ *
188
+ * "grab" names in this file mean, "look at flags to decide whether to use
189
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount".
190
+ *
191
+ * @page: pointer to page to be grabbed
192
+ * @flags: gup flags: these are the FOLL_* flag values.
193
+ *
194
+ * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
195
+ * time. Cases:
196
+ *
197
+ * FOLL_GET: page's refcount will be incremented by 1.
198
+ * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
199
+ *
200
+ * Return: true for success, or if no action was required (if neither FOLL_PIN
201
+ * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
202
+ * FOLL_PIN was set, but the page could not be grabbed.
203
+ */
204
+bool __must_check try_grab_page(struct page *page, unsigned int flags)
205
+{
206
+ WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
207
+
208
+ if (flags & FOLL_GET) {
209
+ bool ret = try_get_page(page);
210
+
211
+ if (ret) {
212
+ page = compound_head(page);
213
+ set_page_pinner(page, compound_order(page));
214
+ }
215
+ return ret;
216
+ } else if (flags & FOLL_PIN) {
217
+ int refs = 1;
218
+
219
+ page = compound_head(page);
220
+
221
+ if (WARN_ON_ONCE(page_ref_count(page) <= 0))
222
+ return false;
223
+
224
+ if (hpage_pincount_available(page))
225
+ hpage_pincount_add(page, 1);
226
+ else
227
+ refs = GUP_PIN_COUNTING_BIAS;
228
+
229
+ /*
230
+ * Similar to try_grab_compound_head(): even if using the
231
+ * hpage_pincount_add/_sub() routines, be sure to
232
+ * *also* increment the normal page refcount field at least
233
+ * once, so that the page really is pinned.
234
+ */
235
+ page_ref_add(page, refs);
236
+
237
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
238
+ }
239
+
240
+ return true;
241
+}
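A hedged sketch of the grab/release pairing implied by the documentation above; the helper is invented for illustration and is not part of this patch.

/* Illustrative only: try_grab_page() fails only when FOLL_GET or FOLL_PIN
 * was requested; release with the matching call for the flag used. */
static int demo_grab_then_release(struct page *page, unsigned int flags)
{
	if (!try_grab_page(page, flags))
		return -ENOMEM;

	/* ... use the page ... */

	if (flags & FOLL_PIN)
		unpin_user_page(page);
	else if (flags & FOLL_GET)
		put_page(page);	/* this tree also offers put_user_page(), below */
	return 0;
}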
242
+
243
+/**
244
+ * unpin_user_page() - release a dma-pinned page
245
+ * @page: pointer to page to be released
246
+ *
247
+ * Pages that were pinned via pin_user_pages*() must be released via either
248
+ * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
249
+ * that such pages can be separately tracked and uniquely handled. In
250
+ * particular, interactions with RDMA and filesystems need special handling.
251
+ */
252
+void unpin_user_page(struct page *page)
253
+{
254
+ put_compound_head(compound_head(page), 1, FOLL_PIN);
255
+}
256
+EXPORT_SYMBOL(unpin_user_page);
257
+
258
+/*
259
+ * put_user_page() - release a page obtained using get_user_pages() or
260
+ * follow_page(FOLL_GET)
261
+ * @page: pointer to page to be released
262
+ *
263
+ * Pages that were obtained via get_user_pages()/follow_page(FOLL_GET) must be
264
+ * released via put_user_page().
265
+ * Note: if it's not a page from GUP or follow_page(FOLL_GET), this is harmless.
266
+ */
267
+void put_user_page(struct page *page)
268
+{
269
+ struct page *head = compound_head(page);
270
+
271
+ reset_page_pinner(head, compound_order(head));
272
+ put_page(page);
273
+}
274
+EXPORT_SYMBOL(put_user_page);
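A minimal usage sketch for the FOLL_GET path described above, assuming the caller already holds the mmap lock for the VMA's mm; the function is invented for illustration and is not part of this patch.

/* Illustrative only: look up a page with FOLL_GET, then release it so the
 * page_pinner record is cleared along with the reference. */
static void demo_inspect_user_page(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		return;
	/* ... examine the page ... */
	put_user_page(page);
}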
275
+
276
+/**
277
+ * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
278
+ * @pages: array of pages to be maybe marked dirty, and definitely released.
279
+ * @npages: number of pages in the @pages array.
280
+ * @make_dirty: whether to mark the pages dirty
281
+ *
282
+ * "gup-pinned page" refers to a page that has had one of the get_user_pages()
283
+ * variants called on that page.
284
+ *
285
+ * For each page in the @pages array, make that page (or its head page, if a
286
+ * compound page) dirty, if @make_dirty is true, and if the page was previously
287
+ * listed as clean. In any case, releases all pages using unpin_user_page(),
288
+ * possibly via unpin_user_pages(), for the non-dirty case.
289
+ *
290
+ * Please see the unpin_user_page() documentation for details.
291
+ *
292
+ * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
293
+ * required, then the caller should a) verify that this is really correct,
294
+ * because _lock() is usually required, and b) hand code it:
295
+ * set_page_dirty(), unpin_user_page().
296
+ *
297
+ */
298
+void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
299
+ bool make_dirty)
300
+{
301
+ unsigned long index;
302
+
303
+ /*
304
+ * TODO: this can be optimized for huge pages: if a series of pages is
305
+ * physically contiguous and part of the same compound page, then a
306
+ * single operation to the head page should suffice.
307
+ */
308
+
309
+ if (!make_dirty) {
310
+ unpin_user_pages(pages, npages);
311
+ return;
312
+ }
313
+
314
+ for (index = 0; index < npages; index++) {
315
+ struct page *page = compound_head(pages[index]);
316
+ /*
317
+ * Checking PageDirty at this point may race with
318
+ * clear_page_dirty_for_io(), but that's OK. Two key
319
+ * cases:
320
+ *
321
+ * 1) This code sees the page as already dirty, so it
322
+ * skips the call to set_page_dirty(). That could happen
323
+ * because clear_page_dirty_for_io() called
324
+ * page_mkclean(), followed by set_page_dirty().
325
+ * However, now the page is going to get written back,
326
+ * which meets the original intention of setting it
327
+ * dirty, so all is well: clear_page_dirty_for_io() goes
328
+ * on to call TestClearPageDirty(), and write the page
329
+ * back.
330
+ *
331
+ * 2) This code sees the page as clean, so it calls
332
+ * set_page_dirty(). The page stays dirty, despite being
333
+ * written back, so it gets written back again in the
334
+ * next writeback cycle. This is harmless.
335
+ */
336
+ if (!PageDirty(page))
337
+ set_page_dirty_lock(page);
338
+ unpin_user_page(page);
339
+ }
340
+}
341
+EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
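A minimal sketch of the caller pattern this helper is meant for, e.g. a driver that lets a device DMA into user memory; pin_user_pages_fast() is assumed as the acquisition side and the function name is invented for illustration.

/* Illustrative only: pin, let the device write, then dirty + unpin in one call. */
static int demo_dma_from_device(unsigned long uaddr, int nr_pages,
				struct page **pages)
{
	int pinned;

	pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... program the device to write into @pages and wait for it ... */

	unpin_user_pages_dirty_lock(pages, pinned, true);
	return 0;
}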
342
+
343
+/**
344
+ * unpin_user_pages() - release an array of gup-pinned pages.
345
+ * @pages: array of pages to be marked dirty and released.
346
+ * @npages: number of pages in the @pages array.
347
+ *
348
+ * For each page in the @pages array, release the page using unpin_user_page().
349
+ *
350
+ * Please see the unpin_user_page() documentation for details.
351
+ */
352
+void unpin_user_pages(struct page **pages, unsigned long npages)
353
+{
354
+ unsigned long index;
355
+
356
+ /*
357
+ * If this WARN_ON() fires, then the system *might* be leaking pages (by
358
+ * leaving them pinned), but probably not. More likely, gup/pup returned
359
+ * a hard -ERRNO error to the caller, who erroneously passed it here.
360
+ */
361
+ if (WARN_ON(IS_ERR_VALUE(npages)))
362
+ return;
363
+ /*
364
+ * TODO: this can be optimized for huge pages: if a series of pages is
365
+ * physically contiguous and part of the same compound page, then a
366
+ * single operation to the head page should suffice.
367
+ */
368
+ for (index = 0; index < npages; index++)
369
+ unpin_user_page(pages[index]);
370
+}
371
+EXPORT_SYMBOL(unpin_user_pages);
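The caller bug that the IS_ERR_VALUE() check above watches for is passing a gup/pup error return straight back in as a count; a hedged sketch of the defensive pattern, with an invented helper name:

/* Illustrative only: only a positive value is a page count; negative is -errno. */
static void demo_safe_release(struct page **pages, long nr_or_errno)
{
	if (nr_or_errno > 0)
		unpin_user_pages(pages, nr_or_errno);
}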
372
+
373
+#ifdef CONFIG_MMU
23374 static struct page *no_page_table(struct vm_area_struct *vma,
24375 unsigned int flags)
25376 {
....@@ -31,7 +382,8 @@
31382 * But we can only make this optimization where a hole would surely
32383 * be zero-filled if handle_mm_fault() actually did handle it.
33384 */
34
- if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
385
+ if ((flags & FOLL_DUMP) &&
386
+ (vma_is_anonymous(vma) || !vma->vm_ops->fault))
35387 return ERR_PTR(-EFAULT);
36388 return NULL;
37389 }
....@@ -61,32 +413,40 @@
61413 }
62414
63415 /*
64
- * FOLL_FORCE or a forced COW break can write even to unwritable pte's,
65
- * but only after we've gone through a COW cycle and they are dirty.
416
+ * FOLL_FORCE can write to even unwritable pte's, but only
417
+ * after we've gone through a COW cycle and they are dirty.
66418 */
67419 static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
68420 {
69
- return pte_write(pte) || ((flags & FOLL_COW) && pte_dirty(pte));
70
-}
71
-
72
-/*
73
- * A (separate) COW fault might break the page the other way and
74
- * get_user_pages() would return the page from what is now the wrong
75
- * VM. So we need to force a COW break at GUP time even for reads.
76
- */
77
-static inline bool should_force_cow_break(struct vm_area_struct *vma, unsigned int flags)
78
-{
79
- return is_cow_mapping(vma->vm_flags) && (flags & FOLL_GET);
421
+ return pte_write(pte) ||
422
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
80423 }
81424
82425 static struct page *follow_page_pte(struct vm_area_struct *vma,
83
- unsigned long address, pmd_t *pmd, unsigned int flags)
426
+ unsigned long address, pmd_t *pmd, unsigned int flags,
427
+ struct dev_pagemap **pgmap)
84428 {
85429 struct mm_struct *mm = vma->vm_mm;
86
- struct dev_pagemap *pgmap = NULL;
87430 struct page *page;
88431 spinlock_t *ptl;
89432 pte_t *ptep, pte;
433
+ int ret;
434
+
435
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
436
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
437
+ (FOLL_PIN | FOLL_GET)))
438
+ return ERR_PTR(-EINVAL);
439
+
440
+ /*
441
+ * Considering PTE level hugetlb, like continuous-PTE hugetlb on
442
+ * ARM64 architecture.
443
+ */
444
+ if (is_vm_hugetlb_page(vma)) {
445
+ page = follow_huge_pmd_pte(vma, address, flags);
446
+ if (page)
447
+ return page;
448
+ return no_page_table(vma, flags);
449
+ }
90450
91451 retry:
92452 if (unlikely(pmd_bad(*pmd)))
....@@ -120,13 +480,14 @@
120480 }
121481
122482 page = vm_normal_page(vma, address, pte);
123
- if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
483
+ if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
124484 /*
125
- * Only return device mapping pages in the FOLL_GET case since
126
- * they are only valid while holding the pgmap reference.
485
+ * Only return device mapping pages in the FOLL_GET or FOLL_PIN
486
+ * case since they are only valid while holding the pgmap
487
+ * reference.
127488 */
128
- pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
129
- if (pgmap)
489
+ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
490
+ if (*pgmap)
130491 page = pte_page(pte);
131492 else
132493 goto no_page;
....@@ -140,8 +501,6 @@
140501 if (is_zero_pfn(pte_pfn(pte))) {
141502 page = pte_page(pte);
142503 } else {
143
- int ret;
144
-
145504 ret = follow_pfn_pte(vma, address, ptep, flags);
146505 page = ERR_PTR(ret);
147506 goto out;
....@@ -149,7 +508,6 @@
149508 }
150509
151510 if (flags & FOLL_SPLIT && PageTransCompound(page)) {
152
- int ret;
153511 get_page(page);
154512 pte_unmap_unlock(ptep, ptl);
155513 lock_page(page);
....@@ -161,16 +519,22 @@
161519 goto retry;
162520 }
163521
164
- if (flags & FOLL_GET) {
165
- if (unlikely(!try_get_page(page))) {
166
- page = ERR_PTR(-ENOMEM);
522
+ /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
523
+ if (unlikely(!try_grab_page(page, flags))) {
524
+ page = ERR_PTR(-ENOMEM);
525
+ goto out;
526
+ }
527
+ /*
528
+ * We need to make the page accessible if and only if we are going
529
+ * to access its content (the FOLL_PIN case). Please see
530
+ * Documentation/core-api/pin_user_pages.rst for details.
531
+ */
532
+ if (flags & FOLL_PIN) {
533
+ ret = arch_make_page_accessible(page);
534
+ if (ret) {
535
+ unpin_user_page(page);
536
+ page = ERR_PTR(ret);
167537 goto out;
168
- }
169
-
170
- /* drop the pgmap reference now that we hold the page */
171
- if (pgmap) {
172
- put_dev_pagemap(pgmap);
173
- pgmap = NULL;
174538 }
175539 }
176540 if (flags & FOLL_TOUCH) {
....@@ -222,7 +586,8 @@
222586
223587 static struct page *follow_pmd_mask(struct vm_area_struct *vma,
224588 unsigned long address, pud_t *pudp,
225
- unsigned int flags, unsigned int *page_mask)
589
+ unsigned int flags,
590
+ struct follow_page_context *ctx)
226591 {
227592 pmd_t *pmd, pmdval;
228593 spinlock_t *ptl;
....@@ -237,8 +602,8 @@
237602 pmdval = READ_ONCE(*pmd);
238603 if (pmd_none(pmdval))
239604 return no_page_table(vma, flags);
240
- if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
241
- page = follow_huge_pmd(mm, address, pmd, flags);
605
+ if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
606
+ page = follow_huge_pmd_pte(vma, address, flags);
242607 if (page)
243608 return page;
244609 return no_page_table(vma, flags);
....@@ -262,7 +627,7 @@
262627 pmdval = READ_ONCE(*pmd);
263628 /*
264629 * MADV_DONTNEED may convert the pmd to null because
265
- * mmap_sem is held in read mode
630
+ * mmap_lock is held in read mode
266631 */
267632 if (pmd_none(pmdval))
268633 return no_page_table(vma, flags);
....@@ -270,13 +635,13 @@
270635 }
271636 if (pmd_devmap(pmdval)) {
272637 ptl = pmd_lock(mm, pmd);
273
- page = follow_devmap_pmd(vma, address, pmd, flags);
638
+ page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
274639 spin_unlock(ptl);
275640 if (page)
276641 return page;
277642 }
278643 if (likely(!pmd_trans_huge(pmdval)))
279
- return follow_page_pte(vma, address, pmd, flags);
644
+ return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
280645
281646 if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
282647 return no_page_table(vma, flags);
....@@ -296,9 +661,9 @@
296661 }
297662 if (unlikely(!pmd_trans_huge(*pmd))) {
298663 spin_unlock(ptl);
299
- return follow_page_pte(vma, address, pmd, flags);
664
+ return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
300665 }
301
- if (flags & FOLL_SPLIT) {
666
+ if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
302667 int ret;
303668 page = pmd_page(*pmd);
304669 if (is_huge_zero_page(page)) {
....@@ -307,7 +672,7 @@
307672 split_huge_pmd(vma, pmd, address);
308673 if (pmd_trans_unstable(pmd))
309674 ret = -EBUSY;
310
- } else {
675
+ } else if (flags & FOLL_SPLIT) {
311676 if (unlikely(!try_get_page(page))) {
312677 spin_unlock(ptl);
313678 return ERR_PTR(-ENOMEM);
....@@ -319,21 +684,25 @@
319684 put_page(page);
320685 if (pmd_none(*pmd))
321686 return no_page_table(vma, flags);
687
+ } else { /* flags & FOLL_SPLIT_PMD */
688
+ spin_unlock(ptl);
689
+ split_huge_pmd(vma, pmd, address);
690
+ ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
322691 }
323692
324693 return ret ? ERR_PTR(ret) :
325
- follow_page_pte(vma, address, pmd, flags);
694
+ follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
326695 }
327696 page = follow_trans_huge_pmd(vma, address, pmd, flags);
328697 spin_unlock(ptl);
329
- *page_mask = HPAGE_PMD_NR - 1;
698
+ ctx->page_mask = HPAGE_PMD_NR - 1;
330699 return page;
331700 }
332701
333
-
334702 static struct page *follow_pud_mask(struct vm_area_struct *vma,
335703 unsigned long address, p4d_t *p4dp,
336
- unsigned int flags, unsigned int *page_mask)
704
+ unsigned int flags,
705
+ struct follow_page_context *ctx)
337706 {
338707 pud_t *pud;
339708 spinlock_t *ptl;
....@@ -343,7 +712,7 @@
343712 pud = pud_offset(p4dp, address);
344713 if (pud_none(*pud))
345714 return no_page_table(vma, flags);
346
- if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
715
+ if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
347716 page = follow_huge_pud(mm, address, pud, flags);
348717 if (page)
349718 return page;
....@@ -359,7 +728,7 @@
359728 }
360729 if (pud_devmap(*pud)) {
361730 ptl = pud_lock(mm, pud);
362
- page = follow_devmap_pud(vma, address, pud, flags);
731
+ page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
363732 spin_unlock(ptl);
364733 if (page)
365734 return page;
....@@ -367,13 +736,13 @@
367736 if (unlikely(pud_bad(*pud)))
368737 return no_page_table(vma, flags);
369738
370
- return follow_pmd_mask(vma, address, pud, flags, page_mask);
739
+ return follow_pmd_mask(vma, address, pud, flags, ctx);
371740 }
372
-
373741
374742 static struct page *follow_p4d_mask(struct vm_area_struct *vma,
375743 unsigned long address, pgd_t *pgdp,
376
- unsigned int flags, unsigned int *page_mask)
744
+ unsigned int flags,
745
+ struct follow_page_context *ctx)
377746 {
378747 p4d_t *p4d;
379748 struct page *page;
....@@ -393,7 +762,7 @@
393762 return page;
394763 return no_page_table(vma, flags);
395764 }
396
- return follow_pud_mask(vma, address, p4d, flags, page_mask);
765
+ return follow_pud_mask(vma, address, p4d, flags, ctx);
397766 }
398767
399768 /**
....@@ -401,28 +770,34 @@
401770 * @vma: vm_area_struct mapping @address
402771 * @address: virtual address to look up
403772 * @flags: flags modifying lookup behaviour
404
- * @page_mask: on output, *page_mask is set according to the size of the page
773
+ * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
774
+ * pointer to output page_mask
405775 *
406776 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
407777 *
408
- * Returns the mapped (struct page *), %NULL if no mapping exists, or
778
+ * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
779
+ * the device's dev_pagemap metadata to avoid repeating expensive lookups.
780
+ *
781
+ * On output, the @ctx->page_mask is set according to the size of the page.
782
+ *
783
+ * Return: the mapped (struct page *), %NULL if no mapping exists, or
409784 * an error pointer if there is a mapping to something not represented
410785 * by a page descriptor (see also vm_normal_page()).
411786 */
412
-struct page *follow_page_mask(struct vm_area_struct *vma,
787
+static struct page *follow_page_mask(struct vm_area_struct *vma,
413788 unsigned long address, unsigned int flags,
414
- unsigned int *page_mask)
789
+ struct follow_page_context *ctx)
415790 {
416791 pgd_t *pgd;
417792 struct page *page;
418793 struct mm_struct *mm = vma->vm_mm;
419794
420
- *page_mask = 0;
795
+ ctx->page_mask = 0;
421796
422797 /* make this handle hugepd */
423798 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
424799 if (!IS_ERR(page)) {
425
- BUG_ON(flags & FOLL_GET);
800
+ WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
426801 return page;
427802 }
428803
....@@ -446,7 +821,19 @@
446821 return no_page_table(vma, flags);
447822 }
448823
449
- return follow_p4d_mask(vma, address, pgd, flags, page_mask);
824
+ return follow_p4d_mask(vma, address, pgd, flags, ctx);
825
+}
826
+
827
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
828
+ unsigned int foll_flags)
829
+{
830
+ struct follow_page_context ctx = { NULL };
831
+ struct page *page;
832
+
833
+ page = follow_page_mask(vma, address, foll_flags, &ctx);
834
+ if (ctx.pgmap)
835
+ put_dev_pagemap(ctx.pgmap);
836
+ return page;
450837 }
451838
452839 static int get_gate_page(struct mm_struct *mm, unsigned long address,
....@@ -490,15 +877,8 @@
490877 if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
491878 goto unmap;
492879 *page = pte_page(*pte);
493
-
494
- /*
495
- * This should never happen (a device public page in the gate
496
- * area).
497
- */
498
- if (is_device_public_page(*page))
499
- goto unmap;
500880 }
501
- if (unlikely(!try_get_page(*page))) {
881
+ if (unlikely(!try_grab_page(*page, gup_flags))) {
502882 ret = -ENOMEM;
503883 goto unmap;
504884 }
....@@ -510,12 +890,12 @@
510890 }
511891
512892 /*
513
- * mmap_sem must be held on entry. If @nonblocking != NULL and
514
- * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
515
- * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
893
+ * mmap_lock must be held on entry. If @locked != NULL and *@flags
894
+ * does not include FOLL_NOWAIT, the mmap_lock may be released. If it
895
+ * is, *@locked will be set to 0 and -EBUSY returned.
516896 */
517
-static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
518
- unsigned long address, unsigned int *flags, int *nonblocking)
897
+static int faultin_page(struct vm_area_struct *vma,
898
+ unsigned long address, unsigned int *flags, int *locked)
519899 {
520900 unsigned int fault_flags = 0;
521901 vm_fault_t ret;
....@@ -527,16 +907,19 @@
527907 fault_flags |= FAULT_FLAG_WRITE;
528908 if (*flags & FOLL_REMOTE)
529909 fault_flags |= FAULT_FLAG_REMOTE;
530
- if (nonblocking)
531
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
910
+ if (locked)
911
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
532912 if (*flags & FOLL_NOWAIT)
533913 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
534914 if (*flags & FOLL_TRIED) {
535
- VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
915
+ /*
916
+ * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
917
+ * can co-exist
918
+ */
536919 fault_flags |= FAULT_FLAG_TRIED;
537920 }
538921
539
- ret = handle_mm_fault(vma, address, fault_flags);
922
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
540923 if (ret & VM_FAULT_ERROR) {
541924 int err = vm_fault_to_errno(ret, *flags);
542925
....@@ -545,16 +928,9 @@
545928 BUG();
546929 }
547930
548
- if (tsk) {
549
- if (ret & VM_FAULT_MAJOR)
550
- tsk->maj_flt++;
551
- else
552
- tsk->min_flt++;
553
- }
554
-
555931 if (ret & VM_FAULT_RETRY) {
556
- if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
557
- *nonblocking = 0;
932
+ if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
933
+ *locked = 0;
558934 return -EBUSY;
559935 }
560936
....@@ -583,6 +959,9 @@
583959
584960 if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
585961 return -EFAULT;
962
+
963
+ if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
964
+ return -EOPNOTSUPP;
586965
587966 if (write) {
588967 if (!(vm_flags & VM_WRITE)) {
....@@ -621,7 +1000,6 @@
6211000
6221001 /**
6231002 * __get_user_pages() - pin user pages in memory
624
- * @tsk: task_struct of target task
6251003 * @mm: mm_struct of target mm
6261004 * @start: starting user address
6271005 * @nr_pages: number of pages from start to pin
....@@ -631,15 +1009,22 @@
6311009 * only intends to ensure the pages are faulted in.
6321010 * @vmas: array of pointers to vmas corresponding to each page.
6331011 * Or NULL if the caller does not require them.
634
- * @nonblocking: whether waiting for disk IO or mmap_sem contention
1012
+ * @locked: whether we're still with the mmap_lock held
6351013 *
636
- * Returns number of pages pinned. This may be fewer than the number
637
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
638
- * were pinned, returns -errno. Each page returned must be released
639
- * with a put_page() call when it is finished with. vmas will only
640
- * remain valid while mmap_sem is held.
1014
+ * Returns either number of pages pinned (which may be less than the
1015
+ * number requested), or an error. Details about the return value:
6411016 *
642
- * Must be called with mmap_sem held. It may be released. See below.
1017
+ * -- If nr_pages is 0, returns 0.
1018
+ * -- If nr_pages is >0, but no pages were pinned, returns -errno.
1019
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of
1020
+ * pages pinned. Again, this may be less than nr_pages.
1021
+ * -- 0 return value is possible when the fault would need to be retried.
1022
+ *
1023
+ * The caller is responsible for releasing returned @pages, via put_page().
1024
+ *
1025
+ * @vmas are valid only as long as mmap_lock is held.
1026
+ *
1027
+ * Must be called with mmap_lock held. It may be released. See below.
6431028 *
6441029 * __get_user_pages walks a process's page tables and takes a reference to
6451030 * each struct page that each user address corresponds to at a given
....@@ -660,14 +1045,12 @@
6601045 * appropriate) must be called after the page is finished with, and
6611046 * before put_page is called.
6621047 *
663
- * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
664
- * or mmap_sem contention, and if waiting is needed to pin all pages,
665
- * *@nonblocking will be set to 0. Further, if @gup_flags does not
666
- * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
667
- * this case.
1048
+ * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
1049
+ * released by an up_read(). That can happen if @gup_flags does not
1050
+ * have FOLL_NOWAIT.
6681051 *
669
- * A caller using such a combination of @nonblocking and @gup_flags
670
- * must therefore hold the mmap_sem for reading only, and recognize
1052
+ * A caller using such a combination of @locked and @gup_flags
1053
+ * must therefore hold the mmap_lock for reading only, and recognize
6711054 * when it's been released. Otherwise, it must be held for either
6721055 * reading or writing and will not be released.
6731056 *
....@@ -675,21 +1058,21 @@
6751058 * instead of __get_user_pages. __get_user_pages should be used only if
6761059 * you need some special @gup_flags.
6771060 */
678
-static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1061
+static long __get_user_pages(struct mm_struct *mm,
6791062 unsigned long start, unsigned long nr_pages,
6801063 unsigned int gup_flags, struct page **pages,
681
- struct vm_area_struct **vmas, int *nonblocking)
1064
+ struct vm_area_struct **vmas, int *locked)
6821065 {
683
- long i = 0;
684
- unsigned int page_mask;
1066
+ long ret = 0, i = 0;
6851067 struct vm_area_struct *vma = NULL;
1068
+ struct follow_page_context ctx = { NULL };
6861069
6871070 if (!nr_pages)
6881071 return 0;
6891072
6901073 start = untagged_addr(start);
6911074
692
- VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1075
+ VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
6931076
6941077 /*
6951078 * If FOLL_FORCE is set then do not force a full fault as the hinting
....@@ -708,53 +1091,64 @@
7081091 if (!vma || start >= vma->vm_end) {
7091092 vma = find_extend_vma(mm, start);
7101093 if (!vma && in_gate_area(mm, start)) {
711
- int ret;
7121094 ret = get_gate_page(mm, start & PAGE_MASK,
7131095 gup_flags, &vma,
7141096 pages ? &pages[i] : NULL);
7151097 if (ret)
716
- return i ? : ret;
717
- page_mask = 0;
1098
+ goto out;
1099
+ ctx.page_mask = 0;
7181100 goto next_page;
7191101 }
7201102
721
- if (!vma || check_vma_flags(vma, gup_flags))
722
- return i ? : -EFAULT;
1103
+ if (!vma) {
1104
+ ret = -EFAULT;
1105
+ goto out;
1106
+ }
1107
+ ret = check_vma_flags(vma, gup_flags);
1108
+ if (ret)
1109
+ goto out;
1110
+
7231111 if (is_vm_hugetlb_page(vma)) {
724
- if (should_force_cow_break(vma, foll_flags))
725
- foll_flags |= FOLL_WRITE;
7261112 i = follow_hugetlb_page(mm, vma, pages, vmas,
7271113 &start, &nr_pages, i,
728
- foll_flags, nonblocking);
1114
+ gup_flags, locked);
1115
+ if (locked && *locked == 0) {
1116
+ /*
1117
+ * We've got a VM_FAULT_RETRY
1118
+ * and we've lost mmap_lock.
1119
+ * We must stop here.
1120
+ */
1121
+ BUG_ON(gup_flags & FOLL_NOWAIT);
1122
+ BUG_ON(ret != 0);
1123
+ goto out;
1124
+ }
7291125 continue;
7301126 }
7311127 }
732
-
733
- if (should_force_cow_break(vma, foll_flags))
734
- foll_flags |= FOLL_WRITE;
735
-
7361128 retry:
7371129 /*
7381130 * If we have a pending SIGKILL, don't keep faulting pages and
7391131 * potentially allocating memory.
7401132 */
741
- if (unlikely(fatal_signal_pending(current)))
742
- return i ? i : -ERESTARTSYS;
1133
+ if (fatal_signal_pending(current)) {
1134
+ ret = -EINTR;
1135
+ goto out;
1136
+ }
7431137 cond_resched();
744
- page = follow_page_mask(vma, start, foll_flags, &page_mask);
1138
+
1139
+ page = follow_page_mask(vma, start, foll_flags, &ctx);
7451140 if (!page) {
746
- int ret;
747
- ret = faultin_page(tsk, vma, start, &foll_flags,
748
- nonblocking);
1141
+ ret = faultin_page(vma, start, &foll_flags, locked);
7491142 switch (ret) {
7501143 case 0:
7511144 goto retry;
1145
+ case -EBUSY:
1146
+ ret = 0;
1147
+ fallthrough;
7521148 case -EFAULT:
7531149 case -ENOMEM:
7541150 case -EHWPOISON:
755
- return i ? i : ret;
756
- case -EBUSY:
757
- return i;
1151
+ goto out;
7581152 case -ENOENT:
7591153 goto next_page;
7601154 }
....@@ -766,27 +1160,31 @@
7661160 */
7671161 goto next_page;
7681162 } else if (IS_ERR(page)) {
769
- return i ? i : PTR_ERR(page);
1163
+ ret = PTR_ERR(page);
1164
+ goto out;
7701165 }
7711166 if (pages) {
7721167 pages[i] = page;
7731168 flush_anon_page(vma, page, start);
7741169 flush_dcache_page(page);
775
- page_mask = 0;
1170
+ ctx.page_mask = 0;
7761171 }
7771172 next_page:
7781173 if (vmas) {
7791174 vmas[i] = vma;
780
- page_mask = 0;
1175
+ ctx.page_mask = 0;
7811176 }
782
- page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
1177
+ page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
7831178 if (page_increm > nr_pages)
7841179 page_increm = nr_pages;
7851180 i += page_increm;
7861181 start += page_increm * PAGE_SIZE;
7871182 nr_pages -= page_increm;
7881183 } while (nr_pages);
789
- return i;
1184
+out:
1185
+ if (ctx.pgmap)
1186
+ put_dev_pagemap(ctx.pgmap);
1187
+ return i ? i : ret;
7901188 }
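The release obligation stated in the comment block above ("the caller is responsible for releasing returned @pages, via put_page()"), as a caller of the public get_user_pages() wrappers would discharge it; a minimal sketch, not part of this patch.

/* Illustrative only: release whatever was actually pinned. */
static void demo_release_gup_pages(struct page **pages, long nr_pinned)
{
	long i;

	/* nr_pinned may be less than requested; a negative value is an -errno. */
	for (i = 0; i < nr_pinned; i++)
		put_page(pages[i]);
}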
7911189
7921190 static bool vma_permits_fault(struct vm_area_struct *vma,
....@@ -812,15 +1210,14 @@
8121210 return true;
8131211 }
8141212
815
-/*
1213
+/**
8161214 * fixup_user_fault() - manually resolve a user page fault
817
- * @tsk: the task_struct to use for page fault accounting, or
818
- * NULL if faults are not to be recorded.
8191215 * @mm: mm_struct of target mm
8201216 * @address: user address
8211217 * @fault_flags:flags to pass down to handle_mm_fault()
822
- * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller
823
- * does not allow retry
1218
+ * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller
1219
+ * does not allow retry. If NULL, the caller must guarantee
1220
+ * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
8241221 *
8251222 * This is meant to be called in the specific scenario where for locking reasons
8261223 * we try to access user memory in atomic context (within a pagefault_disable()
....@@ -839,10 +1236,10 @@
8391236 * such architectures, gup() will not be enough to make a subsequent access
8401237 * succeed.
8411238 *
842
- * This function will not return with an unlocked mmap_sem. So it has not the
843
- * same semantics wrt the @mm->mmap_sem as does filemap_fault().
1239
+ * This function will not return with an unlocked mmap_lock. So it has not the
1240
+ * same semantics wrt the @mm->mmap_lock as does filemap_fault().
8441241 */
845
-int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1242
+int fixup_user_fault(struct mm_struct *mm,
8461243 unsigned long address, unsigned int fault_flags,
8471244 bool *unlocked)
8481245 {
....@@ -852,7 +1249,7 @@
8521249 address = untagged_addr(address);
8531250
8541251 if (unlocked)
855
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1252
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
8561253
8571254 retry:
8581255 vma = find_extend_vma(mm, address);
....@@ -862,7 +1259,11 @@
8621259 if (!vma_permits_fault(vma, fault_flags))
8631260 return -EFAULT;
8641261
865
- ret = handle_mm_fault(vma, address, fault_flags);
1262
+ if ((fault_flags & FAULT_FLAG_KILLABLE) &&
1263
+ fatal_signal_pending(current))
1264
+ return -EINTR;
1265
+
1266
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
8661267 major |= ret & VM_FAULT_MAJOR;
8671268 if (ret & VM_FAULT_ERROR) {
8681269 int err = vm_fault_to_errno(ret, 0);
....@@ -873,27 +1274,21 @@
8731274 }
8741275
8751276 if (ret & VM_FAULT_RETRY) {
876
- down_read(&mm->mmap_sem);
877
- if (!(fault_flags & FAULT_FLAG_TRIED)) {
878
- *unlocked = true;
879
- fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
880
- fault_flags |= FAULT_FLAG_TRIED;
881
- goto retry;
882
- }
1277
+ mmap_read_lock(mm);
1278
+ *unlocked = true;
1279
+ fault_flags |= FAULT_FLAG_TRIED;
1280
+ goto retry;
8831281 }
8841282
885
- if (tsk) {
886
- if (major)
887
- tsk->maj_flt++;
888
- else
889
- tsk->min_flt++;
890
- }
8911283 return 0;
8921284 }
8931285 EXPORT_SYMBOL_GPL(fixup_user_fault);
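A minimal sketch of how callers such as the futex code use this helper: an atomic user access failed, so the fault is resolved under mmap_lock and the access is then retried by the caller. The function name is invented for illustration.

/* Illustrative only: fault in a user address for writing, then let the
 * caller retry its pagefault_disable()'d access. */
static int demo_fault_in_user_writeable(unsigned long uaddr)
{
	struct mm_struct *mm = current->mm;
	bool unlocked = false;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, uaddr, FAULT_FLAG_WRITE, &unlocked);
	mmap_read_unlock(mm);

	return ret;	/* 0 on success */
}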
8941286
895
-static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
896
- struct mm_struct *mm,
1287
+/*
1288
+ * Please note that this function, unlike __get_user_pages will not
1289
+ * return 0 for nr_pages > 0 without FOLL_NOWAIT
1290
+ */
1291
+static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
8971292 unsigned long start,
8981293 unsigned long nr_pages,
8991294 struct page **pages,
....@@ -911,13 +1306,25 @@
9111306 BUG_ON(*locked != 1);
9121307 }
9131308
914
- if (pages)
1309
+ if (flags & FOLL_PIN)
1310
+ atomic_set(&mm->has_pinned, 1);
1311
+
1312
+ /*
1313
+ * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
1314
+ * is to set FOLL_GET if the caller wants pages[] filled in (but has
1315
+ * carelessly failed to specify FOLL_GET), so keep doing that, but only
1316
+ * for FOLL_GET, not for the newer FOLL_PIN.
1317
+ *
1318
+ * FOLL_PIN always expects pages to be non-null, but no need to assert
1319
+ * that here, as any failures will be obvious enough.
1320
+ */
1321
+ if (pages && !(flags & FOLL_PIN))
9151322 flags |= FOLL_GET;
9161323
9171324 pages_done = 0;
9181325 lock_dropped = false;
9191326 for (;;) {
920
- ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
1327
+ ret = __get_user_pages(mm, start, nr_pages, flags, pages,
9211328 vmas, locked);
9221329 if (!locked)
9231330 /* VM_FAULT_RETRY couldn't trigger, bypass */
....@@ -928,10 +1335,6 @@
9281335 BUG_ON(ret < 0);
9291336 BUG_ON(ret >= nr_pages);
9301337 }
931
-
932
- if (!pages)
933
- /* If it's a prefault don't insist harder */
934
- return ret;
9351338
9361339 if (ret > 0) {
9371340 nr_pages -= ret;
....@@ -948,20 +1351,46 @@
9481351 pages_done = ret;
9491352 break;
9501353 }
951
- /* VM_FAULT_RETRY triggered, so seek to the faulting offset */
952
- pages += ret;
1354
+ /*
1355
+ * VM_FAULT_RETRY triggered, so seek to the faulting offset.
1356
+ * For the prefault case (!pages) we only update counts.
1357
+ */
1358
+ if (likely(pages))
1359
+ pages += ret;
9531360 start += ret << PAGE_SHIFT;
1361
+ lock_dropped = true;
9541362
1363
+retry:
9551364 /*
9561365 * Repeat on the address that fired VM_FAULT_RETRY
957
- * without FAULT_FLAG_ALLOW_RETRY but with
958
- * FAULT_FLAG_TRIED.
1366
+ * with both FAULT_FLAG_ALLOW_RETRY and
1367
+ * FAULT_FLAG_TRIED. Note that GUP can be interrupted
1368
+ * by fatal signals, so we need to check it before we
1369
+ * start trying again, otherwise it can loop forever.
9591370 */
1371
+
1372
+ if (fatal_signal_pending(current)) {
1373
+ if (!pages_done)
1374
+ pages_done = -EINTR;
1375
+ break;
1376
+ }
1377
+
1378
+ ret = mmap_read_lock_killable(mm);
1379
+ if (ret) {
1380
+ BUG_ON(ret > 0);
1381
+ if (!pages_done)
1382
+ pages_done = ret;
1383
+ break;
1384
+ }
1385
+
9601386 *locked = 1;
961
- lock_dropped = true;
962
- down_read(&mm->mmap_sem);
963
- ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
964
- pages, NULL, NULL);
1387
+ ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
1388
+ pages, NULL, locked);
1389
+ if (!*locked) {
1390
+ /* Continue to retry until we succeeded */
1391
+ BUG_ON(ret != 0);
1392
+ goto retry;
1393
+ }
9651394 if (ret != 1) {
9661395 BUG_ON(ret > 1);
9671396 if (!pages_done)
....@@ -972,7 +1401,8 @@
9721401 pages_done++;
9731402 if (!nr_pages)
9741403 break;
975
- pages++;
1404
+ if (likely(pages))
1405
+ pages++;
9761406 start += PAGE_SIZE;
9771407 }
9781408 if (lock_dropped && *locked) {
....@@ -980,243 +1410,34 @@
9801410 * We must let the caller know we temporarily dropped the lock
9811411 * and so the critical section protected by it was lost.
9821412 */
983
- up_read(&mm->mmap_sem);
1413
+ mmap_read_unlock(mm);
9841414 *locked = 0;
9851415 }
9861416 return pages_done;
9871417 }
988
-
989
-/*
990
- * We can leverage the VM_FAULT_RETRY functionality in the page fault
991
- * paths better by using either get_user_pages_locked() or
992
- * get_user_pages_unlocked().
993
- *
994
- * get_user_pages_locked() is suitable to replace the form:
995
- *
996
- * down_read(&mm->mmap_sem);
997
- * do_something()
998
- * get_user_pages(tsk, mm, ..., pages, NULL);
999
- * up_read(&mm->mmap_sem);
1000
- *
1001
- * to:
1002
- *
1003
- * int locked = 1;
1004
- * down_read(&mm->mmap_sem);
1005
- * do_something()
1006
- * get_user_pages_locked(tsk, mm, ..., pages, &locked);
1007
- * if (locked)
1008
- * up_read(&mm->mmap_sem);
1009
- */
1010
-long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
1011
- unsigned int gup_flags, struct page **pages,
1012
- int *locked)
1013
-{
1014
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
1015
- pages, NULL, locked,
1016
- gup_flags | FOLL_TOUCH);
1017
-}
1018
-EXPORT_SYMBOL(get_user_pages_locked);
1019
-
1020
-/*
1021
- * get_user_pages_unlocked() is suitable to replace the form:
1022
- *
1023
- * down_read(&mm->mmap_sem);
1024
- * get_user_pages(tsk, mm, ..., pages, NULL);
1025
- * up_read(&mm->mmap_sem);
1026
- *
1027
- * with:
1028
- *
1029
- * get_user_pages_unlocked(tsk, mm, ..., pages);
1030
- *
1031
- * It is functionally equivalent to get_user_pages_fast so
1032
- * get_user_pages_fast should be used instead if specific gup_flags
1033
- * (e.g. FOLL_FORCE) are not required.
1034
- */
1035
-long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
1036
- struct page **pages, unsigned int gup_flags)
1037
-{
1038
- struct mm_struct *mm = current->mm;
1039
- int locked = 1;
1040
- long ret;
1041
-
1042
- down_read(&mm->mmap_sem);
1043
- ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
1044
- &locked, gup_flags | FOLL_TOUCH);
1045
- if (locked)
1046
- up_read(&mm->mmap_sem);
1047
- return ret;
1048
-}
1049
-EXPORT_SYMBOL(get_user_pages_unlocked);
1050
-
1051
-/*
1052
- * get_user_pages_remote() - pin user pages in memory
1053
- * @tsk: the task_struct to use for page fault accounting, or
1054
- * NULL if faults are not to be recorded.
1055
- * @mm: mm_struct of target mm
1056
- * @start: starting user address
1057
- * @nr_pages: number of pages from start to pin
1058
- * @gup_flags: flags modifying lookup behaviour
1059
- * @pages: array that receives pointers to the pages pinned.
1060
- * Should be at least nr_pages long. Or NULL, if caller
1061
- * only intends to ensure the pages are faulted in.
1062
- * @vmas: array of pointers to vmas corresponding to each page.
1063
- * Or NULL if the caller does not require them.
1064
- * @locked: pointer to lock flag indicating whether lock is held and
1065
- * subsequently whether VM_FAULT_RETRY functionality can be
1066
- * utilised. Lock must initially be held.
1067
- *
1068
- * Returns number of pages pinned. This may be fewer than the number
1069
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
1070
- * were pinned, returns -errno. Each page returned must be released
1071
- * with a put_page() call when it is finished with. vmas will only
1072
- * remain valid while mmap_sem is held.
1073
- *
1074
- * Must be called with mmap_sem held for read or write.
1075
- *
1076
- * get_user_pages walks a process's page tables and takes a reference to
1077
- * each struct page that each user address corresponds to at a given
1078
- * instant. That is, it takes the page that would be accessed if a user
1079
- * thread accesses the given user virtual address at that instant.
1080
- *
1081
- * This does not guarantee that the page exists in the user mappings when
1082
- * get_user_pages returns, and there may even be a completely different
1083
- * page there in some cases (eg. if mmapped pagecache has been invalidated
1084
- * and subsequently re faulted). However it does guarantee that the page
1085
- * won't be freed completely. And mostly callers simply care that the page
1086
- * contains data that was valid *at some point in time*. Typically, an IO
1087
- * or similar operation cannot guarantee anything stronger anyway because
1088
- * locks can't be held over the syscall boundary.
1089
- *
1090
- * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
1091
- * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
1092
- * be called after the page is finished with, and before put_page is called.
1093
- *
1094
- * get_user_pages is typically used for fewer-copy IO operations, to get a
1095
- * handle on the memory by some means other than accesses via the user virtual
1096
- * addresses. The pages may be submitted for DMA to devices or accessed via
1097
- * their kernel linear mapping (via the kmap APIs). Care should be taken to
1098
- * use the correct cache flushing APIs.
1099
- *
1100
- * See also get_user_pages_fast, for performance critical applications.
1101
- *
1102
- * get_user_pages should be phased out in favor of
1103
- * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
1104
- * should use get_user_pages because it cannot pass
1105
- * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
1106
- */
1107
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
1108
- unsigned long start, unsigned long nr_pages,
1109
- unsigned int gup_flags, struct page **pages,
1110
- struct vm_area_struct **vmas, int *locked)
1111
-{
1112
- return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
1113
- locked,
1114
- gup_flags | FOLL_TOUCH | FOLL_REMOTE);
1115
-}
1116
-EXPORT_SYMBOL(get_user_pages_remote);
1117
-
1118
-/*
1119
- * This is the same as get_user_pages_remote(), just with a
1120
- * less-flexible calling convention where we assume that the task
1121
- * and mm being operated on are the current task's and don't allow
1122
- * passing of a locked parameter. We also obviously don't pass
1123
- * FOLL_REMOTE in here.
1124
- */
1125
-long get_user_pages(unsigned long start, unsigned long nr_pages,
1126
- unsigned int gup_flags, struct page **pages,
1127
- struct vm_area_struct **vmas)
1128
-{
1129
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
1130
- pages, vmas, NULL,
1131
- gup_flags | FOLL_TOUCH);
1132
-}
1133
-EXPORT_SYMBOL(get_user_pages);
1134
-
1135
-#ifdef CONFIG_FS_DAX
1136
-/*
1137
- * This is the same as get_user_pages() in that it assumes we are
1138
- * operating on the current task's mm, but it goes further to validate
1139
- * that the vmas associated with the address range are suitable for
1140
- * longterm elevated page reference counts. For example, filesystem-dax
1141
- * mappings are subject to the lifetime enforced by the filesystem and
1142
- * we need guarantees that longterm users like RDMA and V4L2 only
1143
- * establish mappings that have a kernel enforced revocation mechanism.
1144
- *
1145
- * "longterm" == userspace controlled elevated page count lifetime.
1146
- * Contrast this to iov_iter_get_pages() usages which are transient.
1147
- */
1148
-long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
1149
- unsigned int gup_flags, struct page **pages,
1150
- struct vm_area_struct **vmas_arg)
1151
-{
1152
- struct vm_area_struct **vmas = vmas_arg;
1153
- struct vm_area_struct *vma_prev = NULL;
1154
- long rc, i;
1155
-
1156
- if (!pages)
1157
- return -EINVAL;
1158
-
1159
- if (!vmas) {
1160
- vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
1161
- GFP_KERNEL);
1162
- if (!vmas)
1163
- return -ENOMEM;
1164
- }
1165
-
1166
- rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
1167
-
1168
- for (i = 0; i < rc; i++) {
1169
- struct vm_area_struct *vma = vmas[i];
1170
-
1171
- if (vma == vma_prev)
1172
- continue;
1173
-
1174
- vma_prev = vma;
1175
-
1176
- if (vma_is_fsdax(vma))
1177
- break;
1178
- }
1179
-
1180
- /*
1181
- * Either get_user_pages() failed, or the vma validation
1182
- * succeeded, in either case we don't need to put_page() before
1183
- * returning.
1184
- */
1185
- if (i >= rc)
1186
- goto out;
1187
-
1188
- for (i = 0; i < rc; i++)
1189
- put_page(pages[i]);
1190
- rc = -EOPNOTSUPP;
1191
-out:
1192
- if (vmas != vmas_arg)
1193
- kfree(vmas);
1194
- return rc;
1195
-}
1196
-EXPORT_SYMBOL(get_user_pages_longterm);
1197
-#endif /* CONFIG_FS_DAX */
11981418
11991419 /**
12001420 * populate_vma_page_range() - populate a range of pages in the vma.
12011421 * @vma: target vma
12021422 * @start: start address
12031423 * @end: end address
1204
- * @nonblocking:
1424
+ * @locked: whether the mmap_lock is still held
12051425 *
12061426 * This takes care of mlocking the pages too if VM_LOCKED is set.
12071427 *
1208
- * return 0 on success, negative error code on error.
1428
+ * Return either number of pages pinned in the vma, or a negative error
1429
+ * code on error.
12091430 *
1210
- * vma->vm_mm->mmap_sem must be held.
1431
+ * vma->vm_mm->mmap_lock must be held.
12111432 *
1212
- * If @nonblocking is NULL, it may be held for read or write and will
1433
+ * If @locked is NULL, it may be held for read or write and will
12131434 * be unperturbed.
12141435 *
1215
- * If @nonblocking is non-NULL, it must held for read only and may be
1216
- * released. If it's released, *@nonblocking will be set to 0.
1436
+ * If @locked is non-NULL, it must be held for read only and may be
1437
+ * released. If it's released, *@locked will be set to 0.
12171438 */
12181439 long populate_vma_page_range(struct vm_area_struct *vma,
1219
- unsigned long start, unsigned long end, int *nonblocking)
1440
+ unsigned long start, unsigned long end, int *locked)
12201441 {
12211442 struct mm_struct *mm = vma->vm_mm;
12221443 unsigned long nr_pages = (end - start) / PAGE_SIZE;
....@@ -1226,7 +1447,7 @@
12261447 VM_BUG_ON(end & ~PAGE_MASK);
12271448 VM_BUG_ON_VMA(start < vma->vm_start, vma);
12281449 VM_BUG_ON_VMA(end > vma->vm_end, vma);
1229
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
1450
+ mmap_assert_locked(mm);
12301451
12311452 gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
12321453 if (vma->vm_flags & VM_LOCKONFAULT)
....@@ -1243,15 +1464,15 @@
12431464 * We want mlock to succeed for regions that have any permissions
12441465 * other than PROT_NONE.
12451466 */
1246
- if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
1467
+ if (vma_is_accessible(vma))
12471468 gup_flags |= FOLL_FORCE;
12481469
12491470 /*
12501471 * We made sure addr is within a VMA, so the following will
12511472 * not result in a stack expansion that recurses back here.
12521473 */
1253
- return __get_user_pages(current, mm, start, nr_pages, gup_flags,
1254
- NULL, NULL, nonblocking);
1474
+ return __get_user_pages(mm, start, nr_pages, gup_flags,
1475
+ NULL, NULL, locked);
12551476 }
12561477
12571478 /*
....@@ -1259,7 +1480,7 @@
12591480 *
12601481 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
12611482 * flags. VMAs must be already marked with the desired vm_flags, and
1262
- * mmap_sem must not be held.
1483
+ * mmap_lock must not be held.
12631484 */
12641485 int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
12651486 {
....@@ -1278,7 +1499,7 @@
12781499 */
12791500 if (!locked) {
12801501 locked = 1;
1281
- down_read(&mm->mmap_sem);
1502
+ mmap_read_lock(mm);
12821503 vma = find_vma(mm, nstart);
12831504 } else if (nstart >= vma->vm_end)
12841505 vma = vma->vm_next;
....@@ -1310,9 +1531,53 @@
13101531 ret = 0;
13111532 }
13121533 if (locked)
1313
- up_read(&mm->mmap_sem);
1534
+ mmap_read_unlock(mm);
13141535 return ret; /* 0 or negative error code */
13151536 }
1537
+#else /* CONFIG_MMU */
1538
+static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
1539
+ unsigned long nr_pages, struct page **pages,
1540
+ struct vm_area_struct **vmas, int *locked,
1541
+ unsigned int foll_flags)
1542
+{
1543
+ struct vm_area_struct *vma;
1544
+ unsigned long vm_flags;
1545
+ int i;
1546
+
1547
+ /* calculate required read or write permissions.
1548
+ * If FOLL_FORCE is set, we only require the "MAY" flags.
1549
+ */
1550
+ vm_flags = (foll_flags & FOLL_WRITE) ?
1551
+ (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1552
+ vm_flags &= (foll_flags & FOLL_FORCE) ?
1553
+ (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1554
+
1555
+ for (i = 0; i < nr_pages; i++) {
1556
+ vma = find_vma(mm, start);
1557
+ if (!vma)
1558
+ goto finish_or_fault;
1559
+
1560
+ /* protect what we can, including chardevs */
1561
+ if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1562
+ !(vm_flags & vma->vm_flags))
1563
+ goto finish_or_fault;
1564
+
1565
+ if (pages) {
1566
+ pages[i] = virt_to_page(start);
1567
+ if (pages[i])
1568
+ get_page(pages[i]);
1569
+ }
1570
+ if (vmas)
1571
+ vmas[i] = vma;
1572
+ start = (start + PAGE_SIZE) & PAGE_MASK;
1573
+ }
1574
+
1575
+ return i;
1576
+
1577
+finish_or_fault:
1578
+ return i ? : -EFAULT;
1579
+}
1580
+#endif /* !CONFIG_MMU */
13161581
13171582 /**
13181583 * get_dump_page() - pin user page in memory while writing it to core dump
....@@ -1326,25 +1591,429 @@
13261591 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
13271592 * allowing a hole to be left in the corefile to save diskspace.
13281593 *
1329
- * Called without mmap_sem, but after all other threads have been killed.
1594
+ * Called without mmap_lock (takes and releases the mmap_lock by itself).
13301595 */
13311596 #ifdef CONFIG_ELF_CORE
13321597 struct page *get_dump_page(unsigned long addr)
13331598 {
1334
- struct vm_area_struct *vma;
1599
+ struct mm_struct *mm = current->mm;
13351600 struct page *page;
1601
+ int locked = 1;
1602
+ int ret;
13361603
1337
- if (__get_user_pages(current, current->mm, addr, 1,
1338
- FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1339
- NULL) < 1)
1604
+ if (mmap_read_lock_killable(mm))
13401605 return NULL;
1341
- flush_cache_page(vma, addr, page_to_pfn(page));
1342
- return page;
1606
+ ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
1607
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET);
1608
+ if (locked)
1609
+ mmap_read_unlock(mm);
1610
+ return (ret == 1) ? page : NULL;
13431611 }
13441612 #endif /* CONFIG_ELF_CORE */
13451613
1614
+#ifdef CONFIG_CMA
1615
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
1616
+ unsigned long start,
1617
+ unsigned long nr_pages,
1618
+ struct page **pages,
1619
+ struct vm_area_struct **vmas,
1620
+ unsigned int gup_flags)
1621
+{
1622
+ unsigned long i, isolation_error_count;
1623
+ bool drain_allow;
1624
+ LIST_HEAD(cma_page_list);
1625
+ long ret = nr_pages;
1626
+ struct page *prev_head, *head;
1627
+ struct migration_target_control mtc = {
1628
+ .nid = NUMA_NO_NODE,
1629
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
1630
+ };
1631
+
1632
+check_again:
1633
+ prev_head = NULL;
1634
+ isolation_error_count = 0;
1635
+ drain_allow = true;
1636
+ for (i = 0; i < nr_pages; i++) {
1637
+ head = compound_head(pages[i]);
1638
+ if (head == prev_head)
1639
+ continue;
1640
+ prev_head = head;
1641
+ /*
1642
+ * If we get a page from the CMA zone, since we are going to
1643
+ * be pinning these entries, we might as well move them out
1644
+ * of the CMA zone if possible.
1645
+ */
1646
+ if (is_migrate_cma_page(head)) {
1647
+ if (PageHuge(head)) {
1648
+ if (isolate_hugetlb(head, &cma_page_list))
1649
+ isolation_error_count++;
1650
+ } else {
1651
+ if (!PageLRU(head) && drain_allow) {
1652
+ lru_add_drain_all();
1653
+ drain_allow = false;
1654
+ }
1655
+
1656
+ if (isolate_lru_page(head)) {
1657
+ isolation_error_count++;
1658
+ continue;
1659
+ }
1660
+ list_add_tail(&head->lru, &cma_page_list);
1661
+ mod_node_page_state(page_pgdat(head),
1662
+ NR_ISOLATED_ANON +
1663
+ page_is_file_lru(head),
1664
+ thp_nr_pages(head));
1665
+ }
1666
+ }
1667
+ }
1668
+
1669
+ /*
1670
+	 * If the list is empty and there were no isolation errors, all pages are
1671
+ * in the correct zone.
1672
+ */
1673
+ if (list_empty(&cma_page_list) && !isolation_error_count)
1674
+ return ret;
1675
+
1676
+ if (!list_empty(&cma_page_list)) {
1677
+ /*
1678
+ * drop the above get_user_pages reference.
1679
+ */
1680
+ if (gup_flags & FOLL_PIN)
1681
+ unpin_user_pages(pages, nr_pages);
1682
+ else
1683
+ for (i = 0; i < nr_pages; i++)
1684
+ put_page(pages[i]);
1685
+
1686
+ ret = migrate_pages(&cma_page_list, alloc_migration_target,
1687
+ NULL, (unsigned long)&mtc, MIGRATE_SYNC,
1688
+ MR_CONTIG_RANGE);
1689
+ if (ret) {
1690
+ if (!list_empty(&cma_page_list))
1691
+ putback_movable_pages(&cma_page_list);
1692
+ return ret > 0 ? -ENOMEM : ret;
1693
+ }
1694
+
1695
+ /* We unpinned pages before migration, pin them again */
1696
+ ret = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
1697
+ NULL, gup_flags);
1698
+ if (ret <= 0)
1699
+ return ret;
1700
+ nr_pages = ret;
1701
+ }
1702
+
1703
+ /*
1704
+ * check again because pages were unpinned, and we also might have
1705
+ * had isolation errors and need more pages to migrate.
1706
+ */
1707
+ goto check_again;
1708
+}
1709
+#else
1710
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
1711
+ unsigned long start,
1712
+ unsigned long nr_pages,
1713
+ struct page **pages,
1714
+ struct vm_area_struct **vmas,
1715
+ unsigned int gup_flags)
1716
+{
1717
+ return nr_pages;
1718
+}
1719
+#endif /* CONFIG_CMA */
1720
+
13461721 /*
1347
- * Generic Fast GUP
1722
+ * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
1723
+ * allows us to process the FOLL_LONGTERM flag.
1724
+ */
1725
+static long __gup_longterm_locked(struct mm_struct *mm,
1726
+ unsigned long start,
1727
+ unsigned long nr_pages,
1728
+ struct page **pages,
1729
+ struct vm_area_struct **vmas,
1730
+ unsigned int gup_flags)
1731
+{
1732
+ unsigned long flags = 0;
1733
+ long rc;
1734
+
1735
+ if (gup_flags & FOLL_LONGTERM)
1736
+ flags = memalloc_nocma_save();
1737
+
1738
+ rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL,
1739
+ gup_flags);
1740
+
1741
+ if (gup_flags & FOLL_LONGTERM) {
1742
+ if (rc > 0)
1743
+ rc = check_and_migrate_cma_pages(mm, start, rc, pages,
1744
+ vmas, gup_flags);
1745
+ memalloc_nocma_restore(flags);
1746
+ }
1747
+ return rc;
1748
+}
1749
+
1750
+static bool is_valid_gup_flags(unsigned int gup_flags)
1751
+{
1752
+ /*
1753
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
1754
+ * never directly by the caller, so enforce that with an assertion:
1755
+ */
1756
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
1757
+ return false;
1758
+ /*
1759
+ * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
1760
+ * that is, FOLL_LONGTERM is a specific case, more restrictive case of
1761
+ * FOLL_PIN.
1762
+ */
1763
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1764
+ return false;
1765
+
1766
+ return true;
1767
+}
1768
+
1769
+#ifdef CONFIG_MMU
1770
+static long __get_user_pages_remote(struct mm_struct *mm,
1771
+ unsigned long start, unsigned long nr_pages,
1772
+ unsigned int gup_flags, struct page **pages,
1773
+ struct vm_area_struct **vmas, int *locked)
1774
+{
1775
+ /*
1776
+ * Parts of FOLL_LONGTERM behavior are incompatible with
1777
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1778
+ * vmas. However, this only comes up if locked is set, and there are
1779
+ * callers that do request FOLL_LONGTERM, but do not set locked. So,
1780
+ * allow what we can.
1781
+ */
1782
+ if (gup_flags & FOLL_LONGTERM) {
1783
+ if (WARN_ON_ONCE(locked))
1784
+ return -EINVAL;
1785
+ /*
1786
+ * This will check the vmas (even if our vmas arg is NULL)
1787
+ * and return -ENOTSUPP if DAX isn't allowed in this case:
1788
+ */
1789
+ return __gup_longterm_locked(mm, start, nr_pages, pages,
1790
+ vmas, gup_flags | FOLL_TOUCH |
1791
+ FOLL_REMOTE);
1792
+ }
1793
+
1794
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
1795
+ locked,
1796
+ gup_flags | FOLL_TOUCH | FOLL_REMOTE);
1797
+}
1798
+
1799
+/**
1800
+ * get_user_pages_remote() - pin user pages in memory
1801
+ * @mm: mm_struct of target mm
1802
+ * @start: starting user address
1803
+ * @nr_pages: number of pages from start to pin
1804
+ * @gup_flags: flags modifying lookup behaviour
1805
+ * @pages: array that receives pointers to the pages pinned.
1806
+ * Should be at least nr_pages long. Or NULL, if caller
1807
+ * only intends to ensure the pages are faulted in.
1808
+ * @vmas: array of pointers to vmas corresponding to each page.
1809
+ * Or NULL if the caller does not require them.
1810
+ * @locked: pointer to lock flag indicating whether lock is held and
1811
+ * subsequently whether VM_FAULT_RETRY functionality can be
1812
+ * utilised. Lock must initially be held.
1813
+ *
1814
+ * Returns either number of pages pinned (which may be less than the
1815
+ * number requested), or an error. Details about the return value:
1816
+ *
1817
+ * -- If nr_pages is 0, returns 0.
1818
+ * -- If nr_pages is >0, but no pages were pinned, returns -errno.
1819
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of
1820
+ * pages pinned. Again, this may be less than nr_pages.
1821
+ *
1822
+ * The caller is responsible for releasing returned @pages, via put_page().
1823
+ *
1824
+ * @vmas are valid only as long as mmap_lock is held.
1825
+ *
1826
+ * Must be called with mmap_lock held for read or write.
1827
+ *
1828
+ * get_user_pages_remote walks a process's page tables and takes a reference
1829
+ * to each struct page that each user address corresponds to at a given
1830
+ * instant. That is, it takes the page that would be accessed if a user
1831
+ * thread accesses the given user virtual address at that instant.
1832
+ *
1833
+ * This does not guarantee that the page exists in the user mappings when
1834
+ * get_user_pages_remote returns, and there may even be a completely different
1835
+ * page there in some cases (e.g. if mmapped pagecache has been invalidated
1836
+ * and subsequently re-faulted). However, it does guarantee that the page
1837
+ * won't be freed completely. Mostly, callers simply care that the page
1838
+ * contains data that was valid *at some point in time*. Typically, an IO
1839
+ * or similar operation cannot guarantee anything stronger anyway because
1840
+ * locks can't be held over the syscall boundary.
1841
+ *
1842
+ * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
1843
+ * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
1844
+ * be called after the page is finished with, and before put_page is called.
1845
+ *
1846
+ * get_user_pages_remote is typically used for fewer-copy IO operations,
1847
+ * to get a handle on the memory by some means other than accesses
1848
+ * via the user virtual addresses. The pages may be submitted for
1849
+ * DMA to devices or accessed via their kernel linear mapping (via the
1850
+ * kmap APIs). Care should be taken to use the correct cache flushing APIs.
1851
+ *
1852
+ * See also get_user_pages_fast, for performance critical applications.
1853
+ *
1854
+ * get_user_pages_remote should be phased out in favor of
1855
+ * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
1856
+ * should use get_user_pages_remote because it cannot pass
1857
+ * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
1858
+ */
1859
+long get_user_pages_remote(struct mm_struct *mm,
1860
+ unsigned long start, unsigned long nr_pages,
1861
+ unsigned int gup_flags, struct page **pages,
1862
+ struct vm_area_struct **vmas, int *locked)
1863
+{
1864
+ if (!is_valid_gup_flags(gup_flags))
1865
+ return -EINVAL;
1866
+
1867
+ return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
1868
+ pages, vmas, locked);
1869
+}
1870
+EXPORT_SYMBOL(get_user_pages_remote);
1871
+
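For context, a minimal sketch of the calling convention documented above (mmap_lock held, @locked handling, dirty + release on the FOLL_GET path). The example_touch_remote_page() helper is hypothetical; it assumes the caller already holds a reference on @mm, e.g. obtained via get_task_mm().

#include <linux/mm.h>
#include <linux/sched/mm.h>

/* Illustrative sketch only: pin and dirty one page of another process. */
static int example_touch_remote_page(struct mm_struct *mm, unsigned long addr)
{
	struct page *page;
	int locked = 1;
	long ret;

	mmap_read_lock(mm);
	ret = get_user_pages_remote(mm, addr, 1, FOLL_WRITE, &page, NULL,
				    &locked);
	if (locked)
		mmap_read_unlock(mm);	/* gup may already have dropped the lock */
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... write to the page via kmap() or DMA ... */

	set_page_dirty_lock(page);	/* required after writing the page */
	put_page(page);			/* release the FOLL_GET reference */
	return 0;
}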
1872
+#else /* CONFIG_MMU */
1873
+long get_user_pages_remote(struct mm_struct *mm,
1874
+ unsigned long start, unsigned long nr_pages,
1875
+ unsigned int gup_flags, struct page **pages,
1876
+ struct vm_area_struct **vmas, int *locked)
1877
+{
1878
+ return 0;
1879
+}
1880
+
1881
+static long __get_user_pages_remote(struct mm_struct *mm,
1882
+ unsigned long start, unsigned long nr_pages,
1883
+ unsigned int gup_flags, struct page **pages,
1884
+ struct vm_area_struct **vmas, int *locked)
1885
+{
1886
+ return 0;
1887
+}
1888
+#endif /* !CONFIG_MMU */
1889
+
1890
+/**
1891
+ * get_user_pages() - pin user pages in memory
1892
+ * @start: starting user address
1893
+ * @nr_pages: number of pages from start to pin
1894
+ * @gup_flags: flags modifying lookup behaviour
1895
+ * @pages: array that receives pointers to the pages pinned.
1896
+ * Should be at least nr_pages long. Or NULL, if caller
1897
+ * only intends to ensure the pages are faulted in.
1898
+ * @vmas: array of pointers to vmas corresponding to each page.
1899
+ * Or NULL if the caller does not require them.
1900
+ *
1901
+ * This is the same as get_user_pages_remote(), just with a less-flexible
1902
+ * calling convention where we assume that the mm being operated on belongs to
1903
+ * the current task, and doesn't allow passing of a locked parameter. We also
1904
+ * obviously don't pass FOLL_REMOTE in here.
1905
+ */
1906
+long get_user_pages(unsigned long start, unsigned long nr_pages,
1907
+ unsigned int gup_flags, struct page **pages,
1908
+ struct vm_area_struct **vmas)
1909
+{
1910
+ if (!is_valid_gup_flags(gup_flags))
1911
+ return -EINVAL;
1912
+
1913
+ return __gup_longterm_locked(current->mm, start, nr_pages,
1914
+ pages, vmas, gup_flags | FOLL_TOUCH);
1915
+}
1916
+EXPORT_SYMBOL(get_user_pages);
1917
+
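For context, a minimal sketch of pinning the pages behind a read-only user buffer with get_user_pages(); as with get_user_pages_remote(), the caller holds mmap_lock and may receive fewer pages than requested. The example_pin_user_buffer() helper is hypothetical.

#include <linux/mm.h>
#include <linux/slab.h>

/* Illustrative sketch only: pin the pages backing a user buffer for reading. */
static long example_pin_user_buffer(unsigned long uaddr, size_t len,
				    struct page ***pagesp)
{
	unsigned long first, last, nr_pages;
	struct page **pages;
	long pinned;

	if (!len)
		return -EINVAL;

	first = uaddr >> PAGE_SHIFT;
	last = (uaddr + len - 1) >> PAGE_SHIFT;
	nr_pages = last - first + 1;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	mmap_read_lock(current->mm);
	pinned = get_user_pages(uaddr & PAGE_MASK, nr_pages, 0, pages, NULL);
	mmap_read_unlock(current->mm);

	if (pinned <= 0) {
		kfree(pages);
		return pinned ? pinned : -EFAULT;
	}

	/* May be fewer than nr_pages; caller releases each with put_page(). */
	*pagesp = pages;
	return pinned;
}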
1918
+/**
1919
+ * get_user_pages_locked() is suitable to replace the form:
1920
+ *
1921
+ * mmap_read_lock(mm);
1922
+ * do_something()
1923
+ * get_user_pages(..., pages, NULL);
1924
+ * mmap_read_unlock(mm);
1925
+ *
1926
+ * to:
1927
+ *
1928
+ * int locked = 1;
1929
+ * mmap_read_lock(mm);
1930
+ * do_something()
1931
+ * get_user_pages_locked(..., pages, &locked);
1932
+ * if (locked)
1933
+ * mmap_read_unlock(mm);
1934
+ *
1935
+ * @start: starting user address
1936
+ * @nr_pages: number of pages from start to pin
1937
+ * @gup_flags: flags modifying lookup behaviour
1938
+ * @pages: array that receives pointers to the pages pinned.
1939
+ * Should be at least nr_pages long. Or NULL, if caller
1940
+ * only intends to ensure the pages are faulted in.
1941
+ * @locked: pointer to lock flag indicating whether lock is held and
1942
+ * subsequently whether VM_FAULT_RETRY functionality can be
1943
+ * utilised. Lock must initially be held.
1944
+ *
1945
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
1946
+ * paths better by using either get_user_pages_locked() or
1947
+ * get_user_pages_unlocked().
1948
+ *
1949
+ */
1950
+long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
1951
+ unsigned int gup_flags, struct page **pages,
1952
+ int *locked)
1953
+{
1954
+ /*
1955
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
1956
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1957
+ * vmas. As there are no users of this flag in this call we simply
1958
+ * disallow this option for now.
1959
+ */
1960
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1961
+ return -EINVAL;
1962
+ /*
1963
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
1964
+ * never directly by the caller, so enforce that:
1965
+ */
1966
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
1967
+ return -EINVAL;
1968
+
1969
+ return __get_user_pages_locked(current->mm, start, nr_pages,
1970
+ pages, NULL, locked,
1971
+ gup_flags | FOLL_TOUCH);
1972
+}
1973
+EXPORT_SYMBOL(get_user_pages_locked);
1974
+
1975
+/*
1976
+ * get_user_pages_unlocked() is suitable to replace the form:
1977
+ *
1978
+ * mmap_read_lock(mm);
1979
+ * get_user_pages(..., pages, NULL);
1980
+ * mmap_read_unlock(mm);
1981
+ *
1982
+ * with:
1983
+ *
1984
+ * get_user_pages_unlocked(..., pages);
1985
+ *
1986
+ * It is functionally equivalent to get_user_pages_fast so
1987
+ * get_user_pages_fast should be used instead if specific gup_flags
1988
+ * (e.g. FOLL_FORCE) are not required.
1989
+ */
1990
+long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
1991
+ struct page **pages, unsigned int gup_flags)
1992
+{
1993
+ struct mm_struct *mm = current->mm;
1994
+ int locked = 1;
1995
+ long ret;
1996
+
1997
+ /*
1998
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
1999
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
2000
+ * vmas. As there are no users of this flag in this call we simply
2001
+ * disallow this option for now.
2002
+ */
2003
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
2004
+ return -EINVAL;
2005
+
2006
+ mmap_read_lock(mm);
2007
+ ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
2008
+ &locked, gup_flags | FOLL_TOUCH);
2009
+ if (locked)
2010
+ mmap_read_unlock(mm);
2011
+ return ret;
2012
+}
2013
+EXPORT_SYMBOL(get_user_pages_unlocked);
2014
+
2015
+/*
2016
+ * Fast GUP
13482017 *
13492018 * get_user_pages_fast attempts to pin user pages by walking the page
13502019 * tables directly and avoids taking locks. Thus the walker needs to be
....@@ -1365,7 +2034,7 @@
13652034 * Before activating this code, please be aware that the following assumptions
13662035 * are currently made:
13672036 *
1368
- * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
2037
+ * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
13692038 * free pages containing page tables or TLB flushing requires IPI broadcast.
13702039 *
13712040 * *) ptes can be read atomically by the architecture.
....@@ -1376,47 +2045,101 @@
13762045 *
13772046 * This code is based heavily on the PowerPC implementation by Nick Piggin.
13782047 */
1379
-#ifdef CONFIG_HAVE_GENERIC_GUP
2048
+#ifdef CONFIG_HAVE_FAST_GUP
2049
+#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
13802050
1381
-#ifndef gup_get_pte
13822051 /*
1383
- * We assume that the PTE can be read atomically. If this is not the case for
1384
- * your architecture, please provide the helper.
2052
+ * WARNING: only to be used in the get_user_pages_fast() implementation.
2053
+ *
2054
+ * With get_user_pages_fast(), we walk down the pagetables without taking any
2055
+ * locks. For this we would like to load the pointers atomically, but sometimes
2056
+ * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What
2057
+ * we do have is the guarantee that a PTE will only either go from not present
2058
+ * to present, or present to not present or both -- it will not switch to a
2059
+ * completely different present page without a TLB flush in between; something
2060
+ * that we are blocking by holding interrupts off.
2061
+ *
2062
+ * Setting ptes from not present to present goes:
2063
+ *
2064
+ * ptep->pte_high = h;
2065
+ * smp_wmb();
2066
+ * ptep->pte_low = l;
2067
+ *
2068
+ * And present to not present goes:
2069
+ *
2070
+ * ptep->pte_low = 0;
2071
+ * smp_wmb();
2072
+ * ptep->pte_high = 0;
2073
+ *
2074
+ * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
2075
+ * We load pte_high *after* loading pte_low, which ensures we don't see an older
2076
+ * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't
2077
+ * picked up a changed pte high. We might have gotten rubbish values from
2078
+ * pte_low and pte_high, but we are guaranteed that pte_low will not have the
2079
+ * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
2080
+ * operates on present ptes, we're safe.
13852081 */
13862082 static inline pte_t gup_get_pte(pte_t *ptep)
13872083 {
1388
- return READ_ONCE(*ptep);
2084
+ pte_t pte;
2085
+
2086
+ do {
2087
+ pte.pte_low = ptep->pte_low;
2088
+ smp_rmb();
2089
+ pte.pte_high = ptep->pte_high;
2090
+ smp_rmb();
2091
+ } while (unlikely(pte.pte_low != ptep->pte_low));
2092
+
2093
+ return pte;
13892094 }
1390
-#endif
2095
+#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
2096
+/*
2097
+ * We require that the PTE can be read atomically.
2098
+ */
2099
+static inline pte_t gup_get_pte(pte_t *ptep)
2100
+{
2101
+ return ptep_get(ptep);
2102
+}
2103
+#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
13912104
13922105 static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
2106
+ unsigned int flags,
13932107 struct page **pages)
13942108 {
13952109 while ((*nr) - nr_start) {
13962110 struct page *page = pages[--(*nr)];
13972111
13982112 ClearPageReferenced(page);
1399
- put_page(page);
2113
+ if (flags & FOLL_PIN)
2114
+ unpin_user_page(page);
2115
+ else
2116
+ put_page(page);
14002117 }
14012118 }
14022119
1403
-/*
1404
- * Return the compund head page with ref appropriately incremented,
1405
- * or NULL if that failed.
1406
- */
1407
-static inline struct page *try_get_compound_head(struct page *page, int refs)
1408
-{
1409
- struct page *head = compound_head(page);
1410
- if (WARN_ON_ONCE(page_ref_count(head) < 0))
1411
- return NULL;
1412
- if (unlikely(!page_cache_add_speculative(head, refs)))
1413
- return NULL;
1414
- return head;
1415
-}
1416
-
14172120 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
1418
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1419
- int write, struct page **pages, int *nr)
2121
+/*
2122
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
2123
+ * operations.
2124
+ *
2125
+ * To pin the page, fast-gup needs to do the following, in order:
2126
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
2127
+ *
2128
+ * For the rest of pgtable operations where pgtable updates can be racy
2129
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
2130
+ * is pinned.
2131
+ *
2132
+ * The above works for all pte-level operations, including THP split.
2133
+ *
2134
+ * For THP collapse, it's a bit more complicated because fast-gup may be
2135
+ * walking a pgtable page that is being freed (pte is still valid but pmd
2136
+ * can be cleared already). To avoid racing in that case, we need to
2137
+ * also check pmd here to make sure pmd doesn't change (corresponds to
2138
+ * pmdp_collapse_flush() in the THP collapse code path).
2139
+ */
2140
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
2141
+ unsigned long end, unsigned int flags,
2142
+ struct page **pages, int *nr)
14202143 {
14212144 struct dev_pagemap *pgmap = NULL;
14222145 int nr_start = *nr, ret = 0;
....@@ -1434,13 +2157,16 @@
14342157 if (pte_protnone(pte))
14352158 goto pte_unmap;
14362159
1437
- if (!pte_access_permitted(pte, write))
2160
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
14382161 goto pte_unmap;
14392162
14402163 if (pte_devmap(pte)) {
2164
+ if (unlikely(flags & FOLL_LONGTERM))
2165
+ goto pte_unmap;
2166
+
14412167 pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
14422168 if (unlikely(!pgmap)) {
1443
- undo_dev_pagemap(nr, nr_start, pages);
2169
+ undo_dev_pagemap(nr, nr_start, flags, pages);
14442170 goto pte_unmap;
14452171 }
14462172 } else if (pte_special(pte))
....@@ -1449,17 +2175,31 @@
14492175 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
14502176 page = pte_page(pte);
14512177
1452
- head = try_get_compound_head(page, 1);
2178
+ head = try_grab_compound_head(page, 1, flags);
14532179 if (!head)
14542180 goto pte_unmap;
14552181
1456
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1457
- put_page(head);
2182
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
2183
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
2184
+ put_compound_head(head, 1, flags);
14582185 goto pte_unmap;
14592186 }
14602187
14612188 VM_BUG_ON_PAGE(compound_head(page) != head, page);
14622189
2190
+ /*
2191
+ * We need to make the page accessible if and only if we are
2192
+ * going to access its content (the FOLL_PIN case). Please
2193
+ * see Documentation/core-api/pin_user_pages.rst for
2194
+ * details.
2195
+ */
2196
+ if (flags & FOLL_PIN) {
2197
+ ret = arch_make_page_accessible(page);
2198
+ if (ret) {
2199
+ unpin_user_page(page);
2200
+ goto pte_unmap;
2201
+ }
2202
+ }
14632203 SetPageReferenced(page);
14642204 pages[*nr] = page;
14652205 (*nr)++;
....@@ -1482,19 +2222,21 @@
14822222 * to be special.
14832223 *
14842224 * For a futex to be placed on a THP tail page, get_futex_key requires a
1485
- * __get_user_pages_fast implementation that can pin pages. Thus it's still
2225
+ * get_user_pages_fast_only implementation that can pin pages. Thus it's still
14862226 * useful to have gup_huge_pmd even if we can't operate on ptes.
14872227 */
1488
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
1489
- int write, struct page **pages, int *nr)
2228
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
2229
+ unsigned long end, unsigned int flags,
2230
+ struct page **pages, int *nr)
14902231 {
14912232 return 0;
14922233 }
14932234 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
14942235
1495
-#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
2236
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
14962237 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
1497
- unsigned long end, struct page **pages, int *nr)
2238
+ unsigned long end, unsigned int flags,
2239
+ struct page **pages, int *nr)
14982240 {
14992241 int nr_start = *nr;
15002242 struct dev_pagemap *pgmap = NULL;
....@@ -1504,12 +2246,15 @@
15042246
15052247 pgmap = get_dev_pagemap(pfn, pgmap);
15062248 if (unlikely(!pgmap)) {
1507
- undo_dev_pagemap(nr, nr_start, pages);
2249
+ undo_dev_pagemap(nr, nr_start, flags, pages);
15082250 return 0;
15092251 }
15102252 SetPageReferenced(page);
15112253 pages[*nr] = page;
1512
- get_page(page);
2254
+ if (unlikely(!try_grab_page(page, flags))) {
2255
+ undo_dev_pagemap(nr, nr_start, flags, pages);
2256
+ return 0;
2257
+ }
15132258 (*nr)++;
15142259 pfn++;
15152260 } while (addr += PAGE_SIZE, addr != end);
....@@ -1520,174 +2265,246 @@
15202265 }
15212266
15222267 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1523
- unsigned long end, struct page **pages, int *nr)
2268
+ unsigned long end, unsigned int flags,
2269
+ struct page **pages, int *nr)
15242270 {
15252271 unsigned long fault_pfn;
15262272 int nr_start = *nr;
15272273
15282274 fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1529
- if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
2275
+ if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
15302276 return 0;
15312277
15322278 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
1533
- undo_dev_pagemap(nr, nr_start, pages);
2279
+ undo_dev_pagemap(nr, nr_start, flags, pages);
15342280 return 0;
15352281 }
15362282 return 1;
15372283 }
15382284
15392285 static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1540
- unsigned long end, struct page **pages, int *nr)
2286
+ unsigned long end, unsigned int flags,
2287
+ struct page **pages, int *nr)
15412288 {
15422289 unsigned long fault_pfn;
15432290 int nr_start = *nr;
15442291
15452292 fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1546
- if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
2293
+ if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
15472294 return 0;
15482295
15492296 if (unlikely(pud_val(orig) != pud_val(*pudp))) {
1550
- undo_dev_pagemap(nr, nr_start, pages);
2297
+ undo_dev_pagemap(nr, nr_start, flags, pages);
15512298 return 0;
15522299 }
15532300 return 1;
15542301 }
15552302 #else
15562303 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1557
- unsigned long end, struct page **pages, int *nr)
2304
+ unsigned long end, unsigned int flags,
2305
+ struct page **pages, int *nr)
15582306 {
15592307 BUILD_BUG();
15602308 return 0;
15612309 }
15622310
15632311 static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
1564
- unsigned long end, struct page **pages, int *nr)
2312
+ unsigned long end, unsigned int flags,
2313
+ struct page **pages, int *nr)
15652314 {
15662315 BUILD_BUG();
15672316 return 0;
15682317 }
15692318 #endif
15702319
2320
+static int record_subpages(struct page *page, unsigned long addr,
2321
+ unsigned long end, struct page **pages)
2322
+{
2323
+ int nr;
2324
+
2325
+ for (nr = 0; addr != end; addr += PAGE_SIZE)
2326
+ pages[nr++] = page++;
2327
+
2328
+ return nr;
2329
+}
2330
+
2331
+#ifdef CONFIG_ARCH_HAS_HUGEPD
2332
+static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
2333
+ unsigned long sz)
2334
+{
2335
+ unsigned long __boundary = (addr + sz) & ~(sz-1);
2336
+ return (__boundary - 1 < end - 1) ? __boundary : end;
2337
+}
2338
+
2339
+static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
2340
+ unsigned long end, unsigned int flags,
2341
+ struct page **pages, int *nr)
2342
+{
2343
+ unsigned long pte_end;
2344
+ struct page *head, *page;
2345
+ pte_t pte;
2346
+ int refs;
2347
+
2348
+ pte_end = (addr + sz) & ~(sz-1);
2349
+ if (pte_end < end)
2350
+ end = pte_end;
2351
+
2352
+ pte = huge_ptep_get(ptep);
2353
+
2354
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
2355
+ return 0;
2356
+
2357
+ /* hugepages are never "special" */
2358
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2359
+
2360
+ head = pte_page(pte);
2361
+ page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
2362
+ refs = record_subpages(page, addr, end, pages + *nr);
2363
+
2364
+ head = try_grab_compound_head(head, refs, flags);
2365
+ if (!head)
2366
+ return 0;
2367
+
2368
+ if (unlikely(pte_val(pte) != pte_val(*ptep))) {
2369
+ put_compound_head(head, refs, flags);
2370
+ return 0;
2371
+ }
2372
+
2373
+ *nr += refs;
2374
+ SetPageReferenced(head);
2375
+ return 1;
2376
+}
2377
+
2378
+static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
2379
+ unsigned int pdshift, unsigned long end, unsigned int flags,
2380
+ struct page **pages, int *nr)
2381
+{
2382
+ pte_t *ptep;
2383
+ unsigned long sz = 1UL << hugepd_shift(hugepd);
2384
+ unsigned long next;
2385
+
2386
+ ptep = hugepte_offset(hugepd, addr, pdshift);
2387
+ do {
2388
+ next = hugepte_addr_end(addr, end, sz);
2389
+ if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
2390
+ return 0;
2391
+ } while (ptep++, addr = next, addr != end);
2392
+
2393
+ return 1;
2394
+}
2395
+#else
2396
+static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
2397
+ unsigned int pdshift, unsigned long end, unsigned int flags,
2398
+ struct page **pages, int *nr)
2399
+{
2400
+ return 0;
2401
+}
2402
+#endif /* CONFIG_ARCH_HAS_HUGEPD */
2403
+
15712404 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
1572
- unsigned long end, int write, struct page **pages, int *nr)
2405
+ unsigned long end, unsigned int flags,
2406
+ struct page **pages, int *nr)
15732407 {
15742408 struct page *head, *page;
15752409 int refs;
15762410
1577
- if (!pmd_access_permitted(orig, write))
2411
+ if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
15782412 return 0;
15792413
1580
- if (pmd_devmap(orig))
1581
- return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
1582
-
1583
- refs = 0;
1584
- page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1585
- do {
1586
- pages[*nr] = page;
1587
- (*nr)++;
1588
- page++;
1589
- refs++;
1590
- } while (addr += PAGE_SIZE, addr != end);
1591
-
1592
- head = try_get_compound_head(pmd_page(orig), refs);
1593
- if (!head) {
1594
- *nr -= refs;
1595
- return 0;
2414
+ if (pmd_devmap(orig)) {
2415
+ if (unlikely(flags & FOLL_LONGTERM))
2416
+ return 0;
2417
+ return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
2418
+ pages, nr);
15962419 }
2420
+
2421
+ page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
2422
+ refs = record_subpages(page, addr, end, pages + *nr);
2423
+
2424
+ head = try_grab_compound_head(pmd_page(orig), refs, flags);
2425
+ if (!head)
2426
+ return 0;
15972427
15982428 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
1599
- *nr -= refs;
1600
- while (refs--)
1601
- put_page(head);
2429
+ put_compound_head(head, refs, flags);
16022430 return 0;
16032431 }
16042432
2433
+ *nr += refs;
16052434 SetPageReferenced(head);
16062435 return 1;
16072436 }
16082437
16092438 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
1610
- unsigned long end, int write, struct page **pages, int *nr)
2439
+ unsigned long end, unsigned int flags,
2440
+ struct page **pages, int *nr)
16112441 {
16122442 struct page *head, *page;
16132443 int refs;
16142444
1615
- if (!pud_access_permitted(orig, write))
2445
+ if (!pud_access_permitted(orig, flags & FOLL_WRITE))
16162446 return 0;
16172447
1618
- if (pud_devmap(orig))
1619
- return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
1620
-
1621
- refs = 0;
1622
- page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
1623
- do {
1624
- pages[*nr] = page;
1625
- (*nr)++;
1626
- page++;
1627
- refs++;
1628
- } while (addr += PAGE_SIZE, addr != end);
1629
-
1630
- head = try_get_compound_head(pud_page(orig), refs);
1631
- if (!head) {
1632
- *nr -= refs;
1633
- return 0;
2448
+ if (pud_devmap(orig)) {
2449
+ if (unlikely(flags & FOLL_LONGTERM))
2450
+ return 0;
2451
+ return __gup_device_huge_pud(orig, pudp, addr, end, flags,
2452
+ pages, nr);
16342453 }
2454
+
2455
+ page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
2456
+ refs = record_subpages(page, addr, end, pages + *nr);
2457
+
2458
+ head = try_grab_compound_head(pud_page(orig), refs, flags);
2459
+ if (!head)
2460
+ return 0;
16352461
16362462 if (unlikely(pud_val(orig) != pud_val(*pudp))) {
1637
- *nr -= refs;
1638
- while (refs--)
1639
- put_page(head);
2463
+ put_compound_head(head, refs, flags);
16402464 return 0;
16412465 }
16422466
2467
+ *nr += refs;
16432468 SetPageReferenced(head);
16442469 return 1;
16452470 }
16462471
16472472 static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
1648
- unsigned long end, int write,
2473
+ unsigned long end, unsigned int flags,
16492474 struct page **pages, int *nr)
16502475 {
16512476 int refs;
16522477 struct page *head, *page;
16532478
1654
- if (!pgd_access_permitted(orig, write))
2479
+ if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
16552480 return 0;
16562481
16572482 BUILD_BUG_ON(pgd_devmap(orig));
1658
- refs = 0;
1659
- page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
1660
- do {
1661
- pages[*nr] = page;
1662
- (*nr)++;
1663
- page++;
1664
- refs++;
1665
- } while (addr += PAGE_SIZE, addr != end);
16662483
1667
- head = try_get_compound_head(pgd_page(orig), refs);
1668
- if (!head) {
1669
- *nr -= refs;
2484
+ page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
2485
+ refs = record_subpages(page, addr, end, pages + *nr);
2486
+
2487
+ head = try_grab_compound_head(pgd_page(orig), refs, flags);
2488
+ if (!head)
16702489 return 0;
1671
- }
16722490
16732491 if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
1674
- *nr -= refs;
1675
- while (refs--)
1676
- put_page(head);
2492
+ put_compound_head(head, refs, flags);
16772493 return 0;
16782494 }
16792495
2496
+ *nr += refs;
16802497 SetPageReferenced(head);
16812498 return 1;
16822499 }
16832500
1684
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
1685
- int write, struct page **pages, int *nr)
2501
+static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
2502
+ unsigned int flags, struct page **pages, int *nr)
16862503 {
16872504 unsigned long next;
16882505 pmd_t *pmdp;
16892506
1690
- pmdp = pmd_offset(&pud, addr);
2507
+ pmdp = pmd_offset_lockless(pudp, pud, addr);
16912508 do {
16922509 pmd_t pmd = READ_ONCE(*pmdp);
16932510
....@@ -1705,7 +2522,7 @@
17052522 if (pmd_protnone(pmd))
17062523 return 0;
17072524
1708
- if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
2525
+ if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
17092526 pages, nr))
17102527 return 0;
17112528
....@@ -1715,50 +2532,50 @@
17152532 * pmd format and THP pmd format
17162533 */
17172534 if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
1718
- PMD_SHIFT, next, write, pages, nr))
2535
+ PMD_SHIFT, next, flags, pages, nr))
17192536 return 0;
1720
- } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
2537
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
17212538 return 0;
17222539 } while (pmdp++, addr = next, addr != end);
17232540
17242541 return 1;
17252542 }
17262543
1727
-static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
1728
- int write, struct page **pages, int *nr)
2544
+static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
2545
+ unsigned int flags, struct page **pages, int *nr)
17292546 {
17302547 unsigned long next;
17312548 pud_t *pudp;
17322549
1733
- pudp = pud_offset(&p4d, addr);
2550
+ pudp = pud_offset_lockless(p4dp, p4d, addr);
17342551 do {
17352552 pud_t pud = READ_ONCE(*pudp);
17362553
17372554 next = pud_addr_end(addr, end);
1738
- if (pud_none(pud))
2555
+ if (unlikely(!pud_present(pud)))
17392556 return 0;
1740
- if (unlikely(pud_huge(pud))) {
1741
- if (!gup_huge_pud(pud, pudp, addr, next, write,
2557
+ if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
2558
+ if (!gup_huge_pud(pud, pudp, addr, next, flags,
17422559 pages, nr))
17432560 return 0;
17442561 } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
17452562 if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
1746
- PUD_SHIFT, next, write, pages, nr))
2563
+ PUD_SHIFT, next, flags, pages, nr))
17472564 return 0;
1748
- } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
2565
+ } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
17492566 return 0;
17502567 } while (pudp++, addr = next, addr != end);
17512568
17522569 return 1;
17532570 }
17542571
1755
-static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
1756
- int write, struct page **pages, int *nr)
2572
+static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
2573
+ unsigned int flags, struct page **pages, int *nr)
17572574 {
17582575 unsigned long next;
17592576 p4d_t *p4dp;
17602577
1761
- p4dp = p4d_offset(&pgd, addr);
2578
+ p4dp = p4d_offset_lockless(pgdp, pgd, addr);
17622579 do {
17632580 p4d_t p4d = READ_ONCE(*p4dp);
17642581
....@@ -1768,9 +2585,9 @@
17682585 BUILD_BUG_ON(p4d_huge(p4d));
17692586 if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
17702587 if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
1771
- P4D_SHIFT, next, write, pages, nr))
2588
+ P4D_SHIFT, next, flags, pages, nr))
17722589 return 0;
1773
- } else if (!gup_pud_range(p4d, addr, next, write, pages, nr))
2590
+ } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
17742591 return 0;
17752592 } while (p4dp++, addr = next, addr != end);
17762593
....@@ -1778,7 +2595,7 @@
17782595 }
17792596
17802597 static void gup_pgd_range(unsigned long addr, unsigned long end,
1781
- int write, struct page **pages, int *nr)
2598
+ unsigned int flags, struct page **pages, int *nr)
17822599 {
17832600 unsigned long next;
17842601 pgd_t *pgdp;
....@@ -1791,152 +2608,411 @@
17912608 if (pgd_none(pgd))
17922609 return;
17932610 if (unlikely(pgd_huge(pgd))) {
1794
- if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
2611
+ if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
17952612 pages, nr))
17962613 return;
17972614 } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
17982615 if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
1799
- PGDIR_SHIFT, next, write, pages, nr))
2616
+ PGDIR_SHIFT, next, flags, pages, nr))
18002617 return;
1801
- } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
2618
+ } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
18022619 return;
18032620 } while (pgdp++, addr = next, addr != end);
18042621 }
2622
+#else
2623
+static inline void gup_pgd_range(unsigned long addr, unsigned long end,
2624
+ unsigned int flags, struct page **pages, int *nr)
2625
+{
2626
+}
2627
+#endif /* CONFIG_HAVE_FAST_GUP */
18052628
18062629 #ifndef gup_fast_permitted
18072630 /*
1808
- * Check if it's allowed to use __get_user_pages_fast() for the range, or
2631
+ * Check if it's allowed to use get_user_pages_fast_only() for the range, or
18092632 * we need to fall back to the slow version:
18102633 */
1811
-bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
2634
+static bool gup_fast_permitted(unsigned long start, unsigned long end)
18122635 {
1813
- unsigned long len, end;
1814
-
1815
- len = (unsigned long) nr_pages << PAGE_SHIFT;
1816
- end = start + len;
1817
- return end >= start;
2636
+ return true;
18182637 }
18192638 #endif
18202639
1821
-/*
1822
- * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
1823
- * the regular GUP.
1824
- * Note a difference with get_user_pages_fast: this always returns the
1825
- * number of pages pinned, 0 if no pages were pinned.
1826
- *
1827
- * Careful, careful! COW breaking can go either way, so a non-write
1828
- * access can get ambiguous page results. If you call this function without
1829
- * 'write' set, you'd better be sure that you're ok with that ambiguity.
1830
- */
1831
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1832
- struct page **pages)
2640
+static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
2641
+ unsigned int gup_flags, struct page **pages)
18332642 {
1834
- unsigned long addr, len, end;
1835
- unsigned long flags;
1836
- int nr = 0;
1837
-
1838
- start &= PAGE_MASK;
1839
- addr = start;
1840
- len = (unsigned long) nr_pages << PAGE_SHIFT;
1841
- end = start + len;
1842
-
1843
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
1844
- (void __user *)start, len)))
1845
- return 0;
2643
+ int ret;
18462644
18472645 /*
1848
- * Disable interrupts. We use the nested form as we can already have
1849
- * interrupts disabled by get_futex_key.
1850
- *
1851
- * With interrupts disabled, we block page table pages from being
1852
- * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
1853
- * for more details.
1854
- *
1855
- * We do not adopt an rcu_read_lock(.) here as we also want to
1856
- * block IPIs that come from THPs splitting.
1857
- *
1858
- * NOTE! We allow read-only gup_fast() here, but you'd better be
1859
- * careful about possible COW pages. You'll get _a_ COW page, but
1860
- * not necessarily the one you intended to get depending on what
1861
- * COW event happens after this. COW may break the page copy in a
1862
- * random direction.
2646
+ * FIXME: FOLL_LONGTERM does not work with
2647
+ * get_user_pages_unlocked() (see comments in that function)
18632648 */
1864
-
1865
- if (gup_fast_permitted(start, nr_pages, write)) {
1866
- local_irq_save(flags);
1867
- gup_pgd_range(addr, end, write, pages, &nr);
1868
- local_irq_restore(flags);
1869
- }
1870
-
1871
- return nr;
1872
-}
1873
-
1874
-/**
1875
- * get_user_pages_fast() - pin user pages in memory
1876
- * @start: starting user address
1877
- * @nr_pages: number of pages from start to pin
1878
- * @write: whether pages will be written to
1879
- * @pages: array that receives pointers to the pages pinned.
1880
- * Should be at least nr_pages long.
1881
- *
1882
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
1883
- * If not successful, it will fall back to taking the lock and
1884
- * calling get_user_pages().
1885
- *
1886
- * Returns number of pages pinned. This may be fewer than the number
1887
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
1888
- * were pinned, returns -errno.
1889
- */
1890
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1891
- struct page **pages)
1892
-{
1893
- unsigned long addr, len, end;
1894
- int nr = 0, ret = 0;
1895
-
1896
- start &= PAGE_MASK;
1897
- addr = start;
1898
- len = (unsigned long) nr_pages << PAGE_SHIFT;
1899
- end = start + len;
1900
-
1901
- if (nr_pages <= 0)
1902
- return 0;
1903
-
1904
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
1905
- (void __user *)start, len)))
1906
- return -EFAULT;
1907
-
1908
- /*
1909
- * The FAST_GUP case requires FOLL_WRITE even for pure reads,
1910
- * because get_user_pages() may need to cause an early COW in
1911
- * order to avoid confusing the normal COW routines. So only
1912
- * targets that are already writable are safe to do by just
1913
- * looking at the page tables.
1914
- */
1915
- if (gup_fast_permitted(start, nr_pages, write)) {
1916
- local_irq_disable();
1917
- gup_pgd_range(addr, end, 1, pages, &nr);
1918
- local_irq_enable();
1919
- ret = nr;
1920
- }
1921
-
1922
- if (nr < nr_pages) {
1923
- /* Try to get the remaining pages with get_user_pages */
1924
- start += nr << PAGE_SHIFT;
1925
- pages += nr;
1926
-
1927
- ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
1928
- write ? FOLL_WRITE : 0);
1929
-
1930
- /* Have to be a bit careful with return values */
1931
- if (nr > 0) {
1932
- if (ret < 0)
1933
- ret = nr;
1934
- else
1935
- ret += nr;
1936
- }
2649
+ if (gup_flags & FOLL_LONGTERM) {
2650
+ mmap_read_lock(current->mm);
2651
+ ret = __gup_longterm_locked(current->mm,
2652
+ start, nr_pages,
2653
+ pages, NULL, gup_flags);
2654
+ mmap_read_unlock(current->mm);
2655
+ } else {
2656
+ ret = get_user_pages_unlocked(start, nr_pages,
2657
+ pages, gup_flags);
19372658 }
19382659
19392660 return ret;
19402661 }
19412662
1942
-#endif /* CONFIG_HAVE_GENERIC_GUP */
2663
+static unsigned long lockless_pages_from_mm(unsigned long start,
2664
+ unsigned long end,
2665
+ unsigned int gup_flags,
2666
+ struct page **pages)
2667
+{
2668
+ unsigned long flags;
2669
+ int nr_pinned = 0;
2670
+ unsigned seq;
2671
+
2672
+ if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
2673
+ !gup_fast_permitted(start, end))
2674
+ return 0;
2675
+
2676
+ if (gup_flags & FOLL_PIN) {
2677
+ seq = raw_read_seqcount(&current->mm->write_protect_seq);
2678
+ if (seq & 1)
2679
+ return 0;
2680
+ }
2681
+
2682
+ /*
2683
+ * Disable interrupts. The nested form is used, in order to allow full,
2684
+ * general purpose use of this routine.
2685
+ *
2686
+ * With interrupts disabled, we block page table pages from being freed
2687
+ * from under us. See struct mmu_table_batch comments in
2688
+ * include/asm-generic/tlb.h for more details.
2689
+ *
2690
+ * We do not adopt an rcu_read_lock() here as we also want to block IPIs
2691
+ * that come from THPs splitting.
2692
+ */
2693
+ local_irq_save(flags);
2694
+ gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
2695
+ local_irq_restore(flags);
2696
+
2697
+ /*
2698
+ * When pinning pages for DMA there could be a concurrent write protect
2699
+	 * from fork() via copy_page_range(); in that case, always fail fast GUP.
2700
+ */
2701
+ if (gup_flags & FOLL_PIN) {
2702
+ if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
2703
+ unpin_user_pages(pages, nr_pinned);
2704
+ return 0;
2705
+ }
2706
+ }
2707
+ return nr_pinned;
2708
+}
2709
+
2710
+static int internal_get_user_pages_fast(unsigned long start,
2711
+ unsigned long nr_pages,
2712
+ unsigned int gup_flags,
2713
+ struct page **pages)
2714
+{
2715
+ unsigned long len, end;
2716
+ unsigned long nr_pinned;
2717
+ int ret;
2718
+
2719
+ if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
2720
+ FOLL_FORCE | FOLL_PIN | FOLL_GET |
2721
+ FOLL_FAST_ONLY)))
2722
+ return -EINVAL;
2723
+
2724
+ if (gup_flags & FOLL_PIN)
2725
+ atomic_set(&current->mm->has_pinned, 1);
2726
+
2727
+ if (!(gup_flags & FOLL_FAST_ONLY))
2728
+ might_lock_read(&current->mm->mmap_lock);
2729
+
2730
+ start = untagged_addr(start) & PAGE_MASK;
2731
+ len = nr_pages << PAGE_SHIFT;
2732
+ if (check_add_overflow(start, len, &end))
2733
+ return 0;
2734
+ if (unlikely(!access_ok((void __user *)start, len)))
2735
+ return -EFAULT;
2736
+
2737
+ nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
2738
+ if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
2739
+ return nr_pinned;
2740
+
2741
+ /* Slow path: try to get the remaining pages with get_user_pages */
2742
+ start += nr_pinned << PAGE_SHIFT;
2743
+ pages += nr_pinned;
2744
+ ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
2745
+ pages);
2746
+ if (ret < 0) {
2747
+ /*
2748
+		 * The caller has to unpin the pages we already pinned, so
2749
+		 * returning -errno is not an option.
2750
+ */
2751
+ if (nr_pinned)
2752
+ return nr_pinned;
2753
+ return ret;
2754
+ }
2755
+ return ret + nr_pinned;
2756
+}
2757
+
2758
+/**
2759
+ * get_user_pages_fast_only() - pin user pages in memory
2760
+ * @start: starting user address
2761
+ * @nr_pages: number of pages from start to pin
2762
+ * @gup_flags: flags modifying pin behaviour
2763
+ * @pages: array that receives pointers to the pages pinned.
2764
+ * Should be at least nr_pages long.
2765
+ *
2766
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
2767
+ * the regular GUP.
2768
+ * Note a difference from get_user_pages_fast(): this always returns the
2769
+ * number of pages pinned, or 0 if no pages were pinned.
2770
+ *
2771
+ * If the architecture does not support this function, simply return with no
2772
+ * pages pinned.
2773
+ *
2774
+ * Careful, careful! COW breaking can go either way, so a non-write
2775
+ * access can get ambiguous page results. If you call this function without
2776
+ * 'write' set, you'd better be sure that you're ok with that ambiguity.
2777
+ */
2778
+int get_user_pages_fast_only(unsigned long start, int nr_pages,
2779
+ unsigned int gup_flags, struct page **pages)
2780
+{
2781
+ int nr_pinned;
2782
+ /*
2783
+ * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
2784
+ * because gup fast is always a "pin with a +1 page refcount" request.
2785
+ *
2786
+ * FOLL_FAST_ONLY is required in order to match the API description of
2787
+ * this routine: no fall back to regular ("slow") GUP.
2788
+ */
2789
+ gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
2790
+
2791
+ nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
2792
+ pages);
2793
+
2794
+ /*
2795
+ * As specified in the API description above, this routine is not
2796
+ * allowed to return negative values. However, the common core
2797
+ * routine internal_get_user_pages_fast() *can* return -errno.
2798
+ * Therefore, correct for that here:
2799
+ */
2800
+ if (nr_pinned < 0)
2801
+ nr_pinned = 0;
2802
+
2803
+ return nr_pinned;
2804
+}
2805
+EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
2806
+
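For context, a minimal sketch of the intended use: an opportunistic attempt that never takes mmap_lock and never returns an error. The example_try_fast_only() helper is hypothetical; a real caller would fall back to a sleepable GUP path when it returns NULL.

#include <linux/mm.h>

/* Illustrative sketch only: try to grab one page without ever sleeping. */
static struct page *example_try_fast_only(unsigned long uaddr, bool write)
{
	unsigned int flags = write ? FOLL_WRITE : 0;
	struct page *page;

	/* Returns the number of pages pinned (0 or 1 here), never -errno. */
	if (get_user_pages_fast_only(uaddr & PAGE_MASK, 1, flags, &page) != 1)
		return NULL;

	return page;	/* release with put_page() when done */
}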
2807
+/**
2808
+ * get_user_pages_fast() - pin user pages in memory
2809
+ * @start: starting user address
2810
+ * @nr_pages: number of pages from start to pin
2811
+ * @gup_flags: flags modifying pin behaviour
2812
+ * @pages: array that receives pointers to the pages pinned.
2813
+ * Should be at least nr_pages long.
2814
+ *
2815
+ * Attempt to pin user pages in memory without taking mm->mmap_lock.
2816
+ * If not successful, it will fall back to taking the lock and
2817
+ * calling get_user_pages().
2818
+ *
2819
+ * Returns number of pages pinned. This may be fewer than the number requested.
2820
+ * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
2821
+ * -errno.
2822
+ */
2823
+int get_user_pages_fast(unsigned long start, int nr_pages,
2824
+ unsigned int gup_flags, struct page **pages)
2825
+{
2826
+ if (!is_valid_gup_flags(gup_flags))
2827
+ return -EINVAL;
2828
+
2829
+ /*
2830
+ * The caller may or may not have explicitly set FOLL_GET; either way is
2831
+ * OK. However, internally (within mm/gup.c), gup fast variants must set
2832
+ * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
2833
+ * request.
2834
+ */
2835
+ gup_flags |= FOLL_GET;
2836
+ return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
2837
+}
2838
+EXPORT_SYMBOL_GPL(get_user_pages_fast);
2839
+
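For context, a minimal sketch of handling the "fewer pages than requested" return documented above by treating a short pin as failure. The example_pin_all_or_nothing() helper is hypothetical.

#include <linux/mm.h>

/* Illustrative sketch only: pin exactly @nr_pages or nothing at all. */
static int example_pin_all_or_nothing(unsigned long uaddr, int nr_pages,
				      struct page **pages)
{
	int pinned, i;

	pinned = get_user_pages_fast(uaddr & PAGE_MASK, nr_pages, FOLL_WRITE,
				     pages);
	if (pinned == nr_pages)
		return 0;

	/* Short pin or -errno: drop whatever was pinned and report failure. */
	for (i = 0; i < pinned; i++)
		put_page(pages[i]);
	return pinned < 0 ? pinned : -EFAULT;
}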
2840
+/**
2841
+ * pin_user_pages_fast() - pin user pages in memory without taking locks
2842
+ *
2843
+ * @start: starting user address
2844
+ * @nr_pages: number of pages from start to pin
2845
+ * @gup_flags: flags modifying pin behaviour
2846
+ * @pages: array that receives pointers to the pages pinned.
2847
+ * Should be at least nr_pages long.
2848
+ *
2849
+ * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
2850
+ * get_user_pages_fast() for documentation on the function arguments, because
2851
+ * the arguments here are identical.
2852
+ *
2853
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
2854
+ * see Documentation/core-api/pin_user_pages.rst for further details.
2855
+ */
2856
+int pin_user_pages_fast(unsigned long start, int nr_pages,
2857
+ unsigned int gup_flags, struct page **pages)
2858
+{
2859
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2860
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2861
+ return -EINVAL;
2862
+
2863
+ gup_flags |= FOLL_PIN;
2864
+ return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
2865
+}
2866
+EXPORT_SYMBOL_GPL(pin_user_pages_fast);
2867
+
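For context, a minimal sketch of the FOLL_PIN lifecycle for a long-lived DMA buffer: pages pinned this way are released with the unpin_user_page*() family, never put_page(). The example_pin_for_dma() helper is hypothetical, and FOLL_LONGTERM is assumed here because the device holds the pages across a long-running operation.

#include <linux/mm.h>

/* Illustrative sketch only: pin user pages for device DMA, then unpin. */
static long example_pin_for_dma(unsigned long uaddr, int nr_pages,
				struct page **pages)
{
	long pinned;

	pinned = pin_user_pages_fast(uaddr & PAGE_MASK, nr_pages,
				     FOLL_WRITE | FOLL_LONGTERM, pages);
	if (pinned < 0)
		return pinned;
	if (pinned != nr_pages) {
		unpin_user_pages(pages, pinned);
		return -EFAULT;
	}

	/* ... map @pages for DMA and let the device write into them ... */

	/* The device dirtied the pages, so mark them dirty while unpinning. */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	return 0;
}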
2868
+/*
2869
+ * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
2870
+ * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
2871
+ *
2872
+ * The API rules are the same, too: no negative values may be returned.
2873
+ */
2874
+int pin_user_pages_fast_only(unsigned long start, int nr_pages,
2875
+ unsigned int gup_flags, struct page **pages)
2876
+{
2877
+ int nr_pinned;
2878
+
2879
+ /*
2880
+ * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
2881
+ * rules require returning 0, rather than -errno:
2882
+ */
2883
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2884
+ return 0;
2885
+ /*
2886
+ * FOLL_FAST_ONLY is required in order to match the API description of
2887
+ * this routine: no fall back to regular ("slow") GUP.
2888
+ */
2889
+ gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
2890
+ nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
2891
+ pages);
2892
+ /*
2893
+ * This routine is not allowed to return negative values. However,
2894
+ * internal_get_user_pages_fast() *can* return -errno. Therefore,
2895
+ * correct for that here:
2896
+ */
2897
+ if (nr_pinned < 0)
2898
+ nr_pinned = 0;
2899
+
2900
+ return nr_pinned;
2901
+}
2902
+EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
2903
+
2904
+/**
2905
+ * pin_user_pages_remote() - pin pages of a remote process
2906
+ *
2907
+ * @mm: mm_struct of target mm
2908
+ * @start: starting user address
2909
+ * @nr_pages: number of pages from start to pin
2910
+ * @gup_flags: flags modifying lookup behaviour
2911
+ * @pages: array that receives pointers to the pages pinned.
2912
+ * Should be at least nr_pages long. Or NULL, if caller
2913
+ * only intends to ensure the pages are faulted in.
2914
+ * @vmas: array of pointers to vmas corresponding to each page.
2915
+ * Or NULL if the caller does not require them.
2916
+ * @locked: pointer to lock flag indicating whether lock is held and
2917
+ * subsequently whether VM_FAULT_RETRY functionality can be
2918
+ * utilised. Lock must initially be held.
2919
+ *
2920
+ * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
2921
+ * get_user_pages_remote() for documentation on the function arguments, because
2922
+ * the arguments here are identical.
2923
+ *
2924
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
2925
+ * see Documentation/core-api/pin_user_pages.rst for details.
2926
+ */
2927
+long pin_user_pages_remote(struct mm_struct *mm,
2928
+ unsigned long start, unsigned long nr_pages,
2929
+ unsigned int gup_flags, struct page **pages,
2930
+ struct vm_area_struct **vmas, int *locked)
2931
+{
2932
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2933
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2934
+ return -EINVAL;
2935
+
2936
+ gup_flags |= FOLL_PIN;
2937
+ return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
2938
+ pages, vmas, locked);
2939
+}
2940
+EXPORT_SYMBOL(pin_user_pages_remote);
2941
+
2942
+/**
2943
+ * pin_user_pages() - pin user pages in memory for use by other devices
2944
+ *
2945
+ * @start: starting user address
2946
+ * @nr_pages: number of pages from start to pin
2947
+ * @gup_flags: flags modifying lookup behaviour
2948
+ * @pages: array that receives pointers to the pages pinned.
2949
+ * Should be at least nr_pages long. Or NULL, if caller
2950
+ * only intends to ensure the pages are faulted in.
2951
+ * @vmas: array of pointers to vmas corresponding to each page.
2952
+ * Or NULL if the caller does not require them.
2953
+ *
2954
+ * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
2955
+ * FOLL_PIN is set.
2956
+ *
2957
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
2958
+ * see Documentation/core-api/pin_user_pages.rst for details.
2959
+ */
2960
+long pin_user_pages(unsigned long start, unsigned long nr_pages,
2961
+ unsigned int gup_flags, struct page **pages,
2962
+ struct vm_area_struct **vmas)
2963
+{
2964
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2965
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2966
+ return -EINVAL;
2967
+
2968
+ gup_flags |= FOLL_PIN;
2969
+ return __gup_longterm_locked(current->mm, start, nr_pages,
2970
+ pages, vmas, gup_flags);
2971
+}
2972
+EXPORT_SYMBOL(pin_user_pages);
2973
+
2974
+/*
2975
+ * pin_user_pages_unlocked() is the FOLL_PIN variant of
2976
+ * get_user_pages_unlocked(). Behavior is the same, except that this one sets
2977
+ * FOLL_PIN and rejects FOLL_GET.
2978
+ */
2979
+long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
2980
+ struct page **pages, unsigned int gup_flags)
2981
+{
2982
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
2983
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
2984
+ return -EINVAL;
2985
+
2986
+ gup_flags |= FOLL_PIN;
2987
+ return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
2988
+}
2989
+EXPORT_SYMBOL(pin_user_pages_unlocked);
2990
+
2991
+/*
2992
+ * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
2993
+ * Behavior is the same, except that this one sets FOLL_PIN and rejects
2994
+ * FOLL_GET.
2995
+ */
2996
+long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
2997
+ unsigned int gup_flags, struct page **pages,
2998
+ int *locked)
2999
+{
3000
+ /*
3001
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
3002
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
3003
+ * vmas. As there are no users of this flag in this call we simply
3004
+ * disallow this option for now.
3005
+ */
3006
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
3007
+ return -EINVAL;
3008
+
3009
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
3010
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
3011
+ return -EINVAL;
3012
+
3013
+ gup_flags |= FOLL_PIN;
3014
+ return __get_user_pages_locked(current->mm, start, nr_pages,
3015
+ pages, NULL, locked,
3016
+ gup_flags | FOLL_TOUCH);
3017
+}
3018
+EXPORT_SYMBOL(pin_user_pages_locked);