hc
2024-09-20 cf4ce59b3b70238352c7f1729f0f7223214828ad
kernel/mm/hmm.c
....@@ -1,23 +1,14 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * Copyright 2013 Red Hat Inc.
34 *
4
- * This program is free software; you can redistribute it and/or modify
5
- * it under the terms of the GNU General Public License as published by
6
- * the Free Software Foundation; either version 2 of the License, or
7
- * (at your option) any later version.
8
- *
9
- * This program is distributed in the hope that it will be useful,
10
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
- * GNU General Public License for more details.
13
- *
14
- * Authors: Jérôme Glisse <jglisse@redhat.com>
5
+ * Authors: Jérôme Glisse <jglisse@redhat.com>
156 */
167 /*
178 * Refer to include/linux/hmm.h for information about heterogeneous memory
189 * management or HMM for short.
1910 */
20
-#include <linux/mm.h>
11
+#include <linux/pagewalk.h>
2112 #include <linux/hmm.h>
2213 #include <linux/init.h>
2314 #include <linux/rmap.h>
....@@ -29,545 +20,300 @@
2920 #include <linux/swapops.h>
3021 #include <linux/hugetlb.h>
3122 #include <linux/memremap.h>
23
+#include <linux/sched/mm.h>
3224 #include <linux/jump_label.h>
25
+#include <linux/dma-mapping.h>
3326 #include <linux/mmu_notifier.h>
3427 #include <linux/memory_hotplug.h>
35
-
36
-#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
37
-
38
-#if IS_ENABLED(CONFIG_HMM_MIRROR)
39
-static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
40
-
41
-/*
42
- * struct hmm - HMM per mm struct
43
- *
44
- * @mm: mm struct this HMM struct is bound to
45
- * @lock: lock protecting ranges list
46
- * @sequence: we track updates to the CPU page table with a sequence number
47
- * @ranges: list of range being snapshotted
48
- * @mirrors: list of mirrors for this mm
49
- * @mmu_notifier: mmu notifier to track updates to CPU page table
50
- * @mirrors_sem: read/write semaphore protecting the mirrors list
51
- */
52
-struct hmm {
53
- struct mm_struct *mm;
54
- spinlock_t lock;
55
- atomic_t sequence;
56
- struct list_head ranges;
57
- struct list_head mirrors;
58
- struct mmu_notifier mmu_notifier;
59
- struct rw_semaphore mirrors_sem;
60
-};
61
-
62
-/*
63
- * hmm_register - register HMM against an mm (HMM internal)
64
- *
65
- * @mm: mm struct to attach to
66
- *
67
- * This is not intended to be used directly by device drivers. It allocates an
68
- * HMM struct if mm does not have one, and initializes it.
69
- */
70
-static struct hmm *hmm_register(struct mm_struct *mm)
71
-{
72
- struct hmm *hmm = READ_ONCE(mm->hmm);
73
- bool cleanup = false;
74
-
75
- /*
76
- * The hmm struct can only be freed once the mm_struct goes away,
77
- * hence we should always have pre-allocated an new hmm struct
78
- * above.
79
- */
80
- if (hmm)
81
- return hmm;
82
-
83
- hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
84
- if (!hmm)
85
- return NULL;
86
- INIT_LIST_HEAD(&hmm->mirrors);
87
- init_rwsem(&hmm->mirrors_sem);
88
- atomic_set(&hmm->sequence, 0);
89
- hmm->mmu_notifier.ops = NULL;
90
- INIT_LIST_HEAD(&hmm->ranges);
91
- spin_lock_init(&hmm->lock);
92
- hmm->mm = mm;
93
-
94
- spin_lock(&mm->page_table_lock);
95
- if (!mm->hmm)
96
- mm->hmm = hmm;
97
- else
98
- cleanup = true;
99
- spin_unlock(&mm->page_table_lock);
100
-
101
- if (cleanup)
102
- goto error;
103
-
104
- /*
105
- * We should only get here if hold the mmap_sem in write mode ie on
106
- * registration of first mirror through hmm_mirror_register()
107
- */
108
- hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
109
- if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
110
- goto error_mm;
111
-
112
- return mm->hmm;
113
-
114
-error_mm:
115
- spin_lock(&mm->page_table_lock);
116
- if (mm->hmm == hmm)
117
- mm->hmm = NULL;
118
- spin_unlock(&mm->page_table_lock);
119
-error:
120
- kfree(hmm);
121
- return NULL;
122
-}
123
-
124
-void hmm_mm_destroy(struct mm_struct *mm)
125
-{
126
- kfree(mm->hmm);
127
-}
128
-
129
-static void hmm_invalidate_range(struct hmm *hmm,
130
- enum hmm_update_type action,
131
- unsigned long start,
132
- unsigned long end)
133
-{
134
- struct hmm_mirror *mirror;
135
- struct hmm_range *range;
136
-
137
- spin_lock(&hmm->lock);
138
- list_for_each_entry(range, &hmm->ranges, list) {
139
- unsigned long addr, idx, npages;
140
-
141
- if (end < range->start || start >= range->end)
142
- continue;
143
-
144
- range->valid = false;
145
- addr = max(start, range->start);
146
- idx = (addr - range->start) >> PAGE_SHIFT;
147
- npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
148
- memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
149
- }
150
- spin_unlock(&hmm->lock);
151
-
152
- down_read(&hmm->mirrors_sem);
153
- list_for_each_entry(mirror, &hmm->mirrors, list)
154
- mirror->ops->sync_cpu_device_pagetables(mirror, action,
155
- start, end);
156
- up_read(&hmm->mirrors_sem);
157
-}
158
-
159
-static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
160
-{
161
- struct hmm_mirror *mirror;
162
- struct hmm *hmm = mm->hmm;
163
-
164
- down_write(&hmm->mirrors_sem);
165
- mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
166
- list);
167
- while (mirror) {
168
- list_del_init(&mirror->list);
169
- if (mirror->ops->release) {
170
- /*
171
- * Drop mirrors_sem so callback can wait on any pending
172
- * work that might itself trigger mmu_notifier callback
173
- * and thus would deadlock with us.
174
- */
175
- up_write(&hmm->mirrors_sem);
176
- mirror->ops->release(mirror);
177
- down_write(&hmm->mirrors_sem);
178
- }
179
- mirror = list_first_entry_or_null(&hmm->mirrors,
180
- struct hmm_mirror, list);
181
- }
182
- up_write(&hmm->mirrors_sem);
183
-}
184
-
185
-static int hmm_invalidate_range_start(struct mmu_notifier *mn,
186
- struct mm_struct *mm,
187
- unsigned long start,
188
- unsigned long end,
189
- bool blockable)
190
-{
191
- struct hmm *hmm = mm->hmm;
192
-
193
- VM_BUG_ON(!hmm);
194
-
195
- atomic_inc(&hmm->sequence);
196
-
197
- return 0;
198
-}
199
-
200
-static void hmm_invalidate_range_end(struct mmu_notifier *mn,
201
- struct mm_struct *mm,
202
- unsigned long start,
203
- unsigned long end)
204
-{
205
- struct hmm *hmm = mm->hmm;
206
-
207
- VM_BUG_ON(!hmm);
208
-
209
- hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
210
-}
211
-
212
-static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
213
- .release = hmm_release,
214
- .invalidate_range_start = hmm_invalidate_range_start,
215
- .invalidate_range_end = hmm_invalidate_range_end,
216
-};
217
-
218
-/*
219
- * hmm_mirror_register() - register a mirror against an mm
220
- *
221
- * @mirror: new mirror struct to register
222
- * @mm: mm to register against
223
- *
224
- * To start mirroring a process address space, the device driver must register
225
- * an HMM mirror struct.
226
- *
227
- * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
228
- */
229
-int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
230
-{
231
- /* Sanity check */
232
- if (!mm || !mirror || !mirror->ops)
233
- return -EINVAL;
234
-
235
-again:
236
- mirror->hmm = hmm_register(mm);
237
- if (!mirror->hmm)
238
- return -ENOMEM;
239
-
240
- down_write(&mirror->hmm->mirrors_sem);
241
- if (mirror->hmm->mm == NULL) {
242
- /*
243
- * A racing hmm_mirror_unregister() is about to destroy the hmm
244
- * struct. Try again to allocate a new one.
245
- */
246
- up_write(&mirror->hmm->mirrors_sem);
247
- mirror->hmm = NULL;
248
- goto again;
249
- } else {
250
- list_add(&mirror->list, &mirror->hmm->mirrors);
251
- up_write(&mirror->hmm->mirrors_sem);
252
- }
253
-
254
- return 0;
255
-}
256
-EXPORT_SYMBOL(hmm_mirror_register);
257
-
258
-/*
259
- * hmm_mirror_unregister() - unregister a mirror
260
- *
261
- * @mirror: new mirror struct to register
262
- *
263
- * Stop mirroring a process address space, and cleanup.
264
- */
265
-void hmm_mirror_unregister(struct hmm_mirror *mirror)
266
-{
267
- bool should_unregister = false;
268
- struct mm_struct *mm;
269
- struct hmm *hmm;
270
-
271
- if (mirror->hmm == NULL)
272
- return;
273
-
274
- hmm = mirror->hmm;
275
- down_write(&hmm->mirrors_sem);
276
- list_del_init(&mirror->list);
277
- should_unregister = list_empty(&hmm->mirrors);
278
- mirror->hmm = NULL;
279
- mm = hmm->mm;
280
- hmm->mm = NULL;
281
- up_write(&hmm->mirrors_sem);
282
-
283
- if (!should_unregister || mm == NULL)
284
- return;
285
-
286
- mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
287
-
288
- spin_lock(&mm->page_table_lock);
289
- if (mm->hmm == hmm)
290
- mm->hmm = NULL;
291
- spin_unlock(&mm->page_table_lock);
292
-
293
- kfree(hmm);
294
-}
295
-EXPORT_SYMBOL(hmm_mirror_unregister);
29628
29729 struct hmm_vma_walk {
29830 struct hmm_range *range;
29931 unsigned long last;
300
- bool fault;
301
- bool block;
30232 };
30333
304
-static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
305
- bool write_fault, uint64_t *pfn)
34
+enum {
35
+ HMM_NEED_FAULT = 1 << 0,
36
+ HMM_NEED_WRITE_FAULT = 1 << 1,
37
+ HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
38
+};
39
+
40
+static int hmm_pfns_fill(unsigned long addr, unsigned long end,
41
+ struct hmm_range *range, unsigned long cpu_flags)
30642 {
307
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
308
- struct hmm_vma_walk *hmm_vma_walk = walk->private;
309
- struct hmm_range *range = hmm_vma_walk->range;
310
- struct vm_area_struct *vma = walk->vma;
311
- vm_fault_t ret;
43
+ unsigned long i = (addr - range->start) >> PAGE_SHIFT;
31244
313
- flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
314
- flags |= write_fault ? FAULT_FLAG_WRITE : 0;
315
- ret = handle_mm_fault(vma, addr, flags);
316
- if (ret & VM_FAULT_RETRY)
317
- return -EBUSY;
318
- if (ret & VM_FAULT_ERROR) {
319
- *pfn = range->values[HMM_PFN_ERROR];
320
- return -EFAULT;
321
- }
322
-
323
- return -EAGAIN;
324
-}
325
-
326
-static int hmm_pfns_bad(unsigned long addr,
327
- unsigned long end,
328
- struct mm_walk *walk)
329
-{
330
- struct hmm_vma_walk *hmm_vma_walk = walk->private;
331
- struct hmm_range *range = hmm_vma_walk->range;
332
- uint64_t *pfns = range->pfns;
333
- unsigned long i;
334
-
335
- i = (addr - range->start) >> PAGE_SHIFT;
33645 for (; addr < end; addr += PAGE_SIZE, i++)
337
- pfns[i] = range->values[HMM_PFN_ERROR];
338
-
46
+ range->hmm_pfns[i] = cpu_flags;
33947 return 0;
34048 }
34149
34250 /*
343
- * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s)
344
- * @start: range virtual start address (inclusive)
51
+ * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
52
+ * @addr: range virtual start address (inclusive)
34553 * @end: range virtual end address (exclusive)
346
- * @fault: should we fault or not ?
347
- * @write_fault: write fault ?
54
+ * @required_fault: HMM_NEED_* flags
34855 * @walk: mm_walk structure
349
- * Returns: 0 on success, -EAGAIN after page fault, or page fault error
56
+ * Return: -EBUSY after page fault, or page fault error
35057 *
35158 * This function will be called whenever pmd_none() or pte_none() returns true,
35259 * or whenever there is no page directory covering the virtual address range.
35360 */
354
-static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
355
- bool fault, bool write_fault,
356
- struct mm_walk *walk)
61
+static int hmm_vma_fault(unsigned long addr, unsigned long end,
62
+ unsigned int required_fault, struct mm_walk *walk)
35763 {
35864 struct hmm_vma_walk *hmm_vma_walk = walk->private;
359
- struct hmm_range *range = hmm_vma_walk->range;
360
- uint64_t *pfns = range->pfns;
361
- unsigned long i;
65
+ struct vm_area_struct *vma = walk->vma;
66
+ unsigned int fault_flags = FAULT_FLAG_REMOTE;
36267
68
+ WARN_ON_ONCE(!required_fault);
36369 hmm_vma_walk->last = addr;
364
- i = (addr - range->start) >> PAGE_SHIFT;
365
- for (; addr < end; addr += PAGE_SIZE, i++) {
366
- pfns[i] = range->values[HMM_PFN_NONE];
367
- if (fault || write_fault) {
368
- int ret;
36970
370
- ret = hmm_vma_do_fault(walk, addr, write_fault,
371
- &pfns[i]);
372
- if (ret != -EAGAIN)
373
- return ret;
374
- }
71
+ if (required_fault & HMM_NEED_WRITE_FAULT) {
72
+ if (!(vma->vm_flags & VM_WRITE))
73
+ return -EPERM;
74
+ fault_flags |= FAULT_FLAG_WRITE;
37575 }
37676
377
- return (fault || write_fault) ? -EAGAIN : 0;
77
+ for (; addr < end; addr += PAGE_SIZE)
78
+ if (handle_mm_fault(vma, addr, fault_flags, NULL) &
79
+ VM_FAULT_ERROR)
80
+ return -EFAULT;
81
+ return -EBUSY;
37882 }
37983
380
-static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
381
- uint64_t pfns, uint64_t cpu_flags,
382
- bool *fault, bool *write_fault)
84
+static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
85
+ unsigned long pfn_req_flags,
86
+ unsigned long cpu_flags)
38387 {
38488 struct hmm_range *range = hmm_vma_walk->range;
38589
386
- *fault = *write_fault = false;
387
- if (!hmm_vma_walk->fault)
388
- return;
90
+ /*
91
+ * So we not only consider the individual per page request we also
92
+ * consider the default flags requested for the range. The API can
93
+ * be used 2 ways. The first one where the HMM user coalesces
94
+ * multiple page faults into one request and sets flags per pfn for
95
+ * those faults. The second one where the HMM user wants to pre-
96
+ * fault a range with specific flags. For the latter one it is a
97
+ * waste to have the user pre-fill the pfn arrays with a default
98
+ * flags value.
99
+ */
100
+ pfn_req_flags &= range->pfn_flags_mask;
101
+ pfn_req_flags |= range->default_flags;
389102
390103 /* We aren't ask to do anything ... */
391
- if (!(pfns & range->flags[HMM_PFN_VALID]))
392
- return;
393
- /* If this is device memory than only fault if explicitly requested */
394
- if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
395
- /* Do we fault on device memory ? */
396
- if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
397
- *write_fault = pfns & range->flags[HMM_PFN_WRITE];
398
- *fault = true;
399
- }
400
- return;
401
- }
104
+ if (!(pfn_req_flags & HMM_PFN_REQ_FAULT))
105
+ return 0;
106
+
107
+ /* Need to write fault ? */
108
+ if ((pfn_req_flags & HMM_PFN_REQ_WRITE) &&
109
+ !(cpu_flags & HMM_PFN_WRITE))
110
+ return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;
402111
403112 /* If CPU page table is not valid then we need to fault */
404
- *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
405
- /* Need to write fault ? */
406
- if ((pfns & range->flags[HMM_PFN_WRITE]) &&
407
- !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
408
- *write_fault = true;
409
- *fault = true;
410
- }
113
+ if (!(cpu_flags & HMM_PFN_VALID))
114
+ return HMM_NEED_FAULT;
115
+ return 0;
411116 }
412117
413
-static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
414
- const uint64_t *pfns, unsigned long npages,
415
- uint64_t cpu_flags, bool *fault,
416
- bool *write_fault)
118
+static unsigned int
119
+hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
120
+ const unsigned long hmm_pfns[], unsigned long npages,
121
+ unsigned long cpu_flags)
417122 {
123
+ struct hmm_range *range = hmm_vma_walk->range;
124
+ unsigned int required_fault = 0;
418125 unsigned long i;
419126
420
- if (!hmm_vma_walk->fault) {
421
- *fault = *write_fault = false;
422
- return;
423
- }
127
+ /*
128
+ * If the default flags do not request to fault pages, and the mask does
129
+ * not allow for individual pages to be faulted, then
130
+ * hmm_pte_need_fault() will always return 0.
131
+ */
132
+ if (!((range->default_flags | range->pfn_flags_mask) &
133
+ HMM_PFN_REQ_FAULT))
134
+ return 0;
424135
425136 for (i = 0; i < npages; ++i) {
426
- hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
427
- fault, write_fault);
428
- if ((*fault) || (*write_fault))
429
- return;
137
+ required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i],
138
+ cpu_flags);
139
+ if (required_fault == HMM_NEED_ALL_BITS)
140
+ return required_fault;
430141 }
142
+ return required_fault;
431143 }
432144
433145 static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
434
- struct mm_walk *walk)
146
+ __always_unused int depth, struct mm_walk *walk)
435147 {
436148 struct hmm_vma_walk *hmm_vma_walk = walk->private;
437149 struct hmm_range *range = hmm_vma_walk->range;
438
- bool fault, write_fault;
150
+ unsigned int required_fault;
439151 unsigned long i, npages;
440
- uint64_t *pfns;
152
+ unsigned long *hmm_pfns;
441153
442154 i = (addr - range->start) >> PAGE_SHIFT;
443155 npages = (end - addr) >> PAGE_SHIFT;
444
- pfns = &range->pfns[i];
445
- hmm_range_need_fault(hmm_vma_walk, pfns, npages,
446
- 0, &fault, &write_fault);
447
- return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
156
+ hmm_pfns = &range->hmm_pfns[i];
157
+ required_fault =
158
+ hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0);
159
+ if (!walk->vma) {
160
+ if (required_fault)
161
+ return -EFAULT;
162
+ return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
163
+ }
164
+ if (required_fault)
165
+ return hmm_vma_fault(addr, end, required_fault, walk);
166
+ return hmm_pfns_fill(addr, end, range, 0);
448167 }
449168
450
-static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
169
+static inline unsigned long hmm_pfn_flags_order(unsigned long order)
170
+{
171
+ return order << HMM_PFN_ORDER_SHIFT;
172
+}
173
+
174
+static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
175
+ pmd_t pmd)
451176 {
452177 if (pmd_protnone(pmd))
453178 return 0;
454
- return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
455
- range->flags[HMM_PFN_WRITE] :
456
- range->flags[HMM_PFN_VALID];
179
+ return (pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
180
+ HMM_PFN_VALID) |
181
+ hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
457182 }
458183
459
-static int hmm_vma_handle_pmd(struct mm_walk *walk,
460
- unsigned long addr,
461
- unsigned long end,
462
- uint64_t *pfns,
184
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
185
+static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
186
+ unsigned long end, unsigned long hmm_pfns[],
463187 pmd_t pmd)
464188 {
465189 struct hmm_vma_walk *hmm_vma_walk = walk->private;
466190 struct hmm_range *range = hmm_vma_walk->range;
467191 unsigned long pfn, npages, i;
468
- bool fault, write_fault;
469
- uint64_t cpu_flags;
192
+ unsigned int required_fault;
193
+ unsigned long cpu_flags;
470194
471195 npages = (end - addr) >> PAGE_SHIFT;
472196 cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
473
- hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
474
- &fault, &write_fault);
197
+ required_fault =
198
+ hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags);
199
+ if (required_fault)
200
+ return hmm_vma_fault(addr, end, required_fault, walk);
475201
476
- if (pmd_protnone(pmd) || fault || write_fault)
477
- return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
478
-
479
- pfn = pmd_pfn(pmd) + pte_index(addr);
202
+ pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
480203 for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
481
- pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
482
- hmm_vma_walk->last = end;
204
+ hmm_pfns[i] = pfn | cpu_flags;
483205 return 0;
484206 }
207
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
208
+/* stub to allow the code below to compile */
209
+int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
210
+ unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
211
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
485212
486
-static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
213
+static inline bool hmm_is_device_private_entry(struct hmm_range *range,
214
+ swp_entry_t entry)
487215 {
488
- if (pte_none(pte) || !pte_present(pte))
216
+ return is_device_private_entry(entry) &&
217
+ device_private_entry_to_page(entry)->pgmap->owner ==
218
+ range->dev_private_owner;
219
+}
220
+
221
+static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
222
+ pte_t pte)
223
+{
224
+ if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
489225 return 0;
490
- return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
491
- range->flags[HMM_PFN_WRITE] :
492
- range->flags[HMM_PFN_VALID];
226
+ return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
493227 }
494228
495229 static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
496230 unsigned long end, pmd_t *pmdp, pte_t *ptep,
497
- uint64_t *pfn)
231
+ unsigned long *hmm_pfn)
498232 {
499233 struct hmm_vma_walk *hmm_vma_walk = walk->private;
500234 struct hmm_range *range = hmm_vma_walk->range;
501
- struct vm_area_struct *vma = walk->vma;
502
- bool fault, write_fault;
503
- uint64_t cpu_flags;
235
+ unsigned int required_fault;
236
+ unsigned long cpu_flags;
504237 pte_t pte = *ptep;
505
- uint64_t orig_pfn = *pfn;
506
-
507
- *pfn = range->values[HMM_PFN_NONE];
508
- cpu_flags = pte_to_hmm_pfn_flags(range, pte);
509
- hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
510
- &fault, &write_fault);
238
+ uint64_t pfn_req_flags = *hmm_pfn;
511239
512240 if (pte_none(pte)) {
513
- if (fault || write_fault)
241
+ required_fault =
242
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
243
+ if (required_fault)
514244 goto fault;
245
+ *hmm_pfn = 0;
515246 return 0;
516247 }
517248
518249 if (!pte_present(pte)) {
519250 swp_entry_t entry = pte_to_swp_entry(pte);
520251
521
- if (!non_swap_entry(entry)) {
522
- if (fault || write_fault)
523
- goto fault;
252
+ /*
253
+ * Never fault in device private pages, but just report
254
+ * the PFN even if not present.
255
+ */
256
+ if (hmm_is_device_private_entry(range, entry)) {
257
+ cpu_flags = HMM_PFN_VALID;
258
+ if (is_write_device_private_entry(entry))
259
+ cpu_flags |= HMM_PFN_WRITE;
260
+ *hmm_pfn = device_private_entry_to_pfn(entry) |
261
+ cpu_flags;
524262 return 0;
525263 }
526264
527
- /*
528
- * This is a special swap entry, ignore migration, use
529
- * device and report anything else as error.
530
- */
531
- if (is_device_private_entry(entry)) {
532
- cpu_flags = range->flags[HMM_PFN_VALID] |
533
- range->flags[HMM_PFN_DEVICE_PRIVATE];
534
- cpu_flags |= is_write_device_private_entry(entry) ?
535
- range->flags[HMM_PFN_WRITE] : 0;
536
- hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
537
- &fault, &write_fault);
538
- if (fault || write_fault)
539
- goto fault;
540
- *pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
541
- *pfn |= cpu_flags;
265
+ required_fault =
266
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
267
+ if (!required_fault) {
268
+ *hmm_pfn = 0;
542269 return 0;
543270 }
271
+
272
+ if (!non_swap_entry(entry))
273
+ goto fault;
544274
545275 if (is_migration_entry(entry)) {
546
- if (fault || write_fault) {
547
- pte_unmap(ptep);
548
- hmm_vma_walk->last = addr;
549
- migration_entry_wait(vma->vm_mm,
550
- pmdp, addr);
551
- return -EAGAIN;
552
- }
553
- return 0;
276
+ pte_unmap(ptep);
277
+ hmm_vma_walk->last = addr;
278
+ migration_entry_wait(walk->mm, pmdp, addr);
279
+ return -EBUSY;
554280 }
555281
556282 /* Report error for everything else */
557
- *pfn = range->values[HMM_PFN_ERROR];
283
+ pte_unmap(ptep);
558284 return -EFAULT;
559285 }
560286
561
- if (fault || write_fault)
287
+ cpu_flags = pte_to_hmm_pfn_flags(range, pte);
288
+ required_fault =
289
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
290
+ if (required_fault)
562291 goto fault;
563292
564
- *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
293
+ /*
294
+ * Bypass devmap pte such as DAX page when all pfn requested
295
+ * flags(pfn_req_flags) are fulfilled.
296
+ * Since each architecture defines a struct page for the zero page, just
297
+ * fall through and treat it like a normal page.
298
+ */
299
+ if (!vm_normal_page(walk->vma, addr, pte) &&
300
+ !pte_devmap(pte) &&
301
+ !is_zero_pfn(pte_pfn(pte))) {
302
+ if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
303
+ pte_unmap(ptep);
304
+ return -EFAULT;
305
+ }
306
+ *hmm_pfn = HMM_PFN_ERROR;
307
+ return 0;
308
+ }
309
+
310
+ *hmm_pfn = pte_pfn(pte) | cpu_flags;
565311 return 0;
566312
567313 fault:
568314 pte_unmap(ptep);
569315 /* Fault any virtual address we were asked to fault */
570
- return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
316
+ return hmm_vma_fault(addr, end, required_fault, walk);
571317 }
572318
573319 static int hmm_vma_walk_pmd(pmd_t *pmdp,
....@@ -577,28 +323,40 @@
577323 {
578324 struct hmm_vma_walk *hmm_vma_walk = walk->private;
579325 struct hmm_range *range = hmm_vma_walk->range;
580
- uint64_t *pfns = range->pfns;
581
- unsigned long addr = start, i;
326
+ unsigned long *hmm_pfns =
327
+ &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
328
+ unsigned long npages = (end - start) >> PAGE_SHIFT;
329
+ unsigned long addr = start;
582330 pte_t *ptep;
583
-
584
- i = (addr - range->start) >> PAGE_SHIFT;
331
+ pmd_t pmd;
585332
586333 again:
587
- if (pmd_none(*pmdp))
588
- return hmm_vma_walk_hole(start, end, walk);
334
+ pmd = READ_ONCE(*pmdp);
335
+ if (pmd_none(pmd))
336
+ return hmm_vma_walk_hole(start, end, -1, walk);
589337
590
- if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
591
- return hmm_pfns_bad(start, end, walk);
338
+ if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
339
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
340
+ hmm_vma_walk->last = addr;
341
+ pmd_migration_entry_wait(walk->mm, pmdp);
342
+ return -EBUSY;
343
+ }
344
+ return hmm_pfns_fill(start, end, range, 0);
345
+ }
592346
593
- if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
594
- pmd_t pmd;
347
+ if (!pmd_present(pmd)) {
348
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
349
+ return -EFAULT;
350
+ return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
351
+ }
595352
353
+ if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
596354 /*
597
- * No need to take pmd_lock here, even if some other threads
355
+ * No need to take pmd_lock here, even if some other thread
598356 * is splitting the huge pmd we will get that event through
599357 * mmu_notifier callback.
600358 *
601
- * So just read pmd value and check again its a transparent
359
+ * So just read pmd value and check again it's a transparent
602360 * huge or device mapping one and compute corresponding pfn
603361 * values.
604362 */
....@@ -607,742 +365,235 @@
607365 if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
608366 goto again;
609367
610
- return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
368
+ return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
611369 }
612370
613
- if (pmd_bad(*pmdp))
614
- return hmm_pfns_bad(start, end, walk);
371
+ /*
372
+ * We have handled all the valid cases above ie either none, migration,
373
+ * huge or transparent huge. At this point either it is a valid pmd
374
+ * entry pointing to pte directory or it is a bad pmd that will not
375
+ * recover.
376
+ */
377
+ if (pmd_bad(pmd)) {
378
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
379
+ return -EFAULT;
380
+ return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
381
+ }
615382
616383 ptep = pte_offset_map(pmdp, addr);
617
- for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
384
+ for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
618385 int r;
619386
620
- r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
387
+ r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
621388 if (r) {
622
- /* hmm_vma_handle_pte() did unmap pte directory */
623
- hmm_vma_walk->last = addr;
389
+ /* hmm_vma_handle_pte() did pte_unmap() */
624390 return r;
625391 }
626392 }
627393 pte_unmap(ptep - 1);
628
-
629
- hmm_vma_walk->last = addr;
630394 return 0;
631395 }
632396
633
-static void hmm_pfns_clear(struct hmm_range *range,
634
- uint64_t *pfns,
635
- unsigned long addr,
636
- unsigned long end)
397
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
398
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
399
+static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
400
+ pud_t pud)
637401 {
638
- for (; addr < end; addr += PAGE_SIZE, pfns++)
639
- *pfns = range->values[HMM_PFN_NONE];
402
+ if (!pud_present(pud))
403
+ return 0;
404
+ return (pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
405
+ HMM_PFN_VALID) |
406
+ hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT);
640407 }
641408
642
-static void hmm_pfns_special(struct hmm_range *range)
409
+static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
410
+ struct mm_walk *walk)
643411 {
644
- unsigned long addr = range->start, i = 0;
412
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
413
+ struct hmm_range *range = hmm_vma_walk->range;
414
+ unsigned long addr = start;
415
+ pud_t pud;
416
+ int ret = 0;
417
+ spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
645418
646
- for (; addr < range->end; addr += PAGE_SIZE, i++)
647
- range->pfns[i] = range->values[HMM_PFN_SPECIAL];
419
+ if (!ptl)
420
+ return 0;
421
+
422
+ /* Normally we don't want to split the huge page */
423
+ walk->action = ACTION_CONTINUE;
424
+
425
+ pud = READ_ONCE(*pudp);
426
+ if (pud_none(pud)) {
427
+ spin_unlock(ptl);
428
+ return hmm_vma_walk_hole(start, end, -1, walk);
429
+ }
430
+
431
+ if (pud_huge(pud) && pud_devmap(pud)) {
432
+ unsigned long i, npages, pfn;
433
+ unsigned int required_fault;
434
+ unsigned long *hmm_pfns;
435
+ unsigned long cpu_flags;
436
+
437
+ if (!pud_present(pud)) {
438
+ spin_unlock(ptl);
439
+ return hmm_vma_walk_hole(start, end, -1, walk);
440
+ }
441
+
442
+ i = (addr - range->start) >> PAGE_SHIFT;
443
+ npages = (end - addr) >> PAGE_SHIFT;
444
+ hmm_pfns = &range->hmm_pfns[i];
445
+
446
+ cpu_flags = pud_to_hmm_pfn_flags(range, pud);
447
+ required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
448
+ npages, cpu_flags);
449
+ if (required_fault) {
450
+ spin_unlock(ptl);
451
+ return hmm_vma_fault(addr, end, required_fault, walk);
452
+ }
453
+
454
+ pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
455
+ for (i = 0; i < npages; ++i, ++pfn)
456
+ hmm_pfns[i] = pfn | cpu_flags;
457
+ goto out_unlock;
458
+ }
459
+
460
+ /* Ask for the PUD to be split */
461
+ walk->action = ACTION_SUBTREE;
462
+
463
+out_unlock:
464
+ spin_unlock(ptl);
465
+ return ret;
648466 }
467
+#else
468
+#define hmm_vma_walk_pud NULL
469
+#endif
649470
650
-/*
651
- * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
652
- * @range: range being snapshotted
653
- * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
654
- * vma permission, 0 success
655
- *
656
- * This snapshots the CPU page table for a range of virtual addresses. Snapshot
657
- * validity is tracked by range struct. See hmm_vma_range_done() for further
658
- * information.
659
- *
660
- * The range struct is initialized here. It tracks the CPU page table, but only
661
- * if the function returns success (0), in which case the caller must then call
662
- * hmm_vma_range_done() to stop CPU page table update tracking on this range.
663
- *
664
- * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
665
- * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
666
- */
667
-int hmm_vma_get_pfns(struct hmm_range *range)
471
+#ifdef CONFIG_HUGETLB_PAGE
472
+static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
473
+ unsigned long start, unsigned long end,
474
+ struct mm_walk *walk)
668475 {
669
- struct vm_area_struct *vma = range->vma;
670
- struct hmm_vma_walk hmm_vma_walk;
671
- struct mm_walk mm_walk;
672
- struct hmm *hmm;
476
+ unsigned long addr = start, i, pfn;
477
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
478
+ struct hmm_range *range = hmm_vma_walk->range;
479
+ struct vm_area_struct *vma = walk->vma;
480
+ unsigned int required_fault;
481
+ unsigned long pfn_req_flags;
482
+ unsigned long cpu_flags;
483
+ spinlock_t *ptl;
484
+ pte_t entry;
673485
674
- /* Sanity check, this really should not happen ! */
675
- if (range->start < vma->vm_start || range->start >= vma->vm_end)
676
- return -EINVAL;
677
- if (range->end < vma->vm_start || range->end > vma->vm_end)
678
- return -EINVAL;
486
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
487
+ entry = huge_ptep_get(pte);
679488
680
- hmm = hmm_register(vma->vm_mm);
681
- if (!hmm)
682
- return -ENOMEM;
683
- /* Caller must have registered a mirror, via hmm_mirror_register() ! */
684
- if (!hmm->mmu_notifier.ops)
685
- return -EINVAL;
686
-
687
- /* FIXME support hugetlb fs */
688
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
689
- vma_is_dax(vma)) {
690
- hmm_pfns_special(range);
691
- return -EINVAL;
489
+ i = (start - range->start) >> PAGE_SHIFT;
490
+ pfn_req_flags = range->hmm_pfns[i];
491
+ cpu_flags = pte_to_hmm_pfn_flags(range, entry) |
492
+ hmm_pfn_flags_order(huge_page_order(hstate_vma(vma)));
493
+ required_fault =
494
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
495
+ if (required_fault) {
496
+ spin_unlock(ptl);
497
+ return hmm_vma_fault(addr, end, required_fault, walk);
692498 }
693499
694
- if (!(vma->vm_flags & VM_READ)) {
695
- /*
696
- * If vma do not allow read access, then assume that it does
697
- * not allow write access, either. Architecture that allow
698
- * write without read access are not supported by HMM, because
699
- * operations such has atomic access would not work.
700
- */
701
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
702
- return -EPERM;
703
- }
500
+ pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
501
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
502
+ range->hmm_pfns[i] = pfn | cpu_flags;
704503
705
- /* Initialize range to track CPU page table update */
706
- spin_lock(&hmm->lock);
707
- range->valid = true;
708
- list_add_rcu(&range->list, &hmm->ranges);
709
- spin_unlock(&hmm->lock);
710
-
711
- hmm_vma_walk.fault = false;
712
- hmm_vma_walk.range = range;
713
- mm_walk.private = &hmm_vma_walk;
714
-
715
- mm_walk.vma = vma;
716
- mm_walk.mm = vma->vm_mm;
717
- mm_walk.pte_entry = NULL;
718
- mm_walk.test_walk = NULL;
719
- mm_walk.hugetlb_entry = NULL;
720
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
721
- mm_walk.pte_hole = hmm_vma_walk_hole;
722
-
723
- walk_page_range(range->start, range->end, &mm_walk);
504
+ spin_unlock(ptl);
724505 return 0;
725506 }
726
-EXPORT_SYMBOL(hmm_vma_get_pfns);
507
+#else
508
+#define hmm_vma_walk_hugetlb_entry NULL
509
+#endif /* CONFIG_HUGETLB_PAGE */
727510
728
-/*
729
- * hmm_vma_range_done() - stop tracking change to CPU page table over a range
730
- * @range: range being tracked
731
- * Returns: false if range data has been invalidated, true otherwise
732
- *
733
- * Range struct is used to track updates to the CPU page table after a call to
734
- * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
735
- * using the data, or wants to lock updates to the data it got from those
736
- * functions, it must call the hmm_vma_range_done() function, which will then
737
- * stop tracking CPU page table updates.
738
- *
739
- * Note that device driver must still implement general CPU page table update
740
- * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using
741
- * the mmu_notifier API directly.
742
- *
743
- * CPU page table update tracking done through hmm_range is only temporary and
744
- * to be used while trying to duplicate CPU page table contents for a range of
745
- * virtual addresses.
746
- *
747
- * There are two ways to use this :
748
- * again:
749
- * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
750
- * trans = device_build_page_table_update_transaction(pfns);
751
- * device_page_table_lock();
752
- * if (!hmm_vma_range_done(range)) {
753
- * device_page_table_unlock();
754
- * goto again;
755
- * }
756
- * device_commit_transaction(trans);
757
- * device_page_table_unlock();
758
- *
759
- * Or:
760
- * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
761
- * device_page_table_lock();
762
- * hmm_vma_range_done(range);
763
- * device_update_page_table(range->pfns);
764
- * device_page_table_unlock();
765
- */
766
-bool hmm_vma_range_done(struct hmm_range *range)
511
+static int hmm_vma_walk_test(unsigned long start, unsigned long end,
512
+ struct mm_walk *walk)
767513 {
768
- unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
769
- struct hmm *hmm;
514
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
515
+ struct hmm_range *range = hmm_vma_walk->range;
516
+ struct vm_area_struct *vma = walk->vma;
770517
771
- if (range->end <= range->start) {
772
- BUG();
773
- return false;
774
- }
518
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
519
+ vma->vm_flags & VM_READ)
520
+ return 0;
775521
776
- hmm = hmm_register(range->vma->vm_mm);
777
- if (!hmm) {
778
- memset(range->pfns, 0, sizeof(*range->pfns) * npages);
779
- return false;
780
- }
522
+ /*
523
+ * vma ranges that don't have struct page backing them or map I/O
524
+ * devices directly cannot be handled by hmm_range_fault().
525
+ *
526
+ * If the vma does not allow read access, then assume that it does not
527
+ * allow write access either. HMM does not support architectures that
528
+ * allow write without read.
529
+ *
530
+ * If a fault is requested for an unsupported range then it is a hard
531
+ * failure.
532
+ */
533
+ if (hmm_range_need_fault(hmm_vma_walk,
534
+ range->hmm_pfns +
535
+ ((start - range->start) >> PAGE_SHIFT),
536
+ (end - start) >> PAGE_SHIFT, 0))
537
+ return -EFAULT;
781538
782
- spin_lock(&hmm->lock);
783
- list_del_rcu(&range->list);
784
- spin_unlock(&hmm->lock);
539
+ hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
785540
786
- return range->valid;
541
+ /* Skip this vma and continue processing the next vma. */
542
+ return 1;
787543 }
788
-EXPORT_SYMBOL(hmm_vma_range_done);
789544
790
-/*
791
- * hmm_vma_fault() - try to fault some address in a virtual address range
792
- * @range: range being faulted
793
- * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
794
- * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop)
545
+static const struct mm_walk_ops hmm_walk_ops = {
546
+ .pud_entry = hmm_vma_walk_pud,
547
+ .pmd_entry = hmm_vma_walk_pmd,
548
+ .pte_hole = hmm_vma_walk_hole,
549
+ .hugetlb_entry = hmm_vma_walk_hugetlb_entry,
550
+ .test_walk = hmm_vma_walk_test,
551
+};
552
+
553
+/**
554
+ * hmm_range_fault - try to fault some address in a virtual address range
555
+ * @range: argument structure
795556 *
796
- * This is similar to a regular CPU page fault except that it will not trigger
797
- * any memory migration if the memory being faulted is not accessible by CPUs.
557
+ * Returns 0 on success or one of the following error codes:
798558 *
799
- * On error, for one virtual address in the range, the function will mark the
800
- * corresponding HMM pfn entry with an error flag.
559
+ * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma
560
+ * (e.g., device file vma).
561
+ * -ENOMEM: Out of memory.
562
+ * -EPERM: Invalid permission (e.g., asking for write and range is read
563
+ * only).
564
+ * -EBUSY: The range has been invalidated and the caller needs to wait for
565
+ * the invalidation to finish.
566
+ * -EFAULT: A page was requested to be valid and could not be made valid
567
+ * i.e., it has no backing VMA or it is illegal to access
801568 *
802
- * Expected use pattern:
803
- * retry:
804
- * down_read(&mm->mmap_sem);
805
- * // Find vma and address device wants to fault, initialize hmm_pfn_t
806
- * // array accordingly
807
- * ret = hmm_vma_fault(range, write, block);
808
- * switch (ret) {
809
- * case -EAGAIN:
810
- * hmm_vma_range_done(range);
811
- * // You might want to rate limit or yield to play nicely, you may
812
- * // also commit any valid pfn in the array assuming that you are
813
- * // getting true from hmm_vma_range_monitor_end()
814
- * goto retry;
815
- * case 0:
816
- * break;
817
- * case -ENOMEM:
818
- * case -EINVAL:
819
- * case -EPERM:
820
- * default:
821
- * // Handle error !
822
- * up_read(&mm->mmap_sem)
823
- * return;
824
- * }
825
- * // Take device driver lock that serialize device page table update
826
- * driver_lock_device_page_table_update();
827
- * hmm_vma_range_done(range);
828
- * // Commit pfns we got from hmm_vma_fault()
829
- * driver_unlock_device_page_table_update();
830
- * up_read(&mm->mmap_sem)
831
- *
832
- * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0)
833
- * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
834
- *
835
- * YOU HAVE BEEN WARNED !
569
+ * This is similar to get_user_pages(), except that it can read the page tables
570
+ * without mutating them (i.e., causing faults).
836571 */
837
-int hmm_vma_fault(struct hmm_range *range, bool block)
572
+int hmm_range_fault(struct hmm_range *range)
838573 {
839
- struct vm_area_struct *vma = range->vma;
840
- unsigned long start = range->start;
841
- struct hmm_vma_walk hmm_vma_walk;
842
- struct mm_walk mm_walk;
843
- struct hmm *hmm;
574
+ struct hmm_vma_walk hmm_vma_walk = {
575
+ .range = range,
576
+ .last = range->start,
577
+ };
578
+ struct mm_struct *mm = range->notifier->mm;
844579 int ret;
845580
846
- /* Sanity check, this really should not happen ! */
847
- if (range->start < vma->vm_start || range->start >= vma->vm_end)
848
- return -EINVAL;
849
- if (range->end < vma->vm_start || range->end > vma->vm_end)
850
- return -EINVAL;
851
-
852
- hmm = hmm_register(vma->vm_mm);
853
- if (!hmm) {
854
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
855
- return -ENOMEM;
856
- }
857
- /* Caller must have registered a mirror using hmm_mirror_register() */
858
- if (!hmm->mmu_notifier.ops)
859
- return -EINVAL;
860
-
861
- /* FIXME support hugetlb fs */
862
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
863
- vma_is_dax(vma)) {
864
- hmm_pfns_special(range);
865
- return -EINVAL;
866
- }
867
-
868
- if (!(vma->vm_flags & VM_READ)) {
869
- /*
870
- * If vma do not allow read access, then assume that it does
871
- * not allow write access, either. Architecture that allow
872
- * write without read access are not supported by HMM, because
873
- * operations such has atomic access would not work.
874
- */
875
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
876
- return -EPERM;
877
- }
878
-
879
- /* Initialize range to track CPU page table update */
880
- spin_lock(&hmm->lock);
881
- range->valid = true;
882
- list_add_rcu(&range->list, &hmm->ranges);
883
- spin_unlock(&hmm->lock);
884
-
885
- hmm_vma_walk.fault = true;
886
- hmm_vma_walk.block = block;
887
- hmm_vma_walk.range = range;
888
- mm_walk.private = &hmm_vma_walk;
889
- hmm_vma_walk.last = range->start;
890
-
891
- mm_walk.vma = vma;
892
- mm_walk.mm = vma->vm_mm;
893
- mm_walk.pte_entry = NULL;
894
- mm_walk.test_walk = NULL;
895
- mm_walk.hugetlb_entry = NULL;
896
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
897
- mm_walk.pte_hole = hmm_vma_walk_hole;
581
+ mmap_assert_locked(mm);
898582
899583 do {
900
- ret = walk_page_range(start, range->end, &mm_walk);
901
- start = hmm_vma_walk.last;
902
- } while (ret == -EAGAIN);
903
-
904
- if (ret) {
905
- unsigned long i;
906
-
907
- i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
908
- hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
909
- range->end);
910
- hmm_vma_range_done(range);
911
- }
584
+ /* If range is no longer valid force retry. */
585
+ if (mmu_interval_check_retry(range->notifier,
586
+ range->notifier_seq))
587
+ return -EBUSY;
588
+ ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
589
+ &hmm_walk_ops, &hmm_vma_walk);
590
+ /*
591
+ * When -EBUSY is returned the loop restarts with
592
+ * hmm_vma_walk.last set to an address that has not been stored
593
+ * in pfns. All entries < last in the pfn array are set to their
594
+ * output, and all >= are still at their input values.
595
+ */
596
+ } while (ret == -EBUSY);
912597 return ret;
913598 }
914
-EXPORT_SYMBOL(hmm_vma_fault);
915
-#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
916
-
917
-
918
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
919
-struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
920
- unsigned long addr)
921
-{
922
- struct page *page;
923
-
924
- page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
925
- if (!page)
926
- return NULL;
927
- lock_page(page);
928
- return page;
929
-}
930
-EXPORT_SYMBOL(hmm_vma_alloc_locked_page);
931
-
932
-
933
-static void hmm_devmem_ref_release(struct percpu_ref *ref)
934
-{
935
- struct hmm_devmem *devmem;
936
-
937
- devmem = container_of(ref, struct hmm_devmem, ref);
938
- complete(&devmem->completion);
939
-}
940
-
941
-static void hmm_devmem_ref_exit(void *data)
942
-{
943
- struct percpu_ref *ref = data;
944
- struct hmm_devmem *devmem;
945
-
946
- devmem = container_of(ref, struct hmm_devmem, ref);
947
- percpu_ref_exit(ref);
948
-}
949
-
950
-static void hmm_devmem_ref_kill(void *data)
951
-{
952
- struct percpu_ref *ref = data;
953
- struct hmm_devmem *devmem;
954
-
955
- devmem = container_of(ref, struct hmm_devmem, ref);
956
- percpu_ref_kill(ref);
957
- wait_for_completion(&devmem->completion);
958
-}
959
-
960
-static int hmm_devmem_fault(struct vm_area_struct *vma,
961
- unsigned long addr,
962
- const struct page *page,
963
- unsigned int flags,
964
- pmd_t *pmdp)
965
-{
966
- struct hmm_devmem *devmem = page->pgmap->data;
967
-
968
- return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
969
-}
970
-
971
-static void hmm_devmem_free(struct page *page, void *data)
972
-{
973
- struct hmm_devmem *devmem = data;
974
-
975
- page->mapping = NULL;
976
-
977
- devmem->ops->free(devmem, page);
978
-}
979
-
980
-static DEFINE_MUTEX(hmm_devmem_lock);
981
-static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
982
-
983
-static void hmm_devmem_radix_release(struct resource *resource)
984
-{
985
- resource_size_t key;
986
-
987
- mutex_lock(&hmm_devmem_lock);
988
- for (key = resource->start;
989
- key <= resource->end;
990
- key += PA_SECTION_SIZE)
991
- radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
992
- mutex_unlock(&hmm_devmem_lock);
993
-}
994
-
995
-static void hmm_devmem_release(void *data)
996
-{
997
- struct hmm_devmem *devmem = data;
998
- struct resource *resource = devmem->resource;
999
- unsigned long start_pfn, npages;
1000
- struct page *page;
1001
- int nid;
1002
-
1003
- /* pages are dead and unused, undo the arch mapping */
1004
- start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
1005
- npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;
1006
-
1007
- page = pfn_to_page(start_pfn);
1008
- nid = page_to_nid(page);
1009
-
1010
- mem_hotplug_begin();
1011
- if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
1012
- __remove_pages(start_pfn, npages, NULL);
1013
- else
1014
- arch_remove_memory(nid, start_pfn << PAGE_SHIFT,
1015
- npages << PAGE_SHIFT, NULL);
1016
- mem_hotplug_done();
1017
-
1018
- hmm_devmem_radix_release(resource);
1019
-}
1020
-
1021
-static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
1022
-{
1023
- resource_size_t key, align_start, align_size, align_end;
1024
- struct device *device = devmem->device;
1025
- int ret, nid, is_ram;
1026
- unsigned long pfn;
1027
-
1028
- align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
1029
- align_size = ALIGN(devmem->resource->start +
1030
- resource_size(devmem->resource),
1031
- PA_SECTION_SIZE) - align_start;
1032
-
1033
- is_ram = region_intersects(align_start, align_size,
1034
- IORESOURCE_SYSTEM_RAM,
1035
- IORES_DESC_NONE);
1036
- if (is_ram == REGION_MIXED) {
1037
- WARN_ONCE(1, "%s attempted on mixed region %pr\n",
1038
- __func__, devmem->resource);
1039
- return -ENXIO;
1040
- }
1041
- if (is_ram == REGION_INTERSECTS)
1042
- return -ENXIO;
1043
-
1044
- if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
1045
- devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
1046
- else
1047
- devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
1048
-
1049
- devmem->pagemap.res = *devmem->resource;
1050
- devmem->pagemap.page_fault = hmm_devmem_fault;
1051
- devmem->pagemap.page_free = hmm_devmem_free;
1052
- devmem->pagemap.dev = devmem->device;
1053
- devmem->pagemap.ref = &devmem->ref;
1054
- devmem->pagemap.data = devmem;
1055
-
1056
- mutex_lock(&hmm_devmem_lock);
1057
- align_end = align_start + align_size - 1;
1058
- for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
1059
- struct hmm_devmem *dup;
1060
-
1061
- dup = radix_tree_lookup(&hmm_devmem_radix,
1062
- key >> PA_SECTION_SHIFT);
1063
- if (dup) {
1064
- dev_err(device, "%s: collides with mapping for %s\n",
1065
- __func__, dev_name(dup->device));
1066
- mutex_unlock(&hmm_devmem_lock);
1067
- ret = -EBUSY;
1068
- goto error;
1069
- }
1070
- ret = radix_tree_insert(&hmm_devmem_radix,
1071
- key >> PA_SECTION_SHIFT,
1072
- devmem);
1073
- if (ret) {
1074
- dev_err(device, "%s: failed: %d\n", __func__, ret);
1075
- mutex_unlock(&hmm_devmem_lock);
1076
- goto error_radix;
1077
- }
1078
- }
1079
- mutex_unlock(&hmm_devmem_lock);
1080
-
1081
- nid = dev_to_node(device);
1082
- if (nid < 0)
1083
- nid = numa_mem_id();
1084
-
1085
- mem_hotplug_begin();
1086
- /*
1087
- * For device private memory we call add_pages() as we only need to
1088
- * allocate and initialize struct page for the device memory. More-
1089
- * over the device memory is un-accessible thus we do not want to
1090
- * create a linear mapping for the memory like arch_add_memory()
1091
- * would do.
1092
- *
1093
- * For device public memory, which is accesible by the CPU, we do
1094
- * want the linear mapping and thus use arch_add_memory().
1095
- */
1096
- if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
1097
- ret = arch_add_memory(nid, align_start, align_size, NULL,
1098
- false);
1099
- else
1100
- ret = add_pages(nid, align_start >> PAGE_SHIFT,
1101
- align_size >> PAGE_SHIFT, NULL, false);
1102
- if (ret) {
1103
- mem_hotplug_done();
1104
- goto error_add_memory;
1105
- }
1106
- move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
1107
- align_start >> PAGE_SHIFT,
1108
- align_size >> PAGE_SHIFT, NULL);
1109
- mem_hotplug_done();
1110
-
1111
- for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
1112
- struct page *page = pfn_to_page(pfn);
1113
-
1114
- page->pgmap = &devmem->pagemap;
1115
- }
1116
- return 0;
1117
-
1118
-error_add_memory:
1119
- untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
1120
-error_radix:
1121
- hmm_devmem_radix_release(devmem->resource);
1122
-error:
1123
- return ret;
1124
-}
1125
-
1126
-/*
1127
- * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
1128
- *
1129
- * @ops: memory event device driver callback (see struct hmm_devmem_ops)
1130
- * @device: device struct to bind the resource too
1131
- * @size: size in bytes of the device memory to add
1132
- * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise
1133
- *
1134
- * This function first finds an empty range of physical address big enough to
1135
- * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
1136
- * in turn allocates struct pages. It does not do anything beyond that; all
1137
- * events affecting the memory will go through the various callbacks provided
1138
- * by hmm_devmem_ops struct.
1139
- *
1140
- * Device driver should call this function during device initialization and
1141
- * is then responsible of memory management. HMM only provides helpers.
1142
- */
1143
-struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
1144
- struct device *device,
1145
- unsigned long size)
1146
-{
1147
- struct hmm_devmem *devmem;
1148
- resource_size_t addr;
1149
- int ret;
1150
-
1151
- dev_pagemap_get_ops();
1152
-
1153
- devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
1154
- if (!devmem)
1155
- return ERR_PTR(-ENOMEM);
1156
-
1157
- init_completion(&devmem->completion);
1158
- devmem->pfn_first = -1UL;
1159
- devmem->pfn_last = -1UL;
1160
- devmem->resource = NULL;
1161
- devmem->device = device;
1162
- devmem->ops = ops;
1163
-
1164
- ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
1165
- 0, GFP_KERNEL);
1166
- if (ret)
1167
- return ERR_PTR(ret);
1168
-
1169
- ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref);
1170
- if (ret)
1171
- return ERR_PTR(ret);
1172
-
1173
- size = ALIGN(size, PA_SECTION_SIZE);
1174
- addr = min((unsigned long)iomem_resource.end,
1175
- (1UL << MAX_PHYSMEM_BITS) - 1);
1176
- addr = addr - size + 1UL;
1177
-
1178
- /*
1179
- * FIXME add a new helper to quickly walk resource tree and find free
1180
- * range
1181
- *
1182
- * FIXME what about ioport_resource resource ?
1183
- */
1184
- for (; addr > size && addr >= iomem_resource.start; addr -= size) {
1185
- ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
1186
- if (ret != REGION_DISJOINT)
1187
- continue;
1188
-
1189
- devmem->resource = devm_request_mem_region(device, addr, size,
1190
- dev_name(device));
1191
- if (!devmem->resource)
1192
- return ERR_PTR(-ENOMEM);
1193
- break;
1194
- }
1195
- if (!devmem->resource)
1196
- return ERR_PTR(-ERANGE);
1197
-
1198
- devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
1199
- devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
1200
- devmem->pfn_last = devmem->pfn_first +
1201
- (resource_size(devmem->resource) >> PAGE_SHIFT);
1202
-
1203
- ret = hmm_devmem_pages_create(devmem);
1204
- if (ret)
1205
- return ERR_PTR(ret);
1206
-
1207
- ret = devm_add_action_or_reset(device, hmm_devmem_release, devmem);
1208
- if (ret)
1209
- return ERR_PTR(ret);
1210
-
1211
- return devmem;
1212
-}
1213
-EXPORT_SYMBOL_GPL(hmm_devmem_add);
1214
-
1215
-struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
1216
- struct device *device,
1217
- struct resource *res)
1218
-{
1219
- struct hmm_devmem *devmem;
1220
- int ret;
1221
-
1222
- if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
1223
- return ERR_PTR(-EINVAL);
1224
-
1225
- dev_pagemap_get_ops();
1226
-
1227
- devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
1228
- if (!devmem)
1229
- return ERR_PTR(-ENOMEM);
1230
-
1231
- init_completion(&devmem->completion);
1232
- devmem->pfn_first = -1UL;
1233
- devmem->pfn_last = -1UL;
1234
- devmem->resource = res;
1235
- devmem->device = device;
1236
- devmem->ops = ops;
1237
-
1238
- ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
1239
- 0, GFP_KERNEL);
1240
- if (ret)
1241
- return ERR_PTR(ret);
1242
-
1243
- ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit,
1244
- &devmem->ref);
1245
- if (ret)
1246
- return ERR_PTR(ret);
1247
-
1248
- devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
1249
- devmem->pfn_last = devmem->pfn_first +
1250
- (resource_size(devmem->resource) >> PAGE_SHIFT);
1251
-
1252
- ret = hmm_devmem_pages_create(devmem);
1253
- if (ret)
1254
- return ERR_PTR(ret);
1255
-
1256
- ret = devm_add_action_or_reset(device, hmm_devmem_release, devmem);
1257
- if (ret)
1258
- return ERR_PTR(ret);
1259
-
1260
- ret = devm_add_action_or_reset(device, hmm_devmem_ref_kill,
1261
- &devmem->ref);
1262
- if (ret)
1263
- return ERR_PTR(ret);
1264
-
1265
- return devmem;
1266
-}
1267
-EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);
1268
-
1269
-/*
1270
- * A device driver that wants to handle multiple devices memory through a
1271
- * single fake device can use hmm_device to do so. This is purely a helper
1272
- * and it is not needed to make use of any HMM functionality.
1273
- */
1274
-#define HMM_DEVICE_MAX 256
1275
-
1276
-static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
1277
-static DEFINE_SPINLOCK(hmm_device_lock);
1278
-static struct class *hmm_device_class;
1279
-static dev_t hmm_device_devt;
1280
-
1281
-static void hmm_device_release(struct device *device)
1282
-{
1283
- struct hmm_device *hmm_device;
1284
-
1285
- hmm_device = container_of(device, struct hmm_device, device);
1286
- spin_lock(&hmm_device_lock);
1287
- clear_bit(hmm_device->minor, hmm_device_mask);
1288
- spin_unlock(&hmm_device_lock);
1289
-
1290
- kfree(hmm_device);
1291
-}
1292
-
1293
-struct hmm_device *hmm_device_new(void *drvdata)
1294
-{
1295
- struct hmm_device *hmm_device;
1296
-
1297
- hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
1298
- if (!hmm_device)
1299
- return ERR_PTR(-ENOMEM);
1300
-
1301
- spin_lock(&hmm_device_lock);
1302
- hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
1303
- if (hmm_device->minor >= HMM_DEVICE_MAX) {
1304
- spin_unlock(&hmm_device_lock);
1305
- kfree(hmm_device);
1306
- return ERR_PTR(-EBUSY);
1307
- }
1308
- set_bit(hmm_device->minor, hmm_device_mask);
1309
- spin_unlock(&hmm_device_lock);
1310
-
1311
- dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
1312
- hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
1313
- hmm_device->minor);
1314
- hmm_device->device.release = hmm_device_release;
1315
- dev_set_drvdata(&hmm_device->device, drvdata);
1316
- hmm_device->device.class = hmm_device_class;
1317
- device_initialize(&hmm_device->device);
1318
-
1319
- return hmm_device;
1320
-}
1321
-EXPORT_SYMBOL(hmm_device_new);
1322
-
1323
-void hmm_device_put(struct hmm_device *hmm_device)
1324
-{
1325
- put_device(&hmm_device->device);
1326
-}
1327
-EXPORT_SYMBOL(hmm_device_put);
1328
-
1329
-static int __init hmm_init(void)
1330
-{
1331
- int ret;
1332
-
1333
- ret = alloc_chrdev_region(&hmm_device_devt, 0,
1334
- HMM_DEVICE_MAX,
1335
- "hmm_device");
1336
- if (ret)
1337
- return ret;
1338
-
1339
- hmm_device_class = class_create(THIS_MODULE, "hmm_device");
1340
- if (IS_ERR(hmm_device_class)) {
1341
- unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
1342
- return PTR_ERR(hmm_device_class);
1343
- }
1344
- return 0;
1345
-}
1346
-
1347
-device_initcall(hmm_init);
1348
-#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
599
+EXPORT_SYMBOL(hmm_range_fault);