| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * Copyright 2013 Red Hat Inc. |
|---|
| 3 | 4 | * |
|---|
| 4 | | - * This program is free software; you can redistribute it and/or modify |
|---|
| 5 | | - * it under the terms of the GNU General Public License as published by |
|---|
| 6 | | - * the Free Software Foundation; either version 2 of the License, or |
|---|
| 7 | | - * (at your option) any later version. |
|---|
| 8 | | - * |
|---|
| 9 | | - * This program is distributed in the hope that it will be useful, |
|---|
| 10 | | - * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 11 | | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 12 | | - * GNU General Public License for more details. |
|---|
| 13 | | - * |
|---|
| 14 | | - * Authors: Jérôme Glisse <jglisse@redhat.com> |
|---|
| 5 | + * Authors: Jérôme Glisse <jglisse@redhat.com> |
|---|
| 15 | 6 | */ |
|---|
| 16 | 7 | /* |
|---|
| 17 | 8 | * Refer to include/linux/hmm.h for information about heterogeneous memory |
|---|
| 18 | 9 | * management or HMM for short. |
|---|
| 19 | 10 | */ |
|---|
| 20 | | -#include <linux/mm.h> |
|---|
| 11 | +#include <linux/pagewalk.h> |
|---|
| 21 | 12 | #include <linux/hmm.h> |
|---|
| 22 | 13 | #include <linux/init.h> |
|---|
| 23 | 14 | #include <linux/rmap.h> |
|---|
| .. | .. |
|---|
| 29 | 20 | #include <linux/swapops.h> |
|---|
| 30 | 21 | #include <linux/hugetlb.h> |
|---|
| 31 | 22 | #include <linux/memremap.h> |
|---|
| 23 | +#include <linux/sched/mm.h> |
|---|
| 32 | 24 | #include <linux/jump_label.h> |
|---|
| 25 | +#include <linux/dma-mapping.h> |
|---|
| 33 | 26 | #include <linux/mmu_notifier.h> |
|---|
| 34 | 27 | #include <linux/memory_hotplug.h> |
|---|
| 35 | | - |
|---|
| 36 | | -#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) |
|---|
| 37 | | - |
|---|
| 38 | | -#if IS_ENABLED(CONFIG_HMM_MIRROR) |
|---|
| 39 | | -static const struct mmu_notifier_ops hmm_mmu_notifier_ops; |
|---|
| 40 | | - |
|---|
| 41 | | -/* |
|---|
| 42 | | - * struct hmm - HMM per mm struct |
|---|
| 43 | | - * |
|---|
| 44 | | - * @mm: mm struct this HMM struct is bound to |
|---|
| 45 | | - * @lock: lock protecting ranges list |
|---|
| 46 | | - * @sequence: we track updates to the CPU page table with a sequence number |
|---|
| 47 | | - * @ranges: list of range being snapshotted |
|---|
| 48 | | - * @mirrors: list of mirrors for this mm |
|---|
| 49 | | - * @mmu_notifier: mmu notifier to track updates to CPU page table |
|---|
| 50 | | - * @mirrors_sem: read/write semaphore protecting the mirrors list |
|---|
| 51 | | - */ |
|---|
| 52 | | -struct hmm { |
|---|
| 53 | | - struct mm_struct *mm; |
|---|
| 54 | | - spinlock_t lock; |
|---|
| 55 | | - atomic_t sequence; |
|---|
| 56 | | - struct list_head ranges; |
|---|
| 57 | | - struct list_head mirrors; |
|---|
| 58 | | - struct mmu_notifier mmu_notifier; |
|---|
| 59 | | - struct rw_semaphore mirrors_sem; |
|---|
| 60 | | -}; |
|---|
| 61 | | - |
|---|
| 62 | | -/* |
|---|
| 63 | | - * hmm_register - register HMM against an mm (HMM internal) |
|---|
| 64 | | - * |
|---|
| 65 | | - * @mm: mm struct to attach to |
|---|
| 66 | | - * |
|---|
| 67 | | - * This is not intended to be used directly by device drivers. It allocates an |
|---|
| 68 | | - * HMM struct if mm does not have one, and initializes it. |
|---|
| 69 | | - */ |
|---|
| 70 | | -static struct hmm *hmm_register(struct mm_struct *mm) |
|---|
| 71 | | -{ |
|---|
| 72 | | - struct hmm *hmm = READ_ONCE(mm->hmm); |
|---|
| 73 | | - bool cleanup = false; |
|---|
| 74 | | - |
|---|
| 75 | | - /* |
|---|
| 76 | | - * The hmm struct can only be freed once the mm_struct goes away, |
|---|
| 77 | | - * hence we should always have pre-allocated an new hmm struct |
|---|
| 78 | | - * above. |
|---|
| 79 | | - */ |
|---|
| 80 | | - if (hmm) |
|---|
| 81 | | - return hmm; |
|---|
| 82 | | - |
|---|
| 83 | | - hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); |
|---|
| 84 | | - if (!hmm) |
|---|
| 85 | | - return NULL; |
|---|
| 86 | | - INIT_LIST_HEAD(&hmm->mirrors); |
|---|
| 87 | | - init_rwsem(&hmm->mirrors_sem); |
|---|
| 88 | | - atomic_set(&hmm->sequence, 0); |
|---|
| 89 | | - hmm->mmu_notifier.ops = NULL; |
|---|
| 90 | | - INIT_LIST_HEAD(&hmm->ranges); |
|---|
| 91 | | - spin_lock_init(&hmm->lock); |
|---|
| 92 | | - hmm->mm = mm; |
|---|
| 93 | | - |
|---|
| 94 | | - spin_lock(&mm->page_table_lock); |
|---|
| 95 | | - if (!mm->hmm) |
|---|
| 96 | | - mm->hmm = hmm; |
|---|
| 97 | | - else |
|---|
| 98 | | - cleanup = true; |
|---|
| 99 | | - spin_unlock(&mm->page_table_lock); |
|---|
| 100 | | - |
|---|
| 101 | | - if (cleanup) |
|---|
| 102 | | - goto error; |
|---|
| 103 | | - |
|---|
| 104 | | - /* |
|---|
| 105 | | - * We should only get here if hold the mmap_sem in write mode ie on |
|---|
| 106 | | - * registration of first mirror through hmm_mirror_register() |
|---|
| 107 | | - */ |
|---|
| 108 | | - hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; |
|---|
| 109 | | - if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) |
|---|
| 110 | | - goto error_mm; |
|---|
| 111 | | - |
|---|
| 112 | | - return mm->hmm; |
|---|
| 113 | | - |
|---|
| 114 | | -error_mm: |
|---|
| 115 | | - spin_lock(&mm->page_table_lock); |
|---|
| 116 | | - if (mm->hmm == hmm) |
|---|
| 117 | | - mm->hmm = NULL; |
|---|
| 118 | | - spin_unlock(&mm->page_table_lock); |
|---|
| 119 | | -error: |
|---|
| 120 | | - kfree(hmm); |
|---|
| 121 | | - return NULL; |
|---|
| 122 | | -} |
|---|
| 123 | | - |
|---|
| 124 | | -void hmm_mm_destroy(struct mm_struct *mm) |
|---|
| 125 | | -{ |
|---|
| 126 | | - kfree(mm->hmm); |
|---|
| 127 | | -} |
|---|
| 128 | | - |
|---|
| 129 | | -static void hmm_invalidate_range(struct hmm *hmm, |
|---|
| 130 | | - enum hmm_update_type action, |
|---|
| 131 | | - unsigned long start, |
|---|
| 132 | | - unsigned long end) |
|---|
| 133 | | -{ |
|---|
| 134 | | - struct hmm_mirror *mirror; |
|---|
| 135 | | - struct hmm_range *range; |
|---|
| 136 | | - |
|---|
| 137 | | - spin_lock(&hmm->lock); |
|---|
| 138 | | - list_for_each_entry(range, &hmm->ranges, list) { |
|---|
| 139 | | - unsigned long addr, idx, npages; |
|---|
| 140 | | - |
|---|
| 141 | | - if (end < range->start || start >= range->end) |
|---|
| 142 | | - continue; |
|---|
| 143 | | - |
|---|
| 144 | | - range->valid = false; |
|---|
| 145 | | - addr = max(start, range->start); |
|---|
| 146 | | - idx = (addr - range->start) >> PAGE_SHIFT; |
|---|
| 147 | | - npages = (min(range->end, end) - addr) >> PAGE_SHIFT; |
|---|
| 148 | | - memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); |
|---|
| 149 | | - } |
|---|
| 150 | | - spin_unlock(&hmm->lock); |
|---|
| 151 | | - |
|---|
| 152 | | - down_read(&hmm->mirrors_sem); |
|---|
| 153 | | - list_for_each_entry(mirror, &hmm->mirrors, list) |
|---|
| 154 | | - mirror->ops->sync_cpu_device_pagetables(mirror, action, |
|---|
| 155 | | - start, end); |
|---|
| 156 | | - up_read(&hmm->mirrors_sem); |
|---|
| 157 | | -} |
|---|
| 158 | | - |
|---|
| 159 | | -static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) |
|---|
| 160 | | -{ |
|---|
| 161 | | - struct hmm_mirror *mirror; |
|---|
| 162 | | - struct hmm *hmm = mm->hmm; |
|---|
| 163 | | - |
|---|
| 164 | | - down_write(&hmm->mirrors_sem); |
|---|
| 165 | | - mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, |
|---|
| 166 | | - list); |
|---|
| 167 | | - while (mirror) { |
|---|
| 168 | | - list_del_init(&mirror->list); |
|---|
| 169 | | - if (mirror->ops->release) { |
|---|
| 170 | | - /* |
|---|
| 171 | | - * Drop mirrors_sem so callback can wait on any pending |
|---|
| 172 | | - * work that might itself trigger mmu_notifier callback |
|---|
| 173 | | - * and thus would deadlock with us. |
|---|
| 174 | | - */ |
|---|
| 175 | | - up_write(&hmm->mirrors_sem); |
|---|
| 176 | | - mirror->ops->release(mirror); |
|---|
| 177 | | - down_write(&hmm->mirrors_sem); |
|---|
| 178 | | - } |
|---|
| 179 | | - mirror = list_first_entry_or_null(&hmm->mirrors, |
|---|
| 180 | | - struct hmm_mirror, list); |
|---|
| 181 | | - } |
|---|
| 182 | | - up_write(&hmm->mirrors_sem); |
|---|
| 183 | | -} |
|---|
| 184 | | - |
|---|
| 185 | | -static int hmm_invalidate_range_start(struct mmu_notifier *mn, |
|---|
| 186 | | - struct mm_struct *mm, |
|---|
| 187 | | - unsigned long start, |
|---|
| 188 | | - unsigned long end, |
|---|
| 189 | | - bool blockable) |
|---|
| 190 | | -{ |
|---|
| 191 | | - struct hmm *hmm = mm->hmm; |
|---|
| 192 | | - |
|---|
| 193 | | - VM_BUG_ON(!hmm); |
|---|
| 194 | | - |
|---|
| 195 | | - atomic_inc(&hmm->sequence); |
|---|
| 196 | | - |
|---|
| 197 | | - return 0; |
|---|
| 198 | | -} |
|---|
| 199 | | - |
|---|
| 200 | | -static void hmm_invalidate_range_end(struct mmu_notifier *mn, |
|---|
| 201 | | - struct mm_struct *mm, |
|---|
| 202 | | - unsigned long start, |
|---|
| 203 | | - unsigned long end) |
|---|
| 204 | | -{ |
|---|
| 205 | | - struct hmm *hmm = mm->hmm; |
|---|
| 206 | | - |
|---|
| 207 | | - VM_BUG_ON(!hmm); |
|---|
| 208 | | - |
|---|
| 209 | | - hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end); |
|---|
| 210 | | -} |
|---|
| 211 | | - |
|---|
| 212 | | -static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { |
|---|
| 213 | | - .release = hmm_release, |
|---|
| 214 | | - .invalidate_range_start = hmm_invalidate_range_start, |
|---|
| 215 | | - .invalidate_range_end = hmm_invalidate_range_end, |
|---|
| 216 | | -}; |
|---|
| 217 | | - |
|---|
| 218 | | -/* |
|---|
| 219 | | - * hmm_mirror_register() - register a mirror against an mm |
|---|
| 220 | | - * |
|---|
| 221 | | - * @mirror: new mirror struct to register |
|---|
| 222 | | - * @mm: mm to register against |
|---|
| 223 | | - * |
|---|
| 224 | | - * To start mirroring a process address space, the device driver must register |
|---|
| 225 | | - * an HMM mirror struct. |
|---|
| 226 | | - * |
|---|
| 227 | | - * THE mm->mmap_sem MUST BE HELD IN WRITE MODE ! |
|---|
| 228 | | - */ |
|---|
| 229 | | -int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) |
|---|
| 230 | | -{ |
|---|
| 231 | | - /* Sanity check */ |
|---|
| 232 | | - if (!mm || !mirror || !mirror->ops) |
|---|
| 233 | | - return -EINVAL; |
|---|
| 234 | | - |
|---|
| 235 | | -again: |
|---|
| 236 | | - mirror->hmm = hmm_register(mm); |
|---|
| 237 | | - if (!mirror->hmm) |
|---|
| 238 | | - return -ENOMEM; |
|---|
| 239 | | - |
|---|
| 240 | | - down_write(&mirror->hmm->mirrors_sem); |
|---|
| 241 | | - if (mirror->hmm->mm == NULL) { |
|---|
| 242 | | - /* |
|---|
| 243 | | - * A racing hmm_mirror_unregister() is about to destroy the hmm |
|---|
| 244 | | - * struct. Try again to allocate a new one. |
|---|
| 245 | | - */ |
|---|
| 246 | | - up_write(&mirror->hmm->mirrors_sem); |
|---|
| 247 | | - mirror->hmm = NULL; |
|---|
| 248 | | - goto again; |
|---|
| 249 | | - } else { |
|---|
| 250 | | - list_add(&mirror->list, &mirror->hmm->mirrors); |
|---|
| 251 | | - up_write(&mirror->hmm->mirrors_sem); |
|---|
| 252 | | - } |
|---|
| 253 | | - |
|---|
| 254 | | - return 0; |
|---|
| 255 | | -} |
|---|
| 256 | | -EXPORT_SYMBOL(hmm_mirror_register); |
|---|
| 257 | | - |
|---|
| 258 | | -/* |
|---|
| 259 | | - * hmm_mirror_unregister() - unregister a mirror |
|---|
| 260 | | - * |
|---|
| 261 | | - * @mirror: new mirror struct to register |
|---|
| 262 | | - * |
|---|
| 263 | | - * Stop mirroring a process address space, and cleanup. |
|---|
| 264 | | - */ |
|---|
| 265 | | -void hmm_mirror_unregister(struct hmm_mirror *mirror) |
|---|
| 266 | | -{ |
|---|
| 267 | | - bool should_unregister = false; |
|---|
| 268 | | - struct mm_struct *mm; |
|---|
| 269 | | - struct hmm *hmm; |
|---|
| 270 | | - |
|---|
| 271 | | - if (mirror->hmm == NULL) |
|---|
| 272 | | - return; |
|---|
| 273 | | - |
|---|
| 274 | | - hmm = mirror->hmm; |
|---|
| 275 | | - down_write(&hmm->mirrors_sem); |
|---|
| 276 | | - list_del_init(&mirror->list); |
|---|
| 277 | | - should_unregister = list_empty(&hmm->mirrors); |
|---|
| 278 | | - mirror->hmm = NULL; |
|---|
| 279 | | - mm = hmm->mm; |
|---|
| 280 | | - hmm->mm = NULL; |
|---|
| 281 | | - up_write(&hmm->mirrors_sem); |
|---|
| 282 | | - |
|---|
| 283 | | - if (!should_unregister || mm == NULL) |
|---|
| 284 | | - return; |
|---|
| 285 | | - |
|---|
| 286 | | - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); |
|---|
| 287 | | - |
|---|
| 288 | | - spin_lock(&mm->page_table_lock); |
|---|
| 289 | | - if (mm->hmm == hmm) |
|---|
| 290 | | - mm->hmm = NULL; |
|---|
| 291 | | - spin_unlock(&mm->page_table_lock); |
|---|
| 292 | | - |
|---|
| 293 | | - kfree(hmm); |
|---|
| 294 | | -} |
|---|
| 295 | | -EXPORT_SYMBOL(hmm_mirror_unregister); |
|---|
| 296 | 28 | |
|---|
| 297 | 29 | struct hmm_vma_walk { |
|---|
| 298 | 30 | struct hmm_range *range; |
|---|
| 299 | 31 | unsigned long last; |
|---|
| 300 | | - bool fault; |
|---|
| 301 | | - bool block; |
|---|
| 302 | 32 | }; |
|---|
| 303 | 33 | |
|---|
| 304 | | -static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, |
|---|
| 305 | | - bool write_fault, uint64_t *pfn) |
|---|
| 34 | +enum { |
|---|
| 35 | + HMM_NEED_FAULT = 1 << 0, |
|---|
| 36 | + HMM_NEED_WRITE_FAULT = 1 << 1, |
|---|
| 37 | + HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT, |
|---|
| 38 | +}; |
|---|
| 39 | + |
|---|
| 40 | +static int hmm_pfns_fill(unsigned long addr, unsigned long end, |
|---|
| 41 | + struct hmm_range *range, unsigned long cpu_flags) |
|---|
| 306 | 42 | { |
|---|
| 307 | | - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; |
|---|
| 308 | | - struct hmm_vma_walk *hmm_vma_walk = walk->private; |
|---|
| 309 | | - struct hmm_range *range = hmm_vma_walk->range; |
|---|
| 310 | | - struct vm_area_struct *vma = walk->vma; |
|---|
| 311 | | - vm_fault_t ret; |
|---|
| 43 | + unsigned long i = (addr - range->start) >> PAGE_SHIFT; |
|---|
| 312 | 44 | |
|---|
| 313 | | - flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; |
|---|
| 314 | | - flags |= write_fault ? FAULT_FLAG_WRITE : 0; |
|---|
| 315 | | - ret = handle_mm_fault(vma, addr, flags); |
|---|
| 316 | | - if (ret & VM_FAULT_RETRY) |
|---|
| 317 | | - return -EBUSY; |
|---|
| 318 | | - if (ret & VM_FAULT_ERROR) { |
|---|
| 319 | | - *pfn = range->values[HMM_PFN_ERROR]; |
|---|
| 320 | | - return -EFAULT; |
|---|
| 321 | | - } |
|---|
| 322 | | - |
|---|
| 323 | | - return -EAGAIN; |
|---|
| 324 | | -} |
|---|
| 325 | | - |
|---|
| 326 | | -static int hmm_pfns_bad(unsigned long addr, |
|---|
| 327 | | - unsigned long end, |
|---|
| 328 | | - struct mm_walk *walk) |
|---|
| 329 | | -{ |
|---|
| 330 | | - struct hmm_vma_walk *hmm_vma_walk = walk->private; |
|---|
| 331 | | - struct hmm_range *range = hmm_vma_walk->range; |
|---|
| 332 | | - uint64_t *pfns = range->pfns; |
|---|
| 333 | | - unsigned long i; |
|---|
| 334 | | - |
|---|
| 335 | | - i = (addr - range->start) >> PAGE_SHIFT; |
|---|
| 336 | 45 | for (; addr < end; addr += PAGE_SIZE, i++) |
|---|
| 337 | | - pfns[i] = range->values[HMM_PFN_ERROR]; |
|---|
| 338 | | - |
|---|
| 46 | + range->hmm_pfns[i] = cpu_flags; |
|---|
| 339 | 47 | return 0; |
|---|
| 340 | 48 | } |
|---|
| 341 | 49 | |
|---|
| 342 | 50 | /* |
|---|
| 343 | | - * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) |
|---|
| 344 | | - * @start: range virtual start address (inclusive) |
|---|
| 51 | + * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s) |
|---|
| 52 | + * @addr: range virtual start address (inclusive) |
|---|
| 345 | 53 | * @end: range virtual end address (exclusive) |
|---|
| 346 | | - * @fault: should we fault or not ? |
|---|
| 347 | | - * @write_fault: write fault ? |
|---|
| 54 | + * @required_fault: HMM_NEED_* flags |
|---|
| 348 | 55 | * @walk: mm_walk structure |
|---|
| 349 | | - * Returns: 0 on success, -EAGAIN after page fault, or page fault error |
|---|
| 56 | + * Return: -EBUSY after page fault, or page fault error |
|---|
| 350 | 57 | * |
|---|
| 351 | 58 | * This function will be called whenever pmd_none() or pte_none() returns true, |
|---|
| 352 | 59 | * or whenever there is no page directory covering the virtual address range. |
|---|
| 353 | 60 | */ |
|---|
| 354 | | -static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, |
|---|
| 355 | | - bool fault, bool write_fault, |
|---|
| 356 | | - struct mm_walk *walk) |
|---|
| 61 | +static int hmm_vma_fault(unsigned long addr, unsigned long end, |
|---|
| 62 | + unsigned int required_fault, struct mm_walk *walk) |
|---|
| 357 | 63 | { |
|---|
| 358 | 64 | struct hmm_vma_walk *hmm_vma_walk = walk->private; |
|---|
| 359 | | - struct hmm_range *range = hmm_vma_walk->range; |
|---|
| 360 | | - uint64_t *pfns = range->pfns; |
|---|
| 361 | | - unsigned long i; |
|---|
| 65 | + struct vm_area_struct *vma = walk->vma; |
|---|
| 66 | + unsigned int fault_flags = FAULT_FLAG_REMOTE; |
|---|
| 362 | 67 | |
|---|
| 68 | + WARN_ON_ONCE(!required_fault); |
|---|
| 363 | 69 | hmm_vma_walk->last = addr; |
|---|
| 364 | | - i = (addr - range->start) >> PAGE_SHIFT; |
|---|
| 365 | | - for (; addr < end; addr += PAGE_SIZE, i++) { |
|---|
| 366 | | - pfns[i] = range->values[HMM_PFN_NONE]; |
|---|
| 367 | | - if (fault || write_fault) { |
|---|
| 368 | | - int ret; |
|---|
| 369 | 70 | |
|---|
| 370 | | - ret = hmm_vma_do_fault(walk, addr, write_fault, |
|---|
| 371 | | - &pfns[i]); |
|---|
| 372 | | - if (ret != -EAGAIN) |
|---|
| 373 | | - return ret; |
|---|
| 374 | | - } |
|---|
| 71 | + if (required_fault & HMM_NEED_WRITE_FAULT) { |
|---|
| 72 | + if (!(vma->vm_flags & VM_WRITE)) |
|---|
| 73 | + return -EPERM; |
|---|
| 74 | + fault_flags |= FAULT_FLAG_WRITE; |
|---|
| 375 | 75 | } |
|---|
| 376 | 76 | |
|---|
| 377 | | - return (fault || write_fault) ? -EAGAIN : 0; |
|---|
| 77 | + for (; addr < end; addr += PAGE_SIZE) |
|---|
| 78 | + if (handle_mm_fault(vma, addr, fault_flags, NULL) & |
|---|
| 79 | + VM_FAULT_ERROR) |
|---|
| 80 | + return -EFAULT; |
|---|
| 81 | + return -EBUSY; |
|---|
| 378 | 82 | } |
|---|
| 379 | 83 | |
|---|
| 380 | | -static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, |
|---|
| 381 | | - uint64_t pfns, uint64_t cpu_flags, |
|---|
| 382 | | - bool *fault, bool *write_fault) |
|---|
| 84 | +static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, |
|---|
| 85 | + unsigned long pfn_req_flags, |
|---|
| 86 | + unsigned long cpu_flags) |
|---|
| 383 | 87 | { |
|---|
| 384 | 88 | struct hmm_range *range = hmm_vma_walk->range; |
|---|
| 385 | 89 | |
|---|
| 386 | | - *fault = *write_fault = false; |
|---|
| 387 | | - if (!hmm_vma_walk->fault) |
|---|
| 388 | | - return; |
|---|
| 90 | + /* |
|---|
| 91 | + * We consider not only the individual per-page request but also |
|---|
| 92 | + * the default flags requested for the range. The API can be |
|---|
| 93 | + * used in two ways. In the first, the HMM user coalesces |
|---|
| 94 | + * multiple page faults into one request and sets flags per pfn for |
|---|
| 95 | + * those faults. In the second, the HMM user wants to pre- |
|---|
| 96 | + * fault a range with specific flags. For the latter it is a |
|---|
| 97 | + * waste to have the user pre-fill the pfn arrays with a default |
|---|
| 98 | + * flags value. |
|---|
| 99 | + */ |
|---|
| 100 | + pfn_req_flags &= range->pfn_flags_mask; |
|---|
| 101 | + pfn_req_flags |= range->default_flags; |
|---|
| 389 | 102 | |
|---|
| 390 | 103 | /* We aren't asked to do anything ... */ |
|---|
| 391 | | - if (!(pfns & range->flags[HMM_PFN_VALID])) |
|---|
| 392 | | - return; |
|---|
| 393 | | - /* If this is device memory than only fault if explicitly requested */ |
|---|
| 394 | | - if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { |
|---|
| 395 | | - /* Do we fault on device memory ? */ |
|---|
| 396 | | - if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { |
|---|
| 397 | | - *write_fault = pfns & range->flags[HMM_PFN_WRITE]; |
|---|
| 398 | | - *fault = true; |
|---|
| 399 | | - } |
|---|
| 400 | | - return; |
|---|
| 401 | | - } |
|---|
| 104 | + if (!(pfn_req_flags & HMM_PFN_REQ_FAULT)) |
|---|
| 105 | + return 0; |
|---|
| 106 | + |
|---|
| 107 | + /* Need to write fault ? */ |
|---|
| 108 | + if ((pfn_req_flags & HMM_PFN_REQ_WRITE) && |
|---|
| 109 | + !(cpu_flags & HMM_PFN_WRITE)) |
|---|
| 110 | + return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT; |
|---|
| 402 | 111 | |
|---|
| 403 | 112 | /* If CPU page table is not valid then we need to fault */ |
|---|
| 404 | | - *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); |
|---|
| 405 | | - /* Need to write fault ? */ |
|---|
| 406 | | - if ((pfns & range->flags[HMM_PFN_WRITE]) && |
|---|
| 407 | | - !(cpu_flags & range->flags[HMM_PFN_WRITE])) { |
|---|
| 408 | | - *write_fault = true; |
|---|
| 409 | | - *fault = true; |
|---|
| 410 | | - } |
|---|
| 113 | + if (!(cpu_flags & HMM_PFN_VALID)) |
|---|
| 114 | + return HMM_NEED_FAULT; |
|---|
| 115 | + return 0; |
|---|
| 411 | 116 | } |
|---|
| 412 | 117 | |
|---|
| 413 | | -static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, |
|---|
| 414 | | - const uint64_t *pfns, unsigned long npages, |
|---|
| 415 | | - uint64_t cpu_flags, bool *fault, |
|---|
| 416 | | - bool *write_fault) |
|---|
| 118 | +static unsigned int |
|---|
| 119 | +hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, |
|---|
| 120 | + const unsigned long hmm_pfns[], unsigned long npages, |
|---|
| 121 | + unsigned long cpu_flags) |
|---|
| 417 | 122 | { |
|---|
| 123 | + struct hmm_range *range = hmm_vma_walk->range; |
|---|
| 124 | + unsigned int required_fault = 0; |
|---|
| 418 | 125 | unsigned long i; |
|---|
| 419 | 126 | |
|---|
| 420 | | - if (!hmm_vma_walk->fault) { |
|---|
| 421 | | - *fault = *write_fault = false; |
|---|
| 422 | | - return; |
|---|
| 423 | | - } |
|---|
| 127 | + /* |
|---|
| 128 | + * If the default flags do not request to fault pages, and the mask does |
|---|
| 129 | + * not allow for individual pages to be faulted, then |
|---|
| 130 | + * hmm_pte_need_fault() will always return 0. |
|---|
| 131 | + */ |
|---|
| 132 | + if (!((range->default_flags | range->pfn_flags_mask) & |
|---|
| 133 | + HMM_PFN_REQ_FAULT)) |
|---|
| 134 | + return 0; |
|---|
| 424 | 135 | |
|---|
| 425 | 136 | for (i = 0; i < npages; ++i) { |
|---|
| 426 | | - hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, |
|---|
| 427 | | - fault, write_fault); |
|---|
| 428 | | - if ((*fault) || (*write_fault)) |
|---|
| 429 | | - return; |
|---|
| 137 | + required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i], |
|---|
| 138 | + cpu_flags); |
|---|
| 139 | + if (required_fault == HMM_NEED_ALL_BITS) |
|---|
| 140 | + return required_fault; |
|---|
| 430 | 141 | } |
|---|
| 142 | + return required_fault; |
|---|
| 431 | 143 | } |
|---|
| 432 | 144 | |
|---|
| 433 | 145 | static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, |
|---|
| 434 | | - struct mm_walk *walk) |
|---|
| 146 | + __always_unused int depth, struct mm_walk *walk) |
|---|
| 435 | 147 | { |
|---|
| 436 | 148 | struct hmm_vma_walk *hmm_vma_walk = walk->private; |
|---|
| 437 | 149 | struct hmm_range *range = hmm_vma_walk->range; |
|---|
| 438 | | - bool fault, write_fault; |
|---|
| 150 | + unsigned int required_fault; |
|---|
| 439 | 151 | unsigned long i, npages; |
|---|
| 440 | | - uint64_t *pfns; |
|---|
| 152 | + unsigned long *hmm_pfns; |
|---|
| 441 | 153 | |
|---|
| 442 | 154 | i = (addr - range->start) >> PAGE_SHIFT; |
|---|
| 443 | 155 | npages = (end - addr) >> PAGE_SHIFT; |
|---|
| 444 | | - pfns = &range->pfns[i]; |
|---|
| 445 | | - hmm_range_need_fault(hmm_vma_walk, pfns, npages, |
|---|
| 446 | | - 0, &fault, &write_fault); |
|---|
| 447 | | - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); |
|---|
| 156 | + hmm_pfns = &range->hmm_pfns[i]; |
|---|
| 157 | + required_fault = |
|---|
| 158 | + hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0); |
|---|
| 159 | + if (!walk->vma) { |
|---|
| 160 | + if (required_fault) |
|---|
| 161 | + return -EFAULT; |
|---|
| 162 | + return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR); |
|---|
| 163 | + } |
|---|
| 164 | + if (required_fault) |
|---|
| 165 | + return hmm_vma_fault(addr, end, required_fault, walk); |
|---|
| 166 | + return hmm_pfns_fill(addr, end, range, 0); |
|---|
| 448 | 167 | } |
|---|
| 449 | 168 | |
|---|
| 450 | | -static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) |
|---|
| 169 | +static inline unsigned long hmm_pfn_flags_order(unsigned long order) |
|---|
| 170 | +{ |
|---|
| 171 | + return order << HMM_PFN_ORDER_SHIFT; |
|---|
| 172 | +} |
|---|
| 173 | + |
|---|
| 174 | +static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range, |
|---|
| 175 | + pmd_t pmd) |
|---|
| 451 | 176 | { |
|---|
| 452 | 177 | if (pmd_protnone(pmd)) |
|---|
| 453 | 178 | return 0; |
|---|
| 454 | | - return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] | |
|---|
| 455 | | - range->flags[HMM_PFN_WRITE] : |
|---|
| 456 | | - range->flags[HMM_PFN_VALID]; |
|---|
| 179 | + return (pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : |
|---|
| 180 | + HMM_PFN_VALID) | |
|---|
| 181 | + hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT); |
|---|
| 457 | 182 | } |
|---|
| 458 | 183 | |
|---|
| 459 | | -static int hmm_vma_handle_pmd(struct mm_walk *walk, |
|---|
| 460 | | - unsigned long addr, |
|---|
| 461 | | - unsigned long end, |
|---|
| 462 | | - uint64_t *pfns, |
|---|
| 184 | +#ifdef CONFIG_TRANSPARENT_HUGEPAGE |
|---|
| 185 | +static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, |
|---|
| 186 | + unsigned long end, unsigned long hmm_pfns[], |
|---|
| 463 | 187 | pmd_t pmd) |
|---|
| 464 | 188 | { |
|---|
| 465 | 189 | struct hmm_vma_walk *hmm_vma_walk = walk->private; |
|---|
| 466 | 190 | struct hmm_range *range = hmm_vma_walk->range; |
|---|
| 467 | 191 | unsigned long pfn, npages, i; |
|---|
| 468 | | - bool fault, write_fault; |
|---|
| 469 | | - uint64_t cpu_flags; |
|---|
| 192 | + unsigned int required_fault; |
|---|
| 193 | + unsigned long cpu_flags; |
|---|
| 470 | 194 | |
|---|
| 471 | 195 | npages = (end - addr) >> PAGE_SHIFT; |
|---|
| 472 | 196 | cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); |
|---|
| 473 | | - hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, |
|---|
| 474 | | - &fault, &write_fault); |
|---|
| 197 | + required_fault = |
|---|
| 198 | + hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags); |
|---|
| 199 | + if (required_fault) |
|---|
| 200 | + return hmm_vma_fault(addr, end, required_fault, walk); |
|---|
| 475 | 201 | |
|---|
| 476 | | - if (pmd_protnone(pmd) || fault || write_fault) |
|---|
| 477 | | - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); |
|---|
| 478 | | - |
|---|
| 479 | | - pfn = pmd_pfn(pmd) + pte_index(addr); |
|---|
| 202 | + pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); |
|---|
| 480 | 203 | for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) |
|---|
| 481 | | - pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; |
|---|
| 482 | | - hmm_vma_walk->last = end; |
|---|
| 204 | + hmm_pfns[i] = pfn | cpu_flags; |
|---|
| 483 | 205 | return 0; |
|---|
| 484 | 206 | } |
|---|
| 207 | +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ |
|---|
| 208 | +/* stub to allow the code below to compile */ |
|---|
| 209 | +int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, |
|---|
| 210 | + unsigned long end, unsigned long hmm_pfns[], pmd_t pmd); |
|---|
| 211 | +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
|---|
| 485 | 212 | |
|---|
| 486 | | -static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) |
|---|
| 213 | +static inline bool hmm_is_device_private_entry(struct hmm_range *range, |
|---|
| 214 | + swp_entry_t entry) |
|---|
| 487 | 215 | { |
|---|
| 488 | | - if (pte_none(pte) || !pte_present(pte)) |
|---|
| 216 | + return is_device_private_entry(entry) && |
|---|
| 217 | + device_private_entry_to_page(entry)->pgmap->owner == |
|---|
| 218 | + range->dev_private_owner; |
|---|
| 219 | +} |
|---|
| 220 | + |
|---|
| 221 | +static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range, |
|---|
| 222 | + pte_t pte) |
|---|
| 223 | +{ |
|---|
| 224 | + if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte)) |
|---|
| 489 | 225 | return 0; |
|---|
| 490 | | - return pte_write(pte) ? range->flags[HMM_PFN_VALID] | |
|---|
| 491 | | - range->flags[HMM_PFN_WRITE] : |
|---|
| 492 | | - range->flags[HMM_PFN_VALID]; |
|---|
| 226 | + return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; |
|---|
| 493 | 227 | } |
|---|
| 494 | 228 | |
|---|
| 495 | 229 | static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, |
|---|
| 496 | 230 | unsigned long end, pmd_t *pmdp, pte_t *ptep, |
|---|
| 497 | | - uint64_t *pfn) |
|---|
| 231 | + unsigned long *hmm_pfn) |
|---|
| 498 | 232 | { |
|---|
| 499 | 233 | struct hmm_vma_walk *hmm_vma_walk = walk->private; |
|---|
| 500 | 234 | struct hmm_range *range = hmm_vma_walk->range; |
|---|
| 501 | | - struct vm_area_struct *vma = walk->vma; |
|---|
| 502 | | - bool fault, write_fault; |
|---|
| 503 | | - uint64_t cpu_flags; |
|---|
| 235 | + unsigned int required_fault; |
|---|
| 236 | + unsigned long cpu_flags; |
|---|
| 504 | 237 | pte_t pte = *ptep; |
|---|
| 505 | | - uint64_t orig_pfn = *pfn; |
|---|
| 506 | | - |
|---|
| 507 | | - *pfn = range->values[HMM_PFN_NONE]; |
|---|
| 508 | | - cpu_flags = pte_to_hmm_pfn_flags(range, pte); |
|---|
| 509 | | - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, |
|---|
| 510 | | - &fault, &write_fault); |
|---|
| 238 | + uint64_t pfn_req_flags = *hmm_pfn; |
|---|
| 511 | 239 | |
|---|
| 512 | 240 | if (pte_none(pte)) { |
|---|
| 513 | | - if (fault || write_fault) |
|---|
| 241 | + required_fault = |
|---|
| 242 | + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); |
|---|
| 243 | + if (required_fault) |
|---|
| 514 | 244 | goto fault; |
|---|
| 245 | + *hmm_pfn = 0; |
|---|
| 515 | 246 | return 0; |
|---|
| 516 | 247 | } |
|---|
| 517 | 248 | |
|---|
| 518 | 249 | if (!pte_present(pte)) { |
|---|
| 519 | 250 | swp_entry_t entry = pte_to_swp_entry(pte); |
|---|
| 520 | 251 | |
|---|
| 521 | | - if (!non_swap_entry(entry)) { |
|---|
| 522 | | - if (fault || write_fault) |
|---|
| 523 | | - goto fault; |
|---|
| 252 | + /* |
|---|
| 253 | + * Never fault in device private pages, but just report |
|---|
| 254 | + * the PFN even if not present. |
|---|
| 255 | + */ |
|---|
| 256 | + if (hmm_is_device_private_entry(range, entry)) { |
|---|
| 257 | + cpu_flags = HMM_PFN_VALID; |
|---|
| 258 | + if (is_write_device_private_entry(entry)) |
|---|
| 259 | + cpu_flags |= HMM_PFN_WRITE; |
|---|
| 260 | + *hmm_pfn = device_private_entry_to_pfn(entry) | |
|---|
| 261 | + cpu_flags; |
|---|
| 524 | 262 | return 0; |
|---|
| 525 | 263 | } |
|---|
| 526 | 264 | |
|---|
| 527 | | - /* |
|---|
| 528 | | - * This is a special swap entry, ignore migration, use |
|---|
| 529 | | - * device and report anything else as error. |
|---|
| 530 | | - */ |
|---|
| 531 | | - if (is_device_private_entry(entry)) { |
|---|
| 532 | | - cpu_flags = range->flags[HMM_PFN_VALID] | |
|---|
| 533 | | - range->flags[HMM_PFN_DEVICE_PRIVATE]; |
|---|
| 534 | | - cpu_flags |= is_write_device_private_entry(entry) ? |
|---|
| 535 | | - range->flags[HMM_PFN_WRITE] : 0; |
|---|
| 536 | | - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, |
|---|
| 537 | | - &fault, &write_fault); |
|---|
| 538 | | - if (fault || write_fault) |
|---|
| 539 | | - goto fault; |
|---|
| 540 | | - *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); |
|---|
| 541 | | - *pfn |= cpu_flags; |
|---|
| 265 | + required_fault = |
|---|
| 266 | + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); |
|---|
| 267 | + if (!required_fault) { |
|---|
| 268 | + *hmm_pfn = 0; |
|---|
| 542 | 269 | return 0; |
|---|
| 543 | 270 | } |
|---|
| 271 | + |
|---|
| 272 | + if (!non_swap_entry(entry)) |
|---|
| 273 | + goto fault; |
|---|
| 544 | 274 | |
|---|
| 545 | 275 | if (is_migration_entry(entry)) { |
|---|
| 546 | | - if (fault || write_fault) { |
|---|
| 547 | | - pte_unmap(ptep); |
|---|
| 548 | | - hmm_vma_walk->last = addr; |
|---|
| 549 | | - migration_entry_wait(vma->vm_mm, |
|---|
| 550 | | - pmdp, addr); |
|---|
| 551 | | - return -EAGAIN; |
|---|
| 552 | | - } |
|---|
| 553 | | - return 0; |
|---|
| 276 | + pte_unmap(ptep); |
|---|
| 277 | + hmm_vma_walk->last = addr; |
|---|
| 278 | + migration_entry_wait(walk->mm, pmdp, addr); |
|---|
| 279 | + return -EBUSY; |
|---|
| 554 | 280 | } |
|---|
| 555 | 281 | |
|---|
| 556 | 282 | /* Report error for everything else */ |
|---|
| 557 | | - *pfn = range->values[HMM_PFN_ERROR]; |
|---|
| 283 | + pte_unmap(ptep); |
|---|
| 558 | 284 | return -EFAULT; |
|---|
| 559 | 285 | } |
|---|
| 560 | 286 | |
|---|
| 561 | | - if (fault || write_fault) |
|---|
| 287 | + cpu_flags = pte_to_hmm_pfn_flags(range, pte); |
|---|
| 288 | + required_fault = |
|---|
| 289 | + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); |
|---|
| 290 | + if (required_fault) |
|---|
| 562 | 291 | goto fault; |
|---|
| 563 | 292 | |
|---|
| 564 | | - *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; |
|---|
| 293 | + /* |
|---|
| 294 | + * Bypass devmap pte such as DAX page when all pfn requested |
|---|
| 295 | + * flags (pfn_req_flags) are fulfilled. |
|---|
| 296 | + * Since each architecture defines a struct page for the zero page, just |
|---|
| 297 | + * fall through and treat it like a normal page. |
|---|
| 298 | + */ |
|---|
| 299 | + if (!vm_normal_page(walk->vma, addr, pte) && |
|---|
| 300 | + !pte_devmap(pte) && |
|---|
| 301 | + !is_zero_pfn(pte_pfn(pte))) { |
|---|
| 302 | + if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { |
|---|
| 303 | + pte_unmap(ptep); |
|---|
| 304 | + return -EFAULT; |
|---|
| 305 | + } |
|---|
| 306 | + *hmm_pfn = HMM_PFN_ERROR; |
|---|
| 307 | + return 0; |
|---|
| 308 | + } |
|---|
| 309 | + |
|---|
| 310 | + *hmm_pfn = pte_pfn(pte) | cpu_flags; |
|---|
| 565 | 311 | return 0; |
|---|
| 566 | 312 | |
|---|
| 567 | 313 | fault: |
|---|
| 568 | 314 | pte_unmap(ptep); |
|---|
| 569 | 315 | /* Fault any virtual address we were asked to fault */ |
|---|
| 570 | | - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); |
|---|
| 316 | + return hmm_vma_fault(addr, end, required_fault, walk); |
|---|
| 571 | 317 | } |
|---|
| 572 | 318 | |
|---|
| 573 | 319 | static int hmm_vma_walk_pmd(pmd_t *pmdp, |
|---|
| .. | .. |
|---|
| 577 | 323 | { |
|---|
| 578 | 324 | struct hmm_vma_walk *hmm_vma_walk = walk->private; |
|---|
| 579 | 325 | struct hmm_range *range = hmm_vma_walk->range; |
|---|
| 580 | | - uint64_t *pfns = range->pfns; |
|---|
| 581 | | - unsigned long addr = start, i; |
|---|
| 326 | + unsigned long *hmm_pfns = |
|---|
| 327 | + &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT]; |
|---|
| 328 | + unsigned long npages = (end - start) >> PAGE_SHIFT; |
|---|
| 329 | + unsigned long addr = start; |
|---|
| 582 | 330 | pte_t *ptep; |
|---|
| 583 | | - |
|---|
| 584 | | - i = (addr - range->start) >> PAGE_SHIFT; |
|---|
| 331 | + pmd_t pmd; |
|---|
| 585 | 332 | |
|---|
| 586 | 333 | again: |
|---|
| 587 | | - if (pmd_none(*pmdp)) |
|---|
| 588 | | - return hmm_vma_walk_hole(start, end, walk); |
|---|
| 334 | + pmd = READ_ONCE(*pmdp); |
|---|
| 335 | + if (pmd_none(pmd)) |
|---|
| 336 | + return hmm_vma_walk_hole(start, end, -1, walk); |
|---|
| 589 | 337 | |
|---|
| 590 | | - if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB)) |
|---|
| 591 | | - return hmm_pfns_bad(start, end, walk); |
|---|
| 338 | + if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { |
|---|
| 339 | + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) { |
|---|
| 340 | + hmm_vma_walk->last = addr; |
|---|
| 341 | + pmd_migration_entry_wait(walk->mm, pmdp); |
|---|
| 342 | + return -EBUSY; |
|---|
| 343 | + } |
|---|
| 344 | + return hmm_pfns_fill(start, end, range, 0); |
|---|
| 345 | + } |
|---|
| 592 | 346 | |
|---|
| 593 | | - if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { |
|---|
| 594 | | - pmd_t pmd; |
|---|
| 347 | + if (!pmd_present(pmd)) { |
|---|
| 348 | + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) |
|---|
| 349 | + return -EFAULT; |
|---|
| 350 | + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); |
|---|
| 351 | + } |
|---|
| 595 | 352 | |
|---|
| 353 | + if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { |
|---|
| 596 | 354 | /* |
|---|
| 597 | | - * No need to take pmd_lock here, even if some other threads |
|---|
| 355 | + * No need to take pmd_lock here, even if some other thread |
|---|
| 598 | 356 | * is splitting the huge pmd we will get that event through |
|---|
| 599 | 357 | * mmu_notifier callback. |
|---|
| 600 | 358 | * |
|---|
| 601 | | - * So just read pmd value and check again its a transparent |
|---|
| 359 | + * So just read pmd value and check again it's a transparent |
|---|
| 602 | 360 | * huge or device mapping one and compute corresponding pfn |
|---|
| 603 | 361 | * values. |
|---|
| 604 | 362 | */ |
|---|
| .. | .. |
|---|
| 607 | 365 | if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) |
|---|
| 608 | 366 | goto again; |
|---|
| 609 | 367 | |
|---|
| 610 | | - return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); |
|---|
| 368 | + return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd); |
|---|
| 611 | 369 | } |
|---|
| 612 | 370 | |
|---|
| 613 | | - if (pmd_bad(*pmdp)) |
|---|
| 614 | | - return hmm_pfns_bad(start, end, walk); |
|---|
| 371 | + /* |
|---|
| 372 | + * We have handled all the valid cases above, i.e. either none, migration, |
|---|
| 373 | + * huge or transparent huge. At this point either it is a valid pmd |
|---|
| 374 | + * entry pointing to pte directory or it is a bad pmd that will not |
|---|
| 375 | + * recover. |
|---|
| 376 | + */ |
|---|
| 377 | + if (pmd_bad(pmd)) { |
|---|
| 378 | + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) |
|---|
| 379 | + return -EFAULT; |
|---|
| 380 | + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); |
|---|
| 381 | + } |
|---|
| 615 | 382 | |
|---|
| 616 | 383 | ptep = pte_offset_map(pmdp, addr); |
|---|
| 617 | | - for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { |
|---|
| 384 | + for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) { |
|---|
| 618 | 385 | int r; |
|---|
| 619 | 386 | |
|---|
| 620 | | - r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]); |
|---|
| 387 | + r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns); |
|---|
| 621 | 388 | if (r) { |
|---|
| 622 | | - /* hmm_vma_handle_pte() did unmap pte directory */ |
|---|
| 623 | | - hmm_vma_walk->last = addr; |
|---|
| 389 | + /* hmm_vma_handle_pte() did pte_unmap() */ |
|---|
| 624 | 390 | return r; |
|---|
| 625 | 391 | } |
|---|
| 626 | 392 | } |
|---|
| 627 | 393 | pte_unmap(ptep - 1); |
|---|
| 628 | | - |
|---|
| 629 | | - hmm_vma_walk->last = addr; |
|---|
| 630 | 394 | return 0; |
|---|
| 631 | 395 | } |
|---|
| 632 | 396 | |
|---|
| 633 | | -static void hmm_pfns_clear(struct hmm_range *range, |
|---|
| 634 | | - uint64_t *pfns, |
|---|
| 635 | | - unsigned long addr, |
|---|
| 636 | | - unsigned long end) |
|---|
| 397 | +#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ |
|---|
| 398 | + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) |
|---|
| 399 | +static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range, |
|---|
| 400 | + pud_t pud) |
|---|
| 637 | 401 | { |
|---|
| 638 | | - for (; addr < end; addr += PAGE_SIZE, pfns++) |
|---|
| 639 | | - *pfns = range->values[HMM_PFN_NONE]; |
|---|
| 402 | + if (!pud_present(pud)) |
|---|
| 403 | + return 0; |
|---|
| 404 | + return (pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : |
|---|
| 405 | + HMM_PFN_VALID) | |
|---|
| 406 | + hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT); |
|---|
| 640 | 407 | } |
|---|
| 641 | 408 | |
|---|
| 642 | | -static void hmm_pfns_special(struct hmm_range *range) |
|---|
| 409 | +static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, |
|---|
| 410 | + struct mm_walk *walk) |
|---|
| 643 | 411 | { |
|---|
| 644 | | - unsigned long addr = range->start, i = 0; |
|---|
| 412 | + struct hmm_vma_walk *hmm_vma_walk = walk->private; |
|---|
| 413 | + struct hmm_range *range = hmm_vma_walk->range; |
|---|
| 414 | + unsigned long addr = start; |
|---|
| 415 | + pud_t pud; |
|---|
| 416 | + int ret = 0; |
|---|
| 417 | + spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma); |
|---|
| 645 | 418 | |
|---|
| 646 | | - for (; addr < range->end; addr += PAGE_SIZE, i++) |
|---|
| 647 | | - range->pfns[i] = range->values[HMM_PFN_SPECIAL]; |
|---|
| 419 | + if (!ptl) |
|---|
| 420 | + return 0; |
|---|
| 421 | + |
|---|
| 422 | + /* Normally we don't want to split the huge page */ |
|---|
| 423 | + walk->action = ACTION_CONTINUE; |
|---|
| 424 | + |
|---|
| 425 | + pud = READ_ONCE(*pudp); |
|---|
| 426 | + if (pud_none(pud)) { |
|---|
| 427 | + spin_unlock(ptl); |
|---|
| 428 | + return hmm_vma_walk_hole(start, end, -1, walk); |
|---|
| 429 | + } |
|---|
| 430 | + |
|---|
| 431 | + if (pud_huge(pud) && pud_devmap(pud)) { |
|---|
| 432 | + unsigned long i, npages, pfn; |
|---|
| 433 | + unsigned int required_fault; |
|---|
| 434 | + unsigned long *hmm_pfns; |
|---|
| 435 | + unsigned long cpu_flags; |
|---|
| 436 | + |
|---|
| 437 | + if (!pud_present(pud)) { |
|---|
| 438 | + spin_unlock(ptl); |
|---|
| 439 | + return hmm_vma_walk_hole(start, end, -1, walk); |
|---|
| 440 | + } |
|---|
| 441 | + |
|---|
| 442 | + i = (addr - range->start) >> PAGE_SHIFT; |
|---|
| 443 | + npages = (end - addr) >> PAGE_SHIFT; |
|---|
| 444 | + hmm_pfns = &range->hmm_pfns[i]; |
|---|
| 445 | + |
|---|
| 446 | + cpu_flags = pud_to_hmm_pfn_flags(range, pud); |
|---|
| 447 | + required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns, |
|---|
| 448 | + npages, cpu_flags); |
|---|
| 449 | + if (required_fault) { |
|---|
| 450 | + spin_unlock(ptl); |
|---|
| 451 | + return hmm_vma_fault(addr, end, required_fault, walk); |
|---|
| 452 | + } |
|---|
| 453 | + |
|---|
| 454 | + pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); |
|---|
| 455 | + for (i = 0; i < npages; ++i, ++pfn) |
|---|
| 456 | + hmm_pfns[i] = pfn | cpu_flags; |
|---|
| 457 | + goto out_unlock; |
|---|
| 458 | + } |
|---|
| 459 | + |
|---|
| 460 | + /* Ask for the PUD to be split */ |
|---|
| 461 | + walk->action = ACTION_SUBTREE; |
|---|
| 462 | + |
|---|
| 463 | +out_unlock: |
|---|
| 464 | + spin_unlock(ptl); |
|---|
| 465 | + return ret; |
|---|
| 648 | 466 | } |
|---|
| 467 | +#else |
|---|
| 468 | +#define hmm_vma_walk_pud NULL |
|---|
| 469 | +#endif |
|---|
| 649 | 470 | |
|---|
| 650 | | -/* |
|---|
| 651 | | - * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses |
|---|
| 652 | | - * @range: range being snapshotted |
|---|
| 653 | | - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid |
|---|
| 654 | | - * vma permission, 0 success |
|---|
| 655 | | - * |
|---|
| 656 | | - * This snapshots the CPU page table for a range of virtual addresses. Snapshot |
|---|
| 657 | | - * validity is tracked by range struct. See hmm_vma_range_done() for further |
|---|
| 658 | | - * information. |
|---|
| 659 | | - * |
|---|
| 660 | | - * The range struct is initialized here. It tracks the CPU page table, but only |
|---|
| 661 | | - * if the function returns success (0), in which case the caller must then call |
|---|
| 662 | | - * hmm_vma_range_done() to stop CPU page table update tracking on this range. |
|---|
| 663 | | - * |
|---|
| 664 | | - * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS |
|---|
| 665 | | - * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! |
|---|
| 666 | | - */ |
|---|
| 667 | | -int hmm_vma_get_pfns(struct hmm_range *range) |
|---|
| 471 | +#ifdef CONFIG_HUGETLB_PAGE |
|---|
| 472 | +static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, |
|---|
| 473 | + unsigned long start, unsigned long end, |
|---|
| 474 | + struct mm_walk *walk) |
|---|
| 668 | 475 | { |
|---|
| 669 | | - struct vm_area_struct *vma = range->vma; |
|---|
| 670 | | - struct hmm_vma_walk hmm_vma_walk; |
|---|
| 671 | | - struct mm_walk mm_walk; |
|---|
| 672 | | - struct hmm *hmm; |
|---|
| 476 | + unsigned long addr = start, i, pfn; |
|---|
| 477 | + struct hmm_vma_walk *hmm_vma_walk = walk->private; |
|---|
| 478 | + struct hmm_range *range = hmm_vma_walk->range; |
|---|
| 479 | + struct vm_area_struct *vma = walk->vma; |
|---|
| 480 | + unsigned int required_fault; |
|---|
| 481 | + unsigned long pfn_req_flags; |
|---|
| 482 | + unsigned long cpu_flags; |
|---|
| 483 | + spinlock_t *ptl; |
|---|
| 484 | + pte_t entry; |
|---|
| 673 | 485 | |
|---|
| 674 | | - /* Sanity check, this really should not happen ! */ |
|---|
| 675 | | - if (range->start < vma->vm_start || range->start >= vma->vm_end) |
|---|
| 676 | | - return -EINVAL; |
|---|
| 677 | | - if (range->end < vma->vm_start || range->end > vma->vm_end) |
|---|
| 678 | | - return -EINVAL; |
|---|
| 486 | + ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); |
|---|
| 487 | + entry = huge_ptep_get(pte); |
|---|
| 679 | 488 | |
|---|
| 680 | | - hmm = hmm_register(vma->vm_mm); |
|---|
| 681 | | - if (!hmm) |
|---|
| 682 | | - return -ENOMEM; |
|---|
| 683 | | - /* Caller must have registered a mirror, via hmm_mirror_register() ! */ |
|---|
| 684 | | - if (!hmm->mmu_notifier.ops) |
|---|
| 685 | | - return -EINVAL; |
|---|
| 686 | | - |
|---|
| 687 | | - /* FIXME support hugetlb fs */ |
|---|
| 688 | | - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || |
|---|
| 689 | | - vma_is_dax(vma)) { |
|---|
| 690 | | - hmm_pfns_special(range); |
|---|
| 691 | | - return -EINVAL; |
|---|
| 489 | + i = (start - range->start) >> PAGE_SHIFT; |
|---|
| 490 | + pfn_req_flags = range->hmm_pfns[i]; |
|---|
| 491 | + cpu_flags = pte_to_hmm_pfn_flags(range, entry) | |
|---|
| 492 | + hmm_pfn_flags_order(huge_page_order(hstate_vma(vma))); |
|---|
| 493 | + required_fault = |
|---|
| 494 | + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); |
|---|
| 495 | + if (required_fault) { |
|---|
| 496 | + spin_unlock(ptl); |
|---|
| 497 | + return hmm_vma_fault(addr, end, required_fault, walk); |
|---|
| 692 | 498 | } |
|---|
| 693 | 499 | |
|---|
| 694 | | - if (!(vma->vm_flags & VM_READ)) { |
|---|
| 695 | | - /* |
|---|
| 696 | | - * If vma do not allow read access, then assume that it does |
|---|
| 697 | | - * not allow write access, either. Architecture that allow |
|---|
| 698 | | - * write without read access are not supported by HMM, because |
|---|
| 699 | | - * operations such has atomic access would not work. |
|---|
| 700 | | - */ |
|---|
| 701 | | - hmm_pfns_clear(range, range->pfns, range->start, range->end); |
|---|
| 702 | | - return -EPERM; |
|---|
| 703 | | - } |
|---|
| 500 | + pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); |
|---|
| 501 | + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) |
|---|
| 502 | + range->hmm_pfns[i] = pfn | cpu_flags; |
|---|
| 704 | 503 | |
|---|
| 705 | | - /* Initialize range to track CPU page table update */ |
|---|
| 706 | | - spin_lock(&hmm->lock); |
|---|
| 707 | | - range->valid = true; |
|---|
| 708 | | - list_add_rcu(&range->list, &hmm->ranges); |
|---|
| 709 | | - spin_unlock(&hmm->lock); |
|---|
| 710 | | - |
|---|
| 711 | | - hmm_vma_walk.fault = false; |
|---|
| 712 | | - hmm_vma_walk.range = range; |
|---|
| 713 | | - mm_walk.private = &hmm_vma_walk; |
|---|
| 714 | | - |
|---|
| 715 | | - mm_walk.vma = vma; |
|---|
| 716 | | - mm_walk.mm = vma->vm_mm; |
|---|
| 717 | | - mm_walk.pte_entry = NULL; |
|---|
| 718 | | - mm_walk.test_walk = NULL; |
|---|
| 719 | | - mm_walk.hugetlb_entry = NULL; |
|---|
| 720 | | - mm_walk.pmd_entry = hmm_vma_walk_pmd; |
|---|
| 721 | | - mm_walk.pte_hole = hmm_vma_walk_hole; |
|---|
| 722 | | - |
|---|
| 723 | | - walk_page_range(range->start, range->end, &mm_walk); |
|---|
| 504 | + spin_unlock(ptl); |
|---|
| 724 | 505 | return 0; |
|---|
| 725 | 506 | } |
|---|
| 726 | | -EXPORT_SYMBOL(hmm_vma_get_pfns); |
|---|
| 507 | +#else |
|---|
| 508 | +#define hmm_vma_walk_hugetlb_entry NULL |
|---|
| 509 | +#endif /* CONFIG_HUGETLB_PAGE */ |
|---|
| 727 | 510 | |
|---|
| 728 | | -/* |
|---|
| 729 | | - * hmm_vma_range_done() - stop tracking change to CPU page table over a range |
|---|
| 730 | | - * @range: range being tracked |
|---|
| 731 | | - * Returns: false if range data has been invalidated, true otherwise |
|---|
| 732 | | - * |
|---|
| 733 | | - * Range struct is used to track updates to the CPU page table after a call to |
|---|
| 734 | | - * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done |
|---|
| 735 | | - * using the data, or wants to lock updates to the data it got from those |
|---|
| 736 | | - * functions, it must call the hmm_vma_range_done() function, which will then |
|---|
| 737 | | - * stop tracking CPU page table updates. |
|---|
| 738 | | - * |
|---|
| 739 | | - * Note that device driver must still implement general CPU page table update |
|---|
| 740 | | - * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using |
|---|
| 741 | | - * the mmu_notifier API directly. |
|---|
| 742 | | - * |
|---|
| 743 | | - * CPU page table update tracking done through hmm_range is only temporary and |
|---|
| 744 | | - * to be used while trying to duplicate CPU page table contents for a range of |
|---|
| 745 | | - * virtual addresses. |
|---|
| 746 | | - * |
|---|
| 747 | | - * There are two ways to use this : |
|---|
| 748 | | - * again: |
|---|
| 749 | | - * hmm_vma_get_pfns(range); or hmm_vma_fault(...); |
|---|
| 750 | | - * trans = device_build_page_table_update_transaction(pfns); |
|---|
| 751 | | - * device_page_table_lock(); |
|---|
| 752 | | - * if (!hmm_vma_range_done(range)) { |
|---|
| 753 | | - * device_page_table_unlock(); |
|---|
| 754 | | - * goto again; |
|---|
| 755 | | - * } |
|---|
| 756 | | - * device_commit_transaction(trans); |
|---|
| 757 | | - * device_page_table_unlock(); |
|---|
| 758 | | - * |
|---|
| 759 | | - * Or: |
|---|
| 760 | | - * hmm_vma_get_pfns(range); or hmm_vma_fault(...); |
|---|
| 761 | | - * device_page_table_lock(); |
|---|
| 762 | | - * hmm_vma_range_done(range); |
|---|
| 763 | | - * device_update_page_table(range->pfns); |
|---|
| 764 | | - * device_page_table_unlock(); |
|---|
| 765 | | - */ |
|---|
| 766 | | -bool hmm_vma_range_done(struct hmm_range *range) |
|---|
| 511 | +static int hmm_vma_walk_test(unsigned long start, unsigned long end, |
|---|
| 512 | + struct mm_walk *walk) |
|---|
| 767 | 513 | { |
|---|
| 768 | | - unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; |
|---|
| 769 | | - struct hmm *hmm; |
|---|
| 514 | + struct hmm_vma_walk *hmm_vma_walk = walk->private; |
|---|
| 515 | + struct hmm_range *range = hmm_vma_walk->range; |
|---|
| 516 | + struct vm_area_struct *vma = walk->vma; |
|---|
| 770 | 517 | |
|---|
| 771 | | - if (range->end <= range->start) { |
|---|
| 772 | | - BUG(); |
|---|
| 773 | | - return false; |
|---|
| 774 | | - } |
|---|
| 518 | + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) && |
|---|
| 519 | + vma->vm_flags & VM_READ) |
|---|
| 520 | + return 0; |
|---|
| 775 | 521 | |
|---|
| 776 | | - hmm = hmm_register(range->vma->vm_mm); |
|---|
| 777 | | - if (!hmm) { |
|---|
| 778 | | - memset(range->pfns, 0, sizeof(*range->pfns) * npages); |
|---|
| 779 | | - return false; |
|---|
| 780 | | - } |
|---|
| 522 | + /* |
|---|
| 523 | + * vma ranges that don't have struct page backing them or map I/O |
|---|
| 524 | + * devices directly cannot be handled by hmm_range_fault(). |
|---|
| 525 | + * |
|---|
| 526 | + * If the vma does not allow read access, then assume that it does not |
|---|
| 527 | + * allow write access either. HMM does not support architectures that |
|---|
| 528 | + * allow write without read. |
|---|
| 529 | + * |
|---|
| 530 | + * If a fault is requested for an unsupported range then it is a hard |
|---|
| 531 | + * failure. |
|---|
| 532 | + */ |
|---|
| 533 | + if (hmm_range_need_fault(hmm_vma_walk, |
|---|
| 534 | + range->hmm_pfns + |
|---|
| 535 | + ((start - range->start) >> PAGE_SHIFT), |
|---|
| 536 | + (end - start) >> PAGE_SHIFT, 0)) |
|---|
| 537 | + return -EFAULT; |
|---|
| 781 | 538 | |
|---|
| 782 | | - spin_lock(&hmm->lock); |
|---|
| 783 | | - list_del_rcu(&range->list); |
|---|
| 784 | | - spin_unlock(&hmm->lock); |
|---|
| 539 | + hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); |
|---|
| 785 | 540 | |
|---|
| 786 | | - return range->valid; |
|---|
| 541 | + /* Skip this vma and continue processing the next vma. */ |
|---|
| 542 | + return 1; |
|---|
| 787 | 543 | } |
|---|
| 788 | | -EXPORT_SYMBOL(hmm_vma_range_done); |
|---|
| 789 | 544 | |
|---|
| 790 | | -/* |
|---|
| 791 | | - * hmm_vma_fault() - try to fault some address in a virtual address range |
|---|
| 792 | | - * @range: range being faulted |
|---|
| 793 | | - * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) |
|---|
| 794 | | - * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) |
|---|
| 545 | +static const struct mm_walk_ops hmm_walk_ops = { |
|---|
| 546 | + .pud_entry = hmm_vma_walk_pud, |
|---|
| 547 | + .pmd_entry = hmm_vma_walk_pmd, |
|---|
| 548 | + .pte_hole = hmm_vma_walk_hole, |
|---|
| 549 | + .hugetlb_entry = hmm_vma_walk_hugetlb_entry, |
|---|
| 550 | + .test_walk = hmm_vma_walk_test, |
|---|
| 551 | +}; |
|---|
| 552 | + |
|---|
| 553 | +/** |
|---|
| 554 | + * hmm_range_fault - try to fault some address in a virtual address range |
|---|
| 555 | + * @range: argument structure |
|---|
| 795 | 556 | * |
|---|
| 796 | | - * This is similar to a regular CPU page fault except that it will not trigger |
|---|
| 797 | | - * any memory migration if the memory being faulted is not accessible by CPUs. |
|---|
| 557 | + * Returns 0 on success or one of the following error codes: |
|---|
| 798 | 558 | * |
|---|
| 799 | | - * On error, for one virtual address in the range, the function will mark the |
|---|
| 800 | | - * corresponding HMM pfn entry with an error flag. |
|---|
| 559 | + * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma |
|---|
| 560 | + * (e.g., device file vma). |
|---|
| 561 | + * -ENOMEM: Out of memory. |
|---|
| 562 | + * -EPERM: Invalid permission (e.g., asking for write and range is read |
|---|
| 563 | + * only). |
|---|
| 564 | + * -EBUSY: The range has been invalidated and the caller needs to wait for |
|---|
| 565 | + * the invalidation to finish. |
|---|
| 566 | + * -EFAULT: A page was requested to be valid and could not be made valid |
|---|
| 567 | + * i.e. it has no backing VMA or it is illegal to access. |
|---|
| 801 | 568 | * |
|---|
| 802 | | - * Expected use pattern: |
|---|
| 803 | | - * retry: |
|---|
| 804 | | - * down_read(&mm->mmap_sem); |
|---|
| 805 | | - * // Find vma and address device wants to fault, initialize hmm_pfn_t |
|---|
| 806 | | - * // array accordingly |
|---|
| 807 | | - * ret = hmm_vma_fault(range, write, block); |
|---|
| 808 | | - * switch (ret) { |
|---|
| 809 | | - * case -EAGAIN: |
|---|
| 810 | | - * hmm_vma_range_done(range); |
|---|
| 811 | | - * // You might want to rate limit or yield to play nicely, you may |
|---|
| 812 | | - * // also commit any valid pfn in the array assuming that you are |
|---|
| 813 | | - * // getting true from hmm_vma_range_monitor_end() |
|---|
| 814 | | - * goto retry; |
|---|
| 815 | | - * case 0: |
|---|
| 816 | | - * break; |
|---|
| 817 | | - * case -ENOMEM: |
|---|
| 818 | | - * case -EINVAL: |
|---|
| 819 | | - * case -EPERM: |
|---|
| 820 | | - * default: |
|---|
| 821 | | - * // Handle error ! |
|---|
| 822 | | - * up_read(&mm->mmap_sem) |
|---|
| 823 | | - * return; |
|---|
| 824 | | - * } |
|---|
| 825 | | - * // Take device driver lock that serialize device page table update |
|---|
| 826 | | - * driver_lock_device_page_table_update(); |
|---|
| 827 | | - * hmm_vma_range_done(range); |
|---|
| 828 | | - * // Commit pfns we got from hmm_vma_fault() |
|---|
| 829 | | - * driver_unlock_device_page_table_update(); |
|---|
| 830 | | - * up_read(&mm->mmap_sem) |
|---|
| 831 | | - * |
|---|
| 832 | | - * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0) |
|---|
| 833 | | - * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION ! |
|---|
| 834 | | - * |
|---|
| 835 | | - * YOU HAVE BEEN WARNED ! |
|---|
| 569 | + * This is similar to get_user_pages(), except that it can read the page tables |
|---|
| 570 | + * without mutating them (i.e. without causing faults). |
|---|
| 836 | 571 | */ |
|---|
| 837 | | -int hmm_vma_fault(struct hmm_range *range, bool block) |
|---|
| 572 | +int hmm_range_fault(struct hmm_range *range) |
|---|
| 838 | 573 | { |
|---|
| 839 | | - struct vm_area_struct *vma = range->vma; |
|---|
| 840 | | - unsigned long start = range->start; |
|---|
| 841 | | - struct hmm_vma_walk hmm_vma_walk; |
|---|
| 842 | | - struct mm_walk mm_walk; |
|---|
| 843 | | - struct hmm *hmm; |
|---|
| 574 | + struct hmm_vma_walk hmm_vma_walk = { |
|---|
| 575 | + .range = range, |
|---|
| 576 | + .last = range->start, |
|---|
| 577 | + }; |
|---|
| 578 | + struct mm_struct *mm = range->notifier->mm; |
|---|
| 844 | 579 | int ret; |
|---|
| 845 | 580 | |
|---|
| 846 | | - /* Sanity check, this really should not happen ! */ |
|---|
| 847 | | - if (range->start < vma->vm_start || range->start >= vma->vm_end) |
|---|
| 848 | | - return -EINVAL; |
|---|
| 849 | | - if (range->end < vma->vm_start || range->end > vma->vm_end) |
|---|
| 850 | | - return -EINVAL; |
|---|
| 851 | | - |
|---|
| 852 | | - hmm = hmm_register(vma->vm_mm); |
|---|
| 853 | | - if (!hmm) { |
|---|
| 854 | | - hmm_pfns_clear(range, range->pfns, range->start, range->end); |
|---|
| 855 | | - return -ENOMEM; |
|---|
| 856 | | - } |
|---|
| 857 | | - /* Caller must have registered a mirror using hmm_mirror_register() */ |
|---|
| 858 | | - if (!hmm->mmu_notifier.ops) |
|---|
| 859 | | - return -EINVAL; |
|---|
| 860 | | - |
|---|
| 861 | | - /* FIXME support hugetlb fs */ |
|---|
| 862 | | - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || |
|---|
| 863 | | - vma_is_dax(vma)) { |
|---|
| 864 | | - hmm_pfns_special(range); |
|---|
| 865 | | - return -EINVAL; |
|---|
| 866 | | - } |
|---|
| 867 | | - |
|---|
| 868 | | - if (!(vma->vm_flags & VM_READ)) { |
|---|
| 869 | | - /* |
|---|
| 870 | | - * If vma do not allow read access, then assume that it does |
|---|
| 871 | | - * not allow write access, either. Architecture that allow |
|---|
| 872 | | - * write without read access are not supported by HMM, because |
|---|
| 873 | | - * operations such has atomic access would not work. |
|---|
| 874 | | - */ |
|---|
| 875 | | - hmm_pfns_clear(range, range->pfns, range->start, range->end); |
|---|
| 876 | | - return -EPERM; |
|---|
| 877 | | - } |
|---|
| 878 | | - |
|---|
| 879 | | - /* Initialize range to track CPU page table update */ |
|---|
| 880 | | - spin_lock(&hmm->lock); |
|---|
| 881 | | - range->valid = true; |
|---|
| 882 | | - list_add_rcu(&range->list, &hmm->ranges); |
|---|
| 883 | | - spin_unlock(&hmm->lock); |
|---|
| 884 | | - |
|---|
| 885 | | - hmm_vma_walk.fault = true; |
|---|
| 886 | | - hmm_vma_walk.block = block; |
|---|
| 887 | | - hmm_vma_walk.range = range; |
|---|
| 888 | | - mm_walk.private = &hmm_vma_walk; |
|---|
| 889 | | - hmm_vma_walk.last = range->start; |
|---|
| 890 | | - |
|---|
| 891 | | - mm_walk.vma = vma; |
|---|
| 892 | | - mm_walk.mm = vma->vm_mm; |
|---|
| 893 | | - mm_walk.pte_entry = NULL; |
|---|
| 894 | | - mm_walk.test_walk = NULL; |
|---|
| 895 | | - mm_walk.hugetlb_entry = NULL; |
|---|
| 896 | | - mm_walk.pmd_entry = hmm_vma_walk_pmd; |
|---|
| 897 | | - mm_walk.pte_hole = hmm_vma_walk_hole; |
|---|
| 581 | + mmap_assert_locked(mm); |
|---|
| 898 | 582 | |
|---|
| 899 | 583 | do { |
|---|
| 900 | | - ret = walk_page_range(start, range->end, &mm_walk); |
|---|
| 901 | | - start = hmm_vma_walk.last; |
|---|
| 902 | | - } while (ret == -EAGAIN); |
|---|
| 903 | | - |
|---|
| 904 | | - if (ret) { |
|---|
| 905 | | - unsigned long i; |
|---|
| 906 | | - |
|---|
| 907 | | - i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; |
|---|
| 908 | | - hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, |
|---|
| 909 | | - range->end); |
|---|
| 910 | | - hmm_vma_range_done(range); |
|---|
| 911 | | - } |
|---|
| 584 | + /* If range is no longer valid force retry. */ |
|---|
| 585 | + if (mmu_interval_check_retry(range->notifier, |
|---|
| 586 | + range->notifier_seq)) |
|---|
| 587 | + return -EBUSY; |
|---|
| 588 | + ret = walk_page_range(mm, hmm_vma_walk.last, range->end, |
|---|
| 589 | + &hmm_walk_ops, &hmm_vma_walk); |
|---|
| 590 | + /* |
|---|
| 591 | + * When -EBUSY is returned the loop restarts with |
|---|
| 592 | + * hmm_vma_walk.last set to an address that has not been stored |
|---|
| 593 | + * in pfns. All entries < last in the pfn array are set to their |
|---|
| 594 | + * output, and all >= are still at their input values. |
|---|
| 595 | + */ |
|---|
| 596 | + } while (ret == -EBUSY); |
|---|
| 912 | 597 | return ret; |
|---|
| 913 | 598 | } |
|---|
| 914 | | -EXPORT_SYMBOL(hmm_vma_fault); |
|---|
| 915 | | -#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ |
|---|
| 916 | | - |
|---|
| 917 | | - |
|---|
| 918 | | -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) |
|---|
| 919 | | -struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, |
|---|
| 920 | | - unsigned long addr) |
|---|
| 921 | | -{ |
|---|
| 922 | | - struct page *page; |
|---|
| 923 | | - |
|---|
| 924 | | - page = alloc_page_vma(GFP_HIGHUSER, vma, addr); |
|---|
| 925 | | - if (!page) |
|---|
| 926 | | - return NULL; |
|---|
| 927 | | - lock_page(page); |
|---|
| 928 | | - return page; |
|---|
| 929 | | -} |
|---|
| 930 | | -EXPORT_SYMBOL(hmm_vma_alloc_locked_page); |
|---|
| 931 | | - |
|---|
| 932 | | - |
|---|
| 933 | | -static void hmm_devmem_ref_release(struct percpu_ref *ref) |
|---|
| 934 | | -{ |
|---|
| 935 | | - struct hmm_devmem *devmem; |
|---|
| 936 | | - |
|---|
| 937 | | - devmem = container_of(ref, struct hmm_devmem, ref); |
|---|
| 938 | | - complete(&devmem->completion); |
|---|
| 939 | | -} |
|---|
| 940 | | - |
|---|
| 941 | | -static void hmm_devmem_ref_exit(void *data) |
|---|
| 942 | | -{ |
|---|
| 943 | | - struct percpu_ref *ref = data; |
|---|
| 944 | | - struct hmm_devmem *devmem; |
|---|
| 945 | | - |
|---|
| 946 | | - devmem = container_of(ref, struct hmm_devmem, ref); |
|---|
| 947 | | - percpu_ref_exit(ref); |
|---|
| 948 | | -} |
|---|
| 949 | | - |
|---|
| 950 | | -static void hmm_devmem_ref_kill(void *data) |
|---|
| 951 | | -{ |
|---|
| 952 | | - struct percpu_ref *ref = data; |
|---|
| 953 | | - struct hmm_devmem *devmem; |
|---|
| 954 | | - |
|---|
| 955 | | - devmem = container_of(ref, struct hmm_devmem, ref); |
|---|
| 956 | | - percpu_ref_kill(ref); |
|---|
| 957 | | - wait_for_completion(&devmem->completion); |
|---|
| 958 | | -} |
|---|
| 959 | | - |
|---|
| 960 | | -static int hmm_devmem_fault(struct vm_area_struct *vma, |
|---|
| 961 | | - unsigned long addr, |
|---|
| 962 | | - const struct page *page, |
|---|
| 963 | | - unsigned int flags, |
|---|
| 964 | | - pmd_t *pmdp) |
|---|
| 965 | | -{ |
|---|
| 966 | | - struct hmm_devmem *devmem = page->pgmap->data; |
|---|
| 967 | | - |
|---|
| 968 | | - return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); |
|---|
| 969 | | -} |
|---|
| 970 | | - |
|---|
| 971 | | -static void hmm_devmem_free(struct page *page, void *data) |
|---|
| 972 | | -{ |
|---|
| 973 | | - struct hmm_devmem *devmem = data; |
|---|
| 974 | | - |
|---|
| 975 | | - page->mapping = NULL; |
|---|
| 976 | | - |
|---|
| 977 | | - devmem->ops->free(devmem, page); |
|---|
| 978 | | -} |
|---|
| 979 | | - |
|---|
| 980 | | -static DEFINE_MUTEX(hmm_devmem_lock); |
|---|
| 981 | | -static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); |
|---|
| 982 | | - |
|---|
| 983 | | -static void hmm_devmem_radix_release(struct resource *resource) |
|---|
| 984 | | -{ |
|---|
| 985 | | - resource_size_t key; |
|---|
| 986 | | - |
|---|
| 987 | | - mutex_lock(&hmm_devmem_lock); |
|---|
| 988 | | - for (key = resource->start; |
|---|
| 989 | | - key <= resource->end; |
|---|
| 990 | | - key += PA_SECTION_SIZE) |
|---|
| 991 | | - radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT); |
|---|
| 992 | | - mutex_unlock(&hmm_devmem_lock); |
|---|
| 993 | | -} |
|---|
| 994 | | - |
|---|
| 995 | | -static void hmm_devmem_release(void *data) |
|---|
| 996 | | -{ |
|---|
| 997 | | - struct hmm_devmem *devmem = data; |
|---|
| 998 | | - struct resource *resource = devmem->resource; |
|---|
| 999 | | - unsigned long start_pfn, npages; |
|---|
| 1000 | | - struct page *page; |
|---|
| 1001 | | - int nid; |
|---|
| 1002 | | - |
|---|
| 1003 | | - /* pages are dead and unused, undo the arch mapping */ |
|---|
| 1004 | | - start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT; |
|---|
| 1005 | | - npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT; |
|---|
| 1006 | | - |
|---|
| 1007 | | - page = pfn_to_page(start_pfn); |
|---|
| 1008 | | - nid = page_to_nid(page); |
|---|
| 1009 | | - |
|---|
| 1010 | | - mem_hotplug_begin(); |
|---|
| 1011 | | - if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) |
|---|
| 1012 | | - __remove_pages(start_pfn, npages, NULL); |
|---|
| 1013 | | - else |
|---|
| 1014 | | - arch_remove_memory(nid, start_pfn << PAGE_SHIFT, |
|---|
| 1015 | | - npages << PAGE_SHIFT, NULL); |
|---|
| 1016 | | - mem_hotplug_done(); |
|---|
| 1017 | | - |
|---|
| 1018 | | - hmm_devmem_radix_release(resource); |
|---|
| 1019 | | -} |
|---|
| 1020 | | - |
|---|
| 1021 | | -static int hmm_devmem_pages_create(struct hmm_devmem *devmem) |
|---|
| 1022 | | -{ |
|---|
| 1023 | | - resource_size_t key, align_start, align_size, align_end; |
|---|
| 1024 | | - struct device *device = devmem->device; |
|---|
| 1025 | | - int ret, nid, is_ram; |
|---|
| 1026 | | - unsigned long pfn; |
|---|
| 1027 | | - |
|---|
| 1028 | | - align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); |
|---|
| 1029 | | - align_size = ALIGN(devmem->resource->start + |
|---|
| 1030 | | - resource_size(devmem->resource), |
|---|
| 1031 | | - PA_SECTION_SIZE) - align_start; |
|---|
| 1032 | | - |
|---|
| 1033 | | - is_ram = region_intersects(align_start, align_size, |
|---|
| 1034 | | - IORESOURCE_SYSTEM_RAM, |
|---|
| 1035 | | - IORES_DESC_NONE); |
|---|
| 1036 | | - if (is_ram == REGION_MIXED) { |
|---|
| 1037 | | - WARN_ONCE(1, "%s attempted on mixed region %pr\n", |
|---|
| 1038 | | - __func__, devmem->resource); |
|---|
| 1039 | | - return -ENXIO; |
|---|
| 1040 | | - } |
|---|
| 1041 | | - if (is_ram == REGION_INTERSECTS) |
|---|
| 1042 | | - return -ENXIO; |
|---|
| 1043 | | - |
|---|
| 1044 | | - if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY) |
|---|
| 1045 | | - devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; |
|---|
| 1046 | | - else |
|---|
| 1047 | | - devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; |
|---|
| 1048 | | - |
|---|
| 1049 | | - devmem->pagemap.res = *devmem->resource; |
|---|
| 1050 | | - devmem->pagemap.page_fault = hmm_devmem_fault; |
|---|
| 1051 | | - devmem->pagemap.page_free = hmm_devmem_free; |
|---|
| 1052 | | - devmem->pagemap.dev = devmem->device; |
|---|
| 1053 | | - devmem->pagemap.ref = &devmem->ref; |
|---|
| 1054 | | - devmem->pagemap.data = devmem; |
|---|
| 1055 | | - |
|---|
| 1056 | | - mutex_lock(&hmm_devmem_lock); |
|---|
| 1057 | | - align_end = align_start + align_size - 1; |
|---|
| 1058 | | - for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { |
|---|
| 1059 | | - struct hmm_devmem *dup; |
|---|
| 1060 | | - |
|---|
| 1061 | | - dup = radix_tree_lookup(&hmm_devmem_radix, |
|---|
| 1062 | | - key >> PA_SECTION_SHIFT); |
|---|
| 1063 | | - if (dup) { |
|---|
| 1064 | | - dev_err(device, "%s: collides with mapping for %s\n", |
|---|
| 1065 | | - __func__, dev_name(dup->device)); |
|---|
| 1066 | | - mutex_unlock(&hmm_devmem_lock); |
|---|
| 1067 | | - ret = -EBUSY; |
|---|
| 1068 | | - goto error; |
|---|
| 1069 | | - } |
|---|
| 1070 | | - ret = radix_tree_insert(&hmm_devmem_radix, |
|---|
| 1071 | | - key >> PA_SECTION_SHIFT, |
|---|
| 1072 | | - devmem); |
|---|
| 1073 | | - if (ret) { |
|---|
| 1074 | | - dev_err(device, "%s: failed: %d\n", __func__, ret); |
|---|
| 1075 | | - mutex_unlock(&hmm_devmem_lock); |
|---|
| 1076 | | - goto error_radix; |
|---|
| 1077 | | - } |
|---|
| 1078 | | - } |
|---|
| 1079 | | - mutex_unlock(&hmm_devmem_lock); |
|---|
| 1080 | | - |
|---|
| 1081 | | - nid = dev_to_node(device); |
|---|
| 1082 | | - if (nid < 0) |
|---|
| 1083 | | - nid = numa_mem_id(); |
|---|
| 1084 | | - |
|---|
| 1085 | | - mem_hotplug_begin(); |
|---|
| 1086 | | - /* |
|---|
| 1087 | | - * For device private memory we call add_pages() as we only need to |
|---|
| 1088 | | - * allocate and initialize struct page for the device memory. More- |
|---|
| 1089 | | - * over the device memory is un-accessible thus we do not want to |
|---|
| 1090 | | - * create a linear mapping for the memory like arch_add_memory() |
|---|
| 1091 | | - * would do. |
|---|
| 1092 | | - * |
|---|
| 1093 | | - * For device public memory, which is accesible by the CPU, we do |
|---|
| 1094 | | - * want the linear mapping and thus use arch_add_memory(). |
|---|
| 1095 | | - */ |
|---|
| 1096 | | - if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC) |
|---|
| 1097 | | - ret = arch_add_memory(nid, align_start, align_size, NULL, |
|---|
| 1098 | | - false); |
|---|
| 1099 | | - else |
|---|
| 1100 | | - ret = add_pages(nid, align_start >> PAGE_SHIFT, |
|---|
| 1101 | | - align_size >> PAGE_SHIFT, NULL, false); |
|---|
| 1102 | | - if (ret) { |
|---|
| 1103 | | - mem_hotplug_done(); |
|---|
| 1104 | | - goto error_add_memory; |
|---|
| 1105 | | - } |
|---|
| 1106 | | - move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], |
|---|
| 1107 | | - align_start >> PAGE_SHIFT, |
|---|
| 1108 | | - align_size >> PAGE_SHIFT, NULL); |
|---|
| 1109 | | - mem_hotplug_done(); |
|---|
| 1110 | | - |
|---|
| 1111 | | - for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { |
|---|
| 1112 | | - struct page *page = pfn_to_page(pfn); |
|---|
| 1113 | | - |
|---|
| 1114 | | - page->pgmap = &devmem->pagemap; |
|---|
| 1115 | | - } |
|---|
| 1116 | | - return 0; |
|---|
| 1117 | | - |
|---|
| 1118 | | -error_add_memory: |
|---|
| 1119 | | - untrack_pfn(NULL, PHYS_PFN(align_start), align_size); |
|---|
| 1120 | | -error_radix: |
|---|
| 1121 | | - hmm_devmem_radix_release(devmem->resource); |
|---|
| 1122 | | -error: |
|---|
| 1123 | | - return ret; |
|---|
| 1124 | | -} |
|---|
| 1125 | | - |
|---|
| 1126 | | -/* |
|---|
| 1127 | | - * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory |
|---|
| 1128 | | - * |
|---|
| 1129 | | - * @ops: memory event device driver callback (see struct hmm_devmem_ops) |
|---|
| 1130 | | - * @device: device struct to bind the resource too |
|---|
| 1131 | | - * @size: size in bytes of the device memory to add |
|---|
| 1132 | | - * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise |
|---|
| 1133 | | - * |
|---|
| 1134 | | - * This function first finds an empty range of physical address big enough to |
|---|
| 1135 | | - * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which |
|---|
| 1136 | | - * in turn allocates struct pages. It does not do anything beyond that; all |
|---|
| 1137 | | - * events affecting the memory will go through the various callbacks provided |
|---|
| 1138 | | - * by hmm_devmem_ops struct. |
|---|
| 1139 | | - * |
|---|
| 1140 | | - * Device driver should call this function during device initialization and |
|---|
| 1141 | | - * is then responsible of memory management. HMM only provides helpers. |
|---|
| 1142 | | - */ |
|---|
| 1143 | | -struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, |
|---|
| 1144 | | - struct device *device, |
|---|
| 1145 | | - unsigned long size) |
|---|
| 1146 | | -{ |
|---|
| 1147 | | - struct hmm_devmem *devmem; |
|---|
| 1148 | | - resource_size_t addr; |
|---|
| 1149 | | - int ret; |
|---|
| 1150 | | - |
|---|
| 1151 | | - dev_pagemap_get_ops(); |
|---|
| 1152 | | - |
|---|
| 1153 | | - devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); |
|---|
| 1154 | | - if (!devmem) |
|---|
| 1155 | | - return ERR_PTR(-ENOMEM); |
|---|
| 1156 | | - |
|---|
| 1157 | | - init_completion(&devmem->completion); |
|---|
| 1158 | | - devmem->pfn_first = -1UL; |
|---|
| 1159 | | - devmem->pfn_last = -1UL; |
|---|
| 1160 | | - devmem->resource = NULL; |
|---|
| 1161 | | - devmem->device = device; |
|---|
| 1162 | | - devmem->ops = ops; |
|---|
| 1163 | | - |
|---|
| 1164 | | - ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, |
|---|
| 1165 | | - 0, GFP_KERNEL); |
|---|
| 1166 | | - if (ret) |
|---|
| 1167 | | - return ERR_PTR(ret); |
|---|
| 1168 | | - |
|---|
| 1169 | | - ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref); |
|---|
| 1170 | | - if (ret) |
|---|
| 1171 | | - return ERR_PTR(ret); |
|---|
| 1172 | | - |
|---|
| 1173 | | - size = ALIGN(size, PA_SECTION_SIZE); |
|---|
| 1174 | | - addr = min((unsigned long)iomem_resource.end, |
|---|
| 1175 | | - (1UL << MAX_PHYSMEM_BITS) - 1); |
|---|
| 1176 | | - addr = addr - size + 1UL; |
|---|
| 1177 | | - |
|---|
| 1178 | | - /* |
|---|
| 1179 | | - * FIXME add a new helper to quickly walk resource tree and find free |
|---|
| 1180 | | - * range |
|---|
| 1181 | | - * |
|---|
| 1182 | | - * FIXME what about ioport_resource resource ? |
|---|
| 1183 | | - */ |
|---|
| 1184 | | - for (; addr > size && addr >= iomem_resource.start; addr -= size) { |
|---|
| 1185 | | - ret = region_intersects(addr, size, 0, IORES_DESC_NONE); |
|---|
| 1186 | | - if (ret != REGION_DISJOINT) |
|---|
| 1187 | | - continue; |
|---|
| 1188 | | - |
|---|
| 1189 | | - devmem->resource = devm_request_mem_region(device, addr, size, |
|---|
| 1190 | | - dev_name(device)); |
|---|
| 1191 | | - if (!devmem->resource) |
|---|
| 1192 | | - return ERR_PTR(-ENOMEM); |
|---|
| 1193 | | - break; |
|---|
| 1194 | | - } |
|---|
| 1195 | | - if (!devmem->resource) |
|---|
| 1196 | | - return ERR_PTR(-ERANGE); |
|---|
| 1197 | | - |
|---|
| 1198 | | - devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; |
|---|
| 1199 | | - devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; |
|---|
| 1200 | | - devmem->pfn_last = devmem->pfn_first + |
|---|
| 1201 | | - (resource_size(devmem->resource) >> PAGE_SHIFT); |
|---|
| 1202 | | - |
|---|
| 1203 | | - ret = hmm_devmem_pages_create(devmem); |
|---|
| 1204 | | - if (ret) |
|---|
| 1205 | | - return ERR_PTR(ret); |
|---|
| 1206 | | - |
|---|
| 1207 | | - ret = devm_add_action_or_reset(device, hmm_devmem_release, devmem); |
|---|
| 1208 | | - if (ret) |
|---|
| 1209 | | - return ERR_PTR(ret); |
|---|
| 1210 | | - |
|---|
| 1211 | | - return devmem; |
|---|
| 1212 | | -} |
|---|
| 1213 | | -EXPORT_SYMBOL_GPL(hmm_devmem_add); |
|---|
| 1214 | | - |
|---|
| 1215 | | -struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, |
|---|
| 1216 | | - struct device *device, |
|---|
| 1217 | | - struct resource *res) |
|---|
| 1218 | | -{ |
|---|
| 1219 | | - struct hmm_devmem *devmem; |
|---|
| 1220 | | - int ret; |
|---|
| 1221 | | - |
|---|
| 1222 | | - if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) |
|---|
| 1223 | | - return ERR_PTR(-EINVAL); |
|---|
| 1224 | | - |
|---|
| 1225 | | - dev_pagemap_get_ops(); |
|---|
| 1226 | | - |
|---|
| 1227 | | - devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); |
|---|
| 1228 | | - if (!devmem) |
|---|
| 1229 | | - return ERR_PTR(-ENOMEM); |
|---|
| 1230 | | - |
|---|
| 1231 | | - init_completion(&devmem->completion); |
|---|
| 1232 | | - devmem->pfn_first = -1UL; |
|---|
| 1233 | | - devmem->pfn_last = -1UL; |
|---|
| 1234 | | - devmem->resource = res; |
|---|
| 1235 | | - devmem->device = device; |
|---|
| 1236 | | - devmem->ops = ops; |
|---|
| 1237 | | - |
|---|
| 1238 | | - ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, |
|---|
| 1239 | | - 0, GFP_KERNEL); |
|---|
| 1240 | | - if (ret) |
|---|
| 1241 | | - return ERR_PTR(ret); |
|---|
| 1242 | | - |
|---|
| 1243 | | - ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, |
|---|
| 1244 | | - &devmem->ref); |
|---|
| 1245 | | - if (ret) |
|---|
| 1246 | | - return ERR_PTR(ret); |
|---|
| 1247 | | - |
|---|
| 1248 | | - devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; |
|---|
| 1249 | | - devmem->pfn_last = devmem->pfn_first + |
|---|
| 1250 | | - (resource_size(devmem->resource) >> PAGE_SHIFT); |
|---|
| 1251 | | - |
|---|
| 1252 | | - ret = hmm_devmem_pages_create(devmem); |
|---|
| 1253 | | - if (ret) |
|---|
| 1254 | | - return ERR_PTR(ret); |
|---|
| 1255 | | - |
|---|
| 1256 | | - ret = devm_add_action_or_reset(device, hmm_devmem_release, devmem); |
|---|
| 1257 | | - if (ret) |
|---|
| 1258 | | - return ERR_PTR(ret); |
|---|
| 1259 | | - |
|---|
| 1260 | | - ret = devm_add_action_or_reset(device, hmm_devmem_ref_kill, |
|---|
| 1261 | | - &devmem->ref); |
|---|
| 1262 | | - if (ret) |
|---|
| 1263 | | - return ERR_PTR(ret); |
|---|
| 1264 | | - |
|---|
| 1265 | | - return devmem; |
|---|
| 1266 | | -} |
|---|
| 1267 | | -EXPORT_SYMBOL_GPL(hmm_devmem_add_resource); |
|---|
| 1268 | | - |
|---|
| 1269 | | -/* |
|---|
| 1270 | | - * A device driver that wants to handle multiple devices memory through a |
|---|
| 1271 | | - * single fake device can use hmm_device to do so. This is purely a helper |
|---|
| 1272 | | - * and it is not needed to make use of any HMM functionality. |
|---|
| 1273 | | - */ |
|---|
| 1274 | | -#define HMM_DEVICE_MAX 256 |
|---|
| 1275 | | - |
|---|
| 1276 | | -static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX); |
|---|
| 1277 | | -static DEFINE_SPINLOCK(hmm_device_lock); |
|---|
| 1278 | | -static struct class *hmm_device_class; |
|---|
| 1279 | | -static dev_t hmm_device_devt; |
|---|
| 1280 | | - |
|---|
| 1281 | | -static void hmm_device_release(struct device *device) |
|---|
| 1282 | | -{ |
|---|
| 1283 | | - struct hmm_device *hmm_device; |
|---|
| 1284 | | - |
|---|
| 1285 | | - hmm_device = container_of(device, struct hmm_device, device); |
|---|
| 1286 | | - spin_lock(&hmm_device_lock); |
|---|
| 1287 | | - clear_bit(hmm_device->minor, hmm_device_mask); |
|---|
| 1288 | | - spin_unlock(&hmm_device_lock); |
|---|
| 1289 | | - |
|---|
| 1290 | | - kfree(hmm_device); |
|---|
| 1291 | | -} |
|---|
| 1292 | | - |
|---|
| 1293 | | -struct hmm_device *hmm_device_new(void *drvdata) |
|---|
| 1294 | | -{ |
|---|
| 1295 | | - struct hmm_device *hmm_device; |
|---|
| 1296 | | - |
|---|
| 1297 | | - hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL); |
|---|
| 1298 | | - if (!hmm_device) |
|---|
| 1299 | | - return ERR_PTR(-ENOMEM); |
|---|
| 1300 | | - |
|---|
| 1301 | | - spin_lock(&hmm_device_lock); |
|---|
| 1302 | | - hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX); |
|---|
| 1303 | | - if (hmm_device->minor >= HMM_DEVICE_MAX) { |
|---|
| 1304 | | - spin_unlock(&hmm_device_lock); |
|---|
| 1305 | | - kfree(hmm_device); |
|---|
| 1306 | | - return ERR_PTR(-EBUSY); |
|---|
| 1307 | | - } |
|---|
| 1308 | | - set_bit(hmm_device->minor, hmm_device_mask); |
|---|
| 1309 | | - spin_unlock(&hmm_device_lock); |
|---|
| 1310 | | - |
|---|
| 1311 | | - dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor); |
|---|
| 1312 | | - hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt), |
|---|
| 1313 | | - hmm_device->minor); |
|---|
| 1314 | | - hmm_device->device.release = hmm_device_release; |
|---|
| 1315 | | - dev_set_drvdata(&hmm_device->device, drvdata); |
|---|
| 1316 | | - hmm_device->device.class = hmm_device_class; |
|---|
| 1317 | | - device_initialize(&hmm_device->device); |
|---|
| 1318 | | - |
|---|
| 1319 | | - return hmm_device; |
|---|
| 1320 | | -} |
|---|
| 1321 | | -EXPORT_SYMBOL(hmm_device_new); |
|---|
| 1322 | | - |
|---|
| 1323 | | -void hmm_device_put(struct hmm_device *hmm_device) |
|---|
| 1324 | | -{ |
|---|
| 1325 | | - put_device(&hmm_device->device); |
|---|
| 1326 | | -} |
|---|
| 1327 | | -EXPORT_SYMBOL(hmm_device_put); |
|---|
| 1328 | | - |
|---|
| 1329 | | -static int __init hmm_init(void) |
|---|
| 1330 | | -{ |
|---|
| 1331 | | - int ret; |
|---|
| 1332 | | - |
|---|
| 1333 | | - ret = alloc_chrdev_region(&hmm_device_devt, 0, |
|---|
| 1334 | | - HMM_DEVICE_MAX, |
|---|
| 1335 | | - "hmm_device"); |
|---|
| 1336 | | - if (ret) |
|---|
| 1337 | | - return ret; |
|---|
| 1338 | | - |
|---|
| 1339 | | - hmm_device_class = class_create(THIS_MODULE, "hmm_device"); |
|---|
| 1340 | | - if (IS_ERR(hmm_device_class)) { |
|---|
| 1341 | | - unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX); |
|---|
| 1342 | | - return PTR_ERR(hmm_device_class); |
|---|
| 1343 | | - } |
|---|
| 1344 | | - return 0; |
|---|
| 1345 | | -} |
|---|
| 1346 | | - |
|---|
| 1347 | | -device_initcall(hmm_init); |
|---|
| 1348 | | -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ |
|---|
| 599 | +EXPORT_SYMBOL(hmm_range_fault); |
|---|