| Old | New | Line |
|---|---|---|
| 1 | +/* SPDX-License-Identifier: GPL-2.0-or-later */ |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * Copyright 2013 Red Hat Inc. |
|---|
| 3 | 4 | * |
|---|
| 4 | | - * This program is free software; you can redistribute it and/or modify |
|---|
| 5 | | - * it under the terms of the GNU General Public License as published by |
|---|
| 6 | | - * the Free Software Foundation; either version 2 of the License, or |
|---|
| 7 | | - * (at your option) any later version. |
|---|
| 5 | + * Authors: Jérôme Glisse <jglisse@redhat.com> |
|---|
| 8 | 6 | * |
|---|
| 9 | | - * This program is distributed in the hope that it will be useful, |
|---|
| 10 | | - * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 11 | | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 12 | | - * GNU General Public License for more details. |
|---|
| 13 | | - * |
|---|
| 14 | | - * Authors: Jérôme Glisse <jglisse@redhat.com> |
|---|
| 15 | | - */ |
|---|
| 16 | | -/* |
|---|
| 17 | | - * Heterogeneous Memory Management (HMM) |
|---|
| 18 | | - * |
|---|
| 19 | | - * See Documentation/vm/hmm.rst for reasons and overview of what HMM is and it |
|---|
| 20 | | - * is for. Here we focus on the HMM API description, with some explanation of |
|---|
| 21 | | - * the underlying implementation. |
|---|
| 22 | | - * |
|---|
| 23 | | - * Short description: HMM provides a set of helpers to share a virtual address |
|---|
| 24 | | - * space between CPU and a device, so that the device can access any valid |
|---|
| 25 | | - * address of the process (while still obeying memory protection). HMM also |
|---|
| 26 | | - * provides helpers to migrate process memory to device memory, and back. Each |
|---|
| 27 | | - * set of functionality (address space mirroring, and migration to and from |
|---|
| 28 | | - * device memory) can be used independently of the other. |
|---|
| 29 | | - * |
|---|
| 30 | | - * |
|---|
| 31 | | - * HMM address space mirroring API: |
|---|
| 32 | | - * |
|---|
| 33 | | - * Use HMM address space mirroring if you want to mirror range of the CPU page |
|---|
| 34 | | - * table of a process into a device page table. Here, "mirror" means "keep |
|---|
| 35 | | - * synchronized". Prerequisites: the device must provide the ability to write- |
|---|
| 36 | | - * protect its page tables (at PAGE_SIZE granularity), and must be able to |
|---|
| 37 | | - * recover from the resulting potential page faults. |
|---|
| 38 | | - * |
|---|
| 39 | | - * HMM guarantees that at any point in time, a given virtual address points to |
|---|
| 40 | | - * either the same memory in both CPU and device page tables (that is: CPU and |
|---|
| 41 | | - * device page tables each point to the same pages), or that one page table (CPU |
|---|
| 42 | | - * or device) points to no entry, while the other still points to the old page |
|---|
| 43 | | - * for the address. The latter case happens when the CPU page table update |
|---|
| 44 | | - * happens first, and then the update is mirrored over to the device page table. |
|---|
| 45 | | - * This does not cause any issue, because the CPU page table cannot start |
|---|
| 46 | | - * pointing to a new page until the device page table is invalidated. |
|---|
| 47 | | - * |
|---|
| 48 | | - * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any |
|---|
| 49 | | - * updates to each device driver that has registered a mirror. It also provides |
|---|
| 50 | | - * some API calls to help with taking a snapshot of the CPU page table, and to |
|---|
| 51 | | - * synchronize with any updates that might happen concurrently. |
|---|
| 52 | | - * |
|---|
| 53 | | - * |
|---|
| 54 | | - * HMM migration to and from device memory: |
|---|
| 55 | | - * |
|---|
| 56 | | - * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with |
|---|
| 57 | | - * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page |
|---|
| 58 | | - * of the device memory, and allows the device driver to manage its memory |
|---|
| 59 | | - * using those struct pages. Having struct pages for device memory makes |
|---|
| 60 | | - * migration easier. Because that memory is not addressable by the CPU it must |
|---|
| 61 | | - * never be pinned to the device; in other words, any CPU page fault can always |
|---|
| 62 | | - * cause the device memory to be migrated (copied/moved) back to regular memory. |
|---|
| 63 | | - * |
|---|
| 64 | | - * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that |
|---|
| 65 | | - * allows use of a device DMA engine to perform the copy operation between |
|---|
| 66 | | - * regular system memory and device memory. |
|---|
| 7 | + * See Documentation/vm/hmm.rst for reasons and overview of what HMM is. |
|---|
| 67 | 8 | */ |
|---|
| 68 | 9 | #ifndef LINUX_HMM_H |
|---|
| 69 | 10 | #define LINUX_HMM_H |
|---|
| 70 | 11 | |
|---|
| 71 | 12 | #include <linux/kconfig.h> |
|---|
| 72 | | - |
|---|
| 73 | | -#if IS_ENABLED(CONFIG_HMM) |
|---|
| 13 | +#include <linux/pgtable.h> |
|---|
| 74 | 14 | |
|---|
| 75 | 15 | #include <linux/device.h> |
|---|
| 76 | 16 | #include <linux/migrate.h> |
|---|
| 77 | 17 | #include <linux/memremap.h> |
|---|
| 78 | 18 | #include <linux/completion.h> |
|---|
| 79 | | - |
|---|
| 80 | | -struct hmm; |
|---|
| 19 | +#include <linux/mmu_notifier.h> |
|---|
| 81 | 20 | |
|---|
| 82 | 21 | /* |
|---|
| 83 | | - * hmm_pfn_flag_e - HMM flag enums |
|---|
| 22 | + * On output: |
|---|
| 23 | + * 0 - The page is faultable and a future call with |
|---|
| 24 | + * HMM_PFN_REQ_FAULT could succeed. |
|---|
| 25 | + * HMM_PFN_VALID - the pfn field points to a valid PFN. This PFN is at |
|---|
| 26 | + * least readable. If dev_private_owner is !NULL then this could |
|---|
| 27 | + * point at a DEVICE_PRIVATE page. |
|---|
| 28 | + * HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID) |
|---|
| 29 | + * HMM_PFN_ERROR - accessing the pfn is impossible and the device should |
|---|
| 30 | + * fail, e.g. poisoned memory, special pages, no vma, etc. |
|---|
| 84 | 31 | * |
|---|
| 85 | | - * Flags: |
|---|
| 86 | | - * HMM_PFN_VALID: pfn is valid. It has, at least, read permission. |
|---|
| 87 | | - * HMM_PFN_WRITE: CPU page table has write permission set |
|---|
| 88 | | - * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE) |
|---|
| 89 | | - * |
|---|
| 90 | | - * The driver provide a flags array, if driver valid bit for an entry is bit |
|---|
| 91 | | - * 3 ie (entry & (1 << 3)) is true if entry is valid then driver must provide |
|---|
| 92 | | - * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3. |
|---|
| 93 | | - * Same logic apply to all flags. This is same idea as vm_page_prot in vma |
|---|
| 94 | | - * except that this is per device driver rather than per architecture. |
|---|
| 32 | + * On input: |
|---|
| 33 | + * 0 - Return the current state of the page, do not fault it. |
|---|
| 34 | + * HMM_PFN_REQ_FAULT - The output must have HMM_PFN_VALID or hmm_range_fault() |
|---|
| 35 | + * will fail |
|---|
| 36 | + * HMM_PFN_REQ_WRITE - The output must have HMM_PFN_WRITE or hmm_range_fault() |
|---|
| 37 | + * will fail. Must be combined with HMM_PFN_REQ_FAULT. |
|---|
| 95 | 38 | */ |
|---|
| 96 | | -enum hmm_pfn_flag_e { |
|---|
| 97 | | - HMM_PFN_VALID = 0, |
|---|
| 98 | | - HMM_PFN_WRITE, |
|---|
| 99 | | - HMM_PFN_DEVICE_PRIVATE, |
|---|
| 100 | | - HMM_PFN_FLAG_MAX |
|---|
| 39 | +enum hmm_pfn_flags { |
|---|
| 40 | + /* Output fields and flags */ |
|---|
| 41 | + HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1), |
|---|
| 42 | + HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2), |
|---|
| 43 | + HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3), |
|---|
| 44 | + HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 8), |
|---|
| 45 | + |
|---|
| 46 | + /* Input flags */ |
|---|
| 47 | + HMM_PFN_REQ_FAULT = HMM_PFN_VALID, |
|---|
| 48 | + HMM_PFN_REQ_WRITE = HMM_PFN_WRITE, |
|---|
| 49 | + |
|---|
| 50 | + HMM_PFN_FLAGS = 0xFFUL << HMM_PFN_ORDER_SHIFT, |
|---|
| 101 | 51 | }; |
|---|
| 102 | 52 | |
|---|
| 103 | 53 | /* |
|---|
| 104 | | - * hmm_pfn_value_e - HMM pfn special value |
|---|
| 54 | + * hmm_pfn_to_page() - return struct page pointed to by a device entry |
|---|
| 105 | 55 | * |
|---|
| 106 | | - * Flags: |
|---|
| 107 | | - * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory |
|---|
| 108 | | - * HMM_PFN_NONE: corresponding CPU page table entry is pte_none() |
|---|
| 109 | | - * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the |
|---|
| 110 | | - * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not |
|---|
| 111 | | - * be mirrored by a device, because the entry will never have HMM_PFN_VALID |
|---|
| 112 | | - * set and the pfn value is undefined. |
|---|
| 113 | | - * |
|---|
| 114 | | - * Driver provide entry value for none entry, error entry and special entry, |
|---|
| 115 | | - * driver can alias (ie use same value for error and special for instance). It |
|---|
| 116 | | - * should not alias none and error or special. |
|---|
| 117 | | - * |
|---|
| 118 | | - * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be: |
|---|
| 119 | | - * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous, |
|---|
| 120 | | - * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table |
|---|
| 121 | | - * hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one |
|---|
| 56 | + * This must be called under the caller's 'user_lock' after a successful |
|---|
| 57 | + * mmu_interval_read_begin(). The caller must have tested for HMM_PFN_VALID |
|---|
| 58 | + * already. |
|---|
| 122 | 59 | */ |
|---|
| 123 | | -enum hmm_pfn_value_e { |
|---|
| 124 | | - HMM_PFN_ERROR, |
|---|
| 125 | | - HMM_PFN_NONE, |
|---|
| 126 | | - HMM_PFN_SPECIAL, |
|---|
| 127 | | - HMM_PFN_VALUE_MAX |
|---|
| 128 | | -}; |
|---|
| 60 | +static inline struct page *hmm_pfn_to_page(unsigned long hmm_pfn) |
|---|
| 61 | +{ |
|---|
| 62 | + return pfn_to_page(hmm_pfn & ~HMM_PFN_FLAGS); |
|---|
| 63 | +} |
|---|
| 64 | + |
|---|
| 65 | +/* |
|---|
| 66 | + * hmm_pfn_to_map_order() - return the CPU mapping size order |
|---|
| 67 | + * |
|---|
| 68 | + * This is optionally useful to optimize processing of the pfn result |
|---|
| 69 | + * array. It indicates that the page starts at the order aligned VA and is |
|---|
| 70 | + * 1<<order bytes long. Every pfn within a high-order page will have the |
|---|
| 71 | + * same pfn flags, both access protections and the map_order. The caller must |
|---|
| 72 | + * be careful with edge cases as the start and end VA of the given page may |
|---|
| 73 | + * extend past the range used with hmm_range_fault(). |
|---|
| 74 | + * |
|---|
| 75 | + * This must be called under the caller's 'user_lock' after a successful |
|---|
| 76 | + * mmu_interval_read_begin(). The caller must have tested for HMM_PFN_VALID |
|---|
| 77 | + * already. |
|---|
| 78 | + */ |
|---|
| 79 | +static inline unsigned int hmm_pfn_to_map_order(unsigned long hmm_pfn) |
|---|
| 80 | +{ |
|---|
| 81 | + return (hmm_pfn >> HMM_PFN_ORDER_SHIFT) & 0x1F; |
|---|
| 82 | +} |
|---|
| 129 | 83 | |
|---|
| 130 | 84 | /* |
|---|
| 131 | 85 | * struct hmm_range - track invalidation lock on virtual address range |
|---|
| 132 | 86 | * |
|---|
| 133 | | - * @vma: the vm area struct for the range |
|---|
| 134 | | - * @list: all range lock are on a list |
|---|
| 87 | + * @notifier: a mmu_interval_notifier that includes the start/end |
|---|
| 88 | + * @notifier_seq: result of mmu_interval_read_begin() |
|---|
| 135 | 89 | * @start: range virtual start address (inclusive) |
|---|
| 136 | 90 | * @end: range virtual end address (exclusive) |
|---|
| 137 | | - * @pfns: array of pfns (big enough for the range) |
|---|
| 138 | | - * @flags: pfn flags to match device driver page table |
|---|
| 139 | | - * @values: pfn value for some special case (none, special, error, ...) |
|---|
| 140 | | - * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) |
|---|
| 141 | | - * @valid: pfns array did not change since it has been fill by an HMM function |
|---|
| 91 | + * @hmm_pfns: array of pfns (big enough for the range) |
|---|
| 92 | + * @default_flags: default flags for the range (write, read, ... see hmm doc) |
|---|
| 93 | + * @pfn_flags_mask: allows masking pfn flags so that only default_flags matter |
|---|
| 94 | + * @dev_private_owner: owner of device private pages |
|---|
| 142 | 95 | */ |
|---|
| 143 | 96 | struct hmm_range { |
|---|
| 144 | | - struct vm_area_struct *vma; |
|---|
| 145 | | - struct list_head list; |
|---|
| 97 | + struct mmu_interval_notifier *notifier; |
|---|
| 98 | + unsigned long notifier_seq; |
|---|
| 146 | 99 | unsigned long start; |
|---|
| 147 | 100 | unsigned long end; |
|---|
| 148 | | - uint64_t *pfns; |
|---|
| 149 | | - const uint64_t *flags; |
|---|
| 150 | | - const uint64_t *values; |
|---|
| 151 | | - uint8_t pfn_shift; |
|---|
| 152 | | - bool valid; |
|---|
| 101 | + unsigned long *hmm_pfns; |
|---|
| 102 | + unsigned long default_flags; |
|---|
| 103 | + unsigned long pfn_flags_mask; |
|---|
| 104 | + void *dev_private_owner; |
|---|
| 153 | 105 | }; |
|---|
| 154 | 106 | |
|---|
| 155 | 107 | /* |
|---|
| 156 | | - * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn |
|---|
| 157 | | - * @range: range use to decode HMM pfn value |
|---|
| 158 | | - * @pfn: HMM pfn value to get corresponding struct page from |
|---|
| 159 | | - * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise |
|---|
| 160 | | - * |
|---|
| 161 | | - * If the HMM pfn is valid (ie valid flag set) then return the struct page |
|---|
| 162 | | - * matching the pfn value stored in the HMM pfn. Otherwise return NULL. |
|---|
| 108 | + * Please see Documentation/vm/hmm.rst for how to use the range API. |
|---|
| 163 | 109 | */ |
|---|
| 164 | | -static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, |
|---|
| 165 | | - uint64_t pfn) |
|---|
| 166 | | -{ |
|---|
| 167 | | - if (pfn == range->values[HMM_PFN_NONE]) |
|---|
| 168 | | - return NULL; |
|---|
| 169 | | - if (pfn == range->values[HMM_PFN_ERROR]) |
|---|
| 170 | | - return NULL; |
|---|
| 171 | | - if (pfn == range->values[HMM_PFN_SPECIAL]) |
|---|
| 172 | | - return NULL; |
|---|
| 173 | | - if (!(pfn & range->flags[HMM_PFN_VALID])) |
|---|
| 174 | | - return NULL; |
|---|
| 175 | | - return pfn_to_page(pfn >> range->pfn_shift); |
|---|
| 176 | | -} |
|---|
| 110 | +int hmm_range_fault(struct hmm_range *range); |
|---|
| 177 | 111 | |
|---|
| 178 | 112 | /* |
|---|
| 179 | | - * hmm_pfn_to_pfn() - return pfn value store in a HMM pfn |
|---|
| 180 | | - * @range: range use to decode HMM pfn value |
|---|
| 181 | | - * @pfn: HMM pfn value to extract pfn from |
|---|
| 182 | | - * Returns: pfn value if HMM pfn is valid, -1UL otherwise |
|---|
| 113 | + * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range |
|---|
| 114 | + * |
|---|
| 115 | + * When waiting for mmu notifiers we need some kind of timeout, otherwise we |
|---|
| 116 | + * could potentially wait forever; 1000ms (i.e. 1s) already sounds like a |
|---|
| 117 | + * long time to wait. |
|---|
| 183 | 118 | */ |
|---|
| 184 | | -static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, |
|---|
| 185 | | - uint64_t pfn) |
|---|
| 186 | | -{ |
|---|
| 187 | | - if (pfn == range->values[HMM_PFN_NONE]) |
|---|
| 188 | | - return -1UL; |
|---|
| 189 | | - if (pfn == range->values[HMM_PFN_ERROR]) |
|---|
| 190 | | - return -1UL; |
|---|
| 191 | | - if (pfn == range->values[HMM_PFN_SPECIAL]) |
|---|
| 192 | | - return -1UL; |
|---|
| 193 | | - if (!(pfn & range->flags[HMM_PFN_VALID])) |
|---|
| 194 | | - return -1UL; |
|---|
| 195 | | - return (pfn >> range->pfn_shift); |
|---|
| 196 | | -} |
|---|
| 197 | | - |
|---|
| 198 | | -/* |
|---|
| 199 | | - * hmm_pfn_from_page() - create a valid HMM pfn value from struct page |
|---|
| 200 | | - * @range: range use to encode HMM pfn value |
|---|
| 201 | | - * @page: struct page pointer for which to create the HMM pfn |
|---|
| 202 | | - * Returns: valid HMM pfn for the page |
|---|
| 203 | | - */ |
|---|
| 204 | | -static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, |
|---|
| 205 | | - struct page *page) |
|---|
| 206 | | -{ |
|---|
| 207 | | - return (page_to_pfn(page) << range->pfn_shift) | |
|---|
| 208 | | - range->flags[HMM_PFN_VALID]; |
|---|
| 209 | | -} |
|---|
| 210 | | - |
|---|
| 211 | | -/* |
|---|
| 212 | | - * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn |
|---|
| 213 | | - * @range: range use to encode HMM pfn value |
|---|
| 214 | | - * @pfn: pfn value for which to create the HMM pfn |
|---|
| 215 | | - * Returns: valid HMM pfn for the pfn |
|---|
| 216 | | - */ |
|---|
| 217 | | -static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, |
|---|
| 218 | | - unsigned long pfn) |
|---|
| 219 | | -{ |
|---|
| 220 | | - return (pfn << range->pfn_shift) | |
|---|
| 221 | | - range->flags[HMM_PFN_VALID]; |
|---|
| 222 | | -} |
|---|
| 223 | | - |
|---|
| 224 | | - |
|---|
| 225 | | -#if IS_ENABLED(CONFIG_HMM_MIRROR) |
|---|
| 226 | | -/* |
|---|
| 227 | | - * Mirroring: how to synchronize device page table with CPU page table. |
|---|
| 228 | | - * |
|---|
| 229 | | - * A device driver that is participating in HMM mirroring must always |
|---|
| 230 | | - * synchronize with CPU page table updates. For this, device drivers can either |
|---|
| 231 | | - * directly use mmu_notifier APIs or they can use the hmm_mirror API. Device |
|---|
| 232 | | - * drivers can decide to register one mirror per device per process, or just |
|---|
| 233 | | - * one mirror per process for a group of devices. The pattern is: |
|---|
| 234 | | - * |
|---|
| 235 | | - * int device_bind_address_space(..., struct mm_struct *mm, ...) |
|---|
| 236 | | - * { |
|---|
| 237 | | - * struct device_address_space *das; |
|---|
| 238 | | - * |
|---|
| 239 | | - * // Device driver specific initialization, and allocation of das |
|---|
| 240 | | - * // which contains an hmm_mirror struct as one of its fields. |
|---|
| 241 | | - * ... |
|---|
| 242 | | - * |
|---|
| 243 | | - * ret = hmm_mirror_register(&das->mirror, mm, &device_mirror_ops); |
|---|
| 244 | | - * if (ret) { |
|---|
| 245 | | - * // Cleanup on error |
|---|
| 246 | | - * return ret; |
|---|
| 247 | | - * } |
|---|
| 248 | | - * |
|---|
| 249 | | - * // Other device driver specific initialization |
|---|
| 250 | | - * ... |
|---|
| 251 | | - * } |
|---|
| 252 | | - * |
|---|
| 253 | | - * Once an hmm_mirror is registered for an address space, the device driver |
|---|
| 254 | | - * will get callbacks through sync_cpu_device_pagetables() operation (see |
|---|
| 255 | | - * hmm_mirror_ops struct). |
|---|
| 256 | | - * |
|---|
| 257 | | - * Device driver must not free the struct containing the hmm_mirror struct |
|---|
| 258 | | - * before calling hmm_mirror_unregister(). The expected usage is to do that when |
|---|
| 259 | | - * the device driver is unbinding from an address space. |
|---|
| 260 | | - * |
|---|
| 261 | | - * |
|---|
| 262 | | - * void device_unbind_address_space(struct device_address_space *das) |
|---|
| 263 | | - * { |
|---|
| 264 | | - * // Device driver specific cleanup |
|---|
| 265 | | - * ... |
|---|
| 266 | | - * |
|---|
| 267 | | - * hmm_mirror_unregister(&das->mirror); |
|---|
| 268 | | - * |
|---|
| 269 | | - * // Other device driver specific cleanup, and now das can be freed |
|---|
| 270 | | - * ... |
|---|
| 271 | | - * } |
|---|
| 272 | | - */ |
|---|
| 273 | | - |
|---|
| 274 | | -struct hmm_mirror; |
|---|
| 275 | | - |
|---|
| 276 | | -/* |
|---|
| 277 | | - * enum hmm_update_type - type of update |
|---|
| 278 | | - * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why) |
|---|
| 279 | | - */ |
|---|
| 280 | | -enum hmm_update_type { |
|---|
| 281 | | - HMM_UPDATE_INVALIDATE, |
|---|
| 282 | | -}; |
|---|
| 283 | | - |
|---|
| 284 | | -/* |
|---|
| 285 | | - * struct hmm_mirror_ops - HMM mirror device operations callback |
|---|
| 286 | | - * |
|---|
| 287 | | - * @update: callback to update range on a device |
|---|
| 288 | | - */ |
|---|
| 289 | | -struct hmm_mirror_ops { |
|---|
| 290 | | - /* release() - release hmm_mirror |
|---|
| 291 | | - * |
|---|
| 292 | | - * @mirror: pointer to struct hmm_mirror |
|---|
| 293 | | - * |
|---|
| 294 | | - * This is called when the mm_struct is being released. |
|---|
| 295 | | - * The callback should make sure no references to the mirror occur |
|---|
| 296 | | - * after the callback returns. |
|---|
| 297 | | - */ |
|---|
| 298 | | - void (*release)(struct hmm_mirror *mirror); |
|---|
| 299 | | - |
|---|
| 300 | | - /* sync_cpu_device_pagetables() - synchronize page tables |
|---|
| 301 | | - * |
|---|
| 302 | | - * @mirror: pointer to struct hmm_mirror |
|---|
| 303 | | - * @update_type: type of update that occurred to the CPU page table |
|---|
| 304 | | - * @start: virtual start address of the range to update |
|---|
| 305 | | - * @end: virtual end address of the range to update |
|---|
| 306 | | - * |
|---|
| 307 | | - * This callback ultimately originates from mmu_notifiers when the CPU |
|---|
| 308 | | - * page table is updated. The device driver must update its page table |
|---|
| 309 | | - * in response to this callback. The update argument tells what action |
|---|
| 310 | | - * to perform. |
|---|
| 311 | | - * |
|---|
| 312 | | - * The device driver must not return from this callback until the device |
|---|
| 313 | | - * page tables are completely updated (TLBs flushed, etc); this is a |
|---|
| 314 | | - * synchronous call. |
|---|
| 315 | | - */ |
|---|
| 316 | | - void (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, |
|---|
| 317 | | - enum hmm_update_type update_type, |
|---|
| 318 | | - unsigned long start, |
|---|
| 319 | | - unsigned long end); |
|---|
| 320 | | -}; |
|---|
| 321 | | - |
|---|
| 322 | | -/* |
|---|
| 323 | | - * struct hmm_mirror - mirror struct for a device driver |
|---|
| 324 | | - * |
|---|
| 325 | | - * @hmm: pointer to struct hmm (which is unique per mm_struct) |
|---|
| 326 | | - * @ops: device driver callback for HMM mirror operations |
|---|
| 327 | | - * @list: for list of mirrors of a given mm |
|---|
| 328 | | - * |
|---|
| 329 | | - * Each address space (mm_struct) being mirrored by a device must register one |
|---|
| 330 | | - * instance of an hmm_mirror struct with HMM. HMM will track the list of all |
|---|
| 331 | | - * mirrors for each mm_struct. |
|---|
| 332 | | - */ |
|---|
| 333 | | -struct hmm_mirror { |
|---|
| 334 | | - struct hmm *hmm; |
|---|
| 335 | | - const struct hmm_mirror_ops *ops; |
|---|
| 336 | | - struct list_head list; |
|---|
| 337 | | -}; |
|---|
| 338 | | - |
|---|
| 339 | | -int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); |
|---|
| 340 | | -void hmm_mirror_unregister(struct hmm_mirror *mirror); |
|---|
| 341 | | - |
|---|
| 342 | | - |
|---|
| 343 | | -/* |
|---|
| 344 | | - * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device |
|---|
| 345 | | - * driver lock that serializes device page table updates, then call |
|---|
| 346 | | - * hmm_vma_range_done(), to check if the snapshot is still valid. The same |
|---|
| 347 | | - * device driver page table update lock must also be used in the |
|---|
| 348 | | - * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page |
|---|
| 349 | | - * table invalidation serializes on it. |
|---|
| 350 | | - * |
|---|
| 351 | | - * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL |
|---|
| 352 | | - * hmm_vma_get_pfns() WITHOUT ERROR ! |
|---|
| 353 | | - * |
|---|
| 354 | | - * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID ! |
|---|
| 355 | | - */ |
|---|
| 356 | | -int hmm_vma_get_pfns(struct hmm_range *range); |
|---|
| 357 | | -bool hmm_vma_range_done(struct hmm_range *range); |
|---|
| 358 | | - |
|---|
| 359 | | - |
|---|
| 360 | | -/* |
|---|
| 361 | | - * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will |
|---|
| 362 | | - * not migrate any device memory back to system memory. The HMM pfn array will |
|---|
| 363 | | - * be updated with the fault result and current snapshot of the CPU page table |
|---|
| 364 | | - * for the range. |
|---|
| 365 | | - * |
|---|
| 366 | | - * The mmap_sem must be taken in read mode before entering and it might be |
|---|
| 367 | | - * dropped by the function if the block argument is false. In that case, the |
|---|
| 368 | | - * function returns -EAGAIN. |
|---|
| 369 | | - * |
|---|
| 370 | | - * Return value does not reflect if the fault was successful for every single |
|---|
| 371 | | - * address or not. Therefore, the caller must to inspect the HMM pfn array to |
|---|
| 372 | | - * determine fault status for each address. |
|---|
| 373 | | - * |
|---|
| 374 | | - * Trying to fault inside an invalid vma will result in -EINVAL. |
|---|
| 375 | | - * |
|---|
| 376 | | - * See the function description in mm/hmm.c for further documentation. |
|---|
| 377 | | - */ |
|---|
| 378 | | -int hmm_vma_fault(struct hmm_range *range, bool block); |
|---|
| 379 | | - |
|---|
| 380 | | -/* Below are for HMM internal use only! Not to be used by device driver! */ |
|---|
| 381 | | -void hmm_mm_destroy(struct mm_struct *mm); |
|---|
| 382 | | - |
|---|
| 383 | | -static inline void hmm_mm_init(struct mm_struct *mm) |
|---|
| 384 | | -{ |
|---|
| 385 | | - mm->hmm = NULL; |
|---|
| 386 | | -} |
|---|
| 387 | | -#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ |
|---|
| 388 | | -static inline void hmm_mm_destroy(struct mm_struct *mm) {} |
|---|
| 389 | | -static inline void hmm_mm_init(struct mm_struct *mm) {} |
|---|
| 390 | | -#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ |
|---|
| 391 | | - |
|---|
| 392 | | -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) |
|---|
| 393 | | -struct hmm_devmem; |
|---|
| 394 | | - |
|---|
| 395 | | -struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, |
|---|
| 396 | | - unsigned long addr); |
|---|
| 397 | | - |
|---|
| 398 | | -/* |
|---|
| 399 | | - * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events |
|---|
| 400 | | - * |
|---|
| 401 | | - * @free: call when refcount on page reach 1 and thus is no longer use |
|---|
| 402 | | - * @fault: call when there is a page fault to unaddressable memory |
|---|
| 403 | | - * |
|---|
| 404 | | - * Both callback happens from page_free() and page_fault() callback of struct |
|---|
| 405 | | - * dev_pagemap respectively. See include/linux/memremap.h for more details on |
|---|
| 406 | | - * those. |
|---|
| 407 | | - * |
|---|
| 408 | | - * The hmm_devmem_ops callback are just here to provide a coherent and |
|---|
| 409 | | - * uniq API to device driver and device driver should not register their |
|---|
| 410 | | - * own page_free() or page_fault() but rely on the hmm_devmem_ops call- |
|---|
| 411 | | - * back. |
|---|
| 412 | | - */ |
|---|
| 413 | | -struct hmm_devmem_ops { |
|---|
| 414 | | - /* |
|---|
| 415 | | - * free() - free a device page |
|---|
| 416 | | - * @devmem: device memory structure (see struct hmm_devmem) |
|---|
| 417 | | - * @page: pointer to struct page being freed |
|---|
| 418 | | - * |
|---|
| 419 | | - * Call back occurs whenever a device page refcount reach 1 which |
|---|
| 420 | | - * means that no one is holding any reference on the page anymore |
|---|
| 421 | | - * (ZONE_DEVICE page have an elevated refcount of 1 as default so |
|---|
| 422 | | - * that they are not release to the general page allocator). |
|---|
| 423 | | - * |
|---|
| 424 | | - * Note that callback has exclusive ownership of the page (as no |
|---|
| 425 | | - * one is holding any reference). |
|---|
| 426 | | - */ |
|---|
| 427 | | - void (*free)(struct hmm_devmem *devmem, struct page *page); |
|---|
| 428 | | - /* |
|---|
| 429 | | - * fault() - CPU page fault or get user page (GUP) |
|---|
| 430 | | - * @devmem: device memory structure (see struct hmm_devmem) |
|---|
| 431 | | - * @vma: virtual memory area containing the virtual address |
|---|
| 432 | | - * @addr: virtual address that faulted or for which there is a GUP |
|---|
| 433 | | - * @page: pointer to struct page backing virtual address (unreliable) |
|---|
| 434 | | - * @flags: FAULT_FLAG_* (see include/linux/mm.h) |
|---|
| 435 | | - * @pmdp: page middle directory |
|---|
| 436 | | - * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR |
|---|
| 437 | | - * on error |
|---|
| 438 | | - * |
|---|
| 439 | | - * The callback occurs whenever there is a CPU page fault or GUP on a |
|---|
| 440 | | - * virtual address. This means that the device driver must migrate the |
|---|
| 441 | | - * page back to regular memory (CPU accessible). |
|---|
| 442 | | - * |
|---|
| 443 | | - * The device driver is free to migrate more than one page from the |
|---|
| 444 | | - * fault() callback as an optimization. However if device decide to |
|---|
| 445 | | - * migrate more than one page it must always priotirize the faulting |
|---|
| 446 | | - * address over the others. |
|---|
| 447 | | - * |
|---|
| 448 | | - * The struct page pointer is only given as an hint to allow quick |
|---|
| 449 | | - * lookup of internal device driver data. A concurrent migration |
|---|
| 450 | | - * might have already free that page and the virtual address might |
|---|
| 451 | | - * not longer be back by it. So it should not be modified by the |
|---|
| 452 | | - * callback. |
|---|
| 453 | | - * |
|---|
| 454 | | - * Note that mmap semaphore is held in read mode at least when this |
|---|
| 455 | | - * callback occurs, hence the vma is valid upon callback entry. |
|---|
| 456 | | - */ |
|---|
| 457 | | - int (*fault)(struct hmm_devmem *devmem, |
|---|
| 458 | | - struct vm_area_struct *vma, |
|---|
| 459 | | - unsigned long addr, |
|---|
| 460 | | - const struct page *page, |
|---|
| 461 | | - unsigned int flags, |
|---|
| 462 | | - pmd_t *pmdp); |
|---|
| 463 | | -}; |
|---|
| 464 | | - |
|---|
| 465 | | -/* |
|---|
| 466 | | - * struct hmm_devmem - track device memory |
|---|
| 467 | | - * |
|---|
| 468 | | - * @completion: completion object for device memory |
|---|
| 469 | | - * @pfn_first: first pfn for this resource (set by hmm_devmem_add()) |
|---|
| 470 | | - * @pfn_last: last pfn for this resource (set by hmm_devmem_add()) |
|---|
| 471 | | - * @resource: IO resource reserved for this chunk of memory |
|---|
| 472 | | - * @pagemap: device page map for that chunk |
|---|
| 473 | | - * @device: device to bind resource to |
|---|
| 474 | | - * @ops: memory operations callback |
|---|
| 475 | | - * @ref: per CPU refcount |
|---|
| 476 | | - * |
|---|
| 477 | | - * This an helper structure for device drivers that do not wish to implement |
|---|
| 478 | | - * the gory details related to hotplugging new memoy and allocating struct |
|---|
| 479 | | - * pages. |
|---|
| 480 | | - * |
|---|
| 481 | | - * Device drivers can directly use ZONE_DEVICE memory on their own if they |
|---|
| 482 | | - * wish to do so. |
|---|
| 483 | | - */ |
|---|
| 484 | | -struct hmm_devmem { |
|---|
| 485 | | - struct completion completion; |
|---|
| 486 | | - unsigned long pfn_first; |
|---|
| 487 | | - unsigned long pfn_last; |
|---|
| 488 | | - struct resource *resource; |
|---|
| 489 | | - struct device *device; |
|---|
| 490 | | - struct dev_pagemap pagemap; |
|---|
| 491 | | - const struct hmm_devmem_ops *ops; |
|---|
| 492 | | - struct percpu_ref ref; |
|---|
| 493 | | -}; |
|---|
| 494 | | - |
|---|
| 495 | | -/* |
|---|
| 496 | | - * To add (hotplug) device memory, HMM assumes that there is no real resource |
|---|
| 497 | | - * that reserves a range in the physical address space (this is intended to be |
|---|
| 498 | | - * use by unaddressable device memory). It will reserve a physical range big |
|---|
| 499 | | - * enough and allocate struct page for it. |
|---|
| 500 | | - * |
|---|
| 501 | | - * The device driver can wrap the hmm_devmem struct inside a private device |
|---|
| 502 | | - * driver struct. |
|---|
| 503 | | - */ |
|---|
| 504 | | -struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, |
|---|
| 505 | | - struct device *device, |
|---|
| 506 | | - unsigned long size); |
|---|
| 507 | | -struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, |
|---|
| 508 | | - struct device *device, |
|---|
| 509 | | - struct resource *res); |
|---|
| 510 | | - |
|---|
| 511 | | -/* |
|---|
| 512 | | - * hmm_devmem_page_set_drvdata - set per-page driver data field |
|---|
| 513 | | - * |
|---|
| 514 | | - * @page: pointer to struct page |
|---|
| 515 | | - * @data: driver data value to set |
|---|
| 516 | | - * |
|---|
| 517 | | - * Because page can not be on lru we have an unsigned long that driver can use |
|---|
| 518 | | - * to store a per page field. This just a simple helper to do that. |
|---|
| 519 | | - */ |
|---|
| 520 | | -static inline void hmm_devmem_page_set_drvdata(struct page *page, |
|---|
| 521 | | - unsigned long data) |
|---|
| 522 | | -{ |
|---|
| 523 | | - page->hmm_data = data; |
|---|
| 524 | | -} |
|---|
| 525 | | - |
|---|
| 526 | | -/* |
|---|
| 527 | | - * hmm_devmem_page_get_drvdata - get per page driver data field |
|---|
| 528 | | - * |
|---|
| 529 | | - * @page: pointer to struct page |
|---|
| 530 | | - * Return: driver data value |
|---|
| 531 | | - */ |
|---|
| 532 | | -static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) |
|---|
| 533 | | -{ |
|---|
| 534 | | - return page->hmm_data; |
|---|
| 535 | | -} |
|---|
| 536 | | - |
|---|
| 537 | | - |
|---|
| 538 | | -/* |
|---|
| 539 | | - * struct hmm_device - fake device to hang device memory onto |
|---|
| 540 | | - * |
|---|
| 541 | | - * @device: device struct |
|---|
| 542 | | - * @minor: device minor number |
|---|
| 543 | | - */ |
|---|
| 544 | | -struct hmm_device { |
|---|
| 545 | | - struct device device; |
|---|
| 546 | | - unsigned int minor; |
|---|
| 547 | | -}; |
|---|
| 548 | | - |
|---|
| 549 | | -/* |
|---|
| 550 | | - * A device driver that wants to handle multiple devices memory through a |
|---|
| 551 | | - * single fake device can use hmm_device to do so. This is purely a helper and |
|---|
| 552 | | - * it is not strictly needed, in order to make use of any HMM functionality. |
|---|
| 553 | | - */ |
|---|
| 554 | | -struct hmm_device *hmm_device_new(void *drvdata); |
|---|
| 555 | | -void hmm_device_put(struct hmm_device *hmm_device); |
|---|
| 556 | | -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ |
|---|
| 557 | | -#else /* IS_ENABLED(CONFIG_HMM) */ |
|---|
| 558 | | -static inline void hmm_mm_destroy(struct mm_struct *mm) {} |
|---|
| 559 | | -static inline void hmm_mm_init(struct mm_struct *mm) {} |
|---|
| 560 | | -#endif /* IS_ENABLED(CONFIG_HMM) */ |
|---|
| 119 | +#define HMM_RANGE_DEFAULT_TIMEOUT 1000 |
|---|
| 561 | 120 | |
|---|
| 562 | 121 | #endif /* LINUX_HMM_H */ |
|---|
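
For reference, the usage pattern the new API expects is the one spelled out in Documentation/vm/hmm.rst: snapshot the range with hmm_range_fault() between mmu_interval_read_begin() and mmu_interval_read_retry(), and only consume the returned pfn array while holding the driver lock that the invalidate() callback also takes. Below is a minimal sketch of that loop; the driver-side names (struct my_dev, pagetable_lock, my_dev_fault_range()) are hypothetical placeholders, while the HMM and mmu_interval_notifier calls are the real API from this header.

```c
/*
 * Minimal sketch of the hmm_range_fault() retry loop described in
 * Documentation/vm/hmm.rst.  "struct my_dev", its pagetable_lock and
 * my_dev_fault_range() are hypothetical driver names used only for
 * illustration; the HMM and mmu_interval_notifier calls are real.
 */
#include <linux/hmm.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>

struct my_dev {
	struct mmu_interval_notifier notifier;	/* registered on the mm */
	struct mutex pagetable_lock;		/* also taken in invalidate() */
};

/* pfns must have one entry per page in [start, end). */
static int my_dev_fault_range(struct my_dev *dev, unsigned long start,
			      unsigned long end, unsigned long *pfns)
{
	struct mm_struct *mm = dev->notifier.mm;
	struct hmm_range range = {
		.notifier	= &dev->notifier,
		.start		= start,
		.end		= end,
		.hmm_pfns	= pfns,
		/* Fault every page and require write access. */
		.default_flags	= HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
		.pfn_flags_mask	= 0,
	};
	int ret;

	if (!mmget_not_zero(mm))
		return -EFAULT;

	do {
		range.notifier_seq = mmu_interval_read_begin(range.notifier);

		mmap_read_lock(mm);
		ret = hmm_range_fault(&range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;	/* raced with an invalidation */
			break;
		}

		/* Serialize against the invalidate() callback. */
		mutex_lock(&dev->pagetable_lock);
		if (mmu_interval_read_retry(range.notifier,
					    range.notifier_seq)) {
			mutex_unlock(&dev->pagetable_lock);
			continue;
		}

		/*
		 * pfns[] is stable here: update the device page table from it
		 * under pagetable_lock.  hmm_pfn_to_page() and
		 * hmm_pfn_to_map_order() may only be used in this window.
		 */
		mutex_unlock(&dev->pagetable_lock);
		break;
	} while (true);

	mmput(mm);
	return ret;
}
```

On -EBUSY the sketch simply retries; a driver that wants to bound the wait can use HMM_RANGE_DEFAULT_TIMEOUT from the header above as its deadline.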