.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | | - * linux/mm/vmalloc.c |
---|
3 | | - * |
---|
4 | 3 | * Copyright (C) 1993 Linus Torvalds |
---|
5 | 4 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 |
---|
6 | 5 | * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 |
---|
7 | 6 | * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 |
---|
8 | 7 | * Numa awareness, Christoph Lameter, SGI, June 2005 |
---|
| 8 | + * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019 |
---|
9 | 9 | */ |
---|
10 | 10 | |
---|
11 | 11 | #include <linux/vmalloc.h> |
---|
.. | .. |
---|
18 | 18 | #include <linux/interrupt.h> |
---|
19 | 19 | #include <linux/proc_fs.h> |
---|
20 | 20 | #include <linux/seq_file.h> |
---|
| 21 | +#include <linux/set_memory.h> |
---|
21 | 22 | #include <linux/debugobjects.h> |
---|
22 | 23 | #include <linux/kallsyms.h> |
---|
23 | 24 | #include <linux/list.h> |
---|
24 | 25 | #include <linux/notifier.h> |
---|
25 | 26 | #include <linux/rbtree.h> |
---|
26 | | -#include <linux/radix-tree.h> |
---|
| 27 | +#include <linux/xarray.h> |
---|
27 | 28 | #include <linux/rcupdate.h> |
---|
28 | 29 | #include <linux/pfn.h> |
---|
29 | 30 | #include <linux/kmemleak.h> |
---|
.. | .. |
---|
31 | 32 | #include <linux/compiler.h> |
---|
32 | 33 | #include <linux/llist.h> |
---|
33 | 34 | #include <linux/bitops.h> |
---|
| 35 | +#include <linux/rbtree_augmented.h> |
---|
34 | 36 | #include <linux/overflow.h> |
---|
| 37 | +#include <trace/hooks/mm.h> |
---|
35 | 38 | |
---|
36 | 39 | #include <linux/uaccess.h> |
---|
37 | 40 | #include <asm/tlbflush.h> |
---|
38 | 41 | #include <asm/shmparam.h> |
---|
39 | 42 | |
---|
40 | 43 | #include "internal.h" |
---|
| 44 | +#include "pgalloc-track.h" |
---|
| 45 | + |
---|
| 46 | +bool is_vmalloc_addr(const void *x) |
---|
| 47 | +{ |
---|
| 48 | + unsigned long addr = (unsigned long)x; |
---|
| 49 | + |
---|
| 50 | + return addr >= VMALLOC_START && addr < VMALLOC_END; |
---|
| 51 | +} |
---|
| 52 | +EXPORT_SYMBOL(is_vmalloc_addr); |
---|
41 | 53 | |
---|
42 | 54 | struct vfree_deferred { |
---|
43 | 55 | struct llist_head list; |
---|
.. | .. |
---|
58 | 70 | |
---|
59 | 71 | /*** Page table manipulation functions ***/ |
---|
60 | 72 | |
---|
61 | | -static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) |
---|
| 73 | +static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
---|
| 74 | + pgtbl_mod_mask *mask) |
---|
62 | 75 | { |
---|
63 | 76 | pte_t *pte; |
---|
64 | 77 | |
---|
.. | .. |
---|
67 | 80 | pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); |
---|
68 | 81 | WARN_ON(!pte_none(ptent) && !pte_present(ptent)); |
---|
69 | 82 | } while (pte++, addr += PAGE_SIZE, addr != end); |
---|
| 83 | + *mask |= PGTBL_PTE_MODIFIED; |
---|
70 | 84 | } |
---|
71 | 85 | |
---|
72 | | -static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) |
---|
| 86 | +static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, |
---|
| 87 | + pgtbl_mod_mask *mask) |
---|
73 | 88 | { |
---|
74 | 89 | pmd_t *pmd; |
---|
75 | 90 | unsigned long next; |
---|
| 91 | + int cleared; |
---|
76 | 92 | |
---|
77 | 93 | pmd = pmd_offset(pud, addr); |
---|
78 | 94 | do { |
---|
79 | 95 | next = pmd_addr_end(addr, end); |
---|
80 | | - if (pmd_clear_huge(pmd)) |
---|
| 96 | + |
---|
| 97 | + cleared = pmd_clear_huge(pmd); |
---|
| 98 | + if (cleared || pmd_bad(*pmd)) |
---|
| 99 | + *mask |= PGTBL_PMD_MODIFIED; |
---|
| 100 | + |
---|
| 101 | + if (cleared) |
---|
81 | 102 | continue; |
---|
82 | 103 | if (pmd_none_or_clear_bad(pmd)) |
---|
83 | 104 | continue; |
---|
84 | | - vunmap_pte_range(pmd, addr, next); |
---|
| 105 | + vunmap_pte_range(pmd, addr, next, mask); |
---|
| 106 | + |
---|
| 107 | + cond_resched(); |
---|
85 | 108 | } while (pmd++, addr = next, addr != end); |
---|
86 | 109 | } |
---|
87 | 110 | |
---|
88 | | -static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end) |
---|
| 111 | +static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, |
---|
| 112 | + pgtbl_mod_mask *mask) |
---|
89 | 113 | { |
---|
90 | 114 | pud_t *pud; |
---|
91 | 115 | unsigned long next; |
---|
| 116 | + int cleared; |
---|
92 | 117 | |
---|
93 | 118 | pud = pud_offset(p4d, addr); |
---|
94 | 119 | do { |
---|
95 | 120 | next = pud_addr_end(addr, end); |
---|
96 | | - if (pud_clear_huge(pud)) |
---|
| 121 | + |
---|
| 122 | + cleared = pud_clear_huge(pud); |
---|
| 123 | + if (cleared || pud_bad(*pud)) |
---|
| 124 | + *mask |= PGTBL_PUD_MODIFIED; |
---|
| 125 | + |
---|
| 126 | + if (cleared) |
---|
97 | 127 | continue; |
---|
98 | 128 | if (pud_none_or_clear_bad(pud)) |
---|
99 | 129 | continue; |
---|
100 | | - vunmap_pmd_range(pud, addr, next); |
---|
| 130 | + vunmap_pmd_range(pud, addr, next, mask); |
---|
101 | 131 | } while (pud++, addr = next, addr != end); |
---|
102 | 132 | } |
---|
103 | 133 | |
---|
104 | | -static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end) |
---|
| 134 | +static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, |
---|
| 135 | + pgtbl_mod_mask *mask) |
---|
105 | 136 | { |
---|
106 | 137 | p4d_t *p4d; |
---|
107 | 138 | unsigned long next; |
---|
| 139 | + int cleared; |
---|
108 | 140 | |
---|
109 | 141 | p4d = p4d_offset(pgd, addr); |
---|
110 | 142 | do { |
---|
111 | 143 | next = p4d_addr_end(addr, end); |
---|
112 | | - if (p4d_clear_huge(p4d)) |
---|
| 144 | + |
---|
| 145 | + cleared = p4d_clear_huge(p4d); |
---|
| 146 | + if (cleared || p4d_bad(*p4d)) |
---|
| 147 | + *mask |= PGTBL_P4D_MODIFIED; |
---|
| 148 | + |
---|
| 149 | + if (cleared) |
---|
113 | 150 | continue; |
---|
114 | 151 | if (p4d_none_or_clear_bad(p4d)) |
---|
115 | 152 | continue; |
---|
116 | | - vunmap_pud_range(p4d, addr, next); |
---|
| 153 | + vunmap_pud_range(p4d, addr, next, mask); |
---|
117 | 154 | } while (p4d++, addr = next, addr != end); |
---|
118 | 155 | } |
---|
119 | 156 | |
---|
120 | | -static void vunmap_page_range(unsigned long addr, unsigned long end) |
---|
| 157 | +/** |
---|
| 158 | + * unmap_kernel_range_noflush - unmap kernel VM area |
---|
| 159 | + * @start: start of the VM area to unmap |
---|
| 160 | + * @size: size of the VM area to unmap |
---|
| 161 | + * |
---|
| 162 | + * Unmap PFN_UP(@size) pages at @start. The VM area @start and @size specify |
---|
| 163 | + * should have been allocated using get_vm_area() and its friends. |
---|
| 164 | + * |
---|
| 165 | + * NOTE: |
---|
| 166 | + * This function does NOT do any cache flushing. The caller is responsible |
---|
| 167 | + * for calling flush_cache_vunmap() on to-be-unmapped areas before calling this |
---|
| 168 | + * function and flush_tlb_kernel_range() after. |
---|
| 169 | + */ |
---|
| 170 | +void unmap_kernel_range_noflush(unsigned long start, unsigned long size) |
---|
121 | 171 | { |
---|
122 | | - pgd_t *pgd; |
---|
| 172 | + unsigned long end = start + size; |
---|
123 | 173 | unsigned long next; |
---|
| 174 | + pgd_t *pgd; |
---|
| 175 | + unsigned long addr = start; |
---|
| 176 | + pgtbl_mod_mask mask = 0; |
---|
124 | 177 | |
---|
125 | 178 | BUG_ON(addr >= end); |
---|
126 | 179 | pgd = pgd_offset_k(addr); |
---|
127 | 180 | do { |
---|
128 | 181 | next = pgd_addr_end(addr, end); |
---|
| 182 | + if (pgd_bad(*pgd)) |
---|
| 183 | + mask |= PGTBL_PGD_MODIFIED; |
---|
129 | 184 | if (pgd_none_or_clear_bad(pgd)) |
---|
130 | 185 | continue; |
---|
131 | | - vunmap_p4d_range(pgd, addr, next); |
---|
| 186 | + vunmap_p4d_range(pgd, addr, next, &mask); |
---|
132 | 187 | } while (pgd++, addr = next, addr != end); |
---|
| 188 | + |
---|
| 189 | + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) |
---|
| 190 | + arch_sync_kernel_mappings(start, end); |
---|
133 | 191 | } |
---|
134 | 192 | |
---|
135 | 193 | static int vmap_pte_range(pmd_t *pmd, unsigned long addr, |
---|
136 | | - unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
---|
| 194 | + unsigned long end, pgprot_t prot, struct page **pages, int *nr, |
---|
| 195 | + pgtbl_mod_mask *mask) |
---|
137 | 196 | { |
---|
138 | 197 | pte_t *pte; |
---|
139 | 198 | |
---|
.. | .. |
---|
142 | 201 | * callers keep track of where we're up to. |
---|
143 | 202 | */ |
---|
144 | 203 | |
---|
145 | | - pte = pte_alloc_kernel(pmd, addr); |
---|
| 204 | + pte = pte_alloc_kernel_track(pmd, addr, mask); |
---|
146 | 205 | if (!pte) |
---|
147 | 206 | return -ENOMEM; |
---|
148 | 207 | do { |
---|
.. | .. |
---|
155 | 214 | set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); |
---|
156 | 215 | (*nr)++; |
---|
157 | 216 | } while (pte++, addr += PAGE_SIZE, addr != end); |
---|
| 217 | + *mask |= PGTBL_PTE_MODIFIED; |
---|
158 | 218 | return 0; |
---|
159 | 219 | } |
---|
160 | 220 | |
---|
161 | 221 | static int vmap_pmd_range(pud_t *pud, unsigned long addr, |
---|
162 | | - unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
---|
| 222 | + unsigned long end, pgprot_t prot, struct page **pages, int *nr, |
---|
| 223 | + pgtbl_mod_mask *mask) |
---|
163 | 224 | { |
---|
164 | 225 | pmd_t *pmd; |
---|
165 | 226 | unsigned long next; |
---|
166 | 227 | |
---|
167 | | - pmd = pmd_alloc(&init_mm, pud, addr); |
---|
| 228 | + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); |
---|
168 | 229 | if (!pmd) |
---|
169 | 230 | return -ENOMEM; |
---|
170 | 231 | do { |
---|
171 | 232 | next = pmd_addr_end(addr, end); |
---|
172 | | - if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) |
---|
| 233 | + if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask)) |
---|
173 | 234 | return -ENOMEM; |
---|
174 | 235 | } while (pmd++, addr = next, addr != end); |
---|
175 | 236 | return 0; |
---|
176 | 237 | } |
---|
177 | 238 | |
---|
178 | 239 | static int vmap_pud_range(p4d_t *p4d, unsigned long addr, |
---|
179 | | - unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
---|
| 240 | + unsigned long end, pgprot_t prot, struct page **pages, int *nr, |
---|
| 241 | + pgtbl_mod_mask *mask) |
---|
180 | 242 | { |
---|
181 | 243 | pud_t *pud; |
---|
182 | 244 | unsigned long next; |
---|
183 | 245 | |
---|
184 | | - pud = pud_alloc(&init_mm, p4d, addr); |
---|
| 246 | + pud = pud_alloc_track(&init_mm, p4d, addr, mask); |
---|
185 | 247 | if (!pud) |
---|
186 | 248 | return -ENOMEM; |
---|
187 | 249 | do { |
---|
188 | 250 | next = pud_addr_end(addr, end); |
---|
189 | | - if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) |
---|
| 251 | + if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask)) |
---|
190 | 252 | return -ENOMEM; |
---|
191 | 253 | } while (pud++, addr = next, addr != end); |
---|
192 | 254 | return 0; |
---|
193 | 255 | } |
---|
194 | 256 | |
---|
195 | 257 | static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, |
---|
196 | | - unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
---|
| 258 | + unsigned long end, pgprot_t prot, struct page **pages, int *nr, |
---|
| 259 | + pgtbl_mod_mask *mask) |
---|
197 | 260 | { |
---|
198 | 261 | p4d_t *p4d; |
---|
199 | 262 | unsigned long next; |
---|
200 | 263 | |
---|
201 | | - p4d = p4d_alloc(&init_mm, pgd, addr); |
---|
| 264 | + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); |
---|
202 | 265 | if (!p4d) |
---|
203 | 266 | return -ENOMEM; |
---|
204 | 267 | do { |
---|
205 | 268 | next = p4d_addr_end(addr, end); |
---|
206 | | - if (vmap_pud_range(p4d, addr, next, prot, pages, nr)) |
---|
| 269 | + if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask)) |
---|
207 | 270 | return -ENOMEM; |
---|
208 | 271 | } while (p4d++, addr = next, addr != end); |
---|
209 | 272 | return 0; |
---|
210 | 273 | } |
---|
211 | 274 | |
---|
212 | | -/* |
---|
213 | | - * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and |
---|
214 | | - * will have pfns corresponding to the "pages" array. |
---|
| 275 | +/** |
---|
| 276 | + * map_kernel_range_noflush - map kernel VM area with the specified pages |
---|
| 277 | + * @addr: start of the VM area to map |
---|
| 278 | + * @size: size of the VM area to map |
---|
| 279 | + * @prot: page protection flags to use |
---|
| 280 | + * @pages: pages to map |
---|
215 | 281 | * |
---|
216 | | - * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] |
---|
| 282 | + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should |
---|
| 283 | + * have been allocated using get_vm_area() and its friends. |
---|
| 284 | + * |
---|
| 285 | + * NOTE: |
---|
| 286 | + * This function does NOT do any cache flushing. The caller is responsible for |
---|
| 287 | + * calling flush_cache_vmap() on to-be-mapped areas before calling this |
---|
| 288 | + * function. |
---|
| 289 | + * |
---|
| 290 | + * RETURNS: |
---|
| 291 | + * 0 on success, -errno on failure. |
---|
217 | 292 | */ |
---|
218 | | -static int vmap_page_range_noflush(unsigned long start, unsigned long end, |
---|
219 | | - pgprot_t prot, struct page **pages) |
---|
| 293 | +int map_kernel_range_noflush(unsigned long addr, unsigned long size, |
---|
| 294 | + pgprot_t prot, struct page **pages) |
---|
220 | 295 | { |
---|
221 | | - pgd_t *pgd; |
---|
| 296 | + unsigned long start = addr; |
---|
| 297 | + unsigned long end = addr + size; |
---|
222 | 298 | unsigned long next; |
---|
223 | | - unsigned long addr = start; |
---|
| 299 | + pgd_t *pgd; |
---|
224 | 300 | int err = 0; |
---|
225 | 301 | int nr = 0; |
---|
| 302 | + pgtbl_mod_mask mask = 0; |
---|
226 | 303 | |
---|
227 | 304 | BUG_ON(addr >= end); |
---|
228 | 305 | pgd = pgd_offset_k(addr); |
---|
229 | 306 | do { |
---|
230 | 307 | next = pgd_addr_end(addr, end); |
---|
231 | | - err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr); |
---|
| 308 | + if (pgd_bad(*pgd)) |
---|
| 309 | + mask |= PGTBL_PGD_MODIFIED; |
---|
| 310 | + err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); |
---|
232 | 311 | if (err) |
---|
233 | 312 | return err; |
---|
234 | 313 | } while (pgd++, addr = next, addr != end); |
---|
235 | 314 | |
---|
236 | | - return nr; |
---|
| 315 | + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) |
---|
| 316 | + arch_sync_kernel_mappings(start, end); |
---|
| 317 | + |
---|
| 318 | + return 0; |
---|
237 | 319 | } |
---|
238 | 320 | |
---|
239 | | -static int vmap_page_range(unsigned long start, unsigned long end, |
---|
240 | | - pgprot_t prot, struct page **pages) |
---|
| 321 | +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, |
---|
| 322 | + struct page **pages) |
---|
241 | 323 | { |
---|
242 | 324 | int ret; |
---|
243 | 325 | |
---|
244 | | - ret = vmap_page_range_noflush(start, end, prot, pages); |
---|
245 | | - flush_cache_vmap(start, end); |
---|
| 326 | + ret = map_kernel_range_noflush(start, size, prot, pages); |
---|
| 327 | + flush_cache_vmap(start, start + size); |
---|
246 | 328 | return ret; |
---|
247 | 329 | } |
---|
| 330 | +EXPORT_SYMBOL_GPL(map_kernel_range); |
---|
248 | 331 | |
---|
249 | 332 | int is_vmalloc_or_module_addr(const void *x) |
---|
250 | 333 | { |
---|
.. | .. |
---|
324 | 407 | |
---|
325 | 408 | /*** Global kva allocator ***/ |
---|
326 | 409 | |
---|
327 | | -#define VM_LAZY_FREE 0x02 |
---|
328 | | -#define VM_VM_AREA 0x04 |
---|
| 410 | +#define DEBUG_AUGMENT_PROPAGATE_CHECK 0 |
---|
| 411 | +#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 |
---|
| 412 | + |
---|
329 | 413 | |
---|
330 | 414 | static DEFINE_SPINLOCK(vmap_area_lock); |
---|
| 415 | +static DEFINE_SPINLOCK(free_vmap_area_lock); |
---|
331 | 416 | /* Export for kexec only */ |
---|
332 | 417 | LIST_HEAD(vmap_area_list); |
---|
333 | 418 | static LLIST_HEAD(vmap_purge_list); |
---|
334 | 419 | static struct rb_root vmap_area_root = RB_ROOT; |
---|
| 420 | +static bool vmap_initialized __read_mostly; |
---|
335 | 421 | |
---|
336 | | -/* The vmap cache globals are protected by vmap_area_lock */ |
---|
337 | | -static struct rb_node *free_vmap_cache; |
---|
338 | | -static unsigned long cached_hole_size; |
---|
339 | | -static unsigned long cached_vstart; |
---|
340 | | -static unsigned long cached_align; |
---|
| 422 | +/* |
---|
| 423 | + * This kmem_cache is used for vmap_area objects. Instead of |
---|
| 424 | + * allocating from slab we reuse an object from this cache to |
---|
| 425 | + * make things faster. Especially in "no edge" splitting of |
---|
| 426 | + * free block. |
---|
| 427 | + */ |
---|
| 428 | +static struct kmem_cache *vmap_area_cachep; |
---|
341 | 429 | |
---|
342 | | -static unsigned long vmap_area_pcpu_hole; |
---|
| 430 | +/* |
---|
| 431 | + * This linked list is used in pair with free_vmap_area_root. |
---|
| 432 | + * It gives O(1) access to prev/next to perform fast coalescing. |
---|
| 433 | + */ |
---|
| 434 | +static LIST_HEAD(free_vmap_area_list); |
---|
| 435 | + |
---|
| 436 | +/* |
---|
| 437 | + * This augmented red-black tree represents the free vmap space. |
---|
| 438 | + * All vmap_area objects in this tree are sorted by va->va_start |
---|
| 439 | + * address. It is used for allocation and merging when a vmap |
---|
| 440 | + * object is released. |
---|
| 441 | + * |
---|
| 442 | + * Each vmap_area node contains a maximum available free block |
---|
| 443 | + * of its sub-tree, right or left. Therefore it is possible to |
---|
| 444 | + * find a lowest match of free area. |
---|
| 445 | + */ |
---|
| 446 | +static struct rb_root free_vmap_area_root = RB_ROOT; |
---|
| 447 | + |
---|
| 448 | +/* |
---|
| 449 | + * Preload a CPU with one object for "no edge" split case. The |
---|
| 450 | + * aim is to get rid of allocations from the atomic context, thus |
---|
| 451 | + * to use more permissive allocation masks. |
---|
| 452 | + */ |
---|
| 453 | +static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); |
---|
| 454 | + |
---|
| 455 | +static __always_inline unsigned long |
---|
| 456 | +va_size(struct vmap_area *va) |
---|
| 457 | +{ |
---|
| 458 | + return (va->va_end - va->va_start); |
---|
| 459 | +} |
---|
| 460 | + |
---|
| 461 | +static __always_inline unsigned long |
---|
| 462 | +get_subtree_max_size(struct rb_node *node) |
---|
| 463 | +{ |
---|
| 464 | + struct vmap_area *va; |
---|
| 465 | + |
---|
| 466 | + va = rb_entry_safe(node, struct vmap_area, rb_node); |
---|
| 467 | + return va ? va->subtree_max_size : 0; |
---|
| 468 | +} |
---|
| 469 | + |
---|
| 470 | +/* |
---|
| 471 | + * Gets called when removing a node and when rotating. |
---|
| 472 | + */ |
---|
| 473 | +static __always_inline unsigned long |
---|
| 474 | +compute_subtree_max_size(struct vmap_area *va) |
---|
| 475 | +{ |
---|
| 476 | + return max3(va_size(va), |
---|
| 477 | + get_subtree_max_size(va->rb_node.rb_left), |
---|
| 478 | + get_subtree_max_size(va->rb_node.rb_right)); |
---|
| 479 | +} |
---|
| 480 | + |
---|
| 481 | +RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, |
---|
| 482 | + struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) |
---|
| 483 | + |
---|
| 484 | +static void purge_vmap_area_lazy(void); |
---|
| 485 | +static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); |
---|
| 486 | +static unsigned long lazy_max_pages(void); |
---|
343 | 487 | |
---|
344 | 488 | static atomic_long_t nr_vmalloc_pages; |
---|
345 | 489 | |
---|
.. | .. |
---|
347 | 491 | { |
---|
348 | 492 | return atomic_long_read(&nr_vmalloc_pages); |
---|
349 | 493 | } |
---|
| 494 | +EXPORT_SYMBOL_GPL(vmalloc_nr_pages); |
---|
350 | 495 | |
---|
351 | 496 | static struct vmap_area *__find_vmap_area(unsigned long addr) |
---|
352 | 497 | { |
---|
.. | .. |
---|
367 | 512 | return NULL; |
---|
368 | 513 | } |
---|
369 | 514 | |
---|
370 | | -static void __insert_vmap_area(struct vmap_area *va) |
---|
| 515 | +/* |
---|
| 516 | + * This function returns back addresses of parent node |
---|
| 517 | + * and its left or right link for further processing. |
---|
| 518 | + * |
---|
| 519 | + * Otherwise NULL is returned. In that case all further |
---|
| 520 | + * steps regarding inserting of conflicting overlap range |
---|
| 521 | + * have to be declined and actually considered as a bug. |
---|
| 522 | + */ |
---|
| 523 | +static __always_inline struct rb_node ** |
---|
| 524 | +find_va_links(struct vmap_area *va, |
---|
| 525 | + struct rb_root *root, struct rb_node *from, |
---|
| 526 | + struct rb_node **parent) |
---|
371 | 527 | { |
---|
372 | | - struct rb_node **p = &vmap_area_root.rb_node; |
---|
373 | | - struct rb_node *parent = NULL; |
---|
374 | | - struct rb_node *tmp; |
---|
| 528 | + struct vmap_area *tmp_va; |
---|
| 529 | + struct rb_node **link; |
---|
375 | 530 | |
---|
376 | | - while (*p) { |
---|
377 | | - struct vmap_area *tmp_va; |
---|
378 | | - |
---|
379 | | - parent = *p; |
---|
380 | | - tmp_va = rb_entry(parent, struct vmap_area, rb_node); |
---|
381 | | - if (va->va_start < tmp_va->va_end) |
---|
382 | | - p = &(*p)->rb_left; |
---|
383 | | - else if (va->va_end > tmp_va->va_start) |
---|
384 | | - p = &(*p)->rb_right; |
---|
385 | | - else |
---|
386 | | - BUG(); |
---|
| 531 | + if (root) { |
---|
| 532 | + link = &root->rb_node; |
---|
| 533 | + if (unlikely(!*link)) { |
---|
| 534 | + *parent = NULL; |
---|
| 535 | + return link; |
---|
| 536 | + } |
---|
| 537 | + } else { |
---|
| 538 | + link = &from; |
---|
387 | 539 | } |
---|
388 | 540 | |
---|
389 | | - rb_link_node(&va->rb_node, parent, p); |
---|
390 | | - rb_insert_color(&va->rb_node, &vmap_area_root); |
---|
| 541 | + /* |
---|
| 542 | + * Go to the bottom of the tree. When we hit the last point |
---|
| 543 | + * we end up with the parent rb_node and the correct direction, named |
---|
| 544 | + * "link" here, where the new va->rb_node will be attached. |
---|
| 545 | + */ |
---|
| 546 | + do { |
---|
| 547 | + tmp_va = rb_entry(*link, struct vmap_area, rb_node); |
---|
391 | 548 | |
---|
392 | | - /* address-sort this list */ |
---|
393 | | - tmp = rb_prev(&va->rb_node); |
---|
394 | | - if (tmp) { |
---|
395 | | - struct vmap_area *prev; |
---|
396 | | - prev = rb_entry(tmp, struct vmap_area, rb_node); |
---|
397 | | - list_add_rcu(&va->list, &prev->list); |
---|
398 | | - } else |
---|
399 | | - list_add_rcu(&va->list, &vmap_area_list); |
---|
| 549 | + /* |
---|
| 550 | + * During the traversal we also do a sanity check. |
---|
| 551 | + * Trigger a WARN() and return NULL if there is a |
---|
| 552 | + * partial (left/right) or full overlap. |
---|
| 553 | + */ |
---|
| 554 | + if (va->va_start < tmp_va->va_end && |
---|
| 555 | + va->va_end <= tmp_va->va_start) |
---|
| 556 | + link = &(*link)->rb_left; |
---|
| 557 | + else if (va->va_end > tmp_va->va_start && |
---|
| 558 | + va->va_start >= tmp_va->va_end) |
---|
| 559 | + link = &(*link)->rb_right; |
---|
| 560 | + else { |
---|
| 561 | + WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n", |
---|
| 562 | + va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end); |
---|
| 563 | + |
---|
| 564 | + return NULL; |
---|
| 565 | + } |
---|
| 566 | + } while (*link); |
---|
| 567 | + |
---|
| 568 | + *parent = &tmp_va->rb_node; |
---|
| 569 | + return link; |
---|
400 | 570 | } |
---|
401 | 571 | |
---|
402 | | -static void purge_vmap_area_lazy(void); |
---|
| 572 | +static __always_inline struct list_head * |
---|
| 573 | +get_va_next_sibling(struct rb_node *parent, struct rb_node **link) |
---|
| 574 | +{ |
---|
| 575 | + struct list_head *list; |
---|
403 | 576 | |
---|
404 | | -static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); |
---|
| 577 | + if (unlikely(!parent)) |
---|
| 578 | + /* |
---|
| 579 | + * The red-black tree where we try to find VA neighbors |
---|
| 580 | + * before merging or inserting is empty, i.e. it means |
---|
| 581 | + * there is no free vmap space. Normally it does not |
---|
| 582 | + * happen but we handle this case anyway. |
---|
| 583 | + */ |
---|
| 584 | + return NULL; |
---|
| 585 | + |
---|
| 586 | + list = &rb_entry(parent, struct vmap_area, rb_node)->list; |
---|
| 587 | + return (&parent->rb_right == link ? list->next : list); |
---|
| 588 | +} |
---|
| 589 | + |
---|
| 590 | +static __always_inline void |
---|
| 591 | +link_va(struct vmap_area *va, struct rb_root *root, |
---|
| 592 | + struct rb_node *parent, struct rb_node **link, struct list_head *head) |
---|
| 593 | +{ |
---|
| 594 | + /* |
---|
| 595 | + * VA is still not in the list, but we can |
---|
| 596 | + * identify its future previous list_head node. |
---|
| 597 | + */ |
---|
| 598 | + if (likely(parent)) { |
---|
| 599 | + head = &rb_entry(parent, struct vmap_area, rb_node)->list; |
---|
| 600 | + if (&parent->rb_right != link) |
---|
| 601 | + head = head->prev; |
---|
| 602 | + } |
---|
| 603 | + |
---|
| 604 | + /* Insert to the rb-tree */ |
---|
| 605 | + rb_link_node(&va->rb_node, parent, link); |
---|
| 606 | + if (root == &free_vmap_area_root) { |
---|
| 607 | + /* |
---|
| 608 | + * Some explanation here. Just perform simple insertion |
---|
| 609 | + * to the tree. We do not set va->subtree_max_size to |
---|
| 610 | + * its current size before calling rb_insert_augmented(). |
---|
| 611 | + * It is because we populate the tree from the bottom |
---|
| 612 | + * to parent levels when the node _is_ in the tree. |
---|
| 613 | + * |
---|
| 614 | + * Therefore we set subtree_max_size to zero after insertion, |
---|
| 615 | + * to let augment_tree_propagate_from() put everything in |
---|
| 616 | + * the correct order later on. |
---|
| 617 | + */ |
---|
| 618 | + rb_insert_augmented(&va->rb_node, |
---|
| 619 | + root, &free_vmap_area_rb_augment_cb); |
---|
| 620 | + va->subtree_max_size = 0; |
---|
| 621 | + } else { |
---|
| 622 | + rb_insert_color(&va->rb_node, root); |
---|
| 623 | + } |
---|
| 624 | + |
---|
| 625 | + /* Address-sort this list */ |
---|
| 626 | + list_add(&va->list, head); |
---|
| 627 | +} |
---|
| 628 | + |
---|
| 629 | +static __always_inline void |
---|
| 630 | +unlink_va(struct vmap_area *va, struct rb_root *root) |
---|
| 631 | +{ |
---|
| 632 | + if (WARN_ON(RB_EMPTY_NODE(&va->rb_node))) |
---|
| 633 | + return; |
---|
| 634 | + |
---|
| 635 | + if (root == &free_vmap_area_root) |
---|
| 636 | + rb_erase_augmented(&va->rb_node, |
---|
| 637 | + root, &free_vmap_area_rb_augment_cb); |
---|
| 638 | + else |
---|
| 639 | + rb_erase(&va->rb_node, root); |
---|
| 640 | + |
---|
| 641 | + list_del(&va->list); |
---|
| 642 | + RB_CLEAR_NODE(&va->rb_node); |
---|
| 643 | +} |
---|
| 644 | + |
---|
| 645 | +#if DEBUG_AUGMENT_PROPAGATE_CHECK |
---|
| 646 | +static void |
---|
| 647 | +augment_tree_propagate_check(void) |
---|
| 648 | +{ |
---|
| 649 | + struct vmap_area *va; |
---|
| 650 | + unsigned long computed_size; |
---|
| 651 | + |
---|
| 652 | + list_for_each_entry(va, &free_vmap_area_list, list) { |
---|
| 653 | + computed_size = compute_subtree_max_size(va); |
---|
| 654 | + if (computed_size != va->subtree_max_size) |
---|
| 655 | + pr_emerg("tree is corrupted: %lu, %lu\n", |
---|
| 656 | + va_size(va), va->subtree_max_size); |
---|
| 657 | + } |
---|
| 658 | +} |
---|
| 659 | +#endif |
---|
| 660 | + |
---|
| 661 | +/* |
---|
| 662 | + * This function populates subtree_max_size from bottom to upper |
---|
| 663 | + * levels starting from VA point. The propagation must be done |
---|
| 664 | + * when VA size is modified by changing its va_start/va_end. Or |
---|
| 665 | + * in case of newly inserting of VA to the tree. |
---|
| 666 | + * |
---|
| 667 | + * It means that augment_tree_propagate_from() must be called: |
---|
| 668 | + * - After VA has been inserted to the tree(free path); |
---|
| 669 | + * - After VA has been shrunk(allocation path); |
---|
| 670 | + * - After VA has been increased(merging path). |
---|
| 671 | + * |
---|
| 672 | + * Please note that, it does not mean that upper parent nodes |
---|
| 673 | + * and their subtree_max_size are recalculated all the time up |
---|
| 674 | + * to the root node. |
---|
| 675 | + * |
---|
| 676 | + * 4--8 |
---|
| 677 | + * /\ |
---|
| 678 | + * / \ |
---|
| 679 | + * / \ |
---|
| 680 | + * 2--2 8--8 |
---|
| 681 | + * |
---|
| 682 | + * For example if we modify the node 4, shrinking it to 2, then |
---|
| 683 | + * no modification is required. If we shrink the node 2 to 1 |
---|
| 684 | + * its subtree_max_size is updated only, and set to 1. If we shrink |
---|
| 685 | + * the node 8 to 6, then its subtree_max_size is set to 6 and parent |
---|
| 686 | + * node becomes 4--6. |
---|
| 687 | + */ |
---|
| 688 | +static __always_inline void |
---|
| 689 | +augment_tree_propagate_from(struct vmap_area *va) |
---|
| 690 | +{ |
---|
| 691 | + /* |
---|
| 692 | + * Populate the tree from bottom towards the root until |
---|
| 693 | + * the calculated maximum available size of checked node |
---|
| 694 | + * is equal to its current one. |
---|
| 695 | + */ |
---|
| 696 | + free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL); |
---|
| 697 | + |
---|
| 698 | +#if DEBUG_AUGMENT_PROPAGATE_CHECK |
---|
| 699 | + augment_tree_propagate_check(); |
---|
| 700 | +#endif |
---|
| 701 | +} |
---|
| 702 | + |
---|
| 703 | +static void |
---|
| 704 | +insert_vmap_area(struct vmap_area *va, |
---|
| 705 | + struct rb_root *root, struct list_head *head) |
---|
| 706 | +{ |
---|
| 707 | + struct rb_node **link; |
---|
| 708 | + struct rb_node *parent; |
---|
| 709 | + |
---|
| 710 | + link = find_va_links(va, root, NULL, &parent); |
---|
| 711 | + if (link) |
---|
| 712 | + link_va(va, root, parent, link, head); |
---|
| 713 | +} |
---|
| 714 | + |
---|
| 715 | +static void |
---|
| 716 | +insert_vmap_area_augment(struct vmap_area *va, |
---|
| 717 | + struct rb_node *from, struct rb_root *root, |
---|
| 718 | + struct list_head *head) |
---|
| 719 | +{ |
---|
| 720 | + struct rb_node **link; |
---|
| 721 | + struct rb_node *parent; |
---|
| 722 | + |
---|
| 723 | + if (from) |
---|
| 724 | + link = find_va_links(va, NULL, from, &parent); |
---|
| 725 | + else |
---|
| 726 | + link = find_va_links(va, root, NULL, &parent); |
---|
| 727 | + |
---|
| 728 | + if (link) { |
---|
| 729 | + link_va(va, root, parent, link, head); |
---|
| 730 | + augment_tree_propagate_from(va); |
---|
| 731 | + } |
---|
| 732 | +} |
---|
| 733 | + |
---|
| 734 | +/* |
---|
| 735 | + * Merge de-allocated chunk of VA memory with previous |
---|
| 736 | + * and next free blocks. If coalesce is not done a new |
---|
| 737 | + * free area is inserted. If VA has been merged, it is |
---|
| 738 | + * freed. |
---|
| 739 | + * |
---|
| 740 | + * Please note, it can return NULL in case of overlapping |
---|
| 741 | + * ranges, accompanied by a WARN() report. Although this is |
---|
| 742 | + * buggy behaviour, the system can stay alive and keep |
---|
| 743 | + * going. |
---|
| 744 | + */ |
---|
| 745 | +static __always_inline struct vmap_area * |
---|
| 746 | +merge_or_add_vmap_area(struct vmap_area *va, |
---|
| 747 | + struct rb_root *root, struct list_head *head) |
---|
| 748 | +{ |
---|
| 749 | + struct vmap_area *sibling; |
---|
| 750 | + struct list_head *next; |
---|
| 751 | + struct rb_node **link; |
---|
| 752 | + struct rb_node *parent; |
---|
| 753 | + bool merged = false; |
---|
| 754 | + |
---|
| 755 | + /* |
---|
| 756 | + * Find a place in the tree where VA potentially will be |
---|
| 757 | + * inserted, unless it is merged with its sibling/siblings. |
---|
| 758 | + */ |
---|
| 759 | + link = find_va_links(va, root, NULL, &parent); |
---|
| 760 | + if (!link) |
---|
| 761 | + return NULL; |
---|
| 762 | + |
---|
| 763 | + /* |
---|
| 764 | + * Get next node of VA to check if merging can be done. |
---|
| 765 | + */ |
---|
| 766 | + next = get_va_next_sibling(parent, link); |
---|
| 767 | + if (unlikely(next == NULL)) |
---|
| 768 | + goto insert; |
---|
| 769 | + |
---|
| 770 | + /* |
---|
| 771 | + * start end |
---|
| 772 | + * | | |
---|
| 773 | + * |<------VA------>|<-----Next----->| |
---|
| 774 | + * | | |
---|
| 775 | + * start end |
---|
| 776 | + */ |
---|
| 777 | + if (next != head) { |
---|
| 778 | + sibling = list_entry(next, struct vmap_area, list); |
---|
| 779 | + if (sibling->va_start == va->va_end) { |
---|
| 780 | + sibling->va_start = va->va_start; |
---|
| 781 | + |
---|
| 782 | + /* Free vmap_area object. */ |
---|
| 783 | + kmem_cache_free(vmap_area_cachep, va); |
---|
| 784 | + |
---|
| 785 | + /* Point to the new merged area. */ |
---|
| 786 | + va = sibling; |
---|
| 787 | + merged = true; |
---|
| 788 | + } |
---|
| 789 | + } |
---|
| 790 | + |
---|
| 791 | + /* |
---|
| 792 | + * start end |
---|
| 793 | + * | | |
---|
| 794 | + * |<-----Prev----->|<------VA------>| |
---|
| 795 | + * | | |
---|
| 796 | + * start end |
---|
| 797 | + */ |
---|
| 798 | + if (next->prev != head) { |
---|
| 799 | + sibling = list_entry(next->prev, struct vmap_area, list); |
---|
| 800 | + if (sibling->va_end == va->va_start) { |
---|
| 801 | + /* |
---|
| 802 | + * If both neighbors are coalesced, it is important |
---|
| 803 | + * to unlink the "next" node first, followed by merging |
---|
| 804 | + * with "previous" one. Otherwise the tree might not be |
---|
| 805 | + * fully populated if a sibling's augmented value is |
---|
| 806 | + * "normalized" because of rotation operations. |
---|
| 807 | + */ |
---|
| 808 | + if (merged) |
---|
| 809 | + unlink_va(va, root); |
---|
| 810 | + |
---|
| 811 | + sibling->va_end = va->va_end; |
---|
| 812 | + |
---|
| 813 | + /* Free vmap_area object. */ |
---|
| 814 | + kmem_cache_free(vmap_area_cachep, va); |
---|
| 815 | + |
---|
| 816 | + /* Point to the new merged area. */ |
---|
| 817 | + va = sibling; |
---|
| 818 | + merged = true; |
---|
| 819 | + } |
---|
| 820 | + } |
---|
| 821 | + |
---|
| 822 | +insert: |
---|
| 823 | + if (!merged) |
---|
| 824 | + link_va(va, root, parent, link, head); |
---|
| 825 | + |
---|
| 826 | + /* |
---|
| 827 | + * Last step is to check and update the tree. |
---|
| 828 | + */ |
---|
| 829 | + augment_tree_propagate_from(va); |
---|
| 830 | + return va; |
---|
| 831 | +} |
---|
| 832 | + |
---|
| 833 | +static __always_inline bool |
---|
| 834 | +is_within_this_va(struct vmap_area *va, unsigned long size, |
---|
| 835 | + unsigned long align, unsigned long vstart) |
---|
| 836 | +{ |
---|
| 837 | + unsigned long nva_start_addr; |
---|
| 838 | + |
---|
| 839 | + if (va->va_start > vstart) |
---|
| 840 | + nva_start_addr = ALIGN(va->va_start, align); |
---|
| 841 | + else |
---|
| 842 | + nva_start_addr = ALIGN(vstart, align); |
---|
| 843 | + |
---|
| 844 | + /* Can be overflowed due to big size or alignment. */ |
---|
| 845 | + if (nva_start_addr + size < nva_start_addr || |
---|
| 846 | + nva_start_addr < vstart) |
---|
| 847 | + return false; |
---|
| 848 | + |
---|
| 849 | + return (nva_start_addr + size <= va->va_end); |
---|
| 850 | +} |
---|
| 851 | + |
---|
| 852 | +/* |
---|
| 853 | + * Find the first free block(lowest start address) in the tree, |
---|
| 854 | + * that will accomplish the request corresponding to passing |
---|
| 855 | + * parameters. |
---|
| 856 | + */ |
---|
| 857 | +static __always_inline struct vmap_area * |
---|
| 858 | +find_vmap_lowest_match(unsigned long size, |
---|
| 859 | + unsigned long align, unsigned long vstart) |
---|
| 860 | +{ |
---|
| 861 | + struct vmap_area *va; |
---|
| 862 | + struct rb_node *node; |
---|
| 863 | + unsigned long length; |
---|
| 864 | + |
---|
| 865 | + /* Start from the root. */ |
---|
| 866 | + node = free_vmap_area_root.rb_node; |
---|
| 867 | + |
---|
| 868 | + /* Adjust the search size for alignment overhead. */ |
---|
| 869 | + length = size + align - 1; |
---|
| 870 | + |
---|
| 871 | + while (node) { |
---|
| 872 | + va = rb_entry(node, struct vmap_area, rb_node); |
---|
| 873 | + |
---|
| 874 | + if (get_subtree_max_size(node->rb_left) >= length && |
---|
| 875 | + vstart < va->va_start) { |
---|
| 876 | + node = node->rb_left; |
---|
| 877 | + } else { |
---|
| 878 | + if (is_within_this_va(va, size, align, vstart)) |
---|
| 879 | + return va; |
---|
| 880 | + |
---|
| 881 | + /* |
---|
| 882 | + * Does not make sense to go deeper towards the right |
---|
| 883 | + * sub-tree if it does not have a free block that is |
---|
| 884 | + * equal or bigger to the requested search length. |
---|
| 885 | + */ |
---|
| 886 | + if (get_subtree_max_size(node->rb_right) >= length) { |
---|
| 887 | + node = node->rb_right; |
---|
| 888 | + continue; |
---|
| 889 | + } |
---|
| 890 | + |
---|
| 891 | + /* |
---|
| 892 | + * OK. We roll back and find the first right sub-tree, |
---|
| 893 | + * that will satisfy the search criteria. It can happen |
---|
| 894 | + * only once due to "vstart" restriction. |
---|
| 895 | + */ |
---|
| 896 | + while ((node = rb_parent(node))) { |
---|
| 897 | + va = rb_entry(node, struct vmap_area, rb_node); |
---|
| 898 | + if (is_within_this_va(va, size, align, vstart)) |
---|
| 899 | + return va; |
---|
| 900 | + |
---|
| 901 | + if (get_subtree_max_size(node->rb_right) >= length && |
---|
| 902 | + vstart <= va->va_start) { |
---|
| 903 | + node = node->rb_right; |
---|
| 904 | + break; |
---|
| 905 | + } |
---|
| 906 | + } |
---|
| 907 | + } |
---|
| 908 | + } |
---|
| 909 | + |
---|
| 910 | + return NULL; |
---|
| 911 | +} |
---|
| 912 | + |
---|
| 913 | +#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK |
---|
| 914 | +#include <linux/random.h> |
---|
| 915 | + |
---|
| 916 | +static struct vmap_area * |
---|
| 917 | +find_vmap_lowest_linear_match(unsigned long size, |
---|
| 918 | + unsigned long align, unsigned long vstart) |
---|
| 919 | +{ |
---|
| 920 | + struct vmap_area *va; |
---|
| 921 | + |
---|
| 922 | + list_for_each_entry(va, &free_vmap_area_list, list) { |
---|
| 923 | + if (!is_within_this_va(va, size, align, vstart)) |
---|
| 924 | + continue; |
---|
| 925 | + |
---|
| 926 | + return va; |
---|
| 927 | + } |
---|
| 928 | + |
---|
| 929 | + return NULL; |
---|
| 930 | +} |
---|
| 931 | + |
---|
| 932 | +static void |
---|
| 933 | +find_vmap_lowest_match_check(unsigned long size) |
---|
| 934 | +{ |
---|
| 935 | + struct vmap_area *va_1, *va_2; |
---|
| 936 | + unsigned long vstart; |
---|
| 937 | + unsigned int rnd; |
---|
| 938 | + |
---|
| 939 | + get_random_bytes(&rnd, sizeof(rnd)); |
---|
| 940 | + vstart = VMALLOC_START + rnd; |
---|
| 941 | + |
---|
| 942 | + va_1 = find_vmap_lowest_match(size, 1, vstart); |
---|
| 943 | + va_2 = find_vmap_lowest_linear_match(size, 1, vstart); |
---|
| 944 | + |
---|
| 945 | + if (va_1 != va_2) |
---|
| 946 | + pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", |
---|
| 947 | + va_1, va_2, vstart); |
---|
| 948 | +} |
---|
| 949 | +#endif |
---|
| 950 | + |
---|
/*
 * How a requested range sits inside the free block it was carved from.
 */
enum fit_type {
	/* The request does not fit into the block at all. */
	NOTHING_FIT = 0,
	/* Full fit: the request covers the whole block. */
	FL_FIT_TYPE,
	/* Left edge fit: the request starts at the block start. */
	LE_FIT_TYPE,
	/* Right edge fit: the request ends at the block end. */
	RE_FIT_TYPE,
	/* No edge fit: the request touches neither edge. */
	NE_FIT_TYPE
};
---|
| 958 | + |
---|
| 959 | +static __always_inline enum fit_type |
---|
| 960 | +classify_va_fit_type(struct vmap_area *va, |
---|
| 961 | + unsigned long nva_start_addr, unsigned long size) |
---|
| 962 | +{ |
---|
| 963 | + enum fit_type type; |
---|
| 964 | + |
---|
| 965 | + /* Check if it is within VA. */ |
---|
| 966 | + if (nva_start_addr < va->va_start || |
---|
| 967 | + nva_start_addr + size > va->va_end) |
---|
| 968 | + return NOTHING_FIT; |
---|
| 969 | + |
---|
| 970 | + /* Now classify. */ |
---|
| 971 | + if (va->va_start == nva_start_addr) { |
---|
| 972 | + if (va->va_end == nva_start_addr + size) |
---|
| 973 | + type = FL_FIT_TYPE; |
---|
| 974 | + else |
---|
| 975 | + type = LE_FIT_TYPE; |
---|
| 976 | + } else if (va->va_end == nva_start_addr + size) { |
---|
| 977 | + type = RE_FIT_TYPE; |
---|
| 978 | + } else { |
---|
| 979 | + type = NE_FIT_TYPE; |
---|
| 980 | + } |
---|
| 981 | + |
---|
| 982 | + return type; |
---|
| 983 | +} |
---|
| 984 | + |
---|
| 985 | +static __always_inline int |
---|
| 986 | +adjust_va_to_fit_type(struct vmap_area *va, |
---|
| 987 | + unsigned long nva_start_addr, unsigned long size, |
---|
| 988 | + enum fit_type type) |
---|
| 989 | +{ |
---|
| 990 | + struct vmap_area *lva = NULL; |
---|
| 991 | + |
---|
| 992 | + if (type == FL_FIT_TYPE) { |
---|
| 993 | + /* |
---|
| 994 | + * No need to split VA, it fully fits. |
---|
| 995 | + * |
---|
| 996 | + * | | |
---|
| 997 | + * V NVA V |
---|
| 998 | + * |---------------| |
---|
| 999 | + */ |
---|
| 1000 | + unlink_va(va, &free_vmap_area_root); |
---|
| 1001 | + kmem_cache_free(vmap_area_cachep, va); |
---|
| 1002 | + } else if (type == LE_FIT_TYPE) { |
---|
| 1003 | + /* |
---|
| 1004 | + * Split left edge of fit VA. |
---|
| 1005 | + * |
---|
| 1006 | + * | | |
---|
| 1007 | + * V NVA V R |
---|
| 1008 | + * |-------|-------| |
---|
| 1009 | + */ |
---|
| 1010 | + va->va_start += size; |
---|
| 1011 | + } else if (type == RE_FIT_TYPE) { |
---|
| 1012 | + /* |
---|
| 1013 | + * Split right edge of fit VA. |
---|
| 1014 | + * |
---|
| 1015 | + * | | |
---|
| 1016 | + * L V NVA V |
---|
| 1017 | + * |-------|-------| |
---|
| 1018 | + */ |
---|
| 1019 | + va->va_end = nva_start_addr; |
---|
| 1020 | + } else if (type == NE_FIT_TYPE) { |
---|
| 1021 | + /* |
---|
| 1022 | + * Split no edge of fit VA. |
---|
| 1023 | + * |
---|
| 1024 | + * | | |
---|
| 1025 | + * L V NVA V R |
---|
| 1026 | + * |---|-------|---| |
---|
| 1027 | + */ |
---|
| 1028 | + lva = __this_cpu_xchg(ne_fit_preload_node, NULL); |
---|
| 1029 | + if (unlikely(!lva)) { |
---|
| 1030 | + /* |
---|
| 1031 | + * For percpu allocator we do not do any pre-allocation |
---|
| 1032 | + * and leave it as it is. The reason is it most likely |
---|
| 1033 | + * never ends up with NE_FIT_TYPE splitting. In case of |
---|
| 1034 | + * percpu allocations offsets and sizes are aligned to |
---|
| 1035 | + * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE |
---|
| 1036 | + * are its main fitting cases. |
---|
| 1037 | + * |
---|
| 1038 | + * There are a few exceptions though, as an example it is |
---|
| 1039 | + * a first allocation (early boot up) when we have "one" |
---|
| 1040 | + * big free space that has to be split. |
---|
| 1041 | + * |
---|
| 1042 | + * Also we can hit this path in case of regular "vmap" |
---|
| 1043 | + * allocations, if "this" current CPU was not preloaded. |
---|
| 1044 | + * See the comment in alloc_vmap_area() why. If so, then |
---|
| 1045 | + * GFP_NOWAIT is used instead to get an extra object for |
---|
| 1046 | + * split purpose. That is rare and most time does not |
---|
| 1047 | + * occur. |
---|
| 1048 | + * |
---|
| 1049 | + * What happens if an allocation gets failed. Basically, |
---|
| 1050 | + * an "overflow" path is triggered to purge lazily freed |
---|
| 1051 | + * areas to free some memory, then, the "retry" path is |
---|
| 1052 | + * triggered to repeat one more time. See more details |
---|
| 1053 | + * in alloc_vmap_area() function. |
---|
| 1054 | + */ |
---|
| 1055 | + lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); |
---|
| 1056 | + if (!lva) |
---|
| 1057 | + return -1; |
---|
| 1058 | + } |
---|
| 1059 | + |
---|
| 1060 | + /* |
---|
| 1061 | + * Build the remainder. |
---|
| 1062 | + */ |
---|
| 1063 | + lva->va_start = va->va_start; |
---|
| 1064 | + lva->va_end = nva_start_addr; |
---|
| 1065 | + |
---|
| 1066 | + /* |
---|
| 1067 | + * Shrink this VA to remaining size. |
---|
| 1068 | + */ |
---|
| 1069 | + va->va_start = nva_start_addr + size; |
---|
| 1070 | + } else { |
---|
| 1071 | + return -1; |
---|
| 1072 | + } |
---|
| 1073 | + |
---|
| 1074 | + if (type != FL_FIT_TYPE) { |
---|
| 1075 | + augment_tree_propagate_from(va); |
---|
| 1076 | + |
---|
| 1077 | + if (lva) /* type == NE_FIT_TYPE */ |
---|
| 1078 | + insert_vmap_area_augment(lva, &va->rb_node, |
---|
| 1079 | + &free_vmap_area_root, &free_vmap_area_list); |
---|
| 1080 | + } |
---|
| 1081 | + |
---|
| 1082 | + return 0; |
---|
| 1083 | +} |
---|
| 1084 | + |
---|
| 1085 | +/* |
---|
| 1086 | + * Returns a start address of the newly allocated area, if success. |
---|
| 1087 | + * Otherwise a vend is returned that indicates failure. |
---|
| 1088 | + */ |
---|
| 1089 | +static __always_inline unsigned long |
---|
| 1090 | +__alloc_vmap_area(unsigned long size, unsigned long align, |
---|
| 1091 | + unsigned long vstart, unsigned long vend) |
---|
| 1092 | +{ |
---|
| 1093 | + unsigned long nva_start_addr; |
---|
| 1094 | + struct vmap_area *va; |
---|
| 1095 | + enum fit_type type; |
---|
| 1096 | + int ret; |
---|
| 1097 | + |
---|
| 1098 | + va = find_vmap_lowest_match(size, align, vstart); |
---|
| 1099 | + if (unlikely(!va)) |
---|
| 1100 | + return vend; |
---|
| 1101 | + |
---|
| 1102 | + if (va->va_start > vstart) |
---|
| 1103 | + nva_start_addr = ALIGN(va->va_start, align); |
---|
| 1104 | + else |
---|
| 1105 | + nva_start_addr = ALIGN(vstart, align); |
---|
| 1106 | + |
---|
| 1107 | + /* Check the "vend" restriction. */ |
---|
| 1108 | + if (nva_start_addr + size > vend) |
---|
| 1109 | + return vend; |
---|
| 1110 | + |
---|
| 1111 | + /* Classify what we have found. */ |
---|
| 1112 | + type = classify_va_fit_type(va, nva_start_addr, size); |
---|
| 1113 | + if (WARN_ON_ONCE(type == NOTHING_FIT)) |
---|
| 1114 | + return vend; |
---|
| 1115 | + |
---|
| 1116 | + /* Update the free vmap_area. */ |
---|
| 1117 | + ret = adjust_va_to_fit_type(va, nva_start_addr, size, type); |
---|
| 1118 | + if (ret) |
---|
| 1119 | + return vend; |
---|
| 1120 | + |
---|
| 1121 | +#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK |
---|
| 1122 | + find_vmap_lowest_match_check(size); |
---|
| 1123 | +#endif |
---|
| 1124 | + |
---|
| 1125 | + return nva_start_addr; |
---|
| 1126 | +} |
---|
| 1127 | + |
---|
| 1128 | +/* |
---|
| 1129 | + * Free a region of KVA allocated by alloc_vmap_area |
---|
| 1130 | + */ |
---|
| 1131 | +static void free_vmap_area(struct vmap_area *va) |
---|
| 1132 | +{ |
---|
| 1133 | + /* |
---|
| 1134 | + * Remove from the busy tree/list. |
---|
| 1135 | + */ |
---|
| 1136 | + spin_lock(&vmap_area_lock); |
---|
| 1137 | + unlink_va(va, &vmap_area_root); |
---|
| 1138 | + spin_unlock(&vmap_area_lock); |
---|
| 1139 | + |
---|
| 1140 | + /* |
---|
| 1141 | + * Insert/Merge it back to the free tree/list. |
---|
| 1142 | + */ |
---|
| 1143 | + spin_lock(&free_vmap_area_lock); |
---|
| 1144 | + merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list); |
---|
| 1145 | + spin_unlock(&free_vmap_area_lock); |
---|
| 1146 | +} |
---|
405 | 1147 | |
---|
406 | 1148 | /* |
---|
407 | 1149 | * Allocate a region of KVA of the specified size and alignment, within the |
---|
.. | .. |
---|
412 | 1154 | unsigned long vstart, unsigned long vend, |
---|
413 | 1155 | int node, gfp_t gfp_mask) |
---|
414 | 1156 | { |
---|
415 | | - struct vmap_area *va; |
---|
416 | | - struct rb_node *n; |
---|
| 1157 | + struct vmap_area *va, *pva; |
---|
417 | 1158 | unsigned long addr; |
---|
418 | 1159 | int purged = 0; |
---|
419 | | - struct vmap_area *first; |
---|
| 1160 | + int ret; |
---|
420 | 1161 | |
---|
421 | 1162 | BUG_ON(!size); |
---|
422 | 1163 | BUG_ON(offset_in_page(size)); |
---|
423 | 1164 | BUG_ON(!is_power_of_2(align)); |
---|
424 | 1165 | |
---|
425 | | - might_sleep(); |
---|
| 1166 | + if (unlikely(!vmap_initialized)) |
---|
| 1167 | + return ERR_PTR(-EBUSY); |
---|
426 | 1168 | |
---|
427 | | - va = kmalloc_node(sizeof(struct vmap_area), |
---|
428 | | - gfp_mask & GFP_RECLAIM_MASK, node); |
---|
| 1169 | + might_sleep(); |
---|
| 1170 | + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; |
---|
| 1171 | + |
---|
| 1172 | + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); |
---|
429 | 1173 | if (unlikely(!va)) |
---|
430 | 1174 | return ERR_PTR(-ENOMEM); |
---|
431 | 1175 | |
---|
.. | .. |
---|
433 | 1177 | * Only scan the relevant parts containing pointers to other objects |
---|
434 | 1178 | * to avoid false negatives. |
---|
435 | 1179 | */ |
---|
436 | | - kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK); |
---|
| 1180 | + kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); |
---|
437 | 1181 | |
---|
438 | 1182 | retry: |
---|
439 | | - spin_lock(&vmap_area_lock); |
---|
440 | 1183 | /* |
---|
441 | | - * Invalidate cache if we have more permissive parameters. |
---|
442 | | - * cached_hole_size notes the largest hole noticed _below_ |
---|
443 | | - * the vmap_area cached in free_vmap_cache: if size fits |
---|
444 | | - * into that hole, we want to scan from vstart to reuse |
---|
445 | | - * the hole instead of allocating above free_vmap_cache. |
---|
446 | | - * Note that __free_vmap_area may update free_vmap_cache |
---|
447 | | - * without updating cached_hole_size or cached_align. |
---|
| 1184 | + * Preload this CPU with one extra vmap_area object. It is used |
---|
| 1185 | + * when fit type of free area is NE_FIT_TYPE. Please note, it |
---|
| 1186 | + * does not guarantee that an allocation occurs on a CPU that |
---|
| 1187 | + * is preloaded, instead we minimize the case when it is not. |
---|
| 1188 | + * It can happen because of cpu migration, because there is a |
---|
| 1189 | + * race until the below spinlock is taken. |
---|
| 1190 | + * |
---|
| 1191 | + * The preload is done in non-atomic context, thus it allows us |
---|
| 1192 | + * to use more permissive allocation masks to be more stable under |
---|
| 1193 | + * low memory condition and high memory pressure. In rare case, |
---|
| 1194 | + * if not preloaded, GFP_NOWAIT is used. |
---|
| 1195 | + * |
---|
| 1196 | + * Set "pva" to NULL here, because of "retry" path. |
---|
448 | 1197 | */ |
---|
449 | | - if (!free_vmap_cache || |
---|
450 | | - size < cached_hole_size || |
---|
451 | | - vstart < cached_vstart || |
---|
452 | | - align < cached_align) { |
---|
453 | | -nocache: |
---|
454 | | - cached_hole_size = 0; |
---|
455 | | - free_vmap_cache = NULL; |
---|
456 | | - } |
---|
457 | | - /* record if we encounter less permissive parameters */ |
---|
458 | | - cached_vstart = vstart; |
---|
459 | | - cached_align = align; |
---|
| 1198 | + pva = NULL; |
---|
460 | 1199 | |
---|
461 | | - /* find starting point for our search */ |
---|
462 | | - if (free_vmap_cache) { |
---|
463 | | - first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); |
---|
464 | | - addr = ALIGN(first->va_end, align); |
---|
465 | | - if (addr < vstart) |
---|
466 | | - goto nocache; |
---|
467 | | - if (addr + size < addr) |
---|
468 | | - goto overflow; |
---|
| 1200 | + if (!this_cpu_read(ne_fit_preload_node)) |
---|
| 1201 | + /* |
---|
| 1202 | + * Even if it fails we do not really care about that. |
---|
| 1203 | + * Just proceed as it is. If needed "overflow" path |
---|
| 1204 | + * will refill the cache we allocate from. |
---|
| 1205 | + */ |
---|
| 1206 | + pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); |
---|
469 | 1207 | |
---|
470 | | - } else { |
---|
471 | | - addr = ALIGN(vstart, align); |
---|
472 | | - if (addr + size < addr) |
---|
473 | | - goto overflow; |
---|
| 1208 | + spin_lock(&free_vmap_area_lock); |
---|
474 | 1209 | |
---|
475 | | - n = vmap_area_root.rb_node; |
---|
476 | | - first = NULL; |
---|
| 1210 | + if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) |
---|
| 1211 | + kmem_cache_free(vmap_area_cachep, pva); |
---|
477 | 1212 | |
---|
478 | | - while (n) { |
---|
479 | | - struct vmap_area *tmp; |
---|
480 | | - tmp = rb_entry(n, struct vmap_area, rb_node); |
---|
481 | | - if (tmp->va_end >= addr) { |
---|
482 | | - first = tmp; |
---|
483 | | - if (tmp->va_start <= addr) |
---|
484 | | - break; |
---|
485 | | - n = n->rb_left; |
---|
486 | | - } else |
---|
487 | | - n = n->rb_right; |
---|
488 | | - } |
---|
489 | | - |
---|
490 | | - if (!first) |
---|
491 | | - goto found; |
---|
492 | | - } |
---|
493 | | - |
---|
494 | | - /* from the starting point, walk areas until a suitable hole is found */ |
---|
495 | | - while (addr + size > first->va_start && addr + size <= vend) { |
---|
496 | | - if (addr + cached_hole_size < first->va_start) |
---|
497 | | - cached_hole_size = first->va_start - addr; |
---|
498 | | - addr = ALIGN(first->va_end, align); |
---|
499 | | - if (addr + size < addr) |
---|
500 | | - goto overflow; |
---|
501 | | - |
---|
502 | | - if (list_is_last(&first->list, &vmap_area_list)) |
---|
503 | | - goto found; |
---|
504 | | - |
---|
505 | | - first = list_next_entry(first, list); |
---|
506 | | - } |
---|
507 | | - |
---|
508 | | -found: |
---|
509 | 1213 | /* |
---|
510 | | - * Check also calculated address against the vstart, |
---|
511 | | - * because it can be 0 because of big align request. |
---|
| 1214 | + * If an allocation fails, the "vend" address is |
---|
| 1215 | + * returned. Therefore trigger the overflow path. |
---|
512 | 1216 | */ |
---|
513 | | - if (addr + size > vend || addr < vstart) |
---|
| 1217 | + addr = __alloc_vmap_area(size, align, vstart, vend); |
---|
| 1218 | + spin_unlock(&free_vmap_area_lock); |
---|
| 1219 | + |
---|
| 1220 | + if (unlikely(addr == vend)) |
---|
514 | 1221 | goto overflow; |
---|
515 | 1222 | |
---|
516 | 1223 | va->va_start = addr; |
---|
517 | 1224 | va->va_end = addr + size; |
---|
518 | | - va->flags = 0; |
---|
519 | | - __insert_vmap_area(va); |
---|
520 | | - free_vmap_cache = &va->rb_node; |
---|
| 1225 | + va->vm = NULL; |
---|
| 1226 | + |
---|
| 1227 | + |
---|
| 1228 | + spin_lock(&vmap_area_lock); |
---|
| 1229 | + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); |
---|
521 | 1230 | spin_unlock(&vmap_area_lock); |
---|
522 | 1231 | |
---|
523 | 1232 | BUG_ON(!IS_ALIGNED(va->va_start, align)); |
---|
524 | 1233 | BUG_ON(va->va_start < vstart); |
---|
525 | 1234 | BUG_ON(va->va_end > vend); |
---|
526 | 1235 | |
---|
| 1236 | + ret = kasan_populate_vmalloc(addr, size); |
---|
| 1237 | + if (ret) { |
---|
| 1238 | + free_vmap_area(va); |
---|
| 1239 | + return ERR_PTR(ret); |
---|
| 1240 | + } |
---|
| 1241 | + |
---|
527 | 1242 | return va; |
---|
528 | 1243 | |
---|
529 | 1244 | overflow: |
---|
530 | | - spin_unlock(&vmap_area_lock); |
---|
531 | 1245 | if (!purged) { |
---|
532 | 1246 | purge_vmap_area_lazy(); |
---|
533 | 1247 | purged = 1; |
---|
.. | .. |
---|
546 | 1260 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) |
---|
547 | 1261 | pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", |
---|
548 | 1262 | size); |
---|
549 | | - kfree(va); |
---|
| 1263 | + |
---|
| 1264 | + kmem_cache_free(vmap_area_cachep, va); |
---|
550 | 1265 | return ERR_PTR(-EBUSY); |
---|
551 | 1266 | } |
---|
552 | 1267 | |
---|
.. | .. |
---|
562 | 1277 | } |
---|
563 | 1278 | EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); |
---|
564 | 1279 | |
---|
565 | | -static void __free_vmap_area(struct vmap_area *va) |
---|
566 | | -{ |
---|
567 | | - BUG_ON(RB_EMPTY_NODE(&va->rb_node)); |
---|
568 | | - |
---|
569 | | - if (free_vmap_cache) { |
---|
570 | | - if (va->va_end < cached_vstart) { |
---|
571 | | - free_vmap_cache = NULL; |
---|
572 | | - } else { |
---|
573 | | - struct vmap_area *cache; |
---|
574 | | - cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node); |
---|
575 | | - if (va->va_start <= cache->va_start) { |
---|
576 | | - free_vmap_cache = rb_prev(&va->rb_node); |
---|
577 | | - /* |
---|
578 | | - * We don't try to update cached_hole_size or |
---|
579 | | - * cached_align, but it won't go very wrong. |
---|
580 | | - */ |
---|
581 | | - } |
---|
582 | | - } |
---|
583 | | - } |
---|
584 | | - rb_erase(&va->rb_node, &vmap_area_root); |
---|
585 | | - RB_CLEAR_NODE(&va->rb_node); |
---|
586 | | - list_del_rcu(&va->list); |
---|
587 | | - |
---|
588 | | - /* |
---|
589 | | - * Track the highest possible candidate for pcpu area |
---|
590 | | - * allocation. Areas outside of vmalloc area can be returned |
---|
591 | | - * here too, consider only end addresses which fall inside |
---|
592 | | - * vmalloc area proper. |
---|
593 | | - */ |
---|
594 | | - if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) |
---|
595 | | - vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); |
---|
596 | | - |
---|
597 | | - kfree_rcu(va, rcu_head); |
---|
598 | | -} |
---|
599 | | - |
---|
600 | | -/* |
---|
601 | | - * Free a region of KVA allocated by alloc_vmap_area |
---|
602 | | - */ |
---|
603 | | -static void free_vmap_area(struct vmap_area *va) |
---|
604 | | -{ |
---|
605 | | - spin_lock(&vmap_area_lock); |
---|
606 | | - __free_vmap_area(va); |
---|
607 | | - spin_unlock(&vmap_area_lock); |
---|
608 | | -} |
---|
609 | | - |
---|
610 | | -/* |
---|
611 | | - * Clear the pagetable entries of a given vmap_area |
---|
612 | | - */ |
---|
613 | | -static void unmap_vmap_area(struct vmap_area *va) |
---|
614 | | -{ |
---|
615 | | - vunmap_page_range(va->va_start, va->va_end); |
---|
616 | | -} |
---|
617 | | - |
---|
| 1280 | +bool lazy_vunmap_enable __read_mostly = true; |
---|
618 | 1281 | /* |
---|
619 | 1282 | * lazy_max_pages is the maximum amount of virtual address space we gather up |
---|
620 | 1283 | * before attempting to purge with a TLB flush. |
---|
.. | .. |
---|
635 | 1298 | { |
---|
636 | 1299 | unsigned int log; |
---|
637 | 1300 | |
---|
| 1301 | + if (!lazy_vunmap_enable) |
---|
| 1302 | + return 0; |
---|
| 1303 | + |
---|
638 | 1304 | log = fls(num_online_cpus()); |
---|
639 | 1305 | |
---|
640 | 1306 | return log * (32UL * 1024 * 1024 / PAGE_SIZE); |
---|
641 | 1307 | } |
---|
642 | 1308 | |
---|
643 | | -static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); |
---|
| 1309 | +static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); |
---|
644 | 1310 | |
---|
645 | 1311 | /* |
---|
646 | 1312 | * Serialize vmap purging. There is no actual critical section protected |
---|
.. | .. |
---|
658 | 1324 | */ |
---|
659 | 1325 | void set_iounmap_nonlazy(void) |
---|
660 | 1326 | { |
---|
661 | | - atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); |
---|
| 1327 | + atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1); |
---|
662 | 1328 | } |
---|
663 | 1329 | |
---|
664 | 1330 | /* |
---|
.. | .. |
---|
666 | 1332 | */ |
---|
667 | 1333 | static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) |
---|
668 | 1334 | { |
---|
| 1335 | + unsigned long resched_threshold; |
---|
669 | 1336 | struct llist_node *valist; |
---|
670 | 1337 | struct vmap_area *va; |
---|
671 | 1338 | struct vmap_area *n_va; |
---|
672 | | - bool do_free = false; |
---|
673 | 1339 | |
---|
674 | 1340 | lockdep_assert_held(&vmap_purge_lock); |
---|
675 | 1341 | |
---|
676 | 1342 | valist = llist_del_all(&vmap_purge_list); |
---|
| 1343 | + if (unlikely(valist == NULL)) |
---|
| 1344 | + return false; |
---|
| 1345 | + |
---|
| 1346 | + /* |
---|
| 1347 | + * TODO: to calculate a flush range without looping. |
---|
| 1348 | + * The list can be up to lazy_max_pages() elements. |
---|
| 1349 | + */ |
---|
677 | 1350 | llist_for_each_entry(va, valist, purge_list) { |
---|
678 | 1351 | if (va->va_start < start) |
---|
679 | 1352 | start = va->va_start; |
---|
680 | 1353 | if (va->va_end > end) |
---|
681 | 1354 | end = va->va_end; |
---|
682 | | - do_free = true; |
---|
683 | 1355 | } |
---|
684 | | - |
---|
685 | | - if (!do_free) |
---|
686 | | - return false; |
---|
687 | 1356 | |
---|
688 | 1357 | flush_tlb_kernel_range(start, end); |
---|
| 1358 | + resched_threshold = lazy_max_pages() << 1; |
---|
689 | 1359 | |
---|
690 | | - spin_lock(&vmap_area_lock); |
---|
| 1360 | + spin_lock(&free_vmap_area_lock); |
---|
691 | 1361 | llist_for_each_entry_safe(va, n_va, valist, purge_list) { |
---|
692 | | - int nr = (va->va_end - va->va_start) >> PAGE_SHIFT; |
---|
| 1362 | + unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; |
---|
| 1363 | + unsigned long orig_start = va->va_start; |
---|
| 1364 | + unsigned long orig_end = va->va_end; |
---|
693 | 1365 | |
---|
694 | | - __free_vmap_area(va); |
---|
695 | | - atomic_sub(nr, &vmap_lazy_nr); |
---|
696 | | - cond_resched_lock(&vmap_area_lock); |
---|
| 1366 | + /* |
---|
| 1367 | + * Finally insert or merge lazily-freed area. It is |
---|
| 1368 | + * detached and there is no need to "unlink" it from |
---|
| 1369 | + * anything. |
---|
| 1370 | + */ |
---|
| 1371 | + va = merge_or_add_vmap_area(va, &free_vmap_area_root, |
---|
| 1372 | + &free_vmap_area_list); |
---|
| 1373 | + |
---|
| 1374 | + if (!va) |
---|
| 1375 | + continue; |
---|
| 1376 | + |
---|
| 1377 | + if (is_vmalloc_or_module_addr((void *)orig_start)) |
---|
| 1378 | + kasan_release_vmalloc(orig_start, orig_end, |
---|
| 1379 | + va->va_start, va->va_end); |
---|
| 1380 | + |
---|
| 1381 | + atomic_long_sub(nr, &vmap_lazy_nr); |
---|
| 1382 | + |
---|
| 1383 | + if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) |
---|
| 1384 | + cond_resched_lock(&free_vmap_area_lock); |
---|
697 | 1385 | } |
---|
698 | | - spin_unlock(&vmap_area_lock); |
---|
| 1386 | + spin_unlock(&free_vmap_area_lock); |
---|
699 | 1387 | return true; |
---|
700 | 1388 | } |
---|
701 | 1389 | |
---|
.. | .. |
---|
729 | 1417 | */ |
---|
730 | 1418 | static void free_vmap_area_noflush(struct vmap_area *va) |
---|
731 | 1419 | { |
---|
732 | | - int nr_lazy; |
---|
| 1420 | + unsigned long nr_lazy; |
---|
733 | 1421 | |
---|
734 | | - nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, |
---|
735 | | - &vmap_lazy_nr); |
---|
| 1422 | + spin_lock(&vmap_area_lock); |
---|
| 1423 | + unlink_va(va, &vmap_area_root); |
---|
| 1424 | + spin_unlock(&vmap_area_lock); |
---|
| 1425 | + |
---|
| 1426 | + nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> |
---|
| 1427 | + PAGE_SHIFT, &vmap_lazy_nr); |
---|
736 | 1428 | |
---|
737 | 1429 | /* After this point, we may free va at any time */ |
---|
738 | 1430 | llist_add(&va->purge_list, &vmap_purge_list); |
---|
.. | .. |
---|
747 | 1439 | static void free_unmap_vmap_area(struct vmap_area *va) |
---|
748 | 1440 | { |
---|
749 | 1441 | flush_cache_vunmap(va->va_start, va->va_end); |
---|
750 | | - unmap_vmap_area(va); |
---|
751 | | - if (debug_pagealloc_enabled()) |
---|
| 1442 | + unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); |
---|
| 1443 | + if (debug_pagealloc_enabled_static()) |
---|
752 | 1444 | flush_tlb_kernel_range(va->va_start, va->va_end); |
---|
753 | 1445 | |
---|
754 | 1446 | free_vmap_area_noflush(va); |
---|
.. | .. |
---|
795 | 1487 | |
---|
796 | 1488 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) |
---|
797 | 1489 | |
---|
798 | | -static bool vmap_initialized __read_mostly = false; |
---|
799 | | - |
---|
800 | 1490 | struct vmap_block_queue { |
---|
801 | 1491 | spinlock_t lock; |
---|
802 | 1492 | struct list_head free; |
---|
.. | .. |
---|
816 | 1506 | static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); |
---|
817 | 1507 | |
---|
818 | 1508 | /* |
---|
819 | | - * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block |
---|
| 1509 | + * XArray of vmap blocks, indexed by address, to quickly find a vmap block |
---|
820 | 1510 | * in the free path. Could get rid of this if we change the API to return a |
---|
821 | 1511 | * "cookie" from alloc, to be passed to free. But no big deal yet. |
---|
822 | 1512 | */ |
---|
823 | | -static DEFINE_SPINLOCK(vmap_block_tree_lock); |
---|
824 | | -static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); |
---|
| 1513 | +static DEFINE_XARRAY(vmap_blocks); |
---|
825 | 1514 | |
---|
826 | 1515 | /* |
---|
827 | 1516 | * We should probably have a fallback mechanism to allocate virtual memory |
---|
.. | .. |
---|
852 | 1541 | * @order: how many 2^order pages should be occupied in newly allocated block |
---|
853 | 1542 | * @gfp_mask: flags for the page level allocator |
---|
854 | 1543 | * |
---|
855 | | - * Returns: virtual address in a newly allocated block or ERR_PTR(-errno) |
---|
| 1544 | + * Return: virtual address in a newly allocated block or ERR_PTR(-errno) |
---|
856 | 1545 | */ |
---|
857 | 1546 | static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) |
---|
858 | 1547 | { |
---|
.. | .. |
---|
878 | 1567 | return ERR_CAST(va); |
---|
879 | 1568 | } |
---|
880 | 1569 | |
---|
881 | | - err = radix_tree_preload(gfp_mask); |
---|
882 | | - if (unlikely(err)) { |
---|
883 | | - kfree(vb); |
---|
884 | | - free_vmap_area(va); |
---|
885 | | - return ERR_PTR(err); |
---|
886 | | - } |
---|
887 | | - |
---|
888 | 1570 | vaddr = vmap_block_vaddr(va->va_start, 0); |
---|
889 | 1571 | spin_lock_init(&vb->lock); |
---|
890 | 1572 | vb->va = va; |
---|
.. | .. |
---|
897 | 1579 | INIT_LIST_HEAD(&vb->free_list); |
---|
898 | 1580 | |
---|
899 | 1581 | vb_idx = addr_to_vb_idx(va->va_start); |
---|
900 | | - spin_lock(&vmap_block_tree_lock); |
---|
901 | | - err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); |
---|
902 | | - spin_unlock(&vmap_block_tree_lock); |
---|
903 | | - BUG_ON(err); |
---|
904 | | - radix_tree_preload_end(); |
---|
| 1582 | + err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask); |
---|
| 1583 | + if (err) { |
---|
| 1584 | + kfree(vb); |
---|
| 1585 | + free_vmap_area(va); |
---|
| 1586 | + return ERR_PTR(err); |
---|
| 1587 | + } |
---|
905 | 1588 | |
---|
906 | 1589 | cpu = get_cpu_light(); |
---|
907 | 1590 | vbq = this_cpu_ptr(&vmap_block_queue); |
---|
.. | .. |
---|
916 | 1599 | static void free_vmap_block(struct vmap_block *vb) |
---|
917 | 1600 | { |
---|
918 | 1601 | struct vmap_block *tmp; |
---|
919 | | - unsigned long vb_idx; |
---|
920 | 1602 | |
---|
921 | | - vb_idx = addr_to_vb_idx(vb->va->va_start); |
---|
922 | | - spin_lock(&vmap_block_tree_lock); |
---|
923 | | - tmp = radix_tree_delete(&vmap_block_tree, vb_idx); |
---|
924 | | - spin_unlock(&vmap_block_tree_lock); |
---|
| 1603 | + tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start)); |
---|
925 | 1604 | BUG_ON(tmp != vb); |
---|
926 | 1605 | |
---|
927 | 1606 | free_vmap_area_noflush(vb->va); |
---|
.. | .. |
---|
1026 | 1705 | return vaddr; |
---|
1027 | 1706 | } |
---|
1028 | 1707 | |
---|
1029 | | -static void vb_free(const void *addr, unsigned long size) |
---|
| 1708 | +static void vb_free(unsigned long addr, unsigned long size) |
---|
1030 | 1709 | { |
---|
1031 | 1710 | unsigned long offset; |
---|
1032 | | - unsigned long vb_idx; |
---|
1033 | 1711 | unsigned int order; |
---|
1034 | 1712 | struct vmap_block *vb; |
---|
1035 | 1713 | |
---|
1036 | 1714 | BUG_ON(offset_in_page(size)); |
---|
1037 | 1715 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); |
---|
1038 | 1716 | |
---|
1039 | | - flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); |
---|
| 1717 | + flush_cache_vunmap(addr, addr + size); |
---|
1040 | 1718 | |
---|
1041 | 1719 | order = get_order(size); |
---|
| 1720 | + offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; |
---|
| 1721 | + vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); |
---|
1042 | 1722 | |
---|
1043 | | - offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); |
---|
1044 | | - offset >>= PAGE_SHIFT; |
---|
| 1723 | + unmap_kernel_range_noflush(addr, size); |
---|
1045 | 1724 | |
---|
1046 | | - vb_idx = addr_to_vb_idx((unsigned long)addr); |
---|
1047 | | - rcu_read_lock(); |
---|
1048 | | - vb = radix_tree_lookup(&vmap_block_tree, vb_idx); |
---|
1049 | | - rcu_read_unlock(); |
---|
1050 | | - BUG_ON(!vb); |
---|
1051 | | - |
---|
1052 | | - vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); |
---|
1053 | | - |
---|
1054 | | - if (debug_pagealloc_enabled()) |
---|
1055 | | - flush_tlb_kernel_range((unsigned long)addr, |
---|
1056 | | - (unsigned long)addr + size); |
---|
| 1725 | + if (debug_pagealloc_enabled_static()) |
---|
| 1726 | + flush_tlb_kernel_range(addr, addr + size); |
---|
1057 | 1727 | |
---|
1058 | 1728 | spin_lock(&vb->lock); |
---|
1059 | 1729 | |
---|
.. | .. |
---|
1070 | 1740 | spin_unlock(&vb->lock); |
---|
1071 | 1741 | } |
---|
1072 | 1742 | |
---|
1073 | | -/** |
---|
1074 | | - * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer |
---|
1075 | | - * |
---|
1076 | | - * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily |
---|
1077 | | - * to amortize TLB flushing overheads. What this means is that any page you |
---|
1078 | | - * have now, may, in a former life, have been mapped into kernel virtual |
---|
1079 | | - * address by the vmap layer and so there might be some CPUs with TLB entries |
---|
1080 | | - * still referencing that page (additional to the regular 1:1 kernel mapping). |
---|
1081 | | - * |
---|
1082 | | - * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can |
---|
1083 | | - * be sure that none of the pages we have control over will have any aliases |
---|
1084 | | - * from the vmap layer. |
---|
1085 | | - */ |
---|
1086 | | -void vm_unmap_aliases(void) |
---|
| 1743 | +static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) |
---|
1087 | 1744 | { |
---|
1088 | | - unsigned long start = ULONG_MAX, end = 0; |
---|
1089 | 1745 | int cpu; |
---|
1090 | | - int flush = 0; |
---|
1091 | 1746 | |
---|
1092 | 1747 | if (unlikely(!vmap_initialized)) |
---|
1093 | 1748 | return; |
---|
.. | .. |
---|
1101 | 1756 | rcu_read_lock(); |
---|
1102 | 1757 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { |
---|
1103 | 1758 | spin_lock(&vb->lock); |
---|
1104 | | - if (vb->dirty) { |
---|
| 1759 | + if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) { |
---|
1105 | 1760 | unsigned long va_start = vb->va->va_start; |
---|
1106 | 1761 | unsigned long s, e; |
---|
1107 | 1762 | |
---|
.. | .. |
---|
1124 | 1779 | flush_tlb_kernel_range(start, end); |
---|
1125 | 1780 | mutex_unlock(&vmap_purge_lock); |
---|
1126 | 1781 | } |
---|
| 1782 | + |
---|
| 1783 | +/** |
---|
| 1784 | + * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer |
---|
| 1785 | + * |
---|
| 1786 | + * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily |
---|
| 1787 | + * to amortize TLB flushing overheads. What this means is that any page you |
---|
| 1788 | + * have now, may, in a former life, have been mapped into kernel virtual |
---|
| 1789 | + * address by the vmap layer and so there might be some CPUs with TLB entries |
---|
| 1790 | + * still referencing that page (additional to the regular 1:1 kernel mapping). |
---|
| 1791 | + * |
---|
| 1792 | + * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can |
---|
| 1793 | + * be sure that none of the pages we have control over will have any aliases |
---|
| 1794 | + * from the vmap layer. |
---|
| 1795 | + */ |
---|
| 1796 | +void vm_unmap_aliases(void) |
---|
| 1797 | +{ |
---|
| 1798 | + unsigned long start = ULONG_MAX, end = 0; |
---|
| 1799 | + int flush = 0; |
---|
| 1800 | + |
---|
| 1801 | + _vm_unmap_aliases(start, end, flush); |
---|
| 1802 | +} |
---|
1127 | 1803 | EXPORT_SYMBOL_GPL(vm_unmap_aliases); |
---|
1128 | 1804 | |
---|
1129 | 1805 | /** |
---|
.. | .. |
---|
1143 | 1819 | BUG_ON(addr > VMALLOC_END); |
---|
1144 | 1820 | BUG_ON(!PAGE_ALIGNED(addr)); |
---|
1145 | 1821 | |
---|
| 1822 | + kasan_poison_vmalloc(mem, size); |
---|
| 1823 | + |
---|
1146 | 1824 | if (likely(count <= VMAP_MAX_ALLOC)) { |
---|
1147 | 1825 | debug_check_no_locks_freed(mem, size); |
---|
1148 | | - vb_free(mem, size); |
---|
| 1826 | + vb_free(addr, size); |
---|
1149 | 1827 | return; |
---|
1150 | 1828 | } |
---|
1151 | 1829 | |
---|
.. | .. |
---|
1162 | 1840 | * @pages: an array of pointers to the pages to be mapped |
---|
1163 | 1841 | * @count: number of pages |
---|
1164 | 1842 | * @node: prefer to allocate data structures on this node |
---|
1165 | | - * @prot: memory protection to use. PAGE_KERNEL for regular RAM |
---|
1166 | 1843 | * |
---|
1167 | 1844 | * If you use this function for less than VMAP_MAX_ALLOC pages, it could be |
---|
1168 | 1845 | * faster than vmap so it's good. But if you mix long-life and short-life |
---|
.. | .. |
---|
1172 | 1849 | * |
---|
1173 | 1850 | * Returns: a pointer to the address that has been mapped, or %NULL on failure |
---|
1174 | 1851 | */ |
---|
1175 | | -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) |
---|
| 1852 | +void *vm_map_ram(struct page **pages, unsigned int count, int node) |
---|
1176 | 1853 | { |
---|
1177 | 1854 | unsigned long size = (unsigned long)count << PAGE_SHIFT; |
---|
1178 | 1855 | unsigned long addr; |
---|
.. | .. |
---|
1193 | 1870 | addr = va->va_start; |
---|
1194 | 1871 | mem = (void *)addr; |
---|
1195 | 1872 | } |
---|
1196 | | - if (vmap_page_range(addr, addr + size, prot, pages) < 0) { |
---|
| 1873 | + |
---|
| 1874 | + kasan_unpoison_vmalloc(mem, size); |
---|
| 1875 | + |
---|
| 1876 | + if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) { |
---|
1197 | 1877 | vm_unmap_ram(mem, count); |
---|
1198 | 1878 | return NULL; |
---|
1199 | 1879 | } |
---|
.. | .. |
---|
1202 | 1882 | EXPORT_SYMBOL(vm_map_ram); |
---|
1203 | 1883 | |
---|
1204 | 1884 | static struct vm_struct *vmlist __initdata; |
---|
| 1885 | + |
---|
1205 | 1886 | /** |
---|
1206 | 1887 | * vm_area_add_early - add vmap area early during boot |
---|
1207 | 1888 | * @vm: vm_struct to add |
---|
.. | .. |
---|
1253 | 1934 | vm_area_add_early(vm); |
---|
1254 | 1935 | } |
---|
1255 | 1936 | |
---|
| 1937 | +static void vmap_init_free_space(void) |
---|
| 1938 | +{ |
---|
| 1939 | + unsigned long vmap_start = 1; |
---|
| 1940 | + const unsigned long vmap_end = ULONG_MAX; |
---|
| 1941 | + struct vmap_area *busy, *free; |
---|
| 1942 | + |
---|
| 1943 | + /* |
---|
| 1944 | + * B F B B B F |
---|
| 1945 | + * -|-----|.....|-----|-----|-----|.....|- |
---|
| 1946 | + * | The KVA space | |
---|
| 1947 | + * |<--------------------------------->| |
---|
| 1948 | + */ |
---|
| 1949 | + list_for_each_entry(busy, &vmap_area_list, list) { |
---|
| 1950 | + if (busy->va_start - vmap_start > 0) { |
---|
| 1951 | + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); |
---|
| 1952 | + if (!WARN_ON_ONCE(!free)) { |
---|
| 1953 | + free->va_start = vmap_start; |
---|
| 1954 | + free->va_end = busy->va_start; |
---|
| 1955 | + |
---|
| 1956 | + insert_vmap_area_augment(free, NULL, |
---|
| 1957 | + &free_vmap_area_root, |
---|
| 1958 | + &free_vmap_area_list); |
---|
| 1959 | + } |
---|
| 1960 | + } |
---|
| 1961 | + |
---|
| 1962 | + vmap_start = busy->va_end; |
---|
| 1963 | + } |
---|
| 1964 | + |
---|
| 1965 | + if (vmap_end - vmap_start > 0) { |
---|
| 1966 | + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); |
---|
| 1967 | + if (!WARN_ON_ONCE(!free)) { |
---|
| 1968 | + free->va_start = vmap_start; |
---|
| 1969 | + free->va_end = vmap_end; |
---|
| 1970 | + |
---|
| 1971 | + insert_vmap_area_augment(free, NULL, |
---|
| 1972 | + &free_vmap_area_root, |
---|
| 1973 | + &free_vmap_area_list); |
---|
| 1974 | + } |
---|
| 1975 | + } |
---|
| 1976 | +} |
---|
| 1977 | + |
---|
1256 | 1978 | void __init vmalloc_init(void) |
---|
1257 | 1979 | { |
---|
1258 | 1980 | struct vmap_area *va; |
---|
1259 | 1981 | struct vm_struct *tmp; |
---|
1260 | 1982 | int i; |
---|
| 1983 | + |
---|
| 1984 | + /* |
---|
| 1985 | + * Create the cache for vmap_area objects. |
---|
| 1986 | + */ |
---|
| 1987 | + vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); |
---|
1261 | 1988 | |
---|
1262 | 1989 | for_each_possible_cpu(i) { |
---|
1263 | 1990 | struct vmap_block_queue *vbq; |
---|
.. | .. |
---|
1273 | 2000 | |
---|
1274 | 2001 | /* Import existing vmlist entries. */ |
---|
1275 | 2002 | for (tmp = vmlist; tmp; tmp = tmp->next) { |
---|
1276 | | - va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); |
---|
1277 | | - va->flags = VM_VM_AREA; |
---|
| 2003 | + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); |
---|
| 2004 | + if (WARN_ON_ONCE(!va)) |
---|
| 2005 | + continue; |
---|
| 2006 | + |
---|
1278 | 2007 | va->va_start = (unsigned long)tmp->addr; |
---|
1279 | 2008 | va->va_end = va->va_start + tmp->size; |
---|
1280 | 2009 | va->vm = tmp; |
---|
1281 | | - __insert_vmap_area(va); |
---|
| 2010 | + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); |
---|
1282 | 2011 | } |
---|
1283 | 2012 | |
---|
1284 | | - vmap_area_pcpu_hole = VMALLOC_END; |
---|
1285 | | - |
---|
| 2013 | + /* |
---|
| 2014 | + * Now we can initialize a free vmap space. |
---|
| 2015 | + */ |
---|
| 2016 | + vmap_init_free_space(); |
---|
1286 | 2017 | vmap_initialized = true; |
---|
1287 | 2018 | } |
---|
1288 | | - |
---|
1289 | | -/** |
---|
1290 | | - * map_kernel_range_noflush - map kernel VM area with the specified pages |
---|
1291 | | - * @addr: start of the VM area to map |
---|
1292 | | - * @size: size of the VM area to map |
---|
1293 | | - * @prot: page protection flags to use |
---|
1294 | | - * @pages: pages to map |
---|
1295 | | - * |
---|
1296 | | - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size |
---|
1297 | | - * specify should have been allocated using get_vm_area() and its |
---|
1298 | | - * friends. |
---|
1299 | | - * |
---|
1300 | | - * NOTE: |
---|
1301 | | - * This function does NOT do any cache flushing. The caller is |
---|
1302 | | - * responsible for calling flush_cache_vmap() on to-be-mapped areas |
---|
1303 | | - * before calling this function. |
---|
1304 | | - * |
---|
1305 | | - * RETURNS: |
---|
1306 | | - * The number of pages mapped on success, -errno on failure. |
---|
1307 | | - */ |
---|
1308 | | -int map_kernel_range_noflush(unsigned long addr, unsigned long size, |
---|
1309 | | - pgprot_t prot, struct page **pages) |
---|
1310 | | -{ |
---|
1311 | | - return vmap_page_range_noflush(addr, addr + size, prot, pages); |
---|
1312 | | -} |
---|
1313 | | - |
---|
1314 | | -/** |
---|
1315 | | - * unmap_kernel_range_noflush - unmap kernel VM area |
---|
1316 | | - * @addr: start of the VM area to unmap |
---|
1317 | | - * @size: size of the VM area to unmap |
---|
1318 | | - * |
---|
1319 | | - * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size |
---|
1320 | | - * specify should have been allocated using get_vm_area() and its |
---|
1321 | | - * friends. |
---|
1322 | | - * |
---|
1323 | | - * NOTE: |
---|
1324 | | - * This function does NOT do any cache flushing. The caller is |
---|
1325 | | - * responsible for calling flush_cache_vunmap() on to-be-mapped areas |
---|
1326 | | - * before calling this function and flush_tlb_kernel_range() after. |
---|
1327 | | - */ |
---|
1328 | | -void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) |
---|
1329 | | -{ |
---|
1330 | | - vunmap_page_range(addr, addr + size); |
---|
1331 | | -} |
---|
1332 | | -EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); |
---|
1333 | 2019 | |
---|
1334 | 2020 | /** |
---|
1335 | 2021 | * unmap_kernel_range - unmap kernel VM area and flush cache and TLB |
---|
.. | .. |
---|
1344 | 2030 | unsigned long end = addr + size; |
---|
1345 | 2031 | |
---|
1346 | 2032 | flush_cache_vunmap(addr, end); |
---|
1347 | | - vunmap_page_range(addr, end); |
---|
| 2033 | + unmap_kernel_range_noflush(addr, size); |
---|
1348 | 2034 | flush_tlb_kernel_range(addr, end); |
---|
1349 | 2035 | } |
---|
1350 | | -EXPORT_SYMBOL_GPL(unmap_kernel_range); |
---|
1351 | 2036 | |
---|
1352 | | -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) |
---|
| 2037 | +static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, |
---|
| 2038 | + struct vmap_area *va, unsigned long flags, const void *caller) |
---|
1353 | 2039 | { |
---|
1354 | | - unsigned long addr = (unsigned long)area->addr; |
---|
1355 | | - unsigned long end = addr + get_vm_area_size(area); |
---|
1356 | | - int err; |
---|
1357 | | - |
---|
1358 | | - err = vmap_page_range(addr, end, prot, pages); |
---|
1359 | | - |
---|
1360 | | - return err > 0 ? 0 : err; |
---|
1361 | | -} |
---|
1362 | | -EXPORT_SYMBOL_GPL(map_vm_area); |
---|
1363 | | - |
---|
1364 | | -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, |
---|
1365 | | - unsigned long flags, const void *caller) |
---|
1366 | | -{ |
---|
1367 | | - spin_lock(&vmap_area_lock); |
---|
1368 | 2040 | vm->flags = flags; |
---|
1369 | 2041 | vm->addr = (void *)va->va_start; |
---|
1370 | 2042 | vm->size = va->va_end - va->va_start; |
---|
1371 | 2043 | vm->caller = caller; |
---|
1372 | 2044 | va->vm = vm; |
---|
1373 | | - va->flags |= VM_VM_AREA; |
---|
| 2045 | + trace_android_vh_save_vmalloc_stack(flags, vm); |
---|
| 2046 | +} |
---|
| 2047 | + |
---|
| 2048 | +static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, |
---|
| 2049 | + unsigned long flags, const void *caller) |
---|
| 2050 | +{ |
---|
| 2051 | + spin_lock(&vmap_area_lock); |
---|
| 2052 | + setup_vmalloc_vm_locked(vm, va, flags, caller); |
---|
1374 | 2053 | spin_unlock(&vmap_area_lock); |
---|
1375 | 2054 | } |
---|
1376 | 2055 | |
---|
.. | .. |
---|
1391 | 2070 | { |
---|
1392 | 2071 | struct vmap_area *va; |
---|
1393 | 2072 | struct vm_struct *area; |
---|
| 2073 | + unsigned long requested_size = size; |
---|
1394 | 2074 | |
---|
1395 | 2075 | BUG_ON(in_interrupt()); |
---|
1396 | 2076 | size = PAGE_ALIGN(size); |
---|
.. | .. |
---|
1414 | 2094 | return NULL; |
---|
1415 | 2095 | } |
---|
1416 | 2096 | |
---|
| 2097 | + kasan_unpoison_vmalloc((void *)va->va_start, requested_size); |
---|
| 2098 | + |
---|
1417 | 2099 | setup_vmalloc_vm(area, va, flags, caller); |
---|
1418 | 2100 | |
---|
1419 | 2101 | return area; |
---|
1420 | 2102 | } |
---|
1421 | | - |
---|
1422 | | -struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
---|
1423 | | - unsigned long start, unsigned long end) |
---|
1424 | | -{ |
---|
1425 | | - return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, |
---|
1426 | | - GFP_KERNEL, __builtin_return_address(0)); |
---|
1427 | | -} |
---|
1428 | | -EXPORT_SYMBOL_GPL(__get_vm_area); |
---|
1429 | 2103 | |
---|
1430 | 2104 | struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, |
---|
1431 | 2105 | unsigned long start, unsigned long end, |
---|
.. | .. |
---|
1434 | 2108 | return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, |
---|
1435 | 2109 | GFP_KERNEL, caller); |
---|
1436 | 2110 | } |
---|
| 2111 | +EXPORT_SYMBOL_GPL(__get_vm_area_caller); |
---|
1437 | 2112 | |
---|
1438 | 2113 | /** |
---|
1439 | | - * get_vm_area - reserve a contiguous kernel virtual area |
---|
1440 | | - * @size: size of the area |
---|
1441 | | - * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC |
---|
| 2114 | + * get_vm_area - reserve a contiguous kernel virtual area |
---|
| 2115 | + * @size: size of the area |
---|
| 2116 | + * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC |
---|
1442 | 2117 | * |
---|
1443 | | - * Search an area of @size in the kernel virtual mapping area, |
---|
1444 | | - * and reserved it for out purposes. Returns the area descriptor |
---|
1445 | | - * on success or %NULL on failure. |
---|
| 2118 | + * Search an area of @size in the kernel virtual mapping area, |
---|
 | 2119 | + * and reserve it for our purposes. Returns the area descriptor |
---|
| 2120 | + * on success or %NULL on failure. |
---|
| 2121 | + * |
---|
| 2122 | + * Return: the area descriptor on success or %NULL on failure. |
---|
1446 | 2123 | */ |
---|
1447 | 2124 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) |
---|
1448 | 2125 | { |
---|
.. | .. |
---|
1450 | 2127 | NUMA_NO_NODE, GFP_KERNEL, |
---|
1451 | 2128 | __builtin_return_address(0)); |
---|
1452 | 2129 | } |
---|
1453 | | -EXPORT_SYMBOL_GPL(get_vm_area); |
---|
1454 | 2130 | |
---|
1455 | 2131 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, |
---|
1456 | 2132 | const void *caller) |
---|
.. | .. |
---|
1460 | 2136 | } |
---|
1461 | 2137 | |
---|
1462 | 2138 | /** |
---|
1463 | | - * find_vm_area - find a continuous kernel virtual area |
---|
1464 | | - * @addr: base address |
---|
| 2139 | + * find_vm_area - find a continuous kernel virtual area |
---|
| 2140 | + * @addr: base address |
---|
1465 | 2141 | * |
---|
1466 | | - * Search for the kernel VM area starting at @addr, and return it. |
---|
1467 | | - * It is up to the caller to do all required locking to keep the returned |
---|
1468 | | - * pointer valid. |
---|
| 2142 | + * Search for the kernel VM area starting at @addr, and return it. |
---|
| 2143 | + * It is up to the caller to do all required locking to keep the returned |
---|
| 2144 | + * pointer valid. |
---|
| 2145 | + * |
---|
| 2146 | + * Return: the area descriptor on success or %NULL on failure. |
---|
1469 | 2147 | */ |
---|
1470 | 2148 | struct vm_struct *find_vm_area(const void *addr) |
---|
1471 | 2149 | { |
---|
1472 | 2150 | struct vmap_area *va; |
---|
1473 | 2151 | |
---|
1474 | 2152 | va = find_vmap_area((unsigned long)addr); |
---|
1475 | | - if (va && va->flags & VM_VM_AREA) |
---|
1476 | | - return va->vm; |
---|
| 2153 | + if (!va) |
---|
| 2154 | + return NULL; |
---|
1477 | 2155 | |
---|
1478 | | - return NULL; |
---|
| 2156 | + return va->vm; |
---|
1479 | 2157 | } |
---|
1480 | 2158 | |
---|
1481 | 2159 | /** |
---|
1482 | | - * remove_vm_area - find and remove a continuous kernel virtual area |
---|
1483 | | - * @addr: base address |
---|
| 2160 | + * remove_vm_area - find and remove a continuous kernel virtual area |
---|
| 2161 | + * @addr: base address |
---|
1484 | 2162 | * |
---|
1485 | | - * Search for the kernel VM area starting at @addr, and remove it. |
---|
1486 | | - * This function returns the found VM area, but using it is NOT safe |
---|
1487 | | - * on SMP machines, except for its size or flags. |
---|
| 2163 | + * Search for the kernel VM area starting at @addr, and remove it. |
---|
| 2164 | + * This function returns the found VM area, but using it is NOT safe |
---|
| 2165 | + * on SMP machines, except for its size or flags. |
---|
| 2166 | + * |
---|
| 2167 | + * Return: the area descriptor on success or %NULL on failure. |
---|
1488 | 2168 | */ |
---|
1489 | 2169 | struct vm_struct *remove_vm_area(const void *addr) |
---|
1490 | 2170 | { |
---|
.. | .. |
---|
1492 | 2172 | |
---|
1493 | 2173 | might_sleep(); |
---|
1494 | 2174 | |
---|
1495 | | - va = find_vmap_area((unsigned long)addr); |
---|
1496 | | - if (va && va->flags & VM_VM_AREA) { |
---|
| 2175 | + spin_lock(&vmap_area_lock); |
---|
| 2176 | + va = __find_vmap_area((unsigned long)addr); |
---|
| 2177 | + if (va && va->vm) { |
---|
1497 | 2178 | struct vm_struct *vm = va->vm; |
---|
1498 | 2179 | |
---|
1499 | | - spin_lock(&vmap_area_lock); |
---|
| 2180 | + trace_android_vh_remove_vmalloc_stack(vm); |
---|
1500 | 2181 | va->vm = NULL; |
---|
1501 | | - va->flags &= ~VM_VM_AREA; |
---|
1502 | | - va->flags |= VM_LAZY_FREE; |
---|
1503 | 2182 | spin_unlock(&vmap_area_lock); |
---|
1504 | 2183 | |
---|
1505 | 2184 | kasan_free_shadow(vm); |
---|
.. | .. |
---|
1507 | 2186 | |
---|
1508 | 2187 | return vm; |
---|
1509 | 2188 | } |
---|
| 2189 | + |
---|
| 2190 | + spin_unlock(&vmap_area_lock); |
---|
1510 | 2191 | return NULL; |
---|
| 2192 | +} |
---|
| 2193 | + |
---|
| 2194 | +static inline void set_area_direct_map(const struct vm_struct *area, |
---|
| 2195 | + int (*set_direct_map)(struct page *page)) |
---|
| 2196 | +{ |
---|
| 2197 | + int i; |
---|
| 2198 | + |
---|
| 2199 | + for (i = 0; i < area->nr_pages; i++) |
---|
| 2200 | + if (page_address(area->pages[i])) |
---|
| 2201 | + set_direct_map(area->pages[i]); |
---|
| 2202 | +} |
---|
| 2203 | + |
---|
| 2204 | +/* Handle removing and resetting vm mappings related to the vm_struct. */ |
---|
| 2205 | +static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) |
---|
| 2206 | +{ |
---|
| 2207 | + unsigned long start = ULONG_MAX, end = 0; |
---|
| 2208 | + int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; |
---|
| 2209 | + int flush_dmap = 0; |
---|
| 2210 | + int i; |
---|
| 2211 | + |
---|
| 2212 | + remove_vm_area(area->addr); |
---|
| 2213 | + |
---|
| 2214 | + /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ |
---|
| 2215 | + if (!flush_reset) |
---|
| 2216 | + return; |
---|
| 2217 | + |
---|
| 2218 | + /* |
---|
| 2219 | + * If not deallocating pages, just do the flush of the VM area and |
---|
| 2220 | + * return. |
---|
| 2221 | + */ |
---|
| 2222 | + if (!deallocate_pages) { |
---|
| 2223 | + vm_unmap_aliases(); |
---|
| 2224 | + return; |
---|
| 2225 | + } |
---|
| 2226 | + |
---|
| 2227 | + /* |
---|
| 2228 | + * If execution gets here, flush the vm mapping and reset the direct |
---|
| 2229 | + * map. Find the start and end range of the direct mappings to make sure |
---|
| 2230 | + * the vm_unmap_aliases() flush includes the direct map. |
---|
| 2231 | + */ |
---|
| 2232 | + for (i = 0; i < area->nr_pages; i++) { |
---|
| 2233 | + unsigned long addr = (unsigned long)page_address(area->pages[i]); |
---|
| 2234 | + if (addr) { |
---|
| 2235 | + start = min(addr, start); |
---|
| 2236 | + end = max(addr + PAGE_SIZE, end); |
---|
| 2237 | + flush_dmap = 1; |
---|
| 2238 | + } |
---|
| 2239 | + } |
---|
| 2240 | + |
---|
| 2241 | + /* |
---|
| 2242 | + * Set direct map to something invalid so that it won't be cached if |
---|
| 2243 | + * there are any accesses after the TLB flush, then flush the TLB and |
---|
| 2244 | + * reset the direct map permissions to the default. |
---|
| 2245 | + */ |
---|
| 2246 | + set_area_direct_map(area, set_direct_map_invalid_noflush); |
---|
| 2247 | + _vm_unmap_aliases(start, end, flush_dmap); |
---|
| 2248 | + set_area_direct_map(area, set_direct_map_default_noflush); |
---|
1511 | 2249 | } |
---|
1512 | 2250 | |
---|
1513 | 2251 | static void __vunmap(const void *addr, int deallocate_pages) |
---|
.. | .. |
---|
1531 | 2269 | debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); |
---|
1532 | 2270 | debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); |
---|
1533 | 2271 | |
---|
1534 | | - remove_vm_area(addr); |
---|
| 2272 | + kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); |
---|
| 2273 | + |
---|
| 2274 | + vm_remove_mappings(area, deallocate_pages); |
---|
| 2275 | + |
---|
1535 | 2276 | if (deallocate_pages) { |
---|
1536 | 2277 | int i; |
---|
1537 | 2278 | |
---|
.. | .. |
---|
1556 | 2297 | * Use raw_cpu_ptr() because this can be called from preemptible |
---|
1557 | 2298 | * context. Preemption is absolutely fine here, because the llist_add() |
---|
1558 | 2299 | * implementation is lockless, so it works even if we are adding to |
---|
1559 | | - * nother cpu's list. schedule_work() should be fine with this too. |
---|
| 2300 | + * another cpu's list. schedule_work() should be fine with this too. |
---|
1560 | 2301 | */ |
---|
1561 | 2302 | struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); |
---|
1562 | 2303 | |
---|
.. | .. |
---|
1565 | 2306 | } |
---|
1566 | 2307 | |
---|
1567 | 2308 | /** |
---|
1568 | | - * vfree_atomic - release memory allocated by vmalloc() |
---|
1569 | | - * @addr: memory base address |
---|
| 2309 | + * vfree_atomic - release memory allocated by vmalloc() |
---|
| 2310 | + * @addr: memory base address |
---|
1570 | 2311 | * |
---|
1571 | | - * This one is just like vfree() but can be called in any atomic context |
---|
1572 | | - * except NMIs. |
---|
| 2312 | + * This one is just like vfree() but can be called in any atomic context |
---|
| 2313 | + * except NMIs. |
---|
1573 | 2314 | */ |
---|
1574 | 2315 | void vfree_atomic(const void *addr) |
---|
1575 | 2316 | { |
---|
.. | .. |
---|
1582 | 2323 | __vfree_deferred(addr); |
---|
1583 | 2324 | } |
---|
1584 | 2325 | |
---|
| 2326 | +static void __vfree(const void *addr) |
---|
| 2327 | +{ |
---|
| 2328 | + if (unlikely(in_interrupt())) |
---|
| 2329 | + __vfree_deferred(addr); |
---|
| 2330 | + else |
---|
| 2331 | + __vunmap(addr, 1); |
---|
| 2332 | +} |
---|
| 2333 | + |
---|
1585 | 2334 | /** |
---|
1586 | | - * vfree - release memory allocated by vmalloc() |
---|
1587 | | - * @addr: memory base address |
---|
| 2335 | + * vfree - Release memory allocated by vmalloc() |
---|
| 2336 | + * @addr: Memory base address |
---|
1588 | 2337 | * |
---|
1589 | | - * Free the virtually continuous memory area starting at @addr, as |
---|
1590 | | - * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is |
---|
1591 | | - * NULL, no operation is performed. |
---|
| 2338 | + * Free the virtually continuous memory area starting at @addr, as obtained |
---|
| 2339 | + * from one of the vmalloc() family of APIs. This will usually also free the |
---|
| 2340 | + * physical memory underlying the virtual allocation, but that memory is |
---|
| 2341 | + * reference counted, so it will not be freed until the last user goes away. |
---|
1592 | 2342 | * |
---|
1593 | | - * Must not be called in NMI context (strictly speaking, only if we don't |
---|
1594 | | - * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling |
---|
1595 | | - * conventions for vfree() arch-depenedent would be a really bad idea) |
---|
| 2343 | + * If @addr is NULL, no operation is performed. |
---|
1596 | 2344 | * |
---|
1597 | | - * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) |
---|
| 2345 | + * Context: |
---|
| 2346 | + * May sleep if called *not* from interrupt context. |
---|
| 2347 | + * Must not be called in NMI context (strictly speaking, it could be |
---|
| 2348 | + * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling |
---|
 | 2349 | + * conventions for vfree() arch-dependent would be a really bad idea). |
---|
1598 | 2350 | */ |
---|
1599 | 2351 | void vfree(const void *addr) |
---|
1600 | 2352 | { |
---|
.. | .. |
---|
1602 | 2354 | |
---|
1603 | 2355 | kmemleak_free(addr); |
---|
1604 | 2356 | |
---|
| 2357 | + might_sleep_if(!in_interrupt()); |
---|
| 2358 | + |
---|
1605 | 2359 | if (!addr) |
---|
1606 | 2360 | return; |
---|
1607 | | - if (unlikely(in_interrupt())) |
---|
1608 | | - __vfree_deferred(addr); |
---|
1609 | | - else |
---|
1610 | | - __vunmap(addr, 1); |
---|
| 2361 | + |
---|
| 2362 | + __vfree(addr); |
---|
1611 | 2363 | } |
---|
1612 | 2364 | EXPORT_SYMBOL(vfree); |
---|
1613 | 2365 | |
---|
1614 | 2366 | /** |
---|
1615 | | - * vunmap - release virtual mapping obtained by vmap() |
---|
1616 | | - * @addr: memory base address |
---|
| 2367 | + * vunmap - release virtual mapping obtained by vmap() |
---|
| 2368 | + * @addr: memory base address |
---|
1617 | 2369 | * |
---|
1618 | | - * Free the virtually contiguous memory area starting at @addr, |
---|
1619 | | - * which was created from the page array passed to vmap(). |
---|
| 2370 | + * Free the virtually contiguous memory area starting at @addr, |
---|
| 2371 | + * which was created from the page array passed to vmap(). |
---|
1620 | 2372 | * |
---|
1621 | | - * Must not be called in interrupt context. |
---|
| 2373 | + * Must not be called in interrupt context. |
---|
1622 | 2374 | */ |
---|
1623 | 2375 | void vunmap(const void *addr) |
---|
1624 | 2376 | { |
---|
.. | .. |
---|
1630 | 2382 | EXPORT_SYMBOL(vunmap); |
---|
1631 | 2383 | |
---|
1632 | 2384 | /** |
---|
1633 | | - * vmap - map an array of pages into virtually contiguous space |
---|
1634 | | - * @pages: array of page pointers |
---|
1635 | | - * @count: number of pages to map |
---|
1636 | | - * @flags: vm_area->flags |
---|
1637 | | - * @prot: page protection for the mapping |
---|
| 2385 | + * vmap - map an array of pages into virtually contiguous space |
---|
| 2386 | + * @pages: array of page pointers |
---|
| 2387 | + * @count: number of pages to map |
---|
| 2388 | + * @flags: vm_area->flags |
---|
| 2389 | + * @prot: page protection for the mapping |
---|
1638 | 2390 | * |
---|
1639 | | - * Maps @count pages from @pages into contiguous kernel virtual |
---|
1640 | | - * space. |
---|
| 2391 | + * Maps @count pages from @pages into contiguous kernel virtual space. |
---|
| 2392 | + * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself |
---|
| 2393 | + * (which must be kmalloc or vmalloc memory) and one reference per page in it |
---|
| 2394 | + * are transferred from the caller to vmap(), and will be freed / dropped when |
---|
| 2395 | + * vfree() is called on the return value. |
---|
| 2396 | + * |
---|
| 2397 | + * Return: the address of the area or %NULL on failure |
---|
1641 | 2398 | */ |
---|
1642 | 2399 | void *vmap(struct page **pages, unsigned int count, |
---|
1643 | | - unsigned long flags, pgprot_t prot) |
---|
| 2400 | + unsigned long flags, pgprot_t prot) |
---|
1644 | 2401 | { |
---|
1645 | 2402 | struct vm_struct *area; |
---|
1646 | 2403 | unsigned long size; /* In bytes */ |
---|
1647 | 2404 | |
---|
1648 | 2405 | might_sleep(); |
---|
1649 | 2406 | |
---|
1650 | | - if (count > totalram_pages) |
---|
| 2407 | + if (count > totalram_pages()) |
---|
1651 | 2408 | return NULL; |
---|
1652 | 2409 | |
---|
1653 | 2410 | size = (unsigned long)count << PAGE_SHIFT; |
---|
.. | .. |
---|
1655 | 2412 | if (!area) |
---|
1656 | 2413 | return NULL; |
---|
1657 | 2414 | |
---|
1658 | | - if (map_vm_area(area, prot, pages)) { |
---|
| 2415 | + if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), |
---|
| 2416 | + pages) < 0) { |
---|
1659 | 2417 | vunmap(area->addr); |
---|
1660 | 2418 | return NULL; |
---|
1661 | 2419 | } |
---|
1662 | 2420 | |
---|
| 2421 | + if (flags & VM_MAP_PUT_PAGES) { |
---|
| 2422 | + area->pages = pages; |
---|
| 2423 | + area->nr_pages = count; |
---|
| 2424 | + } |
---|
1663 | 2425 | return area->addr; |
---|
1664 | 2426 | } |
---|
1665 | 2427 | EXPORT_SYMBOL(vmap); |
---|
1666 | 2428 | |
---|
1667 | | -static void *__vmalloc_node(unsigned long size, unsigned long align, |
---|
1668 | | - gfp_t gfp_mask, pgprot_t prot, |
---|
1669 | | - int node, const void *caller); |
---|
| 2429 | +#ifdef CONFIG_VMAP_PFN |
---|
| 2430 | +struct vmap_pfn_data { |
---|
| 2431 | + unsigned long *pfns; |
---|
| 2432 | + pgprot_t prot; |
---|
| 2433 | + unsigned int idx; |
---|
| 2434 | +}; |
---|
| 2435 | + |
---|
| 2436 | +static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) |
---|
| 2437 | +{ |
---|
| 2438 | + struct vmap_pfn_data *data = private; |
---|
| 2439 | + |
---|
| 2440 | + if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx]))) |
---|
| 2441 | + return -EINVAL; |
---|
| 2442 | + *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); |
---|
| 2443 | + return 0; |
---|
| 2444 | +} |
---|
| 2445 | + |
---|
| 2446 | +/** |
---|
| 2447 | + * vmap_pfn - map an array of PFNs into virtually contiguous space |
---|
| 2448 | + * @pfns: array of PFNs |
---|
| 2449 | + * @count: number of pages to map |
---|
| 2450 | + * @prot: page protection for the mapping |
---|
| 2451 | + * |
---|
| 2452 | + * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns |
---|
| 2453 | + * the start address of the mapping. |
---|
| 2454 | + */ |
---|
| 2455 | +void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) |
---|
| 2456 | +{ |
---|
| 2457 | + struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; |
---|
| 2458 | + struct vm_struct *area; |
---|
| 2459 | + |
---|
| 2460 | + area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, |
---|
| 2461 | + __builtin_return_address(0)); |
---|
| 2462 | + if (!area) |
---|
| 2463 | + return NULL; |
---|
| 2464 | + if (apply_to_page_range(&init_mm, (unsigned long)area->addr, |
---|
| 2465 | + count * PAGE_SIZE, vmap_pfn_apply, &data)) { |
---|
| 2466 | + free_vm_area(area); |
---|
| 2467 | + return NULL; |
---|
| 2468 | + } |
---|
| 2469 | + return area->addr; |
---|
| 2470 | +} |
---|
| 2471 | +EXPORT_SYMBOL_GPL(vmap_pfn); |
---|
| 2472 | +#endif /* CONFIG_VMAP_PFN */ |
---|
| 2473 | + |
---|
1670 | 2474 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
---|
1671 | 2475 | pgprot_t prot, int node) |
---|
1672 | 2476 | { |
---|
1673 | | - struct page **pages; |
---|
1674 | | - unsigned int nr_pages, array_size, i; |
---|
1675 | 2477 | const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; |
---|
1676 | | - const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; |
---|
1677 | | - const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ? |
---|
1678 | | - 0 : |
---|
1679 | | - __GFP_HIGHMEM; |
---|
| 2478 | + unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; |
---|
| 2479 | + unsigned int array_size = nr_pages * sizeof(struct page *), i; |
---|
| 2480 | + struct page **pages; |
---|
1680 | 2481 | |
---|
1681 | | - nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; |
---|
1682 | | - array_size = (nr_pages * sizeof(struct page *)); |
---|
| 2482 | + gfp_mask |= __GFP_NOWARN; |
---|
| 2483 | + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) |
---|
| 2484 | + gfp_mask |= __GFP_HIGHMEM; |
---|
1683 | 2485 | |
---|
1684 | 2486 | /* Please note that the recursion is strictly bounded. */ |
---|
1685 | 2487 | if (array_size > PAGE_SIZE) { |
---|
1686 | | - pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, |
---|
1687 | | - PAGE_KERNEL, node, area->caller); |
---|
| 2488 | + pages = __vmalloc_node(array_size, 1, nested_gfp, node, |
---|
| 2489 | + area->caller); |
---|
1688 | 2490 | } else { |
---|
1689 | 2491 | pages = kmalloc_node(array_size, nested_gfp, node); |
---|
1690 | 2492 | } |
---|
.. | .. |
---|
1702 | 2504 | struct page *page; |
---|
1703 | 2505 | |
---|
1704 | 2506 | if (node == NUMA_NO_NODE) |
---|
1705 | | - page = alloc_page(alloc_mask|highmem_mask); |
---|
| 2507 | + page = alloc_page(gfp_mask); |
---|
1706 | 2508 | else |
---|
1707 | | - page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); |
---|
| 2509 | + page = alloc_pages_node(node, gfp_mask, 0); |
---|
1708 | 2510 | |
---|
1709 | 2511 | if (unlikely(!page)) { |
---|
1710 | | - /* Successfully allocated i pages, free them in __vunmap() */ |
---|
| 2512 | + /* Successfully allocated i pages, free them in __vfree() */ |
---|
1711 | 2513 | area->nr_pages = i; |
---|
1712 | 2514 | atomic_long_add(area->nr_pages, &nr_vmalloc_pages); |
---|
1713 | 2515 | goto fail; |
---|
1714 | 2516 | } |
---|
1715 | 2517 | area->pages[i] = page; |
---|
1716 | | - if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) |
---|
| 2518 | + if (gfpflags_allow_blocking(gfp_mask)) |
---|
1717 | 2519 | cond_resched(); |
---|
1718 | 2520 | } |
---|
1719 | 2521 | atomic_long_add(area->nr_pages, &nr_vmalloc_pages); |
---|
1720 | 2522 | |
---|
1721 | | - if (map_vm_area(area, prot, pages)) |
---|
| 2523 | + if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), |
---|
| 2524 | + prot, pages) < 0) |
---|
1722 | 2525 | goto fail; |
---|
| 2526 | + |
---|
1723 | 2527 | return area->addr; |
---|
1724 | 2528 | |
---|
1725 | 2529 | fail: |
---|
1726 | 2530 | warn_alloc(gfp_mask, NULL, |
---|
1727 | 2531 | "vmalloc: allocation failure, allocated %ld of %ld bytes", |
---|
1728 | 2532 | (area->nr_pages*PAGE_SIZE), area->size); |
---|
1729 | | - vfree(area->addr); |
---|
| 2533 | + __vfree(area->addr); |
---|
1730 | 2534 | return NULL; |
---|
1731 | 2535 | } |
---|
1732 | 2536 | |
---|
1733 | 2537 | /** |
---|
1734 | | - * __vmalloc_node_range - allocate virtually contiguous memory |
---|
1735 | | - * @size: allocation size |
---|
1736 | | - * @align: desired alignment |
---|
1737 | | - * @start: vm area range start |
---|
1738 | | - * @end: vm area range end |
---|
1739 | | - * @gfp_mask: flags for the page level allocator |
---|
1740 | | - * @prot: protection mask for the allocated pages |
---|
1741 | | - * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) |
---|
1742 | | - * @node: node to use for allocation or NUMA_NO_NODE |
---|
1743 | | - * @caller: caller's return address |
---|
| 2538 | + * __vmalloc_node_range - allocate virtually contiguous memory |
---|
| 2539 | + * @size: allocation size |
---|
| 2540 | + * @align: desired alignment |
---|
| 2541 | + * @start: vm area range start |
---|
| 2542 | + * @end: vm area range end |
---|
| 2543 | + * @gfp_mask: flags for the page level allocator |
---|
| 2544 | + * @prot: protection mask for the allocated pages |
---|
| 2545 | + * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) |
---|
| 2546 | + * @node: node to use for allocation or NUMA_NO_NODE |
---|
| 2547 | + * @caller: caller's return address |
---|
1744 | 2548 | * |
---|
1745 | | - * Allocate enough pages to cover @size from the page level |
---|
1746 | | - * allocator with @gfp_mask flags. Map them into contiguous |
---|
1747 | | - * kernel virtual space, using a pagetable protection of @prot. |
---|
| 2549 | + * Allocate enough pages to cover @size from the page level |
---|
| 2550 | + * allocator with @gfp_mask flags. Map them into contiguous |
---|
| 2551 | + * kernel virtual space, using a pagetable protection of @prot. |
---|
| 2552 | + * |
---|
| 2553 | + * Return: the address of the area or %NULL on failure |
---|
1748 | 2554 | */ |
---|
1749 | 2555 | void *__vmalloc_node_range(unsigned long size, unsigned long align, |
---|
1750 | 2556 | unsigned long start, unsigned long end, gfp_t gfp_mask, |
---|
.. | .. |
---|
1756 | 2562 | unsigned long real_size = size; |
---|
1757 | 2563 | |
---|
1758 | 2564 | size = PAGE_ALIGN(size); |
---|
1759 | | - if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
---|
| 2565 | + if (!size || (size >> PAGE_SHIFT) > totalram_pages()) |
---|
1760 | 2566 | goto fail; |
---|
1761 | 2567 | |
---|
1762 | | - area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | |
---|
| 2568 | + area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED | |
---|
1763 | 2569 | vm_flags, start, end, node, gfp_mask, caller); |
---|
1764 | 2570 | if (!area) |
---|
1765 | 2571 | goto fail; |
---|
.. | .. |
---|
1767 | 2573 | addr = __vmalloc_area_node(area, gfp_mask, prot, node); |
---|
1768 | 2574 | if (!addr) |
---|
1769 | 2575 | return NULL; |
---|
1770 | | - |
---|
1771 | | - /* |
---|
1772 | | - * First make sure the mappings are removed from all page-tables |
---|
1773 | | - * before they are freed. |
---|
1774 | | - */ |
---|
1775 | | - vmalloc_sync_unmappings(); |
---|
1776 | 2576 | |
---|
1777 | 2577 | /* |
---|
1778 | 2578 | * In this function, newly allocated vm_struct has VM_UNINITIALIZED |
---|
.. | .. |
---|
1792 | 2592 | } |
---|
1793 | 2593 | |
---|
1794 | 2594 | /** |
---|
1795 | | - * __vmalloc_node - allocate virtually contiguous memory |
---|
1796 | | - * @size: allocation size |
---|
1797 | | - * @align: desired alignment |
---|
1798 | | - * @gfp_mask: flags for the page level allocator |
---|
1799 | | - * @prot: protection mask for the allocated pages |
---|
1800 | | - * @node: node to use for allocation or NUMA_NO_NODE |
---|
1801 | | - * @caller: caller's return address |
---|
| 2595 | + * __vmalloc_node - allocate virtually contiguous memory |
---|
| 2596 | + * @size: allocation size |
---|
| 2597 | + * @align: desired alignment |
---|
| 2598 | + * @gfp_mask: flags for the page level allocator |
---|
| 2599 | + * @node: node to use for allocation or NUMA_NO_NODE |
---|
| 2600 | + * @caller: caller's return address |
---|
1802 | 2601 | * |
---|
1803 | | - * Allocate enough pages to cover @size from the page level |
---|
1804 | | - * allocator with @gfp_mask flags. Map them into contiguous |
---|
1805 | | - * kernel virtual space, using a pagetable protection of @prot. |
---|
| 2602 | + * Allocate enough pages to cover @size from the page level allocator with |
---|
| 2603 | + * @gfp_mask flags. Map them into contiguous kernel virtual space. |
---|
1806 | 2604 | * |
---|
1807 | | - * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL |
---|
1808 | | - * and __GFP_NOFAIL are not supported |
---|
| 2605 | + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL |
---|
| 2606 | + * and __GFP_NOFAIL are not supported |
---|
1809 | 2607 | * |
---|
1810 | | - * Any use of gfp flags outside of GFP_KERNEL should be consulted |
---|
1811 | | - * with mm people. |
---|
| 2608 | + * Any use of gfp flags outside of GFP_KERNEL should be consulted |
---|
| 2609 | + * with mm people. |
---|
1812 | 2610 | * |
---|
| 2611 | + * Return: pointer to the allocated memory or %NULL on error |
---|
1813 | 2612 | */ |
---|
1814 | | -static void *__vmalloc_node(unsigned long size, unsigned long align, |
---|
1815 | | - gfp_t gfp_mask, pgprot_t prot, |
---|
1816 | | - int node, const void *caller) |
---|
| 2613 | +void *__vmalloc_node(unsigned long size, unsigned long align, |
---|
| 2614 | + gfp_t gfp_mask, int node, const void *caller) |
---|
1817 | 2615 | { |
---|
1818 | 2616 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, |
---|
1819 | | - gfp_mask, prot, 0, node, caller); |
---|
| 2617 | + gfp_mask, PAGE_KERNEL, 0, node, caller); |
---|
1820 | 2618 | } |
---|
| 2619 | +/* |
---|
| 2620 | + * This export exists only for vmalloc performance analysis and stress |
---|
| 2621 | + * testing. It is required by the vmalloc test module; do not use it for |
---|
| 2622 | + * anything else. |
---|
| 2623 | + */ |
---|
| 2624 | +#ifdef CONFIG_TEST_VMALLOC_MODULE |
---|
| 2625 | +EXPORT_SYMBOL_GPL(__vmalloc_node); |
---|
| 2626 | +#endif |
---|
1821 | 2627 | |
---|
1822 | | -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
---|
| 2628 | +void *__vmalloc(unsigned long size, gfp_t gfp_mask) |
---|
1823 | 2629 | { |
---|
1824 | | - return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, |
---|
| 2630 | + return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, |
---|
1825 | 2631 | __builtin_return_address(0)); |
---|
1826 | 2632 | } |
---|
1827 | 2633 | EXPORT_SYMBOL(__vmalloc); |
---|
1828 | 2634 | |
---|
1829 | | -static inline void *__vmalloc_node_flags(unsigned long size, |
---|
1830 | | - int node, gfp_t flags) |
---|
1831 | | -{ |
---|
1832 | | - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, |
---|
1833 | | - node, __builtin_return_address(0)); |
---|
1834 | | -} |
---|
1835 | | - |
---|
1836 | | - |
---|
1837 | | -void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, |
---|
1838 | | - void *caller) |
---|
1839 | | -{ |
---|
1840 | | - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller); |
---|
1841 | | -} |
---|
1842 | | - |
---|
1843 | 2635 | /** |
---|
1844 | | - * vmalloc - allocate virtually contiguous memory |
---|
1845 | | - * @size: allocation size |
---|
1846 | | - * Allocate enough pages to cover @size from the page level |
---|
1847 | | - * allocator and map them into contiguous kernel virtual space. |
---|
| 2636 | + * vmalloc - allocate virtually contiguous memory |
---|
| 2637 | + * @size: allocation size |
---|
1848 | 2638 | * |
---|
1849 | | - * For tight control over page level allocator and protection flags |
---|
1850 | | - * use __vmalloc() instead. |
---|
| 2639 | + * Allocate enough pages to cover @size from the page level |
---|
| 2640 | + * allocator and map them into contiguous kernel virtual space. |
---|
| 2641 | + * |
---|
| 2642 | + * For tight control over page level allocator and protection flags |
---|
| 2643 | + * use __vmalloc() instead. |
---|
| 2644 | + * |
---|
| 2645 | + * Return: pointer to the allocated memory or %NULL on error |
---|
1851 | 2646 | */ |
---|
1852 | 2647 | void *vmalloc(unsigned long size) |
---|
1853 | 2648 | { |
---|
1854 | | - return __vmalloc_node_flags(size, NUMA_NO_NODE, |
---|
1855 | | - GFP_KERNEL); |
---|
| 2649 | + return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, |
---|
| 2650 | + __builtin_return_address(0)); |
---|
1856 | 2651 | } |
---|
1857 | 2652 | EXPORT_SYMBOL(vmalloc); |
---|
1858 | 2653 | |
---|
1859 | 2654 | /** |
---|
1860 | | - * vzalloc - allocate virtually contiguous memory with zero fill |
---|
1861 | | - * @size: allocation size |
---|
1862 | | - * Allocate enough pages to cover @size from the page level |
---|
1863 | | - * allocator and map them into contiguous kernel virtual space. |
---|
1864 | | - * The memory allocated is set to zero. |
---|
| 2655 | + * vzalloc - allocate virtually contiguous memory with zero fill |
---|
| 2656 | + * @size: allocation size |
---|
1865 | 2657 | * |
---|
1866 | | - * For tight control over page level allocator and protection flags |
---|
1867 | | - * use __vmalloc() instead. |
---|
| 2658 | + * Allocate enough pages to cover @size from the page level |
---|
| 2659 | + * allocator and map them into contiguous kernel virtual space. |
---|
| 2660 | + * The memory allocated is set to zero. |
---|
| 2661 | + * |
---|
| 2662 | + * For tight control over page level allocator and protection flags |
---|
| 2663 | + * use __vmalloc() instead. |
---|
| 2664 | + * |
---|
| 2665 | + * Return: pointer to the allocated memory or %NULL on error |
---|
1868 | 2666 | */ |
---|
1869 | 2667 | void *vzalloc(unsigned long size) |
---|
1870 | 2668 | { |
---|
1871 | | - return __vmalloc_node_flags(size, NUMA_NO_NODE, |
---|
1872 | | - GFP_KERNEL | __GFP_ZERO); |
---|
| 2669 | + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, |
---|
| 2670 | + __builtin_return_address(0)); |
---|
1873 | 2671 | } |
---|
1874 | 2672 | EXPORT_SYMBOL(vzalloc); |
---|
1875 | 2673 | |
---|
.. | .. |
---|
1879 | 2677 | * |
---|
1880 | 2678 | * The resulting memory area is zeroed so it can be mapped to userspace |
---|
1881 | 2679 | * without leaking data. |
---|
| 2680 | + * |
---|
| 2681 | + * Return: pointer to the allocated memory or %NULL on error |
---|
1882 | 2682 | */ |
---|
1883 | 2683 | void *vmalloc_user(unsigned long size) |
---|
1884 | 2684 | { |
---|
1885 | | - struct vm_struct *area; |
---|
1886 | | - void *ret; |
---|
1887 | | - |
---|
1888 | | - ret = __vmalloc_node(size, SHMLBA, |
---|
1889 | | - GFP_KERNEL | __GFP_ZERO, |
---|
1890 | | - PAGE_KERNEL, NUMA_NO_NODE, |
---|
1891 | | - __builtin_return_address(0)); |
---|
1892 | | - if (ret) { |
---|
1893 | | - area = find_vm_area(ret); |
---|
1894 | | - area->flags |= VM_USERMAP; |
---|
1895 | | - } |
---|
1896 | | - return ret; |
---|
| 2685 | + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, |
---|
| 2686 | + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, |
---|
| 2687 | + VM_USERMAP, NUMA_NO_NODE, |
---|
| 2688 | + __builtin_return_address(0)); |
---|
1897 | 2689 | } |
---|
1898 | 2690 | EXPORT_SYMBOL(vmalloc_user); |
---|
1899 | 2691 | |
---|
1900 | 2692 | /** |
---|
1901 | | - * vmalloc_node - allocate memory on a specific node |
---|
1902 | | - * @size: allocation size |
---|
1903 | | - * @node: numa node |
---|
| 2693 | + * vmalloc_node - allocate memory on a specific node |
---|
| 2694 | + * @size: allocation size |
---|
| 2695 | + * @node: numa node |
---|
1904 | 2696 | * |
---|
1905 | | - * Allocate enough pages to cover @size from the page level |
---|
1906 | | - * allocator and map them into contiguous kernel virtual space. |
---|
| 2697 | + * Allocate enough pages to cover @size from the page level |
---|
| 2698 | + * allocator and map them into contiguous kernel virtual space. |
---|
1907 | 2699 | * |
---|
1908 | | - * For tight control over page level allocator and protection flags |
---|
1909 | | - * use __vmalloc() instead. |
---|
| 2700 | + * For tight control over page level allocator and protection flags |
---|
| 2701 | + * use __vmalloc() instead. |
---|
| 2702 | + * |
---|
| 2703 | + * Return: pointer to the allocated memory or %NULL on error |
---|
1910 | 2704 | */ |
---|
1911 | 2705 | void *vmalloc_node(unsigned long size, int node) |
---|
1912 | 2706 | { |
---|
1913 | | - return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, |
---|
1914 | | - node, __builtin_return_address(0)); |
---|
| 2707 | + return __vmalloc_node(size, 1, GFP_KERNEL, node, |
---|
| 2708 | + __builtin_return_address(0)); |
---|
1915 | 2709 | } |
---|
1916 | 2710 | EXPORT_SYMBOL(vmalloc_node); |
---|
1917 | 2711 | |
---|
.. | .. |
---|
1924 | 2718 | * allocator and map them into contiguous kernel virtual space. |
---|
1925 | 2719 | * The memory allocated is set to zero. |
---|
1926 | 2720 | * |
---|
1927 | | - * For tight control over page level allocator and protection flags |
---|
1928 | | - * use __vmalloc_node() instead. |
---|
| 2721 | + * Return: pointer to the allocated memory or %NULL on error |
---|
1929 | 2722 | */ |
---|
1930 | 2723 | void *vzalloc_node(unsigned long size, int node) |
---|
1931 | 2724 | { |
---|
1932 | | - return __vmalloc_node_flags(size, node, |
---|
1933 | | - GFP_KERNEL | __GFP_ZERO); |
---|
| 2725 | + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, |
---|
| 2726 | + __builtin_return_address(0)); |
---|
1934 | 2727 | } |
---|
1935 | 2728 | EXPORT_SYMBOL(vzalloc_node); |
---|
1936 | | - |
---|
1937 | | -/** |
---|
1938 | | - * vmalloc_exec - allocate virtually contiguous, executable memory |
---|
1939 | | - * @size: allocation size |
---|
1940 | | - * |
---|
1941 | | - * Kernel-internal function to allocate enough pages to cover @size |
---|
1942 | | - * the page level allocator and map them into contiguous and |
---|
1943 | | - * executable kernel virtual space. |
---|
1944 | | - * |
---|
1945 | | - * For tight control over page level allocator and protection flags |
---|
1946 | | - * use __vmalloc() instead. |
---|
1947 | | - */ |
---|
1948 | | - |
---|
1949 | | -void *vmalloc_exec(unsigned long size) |
---|
1950 | | -{ |
---|
1951 | | - return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, |
---|
1952 | | - NUMA_NO_NODE, __builtin_return_address(0)); |
---|
1953 | | -} |
---|
1954 | 2729 | |
---|
1955 | 2730 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) |
---|
1956 | 2731 | #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) |
---|
.. | .. |
---|
1965 | 2740 | #endif |
---|
1966 | 2741 | |
---|
1967 | 2742 | /** |
---|
1968 | | - * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) |
---|
1969 | | - * @size: allocation size |
---|
| 2743 | + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) |
---|
| 2744 | + * @size: allocation size |
---|
1970 | 2745 | * |
---|
1971 | | - * Allocate enough 32bit PA addressable pages to cover @size from the |
---|
1972 | | - * page level allocator and map them into contiguous kernel virtual space. |
---|
| 2746 | + * Allocate enough 32bit PA addressable pages to cover @size from the |
---|
| 2747 | + * page level allocator and map them into contiguous kernel virtual space. |
---|
| 2748 | + * |
---|
| 2749 | + * Return: pointer to the allocated memory or %NULL on error |
---|
1973 | 2750 | */ |
---|
1974 | 2751 | void *vmalloc_32(unsigned long size) |
---|
1975 | 2752 | { |
---|
1976 | | - return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, |
---|
1977 | | - NUMA_NO_NODE, __builtin_return_address(0)); |
---|
| 2753 | + return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, |
---|
| 2754 | + __builtin_return_address(0)); |
---|
1978 | 2755 | } |
---|
1979 | 2756 | EXPORT_SYMBOL(vmalloc_32); |
---|
1980 | 2757 | |
---|
1981 | 2758 | /** |
---|
1982 | 2759 | * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory |
---|
1983 | | - * @size: allocation size |
---|
| 2760 | + * @size: allocation size |
---|
1984 | 2761 | * |
---|
1985 | 2762 | * The resulting memory area is 32bit addressable and zeroed so it can be |
---|
1986 | 2763 | * mapped to userspace without leaking data. |
---|
| 2764 | + * |
---|
| 2765 | + * Return: pointer to the allocated memory or %NULL on error |
---|
1987 | 2766 | */ |
---|
1988 | 2767 | void *vmalloc_32_user(unsigned long size) |
---|
1989 | 2768 | { |
---|
1990 | | - struct vm_struct *area; |
---|
1991 | | - void *ret; |
---|
1992 | | - |
---|
1993 | | - ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, |
---|
1994 | | - NUMA_NO_NODE, __builtin_return_address(0)); |
---|
1995 | | - if (ret) { |
---|
1996 | | - area = find_vm_area(ret); |
---|
1997 | | - area->flags |= VM_USERMAP; |
---|
1998 | | - } |
---|
1999 | | - return ret; |
---|
| 2769 | + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, |
---|
| 2770 | + GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, |
---|
| 2771 | + VM_USERMAP, NUMA_NO_NODE, |
---|
| 2772 | + __builtin_return_address(0)); |
---|
2000 | 2773 | } |
---|
2001 | 2774 | EXPORT_SYMBOL(vmalloc_32_user); |
---|
2002 | 2775 | |
---|
.. | .. |
---|
2082 | 2855 | } |
---|
2083 | 2856 | |
---|
2084 | 2857 | /** |
---|
2085 | | - * vread() - read vmalloc area in a safe way. |
---|
2086 | | - * @buf: buffer for reading data |
---|
2087 | | - * @addr: vm address. |
---|
2088 | | - * @count: number of bytes to be read. |
---|
| 2858 | + * vread() - read vmalloc area in a safe way. |
---|
| 2859 | + * @buf: buffer for reading data |
---|
| 2860 | + * @addr: vm address. |
---|
| 2861 | + * @count: number of bytes to be read. |
---|
2089 | 2862 | * |
---|
2090 | | - * Returns # of bytes which addr and buf should be increased. |
---|
2091 | | - * (same number to @count). Returns 0 if [addr...addr+count) doesn't |
---|
2092 | | - * includes any intersect with alive vmalloc area. |
---|
| 2863 | + * This function checks that addr is a valid vmalloc'ed area, and |
---|
| 2864 | + * copies data from that area to a given buffer. If the given memory range |
---|
| 2865 | + * of [addr...addr+count) includes some valid address, data is copied to |
---|
| 2866 | + * proper area of @buf. If there are memory holes, they'll be zero-filled. |
---|
| 2867 | + * IOREMAP area is treated as memory hole and no copy is done. |
---|
2093 | 2868 | * |
---|
2094 | | - * This function checks that addr is a valid vmalloc'ed area, and |
---|
2095 | | - * copy data from that area to a given buffer. If the given memory range |
---|
2096 | | - * of [addr...addr+count) includes some valid address, data is copied to |
---|
2097 | | - * proper area of @buf. If there are memory holes, they'll be zero-filled. |
---|
2098 | | - * IOREMAP area is treated as memory hole and no copy is done. |
---|
| 2869 | + * If [addr...addr+count) doesn't intersect with any live |
---|
| 2870 | + * vm_struct area, returns 0. @buf should be a kernel buffer. |
---|
2099 | 2871 | * |
---|
2100 | | - * If [addr...addr+count) doesn't includes any intersects with alive |
---|
2101 | | - * vm_struct area, returns 0. @buf should be kernel's buffer. |
---|
| 2872 | + * Note: In usual ops, vread() is never necessary because the caller |
---|
| 2873 | + * should know vmalloc() area is valid and can use memcpy(). |
---|
| 2874 | + * This is for routines which have to access vmalloc area without |
---|
| 2875 | + * any information, as /dev/kmem. |
---|
2102 | 2876 | * |
---|
2103 | | - * Note: In usual ops, vread() is never necessary because the caller |
---|
2104 | | - * should know vmalloc() area is valid and can use memcpy(). |
---|
2105 | | - * This is for routines which have to access vmalloc area without |
---|
2106 | | - * any informaion, as /dev/kmem. |
---|
2107 | | - * |
---|
| 2877 | + * Return: number of bytes for which addr and buf should be increased |
---|
| 2878 | + * (same number as @count) or %0 if [addr...addr+count) doesn't |
---|
| 2879 | + * include any intersection with valid vmalloc area |
---|
2108 | 2880 | */ |
---|
2109 | | - |
---|
2110 | 2881 | long vread(char *buf, char *addr, unsigned long count) |
---|
2111 | 2882 | { |
---|
2112 | 2883 | struct vmap_area *va; |
---|
.. | .. |
---|
2124 | 2895 | if (!count) |
---|
2125 | 2896 | break; |
---|
2126 | 2897 | |
---|
2127 | | - if (!(va->flags & VM_VM_AREA)) |
---|
| 2898 | + if (!va->vm) |
---|
2128 | 2899 | continue; |
---|
2129 | 2900 | |
---|
2130 | 2901 | vm = va->vm; |
---|
.. | .. |
---|
2163 | 2934 | } |
---|
2164 | 2935 | |
---|
2165 | 2936 | /** |
---|
2166 | | - * vwrite() - write vmalloc area in a safe way. |
---|
2167 | | - * @buf: buffer for source data |
---|
2168 | | - * @addr: vm address. |
---|
2169 | | - * @count: number of bytes to be read. |
---|
| 2937 | + * vwrite() - write vmalloc area in a safe way. |
---|
| 2938 | + * @buf: buffer for source data |
---|
| 2939 | + * @addr: vm address. |
---|
| 2940 | + * @count: number of bytes to be written. |
---|
2170 | 2941 | * |
---|
2171 | | - * Returns # of bytes which addr and buf should be incresed. |
---|
2172 | | - * (same number to @count). |
---|
2173 | | - * If [addr...addr+count) doesn't includes any intersect with valid |
---|
2174 | | - * vmalloc area, returns 0. |
---|
| 2942 | + * This function checks that addr is a valid vmalloc'ed area, and |
---|
| 2943 | + * copies data from a buffer to the given addr. If the specified range of |
---|
| 2944 | + * [addr...addr+count) includes some valid address, data is copied from |
---|
| 2945 | + * proper area of @buf. If there are memory holes, no copy to hole. |
---|
| 2946 | + * IOREMAP area is treated as memory hole and no copy is done. |
---|
2175 | 2947 | * |
---|
2176 | | - * This function checks that addr is a valid vmalloc'ed area, and |
---|
2177 | | - * copy data from a buffer to the given addr. If specified range of |
---|
2178 | | - * [addr...addr+count) includes some valid address, data is copied from |
---|
2179 | | - * proper area of @buf. If there are memory holes, no copy to hole. |
---|
2180 | | - * IOREMAP area is treated as memory hole and no copy is done. |
---|
| 2948 | + * If [addr...addr+count) doesn't intersect with any live |
---|
| 2949 | + * vm_struct area, returns 0. @buf should be a kernel buffer. |
---|
2181 | 2950 | * |
---|
2182 | | - * If [addr...addr+count) doesn't includes any intersects with alive |
---|
2183 | | - * vm_struct area, returns 0. @buf should be kernel's buffer. |
---|
| 2951 | + * Note: In usual ops, vwrite() is never necessary because the caller |
---|
| 2952 | + * should know vmalloc() area is valid and can use memcpy(). |
---|
| 2953 | + * This is for routines which have to access vmalloc area without |
---|
| 2954 | + * any information, as /dev/kmem. |
---|
2184 | 2955 | * |
---|
2185 | | - * Note: In usual ops, vwrite() is never necessary because the caller |
---|
2186 | | - * should know vmalloc() area is valid and can use memcpy(). |
---|
2187 | | - * This is for routines which have to access vmalloc area without |
---|
2188 | | - * any informaion, as /dev/kmem. |
---|
| 2956 | + * Return: number of bytes for which addr and buf should be |
---|
| 2957 | + * increased (same number as @count) or %0 if [addr...addr+count) |
---|
| 2958 | + * doesn't include any intersection with valid vmalloc area |
---|
2189 | 2959 | */ |
---|
2190 | | - |
---|
2191 | 2960 | long vwrite(char *buf, char *addr, unsigned long count) |
---|
2192 | 2961 | { |
---|
2193 | 2962 | struct vmap_area *va; |
---|
.. | .. |
---|
2206 | 2975 | if (!count) |
---|
2207 | 2976 | break; |
---|
2208 | 2977 | |
---|
2209 | | - if (!(va->flags & VM_VM_AREA)) |
---|
| 2978 | + if (!va->vm) |
---|
2210 | 2979 | continue; |
---|
2211 | 2980 | |
---|
2212 | 2981 | vm = va->vm; |
---|
.. | .. |
---|
2239 | 3008 | } |
---|
2240 | 3009 | |
---|
2241 | 3010 | /** |
---|
2242 | | - * remap_vmalloc_range_partial - map vmalloc pages to userspace |
---|
2243 | | - * @vma: vma to cover |
---|
2244 | | - * @uaddr: target user address to start at |
---|
2245 | | - * @kaddr: virtual address of vmalloc kernel memory |
---|
2246 | | - * @pgoff: offset from @kaddr to start at |
---|
2247 | | - * @size: size of map area |
---|
| 3011 | + * remap_vmalloc_range_partial - map vmalloc pages to userspace |
---|
| 3012 | + * @vma: vma to cover |
---|
| 3013 | + * @uaddr: target user address to start at |
---|
| 3014 | + * @kaddr: virtual address of vmalloc kernel memory |
---|
| 3015 | + * @pgoff: offset from @kaddr to start at |
---|
| 3016 | + * @size: size of map area |
---|
2248 | 3017 | * |
---|
2249 | | - * Returns: 0 for success, -Exxx on failure |
---|
| 3018 | + * Returns: 0 for success, -Exxx on failure |
---|
2250 | 3019 | * |
---|
2251 | | - * This function checks that @kaddr is a valid vmalloc'ed area, |
---|
2252 | | - * and that it is big enough to cover the range starting at |
---|
2253 | | - * @uaddr in @vma. Will return failure if that criteria isn't |
---|
2254 | | - * met. |
---|
| 3020 | + * This function checks that @kaddr is a valid vmalloc'ed area, |
---|
| 3021 | + * and that it is big enough to cover the range starting at |
---|
| 3022 | + * @uaddr in @vma. Will return failure if that criteria isn't |
---|
| 3023 | + * met. |
---|
2255 | 3024 | * |
---|
2256 | | - * Similar to remap_pfn_range() (see mm/memory.c) |
---|
| 3025 | + * Similar to remap_pfn_range() (see mm/memory.c) |
---|
2257 | 3026 | */ |
---|
2258 | 3027 | int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, |
---|
2259 | 3028 | void *kaddr, unsigned long pgoff, |
---|
.. | .. |
---|
2275 | 3044 | if (!area) |
---|
2276 | 3045 | return -EINVAL; |
---|
2277 | 3046 | |
---|
2278 | | - if (!(area->flags & VM_USERMAP)) |
---|
| 3047 | + if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) |
---|
2279 | 3048 | return -EINVAL; |
---|
2280 | 3049 | |
---|
2281 | 3050 | if (check_add_overflow(size, off, &end_index) || |
---|
.. | .. |
---|
2303 | 3072 | EXPORT_SYMBOL(remap_vmalloc_range_partial); |
---|
2304 | 3073 | |
---|
2305 | 3074 | /** |
---|
2306 | | - * remap_vmalloc_range - map vmalloc pages to userspace |
---|
2307 | | - * @vma: vma to cover (map full range of vma) |
---|
2308 | | - * @addr: vmalloc memory |
---|
2309 | | - * @pgoff: number of pages into addr before first page to map |
---|
| 3075 | + * remap_vmalloc_range - map vmalloc pages to userspace |
---|
| 3076 | + * @vma: vma to cover (map full range of vma) |
---|
| 3077 | + * @addr: vmalloc memory |
---|
| 3078 | + * @pgoff: number of pages into addr before first page to map |
---|
2310 | 3079 | * |
---|
2311 | | - * Returns: 0 for success, -Exxx on failure |
---|
| 3080 | + * Returns: 0 for success, -Exxx on failure |
---|
2312 | 3081 | * |
---|
2313 | | - * This function checks that addr is a valid vmalloc'ed area, and |
---|
2314 | | - * that it is big enough to cover the vma. Will return failure if |
---|
2315 | | - * that criteria isn't met. |
---|
| 3082 | + * This function checks that addr is a valid vmalloc'ed area, and |
---|
| 3083 | + * that it is big enough to cover the vma. Will return failure if |
---|
| 3084 | + * that criteria isn't met. |
---|
2316 | 3085 | * |
---|
2317 | | - * Similar to remap_pfn_range() (see mm/memory.c) |
---|
| 3086 | + * Similar to remap_pfn_range() (see mm/memory.c) |
---|
2318 | 3087 | */ |
---|
2319 | 3088 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, |
---|
2320 | 3089 | unsigned long pgoff) |
---|
.. | .. |
---|
2324 | 3093 | vma->vm_end - vma->vm_start); |
---|
2325 | 3094 | } |
---|
2326 | 3095 | EXPORT_SYMBOL(remap_vmalloc_range); |
---|
2327 | | - |
---|
2328 | | -/* |
---|
2329 | | - * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose |
---|
2330 | | - * not to have one. |
---|
2331 | | - * |
---|
2332 | | - * The purpose of this function is to make sure the vmalloc area |
---|
2333 | | - * mappings are identical in all page-tables in the system. |
---|
2334 | | - */ |
---|
2335 | | -void __weak vmalloc_sync_mappings(void) |
---|
2336 | | -{ |
---|
2337 | | -} |
---|
2338 | | - |
---|
2339 | | -void __weak vmalloc_sync_unmappings(void) |
---|
2340 | | -{ |
---|
2341 | | -} |
---|
2342 | | - |
---|
2343 | | -static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) |
---|
2344 | | -{ |
---|
2345 | | - pte_t ***p = data; |
---|
2346 | | - |
---|
2347 | | - if (p) { |
---|
2348 | | - *(*p) = pte; |
---|
2349 | | - (*p)++; |
---|
2350 | | - } |
---|
2351 | | - return 0; |
---|
2352 | | -} |
---|
2353 | | - |
---|
2354 | | -/** |
---|
2355 | | - * alloc_vm_area - allocate a range of kernel address space |
---|
2356 | | - * @size: size of the area |
---|
2357 | | - * @ptes: returns the PTEs for the address space |
---|
2358 | | - * |
---|
2359 | | - * Returns: NULL on failure, vm_struct on success |
---|
2360 | | - * |
---|
2361 | | - * This function reserves a range of kernel address space, and |
---|
2362 | | - * allocates pagetables to map that range. No actual mappings |
---|
2363 | | - * are created. |
---|
2364 | | - * |
---|
2365 | | - * If @ptes is non-NULL, pointers to the PTEs (in init_mm) |
---|
2366 | | - * allocated for the VM area are returned. |
---|
2367 | | - */ |
---|
2368 | | -struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) |
---|
2369 | | -{ |
---|
2370 | | - struct vm_struct *area; |
---|
2371 | | - |
---|
2372 | | - area = get_vm_area_caller(size, VM_IOREMAP, |
---|
2373 | | - __builtin_return_address(0)); |
---|
2374 | | - if (area == NULL) |
---|
2375 | | - return NULL; |
---|
2376 | | - |
---|
2377 | | - /* |
---|
2378 | | - * This ensures that page tables are constructed for this region |
---|
2379 | | - * of kernel virtual address space and mapped into init_mm. |
---|
2380 | | - */ |
---|
2381 | | - if (apply_to_page_range(&init_mm, (unsigned long)area->addr, |
---|
2382 | | - size, f, ptes ? &ptes : NULL)) { |
---|
2383 | | - free_vm_area(area); |
---|
2384 | | - return NULL; |
---|
2385 | | - } |
---|
2386 | | - |
---|
2387 | | - return area; |
---|
2388 | | -} |
---|
2389 | | -EXPORT_SYMBOL_GPL(alloc_vm_area); |
---|
2390 | 3096 | |
---|
2391 | 3097 | void free_vm_area(struct vm_struct *area) |
---|
2392 | 3098 | { |
---|
.. | .. |
---|
2404 | 3110 | } |
---|
2405 | 3111 | |
---|
2406 | 3112 | /** |
---|
2407 | | - * pvm_find_next_prev - find the next and prev vmap_area surrounding @end |
---|
2408 | | - * @end: target address |
---|
2409 | | - * @pnext: out arg for the next vmap_area |
---|
2410 | | - * @pprev: out arg for the previous vmap_area |
---|
| 3113 | + * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to |
---|
| 3114 | + * @addr: target address |
---|
2411 | 3115 | * |
---|
2412 | | - * Returns: %true if either or both of next and prev are found, |
---|
2413 | | - * %false if no vmap_area exists |
---|
2414 | | - * |
---|
2415 | | - * Find vmap_areas end addresses of which enclose @end. ie. if not |
---|
2416 | | - * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. |
---|
| 3116 | + * Returns: vmap_area if it is found. If there is no such area |
---|
| 3117 | + * the first highest(reverse order) vmap_area is returned |
---|
| 3118 | + * i.e. va->va_start < addr && va->va_end < addr or NULL |
---|
| 3119 | + * if there are no any areas before @addr. |
---|
2417 | 3120 | */ |
---|
2418 | | -static bool pvm_find_next_prev(unsigned long end, |
---|
2419 | | - struct vmap_area **pnext, |
---|
2420 | | - struct vmap_area **pprev) |
---|
| 3121 | +static struct vmap_area * |
---|
| 3122 | +pvm_find_va_enclose_addr(unsigned long addr) |
---|
2421 | 3123 | { |
---|
2422 | | - struct rb_node *n = vmap_area_root.rb_node; |
---|
2423 | | - struct vmap_area *va = NULL; |
---|
| 3124 | + struct vmap_area *va, *tmp; |
---|
| 3125 | + struct rb_node *n; |
---|
| 3126 | + |
---|
| 3127 | + n = free_vmap_area_root.rb_node; |
---|
| 3128 | + va = NULL; |
---|
2424 | 3129 | |
---|
2425 | 3130 | while (n) { |
---|
2426 | | - va = rb_entry(n, struct vmap_area, rb_node); |
---|
2427 | | - if (end < va->va_end) |
---|
2428 | | - n = n->rb_left; |
---|
2429 | | - else if (end > va->va_end) |
---|
| 3131 | + tmp = rb_entry(n, struct vmap_area, rb_node); |
---|
| 3132 | + if (tmp->va_start <= addr) { |
---|
| 3133 | + va = tmp; |
---|
| 3134 | + if (tmp->va_end >= addr) |
---|
| 3135 | + break; |
---|
| 3136 | + |
---|
2430 | 3137 | n = n->rb_right; |
---|
2431 | | - else |
---|
2432 | | - break; |
---|
| 3138 | + } else { |
---|
| 3139 | + n = n->rb_left; |
---|
| 3140 | + } |
---|
2433 | 3141 | } |
---|
2434 | 3142 | |
---|
2435 | | - if (!va) |
---|
2436 | | - return false; |
---|
2437 | | - |
---|
2438 | | - if (va->va_end > end) { |
---|
2439 | | - *pnext = va; |
---|
2440 | | - *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); |
---|
2441 | | - } else { |
---|
2442 | | - *pprev = va; |
---|
2443 | | - *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); |
---|
2444 | | - } |
---|
2445 | | - return true; |
---|
| 3143 | + return va; |
---|
2446 | 3144 | } |
---|
2447 | 3145 | |
---|
2448 | 3146 | /** |
---|
2449 | | - * pvm_determine_end - find the highest aligned address between two vmap_areas |
---|
2450 | | - * @pnext: in/out arg for the next vmap_area |
---|
2451 | | - * @pprev: in/out arg for the previous vmap_area |
---|
2452 | | - * @align: alignment |
---|
| 3147 | + * pvm_determine_end_from_reverse - find the highest aligned address |
---|
| 3148 | + * of free block below VMALLOC_END |
---|
| 3149 | + * @va: |
---|
| 3150 | + * in - the VA we start the search(reverse order); |
---|
| 3151 | + * out - the VA with the highest aligned end address. |
---|
2453 | 3152 | * |
---|
2454 | | - * Returns: determined end address |
---|
2455 | | - * |
---|
2456 | | - * Find the highest aligned address between *@pnext and *@pprev below |
---|
2457 | | - * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned |
---|
2458 | | - * down address is between the end addresses of the two vmap_areas. |
---|
2459 | | - * |
---|
2460 | | - * Please note that the address returned by this function may fall |
---|
2461 | | - * inside *@pnext vmap_area. The caller is responsible for checking |
---|
2462 | | - * that. |
---|
| 3153 | + * Returns: determined end address within vmap_area |
---|
2463 | 3154 | */ |
---|
2464 | | -static unsigned long pvm_determine_end(struct vmap_area **pnext, |
---|
2465 | | - struct vmap_area **pprev, |
---|
2466 | | - unsigned long align) |
---|
| 3155 | +static unsigned long |
---|
| 3156 | +pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) |
---|
2467 | 3157 | { |
---|
2468 | | - const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); |
---|
| 3158 | + unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); |
---|
2469 | 3159 | unsigned long addr; |
---|
2470 | 3160 | |
---|
2471 | | - if (*pnext) |
---|
2472 | | - addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); |
---|
2473 | | - else |
---|
2474 | | - addr = vmalloc_end; |
---|
2475 | | - |
---|
2476 | | - while (*pprev && (*pprev)->va_end > addr) { |
---|
2477 | | - *pnext = *pprev; |
---|
2478 | | - *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); |
---|
| 3161 | + if (likely(*va)) { |
---|
| 3162 | + list_for_each_entry_from_reverse((*va), |
---|
| 3163 | + &free_vmap_area_list, list) { |
---|
| 3164 | + addr = min((*va)->va_end & ~(align - 1), vmalloc_end); |
---|
| 3165 | + if ((*va)->va_start < addr) |
---|
| 3166 | + return addr; |
---|
| 3167 | + } |
---|
2479 | 3168 | } |
---|
2480 | 3169 | |
---|
2481 | | - return addr; |
---|
| 3170 | + return 0; |
---|
2482 | 3171 | } |
---|
2483 | 3172 | |
---|
2484 | 3173 | /** |
---|
.. | .. |
---|
2498 | 3187 | * to gigabytes. To avoid interacting with regular vmallocs, these |
---|
2499 | 3188 | * areas are allocated from top. |
---|
2500 | 3189 | * |
---|
2501 | | - * Despite its complicated look, this allocator is rather simple. It |
---|
2502 | | - * does everything top-down and scans areas from the end looking for |
---|
2503 | | - * matching slot. While scanning, if any of the areas overlaps with |
---|
2504 | | - * existing vmap_area, the base address is pulled down to fit the |
---|
2505 | | - * area. Scanning is repeated till all the areas fit and then all |
---|
2506 | | - * necessary data structures are inserted and the result is returned. |
---|
| 3190 | + * Despite its complicated look, this allocator is rather simple. It |
---|
| 3191 | + * does everything top-down and scans free blocks from the end looking |
---|
| 3192 | + * for matching base. While scanning, if any of the areas do not fit the |
---|
| 3193 | + * base address is pulled down to fit the area. Scanning is repeated till |
---|
| 3194 | + * all the areas fit and then all necessary data structures are inserted |
---|
| 3195 | + * and the result is returned. |
---|
2507 | 3196 | */ |
---|
2508 | 3197 | struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, |
---|
2509 | 3198 | const size_t *sizes, int nr_vms, |
---|
.. | .. |
---|
2511 | 3200 | { |
---|
2512 | 3201 | const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); |
---|
2513 | 3202 | const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); |
---|
2514 | | - struct vmap_area **vas, *prev, *next; |
---|
| 3203 | + struct vmap_area **vas, *va; |
---|
2515 | 3204 | struct vm_struct **vms; |
---|
2516 | 3205 | int area, area2, last_area, term_area; |
---|
2517 | | - unsigned long base, start, end, last_end; |
---|
| 3206 | + unsigned long base, start, size, end, last_end, orig_start, orig_end; |
---|
2518 | 3207 | bool purged = false; |
---|
| 3208 | + enum fit_type type; |
---|
2519 | 3209 | |
---|
2520 | 3210 | /* verify parameters and allocate data structures */ |
---|
2521 | 3211 | BUG_ON(offset_in_page(align) || !is_power_of_2(align)); |
---|
.. | .. |
---|
2551 | 3241 | goto err_free2; |
---|
2552 | 3242 | |
---|
2553 | 3243 | for (area = 0; area < nr_vms; area++) { |
---|
2554 | | - vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); |
---|
| 3244 | + vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); |
---|
2555 | 3245 | vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); |
---|
2556 | 3246 | if (!vas[area] || !vms[area]) |
---|
2557 | 3247 | goto err_free; |
---|
2558 | 3248 | } |
---|
2559 | 3249 | retry: |
---|
2560 | | - spin_lock(&vmap_area_lock); |
---|
| 3250 | + spin_lock(&free_vmap_area_lock); |
---|
2561 | 3251 | |
---|
2562 | 3252 | /* start scanning - we scan from the top, begin with the last area */ |
---|
2563 | 3253 | area = term_area = last_area; |
---|
2564 | 3254 | start = offsets[area]; |
---|
2565 | 3255 | end = start + sizes[area]; |
---|
2566 | 3256 | |
---|
2567 | | - if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { |
---|
2568 | | - base = vmalloc_end - last_end; |
---|
2569 | | - goto found; |
---|
2570 | | - } |
---|
2571 | | - base = pvm_determine_end(&next, &prev, align) - end; |
---|
| 3257 | + va = pvm_find_va_enclose_addr(vmalloc_end); |
---|
| 3258 | + base = pvm_determine_end_from_reverse(&va, align) - end; |
---|
2572 | 3259 | |
---|
2573 | 3260 | while (true) { |
---|
2574 | | - BUG_ON(next && next->va_end <= base + end); |
---|
2575 | | - BUG_ON(prev && prev->va_end > base + end); |
---|
2576 | | - |
---|
2577 | 3261 | /* |
---|
2578 | 3262 | * base might have underflowed, add last_end before |
---|
2579 | 3263 | * comparing. |
---|
2580 | 3264 | */ |
---|
2581 | | - if (base + last_end < vmalloc_start + last_end) { |
---|
2582 | | - spin_unlock(&vmap_area_lock); |
---|
2583 | | - if (!purged) { |
---|
2584 | | - purge_vmap_area_lazy(); |
---|
2585 | | - purged = true; |
---|
2586 | | - goto retry; |
---|
2587 | | - } |
---|
2588 | | - goto err_free; |
---|
2589 | | - } |
---|
| 3265 | + if (base + last_end < vmalloc_start + last_end) |
---|
| 3266 | + goto overflow; |
---|
2590 | 3267 | |
---|
2591 | 3268 | /* |
---|
2592 | | - * If next overlaps, move base downwards so that it's |
---|
2593 | | - * right below next and then recheck. |
---|
| 3269 | + * Fitting base has not been found. |
---|
2594 | 3270 | */ |
---|
2595 | | - if (next && next->va_start < base + end) { |
---|
2596 | | - base = pvm_determine_end(&next, &prev, align) - end; |
---|
| 3271 | + if (va == NULL) |
---|
| 3272 | + goto overflow; |
---|
| 3273 | + |
---|
| 3274 | + /* |
---|
| 3275 | + * If required width exceeds current VA block, move |
---|
| 3276 | + * base downwards and then recheck. |
---|
| 3277 | + */ |
---|
| 3278 | + if (base + end > va->va_end) { |
---|
| 3279 | + base = pvm_determine_end_from_reverse(&va, align) - end; |
---|
2597 | 3280 | term_area = area; |
---|
2598 | 3281 | continue; |
---|
2599 | 3282 | } |
---|
2600 | 3283 | |
---|
2601 | 3284 | /* |
---|
2602 | | - * If prev overlaps, shift down next and prev and move |
---|
2603 | | - * base so that it's right below new next and then |
---|
2604 | | - * recheck. |
---|
| 3285 | + * If this VA does not fit, move base downwards and recheck. |
---|
2605 | 3286 | */ |
---|
2606 | | - if (prev && prev->va_end > base + start) { |
---|
2607 | | - next = prev; |
---|
2608 | | - prev = node_to_va(rb_prev(&next->rb_node)); |
---|
2609 | | - base = pvm_determine_end(&next, &prev, align) - end; |
---|
| 3287 | + if (base + start < va->va_start) { |
---|
| 3288 | + va = node_to_va(rb_prev(&va->rb_node)); |
---|
| 3289 | + base = pvm_determine_end_from_reverse(&va, align) - end; |
---|
2610 | 3290 | term_area = area; |
---|
2611 | 3291 | continue; |
---|
2612 | 3292 | } |
---|
.. | .. |
---|
2618 | 3298 | area = (area + nr_vms - 1) % nr_vms; |
---|
2619 | 3299 | if (area == term_area) |
---|
2620 | 3300 | break; |
---|
| 3301 | + |
---|
2621 | 3302 | start = offsets[area]; |
---|
2622 | 3303 | end = start + sizes[area]; |
---|
2623 | | - pvm_find_next_prev(base + end, &next, &prev); |
---|
| 3304 | + va = pvm_find_va_enclose_addr(base + end); |
---|
2624 | 3305 | } |
---|
2625 | | -found: |
---|
| 3306 | + |
---|
2626 | 3307 | /* we've found a fitting base, insert all va's */ |
---|
2627 | 3308 | for (area = 0; area < nr_vms; area++) { |
---|
2628 | | - struct vmap_area *va = vas[area]; |
---|
| 3309 | + int ret; |
---|
2629 | 3310 | |
---|
2630 | | - va->va_start = base + offsets[area]; |
---|
2631 | | - va->va_end = va->va_start + sizes[area]; |
---|
2632 | | - __insert_vmap_area(va); |
---|
| 3311 | + start = base + offsets[area]; |
---|
| 3312 | + size = sizes[area]; |
---|
| 3313 | + |
---|
| 3314 | + va = pvm_find_va_enclose_addr(start); |
---|
| 3315 | + if (WARN_ON_ONCE(va == NULL)) |
---|
| 3316 | + /* It is a BUG(), but trigger recovery instead. */ |
---|
| 3317 | + goto recovery; |
---|
| 3318 | + |
---|
| 3319 | + type = classify_va_fit_type(va, start, size); |
---|
| 3320 | + if (WARN_ON_ONCE(type == NOTHING_FIT)) |
---|
| 3321 | + /* It is a BUG(), but trigger recovery instead. */ |
---|
| 3322 | + goto recovery; |
---|
| 3323 | + |
---|
| 3324 | + ret = adjust_va_to_fit_type(va, start, size, type); |
---|
| 3325 | + if (unlikely(ret)) |
---|
| 3326 | + goto recovery; |
---|
| 3327 | + |
---|
| 3328 | + /* Allocated area. */ |
---|
| 3329 | + va = vas[area]; |
---|
| 3330 | + va->va_start = start; |
---|
| 3331 | + va->va_end = start + size; |
---|
2633 | 3332 | } |
---|
2634 | 3333 | |
---|
2635 | | - vmap_area_pcpu_hole = base + offsets[last_area]; |
---|
| 3334 | + spin_unlock(&free_vmap_area_lock); |
---|
2636 | 3335 | |
---|
2637 | | - spin_unlock(&vmap_area_lock); |
---|
| 3336 | + /* populate the kasan shadow space */ |
---|
| 3337 | + for (area = 0; area < nr_vms; area++) { |
---|
| 3338 | + if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) |
---|
| 3339 | + goto err_free_shadow; |
---|
| 3340 | + |
---|
| 3341 | + kasan_unpoison_vmalloc((void *)vas[area]->va_start, |
---|
| 3342 | + sizes[area]); |
---|
| 3343 | + } |
---|
2638 | 3344 | |
---|
2639 | 3345 | /* insert all vm's */ |
---|
2640 | | - for (area = 0; area < nr_vms; area++) |
---|
2641 | | - setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, |
---|
| 3346 | + spin_lock(&vmap_area_lock); |
---|
| 3347 | + for (area = 0; area < nr_vms; area++) { |
---|
| 3348 | + insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); |
---|
| 3349 | + |
---|
| 3350 | + setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, |
---|
2642 | 3351 | pcpu_get_vm_areas); |
---|
| 3352 | + } |
---|
| 3353 | + spin_unlock(&vmap_area_lock); |
---|
2643 | 3354 | |
---|
2644 | 3355 | kfree(vas); |
---|
2645 | 3356 | return vms; |
---|
2646 | 3357 | |
---|
| 3358 | +recovery: |
---|
| 3359 | + /* |
---|
| 3360 | + * Remove previously allocated areas. There is no |
---|
| 3361 | + * need in removing these areas from the busy tree, |
---|
| 3362 | + * because they are inserted only on the final step |
---|
| 3363 | + * and when pcpu_get_vm_areas() is success. |
---|
| 3364 | + */ |
---|
| 3365 | + while (area--) { |
---|
| 3366 | + orig_start = vas[area]->va_start; |
---|
| 3367 | + orig_end = vas[area]->va_end; |
---|
| 3368 | + va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, |
---|
| 3369 | + &free_vmap_area_list); |
---|
| 3370 | + if (va) |
---|
| 3371 | + kasan_release_vmalloc(orig_start, orig_end, |
---|
| 3372 | + va->va_start, va->va_end); |
---|
| 3373 | + vas[area] = NULL; |
---|
| 3374 | + } |
---|
| 3375 | + |
---|
| 3376 | +overflow: |
---|
| 3377 | + spin_unlock(&free_vmap_area_lock); |
---|
| 3378 | + if (!purged) { |
---|
| 3379 | + purge_vmap_area_lazy(); |
---|
| 3380 | + purged = true; |
---|
| 3381 | + |
---|
| 3382 | + /* Before "retry", check if we recover. */ |
---|
| 3383 | + for (area = 0; area < nr_vms; area++) { |
---|
| 3384 | + if (vas[area]) |
---|
| 3385 | + continue; |
---|
| 3386 | + |
---|
| 3387 | + vas[area] = kmem_cache_zalloc( |
---|
| 3388 | + vmap_area_cachep, GFP_KERNEL); |
---|
| 3389 | + if (!vas[area]) |
---|
| 3390 | + goto err_free; |
---|
| 3391 | + } |
---|
| 3392 | + |
---|
| 3393 | + goto retry; |
---|
| 3394 | + } |
---|
| 3395 | + |
---|
2647 | 3396 | err_free: |
---|
2648 | 3397 | for (area = 0; area < nr_vms; area++) { |
---|
2649 | | - kfree(vas[area]); |
---|
| 3398 | + if (vas[area]) |
---|
| 3399 | + kmem_cache_free(vmap_area_cachep, vas[area]); |
---|
| 3400 | + |
---|
2650 | 3401 | kfree(vms[area]); |
---|
2651 | 3402 | } |
---|
2652 | 3403 | err_free2: |
---|
| 3404 | + kfree(vas); |
---|
| 3405 | + kfree(vms); |
---|
| 3406 | + return NULL; |
---|
| 3407 | + |
---|
| 3408 | +err_free_shadow: |
---|
| 3409 | + spin_lock(&free_vmap_area_lock); |
---|
| 3410 | + /* |
---|
| 3411 | + * We release all the vmalloc shadows, even the ones for regions that |
---|
| 3412 | + * hadn't been successfully added. This relies on kasan_release_vmalloc |
---|
| 3413 | + * being able to tolerate this case. |
---|
| 3414 | + */ |
---|
| 3415 | + for (area = 0; area < nr_vms; area++) { |
---|
| 3416 | + orig_start = vas[area]->va_start; |
---|
| 3417 | + orig_end = vas[area]->va_end; |
---|
| 3418 | + va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, |
---|
| 3419 | + &free_vmap_area_list); |
---|
| 3420 | + if (va) |
---|
| 3421 | + kasan_release_vmalloc(orig_start, orig_end, |
---|
| 3422 | + va->va_start, va->va_end); |
---|
| 3423 | + vas[area] = NULL; |
---|
| 3424 | + kfree(vms[area]); |
---|
| 3425 | + } |
---|
| 3426 | + spin_unlock(&free_vmap_area_lock); |
---|
2653 | 3427 | kfree(vas); |
---|
2654 | 3428 | kfree(vms); |
---|
2655 | 3429 | return NULL; |
---|
.. | .. |
---|
2674 | 3448 | |
---|
2675 | 3449 | #ifdef CONFIG_PROC_FS |
---|
2676 | 3450 | static void *s_start(struct seq_file *m, loff_t *pos) |
---|
| 3451 | + __acquires(&vmap_purge_lock) |
---|
2677 | 3452 | __acquires(&vmap_area_lock) |
---|
2678 | 3453 | { |
---|
| 3454 | + mutex_lock(&vmap_purge_lock); |
---|
2679 | 3455 | spin_lock(&vmap_area_lock); |
---|
| 3456 | + |
---|
2680 | 3457 | return seq_list_start(&vmap_area_list, *pos); |
---|
2681 | 3458 | } |
---|
2682 | 3459 | |
---|
.. | .. |
---|
2687 | 3464 | |
---|
2688 | 3465 | static void s_stop(struct seq_file *m, void *p) |
---|
2689 | 3466 | __releases(&vmap_area_lock) |
---|
| 3467 | + __releases(&vmap_purge_lock) |
---|
2690 | 3468 | { |
---|
2691 | 3469 | spin_unlock(&vmap_area_lock); |
---|
| 3470 | + mutex_unlock(&vmap_purge_lock); |
---|
2692 | 3471 | } |
---|
2693 | 3472 | |
---|
2694 | 3473 | static void show_numa_info(struct seq_file *m, struct vm_struct *v) |
---|
.. | .. |
---|
2715 | 3494 | } |
---|
2716 | 3495 | } |
---|
2717 | 3496 | |
---|
| 3497 | +static void show_purge_info(struct seq_file *m) |
---|
| 3498 | +{ |
---|
| 3499 | + struct llist_node *head; |
---|
| 3500 | + struct vmap_area *va; |
---|
| 3501 | + |
---|
| 3502 | + head = READ_ONCE(vmap_purge_list.first); |
---|
| 3503 | + if (head == NULL) |
---|
| 3504 | + return; |
---|
| 3505 | + |
---|
| 3506 | + llist_for_each_entry(va, head, purge_list) { |
---|
| 3507 | + seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", |
---|
| 3508 | + (void *)va->va_start, (void *)va->va_end, |
---|
| 3509 | + va->va_end - va->va_start); |
---|
| 3510 | + } |
---|
| 3511 | +} |
---|
| 3512 | + |
---|
2718 | 3513 | static int s_show(struct seq_file *m, void *p) |
---|
2719 | 3514 | { |
---|
2720 | 3515 | struct vmap_area *va; |
---|
.. | .. |
---|
2723 | 3518 | va = list_entry(p, struct vmap_area, list); |
---|
2724 | 3519 | |
---|
2725 | 3520 | /* |
---|
2726 | | - * s_show can encounter race with remove_vm_area, !VM_VM_AREA on |
---|
2727 | | - * behalf of vmap area is being tear down or vm_map_ram allocation. |
---|
| 3521 | + * s_show can encounter race with remove_vm_area, !vm on behalf |
---|
| 3522 | + * of vmap area is being tear down or vm_map_ram allocation. |
---|
2728 | 3523 | */ |
---|
2729 | | - if (!(va->flags & VM_VM_AREA)) { |
---|
2730 | | - seq_printf(m, "0x%pK-0x%pK %7ld %s\n", |
---|
| 3524 | + if (!va->vm) { |
---|
| 3525 | + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", |
---|
2731 | 3526 | (void *)va->va_start, (void *)va->va_end, |
---|
2732 | | - va->va_end - va->va_start, |
---|
2733 | | - va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram"); |
---|
| 3527 | + va->va_end - va->va_start); |
---|
2734 | 3528 | |
---|
2735 | 3529 | return 0; |
---|
2736 | 3530 | } |
---|
.. | .. |
---|
2761 | 3555 | if (v->flags & VM_USERMAP) |
---|
2762 | 3556 | seq_puts(m, " user"); |
---|
2763 | 3557 | |
---|
| 3558 | + if (v->flags & VM_DMA_COHERENT) |
---|
| 3559 | + seq_puts(m, " dma-coherent"); |
---|
| 3560 | + |
---|
2764 | 3561 | if (is_vmalloc_addr(v->pages)) |
---|
2765 | 3562 | seq_puts(m, " vpages"); |
---|
2766 | 3563 | |
---|
2767 | 3564 | show_numa_info(m, v); |
---|
| 3565 | + trace_android_vh_show_stack_hash(m, v); |
---|
2768 | 3566 | seq_putc(m, '\n'); |
---|
| 3567 | + |
---|
| 3568 | + /* |
---|
| 3569 | + * As a final step, dump "unpurged" areas. Note, |
---|
| 3570 | + * that entire "/proc/vmallocinfo" output will not |
---|
| 3571 | + * be address sorted, because the purge list is not |
---|
| 3572 | + * sorted. |
---|
| 3573 | + */ |
---|
| 3574 | + if (list_is_last(&va->list, &vmap_area_list)) |
---|
| 3575 | + show_purge_info(m); |
---|
| 3576 | + |
---|
2769 | 3577 | return 0; |
---|
2770 | 3578 | } |
---|
2771 | 3579 | |
---|
.. | .. |
---|
2789 | 3597 | module_init(proc_vmalloc_init); |
---|
2790 | 3598 | |
---|
2791 | 3599 | #endif |
---|
2792 | | - |
---|