2024-10-22 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5
kernel/mm/vmalloc.c
@@ -1,11 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
- * linux/mm/vmalloc.c
- *
  * Copyright (C) 1993 Linus Torvalds
  * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
  * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
  * Numa awareness, Christoph Lameter, SGI, June 2005
+ * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
  */
 
 #include <linux/vmalloc.h>
@@ -18,12 +18,13 @@
 #include <linux/interrupt.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/set_memory.h>
 #include <linux/debugobjects.h>
 #include <linux/kallsyms.h>
 #include <linux/list.h>
 #include <linux/notifier.h>
 #include <linux/rbtree.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
 #include <linux/rcupdate.h>
 #include <linux/pfn.h>
 #include <linux/kmemleak.h>
@@ -31,13 +32,24 @@
 #include <linux/compiler.h>
 #include <linux/llist.h>
 #include <linux/bitops.h>
+#include <linux/rbtree_augmented.h>
 #include <linux/overflow.h>
+#include <trace/hooks/mm.h>
 
 #include <linux/uaccess.h>
 #include <asm/tlbflush.h>
 #include <asm/shmparam.h>
 
 #include "internal.h"
+#include "pgalloc-track.h"
+
+bool is_vmalloc_addr(const void *x)
+{
+        unsigned long addr = (unsigned long)x;
+
+        return addr >= VMALLOC_START && addr < VMALLOC_END;
+}
+EXPORT_SYMBOL(is_vmalloc_addr);
 
 struct vfree_deferred {
         struct llist_head list;
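A minimal sketch of how the newly exported is_vmalloc_addr() is typically used
on the caller side (hypothetical helper, not part of this patch; it is the
pattern kvfree() follows):

/* Free a buffer that may have come from either kmalloc() or vmalloc(). */
static void example_release_buf(void *buf)
{
        if (is_vmalloc_addr(buf))
                vfree(buf);
        else
                kfree(buf);
}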
@@ -58,7 +70,8 @@
 
 /*** Page table manipulation functions ***/
 
-static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
+static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+                             pgtbl_mod_mask *mask)
 {
         pte_t *pte;
 
@@ -67,73 +80,119 @@
6780 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
6881 WARN_ON(!pte_none(ptent) && !pte_present(ptent));
6982 } while (pte++, addr += PAGE_SIZE, addr != end);
83
+ *mask |= PGTBL_PTE_MODIFIED;
7084 }
7185
72
-static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
86
+static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
87
+ pgtbl_mod_mask *mask)
7388 {
7489 pmd_t *pmd;
7590 unsigned long next;
91
+ int cleared;
7692
7793 pmd = pmd_offset(pud, addr);
7894 do {
7995 next = pmd_addr_end(addr, end);
80
- if (pmd_clear_huge(pmd))
96
+
97
+ cleared = pmd_clear_huge(pmd);
98
+ if (cleared || pmd_bad(*pmd))
99
+ *mask |= PGTBL_PMD_MODIFIED;
100
+
101
+ if (cleared)
81102 continue;
82103 if (pmd_none_or_clear_bad(pmd))
83104 continue;
84
- vunmap_pte_range(pmd, addr, next);
105
+ vunmap_pte_range(pmd, addr, next, mask);
106
+
107
+ cond_resched();
85108 } while (pmd++, addr = next, addr != end);
86109 }
87110
88
-static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
111
+static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
112
+ pgtbl_mod_mask *mask)
89113 {
90114 pud_t *pud;
91115 unsigned long next;
116
+ int cleared;
92117
93118 pud = pud_offset(p4d, addr);
94119 do {
95120 next = pud_addr_end(addr, end);
96
- if (pud_clear_huge(pud))
121
+
122
+ cleared = pud_clear_huge(pud);
123
+ if (cleared || pud_bad(*pud))
124
+ *mask |= PGTBL_PUD_MODIFIED;
125
+
126
+ if (cleared)
97127 continue;
98128 if (pud_none_or_clear_bad(pud))
99129 continue;
100
- vunmap_pmd_range(pud, addr, next);
130
+ vunmap_pmd_range(pud, addr, next, mask);
101131 } while (pud++, addr = next, addr != end);
102132 }
103133
104
-static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
134
+static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
135
+ pgtbl_mod_mask *mask)
105136 {
106137 p4d_t *p4d;
107138 unsigned long next;
139
+ int cleared;
108140
109141 p4d = p4d_offset(pgd, addr);
110142 do {
111143 next = p4d_addr_end(addr, end);
112
- if (p4d_clear_huge(p4d))
144
+
145
+ cleared = p4d_clear_huge(p4d);
146
+ if (cleared || p4d_bad(*p4d))
147
+ *mask |= PGTBL_P4D_MODIFIED;
148
+
149
+ if (cleared)
113150 continue;
114151 if (p4d_none_or_clear_bad(p4d))
115152 continue;
116
- vunmap_pud_range(p4d, addr, next);
153
+ vunmap_pud_range(p4d, addr, next, mask);
117154 } while (p4d++, addr = next, addr != end);
118155 }
119156
120
-static void vunmap_page_range(unsigned long addr, unsigned long end)
157
+/**
158
+ * unmap_kernel_range_noflush - unmap kernel VM area
159
+ * @start: start of the VM area to unmap
160
+ * @size: size of the VM area to unmap
161
+ *
162
+ * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify
163
+ * should have been allocated using get_vm_area() and its friends.
164
+ *
165
+ * NOTE:
166
+ * This function does NOT do any cache flushing. The caller is responsible
167
+ * for calling flush_cache_vunmap() on to-be-mapped areas before calling this
168
+ * function and flush_tlb_kernel_range() after.
169
+ */
170
+void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
121171 {
122
- pgd_t *pgd;
172
+ unsigned long end = start + size;
123173 unsigned long next;
174
+ pgd_t *pgd;
175
+ unsigned long addr = start;
176
+ pgtbl_mod_mask mask = 0;
124177
125178 BUG_ON(addr >= end);
126179 pgd = pgd_offset_k(addr);
127180 do {
128181 next = pgd_addr_end(addr, end);
182
+ if (pgd_bad(*pgd))
183
+ mask |= PGTBL_PGD_MODIFIED;
129184 if (pgd_none_or_clear_bad(pgd))
130185 continue;
131
- vunmap_p4d_range(pgd, addr, next);
186
+ vunmap_p4d_range(pgd, addr, next, &mask);
132187 } while (pgd++, addr = next, addr != end);
188
+
189
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
190
+ arch_sync_kernel_mappings(start, end);
133191 }
134192
135193 static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
136
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
194
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
195
+ pgtbl_mod_mask *mask)
137196 {
138197 pte_t *pte;
139198
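A minimal sketch of the caller-side flush discipline the NOTE above describes
(illustrative only; unmap_kernel_range() later in this patch wraps exactly this
sequence):

static void example_unmap(unsigned long addr, unsigned long size)
{
        unsigned long end = addr + size;

        flush_cache_vunmap(addr, end);          /* before clearing PTEs */
        unmap_kernel_range_noflush(addr, size); /* clear the page tables */
        flush_tlb_kernel_range(addr, end);      /* after clearing PTEs */
}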
@@ -142,7 +201,7 @@
          * callers keep track of where we're up to.
          */
 
-        pte = pte_alloc_kernel(pmd, addr);
+        pte = pte_alloc_kernel_track(pmd, addr, mask);
         if (!pte)
                 return -ENOMEM;
         do {
@@ -155,96 +214,120 @@
155214 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
156215 (*nr)++;
157216 } while (pte++, addr += PAGE_SIZE, addr != end);
217
+ *mask |= PGTBL_PTE_MODIFIED;
158218 return 0;
159219 }
160220
161221 static int vmap_pmd_range(pud_t *pud, unsigned long addr,
162
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
222
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
223
+ pgtbl_mod_mask *mask)
163224 {
164225 pmd_t *pmd;
165226 unsigned long next;
166227
167
- pmd = pmd_alloc(&init_mm, pud, addr);
228
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
168229 if (!pmd)
169230 return -ENOMEM;
170231 do {
171232 next = pmd_addr_end(addr, end);
172
- if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
233
+ if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
173234 return -ENOMEM;
174235 } while (pmd++, addr = next, addr != end);
175236 return 0;
176237 }
177238
178239 static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
179
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
240
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
241
+ pgtbl_mod_mask *mask)
180242 {
181243 pud_t *pud;
182244 unsigned long next;
183245
184
- pud = pud_alloc(&init_mm, p4d, addr);
246
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
185247 if (!pud)
186248 return -ENOMEM;
187249 do {
188250 next = pud_addr_end(addr, end);
189
- if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
251
+ if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
190252 return -ENOMEM;
191253 } while (pud++, addr = next, addr != end);
192254 return 0;
193255 }
194256
195257 static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
196
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
258
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
259
+ pgtbl_mod_mask *mask)
197260 {
198261 p4d_t *p4d;
199262 unsigned long next;
200263
201
- p4d = p4d_alloc(&init_mm, pgd, addr);
264
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
202265 if (!p4d)
203266 return -ENOMEM;
204267 do {
205268 next = p4d_addr_end(addr, end);
206
- if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
269
+ if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
207270 return -ENOMEM;
208271 } while (p4d++, addr = next, addr != end);
209272 return 0;
210273 }
211274
-/*
- * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
- * will have pfns corresponding to the "pages" array.
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
  *
- * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
+ * Map PFN_UP(@size) pages at @addr. The VM area specified by @addr and @size
+ * should have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is responsible for
+ * calling flush_cache_vmap() on to-be-mapped areas before calling this
+ * function.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
  */
218
-static int vmap_page_range_noflush(unsigned long start, unsigned long end,
219
- pgprot_t prot, struct page **pages)
293
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
294
+ pgprot_t prot, struct page **pages)
220295 {
221
- pgd_t *pgd;
296
+ unsigned long start = addr;
297
+ unsigned long end = addr + size;
222298 unsigned long next;
223
- unsigned long addr = start;
299
+ pgd_t *pgd;
224300 int err = 0;
225301 int nr = 0;
302
+ pgtbl_mod_mask mask = 0;
226303
227304 BUG_ON(addr >= end);
228305 pgd = pgd_offset_k(addr);
229306 do {
230307 next = pgd_addr_end(addr, end);
231
- err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
308
+ if (pgd_bad(*pgd))
309
+ mask |= PGTBL_PGD_MODIFIED;
310
+ err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
232311 if (err)
233312 return err;
234313 } while (pgd++, addr = next, addr != end);
235314
236
- return nr;
315
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
316
+ arch_sync_kernel_mappings(start, end);
317
+
318
+ return 0;
237319 }
238320
239
-static int vmap_page_range(unsigned long start, unsigned long end,
240
- pgprot_t prot, struct page **pages)
321
+int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
322
+ struct page **pages)
241323 {
242324 int ret;
243325
244
- ret = vmap_page_range_noflush(start, end, prot, pages);
245
- flush_cache_vmap(start, end);
326
+ ret = map_kernel_range_noflush(start, size, prot, pages);
327
+ flush_cache_vmap(start, start + size);
246328 return ret;
247329 }
330
+EXPORT_SYMBOL_GPL(map_kernel_range);
248331
249332 int is_vmalloc_or_module_addr(const void *x)
250333 {
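A hedged usage sketch of the reworked map_kernel_range(), which now returns
0 or -errno instead of a page count (hypothetical helper, error handling kept
minimal; this mirrors what vmap() does internally):

static void *example_map_pages(struct page **pages, unsigned long count)
{
        struct vm_struct *area;

        area = get_vm_area(count << PAGE_SHIFT, VM_MAP);
        if (!area)
                return NULL;

        if (map_kernel_range((unsigned long)area->addr, count << PAGE_SHIFT,
                             PAGE_KERNEL, pages) < 0) {
                free_vm_area(area);
                return NULL;
        }

        return area->addr;
}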
@@ -324,22 +407,83 @@
324407
325408 /*** Global kva allocator ***/
326409
327
-#define VM_LAZY_FREE 0x02
328
-#define VM_VM_AREA 0x04
410
+#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
411
+#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
412
+
329413
330414 static DEFINE_SPINLOCK(vmap_area_lock);
415
+static DEFINE_SPINLOCK(free_vmap_area_lock);
331416 /* Export for kexec only */
332417 LIST_HEAD(vmap_area_list);
333418 static LLIST_HEAD(vmap_purge_list);
334419 static struct rb_root vmap_area_root = RB_ROOT;
420
+static bool vmap_initialized __read_mostly;
335421
336
-/* The vmap cache globals are protected by vmap_area_lock */
337
-static struct rb_node *free_vmap_cache;
338
-static unsigned long cached_hole_size;
339
-static unsigned long cached_vstart;
340
-static unsigned long cached_align;
422
+/*
423
+ * This kmem_cache is used for vmap_area objects. Instead of
424
+ * allocating from slab we reuse an object from this cache to
425
+ * make things faster. Especially in "no edge" splitting of
426
+ * free block.
427
+ */
428
+static struct kmem_cache *vmap_area_cachep;
341429
342
-static unsigned long vmap_area_pcpu_hole;
430
+/*
431
+ * This linked list is used in pair with free_vmap_area_root.
432
+ * It gives O(1) access to prev/next to perform fast coalescing.
433
+ */
434
+static LIST_HEAD(free_vmap_area_list);
435
+
436
+/*
437
+ * This augment red-black tree represents the free vmap space.
438
+ * All vmap_area objects in this tree are sorted by va->va_start
439
+ * address. It is used for allocation and merging when a vmap
440
+ * object is released.
441
+ *
442
+ * Each vmap_area node contains a maximum available free block
443
+ * of its sub-tree, right or left. Therefore it is possible to
444
+ * find a lowest match of free area.
445
+ */
446
+static struct rb_root free_vmap_area_root = RB_ROOT;
447
+
448
+/*
449
+ * Preload a CPU with one object for "no edge" split case. The
450
+ * aim is to get rid of allocations from the atomic context, thus
451
+ * to use more permissive allocation masks.
452
+ */
453
+static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
454
+
455
+static __always_inline unsigned long
456
+va_size(struct vmap_area *va)
457
+{
458
+ return (va->va_end - va->va_start);
459
+}
460
+
461
+static __always_inline unsigned long
462
+get_subtree_max_size(struct rb_node *node)
463
+{
464
+ struct vmap_area *va;
465
+
466
+ va = rb_entry_safe(node, struct vmap_area, rb_node);
467
+ return va ? va->subtree_max_size : 0;
468
+}
469
+
470
+/*
471
+ * Gets called when remove the node and rotate.
472
+ */
473
+static __always_inline unsigned long
474
+compute_subtree_max_size(struct vmap_area *va)
475
+{
476
+ return max3(va_size(va),
477
+ get_subtree_max_size(va->rb_node.rb_left),
478
+ get_subtree_max_size(va->rb_node.rb_right));
479
+}
480
+
481
+RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
482
+ struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
483
+
484
+static void purge_vmap_area_lazy(void);
485
+static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
486
+static unsigned long lazy_max_pages(void);
343487
344488 static atomic_long_t nr_vmalloc_pages;
345489
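A self-contained toy model of the subtree_max_size invariant described above
(illustrative only, not kernel code; names are made up): every node caches the
largest free-block size in its subtree, which is what lets the lowest-match
search prune whole subtrees.

struct toy_node {
        unsigned long size;             /* size of this free block */
        unsigned long subtree_max;      /* largest size in this subtree */
        struct toy_node *left, *right;
};

static unsigned long toy_subtree_max(const struct toy_node *n)
{
        return n ? n->subtree_max : 0;
}

/* Same idea as compute_subtree_max_size(): own size vs. both children. */
static void toy_update(struct toy_node *n)
{
        unsigned long m = n->size;

        if (toy_subtree_max(n->left) > m)
                m = toy_subtree_max(n->left);
        if (toy_subtree_max(n->right) > m)
                m = toy_subtree_max(n->right);
        n->subtree_max = m;
}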
@@ -347,6 +491,7 @@
 {
         return atomic_long_read(&nr_vmalloc_pages);
 }
+EXPORT_SYMBOL_GPL(vmalloc_nr_pages);
 
 static struct vmap_area *__find_vmap_area(unsigned long addr)
 {
@@ -367,41 +512,638 @@
367512 return NULL;
368513 }
369514
370
-static void __insert_vmap_area(struct vmap_area *va)
515
+/*
516
+ * This function returns back addresses of parent node
517
+ * and its left or right link for further processing.
518
+ *
519
+ * Otherwise NULL is returned. In that case all further
520
+ * steps regarding inserting of conflicting overlap range
521
+ * have to be declined and actually considered as a bug.
522
+ */
523
+static __always_inline struct rb_node **
524
+find_va_links(struct vmap_area *va,
525
+ struct rb_root *root, struct rb_node *from,
526
+ struct rb_node **parent)
371527 {
372
- struct rb_node **p = &vmap_area_root.rb_node;
373
- struct rb_node *parent = NULL;
374
- struct rb_node *tmp;
528
+ struct vmap_area *tmp_va;
529
+ struct rb_node **link;
375530
376
- while (*p) {
377
- struct vmap_area *tmp_va;
378
-
379
- parent = *p;
380
- tmp_va = rb_entry(parent, struct vmap_area, rb_node);
381
- if (va->va_start < tmp_va->va_end)
382
- p = &(*p)->rb_left;
383
- else if (va->va_end > tmp_va->va_start)
384
- p = &(*p)->rb_right;
385
- else
386
- BUG();
531
+ if (root) {
532
+ link = &root->rb_node;
533
+ if (unlikely(!*link)) {
534
+ *parent = NULL;
535
+ return link;
536
+ }
537
+ } else {
538
+ link = &from;
387539 }
388540
389
- rb_link_node(&va->rb_node, parent, p);
390
- rb_insert_color(&va->rb_node, &vmap_area_root);
541
+ /*
+ * Go to the bottom of the tree. When we hit the last point
+ * we end up with the parent rb_node and the correct direction,
+ * named "link" here, where the new va->rb_node will be attached.
+ */
546
+ do {
547
+ tmp_va = rb_entry(*link, struct vmap_area, rb_node);
391548
392
- /* address-sort this list */
393
- tmp = rb_prev(&va->rb_node);
394
- if (tmp) {
395
- struct vmap_area *prev;
396
- prev = rb_entry(tmp, struct vmap_area, rb_node);
397
- list_add_rcu(&va->list, &prev->list);
398
- } else
399
- list_add_rcu(&va->list, &vmap_area_list);
549
+ /*
550
+ * During the traversal we also do some sanity check.
551
+ * Trigger the BUG() if there are sides(left/right)
552
+ * or full overlaps.
553
+ */
554
+ if (va->va_start < tmp_va->va_end &&
555
+ va->va_end <= tmp_va->va_start)
556
+ link = &(*link)->rb_left;
557
+ else if (va->va_end > tmp_va->va_start &&
558
+ va->va_start >= tmp_va->va_end)
559
+ link = &(*link)->rb_right;
560
+ else {
561
+ WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
562
+ va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
563
+
564
+ return NULL;
565
+ }
566
+ } while (*link);
567
+
568
+ *parent = &tmp_va->rb_node;
569
+ return link;
400570 }
401571
402
-static void purge_vmap_area_lazy(void);
572
+static __always_inline struct list_head *
573
+get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
574
+{
575
+ struct list_head *list;
403576
404
-static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
577
+ if (unlikely(!parent))
578
+ /*
579
+ * The red-black tree where we try to find VA neighbors
580
+ * before merging or inserting is empty, i.e. it means
581
+ * there is no free vmap space. Normally it does not
582
+ * happen but we handle this case anyway.
583
+ */
584
+ return NULL;
585
+
586
+ list = &rb_entry(parent, struct vmap_area, rb_node)->list;
587
+ return (&parent->rb_right == link ? list->next : list);
588
+}
589
+
590
+static __always_inline void
591
+link_va(struct vmap_area *va, struct rb_root *root,
592
+ struct rb_node *parent, struct rb_node **link, struct list_head *head)
593
+{
594
+ /*
595
+ * VA is still not in the list, but we can
596
+ * identify its future previous list_head node.
597
+ */
598
+ if (likely(parent)) {
599
+ head = &rb_entry(parent, struct vmap_area, rb_node)->list;
600
+ if (&parent->rb_right != link)
601
+ head = head->prev;
602
+ }
603
+
604
+ /* Insert to the rb-tree */
605
+ rb_link_node(&va->rb_node, parent, link);
606
+ if (root == &free_vmap_area_root) {
607
+ /*
608
+ * Some explanation here. Just perform simple insertion
609
+ * to the tree. We do not set va->subtree_max_size to
610
+ * its current size before calling rb_insert_augmented().
611
+ * It is because of we populate the tree from the bottom
612
+ * to parent levels when the node _is_ in the tree.
613
+ *
614
+ * Therefore we set subtree_max_size to zero after insertion,
615
+ * to let __augment_tree_propagate_from() puts everything to
616
+ * the correct order later on.
617
+ */
618
+ rb_insert_augmented(&va->rb_node,
619
+ root, &free_vmap_area_rb_augment_cb);
620
+ va->subtree_max_size = 0;
621
+ } else {
622
+ rb_insert_color(&va->rb_node, root);
623
+ }
624
+
625
+ /* Address-sort this list */
626
+ list_add(&va->list, head);
627
+}
628
+
629
+static __always_inline void
630
+unlink_va(struct vmap_area *va, struct rb_root *root)
631
+{
632
+ if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
633
+ return;
634
+
635
+ if (root == &free_vmap_area_root)
636
+ rb_erase_augmented(&va->rb_node,
637
+ root, &free_vmap_area_rb_augment_cb);
638
+ else
639
+ rb_erase(&va->rb_node, root);
640
+
641
+ list_del(&va->list);
642
+ RB_CLEAR_NODE(&va->rb_node);
643
+}
644
+
645
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
646
+static void
647
+augment_tree_propagate_check(void)
648
+{
649
+ struct vmap_area *va;
650
+ unsigned long computed_size;
651
+
652
+ list_for_each_entry(va, &free_vmap_area_list, list) {
653
+ computed_size = compute_subtree_max_size(va);
654
+ if (computed_size != va->subtree_max_size)
655
+ pr_emerg("tree is corrupted: %lu, %lu\n",
656
+ va_size(va), va->subtree_max_size);
657
+ }
658
+}
659
+#endif
660
+
661
+/*
662
+ * This function populates subtree_max_size from bottom to upper
663
+ * levels starting from VA point. The propagation must be done
664
+ * when VA size is modified by changing its va_start/va_end. Or
665
+ * in case of newly inserting of VA to the tree.
666
+ *
667
+ * It means that __augment_tree_propagate_from() must be called:
668
+ * - After VA has been inserted to the tree(free path);
669
+ * - After VA has been shrunk(allocation path);
670
+ * - After VA has been increased(merging path).
671
+ *
672
+ * Please note that, it does not mean that upper parent nodes
673
+ * and their subtree_max_size are recalculated all the time up
674
+ * to the root node.
675
+ *
676
+ *        4--8
+ *         /\
+ *        /  \
+ *       /    \
+ *      2--2  8--8
+ *
+ * For example, if we modify node 4, shrinking it to 2, then
+ * no modification is required. If we shrink node 2 to 1,
+ * only its subtree_max_size is updated and set to 1. If we shrink
+ * node 8 to 6, then its subtree_max_size is set to 6 and the parent
+ * node becomes 4--6.
687
+ */
688
+static __always_inline void
689
+augment_tree_propagate_from(struct vmap_area *va)
690
+{
691
+ /*
692
+ * Populate the tree from bottom towards the root until
693
+ * the calculated maximum available size of checked node
694
+ * is equal to its current one.
695
+ */
696
+ free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
697
+
698
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
699
+ augment_tree_propagate_check();
700
+#endif
701
+}
702
+
703
+static void
704
+insert_vmap_area(struct vmap_area *va,
705
+ struct rb_root *root, struct list_head *head)
706
+{
707
+ struct rb_node **link;
708
+ struct rb_node *parent;
709
+
710
+ link = find_va_links(va, root, NULL, &parent);
711
+ if (link)
712
+ link_va(va, root, parent, link, head);
713
+}
714
+
715
+static void
716
+insert_vmap_area_augment(struct vmap_area *va,
717
+ struct rb_node *from, struct rb_root *root,
718
+ struct list_head *head)
719
+{
720
+ struct rb_node **link;
721
+ struct rb_node *parent;
722
+
723
+ if (from)
724
+ link = find_va_links(va, NULL, from, &parent);
725
+ else
726
+ link = find_va_links(va, root, NULL, &parent);
727
+
728
+ if (link) {
729
+ link_va(va, root, parent, link, head);
730
+ augment_tree_propagate_from(va);
731
+ }
732
+}
733
+
734
+/*
735
+ * Merge de-allocated chunk of VA memory with previous
736
+ * and next free blocks. If coalesce is not done a new
737
+ * free area is inserted. If VA has been merged, it is
738
+ * freed.
739
+ *
740
+ * Please note, it can return NULL in case of overlap
741
+ * ranges, followed by WARN() report. Despite it is a
742
+ * buggy behaviour, a system can be alive and keep
743
+ * ongoing.
744
+ */
745
+static __always_inline struct vmap_area *
746
+merge_or_add_vmap_area(struct vmap_area *va,
747
+ struct rb_root *root, struct list_head *head)
748
+{
749
+ struct vmap_area *sibling;
750
+ struct list_head *next;
751
+ struct rb_node **link;
752
+ struct rb_node *parent;
753
+ bool merged = false;
754
+
755
+ /*
756
+ * Find a place in the tree where VA potentially will be
757
+ * inserted, unless it is merged with its sibling/siblings.
758
+ */
759
+ link = find_va_links(va, root, NULL, &parent);
760
+ if (!link)
761
+ return NULL;
762
+
763
+ /*
764
+ * Get next node of VA to check if merging can be done.
765
+ */
766
+ next = get_va_next_sibling(parent, link);
767
+ if (unlikely(next == NULL))
768
+ goto insert;
769
+
770
+ /*
771
+ * start end
772
+ * | |
773
+ * |<------VA------>|<-----Next----->|
774
+ * | |
775
+ * start end
776
+ */
777
+ if (next != head) {
778
+ sibling = list_entry(next, struct vmap_area, list);
779
+ if (sibling->va_start == va->va_end) {
780
+ sibling->va_start = va->va_start;
781
+
782
+ /* Free vmap_area object. */
783
+ kmem_cache_free(vmap_area_cachep, va);
784
+
785
+ /* Point to the new merged area. */
786
+ va = sibling;
787
+ merged = true;
788
+ }
789
+ }
790
+
791
+ /*
792
+ * start end
793
+ * | |
794
+ * |<-----Prev----->|<------VA------>|
795
+ * | |
796
+ * start end
797
+ */
798
+ if (next->prev != head) {
799
+ sibling = list_entry(next->prev, struct vmap_area, list);
800
+ if (sibling->va_end == va->va_start) {
801
+ /*
802
+ * If both neighbors are coalesced, it is important
803
+ * to unlink the "next" node first, followed by merging
804
+ * with "previous" one. Otherwise the tree might not be
805
+ * fully populated if a sibling's augmented value is
806
+ * "normalized" because of rotation operations.
807
+ */
808
+ if (merged)
809
+ unlink_va(va, root);
810
+
811
+ sibling->va_end = va->va_end;
812
+
813
+ /* Free vmap_area object. */
814
+ kmem_cache_free(vmap_area_cachep, va);
815
+
816
+ /* Point to the new merged area. */
817
+ va = sibling;
818
+ merged = true;
819
+ }
820
+ }
821
+
822
+insert:
823
+ if (!merged)
824
+ link_va(va, root, parent, link, head);
825
+
826
+ /*
827
+ * Last step is to check and update the tree.
828
+ */
829
+ augment_tree_propagate_from(va);
830
+ return va;
831
+}
832
+
833
+static __always_inline bool
834
+is_within_this_va(struct vmap_area *va, unsigned long size,
835
+ unsigned long align, unsigned long vstart)
836
+{
837
+ unsigned long nva_start_addr;
838
+
839
+ if (va->va_start > vstart)
840
+ nva_start_addr = ALIGN(va->va_start, align);
841
+ else
842
+ nva_start_addr = ALIGN(vstart, align);
843
+
844
+ /* Can be overflowed due to big size or alignment. */
845
+ if (nva_start_addr + size < nva_start_addr ||
846
+ nva_start_addr < vstart)
847
+ return false;
848
+
849
+ return (nva_start_addr + size <= va->va_end);
850
+}
851
+
852
+/*
853
+ * Find the first free block(lowest start address) in the tree,
854
+ * that will accomplish the request corresponding to passing
855
+ * parameters.
856
+ */
857
+static __always_inline struct vmap_area *
858
+find_vmap_lowest_match(unsigned long size,
859
+ unsigned long align, unsigned long vstart)
860
+{
861
+ struct vmap_area *va;
862
+ struct rb_node *node;
863
+ unsigned long length;
864
+
865
+ /* Start from the root. */
866
+ node = free_vmap_area_root.rb_node;
867
+
868
+ /* Adjust the search size for alignment overhead. */
869
+ length = size + align - 1;
870
+
871
+ while (node) {
872
+ va = rb_entry(node, struct vmap_area, rb_node);
873
+
874
+ if (get_subtree_max_size(node->rb_left) >= length &&
875
+ vstart < va->va_start) {
876
+ node = node->rb_left;
877
+ } else {
878
+ if (is_within_this_va(va, size, align, vstart))
879
+ return va;
880
+
881
+ /*
882
+ * Does not make sense to go deeper towards the right
883
+ * sub-tree if it does not have a free block that is
884
+ * equal or bigger to the requested search length.
885
+ */
886
+ if (get_subtree_max_size(node->rb_right) >= length) {
887
+ node = node->rb_right;
888
+ continue;
889
+ }
890
+
891
+ /*
892
+ * OK. We roll back and find the first right sub-tree,
893
+ * that will satisfy the search criteria. It can happen
894
+ * only once due to "vstart" restriction.
895
+ */
896
+ while ((node = rb_parent(node))) {
897
+ va = rb_entry(node, struct vmap_area, rb_node);
898
+ if (is_within_this_va(va, size, align, vstart))
899
+ return va;
900
+
901
+ if (get_subtree_max_size(node->rb_right) >= length &&
902
+ vstart <= va->va_start) {
903
+ node = node->rb_right;
904
+ break;
905
+ }
906
+ }
907
+ }
908
+ }
909
+
910
+ return NULL;
911
+}
912
+
913
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
914
+#include <linux/random.h>
915
+
916
+static struct vmap_area *
917
+find_vmap_lowest_linear_match(unsigned long size,
918
+ unsigned long align, unsigned long vstart)
919
+{
920
+ struct vmap_area *va;
921
+
922
+ list_for_each_entry(va, &free_vmap_area_list, list) {
923
+ if (!is_within_this_va(va, size, align, vstart))
924
+ continue;
925
+
926
+ return va;
927
+ }
928
+
929
+ return NULL;
930
+}
931
+
932
+static void
933
+find_vmap_lowest_match_check(unsigned long size)
934
+{
935
+ struct vmap_area *va_1, *va_2;
936
+ unsigned long vstart;
937
+ unsigned int rnd;
938
+
939
+ get_random_bytes(&rnd, sizeof(rnd));
940
+ vstart = VMALLOC_START + rnd;
941
+
942
+ va_1 = find_vmap_lowest_match(size, 1, vstart);
943
+ va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
944
+
945
+ if (va_1 != va_2)
946
+ pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
947
+ va_1, va_2, vstart);
948
+}
949
+#endif
950
+
951
+enum fit_type {
952
+ NOTHING_FIT = 0,
953
+ FL_FIT_TYPE = 1, /* full fit */
954
+ LE_FIT_TYPE = 2, /* left edge fit */
955
+ RE_FIT_TYPE = 3, /* right edge fit */
956
+ NE_FIT_TYPE = 4 /* no edge fit */
957
+};
958
+
959
+static __always_inline enum fit_type
960
+classify_va_fit_type(struct vmap_area *va,
961
+ unsigned long nva_start_addr, unsigned long size)
962
+{
963
+ enum fit_type type;
964
+
965
+ /* Check if it is within VA. */
966
+ if (nva_start_addr < va->va_start ||
967
+ nva_start_addr + size > va->va_end)
968
+ return NOTHING_FIT;
969
+
970
+ /* Now classify. */
971
+ if (va->va_start == nva_start_addr) {
972
+ if (va->va_end == nva_start_addr + size)
973
+ type = FL_FIT_TYPE;
974
+ else
975
+ type = LE_FIT_TYPE;
976
+ } else if (va->va_end == nva_start_addr + size) {
977
+ type = RE_FIT_TYPE;
978
+ } else {
979
+ type = NE_FIT_TYPE;
980
+ }
981
+
982
+ return type;
983
+}
984
+
985
+static __always_inline int
986
+adjust_va_to_fit_type(struct vmap_area *va,
987
+ unsigned long nva_start_addr, unsigned long size,
988
+ enum fit_type type)
989
+{
990
+ struct vmap_area *lva = NULL;
991
+
992
+ if (type == FL_FIT_TYPE) {
993
+ /*
994
+ * No need to split VA, it fully fits.
995
+ *
996
+ * | |
997
+ * V NVA V
998
+ * |---------------|
999
+ */
1000
+ unlink_va(va, &free_vmap_area_root);
1001
+ kmem_cache_free(vmap_area_cachep, va);
1002
+ } else if (type == LE_FIT_TYPE) {
1003
+ /*
1004
+ * Split left edge of fit VA.
1005
+ *
1006
+ * | |
1007
+ * V NVA V R
1008
+ * |-------|-------|
1009
+ */
1010
+ va->va_start += size;
1011
+ } else if (type == RE_FIT_TYPE) {
1012
+ /*
1013
+ * Split right edge of fit VA.
1014
+ *
1015
+ * | |
1016
+ * L V NVA V
1017
+ * |-------|-------|
1018
+ */
1019
+ va->va_end = nva_start_addr;
1020
+ } else if (type == NE_FIT_TYPE) {
1021
+ /*
1022
+ * Split no edge of fit VA.
1023
+ *
1024
+ * | |
1025
+ * L V NVA V R
1026
+ * |---|-------|---|
1027
+ */
1028
+ lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
1029
+ if (unlikely(!lva)) {
1030
+ /*
1031
+ * For percpu allocator we do not do any pre-allocation
1032
+ * and leave it as it is. The reason is it most likely
1033
+ * never ends up with NE_FIT_TYPE splitting. In case of
1034
+ * percpu allocations offsets and sizes are aligned to
1035
+ * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
1036
+ * are its main fitting cases.
1037
+ *
1038
+ * There are a few exceptions though, as an example it is
1039
+ * a first allocation (early boot up) when we have "one"
1040
+ * big free space that has to be split.
1041
+ *
1042
+ * Also we can hit this path in case of regular "vmap"
1043
+ * allocations, if "this" current CPU was not preloaded.
1044
+ * See the comment in alloc_vmap_area() why. If so, then
1045
+ * GFP_NOWAIT is used instead to get an extra object for
1046
+ * split purpose. That is rare and most time does not
1047
+ * occur.
1048
+ *
1049
+ * What happens if an allocation gets failed. Basically,
1050
+ * an "overflow" path is triggered to purge lazily freed
1051
+ * areas to free some memory, then, the "retry" path is
1052
+ * triggered to repeat one more time. See more details
1053
+ * in alloc_vmap_area() function.
1054
+ */
1055
+ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
1056
+ if (!lva)
1057
+ return -1;
1058
+ }
1059
+
1060
+ /*
1061
+ * Build the remainder.
1062
+ */
1063
+ lva->va_start = va->va_start;
1064
+ lva->va_end = nva_start_addr;
1065
+
1066
+ /*
1067
+ * Shrink this VA to remaining size.
1068
+ */
1069
+ va->va_start = nva_start_addr + size;
1070
+ } else {
1071
+ return -1;
1072
+ }
1073
+
1074
+ if (type != FL_FIT_TYPE) {
1075
+ augment_tree_propagate_from(va);
1076
+
1077
+ if (lva) /* type == NE_FIT_TYPE */
1078
+ insert_vmap_area_augment(lva, &va->rb_node,
1079
+ &free_vmap_area_root, &free_vmap_area_list);
1080
+ }
1081
+
1082
+ return 0;
1083
+}
1084
+
1085
+/*
1086
+ * Returns a start address of the newly allocated area, if success.
1087
+ * Otherwise a vend is returned that indicates failure.
1088
+ */
1089
+static __always_inline unsigned long
1090
+__alloc_vmap_area(unsigned long size, unsigned long align,
1091
+ unsigned long vstart, unsigned long vend)
1092
+{
1093
+ unsigned long nva_start_addr;
1094
+ struct vmap_area *va;
1095
+ enum fit_type type;
1096
+ int ret;
1097
+
1098
+ va = find_vmap_lowest_match(size, align, vstart);
1099
+ if (unlikely(!va))
1100
+ return vend;
1101
+
1102
+ if (va->va_start > vstart)
1103
+ nva_start_addr = ALIGN(va->va_start, align);
1104
+ else
1105
+ nva_start_addr = ALIGN(vstart, align);
1106
+
1107
+ /* Check the "vend" restriction. */
1108
+ if (nva_start_addr + size > vend)
1109
+ return vend;
1110
+
1111
+ /* Classify what we have found. */
1112
+ type = classify_va_fit_type(va, nva_start_addr, size);
1113
+ if (WARN_ON_ONCE(type == NOTHING_FIT))
1114
+ return vend;
1115
+
1116
+ /* Update the free vmap_area. */
1117
+ ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
1118
+ if (ret)
1119
+ return vend;
1120
+
1121
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
1122
+ find_vmap_lowest_match_check(size);
1123
+#endif
1124
+
1125
+ return nva_start_addr;
1126
+}
1127
+
1128
+/*
1129
+ * Free a region of KVA allocated by alloc_vmap_area
1130
+ */
1131
+static void free_vmap_area(struct vmap_area *va)
1132
+{
1133
+ /*
1134
+ * Remove from the busy tree/list.
1135
+ */
1136
+ spin_lock(&vmap_area_lock);
1137
+ unlink_va(va, &vmap_area_root);
1138
+ spin_unlock(&vmap_area_lock);
1139
+
1140
+ /*
1141
+ * Insert/Merge it back to the free tree/list.
1142
+ */
1143
+ spin_lock(&free_vmap_area_lock);
1144
+ merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
1145
+ spin_unlock(&free_vmap_area_lock);
1146
+}
4051147
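A worked illustration of the four fit types used above (hypothetical
addresses): take a free block [0x1000, 0x9000) and a request placed at
nva_start_addr.

/*
 *   request [0x1000, 0x9000) -> FL_FIT_TYPE: the block is consumed whole.
 *   request [0x1000, 0x3000) -> LE_FIT_TYPE: the block shrinks to [0x3000, 0x9000).
 *   request [0x7000, 0x9000) -> RE_FIT_TYPE: the block shrinks to [0x1000, 0x7000).
 *   request [0x3000, 0x5000) -> NE_FIT_TYPE: the block splits into
 *                               [0x1000, 0x3000) and [0x5000, 0x9000), which is
 *                               the case that needs the extra vmap_area object
 *                               from the per-CPU preload.
 */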
4061148 /*
4071149 * Allocate a region of KVA of the specified size and alignment, within the
@@ -412,20 +1154,22 @@
4121154 unsigned long vstart, unsigned long vend,
4131155 int node, gfp_t gfp_mask)
4141156 {
415
- struct vmap_area *va;
416
- struct rb_node *n;
1157
+ struct vmap_area *va, *pva;
4171158 unsigned long addr;
4181159 int purged = 0;
419
- struct vmap_area *first;
1160
+ int ret;
4201161
4211162 BUG_ON(!size);
4221163 BUG_ON(offset_in_page(size));
4231164 BUG_ON(!is_power_of_2(align));
4241165
425
- might_sleep();
1166
+ if (unlikely(!vmap_initialized))
1167
+ return ERR_PTR(-EBUSY);
4261168
427
- va = kmalloc_node(sizeof(struct vmap_area),
428
- gfp_mask & GFP_RECLAIM_MASK, node);
1169
+ might_sleep();
1170
+ gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
1171
+
1172
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
4291173 if (unlikely(!va))
4301174 return ERR_PTR(-ENOMEM);
4311175
@@ -433,101 +1177,71 @@
4331177 * Only scan the relevant parts containing pointers to other objects
4341178 * to avoid false negatives.
4351179 */
436
- kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
1180
+ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
4371181
4381182 retry:
439
- spin_lock(&vmap_area_lock);
4401183 /*
441
- * Invalidate cache if we have more permissive parameters.
442
- * cached_hole_size notes the largest hole noticed _below_
443
- * the vmap_area cached in free_vmap_cache: if size fits
444
- * into that hole, we want to scan from vstart to reuse
445
- * the hole instead of allocating above free_vmap_cache.
446
- * Note that __free_vmap_area may update free_vmap_cache
447
- * without updating cached_hole_size or cached_align.
1184
+ * Preload this CPU with one extra vmap_area object. It is used
1185
+ * when fit type of free area is NE_FIT_TYPE. Please note, it
1186
+ * does not guarantee that an allocation occurs on a CPU that
1187
+ * is preloaded, instead we minimize the case when it is not.
1188
+ * It can happen because of cpu migration, because there is a
1189
+ * race until the below spinlock is taken.
1190
+ *
1191
+ * The preload is done in non-atomic context, thus it allows us
1192
+ * to use more permissive allocation masks to be more stable under
1193
+ * low memory condition and high memory pressure. In rare case,
1194
+ * if not preloaded, GFP_NOWAIT is used.
1195
+ *
1196
+ * Set "pva" to NULL here, because of "retry" path.
4481197 */
449
- if (!free_vmap_cache ||
450
- size < cached_hole_size ||
451
- vstart < cached_vstart ||
452
- align < cached_align) {
453
-nocache:
454
- cached_hole_size = 0;
455
- free_vmap_cache = NULL;
456
- }
457
- /* record if we encounter less permissive parameters */
458
- cached_vstart = vstart;
459
- cached_align = align;
1198
+ pva = NULL;
4601199
461
- /* find starting point for our search */
462
- if (free_vmap_cache) {
463
- first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
464
- addr = ALIGN(first->va_end, align);
465
- if (addr < vstart)
466
- goto nocache;
467
- if (addr + size < addr)
468
- goto overflow;
1200
+ if (!this_cpu_read(ne_fit_preload_node))
1201
+ /*
1202
+ * Even if it fails we do not really care about that.
1203
+ * Just proceed as it is. If needed "overflow" path
1204
+ * will refill the cache we allocate from.
1205
+ */
1206
+ pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
4691207
470
- } else {
471
- addr = ALIGN(vstart, align);
472
- if (addr + size < addr)
473
- goto overflow;
1208
+ spin_lock(&free_vmap_area_lock);
4741209
475
- n = vmap_area_root.rb_node;
476
- first = NULL;
1210
+ if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
1211
+ kmem_cache_free(vmap_area_cachep, pva);
4771212
478
- while (n) {
479
- struct vmap_area *tmp;
480
- tmp = rb_entry(n, struct vmap_area, rb_node);
481
- if (tmp->va_end >= addr) {
482
- first = tmp;
483
- if (tmp->va_start <= addr)
484
- break;
485
- n = n->rb_left;
486
- } else
487
- n = n->rb_right;
488
- }
489
-
490
- if (!first)
491
- goto found;
492
- }
493
-
494
- /* from the starting point, walk areas until a suitable hole is found */
495
- while (addr + size > first->va_start && addr + size <= vend) {
496
- if (addr + cached_hole_size < first->va_start)
497
- cached_hole_size = first->va_start - addr;
498
- addr = ALIGN(first->va_end, align);
499
- if (addr + size < addr)
500
- goto overflow;
501
-
502
- if (list_is_last(&first->list, &vmap_area_list))
503
- goto found;
504
-
505
- first = list_next_entry(first, list);
506
- }
507
-
508
-found:
5091213 /*
510
- * Check also calculated address against the vstart,
511
- * because it can be 0 because of big align request.
1214
+ * If an allocation fails, the "vend" address is
1215
+ * returned. Therefore trigger the overflow path.
5121216 */
513
- if (addr + size > vend || addr < vstart)
1217
+ addr = __alloc_vmap_area(size, align, vstart, vend);
1218
+ spin_unlock(&free_vmap_area_lock);
1219
+
1220
+ if (unlikely(addr == vend))
5141221 goto overflow;
5151222
5161223 va->va_start = addr;
5171224 va->va_end = addr + size;
518
- va->flags = 0;
519
- __insert_vmap_area(va);
520
- free_vmap_cache = &va->rb_node;
1225
+ va->vm = NULL;
1226
+
1227
+
1228
+ spin_lock(&vmap_area_lock);
1229
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
5211230 spin_unlock(&vmap_area_lock);
5221231
5231232 BUG_ON(!IS_ALIGNED(va->va_start, align));
5241233 BUG_ON(va->va_start < vstart);
5251234 BUG_ON(va->va_end > vend);
5261235
1236
+ ret = kasan_populate_vmalloc(addr, size);
1237
+ if (ret) {
1238
+ free_vmap_area(va);
1239
+ return ERR_PTR(ret);
1240
+ }
1241
+
5271242 return va;
5281243
5291244 overflow:
530
- spin_unlock(&vmap_area_lock);
5311245 if (!purged) {
5321246 purge_vmap_area_lazy();
5331247 purged = 1;
@@ -546,7 +1260,8 @@
         if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
                 pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
                         size);
-        kfree(va);
+
+        kmem_cache_free(vmap_area_cachep, va);
         return ERR_PTR(-EBUSY);
 }
 
@@ -562,59 +1277,7 @@
5621277 }
5631278 EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
5641279
565
-static void __free_vmap_area(struct vmap_area *va)
566
-{
567
- BUG_ON(RB_EMPTY_NODE(&va->rb_node));
568
-
569
- if (free_vmap_cache) {
570
- if (va->va_end < cached_vstart) {
571
- free_vmap_cache = NULL;
572
- } else {
573
- struct vmap_area *cache;
574
- cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
575
- if (va->va_start <= cache->va_start) {
576
- free_vmap_cache = rb_prev(&va->rb_node);
577
- /*
578
- * We don't try to update cached_hole_size or
579
- * cached_align, but it won't go very wrong.
580
- */
581
- }
582
- }
583
- }
584
- rb_erase(&va->rb_node, &vmap_area_root);
585
- RB_CLEAR_NODE(&va->rb_node);
586
- list_del_rcu(&va->list);
587
-
588
- /*
589
- * Track the highest possible candidate for pcpu area
590
- * allocation. Areas outside of vmalloc area can be returned
591
- * here too, consider only end addresses which fall inside
592
- * vmalloc area proper.
593
- */
594
- if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
595
- vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
596
-
597
- kfree_rcu(va, rcu_head);
598
-}
599
-
600
-/*
601
- * Free a region of KVA allocated by alloc_vmap_area
602
- */
603
-static void free_vmap_area(struct vmap_area *va)
604
-{
605
- spin_lock(&vmap_area_lock);
606
- __free_vmap_area(va);
607
- spin_unlock(&vmap_area_lock);
608
-}
609
-
610
-/*
611
- * Clear the pagetable entries of a given vmap_area
612
- */
613
-static void unmap_vmap_area(struct vmap_area *va)
614
-{
615
- vunmap_page_range(va->va_start, va->va_end);
616
-}
617
-
1280
+bool lazy_vunmap_enable __read_mostly = true;
6181281 /*
6191282 * lazy_max_pages is the maximum amount of virtual address space we gather up
6201283 * before attempting to purge with a TLB flush.
@@ -635,12 +1298,15 @@
6351298 {
6361299 unsigned int log;
6371300
1301
+ if (!lazy_vunmap_enable)
1302
+ return 0;
1303
+
6381304 log = fls(num_online_cpus());
6391305
6401306 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
6411307 }
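For a concrete feel (assuming 4 KiB pages): on an 8-CPU system fls(8) = 4, so
lazy_max_pages() permits about 4 * 32 MiB = 128 MiB of lazily freed address
space (32768 pages) to accumulate before a purge, while clearing
lazy_vunmap_enable makes it return 0 and disables the batching entirely.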
6421308
643
-static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
1309
+static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
6441310
6451311 /*
6461312 * Serialize vmap purging. There is no actual criticial section protected
@@ -658,7 +1324,7 @@
  */
 void set_iounmap_nonlazy(void)
 {
-        atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+        atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
 }
 
 /*
@@ -666,36 +1332,58 @@
6661332 */
6671333 static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
6681334 {
1335
+ unsigned long resched_threshold;
6691336 struct llist_node *valist;
6701337 struct vmap_area *va;
6711338 struct vmap_area *n_va;
672
- bool do_free = false;
6731339
6741340 lockdep_assert_held(&vmap_purge_lock);
6751341
6761342 valist = llist_del_all(&vmap_purge_list);
1343
+ if (unlikely(valist == NULL))
1344
+ return false;
1345
+
1346
+ /*
1347
+ * TODO: to calculate a flush range without looping.
1348
+ * The list can be up to lazy_max_pages() elements.
1349
+ */
6771350 llist_for_each_entry(va, valist, purge_list) {
6781351 if (va->va_start < start)
6791352 start = va->va_start;
6801353 if (va->va_end > end)
6811354 end = va->va_end;
682
- do_free = true;
6831355 }
684
-
685
- if (!do_free)
686
- return false;
6871356
6881357 flush_tlb_kernel_range(start, end);
1358
+ resched_threshold = lazy_max_pages() << 1;
6891359
690
- spin_lock(&vmap_area_lock);
1360
+ spin_lock(&free_vmap_area_lock);
6911361 llist_for_each_entry_safe(va, n_va, valist, purge_list) {
692
- int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
1362
+ unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
1363
+ unsigned long orig_start = va->va_start;
1364
+ unsigned long orig_end = va->va_end;
6931365
694
- __free_vmap_area(va);
695
- atomic_sub(nr, &vmap_lazy_nr);
696
- cond_resched_lock(&vmap_area_lock);
1366
+ /*
1367
+ * Finally insert or merge lazily-freed area. It is
1368
+ * detached and there is no need to "unlink" it from
1369
+ * anything.
1370
+ */
1371
+ va = merge_or_add_vmap_area(va, &free_vmap_area_root,
1372
+ &free_vmap_area_list);
1373
+
1374
+ if (!va)
1375
+ continue;
1376
+
1377
+ if (is_vmalloc_or_module_addr((void *)orig_start))
1378
+ kasan_release_vmalloc(orig_start, orig_end,
1379
+ va->va_start, va->va_end);
1380
+
1381
+ atomic_long_sub(nr, &vmap_lazy_nr);
1382
+
1383
+ if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
1384
+ cond_resched_lock(&free_vmap_area_lock);
6971385 }
698
- spin_unlock(&vmap_area_lock);
1386
+ spin_unlock(&free_vmap_area_lock);
6991387 return true;
7001388 }
7011389
@@ -729,10 +1417,14 @@
7291417 */
7301418 static void free_vmap_area_noflush(struct vmap_area *va)
7311419 {
732
- int nr_lazy;
1420
+ unsigned long nr_lazy;
7331421
734
- nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
735
- &vmap_lazy_nr);
1422
+ spin_lock(&vmap_area_lock);
1423
+ unlink_va(va, &vmap_area_root);
1424
+ spin_unlock(&vmap_area_lock);
1425
+
1426
+ nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
1427
+ PAGE_SHIFT, &vmap_lazy_nr);
7361428
7371429 /* After this point, we may free va at any time */
7381430 llist_add(&va->purge_list, &vmap_purge_list);
@@ -747,8 +1439,8 @@
7471439 static void free_unmap_vmap_area(struct vmap_area *va)
7481440 {
7491441 flush_cache_vunmap(va->va_start, va->va_end);
750
- unmap_vmap_area(va);
751
- if (debug_pagealloc_enabled())
1442
+ unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
1443
+ if (debug_pagealloc_enabled_static())
7521444 flush_tlb_kernel_range(va->va_start, va->va_end);
7531445
7541446 free_vmap_area_noflush(va);
@@ -795,8 +1487,6 @@
7951487
7961488 #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
7971489
798
-static bool vmap_initialized __read_mostly = false;
799
-
8001490 struct vmap_block_queue {
8011491 spinlock_t lock;
8021492 struct list_head free;
@@ -816,12 +1506,11 @@
8161506 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
8171507
8181508 /*
819
- * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
1509
+ * XArray of vmap blocks, indexed by address, to quickly find a vmap block
8201510 * in the free path. Could get rid of this if we change the API to return a
8211511 * "cookie" from alloc, to be passed to free. But no big deal yet.
8221512 */
823
-static DEFINE_SPINLOCK(vmap_block_tree_lock);
824
-static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
1513
+static DEFINE_XARRAY(vmap_blocks);
8251514
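The XArray that replaces the radix tree does its own internal locking, which is
why vmap_block_tree_lock disappears. A hedged sketch of the three operations
the rest of this patch performs on vmap_blocks (illustrative only):

static int example_xarray_usage(struct xarray *xa, unsigned long idx, void *ptr)
{
        int err;

        err = xa_insert(xa, idx, ptr, GFP_KERNEL); /* -EBUSY if slot is taken */
        if (err)
                return err;

        WARN_ON(xa_load(xa, idx) != ptr);  /* lookup, no external lock needed */
        WARN_ON(xa_erase(xa, idx) != ptr); /* removal returns the old entry */
        return 0;
}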
8261515 /*
8271516 * We should probably have a fallback mechanism to allocate virtual memory
@@ -852,7 +1541,7 @@
8521541 * @order: how many 2^order pages should be occupied in newly allocated block
8531542 * @gfp_mask: flags for the page level allocator
8541543 *
855
- * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
1544
+ * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
8561545 */
8571546 static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
8581547 {
@@ -878,13 +1567,6 @@
8781567 return ERR_CAST(va);
8791568 }
8801569
881
- err = radix_tree_preload(gfp_mask);
882
- if (unlikely(err)) {
883
- kfree(vb);
884
- free_vmap_area(va);
885
- return ERR_PTR(err);
886
- }
887
-
8881570 vaddr = vmap_block_vaddr(va->va_start, 0);
8891571 spin_lock_init(&vb->lock);
8901572 vb->va = va;
@@ -897,11 +1579,12 @@
8971579 INIT_LIST_HEAD(&vb->free_list);
8981580
8991581 vb_idx = addr_to_vb_idx(va->va_start);
900
- spin_lock(&vmap_block_tree_lock);
901
- err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
902
- spin_unlock(&vmap_block_tree_lock);
903
- BUG_ON(err);
904
- radix_tree_preload_end();
1582
+ err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
1583
+ if (err) {
1584
+ kfree(vb);
1585
+ free_vmap_area(va);
1586
+ return ERR_PTR(err);
1587
+ }
9051588
9061589 vbq = &get_cpu_var(vmap_block_queue);
9071590 spin_lock(&vbq->lock);
@@ -915,12 +1598,8 @@
9151598 static void free_vmap_block(struct vmap_block *vb)
9161599 {
9171600 struct vmap_block *tmp;
918
- unsigned long vb_idx;
9191601
920
- vb_idx = addr_to_vb_idx(vb->va->va_start);
921
- spin_lock(&vmap_block_tree_lock);
922
- tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
923
- spin_unlock(&vmap_block_tree_lock);
1602
+ tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
9241603 BUG_ON(tmp != vb);
9251604
9261605 free_vmap_area_noflush(vb->va);
@@ -1023,34 +1702,25 @@
10231702 return vaddr;
10241703 }
10251704
1026
-static void vb_free(const void *addr, unsigned long size)
1705
+static void vb_free(unsigned long addr, unsigned long size)
10271706 {
10281707 unsigned long offset;
1029
- unsigned long vb_idx;
10301708 unsigned int order;
10311709 struct vmap_block *vb;
10321710
10331711 BUG_ON(offset_in_page(size));
10341712 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
10351713
1036
- flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
1714
+ flush_cache_vunmap(addr, addr + size);
10371715
10381716 order = get_order(size);
1717
+ offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
1718
+ vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
10391719
1040
- offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
1041
- offset >>= PAGE_SHIFT;
1720
+ unmap_kernel_range_noflush(addr, size);
10421721
1043
- vb_idx = addr_to_vb_idx((unsigned long)addr);
1044
- rcu_read_lock();
1045
- vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
1046
- rcu_read_unlock();
1047
- BUG_ON(!vb);
1048
-
1049
- vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
1050
-
1051
- if (debug_pagealloc_enabled())
1052
- flush_tlb_kernel_range((unsigned long)addr,
1053
- (unsigned long)addr + size);
1722
+ if (debug_pagealloc_enabled_static())
1723
+ flush_tlb_kernel_range(addr, addr + size);
10541724
10551725 spin_lock(&vb->lock);
10561726
@@ -1067,24 +1737,9 @@
10671737 spin_unlock(&vb->lock);
10681738 }
10691739
1070
-/**
1071
- * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
1072
- *
1073
- * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
1074
- * to amortize TLB flushing overheads. What this means is that any page you
1075
- * have now, may, in a former life, have been mapped into kernel virtual
1076
- * address by the vmap layer and so there might be some CPUs with TLB entries
1077
- * still referencing that page (additional to the regular 1:1 kernel mapping).
1078
- *
1079
- * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
1080
- * be sure that none of the pages we have control over will have any aliases
1081
- * from the vmap layer.
1082
- */
1083
-void vm_unmap_aliases(void)
1740
+static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
10841741 {
1085
- unsigned long start = ULONG_MAX, end = 0;
10861742 int cpu;
1087
- int flush = 0;
10881743
10891744 if (unlikely(!vmap_initialized))
10901745 return;
@@ -1098,7 +1753,7 @@
10981753 rcu_read_lock();
10991754 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
11001755 spin_lock(&vb->lock);
1101
- if (vb->dirty) {
1756
+ if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
11021757 unsigned long va_start = vb->va->va_start;
11031758 unsigned long s, e;
11041759
@@ -1121,6 +1776,27 @@
11211776 flush_tlb_kernel_range(start, end);
11221777 mutex_unlock(&vmap_purge_lock);
11231778 }
1779
+
1780
+/**
1781
+ * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
1782
+ *
1783
+ * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
1784
+ * to amortize TLB flushing overheads. What this means is that any page you
1785
+ * have now, may, in a former life, have been mapped into kernel virtual
1786
+ * address by the vmap layer and so there might be some CPUs with TLB entries
1787
+ * still referencing that page (additional to the regular 1:1 kernel mapping).
1788
+ *
1789
+ * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
1790
+ * be sure that none of the pages we have control over will have any aliases
1791
+ * from the vmap layer.
1792
+ */
1793
+void vm_unmap_aliases(void)
1794
+{
1795
+ unsigned long start = ULONG_MAX, end = 0;
1796
+ int flush = 0;
1797
+
1798
+ _vm_unmap_aliases(start, end, flush);
1799
+}
11241800 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
11251801
11261802 /**
@@ -1140,9 +1816,11 @@
11401816 BUG_ON(addr > VMALLOC_END);
11411817 BUG_ON(!PAGE_ALIGNED(addr));
11421818
1819
+ kasan_poison_vmalloc(mem, size);
1820
+
11431821 if (likely(count <= VMAP_MAX_ALLOC)) {
11441822 debug_check_no_locks_freed(mem, size);
1145
- vb_free(mem, size);
1823
+ vb_free(addr, size);
11461824 return;
11471825 }
11481826
@@ -1159,7 +1837,6 @@
11591837 * @pages: an array of pointers to the pages to be mapped
11601838 * @count: number of pages
11611839 * @node: prefer to allocate data structures on this node
1162
- * @prot: memory protection to use. PAGE_KERNEL for regular RAM
11631840 *
11641841 * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
11651842 * faster than vmap so it's good. But if you mix long-life and short-life
@@ -1169,7 +1846,7 @@
11691846 *
11701847 * Returns: a pointer to the address that has been mapped, or %NULL on failure
11711848 */
1172
-void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
1849
+void *vm_map_ram(struct page **pages, unsigned int count, int node)
11731850 {
11741851 unsigned long size = (unsigned long)count << PAGE_SHIFT;
11751852 unsigned long addr;
@@ -1190,7 +1867,10 @@
11901867 addr = va->va_start;
11911868 mem = (void *)addr;
11921869 }
1193
- if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
1870
+
1871
+ kasan_unpoison_vmalloc(mem, size);
1872
+
1873
+ if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
11941874 vm_unmap_ram(mem, count);
11951875 return NULL;
11961876 }
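A hedged usage sketch of the new vm_map_ram() signature: the pgprot_t argument
is gone and mappings are always PAGE_KERNEL (hypothetical helper, not part of
the patch):

static void *example_map_ram(struct page **pages, unsigned int count)
{
        void *mem;

        mem = vm_map_ram(pages, count, NUMA_NO_NODE);
        if (!mem)
                return NULL;

        /* ... use the mapping ... */
        return mem;     /* pair with vm_unmap_ram(mem, count) when done */
}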
@@ -1199,6 +1879,7 @@
11991879 EXPORT_SYMBOL(vm_map_ram);
12001880
12011881 static struct vm_struct *vmlist __initdata;
1882
+
12021883 /**
12031884 * vm_area_add_early - add vmap area early during boot
12041885 * @vm: vm_struct to add
@@ -1250,11 +1931,57 @@
12501931 vm_area_add_early(vm);
12511932 }
12521933
1934
+static void vmap_init_free_space(void)
1935
+{
1936
+ unsigned long vmap_start = 1;
1937
+ const unsigned long vmap_end = ULONG_MAX;
1938
+ struct vmap_area *busy, *free;
1939
+
1940
+ /*
1941
+ * B F B B B F
1942
+ * -|-----|.....|-----|-----|-----|.....|-
1943
+ * | The KVA space |
1944
+ * |<--------------------------------->|
1945
+ */
1946
+ list_for_each_entry(busy, &vmap_area_list, list) {
1947
+ if (busy->va_start - vmap_start > 0) {
1948
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
1949
+ if (!WARN_ON_ONCE(!free)) {
1950
+ free->va_start = vmap_start;
1951
+ free->va_end = busy->va_start;
1952
+
1953
+ insert_vmap_area_augment(free, NULL,
1954
+ &free_vmap_area_root,
1955
+ &free_vmap_area_list);
1956
+ }
1957
+ }
1958
+
1959
+ vmap_start = busy->va_end;
1960
+ }
1961
+
1962
+ if (vmap_end - vmap_start > 0) {
1963
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
1964
+ if (!WARN_ON_ONCE(!free)) {
1965
+ free->va_start = vmap_start;
1966
+ free->va_end = vmap_end;
1967
+
1968
+ insert_vmap_area_augment(free, NULL,
1969
+ &free_vmap_area_root,
1970
+ &free_vmap_area_list);
1971
+ }
1972
+ }
1973
+}
1974
+
12531975 void __init vmalloc_init(void)
12541976 {
12551977 struct vmap_area *va;
12561978 struct vm_struct *tmp;
12571979 int i;
1980
+
1981
+ /*
1982
+ * Create the cache for vmap_area objects.
1983
+ */
1984
+ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
12581985
12591986 for_each_possible_cpu(i) {
12601987 struct vmap_block_queue *vbq;
@@ -1270,63 +1997,22 @@
12701997
12711998 /* Import existing vmlist entries. */
12721999 for (tmp = vmlist; tmp; tmp = tmp->next) {
1273
- va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1274
- va->flags = VM_VM_AREA;
2000
+ va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
2001
+ if (WARN_ON_ONCE(!va))
2002
+ continue;
2003
+
12752004 va->va_start = (unsigned long)tmp->addr;
12762005 va->va_end = va->va_start + tmp->size;
12772006 va->vm = tmp;
1278
- __insert_vmap_area(va);
2007
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
12792008 }
12802009
1281
- vmap_area_pcpu_hole = VMALLOC_END;
1282
-
2010
+ /*
2011
+ * Now we can initialize a free vmap space.
2012
+ */
2013
+ vmap_init_free_space();
12832014 vmap_initialized = true;
12842015 }
1285
-
1286
-/**
1287
- * map_kernel_range_noflush - map kernel VM area with the specified pages
1288
- * @addr: start of the VM area to map
1289
- * @size: size of the VM area to map
1290
- * @prot: page protection flags to use
1291
- * @pages: pages to map
1292
- *
1293
- * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
1294
- * specify should have been allocated using get_vm_area() and its
1295
- * friends.
1296
- *
1297
- * NOTE:
1298
- * This function does NOT do any cache flushing. The caller is
1299
- * responsible for calling flush_cache_vmap() on to-be-mapped areas
1300
- * before calling this function.
1301
- *
1302
- * RETURNS:
1303
- * The number of pages mapped on success, -errno on failure.
1304
- */
1305
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
1306
- pgprot_t prot, struct page **pages)
1307
-{
1308
- return vmap_page_range_noflush(addr, addr + size, prot, pages);
1309
-}
1310
-
1311
-/**
1312
- * unmap_kernel_range_noflush - unmap kernel VM area
1313
- * @addr: start of the VM area to unmap
1314
- * @size: size of the VM area to unmap
1315
- *
1316
- * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
1317
- * specify should have been allocated using get_vm_area() and its
1318
- * friends.
1319
- *
1320
- * NOTE:
1321
- * This function does NOT do any cache flushing. The caller is
1322
- * responsible for calling flush_cache_vunmap() on to-be-mapped areas
1323
- * before calling this function and flush_tlb_kernel_range() after.
1324
- */
1325
-void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
1326
-{
1327
- vunmap_page_range(addr, addr + size);
1328
-}
1329
-EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
13302016
13312017 /**
13322018 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
....@@ -1341,33 +2027,26 @@
13412027 unsigned long end = addr + size;
13422028
13432029 flush_cache_vunmap(addr, end);
1344
- vunmap_page_range(addr, end);
2030
+ unmap_kernel_range_noflush(addr, size);
13452031 flush_tlb_kernel_range(addr, end);
13462032 }
1347
-EXPORT_SYMBOL_GPL(unmap_kernel_range);
13482033
1349
-int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
2034
+static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
2035
+ struct vmap_area *va, unsigned long flags, const void *caller)
13502036 {
1351
- unsigned long addr = (unsigned long)area->addr;
1352
- unsigned long end = addr + get_vm_area_size(area);
1353
- int err;
1354
-
1355
- err = vmap_page_range(addr, end, prot, pages);
1356
-
1357
- return err > 0 ? 0 : err;
1358
-}
1359
-EXPORT_SYMBOL_GPL(map_vm_area);
1360
-
1361
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1362
- unsigned long flags, const void *caller)
1363
-{
1364
- spin_lock(&vmap_area_lock);
13652037 vm->flags = flags;
13662038 vm->addr = (void *)va->va_start;
13672039 vm->size = va->va_end - va->va_start;
13682040 vm->caller = caller;
13692041 va->vm = vm;
1370
- va->flags |= VM_VM_AREA;
2042
+ trace_android_vh_save_vmalloc_stack(flags, vm);
2043
+}
2044
+
2045
+static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
2046
+ unsigned long flags, const void *caller)
2047
+{
2048
+ spin_lock(&vmap_area_lock);
2049
+ setup_vmalloc_vm_locked(vm, va, flags, caller);
13712050 spin_unlock(&vmap_area_lock);
13722051 }
13732052
....@@ -1388,6 +2067,7 @@
13882067 {
13892068 struct vmap_area *va;
13902069 struct vm_struct *area;
2070
+ unsigned long requested_size = size;
13912071
13922072 BUG_ON(in_interrupt());
13932073 size = PAGE_ALIGN(size);
....@@ -1411,18 +2091,12 @@
14112091 return NULL;
14122092 }
14132093
2094
+ kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
2095
+
14142096 setup_vmalloc_vm(area, va, flags, caller);
14152097
14162098 return area;
14172099 }
1418
-
1419
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1420
- unsigned long start, unsigned long end)
1421
-{
1422
- return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
1423
- GFP_KERNEL, __builtin_return_address(0));
1424
-}
1425
-EXPORT_SYMBOL_GPL(__get_vm_area);
14262100
14272101 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
14282102 unsigned long start, unsigned long end,
....@@ -1431,15 +2105,18 @@
14312105 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
14322106 GFP_KERNEL, caller);
14332107 }
2108
+EXPORT_SYMBOL_GPL(__get_vm_area_caller);
14342109
14352110 /**
1436
- * get_vm_area - reserve a contiguous kernel virtual area
1437
- * @size: size of the area
1438
- * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
2111
+ * get_vm_area - reserve a contiguous kernel virtual area
2112
+ * @size: size of the area
2113
+ * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
14392114 *
1440
- * Search an area of @size in the kernel virtual mapping area,
1441
- * and reserved it for out purposes. Returns the area descriptor
1442
- * on success or %NULL on failure.
2115
+ * Search an area of @size in the kernel virtual mapping area,
2116
+ * and reserve it for our purposes. Returns the area descriptor
2117
+ * on success or %NULL on failure.
2118
+ *
2119
+ * Return: the area descriptor on success or %NULL on failure.
14432120 */
14442121 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
14452122 {
....@@ -1447,7 +2124,6 @@
14472124 NUMA_NO_NODE, GFP_KERNEL,
14482125 __builtin_return_address(0));
14492126 }
1450
-EXPORT_SYMBOL_GPL(get_vm_area);
14512127
14522128 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
14532129 const void *caller)
....@@ -1457,31 +2133,35 @@
14572133 }
14582134
14592135 /**
1460
- * find_vm_area - find a continuous kernel virtual area
1461
- * @addr: base address
2136
+ * find_vm_area - find a continuous kernel virtual area
2137
+ * @addr: base address
14622138 *
1463
- * Search for the kernel VM area starting at @addr, and return it.
1464
- * It is up to the caller to do all required locking to keep the returned
1465
- * pointer valid.
2139
+ * Search for the kernel VM area starting at @addr, and return it.
2140
+ * It is up to the caller to do all required locking to keep the returned
2141
+ * pointer valid.
2142
+ *
2143
+ * Return: the area descriptor on success or %NULL on failure.
14662144 */
14672145 struct vm_struct *find_vm_area(const void *addr)
14682146 {
14692147 struct vmap_area *va;
14702148
14712149 va = find_vmap_area((unsigned long)addr);
1472
- if (va && va->flags & VM_VM_AREA)
1473
- return va->vm;
2150
+ if (!va)
2151
+ return NULL;
14742152
1475
- return NULL;
2153
+ return va->vm;
14762154 }
14772155
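A small sketch of how a caller might use find_vm_area() to look up the vm_struct behind a vmalloc'ed pointer (illustrative only; the demo_* helper is hypothetical and, per the comment above, the caller must keep the area alive itself).

#include <linux/vmalloc.h>

/* Illustrative only: report the size of the vm area starting at @addr. */
static unsigned long demo_area_size(const void *addr)
{
	struct vm_struct *area = find_vm_area(addr);

	/* NULL means @addr is not the start of a live vmalloc area */
	return area ? area->size : 0;
}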
14782156 /**
1479
- * remove_vm_area - find and remove a continuous kernel virtual area
1480
- * @addr: base address
2157
+ * remove_vm_area - find and remove a continuous kernel virtual area
2158
+ * @addr: base address
14812159 *
1482
- * Search for the kernel VM area starting at @addr, and remove it.
1483
- * This function returns the found VM area, but using it is NOT safe
1484
- * on SMP machines, except for its size or flags.
2160
+ * Search for the kernel VM area starting at @addr, and remove it.
2161
+ * This function returns the found VM area, but using it is NOT safe
2162
+ * on SMP machines, except for its size or flags.
2163
+ *
2164
+ * Return: the area descriptor on success or %NULL on failure.
14852165 */
14862166 struct vm_struct *remove_vm_area(const void *addr)
14872167 {
....@@ -1489,14 +2169,13 @@
14892169
14902170 might_sleep();
14912171
1492
- va = find_vmap_area((unsigned long)addr);
1493
- if (va && va->flags & VM_VM_AREA) {
2172
+ spin_lock(&vmap_area_lock);
2173
+ va = __find_vmap_area((unsigned long)addr);
2174
+ if (va && va->vm) {
14942175 struct vm_struct *vm = va->vm;
14952176
1496
- spin_lock(&vmap_area_lock);
2177
+ trace_android_vh_remove_vmalloc_stack(vm);
14972178 va->vm = NULL;
1498
- va->flags &= ~VM_VM_AREA;
1499
- va->flags |= VM_LAZY_FREE;
15002179 spin_unlock(&vmap_area_lock);
15012180
15022181 kasan_free_shadow(vm);
....@@ -1504,7 +2183,66 @@
15042183
15052184 return vm;
15062185 }
2186
+
2187
+ spin_unlock(&vmap_area_lock);
15072188 return NULL;
2189
+}
2190
+
2191
+static inline void set_area_direct_map(const struct vm_struct *area,
2192
+ int (*set_direct_map)(struct page *page))
2193
+{
2194
+ int i;
2195
+
2196
+ for (i = 0; i < area->nr_pages; i++)
2197
+ if (page_address(area->pages[i]))
2198
+ set_direct_map(area->pages[i]);
2199
+}
2200
+
2201
+/* Handle removing and resetting vm mappings related to the vm_struct. */
2202
+static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
2203
+{
2204
+ unsigned long start = ULONG_MAX, end = 0;
2205
+ int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
2206
+ int flush_dmap = 0;
2207
+ int i;
2208
+
2209
+ remove_vm_area(area->addr);
2210
+
2211
+ /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
2212
+ if (!flush_reset)
2213
+ return;
2214
+
2215
+ /*
2216
+ * If not deallocating pages, just do the flush of the VM area and
2217
+ * return.
2218
+ */
2219
+ if (!deallocate_pages) {
2220
+ vm_unmap_aliases();
2221
+ return;
2222
+ }
2223
+
2224
+ /*
2225
+ * If execution gets here, flush the vm mapping and reset the direct
2226
+ * map. Find the start and end range of the direct mappings to make sure
2227
+ * the vm_unmap_aliases() flush includes the direct map.
2228
+ */
2229
+ for (i = 0; i < area->nr_pages; i++) {
2230
+ unsigned long addr = (unsigned long)page_address(area->pages[i]);
2231
+ if (addr) {
2232
+ start = min(addr, start);
2233
+ end = max(addr + PAGE_SIZE, end);
2234
+ flush_dmap = 1;
2235
+ }
2236
+ }
2237
+
2238
+ /*
2239
+ * Set direct map to something invalid so that it won't be cached if
2240
+ * there are any accesses after the TLB flush, then flush the TLB and
2241
+ * reset the direct map permissions to the default.
2242
+ */
2243
+ set_area_direct_map(area, set_direct_map_invalid_noflush);
2244
+ _vm_unmap_aliases(start, end, flush_dmap);
2245
+ set_area_direct_map(area, set_direct_map_default_noflush);
15082246 }
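A hedged sketch of the caller side this path serves: code that changes page permissions via the set_memory_*() helpers tags its allocation with VM_FLUSH_RESET_PERMS so that freeing resets and flushes the direct map as shown above. The demo_* name is hypothetical and PAGE_KERNEL_EXEC availability is architecture dependent.

#include <linux/vmalloc.h>
#include <linux/set_memory.h>

/* Illustrative only: executable allocation whose permissions vfree() resets. */
static void *demo_alloc_ro_exec(unsigned long size)
{
	unsigned long npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
	void *p = __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
				       GFP_KERNEL, PAGE_KERNEL_EXEC,
				       VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
				       __builtin_return_address(0));

	if (!p)
		return NULL;

	/* change the permissions; vfree() undoes the direct-map changes */
	set_memory_ro((unsigned long)p, npages);
	set_memory_x((unsigned long)p, npages);
	return p;
}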
15092247
15102248 static void __vunmap(const void *addr, int deallocate_pages)
....@@ -1528,7 +2266,10 @@
15282266 debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
15292267 debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
15302268
1531
- remove_vm_area(addr);
2269
+ kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
2270
+
2271
+ vm_remove_mappings(area, deallocate_pages);
2272
+
15322273 if (deallocate_pages) {
15332274 int i;
15342275
....@@ -1553,7 +2294,7 @@
15532294 * Use raw_cpu_ptr() because this can be called from preemptible
15542295 * context. Preemption is absolutely fine here, because the llist_add()
15552296 * implementation is lockless, so it works even if we are adding to
1556
- * nother cpu's list. schedule_work() should be fine with this too.
2297
+ * another cpu's list. schedule_work() should be fine with this too.
15572298 */
15582299 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
15592300
....@@ -1562,11 +2303,11 @@
15622303 }
15632304
15642305 /**
1565
- * vfree_atomic - release memory allocated by vmalloc()
1566
- * @addr: memory base address
2306
+ * vfree_atomic - release memory allocated by vmalloc()
2307
+ * @addr: memory base address
15672308 *
1568
- * This one is just like vfree() but can be called in any atomic context
1569
- * except NMIs.
2309
+ * This one is just like vfree() but can be called in any atomic context
2310
+ * except NMIs.
15702311 */
15712312 void vfree_atomic(const void *addr)
15722313 {
....@@ -1579,19 +2320,30 @@
15792320 __vfree_deferred(addr);
15802321 }
15812322
2323
+static void __vfree(const void *addr)
2324
+{
2325
+ if (unlikely(in_interrupt()))
2326
+ __vfree_deferred(addr);
2327
+ else
2328
+ __vunmap(addr, 1);
2329
+}
2330
+
15822331 /**
1583
- * vfree - release memory allocated by vmalloc()
1584
- * @addr: memory base address
2332
+ * vfree - Release memory allocated by vmalloc()
2333
+ * @addr: Memory base address
15852334 *
1586
- * Free the virtually continuous memory area starting at @addr, as
1587
- * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
1588
- * NULL, no operation is performed.
2335
+ * Free the virtually continuous memory area starting at @addr, as obtained
2336
+ * from one of the vmalloc() family of APIs. This will usually also free the
2337
+ * physical memory underlying the virtual allocation, but that memory is
2338
+ * reference counted, so it will not be freed until the last user goes away.
15892339 *
1590
- * Must not be called in NMI context (strictly speaking, only if we don't
1591
- * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
1592
- * conventions for vfree() arch-depenedent would be a really bad idea)
2340
+ * If @addr is NULL, no operation is performed.
15932341 *
1594
- * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
2342
+ * Context:
2343
+ * May sleep if called *not* from interrupt context.
2344
+ * Must not be called in NMI context (strictly speaking, it could be
2345
+ * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
2346
+ * conventions for vfree() arch-dependent would be a really bad idea).
15952347 */
15962348 void vfree(const void *addr)
15972349 {
....@@ -1599,23 +2351,23 @@
15992351
16002352 kmemleak_free(addr);
16012353
2354
+ might_sleep_if(!in_interrupt());
2355
+
16022356 if (!addr)
16032357 return;
1604
- if (unlikely(in_interrupt()))
1605
- __vfree_deferred(addr);
1606
- else
1607
- __vunmap(addr, 1);
2358
+
2359
+ __vfree(addr);
16082360 }
16092361 EXPORT_SYMBOL(vfree);
16102362
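A minimal allocate/use/free sketch (illustrative only). vfree() may sleep when called from process context, so code that might run in atomic (but never NMI) context would use vfree_atomic() instead.

#include <linux/errno.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

/* Illustrative only: the usual vmalloc()/vfree() pairing. */
static int demo_use_buffer(size_t size)
{
	char *buf = vmalloc(size);

	if (!buf)
		return -ENOMEM;

	memset(buf, 0, size);	/* the area is virtually contiguous */
	vfree(buf);		/* also releases the underlying pages */
	return 0;
}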
16112363 /**
1612
- * vunmap - release virtual mapping obtained by vmap()
1613
- * @addr: memory base address
2364
+ * vunmap - release virtual mapping obtained by vmap()
2365
+ * @addr: memory base address
16142366 *
1615
- * Free the virtually contiguous memory area starting at @addr,
1616
- * which was created from the page array passed to vmap().
2367
+ * Free the virtually contiguous memory area starting at @addr,
2368
+ * which was created from the page array passed to vmap().
16172369 *
1618
- * Must not be called in interrupt context.
2370
+ * Must not be called in interrupt context.
16192371 */
16202372 void vunmap(const void *addr)
16212373 {
....@@ -1627,24 +2379,29 @@
16272379 EXPORT_SYMBOL(vunmap);
16282380
16292381 /**
1630
- * vmap - map an array of pages into virtually contiguous space
1631
- * @pages: array of page pointers
1632
- * @count: number of pages to map
1633
- * @flags: vm_area->flags
1634
- * @prot: page protection for the mapping
2382
+ * vmap - map an array of pages into virtually contiguous space
2383
+ * @pages: array of page pointers
2384
+ * @count: number of pages to map
2385
+ * @flags: vm_area->flags
2386
+ * @prot: page protection for the mapping
16352387 *
1636
- * Maps @count pages from @pages into contiguous kernel virtual
1637
- * space.
2388
+ * Maps @count pages from @pages into contiguous kernel virtual space.
2389
+ * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
2390
+ * (which must be kmalloc or vmalloc memory) and one reference per page in it
2391
+ * are transferred from the caller to vmap(), and will be freed / dropped when
2392
+ * vfree() is called on the return value.
2393
+ *
2394
+ * Return: the address of the area or %NULL on failure
16382395 */
16392396 void *vmap(struct page **pages, unsigned int count,
1640
- unsigned long flags, pgprot_t prot)
2397
+ unsigned long flags, pgprot_t prot)
16412398 {
16422399 struct vm_struct *area;
16432400 unsigned long size; /* In bytes */
16442401
16452402 might_sleep();
16462403
1647
- if (count > totalram_pages)
2404
+ if (count > totalram_pages())
16482405 return NULL;
16492406
16502407 size = (unsigned long)count << PAGE_SHIFT;
....@@ -1652,36 +2409,85 @@
16522409 if (!area)
16532410 return NULL;
16542411
1655
- if (map_vm_area(area, prot, pages)) {
2412
+ if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
2413
+ pages) < 0) {
16562414 vunmap(area->addr);
16572415 return NULL;
16582416 }
16592417
2418
+ if (flags & VM_MAP_PUT_PAGES) {
2419
+ area->pages = pages;
2420
+ area->nr_pages = count;
2421
+ }
16602422 return area->addr;
16612423 }
16622424 EXPORT_SYMBOL(vmap);
16632425
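A sketch of the classic vmap() pattern with the caller keeping ownership of the page array; with VM_MAP_PUT_PAGES the array and the page references would instead be handed to vmap() and dropped by vfree(). Illustrative only, demo_vmap() is hypothetical.

#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Illustrative only: allocate @nr pages and make them virtually contiguous. */
static void *demo_vmap(unsigned int nr, struct page ***ret_pages)
{
	struct page **pages;
	void *addr;
	unsigned int i;

	pages = kcalloc(nr, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;

	for (i = 0; i < nr; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i])
			goto err;
	}

	addr = vmap(pages, nr, VM_MAP, PAGE_KERNEL);
	if (!addr)
		goto err;

	*ret_pages = pages;
	return addr;	/* undo with vunmap(), __free_page() each, kfree(pages) */
err:
	while (i--)
		__free_page(pages[i]);
	kfree(pages);
	return NULL;
}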
1664
-static void *__vmalloc_node(unsigned long size, unsigned long align,
1665
- gfp_t gfp_mask, pgprot_t prot,
1666
- int node, const void *caller);
2426
+#ifdef CONFIG_VMAP_PFN
2427
+struct vmap_pfn_data {
2428
+ unsigned long *pfns;
2429
+ pgprot_t prot;
2430
+ unsigned int idx;
2431
+};
2432
+
2433
+static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
2434
+{
2435
+ struct vmap_pfn_data *data = private;
2436
+
2437
+ if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
2438
+ return -EINVAL;
2439
+ *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
2440
+ return 0;
2441
+}
2442
+
2443
+/**
2444
+ * vmap_pfn - map an array of PFNs into virtually contiguous space
2445
+ * @pfns: array of PFNs
2446
+ * @count: number of pages to map
2447
+ * @prot: page protection for the mapping
2448
+ *
2449
+ * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
2450
+ * the start address of the mapping.
2451
+ */
2452
+void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
2453
+{
2454
+ struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
2455
+ struct vm_struct *area;
2456
+
2457
+ area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
2458
+ __builtin_return_address(0));
2459
+ if (!area)
2460
+ return NULL;
2461
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
2462
+ count * PAGE_SIZE, vmap_pfn_apply, &data)) {
2463
+ free_vm_area(area);
2464
+ return NULL;
2465
+ }
2466
+
2467
+ flush_cache_vmap((unsigned long)area->addr,
2468
+ (unsigned long)area->addr + count * PAGE_SIZE);
2469
+
2470
+ return area->addr;
2471
+}
2472
+EXPORT_SYMBOL_GPL(vmap_pfn);
2473
+#endif /* CONFIG_VMAP_PFN */
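A hedged sketch of a vmap_pfn() user, e.g. a driver holding raw PFNs of device memory (no struct page) that wants a single contiguous kernel mapping. The protection choice and the demo_* name are assumptions; the resulting mapping is typically released with vunmap().

#ifdef CONFIG_VMAP_PFN
#include <linux/vmalloc.h>

/* Illustrative only: map non-struct-page PFNs (pfn_valid() PFNs are rejected). */
static void *demo_map_device_pfns(unsigned long *pfns, unsigned int count)
{
	/* uncached protection is common for device memory; adjust as needed */
	return vmap_pfn(pfns, count, pgprot_noncached(PAGE_KERNEL));
}
#endif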
2474
+
16672475 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
16682476 pgprot_t prot, int node)
16692477 {
1670
- struct page **pages;
1671
- unsigned int nr_pages, array_size, i;
16722478 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
1673
- const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
1674
- const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
1675
- 0 :
1676
- __GFP_HIGHMEM;
2479
+ unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
2480
+ unsigned int array_size = nr_pages * sizeof(struct page *), i;
2481
+ struct page **pages;
16772482
1678
- nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
1679
- array_size = (nr_pages * sizeof(struct page *));
2483
+ gfp_mask |= __GFP_NOWARN;
2484
+ if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
2485
+ gfp_mask |= __GFP_HIGHMEM;
16802486
16812487 /* Please note that the recursion is strictly bounded. */
16822488 if (array_size > PAGE_SIZE) {
1683
- pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
1684
- PAGE_KERNEL, node, area->caller);
2489
+ pages = __vmalloc_node(array_size, 1, nested_gfp, node,
2490
+ area->caller);
16852491 } else {
16862492 pages = kmalloc_node(array_size, nested_gfp, node);
16872493 }
....@@ -1699,49 +2505,53 @@
16992505 struct page *page;
17002506
17012507 if (node == NUMA_NO_NODE)
1702
- page = alloc_page(alloc_mask|highmem_mask);
2508
+ page = alloc_page(gfp_mask);
17032509 else
1704
- page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
2510
+ page = alloc_pages_node(node, gfp_mask, 0);
17052511
17062512 if (unlikely(!page)) {
1707
- /* Successfully allocated i pages, free them in __vunmap() */
2513
+ /* Successfully allocated i pages, free them in __vfree() */
17082514 area->nr_pages = i;
17092515 atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
17102516 goto fail;
17112517 }
17122518 area->pages[i] = page;
1713
- if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
2519
+ if (gfpflags_allow_blocking(gfp_mask))
17142520 cond_resched();
17152521 }
17162522 atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
17172523
1718
- if (map_vm_area(area, prot, pages))
2524
+ if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
2525
+ prot, pages) < 0)
17192526 goto fail;
2527
+
17202528 return area->addr;
17212529
17222530 fail:
17232531 warn_alloc(gfp_mask, NULL,
17242532 "vmalloc: allocation failure, allocated %ld of %ld bytes",
17252533 (area->nr_pages*PAGE_SIZE), area->size);
1726
- vfree(area->addr);
2534
+ __vfree(area->addr);
17272535 return NULL;
17282536 }
17292537
17302538 /**
1731
- * __vmalloc_node_range - allocate virtually contiguous memory
1732
- * @size: allocation size
1733
- * @align: desired alignment
1734
- * @start: vm area range start
1735
- * @end: vm area range end
1736
- * @gfp_mask: flags for the page level allocator
1737
- * @prot: protection mask for the allocated pages
1738
- * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
1739
- * @node: node to use for allocation or NUMA_NO_NODE
1740
- * @caller: caller's return address
2539
+ * __vmalloc_node_range - allocate virtually contiguous memory
2540
+ * @size: allocation size
2541
+ * @align: desired alignment
2542
+ * @start: vm area range start
2543
+ * @end: vm area range end
2544
+ * @gfp_mask: flags for the page level allocator
2545
+ * @prot: protection mask for the allocated pages
2546
+ * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
2547
+ * @node: node to use for allocation or NUMA_NO_NODE
2548
+ * @caller: caller's return address
17412549 *
1742
- * Allocate enough pages to cover @size from the page level
1743
- * allocator with @gfp_mask flags. Map them into contiguous
1744
- * kernel virtual space, using a pagetable protection of @prot.
2550
+ * Allocate enough pages to cover @size from the page level
2551
+ * allocator with @gfp_mask flags. Map them into contiguous
2552
+ * kernel virtual space, using a pagetable protection of @prot.
2553
+ *
2554
+ * Return: the address of the area or %NULL on failure
17452555 */
17462556 void *__vmalloc_node_range(unsigned long size, unsigned long align,
17472557 unsigned long start, unsigned long end, gfp_t gfp_mask,
....@@ -1753,10 +2563,10 @@
17532563 unsigned long real_size = size;
17542564
17552565 size = PAGE_ALIGN(size);
1756
- if (!size || (size >> PAGE_SHIFT) > totalram_pages)
2566
+ if (!size || (size >> PAGE_SHIFT) > totalram_pages())
17572567 goto fail;
17582568
1759
- area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
2569
+ area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
17602570 vm_flags, start, end, node, gfp_mask, caller);
17612571 if (!area)
17622572 goto fail;
....@@ -1764,12 +2574,6 @@
17642574 addr = __vmalloc_area_node(area, gfp_mask, prot, node);
17652575 if (!addr)
17662576 return NULL;
1767
-
1768
- /*
1769
- * First make sure the mappings are removed from all page-tables
1770
- * before they are freed.
1771
- */
1772
- vmalloc_sync_unmappings();
17732577
17742578 /*
17752579 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
....@@ -1789,84 +2593,82 @@
17892593 }
17902594
17912595 /**
1792
- * __vmalloc_node - allocate virtually contiguous memory
1793
- * @size: allocation size
1794
- * @align: desired alignment
1795
- * @gfp_mask: flags for the page level allocator
1796
- * @prot: protection mask for the allocated pages
1797
- * @node: node to use for allocation or NUMA_NO_NODE
1798
- * @caller: caller's return address
2596
+ * __vmalloc_node - allocate virtually contiguous memory
2597
+ * @size: allocation size
2598
+ * @align: desired alignment
2599
+ * @gfp_mask: flags for the page level allocator
2600
+ * @node: node to use for allocation or NUMA_NO_NODE
2601
+ * @caller: caller's return address
17992602 *
1800
- * Allocate enough pages to cover @size from the page level
1801
- * allocator with @gfp_mask flags. Map them into contiguous
1802
- * kernel virtual space, using a pagetable protection of @prot.
2603
+ * Allocate enough pages to cover @size from the page level allocator with
2604
+ * @gfp_mask flags. Map them into contiguous kernel virtual space.
18032605 *
1804
- * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
1805
- * and __GFP_NOFAIL are not supported
2606
+ * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
2607
+ * and __GFP_NOFAIL are not supported
18062608 *
1807
- * Any use of gfp flags outside of GFP_KERNEL should be consulted
1808
- * with mm people.
2609
+ * Any use of gfp flags outside of GFP_KERNEL should be consulted
2610
+ * with mm people.
18092611 *
2612
+ * Return: pointer to the allocated memory or %NULL on error
18102613 */
1811
-static void *__vmalloc_node(unsigned long size, unsigned long align,
1812
- gfp_t gfp_mask, pgprot_t prot,
1813
- int node, const void *caller)
2614
+void *__vmalloc_node(unsigned long size, unsigned long align,
2615
+ gfp_t gfp_mask, int node, const void *caller)
18142616 {
18152617 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1816
- gfp_mask, prot, 0, node, caller);
2618
+ gfp_mask, PAGE_KERNEL, 0, node, caller);
18172619 }
2620
+/*
2621
+ * This is only for performance analysis of vmalloc and stress purpose.
2622
+ * It is required by vmalloc test module, therefore do not use it other
2623
+ * than that.
2624
+ */
2625
+#ifdef CONFIG_TEST_VMALLOC_MODULE
2626
+EXPORT_SYMBOL_GPL(__vmalloc_node);
2627
+#endif
18182628
1819
-void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
2629
+void *__vmalloc(unsigned long size, gfp_t gfp_mask)
18202630 {
1821
- return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
2631
+ return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
18222632 __builtin_return_address(0));
18232633 }
18242634 EXPORT_SYMBOL(__vmalloc);
18252635
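With the pgprot argument gone, __vmalloc() is now only about choosing the gfp mask (GFP_KERNEL | __GFP_ZERO would be what vzalloc() already gives you). A short, illustrative sketch with a hypothetical demo_* name:

#include <linux/vmalloc.h>

/* Illustrative only: __vmalloc() with an explicit gfp mask. */
static void *demo_alloc_quiet(unsigned long size)
{
	/* suppress the allocation-failure warning; the caller has a fallback */
	return __vmalloc(size, GFP_KERNEL | __GFP_NOWARN);
}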
1826
-static inline void *__vmalloc_node_flags(unsigned long size,
1827
- int node, gfp_t flags)
1828
-{
1829
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
1830
- node, __builtin_return_address(0));
1831
-}
1832
-
1833
-
1834
-void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
1835
- void *caller)
1836
-{
1837
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
1838
-}
1839
-
18402636 /**
1841
- * vmalloc - allocate virtually contiguous memory
1842
- * @size: allocation size
1843
- * Allocate enough pages to cover @size from the page level
1844
- * allocator and map them into contiguous kernel virtual space.
2637
+ * vmalloc - allocate virtually contiguous memory
2638
+ * @size: allocation size
18452639 *
1846
- * For tight control over page level allocator and protection flags
1847
- * use __vmalloc() instead.
2640
+ * Allocate enough pages to cover @size from the page level
2641
+ * allocator and map them into contiguous kernel virtual space.
2642
+ *
2643
+ * For tight control over page level allocator and protection flags
2644
+ * use __vmalloc() instead.
2645
+ *
2646
+ * Return: pointer to the allocated memory or %NULL on error
18482647 */
18492648 void *vmalloc(unsigned long size)
18502649 {
1851
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
1852
- GFP_KERNEL);
2650
+ return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
2651
+ __builtin_return_address(0));
18532652 }
18542653 EXPORT_SYMBOL(vmalloc);
18552654
18562655 /**
1857
- * vzalloc - allocate virtually contiguous memory with zero fill
1858
- * @size: allocation size
1859
- * Allocate enough pages to cover @size from the page level
1860
- * allocator and map them into contiguous kernel virtual space.
1861
- * The memory allocated is set to zero.
2656
+ * vzalloc - allocate virtually contiguous memory with zero fill
2657
+ * @size: allocation size
18622658 *
1863
- * For tight control over page level allocator and protection flags
1864
- * use __vmalloc() instead.
2659
+ * Allocate enough pages to cover @size from the page level
2660
+ * allocator and map them into contiguous kernel virtual space.
2661
+ * The memory allocated is set to zero.
2662
+ *
2663
+ * For tight control over page level allocator and protection flags
2664
+ * use __vmalloc() instead.
2665
+ *
2666
+ * Return: pointer to the allocated memory or %NULL on error
18652667 */
18662668 void *vzalloc(unsigned long size)
18672669 {
1868
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
1869
- GFP_KERNEL | __GFP_ZERO);
2670
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
2671
+ __builtin_return_address(0));
18702672 }
18712673 EXPORT_SYMBOL(vzalloc);
18722674
....@@ -1876,39 +2678,35 @@
18762678 *
18772679 * The resulting memory area is zeroed so it can be mapped to userspace
18782680 * without leaking data.
2681
+ *
2682
+ * Return: pointer to the allocated memory or %NULL on error
18792683 */
18802684 void *vmalloc_user(unsigned long size)
18812685 {
1882
- struct vm_struct *area;
1883
- void *ret;
1884
-
1885
- ret = __vmalloc_node(size, SHMLBA,
1886
- GFP_KERNEL | __GFP_ZERO,
1887
- PAGE_KERNEL, NUMA_NO_NODE,
1888
- __builtin_return_address(0));
1889
- if (ret) {
1890
- area = find_vm_area(ret);
1891
- area->flags |= VM_USERMAP;
1892
- }
1893
- return ret;
2686
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
2687
+ GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
2688
+ VM_USERMAP, NUMA_NO_NODE,
2689
+ __builtin_return_address(0));
18942690 }
18952691 EXPORT_SYMBOL(vmalloc_user);
18962692
18972693 /**
1898
- * vmalloc_node - allocate memory on a specific node
1899
- * @size: allocation size
1900
- * @node: numa node
2694
+ * vmalloc_node - allocate memory on a specific node
2695
+ * @size: allocation size
2696
+ * @node: numa node
19012697 *
1902
- * Allocate enough pages to cover @size from the page level
1903
- * allocator and map them into contiguous kernel virtual space.
2698
+ * Allocate enough pages to cover @size from the page level
2699
+ * allocator and map them into contiguous kernel virtual space.
19042700 *
1905
- * For tight control over page level allocator and protection flags
1906
- * use __vmalloc() instead.
2701
+ * For tight control over page level allocator and protection flags
2702
+ * use __vmalloc() instead.
2703
+ *
2704
+ * Return: pointer to the allocated memory or %NULL on error
19072705 */
19082706 void *vmalloc_node(unsigned long size, int node)
19092707 {
1910
- return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
1911
- node, __builtin_return_address(0));
2708
+ return __vmalloc_node(size, 1, GFP_KERNEL, node,
2709
+ __builtin_return_address(0));
19122710 }
19132711 EXPORT_SYMBOL(vmalloc_node);
19142712
....@@ -1921,33 +2719,14 @@
19212719 * allocator and map them into contiguous kernel virtual space.
19222720 * The memory allocated is set to zero.
19232721 *
1924
- * For tight control over page level allocator and protection flags
1925
- * use __vmalloc_node() instead.
2722
+ * Return: pointer to the allocated memory or %NULL on error
19262723 */
19272724 void *vzalloc_node(unsigned long size, int node)
19282725 {
1929
- return __vmalloc_node_flags(size, node,
1930
- GFP_KERNEL | __GFP_ZERO);
2726
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
2727
+ __builtin_return_address(0));
19312728 }
19322729 EXPORT_SYMBOL(vzalloc_node);
1933
-
1934
-/**
1935
- * vmalloc_exec - allocate virtually contiguous, executable memory
1936
- * @size: allocation size
1937
- *
1938
- * Kernel-internal function to allocate enough pages to cover @size
1939
- * the page level allocator and map them into contiguous and
1940
- * executable kernel virtual space.
1941
- *
1942
- * For tight control over page level allocator and protection flags
1943
- * use __vmalloc() instead.
1944
- */
1945
-
1946
-void *vmalloc_exec(unsigned long size)
1947
-{
1948
- return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
1949
- NUMA_NO_NODE, __builtin_return_address(0));
1950
-}
19512730
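With vmalloc_exec() removed, a caller that still needs an executable mapping open-codes the same thing through __vmalloc_node_range(); a hedged sketch, assuming PAGE_KERNEL_EXEC exists on the architecture (the VM_FLUSH_RESET_PERMS example earlier shows the fuller permission handling).

#include <linux/vmalloc.h>

/* Illustrative only: the open-coded equivalent of the removed helper. */
static void *demo_vmalloc_exec(unsigned long size)
{
	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
				    GFP_KERNEL, PAGE_KERNEL_EXEC, 0,
				    NUMA_NO_NODE, __builtin_return_address(0));
}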
19522731 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
19532732 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
....@@ -1962,38 +2741,36 @@
19622741 #endif
19632742
19642743 /**
1965
- * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
1966
- * @size: allocation size
2744
+ * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
2745
+ * @size: allocation size
19672746 *
1968
- * Allocate enough 32bit PA addressable pages to cover @size from the
1969
- * page level allocator and map them into contiguous kernel virtual space.
2747
+ * Allocate enough 32bit PA addressable pages to cover @size from the
2748
+ * page level allocator and map them into contiguous kernel virtual space.
2749
+ *
2750
+ * Return: pointer to the allocated memory or %NULL on error
19702751 */
19712752 void *vmalloc_32(unsigned long size)
19722753 {
1973
- return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
1974
- NUMA_NO_NODE, __builtin_return_address(0));
2754
+ return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
2755
+ __builtin_return_address(0));
19752756 }
19762757 EXPORT_SYMBOL(vmalloc_32);
19772758
19782759 /**
19792760 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
1980
- * @size: allocation size
2761
+ * @size: allocation size
19812762 *
19822763 * The resulting memory area is 32bit addressable and zeroed so it can be
19832764 * mapped to userspace without leaking data.
2765
+ *
2766
+ * Return: pointer to the allocated memory or %NULL on error
19842767 */
19852768 void *vmalloc_32_user(unsigned long size)
19862769 {
1987
- struct vm_struct *area;
1988
- void *ret;
1989
-
1990
- ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1991
- NUMA_NO_NODE, __builtin_return_address(0));
1992
- if (ret) {
1993
- area = find_vm_area(ret);
1994
- area->flags |= VM_USERMAP;
1995
- }
1996
- return ret;
2770
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
2771
+ GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
2772
+ VM_USERMAP, NUMA_NO_NODE,
2773
+ __builtin_return_address(0));
19972774 }
19982775 EXPORT_SYMBOL(vmalloc_32_user);
19992776
....@@ -2079,31 +2856,29 @@
20792856 }
20802857
20812858 /**
2082
- * vread() - read vmalloc area in a safe way.
2083
- * @buf: buffer for reading data
2084
- * @addr: vm address.
2085
- * @count: number of bytes to be read.
2859
+ * vread() - read vmalloc area in a safe way.
2860
+ * @buf: buffer for reading data
2861
+ * @addr: vm address.
2862
+ * @count: number of bytes to be read.
20862863 *
2087
- * Returns # of bytes which addr and buf should be increased.
2088
- * (same number to @count). Returns 0 if [addr...addr+count) doesn't
2089
- * includes any intersect with alive vmalloc area.
2864
+ * This function checks that addr is a valid vmalloc'ed area, and
2865
+ * copy data from that area to a given buffer. If the given memory range
2866
+ * of [addr...addr+count) includes some valid address, data is copied to
2867
+ * proper area of @buf. If there are memory holes, they'll be zero-filled.
2868
+ * IOREMAP area is treated as memory hole and no copy is done.
20902869 *
2091
- * This function checks that addr is a valid vmalloc'ed area, and
2092
- * copy data from that area to a given buffer. If the given memory range
2093
- * of [addr...addr+count) includes some valid address, data is copied to
2094
- * proper area of @buf. If there are memory holes, they'll be zero-filled.
2095
- * IOREMAP area is treated as memory hole and no copy is done.
2870
+ * If [addr...addr+count) doesn't includes any intersects with alive
2871
+ * vm_struct area, returns 0. @buf should be kernel's buffer.
20962872 *
2097
- * If [addr...addr+count) doesn't includes any intersects with alive
2098
- * vm_struct area, returns 0. @buf should be kernel's buffer.
2873
+ * Note: In usual ops, vread() is never necessary because the caller
2874
+ * should know vmalloc() area is valid and can use memcpy().
2875
+ * This is for routines which have to access vmalloc area without
2876
+ * any information, as /dev/kmem.
20992877 *
2100
- * Note: In usual ops, vread() is never necessary because the caller
2101
- * should know vmalloc() area is valid and can use memcpy().
2102
- * This is for routines which have to access vmalloc area without
2103
- * any informaion, as /dev/kmem.
2104
- *
2878
+ * Return: number of bytes for which addr and buf should be increased
2879
+ * (same number as @count) or %0 if [addr...addr+count) doesn't
2880
+ * include any intersection with valid vmalloc area
21052881 */
2106
-
21072882 long vread(char *buf, char *addr, unsigned long count)
21082883 {
21092884 struct vmap_area *va;
....@@ -2121,7 +2896,7 @@
21212896 if (!count)
21222897 break;
21232898
2124
- if (!(va->flags & VM_VM_AREA))
2899
+ if (!va->vm)
21252900 continue;
21262901
21272902 vm = va->vm;
....@@ -2160,31 +2935,29 @@
21602935 }
21612936
21622937 /**
2163
- * vwrite() - write vmalloc area in a safe way.
2164
- * @buf: buffer for source data
2165
- * @addr: vm address.
2166
- * @count: number of bytes to be read.
2938
+ * vwrite() - write vmalloc area in a safe way.
2939
+ * @buf: buffer for source data
2940
+ * @addr: vm address.
2941
+ * @count: number of bytes to be read.
21672942 *
2168
- * Returns # of bytes which addr and buf should be incresed.
2169
- * (same number to @count).
2170
- * If [addr...addr+count) doesn't includes any intersect with valid
2171
- * vmalloc area, returns 0.
2943
+ * This function checks that addr is a valid vmalloc'ed area, and
2944
+ * copy data from a buffer to the given addr. If specified range of
2945
+ * [addr...addr+count) includes some valid address, data is copied from
2946
+ * proper area of @buf. If there are memory holes, no copy to hole.
2947
+ * IOREMAP area is treated as memory hole and no copy is done.
21722948 *
2173
- * This function checks that addr is a valid vmalloc'ed area, and
2174
- * copy data from a buffer to the given addr. If specified range of
2175
- * [addr...addr+count) includes some valid address, data is copied from
2176
- * proper area of @buf. If there are memory holes, no copy to hole.
2177
- * IOREMAP area is treated as memory hole and no copy is done.
2949
+ * If [addr...addr+count) doesn't include any intersection with an alive
2950
+ * vm_struct area, returns 0. @buf should be kernel's buffer.
21782951 *
2179
- * If [addr...addr+count) doesn't includes any intersects with alive
2180
- * vm_struct area, returns 0. @buf should be kernel's buffer.
2952
+ * Note: In usual ops, vwrite() is never necessary because the caller
2953
+ * should know vmalloc() area is valid and can use memcpy().
2954
+ * This is for routines which have to access vmalloc area without
2955
+ * any information, as /dev/kmem.
21812956 *
2182
- * Note: In usual ops, vwrite() is never necessary because the caller
2183
- * should know vmalloc() area is valid and can use memcpy().
2184
- * This is for routines which have to access vmalloc area without
2185
- * any informaion, as /dev/kmem.
2957
+ * Return: number of bytes for which addr and buf should be
2958
+ * increased (same number as @count) or %0 if [addr...addr+count)
2959
+ * doesn't include any intersection with valid vmalloc area
21862960 */
2187
-
21882961 long vwrite(char *buf, char *addr, unsigned long count)
21892962 {
21902963 struct vmap_area *va;
....@@ -2203,7 +2976,7 @@
22032976 if (!count)
22042977 break;
22052978
2206
- if (!(va->flags & VM_VM_AREA))
2979
+ if (!va->vm)
22072980 continue;
22082981
22092982 vm = va->vm;
....@@ -2236,21 +3009,21 @@
22363009 }
22373010
22383011 /**
2239
- * remap_vmalloc_range_partial - map vmalloc pages to userspace
2240
- * @vma: vma to cover
2241
- * @uaddr: target user address to start at
2242
- * @kaddr: virtual address of vmalloc kernel memory
2243
- * @pgoff: offset from @kaddr to start at
2244
- * @size: size of map area
3012
+ * remap_vmalloc_range_partial - map vmalloc pages to userspace
3013
+ * @vma: vma to cover
3014
+ * @uaddr: target user address to start at
3015
+ * @kaddr: virtual address of vmalloc kernel memory
3016
+ * @pgoff: offset from @kaddr to start at
3017
+ * @size: size of map area
22453018 *
2246
- * Returns: 0 for success, -Exxx on failure
3019
+ * Returns: 0 for success, -Exxx on failure
22473020 *
2248
- * This function checks that @kaddr is a valid vmalloc'ed area,
2249
- * and that it is big enough to cover the range starting at
2250
- * @uaddr in @vma. Will return failure if that criteria isn't
2251
- * met.
3021
+ * This function checks that @kaddr is a valid vmalloc'ed area,
3022
+ * and that it is big enough to cover the range starting at
3023
+ * @uaddr in @vma. Will return failure if that criterion isn't
3024
+ * met.
22523025 *
2253
- * Similar to remap_pfn_range() (see mm/memory.c)
3026
+ * Similar to remap_pfn_range() (see mm/memory.c)
22543027 */
22553028 int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
22563029 void *kaddr, unsigned long pgoff,
....@@ -2272,7 +3045,7 @@
22723045 if (!area)
22733046 return -EINVAL;
22743047
2275
- if (!(area->flags & VM_USERMAP))
3048
+ if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
22763049 return -EINVAL;
22773050
22783051 if (check_add_overflow(size, off, &end_index) ||
....@@ -2300,18 +3073,18 @@
23003073 EXPORT_SYMBOL(remap_vmalloc_range_partial);
23013074
23023075 /**
2303
- * remap_vmalloc_range - map vmalloc pages to userspace
2304
- * @vma: vma to cover (map full range of vma)
2305
- * @addr: vmalloc memory
2306
- * @pgoff: number of pages into addr before first page to map
3076
+ * remap_vmalloc_range - map vmalloc pages to userspace
3077
+ * @vma: vma to cover (map full range of vma)
3078
+ * @addr: vmalloc memory
3079
+ * @pgoff: number of pages into addr before first page to map
23073080 *
2308
- * Returns: 0 for success, -Exxx on failure
3081
+ * Returns: 0 for success, -Exxx on failure
23093082 *
2310
- * This function checks that addr is a valid vmalloc'ed area, and
2311
- * that it is big enough to cover the vma. Will return failure if
2312
- * that criteria isn't met.
3083
+ * This function checks that addr is a valid vmalloc'ed area, and
3084
+ * that it is big enough to cover the vma. Will return failure if
3085
+ * that criterion isn't met.
23133086 *
2314
- * Similar to remap_pfn_range() (see mm/memory.c)
3087
+ * Similar to remap_pfn_range() (see mm/memory.c)
23153088 */
23163089 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
23173090 unsigned long pgoff)
....@@ -2321,69 +3094,6 @@
23213094 vma->vm_end - vma->vm_start);
23223095 }
23233096 EXPORT_SYMBOL(remap_vmalloc_range);
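A sketch of the typical consumer: a character-device .mmap handler exposing a buffer that was allocated with vmalloc_user() (zeroed and tagged VM_USERMAP). Illustrative only; demo_buf and demo_mmap stand in for driver-private state.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *demo_buf;	/* allocated elsewhere with vmalloc_user() */

/* Illustrative only: map the whole vmalloc_user() buffer into the vma. */
static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* the vma must not be larger than the buffer; 0 = start of demo_buf */
	return remap_vmalloc_range(vma, demo_buf, 0);
}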
2324
-
2325
-/*
2326
- * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose
2327
- * not to have one.
2328
- *
2329
- * The purpose of this function is to make sure the vmalloc area
2330
- * mappings are identical in all page-tables in the system.
2331
- */
2332
-void __weak vmalloc_sync_mappings(void)
2333
-{
2334
-}
2335
-
2336
-void __weak vmalloc_sync_unmappings(void)
2337
-{
2338
-}
2339
-
2340
-static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
2341
-{
2342
- pte_t ***p = data;
2343
-
2344
- if (p) {
2345
- *(*p) = pte;
2346
- (*p)++;
2347
- }
2348
- return 0;
2349
-}
2350
-
2351
-/**
2352
- * alloc_vm_area - allocate a range of kernel address space
2353
- * @size: size of the area
2354
- * @ptes: returns the PTEs for the address space
2355
- *
2356
- * Returns: NULL on failure, vm_struct on success
2357
- *
2358
- * This function reserves a range of kernel address space, and
2359
- * allocates pagetables to map that range. No actual mappings
2360
- * are created.
2361
- *
2362
- * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
2363
- * allocated for the VM area are returned.
2364
- */
2365
-struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
2366
-{
2367
- struct vm_struct *area;
2368
-
2369
- area = get_vm_area_caller(size, VM_IOREMAP,
2370
- __builtin_return_address(0));
2371
- if (area == NULL)
2372
- return NULL;
2373
-
2374
- /*
2375
- * This ensures that page tables are constructed for this region
2376
- * of kernel virtual address space and mapped into init_mm.
2377
- */
2378
- if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
2379
- size, f, ptes ? &ptes : NULL)) {
2380
- free_vm_area(area);
2381
- return NULL;
2382
- }
2383
-
2384
- return area;
2385
-}
2386
-EXPORT_SYMBOL_GPL(alloc_vm_area);
23873097
23883098 void free_vm_area(struct vm_struct *area)
23893099 {
....@@ -2401,81 +3111,64 @@
24013111 }
24023112
24033113 /**
2404
- * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
2405
- * @end: target address
2406
- * @pnext: out arg for the next vmap_area
2407
- * @pprev: out arg for the previous vmap_area
3114
+ * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
3115
+ * @addr: target address
24083116 *
2409
- * Returns: %true if either or both of next and prev are found,
2410
- * %false if no vmap_area exists
2411
- *
2412
- * Find vmap_areas end addresses of which enclose @end. ie. if not
2413
- * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
3117
+ * Returns: vmap_area if it is found. If there is no such area,
3118
+ * the first highest (reverse order) vmap_area is returned,
3119
+ * i.e. va->va_start < addr && va->va_end < addr, or NULL
3120
+ * if there are no areas before @addr.
24143121 */
2415
-static bool pvm_find_next_prev(unsigned long end,
2416
- struct vmap_area **pnext,
2417
- struct vmap_area **pprev)
3122
+static struct vmap_area *
3123
+pvm_find_va_enclose_addr(unsigned long addr)
24183124 {
2419
- struct rb_node *n = vmap_area_root.rb_node;
2420
- struct vmap_area *va = NULL;
3125
+ struct vmap_area *va, *tmp;
3126
+ struct rb_node *n;
3127
+
3128
+ n = free_vmap_area_root.rb_node;
3129
+ va = NULL;
24213130
24223131 while (n) {
2423
- va = rb_entry(n, struct vmap_area, rb_node);
2424
- if (end < va->va_end)
2425
- n = n->rb_left;
2426
- else if (end > va->va_end)
3132
+ tmp = rb_entry(n, struct vmap_area, rb_node);
3133
+ if (tmp->va_start <= addr) {
3134
+ va = tmp;
3135
+ if (tmp->va_end >= addr)
3136
+ break;
3137
+
24273138 n = n->rb_right;
2428
- else
2429
- break;
3139
+ } else {
3140
+ n = n->rb_left;
3141
+ }
24303142 }
24313143
2432
- if (!va)
2433
- return false;
2434
-
2435
- if (va->va_end > end) {
2436
- *pnext = va;
2437
- *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2438
- } else {
2439
- *pprev = va;
2440
- *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
2441
- }
2442
- return true;
3144
+ return va;
24433145 }
24443146
24453147 /**
2446
- * pvm_determine_end - find the highest aligned address between two vmap_areas
2447
- * @pnext: in/out arg for the next vmap_area
2448
- * @pprev: in/out arg for the previous vmap_area
2449
- * @align: alignment
3148
+ * pvm_determine_end_from_reverse - find the highest aligned address
3149
+ * of a free block below VMALLOC_END
3150
+ * @va:
3151
+ * in - the VA we start the search from (reverse order);
3152
+ * out - the VA with the highest aligned end address.
24503153 *
2451
- * Returns: determined end address
2452
- *
2453
- * Find the highest aligned address between *@pnext and *@pprev below
2454
- * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
2455
- * down address is between the end addresses of the two vmap_areas.
2456
- *
2457
- * Please note that the address returned by this function may fall
2458
- * inside *@pnext vmap_area. The caller is responsible for checking
2459
- * that.
3154
+ * Returns: determined end address within vmap_area
24603155 */
2461
-static unsigned long pvm_determine_end(struct vmap_area **pnext,
2462
- struct vmap_area **pprev,
2463
- unsigned long align)
3156
+static unsigned long
3157
+pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
24643158 {
2465
- const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
3159
+ unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
24663160 unsigned long addr;
24673161
2468
- if (*pnext)
2469
- addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
2470
- else
2471
- addr = vmalloc_end;
2472
-
2473
- while (*pprev && (*pprev)->va_end > addr) {
2474
- *pnext = *pprev;
2475
- *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
3162
+ if (likely(*va)) {
3163
+ list_for_each_entry_from_reverse((*va),
3164
+ &free_vmap_area_list, list) {
3165
+ addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
3166
+ if ((*va)->va_start < addr)
3167
+ return addr;
3168
+ }
24763169 }
24773170
2478
- return addr;
3171
+ return 0;
24793172 }
24803173
24813174 /**
....@@ -2495,12 +3188,12 @@
24953188 * to gigabytes. To avoid interacting with regular vmallocs, these
24963189 * areas are allocated from top.
24973190 *
2498
- * Despite its complicated look, this allocator is rather simple. It
2499
- * does everything top-down and scans areas from the end looking for
2500
- * matching slot. While scanning, if any of the areas overlaps with
2501
- * existing vmap_area, the base address is pulled down to fit the
2502
- * area. Scanning is repeated till all the areas fit and then all
2503
- * necessary data structures are inserted and the result is returned.
3191
+ * Despite its complicated look, this allocator is rather simple. It
3192
+ * does everything top-down and scans free blocks from the end looking
3193
+ * for a matching base. While scanning, if any of the areas do not fit, the
3194
+ * base address is pulled down to fit the area. Scanning is repeated till
3195
+ * all the areas fit and then all necessary data structures are inserted
3196
+ * and the result is returned.
25043197 */
25053198 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
25063199 const size_t *sizes, int nr_vms,
....@@ -2508,11 +3201,12 @@
25083201 {
25093202 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
25103203 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2511
- struct vmap_area **vas, *prev, *next;
3204
+ struct vmap_area **vas, *va;
25123205 struct vm_struct **vms;
25133206 int area, area2, last_area, term_area;
2514
- unsigned long base, start, end, last_end;
3207
+ unsigned long base, start, size, end, last_end, orig_start, orig_end;
25153208 bool purged = false;
3209
+ enum fit_type type;
25163210
25173211 /* verify parameters and allocate data structures */
25183212 BUG_ON(offset_in_page(align) || !is_power_of_2(align));
....@@ -2548,62 +3242,52 @@
25483242 goto err_free2;
25493243
25503244 for (area = 0; area < nr_vms; area++) {
2551
- vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
3245
+ vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
25523246 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
25533247 if (!vas[area] || !vms[area])
25543248 goto err_free;
25553249 }
25563250 retry:
2557
- spin_lock(&vmap_area_lock);
3251
+ spin_lock(&free_vmap_area_lock);
25583252
25593253 /* start scanning - we scan from the top, begin with the last area */
25603254 area = term_area = last_area;
25613255 start = offsets[area];
25623256 end = start + sizes[area];
25633257
2564
- if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
2565
- base = vmalloc_end - last_end;
2566
- goto found;
2567
- }
2568
- base = pvm_determine_end(&next, &prev, align) - end;
3258
+ va = pvm_find_va_enclose_addr(vmalloc_end);
3259
+ base = pvm_determine_end_from_reverse(&va, align) - end;
25693260
25703261 while (true) {
2571
- BUG_ON(next && next->va_end <= base + end);
2572
- BUG_ON(prev && prev->va_end > base + end);
2573
-
25743262 /*
25753263 * base might have underflowed, add last_end before
25763264 * comparing.
25773265 */
2578
- if (base + last_end < vmalloc_start + last_end) {
2579
- spin_unlock(&vmap_area_lock);
2580
- if (!purged) {
2581
- purge_vmap_area_lazy();
2582
- purged = true;
2583
- goto retry;
2584
- }
2585
- goto err_free;
2586
- }
3266
+ if (base + last_end < vmalloc_start + last_end)
3267
+ goto overflow;
25873268
25883269 /*
2589
- * If next overlaps, move base downwards so that it's
2590
- * right below next and then recheck.
3270
+ * Fitting base has not been found.
25913271 */
2592
- if (next && next->va_start < base + end) {
2593
- base = pvm_determine_end(&next, &prev, align) - end;
3272
+ if (va == NULL)
3273
+ goto overflow;
3274
+
3275
+ /*
3276
+ * If required width exceeds current VA block, move
3277
+ * base downwards and then recheck.
3278
+ */
3279
+ if (base + end > va->va_end) {
3280
+ base = pvm_determine_end_from_reverse(&va, align) - end;
25943281 term_area = area;
25953282 continue;
25963283 }
25973284
25983285 /*
2599
- * If prev overlaps, shift down next and prev and move
2600
- * base so that it's right below new next and then
2601
- * recheck.
3286
+ * If this VA does not fit, move base downwards and recheck.
26023287 */
2603
- if (prev && prev->va_end > base + start) {
2604
- next = prev;
2605
- prev = node_to_va(rb_prev(&next->rb_node));
2606
- base = pvm_determine_end(&next, &prev, align) - end;
3288
+ if (base + start < va->va_start) {
3289
+ va = node_to_va(rb_prev(&va->rb_node));
3290
+ base = pvm_determine_end_from_reverse(&va, align) - end;
26073291 term_area = area;
26083292 continue;
26093293 }
....@@ -2615,38 +3299,132 @@
26153299 area = (area + nr_vms - 1) % nr_vms;
26163300 if (area == term_area)
26173301 break;
3302
+
26183303 start = offsets[area];
26193304 end = start + sizes[area];
2620
- pvm_find_next_prev(base + end, &next, &prev);
3305
+ va = pvm_find_va_enclose_addr(base + end);
26213306 }
2622
-found:
3307
+
26233308 /* we've found a fitting base, insert all va's */
26243309 for (area = 0; area < nr_vms; area++) {
2625
- struct vmap_area *va = vas[area];
3310
+ int ret;
26263311
2627
- va->va_start = base + offsets[area];
2628
- va->va_end = va->va_start + sizes[area];
2629
- __insert_vmap_area(va);
3312
+ start = base + offsets[area];
3313
+ size = sizes[area];
3314
+
3315
+ va = pvm_find_va_enclose_addr(start);
3316
+ if (WARN_ON_ONCE(va == NULL))
3317
+ /* It is a BUG(), but trigger recovery instead. */
3318
+ goto recovery;
3319
+
3320
+ type = classify_va_fit_type(va, start, size);
3321
+ if (WARN_ON_ONCE(type == NOTHING_FIT))
3322
+ /* It is a BUG(), but trigger recovery instead. */
3323
+ goto recovery;
3324
+
3325
+ ret = adjust_va_to_fit_type(va, start, size, type);
3326
+ if (unlikely(ret))
3327
+ goto recovery;
3328
+
3329
+ /* Allocated area. */
3330
+ va = vas[area];
3331
+ va->va_start = start;
3332
+ va->va_end = start + size;
26303333 }
26313334
2632
- vmap_area_pcpu_hole = base + offsets[last_area];
3335
+ spin_unlock(&free_vmap_area_lock);
26333336
2634
- spin_unlock(&vmap_area_lock);
3337
+ /* populate the kasan shadow space */
3338
+ for (area = 0; area < nr_vms; area++) {
3339
+ if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
3340
+ goto err_free_shadow;
3341
+
3342
+ kasan_unpoison_vmalloc((void *)vas[area]->va_start,
3343
+ sizes[area]);
3344
+ }
26353345
26363346 /* insert all vm's */
2637
- for (area = 0; area < nr_vms; area++)
2638
- setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
3347
+ spin_lock(&vmap_area_lock);
3348
+ for (area = 0; area < nr_vms; area++) {
3349
+ insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
3350
+
3351
+ setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
26393352 pcpu_get_vm_areas);
3353
+ }
3354
+ spin_unlock(&vmap_area_lock);
26403355
26413356 kfree(vas);
26423357 return vms;
26433358
3359
+recovery:
3360
+ /*
3361
+ * Remove previously allocated areas. There is no
3362
+ * need in removing these areas from the busy tree,
3363
+ * because they are inserted only on the final step
3364
+ * and when pcpu_get_vm_areas() is success.
3365
+ */
3366
+ while (area--) {
3367
+ orig_start = vas[area]->va_start;
3368
+ orig_end = vas[area]->va_end;
3369
+ va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
3370
+ &free_vmap_area_list);
3371
+ if (va)
3372
+ kasan_release_vmalloc(orig_start, orig_end,
3373
+ va->va_start, va->va_end);
3374
+ vas[area] = NULL;
3375
+ }
3376
+
3377
+overflow:
3378
+ spin_unlock(&free_vmap_area_lock);
3379
+ if (!purged) {
3380
+ purge_vmap_area_lazy();
3381
+ purged = true;
3382
+
3383
+ /* Before "retry", check if we recover. */
3384
+ for (area = 0; area < nr_vms; area++) {
3385
+ if (vas[area])
3386
+ continue;
3387
+
3388
+ vas[area] = kmem_cache_zalloc(
3389
+ vmap_area_cachep, GFP_KERNEL);
3390
+ if (!vas[area])
3391
+ goto err_free;
3392
+ }
3393
+
3394
+ goto retry;
3395
+ }
3396
+
26443397 err_free:
26453398 for (area = 0; area < nr_vms; area++) {
2646
- kfree(vas[area]);
3399
+ if (vas[area])
3400
+ kmem_cache_free(vmap_area_cachep, vas[area]);
3401
+
26473402 kfree(vms[area]);
26483403 }
26493404 err_free2:
3405
+ kfree(vas);
3406
+ kfree(vms);
3407
+ return NULL;
3408
+
3409
+err_free_shadow:
3410
+ spin_lock(&free_vmap_area_lock);
3411
+ /*
3412
+ * We release all the vmalloc shadows, even the ones for regions that
3413
+ * hadn't been successfully added. This relies on kasan_release_vmalloc
3414
+ * being able to tolerate this case.
3415
+ */
3416
+ for (area = 0; area < nr_vms; area++) {
3417
+ orig_start = vas[area]->va_start;
3418
+ orig_end = vas[area]->va_end;
3419
+ va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
3420
+ &free_vmap_area_list);
3421
+ if (va)
3422
+ kasan_release_vmalloc(orig_start, orig_end,
3423
+ va->va_start, va->va_end);
3424
+ vas[area] = NULL;
3425
+ kfree(vms[area]);
3426
+ }
3427
+ spin_unlock(&free_vmap_area_lock);
26503428 kfree(vas);
26513429 kfree(vms);
26523430 return NULL;
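In the insertion loop above, every start address is resolved to the free vmap area enclosing it, after which classify_va_fit_type() and adjust_va_to_fit_type() carve the request out of that free area; if either step fails, the recovery path hands any partially carved areas back to the free tree through merge_or_add_vmap_area(). The standalone user-space sketch below models only the classification step; the enum and function names are invented for illustration and are not the kernel's.

#include <stdio.h>

enum fit { NO_FIT, FULL_FIT, LEFT_FIT, RIGHT_FIT, SPLIT_FIT };

/*
 * How does the request [nva_start, nva_start + size) relate to the
 * free area [va_start, va_end)?
 */
static enum fit classify(unsigned long va_start, unsigned long va_end,
                         unsigned long nva_start, unsigned long size)
{
        unsigned long nva_end = nva_start + size;

        if (nva_start < va_start || nva_end > va_end)
                return NO_FIT;
        if (nva_start == va_start)
                return nva_end == va_end ? FULL_FIT : LEFT_FIT;
        if (nva_end == va_end)
                return RIGHT_FIT;

        /* Strictly inside: the free area must be split around the request. */
        return SPLIT_FIT;
}

int main(void)
{
        /* A 4 KiB request carved out of the middle of a 16 KiB free area. */
        printf("%d\n", classify(0x1000, 0x5000, 0x2000, 0x1000)); /* 4 == SPLIT_FIT */
        return 0;
}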
....@@ -2671,9 +3449,12 @@
26713449
26723450 #ifdef CONFIG_PROC_FS
26733451 static void *s_start(struct seq_file *m, loff_t *pos)
3452
+ __acquires(&vmap_purge_lock)
26743453 __acquires(&vmap_area_lock)
26753454 {
3455
+ mutex_lock(&vmap_purge_lock);
26763456 spin_lock(&vmap_area_lock);
3457
+
26773458 return seq_list_start(&vmap_area_list, *pos);
26783459 }
26793460
....@@ -2684,8 +3465,10 @@
26843465
26853466 static void s_stop(struct seq_file *m, void *p)
26863467 __releases(&vmap_area_lock)
3468
+ __releases(&vmap_purge_lock)
26873469 {
26883470 spin_unlock(&vmap_area_lock);
3471
+ mutex_unlock(&vmap_purge_lock);
26893472 }
26903473
26913474 static void show_numa_info(struct seq_file *m, struct vm_struct *v)
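Reconstructed from the two hunks above as a sketch (the sparse __acquires/__releases annotations are omitted, so this is not a literal excerpt): the /proc/vmallocinfo iterator now takes vmap_purge_lock before vmap_area_lock in ->start() and releases the two in reverse order in ->stop(), keeping the purge list stable for the whole dump.

static void *s_start(struct seq_file *m, loff_t *pos)
{
        mutex_lock(&vmap_purge_lock);   /* outer lock, may sleep          */
        spin_lock(&vmap_area_lock);     /* inner lock, no sleeping inside */

        return seq_list_start(&vmap_area_list, *pos);
}

static void s_stop(struct seq_file *m, void *p)
{
        spin_unlock(&vmap_area_lock);   /* release in reverse order */
        mutex_unlock(&vmap_purge_lock);
}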
....@@ -2712,6 +3495,22 @@
27123495 }
27133496 }
27143497
3498
+static void show_purge_info(struct seq_file *m)
3499
+{
3500
+ struct llist_node *head;
3501
+ struct vmap_area *va;
3502
+
3503
+ head = READ_ONCE(vmap_purge_list.first);
3504
+ if (head == NULL)
3505
+ return;
3506
+
3507
+ llist_for_each_entry(va, head, purge_list) {
3508
+ seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
3509
+ (void *)va->va_start, (void *)va->va_end,
3510
+ va->va_end - va->va_start);
3511
+ }
3512
+}
3513
+
27153514 static int s_show(struct seq_file *m, void *p)
27163515 {
27173516 struct vmap_area *va;
....@@ -2720,14 +3519,13 @@
27203519 va = list_entry(p, struct vmap_area, list);
27213520
27223521 /*
2723
- * s_show can encounter race with remove_vm_area, !VM_VM_AREA on
2724
- * behalf of vmap area is being tear down or vm_map_ram allocation.
3522
+ * s_show can encounter a race with remove_vm_area(); !vm means
3523
+ * the vmap area is being torn down or is a vm_map_ram allocation.
27253524 */
2726
- if (!(va->flags & VM_VM_AREA)) {
2727
- seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
3525
+ if (!va->vm) {
3526
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
27283527 (void *)va->va_start, (void *)va->va_end,
2729
- va->va_end - va->va_start,
2730
- va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
3528
+ va->va_end - va->va_start);
27313529
27323530 return 0;
27333531 }
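With the rework above, any busy vmap area that has no struct vm_struct attached (either a vm_map_ram allocation or an area in the middle of teardown) is simply labelled vm_map_ram. An illustrative /proc/vmallocinfo line, with made-up address values and size:

0x00000000a1b2c3d4-0x00000000a1b4c3d4  131072 vm_map_ram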
....@@ -2758,11 +3556,25 @@
27583556 if (v->flags & VM_USERMAP)
27593557 seq_puts(m, " user");
27603558
3559
+ if (v->flags & VM_DMA_COHERENT)
3560
+ seq_puts(m, " dma-coherent");
3561
+
27613562 if (is_vmalloc_addr(v->pages))
27623563 seq_puts(m, " vpages");
27633564
27643565 show_numa_info(m, v);
3566
+ trace_android_vh_show_stack_hash(m, v);
27653567 seq_putc(m, '\n');
3568
+
3569
+ /*
3570
+ * As a final step, dump "unpurged" areas. Note that
3571
+ * the entire "/proc/vmallocinfo" output will not
3572
+ * be address sorted, because the purge list is not
3573
+ * sorted.
3574
+ */
3575
+ if (list_is_last(&va->list, &vmap_area_list))
3576
+ show_purge_info(m);
3577
+
27663578 return 0;
27673579 }
27683580
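Taken together with the previous hunk, the tail of /proc/vmallocinfo now carries the extra dma-coherent flag word for VM_DMA_COHERENT mappings and, after the last regular (address-sorted) entry, the unpurged ranges emitted by show_purge_info(), which are not necessarily in address order. A made-up excerpt; addresses, sizes and the caller offset are illustrative, and other flag words a real entry may carry are omitted:

0x00000000c3a1f000-0x00000000c3a30000   69632 dma_common_pages_remap+0x58/0x80 dma-coherent
0x00000000f0a20000-0x00000000f0a22000    8192 unpurged vm_area
0x00000000e8c40000-0x00000000e8c48000   32768 unpurged vm_area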
....@@ -2786,4 +3598,3 @@
27863598 module_init(proc_vmalloc_init);
27873599
27883600 #endif
2789
-