2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/mm/vmalloc.c
....@@ -1,11 +1,11 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
2
- * linux/mm/vmalloc.c
3
- *
43 * Copyright (C) 1993 Linus Torvalds
54 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
65 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
76 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
87 * Numa awareness, Christoph Lameter, SGI, June 2005
8
+ * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
99 */
1010
1111 #include <linux/vmalloc.h>
....@@ -18,12 +18,13 @@
1818 #include <linux/interrupt.h>
1919 #include <linux/proc_fs.h>
2020 #include <linux/seq_file.h>
21
+#include <linux/set_memory.h>
2122 #include <linux/debugobjects.h>
2223 #include <linux/kallsyms.h>
2324 #include <linux/list.h>
2425 #include <linux/notifier.h>
2526 #include <linux/rbtree.h>
26
-#include <linux/radix-tree.h>
27
+#include <linux/xarray.h>
2728 #include <linux/rcupdate.h>
2829 #include <linux/pfn.h>
2930 #include <linux/kmemleak.h>
....@@ -31,13 +32,24 @@
3132 #include <linux/compiler.h>
3233 #include <linux/llist.h>
3334 #include <linux/bitops.h>
35
+#include <linux/rbtree_augmented.h>
3436 #include <linux/overflow.h>
37
+#include <trace/hooks/mm.h>
3538
3639 #include <linux/uaccess.h>
3740 #include <asm/tlbflush.h>
3841 #include <asm/shmparam.h>
3942
4043 #include "internal.h"
44
+#include "pgalloc-track.h"
45
+
46
+bool is_vmalloc_addr(const void *x)
47
+{
48
+ unsigned long addr = (unsigned long)x;
49
+
50
+ return addr >= VMALLOC_START && addr < VMALLOC_END;
51
+}
52
+EXPORT_SYMBOL(is_vmalloc_addr);
4153
4254 struct vfree_deferred {
4355 struct llist_head list;
....@@ -58,7 +70,8 @@
5870
5971 /*** Page table manipulation functions ***/
6072
61
-static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
73
+static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
74
+ pgtbl_mod_mask *mask)
6275 {
6376 pte_t *pte;
6477
....@@ -67,73 +80,119 @@
6780 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
6881 WARN_ON(!pte_none(ptent) && !pte_present(ptent));
6982 } while (pte++, addr += PAGE_SIZE, addr != end);
83
+ *mask |= PGTBL_PTE_MODIFIED;
7084 }
7185
72
-static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
86
+static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
87
+ pgtbl_mod_mask *mask)
7388 {
7489 pmd_t *pmd;
7590 unsigned long next;
91
+ int cleared;
7692
7793 pmd = pmd_offset(pud, addr);
7894 do {
7995 next = pmd_addr_end(addr, end);
80
- if (pmd_clear_huge(pmd))
96
+
97
+ cleared = pmd_clear_huge(pmd);
98
+ if (cleared || pmd_bad(*pmd))
99
+ *mask |= PGTBL_PMD_MODIFIED;
100
+
101
+ if (cleared)
81102 continue;
82103 if (pmd_none_or_clear_bad(pmd))
83104 continue;
84
- vunmap_pte_range(pmd, addr, next);
105
+ vunmap_pte_range(pmd, addr, next, mask);
106
+
107
+ cond_resched();
85108 } while (pmd++, addr = next, addr != end);
86109 }
87110
88
-static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
111
+static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
112
+ pgtbl_mod_mask *mask)
89113 {
90114 pud_t *pud;
91115 unsigned long next;
116
+ int cleared;
92117
93118 pud = pud_offset(p4d, addr);
94119 do {
95120 next = pud_addr_end(addr, end);
96
- if (pud_clear_huge(pud))
121
+
122
+ cleared = pud_clear_huge(pud);
123
+ if (cleared || pud_bad(*pud))
124
+ *mask |= PGTBL_PUD_MODIFIED;
125
+
126
+ if (cleared)
97127 continue;
98128 if (pud_none_or_clear_bad(pud))
99129 continue;
100
- vunmap_pmd_range(pud, addr, next);
130
+ vunmap_pmd_range(pud, addr, next, mask);
101131 } while (pud++, addr = next, addr != end);
102132 }
103133
104
-static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
134
+static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
135
+ pgtbl_mod_mask *mask)
105136 {
106137 p4d_t *p4d;
107138 unsigned long next;
139
+ int cleared;
108140
109141 p4d = p4d_offset(pgd, addr);
110142 do {
111143 next = p4d_addr_end(addr, end);
112
- if (p4d_clear_huge(p4d))
144
+
145
+ cleared = p4d_clear_huge(p4d);
146
+ if (cleared || p4d_bad(*p4d))
147
+ *mask |= PGTBL_P4D_MODIFIED;
148
+
149
+ if (cleared)
113150 continue;
114151 if (p4d_none_or_clear_bad(p4d))
115152 continue;
116
- vunmap_pud_range(p4d, addr, next);
153
+ vunmap_pud_range(p4d, addr, next, mask);
117154 } while (p4d++, addr = next, addr != end);
118155 }
119156
120
-static void vunmap_page_range(unsigned long addr, unsigned long end)
157
+/**
158
+ * unmap_kernel_range_noflush - unmap kernel VM area
159
+ * @start: start of the VM area to unmap
160
+ * @size: size of the VM area to unmap
161
+ *
162
+ * Unmap PFN_UP(@size) pages at @start. The VM area that @start and @size
163
+ * specify should have been allocated using get_vm_area() and its friends.
164
+ *
165
+ * NOTE:
166
+ * This function does NOT do any cache flushing. The caller is responsible
167
+ * for calling flush_cache_vunmap() on the to-be-unmapped area before calling this
168
+ * function and flush_tlb_kernel_range() after.
169
+ */
170
+void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
121171 {
122
- pgd_t *pgd;
172
+ unsigned long end = start + size;
123173 unsigned long next;
174
+ pgd_t *pgd;
175
+ unsigned long addr = start;
176
+ pgtbl_mod_mask mask = 0;
124177
125178 BUG_ON(addr >= end);
126179 pgd = pgd_offset_k(addr);
127180 do {
128181 next = pgd_addr_end(addr, end);
182
+ if (pgd_bad(*pgd))
183
+ mask |= PGTBL_PGD_MODIFIED;
129184 if (pgd_none_or_clear_bad(pgd))
130185 continue;
131
- vunmap_p4d_range(pgd, addr, next);
186
+ vunmap_p4d_range(pgd, addr, next, &mask);
132187 } while (pgd++, addr = next, addr != end);
188
+
189
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
190
+ arch_sync_kernel_mappings(start, end);
133191 }
134192
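To make the NOTE above concrete, here is a minimal, hedged sketch of the call ordering a caller is expected to follow around unmap_kernel_range_noflush(); the wrapper name is invented for illustration, but the pattern is the same one unmap_kernel_range() uses later in this file.

static void unmap_area_sketch(unsigned long addr, unsigned long size)
{
	flush_cache_vunmap(addr, addr + size);		/* before unmapping */
	unmap_kernel_range_noflush(addr, size);		/* clear the page tables */
	flush_tlb_kernel_range(addr, addr + size);	/* after unmapping */
}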
135193 static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
136
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
194
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
195
+ pgtbl_mod_mask *mask)
137196 {
138197 pte_t *pte;
139198
....@@ -142,7 +201,7 @@
142201 * callers keep track of where we're up to.
143202 */
144203
145
- pte = pte_alloc_kernel(pmd, addr);
204
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
146205 if (!pte)
147206 return -ENOMEM;
148207 do {
....@@ -155,96 +214,120 @@
155214 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
156215 (*nr)++;
157216 } while (pte++, addr += PAGE_SIZE, addr != end);
217
+ *mask |= PGTBL_PTE_MODIFIED;
158218 return 0;
159219 }
160220
161221 static int vmap_pmd_range(pud_t *pud, unsigned long addr,
162
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
222
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
223
+ pgtbl_mod_mask *mask)
163224 {
164225 pmd_t *pmd;
165226 unsigned long next;
166227
167
- pmd = pmd_alloc(&init_mm, pud, addr);
228
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
168229 if (!pmd)
169230 return -ENOMEM;
170231 do {
171232 next = pmd_addr_end(addr, end);
172
- if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
233
+ if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
173234 return -ENOMEM;
174235 } while (pmd++, addr = next, addr != end);
175236 return 0;
176237 }
177238
178239 static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
179
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
240
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
241
+ pgtbl_mod_mask *mask)
180242 {
181243 pud_t *pud;
182244 unsigned long next;
183245
184
- pud = pud_alloc(&init_mm, p4d, addr);
246
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
185247 if (!pud)
186248 return -ENOMEM;
187249 do {
188250 next = pud_addr_end(addr, end);
189
- if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
251
+ if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
190252 return -ENOMEM;
191253 } while (pud++, addr = next, addr != end);
192254 return 0;
193255 }
194256
195257 static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
196
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
258
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
259
+ pgtbl_mod_mask *mask)
197260 {
198261 p4d_t *p4d;
199262 unsigned long next;
200263
201
- p4d = p4d_alloc(&init_mm, pgd, addr);
264
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
202265 if (!p4d)
203266 return -ENOMEM;
204267 do {
205268 next = p4d_addr_end(addr, end);
206
- if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
269
+ if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
207270 return -ENOMEM;
208271 } while (p4d++, addr = next, addr != end);
209272 return 0;
210273 }
211274
212
-/*
213
- * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
214
- * will have pfns corresponding to the "pages" array.
275
+/**
276
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
277
+ * @addr: start of the VM area to map
278
+ * @size: size of the VM area to map
279
+ * @prot: page protection flags to use
280
+ * @pages: pages to map
215281 *
216
- * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
282
+ * Map PFN_UP(@size) pages at @addr. The VM area that @addr and @size specify
283
+ * should have been allocated using get_vm_area() and its friends.
284
+ *
285
+ * NOTE:
286
+ * This function does NOT do any cache flushing. The caller is responsible for
287
+ * calling flush_cache_vmap() on to-be-mapped areas before calling this
288
+ * function.
289
+ *
290
+ * RETURNS:
291
+ * 0 on success, -errno on failure.
217292 */
218
-static int vmap_page_range_noflush(unsigned long start, unsigned long end,
219
- pgprot_t prot, struct page **pages)
293
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
294
+ pgprot_t prot, struct page **pages)
220295 {
221
- pgd_t *pgd;
296
+ unsigned long start = addr;
297
+ unsigned long end = addr + size;
222298 unsigned long next;
223
- unsigned long addr = start;
299
+ pgd_t *pgd;
224300 int err = 0;
225301 int nr = 0;
302
+ pgtbl_mod_mask mask = 0;
226303
227304 BUG_ON(addr >= end);
228305 pgd = pgd_offset_k(addr);
229306 do {
230307 next = pgd_addr_end(addr, end);
231
- err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
308
+ if (pgd_bad(*pgd))
309
+ mask |= PGTBL_PGD_MODIFIED;
310
+ err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
232311 if (err)
233312 return err;
234313 } while (pgd++, addr = next, addr != end);
235314
236
- return nr;
315
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
316
+ arch_sync_kernel_mappings(start, end);
317
+
318
+ return 0;
237319 }
238320
239
-static int vmap_page_range(unsigned long start, unsigned long end,
240
- pgprot_t prot, struct page **pages)
321
+int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
322
+ struct page **pages)
241323 {
242324 int ret;
243325
244
- ret = vmap_page_range_noflush(start, end, prot, pages);
245
- flush_cache_vmap(start, end);
326
+ ret = map_kernel_range_noflush(start, size, prot, pages);
327
+ flush_cache_vmap(start, start + size);
246328 return ret;
247329 }
330
+EXPORT_SYMBOL_GPL(map_kernel_range);
248331
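As a hedged illustration of how the newly exported map_kernel_range() pairs with get_vm_area(), here is a sketch of mapping an array of pages into a fresh vmalloc-space area. map_pages_sketch() is a hypothetical helper, not part of this patch; the error handling is the minimum needed to keep the sketch self-contained.

static void *map_pages_sketch(struct page **pages, unsigned int npages)
{
	unsigned long size = (unsigned long)npages << PAGE_SHIFT;
	struct vm_struct *area;

	area = get_vm_area(size, VM_MAP);
	if (!area)
		return NULL;

	/* map_kernel_range() also performs the flush_cache_vmap() for us. */
	if (map_kernel_range((unsigned long)area->addr, size,
			     PAGE_KERNEL, pages) < 0) {
		free_vm_area(area);
		return NULL;
	}

	return area->addr;
}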
249332 int is_vmalloc_or_module_addr(const void *x)
250333 {
....@@ -324,22 +407,83 @@
324407
325408 /*** Global kva allocator ***/
326409
327
-#define VM_LAZY_FREE 0x02
328
-#define VM_VM_AREA 0x04
410
+#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
411
+#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
412
+
329413
330414 static DEFINE_SPINLOCK(vmap_area_lock);
415
+static DEFINE_SPINLOCK(free_vmap_area_lock);
331416 /* Export for kexec only */
332417 LIST_HEAD(vmap_area_list);
333418 static LLIST_HEAD(vmap_purge_list);
334419 static struct rb_root vmap_area_root = RB_ROOT;
420
+static bool vmap_initialized __read_mostly;
335421
336
-/* The vmap cache globals are protected by vmap_area_lock */
337
-static struct rb_node *free_vmap_cache;
338
-static unsigned long cached_hole_size;
339
-static unsigned long cached_vstart;
340
-static unsigned long cached_align;
422
+/*
423
+ * This kmem_cache is used for vmap_area objects. Instead of
424
+ * allocating from slab we reuse an object from this cache to
425
+ * make things faster. Especially in "no edge" splitting of
426
+ * free block.
427
+ */
428
+static struct kmem_cache *vmap_area_cachep;
341429
342
-static unsigned long vmap_area_pcpu_hole;
430
+/*
431
+ * This linked list is used in pair with free_vmap_area_root.
432
+ * It gives O(1) access to prev/next to perform fast coalescing.
433
+ */
434
+static LIST_HEAD(free_vmap_area_list);
435
+
436
+/*
437
+ * This augmented red-black tree represents the free vmap space.
438
+ * All vmap_area objects in this tree are sorted by va->va_start
439
+ * address. It is used for allocation and merging when a vmap
440
+ * object is released.
441
+ *
442
+ * Each vmap_area node carries the maximum free block size found
443
+ * in its sub-tree, left or right. Therefore it is possible to
444
+ * find the lowest-address match for a free area.
445
+ */
446
+static struct rb_root free_vmap_area_root = RB_ROOT;
447
+
448
+/*
449
+ * Preload a CPU with one object for "no edge" split case. The
450
+ * aim is to get rid of allocations from the atomic context, thus
451
+ * to use more permissive allocation masks.
452
+ */
453
+static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
454
+
455
+static __always_inline unsigned long
456
+va_size(struct vmap_area *va)
457
+{
458
+ return (va->va_end - va->va_start);
459
+}
460
+
461
+static __always_inline unsigned long
462
+get_subtree_max_size(struct rb_node *node)
463
+{
464
+ struct vmap_area *va;
465
+
466
+ va = rb_entry_safe(node, struct vmap_area, rb_node);
467
+ return va ? va->subtree_max_size : 0;
468
+}
469
+
470
+/*
471
+ * Gets called when the node is removed or rotated.
472
+ */
473
+static __always_inline unsigned long
474
+compute_subtree_max_size(struct vmap_area *va)
475
+{
476
+ return max3(va_size(va),
477
+ get_subtree_max_size(va->rb_node.rb_left),
478
+ get_subtree_max_size(va->rb_node.rb_right));
479
+}
480
+
481
+RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
482
+ struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
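For readers unfamiliar with augmented rb-trees, the following is a small standalone userspace model (not kernel code; all names are invented for illustration) of the subtree_max_size value that RB_DECLARE_CALLBACKS_MAX maintains: each node stores the largest free-block size found in the node itself or anywhere below it, which is what compute_subtree_max_size() computes above.

#include <stdio.h>

struct node {
	unsigned long va_start, va_end, subtree_max_size;
	struct node *left, *right;
};

static unsigned long node_size(struct node *n)
{
	return n->va_end - n->va_start;
}

static unsigned long subtree_max(struct node *n)
{
	return n ? n->subtree_max_size : 0;
}

/* Mirrors compute_subtree_max_size(): max of own size and both children. */
static unsigned long compute_max(struct node *n)
{
	unsigned long m = node_size(n);

	if (subtree_max(n->left) > m)
		m = subtree_max(n->left);
	if (subtree_max(n->right) > m)
		m = subtree_max(n->right);
	return m;
}

int main(void)
{
	struct node l = { 0x1000, 0x3000, 0x2000, NULL, NULL };	/* 8 KiB free */
	struct node r = { 0x9000, 0xa000, 0x1000, NULL, NULL };	/* 4 KiB free */
	struct node root = { 0x5000, 0x6000, 0, &l, &r };	/* 4 KiB free */

	root.subtree_max_size = compute_max(&root);
	printf("subtree_max_size = 0x%lx\n", root.subtree_max_size);	/* 0x2000 */
	return 0;
}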
483
+
484
+static void purge_vmap_area_lazy(void);
485
+static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
486
+static unsigned long lazy_max_pages(void);
343487
344488 static atomic_long_t nr_vmalloc_pages;
345489
....@@ -347,6 +491,7 @@
347491 {
348492 return atomic_long_read(&nr_vmalloc_pages);
349493 }
494
+EXPORT_SYMBOL_GPL(vmalloc_nr_pages);
350495
351496 static struct vmap_area *__find_vmap_area(unsigned long addr)
352497 {
....@@ -367,41 +512,638 @@
367512 return NULL;
368513 }
369514
370
-static void __insert_vmap_area(struct vmap_area *va)
515
+/*
516
+ * This function returns the address of the parent node and of
517
+ * its left or right link for further processing.
518
+ *
519
+ * Otherwise NULL is returned, in which case inserting the
520
+ * conflicting, overlapping range must be declined and is
521
+ * actually considered a bug.
522
+ */
523
+static __always_inline struct rb_node **
524
+find_va_links(struct vmap_area *va,
525
+ struct rb_root *root, struct rb_node *from,
526
+ struct rb_node **parent)
371527 {
372
- struct rb_node **p = &vmap_area_root.rb_node;
373
- struct rb_node *parent = NULL;
374
- struct rb_node *tmp;
528
+ struct vmap_area *tmp_va;
529
+ struct rb_node **link;
375530
376
- while (*p) {
377
- struct vmap_area *tmp_va;
378
-
379
- parent = *p;
380
- tmp_va = rb_entry(parent, struct vmap_area, rb_node);
381
- if (va->va_start < tmp_va->va_end)
382
- p = &(*p)->rb_left;
383
- else if (va->va_end > tmp_va->va_start)
384
- p = &(*p)->rb_right;
385
- else
386
- BUG();
531
+ if (root) {
532
+ link = &root->rb_node;
533
+ if (unlikely(!*link)) {
534
+ *parent = NULL;
535
+ return link;
536
+ }
537
+ } else {
538
+ link = &from;
387539 }
388540
389
- rb_link_node(&va->rb_node, parent, p);
390
- rb_insert_color(&va->rb_node, &vmap_area_root);
541
+ /*
542
+ * Go to the bottom of the tree. When we hit the last point
543
+ * we end up with the parent rb_node and the correct direction,
544
+ * called "link" here, to which the new va->rb_node is attached.
545
+ */
546
+ do {
547
+ tmp_va = rb_entry(*link, struct vmap_area, rb_node);
391548
392
- /* address-sort this list */
393
- tmp = rb_prev(&va->rb_node);
394
- if (tmp) {
395
- struct vmap_area *prev;
396
- prev = rb_entry(tmp, struct vmap_area, rb_node);
397
- list_add_rcu(&va->list, &prev->list);
398
- } else
399
- list_add_rcu(&va->list, &vmap_area_list);
549
+ /*
550
+ * During the traversal we also do a sanity check:
551
+ * warn and bail out if the new range overlaps an
552
+ * existing one, either partially or fully.
553
+ */
554
+ if (va->va_start < tmp_va->va_end &&
555
+ va->va_end <= tmp_va->va_start)
556
+ link = &(*link)->rb_left;
557
+ else if (va->va_end > tmp_va->va_start &&
558
+ va->va_start >= tmp_va->va_end)
559
+ link = &(*link)->rb_right;
560
+ else {
561
+ WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
562
+ va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
563
+
564
+ return NULL;
565
+ }
566
+ } while (*link);
567
+
568
+ *parent = &tmp_va->rb_node;
569
+ return link;
400570 }
401571
402
-static void purge_vmap_area_lazy(void);
572
+static __always_inline struct list_head *
573
+get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
574
+{
575
+ struct list_head *list;
403576
404
-static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
577
+ if (unlikely(!parent))
578
+ /*
579
+ * The red-black tree where we try to find VA neighbors
580
+ * before merging or inserting is empty, i.e. it means
581
+ * there is no free vmap space. Normally it does not
582
+ * happen but we handle this case anyway.
583
+ */
584
+ return NULL;
585
+
586
+ list = &rb_entry(parent, struct vmap_area, rb_node)->list;
587
+ return (&parent->rb_right == link ? list->next : list);
588
+}
589
+
590
+static __always_inline void
591
+link_va(struct vmap_area *va, struct rb_root *root,
592
+ struct rb_node *parent, struct rb_node **link, struct list_head *head)
593
+{
594
+ /*
595
+ * VA is still not in the list, but we can
596
+ * identify its future previous list_head node.
597
+ */
598
+ if (likely(parent)) {
599
+ head = &rb_entry(parent, struct vmap_area, rb_node)->list;
600
+ if (&parent->rb_right != link)
601
+ head = head->prev;
602
+ }
603
+
604
+ /* Insert to the rb-tree */
605
+ rb_link_node(&va->rb_node, parent, link);
606
+ if (root == &free_vmap_area_root) {
607
+ /*
608
+ * Just perform a simple insertion into the tree. We do not
609
+ * set va->subtree_max_size to its current size before
610
+ * calling rb_insert_augmented(), because the tree is
611
+ * populated from the bottom up to the parent levels once
612
+ * the node _is_ in the tree.
613
+ *
614
+ * Therefore we set subtree_max_size to zero after insertion,
615
+ * and let augment_tree_propagate_from() restore the correct
616
+ * order later on.
617
+ */
618
+ rb_insert_augmented(&va->rb_node,
619
+ root, &free_vmap_area_rb_augment_cb);
620
+ va->subtree_max_size = 0;
621
+ } else {
622
+ rb_insert_color(&va->rb_node, root);
623
+ }
624
+
625
+ /* Address-sort this list */
626
+ list_add(&va->list, head);
627
+}
628
+
629
+static __always_inline void
630
+unlink_va(struct vmap_area *va, struct rb_root *root)
631
+{
632
+ if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
633
+ return;
634
+
635
+ if (root == &free_vmap_area_root)
636
+ rb_erase_augmented(&va->rb_node,
637
+ root, &free_vmap_area_rb_augment_cb);
638
+ else
639
+ rb_erase(&va->rb_node, root);
640
+
641
+ list_del(&va->list);
642
+ RB_CLEAR_NODE(&va->rb_node);
643
+}
644
+
645
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
646
+static void
647
+augment_tree_propagate_check(void)
648
+{
649
+ struct vmap_area *va;
650
+ unsigned long computed_size;
651
+
652
+ list_for_each_entry(va, &free_vmap_area_list, list) {
653
+ computed_size = compute_subtree_max_size(va);
654
+ if (computed_size != va->subtree_max_size)
655
+ pr_emerg("tree is corrupted: %lu, %lu\n",
656
+ va_size(va), va->subtree_max_size);
657
+ }
658
+}
659
+#endif
660
+
661
+/*
662
+ * This function populates subtree_max_size from the bottom up,
663
+ * starting at the given VA. The propagation must be done whenever
664
+ * a VA size is modified by changing its va_start/va_end, or
665
+ * when a new VA is inserted into the tree.
666
+ *
667
+ * It means that augment_tree_propagate_from() must be called:
668
+ * - After VA has been inserted to the tree(free path);
669
+ * - After VA has been shrunk(allocation path);
670
+ * - After VA has been increased(merging path).
671
+ *
672
+ * Note that this does not mean the subtree_max_size of all
673
+ * upper parent nodes is recalculated all the way up to the
674
+ * root node.
675
+ *
676
+ * 4--8
677
+ * /\
678
+ * / \
679
+ * / \
680
+ * 2--2 8--8
681
+ *
682
+ * For example, if we modify node 4, shrinking it to 2, then
683
+ * no modification is required at all. If we shrink node 2 to 1,
684
+ * only its own subtree_max_size is updated, to 1. If we shrink
685
+ * node 8 to 6, its subtree_max_size is set to 6 and the parent
686
+ * node becomes 4--6.
687
+ */
688
+static __always_inline void
689
+augment_tree_propagate_from(struct vmap_area *va)
690
+{
691
+ /*
692
+ * Populate the tree from bottom towards the root until
693
+ * the calculated maximum available size of checked node
694
+ * is equal to its current one.
695
+ */
696
+ free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
697
+
698
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
699
+ augment_tree_propagate_check();
700
+#endif
701
+}
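The following standalone userspace model (not kernel code; names invented) mirrors the bottom-up propagation described in the comment above: walk towards the root and stop as soon as a node's stored subtree_max_size already matches the recomputed value. The main() reproduces the 4--8 / 2--2 / 8--8 example, shrinking node 8 to 6.

#include <stdio.h>

struct node {
	unsigned long size, subtree_max_size;
	struct node *left, *right, *parent;
};

static unsigned long smax(struct node *n)
{
	return n ? n->subtree_max_size : 0;
}

static unsigned long recompute(struct node *n)
{
	unsigned long m = n->size;

	if (smax(n->left) > m)
		m = smax(n->left);
	if (smax(n->right) > m)
		m = smax(n->right);
	return m;
}

/* Propagate upwards; stop early once nothing changes. */
static void propagate_from(struct node *n)
{
	for (; n; n = n->parent) {
		unsigned long m = recompute(n);

		if (m == n->subtree_max_size)
			break;
		n->subtree_max_size = m;
	}
}

int main(void)
{
	struct node root = { 4, 8, NULL, NULL, NULL };
	struct node l = { 2, 2, NULL, NULL, &root };
	struct node r = { 8, 8, NULL, NULL, &root };

	root.left = &l;
	root.right = &r;

	r.size = 6;			/* shrink node 8 to 6 */
	propagate_from(&r);
	printf("root subtree_max_size = %lu\n", root.subtree_max_size);	/* 6 */
	return 0;
}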
702
+
703
+static void
704
+insert_vmap_area(struct vmap_area *va,
705
+ struct rb_root *root, struct list_head *head)
706
+{
707
+ struct rb_node **link;
708
+ struct rb_node *parent;
709
+
710
+ link = find_va_links(va, root, NULL, &parent);
711
+ if (link)
712
+ link_va(va, root, parent, link, head);
713
+}
714
+
715
+static void
716
+insert_vmap_area_augment(struct vmap_area *va,
717
+ struct rb_node *from, struct rb_root *root,
718
+ struct list_head *head)
719
+{
720
+ struct rb_node **link;
721
+ struct rb_node *parent;
722
+
723
+ if (from)
724
+ link = find_va_links(va, NULL, from, &parent);
725
+ else
726
+ link = find_va_links(va, root, NULL, &parent);
727
+
728
+ if (link) {
729
+ link_va(va, root, parent, link, head);
730
+ augment_tree_propagate_from(va);
731
+ }
732
+}
733
+
734
+/*
735
+ * Merge a de-allocated chunk of VA memory with its previous
736
+ * and next free blocks. If no coalescing is done, a new
737
+ * free area is inserted instead. If the VA has been merged,
738
+ * it is freed.
739
+ *
740
+ * Please note, it can return NULL if the ranges overlap, in
741
+ * which case a WARN() is issued. Although that is buggy
742
+ * behaviour, the system stays alive and keeps
743
+ * running.
744
+ */
745
+static __always_inline struct vmap_area *
746
+merge_or_add_vmap_area(struct vmap_area *va,
747
+ struct rb_root *root, struct list_head *head)
748
+{
749
+ struct vmap_area *sibling;
750
+ struct list_head *next;
751
+ struct rb_node **link;
752
+ struct rb_node *parent;
753
+ bool merged = false;
754
+
755
+ /*
756
+ * Find a place in the tree where VA potentially will be
757
+ * inserted, unless it is merged with its sibling/siblings.
758
+ */
759
+ link = find_va_links(va, root, NULL, &parent);
760
+ if (!link)
761
+ return NULL;
762
+
763
+ /*
764
+ * Get next node of VA to check if merging can be done.
765
+ */
766
+ next = get_va_next_sibling(parent, link);
767
+ if (unlikely(next == NULL))
768
+ goto insert;
769
+
770
+ /*
771
+ * start end
772
+ * | |
773
+ * |<------VA------>|<-----Next----->|
774
+ * | |
775
+ * start end
776
+ */
777
+ if (next != head) {
778
+ sibling = list_entry(next, struct vmap_area, list);
779
+ if (sibling->va_start == va->va_end) {
780
+ sibling->va_start = va->va_start;
781
+
782
+ /* Free vmap_area object. */
783
+ kmem_cache_free(vmap_area_cachep, va);
784
+
785
+ /* Point to the new merged area. */
786
+ va = sibling;
787
+ merged = true;
788
+ }
789
+ }
790
+
791
+ /*
792
+ * start end
793
+ * | |
794
+ * |<-----Prev----->|<------VA------>|
795
+ * | |
796
+ * start end
797
+ */
798
+ if (next->prev != head) {
799
+ sibling = list_entry(next->prev, struct vmap_area, list);
800
+ if (sibling->va_end == va->va_start) {
801
+ /*
802
+ * If both neighbors are coalesced, it is important
803
+ * to unlink the "next" node first, followed by merging
804
+ * with "previous" one. Otherwise the tree might not be
805
+ * fully populated if a sibling's augmented value is
806
+ * "normalized" because of rotation operations.
807
+ */
808
+ if (merged)
809
+ unlink_va(va, root);
810
+
811
+ sibling->va_end = va->va_end;
812
+
813
+ /* Free vmap_area object. */
814
+ kmem_cache_free(vmap_area_cachep, va);
815
+
816
+ /* Point to the new merged area. */
817
+ va = sibling;
818
+ merged = true;
819
+ }
820
+ }
821
+
822
+insert:
823
+ if (!merged)
824
+ link_va(va, root, parent, link, head);
825
+
826
+ /*
827
+ * Last step is to check and update the tree.
828
+ */
829
+ augment_tree_propagate_from(va);
830
+ return va;
831
+}
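The coalescing rule implemented above can be modelled in a few lines of standalone userspace C (not kernel code; made-up addresses, and no tree bookkeeping): a freed [start, end) range absorbs the next free block when end == next->start and the previous one when prev->end == start.

#include <stdio.h>

struct range { unsigned long start, end; };

/* Free list is sorted by address; returns the merged range. */
static struct range coalesce(struct range prev, struct range cur, struct range next)
{
	if (cur.end == next.start)	/* merge with next */
		cur.end = next.end;
	if (prev.end == cur.start)	/* merge with previous */
		cur.start = prev.start;
	return cur;
}

int main(void)
{
	struct range prev = { 0x1000, 0x2000 };
	struct range next = { 0x3000, 0x4000 };
	struct range freed = { 0x2000, 0x3000 };
	struct range merged = coalesce(prev, freed, next);

	printf("merged: [0x%lx, 0x%lx)\n", merged.start, merged.end);	/* one big block */
	return 0;
}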
832
+
833
+static __always_inline bool
834
+is_within_this_va(struct vmap_area *va, unsigned long size,
835
+ unsigned long align, unsigned long vstart)
836
+{
837
+ unsigned long nva_start_addr;
838
+
839
+ if (va->va_start > vstart)
840
+ nva_start_addr = ALIGN(va->va_start, align);
841
+ else
842
+ nva_start_addr = ALIGN(vstart, align);
843
+
844
+ /* Can be overflowed due to big size or alignment. */
845
+ if (nva_start_addr + size < nva_start_addr ||
846
+ nva_start_addr < vstart)
847
+ return false;
848
+
849
+ return (nva_start_addr + size <= va->va_end);
850
+}
851
+
852
+/*
853
+ * Find the first free block (lowest start address) in the tree
854
+ * that can satisfy the request described by the passed
855
+ * parameters.
856
+ */
857
+static __always_inline struct vmap_area *
858
+find_vmap_lowest_match(unsigned long size,
859
+ unsigned long align, unsigned long vstart)
860
+{
861
+ struct vmap_area *va;
862
+ struct rb_node *node;
863
+ unsigned long length;
864
+
865
+ /* Start from the root. */
866
+ node = free_vmap_area_root.rb_node;
867
+
868
+ /* Adjust the search size for alignment overhead. */
869
+ length = size + align - 1;
870
+
871
+ while (node) {
872
+ va = rb_entry(node, struct vmap_area, rb_node);
873
+
874
+ if (get_subtree_max_size(node->rb_left) >= length &&
875
+ vstart < va->va_start) {
876
+ node = node->rb_left;
877
+ } else {
878
+ if (is_within_this_va(va, size, align, vstart))
879
+ return va;
880
+
881
+ /*
882
+ * It does not make sense to go deeper into the right
883
+ * sub-tree if it does not have a free block that is
884
+ * equal to or bigger than the requested search length.
885
+ */
886
+ if (get_subtree_max_size(node->rb_right) >= length) {
887
+ node = node->rb_right;
888
+ continue;
889
+ }
890
+
891
+ /*
892
+ * OK. We roll back and find the first right sub-tree,
893
+ * that will satisfy the search criteria. It can happen
894
+ * only once due to "vstart" restriction.
895
+ */
896
+ while ((node = rb_parent(node))) {
897
+ va = rb_entry(node, struct vmap_area, rb_node);
898
+ if (is_within_this_va(va, size, align, vstart))
899
+ return va;
900
+
901
+ if (get_subtree_max_size(node->rb_right) >= length &&
902
+ vstart <= va->va_start) {
903
+ node = node->rb_right;
904
+ break;
905
+ }
906
+ }
907
+ }
908
+ }
909
+
910
+ return NULL;
911
+}
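The search length of size + align - 1 used in the function above is the conservative worst case: any free block at least that large is guaranteed to contain an align-aligned sub-range of size bytes, because aligning the start up wastes at most align - 1 bytes. A small standalone check of that bound (userspace, power-of-two align assumed, values are only examples):

#include <assert.h>
#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long size = 0x2000, align = 0x1000;	/* 8 KiB, 4 KiB */
	unsigned long length = size + align - 1;	/* worst-case need */
	unsigned long start;

	/* Whatever the block's start, a block of "length" bytes fits. */
	for (start = 0; start < 0x10000; start += 0x10) {
		unsigned long aligned = ALIGN_UP(start, align);

		assert(aligned + size <= start + length);
	}
	printf("size + align - 1 = 0x%lx always suffices\n", length);
	return 0;
}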
912
+
913
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
914
+#include <linux/random.h>
915
+
916
+static struct vmap_area *
917
+find_vmap_lowest_linear_match(unsigned long size,
918
+ unsigned long align, unsigned long vstart)
919
+{
920
+ struct vmap_area *va;
921
+
922
+ list_for_each_entry(va, &free_vmap_area_list, list) {
923
+ if (!is_within_this_va(va, size, align, vstart))
924
+ continue;
925
+
926
+ return va;
927
+ }
928
+
929
+ return NULL;
930
+}
931
+
932
+static void
933
+find_vmap_lowest_match_check(unsigned long size)
934
+{
935
+ struct vmap_area *va_1, *va_2;
936
+ unsigned long vstart;
937
+ unsigned int rnd;
938
+
939
+ get_random_bytes(&rnd, sizeof(rnd));
940
+ vstart = VMALLOC_START + rnd;
941
+
942
+ va_1 = find_vmap_lowest_match(size, 1, vstart);
943
+ va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
944
+
945
+ if (va_1 != va_2)
946
+ pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
947
+ va_1, va_2, vstart);
948
+}
949
+#endif
950
+
951
+enum fit_type {
952
+ NOTHING_FIT = 0,
953
+ FL_FIT_TYPE = 1, /* full fit */
954
+ LE_FIT_TYPE = 2, /* left edge fit */
955
+ RE_FIT_TYPE = 3, /* right edge fit */
956
+ NE_FIT_TYPE = 4 /* no edge fit */
957
+};
958
+
959
+static __always_inline enum fit_type
960
+classify_va_fit_type(struct vmap_area *va,
961
+ unsigned long nva_start_addr, unsigned long size)
962
+{
963
+ enum fit_type type;
964
+
965
+ /* Check if it is within VA. */
966
+ if (nva_start_addr < va->va_start ||
967
+ nva_start_addr + size > va->va_end)
968
+ return NOTHING_FIT;
969
+
970
+ /* Now classify. */
971
+ if (va->va_start == nva_start_addr) {
972
+ if (va->va_end == nva_start_addr + size)
973
+ type = FL_FIT_TYPE;
974
+ else
975
+ type = LE_FIT_TYPE;
976
+ } else if (va->va_end == nva_start_addr + size) {
977
+ type = RE_FIT_TYPE;
978
+ } else {
979
+ type = NE_FIT_TYPE;
980
+ }
981
+
982
+ return type;
983
+}
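A standalone userspace model (not kernel code) of the classification above; the enum values match the NOTHING_FIT..NE_FIT_TYPE numbering, while the function and enumerator names are invented for illustration. Where the requested block [nva, nva + size) sits inside a free block [va_start, va_end) decides how that free block must be split.

#include <stdio.h>

enum fit { NOTHING, FULL, LEFT_EDGE, RIGHT_EDGE, NO_EDGE };

static enum fit classify(unsigned long va_start, unsigned long va_end,
			 unsigned long nva, unsigned long size)
{
	if (nva < va_start || nva + size > va_end)
		return NOTHING;
	if (va_start == nva)
		return (va_end == nva + size) ? FULL : LEFT_EDGE;
	if (va_end == nva + size)
		return RIGHT_EDGE;
	return NO_EDGE;		/* the free block is split in two */
}

int main(void)
{
	/* 16 KiB free block, 4 KiB request from its middle: NO_EDGE (4). */
	printf("%d\n", classify(0x10000, 0x14000, 0x11000, 0x1000));
	/* Same request taken from the very start: LEFT_EDGE (2). */
	printf("%d\n", classify(0x10000, 0x14000, 0x10000, 0x1000));
	return 0;
}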
984
+
985
+static __always_inline int
986
+adjust_va_to_fit_type(struct vmap_area *va,
987
+ unsigned long nva_start_addr, unsigned long size,
988
+ enum fit_type type)
989
+{
990
+ struct vmap_area *lva = NULL;
991
+
992
+ if (type == FL_FIT_TYPE) {
993
+ /*
994
+ * No need to split VA, it fully fits.
995
+ *
996
+ * | |
997
+ * V NVA V
998
+ * |---------------|
999
+ */
1000
+ unlink_va(va, &free_vmap_area_root);
1001
+ kmem_cache_free(vmap_area_cachep, va);
1002
+ } else if (type == LE_FIT_TYPE) {
1003
+ /*
1004
+ * Split left edge of fit VA.
1005
+ *
1006
+ * | |
1007
+ * V NVA V R
1008
+ * |-------|-------|
1009
+ */
1010
+ va->va_start += size;
1011
+ } else if (type == RE_FIT_TYPE) {
1012
+ /*
1013
+ * Split right edge of fit VA.
1014
+ *
1015
+ * | |
1016
+ * L V NVA V
1017
+ * |-------|-------|
1018
+ */
1019
+ va->va_end = nva_start_addr;
1020
+ } else if (type == NE_FIT_TYPE) {
1021
+ /*
1022
+ * Split no edge of fit VA.
1023
+ *
1024
+ * | |
1025
+ * L V NVA V R
1026
+ * |---|-------|---|
1027
+ */
1028
+ lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
1029
+ if (unlikely(!lva)) {
1030
+ /*
1031
+ * For percpu allocator we do not do any pre-allocation
1032
+ * and leave it as it is. The reason is it most likely
1033
+ * never ends up with NE_FIT_TYPE splitting. In case of
1034
+ * percpu allocations offsets and sizes are aligned to
1035
+ * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
1036
+ * are its main fitting cases.
1037
+ *
1038
+ * There are a few exceptions though, as an example it is
1039
+ * a first allocation (early boot up) when we have "one"
1040
+ * big free space that has to be split.
1041
+ *
1042
+ * Also we can hit this path in case of regular "vmap"
1043
+ * allocations, if "this" current CPU was not preloaded.
1044
+ * See the comment in alloc_vmap_area() why. If so, then
1045
+ * GFP_NOWAIT is used instead to get an extra object for
1046
+ * split purpose. That is rare and most time does not
1047
+ * occur.
1048
+ *
1049
+ * What happens if an allocation gets failed. Basically,
1050
+ * an "overflow" path is triggered to purge lazily freed
1051
+ * areas to free some memory, then, the "retry" path is
1052
+ * triggered to repeat one more time. See more details
1053
+ * in alloc_vmap_area() function.
1054
+ */
1055
+ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
1056
+ if (!lva)
1057
+ return -1;
1058
+ }
1059
+
1060
+ /*
1061
+ * Build the remainder.
1062
+ */
1063
+ lva->va_start = va->va_start;
1064
+ lva->va_end = nva_start_addr;
1065
+
1066
+ /*
1067
+ * Shrink this VA to remaining size.
1068
+ */
1069
+ va->va_start = nva_start_addr + size;
1070
+ } else {
1071
+ return -1;
1072
+ }
1073
+
1074
+ if (type != FL_FIT_TYPE) {
1075
+ augment_tree_propagate_from(va);
1076
+
1077
+ if (lva) /* type == NE_FIT_TYPE */
1078
+ insert_vmap_area_augment(lva, &va->rb_node,
1079
+ &free_vmap_area_root, &free_vmap_area_list);
1080
+ }
1081
+
1082
+ return 0;
1083
+}
1084
+
1085
+/*
1086
+ * Returns a start address of the newly allocated area, if success.
1087
+ * Otherwise a vend is returned that indicates failure.
1088
+ */
1089
+static __always_inline unsigned long
1090
+__alloc_vmap_area(unsigned long size, unsigned long align,
1091
+ unsigned long vstart, unsigned long vend)
1092
+{
1093
+ unsigned long nva_start_addr;
1094
+ struct vmap_area *va;
1095
+ enum fit_type type;
1096
+ int ret;
1097
+
1098
+ va = find_vmap_lowest_match(size, align, vstart);
1099
+ if (unlikely(!va))
1100
+ return vend;
1101
+
1102
+ if (va->va_start > vstart)
1103
+ nva_start_addr = ALIGN(va->va_start, align);
1104
+ else
1105
+ nva_start_addr = ALIGN(vstart, align);
1106
+
1107
+ /* Check the "vend" restriction. */
1108
+ if (nva_start_addr + size > vend)
1109
+ return vend;
1110
+
1111
+ /* Classify what we have found. */
1112
+ type = classify_va_fit_type(va, nva_start_addr, size);
1113
+ if (WARN_ON_ONCE(type == NOTHING_FIT))
1114
+ return vend;
1115
+
1116
+ /* Update the free vmap_area. */
1117
+ ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
1118
+ if (ret)
1119
+ return vend;
1120
+
1121
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
1122
+ find_vmap_lowest_match_check(size);
1123
+#endif
1124
+
1125
+ return nva_start_addr;
1126
+}
1127
+
1128
+/*
1129
+ * Free a region of KVA allocated by alloc_vmap_area
1130
+ */
1131
+static void free_vmap_area(struct vmap_area *va)
1132
+{
1133
+ /*
1134
+ * Remove from the busy tree/list.
1135
+ */
1136
+ spin_lock(&vmap_area_lock);
1137
+ unlink_va(va, &vmap_area_root);
1138
+ spin_unlock(&vmap_area_lock);
1139
+
1140
+ /*
1141
+ * Insert/Merge it back to the free tree/list.
1142
+ */
1143
+ spin_lock(&free_vmap_area_lock);
1144
+ merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
1145
+ spin_unlock(&free_vmap_area_lock);
1146
+}
4051147
4061148 /*
4071149 * Allocate a region of KVA of the specified size and alignment, within the
....@@ -412,20 +1154,22 @@
4121154 unsigned long vstart, unsigned long vend,
4131155 int node, gfp_t gfp_mask)
4141156 {
415
- struct vmap_area *va;
416
- struct rb_node *n;
1157
+ struct vmap_area *va, *pva;
4171158 unsigned long addr;
4181159 int purged = 0;
419
- struct vmap_area *first;
1160
+ int ret;
4201161
4211162 BUG_ON(!size);
4221163 BUG_ON(offset_in_page(size));
4231164 BUG_ON(!is_power_of_2(align));
4241165
425
- might_sleep();
1166
+ if (unlikely(!vmap_initialized))
1167
+ return ERR_PTR(-EBUSY);
4261168
427
- va = kmalloc_node(sizeof(struct vmap_area),
428
- gfp_mask & GFP_RECLAIM_MASK, node);
1169
+ might_sleep();
1170
+ gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
1171
+
1172
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
4291173 if (unlikely(!va))
4301174 return ERR_PTR(-ENOMEM);
4311175
....@@ -433,101 +1177,71 @@
4331177 * Only scan the relevant parts containing pointers to other objects
4341178 * to avoid false negatives.
4351179 */
436
- kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
1180
+ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
4371181
4381182 retry:
439
- spin_lock(&vmap_area_lock);
4401183 /*
441
- * Invalidate cache if we have more permissive parameters.
442
- * cached_hole_size notes the largest hole noticed _below_
443
- * the vmap_area cached in free_vmap_cache: if size fits
444
- * into that hole, we want to scan from vstart to reuse
445
- * the hole instead of allocating above free_vmap_cache.
446
- * Note that __free_vmap_area may update free_vmap_cache
447
- * without updating cached_hole_size or cached_align.
1184
+ * Preload this CPU with one extra vmap_area object. It is used
1185
+ * when fit type of free area is NE_FIT_TYPE. Please note, it
1186
+ * does not guarantee that an allocation occurs on a CPU that
1187
+ * is preloaded, instead we minimize the case when it is not.
1188
+ * It can happen because of cpu migration, because there is a
1189
+ * race until the below spinlock is taken.
1190
+ *
1191
+ * The preload is done in non-atomic context, thus it allows us
1192
+ * to use more permissive allocation masks to be more stable under
1193
+ * low memory condition and high memory pressure. In rare case,
1194
+ * if not preloaded, GFP_NOWAIT is used.
1195
+ *
1196
+ * Set "pva" to NULL here, because of "retry" path.
4481197 */
449
- if (!free_vmap_cache ||
450
- size < cached_hole_size ||
451
- vstart < cached_vstart ||
452
- align < cached_align) {
453
-nocache:
454
- cached_hole_size = 0;
455
- free_vmap_cache = NULL;
456
- }
457
- /* record if we encounter less permissive parameters */
458
- cached_vstart = vstart;
459
- cached_align = align;
1198
+ pva = NULL;
4601199
461
- /* find starting point for our search */
462
- if (free_vmap_cache) {
463
- first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
464
- addr = ALIGN(first->va_end, align);
465
- if (addr < vstart)
466
- goto nocache;
467
- if (addr + size < addr)
468
- goto overflow;
1200
+ if (!this_cpu_read(ne_fit_preload_node))
1201
+ /*
1202
+ * Even if it fails we do not really care about that.
1203
+ * Just proceed as it is. If needed "overflow" path
1204
+ * will refill the cache we allocate from.
1205
+ */
1206
+ pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
4691207
470
- } else {
471
- addr = ALIGN(vstart, align);
472
- if (addr + size < addr)
473
- goto overflow;
1208
+ spin_lock(&free_vmap_area_lock);
4741209
475
- n = vmap_area_root.rb_node;
476
- first = NULL;
1210
+ if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
1211
+ kmem_cache_free(vmap_area_cachep, pva);
4771212
478
- while (n) {
479
- struct vmap_area *tmp;
480
- tmp = rb_entry(n, struct vmap_area, rb_node);
481
- if (tmp->va_end >= addr) {
482
- first = tmp;
483
- if (tmp->va_start <= addr)
484
- break;
485
- n = n->rb_left;
486
- } else
487
- n = n->rb_right;
488
- }
489
-
490
- if (!first)
491
- goto found;
492
- }
493
-
494
- /* from the starting point, walk areas until a suitable hole is found */
495
- while (addr + size > first->va_start && addr + size <= vend) {
496
- if (addr + cached_hole_size < first->va_start)
497
- cached_hole_size = first->va_start - addr;
498
- addr = ALIGN(first->va_end, align);
499
- if (addr + size < addr)
500
- goto overflow;
501
-
502
- if (list_is_last(&first->list, &vmap_area_list))
503
- goto found;
504
-
505
- first = list_next_entry(first, list);
506
- }
507
-
508
-found:
5091213 /*
510
- * Check also calculated address against the vstart,
511
- * because it can be 0 because of big align request.
1214
+ * If an allocation fails, the "vend" address is
1215
+ * returned. Therefore trigger the overflow path.
5121216 */
513
- if (addr + size > vend || addr < vstart)
1217
+ addr = __alloc_vmap_area(size, align, vstart, vend);
1218
+ spin_unlock(&free_vmap_area_lock);
1219
+
1220
+ if (unlikely(addr == vend))
5141221 goto overflow;
5151222
5161223 va->va_start = addr;
5171224 va->va_end = addr + size;
518
- va->flags = 0;
519
- __insert_vmap_area(va);
520
- free_vmap_cache = &va->rb_node;
1225
+ va->vm = NULL;
1226
+
1227
+
1228
+ spin_lock(&vmap_area_lock);
1229
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
5211230 spin_unlock(&vmap_area_lock);
5221231
5231232 BUG_ON(!IS_ALIGNED(va->va_start, align));
5241233 BUG_ON(va->va_start < vstart);
5251234 BUG_ON(va->va_end > vend);
5261235
1236
+ ret = kasan_populate_vmalloc(addr, size);
1237
+ if (ret) {
1238
+ free_vmap_area(va);
1239
+ return ERR_PTR(ret);
1240
+ }
1241
+
5271242 return va;
5281243
5291244 overflow:
530
- spin_unlock(&vmap_area_lock);
5311245 if (!purged) {
5321246 purge_vmap_area_lazy();
5331247 purged = 1;
....@@ -546,7 +1260,8 @@
5461260 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
5471261 pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
5481262 size);
549
- kfree(va);
1263
+
1264
+ kmem_cache_free(vmap_area_cachep, va);
5501265 return ERR_PTR(-EBUSY);
5511266 }
5521267
....@@ -562,59 +1277,7 @@
5621277 }
5631278 EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
5641279
565
-static void __free_vmap_area(struct vmap_area *va)
566
-{
567
- BUG_ON(RB_EMPTY_NODE(&va->rb_node));
568
-
569
- if (free_vmap_cache) {
570
- if (va->va_end < cached_vstart) {
571
- free_vmap_cache = NULL;
572
- } else {
573
- struct vmap_area *cache;
574
- cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
575
- if (va->va_start <= cache->va_start) {
576
- free_vmap_cache = rb_prev(&va->rb_node);
577
- /*
578
- * We don't try to update cached_hole_size or
579
- * cached_align, but it won't go very wrong.
580
- */
581
- }
582
- }
583
- }
584
- rb_erase(&va->rb_node, &vmap_area_root);
585
- RB_CLEAR_NODE(&va->rb_node);
586
- list_del_rcu(&va->list);
587
-
588
- /*
589
- * Track the highest possible candidate for pcpu area
590
- * allocation. Areas outside of vmalloc area can be returned
591
- * here too, consider only end addresses which fall inside
592
- * vmalloc area proper.
593
- */
594
- if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
595
- vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
596
-
597
- kfree_rcu(va, rcu_head);
598
-}
599
-
600
-/*
601
- * Free a region of KVA allocated by alloc_vmap_area
602
- */
603
-static void free_vmap_area(struct vmap_area *va)
604
-{
605
- spin_lock(&vmap_area_lock);
606
- __free_vmap_area(va);
607
- spin_unlock(&vmap_area_lock);
608
-}
609
-
610
-/*
611
- * Clear the pagetable entries of a given vmap_area
612
- */
613
-static void unmap_vmap_area(struct vmap_area *va)
614
-{
615
- vunmap_page_range(va->va_start, va->va_end);
616
-}
617
-
1280
+bool lazy_vunmap_enable __read_mostly = true;
6181281 /*
6191282 * lazy_max_pages is the maximum amount of virtual address space we gather up
6201283 * before attempting to purge with a TLB flush.
....@@ -635,12 +1298,15 @@
6351298 {
6361299 unsigned int log;
6371300
1301
+ if (!lazy_vunmap_enable)
1302
+ return 0;
1303
+
6381304 log = fls(num_online_cpus());
6391305
6401306 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
6411307 }
6421308
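As a rough worked example of the formula above (assuming a 4 KiB PAGE_SIZE and 8 online CPUs, both only example values): fls(8) = 4, so up to 4 * 32 MiB = 128 MiB of lazily freed virtual space, i.e. 32768 pages, is accumulated before a purge. A tiny standalone model:

#include <stdio.h>

/* Userspace model of lazy_max_pages(); fls_u32() stands in for fls(). */
static unsigned int fls_u32(unsigned int x)
{
	unsigned int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	unsigned long page_size = 4096;		/* assumed PAGE_SIZE */
	unsigned int cpus = 8;			/* example CPU count */
	unsigned long pages = fls_u32(cpus) * (32UL * 1024 * 1024 / page_size);

	printf("purge threshold: %lu pages (%lu MiB)\n",
	       pages, pages * page_size >> 20);	/* 32768 pages, 128 MiB */
	return 0;
}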
643
-static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
1309
+static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
6441310
6451311 /*
6461312 * Serialize vmap purging. There is no actual criticial section protected
....@@ -658,7 +1324,7 @@
6581324 */
6591325 void set_iounmap_nonlazy(void)
6601326 {
661
- atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
1327
+ atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
6621328 }
6631329
6641330 /*
....@@ -666,36 +1332,58 @@
6661332 */
6671333 static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
6681334 {
1335
+ unsigned long resched_threshold;
6691336 struct llist_node *valist;
6701337 struct vmap_area *va;
6711338 struct vmap_area *n_va;
672
- bool do_free = false;
6731339
6741340 lockdep_assert_held(&vmap_purge_lock);
6751341
6761342 valist = llist_del_all(&vmap_purge_list);
1343
+ if (unlikely(valist == NULL))
1344
+ return false;
1345
+
1346
+ /*
1347
+ * TODO: to calculate a flush range without looping.
1348
+ * The list can be up to lazy_max_pages() elements.
1349
+ */
6771350 llist_for_each_entry(va, valist, purge_list) {
6781351 if (va->va_start < start)
6791352 start = va->va_start;
6801353 if (va->va_end > end)
6811354 end = va->va_end;
682
- do_free = true;
6831355 }
684
-
685
- if (!do_free)
686
- return false;
6871356
6881357 flush_tlb_kernel_range(start, end);
1358
+ resched_threshold = lazy_max_pages() << 1;
6891359
690
- spin_lock(&vmap_area_lock);
1360
+ spin_lock(&free_vmap_area_lock);
6911361 llist_for_each_entry_safe(va, n_va, valist, purge_list) {
692
- int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
1362
+ unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
1363
+ unsigned long orig_start = va->va_start;
1364
+ unsigned long orig_end = va->va_end;
6931365
694
- __free_vmap_area(va);
695
- atomic_sub(nr, &vmap_lazy_nr);
696
- cond_resched_lock(&vmap_area_lock);
1366
+ /*
1367
+ * Finally insert or merge lazily-freed area. It is
1368
+ * detached and there is no need to "unlink" it from
1369
+ * anything.
1370
+ */
1371
+ va = merge_or_add_vmap_area(va, &free_vmap_area_root,
1372
+ &free_vmap_area_list);
1373
+
1374
+ if (!va)
1375
+ continue;
1376
+
1377
+ if (is_vmalloc_or_module_addr((void *)orig_start))
1378
+ kasan_release_vmalloc(orig_start, orig_end,
1379
+ va->va_start, va->va_end);
1380
+
1381
+ atomic_long_sub(nr, &vmap_lazy_nr);
1382
+
1383
+ if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
1384
+ cond_resched_lock(&free_vmap_area_lock);
6971385 }
698
- spin_unlock(&vmap_area_lock);
1386
+ spin_unlock(&free_vmap_area_lock);
6991387 return true;
7001388 }
7011389
....@@ -729,10 +1417,14 @@
7291417 */
7301418 static void free_vmap_area_noflush(struct vmap_area *va)
7311419 {
732
- int nr_lazy;
1420
+ unsigned long nr_lazy;
7331421
734
- nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
735
- &vmap_lazy_nr);
1422
+ spin_lock(&vmap_area_lock);
1423
+ unlink_va(va, &vmap_area_root);
1424
+ spin_unlock(&vmap_area_lock);
1425
+
1426
+ nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
1427
+ PAGE_SHIFT, &vmap_lazy_nr);
7361428
7371429 /* After this point, we may free va at any time */
7381430 llist_add(&va->purge_list, &vmap_purge_list);
....@@ -747,8 +1439,8 @@
7471439 static void free_unmap_vmap_area(struct vmap_area *va)
7481440 {
7491441 flush_cache_vunmap(va->va_start, va->va_end);
750
- unmap_vmap_area(va);
751
- if (debug_pagealloc_enabled())
1442
+ unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
1443
+ if (debug_pagealloc_enabled_static())
7521444 flush_tlb_kernel_range(va->va_start, va->va_end);
7531445
7541446 free_vmap_area_noflush(va);
....@@ -795,8 +1487,6 @@
7951487
7961488 #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
7971489
798
-static bool vmap_initialized __read_mostly = false;
799
-
8001490 struct vmap_block_queue {
8011491 spinlock_t lock;
8021492 struct list_head free;
....@@ -816,12 +1506,11 @@
8161506 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
8171507
8181508 /*
819
- * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
1509
+ * XArray of vmap blocks, indexed by address, to quickly find a vmap block
8201510 * in the free path. Could get rid of this if we change the API to return a
8211511 * "cookie" from alloc, to be passed to free. But no big deal yet.
8221512 */
823
-static DEFINE_SPINLOCK(vmap_block_tree_lock);
824
-static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
1513
+static DEFINE_XARRAY(vmap_blocks);
8251514
8261515 /*
8271516 * We should probably have a fallback mechanism to allocate virtual memory
....@@ -852,7 +1541,7 @@
8521541 * @order: how many 2^order pages should be occupied in newly allocated block
8531542 * @gfp_mask: flags for the page level allocator
8541543 *
855
- * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
1544
+ * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
8561545 */
8571546 static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
8581547 {
....@@ -878,13 +1567,6 @@
8781567 return ERR_CAST(va);
8791568 }
8801569
881
- err = radix_tree_preload(gfp_mask);
882
- if (unlikely(err)) {
883
- kfree(vb);
884
- free_vmap_area(va);
885
- return ERR_PTR(err);
886
- }
887
-
8881570 vaddr = vmap_block_vaddr(va->va_start, 0);
8891571 spin_lock_init(&vb->lock);
8901572 vb->va = va;
....@@ -897,11 +1579,12 @@
8971579 INIT_LIST_HEAD(&vb->free_list);
8981580
8991581 vb_idx = addr_to_vb_idx(va->va_start);
900
- spin_lock(&vmap_block_tree_lock);
901
- err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
902
- spin_unlock(&vmap_block_tree_lock);
903
- BUG_ON(err);
904
- radix_tree_preload_end();
1582
+ err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
1583
+ if (err) {
1584
+ kfree(vb);
1585
+ free_vmap_area(va);
1586
+ return ERR_PTR(err);
1587
+ }
9051588
9061589 cpu = get_cpu_light();
9071590 vbq = this_cpu_ptr(&vmap_block_queue);
....@@ -916,12 +1599,8 @@
9161599 static void free_vmap_block(struct vmap_block *vb)
9171600 {
9181601 struct vmap_block *tmp;
919
- unsigned long vb_idx;
9201602
921
- vb_idx = addr_to_vb_idx(vb->va->va_start);
922
- spin_lock(&vmap_block_tree_lock);
923
- tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
924
- spin_unlock(&vmap_block_tree_lock);
1603
+ tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
9251604 BUG_ON(tmp != vb);
9261605
9271606 free_vmap_area_noflush(vb->va);
....@@ -1026,34 +1705,25 @@
10261705 return vaddr;
10271706 }
10281707
1029
-static void vb_free(const void *addr, unsigned long size)
1708
+static void vb_free(unsigned long addr, unsigned long size)
10301709 {
10311710 unsigned long offset;
1032
- unsigned long vb_idx;
10331711 unsigned int order;
10341712 struct vmap_block *vb;
10351713
10361714 BUG_ON(offset_in_page(size));
10371715 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
10381716
1039
- flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
1717
+ flush_cache_vunmap(addr, addr + size);
10401718
10411719 order = get_order(size);
1720
+ offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
1721
+ vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
10421722
1043
- offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
1044
- offset >>= PAGE_SHIFT;
1723
+ unmap_kernel_range_noflush(addr, size);
10451724
1046
- vb_idx = addr_to_vb_idx((unsigned long)addr);
1047
- rcu_read_lock();
1048
- vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
1049
- rcu_read_unlock();
1050
- BUG_ON(!vb);
1051
-
1052
- vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
1053
-
1054
- if (debug_pagealloc_enabled())
1055
- flush_tlb_kernel_range((unsigned long)addr,
1056
- (unsigned long)addr + size);
1725
+ if (debug_pagealloc_enabled_static())
1726
+ flush_tlb_kernel_range(addr, addr + size);
10571727
10581728 spin_lock(&vb->lock);
10591729
....@@ -1070,24 +1740,9 @@
10701740 spin_unlock(&vb->lock);
10711741 }
10721742
1073
-/**
1074
- * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
1075
- *
1076
- * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
1077
- * to amortize TLB flushing overheads. What this means is that any page you
1078
- * have now, may, in a former life, have been mapped into kernel virtual
1079
- * address by the vmap layer and so there might be some CPUs with TLB entries
1080
- * still referencing that page (additional to the regular 1:1 kernel mapping).
1081
- *
1082
- * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
1083
- * be sure that none of the pages we have control over will have any aliases
1084
- * from the vmap layer.
1085
- */
1086
-void vm_unmap_aliases(void)
1743
+static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
10871744 {
1088
- unsigned long start = ULONG_MAX, end = 0;
10891745 int cpu;
1090
- int flush = 0;
10911746
10921747 if (unlikely(!vmap_initialized))
10931748 return;
....@@ -1101,7 +1756,7 @@
11011756 rcu_read_lock();
11021757 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
11031758 spin_lock(&vb->lock);
1104
- if (vb->dirty) {
1759
+ if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
11051760 unsigned long va_start = vb->va->va_start;
11061761 unsigned long s, e;
11071762
....@@ -1124,6 +1779,27 @@
11241779 flush_tlb_kernel_range(start, end);
11251780 mutex_unlock(&vmap_purge_lock);
11261781 }
1782
+
1783
+/**
1784
+ * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
1785
+ *
1786
+ * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
1787
+ * to amortize TLB flushing overheads. What this means is that any page you
1788
+ * have now, may, in a former life, have been mapped into kernel virtual
1789
+ * address by the vmap layer and so there might be some CPUs with TLB entries
1790
+ * still referencing that page (additional to the regular 1:1 kernel mapping).
1791
+ *
1792
+ * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
1793
+ * be sure that none of the pages we have control over will have any aliases
1794
+ * from the vmap layer.
1795
+ */
1796
+void vm_unmap_aliases(void)
1797
+{
1798
+ unsigned long start = ULONG_MAX, end = 0;
1799
+ int flush = 0;
1800
+
1801
+ _vm_unmap_aliases(start, end, flush);
1802
+}
11271803 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
11281804
11291805 /**
....@@ -1143,9 +1819,11 @@
11431819 BUG_ON(addr > VMALLOC_END);
11441820 BUG_ON(!PAGE_ALIGNED(addr));
11451821
1822
+ kasan_poison_vmalloc(mem, size);
1823
+
11461824 if (likely(count <= VMAP_MAX_ALLOC)) {
11471825 debug_check_no_locks_freed(mem, size);
1148
- vb_free(mem, size);
1826
+ vb_free(addr, size);
11491827 return;
11501828 }
11511829
....@@ -1162,7 +1840,6 @@
11621840 * @pages: an array of pointers to the pages to be mapped
11631841 * @count: number of pages
11641842 * @node: prefer to allocate data structures on this node
1165
- * @prot: memory protection to use. PAGE_KERNEL for regular RAM
11661843 *
11671844 * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
11681845 * faster than vmap so it's good. But if you mix long-life and short-life
....@@ -1172,7 +1849,7 @@
11721849 *
11731850 * Returns: a pointer to the address that has been mapped, or %NULL on failure
11741851 */
1175
-void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
1852
+void *vm_map_ram(struct page **pages, unsigned int count, int node)
11761853 {
11771854 unsigned long size = (unsigned long)count << PAGE_SHIFT;
11781855 unsigned long addr;
....@@ -1193,7 +1870,10 @@
11931870 addr = va->va_start;
11941871 mem = (void *)addr;
11951872 }
1196
- if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
1873
+
1874
+ kasan_unpoison_vmalloc(mem, size);
1875
+
1876
+ if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
11971877 vm_unmap_ram(mem, count);
11981878 return NULL;
11991879 }
....@@ -1202,6 +1882,7 @@
12021882 EXPORT_SYMBOL(vm_map_ram);
12031883
12041884 static struct vm_struct *vmlist __initdata;
1885
+
12051886 /**
12061887 * vm_area_add_early - add vmap area early during boot
12071888 * @vm: vm_struct to add
....@@ -1253,11 +1934,57 @@
12531934 vm_area_add_early(vm);
12541935 }
12551936
1937
+static void vmap_init_free_space(void)
1938
+{
1939
+ unsigned long vmap_start = 1;
1940
+ const unsigned long vmap_end = ULONG_MAX;
1941
+ struct vmap_area *busy, *free;
1942
+
1943
+ /*
1944
+ * B F B B B F
1945
+ * -|-----|.....|-----|-----|-----|.....|-
1946
+ * | The KVA space |
1947
+ * |<--------------------------------->|
1948
+ */
1949
+ list_for_each_entry(busy, &vmap_area_list, list) {
1950
+ if (busy->va_start - vmap_start > 0) {
1951
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
1952
+ if (!WARN_ON_ONCE(!free)) {
1953
+ free->va_start = vmap_start;
1954
+ free->va_end = busy->va_start;
1955
+
1956
+ insert_vmap_area_augment(free, NULL,
1957
+ &free_vmap_area_root,
1958
+ &free_vmap_area_list);
1959
+ }
1960
+ }
1961
+
1962
+ vmap_start = busy->va_end;
1963
+ }
1964
+
1965
+ if (vmap_end - vmap_start > 0) {
1966
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
1967
+ if (!WARN_ON_ONCE(!free)) {
1968
+ free->va_start = vmap_start;
1969
+ free->va_end = vmap_end;
1970
+
1971
+ insert_vmap_area_augment(free, NULL,
1972
+ &free_vmap_area_root,
1973
+ &free_vmap_area_list);
1974
+ }
1975
+ }
1976
+}
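The gap derivation sketched in the B/F diagram above can be modelled with a short standalone userspace program (not kernel code; the addresses are made-up examples): walk the busy areas in address order and emit the hole before each one, plus the tail hole, as the initial free set.

#include <stdio.h>

struct range { unsigned long start, end; };

int main(void)
{
	/* Busy ("B") areas, sorted by address; everything else is free ("F"). */
	struct range busy[] = { { 0x2000, 0x3000 }, { 0x5000, 0x8000 } };
	unsigned long vmap_start = 0x1000, vmap_end = 0xa000;
	unsigned long cursor = vmap_start;
	unsigned int i;

	for (i = 0; i < sizeof(busy) / sizeof(busy[0]); i++) {
		if (busy[i].start > cursor)
			printf("free: [0x%lx, 0x%lx)\n", cursor, busy[i].start);
		cursor = busy[i].end;
	}
	if (vmap_end > cursor)
		printf("free: [0x%lx, 0x%lx)\n", cursor, vmap_end);
	return 0;
}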
1977
+
12561978 void __init vmalloc_init(void)
12571979 {
12581980 struct vmap_area *va;
12591981 struct vm_struct *tmp;
12601982 int i;
1983
+
1984
+ /*
1985
+ * Create the cache for vmap_area objects.
1986
+ */
1987
+ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
12611988
12621989 for_each_possible_cpu(i) {
12631990 struct vmap_block_queue *vbq;
....@@ -1273,63 +2000,22 @@
12732000
12742001 /* Import existing vmlist entries. */
12752002 for (tmp = vmlist; tmp; tmp = tmp->next) {
1276
- va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1277
- va->flags = VM_VM_AREA;
2003
+ va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
2004
+ if (WARN_ON_ONCE(!va))
2005
+ continue;
2006
+
12782007 va->va_start = (unsigned long)tmp->addr;
12792008 va->va_end = va->va_start + tmp->size;
12802009 va->vm = tmp;
1281
- __insert_vmap_area(va);
2010
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
12822011 }
12832012
1284
- vmap_area_pcpu_hole = VMALLOC_END;
1285
-
2013
+ /*
2014
+ * Now we can initialize a free vmap space.
2015
+ */
2016
+ vmap_init_free_space();
12862017 vmap_initialized = true;
12872018 }
1288
-
1289
-/**
1290
- * map_kernel_range_noflush - map kernel VM area with the specified pages
1291
- * @addr: start of the VM area to map
1292
- * @size: size of the VM area to map
1293
- * @prot: page protection flags to use
1294
- * @pages: pages to map
1295
- *
1296
- * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
1297
- * specify should have been allocated using get_vm_area() and its
1298
- * friends.
1299
- *
1300
- * NOTE:
1301
- * This function does NOT do any cache flushing. The caller is
1302
- * responsible for calling flush_cache_vmap() on to-be-mapped areas
1303
- * before calling this function.
1304
- *
1305
- * RETURNS:
1306
- * The number of pages mapped on success, -errno on failure.
1307
- */
1308
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
1309
- pgprot_t prot, struct page **pages)
1310
-{
1311
- return vmap_page_range_noflush(addr, addr + size, prot, pages);
1312
-}
1313
-
1314
-/**
1315
- * unmap_kernel_range_noflush - unmap kernel VM area
1316
- * @addr: start of the VM area to unmap
1317
- * @size: size of the VM area to unmap
1318
- *
1319
- * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
1320
- * specify should have been allocated using get_vm_area() and its
1321
- * friends.
1322
- *
1323
- * NOTE:
1324
- * This function does NOT do any cache flushing. The caller is
1325
- * responsible for calling flush_cache_vunmap() on to-be-mapped areas
1326
- * before calling this function and flush_tlb_kernel_range() after.
1327
- */
1328
-void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
1329
-{
1330
- vunmap_page_range(addr, addr + size);
1331
-}
1332
-EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
13332019
13342020 /**
13352021 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
....@@ -1344,33 +2030,26 @@
13442030 unsigned long end = addr + size;
13452031
13462032 flush_cache_vunmap(addr, end);
1347
- vunmap_page_range(addr, end);
2033
+ unmap_kernel_range_noflush(addr, size);
13482034 flush_tlb_kernel_range(addr, end);
13492035 }
1350
-EXPORT_SYMBOL_GPL(unmap_kernel_range);
13512036
1352
-int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
2037
+static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
2038
+ struct vmap_area *va, unsigned long flags, const void *caller)
13532039 {
1354
- unsigned long addr = (unsigned long)area->addr;
1355
- unsigned long end = addr + get_vm_area_size(area);
1356
- int err;
1357
-
1358
- err = vmap_page_range(addr, end, prot, pages);
1359
-
1360
- return err > 0 ? 0 : err;
1361
-}
1362
-EXPORT_SYMBOL_GPL(map_vm_area);
1363
-
1364
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1365
- unsigned long flags, const void *caller)
1366
-{
1367
- spin_lock(&vmap_area_lock);
13682040 vm->flags = flags;
13692041 vm->addr = (void *)va->va_start;
13702042 vm->size = va->va_end - va->va_start;
13712043 vm->caller = caller;
13722044 va->vm = vm;
1373
- va->flags |= VM_VM_AREA;
2045
+ trace_android_vh_save_vmalloc_stack(flags, vm);
2046
+}
2047
+
2048
+static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
2049
+ unsigned long flags, const void *caller)
2050
+{
2051
+ spin_lock(&vmap_area_lock);
2052
+ setup_vmalloc_vm_locked(vm, va, flags, caller);
13742053 spin_unlock(&vmap_area_lock);
13752054 }
13762055
....@@ -1391,6 +2070,7 @@
13912070 {
13922071 struct vmap_area *va;
13932072 struct vm_struct *area;
2073
+ unsigned long requested_size = size;
13942074
13952075 BUG_ON(in_interrupt());
13962076 size = PAGE_ALIGN(size);
....@@ -1414,18 +2094,12 @@
14142094 return NULL;
14152095 }
14162096
2097
+ kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
2098
+
14172099 setup_vmalloc_vm(area, va, flags, caller);
14182100
14192101 return area;
14202102 }
1421
-
1422
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1423
- unsigned long start, unsigned long end)
1424
-{
1425
- return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
1426
- GFP_KERNEL, __builtin_return_address(0));
1427
-}
1428
-EXPORT_SYMBOL_GPL(__get_vm_area);
14292103
14302104 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
14312105 unsigned long start, unsigned long end,
....@@ -1434,15 +2108,18 @@
14342108 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
14352109 GFP_KERNEL, caller);
14362110 }
2111
+EXPORT_SYMBOL_GPL(__get_vm_area_caller);
14372112
14382113 /**
1439
- * get_vm_area - reserve a contiguous kernel virtual area
1440
- * @size: size of the area
1441
- * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
2114
+ * get_vm_area - reserve a contiguous kernel virtual area
2115
+ * @size: size of the area
2116
+ * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
14422117 *
1443
- * Search an area of @size in the kernel virtual mapping area,
1444
- * and reserved it for out purposes. Returns the area descriptor
1445
- * on success or %NULL on failure.
2118
+ * Search an area of @size in the kernel virtual mapping area,
2119
+ * and reserve it for our purposes. Returns the area descriptor
2120
+ * on success or %NULL on failure.
2121
+ *
2122
+ * Return: the area descriptor on success or %NULL on failure.
14462123 */
14472124 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
14482125 {
....@@ -1450,7 +2127,6 @@
14502127 NUMA_NO_NODE, GFP_KERNEL,
14512128 __builtin_return_address(0));
14522129 }
1453
-EXPORT_SYMBOL_GPL(get_vm_area);
14542130
14552131 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
14562132 const void *caller)
....@@ -1460,31 +2136,35 @@
14602136 }
14612137
14622138 /**
1463
- * find_vm_area - find a continuous kernel virtual area
1464
- * @addr: base address
2139
+ * find_vm_area - find a continuous kernel virtual area
2140
+ * @addr: base address
14652141 *
1466
- * Search for the kernel VM area starting at @addr, and return it.
1467
- * It is up to the caller to do all required locking to keep the returned
1468
- * pointer valid.
2142
+ * Search for the kernel VM area starting at @addr, and return it.
2143
+ * It is up to the caller to do all required locking to keep the returned
2144
+ * pointer valid.
2145
+ *
2146
+ * Return: the area descriptor on success or %NULL on failure.
14692147 */
14702148 struct vm_struct *find_vm_area(const void *addr)
14712149 {
14722150 struct vmap_area *va;
14732151
14742152 va = find_vmap_area((unsigned long)addr);
1475
- if (va && va->flags & VM_VM_AREA)
1476
- return va->vm;
2153
+ if (!va)
2154
+ return NULL;
14772155
1478
- return NULL;
2156
+ return va->vm;
14792157 }
14802158
14812159 /**
1482
- * remove_vm_area - find and remove a continuous kernel virtual area
1483
- * @addr: base address
2160
+ * remove_vm_area - find and remove a continuous kernel virtual area
2161
+ * @addr: base address
14842162 *
1485
- * Search for the kernel VM area starting at @addr, and remove it.
1486
- * This function returns the found VM area, but using it is NOT safe
1487
- * on SMP machines, except for its size or flags.
2163
+ * Search for the kernel VM area starting at @addr, and remove it.
2164
+ * This function returns the found VM area, but using it is NOT safe
2165
+ * on SMP machines, except for its size or flags.
2166
+ *
2167
+ * Return: the area descriptor on success or %NULL on failure.
14882168 */
14892169 struct vm_struct *remove_vm_area(const void *addr)
14902170 {
....@@ -1492,14 +2172,13 @@
14922172
14932173 might_sleep();
14942174
1495
- va = find_vmap_area((unsigned long)addr);
1496
- if (va && va->flags & VM_VM_AREA) {
2175
+ spin_lock(&vmap_area_lock);
2176
+ va = __find_vmap_area((unsigned long)addr);
2177
+ if (va && va->vm) {
14972178 struct vm_struct *vm = va->vm;
14982179
1499
- spin_lock(&vmap_area_lock);
2180
+ trace_android_vh_remove_vmalloc_stack(vm);
15002181 va->vm = NULL;
1501
- va->flags &= ~VM_VM_AREA;
1502
- va->flags |= VM_LAZY_FREE;
15032182 spin_unlock(&vmap_area_lock);
15042183
15052184 kasan_free_shadow(vm);
....@@ -1507,7 +2186,66 @@
15072186
15082187 return vm;
15092188 }
2189
+
2190
+ spin_unlock(&vmap_area_lock);
15102191 return NULL;
2192
+}
2193
+
2194
+static inline void set_area_direct_map(const struct vm_struct *area,
2195
+ int (*set_direct_map)(struct page *page))
2196
+{
2197
+ int i;
2198
+
2199
+ for (i = 0; i < area->nr_pages; i++)
2200
+ if (page_address(area->pages[i]))
2201
+ set_direct_map(area->pages[i]);
2202
+}
2203
+
2204
+/* Handle removing and resetting vm mappings related to the vm_struct. */
2205
+static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
2206
+{
2207
+ unsigned long start = ULONG_MAX, end = 0;
2208
+ int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
2209
+ int flush_dmap = 0;
2210
+ int i;
2211
+
2212
+ remove_vm_area(area->addr);
2213
+
2214
+ /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
2215
+ if (!flush_reset)
2216
+ return;
2217
+
2218
+ /*
2219
+ * If not deallocating pages, just do the flush of the VM area and
2220
+ * return.
2221
+ */
2222
+ if (!deallocate_pages) {
2223
+ vm_unmap_aliases();
2224
+ return;
2225
+ }
2226
+
2227
+ /*
2228
+ * If execution gets here, flush the vm mapping and reset the direct
2229
+ * map. Find the start and end range of the direct mappings to make sure
2230
+ * the vm_unmap_aliases() flush includes the direct map.
2231
+ */
2232
+ for (i = 0; i < area->nr_pages; i++) {
2233
+ unsigned long addr = (unsigned long)page_address(area->pages[i]);
2234
+ if (addr) {
2235
+ start = min(addr, start);
2236
+ end = max(addr + PAGE_SIZE, end);
2237
+ flush_dmap = 1;
2238
+ }
2239
+ }
2240
+
2241
+ /*
2242
+ * Set direct map to something invalid so that it won't be cached if
2243
+ * there are any accesses after the TLB flush, then flush the TLB and
2244
+ * reset the direct map permissions to the default.
2245
+ */
2246
+ set_area_direct_map(area, set_direct_map_invalid_noflush);
2247
+ _vm_unmap_aliases(start, end, flush_dmap);
2248
+ set_area_direct_map(area, set_direct_map_default_noflush);
15112249 }
15122250
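The VM_FLUSH_RESET_PERMS handling above is relevant to callers that change permissions on a vmalloc'ed region. Below is a hedged sketch of such a caller; the helper name is hypothetical and it assumes set_memory_x() is implemented on the target architecture.

/*
 * Hypothetical example: allocate a region whose direct-map aliases are
 * invalidated, flushed and reset automatically when it is vfree()d.
 */
static void *example_alloc_exec(unsigned long size)
{
	unsigned int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
	void *p;

	p = __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
				 GFP_KERNEL, PAGE_KERNEL,
				 VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
				 __builtin_return_address(0));
	if (!p)
		return NULL;

	/* Changing permissions is what makes VM_FLUSH_RESET_PERMS necessary. */
	set_memory_x((unsigned long)p, npages);
	return p;	/* released with vfree(), which resets the direct map */
}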
15132251 static void __vunmap(const void *addr, int deallocate_pages)
....@@ -1531,7 +2269,10 @@
15312269 debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
15322270 debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
15332271
1534
- remove_vm_area(addr);
2272
+ kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
2273
+
2274
+ vm_remove_mappings(area, deallocate_pages);
2275
+
15352276 if (deallocate_pages) {
15362277 int i;
15372278
....@@ -1556,7 +2297,7 @@
15562297 * Use raw_cpu_ptr() because this can be called from preemptible
15572298 * context. Preemption is absolutely fine here, because the llist_add()
15582299 * implementation is lockless, so it works even if we are adding to
1559
- * nother cpu's list. schedule_work() should be fine with this too.
2300
+ * another cpu's list. schedule_work() should be fine with this too.
15602301 */
15612302 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
15622303
....@@ -1565,11 +2306,11 @@
15652306 }
15662307
15672308 /**
1568
- * vfree_atomic - release memory allocated by vmalloc()
1569
- * @addr: memory base address
2309
+ * vfree_atomic - release memory allocated by vmalloc()
2310
+ * @addr: memory base address
15702311 *
1571
- * This one is just like vfree() but can be called in any atomic context
1572
- * except NMIs.
2312
+ * This one is just like vfree() but can be called in any atomic context
2313
+ * except NMIs.
15732314 */
15742315 void vfree_atomic(const void *addr)
15752316 {
....@@ -1582,19 +2323,30 @@
15822323 __vfree_deferred(addr);
15832324 }
15842325
2326
+static void __vfree(const void *addr)
2327
+{
2328
+ if (unlikely(in_interrupt()))
2329
+ __vfree_deferred(addr);
2330
+ else
2331
+ __vunmap(addr, 1);
2332
+}
2333
+
15852334 /**
1586
- * vfree - release memory allocated by vmalloc()
1587
- * @addr: memory base address
2335
+ * vfree - Release memory allocated by vmalloc()
2336
+ * @addr: Memory base address
15882337 *
1589
- * Free the virtually continuous memory area starting at @addr, as
1590
- * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
1591
- * NULL, no operation is performed.
2338
+ * Free the virtually continuous memory area starting at @addr, as obtained
2339
+ * from one of the vmalloc() family of APIs. This will usually also free the
2340
+ * physical memory underlying the virtual allocation, but that memory is
2341
+ * reference counted, so it will not be freed until the last user goes away.
15922342 *
1593
- * Must not be called in NMI context (strictly speaking, only if we don't
1594
- * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
1595
- * conventions for vfree() arch-depenedent would be a really bad idea)
2343
+ * If @addr is NULL, no operation is performed.
15962344 *
1597
- * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
2345
+ * Context:
2346
+ * May sleep if called *not* from interrupt context.
2347
+ * Must not be called in NMI context (strictly speaking, it could be
2348
+ * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
2349
+ * conventions for vfree() arch-depenedent would be a really bad idea).
15982350 */
15992351 void vfree(const void *addr)
16002352 {
....@@ -1602,23 +2354,23 @@
16022354
16032355 kmemleak_free(addr);
16042356
2357
+ might_sleep_if(!in_interrupt());
2358
+
16052359 if (!addr)
16062360 return;
1607
- if (unlikely(in_interrupt()))
1608
- __vfree_deferred(addr);
1609
- else
1610
- __vunmap(addr, 1);
2361
+
2362
+ __vfree(addr);
16112363 }
16122364 EXPORT_SYMBOL(vfree);
16132365
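To illustrate the context rules spelled out above, a small hypothetical sketch follows: the structure and field names are made up, and the point is only that vfree_atomic() is the safe choice when sleeping is not allowed.

struct example_ctx {
	spinlock_t lock;
	void *buf;		/* allocated with vmalloc() or a relative */
};

/* Hypothetical example: drop a buffer while holding a spinlock. */
static void example_release_locked(struct example_ctx *ctx)
{
	spin_lock(&ctx->lock);
	/* vfree() might sleep here; the deferred variant never does. */
	vfree_atomic(ctx->buf);
	ctx->buf = NULL;
	spin_unlock(&ctx->lock);
}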
16142366 /**
1615
- * vunmap - release virtual mapping obtained by vmap()
1616
- * @addr: memory base address
2367
+ * vunmap - release virtual mapping obtained by vmap()
2368
+ * @addr: memory base address
16172369 *
1618
- * Free the virtually contiguous memory area starting at @addr,
1619
- * which was created from the page array passed to vmap().
2370
+ * Free the virtually contiguous memory area starting at @addr,
2371
+ * which was created from the page array passed to vmap().
16202372 *
1621
- * Must not be called in interrupt context.
2373
+ * Must not be called in interrupt context.
16222374 */
16232375 void vunmap(const void *addr)
16242376 {
....@@ -1630,24 +2382,29 @@
16302382 EXPORT_SYMBOL(vunmap);
16312383
16322384 /**
1633
- * vmap - map an array of pages into virtually contiguous space
1634
- * @pages: array of page pointers
1635
- * @count: number of pages to map
1636
- * @flags: vm_area->flags
1637
- * @prot: page protection for the mapping
2385
+ * vmap - map an array of pages into virtually contiguous space
2386
+ * @pages: array of page pointers
2387
+ * @count: number of pages to map
2388
+ * @flags: vm_area->flags
2389
+ * @prot: page protection for the mapping
16382390 *
1639
- * Maps @count pages from @pages into contiguous kernel virtual
1640
- * space.
2391
+ * Maps @count pages from @pages into contiguous kernel virtual space.
2392
+ * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
2393
+ * (which must be kmalloc or vmalloc memory) and one reference per pages in it
2394
+ * are transferred from the caller to vmap(), and will be freed / dropped when
2395
+ * vfree() is called on the return value.
2396
+ *
2397
+ * Return: the address of the area or %NULL on failure
16412398 */
16422399 void *vmap(struct page **pages, unsigned int count,
1643
- unsigned long flags, pgprot_t prot)
2400
+ unsigned long flags, pgprot_t prot)
16442401 {
16452402 struct vm_struct *area;
16462403 unsigned long size; /* In bytes */
16472404
16482405 might_sleep();
16492406
1650
- if (count > totalram_pages)
2407
+ if (count > totalram_pages())
16512408 return NULL;
16522409
16532410 size = (unsigned long)count << PAGE_SHIFT;
....@@ -1655,36 +2412,81 @@
16552412 if (!area)
16562413 return NULL;
16572414
1658
- if (map_vm_area(area, prot, pages)) {
2415
+ if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
2416
+ pages) < 0) {
16592417 vunmap(area->addr);
16602418 return NULL;
16612419 }
16622420
2421
+ if (flags & VM_MAP_PUT_PAGES) {
2422
+ area->pages = pages;
2423
+ area->nr_pages = count;
2424
+ }
16632425 return area->addr;
16642426 }
16652427 EXPORT_SYMBOL(vmap);
16662428
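A minimal sketch of the VM_MAP_PUT_PAGES behaviour documented above; the allocation loop is illustrative and the helper name is hypothetical.

/*
 * Hypothetical example: hand both the pages and the kmalloc'ed page array
 * over to vmap(), so a single vfree() later releases everything.
 */
static void *example_vmap_put_pages(unsigned int nr)
{
	struct page **pages;
	unsigned int i;
	void *va;

	pages = kcalloc(nr, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;

	for (i = 0; i < nr; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i])
			goto err;
	}

	va = vmap(pages, nr, VM_MAP | VM_MAP_PUT_PAGES, PAGE_KERNEL);
	if (!va)
		goto err;	/* on failure ownership stays with the caller */

	return va;		/* mapping, pages and array freed by vfree(va) */

err:
	while (i--)
		__free_page(pages[i]);
	kfree(pages);
	return NULL;
}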
1667
-static void *__vmalloc_node(unsigned long size, unsigned long align,
1668
- gfp_t gfp_mask, pgprot_t prot,
1669
- int node, const void *caller);
2429
+#ifdef CONFIG_VMAP_PFN
2430
+struct vmap_pfn_data {
2431
+ unsigned long *pfns;
2432
+ pgprot_t prot;
2433
+ unsigned int idx;
2434
+};
2435
+
2436
+static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
2437
+{
2438
+ struct vmap_pfn_data *data = private;
2439
+
2440
+ if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
2441
+ return -EINVAL;
2442
+ *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
2443
+ return 0;
2444
+}
2445
+
2446
+/**
2447
+ * vmap_pfn - map an array of PFNs into virtually contiguous space
2448
+ * @pfns: array of PFNs
2449
+ * @count: number of pages to map
2450
+ * @prot: page protection for the mapping
2451
+ *
2452
+ * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
2453
+ * the start address of the mapping.
2454
+ */
2455
+void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
2456
+{
2457
+ struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
2458
+ struct vm_struct *area;
2459
+
2460
+ area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
2461
+ __builtin_return_address(0));
2462
+ if (!area)
2463
+ return NULL;
2464
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
2465
+ count * PAGE_SIZE, vmap_pfn_apply, &data)) {
2466
+ free_vm_area(area);
2467
+ return NULL;
2468
+ }
2469
+ return area->addr;
2470
+}
2471
+EXPORT_SYMBOL_GPL(vmap_pfn);
2472
+#endif /* CONFIG_VMAP_PFN */
2473
+
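vmap_pfn() is meant for PFNs that have no struct page (the WARN above rejects pfn_valid() PFNs). The sketch below is hypothetical: the base PFN is assumed to come from a device aperture that is not part of system RAM.

#ifdef CONFIG_VMAP_PFN
/* Hypothetical example: map a contiguous, non-RAM PFN range write-combined. */
static void *example_map_device_pfns(unsigned long base_pfn, unsigned int count)
{
	unsigned long *pfns;
	unsigned int i;
	void *va;

	pfns = kmalloc_array(count, sizeof(*pfns), GFP_KERNEL);
	if (!pfns)
		return NULL;

	for (i = 0; i < count; i++)
		pfns[i] = base_pfn + i;	/* must not be pfn_valid() PFNs */

	va = vmap_pfn(pfns, count, pgprot_writecombine(PAGE_KERNEL));
	kfree(pfns);	/* vmap_pfn() keeps no reference to the array */
	return va;	/* tear down later with vunmap(va) */
}
#endif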
16702474 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
16712475 pgprot_t prot, int node)
16722476 {
1673
- struct page **pages;
1674
- unsigned int nr_pages, array_size, i;
16752477 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
1676
- const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
1677
- const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
1678
- 0 :
1679
- __GFP_HIGHMEM;
2478
+ unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
2479
+ unsigned int array_size = nr_pages * sizeof(struct page *), i;
2480
+ struct page **pages;
16802481
1681
- nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
1682
- array_size = (nr_pages * sizeof(struct page *));
2482
+ gfp_mask |= __GFP_NOWARN;
2483
+ if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
2484
+ gfp_mask |= __GFP_HIGHMEM;
16832485
16842486 /* Please note that the recursion is strictly bounded. */
16852487 if (array_size > PAGE_SIZE) {
1686
- pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
1687
- PAGE_KERNEL, node, area->caller);
2488
+ pages = __vmalloc_node(array_size, 1, nested_gfp, node,
2489
+ area->caller);
16882490 } else {
16892491 pages = kmalloc_node(array_size, nested_gfp, node);
16902492 }
....@@ -1702,49 +2504,53 @@
17022504 struct page *page;
17032505
17042506 if (node == NUMA_NO_NODE)
1705
- page = alloc_page(alloc_mask|highmem_mask);
2507
+ page = alloc_page(gfp_mask);
17062508 else
1707
- page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
2509
+ page = alloc_pages_node(node, gfp_mask, 0);
17082510
17092511 if (unlikely(!page)) {
1710
- /* Successfully allocated i pages, free them in __vunmap() */
2512
+ /* Successfully allocated i pages, free them in __vfree() */
17112513 area->nr_pages = i;
17122514 atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
17132515 goto fail;
17142516 }
17152517 area->pages[i] = page;
1716
- if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
2518
+ if (gfpflags_allow_blocking(gfp_mask))
17172519 cond_resched();
17182520 }
17192521 atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
17202522
1721
- if (map_vm_area(area, prot, pages))
2523
+ if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
2524
+ prot, pages) < 0)
17222525 goto fail;
2526
+
17232527 return area->addr;
17242528
17252529 fail:
17262530 warn_alloc(gfp_mask, NULL,
17272531 "vmalloc: allocation failure, allocated %ld of %ld bytes",
17282532 (area->nr_pages*PAGE_SIZE), area->size);
1729
- vfree(area->addr);
2533
+ __vfree(area->addr);
17302534 return NULL;
17312535 }
17322536
17332537 /**
1734
- * __vmalloc_node_range - allocate virtually contiguous memory
1735
- * @size: allocation size
1736
- * @align: desired alignment
1737
- * @start: vm area range start
1738
- * @end: vm area range end
1739
- * @gfp_mask: flags for the page level allocator
1740
- * @prot: protection mask for the allocated pages
1741
- * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
1742
- * @node: node to use for allocation or NUMA_NO_NODE
1743
- * @caller: caller's return address
2538
+ * __vmalloc_node_range - allocate virtually contiguous memory
2539
+ * @size: allocation size
2540
+ * @align: desired alignment
2541
+ * @start: vm area range start
2542
+ * @end: vm area range end
2543
+ * @gfp_mask: flags for the page level allocator
2544
+ * @prot: protection mask for the allocated pages
2545
+ * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
2546
+ * @node: node to use for allocation or NUMA_NO_NODE
2547
+ * @caller: caller's return address
17442548 *
1745
- * Allocate enough pages to cover @size from the page level
1746
- * allocator with @gfp_mask flags. Map them into contiguous
1747
- * kernel virtual space, using a pagetable protection of @prot.
2549
+ * Allocate enough pages to cover @size from the page level
2550
+ * allocator with @gfp_mask flags. Map them into contiguous
2551
+ * kernel virtual space, using a pagetable protection of @prot.
2552
+ *
2553
+ * Return: the address of the area or %NULL on failure
17482554 */
17492555 void *__vmalloc_node_range(unsigned long size, unsigned long align,
17502556 unsigned long start, unsigned long end, gfp_t gfp_mask,
....@@ -1756,10 +2562,10 @@
17562562 unsigned long real_size = size;
17572563
17582564 size = PAGE_ALIGN(size);
1759
- if (!size || (size >> PAGE_SHIFT) > totalram_pages)
2565
+ if (!size || (size >> PAGE_SHIFT) > totalram_pages())
17602566 goto fail;
17612567
1762
- area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
2568
+ area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
17632569 vm_flags, start, end, node, gfp_mask, caller);
17642570 if (!area)
17652571 goto fail;
....@@ -1767,12 +2573,6 @@
17672573 addr = __vmalloc_area_node(area, gfp_mask, prot, node);
17682574 if (!addr)
17692575 return NULL;
1770
-
1771
- /*
1772
- * First make sure the mappings are removed from all page-tables
1773
- * before they are freed.
1774
- */
1775
- vmalloc_sync_unmappings();
17762576
17772577 /*
17782578 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
....@@ -1792,84 +2592,82 @@
17922592 }
17932593
17942594 /**
1795
- * __vmalloc_node - allocate virtually contiguous memory
1796
- * @size: allocation size
1797
- * @align: desired alignment
1798
- * @gfp_mask: flags for the page level allocator
1799
- * @prot: protection mask for the allocated pages
1800
- * @node: node to use for allocation or NUMA_NO_NODE
1801
- * @caller: caller's return address
2595
+ * __vmalloc_node - allocate virtually contiguous memory
2596
+ * @size: allocation size
2597
+ * @align: desired alignment
2598
+ * @gfp_mask: flags for the page level allocator
2599
+ * @node: node to use for allocation or NUMA_NO_NODE
2600
+ * @caller: caller's return address
18022601 *
1803
- * Allocate enough pages to cover @size from the page level
1804
- * allocator with @gfp_mask flags. Map them into contiguous
1805
- * kernel virtual space, using a pagetable protection of @prot.
2602
+ * Allocate enough pages to cover @size from the page level allocator with
2603
+ * @gfp_mask flags. Map them into contiguous kernel virtual space.
18062604 *
1807
- * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
1808
- * and __GFP_NOFAIL are not supported
2605
+ * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
2606
+ * and __GFP_NOFAIL are not supported
18092607 *
1810
- * Any use of gfp flags outside of GFP_KERNEL should be consulted
1811
- * with mm people.
2608
+ * Any use of gfp flags outside of GFP_KERNEL should be consulted
2609
+ * with mm people.
18122610 *
2611
+ * Return: pointer to the allocated memory or %NULL on error
18132612 */
1814
-static void *__vmalloc_node(unsigned long size, unsigned long align,
1815
- gfp_t gfp_mask, pgprot_t prot,
1816
- int node, const void *caller)
2613
+void *__vmalloc_node(unsigned long size, unsigned long align,
2614
+ gfp_t gfp_mask, int node, const void *caller)
18172615 {
18182616 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1819
- gfp_mask, prot, 0, node, caller);
2617
+ gfp_mask, PAGE_KERNEL, 0, node, caller);
18202618 }
2619
+/*
2620
+ * This is only for performance analysis of vmalloc and for stress purposes.
2621
+ * It is required by the vmalloc test module; therefore, do not use it for
2622
+ * anything other than that.
2623
+ */
2624
+#ifdef CONFIG_TEST_VMALLOC_MODULE
2625
+EXPORT_SYMBOL_GPL(__vmalloc_node);
2626
+#endif
18212627
1822
-void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
2628
+void *__vmalloc(unsigned long size, gfp_t gfp_mask)
18232629 {
1824
- return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
2630
+ return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
18252631 __builtin_return_address(0));
18262632 }
18272633 EXPORT_SYMBOL(__vmalloc);
18282634
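A short, hypothetical call site for __vmalloc(); it only restates the gfp constraints mentioned in the kernel-doc above.

/* Hypothetical example: zeroed allocation through the raw __vmalloc() entry. */
static void *example_alloc_table(size_t size)
{
	/*
	 * GFP_KERNEL (optionally with __GFP_ZERO) is the expected case;
	 * reclaim modifiers such as __GFP_NOFAIL are not supported here.
	 */
	return __vmalloc(size, GFP_KERNEL | __GFP_ZERO);
}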
1829
-static inline void *__vmalloc_node_flags(unsigned long size,
1830
- int node, gfp_t flags)
1831
-{
1832
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
1833
- node, __builtin_return_address(0));
1834
-}
1835
-
1836
-
1837
-void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
1838
- void *caller)
1839
-{
1840
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
1841
-}
1842
-
18432635 /**
1844
- * vmalloc - allocate virtually contiguous memory
1845
- * @size: allocation size
1846
- * Allocate enough pages to cover @size from the page level
1847
- * allocator and map them into contiguous kernel virtual space.
2636
+ * vmalloc - allocate virtually contiguous memory
2637
+ * @size: allocation size
18482638 *
1849
- * For tight control over page level allocator and protection flags
1850
- * use __vmalloc() instead.
2639
+ * Allocate enough pages to cover @size from the page level
2640
+ * allocator and map them into contiguous kernel virtual space.
2641
+ *
2642
+ * For tight control over page level allocator and protection flags
2643
+ * use __vmalloc() instead.
2644
+ *
2645
+ * Return: pointer to the allocated memory or %NULL on error
18512646 */
18522647 void *vmalloc(unsigned long size)
18532648 {
1854
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
1855
- GFP_KERNEL);
2649
+ return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
2650
+ __builtin_return_address(0));
18562651 }
18572652 EXPORT_SYMBOL(vmalloc);
18582653
18592654 /**
1860
- * vzalloc - allocate virtually contiguous memory with zero fill
1861
- * @size: allocation size
1862
- * Allocate enough pages to cover @size from the page level
1863
- * allocator and map them into contiguous kernel virtual space.
1864
- * The memory allocated is set to zero.
2655
+ * vzalloc - allocate virtually contiguous memory with zero fill
2656
+ * @size: allocation size
18652657 *
1866
- * For tight control over page level allocator and protection flags
1867
- * use __vmalloc() instead.
2658
+ * Allocate enough pages to cover @size from the page level
2659
+ * allocator and map them into contiguous kernel virtual space.
2660
+ * The memory allocated is set to zero.
2661
+ *
2662
+ * For tight control over page level allocator and protection flags
2663
+ * use __vmalloc() instead.
2664
+ *
2665
+ * Return: pointer to the allocated memory or %NULL on error
18682666 */
18692667 void *vzalloc(unsigned long size)
18702668 {
1871
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
1872
- GFP_KERNEL | __GFP_ZERO);
2669
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
2670
+ __builtin_return_address(0));
18732671 }
18742672 EXPORT_SYMBOL(vzalloc);
18752673
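As a usage note, a hypothetical sketch of the common vzalloc() pattern for large, non-DMA data that need not be physically contiguous; the name is illustrative.

/* Hypothetical example: large, zeroed lookup table released with vfree(). */
static u32 *example_alloc_lookup(unsigned int nr_entries)
{
	/* array_size() saturates on overflow, so a huge request simply fails. */
	return vzalloc(array_size(nr_entries, sizeof(u32)));
}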
....@@ -1879,39 +2677,35 @@
18792677 *
18802678 * The resulting memory area is zeroed so it can be mapped to userspace
18812679 * without leaking data.
2680
+ *
2681
+ * Return: pointer to the allocated memory or %NULL on error
18822682 */
18832683 void *vmalloc_user(unsigned long size)
18842684 {
1885
- struct vm_struct *area;
1886
- void *ret;
1887
-
1888
- ret = __vmalloc_node(size, SHMLBA,
1889
- GFP_KERNEL | __GFP_ZERO,
1890
- PAGE_KERNEL, NUMA_NO_NODE,
1891
- __builtin_return_address(0));
1892
- if (ret) {
1893
- area = find_vm_area(ret);
1894
- area->flags |= VM_USERMAP;
1895
- }
1896
- return ret;
2685
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
2686
+ GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
2687
+ VM_USERMAP, NUMA_NO_NODE,
2688
+ __builtin_return_address(0));
18972689 }
18982690 EXPORT_SYMBOL(vmalloc_user);
18992691
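A one-line, hypothetical wrapper to show where vmalloc_user() fits: the returned buffer is zeroed and flagged VM_USERMAP, which the remap helpers further down require.

/* Hypothetical example: buffer that will later be mapped into userspace. */
static void *example_alloc_user_buf(unsigned long size)
{
	return vmalloc_user(size);	/* SHMLBA aligned, zeroed, VM_USERMAP set */
}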
19002692 /**
1901
- * vmalloc_node - allocate memory on a specific node
1902
- * @size: allocation size
1903
- * @node: numa node
2693
+ * vmalloc_node - allocate memory on a specific node
2694
+ * @size: allocation size
2695
+ * @node: numa node
19042696 *
1905
- * Allocate enough pages to cover @size from the page level
1906
- * allocator and map them into contiguous kernel virtual space.
2697
+ * Allocate enough pages to cover @size from the page level
2698
+ * allocator and map them into contiguous kernel virtual space.
19072699 *
1908
- * For tight control over page level allocator and protection flags
1909
- * use __vmalloc() instead.
2700
+ * For tight control over page level allocator and protection flags
2701
+ * use __vmalloc() instead.
2702
+ *
2703
+ * Return: pointer to the allocated memory or %NULL on error
19102704 */
19112705 void *vmalloc_node(unsigned long size, int node)
19122706 {
1913
- return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
1914
- node, __builtin_return_address(0));
2707
+ return __vmalloc_node(size, 1, GFP_KERNEL, node,
2708
+ __builtin_return_address(0));
19152709 }
19162710 EXPORT_SYMBOL(vmalloc_node);
19172711
....@@ -1924,33 +2718,14 @@
19242718 * allocator and map them into contiguous kernel virtual space.
19252719 * The memory allocated is set to zero.
19262720 *
1927
- * For tight control over page level allocator and protection flags
1928
- * use __vmalloc_node() instead.
2721
+ * Return: pointer to the allocated memory or %NULL on error
19292722 */
19302723 void *vzalloc_node(unsigned long size, int node)
19312724 {
1932
- return __vmalloc_node_flags(size, node,
1933
- GFP_KERNEL | __GFP_ZERO);
2725
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
2726
+ __builtin_return_address(0));
19342727 }
19352728 EXPORT_SYMBOL(vzalloc_node);
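A hedged sketch of node-aware allocation with vzalloc_node(); the array layout and error handling are illustrative only.

/* Hypothetical example: one zeroed, node-local buffer per online node. */
static int example_alloc_pernode(void *bufs[MAX_NUMNODES], unsigned long size)
{
	int nid;

	for_each_online_node(nid) {
		bufs[nid] = vzalloc_node(size, nid);
		if (!bufs[nid])
			return -ENOMEM;	/* caller vfree()s what was allocated */
	}
	return 0;
}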
1936
-
1937
-/**
1938
- * vmalloc_exec - allocate virtually contiguous, executable memory
1939
- * @size: allocation size
1940
- *
1941
- * Kernel-internal function to allocate enough pages to cover @size
1942
- * the page level allocator and map them into contiguous and
1943
- * executable kernel virtual space.
1944
- *
1945
- * For tight control over page level allocator and protection flags
1946
- * use __vmalloc() instead.
1947
- */
1948
-
1949
-void *vmalloc_exec(unsigned long size)
1950
-{
1951
- return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
1952
- NUMA_NO_NODE, __builtin_return_address(0));
1953
-}
19542729
19552730 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
19562731 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
....@@ -1965,38 +2740,36 @@
19652740 #endif
19662741
19672742 /**
1968
- * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
1969
- * @size: allocation size
2743
+ * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
2744
+ * @size: allocation size
19702745 *
1971
- * Allocate enough 32bit PA addressable pages to cover @size from the
1972
- * page level allocator and map them into contiguous kernel virtual space.
2746
+ * Allocate enough 32bit PA addressable pages to cover @size from the
2747
+ * page level allocator and map them into contiguous kernel virtual space.
2748
+ *
2749
+ * Return: pointer to the allocated memory or %NULL on error
19732750 */
19742751 void *vmalloc_32(unsigned long size)
19752752 {
1976
- return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
1977
- NUMA_NO_NODE, __builtin_return_address(0));
2753
+ return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
2754
+ __builtin_return_address(0));
19782755 }
19792756 EXPORT_SYMBOL(vmalloc_32);
19802757
19812758 /**
19822759 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
1983
- * @size: allocation size
2760
+ * @size: allocation size
19842761 *
19852762 * The resulting memory area is 32bit addressable and zeroed so it can be
19862763 * mapped to userspace without leaking data.
2764
+ *
2765
+ * Return: pointer to the allocated memory or %NULL on error
19872766 */
19882767 void *vmalloc_32_user(unsigned long size)
19892768 {
1990
- struct vm_struct *area;
1991
- void *ret;
1992
-
1993
- ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1994
- NUMA_NO_NODE, __builtin_return_address(0));
1995
- if (ret) {
1996
- area = find_vm_area(ret);
1997
- area->flags |= VM_USERMAP;
1998
- }
1999
- return ret;
2769
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
2770
+ GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
2771
+ VM_USERMAP, NUMA_NO_NODE,
2772
+ __builtin_return_address(0));
20002773 }
20012774 EXPORT_SYMBOL(vmalloc_32_user);
20022775
....@@ -2082,31 +2855,29 @@
20822855 }
20832856
20842857 /**
2085
- * vread() - read vmalloc area in a safe way.
2086
- * @buf: buffer for reading data
2087
- * @addr: vm address.
2088
- * @count: number of bytes to be read.
2858
+ * vread() - read vmalloc area in a safe way.
2859
+ * @buf: buffer for reading data
2860
+ * @addr: vm address.
2861
+ * @count: number of bytes to be read.
20892862 *
2090
- * Returns # of bytes which addr and buf should be increased.
2091
- * (same number to @count). Returns 0 if [addr...addr+count) doesn't
2092
- * includes any intersect with alive vmalloc area.
2863
+ * This function checks that addr is a valid vmalloc'ed area, and
2864
+ * copies data from that area to a given buffer. If the given memory range
2865
+ * of [addr...addr+count) includes some valid address, data is copied to
2866
+ * proper area of @buf. If there are memory holes, they'll be zero-filled.
2867
+ * An IOREMAP area is treated as a memory hole and no copy is done.
20932868 *
2094
- * This function checks that addr is a valid vmalloc'ed area, and
2095
- * copy data from that area to a given buffer. If the given memory range
2096
- * of [addr...addr+count) includes some valid address, data is copied to
2097
- * proper area of @buf. If there are memory holes, they'll be zero-filled.
2098
- * IOREMAP area is treated as memory hole and no copy is done.
2869
+ * If [addr...addr+count) doesn't include any intersection with a live
2870
+ * vm_struct area, returns 0. @buf should be a kernel buffer.
20992871 *
2100
- * If [addr...addr+count) doesn't includes any intersects with alive
2101
- * vm_struct area, returns 0. @buf should be kernel's buffer.
2872
+ * Note: In usual ops, vread() is never necessary because the caller
2873
+ * should know the vmalloc() area is valid and can use memcpy().
2874
+ * This is for routines which have to access vmalloc area without
2875
+ * any information, such as /dev/kmem.
21022876 *
2103
- * Note: In usual ops, vread() is never necessary because the caller
2104
- * should know vmalloc() area is valid and can use memcpy().
2105
- * This is for routines which have to access vmalloc area without
2106
- * any informaion, as /dev/kmem.
2107
- *
2877
+ * Return: number of bytes for which addr and buf should be increased
2878
+ * (same number as @count) or %0 if [addr...addr+count) doesn't
2879
+ * include any intersection with valid vmalloc area
21082880 */
2109
-
21102881 long vread(char *buf, char *addr, unsigned long count)
21112882 {
21122883 struct vmap_area *va;
....@@ -2124,7 +2895,7 @@
21242895 if (!count)
21252896 break;
21262897
2127
- if (!(va->flags & VM_VM_AREA))
2898
+ if (!va->vm)
21282899 continue;
21292900
21302901 vm = va->vm;
....@@ -2163,31 +2934,29 @@
21632934 }
21642935
21652936 /**
2166
- * vwrite() - write vmalloc area in a safe way.
2167
- * @buf: buffer for source data
2168
- * @addr: vm address.
2169
- * @count: number of bytes to be read.
2937
+ * vwrite() - write vmalloc area in a safe way.
2938
+ * @buf: buffer for source data
2939
+ * @addr: vm address.
2940
+ * @count: number of bytes to be read.
21702941 *
2171
- * Returns # of bytes which addr and buf should be incresed.
2172
- * (same number to @count).
2173
- * If [addr...addr+count) doesn't includes any intersect with valid
2174
- * vmalloc area, returns 0.
2942
+ * This function checks that addr is a valid vmalloc'ed area, and
2943
+ * copies data from a buffer to the given addr. If the specified range of
2944
+ * [addr...addr+count) includes some valid address, data is copied from
2945
+ * proper area of @buf. If there are memory holes, no copy is done into them.
2946
+ * An IOREMAP area is treated as a memory hole and no copy is done.
21752947 *
2176
- * This function checks that addr is a valid vmalloc'ed area, and
2177
- * copy data from a buffer to the given addr. If specified range of
2178
- * [addr...addr+count) includes some valid address, data is copied from
2179
- * proper area of @buf. If there are memory holes, no copy to hole.
2180
- * IOREMAP area is treated as memory hole and no copy is done.
2948
+ * If [addr...addr+count) doesn't include any intersection with a live
2949
+ * vm_struct area, returns 0. @buf should be a kernel buffer.
21812950 *
2182
- * If [addr...addr+count) doesn't includes any intersects with alive
2183
- * vm_struct area, returns 0. @buf should be kernel's buffer.
2951
+ * Note: In usual ops, vwrite() is never necessary because the caller
2952
+ * should know the vmalloc() area is valid and can use memcpy().
2953
+ * This is for routines which have to access vmalloc area without
2954
+ * any information, such as /dev/kmem.
21842955 *
2185
- * Note: In usual ops, vwrite() is never necessary because the caller
2186
- * should know vmalloc() area is valid and can use memcpy().
2187
- * This is for routines which have to access vmalloc area without
2188
- * any informaion, as /dev/kmem.
2956
+ * Return: number of bytes for which addr and buf should be
2957
+ * increased (same number as @count) or %0 if [addr...addr+count)
2958
+ * doesn't include any intersection with valid vmalloc area
21892959 */
2190
-
21912960 long vwrite(char *buf, char *addr, unsigned long count)
21922961 {
21932962 struct vmap_area *va;
....@@ -2206,7 +2975,7 @@
22062975 if (!count)
22072976 break;
22082977
2209
- if (!(va->flags & VM_VM_AREA))
2978
+ if (!va->vm)
22102979 continue;
22112980
22122981 vm = va->vm;
....@@ -2239,21 +3008,21 @@
22393008 }
22403009
22413010 /**
2242
- * remap_vmalloc_range_partial - map vmalloc pages to userspace
2243
- * @vma: vma to cover
2244
- * @uaddr: target user address to start at
2245
- * @kaddr: virtual address of vmalloc kernel memory
2246
- * @pgoff: offset from @kaddr to start at
2247
- * @size: size of map area
3011
+ * remap_vmalloc_range_partial - map vmalloc pages to userspace
3012
+ * @vma: vma to cover
3013
+ * @uaddr: target user address to start at
3014
+ * @kaddr: virtual address of vmalloc kernel memory
3015
+ * @pgoff: offset from @kaddr to start at
3016
+ * @size: size of map area
22483017 *
2249
- * Returns: 0 for success, -Exxx on failure
3018
+ * Returns: 0 for success, -Exxx on failure
22503019 *
2251
- * This function checks that @kaddr is a valid vmalloc'ed area,
2252
- * and that it is big enough to cover the range starting at
2253
- * @uaddr in @vma. Will return failure if that criteria isn't
2254
- * met.
3020
+ * This function checks that @kaddr is a valid vmalloc'ed area,
3021
+ * and that it is big enough to cover the range starting at
3022
+ * @uaddr in @vma. Will return failure if that criterion isn't
3023
+ * met.
22553024 *
2256
- * Similar to remap_pfn_range() (see mm/memory.c)
3025
+ * Similar to remap_pfn_range() (see mm/memory.c)
22573026 */
22583027 int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
22593028 void *kaddr, unsigned long pgoff,
....@@ -2275,7 +3044,7 @@
22753044 if (!area)
22763045 return -EINVAL;
22773046
2278
- if (!(area->flags & VM_USERMAP))
3047
+ if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
22793048 return -EINVAL;
22803049
22813050 if (check_add_overflow(size, off, &end_index) ||
....@@ -2303,18 +3072,18 @@
23033072 EXPORT_SYMBOL(remap_vmalloc_range_partial);
23043073
23053074 /**
2306
- * remap_vmalloc_range - map vmalloc pages to userspace
2307
- * @vma: vma to cover (map full range of vma)
2308
- * @addr: vmalloc memory
2309
- * @pgoff: number of pages into addr before first page to map
3075
+ * remap_vmalloc_range - map vmalloc pages to userspace
3076
+ * @vma: vma to cover (map full range of vma)
3077
+ * @addr: vmalloc memory
3078
+ * @pgoff: number of pages into addr before first page to map
23103079 *
2311
- * Returns: 0 for success, -Exxx on failure
3080
+ * Returns: 0 for success, -Exxx on failure
23123081 *
2313
- * This function checks that addr is a valid vmalloc'ed area, and
2314
- * that it is big enough to cover the vma. Will return failure if
2315
- * that criteria isn't met.
3082
+ * This function checks that addr is a valid vmalloc'ed area, and
3083
+ * that it is big enough to cover the vma. Will return failure if
3084
+ * that criterion isn't met.
23163085 *
2317
- * Similar to remap_pfn_range() (see mm/memory.c)
3086
+ * Similar to remap_pfn_range() (see mm/memory.c)
23183087 */
23193088 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
23203089 unsigned long pgoff)
....@@ -2324,69 +3093,6 @@
23243093 vma->vm_end - vma->vm_start);
23253094 }
23263095 EXPORT_SYMBOL(remap_vmalloc_range);
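To show the usual pairing with vmalloc_user(), here is a hypothetical ->mmap handler; the assumption that file->private_data already points at a vmalloc_user() buffer is made up for the sketch.

/* Hypothetical example: expose a vmalloc_user() buffer through mmap(). */
static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	void *buf = file->private_data;	/* assumed: a vmalloc_user() buffer */

	/* Maps the whole VMA, starting vma->vm_pgoff pages into the buffer. */
	return remap_vmalloc_range(vma, buf, vma->vm_pgoff);
}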
2327
-
2328
-/*
2329
- * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose
2330
- * not to have one.
2331
- *
2332
- * The purpose of this function is to make sure the vmalloc area
2333
- * mappings are identical in all page-tables in the system.
2334
- */
2335
-void __weak vmalloc_sync_mappings(void)
2336
-{
2337
-}
2338
-
2339
-void __weak vmalloc_sync_unmappings(void)
2340
-{
2341
-}
2342
-
2343
-static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
2344
-{
2345
- pte_t ***p = data;
2346
-
2347
- if (p) {
2348
- *(*p) = pte;
2349
- (*p)++;
2350
- }
2351
- return 0;
2352
-}
2353
-
2354
-/**
2355
- * alloc_vm_area - allocate a range of kernel address space
2356
- * @size: size of the area
2357
- * @ptes: returns the PTEs for the address space
2358
- *
2359
- * Returns: NULL on failure, vm_struct on success
2360
- *
2361
- * This function reserves a range of kernel address space, and
2362
- * allocates pagetables to map that range. No actual mappings
2363
- * are created.
2364
- *
2365
- * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
2366
- * allocated for the VM area are returned.
2367
- */
2368
-struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
2369
-{
2370
- struct vm_struct *area;
2371
-
2372
- area = get_vm_area_caller(size, VM_IOREMAP,
2373
- __builtin_return_address(0));
2374
- if (area == NULL)
2375
- return NULL;
2376
-
2377
- /*
2378
- * This ensures that page tables are constructed for this region
2379
- * of kernel virtual address space and mapped into init_mm.
2380
- */
2381
- if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
2382
- size, f, ptes ? &ptes : NULL)) {
2383
- free_vm_area(area);
2384
- return NULL;
2385
- }
2386
-
2387
- return area;
2388
-}
2389
-EXPORT_SYMBOL_GPL(alloc_vm_area);
23903096
23913097 void free_vm_area(struct vm_struct *area)
23923098 {
....@@ -2404,81 +3110,64 @@
24043110 }
24053111
24063112 /**
2407
- * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
2408
- * @end: target address
2409
- * @pnext: out arg for the next vmap_area
2410
- * @pprev: out arg for the previous vmap_area
3113
+ * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
3114
+ * @addr: target address
24113115 *
2412
- * Returns: %true if either or both of next and prev are found,
2413
- * %false if no vmap_area exists
2414
- *
2415
- * Find vmap_areas end addresses of which enclose @end. ie. if not
2416
- * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
3116
+ * Returns: the vmap_area if it is found. If there is no such area,
3117
+ * the highest vmap_area below @addr is returned (reverse-order search),
3118
+ * i.e. va->va_start < addr && va->va_end < addr, or NULL
3119
+ * if there are no areas before @addr.
24173120 */
2418
-static bool pvm_find_next_prev(unsigned long end,
2419
- struct vmap_area **pnext,
2420
- struct vmap_area **pprev)
3121
+static struct vmap_area *
3122
+pvm_find_va_enclose_addr(unsigned long addr)
24213123 {
2422
- struct rb_node *n = vmap_area_root.rb_node;
2423
- struct vmap_area *va = NULL;
3124
+ struct vmap_area *va, *tmp;
3125
+ struct rb_node *n;
3126
+
3127
+ n = free_vmap_area_root.rb_node;
3128
+ va = NULL;
24243129
24253130 while (n) {
2426
- va = rb_entry(n, struct vmap_area, rb_node);
2427
- if (end < va->va_end)
2428
- n = n->rb_left;
2429
- else if (end > va->va_end)
3131
+ tmp = rb_entry(n, struct vmap_area, rb_node);
3132
+ if (tmp->va_start <= addr) {
3133
+ va = tmp;
3134
+ if (tmp->va_end >= addr)
3135
+ break;
3136
+
24303137 n = n->rb_right;
2431
- else
2432
- break;
3138
+ } else {
3139
+ n = n->rb_left;
3140
+ }
24333141 }
24343142
2435
- if (!va)
2436
- return false;
2437
-
2438
- if (va->va_end > end) {
2439
- *pnext = va;
2440
- *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2441
- } else {
2442
- *pprev = va;
2443
- *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
2444
- }
2445
- return true;
3143
+ return va;
24463144 }
24473145
24483146 /**
2449
- * pvm_determine_end - find the highest aligned address between two vmap_areas
2450
- * @pnext: in/out arg for the next vmap_area
2451
- * @pprev: in/out arg for the previous vmap_area
2452
- * @align: alignment
3147
+ * pvm_determine_end_from_reverse - find the highest aligned address
3148
+ * of a free block below VMALLOC_END
3149
+ * @va:
3150
+ * in - the VA we start the search from (in reverse order);
3151
+ * out - the VA with the highest aligned end address.
24533152 *
2454
- * Returns: determined end address
2455
- *
2456
- * Find the highest aligned address between *@pnext and *@pprev below
2457
- * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
2458
- * down address is between the end addresses of the two vmap_areas.
2459
- *
2460
- * Please note that the address returned by this function may fall
2461
- * inside *@pnext vmap_area. The caller is responsible for checking
2462
- * that.
3153
+ * Returns: determined end address within vmap_area
24633154 */
2464
-static unsigned long pvm_determine_end(struct vmap_area **pnext,
2465
- struct vmap_area **pprev,
2466
- unsigned long align)
3155
+static unsigned long
3156
+pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
24673157 {
2468
- const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
3158
+ unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
24693159 unsigned long addr;
24703160
2471
- if (*pnext)
2472
- addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
2473
- else
2474
- addr = vmalloc_end;
2475
-
2476
- while (*pprev && (*pprev)->va_end > addr) {
2477
- *pnext = *pprev;
2478
- *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
3161
+ if (likely(*va)) {
3162
+ list_for_each_entry_from_reverse((*va),
3163
+ &free_vmap_area_list, list) {
3164
+ addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
3165
+ if ((*va)->va_start < addr)
3166
+ return addr;
3167
+ }
24793168 }
24803169
2481
- return addr;
3170
+ return 0;
24823171 }
24833172
24843173 /**
....@@ -2498,12 +3187,12 @@
24983187 * to gigabytes. To avoid interacting with regular vmallocs, these
24993188 * areas are allocated from top.
25003189 *
2501
- * Despite its complicated look, this allocator is rather simple. It
2502
- * does everything top-down and scans areas from the end looking for
2503
- * matching slot. While scanning, if any of the areas overlaps with
2504
- * existing vmap_area, the base address is pulled down to fit the
2505
- * area. Scanning is repeated till all the areas fit and then all
2506
- * necessary data structures are inserted and the result is returned.
3190
+ * Despite its complicated look, this allocator is rather simple. It
3191
+ * does everything top-down and scans free blocks from the end looking
3192
+ * for a matching base. While scanning, if any of the areas do not fit, the
3193
+ * base address is pulled down to fit the area. Scanning is repeated till
3194
+ * all the areas fit and then all necessary data structures are inserted
3195
+ * and the result is returned.
25073196 */
25083197 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
25093198 const size_t *sizes, int nr_vms,
....@@ -2511,11 +3200,12 @@
25113200 {
25123201 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
25133202 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2514
- struct vmap_area **vas, *prev, *next;
3203
+ struct vmap_area **vas, *va;
25153204 struct vm_struct **vms;
25163205 int area, area2, last_area, term_area;
2517
- unsigned long base, start, end, last_end;
3206
+ unsigned long base, start, size, end, last_end, orig_start, orig_end;
25183207 bool purged = false;
3208
+ enum fit_type type;
25193209
25203210 /* verify parameters and allocate data structures */
25213211 BUG_ON(offset_in_page(align) || !is_power_of_2(align));
....@@ -2551,62 +3241,52 @@
25513241 goto err_free2;
25523242
25533243 for (area = 0; area < nr_vms; area++) {
2554
- vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
3244
+ vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
25553245 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
25563246 if (!vas[area] || !vms[area])
25573247 goto err_free;
25583248 }
25593249 retry:
2560
- spin_lock(&vmap_area_lock);
3250
+ spin_lock(&free_vmap_area_lock);
25613251
25623252 /* start scanning - we scan from the top, begin with the last area */
25633253 area = term_area = last_area;
25643254 start = offsets[area];
25653255 end = start + sizes[area];
25663256
2567
- if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
2568
- base = vmalloc_end - last_end;
2569
- goto found;
2570
- }
2571
- base = pvm_determine_end(&next, &prev, align) - end;
3257
+ va = pvm_find_va_enclose_addr(vmalloc_end);
3258
+ base = pvm_determine_end_from_reverse(&va, align) - end;
25723259
25733260 while (true) {
2574
- BUG_ON(next && next->va_end <= base + end);
2575
- BUG_ON(prev && prev->va_end > base + end);
2576
-
25773261 /*
25783262 * base might have underflowed, add last_end before
25793263 * comparing.
25803264 */
2581
- if (base + last_end < vmalloc_start + last_end) {
2582
- spin_unlock(&vmap_area_lock);
2583
- if (!purged) {
2584
- purge_vmap_area_lazy();
2585
- purged = true;
2586
- goto retry;
2587
- }
2588
- goto err_free;
2589
- }
3265
+ if (base + last_end < vmalloc_start + last_end)
3266
+ goto overflow;
25903267
25913268 /*
2592
- * If next overlaps, move base downwards so that it's
2593
- * right below next and then recheck.
3269
+ * Fitting base has not been found.
25943270 */
2595
- if (next && next->va_start < base + end) {
2596
- base = pvm_determine_end(&next, &prev, align) - end;
3271
+ if (va == NULL)
3272
+ goto overflow;
3273
+
3274
+ /*
3275
+ * If required width exceeds current VA block, move
3276
+ * base downwards and then recheck.
3277
+ */
3278
+ if (base + end > va->va_end) {
3279
+ base = pvm_determine_end_from_reverse(&va, align) - end;
25973280 term_area = area;
25983281 continue;
25993282 }
26003283
26013284 /*
2602
- * If prev overlaps, shift down next and prev and move
2603
- * base so that it's right below new next and then
2604
- * recheck.
3285
+ * If this VA does not fit, move base downwards and recheck.
26053286 */
2606
- if (prev && prev->va_end > base + start) {
2607
- next = prev;
2608
- prev = node_to_va(rb_prev(&next->rb_node));
2609
- base = pvm_determine_end(&next, &prev, align) - end;
3287
+ if (base + start < va->va_start) {
3288
+ va = node_to_va(rb_prev(&va->rb_node));
3289
+ base = pvm_determine_end_from_reverse(&va, align) - end;
26103290 term_area = area;
26113291 continue;
26123292 }
....@@ -2618,38 +3298,132 @@
26183298 area = (area + nr_vms - 1) % nr_vms;
26193299 if (area == term_area)
26203300 break;
3301
+
26213302 start = offsets[area];
26223303 end = start + sizes[area];
2623
- pvm_find_next_prev(base + end, &next, &prev);
3304
+ va = pvm_find_va_enclose_addr(base + end);
26243305 }
2625
-found:
3306
+
26263307 /* we've found a fitting base, insert all va's */
26273308 for (area = 0; area < nr_vms; area++) {
2628
- struct vmap_area *va = vas[area];
3309
+ int ret;
26293310
2630
- va->va_start = base + offsets[area];
2631
- va->va_end = va->va_start + sizes[area];
2632
- __insert_vmap_area(va);
3311
+ start = base + offsets[area];
3312
+ size = sizes[area];
3313
+
3314
+ va = pvm_find_va_enclose_addr(start);
3315
+ if (WARN_ON_ONCE(va == NULL))
3316
+ /* It is a BUG(), but trigger recovery instead. */
3317
+ goto recovery;
3318
+
3319
+ type = classify_va_fit_type(va, start, size);
3320
+ if (WARN_ON_ONCE(type == NOTHING_FIT))
3321
+ /* It is a BUG(), but trigger recovery instead. */
3322
+ goto recovery;
3323
+
3324
+ ret = adjust_va_to_fit_type(va, start, size, type);
3325
+ if (unlikely(ret))
3326
+ goto recovery;
3327
+
3328
+ /* Allocated area. */
3329
+ va = vas[area];
3330
+ va->va_start = start;
3331
+ va->va_end = start + size;
26333332 }
26343333
2635
- vmap_area_pcpu_hole = base + offsets[last_area];
3334
+ spin_unlock(&free_vmap_area_lock);
26363335
2637
- spin_unlock(&vmap_area_lock);
3336
+ /* populate the kasan shadow space */
3337
+ for (area = 0; area < nr_vms; area++) {
3338
+ if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
3339
+ goto err_free_shadow;
3340
+
3341
+ kasan_unpoison_vmalloc((void *)vas[area]->va_start,
3342
+ sizes[area]);
3343
+ }
26383344
26393345 /* insert all vm's */
2640
- for (area = 0; area < nr_vms; area++)
2641
- setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
3346
+ spin_lock(&vmap_area_lock);
3347
+ for (area = 0; area < nr_vms; area++) {
3348
+ insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
3349
+
3350
+ setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
26423351 pcpu_get_vm_areas);
3352
+ }
3353
+ spin_unlock(&vmap_area_lock);
26433354
26443355 kfree(vas);
26453356 return vms;
26463357
3358
+recovery:
3359
+ /*
3360
+ * Remove previously allocated areas. There is no
3361
+ * need in removing these areas from the busy tree,
3362
+ * because they are inserted only on the final step
3363
+ * and when pcpu_get_vm_areas() is success.
3364
+ */
3365
+ while (area--) {
3366
+ orig_start = vas[area]->va_start;
3367
+ orig_end = vas[area]->va_end;
3368
+ va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
3369
+ &free_vmap_area_list);
3370
+ if (va)
3371
+ kasan_release_vmalloc(orig_start, orig_end,
3372
+ va->va_start, va->va_end);
3373
+ vas[area] = NULL;
3374
+ }
3375
+
3376
+overflow:
3377
+ spin_unlock(&free_vmap_area_lock);
3378
+ if (!purged) {
3379
+ purge_vmap_area_lazy();
3380
+ purged = true;
3381
+
3382
+ /* Before "retry", check if we recover. */
3383
+ for (area = 0; area < nr_vms; area++) {
3384
+ if (vas[area])
3385
+ continue;
3386
+
3387
+ vas[area] = kmem_cache_zalloc(
3388
+ vmap_area_cachep, GFP_KERNEL);
3389
+ if (!vas[area])
3390
+ goto err_free;
3391
+ }
3392
+
3393
+ goto retry;
3394
+ }
3395
+
26473396 err_free:
26483397 for (area = 0; area < nr_vms; area++) {
2649
- kfree(vas[area]);
3398
+ if (vas[area])
3399
+ kmem_cache_free(vmap_area_cachep, vas[area]);
3400
+
26503401 kfree(vms[area]);
26513402 }
26523403 err_free2:
3404
+ kfree(vas);
3405
+ kfree(vms);
3406
+ return NULL;
3407
+
3408
+err_free_shadow:
3409
+ spin_lock(&free_vmap_area_lock);
3410
+ /*
3411
+ * We release all the vmalloc shadows, even the ones for regions that
3412
+ * hadn't been successfully added. This relies on kasan_release_vmalloc
3413
+ * being able to tolerate this case.
3414
+ */
3415
+ for (area = 0; area < nr_vms; area++) {
3416
+ orig_start = vas[area]->va_start;
3417
+ orig_end = vas[area]->va_end;
3418
+ va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
3419
+ &free_vmap_area_list);
3420
+ if (va)
3421
+ kasan_release_vmalloc(orig_start, orig_end,
3422
+ va->va_start, va->va_end);
3423
+ vas[area] = NULL;
3424
+ kfree(vms[area]);
3425
+ }
3426
+ spin_unlock(&free_vmap_area_lock);
26533427 kfree(vas);
26543428 kfree(vms);
26553429 return NULL;
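
The recovery, overflow and err_free paths above follow one pattern: undo the partially built reservation, purge lazily freed space exactly once, then retry from scratch. A minimal user-space sketch of that control flow, with hypothetical try_reserve()/purge_lazy() stand-ins for carving a range out of the free tree and for purge_vmap_area_lazy() (an illustration of the pattern, not the kernel code):

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

/* Hypothetical stand-ins: try_reserve() mimics carving a range out of
 * the free tree, purge_lazy() mimics purge_vmap_area_lazy(). */
static bool try_reserve(size_t size, void **out)
{
        *out = malloc(size);
        return *out != NULL;
}

static void purge_lazy(void)
{
        /* Reclaim lazily freed space; nothing to do in this sketch. */
}

/* Reserve all n regions or none, retrying once after a purge. */
static int reserve_all(void **slots, const size_t *sizes, int n)
{
        bool purged = false;
        int i;

retry:
        for (i = 0; i < n; i++) {
                if (try_reserve(sizes[i], &slots[i]))
                        continue;

                /* Roll back everything reserved so far ... */
                while (i--) {
                        free(slots[i]);
                        slots[i] = NULL;
                }

                /* ... and retry exactly once after reclaiming lazy space. */
                if (!purged) {
                        purge_lazy();
                        purged = true;
                        goto retry;
                }
                return -1;
        }
        return 0;
}

As in the kernel code, the rollback never touches the busy tree, because nothing is inserted there until every region has been reserved.
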
....@@ -2674,9 +3448,12 @@
26743448
26753449 #ifdef CONFIG_PROC_FS
26763450 static void *s_start(struct seq_file *m, loff_t *pos)
3451
+ __acquires(&vmap_purge_lock)
26773452 __acquires(&vmap_area_lock)
26783453 {
3454
+ mutex_lock(&vmap_purge_lock);
26793455 spin_lock(&vmap_area_lock);
3456
+
26803457 return seq_list_start(&vmap_area_list, *pos);
26813458 }
26823459
....@@ -2687,8 +3464,10 @@
26873464
26883465 static void s_stop(struct seq_file *m, void *p)
26893466 __releases(&vmap_area_lock)
3467
+ __releases(&vmap_purge_lock)
26903468 {
26913469 spin_unlock(&vmap_area_lock);
3470
+ mutex_unlock(&vmap_purge_lock);
26923471 }
26933472
26943473 static void show_numa_info(struct seq_file *m, struct vm_struct *v)
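
The s_start()/s_stop() changes above take vmap_purge_lock before vmap_area_lock and drop them in the reverse order, so a /proc/vmallocinfo reader sees a stable purge list alongside the busy list. A small user-space sketch of that "acquire in a fixed order, release in reverse" convention, using two pthread mutexes purely as stand-ins for the kernel's mutex/spinlock pair:

#include <pthread.h>

/* Stand-ins for vmap_purge_lock (outer) and vmap_area_lock (inner). */
static pthread_mutex_t purge_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t area_lock = PTHREAD_MUTEX_INITIALIZER;

/* Analogue of s_start(): take the outer lock first, then the inner one. */
static void walk_start(void)
{
        pthread_mutex_lock(&purge_lock);
        pthread_mutex_lock(&area_lock);
}

/* Analogue of s_stop(): release in strictly reverse order. */
static void walk_stop(void)
{
        pthread_mutex_unlock(&area_lock);
        pthread_mutex_unlock(&purge_lock);
}

Keeping every acquirer to the same order is what rules out an ABBA deadlock between the two locks.
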
....@@ -2715,6 +3494,22 @@
27153494 }
27163495 }
27173496
3497
+static void show_purge_info(struct seq_file *m)
3498
+{
3499
+ struct llist_node *head;
3500
+ struct vmap_area *va;
3501
+
3502
+ head = READ_ONCE(vmap_purge_list.first);
3503
+ if (head == NULL)
3504
+ return;
3505
+
3506
+ llist_for_each_entry(va, head, purge_list) {
3507
+ seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
3508
+ (void *)va->va_start, (void *)va->va_end,
3509
+ va->va_end - va->va_start);
3510
+ }
3511
+}
3512
+
27183513 static int s_show(struct seq_file *m, void *p)
27193514 {
27203515 struct vmap_area *va;
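
show_purge_info() above snapshots vmap_purge_list.first with READ_ONCE() and then walks the chain with llist_for_each_entry(); concurrent purging (and freeing) of those entries is excluded here by the vmap_purge_lock taken in s_start(). A rough user-space analogue of that "read the head once, then follow the links" idea, using C11 atomics rather than the kernel's llist API (the struct and names are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative analogue of a vmap_area queued for lazy purge. */
struct area {
        unsigned long start, end;
        struct area *next;              /* llist-style single link */
};

/* Head of the list; producers only ever push new nodes at the head. */
static _Atomic(struct area *) purge_list_head;

/* Like show_purge_info(): take one snapshot of the head, then walk it.
 * New nodes are pushed in front of the snapshot, so the chain we
 * captured stays a valid list while we print it (provided, as in the
 * kernel, nothing frees the nodes underneath us). */
static void dump_unpurged(FILE *m)
{
        struct area *a = atomic_load(&purge_list_head);

        for (; a; a = a->next)
                fprintf(m, "0x%lx-0x%lx %7ld unpurged vm_area\n",
                        a->start, a->end, (long)(a->end - a->start));
}
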
....@@ -2723,14 +3518,13 @@
27233518 va = list_entry(p, struct vmap_area, list);
27243519
27253520 /*
2726
- * s_show can encounter race with remove_vm_area, !VM_VM_AREA on
2727
- * behalf of vmap area is being tear down or vm_map_ram allocation.
3521
+ * s_show can race with remove_vm_area(): !vm means the vmap
3522
+ * area is being torn down or is a vm_map_ram allocation.
27283523 */
2729
- if (!(va->flags & VM_VM_AREA)) {
2730
- seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
3524
+ if (!va->vm) {
3525
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
27313526 (void *)va->va_start, (void *)va->va_end,
2732
- va->va_end - va->va_start,
2733
- va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
3527
+ va->va_end - va->va_start);
27343528
27353529 return 0;
27363530 }
....@@ -2761,11 +3555,25 @@
27613555 if (v->flags & VM_USERMAP)
27623556 seq_puts(m, " user");
27633557
3558
+ if (v->flags & VM_DMA_COHERENT)
3559
+ seq_puts(m, " dma-coherent");
3560
+
27643561 if (is_vmalloc_addr(v->pages))
27653562 seq_puts(m, " vpages");
27663563
27673564 show_numa_info(m, v);
3565
+ trace_android_vh_show_stack_hash(m, v);
27683566 seq_putc(m, '\n');
3567
+
3568
+ /*
3569
+ * As a final step, dump "unpurged" areas. Note that
3570
+ * the entire "/proc/vmallocinfo" output will not be
3571
+ * address sorted, because the purge list is not
3572
+ * sorted.
3573
+ */
3574
+ if (list_is_last(&va->list, &vmap_area_list))
3575
+ show_purge_info(m);
3576
+
27693577 return 0;
27703578 }
27713579
....@@ -2789,4 +3597,3 @@
27893597 module_init(proc_vmalloc_init);
27903598
27913599 #endif
2792
-
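
For context, the seq_file hooks in this file back /proc/vmallocinfo, which can be read with ordinary file I/O from user space (the file is usually root-only, and the %pK-formatted addresses may be zeroed or hashed for unprivileged readers depending on kptr_restrict). A minimal reader:

#include <stdio.h>

/* Dump /proc/vmallocinfo line by line; requires CONFIG_PROC_FS. */
int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/vmallocinfo", "r");

        if (!f) {
                perror("fopen /proc/vmallocinfo");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}
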