```diff
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -1,14 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/mm.h>
 #include <linux/mmzone.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/page_ext.h>
 #include <linux/memory.h>
 #include <linux/vmalloc.h>
 #include <linux/kmemleak.h>
 #include <linux/page_owner.h>
 #include <linux/page_idle.h>
-
+#include <linux/rcupdate.h>
 /*
  * struct page extension
  *
```
```diff
@@ -58,20 +58,35 @@
  * can utilize this callback to initialize the state of it correctly.
  */
 
-static struct page_ext_operations *page_ext_ops[] = {
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	&debug_guardpage_ops,
+#ifdef CONFIG_SPARSEMEM
+#define PAGE_EXT_INVALID	(0x1)
 #endif
+
+#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
+static bool need_page_idle(void)
+{
+	return true;
+}
+struct page_ext_operations page_idle_ops = {
+	.need = need_page_idle,
+};
+#endif
+
+static struct page_ext_operations *page_ext_ops[] = {
 #ifdef CONFIG_PAGE_OWNER
 	&page_owner_ops,
 #endif
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
 	&page_idle_ops,
+#endif
+#ifdef CONFIG_PAGE_PINNER
+	&page_pinner_ops,
 #endif
 };
 
+unsigned long page_ext_size = sizeof(struct page_ext);
+
 static unsigned long total_usage;
-static unsigned long extra_mem;
 
 static bool __init invoke_need_callbacks(void)
 {
@@ -81,9 +96,8 @@
 
	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
-			page_ext_ops[i]->offset = sizeof(struct page_ext) +
-						extra_mem;
-			extra_mem += page_ext_ops[i]->size;
+			page_ext_ops[i]->offset = page_ext_size;
+			page_ext_size += page_ext_ops[i]->size;
			need = true;
		}
	}
```
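The new, exported page_ext_size replaces the old sizeof(struct page_ext) + extra_mem bookkeeping: it starts at sizeof(struct page_ext) and, for every client whose need() callback returns true, records that client's offset and grows by its size. Each per-page entry is therefore page_ext_size bytes, with client payloads appended after the struct. A minimal sketch of how a client consumes this, assuming hypothetical names (foo_ops, struct foo_data, get_foo_data); page_owner follows the same pattern:

```c
#include <linux/page_ext.h>

/* Hypothetical per-page payload appended after struct page_ext. */
struct foo_data {
	unsigned long stamp;
};

static bool need_foo(void)
{
	return true;	/* claim space only when the feature is enabled */
}

struct page_ext_operations foo_ops = {
	.size = sizeof(struct foo_data),	/* added to page_ext_size */
	.need = need_foo,			/* ->offset filled in at init */
};

/* Entries are page_ext_size bytes apart, so client data is reached by
 * byte offset rather than struct-array indexing: */
static inline struct foo_data *get_foo_data(struct page_ext *page_ext)
{
	return (void *)page_ext + foo_ops.offset;
}
```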
```diff
@@ -102,17 +116,61 @@
	}
 }
 
-static unsigned long get_entry_size(void)
+#ifndef CONFIG_SPARSEMEM
+void __init page_ext_init_flatmem_late(void)
 {
-	return sizeof(struct page_ext) + extra_mem;
+	invoke_init_callbacks();
 }
+#endif
 
 static inline struct page_ext *get_entry(void *base, unsigned long index)
 {
-	return base + get_entry_size() * index;
+	return base + page_ext_size * index;
 }
 
-#if !defined(CONFIG_SPARSEMEM)
+/**
+ * page_ext_get() - Get the extended information for a page.
+ * @page: The page we're interested in.
+ *
+ * Ensures that the page_ext will remain valid until page_ext_put()
+ * is called.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ * Context: Any context.  Caller may not sleep until they have called
+ *	    page_ext_put().
+ */
+struct page_ext *page_ext_get(struct page *page)
+{
+	struct page_ext *page_ext;
+
+	rcu_read_lock();
+	page_ext = lookup_page_ext(page);
+	if (!page_ext) {
+		rcu_read_unlock();
+		return NULL;
+	}
+
+	return page_ext;
+}
+
+/**
+ * page_ext_put() - Working with page extended information is done.
+ * @page_ext: Page extended information received from page_ext_get().
+ *
+ * The page extended information of the page may not be valid after this
+ * function is called.
+ *
+ * Return: None.
+ * Context: Any context with corresponding page_ext_get() is called.
+ */
+void page_ext_put(struct page_ext *page_ext)
+{
+	if (unlikely(!page_ext))
+		return;
+
+	rcu_read_unlock();
+}
+#ifndef CONFIG_SPARSEMEM
 
 
 void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
```
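Two changes land in this hunk. First, on flatmem builds the init callbacks move out of the early allocation path into a new page_ext_init_flatmem_late() hook, invoked later in boot so client init callbacks run with the allocator usable (the matching removal of invoke_init_callbacks() from the allocation path appears further down). Second, page_ext_get()/page_ext_put() wrap lookup_page_ext() in an RCU read-side critical section, so a section's page_ext table cannot be freed while a caller is still dereferencing it. The intended caller pattern is roughly this sketch (the body comment is illustrative):

```c
struct page_ext *page_ext = page_ext_get(page);

if (page_ext) {
	/* Read or update extension data here; the RCU read lock is
	 * held, so nothing in this window may sleep. */
	page_ext_put(page_ext);
}
```

Note the asymmetry: page_ext_get() returns with the RCU read lock held on success, and page_ext_put() is what drops it, which is why a NULL result needs no put.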
```diff
@@ -126,6 +184,7 @@
	unsigned long index;
	struct page_ext *base;
 
+	WARN_ON_ONCE(!rcu_read_lock_held());
	base = NODE_DATA(page_to_nid(page))->node_page_ext;
	/*
	 * The sanity checks the page allocator does upon freeing a
@@ -139,6 +198,7 @@
					MAX_ORDER_NR_PAGES);
	return get_entry(base, index);
 }
+EXPORT_SYMBOL_GPL(lookup_page_ext);
 
 static int __init alloc_node_page_ext(int nid)
 {
```
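lookup_page_ext() now asserts that it runs under the RCU read lock; rcu_read_lock_held() is only a real check under CONFIG_PROVE_RCU/lockdep and conservatively returns 1 otherwise, so the warning costs nothing in production builds. The function is also exported for GPL modules. Any caller that bypasses page_ext_get() must therefore provide its own read-side section, roughly (a sketch, not code from this patch):

```c
struct page_ext *page_ext;

rcu_read_lock();
page_ext = lookup_page_ext(page);
if (page_ext) {
	/* ... use page_ext; do not sleep ... */
}
rcu_read_unlock();
```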
```diff
@@ -159,11 +219,11 @@
		!IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
		nr_pages += MAX_ORDER_NR_PAGES;
 
-	table_size = get_entry_size() * nr_pages;
+	table_size = page_ext_size * nr_pages;
 
-	base = memblock_virt_alloc_try_nid_nopanic(
+	base = memblock_alloc_try_nid(
			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
-			BOOTMEM_ALLOC_ACCESSIBLE, nid);
+			MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (!base)
		return -ENOMEM;
	NODE_DATA(nid)->node_page_ext = base;
```
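This is part of the kernel-wide bootmem removal: memblock_virt_alloc_try_nid_nopanic() becomes memblock_alloc_try_nid(), which keeps the behavior needed here, returning NULL on failure instead of panicking (hence the unchanged -ENOMEM path) and zeroing the allocation. For reference, the signature as it stood around the time of this change (an assumption worth double-checking against include/linux/memblock.h for your tree):

```c
void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
			     phys_addr_t min_addr, phys_addr_t max_addr,
			     int nid);
```

Passing MEMBLOCK_ALLOC_ACCESSIBLE as max_addr means "no upper bound beyond what the kernel can address", and min_addr = __pa(MAX_DMA_ADDRESS) keeps the table from eating scarce DMA-zone memory.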
```diff
@@ -185,7 +245,6 @@
		goto fail;
	}
	pr_info("allocated %ld bytes of page_ext\n", total_usage);
-	invoke_init_callbacks();
	return;
 
 fail:
```
```diff
@@ -194,21 +253,29 @@
 }
 
 #else /* CONFIG_FLAT_NODE_MEM_MAP */
+static bool page_ext_invalid(struct page_ext *page_ext)
+{
+	return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
+}
 
 struct page_ext *lookup_page_ext(const struct page *page)
 {
	unsigned long pfn = page_to_pfn(page);
	struct mem_section *section = __pfn_to_section(pfn);
+	struct page_ext *page_ext = READ_ONCE(section->page_ext);
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_ext arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
-	if (!section->page_ext)
+	if (page_ext_invalid(page_ext))
		return NULL;
-	return get_entry(section->page_ext, pfn);
+	return get_entry(page_ext, pfn);
 }
+EXPORT_SYMBOL_GPL(lookup_page_ext);
 
 static void *__meminit alloc_page_ext(size_t size, int nid)
 {
```
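PAGE_EXT_INVALID is a classic low-bit pointer tag: the page_ext tables are page-aligned allocations, so bit 0 of a valid section->page_ext is always clear and can carry a "being torn down" flag without extra storage. lookup_page_ext() treats a tagged pointer like a missing one, and the teardown path below sets and strips the tag with plain pointer arithmetic. A self-contained userspace model of the trick (illustrative names, not from this patch):

```c
#include <assert.h>
#include <stdint.h>

#define EXT_INVALID	0x1UL

/* Mirrors page_ext_invalid(): NULL or tagged both mean "don't use". */
static int ext_invalid(const void *p)
{
	return !p || ((uintptr_t)p & EXT_INVALID);
}

int main(void)
{
	static long table[8];	/* natural alignment leaves bit 0 free */
	void *valid = table;
	void *tagged = (char *)table + EXT_INVALID;

	assert(!ext_invalid(valid));
	assert(ext_invalid(tagged));
	/* Subtracting the tag recovers the original pointer: */
	assert((char *)tagged - EXT_INVALID == (char *)valid);
	return 0;
}
```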
```diff
@@ -237,7 +304,7 @@
	if (section->page_ext)
		return 0;
 
-	table_size = get_entry_size() * PAGES_PER_SECTION;
+	table_size = page_ext_size * PAGES_PER_SECTION;
	base = alloc_page_ext(table_size, nid);
 
	/*
@@ -257,7 +324,7 @@
	 * we need to apply a mask.
	 */
	pfn &= PAGE_SECTION_MASK;
-	section->page_ext = (void *)base - get_entry_size() * pfn;
+	section->page_ext = (void *)base - page_ext_size * pfn;
	total_usage += table_size;
	return 0;
 }
```
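The last assignment is worth pausing on: section->page_ext does not point at the allocated table itself but at table minus page_ext_size times the section's first pfn (pfn &= PAGE_SECTION_MASK computes that first pfn). Storing this biased base lets get_entry(section->page_ext, pfn) take a raw pfn without subtracting the section start on every lookup. A userspace model of the arithmetic, with toy numbers and uintptr_t so the out-of-range intermediate value stays well-defined:

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uintptr_t page_ext_size = 24;	/* example entry stride */
	const uintptr_t section_start = 0x8000;	/* section's first pfn */
	static char table[24 * 8];		/* entries for 8 pages */

	/* What init_section_page_ext() stores: the base biased by the
	 * section's first pfn. */
	uintptr_t biased = (uintptr_t)table - page_ext_size * section_start;

	/* What get_entry() computes for a raw pfn inside the section:
	 * it lands on entry (pfn - section_start) of the real table. */
	uintptr_t pfn = section_start + 3;
	assert(biased + page_ext_size * pfn ==
	       (uintptr_t)table + page_ext_size * 3);
	return 0;
}
```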
```diff
@@ -270,7 +337,7 @@
	struct page *page = virt_to_page(addr);
	size_t table_size;
 
-	table_size = get_entry_size() * PAGES_PER_SECTION;
+	table_size = page_ext_size * PAGES_PER_SECTION;
 
	BUG_ON(PageReserved(page));
	kmemleak_free(addr);
@@ -286,9 +353,30 @@
	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_ext)
		return;
-	base = get_entry(ms->page_ext, pfn);
+
+	base = READ_ONCE(ms->page_ext);
+	/*
+	 * page_ext here can be valid while doing the roll back
+	 * operation in online_page_ext().
+	 */
+	if (page_ext_invalid(base))
+		base = (void *)base - PAGE_EXT_INVALID;
+	WRITE_ONCE(ms->page_ext, NULL);
+
+	base = get_entry(base, pfn);
	free_page_ext(base);
-	ms->page_ext = NULL;
+}
+
+static void __invalidate_page_ext(unsigned long pfn)
+{
+	struct mem_section *ms;
+	void *val;
+
+	ms = __pfn_to_section(pfn);
+	if (!ms || !ms->page_ext)
+		return;
+	val = (void *)ms->page_ext + PAGE_EXT_INVALID;
+	WRITE_ONCE(ms->page_ext, val);
 }
 
 static int __meminit online_page_ext(unsigned long start_pfn,
@@ -301,7 +389,7 @@
	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
 
-	if (nid == -1) {
+	if (nid == NUMA_NO_NODE) {
		/*
		 * In this case, "nid" already exists and contains valid memory.
		 * "start_pfn" passed to us is a pfn which is an arg for
@@ -311,11 +399,8 @@
		VM_BUG_ON(!node_state(nid, N_ONLINE));
	}
 
-	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
-		if (!pfn_present(pfn))
-			continue;
+	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
		fail = init_section_page_ext(pfn, nid);
-	}
	if (!fail)
		return 0;
 
@@ -334,6 +419,20 @@
	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
 
+	/*
+	 * Freeing of page_ext is done in 3 steps to avoid
+	 * use-after-free of it:
+	 * 1) Traverse all the sections and mark their page_ext
+	 *    as invalid.
+	 * 2) Wait for all the existing users of page_ext who
+	 *    started before invalidation to finish.
+	 * 3) Free the page_ext.
+	 */
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+		__invalidate_page_ext(pfn);
+
+	synchronize_rcu();
+
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_ext(pfn);
	return 0;
```
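The offline path now implements the invalidate / wait / free sequence the new comment spells out, and it is what makes the RCU protection in page_ext_get() and lookup_page_ext() meaningful: once every section's pointer is tagged invalid, synchronize_rcu() waits out every reader that could have fetched the old pointer, so step 3 cannot free a table someone is still using. An annotated timeline of reader R racing the offline path W, distilled from this patch:

```c
/*
 *  R: rcu_read_lock()
 *  R: page_ext = READ_ONCE(ms->page_ext)	<- fetches a live table
 *  W: WRITE_ONCE(ms->page_ext, tagged)		<- step 1: new readers bail
 *  W: synchronize_rcu()			<- step 2: blocks, R is inside
 *						   its read-side section
 *  R: ... uses page_ext ...
 *  R: rcu_read_unlock()
 *  W: synchronize_rcu() returns
 *  W: free_page_ext(...)			<- step 3: no reader left
 *
 * A reader that starts after step 1 sees the tag, gets NULL from
 * lookup_page_ext() and never touches the table at all.
 */
```

Two incidental cleanups ride along in these hunks: the magic nid == -1 becomes the self-documenting NUMA_NO_NODE, and the per-section pfn_present() check disappears from online_page_ext(), presumably because the hot-added range's sections are all present by the time it runs.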