--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -1,14 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/mm.h>
 #include <linux/mmzone.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/page_ext.h>
 #include <linux/memory.h>
 #include <linux/vmalloc.h>
 #include <linux/kmemleak.h>
 #include <linux/page_owner.h>
 #include <linux/page_idle.h>
-
+#include <linux/rcupdate.h>
 /*
  * struct page extension
  *
@@ -58,20 +58,35 @@
  * can utilize this callback to initialize the state of it correctly.
  */
 
-static struct page_ext_operations *page_ext_ops[] = {
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	&debug_guardpage_ops,
+#ifdef CONFIG_SPARSEMEM
+#define PAGE_EXT_INVALID (0x1)
 #endif
+
+#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
+static bool need_page_idle(void)
+{
+	return true;
+}
+struct page_ext_operations page_idle_ops = {
+	.need = need_page_idle,
+};
+#endif
+
+static struct page_ext_operations *page_ext_ops[] = {
 #ifdef CONFIG_PAGE_OWNER
 	&page_owner_ops,
 #endif
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
 	&page_idle_ops,
+#endif
+#ifdef CONFIG_PAGE_PINNER
+	&page_pinner_ops,
 #endif
 };
 
+unsigned long page_ext_size = sizeof(struct page_ext);
+
 static unsigned long total_usage;
-static unsigned long extra_mem;
 
 static bool __init invoke_need_callbacks(void)
 {
@@ -81,9 +96,8 @@
 
	for (i = 0; i < entries; i++) {
		if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
-			page_ext_ops[i]->offset = sizeof(struct page_ext) +
-						extra_mem;
-			extra_mem += page_ext_ops[i]->size;
+			page_ext_ops[i]->offset = page_ext_size;
+			page_ext_size += page_ext_ops[i]->size;
			need = true;
		}
	}
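
This bookkeeping is the whole client contract: each entry in `page_ext_ops[]` whose `->need()` callback returns true is assigned the current running total as its `->offset`, and `page_ext_size` grows by its `->size`, so every client's payload occupies a fixed slice after `struct page_ext` in each per-page record. A minimal sketch of a hypothetical client (the `my_tracker` names are illustrative, not part of this patch):

```c
#include <linux/page_ext.h>

/* Hypothetical per-page payload appended after struct page_ext. */
struct my_tracker_data {
	unsigned long stamp;
};

static bool need_my_tracker(void)
{
	return true;	/* could test a boot parameter instead */
}

/* Would be listed in page_ext_ops[] alongside page_owner_ops above. */
struct page_ext_operations my_tracker_ops = {
	.size = sizeof(struct my_tracker_data),
	.need = need_my_tracker,
};

/* The payload lives ->offset bytes past the page's struct page_ext;
 * this mirrors how page_owner locates its data via page_owner_ops.offset. */
static struct my_tracker_data *get_my_tracker_data(struct page_ext *page_ext)
{
	return (void *)page_ext + my_tracker_ops.offset;
}
```
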
@@ -102,17 +116,61 @@
	}
 }
 
-static unsigned long get_entry_size(void)
+#ifndef CONFIG_SPARSEMEM
+void __init page_ext_init_flatmem_late(void)
 {
-	return sizeof(struct page_ext) + extra_mem;
+	invoke_init_callbacks();
 }
+#endif
 
 static inline struct page_ext *get_entry(void *base, unsigned long index)
 {
-	return base + get_entry_size() * index;
+	return base + page_ext_size * index;
 }
 
-#if !defined(CONFIG_SPARSEMEM)
+/**
+ * page_ext_get() - Get the extended information for a page.
+ * @page: The page we're interested in.
+ *
+ * Ensures that the page_ext will remain valid until page_ext_put()
+ * is called.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ * Context: Any context.  Caller may not sleep until they have called
+ * page_ext_put().
+ */
+struct page_ext *page_ext_get(struct page *page)
+{
+	struct page_ext *page_ext;
+
+	rcu_read_lock();
+	page_ext = lookup_page_ext(page);
+	if (!page_ext) {
+		rcu_read_unlock();
+		return NULL;
+	}
+
+	return page_ext;
+}
+
+/**
+ * page_ext_put() - Working with page extended information is done.
+ * @page_ext: Page extended information received from page_ext_get().
+ *
+ * The page extended information of the page may not be valid after this
+ * function is called.
+ *
+ * Return: None.
+ * Context: Any context in which the corresponding page_ext_get() was called.
+ */
+void page_ext_put(struct page_ext *page_ext)
+{
+	if (unlikely(!page_ext))
+		return;
+
+	rcu_read_unlock();
+}
+#ifndef CONFIG_SPARSEMEM
 
 
 void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
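
This get/put pair is the API callers are expected to use: page_ext_get() enters an RCU read-side critical section and only leaves it on failure, so a non-NULL result pins the underlying page_ext table until page_ext_put(). A hedged usage sketch, reusing the hypothetical `my_tracker` client from earlier:

```c
#include <linux/jiffies.h>
#include <linux/page_ext.h>

static void my_tracker_stamp_page(struct page *page)
{
	struct page_ext *page_ext = page_ext_get(page);

	if (!page_ext)
		return;	/* no table for this page (early boot, offlined section) */

	/* We are inside an RCU read-side section: no sleeping until put. */
	get_my_tracker_data(page_ext)->stamp = jiffies;

	page_ext_put(page_ext);
}
```
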
@@ -126,6 +184,7 @@
	unsigned long index;
	struct page_ext *base;
 
+	WARN_ON_ONCE(!rcu_read_lock_held());
	base = NODE_DATA(page_to_nid(page))->node_page_ext;
	/*
	 * The sanity checks the page allocator does upon freeing a
@@ -139,6 +198,7 @@
					MAX_ORDER_NR_PAGES);
	return get_entry(base, index);
 }
+EXPORT_SYMBOL_GPL(lookup_page_ext);
 
 static int __init alloc_node_page_ext(int nid)
 {
@@ -159,11 +219,11 @@
	    !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
		nr_pages += MAX_ORDER_NR_PAGES;
 
-	table_size = get_entry_size() * nr_pages;
+	table_size = page_ext_size * nr_pages;
 
-	base = memblock_virt_alloc_try_nid_nopanic(
+	base = memblock_alloc_try_nid(
			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
-			BOOTMEM_ALLOC_ACCESSIBLE, nid);
+			MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (!base)
		return -ENOMEM;
	NODE_DATA(nid)->node_page_ext = base;
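
This hunk tracks the retirement of the bootmem allocator: `<linux/bootmem.h>` and its memblock_virt_alloc_try_nid_nopanic() wrapper were folded into the memblock API, and the replacement likewise returns NULL rather than panicking on failure, which is why the `!base` check is unchanged. For reference, the signature being called here is, as I understand the memblock API of kernels in this vintage:

```c
/* Boot-time allocation of 'size' zeroed bytes, 'align'-aligned, taken
 * from the physical range [min_addr, max_addr), preferring node 'nid';
 * returns a virtual address, or NULL on failure (no panic). */
void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
			     phys_addr_t min_addr, phys_addr_t max_addr,
			     int nid);
```
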
@@ -185,7 +245,6 @@
		goto fail;
	}
	pr_info("allocated %ld bytes of page_ext\n", total_usage);
-	invoke_init_callbacks();
	return;
 
 fail:
@@ -194,21 +253,29 @@
 }
 
 #else /* CONFIG_FLAT_NODE_MEM_MAP */
+static bool page_ext_invalid(struct page_ext *page_ext)
+{
+	return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
+}
 
 struct page_ext *lookup_page_ext(const struct page *page)
 {
	unsigned long pfn = page_to_pfn(page);
	struct mem_section *section = __pfn_to_section(pfn);
+	struct page_ext *page_ext = READ_ONCE(section->page_ext);
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_ext arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
-	if (!section->page_ext)
+	if (page_ext_invalid(page_ext))
		return NULL;
-	return get_entry(section->page_ext, pfn);
+	return get_entry(page_ext, pfn);
 }
+EXPORT_SYMBOL_GPL(lookup_page_ext);
 
 static void *__meminit alloc_page_ext(size_t size, int nid)
 {
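
page_ext_invalid() relies on a pointer-tagging trick: the per-section arrays are at least page-aligned, so bit 0 of a valid pointer is always clear, and the offline path can set that bit to mark an entry dead while keeping it distinguishable from a plain NULL during rollback. A self-contained userspace illustration of the same trick (not code from this patch):

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_EXT_INVALID (0x1)

int main(void)
{
	static long table[8];	/* stand-in for an aligned page_ext array */
	void *valid = table;

	/* Tag: still non-NULL, but now fails the validity check. */
	void *tagged = (void *)((uintptr_t)valid | PAGE_EXT_INVALID);

	printf("%d\n", ((uintptr_t)valid  & PAGE_EXT_INVALID) != 0);	/* 0 */
	printf("%d\n", ((uintptr_t)tagged & PAGE_EXT_INVALID) != 0);	/* 1 */

	/* Untag to recover the real base for the eventual free. */
	void *restored = (void *)((uintptr_t)tagged & ~(uintptr_t)PAGE_EXT_INVALID);
	printf("%d\n", restored == valid);				/* 1 */
	return 0;
}
```
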
@@ -237,7 +304,7 @@
	if (section->page_ext)
		return 0;
 
-	table_size = get_entry_size() * PAGES_PER_SECTION;
+	table_size = page_ext_size * PAGES_PER_SECTION;
	base = alloc_page_ext(table_size, nid);
 
	/*
@@ -257,7 +324,7 @@
	 * we need to apply a mask.
	 */
	pfn &= PAGE_SECTION_MASK;
-	section->page_ext = (void *)base - get_entry_size() * pfn;
+	section->page_ext = (void *)base - page_ext_size * pfn;
	total_usage += table_size;
	return 0;
 }
@@ -270,7 +337,7 @@
	struct page *page = virt_to_page(addr);
	size_t table_size;
 
-	table_size = get_entry_size() * PAGES_PER_SECTION;
+	table_size = page_ext_size * PAGES_PER_SECTION;
 
	BUG_ON(PageReserved(page));
	kmemleak_free(addr);
@@ -286,9 +353,30 @@
	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_ext)
		return;
-	base = get_entry(ms->page_ext, pfn);
+
+	base = READ_ONCE(ms->page_ext);
+	/*
+	 * page_ext here can be valid while doing the roll back
+	 * operation in online_page_ext().
+	 */
+	if (page_ext_invalid(base))
+		base = (void *)base - PAGE_EXT_INVALID;
+	WRITE_ONCE(ms->page_ext, NULL);
+
+	base = get_entry(base, pfn);
	free_page_ext(base);
-	ms->page_ext = NULL;
+}
+
+static void __invalidate_page_ext(unsigned long pfn)
+{
+	struct mem_section *ms;
+	void *val;
+
+	ms = __pfn_to_section(pfn);
+	if (!ms || !ms->page_ext)
+		return;
+	val = (void *)ms->page_ext + PAGE_EXT_INVALID;
+	WRITE_ONCE(ms->page_ext, val);
 }
 
 static int __meminit online_page_ext(unsigned long start_pfn,
@@ -301,7 +389,7 @@
	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
 
-	if (nid == -1) {
+	if (nid == NUMA_NO_NODE) {
		/*
		 * In this case, "nid" already exists and contains valid memory.
		 * "start_pfn" passed to us is a pfn which is an arg for
@@ -311,11 +399,8 @@
		VM_BUG_ON(!node_state(nid, N_ONLINE));
	}
 
-	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
-		if (!pfn_present(pfn))
-			continue;
+	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
		fail = init_section_page_ext(pfn, nid);
-	}
	if (!fail)
		return 0;
 
@@ -334,6 +419,20 @@
	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
 
+	/*
+	 * Freeing of page_ext is done in 3 steps to avoid
+	 * use-after-free of it:
+	 * 1) Traverse all the sections and mark their page_ext
+	 *    as invalid.
+	 * 2) Wait for all the existing users of page_ext who
+	 *    started before invalidation to finish.
+	 * 3) Free the page_ext.
+	 */
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+		__invalidate_page_ext(pfn);
+
+	synchronize_rcu();
+
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_ext(pfn);
	return 0;
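
The three numbered steps in that comment are the standard RCU unpublish/wait/reclaim sequence, with the tagged pointer standing in for "unpublished" so that the rollback path in online_page_ext() can still find the allocation. Distilled into a generic kernel-style helper (a sketch assuming a single pointer slot; not code from this patch):

```c
#include <linux/rcupdate.h>
#include <linux/slab.h>

#define INVALID_BIT 0x1UL	/* plays the role of PAGE_EXT_INVALID */

/* Hypothetical helper: tear down an RCU-published allocation that
 * readers look up with READ_ONCE() under rcu_read_lock(). */
static void unpublish_and_free(void **slot)
{
	void *p = READ_ONCE(*slot);

	if (!p)
		return;

	/* 1) Mark the pointer invalid: new readers now bail out. */
	WRITE_ONCE(*slot, (void *)((unsigned long)p | INVALID_BIT));

	/* 2) Wait for every reader that entered before step 1. */
	synchronize_rcu();

	/* 3) No reader can still hold the pointer; reclaim it. */
	WRITE_ONCE(*slot, NULL);
	kfree(p);
}
```
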
|---|