2024-02-19 151fecfb72a0d602dfe79790602ef64b4e241574
--- a/kernel/mm/page_ext.c
+++ b/kernel/mm/page_ext.c
@@ -1,14 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/mm.h>
 #include <linux/mmzone.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/page_ext.h>
 #include <linux/memory.h>
 #include <linux/vmalloc.h>
 #include <linux/kmemleak.h>
 #include <linux/page_owner.h>
 #include <linux/page_idle.h>
-
+#include <linux/rcupdate.h>
 /*
  * struct page extension
  *
@@ -58,20 +58,35 @@
  * can utilize this callback to initialize the state of it correctly.
  */
 
-static struct page_ext_operations *page_ext_ops[] = {
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	&debug_guardpage_ops,
+#ifdef CONFIG_SPARSEMEM
+#define PAGE_EXT_INVALID	(0x1)
 #endif
+
+#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
+static bool need_page_idle(void)
+{
+	return true;
+}
+struct page_ext_operations page_idle_ops = {
+	.need = need_page_idle,
+};
+#endif
+
+static struct page_ext_operations *page_ext_ops[] = {
 #ifdef CONFIG_PAGE_OWNER
 	&page_owner_ops,
 #endif
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
 	&page_idle_ops,
+#endif
+#ifdef CONFIG_PAGE_PINNER
+	&page_pinner_ops,
 #endif
 };
 
+unsigned long page_ext_size = sizeof(struct page_ext);
+
 static unsigned long total_usage;
-static unsigned long extra_mem;
 
 static bool __init invoke_need_callbacks(void)
 {
@@ -81,9 +96,8 @@
 
 	for (i = 0; i < entries; i++) {
 		if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
-			page_ext_ops[i]->offset = sizeof(struct page_ext) +
-						extra_mem;
-			extra_mem += page_ext_ops[i]->size;
+			page_ext_ops[i]->offset = page_ext_size;
+			page_ext_size += page_ext_ops[i]->size;
 			need = true;
 		}
 	}
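
The loop above lays out each enabled client's private data back to back after struct page_ext: every client whose ->need() returns true gets the current page_ext_size as its ->offset, and page_ext_size then grows by its ->size. A minimal client sketch follows; the my_ext names are hypothetical and not part of this patch, shown only to illustrate the offset/size contract.

	/* Hypothetical page_ext client, for illustration only. */
	struct my_ext {
		unsigned long owner_pid;
	};

	static bool need_my_ext(void)
	{
		return true;	/* ask for a slot at boot */
	}

	struct page_ext_operations my_ext_ops = {
		.size = sizeof(struct my_ext),	/* added to page_ext_size by invoke_need_callbacks() */
		.need = need_my_ext,
	};

	/* The slot lives at the assigned offset, right after struct page_ext. */
	static struct my_ext *get_my_ext(struct page_ext *page_ext)
	{
		return (void *)page_ext + my_ext_ops.offset;
	}

A real client would also have to be listed in the page_ext_ops[] array above so the boot-time callbacks run for it.
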
@@ -102,17 +116,61 @@
 	}
 }
 
-static unsigned long get_entry_size(void)
+#ifndef CONFIG_SPARSEMEM
+void __init page_ext_init_flatmem_late(void)
 {
-	return sizeof(struct page_ext) + extra_mem;
+	invoke_init_callbacks();
 }
+#endif
 
 static inline struct page_ext *get_entry(void *base, unsigned long index)
 {
-	return base + get_entry_size() * index;
+	return base + page_ext_size * index;
 }
 
-#if !defined(CONFIG_SPARSEMEM)
+/**
+ * page_ext_get() - Get the extended information for a page.
+ * @page: The page we're interested in.
+ *
+ * Ensures that the page_ext will remain valid until page_ext_put()
+ * is called.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ * Context: Any context.  Caller may not sleep until they have called
+ * page_ext_put().
+ */
+struct page_ext *page_ext_get(struct page *page)
+{
+	struct page_ext *page_ext;
+
+	rcu_read_lock();
+	page_ext = lookup_page_ext(page);
+	if (!page_ext) {
+		rcu_read_unlock();
+		return NULL;
+	}
+
+	return page_ext;
+}
+
+/**
+ * page_ext_put() - Working with page extended information is done.
+ * @page_ext: Page extended information received from page_ext_get().
+ *
+ * The page extended information of the page may not be valid after this
+ * function is called.
+ *
+ * Return: None.
+ * Context: Any context with corresponding page_ext_get() is called.
+ */
+void page_ext_put(struct page_ext *page_ext)
+{
+	if (unlikely(!page_ext))
+		return;
+
+	rcu_read_unlock();
+}
+#ifndef CONFIG_SPARSEMEM
 
 
 void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
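
The new page_ext_get()/page_ext_put() pair pins the extension data under rcu_read_lock() so it cannot be freed by memory offlining while in use. A hedged caller sketch, reusing the hypothetical get_my_ext() helper from the note above (neither is part of this patch):

	static unsigned long read_owner_pid(struct page *page)
	{
		struct page_ext *page_ext;
		unsigned long pid;

		page_ext = page_ext_get(page);	/* takes rcu_read_lock() on success */
		if (!page_ext)
			return 0;		/* page_ext not allocated for this page */

		pid = get_my_ext(page_ext)->owner_pid;
		page_ext_put(page_ext);		/* drops rcu_read_lock() */
		return pid;
	}

Because the reference is only an RCU read-side critical section, the caller must not sleep between the get and the put, as the kernel-doc above states.
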
@@ -126,6 +184,7 @@
 	unsigned long index;
 	struct page_ext *base;
 
+	WARN_ON_ONCE(!rcu_read_lock_held());
 	base = NODE_DATA(page_to_nid(page))->node_page_ext;
 	/*
 	 * The sanity checks the page allocator does upon freeing a
@@ -139,6 +198,7 @@
 					MAX_ORDER_NR_PAGES);
 	return get_entry(base, index);
 }
+EXPORT_SYMBOL_GPL(lookup_page_ext);
 
 static int __init alloc_node_page_ext(int nid)
 {
@@ -159,11 +219,11 @@
 		!IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
 		nr_pages += MAX_ORDER_NR_PAGES;
 
-	table_size = get_entry_size() * nr_pages;
+	table_size = page_ext_size * nr_pages;
 
-	base = memblock_virt_alloc_try_nid_nopanic(
+	base = memblock_alloc_try_nid(
 			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
-			BOOTMEM_ALLOC_ACCESSIBLE, nid);
+			MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 	if (!base)
 		return -ENOMEM;
 	NODE_DATA(nid)->node_page_ext = base;
@@ -185,7 +245,6 @@
 			goto fail;
 	}
 	pr_info("allocated %ld bytes of page_ext\n", total_usage);
-	invoke_init_callbacks();
 	return;
 
 fail:
@@ -194,21 +253,29 @@
 }
 
 #else /* CONFIG_FLAT_NODE_MEM_MAP */
+static bool page_ext_invalid(struct page_ext *page_ext)
+{
+	return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
+}
 
 struct page_ext *lookup_page_ext(const struct page *page)
 {
 	unsigned long pfn = page_to_pfn(page);
 	struct mem_section *section = __pfn_to_section(pfn);
+	struct page_ext *page_ext = READ_ONCE(section->page_ext);
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
 	/*
 	 * The sanity checks the page allocator does upon freeing a
 	 * page can reach here before the page_ext arrays are
 	 * allocated when feeding a range of pages to the allocator
 	 * for the first time during bootup or memory hotplug.
 	 */
-	if (!section->page_ext)
+	if (page_ext_invalid(page_ext))
 		return NULL;
-	return get_entry(section->page_ext, pfn);
+	return get_entry(page_ext, pfn);
 }
+EXPORT_SYMBOL_GPL(lookup_page_ext);
 
 static void *__meminit alloc_page_ext(size_t size, int nid)
 {
@@ -237,7 +304,7 @@
 	if (section->page_ext)
 		return 0;
 
-	table_size = get_entry_size() * PAGES_PER_SECTION;
+	table_size = page_ext_size * PAGES_PER_SECTION;
 	base = alloc_page_ext(table_size, nid);
 
 	/*
@@ -257,7 +324,7 @@
 	 * we need to apply a mask.
 	 */
 	pfn &= PAGE_SECTION_MASK;
-	section->page_ext = (void *)base - get_entry_size() * pfn;
+	section->page_ext = (void *)base - page_ext_size * pfn;
 	total_usage += table_size;
 	return 0;
 }
@@ -270,7 +337,7 @@
 	struct page *page = virt_to_page(addr);
 	size_t table_size;
 
-	table_size = get_entry_size() * PAGES_PER_SECTION;
+	table_size = page_ext_size * PAGES_PER_SECTION;
 
 	BUG_ON(PageReserved(page));
 	kmemleak_free(addr);
@@ -286,9 +353,30 @@
 	ms = __pfn_to_section(pfn);
 	if (!ms || !ms->page_ext)
 		return;
-	base = get_entry(ms->page_ext, pfn);
+
+	base = READ_ONCE(ms->page_ext);
+	/*
+	 * page_ext here can be valid while doing the roll back
+	 * operation in online_page_ext().
+	 */
+	if (page_ext_invalid(base))
+		base = (void *)base - PAGE_EXT_INVALID;
+	WRITE_ONCE(ms->page_ext, NULL);
+
+	base = get_entry(base, pfn);
 	free_page_ext(base);
-	ms->page_ext = NULL;
+}
+
+static void __invalidate_page_ext(unsigned long pfn)
+{
+	struct mem_section *ms;
+	void *val;
+
+	ms = __pfn_to_section(pfn);
+	if (!ms || !ms->page_ext)
+		return;
+	val = (void *)ms->page_ext + PAGE_EXT_INVALID;
+	WRITE_ONCE(ms->page_ext, val);
 }
 
 static int __meminit online_page_ext(unsigned long start_pfn,
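
Rather than clearing section->page_ext right away, the offline path tags the pointer: __invalidate_page_ext() adds PAGE_EXT_INVALID (bit 0), lookup_page_ext() treats the tagged value as absent, and __free_page_ext() strips the tag to recover the real base before freeing. A small illustration of the round trip, with hypothetical local variables:

	void *base = READ_ONCE(section->page_ext);	/* bit 0 clear while the section is live */
	void *tagged = base + PAGE_EXT_INVALID;		/* bit 0 set: marks this section's page_ext invalid */
	bool dead = page_ext_invalid(tagged);		/* true, so lookup_page_ext() returns NULL */

	base = tagged - PAGE_EXT_INVALID;		/* strip the tag to get the real base back */

This works because base comes from a page-aligned allocation and pfn is section-aligned, so a valid stored pointer always has bit 0 clear.
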
@@ -301,7 +389,7 @@
 	start = SECTION_ALIGN_DOWN(start_pfn);
 	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
 
-	if (nid == -1) {
+	if (nid == NUMA_NO_NODE) {
 		/*
 		 * In this case, "nid" already exists and contains valid memory.
 		 * "start_pfn" passed to us is a pfn which is an arg for
@@ -311,11 +399,8 @@
 		VM_BUG_ON(!node_state(nid, N_ONLINE));
 	}
 
-	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
-		if (!pfn_present(pfn))
-			continue;
+	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
 		fail = init_section_page_ext(pfn, nid);
-	}
 	if (!fail)
 		return 0;
 
@@ -334,6 +419,20 @@
 	start = SECTION_ALIGN_DOWN(start_pfn);
 	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
 
+	/*
+	 * Freeing of page_ext is done in 3 steps to avoid
+	 * use-after-free of it:
+	 * 1) Traverse all the sections and mark their page_ext
+	 *    as invalid.
+	 * 2) Wait for all the existing users of page_ext who
+	 *    started before invalidation to finish.
+	 * 3) Free the page_ext.
+	 */
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+		__invalidate_page_ext(pfn);
+
+	synchronize_rcu();
+
 	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
 		__free_page_ext(pfn);
 	return 0;
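
The synchronize_rcu() in the middle is what pairs with the rcu_read_lock() taken by page_ext_get(): a reader that fetched a still-valid pointer before step 1 is guaranteed to finish its critical section before step 3 frees the memory, while a reader that starts after step 1 sees the tagged pointer and gets NULL. A rough timeline (illustration, not code from this patch):

	reader (page_ext_get)                     offliner (offline_page_ext)
	rcu_read_lock()
	page_ext = lookup_page_ext(page)          __invalidate_page_ext()   /* step 1: tag the pointer */
	... uses page_ext ...                     synchronize_rcu()         /* step 2: blocks here */
	page_ext_put() -> rcu_read_unlock()
	                                          synchronize_rcu() returns
	                                          __free_page_ext()         /* step 3: safe to free */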