hc
2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/drivers/base/memory.c
....@@ -19,29 +19,50 @@
1919 #include <linux/memory.h>
2020 #include <linux/memory_hotplug.h>
2121 #include <linux/mm.h>
22
-#include <linux/mutex.h>
2322 #include <linux/stat.h>
2423 #include <linux/slab.h>
24
+#include <linux/xarray.h>
2525
2626 #include <linux/atomic.h>
2727 #include <linux/uaccess.h>
2828
29
-static DEFINE_MUTEX(mem_sysfs_mutex);
30
-
3129 #define MEMORY_CLASS_NAME "memory"
30
+
31
+static const char *const online_type_to_str[] = {
32
+ [MMOP_OFFLINE] = "offline",
33
+ [MMOP_ONLINE] = "online",
34
+ [MMOP_ONLINE_KERNEL] = "online_kernel",
35
+ [MMOP_ONLINE_MOVABLE] = "online_movable",
36
+};
37
+
38
+int memhp_online_type_from_str(const char *str)
39
+{
40
+ int i;
41
+
42
+ for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
43
+ if (sysfs_streq(str, online_type_to_str[i]))
44
+ return i;
45
+ }
46
+ return -EINVAL;
47
+}
3248
3349 #define to_memory_block(dev) container_of(dev, struct memory_block, dev)
3450
3551 static int sections_per_block;
3652
37
-static inline int base_memory_block_id(int section_nr)
53
+static inline unsigned long memory_block_id(unsigned long section_nr)
3854 {
3955 return section_nr / sections_per_block;
4056 }
4157
42
-static inline int pfn_to_block_id(unsigned long pfn)
58
+static inline unsigned long pfn_to_block_id(unsigned long pfn)
4359 {
44
- return base_memory_block_id(pfn_to_section_nr(pfn));
60
+ return memory_block_id(pfn_to_section_nr(pfn));
61
+}
62
+
63
+static inline unsigned long phys_to_block_id(unsigned long phys)
64
+{
65
+ return pfn_to_block_id(PFN_DOWN(phys));
4566 }
4667
4768 static int memory_subsys_online(struct device *dev);
....@@ -53,6 +74,13 @@
5374 .online = memory_subsys_online,
5475 .offline = memory_subsys_offline,
5576 };
77
+
78
+/*
79
+ * Memory blocks are cached in a local XArray to avoid
80
+ * a costly linear search for the corresponding device on
81
+ * the subsystem bus.
82
+ */
83
+static DEFINE_XARRAY(memory_blocks);
5684
5785 static BLOCKING_NOTIFIER_HEAD(memory_chain);
5886
....@@ -68,20 +96,6 @@
6896 }
6997 EXPORT_SYMBOL(unregister_memory_notifier);
7098
71
-static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);
72
-
73
-int register_memory_isolate_notifier(struct notifier_block *nb)
74
-{
75
- return atomic_notifier_chain_register(&memory_isolate_chain, nb);
76
-}
77
-EXPORT_SYMBOL(register_memory_isolate_notifier);
78
-
79
-void unregister_memory_isolate_notifier(struct notifier_block *nb)
80
-{
81
- atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
82
-}
83
-EXPORT_SYMBOL(unregister_memory_isolate_notifier);
84
-
8599 static void memory_block_release(struct device *dev)
86100 {
87101 struct memory_block *mem = to_memory_block(dev);
....@@ -93,69 +107,40 @@
93107 {
94108 return MIN_MEMORY_BLOCK_SIZE;
95109 }
96
-
97
-static unsigned long get_memory_block_size(void)
98
-{
99
- unsigned long block_sz;
100
-
101
- block_sz = memory_block_size_bytes();
102
-
103
- /* Validate blk_sz is a power of 2 and not less than section size */
104
- if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
105
- WARN_ON(1);
106
- block_sz = MIN_MEMORY_BLOCK_SIZE;
107
- }
108
-
109
- return block_sz;
110
-}
110
+EXPORT_SYMBOL_GPL(memory_block_size_bytes);
111111
112112 /*
113
- * use this as the physical section index that this memsection
114
- * uses.
113
+ * Show the first physical section index (number) of this memory block.
115114 */
116
-
117
-static ssize_t show_mem_start_phys_index(struct device *dev,
118
- struct device_attribute *attr, char *buf)
115
+static ssize_t phys_index_show(struct device *dev,
116
+ struct device_attribute *attr, char *buf)
119117 {
120118 struct memory_block *mem = to_memory_block(dev);
121119 unsigned long phys_index;
122120
123121 phys_index = mem->start_section_nr / sections_per_block;
124
- return sprintf(buf, "%08lx\n", phys_index);
122
+
123
+ return sysfs_emit(buf, "%08lx\n", phys_index);
125124 }
126125
127126 /*
128
- * Show whether the section of memory is likely to be hot-removable
127
+ * Legacy interface that we cannot remove. Always indicate "removable"
128
+ * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
129129 */
130
-static ssize_t show_mem_removable(struct device *dev,
131
- struct device_attribute *attr, char *buf)
130
+static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
131
+ char *buf)
132132 {
133
- unsigned long i, pfn;
134
- int ret = 1;
135
- struct memory_block *mem = to_memory_block(dev);
136
-
137
- if (mem->state != MEM_ONLINE)
138
- goto out;
139
-
140
- for (i = 0; i < sections_per_block; i++) {
141
- if (!present_section_nr(mem->start_section_nr + i))
142
- continue;
143
- pfn = section_nr_to_pfn(mem->start_section_nr + i);
144
- ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
145
- }
146
-
147
-out:
148
- return sprintf(buf, "%d\n", ret);
133
+ return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
149134 }
150135
151136 /*
152137 * online, offline, going offline, etc.
153138 */
154
-static ssize_t show_mem_state(struct device *dev,
155
- struct device_attribute *attr, char *buf)
139
+static ssize_t state_show(struct device *dev, struct device_attribute *attr,
140
+ char *buf)
156141 {
157142 struct memory_block *mem = to_memory_block(dev);
158
- ssize_t len = 0;
143
+ const char *output;
159144
160145 /*
161146 * We can probably put these states in a nice little array
....@@ -163,71 +148,25 @@
163148 */
164149 switch (mem->state) {
165150 case MEM_ONLINE:
166
- len = sprintf(buf, "online\n");
151
+ output = "online";
167152 break;
168153 case MEM_OFFLINE:
169
- len = sprintf(buf, "offline\n");
154
+ output = "offline";
170155 break;
171156 case MEM_GOING_OFFLINE:
172
- len = sprintf(buf, "going-offline\n");
157
+ output = "going-offline";
173158 break;
174159 default:
175
- len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
176
- mem->state);
177160 WARN_ON(1);
178
- break;
161
+ return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
179162 }
180163
181
- return len;
164
+ return sysfs_emit(buf, "%s\n", output);
182165 }
183166
184167 int memory_notify(unsigned long val, void *v)
185168 {
186169 return blocking_notifier_call_chain(&memory_chain, val, v);
187
-}
188
-
189
-int memory_isolate_notify(unsigned long val, void *v)
190
-{
191
- return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
192
-}
193
-
194
-/*
195
- * The probe routines leave the pages uninitialized, just as the bootmem code
196
- * does. Make sure we do not access them, but instead use only information from
197
- * within sections.
198
- */
199
-static bool pages_correctly_probed(unsigned long start_pfn)
200
-{
201
- unsigned long section_nr = pfn_to_section_nr(start_pfn);
202
- unsigned long section_nr_end = section_nr + sections_per_block;
203
- unsigned long pfn = start_pfn;
204
-
205
- /*
206
- * memmap between sections is not contiguous except with
207
- * SPARSEMEM_VMEMMAP. We lookup the page once per section
208
- * and assume memmap is contiguous within each section
209
- */
210
- for (; section_nr < section_nr_end; section_nr++) {
211
- if (WARN_ON_ONCE(!pfn_valid(pfn)))
212
- return false;
213
-
214
- if (!present_section_nr(section_nr)) {
215
- pr_warn("section %ld pfn[%lx, %lx) not present",
216
- section_nr, pfn, pfn + PAGES_PER_SECTION);
217
- return false;
218
- } else if (!valid_section_nr(section_nr)) {
219
- pr_warn("section %ld pfn[%lx, %lx) no valid memmap",
220
- section_nr, pfn, pfn + PAGES_PER_SECTION);
221
- return false;
222
- } else if (online_section_nr(section_nr)) {
223
- pr_warn("section %ld pfn[%lx, %lx) is already online",
224
- section_nr, pfn, pfn + PAGES_PER_SECTION);
225
- return false;
226
- }
227
- pfn += PAGES_PER_SECTION;
228
- }
229
-
230
- return true;
231170 }
232171
233172 /*
....@@ -236,7 +175,7 @@
236175 */
237176 static int
238177 memory_block_action(unsigned long start_section_nr, unsigned long action,
239
- int online_type)
178
+ int online_type, int nid)
240179 {
241180 unsigned long start_pfn;
242181 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
....@@ -246,10 +185,7 @@
246185
247186 switch (action) {
248187 case MEM_ONLINE:
249
- if (!pages_correctly_probed(start_pfn))
250
- return -EBUSY;
251
-
252
- ret = online_pages(start_pfn, nr_pages, online_type);
188
+ ret = online_pages(start_pfn, nr_pages, online_type, nid);
253189 break;
254190 case MEM_OFFLINE:
255191 ret = offline_pages(start_pfn, nr_pages);
....@@ -275,7 +211,7 @@
275211 mem->state = MEM_GOING_OFFLINE;
276212
277213 ret = memory_block_action(mem->start_section_nr, to_state,
278
- mem->online_type);
214
+ mem->online_type, mem->nid);
279215
280216 mem->state = ret ? from_state_req : to_state;
281217
....@@ -292,17 +228,14 @@
292228 return 0;
293229
294230 /*
295
- * If we are called from store_mem_state(), online_type will be
296
- * set >= 0 Otherwise we were called from the device online
297
- * attribute and need to set the online_type.
231
+ * When called via device_online() without configuring the online_type,
232
+ * we want to default to MMOP_ONLINE.
298233 */
299
- if (mem->online_type < 0)
300
- mem->online_type = MMOP_ONLINE_KEEP;
234
+ if (mem->online_type == MMOP_OFFLINE)
235
+ mem->online_type = MMOP_ONLINE;
301236
302237 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
303
-
304
- /* clear online_type */
305
- mem->online_type = -1;
238
+ mem->online_type = MMOP_OFFLINE;
306239
307240 return ret;
308241 }
....@@ -314,41 +247,27 @@
314247 if (mem->state == MEM_OFFLINE)
315248 return 0;
316249
317
- /* Can't offline block with non-present sections */
318
- if (mem->section_count != sections_per_block)
319
- return -EINVAL;
320
-
321250 return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
322251 }
323252
324
-static ssize_t
325
-store_mem_state(struct device *dev,
326
- struct device_attribute *attr, const char *buf, size_t count)
253
+static ssize_t state_store(struct device *dev, struct device_attribute *attr,
254
+ const char *buf, size_t count)
327255 {
256
+ const int online_type = memhp_online_type_from_str(buf);
328257 struct memory_block *mem = to_memory_block(dev);
329
- int ret, online_type;
258
+ int ret;
259
+
260
+ if (online_type < 0)
261
+ return -EINVAL;
330262
331263 ret = lock_device_hotplug_sysfs();
332264 if (ret)
333265 return ret;
334266
335
- if (sysfs_streq(buf, "online_kernel"))
336
- online_type = MMOP_ONLINE_KERNEL;
337
- else if (sysfs_streq(buf, "online_movable"))
338
- online_type = MMOP_ONLINE_MOVABLE;
339
- else if (sysfs_streq(buf, "online"))
340
- online_type = MMOP_ONLINE_KEEP;
341
- else if (sysfs_streq(buf, "offline"))
342
- online_type = MMOP_OFFLINE;
343
- else {
344
- ret = -EINVAL;
345
- goto err;
346
- }
347
-
348267 switch (online_type) {
349268 case MMOP_ONLINE_KERNEL:
350269 case MMOP_ONLINE_MOVABLE:
351
- case MMOP_ONLINE_KEEP:
270
+ case MMOP_ONLINE:
352271 /* mem->online_type is protected by device_hotplug_lock */
353272 mem->online_type = online_type;
354273 ret = device_online(&mem->dev);
....@@ -360,7 +279,6 @@
360279 ret = -EINVAL; /* should never happen */
361280 }
362281
363
-err:
364282 unlock_device_hotplug();
365283
366284 if (ret < 0)
....@@ -372,43 +290,44 @@
372290 }
373291
374292 /*
375
- * phys_device is a bad name for this. What I really want
376
- * is a way to differentiate between memory ranges that
377
- * are part of physical devices that constitute
378
- * a complete removable unit or fru.
379
- * i.e. do these ranges belong to the same physical device,
380
- * s.t. if I offline all of these sections I can then
381
- * remove the physical device?
293
+ * Legacy interface that we cannot remove: s390x exposes the storage increment
294
+ * covered by a memory block, allowing for identifying which memory blocks
295
+ * comprise a storage increment. Since a memory block spans complete
296
+ * storage increments nowadays, this interface is basically unused. Other
297
+ * archs never exposed != 0.
382298 */
383
-static ssize_t show_phys_device(struct device *dev,
299
+static ssize_t phys_device_show(struct device *dev,
384300 struct device_attribute *attr, char *buf)
385301 {
386302 struct memory_block *mem = to_memory_block(dev);
387
- return sprintf(buf, "%d\n", mem->phys_device);
303
+ unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
304
+
305
+ return sysfs_emit(buf, "%d\n",
306
+ arch_get_memory_phys_device(start_pfn));
388307 }
389308
390309 #ifdef CONFIG_MEMORY_HOTREMOVE
391
-static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn,
392
- unsigned long nr_pages, int online_type,
393
- struct zone *default_zone)
310
+static int print_allowed_zone(char *buf, int len, int nid,
311
+ unsigned long start_pfn, unsigned long nr_pages,
312
+ int online_type, struct zone *default_zone)
394313 {
395314 struct zone *zone;
396315
397316 zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
398
- if (zone != default_zone) {
399
- strcat(buf, " ");
400
- strcat(buf, zone->name);
401
- }
317
+ if (zone == default_zone)
318
+ return 0;
319
+
320
+ return sysfs_emit_at(buf, len, " %s", zone->name);
402321 }
403322
404
-static ssize_t show_valid_zones(struct device *dev,
323
+static ssize_t valid_zones_show(struct device *dev,
405324 struct device_attribute *attr, char *buf)
406325 {
407326 struct memory_block *mem = to_memory_block(dev);
408327 unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
409328 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
410
- unsigned long valid_start_pfn, valid_end_pfn;
411329 struct zone *default_zone;
330
+ int len = 0;
412331 int nid;
413332
414333 /*
....@@ -420,77 +339,71 @@
420339 * The block contains more than one zone can not be offlined.
421340 * A block that contains more than one zone cannot be offlined.
422341 */
423
- if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages,
424
- &valid_start_pfn, &valid_end_pfn))
425
- return sprintf(buf, "none\n");
426
- start_pfn = valid_start_pfn;
427
- strcat(buf, page_zone(pfn_to_page(start_pfn))->name);
342
+ default_zone = test_pages_in_a_zone(start_pfn,
343
+ start_pfn + nr_pages);
344
+ if (!default_zone)
345
+ return sysfs_emit(buf, "%s\n", "none");
346
+ len += sysfs_emit_at(buf, len, "%s", default_zone->name);
428347 goto out;
429348 }
430349
431350 nid = mem->nid;
432
- default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages);
433
- strcat(buf, default_zone->name);
351
+ default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn,
352
+ nr_pages);
434353
435
- print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
436
- default_zone);
437
- print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE,
438
- default_zone);
354
+ len += sysfs_emit_at(buf, len, "%s", default_zone->name);
355
+ len += print_allowed_zone(buf, len, nid, start_pfn, nr_pages,
356
+ MMOP_ONLINE_KERNEL, default_zone);
357
+ len += print_allowed_zone(buf, len, nid, start_pfn, nr_pages,
358
+ MMOP_ONLINE_MOVABLE, default_zone);
439359 out:
440
- strcat(buf, "\n");
441
-
442
- return strlen(buf);
360
+ len += sysfs_emit_at(buf, len, "\n");
361
+ return len;
443362 }
444
-static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL);
363
+static DEVICE_ATTR_RO(valid_zones);
445364 #endif
446365
447
-static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
448
-static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
449
-static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
450
-static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);
366
+static DEVICE_ATTR_RO(phys_index);
367
+static DEVICE_ATTR_RW(state);
368
+static DEVICE_ATTR_RO(phys_device);
369
+static DEVICE_ATTR_RO(removable);
451370
452371 /*
453
- * Block size attribute stuff
372
+ * Show the memory block size (shared by all memory blocks).
454373 */
455
-static ssize_t
456
-print_block_size(struct device *dev, struct device_attribute *attr,
457
- char *buf)
374
+static ssize_t block_size_bytes_show(struct device *dev,
375
+ struct device_attribute *attr, char *buf)
458376 {
459
- return sprintf(buf, "%lx\n", get_memory_block_size());
377
+ return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
460378 }
461379
462
-static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);
380
+static DEVICE_ATTR_RO(block_size_bytes);
463381
464382 /*
465383 * Memory auto online policy.
466384 */
467385
468
-static ssize_t
469
-show_auto_online_blocks(struct device *dev, struct device_attribute *attr,
470
- char *buf)
386
+static ssize_t auto_online_blocks_show(struct device *dev,
387
+ struct device_attribute *attr, char *buf)
471388 {
472
- if (memhp_auto_online)
473
- return sprintf(buf, "online\n");
474
- else
475
- return sprintf(buf, "offline\n");
389
+ return sysfs_emit(buf, "%s\n",
390
+ online_type_to_str[memhp_default_online_type]);
476391 }
477392
478
-static ssize_t
479
-store_auto_online_blocks(struct device *dev, struct device_attribute *attr,
480
- const char *buf, size_t count)
393
+static ssize_t auto_online_blocks_store(struct device *dev,
394
+ struct device_attribute *attr,
395
+ const char *buf, size_t count)
481396 {
482
- if (sysfs_streq(buf, "online"))
483
- memhp_auto_online = true;
484
- else if (sysfs_streq(buf, "offline"))
485
- memhp_auto_online = false;
486
- else
397
+ const int online_type = memhp_online_type_from_str(buf);
398
+
399
+ if (online_type < 0)
487400 return -EINVAL;
488401
402
+ memhp_default_online_type = online_type;
489403 return count;
490404 }
491405
492
-static DEVICE_ATTR(auto_online_blocks, 0644, show_auto_online_blocks,
493
- store_auto_online_blocks);
406
+static DEVICE_ATTR_RW(auto_online_blocks);
494407
495408 /*
496409 * Some architectures will have custom drivers to do this, and
....@@ -499,9 +412,8 @@
499412 * and will require this interface.
500413 */
501414 #ifdef CONFIG_ARCH_MEMORY_PROBE
502
-static ssize_t
503
-memory_probe_store(struct device *dev, struct device_attribute *attr,
504
- const char *buf, size_t count)
415
+static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
416
+ const char *buf, size_t count)
505417 {
506418 u64 phys_addr;
507419 int nid, ret;
....@@ -520,7 +432,8 @@
520432
521433 nid = memory_add_physaddr_to_nid(phys_addr);
522434 ret = __add_memory(nid, phys_addr,
523
- MIN_MEMORY_BLOCK_SIZE * sections_per_block);
435
+ MIN_MEMORY_BLOCK_SIZE * sections_per_block,
436
+ MHP_NONE);
524437
525438 if (ret)
526439 goto out;
....@@ -531,7 +444,7 @@
531444 return ret;
532445 }
533446
534
-static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
447
+static DEVICE_ATTR_WO(probe);
535448 #endif
536449
537450 #ifdef CONFIG_MEMORY_FAILURE
....@@ -540,10 +453,9 @@
540453 */
541454
542455 /* Soft offline a page */
543
-static ssize_t
544
-store_soft_offline_page(struct device *dev,
545
- struct device_attribute *attr,
546
- const char *buf, size_t count)
456
+static ssize_t soft_offline_page_store(struct device *dev,
457
+ struct device_attribute *attr,
458
+ const char *buf, size_t count)
547459 {
548460 int ret;
549461 u64 pfn;
....@@ -552,20 +464,14 @@
552464 if (kstrtoull(buf, 0, &pfn) < 0)
553465 return -EINVAL;
554466 pfn >>= PAGE_SHIFT;
555
- if (!pfn_valid(pfn))
556
- return -ENXIO;
557
- /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
558
- if (!pfn_to_online_page(pfn))
559
- return -EIO;
560
- ret = soft_offline_page(pfn_to_page(pfn), 0);
467
+ ret = soft_offline_page(pfn, 0);
561468 return ret == 0 ? count : ret;
562469 }
563470
564471 /* Forcibly offline a page, including killing processes. */
565
-static ssize_t
566
-store_hard_offline_page(struct device *dev,
567
- struct device_attribute *attr,
568
- const char *buf, size_t count)
472
+static ssize_t hard_offline_page_store(struct device *dev,
473
+ struct device_attribute *attr,
474
+ const char *buf, size_t count)
569475 {
570476 int ret;
571477 u64 pfn;
....@@ -578,57 +484,39 @@
578484 return ret ? ret : count;
579485 }
580486
581
-static DEVICE_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page);
582
-static DEVICE_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page);
487
+static DEVICE_ATTR_WO(soft_offline_page);
488
+static DEVICE_ATTR_WO(hard_offline_page);
583489 #endif
584490
585
-/*
586
- * Note that phys_device is optional. It is here to allow for
587
- * differentiation between which *physical* devices each
588
- * section belongs to...
589
- */
491
+/* See phys_device_show(). */
590492 int __weak arch_get_memory_phys_device(unsigned long start_pfn)
591493 {
592494 return 0;
593495 }
594496
595497 /*
596
- * A reference for the returned object is held and the reference for the
597
- * hinted object is released.
498
+ * A reference for the returned memory block device is acquired.
499
+ *
500
+ * Called under device_hotplug_lock.
598501 */
599
-static struct memory_block *find_memory_block_by_id(int block_id,
600
- struct memory_block *hint)
502
+static struct memory_block *find_memory_block_by_id(unsigned long block_id)
601503 {
602
- struct device *hintdev = hint ? &hint->dev : NULL;
603
- struct device *dev;
504
+ struct memory_block *mem;
604505
605
- dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
606
- if (hint)
607
- put_device(&hint->dev);
608
- if (!dev)
609
- return NULL;
610
- return to_memory_block(dev);
611
-}
612
-
613
-struct memory_block *find_memory_block_hinted(struct mem_section *section,
614
- struct memory_block *hint)
615
-{
616
- int block_id = base_memory_block_id(__section_nr(section));
617
-
618
- return find_memory_block_by_id(block_id, hint);
506
+ mem = xa_load(&memory_blocks, block_id);
507
+ if (mem)
508
+ get_device(&mem->dev);
509
+ return mem;
619510 }
620511
621512 /*
622
- * For now, we have a linear search to go find the appropriate
623
- * memory_block corresponding to a particular phys_index. If
624
- * this gets to be a real problem, we can always use a radix
625
- * tree or something here.
626
- *
627
- * This could be made generic for all device subsystems.
513
+ * Called under device_hotplug_lock.
628514 */
629515 struct memory_block *find_memory_block(struct mem_section *section)
630516 {
631
- return find_memory_block_hinted(section, NULL);
517
+ unsigned long block_id = memory_block_id(__section_nr(section));
518
+
519
+ return find_memory_block_by_id(block_id);
632520 }
633521
634522 static struct attribute *memory_memblk_attrs[] = {
....@@ -666,20 +554,24 @@
666554 memory->dev.offline = memory->state == MEM_OFFLINE;
667555
668556 ret = device_register(&memory->dev);
669
- if (ret)
557
+ if (ret) {
670558 put_device(&memory->dev);
559
+ return ret;
560
+ }
561
+ ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
562
+ GFP_KERNEL));
563
+ if (ret)
564
+ device_unregister(&memory->dev);
671565
672566 return ret;
673567 }
674568
675
-static int init_memory_block(struct memory_block **memory, int block_id,
676
- unsigned long state)
569
+static int init_memory_block(unsigned long block_id, unsigned long state)
677570 {
678571 struct memory_block *mem;
679
- unsigned long start_pfn;
680572 int ret = 0;
681573
682
- mem = find_memory_block_by_id(block_id, NULL);
574
+ mem = find_memory_block_by_id(block_id);
683575 if (mem) {
684576 put_device(&mem->dev);
685577 return -EEXIST;
....@@ -689,43 +581,36 @@
689581 return -ENOMEM;
690582
691583 mem->start_section_nr = block_id * sections_per_block;
692
- mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
693584 mem->state = state;
694
- start_pfn = section_nr_to_pfn(mem->start_section_nr);
695
- mem->phys_device = arch_get_memory_phys_device(start_pfn);
696585 mem->nid = NUMA_NO_NODE;
697586
698587 ret = register_memory(mem);
699588
700
- *memory = mem;
701589 return ret;
702590 }
703591
704
-static int add_memory_block(int base_section_nr)
592
+static int add_memory_block(unsigned long base_section_nr)
705593 {
706
- struct memory_block *mem;
707
- int i, ret, section_count = 0;
594
+ int section_count = 0;
595
+ unsigned long nr;
708596
709
- for (i = base_section_nr;
710
- i < base_section_nr + sections_per_block;
711
- i++)
712
- if (present_section_nr(i))
597
+ for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
598
+ nr++)
599
+ if (present_section_nr(nr))
713600 section_count++;
714601
715602 if (section_count == 0)
716603 return 0;
717
- ret = init_memory_block(&mem, base_memory_block_id(base_section_nr),
718
- MEM_ONLINE);
719
- if (ret)
720
- return ret;
721
- mem->section_count = section_count;
722
- return 0;
604
+ return init_memory_block(memory_block_id(base_section_nr),
605
+ MEM_ONLINE);
723606 }
724607
725608 static void unregister_memory(struct memory_block *memory)
726609 {
727610 if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
728611 return;
612
+
613
+ WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);
729614
730615 /* drop the ref. we got via find_memory_block() */
731616 put_device(&memory->dev);
....@@ -736,11 +621,13 @@
736621 * Create memory block devices for the given memory area. Start and size
737622 * have to be aligned to memory block granularity. Memory block devices
738623 * will be initialized as offline.
624
+ *
625
+ * Called under device_hotplug_lock.
739626 */
740627 int create_memory_block_devices(unsigned long start, unsigned long size)
741628 {
742
- const int start_block_id = pfn_to_block_id(PFN_DOWN(start));
743
- int end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
629
+ const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
630
+ unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
744631 struct memory_block *mem;
745632 unsigned long block_id;
746633 int ret = 0;
....@@ -749,23 +636,21 @@
749636 !IS_ALIGNED(size, memory_block_size_bytes())))
750637 return -EINVAL;
751638
752
- mutex_lock(&mem_sysfs_mutex);
753639 for (block_id = start_block_id; block_id != end_block_id; block_id++) {
754
- ret = init_memory_block(&mem, block_id, MEM_OFFLINE);
640
+ ret = init_memory_block(block_id, MEM_OFFLINE);
755641 if (ret)
756642 break;
757
- mem->section_count = sections_per_block;
758643 }
759644 if (ret) {
760645 end_block_id = block_id;
761646 for (block_id = start_block_id; block_id != end_block_id;
762647 block_id++) {
763
- mem = find_memory_block_by_id(block_id, NULL);
764
- mem->section_count = 0;
648
+ mem = find_memory_block_by_id(block_id);
649
+ if (WARN_ON_ONCE(!mem))
650
+ continue;
765651 unregister_memory(mem);
766652 }
767653 }
768
- mutex_unlock(&mem_sysfs_mutex);
769654 return ret;
770655 }
771656
....@@ -773,28 +658,27 @@
773658 * Remove memory block devices for the given memory area. Start and size
774659 * have to be aligned to memory block granularity. Memory block devices
775660 * have to be offline.
661
+ *
662
+ * Called under device_hotplug_lock.
776663 */
777664 void remove_memory_block_devices(unsigned long start, unsigned long size)
778665 {
779
- const int start_block_id = pfn_to_block_id(PFN_DOWN(start));
780
- const int end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
666
+ const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
667
+ const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
781668 struct memory_block *mem;
782
- int block_id;
669
+ unsigned long block_id;
783670
784671 if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
785672 !IS_ALIGNED(size, memory_block_size_bytes())))
786673 return;
787674
788
- mutex_lock(&mem_sysfs_mutex);
789675 for (block_id = start_block_id; block_id != end_block_id; block_id++) {
790
- mem = find_memory_block_by_id(block_id, NULL);
676
+ mem = find_memory_block_by_id(block_id);
791677 if (WARN_ON_ONCE(!mem))
792678 continue;
793
- mem->section_count = 0;
794679 unregister_memory_block_under_nodes(mem);
795680 unregister_memory(mem);
796681 }
797
- mutex_unlock(&mem_sysfs_mutex);
798682 }
799683
800684 /* return true if the memory block is offlined, otherwise, return false */
....@@ -828,38 +712,77 @@
828712 };
829713
830714 /*
831
- * Initialize the sysfs support for memory devices...
715
+ * Initialize the sysfs support for memory devices. At the time this function
716
+ * is called, we cannot have concurrent creation/deletion of memory block
717
+ * devices, the device_hotplug_lock is not needed.
832718 */
833
-int __init memory_dev_init(void)
719
+void __init memory_dev_init(void)
834720 {
835
- unsigned int i;
836721 int ret;
837
- int err;
838
- unsigned long block_sz;
722
+ unsigned long block_sz, nr;
723
+
724
+ /* Validate the configured memory block size */
725
+ block_sz = memory_block_size_bytes();
726
+ if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
727
+ panic("Memory block size not suitable: 0x%lx\n", block_sz);
728
+ sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
839729
840730 ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
841731 if (ret)
842
- goto out;
843
-
844
- block_sz = get_memory_block_size();
845
- sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
732
+ panic("%s() failed to register subsystem: %d\n", __func__, ret);
846733
847734 /*
848735 * Create entries for memory sections that were found
849736 * during boot and have been initialized
850737 */
851
- mutex_lock(&mem_sysfs_mutex);
852
- for (i = 0; i <= __highest_present_section_nr;
853
- i += sections_per_block) {
854
- err = add_memory_block(i);
855
- if (!ret)
856
- ret = err;
738
+ for (nr = 0; nr <= __highest_present_section_nr;
739
+ nr += sections_per_block) {
740
+ ret = add_memory_block(nr);
741
+ if (ret)
742
+ panic("%s() failed to add memory block: %d\n", __func__,
743
+ ret);
857744 }
858
- mutex_unlock(&mem_sysfs_mutex);
745
+}
859746
860
-out:
861
- if (ret)
862
- printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
747
+/**
748
+ * walk_memory_blocks - walk through all present memory blocks overlapped
749
+ * by the range [start, start + size)
750
+ *
751
+ * @start: start address of the memory range
752
+ * @size: size of the memory range
753
+ * @arg: argument passed to func
754
+ * @func: callback for each memory block walked
755
+ *
756
+ * This function walks through all present memory blocks overlapped by the
757
+ * range [start, start + size), calling func on each memory block.
758
+ *
759
+ * In case func() returns an error, walking is aborted and the error is
760
+ * returned.
761
+ *
762
+ * Called under device_hotplug_lock.
763
+ */
764
+int walk_memory_blocks(unsigned long start, unsigned long size,
765
+ void *arg, walk_memory_blocks_func_t func)
766
+{
767
+ const unsigned long start_block_id = phys_to_block_id(start);
768
+ const unsigned long end_block_id = phys_to_block_id(start + size - 1);
769
+ struct memory_block *mem;
770
+ unsigned long block_id;
771
+ int ret = 0;
772
+
773
+ if (!size)
774
+ return 0;
775
+
776
+ for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
777
+ mem = find_memory_block_by_id(block_id);
778
+ if (!mem)
779
+ continue;
780
+
781
+ ret = func(mem, arg);
782
+ put_device(&mem->dev);
783
+ if (ret)
784
+ break;
785
+ }
863786 return ret;
864787 }
865788