2024-05-11 297b60346df8beafee954a0fd7c2d64f33f3b9bc
kernel/arch/powerpc/mm/numa.c
@@ -1,17 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * pSeries NUMA support
  *
  * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 #define pr_fmt(fmt) "numa: " fmt
 
 #include <linux/threads.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
@@ -19,7 +15,6 @@
 #include <linux/nodemask.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
-#include <linux/memblock.h>
 #include <linux/of.h>
 #include <linux/pfn.h>
 #include <linux/cpuset.h>
@@ -33,7 +28,6 @@
 #include <asm/sparsemem.h>
 #include <asm/prom.h>
 #include <asm/smp.h>
-#include <asm/cputhreads.h>
 #include <asm/topology.h>
 #include <asm/firmware.h>
 #include <asm/paca.h>
@@ -57,14 +51,22 @@
 EXPORT_SYMBOL(node_to_cpumask_map);
 EXPORT_SYMBOL(node_data);
 
-static int min_common_depth;
+static int primary_domain_index;
 static int n_mem_addr_cells, n_mem_size_cells;
-static int form1_affinity;
+
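+/*
+ * The ibm,associativity information can be encoded in three forms:
+ * FORM0 only distinguishes local from remote nodes, FORM1 derives
+ * distance from the associativity-reference-points hierarchy, and
+ * FORM2 supplies an explicit distance matrix via the device tree.
+ */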
+#define FORM0_AFFINITY 0
+#define FORM1_AFFINITY 1
+#define FORM2_AFFINITY 2
+static int affinity_form;
 
 #define MAX_DISTANCE_REF_POINTS 4
 static int distance_ref_points_depth;
 static const __be32 *distance_ref_points;
 static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
+static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = {
+	[0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 }
+};
+static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE };
 
 /*
  * Allocate node_to_cpumask_map based on number of available nodes
@@ -85,7 +87,7 @@
 		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
 
 	/* cpumask_of_node() will now work */
-	dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
+	dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
 }
 
 static int __init fake_numa_create_new_node(unsigned long end_pfn,
@@ -169,6 +171,79 @@
 }
 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
 
+static int __associativity_to_nid(const __be32 *associativity,
+				  int max_array_sz)
+{
+	int nid;
+	/*
+	 * primary_domain_index is a 1-based index into the
+	 * associativity array.
+	 */
+	int index = primary_domain_index - 1;
+
+	if (!numa_enabled || index >= max_array_sz)
+		return NUMA_NO_NODE;
+
+	nid = of_read_number(&associativity[index], 1);
+
+	/* POWER4 LPAR uses 0xffff as invalid node */
+	if (nid == 0xffff || nid >= nr_node_ids)
+		nid = NUMA_NO_NODE;
+	return nid;
+}
+/*
+ * Returns nid in the range [0..nr_node_ids - 1], or -1 if no useful
+ * NUMA info is found.
+ */
+static int associativity_to_nid(const __be32 *associativity)
+{
+	int array_sz = of_read_number(associativity, 1);
+
+	/* Skip the first element in the associativity array */
+	return __associativity_to_nid((associativity + 1), array_sz);
+}
+
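+/*
+ * Bucket a FORM2 table distance into the coarse relative scale used by
+ * callers: 0 for local, 1 for remote, 2 for far remote, based on the
+ * LOCAL_DISTANCE and REMOTE_DISTANCE thresholds.
+ */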
+static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+	int dist;
+	int node1, node2;
+
+	node1 = associativity_to_nid(cpu1_assoc);
+	node2 = associativity_to_nid(cpu2_assoc);
+
+	dist = numa_distance_table[node1][node2];
+	if (dist <= LOCAL_DISTANCE)
+		return 0;
+	else if (dist <= REMOTE_DISTANCE)
+		return 1;
+	else
+		return 2;
+}
+
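+/*
+ * FORM1: the relative distance is the number of associativity-reference
+ * levels at which the two arrays still differ before they first match;
+ * 0 means the cpus share the lowest reference level.
+ */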
+static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+	int dist = 0;
+
+	int i, index;
+
+	for (i = 0; i < distance_ref_points_depth; i++) {
+		index = be32_to_cpu(distance_ref_points[i]);
+		if (cpu1_assoc[index] == cpu2_assoc[index])
+			break;
+		dist++;
+	}
+
+	return dist;
+}
+
+int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+	/* We should not get called with FORM0 */
+	VM_WARN_ON(affinity_form == FORM0_AFFINITY);
+	if (affinity_form == FORM1_AFFINITY)
+		return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc);
+	return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc);
+}
+
 /* must hold reference to node during call */
 static const __be32 *of_get_associativity(struct device_node *dev)
 {
@@ -180,7 +255,9 @@
 	int i;
 	int distance = LOCAL_DISTANCE;
 
-	if (!form1_affinity)
+	if (affinity_form == FORM2_AFFINITY)
+		return numa_distance_table[a][b];
+	else if (affinity_form == FORM0_AFFINITY)
 		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 
 	for (i = 0; i < distance_ref_points_depth; i++) {
@@ -195,57 +272,12 @@
 }
 EXPORT_SYMBOL(__node_distance);
 
-static void initialize_distance_lookup_table(int nid,
-		const __be32 *associativity)
-{
-	int i;
-
-	if (!form1_affinity)
-		return;
-
-	for (i = 0; i < distance_ref_points_depth; i++) {
-		const __be32 *entry;
-
-		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
-		distance_lookup_table[nid][i] = of_read_number(entry, 1);
-	}
-}
-
-/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
- * info is found.
- */
-static int associativity_to_nid(const __be32 *associativity)
-{
-	int nid = -1;
-
-	if (min_common_depth == -1)
-		goto out;
-
-	if (of_read_number(associativity, 1) >= min_common_depth)
-		nid = of_read_number(&associativity[min_common_depth], 1);
-
-	/* POWER4 LPAR uses 0xffff as invalid node */
-	if (nid == 0xffff || nid >= MAX_NUMNODES)
-		nid = -1;
-
-	if (nid > 0 &&
-	    of_read_number(associativity, 1) >= distance_ref_points_depth) {
-		/*
-		 * Skip the length field and send start of associativity array
-		 */
-		initialize_distance_lookup_table(nid, associativity + 1);
-	}
-
-out:
-	return nid;
-}
-
 /* Returns the nid associated with the given device tree node,
  * or -1 if not found.
  */
 static int of_node_to_nid_single(struct device_node *device)
 {
-	int nid = -1;
+	int nid = NUMA_NO_NODE;
 	const __be32 *tmp;
 
 	tmp = of_get_associativity(device);
@@ -257,7 +289,7 @@
 /* Walk the device tree upwards, looking for an associativity id */
 int of_node_to_nid(struct device_node *device)
 {
-	int nid = -1;
+	int nid = NUMA_NO_NODE;
 
 	of_node_get(device);
 	while (device) {
@@ -273,10 +305,159 @@
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
-static int __init find_min_common_depth(void)
+static void __initialize_form1_numa_distance(const __be32 *associativity,
+					     int max_array_sz)
 {
-	int depth;
+	int i, nid;
+
+	if (affinity_form != FORM1_AFFINITY)
+		return;
+
+	nid = __associativity_to_nid(associativity, max_array_sz);
+	if (nid != NUMA_NO_NODE) {
+		for (i = 0; i < distance_ref_points_depth; i++) {
+			const __be32 *entry;
+			int index = be32_to_cpu(distance_ref_points[i]) - 1;
+
+			/*
+			 * Broken hierarchy: return with a broken distance table.
+			 */
+			if (WARN(index >= max_array_sz, "Broken ibm,associativity property"))
+				return;
+
+			entry = &associativity[index];
+			distance_lookup_table[nid][i] = of_read_number(entry, 1);
+		}
+	}
+}
+
+static void initialize_form1_numa_distance(const __be32 *associativity)
+{
+	int array_sz;
+
+	array_sz = of_read_number(associativity, 1);
+	/* Skip the first element in the associativity array */
+	__initialize_form1_numa_distance(associativity + 1, array_sz);
+}
+
+/*
+ * Used to update distance information w.r.t. a newly added node.
+ */
+void update_numa_distance(struct device_node *node)
+{
+	int nid;
+
+	if (affinity_form == FORM0_AFFINITY)
+		return;
+	else if (affinity_form == FORM1_AFFINITY) {
+		const __be32 *associativity;
+
+		associativity = of_get_associativity(node);
+		if (!associativity)
+			return;
+
+		initialize_form1_numa_distance(associativity);
+		return;
+	}
+
+	/* FORM2 affinity */
+	nid = of_node_to_nid_single(node);
+	if (nid == NUMA_NO_NODE)
+		return;
+
+	/*
+	 * With FORM2 we expect the NUMA distance of all possible NUMA
+	 * nodes to be provided during boot.
+	 */
+	WARN(numa_distance_table[nid][nid] == -1,
+	     "NUMA distance details for node %d not provided\n", nid);
+}
+EXPORT_SYMBOL_GPL(update_numa_distance);
+
+/*
+ * ibm,numa-lookup-index-table = {N, domainid1, domainid2, ..... domainidN}
+ * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements}
+ */
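+/*
+ * For example, a hypothetical two-node system could carry
+ * ibm,numa-lookup-index-table = {2, 0, 1} and
+ * ibm,numa-distance-table = {4, 10, 20, 20, 10}, yielding
+ * dist[0][0] = dist[1][1] = 10 and dist[0][1] = dist[1][0] = 20.
+ */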
+static void initialize_form2_numa_distance_lookup_table(void)
+{
+	int i, j;
 	struct device_node *root;
+	const __u8 *numa_dist_table;
+	const __be32 *numa_lookup_index;
+	int numa_dist_table_length;
+	int max_numa_index, distance_index;
+
+	if (firmware_has_feature(FW_FEATURE_OPAL))
+		root = of_find_node_by_path("/ibm,opal");
+	else
+		root = of_find_node_by_path("/rtas");
+	if (!root)
+		root = of_find_node_by_path("/");
+
+	numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL);
+	max_numa_index = of_read_number(&numa_lookup_index[0], 1);
+
+	/* first element of the array is the size and is encode-int */
+	numa_dist_table = of_get_property(root, "ibm,numa-distance-table", NULL);
+	numa_dist_table_length = of_read_number((const __be32 *)&numa_dist_table[0], 1);
+	/* Skip the size which is an encoded int */
+	numa_dist_table += sizeof(__be32);
+
+	pr_debug("numa_dist_table_len = %d, numa_dist_indexes_len = %d\n",
+		 numa_dist_table_length, max_numa_index);
+
+	for (i = 0; i < max_numa_index; i++)
+		/* +1 skip the max_numa_index in the property */
+		numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1);
+
+	if (numa_dist_table_length != max_numa_index * max_numa_index) {
+		WARN(1, "Wrong NUMA distance information\n");
+		/* consider everybody else just remote. */
+		for (i = 0; i < max_numa_index; i++) {
+			for (j = 0; j < max_numa_index; j++) {
+				int nodeA = numa_id_index_table[i];
+				int nodeB = numa_id_index_table[j];
+
+				if (nodeA == nodeB)
+					numa_distance_table[nodeA][nodeB] = LOCAL_DISTANCE;
+				else
+					numa_distance_table[nodeA][nodeB] = REMOTE_DISTANCE;
+			}
+		}
+		/*
+		 * Don't consume the wrongly sized table below; it would
+		 * overwrite the fallback values just written.
+		 */
+		of_node_put(root);
+		return;
+	}
+
+	distance_index = 0;
+	for (i = 0; i < max_numa_index; i++) {
+		for (j = 0; j < max_numa_index; j++) {
+			int nodeA = numa_id_index_table[i];
+			int nodeB = numa_id_index_table[j];
+
+			numa_distance_table[nodeA][nodeB] = numa_dist_table[distance_index++];
+			pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, numa_distance_table[nodeA][nodeB]);
+		}
+	}
+	of_node_put(root);
+}
+
+static int __init find_primary_domain_index(void)
+{
+	int index;
+	struct device_node *root;
+
+	/*
+	 * Check which form of affinity is in use.
+	 */
+	if (firmware_has_feature(FW_FEATURE_OPAL)) {
+		affinity_form = FORM1_AFFINITY;
+	} else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
+		dbg("Using form 2 affinity\n");
+		affinity_form = FORM2_AFFINITY;
+	} else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
+		dbg("Using form 1 affinity\n");
+		affinity_form = FORM1_AFFINITY;
+	} else
+		affinity_form = FORM0_AFFINITY;
 
 	if (firmware_has_feature(FW_FEATURE_OPAL))
 		root = of_find_node_by_path("/ibm,opal");
@@ -307,25 +488,21 @@
 	}
 
 	distance_ref_points_depth /= sizeof(int);
-
-	if (firmware_has_feature(FW_FEATURE_OPAL) ||
-	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
-		dbg("Using form 1 affinity\n");
-		form1_affinity = 1;
-	}
-
-	if (form1_affinity) {
-		depth = of_read_number(distance_ref_points, 1);
-	} else {
+	if (affinity_form == FORM0_AFFINITY) {
 		if (distance_ref_points_depth < 2) {
 			printk(KERN_WARNING "NUMA: "
-				"short ibm,associativity-reference-points\n");
+			       "short ibm,associativity-reference-points\n");
 			goto err;
 		}
 
-		depth = of_read_number(&distance_ref_points[1], 1);
+		index = of_read_number(&distance_ref_points[1], 1);
+	} else {
+		/*
+		 * Both FORM1 and FORM2 affinity find the primary domain details
+		 * at the same offset.
+		 */
+		index = of_read_number(distance_ref_points, 1);
 	}
-
 	/*
 	 * Warn and cap if the hardware supports more than
 	 * MAX_DISTANCE_REF_POINTS domains.
@@ -337,7 +514,7 @@
 	}
 
 	of_node_put(root);
-	return depth;
+	return index;
 
 err:
 	of_node_put(root);
@@ -415,39 +592,119 @@
 	return 0;
 }
 
-/*
- * This is like of_node_to_nid_single() for memory represented in the
- * ibm,dynamic-reconfiguration-memory node.
- */
-static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
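+/*
+ * Like of_drconf_to_nid_single() below, but additionally feeds the
+ * FORM1 distance table; used while parsing LMBs at boot.
+ */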
+static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
 {
 	struct assoc_arrays aa = { .arrays = NULL };
-	int default_nid = 0;
+	int default_nid = NUMA_NO_NODE;
 	int nid = default_nid;
 	int rc, index;
+
+	if ((primary_domain_index < 0) || !numa_enabled)
+		return default_nid;
 
 	rc = of_get_assoc_arrays(&aa);
 	if (rc)
 		return default_nid;
 
-	if (min_common_depth > 0 && min_common_depth <= aa.array_sz &&
-	    !(lmb->flags & DRCONF_MEM_AI_INVALID) &&
-	    lmb->aa_index < aa.n_arrays) {
-		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
-		nid = of_read_number(&aa.arrays[index], 1);
+	if (primary_domain_index <= aa.array_sz &&
+	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
+		const __be32 *associativity;
 
-		if (nid == 0xffff || nid >= MAX_NUMNODES)
-			nid = default_nid;
-
-		if (nid > 0) {
-			index = lmb->aa_index * aa.array_sz;
-			initialize_distance_lookup_table(nid,
-							&aa.arrays[index]);
+		index = lmb->aa_index * aa.array_sz;
+		associativity = &aa.arrays[index];
+		nid = __associativity_to_nid(associativity, aa.array_sz);
+		if (nid > 0 && affinity_form == FORM1_AFFINITY) {
+			/*
+			 * Entries in the lookup array do not carry the
+			 * array length as their first element.
+			 */
+			__initialize_form1_numa_distance(associativity, aa.array_sz);
 		}
 	}
-
 	return nid;
 }
+
+/*
+ * This is like of_node_to_nid_single() for memory represented in the
+ * ibm,dynamic-reconfiguration-memory node.
+ */
+int of_drconf_to_nid_single(struct drmem_lmb *lmb)
+{
+	struct assoc_arrays aa = { .arrays = NULL };
+	int default_nid = NUMA_NO_NODE;
+	int nid = default_nid;
+	int rc, index;
+
+	if ((primary_domain_index < 0) || !numa_enabled)
+		return default_nid;
+
+	rc = of_get_assoc_arrays(&aa);
+	if (rc)
+		return default_nid;
+
+	if (primary_domain_index <= aa.array_sz &&
+	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
+		const __be32 *associativity;
+
+		index = lmb->aa_index * aa.array_sz;
+		associativity = &aa.arrays[index];
+		nid = __associativity_to_nid(associativity, aa.array_sz);
+	}
+	return nid;
+}
+
+#ifdef CONFIG_PPC_SPLPAR
+
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
+{
+	long rc, hwid;
+
+	/*
+	 * On a shared lpar, the device tree will not have node associativity.
+	 * At this time the lppaca, or its __old_status field, may not be
+	 * updated, so the kernel cannot detect if it is on a shared lpar.
+	 * Hence request an explicit associativity irrespective of whether
+	 * the lpar is shared or dedicated, and use the device tree property
+	 * as a fallback. cpu_to_phys_id is only valid between
+	 * smp_setup_cpu_maps() and smp_setup_pacas().
+	 */
+	if (firmware_has_feature(FW_FEATURE_VPHN)) {
+		if (cpu_to_phys_id)
+			hwid = cpu_to_phys_id[lcpu];
+		else
+			hwid = get_hard_smp_processor_id(lcpu);
+
+		rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
+		if (rc == H_SUCCESS)
+			return 0;
+	}
+
+	return -1;
+}
+
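+/*
+ * Map a logical cpu to a node via its VPHN-reported associativity,
+ * returning NUMA_NO_NODE when the hcall is unavailable or fails.
+ */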
+static int vphn_get_nid(long lcpu)
+{
+	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+
+	if (!__vphn_get_associativity(lcpu, associativity))
+		return associativity_to_nid(associativity);
+
+	return NUMA_NO_NODE;
+}
+#else
+
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
+{
+	return -1;
+}
+
+static int vphn_get_nid(long unused)
+{
+	return NUMA_NO_NODE;
+}
+#endif /* CONFIG_PPC_SPLPAR */
 
 /*
  * Figure out to which domain a cpu belongs and stick it there.
@@ -455,18 +712,32 @@
  */
 static int numa_setup_cpu(unsigned long lcpu)
 {
-	int nid = -1;
 	struct device_node *cpu;
+	int fcpu = cpu_first_thread_sibling(lcpu);
+	int nid = NUMA_NO_NODE;
+
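+	/*
+	 * Possible-but-not-present cpus are mapped to the first online
+	 * node so that cpu_to_node() never points at an offline node.
+	 */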
+	if (!cpu_present(lcpu)) {
+		set_cpu_numa_node(lcpu, first_online_node);
+		return first_online_node;
+	}
 
 	/*
 	 * If a valid cpu-to-node mapping is already available, use it
 	 * directly instead of querying the firmware, since it represents
 	 * the most recent mapping notified to us by the platform (eg: VPHN).
+	 * Since the cpu_to_node binding is the same for all threads in the
+	 * core, if a valid cpu-to-node mapping is already available for
+	 * the first thread in the core, use it.
 	 */
-	if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
+	nid = numa_cpu_lookup_table[fcpu];
+	if (nid >= 0) {
 		map_cpu_to_node(lcpu, nid);
 		return nid;
 	}
+
+	nid = vphn_get_nid(lcpu);
+	if (nid != NUMA_NO_NODE)
+		goto out_present;
 
 	cpu = of_get_cpu_node(lcpu, NULL);
 
@@ -479,13 +750,26 @@
 	}
 
 	nid = of_node_to_nid_single(cpu);
+	of_node_put(cpu);
 
 out_present:
 	if (nid < 0 || !node_possible(nid))
 		nid = first_online_node;
 
+	/*
+	 * Update for the first thread of the core. All threads of a core
+	 * have to be part of the same node. This not only avoids querying
+	 * for every other thread in the core, but also avoids a case
+	 * where a virtual node associativity change causes subsequent
+	 * threads of a core to be associated with different nids. However,
+	 * if the first thread is already online, expect it to have a valid
+	 * mapping.
+	 */
+	if (fcpu != lcpu) {
+		WARN_ON(cpu_online(fcpu));
+		map_cpu_to_node(fcpu, nid);
+	}
+
 	map_cpu_to_node(lcpu, nid);
-	of_node_put(cpu);
 out:
 	return nid;
 }
@@ -575,8 +859,9 @@
  * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
  * node. This assumes n_mem_{addr,size}_cells have been set.
  */
-static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
-					const __be32 **usm)
+static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
+				       const __be32 **usm,
+				       void *data)
 {
 	unsigned int ranges, is_kexec_kdump = 0;
 	unsigned long base, size, sz;
@@ -588,7 +873,7 @@
 	 */
 	if ((lmb->flags & DRCONF_MEM_RESERVED)
	    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
-		return;
+		return 0;
 
 	if (*usm)
 		is_kexec_kdump = 1;
@@ -600,7 +885,7 @@
 	if (is_kexec_kdump) {
 		ranges = read_usm_ranges(usm);
 		if (!ranges) /* there are no (base, size) tuples */
-			return;
+			return 0;
 	}
 
 	do {
@@ -609,7 +894,7 @@
 		size = read_n_cells(n_mem_size_cells, usm);
 	}
 
-	nid = of_drconf_to_nid_single(lmb);
+	nid = get_nid_and_numa_distance(lmb);
 	fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
 				  &nid);
 	node_set_online(nid);
@@ -617,6 +902,8 @@
 		if (sz)
 			memblock_set_node(base, sz, &memblock.memory, nid);
 	} while (--ranges);
+
+	return 0;
 }
 
622909 static int __init parse_numa_properties(void)
....@@ -624,18 +911,31 @@
624911 struct device_node *memory;
625912 int default_nid = 0;
626913 unsigned long i;
914
+ const __be32 *associativity;
627915
628916 if (numa_enabled == 0) {
629917 printk(KERN_WARNING "NUMA disabled by user\n");
630918 return -1;
631919 }
632920
633
- min_common_depth = find_min_common_depth();
921
+ primary_domain_index = find_primary_domain_index();
634922
635
- if (min_common_depth < 0)
636
- return min_common_depth;
923
+ if (primary_domain_index < 0) {
924
+ /*
925
+ * if we fail to parse primary_domain_index from device tree
926
+ * mark the numa disabled, boot with numa disabled.
927
+ */
928
+ numa_enabled = false;
929
+ return primary_domain_index;
930
+ }
637931
638
- dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
932
+ dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index);
933
+
934
+ /*
935
+ * If it is FORM2 initialize the distance table here.
936
+ */
937
+ if (affinity_form == FORM2_AFFINITY)
938
+ initialize_form2_numa_distance_lookup_table();
639939
640940 /*
641941 * Even though we connect cpus to numa domains later in SMP
@@ -643,22 +943,36 @@
 	 * each node to be onlined must have NODE_DATA etc backing it.
 	 */
 	for_each_present_cpu(i) {
+		__be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
 		struct device_node *cpu;
-		int nid;
+		int nid = NUMA_NO_NODE;
 
-		cpu = of_get_cpu_node(i, NULL);
-		BUG_ON(!cpu);
-		nid = of_node_to_nid_single(cpu);
-		of_node_put(cpu);
+		memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
 
-		/*
-		 * Don't fall back to default_nid yet -- we will plug
-		 * cpus into nodes once the memory scan has discovered
-		 * the topology.
-		 */
-		if (nid < 0)
-			continue;
-		node_set_online(nid);
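+		/*
+		 * Prefer the VPHN-reported associativity (it also covers
+		 * shared LPARs, whose device tree carries no node
+		 * associativity); fall back to the device tree otherwise.
+		 */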
+		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
+			nid = associativity_to_nid(vphn_assoc);
+			initialize_form1_numa_distance(vphn_assoc);
+		} else {
+			/*
+			 * Don't fall back to default_nid yet -- we will plug
+			 * cpus into nodes once the memory scan has discovered
+			 * the topology.
+			 */
+			cpu = of_get_cpu_node(i, NULL);
+			BUG_ON(!cpu);
+
+			associativity = of_get_associativity(cpu);
+			if (associativity) {
+				nid = associativity_to_nid(associativity);
+				initialize_form1_numa_distance(associativity);
+			}
+			of_node_put(cpu);
+		}
+
+		/* node_set_online() is undefined behaviour if 'nid' is negative */
+		if (likely(nid >= 0))
+			node_set_online(nid);
 	}
 
 	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
@@ -690,8 +1004,11 @@
 		 * have associativity properties. If none, then
 		 * everything goes to default_nid.
 		 */
-		nid = of_node_to_nid_single(memory);
-		if (nid < 0)
+		associativity = of_get_associativity(memory);
+		if (associativity) {
+			nid = associativity_to_nid(associativity);
+			initialize_form1_numa_distance(associativity);
+		} else
 			nid = default_nid;
 
 		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
@@ -712,7 +1029,7 @@
 	 */
 	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
 	if (memory) {
-		walk_drmem_lmbs(memory, numa_setup_drmem_lmb);
+		walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb);
 		of_node_put(memory);
 	}
@@ -725,17 +1042,14 @@
 	unsigned long total_ram = memblock_phys_mem_size();
 	unsigned long start_pfn, end_pfn;
 	unsigned int nid = 0;
-	struct memblock_region *reg;
+	int i;
 
 	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
 	       top_of_ram, total_ram);
 	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
 	       (top_of_ram - total_ram) >> 20);
 
-	for_each_memblock(memory, reg) {
-		start_pfn = memblock_region_memory_base_pfn(reg);
-		end_pfn = memblock_region_memory_end_pfn(reg);
-
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
 		fake_numa_create_new_node(end_pfn, &nid);
 		memblock_set_node(PFN_PHYS(start_pfn),
 				  PFN_PHYS(end_pfn - start_pfn),
@@ -749,7 +1063,7 @@
 	unsigned int node;
 	unsigned int cpu, count;
 
-	if (min_common_depth == -1 || !numa_enabled)
+	if (!numa_enabled)
 		return;
 
 	for_each_online_node(node) {
@@ -788,7 +1102,11 @@
 	void *nd;
 	int tnid;
 
-	nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+	if (!nd_pa)
+		panic("Cannot allocate %zu bytes for node %d data\n",
+		      nd_size, nid);
+
 	nd = __va(nd_pa);
 
 	/* report and initialize */
@@ -808,24 +1126,48 @@
 static void __init find_possible_nodes(void)
 {
 	struct device_node *rtas;
-	u32 numnodes, i;
+	const __be32 *domains = NULL;
+	int prop_length, max_nodes;
+	u32 i;
 
-	if (min_common_depth <= 0)
+	if (!numa_enabled)
 		return;
 
 	rtas = of_find_node_by_path("/rtas");
 	if (!rtas)
 		return;
 
-	if (of_property_read_u32_index(rtas,
-				"ibm,max-associativity-domains",
-				min_common_depth, &numnodes))
-		goto out;
+	/*
+	 * ibm,current-associativity-domains is a fairly recent property. If
+	 * it doesn't exist, then fall back to ibm,max-associativity-domains.
+	 * Current denotes what the platform can support compared to max,
+	 * which denotes what the Hypervisor can support.
+	 *
+	 * If the LPAR is migratable, new nodes might be activated after an
+	 * LPM, so we should consider the max number in that case.
+	 */
+	if (!of_get_property(of_root, "ibm,migratable-partition", NULL))
+		domains = of_get_property(rtas,
+					  "ibm,current-associativity-domains",
+					  &prop_length);
+	if (!domains) {
+		domains = of_get_property(rtas, "ibm,max-associativity-domains",
+					  &prop_length);
+		if (!domains)
+			goto out;
+	}
 
-	for (i = 0; i < numnodes; i++) {
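+	/*
+	 * The property holds one cell per associativity level, preceded by
+	 * a count; the cell at index primary_domain_index is the number of
+	 * domains at the node level, i.e. the possible NUMA node count.
+	 */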
+	max_nodes = of_read_number(&domains[primary_domain_index], 1);
+	pr_info("Partition configured for %d NUMA nodes.\n", max_nodes);
+
+	for (i = 0; i < max_nodes; i++) {
 		if (!node_possible(i))
 			node_set(i, node_possible_map);
 	}
+
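+	/*
+	 * Associativity levels below the node level imply that the
+	 * platform groups cpus into coregroups, so enable coregroup
+	 * support in that case.
+	 */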
+	prop_length /= sizeof(int);
+	if (prop_length > primary_domain_index + 2)
+		coregroup_enabled = 1;
 
 out:
 	of_node_put(rtas);
@@ -834,6 +1176,16 @@
 void __init mem_topology_setup(void)
 {
 	int cpu;
+
+	/*
+	 * Linux/mm assumes node 0 to be online at boot. However this is not
+	 * true on PowerPC, where node 0 is similar to any other node: it
+	 * could be a cpuless, memoryless node. So force node 0 to be offline
+	 * for now. This will prevent a cpuless, memoryless node 0 showing up
+	 * unnecessarily as online. If a node has cpus or memory that need
+	 * to be online, then the node will anyway be marked online.
+	 */
+	node_set_offline(0);
 
 	if (parse_numa_properties())
 		setup_nonnuma();
@@ -852,8 +1204,17 @@
 
 	reset_numa_cpu_lookup_table();
 
-	for_each_present_cpu(cpu)
+	for_each_possible_cpu(cpu) {
+		/*
+		 * Powerpc with CONFIG_NUMA always used to have a node 0,
+		 * even if it was memoryless or cpuless. For all cpus that
+		 * are possible but not present, cpu_to_node() would point
+		 * to node 0. To remove a cpuless, memoryless dummy node,
+		 * powerpc needs to make sure all possible but not present
+		 * cpu_to_node entries are set to a proper node.
+		 */
 		numa_setup_cpu(cpu);
+	}
 }
 
 void __init initmem_init(void)
@@ -870,7 +1231,6 @@
 
 		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
 		setup_node_data(nid, start_pfn, end_pfn);
-		sparse_memory_present_with_active_regions(nid);
 	}
 
 	sparse_init();
@@ -905,22 +1265,6 @@
 }
 early_param("numa", early_numa);
 
-static bool topology_updates_enabled = true;
-
-static int __init early_topology_updates(char *p)
-{
-	if (!p)
-		return 0;
-
-	if (!strcmp(p, "off")) {
-		pr_info("Disabling topology updates\n");
-		topology_updates_enabled = false;
-	}
-
-	return 0;
-}
-early_param("topology_updates", early_topology_updates);
-
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Find the node associated with a hot added memory section for
@@ -931,7 +1275,7 @@
 {
 	struct drmem_lmb *lmb;
 	unsigned long lmb_size;
-	int nid = -1;
+	int nid = NUMA_NO_NODE;
 
 	lmb_size = drmem_lmb_size();
 
@@ -961,7 +1305,7 @@
 static int hot_add_node_scn_to_nid(unsigned long scn_addr)
 {
 	struct device_node *memory;
-	int nid = -1;
+	int nid = NUMA_NO_NODE;
 
 	for_each_node_by_type(memory, "memory") {
 		unsigned long start, size;
@@ -1006,7 +1350,7 @@
 	struct device_node *memory = NULL;
 	int nid;
 
-	if (!numa_enabled || (min_common_depth < 0))
+	if (!numa_enabled)
 		return first_online_node;
 
 	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
@@ -1059,142 +1403,42 @@
 
 /* Virtual Processor Home Node (VPHN) support */
 #ifdef CONFIG_PPC_SPLPAR
-
-#include "vphn.h"
-
-struct topology_update_data {
-	struct topology_update_data *next;
-	unsigned int cpu;
-	int old_nid;
-	int new_nid;
-};
-
-#define TOPOLOGY_DEF_TIMER_SECS	60
-
-static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
-static cpumask_t cpu_associativity_changes_mask;
-static int vphn_enabled;
-static int prrn_enabled;
-static void reset_topology_timer(void);
-static int topology_timer_secs = 1;
 static int topology_inited;
-
-/*
- * Change polling interval for associativity changes.
- */
-int timed_topology_update(int nsecs)
-{
-	if (vphn_enabled) {
-		if (nsecs > 0)
-			topology_timer_secs = nsecs;
-		else
-			topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;
-
-		reset_topology_timer();
-	}
-
-	return 0;
-}
-
-/*
- * Store the current values of the associativity change counters in the
- * hypervisor.
- */
-static void setup_cpu_associativity_change_counters(void)
-{
-	int cpu;
-
-	/* The VPHN feature supports a maximum of 8 reference points */
-	BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);
-
-	for_each_possible_cpu(cpu) {
-		int i;
-		u8 *counts = vphn_cpu_change_counts[cpu];
-		volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;
-
-		for (i = 0; i < distance_ref_points_depth; i++)
-			counts[i] = hypervisor_counts[i];
-	}
-}
-
-/*
- * The hypervisor maintains a set of 8 associativity change counters in
- * the VPA of each cpu that correspond to the associativity levels in the
- * ibm,associativity-reference-points property. When an associativity
- * level changes, the corresponding counter is incremented.
- *
- * Set a bit in cpu_associativity_changes_mask for each cpu whose home
- * node associativity levels have changed.
- *
- * Returns the number of cpus with unhandled associativity changes.
- */
-static int update_cpu_associativity_changes_mask(void)
-{
-	int cpu;
-	cpumask_t *changes = &cpu_associativity_changes_mask;
-
-	for_each_possible_cpu(cpu) {
-		int i, changed = 0;
-		u8 *counts = vphn_cpu_change_counts[cpu];
-		volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;
-
-		for (i = 0; i < distance_ref_points_depth; i++) {
-			if (hypervisor_counts[i] != counts[i]) {
-				counts[i] = hypervisor_counts[i];
-				changed = 1;
-			}
-		}
-		if (changed) {
-			cpumask_or(changes, changes, cpu_sibling_mask(cpu));
-			cpu = cpu_last_thread_sibling(cpu);
-		}
-	}
-
-	return cpumask_weight(changes);
-}
 
 /*
  * Retrieve the new associativity information for a virtual processor's
  * home node.
  */
-static long hcall_vphn(unsigned long cpu, __be32 *associativity)
-{
-	long rc;
-	long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
-	u64 flags = 1;
-	int hwcpu = get_hard_smp_processor_id(cpu);
-
-	rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
-	vphn_unpack_associativity(retbuf, associativity);
-
-	return rc;
-}
-
 static long vphn_get_associativity(unsigned long cpu,
					__be32 *associativity)
 {
 	long rc;
 
-	rc = hcall_vphn(cpu, associativity);
+	rc = hcall_vphn(get_hard_smp_processor_id(cpu),
+			VPHN_FLAG_VCPU, associativity);
 
 	switch (rc) {
-	case H_FUNCTION:
-		printk_once(KERN_INFO
-			"VPHN is not supported. Disabling polling...\n");
-		stop_topology_update();
-		break;
-	case H_HARDWARE:
-		printk(KERN_ERR
-			"hcall_vphn() experienced a hardware fault "
-			"preventing VPHN. Disabling polling...\n");
-		stop_topology_update();
-		break;
 	case H_SUCCESS:
 		dbg("VPHN hcall succeeded. Reset polling...\n");
-		timed_topology_update(0);
+		goto out;
+
+	case H_FUNCTION:
+		pr_err_ratelimited("VPHN unsupported. Disabling polling...\n");
+		break;
+	case H_HARDWARE:
+		pr_err_ratelimited("hcall_vphn() experienced a hardware fault "
+				   "preventing VPHN. Disabling polling...\n");
+		break;
+	case H_PARAMETER:
+		pr_err_ratelimited("hcall_vphn() was passed an invalid parameter. "
				   "Disabling polling...\n");
+		break;
+	default:
+		pr_err_ratelimited("hcall_vphn() returned %ld. Disabling polling...\n",
				   rc);
 		break;
 	}
-
+out:
 	return rc;
 }
 
@@ -1237,383 +1481,33 @@
 	return new_nid;
 }
 
-/*
- * Update the CPU maps and sysfs entries for a single CPU when its NUMA
- * characteristics change. This function doesn't perform any locking and is
- * only safe to call from stop_machine().
- */
-static int update_cpu_topology(void *data)
+int cpu_to_coregroup_id(int cpu)
 {
-	struct topology_update_data *update;
-	unsigned long cpu;
+	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+	int index;
 
-	if (!data)
-		return -EINVAL;
+	if (cpu < 0 || cpu >= nr_cpu_ids)
+		return -1;
 
-	cpu = smp_processor_id();
-
-	for (update = data; update; update = update->next) {
-		int new_nid = update->new_nid;
-		if (cpu != update->cpu)
-			continue;
-
-		unmap_cpu_from_node(cpu);
-		map_cpu_to_node(cpu, new_nid);
-		set_cpu_numa_node(cpu, new_nid);
-		set_cpu_numa_mem(cpu, local_memory_node(new_nid));
-		vdso_getcpu_init();
-	}
-
-	return 0;
-}
-
-static int update_lookup_table(void *data)
-{
-	struct topology_update_data *update;
-
-	if (!data)
-		return -EINVAL;
-
-	/*
-	 * Upon topology update, the numa-cpu lookup table needs to be updated
-	 * for all threads in the core, including offline CPUs, to ensure that
-	 * future hotplug operations respect the cpu-to-node associativity
-	 * properly.
-	 */
-	for (update = data; update; update = update->next) {
-		int nid, base, j;
-
-		nid = update->new_nid;
-		base = cpu_first_thread_sibling(update->cpu);
-
-		for (j = 0; j < threads_per_core; j++) {
-			update_numa_cpu_lookup_table(base + j, nid);
-		}
-	}
-
-	return 0;
-}
-
-/*
- * Update the node maps and sysfs entries for each cpu whose home node
- * has changed. Returns 1 when the topology has changed, and 0 otherwise.
- *
- * cpus_locked says whether we already hold cpu_hotplug_lock.
- */
-int numa_update_cpu_topology(bool cpus_locked)
-{
-	unsigned int cpu, sibling, changed = 0;
-	struct topology_update_data *updates, *ud;
-	cpumask_t updated_cpus;
-	struct device *dev;
-	int weight, new_nid, i = 0;
-
-	if (!prrn_enabled && !vphn_enabled && topology_inited)
-		return 0;
-
-	weight = cpumask_weight(&cpu_associativity_changes_mask);
-	if (!weight)
-		return 0;
-
-	updates = kcalloc(weight, sizeof(*updates), GFP_KERNEL);
-	if (!updates)
-		return 0;
-
-	cpumask_clear(&updated_cpus);
-
-	for_each_cpu(cpu, &cpu_associativity_changes_mask) {
-		/*
-		 * If siblings aren't flagged for changes, updates list
-		 * will be too short. Skip on this update and set for next
-		 * update.
-		 */
-		if (!cpumask_subset(cpu_sibling_mask(cpu),
-					&cpu_associativity_changes_mask)) {
-			pr_info("Sibling bits not set for associativity "
-					"change, cpu%d\n", cpu);
-			cpumask_or(&cpu_associativity_changes_mask,
-					&cpu_associativity_changes_mask,
-					cpu_sibling_mask(cpu));
-			cpu = cpu_last_thread_sibling(cpu);
-			continue;
-		}
-
-		new_nid = find_and_online_cpu_nid(cpu);
-
-		if (new_nid == numa_cpu_lookup_table[cpu]) {
-			cpumask_andnot(&cpu_associativity_changes_mask,
-					&cpu_associativity_changes_mask,
-					cpu_sibling_mask(cpu));
-			dbg("Assoc chg gives same node %d for cpu%d\n",
-					new_nid, cpu);
-			cpu = cpu_last_thread_sibling(cpu);
-			continue;
-		}
-
-		for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
-			ud = &updates[i++];
-			ud->next = &updates[i];
-			ud->cpu = sibling;
-			ud->new_nid = new_nid;
-			ud->old_nid = numa_cpu_lookup_table[sibling];
-			cpumask_set_cpu(sibling, &updated_cpus);
-		}
-		cpu = cpu_last_thread_sibling(cpu);
-	}
-
-	/*
-	 * Prevent processing of 'updates' from overflowing array
-	 * where last entry filled in a 'next' pointer.
-	 */
-	if (i)
-		updates[i-1].next = NULL;
-
-	pr_debug("Topology update for the following CPUs:\n");
-	if (cpumask_weight(&updated_cpus)) {
-		for (ud = &updates[0]; ud; ud = ud->next) {
-			pr_debug("cpu %d moving from node %d "
-					"to %d\n", ud->cpu,
-					ud->old_nid, ud->new_nid);
-		}
-	}
-
-	/*
-	 * In cases where we have nothing to update (because the updates list
-	 * is too short or because the new topology is same as the old one),
-	 * skip invoking update_cpu_topology() via stop-machine(). This is
-	 * necessary (and not just a fast-path optimization) since stop-machine
-	 * can end up electing a random CPU to run update_cpu_topology(), and
-	 * thus trick us into setting up incorrect cpu-node mappings (since
-	 * 'updates' is kzalloc()'ed).
-	 *
-	 * And for the similar reason, we will skip all the following updating.
-	 */
-	if (!cpumask_weight(&updated_cpus))
+	if (!coregroup_enabled)
 		goto out;
 
-	if (cpus_locked)
-		stop_machine_cpuslocked(update_cpu_topology, &updates[0],
-					&updated_cpus);
-	else
-		stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
+	if (!firmware_has_feature(FW_FEATURE_VPHN))
+		goto out;
 
-	/*
-	 * Update the numa-cpu lookup table with the new mappings, even for
-	 * offline CPUs. It is best to perform this update from the stop-
-	 * machine context.
-	 */
-	if (cpus_locked)
-		stop_machine_cpuslocked(update_lookup_table, &updates[0],
-					cpumask_of(raw_smp_processor_id()));
-	else
-		stop_machine(update_lookup_table, &updates[0],
-			     cpumask_of(raw_smp_processor_id()));
+	if (vphn_get_associativity(cpu, associativity))
+		goto out;
 
-	for (ud = &updates[0]; ud; ud = ud->next) {
-		unregister_cpu_under_node(ud->cpu, ud->old_nid);
-		register_cpu_under_node(ud->cpu, ud->new_nid);
-
-		dev = get_cpu_device(ud->cpu);
-		if (dev)
-			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
-		cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
-		changed = 1;
-	}
+	index = of_read_number(associativity, 1);
+	if (index > primary_domain_index + 1)
+		return of_read_number(&associativity[index - 1], 1);
 
 out:
-	kfree(updates);
-	return changed;
+	return cpu_to_core_id(cpu);
 }
-
-int arch_update_cpu_topology(void)
-{
-	return numa_update_cpu_topology(true);
-}
-
-static void topology_work_fn(struct work_struct *work)
-{
-	rebuild_sched_domains();
-}
-static DECLARE_WORK(topology_work, topology_work_fn);
-
-static void topology_schedule_update(void)
-{
-	schedule_work(&topology_work);
-}
-
-static void topology_timer_fn(struct timer_list *unused)
-{
-	if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
-		topology_schedule_update();
-	else if (vphn_enabled) {
-		if (update_cpu_associativity_changes_mask() > 0)
-			topology_schedule_update();
-		reset_topology_timer();
-	}
-}
-static struct timer_list topology_timer;
-
-static void reset_topology_timer(void)
-{
-	if (vphn_enabled)
-		mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ);
-}
-
-#ifdef CONFIG_SMP
-
-static int dt_update_callback(struct notifier_block *nb,
-				unsigned long action, void *data)
-{
-	struct of_reconfig_data *update = data;
-	int rc = NOTIFY_DONE;
-
-	switch (action) {
-	case OF_RECONFIG_UPDATE_PROPERTY:
-		if (!of_prop_cmp(update->dn->type, "cpu") &&
-		    !of_prop_cmp(update->prop->name, "ibm,associativity")) {
-			u32 core_id;
-			of_property_read_u32(update->dn, "reg", &core_id);
-			rc = dlpar_cpu_readd(core_id);
-			rc = NOTIFY_OK;
-		}
-		break;
-	}
-
-	return rc;
-}
-
-static struct notifier_block dt_update_nb = {
-	.notifier_call = dt_update_callback,
-};
-
-#endif
-
-/*
- * Start polling for associativity changes.
- */
-int start_topology_update(void)
-{
-	int rc = 0;
-
-	if (!topology_updates_enabled)
-		return 0;
-
-	if (firmware_has_feature(FW_FEATURE_PRRN)) {
-		if (!prrn_enabled) {
-			prrn_enabled = 1;
-#ifdef CONFIG_SMP
-			rc = of_reconfig_notifier_register(&dt_update_nb);
-#endif
-		}
-	}
-	if (firmware_has_feature(FW_FEATURE_VPHN) &&
-	    lppaca_shared_proc(get_lppaca())) {
-		if (!vphn_enabled) {
-			vphn_enabled = 1;
-			setup_cpu_associativity_change_counters();
-			timer_setup(&topology_timer, topology_timer_fn,
-				    TIMER_DEFERRABLE);
-			reset_topology_timer();
-		}
-	}
-
-	return rc;
-}
-
-/*
- * Disable polling for VPHN associativity changes.
- */
-int stop_topology_update(void)
-{
-	int rc = 0;
-
-	if (!topology_updates_enabled)
-		return 0;
-
-	if (prrn_enabled) {
-		prrn_enabled = 0;
-#ifdef CONFIG_SMP
-		rc = of_reconfig_notifier_unregister(&dt_update_nb);
-#endif
-	}
-	if (vphn_enabled) {
-		vphn_enabled = 0;
-		rc = del_timer_sync(&topology_timer);
-	}
-
-	return rc;
-}
-
-int prrn_is_enabled(void)
-{
-	return prrn_enabled;
-}
-
-void __init shared_proc_topology_init(void)
-{
-	if (lppaca_shared_proc(get_lppaca())) {
-		bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
-			    nr_cpumask_bits);
-		numa_update_cpu_topology(false);
-	}
-}
-
-static int topology_read(struct seq_file *file, void *v)
-{
-	if (vphn_enabled || prrn_enabled)
-		seq_puts(file, "on\n");
-	else
-		seq_puts(file, "off\n");
-
-	return 0;
-}
-
-static int topology_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, topology_read, NULL);
-}
-
-static ssize_t topology_write(struct file *file, const char __user *buf,
-			      size_t count, loff_t *off)
-{
-	char kbuf[4]; /* "on" or "off" plus null. */
-	int read_len;
-
-	read_len = count < 3 ? count : 3;
-	if (copy_from_user(kbuf, buf, read_len))
-		return -EINVAL;
-
-	kbuf[read_len] = '\0';
-
-	if (!strncmp(kbuf, "on", 2)) {
-		topology_updates_enabled = true;
-		start_topology_update();
-	} else if (!strncmp(kbuf, "off", 3)) {
-		stop_topology_update();
-		topology_updates_enabled = false;
-	} else
-		return -EINVAL;
-
-	return count;
-}
-
-static const struct file_operations topology_ops = {
-	.read = seq_read,
-	.write = topology_write,
-	.open = topology_open,
-	.release = single_release
-};
 
 static int topology_update_init(void)
 {
-	start_topology_update();
-
-	if (vphn_enabled)
-		topology_schedule_update();
-
-	if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
-		return -ENOMEM;
-
 	topology_inited = 1;
 	return 0;
 }