hc
2024-05-11 297b60346df8beafee954a0fd7c2d64f33f3b9bc
kernel/arch/powerpc/mm/numa.c
....@@ -51,14 +51,22 @@
5151 EXPORT_SYMBOL(node_to_cpumask_map);
5252 EXPORT_SYMBOL(node_data);
5353
54
-static int min_common_depth;
54
+static int primary_domain_index;
5555 static int n_mem_addr_cells, n_mem_size_cells;
56
-static int form1_affinity;
56
+
57
+#define FORM0_AFFINITY 0
58
+#define FORM1_AFFINITY 1
59
+#define FORM2_AFFINITY 2
60
+static int affinity_form;
5761
5862 #define MAX_DISTANCE_REF_POINTS 4
5963 static int distance_ref_points_depth;
6064 static const __be32 *distance_ref_points;
6165 static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
66
+static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = {
67
+ [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 }
68
+};
69
+static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE };
6270
6371 /*
6472 * Allocate node_to_cpumask_map based on number of available nodes
....@@ -163,7 +171,55 @@
163171 }
164172 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
165173
166
-int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
174
+static int __associativity_to_nid(const __be32 *associativity,
175
+ int max_array_sz)
176
+{
177
+ int nid;
178
+ /*
179
+ * primary_domain_index is 1 based array index.
180
+ */
181
+ int index = primary_domain_index - 1;
182
+
183
+ if (!numa_enabled || index >= max_array_sz)
184
+ return NUMA_NO_NODE;
185
+
186
+ nid = of_read_number(&associativity[index], 1);
187
+
188
+ /* POWER4 LPAR uses 0xffff as invalid node */
189
+ if (nid == 0xffff || nid >= nr_node_ids)
190
+ nid = NUMA_NO_NODE;
191
+ return nid;
192
+}
193
+/*
194
+ * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
195
+ * info is found.
196
+ */
197
+static int associativity_to_nid(const __be32 *associativity)
198
+{
199
+ int array_sz = of_read_number(associativity, 1);
200
+
201
+ /* Skip the first element in the associativity array */
202
+ return __associativity_to_nid((associativity + 1), array_sz);
203
+}
204
+
205
+static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
206
+{
207
+ int dist;
208
+ int node1, node2;
209
+
210
+ node1 = associativity_to_nid(cpu1_assoc);
211
+ node2 = associativity_to_nid(cpu2_assoc);
212
+
213
+ dist = numa_distance_table[node1][node2];
214
+ if (dist <= LOCAL_DISTANCE)
215
+ return 0;
216
+ else if (dist <= REMOTE_DISTANCE)
217
+ return 1;
218
+ else
219
+ return 2;
220
+}
221
+
222
+static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
167223 {
168224 int dist = 0;
169225
....@@ -179,6 +235,15 @@
179235 return dist;
180236 }
181237
238
+int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
239
+{
240
+ /* We should not get called with FORM0 */
241
+ VM_WARN_ON(affinity_form == FORM0_AFFINITY);
242
+ if (affinity_form == FORM1_AFFINITY)
243
+ return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc);
244
+ return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc);
245
+}
246
+
182247 /* must hold reference to node during call */
183248 static const __be32 *of_get_associativity(struct device_node *dev)
184249 {
....@@ -190,7 +255,9 @@
190255 int i;
191256 int distance = LOCAL_DISTANCE;
192257
193
- if (!form1_affinity)
258
+ if (affinity_form == FORM2_AFFINITY)
259
+ return numa_distance_table[a][b];
260
+ else if (affinity_form == FORM0_AFFINITY)
194261 return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
195262
196263 for (i = 0; i < distance_ref_points_depth; i++) {
....@@ -204,52 +271,6 @@
204271 return distance;
205272 }
206273 EXPORT_SYMBOL(__node_distance);
207
-
208
-static void initialize_distance_lookup_table(int nid,
209
- const __be32 *associativity)
210
-{
211
- int i;
212
-
213
- if (!form1_affinity)
214
- return;
215
-
216
- for (i = 0; i < distance_ref_points_depth; i++) {
217
- const __be32 *entry;
218
-
219
- entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
220
- distance_lookup_table[nid][i] = of_read_number(entry, 1);
221
- }
222
-}
223
-
224
-/*
225
- * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
226
- * info is found.
227
- */
228
-static int associativity_to_nid(const __be32 *associativity)
229
-{
230
- int nid = NUMA_NO_NODE;
231
-
232
- if (!numa_enabled)
233
- goto out;
234
-
235
- if (of_read_number(associativity, 1) >= min_common_depth)
236
- nid = of_read_number(&associativity[min_common_depth], 1);
237
-
238
- /* POWER4 LPAR uses 0xffff as invalid node */
239
- if (nid == 0xffff || nid >= nr_node_ids)
240
- nid = NUMA_NO_NODE;
241
-
242
- if (nid > 0 &&
243
- of_read_number(associativity, 1) >= distance_ref_points_depth) {
244
- /*
245
- * Skip the length field and send start of associativity array
246
- */
247
- initialize_distance_lookup_table(nid, associativity + 1);
248
- }
249
-
250
-out:
251
- return nid;
252
-}
253274
254275 /* Returns the nid associated with the given device tree node,
255276 * or -1 if not found.
....@@ -284,10 +305,159 @@
284305 }
285306 EXPORT_SYMBOL(of_node_to_nid);
286307
287
-static int __init find_min_common_depth(void)
308
+static void __initialize_form1_numa_distance(const __be32 *associativity,
309
+ int max_array_sz)
288310 {
289
- int depth;
311
+ int i, nid;
312
+
313
+ if (affinity_form != FORM1_AFFINITY)
314
+ return;
315
+
316
+ nid = __associativity_to_nid(associativity, max_array_sz);
317
+ if (nid != NUMA_NO_NODE) {
318
+ for (i = 0; i < distance_ref_points_depth; i++) {
319
+ const __be32 *entry;
320
+ int index = be32_to_cpu(distance_ref_points[i]) - 1;
321
+
322
+ /*
323
+ * broken hierarchy, return with broken distance table
324
+ */
325
+ if (WARN(index >= max_array_sz, "Broken ibm,associativity property"))
326
+ return;
327
+
328
+ entry = &associativity[index];
329
+ distance_lookup_table[nid][i] = of_read_number(entry, 1);
330
+ }
331
+ }
332
+}
333
+
334
+static void initialize_form1_numa_distance(const __be32 *associativity)
335
+{
336
+ int array_sz;
337
+
338
+ array_sz = of_read_number(associativity, 1);
339
+ /* Skip the first element in the associativity array */
340
+ __initialize_form1_numa_distance(associativity + 1, array_sz);
341
+}
342
+
343
+/*
344
+ * Used to update distance information w.r.t newly added node.
345
+ */
346
+void update_numa_distance(struct device_node *node)
347
+{
348
+ int nid;
349
+
350
+ if (affinity_form == FORM0_AFFINITY)
351
+ return;
352
+ else if (affinity_form == FORM1_AFFINITY) {
353
+ const __be32 *associativity;
354
+
355
+ associativity = of_get_associativity(node);
356
+ if (!associativity)
357
+ return;
358
+
359
+ initialize_form1_numa_distance(associativity);
360
+ return;
361
+ }
362
+
363
+ /* FORM2 affinity */
364
+ nid = of_node_to_nid_single(node);
365
+ if (nid == NUMA_NO_NODE)
366
+ return;
367
+
368
+ /*
369
+ * With FORM2 we expect NUMA distance of all possible NUMA
370
+ * nodes to be provided during boot.
371
+ */
372
+ WARN(numa_distance_table[nid][nid] == -1,
373
+ "NUMA distance details for node %d not provided\n", nid);
374
+}
375
+EXPORT_SYMBOL_GPL(update_numa_distance);
376
+
377
+/*
378
+ * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN}
379
+ * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements}
380
+ */
381
+static void initialize_form2_numa_distance_lookup_table(void)
382
+{
383
+ int i, j;
290384 struct device_node *root;
385
+ const __u8 *numa_dist_table;
386
+ const __be32 *numa_lookup_index;
387
+ int numa_dist_table_length;
388
+ int max_numa_index, distance_index;
389
+
390
+ if (firmware_has_feature(FW_FEATURE_OPAL))
391
+ root = of_find_node_by_path("/ibm,opal");
392
+ else
393
+ root = of_find_node_by_path("/rtas");
394
+ if (!root)
395
+ root = of_find_node_by_path("/");
396
+
397
+ numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL);
398
+ max_numa_index = of_read_number(&numa_lookup_index[0], 1);
399
+
400
+ /* first element of the array is the size and is encode-int */
401
+ numa_dist_table = of_get_property(root, "ibm,numa-distance-table", NULL);
402
+ numa_dist_table_length = of_read_number((const __be32 *)&numa_dist_table[0], 1);
403
+ /* Skip the size which is encoded int */
404
+ numa_dist_table += sizeof(__be32);
405
+
406
+ pr_debug("numa_dist_table_len = %d, numa_dist_indexes_len = %d\n",
407
+ numa_dist_table_length, max_numa_index);
408
+
409
+ for (i = 0; i < max_numa_index; i++)
410
+ /* +1 skip the max_numa_index in the property */
411
+ numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1);
412
+
413
+
414
+ if (numa_dist_table_length != max_numa_index * max_numa_index) {
415
+ WARN(1, "Wrong NUMA distance information\n");
416
+ /* consider everybody else just remote. */
417
+ for (i = 0; i < max_numa_index; i++) {
418
+ for (j = 0; j < max_numa_index; j++) {
419
+ int nodeA = numa_id_index_table[i];
420
+ int nodeB = numa_id_index_table[j];
421
+
422
+ if (nodeA == nodeB)
423
+ numa_distance_table[nodeA][nodeB] = LOCAL_DISTANCE;
424
+ else
425
+ numa_distance_table[nodeA][nodeB] = REMOTE_DISTANCE;
426
+ }
427
+ }
428
+ }
429
+
430
+ distance_index = 0;
431
+ for (i = 0; i < max_numa_index; i++) {
432
+ for (j = 0; j < max_numa_index; j++) {
433
+ int nodeA = numa_id_index_table[i];
434
+ int nodeB = numa_id_index_table[j];
435
+
436
+ numa_distance_table[nodeA][nodeB] = numa_dist_table[distance_index++];
437
+ pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, numa_distance_table[nodeA][nodeB]);
438
+ }
439
+ }
440
+ of_node_put(root);
441
+}
442
+
443
+static int __init find_primary_domain_index(void)
444
+{
445
+ int index;
446
+ struct device_node *root;
447
+
448
+ /*
449
+ * Check for which form of affinity.
450
+ */
451
+ if (firmware_has_feature(FW_FEATURE_OPAL)) {
452
+ affinity_form = FORM1_AFFINITY;
453
+ } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
454
+ dbg("Using form 2 affinity\n");
455
+ affinity_form = FORM2_AFFINITY;
456
+ } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
457
+ dbg("Using form 1 affinity\n");
458
+ affinity_form = FORM1_AFFINITY;
459
+ } else
460
+ affinity_form = FORM0_AFFINITY;
291461
292462 if (firmware_has_feature(FW_FEATURE_OPAL))
293463 root = of_find_node_by_path("/ibm,opal");
....@@ -318,25 +488,21 @@
318488 }
319489
320490 distance_ref_points_depth /= sizeof(int);
321
-
322
- if (firmware_has_feature(FW_FEATURE_OPAL) ||
323
- firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
324
- dbg("Using form 1 affinity\n");
325
- form1_affinity = 1;
326
- }
327
-
328
- if (form1_affinity) {
329
- depth = of_read_number(distance_ref_points, 1);
330
- } else {
491
+ if (affinity_form == FORM0_AFFINITY) {
331492 if (distance_ref_points_depth < 2) {
332493 printk(KERN_WARNING "NUMA: "
333
- "short ibm,associativity-reference-points\n");
494
+ "short ibm,associativity-reference-points\n");
334495 goto err;
335496 }
336497
337
- depth = of_read_number(&distance_ref_points[1], 1);
498
+ index = of_read_number(&distance_ref_points[1], 1);
499
+ } else {
500
+ /*
501
+ * Both FORM1 and FORM2 affinity find the primary domain details
502
+ * at the same offset.
503
+ */
504
+ index = of_read_number(distance_ref_points, 1);
338505 }
339
-
340506 /*
341507 * Warn and cap if the hardware supports more than
342508 * MAX_DISTANCE_REF_POINTS domains.
....@@ -348,7 +514,7 @@
348514 }
349515
350516 of_node_put(root);
351
- return depth;
517
+ return index;
352518
353519 err:
354520 of_node_put(root);
....@@ -426,6 +592,38 @@
426592 return 0;
427593 }
428594
595
+static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
596
+{
597
+ struct assoc_arrays aa = { .arrays = NULL };
598
+ int default_nid = NUMA_NO_NODE;
599
+ int nid = default_nid;
600
+ int rc, index;
601
+
602
+ if ((primary_domain_index < 0) || !numa_enabled)
603
+ return default_nid;
604
+
605
+ rc = of_get_assoc_arrays(&aa);
606
+ if (rc)
607
+ return default_nid;
608
+
609
+ if (primary_domain_index <= aa.array_sz &&
610
+ !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
611
+ const __be32 *associativity;
612
+
613
+ index = lmb->aa_index * aa.array_sz;
614
+ associativity = &aa.arrays[index];
615
+ nid = __associativity_to_nid(associativity, aa.array_sz);
616
+ if (nid > 0 && affinity_form == FORM1_AFFINITY) {
617
+ /*
618
+ * lookup array associativity entries have
619
+ * no length of the array as the first element.
620
+ */
621
+ __initialize_form1_numa_distance(associativity, aa.array_sz);
622
+ }
623
+ }
624
+ return nid;
625
+}
626
+
429627 /*
430628 * This is like of_node_to_nid_single() for memory represented in the
431629 * ibm,dynamic-reconfiguration-memory node.
....@@ -437,35 +635,28 @@
437635 int nid = default_nid;
438636 int rc, index;
439637
440
- if ((min_common_depth < 0) || !numa_enabled)
638
+ if ((primary_domain_index < 0) || !numa_enabled)
441639 return default_nid;
442640
443641 rc = of_get_assoc_arrays(&aa);
444642 if (rc)
445643 return default_nid;
446644
447
- if (min_common_depth <= aa.array_sz &&
645
+ if (primary_domain_index <= aa.array_sz &&
448646 !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
449
- index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
450
- nid = of_read_number(&aa.arrays[index], 1);
647
+ const __be32 *associativity;
451648
452
- if (nid == 0xffff || nid >= nr_node_ids)
453
- nid = default_nid;
454
-
455
- if (nid > 0) {
456
- index = lmb->aa_index * aa.array_sz;
457
- initialize_distance_lookup_table(nid,
458
- &aa.arrays[index]);
459
- }
649
+ index = lmb->aa_index * aa.array_sz;
650
+ associativity = &aa.arrays[index];
651
+ nid = __associativity_to_nid(associativity, aa.array_sz);
460652 }
461
-
462653 return nid;
463654 }
464655
465656 #ifdef CONFIG_PPC_SPLPAR
466
-static int vphn_get_nid(long lcpu)
657
+
658
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
467659 {
468
- __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
469660 long rc, hwid;
470661
471662 /*
....@@ -485,12 +676,30 @@
485676
486677 rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
487678 if (rc == H_SUCCESS)
488
- return associativity_to_nid(associativity);
679
+ return 0;
489680 }
490681
682
+ return -1;
683
+}
684
+
685
+static int vphn_get_nid(long lcpu)
686
+{
687
+ __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
688
+
689
+
690
+ if (!__vphn_get_associativity(lcpu, associativity))
691
+ return associativity_to_nid(associativity);
692
+
491693 return NUMA_NO_NODE;
694
+
492695 }
493696 #else
697
+
698
+static int __vphn_get_associativity(long lcpu, __be32 *associativity)
699
+{
700
+ return -1;
701
+}
702
+
494703 static int vphn_get_nid(long unused)
495704 {
496705 return NUMA_NO_NODE;
....@@ -685,7 +894,7 @@
685894 size = read_n_cells(n_mem_size_cells, usm);
686895 }
687896
688
- nid = of_drconf_to_nid_single(lmb);
897
+ nid = get_nid_and_numa_distance(lmb);
689898 fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
690899 &nid);
691900 node_set_online(nid);
....@@ -702,24 +911,31 @@
702911 struct device_node *memory;
703912 int default_nid = 0;
704913 unsigned long i;
914
+ const __be32 *associativity;
705915
706916 if (numa_enabled == 0) {
707917 printk(KERN_WARNING "NUMA disabled by user\n");
708918 return -1;
709919 }
710920
711
- min_common_depth = find_min_common_depth();
921
+ primary_domain_index = find_primary_domain_index();
712922
713
- if (min_common_depth < 0) {
923
+ if (primary_domain_index < 0) {
714924 /*
715
- * if we fail to parse min_common_depth from device tree
925
+ * if we fail to parse primary_domain_index from device tree
716926 * mark the numa disabled, boot with numa disabled.
717927 */
718928 numa_enabled = false;
719
- return min_common_depth;
929
+ return primary_domain_index;
720930 }
721931
722
- dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
932
+ dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index);
933
+
934
+ /*
935
+ * If it is FORM2 initialize the distance table here.
936
+ */
937
+ if (affinity_form == FORM2_AFFINITY)
938
+ initialize_form2_numa_distance_lookup_table();
723939
724940 /*
725941 * Even though we connect cpus to numa domains later in SMP
....@@ -727,18 +943,30 @@
727943 * each node to be onlined must have NODE_DATA etc backing it.
728944 */
729945 for_each_present_cpu(i) {
946
+ __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
730947 struct device_node *cpu;
731
- int nid = vphn_get_nid(i);
948
+ int nid = NUMA_NO_NODE;
732949
733
- /*
734
- * Don't fall back to default_nid yet -- we will plug
735
- * cpus into nodes once the memory scan has discovered
736
- * the topology.
737
- */
738
- if (nid == NUMA_NO_NODE) {
950
+ memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
951
+
952
+ if (__vphn_get_associativity(i, vphn_assoc) == 0) {
953
+ nid = associativity_to_nid(vphn_assoc);
954
+ initialize_form1_numa_distance(vphn_assoc);
955
+ } else {
956
+
957
+ /*
958
+ * Don't fall back to default_nid yet -- we will plug
959
+ * cpus into nodes once the memory scan has discovered
960
+ * the topology.
961
+ */
739962 cpu = of_get_cpu_node(i, NULL);
740963 BUG_ON(!cpu);
741
- nid = of_node_to_nid_single(cpu);
964
+
965
+ associativity = of_get_associativity(cpu);
966
+ if (associativity) {
967
+ nid = associativity_to_nid(associativity);
968
+ initialize_form1_numa_distance(associativity);
969
+ }
742970 of_node_put(cpu);
743971 }
744972
....@@ -776,8 +1004,11 @@
7761004 * have associativity properties. If none, then
7771005 * everything goes to default_nid.
7781006 */
779
- nid = of_node_to_nid_single(memory);
780
- if (nid < 0)
1007
+ associativity = of_get_associativity(memory);
1008
+ if (associativity) {
1009
+ nid = associativity_to_nid(associativity);
1010
+ initialize_form1_numa_distance(associativity);
1011
+ } else
7811012 nid = default_nid;
7821013
7831014 fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
....@@ -926,7 +1157,7 @@
9261157 goto out;
9271158 }
9281159
929
- max_nodes = of_read_number(&domains[min_common_depth], 1);
1160
+ max_nodes = of_read_number(&domains[primary_domain_index], 1);
9301161 pr_info("Partition configured for %d NUMA nodes.\n", max_nodes);
9311162
9321163 for (i = 0; i < max_nodes; i++) {
....@@ -935,7 +1166,7 @@
9351166 }
9361167
9371168 prop_length /= sizeof(int);
938
- if (prop_length > min_common_depth + 2)
1169
+ if (prop_length > primary_domain_index + 2)
9391170 coregroup_enabled = 1;
9401171
9411172 out:
....@@ -1268,7 +1499,7 @@
12681499 goto out;
12691500
12701501 index = of_read_number(associativity, 1);
1271
- if (index > min_common_depth + 1)
1502
+ if (index > primary_domain_index + 1)
12721503 return of_read_number(&associativity[index - 1], 1);
12731504
12741505 out: