forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-06 08f87f769b595151be1afeff53e144f543faa614
--- a/kernel/arch/powerpc/mm/numa.c
+++ b/kernel/arch/powerpc/mm/numa.c
@@ -1,17 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * pSeries NUMA support
  *
  * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 #define pr_fmt(fmt) "numa: " fmt
 
 #include <linux/threads.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
@@ -19,7 +15,6 @@
 #include <linux/nodemask.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
-#include <linux/memblock.h>
 #include <linux/of.h>
 #include <linux/pfn.h>
 #include <linux/cpuset.h>
@@ -33,7 +28,6 @@
 #include <asm/sparsemem.h>
 #include <asm/prom.h>
 #include <asm/smp.h>
-#include <asm/cputhreads.h>
 #include <asm/topology.h>
 #include <asm/firmware.h>
 #include <asm/paca.h>
@@ -85,7 +79,7 @@
 		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
 
 	/* cpumask_of_node() will now work */
-	dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
+	dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
 }
 
 static int __init fake_numa_create_new_node(unsigned long end_pfn,
@@ -169,6 +163,22 @@
 }
 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
 
+int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+	int dist = 0;
+
+	int i, index;
+
+	for (i = 0; i < distance_ref_points_depth; i++) {
+		index = be32_to_cpu(distance_ref_points[i]);
+		if (cpu1_assoc[index] == cpu2_assoc[index])
+			break;
+		dist++;
+	}
+
+	return dist;
+}
+
 /* must hold reference to node during call */
 static const __be32 *of_get_associativity(struct device_node *dev)
 {
@@ -211,22 +221,23 @@
 	}
 }
 
-/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
+/*
+ * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
  * info is found.
  */
 static int associativity_to_nid(const __be32 *associativity)
 {
-	int nid = -1;
+	int nid = NUMA_NO_NODE;
 
-	if (min_common_depth == -1)
+	if (!numa_enabled)
 		goto out;
 
 	if (of_read_number(associativity, 1) >= min_common_depth)
 		nid = of_read_number(&associativity[min_common_depth], 1);
 
 	/* POWER4 LPAR uses 0xffff as invalid node */
-	if (nid == 0xffff || nid >= MAX_NUMNODES)
-		nid = -1;
+	if (nid == 0xffff || nid >= nr_node_ids)
+		nid = NUMA_NO_NODE;
 
 	if (nid > 0 &&
 	    of_read_number(associativity, 1) >= distance_ref_points_depth) {
@@ -245,7 +256,7 @@
  */
 static int of_node_to_nid_single(struct device_node *device)
 {
-	int nid = -1;
+	int nid = NUMA_NO_NODE;
 	const __be32 *tmp;
 
 	tmp = of_get_associativity(device);
@@ -257,7 +268,7 @@
 /* Walk the device tree upwards, looking for an associativity id */
 int of_node_to_nid(struct device_node *device)
 {
-	int nid = -1;
+	int nid = NUMA_NO_NODE;
 
 	of_node_get(device);
 	while (device) {
@@ -419,24 +430,26 @@
  * This is like of_node_to_nid_single() for memory represented in the
  * ibm,dynamic-reconfiguration-memory node.
  */
-static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
+int of_drconf_to_nid_single(struct drmem_lmb *lmb)
 {
 	struct assoc_arrays aa = { .arrays = NULL };
-	int default_nid = 0;
+	int default_nid = NUMA_NO_NODE;
 	int nid = default_nid;
 	int rc, index;
+
+	if ((min_common_depth < 0) || !numa_enabled)
+		return default_nid;
 
 	rc = of_get_assoc_arrays(&aa);
 	if (rc)
 		return default_nid;
 
-	if (min_common_depth > 0 && min_common_depth <= aa.array_sz &&
-	    !(lmb->flags & DRCONF_MEM_AI_INVALID) &&
-	    lmb->aa_index < aa.n_arrays) {
+	if (min_common_depth <= aa.array_sz &&
+	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
 		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
 		nid = of_read_number(&aa.arrays[index], 1);
 
-		if (nid == 0xffff || nid >= MAX_NUMNODES)
+		if (nid == 0xffff || nid >= nr_node_ids)
 			nid = default_nid;
 
 		if (nid > 0) {
@@ -449,24 +462,73 @@
 	return nid;
 }
 
+#ifdef CONFIG_PPC_SPLPAR
+static int vphn_get_nid(long lcpu)
+{
+	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+	long rc, hwid;
+
+	/*
+	 * On a shared lpar, device tree will not have node associativity.
+	 * At this time lppaca, or its __old_status field may not be
+	 * updated. Hence kernel cannot detect if its on a shared lpar. So
+	 * request an explicit associativity irrespective of whether the
+	 * lpar is shared or dedicated. Use the device tree property as a
+	 * fallback. cpu_to_phys_id is only valid between
+	 * smp_setup_cpu_maps() and smp_setup_pacas().
+	 */
+	if (firmware_has_feature(FW_FEATURE_VPHN)) {
+		if (cpu_to_phys_id)
+			hwid = cpu_to_phys_id[lcpu];
+		else
+			hwid = get_hard_smp_processor_id(lcpu);
+
+		rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
+		if (rc == H_SUCCESS)
+			return associativity_to_nid(associativity);
+	}
+
+	return NUMA_NO_NODE;
+}
+#else
+static int vphn_get_nid(long unused)
+{
+	return NUMA_NO_NODE;
+}
+#endif /* CONFIG_PPC_SPLPAR */
+
 /*
  * Figure out to which domain a cpu belongs and stick it there.
  * Return the id of the domain used.
 */
 static int numa_setup_cpu(unsigned long lcpu)
 {
-	int nid = -1;
 	struct device_node *cpu;
+	int fcpu = cpu_first_thread_sibling(lcpu);
+	int nid = NUMA_NO_NODE;
+
+	if (!cpu_present(lcpu)) {
+		set_cpu_numa_node(lcpu, first_online_node);
+		return first_online_node;
+	}
 
 	/*
 	 * If a valid cpu-to-node mapping is already available, use it
 	 * directly instead of querying the firmware, since it represents
 	 * the most recent mapping notified to us by the platform (eg: VPHN).
+	 * Since cpu_to_node binding remains the same for all threads in the
+	 * core. If a valid cpu-to-node mapping is already available, for
+	 * the first thread in the core, use it.
 	 */
-	if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
+	nid = numa_cpu_lookup_table[fcpu];
+	if (nid >= 0) {
 		map_cpu_to_node(lcpu, nid);
 		return nid;
 	}
+
+	nid = vphn_get_nid(lcpu);
+	if (nid != NUMA_NO_NODE)
+		goto out_present;
 
 	cpu = of_get_cpu_node(lcpu, NULL);
 
@@ -479,13 +541,26 @@
 	}
 
 	nid = of_node_to_nid_single(cpu);
+	of_node_put(cpu);
 
 out_present:
 	if (nid < 0 || !node_possible(nid))
 		nid = first_online_node;
 
+	/*
+	 * Update for the first thread of the core. All threads of a core
+	 * have to be part of the same node. This not only avoids querying
+	 * for every other thread in the core, but always avoids a case
+	 * where virtual node associativity change causes subsequent threads
+	 * of a core to be associated with different nid. However if first
+	 * thread is already online, expect it to have a valid mapping.
+	 */
+	if (fcpu != lcpu) {
+		WARN_ON(cpu_online(fcpu));
+		map_cpu_to_node(fcpu, nid);
+	}
+
 	map_cpu_to_node(lcpu, nid);
-	of_node_put(cpu);
 out:
 	return nid;
 }
@@ -575,8 +650,9 @@
  * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
  * node. This assumes n_mem_{addr,size}_cells have been set.
  */
-static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
-					const __be32 **usm)
+static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
+					const __be32 **usm,
+					void *data)
 {
 	unsigned int ranges, is_kexec_kdump = 0;
 	unsigned long base, size, sz;
@@ -588,7 +664,7 @@
 	 */
 	if ((lmb->flags & DRCONF_MEM_RESERVED)
 	    || !(lmb->flags & DRCONF_MEM_ASSIGNED))
-		return;
+		return 0;
 
 	if (*usm)
 		is_kexec_kdump = 1;
@@ -600,7 +676,7 @@
 	if (is_kexec_kdump) {
 		ranges = read_usm_ranges(usm);
 		if (!ranges) /* there are no (base, size) duple */
-			return;
+			return 0;
 	}
 
 	do {
@@ -617,6 +693,8 @@
 		if (sz)
 			memblock_set_node(base, sz, &memblock.memory, nid);
 	} while (--ranges);
+
+	return 0;
 }
 
 static int __init parse_numa_properties(void)
@@ -632,8 +710,14 @@
 
 	min_common_depth = find_min_common_depth();
 
-	if (min_common_depth < 0)
+	if (min_common_depth < 0) {
+		/*
+		 * if we fail to parse min_common_depth from device tree
+		 * mark the numa disabled, boot with numa disabled.
+		 */
+		numa_enabled = false;
 		return min_common_depth;
+	}
 
 	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
 
@@ -644,21 +728,23 @@
 	 */
 	for_each_present_cpu(i) {
 		struct device_node *cpu;
-		int nid;
-
-		cpu = of_get_cpu_node(i, NULL);
-		BUG_ON(!cpu);
-		nid = of_node_to_nid_single(cpu);
-		of_node_put(cpu);
+		int nid = vphn_get_nid(i);
 
 		/*
 		 * Don't fall back to default_nid yet -- we will plug
 		 * cpus into nodes once the memory scan has discovered
 		 * the topology.
 		 */
-		if (nid < 0)
-			continue;
-		node_set_online(nid);
+		if (nid == NUMA_NO_NODE) {
+			cpu = of_get_cpu_node(i, NULL);
+			BUG_ON(!cpu);
+			nid = of_node_to_nid_single(cpu);
+			of_node_put(cpu);
+		}
+
+		/* node_set_online() is an UB if 'nid' is negative */
+		if (likely(nid >= 0))
+			node_set_online(nid);
 	}
 
 	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
@@ -712,7 +798,7 @@
 	 */
 	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
 	if (memory) {
-		walk_drmem_lmbs(memory, numa_setup_drmem_lmb);
+		walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb);
 		of_node_put(memory);
 	}
 
@@ -725,17 +811,14 @@
 	unsigned long total_ram = memblock_phys_mem_size();
 	unsigned long start_pfn, end_pfn;
 	unsigned int nid = 0;
-	struct memblock_region *reg;
+	int i;
 
 	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
 	       top_of_ram, total_ram);
 	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
 	       (top_of_ram - total_ram) >> 20);
 
-	for_each_memblock(memory, reg) {
-		start_pfn = memblock_region_memory_base_pfn(reg);
-		end_pfn = memblock_region_memory_end_pfn(reg);
-
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
 		fake_numa_create_new_node(end_pfn, &nid);
 		memblock_set_node(PFN_PHYS(start_pfn),
 				  PFN_PHYS(end_pfn - start_pfn),
@@ -749,7 +832,7 @@
 	unsigned int node;
 	unsigned int cpu, count;
 
-	if (min_common_depth == -1 || !numa_enabled)
+	if (!numa_enabled)
 		return;
 
 	for_each_online_node(node) {
@@ -788,7 +871,11 @@
 	void *nd;
 	int tnid;
 
-	nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+	if (!nd_pa)
+		panic("Cannot allocate %zu bytes for node %d data\n",
+		      nd_size, nid);
+
 	nd = __va(nd_pa);
 
 	/* report and initialize */
@@ -808,24 +895,48 @@
 static void __init find_possible_nodes(void)
 {
 	struct device_node *rtas;
-	u32 numnodes, i;
+	const __be32 *domains = NULL;
+	int prop_length, max_nodes;
+	u32 i;
 
-	if (min_common_depth <= 0)
+	if (!numa_enabled)
 		return;
 
 	rtas = of_find_node_by_path("/rtas");
 	if (!rtas)
 		return;
 
-	if (of_property_read_u32_index(rtas,
-				"ibm,max-associativity-domains",
-				min_common_depth, &numnodes))
-		goto out;
+	/*
+	 * ibm,current-associativity-domains is a fairly recent property. If
+	 * it doesn't exist, then fallback on ibm,max-associativity-domains.
+	 * Current denotes what the platform can support compared to max
+	 * which denotes what the Hypervisor can support.
+	 *
+	 * If the LPAR is migratable, new nodes might be activated after a LPM,
+	 * so we should consider the max number in that case.
+	 */
+	if (!of_get_property(of_root, "ibm,migratable-partition", NULL))
+		domains = of_get_property(rtas,
+					  "ibm,current-associativity-domains",
+					  &prop_length);
+	if (!domains) {
+		domains = of_get_property(rtas, "ibm,max-associativity-domains",
+					  &prop_length);
+		if (!domains)
+			goto out;
+	}
 
-	for (i = 0; i < numnodes; i++) {
+	max_nodes = of_read_number(&domains[min_common_depth], 1);
+	pr_info("Partition configured for %d NUMA nodes.\n", max_nodes);
+
+	for (i = 0; i < max_nodes; i++) {
 		if (!node_possible(i))
 			node_set(i, node_possible_map);
 	}
+
+	prop_length /= sizeof(int);
+	if (prop_length > min_common_depth + 2)
+		coregroup_enabled = 1;
 
 out:
 	of_node_put(rtas);
@@ -834,6 +945,16 @@
 void __init mem_topology_setup(void)
 {
 	int cpu;
+
+	/*
+	 * Linux/mm assumes node 0 to be online at boot. However this is not
+	 * true on PowerPC, where node 0 is similar to any other node, it
+	 * could be cpuless, memoryless node. So force node 0 to be offline
+	 * for now. This will prevent cpuless, memoryless node 0 showing up
+	 * unnecessarily as online. If a node has cpus or memory that need
+	 * to be online, then node will anyway be marked online.
+	 */
+	node_set_offline(0);
 
 	if (parse_numa_properties())
 		setup_nonnuma();
@@ -852,8 +973,17 @@
 
 	reset_numa_cpu_lookup_table();
 
-	for_each_present_cpu(cpu)
+	for_each_possible_cpu(cpu) {
+		/*
+		 * Powerpc with CONFIG_NUMA always used to have a node 0,
+		 * even if it was memoryless or cpuless. For all cpus that
+		 * are possible but not present, cpu_to_node() would point
+		 * to node 0. To remove a cpuless, memoryless dummy node,
+		 * powerpc need to make sure all possible but not present
+		 * cpu_to_node are set to a proper node.
+		 */
 		numa_setup_cpu(cpu);
+	}
 }
 
 void __init initmem_init(void)
@@ -870,7 +1000,6 @@
 
 		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
 		setup_node_data(nid, start_pfn, end_pfn);
-		sparse_memory_present_with_active_regions(nid);
 	}
 
 	sparse_init();
@@ -905,22 +1034,6 @@
 }
 early_param("numa", early_numa);
 
-static bool topology_updates_enabled = true;
-
-static int __init early_topology_updates(char *p)
-{
-	if (!p)
-		return 0;
-
-	if (!strcmp(p, "off")) {
-		pr_info("Disabling topology updates\n");
-		topology_updates_enabled = false;
-	}
-
-	return 0;
-}
-early_param("topology_updates", early_topology_updates);
-
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Find the node associated with a hot added memory section for
@@ -931,7 +1044,7 @@
 {
 	struct drmem_lmb *lmb;
 	unsigned long lmb_size;
-	int nid = -1;
+	int nid = NUMA_NO_NODE;
 
 	lmb_size = drmem_lmb_size();
 
@@ -961,7 +1074,7 @@
 static int hot_add_node_scn_to_nid(unsigned long scn_addr)
 {
 	struct device_node *memory;
-	int nid = -1;
+	int nid = NUMA_NO_NODE;
 
 	for_each_node_by_type(memory, "memory") {
 		unsigned long start, size;
@@ -1006,7 +1119,7 @@
 	struct device_node *memory = NULL;
 	int nid;
 
-	if (!numa_enabled || (min_common_depth < 0))
+	if (!numa_enabled)
 		return first_online_node;
 
 	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
@@ -1059,142 +1172,42 @@
 
 /* Virtual Processor Home Node (VPHN) support */
 #ifdef CONFIG_PPC_SPLPAR
-
-#include "vphn.h"
-
-struct topology_update_data {
-	struct topology_update_data *next;
-	unsigned int cpu;
-	int old_nid;
-	int new_nid;
-};
-
-#define TOPOLOGY_DEF_TIMER_SECS 60
-
-static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
-static cpumask_t cpu_associativity_changes_mask;
-static int vphn_enabled;
-static int prrn_enabled;
-static void reset_topology_timer(void);
-static int topology_timer_secs = 1;
 static int topology_inited;
-
-/*
- * Change polling interval for associativity changes.
- */
-int timed_topology_update(int nsecs)
-{
-	if (vphn_enabled) {
-		if (nsecs > 0)
-			topology_timer_secs = nsecs;
-		else
-			topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;
-
-		reset_topology_timer();
-	}
-
-	return 0;
-}
-
-/*
- * Store the current values of the associativity change counters in the
- * hypervisor.
- */
-static void setup_cpu_associativity_change_counters(void)
-{
-	int cpu;
-
-	/* The VPHN feature supports a maximum of 8 reference points */
-	BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);
-
-	for_each_possible_cpu(cpu) {
-		int i;
-		u8 *counts = vphn_cpu_change_counts[cpu];
-		volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;
-
-		for (i = 0; i < distance_ref_points_depth; i++)
-			counts[i] = hypervisor_counts[i];
-	}
-}
-
-/*
- * The hypervisor maintains a set of 8 associativity change counters in
- * the VPA of each cpu that correspond to the associativity levels in the
- * ibm,associativity-reference-points property. When an associativity
- * level changes, the corresponding counter is incremented.
- *
- * Set a bit in cpu_associativity_changes_mask for each cpu whose home
- * node associativity levels have changed.
- *
- * Returns the number of cpus with unhandled associativity changes.
- */
-static int update_cpu_associativity_changes_mask(void)
-{
-	int cpu;
-	cpumask_t *changes = &cpu_associativity_changes_mask;
-
-	for_each_possible_cpu(cpu) {
-		int i, changed = 0;
-		u8 *counts = vphn_cpu_change_counts[cpu];
-		volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;
-
-		for (i = 0; i < distance_ref_points_depth; i++) {
-			if (hypervisor_counts[i] != counts[i]) {
-				counts[i] = hypervisor_counts[i];
-				changed = 1;
-			}
-		}
-		if (changed) {
-			cpumask_or(changes, changes, cpu_sibling_mask(cpu));
-			cpu = cpu_last_thread_sibling(cpu);
-		}
-	}
-
-	return cpumask_weight(changes);
-}
 
 /*
  * Retrieve the new associativity information for a virtual processor's
  * home node.
 */
-static long hcall_vphn(unsigned long cpu, __be32 *associativity)
-{
-	long rc;
-	long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
-	u64 flags = 1;
-	int hwcpu = get_hard_smp_processor_id(cpu);
-
-	rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
-	vphn_unpack_associativity(retbuf, associativity);
-
-	return rc;
-}
-
 static long vphn_get_associativity(unsigned long cpu,
 					__be32 *associativity)
 {
 	long rc;
 
-	rc = hcall_vphn(cpu, associativity);
+	rc = hcall_vphn(get_hard_smp_processor_id(cpu),
+			VPHN_FLAG_VCPU, associativity);
 
 	switch (rc) {
-	case H_FUNCTION:
-		printk_once(KERN_INFO
-			"VPHN is not supported. Disabling polling...\n");
-		stop_topology_update();
-		break;
-	case H_HARDWARE:
-		printk(KERN_ERR
-			"hcall_vphn() experienced a hardware fault "
-			"preventing VPHN. Disabling polling...\n");
-		stop_topology_update();
-		break;
 	case H_SUCCESS:
 		dbg("VPHN hcall succeeded. Reset polling...\n");
-		timed_topology_update(0);
+		goto out;
+
+	case H_FUNCTION:
+		pr_err_ratelimited("VPHN unsupported. Disabling polling...\n");
+		break;
+	case H_HARDWARE:
+		pr_err_ratelimited("hcall_vphn() experienced a hardware fault "
+				"preventing VPHN. Disabling polling...\n");
+		break;
+	case H_PARAMETER:
+		pr_err_ratelimited("hcall_vphn() was passed an invalid parameter. "
+				"Disabling polling...\n");
+		break;
+	default:
+		pr_err_ratelimited("hcall_vphn() returned %ld. Disabling polling...\n"
+				, rc);
 		break;
 	}
-
+out:
 	return rc;
 }
 
@@ -1237,383 +1250,33 @@
 	return new_nid;
 }
 
-/*
- * Update the CPU maps and sysfs entries for a single CPU when its NUMA
- * characteristics change. This function doesn't perform any locking and is
- * only safe to call from stop_machine().
- */
-static int update_cpu_topology(void *data)
+int cpu_to_coregroup_id(int cpu)
 {
-	struct topology_update_data *update;
-	unsigned long cpu;
+	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+	int index;
 
-	if (!data)
-		return -EINVAL;
+	if (cpu < 0 || cpu > nr_cpu_ids)
+		return -1;
 
-	cpu = smp_processor_id();
-
-	for (update = data; update; update = update->next) {
-		int new_nid = update->new_nid;
-		if (cpu != update->cpu)
-			continue;
-
-		unmap_cpu_from_node(cpu);
-		map_cpu_to_node(cpu, new_nid);
-		set_cpu_numa_node(cpu, new_nid);
-		set_cpu_numa_mem(cpu, local_memory_node(new_nid));
-		vdso_getcpu_init();
-	}
-
-	return 0;
-}
-
-static int update_lookup_table(void *data)
-{
-	struct topology_update_data *update;
-
-	if (!data)
-		return -EINVAL;
-
-	/*
-	 * Upon topology update, the numa-cpu lookup table needs to be updated
-	 * for all threads in the core, including offline CPUs, to ensure that
-	 * future hotplug operations respect the cpu-to-node associativity
-	 * properly.
-	 */
-	for (update = data; update; update = update->next) {
-		int nid, base, j;
-
-		nid = update->new_nid;
-		base = cpu_first_thread_sibling(update->cpu);
-
-		for (j = 0; j < threads_per_core; j++) {
-			update_numa_cpu_lookup_table(base + j, nid);
-		}
-	}
-
-	return 0;
-}
-
-/*
- * Update the node maps and sysfs entries for each cpu whose home node
- * has changed. Returns 1 when the topology has changed, and 0 otherwise.
- *
- * cpus_locked says whether we already hold cpu_hotplug_lock.
- */
-int numa_update_cpu_topology(bool cpus_locked)
-{
-	unsigned int cpu, sibling, changed = 0;
-	struct topology_update_data *updates, *ud;
-	cpumask_t updated_cpus;
-	struct device *dev;
-	int weight, new_nid, i = 0;
-
-	if (!prrn_enabled && !vphn_enabled && topology_inited)
-		return 0;
-
-	weight = cpumask_weight(&cpu_associativity_changes_mask);
-	if (!weight)
-		return 0;
-
-	updates = kcalloc(weight, sizeof(*updates), GFP_KERNEL);
-	if (!updates)
-		return 0;
-
-	cpumask_clear(&updated_cpus);
-
-	for_each_cpu(cpu, &cpu_associativity_changes_mask) {
-		/*
-		 * If siblings aren't flagged for changes, updates list
-		 * will be too short. Skip on this update and set for next
-		 * update.
-		 */
-		if (!cpumask_subset(cpu_sibling_mask(cpu),
-					&cpu_associativity_changes_mask)) {
-			pr_info("Sibling bits not set for associativity "
-					"change, cpu%d\n", cpu);
-			cpumask_or(&cpu_associativity_changes_mask,
-					&cpu_associativity_changes_mask,
-					cpu_sibling_mask(cpu));
-			cpu = cpu_last_thread_sibling(cpu);
-			continue;
-		}
-
-		new_nid = find_and_online_cpu_nid(cpu);
-
-		if (new_nid == numa_cpu_lookup_table[cpu]) {
-			cpumask_andnot(&cpu_associativity_changes_mask,
-					&cpu_associativity_changes_mask,
-					cpu_sibling_mask(cpu));
-			dbg("Assoc chg gives same node %d for cpu%d\n",
-					new_nid, cpu);
-			cpu = cpu_last_thread_sibling(cpu);
-			continue;
-		}
-
-		for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
-			ud = &updates[i++];
-			ud->next = &updates[i];
-			ud->cpu = sibling;
-			ud->new_nid = new_nid;
-			ud->old_nid = numa_cpu_lookup_table[sibling];
-			cpumask_set_cpu(sibling, &updated_cpus);
-		}
-		cpu = cpu_last_thread_sibling(cpu);
-	}
-
-	/*
-	 * Prevent processing of 'updates' from overflowing array
-	 * where last entry filled in a 'next' pointer.
-	 */
-	if (i)
-		updates[i-1].next = NULL;
-
-	pr_debug("Topology update for the following CPUs:\n");
-	if (cpumask_weight(&updated_cpus)) {
-		for (ud = &updates[0]; ud; ud = ud->next) {
-			pr_debug("cpu %d moving from node %d "
-					"to %d\n", ud->cpu,
-					ud->old_nid, ud->new_nid);
-		}
-	}
-
-	/*
-	 * In cases where we have nothing to update (because the updates list
-	 * is too short or because the new topology is same as the old one),
-	 * skip invoking update_cpu_topology() via stop-machine(). This is
-	 * necessary (and not just a fast-path optimization) since stop-machine
-	 * can end up electing a random CPU to run update_cpu_topology(), and
-	 * thus trick us into setting up incorrect cpu-node mappings (since
-	 * 'updates' is kzalloc()'ed).
-	 *
-	 * And for the similar reason, we will skip all the following updating.
-	 */
-	if (!cpumask_weight(&updated_cpus))
+	if (!coregroup_enabled)
 		goto out;
 
-	if (cpus_locked)
-		stop_machine_cpuslocked(update_cpu_topology, &updates[0],
-					&updated_cpus);
-	else
-		stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
+	if (!firmware_has_feature(FW_FEATURE_VPHN))
+		goto out;
 
-	/*
-	 * Update the numa-cpu lookup table with the new mappings, even for
-	 * offline CPUs. It is best to perform this update from the stop-
-	 * machine context.
-	 */
-	if (cpus_locked)
-		stop_machine_cpuslocked(update_lookup_table, &updates[0],
-					cpumask_of(raw_smp_processor_id()));
-	else
-		stop_machine(update_lookup_table, &updates[0],
-				cpumask_of(raw_smp_processor_id()));
+	if (vphn_get_associativity(cpu, associativity))
+		goto out;
 
-	for (ud = &updates[0]; ud; ud = ud->next) {
-		unregister_cpu_under_node(ud->cpu, ud->old_nid);
-		register_cpu_under_node(ud->cpu, ud->new_nid);
-
-		dev = get_cpu_device(ud->cpu);
-		if (dev)
-			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
-		cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
-		changed = 1;
-	}
+	index = of_read_number(associativity, 1);
+	if (index > min_common_depth + 1)
+		return of_read_number(&associativity[index - 1], 1);
 
 out:
-	kfree(updates);
-	return changed;
+	return cpu_to_core_id(cpu);
 }
-
-int arch_update_cpu_topology(void)
-{
-	return numa_update_cpu_topology(true);
-}
-
-static void topology_work_fn(struct work_struct *work)
-{
-	rebuild_sched_domains();
-}
-static DECLARE_WORK(topology_work, topology_work_fn);
-
-static void topology_schedule_update(void)
-{
-	schedule_work(&topology_work);
-}
-
-static void topology_timer_fn(struct timer_list *unused)
-{
-	if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
-		topology_schedule_update();
-	else if (vphn_enabled) {
-		if (update_cpu_associativity_changes_mask() > 0)
-			topology_schedule_update();
-		reset_topology_timer();
-	}
-}
-static struct timer_list topology_timer;
-
-static void reset_topology_timer(void)
-{
-	if (vphn_enabled)
-		mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ);
-}
-
-#ifdef CONFIG_SMP
-
-static int dt_update_callback(struct notifier_block *nb,
-				unsigned long action, void *data)
-{
-	struct of_reconfig_data *update = data;
-	int rc = NOTIFY_DONE;
-
-	switch (action) {
-	case OF_RECONFIG_UPDATE_PROPERTY:
-		if (!of_prop_cmp(update->dn->type, "cpu") &&
-		    !of_prop_cmp(update->prop->name, "ibm,associativity")) {
-			u32 core_id;
-			of_property_read_u32(update->dn, "reg", &core_id);
-			rc = dlpar_cpu_readd(core_id);
-			rc = NOTIFY_OK;
-		}
-		break;
-	}
-
-	return rc;
-}
-
-static struct notifier_block dt_update_nb = {
-	.notifier_call = dt_update_callback,
-};
-
-#endif
-
-/*
- * Start polling for associativity changes.
- */
-int start_topology_update(void)
-{
-	int rc = 0;
-
-	if (!topology_updates_enabled)
-		return 0;
-
-	if (firmware_has_feature(FW_FEATURE_PRRN)) {
-		if (!prrn_enabled) {
-			prrn_enabled = 1;
-#ifdef CONFIG_SMP
-			rc = of_reconfig_notifier_register(&dt_update_nb);
-#endif
-		}
-	}
-	if (firmware_has_feature(FW_FEATURE_VPHN) &&
-	    lppaca_shared_proc(get_lppaca())) {
-		if (!vphn_enabled) {
-			vphn_enabled = 1;
-			setup_cpu_associativity_change_counters();
-			timer_setup(&topology_timer, topology_timer_fn,
-					TIMER_DEFERRABLE);
-			reset_topology_timer();
-		}
-	}
-
-	return rc;
-}
-
-/*
- * Disable polling for VPHN associativity changes.
- */
-int stop_topology_update(void)
-{
-	int rc = 0;
-
-	if (!topology_updates_enabled)
-		return 0;
-
-	if (prrn_enabled) {
-		prrn_enabled = 0;
-#ifdef CONFIG_SMP
-		rc = of_reconfig_notifier_unregister(&dt_update_nb);
-#endif
-	}
-	if (vphn_enabled) {
-		vphn_enabled = 0;
-		rc = del_timer_sync(&topology_timer);
-	}
-
-	return rc;
-}
-
-int prrn_is_enabled(void)
-{
-	return prrn_enabled;
-}
-
-void __init shared_proc_topology_init(void)
-{
-	if (lppaca_shared_proc(get_lppaca())) {
-		bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
-			    nr_cpumask_bits);
-		numa_update_cpu_topology(false);
-	}
-}
-
-static int topology_read(struct seq_file *file, void *v)
-{
-	if (vphn_enabled || prrn_enabled)
-		seq_puts(file, "on\n");
-	else
-		seq_puts(file, "off\n");
-
-	return 0;
-}
-
-static int topology_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, topology_read, NULL);
-}
-
-static ssize_t topology_write(struct file *file, const char __user *buf,
-				size_t count, loff_t *off)
-{
-	char kbuf[4]; /* "on" or "off" plus null. */
-	int read_len;
-
-	read_len = count < 3 ? count : 3;
-	if (copy_from_user(kbuf, buf, read_len))
-		return -EINVAL;
-
-	kbuf[read_len] = '\0';
-
-	if (!strncmp(kbuf, "on", 2)) {
-		topology_updates_enabled = true;
-		start_topology_update();
-	} else if (!strncmp(kbuf, "off", 3)) {
-		stop_topology_update();
-		topology_updates_enabled = false;
-	} else
-		return -EINVAL;
-
-	return count;
-}
-
-static const struct file_operations topology_ops = {
-	.read = seq_read,
-	.write = topology_write,
-	.open = topology_open,
-	.release = single_release
-};
 
 static int topology_update_init(void)
 {
-	start_topology_update();
-
-	if (vphn_enabled)
-		topology_schedule_update();
-
-	if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
-		return -ENOMEM;
-
 	topology_inited = 1;
 	return 0;
 }
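
Note: the following standalone sketch is not part of the patch above; it only illustrates the distance walk performed by the new cpu_distance() helper added in this change. The reference-point indices and associativity values are hypothetical; in the kernel they come from the ibm,associativity-reference-points property and the per-CPU associativity arrays, stored big-endian.

/* Illustrative userspace sketch of the cpu_distance() loop (assumed values). */
#include <stdio.h>

/* Hypothetical ibm,associativity-reference-points: indices into the
 * associativity array, most significant domain first. */
static const unsigned int distance_ref_points[] = { 4, 2 };
static const int distance_ref_points_depth = 2;

/* Same loop shape as the patched cpu_distance(): count leading reference
 * points at which the two associativity arrays disagree, stopping at the
 * first level where they match. */
static int cpu_distance(const unsigned int *cpu1_assoc,
			const unsigned int *cpu2_assoc)
{
	int dist = 0;
	int i, index;

	for (i = 0; i < distance_ref_points_depth; i++) {
		index = distance_ref_points[i];
		if (cpu1_assoc[index] == cpu2_assoc[index])
			break;
		dist++;
	}

	return dist;
}

int main(void)
{
	/* Made-up associativity arrays for two CPUs on different nodes. */
	const unsigned int cpu0_assoc[] = { 5, 0, 1, 0, 0, 8 };
	const unsigned int cpu8_assoc[] = { 5, 0, 1, 0, 1, 40 };

	/* Differ at reference point index 4, match at index 2 -> distance 1. */
	printf("relative distance = %d\n",
	       cpu_distance(cpu0_assoc, cpu8_assoc));
	return 0;
}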