From 2f7c68cb55ecb7331f2381deb497c27155f32faf Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Wed, 03 Jan 2024 09:43:39 +0000 Subject: [PATCH] update kernel to 5.10.198 --- kernel/arch/powerpc/mm/numa.c | 433 +++++++++++++++++++++++++++++++++++++++++------------ 1 files changed, 332 insertions(+), 101 deletions(-) diff --git a/kernel/arch/powerpc/mm/numa.c b/kernel/arch/powerpc/mm/numa.c index 275c60f..ce8569e 100644 --- a/kernel/arch/powerpc/mm/numa.c +++ b/kernel/arch/powerpc/mm/numa.c @@ -51,14 +51,22 @@ EXPORT_SYMBOL(node_to_cpumask_map); EXPORT_SYMBOL(node_data); -static int min_common_depth; +static int primary_domain_index; static int n_mem_addr_cells, n_mem_size_cells; -static int form1_affinity; + +#define FORM0_AFFINITY 0 +#define FORM1_AFFINITY 1 +#define FORM2_AFFINITY 2 +static int affinity_form; #define MAX_DISTANCE_REF_POINTS 4 static int distance_ref_points_depth; static const __be32 *distance_ref_points; static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; +static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = { + [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 } +}; +static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE }; /* * Allocate node_to_cpumask_map based on number of available nodes @@ -163,7 +171,55 @@ } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ -int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +static int __associativity_to_nid(const __be32 *associativity, + int max_array_sz) +{ + int nid; + /* + * primary_domain_index is 1 based array index. + */ + int index = primary_domain_index - 1; + + if (!numa_enabled || index >= max_array_sz) + return NUMA_NO_NODE; + + nid = of_read_number(&associativity[index], 1); + + /* POWER4 LPAR uses 0xffff as invalid node */ + if (nid == 0xffff || nid >= nr_node_ids) + nid = NUMA_NO_NODE; + return nid; +} +/* + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA + * info is found. + */ +static int associativity_to_nid(const __be32 *associativity) +{ + int array_sz = of_read_number(associativity, 1); + + /* Skip the first element in the associativity array */ + return __associativity_to_nid((associativity + 1), array_sz); +} + +static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + int dist; + int node1, node2; + + node1 = associativity_to_nid(cpu1_assoc); + node2 = associativity_to_nid(cpu2_assoc); + + dist = numa_distance_table[node1][node2]; + if (dist <= LOCAL_DISTANCE) + return 0; + else if (dist <= REMOTE_DISTANCE) + return 1; + else + return 2; +} + +static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { int dist = 0; @@ -179,6 +235,15 @@ return dist; } +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + /* We should not get called with FORM0 */ + VM_WARN_ON(affinity_form == FORM0_AFFINITY); + if (affinity_form == FORM1_AFFINITY) + return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); + return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc); +} + /* must hold reference to node during call */ static const __be32 *of_get_associativity(struct device_node *dev) { @@ -190,7 +255,9 @@ int i; int distance = LOCAL_DISTANCE; - if (!form1_affinity) + if (affinity_form == FORM2_AFFINITY) + return numa_distance_table[a][b]; + else if (affinity_form == FORM0_AFFINITY) return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); for (i = 0; i < distance_ref_points_depth; i++) { @@ -204,52 +271,6 @@ return distance; } EXPORT_SYMBOL(__node_distance); - -static void initialize_distance_lookup_table(int nid, - const __be32 *associativity) -{ - int i; - - if (!form1_affinity) - return; - - for (i = 0; i < distance_ref_points_depth; i++) { - const __be32 *entry; - - entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1]; - distance_lookup_table[nid][i] = of_read_number(entry, 1); - } -} - -/* - * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA - * info is found. - */ -static int associativity_to_nid(const __be32 *associativity) -{ - int nid = NUMA_NO_NODE; - - if (!numa_enabled) - goto out; - - if (of_read_number(associativity, 1) >= min_common_depth) - nid = of_read_number(&associativity[min_common_depth], 1); - - /* POWER4 LPAR uses 0xffff as invalid node */ - if (nid == 0xffff || nid >= nr_node_ids) - nid = NUMA_NO_NODE; - - if (nid > 0 && - of_read_number(associativity, 1) >= distance_ref_points_depth) { - /* - * Skip the length field and send start of associativity array - */ - initialize_distance_lookup_table(nid, associativity + 1); - } - -out: - return nid; -} /* Returns the nid associated with the given device tree node, * or -1 if not found. @@ -284,10 +305,159 @@ } EXPORT_SYMBOL(of_node_to_nid); -static int __init find_min_common_depth(void) +static void __initialize_form1_numa_distance(const __be32 *associativity, + int max_array_sz) { - int depth; + int i, nid; + + if (affinity_form != FORM1_AFFINITY) + return; + + nid = __associativity_to_nid(associativity, max_array_sz); + if (nid != NUMA_NO_NODE) { + for (i = 0; i < distance_ref_points_depth; i++) { + const __be32 *entry; + int index = be32_to_cpu(distance_ref_points[i]) - 1; + + /* + * broken hierarchy, return with broken distance table + */ + if (WARN(index >= max_array_sz, "Broken ibm,associativity property")) + return; + + entry = &associativity[index]; + distance_lookup_table[nid][i] = of_read_number(entry, 1); + } + } +} + +static void initialize_form1_numa_distance(const __be32 *associativity) +{ + int array_sz; + + array_sz = of_read_number(associativity, 1); + /* Skip the first element in the associativity array */ + __initialize_form1_numa_distance(associativity + 1, array_sz); +} + +/* + * Used to update distance information w.r.t newly added node. + */ +void update_numa_distance(struct device_node *node) +{ + int nid; + + if (affinity_form == FORM0_AFFINITY) + return; + else if (affinity_form == FORM1_AFFINITY) { + const __be32 *associativity; + + associativity = of_get_associativity(node); + if (!associativity) + return; + + initialize_form1_numa_distance(associativity); + return; + } + + /* FORM2 affinity */ + nid = of_node_to_nid_single(node); + if (nid == NUMA_NO_NODE) + return; + + /* + * With FORM2 we expect NUMA distance of all possible NUMA + * nodes to be provided during boot. + */ + WARN(numa_distance_table[nid][nid] == -1, + "NUMA distance details for node %d not provided\n", nid); +} +EXPORT_SYMBOL_GPL(update_numa_distance); + +/* + * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} + * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements} + */ +static void initialize_form2_numa_distance_lookup_table(void) +{ + int i, j; struct device_node *root; + const __u8 *numa_dist_table; + const __be32 *numa_lookup_index; + int numa_dist_table_length; + int max_numa_index, distance_index; + + if (firmware_has_feature(FW_FEATURE_OPAL)) + root = of_find_node_by_path("/ibm,opal"); + else + root = of_find_node_by_path("/rtas"); + if (!root) + root = of_find_node_by_path("/"); + + numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL); + max_numa_index = of_read_number(&numa_lookup_index[0], 1); + + /* first element of the array is the size and is encode-int */ + numa_dist_table = of_get_property(root, "ibm,numa-distance-table", NULL); + numa_dist_table_length = of_read_number((const __be32 *)&numa_dist_table[0], 1); + /* Skip the size which is encoded int */ + numa_dist_table += sizeof(__be32); + + pr_debug("numa_dist_table_len = %d, numa_dist_indexes_len = %d\n", + numa_dist_table_length, max_numa_index); + + for (i = 0; i < max_numa_index; i++) + /* +1 skip the max_numa_index in the property */ + numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1); + + + if (numa_dist_table_length != max_numa_index * max_numa_index) { + WARN(1, "Wrong NUMA distance information\n"); + /* consider everybody else just remote. */ + for (i = 0; i < max_numa_index; i++) { + for (j = 0; j < max_numa_index; j++) { + int nodeA = numa_id_index_table[i]; + int nodeB = numa_id_index_table[j]; + + if (nodeA == nodeB) + numa_distance_table[nodeA][nodeB] = LOCAL_DISTANCE; + else + numa_distance_table[nodeA][nodeB] = REMOTE_DISTANCE; + } + } + } + + distance_index = 0; + for (i = 0; i < max_numa_index; i++) { + for (j = 0; j < max_numa_index; j++) { + int nodeA = numa_id_index_table[i]; + int nodeB = numa_id_index_table[j]; + + numa_distance_table[nodeA][nodeB] = numa_dist_table[distance_index++]; + pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, numa_distance_table[nodeA][nodeB]); + } + } + of_node_put(root); +} + +static int __init find_primary_domain_index(void) +{ + int index; + struct device_node *root; + + /* + * Check for which form of affinity. + */ + if (firmware_has_feature(FW_FEATURE_OPAL)) { + affinity_form = FORM1_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) { + dbg("Using form 2 affinity\n"); + affinity_form = FORM2_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { + dbg("Using form 1 affinity\n"); + affinity_form = FORM1_AFFINITY; + } else + affinity_form = FORM0_AFFINITY; if (firmware_has_feature(FW_FEATURE_OPAL)) root = of_find_node_by_path("/ibm,opal"); @@ -318,25 +488,21 @@ } distance_ref_points_depth /= sizeof(int); - - if (firmware_has_feature(FW_FEATURE_OPAL) || - firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) { - dbg("Using form 1 affinity\n"); - form1_affinity = 1; - } - - if (form1_affinity) { - depth = of_read_number(distance_ref_points, 1); - } else { + if (affinity_form == FORM0_AFFINITY) { if (distance_ref_points_depth < 2) { printk(KERN_WARNING "NUMA: " - "short ibm,associativity-reference-points\n"); + "short ibm,associativity-reference-points\n"); goto err; } - depth = of_read_number(&distance_ref_points[1], 1); + index = of_read_number(&distance_ref_points[1], 1); + } else { + /* + * Both FORM1 and FORM2 affinity find the primary domain details + * at the same offset. + */ + index = of_read_number(distance_ref_points, 1); } - /* * Warn and cap if the hardware supports more than * MAX_DISTANCE_REF_POINTS domains. @@ -348,7 +514,7 @@ } of_node_put(root); - return depth; + return index; err: of_node_put(root); @@ -426,6 +592,38 @@ return 0; } +static int get_nid_and_numa_distance(struct drmem_lmb *lmb) +{ + struct assoc_arrays aa = { .arrays = NULL }; + int default_nid = NUMA_NO_NODE; + int nid = default_nid; + int rc, index; + + if ((primary_domain_index < 0) || !numa_enabled) + return default_nid; + + rc = of_get_assoc_arrays(&aa); + if (rc) + return default_nid; + + if (primary_domain_index <= aa.array_sz && + !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { + const __be32 *associativity; + + index = lmb->aa_index * aa.array_sz; + associativity = &aa.arrays[index]; + nid = __associativity_to_nid(associativity, aa.array_sz); + if (nid > 0 && affinity_form == FORM1_AFFINITY) { + /* + * lookup array associativity entries have + * no length of the array as the first element. + */ + __initialize_form1_numa_distance(associativity, aa.array_sz); + } + } + return nid; +} + /* * This is like of_node_to_nid_single() for memory represented in the * ibm,dynamic-reconfiguration-memory node. @@ -437,35 +635,28 @@ int nid = default_nid; int rc, index; - if ((min_common_depth < 0) || !numa_enabled) + if ((primary_domain_index < 0) || !numa_enabled) return default_nid; rc = of_get_assoc_arrays(&aa); if (rc) return default_nid; - if (min_common_depth <= aa.array_sz && + if (primary_domain_index <= aa.array_sz && !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { - index = lmb->aa_index * aa.array_sz + min_common_depth - 1; - nid = of_read_number(&aa.arrays[index], 1); + const __be32 *associativity; - if (nid == 0xffff || nid >= nr_node_ids) - nid = default_nid; - - if (nid > 0) { - index = lmb->aa_index * aa.array_sz; - initialize_distance_lookup_table(nid, - &aa.arrays[index]); - } + index = lmb->aa_index * aa.array_sz; + associativity = &aa.arrays[index]; + nid = __associativity_to_nid(associativity, aa.array_sz); } - return nid; } #ifdef CONFIG_PPC_SPLPAR -static int vphn_get_nid(long lcpu) + +static int __vphn_get_associativity(long lcpu, __be32 *associativity) { - __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; long rc, hwid; /* @@ -485,12 +676,30 @@ rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity); if (rc == H_SUCCESS) - return associativity_to_nid(associativity); + return 0; } + return -1; +} + +static int vphn_get_nid(long lcpu) +{ + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + + + if (!__vphn_get_associativity(lcpu, associativity)) + return associativity_to_nid(associativity); + return NUMA_NO_NODE; + } #else + +static int __vphn_get_associativity(long lcpu, __be32 *associativity) +{ + return -1; +} + static int vphn_get_nid(long unused) { return NUMA_NO_NODE; @@ -685,7 +894,7 @@ size = read_n_cells(n_mem_size_cells, usm); } - nid = of_drconf_to_nid_single(lmb); + nid = get_nid_and_numa_distance(lmb); fake_numa_create_new_node(((base + size) >> PAGE_SHIFT), &nid); node_set_online(nid); @@ -702,24 +911,31 @@ struct device_node *memory; int default_nid = 0; unsigned long i; + const __be32 *associativity; if (numa_enabled == 0) { printk(KERN_WARNING "NUMA disabled by user\n"); return -1; } - min_common_depth = find_min_common_depth(); + primary_domain_index = find_primary_domain_index(); - if (min_common_depth < 0) { + if (primary_domain_index < 0) { /* - * if we fail to parse min_common_depth from device tree + * if we fail to parse primary_domain_index from device tree * mark the numa disabled, boot with numa disabled. */ numa_enabled = false; - return min_common_depth; + return primary_domain_index; } - dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); + dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); + + /* + * If it is FORM2 initialize the distance table here. + */ + if (affinity_form == FORM2_AFFINITY) + initialize_form2_numa_distance_lookup_table(); /* * Even though we connect cpus to numa domains later in SMP @@ -727,18 +943,30 @@ * each node to be onlined must have NODE_DATA etc backing it. */ for_each_present_cpu(i) { + __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE]; struct device_node *cpu; - int nid = vphn_get_nid(i); + int nid = NUMA_NO_NODE; - /* - * Don't fall back to default_nid yet -- we will plug - * cpus into nodes once the memory scan has discovered - * the topology. - */ - if (nid == NUMA_NO_NODE) { + memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32)); + + if (__vphn_get_associativity(i, vphn_assoc) == 0) { + nid = associativity_to_nid(vphn_assoc); + initialize_form1_numa_distance(vphn_assoc); + } else { + + /* + * Don't fall back to default_nid yet -- we will plug + * cpus into nodes once the memory scan has discovered + * the topology. + */ cpu = of_get_cpu_node(i, NULL); BUG_ON(!cpu); - nid = of_node_to_nid_single(cpu); + + associativity = of_get_associativity(cpu); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } of_node_put(cpu); } @@ -776,8 +1004,11 @@ * have associativity properties. If none, then * everything goes to default_nid. */ - nid = of_node_to_nid_single(memory); - if (nid < 0) + associativity = of_get_associativity(memory); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } else nid = default_nid; fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); @@ -926,7 +1157,7 @@ goto out; } - max_nodes = of_read_number(&domains[min_common_depth], 1); + max_nodes = of_read_number(&domains[primary_domain_index], 1); pr_info("Partition configured for %d NUMA nodes.\n", max_nodes); for (i = 0; i < max_nodes; i++) { @@ -935,7 +1166,7 @@ } prop_length /= sizeof(int); - if (prop_length > min_common_depth + 2) + if (prop_length > primary_domain_index + 2) coregroup_enabled = 1; out: @@ -1268,7 +1499,7 @@ goto out; index = of_read_number(associativity, 1); - if (index > min_common_depth + 1) + if (index > primary_domain_index + 1) return of_read_number(&associativity[index - 1], 1); out: -- Gitblit v1.6.2