```diff
@@ -1,10 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* Common code for 32 and 64-bit NUMA */
 #include <linux/acpi.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/init.h>
-#include <linux/bootmem.h>
 #include <linux/memblock.h>
 #include <linux/mmzone.h>
 #include <linux/ctype.h>
```
```diff
@@ -25,11 +25,8 @@
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
-static struct numa_meminfo numa_meminfo
-#ifndef CONFIG_MEMORY_HOTPLUG
-__initdata
-#endif
-;
+static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
+static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
 
 static int numa_distance_cnt;
 static u8 *numa_distance;
```
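The `__initdata_or_meminfo` annotation replaces the open-coded `#ifndef CONFIG_MEMORY_HOTPLUG` dance and is also applied to the new `numa_reserved_meminfo` table. A sketch of the likely definition, inferred from the `CONFIG_NUMA_KEEP_MEMINFO` guard that appears in the final hunk of this patch (verify against arch/x86/include/asm/numa.h in the matching tree):

```c
/*
 * Assumed definition: keep the meminfo tables resident when they are
 * still needed after boot (e.g. for phys_to_target_node() below),
 * otherwise discard them together with the rest of the init data.
 */
#ifdef CONFIG_NUMA_KEEP_MEMINFO
#define __initdata_or_meminfo
#else
#define __initdata_or_meminfo __initdata
#endif
```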
```diff
@@ -40,14 +37,12 @@
 		return -EINVAL;
 	if (!strncmp(opt, "off", 3))
 		numa_off = 1;
-#ifdef CONFIG_NUMA_EMU
 	if (!strncmp(opt, "fake=", 5))
-		numa_emu_cmdline(opt + 5);
-#endif
-#ifdef CONFIG_ACPI_NUMA
+		return numa_emu_cmdline(opt + 5);
 	if (!strncmp(opt, "noacpi", 6))
-		acpi_numa = -1;
-#endif
+		disable_srat();
+	if (!strncmp(opt, "nohmat", 6))
+		disable_hmat();
 	return 0;
 }
 early_param("numa", numa_setup);
```
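Two things change here beyond dropping the `#ifdef`s: "numa=fake=" now propagates `numa_emu_cmdline()`'s return value instead of falling through, and "numa=noacpi" goes through `disable_srat()` rather than poking `acpi_numa` directly, alongside the new "numa=nohmat" switch. Removing the guards only builds if the disabled configurations provide stubs; a sketch of what the headers presumably supply (the exact config symbols and return values are assumptions to check against the tree):

```c
/* Assumed header stubs that let the now-unconditional calls compile. */
#ifndef CONFIG_NUMA_EMU
static inline int numa_emu_cmdline(char *str)
{
	return -EINVAL;		/* "numa=fake=" not supported */
}
#endif

#ifndef CONFIG_ACPI_NUMA
static inline void disable_srat(void) { }
#endif

#ifndef CONFIG_ACPI_HMAT	/* guard name assumed */
static inline void disable_hmat(void) { }
#endif
```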
```diff
@@ -124,7 +119,7 @@
 		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
 
 	/* cpumask_of_node() will now work */
-	pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
+	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
 }
 
 static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
```
```diff
@@ -169,6 +164,19 @@
 }
 
 /**
+ * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
+ * @dst: numa_meminfo to append block to
+ * @idx: Index of memblk to remove
+ * @src: numa_meminfo to remove memblk from
+ */
+static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
+					 struct numa_meminfo *src)
+{
+	dst->blk[dst->nr_blks++] = src->blk[idx];
+	numa_remove_memblk_from(idx, src);
+}
+
+/**
  * numa_add_memblk - Add one numa_memblk to numa_meminfo
  * @nid: NUMA node ID of the new memblk
  * @start: Start address of the new memblk
```
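`numa_move_tail_memblk()` copies the block into @dst before delegating removal to the existing helper. For reference, the recalled shape of that helper (from the same file; worth verifying) compacts the array by shifting the tail down one slot, which is why callers that remove the current element iterate with `i--`:

```c
/* Recalled shape of the existing removal helper this builds on. */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[idx]));
}
```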
```diff
@@ -196,15 +204,11 @@
 	 * Allocate node data. Try node-local memory and then any node.
 	 * Never allocate in DMA zone.
 	 */
-	nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
 	if (!nd_pa) {
-		nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
-					      MEMBLOCK_ALLOC_ACCESSIBLE);
-		if (!nd_pa) {
-			pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
-			       nd_size, nid);
-			return;
-		}
+		pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
+		       nd_size, nid);
+		return;
 	}
 	nd = __va(nd_pa);
 
```
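The open-coded fallback can go away because `memblock_phys_alloc_try_nid()` already implements "node-local first, then any node". Recalled shape of the wrapper (mm/memblock.c; the `exact_nid=false` argument is what enables the any-node retry, verify against the matching tree):

```c
/* Recalled shape, not copied from this patch: the _try_nid variant
 * retries on any node when @nid has no suitable free range. */
phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size,
					       phys_addr_t align, int nid)
{
	return memblock_alloc_range_nid(size, align, 0,
					MEMBLOCK_ALLOC_ACCESSIBLE, nid,
					false /* exact_nid */);
}
```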
```diff
@@ -241,14 +245,25 @@
 	for (i = 0; i < mi->nr_blks; i++) {
 		struct numa_memblk *bi = &mi->blk[i];
 
-		/* make sure all blocks are inside the limits */
-		bi->start = max(bi->start, low);
-		bi->end = min(bi->end, high);
+		/* move / save reserved memory ranges */
+		if (!memblock_overlaps_region(&memblock.memory,
+					      bi->start, bi->end - bi->start)) {
+			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
+			continue;
+		}
 
-		/* and there's no empty or non-exist block */
-		if (bi->start >= bi->end ||
-		    !memblock_overlaps_region(&memblock.memory,
-					      bi->start, bi->end - bi->start))
+		/* make sure all non-reserved blocks are inside the limits */
+		bi->start = max(bi->start, low);
+
+		/* preserve info for non-RAM areas above 'max_pfn': */
+		if (bi->end > high) {
+			numa_add_memblk_to(bi->nid, high, bi->end,
+					   &numa_reserved_meminfo);
+			bi->end = high;
+		}
+
+		/* and there's no empty block */
+		if (bi->start >= bi->end)
 			numa_remove_memblk_from(i--, mi);
 	}
 
```
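The cleanup now preserves, rather than drops, ranges that the firmware described but that are not in `memblock.memory`, plus the tails of blocks above `max_pfn`, parking them in `numa_reserved_meminfo`. A minimal user-space model of the clamp-and-split step, illustrative only:

```c
/* Illustrative model: a block straddling `high` (max_pfn) is split;
 * the portion above `high` is parked for later nid lookups instead of
 * being discarded as it was before this patch. */
#include <stdio.h>
#include <stdint.h>

struct blk { uint64_t start, end; int nid; };

int main(void)
{
	uint64_t high = 0x100000000ULL;               /* 4 GiB */
	struct blk b = { 0xc0000000ULL, 0x140000000ULL, 1 };
	struct blk reserved = { high, b.end, b.nid }; /* saved tail */

	b.end = high;                                 /* clamped RAM part */
	printf("usable:   %#llx-%#llx nid %d\n",
	       (unsigned long long)b.start, (unsigned long long)b.end, b.nid);
	printf("reserved: %#llx-%#llx nid %d\n",
	       (unsigned long long)reserved.start,
	       (unsigned long long)reserved.end, reserved.nid);
	return 0;
}
```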
```diff
@@ -505,9 +520,11 @@
 	 *   memory ranges, because quirks such as trim_snb_memory()
 	 *   reserve specific pages for Sandy Bridge graphics. ]
 	 */
-	for_each_memblock(reserved, mb_region) {
-		if (mb_region->nid != MAX_NUMNODES)
-			node_set(mb_region->nid, reserved_nodemask);
+	for_each_reserved_mem_region(mb_region) {
+		int nid = memblock_get_region_node(mb_region);
+
+		if (nid != MAX_NUMNODES)
+			node_set(nid, reserved_nodemask);
 	}
 
 	/*
```
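`memblock_get_region_node()` hides whether `struct memblock_region` carries a nid field at all, which is what makes the direct `mb_region->nid` access worth replacing. Recalled shape of the accessor (the config guard shown is an assumption; the symbol has varied across releases):

```c
/* Recalled accessor shape from include/linux/memblock.h. */
#ifdef CONFIG_NEED_MULTIPLE_NODES	/* guard name varies by version */
static inline int memblock_get_region_node(const struct memblock_region *r)
{
	return r->nid;
}
#else
static inline int memblock_get_region_node(const struct memblock_region *r)
{
	return 0;
}
#endif
```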
```diff
@@ -530,7 +547,6 @@
 
 static int __init numa_register_memblks(struct numa_meminfo *mi)
 {
-	unsigned long uninitialized_var(pfn_align);
 	int i, nid;
 
 	/* Account for nodes with cpus and no memory */
```
```diff
@@ -558,15 +574,16 @@
 	 * If sections array is gonna be used for pfn -> nid mapping, check
 	 * whether its granularity is fine enough.
 	 */
-#ifdef NODE_NOT_IN_PAGE_FLAGS
-	pfn_align = node_map_pfn_alignment();
-	if (pfn_align && pfn_align < PAGES_PER_SECTION) {
-		printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
-		       PFN_PHYS(pfn_align) >> 20,
-		       PFN_PHYS(PAGES_PER_SECTION) >> 20);
-		return -EINVAL;
+	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
+		unsigned long pfn_align = node_map_pfn_alignment();
+
+		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+			pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
+				PFN_PHYS(pfn_align) >> 20,
+				PFN_PHYS(PAGES_PER_SECTION) >> 20);
+			return -EINVAL;
+		}
 	}
-#endif
 	if (!numa_meminfo_cover_memory(mi))
 		return -EINVAL;
 
```
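One caveat worth flagging on this conversion: `IS_ENABLED()` only evaluates to true for macros defined to the literal `1`. If `NODE_NOT_IN_PAGE_FLAGS` is a bare `#define` with no value, the new branch silently compiles to `if (0)` and the alignment check is lost, so the definition site should be checked. Simplified mechanism, recalled from include/linux/kconfig.h:

```c
/* A macro defined to 1 makes __ARG_PLACEHOLDER_1 expand to "0,", so
 * the second argument (1) is selected; a bare or absent macro falls
 * through to the trailing 0. */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define __is_defined(x)			___is_defined(x)
#define ___is_defined(val)		____is_defined(__ARG_PLACEHOLDER_##val)
#define ____is_defined(arg1_or_junk)	__take_second_arg(arg1_or_junk 1, 0)
```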
```diff
@@ -703,7 +720,7 @@
  * x86_numa_init - Initialize NUMA
  *
  * Try each configured NUMA initialization method until one succeeds. The
- * last fallback is dummy single node config encomapssing whole memory and
+ * last fallback is dummy single node config encompassing whole memory and
  * never fails.
  */
 void __init x86_numa_init(void)
```
```diff
@@ -724,17 +741,35 @@
 
 static void __init init_memory_less_node(int nid)
 {
-	unsigned long zones_size[MAX_NR_ZONES] = {0};
-	unsigned long zholes_size[MAX_NR_ZONES] = {0};
-
 	/* Allocate and initialize node data. Memory-less node is now online.*/
 	alloc_node_data(nid);
-	free_area_init_node(nid, zones_size, 0, zholes_size);
+	free_area_init_memoryless_node(nid);
 
 	/*
 	 * All zonelists will be built later in start_kernel() after per cpu
 	 * areas are initialized.
 	 */
+}
+
+/*
+ * A node may exist which has one or more Generic Initiators but no CPUs and no
+ * memory.
+ *
+ * This function must be called after init_cpu_to_node(), to ensure that any
+ * memoryless CPU nodes have already been brought online, and before the
+ * node_data[nid] is needed for zone list setup in build_all_zonelists().
+ *
+ * When this function is called, any nodes containing either memory and/or CPUs
+ * will already be online and there is no need to do anything extra, even if
+ * they also contain one or more Generic Initiators.
+ */
+void __init init_gi_nodes(void)
+{
+	int nid;
+
+	for_each_node_state(nid, N_GENERIC_INITIATOR)
+		if (!node_online(nid))
+			init_memory_less_node(nid);
 }
 
 /*
```
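For context, `N_GENERIC_INITIATOR` is expected to be set on a node when the ACPI SRAT Generic Initiator affinity structures are parsed, before `init_gi_nodes()` walks the state. A hypothetical sketch of that producer side (not part of this patch; function name and placement are illustrative):

```c
/* Hypothetical producer-side sketch: SRAT GI affinity parsing tags the
 * node so init_gi_nodes() can online it if CPUs/memory did not. */
static void __init sketch_record_gi_node(int node)
{
	node_set(node, numa_nodes_parsed);
	node_set_state(node, N_GENERIC_INITIATOR);
}
```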
```diff
@@ -826,7 +861,7 @@
 		return;
 	}
 	mask = node_to_cpumask_map[node];
-	if (!mask) {
+	if (!cpumask_available(mask)) {
 		pr_err("node_to_cpumask_map[%i] NULL\n", node);
 		dump_stack();
 		return;
```
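`cpumask_available()` is the right test because `cpumask_var_t` is only a pointer with CONFIG_CPUMASK_OFFSTACK=y; otherwise it is an embedded array and `!mask` is a bogus, always-false comparison. Recalled shape from include/linux/cpumask.h:

```c
/* Recalled shape: availability depends on the representation. */
#ifdef CONFIG_CPUMASK_OFFSTACK
static inline bool cpumask_available(cpumask_var_t mask)
{
	return mask != NULL;
}
#else
static inline bool cpumask_available(cpumask_var_t mask)
{
	return true;	/* embedded array, always present */
}
#endif
```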
```diff
@@ -865,14 +900,14 @@
  */
 const struct cpumask *cpumask_of_node(int node)
 {
-	if (node >= nr_node_ids) {
+	if ((unsigned)node >= nr_node_ids) {
 		printk(KERN_WARNING
-			"cpumask_of_node(%d): node > nr_node_ids(%d)\n",
+			"cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
 			node, nr_node_ids);
 		dump_stack();
 		return cpu_none_mask;
 	}
-	if (node_to_cpumask_map[node] == NULL) {
+	if (!cpumask_available(node_to_cpumask_map[node])) {
 		printk(KERN_WARNING
 			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
 			node);
```
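The cast folds the missing `node < 0` check into the existing bounds test: a negative `int` wraps to a huge unsigned value, so one compare rejects both directions. A minimal illustration:

```c
/* (unsigned)-1 == UINT_MAX >= nr_node_ids, so negative IDs fail too. */
static bool node_id_valid(int node, unsigned int nr_node_ids)
{
	return (unsigned int)node < nr_node_ids;
}
```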
```diff
@@ -885,16 +920,38 @@
 
 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-int memory_add_physaddr_to_nid(u64 start)
+#ifdef CONFIG_NUMA_KEEP_MEMINFO
+static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
 {
-	struct numa_meminfo *mi = &numa_meminfo;
-	int nid = mi->blk[0].nid;
 	int i;
 
 	for (i = 0; i < mi->nr_blks; i++)
 		if (mi->blk[i].start <= start && mi->blk[i].end > start)
-			nid = mi->blk[i].nid;
+			return mi->blk[i].nid;
+	return NUMA_NO_NODE;
+}
+
+int phys_to_target_node(phys_addr_t start)
+{
+	int nid = meminfo_to_nid(&numa_meminfo, start);
+
+	/*
+	 * Prefer online nodes, but if reserved memory might be
+	 * hot-added continue the search with reserved ranges.
+	 */
+	if (nid != NUMA_NO_NODE)
+		return nid;
+
+	return meminfo_to_nid(&numa_reserved_meminfo, start);
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
+
+int memory_add_physaddr_to_nid(u64 start)
+{
+	int nid = meminfo_to_nid(&numa_meminfo, start);
+
+	if (nid == NUMA_NO_NODE)
+		nid = numa_meminfo.blk[0].nid;
 	return nid;
 }
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
```
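With the reserved meminfo retained, `phys_to_target_node()` can answer "which node would this range belong to once onlined" even for ranges that exist only in `numa_reserved_meminfo` (e.g. hotpluggable or soft-reserved memory), while `memory_add_physaddr_to_nid()` keeps its never-fails contract via the `blk[0]` fallback. A hypothetical caller sketch (names and fallback policy are illustrative, not from this patch):

```c
/* Hypothetical driver-side sketch: resolve the node for a range that
 * is about to be hotplugged. */
static int resolve_target_node(phys_addr_t base)
{
	int nid = phys_to_target_node(base);

	if (nid == NUMA_NO_NODE)
		nid = memory_add_physaddr_to_nid(base);	/* never NO_NODE */
	return nid;
}
```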