numa.c (11169B)
1// SPDX-License-Identifier: GPL-2.0 2/* 3 * Author: Xiang Gao <gaoxiang@loongson.cn> 4 * Huacai Chen <chenhuacai@loongson.cn> 5 * 6 * Copyright (C) 2020-2022 Loongson Technology Corporation Limited 7 */ 8#include <linux/init.h> 9#include <linux/kernel.h> 10#include <linux/mm.h> 11#include <linux/mmzone.h> 12#include <linux/export.h> 13#include <linux/nodemask.h> 14#include <linux/swap.h> 15#include <linux/memblock.h> 16#include <linux/pfn.h> 17#include <linux/acpi.h> 18#include <linux/efi.h> 19#include <linux/irq.h> 20#include <linux/pci.h> 21#include <asm/bootinfo.h> 22#include <asm/loongson.h> 23#include <asm/numa.h> 24#include <asm/page.h> 25#include <asm/pgalloc.h> 26#include <asm/sections.h> 27#include <asm/time.h> 28 29int numa_off; 30struct pglist_data *node_data[MAX_NUMNODES]; 31unsigned char node_distances[MAX_NUMNODES][MAX_NUMNODES]; 32 33EXPORT_SYMBOL(node_data); 34EXPORT_SYMBOL(node_distances); 35 36static struct numa_meminfo numa_meminfo; 37cpumask_t cpus_on_node[MAX_NUMNODES]; 38cpumask_t phys_cpus_on_node[MAX_NUMNODES]; 39EXPORT_SYMBOL(cpus_on_node); 40 41/* 42 * apicid, cpu, node mappings 43 */ 44s16 __cpuid_to_node[CONFIG_NR_CPUS] = { 45 [0 ... CONFIG_NR_CPUS - 1] = NUMA_NO_NODE 46}; 47EXPORT_SYMBOL(__cpuid_to_node); 48 49nodemask_t numa_nodes_parsed __initdata; 50 51#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA 52unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 53EXPORT_SYMBOL(__per_cpu_offset); 54 55static int __init pcpu_cpu_to_node(int cpu) 56{ 57 return early_cpu_to_node(cpu); 58} 59 60static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) 61{ 62 if (early_cpu_to_node(from) == early_cpu_to_node(to)) 63 return LOCAL_DISTANCE; 64 else 65 return REMOTE_DISTANCE; 66} 67 68void __init pcpu_populate_pte(unsigned long addr) 69{ 70 pgd_t *pgd = pgd_offset_k(addr); 71 p4d_t *p4d = p4d_offset(pgd, addr); 72 pud_t *pud; 73 pmd_t *pmd; 74 75 if (p4d_none(*p4d)) { 76 pud_t *new; 77 78 new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); 79 pgd_populate(&init_mm, pgd, new); 80#ifndef __PAGETABLE_PUD_FOLDED 81 pud_init((unsigned long)new, (unsigned long)invalid_pmd_table); 82#endif 83 } 84 85 pud = pud_offset(p4d, addr); 86 if (pud_none(*pud)) { 87 pmd_t *new; 88 89 new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); 90 pud_populate(&init_mm, pud, new); 91#ifndef __PAGETABLE_PMD_FOLDED 92 pmd_init((unsigned long)new, (unsigned long)invalid_pte_table); 93#endif 94 } 95 96 pmd = pmd_offset(pud, addr); 97 if (!pmd_present(*pmd)) { 98 pte_t *new; 99 100 new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); 101 pmd_populate_kernel(&init_mm, pmd, new); 102 } 103} 104 105void __init setup_per_cpu_areas(void) 106{ 107 unsigned long delta; 108 unsigned int cpu; 109 int rc = -EINVAL; 110 111 if (pcpu_chosen_fc == PCPU_FC_AUTO) { 112 if (nr_node_ids >= 8) 113 pcpu_chosen_fc = PCPU_FC_PAGE; 114 else 115 pcpu_chosen_fc = PCPU_FC_EMBED; 116 } 117 118 /* 119 * Always reserve area for module percpu variables. That's 120 * what the legacy allocator did. 121 */ 122 if (pcpu_chosen_fc != PCPU_FC_PAGE) { 123 rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, 124 PERCPU_DYNAMIC_RESERVE, PMD_SIZE, 125 pcpu_cpu_distance, pcpu_cpu_to_node); 126 if (rc < 0) 127 pr_warn("%s allocator failed (%d), falling back to page size\n", 128 pcpu_fc_names[pcpu_chosen_fc], rc); 129 } 130 if (rc < 0) 131 rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, pcpu_cpu_to_node); 132 if (rc < 0) 133 panic("cannot initialize percpu area (err=%d)", rc); 134 135 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 136 for_each_possible_cpu(cpu) 137 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; 138} 139#endif 140 141/* 142 * Get nodeid by logical cpu number. 143 * __cpuid_to_node maps phyical cpu id to node, so we 144 * should use cpu_logical_map(cpu) to index it. 145 * 146 * This routine is only used in early phase during 147 * booting, after setup_per_cpu_areas calling and numa_node 148 * initialization, cpu_to_node will be used instead. 149 */ 150int early_cpu_to_node(int cpu) 151{ 152 int physid = cpu_logical_map(cpu); 153 154 if (physid < 0) 155 return NUMA_NO_NODE; 156 157 return __cpuid_to_node[physid]; 158} 159 160void __init early_numa_add_cpu(int cpuid, s16 node) 161{ 162 int cpu = __cpu_number_map[cpuid]; 163 164 if (cpu < 0) 165 return; 166 167 cpumask_set_cpu(cpu, &cpus_on_node[node]); 168 cpumask_set_cpu(cpuid, &phys_cpus_on_node[node]); 169} 170 171void numa_add_cpu(unsigned int cpu) 172{ 173 int nid = cpu_to_node(cpu); 174 cpumask_set_cpu(cpu, &cpus_on_node[nid]); 175} 176 177void numa_remove_cpu(unsigned int cpu) 178{ 179 int nid = cpu_to_node(cpu); 180 cpumask_clear_cpu(cpu, &cpus_on_node[nid]); 181} 182 183static int __init numa_add_memblk_to(int nid, u64 start, u64 end, 184 struct numa_meminfo *mi) 185{ 186 /* ignore zero length blks */ 187 if (start == end) 188 return 0; 189 190 /* whine about and ignore invalid blks */ 191 if (start > end || nid < 0 || nid >= MAX_NUMNODES) { 192 pr_warn("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", 193 nid, start, end - 1); 194 return 0; 195 } 196 197 if (mi->nr_blks >= NR_NODE_MEMBLKS) { 198 pr_err("NUMA: too many memblk ranges\n"); 199 return -EINVAL; 200 } 201 202 mi->blk[mi->nr_blks].start = PFN_ALIGN(start); 203 mi->blk[mi->nr_blks].end = PFN_ALIGN(end - PAGE_SIZE + 1); 204 mi->blk[mi->nr_blks].nid = nid; 205 mi->nr_blks++; 206 return 0; 207} 208 209/** 210 * numa_add_memblk - Add one numa_memblk to numa_meminfo 211 * @nid: NUMA node ID of the new memblk 212 * @start: Start address of the new memblk 213 * @end: End address of the new memblk 214 * 215 * Add a new memblk to the default numa_meminfo. 216 * 217 * RETURNS: 218 * 0 on success, -errno on failure. 219 */ 220int __init numa_add_memblk(int nid, u64 start, u64 end) 221{ 222 return numa_add_memblk_to(nid, start, end, &numa_meminfo); 223} 224 225static void __init alloc_node_data(int nid) 226{ 227 void *nd; 228 unsigned long nd_pa; 229 size_t nd_sz = roundup(sizeof(pg_data_t), PAGE_SIZE); 230 231 nd_pa = memblock_phys_alloc_try_nid(nd_sz, SMP_CACHE_BYTES, nid); 232 if (!nd_pa) { 233 pr_err("Cannot find %zu Byte for node_data (initial node: %d)\n", nd_sz, nid); 234 return; 235 } 236 237 nd = __va(nd_pa); 238 239 node_data[nid] = nd; 240 memset(nd, 0, sizeof(pg_data_t)); 241} 242 243static void __init node_mem_init(unsigned int node) 244{ 245 unsigned long start_pfn, end_pfn; 246 unsigned long node_addrspace_offset; 247 248 node_addrspace_offset = nid_to_addrbase(node); 249 pr_info("Node%d's addrspace_offset is 0x%lx\n", 250 node, node_addrspace_offset); 251 252 get_pfn_range_for_nid(node, &start_pfn, &end_pfn); 253 pr_info("Node%d: start_pfn=0x%lx, end_pfn=0x%lx\n", 254 node, start_pfn, end_pfn); 255 256 alloc_node_data(node); 257} 258 259#ifdef CONFIG_ACPI_NUMA 260 261/* 262 * Sanity check to catch more bad NUMA configurations (they are amazingly 263 * common). Make sure the nodes cover all memory. 264 */ 265static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) 266{ 267 int i; 268 u64 numaram, biosram; 269 270 numaram = 0; 271 for (i = 0; i < mi->nr_blks; i++) { 272 u64 s = mi->blk[i].start >> PAGE_SHIFT; 273 u64 e = mi->blk[i].end >> PAGE_SHIFT; 274 275 numaram += e - s; 276 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); 277 if ((s64)numaram < 0) 278 numaram = 0; 279 } 280 max_pfn = max_low_pfn; 281 biosram = max_pfn - absent_pages_in_range(0, max_pfn); 282 283 BUG_ON((s64)(biosram - numaram) >= (1 << (20 - PAGE_SHIFT))); 284 return true; 285} 286 287static void __init add_node_intersection(u32 node, u64 start, u64 size, u32 type) 288{ 289 static unsigned long num_physpages; 290 291 num_physpages += (size >> PAGE_SHIFT); 292 pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n", 293 node, type, start, size); 294 pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", 295 start >> PAGE_SHIFT, (start + size) >> PAGE_SHIFT, num_physpages); 296 memblock_set_node(start, size, &memblock.memory, node); 297} 298 299/* 300 * add_numamem_region 301 * 302 * Add a uasable memory region described by BIOS. The 303 * routine gets each intersection between BIOS's region 304 * and node's region, and adds them into node's memblock 305 * pool. 306 * 307 */ 308static void __init add_numamem_region(u64 start, u64 end, u32 type) 309{ 310 u32 i; 311 u64 ofs = start; 312 313 if (start >= end) { 314 pr_debug("Invalid region: %016llx-%016llx\n", start, end); 315 return; 316 } 317 318 for (i = 0; i < numa_meminfo.nr_blks; i++) { 319 struct numa_memblk *mb = &numa_meminfo.blk[i]; 320 321 if (ofs > mb->end) 322 continue; 323 324 if (end > mb->end) { 325 add_node_intersection(mb->nid, ofs, mb->end - ofs, type); 326 ofs = mb->end; 327 } else { 328 add_node_intersection(mb->nid, ofs, end - ofs, type); 329 break; 330 } 331 } 332} 333 334static void __init init_node_memblock(void) 335{ 336 u32 mem_type; 337 u64 mem_end, mem_start, mem_size; 338 efi_memory_desc_t *md; 339 340 /* Parse memory information and activate */ 341 for_each_efi_memory_desc(md) { 342 mem_type = md->type; 343 mem_start = md->phys_addr; 344 mem_size = md->num_pages << EFI_PAGE_SHIFT; 345 mem_end = mem_start + mem_size; 346 347 switch (mem_type) { 348 case EFI_LOADER_CODE: 349 case EFI_LOADER_DATA: 350 case EFI_BOOT_SERVICES_CODE: 351 case EFI_BOOT_SERVICES_DATA: 352 case EFI_PERSISTENT_MEMORY: 353 case EFI_CONVENTIONAL_MEMORY: 354 add_numamem_region(mem_start, mem_end, mem_type); 355 break; 356 case EFI_PAL_CODE: 357 case EFI_UNUSABLE_MEMORY: 358 case EFI_ACPI_RECLAIM_MEMORY: 359 add_numamem_region(mem_start, mem_end, mem_type); 360 fallthrough; 361 case EFI_RESERVED_TYPE: 362 case EFI_RUNTIME_SERVICES_CODE: 363 case EFI_RUNTIME_SERVICES_DATA: 364 case EFI_MEMORY_MAPPED_IO: 365 case EFI_MEMORY_MAPPED_IO_PORT_SPACE: 366 pr_info("Resvd: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n", 367 mem_type, mem_start, mem_size); 368 break; 369 } 370 } 371} 372 373static void __init numa_default_distance(void) 374{ 375 int row, col; 376 377 for (row = 0; row < MAX_NUMNODES; row++) 378 for (col = 0; col < MAX_NUMNODES; col++) { 379 if (col == row) 380 node_distances[row][col] = LOCAL_DISTANCE; 381 else 382 /* We assume that one node per package here! 383 * 384 * A SLIT should be used for multiple nodes 385 * per package to override default setting. 386 */ 387 node_distances[row][col] = REMOTE_DISTANCE; 388 } 389} 390 391int __init init_numa_memory(void) 392{ 393 int i; 394 int ret; 395 int node; 396 397 for (i = 0; i < NR_CPUS; i++) 398 set_cpuid_to_node(i, NUMA_NO_NODE); 399 400 numa_default_distance(); 401 nodes_clear(numa_nodes_parsed); 402 nodes_clear(node_possible_map); 403 nodes_clear(node_online_map); 404 memset(&numa_meminfo, 0, sizeof(numa_meminfo)); 405 406 /* Parse SRAT and SLIT if provided by firmware. */ 407 ret = acpi_numa_init(); 408 if (ret < 0) 409 return ret; 410 411 node_possible_map = numa_nodes_parsed; 412 if (WARN_ON(nodes_empty(node_possible_map))) 413 return -EINVAL; 414 415 init_node_memblock(); 416 if (numa_meminfo_cover_memory(&numa_meminfo) == false) 417 return -EINVAL; 418 419 for_each_node_mask(node, node_possible_map) { 420 node_mem_init(node); 421 node_set_online(node); 422 } 423 max_low_pfn = PHYS_PFN(memblock_end_of_DRAM()); 424 425 setup_nr_node_ids(); 426 loongson_sysconf.nr_nodes = nr_node_ids; 427 loongson_sysconf.cores_per_node = cpumask_weight(&phys_cpus_on_node[0]); 428 429 return 0; 430} 431 432#endif 433 434void __init paging_init(void) 435{ 436 unsigned int node; 437 unsigned long zones_size[MAX_NR_ZONES] = {0, }; 438 439 for_each_online_node(node) { 440 unsigned long start_pfn, end_pfn; 441 442 get_pfn_range_for_nid(node, &start_pfn, &end_pfn); 443 444 if (end_pfn > max_low_pfn) 445 max_low_pfn = end_pfn; 446 } 447#ifdef CONFIG_ZONE_DMA32 448 zones_size[ZONE_DMA32] = MAX_DMA32_PFN; 449#endif 450 zones_size[ZONE_NORMAL] = max_low_pfn; 451 free_area_init(zones_size); 452} 453 454void __init mem_init(void) 455{ 456 high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT); 457 memblock_free_all(); 458 setup_zero_pages(); /* This comes from node 0 */ 459} 460 461int pcibus_to_node(struct pci_bus *bus) 462{ 463 return dev_to_node(&bus->dev); 464} 465EXPORT_SYMBOL(pcibus_to_node);