numa.c (36999B)
1// SPDX-License-Identifier: GPL-2.0-or-later 2/* 3 * pSeries NUMA support 4 * 5 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM 6 */ 7#define pr_fmt(fmt) "numa: " fmt 8 9#include <linux/threads.h> 10#include <linux/memblock.h> 11#include <linux/init.h> 12#include <linux/mm.h> 13#include <linux/mmzone.h> 14#include <linux/export.h> 15#include <linux/nodemask.h> 16#include <linux/cpu.h> 17#include <linux/notifier.h> 18#include <linux/of.h> 19#include <linux/pfn.h> 20#include <linux/cpuset.h> 21#include <linux/node.h> 22#include <linux/stop_machine.h> 23#include <linux/proc_fs.h> 24#include <linux/seq_file.h> 25#include <linux/uaccess.h> 26#include <linux/slab.h> 27#include <asm/cputhreads.h> 28#include <asm/sparsemem.h> 29#include <asm/smp.h> 30#include <asm/topology.h> 31#include <asm/firmware.h> 32#include <asm/paca.h> 33#include <asm/hvcall.h> 34#include <asm/setup.h> 35#include <asm/vdso.h> 36#include <asm/drmem.h> 37 38static int numa_enabled = 1; 39 40static char *cmdline __initdata; 41 42int numa_cpu_lookup_table[NR_CPUS]; 43cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 44struct pglist_data *node_data[MAX_NUMNODES]; 45 46EXPORT_SYMBOL(numa_cpu_lookup_table); 47EXPORT_SYMBOL(node_to_cpumask_map); 48EXPORT_SYMBOL(node_data); 49 50static int primary_domain_index; 51static int n_mem_addr_cells, n_mem_size_cells; 52 53#define FORM0_AFFINITY 0 54#define FORM1_AFFINITY 1 55#define FORM2_AFFINITY 2 56static int affinity_form; 57 58#define MAX_DISTANCE_REF_POINTS 4 59static int distance_ref_points_depth; 60static const __be32 *distance_ref_points; 61static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; 62static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = { 63 [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 } 64}; 65static int numa_id_index_table[MAX_NUMNODES] = { [0 ... 
MAX_NUMNODES - 1] = NUMA_NO_NODE }; 66 67/* 68 * Allocate node_to_cpumask_map based on number of available nodes 69 * Requires node_possible_map to be valid. 70 * 71 * Note: cpumask_of_node() is not valid until after this is done. 72 */ 73static void __init setup_node_to_cpumask_map(void) 74{ 75 unsigned int node; 76 77 /* setup nr_node_ids if not done yet */ 78 if (nr_node_ids == MAX_NUMNODES) 79 setup_nr_node_ids(); 80 81 /* allocate the map */ 82 for_each_node(node) 83 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); 84 85 /* cpumask_of_node() will now work */ 86 pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); 87} 88 89static int __init fake_numa_create_new_node(unsigned long end_pfn, 90 unsigned int *nid) 91{ 92 unsigned long long mem; 93 char *p = cmdline; 94 static unsigned int fake_nid; 95 static unsigned long long curr_boundary; 96 97 /* 98 * Modify node id, iff we started creating NUMA nodes 99 * We want to continue from where we left of the last time 100 */ 101 if (fake_nid) 102 *nid = fake_nid; 103 /* 104 * In case there are no more arguments to parse, the 105 * node_id should be the same as the last fake node id 106 * (we've handled this above). 
107 */ 108 if (!p) 109 return 0; 110 111 mem = memparse(p, &p); 112 if (!mem) 113 return 0; 114 115 if (mem < curr_boundary) 116 return 0; 117 118 curr_boundary = mem; 119 120 if ((end_pfn << PAGE_SHIFT) > mem) { 121 /* 122 * Skip commas and spaces 123 */ 124 while (*p == ',' || *p == ' ' || *p == '\t') 125 p++; 126 127 cmdline = p; 128 fake_nid++; 129 *nid = fake_nid; 130 pr_debug("created new fake_node with id %d\n", fake_nid); 131 return 1; 132 } 133 return 0; 134} 135 136static void __init reset_numa_cpu_lookup_table(void) 137{ 138 unsigned int cpu; 139 140 for_each_possible_cpu(cpu) 141 numa_cpu_lookup_table[cpu] = -1; 142} 143 144void map_cpu_to_node(int cpu, int node) 145{ 146 update_numa_cpu_lookup_table(cpu, node); 147 148 if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) { 149 pr_debug("adding cpu %d to node %d\n", cpu, node); 150 cpumask_set_cpu(cpu, node_to_cpumask_map[node]); 151 } 152} 153 154#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) 155void unmap_cpu_from_node(unsigned long cpu) 156{ 157 int node = numa_cpu_lookup_table[cpu]; 158 159 if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { 160 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); 161 pr_debug("removing cpu %lu from node %d\n", cpu, node); 162 } else { 163 pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node); 164 } 165} 166#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ 167 168static int __associativity_to_nid(const __be32 *associativity, 169 int max_array_sz) 170{ 171 int nid; 172 /* 173 * primary_domain_index is 1 based array index. 
174 */ 175 int index = primary_domain_index - 1; 176 177 if (!numa_enabled || index >= max_array_sz) 178 return NUMA_NO_NODE; 179 180 nid = of_read_number(&associativity[index], 1); 181 182 /* POWER4 LPAR uses 0xffff as invalid node */ 183 if (nid == 0xffff || nid >= nr_node_ids) 184 nid = NUMA_NO_NODE; 185 return nid; 186} 187/* 188 * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA 189 * info is found. 190 */ 191static int associativity_to_nid(const __be32 *associativity) 192{ 193 int array_sz = of_read_number(associativity, 1); 194 195 /* Skip the first element in the associativity array */ 196 return __associativity_to_nid((associativity + 1), array_sz); 197} 198 199static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) 200{ 201 int dist; 202 int node1, node2; 203 204 node1 = associativity_to_nid(cpu1_assoc); 205 node2 = associativity_to_nid(cpu2_assoc); 206 207 dist = numa_distance_table[node1][node2]; 208 if (dist <= LOCAL_DISTANCE) 209 return 0; 210 else if (dist <= REMOTE_DISTANCE) 211 return 1; 212 else 213 return 2; 214} 215 216static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) 217{ 218 int dist = 0; 219 220 int i, index; 221 222 for (i = 0; i < distance_ref_points_depth; i++) { 223 index = be32_to_cpu(distance_ref_points[i]); 224 if (cpu1_assoc[index] == cpu2_assoc[index]) 225 break; 226 dist++; 227 } 228 229 return dist; 230} 231 232int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) 233{ 234 /* We should not get called with FORM0 */ 235 VM_WARN_ON(affinity_form == FORM0_AFFINITY); 236 if (affinity_form == FORM1_AFFINITY) 237 return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); 238 return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc); 239} 240 241/* must hold reference to node during call */ 242static const __be32 *of_get_associativity(struct device_node *dev) 243{ 244 return of_get_property(dev, "ibm,associativity", NULL); 245} 246 247int 
__node_distance(int a, int b) 248{ 249 int i; 250 int distance = LOCAL_DISTANCE; 251 252 if (affinity_form == FORM2_AFFINITY) 253 return numa_distance_table[a][b]; 254 else if (affinity_form == FORM0_AFFINITY) 255 return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); 256 257 for (i = 0; i < distance_ref_points_depth; i++) { 258 if (distance_lookup_table[a][i] == distance_lookup_table[b][i]) 259 break; 260 261 /* Double the distance for each NUMA level */ 262 distance *= 2; 263 } 264 265 return distance; 266} 267EXPORT_SYMBOL(__node_distance); 268 269/* Returns the nid associated with the given device tree node, 270 * or -1 if not found. 271 */ 272static int of_node_to_nid_single(struct device_node *device) 273{ 274 int nid = NUMA_NO_NODE; 275 const __be32 *tmp; 276 277 tmp = of_get_associativity(device); 278 if (tmp) 279 nid = associativity_to_nid(tmp); 280 return nid; 281} 282 283/* Walk the device tree upwards, looking for an associativity id */ 284int of_node_to_nid(struct device_node *device) 285{ 286 int nid = NUMA_NO_NODE; 287 288 of_node_get(device); 289 while (device) { 290 nid = of_node_to_nid_single(device); 291 if (nid != -1) 292 break; 293 294 device = of_get_next_parent(device); 295 } 296 of_node_put(device); 297 298 return nid; 299} 300EXPORT_SYMBOL(of_node_to_nid); 301 302static void __initialize_form1_numa_distance(const __be32 *associativity, 303 int max_array_sz) 304{ 305 int i, nid; 306 307 if (affinity_form != FORM1_AFFINITY) 308 return; 309 310 nid = __associativity_to_nid(associativity, max_array_sz); 311 if (nid != NUMA_NO_NODE) { 312 for (i = 0; i < distance_ref_points_depth; i++) { 313 const __be32 *entry; 314 int index = be32_to_cpu(distance_ref_points[i]) - 1; 315 316 /* 317 * broken hierarchy, return with broken distance table 318 */ 319 if (WARN(index >= max_array_sz, "Broken ibm,associativity property")) 320 return; 321 322 entry = &associativity[index]; 323 distance_lookup_table[nid][i] = of_read_number(entry, 1); 324 } 325 } 326} 327 
328static void initialize_form1_numa_distance(const __be32 *associativity) 329{ 330 int array_sz; 331 332 array_sz = of_read_number(associativity, 1); 333 /* Skip the first element in the associativity array */ 334 __initialize_form1_numa_distance(associativity + 1, array_sz); 335} 336 337/* 338 * Used to update distance information w.r.t newly added node. 339 */ 340void update_numa_distance(struct device_node *node) 341{ 342 int nid; 343 344 if (affinity_form == FORM0_AFFINITY) 345 return; 346 else if (affinity_form == FORM1_AFFINITY) { 347 const __be32 *associativity; 348 349 associativity = of_get_associativity(node); 350 if (!associativity) 351 return; 352 353 initialize_form1_numa_distance(associativity); 354 return; 355 } 356 357 /* FORM2 affinity */ 358 nid = of_node_to_nid_single(node); 359 if (nid == NUMA_NO_NODE) 360 return; 361 362 /* 363 * With FORM2 we expect NUMA distance of all possible NUMA 364 * nodes to be provided during boot. 365 */ 366 WARN(numa_distance_table[nid][nid] == -1, 367 "NUMA distance details for node %d not provided\n", nid); 368} 369 370/* 371 * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} 372 * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... 
N elements} 373 */ 374static void __init initialize_form2_numa_distance_lookup_table(void) 375{ 376 int i, j; 377 struct device_node *root; 378 const __u8 *form2_distances; 379 const __be32 *numa_lookup_index; 380 int form2_distances_length; 381 int max_numa_index, distance_index; 382 383 if (firmware_has_feature(FW_FEATURE_OPAL)) 384 root = of_find_node_by_path("/ibm,opal"); 385 else 386 root = of_find_node_by_path("/rtas"); 387 if (!root) 388 root = of_find_node_by_path("/"); 389 390 numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL); 391 max_numa_index = of_read_number(&numa_lookup_index[0], 1); 392 393 /* first element of the array is the size and is encode-int */ 394 form2_distances = of_get_property(root, "ibm,numa-distance-table", NULL); 395 form2_distances_length = of_read_number((const __be32 *)&form2_distances[0], 1); 396 /* Skip the size which is encoded int */ 397 form2_distances += sizeof(__be32); 398 399 pr_debug("form2_distances_len = %d, numa_dist_indexes_len = %d\n", 400 form2_distances_length, max_numa_index); 401 402 for (i = 0; i < max_numa_index; i++) 403 /* +1 skip the max_numa_index in the property */ 404 numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1); 405 406 407 if (form2_distances_length != max_numa_index * max_numa_index) { 408 WARN(1, "Wrong NUMA distance information\n"); 409 form2_distances = NULL; // don't use it 410 } 411 distance_index = 0; 412 for (i = 0; i < max_numa_index; i++) { 413 for (j = 0; j < max_numa_index; j++) { 414 int nodeA = numa_id_index_table[i]; 415 int nodeB = numa_id_index_table[j]; 416 int dist; 417 418 if (form2_distances) 419 dist = form2_distances[distance_index++]; 420 else if (nodeA == nodeB) 421 dist = LOCAL_DISTANCE; 422 else 423 dist = REMOTE_DISTANCE; 424 numa_distance_table[nodeA][nodeB] = dist; 425 pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, dist); 426 } 427 } 428 429 of_node_put(root); 430} 431 432static int __init find_primary_domain_index(void) 433{ 
434 int index; 435 struct device_node *root; 436 437 /* 438 * Check for which form of affinity. 439 */ 440 if (firmware_has_feature(FW_FEATURE_OPAL)) { 441 affinity_form = FORM1_AFFINITY; 442 } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) { 443 pr_debug("Using form 2 affinity\n"); 444 affinity_form = FORM2_AFFINITY; 445 } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { 446 pr_debug("Using form 1 affinity\n"); 447 affinity_form = FORM1_AFFINITY; 448 } else 449 affinity_form = FORM0_AFFINITY; 450 451 if (firmware_has_feature(FW_FEATURE_OPAL)) 452 root = of_find_node_by_path("/ibm,opal"); 453 else 454 root = of_find_node_by_path("/rtas"); 455 if (!root) 456 root = of_find_node_by_path("/"); 457 458 /* 459 * This property is a set of 32-bit integers, each representing 460 * an index into the ibm,associativity nodes. 461 * 462 * With form 0 affinity the first integer is for an SMP configuration 463 * (should be all 0's) and the second is for a normal NUMA 464 * configuration. We have only one level of NUMA. 465 * 466 * With form 1 affinity the first integer is the most significant 467 * NUMA boundary and the following are progressively less significant 468 * boundaries. There can be more than one level of NUMA. 469 */ 470 distance_ref_points = of_get_property(root, 471 "ibm,associativity-reference-points", 472 &distance_ref_points_depth); 473 474 if (!distance_ref_points) { 475 pr_debug("ibm,associativity-reference-points not found.\n"); 476 goto err; 477 } 478 479 distance_ref_points_depth /= sizeof(int); 480 if (affinity_form == FORM0_AFFINITY) { 481 if (distance_ref_points_depth < 2) { 482 pr_warn("short ibm,associativity-reference-points\n"); 483 goto err; 484 } 485 486 index = of_read_number(&distance_ref_points[1], 1); 487 } else { 488 /* 489 * Both FORM1 and FORM2 affinity find the primary domain details 490 * at the same offset. 
491 */ 492 index = of_read_number(distance_ref_points, 1); 493 } 494 /* 495 * Warn and cap if the hardware supports more than 496 * MAX_DISTANCE_REF_POINTS domains. 497 */ 498 if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { 499 pr_warn("distance array capped at %d entries\n", 500 MAX_DISTANCE_REF_POINTS); 501 distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; 502 } 503 504 of_node_put(root); 505 return index; 506 507err: 508 of_node_put(root); 509 return -1; 510} 511 512static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) 513{ 514 struct device_node *memory = NULL; 515 516 memory = of_find_node_by_type(memory, "memory"); 517 if (!memory) 518 panic("numa.c: No memory nodes found!"); 519 520 *n_addr_cells = of_n_addr_cells(memory); 521 *n_size_cells = of_n_size_cells(memory); 522 of_node_put(memory); 523} 524 525static unsigned long read_n_cells(int n, const __be32 **buf) 526{ 527 unsigned long result = 0; 528 529 while (n--) { 530 result = (result << 32) | of_read_number(*buf, 1); 531 (*buf)++; 532 } 533 return result; 534} 535 536struct assoc_arrays { 537 u32 n_arrays; 538 u32 array_sz; 539 const __be32 *arrays; 540}; 541 542/* 543 * Retrieve and validate the list of associativity arrays for drconf 544 * memory from the ibm,associativity-lookup-arrays property of the 545 * device tree.. 546 * 547 * The layout of the ibm,associativity-lookup-arrays property is a number N 548 * indicating the number of associativity arrays, followed by a number M 549 * indicating the size of each associativity array, followed by a list 550 * of N associativity arrays. 
551 */ 552static int of_get_assoc_arrays(struct assoc_arrays *aa) 553{ 554 struct device_node *memory; 555 const __be32 *prop; 556 u32 len; 557 558 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 559 if (!memory) 560 return -1; 561 562 prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len); 563 if (!prop || len < 2 * sizeof(unsigned int)) { 564 of_node_put(memory); 565 return -1; 566 } 567 568 aa->n_arrays = of_read_number(prop++, 1); 569 aa->array_sz = of_read_number(prop++, 1); 570 571 of_node_put(memory); 572 573 /* Now that we know the number of arrays and size of each array, 574 * revalidate the size of the property read in. 575 */ 576 if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int)) 577 return -1; 578 579 aa->arrays = prop; 580 return 0; 581} 582 583static int __init get_nid_and_numa_distance(struct drmem_lmb *lmb) 584{ 585 struct assoc_arrays aa = { .arrays = NULL }; 586 int default_nid = NUMA_NO_NODE; 587 int nid = default_nid; 588 int rc, index; 589 590 if ((primary_domain_index < 0) || !numa_enabled) 591 return default_nid; 592 593 rc = of_get_assoc_arrays(&aa); 594 if (rc) 595 return default_nid; 596 597 if (primary_domain_index <= aa.array_sz && 598 !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { 599 const __be32 *associativity; 600 601 index = lmb->aa_index * aa.array_sz; 602 associativity = &aa.arrays[index]; 603 nid = __associativity_to_nid(associativity, aa.array_sz); 604 if (nid > 0 && affinity_form == FORM1_AFFINITY) { 605 /* 606 * lookup array associativity entries have 607 * no length of the array as the first element. 608 */ 609 __initialize_form1_numa_distance(associativity, aa.array_sz); 610 } 611 } 612 return nid; 613} 614 615/* 616 * This is like of_node_to_nid_single() for memory represented in the 617 * ibm,dynamic-reconfiguration-memory node. 
 *
 * Returns the nid of @lmb, or NUMA_NO_NODE when NUMA is disabled, the
 * lookup arrays cannot be read, or the LMB's associativity index is
 * flagged invalid or out of range.
 */
int of_drconf_to_nid_single(struct drmem_lmb *lmb)
{
	struct assoc_arrays aa = { .arrays = NULL };
	int default_nid = NUMA_NO_NODE;
	int nid = default_nid;
	int rc, index;

	if ((primary_domain_index < 0) || !numa_enabled)
		return default_nid;

	rc = of_get_assoc_arrays(&aa);
	if (rc)
		return default_nid;

	if (primary_domain_index <= aa.array_sz &&
	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
		const __be32 *associativity;

		/* each LMB selects one fixed-size array by index */
		index = lmb->aa_index * aa.array_sz;
		associativity = &aa.arrays[index];
		nid = __associativity_to_nid(associativity, aa.array_sz);
	}
	return nid;
}

#ifdef CONFIG_PPC_SPLPAR

/*
 * Ask the hypervisor for the associativity of logical cpu @lcpu.
 *
 * Returns 0 with @associativity filled in when the VPHN hcall succeeds,
 * -1 when VPHN is not available or the hcall fails.
 */
static int __vphn_get_associativity(long lcpu, __be32 *associativity)
{
	long rc, hwid;

	/*
	 * On a shared lpar, device tree will not have node associativity.
	 * At this time lppaca, or its __old_status field may not be
	 * updated. Hence kernel cannot detect if its on a shared lpar. So
	 * request an explicit associativity irrespective of whether the
	 * lpar is shared or dedicated. Use the device tree property as a
	 * fallback. cpu_to_phys_id is only valid between
	 * smp_setup_cpu_maps() and smp_setup_pacas().
	 */
	if (firmware_has_feature(FW_FEATURE_VPHN)) {
		if (cpu_to_phys_id)
			hwid = cpu_to_phys_id[lcpu];
		else
			hwid = get_hard_smp_processor_id(lcpu);

		rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
		if (rc == H_SUCCESS)
			return 0;
	}

	return -1;
}

/*
 * Node id of logical cpu @lcpu according to VPHN, or NUMA_NO_NODE if the
 * associativity could not be obtained.
 */
static int vphn_get_nid(long lcpu)
{
	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};


	if (!__vphn_get_associativity(lcpu, associativity))
		return associativity_to_nid(associativity);

	return NUMA_NO_NODE;

}
#else

/* Without CONFIG_PPC_SPLPAR there is no VPHN: always report failure. */
static int __vphn_get_associativity(long lcpu, __be32 *associativity)
{
	return -1;
}

/* Without CONFIG_PPC_SPLPAR there is no VPHN: never yields a node. */
static int vphn_get_nid(long unused)
{
	return NUMA_NO_NODE;
}
#endif /* CONFIG_PPC_SPLPAR */

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 *
 * Lookup order: the mapping already recorded for the first thread of the
 * core, then VPHN, then the cpu's device tree node. Anything that does
 * not yield a possible node falls back to first_online_node.
 */
static int numa_setup_cpu(unsigned long lcpu)
{
	struct device_node *cpu;
	int fcpu = cpu_first_thread_sibling(lcpu);
	int nid = NUMA_NO_NODE;

	/* possible-but-not-present cpus are simply parked on an online node */
	if (!cpu_present(lcpu)) {
		set_cpu_numa_node(lcpu, first_online_node);
		return first_online_node;
	}

	/*
	 * If a valid cpu-to-node mapping is already available, use it
	 * directly instead of querying the firmware, since it represents
	 * the most recent mapping notified to us by the platform (eg: VPHN).
	 * Since cpu_to_node binding remains the same for all threads in the
	 * core. If a valid cpu-to-node mapping is already available, for
	 * the first thread in the core, use it.
	 */
	nid = numa_cpu_lookup_table[fcpu];
	if (nid >= 0) {
		map_cpu_to_node(lcpu, nid);
		return nid;
	}

	nid = vphn_get_nid(lcpu);
	if (nid != NUMA_NO_NODE)
		goto out_present;

	cpu = of_get_cpu_node(lcpu, NULL);

	if (!cpu) {
		WARN_ON(1);
		/* nid is still NUMA_NO_NODE here; out_present replaces it */
		if (cpu_present(lcpu))
			goto out_present;
		else
			goto out;
	}

	nid = of_node_to_nid_single(cpu);
	of_node_put(cpu);

out_present:
	if (nid < 0 || !node_possible(nid))
		nid = first_online_node;

	/*
	 * Update for the first thread of the core. All threads of a core
	 * have to be part of the same node. This not only avoids querying
	 * for every other thread in the core, but always avoids a case
	 * where virtual node associativity change causes subsequent threads
	 * of a core to be associated with different nid. However if first
	 * thread is already online, expect it to have a valid mapping.
	 */
	if (fcpu != lcpu) {
		WARN_ON(cpu_online(fcpu));
		map_cpu_to_node(fcpu, nid);
	}

	map_cpu_to_node(lcpu, nid);
out:
	return nid;
}

/*
 * Warn (once per call) if an online thread sibling of @cpu is mapped to
 * a node other than @node.
 */
static void verify_cpu_node_mapping(int cpu, int node)
{
	int base, sibling, i;

	/* Verify that all the threads in the core belong to the same node */
	base = cpu_first_thread_sibling(cpu);

	for (i = 0; i < threads_per_core; i++) {
		sibling = base + i;

		if (sibling == cpu || cpu_is_offline(sibling))
			continue;

		if (cpu_to_node(sibling) != node) {
			WARN(1, "CPU thread siblings %d and %d don't belong"
				" to the same node!\n", cpu, sibling);
			break;
		}
	}
}

/* Must run before sched domains notifier.
*/ 787static int ppc_numa_cpu_prepare(unsigned int cpu) 788{ 789 int nid; 790 791 nid = numa_setup_cpu(cpu); 792 verify_cpu_node_mapping(cpu, nid); 793 return 0; 794} 795 796static int ppc_numa_cpu_dead(unsigned int cpu) 797{ 798 return 0; 799} 800 801/* 802 * Check and possibly modify a memory region to enforce the memory limit. 803 * 804 * Returns the size the region should have to enforce the memory limit. 805 * This will either be the original value of size, a truncated value, 806 * or zero. If the returned value of size is 0 the region should be 807 * discarded as it lies wholly above the memory limit. 808 */ 809static unsigned long __init numa_enforce_memory_limit(unsigned long start, 810 unsigned long size) 811{ 812 /* 813 * We use memblock_end_of_DRAM() in here instead of memory_limit because 814 * we've already adjusted it for the limit and it takes care of 815 * having memory holes below the limit. Also, in the case of 816 * iommu_is_off, memory_limit is not set but is implicitly enforced. 817 */ 818 819 if (start + size <= memblock_end_of_DRAM()) 820 return size; 821 822 if (start >= memblock_end_of_DRAM()) 823 return 0; 824 825 return memblock_end_of_DRAM() - start; 826} 827 828/* 829 * Reads the counter for a given entry in 830 * linux,drconf-usable-memory property 831 */ 832static inline int __init read_usm_ranges(const __be32 **usm) 833{ 834 /* 835 * For each lmb in ibm,dynamic-memory a corresponding 836 * entry in linux,drconf-usable-memory property contains 837 * a counter followed by that many (base, size) duple. 838 * read the counter from linux,drconf-usable-memory 839 */ 840 return read_n_cells(n_mem_size_cells, usm); 841} 842 843/* 844 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory 845 * node. This assumes n_mem_{addr,size}_cells have been set. 
846 */ 847static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb, 848 const __be32 **usm, 849 void *data) 850{ 851 unsigned int ranges, is_kexec_kdump = 0; 852 unsigned long base, size, sz; 853 int nid; 854 855 /* 856 * Skip this block if the reserved bit is set in flags (0x80) 857 * or if the block is not assigned to this partition (0x8) 858 */ 859 if ((lmb->flags & DRCONF_MEM_RESERVED) 860 || !(lmb->flags & DRCONF_MEM_ASSIGNED)) 861 return 0; 862 863 if (*usm) 864 is_kexec_kdump = 1; 865 866 base = lmb->base_addr; 867 size = drmem_lmb_size(); 868 ranges = 1; 869 870 if (is_kexec_kdump) { 871 ranges = read_usm_ranges(usm); 872 if (!ranges) /* there are no (base, size) duple */ 873 return 0; 874 } 875 876 do { 877 if (is_kexec_kdump) { 878 base = read_n_cells(n_mem_addr_cells, usm); 879 size = read_n_cells(n_mem_size_cells, usm); 880 } 881 882 nid = get_nid_and_numa_distance(lmb); 883 fake_numa_create_new_node(((base + size) >> PAGE_SHIFT), 884 &nid); 885 node_set_online(nid); 886 sz = numa_enforce_memory_limit(base, size); 887 if (sz) 888 memblock_set_node(base, sz, &memblock.memory, nid); 889 } while (--ranges); 890 891 return 0; 892} 893 894static int __init parse_numa_properties(void) 895{ 896 struct device_node *memory; 897 int default_nid = 0; 898 unsigned long i; 899 const __be32 *associativity; 900 901 if (numa_enabled == 0) { 902 pr_warn("disabled by user\n"); 903 return -1; 904 } 905 906 primary_domain_index = find_primary_domain_index(); 907 908 if (primary_domain_index < 0) { 909 /* 910 * if we fail to parse primary_domain_index from device tree 911 * mark the numa disabled, boot with numa disabled. 912 */ 913 numa_enabled = false; 914 return primary_domain_index; 915 } 916 917 pr_debug("associativity depth for CPU/Memory: %d\n", primary_domain_index); 918 919 /* 920 * If it is FORM2 initialize the distance table here. 
921 */ 922 if (affinity_form == FORM2_AFFINITY) 923 initialize_form2_numa_distance_lookup_table(); 924 925 /* 926 * Even though we connect cpus to numa domains later in SMP 927 * init, we need to know the node ids now. This is because 928 * each node to be onlined must have NODE_DATA etc backing it. 929 */ 930 for_each_present_cpu(i) { 931 __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE]; 932 struct device_node *cpu; 933 int nid = NUMA_NO_NODE; 934 935 memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32)); 936 937 if (__vphn_get_associativity(i, vphn_assoc) == 0) { 938 nid = associativity_to_nid(vphn_assoc); 939 initialize_form1_numa_distance(vphn_assoc); 940 } else { 941 942 /* 943 * Don't fall back to default_nid yet -- we will plug 944 * cpus into nodes once the memory scan has discovered 945 * the topology. 946 */ 947 cpu = of_get_cpu_node(i, NULL); 948 BUG_ON(!cpu); 949 950 associativity = of_get_associativity(cpu); 951 if (associativity) { 952 nid = associativity_to_nid(associativity); 953 initialize_form1_numa_distance(associativity); 954 } 955 of_node_put(cpu); 956 } 957 958 /* node_set_online() is an UB if 'nid' is negative */ 959 if (likely(nid >= 0)) 960 node_set_online(nid); 961 } 962 963 get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); 964 965 for_each_node_by_type(memory, "memory") { 966 unsigned long start; 967 unsigned long size; 968 int nid; 969 int ranges; 970 const __be32 *memcell_buf; 971 unsigned int len; 972 973 memcell_buf = of_get_property(memory, 974 "linux,usable-memory", &len); 975 if (!memcell_buf || len <= 0) 976 memcell_buf = of_get_property(memory, "reg", &len); 977 if (!memcell_buf || len <= 0) 978 continue; 979 980 /* ranges in cell */ 981 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); 982new_range: 983 /* these are order-sensitive, and modify the buffer pointer */ 984 start = read_n_cells(n_mem_addr_cells, &memcell_buf); 985 size = read_n_cells(n_mem_size_cells, &memcell_buf); 986 987 /* 988 * Assumption: either all 
memory nodes or none will 989 * have associativity properties. If none, then 990 * everything goes to default_nid. 991 */ 992 associativity = of_get_associativity(memory); 993 if (associativity) { 994 nid = associativity_to_nid(associativity); 995 initialize_form1_numa_distance(associativity); 996 } else 997 nid = default_nid; 998 999 fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); 1000 node_set_online(nid); 1001 1002 size = numa_enforce_memory_limit(start, size); 1003 if (size) 1004 memblock_set_node(start, size, &memblock.memory, nid); 1005 1006 if (--ranges) 1007 goto new_range; 1008 } 1009 1010 /* 1011 * Now do the same thing for each MEMBLOCK listed in the 1012 * ibm,dynamic-memory property in the 1013 * ibm,dynamic-reconfiguration-memory node. 1014 */ 1015 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 1016 if (memory) { 1017 walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb); 1018 of_node_put(memory); 1019 } 1020 1021 return 0; 1022} 1023 1024static void __init setup_nonnuma(void) 1025{ 1026 unsigned long top_of_ram = memblock_end_of_DRAM(); 1027 unsigned long total_ram = memblock_phys_mem_size(); 1028 unsigned long start_pfn, end_pfn; 1029 unsigned int nid = 0; 1030 int i; 1031 1032 pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram); 1033 pr_debug("Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); 1034 1035 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { 1036 fake_numa_create_new_node(end_pfn, &nid); 1037 memblock_set_node(PFN_PHYS(start_pfn), 1038 PFN_PHYS(end_pfn - start_pfn), 1039 &memblock.memory, nid); 1040 node_set_online(nid); 1041 } 1042} 1043 1044void __init dump_numa_cpu_topology(void) 1045{ 1046 unsigned int node; 1047 unsigned int cpu, count; 1048 1049 if (!numa_enabled) 1050 return; 1051 1052 for_each_online_node(node) { 1053 pr_info("Node %d CPUs:", node); 1054 1055 count = 0; 1056 /* 1057 * If we used a CPU iterator here we would miss printing 1058 
* the holes in the cpumap. 1059 */ 1060 for (cpu = 0; cpu < nr_cpu_ids; cpu++) { 1061 if (cpumask_test_cpu(cpu, 1062 node_to_cpumask_map[node])) { 1063 if (count == 0) 1064 pr_cont(" %u", cpu); 1065 ++count; 1066 } else { 1067 if (count > 1) 1068 pr_cont("-%u", cpu - 1); 1069 count = 0; 1070 } 1071 } 1072 1073 if (count > 1) 1074 pr_cont("-%u", nr_cpu_ids - 1); 1075 pr_cont("\n"); 1076 } 1077} 1078 1079/* Initialize NODE_DATA for a node on the local memory */ 1080static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) 1081{ 1082 u64 spanned_pages = end_pfn - start_pfn; 1083 const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); 1084 u64 nd_pa; 1085 void *nd; 1086 int tnid; 1087 1088 nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); 1089 if (!nd_pa) 1090 panic("Cannot allocate %zu bytes for node %d data\n", 1091 nd_size, nid); 1092 1093 nd = __va(nd_pa); 1094 1095 /* report and initialize */ 1096 pr_info(" NODE_DATA [mem %#010Lx-%#010Lx]\n", 1097 nd_pa, nd_pa + nd_size - 1); 1098 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); 1099 if (tnid != nid) 1100 pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid); 1101 1102 node_data[nid] = nd; 1103 memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); 1104 NODE_DATA(nid)->node_id = nid; 1105 NODE_DATA(nid)->node_start_pfn = start_pfn; 1106 NODE_DATA(nid)->node_spanned_pages = spanned_pages; 1107} 1108 1109static void __init find_possible_nodes(void) 1110{ 1111 struct device_node *rtas; 1112 const __be32 *domains = NULL; 1113 int prop_length, max_nodes; 1114 u32 i; 1115 1116 if (!numa_enabled) 1117 return; 1118 1119 rtas = of_find_node_by_path("/rtas"); 1120 if (!rtas) 1121 return; 1122 1123 /* 1124 * ibm,current-associativity-domains is a fairly recent property. If 1125 * it doesn't exist, then fallback on ibm,max-associativity-domains. 1126 * Current denotes what the platform can support compared to max 1127 * which denotes what the Hypervisor can support. 
1128 * 1129 * If the LPAR is migratable, new nodes might be activated after a LPM, 1130 * so we should consider the max number in that case. 1131 */ 1132 if (!of_get_property(of_root, "ibm,migratable-partition", NULL)) 1133 domains = of_get_property(rtas, 1134 "ibm,current-associativity-domains", 1135 &prop_length); 1136 if (!domains) { 1137 domains = of_get_property(rtas, "ibm,max-associativity-domains", 1138 &prop_length); 1139 if (!domains) 1140 goto out; 1141 } 1142 1143 max_nodes = of_read_number(&domains[primary_domain_index], 1); 1144 pr_info("Partition configured for %d NUMA nodes.\n", max_nodes); 1145 1146 for (i = 0; i < max_nodes; i++) { 1147 if (!node_possible(i)) 1148 node_set(i, node_possible_map); 1149 } 1150 1151 prop_length /= sizeof(int); 1152 if (prop_length > primary_domain_index + 2) 1153 coregroup_enabled = 1; 1154 1155out: 1156 of_node_put(rtas); 1157} 1158 1159void __init mem_topology_setup(void) 1160{ 1161 int cpu; 1162 1163 /* 1164 * Linux/mm assumes node 0 to be online at boot. However this is not 1165 * true on PowerPC, where node 0 is similar to any other node, it 1166 * could be cpuless, memoryless node. So force node 0 to be offline 1167 * for now. This will prevent cpuless, memoryless node 0 showing up 1168 * unnecessarily as online. If a node has cpus or memory that need 1169 * to be online, then node will anyway be marked online. 1170 */ 1171 node_set_offline(0); 1172 1173 if (parse_numa_properties()) 1174 setup_nonnuma(); 1175 1176 /* 1177 * Modify the set of possible NUMA nodes to reflect information 1178 * available about the set of online nodes, and the set of nodes 1179 * that we expect to make use of for this platform's affinity 1180 * calculations. 
1181 */ 1182 nodes_and(node_possible_map, node_possible_map, node_online_map); 1183 1184 find_possible_nodes(); 1185 1186 setup_node_to_cpumask_map(); 1187 1188 reset_numa_cpu_lookup_table(); 1189 1190 for_each_possible_cpu(cpu) { 1191 /* 1192 * Powerpc with CONFIG_NUMA always used to have a node 0, 1193 * even if it was memoryless or cpuless. For all cpus that 1194 * are possible but not present, cpu_to_node() would point 1195 * to node 0. To remove a cpuless, memoryless dummy node, 1196 * powerpc need to make sure all possible but not present 1197 * cpu_to_node are set to a proper node. 1198 */ 1199 numa_setup_cpu(cpu); 1200 } 1201} 1202 1203void __init initmem_init(void) 1204{ 1205 int nid; 1206 1207 max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; 1208 max_pfn = max_low_pfn; 1209 1210 memblock_dump_all(); 1211 1212 for_each_online_node(nid) { 1213 unsigned long start_pfn, end_pfn; 1214 1215 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 1216 setup_node_data(nid, start_pfn, end_pfn); 1217 } 1218 1219 sparse_init(); 1220 1221 /* 1222 * We need the numa_cpu_lookup_table to be accurate for all CPUs, 1223 * even before we online them, so that we can use cpu_to_{node,mem} 1224 * early in boot, cf. smp_prepare_cpus(). 1225 * _nocalls() + manual invocation is used because cpuhp is not yet 1226 * initialized for the boot CPU. 
1227 */ 1228 cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare", 1229 ppc_numa_cpu_prepare, ppc_numa_cpu_dead); 1230} 1231 1232static int __init early_numa(char *p) 1233{ 1234 if (!p) 1235 return 0; 1236 1237 if (strstr(p, "off")) 1238 numa_enabled = 0; 1239 1240 p = strstr(p, "fake="); 1241 if (p) 1242 cmdline = p + strlen("fake="); 1243 1244 return 0; 1245} 1246early_param("numa", early_numa); 1247 1248#ifdef CONFIG_MEMORY_HOTPLUG 1249/* 1250 * Find the node associated with a hot added memory section for 1251 * memory represented in the device tree by the property 1252 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory. 1253 */ 1254static int hot_add_drconf_scn_to_nid(unsigned long scn_addr) 1255{ 1256 struct drmem_lmb *lmb; 1257 unsigned long lmb_size; 1258 int nid = NUMA_NO_NODE; 1259 1260 lmb_size = drmem_lmb_size(); 1261 1262 for_each_drmem_lmb(lmb) { 1263 /* skip this block if it is reserved or not assigned to 1264 * this partition */ 1265 if ((lmb->flags & DRCONF_MEM_RESERVED) 1266 || !(lmb->flags & DRCONF_MEM_ASSIGNED)) 1267 continue; 1268 1269 if ((scn_addr < lmb->base_addr) 1270 || (scn_addr >= (lmb->base_addr + lmb_size))) 1271 continue; 1272 1273 nid = of_drconf_to_nid_single(lmb); 1274 break; 1275 } 1276 1277 return nid; 1278} 1279 1280/* 1281 * Find the node associated with a hot added memory section for memory 1282 * represented in the device tree as a node (i.e. memory@XXXX) for 1283 * each memblock. 
1284 */ 1285static int hot_add_node_scn_to_nid(unsigned long scn_addr) 1286{ 1287 struct device_node *memory; 1288 int nid = NUMA_NO_NODE; 1289 1290 for_each_node_by_type(memory, "memory") { 1291 unsigned long start, size; 1292 int ranges; 1293 const __be32 *memcell_buf; 1294 unsigned int len; 1295 1296 memcell_buf = of_get_property(memory, "reg", &len); 1297 if (!memcell_buf || len <= 0) 1298 continue; 1299 1300 /* ranges in cell */ 1301 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); 1302 1303 while (ranges--) { 1304 start = read_n_cells(n_mem_addr_cells, &memcell_buf); 1305 size = read_n_cells(n_mem_size_cells, &memcell_buf); 1306 1307 if ((scn_addr < start) || (scn_addr >= (start + size))) 1308 continue; 1309 1310 nid = of_node_to_nid_single(memory); 1311 break; 1312 } 1313 1314 if (nid >= 0) 1315 break; 1316 } 1317 1318 of_node_put(memory); 1319 1320 return nid; 1321} 1322 1323/* 1324 * Find the node associated with a hot added memory section. Section 1325 * corresponds to a SPARSEMEM section, not an MEMBLOCK. It is assumed that 1326 * sections are fully contained within a single MEMBLOCK. 
1327 */ 1328int hot_add_scn_to_nid(unsigned long scn_addr) 1329{ 1330 struct device_node *memory = NULL; 1331 int nid; 1332 1333 if (!numa_enabled) 1334 return first_online_node; 1335 1336 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 1337 if (memory) { 1338 nid = hot_add_drconf_scn_to_nid(scn_addr); 1339 of_node_put(memory); 1340 } else { 1341 nid = hot_add_node_scn_to_nid(scn_addr); 1342 } 1343 1344 if (nid < 0 || !node_possible(nid)) 1345 nid = first_online_node; 1346 1347 return nid; 1348} 1349 1350static u64 hot_add_drconf_memory_max(void) 1351{ 1352 struct device_node *memory = NULL; 1353 struct device_node *dn = NULL; 1354 const __be64 *lrdr = NULL; 1355 1356 dn = of_find_node_by_path("/rtas"); 1357 if (dn) { 1358 lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL); 1359 of_node_put(dn); 1360 if (lrdr) 1361 return be64_to_cpup(lrdr); 1362 } 1363 1364 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); 1365 if (memory) { 1366 of_node_put(memory); 1367 return drmem_lmb_memory_max(); 1368 } 1369 return 0; 1370} 1371 1372/* 1373 * memory_hotplug_max - return max address of memory that may be added 1374 * 1375 * This is currently only used on systems that support drconfig memory 1376 * hotplug. 1377 */ 1378u64 memory_hotplug_max(void) 1379{ 1380 return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM()); 1381} 1382#endif /* CONFIG_MEMORY_HOTPLUG */ 1383 1384/* Virtual Processor Home Node (VPHN) support */ 1385#ifdef CONFIG_PPC_SPLPAR 1386static int topology_inited; 1387 1388/* 1389 * Retrieve the new associativity information for a virtual processor's 1390 * home node. 1391 */ 1392static long vphn_get_associativity(unsigned long cpu, 1393 __be32 *associativity) 1394{ 1395 long rc; 1396 1397 rc = hcall_vphn(get_hard_smp_processor_id(cpu), 1398 VPHN_FLAG_VCPU, associativity); 1399 1400 switch (rc) { 1401 case H_SUCCESS: 1402 pr_debug("VPHN hcall succeeded. 
Reset polling...\n"); 1403 goto out; 1404 1405 case H_FUNCTION: 1406 pr_err_ratelimited("VPHN unsupported. Disabling polling...\n"); 1407 break; 1408 case H_HARDWARE: 1409 pr_err_ratelimited("hcall_vphn() experienced a hardware fault " 1410 "preventing VPHN. Disabling polling...\n"); 1411 break; 1412 case H_PARAMETER: 1413 pr_err_ratelimited("hcall_vphn() was passed an invalid parameter. " 1414 "Disabling polling...\n"); 1415 break; 1416 default: 1417 pr_err_ratelimited("hcall_vphn() returned %ld. Disabling polling...\n" 1418 , rc); 1419 break; 1420 } 1421out: 1422 return rc; 1423} 1424 1425void find_and_update_cpu_nid(int cpu) 1426{ 1427 __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; 1428 int new_nid; 1429 1430 /* Use associativity from first thread for all siblings */ 1431 if (vphn_get_associativity(cpu, associativity)) 1432 return; 1433 1434 /* Do not have previous associativity, so find it now. */ 1435 new_nid = associativity_to_nid(associativity); 1436 1437 if (new_nid < 0 || !node_possible(new_nid)) 1438 new_nid = first_online_node; 1439 else 1440 // Associate node <-> cpu, so cpu_up() calls 1441 // try_online_node() on the right node. 
1442 set_cpu_numa_node(cpu, new_nid); 1443 1444 pr_debug("%s:%d cpu %d nid %d\n", __func__, __LINE__, cpu, new_nid); 1445} 1446 1447int cpu_to_coregroup_id(int cpu) 1448{ 1449 __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; 1450 int index; 1451 1452 if (cpu < 0 || cpu > nr_cpu_ids) 1453 return -1; 1454 1455 if (!coregroup_enabled) 1456 goto out; 1457 1458 if (!firmware_has_feature(FW_FEATURE_VPHN)) 1459 goto out; 1460 1461 if (vphn_get_associativity(cpu, associativity)) 1462 goto out; 1463 1464 index = of_read_number(associativity, 1); 1465 if (index > primary_domain_index + 1) 1466 return of_read_number(&associativity[index - 1], 1); 1467 1468out: 1469 return cpu_to_core_id(cpu); 1470} 1471 1472static int topology_update_init(void) 1473{ 1474 topology_inited = 1; 1475 return 0; 1476} 1477device_initcall(topology_update_init); 1478#endif /* CONFIG_PPC_SPLPAR */