root/arch/x86/mm/numa_emulation.c

DEFINITIONS

This source file includes the following definitions:
  1. numa_emu_cmdline
  2. emu_find_memblk_by_nid
  3. mem_hole_size
  4. emu_setup_memblk
  5. split_nodes_interleave
  6. find_end_of_node
  7. uniform_size
  8. split_nodes_size_interleave_uniform
  9. split_nodes_size_interleave
  10. setup_emu2phys_nid
  11. numa_emulation
  12. numa_add_cpu
  13. numa_remove_cpu
  14. numa_set_cpumask
  15. numa_add_cpu
  16. numa_remove_cpu

// SPDX-License-Identifier: GPL-2.0
/*
 * NUMA emulation
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
#include <asm/dma.h>

#include "numa_internal.h"

static int emu_nid_to_phys[MAX_NUMNODES];
static char *emu_cmdline __initdata;

void __init numa_emu_cmdline(char *str)
{
        emu_cmdline = str;
}
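
/*
 * How emu_cmdline gets here (a sketch, assuming the conventional call
 * site in arch/x86/mm/numa.c; not part of this file): the early "numa="
 * parameter handler strips the "fake=" prefix, so "numa=fake=4" stores
 * the string "4":
 *
 *	static __init int numa_setup(char *opt)
 *	{
 *		...
 *		if (!strncmp(opt, "fake=", 5))
 *			numa_emu_cmdline(opt + 5);
 *		...
 *	}
 */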

static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
{
        int i;

        for (i = 0; i < mi->nr_blks; i++)
                if (mi->blk[i].nid == nid)
                        return i;
        return -ENOENT;
}

static u64 __init mem_hole_size(u64 start, u64 end)
{
        unsigned long start_pfn = PFN_UP(start);
        unsigned long end_pfn = PFN_DOWN(end);

        if (start_pfn < end_pfn)
                return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
        return 0;
}
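
/*
 * Worked example (illustrative, assuming 4KiB pages): for
 * mem_hole_size(0x1800, 0x5800), PFN_UP()/PFN_DOWN() shrink the range
 * inward to whole pages, PFNs 2 through 4, and the result is PFN_PHYS()
 * of however many of those three pages memblock reports as absent; a
 * fully populated range yields 0.
 */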

/*
 * Set up a memblk for emulated node @nid, carving @size bytes off the
 * start of physical block @phys_blk in @pi.  The return value is -errno
 * if something went wrong, 0 otherwise.
 */
static int __init emu_setup_memblk(struct numa_meminfo *ei,
                                   struct numa_meminfo *pi,
                                   int nid, int phys_blk, u64 size)
{
        struct numa_memblk *eb = &ei->blk[ei->nr_blks];
        struct numa_memblk *pb = &pi->blk[phys_blk];

        if (ei->nr_blks >= NR_NODE_MEMBLKS) {
                pr_err("NUMA: Too many emulated memblks, failing emulation\n");
                return -EINVAL;
        }

        ei->nr_blks++;
        eb->start = pb->start;
        eb->end = pb->start + size;
        eb->nid = nid;

        if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
                emu_nid_to_phys[nid] = pb->nid;

        pb->start += size;
        if (pb->start >= pb->end) {
                WARN_ON_ONCE(pb->start > pb->end);
                numa_remove_memblk_from(phys_blk, pi);
        }

        printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
               nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
        return 0;
}
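
/*
 * Illustrative behaviour (not in the original source): two successive
 * calls with size = 1GiB against a physical block spanning [0, 4GiB)
 * create emulated blocks [0, 1GiB) and [1GiB, 2GiB) and shrink the
 * physical block to [2GiB, 4GiB); once a physical block is fully
 * consumed it is removed from @pi.
 */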

/*
 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
 * to max_addr.
 *
 * Returns zero on success or negative on error.
 */
static int __init split_nodes_interleave(struct numa_meminfo *ei,
                                         struct numa_meminfo *pi,
                                         u64 addr, u64 max_addr, int nr_nodes)
{
        nodemask_t physnode_mask = numa_nodes_parsed;
        u64 size;
        int big;
        int nid = 0;
        int i, ret;

        if (nr_nodes <= 0)
                return -1;
        if (nr_nodes > MAX_NUMNODES) {
                pr_info("numa=fake=%d too large, reducing to %d\n",
                        nr_nodes, MAX_NUMNODES);
                nr_nodes = MAX_NUMNODES;
        }

        /*
         * Calculate target node size.  x86_32 freaks on __udivdi3() so do
         * the division in ulong number of pages and convert back.
         */
        size = max_addr - addr - mem_hole_size(addr, max_addr);
        size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);

        /*
         * Calculate the number of big nodes that can be allocated as a result
         * of consolidating the remainder.
         */
        big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
                FAKE_NODE_MIN_SIZE;

        size &= FAKE_NODE_MIN_HASH_MASK;
        if (!size) {
                pr_err("Not enough memory for each node.  "
                        "NUMA emulation disabled.\n");
                return -1;
        }

        /*
         * Continue to fill physical nodes with fake nodes until there is no
         * memory left on any of them.
         */
        while (nodes_weight(physnode_mask)) {
                for_each_node_mask(i, physnode_mask) {
                        u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
                        u64 start, limit, end;
                        int phys_blk;

                        phys_blk = emu_find_memblk_by_nid(i, pi);
                        if (phys_blk < 0) {
                                node_clear(i, physnode_mask);
                                continue;
                        }
                        start = pi->blk[phys_blk].start;
                        limit = pi->blk[phys_blk].end;
                        end = start + size;

                        if (nid < big)
                                end += FAKE_NODE_MIN_SIZE;

                        /*
                         * Continue to add memory to this fake node if its
                         * non-reserved memory is less than the per-node size.
                         */
                        while (end - start - mem_hole_size(start, end) < size) {
                                end += FAKE_NODE_MIN_SIZE;
                                if (end > limit) {
                                        end = limit;
                                        break;
                                }
                        }

                        /*
                         * If there won't be at least FAKE_NODE_MIN_SIZE of
                         * non-reserved memory in ZONE_DMA32 for the next node,
                         * this one must extend to the boundary.
                         */
                        if (end < dma32_end && dma32_end - end -
                            mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
                                end = dma32_end;

                        /*
                         * If there won't be enough non-reserved memory for the
                         * next node, this one must extend to the end of the
                         * physical node.
                         */
                        if (limit - end - mem_hole_size(end, limit) < size)
                                end = limit;

                        ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
                                               phys_blk,
                                               min(end, limit) - start);
                        if (ret < 0)
                                return ret;
                }
        }
        return 0;
}
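
/*
 * Example outcome (illustrative, assuming two equally sized physical
 * nodes with no holes): "numa=fake=8" creates four emulated nodes per
 * physical node, and because nid advances once per physical node per
 * pass, emulated nids 0, 2, 4 and 6 land on physical node 0 while
 * 1, 3, 5 and 7 land on physical node 1.
 */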

/*
 * Returns the end address of a node so that there is at least `size' amount of
 * non-reserved memory or `max_addr' is reached.
 */
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
        u64 end = start + size;

        while (end - start - mem_hole_size(start, end) < size) {
                end += FAKE_NODE_MIN_SIZE;
                if (end > max_addr) {
                        end = max_addr;
                        break;
                }
        }
        return end;
}

static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
{
        unsigned long max_pfn = PHYS_PFN(max_addr);
        unsigned long base_pfn = PHYS_PFN(base);
        unsigned long hole_pfns = PHYS_PFN(hole);

        return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
}
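
/*
 * Worked example (illustrative): uniform_size(16GiB, 0, 1GiB, 3)
 * computes (16GiB - 0 - 1GiB) / 3 = 5GiB per node.  The subtraction and
 * division happen on page frame numbers in unsigned long, so x86_32
 * never needs a 64-bit division (no __udivdi3 call).
 */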

/*
 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
 * `addr' to `max_addr'.
 *
 * Returns the next unused emulated nid on success or negative on error.
 */
static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
                                              struct numa_meminfo *pi,
                                              u64 addr, u64 max_addr, u64 size,
                                              int nr_nodes, struct numa_memblk *pblk,
                                              int nid)
{
        nodemask_t physnode_mask = numa_nodes_parsed;
        int i, ret, uniform = 0;
        u64 min_size;

        if ((!size && !nr_nodes) || (nr_nodes && !pblk))
                return -1;

        /*
         * In the 'uniform' case split the passed in physical node by
         * nr_nodes, in the non-uniform case, ignore the passed in
         * physical block and try to create nodes of at least size
         * @size.
         *
         * In the uniform case, split the nodes strictly by physical
         * capacity, i.e. ignore holes. In the non-uniform case account
         * for holes and treat @size as a minimum floor.
         */
        if (!nr_nodes)
                nr_nodes = MAX_NUMNODES;
        else {
                nodes_clear(physnode_mask);
                node_set(pblk->nid, physnode_mask);
                uniform = 1;
        }

        if (uniform) {
                min_size = uniform_size(max_addr, addr, 0, nr_nodes);
                size = min_size;
        } else {
                /*
                 * The limit on emulated nodes is MAX_NUMNODES, so the
                 * size per node is increased accordingly if the
                 * requested size is too small.  This creates a uniform
                 * distribution of node sizes across the entire machine
                 * (but not necessarily over physical nodes).
                 */
                min_size = uniform_size(max_addr, addr,
                                mem_hole_size(addr, max_addr), nr_nodes);
        }
        min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
        if (size < min_size) {
                pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
                        size >> 20, min_size >> 20);
                size = min_size;
        }
        size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);

        /*
         * Fill physical nodes with fake nodes of size until there is no memory
         * left on any of them.
         */
        while (nodes_weight(physnode_mask)) {
                for_each_node_mask(i, physnode_mask) {
                        u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
                        u64 start, limit, end;
                        int phys_blk;

                        phys_blk = emu_find_memblk_by_nid(i, pi);
                        if (phys_blk < 0) {
                                node_clear(i, physnode_mask);
                                continue;
                        }

                        start = pi->blk[phys_blk].start;
                        limit = pi->blk[phys_blk].end;

                        if (uniform)
                                end = start + size;
                        else
                                end = find_end_of_node(start, limit, size);
                        /*
                         * If there won't be at least FAKE_NODE_MIN_SIZE of
                         * non-reserved memory in ZONE_DMA32 for the next node,
                         * this one must extend to the boundary.
                         */
                        if (end < dma32_end && dma32_end - end -
                            mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
                                end = dma32_end;

                        /*
                         * If there won't be enough non-reserved memory for the
                         * next node, this one must extend to the end of the
                         * physical node.
                         */
                        if ((limit - end - mem_hole_size(end, limit) < size)
                                        && !uniform)
                                end = limit;

                        ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
                                               phys_blk,
                                               min(end, limit) - start);
                        if (ret < 0)
                                return ret;
                }
        }
        return nid;
}

static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
                                              struct numa_meminfo *pi,
                                              u64 addr, u64 max_addr, u64 size)
{
        return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
                        0, NULL, NUMA_NO_NODE);
}
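
/*
 * Example (illustrative): for "numa=fake=512M" the parser in
 * numa_emulation() calls this wrapper with size = 512MiB, and 512MiB
 * fake nodes are interleaved across all parsed physical nodes until
 * their memory is exhausted.
 */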

/*
 * Find the highest emulated nid; also pick the physical nid mapped by the
 * lowest mapped emulated nid as the default for nids left unmapped.
 */
static int __init setup_emu2phys_nid(int *dfl_phys_nid)
{
        int i, max_emu_nid = 0;

        *dfl_phys_nid = NUMA_NO_NODE;
        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
                if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
                        max_emu_nid = i;
                        if (*dfl_phys_nid == NUMA_NO_NODE)
                                *dfl_phys_nid = emu_nid_to_phys[i];
                }
        }

        return max_emu_nid;
}

/**
 * numa_emulation - Emulate NUMA nodes
 * @numa_meminfo: NUMA configuration to massage
 * @numa_dist_cnt: The size of the physical NUMA distance table
 *
 * Emulate NUMA nodes according to the numa=fake kernel parameter.
 * @numa_meminfo contains the physical memory configuration and is modified
 * to reflect the emulated configuration on success.  @numa_dist_cnt is
 * used to determine the size of the physical distance table.
 *
 * On success, the following modifications are made.
 *
 * - @numa_meminfo is updated to reflect the emulated nodes.
 *
 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
 *   emulated nodes.
 *
 * - NUMA distance table is rebuilt to represent distances between emulated
 *   nodes.  The distances are determined considering how emulated nodes
 *   are mapped to physical nodes and match the actual distances.
 *
 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
 *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
 *
 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
 * identity mapping and no other modification is made.
 */
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
        static struct numa_meminfo ei __initdata;
        static struct numa_meminfo pi __initdata;
        const u64 max_addr = PFN_PHYS(max_pfn);
        u8 *phys_dist = NULL;
        size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
        int max_emu_nid, dfl_phys_nid;
        int i, j, ret;

        if (!emu_cmdline)
                goto no_emu;

        memset(&ei, 0, sizeof(ei));
        pi = *numa_meminfo;

        for (i = 0; i < MAX_NUMNODES; i++)
                emu_nid_to_phys[i] = NUMA_NO_NODE;

        /*
         * If the numa=fake command-line contains a 'M' or 'G', it represents
         * the fixed node size.  Otherwise, if it is just a single number N,
         * split the system RAM into N fake nodes.
         */
        if (strchr(emu_cmdline, 'U')) {
                nodemask_t physnode_mask = numa_nodes_parsed;
                unsigned long n;
                int nid = 0;

                n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
                ret = -1;
                for_each_node_mask(i, physnode_mask) {
                        /*
                         * Pass in blk[0] on every iteration:
                         * numa_remove_memblk_from(), called by
                         * emu_setup_memblk(), deletes entry 0 and moves
                         * everything else up in the pi.blk array, so the
                         * next physical block to consume is always blk[0].
                         */
                        ret = split_nodes_size_interleave_uniform(&ei, &pi,
                                        pi.blk[0].start, pi.blk[0].end, 0,
                                        n, &pi.blk[0], nid);
                        if (ret < 0)
                                break;
                        if (ret < n) {
                                pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
                                                __func__, i, ret, n);
                                ret = -1;
                                break;
                        }
                        nid = ret;
                }
        } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
                u64 size;

                size = memparse(emu_cmdline, &emu_cmdline);
                ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
        } else {
                unsigned long n;

                n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
                ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
        }
        if (*emu_cmdline == ':')
                emu_cmdline++;

        if (ret < 0)
                goto no_emu;

        if (numa_cleanup_meminfo(&ei) < 0) {
                pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
                goto no_emu;
        }

        /* copy the physical distance table */
        if (numa_dist_cnt) {
                u64 phys;

                phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
                                              phys_size, PAGE_SIZE);
                if (!phys) {
                        pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
                        goto no_emu;
                }
                memblock_reserve(phys, phys_size);
                phys_dist = __va(phys);

                for (i = 0; i < numa_dist_cnt; i++)
                        for (j = 0; j < numa_dist_cnt; j++)
                                phys_dist[i * numa_dist_cnt + j] =
                                        node_distance(i, j);
        }

        /*
         * Determine the max emulated nid and the default phys nid to use
         * for unmapped nodes.
         */
        max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);

        /* commit */
        *numa_meminfo = ei;

        /* Make sure numa_nodes_parsed only contains emulated nodes */
        nodes_clear(numa_nodes_parsed);
        for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
                if (ei.blk[i].start != ei.blk[i].end &&
                    ei.blk[i].nid != NUMA_NO_NODE)
                        node_set(ei.blk[i].nid, numa_nodes_parsed);

        /*
         * Transform __apicid_to_node table to use emulated nids by
         * reverse-mapping phys_nid.  The maps should always exist but fall
         * back to zero just in case.
         */
        for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
                if (__apicid_to_node[i] == NUMA_NO_NODE)
                        continue;
                for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
                        if (__apicid_to_node[i] == emu_nid_to_phys[j])
                                break;
                __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
        }

        /* make sure all emulated nodes are mapped to a physical node */
        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
                if (emu_nid_to_phys[i] == NUMA_NO_NODE)
                        emu_nid_to_phys[i] = dfl_phys_nid;

        /* transform distance table */
        numa_reset_distance();
        for (i = 0; i < max_emu_nid + 1; i++) {
                for (j = 0; j < max_emu_nid + 1; j++) {
                        int physi = emu_nid_to_phys[i];
                        int physj = emu_nid_to_phys[j];
                        int dist;

                        if (get_option(&emu_cmdline, &dist) == 2)
                                ;
                        else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
                                dist = physi == physj ?
                                        LOCAL_DISTANCE : REMOTE_DISTANCE;
                        else
                                dist = phys_dist[physi * numa_dist_cnt + physj];

                        numa_set_distance(i, j, dist);
                }
        }

        /* free the copied physical distance table */
        if (phys_dist)
                memblock_free(__pa(phys_dist), phys_size);
        return;

no_emu:
        /* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
                emu_nid_to_phys[i] = i;
}
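
/*
 * Summary of the numa=fake forms handled above (from the parsing logic;
 * the distance-list suffix is inferred from the get_option() loop and is
 * an assumption, not verified against documentation):
 *
 *	numa=fake=<N>		split system RAM into N interleaved nodes
 *	numa=fake=<N>U		split each physical node into N uniform nodes
 *	numa=fake=<size>[MG]	fake nodes of the given fixed size
 *
 * An optional ":<d>,<d>,..." tail after the node specification seeds the
 * emulated NUMA distance table.
 */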

#ifndef CONFIG_DEBUG_PER_CPU_MAPS
void numa_add_cpu(int cpu)
{
        int physnid, nid;

        nid = early_cpu_to_node(cpu);
        BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));

        physnid = emu_nid_to_phys[nid];

        /*
         * Map the cpu to each emulated node that is allocated on the physical
         * node of the cpu's apic id.
         */
        for_each_online_node(nid)
                if (emu_nid_to_phys[nid] == physnid)
                        cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}

void numa_remove_cpu(int cpu)
{
        int i;

        for_each_online_node(i)
                cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else   /* !CONFIG_DEBUG_PER_CPU_MAPS */
static void numa_set_cpumask(int cpu, bool enable)
{
        int nid, physnid;

        nid = early_cpu_to_node(cpu);
        if (nid == NUMA_NO_NODE) {
                /* early_cpu_to_node() already emits a warning and trace */
                return;
        }

        physnid = emu_nid_to_phys[nid];

        for_each_online_node(nid) {
                if (emu_nid_to_phys[nid] != physnid)
                        continue;

                debug_cpumask_set_cpu(cpu, nid, enable);
        }
}

void numa_add_cpu(int cpu)
{
        numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(int cpu)
{
        numa_set_cpumask(cpu, false);
}
#endif  /* !CONFIG_DEBUG_PER_CPU_MAPS */
