drivers/base/memory.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Memory subsystem support
   4  *
   5  * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
   6  *            Dave Hansen <haveblue@us.ibm.com>
   7  *
   8  * This file provides the necessary infrastructure to represent
   9  * a SPARSEMEM-memory-model system's physical memory in /sysfs.
  10  * All arch-independent code that assumes MEMORY_HOTPLUG requires
  11  * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
  12  */
  13
  14 #include <linux/module.h>
  15 #include <linux/init.h>
  16 #include <linux/topology.h>
  17 #include <linux/capability.h>
  18 #include <linux/device.h>
  19 #include <linux/memory.h>
  20 #include <linux/memory_hotplug.h>
  21 #include <linux/mm.h>
  22 #include <linux/stat.h>
  23 #include <linux/slab.h>
  24 #include <linux/xarray.h>
  25
  26 #include <linux/atomic.h>
  27 #include <linux/uaccess.h>
  28
  29 #define MEMORY_CLASS_NAME       "memory"
  30
  31 static const char *const online_type_to_str[] = {
  32         [MMOP_OFFLINE] = "offline",
  33         [MMOP_ONLINE] = "online",
  34         [MMOP_ONLINE_KERNEL] = "online_kernel",
  35         [MMOP_ONLINE_MOVABLE] = "online_movable",
  36 };
  37
  38 int mhp_online_type_from_str(const char *str)
  39 {
  40         int i;
  41
  42         for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
  43                 if (sysfs_streq(str, online_type_to_str[i]))
  44                         return i;
  45         }
  46         return -EINVAL;
  47 }
  48
  49 #define to_memory_block(dev) container_of(dev, struct memory_block, dev)
  50
  51 static int sections_per_block;
  52
  53 static inline unsigned long memory_block_id(unsigned long section_nr)
  54 {
  55         return section_nr / sections_per_block;
  56 }
  57
  58 static inline unsigned long pfn_to_block_id(unsigned long pfn)
  59 {
  60         return memory_block_id(pfn_to_section_nr(pfn));
  61 }
  62
  63 static inline unsigned long phys_to_block_id(unsigned long phys)
  64 {
  65         return pfn_to_block_id(PFN_DOWN(phys));
  66 }
  67
  68 static int memory_subsys_online(struct device *dev);
  69 static int memory_subsys_offline(struct device *dev);
  70
  71 static struct bus_type memory_subsys = {
  72         .name = MEMORY_CLASS_NAME,
  73         .dev_name = MEMORY_CLASS_NAME,
  74         .online = memory_subsys_online,
  75         .offline = memory_subsys_offline,
  76 };
  77
  78 /*
  79  * Memory blocks are cached in a local radix tree to avoid
  80  * a costly linear search for the corresponding device on
  81  * the subsystem bus.
  82  */
  83 static DEFINE_XARRAY(memory_blocks);
  84
  85 /*
  86  * Memory groups, indexed by memory group id (mgid).
  87  */
  88 static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
  89 #define MEMORY_GROUP_MARK_DYNAMIC       XA_MARK_1
  90
  91 static BLOCKING_NOTIFIER_HEAD(memory_chain);
  92
  93 int register_memory_notifier(struct notifier_block *nb)
  94 {
  95         return blocking_notifier_chain_register(&memory_chain, nb);
  96 }
  97 EXPORT_SYMBOL(register_memory_notifier);
  98
  99 void unregister_memory_notifier(struct notifier_block *nb)
 100 {
 101         blocking_notifier_chain_unregister(&memory_chain, nb);
 102 }
 103 EXPORT_SYMBOL(unregister_memory_notifier);
 104
 105 static void memory_block_release(struct device *dev)
 106 {
 107         struct memory_block *mem = to_memory_block(dev);
 108
 109         kfree(mem);
 110 }
 111
 112 unsigned long __weak memory_block_size_bytes(void)
 113 {
 114         return MIN_MEMORY_BLOCK_SIZE;
 115 }
 116 EXPORT_SYMBOL_GPL(memory_block_size_bytes);
 117
 118 /*
 119  * Show the first physical section index (number) of this memory block.
 120  */
 121 static ssize_t phys_index_show(struct device *dev,
 122                                struct device_attribute *attr, char *buf)
 123 {
 124         struct memory_block *mem = to_memory_block(dev);
 125         unsigned long phys_index;
 126
 127         phys_index = mem->start_section_nr / sections_per_block;
 128
 129         return sysfs_emit(buf, "%08lx\n", phys_index);
 130 }
 131
 132 /*
 133  * Legacy interface that we cannot remove. Always indicate "removable"
 134  * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 135  */
 136 static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
 137                               char *buf)
 138 {
 139         return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
 140 }
 141
 142 /*
 143  * online, offline, going offline, etc.
 144  */
 145 static ssize_t state_show(struct device *dev, struct device_attribute *attr,
 146                           char *buf)
 147 {
 148         struct memory_block *mem = to_memory_block(dev);
 149         const char *output;
 150
 151         /*
 152          * We can probably put these states in a nice little array
 153          * so that they're not open-coded
 154          */
 155         switch (mem->state) {
 156         case MEM_ONLINE:
 157                 output = "online";
 158                 break;
 159         case MEM_OFFLINE:
 160                 output = "offline";
 161                 break;
 162         case MEM_GOING_OFFLINE:
 163                 output = "going-offline";
 164                 break;
 165         default:
 166                 WARN_ON(1);
 167                 return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
 168         }
 169
 170         return sysfs_emit(buf, "%s\n", output);
 171 }
 172
 173 int memory_notify(unsigned long val, void *v)
 174 {
 175         return blocking_notifier_call_chain(&memory_chain, val, v);
 176 }
 177
 178 #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
 179 static unsigned long memblk_nr_poison(struct memory_block *mem);
 180 #else
 181 static inline unsigned long memblk_nr_poison(struct memory_block *mem)
 182 {
 183         return 0;
 184 }
 185 #endif
 186
 187 static int memory_block_online(struct memory_block *mem)
 188 {
 189         unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 190         unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 191         unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
 192         struct zone *zone;
 193         int ret;
 194
 195         if (memblk_nr_poison(mem))
 196                 return -EHWPOISON;
 197
 198         zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
 199                                   start_pfn, nr_pages);
 200
 201         /*
 202          * Although vmemmap pages have a different lifecycle than the pages
 203          * they describe (they remain until the memory is unplugged), doing
 204          * their initialization and accounting at memory onlining/offlining
 205          * stage helps to keep accounting easier to follow - e.g vmemmaps
 206          * belong to the same zone as the memory they backed.
 207          */
 208         if (nr_vmemmap_pages) {
 209                 ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
 210                 if (ret)
 211                         return ret;
 212         }
 213
 214         ret = online_pages(start_pfn + nr_vmemmap_pages,
 215                            nr_pages - nr_vmemmap_pages, zone, mem->group);
 216         if (ret) {
 217                 if (nr_vmemmap_pages)
 218                         mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
 219                 return ret;
 220         }
 221
 222         /*
 223          * Account once onlining succeeded. If the zone was unpopulated, it is
 224          * now already properly populated.
 225          */
 226         if (nr_vmemmap_pages)
 227                 adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
 228                                           nr_vmemmap_pages);
 229
 230         mem->zone = zone;
 231         return ret;
 232 }
 233
 234 static int memory_block_offline(struct memory_block *mem)
 235 {
 236         unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 237         unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 238         unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
 239         int ret;
 240
 241         if (!mem->zone)
 242                 return -EINVAL;
 243
 244         /*
 245          * Unaccount before offlining, such that unpopulated zone and kthreads
 246          * can properly be torn down in offline_pages().
 247          */
 248         if (nr_vmemmap_pages)
 249                 adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
 250                                           -nr_vmemmap_pages);
 251
 252         ret = offline_pages(start_pfn + nr_vmemmap_pages,
 253                             nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
 254         if (ret) {
 255                 /* offline_pages() failed. Account back. */
 256                 if (nr_vmemmap_pages)
 257                         adjust_present_page_count(pfn_to_page(start_pfn),
 258                                                   mem->group, nr_vmemmap_pages);
 259                 return ret;
 260         }
 261
 262         if (nr_vmemmap_pages)
 263                 mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
 264
 265         mem->zone = NULL;
 266         return ret;
 267 }
 268
 269 /*
 270  * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 271  * OK to have direct references to sparsemem variables in here.
 272  */
 273 static int
 274 memory_block_action(struct memory_block *mem, unsigned long action)
 275 {
 276         int ret;
 277
 278         switch (action) {
 279         case MEM_ONLINE:
 280                 ret = memory_block_online(mem);
 281                 break;
 282         case MEM_OFFLINE:
 283                 ret = memory_block_offline(mem);
 284                 break;
 285         default:
 286                 WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
 287                      "%ld\n", __func__, mem->start_section_nr, action, action);
 288                 ret = -EINVAL;
 289         }
 290
 291         return ret;
 292 }
 293
 294 static int memory_block_change_state(struct memory_block *mem,
 295                 unsigned long to_state, unsigned long from_state_req)
 296 {
 297         int ret = 0;
 298
 299         if (mem->state != from_state_req)
 300                 return -EINVAL;
 301
 302         if (to_state == MEM_OFFLINE)
 303                 mem->state = MEM_GOING_OFFLINE;
 304
 305         ret = memory_block_action(mem, to_state);
 306         mem->state = ret ? from_state_req : to_state;
 307
 308         return ret;
 309 }
 310
 311 /* The device lock serializes operations on memory_subsys_[online|offline] */
 312 static int memory_subsys_online(struct device *dev)
 313 {
 314         struct memory_block *mem = to_memory_block(dev);
 315         int ret;
 316
 317         if (mem->state == MEM_ONLINE)
 318                 return 0;
 319
 320         /*
 321          * When called via device_online() without configuring the online_type,
 322          * we want to default to MMOP_ONLINE.
 323          */
 324         if (mem->online_type == MMOP_OFFLINE)
 325                 mem->online_type = MMOP_ONLINE;
 326
 327         ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
 328         mem->online_type = MMOP_OFFLINE;
 329
 330         return ret;
 331 }
 332
 333 static int memory_subsys_offline(struct device *dev)
 334 {
 335         struct memory_block *mem = to_memory_block(dev);
 336
 337         if (mem->state == MEM_OFFLINE)
 338                 return 0;
 339
 340         return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
 341 }
 342
 343 static ssize_t state_store(struct device *dev, struct device_attribute *attr,
 344                            const char *buf, size_t count)
 345 {
 346         const int online_type = mhp_online_type_from_str(buf);
 347         struct memory_block *mem = to_memory_block(dev);
 348         int ret;
 349
 350         if (online_type < 0)
 351                 return -EINVAL;
 352
 353         ret = lock_device_hotplug_sysfs();
 354         if (ret)
 355                 return ret;
 356
 357         switch (online_type) {
 358         case MMOP_ONLINE_KERNEL:
 359         case MMOP_ONLINE_MOVABLE:
 360         case MMOP_ONLINE:
 361                 /* mem->online_type is protected by device_hotplug_lock */
 362                 mem->online_type = online_type;
 363                 ret = device_online(&mem->dev);
 364                 break;
 365         case MMOP_OFFLINE:
 366                 ret = device_offline(&mem->dev);
 367                 break;
 368         default:
 369                 ret = -EINVAL; /* should never happen */
 370         }
 371
 372         unlock_device_hotplug();
 373
 374         if (ret < 0)
 375                 return ret;
 376         if (ret)
 377                 return -EINVAL;
 378
 379         return count;
 380 }
 381
 382 /*
 383  * Legacy interface that we cannot remove: s390x exposes the storage increment
 384  * covered by a memory block, allowing for identifying which memory blocks
 385  * comprise a storage increment. Since a memory block spans complete
 386  * storage increments nowadays, this interface is basically unused. Other
 387  * archs never exposed != 0.
 388  */
 389 static ssize_t phys_device_show(struct device *dev,
 390                                 struct device_attribute *attr, char *buf)
 391 {
 392         struct memory_block *mem = to_memory_block(dev);
 393         unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 394
 395         return sysfs_emit(buf, "%d\n",
 396                           arch_get_memory_phys_device(start_pfn));
 397 }
 398
 399 #ifdef CONFIG_MEMORY_HOTREMOVE
 400 static int print_allowed_zone(char *buf, int len, int nid,
 401                               struct memory_group *group,
 402                               unsigned long start_pfn, unsigned long nr_pages,
 403                               int online_type, struct zone *default_zone)
 404 {
 405         struct zone *zone;
 406
 407         zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
 408         if (zone == default_zone)
 409                 return 0;
 410
 411         return sysfs_emit_at(buf, len, " %s", zone->name);
 412 }
 413
 414 static ssize_t valid_zones_show(struct device *dev,
 415                                 struct device_attribute *attr, char *buf)
 416 {
 417         struct memory_block *mem = to_memory_block(dev);
 418         unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 419         unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 420         struct memory_group *group = mem->group;
 421         struct zone *default_zone;
 422         int nid = mem->nid;
 423         int len = 0;
 424
 425         /*
 426          * Check the existing zone. Make sure that we do that only on the
 427          * online nodes otherwise the page_zone is not reliable
 428          */
 429         if (mem->state == MEM_ONLINE) {
 430                 /*
 431                  * If !mem->zone, the memory block spans multiple zones and
 432                  * cannot get offlined.
 433                  */
 434                 default_zone = mem->zone;
 435                 if (!default_zone)
 436                         return sysfs_emit(buf, "%s\n", "none");
 437                 len += sysfs_emit_at(buf, len, "%s", default_zone->name);
 438                 goto out;
 439         }
 440
 441         default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
 442                                           start_pfn, nr_pages);
 443
 444         len += sysfs_emit_at(buf, len, "%s", default_zone->name);
 445         len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
 446                                   MMOP_ONLINE_KERNEL, default_zone);
 447         len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
 448                                   MMOP_ONLINE_MOVABLE, default_zone);
 449 out:
 450         len += sysfs_emit_at(buf, len, "\n");
 451         return len;
 452 }
 453 static DEVICE_ATTR_RO(valid_zones);
 454 #endif
 455
 456 static DEVICE_ATTR_RO(phys_index);
 457 static DEVICE_ATTR_RW(state);
 458 static DEVICE_ATTR_RO(phys_device);
 459 static DEVICE_ATTR_RO(removable);
 460
 461 /*
 462  * Show the memory block size (shared by all memory blocks).
 463  */
 464 static ssize_t block_size_bytes_show(struct device *dev,
 465                                      struct device_attribute *attr, char *buf)
 466 {
 467         return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
 468 }
 469
 470 static DEVICE_ATTR_RO(block_size_bytes);
 471
 472 /*
 473  * Memory auto online policy.
 474  */
 475
 476 static ssize_t auto_online_blocks_show(struct device *dev,
 477                                        struct device_attribute *attr, char *buf)
 478 {
 479         return sysfs_emit(buf, "%s\n",
 480                           online_type_to_str[mhp_default_online_type]);
 481 }
 482
 483 static ssize_t auto_online_blocks_store(struct device *dev,
 484                                         struct device_attribute *attr,
 485                                         const char *buf, size_t count)
 486 {
 487         const int online_type = mhp_online_type_from_str(buf);
 488
 489         if (online_type < 0)
 490                 return -EINVAL;
 491
 492         mhp_default_online_type = online_type;
 493         return count;
 494 }
 495
 496 static DEVICE_ATTR_RW(auto_online_blocks);
 497
 498 /*
 499  * Some architectures will have custom drivers to do this, and
 500  * will not need to do it from userspace.  The fake hot-add code
 501  * as well as ppc64 will do all of their discovery in userspace
 502  * and will require this interface.
 503  */
 504 #ifdef CONFIG_ARCH_MEMORY_PROBE
 505 static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
 506                            const char *buf, size_t count)
 507 {
 508         u64 phys_addr;
 509         int nid, ret;
 510         unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
 511
 512         ret = kstrtoull(buf, 0, &phys_addr);
 513         if (ret)
 514                 return ret;
 515
 516         if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
 517                 return -EINVAL;
 518
 519         ret = lock_device_hotplug_sysfs();
 520         if (ret)
 521                 return ret;
 522
 523         nid = memory_add_physaddr_to_nid(phys_addr);
 524         ret = __add_memory(nid, phys_addr,
 525                            MIN_MEMORY_BLOCK_SIZE * sections_per_block,
 526                            MHP_NONE);
 527
 528         if (ret)
 529                 goto out;
 530
 531         ret = count;
 532 out:
 533         unlock_device_hotplug();
 534         return ret;
 535 }
 536
 537 static DEVICE_ATTR_WO(probe);
 538 #endif
 539
 540 #ifdef CONFIG_MEMORY_FAILURE
 541 /*
 542  * Support for offlining pages of memory
 543  */
 544
 545 /* Soft offline a page */
 546 static ssize_t soft_offline_page_store(struct device *dev,
 547                                        struct device_attribute *attr,
 548                                        const char *buf, size_t count)
 549 {
 550         int ret;
 551         u64 pfn;
 552         if (!capable(CAP_SYS_ADMIN))
 553                 return -EPERM;
 554         if (kstrtoull(buf, 0, &pfn) < 0)
 555                 return -EINVAL;
 556         pfn >>= PAGE_SHIFT;
 557         ret = soft_offline_page(pfn, 0);
 558         return ret == 0 ? count : ret;
 559 }
 560
 561 /* Forcibly offline a page, including killing processes. */
 562 static ssize_t hard_offline_page_store(struct device *dev,
 563                                        struct device_attribute *attr,
 564                                        const char *buf, size_t count)
 565 {
 566         int ret;
 567         u64 pfn;
 568         if (!capable(CAP_SYS_ADMIN))
 569                 return -EPERM;
 570         if (kstrtoull(buf, 0, &pfn) < 0)
 571                 return -EINVAL;
 572         pfn >>= PAGE_SHIFT;
 573         ret = memory_failure(pfn, MF_SW_SIMULATED);
 574         if (ret == -EOPNOTSUPP)
 575                 ret = 0;
 576         return ret ? ret : count;
 577 }
 578
 579 static DEVICE_ATTR_WO(soft_offline_page);
 580 static DEVICE_ATTR_WO(hard_offline_page);
 581 #endif
 582
 583 /* See phys_device_show(). */
 584 int __weak arch_get_memory_phys_device(unsigned long start_pfn)
 585 {
 586         return 0;
 587 }
 588
 589 /*
 590  * A reference for the returned memory block device is acquired.
 591  *
 592  * Called under device_hotplug_lock.
 593  */
 594 static struct memory_block *find_memory_block_by_id(unsigned long block_id)
 595 {
 596         struct memory_block *mem;
 597
 598         mem = xa_load(&memory_blocks, block_id);
 599         if (mem)
 600                 get_device(&mem->dev);
 601         return mem;
 602 }
 603
 604 /*
 605  * Called under device_hotplug_lock.
 606  */
 607 struct memory_block *find_memory_block(unsigned long section_nr)
 608 {
 609         unsigned long block_id = memory_block_id(section_nr);
 610
 611         return find_memory_block_by_id(block_id);
 612 }
 613
 614 static struct attribute *memory_memblk_attrs[] = {
 615         &dev_attr_phys_index.attr,
 616         &dev_attr_state.attr,
 617         &dev_attr_phys_device.attr,
 618         &dev_attr_removable.attr,
 619 #ifdef CONFIG_MEMORY_HOTREMOVE
 620         &dev_attr_valid_zones.attr,
 621 #endif
 622         NULL
 623 };
 624
 625 static const struct attribute_group memory_memblk_attr_group = {
 626         .attrs = memory_memblk_attrs,
 627 };
 628
 629 static const struct attribute_group *memory_memblk_attr_groups[] = {
 630         &memory_memblk_attr_group,
 631         NULL,
 632 };
 633
 634 static int __add_memory_block(struct memory_block *memory)
 635 {
 636         int ret;
 637
 638         memory->dev.bus = &memory_subsys;
 639         memory->dev.id = memory->start_section_nr / sections_per_block;
 640         memory->dev.release = memory_block_release;
 641         memory->dev.groups = memory_memblk_attr_groups;
 642         memory->dev.offline = memory->state == MEM_OFFLINE;
 643
 644         ret = device_register(&memory->dev);
 645         if (ret) {
 646                 put_device(&memory->dev);
 647                 return ret;
 648         }
 649         ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
 650                               GFP_KERNEL));
 651         if (ret)
 652                 device_unregister(&memory->dev);
 653
 654         return ret;
 655 }
 656
 657 static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
 658                                                      int nid)
 659 {
 660         const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 661         const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
 662         struct zone *zone, *matching_zone = NULL;
 663         pg_data_t *pgdat = NODE_DATA(nid);
 664         int i;
 665
 666         /*
 667          * This logic only works for early memory, when the applicable zones
 668          * already span the memory block. We don't expect overlapping zones on
 669          * a single node for early memory. So if we're told that some PFNs
 670          * of a node fall into this memory block, we can assume that all node
 671          * zones that intersect with the memory block are actually applicable.
 672          * No need to look at the memmap.
 673          */
 674         for (i = 0; i < MAX_NR_ZONES; i++) {
 675                 zone = pgdat->node_zones + i;
 676                 if (!populated_zone(zone))
 677                         continue;
 678                 if (!zone_intersects(zone, start_pfn, nr_pages))
 679                         continue;
 680                 if (!matching_zone) {
 681                         matching_zone = zone;
 682                         continue;
 683                 }
 684                 /* Spans multiple zones ... */
 685                 matching_zone = NULL;
 686                 break;
 687         }
 688         return matching_zone;
 689 }
 690
 691 #ifdef CONFIG_NUMA
 692 /**
 693  * memory_block_add_nid() - Indicate that system RAM falling into this memory
 694  *                          block device (partially) belongs to the given node.
 695  * @mem: The memory block device.
 696  * @nid: The node id.
 697  * @context: The memory initialization context.
 698  *
 699  * Indicate that system RAM falling into this memory block (partially) belongs
 700  * to the given node. If the context indicates ("early") that we are adding the
 701  * node during node device subsystem initialization, this will also properly
 702  * set/adjust mem->zone based on the zone ranges of the given node.
 703  */
 704 void memory_block_add_nid(struct memory_block *mem, int nid,
 705                           enum meminit_context context)
 706 {
 707         if (context == MEMINIT_EARLY && mem->nid != nid) {
 708                 /*
 709                  * For early memory we have to determine the zone when setting
 710                  * the node id and handle multiple nodes spanning a single
 711                  * memory block by indicate via zone == NULL that we're not
 712                  * dealing with a single zone. So if we're setting the node id
 713                  * the first time, determine if there is a single zone. If we're
 714                  * setting the node id a second time to a different node,
 715                  * invalidate the single detected zone.
 716                  */
 717                 if (mem->nid == NUMA_NO_NODE)
 718                         mem->zone = early_node_zone_for_memory_block(mem, nid);
 719                 else
 720                         mem->zone = NULL;
 721         }
 722
 723         /*
 724          * If this memory block spans multiple nodes, we only indicate
 725          * the last processed node. If we span multiple nodes (not applicable
 726          * to hotplugged memory), zone == NULL will prohibit memory offlining
 727          * and consequently unplug.
 728          */
 729         mem->nid = nid;
 730 }
 731 #endif
 732
 733 static int add_memory_block(unsigned long block_id, unsigned long state,
 734                             unsigned long nr_vmemmap_pages,
 735                             struct memory_group *group)
 736 {
 737         struct memory_block *mem;
 738         int ret = 0;
 739
 740         mem = find_memory_block_by_id(block_id);
 741         if (mem) {
 742                 put_device(&mem->dev);
 743                 return -EEXIST;
 744         }
 745         mem = kzalloc(sizeof(*mem), GFP_KERNEL);
 746         if (!mem)
 747                 return -ENOMEM;
 748
 749         mem->start_section_nr = block_id * sections_per_block;
 750         mem->state = state;
 751         mem->nid = NUMA_NO_NODE;
 752         mem->nr_vmemmap_pages = nr_vmemmap_pages;
 753         INIT_LIST_HEAD(&mem->group_next);
 754
 755 #ifndef CONFIG_NUMA
 756         if (state == MEM_ONLINE)
 757                 /*
 758                  * MEM_ONLINE at this point implies early memory. With NUMA,
 759                  * we'll determine the zone when setting the node id via
 760                  * memory_block_add_nid(). Memory hotplug updated the zone
 761                  * manually when memory onlining/offlining succeeds.
 762                  */
 763                 mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
 764 #endif /* CONFIG_NUMA */
 765
 766         ret = __add_memory_block(mem);
 767         if (ret)
 768                 return ret;
 769
 770         if (group) {
 771                 mem->group = group;
 772                 list_add(&mem->group_next, &group->memory_blocks);
 773         }
 774
 775         return 0;
 776 }
 777
 778 static int __init add_boot_memory_block(unsigned long base_section_nr)
 779 {
 780         int section_count = 0;
 781         unsigned long nr;
 782
 783         for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
 784              nr++)
 785                 if (present_section_nr(nr))
 786                         section_count++;
 787
 788         if (section_count == 0)
 789                 return 0;
 790         return add_memory_block(memory_block_id(base_section_nr),
 791                                 MEM_ONLINE, 0,  NULL);
 792 }
 793
 794 static int add_hotplug_memory_block(unsigned long block_id,
 795                                     unsigned long nr_vmemmap_pages,
 796                                     struct memory_group *group)
 797 {
 798         return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
 799 }
 800
 801 static void remove_memory_block(struct memory_block *memory)
 802 {
 803         if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
 804                 return;
 805
 806         WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);
 807
 808         if (memory->group) {
 809                 list_del(&memory->group_next);
 810                 memory->group = NULL;
 811         }
 812
 813         /* drop the ref. we got via find_memory_block() */
 814         put_device(&memory->dev);
 815         device_unregister(&memory->dev);
 816 }
 817
 818 /*
 819  * Create memory block devices for the given memory area. Start and size
 820  * have to be aligned to memory block granularity. Memory block devices
 821  * will be initialized as offline.
 822  *
 823  * Called under device_hotplug_lock.
 824  */
 825 int create_memory_block_devices(unsigned long start, unsigned long size,
 826                                 unsigned long vmemmap_pages,
 827                                 struct memory_group *group)
 828 {
 829         const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
 830         unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
 831         struct memory_block *mem;
 832         unsigned long block_id;
 833         int ret = 0;
 834
 835         if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
 836                          !IS_ALIGNED(size, memory_block_size_bytes())))
 837                 return -EINVAL;
 838
 839         for (block_id = start_block_id; block_id != end_block_id; block_id++) {
 840                 ret = add_hotplug_memory_block(block_id, vmemmap_pages, group);
 841                 if (ret)
 842                         break;
 843         }
 844         if (ret) {
 845                 end_block_id = block_id;
 846                 for (block_id = start_block_id; block_id != end_block_id;
 847                      block_id++) {
 848                         mem = find_memory_block_by_id(block_id);
 849                         if (WARN_ON_ONCE(!mem))
 850                                 continue;
 851                         remove_memory_block(mem);
 852                 }
 853         }
 854         return ret;
 855 }
 856
 857 /*
 858  * Remove memory block devices for the given memory area. Start and size
 859  * have to be aligned to memory block granularity. Memory block devices
 860  * have to be offline.
 861  *
 862  * Called under device_hotplug_lock.
 863  */
 864 void remove_memory_block_devices(unsigned long start, unsigned long size)
 865 {
 866         const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
 867         const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
 868         struct memory_block *mem;
 869         unsigned long block_id;
 870
 871         if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
 872                          !IS_ALIGNED(size, memory_block_size_bytes())))
 873                 return;
 874
 875         for (block_id = start_block_id; block_id != end_block_id; block_id++) {
 876                 mem = find_memory_block_by_id(block_id);
 877                 if (WARN_ON_ONCE(!mem))
 878                         continue;
 879                 num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
 880                 unregister_memory_block_under_nodes(mem);
 881                 remove_memory_block(mem);
 882         }
 883 }
 884
 885 static struct attribute *memory_root_attrs[] = {
 886 #ifdef CONFIG_ARCH_MEMORY_PROBE
 887         &dev_attr_probe.attr,
 888 #endif
 889
 890 #ifdef CONFIG_MEMORY_FAILURE
 891         &dev_attr_soft_offline_page.attr,
 892         &dev_attr_hard_offline_page.attr,
 893 #endif
 894
 895         &dev_attr_block_size_bytes.attr,
 896         &dev_attr_auto_online_blocks.attr,
 897         NULL
 898 };
 899
 900 static const struct attribute_group memory_root_attr_group = {
 901         .attrs = memory_root_attrs,
 902 };
 903
 904 static const struct attribute_group *memory_root_attr_groups[] = {
 905         &memory_root_attr_group,
 906         NULL,
 907 };
 908
 909 /*
 910  * Initialize the sysfs support for memory devices. At the time this function
 911  * is called, we cannot have concurrent creation/deletion of memory block
 912  * devices, the device_hotplug_lock is not needed.
 913  */
 914 void __init memory_dev_init(void)
 915 {
 916         int ret;
 917         unsigned long block_sz, nr;
 918
 919         /* Validate the configured memory block size */
 920         block_sz = memory_block_size_bytes();
 921         if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
 922                 panic("Memory block size not suitable: 0x%lx\n", block_sz);
 923         sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
 924
 925         ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
 926         if (ret)
 927                 panic("%s() failed to register subsystem: %d\n", __func__, ret);
 928
 929         /*
 930          * Create entries for memory sections that were found
 931          * during boot and have been initialized
 932          */
 933         for (nr = 0; nr <= __highest_present_section_nr;
 934              nr += sections_per_block) {
 935                 ret = add_boot_memory_block(nr);
 936                 if (ret)
 937                         panic("%s() failed to add memory block: %d\n", __func__,
 938                               ret);
 939         }
 940 }
 941
 942 /**
 943  * walk_memory_blocks - walk through all present memory blocks overlapped
 944  *                      by the range [start, start + size)
 945  *
 946  * @start: start address of the memory range
 947  * @size: size of the memory range
 948  * @arg: argument passed to func
 949  * @func: callback for each memory section walked
 950  *
 951  * This function walks through all present memory blocks overlapped by the
 952  * range [start, start + size), calling func on each memory block.
 953  *
 954  * In case func() returns an error, walking is aborted and the error is
 955  * returned.
 956  *
 957  * Called under device_hotplug_lock.
 958  */
 959 int walk_memory_blocks(unsigned long start, unsigned long size,
 960                        void *arg, walk_memory_blocks_func_t func)
 961 {
 962         const unsigned long start_block_id = phys_to_block_id(start);
 963         const unsigned long end_block_id = phys_to_block_id(start + size - 1);
 964         struct memory_block *mem;
 965         unsigned long block_id;
 966         int ret = 0;
 967
 968         if (!size)
 969                 return 0;
 970
 971         for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
 972                 mem = find_memory_block_by_id(block_id);
 973                 if (!mem)
 974                         continue;
 975
 976                 ret = func(mem, arg);
 977                 put_device(&mem->dev);
 978                 if (ret)
 979                         break;
 980         }
 981         return ret;
 982 }
 983
 984 struct for_each_memory_block_cb_data {
 985         walk_memory_blocks_func_t func;
 986         void *arg;
 987 };
 988
 989 static int for_each_memory_block_cb(struct device *dev, void *data)
 990 {
 991         struct memory_block *mem = to_memory_block(dev);
 992         struct for_each_memory_block_cb_data *cb_data = data;
 993
 994         return cb_data->func(mem, cb_data->arg);
 995 }
 996
 997 /**
 998  * for_each_memory_block - walk through all present memory blocks
 999  *
1000  * @arg: argument passed to func
1001  * @func: callback for each memory block walked
1002  *
1003  * This function walks through all present memory blocks, calling func on
1004  * each memory block.
1005  *
1006  * In case func() returns an error, walking is aborted and the error is
1007  * returned.
1008  */
1009 int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
1010 {
1011         struct for_each_memory_block_cb_data cb_data = {
1012                 .func = func,
1013                 .arg = arg,
1014         };
1015
1016         return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
1017                                 for_each_memory_block_cb);
1018 }
1019
1020 /*
1021  * This is an internal helper to unify allocation and initialization of
1022  * memory groups. Note that the passed memory group will be copied to a
1023  * dynamically allocated memory group. After this call, the passed
1024  * memory group should no longer be used.
1025  */
1026 static int memory_group_register(struct memory_group group)
1027 {
1028         struct memory_group *new_group;
1029         uint32_t mgid;
1030         int ret;
1031
1032         if (!node_possible(group.nid))
1033                 return -EINVAL;
1034
1035         new_group = kzalloc(sizeof(group), GFP_KERNEL);
1036         if (!new_group)
1037                 return -ENOMEM;
1038         *new_group = group;
1039         INIT_LIST_HEAD(&new_group->memory_blocks);
1040
1041         ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
1042                        GFP_KERNEL);
1043         if (ret) {
1044                 kfree(new_group);
1045                 return ret;
1046         } else if (group.is_dynamic) {
1047                 xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
1048         }
1049         return mgid;
1050 }
1051
1052 /**
1053  * memory_group_register_static() - Register a static memory group.
1054  * @nid: The node id.
1055  * @max_pages: The maximum number of pages we'll have in this static memory
1056  *             group.
1057  *
1058  * Register a new static memory group and return the memory group id.
1059  * All memory in the group belongs to a single unit, such as a DIMM. All
1060  * memory belonging to a static memory group is added in one go to be removed
1061  * in one go -- it's static.
1062  *
1063  * Returns an error if out of memory, if the node id is invalid, if no new
1064  * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
1065  * returns the new memory group id.
1066  */
1067 int memory_group_register_static(int nid, unsigned long max_pages)
1068 {
1069         struct memory_group group = {
1070                 .nid = nid,
1071                 .s = {
1072                         .max_pages = max_pages,
1073                 },
1074         };
1075
1076         if (!max_pages)
1077                 return -EINVAL;
1078         return memory_group_register(group);
1079 }
1080 EXPORT_SYMBOL_GPL(memory_group_register_static);
1081
1082 /**
1083  * memory_group_register_dynamic() - Register a dynamic memory group.
1084  * @nid: The node id.
1085  * @unit_pages: Unit in pages in which is memory added/removed in this dynamic
1086  *              memory group.
1087  *
1088  * Register a new dynamic memory group and return the memory group id.
1089  * Memory within a dynamic memory group is added/removed dynamically
1090  * in unit_pages.
1091  *
1092  * Returns an error if out of memory, if the node id is invalid, if no new
1093  * memory groups can be registered, or if unit_pages is invalid (0, not a
1094  * power of two, smaller than a single memory block). Otherwise, returns the
1095  * new memory group id.
1096  */
1097 int memory_group_register_dynamic(int nid, unsigned long unit_pages)
1098 {
1099         struct memory_group group = {
1100                 .nid = nid,
1101                 .is_dynamic = true,
1102                 .d = {
1103                         .unit_pages = unit_pages,
1104                 },
1105         };
1106
1107         if (!unit_pages || !is_power_of_2(unit_pages) ||
1108             unit_pages < PHYS_PFN(memory_block_size_bytes()))
1109                 return -EINVAL;
1110         return memory_group_register(group);
1111 }
1112 EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
1113
1114 /**
1115  * memory_group_unregister() - Unregister a memory group.
1116  * @mgid: the memory group id
1117  *
1118  * Unregister a memory group. If any memory block still belongs to this
1119  * memory group, unregistering will fail.
1120  *
1121  * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
1122  * memory blocks still belong to this memory group and returns 0 if
1123  * unregistering succeeded.
1124  */
1125 int memory_group_unregister(int mgid)
1126 {
1127         struct memory_group *group;
1128
1129         if (mgid < 0)
1130                 return -EINVAL;
1131
1132         group = xa_load(&memory_groups, mgid);
1133         if (!group)
1134                 return -EINVAL;
1135         if (!list_empty(&group->memory_blocks))
1136                 return -EBUSY;
1137         xa_erase(&memory_groups, mgid);
1138         kfree(group);
1139         return 0;
1140 }
1141 EXPORT_SYMBOL_GPL(memory_group_unregister);
1142
1143 /*
1144  * This is an internal helper only to be used in core memory hotplug code to
1145  * lookup a memory group. We don't care about locking, as we don't expect a
1146  * memory group to get unregistered while adding memory to it -- because
1147  * the group and the memory is managed by the same driver.
1148  */
1149 struct memory_group *memory_group_find_by_id(int mgid)
1150 {
1151         return xa_load(&memory_groups, mgid);
1152 }
1153
1154 /*
1155  * This is an internal helper only to be used in core memory hotplug code to
1156  * walk all dynamic memory groups excluding a given memory group, either
1157  * belonging to a specific node, or belonging to any node.
1158  */
1159 int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
1160                                struct memory_group *excluded, void *arg)
1161 {
1162         struct memory_group *group;
1163         unsigned long index;
1164         int ret = 0;
1165
1166         xa_for_each_marked(&memory_groups, index, group,
1167                            MEMORY_GROUP_MARK_DYNAMIC) {
1168                 if (group == excluded)
1169                         continue;
1170 #ifdef CONFIG_NUMA
1171                 if (nid != NUMA_NO_NODE && group->nid != nid)
1172                         continue;
1173 #endif /* CONFIG_NUMA */
1174                 ret = func(group, arg);
1175                 if (ret)
1176                         break;
1177         }
1178         return ret;
1179 }
1180
1181 #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
1182 void memblk_nr_poison_inc(unsigned long pfn)
1183 {
1184         const unsigned long block_id = pfn_to_block_id(pfn);
1185         struct memory_block *mem = find_memory_block_by_id(block_id);
1186
1187         if (mem)
1188                 atomic_long_inc(&mem->nr_hwpoison);
1189 }
1190
1191 void memblk_nr_poison_sub(unsigned long pfn, long i)
1192 {
1193         const unsigned long block_id = pfn_to_block_id(pfn);
1194         struct memory_block *mem = find_memory_block_by_id(block_id);
1195
1196         if (mem)
1197                 atomic_long_sub(i, &mem->nr_hwpoison);
1198 }
1199
1200 static unsigned long memblk_nr_poison(struct memory_block *mem)
1201 {
1202         return atomic_long_read(&mem->nr_hwpoison);
1203 }
1204 #endif