cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

memory.c (31065B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Memory subsystem support
      4 *
      5 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
      6 *            Dave Hansen <haveblue@us.ibm.com>
      7 *
      8 * This file provides the necessary infrastructure to represent
      9 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
     10 * All arch-independent code that assumes MEMORY_HOTPLUG requires
     11 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
     12 */
     13
     14#include <linux/module.h>
     15#include <linux/init.h>
     16#include <linux/topology.h>
     17#include <linux/capability.h>
     18#include <linux/device.h>
     19#include <linux/memory.h>
     20#include <linux/memory_hotplug.h>
     21#include <linux/mm.h>
     22#include <linux/stat.h>
     23#include <linux/slab.h>
     24#include <linux/xarray.h>
     25
     26#include <linux/atomic.h>
     27#include <linux/uaccess.h>
     28
     29#define MEMORY_CLASS_NAME	"memory"
     30
     31static const char *const online_type_to_str[] = {
     32	[MMOP_OFFLINE] = "offline",
     33	[MMOP_ONLINE] = "online",
     34	[MMOP_ONLINE_KERNEL] = "online_kernel",
     35	[MMOP_ONLINE_MOVABLE] = "online_movable",
     36};
     37
     38int mhp_online_type_from_str(const char *str)
     39{
     40	int i;
     41
     42	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
     43		if (sysfs_streq(str, online_type_to_str[i]))
     44			return i;
     45	}
     46	return -EINVAL;
     47}
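/*
 * Worked example (illustrative, not part of the original file): sysfs_streq()
 * tolerates the trailing newline userspace typically writes, so the strings
 * accepted here map directly onto online_type_to_str[]:
 *
 *	mhp_online_type_from_str("offline\n")        == MMOP_OFFLINE
 *	mhp_online_type_from_str("online_movable\n") == MMOP_ONLINE_MOVABLE
 *	mhp_online_type_from_str("bogus")            == -EINVAL
 */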
     48
     49#define to_memory_block(dev) container_of(dev, struct memory_block, dev)
     50
     51static int sections_per_block;
     52
     53static inline unsigned long memory_block_id(unsigned long section_nr)
     54{
     55	return section_nr / sections_per_block;
     56}
     57
     58static inline unsigned long pfn_to_block_id(unsigned long pfn)
     59{
     60	return memory_block_id(pfn_to_section_nr(pfn));
     61}
     62
     63static inline unsigned long phys_to_block_id(unsigned long phys)
     64{
     65	return pfn_to_block_id(PFN_DOWN(phys));
     66}
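/*
 * Worked example (illustrative, assuming an x86-64-like setup with 128 MiB
 * sections and a 2 GiB memory block size, i.e. sections_per_block = 16):
 * physical address 0x180000000 (6 GiB) is PFN 0x180000, section 48 and thus
 * memory block id 48 / 16 = 3, exposed as /sys/devices/system/memory/memory3.
 */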
     67
     68static int memory_subsys_online(struct device *dev);
     69static int memory_subsys_offline(struct device *dev);
     70
     71static struct bus_type memory_subsys = {
     72	.name = MEMORY_CLASS_NAME,
     73	.dev_name = MEMORY_CLASS_NAME,
     74	.online = memory_subsys_online,
     75	.offline = memory_subsys_offline,
     76};
     77
     78/*
     79 * Memory blocks are cached in a local radix tree to avoid
     80 * a costly linear search for the corresponding device on
     81 * the subsystem bus.
     82 */
     83static DEFINE_XARRAY(memory_blocks);
     84
     85/*
     86 * Memory groups, indexed by memory group id (mgid).
     87 */
     88static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
     89#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1
     90
     91static BLOCKING_NOTIFIER_HEAD(memory_chain);
     92
     93int register_memory_notifier(struct notifier_block *nb)
     94{
     95	return blocking_notifier_chain_register(&memory_chain, nb);
     96}
     97EXPORT_SYMBOL(register_memory_notifier);
     98
     99void unregister_memory_notifier(struct notifier_block *nb)
    100{
    101	blocking_notifier_chain_unregister(&memory_chain, nb);
    102}
    103EXPORT_SYMBOL(unregister_memory_notifier);
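/*
 * Illustrative sketch (not part of the original file) of a consumer of the
 * notifier chain above: a hypothetical driver that must veto offlining of a
 * PFN range it still uses reacts to the events delivered via memory_notify().
 * example_memory_callback(), example_memory_nb and example_range_in_use()
 * are made-up names.
 */
#if 0	/* example only */
static int example_memory_callback(struct notifier_block *nb,
				   unsigned long action, void *data)
{
	struct memory_notify *arg = data;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/* arg->start_pfn / arg->nr_pages describe the affected range */
		if (example_range_in_use(arg->start_pfn, arg->nr_pages))
			return notifier_from_errno(-EBUSY);
		break;
	case MEM_ONLINE:
	case MEM_OFFLINE:
		/* update internal bookkeeping */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_memory_nb = {
	.notifier_call = example_memory_callback,
};

/* in the driver's init path: register_memory_notifier(&example_memory_nb); */
#endif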
    104
    105static void memory_block_release(struct device *dev)
    106{
    107	struct memory_block *mem = to_memory_block(dev);
    108
    109	kfree(mem);
    110}
    111
    112unsigned long __weak memory_block_size_bytes(void)
    113{
    114	return MIN_MEMORY_BLOCK_SIZE;
    115}
    116EXPORT_SYMBOL_GPL(memory_block_size_bytes);
    117
    118/*
    119 * Show the first physical section index (number) of this memory block.
    120 */
    121static ssize_t phys_index_show(struct device *dev,
    122			       struct device_attribute *attr, char *buf)
    123{
    124	struct memory_block *mem = to_memory_block(dev);
    125	unsigned long phys_index;
    126
    127	phys_index = mem->start_section_nr / sections_per_block;
    128
    129	return sysfs_emit(buf, "%08lx\n", phys_index);
    130}
    131
    132/*
    133 * Legacy interface that we cannot remove. Always indicate "removable"
    134 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
    135 */
    136static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
    137			      char *buf)
    138{
    139	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
    140}
    141
    142/*
    143 * online, offline, going offline, etc.
    144 */
    145static ssize_t state_show(struct device *dev, struct device_attribute *attr,
    146			  char *buf)
    147{
    148	struct memory_block *mem = to_memory_block(dev);
    149	const char *output;
    150
    151	/*
    152	 * We can probably put these states in a nice little array
    153	 * so that they're not open-coded
    154	 */
    155	switch (mem->state) {
    156	case MEM_ONLINE:
    157		output = "online";
    158		break;
    159	case MEM_OFFLINE:
    160		output = "offline";
    161		break;
    162	case MEM_GOING_OFFLINE:
    163		output = "going-offline";
    164		break;
    165	default:
    166		WARN_ON(1);
    167		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
    168	}
    169
    170	return sysfs_emit(buf, "%s\n", output);
    171}
    172
    173int memory_notify(unsigned long val, void *v)
    174{
    175	return blocking_notifier_call_chain(&memory_chain, val, v);
    176}
    177
    178static int memory_block_online(struct memory_block *mem)
    179{
    180	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
    181	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
    182	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
    183	struct zone *zone;
    184	int ret;
    185
    186	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
    187				  start_pfn, nr_pages);
    188
    189	/*
    190	 * Although vmemmap pages have a different lifecycle than the pages
    191	 * they describe (they remain until the memory is unplugged), doing
    192	 * their initialization and accounting at memory onlining/offlining
     193 * stage helps to keep accounting easier to follow - e.g. vmemmap pages
     194 * belong to the same zone as the memory they back.
    195	 */
    196	if (nr_vmemmap_pages) {
    197		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
    198		if (ret)
    199			return ret;
    200	}
    201
    202	ret = online_pages(start_pfn + nr_vmemmap_pages,
    203			   nr_pages - nr_vmemmap_pages, zone, mem->group);
    204	if (ret) {
    205		if (nr_vmemmap_pages)
    206			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
    207		return ret;
    208	}
    209
    210	/*
    211	 * Account once onlining succeeded. If the zone was unpopulated, it is
    212	 * now already properly populated.
    213	 */
    214	if (nr_vmemmap_pages)
    215		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
    216					  nr_vmemmap_pages);
    217
    218	mem->zone = zone;
    219	return ret;
    220}
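/*
 * Worked example of the vmemmap accounting above (illustrative, assuming
 * hypothetical values: a 2 GiB memory block, 4 KiB base pages and a 64-byte
 * struct page): the block spans 524288 pages whose memmap needs 32 MiB,
 * i.e. nr_vmemmap_pages = 8192. Those 8192 pages are initialized via
 * mhp_init_memmap_on_memory(), only the remaining 524288 - 8192 = 516096
 * pages are handed to online_pages(), and the vmemmap pages are added to the
 * zone's present pages separately via adjust_present_page_count() once
 * onlining succeeded.
 */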
    221
    222static int memory_block_offline(struct memory_block *mem)
    223{
    224	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
    225	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
    226	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
    227	int ret;
    228
    229	if (!mem->zone)
    230		return -EINVAL;
    231
    232	/*
    233	 * Unaccount before offlining, such that unpopulated zone and kthreads
    234	 * can properly be torn down in offline_pages().
    235	 */
    236	if (nr_vmemmap_pages)
    237		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
    238					  -nr_vmemmap_pages);
    239
    240	ret = offline_pages(start_pfn + nr_vmemmap_pages,
    241			    nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
    242	if (ret) {
    243		/* offline_pages() failed. Account back. */
    244		if (nr_vmemmap_pages)
    245			adjust_present_page_count(pfn_to_page(start_pfn),
    246						  mem->group, nr_vmemmap_pages);
    247		return ret;
    248	}
    249
    250	if (nr_vmemmap_pages)
    251		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
    252
    253	mem->zone = NULL;
    254	return ret;
    255}
    256
    257/*
    258 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
    259 * OK to have direct references to sparsemem variables in here.
    260 */
    261static int
    262memory_block_action(struct memory_block *mem, unsigned long action)
    263{
    264	int ret;
    265
    266	switch (action) {
    267	case MEM_ONLINE:
    268		ret = memory_block_online(mem);
    269		break;
    270	case MEM_OFFLINE:
    271		ret = memory_block_offline(mem);
    272		break;
    273	default:
    274		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
    275		     "%ld\n", __func__, mem->start_section_nr, action, action);
    276		ret = -EINVAL;
    277	}
    278
    279	return ret;
    280}
    281
    282static int memory_block_change_state(struct memory_block *mem,
    283		unsigned long to_state, unsigned long from_state_req)
    284{
    285	int ret = 0;
    286
    287	if (mem->state != from_state_req)
    288		return -EINVAL;
    289
    290	if (to_state == MEM_OFFLINE)
    291		mem->state = MEM_GOING_OFFLINE;
    292
    293	ret = memory_block_action(mem, to_state);
    294	mem->state = ret ? from_state_req : to_state;
    295
    296	return ret;
    297}
    298
    299/* The device lock serializes operations on memory_subsys_[online|offline] */
    300static int memory_subsys_online(struct device *dev)
    301{
    302	struct memory_block *mem = to_memory_block(dev);
    303	int ret;
    304
    305	if (mem->state == MEM_ONLINE)
    306		return 0;
    307
    308	/*
    309	 * When called via device_online() without configuring the online_type,
    310	 * we want to default to MMOP_ONLINE.
    311	 */
    312	if (mem->online_type == MMOP_OFFLINE)
    313		mem->online_type = MMOP_ONLINE;
    314
    315	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
    316	mem->online_type = MMOP_OFFLINE;
    317
    318	return ret;
    319}
    320
    321static int memory_subsys_offline(struct device *dev)
    322{
    323	struct memory_block *mem = to_memory_block(dev);
    324
    325	if (mem->state == MEM_OFFLINE)
    326		return 0;
    327
    328	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
    329}
    330
    331static ssize_t state_store(struct device *dev, struct device_attribute *attr,
    332			   const char *buf, size_t count)
    333{
    334	const int online_type = mhp_online_type_from_str(buf);
    335	struct memory_block *mem = to_memory_block(dev);
    336	int ret;
    337
    338	if (online_type < 0)
    339		return -EINVAL;
    340
    341	ret = lock_device_hotplug_sysfs();
    342	if (ret)
    343		return ret;
    344
    345	switch (online_type) {
    346	case MMOP_ONLINE_KERNEL:
    347	case MMOP_ONLINE_MOVABLE:
    348	case MMOP_ONLINE:
    349		/* mem->online_type is protected by device_hotplug_lock */
    350		mem->online_type = online_type;
    351		ret = device_online(&mem->dev);
    352		break;
    353	case MMOP_OFFLINE:
    354		ret = device_offline(&mem->dev);
    355		break;
    356	default:
    357		ret = -EINVAL; /* should never happen */
    358	}
    359
    360	unlock_device_hotplug();
    361
    362	if (ret < 0)
    363		return ret;
    364	if (ret)
    365		return -EINVAL;
    366
    367	return count;
    368}
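/*
 * Usage sketch for the state attribute above (illustrative): userspace
 * drives the transitions through /sys/devices/system/memory/memoryN/state,
 * e.g.
 *
 *	echo online_movable > /sys/devices/system/memory/memoryN/state
 *	echo offline        > /sys/devices/system/memory/memoryN/state
 *
 * where the accepted strings are exactly those listed in online_type_to_str[].
 */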
    369
    370/*
    371 * Legacy interface that we cannot remove: s390x exposes the storage increment
    372 * covered by a memory block, allowing for identifying which memory blocks
    373 * comprise a storage increment. Since a memory block spans complete
    374 * storage increments nowadays, this interface is basically unused. Other
    375 * archs never exposed != 0.
    376 */
    377static ssize_t phys_device_show(struct device *dev,
    378				struct device_attribute *attr, char *buf)
    379{
    380	struct memory_block *mem = to_memory_block(dev);
    381	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
    382
    383	return sysfs_emit(buf, "%d\n",
    384			  arch_get_memory_phys_device(start_pfn));
    385}
    386
    387#ifdef CONFIG_MEMORY_HOTREMOVE
    388static int print_allowed_zone(char *buf, int len, int nid,
    389			      struct memory_group *group,
    390			      unsigned long start_pfn, unsigned long nr_pages,
    391			      int online_type, struct zone *default_zone)
    392{
    393	struct zone *zone;
    394
    395	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
    396	if (zone == default_zone)
    397		return 0;
    398
    399	return sysfs_emit_at(buf, len, " %s", zone->name);
    400}
    401
    402static ssize_t valid_zones_show(struct device *dev,
    403				struct device_attribute *attr, char *buf)
    404{
    405	struct memory_block *mem = to_memory_block(dev);
    406	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
    407	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
    408	struct memory_group *group = mem->group;
    409	struct zone *default_zone;
    410	int nid = mem->nid;
    411	int len = 0;
    412
    413	/*
    414	 * Check the existing zone. Make sure that we do that only on the
     415	 * online nodes, otherwise the page_zone is not reliable.
    416	 */
    417	if (mem->state == MEM_ONLINE) {
    418		/*
    419		 * If !mem->zone, the memory block spans multiple zones and
    420		 * cannot get offlined.
    421		 */
    422		default_zone = mem->zone;
    423		if (!default_zone)
    424			return sysfs_emit(buf, "%s\n", "none");
    425		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
    426		goto out;
    427	}
    428
    429	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
    430					  start_pfn, nr_pages);
    431
    432	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
    433	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
    434				  MMOP_ONLINE_KERNEL, default_zone);
    435	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
    436				  MMOP_ONLINE_MOVABLE, default_zone);
    437out:
    438	len += sysfs_emit_at(buf, len, "\n");
    439	return len;
    440}
    441static DEVICE_ATTR_RO(valid_zones);
    442#endif
    443
    444static DEVICE_ATTR_RO(phys_index);
    445static DEVICE_ATTR_RW(state);
    446static DEVICE_ATTR_RO(phys_device);
    447static DEVICE_ATTR_RO(removable);
    448
    449/*
    450 * Show the memory block size (shared by all memory blocks).
    451 */
    452static ssize_t block_size_bytes_show(struct device *dev,
    453				     struct device_attribute *attr, char *buf)
    454{
    455	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
    456}
    457
    458static DEVICE_ATTR_RO(block_size_bytes);
    459
    460/*
    461 * Memory auto online policy.
    462 */
    463
    464static ssize_t auto_online_blocks_show(struct device *dev,
    465				       struct device_attribute *attr, char *buf)
    466{
    467	return sysfs_emit(buf, "%s\n",
    468			  online_type_to_str[mhp_default_online_type]);
    469}
    470
    471static ssize_t auto_online_blocks_store(struct device *dev,
    472					struct device_attribute *attr,
    473					const char *buf, size_t count)
    474{
    475	const int online_type = mhp_online_type_from_str(buf);
    476
    477	if (online_type < 0)
    478		return -EINVAL;
    479
    480	mhp_default_online_type = online_type;
    481	return count;
    482}
    483
    484static DEVICE_ATTR_RW(auto_online_blocks);
    485
    486/*
    487 * Some architectures will have custom drivers to do this, and
    488 * will not need to do it from userspace.  The fake hot-add code
    489 * as well as ppc64 will do all of their discovery in userspace
    490 * and will require this interface.
    491 */
    492#ifdef CONFIG_ARCH_MEMORY_PROBE
    493static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
    494			   const char *buf, size_t count)
    495{
    496	u64 phys_addr;
    497	int nid, ret;
    498	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
    499
    500	ret = kstrtoull(buf, 0, &phys_addr);
    501	if (ret)
    502		return ret;
    503
    504	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
    505		return -EINVAL;
    506
    507	ret = lock_device_hotplug_sysfs();
    508	if (ret)
    509		return ret;
    510
    511	nid = memory_add_physaddr_to_nid(phys_addr);
    512	ret = __add_memory(nid, phys_addr,
    513			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
    514			   MHP_NONE);
    515
    516	if (ret)
    517		goto out;
    518
    519	ret = count;
    520out:
    521	unlock_device_hotplug();
    522	return ret;
    523}
    524
    525static DEVICE_ATTR_WO(probe);
    526#endif
    527
    528#ifdef CONFIG_MEMORY_FAILURE
    529/*
    530 * Support for offlining pages of memory
    531 */
    532
    533/* Soft offline a page */
    534static ssize_t soft_offline_page_store(struct device *dev,
    535				       struct device_attribute *attr,
    536				       const char *buf, size_t count)
    537{
    538	int ret;
    539	u64 pfn;
    540	if (!capable(CAP_SYS_ADMIN))
    541		return -EPERM;
    542	if (kstrtoull(buf, 0, &pfn) < 0)
    543		return -EINVAL;
    544	pfn >>= PAGE_SHIFT;
    545	ret = soft_offline_page(pfn, 0);
    546	return ret == 0 ? count : ret;
    547}
    548
    549/* Forcibly offline a page, including killing processes. */
    550static ssize_t hard_offline_page_store(struct device *dev,
    551				       struct device_attribute *attr,
    552				       const char *buf, size_t count)
    553{
    554	int ret;
    555	u64 pfn;
    556	if (!capable(CAP_SYS_ADMIN))
    557		return -EPERM;
    558	if (kstrtoull(buf, 0, &pfn) < 0)
    559		return -EINVAL;
    560	pfn >>= PAGE_SHIFT;
    561	ret = memory_failure(pfn, MF_SW_SIMULATED);
    562	if (ret == -EOPNOTSUPP)
    563		ret = 0;
    564	return ret ? ret : count;
    565}
    566
    567static DEVICE_ATTR_WO(soft_offline_page);
    568static DEVICE_ATTR_WO(hard_offline_page);
    569#endif
    570
    571/* See phys_device_show(). */
    572int __weak arch_get_memory_phys_device(unsigned long start_pfn)
    573{
    574	return 0;
    575}
    576
    577/*
    578 * A reference for the returned memory block device is acquired.
    579 *
    580 * Called under device_hotplug_lock.
    581 */
    582static struct memory_block *find_memory_block_by_id(unsigned long block_id)
    583{
    584	struct memory_block *mem;
    585
    586	mem = xa_load(&memory_blocks, block_id);
    587	if (mem)
    588		get_device(&mem->dev);
    589	return mem;
    590}
    591
    592/*
    593 * Called under device_hotplug_lock.
    594 */
    595struct memory_block *find_memory_block(unsigned long section_nr)
    596{
    597	unsigned long block_id = memory_block_id(section_nr);
    598
    599	return find_memory_block_by_id(block_id);
    600}
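/*
 * Reference handling sketch (illustrative, hypothetical helper): both lookup
 * helpers above return the block with a device reference taken via
 * get_device(), so a caller (running under device_hotplug_lock) is expected
 * to pair the lookup with put_device().
 */
#if 0	/* example only */
static int example_block_is_offline(unsigned long section_nr)
{
	struct memory_block *mem = find_memory_block(section_nr);
	int offline;

	if (!mem)
		return -ENODEV;
	offline = mem->state == MEM_OFFLINE;
	/* drop the reference taken by find_memory_block() */
	put_device(&mem->dev);
	return offline;
}
#endif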
    601
    602static struct attribute *memory_memblk_attrs[] = {
    603	&dev_attr_phys_index.attr,
    604	&dev_attr_state.attr,
    605	&dev_attr_phys_device.attr,
    606	&dev_attr_removable.attr,
    607#ifdef CONFIG_MEMORY_HOTREMOVE
    608	&dev_attr_valid_zones.attr,
    609#endif
    610	NULL
    611};
    612
    613static const struct attribute_group memory_memblk_attr_group = {
    614	.attrs = memory_memblk_attrs,
    615};
    616
    617static const struct attribute_group *memory_memblk_attr_groups[] = {
    618	&memory_memblk_attr_group,
    619	NULL,
    620};
    621
    622static int __add_memory_block(struct memory_block *memory)
    623{
    624	int ret;
    625
    626	memory->dev.bus = &memory_subsys;
    627	memory->dev.id = memory->start_section_nr / sections_per_block;
    628	memory->dev.release = memory_block_release;
    629	memory->dev.groups = memory_memblk_attr_groups;
    630	memory->dev.offline = memory->state == MEM_OFFLINE;
    631
    632	ret = device_register(&memory->dev);
    633	if (ret) {
    634		put_device(&memory->dev);
    635		return ret;
    636	}
    637	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
    638			      GFP_KERNEL));
    639	if (ret)
    640		device_unregister(&memory->dev);
    641
    642	return ret;
    643}
    644
    645static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
    646						     int nid)
    647{
    648	const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
    649	const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
    650	struct zone *zone, *matching_zone = NULL;
    651	pg_data_t *pgdat = NODE_DATA(nid);
    652	int i;
    653
    654	/*
    655	 * This logic only works for early memory, when the applicable zones
    656	 * already span the memory block. We don't expect overlapping zones on
    657	 * a single node for early memory. So if we're told that some PFNs
    658	 * of a node fall into this memory block, we can assume that all node
    659	 * zones that intersect with the memory block are actually applicable.
    660	 * No need to look at the memmap.
    661	 */
    662	for (i = 0; i < MAX_NR_ZONES; i++) {
    663		zone = pgdat->node_zones + i;
    664		if (!populated_zone(zone))
    665			continue;
    666		if (!zone_intersects(zone, start_pfn, nr_pages))
    667			continue;
    668		if (!matching_zone) {
    669			matching_zone = zone;
    670			continue;
    671		}
    672		/* Spans multiple zones ... */
    673		matching_zone = NULL;
    674		break;
    675	}
    676	return matching_zone;
    677}
    678
    679#ifdef CONFIG_NUMA
    680/**
    681 * memory_block_add_nid() - Indicate that system RAM falling into this memory
    682 *			    block device (partially) belongs to the given node.
    683 * @mem: The memory block device.
    684 * @nid: The node id.
    685 * @context: The memory initialization context.
    686 *
    687 * Indicate that system RAM falling into this memory block (partially) belongs
    688 * to the given node. If the context indicates ("early") that we are adding the
    689 * node during node device subsystem initialization, this will also properly
    690 * set/adjust mem->zone based on the zone ranges of the given node.
    691 */
    692void memory_block_add_nid(struct memory_block *mem, int nid,
    693			  enum meminit_context context)
    694{
    695	if (context == MEMINIT_EARLY && mem->nid != nid) {
    696		/*
    697		 * For early memory we have to determine the zone when setting
    698		 * the node id and handle multiple nodes spanning a single
     699		 * memory block by indicating via zone == NULL that we're not
    700		 * dealing with a single zone. So if we're setting the node id
    701		 * the first time, determine if there is a single zone. If we're
    702		 * setting the node id a second time to a different node,
    703		 * invalidate the single detected zone.
    704		 */
    705		if (mem->nid == NUMA_NO_NODE)
    706			mem->zone = early_node_zone_for_memory_block(mem, nid);
    707		else
    708			mem->zone = NULL;
    709	}
    710
    711	/*
    712	 * If this memory block spans multiple nodes, we only indicate
    713	 * the last processed node. If we span multiple nodes (not applicable
    714	 * to hotplugged memory), zone == NULL will prohibit memory offlining
     715	 * and, consequently, unplugging.
    716	 */
    717	mem->nid = nid;
    718}
    719#endif
    720
    721static int add_memory_block(unsigned long block_id, unsigned long state,
    722			    unsigned long nr_vmemmap_pages,
    723			    struct memory_group *group)
    724{
    725	struct memory_block *mem;
    726	int ret = 0;
    727
    728	mem = find_memory_block_by_id(block_id);
    729	if (mem) {
    730		put_device(&mem->dev);
    731		return -EEXIST;
    732	}
    733	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
    734	if (!mem)
    735		return -ENOMEM;
    736
    737	mem->start_section_nr = block_id * sections_per_block;
    738	mem->state = state;
    739	mem->nid = NUMA_NO_NODE;
    740	mem->nr_vmemmap_pages = nr_vmemmap_pages;
    741	INIT_LIST_HEAD(&mem->group_next);
    742
    743#ifndef CONFIG_NUMA
    744	if (state == MEM_ONLINE)
    745		/*
    746		 * MEM_ONLINE at this point implies early memory. With NUMA,
    747		 * we'll determine the zone when setting the node id via
     748		 * memory_block_add_nid(). Memory hotplug updates the zone
    749		 * manually when memory onlining/offlining succeeds.
    750		 */
    751		mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
    752#endif /* CONFIG_NUMA */
    753
    754	ret = __add_memory_block(mem);
    755	if (ret)
    756		return ret;
    757
    758	if (group) {
    759		mem->group = group;
    760		list_add(&mem->group_next, &group->memory_blocks);
    761	}
    762
    763	return 0;
    764}
    765
    766static int __init add_boot_memory_block(unsigned long base_section_nr)
    767{
    768	int section_count = 0;
    769	unsigned long nr;
    770
    771	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
    772	     nr++)
    773		if (present_section_nr(nr))
    774			section_count++;
    775
    776	if (section_count == 0)
    777		return 0;
    778	return add_memory_block(memory_block_id(base_section_nr),
    779				MEM_ONLINE, 0,  NULL);
    780}
    781
    782static int add_hotplug_memory_block(unsigned long block_id,
    783				    unsigned long nr_vmemmap_pages,
    784				    struct memory_group *group)
    785{
    786	return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
    787}
    788
    789static void remove_memory_block(struct memory_block *memory)
    790{
    791	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
    792		return;
    793
    794	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);
    795
    796	if (memory->group) {
    797		list_del(&memory->group_next);
    798		memory->group = NULL;
    799	}
    800
    801	/* drop the ref. we got via find_memory_block() */
    802	put_device(&memory->dev);
    803	device_unregister(&memory->dev);
    804}
    805
    806/*
    807 * Create memory block devices for the given memory area. Start and size
    808 * have to be aligned to memory block granularity. Memory block devices
    809 * will be initialized as offline.
    810 *
    811 * Called under device_hotplug_lock.
    812 */
    813int create_memory_block_devices(unsigned long start, unsigned long size,
    814				unsigned long vmemmap_pages,
    815				struct memory_group *group)
    816{
    817	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
    818	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
    819	struct memory_block *mem;
    820	unsigned long block_id;
    821	int ret = 0;
    822
    823	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
    824			 !IS_ALIGNED(size, memory_block_size_bytes())))
    825		return -EINVAL;
    826
    827	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
    828		ret = add_hotplug_memory_block(block_id, vmemmap_pages, group);
    829		if (ret)
    830			break;
    831	}
    832	if (ret) {
    833		end_block_id = block_id;
    834		for (block_id = start_block_id; block_id != end_block_id;
    835		     block_id++) {
    836			mem = find_memory_block_by_id(block_id);
    837			if (WARN_ON_ONCE(!mem))
    838				continue;
    839			remove_memory_block(mem);
    840		}
    841	}
    842	return ret;
    843}
    844
    845/*
    846 * Remove memory block devices for the given memory area. Start and size
    847 * have to be aligned to memory block granularity. Memory block devices
    848 * have to be offline.
    849 *
    850 * Called under device_hotplug_lock.
    851 */
    852void remove_memory_block_devices(unsigned long start, unsigned long size)
    853{
    854	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
    855	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
    856	struct memory_block *mem;
    857	unsigned long block_id;
    858
    859	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
    860			 !IS_ALIGNED(size, memory_block_size_bytes())))
    861		return;
    862
    863	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
    864		mem = find_memory_block_by_id(block_id);
    865		if (WARN_ON_ONCE(!mem))
    866			continue;
    867		unregister_memory_block_under_nodes(mem);
    868		remove_memory_block(mem);
    869	}
    870}
    871
    872/* return true if the memory block is offlined, otherwise, return false */
    873bool is_memblock_offlined(struct memory_block *mem)
    874{
    875	return mem->state == MEM_OFFLINE;
    876}
    877
    878static struct attribute *memory_root_attrs[] = {
    879#ifdef CONFIG_ARCH_MEMORY_PROBE
    880	&dev_attr_probe.attr,
    881#endif
    882
    883#ifdef CONFIG_MEMORY_FAILURE
    884	&dev_attr_soft_offline_page.attr,
    885	&dev_attr_hard_offline_page.attr,
    886#endif
    887
    888	&dev_attr_block_size_bytes.attr,
    889	&dev_attr_auto_online_blocks.attr,
    890	NULL
    891};
    892
    893static const struct attribute_group memory_root_attr_group = {
    894	.attrs = memory_root_attrs,
    895};
    896
    897static const struct attribute_group *memory_root_attr_groups[] = {
    898	&memory_root_attr_group,
    899	NULL,
    900};
    901
    902/*
    903 * Initialize the sysfs support for memory devices. At the time this function
    904 * is called, we cannot have concurrent creation/deletion of memory block
     905 * devices, so the device_hotplug_lock is not needed.
    906 */
    907void __init memory_dev_init(void)
    908{
    909	int ret;
    910	unsigned long block_sz, nr;
    911
    912	/* Validate the configured memory block size */
    913	block_sz = memory_block_size_bytes();
    914	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
    915		panic("Memory block size not suitable: 0x%lx\n", block_sz);
    916	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
    917
    918	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
    919	if (ret)
    920		panic("%s() failed to register subsystem: %d\n", __func__, ret);
    921
    922	/*
    923	 * Create entries for memory sections that were found
    924	 * during boot and have been initialized
    925	 */
    926	for (nr = 0; nr <= __highest_present_section_nr;
    927	     nr += sections_per_block) {
    928		ret = add_boot_memory_block(nr);
    929		if (ret)
    930			panic("%s() failed to add memory block: %d\n", __func__,
    931			      ret);
    932	}
    933}
    934
    935/**
    936 * walk_memory_blocks - walk through all present memory blocks overlapped
    937 *			by the range [start, start + size)
    938 *
    939 * @start: start address of the memory range
    940 * @size: size of the memory range
    941 * @arg: argument passed to func
     942 * @func: callback for each memory block walked
    943 *
    944 * This function walks through all present memory blocks overlapped by the
    945 * range [start, start + size), calling func on each memory block.
    946 *
    947 * In case func() returns an error, walking is aborted and the error is
    948 * returned.
    949 *
    950 * Called under device_hotplug_lock.
    951 */
    952int walk_memory_blocks(unsigned long start, unsigned long size,
    953		       void *arg, walk_memory_blocks_func_t func)
    954{
    955	const unsigned long start_block_id = phys_to_block_id(start);
    956	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
    957	struct memory_block *mem;
    958	unsigned long block_id;
    959	int ret = 0;
    960
    961	if (!size)
    962		return 0;
    963
    964	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
    965		mem = find_memory_block_by_id(block_id);
    966		if (!mem)
    967			continue;
    968
    969		ret = func(mem, arg);
    970		put_device(&mem->dev);
    971		if (ret)
    972			break;
    973	}
    974	return ret;
    975}
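/*
 * Minimal walker sketch (illustrative, hypothetical names): count how many
 * memory blocks overlapping a physical range are still online. The callback
 * matches walk_memory_blocks_func_t, i.e. int (*)(struct memory_block *, void *).
 */
#if 0	/* example only */
static int example_count_online(struct memory_block *mem, void *arg)
{
	unsigned long *online = arg;

	if (mem->state == MEM_ONLINE)
		(*online)++;
	return 0;
}

/*
 * Under device_hotplug_lock:
 *	unsigned long online = 0;
 *	walk_memory_blocks(start, size, &online, example_count_online);
 */
#endif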
    976
    977struct for_each_memory_block_cb_data {
    978	walk_memory_blocks_func_t func;
    979	void *arg;
    980};
    981
    982static int for_each_memory_block_cb(struct device *dev, void *data)
    983{
    984	struct memory_block *mem = to_memory_block(dev);
    985	struct for_each_memory_block_cb_data *cb_data = data;
    986
    987	return cb_data->func(mem, cb_data->arg);
    988}
    989
    990/**
    991 * for_each_memory_block - walk through all present memory blocks
    992 *
    993 * @arg: argument passed to func
    994 * @func: callback for each memory block walked
    995 *
    996 * This function walks through all present memory blocks, calling func on
    997 * each memory block.
    998 *
    999 * In case func() returns an error, walking is aborted and the error is
   1000 * returned.
   1001 */
   1002int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
   1003{
   1004	struct for_each_memory_block_cb_data cb_data = {
   1005		.func = func,
   1006		.arg = arg,
   1007	};
   1008
   1009	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
   1010				for_each_memory_block_cb);
   1011}
   1012
   1013/*
   1014 * This is an internal helper to unify allocation and initialization of
   1015 * memory groups. Note that the passed memory group will be copied to a
   1016 * dynamically allocated memory group. After this call, the passed
   1017 * memory group should no longer be used.
   1018 */
   1019static int memory_group_register(struct memory_group group)
   1020{
   1021	struct memory_group *new_group;
   1022	uint32_t mgid;
   1023	int ret;
   1024
   1025	if (!node_possible(group.nid))
   1026		return -EINVAL;
   1027
   1028	new_group = kzalloc(sizeof(group), GFP_KERNEL);
   1029	if (!new_group)
   1030		return -ENOMEM;
   1031	*new_group = group;
   1032	INIT_LIST_HEAD(&new_group->memory_blocks);
   1033
   1034	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
   1035		       GFP_KERNEL);
   1036	if (ret) {
   1037		kfree(new_group);
   1038		return ret;
   1039	} else if (group.is_dynamic) {
   1040		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
   1041	}
   1042	return mgid;
   1043}
   1044
   1045/**
   1046 * memory_group_register_static() - Register a static memory group.
   1047 * @nid: The node id.
   1048 * @max_pages: The maximum number of pages we'll have in this static memory
   1049 *	       group.
   1050 *
   1051 * Register a new static memory group and return the memory group id.
   1052 * All memory in the group belongs to a single unit, such as a DIMM. All
   1053 * memory belonging to a static memory group is added in one go to be removed
   1054 * in one go -- it's static.
   1055 *
   1056 * Returns an error if out of memory, if the node id is invalid, if no new
   1057 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
   1058 * returns the new memory group id.
   1059 */
   1060int memory_group_register_static(int nid, unsigned long max_pages)
   1061{
   1062	struct memory_group group = {
   1063		.nid = nid,
   1064		.s = {
   1065			.max_pages = max_pages,
   1066		},
   1067	};
   1068
   1069	if (!max_pages)
   1070		return -EINVAL;
   1071	return memory_group_register(group);
   1072}
   1073EXPORT_SYMBOL_GPL(memory_group_register_static);
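/*
 * Illustrative sketch of how a driver ties the group API to memory hotplug
 * (modelled loosely on the dax/kmem driver; example_add_device_memory() and
 * the resource name are made up): register a static group for the device's
 * memory and pass the returned group id instead of a node id by setting
 * MHP_NID_IS_MGID when adding the memory.
 */
#if 0	/* example only */
static int example_add_device_memory(int nid, struct resource *res)
{
	int mgid, rc;

	mgid = memory_group_register_static(nid, PFN_UP(resource_size(res)));
	if (mgid < 0)
		return mgid;

	rc = add_memory_driver_managed(mgid, res->start, resource_size(res),
				       "System RAM (example)",
				       MHP_NID_IS_MGID);
	if (rc)
		memory_group_unregister(mgid);
	return rc;
}
#endif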
   1074
   1075/**
   1076 * memory_group_register_dynamic() - Register a dynamic memory group.
   1077 * @nid: The node id.
    1078 * @unit_pages: Unit in pages in which memory is added/removed in this dynamic
   1079 *		memory group.
   1080 *
   1081 * Register a new dynamic memory group and return the memory group id.
   1082 * Memory within a dynamic memory group is added/removed dynamically
   1083 * in unit_pages.
   1084 *
   1085 * Returns an error if out of memory, if the node id is invalid, if no new
   1086 * memory groups can be registered, or if unit_pages is invalid (0, not a
   1087 * power of two, smaller than a single memory block). Otherwise, returns the
   1088 * new memory group id.
   1089 */
   1090int memory_group_register_dynamic(int nid, unsigned long unit_pages)
   1091{
   1092	struct memory_group group = {
   1093		.nid = nid,
   1094		.is_dynamic = true,
   1095		.d = {
   1096			.unit_pages = unit_pages,
   1097		},
   1098	};
   1099
   1100	if (!unit_pages || !is_power_of_2(unit_pages) ||
   1101	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
   1102		return -EINVAL;
   1103	return memory_group_register(group);
   1104}
   1105EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
   1106
   1107/**
   1108 * memory_group_unregister() - Unregister a memory group.
   1109 * @mgid: the memory group id
   1110 *
   1111 * Unregister a memory group. If any memory block still belongs to this
   1112 * memory group, unregistering will fail.
   1113 *
   1114 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
   1115 * memory blocks still belong to this memory group and returns 0 if
   1116 * unregistering succeeded.
   1117 */
   1118int memory_group_unregister(int mgid)
   1119{
   1120	struct memory_group *group;
   1121
   1122	if (mgid < 0)
   1123		return -EINVAL;
   1124
   1125	group = xa_load(&memory_groups, mgid);
   1126	if (!group)
   1127		return -EINVAL;
   1128	if (!list_empty(&group->memory_blocks))
   1129		return -EBUSY;
   1130	xa_erase(&memory_groups, mgid);
   1131	kfree(group);
   1132	return 0;
   1133}
   1134EXPORT_SYMBOL_GPL(memory_group_unregister);
   1135
   1136/*
   1137 * This is an internal helper only to be used in core memory hotplug code to
   1138 * lookup a memory group. We don't care about locking, as we don't expect a
   1139 * memory group to get unregistered while adding memory to it -- because
    1140 * the group and the memory are managed by the same driver.
   1141 */
   1142struct memory_group *memory_group_find_by_id(int mgid)
   1143{
   1144	return xa_load(&memory_groups, mgid);
   1145}
   1146
   1147/*
   1148 * This is an internal helper only to be used in core memory hotplug code to
   1149 * walk all dynamic memory groups excluding a given memory group, either
   1150 * belonging to a specific node, or belonging to any node.
   1151 */
   1152int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
   1153			       struct memory_group *excluded, void *arg)
   1154{
   1155	struct memory_group *group;
   1156	unsigned long index;
   1157	int ret = 0;
   1158
   1159	xa_for_each_marked(&memory_groups, index, group,
   1160			   MEMORY_GROUP_MARK_DYNAMIC) {
   1161		if (group == excluded)
   1162			continue;
   1163#ifdef CONFIG_NUMA
   1164		if (nid != NUMA_NO_NODE && group->nid != nid)
   1165			continue;
   1166#endif /* CONFIG_NUMA */
   1167		ret = func(group, arg);
   1168		if (ret)
   1169			break;
   1170	}
   1171	return ret;
   1172}
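/*
 * Walker sketch (illustrative, hypothetical names): dynamic-group users such
 * as virtio-mem iterate the other dynamic groups on a node to estimate how
 * much dynamically managed memory is already present there. The callback
 * matches walk_memory_groups_func_t.
 */
#if 0	/* example only */
static int example_count_group_pages(struct memory_group *group, void *arg)
{
	unsigned long *pages = arg;

	*pages += group->present_kernel_pages + group->present_movable_pages;
	return 0;
}

/* walk_dynamic_memory_groups(nid, example_count_group_pages, NULL, &pages); */
#endif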