cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

setup.c (34083B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 *  Copyright (C) 1995  Linus Torvalds
      4 *
      5 * This file contains the setup_arch() code, which handles the architecture-dependent
      6 * parts of early kernel initialization.
      7 */
      8#include <linux/acpi.h>
      9#include <linux/console.h>
     10#include <linux/crash_dump.h>
     11#include <linux/dma-map-ops.h>
     12#include <linux/dmi.h>
     13#include <linux/efi.h>
     14#include <linux/init_ohci1394_dma.h>
     15#include <linux/initrd.h>
     16#include <linux/iscsi_ibft.h>
     17#include <linux/memblock.h>
     18#include <linux/panic_notifier.h>
     19#include <linux/pci.h>
     20#include <linux/root_dev.h>
     21#include <linux/hugetlb.h>
     22#include <linux/tboot.h>
     23#include <linux/usb/xhci-dbgp.h>
     24#include <linux/static_call.h>
     25#include <linux/swiotlb.h>
     26
     27#include <uapi/linux/mount.h>
     28
     29#include <xen/xen.h>
     30
     31#include <asm/apic.h>
     32#include <asm/numa.h>
     33#include <asm/bios_ebda.h>
     34#include <asm/bugs.h>
     35#include <asm/cpu.h>
     36#include <asm/efi.h>
     37#include <asm/gart.h>
     38#include <asm/hypervisor.h>
     39#include <asm/io_apic.h>
     40#include <asm/kasan.h>
     41#include <asm/kaslr.h>
     42#include <asm/mce.h>
     43#include <asm/memtype.h>
     44#include <asm/mtrr.h>
     45#include <asm/realmode.h>
     46#include <asm/olpc_ofw.h>
     47#include <asm/pci-direct.h>
     48#include <asm/prom.h>
     49#include <asm/proto.h>
     50#include <asm/thermal.h>
     51#include <asm/unwind.h>
     52#include <asm/vsyscall.h>
     53#include <linux/vmalloc.h>
     54
     55/*
     56 * max_low_pfn_mapped: highest directly mapped pfn < 4 GB
     57 * max_pfn_mapped:     highest directly mapped pfn > 4 GB
     58 *
     59 * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are
     60 * represented by pfn_mapped[].
     61 */
     62unsigned long max_low_pfn_mapped;
     63unsigned long max_pfn_mapped;
     64
     65#ifdef CONFIG_DMI
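        /*
         * Brk space handed out by dmi_alloc() (arch/x86/include/asm/dmi.h) so
         * the early DMI scan can allocate before the normal allocators are up.
         */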
     66RESERVE_BRK(dmi_alloc, 65536);
     67#endif
     68
     69
     70unsigned long _brk_start = (unsigned long)__brk_base;
     71unsigned long _brk_end   = (unsigned long)__brk_base;
     72
     73struct boot_params boot_params;
     74
     75/*
     76 * These are the four main kernel memory regions; we put them into
     77 * the resource tree so that kdump tools and other debugging tools
     78 * can recover them:
     79 */
     80
     81static struct resource rodata_resource = {
     82	.name	= "Kernel rodata",
     83	.start	= 0,
     84	.end	= 0,
     85	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
     86};
     87
     88static struct resource data_resource = {
     89	.name	= "Kernel data",
     90	.start	= 0,
     91	.end	= 0,
     92	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
     93};
     94
     95static struct resource code_resource = {
     96	.name	= "Kernel code",
     97	.start	= 0,
     98	.end	= 0,
     99	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
    100};
    101
    102static struct resource bss_resource = {
    103	.name	= "Kernel bss",
    104	.start	= 0,
    105	.end	= 0,
    106	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
    107};
    108
    109
    110#ifdef CONFIG_X86_32
    111/* CPU data as detected by the assembly code in head_32.S */
    112struct cpuinfo_x86 new_cpu_data;
    113
    114/* Common CPU data for all CPUs */
    115struct cpuinfo_x86 boot_cpu_data __read_mostly;
    116EXPORT_SYMBOL(boot_cpu_data);
    117
    118unsigned int def_to_bigsmp;
    119
    120struct apm_info apm_info;
    121EXPORT_SYMBOL(apm_info);
    122
    123#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
    124	defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
    125struct ist_info ist_info;
    126EXPORT_SYMBOL(ist_info);
    127#else
    128struct ist_info ist_info;
    129#endif
    130
    131#else
    132struct cpuinfo_x86 boot_cpu_data __read_mostly;
    133EXPORT_SYMBOL(boot_cpu_data);
    134#endif
    135
    136
    137#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
    138__visible unsigned long mmu_cr4_features __ro_after_init;
    139#else
    140__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
    141#endif
    142
    143/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
    144int bootloader_type, bootloader_version;
    145
    146/*
    147 * Setup options
    148 */
    149struct screen_info screen_info;
    150EXPORT_SYMBOL(screen_info);
    151struct edid_info edid_info;
    152EXPORT_SYMBOL_GPL(edid_info);
    153
    154extern int root_mountflags;
    155
    156unsigned long saved_video_mode;
    157
    158#define RAMDISK_IMAGE_START_MASK	0x07FF
    159#define RAMDISK_PROMPT_FLAG		0x8000
    160#define RAMDISK_LOAD_FLAG		0x4000
    161
    162static char __initdata command_line[COMMAND_LINE_SIZE];
    163#ifdef CONFIG_CMDLINE_BOOL
    164static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
    165#endif
    166
    167#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
    168struct edd edd;
    169#ifdef CONFIG_EDD_MODULE
    170EXPORT_SYMBOL(edd);
    171#endif
    172/**
    173 * copy_edd() - Copy the BIOS EDD information
    174 *              from boot_params into a safe place.
    175 *
    176 */
    177static inline void __init copy_edd(void)
    178{
    179     memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
    180	    sizeof(edd.mbr_signature));
    181     memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
    182     edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
    183     edd.edd_info_nr = boot_params.eddbuf_entries;
    184}
    185#else
    186static inline void __init copy_edd(void)
    187{
    188}
    189#endif
    190
    191void * __init extend_brk(size_t size, size_t align)
    192{
    193	size_t mask = align - 1;
    194	void *ret;
    195
    196	BUG_ON(_brk_start == 0);
    197	BUG_ON(align & mask);
    198
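        	/* Round the current brk end up to the requested power-of-two alignment. */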
    199	_brk_end = (_brk_end + mask) & ~mask;
    200	BUG_ON((char *)(_brk_end + size) > __brk_limit);
    201
    202	ret = (void *)_brk_end;
    203	_brk_end += size;
    204
    205	memset(ret, 0, size);
    206
    207	return ret;
    208}
    209
    210#ifdef CONFIG_X86_32
    211static void __init cleanup_highmap(void)
    212{
    213}
    214#endif
    215
    216static void __init reserve_brk(void)
    217{
    218	if (_brk_end > _brk_start)
    219		memblock_reserve(__pa_symbol(_brk_start),
    220				 _brk_end - _brk_start);
    221
    222	/* Mark brk area as locked down and no longer taking any
    223	   new allocations */
    224	_brk_start = 0;
    225}
    226
    227u64 relocated_ramdisk;
    228
    229#ifdef CONFIG_BLK_DEV_INITRD
    230
    231static u64 __init get_ramdisk_image(void)
    232{
    233	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
    234
    235	ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
    236
    237	if (ramdisk_image == 0)
    238		ramdisk_image = phys_initrd_start;
    239
    240	return ramdisk_image;
    241}
    242static u64 __init get_ramdisk_size(void)
    243{
    244	u64 ramdisk_size = boot_params.hdr.ramdisk_size;
    245
    246	ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
    247
    248	if (ramdisk_size == 0)
    249		ramdisk_size = phys_initrd_size;
    250
    251	return ramdisk_size;
    252}
    253
    254static void __init relocate_initrd(void)
    255{
    256	/* Assume only end is not page aligned */
    257	u64 ramdisk_image = get_ramdisk_image();
    258	u64 ramdisk_size  = get_ramdisk_size();
    259	u64 area_size     = PAGE_ALIGN(ramdisk_size);
    260
    261	/* We need to move the initrd down into directly mapped mem */
    262	relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
    263						      PFN_PHYS(max_pfn_mapped));
    264	if (!relocated_ramdisk)
    265		panic("Cannot find place for new RAMDISK of size %lld\n",
    266		      ramdisk_size);
    267
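        	/*
        	 * relocated_ramdisk is a physical address within the direct mapping,
        	 * so adding PAGE_OFFSET yields its kernel virtual address.
        	 */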
    268	initrd_start = relocated_ramdisk + PAGE_OFFSET;
    269	initrd_end   = initrd_start + ramdisk_size;
    270	printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
    271	       relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
    272
    273	copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
    274
    275	printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
    276		" [mem %#010llx-%#010llx]\n",
    277		ramdisk_image, ramdisk_image + ramdisk_size - 1,
    278		relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
    279}
    280
    281static void __init early_reserve_initrd(void)
    282{
    283	/* Assume only end is not page aligned */
    284	u64 ramdisk_image = get_ramdisk_image();
    285	u64 ramdisk_size  = get_ramdisk_size();
    286	u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
    287
    288	if (!boot_params.hdr.type_of_loader ||
    289	    !ramdisk_image || !ramdisk_size)
    290		return;		/* No initrd provided by bootloader */
    291
    292	memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
    293}
    294
    295static void __init reserve_initrd(void)
    296{
    297	/* Assume only end is not page aligned */
    298	u64 ramdisk_image = get_ramdisk_image();
    299	u64 ramdisk_size  = get_ramdisk_size();
    300	u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
    301
    302	if (!boot_params.hdr.type_of_loader ||
    303	    !ramdisk_image || !ramdisk_size)
    304		return;		/* No initrd provided by bootloader */
    305
    306	initrd_start = 0;
    307
    308	printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
    309			ramdisk_end - 1);
    310
    311	if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
    312				PFN_DOWN(ramdisk_end))) {
    313		/* All are mapped, easy case */
    314		initrd_start = ramdisk_image + PAGE_OFFSET;
    315		initrd_end = initrd_start + ramdisk_size;
    316		return;
    317	}
    318
    319	relocate_initrd();
    320
    321	memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image);
    322}
    323
    324#else
    325static void __init early_reserve_initrd(void)
    326{
    327}
    328static void __init reserve_initrd(void)
    329{
    330}
    331#endif /* CONFIG_BLK_DEV_INITRD */
    332
    333static void __init parse_setup_data(void)
    334{
    335	struct setup_data *data;
    336	u64 pa_data, pa_next;
    337
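        	/*
        	 * setup_data is a singly linked list of physical addresses: each
        	 * entry's ->next holds the physical address of the next entry and
        	 * a value of 0 terminates the list.
        	 */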
    338	pa_data = boot_params.hdr.setup_data;
    339	while (pa_data) {
    340		u32 data_len, data_type;
    341
    342		data = early_memremap(pa_data, sizeof(*data));
    343		data_len = data->len + sizeof(struct setup_data);
    344		data_type = data->type;
    345		pa_next = data->next;
    346		early_memunmap(data, sizeof(*data));
    347
    348		switch (data_type) {
    349		case SETUP_E820_EXT:
    350			e820__memory_setup_extended(pa_data, data_len);
    351			break;
    352		case SETUP_DTB:
    353			add_dtb(pa_data);
    354			break;
    355		case SETUP_EFI:
    356			parse_efi_setup(pa_data, data_len);
    357			break;
    358		default:
    359			break;
    360		}
    361		pa_data = pa_next;
    362	}
    363}
    364
    365static void __init memblock_x86_reserve_range_setup_data(void)
    366{
    367	struct setup_indirect *indirect;
    368	struct setup_data *data;
    369	u64 pa_data, pa_next;
    370	u32 len;
    371
    372	pa_data = boot_params.hdr.setup_data;
    373	while (pa_data) {
    374		data = early_memremap(pa_data, sizeof(*data));
    375		if (!data) {
    376			pr_warn("setup: failed to memremap setup_data entry\n");
    377			return;
    378		}
    379
    380		len = sizeof(*data);
    381		pa_next = data->next;
    382
    383		memblock_reserve(pa_data, sizeof(*data) + data->len);
    384
    385		if (data->type == SETUP_INDIRECT) {
    386			len += data->len;
    387			early_memunmap(data, sizeof(*data));
    388			data = early_memremap(pa_data, len);
    389			if (!data) {
    390				pr_warn("setup: failed to memremap indirect setup_data\n");
    391				return;
    392			}
    393
    394			indirect = (struct setup_indirect *)data->data;
    395
    396			if (indirect->type != SETUP_INDIRECT)
    397				memblock_reserve(indirect->addr, indirect->len);
    398		}
    399
    400		pa_data = pa_next;
    401		early_memunmap(data, len);
    402	}
    403}
    404
    405/*
    406 * --------- Crashkernel reservation ------------------------------
    407 */
    408
    409/* 16M alignment for crash kernel regions */
    410#define CRASH_ALIGN		SZ_16M
    411
    412/*
    413 * Keep the crash kernel below this limit.
    414 *
     416 * Earlier 32-bit kernels would limit the crash kernel to the low 512 MB range
    416 * due to mapping restrictions.
    417 *
    418 * 64-bit kdump kernels need to be restricted to be under 64 TB, which is
    419 * the upper limit of system RAM in 4-level paging mode. Since the kdump
    420 * jump could be from 5-level paging to 4-level paging, the jump will fail if
    421 * the kernel is put above 64 TB, and during the 1st kernel bootup there's
    422 * no good way to detect the paging mode of the target kernel which will be
    423 * loaded for dumping.
    424 */
    425#ifdef CONFIG_X86_32
    426# define CRASH_ADDR_LOW_MAX	SZ_512M
    427# define CRASH_ADDR_HIGH_MAX	SZ_512M
    428#else
    429# define CRASH_ADDR_LOW_MAX	SZ_4G
    430# define CRASH_ADDR_HIGH_MAX	SZ_64T
    431#endif
    432
    433static int __init reserve_crashkernel_low(void)
    434{
    435#ifdef CONFIG_X86_64
    436	unsigned long long base, low_base = 0, low_size = 0;
    437	unsigned long low_mem_limit;
    438	int ret;
    439
    440	low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX);
    441
    442	/* crashkernel=Y,low */
    443	ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base);
    444	if (ret) {
    445		/*
    446		 * two parts from kernel/dma/swiotlb.c:
    447		 * -swiotlb size: user-specified with swiotlb= or default.
    448		 *
    449		 * -swiotlb overflow buffer: now hardcoded to 32k. We round it
    450		 * to 8M for other buffers that may need to stay low too. Also
    451		 * make sure we allocate enough extra low memory so that we
    452		 * don't run out of DMA buffers for 32-bit devices.
    453		 */
    454		low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20);
    455	} else {
    456		/* passed with crashkernel=0,low ? */
    457		if (!low_size)
    458			return 0;
    459	}
    460
    461	low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
    462	if (!low_base) {
    463		pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
    464		       (unsigned long)(low_size >> 20));
    465		return -ENOMEM;
    466	}
    467
    468	pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n",
    469		(unsigned long)(low_size >> 20),
    470		(unsigned long)(low_base >> 20),
    471		(unsigned long)(low_mem_limit >> 20));
    472
    473	crashk_low_res.start = low_base;
    474	crashk_low_res.end   = low_base + low_size - 1;
    475	insert_resource(&iomem_resource, &crashk_low_res);
    476#endif
    477	return 0;
    478}
    479
    480static void __init reserve_crashkernel(void)
    481{
    482	unsigned long long crash_size, crash_base, total_mem;
    483	bool high = false;
    484	int ret;
    485
    486	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
    487		return;
    488
    489	total_mem = memblock_phys_mem_size();
    490
    491	/* crashkernel=XM */
    492	ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
    493	if (ret != 0 || crash_size <= 0) {
    494		/* crashkernel=X,high */
    495		ret = parse_crashkernel_high(boot_command_line, total_mem,
    496					     &crash_size, &crash_base);
    497		if (ret != 0 || crash_size <= 0)
    498			return;
    499		high = true;
    500	}
    501
    502	if (xen_pv_domain()) {
    503		pr_info("Ignoring crashkernel for a Xen PV domain\n");
    504		return;
    505	}
    506
    507	/* 0 means: find the address automatically */
    508	if (!crash_base) {
    509		/*
    510		 * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
    511		 * crashkernel=x,high reserves memory over 4G, also allocates
    512		 * 256M extra low memory for DMA buffers and swiotlb.
    513		 * But the extra memory is not required for all machines.
    514		 * So try low memory first and fall back to high memory
    515		 * unless "crashkernel=size[KMG],high" is specified.
    516		 */
    517		if (!high)
    518			crash_base = memblock_phys_alloc_range(crash_size,
    519						CRASH_ALIGN, CRASH_ALIGN,
    520						CRASH_ADDR_LOW_MAX);
    521		if (!crash_base)
    522			crash_base = memblock_phys_alloc_range(crash_size,
    523						CRASH_ALIGN, CRASH_ALIGN,
    524						CRASH_ADDR_HIGH_MAX);
    525		if (!crash_base) {
    526			pr_info("crashkernel reservation failed - No suitable area found.\n");
    527			return;
    528		}
    529	} else {
    530		unsigned long long start;
    531
    532		start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base,
    533						  crash_base + crash_size);
    534		if (start != crash_base) {
    535			pr_info("crashkernel reservation failed - memory is in use.\n");
    536			return;
    537		}
    538	}
    539
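        	/*
        	 * A crash region above 4G also needs a low-memory companion
        	 * reservation for DMA and swiotlb buffers; back out the high
        	 * reservation if that cannot be set up.
        	 */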
    540	if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
    541		memblock_phys_free(crash_base, crash_size);
    542		return;
    543	}
    544
    545	pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
    546		(unsigned long)(crash_size >> 20),
    547		(unsigned long)(crash_base >> 20),
    548		(unsigned long)(total_mem >> 20));
    549
    550	crashk_res.start = crash_base;
    551	crashk_res.end   = crash_base + crash_size - 1;
    552	insert_resource(&iomem_resource, &crashk_res);
    553}
    554
    555static struct resource standard_io_resources[] = {
    556	{ .name = "dma1", .start = 0x00, .end = 0x1f,
    557		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
    558	{ .name = "pic1", .start = 0x20, .end = 0x21,
    559		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
    560	{ .name = "timer0", .start = 0x40, .end = 0x43,
    561		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
    562	{ .name = "timer1", .start = 0x50, .end = 0x53,
    563		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
    564	{ .name = "keyboard", .start = 0x60, .end = 0x60,
    565		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
    566	{ .name = "keyboard", .start = 0x64, .end = 0x64,
    567		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
    568	{ .name = "dma page reg", .start = 0x80, .end = 0x8f,
    569		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
    570	{ .name = "pic2", .start = 0xa0, .end = 0xa1,
    571		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
    572	{ .name = "dma2", .start = 0xc0, .end = 0xdf,
    573		.flags = IORESOURCE_BUSY | IORESOURCE_IO },
    574	{ .name = "fpu", .start = 0xf0, .end = 0xff,
    575		.flags = IORESOURCE_BUSY | IORESOURCE_IO }
    576};
    577
    578void __init reserve_standard_io_resources(void)
    579{
    580	int i;
    581
    582	/* request I/O space for devices used on all i[345]86 PCs */
    583	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
    584		request_resource(&ioport_resource, &standard_io_resources[i]);
    585
    586}
    587
    588static bool __init snb_gfx_workaround_needed(void)
    589{
    590#ifdef CONFIG_PCI
    591	int i;
    592	u16 vendor, devid;
    593	static const __initconst u16 snb_ids[] = {
    594		0x0102,
    595		0x0112,
    596		0x0122,
    597		0x0106,
    598		0x0116,
    599		0x0126,
    600		0x010a,
    601	};
    602
    603	/* Assume no if something weird is going on with PCI */
    604	if (!early_pci_allowed())
    605		return false;
    606
    607	vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID);
    608	if (vendor != 0x8086)
    609		return false;
    610
    611	devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
    612	for (i = 0; i < ARRAY_SIZE(snb_ids); i++)
    613		if (devid == snb_ids[i])
    614			return true;
    615#endif
    616
    617	return false;
    618}
    619
    620/*
    621 * Sandy Bridge graphics has trouble with certain ranges, exclude
    622 * them from allocation.
    623 */
    624static void __init trim_snb_memory(void)
    625{
    626	static const __initconst unsigned long bad_pages[] = {
    627		0x20050000,
    628		0x20110000,
    629		0x20130000,
    630		0x20138000,
    631		0x40004000,
    632	};
    633	int i;
    634
    635	if (!snb_gfx_workaround_needed())
    636		return;
    637
    638	printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
    639
    640	/*
    641	 * SandyBridge integrated graphics devices have a bug that prevents
    642	 * them from accessing certain memory ranges, namely anything below
    643	 * 1M and in the pages listed in bad_pages[] above.
    644	 *
     645	 * To avoid these pages ever being accessed by SNB gfx devices, reserve
     646	 * the bad_pages that have not already been reserved at boot time.
    647	 * All memory below the 1 MB mark is anyway reserved later during
    648	 * setup_arch(), so there is no need to reserve it here.
    649	 */
    650
    651	for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
    652		if (memblock_reserve(bad_pages[i], PAGE_SIZE))
    653			printk(KERN_WARNING "failed to reserve 0x%08lx\n",
    654			       bad_pages[i]);
    655	}
    656}
    657
    658static void __init trim_bios_range(void)
    659{
    660	/*
     661	 * A special case is the first 4Kb of memory:
     662	 * this is a BIOS-owned area, not kernel RAM, but it is generally
     663	 * not listed as such in the E820 table.
    664	 *
    665	 * This typically reserves additional memory (64KiB by default)
    666	 * since some BIOSes are known to corrupt low memory.  See the
    667	 * Kconfig help text for X86_RESERVE_LOW.
    668	 */
    669	e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
    670
    671	/*
    672	 * special case: Some BIOSes report the PC BIOS
    673	 * area (640Kb -> 1Mb) as RAM even though it is not.
    674	 * take them out.
    675	 */
    676	e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1);
    677
    678	e820__update_table(e820_table);
    679}
    680
     681/* called before trim_bios_range() to spare an extra sanitizing pass */
    682static void __init e820_add_kernel_range(void)
    683{
    684	u64 start = __pa_symbol(_text);
    685	u64 size = __pa_symbol(_end) - start;
    686
    687	/*
    688	 * Complain if .text .data and .bss are not marked as E820_TYPE_RAM and
    689	 * attempt to fix it by adding the range. We may have a confused BIOS,
    690	 * or the user may have used memmap=exactmap or memmap=xxM$yyM to
     691	 * exclude the kernel range. If we really are running on top of non-RAM,
     692	 * we will crash later anyway.
    693	 */
    694	if (e820__mapped_all(start, start + size, E820_TYPE_RAM))
    695		return;
    696
    697	pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n");
    698	e820__range_remove(start, size, E820_TYPE_RAM, 0);
    699	e820__range_add(start, size, E820_TYPE_RAM);
    700}
    701
    702static void __init early_reserve_memory(void)
    703{
    704	/*
    705	 * Reserve the memory occupied by the kernel between _text and
    706	 * __end_of_kernel_reserve symbols. Any kernel sections after the
    707	 * __end_of_kernel_reserve symbol must be explicitly reserved with a
    708	 * separate memblock_reserve() or they will be discarded.
    709	 */
    710	memblock_reserve(__pa_symbol(_text),
    711			 (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
    712
    713	/*
    714	 * The first 4Kb of memory is a BIOS owned area, but generally it is
    715	 * not listed as such in the E820 table.
    716	 *
    717	 * Reserve the first 64K of memory since some BIOSes are known to
    718	 * corrupt low memory. After the real mode trampoline is allocated the
    719	 * rest of the memory below 640k is reserved.
    720	 *
    721	 * In addition, make sure page 0 is always reserved because on
    722	 * systems with L1TF its contents can be leaked to user processes.
    723	 */
    724	memblock_reserve(0, SZ_64K);
    725
    726	early_reserve_initrd();
    727
    728	memblock_x86_reserve_range_setup_data();
    729
    730	reserve_ibft_region();
    731	reserve_bios_regions();
    732	trim_snb_memory();
    733}
    734
    735/*
    736 * Dump out kernel offset information on panic.
    737 */
    738static int
    739dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
    740{
    741	if (kaslr_enabled()) {
    742		pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
    743			 kaslr_offset(),
    744			 __START_KERNEL,
    745			 __START_KERNEL_map,
    746			 MODULES_VADDR-1);
    747	} else {
    748		pr_emerg("Kernel Offset: disabled\n");
    749	}
    750
    751	return 0;
    752}
    753
    754void x86_configure_nx(void)
    755{
    756	if (boot_cpu_has(X86_FEATURE_NX))
    757		__supported_pte_mask |= _PAGE_NX;
    758	else
    759		__supported_pte_mask &= ~_PAGE_NX;
    760}
    761
    762static void __init x86_report_nx(void)
    763{
    764	if (!boot_cpu_has(X86_FEATURE_NX)) {
    765		printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
    766		       "missing in CPU!\n");
    767	} else {
    768#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
    769		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
    770#else
    771		/* 32bit non-PAE kernel, NX cannot be used */
    772		printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
    773		       "cannot be enabled: non-PAE kernel!\n");
    774#endif
    775	}
    776}
    777
    778/*
    779 * Determine if we were loaded by an EFI loader.  If so, then we have also been
    780 * passed the efi memmap, systab, etc., so we should use these data structures
    781 * for initialization.  Note, the efi init code path is determined by the
    782 * global efi_enabled. This allows the same kernel image to be used on existing
    783 * systems (with a traditional BIOS) as well as on EFI systems.
    784 */
    785/*
    786 * setup_arch - architecture-specific boot-time initializations
    787 *
    788 * Note: On x86_64, fixmaps are ready for use even before this is called.
    789 */
    790
    791void __init setup_arch(char **cmdline_p)
    792{
    793#ifdef CONFIG_X86_32
    794	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
    795
    796	/*
    797	 * copy kernel address range established so far and switch
    798	 * to the proper swapper page table
    799	 */
    800	clone_pgd_range(swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
    801			initial_page_table + KERNEL_PGD_BOUNDARY,
    802			KERNEL_PGD_PTRS);
    803
    804	load_cr3(swapper_pg_dir);
    805	/*
    806	 * Note: Quark X1000 CPUs advertise PGE incorrectly and require
    807	 * a cr3 based tlb flush, so the following __flush_tlb_all()
    808	 * will not flush anything because the CPU quirk which clears
    809	 * X86_FEATURE_PGE has not been invoked yet. Though due to the
    810	 * load_cr3() above the TLB has been flushed already. The
    811	 * quirk is invoked before subsequent calls to __flush_tlb_all()
    812	 * so proper operation is guaranteed.
    813	 */
    814	__flush_tlb_all();
    815#else
    816	printk(KERN_INFO "Command line: %s\n", boot_command_line);
    817	boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
    818#endif
    819
    820	/*
    821	 * If we have OLPC OFW, we might end up relocating the fixmap due to
    822	 * reserve_top(), so do this before touching the ioremap area.
    823	 */
    824	olpc_ofw_detect();
    825
    826	idt_setup_early_traps();
    827	early_cpu_init();
    828	jump_label_init();
    829	static_call_init();
    830	early_ioremap_init();
    831
    832	setup_olpc_ofw_pgd();
    833
    834	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
    835	screen_info = boot_params.screen_info;
    836	edid_info = boot_params.edid_info;
    837#ifdef CONFIG_X86_32
    838	apm_info.bios = boot_params.apm_bios_info;
    839	ist_info = boot_params.ist_info;
    840#endif
    841	saved_video_mode = boot_params.hdr.vid_mode;
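        	/*
        	 * type_of_loader packs the loader ID in the high nibble and the
        	 * version in the low nibble; an ID nibble of 0xe means the real ID
        	 * is ext_loader_type + 0x10, and ext_loader_ver extends the version.
        	 */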
    842	bootloader_type = boot_params.hdr.type_of_loader;
    843	if ((bootloader_type >> 4) == 0xe) {
    844		bootloader_type &= 0xf;
    845		bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
    846	}
    847	bootloader_version  = bootloader_type & 0xf;
    848	bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
    849
    850#ifdef CONFIG_BLK_DEV_RAM
    851	rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
    852#endif
    853#ifdef CONFIG_EFI
    854	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
    855		     EFI32_LOADER_SIGNATURE, 4)) {
    856		set_bit(EFI_BOOT, &efi.flags);
    857	} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
    858		     EFI64_LOADER_SIGNATURE, 4)) {
    859		set_bit(EFI_BOOT, &efi.flags);
    860		set_bit(EFI_64BIT, &efi.flags);
    861	}
    862#endif
    863
    864	x86_init.oem.arch_setup();
    865
    866	/*
    867	 * Do some memory reservations *before* memory is added to memblock, so
    868	 * memblock allocations won't overwrite it.
    869	 *
    870	 * After this point, everything still needed from the boot loader or
    871	 * firmware or kernel text should be early reserved or marked not RAM in
    872	 * e820. All other memory is free game.
    873	 *
    874	 * This call needs to happen before e820__memory_setup() which calls the
    875	 * xen_memory_setup() on Xen dom0 which relies on the fact that those
    876	 * early reservations have happened already.
    877	 */
    878	early_reserve_memory();
    879
    880	iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
    881	e820__memory_setup();
    882	parse_setup_data();
    883
    884	copy_edd();
    885
    886	if (!boot_params.hdr.root_flags)
    887		root_mountflags &= ~MS_RDONLY;
    888	setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end);
    889
    890	code_resource.start = __pa_symbol(_text);
    891	code_resource.end = __pa_symbol(_etext)-1;
    892	rodata_resource.start = __pa_symbol(__start_rodata);
    893	rodata_resource.end = __pa_symbol(__end_rodata)-1;
    894	data_resource.start = __pa_symbol(_sdata);
    895	data_resource.end = __pa_symbol(_edata)-1;
    896	bss_resource.start = __pa_symbol(__bss_start);
    897	bss_resource.end = __pa_symbol(__bss_stop)-1;
    898
    899#ifdef CONFIG_CMDLINE_BOOL
    900#ifdef CONFIG_CMDLINE_OVERRIDE
    901	strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
    902#else
    903	if (builtin_cmdline[0]) {
    904		/* append boot loader cmdline to builtin */
    905		strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
    906		strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
    907		strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
    908	}
    909#endif
    910#endif
    911
    912	strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
    913	*cmdline_p = command_line;
    914
    915	/*
    916	 * x86_configure_nx() is called before parse_early_param() to detect
    917	 * whether hardware doesn't support NX (so that the early EHCI debug
    918	 * console setup can safely call set_fixmap()).
    919	 */
    920	x86_configure_nx();
    921
    922	parse_early_param();
    923
    924	if (efi_enabled(EFI_BOOT))
    925		efi_memblock_x86_reserve_range();
    926
    927#ifdef CONFIG_MEMORY_HOTPLUG
    928	/*
    929	 * Memory used by the kernel cannot be hot-removed because Linux
    930	 * cannot migrate the kernel pages. When memory hotplug is
    931	 * enabled, we should prevent memblock from allocating memory
    932	 * for the kernel.
    933	 *
    934	 * ACPI SRAT records all hotpluggable memory ranges. But before
    935	 * SRAT is parsed, we don't know about it.
    936	 *
    937	 * The kernel image is loaded into memory at very early time. We
    938	 * cannot prevent this anyway. So on NUMA system, we set any
    939	 * node the kernel resides in as un-hotpluggable.
    940	 *
    941	 * Since on modern servers, one node could have double-digit
    942	 * gigabytes memory, we can assume the memory around the kernel
    943	 * image is also un-hotpluggable. So before SRAT is parsed, just
    944	 * allocate memory near the kernel image to try the best to keep
    945	 * the kernel away from hotpluggable memory.
    946	 */
    947	if (movable_node_is_enabled())
    948		memblock_set_bottom_up(true);
    949#endif
    950
    951	x86_report_nx();
    952
    953	if (acpi_mps_check()) {
    954#ifdef CONFIG_X86_LOCAL_APIC
    955		disable_apic = 1;
    956#endif
    957		setup_clear_cpu_cap(X86_FEATURE_APIC);
    958	}
    959
    960	e820__reserve_setup_data();
    961	e820__finish_early_params();
    962
    963	if (efi_enabled(EFI_BOOT))
    964		efi_init();
    965
    966	dmi_setup();
    967
    968	/*
    969	 * VMware detection requires dmi to be available, so this
    970	 * needs to be done after dmi_setup(), for the boot CPU.
    971	 */
    972	init_hypervisor_platform();
    973
    974	tsc_early_init();
    975	x86_init.resources.probe_roms();
    976
    977	/* after parse_early_param, so could debug it */
    978	insert_resource(&iomem_resource, &code_resource);
    979	insert_resource(&iomem_resource, &rodata_resource);
    980	insert_resource(&iomem_resource, &data_resource);
    981	insert_resource(&iomem_resource, &bss_resource);
    982
    983	e820_add_kernel_range();
    984	trim_bios_range();
    985#ifdef CONFIG_X86_32
    986	if (ppro_with_ram_bug()) {
    987		e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM,
    988				  E820_TYPE_RESERVED);
    989		e820__update_table(e820_table);
    990		printk(KERN_INFO "fixed physical RAM map:\n");
    991		e820__print_table("bad_ppro");
    992	}
    993#else
    994	early_gart_iommu_check();
    995#endif
    996
    997	/*
    998	 * partially used pages are not usable - thus
    999	 * we are rounding upwards:
   1000	 */
   1001	max_pfn = e820__end_of_ram_pfn();
   1002
   1003	/* update e820 for memory not covered by WB MTRRs */
   1004	if (IS_ENABLED(CONFIG_MTRR))
   1005		mtrr_bp_init();
   1006	else
   1007		pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel.");
   1008
   1009	if (mtrr_trim_uncached_memory(max_pfn))
   1010		max_pfn = e820__end_of_ram_pfn();
   1011
   1012	max_possible_pfn = max_pfn;
   1013
   1014	/*
   1015	 * This call is required when the CPU does not support PAT. If
   1016	 * mtrr_bp_init() invoked it already via pat_init() the call has no
   1017	 * effect.
   1018	 */
   1019	init_cache_modes();
   1020
   1021	/*
   1022	 * Define random base addresses for memory sections after max_pfn is
   1023	 * defined and before each memory section base is used.
   1024	 */
   1025	kernel_randomize_memory();
   1026
   1027#ifdef CONFIG_X86_32
   1028	/* max_low_pfn get updated here */
   1029	find_low_pfn_range();
   1030#else
   1031	check_x2apic();
   1032
   1033	/* How many end-of-memory variables you have, grandma! */
   1034	/* need this before calling reserve_initrd */
   1035	if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
   1036		max_low_pfn = e820__end_of_low_ram_pfn();
   1037	else
   1038		max_low_pfn = max_pfn;
   1039
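        	/* First direct-map virtual address past the last byte of RAM. */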
   1040	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
   1041#endif
   1042
   1043	/*
   1044	 * Find and reserve possible boot-time SMP configuration:
   1045	 */
   1046	find_smp_config();
   1047
   1048	early_alloc_pgt_buf();
   1049
   1050	/*
   1051	 * Need to conclude brk, before e820__memblock_setup()
   1052	 * it could use memblock_find_in_range, could overlap with
   1053	 * brk area.
   1054	 */
   1055	reserve_brk();
   1056
   1057	cleanup_highmap();
   1058
   1059	memblock_set_current_limit(ISA_END_ADDRESS);
   1060	e820__memblock_setup();
   1061
   1062	/*
   1063	 * Needs to run after memblock setup because it needs the physical
   1064	 * memory size.
   1065	 */
   1066	sev_setup_arch();
   1067
   1068	efi_fake_memmap();
   1069	efi_find_mirror();
   1070	efi_esrt_init();
   1071	efi_mokvar_table_init();
   1072
   1073	/*
   1074	 * The EFI specification says that boot service code won't be
   1075	 * called after ExitBootServices(). This is, in fact, a lie.
   1076	 */
   1077	efi_reserve_boot_services();
   1078
   1079	/* preallocate 4k for mptable mpc */
   1080	e820__memblock_alloc_reserved_mpc_new();
   1081
   1082#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
   1083	setup_bios_corruption_check();
   1084#endif
   1085
   1086#ifdef CONFIG_X86_32
   1087	printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
   1088			(max_pfn_mapped<<PAGE_SHIFT) - 1);
   1089#endif
   1090
   1091	/*
   1092	 * Find free memory for the real mode trampoline and place it there. If
   1093	 * there is not enough free memory under 1M, on EFI-enabled systems
    1094	 * there will be an additional attempt to reclaim the memory for the real
   1095	 * mode trampoline at efi_free_boot_services().
   1096	 *
   1097	 * Unconditionally reserve the entire first 1M of RAM because BIOSes
    1098	 * are known to corrupt low memory and several hundred kilobytes are
    1099	 * not worth the complexity of detecting what memory gets clobbered.
    1100	 * Windows does the same thing for very similar reasons.
   1101	 *
   1102	 * Moreover, on machines with SandyBridge graphics or in setups that use
   1103	 * crashkernel the entire 1M is reserved anyway.
   1104	 */
   1105	reserve_real_mode();
   1106
   1107	init_mem_mapping();
   1108
   1109	idt_setup_early_pf();
   1110
   1111	/*
   1112	 * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
   1113	 * with the current CR4 value.  This may not be necessary, but
   1114	 * auditing all the early-boot CR4 manipulation would be needed to
   1115	 * rule it out.
   1116	 *
   1117	 * Mask off features that don't work outside long mode (just
   1118	 * PCIDE for now).
   1119	 */
   1120	mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE;
   1121
   1122	memblock_set_current_limit(get_max_mapped());
   1123
   1124	/*
   1125	 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
   1126	 */
   1127
   1128#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
   1129	if (init_ohci1394_dma_early)
   1130		init_ohci1394_dma_on_all_controllers();
   1131#endif
   1132	/* Allocate bigger log buffer */
   1133	setup_log_buf(1);
   1134
   1135	if (efi_enabled(EFI_BOOT)) {
   1136		switch (boot_params.secure_boot) {
   1137		case efi_secureboot_mode_disabled:
   1138			pr_info("Secure boot disabled\n");
   1139			break;
   1140		case efi_secureboot_mode_enabled:
   1141			pr_info("Secure boot enabled\n");
   1142			break;
   1143		default:
   1144			pr_info("Secure boot could not be determined\n");
   1145			break;
   1146		}
   1147	}
   1148
   1149	reserve_initrd();
   1150
   1151	acpi_table_upgrade();
   1152	/* Look for ACPI tables and reserve memory occupied by them. */
   1153	acpi_boot_table_init();
   1154
   1155	vsmp_init();
   1156
   1157	io_delay_init();
   1158
   1159	early_platform_quirks();
   1160
   1161	early_acpi_boot_init();
   1162
   1163	initmem_init();
   1164	dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
   1165
   1166	if (boot_cpu_has(X86_FEATURE_GBPAGES))
   1167		hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
   1168
   1169	/*
   1170	 * Reserve memory for crash kernel after SRAT is parsed so that it
   1171	 * won't consume hotpluggable memory.
   1172	 */
   1173	reserve_crashkernel();
   1174
   1175	memblock_find_dma_reserve();
   1176
   1177	if (!early_xdbc_setup_hardware())
   1178		early_xdbc_register_console();
   1179
   1180	x86_init.paging.pagetable_init();
   1181
   1182	kasan_init();
   1183
   1184	/*
   1185	 * Sync back kernel address range.
   1186	 *
   1187	 * FIXME: Can the later sync in setup_cpu_entry_areas() replace
   1188	 * this call?
   1189	 */
   1190	sync_initial_page_table();
   1191
   1192	tboot_probe();
   1193
   1194	map_vsyscall();
   1195
   1196	generic_apic_probe();
   1197
   1198	early_quirks();
   1199
   1200	/*
   1201	 * Read APIC and some other early information from ACPI tables.
   1202	 */
   1203	acpi_boot_init();
   1204	x86_dtb_init();
   1205
   1206	/*
   1207	 * get boot-time SMP configuration:
   1208	 */
   1209	get_smp_config();
   1210
   1211	/*
    1212	 * Systems w/o ACPI and mptables might not have the local APIC mapped
    1213	 * yet, but prefill_possible_map() might need to access it.
   1214	 */
   1215	init_apic_mappings();
   1216
   1217	prefill_possible_map();
   1218
   1219	init_cpu_to_node();
   1220	init_gi_nodes();
   1221
   1222	io_apic_init_mappings();
   1223
   1224	x86_init.hyper.guest_late_init();
   1225
   1226	e820__reserve_resources();
   1227	e820__register_nosave_regions(max_pfn);
   1228
   1229	x86_init.resources.reserve_resources();
   1230
   1231	e820__setup_pci_gap();
   1232
   1233#ifdef CONFIG_VT
   1234#if defined(CONFIG_VGA_CONSOLE)
   1235	if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
   1236		conswitchp = &vga_con;
   1237#endif
   1238#endif
   1239	x86_init.oem.banner();
   1240
   1241	x86_init.timers.wallclock_init();
   1242
   1243	/*
   1244	 * This needs to run before setup_local_APIC() which soft-disables the
   1245	 * local APIC temporarily and that masks the thermal LVT interrupt,
   1246	 * leading to softlockups on machines which have configured SMI
   1247	 * interrupt delivery.
   1248	 */
   1249	therm_lvt_init();
   1250
   1251	mcheck_init();
   1252
   1253	register_refined_jiffies(CLOCK_TICK_RATE);
   1254
   1255#ifdef CONFIG_EFI
   1256	if (efi_enabled(EFI_BOOT))
   1257		efi_apply_memmap_quirks();
   1258#endif
   1259
   1260	unwind_init();
   1261}
   1262
   1263#ifdef CONFIG_X86_32
   1264
   1265static struct resource video_ram_resource = {
   1266	.name	= "Video RAM area",
   1267	.start	= 0xa0000,
   1268	.end	= 0xbffff,
   1269	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
   1270};
   1271
   1272void __init i386_reserve_resources(void)
   1273{
   1274	request_resource(&iomem_resource, &video_ram_resource);
   1275	reserve_standard_io_resources();
   1276}
   1277
   1278#endif /* CONFIG_X86_32 */
   1279
   1280static struct notifier_block kernel_offset_notifier = {
   1281	.notifier_call = dump_kernel_offset
   1282};
   1283
   1284static int __init register_kernel_offset_dumper(void)
   1285{
   1286	atomic_notifier_chain_register(&panic_notifier_list,
   1287					&kernel_offset_notifier);
   1288	return 0;
   1289}
   1290__initcall(register_kernel_offset_dumper);