cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

core.c (69351B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Machine check handler.
      4 *
      5 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
      6 * Rest from unknown author(s).
      7 * 2004 Andi Kleen. Rewrote most of it.
      8 * Copyright 2008 Intel Corporation
      9 * Author: Andi Kleen
     10 */
     11
     12#include <linux/thread_info.h>
     13#include <linux/capability.h>
     14#include <linux/miscdevice.h>
     15#include <linux/ratelimit.h>
     16#include <linux/rcupdate.h>
     17#include <linux/kobject.h>
     18#include <linux/uaccess.h>
     19#include <linux/kdebug.h>
     20#include <linux/kernel.h>
     21#include <linux/percpu.h>
     22#include <linux/string.h>
     23#include <linux/device.h>
     24#include <linux/syscore_ops.h>
     25#include <linux/delay.h>
     26#include <linux/ctype.h>
     27#include <linux/sched.h>
     28#include <linux/sysfs.h>
     29#include <linux/types.h>
     30#include <linux/slab.h>
     31#include <linux/init.h>
     32#include <linux/kmod.h>
     33#include <linux/poll.h>
     34#include <linux/nmi.h>
     35#include <linux/cpu.h>
     36#include <linux/ras.h>
     37#include <linux/smp.h>
     38#include <linux/fs.h>
     39#include <linux/mm.h>
     40#include <linux/debugfs.h>
     41#include <linux/irq_work.h>
     42#include <linux/export.h>
     43#include <linux/set_memory.h>
     44#include <linux/sync_core.h>
     45#include <linux/task_work.h>
     46#include <linux/hardirq.h>
     47
     48#include <asm/intel-family.h>
     49#include <asm/processor.h>
     50#include <asm/traps.h>
     51#include <asm/tlbflush.h>
     52#include <asm/mce.h>
     53#include <asm/msr.h>
     54#include <asm/reboot.h>
     55
     56#include "internal.h"
     57
     58/* sysfs synchronization */
     59static DEFINE_MUTEX(mce_sysfs_mutex);
     60
     61#define CREATE_TRACE_POINTS
     62#include <trace/events/mce.h>
     63
     64#define SPINUNIT		100	/* 100ns */
     65
     66DEFINE_PER_CPU(unsigned, mce_exception_count);
     67
     68DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
     69
     70struct mce_bank {
     71	u64			ctl;			/* subevents to enable */
     72
     73	__u64 init			: 1,		/* initialise bank? */
     74	      __reserved_1		: 63;
     75};
     76static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
     77
     78#define ATTR_LEN               16
     79/* One object for each MCE bank, shared by all CPUs */
     80struct mce_bank_dev {
     81	struct device_attribute	attr;			/* device attribute */
     82	char			attrname[ATTR_LEN];	/* attribute name */
     83	u8			bank;			/* bank number */
     84};
     85static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
     86
     87struct mce_vendor_flags mce_flags __read_mostly;
     88
     89struct mca_config mca_cfg __read_mostly = {
     90	.bootlog  = -1,
     91	.monarch_timeout = -1
     92};
     93
     94static DEFINE_PER_CPU(struct mce, mces_seen);
     95static unsigned long mce_need_notify;
     96
     97/*
      98 * MCA banks polled by the periodic polling timer for corrected events.
     99 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
    100 */
    101DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
    102	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
    103};
    104
    105/*
    106 * MCA banks controlled through firmware first for corrected errors.
    107 * This is a global list of banks for which we won't enable CMCI and we
    108 * won't poll. Firmware controls these banks and is responsible for
    109 * reporting corrected errors through GHES. Uncorrected/recoverable
    110 * errors are still notified through a machine check.
    111 */
    112mce_banks_t mce_banks_ce_disabled;
    113
    114static struct work_struct mce_work;
    115static struct irq_work mce_irq_work;
    116
    117/*
    118 * CPU/chipset specific EDAC code can register a notifier call here to print
    119 * MCE errors in a human-readable form.
    120 */
    121BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
    122
     123/* Do basic initialization of a struct mce */
    124void mce_setup(struct mce *m)
    125{
    126	memset(m, 0, sizeof(struct mce));
    127	m->cpu = m->extcpu = smp_processor_id();
    128	/* need the internal __ version to avoid deadlocks */
    129	m->time = __ktime_get_real_seconds();
    130	m->cpuvendor = boot_cpu_data.x86_vendor;
    131	m->cpuid = cpuid_eax(1);
    132	m->socketid = cpu_data(m->extcpu).phys_proc_id;
    133	m->apicid = cpu_data(m->extcpu).initial_apicid;
    134	m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
    135	m->ppin = cpu_data(m->extcpu).ppin;
    136	m->microcode = boot_cpu_data.microcode;
    137}
    138
    139DEFINE_PER_CPU(struct mce, injectm);
    140EXPORT_PER_CPU_SYMBOL_GPL(injectm);
    141
    142void mce_log(struct mce *m)
    143{
    144	if (!mce_gen_pool_add(m))
    145		irq_work_queue(&mce_irq_work);
    146}
    147EXPORT_SYMBOL_GPL(mce_log);
    148
    149void mce_register_decode_chain(struct notifier_block *nb)
    150{
    151	if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
    152		    nb->priority > MCE_PRIO_HIGHEST))
    153		return;
    154
    155	blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
    156}
    157EXPORT_SYMBOL_GPL(mce_register_decode_chain);
    158
    159void mce_unregister_decode_chain(struct notifier_block *nb)
    160{
    161	blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
    162}
    163EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
    164
    165static void __print_mce(struct mce *m)
    166{
    167	pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
    168		 m->extcpu,
    169		 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
    170		 m->mcgstatus, m->bank, m->status);
    171
    172	if (m->ip) {
    173		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
    174			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
    175			m->cs, m->ip);
    176
    177		if (m->cs == __KERNEL_CS)
    178			pr_cont("{%pS}", (void *)(unsigned long)m->ip);
    179		pr_cont("\n");
    180	}
    181
    182	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
    183	if (m->addr)
    184		pr_cont("ADDR %llx ", m->addr);
    185	if (m->misc)
    186		pr_cont("MISC %llx ", m->misc);
    187	if (m->ppin)
    188		pr_cont("PPIN %llx ", m->ppin);
    189
    190	if (mce_flags.smca) {
    191		if (m->synd)
    192			pr_cont("SYND %llx ", m->synd);
    193		if (m->ipid)
    194			pr_cont("IPID %llx ", m->ipid);
    195	}
    196
    197	pr_cont("\n");
    198
    199	/*
    200	 * Note this output is parsed by external tools and old fields
    201	 * should not be changed.
    202	 */
    203	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
    204		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
    205		m->microcode);
    206}
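
        /*
         * For illustration: with the format strings above, a machine check
         * record might look roughly like this on the console (all values are
         * made up):
         *
         *   [Hardware Error]: CPU 1: Machine Check Exception: 5 Bank 4: b200000000070f0f
         *   [Hardware Error]: RIP 10:<ffffffff81234567> {native_write_msr+0x7/0x20}
         *   [Hardware Error]: TSC 1234abcd ADDR fe0000 MISC 8c
         *   [Hardware Error]: PROCESSOR 0:306c3 TIME 1600000000 SOCKET 0 APIC 2 microcode 28
         *
         * HW_ERR expands to the "[Hardware Error]: " prefix; " Exception" is
         * printed because MCG_STATUS_MCIP is set in the made-up mcgstatus (0x5).
         */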
    207
    208static void print_mce(struct mce *m)
    209{
    210	__print_mce(m);
    211
    212	if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
    213		pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
    214}
    215
    216#define PANIC_TIMEOUT 5 /* 5 seconds */
    217
    218static atomic_t mce_panicked;
    219
    220static int fake_panic;
    221static atomic_t mce_fake_panicked;
    222
    223/* Panic in progress. Enable interrupts and wait for final IPI */
    224static void wait_for_panic(void)
    225{
    226	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
    227
    228	preempt_disable();
    229	local_irq_enable();
    230	while (timeout-- > 0)
    231		udelay(1);
    232	if (panic_timeout == 0)
    233		panic_timeout = mca_cfg.panic_timeout;
     234	panic("Panicking machine check CPU died");
    235}
    236
    237static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
    238{
    239	struct llist_node *pending;
    240	struct mce_evt_llist *l;
    241	int apei_err = 0;
    242
    243	/*
    244	 * Allow instrumentation around external facilities usage. Not that it
    245	 * matters a whole lot since the machine is going to panic anyway.
    246	 */
    247	instrumentation_begin();
    248
    249	if (!fake_panic) {
    250		/*
    251		 * Make sure only one CPU runs in machine check panic
    252		 */
    253		if (atomic_inc_return(&mce_panicked) > 1)
    254			wait_for_panic();
    255		barrier();
    256
    257		bust_spinlocks(1);
    258		console_verbose();
    259	} else {
    260		/* Don't log too much for fake panic */
    261		if (atomic_inc_return(&mce_fake_panicked) > 1)
    262			goto out;
    263	}
    264	pending = mce_gen_pool_prepare_records();
    265	/* First print corrected ones that are still unlogged */
    266	llist_for_each_entry(l, pending, llnode) {
    267		struct mce *m = &l->mce;
    268		if (!(m->status & MCI_STATUS_UC)) {
    269			print_mce(m);
    270			if (!apei_err)
    271				apei_err = apei_write_mce(m);
    272		}
    273	}
    274	/* Now print uncorrected but with the final one last */
    275	llist_for_each_entry(l, pending, llnode) {
    276		struct mce *m = &l->mce;
    277		if (!(m->status & MCI_STATUS_UC))
    278			continue;
    279		if (!final || mce_cmp(m, final)) {
    280			print_mce(m);
    281			if (!apei_err)
    282				apei_err = apei_write_mce(m);
    283		}
    284	}
    285	if (final) {
    286		print_mce(final);
    287		if (!apei_err)
    288			apei_err = apei_write_mce(final);
    289	}
    290	if (exp)
    291		pr_emerg(HW_ERR "Machine check: %s\n", exp);
    292	if (!fake_panic) {
    293		if (panic_timeout == 0)
    294			panic_timeout = mca_cfg.panic_timeout;
    295		panic(msg);
    296	} else
    297		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
    298
    299out:
    300	instrumentation_end();
    301}
    302
    303/* Support code for software error injection */
    304
    305static int msr_to_offset(u32 msr)
    306{
    307	unsigned bank = __this_cpu_read(injectm.bank);
    308
    309	if (msr == mca_cfg.rip_msr)
    310		return offsetof(struct mce, ip);
    311	if (msr == mca_msr_reg(bank, MCA_STATUS))
    312		return offsetof(struct mce, status);
    313	if (msr == mca_msr_reg(bank, MCA_ADDR))
    314		return offsetof(struct mce, addr);
    315	if (msr == mca_msr_reg(bank, MCA_MISC))
    316		return offsetof(struct mce, misc);
    317	if (msr == MSR_IA32_MCG_STATUS)
    318		return offsetof(struct mce, mcgstatus);
    319	return -1;
    320}
    321
    322void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
    323{
    324	if (wrmsr) {
    325		pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
    326			 (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
    327			 regs->ip, (void *)regs->ip);
    328	} else {
    329		pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
    330			 (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
    331	}
    332
    333	show_stack_regs(regs);
    334
    335	panic("MCA architectural violation!\n");
    336
    337	while (true)
    338		cpu_relax();
    339}
    340
    341/* MSR access wrappers used for error injection */
    342noinstr u64 mce_rdmsrl(u32 msr)
    343{
    344	DECLARE_ARGS(val, low, high);
    345
    346	if (__this_cpu_read(injectm.finished)) {
    347		int offset;
    348		u64 ret;
    349
    350		instrumentation_begin();
    351
    352		offset = msr_to_offset(msr);
    353		if (offset < 0)
    354			ret = 0;
    355		else
    356			ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
    357
    358		instrumentation_end();
    359
    360		return ret;
    361	}
    362
    363	/*
    364	 * RDMSR on MCA MSRs should not fault. If they do, this is very much an
    365	 * architectural violation and needs to be reported to hw vendor. Panic
    366	 * the box to not allow any further progress.
    367	 */
    368	asm volatile("1: rdmsr\n"
    369		     "2:\n"
    370		     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE)
    371		     : EAX_EDX_RET(val, low, high) : "c" (msr));
    372
    373
    374	return EAX_EDX_VAL(val, low, high);
    375}
    376
    377static noinstr void mce_wrmsrl(u32 msr, u64 v)
    378{
    379	u32 low, high;
    380
    381	if (__this_cpu_read(injectm.finished)) {
    382		int offset;
    383
    384		instrumentation_begin();
    385
    386		offset = msr_to_offset(msr);
    387		if (offset >= 0)
    388			*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
    389
    390		instrumentation_end();
    391
    392		return;
    393	}
    394
    395	low  = (u32)v;
    396	high = (u32)(v >> 32);
    397
    398	/* See comment in mce_rdmsrl() */
    399	asm volatile("1: wrmsr\n"
    400		     "2:\n"
    401		     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
    402		     : : "c" (msr), "a"(low), "d" (high) : "memory");
    403}
    404
    405/*
    406 * Collect all global (w.r.t. this processor) status about this machine
    407 * check into our "mce" struct so that we can use it later to assess
    408 * the severity of the problem as we read per-bank specific details.
    409 */
    410static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
    411{
    412	/*
    413	 * Enable instrumentation around mce_setup() which calls external
    414	 * facilities.
    415	 */
    416	instrumentation_begin();
    417	mce_setup(m);
    418	instrumentation_end();
    419
    420	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
    421	if (regs) {
    422		/*
    423		 * Get the address of the instruction at the time of
    424		 * the machine check error.
    425		 */
    426		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
    427			m->ip = regs->ip;
    428			m->cs = regs->cs;
    429
    430			/*
    431			 * When in VM86 mode make the cs look like ring 3
    432			 * always. This is a lie, but it's better than passing
    433			 * the additional vm86 bit around everywhere.
    434			 */
    435			if (v8086_mode(regs))
    436				m->cs |= 3;
    437		}
    438		/* Use accurate RIP reporting if available. */
    439		if (mca_cfg.rip_msr)
    440			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
    441	}
    442}
    443
    444int mce_available(struct cpuinfo_x86 *c)
    445{
    446	if (mca_cfg.disabled)
    447		return 0;
    448	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
    449}
    450
    451static void mce_schedule_work(void)
    452{
    453	if (!mce_gen_pool_empty())
    454		schedule_work(&mce_work);
    455}
    456
    457static void mce_irq_work_cb(struct irq_work *entry)
    458{
    459	mce_schedule_work();
    460}
    461
    462/*
    463 * Check if the address reported by the CPU is in a format we can parse.
    464 * It would be possible to add code for most other cases, but all would
    465 * be somewhat complicated (e.g. segment offset would require an instruction
    466 * parser). So only support physical addresses up to page granularity for now.
    467 */
    468int mce_usable_address(struct mce *m)
    469{
    470	if (!(m->status & MCI_STATUS_ADDRV))
    471		return 0;
    472
    473	/* Checks after this one are Intel/Zhaoxin-specific: */
    474	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
    475	    boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
    476		return 1;
    477
    478	if (!(m->status & MCI_STATUS_MISCV))
    479		return 0;
    480
    481	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
    482		return 0;
    483
    484	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
    485		return 0;
    486
    487	return 1;
    488}
    489EXPORT_SYMBOL_GPL(mce_usable_address);
    490
    491bool mce_is_memory_error(struct mce *m)
    492{
    493	switch (m->cpuvendor) {
    494	case X86_VENDOR_AMD:
    495	case X86_VENDOR_HYGON:
    496		return amd_mce_is_memory_error(m);
    497
    498	case X86_VENDOR_INTEL:
    499	case X86_VENDOR_ZHAOXIN:
    500		/*
    501		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
    502		 *
    503		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
    504		 * indicating a memory error. Bit 8 is used for indicating a
    505		 * cache hierarchy error. The combination of bit 2 and bit 3
     506 * is used for indicating a `generic' cache hierarchy error.
     507 * But we can't just blindly check the above bits, because if
     508 * bit 11 is set, then it is a bus/interconnect error - and
     509 * either way the above bits just give more detail on what
    510		 * bus/interconnect error happened. Note that bit 12 can be
    511		 * ignored, as it's the "filter" bit.
    512		 */
    513		return (m->status & 0xef80) == BIT(7) ||
    514		       (m->status & 0xef00) == BIT(8) ||
    515		       (m->status & 0xeffc) == 0xc;
    516
    517	default:
    518		return false;
    519	}
    520}
    521EXPORT_SYMBOL_GPL(mce_is_memory_error);
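
        /*
         * An illustrative reading of the masks above, following the SDM
         * 15.9.2 compound error code forms, with the filter bit (bit 12) and
         * the low don't-care bits masked out:
         *
         *   (status & 0xef80) == BIT(7)   memory error:     0000 0000 1MMM CCCC
         *   (status & 0xef00) == BIT(8)   cache hierarchy:  0000 0001 RRRR TTLL
         *   (status & 0xeffc) == 0xc      generic cache:    0000 0000 0000 11LL
         *
         * For example, a hypothetical status with MCACOD 0x0090 (memory read,
         * channel 0) matches the first test, while a bus/interconnect error
         * (bit 11 set) matches none of them.
         */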
    522
    523static bool whole_page(struct mce *m)
    524{
    525	if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
    526		return true;
    527
    528	return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
    529}
    530
    531bool mce_is_correctable(struct mce *m)
    532{
    533	if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
    534		return false;
    535
    536	if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
    537		return false;
    538
    539	if (m->status & MCI_STATUS_UC)
    540		return false;
    541
    542	return true;
    543}
    544EXPORT_SYMBOL_GPL(mce_is_correctable);
    545
    546static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
    547			      void *data)
    548{
    549	struct mce *m = (struct mce *)data;
    550
    551	if (!m)
    552		return NOTIFY_DONE;
    553
    554	/* Emit the trace record: */
    555	trace_mce_record(m);
    556
    557	set_bit(0, &mce_need_notify);
    558
    559	mce_notify_irq();
    560
    561	return NOTIFY_DONE;
    562}
    563
    564static struct notifier_block early_nb = {
    565	.notifier_call	= mce_early_notifier,
    566	.priority	= MCE_PRIO_EARLY,
    567};
    568
    569static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
    570			      void *data)
    571{
    572	struct mce *mce = (struct mce *)data;
    573	unsigned long pfn;
    574
    575	if (!mce || !mce_usable_address(mce))
    576		return NOTIFY_DONE;
    577
    578	if (mce->severity != MCE_AO_SEVERITY &&
    579	    mce->severity != MCE_DEFERRED_SEVERITY)
    580		return NOTIFY_DONE;
    581
    582	pfn = mce->addr >> PAGE_SHIFT;
    583	if (!memory_failure(pfn, 0)) {
    584		set_mce_nospec(pfn);
    585		mce->kflags |= MCE_HANDLED_UC;
    586	}
    587
    588	return NOTIFY_OK;
    589}
    590
    591static struct notifier_block mce_uc_nb = {
    592	.notifier_call	= uc_decode_notifier,
    593	.priority	= MCE_PRIO_UC,
    594};
    595
    596static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
    597				void *data)
    598{
    599	struct mce *m = (struct mce *)data;
    600
    601	if (!m)
    602		return NOTIFY_DONE;
    603
    604	if (mca_cfg.print_all || !m->kflags)
    605		__print_mce(m);
    606
    607	return NOTIFY_DONE;
    608}
    609
    610static struct notifier_block mce_default_nb = {
    611	.notifier_call	= mce_default_notifier,
    612	/* lowest prio, we want it to run last. */
    613	.priority	= MCE_PRIO_LOWEST,
    614};
    615
    616/*
    617 * Read ADDR and MISC registers.
    618 */
    619static noinstr void mce_read_aux(struct mce *m, int i)
    620{
    621	if (m->status & MCI_STATUS_MISCV)
    622		m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
    623
    624	if (m->status & MCI_STATUS_ADDRV) {
    625		m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR));
    626
    627		/*
    628		 * Mask the reported address by the reported granularity.
    629		 */
    630		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
    631			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
    632			m->addr >>= shift;
    633			m->addr <<= shift;
    634		}
    635
    636		/*
    637		 * Extract [55:<lsb>] where lsb is the least significant
    638		 * *valid* bit of the address bits.
    639		 */
    640		if (mce_flags.smca) {
    641			u8 lsb = (m->addr >> 56) & 0x3f;
    642
    643			m->addr &= GENMASK_ULL(55, lsb);
    644		}
    645	}
    646
    647	if (mce_flags.smca) {
    648		m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
    649
    650		if (m->status & MCI_STATUS_SYNDV)
    651			m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
    652	}
    653}
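
        /*
         * A worked example of the SMCA masking above, with made-up values: a
         * raw MCA_ADDR of 0x0c00000012345678 encodes
         * lsb = (0x0c00000012345678 >> 56) & 0x3f = 12, so
         * GENMASK_ULL(55, 12) keeps bits [55:12] and the stored address
         * becomes 0x0000000012345000, i.e. a 4 KiB-granular physical address
         * with the LSB-encoding bits in [63:56] stripped off.
         */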
    654
    655DEFINE_PER_CPU(unsigned, mce_poll_count);
    656
    657/*
    658 * Poll for corrected events or events that happened before reset.
    659 * Those are just logged through /dev/mcelog.
    660 *
    661 * This is executed in standard interrupt context.
    662 *
     663 * Note: the spec recommends panicking for fatal unsignalled
     664 * errors here. However, this would be quite problematic --
     665 * we would need to reimplement the Monarch handling and
     666 * it would mess up the exclusion between the exception handler
     667 * and the poll handler -- so we skip this for now.
     668 * These cases should not happen anyway, or only when the CPU
     669 * is already totally confused. In this case it's likely it will
     670 * not fully execute the machine check handler either.
    671 */
    672bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
    673{
    674	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
    675	bool error_seen = false;
    676	struct mce m;
    677	int i;
    678
    679	this_cpu_inc(mce_poll_count);
    680
    681	mce_gather_info(&m, NULL);
    682
    683	if (flags & MCP_TIMESTAMP)
    684		m.tsc = rdtsc();
    685
    686	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
    687		if (!mce_banks[i].ctl || !test_bit(i, *b))
    688			continue;
    689
    690		m.misc = 0;
    691		m.addr = 0;
    692		m.bank = i;
    693
    694		barrier();
    695		m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
    696
    697		/* If this entry is not valid, ignore it */
    698		if (!(m.status & MCI_STATUS_VAL))
    699			continue;
    700
    701		/*
    702		 * If we are logging everything (at CPU online) or this
    703		 * is a corrected error, then we must log it.
    704		 */
    705		if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
    706			goto log_it;
    707
    708		/*
    709		 * Newer Intel systems that support software error
    710		 * recovery need to make additional checks. Other
    711		 * CPUs should skip over uncorrected errors, but log
    712		 * everything else.
    713		 */
    714		if (!mca_cfg.ser) {
    715			if (m.status & MCI_STATUS_UC)
    716				continue;
    717			goto log_it;
    718		}
    719
    720		/* Log "not enabled" (speculative) errors */
    721		if (!(m.status & MCI_STATUS_EN))
    722			goto log_it;
    723
    724		/*
    725		 * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
    726		 * UC == 1 && PCC == 0 && S == 0
    727		 */
    728		if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
    729			goto log_it;
    730
    731		/*
    732		 * Skip anything else. Presumption is that our read of this
    733		 * bank is racing with a machine check. Leave the log alone
    734		 * for do_machine_check() to deal with it.
    735		 */
    736		continue;
    737
    738log_it:
    739		error_seen = true;
    740
    741		if (flags & MCP_DONTLOG)
    742			goto clear_it;
    743
    744		mce_read_aux(&m, i);
    745		m.severity = mce_severity(&m, NULL, NULL, false);
    746		/*
    747		 * Don't get the IP here because it's unlikely to
    748		 * have anything to do with the actual error location.
    749		 */
    750
    751		if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
    752			goto clear_it;
    753
    754		if (flags & MCP_QUEUE_LOG)
    755			mce_gen_pool_add(&m);
    756		else
    757			mce_log(&m);
    758
    759clear_it:
    760		/*
    761		 * Clear state for this bank.
    762		 */
    763		mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
    764	}
    765
    766	/*
    767	 * Don't clear MCG_STATUS here because it's only defined for
    768	 * exceptions.
    769	 */
    770
    771	sync_core();
    772
    773	return error_seen;
    774}
    775EXPORT_SYMBOL_GPL(machine_check_poll);
    776
    777/*
    778 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
    779 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
    780 * Vol 3B Table 15-20). But this confuses both the code that determines
    781 * whether the machine check occurred in kernel or user mode, and also
    782 * the severity assessment code. Pretend that EIPV was set, and take the
    783 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
    784 */
    785static __always_inline void
    786quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
    787{
    788	if (bank != 0)
    789		return;
    790	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
    791		return;
    792	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
    793		          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
    794			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
    795			  MCACOD)) !=
    796			 (MCI_STATUS_UC|MCI_STATUS_EN|
    797			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
    798			  MCI_STATUS_AR|MCACOD_INSTR))
    799		return;
    800
    801	m->mcgstatus |= MCG_STATUS_EIPV;
    802	m->ip = regs->ip;
    803	m->cs = regs->cs;
    804}
    805
    806/*
    807 * Disable fast string copy and return from the MCE handler upon the first SRAR
    808 * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
    809 * CPUs.
    810 * The fast string copy instructions ("REP; MOVS*") could consume an
    811 * uncorrectable memory error in the cache line _right after_ the desired region
    812 * to copy and raise an MCE with RIP pointing to the instruction _after_ the
    813 * "REP; MOVS*".
    814 * This mitigation addresses the issue completely with the caveat of performance
    815 * degradation on the CPU affected. This is still better than the OS crashing on
    816 * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
    817 * kernel context (e.g., copy_page).
    818 *
    819 * Returns true when fast string copy on CPU has been disabled.
    820 */
    821static noinstr bool quirk_skylake_repmov(void)
    822{
    823	u64 mcgstatus   = mce_rdmsrl(MSR_IA32_MCG_STATUS);
    824	u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);
    825	u64 mc1_status;
    826
    827	/*
    828	 * Apply the quirk only to local machine checks, i.e., no broadcast
    829	 * sync is needed.
    830	 */
    831	if (!(mcgstatus & MCG_STATUS_LMCES) ||
    832	    !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
    833		return false;
    834
    835	mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
    836
    837	/* Check for a software-recoverable data fetch error. */
    838	if ((mc1_status &
    839	     (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
    840	      MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
    841	      MCI_STATUS_AR | MCI_STATUS_S)) ==
    842	     (MCI_STATUS_VAL |                   MCI_STATUS_UC | MCI_STATUS_EN |
    843	      MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
    844	      MCI_STATUS_AR | MCI_STATUS_S)) {
    845		misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
    846		mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
    847		mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
    848
    849		instrumentation_begin();
    850		pr_err_once("Erratum detected, disable fast string copy instructions.\n");
    851		instrumentation_end();
    852
    853		return true;
    854	}
    855
    856	return false;
    857}
    858
    859/*
    860 * Do a quick check if any of the events requires a panic.
    861 * This decides if we keep the events around or clear them.
    862 */
    863static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
    864					  struct pt_regs *regs)
    865{
    866	char *tmp = *msg;
    867	int i;
    868
    869	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
    870		m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
    871		if (!(m->status & MCI_STATUS_VAL))
    872			continue;
    873
    874		arch___set_bit(i, validp);
    875		if (mce_flags.snb_ifu_quirk)
    876			quirk_sandybridge_ifu(i, m, regs);
    877
    878		m->bank = i;
    879		if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
    880			mce_read_aux(m, i);
    881			*msg = tmp;
    882			return 1;
    883		}
    884	}
    885	return 0;
    886}
    887
    888/*
    889 * Variable to establish order between CPUs while scanning.
     890 * Each CPU spins initially until mce_executing equals its number.
    891 */
    892static atomic_t mce_executing;
    893
    894/*
    895 * Defines order of CPUs on entry. First CPU becomes Monarch.
    896 */
    897static atomic_t mce_callin;
    898
    899/*
     900 * Track which CPUs entered the MCA broadcast synchronization and which did
     901 * not, in order to print holdouts.
    902 */
    903static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
    904
    905/*
    906 * Check if a timeout waiting for other CPUs happened.
    907 */
    908static noinstr int mce_timed_out(u64 *t, const char *msg)
    909{
    910	int ret = 0;
    911
    912	/* Enable instrumentation around calls to external facilities */
    913	instrumentation_begin();
    914
    915	/*
    916	 * The others already did panic for some reason.
    917	 * Bail out like in a timeout.
    918	 * rmb() to tell the compiler that system_state
    919	 * might have been modified by someone else.
    920	 */
    921	rmb();
    922	if (atomic_read(&mce_panicked))
    923		wait_for_panic();
    924	if (!mca_cfg.monarch_timeout)
    925		goto out;
    926	if ((s64)*t < SPINUNIT) {
    927		if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
    928			pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
    929				 cpumask_pr_args(&mce_missing_cpus));
    930		mce_panic(msg, NULL, NULL);
    931
    932		ret = 1;
    933		goto out;
    934	}
    935	*t -= SPINUNIT;
    936
    937out:
    938	touch_nmi_watchdog();
    939
    940	instrumentation_end();
    941
    942	return ret;
    943}
    944
    945/*
     946 * The Monarch's reign.  The Monarch is the CPU that entered
     947 * the machine check handler first. It waits for the others to
     948 * raise the exception too and then grades them. If any
     949 * error is fatal, it panics. Only then does it let the others continue.
    950 *
    951 * The other CPUs entering the MCE handler will be controlled by the
    952 * Monarch. They are called Subjects.
    953 *
     954 * This way we prevent any potential data corruption in an unrecoverable case
     955 * and also make sure that all CPUs' errors are always examined.
    956 *
     957 * This also detects the case of a machine check event coming from outer
     958 * space (not detected by any CPU). In this case some external agent wants
     959 * us to shut down, so panic too.
    960 *
    961 * The other CPUs might still decide to panic if the handler happens
     962 * in an unrecoverable place, but in this case the system is in a semi-stable
    963 * state and won't corrupt anything by itself. It's ok to let the others
    964 * continue for a bit first.
    965 *
    966 * All the spin loops have timeouts; when a timeout happens a CPU
    967 * typically elects itself to be Monarch.
    968 */
    969static void mce_reign(void)
    970{
    971	int cpu;
    972	struct mce *m = NULL;
    973	int global_worst = 0;
    974	char *msg = NULL;
    975
    976	/*
    977	 * This CPU is the Monarch and the other CPUs have run
    978	 * through their handlers.
    979	 * Grade the severity of the errors of all the CPUs.
    980	 */
    981	for_each_possible_cpu(cpu) {
    982		struct mce *mtmp = &per_cpu(mces_seen, cpu);
    983
    984		if (mtmp->severity > global_worst) {
    985			global_worst = mtmp->severity;
    986			m = &per_cpu(mces_seen, cpu);
    987		}
    988	}
    989
    990	/*
    991	 * Cannot recover? Panic here then.
    992	 * This dumps all the mces in the log buffer and stops the
    993	 * other CPUs.
    994	 */
    995	if (m && global_worst >= MCE_PANIC_SEVERITY) {
    996		/* call mce_severity() to get "msg" for panic */
    997		mce_severity(m, NULL, &msg, true);
    998		mce_panic("Fatal machine check", m, msg);
    999	}
   1000
   1001	/*
    1002	 * For a UC error somewhere, we let the CPU that detects it handle it.
    1003	 * We also must let the others continue, otherwise the handling
   1004	 * CPU could deadlock on a lock.
   1005	 */
   1006
   1007	/*
   1008	 * No machine check event found. Must be some external
   1009	 * source or one CPU is hung. Panic.
   1010	 */
   1011	if (global_worst <= MCE_KEEP_SEVERITY)
   1012		mce_panic("Fatal machine check from unknown source", NULL, NULL);
   1013
   1014	/*
   1015	 * Now clear all the mces_seen so that they don't reappear on
   1016	 * the next mce.
   1017	 */
   1018	for_each_possible_cpu(cpu)
   1019		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
   1020}
   1021
   1022static atomic_t global_nwo;
   1023
   1024/*
   1025 * Start of Monarch synchronization. This waits until all CPUs have
   1026 * entered the exception handler and then determines if any of them
   1027 * saw a fatal event that requires panic. Then it executes them
   1028 * in the entry order.
   1029 * TBD double check parallel CPU hotunplug
   1030 */
   1031static noinstr int mce_start(int *no_way_out)
   1032{
   1033	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
   1034	int order, ret = -1;
   1035
   1036	if (!timeout)
   1037		return ret;
   1038
   1039	arch_atomic_add(*no_way_out, &global_nwo);
   1040	/*
   1041	 * Rely on the implied barrier below, such that global_nwo
   1042	 * is updated before mce_callin.
   1043	 */
   1044	order = arch_atomic_inc_return(&mce_callin);
   1045	arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
   1046
   1047	/* Enable instrumentation around calls to external facilities */
   1048	instrumentation_begin();
   1049
   1050	/*
   1051	 * Wait for everyone.
   1052	 */
   1053	while (arch_atomic_read(&mce_callin) != num_online_cpus()) {
   1054		if (mce_timed_out(&timeout,
   1055				  "Timeout: Not all CPUs entered broadcast exception handler")) {
   1056			arch_atomic_set(&global_nwo, 0);
   1057			goto out;
   1058		}
   1059		ndelay(SPINUNIT);
   1060	}
   1061
   1062	/*
   1063	 * mce_callin should be read before global_nwo
   1064	 */
   1065	smp_rmb();
   1066
   1067	if (order == 1) {
   1068		/*
   1069		 * Monarch: Starts executing now, the others wait.
   1070		 */
   1071		arch_atomic_set(&mce_executing, 1);
   1072	} else {
   1073		/*
   1074		 * Subject: Now start the scanning loop one by one in
   1075		 * the original callin order.
    1076		 * This way, when there are any shared banks, they will be
    1077		 * seen by only one CPU before being cleared, avoiding duplicates.
   1078		 */
   1079		while (arch_atomic_read(&mce_executing) < order) {
   1080			if (mce_timed_out(&timeout,
   1081					  "Timeout: Subject CPUs unable to finish machine check processing")) {
   1082				arch_atomic_set(&global_nwo, 0);
   1083				goto out;
   1084			}
   1085			ndelay(SPINUNIT);
   1086		}
   1087	}
   1088
   1089	/*
   1090	 * Cache the global no_way_out state.
   1091	 */
   1092	*no_way_out = arch_atomic_read(&global_nwo);
   1093
   1094	ret = order;
   1095
   1096out:
   1097	instrumentation_end();
   1098
   1099	return ret;
   1100}
   1101
   1102/*
   1103 * Synchronize between CPUs after main scanning loop.
   1104 * This invokes the bulk of the Monarch processing.
   1105 */
   1106static noinstr int mce_end(int order)
   1107{
   1108	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
   1109	int ret = -1;
   1110
   1111	/* Allow instrumentation around external facilities. */
   1112	instrumentation_begin();
   1113
   1114	if (!timeout)
   1115		goto reset;
   1116	if (order < 0)
   1117		goto reset;
   1118
   1119	/*
   1120	 * Allow others to run.
   1121	 */
   1122	atomic_inc(&mce_executing);
   1123
   1124	if (order == 1) {
   1125		/*
   1126		 * Monarch: Wait for everyone to go through their scanning
   1127		 * loops.
   1128		 */
   1129		while (atomic_read(&mce_executing) <= num_online_cpus()) {
   1130			if (mce_timed_out(&timeout,
   1131					  "Timeout: Monarch CPU unable to finish machine check processing"))
   1132				goto reset;
   1133			ndelay(SPINUNIT);
   1134		}
   1135
   1136		mce_reign();
   1137		barrier();
   1138		ret = 0;
   1139	} else {
   1140		/*
   1141		 * Subject: Wait for Monarch to finish.
   1142		 */
   1143		while (atomic_read(&mce_executing) != 0) {
   1144			if (mce_timed_out(&timeout,
   1145					  "Timeout: Monarch CPU did not finish machine check processing"))
   1146				goto reset;
   1147			ndelay(SPINUNIT);
   1148		}
   1149
   1150		/*
   1151		 * Don't reset anything. That's done by the Monarch.
   1152		 */
   1153		ret = 0;
   1154		goto out;
   1155	}
   1156
   1157	/*
   1158	 * Reset all global state.
   1159	 */
   1160reset:
   1161	atomic_set(&global_nwo, 0);
   1162	atomic_set(&mce_callin, 0);
   1163	cpumask_setall(&mce_missing_cpus);
   1164	barrier();
   1165
   1166	/*
   1167	 * Let others run again.
   1168	 */
   1169	atomic_set(&mce_executing, 0);
   1170
   1171out:
   1172	instrumentation_end();
   1173
   1174	return ret;
   1175}
   1176
   1177static __always_inline void mce_clear_state(unsigned long *toclear)
   1178{
   1179	int i;
   1180
   1181	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
   1182		if (arch_test_bit(i, toclear))
   1183			mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
   1184	}
   1185}
   1186
   1187/*
   1188 * Cases where we avoid rendezvous handler timeout:
   1189 * 1) If this CPU is offline.
   1190 *
   1191 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
   1192 *  skip those CPUs which remain looping in the 1st kernel - see
   1193 *  crash_nmi_callback().
   1194 *
   1195 * Note: there still is a small window between kexec-ing and the new,
   1196 * kdump kernel establishing a new #MC handler where a broadcasted MCE
   1197 * might not get handled properly.
   1198 */
   1199static noinstr bool mce_check_crashing_cpu(void)
   1200{
   1201	unsigned int cpu = smp_processor_id();
   1202
   1203	if (arch_cpu_is_offline(cpu) ||
   1204	    (crashing_cpu != -1 && crashing_cpu != cpu)) {
   1205		u64 mcgstatus;
   1206
   1207		mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS);
   1208
   1209		if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
   1210			if (mcgstatus & MCG_STATUS_LMCES)
   1211				return false;
   1212		}
   1213
   1214		if (mcgstatus & MCG_STATUS_RIPV) {
   1215			__wrmsr(MSR_IA32_MCG_STATUS, 0, 0);
   1216			return true;
   1217		}
   1218	}
   1219	return false;
   1220}
   1221
   1222static __always_inline int
   1223__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
   1224		unsigned long *toclear, unsigned long *valid_banks, int no_way_out,
   1225		int *worst)
   1226{
   1227	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
   1228	struct mca_config *cfg = &mca_cfg;
   1229	int severity, i, taint = 0;
   1230
   1231	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
   1232		arch___clear_bit(i, toclear);
   1233		if (!arch_test_bit(i, valid_banks))
   1234			continue;
   1235
   1236		if (!mce_banks[i].ctl)
   1237			continue;
   1238
   1239		m->misc = 0;
   1240		m->addr = 0;
   1241		m->bank = i;
   1242
   1243		m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
   1244		if (!(m->status & MCI_STATUS_VAL))
   1245			continue;
   1246
   1247		/*
   1248		 * Corrected or non-signaled errors are handled by
   1249		 * machine_check_poll(). Leave them alone, unless this panics.
   1250		 */
   1251		if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
   1252			!no_way_out)
   1253			continue;
   1254
   1255		/* Set taint even when machine check was not enabled. */
   1256		taint++;
   1257
   1258		severity = mce_severity(m, regs, NULL, true);
   1259
   1260		/*
    1261		 * When the machine check was meant for the corrected/deferred
    1262		 * handler, don't touch it, unless we're panicking.
   1263		 */
   1264		if ((severity == MCE_KEEP_SEVERITY ||
   1265		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
   1266			continue;
   1267
   1268		arch___set_bit(i, toclear);
   1269
   1270		/* Machine check event was not enabled. Clear, but ignore. */
   1271		if (severity == MCE_NO_SEVERITY)
   1272			continue;
   1273
   1274		mce_read_aux(m, i);
   1275
   1276		/* assuming valid severity level != 0 */
   1277		m->severity = severity;
   1278
   1279		/*
   1280		 * Enable instrumentation around the mce_log() call which is
   1281		 * done in #MC context, where instrumentation is disabled.
   1282		 */
   1283		instrumentation_begin();
   1284		mce_log(m);
   1285		instrumentation_end();
   1286
   1287		if (severity > *worst) {
   1288			*final = *m;
   1289			*worst = severity;
   1290		}
   1291	}
   1292
   1293	/* mce_clear_state will clear *final, save locally for use later */
   1294	*m = *final;
   1295
   1296	return taint;
   1297}
   1298
   1299static void kill_me_now(struct callback_head *ch)
   1300{
   1301	struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
   1302
   1303	p->mce_count = 0;
   1304	force_sig(SIGBUS);
   1305}
   1306
   1307static void kill_me_maybe(struct callback_head *cb)
   1308{
   1309	struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
   1310	int flags = MF_ACTION_REQUIRED;
   1311	int ret;
   1312
   1313	p->mce_count = 0;
   1314	pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
   1315
   1316	if (!p->mce_ripv)
   1317		flags |= MF_MUST_KILL;
   1318
   1319	ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
   1320	if (!ret) {
   1321		set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
   1322		sync_core();
   1323		return;
   1324	}
   1325
   1326	/*
   1327	 * -EHWPOISON from memory_failure() means that it already sent SIGBUS
   1328	 * to the current process with the proper error info,
   1329	 * -EOPNOTSUPP means hwpoison_filter() filtered the error event,
   1330	 *
   1331	 * In both cases, no further processing is required.
   1332	 */
   1333	if (ret == -EHWPOISON || ret == -EOPNOTSUPP)
   1334		return;
   1335
   1336	pr_err("Memory error not recovered");
   1337	kill_me_now(cb);
   1338}
   1339
   1340static void kill_me_never(struct callback_head *cb)
   1341{
   1342	struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
   1343
   1344	p->mce_count = 0;
   1345	pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
   1346	if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
   1347		set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
   1348}
   1349
   1350static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
   1351{
   1352	int count = ++current->mce_count;
   1353
   1354	/* First call, save all the details */
   1355	if (count == 1) {
   1356		current->mce_addr = m->addr;
   1357		current->mce_kflags = m->kflags;
   1358		current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
   1359		current->mce_whole_page = whole_page(m);
   1360		current->mce_kill_me.func = func;
   1361	}
   1362
   1363	/* Ten is likely overkill. Don't expect more than two faults before task_work() */
   1364	if (count > 10)
   1365		mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
   1366
   1367	/* Second or later call, make sure page address matches the one from first call */
   1368	if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
   1369		mce_panic("Consecutive machine checks to different user pages", m, msg);
   1370
   1371	/* Do not call task_work_add() more than once */
   1372	if (count > 1)
   1373		return;
   1374
   1375	task_work_add(current, &current->mce_kill_me, TWA_RESUME);
   1376}
   1377
   1378/* Handle unconfigured int18 (should never happen) */
   1379static noinstr void unexpected_machine_check(struct pt_regs *regs)
   1380{
   1381	instrumentation_begin();
   1382	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
   1383	       smp_processor_id());
   1384	instrumentation_end();
   1385}
   1386
   1387/*
   1388 * The actual machine check handler. This only handles real exceptions when
   1389 * something got corrupted coming in through int 18.
   1390 *
   1391 * This is executed in #MC context not subject to normal locking rules.
   1392 * This implies that most kernel services cannot be safely used. Don't even
   1393 * think about putting a printk in there!
   1394 *
   1395 * On Intel systems this is entered on all CPUs in parallel through
   1396 * MCE broadcast. However some CPUs might be broken beyond repair,
   1397 * so be always careful when synchronizing with others.
   1398 *
   1399 * Tracing and kprobes are disabled: if we interrupted a kernel context
   1400 * with IF=1, we need to minimize stack usage.  There are also recursion
   1401 * issues: if the machine check was due to a failure of the memory
   1402 * backing the user stack, tracing that reads the user stack will cause
   1403 * potentially infinite recursion.
   1404 *
   1405 * Currently, the #MC handler calls out to a number of external facilities
   1406 * and, therefore, allows instrumentation around them. The optimal thing to
   1407 * have would be to do the absolutely minimal work required in #MC context
   1408 * and have instrumentation disabled only around that. Further processing can
   1409 * then happen in process context where instrumentation is allowed. Achieving
   1410 * that requires careful auditing and modifications. Until then, the code
    1411 * allows instrumentation temporarily, where required.
   1412 */
   1413noinstr void do_machine_check(struct pt_regs *regs)
   1414{
   1415	int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
   1416	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
   1417	DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
   1418	struct mce m, *final;
   1419	char *msg = NULL;
   1420
   1421	if (unlikely(mce_flags.p5))
   1422		return pentium_machine_check(regs);
   1423	else if (unlikely(mce_flags.winchip))
   1424		return winchip_machine_check(regs);
   1425	else if (unlikely(!mca_cfg.initialized))
   1426		return unexpected_machine_check(regs);
   1427
   1428	if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
   1429		goto clear;
   1430
   1431	/*
   1432	 * Establish sequential order between the CPUs entering the machine
   1433	 * check handler.
   1434	 */
   1435	order = -1;
   1436
   1437	/*
   1438	 * If no_way_out gets set, there is no safe way to recover from this
   1439	 * MCE.
   1440	 */
   1441	no_way_out = 0;
   1442
   1443	/*
   1444	 * If kill_current_task is not set, there might be a way to recover from this
   1445	 * error.
   1446	 */
   1447	kill_current_task = 0;
   1448
   1449	/*
   1450	 * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
   1451	 * on Intel.
   1452	 */
   1453	lmce = 1;
   1454
   1455	this_cpu_inc(mce_exception_count);
   1456
   1457	mce_gather_info(&m, regs);
   1458	m.tsc = rdtsc();
   1459
   1460	final = this_cpu_ptr(&mces_seen);
   1461	*final = m;
   1462
   1463	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
   1464
   1465	barrier();
   1466
   1467	/*
    1468	 * When there is no restart IP, we might need to kill or panic.
   1469	 * Assume the worst for now, but if we find the
   1470	 * severity is MCE_AR_SEVERITY we have other options.
   1471	 */
   1472	if (!(m.mcgstatus & MCG_STATUS_RIPV))
   1473		kill_current_task = 1;
   1474	/*
    1475	 * Check whether this MCE is signaled to only this logical processor
    1476	 * (Intel and Zhaoxin only).
   1477	 */
   1478	if (m.cpuvendor == X86_VENDOR_INTEL ||
   1479	    m.cpuvendor == X86_VENDOR_ZHAOXIN)
   1480		lmce = m.mcgstatus & MCG_STATUS_LMCES;
   1481
   1482	/*
   1483	 * Local machine check may already know that we have to panic.
    1484	 * Broadcast machine check begins rendezvous in mce_start().
   1485	 * Go through all banks in exclusion of the other CPUs. This way we
   1486	 * don't report duplicated events on shared banks because the first one
   1487	 * to see it will clear it.
   1488	 */
   1489	if (lmce) {
   1490		if (no_way_out)
   1491			mce_panic("Fatal local machine check", &m, msg);
   1492	} else {
   1493		order = mce_start(&no_way_out);
   1494	}
   1495
   1496	taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
   1497
   1498	if (!no_way_out)
   1499		mce_clear_state(toclear);
   1500
   1501	/*
   1502	 * Do most of the synchronization with other CPUs.
   1503	 * When there's any problem use only local no_way_out state.
   1504	 */
   1505	if (!lmce) {
   1506		if (mce_end(order) < 0) {
   1507			if (!no_way_out)
   1508				no_way_out = worst >= MCE_PANIC_SEVERITY;
   1509
   1510			if (no_way_out)
   1511				mce_panic("Fatal machine check on current CPU", &m, msg);
   1512		}
   1513	} else {
   1514		/*
   1515		 * If there was a fatal machine check we should have
   1516		 * already called mce_panic earlier in this function.
   1517		 * Since we re-read the banks, we might have found
   1518		 * something new. Check again to see if we found a
   1519		 * fatal error. We call "mce_severity()" again to
   1520		 * make sure we have the right "msg".
   1521		 */
   1522		if (worst >= MCE_PANIC_SEVERITY) {
   1523			mce_severity(&m, regs, &msg, true);
   1524			mce_panic("Local fatal machine check!", &m, msg);
   1525		}
   1526	}
   1527
   1528	/*
   1529	 * Enable instrumentation around the external facilities like task_work_add()
   1530	 * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this
    1531	 * properly would need a much more involved reorganization.
   1532	 */
   1533	instrumentation_begin();
   1534
   1535	if (taint)
   1536		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
   1537
   1538	if (worst != MCE_AR_SEVERITY && !kill_current_task)
   1539		goto out;
   1540
   1541	/* Fault was in user mode and we need to take some action */
   1542	if ((m.cs & 3) == 3) {
   1543		/* If this triggers there is no way to recover. Die hard. */
   1544		BUG_ON(!on_thread_stack() || !user_mode(regs));
   1545
   1546		if (kill_current_task)
   1547			queue_task_work(&m, msg, kill_me_now);
   1548		else
   1549			queue_task_work(&m, msg, kill_me_maybe);
   1550
   1551	} else {
   1552		/*
   1553		 * Handle an MCE which has happened in kernel space but from
   1554		 * which the kernel can recover: ex_has_fault_handler() has
   1555		 * already verified that the rIP at which the error happened is
   1556		 * a rIP from which the kernel can recover (by jumping to
   1557		 * recovery code specified in _ASM_EXTABLE_FAULT()) and the
   1558		 * corresponding exception handler which would do that is the
   1559		 * proper one.
   1560		 */
   1561		if (m.kflags & MCE_IN_KERNEL_RECOV) {
   1562			if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
   1563				mce_panic("Failed kernel mode recovery", &m, msg);
   1564		}
   1565
   1566		if (m.kflags & MCE_IN_KERNEL_COPYIN)
   1567			queue_task_work(&m, msg, kill_me_never);
   1568	}
   1569
   1570out:
   1571	instrumentation_end();
   1572
   1573clear:
   1574	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
   1575}
   1576EXPORT_SYMBOL_GPL(do_machine_check);
   1577
   1578#ifndef CONFIG_MEMORY_FAILURE
   1579int memory_failure(unsigned long pfn, int flags)
   1580{
   1581	/* mce_severity() should not hand us an ACTION_REQUIRED error */
   1582	BUG_ON(flags & MF_ACTION_REQUIRED);
   1583	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
   1584	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
   1585	       pfn);
   1586
   1587	return 0;
   1588}
   1589#endif
   1590
   1591/*
   1592 * Periodic polling timer for "silent" machine check errors.  If the
   1593 * poller finds an MCE, poll 2x faster.  When the poller finds no more
   1594 * errors, poll 2x slower (up to check_interval seconds).
   1595 */
   1596static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
   1597
   1598static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
   1599static DEFINE_PER_CPU(struct timer_list, mce_timer);
   1600
   1601static unsigned long mce_adjust_timer_default(unsigned long interval)
   1602{
   1603	return interval;
   1604}
   1605
   1606static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
   1607
   1608static void __start_timer(struct timer_list *t, unsigned long interval)
   1609{
   1610	unsigned long when = jiffies + interval;
   1611	unsigned long flags;
   1612
   1613	local_irq_save(flags);
   1614
   1615	if (!timer_pending(t) || time_before(when, t->expires))
   1616		mod_timer(t, round_jiffies(when));
   1617
   1618	local_irq_restore(flags);
   1619}
   1620
   1621static void mce_timer_fn(struct timer_list *t)
   1622{
   1623	struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
   1624	unsigned long iv;
   1625
   1626	WARN_ON(cpu_t != t);
   1627
   1628	iv = __this_cpu_read(mce_next_interval);
   1629
   1630	if (mce_available(this_cpu_ptr(&cpu_info))) {
   1631		machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
   1632
   1633		if (mce_intel_cmci_poll()) {
   1634			iv = mce_adjust_timer(iv);
   1635			goto done;
   1636		}
   1637	}
   1638
   1639	/*
   1640	 * Alert userspace if needed. If we logged an MCE, reduce the polling
   1641	 * interval, otherwise increase the polling interval.
   1642	 */
   1643	if (mce_notify_irq())
   1644		iv = max(iv / 2, (unsigned long) HZ/100);
   1645	else
   1646		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
   1647
   1648done:
   1649	__this_cpu_write(mce_next_interval, iv);
   1650	__start_timer(t, iv);
   1651}
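
        /*
         * A sketch of the resulting backoff behaviour: every run that logged
         * an event halves the next interval, with a floor of HZ/100 jiffies
         * (10 ms), and every quiet run doubles it again, with a ceiling of
         * check_interval * HZ. A burst of errors therefore quickly drives the
         * poller towards 10 ms polling, while a quiet system decays back to
         * polling once every check_interval seconds.
         */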
   1652
   1653/*
   1654 * Ensure that the timer is firing in @interval from now.
   1655 */
   1656void mce_timer_kick(unsigned long interval)
   1657{
   1658	struct timer_list *t = this_cpu_ptr(&mce_timer);
   1659	unsigned long iv = __this_cpu_read(mce_next_interval);
   1660
   1661	__start_timer(t, interval);
   1662
   1663	if (interval < iv)
   1664		__this_cpu_write(mce_next_interval, interval);
   1665}
   1666
   1667/* Must not be called in IRQ context where del_timer_sync() can deadlock */
   1668static void mce_timer_delete_all(void)
   1669{
   1670	int cpu;
   1671
   1672	for_each_online_cpu(cpu)
   1673		del_timer_sync(&per_cpu(mce_timer, cpu));
   1674}
   1675
   1676/*
   1677 * Notify the user(s) about new machine check events.
   1678 * Can be called from interrupt context, but not from machine check/NMI
   1679 * context.
   1680 */
   1681int mce_notify_irq(void)
   1682{
   1683	/* Not more than two messages every minute */
   1684	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
   1685
   1686	if (test_and_clear_bit(0, &mce_need_notify)) {
   1687		mce_work_trigger();
   1688
   1689		if (__ratelimit(&ratelimit))
   1690			pr_info(HW_ERR "Machine check events logged\n");
   1691
   1692		return 1;
   1693	}
   1694	return 0;
   1695}
   1696EXPORT_SYMBOL_GPL(mce_notify_irq);
   1697
   1698static void __mcheck_cpu_mce_banks_init(void)
   1699{
   1700	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
   1701	u8 n_banks = this_cpu_read(mce_num_banks);
   1702	int i;
   1703
   1704	for (i = 0; i < n_banks; i++) {
   1705		struct mce_bank *b = &mce_banks[i];
   1706
   1707		/*
   1708		 * Init them all, __mcheck_cpu_apply_quirks() is going to apply
   1709		 * the required vendor quirks before
   1710		 * __mcheck_cpu_init_clear_banks() does the final bank setup.
   1711		 */
   1712		b->ctl = -1ULL;
   1713		b->init = true;
   1714	}
   1715}
   1716
   1717/*
   1718 * Initialize Machine Checks for a CPU.
   1719 */
   1720static void __mcheck_cpu_cap_init(void)
   1721{
   1722	u64 cap;
   1723	u8 b;
   1724
   1725	rdmsrl(MSR_IA32_MCG_CAP, cap);
   1726
   1727	b = cap & MCG_BANKCNT_MASK;
   1728
   1729	if (b > MAX_NR_BANKS) {
   1730		pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
   1731			smp_processor_id(), MAX_NR_BANKS, b);
   1732		b = MAX_NR_BANKS;
   1733	}
   1734
   1735	this_cpu_write(mce_num_banks, b);
   1736
   1737	__mcheck_cpu_mce_banks_init();
   1738
   1739	/* Use accurate RIP reporting if available. */
   1740	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
   1741		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
   1742
   1743	if (cap & MCG_SER_P)
   1744		mca_cfg.ser = 1;
   1745}
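
        /*
         * Note that mce_num_banks is per-CPU and clamped to MAX_NR_BANKS,
         * while mca_cfg.rip_msr and mca_cfg.ser are global and stay set once
         * any CPU advertises the corresponding MCG_CAP features.
         */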
   1746
   1747static void __mcheck_cpu_init_generic(void)
   1748{
   1749	enum mcp_flags m_fl = 0;
   1750	mce_banks_t all_banks;
   1751	u64 cap;
   1752
   1753	if (!mca_cfg.bootlog)
   1754		m_fl = MCP_DONTLOG;
   1755
   1756	/*
   1757	 * Log the machine checks left over from the previous reset. Log them
   1758	 * only, do not start processing them. That will happen in mcheck_late_init()
   1759	 * when all consumers have been registered on the notifier chain.
   1760	 */
   1761	bitmap_fill(all_banks, MAX_NR_BANKS);
   1762	machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks);
   1763
   1764	cr4_set_bits(X86_CR4_MCE);
   1765
   1766	rdmsrl(MSR_IA32_MCG_CAP, cap);
   1767	if (cap & MCG_CTL_P)
   1768		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
   1769}
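
        /*
         * From here on this CPU raises #MC instead of shutting down on an
         * uncorrected error: CR4.MCE is the global machine check enable bit,
         * and writing all ones to MCG_CTL (when present) enables all global
         * MCA features the CPU implements.
         */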
   1770
   1771static void __mcheck_cpu_init_clear_banks(void)
   1772{
   1773	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
   1774	int i;
   1775
   1776	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
   1777		struct mce_bank *b = &mce_banks[i];
   1778
   1779		if (!b->init)
   1780			continue;
   1781		wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
   1782		wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
   1783	}
   1784}
   1785
   1786/*
   1787 * Do a final check to see if there are any unused/RAZ banks.
   1788 *
   1789 * This must be done after the banks have been initialized and any quirks have
   1790 * been applied.
   1791 *
   1792 * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
   1793 * Otherwise, a user who disables a bank will not be able to re-enable it
   1794 * without a system reboot.
   1795 */
   1796static void __mcheck_cpu_check_banks(void)
   1797{
   1798	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
   1799	u64 msrval;
   1800	int i;
   1801
   1802	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
   1803		struct mce_bank *b = &mce_banks[i];
   1804
   1805		if (!b->init)
   1806			continue;
   1807
   1808		rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
   1809		b->init = !!msrval;
   1810	}
   1811}
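
        /*
         * A bank whose MCA_CTL still reads back as zero after the configured
         * control value was written by __mcheck_cpu_init_clear_banks() is
         * treated as read-as-zero/unused: clearing b->init makes later code
         * and the per-bank sysfs files (show_bank()/set_bank()) skip it.
         */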
   1812
   1813/* Add per CPU specific workarounds here */
   1814static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
   1815{
   1816	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
   1817	struct mca_config *cfg = &mca_cfg;
   1818
   1819	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
   1820		pr_info("unknown CPU type - not enabling MCE support\n");
   1821		return -EOPNOTSUPP;
   1822	}
   1823
   1824	/* This should be disabled by the BIOS, but isn't always */
   1825	if (c->x86_vendor == X86_VENDOR_AMD) {
   1826		if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
   1827			/*
   1828			 * disable GART TBL walk error reporting, which
   1829			 * trips off incorrectly with the IOMMU & 3ware
   1830			 * & Cerberus:
   1831			 */
   1832			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
   1833		}
   1834		if (c->x86 < 0x11 && cfg->bootlog < 0) {
   1835			/*
   1836			 * Lots of broken BIOS around that don't clear them
   1837			 * by default and leave crap in there. Don't log:
   1838			 */
   1839			cfg->bootlog = 0;
   1840		}
   1841		/*
   1842		 * Various K7s with broken bank 0 around. Always disable
   1843		 * by default.
   1844		 */
   1845		if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
   1846			mce_banks[0].ctl = 0;
   1847
   1848		/*
   1849		 * overflow_recov is supported for F15h Models 00h-0fh
   1850		 * even though we don't have a CPUID bit for it.
   1851		 */
   1852		if (c->x86 == 0x15 && c->x86_model <= 0xf)
   1853			mce_flags.overflow_recov = 1;
   1854
   1855	}
   1856
   1857	if (c->x86_vendor == X86_VENDOR_INTEL) {
    1858		/*
    1859		 * The SDM documents that, on family 6, bank 0 should not be
    1860		 * written because it aliases to another special BIOS-controlled
    1861		 * register.
    1862		 * It is no longer aliased on model 0x1a and later, though.
    1863		 * Don't ignore bank 0 completely because there could be a
    1864		 * valid event later; merely don't write CTL0.
    1865		 */
   1866
   1867		if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
   1868			mce_banks[0].init = false;
   1869
   1870		/*
   1871		 * All newer Intel systems support MCE broadcasting. Enable
   1872		 * synchronization with a one second timeout.
   1873		 */
   1874		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
   1875			cfg->monarch_timeout < 0)
   1876			cfg->monarch_timeout = USEC_PER_SEC;
   1877
   1878		/*
   1879		 * There are also broken BIOSes on some Pentium M and
   1880		 * earlier systems:
   1881		 */
   1882		if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
   1883			cfg->bootlog = 0;
   1884
   1885		if (c->x86 == 6 && c->x86_model == 45)
   1886			mce_flags.snb_ifu_quirk = 1;
   1887
   1888		/*
    1889		 * Skylake, Cascade Lake and Cooper Lake require a quirk on
   1890		 * rep movs.
   1891		 */
   1892		if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
   1893			mce_flags.skx_repmov_quirk = 1;
   1894	}
   1895
   1896	if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
   1897		/*
   1898		 * All newer Zhaoxin CPUs support MCE broadcasting. Enable
   1899		 * synchronization with a one second timeout.
   1900		 */
   1901		if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
   1902			if (cfg->monarch_timeout < 0)
   1903				cfg->monarch_timeout = USEC_PER_SEC;
   1904		}
   1905	}
   1906
   1907	if (cfg->monarch_timeout < 0)
   1908		cfg->monarch_timeout = 0;
   1909	if (cfg->bootlog != 0)
   1910		cfg->panic_timeout = 30;
   1911
   1912	return 0;
   1913}
   1914
   1915static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
   1916{
   1917	if (c->x86 != 5)
   1918		return 0;
   1919
   1920	switch (c->x86_vendor) {
   1921	case X86_VENDOR_INTEL:
   1922		intel_p5_mcheck_init(c);
   1923		mce_flags.p5 = 1;
   1924		return 1;
   1925	case X86_VENDOR_CENTAUR:
   1926		winchip_mcheck_init(c);
   1927		mce_flags.winchip = 1;
   1928		return 1;
   1929	default:
   1930		return 0;
   1931	}
   1932
   1933	return 0;
   1934}
   1935
   1936/*
   1937 * Init basic CPU features needed for early decoding of MCEs.
   1938 */
   1939static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
   1940{
   1941	if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
   1942		mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
   1943		mce_flags.succor	 = !!cpu_has(c, X86_FEATURE_SUCCOR);
   1944		mce_flags.smca		 = !!cpu_has(c, X86_FEATURE_SMCA);
   1945		mce_flags.amd_threshold	 = 1;
   1946	}
   1947}
   1948
   1949static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
   1950{
   1951	struct mca_config *cfg = &mca_cfg;
   1952
    1953	/*
    1954	 * All newer Centaur CPUs support MCE broadcasting. Enable
    1955	 * synchronization with a one second timeout.
    1956	 */
   1957	if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
   1958	     c->x86 > 6) {
   1959		if (cfg->monarch_timeout < 0)
   1960			cfg->monarch_timeout = USEC_PER_SEC;
   1961	}
   1962}
   1963
   1964static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
   1965{
   1966	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
   1967
   1968	/*
   1969	 * These CPUs have MCA bank 8 which reports only one error type called
   1970	 * SVAD (System View Address Decoder). The reporting of that error is
   1971	 * controlled by IA32_MC8.CTL.0.
   1972	 *
   1973	 * If enabled, prefetching on these CPUs will cause SVAD MCE when
    1974	 * virtual machines start and result in a system panic. Always disable
   1975	 * bank 8 SVAD error by default.
   1976	 */
   1977	if ((c->x86 == 7 && c->x86_model == 0x1b) ||
   1978	    (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
   1979		if (this_cpu_read(mce_num_banks) > 8)
   1980			mce_banks[8].ctl = 0;
   1981	}
   1982
   1983	intel_init_cmci();
   1984	intel_init_lmce();
   1985	mce_adjust_timer = cmci_intel_adjust_timer;
   1986}
   1987
   1988static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
   1989{
   1990	intel_clear_lmce();
   1991}
   1992
   1993static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
   1994{
   1995	switch (c->x86_vendor) {
   1996	case X86_VENDOR_INTEL:
   1997		mce_intel_feature_init(c);
   1998		mce_adjust_timer = cmci_intel_adjust_timer;
   1999		break;
   2000
   2001	case X86_VENDOR_AMD: {
   2002		mce_amd_feature_init(c);
   2003		break;
   2004		}
   2005
   2006	case X86_VENDOR_HYGON:
   2007		mce_hygon_feature_init(c);
   2008		break;
   2009
   2010	case X86_VENDOR_CENTAUR:
   2011		mce_centaur_feature_init(c);
   2012		break;
   2013
   2014	case X86_VENDOR_ZHAOXIN:
   2015		mce_zhaoxin_feature_init(c);
   2016		break;
   2017
   2018	default:
   2019		break;
   2020	}
   2021}
   2022
   2023static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
   2024{
   2025	switch (c->x86_vendor) {
   2026	case X86_VENDOR_INTEL:
   2027		mce_intel_feature_clear(c);
   2028		break;
   2029
   2030	case X86_VENDOR_ZHAOXIN:
   2031		mce_zhaoxin_feature_clear(c);
   2032		break;
   2033
   2034	default:
   2035		break;
   2036	}
   2037}
   2038
   2039static void mce_start_timer(struct timer_list *t)
   2040{
   2041	unsigned long iv = check_interval * HZ;
   2042
   2043	if (mca_cfg.ignore_ce || !iv)
   2044		return;
   2045
   2046	this_cpu_write(mce_next_interval, iv);
   2047	__start_timer(t, iv);
   2048}
   2049
   2050static void __mcheck_cpu_setup_timer(void)
   2051{
   2052	struct timer_list *t = this_cpu_ptr(&mce_timer);
   2053
   2054	timer_setup(t, mce_timer_fn, TIMER_PINNED);
   2055}
   2056
   2057static void __mcheck_cpu_init_timer(void)
   2058{
   2059	struct timer_list *t = this_cpu_ptr(&mce_timer);
   2060
   2061	timer_setup(t, mce_timer_fn, TIMER_PINNED);
   2062	mce_start_timer(t);
   2063}
   2064
   2065bool filter_mce(struct mce *m)
   2066{
   2067	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
   2068		return amd_filter_mce(m);
   2069	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
   2070		return intel_filter_mce(m);
   2071
   2072	return false;
   2073}
   2074
   2075static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
   2076{
   2077	irqentry_state_t irq_state;
   2078
   2079	WARN_ON_ONCE(user_mode(regs));
   2080
   2081	/*
   2082	 * Only required when from kernel mode. See
   2083	 * mce_check_crashing_cpu() for details.
   2084	 */
   2085	if (mca_cfg.initialized && mce_check_crashing_cpu())
   2086		return;
   2087
   2088	irq_state = irqentry_nmi_enter(regs);
   2089
   2090	do_machine_check(regs);
   2091
   2092	irqentry_nmi_exit(regs, irq_state);
   2093}
   2094
   2095static __always_inline void exc_machine_check_user(struct pt_regs *regs)
   2096{
   2097	irqentry_enter_from_user_mode(regs);
   2098
   2099	do_machine_check(regs);
   2100
   2101	irqentry_exit_to_user_mode(regs);
   2102}
   2103
   2104#ifdef CONFIG_X86_64
   2105/* MCE hit kernel mode */
   2106DEFINE_IDTENTRY_MCE(exc_machine_check)
   2107{
   2108	unsigned long dr7;
   2109
   2110	dr7 = local_db_save();
   2111	exc_machine_check_kernel(regs);
   2112	local_db_restore(dr7);
   2113}
   2114
   2115/* The user mode variant. */
   2116DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
   2117{
   2118	unsigned long dr7;
   2119
   2120	dr7 = local_db_save();
   2121	exc_machine_check_user(regs);
   2122	local_db_restore(dr7);
   2123}
   2124#else
   2125/* 32bit unified entry point */
   2126DEFINE_IDTENTRY_RAW(exc_machine_check)
   2127{
   2128	unsigned long dr7;
   2129
   2130	dr7 = local_db_save();
   2131	if (user_mode(regs))
   2132		exc_machine_check_user(regs);
   2133	else
   2134		exc_machine_check_kernel(regs);
   2135	local_db_restore(dr7);
   2136}
   2137#endif
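
        /*
         * All of the entry points above save and clear DR7 around the
         * handler so that a hardware breakpoint cannot raise #DB in the
         * middle of machine check handling, and restore it afterwards.
         */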
   2138
   2139/*
   2140 * Called for each booted CPU to set up machine checks.
   2141 * Must be called with preempt off:
   2142 */
   2143void mcheck_cpu_init(struct cpuinfo_x86 *c)
   2144{
   2145	if (mca_cfg.disabled)
   2146		return;
   2147
   2148	if (__mcheck_cpu_ancient_init(c))
   2149		return;
   2150
   2151	if (!mce_available(c))
   2152		return;
   2153
   2154	__mcheck_cpu_cap_init();
   2155
   2156	if (__mcheck_cpu_apply_quirks(c) < 0) {
   2157		mca_cfg.disabled = 1;
   2158		return;
   2159	}
   2160
   2161	if (mce_gen_pool_init()) {
   2162		mca_cfg.disabled = 1;
   2163		pr_emerg("Couldn't allocate MCE records pool!\n");
   2164		return;
   2165	}
   2166
   2167	mca_cfg.initialized = 1;
   2168
   2169	__mcheck_cpu_init_early(c);
   2170	__mcheck_cpu_init_generic();
   2171	__mcheck_cpu_init_vendor(c);
   2172	__mcheck_cpu_init_clear_banks();
   2173	__mcheck_cpu_check_banks();
   2174	__mcheck_cpu_setup_timer();
   2175}
   2176
   2177/*
   2178 * Called for each booted CPU to clear some machine checks opt-ins
   2179 */
   2180void mcheck_cpu_clear(struct cpuinfo_x86 *c)
   2181{
   2182	if (mca_cfg.disabled)
   2183		return;
   2184
   2185	if (!mce_available(c))
   2186		return;
   2187
    2188	/*
    2189	 * A step to clear generic x86 settings could be added here,
    2190	 * e.g. __mcheck_cpu_clear_generic(c);
    2191	 */
   2192	__mcheck_cpu_clear_vendor(c);
   2193
   2194}
   2195
   2196static void __mce_disable_bank(void *arg)
   2197{
   2198	int bank = *((int *)arg);
   2199	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
   2200	cmci_disable_bank(bank);
   2201}
   2202
   2203void mce_disable_bank(int bank)
   2204{
   2205	if (bank >= this_cpu_read(mce_num_banks)) {
   2206		pr_warn(FW_BUG
   2207			"Ignoring request to disable invalid MCA bank %d.\n",
   2208			bank);
   2209		return;
   2210	}
   2211	set_bit(bank, mce_banks_ce_disabled);
   2212	on_each_cpu(__mce_disable_bank, &bank, 1);
   2213}
   2214
   2215/*
   2216 * mce=off Disables machine check
   2217 * mce=no_cmci Disables CMCI
   2218 * mce=no_lmce Disables LMCE
   2219 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
   2220 * mce=print_all Print all machine check logs to console
   2221 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
    2222 * mce=monarchtimeout (number)
   2223 *	monarchtimeout is how long to wait for other CPUs on machine
   2224 *	check, or 0 to not wait
   2225 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
    2226 *	and older.
   2227 * mce=nobootlog Don't log MCEs from before booting.
   2228 * mce=bios_cmci_threshold Don't program the CMCI threshold
   2229 * mce=recovery force enable copy_mc_fragile()
   2230 */
   2231static int __init mcheck_enable(char *str)
   2232{
   2233	struct mca_config *cfg = &mca_cfg;
   2234
   2235	if (*str == 0) {
   2236		enable_p5_mce();
   2237		return 1;
   2238	}
   2239	if (*str == '=')
   2240		str++;
   2241	if (!strcmp(str, "off"))
   2242		cfg->disabled = 1;
   2243	else if (!strcmp(str, "no_cmci"))
   2244		cfg->cmci_disabled = true;
   2245	else if (!strcmp(str, "no_lmce"))
   2246		cfg->lmce_disabled = 1;
   2247	else if (!strcmp(str, "dont_log_ce"))
   2248		cfg->dont_log_ce = true;
   2249	else if (!strcmp(str, "print_all"))
   2250		cfg->print_all = true;
   2251	else if (!strcmp(str, "ignore_ce"))
   2252		cfg->ignore_ce = true;
   2253	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
   2254		cfg->bootlog = (str[0] == 'b');
   2255	else if (!strcmp(str, "bios_cmci_threshold"))
   2256		cfg->bios_cmci_threshold = 1;
   2257	else if (!strcmp(str, "recovery"))
   2258		cfg->recovery = 1;
   2259	else if (isdigit(str[0]))
   2260		get_option(&str, &(cfg->monarch_timeout));
   2261	else {
   2262		pr_info("mce argument %s ignored. Please use /sys\n", str);
   2263		return 0;
   2264	}
   2265	return 1;
   2266}
   2267__setup("mce", mcheck_enable);
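
        /*
         * Example: booting with "mce=ignore_ce" sets mca_cfg.ignore_ce, so
         * mce_start_timer() never arms the polling timer and CMCI stays off.
         * A bare number such as "mce=1000000" is parsed by get_option() into
         * mca_cfg.monarch_timeout (in usecs).
         */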
   2268
   2269int __init mcheck_init(void)
   2270{
   2271	mce_register_decode_chain(&early_nb);
   2272	mce_register_decode_chain(&mce_uc_nb);
   2273	mce_register_decode_chain(&mce_default_nb);
   2274
   2275	INIT_WORK(&mce_work, mce_gen_pool_process);
   2276	init_irq_work(&mce_irq_work, mce_irq_work_cb);
   2277
   2278	return 0;
   2279}
   2280
   2281/*
   2282 * mce_syscore: PM support
   2283 */
   2284
   2285/*
   2286 * Disable machine checks on suspend and shutdown. We can't really handle
   2287 * them later.
   2288 */
   2289static void mce_disable_error_reporting(void)
   2290{
   2291	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
   2292	int i;
   2293
   2294	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
   2295		struct mce_bank *b = &mce_banks[i];
   2296
   2297		if (b->init)
   2298			wrmsrl(mca_msr_reg(i, MCA_CTL), 0);
   2299	}
   2301}
   2302
   2303static void vendor_disable_error_reporting(void)
   2304{
   2305	/*
   2306	 * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
   2307	 * MSRs are socket-wide. Disabling them for just a single offlined CPU
   2308	 * is bad, since it will inhibit reporting for all shared resources on
   2309	 * the socket like the last level cache (LLC), the integrated memory
   2310	 * controller (iMC), etc.
   2311	 */
   2312	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
   2313	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
   2314	    boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
   2315	    boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
   2316		return;
   2317
   2318	mce_disable_error_reporting();
   2319}
   2320
   2321static int mce_syscore_suspend(void)
   2322{
   2323	vendor_disable_error_reporting();
   2324	return 0;
   2325}
   2326
   2327static void mce_syscore_shutdown(void)
   2328{
   2329	vendor_disable_error_reporting();
   2330}
   2331
   2332/*
   2333 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   2334 * Only one CPU is active at this time, the others get re-added later using
   2335 * CPU hotplug:
   2336 */
   2337static void mce_syscore_resume(void)
   2338{
   2339	__mcheck_cpu_init_generic();
   2340	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
   2341	__mcheck_cpu_init_clear_banks();
   2342}
   2343
   2344static struct syscore_ops mce_syscore_ops = {
   2345	.suspend	= mce_syscore_suspend,
   2346	.shutdown	= mce_syscore_shutdown,
   2347	.resume		= mce_syscore_resume,
   2348};
   2349
   2350/*
   2351 * mce_device: Sysfs support
   2352 */
   2353
   2354static void mce_cpu_restart(void *data)
   2355{
   2356	if (!mce_available(raw_cpu_ptr(&cpu_info)))
   2357		return;
   2358	__mcheck_cpu_init_generic();
   2359	__mcheck_cpu_init_clear_banks();
   2360	__mcheck_cpu_init_timer();
   2361}
   2362
   2363/* Reinit MCEs after user configuration changes */
   2364static void mce_restart(void)
   2365{
   2366	mce_timer_delete_all();
   2367	on_each_cpu(mce_cpu_restart, NULL, 1);
   2368}
   2369
   2370/* Toggle features for corrected errors */
   2371static void mce_disable_cmci(void *data)
   2372{
   2373	if (!mce_available(raw_cpu_ptr(&cpu_info)))
   2374		return;
   2375	cmci_clear();
   2376}
   2377
   2378static void mce_enable_ce(void *all)
   2379{
   2380	if (!mce_available(raw_cpu_ptr(&cpu_info)))
   2381		return;
   2382	cmci_reenable();
   2383	cmci_recheck();
   2384	if (all)
   2385		__mcheck_cpu_init_timer();
   2386}
   2387
   2388static struct bus_type mce_subsys = {
   2389	.name		= "machinecheck",
   2390	.dev_name	= "machinecheck",
   2391};
   2392
   2393DEFINE_PER_CPU(struct device *, mce_device);
   2394
   2395static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
   2396{
   2397	return container_of(attr, struct mce_bank_dev, attr);
   2398}
   2399
   2400static ssize_t show_bank(struct device *s, struct device_attribute *attr,
   2401			 char *buf)
   2402{
   2403	u8 bank = attr_to_bank(attr)->bank;
   2404	struct mce_bank *b;
   2405
   2406	if (bank >= per_cpu(mce_num_banks, s->id))
   2407		return -EINVAL;
   2408
   2409	b = &per_cpu(mce_banks_array, s->id)[bank];
   2410
   2411	if (!b->init)
   2412		return -ENODEV;
   2413
   2414	return sprintf(buf, "%llx\n", b->ctl);
   2415}
   2416
   2417static ssize_t set_bank(struct device *s, struct device_attribute *attr,
   2418			const char *buf, size_t size)
   2419{
   2420	u8 bank = attr_to_bank(attr)->bank;
   2421	struct mce_bank *b;
   2422	u64 new;
   2423
   2424	if (kstrtou64(buf, 0, &new) < 0)
   2425		return -EINVAL;
   2426
   2427	if (bank >= per_cpu(mce_num_banks, s->id))
   2428		return -EINVAL;
   2429
   2430	b = &per_cpu(mce_banks_array, s->id)[bank];
   2431
   2432	if (!b->init)
   2433		return -ENODEV;
   2434
   2435	b->ctl = new;
   2436	mce_restart();
   2437
   2438	return size;
   2439}
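
        /*
         * show_bank()/set_bank() back the per-bank sysfs files created in
         * mce_init_banks(), typically visible as
         * /sys/devices/system/machinecheck/machinecheckN/bankM. Writing a new
         * control mask updates the cached b->ctl, and mce_restart() then
         * rewrites MCA_CTL on every CPU via __mcheck_cpu_init_clear_banks().
         */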
   2440
   2441static ssize_t set_ignore_ce(struct device *s,
   2442			     struct device_attribute *attr,
   2443			     const char *buf, size_t size)
   2444{
   2445	u64 new;
   2446
   2447	if (kstrtou64(buf, 0, &new) < 0)
   2448		return -EINVAL;
   2449
   2450	mutex_lock(&mce_sysfs_mutex);
   2451	if (mca_cfg.ignore_ce ^ !!new) {
   2452		if (new) {
   2453			/* disable ce features */
   2454			mce_timer_delete_all();
   2455			on_each_cpu(mce_disable_cmci, NULL, 1);
   2456			mca_cfg.ignore_ce = true;
   2457		} else {
   2458			/* enable ce features */
   2459			mca_cfg.ignore_ce = false;
   2460			on_each_cpu(mce_enable_ce, (void *)1, 1);
   2461		}
   2462	}
   2463	mutex_unlock(&mce_sysfs_mutex);
   2464
   2465	return size;
   2466}
   2467
   2468static ssize_t set_cmci_disabled(struct device *s,
   2469				 struct device_attribute *attr,
   2470				 const char *buf, size_t size)
   2471{
   2472	u64 new;
   2473
   2474	if (kstrtou64(buf, 0, &new) < 0)
   2475		return -EINVAL;
   2476
   2477	mutex_lock(&mce_sysfs_mutex);
   2478	if (mca_cfg.cmci_disabled ^ !!new) {
   2479		if (new) {
   2480			/* disable cmci */
   2481			on_each_cpu(mce_disable_cmci, NULL, 1);
   2482			mca_cfg.cmci_disabled = true;
   2483		} else {
   2484			/* enable cmci */
   2485			mca_cfg.cmci_disabled = false;
   2486			on_each_cpu(mce_enable_ce, NULL, 1);
   2487		}
   2488	}
   2489	mutex_unlock(&mce_sysfs_mutex);
   2490
   2491	return size;
   2492}
   2493
   2494static ssize_t store_int_with_restart(struct device *s,
   2495				      struct device_attribute *attr,
   2496				      const char *buf, size_t size)
   2497{
   2498	unsigned long old_check_interval = check_interval;
   2499	ssize_t ret = device_store_ulong(s, attr, buf, size);
   2500
   2501	if (check_interval == old_check_interval)
   2502		return ret;
   2503
   2504	mutex_lock(&mce_sysfs_mutex);
   2505	mce_restart();
   2506	mutex_unlock(&mce_sysfs_mutex);
   2507
   2508	return ret;
   2509}
   2510
   2511static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
   2512static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
   2513static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);
   2514
   2515static struct dev_ext_attribute dev_attr_check_interval = {
   2516	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
   2517	&check_interval
   2518};
   2519
   2520static struct dev_ext_attribute dev_attr_ignore_ce = {
   2521	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
   2522	&mca_cfg.ignore_ce
   2523};
   2524
   2525static struct dev_ext_attribute dev_attr_cmci_disabled = {
   2526	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
   2527	&mca_cfg.cmci_disabled
   2528};
   2529
   2530static struct device_attribute *mce_device_attrs[] = {
   2531	&dev_attr_check_interval.attr,
   2532#ifdef CONFIG_X86_MCELOG_LEGACY
   2533	&dev_attr_trigger,
   2534#endif
   2535	&dev_attr_monarch_timeout.attr,
   2536	&dev_attr_dont_log_ce.attr,
   2537	&dev_attr_print_all.attr,
   2538	&dev_attr_ignore_ce.attr,
   2539	&dev_attr_cmci_disabled.attr,
   2540	NULL
   2541};
   2542
   2543static cpumask_var_t mce_device_initialized;
   2544
   2545static void mce_device_release(struct device *dev)
   2546{
   2547	kfree(dev);
   2548}
   2549
   2550/* Per CPU device init. All of the CPUs still share the same bank device: */
   2551static int mce_device_create(unsigned int cpu)
   2552{
   2553	struct device *dev;
   2554	int err;
   2555	int i, j;
   2556
   2557	if (!mce_available(&boot_cpu_data))
   2558		return -EIO;
   2559
   2560	dev = per_cpu(mce_device, cpu);
   2561	if (dev)
   2562		return 0;
   2563
   2564	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
   2565	if (!dev)
   2566		return -ENOMEM;
   2567	dev->id  = cpu;
   2568	dev->bus = &mce_subsys;
   2569	dev->release = &mce_device_release;
   2570
   2571	err = device_register(dev);
   2572	if (err) {
   2573		put_device(dev);
   2574		return err;
   2575	}
   2576
   2577	for (i = 0; mce_device_attrs[i]; i++) {
   2578		err = device_create_file(dev, mce_device_attrs[i]);
   2579		if (err)
   2580			goto error;
   2581	}
   2582	for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
   2583		err = device_create_file(dev, &mce_bank_devs[j].attr);
   2584		if (err)
   2585			goto error2;
   2586	}
   2587	cpumask_set_cpu(cpu, mce_device_initialized);
   2588	per_cpu(mce_device, cpu) = dev;
   2589
   2590	return 0;
   2591error2:
   2592	while (--j >= 0)
   2593		device_remove_file(dev, &mce_bank_devs[j].attr);
   2594error:
   2595	while (--i >= 0)
   2596		device_remove_file(dev, mce_device_attrs[i]);
   2597
   2598	device_unregister(dev);
   2599
   2600	return err;
   2601}
   2602
   2603static void mce_device_remove(unsigned int cpu)
   2604{
   2605	struct device *dev = per_cpu(mce_device, cpu);
   2606	int i;
   2607
   2608	if (!cpumask_test_cpu(cpu, mce_device_initialized))
   2609		return;
   2610
   2611	for (i = 0; mce_device_attrs[i]; i++)
   2612		device_remove_file(dev, mce_device_attrs[i]);
   2613
   2614	for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
   2615		device_remove_file(dev, &mce_bank_devs[i].attr);
   2616
   2617	device_unregister(dev);
   2618	cpumask_clear_cpu(cpu, mce_device_initialized);
   2619	per_cpu(mce_device, cpu) = NULL;
   2620}
   2621
   2622/* Make sure there are no machine checks on offlined CPUs. */
   2623static void mce_disable_cpu(void)
   2624{
   2625	if (!mce_available(raw_cpu_ptr(&cpu_info)))
   2626		return;
   2627
   2628	if (!cpuhp_tasks_frozen)
   2629		cmci_clear();
   2630
   2631	vendor_disable_error_reporting();
   2632}
   2633
   2634static void mce_reenable_cpu(void)
   2635{
   2636	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
   2637	int i;
   2638
   2639	if (!mce_available(raw_cpu_ptr(&cpu_info)))
   2640		return;
   2641
   2642	if (!cpuhp_tasks_frozen)
   2643		cmci_reenable();
   2644	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
   2645		struct mce_bank *b = &mce_banks[i];
   2646
   2647		if (b->init)
   2648			wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
   2649	}
   2650}
   2651
   2652static int mce_cpu_dead(unsigned int cpu)
   2653{
   2654	mce_intel_hcpu_update(cpu);
   2655
   2656	/* intentionally ignoring frozen here */
   2657	if (!cpuhp_tasks_frozen)
   2658		cmci_rediscover();
   2659	return 0;
   2660}
   2661
   2662static int mce_cpu_online(unsigned int cpu)
   2663{
   2664	struct timer_list *t = this_cpu_ptr(&mce_timer);
   2665	int ret;
   2666
   2667	mce_device_create(cpu);
   2668
   2669	ret = mce_threshold_create_device(cpu);
   2670	if (ret) {
   2671		mce_device_remove(cpu);
   2672		return ret;
   2673	}
   2674	mce_reenable_cpu();
   2675	mce_start_timer(t);
   2676	return 0;
   2677}
   2678
   2679static int mce_cpu_pre_down(unsigned int cpu)
   2680{
   2681	struct timer_list *t = this_cpu_ptr(&mce_timer);
   2682
   2683	mce_disable_cpu();
   2684	del_timer_sync(t);
   2685	mce_threshold_remove_device(cpu);
   2686	mce_device_remove(cpu);
   2687	return 0;
   2688}
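
        /*
         * mce_cpu_online()/mce_cpu_pre_down() are the CPU hotplug callbacks
         * registered in mcheck_init_device() below: onlining a CPU creates
         * its sysfs devices, re-enables its banks and restarts the poll
         * timer; taking it down reverses those steps first.
         */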
   2689
   2690static __init void mce_init_banks(void)
   2691{
   2692	int i;
   2693
   2694	for (i = 0; i < MAX_NR_BANKS; i++) {
   2695		struct mce_bank_dev *b = &mce_bank_devs[i];
   2696		struct device_attribute *a = &b->attr;
   2697
   2698		b->bank = i;
   2699
   2700		sysfs_attr_init(&a->attr);
   2701		a->attr.name	= b->attrname;
   2702		snprintf(b->attrname, ATTR_LEN, "bank%d", i);
   2703
   2704		a->attr.mode	= 0644;
   2705		a->show		= show_bank;
   2706		a->store	= set_bank;
   2707	}
   2708}
   2709
   2710/*
   2711 * When running on XEN, this initcall is ordered against the XEN mcelog
   2712 * initcall:
   2713 *
   2714 *   device_initcall(xen_late_init_mcelog);
   2715 *   device_initcall_sync(mcheck_init_device);
   2716 */
   2717static __init int mcheck_init_device(void)
   2718{
   2719	int err;
   2720
   2721	/*
   2722	 * Check if we have a spare virtual bit. This will only become
   2723	 * a problem if/when we move beyond 5-level page tables.
   2724	 */
   2725	MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
   2726
   2727	if (!mce_available(&boot_cpu_data)) {
   2728		err = -EIO;
   2729		goto err_out;
   2730	}
   2731
   2732	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
   2733		err = -ENOMEM;
   2734		goto err_out;
   2735	}
   2736
   2737	mce_init_banks();
   2738
   2739	err = subsys_system_register(&mce_subsys, NULL);
   2740	if (err)
   2741		goto err_out_mem;
   2742
   2743	err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
   2744				mce_cpu_dead);
   2745	if (err)
   2746		goto err_out_mem;
   2747
   2748	/*
   2749	 * Invokes mce_cpu_online() on all CPUs which are online when
   2750	 * the state is installed.
   2751	 */
   2752	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
   2753				mce_cpu_online, mce_cpu_pre_down);
   2754	if (err < 0)
   2755		goto err_out_online;
   2756
   2757	register_syscore_ops(&mce_syscore_ops);
   2758
   2759	return 0;
   2760
   2761err_out_online:
   2762	cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
   2763
   2764err_out_mem:
   2765	free_cpumask_var(mce_device_initialized);
   2766
   2767err_out:
   2768	pr_err("Unable to init MCE device (rc: %d)\n", err);
   2769
   2770	return err;
   2771}
   2772device_initcall_sync(mcheck_init_device);
   2773
   2774/*
   2775 * Old style boot options parsing. Only for compatibility.
   2776 */
   2777static int __init mcheck_disable(char *str)
   2778{
   2779	mca_cfg.disabled = 1;
   2780	return 1;
   2781}
   2782__setup("nomce", mcheck_disable);
   2783
   2784#ifdef CONFIG_DEBUG_FS
   2785struct dentry *mce_get_debugfs_dir(void)
   2786{
   2787	static struct dentry *dmce;
   2788
   2789	if (!dmce)
   2790		dmce = debugfs_create_dir("mce", NULL);
   2791
   2792	return dmce;
   2793}
   2794
   2795static void mce_reset(void)
   2796{
   2797	atomic_set(&mce_fake_panicked, 0);
   2798	atomic_set(&mce_executing, 0);
   2799	atomic_set(&mce_callin, 0);
   2800	atomic_set(&global_nwo, 0);
   2801	cpumask_setall(&mce_missing_cpus);
   2802}
   2803
   2804static int fake_panic_get(void *data, u64 *val)
   2805{
   2806	*val = fake_panic;
   2807	return 0;
   2808}
   2809
   2810static int fake_panic_set(void *data, u64 val)
   2811{
   2812	mce_reset();
   2813	fake_panic = val;
   2814	return 0;
   2815}
   2816
   2817DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
   2818			 "%llu\n");
   2819
   2820static void __init mcheck_debugfs_init(void)
   2821{
   2822	struct dentry *dmce;
   2823
   2824	dmce = mce_get_debugfs_dir();
   2825	debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
   2826				   &fake_panic_fops);
   2827}
   2828#else
   2829static void __init mcheck_debugfs_init(void) { }
   2830#endif
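
        /*
         * With debugfs mounted (typically at /sys/kernel/debug), the above
         * shows up as mce/fake_panic; writes go through fake_panic_set(),
         * which clears the rendezvous bookkeeping via mce_reset() before
         * updating fake_panic.
         */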
   2831
   2832static int __init mcheck_late_init(void)
   2833{
   2834	if (mca_cfg.recovery)
   2835		enable_copy_mc_fragile();
   2836
   2837	mcheck_debugfs_init();
   2838
   2839	/*
   2840	 * Flush out everything that has been logged during early boot, now that
   2841	 * everything has been initialized (workqueues, decoders, ...).
   2842	 */
   2843	mce_schedule_work();
   2844
   2845	return 0;
   2846}
   2847late_initcall(mcheck_late_init);