cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

sev.c (65956B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * AMD Memory Encryption Support
      4 *
      5 * Copyright (C) 2019 SUSE
      6 *
      7 * Author: Joerg Roedel <jroedel@suse.de>
      8 */
      9
     10#define pr_fmt(fmt)	"SEV: " fmt
     11
     12#include <linux/sched/debug.h>	/* For show_regs() */
     13#include <linux/percpu-defs.h>
     14#include <linux/cc_platform.h>
     15#include <linux/printk.h>
     16#include <linux/mm_types.h>
     17#include <linux/set_memory.h>
     18#include <linux/memblock.h>
     19#include <linux/kernel.h>
     20#include <linux/mm.h>
     21#include <linux/cpumask.h>
     22#include <linux/efi.h>
     23#include <linux/platform_device.h>
     24#include <linux/io.h>
     25#include <linux/cpumask.h>
     26#include <linux/amd-iommu.h>
     27
     28#include <asm/cpu_entry_area.h>
     29#include <asm/stacktrace.h>
     30#include <asm/sev.h>
     31#include <asm/insn-eval.h>
     32#include <asm/fpu/xcr.h>
     33#include <asm/processor.h>
     34#include <asm/realmode.h>
     35#include <asm/setup.h>
     36#include <asm/traps.h>
     37#include <asm/svm.h>
     38#include <asm/smp.h>
     39#include <asm/cpu.h>
     40#include <asm/apic.h>
     41#include <asm/cpuid.h>
     42#include <asm/cmdline.h>
     43
     44#define DR7_RESET_VALUE        0x400
     45
      46/* AP INIT values as documented in the APM2 section "Processor Initialization State" */
     47#define AP_INIT_CS_LIMIT		0xffff
     48#define AP_INIT_DS_LIMIT		0xffff
     49#define AP_INIT_LDTR_LIMIT		0xffff
     50#define AP_INIT_GDTR_LIMIT		0xffff
     51#define AP_INIT_IDTR_LIMIT		0xffff
     52#define AP_INIT_TR_LIMIT		0xffff
     53#define AP_INIT_RFLAGS_DEFAULT		0x2
     54#define AP_INIT_DR6_DEFAULT		0xffff0ff0
     55#define AP_INIT_GPAT_DEFAULT		0x0007040600070406ULL
     56#define AP_INIT_XCR0_DEFAULT		0x1
     57#define AP_INIT_X87_FTW_DEFAULT		0x5555
     58#define AP_INIT_X87_FCW_DEFAULT		0x0040
     59#define AP_INIT_CR0_DEFAULT		0x60000010
     60#define AP_INIT_MXCSR_DEFAULT		0x1f80
     61
     62/*
      63 * The first 16KB from RMP_BASE is used by the processor for
      64 * bookkeeping; this range needs to be added during the RMP entry lookup.
     65 */
     66#define RMPTABLE_CPU_BOOKKEEPING_SZ	0x4000
     67#define RMPENTRY_SHIFT			8
     68#define rmptable_page_offset(x)	(RMPTABLE_CPU_BOOKKEEPING_SZ + (((unsigned long)x) >> RMPENTRY_SHIFT))
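/*
 * Worked example: with the values above, physical address 0x100000 maps to
 * byte offset 0x4000 + (0x100000 >> 8) = 0x5000 inside the RMP table, i.e.
 * each 4K page of DRAM corresponds to a 16-byte RMP entry placed after the
 * 16KB bookkeeping area.
 */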
     69
     70/* For early boot hypervisor communication in SEV-ES enabled guests */
     71static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
     72
     73/*
     74 * Needs to be in the .data section because we need it NULL before bss is
     75 * cleared
     76 */
     77static struct ghcb *boot_ghcb __section(".data");
     78
     79/* Bitmap of SEV features supported by the hypervisor */
     80static u64 sev_hv_features __ro_after_init;
     81
     82static unsigned long rmptable_start __ro_after_init;
     83static unsigned long rmptable_end __ro_after_init;
     84
     85
     86/* #VC handler runtime per-CPU data */
     87struct sev_es_runtime_data {
     88	struct ghcb ghcb_page;
     89
     90	/*
     91	 * Reserve one page per CPU as backup storage for the unencrypted GHCB.
     92	 * It is needed when an NMI happens while the #VC handler uses the real
     93	 * GHCB, and the NMI handler itself is causing another #VC exception. In
     94	 * that case the GHCB content of the first handler needs to be backed up
     95	 * and restored.
     96	 */
     97	struct ghcb backup_ghcb;
     98
     99	/*
    100	 * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
    101	 * There is no need for it to be atomic, because nothing is written to
    102	 * the GHCB between the read and the write of ghcb_active. So it is safe
    103	 * to use it when a nested #VC exception happens before the write.
    104	 *
    105	 * This is necessary for example in the #VC->NMI->#VC case when the NMI
    106	 * happens while the first #VC handler uses the GHCB. When the NMI code
    107	 * raises a second #VC handler it might overwrite the contents of the
    108	 * GHCB written by the first handler. To avoid this the content of the
    109	 * GHCB is saved and restored when the GHCB is detected to be in use
    110	 * already.
    111	 */
    112	bool ghcb_active;
    113	bool backup_ghcb_active;
    114
    115	/*
    116	 * Cached DR7 value - write it on DR7 writes and return it on reads.
    117	 * That value will never make it to the real hardware DR7 as debugging
    118	 * is currently unsupported in SEV-ES guests.
    119	 */
    120	unsigned long dr7;
    121};
    122
    123struct ghcb_state {
    124	struct ghcb *ghcb;
    125};
    126
    127static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
    128DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);
    129
    130static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
    131
    132struct sev_config {
    133	__u64 debug		: 1,
    134	      __reserved	: 63;
    135};
    136
    137static struct sev_config sev_cfg __read_mostly;
    138
    139static __always_inline bool on_vc_stack(struct pt_regs *regs)
    140{
    141	unsigned long sp = regs->sp;
    142
    143	/* User-mode RSP is not trusted */
    144	if (user_mode(regs))
    145		return false;
    146
    147	/* SYSCALL gap still has user-mode RSP */
    148	if (ip_within_syscall_gap(regs))
    149		return false;
    150
    151	return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
    152}
    153
    154/*
    155 * This function handles the case when an NMI is raised in the #VC
    156 * exception handler entry code, before the #VC handler has switched off
    157 * its IST stack. In this case, the IST entry for #VC must be adjusted,
    158 * so that any nested #VC exception will not overwrite the stack
    159 * contents of the interrupted #VC handler.
    160 *
     161 * The IST entry is adjusted unconditionally so that it can also be
    162 * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
    163 * nested sev_es_ist_exit() call may adjust back the IST entry too
    164 * early.
    165 *
    166 * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run
    167 * on the NMI IST stack, as they are only called from NMI handling code
    168 * right now.
    169 */
    170void noinstr __sev_es_ist_enter(struct pt_regs *regs)
    171{
    172	unsigned long old_ist, new_ist;
    173
    174	/* Read old IST entry */
    175	new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
    176
    177	/*
    178	 * If NMI happened while on the #VC IST stack, set the new IST
    179	 * value below regs->sp, so that the interrupted stack frame is
    180	 * not overwritten by subsequent #VC exceptions.
    181	 */
    182	if (on_vc_stack(regs))
    183		new_ist = regs->sp;
    184
    185	/*
    186	 * Reserve additional 8 bytes and store old IST value so this
    187	 * adjustment can be unrolled in __sev_es_ist_exit().
    188	 */
    189	new_ist -= sizeof(old_ist);
    190	*(unsigned long *)new_ist = old_ist;
    191
    192	/* Set new IST entry */
    193	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
    194}
    195
    196void noinstr __sev_es_ist_exit(void)
    197{
    198	unsigned long ist;
    199
    200	/* Read IST entry */
    201	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
    202
    203	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
    204		return;
    205
    206	/* Read back old IST entry and write it to the TSS */
    207	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
    208}
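/*
 * Illustrative pairing, assuming the NMI entry code brackets its work with
 * these two helpers (they are only called from NMI handling code, see above):
 *
 *	__sev_es_ist_enter(regs);	// IST[VC] lowered below regs->sp, old value saved
 *	...NMI handling, possibly taking nested #VC exceptions...
 *	__sev_es_ist_exit();		// saved IST[VC] value written back to the TSS
 *
 * The 8-byte slot written at new_ist in __sev_es_ist_enter() is exactly what
 * __sev_es_ist_exit() dereferences to restore the entry.
 */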
    209
    210/*
    211 * Nothing shall interrupt this code path while holding the per-CPU
    212 * GHCB. The backup GHCB is only for NMIs interrupting this path.
    213 *
    214 * Callers must disable local interrupts around it.
    215 */
    216static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
    217{
    218	struct sev_es_runtime_data *data;
    219	struct ghcb *ghcb;
    220
    221	WARN_ON(!irqs_disabled());
    222
    223	data = this_cpu_read(runtime_data);
    224	ghcb = &data->ghcb_page;
    225
    226	if (unlikely(data->ghcb_active)) {
    227		/* GHCB is already in use - save its contents */
    228
    229		if (unlikely(data->backup_ghcb_active)) {
    230			/*
    231			 * Backup-GHCB is also already in use. There is no way
    232			 * to continue here so just kill the machine. To make
    233			 * panic() work, mark GHCBs inactive so that messages
    234			 * can be printed out.
    235			 */
    236			data->ghcb_active        = false;
    237			data->backup_ghcb_active = false;
    238
    239			instrumentation_begin();
    240			panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
    241			instrumentation_end();
    242		}
    243
    244		/* Mark backup_ghcb active before writing to it */
    245		data->backup_ghcb_active = true;
    246
    247		state->ghcb = &data->backup_ghcb;
    248
    249		/* Backup GHCB content */
    250		*state->ghcb = *ghcb;
    251	} else {
    252		state->ghcb = NULL;
    253		data->ghcb_active = true;
    254	}
    255
    256	return ghcb;
    257}
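/*
 * Typical usage (see e.g. get_jump_table_addr() further down): disable
 * interrupts, call __sev_get_ghcb(), fill the GHCB, issue the VMGEXIT and
 * then release it with __sev_put_ghcb(). The returned pointer is always the
 * per-CPU GHCB page; state->ghcb is only non-NULL when a backup copy was
 * taken and has to be restored on the put path.
 */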
    258
    259static inline u64 sev_es_rd_ghcb_msr(void)
    260{
    261	return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
    262}
    263
    264static __always_inline void sev_es_wr_ghcb_msr(u64 val)
    265{
    266	u32 low, high;
    267
    268	low  = (u32)(val);
    269	high = (u32)(val >> 32);
    270
    271	native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
    272}
    273
    274static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
    275				unsigned char *buffer)
    276{
    277	return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
    278}
    279
    280static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
    281{
    282	char buffer[MAX_INSN_SIZE];
    283	int insn_bytes;
    284
    285	insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
    286	if (insn_bytes == 0) {
    287		/* Nothing could be copied */
    288		ctxt->fi.vector     = X86_TRAP_PF;
    289		ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
    290		ctxt->fi.cr2        = ctxt->regs->ip;
    291		return ES_EXCEPTION;
    292	} else if (insn_bytes == -EINVAL) {
    293		/* Effective RIP could not be calculated */
    294		ctxt->fi.vector     = X86_TRAP_GP;
    295		ctxt->fi.error_code = 0;
    296		ctxt->fi.cr2        = 0;
    297		return ES_EXCEPTION;
    298	}
    299
    300	if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes))
    301		return ES_DECODE_FAILED;
    302
    303	if (ctxt->insn.immediate.got)
    304		return ES_OK;
    305	else
    306		return ES_DECODE_FAILED;
    307}
    308
    309static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
    310{
    311	char buffer[MAX_INSN_SIZE];
    312	int res, ret;
    313
    314	res = vc_fetch_insn_kernel(ctxt, buffer);
    315	if (res) {
    316		ctxt->fi.vector     = X86_TRAP_PF;
    317		ctxt->fi.error_code = X86_PF_INSTR;
    318		ctxt->fi.cr2        = ctxt->regs->ip;
    319		return ES_EXCEPTION;
    320	}
    321
    322	ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
    323	if (ret < 0)
    324		return ES_DECODE_FAILED;
    325	else
    326		return ES_OK;
    327}
    328
    329static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
    330{
    331	if (user_mode(ctxt->regs))
    332		return __vc_decode_user_insn(ctxt);
    333	else
    334		return __vc_decode_kern_insn(ctxt);
    335}
    336
    337static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
    338				   char *dst, char *buf, size_t size)
    339{
    340	unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
    341
    342	/*
    343	 * This function uses __put_user() independent of whether kernel or user
    344	 * memory is accessed. This works fine because __put_user() does no
    345	 * sanity checks of the pointer being accessed. All that it does is
    346	 * to report when the access failed.
    347	 *
    348	 * Also, this function runs in atomic context, so __put_user() is not
    349	 * allowed to sleep. The page-fault handler detects that it is running
    350	 * in atomic context and will not try to take mmap_sem and handle the
    351	 * fault, so additional pagefault_enable()/disable() calls are not
    352	 * needed.
    353	 *
    354	 * The access can't be done via copy_to_user() here because
    355	 * vc_write_mem() must not use string instructions to access unsafe
    356	 * memory. The reason is that MOVS is emulated by the #VC handler by
    357	 * splitting the move up into a read and a write and taking a nested #VC
    358	 * exception on whatever of them is the MMIO access. Using string
    359	 * instructions here would cause infinite nesting.
    360	 */
    361	switch (size) {
    362	case 1: {
    363		u8 d1;
    364		u8 __user *target = (u8 __user *)dst;
    365
    366		memcpy(&d1, buf, 1);
    367		if (__put_user(d1, target))
    368			goto fault;
    369		break;
    370	}
    371	case 2: {
    372		u16 d2;
    373		u16 __user *target = (u16 __user *)dst;
    374
    375		memcpy(&d2, buf, 2);
    376		if (__put_user(d2, target))
    377			goto fault;
    378		break;
    379	}
    380	case 4: {
    381		u32 d4;
    382		u32 __user *target = (u32 __user *)dst;
    383
    384		memcpy(&d4, buf, 4);
    385		if (__put_user(d4, target))
    386			goto fault;
    387		break;
    388	}
    389	case 8: {
    390		u64 d8;
    391		u64 __user *target = (u64 __user *)dst;
    392
    393		memcpy(&d8, buf, 8);
    394		if (__put_user(d8, target))
    395			goto fault;
    396		break;
    397	}
    398	default:
    399		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
    400		return ES_UNSUPPORTED;
    401	}
    402
    403	return ES_OK;
    404
    405fault:
    406	if (user_mode(ctxt->regs))
    407		error_code |= X86_PF_USER;
    408
    409	ctxt->fi.vector = X86_TRAP_PF;
    410	ctxt->fi.error_code = error_code;
    411	ctxt->fi.cr2 = (unsigned long)dst;
    412
    413	return ES_EXCEPTION;
    414}
    415
    416static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
    417				  char *src, char *buf, size_t size)
    418{
    419	unsigned long error_code = X86_PF_PROT;
    420
    421	/*
    422	 * This function uses __get_user() independent of whether kernel or user
    423	 * memory is accessed. This works fine because __get_user() does no
    424	 * sanity checks of the pointer being accessed. All that it does is
    425	 * to report when the access failed.
    426	 *
    427	 * Also, this function runs in atomic context, so __get_user() is not
    428	 * allowed to sleep. The page-fault handler detects that it is running
    429	 * in atomic context and will not try to take mmap_sem and handle the
    430	 * fault, so additional pagefault_enable()/disable() calls are not
    431	 * needed.
    432	 *
    433	 * The access can't be done via copy_from_user() here because
    434	 * vc_read_mem() must not use string instructions to access unsafe
    435	 * memory. The reason is that MOVS is emulated by the #VC handler by
    436	 * splitting the move up into a read and a write and taking a nested #VC
    437	 * exception on whatever of them is the MMIO access. Using string
    438	 * instructions here would cause infinite nesting.
    439	 */
    440	switch (size) {
    441	case 1: {
    442		u8 d1;
    443		u8 __user *s = (u8 __user *)src;
    444
    445		if (__get_user(d1, s))
    446			goto fault;
    447		memcpy(buf, &d1, 1);
    448		break;
    449	}
    450	case 2: {
    451		u16 d2;
    452		u16 __user *s = (u16 __user *)src;
    453
    454		if (__get_user(d2, s))
    455			goto fault;
    456		memcpy(buf, &d2, 2);
    457		break;
    458	}
    459	case 4: {
    460		u32 d4;
    461		u32 __user *s = (u32 __user *)src;
    462
    463		if (__get_user(d4, s))
    464			goto fault;
    465		memcpy(buf, &d4, 4);
    466		break;
    467	}
    468	case 8: {
    469		u64 d8;
    470		u64 __user *s = (u64 __user *)src;
    471		if (__get_user(d8, s))
    472			goto fault;
    473		memcpy(buf, &d8, 8);
    474		break;
    475	}
    476	default:
    477		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
    478		return ES_UNSUPPORTED;
    479	}
    480
    481	return ES_OK;
    482
    483fault:
    484	if (user_mode(ctxt->regs))
    485		error_code |= X86_PF_USER;
    486
    487	ctxt->fi.vector = X86_TRAP_PF;
    488	ctxt->fi.error_code = error_code;
    489	ctxt->fi.cr2 = (unsigned long)src;
    490
    491	return ES_EXCEPTION;
    492}
    493
    494static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
    495					   unsigned long vaddr, phys_addr_t *paddr)
    496{
    497	unsigned long va = (unsigned long)vaddr;
    498	unsigned int level;
    499	phys_addr_t pa;
    500	pgd_t *pgd;
    501	pte_t *pte;
    502
    503	pgd = __va(read_cr3_pa());
    504	pgd = &pgd[pgd_index(va)];
    505	pte = lookup_address_in_pgd(pgd, va, &level);
    506	if (!pte) {
    507		ctxt->fi.vector     = X86_TRAP_PF;
    508		ctxt->fi.cr2        = vaddr;
    509		ctxt->fi.error_code = 0;
    510
    511		if (user_mode(ctxt->regs))
    512			ctxt->fi.error_code |= X86_PF_USER;
    513
    514		return ES_EXCEPTION;
    515	}
    516
    517	if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
    518		/* Emulated MMIO to/from encrypted memory not supported */
    519		return ES_UNSUPPORTED;
    520
    521	pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
    522	pa |= va & ~page_level_mask(level);
    523
    524	*paddr = pa;
    525
    526	return ES_OK;
    527}
    528
    529/* Include code shared with pre-decompression boot stage */
    530#include "sev-shared.c"
    531
    532static noinstr void __sev_put_ghcb(struct ghcb_state *state)
    533{
    534	struct sev_es_runtime_data *data;
    535	struct ghcb *ghcb;
    536
    537	WARN_ON(!irqs_disabled());
    538
    539	data = this_cpu_read(runtime_data);
    540	ghcb = &data->ghcb_page;
    541
    542	if (state->ghcb) {
    543		/* Restore GHCB from Backup */
    544		*ghcb = *state->ghcb;
    545		data->backup_ghcb_active = false;
    546		state->ghcb = NULL;
    547	} else {
    548		/*
    549		 * Invalidate the GHCB so a VMGEXIT instruction issued
    550		 * from userspace won't appear to be valid.
    551		 */
    552		vc_ghcb_invalidate(ghcb);
    553		data->ghcb_active = false;
    554	}
    555}
    556
    557void noinstr __sev_es_nmi_complete(void)
    558{
    559	struct ghcb_state state;
    560	struct ghcb *ghcb;
    561
    562	ghcb = __sev_get_ghcb(&state);
    563
    564	vc_ghcb_invalidate(ghcb);
    565	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
    566	ghcb_set_sw_exit_info_1(ghcb, 0);
    567	ghcb_set_sw_exit_info_2(ghcb, 0);
    568
    569	sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
    570	VMGEXIT();
    571
    572	__sev_put_ghcb(&state);
    573}
    574
    575static u64 __init get_secrets_page(void)
    576{
    577	u64 pa_data = boot_params.cc_blob_address;
    578	struct cc_blob_sev_info info;
    579	void *map;
    580
    581	/*
    582	 * The CC blob contains the address of the secrets page, check if the
    583	 * blob is present.
    584	 */
    585	if (!pa_data)
    586		return 0;
    587
    588	map = early_memremap(pa_data, sizeof(info));
    589	if (!map) {
    590		pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n");
    591		return 0;
    592	}
    593	memcpy(&info, map, sizeof(info));
    594	early_memunmap(map, sizeof(info));
    595
    596	/* smoke-test the secrets page passed */
    597	if (!info.secrets_phys || info.secrets_len != PAGE_SIZE)
    598		return 0;
    599
    600	return info.secrets_phys;
    601}
    602
    603static u64 __init get_snp_jump_table_addr(void)
    604{
    605	struct snp_secrets_page_layout *layout;
    606	void __iomem *mem;
    607	u64 pa, addr;
    608
    609	pa = get_secrets_page();
    610	if (!pa)
    611		return 0;
    612
    613	mem = ioremap_encrypted(pa, PAGE_SIZE);
    614	if (!mem) {
    615		pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n");
    616		return 0;
    617	}
    618
    619	layout = (__force struct snp_secrets_page_layout *)mem;
    620
    621	addr = layout->os_area.ap_jump_table_pa;
    622	iounmap(mem);
    623
    624	return addr;
    625}
    626
    627static u64 __init get_jump_table_addr(void)
    628{
    629	struct ghcb_state state;
    630	unsigned long flags;
    631	struct ghcb *ghcb;
    632	u64 ret = 0;
    633
    634	if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
    635		return get_snp_jump_table_addr();
    636
    637	local_irq_save(flags);
    638
    639	ghcb = __sev_get_ghcb(&state);
    640
    641	vc_ghcb_invalidate(ghcb);
    642	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
    643	ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE);
    644	ghcb_set_sw_exit_info_2(ghcb, 0);
    645
    646	sev_es_wr_ghcb_msr(__pa(ghcb));
    647	VMGEXIT();
    648
    649	if (ghcb_sw_exit_info_1_is_valid(ghcb) &&
    650	    ghcb_sw_exit_info_2_is_valid(ghcb))
    651		ret = ghcb->save.sw_exit_info_2;
    652
    653	__sev_put_ghcb(&state);
    654
    655	local_irq_restore(flags);
    656
    657	return ret;
    658}
    659
    660static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate)
    661{
    662	unsigned long vaddr_end;
    663	int rc;
    664
    665	vaddr = vaddr & PAGE_MASK;
    666	vaddr_end = vaddr + (npages << PAGE_SHIFT);
    667
    668	while (vaddr < vaddr_end) {
    669		rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
    670		if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc))
    671			sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
    672
    673		vaddr = vaddr + PAGE_SIZE;
    674	}
    675}
    676
    677static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op)
    678{
    679	unsigned long paddr_end;
    680	u64 val;
    681
    682	paddr = paddr & PAGE_MASK;
    683	paddr_end = paddr + (npages << PAGE_SHIFT);
    684
    685	while (paddr < paddr_end) {
    686		/*
    687		 * Use the MSR protocol because this function can be called before
    688		 * the GHCB is established.
    689		 */
    690		sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
    691		VMGEXIT();
    692
    693		val = sev_es_rd_ghcb_msr();
    694
    695		if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP,
    696			 "Wrong PSC response code: 0x%x\n",
    697			 (unsigned int)GHCB_RESP_CODE(val)))
    698			goto e_term;
    699
    700		if (WARN(GHCB_MSR_PSC_RESP_VAL(val),
    701			 "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n",
    702			 op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared",
    703			 paddr, GHCB_MSR_PSC_RESP_VAL(val)))
    704			goto e_term;
    705
    706		paddr = paddr + PAGE_SIZE;
    707	}
    708
    709	return;
    710
    711e_term:
    712	sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
    713}
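/*
 * Note on the GHCB MSR protocol used above: each 4K page costs a full round
 * trip (write the request into the GHCB MSR, VMGEXIT, read the response
 * back), so this path is only used for the small amount of memory whose
 * state must change before a real GHCB exists. Later page state changes go
 * through set_pages_state(), which batches entries via the GHCB shared
 * buffer instead.
 */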
    714
    715void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
    716					 unsigned int npages)
    717{
    718	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
    719		return;
    720
    721	 /*
    722	  * Ask the hypervisor to mark the memory pages as private in the RMP
    723	  * table.
    724	  */
    725	early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE);
    726
    727	/* Validate the memory pages after they've been added in the RMP table. */
    728	pvalidate_pages(vaddr, npages, true);
    729}
    730
    731void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
    732					unsigned int npages)
    733{
    734	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
    735		return;
    736
    737	/* Invalidate the memory pages before they are marked shared in the RMP table. */
    738	pvalidate_pages(vaddr, npages, false);
    739
     740	 /* Ask the hypervisor to mark the memory pages shared in the RMP table. */
    741	early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED);
    742}
    743
    744void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op)
    745{
    746	unsigned long vaddr, npages;
    747
    748	vaddr = (unsigned long)__va(paddr);
    749	npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
    750
    751	if (op == SNP_PAGE_STATE_PRIVATE)
    752		early_snp_set_memory_private(vaddr, paddr, npages);
    753	else if (op == SNP_PAGE_STATE_SHARED)
    754		early_snp_set_memory_shared(vaddr, paddr, npages);
    755	else
    756		WARN(1, "invalid memory op %d\n", op);
    757}
    758
    759static int vmgexit_psc(struct snp_psc_desc *desc)
    760{
    761	int cur_entry, end_entry, ret = 0;
    762	struct snp_psc_desc *data;
    763	struct ghcb_state state;
    764	struct es_em_ctxt ctxt;
    765	unsigned long flags;
    766	struct ghcb *ghcb;
    767
    768	/*
    769	 * __sev_get_ghcb() needs to run with IRQs disabled because it is using
    770	 * a per-CPU GHCB.
    771	 */
    772	local_irq_save(flags);
    773
    774	ghcb = __sev_get_ghcb(&state);
    775	if (!ghcb) {
    776		ret = 1;
    777		goto out_unlock;
    778	}
    779
    780	/* Copy the input desc into GHCB shared buffer */
    781	data = (struct snp_psc_desc *)ghcb->shared_buffer;
    782	memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc)));
    783
    784	/*
    785	 * As per the GHCB specification, the hypervisor can resume the guest
    786	 * before processing all the entries. Check whether all the entries
    787	 * are processed. If not, then keep retrying. Note, the hypervisor
    788	 * will update the data memory directly to indicate the status, so
    789	 * reference the data->hdr everywhere.
    790	 *
    791	 * The strategy here is to wait for the hypervisor to change the page
    792	 * state in the RMP table before guest accesses the memory pages. If the
    793	 * page state change was not successful, then later memory access will
    794	 * result in a crash.
    795	 */
    796	cur_entry = data->hdr.cur_entry;
    797	end_entry = data->hdr.end_entry;
    798
    799	while (data->hdr.cur_entry <= data->hdr.end_entry) {
    800		ghcb_set_sw_scratch(ghcb, (u64)__pa(data));
    801
     802		/* This call advances cur_entry in the shared buffer that data points to. */
    803		ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, SVM_VMGEXIT_PSC, 0, 0);
    804
    805		/*
    806		 * Page State Change VMGEXIT can pass error code through
    807		 * exit_info_2.
    808		 */
    809		if (WARN(ret || ghcb->save.sw_exit_info_2,
    810			 "SNP: PSC failed ret=%d exit_info_2=%llx\n",
    811			 ret, ghcb->save.sw_exit_info_2)) {
    812			ret = 1;
    813			goto out;
    814		}
    815
    816		/* Verify that reserved bit is not set */
    817		if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) {
    818			ret = 1;
    819			goto out;
    820		}
    821
    822		/*
    823		 * Sanity check that entry processing is not going backwards.
     824		 * This will happen only if the hypervisor is tricking us.
    825		 */
    826		if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry,
    827"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n",
    828			 end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) {
    829			ret = 1;
    830			goto out;
    831		}
    832	}
    833
    834out:
    835	__sev_put_ghcb(&state);
    836
    837out_unlock:
    838	local_irq_restore(flags);
    839
    840	return ret;
    841}
    842
    843static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
    844			      unsigned long vaddr_end, int op)
    845{
    846	struct psc_hdr *hdr;
    847	struct psc_entry *e;
    848	unsigned long pfn;
    849	int i;
    850
    851	hdr = &data->hdr;
    852	e = data->entries;
    853
    854	memset(data, 0, sizeof(*data));
    855	i = 0;
    856
    857	while (vaddr < vaddr_end) {
    858		if (is_vmalloc_addr((void *)vaddr))
    859			pfn = vmalloc_to_pfn((void *)vaddr);
    860		else
    861			pfn = __pa(vaddr) >> PAGE_SHIFT;
    862
    863		e->gfn = pfn;
    864		e->operation = op;
    865		hdr->end_entry = i;
    866
    867		/*
     868		 * The current SNP implementation doesn't keep track of the RMP page
     869		 * size, so use 4K for simplicity.
    870		 */
    871		e->pagesize = RMP_PG_SIZE_4K;
    872
    873		vaddr = vaddr + PAGE_SIZE;
    874		e++;
    875		i++;
    876	}
    877
    878	if (vmgexit_psc(data))
    879		sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
    880}
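/*
 * Sketch of the descriptor __set_pages_state() builds for a physically
 * contiguous three-page range starting at pfn N (only the fields written
 * above; everything else stays zeroed by the memset()):
 *
 *	hdr.cur_entry = 0, hdr.end_entry = 2
 *	entries[0] = { .gfn = N,     .operation = op, .pagesize = RMP_PG_SIZE_4K }
 *	entries[1] = { .gfn = N + 1, .operation = op, .pagesize = RMP_PG_SIZE_4K }
 *	entries[2] = { .gfn = N + 2, .operation = op, .pagesize = RMP_PG_SIZE_4K }
 *
 * vmgexit_psc() then hands this to the hypervisor, which advances
 * hdr.cur_entry as it processes the entries.
 */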
    881
    882static void set_pages_state(unsigned long vaddr, unsigned int npages, int op)
    883{
    884	unsigned long vaddr_end, next_vaddr;
    885	struct snp_psc_desc *desc;
    886
    887	desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT);
    888	if (!desc)
    889		panic("SNP: failed to allocate memory for PSC descriptor\n");
    890
    891	vaddr = vaddr & PAGE_MASK;
    892	vaddr_end = vaddr + (npages << PAGE_SHIFT);
    893
    894	while (vaddr < vaddr_end) {
    895		/* Calculate the last vaddr that fits in one struct snp_psc_desc. */
    896		next_vaddr = min_t(unsigned long, vaddr_end,
    897				   (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr);
    898
    899		__set_pages_state(desc, vaddr, next_vaddr, op);
    900
    901		vaddr = next_vaddr;
    902	}
    903
    904	kfree(desc);
    905}
    906
    907void snp_set_memory_shared(unsigned long vaddr, unsigned int npages)
    908{
    909	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
    910		return;
    911
    912	pvalidate_pages(vaddr, npages, false);
    913
    914	set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED);
    915}
    916
    917void snp_set_memory_private(unsigned long vaddr, unsigned int npages)
    918{
    919	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
    920		return;
    921
    922	set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
    923
    924	pvalidate_pages(vaddr, npages, true);
    925}
    926
    927static int snp_set_vmsa(void *va, bool vmsa)
    928{
    929	u64 attrs;
    930
    931	/*
    932	 * Running at VMPL0 allows the kernel to change the VMSA bit for a page
    933	 * using the RMPADJUST instruction. However, for the instruction to
    934	 * succeed it must target the permissions of a lesser privileged
    935	 * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST
    936	 * instruction in the AMD64 APM Volume 3).
    937	 */
    938	attrs = 1;
    939	if (vmsa)
    940		attrs |= RMPADJUST_VMSA_PAGE_BIT;
    941
    942	return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
    943}
    944
    945#define __ATTR_BASE		(SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
    946#define INIT_CS_ATTRIBS		(__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK)
    947#define INIT_DS_ATTRIBS		(__ATTR_BASE | SVM_SELECTOR_WRITE_MASK)
    948
    949#define INIT_LDTR_ATTRIBS	(SVM_SELECTOR_P_MASK | 2)
    950#define INIT_TR_ATTRIBS		(SVM_SELECTOR_P_MASK | 3)
    951
    952static void *snp_alloc_vmsa_page(void)
    953{
    954	struct page *p;
    955
    956	/*
     957	 * Allocate the VMSA page to work around the SNP erratum where the CPU will
     958	 * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB)
     959	 * collides with the RMP entry of the VMSA page. The recommended workaround
    960	 * is to not use a large page.
    961	 *
    962	 * Allocate an 8k page which is also 8k-aligned.
    963	 */
    964	p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
    965	if (!p)
    966		return NULL;
    967
    968	split_page(p, 1);
    969
    970	/* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */
    971	__free_page(p);
    972
    973	return page_address(p + 1);
    974}
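/*
 * Why the second page is safe: an order-1 allocation is 8K-aligned, so at
 * most the first of the two 4K pages can fall on a 2MB/1GB boundary. The
 * page handed out here (first + 4K) is therefore never 2MB/1GB aligned,
 * which is the property the workaround above relies on.
 */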
    975
    976static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
    977{
    978	int err;
    979
    980	err = snp_set_vmsa(vmsa, false);
    981	if (err)
    982		pr_err("clear VMSA page failed (%u), leaking page\n", err);
    983	else
    984		free_page((unsigned long)vmsa);
    985}
    986
    987static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip)
    988{
    989	struct sev_es_save_area *cur_vmsa, *vmsa;
    990	struct ghcb_state state;
    991	unsigned long flags;
    992	struct ghcb *ghcb;
    993	u8 sipi_vector;
    994	int cpu, ret;
    995	u64 cr4;
    996
    997	/*
     998	 * The hypervisor SNP feature support check has happened earlier; just check
    999	 * the AP_CREATION one here.
   1000	 */
   1001	if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION))
   1002		return -EOPNOTSUPP;
   1003
   1004	/*
   1005	 * Verify the desired start IP against the known trampoline start IP
   1006	 * to catch any future new trampolines that may be introduced that
   1007	 * would require a new protected guest entry point.
   1008	 */
   1009	if (WARN_ONCE(start_ip != real_mode_header->trampoline_start,
   1010		      "Unsupported SNP start_ip: %lx\n", start_ip))
   1011		return -EINVAL;
   1012
   1013	/* Override start_ip with known protected guest start IP */
   1014	start_ip = real_mode_header->sev_es_trampoline_start;
   1015
   1016	/* Find the logical CPU for the APIC ID */
   1017	for_each_present_cpu(cpu) {
   1018		if (arch_match_cpu_phys_id(cpu, apic_id))
   1019			break;
   1020	}
   1021	if (cpu >= nr_cpu_ids)
   1022		return -EINVAL;
   1023
   1024	cur_vmsa = per_cpu(sev_vmsa, cpu);
   1025
   1026	/*
   1027	 * A new VMSA is created each time because there is no guarantee that
    1028	 * the current VMSA is the kernel's or that the vCPU is not running. If
    1029	 * an attempt were made to use the current VMSA with a running vCPU, a
   1030	 * #VMEXIT of that vCPU would wipe out all of the settings being done
   1031	 * here.
   1032	 */
   1033	vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page();
   1034	if (!vmsa)
   1035		return -ENOMEM;
   1036
   1037	/* CR4 should maintain the MCE value */
   1038	cr4 = native_read_cr4() & X86_CR4_MCE;
   1039
   1040	/* Set the CS value based on the start_ip converted to a SIPI vector */
   1041	sipi_vector		= (start_ip >> 12);
   1042	vmsa->cs.base		= sipi_vector << 12;
   1043	vmsa->cs.limit		= AP_INIT_CS_LIMIT;
   1044	vmsa->cs.attrib		= INIT_CS_ATTRIBS;
   1045	vmsa->cs.selector	= sipi_vector << 8;
   1046
   1047	/* Set the RIP value based on start_ip */
   1048	vmsa->rip		= start_ip & 0xfff;
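	/*
	 * Worked example with a hypothetical start_ip of 0x9c000: sipi_vector
	 * becomes 0x9c, cs.base 0x9c000, cs.selector 0x9c00 and rip 0x0, i.e.
	 * CS:IP points at the trampoline entry exactly as a real INIT-SIPI-SIPI
	 * sequence would leave it.
	 */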
   1049
   1050	/* Set AP INIT defaults as documented in the APM */
   1051	vmsa->ds.limit		= AP_INIT_DS_LIMIT;
   1052	vmsa->ds.attrib		= INIT_DS_ATTRIBS;
   1053	vmsa->es		= vmsa->ds;
   1054	vmsa->fs		= vmsa->ds;
   1055	vmsa->gs		= vmsa->ds;
   1056	vmsa->ss		= vmsa->ds;
   1057
   1058	vmsa->gdtr.limit	= AP_INIT_GDTR_LIMIT;
   1059	vmsa->ldtr.limit	= AP_INIT_LDTR_LIMIT;
   1060	vmsa->ldtr.attrib	= INIT_LDTR_ATTRIBS;
   1061	vmsa->idtr.limit	= AP_INIT_IDTR_LIMIT;
   1062	vmsa->tr.limit		= AP_INIT_TR_LIMIT;
   1063	vmsa->tr.attrib		= INIT_TR_ATTRIBS;
   1064
   1065	vmsa->cr4		= cr4;
   1066	vmsa->cr0		= AP_INIT_CR0_DEFAULT;
   1067	vmsa->dr7		= DR7_RESET_VALUE;
   1068	vmsa->dr6		= AP_INIT_DR6_DEFAULT;
   1069	vmsa->rflags		= AP_INIT_RFLAGS_DEFAULT;
   1070	vmsa->g_pat		= AP_INIT_GPAT_DEFAULT;
   1071	vmsa->xcr0		= AP_INIT_XCR0_DEFAULT;
   1072	vmsa->mxcsr		= AP_INIT_MXCSR_DEFAULT;
   1073	vmsa->x87_ftw		= AP_INIT_X87_FTW_DEFAULT;
   1074	vmsa->x87_fcw		= AP_INIT_X87_FCW_DEFAULT;
   1075
   1076	/* SVME must be set. */
   1077	vmsa->efer		= EFER_SVME;
   1078
   1079	/*
   1080	 * Set the SNP-specific fields for this VMSA:
   1081	 *   VMPL level
   1082	 *   SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
   1083	 */
   1084	vmsa->vmpl		= 0;
   1085	vmsa->sev_features	= sev_status >> 2;
   1086
   1087	/* Switch the page over to a VMSA page now that it is initialized */
   1088	ret = snp_set_vmsa(vmsa, true);
   1089	if (ret) {
   1090		pr_err("set VMSA page failed (%u)\n", ret);
   1091		free_page((unsigned long)vmsa);
   1092
   1093		return -EINVAL;
   1094	}
   1095
   1096	/* Issue VMGEXIT AP Creation NAE event */
   1097	local_irq_save(flags);
   1098
   1099	ghcb = __sev_get_ghcb(&state);
   1100
   1101	vc_ghcb_invalidate(ghcb);
   1102	ghcb_set_rax(ghcb, vmsa->sev_features);
   1103	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
   1104	ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE);
   1105	ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
   1106
   1107	sev_es_wr_ghcb_msr(__pa(ghcb));
   1108	VMGEXIT();
   1109
   1110	if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
   1111	    lower_32_bits(ghcb->save.sw_exit_info_1)) {
   1112		pr_err("SNP AP Creation error\n");
   1113		ret = -EINVAL;
   1114	}
   1115
   1116	__sev_put_ghcb(&state);
   1117
   1118	local_irq_restore(flags);
   1119
   1120	/* Perform cleanup if there was an error */
   1121	if (ret) {
   1122		snp_cleanup_vmsa(vmsa);
   1123		vmsa = NULL;
   1124	}
   1125
   1126	/* Free up any previous VMSA page */
   1127	if (cur_vmsa)
   1128		snp_cleanup_vmsa(cur_vmsa);
   1129
   1130	/* Record the current VMSA page */
   1131	per_cpu(sev_vmsa, cpu) = vmsa;
   1132
   1133	return ret;
   1134}
   1135
   1136void snp_set_wakeup_secondary_cpu(void)
   1137{
   1138	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
   1139		return;
   1140
   1141	/*
   1142	 * Always set this override if SNP is enabled. This makes it the
   1143	 * required method to start APs under SNP. If the hypervisor does
   1144	 * not support AP creation, then no APs will be started.
   1145	 */
   1146	apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit;
   1147}
   1148
   1149int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
   1150{
   1151	u16 startup_cs, startup_ip;
   1152	phys_addr_t jump_table_pa;
   1153	u64 jump_table_addr;
   1154	u16 __iomem *jump_table;
   1155
   1156	jump_table_addr = get_jump_table_addr();
   1157
   1158	/* On UP guests there is no jump table so this is not a failure */
   1159	if (!jump_table_addr)
   1160		return 0;
   1161
   1162	/* Check if AP Jump Table is page-aligned */
   1163	if (jump_table_addr & ~PAGE_MASK)
   1164		return -EINVAL;
   1165
   1166	jump_table_pa = jump_table_addr & PAGE_MASK;
   1167
   1168	startup_cs = (u16)(rmh->trampoline_start >> 4);
   1169	startup_ip = (u16)(rmh->sev_es_trampoline_start -
   1170			   rmh->trampoline_start);
   1171
   1172	jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE);
   1173	if (!jump_table)
   1174		return -EIO;
   1175
   1176	writew(startup_ip, &jump_table[0]);
   1177	writew(startup_cs, &jump_table[1]);
   1178
   1179	iounmap(jump_table);
   1180
   1181	return 0;
   1182}
   1183
   1184/*
   1185 * This is needed by the OVMF UEFI firmware which will use whatever it finds in
   1186 * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu
   1187 * runtime GHCBs used by the kernel are also mapped in the EFI page-table.
   1188 */
   1189int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
   1190{
   1191	struct sev_es_runtime_data *data;
   1192	unsigned long address, pflags;
   1193	int cpu;
   1194	u64 pfn;
   1195
   1196	if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
   1197		return 0;
   1198
   1199	pflags = _PAGE_NX | _PAGE_RW;
   1200
   1201	for_each_possible_cpu(cpu) {
   1202		data = per_cpu(runtime_data, cpu);
   1203
   1204		address = __pa(&data->ghcb_page);
   1205		pfn = address >> PAGE_SHIFT;
   1206
   1207		if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags))
   1208			return 1;
   1209	}
   1210
   1211	return 0;
   1212}
   1213
   1214static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
   1215{
   1216	struct pt_regs *regs = ctxt->regs;
   1217	enum es_result ret;
   1218	u64 exit_info_1;
   1219
   1220	/* Is it a WRMSR? */
   1221	exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0;
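	/*
	 * Both MSR accesses are two-byte 0x0f-prefixed opcodes; the second byte
	 * is 0x30 for WRMSR and 0x32 for RDMSR, so checking opcode byte 1 is
	 * enough to tell the two apart.
	 */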
   1222
   1223	ghcb_set_rcx(ghcb, regs->cx);
   1224	if (exit_info_1) {
   1225		ghcb_set_rax(ghcb, regs->ax);
   1226		ghcb_set_rdx(ghcb, regs->dx);
   1227	}
   1228
   1229	ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_MSR,
   1230				  exit_info_1, 0);
   1231
   1232	if ((ret == ES_OK) && (!exit_info_1)) {
   1233		regs->ax = ghcb->save.rax;
   1234		regs->dx = ghcb->save.rdx;
   1235	}
   1236
   1237	return ret;
   1238}
   1239
   1240static void snp_register_per_cpu_ghcb(void)
   1241{
   1242	struct sev_es_runtime_data *data;
   1243	struct ghcb *ghcb;
   1244
   1245	data = this_cpu_read(runtime_data);
   1246	ghcb = &data->ghcb_page;
   1247
   1248	snp_register_ghcb_early(__pa(ghcb));
   1249}
   1250
   1251void setup_ghcb(void)
   1252{
   1253	if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
   1254		return;
   1255
   1256	/* First make sure the hypervisor talks a supported protocol. */
   1257	if (!sev_es_negotiate_protocol())
   1258		sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
   1259
   1260	/*
   1261	 * Check whether the runtime #VC exception handler is active. It uses
   1262	 * the per-CPU GHCB page which is set up by sev_es_init_vc_handling().
   1263	 *
   1264	 * If SNP is active, register the per-CPU GHCB page so that the runtime
   1265	 * exception handler can use it.
   1266	 */
   1267	if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) {
   1268		if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
   1269			snp_register_per_cpu_ghcb();
   1270
   1271		return;
   1272	}
   1273
   1274	/*
   1275	 * Clear the boot_ghcb. The first exception comes in before the bss
   1276	 * section is cleared.
   1277	 */
   1278	memset(&boot_ghcb_page, 0, PAGE_SIZE);
   1279
   1280	/* Alright - Make the boot-ghcb public */
   1281	boot_ghcb = &boot_ghcb_page;
   1282
    1283	/* An SNP guest requires that the GHCB GPA be registered. */
   1284	if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
   1285		snp_register_ghcb_early(__pa(&boot_ghcb_page));
   1286}
   1287
   1288#ifdef CONFIG_HOTPLUG_CPU
   1289static void sev_es_ap_hlt_loop(void)
   1290{
   1291	struct ghcb_state state;
   1292	struct ghcb *ghcb;
   1293
   1294	ghcb = __sev_get_ghcb(&state);
   1295
   1296	while (true) {
   1297		vc_ghcb_invalidate(ghcb);
   1298		ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP);
   1299		ghcb_set_sw_exit_info_1(ghcb, 0);
   1300		ghcb_set_sw_exit_info_2(ghcb, 0);
   1301
   1302		sev_es_wr_ghcb_msr(__pa(ghcb));
   1303		VMGEXIT();
   1304
   1305		/* Wakeup signal? */
   1306		if (ghcb_sw_exit_info_2_is_valid(ghcb) &&
   1307		    ghcb->save.sw_exit_info_2)
   1308			break;
   1309	}
   1310
   1311	__sev_put_ghcb(&state);
   1312}
   1313
   1314/*
   1315 * Play_dead handler when running under SEV-ES. This is needed because
   1316 * the hypervisor can't deliver an SIPI request to restart the AP.
   1317 * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the
   1318 * hypervisor wakes it up again.
   1319 */
   1320static void sev_es_play_dead(void)
   1321{
   1322	play_dead_common();
   1323
   1324	/* IRQs now disabled */
   1325
   1326	sev_es_ap_hlt_loop();
   1327
   1328	/*
   1329	 * If we get here, the VCPU was woken up again. Jump to CPU
   1330	 * startup code to get it back online.
   1331	 */
   1332	start_cpu0();
   1333}
   1334#else  /* CONFIG_HOTPLUG_CPU */
   1335#define sev_es_play_dead	native_play_dead
   1336#endif /* CONFIG_HOTPLUG_CPU */
   1337
   1338#ifdef CONFIG_SMP
   1339static void __init sev_es_setup_play_dead(void)
   1340{
   1341	smp_ops.play_dead = sev_es_play_dead;
   1342}
   1343#else
   1344static inline void sev_es_setup_play_dead(void) { }
   1345#endif
   1346
   1347static void __init alloc_runtime_data(int cpu)
   1348{
   1349	struct sev_es_runtime_data *data;
   1350
   1351	data = memblock_alloc(sizeof(*data), PAGE_SIZE);
   1352	if (!data)
   1353		panic("Can't allocate SEV-ES runtime data");
   1354
   1355	per_cpu(runtime_data, cpu) = data;
   1356}
   1357
   1358static void __init init_ghcb(int cpu)
   1359{
   1360	struct sev_es_runtime_data *data;
   1361	int err;
   1362
   1363	data = per_cpu(runtime_data, cpu);
   1364
   1365	err = early_set_memory_decrypted((unsigned long)&data->ghcb_page,
   1366					 sizeof(data->ghcb_page));
   1367	if (err)
   1368		panic("Can't map GHCBs unencrypted");
   1369
   1370	memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));
   1371
   1372	data->ghcb_active = false;
   1373	data->backup_ghcb_active = false;
   1374}
   1375
   1376void __init sev_es_init_vc_handling(void)
   1377{
   1378	int cpu;
   1379
   1380	BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
   1381
   1382	if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
   1383		return;
   1384
   1385	if (!sev_es_check_cpu_features())
   1386		panic("SEV-ES CPU Features missing");
   1387
   1388	/*
   1389	 * SNP is supported in v2 of the GHCB spec which mandates support for HV
   1390	 * features.
   1391	 */
   1392	if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
   1393		sev_hv_features = get_hv_features();
   1394
   1395		if (!(sev_hv_features & GHCB_HV_FT_SNP))
   1396			sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
   1397	}
   1398
   1399	/* Enable SEV-ES special handling */
   1400	static_branch_enable(&sev_es_enable_key);
   1401
   1402	/* Initialize per-cpu GHCB pages */
   1403	for_each_possible_cpu(cpu) {
   1404		alloc_runtime_data(cpu);
   1405		init_ghcb(cpu);
   1406	}
   1407
   1408	sev_es_setup_play_dead();
   1409
   1410	/* Secondary CPUs use the runtime #VC handler */
   1411	initial_vc_handler = (unsigned long)kernel_exc_vmm_communication;
   1412}
   1413
   1414static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
   1415{
   1416	int trapnr = ctxt->fi.vector;
   1417
   1418	if (trapnr == X86_TRAP_PF)
   1419		native_write_cr2(ctxt->fi.cr2);
   1420
   1421	ctxt->regs->orig_ax = ctxt->fi.error_code;
   1422	do_early_exception(ctxt->regs, trapnr);
   1423}
   1424
   1425static long *vc_insn_get_rm(struct es_em_ctxt *ctxt)
   1426{
   1427	long *reg_array;
   1428	int offset;
   1429
   1430	reg_array = (long *)ctxt->regs;
   1431	offset    = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs);
   1432
   1433	if (offset < 0)
   1434		return NULL;
   1435
   1436	offset /= sizeof(long);
   1437
   1438	return reg_array + offset;
   1439}
   1440static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
   1441				 unsigned int bytes, bool read)
   1442{
   1443	u64 exit_code, exit_info_1, exit_info_2;
   1444	unsigned long ghcb_pa = __pa(ghcb);
   1445	enum es_result res;
   1446	phys_addr_t paddr;
   1447	void __user *ref;
   1448
   1449	ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs);
   1450	if (ref == (void __user *)-1L)
   1451		return ES_UNSUPPORTED;
   1452
   1453	exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
   1454
   1455	res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr);
   1456	if (res != ES_OK) {
   1457		if (res == ES_EXCEPTION && !read)
   1458			ctxt->fi.error_code |= X86_PF_WRITE;
   1459
   1460		return res;
   1461	}
   1462
   1463	exit_info_1 = paddr;
   1464	/* Can never be greater than 8 */
   1465	exit_info_2 = bytes;
   1466
   1467	ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
   1468
   1469	return sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, exit_info_1, exit_info_2);
   1470}
   1471
   1472/*
   1473 * The MOVS instruction has two memory operands, which raises the
   1474 * problem that it is not known whether the access to the source or the
   1475 * destination caused the #VC exception (and hence whether an MMIO read
   1476 * or write operation needs to be emulated).
   1477 *
   1478 * Instead of playing games with walking page-tables and trying to guess
   1479 * whether the source or destination is an MMIO range, split the move
   1480 * into two operations, a read and a write with only one memory operand.
   1481 * This will cause a nested #VC exception on the MMIO address which can
   1482 * then be handled.
   1483 *
   1484 * This implementation has the benefit that it also supports MOVS where
   1485 * source _and_ destination are MMIO regions.
   1486 *
   1487 * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a
   1488 * rare operation. If it turns out to be a performance problem the split
   1489 * operations can be moved to memcpy_fromio() and memcpy_toio().
   1490 */
   1491static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
   1492					  unsigned int bytes)
   1493{
   1494	unsigned long ds_base, es_base;
   1495	unsigned char *src, *dst;
   1496	unsigned char buffer[8];
   1497	enum es_result ret;
   1498	bool rep;
   1499	int off;
   1500
   1501	ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS);
   1502	es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
   1503
   1504	if (ds_base == -1L || es_base == -1L) {
   1505		ctxt->fi.vector = X86_TRAP_GP;
   1506		ctxt->fi.error_code = 0;
   1507		return ES_EXCEPTION;
   1508	}
   1509
   1510	src = ds_base + (unsigned char *)ctxt->regs->si;
   1511	dst = es_base + (unsigned char *)ctxt->regs->di;
   1512
   1513	ret = vc_read_mem(ctxt, src, buffer, bytes);
   1514	if (ret != ES_OK)
   1515		return ret;
   1516
   1517	ret = vc_write_mem(ctxt, dst, buffer, bytes);
   1518	if (ret != ES_OK)
   1519		return ret;
   1520
   1521	if (ctxt->regs->flags & X86_EFLAGS_DF)
   1522		off = -bytes;
   1523	else
   1524		off =  bytes;
   1525
   1526	ctxt->regs->si += off;
   1527	ctxt->regs->di += off;
   1528
   1529	rep = insn_has_rep_prefix(&ctxt->insn);
   1530	if (rep)
   1531		ctxt->regs->cx -= 1;
   1532
   1533	if (!rep || ctxt->regs->cx == 0)
   1534		return ES_OK;
   1535	else
   1536		return ES_RETRY;
   1537}
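/*
 * For example, a REP MOVSB with RCX = 16 and an MMIO destination takes this
 * path 16 times: each pass moves one byte via vc_read_mem()/vc_write_mem(),
 * adjusts RSI/RDI, decrements RCX and returns ES_RETRY so the instruction is
 * replayed; only when RCX reaches zero does ES_OK let execution move on.
 */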
   1538
   1539static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
   1540{
   1541	struct insn *insn = &ctxt->insn;
   1542	unsigned int bytes = 0;
   1543	enum mmio_type mmio;
   1544	enum es_result ret;
   1545	u8 sign_byte;
   1546	long *reg_data;
   1547
   1548	mmio = insn_decode_mmio(insn, &bytes);
   1549	if (mmio == MMIO_DECODE_FAILED)
   1550		return ES_DECODE_FAILED;
   1551
   1552	if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
   1553		reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs);
   1554		if (!reg_data)
   1555			return ES_DECODE_FAILED;
   1556	}
   1557
   1558	switch (mmio) {
   1559	case MMIO_WRITE:
   1560		memcpy(ghcb->shared_buffer, reg_data, bytes);
   1561		ret = vc_do_mmio(ghcb, ctxt, bytes, false);
   1562		break;
   1563	case MMIO_WRITE_IMM:
   1564		memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
   1565		ret = vc_do_mmio(ghcb, ctxt, bytes, false);
   1566		break;
   1567	case MMIO_READ:
   1568		ret = vc_do_mmio(ghcb, ctxt, bytes, true);
   1569		if (ret)
   1570			break;
   1571
   1572		/* Zero-extend for 32-bit operation */
   1573		if (bytes == 4)
   1574			*reg_data = 0;
   1575
   1576		memcpy(reg_data, ghcb->shared_buffer, bytes);
   1577		break;
   1578	case MMIO_READ_ZERO_EXTEND:
   1579		ret = vc_do_mmio(ghcb, ctxt, bytes, true);
   1580		if (ret)
   1581			break;
   1582
   1583		/* Zero extend based on operand size */
   1584		memset(reg_data, 0, insn->opnd_bytes);
   1585		memcpy(reg_data, ghcb->shared_buffer, bytes);
   1586		break;
   1587	case MMIO_READ_SIGN_EXTEND:
   1588		ret = vc_do_mmio(ghcb, ctxt, bytes, true);
   1589		if (ret)
   1590			break;
   1591
   1592		if (bytes == 1) {
   1593			u8 *val = (u8 *)ghcb->shared_buffer;
   1594
   1595			sign_byte = (*val & 0x80) ? 0xff : 0x00;
   1596		} else {
   1597			u16 *val = (u16 *)ghcb->shared_buffer;
   1598
   1599			sign_byte = (*val & 0x8000) ? 0xff : 0x00;
   1600		}
   1601
   1602		/* Sign extend based on operand size */
   1603		memset(reg_data, sign_byte, insn->opnd_bytes);
   1604		memcpy(reg_data, ghcb->shared_buffer, bytes);
   1605		break;
   1606	case MMIO_MOVS:
   1607		ret = vc_handle_mmio_movs(ctxt, bytes);
   1608		break;
   1609	default:
   1610		ret = ES_UNSUPPORTED;
   1611		break;
   1612	}
   1613
   1614	return ret;
   1615}
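/*
 * Example of the sign-extending path above: a 1-byte MMIO read of 0x80 via
 * MOVSX with a 4-byte destination yields sign_byte = 0xff; the memset()
 * fills the low four destination bytes with 0xff and the memcpy() then
 * writes the low byte, leaving 0xffffff80 in the register, matching what
 * the instruction would produce natively.
 */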
   1616
   1617static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
   1618					  struct es_em_ctxt *ctxt)
   1619{
   1620	struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
   1621	long val, *reg = vc_insn_get_rm(ctxt);
   1622	enum es_result ret;
   1623
   1624	if (!reg)
   1625		return ES_DECODE_FAILED;
   1626
   1627	val = *reg;
   1628
   1629	/* Upper 32 bits must be written as zeroes */
   1630	if (val >> 32) {
   1631		ctxt->fi.vector = X86_TRAP_GP;
   1632		ctxt->fi.error_code = 0;
   1633		return ES_EXCEPTION;
   1634	}
   1635
   1636	/* Clear out other reserved bits and set bit 10 */
   1637	val = (val & 0xffff23ffL) | BIT(10);
   1638
   1639	/* Early non-zero writes to DR7 are not supported */
   1640	if (!data && (val & ~DR7_RESET_VALUE))
   1641		return ES_UNSUPPORTED;
   1642
   1643	/* Using a value of 0 for ExitInfo1 means RAX holds the value */
   1644	ghcb_set_rax(ghcb, val);
   1645	ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
   1646	if (ret != ES_OK)
   1647		return ret;
   1648
   1649	if (data)
   1650		data->dr7 = val;
   1651
   1652	return ES_OK;
   1653}
   1654
   1655static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
   1656					 struct es_em_ctxt *ctxt)
   1657{
   1658	struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
   1659	long *reg = vc_insn_get_rm(ctxt);
   1660
   1661	if (!reg)
   1662		return ES_DECODE_FAILED;
   1663
   1664	if (data)
   1665		*reg = data->dr7;
   1666	else
   1667		*reg = DR7_RESET_VALUE;
   1668
   1669	return ES_OK;
   1670}
   1671
   1672static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
   1673				       struct es_em_ctxt *ctxt)
   1674{
   1675	return sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WBINVD, 0, 0);
   1676}
   1677
   1678static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
   1679{
   1680	enum es_result ret;
   1681
   1682	ghcb_set_rcx(ghcb, ctxt->regs->cx);
   1683
   1684	ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_RDPMC, 0, 0);
   1685	if (ret != ES_OK)
   1686		return ret;
   1687
   1688	if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb)))
   1689		return ES_VMM_ERROR;
   1690
   1691	ctxt->regs->ax = ghcb->save.rax;
   1692	ctxt->regs->dx = ghcb->save.rdx;
   1693
   1694	return ES_OK;
   1695}
   1696
   1697static enum es_result vc_handle_monitor(struct ghcb *ghcb,
   1698					struct es_em_ctxt *ctxt)
   1699{
   1700	/*
   1701	 * Treat it as a NOP and do not leak a physical address to the
   1702	 * hypervisor.
   1703	 */
   1704	return ES_OK;
   1705}
   1706
   1707static enum es_result vc_handle_mwait(struct ghcb *ghcb,
   1708				      struct es_em_ctxt *ctxt)
   1709{
   1710	/* Treat the same as MONITOR/MONITORX */
   1711	return ES_OK;
   1712}
   1713
   1714static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
   1715					struct es_em_ctxt *ctxt)
   1716{
   1717	enum es_result ret;
   1718
   1719	ghcb_set_rax(ghcb, ctxt->regs->ax);
   1720	ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0);
   1721
   1722	if (x86_platform.hyper.sev_es_hcall_prepare)
   1723		x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
   1724
   1725	ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_VMMCALL, 0, 0);
   1726	if (ret != ES_OK)
   1727		return ret;
   1728
   1729	if (!ghcb_rax_is_valid(ghcb))
   1730		return ES_VMM_ERROR;
   1731
   1732	ctxt->regs->ax = ghcb->save.rax;
   1733
   1734	/*
   1735	 * Call sev_es_hcall_finish() after regs->ax is already set.
   1736	 * This allows the hypervisor handler to overwrite it again if
   1737	 * necessary.
   1738	 */
   1739	if (x86_platform.hyper.sev_es_hcall_finish &&
   1740	    !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs))
   1741		return ES_VMM_ERROR;
   1742
   1743	return ES_OK;
   1744}
   1745
   1746static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
   1747					struct es_em_ctxt *ctxt)
   1748{
   1749	/*
    1750	 * Calling exc_alignment_check() directly does not work, because it
   1751	 * enables IRQs and the GHCB is active. Forward the exception and call
   1752	 * it later from vc_forward_exception().
   1753	 */
   1754	ctxt->fi.vector = X86_TRAP_AC;
   1755	ctxt->fi.error_code = 0;
   1756	return ES_EXCEPTION;
   1757}
   1758
   1759static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
   1760					 struct ghcb *ghcb,
   1761					 unsigned long exit_code)
   1762{
   1763	enum es_result result;
   1764
   1765	switch (exit_code) {
   1766	case SVM_EXIT_READ_DR7:
   1767		result = vc_handle_dr7_read(ghcb, ctxt);
   1768		break;
   1769	case SVM_EXIT_WRITE_DR7:
   1770		result = vc_handle_dr7_write(ghcb, ctxt);
   1771		break;
   1772	case SVM_EXIT_EXCP_BASE + X86_TRAP_AC:
   1773		result = vc_handle_trap_ac(ghcb, ctxt);
   1774		break;
   1775	case SVM_EXIT_RDTSC:
   1776	case SVM_EXIT_RDTSCP:
   1777		result = vc_handle_rdtsc(ghcb, ctxt, exit_code);
   1778		break;
   1779	case SVM_EXIT_RDPMC:
   1780		result = vc_handle_rdpmc(ghcb, ctxt);
   1781		break;
   1782	case SVM_EXIT_INVD:
   1783		pr_err_ratelimited("#VC exception for INVD??? Seriously???\n");
   1784		result = ES_UNSUPPORTED;
   1785		break;
   1786	case SVM_EXIT_CPUID:
   1787		result = vc_handle_cpuid(ghcb, ctxt);
   1788		break;
   1789	case SVM_EXIT_IOIO:
   1790		result = vc_handle_ioio(ghcb, ctxt);
   1791		break;
   1792	case SVM_EXIT_MSR:
   1793		result = vc_handle_msr(ghcb, ctxt);
   1794		break;
   1795	case SVM_EXIT_VMMCALL:
   1796		result = vc_handle_vmmcall(ghcb, ctxt);
   1797		break;
   1798	case SVM_EXIT_WBINVD:
   1799		result = vc_handle_wbinvd(ghcb, ctxt);
   1800		break;
   1801	case SVM_EXIT_MONITOR:
   1802		result = vc_handle_monitor(ghcb, ctxt);
   1803		break;
   1804	case SVM_EXIT_MWAIT:
   1805		result = vc_handle_mwait(ghcb, ctxt);
   1806		break;
   1807	case SVM_EXIT_NPF:
   1808		result = vc_handle_mmio(ghcb, ctxt);
   1809		break;
   1810	default:
   1811		/*
   1812		 * Unexpected #VC exception
   1813		 */
   1814		result = ES_UNSUPPORTED;
   1815	}
   1816
   1817	return result;
   1818}
   1819
   1820static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
   1821{
   1822	long error_code = ctxt->fi.error_code;
   1823	int trapnr = ctxt->fi.vector;
   1824
   1825	ctxt->regs->orig_ax = ctxt->fi.error_code;
   1826
   1827	switch (trapnr) {
   1828	case X86_TRAP_GP:
   1829		exc_general_protection(ctxt->regs, error_code);
   1830		break;
   1831	case X86_TRAP_UD:
   1832		exc_invalid_op(ctxt->regs);
   1833		break;
   1834	case X86_TRAP_PF:
   1835		write_cr2(ctxt->fi.cr2);
   1836		exc_page_fault(ctxt->regs, error_code);
   1837		break;
   1838	case X86_TRAP_AC:
   1839		exc_alignment_check(ctxt->regs, error_code);
   1840		break;
   1841	default:
   1842		pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
   1843		BUG();
   1844	}
   1845}
   1846
   1847static __always_inline bool is_vc2_stack(unsigned long sp)
   1848{
   1849	return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
   1850}
   1851
   1852static __always_inline bool vc_from_invalid_context(struct pt_regs *regs)
   1853{
   1854	unsigned long sp, prev_sp;
   1855
   1856	sp      = (unsigned long)regs;
   1857	prev_sp = regs->sp;
   1858
   1859	/*
   1860	 * If the code was already executing on the VC2 stack when the #VC
   1861	 * happened, let it proceed to the normal handling routine. This way the
   1862	 * code executing on the VC2 stack can cause #VC exceptions to get handled.
   1863	 */
   1864	return is_vc2_stack(sp) && !is_vc2_stack(prev_sp);
   1865}
   1866
   1867static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code)
   1868{
   1869	struct ghcb_state state;
   1870	struct es_em_ctxt ctxt;
   1871	enum es_result result;
   1872	struct ghcb *ghcb;
   1873	bool ret = true;
   1874
   1875	ghcb = __sev_get_ghcb(&state);
   1876
   1877	vc_ghcb_invalidate(ghcb);
   1878	result = vc_init_em_ctxt(&ctxt, regs, error_code);
   1879
   1880	if (result == ES_OK)
   1881		result = vc_handle_exitcode(&ctxt, ghcb, error_code);
   1882
   1883	__sev_put_ghcb(&state);
   1884
   1885	/* Done - now check the result */
   1886	switch (result) {
   1887	case ES_OK:
   1888		vc_finish_insn(&ctxt);
   1889		break;
   1890	case ES_UNSUPPORTED:
   1891		pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
   1892				   error_code, regs->ip);
   1893		ret = false;
   1894		break;
   1895	case ES_VMM_ERROR:
   1896		pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
   1897				   error_code, regs->ip);
   1898		ret = false;
   1899		break;
   1900	case ES_DECODE_FAILED:
   1901		pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
   1902				   error_code, regs->ip);
   1903		ret = false;
   1904		break;
   1905	case ES_EXCEPTION:
   1906		vc_forward_exception(&ctxt);
   1907		break;
   1908	case ES_RETRY:
   1909		/* Nothing to do */
   1910		break;
   1911	default:
   1912		pr_emerg("Unknown result in %s():%d\n", __func__, result);
   1913		/*
   1914		 * Emulating the instruction which caused the #VC exception
   1915		 * failed - can't continue so print debug information
   1916		 */
   1917		BUG();
   1918	}
   1919
   1920	return ret;
   1921}
   1922
   1923static __always_inline bool vc_is_db(unsigned long error_code)
   1924{
   1925	return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB;
   1926}
   1927
   1928/*
   1929 * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode
   1930 * and will panic when an error happens.
   1931 */
   1932DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
   1933{
   1934	irqentry_state_t irq_state;
   1935
   1936	/*
   1937	 * With the current implementation it is always possible to switch to a
   1938	 * safe stack because #VC exceptions only happen at known places, like
   1939	 * intercepted instructions or accesses to MMIO areas/IO ports. They can
   1940	 * also happen with code instrumentation when the hypervisor intercepts
   1941	 * #DB, but the critical paths are forbidden to be instrumented, so #DB
   1942	 * exceptions currently also only happen in safe places.
   1943	 *
   1944	 * But keep this here in case the noinstr annotations are violated due
   1945	 * to a bug elsewhere.
   1946	 */
   1947	if (unlikely(vc_from_invalid_context(regs))) {
   1948		instrumentation_begin();
   1949		panic("Can't handle #VC exception from unsupported context\n");
   1950		instrumentation_end();
   1951	}
   1952
   1953	/*
   1954	 * Handle #DB before calling into !noinstr code to avoid recursive #DB.
   1955	 */
   1956	if (vc_is_db(error_code)) {
   1957		exc_debug(regs);
   1958		return;
   1959	}
   1960
   1961	irq_state = irqentry_nmi_enter(regs);
   1962
   1963	instrumentation_begin();
   1964
   1965	if (!vc_raw_handle_exception(regs, error_code)) {
   1966		/* Show some debug info */
   1967		show_regs(regs);
   1968
   1969		/* Ask hypervisor to sev_es_terminate */
   1970		sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
   1971
   1972		/* If that fails and we get here - just panic */
   1973		panic("Returned from Terminate-Request to Hypervisor\n");
   1974	}
   1975
   1976	instrumentation_end();
   1977	irqentry_nmi_exit(regs, irq_state);
   1978}
   1979
   1980/*
   1981 * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode
   1982 * and will kill the current task with SIGBUS when an error happens.
   1983 */
   1984DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
   1985{
   1986	/*
   1987	 * Handle #DB before calling into !noinstr code to avoid recursive #DB.
   1988	 */
   1989	if (vc_is_db(error_code)) {
   1990		noist_exc_debug(regs);
   1991		return;
   1992	}
   1993
   1994	irqentry_enter_from_user_mode(regs);
   1995	instrumentation_begin();
   1996
   1997	if (!vc_raw_handle_exception(regs, error_code)) {
   1998		/*
   1999		 * Do not kill the machine if user-space triggered the
   2000		 * exception. Send SIGBUS instead and let user-space deal with
   2001		 * it.
   2002		 */
   2003		force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
   2004	}
   2005
   2006	instrumentation_end();
   2007	irqentry_exit_to_user_mode(regs);
   2008}
   2009
   2010bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
   2011{
   2012	unsigned long exit_code = regs->orig_ax;
   2013	struct es_em_ctxt ctxt;
   2014	enum es_result result;
   2015
   2016	vc_ghcb_invalidate(boot_ghcb);
   2017
   2018	result = vc_init_em_ctxt(&ctxt, regs, exit_code);
   2019	if (result == ES_OK)
   2020		result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
   2021
   2022	/* Done - now check the result */
   2023	switch (result) {
   2024	case ES_OK:
   2025		vc_finish_insn(&ctxt);
   2026		break;
   2027	case ES_UNSUPPORTED:
   2028		early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
   2029				exit_code, regs->ip);
   2030		goto fail;
   2031	case ES_VMM_ERROR:
   2032		early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
   2033				exit_code, regs->ip);
   2034		goto fail;
   2035	case ES_DECODE_FAILED:
   2036		early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
   2037				exit_code, regs->ip);
   2038		goto fail;
   2039	case ES_EXCEPTION:
   2040		vc_early_forward_exception(&ctxt);
   2041		break;
   2042	case ES_RETRY:
   2043		/* Nothing to do */
   2044		break;
   2045	default:
   2046		BUG();
   2047	}
   2048
   2049	return true;
   2050
   2051fail:
   2052	show_regs(regs);
   2053
   2054	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
   2055}
   2056
   2057/*
   2058 * Initial set up of SNP relies on information provided by the
   2059 * Confidential Computing blob, which can be passed to the kernel
   2060 * in the following ways, depending on how it is booted:
   2061 *
   2062 * - when booted via the boot/decompress kernel:
   2063 *   - via boot_params
   2064 *
   2065 * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH):
   2066 *   - via a setup_data entry, as defined by the Linux Boot Protocol
   2067 *
   2068 * Scan for the blob in that order.
   2069 */
   2070static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
   2071{
   2072	struct cc_blob_sev_info *cc_info;
   2073
   2074	/* Boot kernel would have passed the CC blob via boot_params. */
   2075	if (bp->cc_blob_address) {
   2076		cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address;
   2077		goto found_cc_info;
   2078	}
   2079
   2080	/*
   2081	 * If the kernel was booted directly, without the use of the
   2082	 * boot/decompression kernel, the CC blob may have been passed via
   2083	 * setup_data instead.
   2084	 */
   2085	cc_info = find_cc_blob_setup_data(bp);
   2086	if (!cc_info)
   2087		return NULL;
   2088
   2089found_cc_info:
   2090	if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
   2091		snp_abort();
   2092
   2093	return cc_info;
   2094}
   2095
   2096bool __init snp_init(struct boot_params *bp)
   2097{
   2098	struct cc_blob_sev_info *cc_info;
   2099
   2100	if (!bp)
   2101		return false;
   2102
   2103	cc_info = find_cc_blob(bp);
   2104	if (!cc_info)
   2105		return false;
   2106
   2107	setup_cpuid_table(cc_info);
   2108
   2109	/*
   2110	 * The CC blob will be used later to access the secrets page. Cache
   2111	 * it here like the boot kernel does.
   2112	 */
   2113	bp->cc_blob_address = (u32)(unsigned long)cc_info;
   2114
   2115	return true;
   2116}
   2117
   2118void __init snp_abort(void)
   2119{
   2120	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
   2121}
   2122
   2123static void dump_cpuid_table(void)
   2124{
   2125	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
   2126	int i = 0;
   2127
   2128	pr_info("count=%d reserved=0x%x reserved2=0x%llx\n",
   2129		cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2);
   2130
   2131	for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) {
   2132		const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
   2133
   2134		pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n",
   2135			i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx,
   2136			fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved);
   2137	}
   2138}
   2139
   2140/*
   2141 * It is useful from an auditing/testing perspective to provide an easy way
   2142 * for the guest owner to know that the CPUID table has been initialized as
   2143 * expected. That initialization, however, happens too early in boot to print
   2144 * any sort of indicator, and there's not really any other good place to do
   2145 * it, so do it here.
   2146 */
   2147static int __init report_cpuid_table(void)
   2148{
   2149	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
   2150
   2151	if (!cpuid_table->count)
   2152		return 0;
   2153
   2154	pr_info("Using SNP CPUID table, %d entries present.\n",
   2155		cpuid_table->count);
   2156
   2157	if (sev_cfg.debug)
   2158		dump_cpuid_table();
   2159
   2160	return 0;
   2161}
   2162arch_initcall(report_cpuid_table);
   2163
   2164static int __init init_sev_config(char *str)
   2165{
   2166	char *s;
   2167
   2168	while ((s = strsep(&str, ","))) {
   2169		if (!strcmp(s, "debug")) {
   2170			sev_cfg.debug = true;
   2171			continue;
   2172		}
   2173
   2174		pr_info("SEV command-line option '%s' was not recognized\n", s);
   2175	}
   2176
   2177	return 1;
   2178}
   2179__setup("sev=", init_sev_config);
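/*
 * Illustrative note (not part of the original code flow, just an example of
 * how the __setup("sev=", ...) hook above is exercised): options are passed
 * as a comma-separated list on the kernel command line, e.g.
 *
 *	sev=debug
 *
 * which sets sev_cfg.debug and makes report_cpuid_table() above call
 * dump_cpuid_table() to print every SNP CPUID table entry; unrecognized
 * options are only logged and otherwise ignored.
 */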
   2180
   2181int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err)
   2182{
   2183	struct ghcb_state state;
   2184	struct es_em_ctxt ctxt;
   2185	unsigned long flags;
   2186	struct ghcb *ghcb;
   2187	int ret;
   2188
   2189	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
   2190		return -ENODEV;
   2191
   2192	if (!fw_err)
   2193		return -EINVAL;
   2194
   2195	/*
   2196	 * __sev_get_ghcb() needs to run with IRQs disabled because it is using
   2197	 * a per-CPU GHCB.
   2198	 */
   2199	local_irq_save(flags);
   2200
   2201	ghcb = __sev_get_ghcb(&state);
   2202	if (!ghcb) {
   2203		ret = -EIO;
   2204		goto e_restore_irq;
   2205	}
   2206
   2207	vc_ghcb_invalidate(ghcb);
   2208
   2209	if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
   2210		ghcb_set_rax(ghcb, input->data_gpa);
   2211		ghcb_set_rbx(ghcb, input->data_npages);
   2212	}
   2213
   2214	ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, exit_code, input->req_gpa, input->resp_gpa);
   2215	if (ret)
   2216		goto e_put;
   2217
   2218	if (ghcb->save.sw_exit_info_2) {
   2219		/* The number of expected pages is returned in RBX */
   2220		if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST &&
   2221		    ghcb->save.sw_exit_info_2 == SNP_GUEST_REQ_INVALID_LEN)
   2222			input->data_npages = ghcb_get_rbx(ghcb);
   2223
   2224		*fw_err = ghcb->save.sw_exit_info_2;
   2225
   2226		ret = -EIO;
   2227	}
   2228
   2229e_put:
   2230	__sev_put_ghcb(&state);
   2231e_restore_irq:
   2232	local_irq_restore(flags);
   2233
   2234	return ret;
   2235}
   2236EXPORT_SYMBOL_GPL(snp_issue_guest_request);
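/*
 * Illustrative caller sketch (an assumption about how a guest driver such as
 * the "sev-guest" platform driver registered below would use this export;
 * request/response message encryption is omitted here). SVM_VMGEXIT_GUEST_REQUEST
 * is the non-extended counterpart of the SVM_VMGEXIT_EXT_GUEST_REQUEST code
 * handled above:
 *
 *	struct snp_req_data input = {
 *		.req_gpa  = __pa(req),	// guest physical address of the request page
 *		.resp_gpa = __pa(resp),	// guest physical address of the response page
 *	};
 *	unsigned long fw_err = 0;
 *	int rc;
 *
 *	rc = snp_issue_guest_request(SVM_VMGEXIT_GUEST_REQUEST, &input, &fw_err);
 *	if (rc)
 *		pr_err("SNP guest request failed, rc=%d fw_err=0x%lx\n", rc, fw_err);
 */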
   2237
   2238static struct platform_device sev_guest_device = {
   2239	.name		= "sev-guest",
   2240	.id		= -1,
   2241};
   2242
   2243static int __init snp_init_platform_device(void)
   2244{
   2245	struct sev_guest_platform_data data;
   2246	u64 gpa;
   2247
   2248	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
   2249		return -ENODEV;
   2250
   2251	gpa = get_secrets_page();
   2252	if (!gpa)
   2253		return -ENODEV;
   2254
   2255	data.secrets_gpa = gpa;
   2256	if (platform_device_add_data(&sev_guest_device, &data, sizeof(data)))
   2257		return -ENODEV;
   2258
   2259	if (platform_device_register(&sev_guest_device))
   2260		return -ENODEV;
   2261
   2262	pr_info("SNP guest platform device initialized.\n");
   2263	return 0;
   2264}
   2265device_initcall(snp_init_platform_device);
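/*
 * Illustrative note (assumption: the driver that binds to the "sev-guest"
 * platform device retrieves the secrets page GPA from the platform data
 * attached above, roughly like this):
 *
 *	struct sev_guest_platform_data *pdata = dev_get_platdata(&pdev->dev);
 *
 *	if (pdata)
 *		secrets_gpa = pdata->secrets_gpa;
 */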
   2266
   2267#undef pr_fmt
   2268#define pr_fmt(fmt)	"SEV-SNP: " fmt
   2269
   2270static int __snp_enable(unsigned int cpu)
   2271{
   2272	u64 val;
   2273
   2274	if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
   2275		return 0;
   2276
   2277	rdmsrl(MSR_AMD64_SYSCFG, val);
   2278
   2279	val |= MSR_AMD64_SYSCFG_SNP_EN;
   2280	val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN;
   2281
   2282	wrmsrl(MSR_AMD64_SYSCFG, val);
   2283
   2284	return 0;
   2285}
   2286
   2287static __init void snp_enable(void *arg)
   2288{
   2289	__snp_enable(smp_processor_id());
   2290}
   2291
   2292static int __mfdm_enable(unsigned int cpu)
   2293{
   2294	u64 val;
   2295
   2296	if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
   2297		return 0;
   2298
   2299	rdmsrl(MSR_AMD64_SYSCFG, val);
   2300
   2301	val |= MSR_AMD64_SYSCFG_MFDM;
   2302
   2303	wrmsrl(MSR_AMD64_SYSCFG, val);
   2304
   2305	return 0;
   2306}
   2307
   2308static __init void mfdm_enable(void *arg)
   2309{
   2310	__mfdm_enable(smp_processor_id());
   2311}
   2312
   2313static bool get_rmptable_info(u64 *start, u64 *len)
   2314{
   2315	u64 calc_rmp_sz, rmp_sz, rmp_base, rmp_end, nr_pages;
   2316
   2317	rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
   2318	rdmsrl(MSR_AMD64_RMP_END, rmp_end);
   2319
   2320	if (!rmp_base || !rmp_end) {
   2321		pr_info("Memory for the RMP table has not been reserved by BIOS\n");
   2322		return false;
   2323	}
   2324
   2325	rmp_sz = rmp_end - rmp_base + 1;
   2326
   2327	/*
   2328	 * Calculate the amount of memory that must be reserved by the BIOS to
   2329	 * address the full system RAM. The reserved memory should also cover the
   2330	 * RMP table itself.
   2331	 *
   2332	 * See PPR Family 19h Model 01h, Revision B1 section 2.1.4.2 for more
   2333	 * information on the memory requirements.
   2334	 */
   2335	nr_pages = totalram_pages();
   2336	calc_rmp_sz = (((rmp_sz >> PAGE_SHIFT) + nr_pages) << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
   2337
   2338	if (calc_rmp_sz > rmp_sz) {
   2339		pr_info("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
   2340			calc_rmp_sz, rmp_sz);
   2341		return false;
   2342	}
   2343
   2344	*start = rmp_base;
   2345	*len = rmp_sz;
   2346
   2347	pr_info("RMP table physical address 0x%016llx - 0x%016llx\n", rmp_base, rmp_end);
   2348
   2349	return true;
   2350}
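/*
 * Worked example for the calculation above (illustrative numbers only): with
 * 4 GiB of system RAM, totalram_pages() is roughly 1,048,576 4KB pages, and
 * each tracked page needs a 16-byte RMP entry (the "<< 4" above), i.e. about
 * 16 MiB of entries. Add the 16 KiB bookkeeping area
 * (RMPTABLE_CPU_BOOKKEEPING_SZ) plus entries covering the RMP table's own
 * pages (~4096 pages -> ~64 KiB), so the BIOS reservation for such a system
 * must be at least ~16 MiB + 80 KiB.
 */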
   2351
   2352static __init int __snp_rmptable_init(void)
   2353{
   2354	u64 rmp_base, sz;
   2355	void *start;
   2356	u64 val;
   2357
   2358	if (!get_rmptable_info(&rmp_base, &sz))
   2359		return 1;
   2360
   2361	start = memremap(rmp_base, sz, MEMREMAP_WB);
   2362	if (!start) {
   2363		pr_err("Failed to map RMP table 0x%llx+0x%llx\n", rmp_base, sz);
   2364		return 1;
   2365	}
   2366
   2367	/*
   2368	 * Check if SEV-SNP is already enabled; this can happen when we are
   2369	 * coming from a kexec boot.
   2370	 */
   2371	rdmsrl(MSR_AMD64_SYSCFG, val);
   2372	if (val & MSR_AMD64_SYSCFG_SNP_EN)
   2373		goto skip_enable;
   2374
   2375	/* Initialize the RMP table to zero */
   2376	memset(start, 0, sz);
   2377
   2378	/* Flush the caches to ensure that data is written before SNP is enabled. */
   2379	wbinvd_on_all_cpus();
   2380
   2381	/* MFDM must be enabled on all the CPUs prior to enabling SNP. */
   2382	on_each_cpu(mfdm_enable, NULL, 1);
   2383
   2384	/* Enable SNP on all CPUs. */
   2385	on_each_cpu(snp_enable, NULL, 1);
   2386
   2387skip_enable:
   2388	rmptable_start = (unsigned long)start;
   2389	rmptable_end = rmptable_start + sz - 1;
   2390
   2391	return 0;
   2392}
   2393
   2394static int __init snp_rmptable_init(void)
   2395{
   2396	int family, model;
   2397
   2398	if (!boot_cpu_has(X86_FEATURE_SEV_SNP))
   2399		return 0;
   2400
   2401	family = boot_cpu_data.x86;
   2402	model  = boot_cpu_data.x86_model;
   2403
   2404	/*
   2405	 * The RMP table entry format is not architectural; it can vary by processor
   2406	 * and is defined by the per-processor PPR. Restrict SNP support to the known
   2407	 * CPU models and families for which the RMP entry format is currently defined.
   2408	 */
   2409	if (family != 0x19 || model > 0xaf)
   2410		goto nosnp;
   2411
   2412	if (amd_iommu_snp_enable())
   2413		goto nosnp;
   2414
   2415	if (__snp_rmptable_init())
   2416		goto nosnp;
   2417
   2418	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL);
   2419
   2420	return 0;
   2421
   2422nosnp:
   2423	setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
   2424	return 1;
   2425}
   2426
   2427/*
   2428 * This must be called after the PCI subsystem, because before enabling the
   2429 * SNP feature we need to ensure that the IOMMU supports it.
   2430 * amd_iommu_snp_enable() checks for and enables the feature, and it is only
   2431 * available after subsys_initcall().
   2432 */
   2433fs_initcall(snp_rmptable_init);
   2434
   2435static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level)
   2436{
   2437	unsigned long vaddr, paddr = pfn << PAGE_SHIFT;
   2438	struct rmpentry *entry, *large_entry;
   2439
   2440	if (!pfn_valid(pfn))
   2441		return ERR_PTR(-EINVAL);
   2442
   2443	if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
   2444		return ERR_PTR(-ENXIO);
   2445
   2446	vaddr = rmptable_start + rmptable_page_offset(paddr);
   2447	if (unlikely(vaddr > rmptable_end))
   2448		return ERR_PTR(-ENXIO);
   2449
   2450	entry = (struct rmpentry *)vaddr;
   2451
   2452	/* Read the large RMP entry to get the correct page level used in the RMP entry. */
   2453	vaddr = rmptable_start + rmptable_page_offset(paddr & PMD_MASK);
   2454	large_entry = (struct rmpentry *)vaddr;
   2455	*level = RMP_TO_X86_PG_LEVEL(rmpentry_pagesize(large_entry));
   2456
   2457	return entry;
   2458}
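/*
 * Illustrative example of the lookup arithmetic above: for pfn 0x1000
 * (paddr 0x1000000), rmptable_page_offset() yields the 16 KiB bookkeeping
 * area plus paddr >> 8, i.e. 0x4000 + 0x10000 = 0x14000 bytes into the
 * mapped table; since a 4KB page shifted right by 8 is 16 bytes, consecutive
 * pfns land on consecutive 16-byte struct rmpentry slots. The second lookup
 * at (paddr & PMD_MASK) reads the head entry of the surrounding 2MB region,
 * whose page-size field says whether the range is tracked as a 2MB entry.
 */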
   2459
   2460void dump_rmpentry(u64 pfn)
   2461{
   2462	unsigned long pfn_end;
   2463	struct rmpentry *e;
   2464	int level;
   2465
   2466	e = __snp_lookup_rmpentry(pfn, &level);
   2467	if (!e) {
   2468		pr_alert("failed to read RMP entry pfn 0x%llx\n", pfn);
   2469		return;
   2470	}
   2471
   2472	if (rmpentry_assigned(e)) {
   2473		pr_alert("RMPEntry paddr 0x%llx [assigned=%d immutable=%d pagesize=%d gpa=0x%lx"
   2474			" asid=%d vmsa=%d validated=%d]\n", pfn << PAGE_SHIFT,
   2475			rmpentry_assigned(e), rmpentry_immutable(e), rmpentry_pagesize(e),
   2476			rmpentry_gpa(e), rmpentry_asid(e), rmpentry_vmsa(e),
   2477			rmpentry_validated(e));
   2478		return;
   2479	}
   2480
   2481	/*
   2482	 * If the RMP entry at the faulting pfn was not assigned, then we do not
   2483	 * know what caused the RMP violation. To get some useful debug information,
   2484	 * iterate through the entire 2MB region and dump any RMP entries in which
   2485	 * at least one bit is set.
   2486	 */
   2487	pfn = pfn & ~(PTRS_PER_PMD - 1);
   2488	pfn_end = pfn + PTRS_PER_PMD;
   2489
   2490	while (pfn < pfn_end) {
   2491		e = __snp_lookup_rmpentry(pfn, &level);
   2492		if (!e)
   2493			return;
   2494
   2495		if (e->low || e->high)
   2496			pr_alert("RMPEntry paddr 0x%llx: [high=0x%016llx low=0x%016llx]\n",
   2497				 pfn << PAGE_SHIFT, e->high, e->low);
   2498		pfn++;
   2499	}
   2500}
   2501EXPORT_SYMBOL_GPL(dump_rmpentry);
   2502
   2503/*
   2504 * Return 1 if the RMP entry is assigned, 0 if it exists but is not assigned,
   2505 * and -errno if there is no corresponding RMP entry.
   2506 */
   2507int snp_lookup_rmpentry(u64 pfn, int *level)
   2508{
   2509	struct rmpentry *e;
   2510
   2511	e = __snp_lookup_rmpentry(pfn, level);
   2512	if (IS_ERR(e))
   2513		return PTR_ERR(e);
   2514
   2515	return !!rmpentry_assigned(e);
   2516}
   2517EXPORT_SYMBOL_GPL(snp_lookup_rmpentry);
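/*
 * Illustrative usage sketch for the return-value convention documented above
 * (hypothetical caller, e.g. a fault handler checking page ownership):
 *
 *	int level, assigned;
 *
 *	assigned = snp_lookup_rmpentry(pfn, &level);
 *	if (assigned < 0)
 *		return assigned;	// no RMP entry or SNP not enabled
 *	if (assigned)
 *		pr_debug("pfn 0x%llx assigned in RMP at level %d\n", pfn, level);
 */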
   2518
   2519int psmash(u64 pfn)
   2520{
   2521	unsigned long paddr = pfn << PAGE_SHIFT;
   2522	int ret;
   2523
   2524	if (!pfn_valid(pfn))
   2525		return -EINVAL;
   2526
   2527	if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
   2528		return -ENXIO;
   2529
   2530	/* Binutils version 2.36 supports the PSMASH mnemonic. */
   2531	asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
   2532		      : "=a"(ret)
   2533		      : "a"(paddr)
   2534		      : "memory", "cc");
   2535
   2536	return ret;
   2537}
   2538EXPORT_SYMBOL_GPL(psmash);
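/*
 * Illustrative usage sketch (assumption: the caller needs to split a 2MB RMP
 * entry into 512 4KB entries, e.g. before re-assigning a sub-page at 4KB
 * granularity):
 *
 *	u64 pfn_2m = pfn & ~(PTRS_PER_PMD - 1);	// align down to the 2MB boundary
 *	int rc = psmash(pfn_2m);
 *
 *	if (rc)
 *		pr_err("psmash failed for pfn 0x%llx, rc=%d\n", pfn_2m, rc);
 */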
   2539
   2540static int restore_direct_map(u64 pfn, int npages)
   2541{
   2542	int i, ret = 0;
   2543
   2544	for (i = 0; i < npages; i++) {
   2545		ret = set_direct_map_default_noflush(pfn_to_page(pfn + i));
   2546		if (ret)
   2547			goto cleanup;
   2548	}
   2549
   2550cleanup:
   2551	WARN(ret, "Failed to restore direct map for pfn 0x%llx\n", pfn + i);
   2552	return ret;
   2553}
   2554
   2555static int invalid_direct_map(unsigned long pfn, int npages)
   2556{
   2557	int i, ret = 0;
   2558
   2559	for (i = 0; i < npages; i++) {
   2560		ret = set_direct_map_invalid_noflush(pfn_to_page(pfn + i));
   2561		if (ret)
   2562			goto cleanup;
   2563	}
   2564
   2565	return 0;
   2566
   2567cleanup:
   2568	restore_direct_map(pfn, i);
   2569	return ret;
   2570}
   2571
   2572static int rmpupdate(u64 pfn, struct rmpupdate *val)
   2573{
   2574	unsigned long paddr = pfn << PAGE_SHIFT;
   2575	int ret, level, npages;
   2576	int retries = 0;
   2577
   2578	if (!pfn_valid(pfn))
   2579		return -EINVAL;
   2580
   2581	if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
   2582		return -ENXIO;
   2583
   2584	level = RMP_TO_X86_PG_LEVEL(val->pagesize);
   2585	npages = page_level_size(level) / PAGE_SIZE;
   2586
   2587	/*
   2588	 * If the page is being assigned in the RMP table, unmap it from the
   2589	 * direct map.
   2590	 */
   2591	if (val->assigned) {
   2592		if (invalid_direct_map(pfn, npages)) {
   2593			pr_err("Failed to unmap pfn 0x%llx pages %d from direct_map\n",
   2594			       pfn, npages);
   2595			return -EFAULT;
   2596		}
   2597	}
   2598
   2599retry:
   2600	/* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
   2601	asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
   2602		     : "=a"(ret)
   2603		     : "a"(paddr), "c"((unsigned long)val)
   2604		     : "memory", "cc");
   2605
   2606	if (ret) {
   2607		if (!retries) {
   2608			pr_err("rmpupdate failed, ret: %d, pfn: %llx, npages: %d, level: %d, retrying (max: %d)...\n",
   2609			       ret, pfn, npages, level, 2 * num_present_cpus());
   2610			dump_stack();
   2611		}
   2612		retries++;
   2613		if (retries < 2 * num_present_cpus())
   2614			goto retry;
   2615	} else if (retries > 0) {
   2616		pr_err("rmpupdate for pfn %llx succeeded after %d retries\n", pfn, retries);
   2617	}
   2618
   2619	/*
   2620	 * Restore the direct map after the page is removed from the RMP table.
   2621	 */
   2622	if (!ret && !val->assigned) {
   2623		if (restore_direct_map(pfn, npages)) {
   2624			pr_err("Failed to map pfn 0x%llx pages %d in direct_map\n",
   2625			       pfn, npages);
   2626			return -EFAULT;
   2627		}
   2628	}
   2629
   2630	return ret;
   2631}
   2632
   2633int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, bool immutable)
   2634{
   2635	struct rmpupdate val;
   2636
   2637	if (!pfn_valid(pfn))
   2638		return -EINVAL;
   2639
   2640	memset(&val, 0, sizeof(val));
   2641	val.assigned = 1;
   2642	val.asid = asid;
   2643	val.immutable = immutable;
   2644	val.gpa = gpa;
   2645	val.pagesize = X86_TO_RMP_PG_LEVEL(level);
   2646
   2647	return rmpupdate(pfn, &val);
   2648}
   2649EXPORT_SYMBOL_GPL(rmp_make_private);
   2650
   2651int rmp_make_shared(u64 pfn, enum pg_level level)
   2652{
   2653	struct rmpupdate val;
   2654
   2655	if (!pfn_valid(pfn))
   2656		return -EINVAL;
   2657
   2658	memset(&val, 0, sizeof(val));
   2659	val.pagesize = X86_TO_RMP_PG_LEVEL(level);
   2660
   2661	return rmpupdate(pfn, &val);
   2662}
   2663EXPORT_SYMBOL_GPL(rmp_make_shared);
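/*
 * Illustrative usage sketch (assumption: a hypervisor-side caller, e.g. KVM's
 * SEV-SNP code, transitioning one 4KB page; pfn, gpa and asid are values that
 * caller would supply):
 *
 *	// Assign the page to the guest ASID before it is used as private memory.
 *	rc = rmp_make_private(pfn, gpa, PG_LEVEL_4K, asid, false);
 *	if (rc)
 *		return rc;
 *	...
 *	// Return the page to the shared (hypervisor-owned) state afterwards.
 *	rc = rmp_make_shared(pfn, PG_LEVEL_4K);
 */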