sev.c (65956B)
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * AMD Memory Encryption Support 4 * 5 * Copyright (C) 2019 SUSE 6 * 7 * Author: Joerg Roedel <jroedel@suse.de> 8 */ 9 10#define pr_fmt(fmt) "SEV: " fmt 11 12#include <linux/sched/debug.h> /* For show_regs() */ 13#include <linux/percpu-defs.h> 14#include <linux/cc_platform.h> 15#include <linux/printk.h> 16#include <linux/mm_types.h> 17#include <linux/set_memory.h> 18#include <linux/memblock.h> 19#include <linux/kernel.h> 20#include <linux/mm.h> 21#include <linux/cpumask.h> 22#include <linux/efi.h> 23#include <linux/platform_device.h> 24#include <linux/io.h> 25#include <linux/cpumask.h> 26#include <linux/amd-iommu.h> 27 28#include <asm/cpu_entry_area.h> 29#include <asm/stacktrace.h> 30#include <asm/sev.h> 31#include <asm/insn-eval.h> 32#include <asm/fpu/xcr.h> 33#include <asm/processor.h> 34#include <asm/realmode.h> 35#include <asm/setup.h> 36#include <asm/traps.h> 37#include <asm/svm.h> 38#include <asm/smp.h> 39#include <asm/cpu.h> 40#include <asm/apic.h> 41#include <asm/cpuid.h> 42#include <asm/cmdline.h> 43 44#define DR7_RESET_VALUE 0x400 45 46/* AP INIT values as documented in the APM2 section "Processor Initialization State" */ 47#define AP_INIT_CS_LIMIT 0xffff 48#define AP_INIT_DS_LIMIT 0xffff 49#define AP_INIT_LDTR_LIMIT 0xffff 50#define AP_INIT_GDTR_LIMIT 0xffff 51#define AP_INIT_IDTR_LIMIT 0xffff 52#define AP_INIT_TR_LIMIT 0xffff 53#define AP_INIT_RFLAGS_DEFAULT 0x2 54#define AP_INIT_DR6_DEFAULT 0xffff0ff0 55#define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL 56#define AP_INIT_XCR0_DEFAULT 0x1 57#define AP_INIT_X87_FTW_DEFAULT 0x5555 58#define AP_INIT_X87_FCW_DEFAULT 0x0040 59#define AP_INIT_CR0_DEFAULT 0x60000010 60#define AP_INIT_MXCSR_DEFAULT 0x1f80 61 62/* 63 * The first 16KB from the RMP_BASE is used by the processor for the 64 * bookkeeping, the range need to be added during the RMP entry lookup. 65 */ 66#define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000 67#define RMPENTRY_SHIFT 8 68#define rmptable_page_offset(x) (RMPTABLE_CPU_BOOKKEEPING_SZ + (((unsigned long)x) >> RMPENTRY_SHIFT)) 69 70/* For early boot hypervisor communication in SEV-ES enabled guests */ 71static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); 72 73/* 74 * Needs to be in the .data section because we need it NULL before bss is 75 * cleared 76 */ 77static struct ghcb *boot_ghcb __section(".data"); 78 79/* Bitmap of SEV features supported by the hypervisor */ 80static u64 sev_hv_features __ro_after_init; 81 82static unsigned long rmptable_start __ro_after_init; 83static unsigned long rmptable_end __ro_after_init; 84 85 86/* #VC handler runtime per-CPU data */ 87struct sev_es_runtime_data { 88 struct ghcb ghcb_page; 89 90 /* 91 * Reserve one page per CPU as backup storage for the unencrypted GHCB. 92 * It is needed when an NMI happens while the #VC handler uses the real 93 * GHCB, and the NMI handler itself is causing another #VC exception. In 94 * that case the GHCB content of the first handler needs to be backed up 95 * and restored. 96 */ 97 struct ghcb backup_ghcb; 98 99 /* 100 * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. 101 * There is no need for it to be atomic, because nothing is written to 102 * the GHCB between the read and the write of ghcb_active. So it is safe 103 * to use it when a nested #VC exception happens before the write. 104 * 105 * This is necessary for example in the #VC->NMI->#VC case when the NMI 106 * happens while the first #VC handler uses the GHCB. 
When the NMI code 107 * raises a second #VC handler it might overwrite the contents of the 108 * GHCB written by the first handler. To avoid this the content of the 109 * GHCB is saved and restored when the GHCB is detected to be in use 110 * already. 111 */ 112 bool ghcb_active; 113 bool backup_ghcb_active; 114 115 /* 116 * Cached DR7 value - write it on DR7 writes and return it on reads. 117 * That value will never make it to the real hardware DR7 as debugging 118 * is currently unsupported in SEV-ES guests. 119 */ 120 unsigned long dr7; 121}; 122 123struct ghcb_state { 124 struct ghcb *ghcb; 125}; 126 127static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); 128DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); 129 130static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); 131 132struct sev_config { 133 __u64 debug : 1, 134 __reserved : 63; 135}; 136 137static struct sev_config sev_cfg __read_mostly; 138 139static __always_inline bool on_vc_stack(struct pt_regs *regs) 140{ 141 unsigned long sp = regs->sp; 142 143 /* User-mode RSP is not trusted */ 144 if (user_mode(regs)) 145 return false; 146 147 /* SYSCALL gap still has user-mode RSP */ 148 if (ip_within_syscall_gap(regs)) 149 return false; 150 151 return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); 152} 153 154/* 155 * This function handles the case when an NMI is raised in the #VC 156 * exception handler entry code, before the #VC handler has switched off 157 * its IST stack. In this case, the IST entry for #VC must be adjusted, 158 * so that any nested #VC exception will not overwrite the stack 159 * contents of the interrupted #VC handler. 160 * 161 * The IST entry is adjusted unconditionally so that it can be also be 162 * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a 163 * nested sev_es_ist_exit() call may adjust back the IST entry too 164 * early. 165 * 166 * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run 167 * on the NMI IST stack, as they are only called from NMI handling code 168 * right now. 169 */ 170void noinstr __sev_es_ist_enter(struct pt_regs *regs) 171{ 172 unsigned long old_ist, new_ist; 173 174 /* Read old IST entry */ 175 new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); 176 177 /* 178 * If NMI happened while on the #VC IST stack, set the new IST 179 * value below regs->sp, so that the interrupted stack frame is 180 * not overwritten by subsequent #VC exceptions. 181 */ 182 if (on_vc_stack(regs)) 183 new_ist = regs->sp; 184 185 /* 186 * Reserve additional 8 bytes and store old IST value so this 187 * adjustment can be unrolled in __sev_es_ist_exit(). 188 */ 189 new_ist -= sizeof(old_ist); 190 *(unsigned long *)new_ist = old_ist; 191 192 /* Set new IST entry */ 193 this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); 194} 195 196void noinstr __sev_es_ist_exit(void) 197{ 198 unsigned long ist; 199 200 /* Read IST entry */ 201 ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); 202 203 if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) 204 return; 205 206 /* Read back old IST entry and write it to the TSS */ 207 this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); 208} 209 210/* 211 * Nothing shall interrupt this code path while holding the per-CPU 212 * GHCB. The backup GHCB is only for NMIs interrupting this path. 213 * 214 * Callers must disable local interrupts around it. 
215 */ 216static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) 217{ 218 struct sev_es_runtime_data *data; 219 struct ghcb *ghcb; 220 221 WARN_ON(!irqs_disabled()); 222 223 data = this_cpu_read(runtime_data); 224 ghcb = &data->ghcb_page; 225 226 if (unlikely(data->ghcb_active)) { 227 /* GHCB is already in use - save its contents */ 228 229 if (unlikely(data->backup_ghcb_active)) { 230 /* 231 * Backup-GHCB is also already in use. There is no way 232 * to continue here so just kill the machine. To make 233 * panic() work, mark GHCBs inactive so that messages 234 * can be printed out. 235 */ 236 data->ghcb_active = false; 237 data->backup_ghcb_active = false; 238 239 instrumentation_begin(); 240 panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); 241 instrumentation_end(); 242 } 243 244 /* Mark backup_ghcb active before writing to it */ 245 data->backup_ghcb_active = true; 246 247 state->ghcb = &data->backup_ghcb; 248 249 /* Backup GHCB content */ 250 *state->ghcb = *ghcb; 251 } else { 252 state->ghcb = NULL; 253 data->ghcb_active = true; 254 } 255 256 return ghcb; 257} 258 259static inline u64 sev_es_rd_ghcb_msr(void) 260{ 261 return __rdmsr(MSR_AMD64_SEV_ES_GHCB); 262} 263 264static __always_inline void sev_es_wr_ghcb_msr(u64 val) 265{ 266 u32 low, high; 267 268 low = (u32)(val); 269 high = (u32)(val >> 32); 270 271 native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); 272} 273 274static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, 275 unsigned char *buffer) 276{ 277 return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); 278} 279 280static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) 281{ 282 char buffer[MAX_INSN_SIZE]; 283 int insn_bytes; 284 285 insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer); 286 if (insn_bytes == 0) { 287 /* Nothing could be copied */ 288 ctxt->fi.vector = X86_TRAP_PF; 289 ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; 290 ctxt->fi.cr2 = ctxt->regs->ip; 291 return ES_EXCEPTION; 292 } else if (insn_bytes == -EINVAL) { 293 /* Effective RIP could not be calculated */ 294 ctxt->fi.vector = X86_TRAP_GP; 295 ctxt->fi.error_code = 0; 296 ctxt->fi.cr2 = 0; 297 return ES_EXCEPTION; 298 } 299 300 if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes)) 301 return ES_DECODE_FAILED; 302 303 if (ctxt->insn.immediate.got) 304 return ES_OK; 305 else 306 return ES_DECODE_FAILED; 307} 308 309static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) 310{ 311 char buffer[MAX_INSN_SIZE]; 312 int res, ret; 313 314 res = vc_fetch_insn_kernel(ctxt, buffer); 315 if (res) { 316 ctxt->fi.vector = X86_TRAP_PF; 317 ctxt->fi.error_code = X86_PF_INSTR; 318 ctxt->fi.cr2 = ctxt->regs->ip; 319 return ES_EXCEPTION; 320 } 321 322 ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); 323 if (ret < 0) 324 return ES_DECODE_FAILED; 325 else 326 return ES_OK; 327} 328 329static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) 330{ 331 if (user_mode(ctxt->regs)) 332 return __vc_decode_user_insn(ctxt); 333 else 334 return __vc_decode_kern_insn(ctxt); 335} 336 337static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, 338 char *dst, char *buf, size_t size) 339{ 340 unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; 341 342 /* 343 * This function uses __put_user() independent of whether kernel or user 344 * memory is accessed. This works fine because __put_user() does no 345 * sanity checks of the pointer being accessed. 
All that it does is 346 * to report when the access failed. 347 * 348 * Also, this function runs in atomic context, so __put_user() is not 349 * allowed to sleep. The page-fault handler detects that it is running 350 * in atomic context and will not try to take mmap_sem and handle the 351 * fault, so additional pagefault_enable()/disable() calls are not 352 * needed. 353 * 354 * The access can't be done via copy_to_user() here because 355 * vc_write_mem() must not use string instructions to access unsafe 356 * memory. The reason is that MOVS is emulated by the #VC handler by 357 * splitting the move up into a read and a write and taking a nested #VC 358 * exception on whatever of them is the MMIO access. Using string 359 * instructions here would cause infinite nesting. 360 */ 361 switch (size) { 362 case 1: { 363 u8 d1; 364 u8 __user *target = (u8 __user *)dst; 365 366 memcpy(&d1, buf, 1); 367 if (__put_user(d1, target)) 368 goto fault; 369 break; 370 } 371 case 2: { 372 u16 d2; 373 u16 __user *target = (u16 __user *)dst; 374 375 memcpy(&d2, buf, 2); 376 if (__put_user(d2, target)) 377 goto fault; 378 break; 379 } 380 case 4: { 381 u32 d4; 382 u32 __user *target = (u32 __user *)dst; 383 384 memcpy(&d4, buf, 4); 385 if (__put_user(d4, target)) 386 goto fault; 387 break; 388 } 389 case 8: { 390 u64 d8; 391 u64 __user *target = (u64 __user *)dst; 392 393 memcpy(&d8, buf, 8); 394 if (__put_user(d8, target)) 395 goto fault; 396 break; 397 } 398 default: 399 WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); 400 return ES_UNSUPPORTED; 401 } 402 403 return ES_OK; 404 405fault: 406 if (user_mode(ctxt->regs)) 407 error_code |= X86_PF_USER; 408 409 ctxt->fi.vector = X86_TRAP_PF; 410 ctxt->fi.error_code = error_code; 411 ctxt->fi.cr2 = (unsigned long)dst; 412 413 return ES_EXCEPTION; 414} 415 416static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, 417 char *src, char *buf, size_t size) 418{ 419 unsigned long error_code = X86_PF_PROT; 420 421 /* 422 * This function uses __get_user() independent of whether kernel or user 423 * memory is accessed. This works fine because __get_user() does no 424 * sanity checks of the pointer being accessed. All that it does is 425 * to report when the access failed. 426 * 427 * Also, this function runs in atomic context, so __get_user() is not 428 * allowed to sleep. The page-fault handler detects that it is running 429 * in atomic context and will not try to take mmap_sem and handle the 430 * fault, so additional pagefault_enable()/disable() calls are not 431 * needed. 432 * 433 * The access can't be done via copy_from_user() here because 434 * vc_read_mem() must not use string instructions to access unsafe 435 * memory. The reason is that MOVS is emulated by the #VC handler by 436 * splitting the move up into a read and a write and taking a nested #VC 437 * exception on whatever of them is the MMIO access. Using string 438 * instructions here would cause infinite nesting. 
439 */ 440 switch (size) { 441 case 1: { 442 u8 d1; 443 u8 __user *s = (u8 __user *)src; 444 445 if (__get_user(d1, s)) 446 goto fault; 447 memcpy(buf, &d1, 1); 448 break; 449 } 450 case 2: { 451 u16 d2; 452 u16 __user *s = (u16 __user *)src; 453 454 if (__get_user(d2, s)) 455 goto fault; 456 memcpy(buf, &d2, 2); 457 break; 458 } 459 case 4: { 460 u32 d4; 461 u32 __user *s = (u32 __user *)src; 462 463 if (__get_user(d4, s)) 464 goto fault; 465 memcpy(buf, &d4, 4); 466 break; 467 } 468 case 8: { 469 u64 d8; 470 u64 __user *s = (u64 __user *)src; 471 if (__get_user(d8, s)) 472 goto fault; 473 memcpy(buf, &d8, 8); 474 break; 475 } 476 default: 477 WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); 478 return ES_UNSUPPORTED; 479 } 480 481 return ES_OK; 482 483fault: 484 if (user_mode(ctxt->regs)) 485 error_code |= X86_PF_USER; 486 487 ctxt->fi.vector = X86_TRAP_PF; 488 ctxt->fi.error_code = error_code; 489 ctxt->fi.cr2 = (unsigned long)src; 490 491 return ES_EXCEPTION; 492} 493 494static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, 495 unsigned long vaddr, phys_addr_t *paddr) 496{ 497 unsigned long va = (unsigned long)vaddr; 498 unsigned int level; 499 phys_addr_t pa; 500 pgd_t *pgd; 501 pte_t *pte; 502 503 pgd = __va(read_cr3_pa()); 504 pgd = &pgd[pgd_index(va)]; 505 pte = lookup_address_in_pgd(pgd, va, &level); 506 if (!pte) { 507 ctxt->fi.vector = X86_TRAP_PF; 508 ctxt->fi.cr2 = vaddr; 509 ctxt->fi.error_code = 0; 510 511 if (user_mode(ctxt->regs)) 512 ctxt->fi.error_code |= X86_PF_USER; 513 514 return ES_EXCEPTION; 515 } 516 517 if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) 518 /* Emulated MMIO to/from encrypted memory not supported */ 519 return ES_UNSUPPORTED; 520 521 pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; 522 pa |= va & ~page_level_mask(level); 523 524 *paddr = pa; 525 526 return ES_OK; 527} 528 529/* Include code shared with pre-decompression boot stage */ 530#include "sev-shared.c" 531 532static noinstr void __sev_put_ghcb(struct ghcb_state *state) 533{ 534 struct sev_es_runtime_data *data; 535 struct ghcb *ghcb; 536 537 WARN_ON(!irqs_disabled()); 538 539 data = this_cpu_read(runtime_data); 540 ghcb = &data->ghcb_page; 541 542 if (state->ghcb) { 543 /* Restore GHCB from Backup */ 544 *ghcb = *state->ghcb; 545 data->backup_ghcb_active = false; 546 state->ghcb = NULL; 547 } else { 548 /* 549 * Invalidate the GHCB so a VMGEXIT instruction issued 550 * from userspace won't appear to be valid. 551 */ 552 vc_ghcb_invalidate(ghcb); 553 data->ghcb_active = false; 554 } 555} 556 557void noinstr __sev_es_nmi_complete(void) 558{ 559 struct ghcb_state state; 560 struct ghcb *ghcb; 561 562 ghcb = __sev_get_ghcb(&state); 563 564 vc_ghcb_invalidate(ghcb); 565 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); 566 ghcb_set_sw_exit_info_1(ghcb, 0); 567 ghcb_set_sw_exit_info_2(ghcb, 0); 568 569 sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); 570 VMGEXIT(); 571 572 __sev_put_ghcb(&state); 573} 574 575static u64 __init get_secrets_page(void) 576{ 577 u64 pa_data = boot_params.cc_blob_address; 578 struct cc_blob_sev_info info; 579 void *map; 580 581 /* 582 * The CC blob contains the address of the secrets page, check if the 583 * blob is present. 
584 */ 585 if (!pa_data) 586 return 0; 587 588 map = early_memremap(pa_data, sizeof(info)); 589 if (!map) { 590 pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n"); 591 return 0; 592 } 593 memcpy(&info, map, sizeof(info)); 594 early_memunmap(map, sizeof(info)); 595 596 /* smoke-test the secrets page passed */ 597 if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) 598 return 0; 599 600 return info.secrets_phys; 601} 602 603static u64 __init get_snp_jump_table_addr(void) 604{ 605 struct snp_secrets_page_layout *layout; 606 void __iomem *mem; 607 u64 pa, addr; 608 609 pa = get_secrets_page(); 610 if (!pa) 611 return 0; 612 613 mem = ioremap_encrypted(pa, PAGE_SIZE); 614 if (!mem) { 615 pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); 616 return 0; 617 } 618 619 layout = (__force struct snp_secrets_page_layout *)mem; 620 621 addr = layout->os_area.ap_jump_table_pa; 622 iounmap(mem); 623 624 return addr; 625} 626 627static u64 __init get_jump_table_addr(void) 628{ 629 struct ghcb_state state; 630 unsigned long flags; 631 struct ghcb *ghcb; 632 u64 ret = 0; 633 634 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 635 return get_snp_jump_table_addr(); 636 637 local_irq_save(flags); 638 639 ghcb = __sev_get_ghcb(&state); 640 641 vc_ghcb_invalidate(ghcb); 642 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); 643 ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); 644 ghcb_set_sw_exit_info_2(ghcb, 0); 645 646 sev_es_wr_ghcb_msr(__pa(ghcb)); 647 VMGEXIT(); 648 649 if (ghcb_sw_exit_info_1_is_valid(ghcb) && 650 ghcb_sw_exit_info_2_is_valid(ghcb)) 651 ret = ghcb->save.sw_exit_info_2; 652 653 __sev_put_ghcb(&state); 654 655 local_irq_restore(flags); 656 657 return ret; 658} 659 660static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate) 661{ 662 unsigned long vaddr_end; 663 int rc; 664 665 vaddr = vaddr & PAGE_MASK; 666 vaddr_end = vaddr + (npages << PAGE_SHIFT); 667 668 while (vaddr < vaddr_end) { 669 rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); 670 if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc)) 671 sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); 672 673 vaddr = vaddr + PAGE_SIZE; 674 } 675} 676 677static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op) 678{ 679 unsigned long paddr_end; 680 u64 val; 681 682 paddr = paddr & PAGE_MASK; 683 paddr_end = paddr + (npages << PAGE_SHIFT); 684 685 while (paddr < paddr_end) { 686 /* 687 * Use the MSR protocol because this function can be called before 688 * the GHCB is established. 689 */ 690 sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); 691 VMGEXIT(); 692 693 val = sev_es_rd_ghcb_msr(); 694 695 if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, 696 "Wrong PSC response code: 0x%x\n", 697 (unsigned int)GHCB_RESP_CODE(val))) 698 goto e_term; 699 700 if (WARN(GHCB_MSR_PSC_RESP_VAL(val), 701 "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", 702 op == SNP_PAGE_STATE_PRIVATE ? 
"private" : "shared", 703 paddr, GHCB_MSR_PSC_RESP_VAL(val))) 704 goto e_term; 705 706 paddr = paddr + PAGE_SIZE; 707 } 708 709 return; 710 711e_term: 712 sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); 713} 714 715void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, 716 unsigned int npages) 717{ 718 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 719 return; 720 721 /* 722 * Ask the hypervisor to mark the memory pages as private in the RMP 723 * table. 724 */ 725 early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE); 726 727 /* Validate the memory pages after they've been added in the RMP table. */ 728 pvalidate_pages(vaddr, npages, true); 729} 730 731void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, 732 unsigned int npages) 733{ 734 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 735 return; 736 737 /* Invalidate the memory pages before they are marked shared in the RMP table. */ 738 pvalidate_pages(vaddr, npages, false); 739 740 /* Ask hypervisor to mark the memory pages shared in the RMP table. */ 741 early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED); 742} 743 744void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) 745{ 746 unsigned long vaddr, npages; 747 748 vaddr = (unsigned long)__va(paddr); 749 npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; 750 751 if (op == SNP_PAGE_STATE_PRIVATE) 752 early_snp_set_memory_private(vaddr, paddr, npages); 753 else if (op == SNP_PAGE_STATE_SHARED) 754 early_snp_set_memory_shared(vaddr, paddr, npages); 755 else 756 WARN(1, "invalid memory op %d\n", op); 757} 758 759static int vmgexit_psc(struct snp_psc_desc *desc) 760{ 761 int cur_entry, end_entry, ret = 0; 762 struct snp_psc_desc *data; 763 struct ghcb_state state; 764 struct es_em_ctxt ctxt; 765 unsigned long flags; 766 struct ghcb *ghcb; 767 768 /* 769 * __sev_get_ghcb() needs to run with IRQs disabled because it is using 770 * a per-CPU GHCB. 771 */ 772 local_irq_save(flags); 773 774 ghcb = __sev_get_ghcb(&state); 775 if (!ghcb) { 776 ret = 1; 777 goto out_unlock; 778 } 779 780 /* Copy the input desc into GHCB shared buffer */ 781 data = (struct snp_psc_desc *)ghcb->shared_buffer; 782 memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); 783 784 /* 785 * As per the GHCB specification, the hypervisor can resume the guest 786 * before processing all the entries. Check whether all the entries 787 * are processed. If not, then keep retrying. Note, the hypervisor 788 * will update the data memory directly to indicate the status, so 789 * reference the data->hdr everywhere. 790 * 791 * The strategy here is to wait for the hypervisor to change the page 792 * state in the RMP table before guest accesses the memory pages. If the 793 * page state change was not successful, then later memory access will 794 * result in a crash. 795 */ 796 cur_entry = data->hdr.cur_entry; 797 end_entry = data->hdr.end_entry; 798 799 while (data->hdr.cur_entry <= data->hdr.end_entry) { 800 ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); 801 802 /* This will advance the shared buffer data points to. */ 803 ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, SVM_VMGEXIT_PSC, 0, 0); 804 805 /* 806 * Page State Change VMGEXIT can pass error code through 807 * exit_info_2. 
808 */ 809 if (WARN(ret || ghcb->save.sw_exit_info_2, 810 "SNP: PSC failed ret=%d exit_info_2=%llx\n", 811 ret, ghcb->save.sw_exit_info_2)) { 812 ret = 1; 813 goto out; 814 } 815 816 /* Verify that reserved bit is not set */ 817 if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { 818 ret = 1; 819 goto out; 820 } 821 822 /* 823 * Sanity check that entry processing is not going backwards. 824 * This will happen only if hypervisor is tricking us. 825 */ 826 if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, 827"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", 828 end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { 829 ret = 1; 830 goto out; 831 } 832 } 833 834out: 835 __sev_put_ghcb(&state); 836 837out_unlock: 838 local_irq_restore(flags); 839 840 return ret; 841} 842 843static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, 844 unsigned long vaddr_end, int op) 845{ 846 struct psc_hdr *hdr; 847 struct psc_entry *e; 848 unsigned long pfn; 849 int i; 850 851 hdr = &data->hdr; 852 e = data->entries; 853 854 memset(data, 0, sizeof(*data)); 855 i = 0; 856 857 while (vaddr < vaddr_end) { 858 if (is_vmalloc_addr((void *)vaddr)) 859 pfn = vmalloc_to_pfn((void *)vaddr); 860 else 861 pfn = __pa(vaddr) >> PAGE_SHIFT; 862 863 e->gfn = pfn; 864 e->operation = op; 865 hdr->end_entry = i; 866 867 /* 868 * Current SNP implementation doesn't keep track of the RMP page 869 * size so use 4K for simplicity. 870 */ 871 e->pagesize = RMP_PG_SIZE_4K; 872 873 vaddr = vaddr + PAGE_SIZE; 874 e++; 875 i++; 876 } 877 878 if (vmgexit_psc(data)) 879 sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); 880} 881 882static void set_pages_state(unsigned long vaddr, unsigned int npages, int op) 883{ 884 unsigned long vaddr_end, next_vaddr; 885 struct snp_psc_desc *desc; 886 887 desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT); 888 if (!desc) 889 panic("SNP: failed to allocate memory for PSC descriptor\n"); 890 891 vaddr = vaddr & PAGE_MASK; 892 vaddr_end = vaddr + (npages << PAGE_SHIFT); 893 894 while (vaddr < vaddr_end) { 895 /* Calculate the last vaddr that fits in one struct snp_psc_desc. */ 896 next_vaddr = min_t(unsigned long, vaddr_end, 897 (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr); 898 899 __set_pages_state(desc, vaddr, next_vaddr, op); 900 901 vaddr = next_vaddr; 902 } 903 904 kfree(desc); 905} 906 907void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) 908{ 909 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 910 return; 911 912 pvalidate_pages(vaddr, npages, false); 913 914 set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); 915} 916 917void snp_set_memory_private(unsigned long vaddr, unsigned int npages) 918{ 919 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 920 return; 921 922 set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); 923 924 pvalidate_pages(vaddr, npages, true); 925} 926 927static int snp_set_vmsa(void *va, bool vmsa) 928{ 929 u64 attrs; 930 931 /* 932 * Running at VMPL0 allows the kernel to change the VMSA bit for a page 933 * using the RMPADJUST instruction. However, for the instruction to 934 * succeed it must target the permissions of a lesser privileged 935 * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST 936 * instruction in the AMD64 APM Volume 3). 
937 */ 938 attrs = 1; 939 if (vmsa) 940 attrs |= RMPADJUST_VMSA_PAGE_BIT; 941 942 return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); 943} 944 945#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) 946#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK) 947#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK) 948 949#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2) 950#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3) 951 952static void *snp_alloc_vmsa_page(void) 953{ 954 struct page *p; 955 956 /* 957 * Allocate VMSA page to work around the SNP erratum where the CPU will 958 * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB) 959 * collides with the RMP entry of VMSA page. The recommended workaround 960 * is to not use a large page. 961 * 962 * Allocate an 8k page which is also 8k-aligned. 963 */ 964 p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); 965 if (!p) 966 return NULL; 967 968 split_page(p, 1); 969 970 /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */ 971 __free_page(p); 972 973 return page_address(p + 1); 974} 975 976static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) 977{ 978 int err; 979 980 err = snp_set_vmsa(vmsa, false); 981 if (err) 982 pr_err("clear VMSA page failed (%u), leaking page\n", err); 983 else 984 free_page((unsigned long)vmsa); 985} 986 987static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip) 988{ 989 struct sev_es_save_area *cur_vmsa, *vmsa; 990 struct ghcb_state state; 991 unsigned long flags; 992 struct ghcb *ghcb; 993 u8 sipi_vector; 994 int cpu, ret; 995 u64 cr4; 996 997 /* 998 * The hypervisor SNP feature support check has happened earlier, just check 999 * the AP_CREATION one here. 1000 */ 1001 if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION)) 1002 return -EOPNOTSUPP; 1003 1004 /* 1005 * Verify the desired start IP against the known trampoline start IP 1006 * to catch any future new trampolines that may be introduced that 1007 * would require a new protected guest entry point. 1008 */ 1009 if (WARN_ONCE(start_ip != real_mode_header->trampoline_start, 1010 "Unsupported SNP start_ip: %lx\n", start_ip)) 1011 return -EINVAL; 1012 1013 /* Override start_ip with known protected guest start IP */ 1014 start_ip = real_mode_header->sev_es_trampoline_start; 1015 1016 /* Find the logical CPU for the APIC ID */ 1017 for_each_present_cpu(cpu) { 1018 if (arch_match_cpu_phys_id(cpu, apic_id)) 1019 break; 1020 } 1021 if (cpu >= nr_cpu_ids) 1022 return -EINVAL; 1023 1024 cur_vmsa = per_cpu(sev_vmsa, cpu); 1025 1026 /* 1027 * A new VMSA is created each time because there is no guarantee that 1028 * the current VMSA is the kernels or that the vCPU is not running. If 1029 * an attempt was done to use the current VMSA with a running vCPU, a 1030 * #VMEXIT of that vCPU would wipe out all of the settings being done 1031 * here. 
1032 */ 1033 vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(); 1034 if (!vmsa) 1035 return -ENOMEM; 1036 1037 /* CR4 should maintain the MCE value */ 1038 cr4 = native_read_cr4() & X86_CR4_MCE; 1039 1040 /* Set the CS value based on the start_ip converted to a SIPI vector */ 1041 sipi_vector = (start_ip >> 12); 1042 vmsa->cs.base = sipi_vector << 12; 1043 vmsa->cs.limit = AP_INIT_CS_LIMIT; 1044 vmsa->cs.attrib = INIT_CS_ATTRIBS; 1045 vmsa->cs.selector = sipi_vector << 8; 1046 1047 /* Set the RIP value based on start_ip */ 1048 vmsa->rip = start_ip & 0xfff; 1049 1050 /* Set AP INIT defaults as documented in the APM */ 1051 vmsa->ds.limit = AP_INIT_DS_LIMIT; 1052 vmsa->ds.attrib = INIT_DS_ATTRIBS; 1053 vmsa->es = vmsa->ds; 1054 vmsa->fs = vmsa->ds; 1055 vmsa->gs = vmsa->ds; 1056 vmsa->ss = vmsa->ds; 1057 1058 vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT; 1059 vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT; 1060 vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS; 1061 vmsa->idtr.limit = AP_INIT_IDTR_LIMIT; 1062 vmsa->tr.limit = AP_INIT_TR_LIMIT; 1063 vmsa->tr.attrib = INIT_TR_ATTRIBS; 1064 1065 vmsa->cr4 = cr4; 1066 vmsa->cr0 = AP_INIT_CR0_DEFAULT; 1067 vmsa->dr7 = DR7_RESET_VALUE; 1068 vmsa->dr6 = AP_INIT_DR6_DEFAULT; 1069 vmsa->rflags = AP_INIT_RFLAGS_DEFAULT; 1070 vmsa->g_pat = AP_INIT_GPAT_DEFAULT; 1071 vmsa->xcr0 = AP_INIT_XCR0_DEFAULT; 1072 vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT; 1073 vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; 1074 vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; 1075 1076 /* SVME must be set. */ 1077 vmsa->efer = EFER_SVME; 1078 1079 /* 1080 * Set the SNP-specific fields for this VMSA: 1081 * VMPL level 1082 * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) 1083 */ 1084 vmsa->vmpl = 0; 1085 vmsa->sev_features = sev_status >> 2; 1086 1087 /* Switch the page over to a VMSA page now that it is initialized */ 1088 ret = snp_set_vmsa(vmsa, true); 1089 if (ret) { 1090 pr_err("set VMSA page failed (%u)\n", ret); 1091 free_page((unsigned long)vmsa); 1092 1093 return -EINVAL; 1094 } 1095 1096 /* Issue VMGEXIT AP Creation NAE event */ 1097 local_irq_save(flags); 1098 1099 ghcb = __sev_get_ghcb(&state); 1100 1101 vc_ghcb_invalidate(ghcb); 1102 ghcb_set_rax(ghcb, vmsa->sev_features); 1103 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); 1104 ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE); 1105 ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); 1106 1107 sev_es_wr_ghcb_msr(__pa(ghcb)); 1108 VMGEXIT(); 1109 1110 if (!ghcb_sw_exit_info_1_is_valid(ghcb) || 1111 lower_32_bits(ghcb->save.sw_exit_info_1)) { 1112 pr_err("SNP AP Creation error\n"); 1113 ret = -EINVAL; 1114 } 1115 1116 __sev_put_ghcb(&state); 1117 1118 local_irq_restore(flags); 1119 1120 /* Perform cleanup if there was an error */ 1121 if (ret) { 1122 snp_cleanup_vmsa(vmsa); 1123 vmsa = NULL; 1124 } 1125 1126 /* Free up any previous VMSA page */ 1127 if (cur_vmsa) 1128 snp_cleanup_vmsa(cur_vmsa); 1129 1130 /* Record the current VMSA page */ 1131 per_cpu(sev_vmsa, cpu) = vmsa; 1132 1133 return ret; 1134} 1135 1136void snp_set_wakeup_secondary_cpu(void) 1137{ 1138 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 1139 return; 1140 1141 /* 1142 * Always set this override if SNP is enabled. This makes it the 1143 * required method to start APs under SNP. If the hypervisor does 1144 * not support AP creation, then no APs will be started. 
1145 */ 1146 apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit; 1147} 1148 1149int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) 1150{ 1151 u16 startup_cs, startup_ip; 1152 phys_addr_t jump_table_pa; 1153 u64 jump_table_addr; 1154 u16 __iomem *jump_table; 1155 1156 jump_table_addr = get_jump_table_addr(); 1157 1158 /* On UP guests there is no jump table so this is not a failure */ 1159 if (!jump_table_addr) 1160 return 0; 1161 1162 /* Check if AP Jump Table is page-aligned */ 1163 if (jump_table_addr & ~PAGE_MASK) 1164 return -EINVAL; 1165 1166 jump_table_pa = jump_table_addr & PAGE_MASK; 1167 1168 startup_cs = (u16)(rmh->trampoline_start >> 4); 1169 startup_ip = (u16)(rmh->sev_es_trampoline_start - 1170 rmh->trampoline_start); 1171 1172 jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); 1173 if (!jump_table) 1174 return -EIO; 1175 1176 writew(startup_ip, &jump_table[0]); 1177 writew(startup_cs, &jump_table[1]); 1178 1179 iounmap(jump_table); 1180 1181 return 0; 1182} 1183 1184/* 1185 * This is needed by the OVMF UEFI firmware which will use whatever it finds in 1186 * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu 1187 * runtime GHCBs used by the kernel are also mapped in the EFI page-table. 1188 */ 1189int __init sev_es_efi_map_ghcbs(pgd_t *pgd) 1190{ 1191 struct sev_es_runtime_data *data; 1192 unsigned long address, pflags; 1193 int cpu; 1194 u64 pfn; 1195 1196 if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) 1197 return 0; 1198 1199 pflags = _PAGE_NX | _PAGE_RW; 1200 1201 for_each_possible_cpu(cpu) { 1202 data = per_cpu(runtime_data, cpu); 1203 1204 address = __pa(&data->ghcb_page); 1205 pfn = address >> PAGE_SHIFT; 1206 1207 if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) 1208 return 1; 1209 } 1210 1211 return 0; 1212} 1213 1214static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) 1215{ 1216 struct pt_regs *regs = ctxt->regs; 1217 enum es_result ret; 1218 u64 exit_info_1; 1219 1220 /* Is it a WRMSR? */ 1221 exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0; 1222 1223 ghcb_set_rcx(ghcb, regs->cx); 1224 if (exit_info_1) { 1225 ghcb_set_rax(ghcb, regs->ax); 1226 ghcb_set_rdx(ghcb, regs->dx); 1227 } 1228 1229 ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_MSR, 1230 exit_info_1, 0); 1231 1232 if ((ret == ES_OK) && (!exit_info_1)) { 1233 regs->ax = ghcb->save.rax; 1234 regs->dx = ghcb->save.rdx; 1235 } 1236 1237 return ret; 1238} 1239 1240static void snp_register_per_cpu_ghcb(void) 1241{ 1242 struct sev_es_runtime_data *data; 1243 struct ghcb *ghcb; 1244 1245 data = this_cpu_read(runtime_data); 1246 ghcb = &data->ghcb_page; 1247 1248 snp_register_ghcb_early(__pa(ghcb)); 1249} 1250 1251void setup_ghcb(void) 1252{ 1253 if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) 1254 return; 1255 1256 /* First make sure the hypervisor talks a supported protocol. */ 1257 if (!sev_es_negotiate_protocol()) 1258 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); 1259 1260 /* 1261 * Check whether the runtime #VC exception handler is active. It uses 1262 * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). 1263 * 1264 * If SNP is active, register the per-CPU GHCB page so that the runtime 1265 * exception handler can use it. 1266 */ 1267 if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) { 1268 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 1269 snp_register_per_cpu_ghcb(); 1270 1271 return; 1272 } 1273 1274 /* 1275 * Clear the boot_ghcb. 
The first exception comes in before the bss 1276 * section is cleared. 1277 */ 1278 memset(&boot_ghcb_page, 0, PAGE_SIZE); 1279 1280 /* Alright - Make the boot-ghcb public */ 1281 boot_ghcb = &boot_ghcb_page; 1282 1283 /* SNP guest requires that GHCB GPA must be registered. */ 1284 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 1285 snp_register_ghcb_early(__pa(&boot_ghcb_page)); 1286} 1287 1288#ifdef CONFIG_HOTPLUG_CPU 1289static void sev_es_ap_hlt_loop(void) 1290{ 1291 struct ghcb_state state; 1292 struct ghcb *ghcb; 1293 1294 ghcb = __sev_get_ghcb(&state); 1295 1296 while (true) { 1297 vc_ghcb_invalidate(ghcb); 1298 ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); 1299 ghcb_set_sw_exit_info_1(ghcb, 0); 1300 ghcb_set_sw_exit_info_2(ghcb, 0); 1301 1302 sev_es_wr_ghcb_msr(__pa(ghcb)); 1303 VMGEXIT(); 1304 1305 /* Wakeup signal? */ 1306 if (ghcb_sw_exit_info_2_is_valid(ghcb) && 1307 ghcb->save.sw_exit_info_2) 1308 break; 1309 } 1310 1311 __sev_put_ghcb(&state); 1312} 1313 1314/* 1315 * Play_dead handler when running under SEV-ES. This is needed because 1316 * the hypervisor can't deliver an SIPI request to restart the AP. 1317 * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the 1318 * hypervisor wakes it up again. 1319 */ 1320static void sev_es_play_dead(void) 1321{ 1322 play_dead_common(); 1323 1324 /* IRQs now disabled */ 1325 1326 sev_es_ap_hlt_loop(); 1327 1328 /* 1329 * If we get here, the VCPU was woken up again. Jump to CPU 1330 * startup code to get it back online. 1331 */ 1332 start_cpu0(); 1333} 1334#else /* CONFIG_HOTPLUG_CPU */ 1335#define sev_es_play_dead native_play_dead 1336#endif /* CONFIG_HOTPLUG_CPU */ 1337 1338#ifdef CONFIG_SMP 1339static void __init sev_es_setup_play_dead(void) 1340{ 1341 smp_ops.play_dead = sev_es_play_dead; 1342} 1343#else 1344static inline void sev_es_setup_play_dead(void) { } 1345#endif 1346 1347static void __init alloc_runtime_data(int cpu) 1348{ 1349 struct sev_es_runtime_data *data; 1350 1351 data = memblock_alloc(sizeof(*data), PAGE_SIZE); 1352 if (!data) 1353 panic("Can't allocate SEV-ES runtime data"); 1354 1355 per_cpu(runtime_data, cpu) = data; 1356} 1357 1358static void __init init_ghcb(int cpu) 1359{ 1360 struct sev_es_runtime_data *data; 1361 int err; 1362 1363 data = per_cpu(runtime_data, cpu); 1364 1365 err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, 1366 sizeof(data->ghcb_page)); 1367 if (err) 1368 panic("Can't map GHCBs unencrypted"); 1369 1370 memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); 1371 1372 data->ghcb_active = false; 1373 data->backup_ghcb_active = false; 1374} 1375 1376void __init sev_es_init_vc_handling(void) 1377{ 1378 int cpu; 1379 1380 BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); 1381 1382 if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) 1383 return; 1384 1385 if (!sev_es_check_cpu_features()) 1386 panic("SEV-ES CPU Features missing"); 1387 1388 /* 1389 * SNP is supported in v2 of the GHCB spec which mandates support for HV 1390 * features. 
1391 */ 1392 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { 1393 sev_hv_features = get_hv_features(); 1394 1395 if (!(sev_hv_features & GHCB_HV_FT_SNP)) 1396 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); 1397 } 1398 1399 /* Enable SEV-ES special handling */ 1400 static_branch_enable(&sev_es_enable_key); 1401 1402 /* Initialize per-cpu GHCB pages */ 1403 for_each_possible_cpu(cpu) { 1404 alloc_runtime_data(cpu); 1405 init_ghcb(cpu); 1406 } 1407 1408 sev_es_setup_play_dead(); 1409 1410 /* Secondary CPUs use the runtime #VC handler */ 1411 initial_vc_handler = (unsigned long)kernel_exc_vmm_communication; 1412} 1413 1414static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) 1415{ 1416 int trapnr = ctxt->fi.vector; 1417 1418 if (trapnr == X86_TRAP_PF) 1419 native_write_cr2(ctxt->fi.cr2); 1420 1421 ctxt->regs->orig_ax = ctxt->fi.error_code; 1422 do_early_exception(ctxt->regs, trapnr); 1423} 1424 1425static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) 1426{ 1427 long *reg_array; 1428 int offset; 1429 1430 reg_array = (long *)ctxt->regs; 1431 offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); 1432 1433 if (offset < 0) 1434 return NULL; 1435 1436 offset /= sizeof(long); 1437 1438 return reg_array + offset; 1439} 1440static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, 1441 unsigned int bytes, bool read) 1442{ 1443 u64 exit_code, exit_info_1, exit_info_2; 1444 unsigned long ghcb_pa = __pa(ghcb); 1445 enum es_result res; 1446 phys_addr_t paddr; 1447 void __user *ref; 1448 1449 ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); 1450 if (ref == (void __user *)-1L) 1451 return ES_UNSUPPORTED; 1452 1453 exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; 1454 1455 res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); 1456 if (res != ES_OK) { 1457 if (res == ES_EXCEPTION && !read) 1458 ctxt->fi.error_code |= X86_PF_WRITE; 1459 1460 return res; 1461 } 1462 1463 exit_info_1 = paddr; 1464 /* Can never be greater than 8 */ 1465 exit_info_2 = bytes; 1466 1467 ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); 1468 1469 return sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, exit_info_1, exit_info_2); 1470} 1471 1472/* 1473 * The MOVS instruction has two memory operands, which raises the 1474 * problem that it is not known whether the access to the source or the 1475 * destination caused the #VC exception (and hence whether an MMIO read 1476 * or write operation needs to be emulated). 1477 * 1478 * Instead of playing games with walking page-tables and trying to guess 1479 * whether the source or destination is an MMIO range, split the move 1480 * into two operations, a read and a write with only one memory operand. 1481 * This will cause a nested #VC exception on the MMIO address which can 1482 * then be handled. 1483 * 1484 * This implementation has the benefit that it also supports MOVS where 1485 * source _and_ destination are MMIO regions. 1486 * 1487 * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a 1488 * rare operation. If it turns out to be a performance problem the split 1489 * operations can be moved to memcpy_fromio() and memcpy_toio(). 
1490 */ 1491static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, 1492 unsigned int bytes) 1493{ 1494 unsigned long ds_base, es_base; 1495 unsigned char *src, *dst; 1496 unsigned char buffer[8]; 1497 enum es_result ret; 1498 bool rep; 1499 int off; 1500 1501 ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); 1502 es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); 1503 1504 if (ds_base == -1L || es_base == -1L) { 1505 ctxt->fi.vector = X86_TRAP_GP; 1506 ctxt->fi.error_code = 0; 1507 return ES_EXCEPTION; 1508 } 1509 1510 src = ds_base + (unsigned char *)ctxt->regs->si; 1511 dst = es_base + (unsigned char *)ctxt->regs->di; 1512 1513 ret = vc_read_mem(ctxt, src, buffer, bytes); 1514 if (ret != ES_OK) 1515 return ret; 1516 1517 ret = vc_write_mem(ctxt, dst, buffer, bytes); 1518 if (ret != ES_OK) 1519 return ret; 1520 1521 if (ctxt->regs->flags & X86_EFLAGS_DF) 1522 off = -bytes; 1523 else 1524 off = bytes; 1525 1526 ctxt->regs->si += off; 1527 ctxt->regs->di += off; 1528 1529 rep = insn_has_rep_prefix(&ctxt->insn); 1530 if (rep) 1531 ctxt->regs->cx -= 1; 1532 1533 if (!rep || ctxt->regs->cx == 0) 1534 return ES_OK; 1535 else 1536 return ES_RETRY; 1537} 1538 1539static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) 1540{ 1541 struct insn *insn = &ctxt->insn; 1542 unsigned int bytes = 0; 1543 enum mmio_type mmio; 1544 enum es_result ret; 1545 u8 sign_byte; 1546 long *reg_data; 1547 1548 mmio = insn_decode_mmio(insn, &bytes); 1549 if (mmio == MMIO_DECODE_FAILED) 1550 return ES_DECODE_FAILED; 1551 1552 if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { 1553 reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); 1554 if (!reg_data) 1555 return ES_DECODE_FAILED; 1556 } 1557 1558 switch (mmio) { 1559 case MMIO_WRITE: 1560 memcpy(ghcb->shared_buffer, reg_data, bytes); 1561 ret = vc_do_mmio(ghcb, ctxt, bytes, false); 1562 break; 1563 case MMIO_WRITE_IMM: 1564 memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); 1565 ret = vc_do_mmio(ghcb, ctxt, bytes, false); 1566 break; 1567 case MMIO_READ: 1568 ret = vc_do_mmio(ghcb, ctxt, bytes, true); 1569 if (ret) 1570 break; 1571 1572 /* Zero-extend for 32-bit operation */ 1573 if (bytes == 4) 1574 *reg_data = 0; 1575 1576 memcpy(reg_data, ghcb->shared_buffer, bytes); 1577 break; 1578 case MMIO_READ_ZERO_EXTEND: 1579 ret = vc_do_mmio(ghcb, ctxt, bytes, true); 1580 if (ret) 1581 break; 1582 1583 /* Zero extend based on operand size */ 1584 memset(reg_data, 0, insn->opnd_bytes); 1585 memcpy(reg_data, ghcb->shared_buffer, bytes); 1586 break; 1587 case MMIO_READ_SIGN_EXTEND: 1588 ret = vc_do_mmio(ghcb, ctxt, bytes, true); 1589 if (ret) 1590 break; 1591 1592 if (bytes == 1) { 1593 u8 *val = (u8 *)ghcb->shared_buffer; 1594 1595 sign_byte = (*val & 0x80) ? 0xff : 0x00; 1596 } else { 1597 u16 *val = (u16 *)ghcb->shared_buffer; 1598 1599 sign_byte = (*val & 0x8000) ? 
0xff : 0x00; 1600 } 1601 1602 /* Sign extend based on operand size */ 1603 memset(reg_data, sign_byte, insn->opnd_bytes); 1604 memcpy(reg_data, ghcb->shared_buffer, bytes); 1605 break; 1606 case MMIO_MOVS: 1607 ret = vc_handle_mmio_movs(ctxt, bytes); 1608 break; 1609 default: 1610 ret = ES_UNSUPPORTED; 1611 break; 1612 } 1613 1614 return ret; 1615} 1616 1617static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, 1618 struct es_em_ctxt *ctxt) 1619{ 1620 struct sev_es_runtime_data *data = this_cpu_read(runtime_data); 1621 long val, *reg = vc_insn_get_rm(ctxt); 1622 enum es_result ret; 1623 1624 if (!reg) 1625 return ES_DECODE_FAILED; 1626 1627 val = *reg; 1628 1629 /* Upper 32 bits must be written as zeroes */ 1630 if (val >> 32) { 1631 ctxt->fi.vector = X86_TRAP_GP; 1632 ctxt->fi.error_code = 0; 1633 return ES_EXCEPTION; 1634 } 1635 1636 /* Clear out other reserved bits and set bit 10 */ 1637 val = (val & 0xffff23ffL) | BIT(10); 1638 1639 /* Early non-zero writes to DR7 are not supported */ 1640 if (!data && (val & ~DR7_RESET_VALUE)) 1641 return ES_UNSUPPORTED; 1642 1643 /* Using a value of 0 for ExitInfo1 means RAX holds the value */ 1644 ghcb_set_rax(ghcb, val); 1645 ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); 1646 if (ret != ES_OK) 1647 return ret; 1648 1649 if (data) 1650 data->dr7 = val; 1651 1652 return ES_OK; 1653} 1654 1655static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, 1656 struct es_em_ctxt *ctxt) 1657{ 1658 struct sev_es_runtime_data *data = this_cpu_read(runtime_data); 1659 long *reg = vc_insn_get_rm(ctxt); 1660 1661 if (!reg) 1662 return ES_DECODE_FAILED; 1663 1664 if (data) 1665 *reg = data->dr7; 1666 else 1667 *reg = DR7_RESET_VALUE; 1668 1669 return ES_OK; 1670} 1671 1672static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, 1673 struct es_em_ctxt *ctxt) 1674{ 1675 return sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WBINVD, 0, 0); 1676} 1677 1678static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) 1679{ 1680 enum es_result ret; 1681 1682 ghcb_set_rcx(ghcb, ctxt->regs->cx); 1683 1684 ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_RDPMC, 0, 0); 1685 if (ret != ES_OK) 1686 return ret; 1687 1688 if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) 1689 return ES_VMM_ERROR; 1690 1691 ctxt->regs->ax = ghcb->save.rax; 1692 ctxt->regs->dx = ghcb->save.rdx; 1693 1694 return ES_OK; 1695} 1696 1697static enum es_result vc_handle_monitor(struct ghcb *ghcb, 1698 struct es_em_ctxt *ctxt) 1699{ 1700 /* 1701 * Treat it as a NOP and do not leak a physical address to the 1702 * hypervisor. 1703 */ 1704 return ES_OK; 1705} 1706 1707static enum es_result vc_handle_mwait(struct ghcb *ghcb, 1708 struct es_em_ctxt *ctxt) 1709{ 1710 /* Treat the same as MONITOR/MONITORX */ 1711 return ES_OK; 1712} 1713 1714static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, 1715 struct es_em_ctxt *ctxt) 1716{ 1717 enum es_result ret; 1718 1719 ghcb_set_rax(ghcb, ctxt->regs->ax); 1720 ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); 1721 1722 if (x86_platform.hyper.sev_es_hcall_prepare) 1723 x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); 1724 1725 ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_VMMCALL, 0, 0); 1726 if (ret != ES_OK) 1727 return ret; 1728 1729 if (!ghcb_rax_is_valid(ghcb)) 1730 return ES_VMM_ERROR; 1731 1732 ctxt->regs->ax = ghcb->save.rax; 1733 1734 /* 1735 * Call sev_es_hcall_finish() after regs->ax is already set. 
1736 * This allows the hypervisor handler to overwrite it again if 1737 * necessary. 1738 */ 1739 if (x86_platform.hyper.sev_es_hcall_finish && 1740 !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) 1741 return ES_VMM_ERROR; 1742 1743 return ES_OK; 1744} 1745 1746static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, 1747 struct es_em_ctxt *ctxt) 1748{ 1749 /* 1750 * Calling ecx_alignment_check() directly does not work, because it 1751 * enables IRQs and the GHCB is active. Forward the exception and call 1752 * it later from vc_forward_exception(). 1753 */ 1754 ctxt->fi.vector = X86_TRAP_AC; 1755 ctxt->fi.error_code = 0; 1756 return ES_EXCEPTION; 1757} 1758 1759static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, 1760 struct ghcb *ghcb, 1761 unsigned long exit_code) 1762{ 1763 enum es_result result; 1764 1765 switch (exit_code) { 1766 case SVM_EXIT_READ_DR7: 1767 result = vc_handle_dr7_read(ghcb, ctxt); 1768 break; 1769 case SVM_EXIT_WRITE_DR7: 1770 result = vc_handle_dr7_write(ghcb, ctxt); 1771 break; 1772 case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: 1773 result = vc_handle_trap_ac(ghcb, ctxt); 1774 break; 1775 case SVM_EXIT_RDTSC: 1776 case SVM_EXIT_RDTSCP: 1777 result = vc_handle_rdtsc(ghcb, ctxt, exit_code); 1778 break; 1779 case SVM_EXIT_RDPMC: 1780 result = vc_handle_rdpmc(ghcb, ctxt); 1781 break; 1782 case SVM_EXIT_INVD: 1783 pr_err_ratelimited("#VC exception for INVD??? Seriously???\n"); 1784 result = ES_UNSUPPORTED; 1785 break; 1786 case SVM_EXIT_CPUID: 1787 result = vc_handle_cpuid(ghcb, ctxt); 1788 break; 1789 case SVM_EXIT_IOIO: 1790 result = vc_handle_ioio(ghcb, ctxt); 1791 break; 1792 case SVM_EXIT_MSR: 1793 result = vc_handle_msr(ghcb, ctxt); 1794 break; 1795 case SVM_EXIT_VMMCALL: 1796 result = vc_handle_vmmcall(ghcb, ctxt); 1797 break; 1798 case SVM_EXIT_WBINVD: 1799 result = vc_handle_wbinvd(ghcb, ctxt); 1800 break; 1801 case SVM_EXIT_MONITOR: 1802 result = vc_handle_monitor(ghcb, ctxt); 1803 break; 1804 case SVM_EXIT_MWAIT: 1805 result = vc_handle_mwait(ghcb, ctxt); 1806 break; 1807 case SVM_EXIT_NPF: 1808 result = vc_handle_mmio(ghcb, ctxt); 1809 break; 1810 default: 1811 /* 1812 * Unexpected #VC exception 1813 */ 1814 result = ES_UNSUPPORTED; 1815 } 1816 1817 return result; 1818} 1819 1820static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) 1821{ 1822 long error_code = ctxt->fi.error_code; 1823 int trapnr = ctxt->fi.vector; 1824 1825 ctxt->regs->orig_ax = ctxt->fi.error_code; 1826 1827 switch (trapnr) { 1828 case X86_TRAP_GP: 1829 exc_general_protection(ctxt->regs, error_code); 1830 break; 1831 case X86_TRAP_UD: 1832 exc_invalid_op(ctxt->regs); 1833 break; 1834 case X86_TRAP_PF: 1835 write_cr2(ctxt->fi.cr2); 1836 exc_page_fault(ctxt->regs, error_code); 1837 break; 1838 case X86_TRAP_AC: 1839 exc_alignment_check(ctxt->regs, error_code); 1840 break; 1841 default: 1842 pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); 1843 BUG(); 1844 } 1845} 1846 1847static __always_inline bool is_vc2_stack(unsigned long sp) 1848{ 1849 return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); 1850} 1851 1852static __always_inline bool vc_from_invalid_context(struct pt_regs *regs) 1853{ 1854 unsigned long sp, prev_sp; 1855 1856 sp = (unsigned long)regs; 1857 prev_sp = regs->sp; 1858 1859 /* 1860 * If the code was already executing on the VC2 stack when the #VC 1861 * happened, let it proceed to the normal handling routine. 
This way the 1862 * code executing on the VC2 stack can cause #VC exceptions to get handled. 1863 */ 1864 return is_vc2_stack(sp) && !is_vc2_stack(prev_sp); 1865} 1866 1867static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) 1868{ 1869 struct ghcb_state state; 1870 struct es_em_ctxt ctxt; 1871 enum es_result result; 1872 struct ghcb *ghcb; 1873 bool ret = true; 1874 1875 ghcb = __sev_get_ghcb(&state); 1876 1877 vc_ghcb_invalidate(ghcb); 1878 result = vc_init_em_ctxt(&ctxt, regs, error_code); 1879 1880 if (result == ES_OK) 1881 result = vc_handle_exitcode(&ctxt, ghcb, error_code); 1882 1883 __sev_put_ghcb(&state); 1884 1885 /* Done - now check the result */ 1886 switch (result) { 1887 case ES_OK: 1888 vc_finish_insn(&ctxt); 1889 break; 1890 case ES_UNSUPPORTED: 1891 pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n", 1892 error_code, regs->ip); 1893 ret = false; 1894 break; 1895 case ES_VMM_ERROR: 1896 pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", 1897 error_code, regs->ip); 1898 ret = false; 1899 break; 1900 case ES_DECODE_FAILED: 1901 pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", 1902 error_code, regs->ip); 1903 ret = false; 1904 break; 1905 case ES_EXCEPTION: 1906 vc_forward_exception(&ctxt); 1907 break; 1908 case ES_RETRY: 1909 /* Nothing to do */ 1910 break; 1911 default: 1912 pr_emerg("Unknown result in %s():%d\n", __func__, result); 1913 /* 1914 * Emulating the instruction which caused the #VC exception 1915 * failed - can't continue so print debug information 1916 */ 1917 BUG(); 1918 } 1919 1920 return ret; 1921} 1922 1923static __always_inline bool vc_is_db(unsigned long error_code) 1924{ 1925 return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; 1926} 1927 1928/* 1929 * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode 1930 * and will panic when an error happens. 1931 */ 1932DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) 1933{ 1934 irqentry_state_t irq_state; 1935 1936 /* 1937 * With the current implementation it is always possible to switch to a 1938 * safe stack because #VC exceptions only happen at known places, like 1939 * intercepted instructions or accesses to MMIO areas/IO ports. They can 1940 * also happen with code instrumentation when the hypervisor intercepts 1941 * #DB, but the critical paths are forbidden to be instrumented, so #DB 1942 * exceptions currently also only happen in safe places. 1943 * 1944 * But keep this here in case the noinstr annotations are violated due 1945 * to bug elsewhere. 1946 */ 1947 if (unlikely(vc_from_invalid_context(regs))) { 1948 instrumentation_begin(); 1949 panic("Can't handle #VC exception from unsupported context\n"); 1950 instrumentation_end(); 1951 } 1952 1953 /* 1954 * Handle #DB before calling into !noinstr code to avoid recursive #DB. 
1955 */ 1956 if (vc_is_db(error_code)) { 1957 exc_debug(regs); 1958 return; 1959 } 1960 1961 irq_state = irqentry_nmi_enter(regs); 1962 1963 instrumentation_begin(); 1964 1965 if (!vc_raw_handle_exception(regs, error_code)) { 1966 /* Show some debug info */ 1967 show_regs(regs); 1968 1969 /* Ask hypervisor to sev_es_terminate */ 1970 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); 1971 1972 /* If that fails and we get here - just panic */ 1973 panic("Returned from Terminate-Request to Hypervisor\n"); 1974 } 1975 1976 instrumentation_end(); 1977 irqentry_nmi_exit(regs, irq_state); 1978} 1979 1980/* 1981 * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode 1982 * and will kill the current task with SIGBUS when an error happens. 1983 */ 1984DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) 1985{ 1986 /* 1987 * Handle #DB before calling into !noinstr code to avoid recursive #DB. 1988 */ 1989 if (vc_is_db(error_code)) { 1990 noist_exc_debug(regs); 1991 return; 1992 } 1993 1994 irqentry_enter_from_user_mode(regs); 1995 instrumentation_begin(); 1996 1997 if (!vc_raw_handle_exception(regs, error_code)) { 1998 /* 1999 * Do not kill the machine if user-space triggered the 2000 * exception. Send SIGBUS instead and let user-space deal with 2001 * it. 2002 */ 2003 force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); 2004 } 2005 2006 instrumentation_end(); 2007 irqentry_exit_to_user_mode(regs); 2008} 2009 2010bool __init handle_vc_boot_ghcb(struct pt_regs *regs) 2011{ 2012 unsigned long exit_code = regs->orig_ax; 2013 struct es_em_ctxt ctxt; 2014 enum es_result result; 2015 2016 vc_ghcb_invalidate(boot_ghcb); 2017 2018 result = vc_init_em_ctxt(&ctxt, regs, exit_code); 2019 if (result == ES_OK) 2020 result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); 2021 2022 /* Done - now check the result */ 2023 switch (result) { 2024 case ES_OK: 2025 vc_finish_insn(&ctxt); 2026 break; 2027 case ES_UNSUPPORTED: 2028 early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", 2029 exit_code, regs->ip); 2030 goto fail; 2031 case ES_VMM_ERROR: 2032 early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", 2033 exit_code, regs->ip); 2034 goto fail; 2035 case ES_DECODE_FAILED: 2036 early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", 2037 exit_code, regs->ip); 2038 goto fail; 2039 case ES_EXCEPTION: 2040 vc_early_forward_exception(&ctxt); 2041 break; 2042 case ES_RETRY: 2043 /* Nothing to do */ 2044 break; 2045 default: 2046 BUG(); 2047 } 2048 2049 return true; 2050 2051fail: 2052 show_regs(regs); 2053 2054 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); 2055} 2056 2057/* 2058 * Initial set up of SNP relies on information provided by the 2059 * Confidential Computing blob, which can be passed to the kernel 2060 * in the following ways, depending on how it is booted: 2061 * 2062 * - when booted via the boot/decompress kernel: 2063 * - via boot_params 2064 * 2065 * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): 2066 * - via a setup_data entry, as defined by the Linux Boot Protocol 2067 * 2068 * Scan for the blob in that order. 2069 */ 2070static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) 2071{ 2072 struct cc_blob_sev_info *cc_info; 2073 2074 /* Boot kernel would have passed the CC blob via boot_params. 
*/ 2075 if (bp->cc_blob_address) { 2076 cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; 2077 goto found_cc_info; 2078 } 2079 2080 /* 2081 * If kernel was booted directly, without the use of the 2082 * boot/decompression kernel, the CC blob may have been passed via 2083 * setup_data instead. 2084 */ 2085 cc_info = find_cc_blob_setup_data(bp); 2086 if (!cc_info) 2087 return NULL; 2088 2089found_cc_info: 2090 if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) 2091 snp_abort(); 2092 2093 return cc_info; 2094} 2095 2096bool __init snp_init(struct boot_params *bp) 2097{ 2098 struct cc_blob_sev_info *cc_info; 2099 2100 if (!bp) 2101 return false; 2102 2103 cc_info = find_cc_blob(bp); 2104 if (!cc_info) 2105 return false; 2106 2107 setup_cpuid_table(cc_info); 2108 2109 /* 2110 * The CC blob will be used later to access the secrets page. Cache 2111 * it here like the boot kernel does. 2112 */ 2113 bp->cc_blob_address = (u32)(unsigned long)cc_info; 2114 2115 return true; 2116} 2117 2118void __init snp_abort(void) 2119{ 2120 sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); 2121} 2122 2123static void dump_cpuid_table(void) 2124{ 2125 const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); 2126 int i = 0; 2127 2128 pr_info("count=%d reserved=0x%x reserved2=0x%llx\n", 2129 cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2); 2130 2131 for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) { 2132 const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; 2133 2134 pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n", 2135 i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx, 2136 fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved); 2137 } 2138} 2139 2140/* 2141 * It is useful from an auditing/testing perspective to provide an easy way 2142 * for the guest owner to know that the CPUID table has been initialized as 2143 * expected, but that initialization happens too early in boot to print any 2144 * sort of indicator, and there's not really any other good place to do it, 2145 * so do it here. 2146 */ 2147static int __init report_cpuid_table(void) 2148{ 2149 const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); 2150 2151 if (!cpuid_table->count) 2152 return 0; 2153 2154 pr_info("Using SNP CPUID table, %d entries present.\n", 2155 cpuid_table->count); 2156 2157 if (sev_cfg.debug) 2158 dump_cpuid_table(); 2159 2160 return 0; 2161} 2162arch_initcall(report_cpuid_table); 2163 2164static int __init init_sev_config(char *str) 2165{ 2166 char *s; 2167 2168 while ((s = strsep(&str, ","))) { 2169 if (!strcmp(s, "debug")) { 2170 sev_cfg.debug = true; 2171 continue; 2172 } 2173 2174 pr_info("SEV command-line option '%s' was not recognized\n", s); 2175 } 2176 2177 return 1; 2178} 2179__setup("sev=", init_sev_config); 2180 2181int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err) 2182{ 2183 struct ghcb_state state; 2184 struct es_em_ctxt ctxt; 2185 unsigned long flags; 2186 struct ghcb *ghcb; 2187 int ret; 2188 2189 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 2190 return -ENODEV; 2191 2192 if (!fw_err) 2193 return -EINVAL; 2194 2195 /* 2196 * __sev_get_ghcb() needs to run with IRQs disabled because it is using 2197 * a per-CPU GHCB. 
2198 */ 2199 local_irq_save(flags); 2200 2201 ghcb = __sev_get_ghcb(&state); 2202 if (!ghcb) { 2203 ret = -EIO; 2204 goto e_restore_irq; 2205 } 2206 2207 vc_ghcb_invalidate(ghcb); 2208 2209 if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { 2210 ghcb_set_rax(ghcb, input->data_gpa); 2211 ghcb_set_rbx(ghcb, input->data_npages); 2212 } 2213 2214 ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, exit_code, input->req_gpa, input->resp_gpa); 2215 if (ret) 2216 goto e_put; 2217 2218 if (ghcb->save.sw_exit_info_2) { 2219 /* The number of expected pages is returned in RBX */ 2220 if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST && 2221 ghcb->save.sw_exit_info_2 == SNP_GUEST_REQ_INVALID_LEN) 2222 input->data_npages = ghcb_get_rbx(ghcb); 2223 2224 *fw_err = ghcb->save.sw_exit_info_2; 2225 2226 ret = -EIO; 2227 } 2228 2229e_put: 2230 __sev_put_ghcb(&state); 2231e_restore_irq: 2232 local_irq_restore(flags); 2233 2234 return ret; 2235} 2236EXPORT_SYMBOL_GPL(snp_issue_guest_request); 2237 2238static struct platform_device sev_guest_device = { 2239 .name = "sev-guest", 2240 .id = -1, 2241}; 2242 2243static int __init snp_init_platform_device(void) 2244{ 2245 struct sev_guest_platform_data data; 2246 u64 gpa; 2247 2248 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 2249 return -ENODEV; 2250 2251 gpa = get_secrets_page(); 2252 if (!gpa) 2253 return -ENODEV; 2254 2255 data.secrets_gpa = gpa; 2256 if (platform_device_add_data(&sev_guest_device, &data, sizeof(data))) 2257 return -ENODEV; 2258 2259 if (platform_device_register(&sev_guest_device)) 2260 return -ENODEV; 2261 2262 pr_info("SNP guest platform device initialized.\n"); 2263 return 0; 2264} 2265device_initcall(snp_init_platform_device); 2266 2267#undef pr_fmt 2268#define pr_fmt(fmt) "SEV-SNP: " fmt 2269 2270static int __snp_enable(unsigned int cpu) 2271{ 2272 u64 val; 2273 2274 if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) 2275 return 0; 2276 2277 rdmsrl(MSR_AMD64_SYSCFG, val); 2278 2279 val |= MSR_AMD64_SYSCFG_SNP_EN; 2280 val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN; 2281 2282 wrmsrl(MSR_AMD64_SYSCFG, val); 2283 2284 return 0; 2285} 2286 2287static __init void snp_enable(void *arg) 2288{ 2289 __snp_enable(smp_processor_id()); 2290} 2291 2292static int __mfdm_enable(unsigned int cpu) 2293{ 2294 u64 val; 2295 2296 if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) 2297 return 0; 2298 2299 rdmsrl(MSR_AMD64_SYSCFG, val); 2300 2301 val |= MSR_AMD64_SYSCFG_MFDM; 2302 2303 wrmsrl(MSR_AMD64_SYSCFG, val); 2304 2305 return 0; 2306} 2307 2308static __init void mfdm_enable(void *arg) 2309{ 2310 __mfdm_enable(smp_processor_id()); 2311} 2312 2313static bool get_rmptable_info(u64 *start, u64 *len) 2314{ 2315 u64 calc_rmp_sz, rmp_sz, rmp_base, rmp_end, nr_pages; 2316 2317 rdmsrl(MSR_AMD64_RMP_BASE, rmp_base); 2318 rdmsrl(MSR_AMD64_RMP_END, rmp_end); 2319 2320 if (!rmp_base || !rmp_end) { 2321 pr_info("Memory for the RMP table has not been reserved by BIOS\n"); 2322 return false; 2323 } 2324 2325 rmp_sz = rmp_end - rmp_base + 1; 2326 2327 /* 2328 * Calculate the amount of memory that must be reserved by the BIOS to 2329 * address the full system RAM. The reserved memory should also cover the 2330 * RMP table itself. 2331 * 2332 * See PPR Family 19h Model 01h, Revision B1 section 2.1.4.2 for more 2333 * information on memory requirements. 
2334 */ 2335 nr_pages = totalram_pages(); 2336 calc_rmp_sz = (((rmp_sz >> PAGE_SHIFT) + nr_pages) << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ; 2337 2338 if (calc_rmp_sz > rmp_sz) { 2339 pr_info("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n", 2340 calc_rmp_sz, rmp_sz); 2341 return false; 2342 } 2343 2344 *start = rmp_base; 2345 *len = rmp_sz; 2346 2347 pr_info("RMP table physical address 0x%016llx - 0x%016llx\n", rmp_base, rmp_end); 2348 2349 return true; 2350} 2351 2352static __init int __snp_rmptable_init(void) 2353{ 2354 u64 rmp_base, sz; 2355 void *start; 2356 u64 val; 2357 2358 if (!get_rmptable_info(&rmp_base, &sz)) 2359 return 1; 2360 2361 start = memremap(rmp_base, sz, MEMREMAP_WB); 2362 if (!start) { 2363 pr_err("Failed to map RMP table 0x%llx+0x%llx\n", rmp_base, sz); 2364 return 1; 2365 } 2366 2367 /* 2368 * Check if SEV-SNP is already enabled; this can happen if we are coming from 2369 * a kexec boot. 2370 */ 2371 rdmsrl(MSR_AMD64_SYSCFG, val); 2372 if (val & MSR_AMD64_SYSCFG_SNP_EN) 2373 goto skip_enable; 2374 2375 /* Initialize the RMP table to zero */ 2376 memset(start, 0, sz); 2377 2378 /* Flush the caches to ensure that data is written before SNP is enabled. */ 2379 wbinvd_on_all_cpus(); 2380 2381 /* MFDM must be enabled on all the CPUs prior to enabling SNP. */ 2382 on_each_cpu(mfdm_enable, NULL, 1); 2383 2384 /* Enable SNP on all CPUs. */ 2385 on_each_cpu(snp_enable, NULL, 1); 2386 2387skip_enable: 2388 rmptable_start = (unsigned long)start; 2389 rmptable_end = rmptable_start + sz - 1; 2390 2391 return 0; 2392} 2393 2394static int __init snp_rmptable_init(void) 2395{ 2396 int family, model; 2397 2398 if (!boot_cpu_has(X86_FEATURE_SEV_SNP)) 2399 return 0; 2400 2401 family = boot_cpu_data.x86; 2402 model = boot_cpu_data.x86_model; 2403 2404 /* 2405 * The RMP table entry format is not architectural; it can vary by processor and 2406 * is defined by the per-processor PPR. Restrict SNP support to the known CPU 2407 * family and models for which the RMP table entry format is currently defined. 2408 */ 2409 if (family != 0x19 || model > 0xaf) 2410 goto nosnp; 2411 2412 if (amd_iommu_snp_enable()) 2413 goto nosnp; 2414 2415 if (__snp_rmptable_init()) 2416 goto nosnp; 2417 2418 cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL); 2419 2420 return 0; 2421 2422nosnp: 2423 setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); 2424 return 1; 2425} 2426 2427/* 2428 * This must be called after the PCI subsystem because, before enabling the 2429 * SNP feature, we need to ensure that the IOMMU supports SNP. 2430 * amd_iommu_snp_enable() is used for checking and enabling the feature, 2431 * and it is available after subsys_initcall(). 2432 */ 2433fs_initcall(snp_rmptable_init); 2434 2435static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level) 2436{ 2437 unsigned long vaddr, paddr = pfn << PAGE_SHIFT; 2438 struct rmpentry *entry, *large_entry; 2439 2440 if (!pfn_valid(pfn)) 2441 return ERR_PTR(-EINVAL); 2442 2443 if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) 2444 return ERR_PTR(-ENXIO); 2445 2446 vaddr = rmptable_start + rmptable_page_offset(paddr); 2447 if (unlikely(vaddr > rmptable_end)) 2448 return ERR_PTR(-ENXIO); 2449 2450 entry = (struct rmpentry *)vaddr; 2451 2452 /* Read a large RMP entry to get the correct page level used in RMP entry. 
*/ 2453 vaddr = rmptable_start + rmptable_page_offset(paddr & PMD_MASK); 2454 large_entry = (struct rmpentry *)vaddr; 2455 *level = RMP_TO_X86_PG_LEVEL(rmpentry_pagesize(large_entry)); 2456 2457 return entry; 2458} 2459 2460void dump_rmpentry(u64 pfn) 2461{ 2462 unsigned long pfn_end; 2463 struct rmpentry *e; 2464 int level; 2465 2466 e = __snp_lookup_rmpentry(pfn, &level); 2467 if (IS_ERR(e)) { 2468 pr_alert("failed to read RMP entry pfn 0x%llx\n", pfn); 2469 return; 2470 } 2471 2472 if (rmpentry_assigned(e)) { 2473 pr_alert("RMPEntry paddr 0x%llx [assigned=%d immutable=%d pagesize=%d gpa=0x%lx" 2474 " asid=%d vmsa=%d validated=%d]\n", pfn << PAGE_SHIFT, 2475 rmpentry_assigned(e), rmpentry_immutable(e), rmpentry_pagesize(e), 2476 rmpentry_gpa(e), rmpentry_asid(e), rmpentry_vmsa(e), 2477 rmpentry_validated(e)); 2478 return; 2479 } 2480 2481 /* 2482 * If the RMP entry at the faulting pfn was not assigned, then we do not 2483 * know what caused the RMP violation. To get some useful debug information, 2484 * iterate through the entire 2MB region and dump the RMP entries if 2485 * any of the bits in an RMP entry are set. 2486 */ 2487 pfn = pfn & ~(PTRS_PER_PMD - 1); 2488 pfn_end = pfn + PTRS_PER_PMD; 2489 2490 while (pfn < pfn_end) { 2491 e = __snp_lookup_rmpentry(pfn, &level); 2492 if (IS_ERR(e)) 2493 return; 2494 2495 if (e->low || e->high) 2496 pr_alert("RMPEntry paddr 0x%llx: [high=0x%016llx low=0x%016llx]\n", 2497 pfn << PAGE_SHIFT, e->high, e->low); 2498 pfn++; 2499 } 2500} 2501EXPORT_SYMBOL_GPL(dump_rmpentry); 2502 2503/* 2504 * Return 1 if the RMP entry is assigned, 0 if it exists but is not assigned, 2505 * and -errno if there is no corresponding RMP entry. 2506 */ 2507int snp_lookup_rmpentry(u64 pfn, int *level) 2508{ 2509 struct rmpentry *e; 2510 2511 e = __snp_lookup_rmpentry(pfn, level); 2512 if (IS_ERR(e)) 2513 return PTR_ERR(e); 2514 2515 return !!rmpentry_assigned(e); 2516} 2517EXPORT_SYMBOL_GPL(snp_lookup_rmpentry); 2518 2519int psmash(u64 pfn) 2520{ 2521 unsigned long paddr = pfn << PAGE_SHIFT; 2522 int ret; 2523 2524 if (!pfn_valid(pfn)) 2525 return -EINVAL; 2526 2527 if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) 2528 return -ENXIO; 2529 2530 /* Binutils version 2.36 supports the PSMASH mnemonic. 
*/ 2531 asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF" 2532 : "=a"(ret) 2533 : "a"(paddr) 2534 : "memory", "cc"); 2535 2536 return ret; 2537} 2538EXPORT_SYMBOL_GPL(psmash); 2539 2540static int restore_direct_map(u64 pfn, int npages) 2541{ 2542 int i, ret = 0; 2543 2544 for (i = 0; i < npages; i++) { 2545 ret = set_direct_map_default_noflush(pfn_to_page(pfn + i)); 2546 if (ret) 2547 goto cleanup; 2548 } 2549 2550cleanup: 2551 WARN(ret > 0, "Failed to restore direct map for pfn 0x%llx\n", pfn + i); 2552 return ret; 2553} 2554 2555static int invalid_direct_map(unsigned long pfn, int npages) 2556{ 2557 int i, ret = 0; 2558 2559 for (i = 0; i < npages; i++) { 2560 ret = set_direct_map_invalid_noflush(pfn_to_page(pfn + i)); 2561 if (ret) 2562 goto cleanup; 2563 } 2564 2565 return 0; 2566 2567cleanup: 2568 restore_direct_map(pfn, i); 2569 return ret; 2570} 2571 2572static int rmpupdate(u64 pfn, struct rmpupdate *val) 2573{ 2574 unsigned long paddr = pfn << PAGE_SHIFT; 2575 int ret, level, npages; 2576 int retries = 0; 2577 2578 if (!pfn_valid(pfn)) 2579 return -EINVAL; 2580 2581 if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) 2582 return -ENXIO; 2583 2584 level = RMP_TO_X86_PG_LEVEL(val->pagesize); 2585 npages = page_level_size(level) / PAGE_SIZE; 2586 2587 /* 2588 * If page is getting assigned in the RMP table then unmap it from the 2589 * direct map. 2590 */ 2591 if (val->assigned) { 2592 if (invalid_direct_map(pfn, npages)) { 2593 pr_err("Failed to unmap pfn 0x%llx pages %d from direct_map\n", 2594 pfn, npages); 2595 return -EFAULT; 2596 } 2597 } 2598 2599retry: 2600 /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */ 2601 asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE" 2602 : "=a"(ret) 2603 : "a"(paddr), "c"((unsigned long)val) 2604 : "memory", "cc"); 2605 2606 if (ret) { 2607 if (!retries) { 2608 pr_err("rmpupdate failed, ret: %d, pfn: %llx, npages: %d, level: %d, retrying (max: %d)...\n", 2609 ret, pfn, npages, level, 2 * num_present_cpus()); 2610 dump_stack(); 2611 } 2612 retries++; 2613 if (retries < 2 * num_present_cpus()) 2614 goto retry; 2615 } else if (retries > 0) { 2616 pr_err("rmpupdate for pfn %llx succeeded after %d retries\n", pfn, retries); 2617 } 2618 2619 /* 2620 * Restore the direct map after the page is removed from the RMP table. 2621 */ 2622 if (!ret && !val->assigned) { 2623 if (restore_direct_map(pfn, npages)) { 2624 pr_err("Failed to map pfn 0x%llx pages %d in direct_map\n", 2625 pfn, npages); 2626 return -EFAULT; 2627 } 2628 } 2629 2630 return ret; 2631} 2632 2633int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, int asid, bool immutable) 2634{ 2635 struct rmpupdate val; 2636 2637 if (!pfn_valid(pfn)) 2638 return -EINVAL; 2639 2640 memset(&val, 0, sizeof(val)); 2641 val.assigned = 1; 2642 val.asid = asid; 2643 val.immutable = immutable; 2644 val.gpa = gpa; 2645 val.pagesize = X86_TO_RMP_PG_LEVEL(level); 2646 2647 return rmpupdate(pfn, &val); 2648} 2649EXPORT_SYMBOL_GPL(rmp_make_private); 2650 2651int rmp_make_shared(u64 pfn, enum pg_level level) 2652{ 2653 struct rmpupdate val; 2654 2655 if (!pfn_valid(pfn)) 2656 return -EINVAL; 2657 2658 memset(&val, 0, sizeof(val)); 2659 val.pagesize = X86_TO_RMP_PG_LEVEL(level); 2660 2661 return rmpupdate(pfn, &val); 2662} 2663EXPORT_SYMBOL_GPL(rmp_make_shared);
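
/*
 * Illustrative sketch only -- not part of the original file. It shows how a
 * hypothetical caller might consume the snp_lookup_rmpentry() return
 * convention documented above (1 == assigned, 0 == present but not assigned,
 * -errno == no usable RMP entry). The name example_report_rmp_state() is made
 * up for this sketch.
 */
static void __maybe_unused example_report_rmp_state(u64 pfn)
{
	int level, rc;

	rc = snp_lookup_rmpentry(pfn, &level);
	if (rc < 0) {
		pr_info("no RMP entry for pfn 0x%llx (rc=%d)\n", pfn, rc);
		return;
	}

	pr_info("pfn 0x%llx is %s (x86 page level %d)\n",
		pfn, rc ? "assigned" : "not assigned", level);

	/* For assigned pages, dump_rmpentry() prints the raw entry fields. */
	if (rc)
		dump_rmpentry(pfn);
}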
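
/*
 * Illustrative sketch only -- not part of the original file. It shows the
 * assign/release pairing of rmp_make_private() and rmp_make_shared() for a
 * single 4K page. The gpa and asid values are placeholders that a real
 * caller (e.g. hypervisor code backing guest memory) would supply.
 */
static int __maybe_unused example_assign_then_release(u64 pfn, u64 gpa, int asid)
{
	int ret;

	/* Mark the page as a private, guest-owned 4K page in the RMP table. */
	ret = rmp_make_private(pfn, gpa, PG_LEVEL_4K, asid, false);
	if (ret)
		return ret;

	/* ... the page would be used as private guest memory here ... */

	/* Return the page to the shared (hypervisor-owned) state. */
	return rmp_make_shared(pfn, PG_LEVEL_4K);
}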
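
/*
 * Illustrative sketch only -- not part of the original file. It shows how a
 * hypothetical user of snp_issue_guest_request() might handle the
 * SNP_GUEST_REQ_INVALID_LEN firmware error for extended guest requests:
 * snp_issue_guest_request() has already written the required number of pages
 * back into input->data_npages, so the caller can resize its certificate
 * buffer and retry. Buffer (re)allocation is deliberately elided here.
 */
static int __maybe_unused example_ext_guest_request(struct snp_req_data *input)
{
	unsigned long fw_err = 0;
	int ret;

	ret = snp_issue_guest_request(SVM_VMGEXIT_EXT_GUEST_REQUEST, input, &fw_err);
	if (ret == -EIO && fw_err == SNP_GUEST_REQ_INVALID_LEN) {
		/*
		 * input->data_npages now holds the page count the firmware
		 * expects; a real caller would enlarge the data buffer to
		 * that size and issue the request again.
		 */
		return -ENOSPC;
	}

	return ret;
}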
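
/*
 * Illustrative sketch only -- not part of the original file. PSMASH expands a
 * private 2MB RMP entry into the corresponding set of 4K RMP entries, so the
 * pfn handed to psmash() is expected to be the 2MB-aligned base of the
 * region. The name example_psmash_region() is made up for this sketch.
 */
static int __maybe_unused example_psmash_region(u64 pfn)
{
	/* Align down to the first pfn of the 2MB region (512 x 4K pages). */
	return psmash(ALIGN_DOWN(pfn, PTRS_PER_PMD));
}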