cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

arm.c (55840B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
      4 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
      5 */
      6
      7#include <linux/bug.h>
      8#include <linux/cpu_pm.h>
      9#include <linux/entry-kvm.h>
     10#include <linux/errno.h>
     11#include <linux/err.h>
     12#include <linux/kvm_host.h>
     13#include <linux/list.h>
     14#include <linux/module.h>
     15#include <linux/vmalloc.h>
     16#include <linux/fs.h>
     17#include <linux/mman.h>
     18#include <linux/sched.h>
     19#include <linux/kmemleak.h>
     20#include <linux/kvm.h>
     21#include <linux/kvm_irqfd.h>
     22#include <linux/irqbypass.h>
     23#include <linux/sched/stat.h>
     24#include <linux/psci.h>
     25#include <trace/events/kvm.h>
     26
     27#define CREATE_TRACE_POINTS
     28#include "trace_arm.h"
     29
     30#include <linux/uaccess.h>
     31#include <asm/ptrace.h>
     32#include <asm/mman.h>
     33#include <asm/tlbflush.h>
     34#include <asm/cacheflush.h>
     35#include <asm/cpufeature.h>
     36#include <asm/virt.h>
     37#include <asm/kvm_arm.h>
     38#include <asm/kvm_asm.h>
     39#include <asm/kvm_mmu.h>
     40#include <asm/kvm_emulate.h>
     41#include <asm/sections.h>
     42
     43#include <kvm/arm_hypercalls.h>
     44#include <kvm/arm_pmu.h>
     45#include <kvm/arm_psci.h>
     46
     47static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
     48DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
     49
     50DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
     51
     52static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
     53unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
     54DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
     55
     56static bool vgic_present;
     57
     58static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
     59DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
     60
     61int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
     62{
     63	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
     64}
     65
     66int kvm_arch_hardware_setup(void *opaque)
     67{
     68	return 0;
     69}
     70
     71int kvm_arch_check_processor_compat(void *opaque)
     72{
     73	return 0;
     74}
     75
     76int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
     77			    struct kvm_enable_cap *cap)
     78{
     79	int r;
     80
     81	if (cap->flags)
     82		return -EINVAL;
     83
     84	switch (cap->cap) {
     85	case KVM_CAP_ARM_NISV_TO_USER:
     86		r = 0;
     87		set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
     88			&kvm->arch.flags);
     89		break;
     90	case KVM_CAP_ARM_MTE:
     91		mutex_lock(&kvm->lock);
     92		if (!system_supports_mte() || kvm->created_vcpus) {
     93			r = -EINVAL;
     94		} else {
     95			r = 0;
     96			set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
     97		}
     98		mutex_unlock(&kvm->lock);
     99		break;
    100	case KVM_CAP_ARM_SYSTEM_SUSPEND:
    101		r = 0;
    102		set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
    103		break;
    104	default:
    105		r = -EINVAL;
    106		break;
    107	}
    108
    109	return r;
    110}
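
/*
 * A rough userspace-side sketch of how a VMM might turn on KVM_CAP_ARM_MTE
 * through the KVM_ENABLE_CAP vm ioctl; per the check above this has to happen
 * before any vCPU is created. vm_fd is an assumed descriptor obtained from
 * KVM_CREATE_VM, with <linux/kvm.h> and <sys/ioctl.h> assumed included:
 *
 *	struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_MTE };
 *
 *	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
 *		...;	// MTE unsupported, or a vCPU already exists
 */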
    111
    112static int kvm_arm_default_max_vcpus(void)
    113{
    114	return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
    115}
    116
    117static void set_default_spectre(struct kvm *kvm)
    118{
    119	/*
    120	 * The default is to expose CSV2 == 1 if the HW isn't affected.
    121	 * Although this is a per-CPU feature, we make it global because
    122	 * asymmetric systems are just a nuisance.
    123	 *
    124	 * Userspace can override this as long as it doesn't promise
    125	 * the impossible.
    126	 */
    127	if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED)
    128		kvm->arch.pfr0_csv2 = 1;
    129	if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED)
    130		kvm->arch.pfr0_csv3 = 1;
    131}
    132
    133/**
    134 * kvm_arch_init_vm - initializes a VM data structure
    135 * @kvm:	pointer to the KVM struct
    136 */
    137int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
    138{
    139	int ret;
    140
    141	ret = kvm_arm_setup_stage2(kvm, type);
    142	if (ret)
    143		return ret;
    144
    145	ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu);
    146	if (ret)
    147		return ret;
    148
    149	ret = kvm_share_hyp(kvm, kvm + 1);
    150	if (ret)
    151		goto out_free_stage2_pgd;
    152
    153	if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) {
    154		ret = -ENOMEM;
    155		goto out_free_stage2_pgd;
    156	}
    157	cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);
    158
    159	kvm_vgic_early_init(kvm);
    160
    161	/* The maximum number of VCPUs is limited by the host's GIC model */
    162	kvm->max_vcpus = kvm_arm_default_max_vcpus();
    163
    164	set_default_spectre(kvm);
    165	kvm_arm_init_hypercalls(kvm);
    166
    167	return ret;
    168out_free_stage2_pgd:
    169	kvm_free_stage2_pgd(&kvm->arch.mmu);
    170	return ret;
    171}
    172
    173vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
    174{
    175	return VM_FAULT_SIGBUS;
    176}
    177
    178
    179/**
    180 * kvm_arch_destroy_vm - destroy the VM data structure
    181 * @kvm:	pointer to the KVM struct
    182 */
    183void kvm_arch_destroy_vm(struct kvm *kvm)
    184{
    185	bitmap_free(kvm->arch.pmu_filter);
    186	free_cpumask_var(kvm->arch.supported_cpus);
    187
    188	kvm_vgic_destroy(kvm);
    189
    190	kvm_destroy_vcpus(kvm);
    191
    192	kvm_unshare_hyp(kvm, kvm + 1);
    193}
    194
    195int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
    196{
    197	int r;
    198	switch (ext) {
    199	case KVM_CAP_IRQCHIP:
    200		r = vgic_present;
    201		break;
    202	case KVM_CAP_IOEVENTFD:
    203	case KVM_CAP_DEVICE_CTRL:
    204	case KVM_CAP_USER_MEMORY:
    205	case KVM_CAP_SYNC_MMU:
    206	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
    207	case KVM_CAP_ONE_REG:
    208	case KVM_CAP_ARM_PSCI:
    209	case KVM_CAP_ARM_PSCI_0_2:
    210	case KVM_CAP_READONLY_MEM:
    211	case KVM_CAP_MP_STATE:
    212	case KVM_CAP_IMMEDIATE_EXIT:
    213	case KVM_CAP_VCPU_EVENTS:
    214	case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
    215	case KVM_CAP_ARM_NISV_TO_USER:
    216	case KVM_CAP_ARM_INJECT_EXT_DABT:
    217	case KVM_CAP_SET_GUEST_DEBUG:
    218	case KVM_CAP_VCPU_ATTRIBUTES:
    219	case KVM_CAP_PTP_KVM:
    220	case KVM_CAP_ARM_SYSTEM_SUSPEND:
    221		r = 1;
    222		break;
    223	case KVM_CAP_SET_GUEST_DEBUG2:
    224		return KVM_GUESTDBG_VALID_MASK;
    225	case KVM_CAP_ARM_SET_DEVICE_ADDR:
    226		r = 1;
    227		break;
    228	case KVM_CAP_NR_VCPUS:
    229		/*
     230		 * ARM64 treats KVM_CAP_NR_VCPUS differently from all other
    231		 * architectures, as it does not always bound it to
    232		 * KVM_CAP_MAX_VCPUS. It should not matter much because
    233		 * this is just an advisory value.
    234		 */
    235		r = min_t(unsigned int, num_online_cpus(),
    236			  kvm_arm_default_max_vcpus());
    237		break;
    238	case KVM_CAP_MAX_VCPUS:
    239	case KVM_CAP_MAX_VCPU_ID:
    240		if (kvm)
    241			r = kvm->max_vcpus;
    242		else
    243			r = kvm_arm_default_max_vcpus();
    244		break;
    245	case KVM_CAP_MSI_DEVID:
    246		if (!kvm)
    247			r = -EINVAL;
    248		else
    249			r = kvm->arch.vgic.msis_require_devid;
    250		break;
    251	case KVM_CAP_ARM_USER_IRQ:
    252		/*
    253		 * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
    254		 * (bump this number if adding more devices)
    255		 */
    256		r = 1;
    257		break;
    258	case KVM_CAP_ARM_MTE:
    259		r = system_supports_mte();
    260		break;
    261	case KVM_CAP_STEAL_TIME:
    262		r = kvm_arm_pvtime_supported();
    263		break;
    264	case KVM_CAP_ARM_EL1_32BIT:
    265		r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1);
    266		break;
    267	case KVM_CAP_GUEST_DEBUG_HW_BPS:
    268		r = get_num_brps();
    269		break;
    270	case KVM_CAP_GUEST_DEBUG_HW_WPS:
    271		r = get_num_wrps();
    272		break;
    273	case KVM_CAP_ARM_PMU_V3:
    274		r = kvm_arm_support_pmu_v3();
    275		break;
    276	case KVM_CAP_ARM_INJECT_SERROR_ESR:
    277		r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN);
    278		break;
    279	case KVM_CAP_ARM_VM_IPA_SIZE:
    280		r = get_kvm_ipa_limit();
    281		break;
    282	case KVM_CAP_ARM_SVE:
    283		r = system_supports_sve();
    284		break;
    285	case KVM_CAP_ARM_PTRAUTH_ADDRESS:
    286	case KVM_CAP_ARM_PTRAUTH_GENERIC:
    287		r = system_has_full_ptr_auth();
    288		break;
    289	default:
    290		r = 0;
    291	}
    292
    293	return r;
    294}
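
/*
 * Userspace probes these capabilities with the KVM_CHECK_EXTENSION ioctl,
 * which is accepted on both the /dev/kvm fd and a VM fd. A rough sketch of
 * sizing the guest IPA space before creating the VM (kvm_fd is an assumed
 * open /dev/kvm descriptor):
 *
 *	int ipa = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_VM_IPA_SIZE);
 *	unsigned long type = ipa > 0 ? KVM_VM_TYPE_ARM_IPA_SIZE(ipa) : 0;
 *	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, type);
 */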
    295
    296long kvm_arch_dev_ioctl(struct file *filp,
    297			unsigned int ioctl, unsigned long arg)
    298{
    299	return -EINVAL;
    300}
    301
    302struct kvm *kvm_arch_alloc_vm(void)
    303{
    304	size_t sz = sizeof(struct kvm);
    305
    306	if (!has_vhe())
    307		return kzalloc(sz, GFP_KERNEL_ACCOUNT);
    308
    309	return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);
    310}
    311
    312int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
    313{
    314	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
    315		return -EBUSY;
    316
    317	if (id >= kvm->max_vcpus)
    318		return -EINVAL;
    319
    320	return 0;
    321}
    322
    323int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
    324{
    325	int err;
    326
    327	/* Force users to call KVM_ARM_VCPU_INIT */
    328	vcpu->arch.target = -1;
    329	bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
    330
    331	vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
    332
    333	/* Set up the timer */
    334	kvm_timer_vcpu_init(vcpu);
    335
    336	kvm_pmu_vcpu_init(vcpu);
    337
    338	kvm_arm_reset_debug_ptr(vcpu);
    339
    340	kvm_arm_pvtime_vcpu_init(&vcpu->arch);
    341
    342	vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
    343
    344	err = kvm_vgic_vcpu_init(vcpu);
    345	if (err)
    346		return err;
    347
    348	return kvm_share_hyp(vcpu, vcpu + 1);
    349}
    350
    351void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
    352{
    353}
    354
    355void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
    356{
    357	if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm)))
    358		static_branch_dec(&userspace_irqchip_in_use);
    359
    360	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
    361	kvm_timer_vcpu_terminate(vcpu);
    362	kvm_pmu_vcpu_destroy(vcpu);
    363
    364	kvm_arm_vcpu_destroy(vcpu);
    365}
    366
    367void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
    368{
    369
    370}
    371
    372void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
    373{
    374
    375}
    376
    377void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
    378{
    379	struct kvm_s2_mmu *mmu;
    380	int *last_ran;
    381
    382	mmu = vcpu->arch.hw_mmu;
    383	last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
    384
    385	/*
    386	 * We guarantee that both TLBs and I-cache are private to each
    387	 * vcpu. If detecting that a vcpu from the same VM has
    388	 * previously run on the same physical CPU, call into the
    389	 * hypervisor code to nuke the relevant contexts.
    390	 *
    391	 * We might get preempted before the vCPU actually runs, but
    392	 * over-invalidation doesn't affect correctness.
    393	 */
    394	if (*last_ran != vcpu->vcpu_id) {
    395		kvm_call_hyp(__kvm_flush_cpu_context, mmu);
    396		*last_ran = vcpu->vcpu_id;
    397	}
    398
    399	vcpu->cpu = cpu;
    400
    401	kvm_vgic_load(vcpu);
    402	kvm_timer_vcpu_load(vcpu);
    403	if (has_vhe())
    404		kvm_vcpu_load_sysregs_vhe(vcpu);
    405	kvm_arch_vcpu_load_fp(vcpu);
    406	kvm_vcpu_pmu_restore_guest(vcpu);
    407	if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
    408		kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
    409
    410	if (single_task_running())
    411		vcpu_clear_wfx_traps(vcpu);
    412	else
    413		vcpu_set_wfx_traps(vcpu);
    414
    415	if (vcpu_has_ptrauth(vcpu))
    416		vcpu_ptrauth_disable(vcpu);
    417	kvm_arch_vcpu_load_debug_state_flags(vcpu);
    418
    419	if (!cpumask_test_cpu(smp_processor_id(), vcpu->kvm->arch.supported_cpus))
    420		vcpu_set_on_unsupported_cpu(vcpu);
    421}
    422
    423void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
    424{
    425	kvm_arch_vcpu_put_debug_state_flags(vcpu);
    426	kvm_arch_vcpu_put_fp(vcpu);
    427	if (has_vhe())
    428		kvm_vcpu_put_sysregs_vhe(vcpu);
    429	kvm_timer_vcpu_put(vcpu);
    430	kvm_vgic_put(vcpu);
    431	kvm_vcpu_pmu_restore_host(vcpu);
    432	kvm_arm_vmid_clear_active();
    433
    434	vcpu_clear_on_unsupported_cpu(vcpu);
    435	vcpu->cpu = -1;
    436}
    437
    438void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
    439{
    440	vcpu->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
    441	kvm_make_request(KVM_REQ_SLEEP, vcpu);
    442	kvm_vcpu_kick(vcpu);
    443}
    444
    445bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu)
    446{
    447	return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED;
    448}
    449
    450static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
    451{
    452	vcpu->arch.mp_state.mp_state = KVM_MP_STATE_SUSPENDED;
    453	kvm_make_request(KVM_REQ_SUSPEND, vcpu);
    454	kvm_vcpu_kick(vcpu);
    455}
    456
    457static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu)
    458{
    459	return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_SUSPENDED;
    460}
    461
    462int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
    463				    struct kvm_mp_state *mp_state)
    464{
    465	*mp_state = vcpu->arch.mp_state;
    466
    467	return 0;
    468}
    469
    470int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
    471				    struct kvm_mp_state *mp_state)
    472{
    473	int ret = 0;
    474
    475	switch (mp_state->mp_state) {
    476	case KVM_MP_STATE_RUNNABLE:
    477		vcpu->arch.mp_state = *mp_state;
    478		break;
    479	case KVM_MP_STATE_STOPPED:
    480		kvm_arm_vcpu_power_off(vcpu);
    481		break;
    482	case KVM_MP_STATE_SUSPENDED:
    483		kvm_arm_vcpu_suspend(vcpu);
    484		break;
    485	default:
    486		ret = -EINVAL;
    487	}
    488
    489	return ret;
    490}
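
/*
 * The two handlers above back the KVM_GET_MP_STATE/KVM_SET_MP_STATE vcpu
 * ioctls. A rough sketch of how a VMM might park and then resume a vCPU from
 * userspace (vcpu_fd is an assumed descriptor from KVM_CREATE_VCPU):
 *
 *	struct kvm_mp_state mp = { .mp_state = KVM_MP_STATE_STOPPED };
 *
 *	ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);	// power the vCPU off
 *	mp.mp_state = KVM_MP_STATE_RUNNABLE;
 *	ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);	// make it runnable again
 */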
    491
    492/**
    493 * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
    494 * @v:		The VCPU pointer
    495 *
    496 * If the guest CPU is not waiting for interrupts or an interrupt line is
    497 * asserted, the CPU is by definition runnable.
    498 */
    499int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
    500{
    501	bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
    502	return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
    503		&& !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
    504}
    505
    506bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
    507{
    508	return vcpu_mode_priv(vcpu);
    509}
    510
    511#ifdef CONFIG_GUEST_PERF_EVENTS
    512unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
    513{
    514	return *vcpu_pc(vcpu);
    515}
    516#endif
    517
    518static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
    519{
    520	return vcpu->arch.target >= 0;
    521}
    522
    523/*
     524 * Handle both the initialisation that is done when the vcpu is run
     525 * for the first time and the updates that must be performed each
     526 * time we get a new thread dealing with this vcpu.
    527 */
    528int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
    529{
    530	struct kvm *kvm = vcpu->kvm;
    531	int ret;
    532
    533	if (!kvm_vcpu_initialized(vcpu))
    534		return -ENOEXEC;
    535
    536	if (!kvm_arm_vcpu_is_finalized(vcpu))
    537		return -EPERM;
    538
    539	ret = kvm_arch_vcpu_run_map_fp(vcpu);
    540	if (ret)
    541		return ret;
    542
    543	if (likely(vcpu_has_run_once(vcpu)))
    544		return 0;
    545
    546	kvm_arm_vcpu_init_debug(vcpu);
    547
    548	if (likely(irqchip_in_kernel(kvm))) {
    549		/*
    550		 * Map the VGIC hardware resources before running a vcpu the
    551		 * first time on this VM.
    552		 */
    553		ret = kvm_vgic_map_resources(kvm);
    554		if (ret)
    555			return ret;
    556	}
    557
    558	ret = kvm_timer_enable(vcpu);
    559	if (ret)
    560		return ret;
    561
    562	ret = kvm_arm_pmu_v3_enable(vcpu);
    563	if (ret)
    564		return ret;
    565
    566	if (!irqchip_in_kernel(kvm)) {
    567		/*
    568		 * Tell the rest of the code that there are userspace irqchip
    569		 * VMs in the wild.
    570		 */
    571		static_branch_inc(&userspace_irqchip_in_use);
    572	}
    573
    574	/*
    575	 * Initialize traps for protected VMs.
    576	 * NOTE: Move to run in EL2 directly, rather than via a hypercall, once
    577	 * the code is in place for first run initialization at EL2.
    578	 */
    579	if (kvm_vm_is_protected(kvm))
    580		kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu);
    581
    582	mutex_lock(&kvm->lock);
    583	set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags);
    584	mutex_unlock(&kvm->lock);
    585
    586	return ret;
    587}
    588
    589bool kvm_arch_intc_initialized(struct kvm *kvm)
    590{
    591	return vgic_initialized(kvm);
    592}
    593
    594void kvm_arm_halt_guest(struct kvm *kvm)
    595{
    596	unsigned long i;
    597	struct kvm_vcpu *vcpu;
    598
    599	kvm_for_each_vcpu(i, vcpu, kvm)
    600		vcpu->arch.pause = true;
    601	kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
    602}
    603
    604void kvm_arm_resume_guest(struct kvm *kvm)
    605{
    606	unsigned long i;
    607	struct kvm_vcpu *vcpu;
    608
    609	kvm_for_each_vcpu(i, vcpu, kvm) {
    610		vcpu->arch.pause = false;
    611		__kvm_vcpu_wake_up(vcpu);
    612	}
    613}
    614
    615static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu)
    616{
    617	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
    618
    619	rcuwait_wait_event(wait,
    620			   (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause),
    621			   TASK_INTERRUPTIBLE);
    622
    623	if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) {
    624		/* Awaken to handle a signal, request we sleep again later. */
    625		kvm_make_request(KVM_REQ_SLEEP, vcpu);
    626	}
    627
    628	/*
    629	 * Make sure we will observe a potential reset request if we've
    630	 * observed a change to the power state. Pairs with the smp_wmb() in
    631	 * kvm_psci_vcpu_on().
    632	 */
    633	smp_rmb();
    634}
    635
    636/**
    637 * kvm_vcpu_wfi - emulate Wait-For-Interrupt behavior
    638 * @vcpu:	The VCPU pointer
    639 *
    640 * Suspend execution of a vCPU until a valid wake event is detected, i.e. until
    641 * the vCPU is runnable.  The vCPU may or may not be scheduled out, depending
    642 * on when a wake event arrives, e.g. there may already be a pending wake event.
    643 */
    644void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
    645{
    646	/*
    647	 * Sync back the state of the GIC CPU interface so that we have
    648	 * the latest PMR and group enables. This ensures that
    649	 * kvm_arch_vcpu_runnable has up-to-date data to decide whether
    650	 * we have pending interrupts, e.g. when determining if the
    651	 * vCPU should block.
    652	 *
    653	 * For the same reason, we want to tell GICv4 that we need
    654	 * doorbells to be signalled, should an interrupt become pending.
    655	 */
    656	preempt_disable();
    657	kvm_vgic_vmcr_sync(vcpu);
    658	vgic_v4_put(vcpu, true);
    659	preempt_enable();
    660
    661	kvm_vcpu_halt(vcpu);
    662	vcpu->arch.flags &= ~KVM_ARM64_WFIT;
    663	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
    664
    665	preempt_disable();
    666	vgic_v4_load(vcpu);
    667	preempt_enable();
    668}
    669
    670static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu)
    671{
    672	if (!kvm_arm_vcpu_suspended(vcpu))
    673		return 1;
    674
    675	kvm_vcpu_wfi(vcpu);
    676
    677	/*
    678	 * The suspend state is sticky; we do not leave it until userspace
    679	 * explicitly marks the vCPU as runnable. Request that we suspend again
    680	 * later.
    681	 */
    682	kvm_make_request(KVM_REQ_SUSPEND, vcpu);
    683
    684	/*
    685	 * Check to make sure the vCPU is actually runnable. If so, exit to
    686	 * userspace informing it of the wakeup condition.
    687	 */
    688	if (kvm_arch_vcpu_runnable(vcpu)) {
    689		memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
    690		vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP;
    691		vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
    692		return 0;
    693	}
    694
    695	/*
    696	 * Otherwise, we were unblocked to process a different event, such as a
    697	 * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to
    698	 * process the event.
    699	 */
    700	return 1;
    701}
    702
    703/**
    704 * check_vcpu_requests - check and handle pending vCPU requests
    705 * @vcpu:	the VCPU pointer
    706 *
    707 * Return: 1 if we should enter the guest
    708 *	   0 if we should exit to userspace
    709 *	   < 0 if we should exit to userspace, where the return value indicates
    710 *	   an error
    711 */
    712static int check_vcpu_requests(struct kvm_vcpu *vcpu)
    713{
    714	if (kvm_request_pending(vcpu)) {
    715		if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
    716			kvm_vcpu_sleep(vcpu);
    717
    718		if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
    719			kvm_reset_vcpu(vcpu);
    720
    721		/*
    722		 * Clear IRQ_PENDING requests that were made to guarantee
    723		 * that a VCPU sees new virtual interrupts.
    724		 */
    725		kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
    726
    727		if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
    728			kvm_update_stolen_time(vcpu);
    729
    730		if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
    731			/* The distributor enable bits were changed */
    732			preempt_disable();
    733			vgic_v4_put(vcpu, false);
    734			vgic_v4_load(vcpu);
    735			preempt_enable();
    736		}
    737
    738		if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
    739			kvm_pmu_handle_pmcr(vcpu,
    740					    __vcpu_sys_reg(vcpu, PMCR_EL0));
    741
    742		if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
    743			return kvm_vcpu_suspend(vcpu);
    744	}
    745
    746	return 1;
    747}
    748
    749static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
    750{
    751	if (likely(!vcpu_mode_is_32bit(vcpu)))
    752		return false;
    753
    754	return !system_supports_32bit_el0() ||
    755		static_branch_unlikely(&arm64_mismatched_32bit_el0);
    756}
    757
    758/**
    759 * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest
    760 * @vcpu:	The VCPU pointer
    761 * @ret:	Pointer to write optional return code
    762 *
    763 * Returns: true if the VCPU needs to return to a preemptible + interruptible
     764 *	    kernel context and skip guest entry.
    765 *
    766 * This function disambiguates between two different types of exits: exits to a
    767 * preemptible + interruptible kernel context and exits to userspace. For an
    768 * exit to userspace, this function will write the return code to ret and return
    769 * true. For an exit to preemptible + interruptible kernel context (i.e. check
    770 * for pending work and re-enter), return true without writing to ret.
    771 */
    772static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
    773{
    774	struct kvm_run *run = vcpu->run;
    775
    776	/*
    777	 * If we're using a userspace irqchip, then check if we need
    778	 * to tell a userspace irqchip about timer or PMU level
    779	 * changes and if so, exit to userspace (the actual level
    780	 * state gets updated in kvm_timer_update_run and
    781	 * kvm_pmu_update_run below).
    782	 */
    783	if (static_branch_unlikely(&userspace_irqchip_in_use)) {
    784		if (kvm_timer_should_notify_user(vcpu) ||
    785		    kvm_pmu_should_notify_user(vcpu)) {
    786			*ret = -EINTR;
    787			run->exit_reason = KVM_EXIT_INTR;
    788			return true;
    789		}
    790	}
    791
    792	if (unlikely(vcpu_on_unsupported_cpu(vcpu))) {
    793		run->exit_reason = KVM_EXIT_FAIL_ENTRY;
    794		run->fail_entry.hardware_entry_failure_reason = KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED;
    795		run->fail_entry.cpu = smp_processor_id();
    796		*ret = 0;
    797		return true;
    798	}
    799
    800	return kvm_request_pending(vcpu) ||
    801			xfer_to_guest_mode_work_pending();
    802}
    803
    804/*
    805 * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
    806 * the vCPU is running.
    807 *
    808 * This must be noinstr as instrumentation may make use of RCU, and this is not
    809 * safe during the EQS.
    810 */
    811static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
    812{
    813	int ret;
    814
    815	guest_state_enter_irqoff();
    816	ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
    817	guest_state_exit_irqoff();
    818
    819	return ret;
    820}
    821
    822/**
    823 * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
    824 * @vcpu:	The VCPU pointer
    825 *
    826 * This function is called through the VCPU_RUN ioctl called from user space. It
    827 * will execute VM code in a loop until the time slice for the process is used
    828 * or some emulation is needed from user space in which case the function will
    829 * return with return value 0 and with the kvm_run structure filled in with the
    830 * required data for the requested emulation.
    831 */
    832int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
    833{
    834	struct kvm_run *run = vcpu->run;
    835	int ret;
    836
    837	if (run->exit_reason == KVM_EXIT_MMIO) {
    838		ret = kvm_handle_mmio_return(vcpu);
    839		if (ret)
    840			return ret;
    841	}
    842
    843	vcpu_load(vcpu);
    844
    845	if (run->immediate_exit) {
    846		ret = -EINTR;
    847		goto out;
    848	}
    849
    850	kvm_sigset_activate(vcpu);
    851
    852	ret = 1;
    853	run->exit_reason = KVM_EXIT_UNKNOWN;
    854	run->flags = 0;
    855	while (ret > 0) {
    856		/*
    857		 * Check conditions before entering the guest
    858		 */
    859		ret = xfer_to_guest_mode_handle_work(vcpu);
    860		if (!ret)
    861			ret = 1;
    862
    863		if (ret > 0)
    864			ret = check_vcpu_requests(vcpu);
    865
    866		/*
    867		 * Preparing the interrupts to be injected also
    868		 * involves poking the GIC, which must be done in a
    869		 * non-preemptible context.
    870		 */
    871		preempt_disable();
    872
    873		/*
    874		 * The VMID allocator only tracks active VMIDs per
    875		 * physical CPU, and therefore the VMID allocated may not be
    876		 * preserved on VMID roll-over if the task was preempted,
    877		 * making a thread's VMID inactive. So we need to call
     878		 * kvm_arm_vmid_update() in non-preemptible context.
    879		 */
    880		kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid);
    881
    882		kvm_pmu_flush_hwstate(vcpu);
    883
    884		local_irq_disable();
    885
    886		kvm_vgic_flush_hwstate(vcpu);
    887
    888		kvm_pmu_update_vcpu_events(vcpu);
    889
    890		/*
    891		 * Ensure we set mode to IN_GUEST_MODE after we disable
    892		 * interrupts and before the final VCPU requests check.
    893		 * See the comment in kvm_vcpu_exiting_guest_mode() and
    894		 * Documentation/virt/kvm/vcpu-requests.rst
    895		 */
    896		smp_store_mb(vcpu->mode, IN_GUEST_MODE);
    897
    898		if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) {
    899			vcpu->mode = OUTSIDE_GUEST_MODE;
    900			isb(); /* Ensure work in x_flush_hwstate is committed */
    901			kvm_pmu_sync_hwstate(vcpu);
    902			if (static_branch_unlikely(&userspace_irqchip_in_use))
    903				kvm_timer_sync_user(vcpu);
    904			kvm_vgic_sync_hwstate(vcpu);
    905			local_irq_enable();
    906			preempt_enable();
    907			continue;
    908		}
    909
    910		kvm_arm_setup_debug(vcpu);
    911		kvm_arch_vcpu_ctxflush_fp(vcpu);
    912
    913		/**************************************************************
    914		 * Enter the guest
    915		 */
    916		trace_kvm_entry(*vcpu_pc(vcpu));
    917		guest_timing_enter_irqoff();
    918
    919		ret = kvm_arm_vcpu_enter_exit(vcpu);
    920
    921		vcpu->mode = OUTSIDE_GUEST_MODE;
    922		vcpu->stat.exits++;
    923		/*
    924		 * Back from guest
    925		 *************************************************************/
    926
    927		kvm_arm_clear_debug(vcpu);
    928
    929		/*
    930		 * We must sync the PMU state before the vgic state so
    931		 * that the vgic can properly sample the updated state of the
    932		 * interrupt line.
    933		 */
    934		kvm_pmu_sync_hwstate(vcpu);
    935
    936		/*
    937		 * Sync the vgic state before syncing the timer state because
    938		 * the timer code needs to know if the virtual timer
    939		 * interrupts are active.
    940		 */
    941		kvm_vgic_sync_hwstate(vcpu);
    942
    943		/*
    944		 * Sync the timer hardware state before enabling interrupts as
    945		 * we don't want vtimer interrupts to race with syncing the
    946		 * timer virtual interrupt state.
    947		 */
    948		if (static_branch_unlikely(&userspace_irqchip_in_use))
    949			kvm_timer_sync_user(vcpu);
    950
    951		kvm_arch_vcpu_ctxsync_fp(vcpu);
    952
    953		/*
    954		 * We must ensure that any pending interrupts are taken before
    955		 * we exit guest timing so that timer ticks are accounted as
    956		 * guest time. Transiently unmask interrupts so that any
    957		 * pending interrupts are taken.
    958		 *
    959		 * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other
    960		 * context synchronization event) is necessary to ensure that
    961		 * pending interrupts are taken.
    962		 */
    963		if (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_IRQ) {
    964			local_irq_enable();
    965			isb();
    966			local_irq_disable();
    967		}
    968
    969		guest_timing_exit_irqoff();
    970
    971		local_irq_enable();
    972
    973		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
    974
    975		/* Exit types that need handling before we can be preempted */
    976		handle_exit_early(vcpu, ret);
    977
    978		preempt_enable();
    979
    980		/*
    981		 * The ARMv8 architecture doesn't give the hypervisor
    982		 * a mechanism to prevent a guest from dropping to AArch32 EL0
     983		 * if implemented by the CPU. If we spot the guest in such a
     984		 * state and decide it wasn't supposed to do so (as with the
     985		 * asymmetric AArch32 case), return to userspace with a
     986		 * fatal error.
    987		 */
    988		if (vcpu_mode_is_bad_32bit(vcpu)) {
    989			/*
    990			 * As we have caught the guest red-handed, decide that
    991			 * it isn't fit for purpose anymore by making the vcpu
     992			 * invalid. The VMM can try and fix it by issuing a
    993			 * KVM_ARM_VCPU_INIT if it really wants to.
    994			 */
    995			vcpu->arch.target = -1;
    996			ret = ARM_EXCEPTION_IL;
    997		}
    998
    999		ret = handle_exit(vcpu, ret);
   1000	}
   1001
   1002	/* Tell userspace about in-kernel device output levels */
   1003	if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
   1004		kvm_timer_update_run(vcpu);
   1005		kvm_pmu_update_run(vcpu);
   1006	}
   1007
   1008	kvm_sigset_deactivate(vcpu);
   1009
   1010out:
   1011	/*
   1012	 * In the unlikely event that we are returning to userspace
   1013	 * with pending exceptions or PC adjustment, commit these
   1014	 * adjustments in order to give userspace a consistent view of
   1015	 * the vcpu state. Note that this relies on __kvm_adjust_pc()
   1016	 * being preempt-safe on VHE.
   1017	 */
   1018	if (unlikely(vcpu->arch.flags & (KVM_ARM64_PENDING_EXCEPTION |
   1019					 KVM_ARM64_INCREMENT_PC)))
   1020		kvm_call_hyp(__kvm_adjust_pc, vcpu);
   1021
   1022	vcpu_put(vcpu);
   1023	return ret;
   1024}
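
/*
 * From userspace this loop is driven by the KVM_RUN vcpu ioctl together with
 * the mmap'ed struct kvm_run. A rough VMM-side sketch (kvm_fd and vcpu_fd are
 * assumed descriptors; error handling elided):
 *
 *	size_t sz = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *
 *	for (;;) {
 *		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
 *			break;		// e.g. EINTR on a pending signal
 *		if (run->exit_reason == KVM_EXIT_MMIO)
 *			;		// emulate run->mmio, then re-enter
 *		else if (run->exit_reason == KVM_EXIT_SYSTEM_EVENT)
 *			break;		// shutdown/reset/suspend/wakeup
 *	}
 */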
   1025
   1026static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
   1027{
   1028	int bit_index;
   1029	bool set;
   1030	unsigned long *hcr;
   1031
   1032	if (number == KVM_ARM_IRQ_CPU_IRQ)
   1033		bit_index = __ffs(HCR_VI);
   1034	else /* KVM_ARM_IRQ_CPU_FIQ */
   1035		bit_index = __ffs(HCR_VF);
   1036
   1037	hcr = vcpu_hcr(vcpu);
   1038	if (level)
   1039		set = test_and_set_bit(bit_index, hcr);
   1040	else
   1041		set = test_and_clear_bit(bit_index, hcr);
   1042
   1043	/*
   1044	 * If we didn't change anything, no need to wake up or kick other CPUs
   1045	 */
   1046	if (set == level)
   1047		return 0;
   1048
   1049	/*
   1050	 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
   1051	 * trigger a world-switch round on the running physical CPU to set the
   1052	 * virtual IRQ/FIQ fields in the HCR appropriately.
   1053	 */
   1054	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
   1055	kvm_vcpu_kick(vcpu);
   1056
   1057	return 0;
   1058}
   1059
   1060int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
   1061			  bool line_status)
   1062{
   1063	u32 irq = irq_level->irq;
   1064	unsigned int irq_type, vcpu_idx, irq_num;
   1065	int nrcpus = atomic_read(&kvm->online_vcpus);
   1066	struct kvm_vcpu *vcpu = NULL;
   1067	bool level = irq_level->level;
   1068
   1069	irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
   1070	vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
   1071	vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
   1072	irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
   1073
   1074	trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level);
   1075
   1076	switch (irq_type) {
   1077	case KVM_ARM_IRQ_TYPE_CPU:
   1078		if (irqchip_in_kernel(kvm))
   1079			return -ENXIO;
   1080
   1081		if (vcpu_idx >= nrcpus)
   1082			return -EINVAL;
   1083
   1084		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
   1085		if (!vcpu)
   1086			return -EINVAL;
   1087
   1088		if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
   1089			return -EINVAL;
   1090
   1091		return vcpu_interrupt_line(vcpu, irq_num, level);
   1092	case KVM_ARM_IRQ_TYPE_PPI:
   1093		if (!irqchip_in_kernel(kvm))
   1094			return -ENXIO;
   1095
   1096		if (vcpu_idx >= nrcpus)
   1097			return -EINVAL;
   1098
   1099		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
   1100		if (!vcpu)
   1101			return -EINVAL;
   1102
   1103		if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
   1104			return -EINVAL;
   1105
   1106		return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL);
   1107	case KVM_ARM_IRQ_TYPE_SPI:
   1108		if (!irqchip_in_kernel(kvm))
   1109			return -ENXIO;
   1110
   1111		if (irq_num < VGIC_NR_PRIVATE_IRQS)
   1112			return -EINVAL;
   1113
   1114		return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL);
   1115	}
   1116
   1117	return -EINVAL;
   1118}
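
/*
 * The encoding decoded above is built by userspace when asserting a line
 * with the KVM_IRQ_LINE vm ioctl. A rough sketch that asserts SPI n on a VM
 * with an in-kernel vGIC (vm_fd and n are assumed):
 *
 *	struct kvm_irq_level irq = {
 *		.irq   = (KVM_ARM_IRQ_TYPE_SPI << KVM_ARM_IRQ_TYPE_SHIFT) |
 *			 (32 + n),	// SPIs start after the 32 private IRQs
 *		.level = 1,		// assert; 0 de-asserts
 *	};
 *
 *	ioctl(vm_fd, KVM_IRQ_LINE, &irq);
 */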
   1119
   1120static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
   1121			       const struct kvm_vcpu_init *init)
   1122{
   1123	unsigned int i, ret;
   1124	u32 phys_target = kvm_target_cpu();
   1125
   1126	if (init->target != phys_target)
   1127		return -EINVAL;
   1128
   1129	/*
   1130	 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
   1131	 * use the same target.
   1132	 */
   1133	if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
   1134		return -EINVAL;
   1135
   1136	/* -ENOENT for unknown features, -EINVAL for invalid combinations. */
   1137	for (i = 0; i < sizeof(init->features) * 8; i++) {
   1138		bool set = (init->features[i / 32] & (1 << (i % 32)));
   1139
   1140		if (set && i >= KVM_VCPU_MAX_FEATURES)
   1141			return -ENOENT;
   1142
   1143		/*
   1144		 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
   1145		 * use the same feature set.
   1146		 */
   1147		if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
   1148		    test_bit(i, vcpu->arch.features) != set)
   1149			return -EINVAL;
   1150
   1151		if (set)
   1152			set_bit(i, vcpu->arch.features);
   1153	}
   1154
   1155	vcpu->arch.target = phys_target;
   1156
   1157	/* Now we know what it is, we can reset it. */
   1158	ret = kvm_reset_vcpu(vcpu);
   1159	if (ret) {
   1160		vcpu->arch.target = -1;
   1161		bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
   1162	}
   1163
   1164	return ret;
   1165}
   1166
   1167static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
   1168					 struct kvm_vcpu_init *init)
   1169{
   1170	int ret;
   1171
   1172	ret = kvm_vcpu_set_target(vcpu, init);
   1173	if (ret)
   1174		return ret;
   1175
   1176	/*
   1177	 * Ensure a rebooted VM will fault in RAM pages and detect if the
   1178	 * guest MMU is turned off and flush the caches as needed.
   1179	 *
   1180	 * S2FWB enforces all memory accesses to RAM being cacheable,
   1181	 * ensuring that the data side is always coherent. We still
   1182	 * need to invalidate the I-cache though, as FWB does *not*
   1183	 * imply CTR_EL0.DIC.
   1184	 */
   1185	if (vcpu_has_run_once(vcpu)) {
   1186		if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
   1187			stage2_unmap_vm(vcpu->kvm);
   1188		else
   1189			icache_inval_all_pou();
   1190	}
   1191
   1192	vcpu_reset_hcr(vcpu);
   1193	vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT;
   1194
   1195	/*
   1196	 * Handle the "start in power-off" case.
   1197	 */
   1198	if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
   1199		kvm_arm_vcpu_power_off(vcpu);
   1200	else
   1201		vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE;
   1202
   1203	return 0;
   1204}
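
/*
 * The usual userspace bring-up pairs KVM_ARM_PREFERRED_TARGET (a vm ioctl,
 * handled further down in kvm_arch_vm_ioctl()) with KVM_ARM_VCPU_INIT. A
 * rough sketch (vm_fd and vcpu_fd are assumed; error handling elided):
 *
 *	struct kvm_vcpu_init init;
 *
 *	ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init);
 *	init.features[0] |= 1U << KVM_ARM_VCPU_PSCI_0_2;
 *	// secondary vCPUs would typically also set KVM_ARM_VCPU_POWER_OFF
 *	ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);
 */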
   1205
   1206static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
   1207				 struct kvm_device_attr *attr)
   1208{
   1209	int ret = -ENXIO;
   1210
   1211	switch (attr->group) {
   1212	default:
   1213		ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
   1214		break;
   1215	}
   1216
   1217	return ret;
   1218}
   1219
   1220static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
   1221				 struct kvm_device_attr *attr)
   1222{
   1223	int ret = -ENXIO;
   1224
   1225	switch (attr->group) {
   1226	default:
   1227		ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
   1228		break;
   1229	}
   1230
   1231	return ret;
   1232}
   1233
   1234static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
   1235				 struct kvm_device_attr *attr)
   1236{
   1237	int ret = -ENXIO;
   1238
   1239	switch (attr->group) {
   1240	default:
   1241		ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
   1242		break;
   1243	}
   1244
   1245	return ret;
   1246}
   1247
   1248static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
   1249				   struct kvm_vcpu_events *events)
   1250{
   1251	memset(events, 0, sizeof(*events));
   1252
   1253	return __kvm_arm_vcpu_get_events(vcpu, events);
   1254}
   1255
   1256static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
   1257				   struct kvm_vcpu_events *events)
   1258{
   1259	int i;
   1260
   1261	/* check whether the reserved field is zero */
   1262	for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
   1263		if (events->reserved[i])
   1264			return -EINVAL;
   1265
   1266	/* check whether the pad field is zero */
   1267	for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
   1268		if (events->exception.pad[i])
   1269			return -EINVAL;
   1270
   1271	return __kvm_arm_vcpu_set_events(vcpu, events);
   1272}
   1273
   1274long kvm_arch_vcpu_ioctl(struct file *filp,
   1275			 unsigned int ioctl, unsigned long arg)
   1276{
   1277	struct kvm_vcpu *vcpu = filp->private_data;
   1278	void __user *argp = (void __user *)arg;
   1279	struct kvm_device_attr attr;
   1280	long r;
   1281
   1282	switch (ioctl) {
   1283	case KVM_ARM_VCPU_INIT: {
   1284		struct kvm_vcpu_init init;
   1285
   1286		r = -EFAULT;
   1287		if (copy_from_user(&init, argp, sizeof(init)))
   1288			break;
   1289
   1290		r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
   1291		break;
   1292	}
   1293	case KVM_SET_ONE_REG:
   1294	case KVM_GET_ONE_REG: {
   1295		struct kvm_one_reg reg;
   1296
   1297		r = -ENOEXEC;
   1298		if (unlikely(!kvm_vcpu_initialized(vcpu)))
   1299			break;
   1300
   1301		r = -EFAULT;
   1302		if (copy_from_user(&reg, argp, sizeof(reg)))
   1303			break;
   1304
   1305		/*
   1306		 * We could owe a reset due to PSCI. Handle the pending reset
   1307		 * here to ensure userspace register accesses are ordered after
   1308		 * the reset.
   1309		 */
   1310		if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
   1311			kvm_reset_vcpu(vcpu);
   1312
   1313		if (ioctl == KVM_SET_ONE_REG)
   1314			r = kvm_arm_set_reg(vcpu, &reg);
   1315		else
   1316			r = kvm_arm_get_reg(vcpu, &reg);
   1317		break;
   1318	}
   1319	case KVM_GET_REG_LIST: {
   1320		struct kvm_reg_list __user *user_list = argp;
   1321		struct kvm_reg_list reg_list;
   1322		unsigned n;
   1323
   1324		r = -ENOEXEC;
   1325		if (unlikely(!kvm_vcpu_initialized(vcpu)))
   1326			break;
   1327
   1328		r = -EPERM;
   1329		if (!kvm_arm_vcpu_is_finalized(vcpu))
   1330			break;
   1331
   1332		r = -EFAULT;
   1333		if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
   1334			break;
   1335		n = reg_list.n;
   1336		reg_list.n = kvm_arm_num_regs(vcpu);
   1337		if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
   1338			break;
   1339		r = -E2BIG;
   1340		if (n < reg_list.n)
   1341			break;
   1342		r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
   1343		break;
   1344	}
   1345	case KVM_SET_DEVICE_ATTR: {
   1346		r = -EFAULT;
   1347		if (copy_from_user(&attr, argp, sizeof(attr)))
   1348			break;
   1349		r = kvm_arm_vcpu_set_attr(vcpu, &attr);
   1350		break;
   1351	}
   1352	case KVM_GET_DEVICE_ATTR: {
   1353		r = -EFAULT;
   1354		if (copy_from_user(&attr, argp, sizeof(attr)))
   1355			break;
   1356		r = kvm_arm_vcpu_get_attr(vcpu, &attr);
   1357		break;
   1358	}
   1359	case KVM_HAS_DEVICE_ATTR: {
   1360		r = -EFAULT;
   1361		if (copy_from_user(&attr, argp, sizeof(attr)))
   1362			break;
   1363		r = kvm_arm_vcpu_has_attr(vcpu, &attr);
   1364		break;
   1365	}
   1366	case KVM_GET_VCPU_EVENTS: {
   1367		struct kvm_vcpu_events events;
   1368
   1369		if (kvm_arm_vcpu_get_events(vcpu, &events))
   1370			return -EINVAL;
   1371
   1372		if (copy_to_user(argp, &events, sizeof(events)))
   1373			return -EFAULT;
   1374
   1375		return 0;
   1376	}
   1377	case KVM_SET_VCPU_EVENTS: {
   1378		struct kvm_vcpu_events events;
   1379
   1380		if (copy_from_user(&events, argp, sizeof(events)))
   1381			return -EFAULT;
   1382
   1383		return kvm_arm_vcpu_set_events(vcpu, &events);
   1384	}
   1385	case KVM_ARM_VCPU_FINALIZE: {
   1386		int what;
   1387
   1388		if (!kvm_vcpu_initialized(vcpu))
   1389			return -ENOEXEC;
   1390
   1391		if (get_user(what, (const int __user *)argp))
   1392			return -EFAULT;
   1393
   1394		return kvm_arm_vcpu_finalize(vcpu, what);
   1395	}
   1396	default:
   1397		r = -EINVAL;
   1398	}
   1399
   1400	return r;
   1401}
   1402
   1403void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
   1404{
   1405
   1406}
   1407
   1408void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
   1409					const struct kvm_memory_slot *memslot)
   1410{
   1411	kvm_flush_remote_tlbs(kvm);
   1412}
   1413
   1414static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
   1415					struct kvm_arm_device_addr *dev_addr)
   1416{
   1417	unsigned long dev_id, type;
   1418
   1419	dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >>
   1420		KVM_ARM_DEVICE_ID_SHIFT;
   1421	type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >>
   1422		KVM_ARM_DEVICE_TYPE_SHIFT;
   1423
   1424	switch (dev_id) {
   1425	case KVM_ARM_DEVICE_VGIC_V2:
   1426		if (!vgic_present)
   1427			return -ENXIO;
   1428		return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
   1429	default:
   1430		return -ENODEV;
   1431	}
   1432}
   1433
   1434long kvm_arch_vm_ioctl(struct file *filp,
   1435		       unsigned int ioctl, unsigned long arg)
   1436{
   1437	struct kvm *kvm = filp->private_data;
   1438	void __user *argp = (void __user *)arg;
   1439
   1440	switch (ioctl) {
   1441	case KVM_CREATE_IRQCHIP: {
   1442		int ret;
   1443		if (!vgic_present)
   1444			return -ENXIO;
   1445		mutex_lock(&kvm->lock);
   1446		ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
   1447		mutex_unlock(&kvm->lock);
   1448		return ret;
   1449	}
   1450	case KVM_ARM_SET_DEVICE_ADDR: {
   1451		struct kvm_arm_device_addr dev_addr;
   1452
   1453		if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
   1454			return -EFAULT;
   1455		return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
   1456	}
   1457	case KVM_ARM_PREFERRED_TARGET: {
   1458		struct kvm_vcpu_init init;
   1459
   1460		kvm_vcpu_preferred_target(&init);
   1461
   1462		if (copy_to_user(argp, &init, sizeof(init)))
   1463			return -EFAULT;
   1464
   1465		return 0;
   1466	}
   1467	case KVM_ARM_MTE_COPY_TAGS: {
   1468		struct kvm_arm_copy_mte_tags copy_tags;
   1469
   1470		if (copy_from_user(&copy_tags, argp, sizeof(copy_tags)))
   1471			return -EFAULT;
   1472		return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
   1473	}
   1474	default:
   1475		return -EINVAL;
   1476	}
   1477}
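
/*
 * KVM_CREATE_IRQCHIP above instantiates a GICv2 for the VM; its base
 * addresses are then set with KVM_ARM_SET_DEVICE_ADDR (modern VMMs instead
 * create the vGIC through the KVM_CREATE_DEVICE API). A rough sketch of the
 * legacy path (vm_fd is assumed; the distributor base is just an example):
 *
 *	struct kvm_arm_device_addr dist = {
 *		.id   = (KVM_ARM_DEVICE_VGIC_V2 << KVM_ARM_DEVICE_ID_SHIFT) |
 *			KVM_VGIC_V2_ADDR_TYPE_DIST,
 *		.addr = 0x08000000,
 *	};
 *
 *	ioctl(vm_fd, KVM_CREATE_IRQCHIP, 0);
 *	ioctl(vm_fd, KVM_ARM_SET_DEVICE_ADDR, &dist);
 */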
   1478
   1479static unsigned long nvhe_percpu_size(void)
   1480{
   1481	return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
   1482		(unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
   1483}
   1484
   1485static unsigned long nvhe_percpu_order(void)
   1486{
   1487	unsigned long size = nvhe_percpu_size();
   1488
   1489	return size ? get_order(size) : 0;
   1490}
   1491
   1492/* A lookup table holding the hypervisor VA for each vector slot */
   1493static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];
   1494
   1495static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
   1496{
   1497	hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot);
   1498}
   1499
   1500static int kvm_init_vector_slots(void)
   1501{
   1502	int err;
   1503	void *base;
   1504
   1505	base = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
   1506	kvm_init_vector_slot(base, HYP_VECTOR_DIRECT);
   1507
   1508	base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
   1509	kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT);
   1510
   1511	if (kvm_system_needs_idmapped_vectors() &&
   1512	    !is_protected_kvm_enabled()) {
   1513		err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs),
   1514					       __BP_HARDEN_HYP_VECS_SZ, &base);
   1515		if (err)
   1516			return err;
   1517	}
   1518
   1519	kvm_init_vector_slot(base, HYP_VECTOR_INDIRECT);
   1520	kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_INDIRECT);
   1521	return 0;
   1522}
   1523
   1524static void cpu_prepare_hyp_mode(int cpu)
   1525{
   1526	struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
   1527	unsigned long tcr;
   1528
   1529	/*
   1530	 * Calculate the raw per-cpu offset without a translation from the
   1531	 * kernel's mapping to the linear mapping, and store it in tpidr_el2
   1532	 * so that we can use adr_l to access per-cpu variables in EL2.
   1533	 * Also drop the KASAN tag which gets in the way...
   1534	 */
   1535	params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) -
   1536			    (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
   1537
   1538	params->mair_el2 = read_sysreg(mair_el1);
   1539
   1540	/*
   1541	 * The ID map may be configured to use an extended virtual address
   1542	 * range. This is only the case if system RAM is out of range for the
   1543	 * currently configured page size and VA_BITS, in which case we will
   1544	 * also need the extended virtual range for the HYP ID map, or we won't
   1545	 * be able to enable the EL2 MMU.
   1546	 *
   1547	 * However, at EL2, there is only one TTBR register, and we can't switch
   1548	 * between translation tables *and* update TCR_EL2.T0SZ at the same
   1549	 * time. Bottom line: we need to use the extended range with *both* our
   1550	 * translation tables.
   1551	 *
   1552	 * So use the same T0SZ value we use for the ID map.
   1553	 */
   1554	tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1;
   1555	tcr &= ~TCR_T0SZ_MASK;
   1556	tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
   1557	params->tcr_el2 = tcr;
   1558
   1559	params->pgd_pa = kvm_mmu_get_httbr();
   1560	if (is_protected_kvm_enabled())
   1561		params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
   1562	else
   1563		params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
   1564	params->vttbr = params->vtcr = 0;
   1565
   1566	/*
   1567	 * Flush the init params from the data cache because the struct will
   1568	 * be read while the MMU is off.
   1569	 */
   1570	kvm_flush_dcache_to_poc(params, sizeof(*params));
   1571}
   1572
   1573static void hyp_install_host_vector(void)
   1574{
   1575	struct kvm_nvhe_init_params *params;
   1576	struct arm_smccc_res res;
   1577
   1578	/* Switch from the HYP stub to our own HYP init vector */
   1579	__hyp_set_vectors(kvm_get_idmap_vector());
   1580
   1581	/*
   1582	 * Call initialization code, and switch to the full blown HYP code.
   1583	 * If the cpucaps haven't been finalized yet, something has gone very
   1584	 * wrong, and hyp will crash and burn when it uses any
   1585	 * cpus_have_const_cap() wrapper.
   1586	 */
   1587	BUG_ON(!system_capabilities_finalized());
   1588	params = this_cpu_ptr_nvhe_sym(kvm_init_params);
   1589	arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res);
   1590	WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
   1591}
   1592
   1593static void cpu_init_hyp_mode(void)
   1594{
   1595	hyp_install_host_vector();
   1596
   1597	/*
   1598	 * Disabling SSBD on a non-VHE system requires us to enable SSBS
   1599	 * at EL2.
   1600	 */
   1601	if (this_cpu_has_cap(ARM64_SSBS) &&
   1602	    arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) {
   1603		kvm_call_hyp_nvhe(__kvm_enable_ssbs);
   1604	}
   1605}
   1606
   1607static void cpu_hyp_reset(void)
   1608{
   1609	if (!is_kernel_in_hyp_mode())
   1610		__hyp_reset_vectors();
   1611}
   1612
   1613/*
   1614 * EL2 vectors can be mapped and rerouted in a number of ways,
   1615 * depending on the kernel configuration and CPU present:
   1616 *
   1617 * - If the CPU is affected by Spectre-v2, the hardening sequence is
   1618 *   placed in one of the vector slots, which is executed before jumping
   1619 *   to the real vectors.
   1620 *
   1621 * - If the CPU also has the ARM64_SPECTRE_V3A cap, the slot
   1622 *   containing the hardening sequence is mapped next to the idmap page,
   1623 *   and executed before jumping to the real vectors.
   1624 *
   1625 * - If the CPU only has the ARM64_SPECTRE_V3A cap, then an
   1626 *   empty slot is selected, mapped next to the idmap page, and
   1627 *   executed before jumping to the real vectors.
   1628 *
   1629 * Note that ARM64_SPECTRE_V3A is somewhat incompatible with
   1630 * VHE, as we don't have hypervisor-specific mappings. If the system
   1631 * is VHE and yet selects this capability, it will be ignored.
   1632 */
   1633static void cpu_set_hyp_vector(void)
   1634{
   1635	struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
   1636	void *vector = hyp_spectre_vector_selector[data->slot];
   1637
   1638	if (!is_protected_kvm_enabled())
   1639		*this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
   1640	else
   1641		kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot);
   1642}
   1643
   1644static void cpu_hyp_init_context(void)
   1645{
   1646	kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
   1647
   1648	if (!is_kernel_in_hyp_mode())
   1649		cpu_init_hyp_mode();
   1650}
   1651
   1652static void cpu_hyp_init_features(void)
   1653{
   1654	cpu_set_hyp_vector();
   1655	kvm_arm_init_debug();
   1656
   1657	if (is_kernel_in_hyp_mode())
   1658		kvm_timer_init_vhe();
   1659
   1660	if (vgic_present)
   1661		kvm_vgic_init_cpu_hardware();
   1662}
   1663
   1664static void cpu_hyp_reinit(void)
   1665{
   1666	cpu_hyp_reset();
   1667	cpu_hyp_init_context();
   1668	cpu_hyp_init_features();
   1669}
   1670
   1671static void _kvm_arch_hardware_enable(void *discard)
   1672{
   1673	if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
   1674		cpu_hyp_reinit();
   1675		__this_cpu_write(kvm_arm_hardware_enabled, 1);
   1676	}
   1677}
   1678
   1679int kvm_arch_hardware_enable(void)
   1680{
   1681	_kvm_arch_hardware_enable(NULL);
   1682	return 0;
   1683}
   1684
   1685static void _kvm_arch_hardware_disable(void *discard)
   1686{
   1687	if (__this_cpu_read(kvm_arm_hardware_enabled)) {
   1688		cpu_hyp_reset();
   1689		__this_cpu_write(kvm_arm_hardware_enabled, 0);
   1690	}
   1691}
   1692
   1693void kvm_arch_hardware_disable(void)
   1694{
   1695	if (!is_protected_kvm_enabled())
   1696		_kvm_arch_hardware_disable(NULL);
   1697}
   1698
   1699#ifdef CONFIG_CPU_PM
   1700static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
   1701				    unsigned long cmd,
   1702				    void *v)
   1703{
   1704	/*
   1705	 * kvm_arm_hardware_enabled is left with its old value over
   1706	 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
   1707	 * re-enable hyp.
   1708	 */
   1709	switch (cmd) {
   1710	case CPU_PM_ENTER:
   1711		if (__this_cpu_read(kvm_arm_hardware_enabled))
   1712			/*
   1713			 * don't update kvm_arm_hardware_enabled here
   1714			 * so that the hardware will be re-enabled
   1715			 * when we resume. See below.
   1716			 */
   1717			cpu_hyp_reset();
   1718
   1719		return NOTIFY_OK;
   1720	case CPU_PM_ENTER_FAILED:
   1721	case CPU_PM_EXIT:
   1722		if (__this_cpu_read(kvm_arm_hardware_enabled))
   1723			/* The hardware was enabled before suspend. */
   1724			cpu_hyp_reinit();
   1725
   1726		return NOTIFY_OK;
   1727
   1728	default:
   1729		return NOTIFY_DONE;
   1730	}
   1731}
   1732
   1733static struct notifier_block hyp_init_cpu_pm_nb = {
   1734	.notifier_call = hyp_init_cpu_pm_notifier,
   1735};
   1736
   1737static void hyp_cpu_pm_init(void)
   1738{
   1739	if (!is_protected_kvm_enabled())
   1740		cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
   1741}
   1742static void hyp_cpu_pm_exit(void)
   1743{
   1744	if (!is_protected_kvm_enabled())
   1745		cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
   1746}
   1747#else
   1748static inline void hyp_cpu_pm_init(void)
   1749{
   1750}
   1751static inline void hyp_cpu_pm_exit(void)
   1752{
   1753}
   1754#endif
   1755
   1756static void init_cpu_logical_map(void)
   1757{
   1758	unsigned int cpu;
   1759
   1760	/*
   1761	 * Copy the MPIDR <-> logical CPU ID mapping to hyp.
   1762	 * Only copy the set of online CPUs whose features have been checked
   1763	 * against the finalized system capabilities. The hypervisor will not
   1764	 * allow any other CPUs from the `possible` set to boot.
   1765	 */
   1766	for_each_online_cpu(cpu)
   1767		hyp_cpu_logical_map[cpu] = cpu_logical_map(cpu);
   1768}
   1769
   1770#define init_psci_0_1_impl_state(config, what)	\
   1771	config.psci_0_1_ ## what ## _implemented = psci_ops.what
   1772
   1773static bool init_psci_relay(void)
   1774{
   1775	/*
   1776	 * If PSCI has not been initialized, protected KVM cannot install
   1777	 * itself on newly booted CPUs.
   1778	 */
   1779	if (!psci_ops.get_version) {
   1780		kvm_err("Cannot initialize protected mode without PSCI\n");
   1781		return false;
   1782	}
   1783
   1784	kvm_host_psci_config.version = psci_ops.get_version();
   1785
   1786	if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) {
   1787		kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids();
   1788		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_suspend);
   1789		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_on);
   1790		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_off);
   1791		init_psci_0_1_impl_state(kvm_host_psci_config, migrate);
   1792	}
   1793	return true;
   1794}
   1795
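        /*
         * Bring up the hyp-dependent subsystems (vGIC and arch timer) with
         * the hardware enabled on every CPU so that their initialisation
         * code can reach EL2.
         */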
   1796static int init_subsystems(void)
   1797{
   1798	int err = 0;
   1799
   1800	/*
   1801	 * Enable hardware so that subsystem initialisation can access EL2.
   1802	 */
   1803	on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
   1804
   1805	/*
    1806	 * Register the CPU low-power (CPU PM) notifier
   1807	 */
   1808	hyp_cpu_pm_init();
   1809
   1810	/*
   1811	 * Init HYP view of VGIC
   1812	 */
   1813	err = kvm_vgic_hyp_init();
   1814	switch (err) {
   1815	case 0:
   1816		vgic_present = true;
   1817		break;
   1818	case -ENODEV:
   1819	case -ENXIO:
   1820		vgic_present = false;
   1821		err = 0;
   1822		break;
   1823	default:
   1824		goto out;
   1825	}
   1826
   1827	/*
   1828	 * Init HYP architected timer support
   1829	 */
   1830	err = kvm_timer_hyp_init(vgic_present);
   1831	if (err)
   1832		goto out;
   1833
   1834	kvm_register_perf_callbacks(NULL);
   1835
   1836out:
   1837	if (err || !is_protected_kvm_enabled())
   1838		on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
   1839
   1840	return err;
   1841}
   1842
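        /*
         * Undo init_hyp_mode(): free the hyp page tables and the per-CPU
         * stack and percpu pages.
         */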
   1843static void teardown_hyp_mode(void)
   1844{
   1845	int cpu;
   1846
   1847	free_hyp_pgds();
   1848	for_each_possible_cpu(cpu) {
   1849		free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
   1850		free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
   1851	}
   1852}
   1853
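        /*
         * Hand the reserved memory pool and the per-CPU bases over to the
         * nVHE hypervisor via the __pkvm_init hypercall.
         */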
   1854static int do_pkvm_init(u32 hyp_va_bits)
   1855{
   1856	void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base);
   1857	int ret;
   1858
   1859	preempt_disable();
   1860	cpu_hyp_init_context();
   1861	ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
   1862				num_possible_cpus(), kern_hyp_va(per_cpu_base),
   1863				hyp_va_bits);
   1864	cpu_hyp_init_features();
   1865
   1866	/*
   1867	 * The stub hypercalls are now disabled, so set our local flag to
   1868	 * prevent a later re-init attempt in kvm_arch_hardware_enable().
   1869	 */
   1870	__this_cpu_write(kvm_arm_hardware_enabled, 1);
   1871	preempt_enable();
   1872
   1873	return ret;
   1874}
   1875
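        /*
         * Snapshot the sanitised ID registers for the nVHE hypervisor, map
         * the reserved hyp memory at EL2 and run the pKVM init hypercall.
         */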
   1876static int kvm_hyp_init_protection(u32 hyp_va_bits)
   1877{
   1878	void *addr = phys_to_virt(hyp_mem_base);
   1879	int ret;
   1880
   1881	kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
   1882	kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
   1883	kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
   1884	kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
   1885	kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
   1886	kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
   1887	kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
   1888	kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
   1889
   1890	ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP);
   1891	if (ret)
   1892		return ret;
   1893
   1894	ret = do_pkvm_init(hyp_va_bits);
   1895	if (ret)
   1896		return ret;
   1897
   1898	free_hyp_pgds();
   1899
   1900	return 0;
   1901}
   1902
    1903/**
    1904 * init_hyp_mode - Allocate and map the resources Hyp-mode needs on each CPU
    1905 */
   1906static int init_hyp_mode(void)
   1907{
   1908	u32 hyp_va_bits;
   1909	int cpu;
   1910	int err = -ENOMEM;
   1911
   1912	/*
   1913	 * The protected Hyp-mode cannot be initialized if the memory pool
   1914	 * allocation has failed.
   1915	 */
   1916	if (is_protected_kvm_enabled() && !hyp_mem_base)
   1917		goto out_err;
   1918
   1919	/*
   1920	 * Allocate Hyp PGD and setup Hyp identity mapping
   1921	 */
   1922	err = kvm_mmu_init(&hyp_va_bits);
   1923	if (err)
   1924		goto out_err;
   1925
   1926	/*
   1927	 * Allocate stack pages for Hypervisor-mode
   1928	 */
   1929	for_each_possible_cpu(cpu) {
   1930		unsigned long stack_page;
   1931
   1932		stack_page = __get_free_page(GFP_KERNEL);
   1933		if (!stack_page) {
   1934			err = -ENOMEM;
   1935			goto out_err;
   1936		}
   1937
   1938		per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
   1939	}
   1940
   1941	/*
   1942	 * Allocate and initialize pages for Hypervisor-mode percpu regions.
   1943	 */
   1944	for_each_possible_cpu(cpu) {
   1945		struct page *page;
   1946		void *page_addr;
   1947
   1948		page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
   1949		if (!page) {
   1950			err = -ENOMEM;
   1951			goto out_err;
   1952		}
   1953
   1954		page_addr = page_address(page);
   1955		memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
   1956		kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr;
   1957	}
   1958
   1959	/*
   1960	 * Map the Hyp-code called directly from the host
   1961	 */
   1962	err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
   1963				  kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
   1964	if (err) {
   1965		kvm_err("Cannot map world-switch code\n");
   1966		goto out_err;
   1967	}
   1968
   1969	err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
   1970				  kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
   1971	if (err) {
   1972		kvm_err("Cannot map .hyp.rodata section\n");
   1973		goto out_err;
   1974	}
   1975
   1976	err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
   1977				  kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
   1978	if (err) {
   1979		kvm_err("Cannot map rodata section\n");
   1980		goto out_err;
   1981	}
   1982
   1983	/*
   1984	 * .hyp.bss is guaranteed to be placed at the beginning of the .bss
   1985	 * section thanks to an assertion in the linker script. Map it RW and
   1986	 * the rest of .bss RO.
   1987	 */
   1988	err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start),
   1989				  kvm_ksym_ref(__hyp_bss_end), PAGE_HYP);
   1990	if (err) {
   1991		kvm_err("Cannot map hyp bss section: %d\n", err);
   1992		goto out_err;
   1993	}
   1994
   1995	err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end),
   1996				  kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
   1997	if (err) {
   1998		kvm_err("Cannot map bss section\n");
   1999		goto out_err;
   2000	}
   2001
   2002	/*
   2003	 * Map the Hyp stack pages
   2004	 */
   2005	for_each_possible_cpu(cpu) {
   2006		struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
   2007		char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
   2008		unsigned long hyp_addr;
   2009
   2010		/*
   2011		 * Allocate a contiguous HYP private VA range for the stack
   2012		 * and guard page. The allocation is also aligned based on
   2013		 * the order of its size.
   2014		 */
   2015		err = hyp_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr);
   2016		if (err) {
   2017			kvm_err("Cannot allocate hyp stack guard page\n");
   2018			goto out_err;
   2019		}
   2020
   2021		/*
   2022		 * Since the stack grows downwards, map the stack to the page
   2023		 * at the higher address and leave the lower guard page
   2024		 * unbacked.
   2025		 *
   2026		 * Any valid stack address now has the PAGE_SHIFT bit as 1
   2027		 * and addresses corresponding to the guard page have the
   2028		 * PAGE_SHIFT bit as 0 - this is used for overflow detection.
   2029		 */
   2030		err = __create_hyp_mappings(hyp_addr + PAGE_SIZE, PAGE_SIZE,
   2031					    __pa(stack_page), PAGE_HYP);
   2032		if (err) {
   2033			kvm_err("Cannot map hyp stack\n");
   2034			goto out_err;
   2035		}
   2036
   2037		/*
   2038		 * Save the stack PA in nvhe_init_params. This will be needed
   2039		 * to recreate the stack mapping in protected nVHE mode.
   2040		 * __hyp_pa() won't do the right thing there, since the stack
   2041		 * has been mapped in the flexible private VA space.
   2042		 */
   2043		params->stack_pa = __pa(stack_page);
   2044
   2045		params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE);
   2046	}
   2047
   2048	for_each_possible_cpu(cpu) {
   2049		char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
   2050		char *percpu_end = percpu_begin + nvhe_percpu_size();
   2051
   2052		/* Map Hyp percpu pages */
   2053		err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
   2054		if (err) {
   2055			kvm_err("Cannot map hyp percpu region\n");
   2056			goto out_err;
   2057		}
   2058
   2059		/* Prepare the CPU initialization parameters */
   2060		cpu_prepare_hyp_mode(cpu);
   2061	}
   2062
   2063	if (is_protected_kvm_enabled()) {
   2064		init_cpu_logical_map();
   2065
   2066		if (!init_psci_relay()) {
   2067			err = -ENODEV;
   2068			goto out_err;
   2069		}
   2070	}
   2071
   2072	if (is_protected_kvm_enabled()) {
   2073		err = kvm_hyp_init_protection(hyp_va_bits);
   2074		if (err) {
   2075			kvm_err("Failed to init hyp memory protection\n");
   2076			goto out_err;
   2077		}
   2078	}
   2079
   2080	return 0;
   2081
   2082out_err:
   2083	teardown_hyp_mode();
   2084	kvm_err("error initializing Hyp mode: %d\n", err);
   2085	return err;
   2086}
   2087
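        /*
         * Per-CPU callback: ask the hypervisor to finalize host protection
         * on this CPU and record any failure in *arg.
         */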
   2088static void _kvm_host_prot_finalize(void *arg)
   2089{
   2090	int *err = arg;
   2091
   2092	if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)))
   2093		WRITE_ONCE(*err, -EINVAL);
   2094}
   2095
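        /*
         * Complete the switch to protected mode: flip the static key and
         * have every CPU finalize host protection at EL2.
         */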
   2096static int pkvm_drop_host_privileges(void)
   2097{
   2098	int ret = 0;
   2099
   2100	/*
   2101	 * Flip the static key upfront as that may no longer be possible
   2102	 * once the host stage 2 is installed.
   2103	 */
   2104	static_branch_enable(&kvm_protected_mode_initialized);
   2105	on_each_cpu(_kvm_host_prot_finalize, &ret, 1);
   2106	return ret;
   2107}
   2108
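        /*
         * For protected KVM only: hide the hyp sections from kmemleak and
         * then drop the host's privileges.
         */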
   2109static int finalize_hyp_mode(void)
   2110{
   2111	if (!is_protected_kvm_enabled())
   2112		return 0;
   2113
    2114	/*
    2115	 * Exclude HYP sections from kmemleak so that it does not scan them
    2116	 * once they become inaccessible to the host.
    2117	 */
   2118	kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
   2119	kmemleak_free_part(__va(hyp_mem_base), hyp_mem_size);
   2120	return pkvm_drop_host_privileges();
   2121}
   2122
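        /*
         * Return the vcpu whose MPIDR affinity matches @mpidr, or NULL if
         * no vcpu in @kvm has that MPIDR.
         */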
   2123struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
   2124{
   2125	struct kvm_vcpu *vcpu;
   2126	unsigned long i;
   2127
   2128	mpidr &= MPIDR_HWID_BITMASK;
   2129	kvm_for_each_vcpu(i, vcpu, kvm) {
   2130		if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
   2131			return vcpu;
   2132	}
   2133	return NULL;
   2134}
   2135
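        /*
         * IRQ bypass support: route producer interrupts directly to the
         * guest through the vGIC forwarding hooks below.
         */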
   2136bool kvm_arch_has_irq_bypass(void)
   2137{
   2138	return true;
   2139}
   2140
   2141int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
   2142				      struct irq_bypass_producer *prod)
   2143{
   2144	struct kvm_kernel_irqfd *irqfd =
   2145		container_of(cons, struct kvm_kernel_irqfd, consumer);
   2146
   2147	return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
   2148					  &irqfd->irq_entry);
   2149}
   2150void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
   2151				      struct irq_bypass_producer *prod)
   2152{
   2153	struct kvm_kernel_irqfd *irqfd =
   2154		container_of(cons, struct kvm_kernel_irqfd, consumer);
   2155
   2156	kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
   2157				     &irqfd->irq_entry);
   2158}
   2159
   2160void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
   2161{
   2162	struct kvm_kernel_irqfd *irqfd =
   2163		container_of(cons, struct kvm_kernel_irqfd, consumer);
   2164
   2165	kvm_arm_halt_guest(irqfd->kvm);
   2166}
   2167
   2168void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
   2169{
   2170	struct kvm_kernel_irqfd *irqfd =
   2171		container_of(cons, struct kvm_kernel_irqfd, consumer);
   2172
   2173	kvm_arm_resume_guest(irqfd->kvm);
   2174}
   2175
   2176/**
    2177 * kvm_arch_init - Initialize Hyp-mode and memory mappings on all CPUs.
   2178 */
   2179int kvm_arch_init(void *opaque)
   2180{
   2181	int err;
   2182	bool in_hyp_mode;
   2183
   2184	if (!is_hyp_mode_available()) {
   2185		kvm_info("HYP mode not available\n");
   2186		return -ENODEV;
   2187	}
   2188
   2189	if (kvm_get_mode() == KVM_MODE_NONE) {
   2190		kvm_info("KVM disabled from command line\n");
   2191		return -ENODEV;
   2192	}
   2193
   2194	err = kvm_sys_reg_table_init();
   2195	if (err) {
    2196		kvm_info("Error initializing system register tables\n");
   2197		return err;
   2198	}
   2199
   2200	in_hyp_mode = is_kernel_in_hyp_mode();
   2201
   2202	if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
   2203	    cpus_have_final_cap(ARM64_WORKAROUND_1508412))
    2204		kvm_info("Guests without required CPU erratum workarounds can deadlock the system!\n" \
   2205			 "Only trusted guests should be used on this system.\n");
   2206
   2207	err = kvm_set_ipa_limit();
   2208	if (err)
   2209		return err;
   2210
   2211	err = kvm_arm_init_sve();
   2212	if (err)
   2213		return err;
   2214
   2215	err = kvm_arm_vmid_alloc_init();
   2216	if (err) {
   2217		kvm_err("Failed to initialize VMID allocator.\n");
   2218		return err;
   2219	}
   2220
   2221	if (!in_hyp_mode) {
   2222		err = init_hyp_mode();
   2223		if (err)
   2224			goto out_err;
   2225	}
   2226
   2227	err = kvm_init_vector_slots();
   2228	if (err) {
   2229		kvm_err("Cannot initialise vector slots\n");
   2230		goto out_err;
   2231	}
   2232
   2233	err = init_subsystems();
   2234	if (err)
   2235		goto out_hyp;
   2236
   2237	if (!in_hyp_mode) {
   2238		err = finalize_hyp_mode();
   2239		if (err) {
   2240			kvm_err("Failed to finalize Hyp protection\n");
   2241			goto out_hyp;
   2242		}
   2243	}
   2244
   2245	if (is_protected_kvm_enabled()) {
   2246		kvm_info("Protected nVHE mode initialized successfully\n");
   2247	} else if (in_hyp_mode) {
   2248		kvm_info("VHE mode initialized successfully\n");
   2249	} else {
   2250		kvm_info("Hyp mode initialized successfully\n");
   2251	}
   2252
   2253	return 0;
   2254
   2255out_hyp:
   2256	hyp_cpu_pm_exit();
   2257	if (!in_hyp_mode)
   2258		teardown_hyp_mode();
   2259out_err:
   2260	kvm_arm_vmid_alloc_free();
   2261	return err;
   2262}
   2263
    2264/* Compiling as a module is not supported; just unregister the perf callbacks. */
   2265void kvm_arch_exit(void)
   2266{
   2267	kvm_unregister_perf_callbacks();
   2268}
   2269
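        /*
         * Parse the "kvm-arm.mode" early parameter: "protected", "nvhe" or
         * "none".
         */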
   2270static int __init early_kvm_mode_cfg(char *arg)
   2271{
   2272	if (!arg)
   2273		return -EINVAL;
   2274
   2275	if (strcmp(arg, "protected") == 0) {
   2276		if (!is_kernel_in_hyp_mode())
   2277			kvm_mode = KVM_MODE_PROTECTED;
   2278		else
   2279			pr_warn_once("Protected KVM not available with VHE\n");
   2280
   2281		return 0;
   2282	}
   2283
   2284	if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) {
   2285		kvm_mode = KVM_MODE_DEFAULT;
   2286		return 0;
   2287	}
   2288
   2289	if (strcmp(arg, "none") == 0) {
   2290		kvm_mode = KVM_MODE_NONE;
   2291		return 0;
   2292	}
   2293
   2294	return -EINVAL;
   2295}
   2296early_param("kvm-arm.mode", early_kvm_mode_cfg);
   2297
   2298enum kvm_mode kvm_get_mode(void)
   2299{
   2300	return kvm_mode;
   2301}
   2302
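        /* Module entry point: register this architecture with the KVM core. */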
   2303static int arm_init(void)
   2304{
   2305	int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
   2306	return rc;
   2307}
   2308
   2309module_init(arm_init);