cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

mmu.c (49547B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
      4 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
      5 */
      6
      7#include <linux/mman.h>
      8#include <linux/kvm_host.h>
      9#include <linux/io.h>
     10#include <linux/hugetlb.h>
     11#include <linux/sched/signal.h>
     12#include <trace/events/kvm.h>
     13#include <asm/pgalloc.h>
     14#include <asm/cacheflush.h>
     15#include <asm/kvm_arm.h>
     16#include <asm/kvm_mmu.h>
     17#include <asm/kvm_pgtable.h>
     18#include <asm/kvm_ras.h>
     19#include <asm/kvm_asm.h>
     20#include <asm/kvm_emulate.h>
     21#include <asm/virt.h>
     22
     23#include "trace.h"
     24
     25static struct kvm_pgtable *hyp_pgtable;
     26static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
     27
     28static unsigned long hyp_idmap_start;
     29static unsigned long hyp_idmap_end;
     30static phys_addr_t hyp_idmap_vector;
     31
     32static unsigned long io_map_base;
     33
     34
     35/*
     36 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
     37 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
     38 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
     39 * long will also starve other vCPUs. We also have to make sure that the page
     40 * tables are not freed while the lock is released.
     41 */
     42static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
     43			      phys_addr_t end,
     44			      int (*fn)(struct kvm_pgtable *, u64, u64),
     45			      bool resched)
     46{
     47	int ret;
     48	u64 next;
     49
     50	do {
     51		struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
     52		if (!pgt)
     53			return -EINVAL;
     54
     55		next = stage2_pgd_addr_end(kvm, addr, end);
     56		ret = fn(pgt, addr, next - addr);
     57		if (ret)
     58			break;
     59
     60		if (resched && next != end)
     61			cond_resched_rwlock_write(&kvm->mmu_lock);
     62	} while (addr = next, addr != end);
     63
     64	return ret;
     65}
     66
     67#define stage2_apply_range_resched(kvm, addr, end, fn)			\
     68	stage2_apply_range(kvm, addr, end, fn, true)
     69
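/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the chunked-walk pattern used by stage2_apply_range() above, reduced to
 * standalone C. The ex_* names are hypothetical; the real code bounds each
 * chunk with stage2_pgd_addr_end() and yields the mmu_lock between chunks
 * via cond_resched_rwlock_write().
 */
typedef unsigned long long ex_addr_t;

static ex_addr_t ex_chunk_end(ex_addr_t addr, ex_addr_t end, ex_addr_t chunk)
{
	/* End of the power-of-two sized chunk containing addr, clamped to end. */
	ex_addr_t boundary = (addr + chunk) & ~(chunk - 1);

	return boundary < end ? boundary : end;
}

static int ex_apply_in_chunks(ex_addr_t addr, ex_addr_t end, ex_addr_t chunk,
			      int (*fn)(ex_addr_t addr, ex_addr_t size))
{
	int ret = 0;

	do {
		ex_addr_t next = ex_chunk_end(addr, end, chunk);

		ret = fn(addr, next - addr);
		if (ret)
			break;

		/* The real code may drop and re-take kvm->mmu_lock here. */
	} while (addr = next, addr != end);

	return ret;
}
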
     70static bool memslot_is_logging(struct kvm_memory_slot *memslot)
     71{
     72	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
     73}
     74
     75/**
     76 * kvm_flush_remote_tlbs() - flush all VM TLB entries
     77 * @kvm:	pointer to kvm structure.
     78 *
     79 * Interface to HYP function to flush all VM TLB entries
     80 */
     81void kvm_flush_remote_tlbs(struct kvm *kvm)
     82{
     83	++kvm->stat.generic.remote_tlb_flush_requests;
     84	kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
     85}
     86
     87static bool kvm_is_device_pfn(unsigned long pfn)
     88{
     89	return !pfn_is_map_memory(pfn);
     90}
     91
     92static void *stage2_memcache_zalloc_page(void *arg)
     93{
     94	struct kvm_mmu_memory_cache *mc = arg;
     95
     96	/* Allocated with __GFP_ZERO, so no need to zero */
     97	return kvm_mmu_memory_cache_alloc(mc);
     98}
     99
    100static void *kvm_host_zalloc_pages_exact(size_t size)
    101{
    102	return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
    103}
    104
    105static void kvm_host_get_page(void *addr)
    106{
    107	get_page(virt_to_page(addr));
    108}
    109
    110static void kvm_host_put_page(void *addr)
    111{
    112	put_page(virt_to_page(addr));
    113}
    114
    115static int kvm_host_page_count(void *addr)
    116{
    117	return page_count(virt_to_page(addr));
    118}
    119
    120static phys_addr_t kvm_host_pa(void *addr)
    121{
    122	return __pa(addr);
    123}
    124
    125static void *kvm_host_va(phys_addr_t phys)
    126{
    127	return __va(phys);
    128}
    129
    130static void clean_dcache_guest_page(void *va, size_t size)
    131{
    132	__clean_dcache_guest_page(va, size);
    133}
    134
    135static void invalidate_icache_guest_page(void *va, size_t size)
    136{
    137	__invalidate_icache_guest_page(va, size);
    138}
    139
    140/*
    141 * Unmapping vs dcache management:
    142 *
    143 * If a guest maps certain memory pages as uncached, all writes will
    144 * bypass the data cache and go directly to RAM.  However, the CPUs
    145 * can still speculate reads (not writes) and fill cache lines with
    146 * data.
    147 *
    148 * Those cache lines will be *clean* cache lines though, so a
    149 * clean+invalidate operation is equivalent to an invalidate
    150 * operation, because no cache lines are marked dirty.
    151 *
    152 * Those clean cache lines could be filled prior to an uncached write
    153 * by the guest, and the cache coherent IO subsystem would therefore
    154 * end up writing old data to disk.
    155 *
    156 * This is why right after unmapping a page/section and invalidating
    157 * the corresponding TLBs, we flush to make sure the IO subsystem will
    158 * never hit in the cache.
    159 *
    160 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
    161 * we then fully enforce cacheability of RAM, no matter what the guest
    162 * does.
    163 */
    164/**
     165 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
    166 * @mmu:   The KVM stage-2 MMU pointer
    167 * @start: The intermediate physical base address of the range to unmap
    168 * @size:  The size of the area to unmap
    169 * @may_block: Whether or not we are permitted to block
    170 *
    171 * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
    172 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
    173 * destroying the VM), otherwise another faulting VCPU may come in and mess
    174 * with things behind our backs.
    175 */
    176static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
    177				 bool may_block)
    178{
    179	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
    180	phys_addr_t end = start + size;
    181
    182	lockdep_assert_held_write(&kvm->mmu_lock);
    183	WARN_ON(size & ~PAGE_MASK);
    184	WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
    185				   may_block));
    186}
    187
    188static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
    189{
    190	__unmap_stage2_range(mmu, start, size, true);
    191}
    192
    193static void stage2_flush_memslot(struct kvm *kvm,
    194				 struct kvm_memory_slot *memslot)
    195{
    196	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
    197	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
    198
    199	stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
    200}
    201
    202/**
    203 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
    204 * @kvm: The struct kvm pointer
    205 *
    206 * Go through the stage 2 page tables and invalidate any cache lines
    207 * backing memory already mapped to the VM.
    208 */
    209static void stage2_flush_vm(struct kvm *kvm)
    210{
    211	struct kvm_memslots *slots;
    212	struct kvm_memory_slot *memslot;
    213	int idx, bkt;
    214
    215	idx = srcu_read_lock(&kvm->srcu);
    216	write_lock(&kvm->mmu_lock);
    217
    218	slots = kvm_memslots(kvm);
    219	kvm_for_each_memslot(memslot, bkt, slots)
    220		stage2_flush_memslot(kvm, memslot);
    221
    222	write_unlock(&kvm->mmu_lock);
    223	srcu_read_unlock(&kvm->srcu, idx);
    224}
    225
    226/**
    227 * free_hyp_pgds - free Hyp-mode page tables
    228 */
    229void free_hyp_pgds(void)
    230{
    231	mutex_lock(&kvm_hyp_pgd_mutex);
    232	if (hyp_pgtable) {
    233		kvm_pgtable_hyp_destroy(hyp_pgtable);
    234		kfree(hyp_pgtable);
    235		hyp_pgtable = NULL;
    236	}
    237	mutex_unlock(&kvm_hyp_pgd_mutex);
    238}
    239
    240static bool kvm_host_owns_hyp_mappings(void)
    241{
    242	if (is_kernel_in_hyp_mode())
    243		return false;
    244
    245	if (static_branch_likely(&kvm_protected_mode_initialized))
    246		return false;
    247
    248	/*
    249	 * This can happen at boot time when __create_hyp_mappings() is called
    250	 * after the hyp protection has been enabled, but the static key has
    251	 * not been flipped yet.
    252	 */
    253	if (!hyp_pgtable && is_protected_kvm_enabled())
    254		return false;
    255
    256	WARN_ON(!hyp_pgtable);
    257
    258	return true;
    259}
    260
    261int __create_hyp_mappings(unsigned long start, unsigned long size,
    262			  unsigned long phys, enum kvm_pgtable_prot prot)
    263{
    264	int err;
    265
    266	if (WARN_ON(!kvm_host_owns_hyp_mappings()))
    267		return -EINVAL;
    268
    269	mutex_lock(&kvm_hyp_pgd_mutex);
    270	err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
    271	mutex_unlock(&kvm_hyp_pgd_mutex);
    272
    273	return err;
    274}
    275
    276static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
    277{
    278	if (!is_vmalloc_addr(kaddr)) {
    279		BUG_ON(!virt_addr_valid(kaddr));
    280		return __pa(kaddr);
    281	} else {
    282		return page_to_phys(vmalloc_to_page(kaddr)) +
    283		       offset_in_page(kaddr);
    284	}
    285}
    286
    287struct hyp_shared_pfn {
    288	u64 pfn;
    289	int count;
    290	struct rb_node node;
    291};
    292
    293static DEFINE_MUTEX(hyp_shared_pfns_lock);
    294static struct rb_root hyp_shared_pfns = RB_ROOT;
    295
    296static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
    297					      struct rb_node **parent)
    298{
    299	struct hyp_shared_pfn *this;
    300
    301	*node = &hyp_shared_pfns.rb_node;
    302	*parent = NULL;
    303	while (**node) {
    304		this = container_of(**node, struct hyp_shared_pfn, node);
    305		*parent = **node;
    306		if (this->pfn < pfn)
    307			*node = &((**node)->rb_left);
    308		else if (this->pfn > pfn)
    309			*node = &((**node)->rb_right);
    310		else
    311			return this;
    312	}
    313
    314	return NULL;
    315}
    316
    317static int share_pfn_hyp(u64 pfn)
    318{
    319	struct rb_node **node, *parent;
    320	struct hyp_shared_pfn *this;
    321	int ret = 0;
    322
    323	mutex_lock(&hyp_shared_pfns_lock);
    324	this = find_shared_pfn(pfn, &node, &parent);
    325	if (this) {
    326		this->count++;
    327		goto unlock;
    328	}
    329
    330	this = kzalloc(sizeof(*this), GFP_KERNEL);
    331	if (!this) {
    332		ret = -ENOMEM;
    333		goto unlock;
    334	}
    335
    336	this->pfn = pfn;
    337	this->count = 1;
    338	rb_link_node(&this->node, parent, node);
    339	rb_insert_color(&this->node, &hyp_shared_pfns);
    340	ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1);
    341unlock:
    342	mutex_unlock(&hyp_shared_pfns_lock);
    343
    344	return ret;
    345}
    346
    347static int unshare_pfn_hyp(u64 pfn)
    348{
    349	struct rb_node **node, *parent;
    350	struct hyp_shared_pfn *this;
    351	int ret = 0;
    352
    353	mutex_lock(&hyp_shared_pfns_lock);
    354	this = find_shared_pfn(pfn, &node, &parent);
    355	if (WARN_ON(!this)) {
    356		ret = -ENOENT;
    357		goto unlock;
    358	}
    359
    360	this->count--;
    361	if (this->count)
    362		goto unlock;
    363
    364	rb_erase(&this->node, &hyp_shared_pfns);
    365	kfree(this);
    366	ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1);
    367unlock:
    368	mutex_unlock(&hyp_shared_pfns_lock);
    369
    370	return ret;
    371}
    372
    373int kvm_share_hyp(void *from, void *to)
    374{
    375	phys_addr_t start, end, cur;
    376	u64 pfn;
    377	int ret;
    378
    379	if (is_kernel_in_hyp_mode())
    380		return 0;
    381
    382	/*
    383	 * The share hcall maps things in the 'fixed-offset' region of the hyp
    384	 * VA space, so we can only share physically contiguous data-structures
    385	 * for now.
    386	 */
    387	if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to))
    388		return -EINVAL;
    389
    390	if (kvm_host_owns_hyp_mappings())
    391		return create_hyp_mappings(from, to, PAGE_HYP);
    392
    393	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
    394	end = PAGE_ALIGN(__pa(to));
    395	for (cur = start; cur < end; cur += PAGE_SIZE) {
    396		pfn = __phys_to_pfn(cur);
    397		ret = share_pfn_hyp(pfn);
    398		if (ret)
    399			return ret;
    400	}
    401
    402	return 0;
    403}
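
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the per-pfn reference counting that share_pfn_hyp()/unshare_pfn_hyp()
 * implement with an rb-tree, reduced to a flat table. The ex_* names are
 * hypothetical; the point is that the share hypercall is only issued on
 * the 0 -> 1 transition, and the unshare hypercall on the 1 -> 0 one.
 */
#define EX_MAX_TRACKED_PFNS 64

struct ex_shared_pfn {
	unsigned long long pfn;
	int count;
};

static struct ex_shared_pfn ex_shared[EX_MAX_TRACKED_PFNS];

static int ex_share_pfn(unsigned long long pfn,
			int (*share_hcall)(unsigned long long pfn))
{
	int i, free_slot = -1;

	for (i = 0; i < EX_MAX_TRACKED_PFNS; i++) {
		if (ex_shared[i].count && ex_shared[i].pfn == pfn) {
			ex_shared[i].count++;	/* already shared: only count */
			return 0;
		}
		if (!ex_shared[i].count && free_slot < 0)
			free_slot = i;
	}
	if (free_slot < 0)
		return -1;	/* table full; the real code allocates a node */

	ex_shared[free_slot].pfn = pfn;
	ex_shared[free_slot].count = 1;
	return share_hcall(pfn);	/* first user: tell the hypervisor */
}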
    404
    405void kvm_unshare_hyp(void *from, void *to)
    406{
    407	phys_addr_t start, end, cur;
    408	u64 pfn;
    409
    410	if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from)
    411		return;
    412
    413	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
    414	end = PAGE_ALIGN(__pa(to));
    415	for (cur = start; cur < end; cur += PAGE_SIZE) {
    416		pfn = __phys_to_pfn(cur);
    417		WARN_ON(unshare_pfn_hyp(pfn));
    418	}
    419}
    420
    421/**
    422 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
    423 * @from:	The virtual kernel start address of the range
    424 * @to:		The virtual kernel end address of the range (exclusive)
    425 * @prot:	The protection to be applied to this range
    426 *
    427 * The same virtual address as the kernel virtual address is also used
     428 * for the Hyp-mode mapping (modulo HYP_PAGE_OFFSET) of the same underlying
    429 * physical pages.
    430 */
    431int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
    432{
    433	phys_addr_t phys_addr;
    434	unsigned long virt_addr;
    435	unsigned long start = kern_hyp_va((unsigned long)from);
    436	unsigned long end = kern_hyp_va((unsigned long)to);
    437
    438	if (is_kernel_in_hyp_mode())
    439		return 0;
    440
    441	if (!kvm_host_owns_hyp_mappings())
    442		return -EPERM;
    443
    444	start = start & PAGE_MASK;
    445	end = PAGE_ALIGN(end);
    446
    447	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
    448		int err;
    449
    450		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
    451		err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
    452					    prot);
    453		if (err)
    454			return err;
    455	}
    456
    457	return 0;
    458}
    459
    460
    461/**
    462 * hyp_alloc_private_va_range - Allocates a private VA range.
    463 * @size:	The size of the VA range to reserve.
    464 * @haddr:	The hypervisor virtual start address of the allocation.
    465 *
    466 * The private virtual address (VA) range is allocated below io_map_base
    467 * and aligned based on the order of @size.
    468 *
    469 * Return: 0 on success or negative error code on failure.
    470 */
    471int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
    472{
    473	unsigned long base;
    474	int ret = 0;
    475
    476	mutex_lock(&kvm_hyp_pgd_mutex);
    477
    478	/*
    479	 * This assumes that we have enough space below the idmap
    480	 * page to allocate our VAs. If not, the check below will
     481	 * kick in. A potential alternative would be to detect that
    482	 * overflow and switch to an allocation above the idmap.
    483	 *
    484	 * The allocated size is always a multiple of PAGE_SIZE.
    485	 */
    486	base = io_map_base - PAGE_ALIGN(size);
    487
    488	/* Align the allocation based on the order of its size */
    489	base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));
    490
    491	/*
    492	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
    493	 * allocating the new area, as it would indicate we've
    494	 * overflowed the idmap/IO address range.
    495	 */
    496	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
    497		ret = -ENOMEM;
    498	else
    499		*haddr = io_map_base = base;
    500
    501	mutex_unlock(&kvm_hyp_pgd_mutex);
    502
    503	return ret;
    504}
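
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the arithmetic in hyp_alloc_private_va_range() above, assuming a
 * hypothetical 48-bit hyp VA space and 4KiB pages. The ex_* names are
 * mine; the real code aligns the base down to the smallest power of two
 * covering the size and fails if bit (VA_BITS - 1) flips, i.e. if the
 * allocation ran into the other half of the address space.
 */
#define EX_VA_BITS	48ULL
#define EX_PAGE		4096ULL

static unsigned long long ex_align_down(unsigned long long x, unsigned long long a)
{
	return x & ~(a - 1);	/* a must be a power of two */
}

static unsigned long long ex_pow2_covering(unsigned long long size)
{
	unsigned long long p = EX_PAGE;

	while (p < size)
		p <<= 1;
	return p;
}

static int ex_alloc_below(unsigned long long *io_base, unsigned long long size,
			  unsigned long long *haddr)
{
	/* Page-align the size, then carve the range out below *io_base. */
	unsigned long long base = *io_base - ex_align_down(size + EX_PAGE - 1, EX_PAGE);

	base = ex_align_down(base, ex_pow2_covering(size));

	if ((base ^ *io_base) & (1ULL << (EX_VA_BITS - 1)))
		return -1;	/* overflowed towards the idmap/IO region */

	*haddr = *io_base = base;
	return 0;
}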
    505
    506static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
    507					unsigned long *haddr,
    508					enum kvm_pgtable_prot prot)
    509{
    510	unsigned long addr;
    511	int ret = 0;
    512
    513	if (!kvm_host_owns_hyp_mappings()) {
    514		addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
    515					 phys_addr, size, prot);
    516		if (IS_ERR_VALUE(addr))
    517			return addr;
    518		*haddr = addr;
    519
    520		return 0;
    521	}
    522
    523	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
    524	ret = hyp_alloc_private_va_range(size, &addr);
    525	if (ret)
    526		return ret;
    527
    528	ret = __create_hyp_mappings(addr, size, phys_addr, prot);
    529	if (ret)
    530		return ret;
    531
    532	*haddr = addr + offset_in_page(phys_addr);
    533	return ret;
    534}
    535
    536/**
    537 * create_hyp_io_mappings - Map IO into both kernel and HYP
    538 * @phys_addr:	The physical start address which gets mapped
    539 * @size:	Size of the region being mapped
    540 * @kaddr:	Kernel VA for this mapping
    541 * @haddr:	HYP VA for this mapping
    542 */
    543int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
    544			   void __iomem **kaddr,
    545			   void __iomem **haddr)
    546{
    547	unsigned long addr;
    548	int ret;
    549
    550	if (is_protected_kvm_enabled())
    551		return -EPERM;
    552
    553	*kaddr = ioremap(phys_addr, size);
    554	if (!*kaddr)
    555		return -ENOMEM;
    556
    557	if (is_kernel_in_hyp_mode()) {
    558		*haddr = *kaddr;
    559		return 0;
    560	}
    561
    562	ret = __create_hyp_private_mapping(phys_addr, size,
    563					   &addr, PAGE_HYP_DEVICE);
    564	if (ret) {
    565		iounmap(*kaddr);
    566		*kaddr = NULL;
    567		*haddr = NULL;
    568		return ret;
    569	}
    570
    571	*haddr = (void __iomem *)addr;
    572	return 0;
    573}
    574
    575/**
    576 * create_hyp_exec_mappings - Map an executable range into HYP
    577 * @phys_addr:	The physical start address which gets mapped
    578 * @size:	Size of the region being mapped
    579 * @haddr:	HYP VA for this mapping
    580 */
    581int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
    582			     void **haddr)
    583{
    584	unsigned long addr;
    585	int ret;
    586
    587	BUG_ON(is_kernel_in_hyp_mode());
    588
    589	ret = __create_hyp_private_mapping(phys_addr, size,
    590					   &addr, PAGE_HYP_EXEC);
    591	if (ret) {
    592		*haddr = NULL;
    593		return ret;
    594	}
    595
    596	*haddr = (void *)addr;
    597	return 0;
    598}
    599
    600static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
    601	/* We shouldn't need any other callback to walk the PT */
    602	.phys_to_virt		= kvm_host_va,
    603};
    604
    605static int get_user_mapping_size(struct kvm *kvm, u64 addr)
    606{
    607	struct kvm_pgtable pgt = {
    608		.pgd		= (kvm_pte_t *)kvm->mm->pgd,
    609		.ia_bits	= VA_BITS,
    610		.start_level	= (KVM_PGTABLE_MAX_LEVELS -
    611				   CONFIG_PGTABLE_LEVELS),
    612		.mm_ops		= &kvm_user_mm_ops,
    613	};
    614	kvm_pte_t pte = 0;	/* Keep GCC quiet... */
    615	u32 level = ~0;
    616	int ret;
    617
    618	ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
    619	VM_BUG_ON(ret);
    620	VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS);
    621	VM_BUG_ON(!(pte & PTE_VALID));
    622
    623	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
    624}
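
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * how the level returned by kvm_pgtable_get_leaf() maps to a mapping size
 * on a hypothetical 4KiB-granule configuration, mirroring the
 * BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)) computation above.
 */
#define EX_GRANULE_SHIFT 12	/* 4KiB granule: 9 bits of index per level */

static unsigned long long ex_level_to_size(int level)
{
	/* Level 3 -> 4KiB, level 2 -> 2MiB, level 1 -> 1GiB. */
	int shift = (EX_GRANULE_SHIFT - 3) * (4 - level) + 3;

	return 1ULL << shift;
}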
    625
    626static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
    627	.zalloc_page		= stage2_memcache_zalloc_page,
    628	.zalloc_pages_exact	= kvm_host_zalloc_pages_exact,
    629	.free_pages_exact	= free_pages_exact,
    630	.get_page		= kvm_host_get_page,
    631	.put_page		= kvm_host_put_page,
    632	.page_count		= kvm_host_page_count,
    633	.phys_to_virt		= kvm_host_va,
    634	.virt_to_phys		= kvm_host_pa,
    635	.dcache_clean_inval_poc	= clean_dcache_guest_page,
    636	.icache_inval_pou	= invalidate_icache_guest_page,
    637};
    638
    639/**
     640 * kvm_init_stage2_mmu - Initialise an S2 MMU structure
    641 * @kvm:	The pointer to the KVM structure
    642 * @mmu:	The pointer to the s2 MMU structure
    643 *
    644 * Allocates only the stage-2 HW PGD level table(s).
    645 * Note we don't need locking here as this is only called when the VM is
    646 * created, which can only be done once.
    647 */
    648int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
    649{
    650	int cpu, err;
    651	struct kvm_pgtable *pgt;
    652
    653	if (mmu->pgt != NULL) {
    654		kvm_err("kvm_arch already initialized?\n");
    655		return -EINVAL;
    656	}
    657
    658	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
    659	if (!pgt)
    660		return -ENOMEM;
    661
    662	mmu->arch = &kvm->arch;
    663	err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops);
    664	if (err)
    665		goto out_free_pgtable;
    666
    667	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
    668	if (!mmu->last_vcpu_ran) {
    669		err = -ENOMEM;
    670		goto out_destroy_pgtable;
    671	}
    672
    673	for_each_possible_cpu(cpu)
    674		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
    675
    676	mmu->pgt = pgt;
    677	mmu->pgd_phys = __pa(pgt->pgd);
    678	return 0;
    679
    680out_destroy_pgtable:
    681	kvm_pgtable_stage2_destroy(pgt);
    682out_free_pgtable:
    683	kfree(pgt);
    684	return err;
    685}
    686
    687static void stage2_unmap_memslot(struct kvm *kvm,
    688				 struct kvm_memory_slot *memslot)
    689{
    690	hva_t hva = memslot->userspace_addr;
    691	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
    692	phys_addr_t size = PAGE_SIZE * memslot->npages;
    693	hva_t reg_end = hva + size;
    694
    695	/*
    696	 * A memory region could potentially cover multiple VMAs, and any holes
    697	 * between them, so iterate over all of them to find out if we should
    698	 * unmap any of them.
    699	 *
    700	 *     +--------------------------------------------+
    701	 * +---------------+----------------+   +----------------+
    702	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
    703	 * +---------------+----------------+   +----------------+
    704	 *     |               memory region                |
    705	 *     +--------------------------------------------+
    706	 */
    707	do {
    708		struct vm_area_struct *vma;
    709		hva_t vm_start, vm_end;
    710
    711		vma = find_vma_intersection(current->mm, hva, reg_end);
    712		if (!vma)
    713			break;
    714
    715		/*
    716		 * Take the intersection of this VMA with the memory region
    717		 */
    718		vm_start = max(hva, vma->vm_start);
    719		vm_end = min(reg_end, vma->vm_end);
    720
    721		if (!(vma->vm_flags & VM_PFNMAP)) {
    722			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
    723			unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
    724		}
    725		hva = vm_end;
    726	} while (hva < reg_end);
    727}
    728
    729/**
    730 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
    731 * @kvm: The struct kvm pointer
    732 *
    733 * Go through the memregions and unmap any regular RAM
    734 * backing memory already mapped to the VM.
    735 */
    736void stage2_unmap_vm(struct kvm *kvm)
    737{
    738	struct kvm_memslots *slots;
    739	struct kvm_memory_slot *memslot;
    740	int idx, bkt;
    741
    742	idx = srcu_read_lock(&kvm->srcu);
    743	mmap_read_lock(current->mm);
    744	write_lock(&kvm->mmu_lock);
    745
    746	slots = kvm_memslots(kvm);
    747	kvm_for_each_memslot(memslot, bkt, slots)
    748		stage2_unmap_memslot(kvm, memslot);
    749
    750	write_unlock(&kvm->mmu_lock);
    751	mmap_read_unlock(current->mm);
    752	srcu_read_unlock(&kvm->srcu, idx);
    753}
    754
    755void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
    756{
    757	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
    758	struct kvm_pgtable *pgt = NULL;
    759
    760	write_lock(&kvm->mmu_lock);
    761	pgt = mmu->pgt;
    762	if (pgt) {
    763		mmu->pgd_phys = 0;
    764		mmu->pgt = NULL;
    765		free_percpu(mmu->last_vcpu_ran);
    766	}
    767	write_unlock(&kvm->mmu_lock);
    768
    769	if (pgt) {
    770		kvm_pgtable_stage2_destroy(pgt);
    771		kfree(pgt);
    772	}
    773}
    774
    775/**
    776 * kvm_phys_addr_ioremap - map a device range to guest IPA
    777 *
    778 * @kvm:	The KVM pointer
    779 * @guest_ipa:	The IPA at which to insert the mapping
    780 * @pa:		The physical address of the device
    781 * @size:	The size of the mapping
    782 * @writable:   Whether or not to create a writable mapping
    783 */
    784int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
    785			  phys_addr_t pa, unsigned long size, bool writable)
    786{
    787	phys_addr_t addr;
    788	int ret = 0;
    789	struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
    790	struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
    791	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
    792				     KVM_PGTABLE_PROT_R |
    793				     (writable ? KVM_PGTABLE_PROT_W : 0);
    794
    795	if (is_protected_kvm_enabled())
    796		return -EPERM;
    797
    798	size += offset_in_page(guest_ipa);
    799	guest_ipa &= PAGE_MASK;
    800
    801	for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
    802		ret = kvm_mmu_topup_memory_cache(&cache,
    803						 kvm_mmu_cache_min_pages(kvm));
    804		if (ret)
    805			break;
    806
    807		write_lock(&kvm->mmu_lock);
    808		ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
    809					     &cache);
    810		write_unlock(&kvm->mmu_lock);
    811		if (ret)
    812			break;
    813
    814		pa += PAGE_SIZE;
    815	}
    816
    817	kvm_mmu_free_memory_cache(&cache);
    818	return ret;
    819}
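
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * how kvm_phys_addr_ioremap() above turns an unaligned (guest_ipa, size)
 * request into whole-page mappings. The ex_* names and the map_page()
 * callback are hypothetical stand-ins for kvm_pgtable_stage2_map().
 */
#define EX_PG 4096ULL

static int ex_map_device_range(unsigned long long guest_ipa, unsigned long long pa,
			       unsigned long long size,
			       int (*map_page)(unsigned long long ipa,
					       unsigned long long pa))
{
	unsigned long long addr;
	int ret = 0;

	/* Grow the size by the sub-page offset, then page-align the base. */
	size += guest_ipa & (EX_PG - 1);
	guest_ipa &= ~(EX_PG - 1);

	for (addr = guest_ipa; addr < guest_ipa + size; addr += EX_PG) {
		ret = map_page(addr, pa);
		if (ret)
			break;
		pa += EX_PG;
	}

	return ret;
}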
    820
    821/**
    822 * stage2_wp_range() - write protect stage2 memory region range
    823 * @mmu:        The KVM stage-2 MMU pointer
    824 * @addr:	Start address of range
    825 * @end:	End address of range
    826 */
    827static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
    828{
    829	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
    830	stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
    831}
    832
    833/**
    834 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
    835 * @kvm:	The KVM pointer
    836 * @slot:	The memory slot to write protect
    837 *
     838 * Called to start logging dirty pages when the KVM_MEM_LOG_DIRTY_PAGES
     839 * flag is set on a memory region. After this function returns, all present
     840 * PUD, PMD and PTE entries in the memory region are write protected.
     841 * Afterwards the dirty page log can be read.
    842 *
    843 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
    844 * serializing operations for VM memory regions.
    845 */
    846static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
    847{
    848	struct kvm_memslots *slots = kvm_memslots(kvm);
    849	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
    850	phys_addr_t start, end;
    851
    852	if (WARN_ON_ONCE(!memslot))
    853		return;
    854
    855	start = memslot->base_gfn << PAGE_SHIFT;
    856	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
    857
    858	write_lock(&kvm->mmu_lock);
    859	stage2_wp_range(&kvm->arch.mmu, start, end);
    860	write_unlock(&kvm->mmu_lock);
    861	kvm_flush_remote_tlbs(kvm);
    862}
    863
    864/**
    865 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
    866 * @kvm:	The KVM pointer
    867 * @slot:	The memory slot associated with mask
    868 * @gfn_offset:	The gfn offset in memory slot
    869 * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
    870 *		slot to be write protected
    871 *
     872 * Walks the bits set in mask and write protects the associated PTEs. The
     873 * caller must acquire kvm_mmu_lock.
    874 */
    875static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
    876		struct kvm_memory_slot *slot,
    877		gfn_t gfn_offset, unsigned long mask)
    878{
    879	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
    880	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
    881	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
    882
    883	stage2_wp_range(&kvm->arch.mmu, start, end);
    884}
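
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * how the dirty-bitmap word handled by kvm_mmu_write_protect_pt_masked()
 * above becomes a single [start, end) address range. The ex_* names are
 * mine; the kernel uses __ffs()/__fls() where this open-codes the scans,
 * and mask is assumed to be non-zero.
 */
#define EX_PAGE_SHIFT 12

static void ex_mask_to_range(unsigned long long base_gfn, unsigned long mask,
			     unsigned long long *start, unsigned long long *end)
{
	unsigned int bit, first = 8 * sizeof(mask), last = 0;

	for (bit = 0; bit < 8 * sizeof(mask); bit++) {
		if (!(mask & (1UL << bit)))
			continue;
		if (first == 8 * sizeof(mask))
			first = bit;	/* lowest set bit (__ffs) */
		last = bit;		/* highest set bit so far (__fls) */
	}

	/* One contiguous range covering every set bit in the word. */
	*start = (base_gfn + first) << EX_PAGE_SHIFT;
	*end   = (base_gfn + last + 1) << EX_PAGE_SHIFT;
}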
    885
    886/*
    887 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
    888 * dirty pages.
    889 *
    890 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
    891 * enable dirty logging for them.
    892 */
    893void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
    894		struct kvm_memory_slot *slot,
    895		gfn_t gfn_offset, unsigned long mask)
    896{
    897	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
    898}
    899
    900static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
    901{
    902	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
    903}
    904
    905static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
    906					       unsigned long hva,
    907					       unsigned long map_size)
    908{
    909	gpa_t gpa_start;
    910	hva_t uaddr_start, uaddr_end;
    911	size_t size;
    912
    913	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
    914	if (map_size == PAGE_SIZE)
    915		return true;
    916
    917	size = memslot->npages * PAGE_SIZE;
    918
    919	gpa_start = memslot->base_gfn << PAGE_SHIFT;
    920
    921	uaddr_start = memslot->userspace_addr;
    922	uaddr_end = uaddr_start + size;
    923
    924	/*
    925	 * Pages belonging to memslots that don't have the same alignment
    926	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
    927	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
    928	 *
    929	 * Consider a layout like the following:
    930	 *
    931	 *    memslot->userspace_addr:
    932	 *    +-----+--------------------+--------------------+---+
    933	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
    934	 *    +-----+--------------------+--------------------+---+
    935	 *
    936	 *    memslot->base_gfn << PAGE_SHIFT:
    937	 *      +---+--------------------+--------------------+-----+
    938	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
    939	 *      +---+--------------------+--------------------+-----+
    940	 *
    941	 * If we create those stage-2 blocks, we'll end up with this incorrect
    942	 * mapping:
    943	 *   d -> f
    944	 *   e -> g
    945	 *   f -> h
    946	 */
    947	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
    948		return false;
    949
    950	/*
    951	 * Next, let's make sure we're not trying to map anything not covered
    952	 * by the memslot. This means we have to prohibit block size mappings
    953	 * for the beginning and end of a non-block aligned and non-block sized
    954	 * memory slot (illustrated by the head and tail parts of the
    955	 * userspace view above containing pages 'abcde' and 'xyz',
    956	 * respectively).
    957	 *
    958	 * Note that it doesn't matter if we do the check using the
    959	 * userspace_addr or the base_gfn, as both are equally aligned (per
    960	 * the check above) and equally sized.
    961	 */
    962	return (hva & ~(map_size - 1)) >= uaddr_start &&
    963	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
    964}
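
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the two conditions checked by fault_supports_stage2_huge_mapping()
 * above. The ex_* names are mine; map_size must be a power of two
 * (e.g. 2MiB for a PMD block mapping).
 */
static int ex_can_use_block_mapping(unsigned long long gpa_start,
				    unsigned long long uaddr_start,
				    unsigned long long uaddr_end,
				    unsigned long long hva,
				    unsigned long long map_size)
{
	/* 1. IPA and userspace VA must be congruent modulo the block size. */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return 0;

	/* 2. The block containing hva must lie fully inside the memslot's VAs. */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}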
    965
    966/*
    967 * Check if the given hva is backed by a transparent huge page (THP) and
    968 * whether it can be mapped using block mapping in stage2. If so, adjust
    969 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
    970 * supported. This will need to be updated to support other THP sizes.
    971 *
    972 * Returns the size of the mapping.
    973 */
    974static unsigned long
    975transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
    976			    unsigned long hva, kvm_pfn_t *pfnp,
    977			    phys_addr_t *ipap)
    978{
    979	kvm_pfn_t pfn = *pfnp;
    980
    981	/*
    982	 * Make sure the adjustment is done only for THP pages. Also make
    983	 * sure that the HVA and IPA are sufficiently aligned and that the
    984	 * block map is contained within the memslot.
    985	 */
    986	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
    987	    get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
    988		/*
    989		 * The address we faulted on is backed by a transparent huge
    990		 * page.  However, because we map the compound huge page and
    991		 * not the individual tail page, we need to transfer the
    992		 * refcount to the head page.  We have to be careful that the
    993		 * THP doesn't start to split while we are adjusting the
    994		 * refcounts.
    995		 *
    996		 * We are sure this doesn't happen, because mmu_notifier_retry
    997		 * was successful and we are holding the mmu_lock, so if this
    998		 * THP is trying to split, it will be blocked in the mmu
    999		 * notifier before touching any of the pages, specifically
   1000		 * before being able to call __split_huge_page_refcount().
   1001		 *
   1002		 * We can therefore safely transfer the refcount from PG_tail
   1003		 * to PG_head and switch the pfn from a tail page to the head
   1004		 * page accordingly.
   1005		 */
   1006		*ipap &= PMD_MASK;
   1007		kvm_release_pfn_clean(pfn);
   1008		pfn &= ~(PTRS_PER_PMD - 1);
   1009		get_page(pfn_to_page(pfn));
   1010		*pfnp = pfn;
   1011
   1012		return PMD_SIZE;
   1013	}
   1014
   1015	/* Use page mapping if we cannot use block mapping. */
   1016	return PAGE_SIZE;
   1017}
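
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the alignment step at the end of transparent_hugepage_adjust() above,
 * assuming 4KiB pages and a 2MiB PMD block (512 page frames). The ex_*
 * names are hypothetical; the compound-page refcount transfer is omitted.
 */
#define EX_PTRS_PER_PMD	512ULL			/* 2MiB / 4KiB */
#define EX_PMD_BYTES	(EX_PTRS_PER_PMD * 4096ULL)

static void ex_adjust_to_pmd_block(unsigned long long *ipa, unsigned long long *pfn)
{
	*ipa &= ~(EX_PMD_BYTES - 1);		/* IPA down to the 2MiB boundary */
	*pfn &= ~(EX_PTRS_PER_PMD - 1);		/* pfn down to the matching head frame */
}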
   1018
   1019static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
   1020{
   1021	unsigned long pa;
   1022
   1023	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
   1024		return huge_page_shift(hstate_vma(vma));
   1025
   1026	if (!(vma->vm_flags & VM_PFNMAP))
   1027		return PAGE_SHIFT;
   1028
   1029	VM_BUG_ON(is_vm_hugetlb_page(vma));
   1030
   1031	pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
   1032
   1033#ifndef __PAGETABLE_PMD_FOLDED
   1034	if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
   1035	    ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
   1036	    ALIGN(hva, PUD_SIZE) <= vma->vm_end)
   1037		return PUD_SHIFT;
   1038#endif
   1039
   1040	if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
   1041	    ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
   1042	    ALIGN(hva, PMD_SIZE) <= vma->vm_end)
   1043		return PMD_SHIFT;
   1044
   1045	return PAGE_SHIFT;
   1046}
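
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the block-size selection done by get_vma_page_shift() above for a
 * PFNMAP VMA, with hypothetical 4KiB-granule block sizes. A block shift
 * is usable when the VA and PA are congruent modulo the block size and
 * the ALIGN_DOWN/ALIGN bounds stay inside the VMA. The ex_* names are mine.
 */
static unsigned long long ex_round_up(unsigned long long x, unsigned long long a)
{
	return (x + a - 1) & ~(a - 1);
}

static int ex_block_fits(unsigned long long hva, unsigned long long pa,
			 unsigned long long vm_start, unsigned long long vm_end,
			 unsigned long long block)
{
	return (hva & (block - 1)) == (pa & (block - 1)) &&
	       (hva & ~(block - 1)) >= vm_start &&
	       ex_round_up(hva, block) <= vm_end;
}

static int ex_vma_page_shift(unsigned long long hva, unsigned long long pa,
			     unsigned long long vm_start, unsigned long long vm_end)
{
	if (ex_block_fits(hva, pa, vm_start, vm_end, 1ULL << 30))
		return 30;	/* 1GiB PUD block */
	if (ex_block_fits(hva, pa, vm_start, vm_end, 1ULL << 21))
		return 21;	/* 2MiB PMD block */
	return 12;		/* fall back to 4KiB pages */
}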
   1047
   1048/*
   1049 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
   1050 * able to see the page's tags and therefore they must be initialised first. If
   1051 * PG_mte_tagged is set, tags have already been initialised.
   1052 *
   1053 * The race in the test/set of the PG_mte_tagged flag is handled by:
    1054 * - preventing VM_SHARED mappings in a memslot with MTE, which stops two VMs
    1055 *   racing to sanitise the same page
   1056 * - mmap_lock protects between a VM faulting a page in and the VMM performing
   1057 *   an mprotect() to add VM_MTE
   1058 */
   1059static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
   1060			     unsigned long size)
   1061{
   1062	unsigned long i, nr_pages = size >> PAGE_SHIFT;
   1063	struct page *page;
   1064
   1065	if (!kvm_has_mte(kvm))
   1066		return 0;
   1067
   1068	/*
   1069	 * pfn_to_online_page() is used to reject ZONE_DEVICE pages
   1070	 * that may not support tags.
   1071	 */
   1072	page = pfn_to_online_page(pfn);
   1073
   1074	if (!page)
   1075		return -EFAULT;
   1076
   1077	for (i = 0; i < nr_pages; i++, page++) {
   1078		if (!test_bit(PG_mte_tagged, &page->flags)) {
   1079			mte_clear_page_tags(page_address(page));
   1080			set_bit(PG_mte_tagged, &page->flags);
   1081		}
   1082	}
   1083
   1084	return 0;
   1085}
   1086
   1087static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
   1088			  struct kvm_memory_slot *memslot, unsigned long hva,
   1089			  unsigned long fault_status)
   1090{
   1091	int ret = 0;
   1092	bool write_fault, writable, force_pte = false;
   1093	bool exec_fault;
   1094	bool device = false;
   1095	bool shared;
   1096	unsigned long mmu_seq;
   1097	struct kvm *kvm = vcpu->kvm;
   1098	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
   1099	struct vm_area_struct *vma;
   1100	short vma_shift;
   1101	gfn_t gfn;
   1102	kvm_pfn_t pfn;
   1103	bool logging_active = memslot_is_logging(memslot);
   1104	bool use_read_lock = false;
   1105	unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
   1106	unsigned long vma_pagesize, fault_granule;
   1107	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
   1108	struct kvm_pgtable *pgt;
   1109
   1110	fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
   1111	write_fault = kvm_is_write_fault(vcpu);
   1112	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
   1113	VM_BUG_ON(write_fault && exec_fault);
   1114
   1115	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
   1116		kvm_err("Unexpected L2 read permission error\n");
   1117		return -EFAULT;
   1118	}
   1119
   1120	/*
   1121	 * Let's check if we will get back a huge page backed by hugetlbfs, or
    1122	 * get a block mapping for a device MMIO region.
   1123	 */
   1124	mmap_read_lock(current->mm);
   1125	vma = vma_lookup(current->mm, hva);
   1126	if (unlikely(!vma)) {
   1127		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
   1128		mmap_read_unlock(current->mm);
   1129		return -EFAULT;
   1130	}
   1131
   1132	/*
   1133	 * logging_active is guaranteed to never be true for VM_PFNMAP
   1134	 * memslots.
   1135	 */
   1136	if (logging_active) {
   1137		force_pte = true;
   1138		vma_shift = PAGE_SHIFT;
   1139		use_read_lock = (fault_status == FSC_PERM && write_fault &&
   1140				 fault_granule == PAGE_SIZE);
   1141	} else {
   1142		vma_shift = get_vma_page_shift(vma, hva);
   1143	}
   1144
   1145	shared = (vma->vm_flags & VM_SHARED);
   1146
   1147	switch (vma_shift) {
   1148#ifndef __PAGETABLE_PMD_FOLDED
   1149	case PUD_SHIFT:
   1150		if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
   1151			break;
   1152		fallthrough;
   1153#endif
   1154	case CONT_PMD_SHIFT:
   1155		vma_shift = PMD_SHIFT;
   1156		fallthrough;
   1157	case PMD_SHIFT:
   1158		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
   1159			break;
   1160		fallthrough;
   1161	case CONT_PTE_SHIFT:
   1162		vma_shift = PAGE_SHIFT;
   1163		force_pte = true;
   1164		fallthrough;
   1165	case PAGE_SHIFT:
   1166		break;
   1167	default:
   1168		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
   1169	}
   1170
   1171	vma_pagesize = 1UL << vma_shift;
   1172	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
   1173		fault_ipa &= ~(vma_pagesize - 1);
   1174
   1175	gfn = fault_ipa >> PAGE_SHIFT;
   1176	mmap_read_unlock(current->mm);
   1177
   1178	/*
   1179	 * Permission faults just need to update the existing leaf entry,
   1180	 * and so normally don't require allocations from the memcache. The
   1181	 * only exception to this is when dirty logging is enabled at runtime
   1182	 * and a write fault needs to collapse a block entry into a table.
   1183	 */
   1184	if (fault_status != FSC_PERM || (logging_active && write_fault)) {
   1185		ret = kvm_mmu_topup_memory_cache(memcache,
   1186						 kvm_mmu_cache_min_pages(kvm));
   1187		if (ret)
   1188			return ret;
   1189	}
   1190
   1191	mmu_seq = vcpu->kvm->mmu_notifier_seq;
   1192	/*
   1193	 * Ensure the read of mmu_notifier_seq happens before we call
   1194	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
    1195	 * the page we just got a reference to getting unmapped before we have a
    1196	 * chance to grab the mmu_lock, which ensures that if the page gets
   1197	 * unmapped afterwards, the call to kvm_unmap_gfn will take it away
   1198	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
   1199	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
   1200	 *
   1201	 * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is
   1202	 * used to avoid unnecessary overhead introduced to locate the memory
   1203	 * slot because it's always fixed even @gfn is adjusted for huge pages.
   1204	 */
   1205	smp_rmb();
   1206
   1207	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
   1208				   write_fault, &writable, NULL);
   1209	if (pfn == KVM_PFN_ERR_HWPOISON) {
   1210		kvm_send_hwpoison_signal(hva, vma_shift);
   1211		return 0;
   1212	}
   1213	if (is_error_noslot_pfn(pfn))
   1214		return -EFAULT;
   1215
   1216	if (kvm_is_device_pfn(pfn)) {
   1217		/*
   1218		 * If the page was identified as device early by looking at
   1219		 * the VMA flags, vma_pagesize is already representing the
   1220		 * largest quantity we can map.  If instead it was mapped
   1221		 * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
   1222		 * and must not be upgraded.
   1223		 *
   1224		 * In both cases, we don't let transparent_hugepage_adjust()
   1225		 * change things at the last minute.
   1226		 */
   1227		device = true;
   1228	} else if (logging_active && !write_fault) {
   1229		/*
   1230		 * Only actually map the page as writable if this was a write
   1231		 * fault.
   1232		 */
   1233		writable = false;
   1234	}
   1235
   1236	if (exec_fault && device)
   1237		return -ENOEXEC;
   1238
   1239	/*
    1240	 * To reduce MMU contention and enhance concurrency during dirty
    1241	 * logging, only acquire the read lock for permission
   1242	 * relaxation.
   1243	 */
   1244	if (use_read_lock)
   1245		read_lock(&kvm->mmu_lock);
   1246	else
   1247		write_lock(&kvm->mmu_lock);
   1248	pgt = vcpu->arch.hw_mmu->pgt;
   1249	if (mmu_notifier_retry(kvm, mmu_seq))
   1250		goto out_unlock;
   1251
   1252	/*
   1253	 * If we are not forced to use page mapping, check if we are
   1254	 * backed by a THP and thus use block mapping if possible.
   1255	 */
   1256	if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
   1257		if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE)
   1258			vma_pagesize = fault_granule;
   1259		else
   1260			vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
   1261								   hva, &pfn,
   1262								   &fault_ipa);
   1263	}
   1264
   1265	if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
   1266		/* Check the VMM hasn't introduced a new VM_SHARED VMA */
   1267		if (!shared)
   1268			ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
   1269		else
   1270			ret = -EFAULT;
   1271		if (ret)
   1272			goto out_unlock;
   1273	}
   1274
   1275	if (writable)
   1276		prot |= KVM_PGTABLE_PROT_W;
   1277
   1278	if (exec_fault)
   1279		prot |= KVM_PGTABLE_PROT_X;
   1280
   1281	if (device)
   1282		prot |= KVM_PGTABLE_PROT_DEVICE;
   1283	else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
   1284		prot |= KVM_PGTABLE_PROT_X;
   1285
   1286	/*
   1287	 * Under the premise of getting a FSC_PERM fault, we just need to relax
   1288	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
   1289	 * kvm_pgtable_stage2_map() should be called to change block size.
   1290	 */
   1291	if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
   1292		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
   1293	} else {
   1294		WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n");
   1295
   1296		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
   1297					     __pfn_to_phys(pfn), prot,
   1298					     memcache);
   1299	}
   1300
   1301	/* Mark the page dirty only if the fault is handled successfully */
   1302	if (writable && !ret) {
   1303		kvm_set_pfn_dirty(pfn);
   1304		mark_page_dirty_in_slot(kvm, memslot, gfn);
   1305	}
   1306
   1307out_unlock:
   1308	if (use_read_lock)
   1309		read_unlock(&kvm->mmu_lock);
   1310	else
   1311		write_unlock(&kvm->mmu_lock);
   1312	kvm_set_pfn_accessed(pfn);
   1313	kvm_release_pfn_clean(pfn);
   1314	return ret != -EAGAIN ? ret : 0;
   1315}
   1316
   1317/* Resolve the access fault by making the page young again. */
   1318static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
   1319{
   1320	pte_t pte;
   1321	kvm_pte_t kpte;
   1322	struct kvm_s2_mmu *mmu;
   1323
   1324	trace_kvm_access_fault(fault_ipa);
   1325
   1326	write_lock(&vcpu->kvm->mmu_lock);
   1327	mmu = vcpu->arch.hw_mmu;
   1328	kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
   1329	write_unlock(&vcpu->kvm->mmu_lock);
   1330
   1331	pte = __pte(kpte);
   1332	if (pte_valid(pte))
   1333		kvm_set_pfn_accessed(pte_pfn(pte));
   1334}
   1335
   1336/**
   1337 * kvm_handle_guest_abort - handles all 2nd stage aborts
   1338 * @vcpu:	the VCPU pointer
   1339 *
   1340 * Any abort that gets to the host is almost guaranteed to be caused by a
    1341 * missing second stage translation table entry, which means that either the
    1342 * guest simply needs more memory and we must allocate an appropriate page, or
    1343 * the guest tried to access I/O memory, which is emulated by user space. The
    1344 * distinction is based on the IPA causing the fault and whether this
   1345 * memory region has been registered as standard RAM by user space.
   1346 */
   1347int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
   1348{
   1349	unsigned long fault_status;
   1350	phys_addr_t fault_ipa;
   1351	struct kvm_memory_slot *memslot;
   1352	unsigned long hva;
   1353	bool is_iabt, write_fault, writable;
   1354	gfn_t gfn;
   1355	int ret, idx;
   1356
   1357	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
   1358
   1359	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
   1360	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
   1361
   1362	if (fault_status == FSC_FAULT) {
   1363		/* Beyond sanitised PARange (which is the IPA limit) */
   1364		if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
   1365			kvm_inject_size_fault(vcpu);
   1366			return 1;
   1367		}
   1368
   1369		/* Falls between the IPA range and the PARange? */
   1370		if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) {
   1371			fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
   1372
   1373			if (is_iabt)
   1374				kvm_inject_pabt(vcpu, fault_ipa);
   1375			else
   1376				kvm_inject_dabt(vcpu, fault_ipa);
   1377			return 1;
   1378		}
   1379	}
   1380
   1381	/* Synchronous External Abort? */
   1382	if (kvm_vcpu_abt_issea(vcpu)) {
   1383		/*
   1384		 * For RAS the host kernel may handle this abort.
   1385		 * There is no need to pass the error into the guest.
   1386		 */
   1387		if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
   1388			kvm_inject_vabt(vcpu);
   1389
   1390		return 1;
   1391	}
   1392
   1393	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
   1394			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
   1395
    1396	/* The stage-2 fault must be a translation, permission or access fault */
   1397	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
   1398	    fault_status != FSC_ACCESS) {
   1399		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
   1400			kvm_vcpu_trap_get_class(vcpu),
   1401			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
   1402			(unsigned long)kvm_vcpu_get_esr(vcpu));
   1403		return -EFAULT;
   1404	}
   1405
   1406	idx = srcu_read_lock(&vcpu->kvm->srcu);
   1407
   1408	gfn = fault_ipa >> PAGE_SHIFT;
   1409	memslot = gfn_to_memslot(vcpu->kvm, gfn);
   1410	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
   1411	write_fault = kvm_is_write_fault(vcpu);
   1412	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
   1413		/*
   1414		 * The guest has put either its instructions or its page-tables
   1415		 * somewhere it shouldn't have. Userspace won't be able to do
   1416		 * anything about this (there's no syndrome for a start), so
   1417		 * re-inject the abort back into the guest.
   1418		 */
   1419		if (is_iabt) {
   1420			ret = -ENOEXEC;
   1421			goto out;
   1422		}
   1423
   1424		if (kvm_vcpu_abt_iss1tw(vcpu)) {
   1425			kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
   1426			ret = 1;
   1427			goto out_unlock;
   1428		}
   1429
   1430		/*
   1431		 * Check for a cache maintenance operation. Since we
   1432		 * ended-up here, we know it is outside of any memory
   1433		 * slot. But we can't find out if that is for a device,
   1434		 * or if the guest is just being stupid. The only thing
   1435		 * we know for sure is that this range cannot be cached.
   1436		 *
   1437		 * So let's assume that the guest is just being
   1438		 * cautious, and skip the instruction.
   1439		 */
   1440		if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
   1441			kvm_incr_pc(vcpu);
   1442			ret = 1;
   1443			goto out_unlock;
   1444		}
   1445
   1446		/*
   1447		 * The IPA is reported as [MAX:12], so we need to
   1448		 * complement it with the bottom 12 bits from the
   1449		 * faulting VA. This is always 12 bits, irrespective
   1450		 * of the page size.
   1451		 */
   1452		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
   1453		ret = io_mem_abort(vcpu, fault_ipa);
   1454		goto out_unlock;
   1455	}
   1456
   1457	/* Userspace should not be able to register out-of-bounds IPAs */
   1458	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
   1459
   1460	if (fault_status == FSC_ACCESS) {
   1461		handle_access_fault(vcpu, fault_ipa);
   1462		ret = 1;
   1463		goto out_unlock;
   1464	}
   1465
   1466	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
   1467	if (ret == 0)
   1468		ret = 1;
   1469out:
   1470	if (ret == -ENOEXEC) {
   1471		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
   1472		ret = 1;
   1473	}
   1474out_unlock:
   1475	srcu_read_unlock(&vcpu->kvm->srcu, idx);
   1476	return ret;
   1477}
   1478
   1479bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
   1480{
   1481	if (!kvm->arch.mmu.pgt)
   1482		return false;
   1483
   1484	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
   1485			     (range->end - range->start) << PAGE_SHIFT,
   1486			     range->may_block);
   1487
   1488	return false;
   1489}
   1490
   1491bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
   1492{
   1493	kvm_pfn_t pfn = pte_pfn(range->pte);
   1494	int ret;
   1495
   1496	if (!kvm->arch.mmu.pgt)
   1497		return false;
   1498
   1499	WARN_ON(range->end - range->start != 1);
   1500
   1501	ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
   1502	if (ret)
   1503		return false;
   1504
   1505	/*
   1506	 * We've moved a page around, probably through CoW, so let's treat
   1507	 * it just like a translation fault and the map handler will clean
   1508	 * the cache to the PoC.
   1509	 *
   1510	 * The MMU notifiers will have unmapped a huge PMD before calling
   1511	 * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
   1512	 * therefore we never need to clear out a huge PMD through this
   1513	 * calling path and a memcache is not required.
   1514	 */
   1515	kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
   1516			       PAGE_SIZE, __pfn_to_phys(pfn),
   1517			       KVM_PGTABLE_PROT_R, NULL);
   1518
   1519	return false;
   1520}
   1521
   1522bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
   1523{
   1524	u64 size = (range->end - range->start) << PAGE_SHIFT;
   1525	kvm_pte_t kpte;
   1526	pte_t pte;
   1527
   1528	if (!kvm->arch.mmu.pgt)
   1529		return false;
   1530
   1531	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
   1532
   1533	kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
   1534					range->start << PAGE_SHIFT);
   1535	pte = __pte(kpte);
   1536	return pte_valid(pte) && pte_young(pte);
   1537}
   1538
   1539bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
   1540{
   1541	if (!kvm->arch.mmu.pgt)
   1542		return false;
   1543
   1544	return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
   1545					   range->start << PAGE_SHIFT);
   1546}
   1547
   1548phys_addr_t kvm_mmu_get_httbr(void)
   1549{
   1550	return __pa(hyp_pgtable->pgd);
   1551}
   1552
   1553phys_addr_t kvm_get_idmap_vector(void)
   1554{
   1555	return hyp_idmap_vector;
   1556}
   1557
   1558static int kvm_map_idmap_text(void)
   1559{
   1560	unsigned long size = hyp_idmap_end - hyp_idmap_start;
   1561	int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
   1562					PAGE_HYP_EXEC);
   1563	if (err)
   1564		kvm_err("Failed to idmap %lx-%lx\n",
   1565			hyp_idmap_start, hyp_idmap_end);
   1566
   1567	return err;
   1568}
   1569
   1570static void *kvm_hyp_zalloc_page(void *arg)
   1571{
   1572	return (void *)get_zeroed_page(GFP_KERNEL);
   1573}
   1574
   1575static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
   1576	.zalloc_page		= kvm_hyp_zalloc_page,
   1577	.get_page		= kvm_host_get_page,
   1578	.put_page		= kvm_host_put_page,
   1579	.phys_to_virt		= kvm_host_va,
   1580	.virt_to_phys		= kvm_host_pa,
   1581};
   1582
   1583int kvm_mmu_init(u32 *hyp_va_bits)
   1584{
   1585	int err;
   1586
   1587	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
   1588	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
   1589	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
   1590	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
   1591	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
   1592
   1593	/*
   1594	 * We rely on the linker script to ensure at build time that the HYP
   1595	 * init code does not cross a page boundary.
   1596	 */
   1597	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
   1598
   1599	*hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
   1600	kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
   1601	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
   1602	kvm_debug("HYP VA range: %lx:%lx\n",
   1603		  kern_hyp_va(PAGE_OFFSET),
   1604		  kern_hyp_va((unsigned long)high_memory - 1));
   1605
   1606	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
   1607	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
   1608	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
   1609		/*
   1610		 * The idmap page is intersecting with the VA space,
    1611		 * so it is not safe to continue further.
   1612		 */
   1613		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
   1614		err = -EINVAL;
   1615		goto out;
   1616	}
   1617
   1618	hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
   1619	if (!hyp_pgtable) {
   1620		kvm_err("Hyp mode page-table not allocated\n");
   1621		err = -ENOMEM;
   1622		goto out;
   1623	}
   1624
   1625	err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
   1626	if (err)
   1627		goto out_free_pgtable;
   1628
   1629	err = kvm_map_idmap_text();
   1630	if (err)
   1631		goto out_destroy_pgtable;
   1632
   1633	io_map_base = hyp_idmap_start;
   1634	return 0;
   1635
   1636out_destroy_pgtable:
   1637	kvm_pgtable_hyp_destroy(hyp_pgtable);
   1638out_free_pgtable:
   1639	kfree(hyp_pgtable);
   1640	hyp_pgtable = NULL;
   1641out:
   1642	return err;
   1643}
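
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the XOR trick behind the BUG_ON() in kvm_mmu_init() above. Two addresses
 * lie in the same page exactly when their page-number bits agree, i.e.
 * when XOR-ing them leaves nothing above the in-page offset bits. The
 * ex_* names are mine and 4KiB pages are assumed.
 */
#define EX_PAGE_MASK (~0xfffULL)

static int ex_fits_in_one_page(unsigned long long start, unsigned long long end)
{
	/* end is exclusive, so the last byte of the range is end - 1 */
	return ((start ^ (end - 1)) & EX_PAGE_MASK) == 0;
}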
   1644
   1645void kvm_arch_commit_memory_region(struct kvm *kvm,
   1646				   struct kvm_memory_slot *old,
   1647				   const struct kvm_memory_slot *new,
   1648				   enum kvm_mr_change change)
   1649{
   1650	/*
   1651	 * At this point memslot has been committed and there is an
   1652	 * allocated dirty_bitmap[], dirty pages will be tracked while the
   1653	 * memory slot is write protected.
   1654	 */
   1655	if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
   1656		/*
    1657		 * If initial-all-set is in use, we don't need to write
    1658		 * protect any pages because they're all reported as dirty.
    1659		 * Huge pages and normal pages will be write protected gradually.
   1660		 */
   1661		if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
   1662			kvm_mmu_wp_memory_region(kvm, new->id);
   1663		}
   1664	}
   1665}
   1666
   1667int kvm_arch_prepare_memory_region(struct kvm *kvm,
   1668				   const struct kvm_memory_slot *old,
   1669				   struct kvm_memory_slot *new,
   1670				   enum kvm_mr_change change)
   1671{
   1672	hva_t hva, reg_end;
   1673	int ret = 0;
   1674
   1675	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
   1676			change != KVM_MR_FLAGS_ONLY)
   1677		return 0;
   1678
   1679	/*
   1680	 * Prevent userspace from creating a memory region outside of the IPA
    1681	 * space addressable by the KVM guest.
   1682	 */
   1683	if ((new->base_gfn + new->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
   1684		return -EFAULT;
   1685
   1686	hva = new->userspace_addr;
   1687	reg_end = hva + (new->npages << PAGE_SHIFT);
   1688
   1689	mmap_read_lock(current->mm);
   1690	/*
   1691	 * A memory region could potentially cover multiple VMAs, and any holes
   1692	 * between them, so iterate over all of them.
   1693	 *
   1694	 *     +--------------------------------------------+
   1695	 * +---------------+----------------+   +----------------+
   1696	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
   1697	 * +---------------+----------------+   +----------------+
   1698	 *     |               memory region                |
   1699	 *     +--------------------------------------------+
   1700	 */
   1701	do {
   1702		struct vm_area_struct *vma;
   1703
   1704		vma = find_vma_intersection(current->mm, hva, reg_end);
   1705		if (!vma)
   1706			break;
   1707
   1708		/*
   1709		 * VM_SHARED mappings are not allowed with MTE to avoid races
   1710		 * when updating the PG_mte_tagged page flag, see
   1711		 * sanitise_mte_tags for more details.
   1712		 */
   1713		if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
   1714			ret = -EINVAL;
   1715			break;
   1716		}
   1717
   1718		if (vma->vm_flags & VM_PFNMAP) {
   1719			/* IO region dirty page logging not allowed */
   1720			if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
   1721				ret = -EINVAL;
   1722				break;
   1723			}
   1724		}
   1725		hva = min(reg_end, vma->vm_end);
   1726	} while (hva < reg_end);
   1727
   1728	mmap_read_unlock(current->mm);
   1729	return ret;
   1730}
   1731
   1732void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
   1733{
   1734}
   1735
   1736void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
   1737{
   1738}
   1739
   1740void kvm_arch_flush_shadow_all(struct kvm *kvm)
   1741{
   1742	kvm_free_stage2_pgd(&kvm->arch.mmu);
   1743}
   1744
   1745void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
   1746				   struct kvm_memory_slot *slot)
   1747{
   1748	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
   1749	phys_addr_t size = slot->npages << PAGE_SHIFT;
   1750
   1751	write_lock(&kvm->mmu_lock);
   1752	unmap_stage2_range(&kvm->arch.mmu, gpa, size);
   1753	write_unlock(&kvm->mmu_lock);
   1754}
   1755
   1756/*
   1757 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
   1758 *
   1759 * Main problems:
   1760 * - S/W ops are local to a CPU (not broadcast)
   1761 * - We have line migration behind our back (speculation)
   1762 * - System caches don't support S/W at all (damn!)
   1763 *
   1764 * In the face of the above, the best we can do is to try and convert
   1765 * S/W ops to VA ops. Because the guest is not allowed to infer the
   1766 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
   1767 * which is a rather good thing for us.
   1768 *
   1769 * Also, it is only used when turning caches on/off ("The expected
   1770 * usage of the cache maintenance instructions that operate by set/way
   1771 * is associated with the cache maintenance instructions associated
   1772 * with the powerdown and powerup of caches, if this is required by
   1773 * the implementation.").
   1774 *
   1775 * We use the following policy:
   1776 *
   1777 * - If we trap a S/W operation, we enable VM trapping to detect
   1778 *   caches being turned on/off, and do a full clean.
   1779 *
    1780 * - We flush the caches both when they are turned on and when turned off.
   1781 *
   1782 * - Once the caches are enabled, we stop trapping VM ops.
   1783 */
   1784void kvm_set_way_flush(struct kvm_vcpu *vcpu)
   1785{
   1786	unsigned long hcr = *vcpu_hcr(vcpu);
   1787
   1788	/*
   1789	 * If this is the first time we do a S/W operation
   1790	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
   1791	 * VM trapping.
   1792	 *
   1793	 * Otherwise, rely on the VM trapping to wait for the MMU +
   1794	 * Caches to be turned off. At that point, we'll be able to
   1795	 * clean the caches again.
   1796	 */
   1797	if (!(hcr & HCR_TVM)) {
   1798		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
   1799					vcpu_has_cache_enabled(vcpu));
   1800		stage2_flush_vm(vcpu->kvm);
   1801		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
   1802	}
   1803}
   1804
   1805void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
   1806{
   1807	bool now_enabled = vcpu_has_cache_enabled(vcpu);
   1808
   1809	/*
   1810	 * If switching the MMU+caches on, need to invalidate the caches.
   1811	 * If switching it off, need to clean the caches.
    1812	 * Clean + invalidate always does the trick.
   1813	 */
   1814	if (now_enabled != was_enabled)
   1815		stage2_flush_vm(vcpu->kvm);
   1816
   1817	/* Caches are now on, stop trapping VM ops (until a S/W op) */
   1818	if (now_enabled)
   1819		*vcpu_hcr(vcpu) &= ~HCR_TVM;
   1820
   1821	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
   1822}