cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

book3s_64_mmu_radix.c (36915B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 *
      4 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
      5 */
      6
      7#include <linux/types.h>
      8#include <linux/string.h>
      9#include <linux/kvm.h>
     10#include <linux/kvm_host.h>
     11#include <linux/anon_inodes.h>
     12#include <linux/file.h>
     13#include <linux/debugfs.h>
     14#include <linux/pgtable.h>
     15
     16#include <asm/kvm_ppc.h>
     17#include <asm/kvm_book3s.h>
     18#include <asm/page.h>
     19#include <asm/mmu.h>
     20#include <asm/pgalloc.h>
     21#include <asm/pte-walk.h>
     22#include <asm/ultravisor.h>
     23#include <asm/kvm_book3s_uvmem.h>
     24#include <asm/plpar_wrappers.h>
     25
     26/*
     27 * Supported radix tree geometry.
     28 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
     29 * for a page size of 64k or 4k.
     30 */
     31static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
     32
     33unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
     34					      gva_t eaddr, void *to, void *from,
     35					      unsigned long n)
     36{
     37	int old_pid, old_lpid;
     38	unsigned long quadrant, ret = n;
     39	bool is_load = !!to;
     40
     41	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
     42	if (kvmhv_on_pseries())
     43		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
     44					  (to != NULL) ? __pa(to): 0,
     45					  (from != NULL) ? __pa(from): 0, n);
     46
     47	if (eaddr & (0xFFFUL << 52))
     48		return ret;
     49
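        	/*
        	 * The top two EA bits select the MMU "quadrant": quadrant 1 is
        	 * translated with the guest's LPID and PID (a guest process
        	 * address), while quadrant 2 uses the guest's LPID with PID 0,
        	 * matching the guest kernel's own quadrant-3 addresses.
        	 */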
     50	quadrant = 1;
     51	if (!pid)
     52		quadrant = 2;
     53	if (is_load)
     54		from = (void *) (eaddr | (quadrant << 62));
     55	else
     56		to = (void *) (eaddr | (quadrant << 62));
     57
     58	preempt_disable();
     59
     60	asm volatile("hwsync" ::: "memory");
     61	isync();
     62	/* switch the lpid first to avoid running host with unallocated pid */
     63	old_lpid = mfspr(SPRN_LPID);
     64	if (old_lpid != lpid)
     65		mtspr(SPRN_LPID, lpid);
     66	if (quadrant == 1) {
     67		old_pid = mfspr(SPRN_PID);
     68		if (old_pid != pid)
     69			mtspr(SPRN_PID, pid);
     70	}
     71	isync();
     72
     73	pagefault_disable();
     74	if (is_load)
     75		ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
     76	else
     77		ret = __copy_to_user_inatomic((void __user *)to, from, n);
     78	pagefault_enable();
     79
     80	asm volatile("hwsync" ::: "memory");
     81	isync();
     82	/* switch the pid first to avoid running host with unallocated pid */
     83	if (quadrant == 1 && pid != old_pid)
     84		mtspr(SPRN_PID, old_pid);
     85	if (lpid != old_lpid)
     86		mtspr(SPRN_LPID, old_lpid);
     87	isync();
     88
     89	preempt_enable();
     90
     91	return ret;
     92}
     93
     94static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
     95					  void *to, void *from, unsigned long n)
     96{
     97	int lpid = vcpu->kvm->arch.lpid;
     98	int pid = vcpu->arch.pid;
     99
    100	/* This would cause a data segment intr so don't allow the access */
    101	if (eaddr & (0x3FFUL << 52))
    102		return -EINVAL;
    103
    104	/* Should we be using the nested lpid */
    105	if (vcpu->arch.nested)
    106		lpid = vcpu->arch.nested->shadow_lpid;
    107
    108	/* If accessing quadrant 3 then pid is expected to be 0 */
    109	if (((eaddr >> 62) & 0x3) == 0x3)
    110		pid = 0;
    111
    112	eaddr &= ~(0xFFFUL << 52);
    113
    114	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
    115}
    116
    117long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
    118				 unsigned long n)
    119{
    120	long ret;
    121
    122	ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
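        	/*
        	 * ret is the number of bytes that could not be copied
        	 * (copy_from_user convention); zero that tail so the caller
        	 * never sees stale data in the destination buffer.
        	 */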
    123	if (ret > 0)
    124		memset(to + (n - ret), 0, ret);
    125
    126	return ret;
    127}
    128
    129long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
    130			       unsigned long n)
    131{
    132	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
    133}
    134
    135int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
    136			       struct kvmppc_pte *gpte, u64 root,
    137			       u64 *pte_ret_p)
    138{
    139	struct kvm *kvm = vcpu->kvm;
    140	int ret, level, ps;
    141	unsigned long rts, bits, offset, index;
    142	u64 pte, base, gpa;
    143	__be64 rpte;
    144
    145	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
    146		((root & RTS2_MASK) >> RTS2_SHIFT);
    147	bits = root & RPDS_MASK;
    148	base = root & RPDB_MASK;
    149
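        	/*
        	 * RTS encodes the size of the translated address space as
        	 * 2^(RTS + 31) bytes, so offset is the number of EA bits the
        	 * walk below still has to resolve.
        	 */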
    150	offset = rts + 31;
    151
    152	/* Current implementations only support 52-bit space */
    153	if (offset != 52)
    154		return -EINVAL;
    155
    156	/* Walk each level of the radix tree */
    157	for (level = 3; level >= 0; --level) {
    158		u64 addr;
    159		/* Check a valid size */
    160		if (level && bits != p9_supported_radix_bits[level])
    161			return -EINVAL;
    162		if (level == 0 && !(bits == 5 || bits == 9))
    163			return -EINVAL;
    164		offset -= bits;
    165		index = (eaddr >> offset) & ((1UL << bits) - 1);
    166		/* Check that low bits of page table base are zero */
    167		if (base & ((1UL << (bits + 3)) - 1))
    168			return -EINVAL;
    169		/* Read the entry from guest memory */
    170		addr = base + (index * sizeof(rpte));
    171
    172		kvm_vcpu_srcu_read_lock(vcpu);
    173		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
    174		kvm_vcpu_srcu_read_unlock(vcpu);
    175		if (ret) {
    176			if (pte_ret_p)
    177				*pte_ret_p = addr;
    178			return ret;
    179		}
    180		pte = __be64_to_cpu(rpte);
    181		if (!(pte & _PAGE_PRESENT))
    182			return -ENOENT;
    183		/* Check if a leaf entry */
    184		if (pte & _PAGE_PTE)
    185			break;
    186		/* Get ready to walk the next level */
    187		base = pte & RPDB_MASK;
    188		bits = pte & RPDS_MASK;
    189	}
    190
    191	/* Need a leaf at lowest level; 512GB pages not supported */
    192	if (level < 0 || level == 3)
    193		return -EINVAL;
    194
    195	/* We found a valid leaf PTE */
    196	/* Offset is now log base 2 of the page size */
    197	gpa = pte & 0x01fffffffffff000ul;
    198	if (gpa & ((1ul << offset) - 1))
    199		return -EINVAL;
    200	gpa |= eaddr & ((1ul << offset) - 1);
    201	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
    202		if (offset == mmu_psize_defs[ps].shift)
    203			break;
    204	gpte->page_size = ps;
    205	gpte->page_shift = offset;
    206
    207	gpte->eaddr = eaddr;
    208	gpte->raddr = gpa;
    209
    210	/* Work out permissions */
    211	gpte->may_read = !!(pte & _PAGE_READ);
    212	gpte->may_write = !!(pte & _PAGE_WRITE);
    213	gpte->may_execute = !!(pte & _PAGE_EXEC);
    214
    215	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
    216
    217	if (pte_ret_p)
    218		*pte_ret_p = pte;
    219
    220	return 0;
    221}
    222
    223/*
    224 * Used to walk a partition or process table radix tree in guest memory
    225 * Note: We exploit the fact that a partition table and a process
    226 * table have the same layout, a partition-scoped page table and a
    227 * process-scoped page table have the same layout, and the 2nd
    228 * doubleword of a partition table entry has the same layout as
    229 * the PTCR register.
    230 */
    231int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
    232				     struct kvmppc_pte *gpte, u64 table,
    233				     int table_index, u64 *pte_ret_p)
    234{
    235	struct kvm *kvm = vcpu->kvm;
    236	int ret;
    237	unsigned long size, ptbl, root;
    238	struct prtb_entry entry;
    239
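        	/*
        	 * PRTS encodes the table size as 2^(PRTS + 12) bytes; sizes
        	 * above 2^36 bytes (PRTS > 24) are rejected.
        	 */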
    240	if ((table & PRTS_MASK) > 24)
    241		return -EINVAL;
    242	size = 1ul << ((table & PRTS_MASK) + 12);
    243
    244	/* Is the table big enough to contain this entry? */
    245	if ((table_index * sizeof(entry)) >= size)
    246		return -EINVAL;
    247
    248	/* Read the table to find the root of the radix tree */
    249	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
    250	kvm_vcpu_srcu_read_lock(vcpu);
    251	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
    252	kvm_vcpu_srcu_read_unlock(vcpu);
    253	if (ret)
    254		return ret;
    255
    256	/* Root is stored in the first double word */
    257	root = be64_to_cpu(entry.prtb0);
    258
    259	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
    260}
    261
    262int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
    263			   struct kvmppc_pte *gpte, bool data, bool iswrite)
    264{
    265	u32 pid;
    266	u64 pte;
    267	int ret;
    268
    269	/* Work out effective PID */
    270	switch (eaddr >> 62) {
    271	case 0:
    272		pid = vcpu->arch.pid;
    273		break;
    274	case 3:
    275		pid = 0;
    276		break;
    277	default:
    278		return -EINVAL;
    279	}
    280
    281	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
    282				vcpu->kvm->arch.process_table, pid, &pte);
    283	if (ret)
    284		return ret;
    285
    286	/* Check privilege (applies only to process scoped translations) */
    287	if (kvmppc_get_msr(vcpu) & MSR_PR) {
    288		if (pte & _PAGE_PRIVILEGED) {
    289			gpte->may_read = 0;
    290			gpte->may_write = 0;
    291			gpte->may_execute = 0;
    292		}
    293	} else {
    294		if (!(pte & _PAGE_PRIVILEGED)) {
    295			/* Check AMR/IAMR to see if strict mode is in force */
    296			if (vcpu->arch.amr & (1ul << 62))
    297				gpte->may_read = 0;
    298			if (vcpu->arch.amr & (1ul << 63))
    299				gpte->may_write = 0;
    300			if (vcpu->arch.iamr & (1ul << 62))
    301				gpte->may_execute = 0;
    302		}
    303	}
    304
    305	return 0;
    306}
    307
    308void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
    309			     unsigned int pshift, unsigned int lpid)
    310{
    311	unsigned long psize = PAGE_SIZE;
    312	int psi;
    313	long rc;
    314	unsigned long rb;
    315
    316	if (pshift)
    317		psize = 1UL << pshift;
    318	else
    319		pshift = PAGE_SHIFT;
    320
    321	addr &= ~(psize - 1);
    322
    323	if (!kvmhv_on_pseries()) {
    324		radix__flush_tlb_lpid_page(lpid, addr, psize);
    325		return;
    326	}
    327
    328	psi = shift_to_mmu_psize(pshift);
    329
    330	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
    331		rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
    332		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
    333					lpid, rb);
    334	} else {
    335		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
    336					    H_RPTI_TYPE_NESTED |
    337					    H_RPTI_TYPE_TLB,
    338					    psize_to_rpti_pgsize(psi),
    339					    addr, addr + psize);
    340	}
    341
    342	if (rc)
    343		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
    344}
    345
    346static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
    347{
    348	long rc;
    349
    350	if (!kvmhv_on_pseries()) {
    351		radix__flush_pwc_lpid(lpid);
    352		return;
    353	}
    354
    355	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
    356		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
    357					lpid, TLBIEL_INVAL_SET_LPID);
    358	else
    359		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
    360					    H_RPTI_TYPE_NESTED |
    361					    H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
    362					    0, -1UL);
    363	if (rc)
    364		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
    365}
    366
    367static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
    368				      unsigned long clr, unsigned long set,
    369				      unsigned long addr, unsigned int shift)
    370{
    371	return __radix_pte_update(ptep, clr, set);
    372}
    373
    374static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
    375			     pte_t *ptep, pte_t pte)
    376{
    377	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
    378}
    379
    380static struct kmem_cache *kvm_pte_cache;
    381static struct kmem_cache *kvm_pmd_cache;
    382
    383static pte_t *kvmppc_pte_alloc(void)
    384{
    385	pte_t *pte;
    386
    387	pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
    388	/* pmd_populate() will only reference _pa(pte). */
    389	kmemleak_ignore(pte);
    390
    391	return pte;
    392}
    393
    394static void kvmppc_pte_free(pte_t *ptep)
    395{
    396	kmem_cache_free(kvm_pte_cache, ptep);
    397}
    398
    399static pmd_t *kvmppc_pmd_alloc(void)
    400{
    401	pmd_t *pmd;
    402
    403	pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
    404	/* pud_populate() will only reference _pa(pmd). */
    405	kmemleak_ignore(pmd);
    406
    407	return pmd;
    408}
    409
    410static void kvmppc_pmd_free(pmd_t *pmdp)
    411{
    412	kmem_cache_free(kvm_pmd_cache, pmdp);
    413}
    414
    415/* Called with kvm->mmu_lock held */
    416void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
    417		      unsigned int shift,
    418		      const struct kvm_memory_slot *memslot,
    419		      unsigned int lpid)
    420
    421{
    422	unsigned long old;
    423	unsigned long gfn = gpa >> PAGE_SHIFT;
    424	unsigned long page_size = PAGE_SIZE;
    425	unsigned long hpa;
    426
    427	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
    428	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
    429
    430	/* The following only applies to L1 entries */
    431	if (lpid != kvm->arch.lpid)
    432		return;
    433
    434	if (!memslot) {
    435		memslot = gfn_to_memslot(kvm, gfn);
    436		if (!memslot)
    437			return;
    438	}
    439	if (shift) { /* 1GB or 2MB page */
    440		page_size = 1ul << shift;
    441		if (shift == PMD_SHIFT)
    442			kvm->stat.num_2M_pages--;
    443		else if (shift == PUD_SHIFT)
    444			kvm->stat.num_1G_pages--;
    445	}
    446
    447	gpa &= ~(page_size - 1);
    448	hpa = old & PTE_RPN_MASK;
    449	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
    450
    451	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
    452		kvmppc_update_dirty_map(memslot, gfn, page_size);
    453}
    454
    455/*
    456 * kvmppc_free_p?d are used to free existing page tables, and recursively
    457 * descend and clear and free children.
    458 * Callers are responsible for flushing the PWC.
    459 *
    460 * When page tables are being unmapped/freed as part of page fault path
    461 * (full == false), valid ptes are generally not expected; however, there
    462 * is one situation where they arise, which is when dirty page logging is
    463 * turned off for a memslot while the VM is running.  The new memslot
    464 * becomes visible to page faults before the memslot commit function
    465 * gets to flush the memslot, which can lead to a 2MB page mapping being
    466 * installed for a guest physical address where there are already 64kB
    467 * (or 4kB) mappings (of sub-pages of the same 2MB page).
    468 */
    469static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
    470				  unsigned int lpid)
    471{
    472	if (full) {
    473		memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
    474	} else {
    475		pte_t *p = pte;
    476		unsigned long it;
    477
    478		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
    479			if (pte_val(*p) == 0)
    480				continue;
    481			kvmppc_unmap_pte(kvm, p,
    482					 pte_pfn(*p) << PAGE_SHIFT,
    483					 PAGE_SHIFT, NULL, lpid);
    484		}
    485	}
    486
    487	kvmppc_pte_free(pte);
    488}
    489
    490static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
    491				  unsigned int lpid)
    492{
    493	unsigned long im;
    494	pmd_t *p = pmd;
    495
    496	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
    497		if (!pmd_present(*p))
    498			continue;
    499		if (pmd_is_leaf(*p)) {
    500			if (full) {
    501				pmd_clear(p);
    502			} else {
    503				WARN_ON_ONCE(1);
    504				kvmppc_unmap_pte(kvm, (pte_t *)p,
    505					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
    506					 PMD_SHIFT, NULL, lpid);
    507			}
    508		} else {
    509			pte_t *pte;
    510
    511			pte = pte_offset_map(p, 0);
    512			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
    513			pmd_clear(p);
    514		}
    515	}
    516	kvmppc_pmd_free(pmd);
    517}
    518
    519static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
    520				  unsigned int lpid)
    521{
    522	unsigned long iu;
    523	pud_t *p = pud;
    524
    525	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
    526		if (!pud_present(*p))
    527			continue;
    528		if (pud_is_leaf(*p)) {
    529			pud_clear(p);
    530		} else {
    531			pmd_t *pmd;
    532
    533			pmd = pmd_offset(p, 0);
    534			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
    535			pud_clear(p);
    536		}
    537	}
    538	pud_free(kvm->mm, pud);
    539}
    540
    541void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
    542{
    543	unsigned long ig;
    544
    545	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
    546		p4d_t *p4d = p4d_offset(pgd, 0);
    547		pud_t *pud;
    548
    549		if (!p4d_present(*p4d))
    550			continue;
    551		pud = pud_offset(p4d, 0);
    552		kvmppc_unmap_free_pud(kvm, pud, lpid);
    553		p4d_clear(p4d);
    554	}
    555}
    556
    557void kvmppc_free_radix(struct kvm *kvm)
    558{
    559	if (kvm->arch.pgtable) {
    560		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
    561					  kvm->arch.lpid);
    562		pgd_free(kvm->mm, kvm->arch.pgtable);
    563		kvm->arch.pgtable = NULL;
    564	}
    565}
    566
    567static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
    568					unsigned long gpa, unsigned int lpid)
    569{
    570	pte_t *pte = pte_offset_kernel(pmd, 0);
    571
    572	/*
     573	 * Clearing the pmd entry then flushing the PWC ensures that the pte
     574	 * page can no longer be cached by the MMU, so it can be freed
     575	 * without flushing the PWC again.
    576	 */
    577	pmd_clear(pmd);
    578	kvmppc_radix_flush_pwc(kvm, lpid);
    579
    580	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
    581}
    582
    583static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
    584					unsigned long gpa, unsigned int lpid)
    585{
    586	pmd_t *pmd = pmd_offset(pud, 0);
    587
    588	/*
    589	 * Clearing the pud entry then flushing the PWC ensures that the pmd
    590	 * page and any children pte pages will no longer be cached by the MMU,
    591	 * so can be freed without flushing the PWC again.
    592	 */
    593	pud_clear(pud);
    594	kvmppc_radix_flush_pwc(kvm, lpid);
    595
    596	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
    597}
    598
    599/*
     600 * A number of bits may differ between different faults to the same
     601 * partition-scoped entry: the RC bits change in the course of cleaning
     602 * and aging, and the write bit can change, either because the access
     603 * was upgraded or because a read fault raced with a write fault that
     604 * set those bits first.
    605 */
    606#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
    607
    608int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
    609		      unsigned long gpa, unsigned int level,
    610		      unsigned long mmu_seq, unsigned int lpid,
    611		      unsigned long *rmapp, struct rmap_nested **n_rmap)
    612{
    613	pgd_t *pgd;
    614	p4d_t *p4d;
    615	pud_t *pud, *new_pud = NULL;
    616	pmd_t *pmd, *new_pmd = NULL;
    617	pte_t *ptep, *new_ptep = NULL;
    618	int ret;
    619
    620	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
    621	pgd = pgtable + pgd_index(gpa);
    622	p4d = p4d_offset(pgd, gpa);
    623
    624	pud = NULL;
    625	if (p4d_present(*p4d))
    626		pud = pud_offset(p4d, gpa);
    627	else
    628		new_pud = pud_alloc_one(kvm->mm, gpa);
    629
    630	pmd = NULL;
    631	if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
    632		pmd = pmd_offset(pud, gpa);
    633	else if (level <= 1)
    634		new_pmd = kvmppc_pmd_alloc();
    635
    636	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
    637		new_ptep = kvmppc_pte_alloc();
    638
    639	/* Check if we might have been invalidated; let the guest retry if so */
    640	spin_lock(&kvm->mmu_lock);
    641	ret = -EAGAIN;
    642	if (mmu_notifier_retry(kvm, mmu_seq))
    643		goto out_unlock;
    644
    645	/* Now traverse again under the lock and change the tree */
    646	ret = -ENOMEM;
    647	if (p4d_none(*p4d)) {
    648		if (!new_pud)
    649			goto out_unlock;
    650		p4d_populate(kvm->mm, p4d, new_pud);
    651		new_pud = NULL;
    652	}
    653	pud = pud_offset(p4d, gpa);
    654	if (pud_is_leaf(*pud)) {
    655		unsigned long hgpa = gpa & PUD_MASK;
    656
    657		/* Check if we raced and someone else has set the same thing */
    658		if (level == 2) {
    659			if (pud_raw(*pud) == pte_raw(pte)) {
    660				ret = 0;
    661				goto out_unlock;
    662			}
    663			/* Valid 1GB page here already, add our extra bits */
    664			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
    665							PTE_BITS_MUST_MATCH);
    666			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
    667					      0, pte_val(pte), hgpa, PUD_SHIFT);
    668			ret = 0;
    669			goto out_unlock;
    670		}
    671		/*
    672		 * If we raced with another CPU which has just put
    673		 * a 1GB pte in after we saw a pmd page, try again.
    674		 */
    675		if (!new_pmd) {
    676			ret = -EAGAIN;
    677			goto out_unlock;
    678		}
    679		/* Valid 1GB page here already, remove it */
    680		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
    681				 lpid);
    682	}
    683	if (level == 2) {
    684		if (!pud_none(*pud)) {
    685			/*
    686			 * There's a page table page here, but we wanted to
    687			 * install a large page, so remove and free the page
    688			 * table page.
    689			 */
    690			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
    691		}
    692		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
    693		if (rmapp && n_rmap)
    694			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
    695		ret = 0;
    696		goto out_unlock;
    697	}
    698	if (pud_none(*pud)) {
    699		if (!new_pmd)
    700			goto out_unlock;
    701		pud_populate(kvm->mm, pud, new_pmd);
    702		new_pmd = NULL;
    703	}
    704	pmd = pmd_offset(pud, gpa);
    705	if (pmd_is_leaf(*pmd)) {
    706		unsigned long lgpa = gpa & PMD_MASK;
    707
    708		/* Check if we raced and someone else has set the same thing */
    709		if (level == 1) {
    710			if (pmd_raw(*pmd) == pte_raw(pte)) {
    711				ret = 0;
    712				goto out_unlock;
    713			}
    714			/* Valid 2MB page here already, add our extra bits */
    715			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
    716							PTE_BITS_MUST_MATCH);
    717			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
    718					0, pte_val(pte), lgpa, PMD_SHIFT);
    719			ret = 0;
    720			goto out_unlock;
    721		}
    722
    723		/*
    724		 * If we raced with another CPU which has just put
    725		 * a 2MB pte in after we saw a pte page, try again.
    726		 */
    727		if (!new_ptep) {
    728			ret = -EAGAIN;
    729			goto out_unlock;
    730		}
    731		/* Valid 2MB page here already, remove it */
    732		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
    733				 lpid);
    734	}
    735	if (level == 1) {
    736		if (!pmd_none(*pmd)) {
    737			/*
    738			 * There's a page table page here, but we wanted to
    739			 * install a large page, so remove and free the page
    740			 * table page.
    741			 */
    742			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
    743		}
    744		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
    745		if (rmapp && n_rmap)
    746			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
    747		ret = 0;
    748		goto out_unlock;
    749	}
    750	if (pmd_none(*pmd)) {
    751		if (!new_ptep)
    752			goto out_unlock;
    753		pmd_populate(kvm->mm, pmd, new_ptep);
    754		new_ptep = NULL;
    755	}
    756	ptep = pte_offset_kernel(pmd, gpa);
    757	if (pte_present(*ptep)) {
    758		/* Check if someone else set the same thing */
    759		if (pte_raw(*ptep) == pte_raw(pte)) {
    760			ret = 0;
    761			goto out_unlock;
    762		}
    763		/* Valid page here already, add our extra bits */
    764		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
    765							PTE_BITS_MUST_MATCH);
    766		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
    767		ret = 0;
    768		goto out_unlock;
    769	}
    770	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
    771	if (rmapp && n_rmap)
    772		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
    773	ret = 0;
    774
    775 out_unlock:
    776	spin_unlock(&kvm->mmu_lock);
    777	if (new_pud)
    778		pud_free(kvm->mm, new_pud);
    779	if (new_pmd)
    780		kvmppc_pmd_free(new_pmd);
    781	if (new_ptep)
    782		kvmppc_pte_free(new_ptep);
    783	return ret;
    784}
    785
    786bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
    787			     unsigned long gpa, unsigned int lpid)
    788{
    789	unsigned long pgflags;
    790	unsigned int shift;
    791	pte_t *ptep;
    792
    793	/*
    794	 * Need to set an R or C bit in the 2nd-level tables;
    795	 * since we are just helping out the hardware here,
    796	 * it is sufficient to do what the hardware does.
    797	 */
    798	pgflags = _PAGE_ACCESSED;
    799	if (writing)
    800		pgflags |= _PAGE_DIRTY;
    801
    802	if (nested)
    803		ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
    804	else
    805		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
    806
    807	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
    808		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
    809		return true;
    810	}
    811	return false;
    812}
    813
    814int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
    815				   unsigned long gpa,
    816				   struct kvm_memory_slot *memslot,
    817				   bool writing, bool kvm_ro,
    818				   pte_t *inserted_pte, unsigned int *levelp)
    819{
    820	struct kvm *kvm = vcpu->kvm;
    821	struct page *page = NULL;
    822	unsigned long mmu_seq;
    823	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
    824	bool upgrade_write = false;
    825	bool *upgrade_p = &upgrade_write;
    826	pte_t pte, *ptep;
    827	unsigned int shift, level;
    828	int ret;
    829	bool large_enable;
    830
    831	/* used to check for invalidations in progress */
    832	mmu_seq = kvm->mmu_notifier_seq;
    833	smp_rmb();
    834
    835	/*
    836	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
    837	 * do it with !atomic && !async, which is how we call it.
    838	 * We always ask for write permission since the common case
    839	 * is that the page is writable.
    840	 */
    841	hva = gfn_to_hva_memslot(memslot, gfn);
    842	if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
    843		upgrade_write = true;
    844	} else {
    845		unsigned long pfn;
    846
    847		/* Call KVM generic code to do the slow-path check */
    848		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
    849					   writing, upgrade_p, NULL);
    850		if (is_error_noslot_pfn(pfn))
    851			return -EFAULT;
    852		page = NULL;
    853		if (pfn_valid(pfn)) {
    854			page = pfn_to_page(pfn);
    855			if (PageReserved(page))
    856				page = NULL;
    857		}
    858	}
    859
    860	/*
    861	 * Read the PTE from the process' radix tree and use that
    862	 * so we get the shift and attribute bits.
    863	 */
    864	spin_lock(&kvm->mmu_lock);
    865	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
    866	pte = __pte(0);
    867	if (ptep)
    868		pte = READ_ONCE(*ptep);
    869	spin_unlock(&kvm->mmu_lock);
    870	/*
    871	 * If the PTE disappeared temporarily due to a THP
    872	 * collapse, just return and let the guest try again.
    873	 */
    874	if (!pte_present(pte)) {
    875		if (page)
    876			put_page(page);
    877		return RESUME_GUEST;
    878	}
    879
    880	/* If we're logging dirty pages, always map single pages */
    881	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
    882
    883	/* Get pte level from shift/size */
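        	/*
        	 * A 1GB or 2MB mapping can only be used if gpa and hva have the
        	 * same offset within the large page, i.e. the guest large page
        	 * is backed by a single, equally aligned host large page.
        	 */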
    884	if (large_enable && shift == PUD_SHIFT &&
    885	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
    886	    (hva & (PUD_SIZE - PAGE_SIZE))) {
    887		level = 2;
    888	} else if (large_enable && shift == PMD_SHIFT &&
    889		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
    890		   (hva & (PMD_SIZE - PAGE_SIZE))) {
    891		level = 1;
    892	} else {
    893		level = 0;
    894		if (shift > PAGE_SHIFT) {
    895			/*
    896			 * If the pte maps more than one page, bring over
    897			 * bits from the virtual address to get the real
    898			 * address of the specific single page we want.
    899			 */
    900			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
    901			pte = __pte(pte_val(pte) | (hva & rpnmask));
    902		}
    903	}
    904
    905	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
    906	if (writing || upgrade_write) {
    907		if (pte_val(pte) & _PAGE_WRITE)
    908			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
    909	} else {
    910		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
    911	}
    912
    913	/* Allocate space in the tree and write the PTE */
    914	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
    915				mmu_seq, kvm->arch.lpid, NULL, NULL);
    916	if (inserted_pte)
    917		*inserted_pte = pte;
    918	if (levelp)
    919		*levelp = level;
    920
    921	if (page) {
    922		if (!ret && (pte_val(pte) & _PAGE_WRITE))
    923			set_page_dirty_lock(page);
    924		put_page(page);
    925	}
    926
    927	/* Increment number of large pages if we (successfully) inserted one */
    928	if (!ret) {
    929		if (level == 1)
    930			kvm->stat.num_2M_pages++;
    931		else if (level == 2)
    932			kvm->stat.num_1G_pages++;
    933	}
    934
    935	return ret;
    936}
    937
    938int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
    939				   unsigned long ea, unsigned long dsisr)
    940{
    941	struct kvm *kvm = vcpu->kvm;
    942	unsigned long gpa, gfn;
    943	struct kvm_memory_slot *memslot;
    944	long ret;
    945	bool writing = !!(dsisr & DSISR_ISSTORE);
    946	bool kvm_ro = false;
    947
    948	/* Check for unusual errors */
    949	if (dsisr & DSISR_UNSUPP_MMU) {
    950		pr_err("KVM: Got unsupported MMU fault\n");
    951		return -EFAULT;
    952	}
    953	if (dsisr & DSISR_BADACCESS) {
    954		/* Reflect to the guest as DSI */
    955		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
    956		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
    957		return RESUME_GUEST;
    958	}
    959
    960	/* Translate the logical address */
    961	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
    962	gpa &= ~0xF000000000000000ul;
    963	gfn = gpa >> PAGE_SHIFT;
    964	if (!(dsisr & DSISR_PRTABLE_FAULT))
    965		gpa |= ea & 0xfff;
    966
    967	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
    968		return kvmppc_send_page_to_uv(kvm, gfn);
    969
    970	/* Get the corresponding memslot */
    971	memslot = gfn_to_memslot(kvm, gfn);
    972
    973	/* No memslot means it's an emulated MMIO region */
    974	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
    975		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
    976			     DSISR_SET_RC)) {
    977			/*
    978			 * Bad address in guest page table tree, or other
    979			 * unusual error - reflect it to the guest as DSI.
    980			 */
    981			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
    982			return RESUME_GUEST;
    983		}
    984		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
    985	}
    986
    987	if (memslot->flags & KVM_MEM_READONLY) {
    988		if (writing) {
    989			/* give the guest a DSI */
    990			kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
    991						       DSISR_PROTFAULT);
    992			return RESUME_GUEST;
    993		}
    994		kvm_ro = true;
    995	}
    996
    997	/* Failed to set the reference/change bits */
    998	if (dsisr & DSISR_SET_RC) {
    999		spin_lock(&kvm->mmu_lock);
   1000		if (kvmppc_hv_handle_set_rc(kvm, false, writing,
   1001					    gpa, kvm->arch.lpid))
   1002			dsisr &= ~DSISR_SET_RC;
   1003		spin_unlock(&kvm->mmu_lock);
   1004
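        		/*
        		 * If setting R/C was all this fault required and it
        		 * succeeded, there is nothing left to do.
        		 */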
   1005		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
   1006			       DSISR_PROTFAULT | DSISR_SET_RC)))
   1007			return RESUME_GUEST;
   1008	}
   1009
   1010	/* Try to insert a pte */
   1011	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
   1012					     kvm_ro, NULL, NULL);
   1013
   1014	if (ret == 0 || ret == -EAGAIN)
   1015		ret = RESUME_GUEST;
   1016	return ret;
   1017}
   1018
   1019/* Called with kvm->mmu_lock held */
   1020void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
   1021		     unsigned long gfn)
   1022{
   1023	pte_t *ptep;
   1024	unsigned long gpa = gfn << PAGE_SHIFT;
   1025	unsigned int shift;
   1026
   1027	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
   1028		uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
   1029		return;
   1030	}
   1031
   1032	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
   1033	if (ptep && pte_present(*ptep))
   1034		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
   1035				 kvm->arch.lpid);
   1036}
   1037
   1038/* Called with kvm->mmu_lock held */
   1039bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
   1040		   unsigned long gfn)
   1041{
   1042	pte_t *ptep;
   1043	unsigned long gpa = gfn << PAGE_SHIFT;
   1044	unsigned int shift;
   1045	bool ref = false;
   1046	unsigned long old, *rmapp;
   1047
   1048	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
   1049		return ref;
   1050
   1051	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
   1052	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
   1053		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
   1054					      gpa, shift);
   1055		/* XXX need to flush tlb here? */
   1056		/* Also clear bit in ptes in shadow pgtable for nested guests */
   1057		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
   1058		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
   1059					       old & PTE_RPN_MASK,
   1060					       1UL << shift);
   1061		ref = true;
   1062	}
   1063	return ref;
   1064}
   1065
   1066/* Called with kvm->mmu_lock held */
   1067bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
   1068			unsigned long gfn)
   1069
   1070{
   1071	pte_t *ptep;
   1072	unsigned long gpa = gfn << PAGE_SHIFT;
   1073	unsigned int shift;
   1074	bool ref = false;
   1075
   1076	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
   1077		return ref;
   1078
   1079	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
   1080	if (ptep && pte_present(*ptep) && pte_young(*ptep))
   1081		ref = true;
   1082	return ref;
   1083}
   1084
   1085/* Returns the number of PAGE_SIZE pages that are dirty */
   1086static int kvm_radix_test_clear_dirty(struct kvm *kvm,
   1087				struct kvm_memory_slot *memslot, int pagenum)
   1088{
   1089	unsigned long gfn = memslot->base_gfn + pagenum;
   1090	unsigned long gpa = gfn << PAGE_SHIFT;
   1091	pte_t *ptep, pte;
   1092	unsigned int shift;
   1093	int ret = 0;
   1094	unsigned long old, *rmapp;
   1095
   1096	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
   1097		return ret;
   1098
   1099	/*
   1100	 * For performance reasons we don't hold kvm->mmu_lock while walking the
   1101	 * partition scoped table.
   1102	 */
   1103	ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
   1104	if (!ptep)
   1105		return 0;
   1106
   1107	pte = READ_ONCE(*ptep);
   1108	if (pte_present(pte) && pte_dirty(pte)) {
   1109		spin_lock(&kvm->mmu_lock);
   1110		/*
    1111		 * Recheck the pte now that we hold kvm->mmu_lock
   1112		 */
   1113		if (pte_val(pte) != pte_val(*ptep)) {
   1114			/*
   1115			 * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
   1116			 * only find PAGE_SIZE pte entries here. We can continue
   1117			 * to use the pte addr returned by above page table
   1118			 * walk.
   1119			 */
   1120			if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
   1121				spin_unlock(&kvm->mmu_lock);
   1122				return 0;
   1123			}
   1124		}
   1125
   1126		ret = 1;
   1127		VM_BUG_ON(shift);
   1128		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
   1129					      gpa, shift);
   1130		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
   1131		/* Also clear bit in ptes in shadow pgtable for nested guests */
   1132		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
   1133		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
   1134					       old & PTE_RPN_MASK,
   1135					       1UL << shift);
   1136		spin_unlock(&kvm->mmu_lock);
   1137	}
   1138	return ret;
   1139}
   1140
   1141long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
   1142			struct kvm_memory_slot *memslot, unsigned long *map)
   1143{
   1144	unsigned long i, j;
   1145	int npages;
   1146
   1147	for (i = 0; i < memslot->npages; i = j) {
   1148		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
   1149
   1150		/*
   1151		 * Note that if npages > 0 then i must be a multiple of npages,
   1152		 * since huge pages are only used to back the guest at guest
   1153		 * real addresses that are a multiple of their size.
   1154		 * Since we have at most one PTE covering any given guest
   1155		 * real address, if npages > 1 we can skip to i + npages.
   1156		 */
   1157		j = i + 1;
   1158		if (npages) {
   1159			set_dirty_bits(map, i, npages);
   1160			j = i + npages;
   1161		}
   1162	}
   1163	return 0;
   1164}
   1165
   1166void kvmppc_radix_flush_memslot(struct kvm *kvm,
   1167				const struct kvm_memory_slot *memslot)
   1168{
   1169	unsigned long n;
   1170	pte_t *ptep;
   1171	unsigned long gpa;
   1172	unsigned int shift;
   1173
   1174	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
   1175		kvmppc_uvmem_drop_pages(memslot, kvm, true);
   1176
   1177	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
   1178		return;
   1179
   1180	gpa = memslot->base_gfn << PAGE_SHIFT;
   1181	spin_lock(&kvm->mmu_lock);
   1182	for (n = memslot->npages; n; --n) {
   1183		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
   1184		if (ptep && pte_present(*ptep))
   1185			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
   1186					 kvm->arch.lpid);
   1187		gpa += PAGE_SIZE;
   1188	}
   1189	/*
   1190	 * Increase the mmu notifier sequence number to prevent any page
   1191	 * fault that read the memslot earlier from writing a PTE.
   1192	 */
   1193	kvm->mmu_notifier_seq++;
   1194	spin_unlock(&kvm->mmu_lock);
   1195}
   1196
   1197static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
   1198				 int psize, int *indexp)
   1199{
   1200	if (!mmu_psize_defs[psize].shift)
   1201		return;
   1202	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
   1203		(mmu_psize_defs[psize].ap << 29);
   1204	++(*indexp);
   1205}
   1206
   1207int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
   1208{
   1209	int i;
   1210
   1211	if (!radix_enabled())
   1212		return -EINVAL;
   1213	memset(info, 0, sizeof(*info));
   1214
   1215	/* 4k page size */
   1216	info->geometries[0].page_shift = 12;
   1217	info->geometries[0].level_bits[0] = 9;
   1218	for (i = 1; i < 4; ++i)
   1219		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
   1220	/* 64k page size */
   1221	info->geometries[1].page_shift = 16;
   1222	for (i = 0; i < 4; ++i)
   1223		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
   1224
   1225	i = 0;
   1226	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
   1227	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
   1228	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
   1229	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
   1230
   1231	return 0;
   1232}
   1233
   1234int kvmppc_init_vm_radix(struct kvm *kvm)
   1235{
   1236	kvm->arch.pgtable = pgd_alloc(kvm->mm);
   1237	if (!kvm->arch.pgtable)
   1238		return -ENOMEM;
   1239	return 0;
   1240}
   1241
   1242static void pte_ctor(void *addr)
   1243{
   1244	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
   1245}
   1246
   1247static void pmd_ctor(void *addr)
   1248{
   1249	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
   1250}
   1251
   1252struct debugfs_radix_state {
   1253	struct kvm	*kvm;
   1254	struct mutex	mutex;
   1255	unsigned long	gpa;
   1256	int		lpid;
   1257	int		chars_left;
   1258	int		buf_index;
   1259	char		buf[128];
   1260	u8		hdr;
   1261};
   1262
   1263static int debugfs_radix_open(struct inode *inode, struct file *file)
   1264{
   1265	struct kvm *kvm = inode->i_private;
   1266	struct debugfs_radix_state *p;
   1267
   1268	p = kzalloc(sizeof(*p), GFP_KERNEL);
   1269	if (!p)
   1270		return -ENOMEM;
   1271
   1272	kvm_get_kvm(kvm);
   1273	p->kvm = kvm;
   1274	mutex_init(&p->mutex);
   1275	file->private_data = p;
   1276
   1277	return nonseekable_open(inode, file);
   1278}
   1279
   1280static int debugfs_radix_release(struct inode *inode, struct file *file)
   1281{
   1282	struct debugfs_radix_state *p = file->private_data;
   1283
   1284	kvm_put_kvm(p->kvm);
   1285	kfree(p);
   1286	return 0;
   1287}
   1288
   1289static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
   1290				 size_t len, loff_t *ppos)
   1291{
   1292	struct debugfs_radix_state *p = file->private_data;
   1293	ssize_t ret, r;
   1294	unsigned long n;
   1295	struct kvm *kvm;
   1296	unsigned long gpa;
   1297	pgd_t *pgt;
   1298	struct kvm_nested_guest *nested;
   1299	pgd_t *pgdp;
   1300	p4d_t p4d, *p4dp;
   1301	pud_t pud, *pudp;
   1302	pmd_t pmd, *pmdp;
   1303	pte_t *ptep;
   1304	int shift;
   1305	unsigned long pte;
   1306
   1307	kvm = p->kvm;
   1308	if (!kvm_is_radix(kvm))
   1309		return 0;
   1310
   1311	ret = mutex_lock_interruptible(&p->mutex);
   1312	if (ret)
   1313		return ret;
   1314
   1315	if (p->chars_left) {
   1316		n = p->chars_left;
   1317		if (n > len)
   1318			n = len;
   1319		r = copy_to_user(buf, p->buf + p->buf_index, n);
   1320		n -= r;
   1321		p->chars_left -= n;
   1322		p->buf_index += n;
   1323		buf += n;
   1324		len -= n;
   1325		ret = n;
   1326		if (r) {
   1327			if (!n)
   1328				ret = -EFAULT;
   1329			goto out;
   1330		}
   1331	}
   1332
   1333	gpa = p->gpa;
   1334	nested = NULL;
   1335	pgt = NULL;
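        	/*
        	 * Walk the partition-scoped table of this guest (lpid 0) and
        	 * then of each nested guest, emitting one line per valid leaf
        	 * PTE.
        	 */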
   1336	while (len != 0 && p->lpid >= 0) {
   1337		if (gpa >= RADIX_PGTABLE_RANGE) {
   1338			gpa = 0;
   1339			pgt = NULL;
   1340			if (nested) {
   1341				kvmhv_put_nested(nested);
   1342				nested = NULL;
   1343			}
   1344			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
   1345			p->hdr = 0;
   1346			if (p->lpid < 0)
   1347				break;
   1348		}
   1349		if (!pgt) {
   1350			if (p->lpid == 0) {
   1351				pgt = kvm->arch.pgtable;
   1352			} else {
   1353				nested = kvmhv_get_nested(kvm, p->lpid, false);
   1354				if (!nested) {
   1355					gpa = RADIX_PGTABLE_RANGE;
   1356					continue;
   1357				}
   1358				pgt = nested->shadow_pgtable;
   1359			}
   1360		}
   1361		n = 0;
   1362		if (!p->hdr) {
   1363			if (p->lpid > 0)
   1364				n = scnprintf(p->buf, sizeof(p->buf),
   1365					      "\nNested LPID %d: ", p->lpid);
   1366			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
   1367				      "pgdir: %lx\n", (unsigned long)pgt);
   1368			p->hdr = 1;
   1369			goto copy;
   1370		}
   1371
   1372		pgdp = pgt + pgd_index(gpa);
   1373		p4dp = p4d_offset(pgdp, gpa);
   1374		p4d = READ_ONCE(*p4dp);
   1375		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
   1376			gpa = (gpa & P4D_MASK) + P4D_SIZE;
   1377			continue;
   1378		}
   1379
   1380		pudp = pud_offset(&p4d, gpa);
   1381		pud = READ_ONCE(*pudp);
   1382		if (!(pud_val(pud) & _PAGE_PRESENT)) {
   1383			gpa = (gpa & PUD_MASK) + PUD_SIZE;
   1384			continue;
   1385		}
   1386		if (pud_val(pud) & _PAGE_PTE) {
   1387			pte = pud_val(pud);
   1388			shift = PUD_SHIFT;
   1389			goto leaf;
   1390		}
   1391
   1392		pmdp = pmd_offset(&pud, gpa);
   1393		pmd = READ_ONCE(*pmdp);
   1394		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
   1395			gpa = (gpa & PMD_MASK) + PMD_SIZE;
   1396			continue;
   1397		}
   1398		if (pmd_val(pmd) & _PAGE_PTE) {
   1399			pte = pmd_val(pmd);
   1400			shift = PMD_SHIFT;
   1401			goto leaf;
   1402		}
   1403
   1404		ptep = pte_offset_kernel(&pmd, gpa);
   1405		pte = pte_val(READ_ONCE(*ptep));
   1406		if (!(pte & _PAGE_PRESENT)) {
   1407			gpa += PAGE_SIZE;
   1408			continue;
   1409		}
   1410		shift = PAGE_SHIFT;
   1411	leaf:
   1412		n = scnprintf(p->buf, sizeof(p->buf),
   1413			      " %lx: %lx %d\n", gpa, pte, shift);
   1414		gpa += 1ul << shift;
   1415	copy:
   1416		p->chars_left = n;
   1417		if (n > len)
   1418			n = len;
   1419		r = copy_to_user(buf, p->buf, n);
   1420		n -= r;
   1421		p->chars_left -= n;
   1422		p->buf_index = n;
   1423		buf += n;
   1424		len -= n;
   1425		ret += n;
   1426		if (r) {
   1427			if (!ret)
   1428				ret = -EFAULT;
   1429			break;
   1430		}
   1431	}
   1432	p->gpa = gpa;
   1433	if (nested)
   1434		kvmhv_put_nested(nested);
   1435
   1436 out:
   1437	mutex_unlock(&p->mutex);
   1438	return ret;
   1439}
   1440
   1441static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
   1442			   size_t len, loff_t *ppos)
   1443{
   1444	return -EACCES;
   1445}
   1446
   1447static const struct file_operations debugfs_radix_fops = {
   1448	.owner	 = THIS_MODULE,
   1449	.open	 = debugfs_radix_open,
   1450	.release = debugfs_radix_release,
   1451	.read	 = debugfs_radix_read,
   1452	.write	 = debugfs_radix_write,
   1453	.llseek	 = generic_file_llseek,
   1454};
   1455
   1456void kvmhv_radix_debugfs_init(struct kvm *kvm)
   1457{
   1458	debugfs_create_file("radix", 0400, kvm->debugfs_dentry, kvm,
   1459			    &debugfs_radix_fops);
   1460}
   1461
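        /*
         * The guest's partition-scoped PTE and PMD pages come from dedicated
         * slab caches, aligned to the table size so that the low bits of a
         * table's physical address are zero, as the radix page-directory
         * entry format requires, and pre-zeroed by the constructors.
         */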
   1462int kvmppc_radix_init(void)
   1463{
   1464	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
   1465
   1466	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
   1467	if (!kvm_pte_cache)
   1468		return -ENOMEM;
   1469
   1470	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
   1471
   1472	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
   1473	if (!kvm_pmd_cache) {
   1474		kmem_cache_destroy(kvm_pte_cache);
   1475		return -ENOMEM;
   1476	}
   1477
   1478	return 0;
   1479}
   1480
   1481void kvmppc_radix_exit(void)
   1482{
   1483	kmem_cache_destroy(kvm_pte_cache);
   1484	kmem_cache_destroy(kvm_pmd_cache);
   1485}