cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

gmap.c (79778B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 *  KVM guest address space mapping code
      4 *
      5 *    Copyright IBM Corp. 2007, 2020
      6 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
      7 *		 David Hildenbrand <david@redhat.com>
      8 *		 Janosch Frank <frankja@linux.vnet.ibm.com>
      9 */
     10
     11#include <linux/kernel.h>
     12#include <linux/pagewalk.h>
     13#include <linux/swap.h>
     14#include <linux/smp.h>
     15#include <linux/spinlock.h>
     16#include <linux/slab.h>
     17#include <linux/swapops.h>
     18#include <linux/ksm.h>
     19#include <linux/mman.h>
     20#include <linux/pgtable.h>
     21
     22#include <asm/pgalloc.h>
     23#include <asm/gmap.h>
     24#include <asm/tlb.h>
     25
     26#define GMAP_SHADOW_FAKE_TABLE 1ULL
     27
     28/**
     29 * gmap_alloc - allocate and initialize a guest address space
     30 * @limit: maximum address of the gmap address space
     31 *
     32 * Returns a guest address space structure.
     33 */
     34static struct gmap *gmap_alloc(unsigned long limit)
     35{
     36	struct gmap *gmap;
     37	struct page *page;
     38	unsigned long *table;
     39	unsigned long etype, atype;
     40
     41	if (limit < _REGION3_SIZE) {
     42		limit = _REGION3_SIZE - 1;
     43		atype = _ASCE_TYPE_SEGMENT;
     44		etype = _SEGMENT_ENTRY_EMPTY;
     45	} else if (limit < _REGION2_SIZE) {
     46		limit = _REGION2_SIZE - 1;
     47		atype = _ASCE_TYPE_REGION3;
     48		etype = _REGION3_ENTRY_EMPTY;
     49	} else if (limit < _REGION1_SIZE) {
     50		limit = _REGION1_SIZE - 1;
     51		atype = _ASCE_TYPE_REGION2;
     52		etype = _REGION2_ENTRY_EMPTY;
     53	} else {
     54		limit = -1UL;
     55		atype = _ASCE_TYPE_REGION1;
     56		etype = _REGION1_ENTRY_EMPTY;
     57	}
     58	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
     59	if (!gmap)
     60		goto out;
     61	INIT_LIST_HEAD(&gmap->crst_list);
     62	INIT_LIST_HEAD(&gmap->children);
     63	INIT_LIST_HEAD(&gmap->pt_list);
     64	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
     65	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
     66	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
     67	spin_lock_init(&gmap->guest_table_lock);
     68	spin_lock_init(&gmap->shadow_lock);
     69	refcount_set(&gmap->ref_count, 1);
     70	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
     71	if (!page)
     72		goto out_free;
     73	page->index = 0;
     74	list_add(&page->lru, &gmap->crst_list);
     75	table = (unsigned long *) page_to_phys(page);
     76	crst_table_init(table, etype);
     77	gmap->table = table;
     78	gmap->asce = atype | _ASCE_TABLE_LENGTH |
     79		_ASCE_USER_BITS | __pa(table);
     80	gmap->asce_end = limit;
     81	return gmap;
     82
     83out_free:
     84	kfree(gmap);
     85out:
     86	return NULL;
     87}
     88
     89/**
     90 * gmap_create - create a guest address space
     91 * @mm: pointer to the parent mm_struct
     92 * @limit: maximum size of the gmap address space
     93 *
     94 * Returns a guest address space structure.
     95 */
     96struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
     97{
     98	struct gmap *gmap;
     99	unsigned long gmap_asce;
    100
    101	gmap = gmap_alloc(limit);
    102	if (!gmap)
    103		return NULL;
    104	gmap->mm = mm;
    105	spin_lock(&mm->context.lock);
    106	list_add_rcu(&gmap->list, &mm->context.gmap_list);
    107	if (list_is_singular(&mm->context.gmap_list))
    108		gmap_asce = gmap->asce;
    109	else
    110		gmap_asce = -1UL;
    111	WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
    112	spin_unlock(&mm->context.lock);
    113	return gmap;
    114}
    115EXPORT_SYMBOL_GPL(gmap_create);
    116
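/*
 * Illustrative sketch, not part of gmap.c: how a hypervisor might set up a
 * guest address space with gmap_create(). The 4 TB limit and the wrapper
 * function are assumptions for the example; a real caller derives the limit
 * from the guest memory size. Only declarations from <asm/gmap.h>, already
 * included above, are used.
 */
static struct gmap *example_create_guest_space(struct mm_struct *mm)
{
	/* limit < _REGION2_SIZE, so gmap_alloc() picks a region-3 type ASCE */
	return gmap_create(mm, (1UL << 42) - 1);
}
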
    117static void gmap_flush_tlb(struct gmap *gmap)
    118{
    119	if (MACHINE_HAS_IDTE)
    120		__tlb_flush_idte(gmap->asce);
    121	else
    122		__tlb_flush_global();
    123}
    124
    125static void gmap_radix_tree_free(struct radix_tree_root *root)
    126{
    127	struct radix_tree_iter iter;
    128	unsigned long indices[16];
    129	unsigned long index;
    130	void __rcu **slot;
    131	int i, nr;
    132
    133	/* A radix tree is freed by deleting all of its entries */
    134	index = 0;
    135	do {
    136		nr = 0;
    137		radix_tree_for_each_slot(slot, root, &iter, index) {
    138			indices[nr] = iter.index;
    139			if (++nr == 16)
    140				break;
    141		}
    142		for (i = 0; i < nr; i++) {
    143			index = indices[i];
    144			radix_tree_delete(root, index);
    145		}
    146	} while (nr > 0);
    147}
    148
    149static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
    150{
    151	struct gmap_rmap *rmap, *rnext, *head;
    152	struct radix_tree_iter iter;
    153	unsigned long indices[16];
    154	unsigned long index;
    155	void __rcu **slot;
    156	int i, nr;
    157
    158	/* A radix tree is freed by deleting all of its entries */
    159	index = 0;
    160	do {
    161		nr = 0;
    162		radix_tree_for_each_slot(slot, root, &iter, index) {
    163			indices[nr] = iter.index;
    164			if (++nr == 16)
    165				break;
    166		}
    167		for (i = 0; i < nr; i++) {
    168			index = indices[i];
    169			head = radix_tree_delete(root, index);
    170			gmap_for_each_rmap_safe(rmap, rnext, head)
    171				kfree(rmap);
    172		}
    173	} while (nr > 0);
    174}
    175
    176/**
    177 * gmap_free - free a guest address space
    178 * @gmap: pointer to the guest address space structure
    179 *
    180 * No locks required. There are no references to this gmap anymore.
    181 */
    182static void gmap_free(struct gmap *gmap)
    183{
    184	struct page *page, *next;
    185
    186	/* Flush tlb of all gmaps (if not already done for shadows) */
    187	if (!(gmap_is_shadow(gmap) && gmap->removed))
    188		gmap_flush_tlb(gmap);
    189	/* Free all segment & region tables. */
    190	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
    191		__free_pages(page, CRST_ALLOC_ORDER);
    192	gmap_radix_tree_free(&gmap->guest_to_host);
    193	gmap_radix_tree_free(&gmap->host_to_guest);
    194
    195	/* Free additional data for a shadow gmap */
    196	if (gmap_is_shadow(gmap)) {
    197		/* Free all page tables. */
    198		list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
    199			page_table_free_pgste(page);
    200		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
    201		/* Release reference to the parent */
    202		gmap_put(gmap->parent);
    203	}
    204
    205	kfree(gmap);
    206}
    207
    208/**
    209 * gmap_get - increase reference counter for guest address space
    210 * @gmap: pointer to the guest address space structure
    211 *
    212 * Returns the gmap pointer
    213 */
    214struct gmap *gmap_get(struct gmap *gmap)
    215{
    216	refcount_inc(&gmap->ref_count);
    217	return gmap;
    218}
    219EXPORT_SYMBOL_GPL(gmap_get);
    220
    221/**
    222 * gmap_put - decrease reference counter for guest address space
    223 * @gmap: pointer to the guest address space structure
    224 *
    225 * If the reference counter reaches zero the guest address space is freed.
    226 */
    227void gmap_put(struct gmap *gmap)
    228{
    229	if (refcount_dec_and_test(&gmap->ref_count))
    230		gmap_free(gmap);
    231}
    232EXPORT_SYMBOL_GPL(gmap_put);
    233
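/*
 * Illustrative sketch, not part of gmap.c: the reference counting contract
 * of gmap_get()/gmap_put(). A caller that keeps a gmap pointer beyond the
 * current context takes an extra reference and drops it when done; the
 * final gmap_put() (usually issued via gmap_remove()) frees the structure
 * through gmap_free(). The wrapper below is only an example.
 */
static void example_borrow_gmap(struct gmap *gmap)
{
	struct gmap *g = gmap_get(gmap);	/* ref_count + 1 */

	/* ... use g while holding the reference ... */

	gmap_put(g);				/* ref_count - 1, may free */
}
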
    234/**
    235 * gmap_remove - remove a guest address space but do not free it yet
    236 * @gmap: pointer to the guest address space structure
    237 */
    238void gmap_remove(struct gmap *gmap)
    239{
    240	struct gmap *sg, *next;
    241	unsigned long gmap_asce;
    242
    243	/* Remove all shadow gmaps linked to this gmap */
    244	if (!list_empty(&gmap->children)) {
    245		spin_lock(&gmap->shadow_lock);
    246		list_for_each_entry_safe(sg, next, &gmap->children, list) {
    247			list_del(&sg->list);
    248			gmap_put(sg);
    249		}
    250		spin_unlock(&gmap->shadow_lock);
    251	}
    252	/* Remove gmap from the per-mm list */
    253	spin_lock(&gmap->mm->context.lock);
    254	list_del_rcu(&gmap->list);
    255	if (list_empty(&gmap->mm->context.gmap_list))
    256		gmap_asce = 0;
    257	else if (list_is_singular(&gmap->mm->context.gmap_list))
    258		gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
    259					     struct gmap, list)->asce;
    260	else
    261		gmap_asce = -1UL;
    262	WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
    263	spin_unlock(&gmap->mm->context.lock);
    264	synchronize_rcu();
    265	/* Put reference */
    266	gmap_put(gmap);
    267}
    268EXPORT_SYMBOL_GPL(gmap_remove);
    269
    270/**
    271 * gmap_enable - switch primary space to the guest address space
    272 * @gmap: pointer to the guest address space structure
    273 */
    274void gmap_enable(struct gmap *gmap)
    275{
    276	S390_lowcore.gmap = (unsigned long) gmap;
    277}
    278EXPORT_SYMBOL_GPL(gmap_enable);
    279
    280/**
    281 * gmap_disable - switch back to the standard primary address space
    282 * @gmap: pointer to the guest address space structure
    283 */
    284void gmap_disable(struct gmap *gmap)
    285{
    286	S390_lowcore.gmap = 0UL;
    287}
    288EXPORT_SYMBOL_GPL(gmap_disable);
    289
    290/**
    291 * gmap_get_enabled - get a pointer to the currently enabled gmap
    292 *
    293 * Returns a pointer to the currently enabled gmap. NULL if none is enabled.
    294 */
    295struct gmap *gmap_get_enabled(void)
    296{
    297	return (struct gmap *) S390_lowcore.gmap;
    298}
    299EXPORT_SYMBOL_GPL(gmap_get_enabled);
    300
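/*
 * Illustrative sketch, not part of gmap.c: gmap_enable()/gmap_disable()
 * only record the active gmap in S390_lowcore.gmap so that the low-level
 * entry/exit and guest fault handling code can look it up. Wrapping a
 * guest execution window like this is an assumption for the example,
 * not code copied from KVM.
 */
static void example_guest_window(struct gmap *gmap)
{
	gmap_enable(gmap);
	WARN_ON(gmap_get_enabled() != gmap);
	/* ... enter SIE / run the guest here ... */
	gmap_disable(gmap);
}
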
    301/*
    302 * gmap_alloc_table is assumed to be called with mmap_lock held
    303 */
    304static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
    305			    unsigned long init, unsigned long gaddr)
    306{
    307	struct page *page;
    308	unsigned long *new;
    309
    310	/* since we don't free the gmap table until gmap_free we can unlock */
    311	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
    312	if (!page)
    313		return -ENOMEM;
    314	new = (unsigned long *) page_to_phys(page);
    315	crst_table_init(new, init);
    316	spin_lock(&gmap->guest_table_lock);
    317	if (*table & _REGION_ENTRY_INVALID) {
    318		list_add(&page->lru, &gmap->crst_list);
    319		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
    320			(*table & _REGION_ENTRY_TYPE_MASK);
    321		page->index = gaddr;
    322		page = NULL;
    323	}
    324	spin_unlock(&gmap->guest_table_lock);
    325	if (page)
    326		__free_pages(page, CRST_ALLOC_ORDER);
    327	return 0;
    328}
    329
    330/**
    331 * __gmap_segment_gaddr - find virtual address from segment pointer
    332 * @entry: pointer to a segment table entry in the guest address space
    333 *
    334 * Returns the virtual address in the guest address space for the segment
    335 */
    336static unsigned long __gmap_segment_gaddr(unsigned long *entry)
    337{
    338	struct page *page;
    339	unsigned long offset, mask;
    340
    341	offset = (unsigned long) entry / sizeof(unsigned long);
    342	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
    343	mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
    344	page = virt_to_page((void *)((unsigned long) entry & mask));
    345	return page->index + offset;
    346}
    347
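/*
 * Worked example, added for clarity (not in the original source): a segment
 * table is 16 KB (PTRS_PER_PMD entries of 8 bytes) and naturally aligned,
 * so masking the entry pointer with ~(16 KB - 1) yields the table start,
 * whose page->index was set by gmap_alloc_table() to the guest address the
 * table maps. Entry N therefore corresponds to guest address
 * page->index + N * PMD_SIZE, e.g. entry 515 maps page->index + 515 MB.
 */
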
    348/**
    349 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
    350 * @gmap: pointer to the guest address space structure
    351 * @vmaddr: address in the host process address space
    352 *
    353 * Returns 1 if a TLB flush is required
    354 */
    355static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
    356{
    357	unsigned long *entry;
    358	int flush = 0;
    359
    360	BUG_ON(gmap_is_shadow(gmap));
    361	spin_lock(&gmap->guest_table_lock);
    362	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
    363	if (entry) {
    364		flush = (*entry != _SEGMENT_ENTRY_EMPTY);
    365		*entry = _SEGMENT_ENTRY_EMPTY;
    366	}
    367	spin_unlock(&gmap->guest_table_lock);
    368	return flush;
    369}
    370
    371/**
    372 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
    373 * @gmap: pointer to the guest address space structure
    374 * @gaddr: address in the guest address space
    375 *
    376 * Returns 1 if a TLB flush is required
    377 */
    378static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
    379{
    380	unsigned long vmaddr;
    381
    382	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
    383						   gaddr >> PMD_SHIFT);
    384	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
    385}
    386
    387/**
    388 * gmap_unmap_segment - unmap segment from the guest address space
    389 * @gmap: pointer to the guest address space structure
    390 * @to: address in the guest address space
    391 * @len: length of the memory area to unmap
    392 *
    393 * Returns 0 if the unmap succeeded, -EINVAL if not.
    394 */
    395int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
    396{
    397	unsigned long off;
    398	int flush;
    399
    400	BUG_ON(gmap_is_shadow(gmap));
    401	if ((to | len) & (PMD_SIZE - 1))
    402		return -EINVAL;
    403	if (len == 0 || to + len < to)
    404		return -EINVAL;
    405
    406	flush = 0;
    407	mmap_write_lock(gmap->mm);
    408	for (off = 0; off < len; off += PMD_SIZE)
    409		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
    410	mmap_write_unlock(gmap->mm);
    411	if (flush)
    412		gmap_flush_tlb(gmap);
    413	return 0;
    414}
    415EXPORT_SYMBOL_GPL(gmap_unmap_segment);
    416
    417/**
    418 * gmap_map_segment - map a segment to the guest address space
    419 * @gmap: pointer to the guest address space structure
    420 * @from: source address in the parent address space
    421 * @to: target address in the guest address space
    422 * @len: length of the memory area to map
    423 *
    424 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
    425 */
    426int gmap_map_segment(struct gmap *gmap, unsigned long from,
    427		     unsigned long to, unsigned long len)
    428{
    429	unsigned long off;
    430	int flush;
    431
    432	BUG_ON(gmap_is_shadow(gmap));
    433	if ((from | to | len) & (PMD_SIZE - 1))
    434		return -EINVAL;
    435	if (len == 0 || from + len < from || to + len < to ||
    436	    from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
    437		return -EINVAL;
    438
    439	flush = 0;
    440	mmap_write_lock(gmap->mm);
    441	for (off = 0; off < len; off += PMD_SIZE) {
    442		/* Remove old translation */
    443		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
    444		/* Store new translation */
    445		if (radix_tree_insert(&gmap->guest_to_host,
    446				      (to + off) >> PMD_SHIFT,
    447				      (void *) from + off))
    448			break;
    449	}
    450	mmap_write_unlock(gmap->mm);
    451	if (flush)
    452		gmap_flush_tlb(gmap);
    453	if (off >= len)
    454		return 0;
    455	gmap_unmap_segment(gmap, to, len);
    456	return -ENOMEM;
    457}
    458EXPORT_SYMBOL_GPL(gmap_map_segment);
    459
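/*
 * Illustrative sketch, not part of gmap.c: mapping one segment-aligned
 * chunk of the host address space into the guest and tearing it down
 * again. Both addresses and the length must be PMD_SIZE (1 MB segment)
 * aligned or the calls fail with -EINVAL. The wrapper and its parameters
 * are assumptions for the example.
 */
static int example_map_segment(struct gmap *gmap, unsigned long host_addr,
			       unsigned long guest_addr)
{
	int rc;

	rc = gmap_map_segment(gmap, host_addr, guest_addr, PMD_SIZE);
	if (rc)
		return rc;	/* -EINVAL or -ENOMEM */
	/* ... run the guest ... */
	return gmap_unmap_segment(gmap, guest_addr, PMD_SIZE);
}
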
    460/**
    461 * __gmap_translate - translate a guest address to a user space address
    462 * @gmap: pointer to guest mapping meta data structure
    463 * @gaddr: guest address
    464 *
    465 * Returns user space address which corresponds to the guest address or
    466 * -EFAULT if no such mapping exists.
    467 * This function does not establish potentially missing page table entries.
    468 * The mmap_lock of the mm that belongs to the address space must be held
    469 * when this function gets called.
    470 *
    471 * Note: Can also be called for shadow gmaps.
    472 */
    473unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
    474{
    475	unsigned long vmaddr;
    476
    477	vmaddr = (unsigned long)
    478		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
    479	/* Note: guest_to_host is empty for a shadow gmap */
    480	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
    481}
    482EXPORT_SYMBOL_GPL(__gmap_translate);
    483
    484/**
    485 * gmap_translate - translate a guest address to a user space address
    486 * @gmap: pointer to guest mapping meta data structure
    487 * @gaddr: guest address
    488 *
    489 * Returns user space address which corresponds to the guest address or
    490 * -EFAULT if no such mapping exists.
    491 * This function does not establish potentially missing page table entries.
    492 */
    493unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
    494{
    495	unsigned long rc;
    496
    497	mmap_read_lock(gmap->mm);
    498	rc = __gmap_translate(gmap, gaddr);
    499	mmap_read_unlock(gmap->mm);
    500	return rc;
    501}
    502EXPORT_SYMBOL_GPL(gmap_translate);
    503
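/*
 * Illustrative sketch, not part of gmap.c: translating a guest address to
 * the corresponding user space address. The return value doubles as an
 * error code, so it is checked with IS_ERR_VALUE(), just like the callers
 * in this file do. The wrapper is an assumption for the example.
 */
static int example_translate(struct gmap *gmap, unsigned long gaddr,
			     unsigned long *vmaddr)
{
	unsigned long addr;

	addr = gmap_translate(gmap, gaddr);
	if (IS_ERR_VALUE(addr))
		return (int)addr;	/* -EFAULT: no mapping exists */
	*vmaddr = addr;
	return 0;
}
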
    504/**
    505 * gmap_unlink - disconnect a page table from the gmap shadow tables
    506 * @mm: pointer to the parent mm_struct
    507 * @table: pointer to the host page table
    508 * @vmaddr: vm address associated with the host page table
    509 */
    510void gmap_unlink(struct mm_struct *mm, unsigned long *table,
    511		 unsigned long vmaddr)
    512{
    513	struct gmap *gmap;
    514	int flush;
    515
    516	rcu_read_lock();
    517	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
    518		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
    519		if (flush)
    520			gmap_flush_tlb(gmap);
    521	}
    522	rcu_read_unlock();
    523}
    524
    525static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
    526			   unsigned long gaddr);
    527
    528/**
    529 * __gmap_link - set up shadow page tables to connect a host to a guest address
    530 * @gmap: pointer to guest mapping meta data structure
    531 * @gaddr: guest address
    532 * @vmaddr: vm address
    533 *
    534 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
    535 * if the vm address is already mapped to a different guest segment.
    536 * The mmap_lock of the mm that belongs to the address space must be held
    537 * when this function gets called.
    538 */
    539int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
    540{
    541	struct mm_struct *mm;
    542	unsigned long *table;
    543	spinlock_t *ptl;
    544	pgd_t *pgd;
    545	p4d_t *p4d;
    546	pud_t *pud;
    547	pmd_t *pmd;
    548	u64 unprot;
    549	int rc;
    550
    551	BUG_ON(gmap_is_shadow(gmap));
    552	/* Create higher level tables in the gmap page table */
    553	table = gmap->table;
    554	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
    555		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
    556		if ((*table & _REGION_ENTRY_INVALID) &&
    557		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
    558				     gaddr & _REGION1_MASK))
    559			return -ENOMEM;
    560		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
    561	}
    562	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
    563		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
    564		if ((*table & _REGION_ENTRY_INVALID) &&
    565		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
    566				     gaddr & _REGION2_MASK))
    567			return -ENOMEM;
    568		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
    569	}
    570	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
    571		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
    572		if ((*table & _REGION_ENTRY_INVALID) &&
    573		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
    574				     gaddr & _REGION3_MASK))
    575			return -ENOMEM;
    576		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
    577	}
    578	table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
    579	/* Walk the parent mm page table */
    580	mm = gmap->mm;
    581	pgd = pgd_offset(mm, vmaddr);
    582	VM_BUG_ON(pgd_none(*pgd));
    583	p4d = p4d_offset(pgd, vmaddr);
    584	VM_BUG_ON(p4d_none(*p4d));
    585	pud = pud_offset(p4d, vmaddr);
    586	VM_BUG_ON(pud_none(*pud));
    587	/* large puds cannot yet be handled */
    588	if (pud_large(*pud))
    589		return -EFAULT;
    590	pmd = pmd_offset(pud, vmaddr);
    591	VM_BUG_ON(pmd_none(*pmd));
    592	/* Are we allowed to use huge pages? */
    593	if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
    594		return -EFAULT;
    595	/* Link gmap segment table entry location to page table. */
    596	rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
    597	if (rc)
    598		return rc;
    599	ptl = pmd_lock(mm, pmd);
    600	spin_lock(&gmap->guest_table_lock);
    601	if (*table == _SEGMENT_ENTRY_EMPTY) {
    602		rc = radix_tree_insert(&gmap->host_to_guest,
    603				       vmaddr >> PMD_SHIFT, table);
    604		if (!rc) {
    605			if (pmd_large(*pmd)) {
    606				*table = (pmd_val(*pmd) &
    607					  _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
    608					| _SEGMENT_ENTRY_GMAP_UC;
    609			} else
    610				*table = pmd_val(*pmd) &
    611					_SEGMENT_ENTRY_HARDWARE_BITS;
    612		}
    613	} else if (*table & _SEGMENT_ENTRY_PROTECT &&
    614		   !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
    615		unprot = (u64)*table;
    616		unprot &= ~_SEGMENT_ENTRY_PROTECT;
    617		unprot |= _SEGMENT_ENTRY_GMAP_UC;
    618		gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
    619	}
    620	spin_unlock(&gmap->guest_table_lock);
    621	spin_unlock(ptl);
    622	radix_tree_preload_end();
    623	return rc;
    624}
    625
    626/**
    627 * gmap_fault - resolve a fault on a guest address
    628 * @gmap: pointer to guest mapping meta data structure
    629 * @gaddr: guest address
    630 * @fault_flags: flags to pass down to handle_mm_fault()
    631 *
    632 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
    633 * if the vm address is already mapped to a different guest segment.
    634 */
    635int gmap_fault(struct gmap *gmap, unsigned long gaddr,
    636	       unsigned int fault_flags)
    637{
    638	unsigned long vmaddr;
    639	int rc;
    640	bool unlocked;
    641
    642	mmap_read_lock(gmap->mm);
    643
    644retry:
    645	unlocked = false;
    646	vmaddr = __gmap_translate(gmap, gaddr);
    647	if (IS_ERR_VALUE(vmaddr)) {
    648		rc = vmaddr;
    649		goto out_up;
    650	}
    651	if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
    652			     &unlocked)) {
    653		rc = -EFAULT;
    654		goto out_up;
    655	}
    656	/*
    657	 * In the case that fixup_user_fault unlocked the mmap_lock during the
    658	 * fault-in, redo __gmap_translate to not race with a map/unmap_segment.
    659	 */
    660	if (unlocked)
    661		goto retry;
    662
    663	rc = __gmap_link(gmap, gaddr, vmaddr);
    664out_up:
    665	mmap_read_unlock(gmap->mm);
    666	return rc;
    667}
    668EXPORT_SYMBOL_GPL(gmap_fault);
    669
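/*
 * Illustrative sketch, not part of gmap.c: resolving a guest write fault.
 * FAULT_FLAG_WRITE comes from the core mm headers; a read-only fault-in
 * would pass 0 instead. The wrapper itself is an assumption for the
 * example.
 */
static int example_resolve_write_fault(struct gmap *gmap, unsigned long gaddr)
{
	return gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE);
}
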
    670/*
    671 * this function is assumed to be called with mmap_lock held
    672 */
    673void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
    674{
    675	struct vm_area_struct *vma;
    676	unsigned long vmaddr;
    677	spinlock_t *ptl;
    678	pte_t *ptep;
    679
    680	/* Find the vm address for the guest address */
    681	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
    682						   gaddr >> PMD_SHIFT);
    683	if (vmaddr) {
    684		vmaddr |= gaddr & ~PMD_MASK;
    685
    686		vma = vma_lookup(gmap->mm, vmaddr);
    687		if (!vma || is_vm_hugetlb_page(vma))
    688			return;
    689
    690		/* Get pointer to the page table entry */
    691		ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
    692		if (likely(ptep)) {
    693			ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
    694			pte_unmap_unlock(ptep, ptl);
    695		}
    696	}
    697}
    698EXPORT_SYMBOL_GPL(__gmap_zap);
    699
    700void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
    701{
    702	unsigned long gaddr, vmaddr, size;
    703	struct vm_area_struct *vma;
    704
    705	mmap_read_lock(gmap->mm);
    706	for (gaddr = from; gaddr < to;
    707	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
    708		/* Find the vm address for the guest address */
    709		vmaddr = (unsigned long)
    710			radix_tree_lookup(&gmap->guest_to_host,
    711					  gaddr >> PMD_SHIFT);
    712		if (!vmaddr)
    713			continue;
    714		vmaddr |= gaddr & ~PMD_MASK;
    715		/* Find vma in the parent mm */
    716		vma = find_vma(gmap->mm, vmaddr);
    717		if (!vma)
    718			continue;
    719		/*
    720		 * We do not discard pages that are backed by
    721		 * hugetlbfs, so we don't have to refault them.
    722		 */
    723		if (is_vm_hugetlb_page(vma))
    724			continue;
    725		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
    726		zap_page_range(vma, vmaddr, size);
    727	}
    728	mmap_read_unlock(gmap->mm);
    729}
    730EXPORT_SYMBOL_GPL(gmap_discard);
    731
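/*
 * Illustrative sketch, not part of gmap.c: dropping the backing of one
 * guest segment, e.g. in response to a guest hint that the memory is no
 * longer needed. The 1 MB range and the helper name are assumptions for
 * the example.
 */
static void example_discard_segment(struct gmap *gmap, unsigned long gaddr)
{
	gmap_discard(gmap, gaddr & PMD_MASK, (gaddr & PMD_MASK) + PMD_SIZE);
}
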
    732static LIST_HEAD(gmap_notifier_list);
    733static DEFINE_SPINLOCK(gmap_notifier_lock);
    734
    735/**
    736 * gmap_register_pte_notifier - register a pte invalidation callback
    737 * @nb: pointer to the gmap notifier block
    738 */
    739void gmap_register_pte_notifier(struct gmap_notifier *nb)
    740{
    741	spin_lock(&gmap_notifier_lock);
    742	list_add_rcu(&nb->list, &gmap_notifier_list);
    743	spin_unlock(&gmap_notifier_lock);
    744}
    745EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
    746
    747/**
    748 * gmap_unregister_pte_notifier - remove a pte invalidation callback
    749 * @nb: pointer to the gmap notifier block
    750 */
    751void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
    752{
    753	spin_lock(&gmap_notifier_lock);
    754	list_del_rcu(&nb->list);
    755	spin_unlock(&gmap_notifier_lock);
    756	synchronize_rcu();
    757}
    758EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
    759
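/*
 * Illustrative sketch, not part of gmap.c: registering an invalidation
 * callback. The callback signature matches gmap_call_notifier() below,
 * which invokes nb->notifier_call(gmap, start, end) for every registered
 * block; the callback body and names here are placeholders.
 */
static void example_invalidation_cb(struct gmap *gmap, unsigned long start,
				    unsigned long end)
{
	/* react to the guest range [start, end] losing its protection */
}

static struct gmap_notifier example_notifier = {
	.notifier_call = example_invalidation_cb,
};

static void example_register_notifier(void)
{
	gmap_register_pte_notifier(&example_notifier);
}
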
    760/**
    761 * gmap_call_notifier - call all registered invalidation callbacks
    762 * @gmap: pointer to guest mapping meta data structure
    763 * @start: start virtual address in the guest address space
    764 * @end: end virtual address in the guest address space
    765 */
    766static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
    767			       unsigned long end)
    768{
    769	struct gmap_notifier *nb;
    770
    771	list_for_each_entry(nb, &gmap_notifier_list, list)
    772		nb->notifier_call(gmap, start, end);
    773}
    774
    775/**
    776 * gmap_table_walk - walk the gmap page tables
    777 * @gmap: pointer to guest mapping meta data structure
    778 * @gaddr: virtual address in the guest address space
    779 * @level: page table level to stop at
    780 *
    781 * Returns a table entry pointer for the given guest address and @level
    782 * @level=0 : returns a pointer to a page table entry (or NULL)
    783 * @level=1 : returns a pointer to a segment table entry (or NULL)
    784 * @level=2 : returns a pointer to a region-3 table entry (or NULL)
    785 * @level=3 : returns a pointer to a region-2 table entry (or NULL)
    786 * @level=4 : returns a pointer to a region-1 table entry (or NULL)
    787 *
    788 * Returns NULL if the gmap page tables could not be walked to the
    789 * requested level.
    790 *
    791 * Note: Can also be called for shadow gmaps.
    792 */
    793static inline unsigned long *gmap_table_walk(struct gmap *gmap,
    794					     unsigned long gaddr, int level)
    795{
    796	const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
    797	unsigned long *table = gmap->table;
    798
    799	if (gmap_is_shadow(gmap) && gmap->removed)
    800		return NULL;
    801
    802	if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
    803		return NULL;
    804
    805	if (asce_type != _ASCE_TYPE_REGION1 &&
    806	    gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
    807		return NULL;
    808
    809	switch (asce_type) {
    810	case _ASCE_TYPE_REGION1:
    811		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
    812		if (level == 4)
    813			break;
    814		if (*table & _REGION_ENTRY_INVALID)
    815			return NULL;
    816		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
    817		fallthrough;
    818	case _ASCE_TYPE_REGION2:
    819		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
    820		if (level == 3)
    821			break;
    822		if (*table & _REGION_ENTRY_INVALID)
    823			return NULL;
    824		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
    825		fallthrough;
    826	case _ASCE_TYPE_REGION3:
    827		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
    828		if (level == 2)
    829			break;
    830		if (*table & _REGION_ENTRY_INVALID)
    831			return NULL;
    832		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
    833		fallthrough;
    834	case _ASCE_TYPE_SEGMENT:
    835		table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
    836		if (level == 1)
    837			break;
    838		if (*table & _REGION_ENTRY_INVALID)
    839			return NULL;
    840		table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
    841		table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT;
    842	}
    843	return table;
    844}
    845
    846/**
    847 * gmap_pte_op_walk - walk the gmap page table, get the page table lock
    848 *		      and return the pte pointer
    849 * @gmap: pointer to guest mapping meta data structure
    850 * @gaddr: virtual address in the guest address space
    851 * @ptl: pointer to the spinlock pointer
    852 *
    853 * Returns a pointer to the locked pte for a guest address, or NULL
    854 */
    855static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
    856			       spinlock_t **ptl)
    857{
    858	unsigned long *table;
    859
    860	BUG_ON(gmap_is_shadow(gmap));
    861	/* Walk the gmap page table, lock and get pte pointer */
    862	table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
    863	if (!table || *table & _SEGMENT_ENTRY_INVALID)
    864		return NULL;
    865	return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
    866}
    867
    868/**
    869 * gmap_pte_op_fixup - force a page in and connect the gmap page table
    870 * @gmap: pointer to guest mapping meta data structure
    871 * @gaddr: virtual address in the guest address space
    872 * @vmaddr: address in the host process address space
    873 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
    874 *
    875 * Returns 0 if the caller can retry __gmap_translate (might fail again),
    876 * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
    877 * up or connecting the gmap page table.
    878 */
    879static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
    880			     unsigned long vmaddr, int prot)
    881{
    882	struct mm_struct *mm = gmap->mm;
    883	unsigned int fault_flags;
    884	bool unlocked = false;
    885
    886	BUG_ON(gmap_is_shadow(gmap));
    887	fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
    888	if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
    889		return -EFAULT;
    890	if (unlocked)
    891		/* lost mmap_lock, caller has to retry __gmap_translate */
    892		return 0;
    893	/* Connect the page tables */
    894	return __gmap_link(gmap, gaddr, vmaddr);
    895}
    896
    897/**
    898 * gmap_pte_op_end - release the page table lock
    899 * @ptl: pointer to the spinlock pointer
    900 */
    901static void gmap_pte_op_end(spinlock_t *ptl)
    902{
    903	if (ptl)
    904		spin_unlock(ptl);
    905}
    906
    907/**
    908 * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
    909 *		      and return the pmd pointer
    910 * @gmap: pointer to guest mapping meta data structure
    911 * @gaddr: virtual address in the guest address space
    912 *
    913 * Returns a pointer to the pmd for a guest address, or NULL
    914 */
    915static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
    916{
    917	pmd_t *pmdp;
    918
    919	BUG_ON(gmap_is_shadow(gmap));
    920	pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
    921	if (!pmdp)
    922		return NULL;
    923
    924	/* without huge pages, there is no need to take the table lock */
    925	if (!gmap->mm->context.allow_gmap_hpage_1m)
    926		return pmd_none(*pmdp) ? NULL : pmdp;
    927
    928	spin_lock(&gmap->guest_table_lock);
    929	if (pmd_none(*pmdp)) {
    930		spin_unlock(&gmap->guest_table_lock);
    931		return NULL;
    932	}
    933
    934	/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
    935	if (!pmd_large(*pmdp))
    936		spin_unlock(&gmap->guest_table_lock);
    937	return pmdp;
    938}
    939
    940/**
    941 * gmap_pmd_op_end - release the guest_table_lock if needed
    942 * @gmap: pointer to the guest mapping meta data structure
    943 * @pmdp: pointer to the pmd
    944 */
    945static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
    946{
    947	if (pmd_large(*pmdp))
    948		spin_unlock(&gmap->guest_table_lock);
    949}
    950
    951/*
    952 * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
    953 * @pmdp: pointer to the pmd to be protected
    954 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
    955 * @bits: notification bits to set
    956 *
    957 * Returns:
    958 * 0 if successfully protected
    959 * -EAGAIN if a fixup is needed
    960 * -EINVAL if unsupported notifier bits have been specified
    961 *
    962 * Expected to be called with sg->mm->mmap_lock in read and
    963 * guest_table_lock held.
    964 */
    965static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
    966			    pmd_t *pmdp, int prot, unsigned long bits)
    967{
    968	int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
    969	int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
    970	pmd_t new = *pmdp;
    971
    972	/* Fixup needed */
    973	if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
    974		return -EAGAIN;
    975
    976	if (prot == PROT_NONE && !pmd_i) {
    977		new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
    978		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
    979	}
    980
    981	if (prot == PROT_READ && !pmd_p) {
    982		new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
    983		new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
    984		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
    985	}
    986
    987	if (bits & GMAP_NOTIFY_MPROT)
    988		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
    989
    990	/* Shadow GMAP protection needs split PMDs */
    991	if (bits & GMAP_NOTIFY_SHADOW)
    992		return -EINVAL;
    993
    994	return 0;
    995}
    996
    997/*
    998 * gmap_protect_pte - remove access rights to memory and set pgste bits
    999 * @gmap: pointer to guest mapping meta data structure
   1000 * @gaddr: virtual address in the guest address space
   1001 * @pmdp: pointer to the pmd associated with the pte
   1002 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
   1003 * @bits: notification bits to set
   1004 *
   1005 * Returns 0 if successfully protected, -ENOMEM if out of memory and
   1006 * -EAGAIN if a fixup is needed.
   1007 *
   1008 * Expected to be called with sg->mm->mmap_lock in read
   1009 */
   1010static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
   1011			    pmd_t *pmdp, int prot, unsigned long bits)
   1012{
   1013	int rc;
   1014	pte_t *ptep;
   1015	spinlock_t *ptl = NULL;
   1016	unsigned long pbits = 0;
   1017
   1018	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
   1019		return -EAGAIN;
   1020
   1021	ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
   1022	if (!ptep)
   1023		return -ENOMEM;
   1024
   1025	pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
   1026	pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
   1027	/* Protect and unlock. */
   1028	rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
   1029	gmap_pte_op_end(ptl);
   1030	return rc;
   1031}
   1032
   1033/*
   1034 * gmap_protect_range - remove access rights to memory and set pgste bits
   1035 * @gmap: pointer to guest mapping meta data structure
   1036 * @gaddr: virtual address in the guest address space
   1037 * @len: size of area
   1038 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
   1039 * @bits: pgste notification bits to set
   1040 *
   1041 * Returns 0 if successfully protected, -ENOMEM if out of memory and
   1042 * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
   1043 *
   1044 * Called with sg->mm->mmap_lock in read.
   1045 */
   1046static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
   1047			      unsigned long len, int prot, unsigned long bits)
   1048{
   1049	unsigned long vmaddr, dist;
   1050	pmd_t *pmdp;
   1051	int rc;
   1052
   1053	BUG_ON(gmap_is_shadow(gmap));
   1054	while (len) {
   1055		rc = -EAGAIN;
   1056		pmdp = gmap_pmd_op_walk(gmap, gaddr);
   1057		if (pmdp) {
   1058			if (!pmd_large(*pmdp)) {
   1059				rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
   1060						      bits);
   1061				if (!rc) {
   1062					len -= PAGE_SIZE;
   1063					gaddr += PAGE_SIZE;
   1064				}
   1065			} else {
   1066				rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot,
   1067						      bits);
   1068				if (!rc) {
   1069					dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK);
   1070					len = len < dist ? 0 : len - dist;
   1071					gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
   1072				}
   1073			}
   1074			gmap_pmd_op_end(gmap, pmdp);
   1075		}
   1076		if (rc) {
   1077			if (rc == -EINVAL)
   1078				return rc;
   1079
   1080			/* -EAGAIN, fixup of userspace mm and gmap */
   1081			vmaddr = __gmap_translate(gmap, gaddr);
   1082			if (IS_ERR_VALUE(vmaddr))
   1083				return vmaddr;
   1084			rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
   1085			if (rc)
   1086				return rc;
   1087		}
   1088	}
   1089	return 0;
   1090}
   1091
   1092/**
   1093 * gmap_mprotect_notify - change access rights for a range of ptes and
   1094 *                        call the notifier if any pte changes again
   1095 * @gmap: pointer to guest mapping meta data structure
   1096 * @gaddr: virtual address in the guest address space
   1097 * @len: size of area
   1098 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
   1099 *
   1100 * Returns 0 if for each page in the given range a gmap mapping exists,
   1101 * the new access rights could be set and the notifier could be armed.
   1102 * If the gmap mapping is missing for one or more pages -EFAULT is
   1103 * returned. If no memory could be allocated -ENOMEM is returned.
   1104 * This function establishes missing page table entries.
   1105 */
   1106int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
   1107			 unsigned long len, int prot)
   1108{
   1109	int rc;
   1110
   1111	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
   1112		return -EINVAL;
   1113	if (!MACHINE_HAS_ESOP && prot == PROT_READ)
   1114		return -EINVAL;
   1115	mmap_read_lock(gmap->mm);
   1116	rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
   1117	mmap_read_unlock(gmap->mm);
   1118	return rc;
   1119}
   1120EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
   1121
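/*
 * Illustrative sketch, not part of gmap.c: write-protecting one guest page
 * and arming the notifier, as an interception handler might do before
 * shadowing data at that address. The single-page length is an assumption
 * for the example; PROT_READ protection additionally requires
 * MACHINE_HAS_ESOP, otherwise the call returns -EINVAL.
 */
static int example_arm_notifier(struct gmap *gmap, unsigned long gaddr)
{
	return gmap_mprotect_notify(gmap, gaddr & PAGE_MASK, PAGE_SIZE,
				    PROT_READ);
}
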
   1122/**
   1123 * gmap_read_table - get an unsigned long value from a guest page table using
   1124 *                   absolute addressing, without marking the page referenced.
   1125 * @gmap: pointer to guest mapping meta data structure
   1126 * @gaddr: virtual address in the guest address space
   1127 * @val: pointer to the unsigned long value to return
   1128 *
   1129 * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
   1130 * if reading using the virtual address failed. -EINVAL if called on a gmap
   1131 * shadow.
   1132 *
   1133 * Called with gmap->mm->mmap_lock in read.
   1134 */
   1135int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
   1136{
   1137	unsigned long address, vmaddr;
   1138	spinlock_t *ptl;
   1139	pte_t *ptep, pte;
   1140	int rc;
   1141
   1142	if (gmap_is_shadow(gmap))
   1143		return -EINVAL;
   1144
   1145	while (1) {
   1146		rc = -EAGAIN;
   1147		ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
   1148		if (ptep) {
   1149			pte = *ptep;
   1150			if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
   1151				address = pte_val(pte) & PAGE_MASK;
   1152				address += gaddr & ~PAGE_MASK;
   1153				*val = *(unsigned long *) address;
   1154				set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
   1155				/* Do *NOT* clear the _PAGE_INVALID bit! */
   1156				rc = 0;
   1157			}
   1158			gmap_pte_op_end(ptl);
   1159		}
   1160		if (!rc)
   1161			break;
   1162		vmaddr = __gmap_translate(gmap, gaddr);
   1163		if (IS_ERR_VALUE(vmaddr)) {
   1164			rc = vmaddr;
   1165			break;
   1166		}
   1167		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
   1168		if (rc)
   1169			break;
   1170	}
   1171	return rc;
   1172}
   1173EXPORT_SYMBOL_GPL(gmap_read_table);
   1174
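/*
 * Illustrative sketch, not part of gmap.c: peeking at a guest-owned word
 * (for instance an entry of a nested guest's DAT table) without marking
 * the page referenced. gmap_read_table() expects gmap->mm->mmap_lock held
 * in read mode, as documented above; the wrapper is an assumption for the
 * example.
 */
static int example_peek_guest_word(struct gmap *gmap, unsigned long gaddr,
				   unsigned long *val)
{
	int rc;

	mmap_read_lock(gmap->mm);
	rc = gmap_read_table(gmap, gaddr, val);
	mmap_read_unlock(gmap->mm);
	return rc;
}
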
   1175/**
   1176 * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
   1177 * @sg: pointer to the shadow guest address space structure
   1178 * @vmaddr: vm address associated with the rmap
   1179 * @rmap: pointer to the rmap structure
   1180 *
   1181 * Called with the sg->guest_table_lock
   1182 */
   1183static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
   1184				    struct gmap_rmap *rmap)
   1185{
   1186	struct gmap_rmap *temp;
   1187	void __rcu **slot;
   1188
   1189	BUG_ON(!gmap_is_shadow(sg));
   1190	slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
   1191	if (slot) {
   1192		rmap->next = radix_tree_deref_slot_protected(slot,
   1193							&sg->guest_table_lock);
   1194		for (temp = rmap->next; temp; temp = temp->next) {
   1195			if (temp->raddr == rmap->raddr) {
   1196				kfree(rmap);
   1197				return;
   1198			}
   1199		}
   1200		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
   1201	} else {
   1202		rmap->next = NULL;
   1203		radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
   1204				  rmap);
   1205	}
   1206}
   1207
   1208/**
   1209 * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
   1210 * @sg: pointer to the shadow guest address space structure
   1211 * @raddr: rmap address in the shadow gmap
   1212 * @paddr: address in the parent guest address space
   1213 * @len: length of the memory area to protect
   1214 *
   1215 * Returns 0 if successfully protected and the rmap was created, -ENOMEM
   1216 * if out of memory and -EFAULT if paddr is invalid.
   1217 */
   1218static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
   1219			     unsigned long paddr, unsigned long len)
   1220{
   1221	struct gmap *parent;
   1222	struct gmap_rmap *rmap;
   1223	unsigned long vmaddr;
   1224	spinlock_t *ptl;
   1225	pte_t *ptep;
   1226	int rc;
   1227
   1228	BUG_ON(!gmap_is_shadow(sg));
   1229	parent = sg->parent;
   1230	while (len) {
   1231		vmaddr = __gmap_translate(parent, paddr);
   1232		if (IS_ERR_VALUE(vmaddr))
   1233			return vmaddr;
   1234		rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
   1235		if (!rmap)
   1236			return -ENOMEM;
   1237		rmap->raddr = raddr;
   1238		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
   1239		if (rc) {
   1240			kfree(rmap);
   1241			return rc;
   1242		}
   1243		rc = -EAGAIN;
   1244		ptep = gmap_pte_op_walk(parent, paddr, &ptl);
   1245		if (ptep) {
   1246			spin_lock(&sg->guest_table_lock);
   1247			rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
   1248					     PGSTE_VSIE_BIT);
   1249			if (!rc)
   1250				gmap_insert_rmap(sg, vmaddr, rmap);
   1251			spin_unlock(&sg->guest_table_lock);
   1252			gmap_pte_op_end(ptl);
   1253		}
   1254		radix_tree_preload_end();
   1255		if (rc) {
   1256			kfree(rmap);
   1257			rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
   1258			if (rc)
   1259				return rc;
   1260			continue;
   1261		}
   1262		paddr += PAGE_SIZE;
   1263		len -= PAGE_SIZE;
   1264	}
   1265	return 0;
   1266}
   1267
   1268#define _SHADOW_RMAP_MASK	0x7
   1269#define _SHADOW_RMAP_REGION1	0x5
   1270#define _SHADOW_RMAP_REGION2	0x4
   1271#define _SHADOW_RMAP_REGION3	0x3
   1272#define _SHADOW_RMAP_SEGMENT	0x2
   1273#define _SHADOW_RMAP_PGTABLE	0x1
   1274
   1275/**
   1276 * gmap_idte_one - invalidate a single region or segment table entry
   1277 * @asce: region or segment table *origin* + table-type bits
   1278 * @vaddr: virtual address to identify the table entry to flush
   1279 *
   1280 * The invalid bit of a single region or segment table entry is set
   1281 * and the associated TLB entries depending on the entry are flushed.
   1282 * The table-type of the @asce identifies the portion of the @vaddr
   1283 * that is used as the invalidation index.
   1284 */
   1285static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
   1286{
   1287	asm volatile(
   1288		"	idte	%0,0,%1"
   1289		: : "a" (asce), "a" (vaddr) : "cc", "memory");
   1290}
   1291
   1292/**
   1293 * gmap_unshadow_page - remove a page from a shadow page table
   1294 * @sg: pointer to the shadow guest address space structure
   1295 * @raddr: rmap address in the shadow guest address space
   1296 *
   1297 * Called with the sg->guest_table_lock
   1298 */
   1299static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
   1300{
   1301	unsigned long *table;
   1302
   1303	BUG_ON(!gmap_is_shadow(sg));
   1304	table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
   1305	if (!table || *table & _PAGE_INVALID)
   1306		return;
   1307	gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1);
   1308	ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
   1309}
   1310
   1311/**
   1312 * __gmap_unshadow_pgt - remove all entries from a shadow page table
   1313 * @sg: pointer to the shadow guest address space structure
   1314 * @raddr: rmap address in the shadow guest address space
   1315 * @pgt: pointer to the start of a shadow page table
   1316 *
   1317 * Called with the sg->guest_table_lock
   1318 */
   1319static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
   1320				unsigned long *pgt)
   1321{
   1322	int i;
   1323
   1324	BUG_ON(!gmap_is_shadow(sg));
   1325	for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE)
   1326		pgt[i] = _PAGE_INVALID;
   1327}
   1328
   1329/**
   1330 * gmap_unshadow_pgt - remove a shadow page table from a segment entry
   1331 * @sg: pointer to the shadow guest address space structure
   1332 * @raddr: address in the shadow guest address space
   1333 *
   1334 * Called with the sg->guest_table_lock
   1335 */
   1336static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
   1337{
   1338	unsigned long sto, *ste, *pgt;
   1339	struct page *page;
   1340
   1341	BUG_ON(!gmap_is_shadow(sg));
   1342	ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
   1343	if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
   1344		return;
   1345	gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
   1346	sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
   1347	gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
   1348	pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
   1349	*ste = _SEGMENT_ENTRY_EMPTY;
   1350	__gmap_unshadow_pgt(sg, raddr, pgt);
   1351	/* Free page table */
   1352	page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
   1353	list_del(&page->lru);
   1354	page_table_free_pgste(page);
   1355}
   1356
   1357/**
   1358 * __gmap_unshadow_sgt - remove all entries from a shadow segment table
   1359 * @sg: pointer to the shadow guest address space structure
   1360 * @raddr: rmap address in the shadow guest address space
   1361 * @sgt: pointer to the start of a shadow segment table
   1362 *
   1363 * Called with the sg->guest_table_lock
   1364 */
   1365static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
   1366				unsigned long *sgt)
   1367{
   1368	unsigned long *pgt;
   1369	struct page *page;
   1370	int i;
   1371
   1372	BUG_ON(!gmap_is_shadow(sg));
   1373	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
   1374		if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
   1375			continue;
   1376		pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
   1377		sgt[i] = _SEGMENT_ENTRY_EMPTY;
   1378		__gmap_unshadow_pgt(sg, raddr, pgt);
   1379		/* Free page table */
   1380		page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
   1381		list_del(&page->lru);
   1382		page_table_free_pgste(page);
   1383	}
   1384}
   1385
   1386/**
   1387 * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
   1388 * @sg: pointer to the shadow guest address space structure
   1389 * @raddr: rmap address in the shadow guest address space
   1390 *
   1391 * Called with the shadow->guest_table_lock
   1392 */
   1393static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
   1394{
   1395	unsigned long r3o, *r3e, *sgt;
   1396	struct page *page;
   1397
   1398	BUG_ON(!gmap_is_shadow(sg));
   1399	r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
   1400	if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
   1401		return;
   1402	gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
   1403	r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
   1404	gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
   1405	sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
   1406	*r3e = _REGION3_ENTRY_EMPTY;
   1407	__gmap_unshadow_sgt(sg, raddr, sgt);
   1408	/* Free segment table */
   1409	page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
   1410	list_del(&page->lru);
   1411	__free_pages(page, CRST_ALLOC_ORDER);
   1412}
   1413
   1414/**
   1415 * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
   1416 * @sg: pointer to the shadow guest address space structure
   1417 * @raddr: address in the shadow guest address space
   1418 * @r3t: pointer to the start of a shadow region-3 table
   1419 *
   1420 * Called with the sg->guest_table_lock
   1421 */
   1422static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
   1423				unsigned long *r3t)
   1424{
   1425	unsigned long *sgt;
   1426	struct page *page;
   1427	int i;
   1428
   1429	BUG_ON(!gmap_is_shadow(sg));
   1430	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
   1431		if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
   1432			continue;
   1433		sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
   1434		r3t[i] = _REGION3_ENTRY_EMPTY;
   1435		__gmap_unshadow_sgt(sg, raddr, sgt);
   1436		/* Free segment table */
   1437		page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
   1438		list_del(&page->lru);
   1439		__free_pages(page, CRST_ALLOC_ORDER);
   1440	}
   1441}
   1442
   1443/**
   1444 * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
   1445 * @sg: pointer to the shadow guest address space structure
   1446 * @raddr: rmap address in the shadow guest address space
   1447 *
   1448 * Called with the sg->guest_table_lock
   1449 */
   1450static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
   1451{
   1452	unsigned long r2o, *r2e, *r3t;
   1453	struct page *page;
   1454
   1455	BUG_ON(!gmap_is_shadow(sg));
   1456	r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
   1457	if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
   1458		return;
   1459	gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
   1460	r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
   1461	gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
   1462	r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
   1463	*r2e = _REGION2_ENTRY_EMPTY;
   1464	__gmap_unshadow_r3t(sg, raddr, r3t);
   1465	/* Free region 3 table */
   1466	page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
   1467	list_del(&page->lru);
   1468	__free_pages(page, CRST_ALLOC_ORDER);
   1469}
   1470
   1471/**
   1472 * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
   1473 * @sg: pointer to the shadow guest address space structure
   1474 * @raddr: rmap address in the shadow guest address space
   1475 * @r2t: pointer to the start of a shadow region-2 table
   1476 *
   1477 * Called with the sg->guest_table_lock
   1478 */
   1479static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
   1480				unsigned long *r2t)
   1481{
   1482	unsigned long *r3t;
   1483	struct page *page;
   1484	int i;
   1485
   1486	BUG_ON(!gmap_is_shadow(sg));
   1487	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
   1488		if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
   1489			continue;
   1490		r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
   1491		r2t[i] = _REGION2_ENTRY_EMPTY;
   1492		__gmap_unshadow_r3t(sg, raddr, r3t);
   1493		/* Free region 3 table */
   1494		page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
   1495		list_del(&page->lru);
   1496		__free_pages(page, CRST_ALLOC_ORDER);
   1497	}
   1498}
   1499
   1500/**
   1501 * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
   1502 * @sg: pointer to the shadow guest address space structure
   1503 * @raddr: rmap address in the shadow guest address space
   1504 *
   1505 * Called with the sg->guest_table_lock
   1506 */
   1507static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
   1508{
   1509	unsigned long r1o, *r1e, *r2t;
   1510	struct page *page;
   1511
   1512	BUG_ON(!gmap_is_shadow(sg));
   1513	r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
   1514	if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
   1515		return;
   1516	gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
   1517	r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
   1518	gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
   1519	r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
   1520	*r1e = _REGION1_ENTRY_EMPTY;
   1521	__gmap_unshadow_r2t(sg, raddr, r2t);
   1522	/* Free region 2 table */
   1523	page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
   1524	list_del(&page->lru);
   1525	__free_pages(page, CRST_ALLOC_ORDER);
   1526}
   1527
   1528/**
   1529 * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
   1530 * @sg: pointer to the shadow guest address space structure
   1531 * @raddr: rmap address in the shadow guest address space
   1532 * @r1t: pointer to the start of a shadow region-1 table
   1533 *
   1534 * Called with the shadow->guest_table_lock
   1535 */
   1536static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
   1537				unsigned long *r1t)
   1538{
   1539	unsigned long asce, *r2t;
   1540	struct page *page;
   1541	int i;
   1542
   1543	BUG_ON(!gmap_is_shadow(sg));
   1544	asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
   1545	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
   1546		if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
   1547			continue;
   1548		r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
   1549		__gmap_unshadow_r2t(sg, raddr, r2t);
   1550		/* Clear entry and flush translation r1t -> r2t */
   1551		gmap_idte_one(asce, raddr);
   1552		r1t[i] = _REGION1_ENTRY_EMPTY;
   1553		/* Free region 2 table */
   1554		page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
   1555		list_del(&page->lru);
   1556		__free_pages(page, CRST_ALLOC_ORDER);
   1557	}
   1558}
   1559
   1560/**
   1561 * gmap_unshadow - remove a shadow page table completely
   1562 * @sg: pointer to the shadow guest address space structure
   1563 *
   1564 * Called with sg->guest_table_lock
   1565 */
   1566static void gmap_unshadow(struct gmap *sg)
   1567{
   1568	unsigned long *table;
   1569
   1570	BUG_ON(!gmap_is_shadow(sg));
   1571	if (sg->removed)
   1572		return;
   1573	sg->removed = 1;
   1574	gmap_call_notifier(sg, 0, -1UL);
   1575	gmap_flush_tlb(sg);
   1576	table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
   1577	switch (sg->asce & _ASCE_TYPE_MASK) {
   1578	case _ASCE_TYPE_REGION1:
   1579		__gmap_unshadow_r1t(sg, 0, table);
   1580		break;
   1581	case _ASCE_TYPE_REGION2:
   1582		__gmap_unshadow_r2t(sg, 0, table);
   1583		break;
   1584	case _ASCE_TYPE_REGION3:
   1585		__gmap_unshadow_r3t(sg, 0, table);
   1586		break;
   1587	case _ASCE_TYPE_SEGMENT:
   1588		__gmap_unshadow_sgt(sg, 0, table);
   1589		break;
   1590	}
   1591}
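
/*
 * Teardown always starts at the top level of the shadow gmap:
 * gmap_unshadow() flushes the TLB once, then dispatches on the ASCE type,
 * and each __gmap_unshadow_*() level clears its entries and frees the
 * next lower table, recursing down to the shadow page tables
 * (r1t -> r2t -> r3t -> sgt -> pgt).
 */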
   1592
   1593/**
   1594 * gmap_find_shadow - find a specific asce in the list of shadow tables
   1595 * @parent: pointer to the parent gmap
   1596 * @asce: ASCE for which the shadow table is created
   1597 * @edat_level: edat level to be used for the shadow translation
   1598 *
   1599 * Returns the pointer to a gmap if a shadow table with the given asce is
   1600 * already available, ERR_PTR(-EAGAIN) if another one is just being created,
   1601 * otherwise NULL
   1602 */
   1603static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
   1604				     int edat_level)
   1605{
   1606	struct gmap *sg;
   1607
   1608	list_for_each_entry(sg, &parent->children, list) {
   1609		if (sg->orig_asce != asce || sg->edat_level != edat_level ||
   1610		    sg->removed)
   1611			continue;
   1612		if (!sg->initialized)
   1613			return ERR_PTR(-EAGAIN);
   1614		refcount_inc(&sg->ref_count);
   1615		return sg;
   1616	}
   1617	return NULL;
   1618}
   1619
   1620/**
   1621 * gmap_shadow_valid - check if a shadow guest address space matches the
   1622 *                     given properties and is still valid
   1623 * @sg: pointer to the shadow guest address space structure
   1624 * @asce: ASCE for which the shadow table is requested
   1625 * @edat_level: edat level to be used for the shadow translation
   1626 *
   1627 * Returns 1 if the gmap shadow is still valid and matches the given
   1628 * properties; the caller can continue using it. Returns 0 otherwise; the
   1629 * caller has to request a new shadow gmap in this case.
   1630 *
   1631 */
   1632int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
   1633{
   1634	if (sg->removed)
   1635		return 0;
   1636	return sg->orig_asce == asce && sg->edat_level == edat_level;
   1637}
   1638EXPORT_SYMBOL_GPL(gmap_shadow_valid);
   1639
   1640/**
   1641 * gmap_shadow - create/find a shadow guest address space
   1642 * @parent: pointer to the parent gmap
   1643 * @asce: ASCE for which the shadow table is created
   1644 * @edat_level: edat level to be used for the shadow translation
   1645 *
   1646 * The pages of the top level page table referred by the asce parameter
   1647 * will be set to read-only and marked in the PGSTEs of the kvm process.
   1648 * The shadow table will be removed automatically on any change to the
   1649 * PTE mapping for the source table.
   1650 *
   1651 * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
   1652 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
   1653 * parent gmap table could not be protected.
   1654 */
   1655struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
   1656			 int edat_level)
   1657{
   1658	struct gmap *sg, *new;
   1659	unsigned long limit;
   1660	int rc;
   1661
   1662	BUG_ON(parent->mm->context.allow_gmap_hpage_1m);
   1663	BUG_ON(gmap_is_shadow(parent));
   1664	spin_lock(&parent->shadow_lock);
   1665	sg = gmap_find_shadow(parent, asce, edat_level);
   1666	spin_unlock(&parent->shadow_lock);
   1667	if (sg)
   1668		return sg;
   1669	/* Create a new shadow gmap */
   1670	limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
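	/*
	 * The shift above maps the ASCE designation-type field (0..3) to the
	 * largest address the shadow can cover: 2 GB - 1 for a segment-table
	 * ASCE, 4 TB - 1 for region-3, 8 PB - 1 for region-2 and -1UL for
	 * region-1.
	 */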
   1671	if (asce & _ASCE_REAL_SPACE)
   1672		limit = -1UL;
   1673	new = gmap_alloc(limit);
   1674	if (!new)
   1675		return ERR_PTR(-ENOMEM);
   1676	new->mm = parent->mm;
   1677	new->parent = gmap_get(parent);
   1678	new->orig_asce = asce;
   1679	new->edat_level = edat_level;
   1680	new->initialized = false;
   1681	spin_lock(&parent->shadow_lock);
   1682	/* Recheck if another CPU created the same shadow */
   1683	sg = gmap_find_shadow(parent, asce, edat_level);
   1684	if (sg) {
   1685		spin_unlock(&parent->shadow_lock);
   1686		gmap_free(new);
   1687		return sg;
   1688	}
   1689	if (asce & _ASCE_REAL_SPACE) {
   1690		/* only allow one real-space gmap shadow */
   1691		list_for_each_entry(sg, &parent->children, list) {
   1692			if (sg->orig_asce & _ASCE_REAL_SPACE) {
   1693				spin_lock(&sg->guest_table_lock);
   1694				gmap_unshadow(sg);
   1695				spin_unlock(&sg->guest_table_lock);
   1696				list_del(&sg->list);
   1697				gmap_put(sg);
   1698				break;
   1699			}
   1700		}
   1701	}
   1702	refcount_set(&new->ref_count, 2);
   1703	list_add(&new->list, &parent->children);
   1704	if (asce & _ASCE_REAL_SPACE) {
   1705		/* nothing to protect, return right away */
   1706		new->initialized = true;
   1707		spin_unlock(&parent->shadow_lock);
   1708		return new;
   1709	}
   1710	spin_unlock(&parent->shadow_lock);
   1711	/* protect after insertion, so it will get properly invalidated */
   1712	mmap_read_lock(parent->mm);
   1713	rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
   1714				((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
   1715				PROT_READ, GMAP_NOTIFY_SHADOW);
   1716	mmap_read_unlock(parent->mm);
   1717	spin_lock(&parent->shadow_lock);
   1718	new->initialized = true;
   1719	if (rc) {
   1720		list_del(&new->list);
   1721		gmap_free(new);
   1722		new = ERR_PTR(rc);
   1723	}
   1724	spin_unlock(&parent->shadow_lock);
   1725	return new;
   1726}
   1727EXPORT_SYMBOL_GPL(gmap_shadow);
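
/*
 * Illustrative sketch of a caller (cached_sg, parent_gmap, asce and
 * edat_level are placeholder names, not taken from this file): a nested
 * virtualization user would typically revalidate a cached shadow before
 * reusing it and create a new one otherwise:
 *
 *	if (!cached_sg || !gmap_shadow_valid(cached_sg, asce, edat_level)) {
 *		sg = gmap_shadow(parent_gmap, asce, edat_level);
 *		if (IS_ERR(sg))
 *			return PTR_ERR(sg);  // -ENOMEM, -EAGAIN or -EFAULT
 *		cached_sg = sg;
 *	}
 */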
   1728
   1729/**
   1730 * gmap_shadow_r2t - create an empty shadow region 2 table
   1731 * @sg: pointer to the shadow guest address space structure
   1732 * @saddr: faulting address in the shadow gmap
   1733 * @r2t: parent gmap address of the region 2 table to get shadowed
   1734 * @fake: r2t references contiguous guest memory block, not a r2t
   1735 *
   1736 * The r2t parameter specifies the address of the source table. The
   1737 * four pages of the source table are made read-only in the parent gmap
   1738 * address space. A write to the source table area @r2t will automatically
   1739 * remove the shadow r2 table and all of its descendants.
   1740 *
   1741 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
   1742 * shadow table structure is incomplete, -ENOMEM if out of memory and
   1743 * -EFAULT if an address in the parent gmap could not be resolved.
   1744 *
   1745 * Called with sg->mm->mmap_lock in read.
   1746 */
   1747int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
   1748		    int fake)
   1749{
   1750	unsigned long raddr, origin, offset, len;
   1751	unsigned long *s_r2t, *table;
   1752	struct page *page;
   1753	int rc;
   1754
   1755	BUG_ON(!gmap_is_shadow(sg));
   1756	/* Allocate a shadow region second table */
   1757	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
   1758	if (!page)
   1759		return -ENOMEM;
   1760	page->index = r2t & _REGION_ENTRY_ORIGIN;
   1761	if (fake)
   1762		page->index |= GMAP_SHADOW_FAKE_TABLE;
   1763	s_r2t = (unsigned long *) page_to_phys(page);
   1764	/* Install shadow region second table */
   1765	spin_lock(&sg->guest_table_lock);
   1766	table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
   1767	if (!table) {
   1768		rc = -EAGAIN;		/* Race with unshadow */
   1769		goto out_free;
   1770	}
   1771	if (!(*table & _REGION_ENTRY_INVALID)) {
   1772		rc = 0;			/* Already established */
   1773		goto out_free;
   1774	} else if (*table & _REGION_ENTRY_ORIGIN) {
   1775		rc = -EAGAIN;		/* Race with shadow */
   1776		goto out_free;
   1777	}
   1778	crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
   1779	/* mark as invalid as long as the parent table is not protected */
   1780	*table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
   1781		 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
   1782	if (sg->edat_level >= 1)
   1783		*table |= (r2t & _REGION_ENTRY_PROTECT);
   1784	list_add(&page->lru, &sg->crst_list);
   1785	if (fake) {
   1786		/* nothing to protect for fake tables */
   1787		*table &= ~_REGION_ENTRY_INVALID;
   1788		spin_unlock(&sg->guest_table_lock);
   1789		return 0;
   1790	}
   1791	spin_unlock(&sg->guest_table_lock);
   1792	/* Make r2t read-only in parent gmap page table */
   1793	raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
   1794	origin = r2t & _REGION_ENTRY_ORIGIN;
   1795	offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
   1796	len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
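	/*
	 * offset/len are derived from the table-offset and table-length
	 * fields of the parent region entry; each unit is one 4 KB page of
	 * the four-page (2048 entry) table, so a full table (offset 0,
	 * length 3) protects all four pages.
	 */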
   1797	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
   1798	spin_lock(&sg->guest_table_lock);
   1799	if (!rc) {
   1800		table = gmap_table_walk(sg, saddr, 4);
   1801		if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
   1802			      (unsigned long) s_r2t)
   1803			rc = -EAGAIN;		/* Race with unshadow */
   1804		else
   1805			*table &= ~_REGION_ENTRY_INVALID;
   1806	} else {
   1807		gmap_unshadow_r2t(sg, raddr);
   1808	}
   1809	spin_unlock(&sg->guest_table_lock);
   1810	return rc;
   1811out_free:
   1812	spin_unlock(&sg->guest_table_lock);
   1813	__free_pages(page, CRST_ALLOC_ORDER);
   1814	return rc;
   1815}
   1816EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
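
/*
 * gmap_shadow_r3t(), gmap_shadow_sgt() and gmap_shadow_pgt() below follow
 * the same pattern as gmap_shadow_r2t() above:
 *  1. allocate the shadow table and remember the parent origin (plus
 *     GMAP_SHADOW_FAKE_TABLE for fake tables) in page->index,
 *  2. hook it into the next higher shadow table, still marked invalid,
 *  3. for non-fake tables make the parent table read-only with
 *     gmap_protect_rmap() and only then clear the invalid bit,
 *  4. on failure unshadow the partially installed table again.
 */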
   1817
   1818/**
   1819 * gmap_shadow_r3t - create a shadow region 3 table
   1820 * @sg: pointer to the shadow guest address space structure
   1821 * @saddr: faulting address in the shadow gmap
   1822 * @r3t: parent gmap address of the region 3 table to get shadowed
   1823 * @fake: r3t references contiguous guest memory block, not a r3t
   1824 *
   1825 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
   1826 * shadow table structure is incomplete, -ENOMEM if out of memory and
   1827 * -EFAULT if an address in the parent gmap could not be resolved.
   1828 *
   1829 * Called with sg->mm->mmap_lock in read.
   1830 */
   1831int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
   1832		    int fake)
   1833{
   1834	unsigned long raddr, origin, offset, len;
   1835	unsigned long *s_r3t, *table;
   1836	struct page *page;
   1837	int rc;
   1838
   1839	BUG_ON(!gmap_is_shadow(sg));
   1840	/* Allocate a shadow region third table */
   1841	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
   1842	if (!page)
   1843		return -ENOMEM;
   1844	page->index = r3t & _REGION_ENTRY_ORIGIN;
   1845	if (fake)
   1846		page->index |= GMAP_SHADOW_FAKE_TABLE;
   1847	s_r3t = (unsigned long *) page_to_phys(page);
   1848	/* Install shadow region third table */
   1849	spin_lock(&sg->guest_table_lock);
   1850	table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
   1851	if (!table) {
   1852		rc = -EAGAIN;		/* Race with unshadow */
   1853		goto out_free;
   1854	}
   1855	if (!(*table & _REGION_ENTRY_INVALID)) {
   1856		rc = 0;			/* Already established */
   1857		goto out_free;
   1858	} else if (*table & _REGION_ENTRY_ORIGIN) {
   1859		rc = -EAGAIN;		/* Race with shadow */
   1860		goto out_free;
   1861	}
   1862	crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
   1863	/* mark as invalid as long as the parent table is not protected */
   1864	*table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
   1865		 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
   1866	if (sg->edat_level >= 1)
   1867		*table |= (r3t & _REGION_ENTRY_PROTECT);
   1868	list_add(&page->lru, &sg->crst_list);
   1869	if (fake) {
   1870		/* nothing to protect for fake tables */
   1871		*table &= ~_REGION_ENTRY_INVALID;
   1872		spin_unlock(&sg->guest_table_lock);
   1873		return 0;
   1874	}
   1875	spin_unlock(&sg->guest_table_lock);
   1876	/* Make r3t read-only in parent gmap page table */
   1877	raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
   1878	origin = r3t & _REGION_ENTRY_ORIGIN;
   1879	offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
   1880	len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
   1881	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
   1882	spin_lock(&sg->guest_table_lock);
   1883	if (!rc) {
   1884		table = gmap_table_walk(sg, saddr, 3);
   1885		if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
   1886			      (unsigned long) s_r3t)
   1887			rc = -EAGAIN;		/* Race with unshadow */
   1888		else
   1889			*table &= ~_REGION_ENTRY_INVALID;
   1890	} else {
   1891		gmap_unshadow_r3t(sg, raddr);
   1892	}
   1893	spin_unlock(&sg->guest_table_lock);
   1894	return rc;
   1895out_free:
   1896	spin_unlock(&sg->guest_table_lock);
   1897	__free_pages(page, CRST_ALLOC_ORDER);
   1898	return rc;
   1899}
   1900EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
   1901
   1902/**
   1903 * gmap_shadow_sgt - create a shadow segment table
   1904 * @sg: pointer to the shadow guest address space structure
   1905 * @saddr: faulting address in the shadow gmap
   1906 * @sgt: parent gmap address of the segment table to get shadowed
   1907 * @fake: sgt references contiguous guest memory block, not a sgt
   1908 *
   1909 * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
   1910 * shadow table structure is incomplete, -ENOMEM if out of memory and
   1911 * -EFAULT if an address in the parent gmap could not be resolved.
   1912 *
   1913 * Called with sg->mm->mmap_lock in read.
   1914 */
   1915int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
   1916		    int fake)
   1917{
   1918	unsigned long raddr, origin, offset, len;
   1919	unsigned long *s_sgt, *table;
   1920	struct page *page;
   1921	int rc;
   1922
   1923	BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
   1924	/* Allocate a shadow segment table */
   1925	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
   1926	if (!page)
   1927		return -ENOMEM;
   1928	page->index = sgt & _REGION_ENTRY_ORIGIN;
   1929	if (fake)
   1930		page->index |= GMAP_SHADOW_FAKE_TABLE;
   1931	s_sgt = (unsigned long *) page_to_phys(page);
   1932	/* Install shadow segment table */
   1933	spin_lock(&sg->guest_table_lock);
   1934	table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
   1935	if (!table) {
   1936		rc = -EAGAIN;		/* Race with unshadow */
   1937		goto out_free;
   1938	}
   1939	if (!(*table & _REGION_ENTRY_INVALID)) {
   1940		rc = 0;			/* Already established */
   1941		goto out_free;
   1942	} else if (*table & _REGION_ENTRY_ORIGIN) {
   1943		rc = -EAGAIN;		/* Race with shadow */
   1944		goto out_free;
   1945	}
   1946	crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
   1947	/* mark as invalid as long as the parent table is not protected */
   1948	*table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
   1949		 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
   1950	if (sg->edat_level >= 1)
   1951		*table |= sgt & _REGION_ENTRY_PROTECT;
   1952	list_add(&page->lru, &sg->crst_list);
   1953	if (fake) {
   1954		/* nothing to protect for fake tables */
   1955		*table &= ~_REGION_ENTRY_INVALID;
   1956		spin_unlock(&sg->guest_table_lock);
   1957		return 0;
   1958	}
   1959	spin_unlock(&sg->guest_table_lock);
   1960	/* Make sgt read-only in parent gmap page table */
   1961	raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
   1962	origin = sgt & _REGION_ENTRY_ORIGIN;
   1963	offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
   1964	len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
   1965	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
   1966	spin_lock(&sg->guest_table_lock);
   1967	if (!rc) {
   1968		table = gmap_table_walk(sg, saddr, 2);
   1969		if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
   1970			      (unsigned long) s_sgt)
   1971			rc = -EAGAIN;		/* Race with unshadow */
   1972		else
   1973			*table &= ~_REGION_ENTRY_INVALID;
   1974	} else {
   1975		gmap_unshadow_sgt(sg, raddr);
   1976	}
   1977	spin_unlock(&sg->guest_table_lock);
   1978	return rc;
   1979out_free:
   1980	spin_unlock(&sg->guest_table_lock);
   1981	__free_pages(page, CRST_ALLOC_ORDER);
   1982	return rc;
   1983}
   1984EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
   1985
   1986/**
   1987 * gmap_shadow_pgt_lookup - find a shadow page table
   1988 * @sg: pointer to the shadow guest address space structure
   1989 * @saddr: the address in the shadow guest address space
   1990 * @pgt: parent gmap address of the page table to get shadowed
   1991 * @dat_protection: set if the pgtable is marked as protected by DAT
   1992 * @fake: pgt references contiguous guest memory block, not a pgtable
   1993 *
   1994 * Returns 0 if the shadow page table was found and -EAGAIN if the page
   1995 * table was not found.
   1996 *
   1997 * Called with sg->mm->mmap_lock in read.
   1998 */
   1999int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
   2000			   unsigned long *pgt, int *dat_protection,
   2001			   int *fake)
   2002{
   2003	unsigned long *table;
   2004	struct page *page;
   2005	int rc;
   2006
   2007	BUG_ON(!gmap_is_shadow(sg));
   2008	spin_lock(&sg->guest_table_lock);
   2009	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
   2010	if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
   2011		/* Shadow page tables are full pages (pte+pgste) */
   2012		page = pfn_to_page(*table >> PAGE_SHIFT);
   2013		*pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
   2014		*dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
   2015		*fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
   2016		rc = 0;
   2017	} else  {
   2018		rc = -EAGAIN;
   2019	}
   2020	spin_unlock(&sg->guest_table_lock);
   2021	return rc;
   2022
   2023}
   2024EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
   2025
   2026/**
   2027 * gmap_shadow_pgt - instantiate a shadow page table
   2028 * @sg: pointer to the shadow guest address space structure
   2029 * @saddr: faulting address in the shadow gmap
   2030 * @pgt: parent gmap address of the page table to get shadowed
   2031 * @fake: pgt references contiguous guest memory block, not a pgtable
   2032 *
   2033 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
   2034 * shadow table structure is incomplete, -ENOMEM if out of memory and
   2035 * -EFAULT if an address in the parent gmap could not be resolved.
   2036 *
   2037 * Called with sg->mm->mmap_lock in read.
   2038 */
   2039int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
   2040		    int fake)
   2041{
   2042	unsigned long raddr, origin;
   2043	unsigned long *s_pgt, *table;
   2044	struct page *page;
   2045	int rc;
   2046
   2047	BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
   2048	/* Allocate a shadow page table */
   2049	page = page_table_alloc_pgste(sg->mm);
   2050	if (!page)
   2051		return -ENOMEM;
   2052	page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
   2053	if (fake)
   2054		page->index |= GMAP_SHADOW_FAKE_TABLE;
   2055	s_pgt = (unsigned long *) page_to_phys(page);
   2056	/* Install shadow page table */
   2057	spin_lock(&sg->guest_table_lock);
   2058	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
   2059	if (!table) {
   2060		rc = -EAGAIN;		/* Race with unshadow */
   2061		goto out_free;
   2062	}
   2063	if (!(*table & _SEGMENT_ENTRY_INVALID)) {
   2064		rc = 0;			/* Already established */
   2065		goto out_free;
   2066	} else if (*table & _SEGMENT_ENTRY_ORIGIN) {
   2067		rc = -EAGAIN;		/* Race with shadow */
   2068		goto out_free;
   2069	}
   2070	/* mark as invalid as long as the parent table is not protected */
   2071	*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
   2072		 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
   2073	list_add(&page->lru, &sg->pt_list);
   2074	if (fake) {
   2075		/* nothing to protect for fake tables */
   2076		*table &= ~_SEGMENT_ENTRY_INVALID;
   2077		spin_unlock(&sg->guest_table_lock);
   2078		return 0;
   2079	}
   2080	spin_unlock(&sg->guest_table_lock);
   2081	/* Make pgt read-only in parent gmap page table (not the pgste) */
   2082	raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
   2083	origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
   2084	rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
   2085	spin_lock(&sg->guest_table_lock);
   2086	if (!rc) {
   2087		table = gmap_table_walk(sg, saddr, 1);
   2088		if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
   2089			      (unsigned long) s_pgt)
   2090			rc = -EAGAIN;		/* Race with unshadow */
   2091		else
   2092			*table &= ~_SEGMENT_ENTRY_INVALID;
   2093	} else {
   2094		gmap_unshadow_pgt(sg, raddr);
   2095	}
   2096	spin_unlock(&sg->guest_table_lock);
   2097	return rc;
   2098out_free:
   2099	spin_unlock(&sg->guest_table_lock);
   2100	page_table_free_pgste(page);
   2101	return rc;
   2102
   2103}
   2104EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
   2105
   2106/**
   2107 * gmap_shadow_page - create a shadow page mapping
   2108 * @sg: pointer to the shadow guest address space structure
   2109 * @saddr: faulting address in the shadow gmap
   2110 * @pte: pte in parent gmap address space to get shadowed
   2111 *
   2112 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
   2113 * shadow table structure is incomplete, -ENOMEM if out of memory and
   2114 * -EFAULT if an address in the parent gmap could not be resolved.
   2115 *
   2116 * Called with sg->mm->mmap_lock in read.
   2117 */
   2118int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
   2119{
   2120	struct gmap *parent;
   2121	struct gmap_rmap *rmap;
   2122	unsigned long vmaddr, paddr;
   2123	spinlock_t *ptl;
   2124	pte_t *sptep, *tptep;
   2125	int prot;
   2126	int rc;
   2127
   2128	BUG_ON(!gmap_is_shadow(sg));
   2129	parent = sg->parent;
   2130	prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
   2131
   2132	rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
   2133	if (!rmap)
   2134		return -ENOMEM;
   2135	rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
   2136
   2137	while (1) {
   2138		paddr = pte_val(pte) & PAGE_MASK;
   2139		vmaddr = __gmap_translate(parent, paddr);
   2140		if (IS_ERR_VALUE(vmaddr)) {
   2141			rc = vmaddr;
   2142			break;
   2143		}
   2144		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
   2145		if (rc)
   2146			break;
   2147		rc = -EAGAIN;
   2148		sptep = gmap_pte_op_walk(parent, paddr, &ptl);
   2149		if (sptep) {
   2150			spin_lock(&sg->guest_table_lock);
   2151			/* Get page table pointer */
   2152			tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
   2153			if (!tptep) {
   2154				spin_unlock(&sg->guest_table_lock);
   2155				gmap_pte_op_end(ptl);
   2156				radix_tree_preload_end();
   2157				break;
   2158			}
   2159			rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
   2160			if (rc > 0) {
   2161				/* Success and a new mapping */
   2162				gmap_insert_rmap(sg, vmaddr, rmap);
   2163				rmap = NULL;
   2164				rc = 0;
   2165			}
   2166			gmap_pte_op_end(ptl);
   2167			spin_unlock(&sg->guest_table_lock);
   2168		}
   2169		radix_tree_preload_end();
   2170		if (!rc)
   2171			break;
   2172		rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
   2173		if (rc)
   2174			break;
   2175	}
   2176	kfree(rmap);
   2177	return rc;
   2178}
   2179EXPORT_SYMBOL_GPL(gmap_shadow_page);
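
/*
 * Illustrative sketch of how the helpers above fit together (the actual
 * fault handling lives in the KVM code, not in this file; pgt, dat_prot,
 * fake and pte are placeholders): a fault at saddr in the shadow gmap is
 * resolved by mirroring the missing levels of the guest's own DAT tables
 * and finally shadowing the page itself:
 *
 *	rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_prot, &fake);
 *	if (rc == -EAGAIN) {
 *		// walk the guest tables in the parent gmap and call
 *		// gmap_shadow_r2t()/_r3t()/_sgt()/_pgt() for missing levels
 *	}
 *	rc = gmap_shadow_page(sg, saddr, pte);	// pte read from the guest pgt
 *
 * All of these return -EAGAIN when they race with (un)shadowing, so a
 * caller is expected to retry.
 */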
   2180
   2181/*
   2182 * gmap_shadow_notify - handle notifications for shadow gmap
   2183 *
   2184 * Called with sg->parent->shadow_lock held.
   2185 */
   2186static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
   2187			       unsigned long gaddr)
   2188{
   2189	struct gmap_rmap *rmap, *rnext, *head;
   2190	unsigned long start, end, bits, raddr;
   2191
   2192	BUG_ON(!gmap_is_shadow(sg));
   2193
   2194	spin_lock(&sg->guest_table_lock);
   2195	if (sg->removed) {
   2196		spin_unlock(&sg->guest_table_lock);
   2197		return;
   2198	}
   2199	/* Check for top level table */
   2200	start = sg->orig_asce & _ASCE_ORIGIN;
   2201	end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
   2202	if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
   2203	    gaddr < end) {
   2204		/* The complete shadow table has to go */
   2205		gmap_unshadow(sg);
   2206		spin_unlock(&sg->guest_table_lock);
   2207		list_del(&sg->list);
   2208		gmap_put(sg);
   2209		return;
   2210	}
   2211	/* Remove the page table tree from one specific entry */
   2212	head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
   2213	gmap_for_each_rmap_safe(rmap, rnext, head) {
   2214		bits = rmap->raddr & _SHADOW_RMAP_MASK;
   2215		raddr = rmap->raddr ^ bits;
   2216		switch (bits) {
   2217		case _SHADOW_RMAP_REGION1:
   2218			gmap_unshadow_r2t(sg, raddr);
   2219			break;
   2220		case _SHADOW_RMAP_REGION2:
   2221			gmap_unshadow_r3t(sg, raddr);
   2222			break;
   2223		case _SHADOW_RMAP_REGION3:
   2224			gmap_unshadow_sgt(sg, raddr);
   2225			break;
   2226		case _SHADOW_RMAP_SEGMENT:
   2227			gmap_unshadow_pgt(sg, raddr);
   2228			break;
   2229		case _SHADOW_RMAP_PGTABLE:
   2230			gmap_unshadow_page(sg, raddr);
   2231			break;
   2232		}
   2233		kfree(rmap);
   2234	}
   2235	spin_unlock(&sg->guest_table_lock);
   2236}
   2237
   2238/**
   2239 * ptep_notify - call all invalidation callbacks for a specific pte.
   2240 * @mm: pointer to the process mm_struct
   2241 * @vmaddr: virtual address in the process address space
   2242 * @pte: pointer to the page table entry
   2243 * @bits: bits from the pgste that caused the notify call
   2244 *
   2245 * This function is assumed to be called with the page table lock held
   2246 * for the pte to notify.
   2247 */
   2248void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
   2249		 pte_t *pte, unsigned long bits)
   2250{
   2251	unsigned long offset, gaddr = 0;
   2252	unsigned long *table;
   2253	struct gmap *gmap, *sg, *next;
   2254
   2255	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
   2256	offset = offset * (PAGE_SIZE / sizeof(pte_t));
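	/*
	 * A pte sits in a 2 KB page table of 256 entries, so the first line
	 * extracts the byte offset of the entry (index * 8).  Multiplying by
	 * PAGE_SIZE / sizeof(pte_t) (= 512) converts that into the guest
	 * address offset of the page within its 1 MB segment, e.g. entry 5
	 * -> byte offset 40 -> 40 * 512 == 5 * PAGE_SIZE.
	 */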
   2257	rcu_read_lock();
   2258	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
   2259		spin_lock(&gmap->guest_table_lock);
   2260		table = radix_tree_lookup(&gmap->host_to_guest,
   2261					  vmaddr >> PMD_SHIFT);
   2262		if (table)
   2263			gaddr = __gmap_segment_gaddr(table) + offset;
   2264		spin_unlock(&gmap->guest_table_lock);
   2265		if (!table)
   2266			continue;
   2267
   2268		if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
   2269			spin_lock(&gmap->shadow_lock);
   2270			list_for_each_entry_safe(sg, next,
   2271						 &gmap->children, list)
   2272				gmap_shadow_notify(sg, vmaddr, gaddr);
   2273			spin_unlock(&gmap->shadow_lock);
   2274		}
   2275		if (bits & PGSTE_IN_BIT)
   2276			gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
   2277	}
   2278	rcu_read_unlock();
   2279}
   2280EXPORT_SYMBOL_GPL(ptep_notify);
   2281
   2282static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
   2283			     unsigned long gaddr)
   2284{
   2285	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
   2286	gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
   2287}
   2288
   2289/**
   2290 * gmap_pmdp_xchg - exchange a gmap pmd with another
   2291 * @gmap: pointer to the guest address space structure
   2292 * @pmdp: pointer to the pmd entry
   2293 * @new: replacement entry
   2294 * @gaddr: the affected guest address
   2295 *
   2296 * This function is assumed to be called with the guest_table_lock
   2297 * held.
   2298 */
   2299static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
   2300			   unsigned long gaddr)
   2301{
   2302	gaddr &= HPAGE_MASK;
   2303	pmdp_notify_gmap(gmap, pmdp, gaddr);
   2304	new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
   2305	if (MACHINE_HAS_TLB_GUEST)
   2306		__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
   2307			    IDTE_GLOBAL);
   2308	else if (MACHINE_HAS_IDTE)
   2309		__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
   2310	else
   2311		__pmdp_csp(pmdp);
   2312	set_pmd(pmdp, new);
   2313}
   2314
   2315static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
   2316			    int purge)
   2317{
   2318	pmd_t *pmdp;
   2319	struct gmap *gmap;
   2320	unsigned long gaddr;
   2321
   2322	rcu_read_lock();
   2323	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
   2324		spin_lock(&gmap->guest_table_lock);
   2325		pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest,
   2326						  vmaddr >> PMD_SHIFT);
   2327		if (pmdp) {
   2328			gaddr = __gmap_segment_gaddr((unsigned long *)pmdp);
   2329			pmdp_notify_gmap(gmap, pmdp, gaddr);
   2330			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
   2331						   _SEGMENT_ENTRY_GMAP_UC));
   2332			if (purge)
   2333				__pmdp_csp(pmdp);
   2334			set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
   2335		}
   2336		spin_unlock(&gmap->guest_table_lock);
   2337	}
   2338	rcu_read_unlock();
   2339}
   2340
   2341/**
   2342 * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
   2343 *                        flushing
   2344 * @mm: pointer to the process mm_struct
   2345 * @vmaddr: virtual address in the process address space
   2346 */
   2347void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
   2348{
   2349	gmap_pmdp_clear(mm, vmaddr, 0);
   2350}
   2351EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
   2352
   2353/**
   2354 * gmap_pmdp_csp - csp all affected guest pmd entries
   2355 * @mm: pointer to the process mm_struct
   2356 * @vmaddr: virtual address in the process address space
   2357 */
   2358void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
   2359{
   2360	gmap_pmdp_clear(mm, vmaddr, 1);
   2361}
   2362EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
   2363
   2364/**
   2365 * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
   2366 * @mm: pointer to the process mm_struct
   2367 * @vmaddr: virtual address in the process address space
   2368 */
   2369void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
   2370{
   2371	unsigned long *entry, gaddr;
   2372	struct gmap *gmap;
   2373	pmd_t *pmdp;
   2374
   2375	rcu_read_lock();
   2376	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
   2377		spin_lock(&gmap->guest_table_lock);
   2378		entry = radix_tree_delete(&gmap->host_to_guest,
   2379					  vmaddr >> PMD_SHIFT);
   2380		if (entry) {
   2381			pmdp = (pmd_t *)entry;
   2382			gaddr = __gmap_segment_gaddr(entry);
   2383			pmdp_notify_gmap(gmap, pmdp, gaddr);
   2384			WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
   2385					   _SEGMENT_ENTRY_GMAP_UC));
   2386			if (MACHINE_HAS_TLB_GUEST)
   2387				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
   2388					    gmap->asce, IDTE_LOCAL);
   2389			else if (MACHINE_HAS_IDTE)
   2390				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
   2391			*entry = _SEGMENT_ENTRY_EMPTY;
   2392		}
   2393		spin_unlock(&gmap->guest_table_lock);
   2394	}
   2395	rcu_read_unlock();
   2396}
   2397EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
   2398
   2399/**
   2400 * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
   2401 * @mm: pointer to the process mm_struct
   2402 * @vmaddr: virtual address in the process address space
   2403 */
   2404void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
   2405{
   2406	unsigned long *entry, gaddr;
   2407	struct gmap *gmap;
   2408	pmd_t *pmdp;
   2409
   2410	rcu_read_lock();
   2411	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
   2412		spin_lock(&gmap->guest_table_lock);
   2413		entry = radix_tree_delete(&gmap->host_to_guest,
   2414					  vmaddr >> PMD_SHIFT);
   2415		if (entry) {
   2416			pmdp = (pmd_t *)entry;
   2417			gaddr = __gmap_segment_gaddr(entry);
   2418			pmdp_notify_gmap(gmap, pmdp, gaddr);
   2419			WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
   2420					   _SEGMENT_ENTRY_GMAP_UC));
   2421			if (MACHINE_HAS_TLB_GUEST)
   2422				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
   2423					    gmap->asce, IDTE_GLOBAL);
   2424			else if (MACHINE_HAS_IDTE)
   2425				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
   2426			else
   2427				__pmdp_csp(pmdp);
   2428			*entry = _SEGMENT_ENTRY_EMPTY;
   2429		}
   2430		spin_unlock(&gmap->guest_table_lock);
   2431	}
   2432	rcu_read_unlock();
   2433}
   2434EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
   2435
   2436/**
   2437 * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
   2438 * @gmap: pointer to guest address space
   2439 * @pmdp: pointer to the pmd to be tested
   2440 * @gaddr: virtual address in the guest address space
   2441 *
   2442 * This function is assumed to be called with the guest_table_lock
   2443 * held.
   2444 */
   2445static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
   2446					  unsigned long gaddr)
   2447{
   2448	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
   2449		return false;
   2450
   2451	/* Memory that is already protected and did not change is clean */
   2452	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
   2453	    !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
   2454		return false;
   2455
   2456	/* Clear UC indication and reset protection */
   2457	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
   2458	gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
   2459	return true;
   2460}
   2461
   2462/**
   2463 * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
   2464 * @gmap: pointer to guest address space
   2465 * @bitmap: dirty bitmap for this pmd
   2466 * @gaddr: virtual address in the guest address space
   2467 * @vmaddr: virtual address in the host address space
   2468 *
   2469 * This function is assumed to be called with the guest_table_lock
   2470 * held.
   2471 */
   2472void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
   2473			     unsigned long gaddr, unsigned long vmaddr)
   2474{
   2475	int i;
   2476	pmd_t *pmdp;
   2477	pte_t *ptep;
   2478	spinlock_t *ptl;
   2479
   2480	pmdp = gmap_pmd_op_walk(gmap, gaddr);
   2481	if (!pmdp)
   2482		return;
   2483
   2484	if (pmd_large(*pmdp)) {
   2485		if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
   2486			bitmap_fill(bitmap, _PAGE_ENTRIES);
   2487	} else {
   2488		for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
   2489			ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
   2490			if (!ptep)
   2491				continue;
   2492			if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
   2493				set_bit(i, bitmap);
   2494			spin_unlock(ptl);
   2495		}
   2496	}
   2497	gmap_pmd_op_end(gmap, pmdp);
   2498}
   2499EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
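
/*
 * Illustrative sketch only (collecting the log and reporting it to
 * userspace happen in KVM; "i" and mark_dirty() are placeholders): the
 * bitmap argument covers the _PAGE_ENTRIES == 256 pages of one 1 MB
 * segment, i.e. four unsigned longs:
 *
 *	unsigned long bitmap[4] = { 0 };
 *	int i;
 *
 *	gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr);
 *	for_each_set_bit(i, bitmap, _PAGE_ENTRIES)
 *		mark_dirty(gaddr + i * PAGE_SIZE);
 */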
   2500
   2501#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   2502static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
   2503				    unsigned long end, struct mm_walk *walk)
   2504{
   2505	struct vm_area_struct *vma = walk->vma;
   2506
   2507	split_huge_pmd(vma, pmd, addr);
   2508	return 0;
   2509}
   2510
   2511static const struct mm_walk_ops thp_split_walk_ops = {
   2512	.pmd_entry	= thp_split_walk_pmd_entry,
   2513};
   2514
   2515static inline void thp_split_mm(struct mm_struct *mm)
   2516{
   2517	struct vm_area_struct *vma;
   2518
   2519	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
   2520		vma->vm_flags &= ~VM_HUGEPAGE;
   2521		vma->vm_flags |= VM_NOHUGEPAGE;
   2522		walk_page_vma(vma, &thp_split_walk_ops, NULL);
   2523	}
   2524	mm->def_flags |= VM_NOHUGEPAGE;
   2525}
   2526#else
   2527static inline void thp_split_mm(struct mm_struct *mm)
   2528{
   2529}
   2530#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
   2531
   2532/*
   2533 * Remove all empty zero pages from the mapping for lazy refaulting
   2534 * - This must be called after mm->context.has_pgste is set, to avoid
   2535 *   future creation of zero pages
   2536 * - This must be called after THP was enabled
   2537 */
   2538static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
   2539			   unsigned long end, struct mm_walk *walk)
   2540{
   2541	unsigned long addr;
   2542
   2543	for (addr = start; addr != end; addr += PAGE_SIZE) {
   2544		pte_t *ptep;
   2545		spinlock_t *ptl;
   2546
   2547		ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
   2548		if (is_zero_pfn(pte_pfn(*ptep)))
   2549			ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
   2550		pte_unmap_unlock(ptep, ptl);
   2551	}
   2552	return 0;
   2553}
   2554
   2555static const struct mm_walk_ops zap_zero_walk_ops = {
   2556	.pmd_entry	= __zap_zero_pages,
   2557};
   2558
   2559/*
   2560 * switch on pgstes for its userspace process (for kvm)
   2561 */
   2562int s390_enable_sie(void)
   2563{
   2564	struct mm_struct *mm = current->mm;
   2565
   2566	/* Do we have pgstes? if yes, we are done */
   2567	if (mm_has_pgste(mm))
   2568		return 0;
   2569	/* Fail if the page tables are 2K */
   2570	if (!mm_alloc_pgste(mm))
   2571		return -EINVAL;
   2572	mmap_write_lock(mm);
   2573	mm->context.has_pgste = 1;
   2574	/* split thp mappings and disable thp for future mappings */
   2575	thp_split_mm(mm);
   2576	walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
   2577	mmap_write_unlock(mm);
   2578	return 0;
   2579}
   2580EXPORT_SYMBOL_GPL(s390_enable_sie);
   2581
   2582int gmap_mark_unmergeable(void)
   2583{
   2584	struct mm_struct *mm = current->mm;
   2585	struct vm_area_struct *vma;
   2586	int ret;
   2587
   2588	for (vma = mm->mmap; vma; vma = vma->vm_next) {
   2589		ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
   2590				  MADV_UNMERGEABLE, &vma->vm_flags);
   2591		if (ret)
   2592			return ret;
   2593	}
   2594	mm->def_flags &= ~VM_MERGEABLE;
   2595	return 0;
   2596}
   2597EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
   2598
   2599/*
   2600 * Enable storage key handling from now on and initialize the storage
   2601 * keys with the default key.
   2602 */
   2603static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
   2604				  unsigned long next, struct mm_walk *walk)
   2605{
   2606	/* Clear storage key */
   2607	ptep_zap_key(walk->mm, addr, pte);
   2608	return 0;
   2609}
   2610
   2611/*
   2612 * Give a chance to schedule after setting the storage key for 256 pages.
   2613 * We only hold the mmap lock, which is an rwsem, and the kvm srcu.
   2614 * Both can sleep.
   2615 */
   2616static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
   2617				  unsigned long next, struct mm_walk *walk)
   2618{
   2619	cond_resched();
   2620	return 0;
   2621}
   2622
   2623static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
   2624				      unsigned long hmask, unsigned long next,
   2625				      struct mm_walk *walk)
   2626{
   2627	pmd_t *pmd = (pmd_t *)pte;
   2628	unsigned long start, end;
   2629	struct page *page = pmd_page(*pmd);
   2630
   2631	/*
   2632	 * The write check makes sure we do not set a key on shared
   2633	 * memory. This is needed as the walker does not differentiate
   2634	 * between actual guest memory and the process executable or
   2635	 * shared libraries.
   2636	 */
   2637	if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
   2638	    !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
   2639		return 0;
   2640
   2641	start = pmd_val(*pmd) & HPAGE_MASK;
   2642	end = start + HPAGE_SIZE - 1;
   2643	__storage_key_init_range(start, end);
   2644	set_bit(PG_arch_1, &page->flags);
   2645	cond_resched();
   2646	return 0;
   2647}
   2648
   2649static const struct mm_walk_ops enable_skey_walk_ops = {
   2650	.hugetlb_entry		= __s390_enable_skey_hugetlb,
   2651	.pte_entry		= __s390_enable_skey_pte,
   2652	.pmd_entry		= __s390_enable_skey_pmd,
   2653};
   2654
   2655int s390_enable_skey(void)
   2656{
   2657	struct mm_struct *mm = current->mm;
   2658	int rc = 0;
   2659
   2660	mmap_write_lock(mm);
   2661	if (mm_uses_skeys(mm))
   2662		goto out_up;
   2663
   2664	mm->context.uses_skeys = 1;
   2665	rc = gmap_mark_unmergeable();
   2666	if (rc) {
   2667		mm->context.uses_skeys = 0;
   2668		goto out_up;
   2669	}
   2670	walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
   2671
   2672out_up:
   2673	mmap_write_unlock(mm);
   2674	return rc;
   2675}
   2676EXPORT_SYMBOL_GPL(s390_enable_skey);
   2677
   2678/*
   2679 * Reset CMMA state, make all pages stable again.
   2680 */
   2681static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
   2682			     unsigned long next, struct mm_walk *walk)
   2683{
   2684	ptep_zap_unused(walk->mm, addr, pte, 1);
   2685	return 0;
   2686}
   2687
   2688static const struct mm_walk_ops reset_cmma_walk_ops = {
   2689	.pte_entry		= __s390_reset_cmma,
   2690};
   2691
   2692void s390_reset_cmma(struct mm_struct *mm)
   2693{
   2694	mmap_write_lock(mm);
   2695	walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
   2696	mmap_write_unlock(mm);
   2697}
   2698EXPORT_SYMBOL_GPL(s390_reset_cmma);
   2699
   2700/*
   2701 * make inaccessible pages accessible again
   2702 */
   2703static int __s390_reset_acc(pte_t *ptep, unsigned long addr,
   2704			    unsigned long next, struct mm_walk *walk)
   2705{
   2706	pte_t pte = READ_ONCE(*ptep);
   2707
   2708	/* There is a reference through the mapping */
   2709	if (pte_present(pte))
   2710		WARN_ON_ONCE(uv_destroy_owned_page(pte_val(pte) & PAGE_MASK));
   2711
   2712	return 0;
   2713}
   2714
   2715static const struct mm_walk_ops reset_acc_walk_ops = {
   2716	.pte_entry		= __s390_reset_acc,
   2717};
   2718
   2719#include <linux/sched/mm.h>
   2720void s390_reset_acc(struct mm_struct *mm)
   2721{
   2722	if (!mm_is_protected(mm))
   2723		return;
   2724	/*
   2725	 * we might be called during
   2726	 * reset:                             we walk the pages and clear
   2727	 * close of all kvm file descriptors: we walk the pages and clear
   2728	 * exit of process on fd closure:     vma already gone, do nothing
   2729	 */
   2730	if (!mmget_not_zero(mm))
   2731		return;
   2732	mmap_read_lock(mm);
   2733	walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL);
   2734	mmap_read_unlock(mm);
   2735	mmput(mm);
   2736}
   2737EXPORT_SYMBOL_GPL(s390_reset_acc);