cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

mmu_pv.c (65285B)


      1// SPDX-License-Identifier: GPL-2.0
      2
      3/*
      4 * Xen mmu operations
      5 *
      6 * This file contains the various mmu fetch and update operations.
      7 * The most important job they must perform is the mapping between the
      8 * domain's pfn and the overall machine mfns.
      9 *
     10 * Xen allows guests to directly update the pagetable, in a controlled
     11 * fashion.  In other words, the guest modifies the same pagetable
     12 * that the CPU actually uses, which eliminates the overhead of having
     13 * a separate shadow pagetable.
     14 *
     15 * In order to allow this, it falls on the guest domain to map its
     16 * notion of a "physical" pfn - which is just a domain-local linear
     17 * address - into a real "machine address" which the CPU's MMU can
     18 * use.
     19 *
     20 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
     21 * inserted directly into the pagetable.  When creating a new
     22 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
     23 * when reading the content back with __(pgd|pmd|pte)_val, it converts
     24 * the mfn back into a pfn.
     25 *
     26 * The other constraint is that all pages which make up a pagetable
     27 * must be mapped read-only in the guest.  This prevents uncontrolled
     28 * guest updates to the pagetable.  Xen strictly enforces this, and
     29 * will disallow any pagetable update which will end up mapping a
     30 * pagetable page RW, and will disallow using any writable page as a
     31 * pagetable.
     32 *
     33 * Naively, when loading %cr3 with the base of a new pagetable, Xen
     34 * would need to validate the whole pagetable before going on.
     35 * Naturally, this is quite slow.  The solution is to "pin" a
     36 * pagetable, which enforces all the constraints on the pagetable even
     37 * when it is not actively in use.  This means that Xen can be assured
     38 * that it is still valid when you do load it into %cr3, and doesn't
     39 * need to revalidate it.
     40 *
     41 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
     42 */
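       /*
        * Illustrative split of a present pte value, as implemented by the
        * pte_mfn_to_pfn()/pte_pfn_to_mfn() helpers further down:
        *
        *	mfn   = (val & XEN_PTE_MFN_MASK) >> PAGE_SHIFT;
        *	flags = val & PTE_FLAGS_MASK;
        *
        * The frame number is translated through the p2m/m2p tables and the
        * flag bits are carried over unchanged when the value is rebuilt.
        */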
     43#include <linux/sched/mm.h>
     44#include <linux/debugfs.h>
     45#include <linux/bug.h>
     46#include <linux/vmalloc.h>
     47#include <linux/export.h>
     48#include <linux/init.h>
     49#include <linux/gfp.h>
     50#include <linux/memblock.h>
     51#include <linux/seq_file.h>
     52#include <linux/crash_dump.h>
     53#include <linux/pgtable.h>
     54#ifdef CONFIG_KEXEC_CORE
     55#include <linux/kexec.h>
     56#endif
     57
     58#include <trace/events/xen.h>
     59
     60#include <asm/tlbflush.h>
     61#include <asm/fixmap.h>
     62#include <asm/mmu_context.h>
     63#include <asm/setup.h>
     64#include <asm/paravirt.h>
     65#include <asm/e820/api.h>
     66#include <asm/linkage.h>
     67#include <asm/page.h>
     68#include <asm/init.h>
     69#include <asm/memtype.h>
     70#include <asm/smp.h>
     71#include <asm/tlb.h>
     72
     73#include <asm/xen/hypercall.h>
     74#include <asm/xen/hypervisor.h>
     75
     76#include <xen/xen.h>
     77#include <xen/page.h>
     78#include <xen/interface/xen.h>
     79#include <xen/interface/hvm/hvm_op.h>
     80#include <xen/interface/version.h>
     81#include <xen/interface/memory.h>
     82#include <xen/hvc-console.h>
     83#include <xen/swiotlb-xen.h>
     84
     85#include "multicalls.h"
     86#include "mmu.h"
     87#include "debugfs.h"
     88
     89#ifdef CONFIG_X86_VSYSCALL_EMULATION
     90/* l3 pud for userspace vsyscall mapping */
     91static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
     92#endif
     93
     94/*
     95 * Protects atomic reservation decrease/increase against concurrent increases.
     96 * Also protects non-atomic updates of current_pages and balloon lists.
     97 */
     98static DEFINE_SPINLOCK(xen_reservation_lock);
     99
    100/*
    101 * Note about cr3 (pagetable base) values:
    102 *
    103 * xen_cr3 contains the current logical cr3 value; it contains the
    104 * last set cr3.  This may not be the current effective cr3, because
    105 * its update may be being lazily deferred.  However, a vcpu looking
    106 * at its own cr3 can use this value knowing that everything will
    107 * be self-consistent.
    108 *
    109 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
    110 * hypercall to set the vcpu cr3 is complete (so it may be a little
    111 * out of date, but it will never be set early).  If one vcpu is
    112 * looking at another vcpu's cr3 value, it should use this variable.
    113 */
    114DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
    115DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
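       /*
        * Usage sketch based on the code below: a vcpu reading its own base
        * pointer uses this_cpu_read(xen_cr3) (see xen_read_cr3()), while code
        * that checks whether *another* vcpu still references a pagetable
        * compares per_cpu(xen_current_cr3, cpu) against __pa(mm->pgd)
        * (see xen_drop_mm_ref()).
        */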
    116
    117static phys_addr_t xen_pt_base, xen_pt_size __initdata;
    118
    119static DEFINE_STATIC_KEY_FALSE(xen_struct_pages_ready);
    120
    121/*
    122 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
    123 * redzone above it, so round it up to a PGD boundary.
    124 */
    125#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
    126
    127void make_lowmem_page_readonly(void *vaddr)
    128{
    129	pte_t *pte, ptev;
    130	unsigned long address = (unsigned long)vaddr;
    131	unsigned int level;
    132
    133	pte = lookup_address(address, &level);
    134	if (pte == NULL)
    135		return;		/* vaddr missing */
    136
    137	ptev = pte_wrprotect(*pte);
    138
    139	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
    140		BUG();
    141}
    142
    143void make_lowmem_page_readwrite(void *vaddr)
    144{
    145	pte_t *pte, ptev;
    146	unsigned long address = (unsigned long)vaddr;
    147	unsigned int level;
    148
    149	pte = lookup_address(address, &level);
    150	if (pte == NULL)
    151		return;		/* vaddr missing */
    152
    153	ptev = pte_mkwrite(*pte);
    154
    155	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
    156		BUG();
    157}
    158
    159
    160/*
    161 * During early boot all page table pages are pinned, but we do not have struct
    162 * pages, so return true until struct pages are ready.
    163 */
    164static bool xen_page_pinned(void *ptr)
    165{
    166	if (static_branch_likely(&xen_struct_pages_ready)) {
    167		struct page *page = virt_to_page(ptr);
    168
    169		return PagePinned(page);
    170	}
    171	return true;
    172}
    173
    174static void xen_extend_mmu_update(const struct mmu_update *update)
    175{
    176	struct multicall_space mcs;
    177	struct mmu_update *u;
    178
    179	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
    180
    181	if (mcs.mc != NULL) {
    182		mcs.mc->args[1]++;
    183	} else {
    184		mcs = __xen_mc_entry(sizeof(*u));
    185		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
    186	}
    187
    188	u = mcs.args;
    189	*u = *update;
    190}
    191
    192static void xen_extend_mmuext_op(const struct mmuext_op *op)
    193{
    194	struct multicall_space mcs;
    195	struct mmuext_op *u;
    196
    197	mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
    198
    199	if (mcs.mc != NULL) {
    200		mcs.mc->args[1]++;
    201	} else {
    202		mcs = __xen_mc_entry(sizeof(*u));
    203		MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
    204	}
    205
    206	u = mcs.args;
    207	*u = *op;
    208}
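       /*
        * Callers wrap the two extend helpers above in a multicall batch; the
        * pattern used by xen_set_pmd_hyper() below is roughly:
        *
        *	xen_mc_batch();
        *	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
        *	u.val = pmd_val_ma(val);
        *	xen_extend_mmu_update(&u);
        *	xen_mc_issue(PARAVIRT_LAZY_MMU);
        *
        * If the previous multicall entry is already an mmu_update/mmuext_op
        * hypercall, the extend helpers just bump its argument count instead
        * of queueing a new hypercall.
        */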
    209
    210static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
    211{
    212	struct mmu_update u;
    213
    214	preempt_disable();
    215
    216	xen_mc_batch();
    217
    218	/* ptr may be ioremapped for 64-bit pagetable setup */
    219	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
    220	u.val = pmd_val_ma(val);
    221	xen_extend_mmu_update(&u);
    222
    223	xen_mc_issue(PARAVIRT_LAZY_MMU);
    224
    225	preempt_enable();
    226}
    227
    228static void xen_set_pmd(pmd_t *ptr, pmd_t val)
    229{
    230	trace_xen_mmu_set_pmd(ptr, val);
    231
    232	/* If page is not pinned, we can just update the entry
    233	   directly */
    234	if (!xen_page_pinned(ptr)) {
    235		*ptr = val;
    236		return;
    237	}
    238
    239	xen_set_pmd_hyper(ptr, val);
    240}
    241
    242/*
    243 * Associate a virtual page frame with a given physical page frame
    244 * and protection flags for that frame.
    245 */
    246void __init set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
    247{
    248	if (HYPERVISOR_update_va_mapping(vaddr, mfn_pte(mfn, flags),
    249					 UVMF_INVLPG))
    250		BUG();
    251}
    252
    253static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
    254{
    255	struct mmu_update u;
    256
    257	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
    258		return false;
    259
    260	xen_mc_batch();
    261
    262	u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
    263	u.val = pte_val_ma(pteval);
    264	xen_extend_mmu_update(&u);
    265
    266	xen_mc_issue(PARAVIRT_LAZY_MMU);
    267
    268	return true;
    269}
    270
    271static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
    272{
    273	if (!xen_batched_set_pte(ptep, pteval)) {
    274		/*
    275		 * Could call native_set_pte() here and trap and
    276		 * emulate the PTE write, but a hypercall is much cheaper.
    277		 */
    278		struct mmu_update u;
    279
    280		u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
    281		u.val = pte_val_ma(pteval);
    282		HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
    283	}
    284}
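       /*
        * Note the two paths above: inside a lazy MMU section the update is
        * queued through the multicall machinery (xen_batched_set_pte());
        * otherwise a single synchronous HYPERVISOR_mmu_update hypercall is
        * issued, which is still cheaper than letting Xen trap and emulate a
        * direct write to a read-only pte page.
        */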
    285
    286static void xen_set_pte(pte_t *ptep, pte_t pteval)
    287{
    288	trace_xen_mmu_set_pte(ptep, pteval);
    289	__xen_set_pte(ptep, pteval);
    290}
    291
    292pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma,
    293				 unsigned long addr, pte_t *ptep)
    294{
    295	/* Just return the pte as-is.  We preserve the bits on commit */
    296	trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep);
    297	return *ptep;
    298}
    299
    300void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
    301				 pte_t *ptep, pte_t pte)
    302{
    303	struct mmu_update u;
    304
    305	trace_xen_mmu_ptep_modify_prot_commit(vma->vm_mm, addr, ptep, pte);
    306	xen_mc_batch();
    307
    308	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
    309	u.val = pte_val_ma(pte);
    310	xen_extend_mmu_update(&u);
    311
    312	xen_mc_issue(PARAVIRT_LAZY_MMU);
    313}
    314
    315/* Assume pteval_t is equivalent to all the other *val_t types. */
    316static pteval_t pte_mfn_to_pfn(pteval_t val)
    317{
    318	if (val & _PAGE_PRESENT) {
    319		unsigned long mfn = (val & XEN_PTE_MFN_MASK) >> PAGE_SHIFT;
    320		unsigned long pfn = mfn_to_pfn(mfn);
    321
    322		pteval_t flags = val & PTE_FLAGS_MASK;
    323		if (unlikely(pfn == ~0))
    324			val = flags & ~_PAGE_PRESENT;
    325		else
    326			val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
    327	}
    328
    329	return val;
    330}
    331
    332static pteval_t pte_pfn_to_mfn(pteval_t val)
    333{
    334	if (val & _PAGE_PRESENT) {
    335		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
    336		pteval_t flags = val & PTE_FLAGS_MASK;
    337		unsigned long mfn;
    338
    339		mfn = __pfn_to_mfn(pfn);
    340
    341		/*
    342		 * If there's no mfn for the pfn, then just create an
    343		 * empty non-present pte.  Unfortunately this loses
    344		 * information about the original pfn, so
    345		 * pte_mfn_to_pfn is asymmetric.
    346		 */
    347		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
    348			mfn = 0;
    349			flags = 0;
    350		} else
    351			mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
    352		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
    353	}
    354
    355	return val;
    356}
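       /*
        * Worked example of the asymmetry noted above: a present pte whose pfn
        * has no machine frame (__pfn_to_mfn() returns INVALID_P2M_ENTRY) is
        * turned into an empty non-present pte by pte_pfn_to_mfn(); converting
        * that value back with pte_mfn_to_pfn() cannot recover the original
        * pfn.  Likewise, an mfn with no backing pfn (mfn_to_pfn() == ~0)
        * loses its _PAGE_PRESENT bit on the way back.
        */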
    357
    358__visible pteval_t xen_pte_val(pte_t pte)
    359{
    360	pteval_t pteval = pte.pte;
    361
    362	return pte_mfn_to_pfn(pteval);
    363}
    364PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
    365
    366__visible pgdval_t xen_pgd_val(pgd_t pgd)
    367{
    368	return pte_mfn_to_pfn(pgd.pgd);
    369}
    370PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
    371
    372__visible pte_t xen_make_pte(pteval_t pte)
    373{
    374	pte = pte_pfn_to_mfn(pte);
    375
    376	return native_make_pte(pte);
    377}
    378PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
    379
    380__visible pgd_t xen_make_pgd(pgdval_t pgd)
    381{
    382	pgd = pte_pfn_to_mfn(pgd);
    383	return native_make_pgd(pgd);
    384}
    385PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
    386
    387__visible pmdval_t xen_pmd_val(pmd_t pmd)
    388{
    389	return pte_mfn_to_pfn(pmd.pmd);
    390}
    391PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
    392
    393static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
    394{
    395	struct mmu_update u;
    396
    397	preempt_disable();
    398
    399	xen_mc_batch();
    400
    401	/* ptr may be ioremapped for 64-bit pagetable setup */
    402	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
    403	u.val = pud_val_ma(val);
    404	xen_extend_mmu_update(&u);
    405
    406	xen_mc_issue(PARAVIRT_LAZY_MMU);
    407
    408	preempt_enable();
    409}
    410
    411static void xen_set_pud(pud_t *ptr, pud_t val)
    412{
    413	trace_xen_mmu_set_pud(ptr, val);
    414
    415	/* If page is not pinned, we can just update the entry
    416	   directly */
    417	if (!xen_page_pinned(ptr)) {
    418		*ptr = val;
    419		return;
    420	}
    421
    422	xen_set_pud_hyper(ptr, val);
    423}
    424
    425__visible pmd_t xen_make_pmd(pmdval_t pmd)
    426{
    427	pmd = pte_pfn_to_mfn(pmd);
    428	return native_make_pmd(pmd);
    429}
    430PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
    431
    432__visible pudval_t xen_pud_val(pud_t pud)
    433{
    434	return pte_mfn_to_pfn(pud.pud);
    435}
    436PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
    437
    438__visible pud_t xen_make_pud(pudval_t pud)
    439{
    440	pud = pte_pfn_to_mfn(pud);
    441
    442	return native_make_pud(pud);
    443}
    444PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
    445
    446static pgd_t *xen_get_user_pgd(pgd_t *pgd)
    447{
    448	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
    449	unsigned offset = pgd - pgd_page;
    450	pgd_t *user_ptr = NULL;
    451
    452	if (offset < pgd_index(USER_LIMIT)) {
    453		struct page *page = virt_to_page(pgd_page);
    454		user_ptr = (pgd_t *)page->private;
    455		if (user_ptr)
    456			user_ptr += offset;
    457	}
    458
    459	return user_ptr;
    460}
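       /*
        * The user pgd returned above lives in page->private of the kernel
        * pgd's struct page; it is allocated and stored there by
        * xen_pgd_alloc() and released again in xen_pgd_free() further down.
        */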
    461
    462static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
    463{
    464	struct mmu_update u;
    465
    466	u.ptr = virt_to_machine(ptr).maddr;
    467	u.val = p4d_val_ma(val);
    468	xen_extend_mmu_update(&u);
    469}
    470
    471/*
    472 * Raw hypercall-based set_p4d, intended for use in early boot before
    473 * there's a page structure.  This implies:
    474 *  1. The only existing pagetable is the kernel's
    475 *  2. It is always pinned
    476 *  3. It has no user pagetable attached to it
    477 */
    478static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
    479{
    480	preempt_disable();
    481
    482	xen_mc_batch();
    483
    484	__xen_set_p4d_hyper(ptr, val);
    485
    486	xen_mc_issue(PARAVIRT_LAZY_MMU);
    487
    488	preempt_enable();
    489}
    490
    491static void xen_set_p4d(p4d_t *ptr, p4d_t val)
    492{
    493	pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr);
    494	pgd_t pgd_val;
    495
    496	trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val);
    497
    498	/* If page is not pinned, we can just update the entry
    499	   directly */
    500	if (!xen_page_pinned(ptr)) {
    501		*ptr = val;
    502		if (user_ptr) {
    503			WARN_ON(xen_page_pinned(user_ptr));
    504			pgd_val.pgd = p4d_val_ma(val);
    505			*user_ptr = pgd_val;
    506		}
    507		return;
    508	}
    509
    510	/* If it's pinned, then we can at least batch the kernel and
    511	   user updates together. */
    512	xen_mc_batch();
    513
    514	__xen_set_p4d_hyper(ptr, val);
    515	if (user_ptr)
    516		__xen_set_p4d_hyper((p4d_t *)user_ptr, val);
    517
    518	xen_mc_issue(PARAVIRT_LAZY_MMU);
    519}
    520
    521#if CONFIG_PGTABLE_LEVELS >= 5
    522__visible p4dval_t xen_p4d_val(p4d_t p4d)
    523{
    524	return pte_mfn_to_pfn(p4d.p4d);
    525}
    526PV_CALLEE_SAVE_REGS_THUNK(xen_p4d_val);
    527
    528__visible p4d_t xen_make_p4d(p4dval_t p4d)
    529{
    530	p4d = pte_pfn_to_mfn(p4d);
    531
    532	return native_make_p4d(p4d);
    533}
    534PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d);
    535#endif  /* CONFIG_PGTABLE_LEVELS >= 5 */
    536
    537static void xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
    538			 void (*func)(struct mm_struct *mm, struct page *,
    539				      enum pt_level),
    540			 bool last, unsigned long limit)
    541{
    542	int i, nr;
    543
    544	nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;
    545	for (i = 0; i < nr; i++) {
    546		if (!pmd_none(pmd[i]))
    547			(*func)(mm, pmd_page(pmd[i]), PT_PTE);
    548	}
    549}
    550
    551static void xen_pud_walk(struct mm_struct *mm, pud_t *pud,
    552			 void (*func)(struct mm_struct *mm, struct page *,
    553				      enum pt_level),
    554			 bool last, unsigned long limit)
    555{
    556	int i, nr;
    557
    558	nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD;
    559	for (i = 0; i < nr; i++) {
    560		pmd_t *pmd;
    561
    562		if (pud_none(pud[i]))
    563			continue;
    564
    565		pmd = pmd_offset(&pud[i], 0);
    566		if (PTRS_PER_PMD > 1)
    567			(*func)(mm, virt_to_page(pmd), PT_PMD);
    568		xen_pmd_walk(mm, pmd, func, last && i == nr - 1, limit);
    569	}
    570}
    571
    572static void xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
    573			 void (*func)(struct mm_struct *mm, struct page *,
    574				      enum pt_level),
    575			 bool last, unsigned long limit)
    576{
    577	pud_t *pud;
    578
    579
    580	if (p4d_none(*p4d))
    581		return;
    582
    583	pud = pud_offset(p4d, 0);
    584	if (PTRS_PER_PUD > 1)
    585		(*func)(mm, virt_to_page(pud), PT_PUD);
    586	xen_pud_walk(mm, pud, func, last, limit);
    587}
    588
    589/*
    590 * (Yet another) pagetable walker.  This one is intended for pinning a
    591 * pagetable.  This means that it walks a pagetable and calls the
    592 * callback function on each page it finds making up the page table,
    593 * at every level.  It walks the entire pagetable, but it only bothers
    594 * pinning pte pages which are below limit.  In the normal case this
    595 * will be STACK_TOP_MAX, but at boot we need to pin up to
    596 * FIXADDR_TOP.
    597 *
    598 * We must skip the Xen hole in the middle of the address space, just after
    599 * the big x86-64 virtual hole.
    600 */
    601static void __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
    602			   void (*func)(struct mm_struct *mm, struct page *,
    603					enum pt_level),
    604			   unsigned long limit)
    605{
    606	int i, nr;
    607	unsigned hole_low = 0, hole_high = 0;
    608
    609	/* The limit is the last byte to be touched */
    610	limit--;
    611	BUG_ON(limit >= FIXADDR_TOP);
    612
    613	/*
    614	 * 64-bit has a great big hole in the middle of the address
    615	 * space, which contains the Xen mappings.
    616	 */
    617	hole_low = pgd_index(GUARD_HOLE_BASE_ADDR);
    618	hole_high = pgd_index(GUARD_HOLE_END_ADDR);
    619
    620	nr = pgd_index(limit) + 1;
    621	for (i = 0; i < nr; i++) {
    622		p4d_t *p4d;
    623
    624		if (i >= hole_low && i < hole_high)
    625			continue;
    626
    627		if (pgd_none(pgd[i]))
    628			continue;
    629
    630		p4d = p4d_offset(&pgd[i], 0);
    631		xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
    632	}
    633
    634	/* Do the top level last, so that the callbacks can use it as
    635	   a cue to do final things like tlb flushes. */
    636	(*func)(mm, virt_to_page(pgd), PT_PGD);
    637}
    638
    639static void xen_pgd_walk(struct mm_struct *mm,
    640			 void (*func)(struct mm_struct *mm, struct page *,
    641				      enum pt_level),
    642			 unsigned long limit)
    643{
    644	__xen_pgd_walk(mm, mm->pgd, func, limit);
    645}
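       /*
        * Typical invocation, as used by the pinning code below:
        *
        *	__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT);
        *
        * i.e. the callback is applied to every pagetable page reachable
        * below USER_LIMIT, with the pgd page itself handled last.
        */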
    646
    647/* If we're using split pte locks, then take the page's lock and
    648   return a pointer to it.  Otherwise return NULL. */
    649static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
    650{
    651	spinlock_t *ptl = NULL;
    652
    653#if USE_SPLIT_PTE_PTLOCKS
    654	ptl = ptlock_ptr(page);
    655	spin_lock_nest_lock(ptl, &mm->page_table_lock);
    656#endif
    657
    658	return ptl;
    659}
    660
    661static void xen_pte_unlock(void *v)
    662{
    663	spinlock_t *ptl = v;
    664	spin_unlock(ptl);
    665}
    666
    667static void xen_do_pin(unsigned level, unsigned long pfn)
    668{
    669	struct mmuext_op op;
    670
    671	op.cmd = level;
    672	op.arg1.mfn = pfn_to_mfn(pfn);
    673
    674	xen_extend_mmuext_op(&op);
    675}
    676
    677static void xen_pin_page(struct mm_struct *mm, struct page *page,
    678			 enum pt_level level)
    679{
    680	unsigned pgfl = TestSetPagePinned(page);
    681
    682	if (!pgfl) {
    683		void *pt = lowmem_page_address(page);
    684		unsigned long pfn = page_to_pfn(page);
    685		struct multicall_space mcs = __xen_mc_entry(0);
    686		spinlock_t *ptl;
    687
    688		/*
    689		 * We need to hold the pagetable lock between the time
    690		 * we make the pagetable RO and when we actually pin
    691		 * it.  If we don't, then other users may come in and
    692		 * attempt to update the pagetable by writing it,
    693		 * which will fail because the memory is RO but not
    694		 * pinned, so Xen won't do the trap'n'emulate.
    695		 *
    696		 * If we're using split pte locks, we can't hold the
    697		 * entire pagetable's worth of locks during the
    698		 * traverse, because we may wrap the preempt count (8
    699		 * bits).  The solution is to mark RO and pin each PTE
    700		 * page while holding the lock.  This means the number
    701		 * of locks we end up holding is never more than a
    702		 * batch size (~32 entries, at present).
    703		 *
    704		 * If we're not using split pte locks, we needn't pin
    705		 * the PTE pages independently, because we're
    706		 * protected by the overall pagetable lock.
    707		 */
    708		ptl = NULL;
    709		if (level == PT_PTE)
    710			ptl = xen_pte_lock(page, mm);
    711
    712		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
    713					pfn_pte(pfn, PAGE_KERNEL_RO),
    714					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
    715
    716		if (ptl) {
    717			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
    718
    719			/* Queue a deferred unlock for when this batch
    720			   is completed. */
    721			xen_mc_callback(xen_pte_unlock, ptl);
    722		}
    723	}
    724}
    725
    726/* This is called just after a mm has been created, but it has not
    727   been used yet.  We need to make sure that its pagetable is all
    728   read-only, and can be pinned. */
    729static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
    730{
    731	pgd_t *user_pgd = xen_get_user_pgd(pgd);
    732
    733	trace_xen_mmu_pgd_pin(mm, pgd);
    734
    735	xen_mc_batch();
    736
    737	__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT);
    738
    739	xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
    740
    741	if (user_pgd) {
    742		xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
    743		xen_do_pin(MMUEXT_PIN_L4_TABLE,
    744			   PFN_DOWN(__pa(user_pgd)));
    745	}
    746
    747	xen_mc_issue(0);
    748}
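       /*
        * The pin sequence above is therefore: walk the tree marking every
        * pagetable page read-only (xen_pin_page), then ask Xen to pin the L4
        * root with MMUEXT_PIN_L4_TABLE, repeating for the separate user pgd
        * if one exists, all submitted as a single multicall batch.
        */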
    749
    750static void xen_pgd_pin(struct mm_struct *mm)
    751{
    752	__xen_pgd_pin(mm, mm->pgd);
    753}
    754
    755/*
    756 * On save, we need to pin all pagetables to make sure they get their
    757 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
    758 * them (unpinned pgds are not currently in use, probably because the
    759 * process is under construction or destruction).
    760 *
    761 * Expected to be called in stop_machine() ("equivalent to taking
    762 * every spinlock in the system"), so the locking doesn't really
    763 * matter all that much.
    764 */
    765void xen_mm_pin_all(void)
    766{
    767	struct page *page;
    768
    769	spin_lock(&pgd_lock);
    770
    771	list_for_each_entry(page, &pgd_list, lru) {
    772		if (!PagePinned(page)) {
    773			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
    774			SetPageSavePinned(page);
    775		}
    776	}
    777
    778	spin_unlock(&pgd_lock);
    779}
    780
    781static void __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
    782				   enum pt_level level)
    783{
    784	SetPagePinned(page);
    785}
    786
    787/*
    788 * The init_mm pagetable is really pinned as soon as it's created, but
    789 * that's before we have page structures to store the bits.  So do all
    790 * the book-keeping now once struct pages for allocated pages are
    791 * initialized. This happens only after memblock_free_all() is called.
    792 */
    793static void __init xen_after_bootmem(void)
    794{
    795	static_branch_enable(&xen_struct_pages_ready);
    796#ifdef CONFIG_X86_VSYSCALL_EMULATION
    797	SetPagePinned(virt_to_page(level3_user_vsyscall));
    798#endif
    799	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
    800}
    801
    802static void xen_unpin_page(struct mm_struct *mm, struct page *page,
    803			   enum pt_level level)
    804{
    805	unsigned pgfl = TestClearPagePinned(page);
    806
    807	if (pgfl) {
    808		void *pt = lowmem_page_address(page);
    809		unsigned long pfn = page_to_pfn(page);
    810		spinlock_t *ptl = NULL;
    811		struct multicall_space mcs;
    812
    813		/*
    814		 * Do the converse to pin_page.  If we're using split
    815		 * pte locks, we must be holding the lock while
    816		 * the pte page is unpinned but still RO to prevent
    817		 * concurrent updates from seeing it in this
    818		 * partially-pinned state.
    819		 */
    820		if (level == PT_PTE) {
    821			ptl = xen_pte_lock(page, mm);
    822
    823			if (ptl)
    824				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
    825		}
    826
    827		mcs = __xen_mc_entry(0);
    828
    829		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
    830					pfn_pte(pfn, PAGE_KERNEL),
    831					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
    832
    833		if (ptl) {
    834			/* unlock when batch completed */
    835			xen_mc_callback(xen_pte_unlock, ptl);
    836		}
    837	}
    838}
    839
    840/* Release a pagetable's pages back as normal RW */
    841static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
    842{
    843	pgd_t *user_pgd = xen_get_user_pgd(pgd);
    844
    845	trace_xen_mmu_pgd_unpin(mm, pgd);
    846
    847	xen_mc_batch();
    848
    849	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
    850
    851	if (user_pgd) {
    852		xen_do_pin(MMUEXT_UNPIN_TABLE,
    853			   PFN_DOWN(__pa(user_pgd)));
    854		xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
    855	}
    856
    857	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
    858
    859	xen_mc_issue(0);
    860}
    861
    862static void xen_pgd_unpin(struct mm_struct *mm)
    863{
    864	__xen_pgd_unpin(mm, mm->pgd);
    865}
    866
    867/*
    868 * On resume, undo any pinning done at save, so that the rest of the
    869 * kernel doesn't see any unexpected pinned pagetables.
    870 */
    871void xen_mm_unpin_all(void)
    872{
    873	struct page *page;
    874
    875	spin_lock(&pgd_lock);
    876
    877	list_for_each_entry(page, &pgd_list, lru) {
    878		if (PageSavePinned(page)) {
    879			BUG_ON(!PagePinned(page));
    880			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
    881			ClearPageSavePinned(page);
    882		}
    883	}
    884
    885	spin_unlock(&pgd_lock);
    886}
    887
    888static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
    889{
    890	spin_lock(&next->page_table_lock);
    891	xen_pgd_pin(next);
    892	spin_unlock(&next->page_table_lock);
    893}
    894
    895static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
    896{
    897	spin_lock(&mm->page_table_lock);
    898	xen_pgd_pin(mm);
    899	spin_unlock(&mm->page_table_lock);
    900}
    901
    902static void drop_mm_ref_this_cpu(void *info)
    903{
    904	struct mm_struct *mm = info;
    905
    906	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
    907		leave_mm(smp_processor_id());
    908
    909	/*
    910	 * If this cpu still has a stale cr3 reference, then make sure
    911	 * it has been flushed.
    912	 */
    913	if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
    914		xen_mc_flush();
    915}
    916
    917#ifdef CONFIG_SMP
    918/*
    919 * Another cpu may still have its %cr3 pointing at the pagetable, so
    920 * we need to repoint it somewhere else before we can unpin it.
    921 */
    922static void xen_drop_mm_ref(struct mm_struct *mm)
    923{
    924	cpumask_var_t mask;
    925	unsigned cpu;
    926
    927	drop_mm_ref_this_cpu(mm);
    928
    929	/* Get the "official" set of cpus referring to our pagetable. */
    930	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
    931		for_each_online_cpu(cpu) {
    932			if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
    933				continue;
    934			smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
    935		}
    936		return;
    937	}
    938
    939	/*
    940	 * It's possible that a vcpu may have a stale reference to our
    941	 * cr3, because it's in lazy mode and hasn't yet flushed
    942	 * its set of pending hypercalls.  In this case, we can
    943	 * look at its actual current cr3 value, and force it to flush
    944	 * if needed.
    945	 */
    946	cpumask_clear(mask);
    947	for_each_online_cpu(cpu) {
    948		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
    949			cpumask_set_cpu(cpu, mask);
    950	}
    951
    952	smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1);
    953	free_cpumask_var(mask);
    954}
    955#else
    956static void xen_drop_mm_ref(struct mm_struct *mm)
    957{
    958	drop_mm_ref_this_cpu(mm);
    959}
    960#endif
    961
    962/*
    963 * While a process runs, Xen pins its pagetables, which means that the
    964 * hypervisor forces it to be read-only, and it controls all updates
    965 * to it.  This means that all pagetable updates have to go via the
    966 * hypervisor, which is moderately expensive.
    967 *
    968 * Since we're pulling the pagetable down, we switch to use init_mm,
    969 * unpin the old process pagetable and mark it all read-write, which
    970 * allows further operations on it to be simple memory accesses.
    971 *
    972 * The only subtle point is that another CPU may be still using the
    973 * pagetable because of lazy tlb flushing.  This means we need to
    974 * switch all CPUs off this pagetable before we can unpin it.
    975 */
    976static void xen_exit_mmap(struct mm_struct *mm)
    977{
    978	get_cpu();		/* make sure we don't move around */
    979	xen_drop_mm_ref(mm);
    980	put_cpu();
    981
    982	spin_lock(&mm->page_table_lock);
    983
    984	/* pgd may not be pinned in the error exit path of execve */
    985	if (xen_page_pinned(mm->pgd))
    986		xen_pgd_unpin(mm);
    987
    988	spin_unlock(&mm->page_table_lock);
    989}
    990
    991static void xen_post_allocator_init(void);
    992
    993static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
    994{
    995	struct mmuext_op op;
    996
    997	op.cmd = cmd;
    998	op.arg1.mfn = pfn_to_mfn(pfn);
    999	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
   1000		BUG();
   1001}
   1002
   1003static void __init xen_cleanhighmap(unsigned long vaddr,
   1004				    unsigned long vaddr_end)
   1005{
   1006	unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
   1007	pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
   1008
   1009	/* NOTE: The loop is more greedy than the cleanup_highmap variant.
   1010	 * We include the PMD passed in on _both_ boundaries. */
   1011	for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD));
   1012			pmd++, vaddr += PMD_SIZE) {
   1013		if (pmd_none(*pmd))
   1014			continue;
   1015		if (vaddr < (unsigned long) _text || vaddr > kernel_end)
   1016			set_pmd(pmd, __pmd(0));
   1017	}
   1018	/* In case we did something silly, we should crash in this function
   1019	 * instead of somewhere later and be confusing. */
   1020	xen_mc_flush();
   1021}
   1022
   1023/*
   1024 * Make a page range writeable and free it.
   1025 */
   1026static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
   1027{
   1028	void *vaddr = __va(paddr);
   1029	void *vaddr_end = vaddr + size;
   1030
   1031	for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
   1032		make_lowmem_page_readwrite(vaddr);
   1033
   1034	memblock_phys_free(paddr, size);
   1035}
   1036
   1037static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
   1038{
   1039	unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
   1040
   1041	if (unpin)
   1042		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
   1043	ClearPagePinned(virt_to_page(__va(pa)));
   1044	xen_free_ro_pages(pa, PAGE_SIZE);
   1045}
   1046
   1047static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
   1048{
   1049	unsigned long pa;
   1050	pte_t *pte_tbl;
   1051	int i;
   1052
   1053	if (pmd_large(*pmd)) {
   1054		pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
   1055		xen_free_ro_pages(pa, PMD_SIZE);
   1056		return;
   1057	}
   1058
   1059	pte_tbl = pte_offset_kernel(pmd, 0);
   1060	for (i = 0; i < PTRS_PER_PTE; i++) {
   1061		if (pte_none(pte_tbl[i]))
   1062			continue;
   1063		pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
   1064		xen_free_ro_pages(pa, PAGE_SIZE);
   1065	}
   1066	set_pmd(pmd, __pmd(0));
   1067	xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
   1068}
   1069
   1070static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
   1071{
   1072	unsigned long pa;
   1073	pmd_t *pmd_tbl;
   1074	int i;
   1075
   1076	if (pud_large(*pud)) {
   1077		pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
   1078		xen_free_ro_pages(pa, PUD_SIZE);
   1079		return;
   1080	}
   1081
   1082	pmd_tbl = pmd_offset(pud, 0);
   1083	for (i = 0; i < PTRS_PER_PMD; i++) {
   1084		if (pmd_none(pmd_tbl[i]))
   1085			continue;
   1086		xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
   1087	}
   1088	set_pud(pud, __pud(0));
   1089	xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
   1090}
   1091
   1092static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
   1093{
   1094	unsigned long pa;
   1095	pud_t *pud_tbl;
   1096	int i;
   1097
   1098	if (p4d_large(*p4d)) {
   1099		pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
   1100		xen_free_ro_pages(pa, P4D_SIZE);
   1101		return;
   1102	}
   1103
   1104	pud_tbl = pud_offset(p4d, 0);
   1105	for (i = 0; i < PTRS_PER_PUD; i++) {
   1106		if (pud_none(pud_tbl[i]))
   1107			continue;
   1108		xen_cleanmfnmap_pud(pud_tbl + i, unpin);
   1109	}
   1110	set_p4d(p4d, __p4d(0));
   1111	xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
   1112}
   1113
   1114/*
   1115 * Since it is well isolated we can (and since it is perhaps large we should)
   1116 * also free the page tables mapping the initial P->M table.
   1117 */
   1118static void __init xen_cleanmfnmap(unsigned long vaddr)
   1119{
   1120	pgd_t *pgd;
   1121	p4d_t *p4d;
   1122	bool unpin;
   1123
   1124	unpin = (vaddr == 2 * PGDIR_SIZE);
   1125	vaddr &= PMD_MASK;
   1126	pgd = pgd_offset_k(vaddr);
   1127	p4d = p4d_offset(pgd, 0);
   1128	if (!p4d_none(*p4d))
   1129		xen_cleanmfnmap_p4d(p4d, unpin);
   1130}
   1131
   1132static void __init xen_pagetable_p2m_free(void)
   1133{
   1134	unsigned long size;
   1135	unsigned long addr;
   1136
   1137	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
   1138
   1139	/* No memory or already called. */
   1140	if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
   1141		return;
   1142
   1143	/* using __ka address and sticking INVALID_P2M_ENTRY! */
   1144	memset((void *)xen_start_info->mfn_list, 0xff, size);
   1145
   1146	addr = xen_start_info->mfn_list;
   1147	/*
   1148	 * We could be in __ka space.
   1149	 * We round up to the PMD, which means that if anybody at this stage is
   1150	 * using the __ka address of xen_start_info or
   1151	 * xen_start_info->shared_info they are going to crash. Fortunately
   1152	 * we have already revectored in xen_setup_kernel_pagetable.
   1153	 */
   1154	size = roundup(size, PMD_SIZE);
   1155
   1156	if (addr >= __START_KERNEL_map) {
   1157		xen_cleanhighmap(addr, addr + size);
   1158		size = PAGE_ALIGN(xen_start_info->nr_pages *
   1159				  sizeof(unsigned long));
   1160		memblock_free((void *)addr, size);
   1161	} else {
   1162		xen_cleanmfnmap(addr);
   1163	}
   1164}
   1165
   1166static void __init xen_pagetable_cleanhighmap(void)
   1167{
   1168	unsigned long size;
   1169	unsigned long addr;
   1170
   1171	/* At this stage, cleanup_highmap has already cleaned __ka space
   1172	 * from _brk_limit way up to the max_pfn_mapped (which is the end of
   1173	 * the ramdisk). We continue on, erasing PMD entries that point to page
   1174	 * tables - do note that they are accessible at this stage via __va.
   1175	 * As Xen is aligning the memory end to a 4MB boundary, for good
   1176	 * measure we also round up to PMD_SIZE * 2 - which means that if
   1177	 * anybody is using the __ka address of the initial boot-stack - and tries
   1178	 * to use it - they are going to crash. The xen_start_info has been
   1179	 * taken care of already in xen_setup_kernel_pagetable. */
   1180	addr = xen_start_info->pt_base;
   1181	size = xen_start_info->nr_pt_frames * PAGE_SIZE;
   1182
   1183	xen_cleanhighmap(addr, roundup(addr + size, PMD_SIZE * 2));
   1184	xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
   1185}
   1186
   1187static void __init xen_pagetable_p2m_setup(void)
   1188{
   1189	xen_vmalloc_p2m_tree();
   1190
   1191	xen_pagetable_p2m_free();
   1192
   1193	xen_pagetable_cleanhighmap();
   1194
   1195	/* And revector! Bye bye old array */
   1196	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
   1197}
   1198
   1199static void __init xen_pagetable_init(void)
   1200{
   1201	/*
   1202	 * The majority of further PTE writes are to pagetables already
   1203	 * announced as such to Xen. Hence it is more efficient to use
   1204	 * hypercalls for these updates.
   1205	 */
   1206	pv_ops.mmu.set_pte = __xen_set_pte;
   1207
   1208	paging_init();
   1209	xen_post_allocator_init();
   1210
   1211	xen_pagetable_p2m_setup();
   1212
   1213	/* Allocate and initialize top and mid mfn levels for p2m structure */
   1214	xen_build_mfn_list_list();
   1215
   1216	/* Remap memory freed due to conflicts with E820 map */
   1217	xen_remap_memory();
   1218	xen_setup_mfn_list_list();
   1219}
   1220
   1221static noinstr void xen_write_cr2(unsigned long cr2)
   1222{
   1223	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
   1224}
   1225
   1226static noinline void xen_flush_tlb(void)
   1227{
   1228	struct mmuext_op *op;
   1229	struct multicall_space mcs;
   1230
   1231	preempt_disable();
   1232
   1233	mcs = xen_mc_entry(sizeof(*op));
   1234
   1235	op = mcs.args;
   1236	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
   1237	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
   1238
   1239	xen_mc_issue(PARAVIRT_LAZY_MMU);
   1240
   1241	preempt_enable();
   1242}
   1243
   1244static void xen_flush_tlb_one_user(unsigned long addr)
   1245{
   1246	struct mmuext_op *op;
   1247	struct multicall_space mcs;
   1248
   1249	trace_xen_mmu_flush_tlb_one_user(addr);
   1250
   1251	preempt_disable();
   1252
   1253	mcs = xen_mc_entry(sizeof(*op));
   1254	op = mcs.args;
   1255	op->cmd = MMUEXT_INVLPG_LOCAL;
   1256	op->arg1.linear_addr = addr & PAGE_MASK;
   1257	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
   1258
   1259	xen_mc_issue(PARAVIRT_LAZY_MMU);
   1260
   1261	preempt_enable();
   1262}
   1263
   1264static void xen_flush_tlb_multi(const struct cpumask *cpus,
   1265				const struct flush_tlb_info *info)
   1266{
   1267	struct {
   1268		struct mmuext_op op;
   1269		DECLARE_BITMAP(mask, NR_CPUS);
   1270	} *args;
   1271	struct multicall_space mcs;
   1272	const size_t mc_entry_size = sizeof(args->op) +
   1273		sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus());
   1274
   1275	trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end);
   1276
   1277	if (cpumask_empty(cpus))
   1278		return;		/* nothing to do */
   1279
   1280	mcs = xen_mc_entry(mc_entry_size);
   1281	args = mcs.args;
   1282	args->op.arg2.vcpumask = to_cpumask(args->mask);
   1283
   1284	/* Remove any offline CPUs */
   1285	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
   1286
   1287	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
   1288	if (info->end != TLB_FLUSH_ALL &&
   1289	    (info->end - info->start) <= PAGE_SIZE) {
   1290		args->op.cmd = MMUEXT_INVLPG_MULTI;
   1291		args->op.arg1.linear_addr = info->start;
   1292	}
   1293
   1294	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
   1295
   1296	xen_mc_issue(PARAVIRT_LAZY_MMU);
   1297}
   1298
   1299static unsigned long xen_read_cr3(void)
   1300{
   1301	return this_cpu_read(xen_cr3);
   1302}
   1303
   1304static void set_current_cr3(void *v)
   1305{
   1306	this_cpu_write(xen_current_cr3, (unsigned long)v);
   1307}
   1308
   1309static void __xen_write_cr3(bool kernel, unsigned long cr3)
   1310{
   1311	struct mmuext_op op;
   1312	unsigned long mfn;
   1313
   1314	trace_xen_mmu_write_cr3(kernel, cr3);
   1315
   1316	if (cr3)
   1317		mfn = pfn_to_mfn(PFN_DOWN(cr3));
   1318	else
   1319		mfn = 0;
   1320
   1321	WARN_ON(mfn == 0 && kernel);
   1322
   1323	op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
   1324	op.arg1.mfn = mfn;
   1325
   1326	xen_extend_mmuext_op(&op);
   1327
   1328	if (kernel) {
   1329		this_cpu_write(xen_cr3, cr3);
   1330
   1331		/* Update xen_current_cr3 once the batch has actually
   1332		   been submitted. */
   1333		xen_mc_callback(set_current_cr3, (void *)cr3);
   1334	}
   1335}
   1336static void xen_write_cr3(unsigned long cr3)
   1337{
   1338	pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
   1339
   1340	BUG_ON(preemptible());
   1341
   1342	xen_mc_batch();  /* disables interrupts */
   1343
   1344	/* Update while interrupts are disabled, so it's atomic with
   1345	   respect to IPIs */
   1346	this_cpu_write(xen_cr3, cr3);
   1347
   1348	__xen_write_cr3(true, cr3);
   1349
   1350	if (user_pgd)
   1351		__xen_write_cr3(false, __pa(user_pgd));
   1352	else
   1353		__xen_write_cr3(false, 0);
   1354
   1355	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
   1356}
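       /*
        * Sketch of a full cr3 switch as Xen sees it: one batch carrying
        * MMUEXT_NEW_BASEPTR for the kernel pagetable plus
        * MMUEXT_NEW_USER_BASEPTR for the user pagetable (or mfn 0 when there
        * is none), issued with interrupts disabled so xen_cr3 and
        * xen_current_cr3 can never be observed half-updated from an IPI.
        */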
   1357
   1358/*
   1359 * At the start of the day - when Xen launches a guest, it has already
   1360 * built pagetables for the guest. We diligently look over them
   1361 * in xen_setup_kernel_pagetable and graft them as appropriate into the
   1362 * init_top_pgt and its friends. Then when we are happy we load
   1363 * the new init_top_pgt - and continue on.
   1364 *
   1365 * The generic code starts (start_kernel) and 'init_mem_mapping' sets
   1366 * up the rest of the pagetables. When it has completed it loads the cr3.
   1367 * N.B. that baremetal would start at 'start_kernel' (and the early
   1368 * #PF handler would create bootstrap pagetables) - so we are running
   1369 * with the same assumptions as what to do when write_cr3 is executed
   1370 * at this point.
   1371 *
   1372 * Since there are no user-page tables at all, we have two variants
   1373 * of xen_write_cr3 - the early bootup (this one), and the late one
   1374 * (xen_write_cr3). The reason we have to do that is that in 64-bit
   1375 * the Linux kernel and user-space are both in ring 3 while the
   1376 * hypervisor is in ring 0.
   1377 */
   1378static void __init xen_write_cr3_init(unsigned long cr3)
   1379{
   1380	BUG_ON(preemptible());
   1381
   1382	xen_mc_batch();  /* disables interrupts */
   1383
   1384	/* Update while interrupts are disabled, so it's atomic with
   1385	   respect to IPIs */
   1386	this_cpu_write(xen_cr3, cr3);
   1387
   1388	__xen_write_cr3(true, cr3);
   1389
   1390	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
   1391}
   1392
   1393static int xen_pgd_alloc(struct mm_struct *mm)
   1394{
   1395	pgd_t *pgd = mm->pgd;
   1396	struct page *page = virt_to_page(pgd);
   1397	pgd_t *user_pgd;
   1398	int ret = -ENOMEM;
   1399
   1400	BUG_ON(PagePinned(virt_to_page(pgd)));
   1401	BUG_ON(page->private != 0);
   1402
   1403	user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
   1404	page->private = (unsigned long)user_pgd;
   1405
   1406	if (user_pgd != NULL) {
   1407#ifdef CONFIG_X86_VSYSCALL_EMULATION
   1408		user_pgd[pgd_index(VSYSCALL_ADDR)] =
   1409			__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
   1410#endif
   1411		ret = 0;
   1412	}
   1413
   1414	BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
   1415
   1416	return ret;
   1417}
   1418
   1419static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
   1420{
   1421	pgd_t *user_pgd = xen_get_user_pgd(pgd);
   1422
   1423	if (user_pgd)
   1424		free_page((unsigned long)user_pgd);
   1425}
   1426
   1427/*
   1428 * Init-time set_pte while constructing initial pagetables, which
   1429 * doesn't allow RO page table pages to be remapped RW.
   1430 *
   1431 * If there is no MFN for this PFN then this page is initially
   1432 * ballooned out so clear the PTE (as in decrease_reservation() in
   1433 * drivers/xen/balloon.c).
   1434 *
   1435 * Many of these PTE updates are done on unpinned and writable pages
   1436 * and doing a hypercall for these is unnecessary and expensive.  At
   1437 * this point it is rarely possible to tell if a page is pinned, so
   1438 * mostly write the PTE directly and rely on Xen trapping and
   1439 * emulating any updates as necessary.
   1440 */
   1441static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
   1442{
   1443	if (unlikely(is_early_ioremap_ptep(ptep)))
   1444		__xen_set_pte(ptep, pte);
   1445	else
   1446		native_set_pte(ptep, pte);
   1447}
   1448
   1449__visible pte_t xen_make_pte_init(pteval_t pte)
   1450{
   1451	unsigned long pfn;
   1452
   1453	/*
   1454	 * Pages belonging to the initial p2m list mapped outside the default
   1455	 * address range must be mapped read-only. This region contains the
   1456	 * page tables for mapping the p2m list, too, and page tables MUST be
   1457	 * mapped read-only.
   1458	 */
   1459	pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;
   1460	if (xen_start_info->mfn_list < __START_KERNEL_map &&
   1461	    pfn >= xen_start_info->first_p2m_pfn &&
   1462	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
   1463		pte &= ~_PAGE_RW;
   1464
   1465	pte = pte_pfn_to_mfn(pte);
   1466	return native_make_pte(pte);
   1467}
   1468PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
   1469
   1470/* Early in boot, while setting up the initial pagetable, assume
   1471   everything is pinned. */
   1472static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
   1473{
   1474#ifdef CONFIG_FLATMEM
   1475	BUG_ON(mem_map);	/* should only be used early */
   1476#endif
   1477	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
   1478	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
   1479}
   1480
   1481/* Used for pmd and pud */
   1482static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
   1483{
   1484#ifdef CONFIG_FLATMEM
   1485	BUG_ON(mem_map);	/* should only be used early */
   1486#endif
   1487	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
   1488}
   1489
   1490/* Early release_pte assumes that all pts are pinned, since there's
   1491   only init_mm and anything attached to that is pinned. */
   1492static void __init xen_release_pte_init(unsigned long pfn)
   1493{
   1494	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
   1495	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
   1496}
   1497
   1498static void __init xen_release_pmd_init(unsigned long pfn)
   1499{
   1500	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
   1501}
   1502
   1503static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
   1504{
   1505	struct multicall_space mcs;
   1506	struct mmuext_op *op;
   1507
   1508	mcs = __xen_mc_entry(sizeof(*op));
   1509	op = mcs.args;
   1510	op->cmd = cmd;
   1511	op->arg1.mfn = pfn_to_mfn(pfn);
   1512
   1513	MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
   1514}
   1515
   1516static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
   1517{
   1518	struct multicall_space mcs;
   1519	unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
   1520
   1521	mcs = __xen_mc_entry(0);
   1522	MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
   1523				pfn_pte(pfn, prot), 0);
   1524}
   1525
   1526/* This needs to make sure the new pte page is pinned iff it's being
   1527   attached to a pinned pagetable. */
   1528static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
   1529				    unsigned level)
   1530{
   1531	bool pinned = xen_page_pinned(mm->pgd);
   1532
   1533	trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
   1534
   1535	if (pinned) {
   1536		struct page *page = pfn_to_page(pfn);
   1537
   1538		pinned = false;
   1539		if (static_branch_likely(&xen_struct_pages_ready)) {
   1540			pinned = PagePinned(page);
   1541			SetPagePinned(page);
   1542		}
   1543
   1544		xen_mc_batch();
   1545
   1546		__set_pfn_prot(pfn, PAGE_KERNEL_RO);
   1547
   1548		if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned)
   1549			__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
   1550
   1551		xen_mc_issue(PARAVIRT_LAZY_MMU);
   1552	}
   1553}
   1554
   1555static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
   1556{
   1557	xen_alloc_ptpage(mm, pfn, PT_PTE);
   1558}
   1559
   1560static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
   1561{
   1562	xen_alloc_ptpage(mm, pfn, PT_PMD);
   1563}
   1564
   1565/* This should never happen until we're OK to use struct page */
   1566static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
   1567{
   1568	struct page *page = pfn_to_page(pfn);
   1569	bool pinned = PagePinned(page);
   1570
   1571	trace_xen_mmu_release_ptpage(pfn, level, pinned);
   1572
   1573	if (pinned) {
   1574		xen_mc_batch();
   1575
   1576		if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
   1577			__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
   1578
   1579		__set_pfn_prot(pfn, PAGE_KERNEL);
   1580
   1581		xen_mc_issue(PARAVIRT_LAZY_MMU);
   1582
   1583		ClearPagePinned(page);
   1584	}
   1585}
   1586
   1587static void xen_release_pte(unsigned long pfn)
   1588{
   1589	xen_release_ptpage(pfn, PT_PTE);
   1590}
   1591
   1592static void xen_release_pmd(unsigned long pfn)
   1593{
   1594	xen_release_ptpage(pfn, PT_PMD);
   1595}
   1596
   1597static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
   1598{
   1599	xen_alloc_ptpage(mm, pfn, PT_PUD);
   1600}
   1601
   1602static void xen_release_pud(unsigned long pfn)
   1603{
   1604	xen_release_ptpage(pfn, PT_PUD);
   1605}
   1606
   1607/*
   1608 * Like __va(), but returns the address in the kernel mapping (which is
   1609 * all we have until the physical memory mapping has been set up).
   1610 */
   1611static void * __init __ka(phys_addr_t paddr)
   1612{
   1613	return (void *)(paddr + __START_KERNEL_map);
   1614}
   1615
   1616/* Convert a machine address to physical address */
   1617static unsigned long __init m2p(phys_addr_t maddr)
   1618{
   1619	phys_addr_t paddr;
   1620
   1621	maddr &= XEN_PTE_MFN_MASK;
   1622	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
   1623
   1624	return paddr;
   1625}
   1626
   1627/* Convert a machine address to kernel virtual */
   1628static void * __init m2v(phys_addr_t maddr)
   1629{
   1630	return __ka(m2p(maddr));
   1631}
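       /*
        * Example from xen_setup_kernel_pagetable() below: the Xen-provided L3
        * and L2 tables are reached by translating the machine addresses held
        * in the pagetable entries back into kernel-virtual addresses:
        *
        *	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
        *	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
        */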
   1632
   1633/* Set the page permissions on identity-mapped pages */
   1634static void __init set_page_prot_flags(void *addr, pgprot_t prot,
   1635				       unsigned long flags)
   1636{
   1637	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
   1638	pte_t pte = pfn_pte(pfn, prot);
   1639
   1640	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
   1641		BUG();
   1642}
   1643static void __init set_page_prot(void *addr, pgprot_t prot)
   1644{
   1645	return set_page_prot_flags(addr, prot, UVMF_NONE);
   1646}
   1647
   1648void __init xen_setup_machphys_mapping(void)
   1649{
   1650	struct xen_machphys_mapping mapping;
   1651
   1652	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
   1653		machine_to_phys_mapping = (unsigned long *)mapping.v_start;
   1654		machine_to_phys_nr = mapping.max_mfn + 1;
   1655	} else {
   1656		machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
   1657	}
   1658}
   1659
   1660static void __init convert_pfn_mfn(void *v)
   1661{
   1662	pte_t *pte = v;
   1663	int i;
   1664
   1665	/* All levels are converted the same way, so just treat them
   1666	   as ptes. */
   1667	for (i = 0; i < PTRS_PER_PTE; i++)
   1668		pte[i] = xen_make_pte(pte[i].pte);
   1669}
   1670static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
   1671				 unsigned long addr)
   1672{
   1673	if (*pt_base == PFN_DOWN(__pa(addr))) {
   1674		set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
   1675		clear_page((void *)addr);
   1676		(*pt_base)++;
   1677	}
   1678	if (*pt_end == PFN_DOWN(__pa(addr))) {
   1679		set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
   1680		clear_page((void *)addr);
   1681		(*pt_end)--;
   1682	}
   1683}
   1684/*
   1685 * Set up the initial kernel pagetable.
   1686 *
   1687 * We can construct this by grafting the Xen provided pagetable into
   1688 * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
   1689 * level2_ident_pgt, and level2_kernel_pgt.  This means that only the
   1690 * kernel has a physical mapping to start with - but that's enough to
   1691 * get __va working.  We need to fill in the rest of the physical
   1692 * mapping once some sort of allocator has been set up.
   1693 */
   1694void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
   1695{
   1696	pud_t *l3;
   1697	pmd_t *l2;
   1698	unsigned long addr[3];
   1699	unsigned long pt_base, pt_end;
   1700	unsigned i;
   1701
   1702	/* max_pfn_mapped is the last pfn mapped in the initial memory
   1703	 * mappings. Considering that on Xen after the kernel mappings we
   1704	 * have the mappings of some pages that don't exist in pfn space, we
   1705	 * set max_pfn_mapped to the last real pfn mapped. */
   1706	if (xen_start_info->mfn_list < __START_KERNEL_map)
   1707		max_pfn_mapped = xen_start_info->first_p2m_pfn;
   1708	else
   1709		max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
   1710
   1711	pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
   1712	pt_end = pt_base + xen_start_info->nr_pt_frames;
   1713
   1714	/* Zap identity mapping */
   1715	init_top_pgt[0] = __pgd(0);
   1716
   1717	/* Pre-constructed entries are in pfn, so convert to mfn */
   1718	/* L4[273] -> level3_ident_pgt  */
   1719	/* L4[511] -> level3_kernel_pgt */
   1720	convert_pfn_mfn(init_top_pgt);
   1721
   1722	/* L3_i[0] -> level2_ident_pgt */
   1723	convert_pfn_mfn(level3_ident_pgt);
   1724	/* L3_k[510] -> level2_kernel_pgt */
   1725	/* L3_k[511] -> level2_fixmap_pgt */
   1726	convert_pfn_mfn(level3_kernel_pgt);
   1727
   1728	/* L3_k[511][508-FIXMAP_PMD_NUM ... 507] -> level1_fixmap_pgt */
   1729	convert_pfn_mfn(level2_fixmap_pgt);
   1730
   1731	/* We get [511][511] and have Xen's version of level2_kernel_pgt */
   1732	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
   1733	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
   1734
   1735	addr[0] = (unsigned long)pgd;
   1736	addr[1] = (unsigned long)l3;
   1737	addr[2] = (unsigned long)l2;
    1738	/* Graft it onto L4[273][0]. Note that we are creating an aliasing
    1739	 * problem: both L4[273][0] and L4[511][510] have entries that point
    1740	 * to the same L2 (PMD) tables. Meaning that if you modify it in __va
    1741	 * space it will also be modified in the __ka space! (But if you just
    1742	 * modify the PMD table to point to other PTEs or none, then you
    1743	 * are OK - which is what cleanup_highmap does.) */
   1744	copy_page(level2_ident_pgt, l2);
   1745	/* Graft it onto L4[511][510] */
   1746	copy_page(level2_kernel_pgt, l2);
   1747
   1748	/*
   1749	 * Zap execute permission from the ident map. Due to the sharing of
   1750	 * L1 entries we need to do this in the L2.
   1751	 */
   1752	if (__supported_pte_mask & _PAGE_NX) {
   1753		for (i = 0; i < PTRS_PER_PMD; ++i) {
   1754			if (pmd_none(level2_ident_pgt[i]))
   1755				continue;
   1756			level2_ident_pgt[i] = pmd_set_flags(level2_ident_pgt[i], _PAGE_NX);
   1757		}
   1758	}
   1759
   1760	/* Copy the initial P->M table mappings if necessary. */
   1761	i = pgd_index(xen_start_info->mfn_list);
   1762	if (i && i < pgd_index(__START_KERNEL_map))
   1763		init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
   1764
   1765	/* Make pagetable pieces RO */
   1766	set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
   1767	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
   1768	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
   1769	set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
   1770	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
   1771	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
   1772
   1773	for (i = 0; i < FIXMAP_PMD_NUM; i++) {
   1774		set_page_prot(level1_fixmap_pgt + i * PTRS_PER_PTE,
   1775			      PAGE_KERNEL_RO);
   1776	}
   1777
   1778	/* Pin down new L4 */
   1779	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
   1780			  PFN_DOWN(__pa_symbol(init_top_pgt)));
   1781
   1782	/* Unpin Xen-provided one */
   1783	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
   1784
   1785#ifdef CONFIG_X86_VSYSCALL_EMULATION
   1786	/* Pin user vsyscall L3 */
   1787	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
   1788	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
   1789			  PFN_DOWN(__pa_symbol(level3_user_vsyscall)));
   1790#endif
   1791
   1792	/*
   1793	 * At this stage there can be no user pgd, and no page structure to
   1794	 * attach it to, so make sure we just set kernel pgd.
   1795	 */
   1796	xen_mc_batch();
   1797	__xen_write_cr3(true, __pa(init_top_pgt));
   1798	xen_mc_issue(PARAVIRT_LAZY_CPU);
   1799
    1800	/* We can't easily rip out the L3 and L2 pages, as the Xen pagetables
    1801	 * are laid out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
    1802	 * the initial domain. For guests using the toolstack, they are in
    1803	 * [L4], [L3], [L2], [L1], [L1] order. So for dom0 we can only
    1804	 * rip out the [L4] (pgd), but for guests we shave off three pages.
    1805	 */
   1806	for (i = 0; i < ARRAY_SIZE(addr); i++)
   1807		check_pt_base(&pt_base, &pt_end, addr[i]);
   1808
    1809	/* The (up to three pages) smaller Xen pagetable that we are still using */
   1810	xen_pt_base = PFN_PHYS(pt_base);
   1811	xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
   1812	memblock_reserve(xen_pt_base, xen_pt_size);
   1813
   1814	/* Revector the xen_start_info */
   1815	xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
   1816}
   1817
   1818/*
   1819 * Read a value from a physical address.
   1820 */
   1821static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
   1822{
   1823	unsigned long *vaddr;
   1824	unsigned long val;
   1825
   1826	vaddr = early_memremap_ro(addr, sizeof(val));
   1827	val = *vaddr;
   1828	early_memunmap(vaddr, sizeof(val));
   1829	return val;
   1830}
   1831
   1832/*
   1833 * Translate a virtual address to a physical one without relying on mapped
   1834 * page tables. Don't rely on big pages being aligned in (guest) physical
   1835 * space!
   1836 */
   1837static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
   1838{
   1839	phys_addr_t pa;
   1840	pgd_t pgd;
   1841	pud_t pud;
   1842	pmd_t pmd;
   1843	pte_t pte;
   1844
   1845	pa = read_cr3_pa();
   1846	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
   1847						       sizeof(pgd)));
   1848	if (!pgd_present(pgd))
   1849		return 0;
   1850
   1851	pa = pgd_val(pgd) & PTE_PFN_MASK;
   1852	pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
   1853						       sizeof(pud)));
   1854	if (!pud_present(pud))
   1855		return 0;
   1856	pa = pud_val(pud) & PTE_PFN_MASK;
   1857	if (pud_large(pud))
   1858		return pa + (vaddr & ~PUD_MASK);
   1859
   1860	pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
   1861						       sizeof(pmd)));
   1862	if (!pmd_present(pmd))
   1863		return 0;
   1864	pa = pmd_val(pmd) & PTE_PFN_MASK;
   1865	if (pmd_large(pmd))
   1866		return pa + (vaddr & ~PMD_MASK);
   1867
   1868	pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
   1869						       sizeof(pte)));
   1870	if (!pte_present(pte))
   1871		return 0;
   1872	pa = pte_pfn(pte) << PAGE_SHIFT;
   1873
   1874	return pa | (vaddr & ~PAGE_MASK);
   1875}
   1876
   1877/*
   1878 * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
   1879 * this area.
   1880 */
   1881void __init xen_relocate_p2m(void)
   1882{
   1883	phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
   1884	unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
   1885	int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
   1886	pte_t *pt;
   1887	pmd_t *pmd;
   1888	pud_t *pud;
   1889	pgd_t *pgd;
   1890	unsigned long *new_p2m;
   1891
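        	/*
        	 * Descriptive note (added): work out how many frames the new
        	 * area needs - n_pte frames for the p2m data itself plus the
        	 * page-table pages used to map it (n_pt PTE pages, n_pmd PMD
        	 * pages and n_pud PUD pages).
        	 */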
   1892	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
   1893	n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
   1894	n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
   1895	n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
   1896	n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
   1897	n_frames = n_pte + n_pt + n_pmd + n_pud;
   1898
   1899	new_area = xen_find_free_area(PFN_PHYS(n_frames));
   1900	if (!new_area) {
   1901		xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
   1902		BUG();
   1903	}
   1904
    1905	/*
    1906	 * Set up the page tables for addressing the new p2m list.
    1907	 * We have asked the hypervisor to map the p2m list at the user address
    1908	 * PUD_SIZE. It may have done so, or it may have used a kernel space
    1909	 * address depending on the Xen version.
    1910	 * To avoid any possible virtual address collision, just use
    1911	 * 2 * PGDIR_SIZE for the new area (pgd slot 2, matching new_p2m below).
    1912	 */
   1913	pud_phys = new_area;
   1914	pmd_phys = pud_phys + PFN_PHYS(n_pud);
   1915	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
   1916	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
   1917
   1918	pgd = __va(read_cr3_pa());
   1919	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
   1920	for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
   1921		pud = early_memremap(pud_phys, PAGE_SIZE);
   1922		clear_page(pud);
   1923		for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
   1924				idx_pmd++) {
   1925			pmd = early_memremap(pmd_phys, PAGE_SIZE);
   1926			clear_page(pmd);
   1927			for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
   1928					idx_pt++) {
   1929				pt = early_memremap(pt_phys, PAGE_SIZE);
   1930				clear_page(pt);
   1931				for (idx_pte = 0;
   1932				     idx_pte < min(n_pte, PTRS_PER_PTE);
   1933				     idx_pte++) {
   1934					pt[idx_pte] = pfn_pte(p2m_pfn,
   1935							      PAGE_KERNEL);
   1936					p2m_pfn++;
   1937				}
   1938				n_pte -= PTRS_PER_PTE;
   1939				early_memunmap(pt, PAGE_SIZE);
   1940				make_lowmem_page_readonly(__va(pt_phys));
   1941				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
   1942						PFN_DOWN(pt_phys));
   1943				pmd[idx_pt] = __pmd(_PAGE_TABLE | pt_phys);
   1944				pt_phys += PAGE_SIZE;
   1945			}
   1946			n_pt -= PTRS_PER_PMD;
   1947			early_memunmap(pmd, PAGE_SIZE);
   1948			make_lowmem_page_readonly(__va(pmd_phys));
   1949			pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
   1950					PFN_DOWN(pmd_phys));
   1951			pud[idx_pmd] = __pud(_PAGE_TABLE | pmd_phys);
   1952			pmd_phys += PAGE_SIZE;
   1953		}
   1954		n_pmd -= PTRS_PER_PUD;
   1955		early_memunmap(pud, PAGE_SIZE);
   1956		make_lowmem_page_readonly(__va(pud_phys));
   1957		pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
   1958		set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
   1959		pud_phys += PAGE_SIZE;
   1960	}
   1961
   1962	/* Now copy the old p2m info to the new area. */
   1963	memcpy(new_p2m, xen_p2m_addr, size);
   1964	xen_p2m_addr = new_p2m;
   1965
   1966	/* Release the old p2m list and set new list info. */
   1967	p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
   1968	BUG_ON(!p2m_pfn);
   1969	p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
   1970
   1971	if (xen_start_info->mfn_list < __START_KERNEL_map) {
   1972		pfn = xen_start_info->first_p2m_pfn;
   1973		pfn_end = xen_start_info->first_p2m_pfn +
   1974			  xen_start_info->nr_p2m_frames;
   1975		set_pgd(pgd + 1, __pgd(0));
   1976	} else {
   1977		pfn = p2m_pfn;
   1978		pfn_end = p2m_pfn_end;
   1979	}
   1980
   1981	memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
   1982	while (pfn < pfn_end) {
   1983		if (pfn == p2m_pfn) {
   1984			pfn = p2m_pfn_end;
   1985			continue;
   1986		}
   1987		make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
   1988		pfn++;
   1989	}
   1990
   1991	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
   1992	xen_start_info->first_p2m_pfn =  PFN_DOWN(new_area);
   1993	xen_start_info->nr_p2m_frames = n_frames;
   1994}
   1995
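        /*
         * Descriptive note (added): keep the special pages handed to us by
         * the hypervisor away from the page allocator - the start_info page,
         * the xenstore ring page (store_mfn) and, for unprivileged domains,
         * the console ring page.
         */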
   1996void __init xen_reserve_special_pages(void)
   1997{
   1998	phys_addr_t paddr;
   1999
   2000	memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
   2001	if (xen_start_info->store_mfn) {
   2002		paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
   2003		memblock_reserve(paddr, PAGE_SIZE);
   2004	}
   2005	if (!xen_initial_domain()) {
   2006		paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
   2007		memblock_reserve(paddr, PAGE_SIZE);
   2008	}
   2009}
   2010
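        /*
         * Descriptive note (added): the hypervisor-provided page tables must
         * not overlap a region the E820 map marks as reserved; if they do,
         * booting cannot continue.
         */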
   2011void __init xen_pt_check_e820(void)
   2012{
   2013	if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
   2014		xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
   2015		BUG();
   2016	}
   2017}
   2018
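        /*
         * Descriptive note (added): scratch page backing fixmap slots that
         * must not touch real hardware under Xen (local APIC, IO APIC).
         * xen_init_mmu_ops() fills it with 0xff, so reads through those
         * slots see all-ones.
         */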
   2019static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
   2020
   2021static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
   2022{
   2023	pte_t pte;
   2024	unsigned long vaddr;
   2025
   2026	phys >>= PAGE_SHIFT;
   2027
   2028	switch (idx) {
   2029	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
   2030#ifdef CONFIG_X86_VSYSCALL_EMULATION
   2031	case VSYSCALL_PAGE:
   2032#endif
   2033		/* All local page mappings */
   2034		pte = pfn_pte(phys, prot);
   2035		break;
   2036
   2037#ifdef CONFIG_X86_LOCAL_APIC
   2038	case FIX_APIC_BASE:	/* maps dummy local APIC */
   2039		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
   2040		break;
   2041#endif
   2042
   2043#ifdef CONFIG_X86_IO_APIC
   2044	case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
    2045		/*
    2046		 * We just don't map the IO APIC - all access is via
    2047		 * hypercalls.  Point the pte at the dummy page instead.
    2048		 */
   2049		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
   2050		break;
   2051#endif
   2052
   2053	case FIX_PARAVIRT_BOOTMAP:
   2054		/* This is an MFN, but it isn't an IO mapping from the
   2055		   IO domain */
   2056		pte = mfn_pte(phys, prot);
   2057		break;
   2058
   2059	default:
   2060		/* By default, set_fixmap is used for hardware mappings */
   2061		pte = mfn_pte(phys, prot);
   2062		break;
   2063	}
   2064
   2065	vaddr = __fix_to_virt(idx);
   2066	if (HYPERVISOR_update_va_mapping(vaddr, pte, UVMF_INVLPG))
   2067		BUG();
   2068
   2069#ifdef CONFIG_X86_VSYSCALL_EMULATION
   2070	/* Replicate changes to map the vsyscall page into the user
   2071	   pagetable vsyscall mapping. */
   2072	if (idx == VSYSCALL_PAGE)
   2073		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
   2074#endif
   2075}
   2076
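        /*
         * Descriptive note (added): called once the kernel's page allocator
         * is up - replace the boot-time (*_init) mmu hooks installed via
         * xen_mmu_ops below with the final implementations, and switch
         * write_cr3 from xen_write_cr3_init to xen_write_cr3.
         */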
   2077static void __init xen_post_allocator_init(void)
   2078{
   2079	pv_ops.mmu.set_pte = xen_set_pte;
   2080	pv_ops.mmu.set_pmd = xen_set_pmd;
   2081	pv_ops.mmu.set_pud = xen_set_pud;
   2082	pv_ops.mmu.set_p4d = xen_set_p4d;
   2083
   2084	/* This will work as long as patching hasn't happened yet
   2085	   (which it hasn't) */
   2086	pv_ops.mmu.alloc_pte = xen_alloc_pte;
   2087	pv_ops.mmu.alloc_pmd = xen_alloc_pmd;
   2088	pv_ops.mmu.release_pte = xen_release_pte;
   2089	pv_ops.mmu.release_pmd = xen_release_pmd;
   2090	pv_ops.mmu.alloc_pud = xen_alloc_pud;
   2091	pv_ops.mmu.release_pud = xen_release_pud;
   2092	pv_ops.mmu.make_pte = PV_CALLEE_SAVE(xen_make_pte);
   2093
   2094	pv_ops.mmu.write_cr3 = &xen_write_cr3;
   2095}
   2096
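        /*
         * Descriptive note (added): when a CPU leaves lazy MMU mode, flush
         * its queued multicalls first (with preemption disabled) so no
         * batched page-table updates remain pending.
         */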
   2097static void xen_leave_lazy_mmu(void)
   2098{
   2099	preempt_disable();
   2100	xen_mc_flush();
   2101	paravirt_leave_lazy_mmu();
   2102	preempt_enable();
   2103}
   2104
   2105static const typeof(pv_ops) xen_mmu_ops __initconst = {
   2106	.mmu = {
   2107		.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2),
   2108		.write_cr2 = xen_write_cr2,
   2109
   2110		.read_cr3 = xen_read_cr3,
   2111		.write_cr3 = xen_write_cr3_init,
   2112
   2113		.flush_tlb_user = xen_flush_tlb,
   2114		.flush_tlb_kernel = xen_flush_tlb,
   2115		.flush_tlb_one_user = xen_flush_tlb_one_user,
   2116		.flush_tlb_multi = xen_flush_tlb_multi,
   2117		.tlb_remove_table = tlb_remove_table,
   2118
   2119		.pgd_alloc = xen_pgd_alloc,
   2120		.pgd_free = xen_pgd_free,
   2121
   2122		.alloc_pte = xen_alloc_pte_init,
   2123		.release_pte = xen_release_pte_init,
   2124		.alloc_pmd = xen_alloc_pmd_init,
   2125		.release_pmd = xen_release_pmd_init,
   2126
   2127		.set_pte = xen_set_pte_init,
   2128		.set_pmd = xen_set_pmd_hyper,
   2129
   2130		.ptep_modify_prot_start = xen_ptep_modify_prot_start,
   2131		.ptep_modify_prot_commit = xen_ptep_modify_prot_commit,
   2132
   2133		.pte_val = PV_CALLEE_SAVE(xen_pte_val),
   2134		.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
   2135
   2136		.make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
   2137		.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
   2138
   2139		.set_pud = xen_set_pud_hyper,
   2140
   2141		.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
   2142		.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
   2143
   2144		.pud_val = PV_CALLEE_SAVE(xen_pud_val),
   2145		.make_pud = PV_CALLEE_SAVE(xen_make_pud),
   2146		.set_p4d = xen_set_p4d_hyper,
   2147
   2148		.alloc_pud = xen_alloc_pmd_init,
   2149		.release_pud = xen_release_pmd_init,
   2150
   2151#if CONFIG_PGTABLE_LEVELS >= 5
   2152		.p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
   2153		.make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
   2154#endif
   2155
   2156		.activate_mm = xen_activate_mm,
   2157		.dup_mmap = xen_dup_mmap,
   2158		.exit_mmap = xen_exit_mmap,
   2159
   2160		.lazy_mode = {
   2161			.enter = paravirt_enter_lazy_mmu,
   2162			.leave = xen_leave_lazy_mmu,
   2163			.flush = paravirt_flush_lazy_mmu,
   2164		},
   2165
   2166		.set_fixmap = xen_set_fixmap,
   2167	},
   2168};
   2169
   2170void __init xen_init_mmu_ops(void)
   2171{
   2172	x86_init.paging.pagetable_init = xen_pagetable_init;
   2173	x86_init.hyper.init_after_bootmem = xen_after_bootmem;
   2174
   2175	pv_ops.mmu = xen_mmu_ops.mmu;
   2176
   2177	memset(dummy_mapping, 0xff, PAGE_SIZE);
   2178}
   2179
   2180/* Protected by xen_reservation_lock. */
   2181#define MAX_CONTIG_ORDER 9 /* 2MB */
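        /* Scratch frame list, large enough for one maximum-order exchange. */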
   2182static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
   2183
   2184#define VOID_PTE (mfn_pte(0, __pgprot(0)))
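        /*
         * Descriptive note (added): clear the PTEs covering a 2^order page
         * range, recording for each page the MFN that backed it (in_frames)
         * and/or its PFN (out_frames), and mark those PFNs as
         * INVALID_P2M_ENTRY.  All updates are batched through the multicall
         * interface.
         */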
   2185static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
   2186				unsigned long *in_frames,
   2187				unsigned long *out_frames)
   2188{
   2189	int i;
   2190	struct multicall_space mcs;
   2191
   2192	xen_mc_batch();
   2193	for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
   2194		mcs = __xen_mc_entry(0);
   2195
   2196		if (in_frames)
   2197			in_frames[i] = virt_to_mfn(vaddr);
   2198
   2199		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
   2200		__set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
   2201
   2202		if (out_frames)
   2203			out_frames[i] = virt_to_pfn(vaddr);
   2204	}
   2205	xen_mc_issue(0);
   2206}
   2207
   2208/*
   2209 * Update the pfn-to-mfn mappings for a virtual address range, either to
   2210 * point to an array of mfns, or contiguously from a single starting
   2211 * mfn.
   2212 */
   2213static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
   2214				     unsigned long *mfns,
   2215				     unsigned long first_mfn)
   2216{
   2217	unsigned i, limit;
   2218	unsigned long mfn;
   2219
   2220	xen_mc_batch();
   2221
   2222	limit = 1u << order;
   2223	for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
   2224		struct multicall_space mcs;
   2225		unsigned flags;
   2226
   2227		mcs = __xen_mc_entry(0);
   2228		if (mfns)
   2229			mfn = mfns[i];
   2230		else
   2231			mfn = first_mfn + i;
   2232
   2233		if (i < (limit - 1))
   2234			flags = 0;
   2235		else {
   2236			if (order == 0)
   2237				flags = UVMF_INVLPG | UVMF_ALL;
   2238			else
   2239				flags = UVMF_TLB_FLUSH | UVMF_ALL;
   2240		}
   2241
   2242		MULTI_update_va_mapping(mcs.mc, vaddr,
   2243				mfn_pte(mfn, PAGE_KERNEL), flags);
   2244
   2245		set_phys_to_machine(virt_to_pfn(vaddr), mfn);
   2246	}
   2247
   2248	xen_mc_issue(0);
   2249}
   2250
   2251/*
   2252 * Perform the hypercall to exchange a region of our pfns to point to
   2253 * memory with the required contiguous alignment.  Takes the pfns as
   2254 * input, and populates mfns as output.
   2255 *
   2256 * Returns a success code indicating whether the hypervisor was able to
   2257 * satisfy the request or not.
   2258 */
   2259static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
   2260			       unsigned long *pfns_in,
   2261			       unsigned long extents_out,
   2262			       unsigned int order_out,
   2263			       unsigned long *mfns_out,
   2264			       unsigned int address_bits)
   2265{
   2266	long rc;
   2267	int success;
   2268
   2269	struct xen_memory_exchange exchange = {
   2270		.in = {
   2271			.nr_extents   = extents_in,
   2272			.extent_order = order_in,
   2273			.extent_start = pfns_in,
   2274			.domid        = DOMID_SELF
   2275		},
   2276		.out = {
   2277			.nr_extents   = extents_out,
   2278			.extent_order = order_out,
   2279			.extent_start = mfns_out,
   2280			.address_bits = address_bits,
   2281			.domid        = DOMID_SELF
   2282		}
   2283	};
   2284
   2285	BUG_ON(extents_in << order_in != extents_out << order_out);
   2286
   2287	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
   2288	success = (exchange.nr_exchanged == extents_in);
   2289
   2290	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
   2291	BUG_ON(success && (rc != 0));
   2292
   2293	return success;
   2294}
   2295
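        /*
         * Descriptive note (added): make a pseudo-physically contiguous range
         * machine-contiguous - zap its PTEs while remembering the backing
         * MFNs, exchange them with Xen for a single aligned extent, and map
         * that extent back in place.  The machine address of the start is
         * returned in *dma_handle.  (Typically used for DMA-able buffers,
         * e.g. by swiotlb-xen.)
         */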
   2296int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
   2297				 unsigned int address_bits,
   2298				 dma_addr_t *dma_handle)
   2299{
   2300	unsigned long *in_frames = discontig_frames, out_frame;
   2301	unsigned long  flags;
   2302	int            success;
   2303	unsigned long vstart = (unsigned long)phys_to_virt(pstart);
   2304
   2305	/*
   2306	 * Currently an auto-translated guest will not perform I/O, nor will
   2307	 * it require PAE page directories below 4GB. Therefore any calls to
   2308	 * this function are redundant and can be ignored.
   2309	 */
   2310
   2311	if (unlikely(order > MAX_CONTIG_ORDER))
   2312		return -ENOMEM;
   2313
   2314	memset((void *) vstart, 0, PAGE_SIZE << order);
   2315
   2316	spin_lock_irqsave(&xen_reservation_lock, flags);
   2317
   2318	/* 1. Zap current PTEs, remembering MFNs. */
   2319	xen_zap_pfn_range(vstart, order, in_frames, NULL);
   2320
   2321	/* 2. Get a new contiguous memory extent. */
   2322	out_frame = virt_to_pfn(vstart);
   2323	success = xen_exchange_memory(1UL << order, 0, in_frames,
   2324				      1, order, &out_frame,
   2325				      address_bits);
   2326
   2327	/* 3. Map the new extent in place of old pages. */
   2328	if (success)
   2329		xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
   2330	else
   2331		xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
   2332
   2333	spin_unlock_irqrestore(&xen_reservation_lock, flags);
   2334
   2335	*dma_handle = virt_to_machine(vstart).maddr;
   2336	return success ? 0 : -ENOMEM;
   2337}
   2338
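        /*
         * Descriptive note (added): undo xen_create_contiguous_region() -
         * exchange the contiguous machine extent back for ordinary individual
         * frames and remap them.
         */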
   2339void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
   2340{
   2341	unsigned long *out_frames = discontig_frames, in_frame;
   2342	unsigned long  flags;
   2343	int success;
   2344	unsigned long vstart;
   2345
   2346	if (unlikely(order > MAX_CONTIG_ORDER))
   2347		return;
   2348
   2349	vstart = (unsigned long)phys_to_virt(pstart);
   2350	memset((void *) vstart, 0, PAGE_SIZE << order);
   2351
   2352	spin_lock_irqsave(&xen_reservation_lock, flags);
   2353
   2354	/* 1. Find start MFN of contiguous extent. */
   2355	in_frame = virt_to_mfn(vstart);
   2356
   2357	/* 2. Zap current PTEs. */
   2358	xen_zap_pfn_range(vstart, order, NULL, out_frames);
   2359
   2360	/* 3. Do the exchange for non-contiguous MFNs. */
   2361	success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
   2362					0, out_frames, 0);
   2363
   2364	/* 4. Map new pages in place of old pages. */
   2365	if (success)
   2366		xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
   2367	else
   2368		xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
   2369
   2370	spin_unlock_irqrestore(&xen_reservation_lock, flags);
   2371}
   2372
   2373static noinline void xen_flush_tlb_all(void)
   2374{
   2375	struct mmuext_op *op;
   2376	struct multicall_space mcs;
   2377
   2378	preempt_disable();
   2379
   2380	mcs = xen_mc_entry(sizeof(*op));
   2381
   2382	op = mcs.args;
   2383	op->cmd = MMUEXT_TLB_FLUSH_ALL;
   2384	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
   2385
   2386	xen_mc_issue(PARAVIRT_LAZY_MMU);
   2387
   2388	preempt_enable();
   2389}
   2390
   2391#define REMAP_BATCH_SIZE 16
   2392
   2393struct remap_data {
   2394	xen_pfn_t *pfn;
   2395	bool contiguous;
   2396	bool no_translate;
   2397	pgprot_t prot;
   2398	struct mmu_update *mmu_update;
   2399};
   2400
   2401static int remap_area_pfn_pte_fn(pte_t *ptep, unsigned long addr, void *data)
   2402{
   2403	struct remap_data *rmd = data;
   2404	pte_t pte = pte_mkspecial(mfn_pte(*rmd->pfn, rmd->prot));
   2405
   2406	/*
   2407	 * If we have a contiguous range, just update the pfn itself,
   2408	 * else update pointer to be "next pfn".
   2409	 */
   2410	if (rmd->contiguous)
   2411		(*rmd->pfn)++;
   2412	else
   2413		rmd->pfn++;
   2414
   2415	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
   2416	rmd->mmu_update->ptr |= rmd->no_translate ?
   2417		MMU_PT_UPDATE_NO_TRANSLATE :
   2418		MMU_NORMAL_PT_UPDATE;
   2419	rmd->mmu_update->val = pte_val_ma(pte);
   2420	rmd->mmu_update++;
   2421
   2422	return 0;
   2423}
   2424
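        /*
         * Descriptive note (added): map @nr frames into @vma starting at
         * @addr.  With @err_ptr == NULL a single contiguous frame range
         * starting at *@pfn is mapped; otherwise @pfn is an array and
         * per-frame errors are reported in @err_ptr.  Returns the number of
         * frames mapped or a negative error code.
         */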
   2425int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr,
   2426		  xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot,
   2427		  unsigned int domid, bool no_translate)
   2428{
   2429	int err = 0;
   2430	struct remap_data rmd;
   2431	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
   2432	unsigned long range;
   2433	int mapped = 0;
   2434
   2435	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
   2436
   2437	rmd.pfn = pfn;
   2438	rmd.prot = prot;
    2439	/*
    2440	 * We use err_ptr to indicate whether we are doing a contiguous
    2441	 * mapping or a discontiguous one.
    2442	 */
   2443	rmd.contiguous = !err_ptr;
   2444	rmd.no_translate = no_translate;
   2445
   2446	while (nr) {
   2447		int index = 0;
   2448		int done = 0;
   2449		int batch = min(REMAP_BATCH_SIZE, nr);
   2450		int batch_left = batch;
   2451
   2452		range = (unsigned long)batch << PAGE_SHIFT;
   2453
   2454		rmd.mmu_update = mmu_update;
   2455		err = apply_to_page_range(vma->vm_mm, addr, range,
   2456					  remap_area_pfn_pte_fn, &rmd);
   2457		if (err)
   2458			goto out;
   2459
   2460		/*
   2461		 * We record the error for each page that gives an error, but
   2462		 * continue mapping until the whole set is done
   2463		 */
   2464		do {
   2465			int i;
   2466
   2467			err = HYPERVISOR_mmu_update(&mmu_update[index],
   2468						    batch_left, &done, domid);
   2469
    2470			/*
    2471			 * @err_ptr may be the same buffer as @pfn, so
    2472			 * only clear it after each chunk of @pfn is
    2473			 * used.
    2474			 */
   2475			if (err_ptr) {
   2476				for (i = index; i < index + done; i++)
   2477					err_ptr[i] = 0;
   2478			}
   2479			if (err < 0) {
   2480				if (!err_ptr)
   2481					goto out;
   2482				err_ptr[i] = err;
   2483				done++; /* Skip failed frame. */
   2484			} else
   2485				mapped += done;
   2486			batch_left -= done;
   2487			index += done;
   2488		} while (batch_left);
   2489
   2490		nr -= batch;
   2491		addr += range;
   2492		if (err_ptr)
   2493			err_ptr += batch;
   2494		cond_resched();
   2495	}
   2496out:
   2497
   2498	xen_flush_tlb_all();
   2499
   2500	return err < 0 ? err : mapped;
   2501}
   2502EXPORT_SYMBOL_GPL(xen_remap_pfn);
   2503
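        /*
         * Descriptive note (added): for kexec/kdump a PV domain must report
         * the machine address of the vmcoreinfo note, since its "physical"
         * addresses are only pseudo-physical.
         */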
   2504#ifdef CONFIG_KEXEC_CORE
   2505phys_addr_t paddr_vmcoreinfo_note(void)
   2506{
   2507	if (xen_pv_domain())
   2508		return virt_to_machine(vmcoreinfo_note).maddr;
   2509	else
   2510		return __pa(vmcoreinfo_note);
   2511}
   2512#endif /* CONFIG_KEXEC_CORE */