cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

set_memory.c (59656B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Copyright 2002 Andi Kleen, SuSE Labs.
      4 * Thanks to Ben LaHaise for precious feedback.
      5 */
      6#include <linux/highmem.h>
      7#include <linux/memblock.h>
      8#include <linux/sched.h>
      9#include <linux/mm.h>
     10#include <linux/interrupt.h>
     11#include <linux/seq_file.h>
     12#include <linux/debugfs.h>
     13#include <linux/pfn.h>
     14#include <linux/percpu.h>
     15#include <linux/gfp.h>
     16#include <linux/pci.h>
     17#include <linux/vmalloc.h>
     18#include <linux/libnvdimm.h>
     19#include <linux/vmstat.h>
     20#include <linux/kernel.h>
     21#include <linux/cc_platform.h>
     22#include <linux/set_memory.h>
     23
     24#include <asm/e820/api.h>
     25#include <asm/processor.h>
     26#include <asm/tlbflush.h>
     27#include <asm/sections.h>
     28#include <asm/setup.h>
     29#include <linux/uaccess.h>
     30#include <asm/pgalloc.h>
     31#include <asm/proto.h>
     32#include <asm/memtype.h>
     33#include <asm/hyperv-tlfs.h>
     34#include <asm/mshyperv.h>
     35
     36#include "../mm_internal.h"
     37
     38/*
     39 * The current flushing context - we pass it instead of 5 arguments:
     40 */
     41struct cpa_data {
     42	unsigned long	*vaddr;
     43	pgd_t		*pgd;
     44	pgprot_t	mask_set;
     45	pgprot_t	mask_clr;
     46	unsigned long	numpages;
     47	unsigned long	curpage;
     48	unsigned long	pfn;
     49	unsigned int	flags;
     50	unsigned int	force_split		: 1,
     51			force_static_prot	: 1,
     52			force_flush_all		: 1;
     53	struct page	**pages;
     54};
     55
     56enum cpa_warn {
     57	CPA_CONFLICT,
     58	CPA_PROTECT,
     59	CPA_DETECT,
     60};
     61
     62static const int cpa_warn_level = CPA_PROTECT;
     63
     64/*
     65 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
     66 * using cpa_lock, so that we don't allow any other CPU with stale large
     67 * TLB entries to change a page attribute in parallel while another CPU is
     68 * splitting a large page entry and changing its attribute.
     69 */
     70static DEFINE_SPINLOCK(cpa_lock);
     71
     72#define CPA_FLUSHTLB 1
     73#define CPA_ARRAY 2
     74#define CPA_PAGES_ARRAY 4
     75#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
     76
     77static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
     78{
     79	return __pgprot(cachemode2protval(pcm));
     80}
     81
     82#ifdef CONFIG_PROC_FS
     83static unsigned long direct_pages_count[PG_LEVEL_NUM];
     84
     85void update_page_count(int level, unsigned long pages)
     86{
     87	/* Protect against CPA */
     88	spin_lock(&pgd_lock);
     89	direct_pages_count[level] += pages;
     90	spin_unlock(&pgd_lock);
     91}
     92
     93static void split_page_count(int level)
     94{
     95	if (direct_pages_count[level] == 0)
     96		return;
     97
     98	direct_pages_count[level]--;
     99	if (system_state == SYSTEM_RUNNING) {
    100		if (level == PG_LEVEL_2M)
    101			count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
    102		else if (level == PG_LEVEL_1G)
    103			count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
    104	}
    105	direct_pages_count[level - 1] += PTRS_PER_PTE;
    106}
    107
    108void arch_report_meminfo(struct seq_file *m)
    109{
    110	seq_printf(m, "DirectMap4k:    %8lu kB\n",
    111			direct_pages_count[PG_LEVEL_4K] << 2);
    112#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
    113	seq_printf(m, "DirectMap2M:    %8lu kB\n",
    114			direct_pages_count[PG_LEVEL_2M] << 11);
    115#else
    116	seq_printf(m, "DirectMap4M:    %8lu kB\n",
    117			direct_pages_count[PG_LEVEL_2M] << 12);
    118#endif
    119	if (direct_gbpages)
    120		seq_printf(m, "DirectMap1G:    %8lu kB\n",
    121			direct_pages_count[PG_LEVEL_1G] << 20);
    122}
    123#else
    124static inline void split_page_count(int level) { }
    125#endif
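/*
 * Editor's note (not part of the original file): the shifts in
 * arch_report_meminfo() above convert page counts into kB. A 4K page is
 * 4 kB (count << 2), a 2M page is 2048 kB (count << 11), a 4M page is
 * 4096 kB (count << 12) and a 1G page is 1048576 kB (count << 20). For
 * example, three 2M direct-map pages are reported as 3 << 11 = 6144 kB.
 */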
    126
    127#ifdef CONFIG_X86_CPA_STATISTICS
    128
    129static unsigned long cpa_1g_checked;
    130static unsigned long cpa_1g_sameprot;
    131static unsigned long cpa_1g_preserved;
    132static unsigned long cpa_2m_checked;
    133static unsigned long cpa_2m_sameprot;
    134static unsigned long cpa_2m_preserved;
    135static unsigned long cpa_4k_install;
    136
    137static inline void cpa_inc_1g_checked(void)
    138{
    139	cpa_1g_checked++;
    140}
    141
    142static inline void cpa_inc_2m_checked(void)
    143{
    144	cpa_2m_checked++;
    145}
    146
    147static inline void cpa_inc_4k_install(void)
    148{
    149	data_race(cpa_4k_install++);
    150}
    151
    152static inline void cpa_inc_lp_sameprot(int level)
    153{
    154	if (level == PG_LEVEL_1G)
    155		cpa_1g_sameprot++;
    156	else
    157		cpa_2m_sameprot++;
    158}
    159
    160static inline void cpa_inc_lp_preserved(int level)
    161{
    162	if (level == PG_LEVEL_1G)
    163		cpa_1g_preserved++;
    164	else
    165		cpa_2m_preserved++;
    166}
    167
    168static int cpastats_show(struct seq_file *m, void *p)
    169{
    170	seq_printf(m, "1G pages checked:     %16lu\n", cpa_1g_checked);
    171	seq_printf(m, "1G pages sameprot:    %16lu\n", cpa_1g_sameprot);
    172	seq_printf(m, "1G pages preserved:   %16lu\n", cpa_1g_preserved);
    173	seq_printf(m, "2M pages checked:     %16lu\n", cpa_2m_checked);
    174	seq_printf(m, "2M pages sameprot:    %16lu\n", cpa_2m_sameprot);
    175	seq_printf(m, "2M pages preserved:   %16lu\n", cpa_2m_preserved);
    176	seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
    177	return 0;
    178}
    179
    180static int cpastats_open(struct inode *inode, struct file *file)
    181{
    182	return single_open(file, cpastats_show, NULL);
    183}
    184
    185static const struct file_operations cpastats_fops = {
    186	.open		= cpastats_open,
    187	.read		= seq_read,
    188	.llseek		= seq_lseek,
    189	.release	= single_release,
    190};
    191
    192static int __init cpa_stats_init(void)
    193{
    194	debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
    195			    &cpastats_fops);
    196	return 0;
    197}
    198late_initcall(cpa_stats_init);
    199#else
    200static inline void cpa_inc_1g_checked(void) { }
    201static inline void cpa_inc_2m_checked(void) { }
    202static inline void cpa_inc_4k_install(void) { }
    203static inline void cpa_inc_lp_sameprot(int level) { }
    204static inline void cpa_inc_lp_preserved(int level) { }
    205#endif
    206
    207
    208static inline int
    209within(unsigned long addr, unsigned long start, unsigned long end)
    210{
    211	return addr >= start && addr < end;
    212}
    213
    214static inline int
    215within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
    216{
    217	return addr >= start && addr <= end;
    218}
    219
    220#ifdef CONFIG_X86_64
    221
    222static inline unsigned long highmap_start_pfn(void)
    223{
    224	return __pa_symbol(_text) >> PAGE_SHIFT;
    225}
    226
    227static inline unsigned long highmap_end_pfn(void)
    228{
    229	/* Do not reference physical address outside the kernel. */
    230	return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
    231}
    232
    233static bool __cpa_pfn_in_highmap(unsigned long pfn)
    234{
    235	/*
    236	 * Kernel text has an alias mapping at a high address, known
    237	 * here as "highmap".
    238	 */
    239	return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn());
    240}
    241
    242#else
    243
    244static bool __cpa_pfn_in_highmap(unsigned long pfn)
    245{
    246	/* There is no highmap on 32-bit */
    247	return false;
    248}
    249
    250#endif
    251
    252/*
    253 * See set_mce_nospec().
    254 *
    255 * Machine check recovery code needs to change cache mode of poisoned pages to
    256 * UC to avoid speculative access logging another error. But passing the
    257 * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a
    258 * speculative access. So we cheat and flip the top bit of the address. This
    259 * works fine for the code that updates the page tables. But at the end of the
    260 * process we need to flush the TLB and cache and the non-canonical address
    261 * causes a #GP fault when used by the INVLPG and CLFLUSH instructions.
    262 *
    263 * But in the common case we already have a canonical address. This code
    264 * will fix the top bit if needed and is a no-op otherwise.
    265 */
    266static inline unsigned long fix_addr(unsigned long addr)
    267{
    268#ifdef CONFIG_X86_64
    269	return (long)(addr << 1) >> 1;
    270#else
    271	return addr;
    272#endif
    273}
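/*
 * Worked example (editor's addition, not in the original file): with the
 * default direct map base 0xffff888000000000, set_mce_nospec() flips the
 * top bit and hands CPA 0x7fff888000000000. fix_addr() shifts that left by
 * one (0xffff110000000000) and then arithmetic-shifts it right by one,
 * sign-extending from bit 62 and restoring 0xffff888000000000. An address
 * that is already canonical passes through unchanged.
 */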
    274
    275static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
    276{
    277	if (cpa->flags & CPA_PAGES_ARRAY) {
    278		struct page *page = cpa->pages[idx];
    279
    280		if (unlikely(PageHighMem(page)))
    281			return 0;
    282
    283		return (unsigned long)page_address(page);
    284	}
    285
    286	if (cpa->flags & CPA_ARRAY)
    287		return cpa->vaddr[idx];
    288
    289	return *cpa->vaddr + idx * PAGE_SIZE;
    290}
    291
    292/*
    293 * Flushing functions
    294 */
    295
    296static void clflush_cache_range_opt(void *vaddr, unsigned int size)
    297{
    298	const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
    299	void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
    300	void *vend = vaddr + size;
    301
    302	if (p >= vend)
    303		return;
    304
    305	for (; p < vend; p += clflush_size)
    306		clflushopt(p);
    307}
    308
    309/**
    310 * clflush_cache_range - flush a cache range with clflush
    311 * @vaddr:	virtual start address
    312 * @size:	number of bytes to flush
    313 *
    314 * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or
    315 * SFENCE to avoid ordering issues.
    316 */
    317void clflush_cache_range(void *vaddr, unsigned int size)
    318{
    319	mb();
    320	clflush_cache_range_opt(vaddr, size);
    321	mb();
    322}
    323EXPORT_SYMBOL_GPL(clflush_cache_range);
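/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a typical caller writes data through its cached mapping and then flushes
 * the affected lines so a non-coherent observer sees the data. The helper
 * below is hypothetical and only shows the calling pattern.
 */
#if 0
static void example_publish_buffer(void *buf, size_t len)
{
	/* Fill the buffer through the normal cached mapping. */
	memset(buf, 0xa5, len);

	/* Fenced CLFLUSHOPT loop over every line in [buf, buf + len). */
	clflush_cache_range(buf, len);
}
#endif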
    324
    325#ifdef CONFIG_ARCH_HAS_PMEM_API
    326void arch_invalidate_pmem(void *addr, size_t size)
    327{
    328	clflush_cache_range(addr, size);
    329}
    330EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
    331#endif
    332
    333static void __cpa_flush_all(void *arg)
    334{
    335	unsigned long cache = (unsigned long)arg;
    336
    337	/*
     338	 * Flush all to work around errata in early Athlons regarding
    339	 * large page flushing.
    340	 */
    341	__flush_tlb_all();
    342
    343	if (cache && boot_cpu_data.x86 >= 4)
    344		wbinvd();
    345}
    346
    347static void cpa_flush_all(unsigned long cache)
    348{
    349	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
    350
    351	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
    352}
    353
    354static void __cpa_flush_tlb(void *data)
    355{
    356	struct cpa_data *cpa = data;
    357	unsigned int i;
    358
    359	for (i = 0; i < cpa->numpages; i++)
    360		flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
    361}
    362
    363static void cpa_flush(struct cpa_data *data, int cache)
    364{
    365	struct cpa_data *cpa = data;
    366	unsigned int i;
    367
    368	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
    369
    370	if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
    371		cpa_flush_all(cache);
    372		return;
    373	}
    374
    375	if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling)
    376		flush_tlb_all();
    377	else
    378		on_each_cpu(__cpa_flush_tlb, cpa, 1);
    379
    380	if (!cache)
    381		return;
    382
    383	mb();
    384	for (i = 0; i < cpa->numpages; i++) {
    385		unsigned long addr = __cpa_addr(cpa, i);
    386		unsigned int level;
    387
    388		pte_t *pte = lookup_address(addr, &level);
    389
    390		/*
    391		 * Only flush present addresses:
    392		 */
    393		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
    394			clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
    395	}
    396	mb();
    397}
    398
    399static bool overlaps(unsigned long r1_start, unsigned long r1_end,
    400		     unsigned long r2_start, unsigned long r2_end)
    401{
    402	return (r1_start <= r2_end && r1_end >= r2_start) ||
    403		(r2_start <= r1_end && r2_end >= r1_start);
    404}
    405
    406#ifdef CONFIG_PCI_BIOS
    407/*
    408 * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
    409 * based config access (CONFIG_PCI_GOBIOS) support.
    410 */
    411#define BIOS_PFN	PFN_DOWN(BIOS_BEGIN)
    412#define BIOS_PFN_END	PFN_DOWN(BIOS_END - 1)
    413
    414static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
    415{
    416	if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END))
    417		return _PAGE_NX;
    418	return 0;
    419}
    420#else
    421static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
    422{
    423	return 0;
    424}
    425#endif
    426
    427/*
    428 * The .rodata section needs to be read-only. Using the pfn catches all
    429 * aliases.  This also includes __ro_after_init, so do not enforce until
    430 * kernel_set_to_readonly is true.
    431 */
    432static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
    433{
    434	unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));
    435
    436	/*
     437	 * Note: __end_rodata is page aligned and not inclusive, so
    438	 * subtract 1 to get the last enforced PFN in the rodata area.
    439	 */
    440	epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;
    441
    442	if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
    443		return _PAGE_RW;
    444	return 0;
    445}
    446
    447/*
    448 * Protect kernel text against becoming non executable by forbidding
    449 * _PAGE_NX.  This protects only the high kernel mapping (_text -> _etext)
    450 * out of which the kernel actually executes.  Do not protect the low
    451 * mapping.
    452 *
    453 * This does not cover __inittext since that is gone after boot.
    454 */
    455static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
    456{
    457	unsigned long t_end = (unsigned long)_etext - 1;
    458	unsigned long t_start = (unsigned long)_text;
    459
    460	if (overlaps(start, end, t_start, t_end))
    461		return _PAGE_NX;
    462	return 0;
    463}
    464
    465#if defined(CONFIG_X86_64)
    466/*
     467 * Once the kernel maps its text as RO (kernel_set_to_readonly is set), the
     468 * kernel text mappings for the large-page-aligned text and rodata sections
     469 * will always be read-only. The kernel identity mappings covering the holes
     470 * caused by this alignment can be whatever the caller asks for.
    471 *
    472 * This will preserve the large page mappings for kernel text/data at no
    473 * extra cost.
    474 */
    475static pgprotval_t protect_kernel_text_ro(unsigned long start,
    476					  unsigned long end)
    477{
    478	unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
    479	unsigned long t_start = (unsigned long)_text;
    480	unsigned int level;
    481
    482	if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
    483		return 0;
    484	/*
    485	 * Don't enforce the !RW mapping for the kernel text mapping, if
    486	 * the current mapping is already using small page mapping.  No
    487	 * need to work hard to preserve large page mappings in this case.
    488	 *
    489	 * This also fixes the Linux Xen paravirt guest boot failure caused
    490	 * by unexpected read-only mappings for kernel identity
    491	 * mappings. In this paravirt guest case, the kernel text mapping
    492	 * and the kernel identity mapping share the same page-table pages,
    493	 * so the protections for kernel text and identity mappings have to
    494	 * be the same.
    495	 */
    496	if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
    497		return _PAGE_RW;
    498	return 0;
    499}
    500#else
    501static pgprotval_t protect_kernel_text_ro(unsigned long start,
    502					  unsigned long end)
    503{
    504	return 0;
    505}
    506#endif
    507
    508static inline bool conflicts(pgprot_t prot, pgprotval_t val)
    509{
    510	return (pgprot_val(prot) & ~val) != pgprot_val(prot);
    511}
    512
    513static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
    514				  unsigned long start, unsigned long end,
    515				  unsigned long pfn, const char *txt)
    516{
    517	static const char *lvltxt[] = {
    518		[CPA_CONFLICT]	= "conflict",
    519		[CPA_PROTECT]	= "protect",
    520		[CPA_DETECT]	= "detect",
    521	};
    522
    523	if (warnlvl > cpa_warn_level || !conflicts(prot, val))
    524		return;
    525
    526	pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
    527		lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot),
    528		(unsigned long long)val);
    529}
    530
    531/*
    532 * Certain areas of memory on x86 require very specific protection flags,
    533 * for example the BIOS area or kernel text. Callers don't always get this
    534 * right (again, ioremap() on BIOS memory is not uncommon) so this function
    535 * checks and fixes these known static required protection bits.
    536 */
    537static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
    538					  unsigned long pfn, unsigned long npg,
    539					  unsigned long lpsize, int warnlvl)
    540{
    541	pgprotval_t forbidden, res;
    542	unsigned long end;
    543
    544	/*
    545	 * There is no point in checking RW/NX conflicts when the requested
    546	 * mapping is setting the page !PRESENT.
    547	 */
    548	if (!(pgprot_val(prot) & _PAGE_PRESENT))
    549		return prot;
    550
    551	/* Operate on the virtual address */
    552	end = start + npg * PAGE_SIZE - 1;
    553
    554	res = protect_kernel_text(start, end);
    555	check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX");
    556	forbidden = res;
    557
    558	/*
     559	 * Special case to preserve a large page. If the change spans the
     560	 * full large page mapping then there is no point in splitting it
     561	 * up. Happens with ftrace and is going to be removed once ftrace
     562	 * has switched to text_poke().
    563	 */
    564	if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) {
    565		res = protect_kernel_text_ro(start, end);
    566		check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
    567		forbidden |= res;
    568	}
    569
    570	/* Check the PFN directly */
    571	res = protect_pci_bios(pfn, pfn + npg - 1);
    572	check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX");
    573	forbidden |= res;
    574
    575	res = protect_rodata(pfn, pfn + npg - 1);
    576	check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO");
    577	forbidden |= res;
    578
    579	return __pgprot(pgprot_val(prot) & ~forbidden);
    580}
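/*
 * Worked example (editor's addition, not in the original file): assume
 * kernel_set_to_readonly is set and a caller tries set_memory_rw() on a
 * range whose pfns fall inside .rodata. The requested pgprot carries
 * _PAGE_RW, protect_rodata() reports _PAGE_RW as forbidden, check_conflict()
 * emits a "CPA protect ... Rodata RO" warning (cpa_warn_level is
 * CPA_PROTECT), and static_protections() returns the pgprot with _PAGE_RW
 * cleared, so the mapping stays read-only despite the request.
 */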
    581
    582/*
    583 * Lookup the page table entry for a virtual address in a specific pgd.
    584 * Return a pointer to the entry and the level of the mapping.
    585 */
    586pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
    587			     unsigned int *level)
    588{
    589	p4d_t *p4d;
    590	pud_t *pud;
    591	pmd_t *pmd;
    592
    593	*level = PG_LEVEL_NONE;
    594
    595	if (pgd_none(*pgd))
    596		return NULL;
    597
    598	p4d = p4d_offset(pgd, address);
    599	if (p4d_none(*p4d))
    600		return NULL;
    601
    602	*level = PG_LEVEL_512G;
    603	if (p4d_large(*p4d) || !p4d_present(*p4d))
    604		return (pte_t *)p4d;
    605
    606	pud = pud_offset(p4d, address);
    607	if (pud_none(*pud))
    608		return NULL;
    609
    610	*level = PG_LEVEL_1G;
    611	if (pud_large(*pud) || !pud_present(*pud))
    612		return (pte_t *)pud;
    613
    614	pmd = pmd_offset(pud, address);
    615	if (pmd_none(*pmd))
    616		return NULL;
    617
    618	*level = PG_LEVEL_2M;
    619	if (pmd_large(*pmd) || !pmd_present(*pmd))
    620		return (pte_t *)pmd;
    621
    622	*level = PG_LEVEL_4K;
    623
    624	return pte_offset_kernel(pmd, address);
    625}
    626
    627/*
    628 * Lookup the page table entry for a virtual address. Return a pointer
    629 * to the entry and the level of the mapping.
    630 *
    631 * Note: We return pud and pmd either when the entry is marked large
    632 * or when the present bit is not set. Otherwise we would return a
    633 * pointer to a nonexisting mapping.
    634 */
    635pte_t *lookup_address(unsigned long address, unsigned int *level)
    636{
    637	return lookup_address_in_pgd(pgd_offset_k(address), address, level);
    638}
    639EXPORT_SYMBOL_GPL(lookup_address);
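/*
 * Usage sketch (editor's addition, not part of the original file): how a
 * caller can inspect the mapping of a kernel virtual address. The helper
 * name is hypothetical; the pattern matches the lookup done in cpa_flush()
 * above.
 */
#if 0
static bool example_is_mapped_4k(unsigned long addr)
{
	unsigned int level;
	pte_t *pte = lookup_address(addr, &level);

	/* NULL or a non-present entry means there is no usable mapping. */
	if (!pte || !(pte_val(*pte) & _PAGE_PRESENT))
		return false;

	/* level distinguishes 4K, 2M and 1G mappings. */
	return level == PG_LEVEL_4K;
}
#endif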
    640
    641static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
    642				  unsigned int *level)
    643{
    644	if (cpa->pgd)
    645		return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
    646					       address, level);
    647
    648	return lookup_address(address, level);
    649}
    650
    651/*
    652 * Lookup the PMD entry for a virtual address. Return a pointer to the entry
    653 * or NULL if not present.
    654 */
    655pmd_t *lookup_pmd_address(unsigned long address)
    656{
    657	pgd_t *pgd;
    658	p4d_t *p4d;
    659	pud_t *pud;
    660
    661	pgd = pgd_offset_k(address);
    662	if (pgd_none(*pgd))
    663		return NULL;
    664
    665	p4d = p4d_offset(pgd, address);
    666	if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d))
    667		return NULL;
    668
    669	pud = pud_offset(p4d, address);
    670	if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
    671		return NULL;
    672
    673	return pmd_offset(pud, address);
    674}
    675
    676/*
    677 * This is necessary because __pa() does not work on some
    678 * kinds of memory, like vmalloc() or the alloc_remap()
    679 * areas on 32-bit NUMA systems.  The percpu areas can
    680 * end up in this kind of memory, for instance.
    681 *
    682 * This could be optimized, but it is only intended to be
    683 * used at initialization time, and keeping it
    684 * unoptimized should increase the testing coverage for
    685 * the more obscure platforms.
    686 */
    687phys_addr_t slow_virt_to_phys(void *__virt_addr)
    688{
    689	unsigned long virt_addr = (unsigned long)__virt_addr;
    690	phys_addr_t phys_addr;
    691	unsigned long offset;
    692	enum pg_level level;
    693	pte_t *pte;
    694
    695	pte = lookup_address(virt_addr, &level);
    696	BUG_ON(!pte);
    697
    698	/*
    699	 * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
    700	 * before being left-shifted PAGE_SHIFT bits -- this trick is to
     701	 * make 32-bit PAE kernels work correctly.
    702	 */
    703	switch (level) {
    704	case PG_LEVEL_1G:
    705		phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
    706		offset = virt_addr & ~PUD_PAGE_MASK;
    707		break;
    708	case PG_LEVEL_2M:
    709		phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
    710		offset = virt_addr & ~PMD_PAGE_MASK;
    711		break;
    712	default:
    713		phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
    714		offset = virt_addr & ~PAGE_MASK;
    715	}
    716
    717	return (phys_addr_t)(phys_addr | offset);
    718}
    719EXPORT_SYMBOL_GPL(slow_virt_to_phys);
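/*
 * Usage sketch (editor's addition, not part of the original file): percpu
 * and vmalloc addresses must not be fed to __pa(), but slow_virt_to_phys()
 * walks the page tables and works for any mapped kernel address. The
 * per-CPU variable below is hypothetical.
 */
#if 0
static DEFINE_PER_CPU(u64, example_val);

static phys_addr_t example_val_phys(int cpu)
{
	/* Resolve the per-CPU alias through the page tables, not __pa(). */
	return slow_virt_to_phys(per_cpu_ptr(&example_val, cpu));
}
#endif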
    720
    721/*
    722 * Set the new pmd in all the pgds we know about:
    723 */
    724static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
    725{
    726	/* change init_mm */
    727	set_pte_atomic(kpte, pte);
    728#ifdef CONFIG_X86_32
    729	if (!SHARED_KERNEL_PMD) {
    730		struct page *page;
    731
    732		list_for_each_entry(page, &pgd_list, lru) {
    733			pgd_t *pgd;
    734			p4d_t *p4d;
    735			pud_t *pud;
    736			pmd_t *pmd;
    737
    738			pgd = (pgd_t *)page_address(page) + pgd_index(address);
    739			p4d = p4d_offset(pgd, address);
    740			pud = pud_offset(p4d, address);
    741			pmd = pmd_offset(pud, address);
    742			set_pte_atomic((pte_t *)pmd, pte);
    743		}
    744	}
    745#endif
    746}
    747
    748static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
    749{
    750	/*
    751	 * _PAGE_GLOBAL means "global page" for present PTEs.
    752	 * But, it is also used to indicate _PAGE_PROTNONE
    753	 * for non-present PTEs.
    754	 *
    755	 * This ensures that a _PAGE_GLOBAL PTE going from
    756	 * present to non-present is not confused as
    757	 * _PAGE_PROTNONE.
    758	 */
    759	if (!(pgprot_val(prot) & _PAGE_PRESENT))
    760		pgprot_val(prot) &= ~_PAGE_GLOBAL;
    761
    762	return prot;
    763}
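/*
 * Editor's note (not part of the original file): _PAGE_GLOBAL and
 * _PAGE_PROTNONE share the same bit, and the bit only means "global" while
 * _PAGE_PRESENT is set. Example: turning a global kernel PTE into a
 * not-present one (as set_memory_np() does) must also drop _PAGE_GLOBAL
 * here, otherwise the resulting non-present PTE would look like a
 * PROT_NONE mapping to pte_protnone().
 */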
    764
    765static int __should_split_large_page(pte_t *kpte, unsigned long address,
    766				     struct cpa_data *cpa)
    767{
    768	unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
    769	pgprot_t old_prot, new_prot, req_prot, chk_prot;
    770	pte_t new_pte, *tmp;
    771	enum pg_level level;
    772
    773	/*
    774	 * Check for races, another CPU might have split this page
    775	 * up already:
    776	 */
    777	tmp = _lookup_address_cpa(cpa, address, &level);
    778	if (tmp != kpte)
    779		return 1;
    780
    781	switch (level) {
    782	case PG_LEVEL_2M:
    783		old_prot = pmd_pgprot(*(pmd_t *)kpte);
    784		old_pfn = pmd_pfn(*(pmd_t *)kpte);
    785		cpa_inc_2m_checked();
    786		break;
    787	case PG_LEVEL_1G:
    788		old_prot = pud_pgprot(*(pud_t *)kpte);
    789		old_pfn = pud_pfn(*(pud_t *)kpte);
    790		cpa_inc_1g_checked();
    791		break;
    792	default:
    793		return -EINVAL;
    794	}
    795
    796	psize = page_level_size(level);
    797	pmask = page_level_mask(level);
    798
    799	/*
     800	 * Calculate the number of pages which fit into this large
    801	 * page starting at address:
    802	 */
    803	lpaddr = (address + psize) & pmask;
    804	numpages = (lpaddr - address) >> PAGE_SHIFT;
    805	if (numpages < cpa->numpages)
    806		cpa->numpages = numpages;
    807
    808	/*
    809	 * We are safe now. Check whether the new pgprot is the same:
    810	 * Convert protection attributes to 4k-format, as cpa->mask* are set
    811	 * up accordingly.
    812	 */
    813
    814	/* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */
    815	req_prot = pgprot_large_2_4k(old_prot);
    816
    817	pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
    818	pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
    819
    820	/*
    821	 * req_prot is in format of 4k pages. It must be converted to large
    822	 * page format: the caching mode includes the PAT bit located at
    823	 * different bit positions in the two formats.
    824	 */
    825	req_prot = pgprot_4k_2_large(req_prot);
    826	req_prot = pgprot_clear_protnone_bits(req_prot);
    827	if (pgprot_val(req_prot) & _PAGE_PRESENT)
    828		pgprot_val(req_prot) |= _PAGE_PSE;
    829
    830	/*
    831	 * old_pfn points to the large page base pfn. So we need to add the
    832	 * offset of the virtual address:
    833	 */
    834	pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
    835	cpa->pfn = pfn;
    836
    837	/*
    838	 * Calculate the large page base address and the number of 4K pages
    839	 * in the large page
    840	 */
    841	lpaddr = address & pmask;
    842	numpages = psize >> PAGE_SHIFT;
    843
    844	/*
    845	 * Sanity check that the existing mapping is correct versus the static
    846	 * protections. static_protections() guards against !PRESENT, so no
    847	 * extra conditional required here.
    848	 */
    849	chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
    850				      psize, CPA_CONFLICT);
    851
    852	if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
    853		/*
    854		 * Split the large page and tell the split code to
    855		 * enforce static protections.
    856		 */
    857		cpa->force_static_prot = 1;
    858		return 1;
    859	}
    860
    861	/*
    862	 * Optimization: If the requested pgprot is the same as the current
    863	 * pgprot, then the large page can be preserved and no updates are
    864	 * required independent of alignment and length of the requested
    865	 * range. The above already established that the current pgprot is
    866	 * correct, which in consequence makes the requested pgprot correct
    867	 * as well if it is the same. The static protection scan below will
    868	 * not come to a different conclusion.
    869	 */
    870	if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
    871		cpa_inc_lp_sameprot(level);
    872		return 0;
    873	}
    874
    875	/*
    876	 * If the requested range does not cover the full page, split it up
    877	 */
    878	if (address != lpaddr || cpa->numpages != numpages)
    879		return 1;
    880
    881	/*
    882	 * Check whether the requested pgprot is conflicting with a static
    883	 * protection requirement in the large page.
    884	 */
    885	new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
    886				      psize, CPA_DETECT);
    887
    888	/*
    889	 * If there is a conflict, split the large page.
    890	 *
    891	 * There used to be a 4k wise evaluation trying really hard to
     892	 * preserve the large pages, but experimentation has shown that this
    893	 * does not help at all. There might be corner cases which would
    894	 * preserve one large page occasionally, but it's really not worth the
    895	 * extra code and cycles for the common case.
    896	 */
    897	if (pgprot_val(req_prot) != pgprot_val(new_prot))
    898		return 1;
    899
    900	/* All checks passed. Update the large page mapping. */
    901	new_pte = pfn_pte(old_pfn, new_prot);
    902	__set_pmd_pte(kpte, address, new_pte);
    903	cpa->flags |= CPA_FLUSHTLB;
    904	cpa_inc_lp_preserved(level);
    905	return 0;
    906}
    907
    908static int should_split_large_page(pte_t *kpte, unsigned long address,
    909				   struct cpa_data *cpa)
    910{
    911	int do_split;
    912
    913	if (cpa->force_split)
    914		return 1;
    915
    916	spin_lock(&pgd_lock);
    917	do_split = __should_split_large_page(kpte, address, cpa);
    918	spin_unlock(&pgd_lock);
    919
    920	return do_split;
    921}
    922
    923static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
    924			  pgprot_t ref_prot, unsigned long address,
    925			  unsigned long size)
    926{
    927	unsigned int npg = PFN_DOWN(size);
    928	pgprot_t prot;
    929
    930	/*
    931	 * If should_split_large_page() discovered an inconsistent mapping,
    932	 * remove the invalid protection in the split mapping.
    933	 */
    934	if (!cpa->force_static_prot)
    935		goto set;
    936
    937	/* Hand in lpsize = 0 to enforce the protection mechanism */
    938	prot = static_protections(ref_prot, address, pfn, npg, 0, CPA_PROTECT);
    939
    940	if (pgprot_val(prot) == pgprot_val(ref_prot))
    941		goto set;
    942
    943	/*
    944	 * If this is splitting a PMD, fix it up. PUD splits cannot be
     945	 * fixed trivially as that would require rescanning the newly
     946	 * installed PMD mappings after returning from split_large_page()
     947	 * so that an eventual further split can allocate the necessary PTE
    948	 * pages. Warn for now and revisit it in case this actually
    949	 * happens.
    950	 */
    951	if (size == PAGE_SIZE)
    952		ref_prot = prot;
    953	else
    954		pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
    955set:
    956	set_pte(pte, pfn_pte(pfn, ref_prot));
    957}
    958
    959static int
    960__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
    961		   struct page *base)
    962{
    963	unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
    964	pte_t *pbase = (pte_t *)page_address(base);
    965	unsigned int i, level;
    966	pgprot_t ref_prot;
    967	pte_t *tmp;
    968
    969	spin_lock(&pgd_lock);
    970	/*
    971	 * Check for races, another CPU might have split this page
    972	 * up for us already:
    973	 */
    974	tmp = _lookup_address_cpa(cpa, address, &level);
    975	if (tmp != kpte) {
    976		spin_unlock(&pgd_lock);
    977		return 1;
    978	}
    979
    980	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
    981
    982	switch (level) {
    983	case PG_LEVEL_2M:
    984		ref_prot = pmd_pgprot(*(pmd_t *)kpte);
    985		/*
    986		 * Clear PSE (aka _PAGE_PAT) and move
    987		 * PAT bit to correct position.
    988		 */
    989		ref_prot = pgprot_large_2_4k(ref_prot);
    990		ref_pfn = pmd_pfn(*(pmd_t *)kpte);
    991		lpaddr = address & PMD_MASK;
    992		lpinc = PAGE_SIZE;
    993		break;
    994
    995	case PG_LEVEL_1G:
    996		ref_prot = pud_pgprot(*(pud_t *)kpte);
    997		ref_pfn = pud_pfn(*(pud_t *)kpte);
    998		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
    999		lpaddr = address & PUD_MASK;
   1000		lpinc = PMD_SIZE;
   1001		/*
   1002		 * Clear the PSE flag if the PRESENT flag is not set,
   1003		 * otherwise pmd_present()/pmd_huge() will return true
   1004		 * even on a non-present pmd.
   1005		 */
   1006		if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
   1007			pgprot_val(ref_prot) &= ~_PAGE_PSE;
   1008		break;
   1009
   1010	default:
   1011		spin_unlock(&pgd_lock);
   1012		return 1;
   1013	}
   1014
   1015	ref_prot = pgprot_clear_protnone_bits(ref_prot);
   1016
   1017	/*
   1018	 * Get the target pfn from the original entry:
   1019	 */
   1020	pfn = ref_pfn;
   1021	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
   1022		split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);
   1023
   1024	if (virt_addr_valid(address)) {
   1025		unsigned long pfn = PFN_DOWN(__pa(address));
   1026
   1027		if (pfn_range_is_mapped(pfn, pfn + 1))
   1028			split_page_count(level);
   1029	}
   1030
   1031	/*
   1032	 * Install the new, split up pagetable.
   1033	 *
   1034	 * We use the standard kernel pagetable protections for the new
   1035	 * pagetable protections, the actual ptes set above control the
   1036	 * primary protection behavior:
   1037	 */
   1038	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
   1039
   1040	/*
   1041	 * Do a global flush tlb after splitting the large page
   1042	 * and before we do the actual change page attribute in the PTE.
   1043	 *
   1044	 * Without this, we violate the TLB application note, that says:
   1045	 * "The TLBs may contain both ordinary and large-page
   1046	 *  translations for a 4-KByte range of linear addresses. This
   1047	 *  may occur if software modifies the paging structures so that
   1048	 *  the page size used for the address range changes. If the two
   1049	 *  translations differ with respect to page frame or attributes
   1050	 *  (e.g., permissions), processor behavior is undefined and may
   1051	 *  be implementation-specific."
   1052	 *
   1053	 * We do this global TLB flush inside the cpa_lock, so that we
   1054	 * don't allow any other CPU with stale TLB entries to change,
   1055	 * in parallel, a page attribute that also falls into the
   1056	 * just split large page entry.
   1057	 */
   1058	flush_tlb_all();
   1059	spin_unlock(&pgd_lock);
   1060
   1061	return 0;
   1062}
   1063
   1064static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
   1065			    unsigned long address)
   1066{
   1067	struct page *base;
   1068
   1069	if (!debug_pagealloc_enabled())
   1070		spin_unlock(&cpa_lock);
   1071	base = alloc_pages(GFP_KERNEL, 0);
   1072	if (!debug_pagealloc_enabled())
   1073		spin_lock(&cpa_lock);
   1074	if (!base)
   1075		return -ENOMEM;
   1076
   1077	if (__split_large_page(cpa, kpte, address, base))
   1078		__free_page(base);
   1079
   1080	return 0;
   1081}
   1082
   1083static bool try_to_free_pte_page(pte_t *pte)
   1084{
   1085	int i;
   1086
   1087	for (i = 0; i < PTRS_PER_PTE; i++)
   1088		if (!pte_none(pte[i]))
   1089			return false;
   1090
   1091	free_page((unsigned long)pte);
   1092	return true;
   1093}
   1094
   1095static bool try_to_free_pmd_page(pmd_t *pmd)
   1096{
   1097	int i;
   1098
   1099	for (i = 0; i < PTRS_PER_PMD; i++)
   1100		if (!pmd_none(pmd[i]))
   1101			return false;
   1102
   1103	free_page((unsigned long)pmd);
   1104	return true;
   1105}
   1106
   1107static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
   1108{
   1109	pte_t *pte = pte_offset_kernel(pmd, start);
   1110
   1111	while (start < end) {
   1112		set_pte(pte, __pte(0));
   1113
   1114		start += PAGE_SIZE;
   1115		pte++;
   1116	}
   1117
   1118	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
   1119		pmd_clear(pmd);
   1120		return true;
   1121	}
   1122	return false;
   1123}
   1124
   1125static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
   1126			      unsigned long start, unsigned long end)
   1127{
   1128	if (unmap_pte_range(pmd, start, end))
   1129		if (try_to_free_pmd_page(pud_pgtable(*pud)))
   1130			pud_clear(pud);
   1131}
   1132
   1133static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
   1134{
   1135	pmd_t *pmd = pmd_offset(pud, start);
   1136
   1137	/*
   1138	 * Not on a 2MB page boundary?
   1139	 */
   1140	if (start & (PMD_SIZE - 1)) {
   1141		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
   1142		unsigned long pre_end = min_t(unsigned long, end, next_page);
   1143
   1144		__unmap_pmd_range(pud, pmd, start, pre_end);
   1145
   1146		start = pre_end;
   1147		pmd++;
   1148	}
   1149
   1150	/*
   1151	 * Try to unmap in 2M chunks.
   1152	 */
   1153	while (end - start >= PMD_SIZE) {
   1154		if (pmd_large(*pmd))
   1155			pmd_clear(pmd);
   1156		else
   1157			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
   1158
   1159		start += PMD_SIZE;
   1160		pmd++;
   1161	}
   1162
   1163	/*
   1164	 * 4K leftovers?
   1165	 */
   1166	if (start < end)
   1167		return __unmap_pmd_range(pud, pmd, start, end);
   1168
   1169	/*
   1170	 * Try again to free the PMD page if haven't succeeded above.
   1171	 */
   1172	if (!pud_none(*pud))
   1173		if (try_to_free_pmd_page(pud_pgtable(*pud)))
   1174			pud_clear(pud);
   1175}
   1176
   1177static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
   1178{
   1179	pud_t *pud = pud_offset(p4d, start);
   1180
   1181	/*
   1182	 * Not on a GB page boundary?
   1183	 */
   1184	if (start & (PUD_SIZE - 1)) {
   1185		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
   1186		unsigned long pre_end	= min_t(unsigned long, end, next_page);
   1187
   1188		unmap_pmd_range(pud, start, pre_end);
   1189
   1190		start = pre_end;
   1191		pud++;
   1192	}
   1193
   1194	/*
   1195	 * Try to unmap in 1G chunks?
   1196	 */
   1197	while (end - start >= PUD_SIZE) {
   1198
   1199		if (pud_large(*pud))
   1200			pud_clear(pud);
   1201		else
   1202			unmap_pmd_range(pud, start, start + PUD_SIZE);
   1203
   1204		start += PUD_SIZE;
   1205		pud++;
   1206	}
   1207
   1208	/*
   1209	 * 2M leftovers?
   1210	 */
   1211	if (start < end)
   1212		unmap_pmd_range(pud, start, end);
   1213
   1214	/*
   1215	 * No need to try to free the PUD page because we'll free it in
   1216	 * populate_pgd's error path
   1217	 */
   1218}
   1219
   1220static int alloc_pte_page(pmd_t *pmd)
   1221{
   1222	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
   1223	if (!pte)
   1224		return -1;
   1225
   1226	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
   1227	return 0;
   1228}
   1229
   1230static int alloc_pmd_page(pud_t *pud)
   1231{
   1232	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
   1233	if (!pmd)
   1234		return -1;
   1235
   1236	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
   1237	return 0;
   1238}
   1239
   1240static void populate_pte(struct cpa_data *cpa,
   1241			 unsigned long start, unsigned long end,
   1242			 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
   1243{
   1244	pte_t *pte;
   1245
   1246	pte = pte_offset_kernel(pmd, start);
   1247
   1248	pgprot = pgprot_clear_protnone_bits(pgprot);
   1249
   1250	while (num_pages-- && start < end) {
   1251		set_pte(pte, pfn_pte(cpa->pfn, pgprot));
   1252
   1253		start	 += PAGE_SIZE;
   1254		cpa->pfn++;
   1255		pte++;
   1256	}
   1257}
   1258
   1259static long populate_pmd(struct cpa_data *cpa,
   1260			 unsigned long start, unsigned long end,
   1261			 unsigned num_pages, pud_t *pud, pgprot_t pgprot)
   1262{
   1263	long cur_pages = 0;
   1264	pmd_t *pmd;
   1265	pgprot_t pmd_pgprot;
   1266
   1267	/*
   1268	 * Not on a 2M boundary?
   1269	 */
   1270	if (start & (PMD_SIZE - 1)) {
   1271		unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
   1272		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
   1273
   1274		pre_end   = min_t(unsigned long, pre_end, next_page);
   1275		cur_pages = (pre_end - start) >> PAGE_SHIFT;
   1276		cur_pages = min_t(unsigned int, num_pages, cur_pages);
   1277
   1278		/*
   1279		 * Need a PTE page?
   1280		 */
   1281		pmd = pmd_offset(pud, start);
   1282		if (pmd_none(*pmd))
   1283			if (alloc_pte_page(pmd))
   1284				return -1;
   1285
   1286		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
   1287
   1288		start = pre_end;
   1289	}
   1290
   1291	/*
   1292	 * We mapped them all?
   1293	 */
   1294	if (num_pages == cur_pages)
   1295		return cur_pages;
   1296
   1297	pmd_pgprot = pgprot_4k_2_large(pgprot);
   1298
   1299	while (end - start >= PMD_SIZE) {
   1300
   1301		/*
   1302		 * We cannot use a 1G page so allocate a PMD page if needed.
   1303		 */
   1304		if (pud_none(*pud))
   1305			if (alloc_pmd_page(pud))
   1306				return -1;
   1307
   1308		pmd = pmd_offset(pud, start);
   1309
   1310		set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
   1311					canon_pgprot(pmd_pgprot))));
   1312
   1313		start	  += PMD_SIZE;
   1314		cpa->pfn  += PMD_SIZE >> PAGE_SHIFT;
   1315		cur_pages += PMD_SIZE >> PAGE_SHIFT;
   1316	}
   1317
   1318	/*
   1319	 * Map trailing 4K pages.
   1320	 */
   1321	if (start < end) {
   1322		pmd = pmd_offset(pud, start);
   1323		if (pmd_none(*pmd))
   1324			if (alloc_pte_page(pmd))
   1325				return -1;
   1326
   1327		populate_pte(cpa, start, end, num_pages - cur_pages,
   1328			     pmd, pgprot);
   1329	}
   1330	return num_pages;
   1331}
   1332
   1333static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
   1334			pgprot_t pgprot)
   1335{
   1336	pud_t *pud;
   1337	unsigned long end;
   1338	long cur_pages = 0;
   1339	pgprot_t pud_pgprot;
   1340
   1341	end = start + (cpa->numpages << PAGE_SHIFT);
   1342
   1343	/*
   1344	 * Not on a Gb page boundary? => map everything up to it with
   1345	 * smaller pages.
   1346	 */
   1347	if (start & (PUD_SIZE - 1)) {
   1348		unsigned long pre_end;
   1349		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
   1350
   1351		pre_end   = min_t(unsigned long, end, next_page);
   1352		cur_pages = (pre_end - start) >> PAGE_SHIFT;
   1353		cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
   1354
   1355		pud = pud_offset(p4d, start);
   1356
   1357		/*
   1358		 * Need a PMD page?
   1359		 */
   1360		if (pud_none(*pud))
   1361			if (alloc_pmd_page(pud))
   1362				return -1;
   1363
   1364		cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
   1365					 pud, pgprot);
   1366		if (cur_pages < 0)
   1367			return cur_pages;
   1368
   1369		start = pre_end;
   1370	}
   1371
   1372	/* We mapped them all? */
   1373	if (cpa->numpages == cur_pages)
   1374		return cur_pages;
   1375
   1376	pud = pud_offset(p4d, start);
   1377	pud_pgprot = pgprot_4k_2_large(pgprot);
   1378
   1379	/*
   1380	 * Map everything starting from the Gb boundary, possibly with 1G pages
   1381	 */
   1382	while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
   1383		set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
   1384				   canon_pgprot(pud_pgprot))));
   1385
   1386		start	  += PUD_SIZE;
   1387		cpa->pfn  += PUD_SIZE >> PAGE_SHIFT;
   1388		cur_pages += PUD_SIZE >> PAGE_SHIFT;
   1389		pud++;
   1390	}
   1391
   1392	/* Map trailing leftover */
   1393	if (start < end) {
   1394		long tmp;
   1395
   1396		pud = pud_offset(p4d, start);
   1397		if (pud_none(*pud))
   1398			if (alloc_pmd_page(pud))
   1399				return -1;
   1400
   1401		tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
   1402				   pud, pgprot);
   1403		if (tmp < 0)
   1404			return cur_pages;
   1405
   1406		cur_pages += tmp;
   1407	}
   1408	return cur_pages;
   1409}
   1410
   1411/*
   1412 * Restrictions for kernel page table do not necessarily apply when mapping in
   1413 * an alternate PGD.
   1414 */
   1415static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
   1416{
   1417	pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
   1418	pud_t *pud = NULL;	/* shut up gcc */
   1419	p4d_t *p4d;
   1420	pgd_t *pgd_entry;
   1421	long ret;
   1422
   1423	pgd_entry = cpa->pgd + pgd_index(addr);
   1424
   1425	if (pgd_none(*pgd_entry)) {
   1426		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
   1427		if (!p4d)
   1428			return -1;
   1429
   1430		set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE));
   1431	}
   1432
   1433	/*
   1434	 * Allocate a PUD page and hand it down for mapping.
   1435	 */
   1436	p4d = p4d_offset(pgd_entry, addr);
   1437	if (p4d_none(*p4d)) {
   1438		pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
   1439		if (!pud)
   1440			return -1;
   1441
   1442		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
   1443	}
   1444
   1445	pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
   1446	pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);
   1447
   1448	ret = populate_pud(cpa, addr, p4d, pgprot);
   1449	if (ret < 0) {
   1450		/*
   1451		 * Leave the PUD page in place in case some other CPU or thread
   1452		 * already found it, but remove any useless entries we just
   1453		 * added to it.
   1454		 */
   1455		unmap_pud_range(p4d, addr,
   1456				addr + (cpa->numpages << PAGE_SHIFT));
   1457		return ret;
   1458	}
   1459
   1460	cpa->numpages = ret;
   1461	return 0;
   1462}
   1463
   1464static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
   1465			       int primary)
   1466{
   1467	if (cpa->pgd) {
   1468		/*
   1469		 * Right now, we only execute this code path when mapping
   1470		 * the EFI virtual memory map regions, no other users
   1471		 * provide a ->pgd value. This may change in the future.
   1472		 */
   1473		return populate_pgd(cpa, vaddr);
   1474	}
   1475
   1476	/*
   1477	 * Ignore all non primary paths.
   1478	 */
   1479	if (!primary) {
   1480		cpa->numpages = 1;
   1481		return 0;
   1482	}
   1483
   1484	/*
   1485	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
   1486	 * to have holes.
   1487	 * Also set numpages to '1' indicating that we processed cpa req for
   1488	 * one virtual address page and its pfn. TBD: numpages can be set based
   1489	 * on the initial value and the level returned by lookup_address().
   1490	 */
   1491	if (within(vaddr, PAGE_OFFSET,
   1492		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
   1493		cpa->numpages = 1;
   1494		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
   1495		return 0;
   1496
   1497	} else if (__cpa_pfn_in_highmap(cpa->pfn)) {
   1498		/* Faults in the highmap are OK, so do not warn: */
   1499		return -EFAULT;
   1500	} else {
   1501		WARN(1, KERN_WARNING "CPA: called for zero pte. "
   1502			"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
   1503			*cpa->vaddr);
   1504
   1505		return -EFAULT;
   1506	}
   1507}
   1508
   1509static int __change_page_attr(struct cpa_data *cpa, int primary)
   1510{
   1511	unsigned long address;
   1512	int do_split, err;
   1513	unsigned int level;
   1514	pte_t *kpte, old_pte;
   1515
   1516	address = __cpa_addr(cpa, cpa->curpage);
   1517repeat:
   1518	kpte = _lookup_address_cpa(cpa, address, &level);
   1519	if (!kpte)
   1520		return __cpa_process_fault(cpa, address, primary);
   1521
   1522	old_pte = *kpte;
   1523	if (pte_none(old_pte))
   1524		return __cpa_process_fault(cpa, address, primary);
   1525
   1526	if (level == PG_LEVEL_4K) {
   1527		pte_t new_pte;
   1528		pgprot_t new_prot = pte_pgprot(old_pte);
   1529		unsigned long pfn = pte_pfn(old_pte);
   1530
   1531		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
   1532		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
   1533
   1534		cpa_inc_4k_install();
   1535		/* Hand in lpsize = 0 to enforce the protection mechanism */
   1536		new_prot = static_protections(new_prot, address, pfn, 1, 0,
   1537					      CPA_PROTECT);
   1538
   1539		new_prot = pgprot_clear_protnone_bits(new_prot);
   1540
   1541		/*
   1542		 * We need to keep the pfn from the existing PTE;
   1543		 * after all, we're only going to change its attributes,
   1544		 * not the memory it points to.
   1545		 */
   1546		new_pte = pfn_pte(pfn, new_prot);
   1547		cpa->pfn = pfn;
   1548		/*
   1549		 * Do we really change anything ?
   1550		 */
   1551		if (pte_val(old_pte) != pte_val(new_pte)) {
   1552			set_pte_atomic(kpte, new_pte);
   1553			cpa->flags |= CPA_FLUSHTLB;
   1554		}
   1555		cpa->numpages = 1;
   1556		return 0;
   1557	}
   1558
   1559	/*
   1560	 * Check, whether we can keep the large page intact
   1561	 * and just change the pte:
   1562	 */
   1563	do_split = should_split_large_page(kpte, address, cpa);
   1564	/*
   1565	 * When the range fits into the existing large page,
   1566	 * return. cpa->numpages and the CPA_FLUSHTLB flag have been
   1567	 * updated in should_split_large_page():
   1568	 */
   1569	if (do_split <= 0)
   1570		return do_split;
   1571
   1572	/*
   1573	 * We have to split the large page:
   1574	 */
   1575	err = split_large_page(cpa, kpte, address);
   1576	if (!err)
   1577		goto repeat;
   1578
   1579	return err;
   1580}
   1581
   1582static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
   1583
   1584static int cpa_process_alias(struct cpa_data *cpa)
   1585{
   1586	struct cpa_data alias_cpa;
   1587	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
   1588	unsigned long vaddr;
   1589	int ret;
   1590
   1591	if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
   1592		return 0;
   1593
   1594	/*
   1595	 * No need to redo, when the primary call touched the direct
   1596	 * mapping already:
   1597	 */
   1598	vaddr = __cpa_addr(cpa, cpa->curpage);
   1599	if (!(within(vaddr, PAGE_OFFSET,
   1600		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
   1601
   1602		alias_cpa = *cpa;
   1603		alias_cpa.vaddr = &laddr;
   1604		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
   1605		alias_cpa.curpage = 0;
   1606
   1607		cpa->force_flush_all = 1;
   1608
   1609		ret = __change_page_attr_set_clr(&alias_cpa, 0);
   1610		if (ret)
   1611			return ret;
   1612	}
   1613
   1614#ifdef CONFIG_X86_64
   1615	/*
   1616	 * If the primary call didn't touch the high mapping already
   1617	 * and the physical address is inside the kernel map, we need
   1618	 * to touch the high mapped kernel as well:
   1619	 */
   1620	if (!within(vaddr, (unsigned long)_text, _brk_end) &&
   1621	    __cpa_pfn_in_highmap(cpa->pfn)) {
   1622		unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
   1623					       __START_KERNEL_map - phys_base;
   1624		alias_cpa = *cpa;
   1625		alias_cpa.vaddr = &temp_cpa_vaddr;
   1626		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
   1627		alias_cpa.curpage = 0;
   1628
   1629		cpa->force_flush_all = 1;
   1630		/*
   1631		 * The high mapping range is imprecise, so ignore the
   1632		 * return value.
   1633		 */
   1634		__change_page_attr_set_clr(&alias_cpa, 0);
   1635	}
   1636#endif
   1637
   1638	return 0;
   1639}
   1640
   1641static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
   1642{
   1643	unsigned long numpages = cpa->numpages;
   1644	unsigned long rempages = numpages;
   1645	int ret = 0;
   1646
   1647	while (rempages) {
   1648		/*
   1649		 * Store the remaining nr of pages for the large page
   1650		 * preservation check.
   1651		 */
   1652		cpa->numpages = rempages;
   1653		/* for array changes, we can't use large page */
   1654		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
   1655			cpa->numpages = 1;
   1656
   1657		if (!debug_pagealloc_enabled())
   1658			spin_lock(&cpa_lock);
   1659		ret = __change_page_attr(cpa, checkalias);
   1660		if (!debug_pagealloc_enabled())
   1661			spin_unlock(&cpa_lock);
   1662		if (ret)
   1663			goto out;
   1664
   1665		if (checkalias) {
   1666			ret = cpa_process_alias(cpa);
   1667			if (ret)
   1668				goto out;
   1669		}
   1670
   1671		/*
   1672		 * Adjust the number of pages with the result of the
   1673		 * CPA operation. Either a large page has been
   1674		 * preserved or a single page update happened.
   1675		 */
   1676		BUG_ON(cpa->numpages > rempages || !cpa->numpages);
   1677		rempages -= cpa->numpages;
   1678		cpa->curpage += cpa->numpages;
   1679	}
   1680
   1681out:
   1682	/* Restore the original numpages */
   1683	cpa->numpages = numpages;
   1684	return ret;
   1685}
   1686
   1687static int change_page_attr_set_clr(unsigned long *addr, int numpages,
   1688				    pgprot_t mask_set, pgprot_t mask_clr,
   1689				    int force_split, int in_flag,
   1690				    struct page **pages)
   1691{
   1692	struct cpa_data cpa;
   1693	int ret, cache, checkalias;
   1694
   1695	memset(&cpa, 0, sizeof(cpa));
   1696
   1697	/*
   1698	 * Check if we are requested to set an unsupported
   1699	 * feature.  Clearing unsupported features is OK.
   1700	 */
   1701	mask_set = canon_pgprot(mask_set);
   1702
   1703	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
   1704		return 0;
   1705
   1706	/* Ensure we are PAGE_SIZE aligned */
   1707	if (in_flag & CPA_ARRAY) {
   1708		int i;
   1709		for (i = 0; i < numpages; i++) {
   1710			if (addr[i] & ~PAGE_MASK) {
   1711				addr[i] &= PAGE_MASK;
   1712				WARN_ON_ONCE(1);
   1713			}
   1714		}
   1715	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
   1716		/*
   1717		 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
   1718		 * No need to check in that case
   1719		 */
   1720		if (*addr & ~PAGE_MASK) {
   1721			*addr &= PAGE_MASK;
   1722			/*
   1723			 * People should not be passing in unaligned addresses:
   1724			 */
   1725			WARN_ON_ONCE(1);
   1726		}
   1727	}
   1728
   1729	/* Must avoid aliasing mappings in the highmem code */
   1730	kmap_flush_unused();
   1731
   1732	vm_unmap_aliases();
   1733
   1734	cpa.vaddr = addr;
   1735	cpa.pages = pages;
   1736	cpa.numpages = numpages;
   1737	cpa.mask_set = mask_set;
   1738	cpa.mask_clr = mask_clr;
   1739	cpa.flags = 0;
   1740	cpa.curpage = 0;
   1741	cpa.force_split = force_split;
   1742
   1743	if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
   1744		cpa.flags |= in_flag;
   1745
   1746	/* No alias checking for _NX bit modifications */
   1747	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
   1748	/* Has caller explicitly disabled alias checking? */
   1749	if (in_flag & CPA_NO_CHECK_ALIAS)
   1750		checkalias = 0;
   1751
   1752	ret = __change_page_attr_set_clr(&cpa, checkalias);
   1753
   1754	/*
   1755	 * Check whether we really changed something:
   1756	 */
   1757	if (!(cpa.flags & CPA_FLUSHTLB))
   1758		goto out;
   1759
   1760	/*
   1761	 * No need to flush, when we did not set any of the caching
   1762	 * attributes:
   1763	 */
   1764	cache = !!pgprot2cachemode(mask_set);
   1765
   1766	/*
   1767	 * On error; flush everything to be sure.
   1768	 */
   1769	if (ret) {
   1770		cpa_flush_all(cache);
   1771		goto out;
   1772	}
   1773
   1774	cpa_flush(&cpa, cache);
   1775out:
   1776	return ret;
   1777}
   1778
   1779static inline int change_page_attr_set(unsigned long *addr, int numpages,
   1780				       pgprot_t mask, int array)
   1781{
   1782	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
   1783		(array ? CPA_ARRAY : 0), NULL);
   1784}
   1785
   1786static inline int change_page_attr_clear(unsigned long *addr, int numpages,
   1787					 pgprot_t mask, int array)
   1788{
   1789	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
   1790		(array ? CPA_ARRAY : 0), NULL);
   1791}
   1792
   1793static inline int cpa_set_pages_array(struct page **pages, int numpages,
   1794				       pgprot_t mask)
   1795{
   1796	return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
   1797		CPA_PAGES_ARRAY, pages);
   1798}
   1799
   1800static inline int cpa_clear_pages_array(struct page **pages, int numpages,
   1801					 pgprot_t mask)
   1802{
   1803	return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
   1804		CPA_PAGES_ARRAY, pages);
   1805}
   1806
   1807/*
   1808 * __set_memory_prot is an internal helper for callers that have been passed
   1809 * a pgprot_t value from upper layers, for which a reservation has already
   1810 * been taken.  If you want to set the pgprot to a specific page protection,
   1811 * use the set_memory_xx() functions.
   1812 */
   1813int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
   1814{
   1815	return change_page_attr_set_clr(&addr, numpages, prot,
   1816					__pgprot(~pgprot_val(prot)), 0, 0,
   1817					NULL);
   1818}
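
       /*
        * Illustrative sketch (editor's addition, not part of the original
        * file): a caller that already holds a memtype reservation for the
        * range could apply an explicit protection, e.g. read/write plus
        * write-combining, in one call:
        *
        *	pgprot_t prot = __pgprot(_PAGE_PRESENT | _PAGE_RW |
        *				 cachemode2protval(_PAGE_CACHE_MODE_WC));
        *
        *	ret = __set_memory_prot(vaddr, numpages, prot);
        *
        * Because mask_clr is ~pgprot_val(prot), any protection bits not in
        * @prot are cleared at the same time.
        */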
   1819
   1820int _set_memory_uc(unsigned long addr, int numpages)
   1821{
   1822	/*
   1823	 * For now this is UC MINUS; see the comments in ioremap().
   1824	 * If you really need strong UC use ioremap_uc(), but note
   1825	 * that you cannot override IO areas with set_memory_*() as
   1826	 * these helpers cannot work with IO memory.
   1827	 */
   1828	return change_page_attr_set(&addr, numpages,
   1829				    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
   1830				    0);
   1831}
   1832
   1833int set_memory_uc(unsigned long addr, int numpages)
   1834{
   1835	int ret;
   1836
   1837	/*
   1838	 * For now this is UC MINUS; see the comments in ioremap().
   1839	 */
   1840	ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
   1841			      _PAGE_CACHE_MODE_UC_MINUS, NULL);
   1842	if (ret)
   1843		goto out_err;
   1844
   1845	ret = _set_memory_uc(addr, numpages);
   1846	if (ret)
   1847		goto out_free;
   1848
   1849	return 0;
   1850
   1851out_free:
   1852	memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
   1853out_err:
   1854	return ret;
   1855}
   1856EXPORT_SYMBOL(set_memory_uc);
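
       /*
        * Illustrative usage (editor's addition, not part of the original
        * file): a driver that needs an uncached view of normal RAM would
        * typically pair set_memory_uc() with set_memory_wb() to restore the
        * default caching before the pages are freed, e.g.:
        *
        *	unsigned long buf = __get_free_pages(GFP_KERNEL, order);
        *
        *	ret = set_memory_uc(buf, 1 << order);
        *	...
        *	set_memory_wb(buf, 1 << order);
        *	free_pages(buf, order);
        *
        * set_memory_uc() reserves the memtype for the range and
        * set_memory_wb() releases that reservation again.
        */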
   1857
   1858int _set_memory_wc(unsigned long addr, int numpages)
   1859{
   1860	int ret;
   1861
   1862	ret = change_page_attr_set(&addr, numpages,
   1863				   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
   1864				   0);
   1865	if (!ret) {
   1866		ret = change_page_attr_set_clr(&addr, numpages,
   1867					       cachemode2pgprot(_PAGE_CACHE_MODE_WC),
   1868					       __pgprot(_PAGE_CACHE_MASK),
   1869					       0, 0, NULL);
   1870	}
   1871	return ret;
   1872}
   1873
   1874int set_memory_wc(unsigned long addr, int numpages)
   1875{
   1876	int ret;
   1877
   1878	ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
   1879		_PAGE_CACHE_MODE_WC, NULL);
   1880	if (ret)
   1881		return ret;
   1882
   1883	ret = _set_memory_wc(addr, numpages);
   1884	if (ret)
   1885		memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
   1886
   1887	return ret;
   1888}
   1889EXPORT_SYMBOL(set_memory_wc);
   1890
   1891int _set_memory_wt(unsigned long addr, int numpages)
   1892{
   1893	return change_page_attr_set(&addr, numpages,
   1894				    cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
   1895}
   1896
   1897int _set_memory_wb(unsigned long addr, int numpages)
   1898{
   1899	/* WB cache mode is hard wired to all cache attribute bits being 0 */
   1900	return change_page_attr_clear(&addr, numpages,
   1901				      __pgprot(_PAGE_CACHE_MASK), 0);
   1902}
   1903
   1904int set_memory_wb(unsigned long addr, int numpages)
   1905{
   1906	int ret;
   1907
   1908	ret = _set_memory_wb(addr, numpages);
   1909	if (ret)
   1910		return ret;
   1911
   1912	memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
   1913	return 0;
   1914}
   1915EXPORT_SYMBOL(set_memory_wb);
   1916
   1917/* Prevent speculative access to a page by marking it not-present */
   1918#ifdef CONFIG_X86_64
   1919int set_mce_nospec(unsigned long pfn)
   1920{
   1921	unsigned long decoy_addr;
   1922	int rc;
   1923
   1924	/* SGX pages are not in the 1:1 map */
   1925	if (arch_is_platform_page(pfn << PAGE_SHIFT))
   1926		return 0;
   1927	/*
   1928	 * We would like to just call:
   1929	 *      set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
   1930	 * but doing that would radically increase the odds of a
   1931	 * speculative access to the poison page because we'd have
   1932	 * the virtual address of the kernel 1:1 mapping sitting
   1933	 * around in registers.
   1934	 * Instead we get tricky.  We create a non-canonical address
   1935	 * that looks just like the one we want, but has bit 63 flipped.
   1936	 * This relies on set_memory_XX() properly sanitizing any __pa()
   1937	 * results with __PHYSICAL_MASK or PTE_PFN_MASK.
   1938	 */
   1939	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
   1940
   1941	rc = set_memory_np(decoy_addr, 1);
   1942	if (rc)
   1943		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
   1944	return rc;
   1945}
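
       /*
        * Worked example (editor's addition, not part of the original file),
        * assuming the default 4-level, non-randomized direct map base
        * PAGE_OFFSET == 0xffff888000000000 and pfn == 0x1234:
        *
        *	vaddr      = PAGE_OFFSET + (pfn << PAGE_SHIFT)
        *	           = 0xffff888001234000
        *	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63))
        *	           = 0x7fff888001234000
        *
        * The decoy differs from the real 1:1 address only in bit 63, so it is
        * non-canonical and never usable for a stray speculative load, while
        * __pa() masking with __PHYSICAL_MASK still yields the right pfn.
        */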
   1946
   1947static int set_memory_present(unsigned long *addr, int numpages)
   1948{
   1949	return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0);
   1950}
   1951
   1952/* Restore full speculative operation to the pfn. */
   1953int clear_mce_nospec(unsigned long pfn)
   1954{
   1955	unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);
   1956
   1957	return set_memory_present(&addr, 1);
   1958}
   1959EXPORT_SYMBOL_GPL(clear_mce_nospec);
   1960#endif /* CONFIG_X86_64 */
   1961
   1962int set_memory_x(unsigned long addr, int numpages)
   1963{
   1964	if (!(__supported_pte_mask & _PAGE_NX))
   1965		return 0;
   1966
   1967	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
   1968}
   1969
   1970int set_memory_nx(unsigned long addr, int numpages)
   1971{
   1972	if (!(__supported_pte_mask & _PAGE_NX))
   1973		return 0;
   1974
   1975	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
   1976}
   1977
   1978int set_memory_ro(unsigned long addr, int numpages)
   1979{
   1980	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
   1981}
   1982
   1983int set_memory_rw(unsigned long addr, int numpages)
   1984{
   1985	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
   1986}
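
       /*
        * Illustrative usage (editor's addition, not part of the original
        * file): code that produces executable text at runtime, such as the
        * module loader or JIT paths, typically write-protects the region and
        * then makes it executable once it has been populated:
        *
        *	set_memory_ro(addr, numpages);
        *	set_memory_x(addr, numpages);
        *
        * and reverses this with set_memory_nx()/set_memory_rw() before the
        * memory is freed.
        */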
   1987
   1988int set_memory_np(unsigned long addr, int numpages)
   1989{
   1990	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
   1991}
   1992
   1993int set_memory_np_noalias(unsigned long addr, int numpages)
   1994{
   1995	int cpa_flags = CPA_NO_CHECK_ALIAS;
   1996
   1997	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
   1998					__pgprot(_PAGE_PRESENT), 0,
   1999					cpa_flags, NULL);
   2000}
   2001
   2002int set_memory_4k(unsigned long addr, int numpages)
   2003{
   2004	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
   2005					__pgprot(0), 1, 0, NULL);
   2006}
   2007
   2008int set_memory_nonglobal(unsigned long addr, int numpages)
   2009{
   2010	return change_page_attr_clear(&addr, numpages,
   2011				      __pgprot(_PAGE_GLOBAL), 0);
   2012}
   2013
   2014int set_memory_global(unsigned long addr, int numpages)
   2015{
   2016	return change_page_attr_set(&addr, numpages,
   2017				    __pgprot(_PAGE_GLOBAL), 0);
   2018}
   2019
   2020/*
   2021 * __set_memory_enc_pgtable() is used for hypervisors that are informed
   2022 * about the "encryption" status via page tables.
   2023 */
   2024static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
   2025{
   2026	pgprot_t empty = __pgprot(0);
   2027	struct cpa_data cpa;
   2028	int ret;
   2029
   2030	/* Should not be working on unaligned addresses */
   2031	if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
   2032		addr &= PAGE_MASK;
   2033
   2034	memset(&cpa, 0, sizeof(cpa));
   2035	cpa.vaddr = &addr;
   2036	cpa.numpages = numpages;
   2037	cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty);
   2038	cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty);
   2039	cpa.pgd = init_mm.pgd;
   2040
   2041	/* Must avoid aliasing mappings in the highmem code */
   2042	kmap_flush_unused();
   2043	vm_unmap_aliases();
   2044
   2045	/* Flush the caches as needed before changing the encryption attribute. */
   2046	if (x86_platform.guest.enc_tlb_flush_required(enc))
   2047		cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());
   2048
   2049	/* Notify hypervisor that we are about to set/clr encryption attribute. */
   2050	x86_platform.guest.enc_status_change_prepare(addr, numpages, enc);
   2051
   2052	ret = __change_page_attr_set_clr(&cpa, 1);
   2053
   2054	/*
   2055	 * After changing the encryption attribute, we need to flush TLBs again
   2056	 * in case any speculative TLB caching occurred (but no need to flush
   2057	 * caches again).  We could just use cpa_flush_all(), but in case TLB
   2058	 * flushing gets optimized in the cpa_flush() path use the same logic
   2059	 * as above.
   2060	 */
   2061	cpa_flush(&cpa, 0);
   2062
   2063	/* Notify hypervisor that we have successfully set/clr encryption attribute. */
   2064	if (!ret) {
   2065		if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
   2066			ret = -EIO;
   2067	}
   2068
   2069	return ret;
   2070}
   2071
   2072static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
   2073{
   2074	if (hv_is_isolation_supported())
   2075		return hv_set_mem_host_visibility(addr, numpages, !enc);
   2076
   2077	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
   2078		return __set_memory_enc_pgtable(addr, numpages, enc);
   2079
   2080	return 0;
   2081}
   2082
   2083int set_memory_encrypted(unsigned long addr, int numpages)
   2084{
   2085	return __set_memory_enc_dec(addr, numpages, true);
   2086}
   2087EXPORT_SYMBOL_GPL(set_memory_encrypted);
   2088
   2089int set_memory_decrypted(unsigned long addr, int numpages)
   2090{
   2091	return __set_memory_enc_dec(addr, numpages, false);
   2092}
   2093EXPORT_SYMBOL_GPL(set_memory_decrypted);
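
       /*
        * Illustrative usage (editor's addition, not part of the original
        * file): a guest driver running with SEV/TDX-style memory encryption
        * that wants to share a buffer with the hypervisor decrypts it after
        * allocation and re-encrypts it before freeing:
        *
        *	unsigned long va = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0);
        *
        *	ret = set_memory_decrypted(va, 1);
        *	...
        *	set_memory_encrypted(va, 1);
        *	free_pages(va, 0);
        *
        * Both calls end up in __set_memory_enc_dec() above, which picks the
        * hypervisor-specific mechanism.
        */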
   2094
   2095int set_pages_uc(struct page *page, int numpages)
   2096{
   2097	unsigned long addr = (unsigned long)page_address(page);
   2098
   2099	return set_memory_uc(addr, numpages);
   2100}
   2101EXPORT_SYMBOL(set_pages_uc);
   2102
   2103static int _set_pages_array(struct page **pages, int numpages,
   2104		enum page_cache_mode new_type)
   2105{
   2106	unsigned long start;
   2107	unsigned long end;
   2108	enum page_cache_mode set_type;
   2109	int i;
   2110	int free_idx;
   2111	int ret;
   2112
   2113	for (i = 0; i < numpages; i++) {
   2114		if (PageHighMem(pages[i]))
   2115			continue;
   2116		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
   2117		end = start + PAGE_SIZE;
   2118		if (memtype_reserve(start, end, new_type, NULL))
   2119			goto err_out;
   2120	}
   2121
   2122	/* If WC, set to UC- first and then WC */
   2123	set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
   2124				_PAGE_CACHE_MODE_UC_MINUS : new_type;
   2125
   2126	ret = cpa_set_pages_array(pages, numpages,
   2127				  cachemode2pgprot(set_type));
   2128	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
   2129		ret = change_page_attr_set_clr(NULL, numpages,
   2130					       cachemode2pgprot(
   2131						_PAGE_CACHE_MODE_WC),
   2132					       __pgprot(_PAGE_CACHE_MASK),
   2133					       0, CPA_PAGES_ARRAY, pages);
   2134	if (ret)
   2135		goto err_out;
   2136	return 0; /* Success */
   2137err_out:
   2138	free_idx = i;
   2139	for (i = 0; i < free_idx; i++) {
   2140		if (PageHighMem(pages[i]))
   2141			continue;
   2142		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
   2143		end = start + PAGE_SIZE;
   2144		memtype_free(start, end);
   2145	}
   2146	return -EINVAL;
   2147}
   2148
   2149int set_pages_array_uc(struct page **pages, int numpages)
   2150{
   2151	return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS);
   2152}
   2153EXPORT_SYMBOL(set_pages_array_uc);
   2154
   2155int set_pages_array_wc(struct page **pages, int numpages)
   2156{
   2157	return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC);
   2158}
   2159EXPORT_SYMBOL(set_pages_array_wc);
   2160
   2161int set_pages_wb(struct page *page, int numpages)
   2162{
   2163	unsigned long addr = (unsigned long)page_address(page);
   2164
   2165	return set_memory_wb(addr, numpages);
   2166}
   2167EXPORT_SYMBOL(set_pages_wb);
   2168
   2169int set_pages_array_wb(struct page **pages, int numpages)
   2170{
   2171	int retval;
   2172	unsigned long start;
   2173	unsigned long end;
   2174	int i;
   2175
   2176	/* WB cache mode is hard wired to all cache attribute bits being 0 */
   2177	retval = cpa_clear_pages_array(pages, numpages,
   2178			__pgprot(_PAGE_CACHE_MASK));
   2179	if (retval)
   2180		return retval;
   2181
   2182	for (i = 0; i < numpages; i++) {
   2183		if (PageHighMem(pages[i]))
   2184			continue;
   2185		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
   2186		end = start + PAGE_SIZE;
   2187		memtype_free(start, end);
   2188	}
   2189
   2190	return 0;
   2191}
   2192EXPORT_SYMBOL(set_pages_array_wb);
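
       /*
        * Illustrative usage (editor's addition, not part of the original
        * file): graphics drivers commonly flip an array of allocated pages to
        * write-combining before mapping them to userspace and back to
        * write-back before handing them back to the page allocator:
        *
        *	ret = set_pages_array_wc(pages, numpages);
        *	...
        *	set_pages_array_wb(pages, numpages);
        */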
   2193
   2194int set_pages_ro(struct page *page, int numpages)
   2195{
   2196	unsigned long addr = (unsigned long)page_address(page);
   2197
   2198	return set_memory_ro(addr, numpages);
   2199}
   2200
   2201int set_pages_rw(struct page *page, int numpages)
   2202{
   2203	unsigned long addr = (unsigned long)page_address(page);
   2204
   2205	return set_memory_rw(addr, numpages);
   2206}
   2207
   2208static int __set_pages_p(struct page *page, int numpages)
   2209{
   2210	unsigned long tempaddr = (unsigned long) page_address(page);
   2211	struct cpa_data cpa = { .vaddr = &tempaddr,
   2212				.pgd = NULL,
   2213				.numpages = numpages,
   2214				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
   2215				.mask_clr = __pgprot(0),
   2216				.flags = 0};
   2217
   2218	/*
   2219	 * No alias checking needed for setting the present flag; otherwise,
   2220	 * we may need to break large pages for 64-bit kernel text
   2221	 * mappings (this adds to complexity if we want to do this from
   2222	 * atomic context especially). Let's keep it simple!
   2223	 */
   2224	return __change_page_attr_set_clr(&cpa, 0);
   2225}
   2226
   2227static int __set_pages_np(struct page *page, int numpages)
   2228{
   2229	unsigned long tempaddr = (unsigned long) page_address(page);
   2230	struct cpa_data cpa = { .vaddr = &tempaddr,
   2231				.pgd = NULL,
   2232				.numpages = numpages,
   2233				.mask_set = __pgprot(0),
   2234				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
   2235				.flags = 0};
   2236
   2237	/*
   2238	 * No alias checking needed for clearing the present flag; otherwise,
   2239	 * we may need to break large pages for 64-bit kernel text
   2240	 * mappings (this adds to complexity if we want to do this from
   2241	 * atomic context especially). Let's keep it simple!
   2242	 */
   2243	return __change_page_attr_set_clr(&cpa, 0);
   2244}
   2245
   2246int set_direct_map_invalid_noflush(struct page *page)
   2247{
   2248	return __set_pages_np(page, 1);
   2249}
   2250
   2251int set_direct_map_default_noflush(struct page *page)
   2252{
   2253	return __set_pages_p(page, 1);
   2254}
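
       /*
        * Illustrative note (editor's addition, not part of the original
        * file): the *_noflush() helpers only edit the page tables; callers
        * are expected to flush the TLB for the affected range themselves,
        * roughly:
        *
        *	unsigned long addr = (unsigned long)page_address(page);
        *
        *	set_direct_map_invalid_noflush(page);
        *	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
        */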
   2255
   2256#ifdef CONFIG_DEBUG_PAGEALLOC
   2257void __kernel_map_pages(struct page *page, int numpages, int enable)
   2258{
   2259	if (PageHighMem(page))
   2260		return;
   2261	if (!enable) {
   2262		debug_check_no_locks_freed(page_address(page),
   2263					   numpages * PAGE_SIZE);
   2264	}
   2265
   2266	/*
   2267	 * The return value is ignored as the calls cannot fail.
   2268	 * Large pages for identity mappings are not used at boot time,
   2269	 * so no memory allocations are needed during a large page split.
   2270	 */
   2271	if (enable)
   2272		__set_pages_p(page, numpages);
   2273	else
   2274		__set_pages_np(page, numpages);
   2275
   2276	/*
   2277	 * We should perform an IPI and flush all TLBs,
   2278	 * but that can deadlock, so flush only the current CPU.
   2279	 * Preemption needs to be disabled around __flush_tlb_all() due to
   2280	 * CR3 reload in __native_flush_tlb().
   2281	 */
   2282	preempt_disable();
   2283	__flush_tlb_all();
   2284	preempt_enable();
   2285
   2286	arch_flush_lazy_mmu_mode();
   2287}
   2288#endif /* CONFIG_DEBUG_PAGEALLOC */
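
       /*
        * Illustrative note (editor's addition, not part of the original
        * file): with CONFIG_DEBUG_PAGEALLOC the page allocator calls the hook
        * above so that freed pages become not-present and stray accesses
        * fault immediately, roughly:
        *
        *	free:  __kernel_map_pages(page, 1 << order, 0);
        *	alloc: __kernel_map_pages(page, 1 << order, 1);
        */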
   2289
   2290bool kernel_page_present(struct page *page)
   2291{
   2292	unsigned int level;
   2293	pte_t *pte;
   2294
   2295	if (PageHighMem(page))
   2296		return false;
   2297
   2298	pte = lookup_address((unsigned long)page_address(page), &level);
   2299	return (pte_val(*pte) & _PAGE_PRESENT);
   2300}
   2301
   2302int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
   2303				   unsigned numpages, unsigned long page_flags)
   2304{
   2305	int retval = -EINVAL;
   2306
   2307	struct cpa_data cpa = {
   2308		.vaddr = &address,
   2309		.pfn = pfn,
   2310		.pgd = pgd,
   2311		.numpages = numpages,
   2312		.mask_set = __pgprot(0),
   2313		.mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
   2314		.flags = 0,
   2315	};
   2316
   2317	WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
   2318
   2319	if (!(__supported_pte_mask & _PAGE_NX))
   2320		goto out;
   2321
   2322	if (!(page_flags & _PAGE_ENC))
   2323		cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);
   2324
   2325	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
   2326
   2327	retval = __change_page_attr_set_clr(&cpa, 0);
   2328	__flush_tlb_all();
   2329
   2330out:
   2331	return retval;
   2332}
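
       /*
        * Illustrative sketch (editor's addition, not part of the original
        * file): the EFI code is the intended early-boot user of this helper,
        * mapping each runtime region into its private page table with
        * explicit flags, roughly (efi_pgd and md are the names used by the
        * EFI code and are only shown for illustration):
        *
        *	u64 pfn = md->phys_addr >> PAGE_SHIFT;
        *
        *	err = kernel_map_pages_in_pgd(efi_pgd, pfn, md->virt_addr,
        *				      md->num_pages, flags);
        */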
   2333
   2334/*
   2335 * __flush_tlb_all() flushes mappings only on the current CPU, hence this
   2336 * function shouldn't be used in an SMP environment. Presently, it's used only
   2337 * during boot (way before smp_init()) by the EFI subsystem, so this is OK.
   2338 */
   2339int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
   2340				     unsigned long numpages)
   2341{
   2342	int retval;
   2343
   2344	/*
   2345	 * The typical sequence for unmapping is to find a pte through
   2346	 * lookup_address_in_pgd() (ideally, it should never return NULL because
   2347	 * the address is already mapped) and change its protections. As pfn is
   2348	 * the *target* of a mapping, it's not useful while unmapping.
   2349	 */
   2350	struct cpa_data cpa = {
   2351		.vaddr		= &address,
   2352		.pfn		= 0,
   2353		.pgd		= pgd,
   2354		.numpages	= numpages,
   2355		.mask_set	= __pgprot(0),
   2356		.mask_clr	= __pgprot(_PAGE_PRESENT | _PAGE_RW),
   2357		.flags		= 0,
   2358	};
   2359
   2360	WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
   2361
   2362	retval = __change_page_attr_set_clr(&cpa, 0);
   2363	__flush_tlb_all();
   2364
   2365	return retval;
   2366}
   2367
   2368/*
   2369 * The testcases use internal knowledge of the implementation that shouldn't
   2370 * be exposed to the rest of the kernel. Include these directly here.
   2371 */
   2372#ifdef CONFIG_CPA_DEBUG
   2373#include "cpa-test.c"
   2374#endif