pgtable.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
pgtable.c (15427B)
      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
      4 */
      5
      6#include <linux/sched.h>
      7#include <linux/mm_types.h>
      8#include <linux/memblock.h>
      9#include <linux/memremap.h>
     10#include <linux/pkeys.h>
     11#include <linux/debugfs.h>
     12#include <misc/cxl-base.h>
     13
     14#include <asm/pgalloc.h>
     15#include <asm/tlb.h>
     16#include <asm/trace.h>
     17#include <asm/powernv.h>
     18#include <asm/firmware.h>
     19#include <asm/ultravisor.h>
     20#include <asm/kexec.h>
     21
     22#include <mm/mmu_decl.h>
     23#include <trace/events/thp.h>
     24
     25#include "internal.h"
     26
/* Table of MMU page-size definitions, one slot per MMU_PAGE_* index. */
struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
EXPORT_SYMBOL_GPL(mmu_psize_defs);

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/* Page-size index used for the vmemmap; starts out as MMU_PAGE_4K. */
int mmu_vmemmap_psize = MMU_PAGE_4K;
#endif

/*
 * NOTE(review): presumably the values behind PMD_FRAG_NR and
 * PMD_FRAG_SIZE used by the pmd fragment allocator in this file --
 * confirm against the pgalloc headers.
 */
unsigned long __pmd_frag_nr;
EXPORT_SYMBOL(__pmd_frag_nr);
unsigned long __pmd_frag_size_shift;
EXPORT_SYMBOL(__pmd_frag_size_shift);
     38
     39#ifdef CONFIG_TRANSPARENT_HUGEPAGE
     40/*
     41 * This is called when relaxing access to a hugepage. It's also called in the page
     42 * fault path when we don't hit any of the major fault cases, ie, a minor
     43 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
     44 * handled those two for us, we additionally deal with missing execute
     45 * permission here on some processors
     46 */
     47int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
     48			  pmd_t *pmdp, pmd_t entry, int dirty)
     49{
     50	int changed;
     51#ifdef CONFIG_DEBUG_VM
     52	WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
     53	assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp));
     54#endif
     55	changed = !pmd_same(*(pmdp), entry);
     56	if (changed) {
     57		/*
     58		 * We can use MMU_PAGE_2M here, because only radix
     59		 * path look at the psize.
     60		 */
     61		__ptep_set_access_flags(vma, pmdp_ptep(pmdp),
     62					pmd_pte(entry), address, MMU_PAGE_2M);
     63	}
     64	return changed;
     65}
     66
     67int pmdp_test_and_clear_young(struct vm_area_struct *vma,
     68			      unsigned long address, pmd_t *pmdp)
     69{
     70	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
     71}
     72/*
     73 * set a new huge pmd. We should not be called for updating
     74 * an existing pmd entry. That should go via pmd_hugepage_update.
     75 */
     76void set_pmd_at(struct mm_struct *mm, unsigned long addr,
     77		pmd_t *pmdp, pmd_t pmd)
     78{
     79#ifdef CONFIG_DEBUG_VM
     80	/*
     81	 * Make sure hardware valid bit is not set. We don't do
     82	 * tlb flush for this update.
     83	 */
     84
     85	WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
     86	assert_spin_locked(pmd_lockptr(mm, pmdp));
     87	WARN_ON(!(pmd_large(pmd)));
     88#endif
     89	trace_hugepage_set_pmd(addr, pmd_val(pmd));
     90	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
     91}
     92
     93static void do_serialize(void *arg)
     94{
     95	/* We've taken the IPI, so try to trim the mask while here */
     96	if (radix_enabled()) {
     97		struct mm_struct *mm = arg;
     98		exit_lazy_flush_tlb(mm, false);
     99	}
    100}
    101
    102/*
    103 * Serialize against find_current_mm_pte which does lock-less
    104 * lookup in page tables with local interrupts disabled. For huge pages
    105 * it casts pmd_t to pte_t. Since format of pte_t is different from
    106 * pmd_t we want to prevent transit from pmd pointing to page table
    107 * to pmd pointing to huge page (and back) while interrupts are disabled.
    108 * We clear pmd to possibly replace it with page table pointer in
    109 * different code paths. So make sure we wait for the parallel
    110 * find_current_mm_pte to finish.
    111 */
    112void serialize_against_pte_lookup(struct mm_struct *mm)
    113{
    114	smp_mb();
    115	smp_call_function_many(mm_cpumask(mm), do_serialize, mm, 1);
    116}
    117
    118/*
    119 * We use this to invalidate a pmdp entry before switching from a
    120 * hugepte to regular pmd entry.
    121 */
    122pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
    123		     pmd_t *pmdp)
    124{
    125	unsigned long old_pmd;
    126
    127	old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
    128	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
    129	return __pmd(old_pmd);
    130}
    131
    132pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
    133				   unsigned long addr, pmd_t *pmdp, int full)
    134{
    135	pmd_t pmd;
    136	VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
    137	VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
    138		   !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
    139	pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
    140	/*
    141	 * if it not a fullmm flush, then we can possibly end up converting
    142	 * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
    143	 * Make sure we flush the tlb in this case.
    144	 */
    145	if (!full)
    146		flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
    147	return pmd;
    148}
    149
    150static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
    151{
    152	return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
    153}
    154
    155/*
    156 * At some point we should be able to get rid of
    157 * pmd_mkhuge() and mk_huge_pmd() when we update all the
    158 * other archs to mark the pmd huge in pfn_pmd()
    159 */
    160pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
    161{
    162	unsigned long pmdv;
    163
    164	pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
    165
    166	return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
    167}
    168
    169pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
    170{
    171	return pfn_pmd(page_to_pfn(page), pgprot);
    172}
    173
    174pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
    175{
    176	unsigned long pmdv;
    177
    178	pmdv = pmd_val(pmd);
    179	pmdv &= _HPAGE_CHG_MASK;
    180	return pmd_set_protbits(__pmd(pmdv), newprot);
    181}
    182#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
    183
    184/* For use by kexec, called with MMU off */
    185notrace void mmu_cleanup_all(void)
    186{
    187	if (radix_enabled())
    188		radix__mmu_cleanup_all();
    189	else if (mmu_hash_ops.hpte_clear_all)
    190		mmu_hash_ops.hpte_clear_all();
    191
    192	reset_sprs();
    193}
    194
    195#ifdef CONFIG_MEMORY_HOTPLUG
    196int __meminit create_section_mapping(unsigned long start, unsigned long end,
    197				     int nid, pgprot_t prot)
    198{
    199	if (radix_enabled())
    200		return radix__create_section_mapping(start, end, nid, prot);
    201
    202	return hash__create_section_mapping(start, end, nid, prot);
    203}
    204
    205int __meminit remove_section_mapping(unsigned long start, unsigned long end)
    206{
    207	if (radix_enabled())
    208		return radix__remove_section_mapping(start, end);
    209
    210	return hash__remove_section_mapping(start, end);
    211}
    212#endif /* CONFIG_MEMORY_HOTPLUG */
    213
    214void __init mmu_partition_table_init(void)
    215{
    216	unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
    217	unsigned long ptcr;
    218
    219	/* Initialize the Partition Table with no entries */
    220	partition_tb = memblock_alloc(patb_size, patb_size);
    221	if (!partition_tb)
    222		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
    223		      __func__, patb_size, patb_size);
    224
    225	ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
    226	set_ptcr_when_no_uv(ptcr);
    227	powernv_set_nmmu_ptcr(ptcr);
    228}
    229
    230static void flush_partition(unsigned int lpid, bool radix)
    231{
    232	if (radix) {
    233		radix__flush_all_lpid(lpid);
    234		radix__flush_all_lpid_guest(lpid);
    235	} else {
    236		asm volatile("ptesync" : : : "memory");
    237		asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
    238			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
    239		/* do we need fixup here ?*/
    240		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
    241		trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
    242	}
    243}
    244
    245void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
    246				  unsigned long dw1, bool flush)
    247{
    248	unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
    249
    250	/*
    251	 * When ultravisor is enabled, the partition table is stored in secure
    252	 * memory and can only be accessed doing an ultravisor call. However, we
    253	 * maintain a copy of the partition table in normal memory to allow Nest
    254	 * MMU translations to occur (for normal VMs).
    255	 *
    256	 * Therefore, here we always update partition_tb, regardless of whether
    257	 * we are running under an ultravisor or not.
    258	 */
    259	partition_tb[lpid].patb0 = cpu_to_be64(dw0);
    260	partition_tb[lpid].patb1 = cpu_to_be64(dw1);
    261
    262	/*
    263	 * If ultravisor is enabled, we do an ultravisor call to register the
    264	 * partition table entry (PATE), which also do a global flush of TLBs
    265	 * and partition table caches for the lpid. Otherwise, just do the
    266	 * flush. The type of flush (hash or radix) depends on what the previous
    267	 * use of the partition ID was, not the new use.
    268	 */
    269	if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) {
    270		uv_register_pate(lpid, dw0, dw1);
    271		pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n",
    272			dw0, dw1);
    273	} else if (flush) {
    274		/*
    275		 * Boot does not need to flush, because MMU is off and each
    276		 * CPU does a tlbiel_all() before switching them on, which
    277		 * flushes everything.
    278		 */
    279		flush_partition(lpid, (old & PATB_HR));
    280	}
    281}
    282EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
    283
    284static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
    285{
    286	void *pmd_frag, *ret;
    287
    288	if (PMD_FRAG_NR == 1)
    289		return NULL;
    290
    291	spin_lock(&mm->page_table_lock);
    292	ret = mm->context.pmd_frag;
    293	if (ret) {
    294		pmd_frag = ret + PMD_FRAG_SIZE;
    295		/*
    296		 * If we have taken up all the fragments mark PTE page NULL
    297		 */
    298		if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0)
    299			pmd_frag = NULL;
    300		mm->context.pmd_frag = pmd_frag;
    301	}
    302	spin_unlock(&mm->page_table_lock);
    303	return (pmd_t *)ret;
    304}
    305
    306static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
    307{
    308	void *ret = NULL;
    309	struct page *page;
    310	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
    311
    312	if (mm == &init_mm)
    313		gfp &= ~__GFP_ACCOUNT;
    314	page = alloc_page(gfp);
    315	if (!page)
    316		return NULL;
    317	if (!pgtable_pmd_page_ctor(page)) {
    318		__free_pages(page, 0);
    319		return NULL;
    320	}
    321
    322	atomic_set(&page->pt_frag_refcount, 1);
    323
    324	ret = page_address(page);
    325	/*
    326	 * if we support only one fragment just return the
    327	 * allocated page.
    328	 */
    329	if (PMD_FRAG_NR == 1)
    330		return ret;
    331
    332	spin_lock(&mm->page_table_lock);
    333	/*
    334	 * If we find pgtable_page set, we return
    335	 * the allocated page with single fragment
    336	 * count.
    337	 */
    338	if (likely(!mm->context.pmd_frag)) {
    339		atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR);
    340		mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
    341	}
    342	spin_unlock(&mm->page_table_lock);
    343
    344	return (pmd_t *)ret;
    345}
    346
    347pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
    348{
    349	pmd_t *pmd;
    350
    351	pmd = get_pmd_from_cache(mm);
    352	if (pmd)
    353		return pmd;
    354
    355	return __alloc_for_pmdcache(mm);
    356}
    357
    358void pmd_fragment_free(unsigned long *pmd)
    359{
    360	struct page *page = virt_to_page(pmd);
    361
    362	if (PageReserved(page))
    363		return free_reserved_page(page);
    364
    365	BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
    366	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
    367		pgtable_pmd_page_dtor(page);
    368		__free_page(page);
    369	}
    370}
    371
    372static inline void pgtable_free(void *table, int index)
    373{
    374	switch (index) {
    375	case PTE_INDEX:
    376		pte_fragment_free(table, 0);
    377		break;
    378	case PMD_INDEX:
    379		pmd_fragment_free(table);
    380		break;
    381	case PUD_INDEX:
    382		__pud_free(table);
    383		break;
    384#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
    385		/* 16M hugepd directory at pud level */
    386	case HTLB_16M_INDEX:
    387		BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0);
    388		kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table);
    389		break;
    390		/* 16G hugepd directory at the pgd level */
    391	case HTLB_16G_INDEX:
    392		BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0);
    393		kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table);
    394		break;
    395#endif
    396		/* We don't free pgd table via RCU callback */
    397	default:
    398		BUG();
    399	}
    400}
    401
    402void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
    403{
    404	unsigned long pgf = (unsigned long)table;
    405
    406	BUG_ON(index > MAX_PGTABLE_INDEX_SIZE);
    407	pgf |= index;
    408	tlb_remove_table(tlb, (void *)pgf);
    409}
    410
    411void __tlb_remove_table(void *_table)
    412{
    413	void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
    414	unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
    415
    416	return pgtable_free(table, index);
    417}
    418
    419#ifdef CONFIG_PROC_FS
/* Per-page-size counters, indexed by MMU_PAGE_*; reported by arch_report_meminfo(). */
atomic_long_t direct_pages_count[MMU_PAGE_COUNT];
    421
    422void arch_report_meminfo(struct seq_file *m)
    423{
    424	/*
    425	 * Hash maps the memory with one size mmu_linear_psize.
    426	 * So don't bother to print these on hash
    427	 */
    428	if (!radix_enabled())
    429		return;
    430	seq_printf(m, "DirectMap4k:    %8lu kB\n",
    431		   atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2);
    432	seq_printf(m, "DirectMap64k:    %8lu kB\n",
    433		   atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6);
    434	seq_printf(m, "DirectMap2M:    %8lu kB\n",
    435		   atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11);
    436	seq_printf(m, "DirectMap1G:    %8lu kB\n",
    437		   atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20);
    438}
    439#endif /* CONFIG_PROC_FS */
    440
    441pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
    442			     pte_t *ptep)
    443{
    444	unsigned long pte_val;
    445
    446	/*
    447	 * Clear the _PAGE_PRESENT so that no hardware parallel update is
    448	 * possible. Also keep the pte_present true so that we don't take
    449	 * wrong fault.
    450	 */
    451	pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);
    452
    453	return __pte(pte_val);
    454
    455}
    456
    457void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
    458			     pte_t *ptep, pte_t old_pte, pte_t pte)
    459{
    460	if (radix_enabled())
    461		return radix__ptep_modify_prot_commit(vma, addr,
    462						      ptep, old_pte, pte);
    463	set_pte_at(vma->vm_mm, addr, ptep, pte);
    464}
    465
    466/*
    467 * For hash translation mode, we use the deposited table to store hash slot
    468 * information and they are stored at PTRS_PER_PMD offset from related pmd
    469 * location. Hence a pmd move requires deposit and withdraw.
    470 *
    471 * For radix translation with split pmd ptl, we store the deposited table in the
    472 * pmd page. Hence if we have different pmd page we need to withdraw during pmd
    473 * move.
    474 *
    475 * With hash we use deposited table always irrespective of anon or not.
    476 * With radix we use deposited table only for anonymous mapping.
    477 */
    478int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
    479			   struct spinlock *old_pmd_ptl,
    480			   struct vm_area_struct *vma)
    481{
    482	if (radix_enabled())
    483		return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
    484
    485	return true;
    486}
    487
/*
 * Does the CPU support tlbie?
 *
 * Defaults to true; cleared by the "disable_tlbie" boot parameter when
 * radix is enabled.
 */
bool tlbie_capable __read_mostly = true;
EXPORT_SYMBOL(tlbie_capable);

/*
 * Should tlbie be used for management of CPU TLBs, for kernel and process
 * address spaces? tlbie may still be used for nMMU accelerators, and for KVM
 * guest address spaces.
 *
 * Defaults to true; cleared together with tlbie_capable by
 * "disable_tlbie", and exposed through debugfs as "tlbie_enabled" when
 * tlbie_capable is still set.
 */
bool tlbie_enabled __read_mostly = true;
    500
    501static int __init setup_disable_tlbie(char *str)
    502{
    503	if (!radix_enabled()) {
    504		pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n");
    505		return 1;
    506	}
    507
    508	tlbie_capable = false;
    509	tlbie_enabled = false;
    510
    511        return 1;
    512}
    513__setup("disable_tlbie", setup_disable_tlbie);
    514
    515static int __init pgtable_debugfs_setup(void)
    516{
    517	if (!tlbie_capable)
    518		return 0;
    519
    520	/*
    521	 * There is no locking vs tlb flushing when changing this value.
    522	 * The tlb flushers will see one value or another, and use either
    523	 * tlbie or tlbiel with IPIs. In both cases the TLBs will be
    524	 * invalidated as expected.
    525	 */
    526	debugfs_create_bool("tlbie_enabled", 0600,
    527			arch_debugfs_dir,
    528			&tlbie_enabled);
    529
    530	return 0;
    531}
    532arch_initcall(pgtable_debugfs_setup);
    533
    534#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN)
    535/*
    536 * Override the generic version in mm/memremap.c.
    537 *
    538 * With hash translation, the direct-map range is mapped with just one
    539 * page size selected by htab_init_page_sizes(). Consult
    540 * mmu_psize_defs[] to determine the minimum page size alignment.
    541*/
    542unsigned long memremap_compat_align(void)
    543{
    544	if (!radix_enabled()) {
    545		unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift;
    546		return max(SUBSECTION_SIZE, 1UL << shift);
    547	}
    548
    549	return SUBSECTION_SIZE;
    550}
    551EXPORT_SYMBOL_GPL(memremap_compat_align);
    552#endif
    553
    554pgprot_t vm_get_page_prot(unsigned long vm_flags)
    555{
    556	unsigned long prot = pgprot_val(protection_map[vm_flags &
    557					(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
    558
    559	if (vm_flags & VM_SAO)
    560		prot |= _PAGE_SAO;
    561
    562#ifdef CONFIG_PPC_MEM_KEYS
    563	prot |= vmflag_to_pte_pkey_bits(vm_flags);
    564#endif
    565
    566	return __pgprot(prot);
    567}
    568EXPORT_SYMBOL(vm_get_page_prot);