cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

pgtable.h (47001B)


      1/* SPDX-License-Identifier: GPL-2.0 */
      2#ifndef _LINUX_PGTABLE_H
      3#define _LINUX_PGTABLE_H
      4
      5#include <linux/pfn.h>
      6#include <asm/pgtable.h>
      7
      8#ifndef __ASSEMBLY__
      9#ifdef CONFIG_MMU
     10
     11#include <linux/mm_types.h>
     12#include <linux/bug.h>
     13#include <linux/errno.h>
     14#include <asm-generic/pgtable_uffd.h>
     15#include <linux/page_table_check.h>
     16
     17#if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
     18	defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
     19#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED
     20#endif
     21
     22/*
     23 * On almost all architectures and configurations, 0 can be used as the
     24 * upper ceiling to free_pgtables(): on many architectures it has the same
     25 * effect as using TASK_SIZE.  However, there is one configuration which
     26 * must impose a more careful limit, to avoid freeing kernel pgtables.
     27 */
     28#ifndef USER_PGTABLES_CEILING
     29#define USER_PGTABLES_CEILING	0UL
     30#endif
     31
     32/*
     33 * This defines the first usable user address. Platforms
     34 * can override its value with a custom FIRST_USER_ADDRESS
     35 * defined in their respective <asm/pgtable.h>.
     36 */
     37#ifndef FIRST_USER_ADDRESS
     38#define FIRST_USER_ADDRESS	0UL
     39#endif
     40
     41/*
     42 * This defines the generic helper for accessing the PMD page
     43 * table page. Platforms can still override this via their
     44 * respective <asm/pgtable.h>.
     45 */
     46#ifndef pmd_pgtable
     47#define pmd_pgtable(pmd) pmd_page(pmd)
     48#endif
     49
     50/*
     51 * A page table page can be thought of as an array like this: pXd_t[PTRS_PER_PxD]
     52 *
     53 * The pXx_index() functions return the index of the entry in the page
     54 * table page which would control the given virtual address.
     55 *
     56 * As these functions may be used by the same code for different levels of
     57 * the page table folding, they are always available, regardless of
     58 * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0
     59 * because in such cases PTRS_PER_PxD equals 1.
     60 */
     61
     62static inline unsigned long pte_index(unsigned long address)
     63{
     64	return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
     65}
     66#define pte_index pte_index
     67
     68#ifndef pmd_index
     69static inline unsigned long pmd_index(unsigned long address)
     70{
     71	return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
     72}
     73#define pmd_index pmd_index
     74#endif
     75
     76#ifndef pud_index
     77static inline unsigned long pud_index(unsigned long address)
     78{
     79	return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
     80}
     81#define pud_index pud_index
     82#endif
     83
     84#ifndef pgd_index
     85/* Must be a compile-time constant, so implement it as a macro */
     86#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
     87#endif
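/*
 * Illustrative sketch (example-local variables; the address value is
 * arbitrary and used purely for illustration): the per-level index helpers
 * above decompose a virtual address into one index per page-table level:
 *
 *	unsigned long addr  = 0x7f0000001000UL;
 *	unsigned long i_pgd = pgd_index(addr);
 *	unsigned long i_pud = pud_index(addr);
 *	unsigned long i_pmd = pmd_index(addr);
 *	unsigned long i_pte = pte_index(addr);
 *
 * For a folded level PTRS_PER_PxD is 1, so the corresponding index is
 * always 0 and the same code works for any CONFIG_PGTABLE_LEVELS.
 */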
     88
     89#ifndef pte_offset_kernel
     90static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
     91{
     92	return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
     93}
     94#define pte_offset_kernel pte_offset_kernel
     95#endif
     96
     97#if defined(CONFIG_HIGHPTE)
     98#define pte_offset_map(dir, address)				\
     99	((pte_t *)kmap_atomic(pmd_page(*(dir))) +		\
    100	 pte_index((address)))
    101#define pte_unmap(pte) kunmap_atomic((pte))
    102#else
    103#define pte_offset_map(dir, address)	pte_offset_kernel((dir), (address))
    104#define pte_unmap(pte) ((void)(pte))	/* NOP */
    105#endif
    106
    107/* Find an entry in the second-level page table.. */
    108#ifndef pmd_offset
    109static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
    110{
    111	return pud_pgtable(*pud) + pmd_index(address);
    112}
    113#define pmd_offset pmd_offset
    114#endif
    115
    116#ifndef pud_offset
    117static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
    118{
    119	return p4d_pgtable(*p4d) + pud_index(address);
    120}
    121#define pud_offset pud_offset
    122#endif
    123
    124static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
    125{
    126	return (pgd + pgd_index(address));
    127};
    128
    129/*
    130 * a shortcut to get a pgd_t in a given mm
    131 */
    132#ifndef pgd_offset
    133#define pgd_offset(mm, address)		pgd_offset_pgd((mm)->pgd, (address))
    134#endif
    135
    136/*
    137 * a shortcut which implies the use of the kernel's pgd, instead
    138 * of a process's
    139 */
    140#ifndef pgd_offset_k
    141#define pgd_offset_k(address)		pgd_offset(&init_mm, (address))
    142#endif
    143
    144/*
    145 * In many cases it is known that a virtual address is mapped at PMD or PTE
    146 * level, so instead of traversing all the page table levels, we can get a
    147 * pointer to the PMD entry in user or kernel page table or translate a virtual
    148 * address to the pointer in the PTE in the kernel page tables with simple
    149 * helpers.
    150 */
    151static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va)
    152{
    153	return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va);
    154}
    155
    156static inline pmd_t *pmd_off_k(unsigned long va)
    157{
    158	return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
    159}
    160
    161static inline pte_t *virt_to_kpte(unsigned long vaddr)
    162{
    163	pmd_t *pmd = pmd_off_k(vaddr);
    164
    165	return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr);
    166}
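/*
 * A minimal sketch of what these shortcuts replace (identifiers are local
 * to the example): for a kernel virtual address known to be mapped at PTE
 * level, the full walk
 *
 *	pgd_t *pgd = pgd_offset_k(vaddr);
 *	p4d_t *p4d = p4d_offset(pgd, vaddr);
 *	pud_t *pud = pud_offset(p4d, vaddr);
 *	pmd_t *pmd = pmd_offset(pud, vaddr);
 *	pte_t *pte = pte_offset_kernel(pmd, vaddr);
 *
 * collapses to
 *
 *	pte_t *pte = virt_to_kpte(vaddr);
 *
 * with the addition that virt_to_kpte() returns NULL for a none PMD.
 */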
    167
    168#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
    169extern int ptep_set_access_flags(struct vm_area_struct *vma,
    170				 unsigned long address, pte_t *ptep,
    171				 pte_t entry, int dirty);
    172#endif
    173
    174#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
    175#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    176extern int pmdp_set_access_flags(struct vm_area_struct *vma,
    177				 unsigned long address, pmd_t *pmdp,
    178				 pmd_t entry, int dirty);
    179extern int pudp_set_access_flags(struct vm_area_struct *vma,
    180				 unsigned long address, pud_t *pudp,
    181				 pud_t entry, int dirty);
    182#else
    183static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
    184					unsigned long address, pmd_t *pmdp,
    185					pmd_t entry, int dirty)
    186{
    187	BUILD_BUG();
    188	return 0;
    189}
    190static inline int pudp_set_access_flags(struct vm_area_struct *vma,
    191					unsigned long address, pud_t *pudp,
    192					pud_t entry, int dirty)
    193{
    194	BUILD_BUG();
    195	return 0;
    196}
    197#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
    198#endif
    199
    200#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
    201static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
    202					    unsigned long address,
    203					    pte_t *ptep)
    204{
    205	pte_t pte = *ptep;
    206	int r = 1;
    207	if (!pte_young(pte))
    208		r = 0;
    209	else
    210		set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte));
    211	return r;
    212}
    213#endif
    214
    215#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
    216#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    217static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
    218					    unsigned long address,
    219					    pmd_t *pmdp)
    220{
    221	pmd_t pmd = *pmdp;
    222	int r = 1;
    223	if (!pmd_young(pmd))
    224		r = 0;
    225	else
    226		set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd));
    227	return r;
    228}
    229#else
    230static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
    231					    unsigned long address,
    232					    pmd_t *pmdp)
    233{
    234	BUILD_BUG();
    235	return 0;
    236}
    237#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
    238#endif
    239
    240#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
    241int ptep_clear_flush_young(struct vm_area_struct *vma,
    242			   unsigned long address, pte_t *ptep);
    243#endif
    244
    245#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
    246#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    247extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
    248				  unsigned long address, pmd_t *pmdp);
    249#else
    250/*
     251 * Although relevant to THP only, this API is called from generic rmap code
     252 * under PageTransHuge(), hence it needs a dummy implementation for !THP.
    253 */
    254static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
    255					 unsigned long address, pmd_t *pmdp)
    256{
    257	BUILD_BUG();
    258	return 0;
    259}
    260#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
    261#endif
    262
    263#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
    264static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
    265				       unsigned long address,
    266				       pte_t *ptep)
    267{
    268	pte_t pte = *ptep;
    269	pte_clear(mm, address, ptep);
    270	page_table_check_pte_clear(mm, address, pte);
    271	return pte;
    272}
    273#endif
    274
    275static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
    276			      pte_t *ptep)
    277{
    278	ptep_get_and_clear(mm, addr, ptep);
    279}
    280
    281#ifndef __HAVE_ARCH_PTEP_GET
    282static inline pte_t ptep_get(pte_t *ptep)
    283{
    284	return READ_ONCE(*ptep);
    285}
    286#endif
    287
    288#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
    289/*
    290 * WARNING: only to be used in the get_user_pages_fast() implementation.
    291 *
    292 * With get_user_pages_fast(), we walk down the pagetables without taking any
    293 * locks.  For this we would like to load the pointers atomically, but sometimes
    294 * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE).  What
    295 * we do have is the guarantee that a PTE will only either go from not present
    296 * to present, or present to not present or both -- it will not switch to a
    297 * completely different present page without a TLB flush in between; something
    298 * that we are blocking by holding interrupts off.
    299 *
    300 * Setting ptes from not present to present goes:
    301 *
    302 *   ptep->pte_high = h;
    303 *   smp_wmb();
    304 *   ptep->pte_low = l;
    305 *
    306 * And present to not present goes:
    307 *
    308 *   ptep->pte_low = 0;
    309 *   smp_wmb();
    310 *   ptep->pte_high = 0;
    311 *
    312 * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
    313 * We load pte_high *after* loading pte_low, which ensures we don't see an older
    314 * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
    315 * picked up a changed pte high. We might have gotten rubbish values from
    316 * pte_low and pte_high, but we are guaranteed that pte_low will not have the
    317 * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
    318 * operates on present ptes we're safe.
    319 */
    320static inline pte_t ptep_get_lockless(pte_t *ptep)
    321{
    322	pte_t pte;
    323
    324	do {
    325		pte.pte_low = ptep->pte_low;
    326		smp_rmb();
    327		pte.pte_high = ptep->pte_high;
    328		smp_rmb();
    329	} while (unlikely(pte.pte_low != ptep->pte_low));
    330
    331	return pte;
    332}
    333#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
    334/*
    335 * We require that the PTE can be read atomically.
    336 */
    337static inline pte_t ptep_get_lockless(pte_t *ptep)
    338{
    339	return ptep_get(ptep);
    340}
    341#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
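/*
 * Usage sketch (gup_fast()-style, heavily simplified): take one lockless
 * snapshot of the entry and only ever operate on that local copy:
 *
 *	pte_t pte = ptep_get_lockless(ptep);
 *
 *	if (!pte_present(pte))
 *		return 0;
 *	if (write && !pte_write(pte))
 *		return 0;
 *	... use pte_pfn(pte) etc. on the snapshot, never re-read *ptep ...
 *
 * On CONFIG_GUP_GET_PTE_LOW_HIGH this retries until pte_low is stable;
 * otherwise it reduces to a plain atomic read via ptep_get().
 */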
    342
    343#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    344#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
    345static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
    346					    unsigned long address,
    347					    pmd_t *pmdp)
    348{
    349	pmd_t pmd = *pmdp;
    350
    351	pmd_clear(pmdp);
    352	page_table_check_pmd_clear(mm, address, pmd);
    353
    354	return pmd;
    355}
    356#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
    357#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
    358static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
    359					    unsigned long address,
    360					    pud_t *pudp)
    361{
    362	pud_t pud = *pudp;
    363
    364	pud_clear(pudp);
    365	page_table_check_pud_clear(mm, address, pud);
    366
    367	return pud;
    368}
    369#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */
    370#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
    371
    372#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    373#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
    374static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
    375					    unsigned long address, pmd_t *pmdp,
    376					    int full)
    377{
    378	return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
    379}
    380#endif
    381
    382#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
    383static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm,
    384					    unsigned long address, pud_t *pudp,
    385					    int full)
    386{
    387	return pudp_huge_get_and_clear(mm, address, pudp);
    388}
    389#endif
    390#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
    391
    392#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
    393static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
    394					    unsigned long address, pte_t *ptep,
    395					    int full)
    396{
    397	pte_t pte;
    398	pte = ptep_get_and_clear(mm, address, ptep);
    399	return pte;
    400}
    401#endif
    402
    403
    404/*
     405 * If two threads concurrently fault at the same page, the thread that
     406 * won the race updates the PTE and its local TLB/cache. The other thread
     407 * gives up, simply does nothing, and continues; on architectures where
     408 * software can update the TLB, the local TLB can be updated here to avoid
     409 * the next page fault. This function updates the TLB only; unlike
     410 * update_mmu_cache(), it does nothing with the cache or anything else.
    411 */
    412#ifndef __HAVE_ARCH_UPDATE_MMU_TLB
    413static inline void update_mmu_tlb(struct vm_area_struct *vma,
    414				unsigned long address, pte_t *ptep)
    415{
    416}
    417#define __HAVE_ARCH_UPDATE_MMU_TLB
    418#endif
    419
    420/*
    421 * Some architectures may be able to avoid expensive synchronization
     422 * primitives when modifications are made to PTEs which are already
     423 * not present, or during address space destruction.
    424 */
    425#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
    426static inline void pte_clear_not_present_full(struct mm_struct *mm,
    427					      unsigned long address,
    428					      pte_t *ptep,
    429					      int full)
    430{
    431	pte_clear(mm, address, ptep);
    432}
    433#endif
    434
    435#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
    436extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
    437			      unsigned long address,
    438			      pte_t *ptep);
    439#endif
    440
    441#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
    442extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
    443			      unsigned long address,
    444			      pmd_t *pmdp);
    445extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma,
    446			      unsigned long address,
    447			      pud_t *pudp);
    448#endif
    449
    450#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
    451struct mm_struct;
    452static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
    453{
    454	pte_t old_pte = *ptep;
    455	set_pte_at(mm, address, ptep, pte_wrprotect(old_pte));
    456}
    457#endif
    458
    459/*
     460 * On some architectures the hardware does not set the page access bit when
     461 * accessing a memory page; it is the responsibility of software to set it,
     462 * which incurs an extra page-fault penalty to track page accesses. As an
     463 * optimization, the access bit can be set during the page fault flow on
     464 * these arches. To distinguish it from pte_mkyoung(), this macro is used
     465 * on platforms where software maintains the page access bit.
    466 */
    467#ifndef pte_sw_mkyoung
    468static inline pte_t pte_sw_mkyoung(pte_t pte)
    469{
    470	return pte;
    471}
    472#define pte_sw_mkyoung	pte_sw_mkyoung
    473#endif
    474
    475#ifndef pte_savedwrite
    476#define pte_savedwrite pte_write
    477#endif
    478
    479#ifndef pte_mk_savedwrite
    480#define pte_mk_savedwrite pte_mkwrite
    481#endif
    482
    483#ifndef pte_clear_savedwrite
    484#define pte_clear_savedwrite pte_wrprotect
    485#endif
    486
    487#ifndef pmd_savedwrite
    488#define pmd_savedwrite pmd_write
    489#endif
    490
    491#ifndef pmd_mk_savedwrite
    492#define pmd_mk_savedwrite pmd_mkwrite
    493#endif
    494
    495#ifndef pmd_clear_savedwrite
    496#define pmd_clear_savedwrite pmd_wrprotect
    497#endif
    498
    499#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT
    500#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    501static inline void pmdp_set_wrprotect(struct mm_struct *mm,
    502				      unsigned long address, pmd_t *pmdp)
    503{
    504	pmd_t old_pmd = *pmdp;
    505	set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd));
    506}
    507#else
    508static inline void pmdp_set_wrprotect(struct mm_struct *mm,
    509				      unsigned long address, pmd_t *pmdp)
    510{
    511	BUILD_BUG();
    512}
    513#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
    514#endif
    515#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
    516#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
    517static inline void pudp_set_wrprotect(struct mm_struct *mm,
    518				      unsigned long address, pud_t *pudp)
    519{
    520	pud_t old_pud = *pudp;
    521
    522	set_pud_at(mm, address, pudp, pud_wrprotect(old_pud));
    523}
    524#else
    525static inline void pudp_set_wrprotect(struct mm_struct *mm,
    526				      unsigned long address, pud_t *pudp)
    527{
    528	BUILD_BUG();
    529}
    530#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
    531#endif
    532
    533#ifndef pmdp_collapse_flush
    534#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    535extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
    536				 unsigned long address, pmd_t *pmdp);
    537#else
    538static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
    539					unsigned long address,
    540					pmd_t *pmdp)
    541{
    542	BUILD_BUG();
    543	return *pmdp;
    544}
    545#define pmdp_collapse_flush pmdp_collapse_flush
    546#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
    547#endif
    548
    549#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
    550extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
    551				       pgtable_t pgtable);
    552#endif
    553
    554#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
    555extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
    556#endif
    557
    558#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    559/*
    560 * This is an implementation of pmdp_establish() that is only suitable for an
    561 * architecture that doesn't have hardware dirty/accessed bits. In this case we
     562 * can't race with a CPU that sets these bits, so a non-atomic approach is fine.
    563 */
    564static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
    565		unsigned long address, pmd_t *pmdp, pmd_t pmd)
    566{
    567	pmd_t old_pmd = *pmdp;
    568	set_pmd_at(vma->vm_mm, address, pmdp, pmd);
    569	return old_pmd;
    570}
    571#endif
    572
    573#ifndef __HAVE_ARCH_PMDP_INVALIDATE
    574extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
    575			    pmd_t *pmdp);
    576#endif
    577
    578#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
    579
    580/*
    581 * pmdp_invalidate_ad() invalidates the PMD while changing a transparent
    582 * hugepage mapping in the page tables. This function is similar to
    583 * pmdp_invalidate(), but should only be used if the access and dirty bits would
    584 * not be cleared by the software in the new PMD value. The function ensures
     585 * that hardware updates of the access and dirty bits are not lost.
     586 *
     587 * Doing so allows certain architectures to avoid a TLB flush in most
     588 * cases. Another TLB flush might still be necessary later if the PMD update
     589 * itself requires one (e.g., if protection was made stricter). Even
     590 * when a TLB flush is needed because of the update, the caller may be able
     591 * to batch these TLB flushing operations, so fewer TLB flush operations are
     592 * needed.
    593 */
    594extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
    595				unsigned long address, pmd_t *pmdp);
    596#endif
    597
    598#ifndef __HAVE_ARCH_PTE_SAME
    599static inline int pte_same(pte_t pte_a, pte_t pte_b)
    600{
    601	return pte_val(pte_a) == pte_val(pte_b);
    602}
    603#endif
    604
    605#ifndef __HAVE_ARCH_PTE_UNUSED
    606/*
    607 * Some architectures provide facilities to virtualization guests
    608 * so that they can flag allocated pages as unused. This allows the
    609 * host to transparently reclaim unused pages. This function returns
    610 * whether the pte's page is unused.
    611 */
    612static inline int pte_unused(pte_t pte)
    613{
    614	return 0;
    615}
    616#endif
    617
    618#ifndef pte_access_permitted
    619#define pte_access_permitted(pte, write) \
    620	(pte_present(pte) && (!(write) || pte_write(pte)))
    621#endif
    622
    623#ifndef pmd_access_permitted
    624#define pmd_access_permitted(pmd, write) \
    625	(pmd_present(pmd) && (!(write) || pmd_write(pmd)))
    626#endif
    627
    628#ifndef pud_access_permitted
    629#define pud_access_permitted(pud, write) \
    630	(pud_present(pud) && (!(write) || pud_write(pud)))
    631#endif
    632
    633#ifndef p4d_access_permitted
    634#define p4d_access_permitted(p4d, write) \
    635	(p4d_present(p4d) && (!(write) || p4d_write(p4d)))
    636#endif
    637
    638#ifndef pgd_access_permitted
    639#define pgd_access_permitted(pgd, write) \
    640	(pgd_present(pgd) && (!(write) || pgd_write(pgd)))
    641#endif
    642
    643#ifndef __HAVE_ARCH_PMD_SAME
    644static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
    645{
    646	return pmd_val(pmd_a) == pmd_val(pmd_b);
    647}
    648
    649static inline int pud_same(pud_t pud_a, pud_t pud_b)
    650{
    651	return pud_val(pud_a) == pud_val(pud_b);
    652}
    653#endif
    654
    655#ifndef __HAVE_ARCH_P4D_SAME
    656static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b)
    657{
    658	return p4d_val(p4d_a) == p4d_val(p4d_b);
    659}
    660#endif
    661
    662#ifndef __HAVE_ARCH_PGD_SAME
    663static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b)
    664{
    665	return pgd_val(pgd_a) == pgd_val(pgd_b);
    666}
    667#endif
    668
    669/*
    670 * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
    671 * TLB flush will be required as a result of the "set". For example, use
    672 * in scenarios where it is known ahead of time that the routine is
    673 * setting non-present entries, or re-setting an existing entry to the
    674 * same value. Otherwise, use the typical "set" helpers and flush the
    675 * TLB.
    676 */
    677#define set_pte_safe(ptep, pte) \
    678({ \
    679	WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \
    680	set_pte(ptep, pte); \
    681})
    682
    683#define set_pmd_safe(pmdp, pmd) \
    684({ \
    685	WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \
    686	set_pmd(pmdp, pmd); \
    687})
    688
    689#define set_pud_safe(pudp, pud) \
    690({ \
    691	WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \
    692	set_pud(pudp, pud); \
    693})
    694
    695#define set_p4d_safe(p4dp, p4d) \
    696({ \
    697	WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \
    698	set_p4d(p4dp, p4d); \
    699})
    700
    701#define set_pgd_safe(pgdp, pgd) \
    702({ \
    703	WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \
    704	set_pgd(pgdp, pgd); \
    705})
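/*
 * Minimal usage sketch (early boot-style code populating a mapping that is
 * known not to need a TLB flush; pfn and ptep are example-local and the
 * usual pfn_pte()/PAGE_KERNEL arch helpers are assumed):
 *
 *	set_pte_safe(ptep, pfn_pte(pfn, PAGE_KERNEL));
 *
 * If the entry may already be present with a different value, use the plain
 * set_pte()/set_pmd()/... helpers and flush the TLB instead.
 */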
    706
    707#ifndef __HAVE_ARCH_DO_SWAP_PAGE
    708/*
    709 * Some architectures support metadata associated with a page. When a
    710 * page is being swapped out, this metadata must be saved so it can be
    711 * restored when the page is swapped back in. SPARC M7 and newer
    712 * processors support an ADI (Application Data Integrity) tag for the
    713 * page as metadata for the page. arch_do_swap_page() can restore this
    714 * metadata when a page is swapped back in.
    715 */
    716static inline void arch_do_swap_page(struct mm_struct *mm,
    717				     struct vm_area_struct *vma,
    718				     unsigned long addr,
    719				     pte_t pte, pte_t oldpte)
    720{
    721
    722}
    723#endif
    724
    725#ifndef __HAVE_ARCH_UNMAP_ONE
    726/*
    727 * Some architectures support metadata associated with a page. When a
    728 * page is being swapped out, this metadata must be saved so it can be
    729 * restored when the page is swapped back in. SPARC M7 and newer
    730 * processors support an ADI (Application Data Integrity) tag for the
    731 * page as metadata for the page. arch_unmap_one() can save this
    732 * metadata on a swap-out of a page.
    733 */
    734static inline int arch_unmap_one(struct mm_struct *mm,
    735				  struct vm_area_struct *vma,
    736				  unsigned long addr,
    737				  pte_t orig_pte)
    738{
    739	return 0;
    740}
    741#endif
    742
    743/*
    744 * Allow architectures to preserve additional metadata associated with
    745 * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function
    746 * prototypes must be defined in the arch-specific asm/pgtable.h file.
    747 */
    748#ifndef __HAVE_ARCH_PREPARE_TO_SWAP
    749static inline int arch_prepare_to_swap(struct page *page)
    750{
    751	return 0;
    752}
    753#endif
    754
    755#ifndef __HAVE_ARCH_SWAP_INVALIDATE
    756static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
    757{
    758}
    759
    760static inline void arch_swap_invalidate_area(int type)
    761{
    762}
    763#endif
    764
    765#ifndef __HAVE_ARCH_SWAP_RESTORE
    766static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
    767{
    768}
    769#endif
    770
    771#ifndef __HAVE_ARCH_PGD_OFFSET_GATE
    772#define pgd_offset_gate(mm, addr)	pgd_offset(mm, addr)
    773#endif
    774
    775#ifndef __HAVE_ARCH_MOVE_PTE
    776#define move_pte(pte, prot, old_addr, new_addr)	(pte)
    777#endif
    778
    779#ifndef pte_accessible
    780# define pte_accessible(mm, pte)	((void)(pte), 1)
    781#endif
    782
    783#ifndef flush_tlb_fix_spurious_fault
    784#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
    785#endif
    786
    787/*
    788 * When walking page tables, get the address of the next boundary,
    789 * or the end address of the range if that comes earlier.  Although no
     790 * vma end wraps to 0, the rounded-up __boundary may wrap to 0 throughout.
    791 */
    792
    793#define pgd_addr_end(addr, end)						\
    794({	unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;	\
    795	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
    796})
    797
    798#ifndef p4d_addr_end
    799#define p4d_addr_end(addr, end)						\
    800({	unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK;	\
    801	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
    802})
    803#endif
    804
    805#ifndef pud_addr_end
    806#define pud_addr_end(addr, end)						\
    807({	unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK;	\
    808	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
    809})
    810#endif
    811
    812#ifndef pmd_addr_end
    813#define pmd_addr_end(addr, end)						\
    814({	unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;	\
    815	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
    816})
    817#endif
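/*
 * Sketch of the loop shape these macros are built for (mirrors the generic
 * walkers in mm/, simplified; variables are example-local):
 *
 *	unsigned long next;
 *
 *	do {
 *		next = pmd_addr_end(addr, end);
 *		... handle [addr, next) at PMD granularity ...
 *	} while (pmd++, addr = next, addr != end);
 *
 * The "- 1" comparisons keep the macros correct even when the rounded-up
 * boundary wraps to 0 at the very top of the address space.
 */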
    818
    819/*
     820 * When walking page tables, we usually want to skip any p?d_none entries,
     821 * and any p?d_bad entries - reporting the error before resetting them to none.
    822 * Do the tests inline, but report and clear the bad entry in mm/memory.c.
    823 */
    824void pgd_clear_bad(pgd_t *);
    825
    826#ifndef __PAGETABLE_P4D_FOLDED
    827void p4d_clear_bad(p4d_t *);
    828#else
    829#define p4d_clear_bad(p4d)        do { } while (0)
    830#endif
    831
    832#ifndef __PAGETABLE_PUD_FOLDED
    833void pud_clear_bad(pud_t *);
    834#else
     835#define pud_clear_bad(pud)        do { } while (0)
    836#endif
    837
    838void pmd_clear_bad(pmd_t *);
    839
    840static inline int pgd_none_or_clear_bad(pgd_t *pgd)
    841{
    842	if (pgd_none(*pgd))
    843		return 1;
    844	if (unlikely(pgd_bad(*pgd))) {
    845		pgd_clear_bad(pgd);
    846		return 1;
    847	}
    848	return 0;
    849}
    850
    851static inline int p4d_none_or_clear_bad(p4d_t *p4d)
    852{
    853	if (p4d_none(*p4d))
    854		return 1;
    855	if (unlikely(p4d_bad(*p4d))) {
    856		p4d_clear_bad(p4d);
    857		return 1;
    858	}
    859	return 0;
    860}
    861
    862static inline int pud_none_or_clear_bad(pud_t *pud)
    863{
    864	if (pud_none(*pud))
    865		return 1;
    866	if (unlikely(pud_bad(*pud))) {
    867		pud_clear_bad(pud);
    868		return 1;
    869	}
    870	return 0;
    871}
    872
    873static inline int pmd_none_or_clear_bad(pmd_t *pmd)
    874{
    875	if (pmd_none(*pmd))
    876		return 1;
    877	if (unlikely(pmd_bad(*pmd))) {
    878		pmd_clear_bad(pmd);
    879		return 1;
    880	}
    881	return 0;
    882}
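/*
 * Usage sketch at the top level (the same shape repeats per level): skip
 * none entries, report and clear bad ones, descend otherwise:
 *
 *	pgd_t *pgd = pgd_offset(mm, addr);
 *
 *	do {
 *		next = pgd_addr_end(addr, end);
 *		if (pgd_none_or_clear_bad(pgd))
 *			continue;
 *		... descend into the p4d level for [addr, next) ...
 *	} while (pgd++, addr = next, addr != end);
 */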
    883
    884static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma,
    885					     unsigned long addr,
    886					     pte_t *ptep)
    887{
    888	/*
    889	 * Get the current pte state, but zero it out to make it
    890	 * non-present, preventing the hardware from asynchronously
    891	 * updating it.
    892	 */
    893	return ptep_get_and_clear(vma->vm_mm, addr, ptep);
    894}
    895
    896static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
    897					     unsigned long addr,
    898					     pte_t *ptep, pte_t pte)
    899{
    900	/*
    901	 * The pte is non-present, so there's no hardware state to
    902	 * preserve.
    903	 */
    904	set_pte_at(vma->vm_mm, addr, ptep, pte);
    905}
    906
    907#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
    908/*
    909 * Start a pte protection read-modify-write transaction, which
    910 * protects against asynchronous hardware modifications to the pte.
    911 * The intention is not to prevent the hardware from making pte
    912 * updates, but to prevent any updates it may make from being lost.
    913 *
    914 * This does not protect against other software modifications of the
    915 * pte; the appropriate pte lock must be held over the transaction.
    916 *
    917 * Note that this interface is intended to be batchable, meaning that
    918 * ptep_modify_prot_commit may not actually update the pte, but merely
    919 * queue the update to be done at some later time.  The update must be
    920 * actually committed before the pte lock is released, however.
    921 */
    922static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
    923					   unsigned long addr,
    924					   pte_t *ptep)
    925{
    926	return __ptep_modify_prot_start(vma, addr, ptep);
    927}
    928
    929/*
    930 * Commit an update to a pte, leaving any hardware-controlled bits in
    931 * the PTE unmodified.
    932 */
    933static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
    934					   unsigned long addr,
    935					   pte_t *ptep, pte_t old_pte, pte_t pte)
    936{
    937	__ptep_modify_prot_commit(vma, addr, ptep, pte);
    938}
    939#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
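/*
 * A minimal sketch of the transaction pattern (pte lock held by the caller,
 * as required; newprot is example-local):
 *
 *	pte_t old_pte, new_pte;
 *
 *	old_pte = ptep_modify_prot_start(vma, addr, ptep);
 *	new_pte = pte_modify(old_pte, newprot);
 *	ptep_modify_prot_commit(vma, addr, ptep, old_pte, new_pte);
 *
 * Between start and commit the entry is non-present (or the update is
 * queued on paravirt implementations), so hardware A/D bit updates cannot
 * be lost.
 */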
    940#endif /* CONFIG_MMU */
    941
    942/*
    943 * No-op macros that just return the current protection value. Defined here
    944 * because these macros can be used even if CONFIG_MMU is not defined.
    945 */
    946
    947#ifndef pgprot_nx
    948#define pgprot_nx(prot)	(prot)
    949#endif
    950
    951#ifndef pgprot_noncached
    952#define pgprot_noncached(prot)	(prot)
    953#endif
    954
    955#ifndef pgprot_writecombine
    956#define pgprot_writecombine pgprot_noncached
    957#endif
    958
    959#ifndef pgprot_writethrough
    960#define pgprot_writethrough pgprot_noncached
    961#endif
    962
    963#ifndef pgprot_device
    964#define pgprot_device pgprot_noncached
    965#endif
    966
    967#ifndef pgprot_mhp
    968#define pgprot_mhp(prot)	(prot)
    969#endif
    970
    971#ifdef CONFIG_MMU
    972#ifndef pgprot_modify
    973#define pgprot_modify pgprot_modify
    974static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
    975{
    976	if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot)))
    977		newprot = pgprot_noncached(newprot);
    978	if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot)))
    979		newprot = pgprot_writecombine(newprot);
    980	if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot)))
    981		newprot = pgprot_device(newprot);
    982	return newprot;
    983}
    984#endif
    985#endif /* CONFIG_MMU */
    986
    987#ifndef pgprot_encrypted
    988#define pgprot_encrypted(prot)	(prot)
    989#endif
    990
    991#ifndef pgprot_decrypted
    992#define pgprot_decrypted(prot)	(prot)
    993#endif
    994
    995/*
    996 * A facility to provide lazy MMU batching.  This allows PTE updates and
    997 * page invalidations to be delayed until a call to leave lazy MMU mode
    998 * is issued.  Some architectures may benefit from doing this, and it is
    999 * beneficial for both shadow and direct mode hypervisors, which may batch
   1000 * the PTE updates which happen during this window.  Note that using this
   1001 * interface requires that read hazards be removed from the code.  A read
   1002 * hazard could result in the direct mode hypervisor case, since the actual
    1003 * write to the page tables may not yet have taken place, so reads through
   1004 * a raw PTE pointer after it has been modified are not guaranteed to be
   1005 * up to date.  This mode can only be entered and left under the protection of
   1006 * the page table locks for all page tables which may be modified.  In the UP
   1007 * case, this is required so that preemption is disabled, and in the SMP case,
   1008 * it must synchronize the delayed page table writes properly on other CPUs.
   1009 */
   1010#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
   1011#define arch_enter_lazy_mmu_mode()	do {} while (0)
   1012#define arch_leave_lazy_mmu_mode()	do {} while (0)
   1013#define arch_flush_lazy_mmu_mode()	do {} while (0)
   1014#endif
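/*
 * Usage sketch (simplified from the generic PTE-range loops): wrap a batch
 * of updates, made under the page table lock, in enter/leave:
 *
 *	arch_enter_lazy_mmu_mode();
 *	for (; addr != end; ptep++, addr += PAGE_SIZE)
 *		set_pte_at(mm, addr, ptep, pte_mkold(ptep_get(ptep)));
 *	arch_leave_lazy_mmu_mode();
 *
 * Per the read-hazard note above, do not re-read an entry through a raw
 * PTE pointer after modifying it inside the batch.
 */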
   1015
   1016/*
   1017 * A facility to provide batching of the reload of page tables and
   1018 * other process state with the actual context switch code for
   1019 * paravirtualized guests.  By convention, only one of the batched
   1020 * update (lazy) modes (CPU, MMU) should be active at any given time,
   1021 * entry should never be nested, and entry and exits should always be
   1022 * paired.  This is for sanity of maintaining and reasoning about the
   1023 * kernel code.  In this case, the exit (end of the context switch) is
   1024 * in architecture-specific code, and so doesn't need a generic
   1025 * definition.
   1026 */
   1027#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
   1028#define arch_start_context_switch(prev)	do {} while (0)
   1029#endif
   1030
   1031/*
   1032 * When replacing an anonymous page by a real (!non) swap entry, we clear
   1033 * PG_anon_exclusive from the page and instead remember whether the flag was
   1034 * set in the swp pte. During fork(), we have to mark the entry as !exclusive
   1035 * (possibly shared). On swapin, we use that information to restore
   1036 * PG_anon_exclusive, which is very helpful in cases where we might have
   1037 * additional (e.g., FOLL_GET) references on a page and wouldn't be able to
   1038 * detect exclusivity.
   1039 *
   1040 * These functions don't apply to non-swap entries (e.g., migration, hwpoison,
   1041 * ...).
   1042 */
   1043#ifndef __HAVE_ARCH_PTE_SWP_EXCLUSIVE
   1044static inline pte_t pte_swp_mkexclusive(pte_t pte)
   1045{
   1046	return pte;
   1047}
   1048
   1049static inline int pte_swp_exclusive(pte_t pte)
   1050{
   1051	return false;
   1052}
   1053
   1054static inline pte_t pte_swp_clear_exclusive(pte_t pte)
   1055{
   1056	return pte;
   1057}
   1058#endif
   1059
   1060#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
   1061#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
   1062static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
   1063{
   1064	return pmd;
   1065}
   1066
   1067static inline int pmd_swp_soft_dirty(pmd_t pmd)
   1068{
   1069	return 0;
   1070}
   1071
   1072static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
   1073{
   1074	return pmd;
   1075}
   1076#endif
   1077#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */
   1078static inline int pte_soft_dirty(pte_t pte)
   1079{
   1080	return 0;
   1081}
   1082
   1083static inline int pmd_soft_dirty(pmd_t pmd)
   1084{
   1085	return 0;
   1086}
   1087
   1088static inline pte_t pte_mksoft_dirty(pte_t pte)
   1089{
   1090	return pte;
   1091}
   1092
   1093static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
   1094{
   1095	return pmd;
   1096}
   1097
   1098static inline pte_t pte_clear_soft_dirty(pte_t pte)
   1099{
   1100	return pte;
   1101}
   1102
   1103static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
   1104{
   1105	return pmd;
   1106}
   1107
   1108static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
   1109{
   1110	return pte;
   1111}
   1112
   1113static inline int pte_swp_soft_dirty(pte_t pte)
   1114{
   1115	return 0;
   1116}
   1117
   1118static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
   1119{
   1120	return pte;
   1121}
   1122
   1123static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
   1124{
   1125	return pmd;
   1126}
   1127
   1128static inline int pmd_swp_soft_dirty(pmd_t pmd)
   1129{
   1130	return 0;
   1131}
   1132
   1133static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
   1134{
   1135	return pmd;
   1136}
   1137#endif
   1138
   1139#ifndef __HAVE_PFNMAP_TRACKING
   1140/*
   1141 * Interfaces that can be used by architecture code to keep track of
    1142 * the memory type of pfn mappings specified by remap_pfn_range() and
    1143 * vmf_insert_pfn().
   1144 */
   1145
   1146/*
   1147 * track_pfn_remap is called when a _new_ pfn mapping is being established
   1148 * by remap_pfn_range() for physical range indicated by pfn and size.
   1149 */
   1150static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
   1151				  unsigned long pfn, unsigned long addr,
   1152				  unsigned long size)
   1153{
   1154	return 0;
   1155}
   1156
   1157/*
   1158 * track_pfn_insert is called when a _new_ single pfn is established
   1159 * by vmf_insert_pfn().
   1160 */
   1161static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
   1162				    pfn_t pfn)
   1163{
   1164}
   1165
   1166/*
   1167 * track_pfn_copy is called when vma that is covering the pfnmap gets
   1168 * copied through copy_page_range().
   1169 */
   1170static inline int track_pfn_copy(struct vm_area_struct *vma)
   1171{
   1172	return 0;
   1173}
   1174
   1175/*
   1176 * untrack_pfn is called while unmapping a pfnmap for a region.
   1177 * untrack can be called for a specific region indicated by pfn and size or
   1178 * can be for the entire vma (in which case pfn, size are zero).
   1179 */
   1180static inline void untrack_pfn(struct vm_area_struct *vma,
   1181			       unsigned long pfn, unsigned long size)
   1182{
   1183}
   1184
   1185/*
   1186 * untrack_pfn_moved is called while mremapping a pfnmap for a new region.
   1187 */
   1188static inline void untrack_pfn_moved(struct vm_area_struct *vma)
   1189{
   1190}
   1191#else
   1192extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
   1193			   unsigned long pfn, unsigned long addr,
   1194			   unsigned long size);
   1195extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
   1196			     pfn_t pfn);
   1197extern int track_pfn_copy(struct vm_area_struct *vma);
   1198extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
   1199			unsigned long size);
   1200extern void untrack_pfn_moved(struct vm_area_struct *vma);
   1201#endif
   1202
   1203#ifdef CONFIG_MMU
   1204#ifdef __HAVE_COLOR_ZERO_PAGE
   1205static inline int is_zero_pfn(unsigned long pfn)
   1206{
   1207	extern unsigned long zero_pfn;
   1208	unsigned long offset_from_zero_pfn = pfn - zero_pfn;
   1209	return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
   1210}
   1211
   1212#define my_zero_pfn(addr)	page_to_pfn(ZERO_PAGE(addr))
   1213
   1214#else
   1215static inline int is_zero_pfn(unsigned long pfn)
   1216{
   1217	extern unsigned long zero_pfn;
   1218	return pfn == zero_pfn;
   1219}
   1220
   1221static inline unsigned long my_zero_pfn(unsigned long addr)
   1222{
   1223	extern unsigned long zero_pfn;
   1224	return zero_pfn;
   1225}
   1226#endif
   1227#else
   1228static inline int is_zero_pfn(unsigned long pfn)
   1229{
   1230	return 0;
   1231}
   1232
   1233static inline unsigned long my_zero_pfn(unsigned long addr)
   1234{
   1235	return 0;
   1236}
   1237#endif /* CONFIG_MMU */
   1238
   1239#ifdef CONFIG_MMU
   1240
   1241#ifndef CONFIG_TRANSPARENT_HUGEPAGE
   1242static inline int pmd_trans_huge(pmd_t pmd)
   1243{
   1244	return 0;
   1245}
   1246#ifndef pmd_write
   1247static inline int pmd_write(pmd_t pmd)
   1248{
   1249	BUG();
   1250	return 0;
   1251}
   1252#endif /* pmd_write */
   1253#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
   1254
   1255#ifndef pud_write
   1256static inline int pud_write(pud_t pud)
   1257{
   1258	BUG();
   1259	return 0;
   1260}
   1261#endif /* pud_write */
   1262
   1263#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
   1264static inline int pmd_devmap(pmd_t pmd)
   1265{
   1266	return 0;
   1267}
   1268static inline int pud_devmap(pud_t pud)
   1269{
   1270	return 0;
   1271}
   1272static inline int pgd_devmap(pgd_t pgd)
   1273{
   1274	return 0;
   1275}
   1276#endif
   1277
   1278#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
   1279	(defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
   1280	 !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
   1281static inline int pud_trans_huge(pud_t pud)
   1282{
   1283	return 0;
   1284}
   1285#endif
   1286
   1287/* See pmd_none_or_trans_huge_or_clear_bad for discussion. */
   1288static inline int pud_none_or_trans_huge_or_dev_or_clear_bad(pud_t *pud)
   1289{
   1290	pud_t pudval = READ_ONCE(*pud);
   1291
   1292	if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval))
   1293		return 1;
   1294	if (unlikely(pud_bad(pudval))) {
   1295		pud_clear_bad(pud);
   1296		return 1;
   1297	}
   1298	return 0;
   1299}
   1300
   1301/* See pmd_trans_unstable for discussion. */
   1302static inline int pud_trans_unstable(pud_t *pud)
   1303{
   1304#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&			\
   1305	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
   1306	return pud_none_or_trans_huge_or_dev_or_clear_bad(pud);
   1307#else
   1308	return 0;
   1309#endif
   1310}
   1311
   1312#ifndef pmd_read_atomic
   1313static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
   1314{
   1315	/*
   1316	 * Depend on compiler for an atomic pmd read. NOTE: this is
   1317	 * only going to work, if the pmdval_t isn't larger than
   1318	 * an unsigned long.
   1319	 */
   1320	return *pmdp;
   1321}
   1322#endif
   1323
   1324#ifndef arch_needs_pgtable_deposit
   1325#define arch_needs_pgtable_deposit() (false)
   1326#endif
   1327/*
   1328 * This function is meant to be used by sites walking pagetables with
   1329 * the mmap_lock held in read mode to protect against MADV_DONTNEED and
   1330 * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
   1331 * into a null pmd and the transhuge page fault can convert a null pmd
    1332 * into a huge pmd or into a regular pmd (if the hugepage allocation
   1333 * fails). While holding the mmap_lock in read mode the pmd becomes
   1334 * stable and stops changing under us only if it's not null and not a
    1335 * transhuge pmd. When those races occur and this function makes a
    1336 * difference vs the standard pmd_none_or_clear_bad, the result is
    1337 * undefined, so behaving as if the pmd were none is safe (because it
   1338 * can return none anyway). The compiler level barrier() is critically
   1339 * important to compute the two checks atomically on the same pmdval.
   1340 *
   1341 * For 32bit kernels with a 64bit large pmd_t this automatically takes
   1342 * care of reading the pmd atomically to avoid SMP race conditions
    1343 * against pmd_populate() when the mmap_lock is held for reading by the
    1344 * caller (a special atomic read, not done by "gcc" as in the generic
   1345 * version above, is also needed when THP is disabled because the page
   1346 * fault can populate the pmd from under us).
   1347 */
   1348static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
   1349{
   1350	pmd_t pmdval = pmd_read_atomic(pmd);
   1351	/*
   1352	 * The barrier will stabilize the pmdval in a register or on
   1353	 * the stack so that it will stop changing under the code.
   1354	 *
   1355	 * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE,
   1356	 * pmd_read_atomic is allowed to return a not atomic pmdval
   1357	 * (for example pointing to an hugepage that has never been
   1358	 * mapped in the pmd). The below checks will only care about
   1359	 * the low part of the pmd with 32bit PAE x86 anyway, with the
   1360	 * exception of pmd_none(). So the important thing is that if
   1361	 * the low part of the pmd is found null, the high part will
   1362	 * be also null or the pmd_none() check below would be
   1363	 * confused.
   1364	 */
   1365#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   1366	barrier();
   1367#endif
   1368	/*
   1369	 * !pmd_present() checks for pmd migration entries
   1370	 *
   1371	 * The complete check uses is_pmd_migration_entry() in linux/swapops.h
   1372	 * But using that requires moving current function and pmd_trans_unstable()
   1373	 * to linux/swapops.h to resolve dependency, which is too much code move.
   1374	 *
   1375	 * !pmd_present() is equivalent to is_pmd_migration_entry() currently,
   1376	 * because !pmd_present() pages can only be under migration not swapped
   1377	 * out.
   1378	 *
   1379	 * pmd_none() is preserved for future condition checks on pmd migration
   1380	 * entries and not confusing with this function name, although it is
   1381	 * redundant with !pmd_present().
   1382	 */
   1383	if (pmd_none(pmdval) || pmd_trans_huge(pmdval) ||
   1384		(IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && !pmd_present(pmdval)))
   1385		return 1;
   1386	if (unlikely(pmd_bad(pmdval))) {
   1387		pmd_clear_bad(pmd);
   1388		return 1;
   1389	}
   1390	return 0;
   1391}
   1392
   1393/*
   1394 * This is a noop if Transparent Hugepage Support is not built into
   1395 * the kernel. Otherwise it is equivalent to
   1396 * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
   1397 * places that already verified the pmd is not none and they want to
   1398 * walk ptes while holding the mmap sem in read mode (write mode don't
   1399 * need this). If THP is not enabled, the pmd can't go away under the
   1400 * code even if MADV_DONTNEED runs, but if THP is enabled we need to
   1401 * run a pmd_trans_unstable before walking the ptes after
   1402 * split_huge_pmd returns (because it may have run when the pmd become
   1403 * null, but then a page fault can map in a THP and not a regular page).
   1404 */
   1405static inline int pmd_trans_unstable(pmd_t *pmd)
   1406{
   1407#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   1408	return pmd_none_or_trans_huge_or_clear_bad(pmd);
   1409#else
   1410	return 0;
   1411#endif
   1412}
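/*
 * Usage sketch for a walker holding mmap_lock in read mode (simplified;
 * pte and addr are example-local):
 *
 *	if (pmd_trans_unstable(pmd))
 *		return 0;
 *	pte = pte_offset_map(pmd, addr);
 *	... walk the ptes: the pmd can no longer turn into a THP under us ...
 *	pte_unmap(pte);
 *
 * Returning 0 on an unstable pmd is equivalent to treating it as none;
 * the page fault path will re-establish a stable mapping if needed.
 */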
   1413
   1414/*
    1415 * The ordering of these checks is important for pmds with _page_devmap set.
    1416 * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
    1417 * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
    1418 * returning 1, but not before it spams dmesg with the pmd_clear_bad() output.
   1419 */
   1420static inline int pmd_devmap_trans_unstable(pmd_t *pmd)
   1421{
   1422	return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
   1423}
   1424
   1425#ifndef CONFIG_NUMA_BALANCING
   1426/*
   1427 * Technically a PTE can be PROTNONE even when not doing NUMA balancing but
   1428 * the only case the kernel cares is for NUMA balancing and is only ever set
   1429 * when the VMA is accessible. For PROT_NONE VMAs, the PTEs are not marked
   1430 * _PAGE_PROTNONE so by default, implement the helper as "always no". It
   1431 * is the responsibility of the caller to distinguish between PROT_NONE
   1432 * protections and NUMA hinting fault protections.
   1433 */
   1434static inline int pte_protnone(pte_t pte)
   1435{
   1436	return 0;
   1437}
   1438
   1439static inline int pmd_protnone(pmd_t pmd)
   1440{
   1441	return 0;
   1442}
   1443#endif /* CONFIG_NUMA_BALANCING */
   1444
   1445#endif /* CONFIG_MMU */
   1446
   1447#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
   1448
   1449#ifndef __PAGETABLE_P4D_FOLDED
   1450int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot);
   1451void p4d_clear_huge(p4d_t *p4d);
   1452#else
   1453static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
   1454{
   1455	return 0;
   1456}
   1457static inline void p4d_clear_huge(p4d_t *p4d) { }
   1458#endif /* !__PAGETABLE_P4D_FOLDED */
   1459
   1460int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
   1461int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
   1462int pud_clear_huge(pud_t *pud);
   1463int pmd_clear_huge(pmd_t *pmd);
   1464int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
   1465int pud_free_pmd_page(pud_t *pud, unsigned long addr);
   1466int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
   1467#else	/* !CONFIG_HAVE_ARCH_HUGE_VMAP */
   1468static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
   1469{
   1470	return 0;
   1471}
   1472static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
   1473{
   1474	return 0;
   1475}
   1476static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
   1477{
   1478	return 0;
   1479}
   1480static inline void p4d_clear_huge(p4d_t *p4d) { }
   1481static inline int pud_clear_huge(pud_t *pud)
   1482{
   1483	return 0;
   1484}
   1485static inline int pmd_clear_huge(pmd_t *pmd)
   1486{
   1487	return 0;
   1488}
   1489static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
   1490{
   1491	return 0;
   1492}
   1493static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
   1494{
   1495	return 0;
   1496}
   1497static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
   1498{
   1499	return 0;
   1500}
   1501#endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
   1502
   1503#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
   1504#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   1505/*
   1506 * ARCHes with special requirements for evicting THP backing TLB entries can
    1507 * implement this. Otherwise, it can also help optimize the normal TLB flush
    1508 * in the THP regime. Stock flush_tlb_range() typically has an optimization to
    1509 * nuke the entire TLB if the flush span is greater than a threshold, which
    1510 * will likely be true for a single huge page. Thus a single THP flush will
    1511 * invalidate the entire TLB, which is not desirable.
   1512 * e.g. see arch/arc: flush_pmd_tlb_range
   1513 */
   1514#define flush_pmd_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
   1515#define flush_pud_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
   1516#else
   1517#define flush_pmd_tlb_range(vma, addr, end)	BUILD_BUG()
   1518#define flush_pud_tlb_range(vma, addr, end)	BUILD_BUG()
   1519#endif
   1520#endif
   1521
   1522struct file;
   1523int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
   1524			unsigned long size, pgprot_t *vma_prot);
   1525
   1526#ifndef CONFIG_X86_ESPFIX64
   1527static inline void init_espfix_bsp(void) { }
   1528#endif
   1529
   1530extern void __init pgtable_cache_init(void);
   1531
   1532#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
   1533static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
   1534{
   1535	return true;
   1536}
   1537
   1538static inline bool arch_has_pfn_modify_check(void)
   1539{
   1540	return false;
   1541}
   1542#endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */
   1543
   1544/*
   1545 * Architecture PAGE_KERNEL_* fallbacks
   1546 *
   1547 * Some architectures don't define certain PAGE_KERNEL_* flags. This is either
   1548 * because they really don't support them, or the port needs to be updated to
    1549 * reflect the required functionality. Below is a set of relatively safe,
    1550 * best-effort fallbacks which we can rely on until the architectures define
    1551 * them on their own.
   1552 */
   1553
   1554#ifndef PAGE_KERNEL_RO
   1555# define PAGE_KERNEL_RO PAGE_KERNEL
   1556#endif
   1557
   1558#ifndef PAGE_KERNEL_EXEC
   1559# define PAGE_KERNEL_EXEC PAGE_KERNEL
   1560#endif
   1561
   1562/*
   1563 * Page Table Modification bits for pgtbl_mod_mask.
   1564 *
    1565 * These are used by the p?d_alloc_track*() set of functions and in the generic
   1566 * vmalloc/ioremap code to track at which page-table levels entries have been
   1567 * modified. Based on that the code can better decide when vmalloc and ioremap
   1568 * mapping changes need to be synchronized to other page-tables in the system.
   1569 */
   1570#define		__PGTBL_PGD_MODIFIED	0
   1571#define		__PGTBL_P4D_MODIFIED	1
   1572#define		__PGTBL_PUD_MODIFIED	2
   1573#define		__PGTBL_PMD_MODIFIED	3
   1574#define		__PGTBL_PTE_MODIFIED	4
   1575
   1576#define		PGTBL_PGD_MODIFIED	BIT(__PGTBL_PGD_MODIFIED)
   1577#define		PGTBL_P4D_MODIFIED	BIT(__PGTBL_P4D_MODIFIED)
   1578#define		PGTBL_PUD_MODIFIED	BIT(__PGTBL_PUD_MODIFIED)
   1579#define		PGTBL_PMD_MODIFIED	BIT(__PGTBL_PMD_MODIFIED)
   1580#define		PGTBL_PTE_MODIFIED	BIT(__PGTBL_PTE_MODIFIED)
   1581
   1582/* Page-Table Modification Mask */
   1583typedef unsigned int pgtbl_mod_mask;
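/*
 * Usage sketch: the page-table allocation paths OR the bit for each level
 * they instantiate into a caller-provided mask, and the vmalloc/ioremap
 * code then decides whether other page tables need syncing, e.g.
 *
 *	pgtbl_mod_mask mask = 0;
 *
 *	mask |= PGTBL_PMD_MODIFIED;
 *	...
 *	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
 *		arch_sync_kernel_mappings(start, end);
 *
 * ARCH_PAGE_TABLE_SYNC_MASK and arch_sync_kernel_mappings() live on the
 * vmalloc side (linux/vmalloc.h), not in this header.
 */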
   1584
   1585#endif /* !__ASSEMBLY__ */
   1586
   1587#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
   1588#ifdef CONFIG_PHYS_ADDR_T_64BIT
   1589/*
   1590 * ZSMALLOC needs to know the highest PFN on 32-bit architectures
   1591 * with physical address space extension, but falls back to
   1592 * BITS_PER_LONG otherwise.
   1593 */
   1594#error Missing MAX_POSSIBLE_PHYSMEM_BITS definition
   1595#else
   1596#define MAX_POSSIBLE_PHYSMEM_BITS 32
   1597#endif
   1598#endif
   1599
   1600#ifndef has_transparent_hugepage
   1601#ifdef CONFIG_TRANSPARENT_HUGEPAGE
   1602#define has_transparent_hugepage() 1
   1603#else
   1604#define has_transparent_hugepage() 0
   1605#endif
   1606#endif
   1607
   1608/*
    1609 * On some architectures, whether the p4d/pud or pmd layer of the
    1610 * page table hierarchy is folded depends on the mm.
   1611 */
   1612#ifndef mm_p4d_folded
   1613#define mm_p4d_folded(mm)	__is_defined(__PAGETABLE_P4D_FOLDED)
   1614#endif
   1615
   1616#ifndef mm_pud_folded
   1617#define mm_pud_folded(mm)	__is_defined(__PAGETABLE_PUD_FOLDED)
   1618#endif
   1619
   1620#ifndef mm_pmd_folded
   1621#define mm_pmd_folded(mm)	__is_defined(__PAGETABLE_PMD_FOLDED)
   1622#endif
   1623
   1624#ifndef p4d_offset_lockless
   1625#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address)
   1626#endif
   1627#ifndef pud_offset_lockless
   1628#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
   1629#endif
   1630#ifndef pmd_offset_lockless
   1631#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address)
   1632#endif
   1633
   1634/*
   1635 * p?d_leaf() - true if this entry is a final mapping to a physical address.
    1636 * These differ from p?d_huge() in that they are always available (if
   1637 * the architecture supports large pages at the appropriate level) even
   1638 * if CONFIG_HUGETLB_PAGE is not defined.
   1639 * Only meaningful when called on a valid entry.
   1640 */
   1641#ifndef pgd_leaf
   1642#define pgd_leaf(x)	0
   1643#endif
   1644#ifndef p4d_leaf
   1645#define p4d_leaf(x)	0
   1646#endif
   1647#ifndef pud_leaf
   1648#define pud_leaf(x)	0
   1649#endif
   1650#ifndef pmd_leaf
   1651#define pmd_leaf(x)	0
   1652#endif
   1653
   1654#ifndef pgd_leaf_size
   1655#define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT)
   1656#endif
   1657#ifndef p4d_leaf_size
   1658#define p4d_leaf_size(x) P4D_SIZE
   1659#endif
   1660#ifndef pud_leaf_size
   1661#define pud_leaf_size(x) PUD_SIZE
   1662#endif
   1663#ifndef pmd_leaf_size
   1664#define pmd_leaf_size(x) PMD_SIZE
   1665#endif
   1666#ifndef pte_leaf_size
   1667#define pte_leaf_size(x) PAGE_SIZE
   1668#endif
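/*
 * Sketch of how a generic walker can use these (locks assumed held by the
 * caller):
 *
 *	if (pmd_leaf(*pmd)) {
 *		u64 size = pmd_leaf_size(*pmd);
 *		... the PMD maps 'size' bytes directly, no PTE level below ...
 *	} else {
 *		... descend to the PTE level ...
 *	}
 *
 * With the fallbacks above, architectures without large pages at a given
 * level simply see the p?d_leaf() test compile away to 0.
 */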
   1669
   1670/*
   1671 * Some architectures have MMUs that are configurable or selectable at boot
   1672 * time. These lead to variable PTRS_PER_x. For statically allocated arrays it
   1673 * helps to have a static maximum value.
   1674 */
   1675
   1676#ifndef MAX_PTRS_PER_PTE
   1677#define MAX_PTRS_PER_PTE PTRS_PER_PTE
   1678#endif
   1679
   1680#ifndef MAX_PTRS_PER_PMD
   1681#define MAX_PTRS_PER_PMD PTRS_PER_PMD
   1682#endif
   1683
   1684#ifndef MAX_PTRS_PER_PUD
   1685#define MAX_PTRS_PER_PUD PTRS_PER_PUD
   1686#endif
   1687
   1688#ifndef MAX_PTRS_PER_P4D
   1689#define MAX_PTRS_PER_P4D PTRS_PER_P4D
   1690#endif
   1691
   1692#endif /* _LINUX_PGTABLE_H */