From 2a4a06da8a4b93dd189171eed7a99fffd38f42f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Nov 2020 11:41:40 +0100 Subject: mm/gup: Provide gup_get_pte() more generic In order to write another lockless page-table walker, we need gup_get_pte() exposed. While doing that, rename it to ptep_get_lockless() to match the existing ptep_get() naming. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201126121121.036370527@infradead.org --- include/linux/pgtable.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include/linux/pgtable.h') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 71125a4676c4..ed9266cc115b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -258,6 +258,61 @@ static inline pte_t ptep_get(pte_t *ptep) } #endif +#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH +/* + * WARNING: only to be used in the get_user_pages_fast() implementation. + * + * With get_user_pages_fast(), we walk down the pagetables without taking any + * locks. For this we would like to load the pointers atomically, but sometimes + * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What + * we do have is the guarantee that a PTE will only either go from not present + * to present, or present to not present or both -- it will not switch to a + * completely different present page without a TLB flush in between; something + * that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. + * We load pte_high *after* loading pte_low, which ensures we don't see an older + * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't + * picked up a changed pte high. We might have gotten rubbish values from + * pte_low and pte_high, but we are guaranteed that pte_low will not have the + * present bit set *unless* it is 'l'. Because get_user_pages_fast() only + * operates on present ptes we're safe. + */ +static inline pte_t ptep_get_lockless(pte_t *ptep) +{ + pte_t pte; + + do { + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + } while (unlikely(pte.pte_low != ptep->pte_low)); + + return pte; +} +#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ +/* + * We require that the PTE can be read atomically. + */ +static inline pte_t ptep_get_lockless(pte_t *ptep) +{ + return ptep_get(ptep); +} +#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, -- cgit v1.2.3-71-gd317