cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

pagewalk.c (15852B)


// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
	if (depth == 3 && PTRS_PER_PMD == 1)
		depth = 2;
	if (depth == 2 && PTRS_PER_PUD == 1)
		depth = 1;
	if (depth == 1 && PTRS_PER_P4D == 1)
		depth = 0;
	return depth;
}

static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	for (;;) {
		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		if (addr >= end - PAGE_SIZE)
			break;
		addr += PAGE_SIZE;
		pte++;
	}
	return err;
}

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	spinlock_t *ptl;

	if (walk->no_vma) {
		pte = pte_offset_map(pmd, addr);
		err = walk_pte_range_inner(pte, addr, end, walk);
		pte_unmap(pte);
	} else {
		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		err = walk_pte_range_inner(pte, addr, end, walk);
		pte_unmap_unlock(pte, ptl);
	}

	return err;
}

#ifdef CONFIG_ARCH_HAS_HUGEPD
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk, int pdshift)
{
	int err = 0;
	const struct mm_walk_ops *ops = walk->ops;
	int shift = hugepd_shift(*phpd);
	int page_size = 1 << shift;

	if (!ops->pte_entry)
		return 0;

	if (addr & (page_size - 1))
		return 0;

	for (;;) {
		pte_t *pte;

		spin_lock(&walk->mm->page_table_lock);
		pte = hugepte_offset(*phpd, addr, pdshift);
		err = ops->pte_entry(pte, addr, addr + page_size, walk);
		spin_unlock(&walk->mm->page_table_lock);

		if (err)
			break;
		if (addr >= end - page_size)
			break;
		addr += page_size;
	}
	return err;
}
#else
static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk, int pdshift)
{
	return 0;
}
#endif

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(3);

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pte_entry))
			continue;

		if (walk->vma) {
			split_huge_pmd(walk->vma, pmd, addr);
			if (pmd_trans_unstable(pmd))
				goto again;
		}

		if (is_hugepd(__hugepd(pmd_val(*pmd))))
			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
		else
			err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}
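
/*
 * Illustrative sketch only (not part of the original file): a ->pmd_entry()
 * handler written against the loop above. As the comment in walk_pmd_range()
 * notes, such a handler has to recognize pmd_trans_huge() pmds itself; here it
 * handles the huge mapping in place and sets ACTION_CONTINUE so the walker
 * neither splits the pmd nor descends into walk_pte_range(). The function name
 * is hypothetical, and a real handler would typically serialize against
 * concurrent splits, e.g. via pmd_trans_huge_lock().
 */
static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	if (pmd_trans_huge(*pmd)) {
		/* Process the whole huge mapping here ... */
		walk->action = ACTION_CONTINUE;	/* skip the pte level */
		return 0;
	}

	/* Fall through to walk_pte_range() for normal pmds. */
	walk->action = ACTION_SUBTREE;
	return 0;
}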

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(2);

	pud = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		if (ops->pud_entry)
			err = ops->pud_entry(pud, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pmd_entry || ops->pte_entry))
			continue;

		if (walk->vma)
			split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		if (is_hugepd(__hugepd(pud_val(*pud))))
			err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
		else
			err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(1);

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}
		if (ops->p4d_entry) {
			err = ops->p4d_entry(p4d, addr, next, walk);
			if (err)
				break;
		}
		if (is_hugepd(__hugepd(p4d_val(*p4d))))
			err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT);
		else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	if (walk->pgd)
		pgd = walk->pgd + pgd_index(addr);
	else
		pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, 0, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pgd_entry) {
			err = ops->pgd_entry(pgd, addr, next, walk);
			if (err)
				break;
		}
		if (is_hugepd(__hugepd(pgd_val(*pgd))))
			err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT);
		else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask, sz);

		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, -1, walk);

		if (err)
			break;
	} while (addr = next, addr != end);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */
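
/*
 * Illustrative sketch only (not part of the original file): a minimal
 * ->hugetlb_entry() callback matching the call in walk_hugetlb_range()
 * above, counting present hugetlb mappings through walk->private. The
 * function name is hypothetical.
 */
static int example_hugetlb_entry(pte_t *ptep, unsigned long hmask,
				 unsigned long addr, unsigned long next,
				 struct mm_walk *walk)
{
	unsigned long *count = walk->private;
	pte_t pte = huge_ptep_get(ptep);

	if (pte_present(pte))
		(*count)++;
	return 0;
}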

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, where we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
	 * range, so we don't walk over it as we do for normal vmas. However,
	 * some callers are interested in handling hole ranges and they don't
	 * want to just ignore any single address range. Such users certainly
	 * define their ->pte_hole() callbacks, so let's delegate them to handle
	 * vma(VM_PFNMAP).
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;
		if (ops->pte_hole)
			err = ops->pte_hole(start, end, -1, walk);
		return err ? err : 1;
	}
	return 0;
}
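
/*
 * Illustrative sketch only (not part of the original file): a ->test_walk()
 * callback using the 0 / 1 / negative convention described above, skipping
 * mlock()ed vmas while continuing the walk. The function name is hypothetical.
 */
static int example_test_walk(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	if (walk->vma->vm_flags & VM_LOCKED)
		return 1;	/* skip this vma, keep walking the rest */
	return 0;		/* walk this vma */
}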

static int __walk_page_range(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (vma && ops->pre_vma) {
		err = ops->pre_vma(start, end, walk);
		if (err)
			return err;
	}

	if (vma && is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	if (vma && ops->post_vma)
		ops->post_vma(walk);

	return err;
}

/**
 * walk_page_range - walk page table with caller-specific callbacks
 * @mm:		mm_struct representing the target process of page table walk
 * @start:	start address of the virtual address range
 * @end:	end address of the virtual address range
 * @ops:	operation to call during the walk
 * @private:	private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined like below:
 *
 *  - 0  : succeeded in handling the current entry, and if you haven't reached
 *         the end address yet, continue to walk.
 *  - >0 : succeeded in handling the current entry, and return to the caller
 *         with a caller-specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with an error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.private	= private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
		} else { /* inside vma */
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = vma->vm_next;

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
		}
		if (walk.vma || walk.ops->pte_hole)
			err = __walk_page_range(start, next, &walk);
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}
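
/*
 * Illustrative sketch only (not part of the original file): a caller of
 * walk_page_range() that counts present ptes in a range through a
 * .pte_entry callback and @private, taking the mmap_lock as required by
 * the locking rules above. All names here are hypothetical.
 */
static int example_count_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*pte))
		(*count)++;
	return 0;
}

static const struct mm_walk_ops example_count_ops = {
	.pte_entry	= example_count_pte,
};

static unsigned long example_count_present(struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	unsigned long count = 0;

	mmap_read_lock(mm);
	walk_page_range(mm, start, end, &example_count_ops, &count);
	mmap_read_unlock(mm);
	return count;
}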

/*
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked, this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking the kernel page tables or page tables for firmware.
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  pgd_t *pgd,
			  void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.pgd		= pgd,
		.private	= private,
		.no_vma		= true
	};

	if (start >= end || !walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	return __walk_page_range(start, end, &walk);
}
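
/*
 * Illustrative sketch only (not part of the original file): using the no-VMA
 * variant to walk the kernel page tables. Passing a NULL pgd makes the walk
 * start from walk.mm->pgd, and init_mm's mmap_lock is taken to satisfy
 * mmap_assert_locked(). The function name is hypothetical.
 */
static int example_walk_kernel_range(unsigned long start, unsigned long end,
				     const struct mm_walk_ops *ops,
				     void *private)
{
	int err;

	mmap_read_lock(&init_mm);
	err = walk_page_range_novma(&init_mm, start, end, ops, NULL, private);
	mmap_read_unlock(&init_mm);
	return err;
}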

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= vma->vm_mm,
		.vma		= vma,
		.private	= private,
	};
	int err;

	if (!walk.mm)
		return -EINVAL;

	mmap_assert_locked(walk.mm);

	err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
	if (err > 0)
		return 0;
	if (err < 0)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops:	operation to call during the walk
 * @private:	private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller-defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.private	= private,
	};
	struct vm_area_struct *vma;
	pgoff_t vba, vea, cba, cea;
	unsigned long start_addr, end_addr;
	int err = 0;

	lockdep_assert_held(&mapping->i_mmap_rwsem);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
				  first_index + nr - 1) {
		/* Clip to the vma */
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma);
		cba = first_index;
		cba = max(cba, vba);
		cea = first_index + nr;
		cea = min(cea, vea);

		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
		if (start_addr >= end_addr)
			continue;

		walk.vma = vma;
		walk.mm = vma->vm_mm;

		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
		if (err > 0) {
			err = 0;
			break;
		} else if (err < 0)
			break;

		err = __walk_page_range(start_addr, end_addr, &walk);
		if (err)
			break;
	}

	return err;
}
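
/*
 * Illustrative sketch only (not part of the original file): calling
 * walk_page_mapping() for a small index range under the i_mmap_rwsem it
 * requires, taken here via i_mmap_lock_read(). The function name is
 * hypothetical.
 */
static int example_walk_mapping_range(struct address_space *mapping,
				      pgoff_t first_index, pgoff_t nr,
				      const struct mm_walk_ops *ops,
				      void *private)
{
	int err;

	i_mmap_lock_read(mapping);
	err = walk_page_mapping(mapping, first_index, nr, ops, private);
	i_mmap_unlock_read(mapping);
	return err;
}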