mm_types.h
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_TYPES_H
#define _LINUX_MM_TYPES_H

#include <linux/mm_types_task.h>

#include <linux/auxvec.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/uprobes.h>
#include <linux/rcupdate.h>
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>

#include <asm/mmu.h>

#ifndef AT_VECTOR_SIZE_ARCH
#define AT_VECTOR_SIZE_ARCH 0
#endif
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))

#define INIT_PASID	0

struct address_space;
struct mem_cgroup;

/*
 * Each physical page in the system has a struct page associated with
 * it to keep track of whatever it is we are using the page for at the
 * moment. Note that we have no way to track which tasks are using
 * a page, though if it is a pagecache page, rmap structures can tell us
 * who is mapping it.
 *
 * If you allocate the page using alloc_pages(), you can use some of the
 * space in struct page for your own purposes. The five words in the main
 * union are available, except for bit 0 of the first word which must be
 * kept clear. Many users use this word to store a pointer to an object
 * which is guaranteed to be aligned. If you use the same storage as
 * page->mapping, you must restore it to NULL before freeing the page.
 *
 * If your page will not be mapped to userspace, you can also use the four
 * bytes in the mapcount union, but you must call page_mapcount_reset()
 * before freeing it.
 *
 * If you want to use the refcount field, it must be used in such a way
 * that other CPUs temporarily incrementing and then decrementing the
 * refcount does not cause problems. On receiving the page from
 * alloc_pages(), the refcount will be positive.
 *
 * If you allocate pages of order > 0, you can use some of the fields
 * in each subpage, but you may need to restore some of their values
 * afterwards.
 *
 * SLUB uses cmpxchg_double() to atomically update its freelist and counters.
 * That requires that freelist & counters in struct slab be adjacent and
 * double-word aligned. Because struct slab currently just reinterprets the
 * bits of struct page, we align all struct pages to double-word boundaries,
 * and ensure that 'freelist' is aligned within struct slab.
 */
#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
#define _struct_page_alignment	__aligned(2 * sizeof(unsigned long))
#else
#define _struct_page_alignment
#endif

struct page {
	unsigned long flags;		/* Atomic flags, some possibly
					 * updated asynchronously */
	/*
	 * Five words (20/40 bytes) are available in this union.
	 * WARNING: bit 0 of the first word is used for PageTail(). That
	 * means the other users of this union MUST NOT use the bit to
	 * avoid collision and false-positive PageTail().
	 */
	union {
		struct {	/* Page cache and anonymous pages */
			/**
			 * @lru: Pageout list, eg. active_list protected by
			 * lruvec->lru_lock.  Sometimes used as a generic list
			 * by the page owner.
			 */
			union {
				struct list_head lru;
				/* Or, for the Unevictable "LRU list" slot */
				struct {
					/* Always even, to negate PageTail */
					void *__filler;
					/* Count page's or folio's mlocks */
					unsigned int mlock_count;
				};
			};
			/* See page-flags.h for PAGE_MAPPING_FLAGS */
			struct address_space *mapping;
			pgoff_t index;		/* Our offset within mapping. */
			/**
			 * @private: Mapping-private opaque data.
			 * Usually used for buffer_heads if PagePrivate.
			 * Used for swp_entry_t if PageSwapCache.
			 * Indicates order in the buddy system if PageBuddy.
			 */
			unsigned long private;
		};
		struct {	/* page_pool used by netstack */
			/**
			 * @pp_magic: magic value to avoid recycling non
			 * page_pool allocated pages.
			 */
			unsigned long pp_magic;
			struct page_pool *pp;
			unsigned long _pp_mapping_pad;
			unsigned long dma_addr;
			union {
				/**
				 * dma_addr_upper: might require a 64-bit
				 * value on 32-bit architectures.
				 */
				unsigned long dma_addr_upper;
				/**
				 * For frag page support, not supported in
				 * 32-bit architectures with 64-bit DMA.
				 */
				atomic_long_t pp_frag_count;
			};
		};
		struct {	/* Tail pages of compound page */
			unsigned long compound_head;	/* Bit zero is set */

			/* First tail page only */
			unsigned char compound_dtor;
			unsigned char compound_order;
			atomic_t compound_mapcount;
			atomic_t compound_pincount;
#ifdef CONFIG_64BIT
			unsigned int compound_nr; /* 1 << compound_order */
#endif
		};
		struct {	/* Second tail page of compound page */
			unsigned long _compound_pad_1;	/* compound_head */
			unsigned long _compound_pad_2;
			/* For both global and memcg */
			struct list_head deferred_list;
		};
		struct {	/* Page table pages */
			unsigned long _pt_pad_1;	/* compound_head */
			pgtable_t pmd_huge_pte; /* protected by page->ptl */
			unsigned long _pt_pad_2;	/* mapping */
			union {
				struct mm_struct *pt_mm; /* x86 pgds only */
				atomic_t pt_frag_refcount; /* powerpc */
			};
#if ALLOC_SPLIT_PTLOCKS
			spinlock_t *ptl;
#else
			spinlock_t ptl;
#endif
		};
		struct {	/* ZONE_DEVICE pages */
			/** @pgmap: Points to the hosting device page map. */
			struct dev_pagemap *pgmap;
			void *zone_device_data;
			/*
			 * ZONE_DEVICE private pages are counted as being
			 * mapped so the next 3 words hold the mapping, index,
			 * and private fields from the source anonymous or
			 * page cache page while the page is migrated to device
			 * private memory.
			 * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
			 * use the mapping, index, and private fields when
			 * pmem backed DAX files are mapped.
			 */
		};

		/** @rcu_head: You can use this to free a page by RCU. */
		struct rcu_head rcu_head;
	};

	union {		/* This union is 4 bytes in size. */
		/*
		 * If the page can be mapped to userspace, encodes the number
		 * of times this page is referenced by a page table.
		 */
		atomic_t _mapcount;

		/*
		 * If the page is neither PageSlab nor mappable to userspace,
		 * the value stored here may help determine what this page
		 * is used for.  See page-flags.h for a list of page types
		 * which are currently stored here.
		 */
		unsigned int page_type;
	};

	/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
	atomic_t _refcount;

#ifdef CONFIG_MEMCG
	unsigned long memcg_data;
#endif

	/*
	 * On machines where all RAM is mapped into kernel address space,
	 * we can simply calculate the virtual address. On machines with
	 * highmem some memory is mapped into kernel virtual memory
	 * dynamically, so we need a place to store that address.
	 * Note that this field could be 16 bits on x86 ... ;)
	 *
	 * Architectures with slow multiplication can define
	 * WANT_PAGE_VIRTUAL in asm/page.h
	 */
#if defined(WANT_PAGE_VIRTUAL)
	void *virtual;			/* Kernel virtual address (NULL if
					   not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
	int _last_cpupid;
#endif
} _struct_page_alignment;

/**
 * struct folio - Represents a contiguous set of bytes.
 * @flags: Identical to the page flags.
 * @lru: Least Recently Used list; tracks how recently this folio was used.
 * @mlock_count: Number of times this folio has been pinned by mlock().
 * @mapping: The file this page belongs to, or refers to the anon_vma for
 *    anonymous memory.
 * @index: Offset within the file, in units of pages.  For anonymous memory,
 *    this is the index from the beginning of the mmap.
 * @private: Filesystem per-folio data (see folio_attach_private()).
 *    Used for swp_entry_t if folio_test_swapcache().
 * @_mapcount: Do not access this member directly.  Use folio_mapcount() to
 *    find out how many times this folio is mapped by userspace.
 * @_refcount: Do not access this member directly.  Use folio_ref_count()
 *    to find how many references there are to this folio.
 * @memcg_data: Memory Control Group data.
 *
 * A folio is a physically, virtually and logically contiguous set
 * of bytes.  It is a power-of-two in size, and it is aligned to that
 * same power-of-two.  It is at least as large as %PAGE_SIZE.  If it is
 * in the page cache, it is at a file offset which is a multiple of that
 * power-of-two.  It may be mapped into userspace at an address which is
 * at an arbitrary page offset, but its kernel virtual address is aligned
 * to its size.
 */
struct folio {
	/* private: don't document the anon union */
	union {
		struct {
	/* public: */
			unsigned long flags;
			union {
				struct list_head lru;
	/* private: avoid cluttering the output */
				struct {
					void *__filler;
	/* public: */
					unsigned int mlock_count;
	/* private: */
				};
	/* public: */
			};
			struct address_space *mapping;
			pgoff_t index;
			void *private;
			atomic_t _mapcount;
			atomic_t _refcount;
#ifdef CONFIG_MEMCG
			unsigned long memcg_data;
#endif
	/* private: the union with struct page is transitional */
		};
		struct page page;
	};
};

static_assert(sizeof(struct page) == sizeof(struct folio));
#define FOLIO_MATCH(pg, fl)						\
	static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl))
FOLIO_MATCH(flags, flags);
FOLIO_MATCH(lru, lru);
FOLIO_MATCH(mapping, mapping);
FOLIO_MATCH(compound_head, lru);
FOLIO_MATCH(index, index);
FOLIO_MATCH(private, private);
FOLIO_MATCH(_mapcount, _mapcount);
FOLIO_MATCH(_refcount, _refcount);
#ifdef CONFIG_MEMCG
FOLIO_MATCH(memcg_data, memcg_data);
#endif
#undef FOLIO_MATCH
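
/*
 * Illustrative sketch only; the example_ helper below is not part of this
 * header's API.  Because struct folio overlays struct page, as the
 * FOLIO_MATCH() assertions above verify, a folio's head page is simply the
 * embedded page.  The kernel's real conversion helpers, page_folio() and
 * folio_page(), are defined elsewhere.
 */
static inline struct page *example_folio_head_page(struct folio *folio)
{
	/* the overlay makes this a plain member access, no arithmetic */
	return &folio->page;
}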

static inline atomic_t *folio_mapcount_ptr(struct folio *folio)
{
	struct page *tail = &folio->page + 1;
	return &tail->compound_mapcount;
}

static inline atomic_t *compound_mapcount_ptr(struct page *page)
{
	return &page[1].compound_mapcount;
}

static inline atomic_t *compound_pincount_ptr(struct page *page)
{
	return &page[1].compound_pincount;
}

/*
 * Used for sizing the vmemmap region on some architectures
 */
#define STRUCT_PAGE_MAX_SHIFT	(order_base_2(sizeof(struct page)))

#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)

/*
 * page_private can be used on tail pages.  However, PagePrivate is only
 * checked by the VM on the head page.  So page_private on the tail pages
 * should be used for data that's ancillary to the head page (eg attaching
 * buffer heads to tail pages after attaching buffer heads to the head page)
 */
#define page_private(page)		((page)->private)

static inline void set_page_private(struct page *page, unsigned long private)
{
	page->private = private;
}

static inline void *folio_get_private(struct folio *folio)
{
	return folio->private;
}
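
/*
 * Illustrative sketch only; the example_ helpers below are not kernel API.
 * Per the comment above struct page, the owner of a page obtained from
 * alloc_pages() may stash opaque per-page data in page->private through the
 * accessors above, for instance like this.
 */
static inline void example_stash_private(struct page *page, unsigned long val)
{
	set_page_private(page, val);	/* store the owner's cookie */
}

static inline unsigned long example_peek_private(struct page *page)
{
	return page_private(page);	/* read the cookie back */
}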

struct page_frag_cache {
	void * va;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
	__u16 offset;
	__u16 size;
#else
	__u32 offset;
#endif
	/* we maintain a pagecount bias, so that we don't dirty the cache line
	 * containing page->_refcount every time we allocate a fragment.
	 */
	unsigned int		pagecnt_bias;
	bool pfmemalloc;
};

typedef unsigned long vm_flags_t;

/*
 * A region containing a mapping of a non-memory backed file under NOMMU
 * conditions.  These are held in a global tree and are pinned by the VMAs that
 * map parts of them.
 */
struct vm_region {
	struct rb_node	vm_rb;		/* link in global region tree */
	vm_flags_t	vm_flags;	/* VMA vm_flags */
	unsigned long	vm_start;	/* start address of region */
	unsigned long	vm_end;		/* region initialised to here */
	unsigned long	vm_top;		/* region allocated to here */
	unsigned long	vm_pgoff;	/* the offset in vm_file corresponding to vm_start */
	struct file	*vm_file;	/* the backing file or NULL */

	int		vm_usage;	/* region usage count (access under nommu_region_sem) */
	bool		vm_icache_flushed : 1; /* true if the icache has been flushed for
						* this region */
};

#ifdef CONFIG_USERFAULTFD
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
struct vm_userfaultfd_ctx {
	struct userfaultfd_ctx *ctx;
};
#else /* CONFIG_USERFAULTFD */
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
struct vm_userfaultfd_ctx {};
#endif /* CONFIG_USERFAULTFD */

struct anon_vma_name {
	struct kref kref;
	/* The name needs to be at the end because it is dynamically sized. */
	char name[];
};

/*
 * This struct describes a virtual memory area. There is one of these
 * per VM-area/task. A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 */
struct vm_area_struct {
	/* The first cache line has the info for VMA tree walking. */

	unsigned long vm_start;		/* Our start address within vm_mm. */
	unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */

	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next, *vm_prev;

	struct rb_node vm_rb;

	/*
	 * Largest free memory gap in bytes to the left of this VMA.
	 * Either between this VMA and vma->vm_prev, or between one of the
	 * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
	 * get_unmapped_area find a free area of the right size.
	 */
	unsigned long rb_subtree_gap;

	/* Second cache line starts here. */

	struct mm_struct *vm_mm;	/* The address space we belong to. */

	/*
	 * Access permissions of this VMA.
	 * See vmf_insert_mixed_prot() for discussion.
	 */
	pgprot_t vm_page_prot;
	unsigned long vm_flags;		/* Flags, see mm.h. */

	/*
	 * For areas with an address space and backing store,
	 * linkage into the address_space->i_mmap interval tree.
	 *
	 * For private anonymous mappings, a pointer to a null terminated string
	 * containing the name given to the vma, or NULL if unnamed.
	 */

	union {
		struct {
			struct rb_node rb;
			unsigned long rb_subtree_last;
		} shared;
		/*
		 * Serialized by mmap_sem. Never use directly because it is
		 * valid only when vm_file is NULL. Use anon_vma_name instead.
		 */
		struct anon_vma_name *anon_name;
	};

	/*
	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
	 * list, after a COW of one of the file pages.	A MAP_SHARED vma
	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
	 * or brk vma (with NULL file) can only be in an anon_vma list.
	 */
	struct list_head anon_vma_chain; /* Serialized by mmap_lock &
					  * page_table_lock */
	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */

	/* Function pointers to deal with this struct. */
	const struct vm_operations_struct *vm_ops;

	/* Information about our backing store: */
	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
					   units */
	struct file * vm_file;		/* File we map to (can be NULL). */
	void * vm_private_data;		/* was vm_pte (shared mem) */

#ifdef CONFIG_SWAP
	atomic_long_t swap_readahead_info;
#endif
#ifndef CONFIG_MMU
	struct vm_region *vm_region;	/* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
#endif
	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
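
/*
 * Illustrative sketch only; example_count_vmas_from() is not kernel API.
 * VMAs of a task are kept on a linked list via vm_next/vm_prev, sorted by
 * address, so a simple walk like this one is possible.  Such a walk must be
 * done with the owning mm's mmap_lock held, at least for read.
 */
static inline int example_count_vmas_from(struct vm_area_struct *vma)
{
	int n = 0;

	for (; vma; vma = vma->vm_next)	/* follow the per-mm list */
		n++;
	return n;
}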

struct kioctx_table;
struct mm_struct {
	struct {
		struct vm_area_struct *mmap;		/* list of VMAs */
		struct rb_root mm_rb;
		u64 vmacache_seqnum;                   /* per-thread vmacache */
#ifdef CONFIG_MMU
		unsigned long (*get_unmapped_area) (struct file *filp,
				unsigned long addr, unsigned long len,
				unsigned long pgoff, unsigned long flags);
#endif
		unsigned long mmap_base;	/* base of mmap area */
		unsigned long mmap_legacy_base;	/* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
		/* Base addresses for compatible mmap() */
		unsigned long mmap_compat_base;
		unsigned long mmap_compat_legacy_base;
#endif
		unsigned long task_size;	/* size of task vm space */
		unsigned long highest_vm_end;	/* highest vma end address */
		pgd_t * pgd;

#ifdef CONFIG_MEMBARRIER
		/**
		 * @membarrier_state: Flags controlling membarrier behavior.
		 *
		 * This field is close to @pgd to hopefully fit in the same
		 * cache-line, which needs to be touched by switch_mm().
		 */
		atomic_t membarrier_state;
#endif

		/**
		 * @mm_users: The number of users including userspace.
		 *
		 * Use mmget()/mmget_not_zero()/mmput() to modify. When this
		 * drops to 0 (i.e. when the task exits and there are no other
		 * temporary reference holders), we also release a reference on
		 * @mm_count (which may then free the &struct mm_struct if
		 * @mm_count also drops to 0).
		 */
		atomic_t mm_users;

		/**
		 * @mm_count: The number of references to &struct mm_struct
		 * (@mm_users count as 1).
		 *
		 * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
		 * &struct mm_struct is freed.
		 */
		atomic_t mm_count;

#ifdef CONFIG_MMU
		atomic_long_t pgtables_bytes;	/* PTE page table pages */
#endif
		int map_count;			/* number of VMAs */

		spinlock_t page_table_lock; /* Protects page tables and some
					     * counters
					     */
		/*
		 * With some kernel configs, the current mmap_lock's offset
		 * inside 'mm_struct' is at 0x120, which is optimal: its two
		 * hot fields, 'count' and 'owner', sit in two different
		 * cachelines, and when mmap_lock is highly contended both
		 * fields are accessed frequently, so the current layout helps
		 * reduce cache bouncing.
		 *
		 * So please be careful with adding new fields before
		 * mmap_lock, which can easily push the 2 fields into one
		 * cacheline.
		 */
		struct rw_semaphore mmap_lock;

		struct list_head mmlist; /* List of maybe swapped mm's.	These
					  * are globally strung together off
					  * init_mm.mmlist, and are protected
					  * by mmlist_lock
					  */


		unsigned long hiwater_rss; /* High-watermark of RSS usage */
		unsigned long hiwater_vm;  /* High-water virtual memory usage */

		unsigned long total_vm;	   /* Total pages mapped */
		unsigned long locked_vm;   /* Pages that have PG_mlocked set */
		atomic64_t    pinned_vm;   /* Refcount permanently increased */
		unsigned long data_vm;	   /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
		unsigned long exec_vm;	   /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
		unsigned long stack_vm;	   /* VM_STACK */
		unsigned long def_flags;

		/**
		 * @write_protect_seq: Locked when any thread is write
		 * protecting pages mapped by this mm to enforce a later COW,
		 * for instance during page table copying for fork().
		 */
		seqcount_t write_protect_seq;

		spinlock_t arg_lock; /* protect the below fields */

		unsigned long start_code, end_code, start_data, end_data;
		unsigned long start_brk, brk, start_stack;
		unsigned long arg_start, arg_end, env_start, env_end;

		unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

		/*
		 * Special counters, in some configurations protected by the
		 * page_table_lock, in other configurations by being atomic.
		 */
		struct mm_rss_stat rss_stat;

		struct linux_binfmt *binfmt;

		/* Architecture-specific MM context */
		mm_context_t context;

		unsigned long flags; /* Must use atomic bitops to access */

#ifdef CONFIG_AIO
		spinlock_t			ioctx_lock;
		struct kioctx_table __rcu	*ioctx_table;
#endif
#ifdef CONFIG_MEMCG
		/*
		 * "owner" points to a task that is regarded as the canonical
		 * user/owner of this mm. All of the following must be true in
		 * order for it to be changed:
		 *
		 * current == mm->owner
		 * current->mm != mm
		 * new_owner->mm == mm
		 * new_owner->alloc_lock is held
		 */
		struct task_struct __rcu *owner;
#endif
		struct user_namespace *user_ns;

		/* store ref to file /proc/<pid>/exe symlink points to */
		struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
		struct mmu_notifier_subscriptions *notifier_subscriptions;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
		pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
#ifdef CONFIG_NUMA_BALANCING
		/*
		 * numa_next_scan is the next time that the PTEs will be marked
		 * pte_numa. NUMA hinting faults will gather statistics and
		 * migrate pages to new nodes if necessary.
		 */
		unsigned long numa_next_scan;

		/* Restart point for scanning and setting pte_numa */
		unsigned long numa_scan_offset;

		/* numa_scan_seq prevents two threads setting pte_numa */
		int numa_scan_seq;
#endif
		/*
		 * An operation with batched TLB flushing is going on. Anything
		 * that can move process memory needs to flush the TLB when
		 * moving a PROT_NONE or PROT_NUMA mapped page.
		 */
		atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
		/* See flush_tlb_batched_pending() */
		atomic_t tlb_flush_batched;
#endif
		struct uprobes_state uprobes_state;
#ifdef CONFIG_PREEMPT_RT
		struct rcu_head delayed_drop;
#endif
#ifdef CONFIG_HUGETLB_PAGE
		atomic_long_t hugetlb_usage;
#endif
		struct work_struct async_put_work;

#ifdef CONFIG_IOMMU_SVA
		u32 pasid;
#endif
#ifdef CONFIG_KSM
		/*
		 * Represent how many pages of this process are involved in KSM
		 * merging.
		 */
		unsigned long ksm_merging_pages;
#endif
	} __randomize_layout;

	/*
	 * The mm_cpumask needs to be at the end of mm_struct, because it
	 * is dynamically sized based on nr_cpu_ids.
	 */
	unsigned long cpu_bitmap[];
};

extern struct mm_struct init_mm;

/* Pointer magic because the dynamic array size confuses some compilers. */
static inline void mm_init_cpumask(struct mm_struct *mm)
{
	unsigned long cpu_bitmap = (unsigned long)mm;

	cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
	cpumask_clear((struct cpumask *)cpu_bitmap);
}

/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
{
	return (struct cpumask *)&mm->cpu_bitmap;
}
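
/*
 * Illustrative sketch only; example_mm_cpus() is not kernel API.  Because
 * the trailing cpu_bitmap array is always accessed through mm_cpumask(),
 * the ordinary cpumask helpers from <linux/cpumask.h> apply to it, e.g. to
 * count the CPUs currently tracked in the mask.
 */
static inline unsigned int example_mm_cpus(struct mm_struct *mm)
{
	return cpumask_weight(mm_cpumask(mm));	/* number of set bits */
}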

struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_finish_mmu(struct mmu_gather *tlb);

struct vm_fault;

/**
 * typedef vm_fault_t - Return type for page fault handlers.
 *
 * Page fault handlers return a bitmask of %VM_FAULT values.
 */
typedef __bitwise unsigned int vm_fault_t;

/**
 * enum vm_fault_reason - Page fault handlers return a bitmask of
 * these values to tell the core VM what happened when handling the
 * fault. Used to decide whether a process gets delivered SIGBUS or
 * just gets major/minor fault counters bumped up.
 *
 * @VM_FAULT_OOM:		Out Of Memory
 * @VM_FAULT_SIGBUS:		Bad access
 * @VM_FAULT_MAJOR:		Page read from storage
 * @VM_FAULT_WRITE:		Special case for get_user_pages
 * @VM_FAULT_HWPOISON:		Hit poisoned small page
 * @VM_FAULT_HWPOISON_LARGE:	Hit poisoned large page. Index encoded
 *				in upper bits
 * @VM_FAULT_SIGSEGV:		segmentation fault
 * @VM_FAULT_NOPAGE:		->fault installed the pte, does not return a page
 * @VM_FAULT_LOCKED:		->fault locked the returned page
 * @VM_FAULT_RETRY:		->fault blocked, must retry
 * @VM_FAULT_FALLBACK:		huge page fault failed, fall back to small
 * @VM_FAULT_DONE_COW:		->fault has fully handled COW
 * @VM_FAULT_NEEDDSYNC:		->fault did not modify page tables and needs
 *				fsync() to complete (for synchronous page faults
 *				in DAX)
 * @VM_FAULT_HINDEX_MASK:	mask HINDEX value
 */
enum vm_fault_reason {
	VM_FAULT_OOM            = (__force vm_fault_t)0x000001,
	VM_FAULT_SIGBUS         = (__force vm_fault_t)0x000002,
	VM_FAULT_MAJOR          = (__force vm_fault_t)0x000004,
	VM_FAULT_WRITE          = (__force vm_fault_t)0x000008,
	VM_FAULT_HWPOISON       = (__force vm_fault_t)0x000010,
	VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020,
	VM_FAULT_SIGSEGV        = (__force vm_fault_t)0x000040,
	VM_FAULT_NOPAGE         = (__force vm_fault_t)0x000100,
	VM_FAULT_LOCKED         = (__force vm_fault_t)0x000200,
	VM_FAULT_RETRY          = (__force vm_fault_t)0x000400,
	VM_FAULT_FALLBACK       = (__force vm_fault_t)0x000800,
	VM_FAULT_DONE_COW       = (__force vm_fault_t)0x001000,
	VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x002000,
	VM_FAULT_HINDEX_MASK    = (__force vm_fault_t)0x0f0000,
};

/* Encode hstate index for a hwpoisoned large page */
#define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16))
#define VM_FAULT_GET_HINDEX(x) (((__force unsigned int)(x) >> 16) & 0xf)

#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS |	\
			VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON |	\
			VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK)
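
/*
 * Illustrative check only, not part of this header: the hstate index of a
 * hwpoisoned huge page rides in bits 16-19 of the vm_fault_t (see
 * VM_FAULT_HINDEX_MASK above), so an encode followed by a decode is the
 * identity for small indices, e.g.:
 *
 *	VM_FAULT_SET_HINDEX(2) == (__force vm_fault_t)0x20000
 *	VM_FAULT_GET_HINDEX(VM_FAULT_SET_HINDEX(2)) == 2
 */
static_assert(VM_FAULT_GET_HINDEX(VM_FAULT_SET_HINDEX(2)) == 2);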

#define VM_FAULT_RESULT_TRACE \
	{ VM_FAULT_OOM,                 "OOM" },	\
	{ VM_FAULT_SIGBUS,              "SIGBUS" },	\
	{ VM_FAULT_MAJOR,               "MAJOR" },	\
	{ VM_FAULT_WRITE,               "WRITE" },	\
	{ VM_FAULT_HWPOISON,            "HWPOISON" },	\
	{ VM_FAULT_HWPOISON_LARGE,      "HWPOISON_LARGE" },	\
	{ VM_FAULT_SIGSEGV,             "SIGSEGV" },	\
	{ VM_FAULT_NOPAGE,              "NOPAGE" },	\
	{ VM_FAULT_LOCKED,              "LOCKED" },	\
	{ VM_FAULT_RETRY,               "RETRY" },	\
	{ VM_FAULT_FALLBACK,            "FALLBACK" },	\
	{ VM_FAULT_DONE_COW,            "DONE_COW" },	\
	{ VM_FAULT_NEEDDSYNC,           "NEEDDSYNC" }

struct vm_special_mapping {
	const char *name;	/* The name, e.g. "[vdso]". */

	/*
	 * If .fault is not provided, this points to a
	 * NULL-terminated array of pages that back the special mapping.
	 *
	 * This must not be NULL unless .fault is provided.
	 */
	struct page **pages;

	/*
	 * If non-NULL, then this is called to resolve page faults
	 * on the special mapping.  If used, .pages is not checked.
	 */
	vm_fault_t (*fault)(const struct vm_special_mapping *sm,
				struct vm_area_struct *vma,
				struct vm_fault *vmf);

	int (*mremap)(const struct vm_special_mapping *sm,
		     struct vm_area_struct *new_vma);
};
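
/*
 * Illustrative sketch only; example_init_special_mapping() is not kernel
 * API.  A special mapping is described by a name plus either a
 * NULL-terminated page array or a fault handler, and is then installed by
 * arch or driver code (the vdso is the classic user, typically via
 * _install_special_mapping(), declared elsewhere).
 */
static inline void example_init_special_mapping(struct vm_special_mapping *sm,
						const char *name,
						struct page **pages)
{
	sm->name = name;	/* e.g. "[vdso]" */
	sm->pages = pages;	/* NULL-terminated backing pages */
	sm->fault = NULL;	/* or a handler; then .pages is not checked */
	sm->mremap = NULL;
}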

enum tlb_flush_reason {
	TLB_FLUSH_ON_TASK_SWITCH,
	TLB_REMOTE_SHOOTDOWN,
	TLB_LOCAL_SHOOTDOWN,
	TLB_LOCAL_MM_SHOOTDOWN,
	TLB_REMOTE_SEND_IPI,
	NR_TLB_FLUSH_REASONS,
};

 /*
  * A swap entry has to fit into a "unsigned long", as the entry is hidden
  * in the "index" field of the swapper address space.
  */
typedef struct {
	unsigned long val;
} swp_entry_t;

/**
 * enum fault_flag - Fault flag definitions.
 * @FAULT_FLAG_WRITE:		Fault was a write fault.
 * @FAULT_FLAG_MKWRITE:		Fault was mkwrite of existing PTE.
 * @FAULT_FLAG_ALLOW_RETRY:	Allow to retry the fault if blocked.
 * @FAULT_FLAG_RETRY_NOWAIT:	Don't drop mmap_lock and wait when retrying.
 * @FAULT_FLAG_KILLABLE:	The fault task is in SIGKILL killable region.
 * @FAULT_FLAG_TRIED:		The fault has been tried once.
 * @FAULT_FLAG_USER:		The fault originated in userspace.
 * @FAULT_FLAG_REMOTE:		The fault is not for current task/mm.
 * @FAULT_FLAG_INSTRUCTION:	The fault was during an instruction fetch.
 * @FAULT_FLAG_INTERRUPTIBLE:	The fault can be interrupted by non-fatal signals.
 * @FAULT_FLAG_UNSHARE:		The fault is an unsharing request to unshare (and mark
 *				exclusive) a possibly shared anonymous page that is
 *				mapped R/O.
 * @FAULT_FLAG_ORIG_PTE_VALID:	whether the fault has vmf->orig_pte cached.
 *				We should only access orig_pte if this flag is set.
 * @FAULT_FLAG_PAGE_SPLIT:	The fault was due to a page size mismatch; split the
 *				region to a smaller page size and retry.
 *
 * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
 * whether we would allow page faults to retry by specifying these two
 * fault flags correctly.  Currently there can be three legal combinations:
 *
 * (a) ALLOW_RETRY and !TRIED:  this means the page fault allows retry, and
 *                              this is the first try
 *
 * (b) ALLOW_RETRY and TRIED:   this means the page fault allows retry, and
 *                              we've already tried at least once
 *
 * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
 *
 * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
 * be used.  Note that page faults can be allowed to retry multiple times,
 * in which case we'll have an initial fault with flags (a), then later on
 * continuous faults with flags (b).  We should always try to detect pending
 * signals before a retry to make sure the continuous page faults can still be
 * interrupted if necessary.
 *
 * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal.
 * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when
 * no existing R/O-mapped anonymous page is encountered.
 */
enum fault_flag {
	FAULT_FLAG_WRITE =		1 << 0,
	FAULT_FLAG_MKWRITE =		1 << 1,
	FAULT_FLAG_ALLOW_RETRY =	1 << 2,
	FAULT_FLAG_RETRY_NOWAIT =	1 << 3,
	FAULT_FLAG_KILLABLE =		1 << 4,
	FAULT_FLAG_TRIED =		1 << 5,
	FAULT_FLAG_USER =		1 << 6,
	FAULT_FLAG_REMOTE =		1 << 7,
	FAULT_FLAG_INSTRUCTION =	1 << 8,
	FAULT_FLAG_INTERRUPTIBLE =	1 << 9,
	FAULT_FLAG_UNSHARE =		1 << 10,
	FAULT_FLAG_ORIG_PTE_VALID =	1 << 11,
	FAULT_FLAG_PAGE_SPLIT =		1 << 12,
};
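
/*
 * Illustrative sketch only; the example_ helper is not kernel API.  Of the
 * ALLOW_RETRY/TRIED combinations documented above, a "first attempt that may
 * still retry" is combination (a), which can be tested like this.
 */
static inline bool example_fault_is_first_try(enum fault_flag flags)
{
	return (flags & FAULT_FLAG_ALLOW_RETRY) && !(flags & FAULT_FLAG_TRIED);
}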

typedef unsigned int __bitwise zap_flags_t;

#endif /* _LINUX_MM_TYPES_H */