cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

khugepaged.c (61644B)


      1// SPDX-License-Identifier: GPL-2.0
      2#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      3
      4#include <linux/mm.h>
      5#include <linux/sched.h>
      6#include <linux/sched/mm.h>
      7#include <linux/sched/coredump.h>
      8#include <linux/mmu_notifier.h>
      9#include <linux/rmap.h>
     10#include <linux/swap.h>
     11#include <linux/mm_inline.h>
     12#include <linux/kthread.h>
     13#include <linux/khugepaged.h>
     14#include <linux/freezer.h>
     15#include <linux/mman.h>
     16#include <linux/hashtable.h>
     17#include <linux/userfaultfd_k.h>
     18#include <linux/page_idle.h>
     19#include <linux/page_table_check.h>
     20#include <linux/swapops.h>
     21#include <linux/shmem_fs.h>
     22
     23#include <asm/tlb.h>
     24#include <asm/pgalloc.h>
     25#include "internal.h"
     26
     27enum scan_result {
     28	SCAN_FAIL,
     29	SCAN_SUCCEED,
     30	SCAN_PMD_NULL,
     31	SCAN_EXCEED_NONE_PTE,
     32	SCAN_EXCEED_SWAP_PTE,
     33	SCAN_EXCEED_SHARED_PTE,
     34	SCAN_PTE_NON_PRESENT,
     35	SCAN_PTE_UFFD_WP,
     36	SCAN_PAGE_RO,
     37	SCAN_LACK_REFERENCED_PAGE,
     38	SCAN_PAGE_NULL,
     39	SCAN_SCAN_ABORT,
     40	SCAN_PAGE_COUNT,
     41	SCAN_PAGE_LRU,
     42	SCAN_PAGE_LOCK,
     43	SCAN_PAGE_ANON,
     44	SCAN_PAGE_COMPOUND,
     45	SCAN_ANY_PROCESS,
     46	SCAN_VMA_NULL,
     47	SCAN_VMA_CHECK,
     48	SCAN_ADDRESS_RANGE,
     49	SCAN_DEL_PAGE_LRU,
     50	SCAN_ALLOC_HUGE_PAGE_FAIL,
     51	SCAN_CGROUP_CHARGE_FAIL,
     52	SCAN_TRUNCATED,
     53	SCAN_PAGE_HAS_PRIVATE,
     54};
     55
     56#define CREATE_TRACE_POINTS
     57#include <trace/events/huge_memory.h>
     58
     59static struct task_struct *khugepaged_thread __read_mostly;
     60static DEFINE_MUTEX(khugepaged_mutex);
     61
      62/* default: scan 8*512 ptes (or vmas) every 10 seconds */
     63static unsigned int khugepaged_pages_to_scan __read_mostly;
     64static unsigned int khugepaged_pages_collapsed;
     65static unsigned int khugepaged_full_scans;
     66static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
     67/* during fragmentation poll the hugepage allocator once every minute */
     68static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
     69static unsigned long khugepaged_sleep_expire;
     70static DEFINE_SPINLOCK(khugepaged_mm_lock);
     71static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
     72/*
      73 * By default, collapse a hugepage if there is at least one pte mapped,
      74 * just as would have happened had the vma been large enough at
      75 * page-fault time.
     76 */
     77static unsigned int khugepaged_max_ptes_none __read_mostly;
     78static unsigned int khugepaged_max_ptes_swap __read_mostly;
     79static unsigned int khugepaged_max_ptes_shared __read_mostly;
     80
     81#define MM_SLOTS_HASH_BITS 10
     82static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
     83
     84static struct kmem_cache *mm_slot_cache __read_mostly;
     85
     86#define MAX_PTE_MAPPED_THP 8
     87
     88/**
     89 * struct mm_slot - hash lookup from mm to mm_slot
     90 * @hash: hash collision list
     91 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
     92 * @mm: the mm that this information is valid for
     93 * @nr_pte_mapped_thp: number of pte mapped THP
      94 * @pte_mapped_thp: array of addresses of pte-mapped THPs
     95 */
     96struct mm_slot {
     97	struct hlist_node hash;
     98	struct list_head mm_node;
     99	struct mm_struct *mm;
    100
    101	/* pte-mapped THP in this mm */
    102	int nr_pte_mapped_thp;
    103	unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
    104};
    105
    106/**
    107 * struct khugepaged_scan - cursor for scanning
    108 * @mm_head: the head of the mm list to scan
    109 * @mm_slot: the current mm_slot we are scanning
     110 * @address: the next address inside that mm_slot to be scanned
    111 *
    112 * There is only the one khugepaged_scan instance of this cursor structure.
    113 */
    114struct khugepaged_scan {
    115	struct list_head mm_head;
    116	struct mm_slot *mm_slot;
    117	unsigned long address;
    118};
    119
    120static struct khugepaged_scan khugepaged_scan = {
    121	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
    122};
    123
    124#ifdef CONFIG_SYSFS
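/* These tunables are exposed under /sys/kernel/mm/transparent_hugepage/khugepaged/. */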
    125static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
    126					 struct kobj_attribute *attr,
    127					 char *buf)
    128{
    129	return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
    130}
    131
    132static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
    133					  struct kobj_attribute *attr,
    134					  const char *buf, size_t count)
    135{
    136	unsigned int msecs;
    137	int err;
    138
    139	err = kstrtouint(buf, 10, &msecs);
    140	if (err)
    141		return -EINVAL;
    142
    143	khugepaged_scan_sleep_millisecs = msecs;
    144	khugepaged_sleep_expire = 0;
    145	wake_up_interruptible(&khugepaged_wait);
    146
    147	return count;
    148}
    149static struct kobj_attribute scan_sleep_millisecs_attr =
    150	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
    151	       scan_sleep_millisecs_store);
    152
    153static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
    154					  struct kobj_attribute *attr,
    155					  char *buf)
    156{
    157	return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
    158}
    159
    160static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
    161					   struct kobj_attribute *attr,
    162					   const char *buf, size_t count)
    163{
    164	unsigned int msecs;
    165	int err;
    166
    167	err = kstrtouint(buf, 10, &msecs);
    168	if (err)
    169		return -EINVAL;
    170
    171	khugepaged_alloc_sleep_millisecs = msecs;
    172	khugepaged_sleep_expire = 0;
    173	wake_up_interruptible(&khugepaged_wait);
    174
    175	return count;
    176}
    177static struct kobj_attribute alloc_sleep_millisecs_attr =
    178	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
    179	       alloc_sleep_millisecs_store);
    180
    181static ssize_t pages_to_scan_show(struct kobject *kobj,
    182				  struct kobj_attribute *attr,
    183				  char *buf)
    184{
    185	return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
    186}
    187static ssize_t pages_to_scan_store(struct kobject *kobj,
    188				   struct kobj_attribute *attr,
    189				   const char *buf, size_t count)
    190{
    191	unsigned int pages;
    192	int err;
    193
    194	err = kstrtouint(buf, 10, &pages);
    195	if (err || !pages)
    196		return -EINVAL;
    197
    198	khugepaged_pages_to_scan = pages;
    199
    200	return count;
    201}
    202static struct kobj_attribute pages_to_scan_attr =
    203	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
    204	       pages_to_scan_store);
    205
    206static ssize_t pages_collapsed_show(struct kobject *kobj,
    207				    struct kobj_attribute *attr,
    208				    char *buf)
    209{
    210	return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
    211}
    212static struct kobj_attribute pages_collapsed_attr =
    213	__ATTR_RO(pages_collapsed);
    214
    215static ssize_t full_scans_show(struct kobject *kobj,
    216			       struct kobj_attribute *attr,
    217			       char *buf)
    218{
    219	return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
    220}
    221static struct kobj_attribute full_scans_attr =
    222	__ATTR_RO(full_scans);
    223
    224static ssize_t khugepaged_defrag_show(struct kobject *kobj,
    225				      struct kobj_attribute *attr, char *buf)
    226{
    227	return single_hugepage_flag_show(kobj, attr, buf,
    228					 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
    229}
    230static ssize_t khugepaged_defrag_store(struct kobject *kobj,
    231				       struct kobj_attribute *attr,
    232				       const char *buf, size_t count)
    233{
    234	return single_hugepage_flag_store(kobj, attr, buf, count,
    235				 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
    236}
    237static struct kobj_attribute khugepaged_defrag_attr =
    238	__ATTR(defrag, 0644, khugepaged_defrag_show,
    239	       khugepaged_defrag_store);
    240
    241/*
     242 * max_ptes_none controls whether khugepaged may collapse hugepages over
     243 * unmapped ptes, which can increase the memory footprint of the vmas.
     244 * When max_ptes_none is 0, khugepaged will not reduce the amount of
     245 * free memory in the system as it runs. Increasing max_ptes_none
     246 * instead lets khugepaged potentially reduce the free memory in the
     247 * system while it scans.
    248 */
    249static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
    250					     struct kobj_attribute *attr,
    251					     char *buf)
    252{
    253	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
    254}
    255static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
    256					      struct kobj_attribute *attr,
    257					      const char *buf, size_t count)
    258{
    259	int err;
    260	unsigned long max_ptes_none;
    261
    262	err = kstrtoul(buf, 10, &max_ptes_none);
    263	if (err || max_ptes_none > HPAGE_PMD_NR-1)
    264		return -EINVAL;
    265
    266	khugepaged_max_ptes_none = max_ptes_none;
    267
    268	return count;
    269}
    270static struct kobj_attribute khugepaged_max_ptes_none_attr =
    271	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
    272	       khugepaged_max_ptes_none_store);
    273
    274static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
    275					     struct kobj_attribute *attr,
    276					     char *buf)
    277{
    278	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
    279}
    280
    281static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
    282					      struct kobj_attribute *attr,
    283					      const char *buf, size_t count)
    284{
    285	int err;
    286	unsigned long max_ptes_swap;
    287
    288	err  = kstrtoul(buf, 10, &max_ptes_swap);
    289	if (err || max_ptes_swap > HPAGE_PMD_NR-1)
    290		return -EINVAL;
    291
    292	khugepaged_max_ptes_swap = max_ptes_swap;
    293
    294	return count;
    295}
    296
    297static struct kobj_attribute khugepaged_max_ptes_swap_attr =
    298	__ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
    299	       khugepaged_max_ptes_swap_store);
    300
    301static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
    302					       struct kobj_attribute *attr,
    303					       char *buf)
    304{
    305	return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
    306}
    307
    308static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
    309					      struct kobj_attribute *attr,
    310					      const char *buf, size_t count)
    311{
    312	int err;
    313	unsigned long max_ptes_shared;
    314
    315	err  = kstrtoul(buf, 10, &max_ptes_shared);
    316	if (err || max_ptes_shared > HPAGE_PMD_NR-1)
    317		return -EINVAL;
    318
    319	khugepaged_max_ptes_shared = max_ptes_shared;
    320
    321	return count;
    322}
    323
    324static struct kobj_attribute khugepaged_max_ptes_shared_attr =
    325	__ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
    326	       khugepaged_max_ptes_shared_store);
    327
    328static struct attribute *khugepaged_attr[] = {
    329	&khugepaged_defrag_attr.attr,
    330	&khugepaged_max_ptes_none_attr.attr,
    331	&khugepaged_max_ptes_swap_attr.attr,
    332	&khugepaged_max_ptes_shared_attr.attr,
    333	&pages_to_scan_attr.attr,
    334	&pages_collapsed_attr.attr,
    335	&full_scans_attr.attr,
    336	&scan_sleep_millisecs_attr.attr,
    337	&alloc_sleep_millisecs_attr.attr,
    338	NULL,
    339};
    340
    341struct attribute_group khugepaged_attr_group = {
    342	.attrs = khugepaged_attr,
    343	.name = "khugepaged",
    344};
    345#endif /* CONFIG_SYSFS */
    346
    347int hugepage_madvise(struct vm_area_struct *vma,
    348		     unsigned long *vm_flags, int advice)
    349{
    350	switch (advice) {
    351	case MADV_HUGEPAGE:
    352#ifdef CONFIG_S390
    353		/*
    354		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
    355		 * can't handle this properly after s390_enable_sie, so we simply
    356		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
    357		 */
    358		if (mm_has_pgste(vma->vm_mm))
    359			return 0;
    360#endif
    361		*vm_flags &= ~VM_NOHUGEPAGE;
    362		*vm_flags |= VM_HUGEPAGE;
    363		/*
     364		 * If the vma becomes suitable for khugepaged to scan,
     365		 * register it here without waiting for a page fault that
     366		 * may not happen any time soon.
    367		 */
    368		khugepaged_enter_vma(vma, *vm_flags);
    369		break;
    370	case MADV_NOHUGEPAGE:
    371		*vm_flags &= ~VM_HUGEPAGE;
    372		*vm_flags |= VM_NOHUGEPAGE;
    373		/*
    374		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
    375		 * this vma even if we leave the mm registered in khugepaged if
    376		 * it got registered before VM_NOHUGEPAGE was set.
    377		 */
    378		break;
    379	}
    380
    381	return 0;
    382}
    383
    384int __init khugepaged_init(void)
    385{
    386	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
    387					  sizeof(struct mm_slot),
    388					  __alignof__(struct mm_slot), 0, NULL);
    389	if (!mm_slot_cache)
    390		return -ENOMEM;
    391
    392	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
    393	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
    394	khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
    395	khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
    396
    397	return 0;
    398}
    399
    400void __init khugepaged_destroy(void)
    401{
    402	kmem_cache_destroy(mm_slot_cache);
    403}
    404
    405static inline struct mm_slot *alloc_mm_slot(void)
    406{
    407	if (!mm_slot_cache)	/* initialization failed */
    408		return NULL;
    409	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
    410}
    411
    412static inline void free_mm_slot(struct mm_slot *mm_slot)
    413{
    414	kmem_cache_free(mm_slot_cache, mm_slot);
    415}
    416
    417static struct mm_slot *get_mm_slot(struct mm_struct *mm)
    418{
    419	struct mm_slot *mm_slot;
    420
    421	hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
    422		if (mm == mm_slot->mm)
    423			return mm_slot;
    424
    425	return NULL;
    426}
    427
    428static void insert_to_mm_slots_hash(struct mm_struct *mm,
    429				    struct mm_slot *mm_slot)
    430{
    431	mm_slot->mm = mm;
    432	hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
    433}
    434
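/*
 * mm_users has dropped to zero once the owner is exiting (mmput() ran), so
 * khugepaged must stop scanning this mm; the mmgrab() reference taken in
 * __khugepaged_enter() only keeps the mm_struct itself from being freed.
 */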
    435static inline int khugepaged_test_exit(struct mm_struct *mm)
    436{
    437	return atomic_read(&mm->mm_users) == 0;
    438}
    439
    440bool hugepage_vma_check(struct vm_area_struct *vma,
    441			unsigned long vm_flags)
    442{
    443	if (!transhuge_vma_enabled(vma, vm_flags))
    444		return false;
    445
    446	if (vm_flags & VM_NO_KHUGEPAGED)
    447		return false;
    448
    449	/* Don't run khugepaged against DAX vma */
    450	if (vma_is_dax(vma))
    451		return false;
    452
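	/*
	 * For file-backed VMAs the virtual address and the file offset must be
	 * congruent modulo the hugepage size, otherwise no PMD-aligned virtual
	 * range can ever map a hugepage-aligned range of the file.
	 */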
    453	if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
    454				vma->vm_pgoff, HPAGE_PMD_NR))
    455		return false;
    456
    457	/* Enabled via shmem mount options or sysfs settings. */
    458	if (shmem_file(vma->vm_file))
    459		return shmem_huge_enabled(vma);
    460
    461	/* THP settings require madvise. */
    462	if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
    463		return false;
    464
     465	/* Only regular files are valid */
    466	if (file_thp_enabled(vma))
    467		return true;
    468
    469	if (!vma->anon_vma || !vma_is_anonymous(vma))
    470		return false;
    471	if (vma_is_temporary_stack(vma))
    472		return false;
    473
    474	return true;
    475}
    476
    477void __khugepaged_enter(struct mm_struct *mm)
    478{
    479	struct mm_slot *mm_slot;
    480	int wakeup;
    481
    482	mm_slot = alloc_mm_slot();
    483	if (!mm_slot)
    484		return;
    485
    486	/* __khugepaged_exit() must not run from under us */
    487	VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
    488	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
    489		free_mm_slot(mm_slot);
    490		return;
    491	}
    492
    493	spin_lock(&khugepaged_mm_lock);
    494	insert_to_mm_slots_hash(mm, mm_slot);
    495	/*
    496	 * Insert just behind the scanning cursor, to let the area settle
    497	 * down a little.
    498	 */
    499	wakeup = list_empty(&khugepaged_scan.mm_head);
    500	list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
    501	spin_unlock(&khugepaged_mm_lock);
    502
    503	mmgrab(mm);
    504	if (wakeup)
    505		wake_up_interruptible(&khugepaged_wait);
    506}
    507
    508void khugepaged_enter_vma(struct vm_area_struct *vma,
    509			  unsigned long vm_flags)
    510{
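	/*
	 * Register the mm only if the VMA can hold at least one fully aligned
	 * PMD-sized range: vm_start rounded up to a PMD boundary must still
	 * lie below vm_end rounded down to a PMD boundary.
	 */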
    511	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
    512	    khugepaged_enabled() &&
    513	    (((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
    514	     (vma->vm_end & HPAGE_PMD_MASK))) {
    515		if (hugepage_vma_check(vma, vm_flags))
    516			__khugepaged_enter(vma->vm_mm);
    517	}
    518}
    519
    520void __khugepaged_exit(struct mm_struct *mm)
    521{
    522	struct mm_slot *mm_slot;
    523	int free = 0;
    524
    525	spin_lock(&khugepaged_mm_lock);
    526	mm_slot = get_mm_slot(mm);
    527	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
    528		hash_del(&mm_slot->hash);
    529		list_del(&mm_slot->mm_node);
    530		free = 1;
    531	}
    532	spin_unlock(&khugepaged_mm_lock);
    533
    534	if (free) {
    535		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
    536		free_mm_slot(mm_slot);
    537		mmdrop(mm);
    538	} else if (mm_slot) {
    539		/*
    540		 * This is required to serialize against
    541		 * khugepaged_test_exit() (which is guaranteed to run
    542		 * under mmap sem read mode). Stop here (after we
    543		 * return all pagetables will be destroyed) until
    544		 * khugepaged has finished working on the pagetables
    545		 * under the mmap_lock.
    546		 */
    547		mmap_write_lock(mm);
    548		mmap_write_unlock(mm);
    549	}
    550}
    551
    552static void release_pte_page(struct page *page)
    553{
    554	mod_node_page_state(page_pgdat(page),
    555			NR_ISOLATED_ANON + page_is_file_lru(page),
    556			-compound_nr(page));
    557	unlock_page(page);
    558	putback_lru_page(page);
    559}
    560
    561static void release_pte_pages(pte_t *pte, pte_t *_pte,
    562		struct list_head *compound_pagelist)
    563{
    564	struct page *page, *tmp;
    565
    566	while (--_pte >= pte) {
    567		pte_t pteval = *_pte;
    568
    569		page = pte_page(pteval);
    570		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
    571				!PageCompound(page))
    572			release_pte_page(page);
    573	}
    574
    575	list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
    576		list_del(&page->lru);
    577		release_pte_page(page);
    578	}
    579}
    580
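/*
 * A page is only safe to collapse if every reference to it is explained by
 * its mappings (plus one reference per subpage while it sits in the swap
 * cache); any extra reference indicates a GUP pin or another external user.
 */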
    581static bool is_refcount_suitable(struct page *page)
    582{
    583	int expected_refcount;
    584
    585	expected_refcount = total_mapcount(page);
    586	if (PageSwapCache(page))
    587		expected_refcount += compound_nr(page);
    588
    589	return page_count(page) == expected_refcount;
    590}
    591
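/*
 * Called with the pte page-table lock held: lock each mapped page, verify it
 * still satisfies the collapse limits and isolate it from the LRU; on any
 * failure every page taken so far is unlocked and put back via
 * release_pte_pages().
 */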
    592static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
    593					unsigned long address,
    594					pte_t *pte,
    595					struct list_head *compound_pagelist)
    596{
    597	struct page *page = NULL;
    598	pte_t *_pte;
    599	int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
    600	bool writable = false;
    601
    602	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
    603	     _pte++, address += PAGE_SIZE) {
    604		pte_t pteval = *_pte;
    605		if (pte_none(pteval) || (pte_present(pteval) &&
    606				is_zero_pfn(pte_pfn(pteval)))) {
    607			if (!userfaultfd_armed(vma) &&
    608			    ++none_or_zero <= khugepaged_max_ptes_none) {
    609				continue;
    610			} else {
    611				result = SCAN_EXCEED_NONE_PTE;
    612				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
    613				goto out;
    614			}
    615		}
    616		if (!pte_present(pteval)) {
    617			result = SCAN_PTE_NON_PRESENT;
    618			goto out;
    619		}
    620		page = vm_normal_page(vma, address, pteval);
    621		if (unlikely(!page)) {
    622			result = SCAN_PAGE_NULL;
    623			goto out;
    624		}
    625
    626		VM_BUG_ON_PAGE(!PageAnon(page), page);
    627
    628		if (page_mapcount(page) > 1 &&
    629				++shared > khugepaged_max_ptes_shared) {
    630			result = SCAN_EXCEED_SHARED_PTE;
    631			count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
    632			goto out;
    633		}
    634
    635		if (PageCompound(page)) {
    636			struct page *p;
    637			page = compound_head(page);
    638
    639			/*
    640			 * Check if we have dealt with the compound page
    641			 * already
    642			 */
    643			list_for_each_entry(p, compound_pagelist, lru) {
    644				if (page == p)
    645					goto next;
    646			}
    647		}
    648
    649		/*
    650		 * We can do it before isolate_lru_page because the
    651		 * page can't be freed from under us. NOTE: PG_lock
    652		 * is needed to serialize against split_huge_page
    653		 * when invoked from the VM.
    654		 */
    655		if (!trylock_page(page)) {
    656			result = SCAN_PAGE_LOCK;
    657			goto out;
    658		}
    659
    660		/*
    661		 * Check if the page has any GUP (or other external) pins.
    662		 *
    663		 * The page table that maps the page has been already unlinked
    664		 * from the page table tree and this process cannot get
    665		 * an additional pin on the page.
    666		 *
    667		 * New pins can come later if the page is shared across fork,
    668		 * but not from this process. The other process cannot write to
    669		 * the page, only trigger CoW.
    670		 */
    671		if (!is_refcount_suitable(page)) {
    672			unlock_page(page);
    673			result = SCAN_PAGE_COUNT;
    674			goto out;
    675		}
    676
    677		/*
     678		 * Isolate the page to avoid collapsing a hugepage
    679		 * currently in use by the VM.
    680		 */
    681		if (isolate_lru_page(page)) {
    682			unlock_page(page);
    683			result = SCAN_DEL_PAGE_LRU;
    684			goto out;
    685		}
    686		mod_node_page_state(page_pgdat(page),
    687				NR_ISOLATED_ANON + page_is_file_lru(page),
    688				compound_nr(page));
    689		VM_BUG_ON_PAGE(!PageLocked(page), page);
    690		VM_BUG_ON_PAGE(PageLRU(page), page);
    691
    692		if (PageCompound(page))
    693			list_add_tail(&page->lru, compound_pagelist);
    694next:
     695		/* There should be enough young ptes to collapse the page */
    696		if (pte_young(pteval) ||
    697		    page_is_young(page) || PageReferenced(page) ||
    698		    mmu_notifier_test_young(vma->vm_mm, address))
    699			referenced++;
    700
    701		if (pte_write(pteval))
    702			writable = true;
    703	}
    704
    705	if (unlikely(!writable)) {
    706		result = SCAN_PAGE_RO;
    707	} else if (unlikely(!referenced)) {
    708		result = SCAN_LACK_REFERENCED_PAGE;
    709	} else {
    710		result = SCAN_SUCCEED;
    711		trace_mm_collapse_huge_page_isolate(page, none_or_zero,
    712						    referenced, writable, result);
    713		return 1;
    714	}
    715out:
    716	release_pte_pages(pte, _pte, compound_pagelist);
    717	trace_mm_collapse_huge_page_isolate(page, none_or_zero,
    718					    referenced, writable, result);
    719	return 0;
    720}
    721
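/*
 * Copy (or zero-fill, for none/zero ptes) each of the isolated small pages
 * into the new huge page, clear the old ptes and drop the old pages; runs
 * after the pmd has been cleared, so only brief ptl sections are needed.
 */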
    722static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
    723				      struct vm_area_struct *vma,
    724				      unsigned long address,
    725				      spinlock_t *ptl,
    726				      struct list_head *compound_pagelist)
    727{
    728	struct page *src_page, *tmp;
    729	pte_t *_pte;
    730	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
    731				_pte++, page++, address += PAGE_SIZE) {
    732		pte_t pteval = *_pte;
    733
    734		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
    735			clear_user_highpage(page, address);
    736			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
    737			if (is_zero_pfn(pte_pfn(pteval))) {
    738				/*
    739				 * ptl mostly unnecessary.
    740				 */
    741				spin_lock(ptl);
    742				ptep_clear(vma->vm_mm, address, _pte);
    743				spin_unlock(ptl);
    744			}
    745		} else {
    746			src_page = pte_page(pteval);
    747			copy_user_highpage(page, src_page, address, vma);
    748			if (!PageCompound(src_page))
    749				release_pte_page(src_page);
    750			/*
    751			 * ptl mostly unnecessary, but preempt has to
    752			 * be disabled to update the per-cpu stats
    753			 * inside page_remove_rmap().
    754			 */
    755			spin_lock(ptl);
    756			ptep_clear(vma->vm_mm, address, _pte);
    757			page_remove_rmap(src_page, vma, false);
    758			spin_unlock(ptl);
    759			free_page_and_swap_cache(src_page);
    760		}
    761	}
    762
    763	list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
    764		list_del(&src_page->lru);
    765		release_pte_page(src_page);
    766	}
    767}
    768
    769static void khugepaged_alloc_sleep(void)
    770{
    771	DEFINE_WAIT(wait);
    772
    773	add_wait_queue(&khugepaged_wait, &wait);
    774	freezable_schedule_timeout_interruptible(
    775		msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
    776	remove_wait_queue(&khugepaged_wait, &wait);
    777}
    778
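/*
 * Per-scan histogram of the NUMA nodes backing the existing small pages;
 * khugepaged_find_target_node() picks the node with the most hits as the
 * allocation target for the huge page.
 */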
    779static int khugepaged_node_load[MAX_NUMNODES];
    780
    781static bool khugepaged_scan_abort(int nid)
    782{
    783	int i;
    784
    785	/*
    786	 * If node_reclaim_mode is disabled, then no extra effort is made to
    787	 * allocate memory locally.
    788	 */
    789	if (!node_reclaim_enabled())
    790		return false;
    791
    792	/* If there is a count for this node already, it must be acceptable */
    793	if (khugepaged_node_load[nid])
    794		return false;
    795
    796	for (i = 0; i < MAX_NUMNODES; i++) {
    797		if (!khugepaged_node_load[i])
    798			continue;
    799		if (node_distance(nid, i) > node_reclaim_distance)
    800			return true;
    801	}
    802	return false;
    803}
    804
    805/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
    806static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
    807{
    808	return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
    809}
    810
    811#ifdef CONFIG_NUMA
    812static int khugepaged_find_target_node(void)
    813{
    814	static int last_khugepaged_target_node = NUMA_NO_NODE;
    815	int nid, target_node = 0, max_value = 0;
    816
    817	/* find first node with max normal pages hit */
    818	for (nid = 0; nid < MAX_NUMNODES; nid++)
    819		if (khugepaged_node_load[nid] > max_value) {
    820			max_value = khugepaged_node_load[nid];
    821			target_node = nid;
    822		}
    823
     824	/* do some balancing if several nodes have the same hit count */
    825	if (target_node <= last_khugepaged_target_node)
    826		for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
    827				nid++)
    828			if (max_value == khugepaged_node_load[nid]) {
    829				target_node = nid;
    830				break;
    831			}
    832
    833	last_khugepaged_target_node = target_node;
    834	return target_node;
    835}
    836
    837static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
    838{
    839	if (IS_ERR(*hpage)) {
    840		if (!*wait)
    841			return false;
    842
    843		*wait = false;
    844		*hpage = NULL;
    845		khugepaged_alloc_sleep();
    846	} else if (*hpage) {
    847		put_page(*hpage);
    848		*hpage = NULL;
    849	}
    850
    851	return true;
    852}
    853
    854static struct page *
    855khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
    856{
    857	VM_BUG_ON_PAGE(*hpage, *hpage);
    858
    859	*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
    860	if (unlikely(!*hpage)) {
    861		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
    862		*hpage = ERR_PTR(-ENOMEM);
    863		return NULL;
    864	}
    865
    866	prep_transhuge_page(*hpage);
    867	count_vm_event(THP_COLLAPSE_ALLOC);
    868	return *hpage;
    869}
    870#else
    871static int khugepaged_find_target_node(void)
    872{
    873	return 0;
    874}
    875
    876static inline struct page *alloc_khugepaged_hugepage(void)
    877{
    878	struct page *page;
    879
    880	page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
    881			   HPAGE_PMD_ORDER);
    882	if (page)
    883		prep_transhuge_page(page);
    884	return page;
    885}
    886
    887static struct page *khugepaged_alloc_hugepage(bool *wait)
    888{
    889	struct page *hpage;
    890
    891	do {
    892		hpage = alloc_khugepaged_hugepage();
    893		if (!hpage) {
    894			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
    895			if (!*wait)
    896				return NULL;
    897
    898			*wait = false;
    899			khugepaged_alloc_sleep();
    900		} else
    901			count_vm_event(THP_COLLAPSE_ALLOC);
    902	} while (unlikely(!hpage) && likely(khugepaged_enabled()));
    903
    904	return hpage;
    905}
    906
    907static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
    908{
    909	/*
    910	 * If the hpage allocated earlier was briefly exposed in page cache
    911	 * before collapse_file() failed, it is possible that racing lookups
    912	 * have not yet completed, and would then be unpleasantly surprised by
    913	 * finding the hpage reused for the same mapping at a different offset.
    914	 * Just release the previous allocation if there is any danger of that.
    915	 */
    916	if (*hpage && page_count(*hpage) > 1) {
    917		put_page(*hpage);
    918		*hpage = NULL;
    919	}
    920
    921	if (!*hpage)
    922		*hpage = khugepaged_alloc_hugepage(wait);
    923
    924	if (unlikely(!*hpage))
    925		return false;
    926
    927	return true;
    928}
    929
    930static struct page *
    931khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
    932{
    933	VM_BUG_ON(!*hpage);
    934
    935	return  *hpage;
    936}
    937#endif
    938
    939/*
     940 * If mmap_lock was temporarily dropped, revalidate the vma
     941 * after re-taking mmap_lock.
     942 * Return 0 on success, otherwise return a non-zero
     943 * value (scan code).
    944 */
    945
    946static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
    947		struct vm_area_struct **vmap)
    948{
    949	struct vm_area_struct *vma;
    950	unsigned long hstart, hend;
    951
    952	if (unlikely(khugepaged_test_exit(mm)))
    953		return SCAN_ANY_PROCESS;
    954
    955	*vmap = vma = find_vma(mm, address);
    956	if (!vma)
    957		return SCAN_VMA_NULL;
    958
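	/*
	 * hstart/hend are the VMA bounds rounded inward to PMD alignment; the
	 * candidate address must leave room for one whole hugepage inside them.
	 */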
    959	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
    960	hend = vma->vm_end & HPAGE_PMD_MASK;
    961	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
    962		return SCAN_ADDRESS_RANGE;
    963	if (!hugepage_vma_check(vma, vma->vm_flags))
    964		return SCAN_VMA_CHECK;
    965	/* Anon VMA expected */
    966	if (!vma->anon_vma || !vma_is_anonymous(vma))
    967		return SCAN_VMA_CHECK;
    968	return 0;
    969}
    970
    971/*
    972 * Bring missing pages in from swap, to complete THP collapse.
    973 * Only done if khugepaged_scan_pmd believes it is worthwhile.
    974 *
    975 * Called and returns without pte mapped or spinlocks held,
    976 * but with mmap_lock held to protect against vma changes.
    977 */
    978
    979static bool __collapse_huge_page_swapin(struct mm_struct *mm,
    980					struct vm_area_struct *vma,
    981					unsigned long haddr, pmd_t *pmd,
    982					int referenced)
    983{
    984	int swapped_in = 0;
    985	vm_fault_t ret = 0;
    986	unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
    987
    988	for (address = haddr; address < end; address += PAGE_SIZE) {
    989		struct vm_fault vmf = {
    990			.vma = vma,
    991			.address = address,
    992			.pgoff = linear_page_index(vma, haddr),
    993			.flags = FAULT_FLAG_ALLOW_RETRY,
    994			.pmd = pmd,
    995		};
    996
    997		vmf.pte = pte_offset_map(pmd, address);
    998		vmf.orig_pte = *vmf.pte;
    999		if (!is_swap_pte(vmf.orig_pte)) {
   1000			pte_unmap(vmf.pte);
   1001			continue;
   1002		}
   1003		swapped_in++;
   1004		ret = do_swap_page(&vmf);
   1005
   1006		/* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
   1007		if (ret & VM_FAULT_RETRY) {
   1008			mmap_read_lock(mm);
   1009			if (hugepage_vma_revalidate(mm, haddr, &vma)) {
   1010				/* vma is no longer available, don't continue to swapin */
   1011				trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
   1012				return false;
   1013			}
   1014			/* check if the pmd is still valid */
   1015			if (mm_find_pmd(mm, haddr) != pmd) {
   1016				trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
   1017				return false;
   1018			}
   1019		}
   1020		if (ret & VM_FAULT_ERROR) {
   1021			trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
   1022			return false;
   1023		}
   1024	}
   1025
   1026	/* Drain LRU add pagevec to remove extra pin on the swapped in pages */
   1027	if (swapped_in)
   1028		lru_add_drain();
   1029
   1030	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
   1031	return true;
   1032}
   1033
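/*
 * Collapse the HPAGE_PMD_NR small ptes at address into a single huge pmd:
 * allocate and charge a new THP, re-take mmap_lock in write mode, clear and
 * flush the old pmd, isolate and copy the small pages, then install a huge
 * pmd mapping the new page.  Called with mmap_lock held for read and always
 * returns with mmap_lock released.
 */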
   1034static void collapse_huge_page(struct mm_struct *mm,
   1035				   unsigned long address,
   1036				   struct page **hpage,
   1037				   int node, int referenced, int unmapped)
   1038{
   1039	LIST_HEAD(compound_pagelist);
   1040	pmd_t *pmd, _pmd;
   1041	pte_t *pte;
   1042	pgtable_t pgtable;
   1043	struct page *new_page;
   1044	spinlock_t *pmd_ptl, *pte_ptl;
   1045	int isolated = 0, result = 0;
   1046	struct vm_area_struct *vma;
   1047	struct mmu_notifier_range range;
   1048	gfp_t gfp;
   1049
   1050	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
   1051
   1052	/* Only allocate from the target node */
   1053	gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
   1054
   1055	/*
   1056	 * Before allocating the hugepage, release the mmap_lock read lock.
   1057	 * The allocation can take potentially a long time if it involves
   1058	 * sync compaction, and we do not need to hold the mmap_lock during
   1059	 * that. We will recheck the vma after taking it again in write mode.
   1060	 */
   1061	mmap_read_unlock(mm);
   1062	new_page = khugepaged_alloc_page(hpage, gfp, node);
   1063	if (!new_page) {
   1064		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
   1065		goto out_nolock;
   1066	}
   1067
   1068	if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
   1069		result = SCAN_CGROUP_CHARGE_FAIL;
   1070		goto out_nolock;
   1071	}
   1072	count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
   1073
   1074	mmap_read_lock(mm);
   1075	result = hugepage_vma_revalidate(mm, address, &vma);
   1076	if (result) {
   1077		mmap_read_unlock(mm);
   1078		goto out_nolock;
   1079	}
   1080
   1081	pmd = mm_find_pmd(mm, address);
   1082	if (!pmd) {
   1083		result = SCAN_PMD_NULL;
   1084		mmap_read_unlock(mm);
   1085		goto out_nolock;
   1086	}
   1087
   1088	/*
   1089	 * __collapse_huge_page_swapin always returns with mmap_lock locked.
   1090	 * If it fails, we release mmap_lock and jump out_nolock.
   1091	 * Continuing to collapse causes inconsistency.
   1092	 */
   1093	if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
   1094						     pmd, referenced)) {
   1095		mmap_read_unlock(mm);
   1096		goto out_nolock;
   1097	}
   1098
   1099	mmap_read_unlock(mm);
   1100	/*
   1101	 * Prevent all access to pagetables with the exception of
   1102	 * gup_fast later handled by the ptep_clear_flush and the VM
   1103	 * handled by the anon_vma lock + PG_lock.
   1104	 */
   1105	mmap_write_lock(mm);
   1106	result = hugepage_vma_revalidate(mm, address, &vma);
   1107	if (result)
   1108		goto out_up_write;
   1109	/* check if the pmd is still valid */
   1110	if (mm_find_pmd(mm, address) != pmd)
   1111		goto out_up_write;
   1112
   1113	anon_vma_lock_write(vma->anon_vma);
   1114
   1115	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
   1116				address, address + HPAGE_PMD_SIZE);
   1117	mmu_notifier_invalidate_range_start(&range);
   1118
   1119	pte = pte_offset_map(pmd, address);
   1120	pte_ptl = pte_lockptr(mm, pmd);
   1121
   1122	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
   1123	/*
   1124	 * After this gup_fast can't run anymore. This also removes
   1125	 * any huge TLB entry from the CPU so we won't allow
   1126	 * huge and small TLB entries for the same virtual address
   1127	 * to avoid the risk of CPU bugs in that area.
   1128	 */
   1129	_pmd = pmdp_collapse_flush(vma, address, pmd);
   1130	spin_unlock(pmd_ptl);
   1131	mmu_notifier_invalidate_range_end(&range);
   1132
   1133	spin_lock(pte_ptl);
   1134	isolated = __collapse_huge_page_isolate(vma, address, pte,
   1135			&compound_pagelist);
   1136	spin_unlock(pte_ptl);
   1137
   1138	if (unlikely(!isolated)) {
   1139		pte_unmap(pte);
   1140		spin_lock(pmd_ptl);
   1141		BUG_ON(!pmd_none(*pmd));
   1142		/*
   1143		 * We can only use set_pmd_at when establishing
    1144		 * hugepmds and never for establishing regular pmds that
    1145		 * point to regular pagetables. Use pmd_populate for that.
   1146		 */
   1147		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
   1148		spin_unlock(pmd_ptl);
   1149		anon_vma_unlock_write(vma->anon_vma);
   1150		result = SCAN_FAIL;
   1151		goto out_up_write;
   1152	}
   1153
   1154	/*
   1155	 * All pages are isolated and locked so anon_vma rmap
   1156	 * can't run anymore.
   1157	 */
   1158	anon_vma_unlock_write(vma->anon_vma);
   1159
   1160	__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
   1161			&compound_pagelist);
   1162	pte_unmap(pte);
   1163	/*
   1164	 * spin_lock() below is not the equivalent of smp_wmb(), but
   1165	 * the smp_wmb() inside __SetPageUptodate() can be reused to
    1166	 * prevent the copy_huge_page writes from becoming visible after
   1167	 * the set_pmd_at() write.
   1168	 */
   1169	__SetPageUptodate(new_page);
   1170	pgtable = pmd_pgtable(_pmd);
   1171
   1172	_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
   1173	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
   1174
   1175	spin_lock(pmd_ptl);
   1176	BUG_ON(!pmd_none(*pmd));
   1177	page_add_new_anon_rmap(new_page, vma, address);
   1178	lru_cache_add_inactive_or_unevictable(new_page, vma);
   1179	pgtable_trans_huge_deposit(mm, pmd, pgtable);
   1180	set_pmd_at(mm, address, pmd, _pmd);
   1181	update_mmu_cache_pmd(vma, address, pmd);
   1182	spin_unlock(pmd_ptl);
   1183
   1184	*hpage = NULL;
   1185
   1186	khugepaged_pages_collapsed++;
   1187	result = SCAN_SUCCEED;
   1188out_up_write:
   1189	mmap_write_unlock(mm);
   1190out_nolock:
   1191	if (!IS_ERR_OR_NULL(*hpage))
   1192		mem_cgroup_uncharge(page_folio(*hpage));
   1193	trace_mm_collapse_huge_page(mm, isolated, result);
   1194	return;
   1195}
   1196
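/*
 * Scan the ptes under one pmd and decide whether the range is worth
 * collapsing; returns 1 after handing the range to collapse_huge_page()
 * (which drops mmap_lock), 0 otherwise.
 */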
   1197static int khugepaged_scan_pmd(struct mm_struct *mm,
   1198			       struct vm_area_struct *vma,
   1199			       unsigned long address,
   1200			       struct page **hpage)
   1201{
   1202	pmd_t *pmd;
   1203	pte_t *pte, *_pte;
   1204	int ret = 0, result = 0, referenced = 0;
   1205	int none_or_zero = 0, shared = 0;
   1206	struct page *page = NULL;
   1207	unsigned long _address;
   1208	spinlock_t *ptl;
   1209	int node = NUMA_NO_NODE, unmapped = 0;
   1210	bool writable = false;
   1211
   1212	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
   1213
   1214	pmd = mm_find_pmd(mm, address);
   1215	if (!pmd) {
   1216		result = SCAN_PMD_NULL;
   1217		goto out;
   1218	}
   1219
   1220	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
   1221	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
   1222	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
   1223	     _pte++, _address += PAGE_SIZE) {
   1224		pte_t pteval = *_pte;
   1225		if (is_swap_pte(pteval)) {
   1226			if (++unmapped <= khugepaged_max_ptes_swap) {
   1227				/*
   1228				 * Always be strict with uffd-wp
   1229				 * enabled swap entries.  Please see
   1230				 * comment below for pte_uffd_wp().
   1231				 */
   1232				if (pte_swp_uffd_wp(pteval)) {
   1233					result = SCAN_PTE_UFFD_WP;
   1234					goto out_unmap;
   1235				}
   1236				continue;
   1237			} else {
   1238				result = SCAN_EXCEED_SWAP_PTE;
   1239				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
   1240				goto out_unmap;
   1241			}
   1242		}
   1243		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
   1244			if (!userfaultfd_armed(vma) &&
   1245			    ++none_or_zero <= khugepaged_max_ptes_none) {
   1246				continue;
   1247			} else {
   1248				result = SCAN_EXCEED_NONE_PTE;
   1249				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
   1250				goto out_unmap;
   1251			}
   1252		}
   1253		if (pte_uffd_wp(pteval)) {
   1254			/*
   1255			 * Don't collapse the page if any of the small
   1256			 * PTEs are armed with uffd write protection.
   1257			 * Here we can also mark the new huge pmd as
   1258			 * write protected if any of the small ones is
    1259			 * marked, but that could bring unknown
    1260			 * userfault messages that fall outside of
    1261			 * the registered range.  So, just keep it simple.
   1262			 */
   1263			result = SCAN_PTE_UFFD_WP;
   1264			goto out_unmap;
   1265		}
   1266		if (pte_write(pteval))
   1267			writable = true;
   1268
   1269		page = vm_normal_page(vma, _address, pteval);
   1270		if (unlikely(!page)) {
   1271			result = SCAN_PAGE_NULL;
   1272			goto out_unmap;
   1273		}
   1274
   1275		if (page_mapcount(page) > 1 &&
   1276				++shared > khugepaged_max_ptes_shared) {
   1277			result = SCAN_EXCEED_SHARED_PTE;
   1278			count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
   1279			goto out_unmap;
   1280		}
   1281
   1282		page = compound_head(page);
   1283
   1284		/*
   1285		 * Record which node the original page is from and save this
   1286		 * information to khugepaged_node_load[].
    1287		 * Khugepaged will allocate the hugepage from the node with the
    1288		 * most hits.
   1289		 */
   1290		node = page_to_nid(page);
   1291		if (khugepaged_scan_abort(node)) {
   1292			result = SCAN_SCAN_ABORT;
   1293			goto out_unmap;
   1294		}
   1295		khugepaged_node_load[node]++;
   1296		if (!PageLRU(page)) {
   1297			result = SCAN_PAGE_LRU;
   1298			goto out_unmap;
   1299		}
   1300		if (PageLocked(page)) {
   1301			result = SCAN_PAGE_LOCK;
   1302			goto out_unmap;
   1303		}
   1304		if (!PageAnon(page)) {
   1305			result = SCAN_PAGE_ANON;
   1306			goto out_unmap;
   1307		}
   1308
   1309		/*
   1310		 * Check if the page has any GUP (or other external) pins.
   1311		 *
    1312		 * Here the check is racy: it may see total_mapcount > refcount
    1313		 * in some cases.
    1314		 * For example, take one process with one forked child process.
    1315		 * The parent has the PMD split due to MADV_DONTNEED, then
    1316		 * the child tries to unmap the whole PMD, but khugepaged
    1317		 * may be scanning the parent between the child clearing the
    1318		 * PageDoubleMap flag and decrementing the mapcount.  So
    1319		 * khugepaged may see total_mapcount > refcount.
    1320		 *
    1321		 * But such a case is ephemeral and we can always retry the
    1322		 * collapse later.  However, it may report a false positive if
    1323		 * the page has excessive GUP pins (i.e. 512).  Anyway, the same
    1324		 * check will be done again later, so the risk seems low.
   1325		 */
   1326		if (!is_refcount_suitable(page)) {
   1327			result = SCAN_PAGE_COUNT;
   1328			goto out_unmap;
   1329		}
   1330		if (pte_young(pteval) ||
   1331		    page_is_young(page) || PageReferenced(page) ||
   1332		    mmu_notifier_test_young(vma->vm_mm, address))
   1333			referenced++;
   1334	}
   1335	if (!writable) {
   1336		result = SCAN_PAGE_RO;
   1337	} else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
   1338		result = SCAN_LACK_REFERENCED_PAGE;
   1339	} else {
   1340		result = SCAN_SUCCEED;
   1341		ret = 1;
   1342	}
   1343out_unmap:
   1344	pte_unmap_unlock(pte, ptl);
   1345	if (ret) {
   1346		node = khugepaged_find_target_node();
   1347		/* collapse_huge_page will return with the mmap_lock released */
   1348		collapse_huge_page(mm, address, hpage, node,
   1349				referenced, unmapped);
   1350	}
   1351out:
   1352	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
   1353				     none_or_zero, result, unmapped);
   1354	return ret;
   1355}
   1356
   1357static void collect_mm_slot(struct mm_slot *mm_slot)
   1358{
   1359	struct mm_struct *mm = mm_slot->mm;
   1360
   1361	lockdep_assert_held(&khugepaged_mm_lock);
   1362
   1363	if (khugepaged_test_exit(mm)) {
   1364		/* free mm_slot */
   1365		hash_del(&mm_slot->hash);
   1366		list_del(&mm_slot->mm_node);
   1367
   1368		/*
   1369		 * Not strictly needed because the mm exited already.
   1370		 *
   1371		 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
   1372		 */
   1373
   1374		/* khugepaged_mm_lock actually not necessary for the below */
   1375		free_mm_slot(mm_slot);
   1376		mmdrop(mm);
   1377	}
   1378}
   1379
   1380#ifdef CONFIG_SHMEM
   1381/*
    1382 * Notify khugepaged that the given addr of the mm is a pte-mapped THP. Then
   1383 * khugepaged should try to collapse the page table.
   1384 */
   1385static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
   1386					 unsigned long addr)
   1387{
   1388	struct mm_slot *mm_slot;
   1389
   1390	VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
   1391
   1392	spin_lock(&khugepaged_mm_lock);
   1393	mm_slot = get_mm_slot(mm);
   1394	if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
   1395		mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
   1396	spin_unlock(&khugepaged_mm_lock);
   1397	return 0;
   1398}
   1399
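/*
 * Withdraw a pte-mapped page table: clear and flush the pmd under its lock,
 * fix up the page-table accounting, and free the now unused pte page.
 */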
   1400static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
   1401				  unsigned long addr, pmd_t *pmdp)
   1402{
   1403	spinlock_t *ptl;
   1404	pmd_t pmd;
   1405
   1406	mmap_assert_write_locked(mm);
   1407	ptl = pmd_lock(vma->vm_mm, pmdp);
   1408	pmd = pmdp_collapse_flush(vma, addr, pmdp);
   1409	spin_unlock(ptl);
   1410	mm_dec_nr_ptes(mm);
   1411	page_table_check_pte_clear_range(mm, addr, pmd);
   1412	pte_free(mm, pmd_pgtable(pmd));
   1413}
   1414
   1415/**
   1416 * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
   1417 * address haddr.
   1418 *
   1419 * @mm: process address space where collapse happens
   1420 * @addr: THP collapse address
   1421 *
   1422 * This function checks whether all the PTEs in the PMD are pointing to the
    1423 * right THP. If so, retract the page table so the THP can be refaulted
    1424 * in as pmd-mapped.
   1425 */
   1426void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
   1427{
   1428	unsigned long haddr = addr & HPAGE_PMD_MASK;
   1429	struct vm_area_struct *vma = find_vma(mm, haddr);
   1430	struct page *hpage;
   1431	pte_t *start_pte, *pte;
   1432	pmd_t *pmd;
   1433	spinlock_t *ptl;
   1434	int count = 0;
   1435	int i;
   1436
   1437	if (!vma || !vma->vm_file ||
   1438	    !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
   1439		return;
   1440
   1441	/*
   1442	 * This vm_flags may not have VM_HUGEPAGE if the page was not
    1443	 * collapsed by this mm. But we can still collapse if the page is
    1444	 * a valid THP. Add the extra VM_HUGEPAGE so hugepage_vma_check()
    1445	 * will not fail the vma for missing VM_HUGEPAGE.
   1446	 */
   1447	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
   1448		return;
   1449
   1450	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
   1451	if (userfaultfd_wp(vma))
   1452		return;
   1453
   1454	hpage = find_lock_page(vma->vm_file->f_mapping,
   1455			       linear_page_index(vma, haddr));
   1456	if (!hpage)
   1457		return;
   1458
   1459	if (!PageHead(hpage))
   1460		goto drop_hpage;
   1461
   1462	pmd = mm_find_pmd(mm, haddr);
   1463	if (!pmd)
   1464		goto drop_hpage;
   1465
   1466	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
   1467
   1468	/* step 1: check all mapped PTEs are to the right huge page */
   1469	for (i = 0, addr = haddr, pte = start_pte;
   1470	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
   1471		struct page *page;
   1472
   1473		/* empty pte, skip */
   1474		if (pte_none(*pte))
   1475			continue;
   1476
   1477		/* page swapped out, abort */
   1478		if (!pte_present(*pte))
   1479			goto abort;
   1480
   1481		page = vm_normal_page(vma, addr, *pte);
   1482
   1483		/*
   1484		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
   1485		 * page table, but the new page will not be a subpage of hpage.
   1486		 */
   1487		if (hpage + i != page)
   1488			goto abort;
   1489		count++;
   1490	}
   1491
   1492	/* step 2: adjust rmap */
   1493	for (i = 0, addr = haddr, pte = start_pte;
   1494	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
   1495		struct page *page;
   1496
   1497		if (pte_none(*pte))
   1498			continue;
   1499		page = vm_normal_page(vma, addr, *pte);
   1500		page_remove_rmap(page, vma, false);
   1501	}
   1502
   1503	pte_unmap_unlock(start_pte, ptl);
   1504
   1505	/* step 3: set proper refcount and mm_counters. */
   1506	if (count) {
   1507		page_ref_sub(hpage, count);
   1508		add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
   1509	}
   1510
   1511	/* step 4: collapse pmd */
   1512	collapse_and_free_pmd(mm, vma, haddr, pmd);
   1513drop_hpage:
   1514	unlock_page(hpage);
   1515	put_page(hpage);
   1516	return;
   1517
   1518abort:
   1519	pte_unmap_unlock(start_pte, ptl);
   1520	goto drop_hpage;
   1521}
   1522
   1523static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
   1524{
   1525	struct mm_struct *mm = mm_slot->mm;
   1526	int i;
   1527
   1528	if (likely(mm_slot->nr_pte_mapped_thp == 0))
   1529		return;
   1530
   1531	if (!mmap_write_trylock(mm))
   1532		return;
   1533
   1534	if (unlikely(khugepaged_test_exit(mm)))
   1535		goto out;
   1536
   1537	for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
   1538		collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
   1539
   1540out:
   1541	mm_slot->nr_pte_mapped_thp = 0;
   1542	mmap_write_unlock(mm);
   1543}
   1544
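/*
 * Once a file THP exists in the page cache, walk every VMA that maps the
 * range and drop any page table that still pte-maps it, so that a later
 * fault can install a huge pmd instead.
 */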
   1545static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
   1546{
   1547	struct vm_area_struct *vma;
   1548	struct mm_struct *mm;
   1549	unsigned long addr;
   1550	pmd_t *pmd;
   1551
   1552	i_mmap_lock_write(mapping);
   1553	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
   1554		/*
   1555		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
    1556		 * got written to. These VMAs are likely not worth the cost of
    1557		 * taking mmap_write_lock(mm), as the PMD-mapping is likely to be
    1558		 * split again later.
    1559		 *
    1560		 * Note that the vma->anon_vma check is racy: it can be set up
    1561		 * after the check but before we take mmap_lock by the fault path.
    1562		 * But the page lock would prevent establishing any new ptes of
    1563		 * the page, so we are safe.
    1564		 *
    1565		 * An alternative would be to drop the check, but to check that
    1566		 * the page table is clear before calling pmdp_collapse_flush()
    1567		 * under ptl. That has a higher chance of recovering a THP for the
    1568		 * VMA, but has a higher cost too.
   1569		 */
   1570		if (vma->anon_vma)
   1571			continue;
   1572		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
   1573		if (addr & ~HPAGE_PMD_MASK)
   1574			continue;
   1575		if (vma->vm_end < addr + HPAGE_PMD_SIZE)
   1576			continue;
   1577		mm = vma->vm_mm;
   1578		pmd = mm_find_pmd(mm, addr);
   1579		if (!pmd)
   1580			continue;
   1581		/*
   1582		 * We need exclusive mmap_lock to retract page table.
   1583		 *
   1584		 * We use trylock due to lock inversion: we need to acquire
   1585		 * mmap_lock while holding page lock. Fault path does it in
   1586		 * reverse order. Trylock is a way to avoid deadlock.
   1587		 */
   1588		if (mmap_write_trylock(mm)) {
   1589			/*
   1590			 * When a vma is registered with uffd-wp, we can't
   1591			 * recycle the pmd pgtable because there can be pte
    1592			 * markers installed.  Only skip it, so the rest of the
    1593			 * mm/vma can still have the same file mapped hugely;
    1594			 * however, it'll always be mapped with small pages for
    1595			 * uffd-wp registered ranges.
   1596			 */
   1597			if (!khugepaged_test_exit(mm) && !userfaultfd_wp(vma))
   1598				collapse_and_free_pmd(mm, vma, addr, pmd);
   1599			mmap_write_unlock(mm);
   1600		} else {
   1601			/* Try again later */
   1602			khugepaged_add_pte_mapped_thp(mm, addr);
   1603		}
   1604	}
   1605	i_mmap_unlock_write(mapping);
   1606}
   1607
   1608/**
   1609 * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
   1610 *
   1611 * @mm: process address space where collapse happens
    1612 * @file: the file that the collapse happens on
    1613 * @start: collapse start address
    1614 * @hpage: the newly allocated huge page for the collapse
    1615 * @node: the node the new huge page is allocated from
   1616 *
   1617 * Basic scheme is simple, details are more complex:
   1618 *  - allocate and lock a new huge page;
   1619 *  - scan page cache replacing old pages with the new one
   1620 *    + swap/gup in pages if necessary;
   1621 *    + fill in gaps;
   1622 *    + keep old pages around in case rollback is required;
   1623 *  - if replacing succeeds:
   1624 *    + copy data over;
   1625 *    + free old pages;
   1626 *    + unlock huge page;
    1627 *  - if replacing failed:
   1628 *    + put all pages back and unfreeze them;
   1629 *    + restore gaps in the page cache;
   1630 *    + unlock and free huge page;
   1631 */
   1632static void collapse_file(struct mm_struct *mm,
   1633		struct file *file, pgoff_t start,
   1634		struct page **hpage, int node)
   1635{
   1636	struct address_space *mapping = file->f_mapping;
   1637	gfp_t gfp;
   1638	struct page *new_page;
   1639	pgoff_t index, end = start + HPAGE_PMD_NR;
   1640	LIST_HEAD(pagelist);
   1641	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
   1642	int nr_none = 0, result = SCAN_SUCCEED;
   1643	bool is_shmem = shmem_file(file);
   1644	int nr;
   1645
   1646	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
   1647	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
   1648
   1649	/* Only allocate from the target node */
   1650	gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
   1651
   1652	new_page = khugepaged_alloc_page(hpage, gfp, node);
   1653	if (!new_page) {
   1654		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
   1655		goto out;
   1656	}
   1657
   1658	if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
   1659		result = SCAN_CGROUP_CHARGE_FAIL;
   1660		goto out;
   1661	}
   1662	count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
   1663
   1664	/*
   1665	 * Ensure we have slots for all the pages in the range.  This is
   1666	 * almost certainly a no-op because most of the pages must be present
   1667	 */
   1668	do {
   1669		xas_lock_irq(&xas);
   1670		xas_create_range(&xas);
   1671		if (!xas_error(&xas))
   1672			break;
   1673		xas_unlock_irq(&xas);
   1674		if (!xas_nomem(&xas, GFP_KERNEL)) {
   1675			result = SCAN_FAIL;
   1676			goto out;
   1677		}
   1678	} while (1);
   1679
   1680	__SetPageLocked(new_page);
   1681	if (is_shmem)
   1682		__SetPageSwapBacked(new_page);
   1683	new_page->index = start;
   1684	new_page->mapping = mapping;
   1685
   1686	/*
   1687	 * At this point the new_page is locked and not up-to-date.
   1688	 * It's safe to insert it into the page cache, because nobody would
   1689	 * be able to map it or use it in another way until we unlock it.
   1690	 */
   1691
   1692	xas_set(&xas, start);
   1693	for (index = start; index < end; index++) {
   1694		struct page *page = xas_next(&xas);
   1695
   1696		VM_BUG_ON(index != xas.xa_index);
   1697		if (is_shmem) {
   1698			if (!page) {
   1699				/*
   1700				 * Stop if extent has been truncated or
   1701				 * hole-punched, and is now completely
   1702				 * empty.
   1703				 */
   1704				if (index == start) {
   1705					if (!xas_next_entry(&xas, end - 1)) {
   1706						result = SCAN_TRUNCATED;
   1707						goto xa_locked;
   1708					}
   1709					xas_set(&xas, index);
   1710				}
   1711				if (!shmem_charge(mapping->host, 1)) {
   1712					result = SCAN_FAIL;
   1713					goto xa_locked;
   1714				}
   1715				xas_store(&xas, new_page);
   1716				nr_none++;
   1717				continue;
   1718			}
   1719
   1720			if (xa_is_value(page) || !PageUptodate(page)) {
   1721				xas_unlock_irq(&xas);
   1722				/* swap in or instantiate fallocated page */
   1723				if (shmem_getpage(mapping->host, index, &page,
   1724						  SGP_NOALLOC)) {
   1725					result = SCAN_FAIL;
   1726					goto xa_unlocked;
   1727				}
   1728			} else if (trylock_page(page)) {
   1729				get_page(page);
   1730				xas_unlock_irq(&xas);
   1731			} else {
   1732				result = SCAN_PAGE_LOCK;
   1733				goto xa_locked;
   1734			}
   1735		} else {	/* !is_shmem */
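       			/*
       			 * Read-only file-backed THP (CONFIG_READ_ONLY_THP_FOR_FS):
       			 * pages are brought into the page cache via readahead and
       			 * find_lock_page() rather than shmem_getpage().
       			 */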
   1736			if (!page || xa_is_value(page)) {
   1737				xas_unlock_irq(&xas);
   1738				page_cache_sync_readahead(mapping, &file->f_ra,
   1739							  file, index,
   1740							  end - index);
   1741				/* drain pagevecs to help isolate_lru_page() */
   1742				lru_add_drain();
   1743				page = find_lock_page(mapping, index);
   1744				if (unlikely(page == NULL)) {
   1745					result = SCAN_FAIL;
   1746					goto xa_unlocked;
   1747				}
   1748			} else if (PageDirty(page)) {
   1749				/*
   1750				 * khugepaged only works on a read-only fd,
   1751				 * so this page is dirty because it hasn't
   1752				 * been flushed since the first write. There
   1753				 * won't be any new dirty pages.
   1754				 *
   1755				 * Trigger an async flush here and hope the
   1756				 * writeback is done when khugepaged
   1757				 * revisits this page.
   1758				 *
   1759				 * This is a one-off situation. We are not
   1760				 * forcing writeback in a loop.
   1761				 */
   1762				xas_unlock_irq(&xas);
   1763				filemap_flush(mapping);
   1764				result = SCAN_FAIL;
   1765				goto xa_unlocked;
   1766			} else if (PageWriteback(page)) {
   1767				xas_unlock_irq(&xas);
   1768				result = SCAN_FAIL;
   1769				goto xa_unlocked;
   1770			} else if (trylock_page(page)) {
   1771				get_page(page);
   1772				xas_unlock_irq(&xas);
   1773			} else {
   1774				result = SCAN_PAGE_LOCK;
   1775				goto xa_locked;
   1776			}
   1777		}
   1778
   1779		/*
   1780		 * The page must be locked, so we can drop the i_pages lock
   1781		 * without racing with truncate.
   1782		 */
   1783		VM_BUG_ON_PAGE(!PageLocked(page), page);
   1784
   1785		/* make sure the page is up to date */
   1786		if (unlikely(!PageUptodate(page))) {
   1787			result = SCAN_FAIL;
   1788			goto out_unlock;
   1789		}
   1790
   1791		/*
   1792		 * If file was truncated then extended, or hole-punched, before
   1793		 * we locked the first page, then a THP might be there already.
   1794		 */
   1795		if (PageTransCompound(page)) {
   1796			result = SCAN_PAGE_COMPOUND;
   1797			goto out_unlock;
   1798		}
   1799
   1800		if (page_mapping(page) != mapping) {
   1801			result = SCAN_TRUNCATED;
   1802			goto out_unlock;
   1803		}
   1804
   1805		if (!is_shmem && (PageDirty(page) ||
   1806				  PageWriteback(page))) {
   1807			/*
   1808			 * khugepaged only works on a read-only fd, so this
   1809			 * page is dirty because it hasn't been flushed
   1810			 * since the first write.
   1811			 */
   1812			result = SCAN_FAIL;
   1813			goto out_unlock;
   1814		}
   1815
   1816		if (isolate_lru_page(page)) {
   1817			result = SCAN_DEL_PAGE_LRU;
   1818			goto out_unlock;
   1819		}
   1820
   1821		if (page_has_private(page) &&
   1822		    !try_to_release_page(page, GFP_KERNEL)) {
   1823			result = SCAN_PAGE_HAS_PRIVATE;
   1824			putback_lru_page(page);
   1825			goto out_unlock;
   1826		}
   1827
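       		/*
       		 * Unmap the page from all page tables.  The TLB flush is
       		 * batched (TTU_BATCH_FLUSH) and issued later via
       		 * try_to_unmap_flush().
       		 */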
   1828		if (page_mapped(page))
   1829			try_to_unmap(page_folio(page),
   1830					TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
   1831
   1832		xas_lock_irq(&xas);
   1833		xas_set(&xas, index);
   1834
   1835		VM_BUG_ON_PAGE(page != xas_load(&xas), page);
   1836
   1837		/*
   1838		 * The page is expected to have page_count() == 3:
   1839		 *  - we hold a pin on it;
   1840		 *  - one reference from page cache;
   1841		 *  - one from isolate_lru_page;
   1842		 */
   1843		if (!page_ref_freeze(page, 3)) {
   1844			result = SCAN_PAGE_COUNT;
   1845			xas_unlock_irq(&xas);
   1846			putback_lru_page(page);
   1847			goto out_unlock;
   1848		}
   1849
   1850		/*
   1851		 * Add the page to the list to be able to undo the collapse if
   1852		 * something goes wrong.
   1853		 */
   1854		list_add_tail(&page->lru, &pagelist);
   1855
   1856		/* Finally, replace with the new page. */
   1857		xas_store(&xas, new_page);
   1858		continue;
   1859out_unlock:
   1860		unlock_page(page);
   1861		put_page(page);
   1862		goto xa_unlocked;
   1863	}
   1864	nr = thp_nr_pages(new_page);
   1865
   1866	if (is_shmem)
   1867		__mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
   1868	else {
   1869		__mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
   1870		filemap_nr_thps_inc(mapping);
   1871		/*
   1872		 * Paired with smp_mb() in do_dentry_open() to ensure
   1873		 * i_writecount is up to date and the update to nr_thps is
   1874		 * visible. Ensures the page cache will be truncated if the
   1875		 * file is opened writable.
   1876		 */
   1877		smp_mb();
   1878		if (inode_is_open_for_write(mapping->host)) {
   1879			result = SCAN_FAIL;
   1880			__mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
   1881			filemap_nr_thps_dec(mapping);
   1882			goto xa_locked;
   1883		}
   1884	}
   1885
   1886	if (nr_none) {
   1887		__mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
   1888		if (is_shmem)
   1889			__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
   1890	}
   1891
   1892	/* Join all the small entries into a single multi-index entry */
   1893	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
   1894	xas_store(&xas, new_page);
   1895xa_locked:
   1896	xas_unlock_irq(&xas);
   1897xa_unlocked:
   1898
   1899	/*
   1900	 * If collapse is successful, flush must be done now before copying.
   1901	 * If collapse is unsuccessful, does flush actually need to be done?
   1902	 * Do it anyway, to clear the state.
   1903	 */
   1904	try_to_unmap_flush();
   1905
   1906	if (result == SCAN_SUCCEED) {
   1907		struct page *page, *tmp;
   1908
   1909		/*
   1910		 * Replacing old pages with new one has succeeded, now we
   1911		 * need to copy the content and free the old pages.
   1912		 */
   1913		index = start;
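       		/*
       		 * Indices that had no old page (the nr_none holes) are
       		 * zero-filled; everything else is copied from the old pages.
       		 */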
   1914		list_for_each_entry_safe(page, tmp, &pagelist, lru) {
   1915			while (index < page->index) {
   1916				clear_highpage(new_page + (index % HPAGE_PMD_NR));
   1917				index++;
   1918			}
   1919			copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
   1920					page);
   1921			list_del(&page->lru);
   1922			page->mapping = NULL;
   1923			page_ref_unfreeze(page, 1);
   1924			ClearPageActive(page);
   1925			ClearPageUnevictable(page);
   1926			unlock_page(page);
   1927			put_page(page);
   1928			index++;
   1929		}
   1930		while (index < end) {
   1931			clear_highpage(new_page + (index % HPAGE_PMD_NR));
   1932			index++;
   1933		}
   1934
   1935		SetPageUptodate(new_page);
   1936		page_ref_add(new_page, HPAGE_PMD_NR - 1);
   1937		if (is_shmem)
   1938			set_page_dirty(new_page);
   1939		lru_cache_add(new_page);
   1940
   1941		/*
   1942		 * Remove pte page tables, so we can re-fault the page as huge.
   1943		 */
   1944		retract_page_tables(mapping, start);
   1945		*hpage = NULL;
   1946
   1947		khugepaged_pages_collapsed++;
   1948	} else {
   1949		struct page *page;
   1950
   1951		/* Something went wrong: roll back page cache changes */
   1952		xas_lock_irq(&xas);
   1953		mapping->nrpages -= nr_none;
   1954
   1955		if (is_shmem)
   1956			shmem_uncharge(mapping->host, nr_none);
   1957
   1958		xas_set(&xas, start);
   1959		xas_for_each(&xas, page, end - 1) {
   1960			page = list_first_entry_or_null(&pagelist,
   1961					struct page, lru);
   1962			if (!page || xas.xa_index < page->index) {
   1963				if (!nr_none)
   1964					break;
   1965				nr_none--;
   1966				/* Put holes back where they were */
   1967				xas_store(&xas, NULL);
   1968				continue;
   1969			}
   1970
   1971			VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
   1972
   1973			/* Unfreeze the page. */
   1974			list_del(&page->lru);
   1975			page_ref_unfreeze(page, 2);
   1976			xas_store(&xas, page);
   1977			xas_pause(&xas);
   1978			xas_unlock_irq(&xas);
   1979			unlock_page(page);
   1980			putback_lru_page(page);
   1981			xas_lock_irq(&xas);
   1982		}
   1983		VM_BUG_ON(nr_none);
   1984		xas_unlock_irq(&xas);
   1985
   1986		new_page->mapping = NULL;
   1987	}
   1988
   1989	unlock_page(new_page);
   1990out:
   1991	VM_BUG_ON(!list_empty(&pagelist));
   1992	if (!IS_ERR_OR_NULL(*hpage))
   1993		mem_cgroup_uncharge(page_folio(*hpage));
   1994	/* TODO: tracepoints */
   1995}
   1996
   1997static void khugepaged_scan_file(struct mm_struct *mm,
   1998		struct file *file, pgoff_t start, struct page **hpage)
   1999{
   2000	struct page *page = NULL;
   2001	struct address_space *mapping = file->f_mapping;
   2002	XA_STATE(xas, &mapping->i_pages, start);
   2003	int present, swap;
   2004	int node = NUMA_NO_NODE;
   2005	int result = SCAN_SUCCEED;
   2006
   2007	present = 0;
   2008	swap = 0;
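       	/*
       	 * khugepaged_node_load[] counts how many pages in the range sit on
       	 * each node; khugepaged_find_target_node() later allocates the huge
       	 * page on the node with the highest count.
       	 */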
   2009	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
   2010	rcu_read_lock();
   2011	xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
   2012		if (xas_retry(&xas, page))
   2013			continue;
   2014
   2015		if (xa_is_value(page)) {
   2016			if (++swap > khugepaged_max_ptes_swap) {
   2017				result = SCAN_EXCEED_SWAP_PTE;
   2018				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
   2019				break;
   2020			}
   2021			continue;
   2022		}
   2023
   2024		/*
   2025		 * XXX: khugepaged should compact smaller compound pages
   2026		 * into a PMD-sized page.
   2027		 */
   2028		if (PageTransCompound(page)) {
   2029			result = SCAN_PAGE_COMPOUND;
   2030			break;
   2031		}
   2032
   2033		node = page_to_nid(page);
   2034		if (khugepaged_scan_abort(node)) {
   2035			result = SCAN_SCAN_ABORT;
   2036			break;
   2037		}
   2038		khugepaged_node_load[node]++;
   2039
   2040		if (!PageLRU(page)) {
   2041			result = SCAN_PAGE_LRU;
   2042			break;
   2043		}
   2044
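       		/*
       		 * Expected references: one from the page cache, one per
       		 * mapping, plus one if the page has private data (e.g.
       		 * buffer heads).  Anything beyond that is an unexpected pin,
       		 * so abort the scan.
       		 */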
   2045		if (page_count(page) !=
   2046		    1 + page_mapcount(page) + page_has_private(page)) {
   2047			result = SCAN_PAGE_COUNT;
   2048			break;
   2049		}
   2050
   2051		/*
   2052		 * We probably should check if the page is referenced here, but
   2053		 * nobody would transfer pte_young() to PageReferenced() for us.
   2054		 * And an rmap walk here is just too costly...
   2055		 */
   2056
   2057		present++;
   2058
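       		/*
       		 * Periodically drop the RCU read lock and reschedule;
       		 * xas_pause() lets the walk continue safely afterwards.
       		 */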
   2059		if (need_resched()) {
   2060			xas_pause(&xas);
   2061			cond_resched_rcu();
   2062		}
   2063	}
   2064	rcu_read_unlock();
   2065
   2066	if (result == SCAN_SUCCEED) {
   2067		if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
   2068			result = SCAN_EXCEED_NONE_PTE;
   2069			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
   2070		} else {
   2071			node = khugepaged_find_target_node();
   2072			collapse_file(mm, file, start, hpage, node);
   2073		}
   2074	}
   2075
   2076	/* TODO: tracepoints */
   2077}
   2078#else
   2079static void khugepaged_scan_file(struct mm_struct *mm,
   2080		struct file *file, pgoff_t start, struct page **hpage)
   2081{
   2082	BUILD_BUG();
   2083}
   2084
   2085static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
   2086{
   2087}
   2088#endif
   2089
   2090static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
   2091					    struct page **hpage)
   2092	__releases(&khugepaged_mm_lock)
   2093	__acquires(&khugepaged_mm_lock)
   2094{
   2095	struct mm_slot *mm_slot;
   2096	struct mm_struct *mm;
   2097	struct vm_area_struct *vma;
   2098	int progress = 0;
   2099
   2100	VM_BUG_ON(!pages);
   2101	lockdep_assert_held(&khugepaged_mm_lock);
   2102
   2103	if (khugepaged_scan.mm_slot)
   2104		mm_slot = khugepaged_scan.mm_slot;
   2105	else {
   2106		mm_slot = list_entry(khugepaged_scan.mm_head.next,
   2107				     struct mm_slot, mm_node);
   2108		khugepaged_scan.address = 0;
   2109		khugepaged_scan.mm_slot = mm_slot;
   2110	}
   2111	spin_unlock(&khugepaged_mm_lock);
   2112	khugepaged_collapse_pte_mapped_thps(mm_slot);
   2113
   2114	mm = mm_slot->mm;
   2115	/*
   2116	 * Don't wait for the mmap lock (to avoid long wait times). Just move to
   2117	 * the next mm on the list.
   2118	 */
   2119	vma = NULL;
   2120	if (unlikely(!mmap_read_trylock(mm)))
   2121		goto breakouterloop_mmap_lock;
   2122	if (likely(!khugepaged_test_exit(mm)))
   2123		vma = find_vma(mm, khugepaged_scan.address);
   2124
   2125	progress++;
   2126	for (; vma; vma = vma->vm_next) {
   2127		unsigned long hstart, hend;
   2128
   2129		cond_resched();
   2130		if (unlikely(khugepaged_test_exit(mm))) {
   2131			progress++;
   2132			break;
   2133		}
   2134		if (!hugepage_vma_check(vma, vma->vm_flags)) {
   2135skip:
   2136			progress++;
   2137			continue;
   2138		}
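       		/*
       		 * Clamp the scan window to the part of the VMA that can hold
       		 * whole, PMD-aligned huge pages.
       		 */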
   2139		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
   2140		hend = vma->vm_end & HPAGE_PMD_MASK;
   2141		if (hstart >= hend)
   2142			goto skip;
   2143		if (khugepaged_scan.address > hend)
   2144			goto skip;
   2145		if (khugepaged_scan.address < hstart)
   2146			khugepaged_scan.address = hstart;
   2147		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
   2148		if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma))
   2149			goto skip;
   2150
   2151		while (khugepaged_scan.address < hend) {
   2152			int ret;
   2153			cond_resched();
   2154			if (unlikely(khugepaged_test_exit(mm)))
   2155				goto breakouterloop;
   2156
   2157			VM_BUG_ON(khugepaged_scan.address < hstart ||
   2158				  khugepaged_scan.address + HPAGE_PMD_SIZE >
   2159				  hend);
   2160			if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
   2161				struct file *file = get_file(vma->vm_file);
   2162				pgoff_t pgoff = linear_page_index(vma,
   2163						khugepaged_scan.address);
   2164
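       				/*
       				 * File collapse works on the page cache and
       				 * does not need the mmap lock; drop it here
       				 * and report via ret that it was released.
       				 */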
   2165				mmap_read_unlock(mm);
   2166				ret = 1;
   2167				khugepaged_scan_file(mm, file, pgoff, hpage);
   2168				fput(file);
   2169			} else {
   2170				ret = khugepaged_scan_pmd(mm, vma,
   2171						khugepaged_scan.address,
   2172						hpage);
   2173			}
   2174			/* move to next address */
   2175			khugepaged_scan.address += HPAGE_PMD_SIZE;
   2176			progress += HPAGE_PMD_NR;
   2177			if (ret)
   2178				/* we released mmap_lock so break loop */
   2179				goto breakouterloop_mmap_lock;
   2180			if (progress >= pages)
   2181				goto breakouterloop;
   2182		}
   2183	}
   2184breakouterloop:
   2185	mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
   2186breakouterloop_mmap_lock:
   2187
   2188	spin_lock(&khugepaged_mm_lock);
   2189	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
   2190	/*
   2191	 * Release the current mm_slot if this mm is about to die, or
   2192	 * if we scanned all vmas of this mm.
   2193	 */
   2194	if (khugepaged_test_exit(mm) || !vma) {
   2195		/*
   2196		 * Make sure that if mm_users reaches zero while khugepaged
   2197		 * runs here, khugepaged_exit() will find mm_slot no longer
   2198		 * pointing to the exiting mm.
   2199		 */
   2200		if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
   2201			khugepaged_scan.mm_slot = list_entry(
   2202				mm_slot->mm_node.next,
   2203				struct mm_slot, mm_node);
   2204			khugepaged_scan.address = 0;
   2205		} else {
   2206			khugepaged_scan.mm_slot = NULL;
   2207			khugepaged_full_scans++;
   2208		}
   2209
   2210		collect_mm_slot(mm_slot);
   2211	}
   2212
   2213	return progress;
   2214}
   2215
   2216static int khugepaged_has_work(void)
   2217{
   2218	return !list_empty(&khugepaged_scan.mm_head) &&
   2219		khugepaged_enabled();
   2220}
   2221
   2222static int khugepaged_wait_event(void)
   2223{
   2224	return !list_empty(&khugepaged_scan.mm_head) ||
   2225		kthread_should_stop();
   2226}
   2227
   2228static void khugepaged_do_scan(void)
   2229{
   2230	struct page *hpage = NULL;
   2231	unsigned int progress = 0, pass_through_head = 0;
   2232	unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
   2233	bool wait = true;
   2234
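       	/* Drain per-CPU LRU caches so pages can be isolated for collapse. */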
   2235	lru_add_drain_all();
   2236
   2237	while (progress < pages) {
   2238		if (!khugepaged_prealloc_page(&hpage, &wait))
   2239			break;
   2240
   2241		cond_resched();
   2242
   2243		if (unlikely(kthread_should_stop() || try_to_freeze()))
   2244			break;
   2245
   2246		spin_lock(&khugepaged_mm_lock);
   2247		if (!khugepaged_scan.mm_slot)
   2248			pass_through_head++;
   2249		if (khugepaged_has_work() &&
   2250		    pass_through_head < 2)
   2251			progress += khugepaged_scan_mm_slot(pages - progress,
   2252							    &hpage);
   2253		else
   2254			progress = pages;
   2255		spin_unlock(&khugepaged_mm_lock);
   2256	}
   2257
   2258	if (!IS_ERR_OR_NULL(hpage))
   2259		put_page(hpage);
   2260}
   2261
   2262static bool khugepaged_should_wakeup(void)
   2263{
   2264	return kthread_should_stop() ||
   2265	       time_after_eq(jiffies, khugepaged_sleep_expire);
   2266}
   2267
   2268static void khugepaged_wait_work(void)
   2269{
   2270	if (khugepaged_has_work()) {
   2271		const unsigned long scan_sleep_jiffies =
   2272			msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
   2273
   2274		if (!scan_sleep_jiffies)
   2275			return;
   2276
   2277		khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
   2278		wait_event_freezable_timeout(khugepaged_wait,
   2279					     khugepaged_should_wakeup(),
   2280					     scan_sleep_jiffies);
   2281		return;
   2282	}
   2283
   2284	if (khugepaged_enabled())
   2285		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
   2286}
   2287
   2288static int khugepaged(void *none)
   2289{
   2290	struct mm_slot *mm_slot;
   2291
   2292	set_freezable();
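       	/* Run at the lowest priority; collapsing is opportunistic work. */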
   2293	set_user_nice(current, MAX_NICE);
   2294
   2295	while (!kthread_should_stop()) {
   2296		khugepaged_do_scan();
   2297		khugepaged_wait_work();
   2298	}
   2299
   2300	spin_lock(&khugepaged_mm_lock);
   2301	mm_slot = khugepaged_scan.mm_slot;
   2302	khugepaged_scan.mm_slot = NULL;
   2303	if (mm_slot)
   2304		collect_mm_slot(mm_slot);
   2305	spin_unlock(&khugepaged_mm_lock);
   2306	return 0;
   2307}
   2308
   2309static void set_recommended_min_free_kbytes(void)
   2310{
   2311	struct zone *zone;
   2312	int nr_zones = 0;
   2313	unsigned long recommended_min;
   2314
   2315	if (!khugepaged_enabled()) {
   2316		calculate_min_free_kbytes();
   2317		goto update_wmarks;
   2318	}
   2319
   2320	for_each_populated_zone(zone) {
   2321		/*
   2322		 * We don't need to worry about fragmentation of
   2323		 * ZONE_MOVABLE since it only has movable pages.
   2324		 */
   2325		if (zone_idx(zone) > gfp_zone(GFP_USER))
   2326			continue;
   2327
   2328		nr_zones++;
   2329	}
   2330
   2331	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
   2332	recommended_min = pageblock_nr_pages * nr_zones * 2;
   2333
   2334	/*
   2335	 * Make sure that on average at least two pageblocks are almost free
   2336	 * of another type, one for a migratetype to fall back to and a
   2337	 * second to avoid subsequent fallbacks of other types. There are 3
   2338	 * MIGRATE_TYPES we care about.
   2339	 */
   2340	recommended_min += pageblock_nr_pages * nr_zones *
   2341			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
   2342
   2343	/* don't ever allow reserving more than 5% of lowmem */
   2344	recommended_min = min(recommended_min,
   2345			      (unsigned long) nr_free_buffer_pages() / 20);
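       	/* min_free_kbytes is in KiB: convert the page count. */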
   2346	recommended_min <<= (PAGE_SHIFT-10);
   2347
   2348	if (recommended_min > min_free_kbytes) {
   2349		if (user_min_free_kbytes >= 0)
   2350			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
   2351				min_free_kbytes, recommended_min);
   2352
   2353		min_free_kbytes = recommended_min;
   2354	}
   2355
   2356update_wmarks:
   2357	setup_per_zone_wmarks();
   2358}
   2359
   2360int start_stop_khugepaged(void)
   2361{
   2362	int err = 0;
   2363
   2364	mutex_lock(&khugepaged_mutex);
   2365	if (khugepaged_enabled()) {
   2366		if (!khugepaged_thread)
   2367			khugepaged_thread = kthread_run(khugepaged, NULL,
   2368							"khugepaged");
   2369		if (IS_ERR(khugepaged_thread)) {
   2370			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
   2371			err = PTR_ERR(khugepaged_thread);
   2372			khugepaged_thread = NULL;
   2373			goto fail;
   2374		}
   2375
   2376		if (!list_empty(&khugepaged_scan.mm_head))
   2377			wake_up_interruptible(&khugepaged_wait);
   2378	} else if (khugepaged_thread) {
   2379		kthread_stop(khugepaged_thread);
   2380		khugepaged_thread = NULL;
   2381	}
   2382	set_recommended_min_free_kbytes();
   2383fail:
   2384	mutex_unlock(&khugepaged_mutex);
   2385	return err;
   2386}
   2387
   2388void khugepaged_min_free_kbytes_update(void)
   2389{
   2390	mutex_lock(&khugepaged_mutex);
   2391	if (khugepaged_enabled() && khugepaged_thread)
   2392		set_recommended_min_free_kbytes();
   2393	mutex_unlock(&khugepaged_mutex);
   2394}