cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

huge_memory.c (86442B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 *  Copyright (C) 2009  Red Hat, Inc.
      4 */
      5
      6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      7
      8#include <linux/mm.h>
      9#include <linux/sched.h>
     10#include <linux/sched/mm.h>
     11#include <linux/sched/coredump.h>
     12#include <linux/sched/numa_balancing.h>
     13#include <linux/highmem.h>
     14#include <linux/hugetlb.h>
     15#include <linux/mmu_notifier.h>
     16#include <linux/rmap.h>
     17#include <linux/swap.h>
     18#include <linux/shrinker.h>
     19#include <linux/mm_inline.h>
     20#include <linux/swapops.h>
     21#include <linux/dax.h>
     22#include <linux/khugepaged.h>
     23#include <linux/freezer.h>
     24#include <linux/pfn_t.h>
     25#include <linux/mman.h>
     26#include <linux/memremap.h>
     27#include <linux/pagemap.h>
     28#include <linux/debugfs.h>
     29#include <linux/migrate.h>
     30#include <linux/hashtable.h>
     31#include <linux/userfaultfd_k.h>
     32#include <linux/page_idle.h>
     33#include <linux/shmem_fs.h>
     34#include <linux/oom.h>
     35#include <linux/numa.h>
     36#include <linux/page_owner.h>
     37#include <linux/sched/sysctl.h>
     38
     39#include <asm/tlb.h>
     40#include <asm/pgalloc.h>
     41#include "internal.h"
     42#include "swap.h"
     43
     44#define CREATE_TRACE_POINTS
     45#include <trace/events/thp.h>
     46
     47/*
     48 * By default, transparent hugepage support is disabled in order to avoid
     49 * risking an increased memory footprint for applications that are not
     50 * guaranteed to benefit from it. When transparent hugepage support is
     51 * enabled, it is for all mappings, and khugepaged scans all mappings.
     52 * Defrag is invoked by khugepaged hugepage allocations and by page faults
     53 * for all hugepage allocations.
     54 */
     55unsigned long transparent_hugepage_flags __read_mostly =
     56#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
     57	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
     58#endif
     59#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
     60	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
     61#endif
     62	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
     63	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
     64	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
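
/*
 * Illustrative userspace sketch (assumes sysfs mounted at /sys): the flag
 * bits above back the tunables rendered by enabled_show()/defrag_show()
 * below, so the current policy can be read like this. The bracketed word
 * marks the active selection, e.g. "always [madvise] never".
 */
#include <stdio.h>

int main(void)
{
	static const char *knobs[] = {
		"/sys/kernel/mm/transparent_hugepage/enabled",
		"/sys/kernel/mm/transparent_hugepage/defrag",
	};
	char line[256];

	for (int i = 0; i < 2; i++) {
		FILE *f = fopen(knobs[i], "r");

		if (f && fgets(line, sizeof(line), f))
			printf("%s: %s", knobs[i], line);
		if (f)
			fclose(f);
	}
	return 0;
}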
     65
     66static struct shrinker deferred_split_shrinker;
     67
     68static atomic_t huge_zero_refcount;
     69struct page *huge_zero_page __read_mostly;
     70unsigned long huge_zero_pfn __read_mostly = ~0UL;
     71
     72bool transparent_hugepage_active(struct vm_area_struct *vma)
     73{
     74	/* The addr is used to check if the vma size fits */
     75	unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
     76
     77	if (!transhuge_vma_suitable(vma, addr))
     78		return false;
     79	if (vma_is_anonymous(vma))
     80		return __transparent_hugepage_enabled(vma);
     81	if (vma_is_shmem(vma))
     82		return shmem_huge_enabled(vma);
     83	if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma))
     84		return true;
     85
     86	return false;
     87}
     88
     89static bool get_huge_zero_page(void)
     90{
     91	struct page *zero_page;
     92retry:
     93	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
     94		return true;
     95
     96	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
     97			HPAGE_PMD_ORDER);
     98	if (!zero_page) {
     99		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
    100		return false;
    101	}
    102	count_vm_event(THP_ZERO_PAGE_ALLOC);
    103	preempt_disable();
    104	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
    105		preempt_enable();
    106		__free_pages(zero_page, compound_order(zero_page));
    107		goto retry;
    108	}
    109	WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
    110
     111	/* We take an additional reference here. It will be put back by the shrinker. */
    112	atomic_set(&huge_zero_refcount, 2);
    113	preempt_enable();
    114	return true;
    115}
    116
    117static void put_huge_zero_page(void)
    118{
    119	/*
    120	 * Counter should never go to zero here. Only shrinker can put
    121	 * last reference.
    122	 */
    123	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
    124}
    125
    126struct page *mm_get_huge_zero_page(struct mm_struct *mm)
    127{
    128	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
    129		return READ_ONCE(huge_zero_page);
    130
    131	if (!get_huge_zero_page())
    132		return NULL;
    133
    134	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
    135		put_huge_zero_page();
    136
    137	return READ_ONCE(huge_zero_page);
    138}
    139
    140void mm_put_huge_zero_page(struct mm_struct *mm)
    141{
    142	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
    143		put_huge_zero_page();
    144}
    145
    146static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
    147					struct shrink_control *sc)
    148{
    149	/* we can free zero page only if last reference remains */
    150	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
    151}
    152
    153static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
    154				       struct shrink_control *sc)
    155{
    156	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
    157		struct page *zero_page = xchg(&huge_zero_page, NULL);
    158		BUG_ON(zero_page == NULL);
    159		WRITE_ONCE(huge_zero_pfn, ~0UL);
    160		__free_pages(zero_page, compound_order(zero_page));
    161		return HPAGE_PMD_NR;
    162	}
    163
    164	return 0;
    165}
    166
    167static struct shrinker huge_zero_page_shrinker = {
    168	.count_objects = shrink_huge_zero_page_count,
    169	.scan_objects = shrink_huge_zero_page_scan,
    170	.seeks = DEFAULT_SEEKS,
    171};
    172
    173#ifdef CONFIG_SYSFS
    174static ssize_t enabled_show(struct kobject *kobj,
    175			    struct kobj_attribute *attr, char *buf)
    176{
    177	const char *output;
    178
    179	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
    180		output = "[always] madvise never";
    181	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
    182			  &transparent_hugepage_flags))
    183		output = "always [madvise] never";
    184	else
    185		output = "always madvise [never]";
    186
    187	return sysfs_emit(buf, "%s\n", output);
    188}
    189
    190static ssize_t enabled_store(struct kobject *kobj,
    191			     struct kobj_attribute *attr,
    192			     const char *buf, size_t count)
    193{
    194	ssize_t ret = count;
    195
    196	if (sysfs_streq(buf, "always")) {
    197		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
    198		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
    199	} else if (sysfs_streq(buf, "madvise")) {
    200		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
    201		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
    202	} else if (sysfs_streq(buf, "never")) {
    203		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
    204		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
    205	} else
    206		ret = -EINVAL;
    207
    208	if (ret > 0) {
    209		int err = start_stop_khugepaged();
    210		if (err)
    211			ret = err;
    212	}
    213	return ret;
    214}
    215static struct kobj_attribute enabled_attr =
    216	__ATTR(enabled, 0644, enabled_show, enabled_store);
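
/*
 * Userspace sketch of the store side (assumptions: run as root, sysfs at
 * /sys). enabled_store() above accepts exactly "always", "madvise" or
 * "never"; sysfs_streq() tolerates a trailing newline, so a plain write(2)
 * of the word is enough.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *val = "madvise";
	int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0)
		perror("set THP enabled");
	if (fd >= 0)
		close(fd);
	return 0;
}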
    217
    218ssize_t single_hugepage_flag_show(struct kobject *kobj,
    219				  struct kobj_attribute *attr, char *buf,
    220				  enum transparent_hugepage_flag flag)
    221{
    222	return sysfs_emit(buf, "%d\n",
    223			  !!test_bit(flag, &transparent_hugepage_flags));
    224}
    225
    226ssize_t single_hugepage_flag_store(struct kobject *kobj,
    227				 struct kobj_attribute *attr,
    228				 const char *buf, size_t count,
    229				 enum transparent_hugepage_flag flag)
    230{
    231	unsigned long value;
    232	int ret;
    233
    234	ret = kstrtoul(buf, 10, &value);
    235	if (ret < 0)
    236		return ret;
    237	if (value > 1)
    238		return -EINVAL;
    239
    240	if (value)
    241		set_bit(flag, &transparent_hugepage_flags);
    242	else
    243		clear_bit(flag, &transparent_hugepage_flags);
    244
    245	return count;
    246}
    247
    248static ssize_t defrag_show(struct kobject *kobj,
    249			   struct kobj_attribute *attr, char *buf)
    250{
    251	const char *output;
    252
    253	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
    254		     &transparent_hugepage_flags))
    255		output = "[always] defer defer+madvise madvise never";
    256	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
    257			  &transparent_hugepage_flags))
    258		output = "always [defer] defer+madvise madvise never";
    259	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
    260			  &transparent_hugepage_flags))
    261		output = "always defer [defer+madvise] madvise never";
    262	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
    263			  &transparent_hugepage_flags))
    264		output = "always defer defer+madvise [madvise] never";
    265	else
    266		output = "always defer defer+madvise madvise [never]";
    267
    268	return sysfs_emit(buf, "%s\n", output);
    269}
    270
    271static ssize_t defrag_store(struct kobject *kobj,
    272			    struct kobj_attribute *attr,
    273			    const char *buf, size_t count)
    274{
    275	if (sysfs_streq(buf, "always")) {
    276		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
    277		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
    278		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
    279		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
    280	} else if (sysfs_streq(buf, "defer+madvise")) {
    281		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
    282		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
    283		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
    284		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
    285	} else if (sysfs_streq(buf, "defer")) {
    286		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
    287		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
    288		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
    289		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
    290	} else if (sysfs_streq(buf, "madvise")) {
    291		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
    292		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
    293		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
    294		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
    295	} else if (sysfs_streq(buf, "never")) {
    296		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
    297		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
    298		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
    299		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
    300	} else
    301		return -EINVAL;
    302
    303	return count;
    304}
    305static struct kobj_attribute defrag_attr =
    306	__ATTR(defrag, 0644, defrag_show, defrag_store);
    307
    308static ssize_t use_zero_page_show(struct kobject *kobj,
    309				  struct kobj_attribute *attr, char *buf)
    310{
    311	return single_hugepage_flag_show(kobj, attr, buf,
    312					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
    313}
    314static ssize_t use_zero_page_store(struct kobject *kobj,
    315		struct kobj_attribute *attr, const char *buf, size_t count)
    316{
    317	return single_hugepage_flag_store(kobj, attr, buf, count,
    318				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
    319}
    320static struct kobj_attribute use_zero_page_attr =
    321	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
    322
    323static ssize_t hpage_pmd_size_show(struct kobject *kobj,
    324				   struct kobj_attribute *attr, char *buf)
    325{
    326	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
    327}
    328static struct kobj_attribute hpage_pmd_size_attr =
    329	__ATTR_RO(hpage_pmd_size);
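
/*
 * Userspace sketch: hpage_pmd_size reports HPAGE_PMD_SIZE in bytes (2 MiB
 * on x86-64 with 4 KiB pages), the alignment a mapping needs before a
 * PMD-sized THP can back it. Assumes the standard sysfs path.
 */
#include <stdio.h>

int main(void)
{
	unsigned long pmd_size = 0;
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");

	if (f && fscanf(f, "%lu", &pmd_size) == 1)
		printf("PMD hugepage size: %lu bytes\n", pmd_size);
	if (f)
		fclose(f);
	return 0;
}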
    330
    331static struct attribute *hugepage_attr[] = {
    332	&enabled_attr.attr,
    333	&defrag_attr.attr,
    334	&use_zero_page_attr.attr,
    335	&hpage_pmd_size_attr.attr,
    336#ifdef CONFIG_SHMEM
    337	&shmem_enabled_attr.attr,
    338#endif
    339	NULL,
    340};
    341
    342static const struct attribute_group hugepage_attr_group = {
    343	.attrs = hugepage_attr,
    344};
    345
    346static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
    347{
    348	int err;
    349
    350	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
    351	if (unlikely(!*hugepage_kobj)) {
    352		pr_err("failed to create transparent hugepage kobject\n");
    353		return -ENOMEM;
    354	}
    355
    356	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
    357	if (err) {
    358		pr_err("failed to register transparent hugepage group\n");
    359		goto delete_obj;
    360	}
    361
    362	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
    363	if (err) {
    364		pr_err("failed to register transparent hugepage group\n");
    365		goto remove_hp_group;
    366	}
    367
    368	return 0;
    369
    370remove_hp_group:
    371	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
    372delete_obj:
    373	kobject_put(*hugepage_kobj);
    374	return err;
    375}
    376
    377static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
    378{
    379	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
    380	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
    381	kobject_put(hugepage_kobj);
    382}
    383#else
    384static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
    385{
    386	return 0;
    387}
    388
    389static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
    390{
    391}
    392#endif /* CONFIG_SYSFS */
    393
    394static int __init hugepage_init(void)
    395{
    396	int err;
    397	struct kobject *hugepage_kobj;
    398
    399	if (!has_transparent_hugepage()) {
    400		/*
    401		 * Hardware doesn't support hugepages, hence disable
    402		 * DAX PMD support.
    403		 */
    404		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
    405		return -EINVAL;
    406	}
    407
    408	/*
    409	 * hugepages can't be allocated by the buddy allocator
    410	 */
    411	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
    412	/*
    413	 * we use page->mapping and page->index in second tail page
    414	 * as list_head: assuming THP order >= 2
    415	 */
    416	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
    417
    418	err = hugepage_init_sysfs(&hugepage_kobj);
    419	if (err)
    420		goto err_sysfs;
    421
    422	err = khugepaged_init();
    423	if (err)
    424		goto err_slab;
    425
    426	err = register_shrinker(&huge_zero_page_shrinker);
    427	if (err)
    428		goto err_hzp_shrinker;
    429	err = register_shrinker(&deferred_split_shrinker);
    430	if (err)
    431		goto err_split_shrinker;
    432
    433	/*
    434	 * By default disable transparent hugepages on smaller systems,
    435	 * where the extra memory used could hurt more than TLB overhead
    436	 * is likely to save.  The admin can still enable it through /sys.
    437	 */
    438	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
    439		transparent_hugepage_flags = 0;
    440		return 0;
    441	}
    442
    443	err = start_stop_khugepaged();
    444	if (err)
    445		goto err_khugepaged;
    446
    447	return 0;
    448err_khugepaged:
    449	unregister_shrinker(&deferred_split_shrinker);
    450err_split_shrinker:
    451	unregister_shrinker(&huge_zero_page_shrinker);
    452err_hzp_shrinker:
    453	khugepaged_destroy();
    454err_slab:
    455	hugepage_exit_sysfs(hugepage_kobj);
    456err_sysfs:
    457	return err;
    458}
    459subsys_initcall(hugepage_init);
    460
    461static int __init setup_transparent_hugepage(char *str)
    462{
    463	int ret = 0;
    464	if (!str)
    465		goto out;
    466	if (!strcmp(str, "always")) {
    467		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
    468			&transparent_hugepage_flags);
    469		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
    470			  &transparent_hugepage_flags);
    471		ret = 1;
    472	} else if (!strcmp(str, "madvise")) {
    473		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
    474			  &transparent_hugepage_flags);
    475		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
    476			&transparent_hugepage_flags);
    477		ret = 1;
    478	} else if (!strcmp(str, "never")) {
    479		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
    480			  &transparent_hugepage_flags);
    481		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
    482			  &transparent_hugepage_flags);
    483		ret = 1;
    484	}
    485out:
    486	if (!ret)
    487		pr_warn("transparent_hugepage= cannot parse, ignored\n");
    488	return ret;
    489}
    490__setup("transparent_hugepage=", setup_transparent_hugepage);
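
/*
 * The same three policies can be selected at boot via the kernel command
 * line, e.g. "transparent_hugepage=madvise". A minimal userspace check
 * (assumes procfs mounted at /proc):
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char cmdline[4096] = "";
	FILE *f = fopen("/proc/cmdline", "r");

	if (f && fgets(cmdline, sizeof(cmdline), f)) {
		const char *p = strstr(cmdline, "transparent_hugepage=");

		if (p)
			printf("%.*s\n", (int)strcspn(p, " \n"), p);
		else
			printf("transparent_hugepage= not set (built-in default applies)\n");
	}
	if (f)
		fclose(f);
	return 0;
}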
    491
    492pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
    493{
    494	if (likely(vma->vm_flags & VM_WRITE))
    495		pmd = pmd_mkwrite(pmd);
    496	return pmd;
    497}
    498
    499#ifdef CONFIG_MEMCG
    500static inline struct deferred_split *get_deferred_split_queue(struct page *page)
    501{
    502	struct mem_cgroup *memcg = page_memcg(compound_head(page));
    503	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
    504
    505	if (memcg)
    506		return &memcg->deferred_split_queue;
    507	else
    508		return &pgdat->deferred_split_queue;
    509}
    510#else
    511static inline struct deferred_split *get_deferred_split_queue(struct page *page)
    512{
    513	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
    514
    515	return &pgdat->deferred_split_queue;
    516}
    517#endif
    518
    519void prep_transhuge_page(struct page *page)
    520{
    521	/*
     522	 * we use page->mapping and page->index in second tail page
    523	 * as list_head: assuming THP order >= 2
    524	 */
    525
    526	INIT_LIST_HEAD(page_deferred_list(page));
    527	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
    528}
    529
    530static inline bool is_transparent_hugepage(struct page *page)
    531{
    532	if (!PageCompound(page))
    533		return false;
    534
    535	page = compound_head(page);
    536	return is_huge_zero_page(page) ||
    537	       page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
    538}
    539
    540static unsigned long __thp_get_unmapped_area(struct file *filp,
    541		unsigned long addr, unsigned long len,
    542		loff_t off, unsigned long flags, unsigned long size)
    543{
    544	loff_t off_end = off + len;
    545	loff_t off_align = round_up(off, size);
    546	unsigned long len_pad, ret;
    547
    548	if (off_end <= off_align || (off_end - off_align) < size)
    549		return 0;
    550
    551	len_pad = len + size;
    552	if (len_pad < len || (off + len_pad) < off)
    553		return 0;
    554
    555	ret = current->mm->get_unmapped_area(filp, addr, len_pad,
    556					      off >> PAGE_SHIFT, flags);
    557
    558	/*
    559	 * The failure might be due to length padding. The caller will retry
    560	 * without the padding.
    561	 */
    562	if (IS_ERR_VALUE(ret))
    563		return 0;
    564
    565	/*
    566	 * Do not try to align to THP boundary if allocation at the address
    567	 * hint succeeds.
    568	 */
    569	if (ret == addr)
    570		return addr;
    571
    572	ret += (off - ret) & (size - 1);
    573	return ret;
    574}
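
/*
 * Worked example of the final alignment step above (a sketch; the numbers
 * are made up, and size is assumed to be PMD_SIZE == 2 MiB). After the
 * bump, ret is congruent to off modulo size, so file offset and virtual
 * address line up within a PMD and the fault path can install PMD mappings.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t size = 2UL << 20;	/* 2 MiB */
	uint64_t off = 0x310000;		/* file offset of the mapping */
	uint64_t ret = 0x7f0000400000;		/* area returned for len + size */

	ret += (off - ret) & (size - 1);
	printf("aligned address: %#llx\n", (unsigned long long)ret);
	printf("same offset within a PMD: %d\n",
	       (int)((ret & (size - 1)) == (off & (size - 1))));
	return 0;
}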
    575
    576unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
    577		unsigned long len, unsigned long pgoff, unsigned long flags)
    578{
    579	unsigned long ret;
    580	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
    581
    582	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
    583	if (ret)
    584		return ret;
    585
    586	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
    587}
    588EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
    589
    590static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
    591			struct page *page, gfp_t gfp)
    592{
    593	struct vm_area_struct *vma = vmf->vma;
    594	pgtable_t pgtable;
    595	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
    596	vm_fault_t ret = 0;
    597
    598	VM_BUG_ON_PAGE(!PageCompound(page), page);
    599
    600	if (mem_cgroup_charge(page_folio(page), vma->vm_mm, gfp)) {
    601		put_page(page);
    602		count_vm_event(THP_FAULT_FALLBACK);
    603		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
    604		return VM_FAULT_FALLBACK;
    605	}
    606	cgroup_throttle_swaprate(page, gfp);
    607
    608	pgtable = pte_alloc_one(vma->vm_mm);
    609	if (unlikely(!pgtable)) {
    610		ret = VM_FAULT_OOM;
    611		goto release;
    612	}
    613
    614	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
    615	/*
    616	 * The memory barrier inside __SetPageUptodate makes sure that
    617	 * clear_huge_page writes become visible before the set_pmd_at()
    618	 * write.
    619	 */
    620	__SetPageUptodate(page);
    621
    622	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
    623	if (unlikely(!pmd_none(*vmf->pmd))) {
    624		goto unlock_release;
    625	} else {
    626		pmd_t entry;
    627
    628		ret = check_stable_address_space(vma->vm_mm);
    629		if (ret)
    630			goto unlock_release;
    631
    632		/* Deliver the page fault to userland */
    633		if (userfaultfd_missing(vma)) {
    634			spin_unlock(vmf->ptl);
    635			put_page(page);
    636			pte_free(vma->vm_mm, pgtable);
    637			ret = handle_userfault(vmf, VM_UFFD_MISSING);
    638			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
    639			return ret;
    640		}
    641
    642		entry = mk_huge_pmd(page, vma->vm_page_prot);
    643		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
    644		page_add_new_anon_rmap(page, vma, haddr);
    645		lru_cache_add_inactive_or_unevictable(page, vma);
    646		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
    647		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
    648		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
    649		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
    650		mm_inc_nr_ptes(vma->vm_mm);
    651		spin_unlock(vmf->ptl);
    652		count_vm_event(THP_FAULT_ALLOC);
    653		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
    654	}
    655
    656	return 0;
    657unlock_release:
    658	spin_unlock(vmf->ptl);
    659release:
    660	if (pgtable)
    661		pte_free(vma->vm_mm, pgtable);
    662	put_page(page);
    663	return ret;
    664
    665}
    666
    667/*
    668 * always: directly stall for all thp allocations
    669 * defer: wake kswapd and fail if not immediately available
    670 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
    671 *		  fail if not immediately available
    672 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
    673 *	    available
    674 * never: never stall for any thp allocation
    675 */
    676gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
    677{
    678	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
    679
    680	/* Always do synchronous compaction */
    681	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
    682		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
    683
    684	/* Kick kcompactd and fail quickly */
    685	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
    686		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
    687
    688	/* Synchronous compaction if madvised, otherwise kick kcompactd */
    689	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
    690		return GFP_TRANSHUGE_LIGHT |
    691			(vma_madvised ? __GFP_DIRECT_RECLAIM :
    692					__GFP_KSWAPD_RECLAIM);
    693
    694	/* Only do synchronous compaction if madvised */
    695	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
    696		return GFP_TRANSHUGE_LIGHT |
    697		       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
    698
    699	return GFP_TRANSHUGE_LIGHT;
    700}
    701
    702/* Caller must hold page table lock. */
    703static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
    704		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
    705		struct page *zero_page)
    706{
    707	pmd_t entry;
    708	if (!pmd_none(*pmd))
    709		return;
    710	entry = mk_pmd(zero_page, vma->vm_page_prot);
    711	entry = pmd_mkhuge(entry);
    712	if (pgtable)
    713		pgtable_trans_huge_deposit(mm, pmd, pgtable);
    714	set_pmd_at(mm, haddr, pmd, entry);
    715	mm_inc_nr_ptes(mm);
    716}
    717
    718vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
    719{
    720	struct vm_area_struct *vma = vmf->vma;
    721	gfp_t gfp;
    722	struct folio *folio;
    723	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
    724
    725	if (!transhuge_vma_suitable(vma, haddr))
    726		return VM_FAULT_FALLBACK;
    727	if (unlikely(anon_vma_prepare(vma)))
    728		return VM_FAULT_OOM;
    729	khugepaged_enter(vma, vma->vm_flags);
    730
    731	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
    732			!mm_forbids_zeropage(vma->vm_mm) &&
    733			transparent_hugepage_use_zero_page()) {
    734		pgtable_t pgtable;
    735		struct page *zero_page;
    736		vm_fault_t ret;
    737		pgtable = pte_alloc_one(vma->vm_mm);
    738		if (unlikely(!pgtable))
    739			return VM_FAULT_OOM;
    740		zero_page = mm_get_huge_zero_page(vma->vm_mm);
    741		if (unlikely(!zero_page)) {
    742			pte_free(vma->vm_mm, pgtable);
    743			count_vm_event(THP_FAULT_FALLBACK);
    744			return VM_FAULT_FALLBACK;
    745		}
    746		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
    747		ret = 0;
    748		if (pmd_none(*vmf->pmd)) {
    749			ret = check_stable_address_space(vma->vm_mm);
    750			if (ret) {
    751				spin_unlock(vmf->ptl);
    752				pte_free(vma->vm_mm, pgtable);
    753			} else if (userfaultfd_missing(vma)) {
    754				spin_unlock(vmf->ptl);
    755				pte_free(vma->vm_mm, pgtable);
    756				ret = handle_userfault(vmf, VM_UFFD_MISSING);
    757				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
    758			} else {
    759				set_huge_zero_page(pgtable, vma->vm_mm, vma,
    760						   haddr, vmf->pmd, zero_page);
    761				update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
    762				spin_unlock(vmf->ptl);
    763			}
    764		} else {
    765			spin_unlock(vmf->ptl);
    766			pte_free(vma->vm_mm, pgtable);
    767		}
    768		return ret;
    769	}
    770	gfp = vma_thp_gfp_mask(vma);
    771	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
    772	if (unlikely(!folio)) {
    773		count_vm_event(THP_FAULT_FALLBACK);
    774		return VM_FAULT_FALLBACK;
    775	}
    776	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
    777}
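
/*
 * Userspace sketch of the path that reaches this handler: an anonymous,
 * PMD-aligned, MADV_HUGEPAGE'd region written for the first time (assumes
 * x86-64 with a 2 MiB PMD size and THP set to "always" or "madvise").
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define PMD_SZ (2UL << 20)

int main(void)
{
	/* Over-allocate so a PMD-aligned, PMD-sized window always exists. */
	size_t len = 2 * PMD_SZ;
	char *raw, *aligned;

	raw = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED)
		return 1;
	aligned = (char *)(((unsigned long)raw + PMD_SZ - 1) & ~(PMD_SZ - 1));
	madvise(aligned, PMD_SZ, MADV_HUGEPAGE);
	memset(aligned, 0x5a, PMD_SZ);	/* first write faults in a huge page */
	printf("touched %lu bytes at %p; check AnonHugePages in /proc/self/smaps\n",
	       PMD_SZ, (void *)aligned);
	munmap(raw, len);
	return 0;
}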
    778
    779static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
    780		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
    781		pgtable_t pgtable)
    782{
    783	struct mm_struct *mm = vma->vm_mm;
    784	pmd_t entry;
    785	spinlock_t *ptl;
    786
    787	ptl = pmd_lock(mm, pmd);
    788	if (!pmd_none(*pmd)) {
    789		if (write) {
    790			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
    791				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
    792				goto out_unlock;
    793			}
    794			entry = pmd_mkyoung(*pmd);
    795			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
    796			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
    797				update_mmu_cache_pmd(vma, addr, pmd);
    798		}
    799
    800		goto out_unlock;
    801	}
    802
    803	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
    804	if (pfn_t_devmap(pfn))
    805		entry = pmd_mkdevmap(entry);
    806	if (write) {
    807		entry = pmd_mkyoung(pmd_mkdirty(entry));
    808		entry = maybe_pmd_mkwrite(entry, vma);
    809	}
    810
    811	if (pgtable) {
    812		pgtable_trans_huge_deposit(mm, pmd, pgtable);
    813		mm_inc_nr_ptes(mm);
    814		pgtable = NULL;
    815	}
    816
    817	set_pmd_at(mm, addr, pmd, entry);
    818	update_mmu_cache_pmd(vma, addr, pmd);
    819
    820out_unlock:
    821	spin_unlock(ptl);
    822	if (pgtable)
    823		pte_free(mm, pgtable);
    824}
    825
    826/**
    827 * vmf_insert_pfn_pmd_prot - insert a pmd size pfn
    828 * @vmf: Structure describing the fault
    829 * @pfn: pfn to insert
    830 * @pgprot: page protection to use
    831 * @write: whether it's a write fault
    832 *
    833 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and
    834 * also consult the vmf_insert_mixed_prot() documentation when
    835 * @pgprot != @vmf->vma->vm_page_prot.
    836 *
    837 * Return: vm_fault_t value.
    838 */
    839vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
    840				   pgprot_t pgprot, bool write)
    841{
    842	unsigned long addr = vmf->address & PMD_MASK;
    843	struct vm_area_struct *vma = vmf->vma;
    844	pgtable_t pgtable = NULL;
    845
    846	/*
    847	 * If we had pmd_special, we could avoid all these restrictions,
    848	 * but we need to be consistent with PTEs and architectures that
    849	 * can't support a 'special' bit.
    850	 */
    851	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
    852			!pfn_t_devmap(pfn));
    853	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
    854						(VM_PFNMAP|VM_MIXEDMAP));
    855	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
    856
    857	if (addr < vma->vm_start || addr >= vma->vm_end)
    858		return VM_FAULT_SIGBUS;
    859
    860	if (arch_needs_pgtable_deposit()) {
    861		pgtable = pte_alloc_one(vma->vm_mm);
    862		if (!pgtable)
    863			return VM_FAULT_OOM;
    864	}
    865
    866	track_pfn_insert(vma, &pgprot, pfn);
    867
    868	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
    869	return VM_FAULT_NOPAGE;
    870}
    871EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot);
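
/*
 * Kernel-side sketch (hypothetical driver; the exampledev_* names are
 * illustrative assumptions, not a real API): a DAX-like driver would
 * typically reach the helper above from its vm_operations_struct
 * ->huge_fault handler, mapping a PMD-sized, PMD-aligned chunk of device
 * memory in one go and falling back otherwise.
 */
static vm_fault_t exampledev_huge_fault(struct vm_fault *vmf,
					enum page_entry_size pe_size)
{
	unsigned long pmd_off;
	phys_addr_t phys;
	pfn_t pfn;

	if (pe_size != PE_SIZE_PMD)
		return VM_FAULT_FALLBACK;

	/* Assumes vm_pgoff == 0; exampledev_base_phys() is hypothetical. */
	pmd_off = (vmf->address & PMD_MASK) - vmf->vma->vm_start;
	phys = exampledev_base_phys(vmf->vma) + pmd_off;
	pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);

	return vmf_insert_pfn_pmd_prot(vmf, pfn, vmf->vma->vm_page_prot,
				       vmf->flags & FAULT_FLAG_WRITE);
}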
    872
    873#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
    874static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
    875{
    876	if (likely(vma->vm_flags & VM_WRITE))
    877		pud = pud_mkwrite(pud);
    878	return pud;
    879}
    880
    881static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
    882		pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
    883{
    884	struct mm_struct *mm = vma->vm_mm;
    885	pud_t entry;
    886	spinlock_t *ptl;
    887
    888	ptl = pud_lock(mm, pud);
    889	if (!pud_none(*pud)) {
    890		if (write) {
    891			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
    892				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
    893				goto out_unlock;
    894			}
    895			entry = pud_mkyoung(*pud);
    896			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
    897			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
    898				update_mmu_cache_pud(vma, addr, pud);
    899		}
    900		goto out_unlock;
    901	}
    902
    903	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
    904	if (pfn_t_devmap(pfn))
    905		entry = pud_mkdevmap(entry);
    906	if (write) {
    907		entry = pud_mkyoung(pud_mkdirty(entry));
    908		entry = maybe_pud_mkwrite(entry, vma);
    909	}
    910	set_pud_at(mm, addr, pud, entry);
    911	update_mmu_cache_pud(vma, addr, pud);
    912
    913out_unlock:
    914	spin_unlock(ptl);
    915}
    916
    917/**
    918 * vmf_insert_pfn_pud_prot - insert a pud size pfn
    919 * @vmf: Structure describing the fault
    920 * @pfn: pfn to insert
    921 * @pgprot: page protection to use
    922 * @write: whether it's a write fault
    923 *
    924 * Insert a pud size pfn. See vmf_insert_pfn() for additional info and
    925 * also consult the vmf_insert_mixed_prot() documentation when
    926 * @pgprot != @vmf->vma->vm_page_prot.
    927 *
    928 * Return: vm_fault_t value.
    929 */
    930vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
    931				   pgprot_t pgprot, bool write)
    932{
    933	unsigned long addr = vmf->address & PUD_MASK;
    934	struct vm_area_struct *vma = vmf->vma;
    935
    936	/*
    937	 * If we had pud_special, we could avoid all these restrictions,
    938	 * but we need to be consistent with PTEs and architectures that
    939	 * can't support a 'special' bit.
    940	 */
    941	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
    942			!pfn_t_devmap(pfn));
    943	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
    944						(VM_PFNMAP|VM_MIXEDMAP));
    945	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
    946
    947	if (addr < vma->vm_start || addr >= vma->vm_end)
    948		return VM_FAULT_SIGBUS;
    949
    950	track_pfn_insert(vma, &pgprot, pfn);
    951
    952	insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
    953	return VM_FAULT_NOPAGE;
    954}
    955EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
    956#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
    957
    958static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
    959		pmd_t *pmd, int flags)
    960{
    961	pmd_t _pmd;
    962
    963	_pmd = pmd_mkyoung(*pmd);
    964	if (flags & FOLL_WRITE)
    965		_pmd = pmd_mkdirty(_pmd);
    966	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
    967				pmd, _pmd, flags & FOLL_WRITE))
    968		update_mmu_cache_pmd(vma, addr, pmd);
    969}
    970
    971struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
    972		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
    973{
    974	unsigned long pfn = pmd_pfn(*pmd);
    975	struct mm_struct *mm = vma->vm_mm;
    976	struct page *page;
    977
    978	assert_spin_locked(pmd_lockptr(mm, pmd));
    979
    980	/*
    981	 * When we COW a devmap PMD entry, we split it into PTEs, so we should
    982	 * not be in this function with `flags & FOLL_COW` set.
    983	 */
    984	WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
    985
    986	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
    987	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
    988			 (FOLL_PIN | FOLL_GET)))
    989		return NULL;
    990
    991	if (flags & FOLL_WRITE && !pmd_write(*pmd))
    992		return NULL;
    993
    994	if (pmd_present(*pmd) && pmd_devmap(*pmd))
    995		/* pass */;
    996	else
    997		return NULL;
    998
    999	if (flags & FOLL_TOUCH)
   1000		touch_pmd(vma, addr, pmd, flags);
   1001
   1002	/*
   1003	 * device mapped pages can only be returned if the
   1004	 * caller will manage the page reference count.
   1005	 */
   1006	if (!(flags & (FOLL_GET | FOLL_PIN)))
   1007		return ERR_PTR(-EEXIST);
   1008
   1009	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
   1010	*pgmap = get_dev_pagemap(pfn, *pgmap);
   1011	if (!*pgmap)
   1012		return ERR_PTR(-EFAULT);
   1013	page = pfn_to_page(pfn);
   1014	if (!try_grab_page(page, flags))
   1015		page = ERR_PTR(-ENOMEM);
   1016
   1017	return page;
   1018}
   1019
   1020int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
   1021		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
   1022		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
   1023{
   1024	spinlock_t *dst_ptl, *src_ptl;
   1025	struct page *src_page;
   1026	pmd_t pmd;
   1027	pgtable_t pgtable = NULL;
   1028	int ret = -ENOMEM;
   1029
    1030	/* Skip if it can be refilled on fault */
   1031	if (!vma_is_anonymous(dst_vma))
   1032		return 0;
   1033
   1034	pgtable = pte_alloc_one(dst_mm);
   1035	if (unlikely(!pgtable))
   1036		goto out;
   1037
   1038	dst_ptl = pmd_lock(dst_mm, dst_pmd);
   1039	src_ptl = pmd_lockptr(src_mm, src_pmd);
   1040	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
   1041
   1042	ret = -EAGAIN;
   1043	pmd = *src_pmd;
   1044
   1045#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
   1046	if (unlikely(is_swap_pmd(pmd))) {
   1047		swp_entry_t entry = pmd_to_swp_entry(pmd);
   1048
   1049		VM_BUG_ON(!is_pmd_migration_entry(pmd));
   1050		if (!is_readable_migration_entry(entry)) {
   1051			entry = make_readable_migration_entry(
   1052							swp_offset(entry));
   1053			pmd = swp_entry_to_pmd(entry);
   1054			if (pmd_swp_soft_dirty(*src_pmd))
   1055				pmd = pmd_swp_mksoft_dirty(pmd);
   1056			if (pmd_swp_uffd_wp(*src_pmd))
   1057				pmd = pmd_swp_mkuffd_wp(pmd);
   1058			set_pmd_at(src_mm, addr, src_pmd, pmd);
   1059		}
   1060		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
   1061		mm_inc_nr_ptes(dst_mm);
   1062		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
   1063		if (!userfaultfd_wp(dst_vma))
   1064			pmd = pmd_swp_clear_uffd_wp(pmd);
   1065		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
   1066		ret = 0;
   1067		goto out_unlock;
   1068	}
   1069#endif
   1070
   1071	if (unlikely(!pmd_trans_huge(pmd))) {
   1072		pte_free(dst_mm, pgtable);
   1073		goto out_unlock;
   1074	}
   1075	/*
   1076	 * When page table lock is held, the huge zero pmd should not be
   1077	 * under splitting since we don't split the page itself, only pmd to
   1078	 * a page table.
   1079	 */
   1080	if (is_huge_zero_pmd(pmd)) {
   1081		/*
   1082		 * get_huge_zero_page() will never allocate a new page here,
   1083		 * since we already have a zero page to copy. It just takes a
   1084		 * reference.
   1085		 */
   1086		mm_get_huge_zero_page(dst_mm);
   1087		goto out_zero_page;
   1088	}
   1089
   1090	src_page = pmd_page(pmd);
   1091	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
   1092
   1093	get_page(src_page);
   1094	if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) {
    1095		/* Page may be pinned: split and retry the fault on PTEs. */
   1096		put_page(src_page);
   1097		pte_free(dst_mm, pgtable);
   1098		spin_unlock(src_ptl);
   1099		spin_unlock(dst_ptl);
   1100		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
   1101		return -EAGAIN;
   1102	}
   1103	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
   1104out_zero_page:
   1105	mm_inc_nr_ptes(dst_mm);
   1106	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
   1107	pmdp_set_wrprotect(src_mm, addr, src_pmd);
   1108	if (!userfaultfd_wp(dst_vma))
   1109		pmd = pmd_clear_uffd_wp(pmd);
   1110	pmd = pmd_mkold(pmd_wrprotect(pmd));
   1111	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
   1112
   1113	ret = 0;
   1114out_unlock:
   1115	spin_unlock(src_ptl);
   1116	spin_unlock(dst_ptl);
   1117out:
   1118	return ret;
   1119}
   1120
   1121#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
   1122static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
   1123		pud_t *pud, int flags)
   1124{
   1125	pud_t _pud;
   1126
   1127	_pud = pud_mkyoung(*pud);
   1128	if (flags & FOLL_WRITE)
   1129		_pud = pud_mkdirty(_pud);
   1130	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
   1131				pud, _pud, flags & FOLL_WRITE))
   1132		update_mmu_cache_pud(vma, addr, pud);
   1133}
   1134
   1135struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
   1136		pud_t *pud, int flags, struct dev_pagemap **pgmap)
   1137{
   1138	unsigned long pfn = pud_pfn(*pud);
   1139	struct mm_struct *mm = vma->vm_mm;
   1140	struct page *page;
   1141
   1142	assert_spin_locked(pud_lockptr(mm, pud));
   1143
   1144	if (flags & FOLL_WRITE && !pud_write(*pud))
   1145		return NULL;
   1146
   1147	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
   1148	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
   1149			 (FOLL_PIN | FOLL_GET)))
   1150		return NULL;
   1151
   1152	if (pud_present(*pud) && pud_devmap(*pud))
   1153		/* pass */;
   1154	else
   1155		return NULL;
   1156
   1157	if (flags & FOLL_TOUCH)
   1158		touch_pud(vma, addr, pud, flags);
   1159
   1160	/*
   1161	 * device mapped pages can only be returned if the
   1162	 * caller will manage the page reference count.
   1163	 *
   1164	 * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
   1165	 */
   1166	if (!(flags & (FOLL_GET | FOLL_PIN)))
   1167		return ERR_PTR(-EEXIST);
   1168
   1169	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
   1170	*pgmap = get_dev_pagemap(pfn, *pgmap);
   1171	if (!*pgmap)
   1172		return ERR_PTR(-EFAULT);
   1173	page = pfn_to_page(pfn);
   1174	if (!try_grab_page(page, flags))
   1175		page = ERR_PTR(-ENOMEM);
   1176
   1177	return page;
   1178}
   1179
   1180int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
   1181		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
   1182		  struct vm_area_struct *vma)
   1183{
   1184	spinlock_t *dst_ptl, *src_ptl;
   1185	pud_t pud;
   1186	int ret;
   1187
   1188	dst_ptl = pud_lock(dst_mm, dst_pud);
   1189	src_ptl = pud_lockptr(src_mm, src_pud);
   1190	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
   1191
   1192	ret = -EAGAIN;
   1193	pud = *src_pud;
   1194	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
   1195		goto out_unlock;
   1196
   1197	/*
   1198	 * When page table lock is held, the huge zero pud should not be
   1199	 * under splitting since we don't split the page itself, only pud to
   1200	 * a page table.
   1201	 */
   1202	if (is_huge_zero_pud(pud)) {
   1203		/* No huge zero pud yet */
   1204	}
   1205
   1206	/*
   1207	 * TODO: once we support anonymous pages, use page_try_dup_anon_rmap()
   1208	 * and split if duplicating fails.
   1209	 */
   1210	pudp_set_wrprotect(src_mm, addr, src_pud);
   1211	pud = pud_mkold(pud_wrprotect(pud));
   1212	set_pud_at(dst_mm, addr, dst_pud, pud);
   1213
   1214	ret = 0;
   1215out_unlock:
   1216	spin_unlock(src_ptl);
   1217	spin_unlock(dst_ptl);
   1218	return ret;
   1219}
   1220
   1221void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
   1222{
   1223	pud_t entry;
   1224	unsigned long haddr;
   1225	bool write = vmf->flags & FAULT_FLAG_WRITE;
   1226
   1227	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
   1228	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
   1229		goto unlock;
   1230
   1231	entry = pud_mkyoung(orig_pud);
   1232	if (write)
   1233		entry = pud_mkdirty(entry);
   1234	haddr = vmf->address & HPAGE_PUD_MASK;
   1235	if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
   1236		update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
   1237
   1238unlock:
   1239	spin_unlock(vmf->ptl);
   1240}
   1241#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
   1242
   1243void huge_pmd_set_accessed(struct vm_fault *vmf)
   1244{
   1245	pmd_t entry;
   1246	unsigned long haddr;
   1247	bool write = vmf->flags & FAULT_FLAG_WRITE;
   1248	pmd_t orig_pmd = vmf->orig_pmd;
   1249
   1250	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
   1251	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
   1252		goto unlock;
   1253
   1254	entry = pmd_mkyoung(orig_pmd);
   1255	if (write)
   1256		entry = pmd_mkdirty(entry);
   1257	haddr = vmf->address & HPAGE_PMD_MASK;
   1258	if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
   1259		update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
   1260
   1261unlock:
   1262	spin_unlock(vmf->ptl);
   1263}
   1264
   1265vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
   1266{
   1267	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
   1268	struct vm_area_struct *vma = vmf->vma;
   1269	struct page *page;
   1270	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
   1271	pmd_t orig_pmd = vmf->orig_pmd;
   1272
   1273	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
   1274	VM_BUG_ON_VMA(!vma->anon_vma, vma);
   1275
   1276	VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
   1277	VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
   1278
   1279	if (is_huge_zero_pmd(orig_pmd))
   1280		goto fallback;
   1281
   1282	spin_lock(vmf->ptl);
   1283
   1284	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
   1285		spin_unlock(vmf->ptl);
   1286		return 0;
   1287	}
   1288
   1289	page = pmd_page(orig_pmd);
   1290	VM_BUG_ON_PAGE(!PageHead(page), page);
   1291
   1292	/* Early check when only holding the PT lock. */
   1293	if (PageAnonExclusive(page))
   1294		goto reuse;
   1295
   1296	if (!trylock_page(page)) {
   1297		get_page(page);
   1298		spin_unlock(vmf->ptl);
   1299		lock_page(page);
   1300		spin_lock(vmf->ptl);
   1301		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
   1302			spin_unlock(vmf->ptl);
   1303			unlock_page(page);
   1304			put_page(page);
   1305			return 0;
   1306		}
   1307		put_page(page);
   1308	}
   1309
   1310	/* Recheck after temporarily dropping the PT lock. */
   1311	if (PageAnonExclusive(page)) {
   1312		unlock_page(page);
   1313		goto reuse;
   1314	}
   1315
   1316	/*
   1317	 * See do_wp_page(): we can only reuse the page exclusively if there are
   1318	 * no additional references. Note that we always drain the LRU
   1319	 * pagevecs immediately after adding a THP.
   1320	 */
   1321	if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page))
   1322		goto unlock_fallback;
   1323	if (PageSwapCache(page))
   1324		try_to_free_swap(page);
   1325	if (page_count(page) == 1) {
   1326		pmd_t entry;
   1327
   1328		page_move_anon_rmap(page, vma);
   1329		unlock_page(page);
   1330reuse:
   1331		if (unlikely(unshare)) {
   1332			spin_unlock(vmf->ptl);
   1333			return 0;
   1334		}
   1335		entry = pmd_mkyoung(orig_pmd);
   1336		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
   1337		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
   1338			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
   1339		spin_unlock(vmf->ptl);
   1340		return VM_FAULT_WRITE;
   1341	}
   1342
   1343unlock_fallback:
   1344	unlock_page(page);
   1345	spin_unlock(vmf->ptl);
   1346fallback:
   1347	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
   1348	return VM_FAULT_FALLBACK;
   1349}
   1350
   1351/*
   1352 * FOLL_FORCE can write to even unwritable pmd's, but only
   1353 * after we've gone through a COW cycle and they are dirty.
   1354 */
   1355static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
   1356{
   1357	return pmd_write(pmd) ||
   1358	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
   1359}
   1360
   1361struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
   1362				   unsigned long addr,
   1363				   pmd_t *pmd,
   1364				   unsigned int flags)
   1365{
   1366	struct mm_struct *mm = vma->vm_mm;
   1367	struct page *page = NULL;
   1368
   1369	assert_spin_locked(pmd_lockptr(mm, pmd));
   1370
   1371	if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
   1372		goto out;
   1373
   1374	/* Avoid dumping huge zero page */
   1375	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
   1376		return ERR_PTR(-EFAULT);
   1377
   1378	/* Full NUMA hinting faults to serialise migration in fault paths */
   1379	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
   1380		goto out;
   1381
   1382	page = pmd_page(*pmd);
   1383	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
   1384
   1385	if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
   1386		return ERR_PTR(-EMLINK);
   1387
   1388	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
   1389			!PageAnonExclusive(page), page);
   1390
   1391	if (!try_grab_page(page, flags))
   1392		return ERR_PTR(-ENOMEM);
   1393
   1394	if (flags & FOLL_TOUCH)
   1395		touch_pmd(vma, addr, pmd, flags);
   1396
   1397	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
   1398	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
   1399
   1400out:
   1401	return page;
   1402}
   1403
   1404/* NUMA hinting page fault entry point for trans huge pmds */
   1405vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
   1406{
   1407	struct vm_area_struct *vma = vmf->vma;
   1408	pmd_t oldpmd = vmf->orig_pmd;
   1409	pmd_t pmd;
   1410	struct page *page;
   1411	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
   1412	int page_nid = NUMA_NO_NODE;
   1413	int target_nid, last_cpupid = -1;
   1414	bool migrated = false;
   1415	bool was_writable = pmd_savedwrite(oldpmd);
   1416	int flags = 0;
   1417
   1418	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
   1419	if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
   1420		spin_unlock(vmf->ptl);
   1421		goto out;
   1422	}
   1423
   1424	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
   1425	page = vm_normal_page_pmd(vma, haddr, pmd);
   1426	if (!page)
   1427		goto out_map;
   1428
   1429	/* See similar comment in do_numa_page for explanation */
   1430	if (!was_writable)
   1431		flags |= TNF_NO_GROUP;
   1432
   1433	page_nid = page_to_nid(page);
   1434	last_cpupid = page_cpupid_last(page);
   1435	target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
   1436				       &flags);
   1437
   1438	if (target_nid == NUMA_NO_NODE) {
   1439		put_page(page);
   1440		goto out_map;
   1441	}
   1442
   1443	spin_unlock(vmf->ptl);
   1444
   1445	migrated = migrate_misplaced_page(page, vma, target_nid);
   1446	if (migrated) {
   1447		flags |= TNF_MIGRATED;
   1448		page_nid = target_nid;
   1449	} else {
   1450		flags |= TNF_MIGRATE_FAIL;
   1451		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
   1452		if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
   1453			spin_unlock(vmf->ptl);
   1454			goto out;
   1455		}
   1456		goto out_map;
   1457	}
   1458
   1459out:
   1460	if (page_nid != NUMA_NO_NODE)
   1461		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
   1462				flags);
   1463
   1464	return 0;
   1465
   1466out_map:
   1467	/* Restore the PMD */
   1468	pmd = pmd_modify(oldpmd, vma->vm_page_prot);
   1469	pmd = pmd_mkyoung(pmd);
   1470	if (was_writable)
   1471		pmd = pmd_mkwrite(pmd);
   1472	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
   1473	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
   1474	spin_unlock(vmf->ptl);
   1475	goto out;
   1476}
   1477
   1478/*
   1479 * Return true if we do MADV_FREE successfully on entire pmd page.
   1480 * Otherwise, return false.
   1481 */
   1482bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
   1483		pmd_t *pmd, unsigned long addr, unsigned long next)
   1484{
   1485	spinlock_t *ptl;
   1486	pmd_t orig_pmd;
   1487	struct page *page;
   1488	struct mm_struct *mm = tlb->mm;
   1489	bool ret = false;
   1490
   1491	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
   1492
   1493	ptl = pmd_trans_huge_lock(pmd, vma);
   1494	if (!ptl)
   1495		goto out_unlocked;
   1496
   1497	orig_pmd = *pmd;
   1498	if (is_huge_zero_pmd(orig_pmd))
   1499		goto out;
   1500
   1501	if (unlikely(!pmd_present(orig_pmd))) {
   1502		VM_BUG_ON(thp_migration_supported() &&
   1503				  !is_pmd_migration_entry(orig_pmd));
   1504		goto out;
   1505	}
   1506
   1507	page = pmd_page(orig_pmd);
   1508	/*
    1509	 * If other processes are mapping this page, we can't discard
    1510	 * the page unless they all do MADV_FREE, so let's skip the page.
   1511	 */
   1512	if (total_mapcount(page) != 1)
   1513		goto out;
   1514
   1515	if (!trylock_page(page))
   1516		goto out;
   1517
   1518	/*
    1519	 * If the user wants to discard only part of the THP, split it so
    1520	 * MADV_FREE will deactivate just those pages.
   1521	 */
   1522	if (next - addr != HPAGE_PMD_SIZE) {
   1523		get_page(page);
   1524		spin_unlock(ptl);
   1525		split_huge_page(page);
   1526		unlock_page(page);
   1527		put_page(page);
   1528		goto out_unlocked;
   1529	}
   1530
   1531	if (PageDirty(page))
   1532		ClearPageDirty(page);
   1533	unlock_page(page);
   1534
   1535	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
   1536		pmdp_invalidate(vma, addr, pmd);
   1537		orig_pmd = pmd_mkold(orig_pmd);
   1538		orig_pmd = pmd_mkclean(orig_pmd);
   1539
   1540		set_pmd_at(mm, addr, pmd, orig_pmd);
   1541		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
   1542	}
   1543
   1544	mark_page_lazyfree(page);
   1545	ret = true;
   1546out:
   1547	spin_unlock(ptl);
   1548out_unlocked:
   1549	return ret;
   1550}
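
/*
 * Userspace sketch of the fast path above: MADV_FREE over one fully mapped,
 * PMD-aligned 2 MiB range lets the kernel lazily reclaim the whole THP
 * without splitting it (assumes x86-64 and an anonymous THP as in the
 * earlier mmap example).
 */
#define _GNU_SOURCE
#include <string.h>
#include <sys/mman.h>

#define PMD_SZ (2UL << 20)

int main(void)
{
	char *buf, *aligned;

	buf = mmap(NULL, 2 * PMD_SZ, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;
	aligned = (char *)(((unsigned long)buf + PMD_SZ - 1) & ~(PMD_SZ - 1));
	madvise(aligned, PMD_SZ, MADV_HUGEPAGE);
	memset(aligned, 1, PMD_SZ);		/* populate the huge page */
	/* Whole-PMD range: madvise_free_huge_pmd() handles it without a split. */
	madvise(aligned, PMD_SZ, MADV_FREE);
	munmap(buf, 2 * PMD_SZ);
	return 0;
}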
   1551
   1552static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
   1553{
   1554	pgtable_t pgtable;
   1555
   1556	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
   1557	pte_free(mm, pgtable);
   1558	mm_dec_nr_ptes(mm);
   1559}
   1560
   1561int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
   1562		 pmd_t *pmd, unsigned long addr)
   1563{
   1564	pmd_t orig_pmd;
   1565	spinlock_t *ptl;
   1566
   1567	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
   1568
   1569	ptl = __pmd_trans_huge_lock(pmd, vma);
   1570	if (!ptl)
   1571		return 0;
   1572	/*
    1573	 * For architectures like ppc64 we look at the deposited pgtable
    1574	 * when calling pmdp_huge_get_and_clear. So do the
    1575	 * pgtable_trans_huge_withdraw after finishing the pmdp-related
    1576	 * operations.
   1577	 */
   1578	orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
   1579						tlb->fullmm);
   1580	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
   1581	if (vma_is_special_huge(vma)) {
   1582		if (arch_needs_pgtable_deposit())
   1583			zap_deposited_table(tlb->mm, pmd);
   1584		spin_unlock(ptl);
   1585	} else if (is_huge_zero_pmd(orig_pmd)) {
   1586		zap_deposited_table(tlb->mm, pmd);
   1587		spin_unlock(ptl);
   1588	} else {
   1589		struct page *page = NULL;
   1590		int flush_needed = 1;
   1591
   1592		if (pmd_present(orig_pmd)) {
   1593			page = pmd_page(orig_pmd);
   1594			page_remove_rmap(page, vma, true);
   1595			VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
   1596			VM_BUG_ON_PAGE(!PageHead(page), page);
   1597		} else if (thp_migration_supported()) {
   1598			swp_entry_t entry;
   1599
   1600			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
   1601			entry = pmd_to_swp_entry(orig_pmd);
   1602			page = pfn_swap_entry_to_page(entry);
   1603			flush_needed = 0;
   1604		} else
   1605			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
   1606
   1607		if (PageAnon(page)) {
   1608			zap_deposited_table(tlb->mm, pmd);
   1609			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
   1610		} else {
   1611			if (arch_needs_pgtable_deposit())
   1612				zap_deposited_table(tlb->mm, pmd);
   1613			add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
   1614		}
   1615
   1616		spin_unlock(ptl);
   1617		if (flush_needed)
   1618			tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
   1619	}
   1620	return 1;
   1621}
   1622
   1623#ifndef pmd_move_must_withdraw
   1624static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
   1625					 spinlock_t *old_pmd_ptl,
   1626					 struct vm_area_struct *vma)
   1627{
   1628	/*
    1629	 * With the split pmd lock we also need to move the preallocated
    1630	 * PTE page table if new_pmd is on a different PMD page table.
   1631	 *
   1632	 * We also don't deposit and withdraw tables for file pages.
   1633	 */
   1634	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
   1635}
   1636#endif
   1637
   1638static pmd_t move_soft_dirty_pmd(pmd_t pmd)
   1639{
   1640#ifdef CONFIG_MEM_SOFT_DIRTY
   1641	if (unlikely(is_pmd_migration_entry(pmd)))
   1642		pmd = pmd_swp_mksoft_dirty(pmd);
   1643	else if (pmd_present(pmd))
   1644		pmd = pmd_mksoft_dirty(pmd);
   1645#endif
   1646	return pmd;
   1647}
   1648
   1649bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
   1650		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
   1651{
   1652	spinlock_t *old_ptl, *new_ptl;
   1653	pmd_t pmd;
   1654	struct mm_struct *mm = vma->vm_mm;
   1655	bool force_flush = false;
   1656
   1657	/*
   1658	 * The destination pmd shouldn't be established, free_pgtables()
    1659	 * should have released it.
   1660	 */
   1661	if (WARN_ON(!pmd_none(*new_pmd))) {
   1662		VM_BUG_ON(pmd_trans_huge(*new_pmd));
   1663		return false;
   1664	}
   1665
   1666	/*
   1667	 * We don't have to worry about the ordering of src and dst
   1668	 * ptlocks because exclusive mmap_lock prevents deadlock.
   1669	 */
   1670	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
   1671	if (old_ptl) {
   1672		new_ptl = pmd_lockptr(mm, new_pmd);
   1673		if (new_ptl != old_ptl)
   1674			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
   1675		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
   1676		if (pmd_present(pmd))
   1677			force_flush = true;
   1678		VM_BUG_ON(!pmd_none(*new_pmd));
   1679
   1680		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
   1681			pgtable_t pgtable;
   1682			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
   1683			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
   1684		}
   1685		pmd = move_soft_dirty_pmd(pmd);
   1686		set_pmd_at(mm, new_addr, new_pmd, pmd);
   1687		if (force_flush)
   1688			flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
   1689		if (new_ptl != old_ptl)
   1690			spin_unlock(new_ptl);
   1691		spin_unlock(old_ptl);
   1692		return true;
   1693	}
   1694	return false;
   1695}
   1696
   1697/*
   1698 * Returns
   1699 *  - 0 if PMD could not be locked
   1700 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
   1701 *      or if prot_numa but THP migration is not supported
   1702 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
   1703 */
   1704int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
   1705		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
   1706		    unsigned long cp_flags)
   1707{
   1708	struct mm_struct *mm = vma->vm_mm;
   1709	spinlock_t *ptl;
   1710	pmd_t oldpmd, entry;
   1711	bool preserve_write;
   1712	int ret;
   1713	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
   1714	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
   1715	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
   1716
   1717	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
   1718
   1719	if (prot_numa && !thp_migration_supported())
   1720		return 1;
   1721
   1722	ptl = __pmd_trans_huge_lock(pmd, vma);
   1723	if (!ptl)
   1724		return 0;
   1725
   1726	preserve_write = prot_numa && pmd_write(*pmd);
   1727	ret = 1;
   1728
   1729#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
   1730	if (is_swap_pmd(*pmd)) {
   1731		swp_entry_t entry = pmd_to_swp_entry(*pmd);
   1732		struct page *page = pfn_swap_entry_to_page(entry);
   1733
   1734		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
   1735		if (is_writable_migration_entry(entry)) {
   1736			pmd_t newpmd;
   1737			/*
   1738			 * A protection check is difficult so
   1739			 * just be safe and disable write
   1740			 */
   1741			if (PageAnon(page))
   1742				entry = make_readable_exclusive_migration_entry(swp_offset(entry));
   1743			else
   1744				entry = make_readable_migration_entry(swp_offset(entry));
   1745			newpmd = swp_entry_to_pmd(entry);
   1746			if (pmd_swp_soft_dirty(*pmd))
   1747				newpmd = pmd_swp_mksoft_dirty(newpmd);
   1748			if (pmd_swp_uffd_wp(*pmd))
   1749				newpmd = pmd_swp_mkuffd_wp(newpmd);
   1750			set_pmd_at(mm, addr, pmd, newpmd);
   1751		}
   1752		goto unlock;
   1753	}
   1754#endif
   1755
   1756	if (prot_numa) {
   1757		struct page *page;
   1758		/*
   1759		 * Avoid trapping faults against the zero page. The read-only
   1760		 * data is likely to be read-cached on the local CPU and
   1761		 * local/remote hits to the zero page are not interesting.
   1762		 */
   1763		if (is_huge_zero_pmd(*pmd))
   1764			goto unlock;
   1765
   1766		if (pmd_protnone(*pmd))
   1767			goto unlock;
   1768
   1769		page = pmd_page(*pmd);
   1770		/*
   1771		 * Skip scanning top tier node if normal numa
   1772		 * balancing is disabled
   1773		 */
   1774		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
   1775		    node_is_toptier(page_to_nid(page)))
   1776			goto unlock;
   1777	}
   1778	/*
   1779	 * In the prot_numa case we are under mmap_read_lock(mm). It's critical
   1780	 * not to clear the pmd intermittently, to avoid a race with MADV_DONTNEED
   1781	 * which also runs under mmap_read_lock(mm):
   1782	 *
   1783	 *	CPU0:				CPU1:
   1784	 *				change_huge_pmd(prot_numa=1)
   1785	 *				 pmdp_huge_get_and_clear_notify()
   1786	 * madvise_dontneed()
   1787	 *  zap_pmd_range()
   1788	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
   1789	 *   // skip the pmd
   1790	 *				 set_pmd_at();
   1791	 *				 // pmd is re-established
   1792	 *
   1793	 * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
   1794	 * which may break userspace.
   1795	 *
   1796	 * pmdp_invalidate_ad() is required to make sure we don't miss
   1797	 * dirty/young flags set by hardware.
   1798	 */
   1799	oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
   1800
   1801	entry = pmd_modify(oldpmd, newprot);
   1802	if (preserve_write)
   1803		entry = pmd_mk_savedwrite(entry);
   1804	if (uffd_wp) {
   1805		entry = pmd_wrprotect(entry);
   1806		entry = pmd_mkuffd_wp(entry);
   1807	} else if (uffd_wp_resolve) {
   1808		/*
   1809		 * Leave the write bit to be handled by the page fault
   1810		 * handler, so that things like COW can be handled
   1811		 * properly.
   1812		 */
   1813		entry = pmd_clear_uffd_wp(entry);
   1814	}
   1815	ret = HPAGE_PMD_NR;
   1816	set_pmd_at(mm, addr, pmd, entry);
   1817
   1818	if (huge_pmd_needs_flush(oldpmd, entry))
   1819		tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
   1820
   1821	BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
   1822unlock:
   1823	spin_unlock(ptl);
   1824	return ret;
   1825}
   1826
   1827/*
   1828 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
   1829 *
   1830 * Note that if it returns page table lock pointer, this routine returns without
   1831 * unlocking page table lock. So callers must unlock it.
   1832 */
   1833spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
   1834{
   1835	spinlock_t *ptl;
   1836	ptl = pmd_lock(vma->vm_mm, pmd);
   1837	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
   1838			pmd_devmap(*pmd)))
   1839		return ptl;
   1840	spin_unlock(ptl);
   1841	return NULL;
   1842}
   1843
   1844/*
   1845 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
   1846 *
   1847 * Note that if it returns page table lock pointer, this routine returns without
   1848 * unlocking page table lock. So callers must unlock it.
   1849 */
   1850spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
   1851{
   1852	spinlock_t *ptl;
   1853
   1854	ptl = pud_lock(vma->vm_mm, pud);
   1855	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
   1856		return ptl;
   1857	spin_unlock(ptl);
   1858	return NULL;
   1859}
   1860
   1861#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
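       /*
        * Zap a huge pud mapping. Only special huge puds (see
        * vma_is_special_huge()) are supported here; anonymous huge puds do
        * not exist yet.
        */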
   1862int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
   1863		 pud_t *pud, unsigned long addr)
   1864{
   1865	spinlock_t *ptl;
   1866
   1867	ptl = __pud_trans_huge_lock(pud, vma);
   1868	if (!ptl)
   1869		return 0;
   1870	/*
   1871	 * For architectures like ppc64 we look at deposited pgtable
   1872	 * when calling pudp_huge_get_and_clear. So do the
   1873	 * pgtable_trans_huge_withdraw after finishing pudp related
   1874	 * operations.
   1875	 */
   1876	pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
   1877	tlb_remove_pud_tlb_entry(tlb, pud, addr);
   1878	if (vma_is_special_huge(vma)) {
   1879		spin_unlock(ptl);
   1880		/* No zero page support yet */
   1881	} else {
   1882		/* No support for anonymous PUD pages yet */
   1883		BUG();
   1884	}
   1885	return 1;
   1886}
   1887
   1888static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
   1889		unsigned long haddr)
   1890{
   1891	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
   1892	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
   1893	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
   1894	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
   1895
   1896	count_vm_event(THP_SPLIT_PUD);
   1897
   1898	pudp_huge_clear_flush_notify(vma, haddr, pud);
   1899}
   1900
   1901void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
   1902		unsigned long address)
   1903{
   1904	spinlock_t *ptl;
   1905	struct mmu_notifier_range range;
   1906
   1907	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
   1908				address & HPAGE_PUD_MASK,
   1909				(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
   1910	mmu_notifier_invalidate_range_start(&range);
   1911	ptl = pud_lock(vma->vm_mm, pud);
   1912	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
   1913		goto out;
   1914	__split_huge_pud_locked(vma, pud, range.start);
   1915
   1916out:
   1917	spin_unlock(ptl);
   1918	/*
   1919	 * No need to double call mmu_notifier->invalidate_range() callback as
   1920	 * the above pudp_huge_clear_flush_notify() did already call it.
   1921	 */
   1922	mmu_notifier_invalidate_range_only_end(&range);
   1923}
   1924#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
   1925
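       /*
        * Split a pmd mapping of the huge zero page into a page table of
        * ptes that all map the (small) zero page.
        */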
   1926static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
   1927		unsigned long haddr, pmd_t *pmd)
   1928{
   1929	struct mm_struct *mm = vma->vm_mm;
   1930	pgtable_t pgtable;
   1931	pmd_t _pmd;
   1932	int i;
   1933
   1934	/*
   1935	 * Leave the pmd empty until the ptes are filled. Note that it is fine to
   1936	 * delay notification until mmu_notifier_invalidate_range_end(), as we are
   1937	 * replacing a write-protected zero page mapped by a pmd with the same zero
   1938	 * page write protected and mapped by ptes.
   1939	 *
   1940	 * See Documentation/vm/mmu_notifier.rst
   1941	 */
   1942	pmdp_huge_clear_flush(vma, haddr, pmd);
   1943
   1944	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
   1945	pmd_populate(mm, &_pmd, pgtable);
   1946
   1947	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
   1948		pte_t *pte, entry;
   1949		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
   1950		entry = pte_mkspecial(entry);
   1951		pte = pte_offset_map(&_pmd, haddr);
   1952		VM_BUG_ON(!pte_none(*pte));
   1953		set_pte_at(mm, haddr, pte, entry);
   1954		pte_unmap(pte);
   1955	}
   1956	smp_wmb(); /* make pte visible before pmd */
   1957	pmd_populate(mm, pmd, pgtable);
   1958}
   1959
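       /*
        * Split a huge pmd into normal ptes, with the pmd lock held. When
        * "freeze" is set for an anonymous page, migration entries are
        * installed instead of present ptes so the subpages remain
        * unmapped.
        */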
   1960static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
   1961		unsigned long haddr, bool freeze)
   1962{
   1963	struct mm_struct *mm = vma->vm_mm;
   1964	struct page *page;
   1965	pgtable_t pgtable;
   1966	pmd_t old_pmd, _pmd;
   1967	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
   1968	bool anon_exclusive = false;
   1969	unsigned long addr;
   1970	int i;
   1971
   1972	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
   1973	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
   1974	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
   1975	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
   1976				&& !pmd_devmap(*pmd));
   1977
   1978	count_vm_event(THP_SPLIT_PMD);
   1979
   1980	if (!vma_is_anonymous(vma)) {
   1981		old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
   1982		/*
   1983		 * We are going to unmap this huge page. So
   1984		 * just go ahead and zap it
   1985		 */
   1986		if (arch_needs_pgtable_deposit())
   1987			zap_deposited_table(mm, pmd);
   1988		if (vma_is_special_huge(vma))
   1989			return;
   1990		if (unlikely(is_pmd_migration_entry(old_pmd))) {
   1991			swp_entry_t entry;
   1992
   1993			entry = pmd_to_swp_entry(old_pmd);
   1994			page = pfn_swap_entry_to_page(entry);
   1995		} else {
   1996			page = pmd_page(old_pmd);
   1997			if (!PageDirty(page) && pmd_dirty(old_pmd))
   1998				set_page_dirty(page);
   1999			if (!PageReferenced(page) && pmd_young(old_pmd))
   2000				SetPageReferenced(page);
   2001			page_remove_rmap(page, vma, true);
   2002			put_page(page);
   2003		}
   2004		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
   2005		return;
   2006	}
   2007
   2008	if (is_huge_zero_pmd(*pmd)) {
   2009		/*
   2010		 * FIXME: Do we want to invalidate secondary mmu by calling
   2011		 * mmu_notifier_invalidate_range() see comments below inside
   2012		 * __split_huge_pmd() ?
   2013		 *
   2014		 * We are going from a write-protected huge zero page to
   2015		 * write-protected small zero pages, so it does not seem useful
   2016		 * to invalidate the secondary mmu at this time.
   2017		 */
   2018		return __split_huge_zero_page_pmd(vma, haddr, pmd);
   2019	}
   2020
   2021	/*
   2022	 * Up to this point the pmd is present and huge and userland has full
   2023	 * access to the hugepage during the split (which happens in
   2024	 * place). If we overwrite the pmd with the not-huge version pointing
   2025	 * to the pte here (which of course we could if all CPUs were bug
   2026	 * free), userland could trigger a small page size TLB miss on the
   2027	 * small sized TLB while the hugepage TLB entry is still established in
   2028	 * the huge TLB. Some CPUs don't like that.
   2029	 * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
   2030	 * 383 on page 105. Intel should be safe, but it also warns that it's
   2031	 * only safe if the permission and cache attributes of the two entries
   2032	 * loaded in the two TLBs are identical (which should be the case here).
   2033	 * But it is generally safer to never allow small and huge TLB entries
   2034	 * for the same virtual address to be loaded simultaneously. So instead
   2035	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
   2036	 * current pmd notpresent (atomically because here the pmd_trans_huge
   2037	 * must remain set at all times on the pmd until the split is complete
   2038	 * for this pmd), then we flush the SMP TLB and finally we write the
   2039	 * non-huge version of the pmd entry with pmd_populate.
   2040	 */
   2041	old_pmd = pmdp_invalidate(vma, haddr, pmd);
   2042
   2043	pmd_migration = is_pmd_migration_entry(old_pmd);
   2044	if (unlikely(pmd_migration)) {
   2045		swp_entry_t entry;
   2046
   2047		entry = pmd_to_swp_entry(old_pmd);
   2048		page = pfn_swap_entry_to_page(entry);
   2049		write = is_writable_migration_entry(entry);
   2050		if (PageAnon(page))
   2051			anon_exclusive = is_readable_exclusive_migration_entry(entry);
   2052		young = false;
   2053		soft_dirty = pmd_swp_soft_dirty(old_pmd);
   2054		uffd_wp = pmd_swp_uffd_wp(old_pmd);
   2055	} else {
   2056		page = pmd_page(old_pmd);
   2057		if (pmd_dirty(old_pmd))
   2058			SetPageDirty(page);
   2059		write = pmd_write(old_pmd);
   2060		young = pmd_young(old_pmd);
   2061		soft_dirty = pmd_soft_dirty(old_pmd);
   2062		uffd_wp = pmd_uffd_wp(old_pmd);
   2063
   2064		VM_BUG_ON_PAGE(!page_count(page), page);
   2065		page_ref_add(page, HPAGE_PMD_NR - 1);
   2066
   2067		/*
   2068		 * Without "freeze", we'll simply split the PMD, propagating the
   2069		 * PageAnonExclusive() flag for each PTE by setting it for
   2070		 * each subpage -- no need to (temporarily) clear.
   2071		 *
   2072		 * With "freeze" we want to replace mapped pages by
   2073		 * migration entries right away. This is only possible if we
   2074		 * managed to clear PageAnonExclusive() -- see
   2075		 * set_pmd_migration_entry().
   2076		 *
   2077		 * In case we cannot clear PageAnonExclusive(), split the PMD
   2078		 * only and let try_to_migrate_one() fail later.
   2079		 */
   2080		anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
   2081		if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
   2082			freeze = false;
   2083	}
   2084
   2085	/*
   2086	 * Withdraw the table only after we mark the pmd entry invalid.
   2087	 * This is critical for some architectures (Power).
   2088	 */
   2089	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
   2090	pmd_populate(mm, &_pmd, pgtable);
   2091
   2092	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
   2093		pte_t entry, *pte;
   2094		/*
   2095		 * Note that NUMA hinting access restrictions are not
   2096		 * transferred to avoid any possibility of altering
   2097		 * permissions across VMAs.
   2098		 */
   2099		if (freeze || pmd_migration) {
   2100			swp_entry_t swp_entry;
   2101			if (write)
   2102				swp_entry = make_writable_migration_entry(
   2103							page_to_pfn(page + i));
   2104			else if (anon_exclusive)
   2105				swp_entry = make_readable_exclusive_migration_entry(
   2106							page_to_pfn(page + i));
   2107			else
   2108				swp_entry = make_readable_migration_entry(
   2109							page_to_pfn(page + i));
   2110			entry = swp_entry_to_pte(swp_entry);
   2111			if (soft_dirty)
   2112				entry = pte_swp_mksoft_dirty(entry);
   2113			if (uffd_wp)
   2114				entry = pte_swp_mkuffd_wp(entry);
   2115		} else {
   2116			entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
   2117			entry = maybe_mkwrite(entry, vma);
   2118			if (anon_exclusive)
   2119				SetPageAnonExclusive(page + i);
   2120			if (!write)
   2121				entry = pte_wrprotect(entry);
   2122			if (!young)
   2123				entry = pte_mkold(entry);
   2124			if (soft_dirty)
   2125				entry = pte_mksoft_dirty(entry);
   2126			if (uffd_wp)
   2127				entry = pte_mkuffd_wp(entry);
   2128		}
   2129		pte = pte_offset_map(&_pmd, addr);
   2130		BUG_ON(!pte_none(*pte));
   2131		set_pte_at(mm, addr, pte, entry);
   2132		if (!pmd_migration)
   2133			atomic_inc(&page[i]._mapcount);
   2134		pte_unmap(pte);
   2135	}
   2136
   2137	if (!pmd_migration) {
   2138		/*
   2139		 * Set PG_double_map before dropping compound_mapcount to avoid
   2140		 * false-negative page_mapped().
   2141		 */
   2142		if (compound_mapcount(page) > 1 &&
   2143		    !TestSetPageDoubleMap(page)) {
   2144			for (i = 0; i < HPAGE_PMD_NR; i++)
   2145				atomic_inc(&page[i]._mapcount);
   2146		}
   2147
   2148		lock_page_memcg(page);
   2149		if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
   2150			/* Last compound_mapcount is gone. */
   2151			__mod_lruvec_page_state(page, NR_ANON_THPS,
   2152						-HPAGE_PMD_NR);
   2153			if (TestClearPageDoubleMap(page)) {
   2154				/* No need in mapcount reference anymore */
   2155				for (i = 0; i < HPAGE_PMD_NR; i++)
   2156					atomic_dec(&page[i]._mapcount);
   2157			}
   2158		}
   2159		unlock_page_memcg(page);
   2160
   2161		/* Above is effectively page_remove_rmap(page, vma, true) */
   2162		munlock_vma_page(page, vma, true);
   2163	}
   2164
   2165	smp_wmb(); /* make pte visible before pmd */
   2166	pmd_populate(mm, pmd, pgtable);
   2167
   2168	if (freeze) {
   2169		for (i = 0; i < HPAGE_PMD_NR; i++) {
   2170			page_remove_rmap(page + i, vma, false);
   2171			put_page(page + i);
   2172		}
   2173	}
   2174}
   2175
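       /*
        * Lock the pmd and, if it still maps a THP, split it via
        * __split_huge_pmd_locked(). If @folio is given, the split is only
        * done when the pmd actually maps that folio.
        */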
   2176void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
   2177		unsigned long address, bool freeze, struct folio *folio)
   2178{
   2179	spinlock_t *ptl;
   2180	struct mmu_notifier_range range;
   2181
   2182	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
   2183				address & HPAGE_PMD_MASK,
   2184				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
   2185	mmu_notifier_invalidate_range_start(&range);
   2186	ptl = pmd_lock(vma->vm_mm, pmd);
   2187
   2188	/*
   2189	 * If the caller asks to set up a migration entry, we need a folio to check
   2190	 * the pmd against. Otherwise we can end up replacing the wrong folio.
   2191	 */
   2192	VM_BUG_ON(freeze && !folio);
   2193	VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
   2194
   2195	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
   2196	    is_pmd_migration_entry(*pmd)) {
   2197		if (folio && folio != page_folio(pmd_page(*pmd)))
   2198			goto out;
   2199		__split_huge_pmd_locked(vma, pmd, range.start, freeze);
   2200	}
   2201
   2202out:
   2203	spin_unlock(ptl);
   2204	/*
   2205	 * No need to double call mmu_notifier->invalidate_range() callback.
   2206	 * There are 3 cases to consider inside __split_huge_pmd_locked():
   2207	 *  1) pmdp_huge_clear_flush_notify() calls invalidate_range(), obviously.
   2208	 *  2) __split_huge_zero_page_pmd() installs read-only zero page ptes; any
   2209	 *    write fault will trigger a flush_notify before pointing to a new page
   2210	 *    (it is fine if the secondary mmu keeps pointing to the old zero
   2211	 *    page in the meantime).
   2212	 *  3) Split a huge pmd into ptes pointing to the same page. No need
   2213	 *     to invalidate the secondary tlb entries; they are all still valid.
   2214	 *     Any further change to an individual pte will notify. So there is
   2215	 *     no need to call mmu_notifier->invalidate_range().
   2216	 */
   2217	mmu_notifier_invalidate_range_only_end(&range);
   2218}
   2219
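       /*
        * Walk the page tables down to the pmd covering @address and split
        * it if it maps a THP.
        */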
   2220void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
   2221		bool freeze, struct folio *folio)
   2222{
   2223	pgd_t *pgd;
   2224	p4d_t *p4d;
   2225	pud_t *pud;
   2226	pmd_t *pmd;
   2227
   2228	pgd = pgd_offset(vma->vm_mm, address);
   2229	if (!pgd_present(*pgd))
   2230		return;
   2231
   2232	p4d = p4d_offset(pgd, address);
   2233	if (!p4d_present(*p4d))
   2234		return;
   2235
   2236	pud = pud_offset(p4d, address);
   2237	if (!pud_present(*pud))
   2238		return;
   2239
   2240	pmd = pmd_offset(pud, address);
   2241
   2242	__split_huge_pmd(vma, pmd, address, freeze, folio);
   2243}
   2244
   2245static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
   2246{
   2247	/*
   2248	 * If the new address isn't hpage aligned and it could previously
   2249	 * contain a hugepage: check if we need to split a huge pmd.
   2250	 */
   2251	if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
   2252	    range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
   2253			 ALIGN(address, HPAGE_PMD_SIZE)))
   2254		split_huge_pmd_address(vma, address, false, NULL);
   2255}
   2256
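       /*
        * Called when VMA boundaries change: split any huge pmd that would
        * end up straddling the new start or end, or the adjusted start of
        * the next VMA.
        */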
   2257void vma_adjust_trans_huge(struct vm_area_struct *vma,
   2258			     unsigned long start,
   2259			     unsigned long end,
   2260			     long adjust_next)
   2261{
   2262	/* Check if we need to split start first. */
   2263	split_huge_pmd_if_needed(vma, start);
   2264
   2265	/* Check if we need to split end next. */
   2266	split_huge_pmd_if_needed(vma, end);
   2267
   2268	/*
   2269	 * If we're also updating the vma->vm_next->vm_start,
   2270	 * check if we need to split it.
   2271	 */
   2272	if (adjust_next > 0) {
   2273		struct vm_area_struct *next = vma->vm_next;
   2274		unsigned long nstart = next->vm_start;
   2275		nstart += adjust_next;
   2276		split_huge_pmd_if_needed(next, nstart);
   2277	}
   2278}
   2279
   2280static void unmap_page(struct page *page)
   2281{
   2282	struct folio *folio = page_folio(page);
   2283	enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
   2284		TTU_SYNC;
   2285
   2286	VM_BUG_ON_PAGE(!PageHead(page), page);
   2287
   2288	/*
   2289	 * Anon pages need migration entries to preserve them, but file
   2290	 * pages can simply be left unmapped, then faulted back on demand.
   2291	 * If that is ever changed (perhaps for mlock), update remap_page().
   2292	 */
   2293	if (folio_test_anon(folio))
   2294		try_to_migrate(folio, ttu_flags);
   2295	else
   2296		try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
   2297}
   2298
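       /*
        * Undo unmap_page() after a split attempt: remove the migration ptes
        * for the anonymous folios covering the first @nr subpages.
        */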
   2299static void remap_page(struct folio *folio, unsigned long nr)
   2300{
   2301	int i = 0;
   2302
   2303	/* If unmap_page() uses try_to_migrate() on file, remove this check */
   2304	if (!folio_test_anon(folio))
   2305		return;
   2306	for (;;) {
   2307		remove_migration_ptes(folio, folio, true);
   2308		i += folio_nr_pages(folio);
   2309		if (i >= nr)
   2310			break;
   2311		folio = folio_next(folio);
   2312	}
   2313}
   2314
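       /*
        * Queue a freshly split tail page either on the caller's list (when
        * page reclaim is splitting the page) or on the LRU next to its head
        * page; unevictable tails just get their mlock_count reset.
        */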
   2315static void lru_add_page_tail(struct page *head, struct page *tail,
   2316		struct lruvec *lruvec, struct list_head *list)
   2317{
   2318	VM_BUG_ON_PAGE(!PageHead(head), head);
   2319	VM_BUG_ON_PAGE(PageCompound(tail), head);
   2320	VM_BUG_ON_PAGE(PageLRU(tail), head);
   2321	lockdep_assert_held(&lruvec->lru_lock);
   2322
   2323	if (list) {
   2324		/* page reclaim is reclaiming a huge page */
   2325		VM_WARN_ON(PageLRU(head));
   2326		get_page(tail);
   2327		list_add_tail(&tail->lru, list);
   2328	} else {
   2329		/* head is still on lru (and we have it frozen) */
   2330		VM_WARN_ON(!PageLRU(head));
   2331		if (PageUnevictable(tail))
   2332			tail->mlock_count = 0;
   2333		else
   2334			list_add_tail(&tail->lru, &head->lru);
   2335		SetPageLRU(tail);
   2336	}
   2337}
   2338
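       /*
        * Turn tail page number @tail of the compound page @head into an
        * independent small page: copy the relevant flags from the head, set
        * up ->mapping/->index, clear the compound linkage, unfreeze its
        * refcount and add it to the LRU.
        */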
   2339static void __split_huge_page_tail(struct page *head, int tail,
   2340		struct lruvec *lruvec, struct list_head *list)
   2341{
   2342	struct page *page_tail = head + tail;
   2343
   2344	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
   2345
   2346	/*
   2347	 * Clone page flags before unfreezing refcount.
   2348	 *
   2349	 * A flags change might follow a successful get_page_unless_zero(),
   2350	 * for example lock_page(), which sets PG_waiters.
   2351	 *
   2352	 * Note that for mapped sub-pages of an anonymous THP,
   2353	 * PG_anon_exclusive has been cleared in unmap_page() and is stored in
   2354	 * the migration entry instead, from where remap_page() will restore it.
   2355	 * We can still have PG_anon_exclusive set on effectively unmapped and
   2356	 * unreferenced sub-pages of an anonymous THP: we can simply drop
   2357	 * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
   2358	 */
   2359	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
   2360	page_tail->flags |= (head->flags &
   2361			((1L << PG_referenced) |
   2362			 (1L << PG_swapbacked) |
   2363			 (1L << PG_swapcache) |
   2364			 (1L << PG_mlocked) |
   2365			 (1L << PG_uptodate) |
   2366			 (1L << PG_active) |
   2367			 (1L << PG_workingset) |
   2368			 (1L << PG_locked) |
   2369			 (1L << PG_unevictable) |
   2370#ifdef CONFIG_64BIT
   2371			 (1L << PG_arch_2) |
   2372#endif
   2373			 (1L << PG_dirty)));
   2374
   2375	/* ->mapping in first tail page is compound_mapcount */
   2376	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
   2377			page_tail);
   2378	page_tail->mapping = head->mapping;
   2379	page_tail->index = head->index + tail;
   2380	page_tail->private = 0;
   2381
   2382	/* Page flags must be visible before we make the page non-compound. */
   2383	smp_wmb();
   2384
   2385	/*
   2386	 * Clear PageTail before unfreezing page refcount.
   2387	 *
   2388	 * A put_page() might follow a successful get_page_unless_zero(),
   2389	 * and it needs a correct compound_head().
   2390	 */
   2391	clear_compound_head(page_tail);
   2392
   2393	/* Finally unfreeze refcount. Additional reference from page cache. */
   2394	page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
   2395					  PageSwapCache(head)));
   2396
   2397	if (page_is_young(head))
   2398		set_page_young(page_tail);
   2399	if (page_is_idle(head))
   2400		set_page_idle(page_tail);
   2401
   2402	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
   2403
   2404	/*
   2405	 * Always add to the tail, because some iterators expect new
   2406	 * pages to show up after the currently processed elements, e.g.
   2407	 * migrate_pages().
   2408	 */
   2409	lru_add_page_tail(head, page_tail, lruvec, list);
   2410}
   2411
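       /*
        * Do the actual split of a refcount-frozen huge page: split the
        * memcg and page_owner state, convert every tail page, fix up the
        * page cache or swap cache entries, then remap and release the
        * subpages.
        */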
   2412static void __split_huge_page(struct page *page, struct list_head *list,
   2413		pgoff_t end)
   2414{
   2415	struct folio *folio = page_folio(page);
   2416	struct page *head = &folio->page;
   2417	struct lruvec *lruvec;
   2418	struct address_space *swap_cache = NULL;
   2419	unsigned long offset = 0;
   2420	unsigned int nr = thp_nr_pages(head);
   2421	int i;
   2422
   2423	/* complete memcg works before add pages to LRU */
   2424	split_page_memcg(head, nr);
   2425
   2426	if (PageAnon(head) && PageSwapCache(head)) {
   2427		swp_entry_t entry = { .val = page_private(head) };
   2428
   2429		offset = swp_offset(entry);
   2430		swap_cache = swap_address_space(entry);
   2431		xa_lock(&swap_cache->i_pages);
   2432	}
   2433
   2434	/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
   2435	lruvec = folio_lruvec_lock(folio);
   2436
   2437	ClearPageHasHWPoisoned(head);
   2438
   2439	for (i = nr - 1; i >= 1; i--) {
   2440		__split_huge_page_tail(head, i, lruvec, list);
   2441		/* Some pages can be beyond EOF: drop them from page cache */
   2442		if (head[i].index >= end) {
   2443			ClearPageDirty(head + i);
   2444			__delete_from_page_cache(head + i, NULL);
   2445			if (shmem_mapping(head->mapping))
   2446				shmem_uncharge(head->mapping->host, 1);
   2447			put_page(head + i);
   2448		} else if (!PageAnon(page)) {
   2449			__xa_store(&head->mapping->i_pages, head[i].index,
   2450					head + i, 0);
   2451		} else if (swap_cache) {
   2452			__xa_store(&swap_cache->i_pages, offset + i,
   2453					head + i, 0);
   2454		}
   2455	}
   2456
   2457	ClearPageCompound(head);
   2458	unlock_page_lruvec(lruvec);
   2459	/* Caller disabled irqs, so they are still disabled here */
   2460
   2461	split_page_owner(head, nr);
   2462
   2463	/* See comment in __split_huge_page_tail() */
   2464	if (PageAnon(head)) {
   2465		/* Additional pin to swap cache */
   2466		if (PageSwapCache(head)) {
   2467			page_ref_add(head, 2);
   2468			xa_unlock(&swap_cache->i_pages);
   2469		} else {
   2470			page_ref_inc(head);
   2471		}
   2472	} else {
   2473		/* Additional pin to page cache */
   2474		page_ref_add(head, 2);
   2475		xa_unlock(&head->mapping->i_pages);
   2476	}
   2477	local_irq_enable();
   2478
   2479	remap_page(folio, nr);
   2480
   2481	if (PageSwapCache(head)) {
   2482		swp_entry_t entry = { .val = page_private(head) };
   2483
   2484		split_swap_cluster(entry);
   2485	}
   2486
   2487	for (i = 0; i < nr; i++) {
   2488		struct page *subpage = head + i;
   2489		if (subpage == page)
   2490			continue;
   2491		unlock_page(subpage);
   2492
   2493		/*
   2494		 * Subpages may be freed if there wasn't any mapping,
   2495		 * e.g. if add_to_swap() is running on an lru page that
   2496		 * had its mapping zapped. Freeing these pages requires
   2497		 * taking the lru_lock, so we do the put_page of the
   2498		 * tail pages after the split is complete.
   2499		 */
   2500		put_page(subpage);
   2501	}
   2502}
   2503
   2504/* Racy check whether the huge page can be split */
   2505bool can_split_folio(struct folio *folio, int *pextra_pins)
   2506{
   2507	int extra_pins;
   2508
   2509	/* Additional pins from page cache */
   2510	if (folio_test_anon(folio))
   2511		extra_pins = folio_test_swapcache(folio) ?
   2512				folio_nr_pages(folio) : 0;
   2513	else
   2514		extra_pins = folio_nr_pages(folio);
   2515	if (pextra_pins)
   2516		*pextra_pins = extra_pins;
   2517	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
   2518}
   2519
   2520/*
   2521 * This function splits huge page into normal pages. @page can point to any
   2522 * subpage of huge page to split. Split doesn't change the position of @page.
   2523 *
   2524 * The caller must hold a pin on the @page, otherwise split fails with -EBUSY.
   2525 * The huge page must be locked.
   2526 *
   2527 * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
   2528 *
   2529 * Both head page and tail pages will inherit mapping, flags, and so on from
   2530 * the hugepage.
   2531 *
   2532 * The GUP pin and PG_locked are transferred to @page. The rest of the
   2533 * subpages can be freed if they are not mapped.
   2534 *
   2535 * Returns 0 if the hugepage is split successfully.
   2536 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
   2537 * us.
   2538 */
   2539int split_huge_page_to_list(struct page *page, struct list_head *list)
   2540{
   2541	struct folio *folio = page_folio(page);
   2542	struct page *head = &folio->page;
   2543	struct deferred_split *ds_queue = get_deferred_split_queue(head);
   2544	XA_STATE(xas, &head->mapping->i_pages, head->index);
   2545	struct anon_vma *anon_vma = NULL;
   2546	struct address_space *mapping = NULL;
   2547	int extra_pins, ret;
   2548	pgoff_t end;
   2549	bool is_hzp;
   2550
   2551	VM_BUG_ON_PAGE(!PageLocked(head), head);
   2552	VM_BUG_ON_PAGE(!PageCompound(head), head);
   2553
   2554	is_hzp = is_huge_zero_page(head);
   2555	VM_WARN_ON_ONCE_PAGE(is_hzp, head);
   2556	if (is_hzp)
   2557		return -EBUSY;
   2558
   2559	if (PageWriteback(head))
   2560		return -EBUSY;
   2561
   2562	if (PageAnon(head)) {
   2563		/*
   2564		 * The caller does not necessarily hold an mmap_lock that would
   2565		 * prevent the anon_vma disappearing, so we first take a
   2566		 * reference to it and then lock the anon_vma for write. This
   2567		 * is similar to folio_lock_anon_vma_read except the write lock
   2568		 * is taken to serialise against parallel split or collapse
   2569		 * operations.
   2570		 */
   2571		anon_vma = page_get_anon_vma(head);
   2572		if (!anon_vma) {
   2573			ret = -EBUSY;
   2574			goto out;
   2575		}
   2576		end = -1;
   2577		mapping = NULL;
   2578		anon_vma_lock_write(anon_vma);
   2579	} else {
   2580		mapping = head->mapping;
   2581
   2582		/* Truncated ? */
   2583		if (!mapping) {
   2584			ret = -EBUSY;
   2585			goto out;
   2586		}
   2587
   2588		xas_split_alloc(&xas, head, compound_order(head),
   2589				mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
   2590		if (xas_error(&xas)) {
   2591			ret = xas_error(&xas);
   2592			goto out;
   2593		}
   2594
   2595		anon_vma = NULL;
   2596		i_mmap_lock_read(mapping);
   2597
   2598		/*
   2599		 * __split_huge_page() may need to trim off pages beyond EOF:
   2600		 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
   2601		 * which cannot be nested inside the page tree lock. So note
   2602		 * end now: i_size itself may be changed at any moment, but
   2603		 * head page lock is good enough to serialize the trimming.
   2604		 */
   2605		end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
   2606		if (shmem_mapping(mapping))
   2607			end = shmem_fallocend(mapping->host, end);
   2608	}
   2609
   2610	/*
   2611	 * Racy check if we can split the page, before unmap_page() will
   2612	 * split PMDs
   2613	 */
   2614	if (!can_split_folio(folio, &extra_pins)) {
   2615		ret = -EBUSY;
   2616		goto out_unlock;
   2617	}
   2618
   2619	unmap_page(head);
   2620
   2621	/* block interrupt reentry in xa_lock and spinlock */
   2622	local_irq_disable();
   2623	if (mapping) {
   2624		/*
   2625		 * Check if the head page is present in page cache.
   2626		 * We assume all tail pages are present too, if the head is there.
   2627		 */
   2628		xas_lock(&xas);
   2629		xas_reset(&xas);
   2630		if (xas_load(&xas) != head)
   2631			goto fail;
   2632	}
   2633
   2634	/* Prevent deferred_split_scan() touching ->_refcount */
   2635	spin_lock(&ds_queue->split_queue_lock);
   2636	if (page_ref_freeze(head, 1 + extra_pins)) {
   2637		if (!list_empty(page_deferred_list(head))) {
   2638			ds_queue->split_queue_len--;
   2639			list_del(page_deferred_list(head));
   2640		}
   2641		spin_unlock(&ds_queue->split_queue_lock);
   2642		if (mapping) {
   2643			int nr = thp_nr_pages(head);
   2644
   2645			xas_split(&xas, head, thp_order(head));
   2646			if (PageSwapBacked(head)) {
   2647				__mod_lruvec_page_state(head, NR_SHMEM_THPS,
   2648							-nr);
   2649			} else {
   2650				__mod_lruvec_page_state(head, NR_FILE_THPS,
   2651							-nr);
   2652				filemap_nr_thps_dec(mapping);
   2653			}
   2654		}
   2655
   2656		__split_huge_page(page, list, end);
   2657		ret = 0;
   2658	} else {
   2659		spin_unlock(&ds_queue->split_queue_lock);
   2660fail:
   2661		if (mapping)
   2662			xas_unlock(&xas);
   2663		local_irq_enable();
   2664		remap_page(folio, folio_nr_pages(folio));
   2665		ret = -EBUSY;
   2666	}
   2667
   2668out_unlock:
   2669	if (anon_vma) {
   2670		anon_vma_unlock_write(anon_vma);
   2671		put_anon_vma(anon_vma);
   2672	}
   2673	if (mapping)
   2674		i_mmap_unlock_read(mapping);
   2675out:
   2676	xas_destroy(&xas);
   2677	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
   2678	return ret;
   2679}
   2680
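       /*
        * Compound page destructor for THPs: remove the page from its
        * deferred split queue (if queued) before freeing the compound page.
        */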
   2681void free_transhuge_page(struct page *page)
   2682{
   2683	struct deferred_split *ds_queue = get_deferred_split_queue(page);
   2684	unsigned long flags;
   2685
   2686	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
   2687	if (!list_empty(page_deferred_list(page))) {
   2688		ds_queue->split_queue_len--;
   2689		list_del(page_deferred_list(page));
   2690	}
   2691	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
   2692	free_compound_page(page);
   2693}
   2694
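       /*
        * Add a THP to its deferred split queue so the shrinker can split it
        * later under memory pressure; pages already in the swap cache are
        * skipped, since page reclaim is handling them.
        */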
   2695void deferred_split_huge_page(struct page *page)
   2696{
   2697	struct deferred_split *ds_queue = get_deferred_split_queue(page);
   2698#ifdef CONFIG_MEMCG
   2699	struct mem_cgroup *memcg = page_memcg(compound_head(page));
   2700#endif
   2701	unsigned long flags;
   2702
   2703	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
   2704
   2705	/*
   2706	 * The try_to_unmap() in the page reclaim path might reach here too;
   2707	 * this may cause a race condition that corrupts the deferred split queue.
   2708	 * And if page reclaim is already handling the same page, it is
   2709	 * unnecessary to handle it again in the shrinker.
   2710	 *
   2711	 * Check PageSwapCache to determine if the page is being
   2712	 * handled by page reclaim since THP swap would add the page into
   2713	 * swap cache before calling try_to_unmap().
   2714	 */
   2715	if (PageSwapCache(page))
   2716		return;
   2717
   2718	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
   2719	if (list_empty(page_deferred_list(page))) {
   2720		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
   2721		list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
   2722		ds_queue->split_queue_len++;
   2723#ifdef CONFIG_MEMCG
   2724		if (memcg)
   2725			set_shrinker_bit(memcg, page_to_nid(page),
   2726					 deferred_split_shrinker.id);
   2727#endif
   2728	}
   2729	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
   2730}
   2731
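       /*
        * Shrinker ->count_objects callback: report the length of the
        * deferred split queue for this node/memcg.
        */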
   2732static unsigned long deferred_split_count(struct shrinker *shrink,
   2733		struct shrink_control *sc)
   2734{
   2735	struct pglist_data *pgdata = NODE_DATA(sc->nid);
   2736	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
   2737
   2738#ifdef CONFIG_MEMCG
   2739	if (sc->memcg)
   2740		ds_queue = &sc->memcg->deferred_split_queue;
   2741#endif
   2742	return READ_ONCE(ds_queue->split_queue_len);
   2743}
   2744
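       /*
        * Shrinker ->scan_objects callback: take a reference on queued THPs,
        * try to split each one, and requeue whatever could not be split.
        */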
   2745static unsigned long deferred_split_scan(struct shrinker *shrink,
   2746		struct shrink_control *sc)
   2747{
   2748	struct pglist_data *pgdata = NODE_DATA(sc->nid);
   2749	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
   2750	unsigned long flags;
   2751	LIST_HEAD(list), *pos, *next;
   2752	struct page *page;
   2753	int split = 0;
   2754
   2755#ifdef CONFIG_MEMCG
   2756	if (sc->memcg)
   2757		ds_queue = &sc->memcg->deferred_split_queue;
   2758#endif
   2759
   2760	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
   2761	/* Take pin on all head pages to avoid freeing them under us */
   2762	list_for_each_safe(pos, next, &ds_queue->split_queue) {
   2763		page = list_entry((void *)pos, struct page, deferred_list);
   2764		page = compound_head(page);
   2765		if (get_page_unless_zero(page)) {
   2766			list_move(page_deferred_list(page), &list);
   2767		} else {
   2768			/* We lost race with put_compound_page() */
   2769			list_del_init(page_deferred_list(page));
   2770			ds_queue->split_queue_len--;
   2771		}
   2772		if (!--sc->nr_to_scan)
   2773			break;
   2774	}
   2775	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
   2776
   2777	list_for_each_safe(pos, next, &list) {
   2778		page = list_entry((void *)pos, struct page, deferred_list);
   2779		if (!trylock_page(page))
   2780			goto next;
   2781		/* split_huge_page() removes page from list on success */
   2782		if (!split_huge_page(page))
   2783			split++;
   2784		unlock_page(page);
   2785next:
   2786		put_page(page);
   2787	}
   2788
   2789	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
   2790	list_splice_tail(&list, &ds_queue->split_queue);
   2791	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
   2792
   2793	/*
   2794	 * Stop the shrinker if we didn't split any page and the queue is empty.
   2795	 * This can happen if pages were freed under us.
   2796	 */
   2797	if (!split && list_empty(&ds_queue->split_queue))
   2798		return SHRINK_STOP;
   2799	return split;
   2800}
   2801
   2802static struct shrinker deferred_split_shrinker = {
   2803	.count_objects = deferred_split_count,
   2804	.scan_objects = deferred_split_scan,
   2805	.seeks = DEFAULT_SEEKS,
   2806	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
   2807		 SHRINKER_NONSLAB,
   2808};
   2809
   2810#ifdef CONFIG_DEBUG_FS
   2811static void split_huge_pages_all(void)
   2812{
   2813	struct zone *zone;
   2814	struct page *page;
   2815	unsigned long pfn, max_zone_pfn;
   2816	unsigned long total = 0, split = 0;
   2817
   2818	pr_debug("Split all THPs\n");
   2819	for_each_populated_zone(zone) {
   2820		max_zone_pfn = zone_end_pfn(zone);
   2821		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
   2822			if (!pfn_valid(pfn))
   2823				continue;
   2824
   2825			page = pfn_to_page(pfn);
   2826			if (!get_page_unless_zero(page))
   2827				continue;
   2828
   2829			if (zone != page_zone(page))
   2830				goto next;
   2831
   2832			if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
   2833				goto next;
   2834
   2835			total++;
   2836			lock_page(page);
   2837			if (!split_huge_page(page))
   2838				split++;
   2839			unlock_page(page);
   2840next:
   2841			put_page(page);
   2842			cond_resched();
   2843		}
   2844	}
   2845
   2846	pr_debug("%lu of %lu THP split\n", split, total);
   2847}
   2848
   2849static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
   2850{
   2851	return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
   2852		    is_vm_hugetlb_page(vma);
   2853}
   2854
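       /*
        * Debugfs helper: walk [vaddr_start, vaddr_end) of process @pid page
        * by page and split every THP found there.
        */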
   2855static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
   2856				unsigned long vaddr_end)
   2857{
   2858	int ret = 0;
   2859	struct task_struct *task;
   2860	struct mm_struct *mm;
   2861	unsigned long total = 0, split = 0;
   2862	unsigned long addr;
   2863
   2864	vaddr_start &= PAGE_MASK;
   2865	vaddr_end &= PAGE_MASK;
   2866
   2867	/* Find the task_struct from pid */
   2868	rcu_read_lock();
   2869	task = find_task_by_vpid(pid);
   2870	if (!task) {
   2871		rcu_read_unlock();
   2872		ret = -ESRCH;
   2873		goto out;
   2874	}
   2875	get_task_struct(task);
   2876	rcu_read_unlock();
   2877
   2878	/* Find the mm_struct */
   2879	mm = get_task_mm(task);
   2880	put_task_struct(task);
   2881
   2882	if (!mm) {
   2883		ret = -EINVAL;
   2884		goto out;
   2885	}
   2886
   2887	pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
   2888		 pid, vaddr_start, vaddr_end);
   2889
   2890	mmap_read_lock(mm);
   2891	/*
   2892	 * Always increase addr by PAGE_SIZE, since we could have a PTE page
   2893	 * table filled with PTE-mapped THPs, each of which is distinct.
   2894	 */
   2895	for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
   2896		struct vm_area_struct *vma = find_vma(mm, addr);
   2897		struct page *page;
   2898
   2899		if (!vma || addr < vma->vm_start)
   2900			break;
   2901
   2902		/* skip special VMA and hugetlb VMA */
   2903		if (vma_not_suitable_for_thp_split(vma)) {
   2904			addr = vma->vm_end;
   2905			continue;
   2906		}
   2907
   2908		/* FOLL_DUMP to ignore special (like zero) pages */
   2909		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
   2910
   2911		if (IS_ERR(page))
   2912			continue;
   2913		if (!page)
   2914			continue;
   2915
   2916		if (!is_transparent_hugepage(page))
   2917			goto next;
   2918
   2919		total++;
   2920		if (!can_split_folio(page_folio(page), NULL))
   2921			goto next;
   2922
   2923		if (!trylock_page(page))
   2924			goto next;
   2925
   2926		if (!split_huge_page(page))
   2927			split++;
   2928
   2929		unlock_page(page);
   2930next:
   2931		put_page(page);
   2932		cond_resched();
   2933	}
   2934	mmap_read_unlock(mm);
   2935	mmput(mm);
   2936
   2937	pr_debug("%lu of %lu THP split\n", split, total);
   2938
   2939out:
   2940	return ret;
   2941}
   2942
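       /*
        * Debugfs helper: split the file-backed THPs of @file_path whose
        * page cache indices fall in [off_start, off_end).
        */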
   2943static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
   2944				pgoff_t off_end)
   2945{
   2946	struct filename *file;
   2947	struct file *candidate;
   2948	struct address_space *mapping;
   2949	int ret = -EINVAL;
   2950	pgoff_t index;
   2951	int nr_pages = 1;
   2952	unsigned long total = 0, split = 0;
   2953
   2954	file = getname_kernel(file_path);
   2955	if (IS_ERR(file))
   2956		return ret;
   2957
   2958	candidate = file_open_name(file, O_RDONLY, 0);
   2959	if (IS_ERR(candidate))
   2960		goto out;
   2961
   2962	pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
   2963		 file_path, off_start, off_end);
   2964
   2965	mapping = candidate->f_mapping;
   2966
   2967	for (index = off_start; index < off_end; index += nr_pages) {
   2968		struct page *fpage = pagecache_get_page(mapping, index,
   2969						FGP_ENTRY | FGP_HEAD, 0);
   2970
   2971		nr_pages = 1;
   2972		if (xa_is_value(fpage) || !fpage)
   2973			continue;
   2974
   2975		if (!is_transparent_hugepage(fpage))
   2976			goto next;
   2977
   2978		total++;
   2979		nr_pages = thp_nr_pages(fpage);
   2980
   2981		if (!trylock_page(fpage))
   2982			goto next;
   2983
   2984		if (!split_huge_page(fpage))
   2985			split++;
   2986
   2987		unlock_page(fpage);
   2988next:
   2989		put_page(fpage);
   2990		cond_resched();
   2991	}
   2992
   2993	filp_close(candidate, NULL);
   2994	ret = 0;
   2995
   2996	pr_debug("%lu of %lu file-backed THP split\n", split, total);
   2997out:
   2998	putname(file);
   2999	return ret;
   3000}
   3001
   3002#define MAX_INPUT_BUF_SZ 255
   3003
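       /*
        * Parse writes to the split_huge_pages debugfs file: "1" splits all
        * THPs in the system, "<pid>,0x<start>,0x<end>" splits THPs mapped
        * by a process, and "/path,0x<off_start>,0x<off_end>" splits
        * file-backed THPs.
        */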
   3004static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
   3005				size_t count, loff_t *ppops)
   3006{
   3007	static DEFINE_MUTEX(split_debug_mutex);
   3008	ssize_t ret;
   3009	/* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
   3010	char input_buf[MAX_INPUT_BUF_SZ];
   3011	int pid;
   3012	unsigned long vaddr_start, vaddr_end;
   3013
   3014	ret = mutex_lock_interruptible(&split_debug_mutex);
   3015	if (ret)
   3016		return ret;
   3017
   3018	ret = -EFAULT;
   3019
   3020	memset(input_buf, 0, MAX_INPUT_BUF_SZ);
   3021	if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
   3022		goto out;
   3023
   3024	input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
   3025
   3026	if (input_buf[0] == '/') {
   3027		char *tok;
   3028		char *buf = input_buf;
   3029		char file_path[MAX_INPUT_BUF_SZ];
   3030		pgoff_t off_start = 0, off_end = 0;
   3031		size_t input_len = strlen(input_buf);
   3032
   3033		tok = strsep(&buf, ",");
   3034		if (tok) {
   3035			strcpy(file_path, tok);
   3036		} else {
   3037			ret = -EINVAL;
   3038			goto out;
   3039		}
   3040
   3041		ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end);
   3042		if (ret != 2) {
   3043			ret = -EINVAL;
   3044			goto out;
   3045		}
   3046		ret = split_huge_pages_in_file(file_path, off_start, off_end);
   3047		if (!ret)
   3048			ret = input_len;
   3049
   3050		goto out;
   3051	}
   3052
   3053	ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
   3054	if (ret == 1 && pid == 1) {
   3055		split_huge_pages_all();
   3056		ret = strlen(input_buf);
   3057		goto out;
   3058	} else if (ret != 3) {
   3059		ret = -EINVAL;
   3060		goto out;
   3061	}
   3062
   3063	ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
   3064	if (!ret)
   3065		ret = strlen(input_buf);
   3066out:
   3067	mutex_unlock(&split_debug_mutex);
   3068	return ret;
   3069
   3070}
   3071
   3072static const struct file_operations split_huge_pages_fops = {
   3073	.owner	 = THIS_MODULE,
   3074	.write	 = split_huge_pages_write,
   3075	.llseek  = no_llseek,
   3076};
   3077
   3078static int __init split_huge_pages_debugfs(void)
   3079{
   3080	debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
   3081			    &split_huge_pages_fops);
   3082	return 0;
   3083}
   3084late_initcall(split_huge_pages_debugfs);
   3085#endif
   3086
   3087#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
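       /*
        * Convert a mapped huge pmd into a pmd migration entry for page
        * migration. Returns -EBUSY if the anon-exclusive flag cannot be
        * shared, 0 otherwise.
        */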
   3088int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
   3089		struct page *page)
   3090{
   3091	struct vm_area_struct *vma = pvmw->vma;
   3092	struct mm_struct *mm = vma->vm_mm;
   3093	unsigned long address = pvmw->address;
   3094	bool anon_exclusive;
   3095	pmd_t pmdval;
   3096	swp_entry_t entry;
   3097	pmd_t pmdswp;
   3098
   3099	if (!(pvmw->pmd && !pvmw->pte))
   3100		return 0;
   3101
   3102	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
   3103	pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
   3104
   3105	anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
   3106	if (anon_exclusive && page_try_share_anon_rmap(page)) {
   3107		set_pmd_at(mm, address, pvmw->pmd, pmdval);
   3108		return -EBUSY;
   3109	}
   3110
   3111	if (pmd_dirty(pmdval))
   3112		set_page_dirty(page);
   3113	if (pmd_write(pmdval))
   3114		entry = make_writable_migration_entry(page_to_pfn(page));
   3115	else if (anon_exclusive)
   3116		entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
   3117	else
   3118		entry = make_readable_migration_entry(page_to_pfn(page));
   3119	pmdswp = swp_entry_to_pmd(entry);
   3120	if (pmd_soft_dirty(pmdval))
   3121		pmdswp = pmd_swp_mksoft_dirty(pmdswp);
   3122	set_pmd_at(mm, address, pvmw->pmd, pmdswp);
   3123	page_remove_rmap(page, vma, true);
   3124	put_page(page);
   3125	trace_set_migration_pmd(address, pmd_val(pmdswp));
   3126
   3127	return 0;
   3128}
   3129
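       /*
        * Replace a pmd migration entry with a present huge pmd mapping @new
        * once migration completes, restoring the rmap and the
        * write/soft-dirty/uffd-wp state.
        */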
   3130void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
   3131{
   3132	struct vm_area_struct *vma = pvmw->vma;
   3133	struct mm_struct *mm = vma->vm_mm;
   3134	unsigned long address = pvmw->address;
   3135	unsigned long mmun_start = address & HPAGE_PMD_MASK;
   3136	pmd_t pmde;
   3137	swp_entry_t entry;
   3138
   3139	if (!(pvmw->pmd && !pvmw->pte))
   3140		return;
   3141
   3142	entry = pmd_to_swp_entry(*pvmw->pmd);
   3143	get_page(new);
   3144	pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
   3145	if (pmd_swp_soft_dirty(*pvmw->pmd))
   3146		pmde = pmd_mksoft_dirty(pmde);
   3147	if (is_writable_migration_entry(entry))
   3148		pmde = maybe_pmd_mkwrite(pmde, vma);
   3149	if (pmd_swp_uffd_wp(*pvmw->pmd))
   3150		pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
   3151
   3152	if (PageAnon(new)) {
   3153		rmap_t rmap_flags = RMAP_COMPOUND;
   3154
   3155		if (!is_readable_migration_entry(entry))
   3156			rmap_flags |= RMAP_EXCLUSIVE;
   3157
   3158		page_add_anon_rmap(new, vma, mmun_start, rmap_flags);
   3159	} else {
   3160		page_add_file_rmap(new, vma, true);
   3161	}
   3162	VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new));
   3163	set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
   3164
   3165	/* No need to invalidate - it was non-present before */
   3166	update_mmu_cache_pmd(vma, address, pvmw->pmd);
   3167	trace_remove_migration_pmd(address, pmd_val(pmde));
   3168}
   3169#endif