ksm.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
ksm.c (91225B)
      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Memory merging support.
      4 *
      5 * This code enables dynamic sharing of identical pages found in different
      6 * memory areas, even if they are not shared by fork()
      7 *
      8 * Copyright (C) 2008-2009 Red Hat, Inc.
      9 * Authors:
     10 *	Izik Eidus
     11 *	Andrea Arcangeli
     12 *	Chris Wright
     13 *	Hugh Dickins
     14 */
     15
     16#include <linux/errno.h>
     17#include <linux/mm.h>
     18#include <linux/mm_inline.h>
     19#include <linux/fs.h>
     20#include <linux/mman.h>
     21#include <linux/sched.h>
     22#include <linux/sched/mm.h>
     23#include <linux/sched/coredump.h>
     24#include <linux/rwsem.h>
     25#include <linux/pagemap.h>
     26#include <linux/rmap.h>
     27#include <linux/spinlock.h>
     28#include <linux/xxhash.h>
     29#include <linux/delay.h>
     30#include <linux/kthread.h>
     31#include <linux/wait.h>
     32#include <linux/slab.h>
     33#include <linux/rbtree.h>
     34#include <linux/memory.h>
     35#include <linux/mmu_notifier.h>
     36#include <linux/swap.h>
     37#include <linux/ksm.h>
     38#include <linux/hashtable.h>
     39#include <linux/freezer.h>
     40#include <linux/oom.h>
     41#include <linux/numa.h>
     42
     43#include <asm/tlbflush.h>
     44#include "internal.h"
     45
     46#ifdef CONFIG_NUMA
     47#define NUMA(x)		(x)
     48#define DO_NUMA(x)	do { (x); } while (0)
     49#else
     50#define NUMA(x)		(0)
     51#define DO_NUMA(x)	do { } while (0)
     52#endif
     53
     54/**
     55 * DOC: Overview
     56 *
     57 * A few notes about the KSM scanning process,
     58 * to make it easier to understand the data structures below:
     59 *
     60 * In order to reduce excessive scanning, KSM sorts the memory pages by their
     61 * contents into a data structure that holds pointers to the pages' locations.
     62 *
     63 * Since the contents of the pages may change at any moment, KSM cannot just
     64 * insert the pages into a normal sorted tree and expect it to find anything.
     65 * Therefore KSM uses two data structures - the stable and the unstable tree.
     66 *
     67 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
     68 * by their contents.  Because each such page is write-protected, searching on
     69 * this tree is fully assured to be working (except when pages are unmapped),
     70 * and therefore this tree is called the stable tree.
     71 *
     72 * The stable tree node includes information required for reverse
     73 * mapping from a KSM page to virtual addresses that map this page.
     74 *
     75 * In order to avoid large latencies of the rmap walks on KSM pages,
     76 * KSM maintains two types of nodes in the stable tree:
     77 *
     78 * * the regular nodes that keep the reverse mapping structures in a
     79 *   linked list
     80 * * the "chains" that link nodes ("dups") that represent the same
     81 *   write protected memory content, but each "dup" corresponds to a
     82 *   different KSM page copy of that content
     83 *
     84 * Internally, the regular nodes, "dups" and "chains" are represented
     85 * using the same struct stable_node structure.
     86 *
     87 * In addition to the stable tree, KSM uses a second data structure called the
     88 * unstable tree: this tree holds pointers to pages which have been found to
     89 * be "unchanged for a period of time".  The unstable tree sorts these pages
     90 * by their contents, but since they are not write-protected, KSM cannot rely
     91 * upon the unstable tree to work correctly - the unstable tree is liable to
     92 * be corrupted as its contents are modified, and so it is called unstable.
     93 *
     94 * KSM solves this problem by several techniques:
     95 *
     96 * 1) The unstable tree is flushed every time KSM completes scanning all
     97 *    memory areas, and then the tree is rebuilt again from the beginning.
     98 * 2) KSM will only insert into the unstable tree, pages whose hash value
     99 *    has not changed since the previous scan of all memory areas.
    100 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
    101 *    colors of the nodes and not on their contents, assuring that even when
    102 *    the tree gets "corrupted" it won't get out of balance, so scanning time
    103 *    remains the same (also, searching and inserting nodes in an rbtree uses
    104 *    the same algorithm, so we have no overhead when we flush and rebuild).
    105 * 4) KSM never flushes the stable tree, which means that even if it were to
    106 *    take 10 attempts to find a page in the unstable tree, once it is found,
    107 *    it is secured in the stable tree.  (When we scan a new page, we first
    108 *    compare it against the stable tree, and then against the unstable tree.)
    109 *
    110 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
    111 * stable trees and multiple unstable trees: one of each for each NUMA node.
    112 */
    113
    114/**
    115 * struct mm_slot - ksm information per mm that is being scanned
    116 * @link: link to the mm_slots hash list
    117 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
    118 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
    119 * @mm: the mm that this information is valid for
    120 */
    121struct mm_slot {
    122	struct hlist_node link;
    123	struct list_head mm_list;
    124	struct rmap_item *rmap_list;
    125	struct mm_struct *mm;
    126};
    127
    128/**
    129 * struct ksm_scan - cursor for scanning
    130 * @mm_slot: the current mm_slot we are scanning
    131 * @address: the next address inside that to be scanned
    132 * @rmap_list: link to the next rmap to be scanned in the rmap_list
    133 * @seqnr: count of completed full scans (needed when removing unstable node)
    134 *
    135 * There is only the one ksm_scan instance of this cursor structure.
    136 */
    137struct ksm_scan {
    138	struct mm_slot *mm_slot;
    139	unsigned long address;
    140	struct rmap_item **rmap_list;
    141	unsigned long seqnr;
    142};
    143
    144/**
    145 * struct stable_node - node of the stable rbtree
    146 * @node: rb node of this ksm page in the stable tree
    147 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
    148 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
    149 * @list: linked into migrate_nodes, pending placement in the proper node tree
    150 * @hlist: hlist head of rmap_items using this ksm page
    151 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
    152 * @chain_prune_time: time of the last full garbage collection
    153 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
    154 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
    155 */
    156struct stable_node {
    157	union {
    158		struct rb_node node;	/* when node of stable tree */
    159		struct {		/* when listed for migration */
    160			struct list_head *head;
    161			struct {
    162				struct hlist_node hlist_dup;
    163				struct list_head list;
    164			};
    165		};
    166	};
    167	struct hlist_head hlist;
    168	union {
    169		unsigned long kpfn;
    170		unsigned long chain_prune_time;
    171	};
    172	/*
    173	 * STABLE_NODE_CHAIN can be any negative number in
    174	 * rmap_hlist_len negative range, but better not -1 to be able
    175	 * to reliably detect underflows.
    176	 */
    177#define STABLE_NODE_CHAIN -1024
    178	int rmap_hlist_len;
    179#ifdef CONFIG_NUMA
    180	int nid;
    181#endif
    182};
    183
    184/**
    185 * struct rmap_item - reverse mapping item for virtual addresses
    186 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
    187 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
    188 * @nid: NUMA node id of unstable tree in which linked (may not match page)
    189 * @mm: the memory structure this rmap_item is pointing into
    190 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
    191 * @oldchecksum: previous checksum of the page at that virtual address
    192 * @node: rb node of this rmap_item in the unstable tree
    193 * @head: pointer to stable_node heading this list in the stable tree
    194 * @hlist: link into hlist of rmap_items hanging off that stable_node
    195 */
    196struct rmap_item {
    197	struct rmap_item *rmap_list;
    198	union {
    199		struct anon_vma *anon_vma;	/* when stable */
    200#ifdef CONFIG_NUMA
    201		int nid;		/* when node of unstable tree */
    202#endif
    203	};
    204	struct mm_struct *mm;
    205	unsigned long address;		/* + low bits used for flags below */
    206	unsigned int oldchecksum;	/* when unstable */
    207	union {
    208		struct rb_node node;	/* when node of unstable tree */
    209		struct {		/* when listed from stable tree */
    210			struct stable_node *head;
    211			struct hlist_node hlist;
    212		};
    213	};
    214};
    215
    216#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
    217#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
    218#define STABLE_FLAG	0x200	/* is listed from the stable tree */
    219
    220/* The stable and unstable tree heads */
    221static struct rb_root one_stable_tree[1] = { RB_ROOT };
    222static struct rb_root one_unstable_tree[1] = { RB_ROOT };
    223static struct rb_root *root_stable_tree = one_stable_tree;
    224static struct rb_root *root_unstable_tree = one_unstable_tree;
    225
    226/* Recently migrated nodes of stable tree, pending proper placement */
    227static LIST_HEAD(migrate_nodes);
    228#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
    229
    230#define MM_SLOTS_HASH_BITS 10
    231static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
    232
    233static struct mm_slot ksm_mm_head = {
    234	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
    235};
    236static struct ksm_scan ksm_scan = {
    237	.mm_slot = &ksm_mm_head,
    238};
    239
    240static struct kmem_cache *rmap_item_cache;
    241static struct kmem_cache *stable_node_cache;
    242static struct kmem_cache *mm_slot_cache;
    243
    244/* The number of nodes in the stable tree */
    245static unsigned long ksm_pages_shared;
    246
    247/* The number of page slots additionally sharing those nodes */
    248static unsigned long ksm_pages_sharing;
    249
    250/* The number of nodes in the unstable tree */
    251static unsigned long ksm_pages_unshared;
    252
    253/* The number of rmap_items in use: to calculate pages_volatile */
    254static unsigned long ksm_rmap_items;
    255
    256/* The number of stable_node chains */
    257static unsigned long ksm_stable_node_chains;
    258
    259/* The number of stable_node dups linked to the stable_node chains */
    260static unsigned long ksm_stable_node_dups;
    261
    262/* Delay in pruning stale stable_node_dups in the stable_node_chains */
    263static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;
    264
    265/* Maximum number of page slots sharing a stable node */
    266static int ksm_max_page_sharing = 256;
    267
    268/* Number of pages ksmd should scan in one batch */
    269static unsigned int ksm_thread_pages_to_scan = 100;
    270
    271/* Milliseconds ksmd should sleep between batches */
    272static unsigned int ksm_thread_sleep_millisecs = 20;
    273
    274/* Checksum of an empty (zeroed) page */
    275static unsigned int zero_checksum __read_mostly;
    276
    277/* Whether to merge empty (zeroed) pages with actual zero pages */
    278static bool ksm_use_zero_pages __read_mostly;
    279
    280#ifdef CONFIG_NUMA
    281/* Zeroed when merging across nodes is not allowed */
    282static unsigned int ksm_merge_across_nodes = 1;
    283static int ksm_nr_node_ids = 1;
    284#else
    285#define ksm_merge_across_nodes	1U
    286#define ksm_nr_node_ids		1
    287#endif
    288
    289#define KSM_RUN_STOP	0
    290#define KSM_RUN_MERGE	1
    291#define KSM_RUN_UNMERGE	2
    292#define KSM_RUN_OFFLINE	4
    293static unsigned long ksm_run = KSM_RUN_STOP;
    294static void wait_while_offlining(void);
    295
    296static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
    297static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
    298static DEFINE_MUTEX(ksm_thread_mutex);
    299static DEFINE_SPINLOCK(ksm_mmlist_lock);
    300
    301#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
    302		sizeof(struct __struct), __alignof__(struct __struct),\
    303		(__flags), NULL)
    304
    305static int __init ksm_slab_init(void)
    306{
    307	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
    308	if (!rmap_item_cache)
    309		goto out;
    310
    311	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
    312	if (!stable_node_cache)
    313		goto out_free1;
    314
    315	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
    316	if (!mm_slot_cache)
    317		goto out_free2;
    318
    319	return 0;
    320
    321out_free2:
    322	kmem_cache_destroy(stable_node_cache);
    323out_free1:
    324	kmem_cache_destroy(rmap_item_cache);
    325out:
    326	return -ENOMEM;
    327}
    328
    329static void __init ksm_slab_free(void)
    330{
    331	kmem_cache_destroy(mm_slot_cache);
    332	kmem_cache_destroy(stable_node_cache);
    333	kmem_cache_destroy(rmap_item_cache);
    334	mm_slot_cache = NULL;
    335}
    336
    337static __always_inline bool is_stable_node_chain(struct stable_node *chain)
    338{
    339	return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
    340}
    341
    342static __always_inline bool is_stable_node_dup(struct stable_node *dup)
    343{
    344	return dup->head == STABLE_NODE_DUP_HEAD;
    345}
    346
    347static inline void stable_node_chain_add_dup(struct stable_node *dup,
    348					     struct stable_node *chain)
    349{
    350	VM_BUG_ON(is_stable_node_dup(dup));
    351	dup->head = STABLE_NODE_DUP_HEAD;
    352	VM_BUG_ON(!is_stable_node_chain(chain));
    353	hlist_add_head(&dup->hlist_dup, &chain->hlist);
    354	ksm_stable_node_dups++;
    355}
    356
    357static inline void __stable_node_dup_del(struct stable_node *dup)
    358{
    359	VM_BUG_ON(!is_stable_node_dup(dup));
    360	hlist_del(&dup->hlist_dup);
    361	ksm_stable_node_dups--;
    362}
    363
    364static inline void stable_node_dup_del(struct stable_node *dup)
    365{
    366	VM_BUG_ON(is_stable_node_chain(dup));
    367	if (is_stable_node_dup(dup))
    368		__stable_node_dup_del(dup);
    369	else
    370		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
    371#ifdef CONFIG_DEBUG_VM
    372	dup->head = NULL;
    373#endif
    374}
    375
    376static inline struct rmap_item *alloc_rmap_item(void)
    377{
    378	struct rmap_item *rmap_item;
    379
    380	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
    381						__GFP_NORETRY | __GFP_NOWARN);
    382	if (rmap_item)
    383		ksm_rmap_items++;
    384	return rmap_item;
    385}
    386
    387static inline void free_rmap_item(struct rmap_item *rmap_item)
    388{
    389	ksm_rmap_items--;
    390	rmap_item->mm = NULL;	/* debug safety */
    391	kmem_cache_free(rmap_item_cache, rmap_item);
    392}
    393
    394static inline struct stable_node *alloc_stable_node(void)
    395{
    396	/*
    397	 * The allocation can take too long with GFP_KERNEL when memory is under
    398	 * pressure, which may lead to hung task warnings.  Adding __GFP_HIGH
    399	 * grants access to memory reserves, helping to avoid this problem.
    400	 */
    401	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
    402}
    403
    404static inline void free_stable_node(struct stable_node *stable_node)
    405{
    406	VM_BUG_ON(stable_node->rmap_hlist_len &&
    407		  !is_stable_node_chain(stable_node));
    408	kmem_cache_free(stable_node_cache, stable_node);
    409}
    410
    411static inline struct mm_slot *alloc_mm_slot(void)
    412{
    413	if (!mm_slot_cache)	/* initialization failed */
    414		return NULL;
    415	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
    416}
    417
    418static inline void free_mm_slot(struct mm_slot *mm_slot)
    419{
    420	kmem_cache_free(mm_slot_cache, mm_slot);
    421}
    422
    423static struct mm_slot *get_mm_slot(struct mm_struct *mm)
    424{
    425	struct mm_slot *slot;
    426
    427	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
    428		if (slot->mm == mm)
    429			return slot;
    430
    431	return NULL;
    432}
    433
    434static void insert_to_mm_slots_hash(struct mm_struct *mm,
    435				    struct mm_slot *mm_slot)
    436{
    437	mm_slot->mm = mm;
    438	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
    439}
    440
    441/*
    442 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
    443 * page tables after it has passed through ksm_exit() - which, if necessary,
    444 * takes mmap_lock briefly to serialize against them.  ksm_exit() does not set
    445 * a special flag: they can just back out as soon as mm_users goes to zero.
    446 * ksm_test_exit() is used throughout to make this test for exit: in some
    447 * places for correctness, in some places just to avoid unnecessary work.
    448 */
    449static inline bool ksm_test_exit(struct mm_struct *mm)
    450{
    451	return atomic_read(&mm->mm_users) == 0;
    452}
    453
    454/*
    455 * We use break_ksm to break COW on a ksm page: it's a stripped down
    456 *
    457 *	if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
    458 *		put_page(page);
    459 *
    460 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
    461 * in case the application has unmapped and remapped mm,addr meanwhile.
    462 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
    463 * mmap of /dev/mem, where we would not want to touch it.
    464 *
    465 * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
    466 * of the process that owns 'vma'.  We also do not want to enforce
    467 * protection keys here anyway.
    468 */
    469static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
    470{
    471	struct page *page;
    472	vm_fault_t ret = 0;
    473
    474	do {
    475		cond_resched();
    476		page = follow_page(vma, addr,
    477				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
    478		if (IS_ERR_OR_NULL(page))
    479			break;
    480		if (PageKsm(page))
    481			ret = handle_mm_fault(vma, addr,
    482					      FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
    483					      NULL);
    484		else
    485			ret = VM_FAULT_WRITE;
    486		put_page(page);
    487	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
    488	/*
    489	 * We must loop because handle_mm_fault() may back out if there's
    490	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
    491	 *
    492	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
    493	 * COW has been broken, even if the vma does not permit VM_WRITE;
    494	 * but note that a concurrent fault might break PageKsm for us.
    495	 *
    496	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
    497	 * backing file, which also invalidates anonymous pages: that's
    498	 * okay, that truncation will have unmapped the PageKsm for us.
    499	 *
    500	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
    501	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
    502	 * current task has TIF_MEMDIE set, and will be OOM killed on return
    503	 * to user; and ksmd, having no mm, would never be chosen for that.
    504	 *
    505	 * But if the mm is in a limited mem_cgroup, then the fault may fail
    506	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
    507	 * even ksmd can fail in this way - though it's usually breaking ksm
    508	 * just to undo a merge it made a moment before, so unlikely to oom.
    509	 *
    510	 * That's a pity: we might therefore have more kernel pages allocated
    511	 * than we're counting as nodes in the stable tree; but ksm_do_scan
    512	 * will retry to break_cow on each pass, so should recover the page
    513	 * in due course.  The important thing is to not let VM_MERGEABLE
    514	 * be cleared while any such pages might remain in the area.
    515	 */
    516	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
    517}
    518
    519static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
    520		unsigned long addr)
    521{
    522	struct vm_area_struct *vma;
    523	if (ksm_test_exit(mm))
    524		return NULL;
    525	vma = vma_lookup(mm, addr);
    526	if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
    527		return NULL;
    528	return vma;
    529}
    530
    531static void break_cow(struct rmap_item *rmap_item)
    532{
    533	struct mm_struct *mm = rmap_item->mm;
    534	unsigned long addr = rmap_item->address;
    535	struct vm_area_struct *vma;
    536
    537	/*
    538	 * It is not an accident that whenever we want to break COW
    539	 * to undo, we also need to drop a reference to the anon_vma.
    540	 */
    541	put_anon_vma(rmap_item->anon_vma);
    542
    543	mmap_read_lock(mm);
    544	vma = find_mergeable_vma(mm, addr);
    545	if (vma)
    546		break_ksm(vma, addr);
    547	mmap_read_unlock(mm);
    548}
    549
    550static struct page *get_mergeable_page(struct rmap_item *rmap_item)
    551{
    552	struct mm_struct *mm = rmap_item->mm;
    553	unsigned long addr = rmap_item->address;
    554	struct vm_area_struct *vma;
    555	struct page *page;
    556
    557	mmap_read_lock(mm);
    558	vma = find_mergeable_vma(mm, addr);
    559	if (!vma)
    560		goto out;
    561
    562	page = follow_page(vma, addr, FOLL_GET);
    563	if (IS_ERR_OR_NULL(page))
    564		goto out;
    565	if (PageAnon(page)) {
    566		flush_anon_page(vma, page, addr);
    567		flush_dcache_page(page);
    568	} else {
    569		put_page(page);
    570out:
    571		page = NULL;
    572	}
    573	mmap_read_unlock(mm);
    574	return page;
    575}
    576
    577/*
    578 * This helper is used for getting right index into array of tree roots.
    579 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
    580 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
    581 * every node has its own stable and unstable tree.
    582 */
    583static inline int get_kpfn_nid(unsigned long kpfn)
    584{
    585	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
    586}
    587
    588static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
    589						   struct rb_root *root)
    590{
    591	struct stable_node *chain = alloc_stable_node();
    592	VM_BUG_ON(is_stable_node_chain(dup));
    593	if (likely(chain)) {
    594		INIT_HLIST_HEAD(&chain->hlist);
    595		chain->chain_prune_time = jiffies;
    596		chain->rmap_hlist_len = STABLE_NODE_CHAIN;
    597#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
    598		chain->nid = NUMA_NO_NODE; /* debug */
    599#endif
    600		ksm_stable_node_chains++;
    601
    602		/*
    603		 * Put the stable node chain in the first dimension of
    604		 * the stable tree and at the same time remove the old
    605		 * stable node.
    606		 */
    607		rb_replace_node(&dup->node, &chain->node, root);
    608
    609		/*
    610		 * Move the old stable node to the second dimension
    611		 * queued in the hlist_dup. The invariant is that all
    612		 * dup stable_nodes in the chain->hlist point to pages
    613		 * that are write protected and have the exact same
    614		 * content.
    615		 */
    616		stable_node_chain_add_dup(dup, chain);
    617	}
    618	return chain;
    619}
    620
    621static inline void free_stable_node_chain(struct stable_node *chain,
    622					  struct rb_root *root)
    623{
    624	rb_erase(&chain->node, root);
    625	free_stable_node(chain);
    626	ksm_stable_node_chains--;
    627}
    628
    629static void remove_node_from_stable_tree(struct stable_node *stable_node)
    630{
    631	struct rmap_item *rmap_item;
    632
    633	/* check it's not STABLE_NODE_CHAIN or negative */
    634	BUG_ON(stable_node->rmap_hlist_len < 0);
    635
    636	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
    637		if (rmap_item->hlist.next)
    638			ksm_pages_sharing--;
    639		else
    640			ksm_pages_shared--;
    641
    642		rmap_item->mm->ksm_merging_pages--;
    643
    644		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
    645		stable_node->rmap_hlist_len--;
    646		put_anon_vma(rmap_item->anon_vma);
    647		rmap_item->address &= PAGE_MASK;
    648		cond_resched();
    649	}
    650
    651	/*
    652	 * We need the second aligned pointer of the migrate_nodes
    653	 * list_head to stay clear from the rb_parent_color union
    654	 * (aligned and different than any node) and also different
    655	 * from &migrate_nodes. This will verify that future list.h changes
    656	 * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
    657	 */
    658	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
    659	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
    660
    661	if (stable_node->head == &migrate_nodes)
    662		list_del(&stable_node->list);
    663	else
    664		stable_node_dup_del(stable_node);
    665	free_stable_node(stable_node);
    666}
    667
    668enum get_ksm_page_flags {
    669	GET_KSM_PAGE_NOLOCK,
    670	GET_KSM_PAGE_LOCK,
    671	GET_KSM_PAGE_TRYLOCK
    672};
    673
    674/*
    675 * get_ksm_page: checks if the page indicated by the stable node
    676 * is still its ksm page, despite having held no reference to it.
    677 * In which case we can trust the content of the page, and it
    678 * returns the gotten page; but if the page has now been zapped,
    679 * remove the stale node from the stable tree and return NULL.
    680 * But beware, the stable node's page might be being migrated.
    681 *
    682 * You would expect the stable_node to hold a reference to the ksm page.
    683 * But if it increments the page's count, swapping out has to wait for
    684 * ksmd to come around again before it can free the page, which may take
    685 * seconds or even minutes: much too unresponsive.  So instead we use a
    686 * "keyhole reference": access to the ksm page from the stable node peeps
    687 * out through its keyhole to see if that page still holds the right key,
    688 * pointing back to this stable node.  This relies on freeing a PageAnon
    689 * page to reset its page->mapping to NULL, and relies on no other use of
    690 * a page to put something that might look like our key in page->mapping.
    691 * is on its way to being freed; but it is an anomaly to bear in mind.
    692 */
    693static struct page *get_ksm_page(struct stable_node *stable_node,
    694				 enum get_ksm_page_flags flags)
    695{
    696	struct page *page;
    697	void *expected_mapping;
    698	unsigned long kpfn;
    699
    700	expected_mapping = (void *)((unsigned long)stable_node |
    701					PAGE_MAPPING_KSM);
    702again:
    703	kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
    704	page = pfn_to_page(kpfn);
    705	if (READ_ONCE(page->mapping) != expected_mapping)
    706		goto stale;
    707
    708	/*
    709	 * We cannot do anything with the page while its refcount is 0.
    710	 * Usually 0 means free, or tail of a higher-order page: in which
    711	 * case this node is no longer referenced, and should be freed;
    712	 * however, it might mean that the page is under page_ref_freeze().
    713	 * The __remove_mapping() case is easy, again the node is now stale;
    714	 * the same is in reuse_ksm_page() case; but if page is swapcache
    715	 * in migrate_page_move_mapping(), it might still be our page,
    716	 * in which case it's essential to keep the node.
    717	 */
    718	while (!get_page_unless_zero(page)) {
    719		/*
    720		 * Another check for page->mapping != expected_mapping would
    721		 * work here too.  We have chosen the !PageSwapCache test to
    722		 * optimize the common case, when the page is or is about to
    723		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
    724		 * in the ref_freeze section of __remove_mapping(); but Anon
    725		 * page->mapping reset to NULL later, in free_pages_prepare().
    726		 */
    727		if (!PageSwapCache(page))
    728			goto stale;
    729		cpu_relax();
    730	}
    731
    732	if (READ_ONCE(page->mapping) != expected_mapping) {
    733		put_page(page);
    734		goto stale;
    735	}
    736
    737	if (flags == GET_KSM_PAGE_TRYLOCK) {
    738		if (!trylock_page(page)) {
    739			put_page(page);
    740			return ERR_PTR(-EBUSY);
    741		}
    742	} else if (flags == GET_KSM_PAGE_LOCK)
    743		lock_page(page);
    744
    745	if (flags != GET_KSM_PAGE_NOLOCK) {
    746		if (READ_ONCE(page->mapping) != expected_mapping) {
    747			unlock_page(page);
    748			put_page(page);
    749			goto stale;
    750		}
    751	}
    752	return page;
    753
    754stale:
    755	/*
    756	 * We come here from above when page->mapping or !PageSwapCache
    757	 * suggests that the node is stale; but it might be under migration.
    758	 * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
    759	 * before checking whether node->kpfn has been changed.
    760	 */
    761	smp_rmb();
    762	if (READ_ONCE(stable_node->kpfn) != kpfn)
    763		goto again;
    764	remove_node_from_stable_tree(stable_node);
    765	return NULL;
    766}
    767
    768/*
    769 * Removing rmap_item from stable or unstable tree.
    770 * This function will clean the information from the stable/unstable tree.
    771 */
    772static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
    773{
    774	if (rmap_item->address & STABLE_FLAG) {
    775		struct stable_node *stable_node;
    776		struct page *page;
    777
    778		stable_node = rmap_item->head;
    779		page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
    780		if (!page)
    781			goto out;
    782
    783		hlist_del(&rmap_item->hlist);
    784		unlock_page(page);
    785		put_page(page);
    786
    787		if (!hlist_empty(&stable_node->hlist))
    788			ksm_pages_sharing--;
    789		else
    790			ksm_pages_shared--;
    791
    792		rmap_item->mm->ksm_merging_pages--;
    793
    794		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
    795		stable_node->rmap_hlist_len--;
    796
    797		put_anon_vma(rmap_item->anon_vma);
    798		rmap_item->head = NULL;
    799		rmap_item->address &= PAGE_MASK;
    800
    801	} else if (rmap_item->address & UNSTABLE_FLAG) {
    802		unsigned char age;
    803		/*
    804		 * Usually ksmd can and must skip the rb_erase, because
    805		 * root_unstable_tree was already reset to RB_ROOT.
    806		 * But be careful when an mm is exiting: do the rb_erase
    807		 * if this rmap_item was inserted by this scan, rather
    808		 * than left over from before.
    809		 */
    810		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
    811		BUG_ON(age > 1);
    812		if (!age)
    813			rb_erase(&rmap_item->node,
    814				 root_unstable_tree + NUMA(rmap_item->nid));
    815		ksm_pages_unshared--;
    816		rmap_item->address &= PAGE_MASK;
    817	}
    818out:
    819	cond_resched();		/* we're called from many long loops */
    820}
    821
    822static void remove_trailing_rmap_items(struct rmap_item **rmap_list)
    823{
    824	while (*rmap_list) {
    825		struct rmap_item *rmap_item = *rmap_list;
    826		*rmap_list = rmap_item->rmap_list;
    827		remove_rmap_item_from_tree(rmap_item);
    828		free_rmap_item(rmap_item);
    829	}
    830}
    831
    832/*
    833 * Though it's very tempting to unmerge rmap_items from stable tree rather
    834 * than check every pte of a given vma, the locking doesn't quite work for
    835 * that - an rmap_item is assigned to the stable tree after inserting ksm
    836 * page and upping mmap_lock.  Nor does it fit with the way we skip dup'ing
    837 * rmap_items from parent to child at fork time (so as not to waste time
    838 * if exit comes before the next scan reaches it).
    839 *
    840 * Similarly, although we'd like to remove rmap_items (so updating counts
    841 * and freeing memory) when unmerging an area, it's easier to leave that
    842 * to the next pass of ksmd - consider, for example, how ksmd might be
    843 * in cmp_and_merge_page on one of the rmap_items we would be removing.
    844 */
    845static int unmerge_ksm_pages(struct vm_area_struct *vma,
    846			     unsigned long start, unsigned long end)
    847{
    848	unsigned long addr;
    849	int err = 0;
    850
    851	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
    852		if (ksm_test_exit(vma->vm_mm))
    853			break;
    854		if (signal_pending(current))
    855			err = -ERESTARTSYS;
    856		else
    857			err = break_ksm(vma, addr);
    858	}
    859	return err;
    860}
    861
    862static inline struct stable_node *folio_stable_node(struct folio *folio)
    863{
    864	return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
    865}
    866
    867static inline struct stable_node *page_stable_node(struct page *page)
    868{
    869	return folio_stable_node(page_folio(page));
    870}
    871
    872static inline void set_page_stable_node(struct page *page,
    873					struct stable_node *stable_node)
    874{
    875	VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page);
    876	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
    877}
    878
    879#ifdef CONFIG_SYSFS
    880/*
    881 * Only called through the sysfs control interface:
    882 */
    883static int remove_stable_node(struct stable_node *stable_node)
    884{
    885	struct page *page;
    886	int err;
    887
    888	page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
    889	if (!page) {
    890		/*
    891		 * get_ksm_page did remove_node_from_stable_tree itself.
    892		 */
    893		return 0;
    894	}
    895
    896	/*
    897	 * Page could be still mapped if this races with __mmput() running in
    898	 * between ksm_exit() and exit_mmap(). Just refuse to let
    899	 * merge_across_nodes/max_page_sharing be switched.
    900	 */
    901	err = -EBUSY;
    902	if (!page_mapped(page)) {
    903		/*
    904		 * The stable node did not yet appear stale to get_ksm_page(),
    905		 * since that allows for an unmapped ksm page to be recognized
    906		 * right up until it is freed; but the node is safe to remove.
    907		 * This page might be in a pagevec waiting to be freed,
    908		 * or it might be PageSwapCache (perhaps under writeback),
    909		 * or it might have been removed from swapcache a moment ago.
    910		 */
    911		set_page_stable_node(page, NULL);
    912		remove_node_from_stable_tree(stable_node);
    913		err = 0;
    914	}
    915
    916	unlock_page(page);
    917	put_page(page);
    918	return err;
    919}
    920
    921static int remove_stable_node_chain(struct stable_node *stable_node,
    922				    struct rb_root *root)
    923{
    924	struct stable_node *dup;
    925	struct hlist_node *hlist_safe;
    926
    927	if (!is_stable_node_chain(stable_node)) {
    928		VM_BUG_ON(is_stable_node_dup(stable_node));
    929		if (remove_stable_node(stable_node))
    930			return true;
    931		else
    932			return false;
    933	}
    934
    935	hlist_for_each_entry_safe(dup, hlist_safe,
    936				  &stable_node->hlist, hlist_dup) {
    937		VM_BUG_ON(!is_stable_node_dup(dup));
    938		if (remove_stable_node(dup))
    939			return true;
    940	}
    941	BUG_ON(!hlist_empty(&stable_node->hlist));
    942	free_stable_node_chain(stable_node, root);
    943	return false;
    944}
    945
    946static int remove_all_stable_nodes(void)
    947{
    948	struct stable_node *stable_node, *next;
    949	int nid;
    950	int err = 0;
    951
    952	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
    953		while (root_stable_tree[nid].rb_node) {
    954			stable_node = rb_entry(root_stable_tree[nid].rb_node,
    955						struct stable_node, node);
    956			if (remove_stable_node_chain(stable_node,
    957						     root_stable_tree + nid)) {
    958				err = -EBUSY;
    959				break;	/* proceed to next nid */
    960			}
    961			cond_resched();
    962		}
    963	}
    964	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
    965		if (remove_stable_node(stable_node))
    966			err = -EBUSY;
    967		cond_resched();
    968	}
    969	return err;
    970}
    971
    972static int unmerge_and_remove_all_rmap_items(void)
    973{
    974	struct mm_slot *mm_slot;
    975	struct mm_struct *mm;
    976	struct vm_area_struct *vma;
    977	int err = 0;
    978
    979	spin_lock(&ksm_mmlist_lock);
    980	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
    981						struct mm_slot, mm_list);
    982	spin_unlock(&ksm_mmlist_lock);
    983
    984	for (mm_slot = ksm_scan.mm_slot;
    985			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
    986		mm = mm_slot->mm;
    987		mmap_read_lock(mm);
    988		for (vma = mm->mmap; vma; vma = vma->vm_next) {
    989			if (ksm_test_exit(mm))
    990				break;
    991			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
    992				continue;
    993			err = unmerge_ksm_pages(vma,
    994						vma->vm_start, vma->vm_end);
    995			if (err)
    996				goto error;
    997		}
    998
    999		remove_trailing_rmap_items(&mm_slot->rmap_list);
   1000		mmap_read_unlock(mm);
   1001
   1002		spin_lock(&ksm_mmlist_lock);
   1003		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
   1004						struct mm_slot, mm_list);
   1005		if (ksm_test_exit(mm)) {
   1006			hash_del(&mm_slot->link);
   1007			list_del(&mm_slot->mm_list);
   1008			spin_unlock(&ksm_mmlist_lock);
   1009
   1010			free_mm_slot(mm_slot);
   1011			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
   1012			mmdrop(mm);
   1013		} else
   1014			spin_unlock(&ksm_mmlist_lock);
   1015	}
   1016
   1017	/* Clean up stable nodes, but don't worry if some are still busy */
   1018	remove_all_stable_nodes();
   1019	ksm_scan.seqnr = 0;
   1020	return 0;
   1021
   1022error:
   1023	mmap_read_unlock(mm);
   1024	spin_lock(&ksm_mmlist_lock);
   1025	ksm_scan.mm_slot = &ksm_mm_head;
   1026	spin_unlock(&ksm_mmlist_lock);
   1027	return err;
   1028}
   1029#endif /* CONFIG_SYSFS */
   1030
   1031static u32 calc_checksum(struct page *page)
   1032{
   1033	u32 checksum;
   1034	void *addr = kmap_atomic(page);
   1035	checksum = xxhash(addr, PAGE_SIZE, 0);
   1036	kunmap_atomic(addr);
   1037	return checksum;
   1038}
   1039
   1040static int write_protect_page(struct vm_area_struct *vma, struct page *page,
   1041			      pte_t *orig_pte)
   1042{
   1043	struct mm_struct *mm = vma->vm_mm;
   1044	DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0);
   1045	int swapped;
   1046	int err = -EFAULT;
   1047	struct mmu_notifier_range range;
   1048	bool anon_exclusive;
   1049
   1050	pvmw.address = page_address_in_vma(page, vma);
   1051	if (pvmw.address == -EFAULT)
   1052		goto out;
   1053
   1054	BUG_ON(PageTransCompound(page));
   1055
   1056	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
   1057				pvmw.address,
   1058				pvmw.address + PAGE_SIZE);
   1059	mmu_notifier_invalidate_range_start(&range);
   1060
   1061	if (!page_vma_mapped_walk(&pvmw))
   1062		goto out_mn;
   1063	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
   1064		goto out_unlock;
   1065
   1066	anon_exclusive = PageAnonExclusive(page);
   1067	if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
   1068	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
   1069	    anon_exclusive || mm_tlb_flush_pending(mm)) {
   1070		pte_t entry;
   1071
   1072		swapped = PageSwapCache(page);
   1073		flush_cache_page(vma, pvmw.address, page_to_pfn(page));
   1074		/*
   1075		 * Ok this is tricky, when get_user_pages_fast() run it doesn't
   1076		 * take any lock, therefore the check that we are going to make
   1077		 * with the pagecount against the mapcount is racy and
   1078		 * O_DIRECT can happen right after the check.
   1079		 * So we clear the pte and flush the tlb before the check
   1080		 * this assure us that no O_DIRECT can happen after the check
   1081		 * or in the middle of the check.
   1082		 *
   1083		 * No need to notify as we are downgrading page table to read
   1084		 * only not changing it to point to a new page.
   1085		 *
   1086		 * See Documentation/vm/mmu_notifier.rst
   1087		 */
   1088		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
   1089		/*
   1090		 * Check that no O_DIRECT or similar I/O is in progress on the
   1091		 * page
   1092		 */
   1093		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
   1094			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
   1095			goto out_unlock;
   1096		}
   1097
   1098		if (anon_exclusive && page_try_share_anon_rmap(page)) {
   1099			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
   1100			goto out_unlock;
   1101		}
   1102
   1103		if (pte_dirty(entry))
   1104			set_page_dirty(page);
   1105
   1106		if (pte_protnone(entry))
   1107			entry = pte_mkclean(pte_clear_savedwrite(entry));
   1108		else
   1109			entry = pte_mkclean(pte_wrprotect(entry));
   1110		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
   1111	}
   1112	*orig_pte = *pvmw.pte;
   1113	err = 0;
   1114
   1115out_unlock:
   1116	page_vma_mapped_walk_done(&pvmw);
   1117out_mn:
   1118	mmu_notifier_invalidate_range_end(&range);
   1119out:
   1120	return err;
   1121}
   1122
   1123/**
   1124 * replace_page - replace page in vma by new ksm page
   1125 * @vma:      vma that holds the pte pointing to page
   1126 * @page:     the page we are replacing by kpage
   1127 * @kpage:    the ksm page we replace page by
   1128 * @orig_pte: the original value of the pte
   1129 *
   1130 * Returns 0 on success, -EFAULT on failure.
   1131 */
   1132static int replace_page(struct vm_area_struct *vma, struct page *page,
   1133			struct page *kpage, pte_t orig_pte)
   1134{
   1135	struct mm_struct *mm = vma->vm_mm;
   1136	pmd_t *pmd;
   1137	pte_t *ptep;
   1138	pte_t newpte;
   1139	spinlock_t *ptl;
   1140	unsigned long addr;
   1141	int err = -EFAULT;
   1142	struct mmu_notifier_range range;
   1143
   1144	addr = page_address_in_vma(page, vma);
   1145	if (addr == -EFAULT)
   1146		goto out;
   1147
   1148	pmd = mm_find_pmd(mm, addr);
   1149	if (!pmd)
   1150		goto out;
   1151
   1152	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
   1153				addr + PAGE_SIZE);
   1154	mmu_notifier_invalidate_range_start(&range);
   1155
   1156	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
   1157	if (!pte_same(*ptep, orig_pte)) {
   1158		pte_unmap_unlock(ptep, ptl);
   1159		goto out_mn;
   1160	}
   1161	VM_BUG_ON_PAGE(PageAnonExclusive(page), page);
   1162	VM_BUG_ON_PAGE(PageAnon(kpage) && PageAnonExclusive(kpage), kpage);
   1163
   1164	/*
   1165	 * No need to check ksm_use_zero_pages here: we can only have a
   1166	 * zero_page here if ksm_use_zero_pages was enabled already.
   1167	 */
   1168	if (!is_zero_pfn(page_to_pfn(kpage))) {
   1169		get_page(kpage);
   1170		page_add_anon_rmap(kpage, vma, addr, RMAP_NONE);
   1171		newpte = mk_pte(kpage, vma->vm_page_prot);
   1172	} else {
   1173		newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
   1174					       vma->vm_page_prot));
   1175		/*
   1176		 * We're replacing an anonymous page with a zero page, which is
   1177		 * not anonymous. We need to do proper accounting otherwise we
   1178		 * will get wrong values in /proc, and a BUG message in dmesg
   1179		 * when tearing down the mm.
   1180		 */
   1181		dec_mm_counter(mm, MM_ANONPAGES);
   1182	}
   1183
   1184	flush_cache_page(vma, addr, pte_pfn(*ptep));
   1185	/*
   1186	 * No need to notify as we are replacing a read only page with another
   1187	 * read only page with the same content.
   1188	 *
   1189	 * See Documentation/vm/mmu_notifier.rst
   1190	 */
   1191	ptep_clear_flush(vma, addr, ptep);
   1192	set_pte_at_notify(mm, addr, ptep, newpte);
   1193
   1194	page_remove_rmap(page, vma, false);
   1195	if (!page_mapped(page))
   1196		try_to_free_swap(page);
   1197	put_page(page);
   1198
   1199	pte_unmap_unlock(ptep, ptl);
   1200	err = 0;
   1201out_mn:
   1202	mmu_notifier_invalidate_range_end(&range);
   1203out:
   1204	return err;
   1205}
   1206
   1207/*
   1208 * try_to_merge_one_page - take two pages and merge them into one
   1209 * @vma: the vma that holds the pte pointing to page
   1210 * @page: the PageAnon page that we want to replace with kpage
   1211 * @kpage: the PageKsm page that we want to map instead of page,
   1212 *         or NULL the first time when we want to use page as kpage.
   1213 *
   1214 * This function returns 0 if the pages were merged, -EFAULT otherwise.
   1215 */
   1216static int try_to_merge_one_page(struct vm_area_struct *vma,
   1217				 struct page *page, struct page *kpage)
   1218{
   1219	pte_t orig_pte = __pte(0);
   1220	int err = -EFAULT;
   1221
   1222	if (page == kpage)			/* ksm page forked */
   1223		return 0;
   1224
   1225	if (!PageAnon(page))
   1226		goto out;
   1227
   1228	/*
   1229	 * We need the page lock to read a stable PageSwapCache in
   1230	 * write_protect_page().  We use trylock_page() instead of
   1231	 * lock_page() because we don't want to wait here - we
   1232	 * prefer to continue scanning and merging different pages,
   1233	 * then come back to this page when it is unlocked.
   1234	 */
   1235	if (!trylock_page(page))
   1236		goto out;
   1237
   1238	if (PageTransCompound(page)) {
   1239		if (split_huge_page(page))
   1240			goto out_unlock;
   1241	}
   1242
   1243	/*
   1244	 * If this anonymous page is mapped only here, its pte may need
   1245	 * to be write-protected.  If it's mapped elsewhere, all of its
   1246	 * ptes are necessarily already write-protected.  But in either
   1247	 * case, we need to lock and check page_count is not raised.
   1248	 */
   1249	if (write_protect_page(vma, page, &orig_pte) == 0) {
   1250		if (!kpage) {
   1251			/*
   1252			 * While we hold page lock, upgrade page from
   1253			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
   1254			 * stable_tree_insert() will update stable_node.
   1255			 */
   1256			set_page_stable_node(page, NULL);
   1257			mark_page_accessed(page);
   1258			/*
   1259			 * Page reclaim just frees a clean page with no dirty
   1260			 * ptes: make sure that the ksm page would be swapped.
   1261			 */
   1262			if (!PageDirty(page))
   1263				SetPageDirty(page);
   1264			err = 0;
   1265		} else if (pages_identical(page, kpage))
   1266			err = replace_page(vma, page, kpage, orig_pte);
   1267	}
   1268
   1269out_unlock:
   1270	unlock_page(page);
   1271out:
   1272	return err;
   1273}
   1274
   1275/*
   1276 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
   1277 * but no new kernel page is allocated: kpage must already be a ksm page.
   1278 *
   1279 * This function returns 0 if the pages were merged, -EFAULT otherwise.
   1280 */
   1281static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
   1282				      struct page *page, struct page *kpage)
   1283{
   1284	struct mm_struct *mm = rmap_item->mm;
   1285	struct vm_area_struct *vma;
   1286	int err = -EFAULT;
   1287
   1288	mmap_read_lock(mm);
   1289	vma = find_mergeable_vma(mm, rmap_item->address);
   1290	if (!vma)
   1291		goto out;
   1292
   1293	err = try_to_merge_one_page(vma, page, kpage);
   1294	if (err)
   1295		goto out;
   1296
   1297	/* Unstable nid is in union with stable anon_vma: remove first */
   1298	remove_rmap_item_from_tree(rmap_item);
   1299
   1300	/* Must get reference to anon_vma while still holding mmap_lock */
   1301	rmap_item->anon_vma = vma->anon_vma;
   1302	get_anon_vma(vma->anon_vma);
   1303out:
   1304	mmap_read_unlock(mm);
   1305	return err;
   1306}
   1307
   1308/*
   1309 * try_to_merge_two_pages - take two identical pages and prepare them
   1310 * to be merged into one page.
   1311 *
   1312 * This function returns the kpage if we successfully merged two identical
   1313 * pages into one ksm page, NULL otherwise.
   1314 *
   1315 * Note that this function upgrades page to ksm page: if one of the pages
   1316 * is already a ksm page, try_to_merge_with_ksm_page should be used.
   1317 */
   1318static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
   1319					   struct page *page,
   1320					   struct rmap_item *tree_rmap_item,
   1321					   struct page *tree_page)
   1322{
   1323	int err;
   1324
   1325	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
   1326	if (!err) {
   1327		err = try_to_merge_with_ksm_page(tree_rmap_item,
   1328							tree_page, page);
   1329		/*
   1330		 * If that fails, we have a ksm page with only one pte
   1331		 * pointing to it: so break it.
   1332		 */
   1333		if (err)
   1334			break_cow(rmap_item);
   1335	}
   1336	return err ? NULL : page;
   1337}
   1338
   1339static __always_inline
   1340bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
   1341{
   1342	VM_BUG_ON(stable_node->rmap_hlist_len < 0);
   1343	/*
   1344	 * Check that at least one mapping still exists, otherwise
   1345	 * there's no much point to merge and share with this
   1346	 * stable_node, as the underlying tree_page of the other
   1347	 * sharer is going to be freed soon.
   1348	 */
   1349	return stable_node->rmap_hlist_len &&
   1350		stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
   1351}
   1352
   1353static __always_inline
   1354bool is_page_sharing_candidate(struct stable_node *stable_node)
   1355{
   1356	return __is_page_sharing_candidate(stable_node, 0);
   1357}
   1358
   1359static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
   1360				    struct stable_node **_stable_node,
   1361				    struct rb_root *root,
   1362				    bool prune_stale_stable_nodes)
   1363{
   1364	struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
   1365	struct hlist_node *hlist_safe;
   1366	struct page *_tree_page, *tree_page = NULL;
   1367	int nr = 0;
   1368	int found_rmap_hlist_len;
   1369
   1370	if (!prune_stale_stable_nodes ||
   1371	    time_before(jiffies, stable_node->chain_prune_time +
   1372			msecs_to_jiffies(
   1373				ksm_stable_node_chains_prune_millisecs)))
   1374		prune_stale_stable_nodes = false;
   1375	else
   1376		stable_node->chain_prune_time = jiffies;
   1377
   1378	hlist_for_each_entry_safe(dup, hlist_safe,
   1379				  &stable_node->hlist, hlist_dup) {
   1380		cond_resched();
   1381		/*
   1382		 * We must walk all stable_node_dup to prune the stale
   1383		 * stable nodes during lookup.
   1384		 *
   1385		 * get_ksm_page can drop the nodes from the
   1386		 * stable_node->hlist if they point to freed pages
   1387		 * (that's why we do a _safe walk). The "dup"
   1388		 * stable_node parameter itself will be freed from
   1389		 * under us if it returns NULL.
   1390		 */
   1391		_tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
   1392		if (!_tree_page)
   1393			continue;
   1394		nr += 1;
   1395		if (is_page_sharing_candidate(dup)) {
   1396			if (!found ||
   1397			    dup->rmap_hlist_len > found_rmap_hlist_len) {
   1398				if (found)
   1399					put_page(tree_page);
   1400				found = dup;
   1401				found_rmap_hlist_len = found->rmap_hlist_len;
   1402				tree_page = _tree_page;
   1403
   1404				/* skip put_page for found dup */
   1405				if (!prune_stale_stable_nodes)
   1406					break;
   1407				continue;
   1408			}
   1409		}
   1410		put_page(_tree_page);
   1411	}
   1412
   1413	if (found) {
   1414		/*
   1415		 * nr is counting all dups in the chain only if
   1416		 * prune_stale_stable_nodes is true, otherwise we may
   1417		 * break the loop at nr == 1 even if there are
   1418		 * multiple entries.
   1419		 */
   1420		if (prune_stale_stable_nodes && nr == 1) {
   1421			/*
   1422			 * If there's not just one entry it would
   1423			 * corrupt memory, better BUG_ON. In KSM
   1424			 * context with no lock held it's not even
   1425			 * fatal.
   1426			 */
   1427			BUG_ON(stable_node->hlist.first->next);
   1428
   1429			/*
   1430			 * There's just one entry and it is below the
   1431			 * deduplication limit so drop the chain.
   1432			 */
   1433			rb_replace_node(&stable_node->node, &found->node,
   1434					root);
   1435			free_stable_node(stable_node);
   1436			ksm_stable_node_chains--;
   1437			ksm_stable_node_dups--;
   1438			/*
   1439			 * NOTE: the caller depends on the stable_node
   1440			 * to be equal to stable_node_dup if the chain
   1441			 * was collapsed.
   1442			 */
   1443			*_stable_node = found;
   1444			/*
   1445			 * Just for robustness, as stable_node is
   1446			 * otherwise left as a stable pointer, the
   1447			 * compiler shall optimize it away at build
   1448			 * time.
   1449			 */
   1450			stable_node = NULL;
   1451		} else if (stable_node->hlist.first != &found->hlist_dup &&
   1452			   __is_page_sharing_candidate(found, 1)) {
   1453			/*
   1454			 * If the found stable_node dup can accept one
   1455			 * more future merge (in addition to the one
   1456			 * that is underway) and is not at the head of
   1457			 * the chain, put it there so next search will
   1458			 * be quicker in the !prune_stale_stable_nodes
   1459			 * case.
   1460			 *
   1461			 * NOTE: it would be inaccurate to use nr > 1
   1462			 * instead of checking the hlist.first pointer
   1463			 * directly, because in the
   1464			 * prune_stale_stable_nodes case "nr" isn't
   1465			 * the position of the found dup in the chain,
   1466			 * but the total number of dups in the chain.
   1467			 */
   1468			hlist_del(&found->hlist_dup);
   1469			hlist_add_head(&found->hlist_dup,
   1470				       &stable_node->hlist);
   1471		}
   1472	}
   1473
   1474	*_stable_node_dup = found;
   1475	return tree_page;
   1476}
   1477
   1478static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
   1479					       struct rb_root *root)
   1480{
   1481	if (!is_stable_node_chain(stable_node))
   1482		return stable_node;
   1483	if (hlist_empty(&stable_node->hlist)) {
   1484		free_stable_node_chain(stable_node, root);
   1485		return NULL;
   1486	}
   1487	return hlist_entry(stable_node->hlist.first,
   1488			   typeof(*stable_node), hlist_dup);
   1489}
   1490
   1491/*
   1492 * Like for get_ksm_page, this function can free the *_stable_node and
   1493 * *_stable_node_dup if the returned tree_page is NULL.
   1494 *
   1495 * It can also free and overwrite *_stable_node with the found
   1496 * stable_node_dup if the chain is collapsed (in which case
   1497 * *_stable_node will be equal to *_stable_node_dup like if the chain
   1498 * never existed). It's up to the caller to verify tree_page is not
   1499 * NULL before dereferencing *_stable_node or *_stable_node_dup.
   1500 *
   1501 * *_stable_node_dup is really a second output parameter of this
   1502 * function and will be overwritten in all cases, the caller doesn't
   1503 * need to initialize it.
   1504 */
   1505static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
   1506					struct stable_node **_stable_node,
   1507					struct rb_root *root,
   1508					bool prune_stale_stable_nodes)
   1509{
   1510	struct stable_node *stable_node = *_stable_node;
   1511	if (!is_stable_node_chain(stable_node)) {
   1512		if (is_page_sharing_candidate(stable_node)) {
   1513			*_stable_node_dup = stable_node;
   1514			return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
   1515		}
   1516		/*
   1517		 * _stable_node_dup set to NULL means the stable_node
   1518		 * reached the ksm_max_page_sharing limit.
   1519		 */
   1520		*_stable_node_dup = NULL;
   1521		return NULL;
   1522	}
   1523	return stable_node_dup(_stable_node_dup, _stable_node, root,
   1524			       prune_stale_stable_nodes);
   1525}
   1526
   1527static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
   1528						struct stable_node **s_n,
   1529						struct rb_root *root)
   1530{
   1531	return __stable_node_chain(s_n_d, s_n, root, true);
   1532}
   1533
   1534static __always_inline struct page *chain(struct stable_node **s_n_d,
   1535					  struct stable_node *s_n,
   1536					  struct rb_root *root)
   1537{
   1538	struct stable_node *old_stable_node = s_n;
   1539	struct page *tree_page;
   1540
   1541	tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
   1542	/* not pruning dups so s_n cannot have changed */
   1543	VM_BUG_ON(s_n != old_stable_node);
   1544	return tree_page;
   1545}
   1546
   1547/*
   1548 * stable_tree_search - search for page inside the stable tree
   1549 *
   1550 * This function checks if there is a page inside the stable tree
   1551 * with identical content to the page that we are scanning right now.
   1552 *
   1553 * This function returns the stable tree node of identical content if found,
   1554 * NULL otherwise.
   1555 */
   1556static struct page *stable_tree_search(struct page *page)
   1557{
   1558	int nid;
   1559	struct rb_root *root;
   1560	struct rb_node **new;
   1561	struct rb_node *parent;
   1562	struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
   1563	struct stable_node *page_node;
   1564
   1565	page_node = page_stable_node(page);
   1566	if (page_node && page_node->head != &migrate_nodes) {
   1567		/* ksm page forked */
   1568		get_page(page);
   1569		return page;
   1570	}
   1571
   1572	nid = get_kpfn_nid(page_to_pfn(page));
   1573	root = root_stable_tree + nid;
   1574again:
   1575	new = &root->rb_node;
   1576	parent = NULL;
   1577
   1578	while (*new) {
   1579		struct page *tree_page;
   1580		int ret;
   1581
   1582		cond_resched();
   1583		stable_node = rb_entry(*new, struct stable_node, node);
   1584		stable_node_any = NULL;
   1585		tree_page = chain_prune(&stable_node_dup, &stable_node,	root);
   1586		/*
   1587		 * NOTE: stable_node may have been freed by
   1588		 * chain_prune() if the returned stable_node_dup is
   1589		 * not NULL. stable_node_dup may have been inserted in
   1590		 * the rbtree instead as a regular stable_node (in
   1591		 * order to collapse the stable_node chain if a single
   1592		 * stable_node dup was found in it). In such case the
   1593		 * stable_node is overwritten by the callee to point
   1594		 * to the stable_node_dup that was collapsed in the
   1595		 * stable rbtree and stable_node will be equal to
   1596		 * stable_node_dup like if the chain never existed.
   1597		 */
   1598		if (!stable_node_dup) {
   1599			/*
   1600			 * Either all stable_node dups were full in
   1601			 * this stable_node chain, or this chain was
   1602			 * empty and should be rb_erased.
   1603			 */
   1604			stable_node_any = stable_node_dup_any(stable_node,
   1605							      root);
   1606			if (!stable_node_any) {
   1607				/* rb_erase just run */
   1608				goto again;
   1609			}
   1610			/*
   1611			 * Take any of the stable_node dups page of
   1612			 * this stable_node chain to let the tree walk
   1613			 * continue. All KSM pages belonging to the
   1614			 * stable_node dups in a stable_node chain
   1615			 * have the same content and they're
   1616			 * write protected at all times. Any will work
   1617			 * fine to continue the walk.
   1618			 */
   1619			tree_page = get_ksm_page(stable_node_any,
   1620						 GET_KSM_PAGE_NOLOCK);
   1621		}
   1622		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
   1623		if (!tree_page) {
   1624			/*
   1625			 * If we walked over a stale stable_node,
   1626			 * get_ksm_page() will call rb_erase() and it
   1627			 * may rebalance the tree from under us. So
   1628			 * restart the search from scratch. Returning
   1629			 * NULL would be safe too, but we'd generate
   1630			 * false negative insertions just because some
   1631			 * stable_node was stale.
   1632			 */
   1633			goto again;
   1634		}
   1635
   1636		ret = memcmp_pages(page, tree_page);
   1637		put_page(tree_page);
   1638
   1639		parent = *new;
   1640		if (ret < 0)
   1641			new = &parent->rb_left;
   1642		else if (ret > 0)
   1643			new = &parent->rb_right;
   1644		else {
   1645			if (page_node) {
   1646				VM_BUG_ON(page_node->head != &migrate_nodes);
   1647				/*
   1648				 * Test if the migrated page should be merged
   1649				 * into a stable node dup. If the mapcount is
   1650				 * 1 we can migrate it with another KSM page
   1651				 * without adding it to the chain.
   1652				 */
   1653				if (page_mapcount(page) > 1)
   1654					goto chain_append;
   1655			}
   1656
   1657			if (!stable_node_dup) {
   1658				/*
   1659				 * If the stable_node is a chain and
   1660				 * we got a payload match in memcmp
   1661				 * but we cannot merge the scanned
   1662				 * page in any of the existing
   1663				 * stable_node dups because they're
   1664				 * all full, we need to wait the
   1665				 * scanned page to find itself a match
   1666				 * in the unstable tree to create a
   1667				 * brand new KSM page to add later to
   1668				 * the dups of this stable_node.
   1669				 */
   1670				return NULL;
   1671			}
   1672
   1673			/*
   1674			 * Lock and unlock the stable_node's page (which
   1675			 * might already have been migrated) so that page
   1676			 * migration is sure to notice its raised count.
   1677			 * It would be more elegant to return stable_node
   1678			 * than kpage, but that involves more changes.
   1679			 */
   1680			tree_page = get_ksm_page(stable_node_dup,
   1681						 GET_KSM_PAGE_TRYLOCK);
   1682
   1683			if (PTR_ERR(tree_page) == -EBUSY)
   1684				return ERR_PTR(-EBUSY);
   1685
   1686			if (unlikely(!tree_page))
   1687				/*
   1688				 * The tree may have been rebalanced,
   1689				 * so re-evaluate parent and new.
   1690				 */
   1691				goto again;
   1692			unlock_page(tree_page);
   1693
   1694			if (get_kpfn_nid(stable_node_dup->kpfn) !=
   1695			    NUMA(stable_node_dup->nid)) {
   1696				put_page(tree_page);
   1697				goto replace;
   1698			}
   1699			return tree_page;
   1700		}
   1701	}
   1702
   1703	if (!page_node)
   1704		return NULL;
   1705
   1706	list_del(&page_node->list);
   1707	DO_NUMA(page_node->nid = nid);
   1708	rb_link_node(&page_node->node, parent, new);
   1709	rb_insert_color(&page_node->node, root);
   1710out:
   1711	if (is_page_sharing_candidate(page_node)) {
   1712		get_page(page);
   1713		return page;
   1714	} else
   1715		return NULL;
   1716
   1717replace:
   1718	/*
   1719	 * If stable_node was a chain and chain_prune collapsed it,
   1720	 * stable_node has been updated to be the new regular
   1721	 * stable_node. A collapse of the chain is indistinguishable
   1722	 * from the case there was no chain in the stable
   1723	 * rbtree. Otherwise stable_node is the chain and
   1724	 * stable_node_dup is the dup to replace.
   1725	 */
   1726	if (stable_node_dup == stable_node) {
   1727		VM_BUG_ON(is_stable_node_chain(stable_node_dup));
   1728		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
   1729		/* there is no chain */
   1730		if (page_node) {
   1731			VM_BUG_ON(page_node->head != &migrate_nodes);
   1732			list_del(&page_node->list);
   1733			DO_NUMA(page_node->nid = nid);
   1734			rb_replace_node(&stable_node_dup->node,
   1735					&page_node->node,
   1736					root);
   1737			if (is_page_sharing_candidate(page_node))
   1738				get_page(page);
   1739			else
   1740				page = NULL;
   1741		} else {
   1742			rb_erase(&stable_node_dup->node, root);
   1743			page = NULL;
   1744		}
   1745	} else {
   1746		VM_BUG_ON(!is_stable_node_chain(stable_node));
   1747		__stable_node_dup_del(stable_node_dup);
   1748		if (page_node) {
   1749			VM_BUG_ON(page_node->head != &migrate_nodes);
   1750			list_del(&page_node->list);
   1751			DO_NUMA(page_node->nid = nid);
   1752			stable_node_chain_add_dup(page_node, stable_node);
   1753			if (is_page_sharing_candidate(page_node))
   1754				get_page(page);
   1755			else
   1756				page = NULL;
   1757		} else {
   1758			page = NULL;
   1759		}
   1760	}
   1761	stable_node_dup->head = &migrate_nodes;
   1762	list_add(&stable_node_dup->list, stable_node_dup->head);
   1763	return page;
   1764
   1765chain_append:
   1766	/* stable_node_dup could be null if it reached the limit */
   1767	if (!stable_node_dup)
   1768		stable_node_dup = stable_node_any;
   1769	/*
   1770	 * If stable_node was a chain and chain_prune collapsed it,
   1771	 * stable_node has been updated to be the new regular
   1772	 * stable_node. A collapse of the chain is indistinguishable
   1773	 * from the case there was no chain in the stable
   1774	 * rbtree. Otherwise stable_node is the chain and
   1775	 * stable_node_dup is the dup to replace.
   1776	 */
   1777	if (stable_node_dup == stable_node) {
   1778		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
   1779		/* chain is missing so create it */
   1780		stable_node = alloc_stable_node_chain(stable_node_dup,
   1781						      root);
   1782		if (!stable_node)
   1783			return NULL;
   1784	}
   1785	/*
   1786	 * Add this stable_node dup that was
   1787	 * migrated to the stable_node chain
   1788	 * of the current nid for this page
   1789	 * content.
   1790	 */
   1791	VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
   1792	VM_BUG_ON(page_node->head != &migrate_nodes);
   1793	list_del(&page_node->list);
   1794	DO_NUMA(page_node->nid = nid);
   1795	stable_node_chain_add_dup(page_node, stable_node);
   1796	goto out;
   1797}
   1798
   1799/*
   1800 * stable_tree_insert - insert stable tree node pointing to new ksm page
   1801 * into the stable tree.
   1802 *
   1803 * This function returns the stable tree node just allocated on success,
   1804 * NULL otherwise.
   1805 */
   1806static struct stable_node *stable_tree_insert(struct page *kpage)
   1807{
   1808	int nid;
   1809	unsigned long kpfn;
   1810	struct rb_root *root;
   1811	struct rb_node **new;
   1812	struct rb_node *parent;
   1813	struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
   1814	bool need_chain = false;
   1815
   1816	kpfn = page_to_pfn(kpage);
   1817	nid = get_kpfn_nid(kpfn);
   1818	root = root_stable_tree + nid;
   1819again:
   1820	parent = NULL;
   1821	new = &root->rb_node;
   1822
   1823	while (*new) {
   1824		struct page *tree_page;
   1825		int ret;
   1826
   1827		cond_resched();
   1828		stable_node = rb_entry(*new, struct stable_node, node);
   1829		stable_node_any = NULL;
   1830		tree_page = chain(&stable_node_dup, stable_node, root);
   1831		if (!stable_node_dup) {
   1832			/*
   1833			 * Either all stable_node dups were full in
   1834			 * this stable_node chain, or this chain was
   1835			 * empty and should be rb_erased.
   1836			 */
   1837			stable_node_any = stable_node_dup_any(stable_node,
   1838							      root);
   1839			if (!stable_node_any) {
   1840				/* rb_erase just run */
   1841				goto again;
   1842			}
   1843			/*
   1844			 * Take any of the stable_node dups page of
   1845			 * this stable_node chain to let the tree walk
   1846			 * continue. All KSM pages belonging to the
   1847			 * stable_node dups in a stable_node chain
   1848			 * have the same content and they're
   1849			 * write protected at all times. Any will work
   1850			 * fine to continue the walk.
   1851			 */
   1852			tree_page = get_ksm_page(stable_node_any,
   1853						 GET_KSM_PAGE_NOLOCK);
   1854		}
   1855		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
   1856		if (!tree_page) {
   1857			/*
   1858			 * If we walked over a stale stable_node,
   1859			 * get_ksm_page() will call rb_erase() and it
   1860			 * may rebalance the tree from under us. So
   1861			 * restart the search from scratch. Returning
   1862			 * NULL would be safe too, but we'd generate
   1863			 * false negative insertions just because some
   1864			 * stable_node was stale.
   1865			 */
   1866			goto again;
   1867		}
   1868
   1869		ret = memcmp_pages(kpage, tree_page);
   1870		put_page(tree_page);
   1871
   1872		parent = *new;
   1873		if (ret < 0)
   1874			new = &parent->rb_left;
   1875		else if (ret > 0)
   1876			new = &parent->rb_right;
   1877		else {
   1878			need_chain = true;
   1879			break;
   1880		}
   1881	}
   1882
   1883	stable_node_dup = alloc_stable_node();
   1884	if (!stable_node_dup)
   1885		return NULL;
   1886
   1887	INIT_HLIST_HEAD(&stable_node_dup->hlist);
   1888	stable_node_dup->kpfn = kpfn;
   1889	set_page_stable_node(kpage, stable_node_dup);
   1890	stable_node_dup->rmap_hlist_len = 0;
   1891	DO_NUMA(stable_node_dup->nid = nid);
   1892	if (!need_chain) {
   1893		rb_link_node(&stable_node_dup->node, parent, new);
   1894		rb_insert_color(&stable_node_dup->node, root);
   1895	} else {
   1896		if (!is_stable_node_chain(stable_node)) {
   1897			struct stable_node *orig = stable_node;
   1898			/* chain is missing so create it */
   1899			stable_node = alloc_stable_node_chain(orig, root);
   1900			if (!stable_node) {
   1901				free_stable_node(stable_node_dup);
   1902				return NULL;
   1903			}
   1904		}
   1905		stable_node_chain_add_dup(stable_node_dup, stable_node);
   1906	}
   1907
   1908	return stable_node_dup;
   1909}
   1910
   1911/*
   1912 * unstable_tree_search_insert - search for identical page,
   1913 * else insert rmap_item into the unstable tree.
   1914 *
   1915 * This function searches for a page in the unstable tree identical to the
   1916 * page currently being scanned; and if no identical page is found in the
   1917 * tree, we insert rmap_item as a new object into the unstable tree.
   1918 *
   1919 * This function returns pointer to rmap_item found to be identical
   1920 * to the currently scanned page, NULL otherwise.
   1921 *
   1922 * This function does both searching and inserting, because they share
   1923 * the same walking algorithm in an rbtree.
   1924 */
   1925static
   1926struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
   1927					      struct page *page,
   1928					      struct page **tree_pagep)
   1929{
   1930	struct rb_node **new;
   1931	struct rb_root *root;
   1932	struct rb_node *parent = NULL;
   1933	int nid;
   1934
   1935	nid = get_kpfn_nid(page_to_pfn(page));
   1936	root = root_unstable_tree + nid;
   1937	new = &root->rb_node;
   1938
   1939	while (*new) {
   1940		struct rmap_item *tree_rmap_item;
   1941		struct page *tree_page;
   1942		int ret;
   1943
   1944		cond_resched();
   1945		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
   1946		tree_page = get_mergeable_page(tree_rmap_item);
   1947		if (!tree_page)
   1948			return NULL;
   1949
   1950		/*
   1951		 * Don't substitute a ksm page for a forked page.
   1952		 */
   1953		if (page == tree_page) {
   1954			put_page(tree_page);
   1955			return NULL;
   1956		}
   1957
   1958		ret = memcmp_pages(page, tree_page);
   1959
   1960		parent = *new;
   1961		if (ret < 0) {
   1962			put_page(tree_page);
   1963			new = &parent->rb_left;
   1964		} else if (ret > 0) {
   1965			put_page(tree_page);
   1966			new = &parent->rb_right;
   1967		} else if (!ksm_merge_across_nodes &&
   1968			   page_to_nid(tree_page) != nid) {
   1969			/*
   1970			 * If tree_page has been migrated to another NUMA node,
   1971			 * it will be flushed out and put in the right unstable
   1972			 * tree next time: only merge with it when across_nodes.
   1973			 */
   1974			put_page(tree_page);
   1975			return NULL;
   1976		} else {
   1977			*tree_pagep = tree_page;
   1978			return tree_rmap_item;
   1979		}
   1980	}
   1981
   1982	rmap_item->address |= UNSTABLE_FLAG;
   1983	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
   1984	DO_NUMA(rmap_item->nid = nid);
   1985	rb_link_node(&rmap_item->node, parent, new);
   1986	rb_insert_color(&rmap_item->node, root);
   1987
   1988	ksm_pages_unshared++;
   1989	return NULL;
   1990}
   1991
   1992/*
   1993 * stable_tree_append - add another rmap_item to the linked list of
   1994 * rmap_items hanging off a given node of the stable tree, all sharing
   1995 * the same ksm page.
   1996 */
   1997static void stable_tree_append(struct rmap_item *rmap_item,
   1998			       struct stable_node *stable_node,
   1999			       bool max_page_sharing_bypass)
   2000{
   2001	/*
   2002	 * rmap won't find this mapping if we don't insert the
   2003	 * rmap_item in the right stable_node
   2004	 * duplicate. page_migration could break later if rmap breaks,
   2005	 * so we can as well crash here. We really need to check for
   2006	 * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
   2007	 * for other negative values as an underflow if detected here
   2008	 * for the first time (and not when decreasing rmap_hlist_len)
   2009	 * would be sign of memory corruption in the stable_node.
   2010	 */
   2011	BUG_ON(stable_node->rmap_hlist_len < 0);
   2012
   2013	stable_node->rmap_hlist_len++;
   2014	if (!max_page_sharing_bypass)
   2015		/* possibly non fatal but unexpected overflow, only warn */
   2016		WARN_ON_ONCE(stable_node->rmap_hlist_len >
   2017			     ksm_max_page_sharing);
   2018
   2019	rmap_item->head = stable_node;
   2020	rmap_item->address |= STABLE_FLAG;
   2021	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
   2022
   2023	if (rmap_item->hlist.next)
   2024		ksm_pages_sharing++;
   2025	else
   2026		ksm_pages_shared++;
   2027
   2028	rmap_item->mm->ksm_merging_pages++;
   2029}
   2030
   2031/*
   2032 * cmp_and_merge_page - first see if page can be merged into the stable tree;
   2033 * if not, compare checksum to previous and if it's the same, see if page can
   2034 * be inserted into the unstable tree, or merged with a page already there and
   2035 * both transferred to the stable tree.
   2036 *
   2037 * @page: the page that we are searching identical page to.
   2038 * @rmap_item: the reverse mapping into the virtual address of this page
   2039 */
   2040static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
   2041{
   2042	struct mm_struct *mm = rmap_item->mm;
   2043	struct rmap_item *tree_rmap_item;
   2044	struct page *tree_page = NULL;
   2045	struct stable_node *stable_node;
   2046	struct page *kpage;
   2047	unsigned int checksum;
   2048	int err;
   2049	bool max_page_sharing_bypass = false;
   2050
   2051	stable_node = page_stable_node(page);
   2052	if (stable_node) {
   2053		if (stable_node->head != &migrate_nodes &&
   2054		    get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
   2055		    NUMA(stable_node->nid)) {
   2056			stable_node_dup_del(stable_node);
   2057			stable_node->head = &migrate_nodes;
   2058			list_add(&stable_node->list, stable_node->head);
   2059		}
   2060		if (stable_node->head != &migrate_nodes &&
   2061		    rmap_item->head == stable_node)
   2062			return;
   2063		/*
   2064		 * If it's a KSM fork, allow it to go over the sharing limit
   2065		 * without warnings.
   2066		 */
   2067		if (!is_page_sharing_candidate(stable_node))
   2068			max_page_sharing_bypass = true;
   2069	}
   2070
   2071	/* We first start with searching the page inside the stable tree */
   2072	kpage = stable_tree_search(page);
   2073	if (kpage == page && rmap_item->head == stable_node) {
   2074		put_page(kpage);
   2075		return;
   2076	}
   2077
   2078	remove_rmap_item_from_tree(rmap_item);
   2079
   2080	if (kpage) {
   2081		if (PTR_ERR(kpage) == -EBUSY)
   2082			return;
   2083
   2084		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
   2085		if (!err) {
   2086			/*
   2087			 * The page was successfully merged:
   2088			 * add its rmap_item to the stable tree.
   2089			 */
   2090			lock_page(kpage);
   2091			stable_tree_append(rmap_item, page_stable_node(kpage),
   2092					   max_page_sharing_bypass);
   2093			unlock_page(kpage);
   2094		}
   2095		put_page(kpage);
   2096		return;
   2097	}
   2098
   2099	/*
   2100	 * If the hash value of the page has changed from the last time
   2101	 * we calculated it, this page is changing frequently: therefore we
   2102	 * don't want to insert it in the unstable tree, and we don't want
   2103	 * to waste our time searching for something identical to it there.
   2104	 */
   2105	checksum = calc_checksum(page);
   2106	if (rmap_item->oldchecksum != checksum) {
   2107		rmap_item->oldchecksum = checksum;
   2108		return;
   2109	}
   2110
   2111	/*
   2112	 * Same checksum as an empty page. We attempt to merge it with the
   2113	 * appropriate zero page if the user enabled this via sysfs.
   2114	 */
   2115	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
   2116		struct vm_area_struct *vma;
   2117
   2118		mmap_read_lock(mm);
   2119		vma = find_mergeable_vma(mm, rmap_item->address);
   2120		if (vma) {
   2121			err = try_to_merge_one_page(vma, page,
   2122					ZERO_PAGE(rmap_item->address));
   2123		} else {
   2124			/*
   2125			 * If the vma is out of date, we do not need to
   2126			 * continue.
   2127			 */
   2128			err = 0;
   2129		}
   2130		mmap_read_unlock(mm);
   2131		/*
   2132		 * In case of failure, the page was not really empty, so we
   2133		 * need to continue. Otherwise we're done.
   2134		 */
   2135		if (!err)
   2136			return;
   2137	}
   2138	tree_rmap_item =
   2139		unstable_tree_search_insert(rmap_item, page, &tree_page);
   2140	if (tree_rmap_item) {
   2141		bool split;
   2142
   2143		kpage = try_to_merge_two_pages(rmap_item, page,
   2144						tree_rmap_item, tree_page);
   2145		/*
   2146		 * If both pages we tried to merge belong to the same compound
   2147		 * page, then we actually ended up increasing the reference
   2148		 * count of the same compound page twice, and split_huge_page
   2149		 * failed.
   2150		 * Here we set a flag if that happened, and we use it later to
   2151		 * try split_huge_page again. Since we call put_page right
   2152		 * afterwards, the reference count will be correct and
   2153		 * split_huge_page should succeed.
   2154		 */
   2155		split = PageTransCompound(page)
   2156			&& compound_head(page) == compound_head(tree_page);
   2157		put_page(tree_page);
   2158		if (kpage) {
   2159			/*
   2160			 * The pages were successfully merged: insert new
   2161			 * node in the stable tree and add both rmap_items.
   2162			 */
   2163			lock_page(kpage);
   2164			stable_node = stable_tree_insert(kpage);
   2165			if (stable_node) {
   2166				stable_tree_append(tree_rmap_item, stable_node,
   2167						   false);
   2168				stable_tree_append(rmap_item, stable_node,
   2169						   false);
   2170			}
   2171			unlock_page(kpage);
   2172
   2173			/*
   2174			 * If we fail to insert the page into the stable tree,
   2175			 * we will have 2 virtual addresses that are pointing
   2176			 * to a ksm page left outside the stable tree,
   2177			 * in which case we need to break_cow on both.
   2178			 */
   2179			if (!stable_node) {
   2180				break_cow(tree_rmap_item);
   2181				break_cow(rmap_item);
   2182			}
   2183		} else if (split) {
   2184			/*
   2185			 * We are here if we tried to merge two pages and
   2186			 * failed because they both belonged to the same
   2187			 * compound page. We will split the page now, but no
   2188			 * merging will take place.
   2189			 * We do not want to add the cost of a full lock; if
   2190			 * the page is locked, it is better to skip it and
   2191			 * perhaps try again later.
   2192			 */
   2193			if (!trylock_page(page))
   2194				return;
   2195			split_huge_page(page);
   2196			unlock_page(page);
   2197		}
   2198	}
   2199}
   2200
   2201static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
   2202					    struct rmap_item **rmap_list,
   2203					    unsigned long addr)
   2204{
   2205	struct rmap_item *rmap_item;
   2206
   2207	while (*rmap_list) {
   2208		rmap_item = *rmap_list;
   2209		if ((rmap_item->address & PAGE_MASK) == addr)
   2210			return rmap_item;
   2211		if (rmap_item->address > addr)
   2212			break;
   2213		*rmap_list = rmap_item->rmap_list;
   2214		remove_rmap_item_from_tree(rmap_item);
   2215		free_rmap_item(rmap_item);
   2216	}
   2217
   2218	rmap_item = alloc_rmap_item();
   2219	if (rmap_item) {
   2220		/* It has already been zeroed */
   2221		rmap_item->mm = mm_slot->mm;
   2222		rmap_item->address = addr;
   2223		rmap_item->rmap_list = *rmap_list;
   2224		*rmap_list = rmap_item;
   2225	}
   2226	return rmap_item;
   2227}
   2228
   2229static struct rmap_item *scan_get_next_rmap_item(struct page **page)
   2230{
   2231	struct mm_struct *mm;
   2232	struct mm_slot *slot;
   2233	struct vm_area_struct *vma;
   2234	struct rmap_item *rmap_item;
   2235	int nid;
   2236
   2237	if (list_empty(&ksm_mm_head.mm_list))
   2238		return NULL;
   2239
   2240	slot = ksm_scan.mm_slot;
   2241	if (slot == &ksm_mm_head) {
   2242		/*
   2243		 * A number of pages can hang around indefinitely on per-cpu
   2244		 * pagevecs, raised page count preventing write_protect_page
   2245		 * from merging them.  Though it doesn't really matter much,
   2246		 * it is puzzling to see some stuck in pages_volatile until
   2247		 * other activity jostles them out, and they also prevented
   2248		 * LTP's KSM test from succeeding deterministically; so drain
   2249		 * them here (here rather than on entry to ksm_do_scan(),
   2250		 * so we don't IPI too often when pages_to_scan is set low).
   2251		 */
   2252		lru_add_drain_all();
   2253
   2254		/*
   2255		 * Whereas stale stable_nodes on the stable_tree itself
   2256		 * get pruned in the regular course of stable_tree_search(),
   2257		 * those moved out to the migrate_nodes list can accumulate:
   2258		 * so prune them once before each full scan.
   2259		 */
   2260		if (!ksm_merge_across_nodes) {
   2261			struct stable_node *stable_node, *next;
   2262			struct page *page;
   2263
   2264			list_for_each_entry_safe(stable_node, next,
   2265						 &migrate_nodes, list) {
   2266				page = get_ksm_page(stable_node,
   2267						    GET_KSM_PAGE_NOLOCK);
   2268				if (page)
   2269					put_page(page);
   2270				cond_resched();
   2271			}
   2272		}
   2273
   2274		for (nid = 0; nid < ksm_nr_node_ids; nid++)
   2275			root_unstable_tree[nid] = RB_ROOT;
   2276
   2277		spin_lock(&ksm_mmlist_lock);
   2278		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
   2279		ksm_scan.mm_slot = slot;
   2280		spin_unlock(&ksm_mmlist_lock);
   2281		/*
   2282		 * Although we tested list_empty() above, a racing __ksm_exit
   2283		 * of the last mm on the list may have removed it since then.
   2284		 */
   2285		if (slot == &ksm_mm_head)
   2286			return NULL;
   2287next_mm:
   2288		ksm_scan.address = 0;
   2289		ksm_scan.rmap_list = &slot->rmap_list;
   2290	}
   2291
   2292	mm = slot->mm;
   2293	mmap_read_lock(mm);
   2294	if (ksm_test_exit(mm))
   2295		vma = NULL;
   2296	else
   2297		vma = find_vma(mm, ksm_scan.address);
   2298
   2299	for (; vma; vma = vma->vm_next) {
   2300		if (!(vma->vm_flags & VM_MERGEABLE))
   2301			continue;
   2302		if (ksm_scan.address < vma->vm_start)
   2303			ksm_scan.address = vma->vm_start;
   2304		if (!vma->anon_vma)
   2305			ksm_scan.address = vma->vm_end;
   2306
   2307		while (ksm_scan.address < vma->vm_end) {
   2308			if (ksm_test_exit(mm))
   2309				break;
   2310			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
   2311			if (IS_ERR_OR_NULL(*page)) {
   2312				ksm_scan.address += PAGE_SIZE;
   2313				cond_resched();
   2314				continue;
   2315			}
   2316			if (PageAnon(*page)) {
   2317				flush_anon_page(vma, *page, ksm_scan.address);
   2318				flush_dcache_page(*page);
   2319				rmap_item = get_next_rmap_item(slot,
   2320					ksm_scan.rmap_list, ksm_scan.address);
   2321				if (rmap_item) {
   2322					ksm_scan.rmap_list =
   2323							&rmap_item->rmap_list;
   2324					ksm_scan.address += PAGE_SIZE;
   2325				} else
   2326					put_page(*page);
   2327				mmap_read_unlock(mm);
   2328				return rmap_item;
   2329			}
   2330			put_page(*page);
   2331			ksm_scan.address += PAGE_SIZE;
   2332			cond_resched();
   2333		}
   2334	}
   2335
   2336	if (ksm_test_exit(mm)) {
   2337		ksm_scan.address = 0;
   2338		ksm_scan.rmap_list = &slot->rmap_list;
   2339	}
   2340	/*
   2341	 * Nuke all the rmap_items that are above this current rmap:
   2342	 * because there were no VM_MERGEABLE vmas with such addresses.
   2343	 */
   2344	remove_trailing_rmap_items(ksm_scan.rmap_list);
   2345
   2346	spin_lock(&ksm_mmlist_lock);
   2347	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
   2348						struct mm_slot, mm_list);
   2349	if (ksm_scan.address == 0) {
   2350		/*
   2351		 * We've completed a full scan of all vmas, holding mmap_lock
   2352		 * throughout, and found no VM_MERGEABLE: so do the same as
   2353		 * __ksm_exit does to remove this mm from all our lists now.
   2354		 * This applies either when cleaning up after __ksm_exit
   2355		 * (but beware: we can reach here even before __ksm_exit),
   2356		 * or when all VM_MERGEABLE areas have been unmapped (and
   2357		 * mmap_lock then protects against race with MADV_MERGEABLE).
   2358		 */
   2359		hash_del(&slot->link);
   2360		list_del(&slot->mm_list);
   2361		spin_unlock(&ksm_mmlist_lock);
   2362
   2363		free_mm_slot(slot);
   2364		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
   2365		mmap_read_unlock(mm);
   2366		mmdrop(mm);
   2367	} else {
   2368		mmap_read_unlock(mm);
   2369		/*
   2370		 * mmap_read_unlock(mm) first because after
   2371		 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
   2372		 * already have been freed under us by __ksm_exit()
   2373		 * because the "mm_slot" is still hashed and
   2374		 * ksm_scan.mm_slot doesn't point to it anymore.
   2375		 */
   2376		spin_unlock(&ksm_mmlist_lock);
   2377	}
   2378
   2379	/* Repeat until we've completed scanning the whole list */
   2380	slot = ksm_scan.mm_slot;
   2381	if (slot != &ksm_mm_head)
   2382		goto next_mm;
   2383
   2384	ksm_scan.seqnr++;
   2385	return NULL;
   2386}
   2387
   2388/**
   2389 * ksm_do_scan  - the ksm scanner main worker function.
   2390 * @scan_npages:  number of pages we want to scan before we return.
   2391 */
   2392static void ksm_do_scan(unsigned int scan_npages)
   2393{
   2394	struct rmap_item *rmap_item;
   2395	struct page *page;
   2396
   2397	while (scan_npages-- && likely(!freezing(current))) {
   2398		cond_resched();
   2399		rmap_item = scan_get_next_rmap_item(&page);
   2400		if (!rmap_item)
   2401			return;
   2402		cmp_and_merge_page(page, rmap_item);
   2403		put_page(page);
   2404	}
   2405}
   2406
   2407static int ksmd_should_run(void)
   2408{
   2409	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
   2410}
   2411
   2412static int ksm_scan_thread(void *nothing)
   2413{
   2414	unsigned int sleep_ms;
   2415
   2416	set_freezable();
   2417	set_user_nice(current, 5);
   2418
   2419	while (!kthread_should_stop()) {
   2420		mutex_lock(&ksm_thread_mutex);
   2421		wait_while_offlining();
   2422		if (ksmd_should_run())
   2423			ksm_do_scan(ksm_thread_pages_to_scan);
   2424		mutex_unlock(&ksm_thread_mutex);
   2425
   2426		try_to_freeze();
   2427
   2428		if (ksmd_should_run()) {
   2429			sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
   2430			wait_event_interruptible_timeout(ksm_iter_wait,
   2431				sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
   2432				msecs_to_jiffies(sleep_ms));
   2433		} else {
   2434			wait_event_freezable(ksm_thread_wait,
   2435				ksmd_should_run() || kthread_should_stop());
   2436		}
   2437	}
   2438	return 0;
   2439}
   2440
   2441int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
   2442		unsigned long end, int advice, unsigned long *vm_flags)
   2443{
   2444	struct mm_struct *mm = vma->vm_mm;
   2445	int err;
   2446
   2447	switch (advice) {
   2448	case MADV_MERGEABLE:
   2449		/*
   2450		 * Be somewhat over-protective for now!
   2451		 */
   2452		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
   2453				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
   2454				 VM_HUGETLB | VM_MIXEDMAP))
   2455			return 0;		/* just ignore the advice */
   2456
   2457		if (vma_is_dax(vma))
   2458			return 0;
   2459
   2460#ifdef VM_SAO
   2461		if (*vm_flags & VM_SAO)
   2462			return 0;
   2463#endif
   2464#ifdef VM_SPARC_ADI
   2465		if (*vm_flags & VM_SPARC_ADI)
   2466			return 0;
   2467#endif
   2468
   2469		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
   2470			err = __ksm_enter(mm);
   2471			if (err)
   2472				return err;
   2473		}
   2474
   2475		*vm_flags |= VM_MERGEABLE;
   2476		break;
   2477
   2478	case MADV_UNMERGEABLE:
   2479		if (!(*vm_flags & VM_MERGEABLE))
   2480			return 0;		/* just ignore the advice */
   2481
   2482		if (vma->anon_vma) {
   2483			err = unmerge_ksm_pages(vma, start, end);
   2484			if (err)
   2485				return err;
   2486		}
   2487
   2488		*vm_flags &= ~VM_MERGEABLE;
   2489		break;
   2490	}
   2491
   2492	return 0;
   2493}
   2494EXPORT_SYMBOL_GPL(ksm_madvise);
   2495
   2496int __ksm_enter(struct mm_struct *mm)
   2497{
   2498	struct mm_slot *mm_slot;
   2499	int needs_wakeup;
   2500
   2501	mm_slot = alloc_mm_slot();
   2502	if (!mm_slot)
   2503		return -ENOMEM;
   2504
   2505	/* Check ksm_run too?  Would need tighter locking */
   2506	needs_wakeup = list_empty(&ksm_mm_head.mm_list);
   2507
   2508	spin_lock(&ksm_mmlist_lock);
   2509	insert_to_mm_slots_hash(mm, mm_slot);
   2510	/*
   2511	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
   2512	 * insert just behind the scanning cursor, to let the area settle
   2513	 * down a little; when fork is followed by immediate exec, we don't
   2514	 * want ksmd to waste time setting up and tearing down an rmap_list.
   2515	 *
   2516	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
   2517	 * scanning cursor, otherwise KSM pages in newly forked mms will be
   2518	 * missed: then we might as well insert at the end of the list.
   2519	 */
   2520	if (ksm_run & KSM_RUN_UNMERGE)
   2521		list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
   2522	else
   2523		list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
   2524	spin_unlock(&ksm_mmlist_lock);
   2525
   2526	set_bit(MMF_VM_MERGEABLE, &mm->flags);
   2527	mmgrab(mm);
   2528
   2529	if (needs_wakeup)
   2530		wake_up_interruptible(&ksm_thread_wait);
   2531
   2532	return 0;
   2533}
   2534
   2535void __ksm_exit(struct mm_struct *mm)
   2536{
   2537	struct mm_slot *mm_slot;
   2538	int easy_to_free = 0;
   2539
   2540	/*
   2541	 * This process is exiting: if it's straightforward (as is the
   2542	 * case when ksmd was never running), free mm_slot immediately.
   2543	 * But if it's at the cursor or has rmap_items linked to it, use
   2544	 * mmap_lock to synchronize with any break_cows before pagetables
   2545	 * are freed, and leave the mm_slot on the list for ksmd to free.
   2546	 * Beware: ksm may already have noticed it exiting and freed the slot.
   2547	 */
   2548
   2549	spin_lock(&ksm_mmlist_lock);
   2550	mm_slot = get_mm_slot(mm);
   2551	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
   2552		if (!mm_slot->rmap_list) {
   2553			hash_del(&mm_slot->link);
   2554			list_del(&mm_slot->mm_list);
   2555			easy_to_free = 1;
   2556		} else {
   2557			list_move(&mm_slot->mm_list,
   2558				  &ksm_scan.mm_slot->mm_list);
   2559		}
   2560	}
   2561	spin_unlock(&ksm_mmlist_lock);
   2562
   2563	if (easy_to_free) {
   2564		free_mm_slot(mm_slot);
   2565		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
   2566		mmdrop(mm);
   2567	} else if (mm_slot) {
   2568		mmap_write_lock(mm);
   2569		mmap_write_unlock(mm);
   2570	}
   2571}
   2572
   2573struct page *ksm_might_need_to_copy(struct page *page,
   2574			struct vm_area_struct *vma, unsigned long address)
   2575{
   2576	struct folio *folio = page_folio(page);
   2577	struct anon_vma *anon_vma = folio_anon_vma(folio);
   2578	struct page *new_page;
   2579
   2580	if (PageKsm(page)) {
   2581		if (page_stable_node(page) &&
   2582		    !(ksm_run & KSM_RUN_UNMERGE))
   2583			return page;	/* no need to copy it */
   2584	} else if (!anon_vma) {
   2585		return page;		/* no need to copy it */
   2586	} else if (page->index == linear_page_index(vma, address) &&
   2587			anon_vma->root == vma->anon_vma->root) {
   2588		return page;		/* still no need to copy it */
   2589	}
   2590	if (!PageUptodate(page))
   2591		return page;		/* let do_swap_page report the error */
   2592
   2593	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
   2594	if (new_page &&
   2595	    mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) {
   2596		put_page(new_page);
   2597		new_page = NULL;
   2598	}
   2599	if (new_page) {
   2600		copy_user_highpage(new_page, page, address, vma);
   2601
   2602		SetPageDirty(new_page);
   2603		__SetPageUptodate(new_page);
   2604		__SetPageLocked(new_page);
   2605#ifdef CONFIG_SWAP
   2606		count_vm_event(KSM_SWPIN_COPY);
   2607#endif
   2608	}
   2609
   2610	return new_page;
   2611}
   2612
   2613void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
   2614{
   2615	struct stable_node *stable_node;
   2616	struct rmap_item *rmap_item;
   2617	int search_new_forks = 0;
   2618
   2619	VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);
   2620
   2621	/*
   2622	 * Rely on the page lock to protect against concurrent modifications
   2623	 * to that page's node of the stable tree.
   2624	 */
   2625	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
   2626
   2627	stable_node = folio_stable_node(folio);
   2628	if (!stable_node)
   2629		return;
   2630again:
   2631	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
   2632		struct anon_vma *anon_vma = rmap_item->anon_vma;
   2633		struct anon_vma_chain *vmac;
   2634		struct vm_area_struct *vma;
   2635
   2636		cond_resched();
   2637		if (!anon_vma_trylock_read(anon_vma)) {
   2638			if (rwc->try_lock) {
   2639				rwc->contended = true;
   2640				return;
   2641			}
   2642			anon_vma_lock_read(anon_vma);
   2643		}
   2644		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
   2645					       0, ULONG_MAX) {
   2646			unsigned long addr;
   2647
   2648			cond_resched();
   2649			vma = vmac->vma;
   2650
   2651			/* Ignore the stable/unstable/sqnr flags */
   2652			addr = rmap_item->address & PAGE_MASK;
   2653
   2654			if (addr < vma->vm_start || addr >= vma->vm_end)
   2655				continue;
   2656			/*
   2657			 * Initially we examine only the vma which covers this
   2658			 * rmap_item; but later, if there is still work to do,
   2659			 * we examine covering vmas in other mms: in case they
   2660			 * were forked from the original since ksmd passed.
   2661			 */
   2662			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
   2663				continue;
   2664
   2665			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
   2666				continue;
   2667
   2668			if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
   2669				anon_vma_unlock_read(anon_vma);
   2670				return;
   2671			}
   2672			if (rwc->done && rwc->done(folio)) {
   2673				anon_vma_unlock_read(anon_vma);
   2674				return;
   2675			}
   2676		}
   2677		anon_vma_unlock_read(anon_vma);
   2678	}
   2679	if (!search_new_forks++)
   2680		goto again;
   2681}
   2682
   2683#ifdef CONFIG_MIGRATION
   2684void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
   2685{
   2686	struct stable_node *stable_node;
   2687
   2688	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
   2689	VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
   2690	VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);
   2691
   2692	stable_node = folio_stable_node(folio);
   2693	if (stable_node) {
   2694		VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
   2695		stable_node->kpfn = folio_pfn(newfolio);
   2696		/*
   2697		 * newfolio->mapping was set in advance; now we need smp_wmb()
   2698		 * to make sure that the new stable_node->kpfn is visible
   2699		 * to get_ksm_page() before it can see that folio->mapping
   2700		 * has gone stale (or that folio_test_swapcache has been cleared).
   2701		 */
   2702		smp_wmb();
   2703		set_page_stable_node(&folio->page, NULL);
   2704	}
   2705}
   2706#endif /* CONFIG_MIGRATION */
   2707
   2708#ifdef CONFIG_MEMORY_HOTREMOVE
   2709static void wait_while_offlining(void)
   2710{
   2711	while (ksm_run & KSM_RUN_OFFLINE) {
   2712		mutex_unlock(&ksm_thread_mutex);
   2713		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
   2714			    TASK_UNINTERRUPTIBLE);
   2715		mutex_lock(&ksm_thread_mutex);
   2716	}
   2717}
   2718
   2719static bool stable_node_dup_remove_range(struct stable_node *stable_node,
   2720					 unsigned long start_pfn,
   2721					 unsigned long end_pfn)
   2722{
   2723	if (stable_node->kpfn >= start_pfn &&
   2724	    stable_node->kpfn < end_pfn) {
   2725		/*
   2726		 * Don't get_ksm_page, page has already gone:
   2727		 * which is why we keep kpfn instead of page*
   2728		 */
   2729		remove_node_from_stable_tree(stable_node);
   2730		return true;
   2731	}
   2732	return false;
   2733}
   2734
   2735static bool stable_node_chain_remove_range(struct stable_node *stable_node,
   2736					   unsigned long start_pfn,
   2737					   unsigned long end_pfn,
   2738					   struct rb_root *root)
   2739{
   2740	struct stable_node *dup;
   2741	struct hlist_node *hlist_safe;
   2742
   2743	if (!is_stable_node_chain(stable_node)) {
   2744		VM_BUG_ON(is_stable_node_dup(stable_node));
   2745		return stable_node_dup_remove_range(stable_node, start_pfn,
   2746						    end_pfn);
   2747	}
   2748
   2749	hlist_for_each_entry_safe(dup, hlist_safe,
   2750				  &stable_node->hlist, hlist_dup) {
   2751		VM_BUG_ON(!is_stable_node_dup(dup));
   2752		stable_node_dup_remove_range(dup, start_pfn, end_pfn);
   2753	}
   2754	if (hlist_empty(&stable_node->hlist)) {
   2755		free_stable_node_chain(stable_node, root);
   2756		return true; /* notify caller that tree was rebalanced */
   2757	} else
   2758		return false;
   2759}
   2760
   2761static void ksm_check_stable_tree(unsigned long start_pfn,
   2762				  unsigned long end_pfn)
   2763{
   2764	struct stable_node *stable_node, *next;
   2765	struct rb_node *node;
   2766	int nid;
   2767
   2768	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
   2769		node = rb_first(root_stable_tree + nid);
   2770		while (node) {
   2771			stable_node = rb_entry(node, struct stable_node, node);
   2772			if (stable_node_chain_remove_range(stable_node,
   2773							   start_pfn, end_pfn,
   2774							   root_stable_tree +
   2775							   nid))
   2776				node = rb_first(root_stable_tree + nid);
   2777			else
   2778				node = rb_next(node);
   2779			cond_resched();
   2780		}
   2781	}
   2782	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
   2783		if (stable_node->kpfn >= start_pfn &&
   2784		    stable_node->kpfn < end_pfn)
   2785			remove_node_from_stable_tree(stable_node);
   2786		cond_resched();
   2787	}
   2788}
   2789
   2790static int ksm_memory_callback(struct notifier_block *self,
   2791			       unsigned long action, void *arg)
   2792{
   2793	struct memory_notify *mn = arg;
   2794
   2795	switch (action) {
   2796	case MEM_GOING_OFFLINE:
   2797		/*
   2798		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
   2799		 * and remove_all_stable_nodes() while memory is going offline:
   2800		 * it is unsafe for them to touch the stable tree at this time.
   2801		 * But unmerge_ksm_pages(), rmap lookups and other entry points
   2802		 * which do not need the ksm_thread_mutex are all safe.
   2803		 */
   2804		mutex_lock(&ksm_thread_mutex);
   2805		ksm_run |= KSM_RUN_OFFLINE;
   2806		mutex_unlock(&ksm_thread_mutex);
   2807		break;
   2808
   2809	case MEM_OFFLINE:
   2810		/*
   2811		 * Most of the work is done by page migration; but there might
   2812		 * be a few stable_nodes left over, still pointing to struct
   2813		 * pages which have been offlined: prune those from the tree,
   2814		 * otherwise get_ksm_page() might later try to access a
   2815		 * non-existent struct page.
   2816		 */
   2817		ksm_check_stable_tree(mn->start_pfn,
   2818				      mn->start_pfn + mn->nr_pages);
   2819		fallthrough;
   2820	case MEM_CANCEL_OFFLINE:
   2821		mutex_lock(&ksm_thread_mutex);
   2822		ksm_run &= ~KSM_RUN_OFFLINE;
   2823		mutex_unlock(&ksm_thread_mutex);
   2824
   2825		smp_mb();	/* wake_up_bit advises this */
   2826		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
   2827		break;
   2828	}
   2829	return NOTIFY_OK;
   2830}
   2831#else
   2832static void wait_while_offlining(void)
   2833{
   2834}
   2835#endif /* CONFIG_MEMORY_HOTREMOVE */
   2836
   2837#ifdef CONFIG_SYSFS
   2838/*
   2839 * This all compiles without CONFIG_SYSFS, but is a waste of space.
   2840 */
   2841
   2842#define KSM_ATTR_RO(_name) \
   2843	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
   2844#define KSM_ATTR(_name) \
   2845	static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
   2846
   2847static ssize_t sleep_millisecs_show(struct kobject *kobj,
   2848				    struct kobj_attribute *attr, char *buf)
   2849{
   2850	return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs);
   2851}
   2852
   2853static ssize_t sleep_millisecs_store(struct kobject *kobj,
   2854				     struct kobj_attribute *attr,
   2855				     const char *buf, size_t count)
   2856{
   2857	unsigned int msecs;
   2858	int err;
   2859
   2860	err = kstrtouint(buf, 10, &msecs);
   2861	if (err)
   2862		return -EINVAL;
   2863
   2864	ksm_thread_sleep_millisecs = msecs;
   2865	wake_up_interruptible(&ksm_iter_wait);
   2866
   2867	return count;
   2868}
   2869KSM_ATTR(sleep_millisecs);
   2870
   2871static ssize_t pages_to_scan_show(struct kobject *kobj,
   2872				  struct kobj_attribute *attr, char *buf)
   2873{
   2874	return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan);
   2875}
   2876
   2877static ssize_t pages_to_scan_store(struct kobject *kobj,
   2878				   struct kobj_attribute *attr,
   2879				   const char *buf, size_t count)
   2880{
   2881	unsigned int nr_pages;
   2882	int err;
   2883
   2884	err = kstrtouint(buf, 10, &nr_pages);
   2885	if (err)
   2886		return -EINVAL;
   2887
   2888	ksm_thread_pages_to_scan = nr_pages;
   2889
   2890	return count;
   2891}
   2892KSM_ATTR(pages_to_scan);
   2893
   2894static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
   2895			char *buf)
   2896{
   2897	return sysfs_emit(buf, "%lu\n", ksm_run);
   2898}
   2899
   2900static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
   2901			 const char *buf, size_t count)
   2902{
   2903	unsigned int flags;
   2904	int err;
   2905
   2906	err = kstrtouint(buf, 10, &flags);
   2907	if (err)
   2908		return -EINVAL;
   2909	if (flags > KSM_RUN_UNMERGE)
   2910		return -EINVAL;
   2911
   2912	/*
   2913	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
   2914	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
   2915	 * breaking COW to free the pages_shared (but leaves mm_slots
   2916	 * on the list for when ksmd may be set running again).
   2917	 */
   2918
   2919	mutex_lock(&ksm_thread_mutex);
   2920	wait_while_offlining();
   2921	if (ksm_run != flags) {
   2922		ksm_run = flags;
   2923		if (flags & KSM_RUN_UNMERGE) {
   2924			set_current_oom_origin();
   2925			err = unmerge_and_remove_all_rmap_items();
   2926			clear_current_oom_origin();
   2927			if (err) {
   2928				ksm_run = KSM_RUN_STOP;
   2929				count = err;
   2930			}
   2931		}
   2932	}
   2933	mutex_unlock(&ksm_thread_mutex);
   2934
   2935	if (flags & KSM_RUN_MERGE)
   2936		wake_up_interruptible(&ksm_thread_wait);
   2937
   2938	return count;
   2939}
   2940KSM_ATTR(run);
   2941
   2942#ifdef CONFIG_NUMA
   2943static ssize_t merge_across_nodes_show(struct kobject *kobj,
   2944				       struct kobj_attribute *attr, char *buf)
   2945{
   2946	return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes);
   2947}
   2948
   2949static ssize_t merge_across_nodes_store(struct kobject *kobj,
   2950				   struct kobj_attribute *attr,
   2951				   const char *buf, size_t count)
   2952{
   2953	int err;
   2954	unsigned long knob;
   2955
   2956	err = kstrtoul(buf, 10, &knob);
   2957	if (err)
   2958		return err;
   2959	if (knob > 1)
   2960		return -EINVAL;
   2961
   2962	mutex_lock(&ksm_thread_mutex);
   2963	wait_while_offlining();
   2964	if (ksm_merge_across_nodes != knob) {
   2965		if (ksm_pages_shared || remove_all_stable_nodes())
   2966			err = -EBUSY;
   2967		else if (root_stable_tree == one_stable_tree) {
   2968			struct rb_root *buf;
   2969			/*
   2970			 * This is the first time that we switch away from the
   2971			 * default of merging across nodes: must now allocate
   2972			 * a buffer to hold as many roots as may be needed.
   2973			 * Allocate stable and unstable together:
   2974			 * MAXSMP NODES_SHIFT 10 will use 16kB.
   2975			 */
   2976			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
   2977				      GFP_KERNEL);
   2978			/* Let us assume that RB_ROOT is NULL is zero */
   2979			if (!buf)
   2980				err = -ENOMEM;
   2981			else {
   2982				root_stable_tree = buf;
   2983				root_unstable_tree = buf + nr_node_ids;
   2984				/* Stable tree is empty but not the unstable */
   2985				root_unstable_tree[0] = one_unstable_tree[0];
   2986			}
   2987		}
   2988		if (!err) {
   2989			ksm_merge_across_nodes = knob;
   2990			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
   2991		}
   2992	}
   2993	mutex_unlock(&ksm_thread_mutex);
   2994
   2995	return err ? err : count;
   2996}
   2997KSM_ATTR(merge_across_nodes);
   2998#endif
   2999
   3000static ssize_t use_zero_pages_show(struct kobject *kobj,
   3001				   struct kobj_attribute *attr, char *buf)
   3002{
   3003	return sysfs_emit(buf, "%u\n", ksm_use_zero_pages);
   3004}
   3005static ssize_t use_zero_pages_store(struct kobject *kobj,
   3006				   struct kobj_attribute *attr,
   3007				   const char *buf, size_t count)
   3008{
   3009	int err;
   3010	bool value;
   3011
   3012	err = kstrtobool(buf, &value);
   3013	if (err)
   3014		return -EINVAL;
   3015
   3016	ksm_use_zero_pages = value;
   3017
   3018	return count;
   3019}
   3020KSM_ATTR(use_zero_pages);
   3021
   3022static ssize_t max_page_sharing_show(struct kobject *kobj,
   3023				     struct kobj_attribute *attr, char *buf)
   3024{
   3025	return sysfs_emit(buf, "%u\n", ksm_max_page_sharing);
   3026}
   3027
   3028static ssize_t max_page_sharing_store(struct kobject *kobj,
   3029				      struct kobj_attribute *attr,
   3030				      const char *buf, size_t count)
   3031{
   3032	int err;
   3033	int knob;
   3034
   3035	err = kstrtoint(buf, 10, &knob);
   3036	if (err)
   3037		return err;
   3038	/*
   3039	 * When a KSM page is created it is shared by 2 mappings. This
   3040	 * being a signed comparison, it implicitly verifies it's not
   3041	 * negative.
   3042	 */
   3043	if (knob < 2)
   3044		return -EINVAL;
   3045
   3046	if (READ_ONCE(ksm_max_page_sharing) == knob)
   3047		return count;
   3048
   3049	mutex_lock(&ksm_thread_mutex);
   3050	wait_while_offlining();
   3051	if (ksm_max_page_sharing != knob) {
   3052		if (ksm_pages_shared || remove_all_stable_nodes())
   3053			err = -EBUSY;
   3054		else
   3055			ksm_max_page_sharing = knob;
   3056	}
   3057	mutex_unlock(&ksm_thread_mutex);
   3058
   3059	return err ? err : count;
   3060}
   3061KSM_ATTR(max_page_sharing);
   3062
   3063static ssize_t pages_shared_show(struct kobject *kobj,
   3064				 struct kobj_attribute *attr, char *buf)
   3065{
   3066	return sysfs_emit(buf, "%lu\n", ksm_pages_shared);
   3067}
   3068KSM_ATTR_RO(pages_shared);
   3069
   3070static ssize_t pages_sharing_show(struct kobject *kobj,
   3071				  struct kobj_attribute *attr, char *buf)
   3072{
   3073	return sysfs_emit(buf, "%lu\n", ksm_pages_sharing);
   3074}
   3075KSM_ATTR_RO(pages_sharing);
   3076
   3077static ssize_t pages_unshared_show(struct kobject *kobj,
   3078				   struct kobj_attribute *attr, char *buf)
   3079{
   3080	return sysfs_emit(buf, "%lu\n", ksm_pages_unshared);
   3081}
   3082KSM_ATTR_RO(pages_unshared);
   3083
   3084static ssize_t pages_volatile_show(struct kobject *kobj,
   3085				   struct kobj_attribute *attr, char *buf)
   3086{
   3087	long ksm_pages_volatile;
   3088
   3089	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
   3090				- ksm_pages_sharing - ksm_pages_unshared;
   3091	/*
   3092	 * It was not worth any locking to calculate that statistic,
   3093	 * but it might therefore sometimes be negative: conceal that.
   3094	 */
   3095	if (ksm_pages_volatile < 0)
   3096		ksm_pages_volatile = 0;
   3097	return sysfs_emit(buf, "%ld\n", ksm_pages_volatile);
   3098}
   3099KSM_ATTR_RO(pages_volatile);
   3100
   3101static ssize_t stable_node_dups_show(struct kobject *kobj,
   3102				     struct kobj_attribute *attr, char *buf)
   3103{
   3104	return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups);
   3105}
   3106KSM_ATTR_RO(stable_node_dups);
   3107
   3108static ssize_t stable_node_chains_show(struct kobject *kobj,
   3109				       struct kobj_attribute *attr, char *buf)
   3110{
   3111	return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains);
   3112}
   3113KSM_ATTR_RO(stable_node_chains);
   3114
   3115static ssize_t
   3116stable_node_chains_prune_millisecs_show(struct kobject *kobj,
   3117					struct kobj_attribute *attr,
   3118					char *buf)
   3119{
   3120	return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
   3121}
   3122
   3123static ssize_t
   3124stable_node_chains_prune_millisecs_store(struct kobject *kobj,
   3125					 struct kobj_attribute *attr,
   3126					 const char *buf, size_t count)
   3127{
   3128	unsigned int msecs;
   3129	int err;
   3130
   3131	err = kstrtouint(buf, 10, &msecs);
   3132	if (err)
   3133		return -EINVAL;
   3134
   3135	ksm_stable_node_chains_prune_millisecs = msecs;
   3136
   3137	return count;
   3138}
   3139KSM_ATTR(stable_node_chains_prune_millisecs);
   3140
   3141static ssize_t full_scans_show(struct kobject *kobj,
   3142			       struct kobj_attribute *attr, char *buf)
   3143{
   3144	return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr);
   3145}
   3146KSM_ATTR_RO(full_scans);
   3147
   3148static struct attribute *ksm_attrs[] = {
   3149	&sleep_millisecs_attr.attr,
   3150	&pages_to_scan_attr.attr,
   3151	&run_attr.attr,
   3152	&pages_shared_attr.attr,
   3153	&pages_sharing_attr.attr,
   3154	&pages_unshared_attr.attr,
   3155	&pages_volatile_attr.attr,
   3156	&full_scans_attr.attr,
   3157#ifdef CONFIG_NUMA
   3158	&merge_across_nodes_attr.attr,
   3159#endif
   3160	&max_page_sharing_attr.attr,
   3161	&stable_node_chains_attr.attr,
   3162	&stable_node_dups_attr.attr,
   3163	&stable_node_chains_prune_millisecs_attr.attr,
   3164	&use_zero_pages_attr.attr,
   3165	NULL,
   3166};
   3167
   3168static const struct attribute_group ksm_attr_group = {
   3169	.attrs = ksm_attrs,
   3170	.name = "ksm",
   3171};
   3172#endif /* CONFIG_SYSFS */
   3173
   3174static int __init ksm_init(void)
   3175{
   3176	struct task_struct *ksm_thread;
   3177	int err;
   3178
   3179	/* The correct value depends on page size and endianness */
   3180	zero_checksum = calc_checksum(ZERO_PAGE(0));
   3181	/* Default to false for backwards compatibility */
   3182	ksm_use_zero_pages = false;
   3183
   3184	err = ksm_slab_init();
   3185	if (err)
   3186		goto out;
   3187
   3188	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
   3189	if (IS_ERR(ksm_thread)) {
   3190		pr_err("ksm: creating kthread failed\n");
   3191		err = PTR_ERR(ksm_thread);
   3192		goto out_free;
   3193	}
   3194
   3195#ifdef CONFIG_SYSFS
   3196	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
   3197	if (err) {
   3198		pr_err("ksm: register sysfs failed\n");
   3199		kthread_stop(ksm_thread);
   3200		goto out_free;
   3201	}
   3202#else
   3203	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */
   3204
   3205#endif /* CONFIG_SYSFS */
   3206
   3207#ifdef CONFIG_MEMORY_HOTREMOVE
   3208	/* There is no significance to this priority 100 */
   3209	hotplug_memory_notifier(ksm_memory_callback, 100);
   3210#endif
   3211	return 0;
   3212
   3213out_free:
   3214	ksm_slab_free();
   3215out:
   3216	return err;
   3217}
   3218subsys_initcall(ksm_init);