cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

compaction.c (86123B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * linux/mm/compaction.c
      4 *
      5 * Memory compaction for the reduction of external fragmentation. Note that
      6 * this heavily depends upon page migration to do all the real heavy
      7 * lifting
      8 *
      9 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
     10 */
     11#include <linux/cpu.h>
     12#include <linux/swap.h>
     13#include <linux/migrate.h>
     14#include <linux/compaction.h>
     15#include <linux/mm_inline.h>
     16#include <linux/sched/signal.h>
     17#include <linux/backing-dev.h>
     18#include <linux/sysctl.h>
     19#include <linux/sysfs.h>
     20#include <linux/page-isolation.h>
     21#include <linux/kasan.h>
     22#include <linux/kthread.h>
     23#include <linux/freezer.h>
     24#include <linux/page_owner.h>
     25#include <linux/psi.h>
     26#include "internal.h"
     27
     28#ifdef CONFIG_COMPACTION
     29/*
     30 * Fragmentation score check interval for proactive compaction purposes.
     31 */
     32#define HPAGE_FRAG_CHECK_INTERVAL_MSEC	(500)
     33
     34static inline void count_compact_event(enum vm_event_item item)
     35{
     36	count_vm_event(item);
     37}
     38
     39static inline void count_compact_events(enum vm_event_item item, long delta)
     40{
     41	count_vm_events(item, delta);
     42}
     43#else
     44#define count_compact_event(item) do { } while (0)
     45#define count_compact_events(item, delta) do { } while (0)
     46#endif
     47
     48#if defined CONFIG_COMPACTION || defined CONFIG_CMA
     49
     50#define CREATE_TRACE_POINTS
     51#include <trace/events/compaction.h>
     52
     53#define block_start_pfn(pfn, order)	round_down(pfn, 1UL << (order))
     54#define block_end_pfn(pfn, order)	ALIGN((pfn) + 1, 1UL << (order))
     55#define pageblock_start_pfn(pfn)	block_start_pfn(pfn, pageblock_order)
     56#define pageblock_end_pfn(pfn)		block_end_pfn(pfn, pageblock_order)
     57
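        /*
         * Illustrative example of the helpers above, assuming pageblock_order
         * is 9 (a common configuration, so a pageblock spans 512 pfns):
         *
         *   block_start_pfn(1000, 9) == round_down(1000, 512) == 512
         *   block_end_pfn(1000, 9)   == ALIGN(1001, 512)      == 1024
         *
         * i.e. pfn 1000 lies in the pageblock covering pfns [512, 1024).
         */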
     58/*
     59 * Page order with-respect-to which proactive compaction
     60 * calculates external fragmentation, which is used as
     61 * the "fragmentation score" of a node/zone.
     62 */
     63#if defined CONFIG_TRANSPARENT_HUGEPAGE
     64#define COMPACTION_HPAGE_ORDER	HPAGE_PMD_ORDER
     65#elif defined CONFIG_HUGETLBFS
     66#define COMPACTION_HPAGE_ORDER	HUGETLB_PAGE_ORDER
     67#else
     68#define COMPACTION_HPAGE_ORDER	(PMD_SHIFT - PAGE_SHIFT)
     69#endif
     70
     71static unsigned long release_freepages(struct list_head *freelist)
     72{
     73	struct page *page, *next;
     74	unsigned long high_pfn = 0;
     75
     76	list_for_each_entry_safe(page, next, freelist, lru) {
     77		unsigned long pfn = page_to_pfn(page);
     78		list_del(&page->lru);
     79		__free_page(page);
     80		if (pfn > high_pfn)
     81			high_pfn = pfn;
     82	}
     83
     84	return high_pfn;
     85}
     86
     87static void split_map_pages(struct list_head *list)
     88{
     89	unsigned int i, order, nr_pages;
     90	struct page *page, *next;
     91	LIST_HEAD(tmp_list);
     92
     93	list_for_each_entry_safe(page, next, list, lru) {
     94		list_del(&page->lru);
     95
     96		order = page_private(page);
     97		nr_pages = 1 << order;
     98
     99		post_alloc_hook(page, order, __GFP_MOVABLE);
    100		if (order)
    101			split_page(page, order);
    102
    103		for (i = 0; i < nr_pages; i++) {
    104			list_add(&page->lru, &tmp_list);
    105			page++;
    106		}
    107	}
    108
    109	list_splice(&tmp_list, list);
    110}
    111
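        /*
         * Rough sketch of what split_map_pages() above does to one list entry,
         * assuming an order-3 page was isolated (so page_private(page) == 3):
         * post_alloc_hook() prepares it as if it had just been allocated,
         * split_page() breaks it into 2^3 == 8 order-0 pages, and all eight are
         * placed back on the list individually so that callers can hand them
         * out one at a time as migration targets.
         */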
    112#ifdef CONFIG_COMPACTION
    113
    114int PageMovable(struct page *page)
    115{
    116	struct address_space *mapping;
    117
    118	VM_BUG_ON_PAGE(!PageLocked(page), page);
    119	if (!__PageMovable(page))
    120		return 0;
    121
    122	mapping = page_mapping(page);
    123	if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
    124		return 1;
    125
    126	return 0;
    127}
    128EXPORT_SYMBOL(PageMovable);
    129
    130void __SetPageMovable(struct page *page, struct address_space *mapping)
    131{
    132	VM_BUG_ON_PAGE(!PageLocked(page), page);
    133	VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
    134	page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
    135}
    136EXPORT_SYMBOL(__SetPageMovable);
    137
    138void __ClearPageMovable(struct page *page)
    139{
    140	VM_BUG_ON_PAGE(!PageMovable(page), page);
    141	/*
     142	 * Clear the registered address_space value while keeping the
     143	 * PAGE_MAPPING_MOVABLE flag so that the VM can catch a page released by
     144	 * the driver after isolation. With it, VM migration doesn't try to put it back.
    145	 */
    146	page->mapping = (void *)((unsigned long)page->mapping &
    147				PAGE_MAPPING_MOVABLE);
    148}
    149EXPORT_SYMBOL(__ClearPageMovable);
    150
    151/* Do not skip compaction more than 64 times */
    152#define COMPACT_MAX_DEFER_SHIFT 6
    153
    154/*
    155 * Compaction is deferred when compaction fails to result in a page
     156 * allocation success. The next 1 << compact_defer_shift compaction attempts
     157 * are skipped, up to a limit of 1 << COMPACT_MAX_DEFER_SHIFT.
    158 */
    159static void defer_compaction(struct zone *zone, int order)
    160{
    161	zone->compact_considered = 0;
    162	zone->compact_defer_shift++;
    163
    164	if (order < zone->compact_order_failed)
    165		zone->compact_order_failed = order;
    166
    167	if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
    168		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
    169
    170	trace_mm_compaction_defer_compaction(zone, order);
    171}
    172
    173/* Returns true if compaction should be skipped this time */
    174static bool compaction_deferred(struct zone *zone, int order)
    175{
    176	unsigned long defer_limit = 1UL << zone->compact_defer_shift;
    177
    178	if (order < zone->compact_order_failed)
    179		return false;
    180
    181	/* Avoid possible overflow */
    182	if (++zone->compact_considered >= defer_limit) {
    183		zone->compact_considered = defer_limit;
    184		return false;
    185	}
    186
    187	trace_mm_compaction_deferred(zone, order);
    188
    189	return true;
    190}
    191
    192/*
    193 * Update defer tracking counters after successful compaction of given order,
    194 * which means an allocation either succeeded (alloc_success == true) or is
    195 * expected to succeed.
    196 */
    197void compaction_defer_reset(struct zone *zone, int order,
    198		bool alloc_success)
    199{
    200	if (alloc_success) {
    201		zone->compact_considered = 0;
    202		zone->compact_defer_shift = 0;
    203	}
    204	if (order >= zone->compact_order_failed)
    205		zone->compact_order_failed = order + 1;
    206
    207	trace_mm_compaction_defer_reset(zone, order);
    208}
    209
    210/* Returns true if restarting compaction after many failures */
    211static bool compaction_restarting(struct zone *zone, int order)
    212{
    213	if (order < zone->compact_order_failed)
    214		return false;
    215
    216	return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
    217		zone->compact_considered >= 1UL << zone->compact_defer_shift;
    218}
    219
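        /*
         * Rough illustration of the backoff implemented by the helpers above:
         * each failed compaction bumps compact_defer_shift, so successive
         * failures cause roughly 2, 4, 8, ... up to 1 << COMPACT_MAX_DEFER_SHIFT
         * (64) requests to be skipped by compaction_deferred() before the next
         * real attempt is made. compaction_defer_reset() drops the counters back
         * to zero once an allocation of the relevant order succeeds.
         */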
    220/* Returns true if the pageblock should be scanned for pages to isolate. */
    221static inline bool isolation_suitable(struct compact_control *cc,
    222					struct page *page)
    223{
    224	if (cc->ignore_skip_hint)
    225		return true;
    226
    227	return !get_pageblock_skip(page);
    228}
    229
    230static void reset_cached_positions(struct zone *zone)
    231{
    232	zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
    233	zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
    234	zone->compact_cached_free_pfn =
    235				pageblock_start_pfn(zone_end_pfn(zone) - 1);
    236}
    237
    238/*
    239 * Compound pages of >= pageblock_order should consistently be skipped until
    240 * released. It is always pointless to compact pages of such order (if they are
    241 * migratable), and the pageblocks they occupy cannot contain any free pages.
    242 */
    243static bool pageblock_skip_persistent(struct page *page)
    244{
    245	if (!PageCompound(page))
    246		return false;
    247
    248	page = compound_head(page);
    249
    250	if (compound_order(page) >= pageblock_order)
    251		return true;
    252
    253	return false;
    254}
    255
    256static bool
    257__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
    258							bool check_target)
    259{
    260	struct page *page = pfn_to_online_page(pfn);
    261	struct page *block_page;
    262	struct page *end_page;
    263	unsigned long block_pfn;
    264
    265	if (!page)
    266		return false;
    267	if (zone != page_zone(page))
    268		return false;
    269	if (pageblock_skip_persistent(page))
    270		return false;
    271
    272	/*
    273	 * If skip is already cleared do no further checking once the
    274	 * restart points have been set.
    275	 */
    276	if (check_source && check_target && !get_pageblock_skip(page))
    277		return true;
    278
    279	/*
    280	 * If clearing skip for the target scanner, do not select a
    281	 * non-movable pageblock as the starting point.
    282	 */
    283	if (!check_source && check_target &&
    284	    get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
    285		return false;
    286
    287	/* Ensure the start of the pageblock or zone is online and valid */
    288	block_pfn = pageblock_start_pfn(pfn);
    289	block_pfn = max(block_pfn, zone->zone_start_pfn);
    290	block_page = pfn_to_online_page(block_pfn);
    291	if (block_page) {
    292		page = block_page;
    293		pfn = block_pfn;
    294	}
    295
    296	/* Ensure the end of the pageblock or zone is online and valid */
    297	block_pfn = pageblock_end_pfn(pfn) - 1;
    298	block_pfn = min(block_pfn, zone_end_pfn(zone) - 1);
    299	end_page = pfn_to_online_page(block_pfn);
    300	if (!end_page)
    301		return false;
    302
    303	/*
    304	 * Only clear the hint if a sample indicates there is either a
    305	 * free page or an LRU page in the block. One or other condition
    306	 * is necessary for the block to be a migration source/target.
    307	 */
    308	do {
    309		if (check_source && PageLRU(page)) {
    310			clear_pageblock_skip(page);
    311			return true;
    312		}
    313
    314		if (check_target && PageBuddy(page)) {
    315			clear_pageblock_skip(page);
    316			return true;
    317		}
    318
    319		page += (1 << PAGE_ALLOC_COSTLY_ORDER);
    320	} while (page <= end_page);
    321
    322	return false;
    323}
    324
    325/*
    326 * This function is called to clear all cached information on pageblocks that
    327 * should be skipped for page isolation when the migrate and free page scanner
    328 * meet.
    329 */
    330static void __reset_isolation_suitable(struct zone *zone)
    331{
    332	unsigned long migrate_pfn = zone->zone_start_pfn;
    333	unsigned long free_pfn = zone_end_pfn(zone) - 1;
    334	unsigned long reset_migrate = free_pfn;
    335	unsigned long reset_free = migrate_pfn;
    336	bool source_set = false;
    337	bool free_set = false;
    338
    339	if (!zone->compact_blockskip_flush)
    340		return;
    341
    342	zone->compact_blockskip_flush = false;
    343
    344	/*
    345	 * Walk the zone and update pageblock skip information. Source looks
    346	 * for PageLRU while target looks for PageBuddy. When the scanner
    347	 * is found, both PageBuddy and PageLRU are checked as the pageblock
    348	 * is suitable as both source and target.
    349	 */
    350	for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages,
    351					free_pfn -= pageblock_nr_pages) {
    352		cond_resched();
    353
    354		/* Update the migrate PFN */
    355		if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) &&
    356		    migrate_pfn < reset_migrate) {
    357			source_set = true;
    358			reset_migrate = migrate_pfn;
    359			zone->compact_init_migrate_pfn = reset_migrate;
    360			zone->compact_cached_migrate_pfn[0] = reset_migrate;
    361			zone->compact_cached_migrate_pfn[1] = reset_migrate;
    362		}
    363
    364		/* Update the free PFN */
    365		if (__reset_isolation_pfn(zone, free_pfn, free_set, true) &&
    366		    free_pfn > reset_free) {
    367			free_set = true;
    368			reset_free = free_pfn;
    369			zone->compact_init_free_pfn = reset_free;
    370			zone->compact_cached_free_pfn = reset_free;
    371		}
    372	}
    373
    374	/* Leave no distance if no suitable block was reset */
    375	if (reset_migrate >= reset_free) {
    376		zone->compact_cached_migrate_pfn[0] = migrate_pfn;
    377		zone->compact_cached_migrate_pfn[1] = migrate_pfn;
    378		zone->compact_cached_free_pfn = free_pfn;
    379	}
    380}
    381
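        /*
         * Illustration of the walk above: migrate_pfn starts at the first pfn
         * of the zone and advances one pageblock per iteration while free_pfn
         * starts at the last pfn and retreats, so the skip hints are
         * re-examined from both ends until the two positions meet in the
         * middle, mirroring how the migration and free scanners later converge.
         */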
    382void reset_isolation_suitable(pg_data_t *pgdat)
    383{
    384	int zoneid;
    385
    386	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
    387		struct zone *zone = &pgdat->node_zones[zoneid];
    388		if (!populated_zone(zone))
    389			continue;
    390
    391		/* Only flush if a full compaction finished recently */
    392		if (zone->compact_blockskip_flush)
    393			__reset_isolation_suitable(zone);
    394	}
    395}
    396
    397/*
    398 * Sets the pageblock skip bit if it was clear. Note that this is a hint as
     399 * locks are not required for readers/writers. Returns true if it was already set.
    400 */
    401static bool test_and_set_skip(struct compact_control *cc, struct page *page,
    402							unsigned long pfn)
    403{
    404	bool skip;
    405
     406	/* Do not update if skip hint is being ignored */
    407	if (cc->ignore_skip_hint)
    408		return false;
    409
    410	if (!IS_ALIGNED(pfn, pageblock_nr_pages))
    411		return false;
    412
    413	skip = get_pageblock_skip(page);
    414	if (!skip && !cc->no_set_skip_hint)
    415		set_pageblock_skip(page);
    416
    417	return skip;
    418}
    419
    420static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
    421{
    422	struct zone *zone = cc->zone;
    423
    424	pfn = pageblock_end_pfn(pfn);
    425
    426	/* Set for isolation rather than compaction */
    427	if (cc->no_set_skip_hint)
    428		return;
    429
    430	if (pfn > zone->compact_cached_migrate_pfn[0])
    431		zone->compact_cached_migrate_pfn[0] = pfn;
    432	if (cc->mode != MIGRATE_ASYNC &&
    433	    pfn > zone->compact_cached_migrate_pfn[1])
    434		zone->compact_cached_migrate_pfn[1] = pfn;
    435}
    436
    437/*
    438 * If no pages were isolated then mark this pageblock to be skipped in the
    439 * future. The information is later cleared by __reset_isolation_suitable().
    440 */
    441static void update_pageblock_skip(struct compact_control *cc,
    442			struct page *page, unsigned long pfn)
    443{
    444	struct zone *zone = cc->zone;
    445
    446	if (cc->no_set_skip_hint)
    447		return;
    448
    449	if (!page)
    450		return;
    451
    452	set_pageblock_skip(page);
    453
    454	/* Update where async and sync compaction should restart */
    455	if (pfn < zone->compact_cached_free_pfn)
    456		zone->compact_cached_free_pfn = pfn;
    457}
    458#else
    459static inline bool isolation_suitable(struct compact_control *cc,
    460					struct page *page)
    461{
    462	return true;
    463}
    464
    465static inline bool pageblock_skip_persistent(struct page *page)
    466{
    467	return false;
    468}
    469
    470static inline void update_pageblock_skip(struct compact_control *cc,
    471			struct page *page, unsigned long pfn)
    472{
    473}
    474
    475static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
    476{
    477}
    478
    479static bool test_and_set_skip(struct compact_control *cc, struct page *page,
    480							unsigned long pfn)
    481{
    482	return false;
    483}
    484#endif /* CONFIG_COMPACTION */
    485
    486/*
    487 * Compaction requires the taking of some coarse locks that are potentially
    488 * very heavily contended. For async compaction, trylock and record if the
    489 * lock is contended. The lock will still be acquired but compaction will
    490 * abort when the current block is finished regardless of success rate.
    491 * Sync compaction acquires the lock.
    492 *
    493 * Always returns true which makes it easier to track lock state in callers.
    494 */
    495static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
    496						struct compact_control *cc)
    497	__acquires(lock)
    498{
    499	/* Track if the lock is contended in async mode */
    500	if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
    501		if (spin_trylock_irqsave(lock, *flags))
    502			return true;
    503
    504		cc->contended = true;
    505	}
    506
    507	spin_lock_irqsave(lock, *flags);
    508	return true;
    509}
    510
    511/*
    512 * Compaction requires the taking of some coarse locks that are potentially
    513 * very heavily contended. The lock should be periodically unlocked to avoid
    514 * having disabled IRQs for a long time, even when there is nobody waiting on
    515 * the lock. It might also be that allowing the IRQs will result in
    516 * need_resched() becoming true. If scheduling is needed, compaction schedules.
    517 * Either compaction type will also abort if a fatal signal is pending.
    518 * In either case if the lock was locked, it is dropped and not regained.
    519 *
    520 * Returns true if compaction should abort due to fatal signal pending.
    521 * Returns false when compaction can continue.
    522 */
    523static bool compact_unlock_should_abort(spinlock_t *lock,
    524		unsigned long flags, bool *locked, struct compact_control *cc)
    525{
    526	if (*locked) {
    527		spin_unlock_irqrestore(lock, flags);
    528		*locked = false;
    529	}
    530
    531	if (fatal_signal_pending(current)) {
    532		cc->contended = true;
    533		return true;
    534	}
    535
    536	cond_resched();
    537
    538	return false;
    539}
    540
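        /*
         * Illustrative usage of the two helpers above, mirroring the pattern
         * used by the scanners later in this file (sketch only):
         *
         *   if (!locked)
         *           locked = compact_lock_irqsave(&cc->zone->lock, &flags, cc);
         *   ...
         *   if (!(pfn % COMPACT_CLUSTER_MAX) &&
         *       compact_unlock_should_abort(&cc->zone->lock, flags, &locked, cc))
         *           break;
         *
         * Async compaction records contention (and will only finish the current
         * block) but still takes the lock; both modes periodically drop it and
         * abort if a fatal signal is pending.
         */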
    541/*
    542 * Isolate free pages onto a private freelist. If @strict is true, will abort
    543 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
    544 * (even though it may still end up isolating some pages).
    545 */
    546static unsigned long isolate_freepages_block(struct compact_control *cc,
    547				unsigned long *start_pfn,
    548				unsigned long end_pfn,
    549				struct list_head *freelist,
    550				unsigned int stride,
    551				bool strict)
    552{
    553	int nr_scanned = 0, total_isolated = 0;
    554	struct page *cursor;
    555	unsigned long flags = 0;
    556	bool locked = false;
    557	unsigned long blockpfn = *start_pfn;
    558	unsigned int order;
    559
    560	/* Strict mode is for isolation, speed is secondary */
    561	if (strict)
    562		stride = 1;
    563
    564	cursor = pfn_to_page(blockpfn);
    565
    566	/* Isolate free pages. */
    567	for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) {
    568		int isolated;
    569		struct page *page = cursor;
    570
    571		/*
    572		 * Periodically drop the lock (if held) regardless of its
     573		 * contention, to give a chance to IRQs. Abort if a fatal signal is
    574		 * pending.
    575		 */
    576		if (!(blockpfn % COMPACT_CLUSTER_MAX)
    577		    && compact_unlock_should_abort(&cc->zone->lock, flags,
    578								&locked, cc))
    579			break;
    580
    581		nr_scanned++;
    582
    583		/*
    584		 * For compound pages such as THP and hugetlbfs, we can save
    585		 * potentially a lot of iterations if we skip them at once.
    586		 * The check is racy, but we can consider only valid values
    587		 * and the only danger is skipping too much.
    588		 */
    589		if (PageCompound(page)) {
    590			const unsigned int order = compound_order(page);
    591
    592			if (likely(order < MAX_ORDER)) {
    593				blockpfn += (1UL << order) - 1;
    594				cursor += (1UL << order) - 1;
    595			}
    596			goto isolate_fail;
    597		}
    598
    599		if (!PageBuddy(page))
    600			goto isolate_fail;
    601
    602		/* If we already hold the lock, we can skip some rechecking. */
    603		if (!locked) {
    604			locked = compact_lock_irqsave(&cc->zone->lock,
    605								&flags, cc);
    606
    607			/* Recheck this is a buddy page under lock */
    608			if (!PageBuddy(page))
    609				goto isolate_fail;
    610		}
    611
    612		/* Found a free page, will break it into order-0 pages */
    613		order = buddy_order(page);
    614		isolated = __isolate_free_page(page, order);
    615		if (!isolated)
    616			break;
    617		set_page_private(page, order);
    618
    619		total_isolated += isolated;
    620		cc->nr_freepages += isolated;
    621		list_add_tail(&page->lru, freelist);
    622
    623		if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
    624			blockpfn += isolated;
    625			break;
    626		}
    627		/* Advance to the end of split page */
    628		blockpfn += isolated - 1;
    629		cursor += isolated - 1;
    630		continue;
    631
    632isolate_fail:
    633		if (strict)
    634			break;
    635		else
    636			continue;
    637
    638	}
    639
    640	if (locked)
    641		spin_unlock_irqrestore(&cc->zone->lock, flags);
    642
    643	/*
    644	 * There is a tiny chance that we have read bogus compound_order(),
    645	 * so be careful to not go outside of the pageblock.
    646	 */
    647	if (unlikely(blockpfn > end_pfn))
    648		blockpfn = end_pfn;
    649
    650	trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
    651					nr_scanned, total_isolated);
    652
    653	/* Record how far we have got within the block */
    654	*start_pfn = blockpfn;
    655
    656	/*
    657	 * If strict isolation is requested by CMA then check that all the
    658	 * pages requested were isolated. If there were any failures, 0 is
    659	 * returned and CMA will fail.
    660	 */
    661	if (strict && blockpfn < end_pfn)
    662		total_isolated = 0;
    663
    664	cc->total_free_scanned += nr_scanned;
    665	if (total_isolated)
    666		count_compact_events(COMPACTISOLATED, total_isolated);
    667	return total_isolated;
    668}
    669
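        /*
         * Note on the two ways isolate_freepages_block() above is used: the
         * free page scanner calls it with strict == false and keeps whatever
         * it managed to isolate, while isolate_freepages_range() below (the
         * CMA path) passes strict == true and treats a return value of 0 as
         * "the range contains a hole or an in-use page", releasing everything.
         */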
    670/**
    671 * isolate_freepages_range() - isolate free pages.
    672 * @cc:        Compaction control structure.
    673 * @start_pfn: The first PFN to start isolating.
    674 * @end_pfn:   The one-past-last PFN.
    675 *
    676 * Non-free pages, invalid PFNs, or zone boundaries within the
     677 * [start_pfn, end_pfn) range are considered errors and cause the function to
    678 * undo its actions and return zero.
    679 *
    680 * Otherwise, function returns one-past-the-last PFN of isolated page
     681 * (which may be greater than end_pfn if end fell in the middle of
    682 * a free page).
    683 */
    684unsigned long
    685isolate_freepages_range(struct compact_control *cc,
    686			unsigned long start_pfn, unsigned long end_pfn)
    687{
    688	unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
    689	LIST_HEAD(freelist);
    690
    691	pfn = start_pfn;
    692	block_start_pfn = pageblock_start_pfn(pfn);
    693	if (block_start_pfn < cc->zone->zone_start_pfn)
    694		block_start_pfn = cc->zone->zone_start_pfn;
    695	block_end_pfn = pageblock_end_pfn(pfn);
    696
    697	for (; pfn < end_pfn; pfn += isolated,
    698				block_start_pfn = block_end_pfn,
    699				block_end_pfn += pageblock_nr_pages) {
    700		/* Protect pfn from changing by isolate_freepages_block */
    701		unsigned long isolate_start_pfn = pfn;
    702
    703		block_end_pfn = min(block_end_pfn, end_pfn);
    704
    705		/*
     706		 * pfn could pass block_end_pfn if the isolated free page
     707		 * is larger than pageblock order. In this case, we adjust the
     708		 * scanning range to the pageblock that contains pfn.
    709		 */
    710		if (pfn >= block_end_pfn) {
    711			block_start_pfn = pageblock_start_pfn(pfn);
    712			block_end_pfn = pageblock_end_pfn(pfn);
    713			block_end_pfn = min(block_end_pfn, end_pfn);
    714		}
    715
    716		if (!pageblock_pfn_to_page(block_start_pfn,
    717					block_end_pfn, cc->zone))
    718			break;
    719
    720		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
    721					block_end_pfn, &freelist, 0, true);
    722
    723		/*
    724		 * In strict mode, isolate_freepages_block() returns 0 if
    725		 * there are any holes in the block (ie. invalid PFNs or
    726		 * non-free pages).
    727		 */
    728		if (!isolated)
    729			break;
    730
    731		/*
    732		 * If we managed to isolate pages, it is always (1 << n) *
    733		 * pageblock_nr_pages for some non-negative n.  (Max order
    734		 * page may span two pageblocks).
    735		 */
    736	}
    737
    738	/* __isolate_free_page() does not map the pages */
    739	split_map_pages(&freelist);
    740
    741	if (pfn < end_pfn) {
    742		/* Loop terminated early, cleanup. */
    743		release_freepages(&freelist);
    744		return 0;
    745	}
    746
    747	/* We don't use freelists for anything. */
    748	return pfn;
    749}
    750
    751/* Similar to reclaim, but different enough that they don't share logic */
    752static bool too_many_isolated(pg_data_t *pgdat)
    753{
    754	bool too_many;
    755
    756	unsigned long active, inactive, isolated;
    757
    758	inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
    759			node_page_state(pgdat, NR_INACTIVE_ANON);
    760	active = node_page_state(pgdat, NR_ACTIVE_FILE) +
    761			node_page_state(pgdat, NR_ACTIVE_ANON);
    762	isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
    763			node_page_state(pgdat, NR_ISOLATED_ANON);
    764
    765	too_many = isolated > (inactive + active) / 2;
    766	if (!too_many)
    767		wake_throttle_isolated(pgdat);
    768
    769	return too_many;
    770}
    771
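        /*
         * Worked example of the heuristic above: with 800 inactive, 600 active
         * and 800 isolated pages in the node, 800 > (800 + 600) / 2, so the
         * caller is asked to wait until parallel reclaim or compaction has put
         * some of the isolated pages back on the LRU lists.
         */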
    772/**
    773 * isolate_migratepages_block() - isolate all migrate-able pages within
    774 *				  a single pageblock
    775 * @cc:		Compaction control structure.
    776 * @low_pfn:	The first PFN to isolate
    777 * @end_pfn:	The one-past-the-last PFN to isolate, within same pageblock
    778 * @mode:	Isolation mode to be used.
    779 *
    780 * Isolate all pages that can be migrated from the range specified by
    781 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
     782 * Returns errno, like -EAGAIN or -EINTR in case of e.g. a pending signal or congestion,
    783 * -ENOMEM in case we could not allocate a page, or 0.
    784 * cc->migrate_pfn will contain the next pfn to scan.
    785 *
    786 * The pages are isolated on cc->migratepages list (not required to be empty),
    787 * and cc->nr_migratepages is updated accordingly.
    788 */
    789static int
    790isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
    791			unsigned long end_pfn, isolate_mode_t mode)
    792{
    793	pg_data_t *pgdat = cc->zone->zone_pgdat;
    794	unsigned long nr_scanned = 0, nr_isolated = 0;
    795	struct lruvec *lruvec;
    796	unsigned long flags = 0;
    797	struct lruvec *locked = NULL;
    798	struct page *page = NULL, *valid_page = NULL;
    799	struct address_space *mapping;
    800	unsigned long start_pfn = low_pfn;
    801	bool skip_on_failure = false;
    802	unsigned long next_skip_pfn = 0;
    803	bool skip_updated = false;
    804	int ret = 0;
    805
    806	cc->migrate_pfn = low_pfn;
    807
    808	/*
    809	 * Ensure that there are not too many pages isolated from the LRU
    810	 * list by either parallel reclaimers or compaction. If there are,
    811	 * delay for some time until fewer pages are isolated
    812	 */
    813	while (unlikely(too_many_isolated(pgdat))) {
    814		/* stop isolation if there are still pages not migrated */
    815		if (cc->nr_migratepages)
    816			return -EAGAIN;
    817
    818		/* async migration should just abort */
    819		if (cc->mode == MIGRATE_ASYNC)
    820			return -EAGAIN;
    821
    822		reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
    823
    824		if (fatal_signal_pending(current))
    825			return -EINTR;
    826	}
    827
    828	cond_resched();
    829
    830	if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
    831		skip_on_failure = true;
    832		next_skip_pfn = block_end_pfn(low_pfn, cc->order);
    833	}
    834
    835	/* Time to isolate some pages for migration */
    836	for (; low_pfn < end_pfn; low_pfn++) {
    837
    838		if (skip_on_failure && low_pfn >= next_skip_pfn) {
    839			/*
    840			 * We have isolated all migration candidates in the
    841			 * previous order-aligned block, and did not skip it due
    842			 * to failure. We should migrate the pages now and
    843			 * hopefully succeed compaction.
    844			 */
    845			if (nr_isolated)
    846				break;
    847
    848			/*
    849			 * We failed to isolate in the previous order-aligned
    850			 * block. Set the new boundary to the end of the
    851			 * current block. Note we can't simply increase
    852			 * next_skip_pfn by 1 << order, as low_pfn might have
    853			 * been incremented by a higher number due to skipping
    854			 * a compound or a high-order buddy page in the
    855			 * previous loop iteration.
    856			 */
    857			next_skip_pfn = block_end_pfn(low_pfn, cc->order);
    858		}
    859
    860		/*
    861		 * Periodically drop the lock (if held) regardless of its
     862		 * contention, to give a chance to IRQs. Abort completely if
    863		 * a fatal signal is pending.
    864		 */
    865		if (!(low_pfn % COMPACT_CLUSTER_MAX)) {
    866			if (locked) {
    867				unlock_page_lruvec_irqrestore(locked, flags);
    868				locked = NULL;
    869			}
    870
    871			if (fatal_signal_pending(current)) {
    872				cc->contended = true;
    873				ret = -EINTR;
    874
    875				goto fatal_pending;
    876			}
    877
    878			cond_resched();
    879		}
    880
    881		nr_scanned++;
    882
    883		page = pfn_to_page(low_pfn);
    884
    885		/*
    886		 * Check if the pageblock has already been marked skipped.
    887		 * Only the aligned PFN is checked as the caller isolates
    888		 * COMPACT_CLUSTER_MAX at a time so the second call must
    889		 * not falsely conclude that the block should be skipped.
    890		 */
    891		if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
    892			if (!isolation_suitable(cc, page)) {
    893				low_pfn = end_pfn;
    894				page = NULL;
    895				goto isolate_abort;
    896			}
    897			valid_page = page;
    898		}
    899
    900		if (PageHuge(page) && cc->alloc_contig) {
    901			ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
    902
    903			/*
    904			 * Fail isolation in case isolate_or_dissolve_huge_page()
    905			 * reports an error. In case of -ENOMEM, abort right away.
    906			 */
    907			if (ret < 0) {
    908				 /* Do not report -EBUSY down the chain */
    909				if (ret == -EBUSY)
    910					ret = 0;
    911				low_pfn += compound_nr(page) - 1;
    912				goto isolate_fail;
    913			}
    914
    915			if (PageHuge(page)) {
    916				/*
    917				 * Hugepage was successfully isolated and placed
    918				 * on the cc->migratepages list.
    919				 */
    920				low_pfn += compound_nr(page) - 1;
    921				goto isolate_success_no_list;
    922			}
    923
    924			/*
    925			 * Ok, the hugepage was dissolved. Now these pages are
    926			 * Buddy and cannot be re-allocated because they are
    927			 * isolated. Fall-through as the check below handles
    928			 * Buddy pages.
    929			 */
    930		}
    931
    932		/*
    933		 * Skip if free. We read page order here without zone lock
    934		 * which is generally unsafe, but the race window is small and
    935		 * the worst thing that can happen is that we skip some
    936		 * potential isolation targets.
    937		 */
    938		if (PageBuddy(page)) {
    939			unsigned long freepage_order = buddy_order_unsafe(page);
    940
    941			/*
    942			 * Without lock, we cannot be sure that what we got is
    943			 * a valid page order. Consider only values in the
    944			 * valid order range to prevent low_pfn overflow.
    945			 */
    946			if (freepage_order > 0 && freepage_order < MAX_ORDER)
    947				low_pfn += (1UL << freepage_order) - 1;
    948			continue;
    949		}
    950
    951		/*
    952		 * Regardless of being on LRU, compound pages such as THP and
    953		 * hugetlbfs are not to be compacted unless we are attempting
    954		 * an allocation much larger than the huge page size (eg CMA).
    955		 * We can potentially save a lot of iterations if we skip them
    956		 * at once. The check is racy, but we can consider only valid
    957		 * values and the only danger is skipping too much.
    958		 */
    959		if (PageCompound(page) && !cc->alloc_contig) {
    960			const unsigned int order = compound_order(page);
    961
    962			if (likely(order < MAX_ORDER))
    963				low_pfn += (1UL << order) - 1;
    964			goto isolate_fail;
    965		}
    966
    967		/*
    968		 * Check may be lockless but that's ok as we recheck later.
    969		 * It's possible to migrate LRU and non-lru movable pages.
    970		 * Skip any other type of page
    971		 */
    972		if (!PageLRU(page)) {
    973			/*
    974			 * __PageMovable can return false positive so we need
    975			 * to verify it under page_lock.
    976			 */
    977			if (unlikely(__PageMovable(page)) &&
    978					!PageIsolated(page)) {
    979				if (locked) {
    980					unlock_page_lruvec_irqrestore(locked, flags);
    981					locked = NULL;
    982				}
    983
    984				if (!isolate_movable_page(page, mode))
    985					goto isolate_success;
    986			}
    987
    988			goto isolate_fail;
    989		}
    990
    991		/*
    992		 * Migration will fail if an anonymous page is pinned in memory,
    993		 * so avoid taking lru_lock and isolating it unnecessarily in an
    994		 * admittedly racy check.
    995		 */
    996		mapping = page_mapping(page);
    997		if (!mapping && page_count(page) > page_mapcount(page))
    998			goto isolate_fail;
    999
   1000		/*
   1001		 * Only allow to migrate anonymous pages in GFP_NOFS context
   1002		 * because those do not depend on fs locks.
   1003		 */
   1004		if (!(cc->gfp_mask & __GFP_FS) && mapping)
   1005			goto isolate_fail;
   1006
   1007		/*
   1008		 * Be careful not to clear PageLRU until after we're
   1009		 * sure the page is not being freed elsewhere -- the
   1010		 * page release code relies on it.
   1011		 */
   1012		if (unlikely(!get_page_unless_zero(page)))
   1013			goto isolate_fail;
   1014
   1015		/* Only take pages on LRU: a check now makes later tests safe */
   1016		if (!PageLRU(page))
   1017			goto isolate_fail_put;
   1018
   1019		/* Compaction might skip unevictable pages but CMA takes them */
   1020		if (!(mode & ISOLATE_UNEVICTABLE) && PageUnevictable(page))
   1021			goto isolate_fail_put;
   1022
   1023		/*
   1024		 * To minimise LRU disruption, the caller can indicate with
   1025		 * ISOLATE_ASYNC_MIGRATE that it only wants to isolate pages
   1026		 * it will be able to migrate without blocking - clean pages
   1027		 * for the most part.  PageWriteback would require blocking.
   1028		 */
   1029		if ((mode & ISOLATE_ASYNC_MIGRATE) && PageWriteback(page))
   1030			goto isolate_fail_put;
   1031
   1032		if ((mode & ISOLATE_ASYNC_MIGRATE) && PageDirty(page)) {
   1033			bool migrate_dirty;
   1034
   1035			/*
   1036			 * Only pages without mappings or that have a
   1037			 * ->migratepage callback are possible to migrate
   1038			 * without blocking. However, we can be racing with
   1039			 * truncation so it's necessary to lock the page
   1040			 * to stabilise the mapping as truncation holds
   1041			 * the page lock until after the page is removed
   1042			 * from the page cache.
   1043			 */
   1044			if (!trylock_page(page))
   1045				goto isolate_fail_put;
   1046
   1047			mapping = page_mapping(page);
   1048			migrate_dirty = !mapping || mapping->a_ops->migratepage;
   1049			unlock_page(page);
   1050			if (!migrate_dirty)
   1051				goto isolate_fail_put;
   1052		}
   1053
   1054		/* Try isolate the page */
   1055		if (!TestClearPageLRU(page))
   1056			goto isolate_fail_put;
   1057
   1058		lruvec = folio_lruvec(page_folio(page));
   1059
   1060		/* If we already hold the lock, we can skip some rechecking */
   1061		if (lruvec != locked) {
   1062			if (locked)
   1063				unlock_page_lruvec_irqrestore(locked, flags);
   1064
   1065			compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
   1066			locked = lruvec;
   1067
   1068			lruvec_memcg_debug(lruvec, page_folio(page));
   1069
   1070			/* Try get exclusive access under lock */
   1071			if (!skip_updated) {
   1072				skip_updated = true;
   1073				if (test_and_set_skip(cc, page, low_pfn))
   1074					goto isolate_abort;
   1075			}
   1076
   1077			/*
    1078		 * Page became compound since the non-locked check,
   1079			 * and it's on LRU. It can only be a THP so the order
   1080			 * is safe to read and it's 0 for tail pages.
   1081			 */
   1082			if (unlikely(PageCompound(page) && !cc->alloc_contig)) {
   1083				low_pfn += compound_nr(page) - 1;
   1084				SetPageLRU(page);
   1085				goto isolate_fail_put;
   1086			}
   1087		}
   1088
   1089		/* The whole page is taken off the LRU; skip the tail pages. */
   1090		if (PageCompound(page))
   1091			low_pfn += compound_nr(page) - 1;
   1092
   1093		/* Successfully isolated */
   1094		del_page_from_lru_list(page, lruvec);
   1095		mod_node_page_state(page_pgdat(page),
   1096				NR_ISOLATED_ANON + page_is_file_lru(page),
   1097				thp_nr_pages(page));
   1098
   1099isolate_success:
   1100		list_add(&page->lru, &cc->migratepages);
   1101isolate_success_no_list:
   1102		cc->nr_migratepages += compound_nr(page);
   1103		nr_isolated += compound_nr(page);
   1104
   1105		/*
   1106		 * Avoid isolating too much unless this block is being
   1107		 * rescanned (e.g. dirty/writeback pages, parallel allocation)
   1108		 * or a lock is contended. For contention, isolate quickly to
   1109		 * potentially remove one source of contention.
   1110		 */
   1111		if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX &&
   1112		    !cc->rescan && !cc->contended) {
   1113			++low_pfn;
   1114			break;
   1115		}
   1116
   1117		continue;
   1118
   1119isolate_fail_put:
   1120		/* Avoid potential deadlock in freeing page under lru_lock */
   1121		if (locked) {
   1122			unlock_page_lruvec_irqrestore(locked, flags);
   1123			locked = NULL;
   1124		}
   1125		put_page(page);
   1126
   1127isolate_fail:
   1128		if (!skip_on_failure && ret != -ENOMEM)
   1129			continue;
   1130
   1131		/*
   1132		 * We have isolated some pages, but then failed. Release them
   1133		 * instead of migrating, as we cannot form the cc->order buddy
   1134		 * page anyway.
   1135		 */
   1136		if (nr_isolated) {
   1137			if (locked) {
   1138				unlock_page_lruvec_irqrestore(locked, flags);
   1139				locked = NULL;
   1140			}
   1141			putback_movable_pages(&cc->migratepages);
   1142			cc->nr_migratepages = 0;
   1143			nr_isolated = 0;
   1144		}
   1145
   1146		if (low_pfn < next_skip_pfn) {
   1147			low_pfn = next_skip_pfn - 1;
   1148			/*
   1149			 * The check near the loop beginning would have updated
   1150			 * next_skip_pfn too, but this is a bit simpler.
   1151			 */
   1152			next_skip_pfn += 1UL << cc->order;
   1153		}
   1154
   1155		if (ret == -ENOMEM)
   1156			break;
   1157	}
   1158
   1159	/*
   1160	 * The PageBuddy() check could have potentially brought us outside
   1161	 * the range to be scanned.
   1162	 */
   1163	if (unlikely(low_pfn > end_pfn))
   1164		low_pfn = end_pfn;
   1165
   1166	page = NULL;
   1167
   1168isolate_abort:
   1169	if (locked)
   1170		unlock_page_lruvec_irqrestore(locked, flags);
   1171	if (page) {
   1172		SetPageLRU(page);
   1173		put_page(page);
   1174	}
   1175
   1176	/*
    1177	 * Update the cached scanner pfn once the pageblock has been scanned.
    1178	 * Pages will either be migrated, in which case there is no point
    1179	 * scanning in the near future, or migration failed, in which case the
   1180	 * failure reason may persist. The block is marked for skipping if
   1181	 * there were no pages isolated in the block or if the block is
   1182	 * rescanned twice in a row.
   1183	 */
   1184	if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) {
   1185		if (valid_page && !skip_updated)
   1186			set_pageblock_skip(valid_page);
   1187		update_cached_migrate(cc, low_pfn);
   1188	}
   1189
   1190	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
   1191						nr_scanned, nr_isolated);
   1192
   1193fatal_pending:
   1194	cc->total_migrate_scanned += nr_scanned;
   1195	if (nr_isolated)
   1196		count_compact_events(COMPACTISOLATED, nr_isolated);
   1197
   1198	cc->migrate_pfn = low_pfn;
   1199
   1200	return ret;
   1201}
   1202
   1203/**
   1204 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
   1205 * @cc:        Compaction control structure.
   1206 * @start_pfn: The first PFN to start isolating.
   1207 * @end_pfn:   The one-past-last PFN.
   1208 *
    1209 * Returns -EAGAIN when contended, -EINTR in case of a signal pending, -ENOMEM
   1210 * in case we could not allocate a page, or 0.
   1211 */
   1212int
   1213isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
   1214							unsigned long end_pfn)
   1215{
   1216	unsigned long pfn, block_start_pfn, block_end_pfn;
   1217	int ret = 0;
   1218
   1219	/* Scan block by block. First and last block may be incomplete */
   1220	pfn = start_pfn;
   1221	block_start_pfn = pageblock_start_pfn(pfn);
   1222	if (block_start_pfn < cc->zone->zone_start_pfn)
   1223		block_start_pfn = cc->zone->zone_start_pfn;
   1224	block_end_pfn = pageblock_end_pfn(pfn);
   1225
   1226	for (; pfn < end_pfn; pfn = block_end_pfn,
   1227				block_start_pfn = block_end_pfn,
   1228				block_end_pfn += pageblock_nr_pages) {
   1229
   1230		block_end_pfn = min(block_end_pfn, end_pfn);
   1231
   1232		if (!pageblock_pfn_to_page(block_start_pfn,
   1233					block_end_pfn, cc->zone))
   1234			continue;
   1235
   1236		ret = isolate_migratepages_block(cc, pfn, block_end_pfn,
   1237						 ISOLATE_UNEVICTABLE);
   1238
   1239		if (ret)
   1240			break;
   1241
   1242		if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX)
   1243			break;
   1244	}
   1245
   1246	return ret;
   1247}
   1248
   1249#endif /* CONFIG_COMPACTION || CONFIG_CMA */
   1250#ifdef CONFIG_COMPACTION
   1251
   1252static bool suitable_migration_source(struct compact_control *cc,
   1253							struct page *page)
   1254{
   1255	int block_mt;
   1256
   1257	if (pageblock_skip_persistent(page))
   1258		return false;
   1259
   1260	if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
   1261		return true;
   1262
   1263	block_mt = get_pageblock_migratetype(page);
   1264
   1265	if (cc->migratetype == MIGRATE_MOVABLE)
   1266		return is_migrate_movable(block_mt);
   1267	else
   1268		return block_mt == cc->migratetype;
   1269}
   1270
   1271/* Returns true if the page is within a block suitable for migration to */
   1272static bool suitable_migration_target(struct compact_control *cc,
   1273							struct page *page)
   1274{
   1275	/* If the page is a large free page, then disallow migration */
   1276	if (PageBuddy(page)) {
   1277		/*
   1278		 * We are checking page_order without zone->lock taken. But
   1279		 * the only small danger is that we skip a potentially suitable
   1280		 * pageblock, so it's not worth to check order for valid range.
   1281		 */
   1282		if (buddy_order_unsafe(page) >= pageblock_order)
   1283			return false;
   1284	}
   1285
   1286	if (cc->ignore_block_suitable)
   1287		return true;
   1288
   1289	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
   1290	if (is_migrate_movable(get_pageblock_migratetype(page)))
   1291		return true;
   1292
   1293	/* Otherwise skip the block */
   1294	return false;
   1295}
   1296
   1297static inline unsigned int
   1298freelist_scan_limit(struct compact_control *cc)
   1299{
   1300	unsigned short shift = BITS_PER_LONG - 1;
   1301
   1302	return (COMPACT_CLUSTER_MAX >> min(shift, cc->fast_search_fail)) + 1;
   1303}
   1304
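        /*
         * Illustration of the limit above, assuming COMPACT_CLUSTER_MAX is 32
         * (its usual value): with no recent fast-search failures up to
         * 32 + 1 == 33 free-list entries may be examined, one failure halves
         * that to 17, and after six or more failures the limit bottoms out at
         * a single entry, keeping repeated unsuccessful searches cheap.
         */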
   1305/*
   1306 * Test whether the free scanner has reached the same or lower pageblock than
   1307 * the migration scanner, and compaction should thus terminate.
   1308 */
   1309static inline bool compact_scanners_met(struct compact_control *cc)
   1310{
   1311	return (cc->free_pfn >> pageblock_order)
   1312		<= (cc->migrate_pfn >> pageblock_order);
   1313}
   1314
   1315/*
   1316 * Used when scanning for a suitable migration target which scans freelists
    1317 * in reverse. Reorders the list so that the unscanned pages are scanned
   1318 * first on the next iteration of the free scanner
   1319 */
   1320static void
   1321move_freelist_head(struct list_head *freelist, struct page *freepage)
   1322{
   1323	LIST_HEAD(sublist);
   1324
   1325	if (!list_is_last(freelist, &freepage->lru)) {
   1326		list_cut_before(&sublist, freelist, &freepage->lru);
   1327		list_splice_tail(&sublist, freelist);
   1328	}
   1329}
   1330
   1331/*
   1332 * Similar to move_freelist_head except used by the migration scanner
   1333 * when scanning forward. It's possible for these list operations to
   1334 * move against each other if they search the free list exactly in
   1335 * lockstep.
   1336 */
   1337static void
   1338move_freelist_tail(struct list_head *freelist, struct page *freepage)
   1339{
   1340	LIST_HEAD(sublist);
   1341
   1342	if (!list_is_first(freelist, &freepage->lru)) {
   1343		list_cut_position(&sublist, freelist, &freepage->lru);
   1344		list_splice_tail(&sublist, freelist);
   1345	}
   1346}
   1347
   1348static void
   1349fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated)
   1350{
   1351	unsigned long start_pfn, end_pfn;
   1352	struct page *page;
   1353
   1354	/* Do not search around if there are enough pages already */
   1355	if (cc->nr_freepages >= cc->nr_migratepages)
   1356		return;
   1357
   1358	/* Minimise scanning during async compaction */
   1359	if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC)
   1360		return;
   1361
   1362	/* Pageblock boundaries */
   1363	start_pfn = max(pageblock_start_pfn(pfn), cc->zone->zone_start_pfn);
   1364	end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone));
   1365
   1366	page = pageblock_pfn_to_page(start_pfn, end_pfn, cc->zone);
   1367	if (!page)
   1368		return;
   1369
   1370	/* Scan before */
   1371	if (start_pfn != pfn) {
   1372		isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false);
   1373		if (cc->nr_freepages >= cc->nr_migratepages)
   1374			return;
   1375	}
   1376
   1377	/* Scan after */
   1378	start_pfn = pfn + nr_isolated;
   1379	if (start_pfn < end_pfn)
   1380		isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
   1381
   1382	/* Skip this pageblock in the future as it's full or nearly full */
   1383	if (cc->nr_freepages < cc->nr_migratepages)
   1384		set_pageblock_skip(page);
   1385}
   1386
   1387/* Search orders in round-robin fashion */
   1388static int next_search_order(struct compact_control *cc, int order)
   1389{
   1390	order--;
   1391	if (order < 0)
   1392		order = cc->order - 1;
   1393
   1394	/* Search wrapped around? */
   1395	if (order == cc->search_order) {
   1396		cc->search_order--;
   1397		if (cc->search_order < 0)
   1398			cc->search_order = cc->order - 1;
   1399		return -1;
   1400	}
   1401
   1402	return order;
   1403}
   1404
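        /*
         * Example of the round-robin walk above: with cc->order == 4 and
         * cc->search_order == 2, fast_isolate_freepages() below visits the
         * free lists for orders 2, 1, 0, 3 and, if nothing suitable is found,
         * stops once the walk wraps back to the order it started from.
         */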
   1405static unsigned long
   1406fast_isolate_freepages(struct compact_control *cc)
   1407{
   1408	unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1);
   1409	unsigned int nr_scanned = 0;
   1410	unsigned long low_pfn, min_pfn, highest = 0;
   1411	unsigned long nr_isolated = 0;
   1412	unsigned long distance;
   1413	struct page *page = NULL;
   1414	bool scan_start = false;
   1415	int order;
   1416
   1417	/* Full compaction passes in a negative order */
   1418	if (cc->order <= 0)
   1419		return cc->free_pfn;
   1420
   1421	/*
   1422	 * If starting the scan, use a deeper search and use the highest
   1423	 * PFN found if a suitable one is not found.
   1424	 */
   1425	if (cc->free_pfn >= cc->zone->compact_init_free_pfn) {
   1426		limit = pageblock_nr_pages >> 1;
   1427		scan_start = true;
   1428	}
   1429
   1430	/*
   1431	 * Preferred point is in the top quarter of the scan space but take
   1432	 * a pfn from the top half if the search is problematic.
   1433	 */
   1434	distance = (cc->free_pfn - cc->migrate_pfn);
   1435	low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2));
   1436	min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1));
   1437
   1438	if (WARN_ON_ONCE(min_pfn > low_pfn))
   1439		low_pfn = min_pfn;
   1440
   1441	/*
   1442	 * Search starts from the last successful isolation order or the next
   1443	 * order to search after a previous failure
   1444	 */
   1445	cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order);
   1446
   1447	for (order = cc->search_order;
   1448	     !page && order >= 0;
   1449	     order = next_search_order(cc, order)) {
   1450		struct free_area *area = &cc->zone->free_area[order];
   1451		struct list_head *freelist;
   1452		struct page *freepage;
   1453		unsigned long flags;
   1454		unsigned int order_scanned = 0;
   1455		unsigned long high_pfn = 0;
   1456
   1457		if (!area->nr_free)
   1458			continue;
   1459
   1460		spin_lock_irqsave(&cc->zone->lock, flags);
   1461		freelist = &area->free_list[MIGRATE_MOVABLE];
   1462		list_for_each_entry_reverse(freepage, freelist, lru) {
   1463			unsigned long pfn;
   1464
   1465			order_scanned++;
   1466			nr_scanned++;
   1467			pfn = page_to_pfn(freepage);
   1468
   1469			if (pfn >= highest)
   1470				highest = max(pageblock_start_pfn(pfn),
   1471					      cc->zone->zone_start_pfn);
   1472
   1473			if (pfn >= low_pfn) {
   1474				cc->fast_search_fail = 0;
   1475				cc->search_order = order;
   1476				page = freepage;
   1477				break;
   1478			}
   1479
   1480			if (pfn >= min_pfn && pfn > high_pfn) {
   1481				high_pfn = pfn;
   1482
   1483				/* Shorten the scan if a candidate is found */
   1484				limit >>= 1;
   1485			}
   1486
   1487			if (order_scanned >= limit)
   1488				break;
   1489		}
   1490
   1491		/* Use a minimum pfn if a preferred one was not found */
   1492		if (!page && high_pfn) {
   1493			page = pfn_to_page(high_pfn);
   1494
   1495			/* Update freepage for the list reorder below */
   1496			freepage = page;
   1497		}
   1498
    1499		/* Reorder so that a future search skips recent pages */
   1500		move_freelist_head(freelist, freepage);
   1501
   1502		/* Isolate the page if available */
   1503		if (page) {
   1504			if (__isolate_free_page(page, order)) {
   1505				set_page_private(page, order);
   1506				nr_isolated = 1 << order;
   1507				cc->nr_freepages += nr_isolated;
   1508				list_add_tail(&page->lru, &cc->freepages);
   1509				count_compact_events(COMPACTISOLATED, nr_isolated);
   1510			} else {
   1511				/* If isolation fails, abort the search */
   1512				order = cc->search_order + 1;
   1513				page = NULL;
   1514			}
   1515		}
   1516
   1517		spin_unlock_irqrestore(&cc->zone->lock, flags);
   1518
   1519		/*
   1520		 * Smaller scan on next order so the total scan is related
   1521		 * to freelist_scan_limit.
   1522		 */
   1523		if (order_scanned >= limit)
   1524			limit = max(1U, limit >> 1);
   1525	}
   1526
   1527	if (!page) {
   1528		cc->fast_search_fail++;
   1529		if (scan_start) {
   1530			/*
   1531			 * Use the highest PFN found above min. If one was
   1532			 * not found, be pessimistic for direct compaction
   1533			 * and use the min mark.
   1534			 */
   1535			if (highest >= min_pfn) {
   1536				page = pfn_to_page(highest);
   1537				cc->free_pfn = highest;
   1538			} else {
   1539				if (cc->direct_compaction && pfn_valid(min_pfn)) {
   1540					page = pageblock_pfn_to_page(min_pfn,
   1541						min(pageblock_end_pfn(min_pfn),
   1542						    zone_end_pfn(cc->zone)),
   1543						cc->zone);
   1544					cc->free_pfn = min_pfn;
   1545				}
   1546			}
   1547		}
   1548	}
   1549
   1550	if (highest && highest >= cc->zone->compact_cached_free_pfn) {
   1551		highest -= pageblock_nr_pages;
   1552		cc->zone->compact_cached_free_pfn = highest;
   1553	}
   1554
   1555	cc->total_free_scanned += nr_scanned;
   1556	if (!page)
   1557		return cc->free_pfn;
   1558
   1559	low_pfn = page_to_pfn(page);
   1560	fast_isolate_around(cc, low_pfn, nr_isolated);
   1561	return low_pfn;
   1562}
   1563
   1564/*
   1565 * Based on information in the current compact_control, find blocks
   1566 * suitable for isolating free pages from and then isolate them.
   1567 */
   1568static void isolate_freepages(struct compact_control *cc)
   1569{
   1570	struct zone *zone = cc->zone;
   1571	struct page *page;
   1572	unsigned long block_start_pfn;	/* start of current pageblock */
   1573	unsigned long isolate_start_pfn; /* exact pfn we start at */
   1574	unsigned long block_end_pfn;	/* end of current pageblock */
   1575	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
   1576	struct list_head *freelist = &cc->freepages;
   1577	unsigned int stride;
   1578
   1579	/* Try a small search of the free lists for a candidate */
   1580	fast_isolate_freepages(cc);
   1581	if (cc->nr_freepages)
   1582		goto splitmap;
   1583
   1584	/*
   1585	 * Initialise the free scanner. The starting point is where we last
   1586	 * successfully isolated from, zone-cached value, or the end of the
   1587	 * zone when isolating for the first time. For looping we also need
   1588	 * this pfn aligned down to the pageblock boundary, because we do
   1589	 * block_start_pfn -= pageblock_nr_pages in the for loop.
   1590	 * For ending point, take care when isolating in last pageblock of a
   1591	 * zone which ends in the middle of a pageblock.
   1592	 * The low boundary is the end of the pageblock the migration scanner
   1593	 * is using.
   1594	 */
   1595	isolate_start_pfn = cc->free_pfn;
   1596	block_start_pfn = pageblock_start_pfn(isolate_start_pfn);
   1597	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
   1598						zone_end_pfn(zone));
   1599	low_pfn = pageblock_end_pfn(cc->migrate_pfn);
   1600	stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1;
   1601
   1602	/*
   1603	 * Isolate free pages until enough are available to migrate the
   1604	 * pages on cc->migratepages. We stop searching if the migrate
   1605	 * and free page scanners meet or enough free pages are isolated.
   1606	 */
   1607	for (; block_start_pfn >= low_pfn;
   1608				block_end_pfn = block_start_pfn,
   1609				block_start_pfn -= pageblock_nr_pages,
   1610				isolate_start_pfn = block_start_pfn) {
   1611		unsigned long nr_isolated;
   1612
   1613		/*
   1614		 * This can iterate a massively long zone without finding any
   1615		 * suitable migration targets, so periodically check resched.
   1616		 */
   1617		if (!(block_start_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages)))
   1618			cond_resched();
   1619
   1620		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
   1621									zone);
   1622		if (!page)
   1623			continue;
   1624
   1625		/* Check the block is suitable for migration */
   1626		if (!suitable_migration_target(cc, page))
   1627			continue;
   1628
   1629		/* If isolation recently failed, do not retry */
   1630		if (!isolation_suitable(cc, page))
   1631			continue;
   1632
   1633		/* Found a block suitable for isolating free pages from. */
   1634		nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn,
   1635					block_end_pfn, freelist, stride, false);
   1636
   1637		/* Update the skip hint if the full pageblock was scanned */
   1638		if (isolate_start_pfn == block_end_pfn)
   1639			update_pageblock_skip(cc, page, block_start_pfn);
   1640
   1641		/* Are enough freepages isolated? */
   1642		if (cc->nr_freepages >= cc->nr_migratepages) {
   1643			if (isolate_start_pfn >= block_end_pfn) {
   1644				/*
   1645				 * Restart at previous pageblock if more
   1646				 * freepages can be isolated next time.
   1647				 */
   1648				isolate_start_pfn =
   1649					block_start_pfn - pageblock_nr_pages;
   1650			}
   1651			break;
   1652		} else if (isolate_start_pfn < block_end_pfn) {
   1653			/*
   1654			 * If isolation failed early, do not continue
   1655			 * needlessly.
   1656			 */
   1657			break;
   1658		}
   1659
   1660		/* Adjust stride depending on isolation */
   1661		if (nr_isolated) {
   1662			stride = 1;
   1663			continue;
   1664		}
   1665		stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1);
   1666	}
   1667
   1668	/*
   1669	 * Record where the free scanner will restart next time. Either we
   1670	 * broke from the loop and set isolate_start_pfn based on the last
   1671	 * call to isolate_freepages_block(), or we met the migration scanner
   1672	 * and the loop terminated due to isolate_start_pfn < low_pfn
   1673	 */
   1674	cc->free_pfn = isolate_start_pfn;
   1675
   1676splitmap:
   1677	/* __isolate_free_page() does not map the pages */
   1678	split_map_pages(freelist);
   1679}
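
        /*
         * Illustrative note (not part of the original source): the free
         * scanner above walks pageblocks backwards from cc->free_pfn
         * towards the migration scanner. With a hypothetical zone spanning
         * pfns 0x10000-0x20000 and pageblock_nr_pages == 512 (0x200), one
         * pass might look like:
         *
         *	cc->migrate_pfn = 0x10400   (migrate scanner, moving up)
         *	cc->free_pfn    = 0x1fe00   (free scanner, moving down)
         *	low_pfn = pageblock_end_pfn(0x10400) = 0x10600
         *
         * so the loop visits 0x1fe00, 0x1fc00, ... and stops once
         * block_start_pfn drops below 0x10600, i.e. when the scanners meet.
         */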
   1680
   1681/*
   1682 * This is a migrate-callback that "allocates" freepages by taking pages
   1683 * from the isolated freelists in the block we are migrating to.
   1684 */
   1685static struct page *compaction_alloc(struct page *migratepage,
   1686					unsigned long data)
   1687{
   1688	struct compact_control *cc = (struct compact_control *)data;
   1689	struct page *freepage;
   1690
   1691	if (list_empty(&cc->freepages)) {
   1692		isolate_freepages(cc);
   1693
   1694		if (list_empty(&cc->freepages))
   1695			return NULL;
   1696	}
   1697
   1698	freepage = list_entry(cc->freepages.next, struct page, lru);
   1699	list_del(&freepage->lru);
   1700	cc->nr_freepages--;
   1701
   1702	return freepage;
   1703}
   1704
   1705/*
   1706 * This is a migrate-callback that "frees" freepages back to the isolated
   1707 * freelist.  All pages on the freelist are from the same zone, so there is no
   1708 * special handling needed for NUMA.
   1709 */
   1710static void compaction_free(struct page *page, unsigned long data)
   1711{
   1712	struct compact_control *cc = (struct compact_control *)data;
   1713
   1714	list_add(&page->lru, &cc->freepages);
   1715	cc->nr_freepages++;
   1716}
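
        /*
         * Illustrative note (not part of the original source): these two
         * helpers are the allocate/free callback pair that compact_zone()
         * below hands to migrate_pages():
         *
         *	err = migrate_pages(&cc->migratepages, compaction_alloc,
         *			compaction_free, (unsigned long)cc, cc->mode,
         *			MR_COMPACTION, &nr_succeeded);
         *
         * migrate_pages() calls compaction_alloc() to obtain a migration
         * target from cc->freepages for each source page, and
         * compaction_free() to hand the target back if that particular
         * migration fails.
         */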
   1717
   1718/* possible outcome of isolate_migratepages */
   1719typedef enum {
   1720	ISOLATE_ABORT,		/* Abort compaction now */
   1721	ISOLATE_NONE,		/* No pages isolated, continue scanning */
   1722	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
   1723} isolate_migrate_t;
   1724
   1725/*
   1726 * Allow userspace to control policy on scanning the unevictable LRU for
   1727 * compactable pages.
   1728 */
   1729#ifdef CONFIG_PREEMPT_RT
   1730int sysctl_compact_unevictable_allowed __read_mostly = 0;
   1731#else
   1732int sysctl_compact_unevictable_allowed __read_mostly = 1;
   1733#endif
   1734
   1735static inline void
   1736update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
   1737{
   1738	if (cc->fast_start_pfn == ULONG_MAX)
   1739		return;
   1740
   1741	if (!cc->fast_start_pfn)
   1742		cc->fast_start_pfn = pfn;
   1743
   1744	cc->fast_start_pfn = min(cc->fast_start_pfn, pfn);
   1745}
   1746
   1747static inline unsigned long
   1748reinit_migrate_pfn(struct compact_control *cc)
   1749{
   1750	if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX)
   1751		return cc->migrate_pfn;
   1752
   1753	cc->migrate_pfn = cc->fast_start_pfn;
   1754	cc->fast_start_pfn = ULONG_MAX;
   1755
   1756	return cc->migrate_pfn;
   1757}
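
        /*
         * Illustrative note (not part of the original source):
         * cc->fast_start_pfn has three states: 0 means "not recorded yet",
         * ULONG_MAX means tracking is disabled for the rest of this
         * compaction run, and any other value is the lowest pfn the fast
         * search has found so far. A hypothetical sequence:
         *
         *	cc->fast_start_pfn = 0;              (start of compact_zone())
         *	update_fast_start_pfn(cc, 0x4a000);  (fast search hit -> 0x4a000)
         *	update_fast_start_pfn(cc, 0x41000);  (lower hit, min() -> 0x41000)
         *	reinit_migrate_pfn(cc);              (fast search later failed, so
         *	                                      migrate_pfn = 0x41000 and
         *	                                      fast_start_pfn = ULONG_MAX)
         */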
   1758
   1759/*
   1760 * Briefly search the free lists for a migration source that already has
   1761 * some free pages to reduce the number of pages that need migration
   1762 * before a pageblock is free.
   1763 */
   1764static unsigned long fast_find_migrateblock(struct compact_control *cc)
   1765{
   1766	unsigned int limit = freelist_scan_limit(cc);
   1767	unsigned int nr_scanned = 0;
   1768	unsigned long distance;
   1769	unsigned long pfn = cc->migrate_pfn;
   1770	unsigned long high_pfn;
   1771	int order;
   1772	bool found_block = false;
   1773
   1774	/* Skip hints are relied on to avoid repeats on the fast search */
   1775	if (cc->ignore_skip_hint)
   1776		return pfn;
   1777
   1778	/*
   1779	 * If the migrate_pfn is not at the start of a zone or the start
   1780	 * of a pageblock then assume this is a continuation of a previous
   1781	 * scan restarted due to COMPACT_CLUSTER_MAX.
   1782	 */
   1783	if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn))
   1784		return pfn;
   1785
   1786	/*
   1787	 * For smaller orders, just linearly scan as the number of pages
   1788	 * to migrate should be relatively small and does not necessarily
   1789	 * justify freeing up a large block for a small allocation.
   1790	 */
   1791	if (cc->order <= PAGE_ALLOC_COSTLY_ORDER)
   1792		return pfn;
   1793
   1794	/*
   1795	 * Only allow kcompactd and direct requests for movable pages to
   1796	 * quickly clear out a MOVABLE pageblock for allocation. This
   1797	 * reduces the risk that a large movable pageblock is freed for
   1798	 * an unmovable/reclaimable small allocation.
   1799	 */
   1800	if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE)
   1801		return pfn;
   1802
   1803	/*
   1804	 * When starting the migration scanner, pick any pageblock within the
   1805	 * first half of the search space. Otherwise try and pick a pageblock
   1806	 * within the first eighth to reduce the chances that a migration
   1807	 * target later becomes a source.
   1808	 */
   1809	distance = (cc->free_pfn - cc->migrate_pfn) >> 1;
   1810	if (cc->migrate_pfn != cc->zone->zone_start_pfn)
   1811		distance >>= 2;
   1812	high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
   1813
   1814	for (order = cc->order - 1;
   1815	     order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit;
   1816	     order--) {
   1817		struct free_area *area = &cc->zone->free_area[order];
   1818		struct list_head *freelist;
   1819		unsigned long flags;
   1820		struct page *freepage;
   1821
   1822		if (!area->nr_free)
   1823			continue;
   1824
   1825		spin_lock_irqsave(&cc->zone->lock, flags);
   1826		freelist = &area->free_list[MIGRATE_MOVABLE];
   1827		list_for_each_entry(freepage, freelist, lru) {
   1828			unsigned long free_pfn;
   1829
   1830			if (nr_scanned++ >= limit) {
   1831				move_freelist_tail(freelist, freepage);
   1832				break;
   1833			}
   1834
   1835			free_pfn = page_to_pfn(freepage);
   1836			if (free_pfn < high_pfn) {
   1837				/*
   1838				 * Avoid if skipped recently. Ideally it would
   1839				 * move to the tail but even safe iteration of
   1840				 * the list assumes an entry is deleted, not
   1841				 * reordered.
   1842				 */
   1843				if (get_pageblock_skip(freepage))
   1844					continue;
   1845
    1846				/* Reorder so that a future search skips recent pages */
   1847				move_freelist_tail(freelist, freepage);
   1848
   1849				update_fast_start_pfn(cc, free_pfn);
   1850				pfn = pageblock_start_pfn(free_pfn);
   1851				if (pfn < cc->zone->zone_start_pfn)
   1852					pfn = cc->zone->zone_start_pfn;
   1853				cc->fast_search_fail = 0;
   1854				found_block = true;
   1855				set_pageblock_skip(freepage);
   1856				break;
   1857			}
   1858		}
   1859		spin_unlock_irqrestore(&cc->zone->lock, flags);
   1860	}
   1861
   1862	cc->total_migrate_scanned += nr_scanned;
   1863
   1864	/*
   1865	 * If fast scanning failed then use a cached entry for a page block
   1866	 * that had free pages as the basis for starting a linear scan.
   1867	 */
   1868	if (!found_block) {
   1869		cc->fast_search_fail++;
   1870		pfn = reinit_migrate_pfn(cc);
   1871	}
   1872	return pfn;
   1873}
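
        /*
         * Illustrative note (not part of the original source): the distance
         * heuristic above with hypothetical scanner positions. Suppose
         * cc->migrate_pfn == zone_start_pfn == 0x10000 and
         * cc->free_pfn == 0x90000, so the search space is 0x80000 pfns:
         *
         *	distance = 0x80000 >> 1 = 0x40000	(first half)
         *	high_pfn = pageblock_start_pfn(0x10000 + 0x40000) = 0x50000
         *
         * On a later pass, once migrate_pfn has advanced past
         * zone_start_pfn, distance is shifted right by two more bits,
         * limiting the search to the first eighth of the remaining space.
         */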
   1874
   1875/*
   1876 * Isolate all pages that can be migrated from the first suitable block,
   1877 * starting at the block pointed to by the migrate scanner pfn within
   1878 * compact_control.
   1879 */
   1880static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
   1881{
   1882	unsigned long block_start_pfn;
   1883	unsigned long block_end_pfn;
   1884	unsigned long low_pfn;
   1885	struct page *page;
   1886	const isolate_mode_t isolate_mode =
   1887		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
   1888		(cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
   1889	bool fast_find_block;
   1890
   1891	/*
   1892	 * Start at where we last stopped, or beginning of the zone as
   1893	 * initialized by compact_zone(). The first failure will use
   1894	 * the lowest PFN as the starting point for linear scanning.
   1895	 */
   1896	low_pfn = fast_find_migrateblock(cc);
   1897	block_start_pfn = pageblock_start_pfn(low_pfn);
   1898	if (block_start_pfn < cc->zone->zone_start_pfn)
   1899		block_start_pfn = cc->zone->zone_start_pfn;
   1900
   1901	/*
    1902	 * fast_find_migrateblock() marks the pageblock it finds as skipped,
    1903	 * so record whether the fast search succeeded in order to bypass
    1904	 * the isolation_suitable() check below for that block.
   1905	 */
   1906	fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail;
   1907
   1908	/* Only scan within a pageblock boundary */
   1909	block_end_pfn = pageblock_end_pfn(low_pfn);
   1910
   1911	/*
   1912	 * Iterate over whole pageblocks until we find the first suitable.
   1913	 * Do not cross the free scanner.
   1914	 */
   1915	for (; block_end_pfn <= cc->free_pfn;
   1916			fast_find_block = false,
   1917			cc->migrate_pfn = low_pfn = block_end_pfn,
   1918			block_start_pfn = block_end_pfn,
   1919			block_end_pfn += pageblock_nr_pages) {
   1920
   1921		/*
   1922		 * This can potentially iterate a massively long zone with
   1923		 * many pageblocks unsuitable, so periodically check if we
   1924		 * need to schedule.
   1925		 */
   1926		if (!(low_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages)))
   1927			cond_resched();
   1928
   1929		page = pageblock_pfn_to_page(block_start_pfn,
   1930						block_end_pfn, cc->zone);
   1931		if (!page)
   1932			continue;
   1933
   1934		/*
    1935		 * If isolation recently failed, do not retry. Only check the
    1936		 * pageblock once: COMPACT_CLUSTER_MAX causes a pageblock to be
    1937		 * visited multiple times, and the skip hint is assumed to have
    1938		 * been checked before the block was marked "skip" so that other
    1939		 * compaction instances do not scan the same block.
   1940		 */
   1941		if (IS_ALIGNED(low_pfn, pageblock_nr_pages) &&
   1942		    !fast_find_block && !isolation_suitable(cc, page))
   1943			continue;
   1944
   1945		/*
   1946		 * For async direct compaction, only scan the pageblocks of the
   1947		 * same migratetype without huge pages. Async direct compaction
   1948		 * is optimistic to see if the minimum amount of work satisfies
   1949		 * the allocation. The cached PFN is updated as it's possible
   1950		 * that all remaining blocks between source and target are
   1951		 * unsuitable and the compaction scanners fail to meet.
   1952		 */
   1953		if (!suitable_migration_source(cc, page)) {
   1954			update_cached_migrate(cc, block_end_pfn);
   1955			continue;
   1956		}
   1957
   1958		/* Perform the isolation */
   1959		if (isolate_migratepages_block(cc, low_pfn, block_end_pfn,
   1960						isolate_mode))
   1961			return ISOLATE_ABORT;
   1962
   1963		/*
   1964		 * Either we isolated something and proceed with migration. Or
   1965		 * we failed and compact_zone should decide if we should
   1966		 * continue or not.
   1967		 */
   1968		break;
   1969	}
   1970
   1971	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
   1972}
   1973
   1974/*
   1975 * order == -1 is expected when compacting via
   1976 * /proc/sys/vm/compact_memory
   1977 */
   1978static inline bool is_via_compact_memory(int order)
   1979{
   1980	return order == -1;
   1981}
   1982
   1983static bool kswapd_is_running(pg_data_t *pgdat)
   1984{
   1985	return pgdat->kswapd && task_is_running(pgdat->kswapd);
   1986}
   1987
   1988/*
    1989 * A zone's fragmentation score is the external fragmentation with respect to
   1990 * COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
   1991 */
   1992static unsigned int fragmentation_score_zone(struct zone *zone)
   1993{
   1994	return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
   1995}
   1996
   1997/*
   1998 * A weighted zone's fragmentation score is the external fragmentation
    1999 * with respect to COMPACTION_HPAGE_ORDER scaled by the zone's size. It
   2000 * returns a value in the range [0, 100].
   2001 *
   2002 * The scaling factor ensures that proactive compaction focuses on larger
   2003 * zones like ZONE_NORMAL, rather than smaller, specialized zones like
   2004 * ZONE_DMA32. For smaller zones, the score value remains close to zero,
   2005 * and thus never exceeds the high threshold for proactive compaction.
   2006 */
   2007static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
   2008{
   2009	unsigned long score;
   2010
   2011	score = zone->present_pages * fragmentation_score_zone(zone);
   2012	return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
   2013}
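
        /*
         * Illustrative note (not part of the original source): with
         * hypothetical numbers, a zone holding a quarter of the node's
         * memory and showing an external fragmentation of 60 for
         * COMPACTION_HPAGE_ORDER contributes roughly
         *
         *	score = (1048576 * 60) / (4194304 + 1) ~= 15
         *
         * to the node score, while a small zone with the same raw
         * fragmentation contributes close to zero.
         */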
   2014
   2015/*
   2016 * The per-node proactive (background) compaction process is started by its
   2017 * corresponding kcompactd thread when the node's fragmentation score
   2018 * exceeds the high threshold. The compaction process remains active till
   2019 * the node's score falls below the low threshold, or one of the back-off
   2020 * conditions is met.
   2021 */
   2022static unsigned int fragmentation_score_node(pg_data_t *pgdat)
   2023{
   2024	unsigned int score = 0;
   2025	int zoneid;
   2026
   2027	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
   2028		struct zone *zone;
   2029
   2030		zone = &pgdat->node_zones[zoneid];
   2031		score += fragmentation_score_zone_weighted(zone);
   2032	}
   2033
   2034	return score;
   2035}
   2036
   2037static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
   2038{
   2039	unsigned int wmark_low;
   2040
   2041	/*
   2042	 * Cap the low watermark to avoid excessive compaction
   2043	 * activity in case a user sets the proactiveness tunable
   2044	 * close to 100 (maximum).
   2045	 */
   2046	wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
   2047	return low ? wmark_low : min(wmark_low + 10, 100U);
   2048}
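
        /*
         * Illustrative note (not part of the original source): with the
         * default sysctl_compaction_proactiveness of 20 (set further
         * below), the thresholds work out to
         *
         *	wmark_low  = max(100 - 20, 5)         = 80
         *	wmark_high = min(wmark_low + 10, 100) = 90
         *
         * so proactive compaction starts once a node's score exceeds 90
         * and keeps running until it falls back below 80.
         */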
   2049
   2050static bool should_proactive_compact_node(pg_data_t *pgdat)
   2051{
   2052	int wmark_high;
   2053
   2054	if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat))
   2055		return false;
   2056
   2057	wmark_high = fragmentation_score_wmark(pgdat, false);
   2058	return fragmentation_score_node(pgdat) > wmark_high;
   2059}
   2060
   2061static enum compact_result __compact_finished(struct compact_control *cc)
   2062{
   2063	unsigned int order;
   2064	const int migratetype = cc->migratetype;
   2065	int ret;
   2066
   2067	/* Compaction run completes if the migrate and free scanner meet */
   2068	if (compact_scanners_met(cc)) {
   2069		/* Let the next compaction start anew. */
   2070		reset_cached_positions(cc->zone);
   2071
   2072		/*
   2073		 * Mark that the PG_migrate_skip information should be cleared
   2074		 * by kswapd when it goes to sleep. kcompactd does not set the
    2075		 * flag itself as the decision to clear it should be based
    2076		 * directly on an allocation request.
   2077		 */
   2078		if (cc->direct_compaction)
   2079			cc->zone->compact_blockskip_flush = true;
   2080
   2081		if (cc->whole_zone)
   2082			return COMPACT_COMPLETE;
   2083		else
   2084			return COMPACT_PARTIAL_SKIPPED;
   2085	}
   2086
   2087	if (cc->proactive_compaction) {
   2088		int score, wmark_low;
   2089		pg_data_t *pgdat;
   2090
   2091		pgdat = cc->zone->zone_pgdat;
   2092		if (kswapd_is_running(pgdat))
   2093			return COMPACT_PARTIAL_SKIPPED;
   2094
   2095		score = fragmentation_score_zone(cc->zone);
   2096		wmark_low = fragmentation_score_wmark(pgdat, true);
   2097
   2098		if (score > wmark_low)
   2099			ret = COMPACT_CONTINUE;
   2100		else
   2101			ret = COMPACT_SUCCESS;
   2102
   2103		goto out;
   2104	}
   2105
   2106	if (is_via_compact_memory(cc->order))
   2107		return COMPACT_CONTINUE;
   2108
   2109	/*
   2110	 * Always finish scanning a pageblock to reduce the possibility of
   2111	 * fallbacks in the future. This is particularly important when
   2112	 * migration source is unmovable/reclaimable but it's not worth
   2113	 * special casing.
   2114	 */
   2115	if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
   2116		return COMPACT_CONTINUE;
   2117
   2118	/* Direct compactor: Is a suitable page free? */
   2119	ret = COMPACT_NO_SUITABLE_PAGE;
   2120	for (order = cc->order; order < MAX_ORDER; order++) {
   2121		struct free_area *area = &cc->zone->free_area[order];
   2122		bool can_steal;
   2123
   2124		/* Job done if page is free of the right migratetype */
   2125		if (!free_area_empty(area, migratetype))
   2126			return COMPACT_SUCCESS;
   2127
   2128#ifdef CONFIG_CMA
   2129		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
   2130		if (migratetype == MIGRATE_MOVABLE &&
   2131			!free_area_empty(area, MIGRATE_CMA))
   2132			return COMPACT_SUCCESS;
   2133#endif
   2134		/*
   2135		 * Job done if allocation would steal freepages from
   2136		 * other migratetype buddy lists.
   2137		 */
   2138		if (find_suitable_fallback(area, order, migratetype,
   2139						true, &can_steal) != -1)
   2140			/*
   2141			 * Movable pages are OK in any pageblock. If we are
   2142			 * stealing for a non-movable allocation, make sure
   2143			 * we finish compacting the current pageblock first
   2144			 * (which is assured by the above migrate_pfn align
   2145			 * check) so it is as free as possible and we won't
   2146			 * have to steal another one soon.
   2147			 */
   2148			return COMPACT_SUCCESS;
   2149	}
   2150
   2151out:
   2152	if (cc->contended || fatal_signal_pending(current))
   2153		ret = COMPACT_CONTENDED;
   2154
   2155	return ret;
   2156}
   2157
   2158static enum compact_result compact_finished(struct compact_control *cc)
   2159{
   2160	int ret;
   2161
   2162	ret = __compact_finished(cc);
   2163	trace_mm_compaction_finished(cc->zone, cc->order, ret);
   2164	if (ret == COMPACT_NO_SUITABLE_PAGE)
   2165		ret = COMPACT_CONTINUE;
   2166
   2167	return ret;
   2168}
   2169
   2170static enum compact_result __compaction_suitable(struct zone *zone, int order,
   2171					unsigned int alloc_flags,
   2172					int highest_zoneidx,
   2173					unsigned long wmark_target)
   2174{
   2175	unsigned long watermark;
   2176
   2177	if (is_via_compact_memory(order))
   2178		return COMPACT_CONTINUE;
   2179
   2180	watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
   2181	/*
   2182	 * If watermarks for high-order allocation are already met, there
   2183	 * should be no need for compaction at all.
   2184	 */
   2185	if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
   2186								alloc_flags))
   2187		return COMPACT_SUCCESS;
   2188
   2189	/*
   2190	 * Watermarks for order-0 must be met for compaction to be able to
   2191	 * isolate free pages for migration targets. This means that the
   2192	 * watermark and alloc_flags have to match, or be more pessimistic than
   2193	 * the check in __isolate_free_page(). We don't use the direct
   2194	 * compactor's alloc_flags, as they are not relevant for freepage
   2195	 * isolation. We however do use the direct compactor's highest_zoneidx
   2196	 * to skip over zones where lowmem reserves would prevent allocation
   2197	 * even if compaction succeeds.
   2198	 * For costly orders, we require low watermark instead of min for
   2199	 * compaction to proceed to increase its chances.
   2200	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
   2201	 * suitable migration targets
   2202	 */
   2203	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
   2204				low_wmark_pages(zone) : min_wmark_pages(zone);
   2205	watermark += compact_gap(order);
   2206	if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
   2207						ALLOC_CMA, wmark_target))
   2208		return COMPACT_SKIPPED;
   2209
   2210	return COMPACT_CONTINUE;
   2211}
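
        /*
         * Illustrative note (not part of the original source): for a costly
         * request such as order-9 (a 2MB THP with 4K pages), and assuming
         * compact_gap() adds roughly twice the allocation size of headroom
         * (2UL << order), the order-0 check above becomes
         *
         *	watermark = low_wmark_pages(zone) + (2UL << 9)
         *	          = low_wmark_pages(zone) + 1024 pages
         *
         * i.e. compaction is skipped unless the zone has about 4MB of free
         * memory above its low watermark to serve as migration targets.
         */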
   2212
   2213/*
   2214 * compaction_suitable: Is this suitable to run compaction on this zone now?
   2215 * Returns
   2216 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
   2217 *   COMPACT_SUCCESS  - If the allocation would succeed without compaction
   2218 *   COMPACT_CONTINUE - If compaction should run now
   2219 */
   2220enum compact_result compaction_suitable(struct zone *zone, int order,
   2221					unsigned int alloc_flags,
   2222					int highest_zoneidx)
   2223{
   2224	enum compact_result ret;
   2225	int fragindex;
   2226
   2227	ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx,
   2228				    zone_page_state(zone, NR_FREE_PAGES));
   2229	/*
   2230	 * fragmentation index determines if allocation failures are due to
   2231	 * low memory or external fragmentation
   2232	 *
   2233	 * index of -1000 would imply allocations might succeed depending on
   2234	 * watermarks, but we already failed the high-order watermark check
   2235	 * index towards 0 implies failure is due to lack of memory
   2236	 * index towards 1000 implies failure is due to fragmentation
   2237	 *
   2238	 * Only compact if a failure would be due to fragmentation. Also
   2239	 * ignore fragindex for non-costly orders where the alternative to
   2240	 * a successful reclaim/compaction is OOM. Fragindex and the
   2241	 * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
   2242	 * excessive compaction for costly orders, but it should not be at the
   2243	 * expense of system stability.
   2244	 */
   2245	if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
   2246		fragindex = fragmentation_index(zone, order);
   2247		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
   2248			ret = COMPACT_NOT_SUITABLE_ZONE;
   2249	}
   2250
   2251	trace_mm_compaction_suitable(zone, order, ret);
   2252	if (ret == COMPACT_NOT_SUITABLE_ZONE)
   2253		ret = COMPACT_SKIPPED;
   2254
   2255	return ret;
   2256}
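
        /*
         * Illustrative note (not part of the original source): with the
         * default sysctl_extfrag_threshold of 500 (defined further below),
         * a costly order-9 request that passed the checks above would see:
         *
         *	fragindex = 350  ->  350 <= 500  ->  COMPACT_SKIPPED
         *	                     (failure looks like lack of free memory,
         *	                      prefer reclaim)
         *	fragindex = 800  ->  800 >  500  ->  COMPACT_CONTINUE
         *	                     (failure looks fragmentation driven)
         */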
   2257
   2258bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
   2259		int alloc_flags)
   2260{
   2261	struct zone *zone;
   2262	struct zoneref *z;
   2263
   2264	/*
   2265	 * Make sure at least one zone would pass __compaction_suitable if we continue
   2266	 * retrying the reclaim.
   2267	 */
   2268	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
   2269				ac->highest_zoneidx, ac->nodemask) {
   2270		unsigned long available;
   2271		enum compact_result compact_result;
   2272
   2273		/*
   2274		 * Do not consider all the reclaimable memory because we do not
    2275		 * want to thrash just for a single high order allocation which
    2276		 * is not even guaranteed to appear even if __compaction_suitable
   2277		 * is happy about the watermark check.
   2278		 */
   2279		available = zone_reclaimable_pages(zone) / order;
   2280		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
   2281		compact_result = __compaction_suitable(zone, order, alloc_flags,
   2282				ac->highest_zoneidx, available);
   2283		if (compact_result == COMPACT_CONTINUE)
   2284			return true;
   2285	}
   2286
   2287	return false;
   2288}
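
        /*
         * Illustrative note (not part of the original source): the
         * "available" estimate above deliberately scales reclaimable memory
         * down by the requested order. With hypothetical numbers for an
         * order-9 request:
         *
         *	available = zone_reclaimable_pages(zone) / 9 + NR_FREE_PAGES
         *	          = 90000 / 9 + 20000 = 30000 pages
         *
         * and that figure is passed to __compaction_suitable() as the
         * watermark target in place of the current free page count.
         */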
   2289
   2290static enum compact_result
   2291compact_zone(struct compact_control *cc, struct capture_control *capc)
   2292{
   2293	enum compact_result ret;
   2294	unsigned long start_pfn = cc->zone->zone_start_pfn;
   2295	unsigned long end_pfn = zone_end_pfn(cc->zone);
   2296	unsigned long last_migrated_pfn;
   2297	const bool sync = cc->mode != MIGRATE_ASYNC;
   2298	bool update_cached;
   2299	unsigned int nr_succeeded = 0;
   2300
   2301	/*
   2302	 * These counters track activities during zone compaction.  Initialize
   2303	 * them before compacting a new zone.
   2304	 */
   2305	cc->total_migrate_scanned = 0;
   2306	cc->total_free_scanned = 0;
   2307	cc->nr_migratepages = 0;
   2308	cc->nr_freepages = 0;
   2309	INIT_LIST_HEAD(&cc->freepages);
   2310	INIT_LIST_HEAD(&cc->migratepages);
   2311
   2312	cc->migratetype = gfp_migratetype(cc->gfp_mask);
   2313	ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
   2314							cc->highest_zoneidx);
   2315	/* Compaction is likely to fail */
   2316	if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
   2317		return ret;
   2318
   2319	/* huh, compaction_suitable is returning something unexpected */
   2320	VM_BUG_ON(ret != COMPACT_CONTINUE);
   2321
   2322	/*
   2323	 * Clear pageblock skip if there were failures recently and compaction
   2324	 * is about to be retried after being deferred.
   2325	 */
   2326	if (compaction_restarting(cc->zone, cc->order))
   2327		__reset_isolation_suitable(cc->zone);
   2328
   2329	/*
    2330	 * Set up to move all movable pages to the end of the zone. Use cached
   2331	 * information on where the scanners should start (unless we explicitly
   2332	 * want to compact the whole zone), but check that it is initialised
   2333	 * by ensuring the values are within zone boundaries.
   2334	 */
   2335	cc->fast_start_pfn = 0;
   2336	if (cc->whole_zone) {
   2337		cc->migrate_pfn = start_pfn;
   2338		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
   2339	} else {
   2340		cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync];
   2341		cc->free_pfn = cc->zone->compact_cached_free_pfn;
   2342		if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
   2343			cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
   2344			cc->zone->compact_cached_free_pfn = cc->free_pfn;
   2345		}
   2346		if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
   2347			cc->migrate_pfn = start_pfn;
   2348			cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
   2349			cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
   2350		}
   2351
   2352		if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn)
   2353			cc->whole_zone = true;
   2354	}
   2355
   2356	last_migrated_pfn = 0;
   2357
   2358	/*
   2359	 * Migrate has separate cached PFNs for ASYNC and SYNC* migration on
   2360	 * the basis that some migrations will fail in ASYNC mode. However,
   2361	 * if the cached PFNs match and pageblocks are skipped due to having
   2362	 * no isolation candidates, then the sync state does not matter.
   2363	 * Until a pageblock with isolation candidates is found, keep the
   2364	 * cached PFNs in sync to avoid revisiting the same blocks.
   2365	 */
   2366	update_cached = !sync &&
   2367		cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];
   2368
   2369	trace_mm_compaction_begin(cc, start_pfn, end_pfn, sync);
   2370
   2371	/* lru_add_drain_all could be expensive with involving other CPUs */
   2372	lru_add_drain();
   2373
   2374	while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
   2375		int err;
   2376		unsigned long iteration_start_pfn = cc->migrate_pfn;
   2377
   2378		/*
   2379		 * Avoid multiple rescans which can happen if a page cannot be
   2380		 * isolated (dirty/writeback in async mode) or if the migrated
   2381		 * pages are being allocated before the pageblock is cleared.
   2382		 * The first rescan will capture the entire pageblock for
   2383		 * migration. If it fails, it'll be marked skip and scanning
   2384		 * will proceed as normal.
   2385		 */
   2386		cc->rescan = false;
   2387		if (pageblock_start_pfn(last_migrated_pfn) ==
   2388		    pageblock_start_pfn(iteration_start_pfn)) {
   2389			cc->rescan = true;
   2390		}
   2391
   2392		switch (isolate_migratepages(cc)) {
   2393		case ISOLATE_ABORT:
   2394			ret = COMPACT_CONTENDED;
   2395			putback_movable_pages(&cc->migratepages);
   2396			cc->nr_migratepages = 0;
   2397			goto out;
   2398		case ISOLATE_NONE:
   2399			if (update_cached) {
   2400				cc->zone->compact_cached_migrate_pfn[1] =
   2401					cc->zone->compact_cached_migrate_pfn[0];
   2402			}
   2403
   2404			/*
   2405			 * We haven't isolated and migrated anything, but
   2406			 * there might still be unflushed migrations from
   2407			 * previous cc->order aligned block.
   2408			 */
   2409			goto check_drain;
   2410		case ISOLATE_SUCCESS:
   2411			update_cached = false;
   2412			last_migrated_pfn = iteration_start_pfn;
   2413		}
   2414
   2415		err = migrate_pages(&cc->migratepages, compaction_alloc,
   2416				compaction_free, (unsigned long)cc, cc->mode,
   2417				MR_COMPACTION, &nr_succeeded);
   2418
   2419		trace_mm_compaction_migratepages(cc, nr_succeeded);
   2420
   2421		/* All pages were either migrated or will be released */
   2422		cc->nr_migratepages = 0;
   2423		if (err) {
   2424			putback_movable_pages(&cc->migratepages);
   2425			/*
   2426			 * migrate_pages() may return -ENOMEM when scanners meet
   2427			 * and we want compact_finished() to detect it
   2428			 */
   2429			if (err == -ENOMEM && !compact_scanners_met(cc)) {
   2430				ret = COMPACT_CONTENDED;
   2431				goto out;
   2432			}
   2433			/*
   2434			 * We failed to migrate at least one page in the current
   2435			 * order-aligned block, so skip the rest of it.
   2436			 */
   2437			if (cc->direct_compaction &&
   2438						(cc->mode == MIGRATE_ASYNC)) {
   2439				cc->migrate_pfn = block_end_pfn(
   2440						cc->migrate_pfn - 1, cc->order);
   2441				/* Draining pcplists is useless in this case */
   2442				last_migrated_pfn = 0;
   2443			}
   2444		}
   2445
   2446check_drain:
   2447		/*
   2448		 * Has the migration scanner moved away from the previous
   2449		 * cc->order aligned block where we migrated from? If yes,
   2450		 * flush the pages that were freed, so that they can merge and
   2451		 * compact_finished() can detect immediately if allocation
   2452		 * would succeed.
   2453		 */
   2454		if (cc->order > 0 && last_migrated_pfn) {
   2455			unsigned long current_block_start =
   2456				block_start_pfn(cc->migrate_pfn, cc->order);
   2457
   2458			if (last_migrated_pfn < current_block_start) {
   2459				lru_add_drain_cpu_zone(cc->zone);
   2460				/* No more flushing until we migrate again */
   2461				last_migrated_pfn = 0;
   2462			}
   2463		}
   2464
   2465		/* Stop if a page has been captured */
   2466		if (capc && capc->page) {
   2467			ret = COMPACT_SUCCESS;
   2468			break;
   2469		}
   2470	}
   2471
   2472out:
   2473	/*
   2474	 * Release free pages and update where the free scanner should restart,
   2475	 * so we don't leave any returned pages behind in the next attempt.
   2476	 */
   2477	if (cc->nr_freepages > 0) {
   2478		unsigned long free_pfn = release_freepages(&cc->freepages);
   2479
   2480		cc->nr_freepages = 0;
   2481		VM_BUG_ON(free_pfn == 0);
   2482		/* The cached pfn is always the first in a pageblock */
   2483		free_pfn = pageblock_start_pfn(free_pfn);
   2484		/*
   2485		 * Only go back, not forward. The cached pfn might have been
   2486		 * already reset to zone end in compact_finished()
   2487		 */
   2488		if (free_pfn > cc->zone->compact_cached_free_pfn)
   2489			cc->zone->compact_cached_free_pfn = free_pfn;
   2490	}
   2491
   2492	count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
   2493	count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);
   2494
   2495	trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret);
   2496
   2497	return ret;
   2498}
   2499
   2500static enum compact_result compact_zone_order(struct zone *zone, int order,
   2501		gfp_t gfp_mask, enum compact_priority prio,
   2502		unsigned int alloc_flags, int highest_zoneidx,
   2503		struct page **capture)
   2504{
   2505	enum compact_result ret;
   2506	struct compact_control cc = {
   2507		.order = order,
   2508		.search_order = order,
   2509		.gfp_mask = gfp_mask,
   2510		.zone = zone,
   2511		.mode = (prio == COMPACT_PRIO_ASYNC) ?
   2512					MIGRATE_ASYNC :	MIGRATE_SYNC_LIGHT,
   2513		.alloc_flags = alloc_flags,
   2514		.highest_zoneidx = highest_zoneidx,
   2515		.direct_compaction = true,
   2516		.whole_zone = (prio == MIN_COMPACT_PRIORITY),
   2517		.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
   2518		.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
   2519	};
   2520	struct capture_control capc = {
   2521		.cc = &cc,
   2522		.page = NULL,
   2523	};
   2524
   2525	/*
   2526	 * Make sure the structs are really initialized before we expose the
   2527	 * capture control, in case we are interrupted and the interrupt handler
   2528	 * frees a page.
   2529	 */
   2530	barrier();
   2531	WRITE_ONCE(current->capture_control, &capc);
   2532
   2533	ret = compact_zone(&cc, &capc);
   2534
   2535	VM_BUG_ON(!list_empty(&cc.freepages));
   2536	VM_BUG_ON(!list_empty(&cc.migratepages));
   2537
   2538	/*
   2539	 * Make sure we hide capture control first before we read the captured
   2540	 * page pointer, otherwise an interrupt could free and capture a page
   2541	 * and we would leak it.
   2542	 */
   2543	WRITE_ONCE(current->capture_control, NULL);
   2544	*capture = READ_ONCE(capc.page);
   2545	/*
   2546	 * Technically, it is also possible that compaction is skipped but
    2547	 * the page is still captured out of luck (an IRQ came and freed the page).
    2548	 * Returning COMPACT_SUCCESS in such cases helps in properly accounting
    2549	 * for the COMPACT[STALL|FAIL] events when compaction is skipped.
   2550	 */
   2551	if (*capture)
   2552		ret = COMPACT_SUCCESS;
   2553
   2554	return ret;
   2555}
   2556
   2557int sysctl_extfrag_threshold = 500;
   2558
   2559/**
   2560 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
   2561 * @gfp_mask: The GFP mask of the current allocation
   2562 * @order: The order of the current allocation
   2563 * @alloc_flags: The allocation flags of the current allocation
   2564 * @ac: The context of current allocation
   2565 * @prio: Determines how hard direct compaction should try to succeed
   2566 * @capture: Pointer to free page created by compaction will be stored here
   2567 *
   2568 * This is the main entry point for direct page compaction.
   2569 */
   2570enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
   2571		unsigned int alloc_flags, const struct alloc_context *ac,
   2572		enum compact_priority prio, struct page **capture)
   2573{
   2574	int may_perform_io = (__force int)(gfp_mask & __GFP_IO);
   2575	struct zoneref *z;
   2576	struct zone *zone;
   2577	enum compact_result rc = COMPACT_SKIPPED;
   2578
   2579	/*
   2580	 * Check if the GFP flags allow compaction - GFP_NOIO is really
   2581	 * tricky context because the migration might require IO
   2582	 */
   2583	if (!may_perform_io)
   2584		return COMPACT_SKIPPED;
   2585
   2586	trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
   2587
   2588	/* Compact each zone in the list */
   2589	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
   2590					ac->highest_zoneidx, ac->nodemask) {
   2591		enum compact_result status;
   2592
   2593		if (prio > MIN_COMPACT_PRIORITY
   2594					&& compaction_deferred(zone, order)) {
   2595			rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
   2596			continue;
   2597		}
   2598
   2599		status = compact_zone_order(zone, order, gfp_mask, prio,
   2600				alloc_flags, ac->highest_zoneidx, capture);
   2601		rc = max(status, rc);
   2602
   2603		/* The allocation should succeed, stop compacting */
   2604		if (status == COMPACT_SUCCESS) {
   2605			/*
   2606			 * We think the allocation will succeed in this zone,
   2607			 * but it is not certain, hence the false. The caller
   2608			 * will repeat this with true if allocation indeed
   2609			 * succeeds in this zone.
   2610			 */
   2611			compaction_defer_reset(zone, order, false);
   2612
   2613			break;
   2614		}
   2615
   2616		if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
   2617					status == COMPACT_PARTIAL_SKIPPED))
   2618			/*
   2619			 * We think that allocation won't succeed in this zone
   2620			 * so we defer compaction there. If it ends up
   2621			 * succeeding after all, it will be reset.
   2622			 */
   2623			defer_compaction(zone, order);
   2624
   2625		/*
   2626		 * We might have stopped compacting due to need_resched() in
   2627		 * async compaction, or due to a fatal signal detected. In that
   2628		 * case do not try further zones
   2629		 */
   2630		if ((prio == COMPACT_PRIO_ASYNC && need_resched())
   2631					|| fatal_signal_pending(current))
   2632			break;
   2633	}
   2634
   2635	return rc;
   2636}
   2637
   2638/*
    2639 * Compact all zones within a node until each zone's fragmentation score
    2640 * falls within the proactive compaction thresholds (as determined by the
   2641 * proactiveness tunable).
   2642 *
   2643 * It is possible that the function returns before reaching score targets
   2644 * due to various back-off conditions, such as, contention on per-node or
   2645 * per-zone locks.
   2646 */
   2647static void proactive_compact_node(pg_data_t *pgdat)
   2648{
   2649	int zoneid;
   2650	struct zone *zone;
   2651	struct compact_control cc = {
   2652		.order = -1,
   2653		.mode = MIGRATE_SYNC_LIGHT,
   2654		.ignore_skip_hint = true,
   2655		.whole_zone = true,
   2656		.gfp_mask = GFP_KERNEL,
   2657		.proactive_compaction = true,
   2658	};
   2659
   2660	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
   2661		zone = &pgdat->node_zones[zoneid];
   2662		if (!populated_zone(zone))
   2663			continue;
   2664
   2665		cc.zone = zone;
   2666
   2667		compact_zone(&cc, NULL);
   2668
   2669		VM_BUG_ON(!list_empty(&cc.freepages));
   2670		VM_BUG_ON(!list_empty(&cc.migratepages));
   2671	}
   2672}
   2673
   2674/* Compact all zones within a node */
   2675static void compact_node(int nid)
   2676{
   2677	pg_data_t *pgdat = NODE_DATA(nid);
   2678	int zoneid;
   2679	struct zone *zone;
   2680	struct compact_control cc = {
   2681		.order = -1,
   2682		.mode = MIGRATE_SYNC,
   2683		.ignore_skip_hint = true,
   2684		.whole_zone = true,
   2685		.gfp_mask = GFP_KERNEL,
   2686	};
   2687
   2688
   2689	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
   2690
   2691		zone = &pgdat->node_zones[zoneid];
   2692		if (!populated_zone(zone))
   2693			continue;
   2694
   2695		cc.zone = zone;
   2696
   2697		compact_zone(&cc, NULL);
   2698
   2699		VM_BUG_ON(!list_empty(&cc.freepages));
   2700		VM_BUG_ON(!list_empty(&cc.migratepages));
   2701	}
   2702}
   2703
   2704/* Compact all nodes in the system */
   2705static void compact_nodes(void)
   2706{
   2707	int nid;
   2708
   2709	/* Flush pending updates to the LRU lists */
   2710	lru_add_drain_all();
   2711
   2712	for_each_online_node(nid)
   2713		compact_node(nid);
   2714}
   2715
   2716/*
   2717 * Tunable for proactive compaction. It determines how
   2718 * aggressively the kernel should compact memory in the
   2719 * background. It takes values in the range [0, 100].
   2720 */
   2721unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
   2722
   2723int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
   2724		void *buffer, size_t *length, loff_t *ppos)
   2725{
   2726	int rc, nid;
   2727
   2728	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
   2729	if (rc)
   2730		return rc;
   2731
   2732	if (write && sysctl_compaction_proactiveness) {
   2733		for_each_online_node(nid) {
   2734			pg_data_t *pgdat = NODE_DATA(nid);
   2735
   2736			if (pgdat->proactive_compact_trigger)
   2737				continue;
   2738
   2739			pgdat->proactive_compact_trigger = true;
   2740			wake_up_interruptible(&pgdat->kcompactd_wait);
   2741		}
   2742	}
   2743
   2744	return 0;
   2745}
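
        /*
         * Illustrative note (not part of the original source): this handler
         * is registered elsewhere for the vm.compaction_proactiveness
         * sysctl, so writing to it, e.g.
         *
         *	echo 30 > /proc/sys/vm/compaction_proactiveness
         *
         * both updates the tunable and immediately wakes kcompactd on every
         * online node via the proactive_compact_trigger flag above.
         */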
   2746
   2747/*
   2748 * This is the entry point for compacting all nodes via
   2749 * /proc/sys/vm/compact_memory
   2750 */
   2751int sysctl_compaction_handler(struct ctl_table *table, int write,
   2752			void *buffer, size_t *length, loff_t *ppos)
   2753{
   2754	if (write)
   2755		compact_nodes();
   2756
   2757	return 0;
   2758}
   2759
   2760#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
   2761static ssize_t compact_store(struct device *dev,
   2762			     struct device_attribute *attr,
   2763			     const char *buf, size_t count)
   2764{
   2765	int nid = dev->id;
   2766
   2767	if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
   2768		/* Flush pending updates to the LRU lists */
   2769		lru_add_drain_all();
   2770
   2771		compact_node(nid);
   2772	}
   2773
   2774	return count;
   2775}
   2776static DEVICE_ATTR_WO(compact);
   2777
   2778int compaction_register_node(struct node *node)
   2779{
   2780	return device_create_file(&node->dev, &dev_attr_compact);
   2781}
   2782
   2783void compaction_unregister_node(struct node *node)
   2784{
   2785	return device_remove_file(&node->dev, &dev_attr_compact);
   2786}
   2787#endif /* CONFIG_SYSFS && CONFIG_NUMA */
   2788
   2789static inline bool kcompactd_work_requested(pg_data_t *pgdat)
   2790{
   2791	return pgdat->kcompactd_max_order > 0 || kthread_should_stop() ||
   2792		pgdat->proactive_compact_trigger;
   2793}
   2794
   2795static bool kcompactd_node_suitable(pg_data_t *pgdat)
   2796{
   2797	int zoneid;
   2798	struct zone *zone;
   2799	enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
   2800
   2801	for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
   2802		zone = &pgdat->node_zones[zoneid];
   2803
   2804		if (!populated_zone(zone))
   2805			continue;
   2806
   2807		if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
   2808					highest_zoneidx) == COMPACT_CONTINUE)
   2809			return true;
   2810	}
   2811
   2812	return false;
   2813}
   2814
   2815static void kcompactd_do_work(pg_data_t *pgdat)
   2816{
   2817	/*
   2818	 * With no special task, compact all zones so that a page of requested
   2819	 * order is allocatable.
   2820	 */
   2821	int zoneid;
   2822	struct zone *zone;
   2823	struct compact_control cc = {
   2824		.order = pgdat->kcompactd_max_order,
   2825		.search_order = pgdat->kcompactd_max_order,
   2826		.highest_zoneidx = pgdat->kcompactd_highest_zoneidx,
   2827		.mode = MIGRATE_SYNC_LIGHT,
   2828		.ignore_skip_hint = false,
   2829		.gfp_mask = GFP_KERNEL,
   2830	};
   2831	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
   2832							cc.highest_zoneidx);
   2833	count_compact_event(KCOMPACTD_WAKE);
   2834
   2835	for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) {
   2836		int status;
   2837
   2838		zone = &pgdat->node_zones[zoneid];
   2839		if (!populated_zone(zone))
   2840			continue;
   2841
   2842		if (compaction_deferred(zone, cc.order))
   2843			continue;
   2844
   2845		if (compaction_suitable(zone, cc.order, 0, zoneid) !=
   2846							COMPACT_CONTINUE)
   2847			continue;
   2848
   2849		if (kthread_should_stop())
   2850			return;
   2851
   2852		cc.zone = zone;
   2853		status = compact_zone(&cc, NULL);
   2854
   2855		if (status == COMPACT_SUCCESS) {
   2856			compaction_defer_reset(zone, cc.order, false);
   2857		} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
   2858			/*
   2859			 * Buddy pages may become stranded on pcps that could
   2860			 * otherwise coalesce on the zone's free area for
   2861			 * order >= cc.order.  This is ratelimited by the
   2862			 * upcoming deferral.
   2863			 */
   2864			drain_all_pages(zone);
   2865
   2866			/*
   2867			 * We use sync migration mode here, so we defer like
   2868			 * sync direct compaction does.
   2869			 */
   2870			defer_compaction(zone, cc.order);
   2871		}
   2872
   2873		count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
   2874				     cc.total_migrate_scanned);
   2875		count_compact_events(KCOMPACTD_FREE_SCANNED,
   2876				     cc.total_free_scanned);
   2877
   2878		VM_BUG_ON(!list_empty(&cc.freepages));
   2879		VM_BUG_ON(!list_empty(&cc.migratepages));
   2880	}
   2881
   2882	/*
   2883	 * Regardless of success, we are done until woken up next. But remember
   2884	 * the requested order/highest_zoneidx in case it was higher/tighter
   2885	 * than our current ones
   2886	 */
   2887	if (pgdat->kcompactd_max_order <= cc.order)
   2888		pgdat->kcompactd_max_order = 0;
   2889	if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx)
   2890		pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
   2891}
   2892
   2893void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
   2894{
   2895	if (!order)
   2896		return;
   2897
   2898	if (pgdat->kcompactd_max_order < order)
   2899		pgdat->kcompactd_max_order = order;
   2900
   2901	if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx)
   2902		pgdat->kcompactd_highest_zoneidx = highest_zoneidx;
   2903
   2904	/*
   2905	 * Pairs with implicit barrier in wait_event_freezable()
   2906	 * such that wakeups are not missed.
   2907	 */
   2908	if (!wq_has_sleeper(&pgdat->kcompactd_wait))
   2909		return;
   2910
   2911	if (!kcompactd_node_suitable(pgdat))
   2912		return;
   2913
   2914	trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
   2915							highest_zoneidx);
   2916	wake_up_interruptible(&pgdat->kcompactd_wait);
   2917}
   2918
   2919/*
   2920 * The background compaction daemon, started as a kernel thread
   2921 * from the init process.
   2922 */
   2923static int kcompactd(void *p)
   2924{
   2925	pg_data_t *pgdat = (pg_data_t *)p;
   2926	struct task_struct *tsk = current;
   2927	long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
   2928	long timeout = default_timeout;
   2929
   2930	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
   2931
   2932	if (!cpumask_empty(cpumask))
   2933		set_cpus_allowed_ptr(tsk, cpumask);
   2934
   2935	set_freezable();
   2936
   2937	pgdat->kcompactd_max_order = 0;
   2938	pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
   2939
   2940	while (!kthread_should_stop()) {
   2941		unsigned long pflags;
   2942
   2943		/*
   2944		 * Avoid the unnecessary wakeup for proactive compaction
   2945		 * when it is disabled.
   2946		 */
   2947		if (!sysctl_compaction_proactiveness)
   2948			timeout = MAX_SCHEDULE_TIMEOUT;
   2949		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
   2950		if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
   2951			kcompactd_work_requested(pgdat), timeout) &&
   2952			!pgdat->proactive_compact_trigger) {
   2953
   2954			psi_memstall_enter(&pflags);
   2955			kcompactd_do_work(pgdat);
   2956			psi_memstall_leave(&pflags);
   2957			/*
    2958			 * Reset the timeout value. The defer timeout from
    2959			 * proactive compaction is lost here but that is fine:
    2960			 * if the zone has changed substantially, then carrying
    2961			 * on with the previous defer interval is not useful
    2962			 * anyway.
   2963			 */
   2964			timeout = default_timeout;
   2965			continue;
   2966		}
   2967
   2968		/*
   2969		 * Start the proactive work with default timeout. Based
   2970		 * on the fragmentation score, this timeout is updated.
   2971		 */
   2972		timeout = default_timeout;
   2973		if (should_proactive_compact_node(pgdat)) {
   2974			unsigned int prev_score, score;
   2975
   2976			prev_score = fragmentation_score_node(pgdat);
   2977			proactive_compact_node(pgdat);
   2978			score = fragmentation_score_node(pgdat);
   2979			/*
   2980			 * Defer proactive compaction if the fragmentation
   2981			 * score did not go down i.e. no progress made.
   2982			 */
   2983			if (unlikely(score >= prev_score))
   2984				timeout =
   2985				   default_timeout << COMPACT_MAX_DEFER_SHIFT;
   2986		}
   2987		if (unlikely(pgdat->proactive_compact_trigger))
   2988			pgdat->proactive_compact_trigger = false;
   2989	}
   2990
   2991	return 0;
   2992}
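
        /*
         * Illustrative note (not part of the original source): the
         * proactive back-off above in concrete terms, assuming
         * COMPACT_MAX_DEFER_SHIFT is 6:
         *
         *	default_timeout = msecs_to_jiffies(500)   (check every 500ms)
         *	no progress  -> timeout = 500ms << 6 = 32s until the next try
         *	proactiveness == 0 -> timeout = MAX_SCHEDULE_TIMEOUT
         *	                      (sleep until explicitly woken)
         *
         * A wakeup via wakeup_kcompactd() or the proactiveness sysctl cuts
         * any of these sleeps short.
         */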
   2993
   2994/*
   2995 * This kcompactd start function will be called by init and node-hot-add.
    2996 * On node-hot-add, kcompactd will be moved to the proper cpus if cpus are hot-added.
   2997 */
   2998void kcompactd_run(int nid)
   2999{
   3000	pg_data_t *pgdat = NODE_DATA(nid);
   3001
   3002	if (pgdat->kcompactd)
   3003		return;
   3004
   3005	pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
   3006	if (IS_ERR(pgdat->kcompactd)) {
   3007		pr_err("Failed to start kcompactd on node %d\n", nid);
   3008		pgdat->kcompactd = NULL;
   3009	}
   3010}
   3011
   3012/*
   3013 * Called by memory hotplug when all memory in a node is offlined. Caller must
   3014 * hold mem_hotplug_begin/end().
   3015 */
   3016void kcompactd_stop(int nid)
   3017{
   3018	struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
   3019
   3020	if (kcompactd) {
   3021		kthread_stop(kcompactd);
   3022		NODE_DATA(nid)->kcompactd = NULL;
   3023	}
   3024}
   3025
   3026/*
   3027 * It's optimal to keep kcompactd on the same CPUs as their memory, but
   3028 * not required for correctness. So if the last cpu in a node goes
   3029 * away, we get changed to run anywhere: as the first one comes back,
   3030 * restore their cpu bindings.
   3031 */
   3032static int kcompactd_cpu_online(unsigned int cpu)
   3033{
   3034	int nid;
   3035
   3036	for_each_node_state(nid, N_MEMORY) {
   3037		pg_data_t *pgdat = NODE_DATA(nid);
   3038		const struct cpumask *mask;
   3039
   3040		mask = cpumask_of_node(pgdat->node_id);
   3041
   3042		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
   3043			/* One of our CPUs online: restore mask */
   3044			if (pgdat->kcompactd)
   3045				set_cpus_allowed_ptr(pgdat->kcompactd, mask);
   3046	}
   3047	return 0;
   3048}
   3049
   3050static int __init kcompactd_init(void)
   3051{
   3052	int nid;
   3053	int ret;
   3054
   3055	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
   3056					"mm/compaction:online",
   3057					kcompactd_cpu_online, NULL);
   3058	if (ret < 0) {
   3059		pr_err("kcompactd: failed to register hotplug callbacks.\n");
   3060		return ret;
   3061	}
   3062
   3063	for_each_node_state(nid, N_MEMORY)
   3064		kcompactd_run(nid);
   3065	return 0;
   3066}
   3067subsys_initcall(kcompactd_init)
   3068
   3069#endif /* CONFIG_COMPACTION */