cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

z3fold.c (47137B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * z3fold.c
      4 *
      5 * Author: Vitaly Wool <vitaly.wool@konsulko.com>
      6 * Copyright (C) 2016, Sony Mobile Communications Inc.
      7 *
      8 * This implementation is based on zbud written by Seth Jennings.
      9 *
     10 * z3fold is a special purpose allocator for storing compressed pages. It
     11 * can store up to three compressed pages per page which improves the
     12 * compression ratio of zbud while retaining its main concepts (e.g. always
     13 * storing an integral number of objects per page) and simplicity.
     14 * It still has simple and deterministic reclaim properties that make it
     15 * preferable to a higher density approach (with no requirement on integral
     16 * number of objects per page) when reclaim is used.
     17 *
     18 * As in zbud, pages are divided into "chunks".  The size of the chunks is
     19 * fixed at compile time and is determined by NCHUNKS_ORDER below.
     20 *
     21 * z3fold doesn't export any API and is meant to be used via zpool API.
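 *
 * A minimal usage sketch through the generic zpool layer (illustrative only,
 * not part of this file; zswap is the in-tree consumer, and the zpool calls
 * and the caller-defined my_zpool_ops below are assumptions, not code from
 * this tree):
 *
 *	struct zpool *zp = zpool_create_pool("z3fold", "pool", GFP_KERNEL,
 *					     &my_zpool_ops);
 *	unsigned long handle;
 *
 *	if (zp && !zpool_malloc(zp, len, GFP_KERNEL, &handle)) {
 *		void *dst = zpool_map_handle(zp, handle, ZPOOL_MM_WO);
 *
 *		memcpy(dst, src, len);	(store the compressed object)
 *		zpool_unmap_handle(zp, handle);
 *		(... later ...)
 *		zpool_free(zp, handle);
 *	}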
     22 */
     23
     24#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
     25
     26#include <linux/atomic.h>
     27#include <linux/sched.h>
     28#include <linux/cpumask.h>
     29#include <linux/list.h>
     30#include <linux/mm.h>
     31#include <linux/module.h>
     32#include <linux/page-flags.h>
     33#include <linux/migrate.h>
     34#include <linux/node.h>
     35#include <linux/compaction.h>
     36#include <linux/percpu.h>
     37#include <linux/mount.h>
     38#include <linux/pseudo_fs.h>
     39#include <linux/fs.h>
     40#include <linux/preempt.h>
     41#include <linux/workqueue.h>
     42#include <linux/slab.h>
     43#include <linux/spinlock.h>
     44#include <linux/zpool.h>
     45#include <linux/magic.h>
     46#include <linux/kmemleak.h>
     47
     48/*
     49 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
     50 * adjusting internal fragmentation.  It also determines the number of
     51 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
     52 * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
     53 * at the beginning of an allocated page are occupied by the z3fold header, so
     54 * NCHUNKS works out to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y), which is
     55 * the maximum number of free chunks in a z3fold page; accordingly, there will
     56 * be 63 (or 62, respectively) freelists per pool.
     57 */
     58#define NCHUNKS_ORDER	6
     59
     60#define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
     61#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
     62#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
     63#define ZHDR_CHUNKS	(ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
     64#define TOTAL_CHUNKS	(PAGE_SIZE >> CHUNK_SHIFT)
     65#define NCHUNKS		(TOTAL_CHUNKS - ZHDR_CHUNKS)
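
/*
 * Worked example, assuming a 4 KiB page: CHUNK_SHIFT = 12 - 6 = 6, so
 * CHUNK_SIZE is 64 bytes and TOTAL_CHUNKS is 64.  The header occupies the
 * first ZHDR_CHUNKS chunks (sizeof(struct z3fold_header) rounded up to a
 * multiple of CHUNK_SIZE) and the remaining NCHUNKS chunks hold the buddies.
 */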
     66
     67#define BUDDY_MASK	(0x3)
     68#define BUDDY_SHIFT	2
     69#define SLOTS_ALIGN	(0x40)
     70
     71/*****************
     72 * Structures
     73*****************/
     74struct z3fold_pool;
     75struct z3fold_ops {
     76	int (*evict)(struct z3fold_pool *pool, unsigned long handle);
     77};
     78
     79enum buddy {
     80	HEADLESS = 0,
     81	FIRST,
     82	MIDDLE,
     83	LAST,
     84	BUDDIES_MAX = LAST
     85};
     86
     87struct z3fold_buddy_slots {
     88	/*
     89	 * we are using BUDDY_MASK in handle_to_buddy etc. so there should
     90	 * be enough slots to hold all possible variants
     91	 */
     92	unsigned long slot[BUDDY_MASK + 1];
     93	unsigned long pool; /* back link */
     94	rwlock_t lock;
     95};
     96#define HANDLE_FLAG_MASK	(0x03)
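
/*
 * struct z3fold_buddy_slots is allocated from a kmem cache with SLOTS_ALIGN
 * alignment, so a handle (the address of one of its slot[] entries) can be
 * converted back to the slots structure by masking off the low bits (see
 * handle_to_slots()), and the low bits of the back link can carry flags
 * (see HANDLE_FLAG_MASK).
 */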
     97
     98/*
     99 * struct z3fold_header - z3fold page metadata occupying first chunks of each
    100 *			z3fold page, except for HEADLESS pages
    101 * @buddy:		links the z3fold page into the relevant list in the
    102 *			pool
    103 * @page_lock:		per-page lock
    104 * @refcount:		reference count for the z3fold page
    105 * @work:		work_struct for page layout optimization
    106 * @slots:		pointer to the structure holding buddy slots
    107 * @pool:		pointer to the containing pool
    108 * @cpu:		CPU which this page "belongs" to
    109 * @first_chunks:	the size of the first buddy in chunks, 0 if free
    110 * @middle_chunks:	the size of the middle buddy in chunks, 0 if free
    111 * @last_chunks:	the size of the last buddy in chunks, 0 if free
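 * @start_middle:	index of the first chunk occupied by the middle buddy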
    112 * @first_num:		the starting number (for the first handle)
    113 * @mapped_count:	the number of objects currently mapped
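 * @foreign_handles:	number of handles to this page stored in another
 *			page's slots structure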
    114 */
    115struct z3fold_header {
    116	struct list_head buddy;
    117	spinlock_t page_lock;
    118	struct kref refcount;
    119	struct work_struct work;
    120	struct z3fold_buddy_slots *slots;
    121	struct z3fold_pool *pool;
    122	short cpu;
    123	unsigned short first_chunks;
    124	unsigned short middle_chunks;
    125	unsigned short last_chunks;
    126	unsigned short start_middle;
    127	unsigned short first_num:2;
    128	unsigned short mapped_count:2;
    129	unsigned short foreign_handles:2;
    130};
    131
    132/**
    133 * struct z3fold_pool - stores metadata for each z3fold pool
    134 * @name:	pool name
    135 * @lock:	protects pool unbuddied/lru lists
    136 * @stale_lock:	protects pool stale page list
    137 * @unbuddied:	per-cpu array of lists tracking z3fold pages that contain at
    138 *		most two buddies; the list each z3fold page is added to depends on
    139 *		the size of its free region.
    140 * @lru:	list tracking the z3fold pages in LRU order by most recently
    141 *		added buddy.
    142 * @stale:	list of pages marked for freeing
    143 * @pages_nr:	number of z3fold pages in the pool.
    144 * @c_handle:	cache for z3fold_buddy_slots allocation
    145 * @ops:	pointer to a structure of user defined operations specified at
    146 *		pool creation time.
    147 * @zpool:	zpool driver
    148 * @zpool_ops:	zpool operations structure with an evict callback
    149 * @compact_wq:	workqueue for page layout background optimization
    150 * @release_wq:	workqueue for safe page release
    151 * @work:	work_struct for safe page release
    152 * @inode:	inode for z3fold pseudo filesystem
    153 *
    154 * This structure is allocated at pool creation time and maintains metadata
    155 * pertaining to a particular z3fold pool.
    156 */
    157struct z3fold_pool {
    158	const char *name;
    159	spinlock_t lock;
    160	spinlock_t stale_lock;
    161	struct list_head *unbuddied;
    162	struct list_head lru;
    163	struct list_head stale;
    164	atomic64_t pages_nr;
    165	struct kmem_cache *c_handle;
    166	const struct z3fold_ops *ops;
    167	struct zpool *zpool;
    168	const struct zpool_ops *zpool_ops;
    169	struct workqueue_struct *compact_wq;
    170	struct workqueue_struct *release_wq;
    171	struct work_struct work;
    172	struct inode *inode;
    173};
    174
    175/*
    176 * Internal z3fold page flags
    177 */
    178enum z3fold_page_flags {
    179	PAGE_HEADLESS = 0,
    180	MIDDLE_CHUNK_MAPPED,
    181	NEEDS_COMPACTING,
    182	PAGE_STALE,
    183	PAGE_CLAIMED, /* by either reclaim or free */
    184	PAGE_MIGRATED, /* page is migrated and soon to be released */
    185};
    186
    187/*
    188 * handle flags, go under HANDLE_FLAG_MASK
    189 */
    190enum z3fold_handle_flags {
    191	HANDLES_NOFREE = 0,
    192};
    193
    194/*
    195 * Forward declarations
    196 */
    197static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
    198static void compact_page_work(struct work_struct *w);
    199
    200/*****************
    201 * Helpers
    202*****************/
    203
    204/* Converts an allocation size in bytes to size in z3fold chunks */
    205static int size_to_chunks(size_t size)
    206{
    207	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
    208}
    209
    210#define for_each_unbuddied_list(_iter, _begin) \
    211	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
    212
    213static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
    214							gfp_t gfp)
    215{
    216	struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle,
    217							     gfp);
    218
    219	if (slots) {
    220		/* It will be freed separately in free_handle(). */
    221		kmemleak_not_leak(slots);
    222		slots->pool = (unsigned long)pool;
    223		rwlock_init(&slots->lock);
    224	}
    225
    226	return slots;
    227}
    228
    229static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
    230{
    231	return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
    232}
    233
    234static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
    235{
    236	return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
    237}
    238
    239/* Lock a z3fold page */
    240static inline void z3fold_page_lock(struct z3fold_header *zhdr)
    241{
    242	spin_lock(&zhdr->page_lock);
    243}
    244
    245/* Try to lock a z3fold page */
    246static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
    247{
    248	return spin_trylock(&zhdr->page_lock);
    249}
    250
    251/* Unlock a z3fold page */
    252static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
    253{
    254	spin_unlock(&zhdr->page_lock);
    255}
    256
    257/* return locked z3fold page if it's not headless */
    258static inline struct z3fold_header *get_z3fold_header(unsigned long handle)
    259{
    260	struct z3fold_buddy_slots *slots;
    261	struct z3fold_header *zhdr;
    262	int locked = 0;
    263
    264	if (!(handle & (1 << PAGE_HEADLESS))) {
    265		slots = handle_to_slots(handle);
    266		do {
    267			unsigned long addr;
    268
    269			read_lock(&slots->lock);
    270			addr = *(unsigned long *)handle;
    271			zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
    272			locked = z3fold_page_trylock(zhdr);
    273			read_unlock(&slots->lock);
    274			if (locked) {
    275				struct page *page = virt_to_page(zhdr);
    276
    277				if (!test_bit(PAGE_MIGRATED, &page->private))
    278					break;
    279				z3fold_page_unlock(zhdr);
    280			}
    281			cpu_relax();
    282		} while (true);
    283	} else {
    284		zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
    285	}
    286
    287	return zhdr;
    288}
    289
    290static inline void put_z3fold_header(struct z3fold_header *zhdr)
    291{
    292	struct page *page = virt_to_page(zhdr);
    293
    294	if (!test_bit(PAGE_HEADLESS, &page->private))
    295		z3fold_page_unlock(zhdr);
    296}
    297
    298static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
    299{
    300	struct z3fold_buddy_slots *slots;
    301	int i;
    302	bool is_free;
    303
    304	if (WARN_ON(*(unsigned long *)handle == 0))
    305		return;
    306
    307	slots = handle_to_slots(handle);
    308	write_lock(&slots->lock);
    309	*(unsigned long *)handle = 0;
    310
    311	if (test_bit(HANDLES_NOFREE, &slots->pool)) {
    312		write_unlock(&slots->lock);
    313		return; /* simple case, nothing else to do */
    314	}
    315
    316	if (zhdr->slots != slots)
    317		zhdr->foreign_handles--;
    318
    319	is_free = true;
    320	for (i = 0; i <= BUDDY_MASK; i++) {
    321		if (slots->slot[i]) {
    322			is_free = false;
    323			break;
    324		}
    325	}
    326	write_unlock(&slots->lock);
    327
    328	if (is_free) {
    329		struct z3fold_pool *pool = slots_to_pool(slots);
    330
    331		if (zhdr->slots == slots)
    332			zhdr->slots = NULL;
    333		kmem_cache_free(pool->c_handle, slots);
    334	}
    335}
    336
    337static int z3fold_init_fs_context(struct fs_context *fc)
    338{
    339	return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
    340}
    341
    342static struct file_system_type z3fold_fs = {
    343	.name		= "z3fold",
    344	.init_fs_context = z3fold_init_fs_context,
    345	.kill_sb	= kill_anon_super,
    346};
    347
    348static struct vfsmount *z3fold_mnt;
    349static int __init z3fold_mount(void)
    350{
    351	int ret = 0;
    352
    353	z3fold_mnt = kern_mount(&z3fold_fs);
    354	if (IS_ERR(z3fold_mnt))
    355		ret = PTR_ERR(z3fold_mnt);
    356
    357	return ret;
    358}
    359
    360static void z3fold_unmount(void)
    361{
    362	kern_unmount(z3fold_mnt);
    363}
    364
    365static const struct address_space_operations z3fold_aops;
    366static int z3fold_register_migration(struct z3fold_pool *pool)
    367{
    368	pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
    369	if (IS_ERR(pool->inode)) {
    370		pool->inode = NULL;
    371		return 1;
    372	}
    373
    374	pool->inode->i_mapping->private_data = pool;
    375	pool->inode->i_mapping->a_ops = &z3fold_aops;
    376	return 0;
    377}
    378
    379static void z3fold_unregister_migration(struct z3fold_pool *pool)
    380{
    381	if (pool->inode)
    382		iput(pool->inode);
    383}
    384
    385/* Initializes the z3fold header of a newly allocated z3fold page */
    386static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
    387					struct z3fold_pool *pool, gfp_t gfp)
    388{
    389	struct z3fold_header *zhdr = page_address(page);
    390	struct z3fold_buddy_slots *slots;
    391
    392	INIT_LIST_HEAD(&page->lru);
    393	clear_bit(PAGE_HEADLESS, &page->private);
    394	clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
    395	clear_bit(NEEDS_COMPACTING, &page->private);
    396	clear_bit(PAGE_STALE, &page->private);
    397	clear_bit(PAGE_CLAIMED, &page->private);
    398	clear_bit(PAGE_MIGRATED, &page->private);
    399	if (headless)
    400		return zhdr;
    401
    402	slots = alloc_slots(pool, gfp);
    403	if (!slots)
    404		return NULL;
    405
    406	memset(zhdr, 0, sizeof(*zhdr));
    407	spin_lock_init(&zhdr->page_lock);
    408	kref_init(&zhdr->refcount);
    409	zhdr->cpu = -1;
    410	zhdr->slots = slots;
    411	zhdr->pool = pool;
    412	INIT_LIST_HEAD(&zhdr->buddy);
    413	INIT_WORK(&zhdr->work, compact_page_work);
    414	return zhdr;
    415}
    416
    417/* Resets the struct page fields and frees the page */
    418static void free_z3fold_page(struct page *page, bool headless)
    419{
    420	if (!headless) {
    421		lock_page(page);
    422		__ClearPageMovable(page);
    423		unlock_page(page);
    424	}
    425	__free_page(page);
    426}
    427
    428/* Helper function to build the index */
    429static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
    430{
    431	return (bud + zhdr->first_num) & BUDDY_MASK;
    432}
    433
    434/*
    435 * Encodes the handle of a particular buddy within a z3fold page
    436 * Pool lock should be held as this function accesses first_num
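 *
 * For non-headless buddies the returned handle is the address of an entry in
 * a z3fold_buddy_slots structure; that entry holds the z3fold_header address
 * with the buddy index (via __idx()) in its low bits and, for the LAST buddy,
 * the allocation size in chunks shifted left by BUDDY_SHIFT.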
    437 */
    438static unsigned long __encode_handle(struct z3fold_header *zhdr,
    439				struct z3fold_buddy_slots *slots,
    440				enum buddy bud)
    441{
    442	unsigned long h = (unsigned long)zhdr;
    443	int idx = 0;
    444
    445	/*
    446	 * For a headless page, its handle is its pointer with the extra
    447	 * PAGE_HEADLESS bit set
    448	 */
    449	if (bud == HEADLESS)
    450		return h | (1 << PAGE_HEADLESS);
    451
    452	/* otherwise, return pointer to encoded handle */
    453	idx = __idx(zhdr, bud);
    454	h += idx;
    455	if (bud == LAST)
    456		h |= (zhdr->last_chunks << BUDDY_SHIFT);
    457
    458	write_lock(&slots->lock);
    459	slots->slot[idx] = h;
    460	write_unlock(&slots->lock);
    461	return (unsigned long)&slots->slot[idx];
    462}
    463
    464static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
    465{
    466	return __encode_handle(zhdr, zhdr->slots, bud);
    467}
    468
    469/* only for LAST bud, returns zero otherwise */
    470static unsigned short handle_to_chunks(unsigned long handle)
    471{
    472	struct z3fold_buddy_slots *slots = handle_to_slots(handle);
    473	unsigned long addr;
    474
    475	read_lock(&slots->lock);
    476	addr = *(unsigned long *)handle;
    477	read_unlock(&slots->lock);
    478	return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
    479}
    480
    481/*
    482 * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
     483 *  but that doesn't matter, because the masking will result in the
    484 *  correct buddy number.
    485 */
    486static enum buddy handle_to_buddy(unsigned long handle)
    487{
    488	struct z3fold_header *zhdr;
    489	struct z3fold_buddy_slots *slots = handle_to_slots(handle);
    490	unsigned long addr;
    491
    492	read_lock(&slots->lock);
    493	WARN_ON(handle & (1 << PAGE_HEADLESS));
    494	addr = *(unsigned long *)handle;
    495	read_unlock(&slots->lock);
    496	zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
    497	return (addr - zhdr->first_num) & BUDDY_MASK;
    498}
    499
    500static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
    501{
    502	return zhdr->pool;
    503}
    504
    505static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
    506{
    507	struct page *page = virt_to_page(zhdr);
    508	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
    509
    510	WARN_ON(!list_empty(&zhdr->buddy));
    511	set_bit(PAGE_STALE, &page->private);
    512	clear_bit(NEEDS_COMPACTING, &page->private);
    513	spin_lock(&pool->lock);
    514	if (!list_empty(&page->lru))
    515		list_del_init(&page->lru);
    516	spin_unlock(&pool->lock);
    517
    518	if (locked)
    519		z3fold_page_unlock(zhdr);
    520
    521	spin_lock(&pool->stale_lock);
    522	list_add(&zhdr->buddy, &pool->stale);
    523	queue_work(pool->release_wq, &pool->work);
    524	spin_unlock(&pool->stale_lock);
    525
    526	atomic64_dec(&pool->pages_nr);
    527}
    528
    529static void release_z3fold_page_locked(struct kref *ref)
    530{
    531	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
    532						refcount);
    533	WARN_ON(z3fold_page_trylock(zhdr));
    534	__release_z3fold_page(zhdr, true);
    535}
    536
    537static void release_z3fold_page_locked_list(struct kref *ref)
    538{
    539	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
    540					       refcount);
    541	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
    542
    543	spin_lock(&pool->lock);
    544	list_del_init(&zhdr->buddy);
    545	spin_unlock(&pool->lock);
    546
    547	WARN_ON(z3fold_page_trylock(zhdr));
    548	__release_z3fold_page(zhdr, true);
    549}
    550
    551static void free_pages_work(struct work_struct *w)
    552{
    553	struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
    554
    555	spin_lock(&pool->stale_lock);
    556	while (!list_empty(&pool->stale)) {
    557		struct z3fold_header *zhdr = list_first_entry(&pool->stale,
    558						struct z3fold_header, buddy);
    559		struct page *page = virt_to_page(zhdr);
    560
    561		list_del(&zhdr->buddy);
    562		if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
    563			continue;
    564		spin_unlock(&pool->stale_lock);
    565		cancel_work_sync(&zhdr->work);
    566		free_z3fold_page(page, false);
    567		cond_resched();
    568		spin_lock(&pool->stale_lock);
    569	}
    570	spin_unlock(&pool->stale_lock);
    571}
    572
    573/*
    574 * Returns the number of free chunks in a z3fold page.
    575 * NB: can't be used with HEADLESS pages.
    576 */
    577static int num_free_chunks(struct z3fold_header *zhdr)
    578{
    579	int nfree;
    580	/*
    581	 * If there is a middle object, pick up the bigger free space
    582	 * either before or after it. Otherwise just subtract the number
    583	 * of chunks occupied by the first and the last objects.
    584	 */
    585	if (zhdr->middle_chunks != 0) {
    586		int nfree_before = zhdr->first_chunks ?
    587			0 : zhdr->start_middle - ZHDR_CHUNKS;
    588		int nfree_after = zhdr->last_chunks ?
    589			0 : TOTAL_CHUNKS -
    590				(zhdr->start_middle + zhdr->middle_chunks);
    591		nfree = max(nfree_before, nfree_after);
    592	} else
    593		nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
    594	return nfree;
    595}
    596
    597/* Add to the appropriate unbuddied list */
    598static inline void add_to_unbuddied(struct z3fold_pool *pool,
    599				struct z3fold_header *zhdr)
    600{
    601	if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
    602			zhdr->middle_chunks == 0) {
    603		struct list_head *unbuddied;
    604		int freechunks = num_free_chunks(zhdr);
    605
    606		migrate_disable();
    607		unbuddied = this_cpu_ptr(pool->unbuddied);
    608		spin_lock(&pool->lock);
    609		list_add(&zhdr->buddy, &unbuddied[freechunks]);
    610		spin_unlock(&pool->lock);
    611		zhdr->cpu = smp_processor_id();
    612		migrate_enable();
    613	}
    614}
    615
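/*
 * Pick which free buddy slot (FIRST, LAST or MIDDLE) an allocation of
 * @chunks chunks should go to; returns HEADLESS if no suitable slot is free.
 */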
    616static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks)
    617{
    618	enum buddy bud = HEADLESS;
    619
    620	if (zhdr->middle_chunks) {
    621		if (!zhdr->first_chunks &&
    622		    chunks <= zhdr->start_middle - ZHDR_CHUNKS)
    623			bud = FIRST;
    624		else if (!zhdr->last_chunks)
    625			bud = LAST;
    626	} else {
    627		if (!zhdr->first_chunks)
    628			bud = FIRST;
    629		else if (!zhdr->last_chunks)
    630			bud = LAST;
    631		else
    632			bud = MIDDLE;
    633	}
    634
    635	return bud;
    636}
    637
    638static inline void *mchunk_memmove(struct z3fold_header *zhdr,
    639				unsigned short dst_chunk)
    640{
    641	void *beg = zhdr;
    642	return memmove(beg + (dst_chunk << CHUNK_SHIFT),
    643		       beg + (zhdr->start_middle << CHUNK_SHIFT),
    644		       zhdr->middle_chunks << CHUNK_SHIFT);
    645}
    646
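/* Return true if at most one of the three buddies is in use */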
    647static inline bool buddy_single(struct z3fold_header *zhdr)
    648{
    649	return !((zhdr->first_chunks && zhdr->middle_chunks) ||
    650			(zhdr->first_chunks && zhdr->last_chunks) ||
    651			(zhdr->middle_chunks && zhdr->last_chunks));
    652}
    653
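/*
 * Try to move the only used buddy of a sparsely populated page into another
 * page (found via __z3fold_alloc()) and redirect its handle in place, so that
 * the old page can be released.  Returns the header of the page the data was
 * moved to, or NULL if nothing was moved.
 */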
    654static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
    655{
    656	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
    657	void *p = zhdr;
    658	unsigned long old_handle = 0;
    659	size_t sz = 0;
    660	struct z3fold_header *new_zhdr = NULL;
    661	int first_idx = __idx(zhdr, FIRST);
    662	int middle_idx = __idx(zhdr, MIDDLE);
    663	int last_idx = __idx(zhdr, LAST);
    664	unsigned short *moved_chunks = NULL;
    665
    666	/*
    667	 * No need to protect slots here -- all the slots are "local" and
    668	 * the page lock is already taken
    669	 */
    670	if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
    671		p += ZHDR_SIZE_ALIGNED;
    672		sz = zhdr->first_chunks << CHUNK_SHIFT;
    673		old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
    674		moved_chunks = &zhdr->first_chunks;
    675	} else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
    676		p += zhdr->start_middle << CHUNK_SHIFT;
    677		sz = zhdr->middle_chunks << CHUNK_SHIFT;
    678		old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
    679		moved_chunks = &zhdr->middle_chunks;
    680	} else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
    681		p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
    682		sz = zhdr->last_chunks << CHUNK_SHIFT;
    683		old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
    684		moved_chunks = &zhdr->last_chunks;
    685	}
    686
    687	if (sz > 0) {
    688		enum buddy new_bud = HEADLESS;
    689		short chunks = size_to_chunks(sz);
    690		void *q;
    691
    692		new_zhdr = __z3fold_alloc(pool, sz, false);
    693		if (!new_zhdr)
    694			return NULL;
    695
    696		if (WARN_ON(new_zhdr == zhdr))
    697			goto out_fail;
    698
    699		new_bud = get_free_buddy(new_zhdr, chunks);
    700		q = new_zhdr;
    701		switch (new_bud) {
    702		case FIRST:
    703			new_zhdr->first_chunks = chunks;
    704			q += ZHDR_SIZE_ALIGNED;
    705			break;
    706		case MIDDLE:
    707			new_zhdr->middle_chunks = chunks;
    708			new_zhdr->start_middle =
    709				new_zhdr->first_chunks + ZHDR_CHUNKS;
    710			q += new_zhdr->start_middle << CHUNK_SHIFT;
    711			break;
    712		case LAST:
    713			new_zhdr->last_chunks = chunks;
    714			q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
    715			break;
    716		default:
    717			goto out_fail;
    718		}
    719		new_zhdr->foreign_handles++;
    720		memcpy(q, p, sz);
    721		write_lock(&zhdr->slots->lock);
    722		*(unsigned long *)old_handle = (unsigned long)new_zhdr +
    723			__idx(new_zhdr, new_bud);
    724		if (new_bud == LAST)
    725			*(unsigned long *)old_handle |=
    726					(new_zhdr->last_chunks << BUDDY_SHIFT);
    727		write_unlock(&zhdr->slots->lock);
    728		add_to_unbuddied(pool, new_zhdr);
    729		z3fold_page_unlock(new_zhdr);
    730
    731		*moved_chunks = 0;
    732	}
    733
    734	return new_zhdr;
    735
    736out_fail:
    737	if (new_zhdr && !kref_put(&new_zhdr->refcount, release_z3fold_page_locked)) {
    738		add_to_unbuddied(pool, new_zhdr);
    739		z3fold_page_unlock(new_zhdr);
    740	}
    741	return NULL;
    742
    743}
    744
    745#define BIG_CHUNK_GAP	3
    746/* Has to be called with lock held */
    747static int z3fold_compact_page(struct z3fold_header *zhdr)
    748{
    749	struct page *page = virt_to_page(zhdr);
    750
    751	if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
    752		return 0; /* can't move middle chunk, it's used */
    753
    754	if (unlikely(PageIsolated(page)))
    755		return 0;
    756
    757	if (zhdr->middle_chunks == 0)
    758		return 0; /* nothing to compact */
    759
    760	if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
    761		/* move to the beginning */
    762		mchunk_memmove(zhdr, ZHDR_CHUNKS);
    763		zhdr->first_chunks = zhdr->middle_chunks;
    764		zhdr->middle_chunks = 0;
    765		zhdr->start_middle = 0;
    766		zhdr->first_num++;
    767		return 1;
    768	}
    769
    770	/*
    771	 * moving data is expensive, so let's only do that if
    772	 * there's substantial gain (at least BIG_CHUNK_GAP chunks)
    773	 */
    774	if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
    775	    zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
    776			BIG_CHUNK_GAP) {
    777		mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
    778		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
    779		return 1;
    780	} else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
    781		   TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
    782					+ zhdr->middle_chunks) >=
    783			BIG_CHUNK_GAP) {
    784		unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
    785			zhdr->middle_chunks;
    786		mchunk_memmove(zhdr, new_start);
    787		zhdr->start_middle = new_start;
    788		return 1;
    789	}
    790
    791	return 0;
    792}
    793
    794static void do_compact_page(struct z3fold_header *zhdr, bool locked)
    795{
    796	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
    797	struct page *page;
    798
    799	page = virt_to_page(zhdr);
    800	if (locked)
    801		WARN_ON(z3fold_page_trylock(zhdr));
    802	else
    803		z3fold_page_lock(zhdr);
    804	if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
    805		z3fold_page_unlock(zhdr);
    806		return;
    807	}
    808	spin_lock(&pool->lock);
    809	list_del_init(&zhdr->buddy);
    810	spin_unlock(&pool->lock);
    811
    812	if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
    813		return;
    814
    815	if (test_bit(PAGE_STALE, &page->private) ||
    816	    test_and_set_bit(PAGE_CLAIMED, &page->private)) {
    817		z3fold_page_unlock(zhdr);
    818		return;
    819	}
    820
    821	if (!zhdr->foreign_handles && buddy_single(zhdr) &&
    822	    zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
    823		if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
    824			clear_bit(PAGE_CLAIMED, &page->private);
    825			z3fold_page_unlock(zhdr);
    826		}
    827		return;
    828	}
    829
    830	z3fold_compact_page(zhdr);
    831	add_to_unbuddied(pool, zhdr);
    832	clear_bit(PAGE_CLAIMED, &page->private);
    833	z3fold_page_unlock(zhdr);
    834}
    835
    836static void compact_page_work(struct work_struct *w)
    837{
    838	struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
    839						work);
    840
    841	do_compact_page(zhdr, false);
    842}
    843
    844/* returns _locked_ z3fold page header or NULL */
    845static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
    846						size_t size, bool can_sleep)
    847{
    848	struct z3fold_header *zhdr = NULL;
    849	struct page *page;
    850	struct list_head *unbuddied;
    851	int chunks = size_to_chunks(size), i;
    852
    853lookup:
    854	migrate_disable();
    855	/* First, try to find an unbuddied z3fold page. */
    856	unbuddied = this_cpu_ptr(pool->unbuddied);
    857	for_each_unbuddied_list(i, chunks) {
    858		struct list_head *l = &unbuddied[i];
    859
    860		zhdr = list_first_entry_or_null(READ_ONCE(l),
    861					struct z3fold_header, buddy);
    862
    863		if (!zhdr)
    864			continue;
    865
    866		/* Re-check under lock. */
    867		spin_lock(&pool->lock);
    868		if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
    869						struct z3fold_header, buddy)) ||
    870		    !z3fold_page_trylock(zhdr)) {
    871			spin_unlock(&pool->lock);
    872			zhdr = NULL;
    873			migrate_enable();
    874			if (can_sleep)
    875				cond_resched();
    876			goto lookup;
    877		}
    878		list_del_init(&zhdr->buddy);
    879		zhdr->cpu = -1;
    880		spin_unlock(&pool->lock);
    881
    882		page = virt_to_page(zhdr);
    883		if (test_bit(NEEDS_COMPACTING, &page->private) ||
    884		    test_bit(PAGE_CLAIMED, &page->private)) {
    885			z3fold_page_unlock(zhdr);
    886			zhdr = NULL;
    887			migrate_enable();
    888			if (can_sleep)
    889				cond_resched();
    890			goto lookup;
    891		}
    892
    893		/*
     894		 * this page could not have been removed from its unbuddied
     895		 * list while the pool lock was held, and we have since taken
     896		 * the page lock, so kref_put could not have been called
     897		 * before we got here; it's safe to just call kref_get()
    898		 */
    899		kref_get(&zhdr->refcount);
    900		break;
    901	}
    902	migrate_enable();
    903
    904	if (!zhdr) {
    905		int cpu;
    906
    907		/* look for _exact_ match on other cpus' lists */
    908		for_each_online_cpu(cpu) {
    909			struct list_head *l;
    910
    911			unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
    912			spin_lock(&pool->lock);
    913			l = &unbuddied[chunks];
    914
    915			zhdr = list_first_entry_or_null(READ_ONCE(l),
    916						struct z3fold_header, buddy);
    917
    918			if (!zhdr || !z3fold_page_trylock(zhdr)) {
    919				spin_unlock(&pool->lock);
    920				zhdr = NULL;
    921				continue;
    922			}
    923			list_del_init(&zhdr->buddy);
    924			zhdr->cpu = -1;
    925			spin_unlock(&pool->lock);
    926
    927			page = virt_to_page(zhdr);
    928			if (test_bit(NEEDS_COMPACTING, &page->private) ||
    929			    test_bit(PAGE_CLAIMED, &page->private)) {
    930				z3fold_page_unlock(zhdr);
    931				zhdr = NULL;
    932				if (can_sleep)
    933					cond_resched();
    934				continue;
    935			}
    936			kref_get(&zhdr->refcount);
    937			break;
    938		}
    939	}
    940
    941	if (zhdr && !zhdr->slots) {
    942		zhdr->slots = alloc_slots(pool, GFP_ATOMIC);
    943		if (!zhdr->slots)
    944			goto out_fail;
    945	}
    946	return zhdr;
    947
    948out_fail:
    949	if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
    950		add_to_unbuddied(pool, zhdr);
    951		z3fold_page_unlock(zhdr);
    952	}
    953	return NULL;
    954}
    955
    956/*
    957 * API Functions
    958 */
    959
    960/**
    961 * z3fold_create_pool() - create a new z3fold pool
    962 * @name:	pool name
    963 * @gfp:	gfp flags when allocating the z3fold pool structure
    964 * @ops:	user-defined operations for the z3fold pool
    965 *
    966 * Return: pointer to the new z3fold pool or NULL if the metadata allocation
    967 * failed.
    968 */
    969static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
    970		const struct z3fold_ops *ops)
    971{
    972	struct z3fold_pool *pool = NULL;
    973	int i, cpu;
    974
    975	pool = kzalloc(sizeof(struct z3fold_pool), gfp);
    976	if (!pool)
    977		goto out;
    978	pool->c_handle = kmem_cache_create("z3fold_handle",
    979				sizeof(struct z3fold_buddy_slots),
    980				SLOTS_ALIGN, 0, NULL);
    981	if (!pool->c_handle)
    982		goto out_c;
    983	spin_lock_init(&pool->lock);
    984	spin_lock_init(&pool->stale_lock);
    985	pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS,
    986					 __alignof__(struct list_head));
    987	if (!pool->unbuddied)
    988		goto out_pool;
    989	for_each_possible_cpu(cpu) {
    990		struct list_head *unbuddied =
    991				per_cpu_ptr(pool->unbuddied, cpu);
    992		for_each_unbuddied_list(i, 0)
    993			INIT_LIST_HEAD(&unbuddied[i]);
    994	}
    995	INIT_LIST_HEAD(&pool->lru);
    996	INIT_LIST_HEAD(&pool->stale);
    997	atomic64_set(&pool->pages_nr, 0);
    998	pool->name = name;
    999	pool->compact_wq = create_singlethread_workqueue(pool->name);
   1000	if (!pool->compact_wq)
   1001		goto out_unbuddied;
   1002	pool->release_wq = create_singlethread_workqueue(pool->name);
   1003	if (!pool->release_wq)
   1004		goto out_wq;
   1005	if (z3fold_register_migration(pool))
   1006		goto out_rwq;
   1007	INIT_WORK(&pool->work, free_pages_work);
   1008	pool->ops = ops;
   1009	return pool;
   1010
   1011out_rwq:
   1012	destroy_workqueue(pool->release_wq);
   1013out_wq:
   1014	destroy_workqueue(pool->compact_wq);
   1015out_unbuddied:
   1016	free_percpu(pool->unbuddied);
   1017out_pool:
   1018	kmem_cache_destroy(pool->c_handle);
   1019out_c:
   1020	kfree(pool);
   1021out:
   1022	return NULL;
   1023}
   1024
   1025/**
   1026 * z3fold_destroy_pool() - destroys an existing z3fold pool
   1027 * @pool:	the z3fold pool to be destroyed
   1028 *
   1029 * The pool should be emptied before this function is called.
   1030 */
   1031static void z3fold_destroy_pool(struct z3fold_pool *pool)
   1032{
   1033	kmem_cache_destroy(pool->c_handle);
   1034
   1035	/*
   1036	 * We need to destroy pool->compact_wq before pool->release_wq,
   1037	 * as any pending work on pool->compact_wq will call
   1038	 * queue_work(pool->release_wq, &pool->work).
   1039	 *
   1040	 * There are still outstanding pages until both workqueues are drained,
   1041	 * so we cannot unregister migration until then.
   1042	 */
   1043
   1044	destroy_workqueue(pool->compact_wq);
   1045	destroy_workqueue(pool->release_wq);
   1046	z3fold_unregister_migration(pool);
   1047	free_percpu(pool->unbuddied);
   1048	kfree(pool);
   1049}
   1050
   1051/**
   1052 * z3fold_alloc() - allocates a region of a given size
   1053 * @pool:	z3fold pool from which to allocate
   1054 * @size:	size in bytes of the desired allocation
   1055 * @gfp:	gfp flags used if the pool needs to grow
   1056 * @handle:	handle of the new allocation
   1057 *
   1058 * This function will attempt to find a free region in the pool large enough to
   1059 * satisfy the allocation request.  A search of the unbuddied lists is
   1060 * performed first. If no suitable free region is found, then a new page is
   1061 * allocated and added to the pool to satisfy the request.
   1062 *
   1063 * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
   1064 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
   1065 * a new page.
   1066 */
   1067static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
   1068			unsigned long *handle)
   1069{
   1070	int chunks = size_to_chunks(size);
   1071	struct z3fold_header *zhdr = NULL;
   1072	struct page *page = NULL;
   1073	enum buddy bud;
   1074	bool can_sleep = gfpflags_allow_blocking(gfp);
   1075
   1076	if (!size || (gfp & __GFP_HIGHMEM))
   1077		return -EINVAL;
   1078
   1079	if (size > PAGE_SIZE)
   1080		return -ENOSPC;
   1081
   1082	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
   1083		bud = HEADLESS;
   1084	else {
   1085retry:
   1086		zhdr = __z3fold_alloc(pool, size, can_sleep);
   1087		if (zhdr) {
   1088			bud = get_free_buddy(zhdr, chunks);
   1089			if (bud == HEADLESS) {
   1090				if (!kref_put(&zhdr->refcount,
   1091					     release_z3fold_page_locked))
   1092					z3fold_page_unlock(zhdr);
   1093				pr_err("No free chunks in unbuddied\n");
   1094				WARN_ON(1);
   1095				goto retry;
   1096			}
   1097			page = virt_to_page(zhdr);
   1098			goto found;
   1099		}
   1100		bud = FIRST;
   1101	}
   1102
   1103	page = alloc_page(gfp);
   1104	if (!page)
   1105		return -ENOMEM;
   1106
   1107	zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
   1108	if (!zhdr) {
   1109		__free_page(page);
   1110		return -ENOMEM;
   1111	}
   1112	atomic64_inc(&pool->pages_nr);
   1113
   1114	if (bud == HEADLESS) {
   1115		set_bit(PAGE_HEADLESS, &page->private);
   1116		goto headless;
   1117	}
   1118	if (can_sleep) {
   1119		lock_page(page);
   1120		__SetPageMovable(page, pool->inode->i_mapping);
   1121		unlock_page(page);
   1122	} else {
   1123		WARN_ON(!trylock_page(page));
   1124		__SetPageMovable(page, pool->inode->i_mapping);
   1125		unlock_page(page);
   1126	}
   1127	z3fold_page_lock(zhdr);
   1128
   1129found:
   1130	if (bud == FIRST)
   1131		zhdr->first_chunks = chunks;
   1132	else if (bud == LAST)
   1133		zhdr->last_chunks = chunks;
   1134	else {
   1135		zhdr->middle_chunks = chunks;
   1136		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
   1137	}
   1138	add_to_unbuddied(pool, zhdr);
   1139
   1140headless:
   1141	spin_lock(&pool->lock);
   1142	/* Add/move z3fold page to beginning of LRU */
   1143	if (!list_empty(&page->lru))
   1144		list_del(&page->lru);
   1145
   1146	list_add(&page->lru, &pool->lru);
   1147
   1148	*handle = encode_handle(zhdr, bud);
   1149	spin_unlock(&pool->lock);
   1150	if (bud != HEADLESS)
   1151		z3fold_page_unlock(zhdr);
   1152
   1153	return 0;
   1154}
   1155
   1156/**
   1157 * z3fold_free() - frees the allocation associated with the given handle
   1158 * @pool:	pool in which the allocation resided
   1159 * @handle:	handle associated with the allocation returned by z3fold_alloc()
   1160 *
   1161 * In the case that the z3fold page in which the allocation resides is under
   1162 * reclaim, as indicated by the PAGE_CLAIMED flag being set, this function
   1163 * only sets the first|middle|last_chunks to 0.  The page is actually freed
   1164 * once all buddies are evicted (see z3fold_reclaim_page() below).
   1165 */
   1166static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
   1167{
   1168	struct z3fold_header *zhdr;
   1169	struct page *page;
   1170	enum buddy bud;
   1171	bool page_claimed;
   1172
   1173	zhdr = get_z3fold_header(handle);
   1174	page = virt_to_page(zhdr);
   1175	page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
   1176
   1177	if (test_bit(PAGE_HEADLESS, &page->private)) {
   1178		/* if a headless page is under reclaim, just leave.
   1179		 * NB: we use test_and_set_bit for a reason: if the bit
   1180		 * has not been set before, we release this page
   1181		 * immediately so we don't care about its value any more.
   1182		 */
   1183		if (!page_claimed) {
   1184			spin_lock(&pool->lock);
   1185			list_del(&page->lru);
   1186			spin_unlock(&pool->lock);
   1187			put_z3fold_header(zhdr);
   1188			free_z3fold_page(page, true);
   1189			atomic64_dec(&pool->pages_nr);
   1190		}
   1191		return;
   1192	}
   1193
   1194	/* Non-headless case */
   1195	bud = handle_to_buddy(handle);
   1196
   1197	switch (bud) {
   1198	case FIRST:
   1199		zhdr->first_chunks = 0;
   1200		break;
   1201	case MIDDLE:
   1202		zhdr->middle_chunks = 0;
   1203		break;
   1204	case LAST:
   1205		zhdr->last_chunks = 0;
   1206		break;
   1207	default:
   1208		pr_err("%s: unknown bud %d\n", __func__, bud);
   1209		WARN_ON(1);
   1210		put_z3fold_header(zhdr);
   1211		return;
   1212	}
   1213
   1214	if (!page_claimed)
   1215		free_handle(handle, zhdr);
   1216	if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list))
   1217		return;
   1218	if (page_claimed) {
   1219		/* the page has not been claimed by us */
   1220		put_z3fold_header(zhdr);
   1221		return;
   1222	}
   1223	if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
   1224		clear_bit(PAGE_CLAIMED, &page->private);
   1225		put_z3fold_header(zhdr);
   1226		return;
   1227	}
   1228	if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
   1229		zhdr->cpu = -1;
   1230		kref_get(&zhdr->refcount);
   1231		clear_bit(PAGE_CLAIMED, &page->private);
   1232		do_compact_page(zhdr, true);
   1233		return;
   1234	}
   1235	kref_get(&zhdr->refcount);
   1236	clear_bit(PAGE_CLAIMED, &page->private);
   1237	queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
   1238	put_z3fold_header(zhdr);
   1239}
   1240
   1241/**
   1242 * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
   1243 * @pool:	pool from which a page will attempt to be evicted
   1244 * @retries:	number of pages on the LRU list for which eviction will
   1245 *		be attempted before failing
   1246 *
   1247 * z3fold reclaim is different from normal system reclaim in that it is done
   1248 * from the bottom, up. This is because only the bottom layer, z3fold, has
   1249 * information on how the allocations are organized within each z3fold page.
   1250 * This has the potential to create interesting locking situations between
   1251 * z3fold and the user, however.
   1252 *
   1253 * To avoid these, this is how z3fold_reclaim_page() should be called:
   1254 *
   1255 * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
   1256 * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
   1257 * call the user-defined eviction handler with the pool and handle as
   1258 * arguments.
   1259 *
   1260 * If the handle can not be evicted, the eviction handler should return
   1261 * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
   1262 * appropriate list and try the next z3fold page on the LRU up to
   1263 * a user defined number of retries.
   1264 *
   1265 * If the handle is successfully evicted, the eviction handler should
   1266 * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
   1267 * contains logic to delay freeing the page if the page is under reclaim,
   1268 * as indicated by the setting of the PG_reclaim flag on the underlying page.
   1269 *
   1270 * If all buddies in the z3fold page are successfully evicted, then the
   1271 * z3fold page can be freed.
   1272 *
   1273 * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
   1274 * no pages to evict or an eviction handler is not registered, -EAGAIN if
   1275 * the retry limit was hit.
   1276 */
   1277static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
   1278{
   1279	int i, ret = -1;
   1280	struct z3fold_header *zhdr = NULL;
   1281	struct page *page = NULL;
   1282	struct list_head *pos;
   1283	unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
   1284	struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN)));
   1285
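	/*
	 * Eviction handles are encoded into this on-stack slots structure;
	 * HANDLES_NOFREE makes free_handle() leave it alone, since a racing
	 * z3fold_free() may otherwise zero out and free zhdr->slots while we
	 * still hold these handles (see the comment below).
	 */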
   1286	rwlock_init(&slots.lock);
   1287	slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE);
   1288
   1289	spin_lock(&pool->lock);
   1290	if (!pool->ops || !pool->ops->evict || retries == 0) {
   1291		spin_unlock(&pool->lock);
   1292		return -EINVAL;
   1293	}
   1294	for (i = 0; i < retries; i++) {
   1295		if (list_empty(&pool->lru)) {
   1296			spin_unlock(&pool->lock);
   1297			return -EINVAL;
   1298		}
   1299		list_for_each_prev(pos, &pool->lru) {
   1300			page = list_entry(pos, struct page, lru);
   1301
   1302			zhdr = page_address(page);
   1303			if (test_bit(PAGE_HEADLESS, &page->private)) {
   1304				/*
   1305				 * For non-headless pages, we wait to do this
   1306				 * until we have the page lock to avoid racing
   1307				 * with __z3fold_alloc(). Headless pages don't
   1308				 * have a lock (and __z3fold_alloc() will never
   1309				 * see them), but we still need to test and set
   1310				 * PAGE_CLAIMED to avoid racing with
   1311				 * z3fold_free(), so just do it now before
   1312				 * leaving the loop.
   1313				 */
   1314				if (test_and_set_bit(PAGE_CLAIMED, &page->private))
   1315					continue;
   1316
   1317				break;
   1318			}
   1319
   1320			if (!z3fold_page_trylock(zhdr)) {
   1321				zhdr = NULL;
   1322				continue; /* can't evict at this point */
   1323			}
   1324
   1325			/* test_and_set_bit is of course atomic, but we still
   1326			 * need to do it under page lock, otherwise checking
   1327			 * that bit in __z3fold_alloc wouldn't make sense
   1328			 */
   1329			if (zhdr->foreign_handles ||
   1330			    test_and_set_bit(PAGE_CLAIMED, &page->private)) {
   1331				z3fold_page_unlock(zhdr);
   1332				zhdr = NULL;
   1333				continue; /* can't evict such page */
   1334			}
   1335			list_del_init(&zhdr->buddy);
   1336			zhdr->cpu = -1;
   1337			/* See comment in __z3fold_alloc. */
   1338			kref_get(&zhdr->refcount);
   1339			break;
   1340		}
   1341
   1342		if (!zhdr)
   1343			break;
   1344
   1345		list_del_init(&page->lru);
   1346		spin_unlock(&pool->lock);
   1347
   1348		if (!test_bit(PAGE_HEADLESS, &page->private)) {
   1349			/*
    1350			 * We need to encode the handles before unlocking, and
   1351			 * use our local slots structure because z3fold_free
   1352			 * can zero out zhdr->slots and we can't do much
   1353			 * about that
   1354			 */
   1355			first_handle = 0;
   1356			last_handle = 0;
   1357			middle_handle = 0;
   1358			memset(slots.slot, 0, sizeof(slots.slot));
   1359			if (zhdr->first_chunks)
   1360				first_handle = __encode_handle(zhdr, &slots,
   1361								FIRST);
   1362			if (zhdr->middle_chunks)
   1363				middle_handle = __encode_handle(zhdr, &slots,
   1364								MIDDLE);
   1365			if (zhdr->last_chunks)
   1366				last_handle = __encode_handle(zhdr, &slots,
   1367								LAST);
   1368			/*
   1369			 * it's safe to unlock here because we hold a
   1370			 * reference to this page
   1371			 */
   1372			z3fold_page_unlock(zhdr);
   1373		} else {
   1374			first_handle = encode_handle(zhdr, HEADLESS);
   1375			last_handle = middle_handle = 0;
   1376		}
   1377		/* Issue the eviction callback(s) */
   1378		if (middle_handle) {
   1379			ret = pool->ops->evict(pool, middle_handle);
   1380			if (ret)
   1381				goto next;
   1382		}
   1383		if (first_handle) {
   1384			ret = pool->ops->evict(pool, first_handle);
   1385			if (ret)
   1386				goto next;
   1387		}
   1388		if (last_handle) {
   1389			ret = pool->ops->evict(pool, last_handle);
   1390			if (ret)
   1391				goto next;
   1392		}
   1393next:
   1394		if (test_bit(PAGE_HEADLESS, &page->private)) {
   1395			if (ret == 0) {
   1396				free_z3fold_page(page, true);
   1397				atomic64_dec(&pool->pages_nr);
   1398				return 0;
   1399			}
   1400			spin_lock(&pool->lock);
   1401			list_add(&page->lru, &pool->lru);
   1402			spin_unlock(&pool->lock);
   1403			clear_bit(PAGE_CLAIMED, &page->private);
   1404		} else {
   1405			struct z3fold_buddy_slots *slots = zhdr->slots;
   1406			z3fold_page_lock(zhdr);
   1407			if (kref_put(&zhdr->refcount,
   1408					release_z3fold_page_locked)) {
   1409				kmem_cache_free(pool->c_handle, slots);
   1410				return 0;
   1411			}
   1412			/*
    1413			 * If we are here, the page is still not completely
    1414			 * free, so take the global pool lock to be able to
    1415			 * add it back to the LRU list
   1416			 */
   1417			spin_lock(&pool->lock);
   1418			list_add(&page->lru, &pool->lru);
   1419			spin_unlock(&pool->lock);
   1420			if (list_empty(&zhdr->buddy))
   1421				add_to_unbuddied(pool, zhdr);
   1422			clear_bit(PAGE_CLAIMED, &page->private);
   1423			z3fold_page_unlock(zhdr);
   1424		}
   1425
    1426		/* We started off locked so we need to lock the pool back */
   1427		spin_lock(&pool->lock);
   1428	}
   1429	spin_unlock(&pool->lock);
   1430	return -EAGAIN;
   1431}
   1432
   1433/**
   1434 * z3fold_map() - maps the allocation associated with the given handle
   1435 * @pool:	pool in which the allocation resides
   1436 * @handle:	handle associated with the allocation to be mapped
   1437 *
   1438 * Extracts the buddy number from handle and constructs the pointer to the
   1439 * correct starting chunk within the page.
   1440 *
   1441 * Returns: a pointer to the mapped allocation
   1442 */
   1443static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
   1444{
   1445	struct z3fold_header *zhdr;
   1446	struct page *page;
   1447	void *addr;
   1448	enum buddy buddy;
   1449
   1450	zhdr = get_z3fold_header(handle);
   1451	addr = zhdr;
   1452	page = virt_to_page(zhdr);
   1453
   1454	if (test_bit(PAGE_HEADLESS, &page->private))
   1455		goto out;
   1456
   1457	buddy = handle_to_buddy(handle);
   1458	switch (buddy) {
   1459	case FIRST:
   1460		addr += ZHDR_SIZE_ALIGNED;
   1461		break;
   1462	case MIDDLE:
   1463		addr += zhdr->start_middle << CHUNK_SHIFT;
   1464		set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
   1465		break;
   1466	case LAST:
   1467		addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
   1468		break;
   1469	default:
   1470		pr_err("unknown buddy id %d\n", buddy);
   1471		WARN_ON(1);
   1472		addr = NULL;
   1473		break;
   1474	}
   1475
   1476	if (addr)
   1477		zhdr->mapped_count++;
   1478out:
   1479	put_z3fold_header(zhdr);
   1480	return addr;
   1481}
   1482
   1483/**
   1484 * z3fold_unmap() - unmaps the allocation associated with the given handle
   1485 * @pool:	pool in which the allocation resides
   1486 * @handle:	handle associated with the allocation to be unmapped
   1487 */
   1488static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
   1489{
   1490	struct z3fold_header *zhdr;
   1491	struct page *page;
   1492	enum buddy buddy;
   1493
   1494	zhdr = get_z3fold_header(handle);
   1495	page = virt_to_page(zhdr);
   1496
   1497	if (test_bit(PAGE_HEADLESS, &page->private))
   1498		return;
   1499
   1500	buddy = handle_to_buddy(handle);
   1501	if (buddy == MIDDLE)
   1502		clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
   1503	zhdr->mapped_count--;
   1504	put_z3fold_header(zhdr);
   1505}
   1506
   1507/**
   1508 * z3fold_get_pool_size() - gets the z3fold pool size in pages
   1509 * @pool:	pool whose size is being queried
   1510 *
   1511 * Returns: size in pages of the given pool.
   1512 */
   1513static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
   1514{
   1515	return atomic64_read(&pool->pages_nr);
   1516}
   1517
   1518static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
   1519{
   1520	struct z3fold_header *zhdr;
   1521	struct z3fold_pool *pool;
   1522
   1523	VM_BUG_ON_PAGE(!PageMovable(page), page);
   1524	VM_BUG_ON_PAGE(PageIsolated(page), page);
   1525
   1526	if (test_bit(PAGE_HEADLESS, &page->private))
   1527		return false;
   1528
   1529	zhdr = page_address(page);
   1530	z3fold_page_lock(zhdr);
   1531	if (test_bit(NEEDS_COMPACTING, &page->private) ||
   1532	    test_bit(PAGE_STALE, &page->private))
   1533		goto out;
   1534
   1535	if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
   1536		goto out;
   1537
   1538	if (test_and_set_bit(PAGE_CLAIMED, &page->private))
   1539		goto out;
   1540	pool = zhdr_to_pool(zhdr);
   1541	spin_lock(&pool->lock);
   1542	if (!list_empty(&zhdr->buddy))
   1543		list_del_init(&zhdr->buddy);
   1544	if (!list_empty(&page->lru))
   1545		list_del_init(&page->lru);
   1546	spin_unlock(&pool->lock);
   1547
   1548	kref_get(&zhdr->refcount);
   1549	z3fold_page_unlock(zhdr);
   1550	return true;
   1551
   1552out:
   1553	z3fold_page_unlock(zhdr);
   1554	return false;
   1555}
   1556
   1557static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
   1558			       struct page *page, enum migrate_mode mode)
   1559{
   1560	struct z3fold_header *zhdr, *new_zhdr;
   1561	struct z3fold_pool *pool;
   1562	struct address_space *new_mapping;
   1563
   1564	VM_BUG_ON_PAGE(!PageMovable(page), page);
   1565	VM_BUG_ON_PAGE(!PageIsolated(page), page);
   1566	VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
   1567	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
   1568
   1569	zhdr = page_address(page);
   1570	pool = zhdr_to_pool(zhdr);
   1571
   1572	if (!z3fold_page_trylock(zhdr))
   1573		return -EAGAIN;
   1574	if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
   1575		clear_bit(PAGE_CLAIMED, &page->private);
   1576		z3fold_page_unlock(zhdr);
   1577		return -EBUSY;
   1578	}
   1579	if (work_pending(&zhdr->work)) {
   1580		z3fold_page_unlock(zhdr);
   1581		return -EAGAIN;
   1582	}
   1583	new_zhdr = page_address(newpage);
   1584	memcpy(new_zhdr, zhdr, PAGE_SIZE);
   1585	newpage->private = page->private;
   1586	set_bit(PAGE_MIGRATED, &page->private);
   1587	z3fold_page_unlock(zhdr);
   1588	spin_lock_init(&new_zhdr->page_lock);
   1589	INIT_WORK(&new_zhdr->work, compact_page_work);
   1590	/*
   1591	 * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
   1592	 * so we only have to reinitialize it.
   1593	 */
   1594	INIT_LIST_HEAD(&new_zhdr->buddy);
   1595	new_mapping = page_mapping(page);
   1596	__ClearPageMovable(page);
   1597
   1598	get_page(newpage);
   1599	z3fold_page_lock(new_zhdr);
   1600	if (new_zhdr->first_chunks)
   1601		encode_handle(new_zhdr, FIRST);
   1602	if (new_zhdr->last_chunks)
   1603		encode_handle(new_zhdr, LAST);
   1604	if (new_zhdr->middle_chunks)
   1605		encode_handle(new_zhdr, MIDDLE);
   1606	set_bit(NEEDS_COMPACTING, &newpage->private);
   1607	new_zhdr->cpu = smp_processor_id();
   1608	spin_lock(&pool->lock);
   1609	list_add(&newpage->lru, &pool->lru);
   1610	spin_unlock(&pool->lock);
   1611	__SetPageMovable(newpage, new_mapping);
   1612	z3fold_page_unlock(new_zhdr);
   1613
   1614	queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
   1615
   1616	/* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */
   1617	page->private = 0;
   1618	put_page(page);
   1619	return 0;
   1620}
   1621
   1622static void z3fold_page_putback(struct page *page)
   1623{
   1624	struct z3fold_header *zhdr;
   1625	struct z3fold_pool *pool;
   1626
   1627	zhdr = page_address(page);
   1628	pool = zhdr_to_pool(zhdr);
   1629
   1630	z3fold_page_lock(zhdr);
   1631	if (!list_empty(&zhdr->buddy))
   1632		list_del_init(&zhdr->buddy);
   1633	INIT_LIST_HEAD(&page->lru);
   1634	if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
   1635		return;
   1636	spin_lock(&pool->lock);
   1637	list_add(&page->lru, &pool->lru);
   1638	spin_unlock(&pool->lock);
   1639	if (list_empty(&zhdr->buddy))
   1640		add_to_unbuddied(pool, zhdr);
   1641	clear_bit(PAGE_CLAIMED, &page->private);
   1642	z3fold_page_unlock(zhdr);
   1643}
   1644
   1645static const struct address_space_operations z3fold_aops = {
   1646	.isolate_page = z3fold_page_isolate,
   1647	.migratepage = z3fold_page_migrate,
   1648	.putback_page = z3fold_page_putback,
   1649};
   1650
   1651/*****************
   1652 * zpool
   1653 ****************/
   1654
   1655static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
   1656{
   1657	if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
   1658		return pool->zpool_ops->evict(pool->zpool, handle);
   1659	else
   1660		return -ENOENT;
   1661}
   1662
   1663static const struct z3fold_ops z3fold_zpool_ops = {
   1664	.evict =	z3fold_zpool_evict
   1665};
   1666
   1667static void *z3fold_zpool_create(const char *name, gfp_t gfp,
   1668			       const struct zpool_ops *zpool_ops,
   1669			       struct zpool *zpool)
   1670{
   1671	struct z3fold_pool *pool;
   1672
   1673	pool = z3fold_create_pool(name, gfp,
   1674				zpool_ops ? &z3fold_zpool_ops : NULL);
   1675	if (pool) {
   1676		pool->zpool = zpool;
   1677		pool->zpool_ops = zpool_ops;
   1678	}
   1679	return pool;
   1680}
   1681
   1682static void z3fold_zpool_destroy(void *pool)
   1683{
   1684	z3fold_destroy_pool(pool);
   1685}
   1686
   1687static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
   1688			unsigned long *handle)
   1689{
   1690	return z3fold_alloc(pool, size, gfp, handle);
   1691}
   1692static void z3fold_zpool_free(void *pool, unsigned long handle)
   1693{
   1694	z3fold_free(pool, handle);
   1695}
   1696
   1697static int z3fold_zpool_shrink(void *pool, unsigned int pages,
   1698			unsigned int *reclaimed)
   1699{
   1700	unsigned int total = 0;
   1701	int ret = -EINVAL;
   1702
   1703	while (total < pages) {
   1704		ret = z3fold_reclaim_page(pool, 8);
   1705		if (ret < 0)
   1706			break;
   1707		total++;
   1708	}
   1709
   1710	if (reclaimed)
   1711		*reclaimed = total;
   1712
   1713	return ret;
   1714}
   1715
   1716static void *z3fold_zpool_map(void *pool, unsigned long handle,
   1717			enum zpool_mapmode mm)
   1718{
   1719	return z3fold_map(pool, handle);
   1720}
   1721static void z3fold_zpool_unmap(void *pool, unsigned long handle)
   1722{
   1723	z3fold_unmap(pool, handle);
   1724}
   1725
   1726static u64 z3fold_zpool_total_size(void *pool)
   1727{
   1728	return z3fold_get_pool_size(pool) * PAGE_SIZE;
   1729}
   1730
   1731static struct zpool_driver z3fold_zpool_driver = {
   1732	.type =		"z3fold",
   1733	.sleep_mapped = true,
   1734	.owner =	THIS_MODULE,
   1735	.create =	z3fold_zpool_create,
   1736	.destroy =	z3fold_zpool_destroy,
   1737	.malloc =	z3fold_zpool_malloc,
   1738	.free =		z3fold_zpool_free,
   1739	.shrink =	z3fold_zpool_shrink,
   1740	.map =		z3fold_zpool_map,
   1741	.unmap =	z3fold_zpool_unmap,
   1742	.total_size =	z3fold_zpool_total_size,
   1743};
   1744
   1745MODULE_ALIAS("zpool-z3fold");
   1746
   1747static int __init init_z3fold(void)
   1748{
   1749	int ret;
   1750
   1751	/*
   1752	 * Make sure the z3fold header is not larger than the page size and
    1753	 * that there is space remaining for its buddies.
   1754	 */
   1755	BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE);
   1756	ret = z3fold_mount();
   1757	if (ret)
   1758		return ret;
   1759
   1760	zpool_register_driver(&z3fold_zpool_driver);
   1761
   1762	return 0;
   1763}
   1764
   1765static void __exit exit_z3fold(void)
   1766{
   1767	z3fold_unmount();
   1768	zpool_unregister_driver(&z3fold_zpool_driver);
   1769}
   1770
   1771module_init(init_z3fold);
   1772module_exit(exit_z3fold);
   1773
   1774MODULE_LICENSE("GPL");
   1775MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
   1776MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");