cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

dm-bufio.c (52120B)


      1/*
      2 * Copyright (C) 2009-2011 Red Hat, Inc.
      3 *
      4 * Author: Mikulas Patocka <mpatocka@redhat.com>
      5 *
      6 * This file is released under the GPL.
      7 */
      8
      9#include <linux/dm-bufio.h>
     10
     11#include <linux/device-mapper.h>
     12#include <linux/dm-io.h>
     13#include <linux/slab.h>
     14#include <linux/sched/mm.h>
     15#include <linux/jiffies.h>
     16#include <linux/vmalloc.h>
     17#include <linux/shrinker.h>
     18#include <linux/module.h>
     19#include <linux/rbtree.h>
     20#include <linux/stacktrace.h>
     21
     22#define DM_MSG_PREFIX "bufio"
     23
     24/*
     25 * Memory management policy:
     26 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
     27 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
     28 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
      29 *	Start background writeback when the number of dirty buffers exceeds
      30 *	DM_BUFIO_WRITEBACK_RATIO times the number of clean buffers.
     31 */
     32#define DM_BUFIO_MIN_BUFFERS		8
     33
     34#define DM_BUFIO_MEMORY_PERCENT		2
     35#define DM_BUFIO_VMALLOC_PERCENT	25
     36#define DM_BUFIO_WRITEBACK_RATIO	3
     37#define DM_BUFIO_LOW_WATERMARK_RATIO	16
     38
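/*
 * Worked example of the policy above (illustrative figures only, assuming the
 * default tunables): on a machine with 8 GiB of RAM, DM_BUFIO_MEMORY_PERCENT
 * of 2 caps the cache at roughly 160 MiB, while on 32-bit kernels the 25%
 * share of the much smaller vmalloc area may be the lower of the two limits
 * and win instead.  Independently of that limit, each client may always keep
 * at least DM_BUFIO_MIN_BUFFERS buffers.
 */
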
     39/*
     40 * Check buffer ages in this interval (seconds)
     41 */
     42#define DM_BUFIO_WORK_TIMER_SECS	30
     43
     44/*
     45 * Free buffers when they are older than this (seconds)
     46 */
     47#define DM_BUFIO_DEFAULT_AGE_SECS	300
     48
     49/*
     50 * The nr of bytes of cached data to keep around.
     51 */
     52#define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)
     53
     54/*
     55 * Align buffer writes to this boundary.
     56 * Tests show that SSDs have the highest IOPS when using 4k writes.
     57 */
     58#define DM_BUFIO_WRITE_ALIGN		4096
     59
     60/*
     61 * dm_buffer->list_mode
     62 */
     63#define LIST_CLEAN	0
     64#define LIST_DIRTY	1
     65#define LIST_SIZE	2
     66
     67/*
     68 * Linking of buffers:
     69 *	All buffers are linked to buffer_tree with their node field.
     70 *
     71 *	Clean buffers that are not being written (B_WRITING not set)
     72 *	are linked to lru[LIST_CLEAN] with their lru_list field.
     73 *
     74 *	Dirty and clean buffers that are being written are linked to
     75 *	lru[LIST_DIRTY] with their lru_list field. When the write
     76 *	finishes, the buffer cannot be relinked immediately (because we
     77 *	are in an interrupt context and relinking requires process
     78 *	context), so some clean-not-writing buffers can be held on
     79 *	dirty_lru too.  They are later added to lru in the process
     80 *	context.
     81 */
     82struct dm_bufio_client {
     83	struct mutex lock;
     84
     85	struct list_head lru[LIST_SIZE];
     86	unsigned long n_buffers[LIST_SIZE];
     87
     88	struct block_device *bdev;
     89	unsigned block_size;
     90	s8 sectors_per_block_bits;
     91	void (*alloc_callback)(struct dm_buffer *);
     92	void (*write_callback)(struct dm_buffer *);
     93
     94	struct kmem_cache *slab_buffer;
     95	struct kmem_cache *slab_cache;
     96	struct dm_io_client *dm_io;
     97
     98	struct list_head reserved_buffers;
     99	unsigned need_reserved_buffers;
    100
    101	unsigned minimum_buffers;
    102
    103	struct rb_root buffer_tree;
    104	wait_queue_head_t free_buffer_wait;
    105
    106	sector_t start;
    107
    108	int async_write_error;
    109
    110	struct list_head client_list;
    111
    112	struct shrinker shrinker;
    113	struct work_struct shrink_work;
    114	atomic_long_t need_shrink;
    115};
    116
    117/*
    118 * Buffer state bits.
    119 */
    120#define B_READING	0
    121#define B_WRITING	1
    122#define B_DIRTY		2
    123
    124/*
    125 * Describes how the block was allocated:
    126 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
    127 * See the comment at alloc_buffer_data.
    128 */
    129enum data_mode {
    130	DATA_MODE_SLAB = 0,
    131	DATA_MODE_GET_FREE_PAGES = 1,
    132	DATA_MODE_VMALLOC = 2,
    133	DATA_MODE_LIMIT = 3
    134};
    135
    136struct dm_buffer {
    137	struct rb_node node;
    138	struct list_head lru_list;
    139	struct list_head global_list;
    140	sector_t block;
    141	void *data;
    142	unsigned char data_mode;		/* DATA_MODE_* */
    143	unsigned char list_mode;		/* LIST_* */
    144	blk_status_t read_error;
    145	blk_status_t write_error;
    146	unsigned accessed;
    147	unsigned hold_count;
    148	unsigned long state;
    149	unsigned long last_accessed;
    150	unsigned dirty_start;
    151	unsigned dirty_end;
    152	unsigned write_start;
    153	unsigned write_end;
    154	struct dm_bufio_client *c;
    155	struct list_head write_list;
    156	void (*end_io)(struct dm_buffer *, blk_status_t);
    157#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
    158#define MAX_STACK 10
    159	unsigned int stack_len;
    160	unsigned long stack_entries[MAX_STACK];
    161#endif
    162};
    163
    164/*----------------------------------------------------------------*/
    165
    166#define dm_bufio_in_request()	(!!current->bio_list)
    167
    168static void dm_bufio_lock(struct dm_bufio_client *c)
    169{
    170	mutex_lock_nested(&c->lock, dm_bufio_in_request());
    171}
    172
    173static int dm_bufio_trylock(struct dm_bufio_client *c)
    174{
    175	return mutex_trylock(&c->lock);
    176}
    177
    178static void dm_bufio_unlock(struct dm_bufio_client *c)
    179{
    180	mutex_unlock(&c->lock);
    181}
    182
    183/*----------------------------------------------------------------*/
    184
    185/*
    186 * Default cache size: available memory divided by the ratio.
    187 */
    188static unsigned long dm_bufio_default_cache_size;
    189
    190/*
    191 * Total cache size set by the user.
    192 */
    193static unsigned long dm_bufio_cache_size;
    194
    195/*
    196 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
    197 * at any time.  If it disagrees, the user has changed cache size.
    198 */
    199static unsigned long dm_bufio_cache_size_latch;
    200
    201static DEFINE_SPINLOCK(global_spinlock);
    202
    203static LIST_HEAD(global_queue);
    204
    205static unsigned long global_num = 0;
    206
    207/*
    208 * Buffers are freed after this timeout
    209 */
    210static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
    211static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
    212
    213static unsigned long dm_bufio_peak_allocated;
    214static unsigned long dm_bufio_allocated_kmem_cache;
    215static unsigned long dm_bufio_allocated_get_free_pages;
    216static unsigned long dm_bufio_allocated_vmalloc;
    217static unsigned long dm_bufio_current_allocated;
    218
    219/*----------------------------------------------------------------*/
    220
    221/*
    222 * The current number of clients.
    223 */
    224static int dm_bufio_client_count;
    225
    226/*
    227 * The list of all clients.
    228 */
    229static LIST_HEAD(dm_bufio_all_clients);
    230
    231/*
    232 * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
    233 */
    234static DEFINE_MUTEX(dm_bufio_clients_lock);
    235
    236static struct workqueue_struct *dm_bufio_wq;
    237static struct delayed_work dm_bufio_cleanup_old_work;
    238static struct work_struct dm_bufio_replacement_work;
    239
    240
    241#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
    242static void buffer_record_stack(struct dm_buffer *b)
    243{
    244	b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
    245}
    246#endif
    247
    248/*----------------------------------------------------------------
    249 * A red/black tree acts as an index for all the buffers.
    250 *--------------------------------------------------------------*/
    251static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
    252{
    253	struct rb_node *n = c->buffer_tree.rb_node;
    254	struct dm_buffer *b;
    255
    256	while (n) {
    257		b = container_of(n, struct dm_buffer, node);
    258
    259		if (b->block == block)
    260			return b;
    261
    262		n = block < b->block ? n->rb_left : n->rb_right;
    263	}
    264
    265	return NULL;
    266}
    267
    268static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block)
    269{
    270	struct rb_node *n = c->buffer_tree.rb_node;
    271	struct dm_buffer *b;
    272	struct dm_buffer *best = NULL;
    273
    274	while (n) {
    275		b = container_of(n, struct dm_buffer, node);
    276
    277		if (b->block == block)
    278			return b;
    279
    280		if (block <= b->block) {
    281			n = n->rb_left;
    282			best = b;
    283		} else {
    284			n = n->rb_right;
    285		}
    286	}
    287
    288	return best;
    289}
    290
    291static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
    292{
    293	struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
    294	struct dm_buffer *found;
    295
    296	while (*new) {
    297		found = container_of(*new, struct dm_buffer, node);
    298
    299		if (found->block == b->block) {
    300			BUG_ON(found != b);
    301			return;
    302		}
    303
    304		parent = *new;
    305		new = b->block < found->block ?
    306			&found->node.rb_left : &found->node.rb_right;
    307	}
    308
    309	rb_link_node(&b->node, parent, new);
    310	rb_insert_color(&b->node, &c->buffer_tree);
    311}
    312
    313static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
    314{
    315	rb_erase(&b->node, &c->buffer_tree);
    316}
    317
    318/*----------------------------------------------------------------*/
    319
    320static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
    321{
    322	unsigned char data_mode;
    323	long diff;
    324
    325	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
    326		&dm_bufio_allocated_kmem_cache,
    327		&dm_bufio_allocated_get_free_pages,
    328		&dm_bufio_allocated_vmalloc,
    329	};
    330
    331	data_mode = b->data_mode;
    332	diff = (long)b->c->block_size;
    333	if (unlink)
    334		diff = -diff;
    335
    336	spin_lock(&global_spinlock);
    337
    338	*class_ptr[data_mode] += diff;
    339
    340	dm_bufio_current_allocated += diff;
    341
    342	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
    343		dm_bufio_peak_allocated = dm_bufio_current_allocated;
    344
    345	b->accessed = 1;
    346
    347	if (!unlink) {
    348		list_add(&b->global_list, &global_queue);
    349		global_num++;
    350		if (dm_bufio_current_allocated > dm_bufio_cache_size)
    351			queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
    352	} else {
    353		list_del(&b->global_list);
    354		global_num--;
    355	}
    356
    357	spin_unlock(&global_spinlock);
    358}
    359
    360/*
    361 * Change the number of clients and recalculate per-client limit.
    362 */
    363static void __cache_size_refresh(void)
    364{
    365	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
    366	BUG_ON(dm_bufio_client_count < 0);
    367
    368	dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
    369
    370	/*
    371	 * Use default if set to 0 and report the actual cache size used.
    372	 */
    373	if (!dm_bufio_cache_size_latch) {
    374		(void)cmpxchg(&dm_bufio_cache_size, 0,
    375			      dm_bufio_default_cache_size);
    376		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
    377	}
    378}
    379
    380/*
    381 * Allocating buffer data.
    382 *
    383 * Small buffers are allocated with kmem_cache, to use space optimally.
    384 *
    385 * For large buffers, we choose between get_free_pages and vmalloc.
    386 * Each has advantages and disadvantages.
    387 *
    388 * __get_free_pages can randomly fail if the memory is fragmented.
    389 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
    390 * as low as 128M) so using it for caching is not appropriate.
    391 *
    392 * If the allocation may fail we use __get_free_pages. Memory fragmentation
     393 * won't have a fatal effect here; it just causes flushes of some other
     394 * buffers, and more I/O will be performed. Don't use __get_free_pages if it
    395 * always fails (i.e. order >= MAX_ORDER).
    396 *
    397 * If the allocation shouldn't fail we use __vmalloc. This is only for the
    398 * initial reserve allocation, so there's no risk of wasting all vmalloc
    399 * space.
    400 */
    401static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
    402			       unsigned char *data_mode)
    403{
    404	if (unlikely(c->slab_cache != NULL)) {
    405		*data_mode = DATA_MODE_SLAB;
    406		return kmem_cache_alloc(c->slab_cache, gfp_mask);
    407	}
    408
    409	if (c->block_size <= KMALLOC_MAX_SIZE &&
    410	    gfp_mask & __GFP_NORETRY) {
    411		*data_mode = DATA_MODE_GET_FREE_PAGES;
    412		return (void *)__get_free_pages(gfp_mask,
    413						c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
    414	}
    415
    416	*data_mode = DATA_MODE_VMALLOC;
    417
    418	/*
    419	 * __vmalloc allocates the data pages and auxiliary structures with
    420	 * gfp_flags that were specified, but pagetables are always allocated
    421	 * with GFP_KERNEL, no matter what was specified as gfp_mask.
    422	 *
    423	 * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
    424	 * all allocations done by this process (including pagetables) are done
    425	 * as if GFP_NOIO was specified.
    426	 */
    427	if (gfp_mask & __GFP_NORETRY) {
    428		unsigned noio_flag = memalloc_noio_save();
    429		void *ptr = __vmalloc(c->block_size, gfp_mask);
    430
    431		memalloc_noio_restore(noio_flag);
    432		return ptr;
    433	}
    434
    435	return __vmalloc(c->block_size, gfp_mask);
    436}
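
/*
 * A sketch of which path alloc_buffer_data() takes for a few block sizes,
 * assuming 4 KiB pages (PAGE_SHIFT - SECTOR_SHIFT == 3) and a block size
 * within KMALLOC_MAX_SIZE:
 *
 *	block_size = 512:     c->slab_cache was set up at client creation,
 *	                      so the data comes from kmem_cache_alloc().
 *	block_size = 64 KiB:  sectors_per_block_bits = 7, so a __GFP_NORETRY
 *	                      allocation calls __get_free_pages() with order
 *	                      7 - 3 = 4, i.e. 16 contiguous pages.
 *	reserve allocations:  no __GFP_NORETRY is passed, so large blocks fall
 *	                      through to __vmalloc(), which does not randomly
 *	                      fail because of fragmentation.
 */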
    437
    438/*
    439 * Free buffer's data.
    440 */
    441static void free_buffer_data(struct dm_bufio_client *c,
    442			     void *data, unsigned char data_mode)
    443{
    444	switch (data_mode) {
    445	case DATA_MODE_SLAB:
    446		kmem_cache_free(c->slab_cache, data);
    447		break;
    448
    449	case DATA_MODE_GET_FREE_PAGES:
    450		free_pages((unsigned long)data,
    451			   c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
    452		break;
    453
    454	case DATA_MODE_VMALLOC:
    455		vfree(data);
    456		break;
    457
    458	default:
    459		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
    460		       data_mode);
    461		BUG();
    462	}
    463}
    464
    465/*
    466 * Allocate buffer and its data.
    467 */
    468static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
    469{
    470	struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
    471
    472	if (!b)
    473		return NULL;
    474
    475	b->c = c;
    476
    477	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
    478	if (!b->data) {
    479		kmem_cache_free(c->slab_buffer, b);
    480		return NULL;
    481	}
    482
    483#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
    484	b->stack_len = 0;
    485#endif
    486	return b;
    487}
    488
    489/*
    490 * Free buffer and its data.
    491 */
    492static void free_buffer(struct dm_buffer *b)
    493{
    494	struct dm_bufio_client *c = b->c;
    495
    496	free_buffer_data(c, b->data, b->data_mode);
    497	kmem_cache_free(c->slab_buffer, b);
    498}
    499
    500/*
    501 * Link buffer to the buffer tree and clean or dirty queue.
    502 */
    503static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
    504{
    505	struct dm_bufio_client *c = b->c;
    506
    507	c->n_buffers[dirty]++;
    508	b->block = block;
    509	b->list_mode = dirty;
    510	list_add(&b->lru_list, &c->lru[dirty]);
    511	__insert(b->c, b);
    512	b->last_accessed = jiffies;
    513
    514	adjust_total_allocated(b, false);
    515}
    516
    517/*
    518 * Unlink buffer from the buffer tree and dirty or clean queue.
    519 */
    520static void __unlink_buffer(struct dm_buffer *b)
    521{
    522	struct dm_bufio_client *c = b->c;
    523
    524	BUG_ON(!c->n_buffers[b->list_mode]);
    525
    526	c->n_buffers[b->list_mode]--;
    527	__remove(b->c, b);
    528	list_del(&b->lru_list);
    529
    530	adjust_total_allocated(b, true);
    531}
    532
    533/*
    534 * Place the buffer to the head of dirty or clean LRU queue.
    535 */
    536static void __relink_lru(struct dm_buffer *b, int dirty)
    537{
    538	struct dm_bufio_client *c = b->c;
    539
    540	b->accessed = 1;
    541
    542	BUG_ON(!c->n_buffers[b->list_mode]);
    543
    544	c->n_buffers[b->list_mode]--;
    545	c->n_buffers[dirty]++;
    546	b->list_mode = dirty;
    547	list_move(&b->lru_list, &c->lru[dirty]);
    548	b->last_accessed = jiffies;
    549}
    550
    551/*----------------------------------------------------------------
    552 * Submit I/O on the buffer.
    553 *
    554 * Bio interface is faster but it has some problems:
    555 *	the vector list is limited (increasing this limit increases
    556 *	memory-consumption per buffer, so it is not viable);
    557 *
    558 *	the memory must be direct-mapped, not vmalloced;
    559 *
     560 * If the buffer is small enough and it is not vmalloced, try using the
     561 * bio interface.
    562 *
    563 * If the buffer is big, if it is vmalloced or if the underlying device
    564 * rejects the bio because it is too large, use dm-io layer to do the I/O.
    565 * The dm-io layer splits the I/O into multiple requests, avoiding the above
    566 * shortcomings.
    567 *--------------------------------------------------------------*/
    568
    569/*
    570 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
    571 * that the request was handled directly with bio interface.
    572 */
    573static void dmio_complete(unsigned long error, void *context)
    574{
    575	struct dm_buffer *b = context;
    576
    577	b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
    578}
    579
    580static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
    581		     unsigned n_sectors, unsigned offset)
    582{
    583	int r;
    584	struct dm_io_request io_req = {
    585		.bi_op = rw,
    586		.bi_op_flags = 0,
    587		.notify.fn = dmio_complete,
    588		.notify.context = b,
    589		.client = b->c->dm_io,
    590	};
    591	struct dm_io_region region = {
    592		.bdev = b->c->bdev,
    593		.sector = sector,
    594		.count = n_sectors,
    595	};
    596
    597	if (b->data_mode != DATA_MODE_VMALLOC) {
    598		io_req.mem.type = DM_IO_KMEM;
    599		io_req.mem.ptr.addr = (char *)b->data + offset;
    600	} else {
    601		io_req.mem.type = DM_IO_VMA;
    602		io_req.mem.ptr.vma = (char *)b->data + offset;
    603	}
    604
    605	r = dm_io(&io_req, 1, &region, NULL);
    606	if (unlikely(r))
    607		b->end_io(b, errno_to_blk_status(r));
    608}
    609
    610static void bio_complete(struct bio *bio)
    611{
    612	struct dm_buffer *b = bio->bi_private;
    613	blk_status_t status = bio->bi_status;
    614	bio_uninit(bio);
    615	kfree(bio);
    616	b->end_io(b, status);
    617}
    618
    619static void use_bio(struct dm_buffer *b, int rw, sector_t sector,
    620		    unsigned n_sectors, unsigned offset)
    621{
    622	struct bio *bio;
    623	char *ptr;
    624	unsigned vec_size, len;
    625
    626	vec_size = b->c->block_size >> PAGE_SHIFT;
    627	if (unlikely(b->c->sectors_per_block_bits < PAGE_SHIFT - SECTOR_SHIFT))
    628		vec_size += 2;
    629
    630	bio = bio_kmalloc(vec_size, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN);
    631	if (!bio) {
    632dmio:
    633		use_dmio(b, rw, sector, n_sectors, offset);
    634		return;
    635	}
    636	bio_init(bio, b->c->bdev, bio->bi_inline_vecs, vec_size, rw);
    637	bio->bi_iter.bi_sector = sector;
    638	bio->bi_end_io = bio_complete;
    639	bio->bi_private = b;
    640
    641	ptr = (char *)b->data + offset;
    642	len = n_sectors << SECTOR_SHIFT;
    643
    644	do {
    645		unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len);
    646		if (!bio_add_page(bio, virt_to_page(ptr), this_step,
    647				  offset_in_page(ptr))) {
    648			bio_put(bio);
    649			goto dmio;
    650		}
    651
    652		len -= this_step;
    653		ptr += this_step;
    654	} while (len > 0);
    655
    656	submit_bio(bio);
    657}
    658
    659static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
    660{
    661	sector_t sector;
    662
    663	if (likely(c->sectors_per_block_bits >= 0))
    664		sector = block << c->sectors_per_block_bits;
    665	else
    666		sector = block * (c->block_size >> SECTOR_SHIFT);
    667	sector += c->start;
    668
    669	return sector;
    670}
    671
    672static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t))
    673{
    674	unsigned n_sectors;
    675	sector_t sector;
    676	unsigned offset, end;
    677
    678	b->end_io = end_io;
    679
    680	sector = block_to_sector(b->c, b->block);
    681
    682	if (rw != REQ_OP_WRITE) {
    683		n_sectors = b->c->block_size >> SECTOR_SHIFT;
    684		offset = 0;
    685	} else {
    686		if (b->c->write_callback)
    687			b->c->write_callback(b);
    688		offset = b->write_start;
    689		end = b->write_end;
    690		offset &= -DM_BUFIO_WRITE_ALIGN;
    691		end += DM_BUFIO_WRITE_ALIGN - 1;
    692		end &= -DM_BUFIO_WRITE_ALIGN;
    693		if (unlikely(end > b->c->block_size))
    694			end = b->c->block_size;
    695
    696		sector += offset >> SECTOR_SHIFT;
    697		n_sectors = (end - offset) >> SECTOR_SHIFT;
    698	}
    699
    700	if (b->data_mode != DATA_MODE_VMALLOC)
    701		use_bio(b, rw, sector, n_sectors, offset);
    702	else
    703		use_dmio(b, rw, sector, n_sectors, offset);
    704}
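
/*
 * Worked example of the write alignment above (hypothetical numbers): for a
 * 4096-byte block that was partially dirtied with dirty_start = 100 and
 * dirty_end = 3000, submit_io() rounds the range outwards to the
 * DM_BUFIO_WRITE_ALIGN boundary: offset = 100 & -4096 = 0 and
 * end = (3000 + 4095) & -4096 = 4096, so the whole block is written as
 * n_sectors = 8 starting at the block's first sector.
 */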
    705
    706/*----------------------------------------------------------------
    707 * Writing dirty buffers
    708 *--------------------------------------------------------------*/
    709
    710/*
    711 * The endio routine for write.
    712 *
    713 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
    714 * it.
    715 */
    716static void write_endio(struct dm_buffer *b, blk_status_t status)
    717{
    718	b->write_error = status;
    719	if (unlikely(status)) {
    720		struct dm_bufio_client *c = b->c;
    721
    722		(void)cmpxchg(&c->async_write_error, 0,
    723				blk_status_to_errno(status));
    724	}
    725
    726	BUG_ON(!test_bit(B_WRITING, &b->state));
    727
    728	smp_mb__before_atomic();
    729	clear_bit(B_WRITING, &b->state);
    730	smp_mb__after_atomic();
    731
    732	wake_up_bit(&b->state, B_WRITING);
    733}
    734
    735/*
    736 * Initiate a write on a dirty buffer, but don't wait for it.
    737 *
    738 * - If the buffer is not dirty, exit.
     739 * - If there is some previous write going on, wait for it to finish (we can't
    740 *   have two writes on the same buffer simultaneously).
    741 * - Submit our write and don't wait on it. We set B_WRITING indicating
    742 *   that there is a write in progress.
    743 */
    744static void __write_dirty_buffer(struct dm_buffer *b,
    745				 struct list_head *write_list)
    746{
    747	if (!test_bit(B_DIRTY, &b->state))
    748		return;
    749
    750	clear_bit(B_DIRTY, &b->state);
    751	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
    752
    753	b->write_start = b->dirty_start;
    754	b->write_end = b->dirty_end;
    755
    756	if (!write_list)
    757		submit_io(b, REQ_OP_WRITE, write_endio);
    758	else
    759		list_add_tail(&b->write_list, write_list);
    760}
    761
    762static void __flush_write_list(struct list_head *write_list)
    763{
    764	struct blk_plug plug;
    765	blk_start_plug(&plug);
    766	while (!list_empty(write_list)) {
    767		struct dm_buffer *b =
    768			list_entry(write_list->next, struct dm_buffer, write_list);
    769		list_del(&b->write_list);
    770		submit_io(b, REQ_OP_WRITE, write_endio);
    771		cond_resched();
    772	}
    773	blk_finish_plug(&plug);
    774}
    775
    776/*
    777 * Wait until any activity on the buffer finishes.  Possibly write the
    778 * buffer if it is dirty.  When this function finishes, there is no I/O
    779 * running on the buffer and the buffer is not dirty.
    780 */
    781static void __make_buffer_clean(struct dm_buffer *b)
    782{
    783	BUG_ON(b->hold_count);
    784
    785	if (!b->state)	/* fast case */
    786		return;
    787
    788	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
    789	__write_dirty_buffer(b, NULL);
    790	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
    791}
    792
    793/*
    794 * Find some buffer that is not held by anybody, clean it, unlink it and
    795 * return it.
    796 */
    797static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
    798{
    799	struct dm_buffer *b;
    800
    801	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
    802		BUG_ON(test_bit(B_WRITING, &b->state));
    803		BUG_ON(test_bit(B_DIRTY, &b->state));
    804
    805		if (!b->hold_count) {
    806			__make_buffer_clean(b);
    807			__unlink_buffer(b);
    808			return b;
    809		}
    810		cond_resched();
    811	}
    812
    813	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
    814		BUG_ON(test_bit(B_READING, &b->state));
    815
    816		if (!b->hold_count) {
    817			__make_buffer_clean(b);
    818			__unlink_buffer(b);
    819			return b;
    820		}
    821		cond_resched();
    822	}
    823
    824	return NULL;
    825}
    826
    827/*
    828 * Wait until some other threads free some buffer or release hold count on
    829 * some buffer.
    830 *
    831 * This function is entered with c->lock held, drops it and regains it
    832 * before exiting.
    833 */
    834static void __wait_for_free_buffer(struct dm_bufio_client *c)
    835{
    836	DECLARE_WAITQUEUE(wait, current);
    837
    838	add_wait_queue(&c->free_buffer_wait, &wait);
    839	set_current_state(TASK_UNINTERRUPTIBLE);
    840	dm_bufio_unlock(c);
    841
    842	io_schedule();
    843
    844	remove_wait_queue(&c->free_buffer_wait, &wait);
    845
    846	dm_bufio_lock(c);
    847}
    848
    849enum new_flag {
    850	NF_FRESH = 0,
    851	NF_READ = 1,
    852	NF_GET = 2,
    853	NF_PREFETCH = 3
    854};
    855
    856/*
    857 * Allocate a new buffer. If the allocation is not possible, wait until
    858 * some other thread frees a buffer.
    859 *
    860 * May drop the lock and regain it.
    861 */
    862static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
    863{
    864	struct dm_buffer *b;
    865	bool tried_noio_alloc = false;
    866
    867	/*
    868	 * dm-bufio is resistant to allocation failures (it just keeps
     869	 * one buffer reserved in case all the allocations fail).
    870	 * So set flags to not try too hard:
    871	 *	GFP_NOWAIT: don't wait; if we need to sleep we'll release our
    872	 *		    mutex and wait ourselves.
    873	 *	__GFP_NORETRY: don't retry and rather return failure
    874	 *	__GFP_NOMEMALLOC: don't use emergency reserves
    875	 *	__GFP_NOWARN: don't print a warning in case of failure
    876	 *
    877	 * For debugging, if we set the cache size to 1, no new buffers will
    878	 * be allocated.
    879	 */
    880	while (1) {
    881		if (dm_bufio_cache_size_latch != 1) {
    882			b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
    883			if (b)
    884				return b;
    885		}
    886
    887		if (nf == NF_PREFETCH)
    888			return NULL;
    889
    890		if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
    891			dm_bufio_unlock(c);
    892			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
    893			dm_bufio_lock(c);
    894			if (b)
    895				return b;
    896			tried_noio_alloc = true;
    897		}
    898
    899		if (!list_empty(&c->reserved_buffers)) {
    900			b = list_entry(c->reserved_buffers.next,
    901				       struct dm_buffer, lru_list);
    902			list_del(&b->lru_list);
    903			c->need_reserved_buffers++;
    904
    905			return b;
    906		}
    907
    908		b = __get_unclaimed_buffer(c);
    909		if (b)
    910			return b;
    911
    912		__wait_for_free_buffer(c);
    913	}
    914}
    915
    916static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
    917{
    918	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
    919
    920	if (!b)
    921		return NULL;
    922
    923	if (c->alloc_callback)
    924		c->alloc_callback(b);
    925
    926	return b;
    927}
    928
    929/*
    930 * Free a buffer and wake other threads waiting for free buffers.
    931 */
    932static void __free_buffer_wake(struct dm_buffer *b)
    933{
    934	struct dm_bufio_client *c = b->c;
    935
    936	if (!c->need_reserved_buffers)
    937		free_buffer(b);
    938	else {
    939		list_add(&b->lru_list, &c->reserved_buffers);
    940		c->need_reserved_buffers--;
    941	}
    942
    943	wake_up(&c->free_buffer_wait);
    944}
    945
    946static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
    947					struct list_head *write_list)
    948{
    949	struct dm_buffer *b, *tmp;
    950
    951	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
    952		BUG_ON(test_bit(B_READING, &b->state));
    953
    954		if (!test_bit(B_DIRTY, &b->state) &&
    955		    !test_bit(B_WRITING, &b->state)) {
    956			__relink_lru(b, LIST_CLEAN);
    957			continue;
    958		}
    959
    960		if (no_wait && test_bit(B_WRITING, &b->state))
    961			return;
    962
    963		__write_dirty_buffer(b, write_list);
    964		cond_resched();
    965	}
    966}
    967
    968/*
     969 * Check if we're over the watermark.
     970 * If the number of dirty buffers exceeds DM_BUFIO_WRITEBACK_RATIO times the
     971 * number of clean buffers, start writing them back asynchronously.
    972 */
    973static void __check_watermark(struct dm_bufio_client *c,
    974			      struct list_head *write_list)
    975{
    976	if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO)
    977		__write_dirty_buffers_async(c, 1, write_list);
    978}
    979
    980/*----------------------------------------------------------------
    981 * Getting a buffer
    982 *--------------------------------------------------------------*/
    983
    984static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
    985				     enum new_flag nf, int *need_submit,
    986				     struct list_head *write_list)
    987{
    988	struct dm_buffer *b, *new_b = NULL;
    989
    990	*need_submit = 0;
    991
    992	b = __find(c, block);
    993	if (b)
    994		goto found_buffer;
    995
    996	if (nf == NF_GET)
    997		return NULL;
    998
    999	new_b = __alloc_buffer_wait(c, nf);
   1000	if (!new_b)
   1001		return NULL;
   1002
   1003	/*
   1004	 * We've had a period where the mutex was unlocked, so need to
   1005	 * recheck the buffer tree.
   1006	 */
   1007	b = __find(c, block);
   1008	if (b) {
   1009		__free_buffer_wake(new_b);
   1010		goto found_buffer;
   1011	}
   1012
   1013	__check_watermark(c, write_list);
   1014
   1015	b = new_b;
   1016	b->hold_count = 1;
   1017	b->read_error = 0;
   1018	b->write_error = 0;
   1019	__link_buffer(b, block, LIST_CLEAN);
   1020
   1021	if (nf == NF_FRESH) {
   1022		b->state = 0;
   1023		return b;
   1024	}
   1025
   1026	b->state = 1 << B_READING;
   1027	*need_submit = 1;
   1028
   1029	return b;
   1030
   1031found_buffer:
   1032	if (nf == NF_PREFETCH)
   1033		return NULL;
   1034	/*
   1035	 * Note: it is essential that we don't wait for the buffer to be
   1036	 * read if dm_bufio_get function is used. Both dm_bufio_get and
   1037	 * dm_bufio_prefetch can be used in the driver request routine.
   1038	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
   1039	 * the same buffer, it would deadlock if we waited.
   1040	 */
   1041	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
   1042		return NULL;
   1043
   1044	b->hold_count++;
   1045	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
   1046		     test_bit(B_WRITING, &b->state));
   1047	return b;
   1048}
   1049
   1050/*
   1051 * The endio routine for reading: set the error, clear the bit and wake up
   1052 * anyone waiting on the buffer.
   1053 */
   1054static void read_endio(struct dm_buffer *b, blk_status_t status)
   1055{
   1056	b->read_error = status;
   1057
   1058	BUG_ON(!test_bit(B_READING, &b->state));
   1059
   1060	smp_mb__before_atomic();
   1061	clear_bit(B_READING, &b->state);
   1062	smp_mb__after_atomic();
   1063
   1064	wake_up_bit(&b->state, B_READING);
   1065}
   1066
   1067/*
   1068 * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
   1069 * functions is similar except that dm_bufio_new doesn't read the
   1070 * buffer from the disk (assuming that the caller overwrites all the data
   1071 * and uses dm_bufio_mark_buffer_dirty to write new data back).
   1072 */
   1073static void *new_read(struct dm_bufio_client *c, sector_t block,
   1074		      enum new_flag nf, struct dm_buffer **bp)
   1075{
   1076	int need_submit;
   1077	struct dm_buffer *b;
   1078
   1079	LIST_HEAD(write_list);
   1080
   1081	dm_bufio_lock(c);
   1082	b = __bufio_new(c, block, nf, &need_submit, &write_list);
   1083#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
   1084	if (b && b->hold_count == 1)
   1085		buffer_record_stack(b);
   1086#endif
   1087	dm_bufio_unlock(c);
   1088
   1089	__flush_write_list(&write_list);
   1090
   1091	if (!b)
   1092		return NULL;
   1093
   1094	if (need_submit)
   1095		submit_io(b, REQ_OP_READ, read_endio);
   1096
   1097	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
   1098
   1099	if (b->read_error) {
   1100		int error = blk_status_to_errno(b->read_error);
   1101
   1102		dm_bufio_release(b);
   1103
   1104		return ERR_PTR(error);
   1105	}
   1106
   1107	*bp = b;
   1108
   1109	return b->data;
   1110}
   1111
   1112void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
   1113		   struct dm_buffer **bp)
   1114{
   1115	return new_read(c, block, NF_GET, bp);
   1116}
   1117EXPORT_SYMBOL_GPL(dm_bufio_get);
   1118
   1119void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
   1120		    struct dm_buffer **bp)
   1121{
   1122	BUG_ON(dm_bufio_in_request());
   1123
   1124	return new_read(c, block, NF_READ, bp);
   1125}
   1126EXPORT_SYMBOL_GPL(dm_bufio_read);
   1127
   1128void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
   1129		   struct dm_buffer **bp)
   1130{
   1131	BUG_ON(dm_bufio_in_request());
   1132
   1133	return new_read(c, block, NF_FRESH, bp);
   1134}
   1135EXPORT_SYMBOL_GPL(dm_bufio_new);
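
/*
 * Usage sketch for the read/modify/write cycle (a hypothetical caller, not a
 * real target):
 *
 *	struct dm_buffer *b;
 *	u8 *data = dm_bufio_read(c, block, &b);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	data[0] ^= 0xff;
 *	dm_bufio_mark_buffer_dirty(b);
 *	dm_bufio_release(b);
 *	return dm_bufio_write_dirty_buffers(c);
 *
 * dm_bufio_new() fits the same pattern when the caller overwrites the whole
 * block and does not need the old contents read in.
 */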
   1136
   1137void dm_bufio_prefetch(struct dm_bufio_client *c,
   1138		       sector_t block, unsigned n_blocks)
   1139{
   1140	struct blk_plug plug;
   1141
   1142	LIST_HEAD(write_list);
   1143
   1144	BUG_ON(dm_bufio_in_request());
   1145
   1146	blk_start_plug(&plug);
   1147	dm_bufio_lock(c);
   1148
   1149	for (; n_blocks--; block++) {
   1150		int need_submit;
   1151		struct dm_buffer *b;
   1152		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
   1153				&write_list);
   1154		if (unlikely(!list_empty(&write_list))) {
   1155			dm_bufio_unlock(c);
   1156			blk_finish_plug(&plug);
   1157			__flush_write_list(&write_list);
   1158			blk_start_plug(&plug);
   1159			dm_bufio_lock(c);
   1160		}
   1161		if (unlikely(b != NULL)) {
   1162			dm_bufio_unlock(c);
   1163
   1164			if (need_submit)
   1165				submit_io(b, REQ_OP_READ, read_endio);
   1166			dm_bufio_release(b);
   1167
   1168			cond_resched();
   1169
   1170			if (!n_blocks)
   1171				goto flush_plug;
   1172			dm_bufio_lock(c);
   1173		}
   1174	}
   1175
   1176	dm_bufio_unlock(c);
   1177
   1178flush_plug:
   1179	blk_finish_plug(&plug);
   1180}
   1181EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
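
/*
 * A minimal prefetch pattern (hypothetical caller): start the reads for a
 * range of blocks early, without holding any buffer, and pick the data up
 * with a later dm_bufio_read(), which is also where a read error, if any,
 * gets reported:
 *
 *	dm_bufio_prefetch(c, first_block, n_blocks);
 *	...
 *	data = dm_bufio_read(c, first_block, &b);
 */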
   1182
   1183void dm_bufio_release(struct dm_buffer *b)
   1184{
   1185	struct dm_bufio_client *c = b->c;
   1186
   1187	dm_bufio_lock(c);
   1188
   1189	BUG_ON(!b->hold_count);
   1190
   1191	b->hold_count--;
   1192	if (!b->hold_count) {
   1193		wake_up(&c->free_buffer_wait);
   1194
   1195		/*
   1196		 * If there were errors on the buffer, and the buffer is not
   1197		 * to be written, free the buffer. There is no point in caching
   1198		 * invalid buffer.
    1199		 * an invalid buffer.
   1200		if ((b->read_error || b->write_error) &&
   1201		    !test_bit(B_READING, &b->state) &&
   1202		    !test_bit(B_WRITING, &b->state) &&
   1203		    !test_bit(B_DIRTY, &b->state)) {
   1204			__unlink_buffer(b);
   1205			__free_buffer_wake(b);
   1206		}
   1207	}
   1208
   1209	dm_bufio_unlock(c);
   1210}
   1211EXPORT_SYMBOL_GPL(dm_bufio_release);
   1212
   1213void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
   1214					unsigned start, unsigned end)
   1215{
   1216	struct dm_bufio_client *c = b->c;
   1217
   1218	BUG_ON(start >= end);
   1219	BUG_ON(end > b->c->block_size);
   1220
   1221	dm_bufio_lock(c);
   1222
   1223	BUG_ON(test_bit(B_READING, &b->state));
   1224
   1225	if (!test_and_set_bit(B_DIRTY, &b->state)) {
   1226		b->dirty_start = start;
   1227		b->dirty_end = end;
   1228		__relink_lru(b, LIST_DIRTY);
   1229	} else {
   1230		if (start < b->dirty_start)
   1231			b->dirty_start = start;
   1232		if (end > b->dirty_end)
   1233			b->dirty_end = end;
   1234	}
   1235
   1236	dm_bufio_unlock(c);
   1237}
   1238EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
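
/*
 * Example of how partial dirty ranges accumulate (hypothetical offsets):
 * marking bytes [0, 512) dirty and later [1024, 2048) on the same buffer
 * leaves dirty_start = 0 and dirty_end = 2048, so the eventual write covers
 * one contiguous range that includes the clean bytes in between.
 */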
   1239
   1240void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
   1241{
   1242	dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
   1243}
   1244EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
   1245
   1246void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
   1247{
   1248	LIST_HEAD(write_list);
   1249
   1250	BUG_ON(dm_bufio_in_request());
   1251
   1252	dm_bufio_lock(c);
   1253	__write_dirty_buffers_async(c, 0, &write_list);
   1254	dm_bufio_unlock(c);
   1255	__flush_write_list(&write_list);
   1256}
   1257EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
   1258
   1259/*
   1260 * For performance, it is essential that the buffers are written asynchronously
   1261 * and simultaneously (so that the block layer can merge the writes) and then
   1262 * waited upon.
   1263 *
   1264 * Finally, we flush hardware disk cache.
   1265 */
   1266int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
   1267{
   1268	int a, f;
   1269	unsigned long buffers_processed = 0;
   1270	struct dm_buffer *b, *tmp;
   1271
   1272	LIST_HEAD(write_list);
   1273
   1274	dm_bufio_lock(c);
   1275	__write_dirty_buffers_async(c, 0, &write_list);
   1276	dm_bufio_unlock(c);
   1277	__flush_write_list(&write_list);
   1278	dm_bufio_lock(c);
   1279
   1280again:
   1281	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
   1282		int dropped_lock = 0;
   1283
   1284		if (buffers_processed < c->n_buffers[LIST_DIRTY])
   1285			buffers_processed++;
   1286
   1287		BUG_ON(test_bit(B_READING, &b->state));
   1288
   1289		if (test_bit(B_WRITING, &b->state)) {
   1290			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
   1291				dropped_lock = 1;
   1292				b->hold_count++;
   1293				dm_bufio_unlock(c);
   1294				wait_on_bit_io(&b->state, B_WRITING,
   1295					       TASK_UNINTERRUPTIBLE);
   1296				dm_bufio_lock(c);
   1297				b->hold_count--;
   1298			} else
   1299				wait_on_bit_io(&b->state, B_WRITING,
   1300					       TASK_UNINTERRUPTIBLE);
   1301		}
   1302
   1303		if (!test_bit(B_DIRTY, &b->state) &&
   1304		    !test_bit(B_WRITING, &b->state))
   1305			__relink_lru(b, LIST_CLEAN);
   1306
   1307		cond_resched();
   1308
   1309		/*
   1310		 * If we dropped the lock, the list is no longer consistent,
   1311		 * so we must restart the search.
   1312		 *
   1313		 * In the most common case, the buffer just processed is
   1314		 * relinked to the clean list, so we won't loop scanning the
   1315		 * same buffer again and again.
   1316		 *
   1317		 * This may livelock if there is another thread simultaneously
   1318		 * dirtying buffers, so we count the number of buffers walked
   1319		 * and if it exceeds the total number of buffers, it means that
   1320		 * someone is doing some writes simultaneously with us.  In
    1321		 * this case, stop dropping the lock, so the walk is guaranteed to finish.
   1322		 */
   1323		if (dropped_lock)
   1324			goto again;
   1325	}
   1326	wake_up(&c->free_buffer_wait);
   1327	dm_bufio_unlock(c);
   1328
   1329	a = xchg(&c->async_write_error, 0);
   1330	f = dm_bufio_issue_flush(c);
   1331	if (a)
   1332		return a;
   1333
   1334	return f;
   1335}
   1336EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
   1337
   1338/*
   1339 * Use dm-io to send an empty barrier to flush the device.
   1340 */
   1341int dm_bufio_issue_flush(struct dm_bufio_client *c)
   1342{
   1343	struct dm_io_request io_req = {
   1344		.bi_op = REQ_OP_WRITE,
   1345		.bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
   1346		.mem.type = DM_IO_KMEM,
   1347		.mem.ptr.addr = NULL,
   1348		.client = c->dm_io,
   1349	};
   1350	struct dm_io_region io_reg = {
   1351		.bdev = c->bdev,
   1352		.sector = 0,
   1353		.count = 0,
   1354	};
   1355
   1356	BUG_ON(dm_bufio_in_request());
   1357
   1358	return dm_io(&io_req, 1, &io_reg, NULL);
   1359}
   1360EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
   1361
   1362/*
    1363 * Use dm-io to send a discard request to the device.
   1364 */
   1365int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
   1366{
   1367	struct dm_io_request io_req = {
   1368		.bi_op = REQ_OP_DISCARD,
   1369		.bi_op_flags = REQ_SYNC,
   1370		.mem.type = DM_IO_KMEM,
   1371		.mem.ptr.addr = NULL,
   1372		.client = c->dm_io,
   1373	};
   1374	struct dm_io_region io_reg = {
   1375		.bdev = c->bdev,
   1376		.sector = block_to_sector(c, block),
   1377		.count = block_to_sector(c, count),
   1378	};
   1379
   1380	BUG_ON(dm_bufio_in_request());
   1381
   1382	return dm_io(&io_req, 1, &io_reg, NULL);
   1383}
   1384EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
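
/*
 * Worked example of the block-to-sector conversion used above (hypothetical
 * numbers): with block_size = 4096 and c->start = 0, discarding count = 4
 * blocks starting at block = 10 issues a discard for sectors [80, 112),
 * i.e. sector 10 << 3 = 80 with a length of 4 << 3 = 32 sectors.
 */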
   1385
   1386/*
   1387 * We first delete any other buffer that may be at that new location.
   1388 *
   1389 * Then, we write the buffer to the original location if it was dirty.
   1390 *
   1391 * Then, if we are the only one who is holding the buffer, relink the buffer
   1392 * in the buffer tree for the new location.
   1393 *
   1394 * If there was someone else holding the buffer, we write it to the new
   1395 * location but not relink it, because that other user needs to have the buffer
   1396 * at the same place.
   1397 */
   1398void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
   1399{
   1400	struct dm_bufio_client *c = b->c;
   1401	struct dm_buffer *new;
   1402
   1403	BUG_ON(dm_bufio_in_request());
   1404
   1405	dm_bufio_lock(c);
   1406
   1407retry:
   1408	new = __find(c, new_block);
   1409	if (new) {
   1410		if (new->hold_count) {
   1411			__wait_for_free_buffer(c);
   1412			goto retry;
   1413		}
   1414
   1415		/*
   1416		 * FIXME: Is there any point waiting for a write that's going
   1417		 * to be overwritten in a bit?
   1418		 */
   1419		__make_buffer_clean(new);
   1420		__unlink_buffer(new);
   1421		__free_buffer_wake(new);
   1422	}
   1423
   1424	BUG_ON(!b->hold_count);
   1425	BUG_ON(test_bit(B_READING, &b->state));
   1426
   1427	__write_dirty_buffer(b, NULL);
   1428	if (b->hold_count == 1) {
   1429		wait_on_bit_io(&b->state, B_WRITING,
   1430			       TASK_UNINTERRUPTIBLE);
   1431		set_bit(B_DIRTY, &b->state);
   1432		b->dirty_start = 0;
   1433		b->dirty_end = c->block_size;
   1434		__unlink_buffer(b);
   1435		__link_buffer(b, new_block, LIST_DIRTY);
   1436	} else {
   1437		sector_t old_block;
   1438		wait_on_bit_lock_io(&b->state, B_WRITING,
   1439				    TASK_UNINTERRUPTIBLE);
   1440		/*
   1441		 * Relink buffer to "new_block" so that write_callback
   1442		 * sees "new_block" as a block number.
   1443		 * After the write, link the buffer back to old_block.
    1444		 * All this must be done under the bufio lock, so that the block
    1445		 * number change isn't visible to other threads.
   1446		 */
   1447		old_block = b->block;
   1448		__unlink_buffer(b);
   1449		__link_buffer(b, new_block, b->list_mode);
   1450		submit_io(b, REQ_OP_WRITE, write_endio);
   1451		wait_on_bit_io(&b->state, B_WRITING,
   1452			       TASK_UNINTERRUPTIBLE);
   1453		__unlink_buffer(b);
   1454		__link_buffer(b, old_block, b->list_mode);
   1455	}
   1456
   1457	dm_bufio_unlock(c);
   1458	dm_bufio_release(b);
   1459}
   1460EXPORT_SYMBOL_GPL(dm_bufio_release_move);
   1461
   1462static void forget_buffer_locked(struct dm_buffer *b)
   1463{
   1464	if (likely(!b->hold_count) && likely(!b->state)) {
   1465		__unlink_buffer(b);
   1466		__free_buffer_wake(b);
   1467	}
   1468}
   1469
   1470/*
   1471 * Free the given buffer.
   1472 *
   1473 * This is just a hint, if the buffer is in use or dirty, this function
   1474 * does nothing.
   1475 */
   1476void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
   1477{
   1478	struct dm_buffer *b;
   1479
   1480	dm_bufio_lock(c);
   1481
   1482	b = __find(c, block);
   1483	if (b)
   1484		forget_buffer_locked(b);
   1485
   1486	dm_bufio_unlock(c);
   1487}
   1488EXPORT_SYMBOL_GPL(dm_bufio_forget);
   1489
   1490void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
   1491{
   1492	struct dm_buffer *b;
   1493	sector_t end_block = block + n_blocks;
   1494
   1495	while (block < end_block) {
   1496		dm_bufio_lock(c);
   1497
   1498		b = __find_next(c, block);
   1499		if (b) {
   1500			block = b->block + 1;
   1501			forget_buffer_locked(b);
   1502		}
   1503
   1504		dm_bufio_unlock(c);
   1505
   1506		if (!b)
   1507			break;
   1508	}
   1509
   1510}
   1511EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
   1512
   1513void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
   1514{
   1515	c->minimum_buffers = n;
   1516}
   1517EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
   1518
   1519unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
   1520{
   1521	return c->block_size;
   1522}
   1523EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
   1524
   1525sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
   1526{
   1527	sector_t s = bdev_nr_sectors(c->bdev);
   1528	if (s >= c->start)
   1529		s -= c->start;
   1530	else
   1531		s = 0;
   1532	if (likely(c->sectors_per_block_bits >= 0))
   1533		s >>= c->sectors_per_block_bits;
   1534	else
   1535		sector_div(s, c->block_size >> SECTOR_SHIFT);
   1536	return s;
   1537}
   1538EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
   1539
   1540struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c)
   1541{
   1542	return c->dm_io;
   1543}
   1544EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);
   1545
   1546sector_t dm_bufio_get_block_number(struct dm_buffer *b)
   1547{
   1548	return b->block;
   1549}
   1550EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
   1551
   1552void *dm_bufio_get_block_data(struct dm_buffer *b)
   1553{
   1554	return b->data;
   1555}
   1556EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
   1557
   1558void *dm_bufio_get_aux_data(struct dm_buffer *b)
   1559{
   1560	return b + 1;
   1561}
   1562EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
   1563
   1564struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
   1565{
   1566	return b->c;
   1567}
   1568EXPORT_SYMBOL_GPL(dm_bufio_get_client);
   1569
   1570static void drop_buffers(struct dm_bufio_client *c)
   1571{
   1572	struct dm_buffer *b;
   1573	int i;
   1574	bool warned = false;
   1575
   1576	BUG_ON(dm_bufio_in_request());
   1577
   1578	/*
   1579	 * An optimization so that the buffers are not written one-by-one.
   1580	 */
   1581	dm_bufio_write_dirty_buffers_async(c);
   1582
   1583	dm_bufio_lock(c);
   1584
   1585	while ((b = __get_unclaimed_buffer(c)))
   1586		__free_buffer_wake(b);
   1587
   1588	for (i = 0; i < LIST_SIZE; i++)
   1589		list_for_each_entry(b, &c->lru[i], lru_list) {
   1590			WARN_ON(!warned);
   1591			warned = true;
   1592			DMERR("leaked buffer %llx, hold count %u, list %d",
   1593			      (unsigned long long)b->block, b->hold_count, i);
   1594#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
   1595			stack_trace_print(b->stack_entries, b->stack_len, 1);
   1596			/* mark unclaimed to avoid BUG_ON below */
   1597			b->hold_count = 0;
   1598#endif
   1599		}
   1600
   1601#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
   1602	while ((b = __get_unclaimed_buffer(c)))
   1603		__free_buffer_wake(b);
   1604#endif
   1605
   1606	for (i = 0; i < LIST_SIZE; i++)
   1607		BUG_ON(!list_empty(&c->lru[i]));
   1608
   1609	dm_bufio_unlock(c);
   1610}
   1611
   1612/*
    1613 * We may not be able to evict this buffer if IO is pending or the client
   1614 * is still using it.  Caller is expected to know buffer is too old.
   1615 *
   1616 * And if GFP_NOFS is used, we must not do any I/O because we hold
   1617 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
    1618 * rerouted to a different bufio client.
   1619 */
   1620static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
   1621{
   1622	if (!(gfp & __GFP_FS)) {
   1623		if (test_bit(B_READING, &b->state) ||
   1624		    test_bit(B_WRITING, &b->state) ||
   1625		    test_bit(B_DIRTY, &b->state))
   1626			return false;
   1627	}
   1628
   1629	if (b->hold_count)
   1630		return false;
   1631
   1632	__make_buffer_clean(b);
   1633	__unlink_buffer(b);
   1634	__free_buffer_wake(b);
   1635
   1636	return true;
   1637}
   1638
   1639static unsigned long get_retain_buffers(struct dm_bufio_client *c)
   1640{
   1641	unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
   1642	if (likely(c->sectors_per_block_bits >= 0))
   1643		retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
   1644	else
   1645		retain_bytes /= c->block_size;
   1646	return retain_bytes;
   1647}
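
/*
 * Worked example (hypothetical client): with the default retain_bytes of
 * 256 KiB and a 4096-byte block size, sectors_per_block_bits is 3, so
 * 262144 >> (3 + SECTOR_SHIFT) = 64 buffers are kept around even when the
 * shrinker or the age-based cleanup runs.
 */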
   1648
   1649static void __scan(struct dm_bufio_client *c)
   1650{
   1651	int l;
   1652	struct dm_buffer *b, *tmp;
   1653	unsigned long freed = 0;
   1654	unsigned long count = c->n_buffers[LIST_CLEAN] +
   1655			      c->n_buffers[LIST_DIRTY];
   1656	unsigned long retain_target = get_retain_buffers(c);
   1657
   1658	for (l = 0; l < LIST_SIZE; l++) {
   1659		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
   1660			if (count - freed <= retain_target)
   1661				atomic_long_set(&c->need_shrink, 0);
   1662			if (!atomic_long_read(&c->need_shrink))
   1663				return;
   1664			if (__try_evict_buffer(b, GFP_KERNEL)) {
   1665				atomic_long_dec(&c->need_shrink);
   1666				freed++;
   1667			}
   1668			cond_resched();
   1669		}
   1670	}
   1671}
   1672
   1673static void shrink_work(struct work_struct *w)
   1674{
   1675	struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);
   1676
   1677	dm_bufio_lock(c);
   1678	__scan(c);
   1679	dm_bufio_unlock(c);
   1680}
   1681
   1682static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
   1683{
   1684	struct dm_bufio_client *c;
   1685
   1686	c = container_of(shrink, struct dm_bufio_client, shrinker);
   1687	atomic_long_add(sc->nr_to_scan, &c->need_shrink);
   1688	queue_work(dm_bufio_wq, &c->shrink_work);
   1689
   1690	return sc->nr_to_scan;
   1691}
   1692
   1693static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
   1694{
   1695	struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
   1696	unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
   1697			      READ_ONCE(c->n_buffers[LIST_DIRTY]);
   1698	unsigned long retain_target = get_retain_buffers(c);
   1699	unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
   1700
   1701	if (unlikely(count < retain_target))
   1702		count = 0;
   1703	else
   1704		count -= retain_target;
   1705
   1706	if (unlikely(count < queued_for_cleanup))
   1707		count = 0;
   1708	else
   1709		count -= queued_for_cleanup;
   1710
   1711	return count;
   1712}
   1713
   1714/*
   1715 * Create the buffering interface
   1716 */
   1717struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
   1718					       unsigned reserved_buffers, unsigned aux_size,
   1719					       void (*alloc_callback)(struct dm_buffer *),
   1720					       void (*write_callback)(struct dm_buffer *))
   1721{
   1722	int r;
   1723	struct dm_bufio_client *c;
   1724	unsigned i;
   1725	char slab_name[27];
   1726
   1727	if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
   1728		DMERR("%s: block size not specified or is not multiple of 512b", __func__);
   1729		r = -EINVAL;
   1730		goto bad_client;
   1731	}
   1732
   1733	c = kzalloc(sizeof(*c), GFP_KERNEL);
   1734	if (!c) {
   1735		r = -ENOMEM;
   1736		goto bad_client;
   1737	}
   1738	c->buffer_tree = RB_ROOT;
   1739
   1740	c->bdev = bdev;
   1741	c->block_size = block_size;
   1742	if (is_power_of_2(block_size))
   1743		c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
   1744	else
   1745		c->sectors_per_block_bits = -1;
   1746
   1747	c->alloc_callback = alloc_callback;
   1748	c->write_callback = write_callback;
   1749
   1750	for (i = 0; i < LIST_SIZE; i++) {
   1751		INIT_LIST_HEAD(&c->lru[i]);
   1752		c->n_buffers[i] = 0;
   1753	}
   1754
   1755	mutex_init(&c->lock);
   1756	INIT_LIST_HEAD(&c->reserved_buffers);
   1757	c->need_reserved_buffers = reserved_buffers;
   1758
   1759	dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);
   1760
   1761	init_waitqueue_head(&c->free_buffer_wait);
   1762	c->async_write_error = 0;
   1763
   1764	c->dm_io = dm_io_client_create();
   1765	if (IS_ERR(c->dm_io)) {
   1766		r = PTR_ERR(c->dm_io);
   1767		goto bad_dm_io;
   1768	}
   1769
   1770	if (block_size <= KMALLOC_MAX_SIZE &&
   1771	    (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
   1772		unsigned align = min(1U << __ffs(block_size), (unsigned)PAGE_SIZE);
   1773		snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", block_size);
   1774		c->slab_cache = kmem_cache_create(slab_name, block_size, align,
   1775						  SLAB_RECLAIM_ACCOUNT, NULL);
   1776		if (!c->slab_cache) {
   1777			r = -ENOMEM;
   1778			goto bad;
   1779		}
   1780	}
   1781	if (aux_size)
   1782		snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer-%u", aux_size);
   1783	else
   1784		snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer");
   1785	c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
   1786					   0, SLAB_RECLAIM_ACCOUNT, NULL);
   1787	if (!c->slab_buffer) {
   1788		r = -ENOMEM;
   1789		goto bad;
   1790	}
   1791
   1792	while (c->need_reserved_buffers) {
   1793		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
   1794
   1795		if (!b) {
   1796			r = -ENOMEM;
   1797			goto bad;
   1798		}
   1799		__free_buffer_wake(b);
   1800	}
   1801
   1802	INIT_WORK(&c->shrink_work, shrink_work);
   1803	atomic_long_set(&c->need_shrink, 0);
   1804
   1805	c->shrinker.count_objects = dm_bufio_shrink_count;
   1806	c->shrinker.scan_objects = dm_bufio_shrink_scan;
   1807	c->shrinker.seeks = 1;
   1808	c->shrinker.batch = 0;
   1809	r = register_shrinker(&c->shrinker);
   1810	if (r)
   1811		goto bad;
   1812
   1813	mutex_lock(&dm_bufio_clients_lock);
   1814	dm_bufio_client_count++;
   1815	list_add(&c->client_list, &dm_bufio_all_clients);
   1816	__cache_size_refresh();
   1817	mutex_unlock(&dm_bufio_clients_lock);
   1818
   1819	return c;
   1820
   1821bad:
   1822	while (!list_empty(&c->reserved_buffers)) {
   1823		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
   1824						 struct dm_buffer, lru_list);
   1825		list_del(&b->lru_list);
   1826		free_buffer(b);
   1827	}
   1828	kmem_cache_destroy(c->slab_cache);
   1829	kmem_cache_destroy(c->slab_buffer);
   1830	dm_io_client_destroy(c->dm_io);
   1831bad_dm_io:
   1832	mutex_destroy(&c->lock);
   1833	kfree(c);
   1834bad_client:
   1835	return ERR_PTR(r);
   1836}
   1837EXPORT_SYMBOL_GPL(dm_bufio_client_create);
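
        /*
         * Usage sketch (hypothetical caller, not compiled): create a client
         * for 4 KiB blocks on an already opened block device, read one block
         * and tear everything down again.  my_read_block() is an invented
         * helper; only the dm_bufio_* calls and their signatures are real.
         *
         *	static int my_read_block(struct block_device *bdev, sector_t block)
         *	{
         *		struct dm_bufio_client *c;
         *		struct dm_buffer *b;
         *		void *data;
         *		int r = 0;
         *
         *		c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
         *		if (IS_ERR(c))
         *			return PTR_ERR(c);
         *
         *		data = dm_bufio_read(c, block, &b);
         *		if (IS_ERR(data)) {
         *			r = PTR_ERR(data);
         *		} else {
         *			// use the 4096 bytes at "data", then drop the reference
         *			dm_bufio_release(b);
         *		}
         *
         *		dm_bufio_client_destroy(c);
         *		return r;
         *	}
         */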
   1838
   1839/*
   1840 * Free the buffering interface.
    1841 * The caller must ensure that no buffers are still referenced.
   1842 */
   1843void dm_bufio_client_destroy(struct dm_bufio_client *c)
   1844{
   1845	unsigned i;
   1846
   1847	drop_buffers(c);
   1848
   1849	unregister_shrinker(&c->shrinker);
   1850	flush_work(&c->shrink_work);
   1851
   1852	mutex_lock(&dm_bufio_clients_lock);
   1853
   1854	list_del(&c->client_list);
   1855	dm_bufio_client_count--;
   1856	__cache_size_refresh();
   1857
   1858	mutex_unlock(&dm_bufio_clients_lock);
   1859
   1860	BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
   1861	BUG_ON(c->need_reserved_buffers);
   1862
   1863	while (!list_empty(&c->reserved_buffers)) {
   1864		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
   1865						 struct dm_buffer, lru_list);
   1866		list_del(&b->lru_list);
   1867		free_buffer(b);
   1868	}
   1869
   1870	for (i = 0; i < LIST_SIZE; i++)
   1871		if (c->n_buffers[i])
    1872			DMERR("leaked buffer count %d: %lu", i, c->n_buffers[i]);
   1873
   1874	for (i = 0; i < LIST_SIZE; i++)
   1875		BUG_ON(c->n_buffers[i]);
   1876
   1877	kmem_cache_destroy(c->slab_cache);
   1878	kmem_cache_destroy(c->slab_buffer);
   1879	dm_io_client_destroy(c->dm_io);
   1880	mutex_destroy(&c->lock);
   1881	kfree(c);
   1882}
   1883EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
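
        /*
         * Shutdown sketch (hypothetical target, not compiled): a long-lived
         * client is usually flushed with dm_bufio_write_dirty_buffers() before
         * it is destroyed, since that is the last point at which write errors
         * can still be reported to the caller.  struct my_target and
         * my_target_dtr() are invented names; every buffer is assumed to have
         * been released already, as dm_bufio_client_destroy() requires.
         *
         *	struct my_target {
         *		struct dm_bufio_client *bufio;
         *	};
         *
         *	static void my_target_dtr(struct my_target *mt)
         *	{
         *		if (dm_bufio_write_dirty_buffers(mt->bufio))
         *			DMERR("writeback failed during shutdown");
         *		dm_bufio_client_destroy(mt->bufio);
         *	}
         */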
   1884
   1885void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
   1886{
   1887	c->start = start;
   1888}
   1889EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
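
        /*
         * Usage sketch (hypothetical on-disk layout): a format that keeps an
         * 8-sector (4 KiB) header at the front of the device can offset the
         * client so that bufio block 0 starts right behind it:
         *
         *	dm_bufio_set_sector_offset(c, 8);
         *
         * With a 4 KiB block size, block "b" is then read from sector
         * 8 + b * 8 of the underlying device.
         */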
   1890
   1891static unsigned get_max_age_hz(void)
   1892{
   1893	unsigned max_age = READ_ONCE(dm_bufio_max_age);
   1894
   1895	if (max_age > UINT_MAX / HZ)
   1896		max_age = UINT_MAX / HZ;
   1897
   1898	return max_age * HZ;
   1899}
   1900
   1901static bool older_than(struct dm_buffer *b, unsigned long age_hz)
   1902{
   1903	return time_after_eq(jiffies, b->last_accessed + age_hz);
   1904}
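
        /*
         * Worked example (assuming HZ == 250): with the default max_age of
         * DM_BUFIO_DEFAULT_AGE_SECS (300 s), get_max_age_hz() returns
         * 300 * 250 = 75000 jiffies, and older_than() reports a buffer as old
         * once at least 75000 ticks have passed since its last_accessed stamp.
         * The UINT_MAX / HZ clamp only matters for very large values written
         * to the max_age_seconds module parameter, where max_age * HZ would
         * otherwise overflow.
         */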
   1905
   1906static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
   1907{
   1908	struct dm_buffer *b, *tmp;
   1909	unsigned long retain_target = get_retain_buffers(c);
   1910	unsigned long count;
   1911	LIST_HEAD(write_list);
   1912
   1913	dm_bufio_lock(c);
   1914
   1915	__check_watermark(c, &write_list);
   1916	if (unlikely(!list_empty(&write_list))) {
   1917		dm_bufio_unlock(c);
   1918		__flush_write_list(&write_list);
   1919		dm_bufio_lock(c);
   1920	}
   1921
   1922	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
   1923	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
   1924		if (count <= retain_target)
   1925			break;
   1926
   1927		if (!older_than(b, age_hz))
   1928			break;
   1929
   1930		if (__try_evict_buffer(b, 0))
   1931			count--;
   1932
   1933		cond_resched();
   1934	}
   1935
   1936	dm_bufio_unlock(c);
   1937}
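
        /*
         * Worked example: with the default retain_bytes of
         * DM_BUFIO_DEFAULT_RETAIN_BYTES (256 KiB) and a 4 KiB block size,
         * retain_target is 64 buffers.  The loop above walks the clean LRU
         * from its cold end and stops once the client is down to 64 buffers
         * or it hits a buffer accessed within the last max_age interval, so
         * an idle client keeps roughly 256 KiB of cache warm indefinitely.
         */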
   1938
   1939static void do_global_cleanup(struct work_struct *w)
   1940{
   1941	struct dm_bufio_client *locked_client = NULL;
   1942	struct dm_bufio_client *current_client;
   1943	struct dm_buffer *b;
   1944	unsigned spinlock_hold_count;
   1945	unsigned long threshold = dm_bufio_cache_size -
   1946		dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
   1947	unsigned long loops = global_num * 2;
   1948
   1949	mutex_lock(&dm_bufio_clients_lock);
   1950
   1951	while (1) {
   1952		cond_resched();
   1953
   1954		spin_lock(&global_spinlock);
   1955		if (unlikely(dm_bufio_current_allocated <= threshold))
   1956			break;
   1957
   1958		spinlock_hold_count = 0;
   1959get_next:
   1960		if (!loops--)
   1961			break;
   1962		if (unlikely(list_empty(&global_queue)))
   1963			break;
   1964		b = list_entry(global_queue.prev, struct dm_buffer, global_list);
   1965
   1966		if (b->accessed) {
   1967			b->accessed = 0;
   1968			list_move(&b->global_list, &global_queue);
   1969			if (likely(++spinlock_hold_count < 16))
   1970				goto get_next;
   1971			spin_unlock(&global_spinlock);
   1972			continue;
   1973		}
   1974
   1975		current_client = b->c;
   1976		if (unlikely(current_client != locked_client)) {
   1977			if (locked_client)
   1978				dm_bufio_unlock(locked_client);
   1979
   1980			if (!dm_bufio_trylock(current_client)) {
   1981				spin_unlock(&global_spinlock);
   1982				dm_bufio_lock(current_client);
   1983				locked_client = current_client;
   1984				continue;
   1985			}
   1986
   1987			locked_client = current_client;
   1988		}
   1989
   1990		spin_unlock(&global_spinlock);
   1991
   1992		if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) {
   1993			spin_lock(&global_spinlock);
   1994			list_move(&b->global_list, &global_queue);
   1995			spin_unlock(&global_spinlock);
   1996		}
   1997	}
   1998
   1999	spin_unlock(&global_spinlock);
   2000
   2001	if (locked_client)
   2002		dm_bufio_unlock(locked_client);
   2003
   2004	mutex_unlock(&dm_bufio_clients_lock);
   2005}
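
        /*
         * Worked example: with DM_BUFIO_LOW_WATERMARK_RATIO == 16 the
         * threshold is 15/16 of dm_bufio_cache_size, e.g. 375 MB for a 400 MB
         * cache limit.  A cleanup pass evicts cold buffers from the tail of
         * the global LRU (recently "accessed" ones are rotated to the head
         * and skipped) until the total allocation drops below that watermark;
         * the "loops" bound of twice the global buffer count keeps a single
         * pass from spinning forever when buffers keep being re-accessed.
         */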
   2006
   2007static void cleanup_old_buffers(void)
   2008{
   2009	unsigned long max_age_hz = get_max_age_hz();
   2010	struct dm_bufio_client *c;
   2011
   2012	mutex_lock(&dm_bufio_clients_lock);
   2013
   2014	__cache_size_refresh();
   2015
   2016	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
   2017		__evict_old_buffers(c, max_age_hz);
   2018
   2019	mutex_unlock(&dm_bufio_clients_lock);
   2020}
   2021
   2022static void work_fn(struct work_struct *w)
   2023{
   2024	cleanup_old_buffers();
   2025
   2026	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
   2027			   DM_BUFIO_WORK_TIMER_SECS * HZ);
   2028}
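
        /*
         * Worked example: the cleanup work re-queues itself every
         * DM_BUFIO_WORK_TIMER_SECS (30 s), so with the default 300 s max_age
         * a buffer that stops being accessed is evicted by one of the
         * periodic scans roughly 300-330 seconds later (subject to the
         * retain_bytes floor described above).
         */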
   2029
   2030/*----------------------------------------------------------------
   2031 * Module setup
   2032 *--------------------------------------------------------------*/
   2033
   2034/*
   2035 * This is called only once for the whole dm_bufio module.
    2036 * It initializes the memory limit.
   2037 */
   2038static int __init dm_bufio_init(void)
   2039{
   2040	__u64 mem;
   2041
   2042	dm_bufio_allocated_kmem_cache = 0;
   2043	dm_bufio_allocated_get_free_pages = 0;
   2044	dm_bufio_allocated_vmalloc = 0;
   2045	dm_bufio_current_allocated = 0;
   2046
   2047	mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
   2048			       DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
   2049
   2050	if (mem > ULONG_MAX)
   2051		mem = ULONG_MAX;
   2052
   2053#ifdef CONFIG_MMU
   2054	if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
   2055		mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
   2056#endif
   2057
   2058	dm_bufio_default_cache_size = mem;
   2059
   2060	mutex_lock(&dm_bufio_clients_lock);
   2061	__cache_size_refresh();
   2062	mutex_unlock(&dm_bufio_clients_lock);
   2063
   2064	dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
   2065	if (!dm_bufio_wq)
   2066		return -ENOMEM;
   2067
   2068	INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
   2069	INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
   2070	queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
   2071			   DM_BUFIO_WORK_TIMER_SECS * HZ);
   2072
   2073	return 0;
   2074}
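
        /*
         * Worked example: with DM_BUFIO_MEMORY_PERCENT == 2, a machine with
         * 8 GiB of low memory gets a default cache limit of roughly 164 MiB.
         * On CONFIG_MMU the limit is further capped at
         * DM_BUFIO_VMALLOC_PERCENT (25 %) of the vmalloc area, which is what
         * usually dominates on 32-bit systems.  The computed value is only a
         * default: a non-zero max_cache_size_bytes module parameter takes
         * precedence in __cache_size_refresh().
         */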
   2075
   2076/*
   2077 * This is called once when unloading the dm_bufio module.
   2078 */
   2079static void __exit dm_bufio_exit(void)
   2080{
   2081	int bug = 0;
   2082
   2083	cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
   2084	destroy_workqueue(dm_bufio_wq);
   2085
   2086	if (dm_bufio_client_count) {
   2087		DMCRIT("%s: dm_bufio_client_count leaked: %d",
   2088			__func__, dm_bufio_client_count);
   2089		bug = 1;
   2090	}
   2091
   2092	if (dm_bufio_current_allocated) {
   2093		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
   2094			__func__, dm_bufio_current_allocated);
   2095		bug = 1;
   2096	}
   2097
   2098	if (dm_bufio_allocated_get_free_pages) {
   2099		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
   2100		       __func__, dm_bufio_allocated_get_free_pages);
   2101		bug = 1;
   2102	}
   2103
   2104	if (dm_bufio_allocated_vmalloc) {
    2105		DMCRIT("%s: dm_bufio_allocated_vmalloc leaked: %lu",
   2106		       __func__, dm_bufio_allocated_vmalloc);
   2107		bug = 1;
   2108	}
   2109
   2110	BUG_ON(bug);
   2111}
   2112
   2113module_init(dm_bufio_init)
   2114module_exit(dm_bufio_exit)
   2115
   2116module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
   2117MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
   2118
   2119module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
   2120MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
   2121
   2122module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR);
   2123MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
   2124
   2125module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
   2126MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
   2127
   2128module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
   2129MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
   2130
   2131module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
   2132MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
   2133
   2134module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
   2135MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
   2136
   2137module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
   2138MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
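
        /*
         * These module_param_named() lines expose the knobs above under
         * /sys/module/dm_bufio/parameters/; for example, writing to
         * /sys/module/dm_bufio/parameters/max_cache_size_bytes changes
         * dm_bufio_cache_size at runtime.  The allocated_*_bytes and
         * current_allocated_bytes counters are read-only statistics.
         */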
   2139
   2140MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
   2141MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
   2142MODULE_LICENSE("GPL");