cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

drbd_bitmap.c (47775B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3   drbd_bitmap.c
      4
      5   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
      6
      7   Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
      8   Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
      9   Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
     10
     11 */
     12
     13#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
     14
     15#include <linux/bitmap.h>
     16#include <linux/vmalloc.h>
     17#include <linux/string.h>
     18#include <linux/drbd.h>
     19#include <linux/slab.h>
     20#include <linux/highmem.h>
     21
     22#include "drbd_int.h"
     23
     24
     25/* OPAQUE outside this file!
     26 * interface defined in drbd_int.h
     27
     28 * convention:
     29 * function name drbd_bm_... => used elsewhere, "public".
     30 * function name      bm_... => internal to implementation, "private".
     31 */
     32
     33
     34/*
     35 * LIMITATIONS:
     36 * We want to support >= peta byte of backend storage, while for now still using
     37 * a granularity of one bit per 4KiB of storage.
     38 * 1 << 50		bytes backend storage (1 PiB)
     39 * 1 << (50 - 12)	bits needed
     40 *	38 --> we need u64 to index and count bits
     41 * 1 << (38 - 3)	bitmap bytes needed
     42 *	35 --> we still need u64 to index and count bytes
     43 *			(that's 32 GiB of bitmap for 1 PiB storage)
     44 * 1 << (35 - 2)	32bit longs needed
     45 *	33 --> we'd even need u64 to index and count 32bit long words.
     46 * 1 << (35 - 3)	64bit longs needed
     47 *	32 --> we could get away with a 32bit unsigned int to index and count
     48 *	64bit long words, but I rather stay with unsigned long for now.
     49 *	We probably should neither count nor point to bytes or long words
     50 *	directly, but either by bitnumber, or by page index and offset.
     51 * 1 << (35 - 12)
     52 *	23 --> we need that many 4KiB pages of bitmap.
     53 *	1 << (23 + 3) --> on a 64bit arch,
     54 *	we need 64 MiB to store the array of page pointers.
     55 *
     56 * Because I'm lazy, and because the resulting patch was too large, too ugly
     57 * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
     58 * (1 << 32) bits * 4k storage.
     59 *
     60
     61 * bitmap storage and IO:
     62 *	Bitmap is stored little endian on disk, and is kept little endian in
     63 *	core memory. Currently we still hold the full bitmap in core as long
     64 *	as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
     65 *	seems excessive.
     66 *
     67 *	We plan to reduce the amount of in-core bitmap pages by paging them in
     68 *	and out against their on-disk location as necessary, but need to make
     69 *	sure we don't cause too much meta data IO, and must not deadlock in
     70 *	tight memory situations. This needs some more work.
     71 */
     72
     73/*
     74 * NOTE
     75 *  Access to the *bm_pages is protected by bm_lock.
     76 *  It is safe to read the other members within the lock.
     77 *
     78 *  drbd_bm_set_bits is called from bio_endio callbacks,
     79 *  We may be called with irq already disabled,
     80 *  so we need spin_lock_irqsave().
     81 *  And we need the kmap_atomic.
     82 */
     83struct drbd_bitmap {
     84	struct page **bm_pages;
     85	spinlock_t bm_lock;
     86
     87	/* exclusively to be used by __al_write_transaction(),
     88	 * drbd_bm_mark_for_writeout() and
     89	 * and drbd_bm_write_hinted() -> bm_rw() called from there.
     90	 */
     91	unsigned int n_bitmap_hints;
     92	unsigned int al_bitmap_hints[AL_UPDATES_PER_TRANSACTION];
     93
     94	/* see LIMITATIONS: above */
     95
     96	unsigned long bm_set;       /* nr of set bits; THINK maybe atomic_t? */
     97	unsigned long bm_bits;
     98	size_t   bm_words;
     99	size_t   bm_number_of_pages;
    100	sector_t bm_dev_capacity;
    101	struct mutex bm_change; /* serializes resize operations */
    102
    103	wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
    104
    105	enum bm_flag bm_flags;
    106
    107	/* debugging aid, in case we are still racy somewhere */
    108	char          *bm_why;
    109	struct task_struct *bm_task;
    110};
    111
    112#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
    113static void __bm_print_lock_info(struct drbd_device *device, const char *func)
    114{
    115	struct drbd_bitmap *b = device->bitmap;
    116	if (!__ratelimit(&drbd_ratelimit_state))
    117		return;
    118	drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n",
    119		 current->comm, task_pid_nr(current),
    120		 func, b->bm_why ?: "?",
    121		 b->bm_task->comm, task_pid_nr(b->bm_task));
    122}
    123
    124void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags)
    125{
    126	struct drbd_bitmap *b = device->bitmap;
    127	int trylock_failed;
    128
    129	if (!b) {
    130		drbd_err(device, "FIXME no bitmap in drbd_bm_lock!?\n");
    131		return;
    132	}
    133
    134	trylock_failed = !mutex_trylock(&b->bm_change);
    135
    136	if (trylock_failed) {
    137		drbd_warn(device, "%s[%d] going to '%s' but bitmap already locked for '%s' by %s[%d]\n",
    138			  current->comm, task_pid_nr(current),
    139			  why, b->bm_why ?: "?",
    140			  b->bm_task->comm, task_pid_nr(b->bm_task));
    141		mutex_lock(&b->bm_change);
    142	}
    143	if (BM_LOCKED_MASK & b->bm_flags)
    144		drbd_err(device, "FIXME bitmap already locked in bm_lock\n");
    145	b->bm_flags |= flags & BM_LOCKED_MASK;
    146
    147	b->bm_why  = why;
    148	b->bm_task = current;
    149}
    150
    151void drbd_bm_unlock(struct drbd_device *device)
    152{
    153	struct drbd_bitmap *b = device->bitmap;
    154	if (!b) {
    155		drbd_err(device, "FIXME no bitmap in drbd_bm_unlock!?\n");
    156		return;
    157	}
    158
    159	if (!(BM_LOCKED_MASK & device->bitmap->bm_flags))
    160		drbd_err(device, "FIXME bitmap not locked in bm_unlock\n");
    161
    162	b->bm_flags &= ~BM_LOCKED_MASK;
    163	b->bm_why  = NULL;
    164	b->bm_task = NULL;
    165	mutex_unlock(&b->bm_change);
    166}
    167
    /*
     * We store some "meta" info about our pages in page->private.
     *
     * At a granularity of 4k storage per bitmap bit:
     *  one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
     *   1<<38 bits,
     *   1<<23 4k bitmap pages.
     * Using 24 bits as page index covers 2 peta byte storage
     * at a granularity of 4k per bit.
     * The index is used to report the failed page idx on io error from the
     * endio handlers.
     */
    #define BM_PAGE_IDX_MASK	((1UL << 24) - 1)

    /* The remaining definitions are bit numbers within page->private. */

    /* this page is currently read in, or written back */
    #define BM_PAGE_IO_LOCK		31
    /* there has been an IO error for this page */
    #define BM_PAGE_IO_ERROR	30
    /* to be able to intelligently skip disk IO:
     * set if bits have been set since last IO. */
    #define BM_PAGE_NEED_WRITEOUT	29
    /* to mark for lazy writeout once syncer cleared all clearable bits:
     * set if bits have been cleared since last IO. */
    #define BM_PAGE_LAZY_WRITEOUT	28
    /* pages marked with this "HINT" will be considered for writeout
     * on activity log transactions */
    #define BM_PAGE_HINT_WRITEOUT	27
    191
    192/* store_page_idx uses non-atomic assignment. It is only used directly after
    193 * allocating the page.  All other bm_set_page_* and bm_clear_page_* need to
    194 * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
    195 * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
    196 * requires it all to be atomic as well. */
    197static void bm_store_page_idx(struct page *page, unsigned long idx)
    198{
    199	BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
    200	set_page_private(page, idx);
    201}
    202
    203static unsigned long bm_page_to_idx(struct page *page)
    204{
    205	return page_private(page) & BM_PAGE_IDX_MASK;
    206}
    207
    208/* As is very unlikely that the same page is under IO from more than one
    209 * context, we can get away with a bit per page and one wait queue per bitmap.
    210 */
    211static void bm_page_lock_io(struct drbd_device *device, int page_nr)
    212{
    213	struct drbd_bitmap *b = device->bitmap;
    214	void *addr = &page_private(b->bm_pages[page_nr]);
    215	wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
    216}
    217
    218static void bm_page_unlock_io(struct drbd_device *device, int page_nr)
    219{
    220	struct drbd_bitmap *b = device->bitmap;
    221	void *addr = &page_private(b->bm_pages[page_nr]);
    222	clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
    223	wake_up(&device->bitmap->bm_io_wait);
    224}
    225
    226/* set _before_ submit_io, so it may be reset due to being changed
    227 * while this page is in flight... will get submitted later again */
    228static void bm_set_page_unchanged(struct page *page)
    229{
    230	/* use cmpxchg? */
    231	clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
    232	clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
    233}
    234
    235static void bm_set_page_need_writeout(struct page *page)
    236{
    237	set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
    238}
    239
    240void drbd_bm_reset_al_hints(struct drbd_device *device)
    241{
    242	device->bitmap->n_bitmap_hints = 0;
    243}
    244
    245/**
    246 * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
    247 * @device:	DRBD device.
    248 * @page_nr:	the bitmap page to mark with the "hint" flag
    249 *
    250 * From within an activity log transaction, we mark a few pages with these
    251 * hints, then call drbd_bm_write_hinted(), which will only write out changed
    252 * pages which are flagged with this mark.
    253 */
    254void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
    255{
    256	struct drbd_bitmap *b = device->bitmap;
    257	struct page *page;
    258	if (page_nr >= device->bitmap->bm_number_of_pages) {
    259		drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n",
    260			 page_nr, (int)device->bitmap->bm_number_of_pages);
    261		return;
    262	}
    263	page = device->bitmap->bm_pages[page_nr];
    264	BUG_ON(b->n_bitmap_hints >= ARRAY_SIZE(b->al_bitmap_hints));
    265	if (!test_and_set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)))
    266		b->al_bitmap_hints[b->n_bitmap_hints++] = page_nr;
    267}
    268
    269static int bm_test_page_unchanged(struct page *page)
    270{
    271	volatile const unsigned long *addr = &page_private(page);
    272	return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
    273}
    274
    275static void bm_set_page_io_err(struct page *page)
    276{
    277	set_bit(BM_PAGE_IO_ERROR, &page_private(page));
    278}
    279
    280static void bm_clear_page_io_err(struct page *page)
    281{
    282	clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
    283}
    284
    285static void bm_set_page_lazy_writeout(struct page *page)
    286{
    287	set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
    288}
    289
    290static int bm_test_page_lazy_writeout(struct page *page)
    291{
    292	return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
    293}
    294
    295/* on a 32bit box, this would allow for exactly (2<<38) bits. */
    296static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
    297{
    298	/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
    299	unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
    300	BUG_ON(page_nr >= b->bm_number_of_pages);
    301	return page_nr;
    302}
    303
    304static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
    305{
    306	/* page_nr = (bitnr/8) >> PAGE_SHIFT; */
    307	unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
    308	BUG_ON(page_nr >= b->bm_number_of_pages);
    309	return page_nr;
    310}
    311
    312static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
    313{
    314	struct page *page = b->bm_pages[idx];
    315	return (unsigned long *) kmap_atomic(page);
    316}
    317
    /* Thin wrapper around __bm_map_pidx(); pair with bm_unmap(). */
    static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
    {
    	unsigned long *p_addr = __bm_map_pidx(b, idx);

    	return p_addr;
    }
    322
    /*
     * Release a mapping obtained from __bm_map_pidx().
     * @p_addr: the address that was returned by the map call
     */
    static void __bm_unmap(unsigned long *p_addr)
    {
    	kunmap_atomic(p_addr);
    }
    327
    /*
     * Release a mapping obtained from bm_map_pidx().
     * @p_addr: the address that was returned by the map call
     */
    static void bm_unmap(unsigned long *p_addr)
    {
    	/* plain call: a void function must not "return" an expression */
    	__bm_unmap(p_addr);
    }
    332
    /* long word offset of _bitmap_ sector */
    #define S2W(s)	((s) << (BM_EXT_SHIFT - BM_BLOCK_SHIFT - LN2_BPL))

    /* Long words per page */
    #define LWPP (PAGE_SIZE/sizeof(long))

    /*
     * Word offset from start of bitmap to word number _in_page_,
     * i.e. modulo longs per page.
     * Written as an explicit mask rather than (X) % LWPP, because
     * Philipp thinks gcc might not optimize the % into & (... - 1).
     */
    #define MLPP(X) ((X) & (LWPP - 1))
    345
    346/*
    347 * actually most functions herein should take a struct drbd_bitmap*, not a
    348 * struct drbd_device*, but for the debug macros I like to have the device around
    349 * to be able to report device specific.
    350 */
    351
    352
    353static void bm_free_pages(struct page **pages, unsigned long number)
    354{
    355	unsigned long i;
    356	if (!pages)
    357		return;
    358
    359	for (i = 0; i < number; i++) {
    360		if (!pages[i]) {
    361			pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n",
    362				 i, number);
    363			continue;
    364		}
    365		__free_page(pages[i]);
    366		pages[i] = NULL;
    367	}
    368}
    369
    /*
     * Free a page pointer array from bm_realloc_pages(), which may have been
     * allocated with either kzalloc() or __vmalloc(); kvfree() handles both.
     */
    static inline void bm_vk_free(void *ptr)
    {
    	kvfree(ptr);
    }
    374
    375/*
    376 * "have" and "want" are NUMBER OF PAGES.
    377 */
    378static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
    379{
    380	struct page **old_pages = b->bm_pages;
    381	struct page **new_pages, *page;
    382	unsigned int i, bytes;
    383	unsigned long have = b->bm_number_of_pages;
    384
    385	BUG_ON(have == 0 && old_pages != NULL);
    386	BUG_ON(have != 0 && old_pages == NULL);
    387
    388	if (have == want)
    389		return old_pages;
    390
    391	/* Trying kmalloc first, falling back to vmalloc.
    392	 * GFP_NOIO, as this is called while drbd IO is "suspended",
    393	 * and during resize or attach on diskless Primary,
    394	 * we must not block on IO to ourselves.
    395	 * Context is receiver thread or dmsetup. */
    396	bytes = sizeof(struct page *)*want;
    397	new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN);
    398	if (!new_pages) {
    399		new_pages = __vmalloc(bytes, GFP_NOIO | __GFP_ZERO);
    400		if (!new_pages)
    401			return NULL;
    402	}
    403
    404	if (want >= have) {
    405		for (i = 0; i < have; i++)
    406			new_pages[i] = old_pages[i];
    407		for (; i < want; i++) {
    408			page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
    409			if (!page) {
    410				bm_free_pages(new_pages + have, i - have);
    411				bm_vk_free(new_pages);
    412				return NULL;
    413			}
    414			/* we want to know which page it is
    415			 * from the endio handlers */
    416			bm_store_page_idx(page, i);
    417			new_pages[i] = page;
    418		}
    419	} else {
    420		for (i = 0; i < want; i++)
    421			new_pages[i] = old_pages[i];
    422		/* NOT HERE, we are outside the spinlock!
    423		bm_free_pages(old_pages + want, have - want);
    424		*/
    425	}
    426
    427	return new_pages;
    428}
    429
    430/*
    431 * allocates the drbd_bitmap and stores it in device->bitmap.
    432 */
    433int drbd_bm_init(struct drbd_device *device)
    434{
    435	struct drbd_bitmap *b = device->bitmap;
    436	WARN_ON(b != NULL);
    437	b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
    438	if (!b)
    439		return -ENOMEM;
    440	spin_lock_init(&b->bm_lock);
    441	mutex_init(&b->bm_change);
    442	init_waitqueue_head(&b->bm_io_wait);
    443
    444	device->bitmap = b;
    445
    446	return 0;
    447}
    448
    449sector_t drbd_bm_capacity(struct drbd_device *device)
    450{
    451	if (!expect(device->bitmap))
    452		return 0;
    453	return device->bitmap->bm_dev_capacity;
    454}
    455
    456/* called on driver unload. TODO: call when a device is destroyed.
    457 */
    458void drbd_bm_cleanup(struct drbd_device *device)
    459{
    460	if (!expect(device->bitmap))
    461		return;
    462	bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
    463	bm_vk_free(device->bitmap->bm_pages);
    464	kfree(device->bitmap);
    465	device->bitmap = NULL;
    466}
    467
    468/*
    469 * since (b->bm_bits % BITS_PER_LONG) != 0,
    470 * this masks out the remaining bits.
    471 * Returns the number of bits cleared.
    472 */
    /*
     * Number of bits per bitmap page, and masks for "bit number modulo page"
     * and "bit number modulo long".  If BITS_PER_PAGE is already defined
     * elsewhere, only verify that it has the value expected here.
     */
    #ifdef BITS_PER_PAGE
    # if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3))
    #  error "ambiguous BITS_PER_PAGE"
    # endif
    #else
    # define BITS_PER_PAGE		(1UL << (PAGE_SHIFT + 3))
    # define BITS_PER_PAGE_MASK	(BITS_PER_PAGE - 1)
    #endif
    #define BITS_PER_LONG_MASK	(BITS_PER_LONG - 1)
    482static int bm_clear_surplus(struct drbd_bitmap *b)
    483{
    484	unsigned long mask;
    485	unsigned long *p_addr, *bm;
    486	int tmp;
    487	int cleared = 0;
    488
    489	/* number of bits modulo bits per page */
    490	tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
    491	/* mask the used bits of the word containing the last bit */
    492	mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
    493	/* bitmap is always stored little endian,
    494	 * on disk and in core memory alike */
    495	mask = cpu_to_lel(mask);
    496
    497	p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
    498	bm = p_addr + (tmp/BITS_PER_LONG);
    499	if (mask) {
    500		/* If mask != 0, we are not exactly aligned, so bm now points
    501		 * to the long containing the last bit.
    502		 * If mask == 0, bm already points to the word immediately
    503		 * after the last (long word aligned) bit. */
    504		cleared = hweight_long(*bm & ~mask);
    505		*bm &= mask;
    506		bm++;
    507	}
    508
    509	if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
    510		/* on a 32bit arch, we may need to zero out
    511		 * a padding long to align with a 64bit remote */
    512		cleared += hweight_long(*bm);
    513		*bm = 0;
    514	}
    515	bm_unmap(p_addr);
    516	return cleared;
    517}
    518
    519static void bm_set_surplus(struct drbd_bitmap *b)
    520{
    521	unsigned long mask;
    522	unsigned long *p_addr, *bm;
    523	int tmp;
    524
    525	/* number of bits modulo bits per page */
    526	tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
    527	/* mask the used bits of the word containing the last bit */
    528	mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
    529	/* bitmap is always stored little endian,
    530	 * on disk and in core memory alike */
    531	mask = cpu_to_lel(mask);
    532
    533	p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
    534	bm = p_addr + (tmp/BITS_PER_LONG);
    535	if (mask) {
    536		/* If mask != 0, we are not exactly aligned, so bm now points
    537		 * to the long containing the last bit.
    538		 * If mask == 0, bm already points to the word immediately
    539		 * after the last (long word aligned) bit. */
    540		*bm |= ~mask;
    541		bm++;
    542	}
    543
    544	if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
    545		/* on a 32bit arch, we may need to zero out
    546		 * a padding long to align with a 64bit remote */
    547		*bm = ~0UL;
    548	}
    549	bm_unmap(p_addr);
    550}
    551
    552/* you better not modify the bitmap while this is running,
    553 * or its results will be stale */
    554static unsigned long bm_count_bits(struct drbd_bitmap *b)
    555{
    556	unsigned long *p_addr;
    557	unsigned long bits = 0;
    558	unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
    559	int idx, last_word;
    560
    561	/* all but last page */
    562	for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
    563		p_addr = __bm_map_pidx(b, idx);
    564		bits += bitmap_weight(p_addr, BITS_PER_PAGE);
    565		__bm_unmap(p_addr);
    566		cond_resched();
    567	}
    568	/* last (or only) page */
    569	last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
    570	p_addr = __bm_map_pidx(b, idx);
    571	bits += bitmap_weight(p_addr, last_word * BITS_PER_LONG);
    572	p_addr[last_word] &= cpu_to_lel(mask);
    573	bits += hweight_long(p_addr[last_word]);
    574	/* 32bit arch, may have an unused padding long */
    575	if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
    576		p_addr[last_word+1] = 0;
    577	__bm_unmap(p_addr);
    578	return bits;
    579}
    580
    581/* offset and len in long words.*/
    582static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
    583{
    584	unsigned long *p_addr, *bm;
    585	unsigned int idx;
    586	size_t do_now, end;
    587
    588	end = offset + len;
    589
    590	if (end > b->bm_words) {
    591		pr_alert("bm_memset end > bm_words\n");
    592		return;
    593	}
    594
    595	while (offset < end) {
    596		do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
    597		idx = bm_word_to_page_idx(b, offset);
    598		p_addr = bm_map_pidx(b, idx);
    599		bm = p_addr + MLPP(offset);
    600		if (bm+do_now > p_addr + LWPP) {
    601			pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
    602			       p_addr, bm, (int)do_now);
    603		} else
    604			memset(bm, c, do_now * sizeof(long));
    605		bm_unmap(p_addr);
    606		bm_set_page_need_writeout(b->bm_pages[idx]);
    607		offset += do_now;
    608	}
    609}
    610
    611/* For the layout, see comment above drbd_md_set_sector_offsets(). */
    612static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
    613{
    614	u64 bitmap_sectors;
    615	if (ldev->md.al_offset == 8)
    616		bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
    617	else
    618		bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
    619	return bitmap_sectors << (9 + 3);
    620}
    621
    622/*
    623 * make sure the bitmap has enough room for the attached storage,
    624 * if necessary, resize.
    625 * called whenever we may have changed the device size.
    626 * returns -ENOMEM if we could not allocate enough memory, 0 on success.
    627 * In case this is actually a resize, we copy the old bitmap into the new one.
    628 * Otherwise, the bitmap is initialized to all bits set.
    629 */
    630int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bits)
    631{
    632	struct drbd_bitmap *b = device->bitmap;
    633	unsigned long bits, words, owords, obits;
    634	unsigned long want, have, onpages; /* number of pages */
    635	struct page **npages, **opages = NULL;
    636	int err = 0;
    637	bool growing;
    638
    639	if (!expect(b))
    640		return -ENOMEM;
    641
    642	drbd_bm_lock(device, "resize", BM_LOCKED_MASK);
    643
    644	drbd_info(device, "drbd_bm_resize called with capacity == %llu\n",
    645			(unsigned long long)capacity);
    646
    647	if (capacity == b->bm_dev_capacity)
    648		goto out;
    649
    650	if (capacity == 0) {
    651		spin_lock_irq(&b->bm_lock);
    652		opages = b->bm_pages;
    653		onpages = b->bm_number_of_pages;
    654		owords = b->bm_words;
    655		b->bm_pages = NULL;
    656		b->bm_number_of_pages =
    657		b->bm_set   =
    658		b->bm_bits  =
    659		b->bm_words =
    660		b->bm_dev_capacity = 0;
    661		spin_unlock_irq(&b->bm_lock);
    662		bm_free_pages(opages, onpages);
    663		bm_vk_free(opages);
    664		goto out;
    665	}
    666	bits  = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
    667
    668	/* if we would use
    669	   words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
    670	   a 32bit host could present the wrong number of words
    671	   to a 64bit host.
    672	*/
    673	words = ALIGN(bits, 64) >> LN2_BPL;
    674
    675	if (get_ldev(device)) {
    676		u64 bits_on_disk = drbd_md_on_disk_bits(device->ldev);
    677		put_ldev(device);
    678		if (bits > bits_on_disk) {
    679			drbd_info(device, "bits = %lu\n", bits);
    680			drbd_info(device, "bits_on_disk = %llu\n", bits_on_disk);
    681			err = -ENOSPC;
    682			goto out;
    683		}
    684	}
    685
    686	want = PFN_UP(words*sizeof(long));
    687	have = b->bm_number_of_pages;
    688	if (want == have) {
    689		D_ASSERT(device, b->bm_pages != NULL);
    690		npages = b->bm_pages;
    691	} else {
    692		if (drbd_insert_fault(device, DRBD_FAULT_BM_ALLOC))
    693			npages = NULL;
    694		else
    695			npages = bm_realloc_pages(b, want);
    696	}
    697
    698	if (!npages) {
    699		err = -ENOMEM;
    700		goto out;
    701	}
    702
    703	spin_lock_irq(&b->bm_lock);
    704	opages = b->bm_pages;
    705	owords = b->bm_words;
    706	obits  = b->bm_bits;
    707
    708	growing = bits > obits;
    709	if (opages && growing && set_new_bits)
    710		bm_set_surplus(b);
    711
    712	b->bm_pages = npages;
    713	b->bm_number_of_pages = want;
    714	b->bm_bits  = bits;
    715	b->bm_words = words;
    716	b->bm_dev_capacity = capacity;
    717
    718	if (growing) {
    719		if (set_new_bits) {
    720			bm_memset(b, owords, 0xff, words-owords);
    721			b->bm_set += bits - obits;
    722		} else
    723			bm_memset(b, owords, 0x00, words-owords);
    724
    725	}
    726
    727	if (want < have) {
    728		/* implicit: (opages != NULL) && (opages != npages) */
    729		bm_free_pages(opages + want, have - want);
    730	}
    731
    732	(void)bm_clear_surplus(b);
    733
    734	spin_unlock_irq(&b->bm_lock);
    735	if (opages != npages)
    736		bm_vk_free(opages);
    737	if (!growing)
    738		b->bm_set = bm_count_bits(b);
    739	drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
    740
    741 out:
    742	drbd_bm_unlock(device);
    743	return err;
    744}
    745
    746/* inherently racy:
    747 * if not protected by other means, return value may be out of date when
    748 * leaving this function...
    749 * we still need to lock it, since it is important that this returns
    750 * bm_set == 0 precisely.
    751 *
    752 * maybe bm_set should be atomic_t ?
    753 */
    754unsigned long _drbd_bm_total_weight(struct drbd_device *device)
    755{
    756	struct drbd_bitmap *b = device->bitmap;
    757	unsigned long s;
    758	unsigned long flags;
    759
    760	if (!expect(b))
    761		return 0;
    762	if (!expect(b->bm_pages))
    763		return 0;
    764
    765	spin_lock_irqsave(&b->bm_lock, flags);
    766	s = b->bm_set;
    767	spin_unlock_irqrestore(&b->bm_lock, flags);
    768
    769	return s;
    770}
    771
    772unsigned long drbd_bm_total_weight(struct drbd_device *device)
    773{
    774	unsigned long s;
    775	/* if I don't have a disk, I don't know about out-of-sync status */
    776	if (!get_ldev_if_state(device, D_NEGOTIATING))
    777		return 0;
    778	s = _drbd_bm_total_weight(device);
    779	put_ldev(device);
    780	return s;
    781}
    782
    783size_t drbd_bm_words(struct drbd_device *device)
    784{
    785	struct drbd_bitmap *b = device->bitmap;
    786	if (!expect(b))
    787		return 0;
    788	if (!expect(b->bm_pages))
    789		return 0;
    790
    791	return b->bm_words;
    792}
    793
    794unsigned long drbd_bm_bits(struct drbd_device *device)
    795{
    796	struct drbd_bitmap *b = device->bitmap;
    797	if (!expect(b))
    798		return 0;
    799
    800	return b->bm_bits;
    801}
    802
    803/* merge number words from buffer into the bitmap starting at offset.
    804 * buffer[i] is expected to be little endian unsigned long.
    805 * bitmap must be locked by drbd_bm_lock.
    806 * currently only used from receive_bitmap.
    807 */
    808void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number,
    809			unsigned long *buffer)
    810{
    811	struct drbd_bitmap *b = device->bitmap;
    812	unsigned long *p_addr, *bm;
    813	unsigned long word, bits;
    814	unsigned int idx;
    815	size_t end, do_now;
    816
    817	end = offset + number;
    818
    819	if (!expect(b))
    820		return;
    821	if (!expect(b->bm_pages))
    822		return;
    823	if (number == 0)
    824		return;
    825	WARN_ON(offset >= b->bm_words);
    826	WARN_ON(end    >  b->bm_words);
    827
    828	spin_lock_irq(&b->bm_lock);
    829	while (offset < end) {
    830		do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
    831		idx = bm_word_to_page_idx(b, offset);
    832		p_addr = bm_map_pidx(b, idx);
    833		bm = p_addr + MLPP(offset);
    834		offset += do_now;
    835		while (do_now--) {
    836			bits = hweight_long(*bm);
    837			word = *bm | *buffer++;
    838			*bm++ = word;
    839			b->bm_set += hweight_long(word) - bits;
    840		}
    841		bm_unmap(p_addr);
    842		bm_set_page_need_writeout(b->bm_pages[idx]);
    843	}
    844	/* with 32bit <-> 64bit cross-platform connect
    845	 * this is only correct for current usage,
    846	 * where we _know_ that we are 64 bit aligned,
    847	 * and know that this function is used in this way, too...
    848	 */
    849	if (end == b->bm_words)
    850		b->bm_set -= bm_clear_surplus(b);
    851	spin_unlock_irq(&b->bm_lock);
    852}
    853
    854/* copy number words from the bitmap starting at offset into the buffer.
    855 * buffer[i] will be little endian unsigned long.
    856 */
    857void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
    858		     unsigned long *buffer)
    859{
    860	struct drbd_bitmap *b = device->bitmap;
    861	unsigned long *p_addr, *bm;
    862	size_t end, do_now;
    863
    864	end = offset + number;
    865
    866	if (!expect(b))
    867		return;
    868	if (!expect(b->bm_pages))
    869		return;
    870
    871	spin_lock_irq(&b->bm_lock);
    872	if ((offset >= b->bm_words) ||
    873	    (end    >  b->bm_words) ||
    874	    (number <= 0))
    875		drbd_err(device, "offset=%lu number=%lu bm_words=%lu\n",
    876			(unsigned long)	offset,
    877			(unsigned long)	number,
    878			(unsigned long) b->bm_words);
    879	else {
    880		while (offset < end) {
    881			do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
    882			p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
    883			bm = p_addr + MLPP(offset);
    884			offset += do_now;
    885			while (do_now--)
    886				*buffer++ = *bm++;
    887			bm_unmap(p_addr);
    888		}
    889	}
    890	spin_unlock_irq(&b->bm_lock);
    891}
    892
    893/* set all bits in the bitmap */
    894void drbd_bm_set_all(struct drbd_device *device)
    895{
    896	struct drbd_bitmap *b = device->bitmap;
    897	if (!expect(b))
    898		return;
    899	if (!expect(b->bm_pages))
    900		return;
    901
    902	spin_lock_irq(&b->bm_lock);
    903	bm_memset(b, 0, 0xff, b->bm_words);
    904	(void)bm_clear_surplus(b);
    905	b->bm_set = b->bm_bits;
    906	spin_unlock_irq(&b->bm_lock);
    907}
    908
    909/* clear all bits in the bitmap */
    910void drbd_bm_clear_all(struct drbd_device *device)
    911{
    912	struct drbd_bitmap *b = device->bitmap;
    913	if (!expect(b))
    914		return;
    915	if (!expect(b->bm_pages))
    916		return;
    917
    918	spin_lock_irq(&b->bm_lock);
    919	bm_memset(b, 0, 0, b->bm_words);
    920	b->bm_set = 0;
    921	spin_unlock_irq(&b->bm_lock);
    922}
    923
    924static void drbd_bm_aio_ctx_destroy(struct kref *kref)
    925{
    926	struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref);
    927	unsigned long flags;
    928
    929	spin_lock_irqsave(&ctx->device->resource->req_lock, flags);
    930	list_del(&ctx->list);
    931	spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags);
    932	put_ldev(ctx->device);
    933	kfree(ctx);
    934}
    935
    936/* bv_page may be a copy, or may be the original */
    937static void drbd_bm_endio(struct bio *bio)
    938{
    939	struct drbd_bm_aio_ctx *ctx = bio->bi_private;
    940	struct drbd_device *device = ctx->device;
    941	struct drbd_bitmap *b = device->bitmap;
    942	unsigned int idx = bm_page_to_idx(bio_first_page_all(bio));
    943
    944	if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
    945	    !bm_test_page_unchanged(b->bm_pages[idx]))
    946		drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx);
    947
    948	if (bio->bi_status) {
    949		/* ctx error will hold the completed-last non-zero error code,
    950		 * in case error codes differ. */
    951		ctx->error = blk_status_to_errno(bio->bi_status);
    952		bm_set_page_io_err(b->bm_pages[idx]);
    953		/* Not identical to on disk version of it.
    954		 * Is BM_PAGE_IO_ERROR enough? */
    955		if (__ratelimit(&drbd_ratelimit_state))
    956			drbd_err(device, "IO ERROR %d on bitmap page idx %u\n",
    957					bio->bi_status, idx);
    958	} else {
    959		bm_clear_page_io_err(b->bm_pages[idx]);
    960		dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx);
    961	}
    962
    963	bm_page_unlock_io(device, idx);
    964
    965	if (ctx->flags & BM_AIO_COPY_PAGES)
    966		mempool_free(bio->bi_io_vec[0].bv_page, &drbd_md_io_page_pool);
    967
    968	bio_put(bio);
    969
    970	if (atomic_dec_and_test(&ctx->in_flight)) {
    971		ctx->done = 1;
    972		wake_up(&device->misc_wait);
    973		kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
    974	}
    975}
    976
    977static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
    978{
    979	struct drbd_device *device = ctx->device;
    980	unsigned int op = (ctx->flags & BM_AIO_READ) ? REQ_OP_READ : REQ_OP_WRITE;
    981	struct bio *bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op,
    982					   GFP_NOIO, &drbd_md_io_bio_set);
    983	struct drbd_bitmap *b = device->bitmap;
    984	struct page *page;
    985	unsigned int len;
    986
    987	sector_t on_disk_sector =
    988		device->ldev->md.md_offset + device->ldev->md.bm_offset;
    989	on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
    990
    991	/* this might happen with very small
    992	 * flexible external meta data device,
    993	 * or with PAGE_SIZE > 4k */
    994	len = min_t(unsigned int, PAGE_SIZE,
    995		(drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9);
    996
    997	/* serialize IO on this page */
    998	bm_page_lock_io(device, page_nr);
    999	/* before memcpy and submit,
   1000	 * so it can be redirtied any time */
   1001	bm_set_page_unchanged(b->bm_pages[page_nr]);
   1002
   1003	if (ctx->flags & BM_AIO_COPY_PAGES) {
   1004		page = mempool_alloc(&drbd_md_io_page_pool,
   1005				GFP_NOIO | __GFP_HIGHMEM);
   1006		copy_highpage(page, b->bm_pages[page_nr]);
   1007		bm_store_page_idx(page, page_nr);
   1008	} else
   1009		page = b->bm_pages[page_nr];
   1010	bio->bi_iter.bi_sector = on_disk_sector;
   1011	/* bio_add_page of a single page to an empty bio will always succeed,
   1012	 * according to api.  Do we want to assert that? */
   1013	bio_add_page(bio, page, len, 0);
   1014	bio->bi_private = ctx;
   1015	bio->bi_end_io = drbd_bm_endio;
   1016
   1017	if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
   1018		bio_io_error(bio);
   1019	} else {
   1020		submit_bio(bio);
   1021		/* this should not count as user activity and cause the
   1022		 * resync to throttle -- see drbd_rs_should_slow_down(). */
   1023		atomic_add(len >> 9, &device->rs_sect_ev);
   1024	}
   1025}
   1026
   1027/*
   1028 * bm_rw: read/write the whole bitmap from/to its on disk location.
   1029 */
   1030static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
   1031{
   1032	struct drbd_bm_aio_ctx *ctx;
   1033	struct drbd_bitmap *b = device->bitmap;
   1034	unsigned int num_pages, i, count = 0;
   1035	unsigned long now;
   1036	char ppb[10];
   1037	int err = 0;
   1038
   1039	/*
   1040	 * We are protected against bitmap disappearing/resizing by holding an
   1041	 * ldev reference (caller must have called get_ldev()).
   1042	 * For read/write, we are protected against changes to the bitmap by
   1043	 * the bitmap lock (see drbd_bitmap_io).
   1044	 * For lazy writeout, we don't care for ongoing changes to the bitmap,
   1045	 * as we submit copies of pages anyways.
   1046	 */
   1047
   1048	ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO);
   1049	if (!ctx)
   1050		return -ENOMEM;
   1051
   1052	*ctx = (struct drbd_bm_aio_ctx) {
   1053		.device = device,
   1054		.start_jif = jiffies,
   1055		.in_flight = ATOMIC_INIT(1),
   1056		.done = 0,
   1057		.flags = flags,
   1058		.error = 0,
   1059		.kref = KREF_INIT(2),
   1060	};
   1061
   1062	if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in drbd_bm_aio_ctx_destroy() */
   1063		drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
   1064		kfree(ctx);
   1065		return -ENODEV;
   1066	}
   1067	/* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from
   1068	   drbd_adm_attach(), after device->ldev was assigned. */
   1069
   1070	if (0 == (ctx->flags & ~BM_AIO_READ))
   1071		WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
   1072
   1073	spin_lock_irq(&device->resource->req_lock);
   1074	list_add_tail(&ctx->list, &device->pending_bitmap_io);
   1075	spin_unlock_irq(&device->resource->req_lock);
   1076
   1077	num_pages = b->bm_number_of_pages;
   1078
   1079	now = jiffies;
   1080
   1081	/* let the layers below us try to merge these bios... */
   1082
   1083	if (flags & BM_AIO_READ) {
   1084		for (i = 0; i < num_pages; i++) {
   1085			atomic_inc(&ctx->in_flight);
   1086			bm_page_io_async(ctx, i);
   1087			++count;
   1088			cond_resched();
   1089		}
   1090	} else if (flags & BM_AIO_WRITE_HINTED) {
   1091		/* ASSERT: BM_AIO_WRITE_ALL_PAGES is not set. */
   1092		unsigned int hint;
   1093		for (hint = 0; hint < b->n_bitmap_hints; hint++) {
   1094			i = b->al_bitmap_hints[hint];
   1095			if (i >= num_pages) /* == -1U: no hint here. */
   1096				continue;
   1097			/* Several AL-extents may point to the same page. */
   1098			if (!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
   1099			    &page_private(b->bm_pages[i])))
   1100				continue;
   1101			/* Has it even changed? */
   1102			if (bm_test_page_unchanged(b->bm_pages[i]))
   1103				continue;
   1104			atomic_inc(&ctx->in_flight);
   1105			bm_page_io_async(ctx, i);
   1106			++count;
   1107		}
   1108	} else {
   1109		for (i = 0; i < num_pages; i++) {
   1110			/* ignore completely unchanged pages */
   1111			if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
   1112				break;
   1113			if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
   1114			    bm_test_page_unchanged(b->bm_pages[i])) {
   1115				dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
   1116				continue;
   1117			}
   1118			/* during lazy writeout,
   1119			 * ignore those pages not marked for lazy writeout. */
   1120			if (lazy_writeout_upper_idx &&
   1121			    !bm_test_page_lazy_writeout(b->bm_pages[i])) {
   1122				dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i);
   1123				continue;
   1124			}
   1125			atomic_inc(&ctx->in_flight);
   1126			bm_page_io_async(ctx, i);
   1127			++count;
   1128			cond_resched();
   1129		}
   1130	}
   1131
   1132	/*
   1133	 * We initialize ctx->in_flight to one to make sure drbd_bm_endio
   1134	 * will not set ctx->done early, and decrement / test it here.  If there
   1135	 * are still some bios in flight, we need to wait for them here.
   1136	 * If all IO is done already (or nothing had been submitted), there is
   1137	 * no need to wait.  Still, we need to put the kref associated with the
   1138	 * "in_flight reached zero, all done" event.
   1139	 */
   1140	if (!atomic_dec_and_test(&ctx->in_flight))
   1141		wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
   1142	else
   1143		kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
   1144
   1145	/* summary for global bitmap IO */
   1146	if (flags == 0) {
   1147		unsigned int ms = jiffies_to_msecs(jiffies - now);
   1148		if (ms > 5) {
   1149			drbd_info(device, "bitmap %s of %u pages took %u ms\n",
   1150				 (flags & BM_AIO_READ) ? "READ" : "WRITE",
   1151				 count, ms);
   1152		}
   1153	}
   1154
   1155	if (ctx->error) {
   1156		drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n");
   1157		drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
   1158		err = -EIO; /* ctx->error ? */
   1159	}
   1160
   1161	if (atomic_read(&ctx->in_flight))
   1162		err = -EIO; /* Disk timeout/force-detach during IO... */
   1163
   1164	now = jiffies;
   1165	if (flags & BM_AIO_READ) {
   1166		b->bm_set = bm_count_bits(b);
   1167		drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
   1168		     jiffies - now);
   1169	}
   1170	now = b->bm_set;
   1171
   1172	if ((flags & ~BM_AIO_READ) == 0)
   1173		drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
   1174		     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
   1175
   1176	kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
   1177	return err;
   1178}
   1179
   1180/**
   1181 * drbd_bm_read() - Read the whole bitmap from its on disk location.
   1182 * @device:	DRBD device.
   1183 */
   1184int drbd_bm_read(struct drbd_device *device) __must_hold(local)
   1185{
   1186	return bm_rw(device, BM_AIO_READ, 0);
   1187}
   1188
   1189/**
   1190 * drbd_bm_write() - Write the whole bitmap to its on disk location.
   1191 * @device:	DRBD device.
   1192 *
   1193 * Will only write pages that have changed since last IO.
   1194 */
   1195int drbd_bm_write(struct drbd_device *device) __must_hold(local)
   1196{
   1197	return bm_rw(device, 0, 0);
   1198}
   1199
   1200/**
   1201 * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
   1202 * @device:	DRBD device.
   1203 *
   1204 * Will write all pages.
   1205 */
   1206int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
   1207{
   1208	return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0);
   1209}
   1210
   1211/**
   1212 * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
   1213 * @device:	DRBD device.
   1214 * @upper_idx:	0: write all changed pages; +ve: page index to stop scanning for changed pages
   1215 */
   1216int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local)
   1217{
   1218	return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
   1219}
   1220
   1221/**
   1222 * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
   1223 * @device:	DRBD device.
   1224 *
   1225 * Will only write pages that have changed since last IO.
   1226 * In contrast to drbd_bm_write(), this will copy the bitmap pages
   1227 * to temporary writeout pages. It is intended to trigger a full write-out
   1228 * while still allowing the bitmap to change, for example if a resync or online
   1229 * verify is aborted due to a failed peer disk, while local IO continues, or
   1230 * pending resync acks are still being processed.
   1231 */
   1232int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
   1233{
   1234	return bm_rw(device, BM_AIO_COPY_PAGES, 0);
   1235}
   1236
   1237/**
   1238 * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
   1239 * @device:	DRBD device.
   1240 */
   1241int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
   1242{
   1243	return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
   1244}
   1245
   1246/* NOTE
   1247 * find_first_bit returns int, we return unsigned long.
   1248 * For this to work on 32bit arch with bitnumbers > (1<<32),
   1249 * we'd need to return u64, and get a whole lot of other places
   1250 * fixed where we still use unsigned long.
   1251 *
   1252 * this returns a bit number, NOT a sector!
   1253 */
   1254static unsigned long __bm_find_next(struct drbd_device *device, unsigned long bm_fo,
   1255	const int find_zero_bit)
   1256{
   1257	struct drbd_bitmap *b = device->bitmap;
   1258	unsigned long *p_addr;
   1259	unsigned long bit_offset;
   1260	unsigned i;
   1261
   1262
   1263	if (bm_fo > b->bm_bits) {
   1264		drbd_err(device, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
   1265		bm_fo = DRBD_END_OF_BITMAP;
   1266	} else {
   1267		while (bm_fo < b->bm_bits) {
   1268			/* bit offset of the first bit in the page */
   1269			bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
   1270			p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));
   1271
   1272			if (find_zero_bit)
   1273				i = find_next_zero_bit_le(p_addr,
   1274						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
   1275			else
   1276				i = find_next_bit_le(p_addr,
   1277						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
   1278
   1279			__bm_unmap(p_addr);
   1280			if (i < PAGE_SIZE*8) {
   1281				bm_fo = bit_offset + i;
   1282				if (bm_fo >= b->bm_bits)
   1283					break;
   1284				goto found;
   1285			}
   1286			bm_fo = bit_offset + PAGE_SIZE*8;
   1287		}
   1288		bm_fo = DRBD_END_OF_BITMAP;
   1289	}
   1290 found:
   1291	return bm_fo;
   1292}
   1293
   1294static unsigned long bm_find_next(struct drbd_device *device,
   1295	unsigned long bm_fo, const int find_zero_bit)
   1296{
   1297	struct drbd_bitmap *b = device->bitmap;
   1298	unsigned long i = DRBD_END_OF_BITMAP;
   1299
   1300	if (!expect(b))
   1301		return i;
   1302	if (!expect(b->bm_pages))
   1303		return i;
   1304
   1305	spin_lock_irq(&b->bm_lock);
   1306	if (BM_DONT_TEST & b->bm_flags)
   1307		bm_print_lock_info(device);
   1308
   1309	i = __bm_find_next(device, bm_fo, find_zero_bit);
   1310
   1311	spin_unlock_irq(&b->bm_lock);
   1312	return i;
   1313}
   1314
   1315unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
   1316{
   1317	return bm_find_next(device, bm_fo, 0);
   1318}
   1319
   1320#if 0
   1321/* not yet needed for anything. */
   1322unsigned long drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
   1323{
   1324	return bm_find_next(device, bm_fo, 1);
   1325}
   1326#endif
   1327
   1328/* does not spin_lock_irqsave.
   1329 * you must take drbd_bm_lock() first */
   1330unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
   1331{
   1332	/* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
   1333	return __bm_find_next(device, bm_fo, 0);
   1334}
   1335
   1336unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
   1337{
   1338	/* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
   1339	return __bm_find_next(device, bm_fo, 1);
   1340}
   1341
   1342/* returns number of bits actually changed.
   1343 * for val != 0, we change 0 -> 1, return code positive
   1344 * for val == 0, we change 1 -> 0, return code negative
   1345 * wants bitnr, not sector.
   1346 * expected to be called for only a few bits (e - s about BITS_PER_LONG).
   1347 * Must hold bitmap lock already. */
   1348static int __bm_change_bits_to(struct drbd_device *device, const unsigned long s,
   1349	unsigned long e, int val)
   1350{
   1351	struct drbd_bitmap *b = device->bitmap;
   1352	unsigned long *p_addr = NULL;
   1353	unsigned long bitnr;
   1354	unsigned int last_page_nr = -1U;
   1355	int c = 0;
   1356	int changed_total = 0;
   1357
   1358	if (e >= b->bm_bits) {
   1359		drbd_err(device, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
   1360				s, e, b->bm_bits);
   1361		e = b->bm_bits ? b->bm_bits -1 : 0;
   1362	}
   1363	for (bitnr = s; bitnr <= e; bitnr++) {
   1364		unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
   1365		if (page_nr != last_page_nr) {
   1366			if (p_addr)
   1367				__bm_unmap(p_addr);
   1368			if (c < 0)
   1369				bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
   1370			else if (c > 0)
   1371				bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
   1372			changed_total += c;
   1373			c = 0;
   1374			p_addr = __bm_map_pidx(b, page_nr);
   1375			last_page_nr = page_nr;
   1376		}
   1377		if (val)
   1378			c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
   1379		else
   1380			c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
   1381	}
   1382	if (p_addr)
   1383		__bm_unmap(p_addr);
   1384	if (c < 0)
   1385		bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
   1386	else if (c > 0)
   1387		bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
   1388	changed_total += c;
   1389	b->bm_set += changed_total;
   1390	return changed_total;
   1391}
   1392
   1393/* returns number of bits actually changed.
   1394 * for val != 0, we change 0 -> 1, return code positive
   1395 * for val == 0, we change 1 -> 0, return code negative
   1396 * wants bitnr, not sector */
   1397static int bm_change_bits_to(struct drbd_device *device, const unsigned long s,
   1398	const unsigned long e, int val)
   1399{
   1400	unsigned long flags;
   1401	struct drbd_bitmap *b = device->bitmap;
   1402	int c = 0;
   1403
   1404	if (!expect(b))
   1405		return 1;
   1406	if (!expect(b->bm_pages))
   1407		return 0;
   1408
   1409	spin_lock_irqsave(&b->bm_lock, flags);
   1410	if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
   1411		bm_print_lock_info(device);
   1412
   1413	c = __bm_change_bits_to(device, s, e, val);
   1414
   1415	spin_unlock_irqrestore(&b->bm_lock, flags);
   1416	return c;
   1417}
   1418
   1419/* returns number of bits changed 0 -> 1 */
   1420int drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
   1421{
   1422	return bm_change_bits_to(device, s, e, 1);
   1423}
   1424
   1425/* returns number of bits changed 1 -> 0 */
   1426int drbd_bm_clear_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
   1427{
   1428	return -bm_change_bits_to(device, s, e, 0);
   1429}
   1430
   1431/* sets all bits in full words,
   1432 * from first_word up to, but not including, last_word */
   1433static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
   1434		int page_nr, int first_word, int last_word)
   1435{
   1436	int i;
   1437	int bits;
   1438	int changed = 0;
   1439	unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
   1440
   1441	/* I think it is more cache line friendly to hweight_long then set to ~0UL,
   1442	 * than to first bitmap_weight() all words, then bitmap_fill() all words */
   1443	for (i = first_word; i < last_word; i++) {
   1444		bits = hweight_long(paddr[i]);
   1445		paddr[i] = ~0UL;
   1446		changed += BITS_PER_LONG - bits;
   1447	}
   1448	kunmap_atomic(paddr);
   1449	if (changed) {
   1450		/* We only need lazy writeout, the information is still in the
   1451		 * remote bitmap as well, and is reconstructed during the next
   1452		 * bitmap exchange, if lost locally due to a crash. */
   1453		bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
   1454		b->bm_set += changed;
   1455	}
   1456}
   1457
   1458/* Same thing as drbd_bm_set_bits,
   1459 * but more efficient for a large bit range.
   1460 * You must first drbd_bm_lock().
   1461 * Can be called to set the whole bitmap in one go.
   1462 * Sets bits from s to e _inclusive_. */
   1463void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
   1464{
   1465	/* First set_bit from the first bit (s)
   1466	 * up to the next long boundary (sl),
   1467	 * then assign full words up to the last long boundary (el),
   1468	 * then set_bit up to and including the last bit (e).
   1469	 *
   1470	 * Do not use memset, because we must account for changes,
   1471	 * so we need to loop over the words with hweight() anyways.
   1472	 */
   1473	struct drbd_bitmap *b = device->bitmap;
   1474	unsigned long sl = ALIGN(s,BITS_PER_LONG);
   1475	unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
   1476	int first_page;
   1477	int last_page;
   1478	int page_nr;
   1479	int first_word;
   1480	int last_word;
   1481
   1482	if (e - s <= 3*BITS_PER_LONG) {
   1483		/* don't bother; el and sl may even be wrong. */
   1484		spin_lock_irq(&b->bm_lock);
   1485		__bm_change_bits_to(device, s, e, 1);
   1486		spin_unlock_irq(&b->bm_lock);
   1487		return;
   1488	}
   1489
   1490	/* difference is large enough that we can trust sl and el */
   1491
   1492	spin_lock_irq(&b->bm_lock);
   1493
   1494	/* bits filling the current long */
   1495	if (sl)
   1496		__bm_change_bits_to(device, s, sl-1, 1);
   1497
   1498	first_page = sl >> (3 + PAGE_SHIFT);
   1499	last_page = el >> (3 + PAGE_SHIFT);
   1500
   1501	/* MLPP: modulo longs per page */
   1502	/* LWPP: long words per page */
   1503	first_word = MLPP(sl >> LN2_BPL);
   1504	last_word = LWPP;
   1505
   1506	/* first and full pages, unless first page == last page */
   1507	for (page_nr = first_page; page_nr < last_page; page_nr++) {
   1508		bm_set_full_words_within_one_page(device->bitmap, page_nr, first_word, last_word);
   1509		spin_unlock_irq(&b->bm_lock);
   1510		cond_resched();
   1511		first_word = 0;
   1512		spin_lock_irq(&b->bm_lock);
   1513	}
   1514	/* last page (respectively only page, for first page == last page) */
   1515	last_word = MLPP(el >> LN2_BPL);
   1516
   1517	/* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples).
   1518	 * ==> e = 32767, el = 32768, last_page = 2,
   1519	 * and now last_word = 0.
   1520	 * We do not want to touch last_page in this case,
   1521	 * as we did not allocate it, it is not present in bitmap->bm_pages.
   1522	 */
   1523	if (last_word)
   1524		bm_set_full_words_within_one_page(device->bitmap, last_page, first_word, last_word);
   1525
   1526	/* possibly trailing bits.
   1527	 * example: (e & 63) == 63, el will be e+1.
   1528	 * if that even was the very last bit,
   1529	 * it would trigger an assert in __bm_change_bits_to()
   1530	 */
   1531	if (el <= e)
   1532		__bm_change_bits_to(device, el, e, 1);
   1533	spin_unlock_irq(&b->bm_lock);
   1534}
   1535
   1536/* returns bit state
   1537 * wants bitnr, NOT sector.
   1538 * inherently racy... area needs to be locked by means of {al,rs}_lru
   1539 *  1 ... bit set
   1540 *  0 ... bit not set
   1541 * -1 ... first out of bounds access, stop testing for bits!
   1542 */
   1543int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr)
   1544{
   1545	unsigned long flags;
   1546	struct drbd_bitmap *b = device->bitmap;
   1547	unsigned long *p_addr;
   1548	int i;
   1549
   1550	if (!expect(b))
   1551		return 0;
   1552	if (!expect(b->bm_pages))
   1553		return 0;
   1554
   1555	spin_lock_irqsave(&b->bm_lock, flags);
   1556	if (BM_DONT_TEST & b->bm_flags)
   1557		bm_print_lock_info(device);
   1558	if (bitnr < b->bm_bits) {
   1559		p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
   1560		i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
   1561		bm_unmap(p_addr);
   1562	} else if (bitnr == b->bm_bits) {
   1563		i = -1;
   1564	} else { /* (bitnr > b->bm_bits) */
   1565		drbd_err(device, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
   1566		i = 0;
   1567	}
   1568
   1569	spin_unlock_irqrestore(&b->bm_lock, flags);
   1570	return i;
   1571}
   1572
   1573/* returns number of bits set in the range [s, e] */
   1574int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
   1575{
   1576	unsigned long flags;
   1577	struct drbd_bitmap *b = device->bitmap;
   1578	unsigned long *p_addr = NULL;
   1579	unsigned long bitnr;
   1580	unsigned int page_nr = -1U;
   1581	int c = 0;
   1582
   1583	/* If this is called without a bitmap, that is a bug.  But just to be
   1584	 * robust in case we screwed up elsewhere, in that case pretend there
   1585	 * was one dirty bit in the requested area, so we won't try to do a
   1586	 * local read there (no bitmap probably implies no disk) */
   1587	if (!expect(b))
   1588		return 1;
   1589	if (!expect(b->bm_pages))
   1590		return 1;
   1591
   1592	spin_lock_irqsave(&b->bm_lock, flags);
   1593	if (BM_DONT_TEST & b->bm_flags)
   1594		bm_print_lock_info(device);
   1595	for (bitnr = s; bitnr <= e; bitnr++) {
   1596		unsigned int idx = bm_bit_to_page_idx(b, bitnr);
   1597		if (page_nr != idx) {
   1598			page_nr = idx;
   1599			if (p_addr)
   1600				bm_unmap(p_addr);
   1601			p_addr = bm_map_pidx(b, idx);
   1602		}
   1603		if (expect(bitnr < b->bm_bits))
   1604			c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
   1605		else
   1606			drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
   1607	}
   1608	if (p_addr)
   1609		bm_unmap(p_addr);
   1610	spin_unlock_irqrestore(&b->bm_lock, flags);
   1611	return c;
   1612}
   1613
   1614
   1615/* inherently racy...
   1616 * return value may be already out-of-date when this function returns.
   1617 * but the general usage is that this is only use during a cstate when bits are
   1618 * only cleared, not set, and typically only care for the case when the return
   1619 * value is zero, or we already "locked" this "bitmap extent" by other means.
   1620 *
   1621 * enr is bm-extent number, since we chose to name one sector (512 bytes)
   1622 * worth of the bitmap a "bitmap extent".
   1623 *
   1624 * TODO
   1625 * I think since we use it like a reference count, we should use the real
   1626 * reference count of some bitmap extent element from some lru instead...
   1627 *
   1628 */
   1629int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
   1630{
   1631	struct drbd_bitmap *b = device->bitmap;
   1632	int count, s, e;
   1633	unsigned long flags;
   1634	unsigned long *p_addr, *bm;
   1635
   1636	if (!expect(b))
   1637		return 0;
   1638	if (!expect(b->bm_pages))
   1639		return 0;
   1640
   1641	spin_lock_irqsave(&b->bm_lock, flags);
   1642	if (BM_DONT_TEST & b->bm_flags)
   1643		bm_print_lock_info(device);
   1644
   1645	s = S2W(enr);
   1646	e = min((size_t)S2W(enr+1), b->bm_words);
   1647	count = 0;
   1648	if (s < b->bm_words) {
   1649		int n = e-s;
   1650		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
   1651		bm = p_addr + MLPP(s);
   1652		count += bitmap_weight(bm, n * BITS_PER_LONG);
   1653		bm_unmap(p_addr);
   1654	} else {
   1655		drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s);
   1656	}
   1657	spin_unlock_irqrestore(&b->bm_lock, flags);
   1658	return count;
   1659}