cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

dm-integrity.c (137917B)


      1/*
      2 * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved.
      3 * Copyright (C) 2016-2017 Milan Broz
      4 * Copyright (C) 2016-2017 Mikulas Patocka
      5 *
      6 * This file is released under the GPL.
      7 */
      8
      9#include "dm-bio-record.h"
     10
     11#include <linux/compiler.h>
     12#include <linux/module.h>
     13#include <linux/device-mapper.h>
     14#include <linux/dm-io.h>
     15#include <linux/vmalloc.h>
     16#include <linux/sort.h>
     17#include <linux/rbtree.h>
     18#include <linux/delay.h>
     19#include <linux/random.h>
     20#include <linux/reboot.h>
     21#include <crypto/hash.h>
     22#include <crypto/skcipher.h>
     23#include <linux/async_tx.h>
     24#include <linux/dm-bufio.h>
     25
     26#include "dm-audit.h"
     27
     28#define DM_MSG_PREFIX "integrity"
     29
     30#define DEFAULT_INTERLEAVE_SECTORS	32768
     31#define DEFAULT_JOURNAL_SIZE_FACTOR	7
     32#define DEFAULT_SECTORS_PER_BITMAP_BIT	32768
     33#define DEFAULT_BUFFER_SECTORS		128
     34#define DEFAULT_JOURNAL_WATERMARK	50
     35#define DEFAULT_SYNC_MSEC		10000
     36#define DEFAULT_MAX_JOURNAL_SECTORS	131072
     37#define MIN_LOG2_INTERLEAVE_SECTORS	3
     38#define MAX_LOG2_INTERLEAVE_SECTORS	31
     39#define METADATA_WORKQUEUE_MAX_ACTIVE	16
     40#define RECALC_SECTORS			32768
     41#define RECALC_WRITE_SUPER		16
     42#define BITMAP_BLOCK_SIZE		4096	/* don't change it */
     43#define BITMAP_FLUSH_INTERVAL		(10 * HZ)
     44#define DISCARD_FILLER			0xf6
     45#define SALT_SIZE			16
     46
     47/*
     48 * Warning - DEBUG_PRINT prints security-sensitive data to the log,
     49 * so it should not be enabled in the official kernel
     50 */
     51//#define DEBUG_PRINT
     52//#define INTERNAL_VERIFY
     53
     54/*
     55 * On disk structures
     56 */
     57
     58#define SB_MAGIC			"integrt"
     59#define SB_VERSION_1			1
     60#define SB_VERSION_2			2
     61#define SB_VERSION_3			3
     62#define SB_VERSION_4			4
     63#define SB_VERSION_5			5
     64#define SB_SECTORS			8
     65#define MAX_SECTORS_PER_BLOCK		8
     66
     67struct superblock {
     68	__u8 magic[8];
     69	__u8 version;
     70	__u8 log2_interleave_sectors;
     71	__le16 integrity_tag_size;
     72	__le32 journal_sections;
     73	__le64 provided_data_sectors;	/* userspace uses this value */
     74	__le32 flags;
     75	__u8 log2_sectors_per_block;
     76	__u8 log2_blocks_per_bitmap_bit;
     77	__u8 pad[2];
     78	__le64 recalc_sector;
     79	__u8 pad2[8];
     80	__u8 salt[SALT_SIZE];
     81};
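
        /*
         * A rough size check of the structure above (assuming no compiler
         * padding): 8+1+1+2+4+8+4+1+1+2+8+8+16 = 64 bytes.  The superblock
         * occupies the first of the SB_SECTORS reserved sectors; the rest of
         * that 512-byte sector stays zeroed, and with SB_FLAG_FIXED_HMAC the
         * trailing bytes of the sector hold the superblock mac (see sb_mac()
         * below).
         */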
     82
     83#define SB_FLAG_HAVE_JOURNAL_MAC	0x1
     84#define SB_FLAG_RECALCULATING		0x2
     85#define SB_FLAG_DIRTY_BITMAP		0x4
     86#define SB_FLAG_FIXED_PADDING		0x8
     87#define SB_FLAG_FIXED_HMAC		0x10
     88
     89#define	JOURNAL_ENTRY_ROUNDUP		8
     90
     91typedef __le64 commit_id_t;
     92#define JOURNAL_MAC_PER_SECTOR		8
     93
     94struct journal_entry {
     95	union {
     96		struct {
     97			__le32 sector_lo;
     98			__le32 sector_hi;
     99		} s;
    100		__le64 sector;
    101	} u;
    102	commit_id_t last_bytes[];
    103	/* __u8 tag[0]; */
    104};
    105
    106#define journal_entry_tag(ic, je)		((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block])
    107
    108#if BITS_PER_LONG == 64
    109#define journal_entry_set_sector(je, x)		do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0)
    110#else
    111#define journal_entry_set_sector(je, x)		do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0)
    112#endif
    113#define journal_entry_get_sector(je)		le64_to_cpu((je)->u.sector)
    114#define journal_entry_is_unused(je)		((je)->u.s.sector_hi == cpu_to_le32(-1))
    115#define journal_entry_set_unused(je)		do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0)
    116#define journal_entry_is_inprogress(je)		((je)->u.s.sector_hi == cpu_to_le32(-2))
    117#define journal_entry_set_inprogress(je)	do { ((je)->u.s.sector_hi = cpu_to_le32(-2)); } while (0)
    118
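        /*
         * Reading of the macros above: the high 32 bits of the sector number
         * double as a state marker - cpu_to_le32(-1) marks an unused entry and
         * cpu_to_le32(-2) an entry that is being filled in; real sector
         * numbers never reach those values.  journal_entry_set_sector()
         * issues smp_wmb() before publishing the sector so that the entry
         * contents written earlier are visible before the entry stops looking
         * unused.
         */
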
    119#define JOURNAL_BLOCK_SECTORS		8
    120#define JOURNAL_SECTOR_DATA		((1 << SECTOR_SHIFT) - sizeof(commit_id_t))
    121#define JOURNAL_MAC_SIZE		(JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS)
    122
    123struct journal_sector {
    124	struct_group(sectors,
    125		__u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR];
    126		__u8 mac[JOURNAL_MAC_PER_SECTOR];
    127	);
    128	commit_id_t commit_id;
    129};
    130
    131#define MAX_TAG_SIZE			(JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK]))
    132
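        /*
         * Worked out with the constants above: JOURNAL_SECTOR_DATA is
         * 512 - 8 = 504 bytes, so MAX_TAG_SIZE = 504 - 8 (per-sector mac)
         * - offsetof(struct journal_entry, last_bytes[8]) = 496 - 72 = 424
         * bytes - the largest tag size for which a journal entry with the
         * maximum sectors per block still fits into one sector's entry area.
         */
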
    133#define METADATA_PADDING_SECTORS	8
    134
    135#define N_COMMIT_IDS			4
    136
    137static unsigned char prev_commit_seq(unsigned char seq)
    138{
    139	return (seq + N_COMMIT_IDS - 1) % N_COMMIT_IDS;
    140}
    141
    142static unsigned char next_commit_seq(unsigned char seq)
    143{
    144	return (seq + 1) % N_COMMIT_IDS;
    145}
    146
    147/*
    148 * In-memory structures
    149 */
    150
    151struct journal_node {
    152	struct rb_node node;
    153	sector_t sector;
    154};
    155
    156struct alg_spec {
    157	char *alg_string;
    158	char *key_string;
    159	__u8 *key;
    160	unsigned key_size;
    161};
    162
    163struct dm_integrity_c {
    164	struct dm_dev *dev;
    165	struct dm_dev *meta_dev;
    166	unsigned tag_size;
    167	__s8 log2_tag_size;
    168	sector_t start;
    169	mempool_t journal_io_mempool;
    170	struct dm_io_client *io;
    171	struct dm_bufio_client *bufio;
    172	struct workqueue_struct *metadata_wq;
    173	struct superblock *sb;
    174	unsigned journal_pages;
    175	unsigned n_bitmap_blocks;
    176
    177	struct page_list *journal;
    178	struct page_list *journal_io;
    179	struct page_list *journal_xor;
    180	struct page_list *recalc_bitmap;
    181	struct page_list *may_write_bitmap;
    182	struct bitmap_block_status *bbs;
    183	unsigned bitmap_flush_interval;
    184	int synchronous_mode;
    185	struct bio_list synchronous_bios;
    186	struct delayed_work bitmap_flush_work;
    187
    188	struct crypto_skcipher *journal_crypt;
    189	struct scatterlist **journal_scatterlist;
    190	struct scatterlist **journal_io_scatterlist;
    191	struct skcipher_request **sk_requests;
    192
    193	struct crypto_shash *journal_mac;
    194
    195	struct journal_node *journal_tree;
    196	struct rb_root journal_tree_root;
    197
    198	sector_t provided_data_sectors;
    199
    200	unsigned short journal_entry_size;
    201	unsigned char journal_entries_per_sector;
    202	unsigned char journal_section_entries;
    203	unsigned short journal_section_sectors;
    204	unsigned journal_sections;
    205	unsigned journal_entries;
    206	sector_t data_device_sectors;
    207	sector_t meta_device_sectors;
    208	unsigned initial_sectors;
    209	unsigned metadata_run;
    210	__s8 log2_metadata_run;
    211	__u8 log2_buffer_sectors;
    212	__u8 sectors_per_block;
    213	__u8 log2_blocks_per_bitmap_bit;
    214
    215	unsigned char mode;
    216
    217	int failed;
    218
    219	struct crypto_shash *internal_hash;
    220
    221	struct dm_target *ti;
    222
    223	/* these variables are locked with endio_wait.lock */
    224	struct rb_root in_progress;
    225	struct list_head wait_list;
    226	wait_queue_head_t endio_wait;
    227	struct workqueue_struct *wait_wq;
    228	struct workqueue_struct *offload_wq;
    229
    230	unsigned char commit_seq;
    231	commit_id_t commit_ids[N_COMMIT_IDS];
    232
    233	unsigned committed_section;
    234	unsigned n_committed_sections;
    235
    236	unsigned uncommitted_section;
    237	unsigned n_uncommitted_sections;
    238
    239	unsigned free_section;
    240	unsigned char free_section_entry;
    241	unsigned free_sectors;
    242
    243	unsigned free_sectors_threshold;
    244
    245	struct workqueue_struct *commit_wq;
    246	struct work_struct commit_work;
    247
    248	struct workqueue_struct *writer_wq;
    249	struct work_struct writer_work;
    250
    251	struct workqueue_struct *recalc_wq;
    252	struct work_struct recalc_work;
    253	u8 *recalc_buffer;
    254	u8 *recalc_tags;
    255
    256	struct bio_list flush_bio_list;
    257
    258	unsigned long autocommit_jiffies;
    259	struct timer_list autocommit_timer;
    260	unsigned autocommit_msec;
    261
    262	wait_queue_head_t copy_to_journal_wait;
    263
    264	struct completion crypto_backoff;
    265
    266	bool journal_uptodate;
    267	bool just_formatted;
    268	bool recalculate_flag;
    269	bool reset_recalculate_flag;
    270	bool discard;
    271	bool fix_padding;
    272	bool fix_hmac;
    273	bool legacy_recalculate;
    274
    275	struct alg_spec internal_hash_alg;
    276	struct alg_spec journal_crypt_alg;
    277	struct alg_spec journal_mac_alg;
    278
    279	atomic64_t number_of_mismatches;
    280
    281	struct notifier_block reboot_notifier;
    282};
    283
    284struct dm_integrity_range {
    285	sector_t logical_sector;
    286	sector_t n_sectors;
    287	bool waiting;
    288	union {
    289		struct rb_node node;
    290		struct {
    291			struct task_struct *task;
    292			struct list_head wait_entry;
    293		};
    294	};
    295};
    296
    297struct dm_integrity_io {
    298	struct work_struct work;
    299
    300	struct dm_integrity_c *ic;
    301	enum req_opf op;
    302	bool fua;
    303
    304	struct dm_integrity_range range;
    305
    306	sector_t metadata_block;
    307	unsigned metadata_offset;
    308
    309	atomic_t in_flight;
    310	blk_status_t bi_status;
    311
    312	struct completion *completion;
    313
    314	struct dm_bio_details bio_details;
    315};
    316
    317struct journal_completion {
    318	struct dm_integrity_c *ic;
    319	atomic_t in_flight;
    320	struct completion comp;
    321};
    322
    323struct journal_io {
    324	struct dm_integrity_range range;
    325	struct journal_completion *comp;
    326};
    327
    328struct bitmap_block_status {
    329	struct work_struct work;
    330	struct dm_integrity_c *ic;
    331	unsigned idx;
    332	unsigned long *bitmap;
    333	struct bio_list bio_queue;
    334	spinlock_t bio_queue_lock;
    335
    336};
    337
    338static struct kmem_cache *journal_io_cache;
    339
    340#define JOURNAL_IO_MEMPOOL	32
    341
    342#ifdef DEBUG_PRINT
    343#define DEBUG_print(x, ...)	printk(KERN_DEBUG x, ##__VA_ARGS__)
    344static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...)
    345{
    346	va_list args;
    347	va_start(args, msg);
    348	vprintk(msg, args);
    349	va_end(args);
    350	if (len)
    351		pr_cont(":");
    352	while (len) {
    353		pr_cont(" %02x", *bytes);
    354		bytes++;
    355		len--;
    356	}
    357	pr_cont("\n");
    358}
    359#define DEBUG_bytes(bytes, len, msg, ...)	__DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__)
    360#else
    361#define DEBUG_print(x, ...)			do { } while (0)
    362#define DEBUG_bytes(bytes, len, msg, ...)	do { } while (0)
    363#endif
    364
    365static void dm_integrity_prepare(struct request *rq)
    366{
    367}
    368
    369static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes)
    370{
    371}
    372
    373/*
     374 * DM Integrity profile; protection is performed by the layer above (dm-crypt)
    375 */
    376static const struct blk_integrity_profile dm_integrity_profile = {
    377	.name			= "DM-DIF-EXT-TAG",
    378	.generate_fn		= NULL,
    379	.verify_fn		= NULL,
    380	.prepare_fn		= dm_integrity_prepare,
    381	.complete_fn		= dm_integrity_complete,
    382};
    383
    384static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map);
    385static void integrity_bio_wait(struct work_struct *w);
    386static void dm_integrity_dtr(struct dm_target *ti);
    387
    388static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err)
    389{
    390	if (err == -EILSEQ)
    391		atomic64_inc(&ic->number_of_mismatches);
    392	if (!cmpxchg(&ic->failed, 0, err))
    393		DMERR("Error on %s: %d", msg, err);
    394}
    395
    396static int dm_integrity_failed(struct dm_integrity_c *ic)
    397{
    398	return READ_ONCE(ic->failed);
    399}
    400
    401static bool dm_integrity_disable_recalculate(struct dm_integrity_c *ic)
    402{
    403	if (ic->legacy_recalculate)
    404		return false;
    405	if (!(ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) ?
    406	    ic->internal_hash_alg.key || ic->journal_mac_alg.key :
    407	    ic->internal_hash_alg.key && !ic->journal_mac_alg.key)
    408		return true;
    409	return false;
    410}
    411
    412static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i,
    413					  unsigned j, unsigned char seq)
    414{
    415	/*
     416	 * Xor the commit id with the section and sector numbers, so that if a
     417	 * piece of the journal is written to the wrong place, it is detected.
    418	 */
    419	return ic->commit_ids[seq] ^ cpu_to_le64(((__u64)i << 32) ^ j);
    420}
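
        /*
         * For illustration (values made up): with commit_ids[seq] ==
         * 0x1122334455667788 and (section, sector) == (3, 5), the stored id is
         * 0x1122334455667788 ^ ((3ULL << 32) ^ 5).  Replaying that journal
         * sector at, say, (2, 5) recomputes a different id, so the misplaced
         * write is detected when the ids are compared.
         */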
    421
    422static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector,
    423				sector_t *area, sector_t *offset)
    424{
    425	if (!ic->meta_dev) {
    426		__u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors;
    427		*area = data_sector >> log2_interleave_sectors;
    428		*offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1);
    429	} else {
    430		*area = 0;
    431		*offset = data_sector;
    432	}
    433}
    434
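        /*
         * For illustration, with no meta_dev and the default interleave of
         * 32768 sectors (log2_interleave_sectors == 15): data_sector 100000
         * splits into area 3 and offset 1696, since 100000 = 3 * 32768 + 1696.
         * With a separate metadata device everything stays in area 0.
         */
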
    435#define sector_to_block(ic, n)						\
    436do {									\
    437	BUG_ON((n) & (unsigned)((ic)->sectors_per_block - 1));		\
    438	(n) >>= (ic)->sb->log2_sectors_per_block;			\
    439} while (0)
    440
    441static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area,
    442					    sector_t offset, unsigned *metadata_offset)
    443{
    444	__u64 ms;
    445	unsigned mo;
    446
    447	ms = area << ic->sb->log2_interleave_sectors;
    448	if (likely(ic->log2_metadata_run >= 0))
    449		ms += area << ic->log2_metadata_run;
    450	else
    451		ms += area * ic->metadata_run;
    452	ms >>= ic->log2_buffer_sectors;
    453
    454	sector_to_block(ic, offset);
    455
    456	if (likely(ic->log2_tag_size >= 0)) {
    457		ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size);
    458		mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
    459	} else {
    460		ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors);
    461		mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
    462	}
    463	*metadata_offset = mo;
    464	return ms;
    465}
    466
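        /*
         * A rough reading of the arithmetic above: ms starts as the first
         * buffer of this area's metadata run (each data area is preceded by
         * its own metadata run).  For a power-of-two tag size the tag of data
         * block 'offset' lives offset * tag_size bytes into that run, so ms
         * advances by that byte offset divided by the buffer size
         * (1 << (SECTOR_SHIFT + log2_buffer_sectors)) and mo is the remainder
         * within the buffer; the non-power-of-two branch does the same with a
         * multiplication.
         */
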
    467static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset)
    468{
    469	sector_t result;
    470
    471	if (ic->meta_dev)
    472		return offset;
    473
    474	result = area << ic->sb->log2_interleave_sectors;
    475	if (likely(ic->log2_metadata_run >= 0))
    476		result += (area + 1) << ic->log2_metadata_run;
    477	else
    478		result += (area + 1) * ic->metadata_run;
    479
    480	result += (sector_t)ic->initial_sectors + offset;
    481	result += ic->start;
    482
    483	return result;
    484}
    485
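        /*
         * Layout implied by the code above when data and metadata share one
         * device: ic->start, then initial_sectors (superblock + journal),
         * then for each area its metadata run followed by
         * 2^log2_interleave_sectors data sectors - hence the (area + 1)
         * metadata runs in front of the data of area 'area'.  With a separate
         * metadata device the data device is used as-is and 'offset' is
         * returned unchanged.
         */
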
    486static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
    487{
    488	if (unlikely(*sec_ptr >= ic->journal_sections))
    489		*sec_ptr -= ic->journal_sections;
    490}
    491
    492static void sb_set_version(struct dm_integrity_c *ic)
    493{
    494	if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC))
    495		ic->sb->version = SB_VERSION_5;
    496	else if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING))
    497		ic->sb->version = SB_VERSION_4;
    498	else if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
    499		ic->sb->version = SB_VERSION_3;
    500	else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
    501		ic->sb->version = SB_VERSION_2;
    502	else
    503		ic->sb->version = SB_VERSION_1;
    504}
    505
    506static int sb_mac(struct dm_integrity_c *ic, bool wr)
    507{
    508	SHASH_DESC_ON_STACK(desc, ic->journal_mac);
    509	int r;
    510	unsigned size = crypto_shash_digestsize(ic->journal_mac);
    511
    512	if (sizeof(struct superblock) + size > 1 << SECTOR_SHIFT) {
    513		dm_integrity_io_error(ic, "digest is too long", -EINVAL);
    514		return -EINVAL;
    515	}
    516
    517	desc->tfm = ic->journal_mac;
    518
    519	r = crypto_shash_init(desc);
    520	if (unlikely(r < 0)) {
    521		dm_integrity_io_error(ic, "crypto_shash_init", r);
    522		return r;
    523	}
    524
    525	r = crypto_shash_update(desc, (__u8 *)ic->sb, (1 << SECTOR_SHIFT) - size);
    526	if (unlikely(r < 0)) {
    527		dm_integrity_io_error(ic, "crypto_shash_update", r);
    528		return r;
    529	}
    530
    531	if (likely(wr)) {
    532		r = crypto_shash_final(desc, (__u8 *)ic->sb + (1 << SECTOR_SHIFT) - size);
    533		if (unlikely(r < 0)) {
    534			dm_integrity_io_error(ic, "crypto_shash_final", r);
    535			return r;
    536		}
    537	} else {
    538		__u8 result[HASH_MAX_DIGESTSIZE];
    539		r = crypto_shash_final(desc, result);
    540		if (unlikely(r < 0)) {
    541			dm_integrity_io_error(ic, "crypto_shash_final", r);
    542			return r;
    543		}
    544		if (memcmp((__u8 *)ic->sb + (1 << SECTOR_SHIFT) - size, result, size)) {
    545			dm_integrity_io_error(ic, "superblock mac", -EILSEQ);
    546			dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0);
    547			return -EILSEQ;
    548		}
    549	}
    550
    551	return 0;
    552}
    553
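        /*
         * Reading of sb_mac() above: the mac covers the first
         * (512 - digest_size) bytes of the superblock sector and is stored in
         * (or, on read, compared against) the last digest_size bytes.
         * Assuming, say, hmac(sha256) as journal_mac, that is bytes 480..511.
         */
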
    554static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
    555{
    556	struct dm_io_request io_req;
    557	struct dm_io_region io_loc;
    558	int r;
    559
    560	io_req.bi_op = op;
    561	io_req.bi_op_flags = op_flags;
    562	io_req.mem.type = DM_IO_KMEM;
    563	io_req.mem.ptr.addr = ic->sb;
    564	io_req.notify.fn = NULL;
    565	io_req.client = ic->io;
    566	io_loc.bdev = ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev;
    567	io_loc.sector = ic->start;
    568	io_loc.count = SB_SECTORS;
    569
    570	if (op == REQ_OP_WRITE) {
    571		sb_set_version(ic);
    572		if (ic->journal_mac && ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
    573			r = sb_mac(ic, true);
    574			if (unlikely(r))
    575				return r;
    576		}
    577	}
    578
    579	r = dm_io(&io_req, 1, &io_loc, NULL);
    580	if (unlikely(r))
    581		return r;
    582
    583	if (op == REQ_OP_READ) {
    584		if (ic->mode != 'R' && ic->journal_mac && ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
    585			r = sb_mac(ic, false);
    586			if (unlikely(r))
    587				return r;
    588		}
    589	}
    590
    591	return 0;
    592}
    593
    594#define BITMAP_OP_TEST_ALL_SET		0
    595#define BITMAP_OP_TEST_ALL_CLEAR	1
    596#define BITMAP_OP_SET			2
    597#define BITMAP_OP_CLEAR			3
    598
    599static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap,
    600			    sector_t sector, sector_t n_sectors, int mode)
    601{
    602	unsigned long bit, end_bit, this_end_bit, page, end_page;
    603	unsigned long *data;
    604
    605	if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) {
    606		DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)",
    607			sector,
    608			n_sectors,
    609			ic->sb->log2_sectors_per_block,
    610			ic->log2_blocks_per_bitmap_bit,
    611			mode);
    612		BUG();
    613	}
    614
    615	if (unlikely(!n_sectors))
    616		return true;
    617
    618	bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
    619	end_bit = (sector + n_sectors - 1) >>
    620		(ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
    621
    622	page = bit / (PAGE_SIZE * 8);
    623	bit %= PAGE_SIZE * 8;
    624
    625	end_page = end_bit / (PAGE_SIZE * 8);
    626	end_bit %= PAGE_SIZE * 8;
    627
    628repeat:
    629	if (page < end_page) {
    630		this_end_bit = PAGE_SIZE * 8 - 1;
    631	} else {
    632		this_end_bit = end_bit;
    633	}
    634
    635	data = lowmem_page_address(bitmap[page].page);
    636
    637	if (mode == BITMAP_OP_TEST_ALL_SET) {
    638		while (bit <= this_end_bit) {
    639			if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
    640				do {
    641					if (data[bit / BITS_PER_LONG] != -1)
    642						return false;
    643					bit += BITS_PER_LONG;
    644				} while (this_end_bit >= bit + BITS_PER_LONG - 1);
    645				continue;
    646			}
    647			if (!test_bit(bit, data))
    648				return false;
    649			bit++;
    650		}
    651	} else if (mode == BITMAP_OP_TEST_ALL_CLEAR) {
    652		while (bit <= this_end_bit) {
    653			if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
    654				do {
    655					if (data[bit / BITS_PER_LONG] != 0)
    656						return false;
    657					bit += BITS_PER_LONG;
    658				} while (this_end_bit >= bit + BITS_PER_LONG - 1);
    659				continue;
    660			}
    661			if (test_bit(bit, data))
    662				return false;
    663			bit++;
    664		}
    665	} else if (mode == BITMAP_OP_SET) {
    666		while (bit <= this_end_bit) {
    667			if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
    668				do {
    669					data[bit / BITS_PER_LONG] = -1;
    670					bit += BITS_PER_LONG;
    671				} while (this_end_bit >= bit + BITS_PER_LONG - 1);
    672				continue;
    673			}
    674			__set_bit(bit, data);
    675			bit++;
    676		}
    677	} else if (mode == BITMAP_OP_CLEAR) {
    678		if (!bit && this_end_bit == PAGE_SIZE * 8 - 1)
    679			clear_page(data);
    680		else while (bit <= this_end_bit) {
    681			if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
    682				do {
    683					data[bit / BITS_PER_LONG] = 0;
    684					bit += BITS_PER_LONG;
    685				} while (this_end_bit >= bit + BITS_PER_LONG - 1);
    686				continue;
    687			}
    688			__clear_bit(bit, data);
    689			bit++;
    690		}
    691	} else {
    692		BUG();
    693	}
    694
    695	if (unlikely(page < end_page)) {
    696		bit = 0;
    697		page++;
    698		goto repeat;
    699	}
    700
    701	return true;
    702}
    703
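        /*
         * Granularity example with the defaults: one bitmap bit covers
         * 2^(log2_sectors_per_block + log2_blocks_per_bitmap_bit) sectors,
         * i.e. DEFAULT_SECTORS_PER_BITMAP_BIT == 32768 sectors == 16 MiB of
         * data, so a single 4096-byte bitmap block (BITMAP_BLOCK_SIZE,
         * 32768 bits) tracks about 512 GiB.
         */
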
    704static void block_bitmap_copy(struct dm_integrity_c *ic, struct page_list *dst, struct page_list *src)
    705{
    706	unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
    707	unsigned i;
    708
    709	for (i = 0; i < n_bitmap_pages; i++) {
    710		unsigned long *dst_data = lowmem_page_address(dst[i].page);
    711		unsigned long *src_data = lowmem_page_address(src[i].page);
    712		copy_page(dst_data, src_data);
    713	}
    714}
    715
    716static struct bitmap_block_status *sector_to_bitmap_block(struct dm_integrity_c *ic, sector_t sector)
    717{
    718	unsigned bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
    719	unsigned bitmap_block = bit / (BITMAP_BLOCK_SIZE * 8);
    720
    721	BUG_ON(bitmap_block >= ic->n_bitmap_blocks);
    722	return &ic->bbs[bitmap_block];
    723}
    724
    725static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset,
    726				 bool e, const char *function)
    727{
    728#if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY)
    729	unsigned limit = e ? ic->journal_section_entries : ic->journal_section_sectors;
    730
    731	if (unlikely(section >= ic->journal_sections) ||
    732	    unlikely(offset >= limit)) {
    733		DMCRIT("%s: invalid access at (%u,%u), limit (%u,%u)",
    734		       function, section, offset, ic->journal_sections, limit);
    735		BUG();
    736	}
    737#endif
    738}
    739
    740static void page_list_location(struct dm_integrity_c *ic, unsigned section, unsigned offset,
    741			       unsigned *pl_index, unsigned *pl_offset)
    742{
    743	unsigned sector;
    744
    745	access_journal_check(ic, section, offset, false, "page_list_location");
    746
    747	sector = section * ic->journal_section_sectors + offset;
    748
    749	*pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
    750	*pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
    751}
    752
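        /*
         * For illustration, assuming 4 KiB pages: eight 512-byte journal
         * sectors fit per page, so journal sector 21 maps to pl_index 2 and
         * pl_offset 2560 (= (21 % 8) * 512).
         */
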
    753static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl,
    754					       unsigned section, unsigned offset, unsigned *n_sectors)
    755{
    756	unsigned pl_index, pl_offset;
    757	char *va;
    758
    759	page_list_location(ic, section, offset, &pl_index, &pl_offset);
    760
    761	if (n_sectors)
    762		*n_sectors = (PAGE_SIZE - pl_offset) >> SECTOR_SHIFT;
    763
    764	va = lowmem_page_address(pl[pl_index].page);
    765
    766	return (struct journal_sector *)(va + pl_offset);
    767}
    768
    769static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset)
    770{
    771	return access_page_list(ic, ic->journal, section, offset, NULL);
    772}
    773
    774static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned section, unsigned n)
    775{
    776	unsigned rel_sector, offset;
    777	struct journal_sector *js;
    778
    779	access_journal_check(ic, section, n, true, "access_journal_entry");
    780
    781	rel_sector = n % JOURNAL_BLOCK_SECTORS;
    782	offset = n / JOURNAL_BLOCK_SECTORS;
    783
    784	js = access_journal(ic, section, rel_sector);
    785	return (struct journal_entry *)((char *)js + offset * ic->journal_entry_size);
    786}
    787
    788static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned section, unsigned n)
    789{
    790	n <<= ic->sb->log2_sectors_per_block;
    791
    792	n += JOURNAL_BLOCK_SECTORS;
    793
    794	access_journal_check(ic, section, n, false, "access_journal_data");
    795
    796	return access_journal(ic, section, n);
    797}
    798
    799static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result[JOURNAL_MAC_SIZE])
    800{
    801	SHASH_DESC_ON_STACK(desc, ic->journal_mac);
    802	int r;
    803	unsigned j, size;
    804
    805	desc->tfm = ic->journal_mac;
    806
    807	r = crypto_shash_init(desc);
    808	if (unlikely(r < 0)) {
    809		dm_integrity_io_error(ic, "crypto_shash_init", r);
    810		goto err;
    811	}
    812
    813	if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
    814		__le64 section_le;
    815
    816		r = crypto_shash_update(desc, (__u8 *)&ic->sb->salt, SALT_SIZE);
    817		if (unlikely(r < 0)) {
    818			dm_integrity_io_error(ic, "crypto_shash_update", r);
    819			goto err;
    820		}
    821
    822		section_le = cpu_to_le64(section);
    823		r = crypto_shash_update(desc, (__u8 *)&section_le, sizeof section_le);
    824		if (unlikely(r < 0)) {
    825			dm_integrity_io_error(ic, "crypto_shash_update", r);
    826			goto err;
    827		}
    828	}
    829
    830	for (j = 0; j < ic->journal_section_entries; j++) {
    831		struct journal_entry *je = access_journal_entry(ic, section, j);
    832		r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector);
    833		if (unlikely(r < 0)) {
    834			dm_integrity_io_error(ic, "crypto_shash_update", r);
    835			goto err;
    836		}
    837	}
    838
    839	size = crypto_shash_digestsize(ic->journal_mac);
    840
    841	if (likely(size <= JOURNAL_MAC_SIZE)) {
    842		r = crypto_shash_final(desc, result);
    843		if (unlikely(r < 0)) {
    844			dm_integrity_io_error(ic, "crypto_shash_final", r);
    845			goto err;
    846		}
    847		memset(result + size, 0, JOURNAL_MAC_SIZE - size);
    848	} else {
    849		__u8 digest[HASH_MAX_DIGESTSIZE];
    850
    851		if (WARN_ON(size > sizeof(digest))) {
    852			dm_integrity_io_error(ic, "digest_size", -EINVAL);
    853			goto err;
    854		}
    855		r = crypto_shash_final(desc, digest);
    856		if (unlikely(r < 0)) {
    857			dm_integrity_io_error(ic, "crypto_shash_final", r);
    858			goto err;
    859		}
    860		memcpy(result, digest, JOURNAL_MAC_SIZE);
    861	}
    862
    863	return;
    864err:
    865	memset(result, 0, JOURNAL_MAC_SIZE);
    866}
    867
    868static void rw_section_mac(struct dm_integrity_c *ic, unsigned section, bool wr)
    869{
    870	__u8 result[JOURNAL_MAC_SIZE];
    871	unsigned j;
    872
    873	if (!ic->journal_mac)
    874		return;
    875
    876	section_mac(ic, section, result);
    877
    878	for (j = 0; j < JOURNAL_BLOCK_SECTORS; j++) {
    879		struct journal_sector *js = access_journal(ic, section, j);
    880
    881		if (likely(wr))
    882			memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR);
    883		else {
    884			if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) {
    885				dm_integrity_io_error(ic, "journal mac", -EILSEQ);
    886				dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0);
    887			}
    888		}
    889	}
    890}
    891
    892static void complete_journal_op(void *context)
    893{
    894	struct journal_completion *comp = context;
    895	BUG_ON(!atomic_read(&comp->in_flight));
    896	if (likely(atomic_dec_and_test(&comp->in_flight)))
    897		complete(&comp->comp);
    898}
    899
    900static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
    901			unsigned n_sections, struct journal_completion *comp)
    902{
    903	struct async_submit_ctl submit;
    904	size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT;
    905	unsigned pl_index, pl_offset, section_index;
    906	struct page_list *source_pl, *target_pl;
    907
    908	if (likely(encrypt)) {
    909		source_pl = ic->journal;
    910		target_pl = ic->journal_io;
    911	} else {
    912		source_pl = ic->journal_io;
    913		target_pl = ic->journal;
    914	}
    915
    916	page_list_location(ic, section, 0, &pl_index, &pl_offset);
    917
    918	atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight);
    919
    920	init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL);
    921
    922	section_index = pl_index;
    923
    924	do {
    925		size_t this_step;
    926		struct page *src_pages[2];
    927		struct page *dst_page;
    928
    929		while (unlikely(pl_index == section_index)) {
    930			unsigned dummy;
    931			if (likely(encrypt))
    932				rw_section_mac(ic, section, true);
    933			section++;
    934			n_sections--;
    935			if (!n_sections)
    936				break;
    937			page_list_location(ic, section, 0, &section_index, &dummy);
    938		}
    939
    940		this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset);
    941		dst_page = target_pl[pl_index].page;
    942		src_pages[0] = source_pl[pl_index].page;
    943		src_pages[1] = ic->journal_xor[pl_index].page;
    944
    945		async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit);
    946
    947		pl_index++;
    948		pl_offset = 0;
    949		n_bytes -= this_step;
    950	} while (n_bytes);
    951
    952	BUG_ON(n_sections);
    953
    954	async_tx_issue_pending_all();
    955}
    956
    957static void complete_journal_encrypt(struct crypto_async_request *req, int err)
    958{
    959	struct journal_completion *comp = req->data;
    960	if (unlikely(err)) {
    961		if (likely(err == -EINPROGRESS)) {
    962			complete(&comp->ic->crypto_backoff);
    963			return;
    964		}
    965		dm_integrity_io_error(comp->ic, "asynchronous encrypt", err);
    966	}
    967	complete_journal_op(comp);
    968}
    969
    970static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp)
    971{
    972	int r;
    973	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
    974				      complete_journal_encrypt, comp);
    975	if (likely(encrypt))
    976		r = crypto_skcipher_encrypt(req);
    977	else
    978		r = crypto_skcipher_decrypt(req);
    979	if (likely(!r))
    980		return false;
    981	if (likely(r == -EINPROGRESS))
    982		return true;
    983	if (likely(r == -EBUSY)) {
    984		wait_for_completion(&comp->ic->crypto_backoff);
    985		reinit_completion(&comp->ic->crypto_backoff);
    986		return true;
    987	}
    988	dm_integrity_io_error(comp->ic, "encrypt", r);
    989	return false;
    990}
    991
    992static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
    993			  unsigned n_sections, struct journal_completion *comp)
    994{
    995	struct scatterlist **source_sg;
    996	struct scatterlist **target_sg;
    997
    998	atomic_add(2, &comp->in_flight);
    999
   1000	if (likely(encrypt)) {
   1001		source_sg = ic->journal_scatterlist;
   1002		target_sg = ic->journal_io_scatterlist;
   1003	} else {
   1004		source_sg = ic->journal_io_scatterlist;
   1005		target_sg = ic->journal_scatterlist;
   1006	}
   1007
   1008	do {
   1009		struct skcipher_request *req;
   1010		unsigned ivsize;
   1011		char *iv;
   1012
   1013		if (likely(encrypt))
   1014			rw_section_mac(ic, section, true);
   1015
   1016		req = ic->sk_requests[section];
   1017		ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
   1018		iv = req->iv;
   1019
   1020		memcpy(iv, iv + ivsize, ivsize);
   1021
   1022		req->src = source_sg[section];
   1023		req->dst = target_sg[section];
   1024
   1025		if (unlikely(do_crypt(encrypt, req, comp)))
   1026			atomic_inc(&comp->in_flight);
   1027
   1028		section++;
   1029		n_sections--;
   1030	} while (n_sections);
   1031
   1032	atomic_dec(&comp->in_flight);
   1033	complete_journal_op(comp);
   1034}
   1035
   1036static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
   1037			    unsigned n_sections, struct journal_completion *comp)
   1038{
   1039	if (ic->journal_xor)
   1040		return xor_journal(ic, encrypt, section, n_sections, comp);
   1041	else
   1042		return crypt_journal(ic, encrypt, section, n_sections, comp);
   1043}
   1044
   1045static void complete_journal_io(unsigned long error, void *context)
   1046{
   1047	struct journal_completion *comp = context;
   1048	if (unlikely(error != 0))
   1049		dm_integrity_io_error(comp->ic, "writing journal", -EIO);
   1050	complete_journal_op(comp);
   1051}
   1052
   1053static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags,
   1054			       unsigned sector, unsigned n_sectors, struct journal_completion *comp)
   1055{
   1056	struct dm_io_request io_req;
   1057	struct dm_io_region io_loc;
   1058	unsigned pl_index, pl_offset;
   1059	int r;
   1060
   1061	if (unlikely(dm_integrity_failed(ic))) {
   1062		if (comp)
   1063			complete_journal_io(-1UL, comp);
   1064		return;
   1065	}
   1066
   1067	pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
   1068	pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
   1069
   1070	io_req.bi_op = op;
   1071	io_req.bi_op_flags = op_flags;
   1072	io_req.mem.type = DM_IO_PAGE_LIST;
   1073	if (ic->journal_io)
   1074		io_req.mem.ptr.pl = &ic->journal_io[pl_index];
   1075	else
   1076		io_req.mem.ptr.pl = &ic->journal[pl_index];
   1077	io_req.mem.offset = pl_offset;
   1078	if (likely(comp != NULL)) {
   1079		io_req.notify.fn = complete_journal_io;
   1080		io_req.notify.context = comp;
   1081	} else {
   1082		io_req.notify.fn = NULL;
   1083	}
   1084	io_req.client = ic->io;
   1085	io_loc.bdev = ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev;
   1086	io_loc.sector = ic->start + SB_SECTORS + sector;
   1087	io_loc.count = n_sectors;
   1088
   1089	r = dm_io(&io_req, 1, &io_loc, NULL);
   1090	if (unlikely(r)) {
   1091		dm_integrity_io_error(ic, op == REQ_OP_READ ? "reading journal" : "writing journal", r);
   1092		if (comp) {
   1093			WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
   1094			complete_journal_io(-1UL, comp);
   1095		}
   1096	}
   1097}
   1098
   1099static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
   1100		       unsigned n_sections, struct journal_completion *comp)
   1101{
   1102	unsigned sector, n_sectors;
   1103
   1104	sector = section * ic->journal_section_sectors;
   1105	n_sectors = n_sections * ic->journal_section_sectors;
   1106
   1107	rw_journal_sectors(ic, op, op_flags, sector, n_sectors, comp);
   1108}
   1109
   1110static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections)
   1111{
   1112	struct journal_completion io_comp;
   1113	struct journal_completion crypt_comp_1;
   1114	struct journal_completion crypt_comp_2;
   1115	unsigned i;
   1116
   1117	io_comp.ic = ic;
   1118	init_completion(&io_comp.comp);
   1119
   1120	if (commit_start + commit_sections <= ic->journal_sections) {
   1121		io_comp.in_flight = (atomic_t)ATOMIC_INIT(1);
   1122		if (ic->journal_io) {
   1123			crypt_comp_1.ic = ic;
   1124			init_completion(&crypt_comp_1.comp);
   1125			crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
   1126			encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1);
   1127			wait_for_completion_io(&crypt_comp_1.comp);
   1128		} else {
   1129			for (i = 0; i < commit_sections; i++)
   1130				rw_section_mac(ic, commit_start + i, true);
   1131		}
   1132		rw_journal(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, commit_start,
   1133			   commit_sections, &io_comp);
   1134	} else {
   1135		unsigned to_end;
   1136		io_comp.in_flight = (atomic_t)ATOMIC_INIT(2);
   1137		to_end = ic->journal_sections - commit_start;
   1138		if (ic->journal_io) {
   1139			crypt_comp_1.ic = ic;
   1140			init_completion(&crypt_comp_1.comp);
   1141			crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
   1142			encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1);
   1143			if (try_wait_for_completion(&crypt_comp_1.comp)) {
   1144				rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
   1145				reinit_completion(&crypt_comp_1.comp);
   1146				crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
   1147				encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1);
   1148				wait_for_completion_io(&crypt_comp_1.comp);
   1149			} else {
   1150				crypt_comp_2.ic = ic;
   1151				init_completion(&crypt_comp_2.comp);
   1152				crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0);
   1153				encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2);
   1154				wait_for_completion_io(&crypt_comp_1.comp);
   1155				rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
   1156				wait_for_completion_io(&crypt_comp_2.comp);
   1157			}
   1158		} else {
   1159			for (i = 0; i < to_end; i++)
   1160				rw_section_mac(ic, commit_start + i, true);
   1161			rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
   1162			for (i = 0; i < commit_sections - to_end; i++)
   1163				rw_section_mac(ic, i, true);
   1164		}
   1165		rw_journal(ic, REQ_OP_WRITE, REQ_FUA, 0, commit_sections - to_end, &io_comp);
   1166	}
   1167
   1168	wait_for_completion_io(&io_comp.comp);
   1169}
   1170
   1171static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset,
   1172			      unsigned n_sectors, sector_t target, io_notify_fn fn, void *data)
   1173{
   1174	struct dm_io_request io_req;
   1175	struct dm_io_region io_loc;
   1176	int r;
   1177	unsigned sector, pl_index, pl_offset;
   1178
   1179	BUG_ON((target | n_sectors | offset) & (unsigned)(ic->sectors_per_block - 1));
   1180
   1181	if (unlikely(dm_integrity_failed(ic))) {
   1182		fn(-1UL, data);
   1183		return;
   1184	}
   1185
   1186	sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset;
   1187
   1188	pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
   1189	pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
   1190
   1191	io_req.bi_op = REQ_OP_WRITE;
   1192	io_req.bi_op_flags = 0;
   1193	io_req.mem.type = DM_IO_PAGE_LIST;
   1194	io_req.mem.ptr.pl = &ic->journal[pl_index];
   1195	io_req.mem.offset = pl_offset;
   1196	io_req.notify.fn = fn;
   1197	io_req.notify.context = data;
   1198	io_req.client = ic->io;
   1199	io_loc.bdev = ic->dev->bdev;
   1200	io_loc.sector = target;
   1201	io_loc.count = n_sectors;
   1202
   1203	r = dm_io(&io_req, 1, &io_loc, NULL);
   1204	if (unlikely(r)) {
   1205		WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
   1206		fn(-1UL, data);
   1207	}
   1208}
   1209
   1210static bool ranges_overlap(struct dm_integrity_range *range1, struct dm_integrity_range *range2)
   1211{
   1212	return range1->logical_sector < range2->logical_sector + range2->n_sectors &&
   1213	       range1->logical_sector + range1->n_sectors > range2->logical_sector;
   1214}
   1215
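        /*
         * Half-open interval test: e.g. a range starting at sector 100 with
         * 50 sectors ([100, 150)) overlaps one starting at 140 with 60 sectors
         * ([140, 200)) because 100 < 200 and 150 > 140; adjacent ranges such
         * as [100, 150) and [150, 200) do not overlap.
         */
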
   1216static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range, bool check_waiting)
   1217{
   1218	struct rb_node **n = &ic->in_progress.rb_node;
   1219	struct rb_node *parent;
   1220
   1221	BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1));
   1222
   1223	if (likely(check_waiting)) {
   1224		struct dm_integrity_range *range;
   1225		list_for_each_entry(range, &ic->wait_list, wait_entry) {
   1226			if (unlikely(ranges_overlap(range, new_range)))
   1227				return false;
   1228		}
   1229	}
   1230
   1231	parent = NULL;
   1232
   1233	while (*n) {
   1234		struct dm_integrity_range *range = container_of(*n, struct dm_integrity_range, node);
   1235
   1236		parent = *n;
   1237		if (new_range->logical_sector + new_range->n_sectors <= range->logical_sector) {
   1238			n = &range->node.rb_left;
   1239		} else if (new_range->logical_sector >= range->logical_sector + range->n_sectors) {
   1240			n = &range->node.rb_right;
   1241		} else {
   1242			return false;
   1243		}
   1244	}
   1245
   1246	rb_link_node(&new_range->node, parent, n);
   1247	rb_insert_color(&new_range->node, &ic->in_progress);
   1248
   1249	return true;
   1250}
   1251
   1252static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range)
   1253{
   1254	rb_erase(&range->node, &ic->in_progress);
   1255	while (unlikely(!list_empty(&ic->wait_list))) {
   1256		struct dm_integrity_range *last_range =
   1257			list_first_entry(&ic->wait_list, struct dm_integrity_range, wait_entry);
   1258		struct task_struct *last_range_task;
   1259		last_range_task = last_range->task;
   1260		list_del(&last_range->wait_entry);
   1261		if (!add_new_range(ic, last_range, false)) {
   1262			last_range->task = last_range_task;
   1263			list_add(&last_range->wait_entry, &ic->wait_list);
   1264			break;
   1265		}
   1266		last_range->waiting = false;
   1267		wake_up_process(last_range_task);
   1268	}
   1269}
   1270
   1271static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range)
   1272{
   1273	unsigned long flags;
   1274
   1275	spin_lock_irqsave(&ic->endio_wait.lock, flags);
   1276	remove_range_unlocked(ic, range);
   1277	spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
   1278}
   1279
   1280static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
   1281{
   1282	new_range->waiting = true;
   1283	list_add_tail(&new_range->wait_entry, &ic->wait_list);
   1284	new_range->task = current;
   1285	do {
   1286		__set_current_state(TASK_UNINTERRUPTIBLE);
   1287		spin_unlock_irq(&ic->endio_wait.lock);
   1288		io_schedule();
   1289		spin_lock_irq(&ic->endio_wait.lock);
   1290	} while (unlikely(new_range->waiting));
   1291}
   1292
   1293static void add_new_range_and_wait(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
   1294{
   1295	if (unlikely(!add_new_range(ic, new_range, true)))
   1296		wait_and_add_new_range(ic, new_range);
   1297}
   1298
   1299static void init_journal_node(struct journal_node *node)
   1300{
   1301	RB_CLEAR_NODE(&node->node);
   1302	node->sector = (sector_t)-1;
   1303}
   1304
   1305static void add_journal_node(struct dm_integrity_c *ic, struct journal_node *node, sector_t sector)
   1306{
   1307	struct rb_node **link;
   1308	struct rb_node *parent;
   1309
   1310	node->sector = sector;
   1311	BUG_ON(!RB_EMPTY_NODE(&node->node));
   1312
   1313	link = &ic->journal_tree_root.rb_node;
   1314	parent = NULL;
   1315
   1316	while (*link) {
   1317		struct journal_node *j;
   1318		parent = *link;
   1319		j = container_of(parent, struct journal_node, node);
   1320		if (sector < j->sector)
   1321			link = &j->node.rb_left;
   1322		else
   1323			link = &j->node.rb_right;
   1324	}
   1325
   1326	rb_link_node(&node->node, parent, link);
   1327	rb_insert_color(&node->node, &ic->journal_tree_root);
   1328}
   1329
   1330static void remove_journal_node(struct dm_integrity_c *ic, struct journal_node *node)
   1331{
   1332	BUG_ON(RB_EMPTY_NODE(&node->node));
   1333	rb_erase(&node->node, &ic->journal_tree_root);
   1334	init_journal_node(node);
   1335}
   1336
   1337#define NOT_FOUND	(-1U)
   1338
   1339static unsigned find_journal_node(struct dm_integrity_c *ic, sector_t sector, sector_t *next_sector)
   1340{
   1341	struct rb_node *n = ic->journal_tree_root.rb_node;
   1342	unsigned found = NOT_FOUND;
   1343	*next_sector = (sector_t)-1;
   1344	while (n) {
   1345		struct journal_node *j = container_of(n, struct journal_node, node);
   1346		if (sector == j->sector) {
   1347			found = j - ic->journal_tree;
   1348		}
   1349		if (sector < j->sector) {
   1350			*next_sector = j->sector;
   1351			n = j->node.rb_left;
   1352		} else {
   1353			n = j->node.rb_right;
   1354		}
   1355	}
   1356
   1357	return found;
   1358}
   1359
   1360static bool test_journal_node(struct dm_integrity_c *ic, unsigned pos, sector_t sector)
   1361{
   1362	struct journal_node *node, *next_node;
   1363	struct rb_node *next;
   1364
   1365	if (unlikely(pos >= ic->journal_entries))
   1366		return false;
   1367	node = &ic->journal_tree[pos];
   1368	if (unlikely(RB_EMPTY_NODE(&node->node)))
   1369		return false;
   1370	if (unlikely(node->sector != sector))
   1371		return false;
   1372
   1373	next = rb_next(&node->node);
   1374	if (unlikely(!next))
   1375		return true;
   1376
   1377	next_node = container_of(next, struct journal_node, node);
   1378	return next_node->sector != sector;
   1379}
   1380
   1381static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_node *node)
   1382{
   1383	struct rb_node *next;
   1384	struct journal_node *next_node;
   1385	unsigned next_section;
   1386
   1387	BUG_ON(RB_EMPTY_NODE(&node->node));
   1388
   1389	next = rb_next(&node->node);
   1390	if (unlikely(!next))
   1391		return false;
   1392
   1393	next_node = container_of(next, struct journal_node, node);
   1394
   1395	if (next_node->sector != node->sector)
   1396		return false;
   1397
   1398	next_section = (unsigned)(next_node - ic->journal_tree) / ic->journal_section_entries;
   1399	if (next_section >= ic->committed_section &&
   1400	    next_section < ic->committed_section + ic->n_committed_sections)
   1401		return true;
   1402	if (next_section + ic->journal_sections < ic->committed_section + ic->n_committed_sections)
   1403		return true;
   1404
   1405	return false;
   1406}
   1407
   1408#define TAG_READ	0
   1409#define TAG_WRITE	1
   1410#define TAG_CMP		2
   1411
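        /*
         * In dm_integrity_rw_tag() below: TAG_READ copies tags from the
         * metadata into 'tag', TAG_WRITE copies 'tag' into the metadata
         * (dirtying the buffer only when something actually changed), and
         * TAG_CMP compares them, returning 0 on a full match and roughly the
         * number of bytes left unchecked when a mismatching, non-filler tag
         * is found.
         */
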
   1412static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
   1413			       unsigned *metadata_offset, unsigned total_size, int op)
   1414{
   1415#define MAY_BE_FILLER		1
   1416#define MAY_BE_HASH		2
   1417	unsigned hash_offset = 0;
   1418	unsigned may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
   1419
   1420	do {
   1421		unsigned char *data, *dp;
   1422		struct dm_buffer *b;
   1423		unsigned to_copy;
   1424		int r;
   1425
   1426		r = dm_integrity_failed(ic);
   1427		if (unlikely(r))
   1428			return r;
   1429
   1430		data = dm_bufio_read(ic->bufio, *metadata_block, &b);
   1431		if (IS_ERR(data))
   1432			return PTR_ERR(data);
   1433
   1434		to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size);
   1435		dp = data + *metadata_offset;
   1436		if (op == TAG_READ) {
   1437			memcpy(tag, dp, to_copy);
   1438		} else if (op == TAG_WRITE) {
   1439			if (memcmp(dp, tag, to_copy)) {
   1440				memcpy(dp, tag, to_copy);
   1441				dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
   1442			}
   1443		} else {
   1444			/* e.g.: op == TAG_CMP */
   1445
   1446			if (likely(is_power_of_2(ic->tag_size))) {
   1447				if (unlikely(memcmp(dp, tag, to_copy)))
   1448					if (unlikely(!ic->discard) ||
   1449					    unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL)) {
   1450						goto thorough_test;
   1451				}
   1452			} else {
   1453				unsigned i, ts;
   1454thorough_test:
   1455				ts = total_size;
   1456
   1457				for (i = 0; i < to_copy; i++, ts--) {
   1458					if (unlikely(dp[i] != tag[i]))
   1459						may_be &= ~MAY_BE_HASH;
   1460					if (likely(dp[i] != DISCARD_FILLER))
   1461						may_be &= ~MAY_BE_FILLER;
   1462					hash_offset++;
   1463					if (unlikely(hash_offset == ic->tag_size)) {
   1464						if (unlikely(!may_be)) {
   1465							dm_bufio_release(b);
   1466							return ts;
   1467						}
   1468						hash_offset = 0;
   1469						may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
   1470					}
   1471				}
   1472			}
   1473		}
   1474		dm_bufio_release(b);
   1475
   1476		tag += to_copy;
   1477		*metadata_offset += to_copy;
   1478		if (unlikely(*metadata_offset == 1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) {
   1479			(*metadata_block)++;
   1480			*metadata_offset = 0;
   1481		}
   1482
   1483		if (unlikely(!is_power_of_2(ic->tag_size))) {
   1484			hash_offset = (hash_offset + to_copy) % ic->tag_size;
   1485		}
   1486
   1487		total_size -= to_copy;
   1488	} while (unlikely(total_size));
   1489
   1490	return 0;
   1491#undef MAY_BE_FILLER
   1492#undef MAY_BE_HASH
   1493}
   1494
   1495struct flush_request {
   1496	struct dm_io_request io_req;
   1497	struct dm_io_region io_reg;
   1498	struct dm_integrity_c *ic;
   1499	struct completion comp;
   1500};
   1501
   1502static void flush_notify(unsigned long error, void *fr_)
   1503{
   1504	struct flush_request *fr = fr_;
   1505	if (unlikely(error != 0))
   1506		dm_integrity_io_error(fr->ic, "flushing disk cache", -EIO);
   1507	complete(&fr->comp);
   1508}
   1509
   1510static void dm_integrity_flush_buffers(struct dm_integrity_c *ic, bool flush_data)
   1511{
   1512	int r;
   1513
   1514	struct flush_request fr;
   1515
   1516	if (!ic->meta_dev)
   1517		flush_data = false;
   1518	if (flush_data) {
   1519		fr.io_req.bi_op = REQ_OP_WRITE,
   1520		fr.io_req.bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
   1521		fr.io_req.mem.type = DM_IO_KMEM,
   1522		fr.io_req.mem.ptr.addr = NULL,
   1523		fr.io_req.notify.fn = flush_notify,
   1524		fr.io_req.notify.context = &fr;
   1525		fr.io_req.client = dm_bufio_get_dm_io_client(ic->bufio),
   1526		fr.io_reg.bdev = ic->dev->bdev,
   1527		fr.io_reg.sector = 0,
   1528		fr.io_reg.count = 0,
   1529		fr.ic = ic;
   1530		init_completion(&fr.comp);
   1531		r = dm_io(&fr.io_req, 1, &fr.io_reg, NULL);
   1532		BUG_ON(r);
   1533	}
   1534
   1535	r = dm_bufio_write_dirty_buffers(ic->bufio);
   1536	if (unlikely(r))
   1537		dm_integrity_io_error(ic, "writing tags", r);
   1538
   1539	if (flush_data)
   1540		wait_for_completion(&fr.comp);
   1541}
   1542
   1543static void sleep_on_endio_wait(struct dm_integrity_c *ic)
   1544{
   1545	DECLARE_WAITQUEUE(wait, current);
   1546	__add_wait_queue(&ic->endio_wait, &wait);
   1547	__set_current_state(TASK_UNINTERRUPTIBLE);
   1548	spin_unlock_irq(&ic->endio_wait.lock);
   1549	io_schedule();
   1550	spin_lock_irq(&ic->endio_wait.lock);
   1551	__remove_wait_queue(&ic->endio_wait, &wait);
   1552}
   1553
   1554static void autocommit_fn(struct timer_list *t)
   1555{
   1556	struct dm_integrity_c *ic = from_timer(ic, t, autocommit_timer);
   1557
   1558	if (likely(!dm_integrity_failed(ic)))
   1559		queue_work(ic->commit_wq, &ic->commit_work);
   1560}
   1561
   1562static void schedule_autocommit(struct dm_integrity_c *ic)
   1563{
   1564	if (!timer_pending(&ic->autocommit_timer))
   1565		mod_timer(&ic->autocommit_timer, jiffies + ic->autocommit_jiffies);
   1566}
   1567
   1568static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
   1569{
   1570	struct bio *bio;
   1571	unsigned long flags;
   1572
   1573	spin_lock_irqsave(&ic->endio_wait.lock, flags);
   1574	bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
   1575	bio_list_add(&ic->flush_bio_list, bio);
   1576	spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
   1577
   1578	queue_work(ic->commit_wq, &ic->commit_work);
   1579}
   1580
   1581static void do_endio(struct dm_integrity_c *ic, struct bio *bio)
   1582{
   1583	int r = dm_integrity_failed(ic);
   1584	if (unlikely(r) && !bio->bi_status)
   1585		bio->bi_status = errno_to_blk_status(r);
   1586	if (unlikely(ic->synchronous_mode) && bio_op(bio) == REQ_OP_WRITE) {
   1587		unsigned long flags;
   1588		spin_lock_irqsave(&ic->endio_wait.lock, flags);
   1589		bio_list_add(&ic->synchronous_bios, bio);
   1590		queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
   1591		spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
   1592		return;
   1593	}
   1594	bio_endio(bio);
   1595}
   1596
   1597static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
   1598{
   1599	struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
   1600
   1601	if (unlikely(dio->fua) && likely(!bio->bi_status) && likely(!dm_integrity_failed(ic)))
   1602		submit_flush_bio(ic, dio);
   1603	else
   1604		do_endio(ic, bio);
   1605}
   1606
   1607static void dec_in_flight(struct dm_integrity_io *dio)
   1608{
   1609	if (atomic_dec_and_test(&dio->in_flight)) {
   1610		struct dm_integrity_c *ic = dio->ic;
   1611		struct bio *bio;
   1612
   1613		remove_range(ic, &dio->range);
   1614
   1615		if (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))
   1616			schedule_autocommit(ic);
   1617
   1618		bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
   1619
   1620		if (unlikely(dio->bi_status) && !bio->bi_status)
   1621			bio->bi_status = dio->bi_status;
   1622		if (likely(!bio->bi_status) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) {
   1623			dio->range.logical_sector += dio->range.n_sectors;
   1624			bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT);
   1625			INIT_WORK(&dio->work, integrity_bio_wait);
   1626			queue_work(ic->offload_wq, &dio->work);
   1627			return;
   1628		}
   1629		do_endio_flush(ic, dio);
   1630	}
   1631}
   1632
   1633static void integrity_end_io(struct bio *bio)
   1634{
   1635	struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
   1636
   1637	dm_bio_restore(&dio->bio_details, bio);
   1638	if (bio->bi_integrity)
   1639		bio->bi_opf |= REQ_INTEGRITY;
   1640
   1641	if (dio->completion)
   1642		complete(dio->completion);
   1643
   1644	dec_in_flight(dio);
   1645}
   1646
   1647static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector,
   1648				      const char *data, char *result)
   1649{
   1650	__le64 sector_le = cpu_to_le64(sector);
   1651	SHASH_DESC_ON_STACK(req, ic->internal_hash);
   1652	int r;
   1653	unsigned digest_size;
   1654
   1655	req->tfm = ic->internal_hash;
   1656
   1657	r = crypto_shash_init(req);
   1658	if (unlikely(r < 0)) {
   1659		dm_integrity_io_error(ic, "crypto_shash_init", r);
   1660		goto failed;
   1661	}
   1662
   1663	if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
   1664		r = crypto_shash_update(req, (__u8 *)&ic->sb->salt, SALT_SIZE);
   1665		if (unlikely(r < 0)) {
   1666			dm_integrity_io_error(ic, "crypto_shash_update", r);
   1667			goto failed;
   1668		}
   1669	}
   1670
   1671	r = crypto_shash_update(req, (const __u8 *)&sector_le, sizeof sector_le);
   1672	if (unlikely(r < 0)) {
   1673		dm_integrity_io_error(ic, "crypto_shash_update", r);
   1674		goto failed;
   1675	}
   1676
   1677	r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT);
   1678	if (unlikely(r < 0)) {
   1679		dm_integrity_io_error(ic, "crypto_shash_update", r);
   1680		goto failed;
   1681	}
   1682
   1683	r = crypto_shash_final(req, result);
   1684	if (unlikely(r < 0)) {
   1685		dm_integrity_io_error(ic, "crypto_shash_final", r);
   1686		goto failed;
   1687	}
   1688
   1689	digest_size = crypto_shash_digestsize(ic->internal_hash);
   1690	if (unlikely(digest_size < ic->tag_size))
   1691		memset(result + digest_size, 0, ic->tag_size - digest_size);
   1692
   1693	return;
   1694
   1695failed:
   1696	/* this shouldn't happen anyway; the hash functions have no reason to fail */
   1697	get_random_bytes(result, ic->tag_size);
   1698}
   1699
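/*
 * Work function that processes the tag area for one bio.  With an internal
 * hash, each block is checksummed and the tags are written (writes) or
 * compared against the stored tags (reads); discards overwrite the tags
 * with DISCARD_FILLER.  Without an internal hash, tags are copied between
 * the bio integrity payload and the metadata area.
 */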
   1700static void integrity_metadata(struct work_struct *w)
   1701{
   1702	struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
   1703	struct dm_integrity_c *ic = dio->ic;
   1704
   1705	int r;
   1706
   1707	if (ic->internal_hash) {
   1708		struct bvec_iter iter;
   1709		struct bio_vec bv;
   1710		unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
   1711		struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
   1712		char *checksums;
   1713		unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
   1714		char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
   1715		sector_t sector;
   1716		unsigned sectors_to_process;
   1717
   1718		if (unlikely(ic->mode == 'R'))
   1719			goto skip_io;
   1720
   1721		if (likely(dio->op != REQ_OP_DISCARD))
   1722			checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
   1723					    GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
   1724		else
   1725			checksums = kmalloc(PAGE_SIZE, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
   1726		if (!checksums) {
   1727			checksums = checksums_onstack;
   1728			if (WARN_ON(extra_space &&
   1729				    digest_size > sizeof(checksums_onstack))) {
   1730				r = -EINVAL;
   1731				goto error;
   1732			}
   1733		}
   1734
   1735		if (unlikely(dio->op == REQ_OP_DISCARD)) {
   1736			sector_t bi_sector = dio->bio_details.bi_iter.bi_sector;
   1737			unsigned bi_size = dio->bio_details.bi_iter.bi_size;
   1738			unsigned max_size = likely(checksums != checksums_onstack) ? PAGE_SIZE : HASH_MAX_DIGESTSIZE;
   1739			unsigned max_blocks = max_size / ic->tag_size;
   1740			memset(checksums, DISCARD_FILLER, max_size);
   1741
   1742			while (bi_size) {
   1743				unsigned this_step_blocks = bi_size >> (SECTOR_SHIFT + ic->sb->log2_sectors_per_block);
   1744				this_step_blocks = min(this_step_blocks, max_blocks);
   1745				r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
   1746							this_step_blocks * ic->tag_size, TAG_WRITE);
   1747				if (unlikely(r)) {
   1748					if (likely(checksums != checksums_onstack))
   1749						kfree(checksums);
   1750					goto error;
   1751				}
   1752
   1753				/*if (bi_size < this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block)) {
   1754					printk("BUGG: bi_sector: %llx, bi_size: %u\n", bi_sector, bi_size);
   1755					printk("BUGG: this_step_blocks: %u\n", this_step_blocks);
   1756					BUG();
   1757				}*/
   1758				bi_size -= this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block);
   1759				bi_sector += this_step_blocks << ic->sb->log2_sectors_per_block;
   1760			}
   1761
   1762			if (likely(checksums != checksums_onstack))
   1763				kfree(checksums);
   1764			goto skip_io;
   1765		}
   1766
   1767		sector = dio->range.logical_sector;
   1768		sectors_to_process = dio->range.n_sectors;
   1769
   1770		__bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) {
   1771			unsigned pos;
   1772			char *mem, *checksums_ptr;
   1773
   1774again:
   1775			mem = bvec_kmap_local(&bv);
   1776			pos = 0;
   1777			checksums_ptr = checksums;
   1778			do {
   1779				integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr);
   1780				checksums_ptr += ic->tag_size;
   1781				sectors_to_process -= ic->sectors_per_block;
   1782				pos += ic->sectors_per_block << SECTOR_SHIFT;
   1783				sector += ic->sectors_per_block;
   1784			} while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack);
   1785			kunmap_local(mem);
   1786
   1787			r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
   1788						checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE);
   1789			if (unlikely(r)) {
   1790				if (r > 0) {
   1791					sector_t s;
   1792
   1793					s = sector - ((r + ic->tag_size - 1) / ic->tag_size);
   1794					DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx",
   1795						    bio->bi_bdev, s);
   1796					r = -EILSEQ;
   1797					atomic64_inc(&ic->number_of_mismatches);
   1798					dm_audit_log_bio(DM_MSG_PREFIX, "integrity-checksum",
   1799							 bio, s, 0);
   1800				}
   1801				if (likely(checksums != checksums_onstack))
   1802					kfree(checksums);
   1803				goto error;
   1804			}
   1805
   1806			if (!sectors_to_process)
   1807				break;
   1808
   1809			if (unlikely(pos < bv.bv_len)) {
   1810				bv.bv_offset += pos;
   1811				bv.bv_len -= pos;
   1812				goto again;
   1813			}
   1814		}
   1815
   1816		if (likely(checksums != checksums_onstack))
   1817			kfree(checksums);
   1818	} else {
   1819		struct bio_integrity_payload *bip = dio->bio_details.bi_integrity;
   1820
   1821		if (bip) {
   1822			struct bio_vec biv;
   1823			struct bvec_iter iter;
   1824			unsigned data_to_process = dio->range.n_sectors;
   1825			sector_to_block(ic, data_to_process);
   1826			data_to_process *= ic->tag_size;
   1827
   1828			bip_for_each_vec(biv, bip, iter) {
   1829				unsigned char *tag;
   1830				unsigned this_len;
   1831
   1832				BUG_ON(PageHighMem(biv.bv_page));
   1833				tag = bvec_virt(&biv);
   1834				this_len = min(biv.bv_len, data_to_process);
   1835				r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset,
   1836							this_len, dio->op == REQ_OP_READ ? TAG_READ : TAG_WRITE);
   1837				if (unlikely(r))
   1838					goto error;
   1839				data_to_process -= this_len;
   1840				if (!data_to_process)
   1841					break;
   1842			}
   1843		}
   1844	}
   1845skip_io:
   1846	dec_in_flight(dio);
   1847	return;
   1848error:
   1849	dio->bi_status = errno_to_blk_status(r);
   1850	dec_in_flight(dio);
   1851}
   1852
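/*
 * Target map function: split discards that would cross a max_io_len
 * boundary, divert PREFLUSH bios to the flush list, strip REQ_FUA (an
 * explicit flush is issued on completion instead), check that the bio fits
 * into provided_data_sectors and is aligned to the block size, validate the
 * external integrity payload (forbidden with an internal hash), then remap
 * the bio to the interleaved data area and continue.
 */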
   1853static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
   1854{
   1855	struct dm_integrity_c *ic = ti->private;
   1856	struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
   1857	struct bio_integrity_payload *bip;
   1858
   1859	sector_t area, offset;
   1860
   1861	dio->ic = ic;
   1862	dio->bi_status = 0;
   1863	dio->op = bio_op(bio);
   1864
   1865	if (unlikely(dio->op == REQ_OP_DISCARD)) {
   1866		if (ti->max_io_len) {
   1867			sector_t sec = dm_target_offset(ti, bio->bi_iter.bi_sector);
   1868			unsigned log2_max_io_len = __fls(ti->max_io_len);
   1869			sector_t start_boundary = sec >> log2_max_io_len;
   1870			sector_t end_boundary = (sec + bio_sectors(bio) - 1) >> log2_max_io_len;
   1871			if (start_boundary < end_boundary) {
   1872				sector_t len = ti->max_io_len - (sec & (ti->max_io_len - 1));
   1873				dm_accept_partial_bio(bio, len);
   1874			}
   1875		}
   1876	}
   1877
   1878	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
   1879		submit_flush_bio(ic, dio);
   1880		return DM_MAPIO_SUBMITTED;
   1881	}
   1882
   1883	dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
   1884	dio->fua = dio->op == REQ_OP_WRITE && bio->bi_opf & REQ_FUA;
   1885	if (unlikely(dio->fua)) {
   1886		/*
   1887		 * Don't pass down the FUA flag because we have to flush
   1888		 * the disk cache anyway.
   1889		 */
   1890		bio->bi_opf &= ~REQ_FUA;
   1891	}
   1892	if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) {
   1893		DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx",
   1894		      dio->range.logical_sector, bio_sectors(bio),
   1895		      ic->provided_data_sectors);
   1896		return DM_MAPIO_KILL;
   1897	}
   1898	if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) {
   1899		DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x",
   1900		      ic->sectors_per_block,
   1901		      dio->range.logical_sector, bio_sectors(bio));
   1902		return DM_MAPIO_KILL;
   1903	}
   1904
   1905	if (ic->sectors_per_block > 1 && likely(dio->op != REQ_OP_DISCARD)) {
   1906		struct bvec_iter iter;
   1907		struct bio_vec bv;
   1908		bio_for_each_segment(bv, bio, iter) {
   1909			if (unlikely(bv.bv_len & ((ic->sectors_per_block << SECTOR_SHIFT) - 1))) {
   1910				DMERR("Bio vector (%u,%u) is not aligned on %u-sector boundary",
   1911					bv.bv_offset, bv.bv_len, ic->sectors_per_block);
   1912				return DM_MAPIO_KILL;
   1913			}
   1914		}
   1915	}
   1916
   1917	bip = bio_integrity(bio);
   1918	if (!ic->internal_hash) {
   1919		if (bip) {
   1920			unsigned wanted_tag_size = bio_sectors(bio) >> ic->sb->log2_sectors_per_block;
   1921			if (ic->log2_tag_size >= 0)
   1922				wanted_tag_size <<= ic->log2_tag_size;
   1923			else
   1924				wanted_tag_size *= ic->tag_size;
   1925			if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) {
   1926				DMERR("Invalid integrity data size %u, expected %u",
   1927				      bip->bip_iter.bi_size, wanted_tag_size);
   1928				return DM_MAPIO_KILL;
   1929			}
   1930		}
   1931	} else {
   1932		if (unlikely(bip != NULL)) {
   1933			DMERR("Unexpected integrity data when using internal hash");
   1934			return DM_MAPIO_KILL;
   1935		}
   1936	}
   1937
   1938	if (unlikely(ic->mode == 'R') && unlikely(dio->op != REQ_OP_READ))
   1939		return DM_MAPIO_KILL;
   1940
   1941	get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
   1942	dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
   1943	bio->bi_iter.bi_sector = get_data_sector(ic, area, offset);
   1944
   1945	dm_integrity_map_continue(dio, true);
   1946	return DM_MAPIO_SUBMITTED;
   1947}
   1948
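/*
 * Copy bio data to or from the in-memory journal.  A journal data sector
 * stores the block data with a commit_id in place of its last bytes; the
 * displaced bytes live in the entry's last_bytes[].  Reads reassemble the
 * original data, waiting for entries that are still being written; writes
 * store the data and commit ids, compute the tag and finally set the sector
 * number that makes the entry valid.  Returns true if the bio has data
 * left, in which case the caller allocates more journal space.
 */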
   1949static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
   1950				 unsigned journal_section, unsigned journal_entry)
   1951{
   1952	struct dm_integrity_c *ic = dio->ic;
   1953	sector_t logical_sector;
   1954	unsigned n_sectors;
   1955
   1956	logical_sector = dio->range.logical_sector;
   1957	n_sectors = dio->range.n_sectors;
   1958	do {
   1959		struct bio_vec bv = bio_iovec(bio);
   1960		char *mem;
   1961
   1962		if (unlikely(bv.bv_len >> SECTOR_SHIFT > n_sectors))
   1963			bv.bv_len = n_sectors << SECTOR_SHIFT;
   1964		n_sectors -= bv.bv_len >> SECTOR_SHIFT;
   1965		bio_advance_iter(bio, &bio->bi_iter, bv.bv_len);
   1966retry_kmap:
   1967		mem = kmap_local_page(bv.bv_page);
   1968		if (likely(dio->op == REQ_OP_WRITE))
   1969			flush_dcache_page(bv.bv_page);
   1970
   1971		do {
   1972			struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry);
   1973
   1974			if (unlikely(dio->op == REQ_OP_READ)) {
   1975				struct journal_sector *js;
   1976				char *mem_ptr;
   1977				unsigned s;
   1978
   1979				if (unlikely(journal_entry_is_inprogress(je))) {
   1980					flush_dcache_page(bv.bv_page);
   1981					kunmap_local(mem);
   1982
   1983					__io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je));
   1984					goto retry_kmap;
   1985				}
   1986				smp_rmb();
   1987				BUG_ON(journal_entry_get_sector(je) != logical_sector);
   1988				js = access_journal_data(ic, journal_section, journal_entry);
   1989				mem_ptr = mem + bv.bv_offset;
   1990				s = 0;
   1991				do {
   1992					memcpy(mem_ptr, js, JOURNAL_SECTOR_DATA);
   1993					*(commit_id_t *)(mem_ptr + JOURNAL_SECTOR_DATA) = je->last_bytes[s];
   1994					js++;
   1995					mem_ptr += 1 << SECTOR_SHIFT;
   1996				} while (++s < ic->sectors_per_block);
   1997#ifdef INTERNAL_VERIFY
   1998				if (ic->internal_hash) {
   1999					char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
   2000
   2001					integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
   2002					if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
   2003						DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
   2004							    logical_sector);
   2005						dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum",
   2006								 bio, logical_sector, 0);
   2007					}
   2008				}
   2009#endif
   2010			}
   2011
   2012			if (!ic->internal_hash) {
   2013				struct bio_integrity_payload *bip = bio_integrity(bio);
   2014				unsigned tag_todo = ic->tag_size;
   2015				char *tag_ptr = journal_entry_tag(ic, je);
   2016
   2017				if (bip) do {
   2018					struct bio_vec biv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
   2019					unsigned tag_now = min(biv.bv_len, tag_todo);
   2020					char *tag_addr;
   2021					BUG_ON(PageHighMem(biv.bv_page));
   2022					tag_addr = bvec_virt(&biv);
   2023					if (likely(dio->op == REQ_OP_WRITE))
   2024						memcpy(tag_ptr, tag_addr, tag_now);
   2025					else
   2026						memcpy(tag_addr, tag_ptr, tag_now);
   2027					bvec_iter_advance(bip->bip_vec, &bip->bip_iter, tag_now);
   2028					tag_ptr += tag_now;
   2029					tag_todo -= tag_now;
   2030				} while (unlikely(tag_todo)); else {
   2031					if (likely(dio->op == REQ_OP_WRITE))
   2032						memset(tag_ptr, 0, tag_todo);
   2033				}
   2034			}
   2035
   2036			if (likely(dio->op == REQ_OP_WRITE)) {
   2037				struct journal_sector *js;
   2038				unsigned s;
   2039
   2040				js = access_journal_data(ic, journal_section, journal_entry);
   2041				memcpy(js, mem + bv.bv_offset, ic->sectors_per_block << SECTOR_SHIFT);
   2042
   2043				s = 0;
   2044				do {
   2045					je->last_bytes[s] = js[s].commit_id;
   2046				} while (++s < ic->sectors_per_block);
   2047
   2048				if (ic->internal_hash) {
   2049					unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
   2050					if (unlikely(digest_size > ic->tag_size)) {
   2051						char checksums_onstack[HASH_MAX_DIGESTSIZE];
   2052						integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack);
   2053						memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size);
   2054					} else
   2055						integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je));
   2056				}
   2057
   2058				journal_entry_set_sector(je, logical_sector);
   2059			}
   2060			logical_sector += ic->sectors_per_block;
   2061
   2062			journal_entry++;
   2063			if (unlikely(journal_entry == ic->journal_section_entries)) {
   2064				journal_entry = 0;
   2065				journal_section++;
   2066				wraparound_section(ic, &journal_section);
   2067			}
   2068
   2069			bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT;
   2070		} while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT);
   2071
   2072		if (unlikely(dio->op == REQ_OP_READ))
   2073			flush_dcache_page(bv.bv_page);
   2074		kunmap_local(mem);
   2075	} while (n_sectors);
   2076
   2077	if (likely(dio->op == REQ_OP_WRITE)) {
   2078		smp_mb();
   2079		if (unlikely(waitqueue_active(&ic->copy_to_journal_wait)))
   2080			wake_up(&ic->copy_to_journal_wait);
   2081		if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold) {
   2082			queue_work(ic->commit_wq, &ic->commit_work);
   2083		} else {
   2084			schedule_autocommit(ic);
   2085		}
   2086	} else {
   2087		remove_range(ic, &dio->range);
   2088	}
   2089
   2090	if (unlikely(bio->bi_iter.bi_size)) {
   2091		sector_t area, offset;
   2092
   2093		dio->range.logical_sector = logical_sector;
   2094		get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
   2095		dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, &dio->metadata_offset);
   2096		return true;
   2097	}
   2098
   2099	return false;
   2100}
   2101
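/*
 * Decide how to service the bio.  Under endio_wait.lock, journal-mode
 * writes reserve journal entries (sleeping, or offloading to a workqueue
 * when called from the map function, if the journal is full), journal-mode
 * reads look the sectors up in the journal tree, and the range is added to
 * the in-progress tree.  Bios found in the journal go through
 * __journal_read_write; bitmap-mode writes to regions not yet marked
 * writable are deferred to bitmap_block_work; everything else is submitted
 * to the data device, with integrity_metadata run synchronously for reads
 * and queued on metadata_wq for writes.
 */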
   2102static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map)
   2103{
   2104	struct dm_integrity_c *ic = dio->ic;
   2105	struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
   2106	unsigned journal_section, journal_entry;
   2107	unsigned journal_read_pos;
   2108	struct completion read_comp;
   2109	bool discard_retried = false;
   2110	bool need_sync_io = ic->internal_hash && dio->op == REQ_OP_READ;
   2111	if (unlikely(dio->op == REQ_OP_DISCARD) && ic->mode != 'D')
   2112		need_sync_io = true;
   2113
   2114	if (need_sync_io && from_map) {
   2115		INIT_WORK(&dio->work, integrity_bio_wait);
   2116		queue_work(ic->offload_wq, &dio->work);
   2117		return;
   2118	}
   2119
   2120lock_retry:
   2121	spin_lock_irq(&ic->endio_wait.lock);
   2122retry:
   2123	if (unlikely(dm_integrity_failed(ic))) {
   2124		spin_unlock_irq(&ic->endio_wait.lock);
   2125		do_endio(ic, bio);
   2126		return;
   2127	}
   2128	dio->range.n_sectors = bio_sectors(bio);
   2129	journal_read_pos = NOT_FOUND;
   2130	if (ic->mode == 'J' && likely(dio->op != REQ_OP_DISCARD)) {
   2131		if (dio->op == REQ_OP_WRITE) {
   2132			unsigned next_entry, i, pos;
   2133			unsigned ws, we, range_sectors;
   2134
   2135			dio->range.n_sectors = min(dio->range.n_sectors,
   2136						   (sector_t)ic->free_sectors << ic->sb->log2_sectors_per_block);
   2137			if (unlikely(!dio->range.n_sectors)) {
   2138				if (from_map)
   2139					goto offload_to_thread;
   2140				sleep_on_endio_wait(ic);
   2141				goto retry;
   2142			}
   2143			range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block;
   2144			ic->free_sectors -= range_sectors;
   2145			journal_section = ic->free_section;
   2146			journal_entry = ic->free_section_entry;
   2147
   2148			next_entry = ic->free_section_entry + range_sectors;
   2149			ic->free_section_entry = next_entry % ic->journal_section_entries;
   2150			ic->free_section += next_entry / ic->journal_section_entries;
   2151			ic->n_uncommitted_sections += next_entry / ic->journal_section_entries;
   2152			wraparound_section(ic, &ic->free_section);
   2153
   2154			pos = journal_section * ic->journal_section_entries + journal_entry;
   2155			ws = journal_section;
   2156			we = journal_entry;
   2157			i = 0;
   2158			do {
   2159				struct journal_entry *je;
   2160
   2161				add_journal_node(ic, &ic->journal_tree[pos], dio->range.logical_sector + i);
   2162				pos++;
   2163				if (unlikely(pos >= ic->journal_entries))
   2164					pos = 0;
   2165
   2166				je = access_journal_entry(ic, ws, we);
   2167				BUG_ON(!journal_entry_is_unused(je));
   2168				journal_entry_set_inprogress(je);
   2169				we++;
   2170				if (unlikely(we == ic->journal_section_entries)) {
   2171					we = 0;
   2172					ws++;
   2173					wraparound_section(ic, &ws);
   2174				}
   2175			} while ((i += ic->sectors_per_block) < dio->range.n_sectors);
   2176
   2177			spin_unlock_irq(&ic->endio_wait.lock);
   2178			goto journal_read_write;
   2179		} else {
   2180			sector_t next_sector;
   2181			journal_read_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
   2182			if (likely(journal_read_pos == NOT_FOUND)) {
   2183				if (unlikely(dio->range.n_sectors > next_sector - dio->range.logical_sector))
   2184					dio->range.n_sectors = next_sector - dio->range.logical_sector;
   2185			} else {
   2186				unsigned i;
   2187				unsigned jp = journal_read_pos + 1;
   2188				for (i = ic->sectors_per_block; i < dio->range.n_sectors; i += ic->sectors_per_block, jp++) {
   2189					if (!test_journal_node(ic, jp, dio->range.logical_sector + i))
   2190						break;
   2191				}
   2192				dio->range.n_sectors = i;
   2193			}
   2194		}
   2195	}
   2196	if (unlikely(!add_new_range(ic, &dio->range, true))) {
   2197		/*
   2198		 * We must not sleep in the request routine because it could
   2199		 * stall bios on current->bio_list.
   2200		 * So, we offload the bio to a workqueue if we have to sleep.
   2201		 */
   2202		if (from_map) {
   2203offload_to_thread:
   2204			spin_unlock_irq(&ic->endio_wait.lock);
   2205			INIT_WORK(&dio->work, integrity_bio_wait);
   2206			queue_work(ic->wait_wq, &dio->work);
   2207			return;
   2208		}
   2209		if (journal_read_pos != NOT_FOUND)
   2210			dio->range.n_sectors = ic->sectors_per_block;
   2211		wait_and_add_new_range(ic, &dio->range);
   2212		/*
   2213		 * wait_and_add_new_range drops the spinlock, so the journal
   2214		 * may have been changed arbitrarily. We need to recheck.
   2215		 * To simplify the code, we restrict I/O size to just one block.
   2216		 */
   2217		if (journal_read_pos != NOT_FOUND) {
   2218			sector_t next_sector;
   2219			unsigned new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
   2220			if (unlikely(new_pos != journal_read_pos)) {
   2221				remove_range_unlocked(ic, &dio->range);
   2222				goto retry;
   2223			}
   2224		}
   2225	}
   2226	if (ic->mode == 'J' && likely(dio->op == REQ_OP_DISCARD) && !discard_retried) {
   2227		sector_t next_sector;
   2228		unsigned new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
   2229		if (unlikely(new_pos != NOT_FOUND) ||
   2230		    unlikely(next_sector < dio->range.logical_sector - dio->range.n_sectors)) {
   2231			remove_range_unlocked(ic, &dio->range);
   2232			spin_unlock_irq(&ic->endio_wait.lock);
   2233			queue_work(ic->commit_wq, &ic->commit_work);
   2234			flush_workqueue(ic->commit_wq);
   2235			queue_work(ic->writer_wq, &ic->writer_work);
   2236			flush_workqueue(ic->writer_wq);
   2237			discard_retried = true;
   2238			goto lock_retry;
   2239		}
   2240	}
   2241	spin_unlock_irq(&ic->endio_wait.lock);
   2242
   2243	if (unlikely(journal_read_pos != NOT_FOUND)) {
   2244		journal_section = journal_read_pos / ic->journal_section_entries;
   2245		journal_entry = journal_read_pos % ic->journal_section_entries;
   2246		goto journal_read_write;
   2247	}
   2248
   2249	if (ic->mode == 'B' && (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))) {
   2250		if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
   2251				     dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
   2252			struct bitmap_block_status *bbs;
   2253
   2254			bbs = sector_to_bitmap_block(ic, dio->range.logical_sector);
   2255			spin_lock(&bbs->bio_queue_lock);
   2256			bio_list_add(&bbs->bio_queue, bio);
   2257			spin_unlock(&bbs->bio_queue_lock);
   2258			queue_work(ic->writer_wq, &bbs->work);
   2259			return;
   2260		}
   2261	}
   2262
   2263	dio->in_flight = (atomic_t)ATOMIC_INIT(2);
   2264
   2265	if (need_sync_io) {
   2266		init_completion(&read_comp);
   2267		dio->completion = &read_comp;
   2268	} else
   2269		dio->completion = NULL;
   2270
   2271	dm_bio_record(&dio->bio_details, bio);
   2272	bio_set_dev(bio, ic->dev->bdev);
   2273	bio->bi_integrity = NULL;
   2274	bio->bi_opf &= ~REQ_INTEGRITY;
   2275	bio->bi_end_io = integrity_end_io;
   2276	bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT;
   2277
   2278	if (unlikely(dio->op == REQ_OP_DISCARD) && likely(ic->mode != 'D')) {
   2279		integrity_metadata(&dio->work);
   2280		dm_integrity_flush_buffers(ic, false);
   2281
   2282		dio->in_flight = (atomic_t)ATOMIC_INIT(1);
   2283		dio->completion = NULL;
   2284
   2285		submit_bio_noacct(bio);
   2286
   2287		return;
   2288	}
   2289
   2290	submit_bio_noacct(bio);
   2291
   2292	if (need_sync_io) {
   2293		wait_for_completion_io(&read_comp);
   2294		if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
   2295		    dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector))
   2296			goto skip_check;
   2297		if (ic->mode == 'B') {
   2298			if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector,
   2299					     dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR))
   2300				goto skip_check;
   2301		}
   2302
   2303		if (likely(!bio->bi_status))
   2304			integrity_metadata(&dio->work);
   2305		else
   2306skip_check:
   2307			dec_in_flight(dio);
   2308
   2309	} else {
   2310		INIT_WORK(&dio->work, integrity_metadata);
   2311		queue_work(ic->metadata_wq, &dio->work);
   2312	}
   2313
   2314	return;
   2315
   2316journal_read_write:
   2317	if (unlikely(__journal_read_write(dio, bio, journal_section, journal_entry)))
   2318		goto lock_retry;
   2319
   2320	do_endio_flush(ic, dio);
   2321}
   2322
   2323
   2324static void integrity_bio_wait(struct work_struct *w)
   2325{
   2326	struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
   2327
   2328	dm_integrity_map_continue(dio, false);
   2329}
   2330
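/*
 * If the current journal section is only partially filled, pad it: the
 * unused entries are subtracted from free_sectors and the section counts
 * as uncommitted.  The WARN_ON checks the accounting invariant
 * journal_sections * entries == (uncommitted + committed) * entries +
 * free_sectors.
 */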
   2331static void pad_uncommitted(struct dm_integrity_c *ic)
   2332{
   2333	if (ic->free_section_entry) {
   2334		ic->free_sectors -= ic->journal_section_entries - ic->free_section_entry;
   2335		ic->free_section_entry = 0;
   2336		ic->free_section++;
   2337		wraparound_section(ic, &ic->free_section);
   2338		ic->n_uncommitted_sections++;
   2339	}
   2340	if (WARN_ON(ic->journal_sections * ic->journal_section_entries !=
   2341		    (ic->n_uncommitted_sections + ic->n_committed_sections) *
   2342		    ic->journal_section_entries + ic->free_sectors)) {
   2343		DMCRIT("journal_sections %u, journal_section_entries %u, "
   2344		       "n_uncommitted_sections %u, n_committed_sections %u, "
   2345		       "journal_section_entries %u, free_sectors %u",
   2346		       ic->journal_sections, ic->journal_section_entries,
   2347		       ic->n_uncommitted_sections, ic->n_committed_sections,
   2348		       ic->journal_section_entries, ic->free_sectors);
   2349	}
   2350}
   2351
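/*
 * Commit work: grab the pending flush bios and, in journal mode, pad the
 * current section, wait for in-progress copies into the journal to finish,
 * stamp every journal sector with the current commit id, write the sections
 * out and account them as committed.  The writer is kicked when free
 * journal space runs low, and the flush bios are completed at the end.
 */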
   2352static void integrity_commit(struct work_struct *w)
   2353{
   2354	struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, commit_work);
   2355	unsigned commit_start, commit_sections;
   2356	unsigned i, j, n;
   2357	struct bio *flushes;
   2358
   2359	del_timer(&ic->autocommit_timer);
   2360
   2361	spin_lock_irq(&ic->endio_wait.lock);
   2362	flushes = bio_list_get(&ic->flush_bio_list);
   2363	if (unlikely(ic->mode != 'J')) {
   2364		spin_unlock_irq(&ic->endio_wait.lock);
   2365		dm_integrity_flush_buffers(ic, true);
   2366		goto release_flush_bios;
   2367	}
   2368
   2369	pad_uncommitted(ic);
   2370	commit_start = ic->uncommitted_section;
   2371	commit_sections = ic->n_uncommitted_sections;
   2372	spin_unlock_irq(&ic->endio_wait.lock);
   2373
   2374	if (!commit_sections)
   2375		goto release_flush_bios;
   2376
   2377	i = commit_start;
   2378	for (n = 0; n < commit_sections; n++) {
   2379		for (j = 0; j < ic->journal_section_entries; j++) {
   2380			struct journal_entry *je;
   2381			je = access_journal_entry(ic, i, j);
   2382			io_wait_event(ic->copy_to_journal_wait, !journal_entry_is_inprogress(je));
   2383		}
   2384		for (j = 0; j < ic->journal_section_sectors; j++) {
   2385			struct journal_sector *js;
   2386			js = access_journal(ic, i, j);
   2387			js->commit_id = dm_integrity_commit_id(ic, i, j, ic->commit_seq);
   2388		}
   2389		i++;
   2390		if (unlikely(i >= ic->journal_sections))
   2391			ic->commit_seq = next_commit_seq(ic->commit_seq);
   2392		wraparound_section(ic, &i);
   2393	}
   2394	smp_rmb();
   2395
   2396	write_journal(ic, commit_start, commit_sections);
   2397
   2398	spin_lock_irq(&ic->endio_wait.lock);
   2399	ic->uncommitted_section += commit_sections;
   2400	wraparound_section(ic, &ic->uncommitted_section);
   2401	ic->n_uncommitted_sections -= commit_sections;
   2402	ic->n_committed_sections += commit_sections;
   2403	spin_unlock_irq(&ic->endio_wait.lock);
   2404
   2405	if (READ_ONCE(ic->free_sectors) <= ic->free_sectors_threshold)
   2406		queue_work(ic->writer_wq, &ic->writer_work);
   2407
   2408release_flush_bios:
   2409	while (flushes) {
   2410		struct bio *next = flushes->bi_next;
   2411		flushes->bi_next = NULL;
   2412		do_endio(ic, flushes);
   2413		flushes = next;
   2414	}
   2415}
   2416
   2417static void complete_copy_from_journal(unsigned long error, void *context)
   2418{
   2419	struct journal_io *io = context;
   2420	struct journal_completion *comp = io->comp;
   2421	struct dm_integrity_c *ic = comp->ic;
   2422	remove_range(ic, &io->range);
   2423	mempool_free(io, &ic->journal_io_mempool);
   2424	if (unlikely(error != 0))
   2425		dm_integrity_io_error(ic, "copying from journal", -EIO);
   2426	complete_journal_op(comp);
   2427}
   2428
   2429static void restore_last_bytes(struct dm_integrity_c *ic, struct journal_sector *js,
   2430			       struct journal_entry *je)
   2431{
   2432	unsigned s = 0;
   2433	do {
   2434		js->commit_id = je->last_bytes[s];
   2435		js++;
   2436	} while (++s < ic->sectors_per_block);
   2437}
   2438
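/*
 * Write committed journal sections back to the data device.  Runs of
 * consecutive entries mapping to adjacent data blocks are merged into one
 * range, entries superseded by a newer committed entry are skipped, tags
 * are re-verified when replaying, written to the metadata area, and the
 * data is copied out with copy_from_journal.  Used by the background
 * writer and, with from_replay set, by journal replay after a crash.
 */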
   2439static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
   2440			     unsigned write_sections, bool from_replay)
   2441{
   2442	unsigned i, j, n;
   2443	struct journal_completion comp;
   2444	struct blk_plug plug;
   2445
   2446	blk_start_plug(&plug);
   2447
   2448	comp.ic = ic;
   2449	comp.in_flight = (atomic_t)ATOMIC_INIT(1);
   2450	init_completion(&comp.comp);
   2451
   2452	i = write_start;
   2453	for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) {
   2454#ifndef INTERNAL_VERIFY
   2455		if (unlikely(from_replay))
   2456#endif
   2457			rw_section_mac(ic, i, false);
   2458		for (j = 0; j < ic->journal_section_entries; j++) {
   2459			struct journal_entry *je = access_journal_entry(ic, i, j);
   2460			sector_t sec, area, offset;
   2461			unsigned k, l, next_loop;
   2462			sector_t metadata_block;
   2463			unsigned metadata_offset;
   2464			struct journal_io *io;
   2465
   2466			if (journal_entry_is_unused(je))
   2467				continue;
   2468			BUG_ON(unlikely(journal_entry_is_inprogress(je)) && !from_replay);
   2469			sec = journal_entry_get_sector(je);
   2470			if (unlikely(from_replay)) {
   2471				if (unlikely(sec & (unsigned)(ic->sectors_per_block - 1))) {
   2472					dm_integrity_io_error(ic, "invalid sector in journal", -EIO);
   2473					sec &= ~(sector_t)(ic->sectors_per_block - 1);
   2474				}
   2475				if (unlikely(sec >= ic->provided_data_sectors)) {
   2476					journal_entry_set_unused(je);
   2477					continue;
   2478				}
   2479			}
   2480			get_area_and_offset(ic, sec, &area, &offset);
   2481			restore_last_bytes(ic, access_journal_data(ic, i, j), je);
   2482			for (k = j + 1; k < ic->journal_section_entries; k++) {
   2483				struct journal_entry *je2 = access_journal_entry(ic, i, k);
   2484				sector_t sec2, area2, offset2;
   2485				if (journal_entry_is_unused(je2))
   2486					break;
   2487				BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay);
   2488				sec2 = journal_entry_get_sector(je2);
   2489				if (unlikely(sec2 >= ic->provided_data_sectors))
   2490					break;
   2491				get_area_and_offset(ic, sec2, &area2, &offset2);
   2492				if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block))
   2493					break;
   2494				restore_last_bytes(ic, access_journal_data(ic, i, k), je2);
   2495			}
   2496			next_loop = k - 1;
   2497
   2498			io = mempool_alloc(&ic->journal_io_mempool, GFP_NOIO);
   2499			io->comp = &comp;
   2500			io->range.logical_sector = sec;
   2501			io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
   2502
   2503			spin_lock_irq(&ic->endio_wait.lock);
   2504			add_new_range_and_wait(ic, &io->range);
   2505
   2506			if (likely(!from_replay)) {
   2507				struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries];
   2508
   2509				/* don't write if there is a newer committed sector */
   2510				while (j < k && find_newer_committed_node(ic, &section_node[j])) {
   2511					struct journal_entry *je2 = access_journal_entry(ic, i, j);
   2512
   2513					journal_entry_set_unused(je2);
   2514					remove_journal_node(ic, &section_node[j]);
   2515					j++;
   2516					sec += ic->sectors_per_block;
   2517					offset += ic->sectors_per_block;
   2518				}
   2519				while (j < k && find_newer_committed_node(ic, &section_node[k - 1])) {
   2520					struct journal_entry *je2 = access_journal_entry(ic, i, k - 1);
   2521
   2522					journal_entry_set_unused(je2);
   2523					remove_journal_node(ic, &section_node[k - 1]);
   2524					k--;
   2525				}
   2526				if (j == k) {
   2527					remove_range_unlocked(ic, &io->range);
   2528					spin_unlock_irq(&ic->endio_wait.lock);
   2529					mempool_free(io, &ic->journal_io_mempool);
   2530					goto skip_io;
   2531				}
   2532				for (l = j; l < k; l++) {
   2533					remove_journal_node(ic, &section_node[l]);
   2534				}
   2535			}
   2536			spin_unlock_irq(&ic->endio_wait.lock);
   2537
   2538			metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
   2539			for (l = j; l < k; l++) {
   2540				int r;
   2541				struct journal_entry *je2 = access_journal_entry(ic, i, l);
   2542
   2543				if (
   2544#ifndef INTERNAL_VERIFY
   2545				    unlikely(from_replay) &&
   2546#endif
   2547				    ic->internal_hash) {
   2548					char test_tag[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
   2549
   2550					integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
   2551								  (char *)access_journal_data(ic, i, l), test_tag);
   2552					if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) {
   2553						dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ);
   2554						dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0);
   2555					}
   2556				}
   2557
   2558				journal_entry_set_unused(je2);
   2559				r = dm_integrity_rw_tag(ic, journal_entry_tag(ic, je2), &metadata_block, &metadata_offset,
   2560							ic->tag_size, TAG_WRITE);
   2561				if (unlikely(r)) {
   2562					dm_integrity_io_error(ic, "reading tags", r);
   2563				}
   2564			}
   2565
   2566			atomic_inc(&comp.in_flight);
   2567			copy_from_journal(ic, i, j << ic->sb->log2_sectors_per_block,
   2568					  (k - j) << ic->sb->log2_sectors_per_block,
   2569					  get_data_sector(ic, area, offset),
   2570					  complete_copy_from_journal, io);
   2571skip_io:
   2572			j = next_loop;
   2573		}
   2574	}
   2575
   2576	dm_bufio_write_dirty_buffers_async(ic->bufio);
   2577
   2578	blk_finish_plug(&plug);
   2579
   2580	complete_journal_op(&comp);
   2581	wait_for_completion_io(&comp.comp);
   2582
   2583	dm_integrity_flush_buffers(ic, true);
   2584}
   2585
   2586static void integrity_writer(struct work_struct *w)
   2587{
   2588	struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, writer_work);
   2589	unsigned write_start, write_sections;
   2590
   2591	unsigned prev_free_sectors;
   2592
   2593	/* the following check is not strictly needed, but it exercises the journal replay code */
   2594	if (unlikely(dm_post_suspending(ic->ti)) && !ic->meta_dev)
   2595		return;
   2596
   2597	spin_lock_irq(&ic->endio_wait.lock);
   2598	write_start = ic->committed_section;
   2599	write_sections = ic->n_committed_sections;
   2600	spin_unlock_irq(&ic->endio_wait.lock);
   2601
   2602	if (!write_sections)
   2603		return;
   2604
   2605	do_journal_write(ic, write_start, write_sections, false);
   2606
   2607	spin_lock_irq(&ic->endio_wait.lock);
   2608
   2609	ic->committed_section += write_sections;
   2610	wraparound_section(ic, &ic->committed_section);
   2611	ic->n_committed_sections -= write_sections;
   2612
   2613	prev_free_sectors = ic->free_sectors;
   2614	ic->free_sectors += write_sections * ic->journal_section_entries;
   2615	if (unlikely(!prev_free_sectors))
   2616		wake_up_locked(&ic->endio_wait);
   2617
   2618	spin_unlock_irq(&ic->endio_wait.lock);
   2619}
   2620
   2621static void recalc_write_super(struct dm_integrity_c *ic)
   2622{
   2623	int r;
   2624
   2625	dm_integrity_flush_buffers(ic, false);
   2626	if (dm_integrity_failed(ic))
   2627		return;
   2628
   2629	r = sync_rw_sb(ic, REQ_OP_WRITE, 0);
   2630	if (unlikely(r))
   2631		dm_integrity_io_error(ic, "writing superblock", r);
   2632}
   2633
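/*
 * Background recalculation of integrity tags.  Starting at
 * sb->recalc_sector, each pass locks a chunk of up to RECALC_SECTORS as an
 * in-progress range, reads it into recalc_buffer, checksums it into
 * recalc_tags and writes the tags to the metadata area.  The superblock is
 * written back every RECALC_WRITE_SUPER chunks; in bitmap mode the recalc
 * bitmap bits covering the chunk are cleared afterwards.
 */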
   2634static void integrity_recalc(struct work_struct *w)
   2635{
   2636	struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work);
   2637	struct dm_integrity_range range;
   2638	struct dm_io_request io_req;
   2639	struct dm_io_region io_loc;
   2640	sector_t area, offset;
   2641	sector_t metadata_block;
   2642	unsigned metadata_offset;
   2643	sector_t logical_sector, n_sectors;
   2644	__u8 *t;
   2645	unsigned i;
   2646	int r;
   2647	unsigned super_counter = 0;
   2648
   2649	DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector));
   2650
   2651	spin_lock_irq(&ic->endio_wait.lock);
   2652
   2653next_chunk:
   2654
   2655	if (unlikely(dm_post_suspending(ic->ti)))
   2656		goto unlock_ret;
   2657
   2658	range.logical_sector = le64_to_cpu(ic->sb->recalc_sector);
   2659	if (unlikely(range.logical_sector >= ic->provided_data_sectors)) {
   2660		if (ic->mode == 'B') {
   2661			block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
   2662			DEBUG_print("queue_delayed_work: bitmap_flush_work\n");
   2663			queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
   2664		}
   2665		goto unlock_ret;
   2666	}
   2667
   2668	get_area_and_offset(ic, range.logical_sector, &area, &offset);
   2669	range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector);
   2670	if (!ic->meta_dev)
   2671		range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned)offset);
   2672
   2673	add_new_range_and_wait(ic, &range);
   2674	spin_unlock_irq(&ic->endio_wait.lock);
   2675	logical_sector = range.logical_sector;
   2676	n_sectors = range.n_sectors;
   2677
   2678	if (ic->mode == 'B') {
   2679		if (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) {
   2680			goto advance_and_next;
   2681		}
   2682		while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector,
   2683				       ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
   2684			logical_sector += ic->sectors_per_block;
   2685			n_sectors -= ic->sectors_per_block;
   2686			cond_resched();
   2687		}
   2688		while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + n_sectors - ic->sectors_per_block,
   2689				       ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
   2690			n_sectors -= ic->sectors_per_block;
   2691			cond_resched();
   2692		}
   2693		get_area_and_offset(ic, logical_sector, &area, &offset);
   2694	}
   2695
   2696	DEBUG_print("recalculating: %llx, %llx\n", logical_sector, n_sectors);
   2697
   2698	if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
   2699		recalc_write_super(ic);
   2700		if (ic->mode == 'B') {
   2701			queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval);
   2702		}
   2703		super_counter = 0;
   2704	}
   2705
   2706	if (unlikely(dm_integrity_failed(ic)))
   2707		goto err;
   2708
   2709	io_req.bi_op = REQ_OP_READ;
   2710	io_req.bi_op_flags = 0;
   2711	io_req.mem.type = DM_IO_VMA;
   2712	io_req.mem.ptr.addr = ic->recalc_buffer;
   2713	io_req.notify.fn = NULL;
   2714	io_req.client = ic->io;
   2715	io_loc.bdev = ic->dev->bdev;
   2716	io_loc.sector = get_data_sector(ic, area, offset);
   2717	io_loc.count = n_sectors;
   2718
   2719	r = dm_io(&io_req, 1, &io_loc, NULL);
   2720	if (unlikely(r)) {
   2721		dm_integrity_io_error(ic, "reading data", r);
   2722		goto err;
   2723	}
   2724
   2725	t = ic->recalc_tags;
   2726	for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
   2727		integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
   2728		t += ic->tag_size;
   2729	}
   2730
   2731	metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
   2732
   2733	r = dm_integrity_rw_tag(ic, ic->recalc_tags, &metadata_block, &metadata_offset, t - ic->recalc_tags, TAG_WRITE);
   2734	if (unlikely(r)) {
   2735		dm_integrity_io_error(ic, "writing tags", r);
   2736		goto err;
   2737	}
   2738
   2739	if (ic->mode == 'B') {
   2740		sector_t start, end;
   2741		start = (range.logical_sector >>
   2742			 (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)) <<
   2743			(ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
   2744		end = ((range.logical_sector + range.n_sectors) >>
   2745		       (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)) <<
   2746			(ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
   2747		block_bitmap_op(ic, ic->recalc_bitmap, start, end - start, BITMAP_OP_CLEAR);
   2748	}
   2749
   2750advance_and_next:
   2751	cond_resched();
   2752
   2753	spin_lock_irq(&ic->endio_wait.lock);
   2754	remove_range_unlocked(ic, &range);
   2755	ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors);
   2756	goto next_chunk;
   2757
   2758err:
   2759	remove_range(ic, &range);
   2760	return;
   2761
   2762unlock_ret:
   2763	spin_unlock_irq(&ic->endio_wait.lock);
   2764
   2765	recalc_write_super(ic);
   2766}
   2767
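/*
 * Worker for one bitmap block.  Bios whose region is already marked in
 * may_write_bitmap proceed immediately; for the others the bits are first
 * set in the in-memory journal bitmap, the bitmap block is written out with
 * REQ_FUA, and only then is may_write_bitmap updated and the bios resumed.
 */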
   2768static void bitmap_block_work(struct work_struct *w)
   2769{
   2770	struct bitmap_block_status *bbs = container_of(w, struct bitmap_block_status, work);
   2771	struct dm_integrity_c *ic = bbs->ic;
   2772	struct bio *bio;
   2773	struct bio_list bio_queue;
   2774	struct bio_list waiting;
   2775
   2776	bio_list_init(&waiting);
   2777
   2778	spin_lock(&bbs->bio_queue_lock);
   2779	bio_queue = bbs->bio_queue;
   2780	bio_list_init(&bbs->bio_queue);
   2781	spin_unlock(&bbs->bio_queue_lock);
   2782
   2783	while ((bio = bio_list_pop(&bio_queue))) {
   2784		struct dm_integrity_io *dio;
   2785
   2786		dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
   2787
   2788		if (block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
   2789				    dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
   2790			remove_range(ic, &dio->range);
   2791			INIT_WORK(&dio->work, integrity_bio_wait);
   2792			queue_work(ic->offload_wq, &dio->work);
   2793		} else {
   2794			block_bitmap_op(ic, ic->journal, dio->range.logical_sector,
   2795					dio->range.n_sectors, BITMAP_OP_SET);
   2796			bio_list_add(&waiting, bio);
   2797		}
   2798	}
   2799
   2800	if (bio_list_empty(&waiting))
   2801		return;
   2802
   2803	rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC,
   2804			   bbs->idx * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT),
   2805			   BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL);
   2806
   2807	while ((bio = bio_list_pop(&waiting))) {
   2808		struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
   2809
   2810		block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
   2811				dio->range.n_sectors, BITMAP_OP_SET);
   2812
   2813		remove_range(ic, &dio->range);
   2814		INIT_WORK(&dio->work, integrity_bio_wait);
   2815		queue_work(ic->offload_wq, &dio->work);
   2816	}
   2817
   2818	queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval);
   2819}
   2820
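/*
 * Flush the bitmap state: lock the whole device range to wait out in-flight
 * I/O, flush buffered metadata, clear the journal and may_write bitmaps
 * below the recalculation position, write the bitmap blocks out and finally
 * complete any bios deferred by synchronous mode.
 */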
   2821static void bitmap_flush_work(struct work_struct *work)
   2822{
   2823	struct dm_integrity_c *ic = container_of(work, struct dm_integrity_c, bitmap_flush_work.work);
   2824	struct dm_integrity_range range;
   2825	unsigned long limit;
   2826	struct bio *bio;
   2827
   2828	dm_integrity_flush_buffers(ic, false);
   2829
   2830	range.logical_sector = 0;
   2831	range.n_sectors = ic->provided_data_sectors;
   2832
   2833	spin_lock_irq(&ic->endio_wait.lock);
   2834	add_new_range_and_wait(ic, &range);
   2835	spin_unlock_irq(&ic->endio_wait.lock);
   2836
   2837	dm_integrity_flush_buffers(ic, true);
   2838
   2839	limit = ic->provided_data_sectors;
   2840	if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
   2841		limit = le64_to_cpu(ic->sb->recalc_sector)
   2842			>> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)
   2843			<< (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
   2844	}
   2845	/*DEBUG_print("zeroing journal\n");*/
   2846	block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR);
   2847	block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR);
   2848
   2849	rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
   2850			   ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
   2851
   2852	spin_lock_irq(&ic->endio_wait.lock);
   2853	remove_range_unlocked(ic, &range);
   2854	while (unlikely((bio = bio_list_pop(&ic->synchronous_bios)) != NULL)) {
   2855		bio_endio(bio);
   2856		spin_unlock_irq(&ic->endio_wait.lock);
   2857		spin_lock_irq(&ic->endio_wait.lock);
   2858	}
   2859	spin_unlock_irq(&ic->endio_wait.lock);
   2860}
   2861
   2862
   2863static void init_journal(struct dm_integrity_c *ic, unsigned start_section,
   2864			 unsigned n_sections, unsigned char commit_seq)
   2865{
   2866	unsigned i, j, n;
   2867
   2868	if (!n_sections)
   2869		return;
   2870
   2871	for (n = 0; n < n_sections; n++) {
   2872		i = start_section + n;
   2873		wraparound_section(ic, &i);
   2874		for (j = 0; j < ic->journal_section_sectors; j++) {
   2875			struct journal_sector *js = access_journal(ic, i, j);
   2876			BUILD_BUG_ON(sizeof(js->sectors) != JOURNAL_SECTOR_DATA);
   2877			memset(&js->sectors, 0, sizeof(js->sectors));
   2878			js->commit_id = dm_integrity_commit_id(ic, i, j, commit_seq);
   2879		}
   2880		for (j = 0; j < ic->journal_section_entries; j++) {
   2881			struct journal_entry *je = access_journal_entry(ic, i, j);
   2882			journal_entry_set_unused(je);
   2883		}
   2884	}
   2885
   2886	write_journal(ic, start_section, n_sections);
   2887}
   2888
   2889static int find_commit_seq(struct dm_integrity_c *ic, unsigned i, unsigned j, commit_id_t id)
   2890{
   2891	unsigned char k;
   2892	for (k = 0; k < N_COMMIT_IDS; k++) {
   2893		if (dm_integrity_commit_id(ic, i, j, k) == id)
   2894			return k;
   2895	}
   2896	dm_integrity_io_error(ic, "journal commit id", -EIO);
   2897	return -EIO;
   2898}
   2899
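/*
 * Crash recovery.  The journal is read (and decrypted when encrypted), the
 * commit ids stored in every journal sector are used to find the most
 * recently committed sequence, and the sections written with it are
 * replayed with do_journal_write().  A commit-id mismatch marks where a
 * crash interrupted a journal write, so nothing past it is replayed; an
 * inconsistent or unusable journal is reinitialized instead.  Finally the
 * in-memory journal state (section pointers, free entries, journal tree)
 * is reset.
 */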
   2900static void replay_journal(struct dm_integrity_c *ic)
   2901{
   2902	unsigned i, j;
   2903	bool used_commit_ids[N_COMMIT_IDS];
   2904	unsigned max_commit_id_sections[N_COMMIT_IDS];
   2905	unsigned write_start, write_sections;
   2906	unsigned continue_section;
   2907	bool journal_empty;
   2908	unsigned char unused, last_used, want_commit_seq;
   2909
   2910	if (ic->mode == 'R')
   2911		return;
   2912
   2913	if (ic->journal_uptodate)
   2914		return;
   2915
   2916	last_used = 0;
   2917	write_start = 0;
   2918
   2919	if (!ic->just_formatted) {
   2920		DEBUG_print("reading journal\n");
   2921		rw_journal(ic, REQ_OP_READ, 0, 0, ic->journal_sections, NULL);
   2922		if (ic->journal_io)
   2923			DEBUG_bytes(lowmem_page_address(ic->journal_io[0].page), 64, "read journal");
   2924		if (ic->journal_io) {
   2925			struct journal_completion crypt_comp;
   2926			crypt_comp.ic = ic;
   2927			init_completion(&crypt_comp.comp);
   2928			crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0);
   2929			encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp);
   2930			wait_for_completion(&crypt_comp.comp);
   2931		}
   2932		DEBUG_bytes(lowmem_page_address(ic->journal[0].page), 64, "decrypted journal");
   2933	}
   2934
   2935	if (dm_integrity_failed(ic))
   2936		goto clear_journal;
   2937
   2938	journal_empty = true;
   2939	memset(used_commit_ids, 0, sizeof used_commit_ids);
   2940	memset(max_commit_id_sections, 0, sizeof max_commit_id_sections);
   2941	for (i = 0; i < ic->journal_sections; i++) {
   2942		for (j = 0; j < ic->journal_section_sectors; j++) {
   2943			int k;
   2944			struct journal_sector *js = access_journal(ic, i, j);
   2945			k = find_commit_seq(ic, i, j, js->commit_id);
   2946			if (k < 0)
   2947				goto clear_journal;
   2948			used_commit_ids[k] = true;
   2949			max_commit_id_sections[k] = i;
   2950		}
   2951		if (journal_empty) {
   2952			for (j = 0; j < ic->journal_section_entries; j++) {
   2953				struct journal_entry *je = access_journal_entry(ic, i, j);
   2954				if (!journal_entry_is_unused(je)) {
   2955					journal_empty = false;
   2956					break;
   2957				}
   2958			}
   2959		}
   2960	}
   2961
   2962	if (!used_commit_ids[N_COMMIT_IDS - 1]) {
   2963		unused = N_COMMIT_IDS - 1;
   2964		while (unused && !used_commit_ids[unused - 1])
   2965			unused--;
   2966	} else {
   2967		for (unused = 0; unused < N_COMMIT_IDS; unused++)
   2968			if (!used_commit_ids[unused])
   2969				break;
   2970		if (unused == N_COMMIT_IDS) {
   2971			dm_integrity_io_error(ic, "journal commit ids", -EIO);
   2972			goto clear_journal;
   2973		}
   2974	}
   2975	DEBUG_print("first unused commit seq %d [%d,%d,%d,%d]\n",
   2976		    unused, used_commit_ids[0], used_commit_ids[1],
   2977		    used_commit_ids[2], used_commit_ids[3]);
   2978
   2979	last_used = prev_commit_seq(unused);
   2980	want_commit_seq = prev_commit_seq(last_used);
   2981
   2982	if (!used_commit_ids[want_commit_seq] && used_commit_ids[prev_commit_seq(want_commit_seq)])
   2983		journal_empty = true;
   2984
   2985	write_start = max_commit_id_sections[last_used] + 1;
   2986	if (unlikely(write_start >= ic->journal_sections))
   2987		want_commit_seq = next_commit_seq(want_commit_seq);
   2988	wraparound_section(ic, &write_start);
   2989
   2990	i = write_start;
   2991	for (write_sections = 0; write_sections < ic->journal_sections; write_sections++) {
   2992		for (j = 0; j < ic->journal_section_sectors; j++) {
   2993			struct journal_sector *js = access_journal(ic, i, j);
   2994
   2995			if (js->commit_id != dm_integrity_commit_id(ic, i, j, want_commit_seq)) {
   2996				/*
   2997				 * This could be caused by a crash during writing.
   2998				 * We won't replay the inconsistent part of the
   2999				 * journal.
   3000				 */
   3001				DEBUG_print("commit id mismatch at position (%u, %u): %d != %d\n",
   3002					    i, j, find_commit_seq(ic, i, j, js->commit_id), want_commit_seq);
   3003				goto brk;
   3004			}
   3005		}
   3006		i++;
   3007		if (unlikely(i >= ic->journal_sections))
   3008			want_commit_seq = next_commit_seq(want_commit_seq);
   3009		wraparound_section(ic, &i);
   3010	}
   3011brk:
   3012
   3013	if (!journal_empty) {
   3014		DEBUG_print("replaying %u sections, starting at %u, commit seq %d\n",
   3015			    write_sections, write_start, want_commit_seq);
   3016		do_journal_write(ic, write_start, write_sections, true);
   3017	}
   3018
   3019	if (write_sections == ic->journal_sections && (ic->mode == 'J' || journal_empty)) {
   3020		continue_section = write_start;
   3021		ic->commit_seq = want_commit_seq;
   3022		DEBUG_print("continuing from section %u, commit seq %d\n", write_start, ic->commit_seq);
   3023	} else {
   3024		unsigned s;
   3025		unsigned char erase_seq;
   3026clear_journal:
   3027		DEBUG_print("clearing journal\n");
   3028
   3029		erase_seq = prev_commit_seq(prev_commit_seq(last_used));
   3030		s = write_start;
   3031		init_journal(ic, s, 1, erase_seq);
   3032		s++;
   3033		wraparound_section(ic, &s);
   3034		if (ic->journal_sections >= 2) {
   3035			init_journal(ic, s, ic->journal_sections - 2, erase_seq);
   3036			s += ic->journal_sections - 2;
   3037			wraparound_section(ic, &s);
   3038			init_journal(ic, s, 1, erase_seq);
   3039		}
   3040
   3041		continue_section = 0;
   3042		ic->commit_seq = next_commit_seq(erase_seq);
   3043	}
   3044
   3045	ic->committed_section = continue_section;
   3046	ic->n_committed_sections = 0;
   3047
   3048	ic->uncommitted_section = continue_section;
   3049	ic->n_uncommitted_sections = 0;
   3050
   3051	ic->free_section = continue_section;
   3052	ic->free_section_entry = 0;
   3053	ic->free_sectors = ic->journal_entries;
   3054
   3055	ic->journal_tree_root = RB_ROOT;
   3056	for (i = 0; i < ic->journal_entries; i++)
   3057		init_journal_node(&ic->journal_tree[i]);
   3058}
   3059
   3060static void dm_integrity_enter_synchronous_mode(struct dm_integrity_c *ic)
   3061{
   3062	DEBUG_print("dm_integrity_enter_synchronous_mode\n");
   3063
   3064	if (ic->mode == 'B') {
   3065		ic->bitmap_flush_interval = msecs_to_jiffies(10) + 1;
   3066		ic->synchronous_mode = 1;
   3067
   3068		cancel_delayed_work_sync(&ic->bitmap_flush_work);
   3069		queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
   3070		flush_workqueue(ic->commit_wq);
   3071	}
   3072}
   3073
   3074static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x)
   3075{
   3076	struct dm_integrity_c *ic = container_of(n, struct dm_integrity_c, reboot_notifier);
   3077
   3078	DEBUG_print("dm_integrity_reboot\n");
   3079
   3080	dm_integrity_enter_synchronous_mode(ic);
   3081
   3082	return NOTIFY_DONE;
   3083}
   3084
   3085static void dm_integrity_postsuspend(struct dm_target *ti)
   3086{
   3087	struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
   3088	int r;
   3089
   3090	WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier));
   3091
   3092	del_timer_sync(&ic->autocommit_timer);
   3093
   3094	if (ic->recalc_wq)
   3095		drain_workqueue(ic->recalc_wq);
   3096
   3097	if (ic->mode == 'B')
   3098		cancel_delayed_work_sync(&ic->bitmap_flush_work);
   3099
   3100	queue_work(ic->commit_wq, &ic->commit_work);
   3101	drain_workqueue(ic->commit_wq);
   3102
   3103	if (ic->mode == 'J') {
   3104		if (ic->meta_dev)
   3105			queue_work(ic->writer_wq, &ic->writer_work);
   3106		drain_workqueue(ic->writer_wq);
   3107		dm_integrity_flush_buffers(ic, true);
   3108	}
   3109
   3110	if (ic->mode == 'B') {
   3111		dm_integrity_flush_buffers(ic, true);
   3112#if 1
   3113		/* set to 0 to test bitmap replay code */
   3114		init_journal(ic, 0, ic->journal_sections, 0);
   3115		ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
   3116		r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
   3117		if (unlikely(r))
   3118			dm_integrity_io_error(ic, "writing superblock", r);
   3119#endif
   3120	}
   3121
   3122	BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
   3123
   3124	ic->journal_uptodate = true;
   3125}
   3126
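/*
 * Resume: if the device grew, mark the added area in the bitmap and store
 * the new provided_data_sectors in the superblock.  If the dirty-bitmap
 * flag is set, recover from the on-disk bitmap and, if it shows
 * out-of-sync blocks or the bitmap granularity changed, restart
 * recalculation; otherwise replay the journal.  In bitmap mode the dirty
 * flag is then set again and the bitmaps are seeded for the
 * not-yet-recalculated tail of the device.
 */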
   3127static void dm_integrity_resume(struct dm_target *ti)
   3128{
   3129	struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
   3130	__u64 old_provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors);
   3131	int r;
   3132
   3133	DEBUG_print("resume\n");
   3134
   3135	if (ic->provided_data_sectors != old_provided_data_sectors) {
   3136		if (ic->provided_data_sectors > old_provided_data_sectors &&
   3137		    ic->mode == 'B' &&
   3138		    ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) {
   3139			rw_journal_sectors(ic, REQ_OP_READ, 0, 0,
   3140					   ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
   3141			block_bitmap_op(ic, ic->journal, old_provided_data_sectors,
   3142					ic->provided_data_sectors - old_provided_data_sectors, BITMAP_OP_SET);
   3143			rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
   3144					   ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
   3145		}
   3146
   3147		ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors);
   3148		r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
   3149		if (unlikely(r))
   3150			dm_integrity_io_error(ic, "writing superblock", r);
   3151	}
   3152
   3153	if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) {
   3154		DEBUG_print("resume dirty_bitmap\n");
   3155		rw_journal_sectors(ic, REQ_OP_READ, 0, 0,
   3156				   ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
   3157		if (ic->mode == 'B') {
   3158			if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit &&
   3159			    !ic->reset_recalculate_flag) {
   3160				block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal);
   3161				block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal);
   3162				if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors,
   3163						     BITMAP_OP_TEST_ALL_CLEAR)) {
   3164					ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
   3165					ic->sb->recalc_sector = cpu_to_le64(0);
   3166				}
   3167			} else {
   3168				DEBUG_print("non-matching blocks_per_bitmap_bit: %u, %u\n",
   3169					    ic->sb->log2_blocks_per_bitmap_bit, ic->log2_blocks_per_bitmap_bit);
   3170				ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
   3171				block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET);
   3172				block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET);
   3173				block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_SET);
   3174				rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
   3175						   ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
   3176				ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
   3177				ic->sb->recalc_sector = cpu_to_le64(0);
   3178			}
   3179		} else {
   3180			if (!(ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit &&
   3181			      block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR)) ||
   3182			    ic->reset_recalculate_flag) {
   3183				ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
   3184				ic->sb->recalc_sector = cpu_to_le64(0);
   3185			}
   3186			init_journal(ic, 0, ic->journal_sections, 0);
   3187			replay_journal(ic);
   3188			ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
   3189		}
   3190		r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
   3191		if (unlikely(r))
   3192			dm_integrity_io_error(ic, "writing superblock", r);
   3193	} else {
   3194		replay_journal(ic);
   3195		if (ic->reset_recalculate_flag) {
   3196			ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
   3197			ic->sb->recalc_sector = cpu_to_le64(0);
   3198		}
   3199		if (ic->mode == 'B') {
   3200			ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
   3201			ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
   3202			r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
   3203			if (unlikely(r))
   3204				dm_integrity_io_error(ic, "writing superblock", r);
   3205
   3206			block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
   3207			block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
   3208			block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR);
   3209			if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
   3210			    le64_to_cpu(ic->sb->recalc_sector) < ic->provided_data_sectors) {
   3211				block_bitmap_op(ic, ic->journal, le64_to_cpu(ic->sb->recalc_sector),
   3212						ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET);
   3213				block_bitmap_op(ic, ic->recalc_bitmap, le64_to_cpu(ic->sb->recalc_sector),
   3214						ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET);
   3215				block_bitmap_op(ic, ic->may_write_bitmap, le64_to_cpu(ic->sb->recalc_sector),
   3216						ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET);
   3217			}
   3218			rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
   3219					   ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
   3220		}
   3221	}
   3222
   3223	DEBUG_print("testing recalc: %x\n", ic->sb->flags);
   3224	if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
   3225		__u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector);
   3226		DEBUG_print("recalc pos: %llx / %llx\n", recalc_pos, ic->provided_data_sectors);
   3227		if (recalc_pos < ic->provided_data_sectors) {
   3228			queue_work(ic->recalc_wq, &ic->recalc_work);
   3229		} else if (recalc_pos > ic->provided_data_sectors) {
   3230			ic->sb->recalc_sector = cpu_to_le64(ic->provided_data_sectors);
   3231			recalc_write_super(ic);
   3232		}
   3233	}
   3234
   3235	ic->reboot_notifier.notifier_call = dm_integrity_reboot;
   3236	ic->reboot_notifier.next = NULL;
   3237	ic->reboot_notifier.priority = INT_MAX - 1;	/* be notified after md and before hardware drivers */
   3238	WARN_ON(register_reboot_notifier(&ic->reboot_notifier));
   3239
   3240#if 0
   3241	/* set to 1 to stress test synchronous mode */
   3242	dm_integrity_enter_synchronous_mode(ic);
   3243#endif
   3244}
   3245
   3246static void dm_integrity_status(struct dm_target *ti, status_type_t type,
   3247				unsigned status_flags, char *result, unsigned maxlen)
   3248{
   3249	struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
   3250	unsigned arg_count;
   3251	size_t sz = 0;
   3252
   3253	switch (type) {
   3254	case STATUSTYPE_INFO:
   3255		DMEMIT("%llu %llu",
   3256			(unsigned long long)atomic64_read(&ic->number_of_mismatches),
   3257			ic->provided_data_sectors);
   3258		if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
   3259			DMEMIT(" %llu", le64_to_cpu(ic->sb->recalc_sector));
   3260		else
   3261			DMEMIT(" -");
   3262		break;
   3263
   3264	case STATUSTYPE_TABLE: {
   3265		__u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
   3266		watermark_percentage += ic->journal_entries / 2;
   3267		do_div(watermark_percentage, ic->journal_entries);
   3268		arg_count = 3;
   3269		arg_count += !!ic->meta_dev;
   3270		arg_count += ic->sectors_per_block != 1;
   3271		arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
   3272		arg_count += ic->reset_recalculate_flag;
   3273		arg_count += ic->discard;
    3274		arg_count += ic->mode == 'J';	/* journal_watermark */
    3275		arg_count += ic->mode == 'J';	/* commit_time */
    3276		arg_count += ic->mode == 'B';	/* sectors_per_bit */
    3277		arg_count += ic->mode == 'B';	/* bitmap_flush_interval */
   3278		arg_count += !!ic->internal_hash_alg.alg_string;
   3279		arg_count += !!ic->journal_crypt_alg.alg_string;
   3280		arg_count += !!ic->journal_mac_alg.alg_string;
   3281		arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0;
   3282		arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0;
   3283		arg_count += ic->legacy_recalculate;
   3284		DMEMIT("%s %llu %u %c %u", ic->dev->name, ic->start,
   3285		       ic->tag_size, ic->mode, arg_count);
   3286		if (ic->meta_dev)
   3287			DMEMIT(" meta_device:%s", ic->meta_dev->name);
   3288		if (ic->sectors_per_block != 1)
   3289			DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
   3290		if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
   3291			DMEMIT(" recalculate");
   3292		if (ic->reset_recalculate_flag)
   3293			DMEMIT(" reset_recalculate");
   3294		if (ic->discard)
   3295			DMEMIT(" allow_discards");
   3296		DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
   3297		DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
   3298		DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
   3299		if (ic->mode == 'J') {
   3300			DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage);
   3301			DMEMIT(" commit_time:%u", ic->autocommit_msec);
   3302		}
   3303		if (ic->mode == 'B') {
   3304			DMEMIT(" sectors_per_bit:%llu", (sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
   3305			DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval));
   3306		}
   3307		if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0)
   3308			DMEMIT(" fix_padding");
   3309		if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0)
   3310			DMEMIT(" fix_hmac");
   3311		if (ic->legacy_recalculate)
   3312			DMEMIT(" legacy_recalculate");
   3313
   3314#define EMIT_ALG(a, n)							\
   3315		do {							\
   3316			if (ic->a.alg_string) {				\
   3317				DMEMIT(" %s:%s", n, ic->a.alg_string);	\
   3318				if (ic->a.key_string)			\
   3319					DMEMIT(":%s", ic->a.key_string);\
   3320			}						\
   3321		} while (0)
   3322		EMIT_ALG(internal_hash_alg, "internal_hash");
   3323		EMIT_ALG(journal_crypt_alg, "journal_crypt");
   3324		EMIT_ALG(journal_mac_alg, "journal_mac");
   3325		break;
   3326	}
   3327	case STATUSTYPE_IMA:
   3328		DMEMIT_TARGET_NAME_VERSION(ti->type);
   3329		DMEMIT(",dev_name=%s,start=%llu,tag_size=%u,mode=%c",
   3330			ic->dev->name, ic->start, ic->tag_size, ic->mode);
   3331
   3332		if (ic->meta_dev)
   3333			DMEMIT(",meta_device=%s", ic->meta_dev->name);
   3334		if (ic->sectors_per_block != 1)
   3335			DMEMIT(",block_size=%u", ic->sectors_per_block << SECTOR_SHIFT);
   3336
   3337		DMEMIT(",recalculate=%c", (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) ?
   3338		       'y' : 'n');
   3339		DMEMIT(",allow_discards=%c", ic->discard ? 'y' : 'n');
   3340		DMEMIT(",fix_padding=%c",
   3341		       ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0) ? 'y' : 'n');
   3342		DMEMIT(",fix_hmac=%c",
   3343		       ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0) ? 'y' : 'n');
   3344		DMEMIT(",legacy_recalculate=%c", ic->legacy_recalculate ? 'y' : 'n');
   3345
   3346		DMEMIT(",journal_sectors=%u", ic->initial_sectors - SB_SECTORS);
   3347		DMEMIT(",interleave_sectors=%u", 1U << ic->sb->log2_interleave_sectors);
   3348		DMEMIT(",buffer_sectors=%u", 1U << ic->log2_buffer_sectors);
   3349		DMEMIT(";");
   3350		break;
   3351	}
   3352}
   3353
   3354static int dm_integrity_iterate_devices(struct dm_target *ti,
   3355					iterate_devices_callout_fn fn, void *data)
   3356{
   3357	struct dm_integrity_c *ic = ti->private;
   3358
   3359	if (!ic->meta_dev)
   3360		return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data);
   3361	else
   3362		return fn(ti, ic->dev, 0, ti->len, data);
   3363}
   3364
   3365static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *limits)
   3366{
   3367	struct dm_integrity_c *ic = ti->private;
   3368
   3369	if (ic->sectors_per_block > 1) {
   3370		limits->logical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
   3371		limits->physical_block_size = ic->sectors_per_block << SECTOR_SHIFT;
   3372		blk_limits_io_min(limits, ic->sectors_per_block << SECTOR_SHIFT);
   3373	}
   3374}
   3375
   3376static void calculate_journal_section_size(struct dm_integrity_c *ic)
   3377{
   3378	unsigned sector_space = JOURNAL_SECTOR_DATA;
   3379
   3380	ic->journal_sections = le32_to_cpu(ic->sb->journal_sections);
   3381	ic->journal_entry_size = roundup(offsetof(struct journal_entry, last_bytes[ic->sectors_per_block]) + ic->tag_size,
   3382					 JOURNAL_ENTRY_ROUNDUP);
   3383
   3384	if (ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC))
   3385		sector_space -= JOURNAL_MAC_PER_SECTOR;
   3386	ic->journal_entries_per_sector = sector_space / ic->journal_entry_size;
   3387	ic->journal_section_entries = ic->journal_entries_per_sector * JOURNAL_BLOCK_SECTORS;
   3388	ic->journal_section_sectors = (ic->journal_section_entries << ic->sb->log2_sectors_per_block) + JOURNAL_BLOCK_SECTORS;
   3389	ic->journal_entries = ic->journal_section_entries * ic->journal_sections;
   3390}
   3391
   3392static int calculate_device_limits(struct dm_integrity_c *ic)
   3393{
   3394	__u64 initial_sectors;
   3395
   3396	calculate_journal_section_size(ic);
   3397	initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections;
   3398	if (initial_sectors + METADATA_PADDING_SECTORS >= ic->meta_device_sectors || initial_sectors > UINT_MAX)
   3399		return -EINVAL;
   3400	ic->initial_sectors = initial_sectors;
   3401
   3402	if (!ic->meta_dev) {
   3403		sector_t last_sector, last_area, last_offset;
   3404
   3405		/* we have to maintain excessive padding for compatibility with existing volumes */
   3406		__u64 metadata_run_padding =
   3407			ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING) ?
   3408			(__u64)(METADATA_PADDING_SECTORS << SECTOR_SHIFT) :
   3409			(__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS);
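        		/*
        		 * Illustration (constant values assumed, not shown in this
        		 * excerpt): with METADATA_PADDING_SECTORS == 8, fixed padding
        		 * rounds the metadata run up to 8 << 9 = 4096 bytes, while the
        		 * legacy formula rounds it up to 1 << 9 << 8 = 131072 bytes.
        		 */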
   3410
   3411		ic->metadata_run = round_up((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
   3412					    metadata_run_padding) >> SECTOR_SHIFT;
   3413		if (!(ic->metadata_run & (ic->metadata_run - 1)))
   3414			ic->log2_metadata_run = __ffs(ic->metadata_run);
   3415		else
   3416			ic->log2_metadata_run = -1;
   3417
   3418		get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset);
   3419		last_sector = get_data_sector(ic, last_area, last_offset);
   3420		if (last_sector < ic->start || last_sector >= ic->meta_device_sectors)
   3421			return -EINVAL;
   3422	} else {
   3423		__u64 meta_size = (ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
   3424		meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1))
   3425				>> (ic->log2_buffer_sectors + SECTOR_SHIFT);
   3426		meta_size <<= ic->log2_buffer_sectors;
   3427		if (ic->initial_sectors + meta_size < ic->initial_sectors ||
   3428		    ic->initial_sectors + meta_size > ic->meta_device_sectors)
   3429			return -EINVAL;
   3430		ic->metadata_run = 1;
   3431		ic->log2_metadata_run = 0;
   3432	}
   3433
   3434	return 0;
   3435}
   3436
   3437static void get_provided_data_sectors(struct dm_integrity_c *ic)
   3438{
   3439	if (!ic->meta_dev) {
   3440		int test_bit;
   3441		ic->provided_data_sectors = 0;
   3442		for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) {
   3443			__u64 prev_data_sectors = ic->provided_data_sectors;
   3444
   3445			ic->provided_data_sectors |= (sector_t)1 << test_bit;
   3446			if (calculate_device_limits(ic))
   3447				ic->provided_data_sectors = prev_data_sectors;
   3448		}
   3449	} else {
   3450		ic->provided_data_sectors = ic->data_device_sectors;
   3451		ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1);
   3452	}
   3453}
   3454
   3455static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors)
   3456{
   3457	unsigned journal_sections;
   3458	int test_bit;
   3459
   3460	memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT);
   3461	memcpy(ic->sb->magic, SB_MAGIC, 8);
   3462	ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size);
   3463	ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block);
   3464	if (ic->journal_mac_alg.alg_string)
   3465		ic->sb->flags |= cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC);
   3466
   3467	calculate_journal_section_size(ic);
   3468	journal_sections = journal_sectors / ic->journal_section_sectors;
   3469	if (!journal_sections)
   3470		journal_sections = 1;
   3471
   3472	if (ic->fix_hmac && (ic->internal_hash_alg.alg_string || ic->journal_mac_alg.alg_string)) {
   3473		ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_HMAC);
   3474		get_random_bytes(ic->sb->salt, SALT_SIZE);
   3475	}
   3476
   3477	if (!ic->meta_dev) {
   3478		if (ic->fix_padding)
   3479			ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_PADDING);
   3480		ic->sb->journal_sections = cpu_to_le32(journal_sections);
   3481		if (!interleave_sectors)
   3482			interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
   3483		ic->sb->log2_interleave_sectors = __fls(interleave_sectors);
   3484		ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
   3485		ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
   3486
   3487		get_provided_data_sectors(ic);
   3488		if (!ic->provided_data_sectors)
   3489			return -EINVAL;
   3490	} else {
   3491		ic->sb->log2_interleave_sectors = 0;
   3492
   3493		get_provided_data_sectors(ic);
   3494		if (!ic->provided_data_sectors)
   3495			return -EINVAL;
   3496
   3497try_smaller_buffer:
   3498		ic->sb->journal_sections = cpu_to_le32(0);
   3499		for (test_bit = fls(journal_sections) - 1; test_bit >= 0; test_bit--) {
   3500			__u32 prev_journal_sections = le32_to_cpu(ic->sb->journal_sections);
   3501			__u32 test_journal_sections = prev_journal_sections | (1U << test_bit);
   3502			if (test_journal_sections > journal_sections)
   3503				continue;
   3504			ic->sb->journal_sections = cpu_to_le32(test_journal_sections);
   3505			if (calculate_device_limits(ic))
   3506				ic->sb->journal_sections = cpu_to_le32(prev_journal_sections);
   3507
   3508		}
   3509		if (!le32_to_cpu(ic->sb->journal_sections)) {
   3510			if (ic->log2_buffer_sectors > 3) {
   3511				ic->log2_buffer_sectors--;
   3512				goto try_smaller_buffer;
   3513			}
   3514			return -EINVAL;
   3515		}
   3516	}
   3517
   3518	ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors);
   3519
   3520	sb_set_version(ic);
   3521
   3522	return 0;
   3523}
   3524
   3525static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic)
   3526{
   3527	struct gendisk *disk = dm_disk(dm_table_get_md(ti->table));
   3528	struct blk_integrity bi;
   3529
   3530	memset(&bi, 0, sizeof(bi));
   3531	bi.profile = &dm_integrity_profile;
   3532	bi.tuple_size = ic->tag_size;
   3533	bi.tag_size = bi.tuple_size;
   3534	bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT;
   3535
   3536	blk_integrity_register(disk, &bi);
   3537	blk_queue_max_integrity_segments(disk->queue, UINT_MAX);
   3538}
   3539
   3540static void dm_integrity_free_page_list(struct page_list *pl)
   3541{
   3542	unsigned i;
   3543
   3544	if (!pl)
   3545		return;
   3546	for (i = 0; pl[i].page; i++)
   3547		__free_page(pl[i].page);
   3548	kvfree(pl);
   3549}
   3550
   3551static struct page_list *dm_integrity_alloc_page_list(unsigned n_pages)
   3552{
   3553	struct page_list *pl;
   3554	unsigned i;
   3555
   3556	pl = kvmalloc_array(n_pages + 1, sizeof(struct page_list), GFP_KERNEL | __GFP_ZERO);
   3557	if (!pl)
   3558		return NULL;
   3559
   3560	for (i = 0; i < n_pages; i++) {
   3561		pl[i].page = alloc_page(GFP_KERNEL);
   3562		if (!pl[i].page) {
   3563			dm_integrity_free_page_list(pl);
   3564			return NULL;
   3565		}
   3566		if (i)
   3567			pl[i - 1].next = &pl[i];
   3568	}
   3569	pl[i].page = NULL;
   3570	pl[i].next = NULL;
   3571
   3572	return pl;
   3573}
   3574
   3575static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, struct scatterlist **sl)
   3576{
   3577	unsigned i;
   3578	for (i = 0; i < ic->journal_sections; i++)
   3579		kvfree(sl[i]);
   3580	kvfree(sl);
   3581}
   3582
   3583static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic,
   3584								   struct page_list *pl)
   3585{
   3586	struct scatterlist **sl;
   3587	unsigned i;
   3588
   3589	sl = kvmalloc_array(ic->journal_sections,
   3590			    sizeof(struct scatterlist *),
   3591			    GFP_KERNEL | __GFP_ZERO);
   3592	if (!sl)
   3593		return NULL;
   3594
   3595	for (i = 0; i < ic->journal_sections; i++) {
   3596		struct scatterlist *s;
   3597		unsigned start_index, start_offset;
   3598		unsigned end_index, end_offset;
   3599		unsigned n_pages;
   3600		unsigned idx;
   3601
   3602		page_list_location(ic, i, 0, &start_index, &start_offset);
   3603		page_list_location(ic, i, ic->journal_section_sectors - 1,
   3604				   &end_index, &end_offset);
   3605
   3606		n_pages = (end_index - start_index + 1);
   3607
   3608		s = kvmalloc_array(n_pages, sizeof(struct scatterlist),
   3609				   GFP_KERNEL);
   3610		if (!s) {
   3611			dm_integrity_free_journal_scatterlist(ic, sl);
   3612			return NULL;
   3613		}
   3614
   3615		sg_init_table(s, n_pages);
   3616		for (idx = start_index; idx <= end_index; idx++) {
   3617			char *va = lowmem_page_address(pl[idx].page);
   3618			unsigned start = 0, end = PAGE_SIZE;
   3619			if (idx == start_index)
   3620				start = start_offset;
   3621			if (idx == end_index)
   3622				end = end_offset + (1 << SECTOR_SHIFT);
   3623			sg_set_buf(&s[idx - start_index], va + start, end - start);
   3624		}
   3625
   3626		sl[i] = s;
   3627	}
   3628
   3629	return sl;
   3630}
   3631
   3632static void free_alg(struct alg_spec *a)
   3633{
   3634	kfree_sensitive(a->alg_string);
   3635	kfree_sensitive(a->key);
   3636	memset(a, 0, sizeof *a);
   3637}
   3638
   3639static int get_alg_and_key(const char *arg, struct alg_spec *a, char **error, char *error_inval)
   3640{
   3641	char *k;
   3642
   3643	free_alg(a);
   3644
   3645	a->alg_string = kstrdup(strchr(arg, ':') + 1, GFP_KERNEL);
   3646	if (!a->alg_string)
   3647		goto nomem;
   3648
   3649	k = strchr(a->alg_string, ':');
   3650	if (k) {
   3651		*k = 0;
   3652		a->key_string = k + 1;
   3653		if (strlen(a->key_string) & 1)
   3654			goto inval;
   3655
   3656		a->key_size = strlen(a->key_string) / 2;
   3657		a->key = kmalloc(a->key_size, GFP_KERNEL);
   3658		if (!a->key)
   3659			goto nomem;
   3660		if (hex2bin(a->key, a->key_string, a->key_size))
   3661			goto inval;
   3662	}
   3663
   3664	return 0;
   3665inval:
   3666	*error = error_inval;
   3667	return -EINVAL;
   3668nomem:
   3669	*error = "Out of memory for an argument";
   3670	return -ENOMEM;
   3671}
   3672
   3673static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error,
   3674		   char *error_alg, char *error_key)
   3675{
   3676	int r;
   3677
   3678	if (a->alg_string) {
   3679		*hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
   3680		if (IS_ERR(*hash)) {
   3681			*error = error_alg;
   3682			r = PTR_ERR(*hash);
   3683			*hash = NULL;
   3684			return r;
   3685		}
   3686
   3687		if (a->key) {
   3688			r = crypto_shash_setkey(*hash, a->key, a->key_size);
   3689			if (r) {
   3690				*error = error_key;
   3691				return r;
   3692			}
   3693		} else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) {
   3694			*error = error_key;
   3695			return -ENOKEY;
   3696		}
   3697	}
   3698
   3699	return 0;
   3700}
   3701
   3702static int create_journal(struct dm_integrity_c *ic, char **error)
   3703{
   3704	int r = 0;
   3705	unsigned i;
   3706	__u64 journal_pages, journal_desc_size, journal_tree_size;
   3707	unsigned char *crypt_data = NULL, *crypt_iv = NULL;
   3708	struct skcipher_request *req = NULL;
   3709
   3710	ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL);
   3711	ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL);
   3712	ic->commit_ids[2] = cpu_to_le64(0x3333333333333333ULL);
   3713	ic->commit_ids[3] = cpu_to_le64(0x4444444444444444ULL);
   3714
   3715	journal_pages = roundup((__u64)ic->journal_sections * ic->journal_section_sectors,
   3716				PAGE_SIZE >> SECTOR_SHIFT) >> (PAGE_SHIFT - SECTOR_SHIFT);
   3717	journal_desc_size = journal_pages * sizeof(struct page_list);
   3718	if (journal_pages >= totalram_pages() - totalhigh_pages() || journal_desc_size > ULONG_MAX) {
   3719		*error = "Journal doesn't fit into memory";
   3720		r = -ENOMEM;
   3721		goto bad;
   3722	}
   3723	ic->journal_pages = journal_pages;
   3724
   3725	ic->journal = dm_integrity_alloc_page_list(ic->journal_pages);
   3726	if (!ic->journal) {
   3727		*error = "Could not allocate memory for journal";
   3728		r = -ENOMEM;
   3729		goto bad;
   3730	}
   3731	if (ic->journal_crypt_alg.alg_string) {
   3732		unsigned ivsize, blocksize;
   3733		struct journal_completion comp;
   3734
   3735		comp.ic = ic;
   3736		ic->journal_crypt = crypto_alloc_skcipher(ic->journal_crypt_alg.alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
   3737		if (IS_ERR(ic->journal_crypt)) {
   3738			*error = "Invalid journal cipher";
   3739			r = PTR_ERR(ic->journal_crypt);
   3740			ic->journal_crypt = NULL;
   3741			goto bad;
   3742		}
   3743		ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
   3744		blocksize = crypto_skcipher_blocksize(ic->journal_crypt);
   3745
   3746		if (ic->journal_crypt_alg.key) {
   3747			r = crypto_skcipher_setkey(ic->journal_crypt, ic->journal_crypt_alg.key,
   3748						   ic->journal_crypt_alg.key_size);
   3749			if (r) {
   3750				*error = "Error setting encryption key";
   3751				goto bad;
   3752			}
   3753		}
   3754		DEBUG_print("cipher %s, block size %u iv size %u\n",
   3755			    ic->journal_crypt_alg.alg_string, blocksize, ivsize);
   3756
   3757		ic->journal_io = dm_integrity_alloc_page_list(ic->journal_pages);
   3758		if (!ic->journal_io) {
   3759			*error = "Could not allocate memory for journal io";
   3760			r = -ENOMEM;
   3761			goto bad;
   3762		}
   3763
   3764		if (blocksize == 1) {
   3765			struct scatterlist *sg;
   3766
   3767			req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
   3768			if (!req) {
   3769				*error = "Could not allocate crypt request";
   3770				r = -ENOMEM;
   3771				goto bad;
   3772			}
   3773
   3774			crypt_iv = kzalloc(ivsize, GFP_KERNEL);
   3775			if (!crypt_iv) {
   3776				*error = "Could not allocate iv";
   3777				r = -ENOMEM;
   3778				goto bad;
   3779			}
   3780
   3781			ic->journal_xor = dm_integrity_alloc_page_list(ic->journal_pages);
   3782			if (!ic->journal_xor) {
   3783				*error = "Could not allocate memory for journal xor";
   3784				r = -ENOMEM;
   3785				goto bad;
   3786			}
   3787
   3788			sg = kvmalloc_array(ic->journal_pages + 1,
   3789					    sizeof(struct scatterlist),
   3790					    GFP_KERNEL);
   3791			if (!sg) {
   3792				*error = "Unable to allocate sg list";
   3793				r = -ENOMEM;
   3794				goto bad;
   3795			}
   3796			sg_init_table(sg, ic->journal_pages + 1);
   3797			for (i = 0; i < ic->journal_pages; i++) {
   3798				char *va = lowmem_page_address(ic->journal_xor[i].page);
   3799				clear_page(va);
   3800				sg_set_buf(&sg[i], va, PAGE_SIZE);
   3801			}
   3802			sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids);
   3803
   3804			skcipher_request_set_crypt(req, sg, sg,
   3805						   PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv);
   3806			init_completion(&comp.comp);
   3807			comp.in_flight = (atomic_t)ATOMIC_INIT(1);
   3808			if (do_crypt(true, req, &comp))
   3809				wait_for_completion(&comp.comp);
   3810			kvfree(sg);
   3811			r = dm_integrity_failed(ic);
   3812			if (r) {
   3813				*error = "Unable to encrypt journal";
   3814				goto bad;
   3815			}
   3816			DEBUG_bytes(lowmem_page_address(ic->journal_xor[0].page), 64, "xor data");
   3817
   3818			crypto_free_skcipher(ic->journal_crypt);
   3819			ic->journal_crypt = NULL;
   3820		} else {
   3821			unsigned crypt_len = roundup(ivsize, blocksize);
   3822
   3823			req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
   3824			if (!req) {
   3825				*error = "Could not allocate crypt request";
   3826				r = -ENOMEM;
   3827				goto bad;
   3828			}
   3829
   3830			crypt_iv = kmalloc(ivsize, GFP_KERNEL);
   3831			if (!crypt_iv) {
   3832				*error = "Could not allocate iv";
   3833				r = -ENOMEM;
   3834				goto bad;
   3835			}
   3836
   3837			crypt_data = kmalloc(crypt_len, GFP_KERNEL);
   3838			if (!crypt_data) {
   3839				*error = "Unable to allocate crypt data";
   3840				r = -ENOMEM;
   3841				goto bad;
   3842			}
   3843
   3844			ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal);
   3845			if (!ic->journal_scatterlist) {
   3846				*error = "Unable to allocate sg list";
   3847				r = -ENOMEM;
   3848				goto bad;
   3849			}
   3850			ic->journal_io_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal_io);
   3851			if (!ic->journal_io_scatterlist) {
   3852				*error = "Unable to allocate sg list";
   3853				r = -ENOMEM;
   3854				goto bad;
   3855			}
   3856			ic->sk_requests = kvmalloc_array(ic->journal_sections,
   3857							 sizeof(struct skcipher_request *),
   3858							 GFP_KERNEL | __GFP_ZERO);
   3859			if (!ic->sk_requests) {
   3860				*error = "Unable to allocate sk requests";
   3861				r = -ENOMEM;
   3862				goto bad;
   3863			}
   3864			for (i = 0; i < ic->journal_sections; i++) {
   3865				struct scatterlist sg;
   3866				struct skcipher_request *section_req;
   3867				__le32 section_le = cpu_to_le32(i);
   3868
   3869				memset(crypt_iv, 0x00, ivsize);
   3870				memset(crypt_data, 0x00, crypt_len);
   3871				memcpy(crypt_data, &section_le, min((size_t)crypt_len, sizeof(section_le)));
   3872
   3873				sg_init_one(&sg, crypt_data, crypt_len);
   3874				skcipher_request_set_crypt(req, &sg, &sg, crypt_len, crypt_iv);
   3875				init_completion(&comp.comp);
   3876				comp.in_flight = (atomic_t)ATOMIC_INIT(1);
   3877				if (do_crypt(true, req, &comp))
   3878					wait_for_completion(&comp.comp);
   3879
   3880				r = dm_integrity_failed(ic);
   3881				if (r) {
   3882					*error = "Unable to generate iv";
   3883					goto bad;
   3884				}
   3885
   3886				section_req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL);
   3887				if (!section_req) {
   3888					*error = "Unable to allocate crypt request";
   3889					r = -ENOMEM;
   3890					goto bad;
   3891				}
   3892				section_req->iv = kmalloc_array(ivsize, 2,
   3893								GFP_KERNEL);
   3894				if (!section_req->iv) {
   3895					skcipher_request_free(section_req);
   3896					*error = "Unable to allocate iv";
   3897					r = -ENOMEM;
   3898					goto bad;
   3899				}
   3900				memcpy(section_req->iv + ivsize, crypt_data, ivsize);
   3901				section_req->cryptlen = (size_t)ic->journal_section_sectors << SECTOR_SHIFT;
   3902				ic->sk_requests[i] = section_req;
   3903				DEBUG_bytes(crypt_data, ivsize, "iv(%u)", i);
   3904			}
   3905		}
   3906	}
   3907
   3908	for (i = 0; i < N_COMMIT_IDS; i++) {
   3909		unsigned j;
   3910retest_commit_id:
   3911		for (j = 0; j < i; j++) {
   3912			if (ic->commit_ids[j] == ic->commit_ids[i]) {
   3913				ic->commit_ids[i] = cpu_to_le64(le64_to_cpu(ic->commit_ids[i]) + 1);
   3914				goto retest_commit_id;
   3915			}
   3916		}
   3917		DEBUG_print("commit id %u: %016llx\n", i, ic->commit_ids[i]);
   3918	}
   3919
   3920	journal_tree_size = (__u64)ic->journal_entries * sizeof(struct journal_node);
   3921	if (journal_tree_size > ULONG_MAX) {
   3922		*error = "Journal doesn't fit into memory";
   3923		r = -ENOMEM;
   3924		goto bad;
   3925	}
   3926	ic->journal_tree = kvmalloc(journal_tree_size, GFP_KERNEL);
   3927	if (!ic->journal_tree) {
   3928		*error = "Could not allocate memory for journal tree";
   3929		r = -ENOMEM;
   3930	}
   3931bad:
   3932	kfree(crypt_data);
   3933	kfree(crypt_iv);
   3934	skcipher_request_free(req);
   3935
   3936	return r;
   3937}
   3938
   3939/*
    3940 * Construct an integrity mapping
   3941 *
   3942 * Arguments:
   3943 *	device
   3944 *	offset from the start of the device
   3945 *	tag size
   3946 *	D - direct writes, J - journal writes, B - bitmap mode, R - recovery mode
   3947 *	number of optional arguments
   3948 *	optional arguments:
   3949 *		journal_sectors
   3950 *		interleave_sectors
   3951 *		buffer_sectors
   3952 *		journal_watermark
   3953 *		commit_time
   3954 *		meta_device
   3955 *		block_size
   3956 *		sectors_per_bit
   3957 *		bitmap_flush_interval
   3958 *		internal_hash
   3959 *		journal_crypt
   3960 *		journal_mac
    3961 *		recalculate
         *		reset_recalculate
         *		allow_discards
         *		fix_padding
         *		fix_hmac
         *		legacy_recalculate
   3962 */
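        /*
         * Illustrative example (not part of the original source): with the
         * argument layout above, a dm table line for a journaled ('J') target
         * using an internal crc32c hash could look like the following; the
         * device path, length and option values are placeholders:
         *
         *	0 1000000 integrity /dev/loop0 0 4 J 2 journal_sectors:8192 internal_hash:crc32c
         *
         * The four direct arguments come first, then the count of optional
         * arguments, then the optional arguments themselves; keyed algorithms
         * may append a hex-encoded key after a second colon, e.g.
         * internal_hash:hmac(sha256):<hexkey>.
         */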
   3963static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
   3964{
   3965	struct dm_integrity_c *ic;
   3966	char dummy;
   3967	int r;
   3968	unsigned extra_args;
   3969	struct dm_arg_set as;
   3970	static const struct dm_arg _args[] = {
   3971		{0, 18, "Invalid number of feature args"},
   3972	};
   3973	unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
   3974	bool should_write_sb;
   3975	__u64 threshold;
   3976	unsigned long long start;
   3977	__s8 log2_sectors_per_bitmap_bit = -1;
   3978	__s8 log2_blocks_per_bitmap_bit;
   3979	__u64 bits_in_journal;
   3980	__u64 n_bitmap_bits;
   3981
   3982#define DIRECT_ARGUMENTS	4
   3983
   3984	if (argc <= DIRECT_ARGUMENTS) {
   3985		ti->error = "Invalid argument count";
   3986		return -EINVAL;
   3987	}
   3988
   3989	ic = kzalloc(sizeof(struct dm_integrity_c), GFP_KERNEL);
   3990	if (!ic) {
   3991		ti->error = "Cannot allocate integrity context";
   3992		return -ENOMEM;
   3993	}
   3994	ti->private = ic;
   3995	ti->per_io_data_size = sizeof(struct dm_integrity_io);
   3996	ic->ti = ti;
   3997
   3998	ic->in_progress = RB_ROOT;
   3999	INIT_LIST_HEAD(&ic->wait_list);
   4000	init_waitqueue_head(&ic->endio_wait);
   4001	bio_list_init(&ic->flush_bio_list);
   4002	init_waitqueue_head(&ic->copy_to_journal_wait);
   4003	init_completion(&ic->crypto_backoff);
   4004	atomic64_set(&ic->number_of_mismatches, 0);
   4005	ic->bitmap_flush_interval = BITMAP_FLUSH_INTERVAL;
   4006
   4007	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev);
   4008	if (r) {
   4009		ti->error = "Device lookup failed";
   4010		goto bad;
   4011	}
   4012
   4013	if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) {
   4014		ti->error = "Invalid starting offset";
   4015		r = -EINVAL;
   4016		goto bad;
   4017	}
   4018	ic->start = start;
   4019
   4020	if (strcmp(argv[2], "-")) {
   4021		if (sscanf(argv[2], "%u%c", &ic->tag_size, &dummy) != 1 || !ic->tag_size) {
   4022			ti->error = "Invalid tag size";
   4023			r = -EINVAL;
   4024			goto bad;
   4025		}
   4026	}
   4027
   4028	if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") ||
   4029	    !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) {
   4030		ic->mode = argv[3][0];
   4031	} else {
   4032		ti->error = "Invalid mode (expecting J, B, D, R)";
   4033		r = -EINVAL;
   4034		goto bad;
   4035	}
   4036
   4037	journal_sectors = 0;
   4038	interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
   4039	buffer_sectors = DEFAULT_BUFFER_SECTORS;
   4040	journal_watermark = DEFAULT_JOURNAL_WATERMARK;
   4041	sync_msec = DEFAULT_SYNC_MSEC;
   4042	ic->sectors_per_block = 1;
   4043
   4044	as.argc = argc - DIRECT_ARGUMENTS;
   4045	as.argv = argv + DIRECT_ARGUMENTS;
   4046	r = dm_read_arg_group(_args, &as, &extra_args, &ti->error);
   4047	if (r)
   4048		goto bad;
   4049
   4050	while (extra_args--) {
   4051		const char *opt_string;
   4052		unsigned val;
   4053		unsigned long long llval;
   4054		opt_string = dm_shift_arg(&as);
   4055		if (!opt_string) {
   4056			r = -EINVAL;
   4057			ti->error = "Not enough feature arguments";
   4058			goto bad;
   4059		}
   4060		if (sscanf(opt_string, "journal_sectors:%u%c", &val, &dummy) == 1)
   4061			journal_sectors = val ? val : 1;
   4062		else if (sscanf(opt_string, "interleave_sectors:%u%c", &val, &dummy) == 1)
   4063			interleave_sectors = val;
   4064		else if (sscanf(opt_string, "buffer_sectors:%u%c", &val, &dummy) == 1)
   4065			buffer_sectors = val;
   4066		else if (sscanf(opt_string, "journal_watermark:%u%c", &val, &dummy) == 1 && val <= 100)
   4067			journal_watermark = val;
   4068		else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1)
   4069			sync_msec = val;
   4070		else if (!strncmp(opt_string, "meta_device:", strlen("meta_device:"))) {
   4071			if (ic->meta_dev) {
   4072				dm_put_device(ti, ic->meta_dev);
   4073				ic->meta_dev = NULL;
   4074			}
   4075			r = dm_get_device(ti, strchr(opt_string, ':') + 1,
   4076					  dm_table_get_mode(ti->table), &ic->meta_dev);
   4077			if (r) {
   4078				ti->error = "Device lookup failed";
   4079				goto bad;
   4080			}
   4081		} else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) {
   4082			if (val < 1 << SECTOR_SHIFT ||
   4083			    val > MAX_SECTORS_PER_BLOCK << SECTOR_SHIFT ||
    4084			    (val & (val - 1))) {
   4085				r = -EINVAL;
   4086				ti->error = "Invalid block_size argument";
   4087				goto bad;
   4088			}
   4089			ic->sectors_per_block = val >> SECTOR_SHIFT;
   4090		} else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) {
   4091			log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval);
   4092		} else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) {
   4093			if (val >= (uint64_t)UINT_MAX * 1000 / HZ) {
   4094				r = -EINVAL;
   4095				ti->error = "Invalid bitmap_flush_interval argument";
   4096				goto bad;
   4097			}
   4098			ic->bitmap_flush_interval = msecs_to_jiffies(val);
   4099		} else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
   4100			r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error,
   4101					    "Invalid internal_hash argument");
   4102			if (r)
   4103				goto bad;
   4104		} else if (!strncmp(opt_string, "journal_crypt:", strlen("journal_crypt:"))) {
   4105			r = get_alg_and_key(opt_string, &ic->journal_crypt_alg, &ti->error,
   4106					    "Invalid journal_crypt argument");
   4107			if (r)
   4108				goto bad;
   4109		} else if (!strncmp(opt_string, "journal_mac:", strlen("journal_mac:"))) {
   4110			r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error,
   4111					    "Invalid journal_mac argument");
   4112			if (r)
   4113				goto bad;
   4114		} else if (!strcmp(opt_string, "recalculate")) {
   4115			ic->recalculate_flag = true;
   4116		} else if (!strcmp(opt_string, "reset_recalculate")) {
   4117			ic->recalculate_flag = true;
   4118			ic->reset_recalculate_flag = true;
   4119		} else if (!strcmp(opt_string, "allow_discards")) {
   4120			ic->discard = true;
   4121		} else if (!strcmp(opt_string, "fix_padding")) {
   4122			ic->fix_padding = true;
   4123		} else if (!strcmp(opt_string, "fix_hmac")) {
   4124			ic->fix_hmac = true;
   4125		} else if (!strcmp(opt_string, "legacy_recalculate")) {
   4126			ic->legacy_recalculate = true;
   4127		} else {
   4128			r = -EINVAL;
   4129			ti->error = "Invalid argument";
   4130			goto bad;
   4131		}
   4132	}
   4133
   4134	ic->data_device_sectors = bdev_nr_sectors(ic->dev->bdev);
   4135	if (!ic->meta_dev)
   4136		ic->meta_device_sectors = ic->data_device_sectors;
   4137	else
   4138		ic->meta_device_sectors = bdev_nr_sectors(ic->meta_dev->bdev);
   4139
   4140	if (!journal_sectors) {
   4141		journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
   4142				      ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
   4143	}
   4144
   4145	if (!buffer_sectors)
   4146		buffer_sectors = 1;
   4147	ic->log2_buffer_sectors = min((int)__fls(buffer_sectors), 31 - SECTOR_SHIFT);
   4148
   4149	r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error,
   4150		    "Invalid internal hash", "Error setting internal hash key");
   4151	if (r)
   4152		goto bad;
   4153
   4154	r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error,
   4155		    "Invalid journal mac", "Error setting journal mac key");
   4156	if (r)
   4157		goto bad;
   4158
   4159	if (!ic->tag_size) {
   4160		if (!ic->internal_hash) {
   4161			ti->error = "Unknown tag size";
   4162			r = -EINVAL;
   4163			goto bad;
   4164		}
   4165		ic->tag_size = crypto_shash_digestsize(ic->internal_hash);
   4166	}
   4167	if (ic->tag_size > MAX_TAG_SIZE) {
   4168		ti->error = "Too big tag size";
   4169		r = -EINVAL;
   4170		goto bad;
   4171	}
   4172	if (!(ic->tag_size & (ic->tag_size - 1)))
   4173		ic->log2_tag_size = __ffs(ic->tag_size);
   4174	else
   4175		ic->log2_tag_size = -1;
   4176
   4177	if (ic->mode == 'B' && !ic->internal_hash) {
   4178		r = -EINVAL;
    4179		ti->error = "Bitmap mode can only be used with internal hash";
   4180		goto bad;
   4181	}
   4182
   4183	if (ic->discard && !ic->internal_hash) {
   4184		r = -EINVAL;
    4185		ti->error = "Discard can only be used with internal hash";
   4186		goto bad;
   4187	}
   4188
   4189	ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
   4190	ic->autocommit_msec = sync_msec;
   4191	timer_setup(&ic->autocommit_timer, autocommit_fn, 0);
   4192
   4193	ic->io = dm_io_client_create();
   4194	if (IS_ERR(ic->io)) {
   4195		r = PTR_ERR(ic->io);
   4196		ic->io = NULL;
   4197		ti->error = "Cannot allocate dm io";
   4198		goto bad;
   4199	}
   4200
   4201	r = mempool_init_slab_pool(&ic->journal_io_mempool, JOURNAL_IO_MEMPOOL, journal_io_cache);
   4202	if (r) {
   4203		ti->error = "Cannot allocate mempool";
   4204		goto bad;
   4205	}
   4206
   4207	ic->metadata_wq = alloc_workqueue("dm-integrity-metadata",
   4208					  WQ_MEM_RECLAIM, METADATA_WORKQUEUE_MAX_ACTIVE);
   4209	if (!ic->metadata_wq) {
   4210		ti->error = "Cannot allocate workqueue";
   4211		r = -ENOMEM;
   4212		goto bad;
   4213	}
   4214
   4215	/*
   4216	 * If this workqueue were percpu, it would cause bio reordering
   4217	 * and reduced performance.
   4218	 */
   4219	ic->wait_wq = alloc_workqueue("dm-integrity-wait", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
   4220	if (!ic->wait_wq) {
   4221		ti->error = "Cannot allocate workqueue";
   4222		r = -ENOMEM;
   4223		goto bad;
   4224	}
   4225
   4226	ic->offload_wq = alloc_workqueue("dm-integrity-offload", WQ_MEM_RECLAIM,
   4227					  METADATA_WORKQUEUE_MAX_ACTIVE);
   4228	if (!ic->offload_wq) {
   4229		ti->error = "Cannot allocate workqueue";
   4230		r = -ENOMEM;
   4231		goto bad;
   4232	}
   4233
   4234	ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1);
   4235	if (!ic->commit_wq) {
   4236		ti->error = "Cannot allocate workqueue";
   4237		r = -ENOMEM;
   4238		goto bad;
   4239	}
   4240	INIT_WORK(&ic->commit_work, integrity_commit);
   4241
   4242	if (ic->mode == 'J' || ic->mode == 'B') {
   4243		ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1);
   4244		if (!ic->writer_wq) {
   4245			ti->error = "Cannot allocate workqueue";
   4246			r = -ENOMEM;
   4247			goto bad;
   4248		}
   4249		INIT_WORK(&ic->writer_work, integrity_writer);
   4250	}
   4251
   4252	ic->sb = alloc_pages_exact(SB_SECTORS << SECTOR_SHIFT, GFP_KERNEL);
   4253	if (!ic->sb) {
   4254		r = -ENOMEM;
   4255		ti->error = "Cannot allocate superblock area";
   4256		goto bad;
   4257	}
   4258
   4259	r = sync_rw_sb(ic, REQ_OP_READ, 0);
   4260	if (r) {
   4261		ti->error = "Error reading superblock";
   4262		goto bad;
   4263	}
   4264	should_write_sb = false;
   4265	if (memcmp(ic->sb->magic, SB_MAGIC, 8)) {
   4266		if (ic->mode != 'R') {
   4267			if (memchr_inv(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT)) {
   4268				r = -EINVAL;
   4269				ti->error = "The device is not initialized";
   4270				goto bad;
   4271			}
   4272		}
   4273
   4274		r = initialize_superblock(ic, journal_sectors, interleave_sectors);
   4275		if (r) {
   4276			ti->error = "Could not initialize superblock";
   4277			goto bad;
   4278		}
   4279		if (ic->mode != 'R')
   4280			should_write_sb = true;
   4281	}
   4282
   4283	if (!ic->sb->version || ic->sb->version > SB_VERSION_5) {
   4284		r = -EINVAL;
   4285		ti->error = "Unknown version";
   4286		goto bad;
   4287	}
   4288	if (le16_to_cpu(ic->sb->integrity_tag_size) != ic->tag_size) {
   4289		r = -EINVAL;
   4290		ti->error = "Tag size doesn't match the information in superblock";
   4291		goto bad;
   4292	}
   4293	if (ic->sb->log2_sectors_per_block != __ffs(ic->sectors_per_block)) {
   4294		r = -EINVAL;
   4295		ti->error = "Block size doesn't match the information in superblock";
   4296		goto bad;
   4297	}
   4298	if (!le32_to_cpu(ic->sb->journal_sections)) {
   4299		r = -EINVAL;
   4300		ti->error = "Corrupted superblock, journal_sections is 0";
   4301		goto bad;
   4302	}
   4303	/* make sure that ti->max_io_len doesn't overflow */
   4304	if (!ic->meta_dev) {
   4305		if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS ||
   4306		    ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) {
   4307			r = -EINVAL;
   4308			ti->error = "Invalid interleave_sectors in the superblock";
   4309			goto bad;
   4310		}
   4311	} else {
   4312		if (ic->sb->log2_interleave_sectors) {
   4313			r = -EINVAL;
   4314			ti->error = "Invalid interleave_sectors in the superblock";
   4315			goto bad;
   4316		}
   4317	}
   4318	if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) {
   4319		r = -EINVAL;
   4320		ti->error = "Journal mac mismatch";
   4321		goto bad;
   4322	}
   4323
   4324	get_provided_data_sectors(ic);
   4325	if (!ic->provided_data_sectors) {
   4326		r = -EINVAL;
   4327		ti->error = "The device is too small";
   4328		goto bad;
   4329	}
   4330
   4331try_smaller_buffer:
   4332	r = calculate_device_limits(ic);
   4333	if (r) {
   4334		if (ic->meta_dev) {
   4335			if (ic->log2_buffer_sectors > 3) {
   4336				ic->log2_buffer_sectors--;
   4337				goto try_smaller_buffer;
   4338			}
   4339		}
   4340		ti->error = "The device is too small";
   4341		goto bad;
   4342	}
   4343
   4344	if (log2_sectors_per_bitmap_bit < 0)
   4345		log2_sectors_per_bitmap_bit = __fls(DEFAULT_SECTORS_PER_BITMAP_BIT);
   4346	if (log2_sectors_per_bitmap_bit < ic->sb->log2_sectors_per_block)
   4347		log2_sectors_per_bitmap_bit = ic->sb->log2_sectors_per_block;
   4348
   4349	bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3);
   4350	if (bits_in_journal > UINT_MAX)
   4351		bits_in_journal = UINT_MAX;
   4352	while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit)
   4353		log2_sectors_per_bitmap_bit++;
   4354
   4355	log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block;
   4356	ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
   4357	if (should_write_sb) {
   4358		ic->sb->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
   4359	}
   4360	n_bitmap_bits = ((ic->provided_data_sectors >> ic->sb->log2_sectors_per_block)
   4361				+ (((sector_t)1 << log2_blocks_per_bitmap_bit) - 1)) >> log2_blocks_per_bitmap_bit;
   4362	ic->n_bitmap_blocks = DIV_ROUND_UP(n_bitmap_bits, BITMAP_BLOCK_SIZE * 8);
   4363
   4364	if (!ic->meta_dev)
   4365		ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run));
   4366
   4367	if (ti->len > ic->provided_data_sectors) {
   4368		r = -EINVAL;
   4369		ti->error = "Not enough provided sectors for requested mapping size";
   4370		goto bad;
   4371	}
    4372
   4374	threshold = (__u64)ic->journal_entries * (100 - journal_watermark);
   4375	threshold += 50;
   4376	do_div(threshold, 100);
   4377	ic->free_sectors_threshold = threshold;
   4378
   4379	DEBUG_print("initialized:\n");
   4380	DEBUG_print("	integrity_tag_size %u\n", le16_to_cpu(ic->sb->integrity_tag_size));
   4381	DEBUG_print("	journal_entry_size %u\n", ic->journal_entry_size);
   4382	DEBUG_print("	journal_entries_per_sector %u\n", ic->journal_entries_per_sector);
   4383	DEBUG_print("	journal_section_entries %u\n", ic->journal_section_entries);
   4384	DEBUG_print("	journal_section_sectors %u\n", ic->journal_section_sectors);
   4385	DEBUG_print("	journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections));
   4386	DEBUG_print("	journal_entries %u\n", ic->journal_entries);
   4387	DEBUG_print("	log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors);
   4388	DEBUG_print("	data_device_sectors 0x%llx\n", bdev_nr_sectors(ic->dev->bdev));
   4389	DEBUG_print("	initial_sectors 0x%x\n", ic->initial_sectors);
   4390	DEBUG_print("	metadata_run 0x%x\n", ic->metadata_run);
   4391	DEBUG_print("	log2_metadata_run %d\n", ic->log2_metadata_run);
   4392	DEBUG_print("	provided_data_sectors 0x%llx (%llu)\n", ic->provided_data_sectors, ic->provided_data_sectors);
   4393	DEBUG_print("	log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
   4394	DEBUG_print("	bits_in_journal %llu\n", bits_in_journal);
   4395
   4396	if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
   4397		ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
   4398		ic->sb->recalc_sector = cpu_to_le64(0);
   4399	}
   4400
   4401	if (ic->internal_hash) {
   4402		size_t recalc_tags_size;
   4403		ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1);
    4404		if (!ic->recalc_wq) {
   4405			ti->error = "Cannot allocate workqueue";
   4406			r = -ENOMEM;
   4407			goto bad;
   4408		}
   4409		INIT_WORK(&ic->recalc_work, integrity_recalc);
   4410		ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT);
   4411		if (!ic->recalc_buffer) {
   4412			ti->error = "Cannot allocate buffer for recalculating";
   4413			r = -ENOMEM;
   4414			goto bad;
   4415		}
   4416		recalc_tags_size = (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size;
   4417		if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size)
   4418			recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size;
   4419		ic->recalc_tags = kvmalloc(recalc_tags_size, GFP_KERNEL);
   4420		if (!ic->recalc_tags) {
   4421			ti->error = "Cannot allocate tags for recalculating";
   4422			r = -ENOMEM;
   4423			goto bad;
   4424		}
   4425	} else {
   4426		if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
   4427			ti->error = "Recalculate can only be specified with internal_hash";
   4428			r = -EINVAL;
   4429			goto bad;
   4430		}
   4431	}
   4432
   4433	if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
   4434	    le64_to_cpu(ic->sb->recalc_sector) < ic->provided_data_sectors &&
   4435	    dm_integrity_disable_recalculate(ic)) {
   4436		ti->error = "Recalculating with HMAC is disabled for security reasons - if you really need it, use the argument \"legacy_recalculate\"";
   4437		r = -EOPNOTSUPP;
   4438		goto bad;
   4439	}
   4440
   4441	ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev,
   4442			1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL);
   4443	if (IS_ERR(ic->bufio)) {
   4444		r = PTR_ERR(ic->bufio);
   4445		ti->error = "Cannot initialize dm-bufio";
   4446		ic->bufio = NULL;
   4447		goto bad;
   4448	}
   4449	dm_bufio_set_sector_offset(ic->bufio, ic->start + ic->initial_sectors);
   4450
   4451	if (ic->mode != 'R') {
   4452		r = create_journal(ic, &ti->error);
   4453		if (r)
   4454			goto bad;
   4455
   4456	}
   4457
   4458	if (ic->mode == 'B') {
   4459		unsigned i;
   4460		unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
   4461
   4462		ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
   4463		if (!ic->recalc_bitmap) {
   4464			r = -ENOMEM;
   4465			goto bad;
   4466		}
   4467		ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
   4468		if (!ic->may_write_bitmap) {
   4469			r = -ENOMEM;
   4470			goto bad;
   4471		}
   4472		ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL);
   4473		if (!ic->bbs) {
   4474			r = -ENOMEM;
   4475			goto bad;
   4476		}
   4477		INIT_DELAYED_WORK(&ic->bitmap_flush_work, bitmap_flush_work);
   4478		for (i = 0; i < ic->n_bitmap_blocks; i++) {
   4479			struct bitmap_block_status *bbs = &ic->bbs[i];
   4480			unsigned sector, pl_index, pl_offset;
   4481
   4482			INIT_WORK(&bbs->work, bitmap_block_work);
   4483			bbs->ic = ic;
   4484			bbs->idx = i;
   4485			bio_list_init(&bbs->bio_queue);
   4486			spin_lock_init(&bbs->bio_queue_lock);
   4487
   4488			sector = i * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT);
   4489			pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
   4490			pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
   4491
   4492			bbs->bitmap = lowmem_page_address(ic->journal[pl_index].page) + pl_offset;
   4493		}
   4494	}
   4495
   4496	if (should_write_sb) {
   4497		init_journal(ic, 0, ic->journal_sections, 0);
   4498		r = dm_integrity_failed(ic);
   4499		if (unlikely(r)) {
   4500			ti->error = "Error initializing journal";
   4501			goto bad;
   4502		}
   4503		r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
   4504		if (r) {
   4505			ti->error = "Error initializing superblock";
   4506			goto bad;
   4507		}
   4508		ic->just_formatted = true;
   4509	}
   4510
   4511	if (!ic->meta_dev) {
   4512		r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors);
   4513		if (r)
   4514			goto bad;
   4515	}
   4516	if (ic->mode == 'B') {
   4517		unsigned max_io_len = ((sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit) * (BITMAP_BLOCK_SIZE * 8);
   4518		if (!max_io_len)
   4519			max_io_len = 1U << 31;
   4520		DEBUG_print("max_io_len: old %u, new %u\n", ti->max_io_len, max_io_len);
   4521		if (!ti->max_io_len || ti->max_io_len > max_io_len) {
   4522			r = dm_set_target_max_io_len(ti, max_io_len);
   4523			if (r)
   4524				goto bad;
   4525		}
   4526	}
   4527
   4528	if (!ic->internal_hash)
   4529		dm_integrity_set(ti, ic);
   4530
   4531	ti->num_flush_bios = 1;
   4532	ti->flush_supported = true;
   4533	if (ic->discard)
   4534		ti->num_discard_bios = 1;
   4535
   4536	dm_audit_log_ctr(DM_MSG_PREFIX, ti, 1);
   4537	return 0;
   4538
   4539bad:
   4540	dm_audit_log_ctr(DM_MSG_PREFIX, ti, 0);
   4541	dm_integrity_dtr(ti);
   4542	return r;
   4543}
   4544
   4545static void dm_integrity_dtr(struct dm_target *ti)
   4546{
   4547	struct dm_integrity_c *ic = ti->private;
   4548
   4549	BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
   4550	BUG_ON(!list_empty(&ic->wait_list));
   4551
   4552	if (ic->metadata_wq)
   4553		destroy_workqueue(ic->metadata_wq);
   4554	if (ic->wait_wq)
   4555		destroy_workqueue(ic->wait_wq);
   4556	if (ic->offload_wq)
   4557		destroy_workqueue(ic->offload_wq);
   4558	if (ic->commit_wq)
   4559		destroy_workqueue(ic->commit_wq);
   4560	if (ic->writer_wq)
   4561		destroy_workqueue(ic->writer_wq);
   4562	if (ic->recalc_wq)
   4563		destroy_workqueue(ic->recalc_wq);
   4564	vfree(ic->recalc_buffer);
   4565	kvfree(ic->recalc_tags);
   4566	kvfree(ic->bbs);
   4567	if (ic->bufio)
   4568		dm_bufio_client_destroy(ic->bufio);
   4569	mempool_exit(&ic->journal_io_mempool);
   4570	if (ic->io)
   4571		dm_io_client_destroy(ic->io);
   4572	if (ic->dev)
   4573		dm_put_device(ti, ic->dev);
   4574	if (ic->meta_dev)
   4575		dm_put_device(ti, ic->meta_dev);
   4576	dm_integrity_free_page_list(ic->journal);
   4577	dm_integrity_free_page_list(ic->journal_io);
   4578	dm_integrity_free_page_list(ic->journal_xor);
   4579	dm_integrity_free_page_list(ic->recalc_bitmap);
   4580	dm_integrity_free_page_list(ic->may_write_bitmap);
   4581	if (ic->journal_scatterlist)
   4582		dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist);
   4583	if (ic->journal_io_scatterlist)
   4584		dm_integrity_free_journal_scatterlist(ic, ic->journal_io_scatterlist);
   4585	if (ic->sk_requests) {
   4586		unsigned i;
   4587
   4588		for (i = 0; i < ic->journal_sections; i++) {
   4589			struct skcipher_request *req = ic->sk_requests[i];
   4590			if (req) {
   4591				kfree_sensitive(req->iv);
   4592				skcipher_request_free(req);
   4593			}
   4594		}
   4595		kvfree(ic->sk_requests);
   4596	}
   4597	kvfree(ic->journal_tree);
   4598	if (ic->sb)
   4599		free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT);
   4600
   4601	if (ic->internal_hash)
   4602		crypto_free_shash(ic->internal_hash);
   4603	free_alg(&ic->internal_hash_alg);
   4604
   4605	if (ic->journal_crypt)
   4606		crypto_free_skcipher(ic->journal_crypt);
   4607	free_alg(&ic->journal_crypt_alg);
   4608
   4609	if (ic->journal_mac)
   4610		crypto_free_shash(ic->journal_mac);
   4611	free_alg(&ic->journal_mac_alg);
   4612
   4613	kfree(ic);
   4614	dm_audit_log_dtr(DM_MSG_PREFIX, ti, 1);
   4615}
   4616
   4617static struct target_type integrity_target = {
   4618	.name			= "integrity",
   4619	.version		= {1, 10, 0},
   4620	.module			= THIS_MODULE,
   4621	.features		= DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
   4622	.ctr			= dm_integrity_ctr,
   4623	.dtr			= dm_integrity_dtr,
   4624	.map			= dm_integrity_map,
   4625	.postsuspend		= dm_integrity_postsuspend,
   4626	.resume			= dm_integrity_resume,
   4627	.status			= dm_integrity_status,
   4628	.iterate_devices	= dm_integrity_iterate_devices,
   4629	.io_hints		= dm_integrity_io_hints,
   4630};
   4631
   4632static int __init dm_integrity_init(void)
   4633{
   4634	int r;
   4635
   4636	journal_io_cache = kmem_cache_create("integrity_journal_io",
   4637					     sizeof(struct journal_io), 0, 0, NULL);
   4638	if (!journal_io_cache) {
   4639		DMERR("can't allocate journal io cache");
   4640		return -ENOMEM;
   4641	}
   4642
   4643	r = dm_register_target(&integrity_target);
   4644
   4645	if (r < 0)
   4646		DMERR("register failed %d", r);
   4647
   4648	return r;
   4649}
   4650
   4651static void __exit dm_integrity_exit(void)
   4652{
   4653	dm_unregister_target(&integrity_target);
   4654	kmem_cache_destroy(journal_io_cache);
   4655}
   4656
   4657module_init(dm_integrity_init);
   4658module_exit(dm_integrity_exit);
   4659
   4660MODULE_AUTHOR("Milan Broz");
   4661MODULE_AUTHOR("Mikulas Patocka");
   4662MODULE_DESCRIPTION(DM_NAME " target for integrity tags extension");
   4663MODULE_LICENSE("GPL");