cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

raid5-cache.c (89559B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
      4 * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
      5 */
      6#include <linux/kernel.h>
      7#include <linux/wait.h>
      8#include <linux/blkdev.h>
      9#include <linux/slab.h>
     10#include <linux/raid/md_p.h>
     11#include <linux/crc32c.h>
     12#include <linux/random.h>
     13#include <linux/kthread.h>
     14#include <linux/types.h>
     15#include "md.h"
     16#include "raid5.h"
     17#include "md-bitmap.h"
     18#include "raid5-log.h"
     19
     20/*
     21 * metadata/data is stored on disk in 4k units (blocks) regardless of the
     22 * underlying hardware sector size. Only works with PAGE_SIZE == 4096.
     23 */
     24#define BLOCK_SECTORS (8)
     25#define BLOCK_SECTOR_SHIFT (3)
     26
     27/*
     28 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
     29 *
     30 * In write through mode, the reclaim runs every log->max_free_space.
     31 * This prevents recovery from having to scan too much of the log.
     32 */
     33#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
     34#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
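       /*
        * Worked example (hypothetical journal size): RECLAIM_MAX_FREE_SPACE is
        * 10 * 1024 * 1024 * 2 = 20971520 sectors, i.e. 10GiB in 512-byte
        * sectors. For a 16GiB journal, a quarter of the device (4GiB, i.e.
        * 8388608 sectors) is smaller, so per the min() rule above
        * max_free_space would be 4GiB worth of sectors. A sketch of that rule
        * (not the actual init code):
        *
        *	max_free_space = min(device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT,
        *			     (sector_t)RECLAIM_MAX_FREE_SPACE);
        */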
     35
     36/* wake up reclaim thread periodically */
     37#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
     38/* start flush with these full stripes */
     39#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) (conf->max_nr_stripes / 4)
     40/* reclaim stripes in groups */
     41#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
     42
     43/*
     44 * We only need 2 bios per I/O unit to make progress, but ensure we
     45 * have a few more available to not get too tight.
     46 */
     47#define R5L_POOL_SIZE	4
     48
     49static char *r5c_journal_mode_str[] = {"write-through",
     50				       "write-back"};
     51/*
     52 * raid5 cache state machine
     53 *
     54 * With the RAID cache, each stripe works in two phases:
     55 *	- caching phase
     56 *	- writing-out phase
     57 *
     58 * These two phases are controlled by bit STRIPE_R5C_CACHING:
     59 *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
     60 *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
     61 *
     62 * When there is no journal, or the journal is in write-through mode,
     63 * the stripe is always in writing-out phase.
     64 *
     65 * For write-back journal, the stripe is sent to caching phase on write
     66 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
     67 * the write-out phase by clearing STRIPE_R5C_CACHING.
     68 *
     69 * Stripes in caching phase do not write the raid disks. Instead, all
     70 * writes are committed from the log device. Therefore, a stripe in
     71 * caching phase handles writes as:
     72 *	- write to log device
     73 *	- return IO
     74 *
     75 * Stripes in writing-out phase handle writes as:
     76 *	- calculate parity
     77 *	- write pending data and parity to journal
     78 *	- write data and parity to raid disks
     79 *	- return IO for pending writes
     80 */
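       /*
        * Illustrative sketch only: callers distinguish the two phases by
        * testing the bit, roughly
        *
        *	if (test_bit(STRIPE_R5C_CACHING, &sh->state))
        *		caching phase: write data to the journal, return IO
        *	else
        *		writing-out phase: compute parity, write to raid disks
        *
        * see r5c_try_caching_write() and r5c_make_stripe_write_out().
        */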
     81
     82struct r5l_log {
     83	struct md_rdev *rdev;
     84
     85	u32 uuid_checksum;
     86
     87	sector_t device_size;		/* log device size, round to
     88					 * BLOCK_SECTORS */
     89	sector_t max_free_space;	/* reclaim run if free space is at
     90					 * this size */
     91
     92	sector_t last_checkpoint;	/* log tail. where recovery scan
     93					 * starts from */
     94	u64 last_cp_seq;		/* log tail sequence */
     95
     96	sector_t log_start;		/* log head. where new data appends */
     97	u64 seq;			/* log head sequence */
     98
     99	sector_t next_checkpoint;
    100
    101	struct mutex io_mutex;
    102	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */
    103
    104	spinlock_t io_list_lock;
    105	struct list_head running_ios;	/* io_units which are still running,
    106					 * and have not yet been completely
    107					 * written to the log */
    108	struct list_head io_end_ios;	/* io_units which have been completely
    109					 * written to the log but not yet written
    110					 * to the RAID */
    111	struct list_head flushing_ios;	/* io_units which are waiting for log
    112					 * cache flush */
    113	struct list_head finished_ios;	/* io_units which settle down in log disk */
    114	struct bio flush_bio;
    115
    116	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */
    117
    118	struct kmem_cache *io_kc;
    119	mempool_t io_pool;
    120	struct bio_set bs;
    121	mempool_t meta_pool;
    122
    123	struct md_thread *reclaim_thread;
    124	unsigned long reclaim_target;	/* amount of space that needs to be
    125					 * reclaimed.  if it's 0, reclaim space
    126					 * used by io_units which are in
    127					 * IO_UNIT_STRIPE_END state (i.e. reclaim
    128					 * doesn't wait for a specific io_unit
    129					 * to switch to IO_UNIT_STRIPE_END
    130					 * state) */
    131	wait_queue_head_t iounit_wait;
    132
    133	struct list_head no_space_stripes; /* pending stripes, log has no space */
    134	spinlock_t no_space_stripes_lock;
    135
    136	bool need_cache_flush;
    137
    138	/* for r5c_cache */
    139	enum r5c_journal_mode r5c_journal_mode;
    140
    141	/* all stripes in r5cache, in the order of seq at sh->log_start */
    142	struct list_head stripe_in_journal_list;
    143
    144	spinlock_t stripe_in_journal_lock;
    145	atomic_t stripe_in_journal_count;
    146
    147	/* to submit async io_units, to fulfill ordering of flush */
    148	struct work_struct deferred_io_work;
    149	/* to disable write back when the array is degraded */
    150	struct work_struct disable_writeback_work;
    151
    152	/* for chunk_aligned_read in writeback mode, details below */
    153	spinlock_t tree_lock;
    154	struct radix_tree_root big_stripe_tree;
    155};
    156
    157/*
    158 * Enable chunk_aligned_read() with write back cache.
    159 *
    160 * Each chunk may contain more than one stripe (for example, a 256kB
    161 * chunk contains 64 4kB pages, so this chunk contains 64 stripes). For
    162 * chunk_aligned_read, these stripes are grouped into one "big_stripe".
    163 * For each big_stripe, we count how many stripes of this big_stripe
    164 * are in the write back cache. These data are tracked in a radix tree
    165 * (big_stripe_tree). We use radix_tree item pointer as the counter.
    166 * r5c_tree_index() is used to calculate keys for the radix tree.
    167 *
    168 * chunk_aligned_read() calls r5c_big_stripe_cached() to look up
    169 * big_stripe of each chunk in the tree. If this big_stripe is in the
    170 * tree, chunk_aligned_read() aborts. This look up is protected by
    171 * rcu_read_lock().
    172 *
    173 * It is necessary to remember whether a stripe is counted in
    174 * big_stripe_tree. Instead of adding a new flag, we reuse existing flags:
    175 * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
    176 * two flags is set, the stripe is counted in big_stripe_tree. This
    177 * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
    178 * r5c_try_caching_write(); and moving clear_bit of
    179 * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
    180 * r5c_finish_stripe_write_out().
    181 */
    182
    183/*
    184 * The radix tree requires the lowest 2 bits of the data pointer to be zero.
    185 * So it is necessary to left shift the counter by 2 bits before using it
    186 * as the data pointer of the tree.
    187 */
    188#define R5C_RADIX_COUNT_SHIFT 2
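       /*
        * For example, a big_stripe with 3 cached stripes is stored as the
        * "pointer" value (void *)(3UL << R5C_RADIX_COUNT_SHIFT) and read back
        * as (unsigned long)ptr >> R5C_RADIX_COUNT_SHIFT. This is a sketch of
        * the convention described above, not a quote of the update helpers.
        */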
    189
    190/*
    191 * calculate key for big_stripe_tree
    192 *
    193 * sect: align_bi->bi_iter.bi_sector or sh->sector
    194 */
    195static inline sector_t r5c_tree_index(struct r5conf *conf,
    196				      sector_t sect)
    197{
    198	sector_div(sect, conf->chunk_sectors);
    199	return sect;
    200}
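       /*
        * Example (hypothetical geometry): with conf->chunk_sectors == 512
        * (256kB chunks), sectors 0..511 map to big_stripe index 0 and sectors
        * 512..1023 to index 1, e.g.
        *
        *	sector_t idx = r5c_tree_index(conf, 700);	(yields 1)
        */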
    201
    202/*
    203 * An IO range starts at a meta data block and ends at the next meta data
    204 * block. The io unit's meta data block tracks the data/parity that follows
    205 * it. The io unit is written to the log disk with a normal write; as we
    206 * always flush the log disk first and then start moving data to the raid
    207 * disks, there is no requirement to write the io unit with FLUSH/FUA.
    208 */
    209struct r5l_io_unit {
    210	struct r5l_log *log;
    211
    212	struct page *meta_page;	/* store meta block */
    213	int meta_offset;	/* current offset in meta_page */
    214
    215	struct bio *current_bio;/* current_bio accepting new data */
    216
    217	atomic_t pending_stripe;/* how many stripes not flushed to raid */
    218	u64 seq;		/* seq number of the metablock */
    219	sector_t log_start;	/* where the io_unit starts */
    220	sector_t log_end;	/* where the io_unit ends */
    221	struct list_head log_sibling; /* log->running_ios */
    222	struct list_head stripe_list; /* stripes added to the io_unit */
    223
    224	int state;
    225	bool need_split_bio;
    226	struct bio *split_bio;
    227
    228	unsigned int has_flush:1;		/* include flush request */
    229	unsigned int has_fua:1;			/* include fua request */
    230	unsigned int has_null_flush:1;		/* include null flush request */
    231	unsigned int has_flush_payload:1;	/* include flush payload  */
    232	/*
    233	 * io isn't sent yet; a flush/fua request can only be submitted once it
    234	 * is the first IO in the running_ios list
    235	 */
    236	unsigned int io_deferred:1;
    237
    238	struct bio_list flush_barriers;   /* size == 0 flush bios */
    239};
    240
    241/* r5l_io_unit state */
    242enum r5l_io_unit_state {
    243	IO_UNIT_RUNNING = 0,	/* accepting new IO */
    244	IO_UNIT_IO_START = 1,	/* io_unit bio started writing to log,
    245				 * not accepting new bios */
    246	IO_UNIT_IO_END = 2,	/* io_unit bio finished writing to log */
    247	IO_UNIT_STRIPE_END = 3,	/* stripe data finished writing to raid */
    248};
    249
    250bool r5c_is_writeback(struct r5l_log *log)
    251{
    252	return (log != NULL &&
    253		log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
    254}
    255
    256static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
    257{
    258	start += inc;
    259	if (start >= log->device_size)
    260		start = start - log->device_size;
    261	return start;
    262}
    263
    264static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
    265				  sector_t end)
    266{
    267	if (end >= start)
    268		return end - start;
    269	else
    270		return end + log->device_size - start;
    271}
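       /*
        * Worked example (hypothetical device_size of 1000 sectors):
        * r5l_ring_add(log, 990, 20) wraps around to 10, and
        * r5l_ring_distance(log, 990, 10) = 10 + 1000 - 990 = 20 sectors.
        */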
    272
    273static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
    274{
    275	sector_t used_size;
    276
    277	used_size = r5l_ring_distance(log, log->last_checkpoint,
    278					log->log_start);
    279
    280	return log->device_size > used_size + size;
    281}
    282
    283static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
    284				    enum r5l_io_unit_state state)
    285{
    286	if (WARN_ON(io->state >= state))
    287		return;
    288	io->state = state;
    289}
    290
    291static void
    292r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
    293{
    294	struct bio *wbi, *wbi2;
    295
    296	wbi = dev->written;
    297	dev->written = NULL;
    298	while (wbi && wbi->bi_iter.bi_sector <
    299	       dev->sector + RAID5_STRIPE_SECTORS(conf)) {
    300		wbi2 = r5_next_bio(conf, wbi, dev->sector);
    301		md_write_end(conf->mddev);
    302		bio_endio(wbi);
    303		wbi = wbi2;
    304	}
    305}
    306
    307void r5c_handle_cached_data_endio(struct r5conf *conf,
    308				  struct stripe_head *sh, int disks)
    309{
    310	int i;
    311
    312	for (i = sh->disks; i--; ) {
    313		if (sh->dev[i].written) {
    314			set_bit(R5_UPTODATE, &sh->dev[i].flags);
    315			r5c_return_dev_pending_writes(conf, &sh->dev[i]);
    316			md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
    317					   RAID5_STRIPE_SECTORS(conf),
    318					   !test_bit(STRIPE_DEGRADED, &sh->state),
    319					   0);
    320		}
    321	}
    322}
    323
    324void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
    325
    326/* Check whether we should flush some stripes to free up stripe cache */
    327void r5c_check_stripe_cache_usage(struct r5conf *conf)
    328{
    329	int total_cached;
    330
    331	if (!r5c_is_writeback(conf->log))
    332		return;
    333
    334	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
    335		atomic_read(&conf->r5c_cached_full_stripes);
    336
    337	/*
    338	 * The following condition is true for either of the following:
    339	 *   - stripe cache pressure high:
    340	 *          total_cached > 3/4 min_nr_stripes ||
    341	 *          empty_inactive_list_nr > 0
    342	 *   - stripe cache pressure moderate:
    343	 *          total_cached > 1/2 min_nr_stripes
    344	 */
    345	if (total_cached > conf->min_nr_stripes * 1 / 2 ||
    346	    atomic_read(&conf->empty_inactive_list_nr) > 0)
    347		r5l_wake_reclaim(conf->log, 0);
    348}
    349
    350/*
    351 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
    352 * stripes in the cache
    353 */
    354void r5c_check_cached_full_stripe(struct r5conf *conf)
    355{
    356	if (!r5c_is_writeback(conf->log))
    357		return;
    358
    359	/*
    360	 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
    361	 * or a full stripe (chunk size / 4k stripes).
    362	 */
    363	if (atomic_read(&conf->r5c_cached_full_stripes) >=
    364	    min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
    365		conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf)))
    366		r5l_wake_reclaim(conf->log, 0);
    367}
    368
    369/*
    370 * Total log space (in sectors) needed to flush all data in cache
    371 *
    372 * To avoid deadlock due to log space, it is necessary to reserve log
    373 * space to flush critical stripes (stripes that occupy log space near
    374 * last_checkpoint). This function helps check how much log space is
    375 * required to flush all cached stripes.
    376 *
    377 * To reduce log space requirements, two mechanisms are used to give cache
    378 * flush higher priorities:
    379 *    1. In handle_stripe_dirtying() and schedule_reconstruction(),
    380 *       stripes ALREADY in journal can be flushed w/o pending writes;
    381 *    2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
    382 *       can be delayed (r5l_add_no_space_stripe).
    383 *
    384 * In cache flush, the stripe goes through 1 and then 2. For a stripe that
    385 * already passed 1, flushing it requires at most (conf->max_degraded + 1)
    386 * pages of journal space. For stripes that have not passed 1, flushing them
    387 * requires (conf->raid_disks + 1) pages of journal space. There are at
    388 * most (conf->group_cnt + 1) stripes that passed 1. So total journal space
    389 * required to flush all cached stripes (in pages) is:
    390 *
    391 *     (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
    392 *     (group_cnt + 1) * (raid_disks + 1)
    393 * or
    394 *     (stripe_in_journal_count) * (max_degraded + 1) +
    395 *     (group_cnt + 1) * (raid_disks - max_degraded)
    396 */
    397static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
    398{
    399	struct r5l_log *log = conf->log;
    400
    401	if (!r5c_is_writeback(log))
    402		return 0;
    403
    404	return BLOCK_SECTORS *
    405		((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
    406		 (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
    407}
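       /*
        * Worked example (hypothetical array): for RAID6 with raid_disks = 6,
        * max_degraded = 2, group_cnt = 0 and 100 stripes in the journal, the
        * formula above gives
        *
        *	BLOCK_SECTORS * ((2 + 1) * 100 + (6 - 2) * (0 + 1))
        *		= 8 * 304 = 2432 sectors of reserved log space.
        */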
    408
    409/*
    410 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
    411 *
    412 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
    413 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
    414 * device is less than 2x of reclaim_required_space.
    415 */
    416static inline void r5c_update_log_state(struct r5l_log *log)
    417{
    418	struct r5conf *conf = log->rdev->mddev->private;
    419	sector_t free_space;
    420	sector_t reclaim_space;
    421	bool wake_reclaim = false;
    422
    423	if (!r5c_is_writeback(log))
    424		return;
    425
    426	free_space = r5l_ring_distance(log, log->log_start,
    427				       log->last_checkpoint);
    428	reclaim_space = r5c_log_required_to_flush_cache(conf);
    429	if (free_space < 2 * reclaim_space)
    430		set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
    431	else {
    432		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
    433			wake_reclaim = true;
    434		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
    435	}
    436	if (free_space < 3 * reclaim_space)
    437		set_bit(R5C_LOG_TIGHT, &conf->cache_state);
    438	else
    439		clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
    440
    441	if (wake_reclaim)
    442		r5l_wake_reclaim(log, 0);
    443}
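       /*
        * Continuing the worked example above: with reclaim_space = 2432
        * sectors, R5C_LOG_CRITICAL is set once free space drops below
        * 2 * 2432 = 4864 sectors, and R5C_LOG_TIGHT below 3 * 2432 = 7296
        * sectors.
        */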
    444
    445/*
    446 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
    447 * This function should only be called in write-back mode.
    448 */
    449void r5c_make_stripe_write_out(struct stripe_head *sh)
    450{
    451	struct r5conf *conf = sh->raid_conf;
    452	struct r5l_log *log = conf->log;
    453
    454	BUG_ON(!r5c_is_writeback(log));
    455
    456	WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
    457	clear_bit(STRIPE_R5C_CACHING, &sh->state);
    458
    459	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
    460		atomic_inc(&conf->preread_active_stripes);
    461}
    462
    463static void r5c_handle_data_cached(struct stripe_head *sh)
    464{
    465	int i;
    466
    467	for (i = sh->disks; i--; )
    468		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
    469			set_bit(R5_InJournal, &sh->dev[i].flags);
    470			clear_bit(R5_LOCKED, &sh->dev[i].flags);
    471		}
    472	clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
    473}
    474
    475/*
    476 * this journal write must contain full parity,
    477 * it may also contain some data pages
    478 */
    479static void r5c_handle_parity_cached(struct stripe_head *sh)
    480{
    481	int i;
    482
    483	for (i = sh->disks; i--; )
    484		if (test_bit(R5_InJournal, &sh->dev[i].flags))
    485			set_bit(R5_Wantwrite, &sh->dev[i].flags);
    486}
    487
    488/*
    489 * Setting proper flags after writing (or flushing) data and/or parity to the
    490 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
    491 */
    492static void r5c_finish_cache_stripe(struct stripe_head *sh)
    493{
    494	struct r5l_log *log = sh->raid_conf->log;
    495
    496	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
    497		BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
    498		/*
    499		 * Set R5_InJournal for parity dev[pd_idx]. This means
    500		 * all data AND parity in the journal. For RAID 6, it is
    501		 * NOT necessary to set the flag for dev[qd_idx], as the
    502		 * two parities are written out together.
    503		 */
    504		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
    505	} else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
    506		r5c_handle_data_cached(sh);
    507	} else {
    508		r5c_handle_parity_cached(sh);
    509		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
    510	}
    511}
    512
    513static void r5l_io_run_stripes(struct r5l_io_unit *io)
    514{
    515	struct stripe_head *sh, *next;
    516
    517	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
    518		list_del_init(&sh->log_list);
    519
    520		r5c_finish_cache_stripe(sh);
    521
    522		set_bit(STRIPE_HANDLE, &sh->state);
    523		raid5_release_stripe(sh);
    524	}
    525}
    526
    527static void r5l_log_run_stripes(struct r5l_log *log)
    528{
    529	struct r5l_io_unit *io, *next;
    530
    531	lockdep_assert_held(&log->io_list_lock);
    532
    533	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
    534		/* don't change list order */
    535		if (io->state < IO_UNIT_IO_END)
    536			break;
    537
    538		list_move_tail(&io->log_sibling, &log->finished_ios);
    539		r5l_io_run_stripes(io);
    540	}
    541}
    542
    543static void r5l_move_to_end_ios(struct r5l_log *log)
    544{
    545	struct r5l_io_unit *io, *next;
    546
    547	lockdep_assert_held(&log->io_list_lock);
    548
    549	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
    550		/* don't change list order */
    551		if (io->state < IO_UNIT_IO_END)
    552			break;
    553		list_move_tail(&io->log_sibling, &log->io_end_ios);
    554	}
    555}
    556
    557static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
    558static void r5l_log_endio(struct bio *bio)
    559{
    560	struct r5l_io_unit *io = bio->bi_private;
    561	struct r5l_io_unit *io_deferred;
    562	struct r5l_log *log = io->log;
    563	unsigned long flags;
    564	bool has_null_flush;
    565	bool has_flush_payload;
    566
    567	if (bio->bi_status)
    568		md_error(log->rdev->mddev, log->rdev);
    569
    570	bio_put(bio);
    571	mempool_free(io->meta_page, &log->meta_pool);
    572
    573	spin_lock_irqsave(&log->io_list_lock, flags);
    574	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
    575
    576	/*
    577	 * if the io doesn't have null_flush or flush payload,
    578	 * it is not safe to access it after releasing io_list_lock.
    579	 * Therefore, it is necessary to check the condition with
    580	 * the lock held.
    581	 */
    582	has_null_flush = io->has_null_flush;
    583	has_flush_payload = io->has_flush_payload;
    584
    585	if (log->need_cache_flush && !list_empty(&io->stripe_list))
    586		r5l_move_to_end_ios(log);
    587	else
    588		r5l_log_run_stripes(log);
    589	if (!list_empty(&log->running_ios)) {
    590		/*
    591		 * FLUSH/FUA io_unit is deferred because of ordering, now we
    592		 * can dispatch it
    593		 */
    594		io_deferred = list_first_entry(&log->running_ios,
    595					       struct r5l_io_unit, log_sibling);
    596		if (io_deferred->io_deferred)
    597			schedule_work(&log->deferred_io_work);
    598	}
    599
    600	spin_unlock_irqrestore(&log->io_list_lock, flags);
    601
    602	if (log->need_cache_flush)
    603		md_wakeup_thread(log->rdev->mddev->thread);
    604
    605	/* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */
    606	if (has_null_flush) {
    607		struct bio *bi;
    608
    609		WARN_ON(bio_list_empty(&io->flush_barriers));
    610		while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
    611			bio_endio(bi);
    612			if (atomic_dec_and_test(&io->pending_stripe)) {
    613				__r5l_stripe_write_finished(io);
    614				return;
    615			}
    616		}
    617	}
    618	/* decrease pending_stripe for flush payload */
    619	if (has_flush_payload)
    620		if (atomic_dec_and_test(&io->pending_stripe))
    621			__r5l_stripe_write_finished(io);
    622}
    623
    624static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
    625{
    626	unsigned long flags;
    627
    628	spin_lock_irqsave(&log->io_list_lock, flags);
    629	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
    630	spin_unlock_irqrestore(&log->io_list_lock, flags);
    631
    632	/*
    633	 * In case of journal device failure, submit_bio will get an error
    634	 * and call endio, then the active stripes will continue the write
    635	 * process. Therefore, it is not necessary to check the Faulty bit
    636	 * of journal device here.
    637	 *
    638	 * We can't check split_bio after current_bio is submitted. If
    639	 * io->split_bio is null, after current_bio is submitted, current_bio
    640	 * might already be completed and the io_unit is freed. We submit
    641	 * split_bio first to avoid the issue.
    642	 */
    643	if (io->split_bio) {
    644		if (io->has_flush)
    645			io->split_bio->bi_opf |= REQ_PREFLUSH;
    646		if (io->has_fua)
    647			io->split_bio->bi_opf |= REQ_FUA;
    648		submit_bio(io->split_bio);
    649	}
    650
    651	if (io->has_flush)
    652		io->current_bio->bi_opf |= REQ_PREFLUSH;
    653	if (io->has_fua)
    654		io->current_bio->bi_opf |= REQ_FUA;
    655	submit_bio(io->current_bio);
    656}
    657
    658/* deferred io_unit will be dispatched here */
    659static void r5l_submit_io_async(struct work_struct *work)
    660{
    661	struct r5l_log *log = container_of(work, struct r5l_log,
    662					   deferred_io_work);
    663	struct r5l_io_unit *io = NULL;
    664	unsigned long flags;
    665
    666	spin_lock_irqsave(&log->io_list_lock, flags);
    667	if (!list_empty(&log->running_ios)) {
    668		io = list_first_entry(&log->running_ios, struct r5l_io_unit,
    669				      log_sibling);
    670		if (!io->io_deferred)
    671			io = NULL;
    672		else
    673			io->io_deferred = 0;
    674	}
    675	spin_unlock_irqrestore(&log->io_list_lock, flags);
    676	if (io)
    677		r5l_do_submit_io(log, io);
    678}
    679
    680static void r5c_disable_writeback_async(struct work_struct *work)
    681{
    682	struct r5l_log *log = container_of(work, struct r5l_log,
    683					   disable_writeback_work);
    684	struct mddev *mddev = log->rdev->mddev;
    685	struct r5conf *conf = mddev->private;
    686	int locked = 0;
    687
    688	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
    689		return;
    690	pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
    691		mdname(mddev));
    692
    693	/* wait for the superblock change before suspend */
    694	wait_event(mddev->sb_wait,
    695		   conf->log == NULL ||
    696		   (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
    697		    (locked = mddev_trylock(mddev))));
    698	if (locked) {
    699		mddev_suspend(mddev);
    700		log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
    701		mddev_resume(mddev);
    702		mddev_unlock(mddev);
    703	}
    704}
    705
    706static void r5l_submit_current_io(struct r5l_log *log)
    707{
    708	struct r5l_io_unit *io = log->current_io;
    709	struct r5l_meta_block *block;
    710	unsigned long flags;
    711	u32 crc;
    712	bool do_submit = true;
    713
    714	if (!io)
    715		return;
    716
    717	block = page_address(io->meta_page);
    718	block->meta_size = cpu_to_le32(io->meta_offset);
    719	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
    720	block->checksum = cpu_to_le32(crc);
    721
    722	log->current_io = NULL;
    723	spin_lock_irqsave(&log->io_list_lock, flags);
    724	if (io->has_flush || io->has_fua) {
    725		if (io != list_first_entry(&log->running_ios,
    726					   struct r5l_io_unit, log_sibling)) {
    727			io->io_deferred = 1;
    728			do_submit = false;
    729		}
    730	}
    731	spin_unlock_irqrestore(&log->io_list_lock, flags);
    732	if (do_submit)
    733		r5l_do_submit_io(log, io);
    734}
    735
    736static struct bio *r5l_bio_alloc(struct r5l_log *log)
    737{
    738	struct bio *bio = bio_alloc_bioset(log->rdev->bdev, BIO_MAX_VECS,
    739					   REQ_OP_WRITE, GFP_NOIO, &log->bs);
    740
    741	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
    742
    743	return bio;
    744}
    745
    746static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
    747{
    748	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
    749
    750	r5c_update_log_state(log);
    751	/*
    752	 * If we filled up the log device, start from the beginning again,
    753	 * which will require a new bio.
    754	 *
    755	 * Note: for this to work properly the log size needs to be a multiple
    756	 * of BLOCK_SECTORS.
    757	 */
    758	if (log->log_start == 0)
    759		io->need_split_bio = true;
    760
    761	io->log_end = log->log_start;
    762}
    763
    764static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
    765{
    766	struct r5l_io_unit *io;
    767	struct r5l_meta_block *block;
    768
    769	io = mempool_alloc(&log->io_pool, GFP_ATOMIC);
    770	if (!io)
    771		return NULL;
    772	memset(io, 0, sizeof(*io));
    773
    774	io->log = log;
    775	INIT_LIST_HEAD(&io->log_sibling);
    776	INIT_LIST_HEAD(&io->stripe_list);
    777	bio_list_init(&io->flush_barriers);
    778	io->state = IO_UNIT_RUNNING;
    779
    780	io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO);
    781	block = page_address(io->meta_page);
    782	clear_page(block);
    783	block->magic = cpu_to_le32(R5LOG_MAGIC);
    784	block->version = R5LOG_VERSION;
    785	block->seq = cpu_to_le64(log->seq);
    786	block->position = cpu_to_le64(log->log_start);
    787
    788	io->log_start = log->log_start;
    789	io->meta_offset = sizeof(struct r5l_meta_block);
    790	io->seq = log->seq++;
    791
    792	io->current_bio = r5l_bio_alloc(log);
    793	io->current_bio->bi_end_io = r5l_log_endio;
    794	io->current_bio->bi_private = io;
    795	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
    796
    797	r5_reserve_log_entry(log, io);
    798
    799	spin_lock_irq(&log->io_list_lock);
    800	list_add_tail(&io->log_sibling, &log->running_ios);
    801	spin_unlock_irq(&log->io_list_lock);
    802
    803	return io;
    804}
    805
    806static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
    807{
    808	if (log->current_io &&
    809	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
    810		r5l_submit_current_io(log);
    811
    812	if (!log->current_io) {
    813		log->current_io = r5l_new_meta(log);
    814		if (!log->current_io)
    815			return -ENOMEM;
    816	}
    817
    818	return 0;
    819}
    820
    821static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
    822				    sector_t location,
    823				    u32 checksum1, u32 checksum2,
    824				    bool checksum2_valid)
    825{
    826	struct r5l_io_unit *io = log->current_io;
    827	struct r5l_payload_data_parity *payload;
    828
    829	payload = page_address(io->meta_page) + io->meta_offset;
    830	payload->header.type = cpu_to_le16(type);
    831	payload->header.flags = cpu_to_le16(0);
    832	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
    833				    (PAGE_SHIFT - 9));
    834	payload->location = cpu_to_le64(location);
    835	payload->checksum[0] = cpu_to_le32(checksum1);
    836	if (checksum2_valid)
    837		payload->checksum[1] = cpu_to_le32(checksum2);
    838
    839	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
    840		sizeof(__le32) * (1 + !!checksum2_valid);
    841}
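       /*
        * For example, with 4kB pages (PAGE_SHIFT - 9 == 3) a data payload
        * covers 1 << 3 = 8 sectors, and a RAID6 parity payload with both
        * checksums valid covers 2 << 3 = 16 sectors, which is what
        * payload->size records above.
        */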
    842
    843static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
    844{
    845	struct r5l_io_unit *io = log->current_io;
    846
    847	if (io->need_split_bio) {
    848		BUG_ON(io->split_bio);
    849		io->split_bio = io->current_bio;
    850		io->current_bio = r5l_bio_alloc(log);
    851		bio_chain(io->current_bio, io->split_bio);
    852		io->need_split_bio = false;
    853	}
    854
    855	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
    856		BUG();
    857
    858	r5_reserve_log_entry(log, io);
    859}
    860
    861static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect)
    862{
    863	struct mddev *mddev = log->rdev->mddev;
    864	struct r5conf *conf = mddev->private;
    865	struct r5l_io_unit *io;
    866	struct r5l_payload_flush *payload;
    867	int meta_size;
    868
    869	/*
    870	 * payload_flush requires extra writes to the journal.
    871	 * To avoid handling the extra IO in quiesce, just skip
    872	 * flush_payload
    873	 */
    874	if (conf->quiesce)
    875		return;
    876
    877	mutex_lock(&log->io_mutex);
    878	meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64);
    879
    880	if (r5l_get_meta(log, meta_size)) {
    881		mutex_unlock(&log->io_mutex);
    882		return;
    883	}
    884
    885	/* current implementation is one stripe per flush payload */
    886	io = log->current_io;
    887	payload = page_address(io->meta_page) + io->meta_offset;
    888	payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH);
    889	payload->header.flags = cpu_to_le16(0);
    890	payload->size = cpu_to_le32(sizeof(__le64));
    891	payload->flush_stripes[0] = cpu_to_le64(sect);
    892	io->meta_offset += meta_size;
    893	/* multiple flush payloads count as one pending_stripe */
    894	if (!io->has_flush_payload) {
    895		io->has_flush_payload = 1;
    896		atomic_inc(&io->pending_stripe);
    897	}
    898	mutex_unlock(&log->io_mutex);
    899}
    900
    901static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
    902			   int data_pages, int parity_pages)
    903{
    904	int i;
    905	int meta_size;
    906	int ret;
    907	struct r5l_io_unit *io;
    908
    909	meta_size =
    910		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
    911		 * data_pages) +
    912		sizeof(struct r5l_payload_data_parity) +
    913		sizeof(__le32) * parity_pages;
    914
    915	ret = r5l_get_meta(log, meta_size);
    916	if (ret)
    917		return ret;
    918
    919	io = log->current_io;
    920
    921	if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
    922		io->has_flush = 1;
    923
    924	for (i = 0; i < sh->disks; i++) {
    925		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
    926		    test_bit(R5_InJournal, &sh->dev[i].flags))
    927			continue;
    928		if (i == sh->pd_idx || i == sh->qd_idx)
    929			continue;
    930		if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
    931		    log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
    932			io->has_fua = 1;
    933			/*
    934			 * we need to flush journal to make sure recovery can
    935			 * reach the data with fua flag
    936			 */
    937			io->has_flush = 1;
    938		}
    939		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
    940					raid5_compute_blocknr(sh, i, 0),
    941					sh->dev[i].log_checksum, 0, false);
    942		r5l_append_payload_page(log, sh->dev[i].page);
    943	}
    944
    945	if (parity_pages == 2) {
    946		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
    947					sh->sector, sh->dev[sh->pd_idx].log_checksum,
    948					sh->dev[sh->qd_idx].log_checksum, true);
    949		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
    950		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
    951	} else if (parity_pages == 1) {
    952		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
    953					sh->sector, sh->dev[sh->pd_idx].log_checksum,
    954					0, false);
    955		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
    956	} else  /* Just writing data, not parity, in caching phase */
    957		BUG_ON(parity_pages != 0);
    958
    959	list_add_tail(&sh->log_list, &io->stripe_list);
    960	atomic_inc(&io->pending_stripe);
    961	sh->log_io = io;
    962
    963	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
    964		return 0;
    965
    966	if (sh->log_start == MaxSector) {
    967		BUG_ON(!list_empty(&sh->r5c));
    968		sh->log_start = io->log_start;
    969		spin_lock_irq(&log->stripe_in_journal_lock);
    970		list_add_tail(&sh->r5c,
    971			      &log->stripe_in_journal_list);
    972		spin_unlock_irq(&log->stripe_in_journal_lock);
    973		atomic_inc(&log->stripe_in_journal_count);
    974	}
    975	return 0;
    976}
    977
    978/* add stripe to no_space_stripes, and then wake up reclaim */
    979static inline void r5l_add_no_space_stripe(struct r5l_log *log,
    980					   struct stripe_head *sh)
    981{
    982	spin_lock(&log->no_space_stripes_lock);
    983	list_add_tail(&sh->log_list, &log->no_space_stripes);
    984	spin_unlock(&log->no_space_stripes_lock);
    985}
    986
    987/*
    988 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
    989 * data from log to raid disks), so we shouldn't wait for reclaim here
    990 */
    991int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
    992{
    993	struct r5conf *conf = sh->raid_conf;
    994	int write_disks = 0;
    995	int data_pages, parity_pages;
    996	int reserve;
    997	int i;
    998	int ret = 0;
    999	bool wake_reclaim = false;
   1000
   1001	if (!log)
   1002		return -EAGAIN;
   1003	/* Don't support stripe batch */
   1004	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
   1005	    test_bit(STRIPE_SYNCING, &sh->state)) {
   1006		/* the stripe is written to log, we start writing it to raid */
   1007		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
   1008		return -EAGAIN;
   1009	}
   1010
   1011	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
   1012
   1013	for (i = 0; i < sh->disks; i++) {
   1014		void *addr;
   1015
   1016		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
   1017		    test_bit(R5_InJournal, &sh->dev[i].flags))
   1018			continue;
   1019
   1020		write_disks++;
   1021		/* checksum is already calculated in last run */
   1022		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
   1023			continue;
   1024		addr = kmap_atomic(sh->dev[i].page);
   1025		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
   1026						    addr, PAGE_SIZE);
   1027		kunmap_atomic(addr);
   1028	}
   1029	parity_pages = 1 + !!(sh->qd_idx >= 0);
   1030	data_pages = write_disks - parity_pages;
   1031
   1032	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
   1033	/*
   1034	 * The stripe must enter state machine again to finish the write, so
   1035	 * don't delay.
   1036	 */
   1037	clear_bit(STRIPE_DELAYED, &sh->state);
   1038	atomic_inc(&sh->count);
   1039
   1040	mutex_lock(&log->io_mutex);
   1041	/* meta + data */
   1042	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
   1043
   1044	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
   1045		if (!r5l_has_free_space(log, reserve)) {
   1046			r5l_add_no_space_stripe(log, sh);
   1047			wake_reclaim = true;
   1048		} else {
   1049			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
   1050			if (ret) {
   1051				spin_lock_irq(&log->io_list_lock);
   1052				list_add_tail(&sh->log_list,
   1053					      &log->no_mem_stripes);
   1054				spin_unlock_irq(&log->io_list_lock);
   1055			}
   1056		}
   1057	} else {  /* R5C_JOURNAL_MODE_WRITE_BACK */
   1058		/*
   1059		 * log space critical, do not process stripes that are
   1060		 * not in cache yet (sh->log_start == MaxSector).
   1061		 */
   1062		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
   1063		    sh->log_start == MaxSector) {
   1064			r5l_add_no_space_stripe(log, sh);
   1065			wake_reclaim = true;
   1066			reserve = 0;
   1067		} else if (!r5l_has_free_space(log, reserve)) {
   1068			if (sh->log_start == log->last_checkpoint)
   1069				BUG();
   1070			else
   1071				r5l_add_no_space_stripe(log, sh);
   1072		} else {
   1073			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
   1074			if (ret) {
   1075				spin_lock_irq(&log->io_list_lock);
   1076				list_add_tail(&sh->log_list,
   1077					      &log->no_mem_stripes);
   1078				spin_unlock_irq(&log->io_list_lock);
   1079			}
   1080		}
   1081	}
   1082
   1083	mutex_unlock(&log->io_mutex);
   1084	if (wake_reclaim)
   1085		r5l_wake_reclaim(log, reserve);
   1086	return 0;
   1087}
   1088
   1089void r5l_write_stripe_run(struct r5l_log *log)
   1090{
   1091	if (!log)
   1092		return;
   1093	mutex_lock(&log->io_mutex);
   1094	r5l_submit_current_io(log);
   1095	mutex_unlock(&log->io_mutex);
   1096}
   1097
   1098int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
   1099{
   1100	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
   1101		/*
   1102		 * in write through (journal only)
   1103		 * we flush log disk cache first, then write stripe data to
   1104		 * raid disks. So if bio is finished, the log disk cache is
   1105		 * flushed already. The recovery guarantees we can recover
   1106		 * the bio from the log disk, so we don't need to flush again
   1107		 */
   1108		if (bio->bi_iter.bi_size == 0) {
   1109			bio_endio(bio);
   1110			return 0;
   1111		}
   1112		bio->bi_opf &= ~REQ_PREFLUSH;
   1113	} else {
   1114		/* write back (with cache) */
   1115		if (bio->bi_iter.bi_size == 0) {
   1116			mutex_lock(&log->io_mutex);
   1117			r5l_get_meta(log, 0);
   1118			bio_list_add(&log->current_io->flush_barriers, bio);
   1119			log->current_io->has_flush = 1;
   1120			log->current_io->has_null_flush = 1;
   1121			atomic_inc(&log->current_io->pending_stripe);
   1122			r5l_submit_current_io(log);
   1123			mutex_unlock(&log->io_mutex);
   1124			return 0;
   1125		}
   1126	}
   1127	return -EAGAIN;
   1128}
   1129
   1130/* This will run after log space is reclaimed */
   1131static void r5l_run_no_space_stripes(struct r5l_log *log)
   1132{
   1133	struct stripe_head *sh;
   1134
   1135	spin_lock(&log->no_space_stripes_lock);
   1136	while (!list_empty(&log->no_space_stripes)) {
   1137		sh = list_first_entry(&log->no_space_stripes,
   1138				      struct stripe_head, log_list);
   1139		list_del_init(&sh->log_list);
   1140		set_bit(STRIPE_HANDLE, &sh->state);
   1141		raid5_release_stripe(sh);
   1142	}
   1143	spin_unlock(&log->no_space_stripes_lock);
   1144}
   1145
   1146/*
   1147 * calculate new last_checkpoint
   1148 * for write through mode, returns log->next_checkpoint
   1149 * for write back, returns log_start of first sh in stripe_in_journal_list
   1150 */
   1151static sector_t r5c_calculate_new_cp(struct r5conf *conf)
   1152{
   1153	struct stripe_head *sh;
   1154	struct r5l_log *log = conf->log;
   1155	sector_t new_cp;
   1156	unsigned long flags;
   1157
   1158	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
   1159		return log->next_checkpoint;
   1160
   1161	spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
   1162	if (list_empty(&conf->log->stripe_in_journal_list)) {
   1163		/* all stripes flushed */
   1164		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
   1165		return log->next_checkpoint;
   1166	}
   1167	sh = list_first_entry(&conf->log->stripe_in_journal_list,
   1168			      struct stripe_head, r5c);
   1169	new_cp = sh->log_start;
   1170	spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
   1171	return new_cp;
   1172}
   1173
   1174static sector_t r5l_reclaimable_space(struct r5l_log *log)
   1175{
   1176	struct r5conf *conf = log->rdev->mddev->private;
   1177
   1178	return r5l_ring_distance(log, log->last_checkpoint,
   1179				 r5c_calculate_new_cp(conf));
   1180}
   1181
   1182static void r5l_run_no_mem_stripe(struct r5l_log *log)
   1183{
   1184	struct stripe_head *sh;
   1185
   1186	lockdep_assert_held(&log->io_list_lock);
   1187
   1188	if (!list_empty(&log->no_mem_stripes)) {
   1189		sh = list_first_entry(&log->no_mem_stripes,
   1190				      struct stripe_head, log_list);
   1191		list_del_init(&sh->log_list);
   1192		set_bit(STRIPE_HANDLE, &sh->state);
   1193		raid5_release_stripe(sh);
   1194	}
   1195}
   1196
   1197static bool r5l_complete_finished_ios(struct r5l_log *log)
   1198{
   1199	struct r5l_io_unit *io, *next;
   1200	bool found = false;
   1201
   1202	lockdep_assert_held(&log->io_list_lock);
   1203
   1204	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
   1205		/* don't change list order */
   1206		if (io->state < IO_UNIT_STRIPE_END)
   1207			break;
   1208
   1209		log->next_checkpoint = io->log_start;
   1210
   1211		list_del(&io->log_sibling);
   1212		mempool_free(io, &log->io_pool);
   1213		r5l_run_no_mem_stripe(log);
   1214
   1215		found = true;
   1216	}
   1217
   1218	return found;
   1219}
   1220
   1221static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
   1222{
   1223	struct r5l_log *log = io->log;
   1224	struct r5conf *conf = log->rdev->mddev->private;
   1225	unsigned long flags;
   1226
   1227	spin_lock_irqsave(&log->io_list_lock, flags);
   1228	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
   1229
   1230	if (!r5l_complete_finished_ios(log)) {
   1231		spin_unlock_irqrestore(&log->io_list_lock, flags);
   1232		return;
   1233	}
   1234
   1235	if (r5l_reclaimable_space(log) > log->max_free_space ||
   1236	    test_bit(R5C_LOG_TIGHT, &conf->cache_state))
   1237		r5l_wake_reclaim(log, 0);
   1238
   1239	spin_unlock_irqrestore(&log->io_list_lock, flags);
   1240	wake_up(&log->iounit_wait);
   1241}
   1242
   1243void r5l_stripe_write_finished(struct stripe_head *sh)
   1244{
   1245	struct r5l_io_unit *io;
   1246
   1247	io = sh->log_io;
   1248	sh->log_io = NULL;
   1249
   1250	if (io && atomic_dec_and_test(&io->pending_stripe))
   1251		__r5l_stripe_write_finished(io);
   1252}
   1253
   1254static void r5l_log_flush_endio(struct bio *bio)
   1255{
   1256	struct r5l_log *log = container_of(bio, struct r5l_log,
   1257		flush_bio);
   1258	unsigned long flags;
   1259	struct r5l_io_unit *io;
   1260
   1261	if (bio->bi_status)
   1262		md_error(log->rdev->mddev, log->rdev);
   1263
   1264	spin_lock_irqsave(&log->io_list_lock, flags);
   1265	list_for_each_entry(io, &log->flushing_ios, log_sibling)
   1266		r5l_io_run_stripes(io);
   1267	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
   1268	spin_unlock_irqrestore(&log->io_list_lock, flags);
   1269
   1270	bio_uninit(bio);
   1271}
   1272
   1273/*
   1274 * Start dispatching IO to the raid.
   1275 * The log consists of io_units (meta). There is one situation we want to avoid:
   1276 * a broken meta in the middle of the log means recovery can't find the meta at
   1277 * the head of the log. If an operation requires the meta at the head to be
   1278 * persistent in the log, the meta before it must be persistent in the log too.
   1279 * A case is: stripe data/parity is in the log and we start writing the stripe
   1280 * to the raid disks; the stripe data/parity must be persistent in the log
   1281 * before we do the write to the raid disks.
   1282 *
   1283 * The solution is to strictly maintain io_unit list order: we only write
   1284 * stripes of an io_unit to the raid disks once that io_unit is the first one
   1285 * whose data/parity is in the log.
   1286 */
   1287void r5l_flush_stripe_to_raid(struct r5l_log *log)
   1288{
   1289	bool do_flush;
   1290
   1291	if (!log || !log->need_cache_flush)
   1292		return;
   1293
   1294	spin_lock_irq(&log->io_list_lock);
   1295	/* flush bio is running */
   1296	if (!list_empty(&log->flushing_ios)) {
   1297		spin_unlock_irq(&log->io_list_lock);
   1298		return;
   1299	}
   1300	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
   1301	do_flush = !list_empty(&log->flushing_ios);
   1302	spin_unlock_irq(&log->io_list_lock);
   1303
   1304	if (!do_flush)
   1305		return;
   1306	bio_init(&log->flush_bio, log->rdev->bdev, NULL, 0,
   1307		  REQ_OP_WRITE | REQ_PREFLUSH);
   1308	log->flush_bio.bi_end_io = r5l_log_flush_endio;
   1309	submit_bio(&log->flush_bio);
   1310}
   1311
   1312static void r5l_write_super(struct r5l_log *log, sector_t cp);
   1313static void r5l_write_super_and_discard_space(struct r5l_log *log,
   1314	sector_t end)
   1315{
   1316	struct block_device *bdev = log->rdev->bdev;
   1317	struct mddev *mddev;
   1318
   1319	r5l_write_super(log, end);
   1320
   1321	if (!bdev_max_discard_sectors(bdev))
   1322		return;
   1323
   1324	mddev = log->rdev->mddev;
   1325	/*
   1326	 * Discard could zero data, so before discard we must make sure
   1327	 * superblock is updated to new log tail. Updating superblock (either
   1328	 * directly call md_update_sb() or depend on md thread) must hold
   1329	 * reconfig mutex. On the other hand, raid5_quiesce is called with
   1330	 * reconfig_mutex held. The first step of raid5_quiesce() is waiting
   1331	 * for all IO to finish, hence waiting for the reclaim thread, while the
   1332	 * reclaim thread is calling this function and waiting for the reconfig
   1333	 * mutex. So there is a deadlock. We work around this issue with a trylock.
   1334	 * FIXME: we could miss discard if we can't take reconfig mutex
   1335	 */
   1336	set_mask_bits(&mddev->sb_flags, 0,
   1337		BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
   1338	if (!mddev_trylock(mddev))
   1339		return;
   1340	md_update_sb(mddev, 1);
   1341	mddev_unlock(mddev);
   1342
   1343	/* discard IO error really doesn't matter, ignore it */
   1344	if (log->last_checkpoint < end) {
   1345		blkdev_issue_discard(bdev,
   1346				log->last_checkpoint + log->rdev->data_offset,
   1347				end - log->last_checkpoint, GFP_NOIO);
   1348	} else {
   1349		blkdev_issue_discard(bdev,
   1350				log->last_checkpoint + log->rdev->data_offset,
   1351				log->device_size - log->last_checkpoint,
   1352				GFP_NOIO);
   1353		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
   1354				GFP_NOIO);
   1355	}
   1356}
   1357
   1358/*
   1359 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
   1360 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
   1361 *
   1362 * must hold conf->device_lock
   1363 */
   1364static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
   1365{
   1366	BUG_ON(list_empty(&sh->lru));
   1367	BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
   1368	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
   1369
   1370	/*
   1371	 * The stripe is not ON_RELEASE_LIST, so it is safe to call
   1372	 * raid5_release_stripe() while holding conf->device_lock
   1373	 */
   1374	BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
   1375	lockdep_assert_held(&conf->device_lock);
   1376
   1377	list_del_init(&sh->lru);
   1378	atomic_inc(&sh->count);
   1379
   1380	set_bit(STRIPE_HANDLE, &sh->state);
   1381	atomic_inc(&conf->active_stripes);
   1382	r5c_make_stripe_write_out(sh);
   1383
   1384	if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
   1385		atomic_inc(&conf->r5c_flushing_partial_stripes);
   1386	else
   1387		atomic_inc(&conf->r5c_flushing_full_stripes);
   1388	raid5_release_stripe(sh);
   1389}
   1390
   1391/*
   1392 * if num == 0, flush all full stripes
   1393 * if num > 0, flush all full stripes. If fewer than num full stripes are
   1394 *             flushed, flush some partial stripes until a total of num stripes
   1395 *             are flushed or there are no more cached stripes.
   1396 */
   1397void r5c_flush_cache(struct r5conf *conf, int num)
   1398{
   1399	int count;
   1400	struct stripe_head *sh, *next;
   1401
   1402	lockdep_assert_held(&conf->device_lock);
   1403	if (!conf->log)
   1404		return;
   1405
   1406	count = 0;
   1407	list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
   1408		r5c_flush_stripe(conf, sh);
   1409		count++;
   1410	}
   1411
   1412	if (count >= num)
   1413		return;
   1414	list_for_each_entry_safe(sh, next,
   1415				 &conf->r5c_partial_stripe_list, lru) {
   1416		r5c_flush_stripe(conf, sh);
   1417		if (++count >= num)
   1418			break;
   1419	}
   1420}
   1421
   1422static void r5c_do_reclaim(struct r5conf *conf)
   1423{
   1424	struct r5l_log *log = conf->log;
   1425	struct stripe_head *sh;
   1426	int count = 0;
   1427	unsigned long flags;
   1428	int total_cached;
   1429	int stripes_to_flush;
   1430	int flushing_partial, flushing_full;
   1431
   1432	if (!r5c_is_writeback(log))
   1433		return;
   1434
   1435	flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes);
   1436	flushing_full = atomic_read(&conf->r5c_flushing_full_stripes);
   1437	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
   1438		atomic_read(&conf->r5c_cached_full_stripes) -
   1439		flushing_full - flushing_partial;
   1440
   1441	if (total_cached > conf->min_nr_stripes * 3 / 4 ||
   1442	    atomic_read(&conf->empty_inactive_list_nr) > 0)
   1443		/*
   1444		 * if stripe cache pressure is high, flush all full stripes and
   1445		 * some partial stripes
   1446		 */
   1447		stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
   1448	else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
   1449		 atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
   1450		 R5C_FULL_STRIPE_FLUSH_BATCH(conf))
   1451		/*
   1452		 * if stripe cache pressure is moderate, or if there are many full
   1453		 * stripes, flush all full stripes
   1454		 */
   1455		stripes_to_flush = 0;
   1456	else
   1457		/* no need to flush */
   1458		stripes_to_flush = -1;
   1459
   1460	if (stripes_to_flush >= 0) {
   1461		spin_lock_irqsave(&conf->device_lock, flags);
   1462		r5c_flush_cache(conf, stripes_to_flush);
   1463		spin_unlock_irqrestore(&conf->device_lock, flags);
   1464	}
   1465
   1466	/* if log space is tight, flush stripes on stripe_in_journal_list */
   1467	if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
   1468		spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
   1469		spin_lock(&conf->device_lock);
   1470		list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
   1471			/*
   1472			 * stripes on stripe_in_journal_list could be in any
   1473			 * state of the stripe_cache state machine. In this
   1474			 * case, we only want to flush stripe on
   1475			 * r5c_cached_full/partial_stripes. The following
   1476			 * condition makes sure the stripe is on one of the
   1477			 * two lists.
   1478			 */
   1479			if (!list_empty(&sh->lru) &&
   1480			    !test_bit(STRIPE_HANDLE, &sh->state) &&
   1481			    atomic_read(&sh->count) == 0) {
   1482				r5c_flush_stripe(conf, sh);
   1483				if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
   1484					break;
   1485			}
   1486		}
   1487		spin_unlock(&conf->device_lock);
   1488		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
   1489	}
   1490
   1491	if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
   1492		r5l_run_no_space_stripes(log);
   1493
   1494	md_wakeup_thread(conf->mddev->thread);
   1495}
   1496
   1497static void r5l_do_reclaim(struct r5l_log *log)
   1498{
   1499	struct r5conf *conf = log->rdev->mddev->private;
   1500	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
   1501	sector_t reclaimable;
   1502	sector_t next_checkpoint;
   1503	bool write_super;
   1504
   1505	spin_lock_irq(&log->io_list_lock);
   1506	write_super = r5l_reclaimable_space(log) > log->max_free_space ||
   1507		reclaim_target != 0 || !list_empty(&log->no_space_stripes);
   1508	/*
   1509	 * move proper io_unit to reclaim list. We should not change the order.
   1510	 * reclaimable/unreclaimable io_units can be mixed in the list; we
   1511	 * shouldn't reuse space of an unreclaimable io_unit
   1512	 */
   1513	while (1) {
   1514		reclaimable = r5l_reclaimable_space(log);
   1515		if (reclaimable >= reclaim_target ||
   1516		    (list_empty(&log->running_ios) &&
   1517		     list_empty(&log->io_end_ios) &&
   1518		     list_empty(&log->flushing_ios) &&
   1519		     list_empty(&log->finished_ios)))
   1520			break;
   1521
   1522		md_wakeup_thread(log->rdev->mddev->thread);
   1523		wait_event_lock_irq(log->iounit_wait,
   1524				    r5l_reclaimable_space(log) > reclaimable,
   1525				    log->io_list_lock);
   1526	}
   1527
   1528	next_checkpoint = r5c_calculate_new_cp(conf);
   1529	spin_unlock_irq(&log->io_list_lock);
   1530
   1531	if (reclaimable == 0 || !write_super)
   1532		return;
   1533
   1534	/*
   1535	 * write_super will flush cache of each raid disk. We must write super
   1536	 * here, because the log area might be reused soon and we don't want to
   1537	 * confuse recovery
   1538	 */
   1539	r5l_write_super_and_discard_space(log, next_checkpoint);
   1540
   1541	mutex_lock(&log->io_mutex);
   1542	log->last_checkpoint = next_checkpoint;
   1543	r5c_update_log_state(log);
   1544	mutex_unlock(&log->io_mutex);
   1545
   1546	r5l_run_no_space_stripes(log);
   1547}
   1548
   1549static void r5l_reclaim_thread(struct md_thread *thread)
   1550{
   1551	struct mddev *mddev = thread->mddev;
   1552	struct r5conf *conf = mddev->private;
   1553	struct r5l_log *log = conf->log;
   1554
   1555	if (!log)
   1556		return;
   1557	r5c_do_reclaim(conf);
   1558	r5l_do_reclaim(log);
   1559}
   1560
   1561void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
   1562{
   1563	unsigned long target;
   1564	unsigned long new = (unsigned long)space; /* overflow in theory */
   1565
   1566	if (!log)
   1567		return;
   1568	do {
   1569		target = log->reclaim_target;
   1570		if (new < target)
   1571			return;
   1572	} while (cmpxchg(&log->reclaim_target, target, new) != target);
   1573	md_wakeup_thread(log->reclaim_thread);
   1574}
   1575
   1576void r5l_quiesce(struct r5l_log *log, int quiesce)
   1577{
   1578	struct mddev *mddev;
   1579
   1580	if (quiesce) {
   1581		/* make sure r5l_write_super_and_discard_space exits */
   1582		mddev = log->rdev->mddev;
   1583		wake_up(&mddev->sb_wait);
   1584		kthread_park(log->reclaim_thread->tsk);
   1585		r5l_wake_reclaim(log, MaxSector);
   1586		r5l_do_reclaim(log);
   1587	} else
   1588		kthread_unpark(log->reclaim_thread->tsk);
   1589}
   1590
   1591bool r5l_log_disk_error(struct r5conf *conf)
   1592{
   1593	struct r5l_log *log;
   1594	bool ret;
   1595	/* don't allow write if journal disk is missing */
   1596	rcu_read_lock();
   1597	log = rcu_dereference(conf->log);
   1598
   1599	if (!log)
   1600		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
   1601	else
   1602		ret = test_bit(Faulty, &log->rdev->flags);
   1603	rcu_read_unlock();
   1604	return ret;
   1605}
   1606
   1607#define R5L_RECOVERY_PAGE_POOL_SIZE 256
   1608
   1609struct r5l_recovery_ctx {
   1610	struct page *meta_page;		/* current meta */
   1611	sector_t meta_total_blocks;	/* total size of current meta and data */
   1612	sector_t pos;			/* recovery position */
   1613	u64 seq;			/* recovery position seq */
   1614	int data_parity_stripes;	/* number of data_parity stripes */
   1615	int data_only_stripes;		/* number of data_only stripes */
   1616	struct list_head cached_list;
   1617
   1618	/*
   1619	 * read ahead page pool (ra_pool)
   1620	 * in recovery, log is read sequentially. It is not efficient to
   1621	 * read every page with sync_page_io(). The read ahead page pool
    1622	 * reads multiple pages with one IO, so further log reads can
    1623	 * just copy data from the pool.
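        	 * A full pool spans R5L_RECOVERY_PAGE_POOL_SIZE * BLOCK_SECTORS =
        	 * 256 * 8 = 2048 sectors (1MiB of log) per fetch.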
   1624	 */
   1625	struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE];
   1626	struct bio_vec ra_bvec[R5L_RECOVERY_PAGE_POOL_SIZE];
   1627	sector_t pool_offset;	/* offset of first page in the pool */
   1628	int total_pages;	/* total allocated pages */
   1629	int valid_pages;	/* pages with valid data */
   1630};
   1631
   1632static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
   1633					    struct r5l_recovery_ctx *ctx)
   1634{
   1635	struct page *page;
   1636
   1637	ctx->valid_pages = 0;
   1638	ctx->total_pages = 0;
   1639	while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) {
   1640		page = alloc_page(GFP_KERNEL);
   1641
   1642		if (!page)
   1643			break;
   1644		ctx->ra_pool[ctx->total_pages] = page;
   1645		ctx->total_pages += 1;
   1646	}
   1647
   1648	if (ctx->total_pages == 0)
   1649		return -ENOMEM;
   1650
   1651	ctx->pool_offset = 0;
   1652	return 0;
   1653}
   1654
   1655static void r5l_recovery_free_ra_pool(struct r5l_log *log,
   1656					struct r5l_recovery_ctx *ctx)
   1657{
   1658	int i;
   1659
   1660	for (i = 0; i < ctx->total_pages; ++i)
   1661		put_page(ctx->ra_pool[i]);
   1662}
   1663
   1664/*
    1665 * Fetch pages from offset into the read ahead pool, setting ctx->valid_pages.
   1666 * In normal cases, ctx->valid_pages == ctx->total_pages after the call.
   1667 * However, if the offset is close to the end of the journal device,
   1668 * ctx->valid_pages could be smaller than ctx->total_pages
   1669 */
   1670static int r5l_recovery_fetch_ra_pool(struct r5l_log *log,
   1671				      struct r5l_recovery_ctx *ctx,
   1672				      sector_t offset)
   1673{
   1674	struct bio bio;
   1675	int ret;
   1676
   1677	bio_init(&bio, log->rdev->bdev, ctx->ra_bvec,
   1678		 R5L_RECOVERY_PAGE_POOL_SIZE, REQ_OP_READ);
   1679	bio.bi_iter.bi_sector = log->rdev->data_offset + offset;
   1680
   1681	ctx->valid_pages = 0;
   1682	ctx->pool_offset = offset;
   1683
   1684	while (ctx->valid_pages < ctx->total_pages) {
   1685		__bio_add_page(&bio, ctx->ra_pool[ctx->valid_pages], PAGE_SIZE,
   1686			       0);
   1687		ctx->valid_pages += 1;
   1688
   1689		offset = r5l_ring_add(log, offset, BLOCK_SECTORS);
   1690
   1691		if (offset == 0)  /* reached end of the device */
   1692			break;
   1693	}
   1694
   1695	ret = submit_bio_wait(&bio);
   1696	bio_uninit(&bio);
   1697	return ret;
   1698}
   1699
   1700/*
    1701 * Try to read a page from the read ahead page pool; if the page is not in
    1702 * the pool, call r5l_recovery_fetch_ra_pool() to refill it.
   1703 */
   1704static int r5l_recovery_read_page(struct r5l_log *log,
   1705				  struct r5l_recovery_ctx *ctx,
   1706				  struct page *page,
   1707				  sector_t offset)
   1708{
   1709	int ret;
   1710
   1711	if (offset < ctx->pool_offset ||
   1712	    offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) {
   1713		ret = r5l_recovery_fetch_ra_pool(log, ctx, offset);
   1714		if (ret)
   1715			return ret;
   1716	}
   1717
   1718	BUG_ON(offset < ctx->pool_offset ||
   1719	       offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS);
   1720
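        	/*
        	 * Each pool page covers BLOCK_SECTORS (8) sectors, so the sector
        	 * delta from pool_offset indexes the pool via BLOCK_SECTOR_SHIFT.
        	 */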
   1721	memcpy(page_address(page),
   1722	       page_address(ctx->ra_pool[(offset - ctx->pool_offset) >>
   1723					 BLOCK_SECTOR_SHIFT]),
   1724	       PAGE_SIZE);
   1725	return 0;
   1726}
   1727
   1728static int r5l_recovery_read_meta_block(struct r5l_log *log,
   1729					struct r5l_recovery_ctx *ctx)
   1730{
   1731	struct page *page = ctx->meta_page;
   1732	struct r5l_meta_block *mb;
   1733	u32 crc, stored_crc;
   1734	int ret;
   1735
   1736	ret = r5l_recovery_read_page(log, ctx, page, ctx->pos);
   1737	if (ret != 0)
   1738		return ret;
   1739
   1740	mb = page_address(page);
   1741	stored_crc = le32_to_cpu(mb->checksum);
   1742	mb->checksum = 0;
   1743
   1744	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
   1745	    le64_to_cpu(mb->seq) != ctx->seq ||
   1746	    mb->version != R5LOG_VERSION ||
   1747	    le64_to_cpu(mb->position) != ctx->pos)
   1748		return -EINVAL;
   1749
   1750	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
   1751	if (stored_crc != crc)
   1752		return -EINVAL;
   1753
   1754	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
   1755		return -EINVAL;
   1756
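        	/*
        	 * meta_total_blocks starts with the meta block itself and grows as
        	 * data/parity payloads are parsed; the caller uses it to advance
        	 * ctx->pos to the next meta block.
        	 */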
   1757	ctx->meta_total_blocks = BLOCK_SECTORS;
   1758
   1759	return 0;
   1760}
   1761
   1762static void
   1763r5l_recovery_create_empty_meta_block(struct r5l_log *log,
   1764				     struct page *page,
   1765				     sector_t pos, u64 seq)
   1766{
   1767	struct r5l_meta_block *mb;
   1768
   1769	mb = page_address(page);
   1770	clear_page(mb);
   1771	mb->magic = cpu_to_le32(R5LOG_MAGIC);
   1772	mb->version = R5LOG_VERSION;
   1773	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
   1774	mb->seq = cpu_to_le64(seq);
   1775	mb->position = cpu_to_le64(pos);
   1776}
   1777
   1778static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
   1779					  u64 seq)
   1780{
   1781	struct page *page;
   1782	struct r5l_meta_block *mb;
   1783
   1784	page = alloc_page(GFP_KERNEL);
   1785	if (!page)
   1786		return -ENOMEM;
   1787	r5l_recovery_create_empty_meta_block(log, page, pos, seq);
   1788	mb = page_address(page);
   1789	mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
   1790					     mb, PAGE_SIZE));
   1791	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
   1792			  REQ_SYNC | REQ_FUA, false)) {
   1793		__free_page(page);
   1794		return -EIO;
   1795	}
   1796	__free_page(page);
   1797	return 0;
   1798}
   1799
   1800/*
    1801 * r5l_recovery_load_data and r5l_recovery_load_parity use flag R5_Wantwrite
   1802 * to mark valid (potentially not flushed) data in the journal.
   1803 *
   1804 * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb,
   1805 * so there should not be any mismatch here.
   1806 */
   1807static void r5l_recovery_load_data(struct r5l_log *log,
   1808				   struct stripe_head *sh,
   1809				   struct r5l_recovery_ctx *ctx,
   1810				   struct r5l_payload_data_parity *payload,
   1811				   sector_t log_offset)
   1812{
   1813	struct mddev *mddev = log->rdev->mddev;
   1814	struct r5conf *conf = mddev->private;
   1815	int dd_idx;
   1816
   1817	raid5_compute_sector(conf,
   1818			     le64_to_cpu(payload->location), 0,
   1819			     &dd_idx, sh);
   1820	r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset);
   1821	sh->dev[dd_idx].log_checksum =
   1822		le32_to_cpu(payload->checksum[0]);
   1823	ctx->meta_total_blocks += BLOCK_SECTORS;
   1824
   1825	set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
   1826	set_bit(STRIPE_R5C_CACHING, &sh->state);
   1827}
   1828
   1829static void r5l_recovery_load_parity(struct r5l_log *log,
   1830				     struct stripe_head *sh,
   1831				     struct r5l_recovery_ctx *ctx,
   1832				     struct r5l_payload_data_parity *payload,
   1833				     sector_t log_offset)
   1834{
   1835	struct mddev *mddev = log->rdev->mddev;
   1836	struct r5conf *conf = mddev->private;
   1837
   1838	ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
   1839	r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset);
   1840	sh->dev[sh->pd_idx].log_checksum =
   1841		le32_to_cpu(payload->checksum[0]);
   1842	set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
   1843
   1844	if (sh->qd_idx >= 0) {
   1845		r5l_recovery_read_page(
   1846			log, ctx, sh->dev[sh->qd_idx].page,
   1847			r5l_ring_add(log, log_offset, BLOCK_SECTORS));
   1848		sh->dev[sh->qd_idx].log_checksum =
   1849			le32_to_cpu(payload->checksum[1]);
   1850		set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
   1851	}
   1852	clear_bit(STRIPE_R5C_CACHING, &sh->state);
   1853}
   1854
   1855static void r5l_recovery_reset_stripe(struct stripe_head *sh)
   1856{
   1857	int i;
   1858
   1859	sh->state = 0;
   1860	sh->log_start = MaxSector;
   1861	for (i = sh->disks; i--; )
   1862		sh->dev[i].flags = 0;
   1863}
   1864
   1865static void
   1866r5l_recovery_replay_one_stripe(struct r5conf *conf,
   1867			       struct stripe_head *sh,
   1868			       struct r5l_recovery_ctx *ctx)
   1869{
   1870	struct md_rdev *rdev, *rrdev;
   1871	int disk_index;
   1872	int data_count = 0;
   1873
   1874	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
   1875		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
   1876			continue;
   1877		if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
   1878			continue;
   1879		data_count++;
   1880	}
   1881
   1882	/*
   1883	 * stripes that only have parity must have been flushed
   1884	 * before the crash that we are now recovering from, so
    1885	 * there is nothing more to recover.
   1886	 */
   1887	if (data_count == 0)
   1888		goto out;
   1889
   1890	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
   1891		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
   1892			continue;
   1893
   1894		/* in case device is broken */
   1895		rcu_read_lock();
   1896		rdev = rcu_dereference(conf->disks[disk_index].rdev);
   1897		if (rdev) {
   1898			atomic_inc(&rdev->nr_pending);
   1899			rcu_read_unlock();
   1900			sync_page_io(rdev, sh->sector, PAGE_SIZE,
   1901				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
   1902				     false);
   1903			rdev_dec_pending(rdev, rdev->mddev);
   1904			rcu_read_lock();
   1905		}
   1906		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
   1907		if (rrdev) {
   1908			atomic_inc(&rrdev->nr_pending);
   1909			rcu_read_unlock();
   1910			sync_page_io(rrdev, sh->sector, PAGE_SIZE,
   1911				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
   1912				     false);
   1913			rdev_dec_pending(rrdev, rrdev->mddev);
   1914			rcu_read_lock();
   1915		}
   1916		rcu_read_unlock();
   1917	}
   1918	ctx->data_parity_stripes++;
   1919out:
   1920	r5l_recovery_reset_stripe(sh);
   1921}
   1922
   1923static struct stripe_head *
   1924r5c_recovery_alloc_stripe(
   1925		struct r5conf *conf,
   1926		sector_t stripe_sect,
   1927		int noblock)
   1928{
   1929	struct stripe_head *sh;
   1930
   1931	sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0);
   1932	if (!sh)
   1933		return NULL;  /* no more stripe available */
   1934
   1935	r5l_recovery_reset_stripe(sh);
   1936
   1937	return sh;
   1938}
   1939
   1940static struct stripe_head *
   1941r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
   1942{
   1943	struct stripe_head *sh;
   1944
   1945	list_for_each_entry(sh, list, lru)
   1946		if (sh->sector == sect)
   1947			return sh;
   1948	return NULL;
   1949}
   1950
   1951static void
   1952r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
   1953			  struct r5l_recovery_ctx *ctx)
   1954{
   1955	struct stripe_head *sh, *next;
   1956
   1957	list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
   1958		r5l_recovery_reset_stripe(sh);
   1959		list_del_init(&sh->lru);
   1960		raid5_release_stripe(sh);
   1961	}
   1962}
   1963
   1964static void
   1965r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
   1966			    struct r5l_recovery_ctx *ctx)
   1967{
   1968	struct stripe_head *sh, *next;
   1969
   1970	list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
   1971		if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
   1972			r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
   1973			list_del_init(&sh->lru);
   1974			raid5_release_stripe(sh);
   1975		}
   1976}
   1977
   1978/* if matches return 0; otherwise return -EINVAL */
   1979static int
   1980r5l_recovery_verify_data_checksum(struct r5l_log *log,
   1981				  struct r5l_recovery_ctx *ctx,
   1982				  struct page *page,
   1983				  sector_t log_offset, __le32 log_checksum)
   1984{
   1985	void *addr;
   1986	u32 checksum;
   1987
   1988	r5l_recovery_read_page(log, ctx, page, log_offset);
   1989	addr = kmap_atomic(page);
   1990	checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
   1991	kunmap_atomic(addr);
   1992	return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
   1993}
   1994
   1995/*
    1996 * Before loading data into the stripe cache, we need to verify the checksum
    1997 * of all data; if any data page mismatches, we drop all data in the meta block.
   1998 */
   1999static int
   2000r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
   2001					 struct r5l_recovery_ctx *ctx)
   2002{
   2003	struct mddev *mddev = log->rdev->mddev;
   2004	struct r5conf *conf = mddev->private;
   2005	struct r5l_meta_block *mb = page_address(ctx->meta_page);
   2006	sector_t mb_offset = sizeof(struct r5l_meta_block);
   2007	sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
   2008	struct page *page;
   2009	struct r5l_payload_data_parity *payload;
   2010	struct r5l_payload_flush *payload_flush;
   2011
   2012	page = alloc_page(GFP_KERNEL);
   2013	if (!page)
   2014		return -ENOMEM;
   2015
   2016	while (mb_offset < le32_to_cpu(mb->meta_size)) {
   2017		payload = (void *)mb + mb_offset;
   2018		payload_flush = (void *)mb + mb_offset;
   2019
   2020		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
   2021			if (r5l_recovery_verify_data_checksum(
   2022				    log, ctx, page, log_offset,
   2023				    payload->checksum[0]) < 0)
   2024				goto mismatch;
   2025		} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
   2026			if (r5l_recovery_verify_data_checksum(
   2027				    log, ctx, page, log_offset,
   2028				    payload->checksum[0]) < 0)
   2029				goto mismatch;
   2030			if (conf->max_degraded == 2 && /* q for RAID 6 */
   2031			    r5l_recovery_verify_data_checksum(
   2032				    log, ctx, page,
   2033				    r5l_ring_add(log, log_offset,
   2034						 BLOCK_SECTORS),
   2035				    payload->checksum[1]) < 0)
   2036				goto mismatch;
   2037		} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
   2038			/* nothing to do for R5LOG_PAYLOAD_FLUSH here */
   2039		} else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */
   2040			goto mismatch;
   2041
   2042		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
   2043			mb_offset += sizeof(struct r5l_payload_flush) +
   2044				le32_to_cpu(payload_flush->size);
   2045		} else {
   2046			/* DATA or PARITY payload */
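        			/*
        			 * payload->size is in sectors; shifting by
        			 * (PAGE_SHIFT - 9) converts it to a page count, and
        			 * the payload carries one __le32 checksum per page.
        			 */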
   2047			log_offset = r5l_ring_add(log, log_offset,
   2048						  le32_to_cpu(payload->size));
   2049			mb_offset += sizeof(struct r5l_payload_data_parity) +
   2050				sizeof(__le32) *
   2051				(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
   2052		}
   2053
   2054	}
   2055
   2056	put_page(page);
   2057	return 0;
   2058
   2059mismatch:
   2060	put_page(page);
   2061	return -EINVAL;
   2062}
   2063
   2064/*
   2065 * Analyze all data/parity pages in one meta block
   2066 * Returns:
   2067 * 0 for success
    2068 * -EINVAL for unknown payload type
    2069 * -EAGAIN for checksum mismatch of data page
    2070 * -ENOMEM for running out of memory (alloc_page() failed or no free stripes)
   2071 */
   2072static int
   2073r5c_recovery_analyze_meta_block(struct r5l_log *log,
   2074				struct r5l_recovery_ctx *ctx,
   2075				struct list_head *cached_stripe_list)
   2076{
   2077	struct mddev *mddev = log->rdev->mddev;
   2078	struct r5conf *conf = mddev->private;
   2079	struct r5l_meta_block *mb;
   2080	struct r5l_payload_data_parity *payload;
   2081	struct r5l_payload_flush *payload_flush;
   2082	int mb_offset;
   2083	sector_t log_offset;
   2084	sector_t stripe_sect;
   2085	struct stripe_head *sh;
   2086	int ret;
   2087
   2088	/*
    2089	 * For a mismatch in data blocks, we drop all data in this mb, but we
    2090	 * still read the next mb for other data with the FLUSH flag, as
    2091	 * io_units could finish out of order.
   2092	 */
   2093	ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
   2094	if (ret == -EINVAL)
   2095		return -EAGAIN;
   2096	else if (ret)
    2097		return ret;   /* -ENOMEM due to alloc_page() failure */
   2098
   2099	mb = page_address(ctx->meta_page);
   2100	mb_offset = sizeof(struct r5l_meta_block);
   2101	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
   2102
   2103	while (mb_offset < le32_to_cpu(mb->meta_size)) {
   2104		int dd;
   2105
   2106		payload = (void *)mb + mb_offset;
   2107		payload_flush = (void *)mb + mb_offset;
   2108
   2109		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
   2110			int i, count;
   2111
   2112			count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
   2113			for (i = 0; i < count; ++i) {
   2114				stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
   2115				sh = r5c_recovery_lookup_stripe(cached_stripe_list,
   2116								stripe_sect);
   2117				if (sh) {
   2118					WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
   2119					r5l_recovery_reset_stripe(sh);
   2120					list_del_init(&sh->lru);
   2121					raid5_release_stripe(sh);
   2122				}
   2123			}
   2124
   2125			mb_offset += sizeof(struct r5l_payload_flush) +
   2126				le32_to_cpu(payload_flush->size);
   2127			continue;
   2128		}
   2129
   2130		/* DATA or PARITY payload */
   2131		stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
   2132			raid5_compute_sector(
   2133				conf, le64_to_cpu(payload->location), 0, &dd,
   2134				NULL)
   2135			: le64_to_cpu(payload->location);
   2136
   2137		sh = r5c_recovery_lookup_stripe(cached_stripe_list,
   2138						stripe_sect);
   2139
   2140		if (!sh) {
   2141			sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1);
   2142			/*
   2143			 * cannot get stripe from raid5_get_active_stripe
   2144			 * try replay some stripes
   2145			 */
   2146			if (!sh) {
   2147				r5c_recovery_replay_stripes(
   2148					cached_stripe_list, ctx);
   2149				sh = r5c_recovery_alloc_stripe(
   2150					conf, stripe_sect, 1);
   2151			}
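        			/*
        			 * Still no stripe after replaying: grow the stripe
        			 * cache and retry with a blocking allocation.
        			 */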
   2152			if (!sh) {
   2153				int new_size = conf->min_nr_stripes * 2;
    2154				pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data on journal.\n",
   2155					mdname(mddev),
   2156					new_size);
   2157				ret = raid5_set_cache_size(mddev, new_size);
   2158				if (conf->min_nr_stripes <= new_size / 2) {
   2159					pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n",
   2160						mdname(mddev),
   2161						ret,
   2162						new_size,
   2163						conf->min_nr_stripes,
   2164						conf->max_nr_stripes);
   2165					return -ENOMEM;
   2166				}
   2167				sh = r5c_recovery_alloc_stripe(
   2168					conf, stripe_sect, 0);
   2169			}
   2170			if (!sh) {
   2171				pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
   2172					mdname(mddev));
   2173				return -ENOMEM;
   2174			}
   2175			list_add_tail(&sh->lru, cached_stripe_list);
   2176		}
   2177
   2178		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
   2179			if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
   2180			    test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
   2181				r5l_recovery_replay_one_stripe(conf, sh, ctx);
   2182				list_move_tail(&sh->lru, cached_stripe_list);
   2183			}
   2184			r5l_recovery_load_data(log, sh, ctx, payload,
   2185					       log_offset);
   2186		} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
   2187			r5l_recovery_load_parity(log, sh, ctx, payload,
   2188						 log_offset);
   2189		else
   2190			return -EINVAL;
   2191
   2192		log_offset = r5l_ring_add(log, log_offset,
   2193					  le32_to_cpu(payload->size));
   2194
   2195		mb_offset += sizeof(struct r5l_payload_data_parity) +
   2196			sizeof(__le32) *
   2197			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
   2198	}
   2199
   2200	return 0;
   2201}
   2202
   2203/*
   2204 * Load the stripe into cache. The stripe will be written out later by
   2205 * the stripe cache state machine.
   2206 */
   2207static void r5c_recovery_load_one_stripe(struct r5l_log *log,
   2208					 struct stripe_head *sh)
   2209{
   2210	struct r5dev *dev;
   2211	int i;
   2212
   2213	for (i = sh->disks; i--; ) {
   2214		dev = sh->dev + i;
   2215		if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
   2216			set_bit(R5_InJournal, &dev->flags);
   2217			set_bit(R5_UPTODATE, &dev->flags);
   2218		}
   2219	}
   2220}
   2221
   2222/*
   2223 * Scan through the log for all to-be-flushed data
   2224 *
   2225 * For stripes with data and parity, namely Data-Parity stripe
   2226 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
   2227 *
   2228 * For stripes with only data, namely Data-Only stripe
   2229 * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine.
   2230 *
   2231 * For a stripe, if we see data after parity, we should discard all previous
   2232 * data and parity for this stripe, as these data are already flushed to
   2233 * the array.
   2234 *
   2235 * At the end of the scan, we return the new journal_tail, which points to
   2236 * first data-only stripe on the journal device, or next invalid meta block.
   2237 */
   2238static int r5c_recovery_flush_log(struct r5l_log *log,
   2239				  struct r5l_recovery_ctx *ctx)
   2240{
   2241	struct stripe_head *sh;
   2242	int ret = 0;
   2243
   2244	/* scan through the log */
   2245	while (1) {
   2246		if (r5l_recovery_read_meta_block(log, ctx))
   2247			break;
   2248
   2249		ret = r5c_recovery_analyze_meta_block(log, ctx,
   2250						      &ctx->cached_list);
   2251		/*
    2252		 * -EAGAIN means a mismatch in a data block; in this case, we still
    2253		 * try to scan the next meta block
   2254		 */
   2255		if (ret && ret != -EAGAIN)
   2256			break;   /* ret == -EINVAL or -ENOMEM */
   2257		ctx->seq++;
   2258		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
   2259	}
   2260
   2261	if (ret == -ENOMEM) {
   2262		r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
   2263		return ret;
   2264	}
   2265
   2266	/* replay data-parity stripes */
   2267	r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
   2268
   2269	/* load data-only stripes to stripe cache */
   2270	list_for_each_entry(sh, &ctx->cached_list, lru) {
   2271		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
   2272		r5c_recovery_load_one_stripe(log, sh);
   2273		ctx->data_only_stripes++;
   2274	}
   2275
   2276	return 0;
   2277}
   2278
   2279/*
    2280 * We did a recovery. Now ctx.pos points to an invalid meta block. The new
    2281 * log will start here, but we can't let the superblock point to the last
    2282 * valid meta block. The log might look like:
    2283 * | meta 1| meta 2| meta 3|
    2284 * meta 1 is valid, meta 2 is invalid, meta 3 could be valid. If the
    2285 * superblock still points to meta 1 and we write a new valid meta 2n, then
    2286 * if a crash happens again, the new recovery will start from meta 1. Since
    2287 * meta 2n is valid now, recovery will think meta 3 is valid too, which is
    2288 * wrong. The solution is to create a new meta in meta 2 with its seq ==
    2289 * meta 1's seq + 10000 and let the superblock point to meta 2. The same
    2290 * recovery will not treat meta 3 as valid, because its seq doesn't match.
   2291 */
   2292
   2293/*
   2294 * Before recovery, the log looks like the following
   2295 *
   2296 *   ---------------------------------------------
   2297 *   |           valid log        | invalid log  |
   2298 *   ---------------------------------------------
   2299 *   ^
   2300 *   |- log->last_checkpoint
   2301 *   |- log->last_cp_seq
   2302 *
   2303 * Now we scan through the log until we see invalid entry
   2304 *
   2305 *   ---------------------------------------------
   2306 *   |           valid log        | invalid log  |
   2307 *   ---------------------------------------------
   2308 *   ^                            ^
   2309 *   |- log->last_checkpoint      |- ctx->pos
   2310 *   |- log->last_cp_seq          |- ctx->seq
   2311 *
    2312 * From this point, we need to increase the seq number by 10000 to avoid
   2313 * confusing next recovery.
   2314 *
   2315 *   ---------------------------------------------
   2316 *   |           valid log        | invalid log  |
   2317 *   ---------------------------------------------
   2318 *   ^                              ^
   2319 *   |- log->last_checkpoint        |- ctx->pos+1
   2320 *   |- log->last_cp_seq            |- ctx->seq+10001
   2321 *
    2322 * However, it is not safe to start the state machine yet, because the
    2323 * data-only stripes are not yet secured in RAID. To save these data-only
    2324 * stripes, we rewrite them to the journal starting from seq+10001.
   2325 *
   2326 *   -----------------------------------------------------------------
   2327 *   |           valid log        | data only stripes | invalid log  |
   2328 *   -----------------------------------------------------------------
   2329 *   ^                                                ^
   2330 *   |- log->last_checkpoint                          |- ctx->pos+n
   2331 *   |- log->last_cp_seq                              |- ctx->seq+10000+n
   2332 *
    2333 * If failure happens again during this process, the recovery can safely start
   2334 * again from log->last_checkpoint.
   2335 *
   2336 * Once data only stripes are rewritten to journal, we move log_tail
   2337 *
   2338 *   -----------------------------------------------------------------
   2339 *   |     old log        |    data only stripes    | invalid log  |
   2340 *   -----------------------------------------------------------------
   2341 *                        ^                         ^
   2342 *                        |- log->last_checkpoint   |- ctx->pos+n
   2343 *                        |- log->last_cp_seq       |- ctx->seq+10000+n
   2344 *
   2345 * Then we can safely start the state machine. If failure happens from this
   2346 * point on, the recovery will start from new log->last_checkpoint.
   2347 */
   2348static int
   2349r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
   2350				       struct r5l_recovery_ctx *ctx)
   2351{
   2352	struct stripe_head *sh;
   2353	struct mddev *mddev = log->rdev->mddev;
   2354	struct page *page;
   2355	sector_t next_checkpoint = MaxSector;
   2356
   2357	page = alloc_page(GFP_KERNEL);
   2358	if (!page) {
   2359		pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
   2360		       mdname(mddev));
   2361		return -ENOMEM;
   2362	}
   2363
   2364	WARN_ON(list_empty(&ctx->cached_list));
   2365
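        	/*
        	 * For each cached data-only stripe, write a fresh meta block at
        	 * ctx->pos followed by the stripe's in-journal data pages, so the
        	 * data stays reachable from the new checkpoint even though its
        	 * parity is not yet on the raid disks.
        	 */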
   2366	list_for_each_entry(sh, &ctx->cached_list, lru) {
   2367		struct r5l_meta_block *mb;
   2368		int i;
   2369		int offset;
   2370		sector_t write_pos;
   2371
   2372		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
   2373		r5l_recovery_create_empty_meta_block(log, page,
   2374						     ctx->pos, ctx->seq);
   2375		mb = page_address(page);
   2376		offset = le32_to_cpu(mb->meta_size);
   2377		write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
   2378
   2379		for (i = sh->disks; i--; ) {
   2380			struct r5dev *dev = &sh->dev[i];
   2381			struct r5l_payload_data_parity *payload;
   2382			void *addr;
   2383
   2384			if (test_bit(R5_InJournal, &dev->flags)) {
   2385				payload = (void *)mb + offset;
   2386				payload->header.type = cpu_to_le16(
   2387					R5LOG_PAYLOAD_DATA);
   2388				payload->size = cpu_to_le32(BLOCK_SECTORS);
   2389				payload->location = cpu_to_le64(
   2390					raid5_compute_blocknr(sh, i, 0));
   2391				addr = kmap_atomic(dev->page);
   2392				payload->checksum[0] = cpu_to_le32(
   2393					crc32c_le(log->uuid_checksum, addr,
   2394						  PAGE_SIZE));
   2395				kunmap_atomic(addr);
   2396				sync_page_io(log->rdev, write_pos, PAGE_SIZE,
   2397					     dev->page, REQ_OP_WRITE, 0, false);
   2398				write_pos = r5l_ring_add(log, write_pos,
   2399							 BLOCK_SECTORS);
   2400				offset += sizeof(__le32) +
   2401					sizeof(struct r5l_payload_data_parity);
   2402
   2403			}
   2404		}
   2405		mb->meta_size = cpu_to_le32(offset);
   2406		mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
   2407						     mb, PAGE_SIZE));
   2408		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
   2409			     REQ_OP_WRITE, REQ_SYNC | REQ_FUA, false);
   2410		sh->log_start = ctx->pos;
   2411		list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
   2412		atomic_inc(&log->stripe_in_journal_count);
   2413		ctx->pos = write_pos;
   2414		ctx->seq += 1;
   2415		next_checkpoint = sh->log_start;
   2416	}
   2417	log->next_checkpoint = next_checkpoint;
   2418	__free_page(page);
   2419	return 0;
   2420}
   2421
   2422static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
   2423						 struct r5l_recovery_ctx *ctx)
   2424{
   2425	struct mddev *mddev = log->rdev->mddev;
   2426	struct r5conf *conf = mddev->private;
   2427	struct stripe_head *sh, *next;
   2428	bool cleared_pending = false;
   2429
   2430	if (ctx->data_only_stripes == 0)
   2431		return;
   2432
   2433	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
   2434		cleared_pending = true;
   2435		clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
   2436	}
   2437	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
   2438
   2439	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
   2440		r5c_make_stripe_write_out(sh);
   2441		set_bit(STRIPE_HANDLE, &sh->state);
   2442		list_del_init(&sh->lru);
   2443		raid5_release_stripe(sh);
   2444	}
   2445
   2446	/* reuse conf->wait_for_quiescent in recovery */
   2447	wait_event(conf->wait_for_quiescent,
   2448		   atomic_read(&conf->active_stripes) == 0);
   2449
   2450	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
   2451	if (cleared_pending)
   2452		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
   2453}
   2454
   2455static int r5l_recovery_log(struct r5l_log *log)
   2456{
   2457	struct mddev *mddev = log->rdev->mddev;
   2458	struct r5l_recovery_ctx *ctx;
   2459	int ret;
   2460	sector_t pos;
   2461
   2462	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
   2463	if (!ctx)
   2464		return -ENOMEM;
   2465
   2466	ctx->pos = log->last_checkpoint;
   2467	ctx->seq = log->last_cp_seq;
   2468	INIT_LIST_HEAD(&ctx->cached_list);
   2469	ctx->meta_page = alloc_page(GFP_KERNEL);
   2470
   2471	if (!ctx->meta_page) {
   2472		ret =  -ENOMEM;
   2473		goto meta_page;
   2474	}
   2475
   2476	if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) {
   2477		ret = -ENOMEM;
   2478		goto ra_pool;
   2479	}
   2480
   2481	ret = r5c_recovery_flush_log(log, ctx);
   2482
   2483	if (ret)
   2484		goto error;
   2485
   2486	pos = ctx->pos;
   2487	ctx->seq += 10000;
   2488
   2489	if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
   2490		pr_info("md/raid:%s: starting from clean shutdown\n",
   2491			 mdname(mddev));
   2492	else
   2493		pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
   2494			 mdname(mddev), ctx->data_only_stripes,
   2495			 ctx->data_parity_stripes);
   2496
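        	/*
        	 * With no data-only stripes to preserve, just stamp one empty meta
        	 * block at ctx->pos with the bumped seq, so stale entries beyond it
        	 * are not mistaken for valid log on the next recovery.
        	 */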
   2497	if (ctx->data_only_stripes == 0) {
   2498		log->next_checkpoint = ctx->pos;
   2499		r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++);
   2500		ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
   2501	} else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) {
   2502		pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
   2503		       mdname(mddev));
   2504		ret =  -EIO;
   2505		goto error;
   2506	}
   2507
   2508	log->log_start = ctx->pos;
   2509	log->seq = ctx->seq;
   2510	log->last_checkpoint = pos;
   2511	r5l_write_super(log, pos);
   2512
   2513	r5c_recovery_flush_data_only_stripes(log, ctx);
   2514	ret = 0;
   2515error:
   2516	r5l_recovery_free_ra_pool(log, ctx);
   2517ra_pool:
   2518	__free_page(ctx->meta_page);
   2519meta_page:
   2520	kfree(ctx);
   2521	return ret;
   2522}
   2523
   2524static void r5l_write_super(struct r5l_log *log, sector_t cp)
   2525{
   2526	struct mddev *mddev = log->rdev->mddev;
   2527
   2528	log->rdev->journal_tail = cp;
   2529	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
   2530}
   2531
   2532static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
   2533{
   2534	struct r5conf *conf;
   2535	int ret;
   2536
   2537	spin_lock(&mddev->lock);
   2538	conf = mddev->private;
   2539	if (!conf || !conf->log) {
   2540		spin_unlock(&mddev->lock);
   2541		return 0;
   2542	}
   2543
   2544	switch (conf->log->r5c_journal_mode) {
   2545	case R5C_JOURNAL_MODE_WRITE_THROUGH:
   2546		ret = snprintf(
   2547			page, PAGE_SIZE, "[%s] %s\n",
   2548			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
   2549			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
   2550		break;
   2551	case R5C_JOURNAL_MODE_WRITE_BACK:
   2552		ret = snprintf(
   2553			page, PAGE_SIZE, "%s [%s]\n",
   2554			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
   2555			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
   2556		break;
   2557	default:
   2558		ret = 0;
   2559	}
   2560	spin_unlock(&mddev->lock);
   2561	return ret;
   2562}
   2563
   2564/*
   2565 * Set journal cache mode on @mddev (external API initially needed by dm-raid).
   2566 *
   2567 * @mode as defined in 'enum r5c_journal_mode'.
   2568 *
   2569 */
   2570int r5c_journal_mode_set(struct mddev *mddev, int mode)
   2571{
   2572	struct r5conf *conf;
   2573
   2574	if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
   2575	    mode > R5C_JOURNAL_MODE_WRITE_BACK)
   2576		return -EINVAL;
   2577
   2578	conf = mddev->private;
   2579	if (!conf || !conf->log)
   2580		return -ENODEV;
   2581
   2582	if (raid5_calc_degraded(conf) > 0 &&
   2583	    mode == R5C_JOURNAL_MODE_WRITE_BACK)
   2584		return -EINVAL;
   2585
   2586	mddev_suspend(mddev);
   2587	conf->log->r5c_journal_mode = mode;
   2588	mddev_resume(mddev);
   2589
   2590	pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
   2591		 mdname(mddev), mode, r5c_journal_mode_str[mode]);
   2592	return 0;
   2593}
   2594EXPORT_SYMBOL(r5c_journal_mode_set);
   2595
   2596static ssize_t r5c_journal_mode_store(struct mddev *mddev,
   2597				      const char *page, size_t length)
   2598{
   2599	int mode = ARRAY_SIZE(r5c_journal_mode_str);
   2600	size_t len = length;
   2601	int ret;
   2602
   2603	if (len < 2)
   2604		return -EINVAL;
   2605
   2606	if (page[len - 1] == '\n')
   2607		len--;
   2608
   2609	while (mode--)
   2610		if (strlen(r5c_journal_mode_str[mode]) == len &&
   2611		    !strncmp(page, r5c_journal_mode_str[mode], len))
   2612			break;
   2613	ret = mddev_lock(mddev);
   2614	if (ret)
   2615		return ret;
   2616	ret = r5c_journal_mode_set(mddev, mode);
   2617	mddev_unlock(mddev);
   2618	return ret ?: length;
   2619}
   2620
   2621struct md_sysfs_entry
   2622r5c_journal_mode = __ATTR(journal_mode, 0644,
   2623			  r5c_journal_mode_show, r5c_journal_mode_store);
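        /*
         * The journal_mode attribute above is exposed through md sysfs; a minimal
         * usage sketch (the exact path depends on the array name, md0 here is
         * only an example):
         *
         *   $ cat /sys/block/md0/md/journal_mode
         *   [write-through] write-back
         *   $ echo write-back > /sys/block/md0/md/journal_mode
         */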
   2624
   2625/*
   2626 * Try handle write operation in caching phase. This function should only
   2627 * be called in write-back mode.
   2628 *
   2629 * If all outstanding writes can be handled in caching phase, returns 0
   2630 * If writes requires write-out phase, call r5c_make_stripe_write_out()
   2631 * and returns -EAGAIN
   2632 */
   2633int r5c_try_caching_write(struct r5conf *conf,
   2634			  struct stripe_head *sh,
   2635			  struct stripe_head_state *s,
   2636			  int disks)
   2637{
   2638	struct r5l_log *log = conf->log;
   2639	int i;
   2640	struct r5dev *dev;
   2641	int to_cache = 0;
   2642	void **pslot;
   2643	sector_t tree_index;
   2644	int ret;
   2645	uintptr_t refcount;
   2646
   2647	BUG_ON(!r5c_is_writeback(log));
   2648
   2649	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
   2650		/*
   2651		 * There are two different scenarios here:
   2652		 *  1. The stripe has some data cached, and it is sent to
   2653		 *     write-out phase for reclaim
   2654		 *  2. The stripe is clean, and this is the first write
   2655		 *
   2656		 * For 1, return -EAGAIN, so we continue with
   2657		 * handle_stripe_dirtying().
   2658		 *
   2659		 * For 2, set STRIPE_R5C_CACHING and continue with caching
   2660		 * write.
   2661		 */
   2662
   2663		/* case 1: anything injournal or anything in written */
   2664		if (s->injournal > 0 || s->written > 0)
   2665			return -EAGAIN;
   2666		/* case 2 */
   2667		set_bit(STRIPE_R5C_CACHING, &sh->state);
   2668	}
   2669
   2670	/*
   2671	 * When run in degraded mode, array is set to write-through mode.
   2672	 * This check helps drain pending write safely in the transition to
   2673	 * write-through mode.
   2674	 *
   2675	 * When a stripe is syncing, the write is also handled in write
   2676	 * through mode.
   2677	 */
   2678	if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
   2679		r5c_make_stripe_write_out(sh);
   2680		return -EAGAIN;
   2681	}
   2682
   2683	for (i = disks; i--; ) {
   2684		dev = &sh->dev[i];
   2685		/* if non-overwrite, use writing-out phase */
   2686		if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
   2687		    !test_bit(R5_InJournal, &dev->flags)) {
   2688			r5c_make_stripe_write_out(sh);
   2689			return -EAGAIN;
   2690		}
   2691	}
   2692
   2693	/* if the stripe is not counted in big_stripe_tree, add it now */
   2694	if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
   2695	    !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
   2696		tree_index = r5c_tree_index(conf, sh->sector);
   2697		spin_lock(&log->tree_lock);
   2698		pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
   2699					       tree_index);
   2700		if (pslot) {
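        			/*
        			 * The slot value is the reference count of this big
        			 * stripe, stored shifted left by R5C_RADIX_COUNT_SHIFT;
        			 * decode it and store it back incremented by one.
        			 */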
   2701			refcount = (uintptr_t)radix_tree_deref_slot_protected(
   2702				pslot, &log->tree_lock) >>
   2703				R5C_RADIX_COUNT_SHIFT;
   2704			radix_tree_replace_slot(
   2705				&log->big_stripe_tree, pslot,
   2706				(void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
   2707		} else {
   2708			/*
   2709			 * this radix_tree_insert can fail safely, so no
   2710			 * need to call radix_tree_preload()
   2711			 */
   2712			ret = radix_tree_insert(
   2713				&log->big_stripe_tree, tree_index,
   2714				(void *)(1 << R5C_RADIX_COUNT_SHIFT));
   2715			if (ret) {
   2716				spin_unlock(&log->tree_lock);
   2717				r5c_make_stripe_write_out(sh);
   2718				return -EAGAIN;
   2719			}
   2720		}
   2721		spin_unlock(&log->tree_lock);
   2722
   2723		/*
    2724		 * set STRIPE_R5C_PARTIAL_STRIPE; this shows the stripe is
   2725		 * counted in the radix tree
   2726		 */
   2727		set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
   2728		atomic_inc(&conf->r5c_cached_partial_stripes);
   2729	}
   2730
   2731	for (i = disks; i--; ) {
   2732		dev = &sh->dev[i];
   2733		if (dev->towrite) {
   2734			set_bit(R5_Wantwrite, &dev->flags);
   2735			set_bit(R5_Wantdrain, &dev->flags);
   2736			set_bit(R5_LOCKED, &dev->flags);
   2737			to_cache++;
   2738		}
   2739	}
   2740
   2741	if (to_cache) {
   2742		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
   2743		/*
   2744		 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
   2745		 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
   2746		 * r5c_handle_data_cached()
   2747		 */
   2748		set_bit(STRIPE_LOG_TRAPPED, &sh->state);
   2749	}
   2750
   2751	return 0;
   2752}
   2753
   2754/*
   2755 * free extra pages (orig_page) we allocated for prexor
   2756 */
   2757void r5c_release_extra_page(struct stripe_head *sh)
   2758{
   2759	struct r5conf *conf = sh->raid_conf;
   2760	int i;
   2761	bool using_disk_info_extra_page;
   2762
   2763	using_disk_info_extra_page =
   2764		sh->dev[0].orig_page == conf->disks[0].extra_page;
   2765
   2766	for (i = sh->disks; i--; )
   2767		if (sh->dev[i].page != sh->dev[i].orig_page) {
   2768			struct page *p = sh->dev[i].orig_page;
   2769
   2770			sh->dev[i].orig_page = sh->dev[i].page;
   2771			clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
   2772
   2773			if (!using_disk_info_extra_page)
   2774				put_page(p);
   2775		}
   2776
   2777	if (using_disk_info_extra_page) {
   2778		clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
   2779		md_wakeup_thread(conf->mddev->thread);
   2780	}
   2781}
   2782
   2783void r5c_use_extra_page(struct stripe_head *sh)
   2784{
   2785	struct r5conf *conf = sh->raid_conf;
   2786	int i;
   2787	struct r5dev *dev;
   2788
   2789	for (i = sh->disks; i--; ) {
   2790		dev = &sh->dev[i];
   2791		if (dev->orig_page != dev->page)
   2792			put_page(dev->orig_page);
   2793		dev->orig_page = conf->disks[i].extra_page;
   2794	}
   2795}
   2796
   2797/*
   2798 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
   2799 * stripe is committed to RAID disks.
   2800 */
   2801void r5c_finish_stripe_write_out(struct r5conf *conf,
   2802				 struct stripe_head *sh,
   2803				 struct stripe_head_state *s)
   2804{
   2805	struct r5l_log *log = conf->log;
   2806	int i;
   2807	int do_wakeup = 0;
   2808	sector_t tree_index;
   2809	void **pslot;
   2810	uintptr_t refcount;
   2811
   2812	if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
   2813		return;
   2814
   2815	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
   2816	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
   2817
   2818	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
   2819		return;
   2820
   2821	for (i = sh->disks; i--; ) {
   2822		clear_bit(R5_InJournal, &sh->dev[i].flags);
   2823		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
   2824			do_wakeup = 1;
   2825	}
   2826
   2827	/*
    2828	 * analyse_stripe() runs before r5c_finish_stripe_write_out().
   2829	 * We updated R5_InJournal, so we also update s->injournal.
   2830	 */
   2831	s->injournal = 0;
   2832
   2833	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
   2834		if (atomic_dec_and_test(&conf->pending_full_writes))
   2835			md_wakeup_thread(conf->mddev->thread);
   2836
   2837	if (do_wakeup)
   2838		wake_up(&conf->wait_for_overlap);
   2839
   2840	spin_lock_irq(&log->stripe_in_journal_lock);
   2841	list_del_init(&sh->r5c);
   2842	spin_unlock_irq(&log->stripe_in_journal_lock);
   2843	sh->log_start = MaxSector;
   2844
   2845	atomic_dec(&log->stripe_in_journal_count);
   2846	r5c_update_log_state(log);
   2847
   2848	/* stop counting this stripe in big_stripe_tree */
   2849	if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
   2850	    test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
   2851		tree_index = r5c_tree_index(conf, sh->sector);
   2852		spin_lock(&log->tree_lock);
   2853		pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
   2854					       tree_index);
   2855		BUG_ON(pslot == NULL);
   2856		refcount = (uintptr_t)radix_tree_deref_slot_protected(
   2857			pslot, &log->tree_lock) >>
   2858			R5C_RADIX_COUNT_SHIFT;
   2859		if (refcount == 1)
   2860			radix_tree_delete(&log->big_stripe_tree, tree_index);
   2861		else
   2862			radix_tree_replace_slot(
   2863				&log->big_stripe_tree, pslot,
   2864				(void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
   2865		spin_unlock(&log->tree_lock);
   2866	}
   2867
   2868	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
   2869		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
   2870		atomic_dec(&conf->r5c_flushing_partial_stripes);
   2871		atomic_dec(&conf->r5c_cached_partial_stripes);
   2872	}
   2873
   2874	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
   2875		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
   2876		atomic_dec(&conf->r5c_flushing_full_stripes);
   2877		atomic_dec(&conf->r5c_cached_full_stripes);
   2878	}
   2879
   2880	r5l_append_flush_payload(log, sh->sector);
    2881	/* stripe is flushed to raid disks, we can do resync now */
   2882	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
   2883		set_bit(STRIPE_HANDLE, &sh->state);
   2884}
   2885
   2886int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
   2887{
   2888	struct r5conf *conf = sh->raid_conf;
   2889	int pages = 0;
   2890	int reserve;
   2891	int i;
   2892	int ret = 0;
   2893
   2894	BUG_ON(!log);
   2895
   2896	for (i = 0; i < sh->disks; i++) {
   2897		void *addr;
   2898
   2899		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
   2900			continue;
   2901		addr = kmap_atomic(sh->dev[i].page);
   2902		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
   2903						    addr, PAGE_SIZE);
   2904		kunmap_atomic(addr);
   2905		pages++;
   2906	}
   2907	WARN_ON(pages == 0);
   2908
   2909	/*
   2910	 * The stripe must enter state machine again to call endio, so
   2911	 * don't delay.
   2912	 */
   2913	clear_bit(STRIPE_DELAYED, &sh->state);
   2914	atomic_inc(&sh->count);
   2915
   2916	mutex_lock(&log->io_mutex);
   2917	/* meta + data */
   2918	reserve = (1 + pages) << (PAGE_SHIFT - 9);
   2919
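        	/*
        	 * Under log space pressure the stripe is parked on the
        	 * no_space_stripes list until reclaim frees journal space;
        	 * hitting that path with log_start == last_checkpoint is treated
        	 * as a bug. Otherwise the stripe is written to the journal now.
        	 */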
   2920	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
   2921	    sh->log_start == MaxSector)
   2922		r5l_add_no_space_stripe(log, sh);
   2923	else if (!r5l_has_free_space(log, reserve)) {
   2924		if (sh->log_start == log->last_checkpoint)
   2925			BUG();
   2926		else
   2927			r5l_add_no_space_stripe(log, sh);
   2928	} else {
   2929		ret = r5l_log_stripe(log, sh, pages, 0);
   2930		if (ret) {
   2931			spin_lock_irq(&log->io_list_lock);
   2932			list_add_tail(&sh->log_list, &log->no_mem_stripes);
   2933			spin_unlock_irq(&log->io_list_lock);
   2934		}
   2935	}
   2936
   2937	mutex_unlock(&log->io_mutex);
   2938	return 0;
   2939}
   2940
   2941/* check whether this big stripe is in write back cache. */
   2942bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
   2943{
   2944	struct r5l_log *log = conf->log;
   2945	sector_t tree_index;
   2946	void *slot;
   2947
   2948	if (!log)
   2949		return false;
   2950
   2951	WARN_ON_ONCE(!rcu_read_lock_held());
   2952	tree_index = r5c_tree_index(conf, sect);
   2953	slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
   2954	return slot != NULL;
   2955}
   2956
   2957static int r5l_load_log(struct r5l_log *log)
   2958{
   2959	struct md_rdev *rdev = log->rdev;
   2960	struct page *page;
   2961	struct r5l_meta_block *mb;
   2962	sector_t cp = log->rdev->journal_tail;
   2963	u32 stored_crc, expected_crc;
   2964	bool create_super = false;
   2965	int ret = 0;
   2966
   2967	/* Make sure it's valid */
   2968	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
   2969		cp = 0;
   2970	page = alloc_page(GFP_KERNEL);
   2971	if (!page)
   2972		return -ENOMEM;
   2973
   2974	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
   2975		ret = -EIO;
   2976		goto ioerr;
   2977	}
   2978	mb = page_address(page);
   2979
   2980	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
   2981	    mb->version != R5LOG_VERSION) {
   2982		create_super = true;
   2983		goto create;
   2984	}
   2985	stored_crc = le32_to_cpu(mb->checksum);
   2986	mb->checksum = 0;
   2987	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
   2988	if (stored_crc != expected_crc) {
   2989		create_super = true;
   2990		goto create;
   2991	}
   2992	if (le64_to_cpu(mb->position) != cp) {
   2993		create_super = true;
   2994		goto create;
   2995	}
   2996create:
   2997	if (create_super) {
   2998		log->last_cp_seq = prandom_u32();
   2999		cp = 0;
   3000		r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
   3001		/*
    3002		 * Make sure the super points to the correct address. The log might
    3003		 * have data very soon. If the super doesn't have the correct log
    3004		 * tail address, recovery can't find the log.
   3005		 */
   3006		r5l_write_super(log, cp);
   3007	} else
   3008		log->last_cp_seq = le64_to_cpu(mb->seq);
   3009
   3010	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
   3011	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
   3012	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
   3013		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
   3014	log->last_checkpoint = cp;
   3015
   3016	__free_page(page);
   3017
   3018	if (create_super) {
   3019		log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
   3020		log->seq = log->last_cp_seq + 1;
   3021		log->next_checkpoint = cp;
   3022	} else
   3023		ret = r5l_recovery_log(log);
   3024
   3025	r5c_update_log_state(log);
   3026	return ret;
   3027ioerr:
   3028	__free_page(page);
   3029	return ret;
   3030}
   3031
   3032int r5l_start(struct r5l_log *log)
   3033{
   3034	int ret;
   3035
   3036	if (!log)
   3037		return 0;
   3038
   3039	ret = r5l_load_log(log);
   3040	if (ret) {
   3041		struct mddev *mddev = log->rdev->mddev;
   3042		struct r5conf *conf = mddev->private;
   3043
   3044		r5l_exit_log(conf);
   3045	}
   3046	return ret;
   3047}
   3048
   3049void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
   3050{
   3051	struct r5conf *conf = mddev->private;
   3052	struct r5l_log *log = conf->log;
   3053
   3054	if (!log)
   3055		return;
   3056
   3057	if ((raid5_calc_degraded(conf) > 0 ||
   3058	     test_bit(Journal, &rdev->flags)) &&
   3059	    conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
   3060		schedule_work(&log->disable_writeback_work);
   3061}
   3062
   3063int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
   3064{
   3065	struct request_queue *q = bdev_get_queue(rdev->bdev);
   3066	struct r5l_log *log;
   3067	int ret;
   3068
   3069	pr_debug("md/raid:%s: using device %pg as journal\n",
   3070		 mdname(conf->mddev), rdev->bdev);
   3071
   3072	if (PAGE_SIZE != 4096)
   3073		return -EINVAL;
   3074
   3075	/*
   3076	 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
   3077	 * raid_disks r5l_payload_data_parity.
   3078	 *
    3079	 * The write journal and cache do not work for very big arrays
    3080	 * (raid_disks > 203).
   3081	 */
   3082	if (sizeof(struct r5l_meta_block) +
   3083	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
   3084	     conf->raid_disks) > PAGE_SIZE) {
   3085		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
   3086		       mdname(conf->mddev), conf->raid_disks);
   3087		return -EINVAL;
   3088	}
   3089
   3090	log = kzalloc(sizeof(*log), GFP_KERNEL);
   3091	if (!log)
   3092		return -ENOMEM;
   3093	log->rdev = rdev;
   3094
   3095	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
   3096
   3097	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
   3098				       sizeof(rdev->mddev->uuid));
   3099
   3100	mutex_init(&log->io_mutex);
   3101
   3102	spin_lock_init(&log->io_list_lock);
   3103	INIT_LIST_HEAD(&log->running_ios);
   3104	INIT_LIST_HEAD(&log->io_end_ios);
   3105	INIT_LIST_HEAD(&log->flushing_ios);
   3106	INIT_LIST_HEAD(&log->finished_ios);
   3107
   3108	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
   3109	if (!log->io_kc)
   3110		goto io_kc;
   3111
   3112	ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc);
   3113	if (ret)
   3114		goto io_pool;
   3115
   3116	ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
   3117	if (ret)
   3118		goto io_bs;
   3119
   3120	ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0);
   3121	if (ret)
   3122		goto out_mempool;
   3123
   3124	spin_lock_init(&log->tree_lock);
   3125	INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
   3126
   3127	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
   3128						 log->rdev->mddev, "reclaim");
   3129	if (!log->reclaim_thread)
   3130		goto reclaim_thread;
   3131	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
   3132
   3133	init_waitqueue_head(&log->iounit_wait);
   3134
   3135	INIT_LIST_HEAD(&log->no_mem_stripes);
   3136
   3137	INIT_LIST_HEAD(&log->no_space_stripes);
   3138	spin_lock_init(&log->no_space_stripes_lock);
   3139
   3140	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
   3141	INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);
   3142
   3143	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
   3144	INIT_LIST_HEAD(&log->stripe_in_journal_list);
   3145	spin_lock_init(&log->stripe_in_journal_lock);
   3146	atomic_set(&log->stripe_in_journal_count, 0);
   3147
   3148	rcu_assign_pointer(conf->log, log);
   3149
   3150	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
   3151	return 0;
   3152
   3153reclaim_thread:
   3154	mempool_exit(&log->meta_pool);
   3155out_mempool:
   3156	bioset_exit(&log->bs);
   3157io_bs:
   3158	mempool_exit(&log->io_pool);
   3159io_pool:
   3160	kmem_cache_destroy(log->io_kc);
   3161io_kc:
   3162	kfree(log);
   3163	return -EINVAL;
   3164}
   3165
   3166void r5l_exit_log(struct r5conf *conf)
   3167{
   3168	struct r5l_log *log = conf->log;
   3169
   3170	conf->log = NULL;
   3171	synchronize_rcu();
   3172
   3173	/* Ensure disable_writeback_work wakes up and exits */
   3174	wake_up(&conf->mddev->sb_wait);
   3175	flush_work(&log->disable_writeback_work);
   3176	md_unregister_thread(&log->reclaim_thread);
   3177	mempool_exit(&log->meta_pool);
   3178	bioset_exit(&log->bs);
   3179	mempool_exit(&log->io_pool);
   3180	kmem_cache_destroy(log->io_kc);
   3181	kfree(log);
   3182}