cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

dm-mpath.c (56843B)


      1/*
      2 * Copyright (C) 2003 Sistina Software Limited.
      3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
      4 *
      5 * This file is released under the GPL.
      6 */
      7
      8#include <linux/device-mapper.h>
      9
     10#include "dm-rq.h"
     11#include "dm-bio-record.h"
     12#include "dm-path-selector.h"
     13#include "dm-uevent.h"
     14
     15#include <linux/blkdev.h>
     16#include <linux/ctype.h>
     17#include <linux/init.h>
     18#include <linux/mempool.h>
     19#include <linux/module.h>
     20#include <linux/pagemap.h>
     21#include <linux/slab.h>
     22#include <linux/time.h>
     23#include <linux/timer.h>
     24#include <linux/workqueue.h>
     25#include <linux/delay.h>
     26#include <scsi/scsi_dh.h>
     27#include <linux/atomic.h>
     28#include <linux/blk-mq.h>
     29
     30#define DM_MSG_PREFIX "multipath"
     31#define DM_PG_INIT_DELAY_MSECS 2000
     32#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
     33#define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0
     34
     35static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
     36
     37/* Path properties */
     38struct pgpath {
     39	struct list_head list;
     40
     41	struct priority_group *pg;	/* Owning PG */
     42	unsigned fail_count;		/* Cumulative failure count */
     43
     44	struct dm_path path;
     45	struct delayed_work activate_path;
     46
     47	bool is_active:1;		/* Path status */
     48};
     49
     50#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
     51
     52/*
     53 * Paths are grouped into Priority Groups and numbered from 1 upwards.
     54 * Each has a path selector which controls which path gets used.
     55 */
     56struct priority_group {
     57	struct list_head list;
     58
     59	struct multipath *m;		/* Owning multipath instance */
     60	struct path_selector ps;
     61
     62	unsigned pg_num;		/* Reference number */
     63	unsigned nr_pgpaths;		/* Number of paths in PG */
     64	struct list_head pgpaths;
     65
     66	bool bypassed:1;		/* Temporarily bypass this PG? */
     67};
     68
     69/* Multipath context */
     70struct multipath {
     71	unsigned long flags;		/* Multipath state flags */
     72
     73	spinlock_t lock;
     74	enum dm_queue_mode queue_mode;
     75
     76	struct pgpath *current_pgpath;
     77	struct priority_group *current_pg;
     78	struct priority_group *next_pg;	/* Switch to this PG if set */
     79
     80	atomic_t nr_valid_paths;	/* Total number of usable paths */
     81	unsigned nr_priority_groups;
     82	struct list_head priority_groups;
     83
     84	const char *hw_handler_name;
     85	char *hw_handler_params;
     86	wait_queue_head_t pg_init_wait;	/* Wait for pg_init completion */
     87	unsigned pg_init_retries;	/* Number of times to retry pg_init */
     88	unsigned pg_init_delay_msecs;	/* Number of msecs before pg_init retry */
     89	atomic_t pg_init_in_progress;	/* Only one pg_init allowed at once */
     90	atomic_t pg_init_count;		/* Number of times pg_init called */
     91
     92	struct mutex work_mutex;
     93	struct work_struct trigger_event;
     94	struct dm_target *ti;
     95
     96	struct work_struct process_queued_bios;
     97	struct bio_list queued_bios;
     98
     99	struct timer_list nopath_timer;	/* Timeout for queue_if_no_path */
    100};
    101
    102/*
    103 * Context information attached to each io we process.
    104 */
    105struct dm_mpath_io {
    106	struct pgpath *pgpath;
    107	size_t nr_bytes;
    108	u64 start_time_ns;
    109};
    110
    111typedef int (*action_fn) (struct pgpath *pgpath);
    112
    113static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
    114static void trigger_event(struct work_struct *work);
    115static void activate_or_offline_path(struct pgpath *pgpath);
    116static void activate_path_work(struct work_struct *work);
    117static void process_queued_bios(struct work_struct *work);
    118static void queue_if_no_path_timeout_work(struct timer_list *t);
    119
    120/*-----------------------------------------------
    121 * Multipath state flags.
    122 *-----------------------------------------------*/
    123
    124#define MPATHF_QUEUE_IO 0			/* Must we queue all I/O? */
    125#define MPATHF_QUEUE_IF_NO_PATH 1		/* Queue I/O if last path fails? */
    126#define MPATHF_SAVED_QUEUE_IF_NO_PATH 2		/* Saved state during suspension */
    127#define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3	/* If there's already a hw_handler present, don't change it. */
    128#define MPATHF_PG_INIT_DISABLED 4		/* pg_init is not currently allowed */
    129#define MPATHF_PG_INIT_REQUIRED 5		/* pg_init needs calling? */
    130#define MPATHF_PG_INIT_DELAY_RETRY 6		/* Delay pg_init retry? */
    131
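/*
 * Lockless fast-path test of an MPATHF_* flag: only when the bit appears set
 * do we take m->lock and re-check it, so the common clear case avoids the
 * spinlock entirely.
 */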
    132static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m)
    133{
    134	bool r = test_bit(MPATHF_bit, &m->flags);
    135
    136	if (r) {
    137		unsigned long flags;
    138		spin_lock_irqsave(&m->lock, flags);
    139		r = test_bit(MPATHF_bit, &m->flags);
    140		spin_unlock_irqrestore(&m->lock, flags);
    141	}
    142
    143	return r;
    144}
    145
    146/*-----------------------------------------------
    147 * Allocation routines
    148 *-----------------------------------------------*/
    149
    150static struct pgpath *alloc_pgpath(void)
    151{
    152	struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
    153
    154	if (!pgpath)
    155		return NULL;
    156
    157	pgpath->is_active = true;
    158
    159	return pgpath;
    160}
    161
    162static void free_pgpath(struct pgpath *pgpath)
    163{
    164	kfree(pgpath);
    165}
    166
    167static struct priority_group *alloc_priority_group(void)
    168{
    169	struct priority_group *pg;
    170
    171	pg = kzalloc(sizeof(*pg), GFP_KERNEL);
    172
    173	if (pg)
    174		INIT_LIST_HEAD(&pg->pgpaths);
    175
    176	return pg;
    177}
    178
    179static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
    180{
    181	struct pgpath *pgpath, *tmp;
    182
    183	list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
    184		list_del(&pgpath->list);
    185		dm_put_device(ti, pgpath->path.dev);
    186		free_pgpath(pgpath);
    187	}
    188}
    189
    190static void free_priority_group(struct priority_group *pg,
    191				struct dm_target *ti)
    192{
    193	struct path_selector *ps = &pg->ps;
    194
    195	if (ps->type) {
    196		ps->type->destroy(ps);
    197		dm_put_path_selector(ps->type);
    198	}
    199
    200	free_pgpaths(&pg->pgpaths, ti);
    201	kfree(pg);
    202}
    203
    204static struct multipath *alloc_multipath(struct dm_target *ti)
    205{
    206	struct multipath *m;
    207
    208	m = kzalloc(sizeof(*m), GFP_KERNEL);
    209	if (m) {
    210		INIT_LIST_HEAD(&m->priority_groups);
    211		spin_lock_init(&m->lock);
    212		atomic_set(&m->nr_valid_paths, 0);
    213		INIT_WORK(&m->trigger_event, trigger_event);
    214		mutex_init(&m->work_mutex);
    215
    216		m->queue_mode = DM_TYPE_NONE;
    217
    218		m->ti = ti;
    219		ti->private = m;
    220
    221		timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0);
    222	}
    223
    224	return m;
    225}
    226
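/*
 * Second stage of multipath allocation, called from multipath_ctr() once
 * parse_features() has settled m->queue_mode: default to request-based,
 * and for bio-based set up the queued-bio worker and scsi_dh discovery.
 */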
    227static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
    228{
    229	if (m->queue_mode == DM_TYPE_NONE) {
    230		m->queue_mode = DM_TYPE_REQUEST_BASED;
    231	} else if (m->queue_mode == DM_TYPE_BIO_BASED) {
    232		INIT_WORK(&m->process_queued_bios, process_queued_bios);
    233		/*
    234		 * bio-based doesn't support any direct scsi_dh management;
    235		 * it just discovers if a scsi_dh is attached.
    236		 */
    237		set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
    238	}
    239
    240	dm_table_set_type(ti->table, m->queue_mode);
    241
    242	/*
    243	 * Init fields that are only used when a scsi_dh is attached
    244	 * - must do this unconditionally (really doesn't hurt non-SCSI uses)
    245	 */
    246	set_bit(MPATHF_QUEUE_IO, &m->flags);
    247	atomic_set(&m->pg_init_in_progress, 0);
    248	atomic_set(&m->pg_init_count, 0);
    249	m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
    250	init_waitqueue_head(&m->pg_init_wait);
    251
    252	return 0;
    253}
    254
    255static void free_multipath(struct multipath *m)
    256{
    257	struct priority_group *pg, *tmp;
    258
    259	list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
    260		list_del(&pg->list);
    261		free_priority_group(pg, m->ti);
    262	}
    263
    264	kfree(m->hw_handler_name);
    265	kfree(m->hw_handler_params);
    266	mutex_destroy(&m->work_mutex);
    267	kfree(m);
    268}
    269
    270static struct dm_mpath_io *get_mpio(union map_info *info)
    271{
    272	return info->ptr;
    273}
    274
    275static size_t multipath_per_bio_data_size(void)
    276{
    277	return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details);
    278}
    279
    280static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio)
    281{
    282	return dm_per_bio_data(bio, multipath_per_bio_data_size());
    283}
    284
    285static struct dm_bio_details *get_bio_details_from_mpio(struct dm_mpath_io *mpio)
    286{
    287	/* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */
    288	void *bio_details = mpio + 1;
    289	return bio_details;
    290}
    291
    292static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p)
    293{
    294	struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
    295	struct dm_bio_details *bio_details = get_bio_details_from_mpio(mpio);
    296
    297	mpio->nr_bytes = bio->bi_iter.bi_size;
    298	mpio->pgpath = NULL;
    299	mpio->start_time_ns = 0;
    300	*mpio_p = mpio;
    301
    302	dm_bio_record(bio_details, bio);
    303}
    304
    305/*-----------------------------------------------
    306 * Path selection
    307 *-----------------------------------------------*/
    308
    309static int __pg_init_all_paths(struct multipath *m)
    310{
    311	struct pgpath *pgpath;
    312	unsigned long pg_init_delay = 0;
    313
    314	lockdep_assert_held(&m->lock);
    315
    316	if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
    317		return 0;
    318
    319	atomic_inc(&m->pg_init_count);
    320	clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
    321
    322	/* Check here to reset pg_init_required */
    323	if (!m->current_pg)
    324		return 0;
    325
    326	if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags))
    327		pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
    328						 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
    329	list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
    330		/* Skip failed paths */
    331		if (!pgpath->is_active)
    332			continue;
    333		if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
    334				       pg_init_delay))
    335			atomic_inc(&m->pg_init_in_progress);
    336	}
    337	return atomic_read(&m->pg_init_in_progress);
    338}
    339
    340static int pg_init_all_paths(struct multipath *m)
    341{
    342	int ret;
    343	unsigned long flags;
    344
    345	spin_lock_irqsave(&m->lock, flags);
    346	ret = __pg_init_all_paths(m);
    347	spin_unlock_irqrestore(&m->lock, flags);
    348
    349	return ret;
    350}
    351
    352static void __switch_pg(struct multipath *m, struct priority_group *pg)
    353{
    354	lockdep_assert_held(&m->lock);
    355
    356	m->current_pg = pg;
    357
    358	/* Must we initialise the PG first, and queue I/O till it's ready? */
    359	if (m->hw_handler_name) {
    360		set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
    361		set_bit(MPATHF_QUEUE_IO, &m->flags);
    362	} else {
    363		clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
    364		clear_bit(MPATHF_QUEUE_IO, &m->flags);
    365	}
    366
    367	atomic_set(&m->pg_init_count, 0);
    368}
    369
    370static struct pgpath *choose_path_in_pg(struct multipath *m,
    371					struct priority_group *pg,
    372					size_t nr_bytes)
    373{
    374	unsigned long flags;
    375	struct dm_path *path;
    376	struct pgpath *pgpath;
    377
    378	path = pg->ps.type->select_path(&pg->ps, nr_bytes);
    379	if (!path)
    380		return ERR_PTR(-ENXIO);
    381
    382	pgpath = path_to_pgpath(path);
    383
    384	if (unlikely(READ_ONCE(m->current_pg) != pg)) {
    385		/* Only update current_pgpath if pg changed */
    386		spin_lock_irqsave(&m->lock, flags);
    387		m->current_pgpath = pgpath;
    388		__switch_pg(m, pg);
    389		spin_unlock_irqrestore(&m->lock, flags);
    390	}
    391
    392	return pgpath;
    393}
    394
    395static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
    396{
    397	unsigned long flags;
    398	struct priority_group *pg;
    399	struct pgpath *pgpath;
    400	unsigned bypassed = 1;
    401
    402	if (!atomic_read(&m->nr_valid_paths)) {
    403		spin_lock_irqsave(&m->lock, flags);
    404		clear_bit(MPATHF_QUEUE_IO, &m->flags);
    405		spin_unlock_irqrestore(&m->lock, flags);
    406		goto failed;
    407	}
    408
    409	/* Were we instructed to switch PG? */
    410	if (READ_ONCE(m->next_pg)) {
    411		spin_lock_irqsave(&m->lock, flags);
    412		pg = m->next_pg;
    413		if (!pg) {
    414			spin_unlock_irqrestore(&m->lock, flags);
    415			goto check_current_pg;
    416		}
    417		m->next_pg = NULL;
    418		spin_unlock_irqrestore(&m->lock, flags);
    419		pgpath = choose_path_in_pg(m, pg, nr_bytes);
    420		if (!IS_ERR_OR_NULL(pgpath))
    421			return pgpath;
    422	}
    423
    424	/* Don't change PG until it has no remaining paths */
    425check_current_pg:
    426	pg = READ_ONCE(m->current_pg);
    427	if (pg) {
    428		pgpath = choose_path_in_pg(m, pg, nr_bytes);
    429		if (!IS_ERR_OR_NULL(pgpath))
    430			return pgpath;
    431	}
    432
    433	/*
    434	 * Loop through priority groups until we find a valid path.
    435	 * First time we skip PGs marked 'bypassed'.
    436	 * Second time we only try the ones we skipped, but set
    437	 * pg_init_delay_retry so we do not hammer controllers.
    438	 */
    439	do {
    440		list_for_each_entry(pg, &m->priority_groups, list) {
    441			if (pg->bypassed == !!bypassed)
    442				continue;
    443			pgpath = choose_path_in_pg(m, pg, nr_bytes);
    444			if (!IS_ERR_OR_NULL(pgpath)) {
    445				if (!bypassed) {
    446					spin_lock_irqsave(&m->lock, flags);
    447					set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
    448					spin_unlock_irqrestore(&m->lock, flags);
    449				}
    450				return pgpath;
    451			}
    452		}
    453	} while (bypassed--);
    454
    455failed:
    456	spin_lock_irqsave(&m->lock, flags);
    457	m->current_pgpath = NULL;
    458	m->current_pg = NULL;
    459	spin_unlock_irqrestore(&m->lock, flags);
    460
    461	return NULL;
    462}
    463
    464/*
    465 * dm_report_EIO() is a macro instead of a function to make pr_debug_ratelimited()
    466 * report the function name and line number of the function from which
    467 * it has been invoked.
    468 */
    469#define dm_report_EIO(m)						\
    470do {									\
    471	DMDEBUG_LIMIT("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d", \
    472		      dm_table_device_name((m)->ti->table),		\
    473		      test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags),	\
    474		      test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \
    475		      dm_noflush_suspending((m)->ti));			\
    476} while (0)
    477
    478/*
    479 * Check whether bios must be queued in the device-mapper core rather
    480 * than here in the target.
    481 */
    482static bool __must_push_back(struct multipath *m)
    483{
    484	return dm_noflush_suspending(m->ti);
    485}
    486
    487static bool must_push_back_rq(struct multipath *m)
    488{
    489	unsigned long flags;
    490	bool ret;
    491
    492	spin_lock_irqsave(&m->lock, flags);
    493	ret = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m));
    494	spin_unlock_irqrestore(&m->lock, flags);
    495
    496	return ret;
    497}
    498
    499/*
    500 * Map cloned requests (request-based multipath)
    501 */
    502static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
    503				   union map_info *map_context,
    504				   struct request **__clone)
    505{
    506	struct multipath *m = ti->private;
    507	size_t nr_bytes = blk_rq_bytes(rq);
    508	struct pgpath *pgpath;
    509	struct block_device *bdev;
    510	struct dm_mpath_io *mpio = get_mpio(map_context);
    511	struct request_queue *q;
    512	struct request *clone;
    513
    514	/* Do we need to select a new pgpath? */
    515	pgpath = READ_ONCE(m->current_pgpath);
    516	if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
    517		pgpath = choose_pgpath(m, nr_bytes);
    518
    519	if (!pgpath) {
    520		if (must_push_back_rq(m))
    521			return DM_MAPIO_DELAY_REQUEUE;
    522		dm_report_EIO(m);	/* Failed */
    523		return DM_MAPIO_KILL;
    524	} else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) ||
    525		   mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) {
    526		pg_init_all_paths(m);
    527		return DM_MAPIO_DELAY_REQUEUE;
    528	}
    529
    530	mpio->pgpath = pgpath;
    531	mpio->nr_bytes = nr_bytes;
    532
    533	bdev = pgpath->path.dev->bdev;
    534	q = bdev_get_queue(bdev);
    535	clone = blk_mq_alloc_request(q, rq->cmd_flags | REQ_NOMERGE,
    536			BLK_MQ_REQ_NOWAIT);
    537	if (IS_ERR(clone)) {
    538		/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
    539		if (blk_queue_dying(q)) {
    540			atomic_inc(&m->pg_init_in_progress);
    541			activate_or_offline_path(pgpath);
    542			return DM_MAPIO_DELAY_REQUEUE;
    543		}
    544
    545		/*
    546		 * blk-mq's SCHED_RESTART can cover this requeue, so we
    547		 * needn't deal with it by DELAY_REQUEUE. More importantly,
    548		 * we have to return DM_MAPIO_REQUEUE so that blk-mq can
    549		 * get the queue busy feedback (via BLK_STS_RESOURCE),
    550		 * otherwise I/O merging can suffer.
    551		 */
    552		return DM_MAPIO_REQUEUE;
    553	}
    554	clone->bio = clone->biotail = NULL;
    555	clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
    556	*__clone = clone;
    557
    558	if (pgpath->pg->ps.type->start_io)
    559		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
    560					      &pgpath->path,
    561					      nr_bytes);
    562	return DM_MAPIO_REMAPPED;
    563}
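/*
 * The return values above follow the dm-rq convention: DM_MAPIO_REMAPPED
 * hands the clone to the block layer, DM_MAPIO_REQUEUE and
 * DM_MAPIO_DELAY_REQUEUE push the original request back for a (possibly
 * delayed) retry, and DM_MAPIO_KILL fails the request outright.
 */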
    564
    565static void multipath_release_clone(struct request *clone,
    566				    union map_info *map_context)
    567{
    568	if (unlikely(map_context)) {
    569		/*
    570		 * non-NULL map_context means caller is still map
    571		 * method; must undo multipath_clone_and_map()
    572		 */
    573		struct dm_mpath_io *mpio = get_mpio(map_context);
    574		struct pgpath *pgpath = mpio->pgpath;
    575
    576		if (pgpath && pgpath->pg->ps.type->end_io)
    577			pgpath->pg->ps.type->end_io(&pgpath->pg->ps,
    578						    &pgpath->path,
    579						    mpio->nr_bytes,
    580						    clone->io_start_time_ns);
    581	}
    582
    583	blk_mq_free_request(clone);
    584}
    585
    586/*
    587 * Map cloned bios (bio-based multipath)
    588 */
    589
    590static void __multipath_queue_bio(struct multipath *m, struct bio *bio)
    591{
    592	/* Queue for the daemon to resubmit */
    593	bio_list_add(&m->queued_bios, bio);
    594	if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
    595		queue_work(kmultipathd, &m->process_queued_bios);
    596}
    597
    598static void multipath_queue_bio(struct multipath *m, struct bio *bio)
    599{
    600	unsigned long flags;
    601
    602	spin_lock_irqsave(&m->lock, flags);
    603	__multipath_queue_bio(m, bio);
    604	spin_unlock_irqrestore(&m->lock, flags);
    605}
    606
    607static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
    608{
    609	struct pgpath *pgpath;
    610	unsigned long flags;
    611
    612	/* Do we need to select a new pgpath? */
    613	pgpath = READ_ONCE(m->current_pgpath);
    614	if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
    615		pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
    616
    617	if (!pgpath) {
    618		spin_lock_irqsave(&m->lock, flags);
    619		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
    620			__multipath_queue_bio(m, bio);
    621			pgpath = ERR_PTR(-EAGAIN);
    622		}
    623		spin_unlock_irqrestore(&m->lock, flags);
    624
    625	} else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) ||
    626		   mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) {
    627		multipath_queue_bio(m, bio);
    628		pg_init_all_paths(m);
    629		return ERR_PTR(-EAGAIN);
    630	}
    631
    632	return pgpath;
    633}
    634
    635static int __multipath_map_bio(struct multipath *m, struct bio *bio,
    636			       struct dm_mpath_io *mpio)
    637{
    638	struct pgpath *pgpath = __map_bio(m, bio);
    639
    640	if (IS_ERR(pgpath))
    641		return DM_MAPIO_SUBMITTED;
    642
    643	if (!pgpath) {
    644		if (__must_push_back(m))
    645			return DM_MAPIO_REQUEUE;
    646		dm_report_EIO(m);
    647		return DM_MAPIO_KILL;
    648	}
    649
    650	mpio->pgpath = pgpath;
    651
    652	if (dm_ps_use_hr_timer(pgpath->pg->ps.type))
    653		mpio->start_time_ns = ktime_get_ns();
    654
    655	bio->bi_status = 0;
    656	bio_set_dev(bio, pgpath->path.dev->bdev);
    657	bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
    658
    659	if (pgpath->pg->ps.type->start_io)
    660		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
    661					      &pgpath->path,
    662					      mpio->nr_bytes);
    663	return DM_MAPIO_REMAPPED;
    664}
    665
    666static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
    667{
    668	struct multipath *m = ti->private;
    669	struct dm_mpath_io *mpio = NULL;
    670
    671	multipath_init_per_bio_data(bio, &mpio);
    672	return __multipath_map_bio(m, bio, mpio);
    673}
    674
    675static void process_queued_io_list(struct multipath *m)
    676{
    677	if (m->queue_mode == DM_TYPE_REQUEST_BASED)
    678		dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table));
    679	else if (m->queue_mode == DM_TYPE_BIO_BASED)
    680		queue_work(kmultipathd, &m->process_queued_bios);
    681}
    682
    683static void process_queued_bios(struct work_struct *work)
    684{
    685	int r;
    686	unsigned long flags;
    687	struct bio *bio;
    688	struct bio_list bios;
    689	struct blk_plug plug;
    690	struct multipath *m =
    691		container_of(work, struct multipath, process_queued_bios);
    692
    693	bio_list_init(&bios);
    694
    695	spin_lock_irqsave(&m->lock, flags);
    696
    697	if (bio_list_empty(&m->queued_bios)) {
    698		spin_unlock_irqrestore(&m->lock, flags);
    699		return;
    700	}
    701
    702	bio_list_merge(&bios, &m->queued_bios);
    703	bio_list_init(&m->queued_bios);
    704
    705	spin_unlock_irqrestore(&m->lock, flags);
    706
    707	blk_start_plug(&plug);
    708	while ((bio = bio_list_pop(&bios))) {
    709		struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
    710		dm_bio_restore(get_bio_details_from_mpio(mpio), bio);
    711		r = __multipath_map_bio(m, bio, mpio);
    712		switch (r) {
    713		case DM_MAPIO_KILL:
    714			bio->bi_status = BLK_STS_IOERR;
    715			bio_endio(bio);
    716			break;
    717		case DM_MAPIO_REQUEUE:
    718			bio->bi_status = BLK_STS_DM_REQUEUE;
    719			bio_endio(bio);
    720			break;
    721		case DM_MAPIO_REMAPPED:
    722			submit_bio_noacct(bio);
    723			break;
    724		case DM_MAPIO_SUBMITTED:
    725			break;
    726		default:
    727			WARN_ONCE(true, "__multipath_map_bio() returned %d\n", r);
    728		}
    729	}
    730	blk_finish_plug(&plug);
    731}
    732
    733/*
    734 * If we run out of usable paths, should we queue I/O or error it?
    735 */
    736static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
    737			    bool save_old_value, const char *caller)
    738{
    739	unsigned long flags;
    740	bool queue_if_no_path_bit, saved_queue_if_no_path_bit;
    741	const char *dm_dev_name = dm_table_device_name(m->ti->table);
    742
    743	DMDEBUG("%s: %s caller=%s queue_if_no_path=%d save_old_value=%d",
    744		dm_dev_name, __func__, caller, queue_if_no_path, save_old_value);
    745
    746	spin_lock_irqsave(&m->lock, flags);
    747
    748	queue_if_no_path_bit = test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
    749	saved_queue_if_no_path_bit = test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
    750
    751	if (save_old_value) {
    752		if (unlikely(!queue_if_no_path_bit && saved_queue_if_no_path_bit)) {
    753			DMERR("%s: QIFNP disabled but saved as enabled, saving again loses state, not saving!",
    754			      dm_dev_name);
    755		} else
    756			assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path_bit);
    757	} else if (!queue_if_no_path && saved_queue_if_no_path_bit) {
    758		/* due to "fail_if_no_path" message, need to honor it. */
    759		clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
    760	}
    761	assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path);
    762
    763	DMDEBUG("%s: after %s changes; QIFNP = %d; SQIFNP = %d; DNFS = %d",
    764		dm_dev_name, __func__,
    765		test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags),
    766		test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags),
    767		dm_noflush_suspending(m->ti));
    768
    769	spin_unlock_irqrestore(&m->lock, flags);
    770
    771	if (!queue_if_no_path) {
    772		dm_table_run_md_queue_async(m->ti->table);
    773		process_queued_io_list(m);
    774	}
    775
    776	return 0;
    777}
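/*
 * queue_if_no_path is set at table load time via the "queue_if_no_path"
 * feature argument (see parse_features() below) and is typically toggled at
 * run time through the target message interface (multipath_message(),
 * further down in this file), e.g.:
 *
 *   dmsetup message <mpath-dev> 0 fail_if_no_path
 *
 * which is why the "fail_if_no_path" case above also clears the saved bit.
 */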
    778
    779/*
    780 * If the queue_if_no_path timeout fires, turn off queue_if_no_path and
    781 * process any queued I/O.
    782 */
    783static void queue_if_no_path_timeout_work(struct timer_list *t)
    784{
    785	struct multipath *m = from_timer(m, t, nopath_timer);
    786
    787	DMWARN("queue_if_no_path timeout on %s, failing queued IO",
    788	       dm_table_device_name(m->ti->table));
    789	queue_if_no_path(m, false, false, __func__);
    790}
    791
    792/*
    793 * Enable the queue_if_no_path timeout if necessary.
    794 * Called with m->lock held.
    795 */
    796static void enable_nopath_timeout(struct multipath *m)
    797{
    798	unsigned long queue_if_no_path_timeout =
    799		READ_ONCE(queue_if_no_path_timeout_secs) * HZ;
    800
    801	lockdep_assert_held(&m->lock);
    802
    803	if (queue_if_no_path_timeout > 0 &&
    804	    atomic_read(&m->nr_valid_paths) == 0 &&
    805	    test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
    806		mod_timer(&m->nopath_timer,
    807			  jiffies + queue_if_no_path_timeout);
    808	}
    809}
    810
    811static void disable_nopath_timeout(struct multipath *m)
    812{
    813	del_timer_sync(&m->nopath_timer);
    814}
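/*
 * queue_if_no_path_timeout_secs (declared near the top of this file) is a
 * module parameter, registered near the end of the file, and is typically
 * tunable at run time via
 * /sys/module/dm_multipath/parameters/queue_if_no_path_timeout_secs.
 */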
    815
    816/*
    817 * An event is triggered whenever a path is taken out of use.
    818 * Includes path failure and PG bypass.
    819 */
    820static void trigger_event(struct work_struct *work)
    821{
    822	struct multipath *m =
    823		container_of(work, struct multipath, trigger_event);
    824
    825	dm_table_event(m->ti->table);
    826}
    827
    828/*-----------------------------------------------------------------
    829 * Constructor/argument parsing:
    830 * <#multipath feature args> [<arg>]*
    831 * <#hw_handler args> [hw_handler [<arg>]*]
    832 * <#priority groups>
    833 * <initial priority group>
    834 *     [<selector> <#selector args> [<arg>]*
    835 *      <#paths> <#per-path selector args>
    836 *         [<path> [<arg>]* ]+ ]+
    837 *---------------------------------------------------------------*/
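/*
 * For example, a dmsetup table line using this format for two round-robin
 * priority groups (no feature or hw_handler args, group 1 tried first, one
 * per-path selector arg, the repeat count) might look like:
 *
 *   0 71014400 multipath 0 0 2 1 \
 *       round-robin 0 2 1 66:112 1000 65:240 1000 \
 *       round-robin 0 2 1 8:0 1000 67:192 1000
 */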
    838static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
    839			       struct dm_target *ti)
    840{
    841	int r;
    842	struct path_selector_type *pst;
    843	unsigned ps_argc;
    844
    845	static const struct dm_arg _args[] = {
    846		{0, 1024, "invalid number of path selector args"},
    847	};
    848
    849	pst = dm_get_path_selector(dm_shift_arg(as));
    850	if (!pst) {
    851		ti->error = "unknown path selector type";
    852		return -EINVAL;
    853	}
    854
    855	r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
    856	if (r) {
    857		dm_put_path_selector(pst);
    858		return -EINVAL;
    859	}
    860
    861	r = pst->create(&pg->ps, ps_argc, as->argv);
    862	if (r) {
    863		dm_put_path_selector(pst);
    864		ti->error = "path selector constructor failed";
    865		return r;
    866	}
    867
    868	pg->ps.type = pst;
    869	dm_consume_args(as, ps_argc);
    870
    871	return 0;
    872}
    873
    874static int setup_scsi_dh(struct block_device *bdev, struct multipath *m,
    875			 const char **attached_handler_name, char **error)
    876{
    877	struct request_queue *q = bdev_get_queue(bdev);
    878	int r;
    879
    880	if (mpath_double_check_test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, m)) {
    881retain:
    882		if (*attached_handler_name) {
    883			/*
    884			 * Clear any hw_handler_params associated with a
    885			 * handler that isn't already attached.
    886			 */
    887			if (m->hw_handler_name && strcmp(*attached_handler_name, m->hw_handler_name)) {
    888				kfree(m->hw_handler_params);
    889				m->hw_handler_params = NULL;
    890			}
    891
    892			/*
    893			 * Reset hw_handler_name to match the attached handler
    894			 *
    895			 * NB. This modifies the table line to show the actual
    896			 * handler instead of the original table passed in.
    897			 */
    898			kfree(m->hw_handler_name);
    899			m->hw_handler_name = *attached_handler_name;
    900			*attached_handler_name = NULL;
    901		}
    902	}
    903
    904	if (m->hw_handler_name) {
    905		r = scsi_dh_attach(q, m->hw_handler_name);
    906		if (r == -EBUSY) {
    907			DMINFO("retaining handler on device %pg", bdev);
    908			goto retain;
    909		}
    910		if (r < 0) {
    911			*error = "error attaching hardware handler";
    912			return r;
    913		}
    914
    915		if (m->hw_handler_params) {
    916			r = scsi_dh_set_params(q, m->hw_handler_params);
    917			if (r < 0) {
    918				*error = "unable to set hardware handler parameters";
    919				return r;
    920			}
    921		}
    922	}
    923
    924	return 0;
    925}
    926
    927static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
    928				 struct dm_target *ti)
    929{
    930	int r;
    931	struct pgpath *p;
    932	struct multipath *m = ti->private;
    933	struct request_queue *q;
    934	const char *attached_handler_name = NULL;
    935
    936	/* we need at least a path arg */
    937	if (as->argc < 1) {
    938		ti->error = "no device given";
    939		return ERR_PTR(-EINVAL);
    940	}
    941
    942	p = alloc_pgpath();
    943	if (!p)
    944		return ERR_PTR(-ENOMEM);
    945
    946	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
    947			  &p->path.dev);
    948	if (r) {
    949		ti->error = "error getting device";
    950		goto bad;
    951	}
    952
    953	q = bdev_get_queue(p->path.dev->bdev);
    954	attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
    955	if (attached_handler_name || m->hw_handler_name) {
    956		INIT_DELAYED_WORK(&p->activate_path, activate_path_work);
    957		r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error);
    958		kfree(attached_handler_name);
    959		if (r) {
    960			dm_put_device(ti, p->path.dev);
    961			goto bad;
    962		}
    963	}
    964
    965	r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
    966	if (r) {
    967		dm_put_device(ti, p->path.dev);
    968		goto bad;
    969	}
    970
    971	return p;
    972 bad:
    973	free_pgpath(p);
    974	return ERR_PTR(r);
    975}
    976
    977static struct priority_group *parse_priority_group(struct dm_arg_set *as,
    978						   struct multipath *m)
    979{
    980	static const struct dm_arg _args[] = {
    981		{1, 1024, "invalid number of paths"},
    982		{0, 1024, "invalid number of selector args"}
    983	};
    984
    985	int r;
    986	unsigned i, nr_selector_args, nr_args;
    987	struct priority_group *pg;
    988	struct dm_target *ti = m->ti;
    989
    990	if (as->argc < 2) {
    991		as->argc = 0;
    992		ti->error = "not enough priority group arguments";
    993		return ERR_PTR(-EINVAL);
    994	}
    995
    996	pg = alloc_priority_group();
    997	if (!pg) {
    998		ti->error = "couldn't allocate priority group";
    999		return ERR_PTR(-ENOMEM);
   1000	}
   1001	pg->m = m;
   1002
   1003	r = parse_path_selector(as, pg, ti);
   1004	if (r)
   1005		goto bad;
   1006
   1007	/*
   1008	 * read the paths
   1009	 */
   1010	r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
   1011	if (r)
   1012		goto bad;
   1013
   1014	r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
   1015	if (r)
   1016		goto bad;
   1017
   1018	nr_args = 1 + nr_selector_args;
   1019	for (i = 0; i < pg->nr_pgpaths; i++) {
   1020		struct pgpath *pgpath;
   1021		struct dm_arg_set path_args;
   1022
   1023		if (as->argc < nr_args) {
   1024			ti->error = "not enough path parameters";
   1025			r = -EINVAL;
   1026			goto bad;
   1027		}
   1028
   1029		path_args.argc = nr_args;
   1030		path_args.argv = as->argv;
   1031
   1032		pgpath = parse_path(&path_args, &pg->ps, ti);
   1033		if (IS_ERR(pgpath)) {
   1034			r = PTR_ERR(pgpath);
   1035			goto bad;
   1036		}
   1037
   1038		pgpath->pg = pg;
   1039		list_add_tail(&pgpath->list, &pg->pgpaths);
   1040		dm_consume_args(as, nr_args);
   1041	}
   1042
   1043	return pg;
   1044
   1045 bad:
   1046	free_priority_group(pg, ti);
   1047	return ERR_PTR(r);
   1048}
   1049
   1050static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
   1051{
   1052	unsigned hw_argc;
   1053	int ret;
   1054	struct dm_target *ti = m->ti;
   1055
   1056	static const struct dm_arg _args[] = {
   1057		{0, 1024, "invalid number of hardware handler args"},
   1058	};
   1059
   1060	if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
   1061		return -EINVAL;
   1062
   1063	if (!hw_argc)
   1064		return 0;
   1065
   1066	if (m->queue_mode == DM_TYPE_BIO_BASED) {
   1067		dm_consume_args(as, hw_argc);
   1068		DMERR("bio-based multipath doesn't allow hardware handler args");
   1069		return 0;
   1070	}
   1071
   1072	m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
   1073	if (!m->hw_handler_name)
   1074		return -EINVAL;
   1075
   1076	if (hw_argc > 1) {
   1077		char *p;
   1078		int i, j, len = 4;
   1079
   1080		for (i = 0; i <= hw_argc - 2; i++)
   1081			len += strlen(as->argv[i]) + 1;
   1082		p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
   1083		if (!p) {
   1084			ti->error = "memory allocation failed";
   1085			ret = -ENOMEM;
   1086			goto fail;
   1087		}
   1088		j = sprintf(p, "%d", hw_argc - 1);
   1089		for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
   1090			j = sprintf(p, "%s", as->argv[i]);
   1091	}
   1092	dm_consume_args(as, hw_argc - 1);
   1093
   1094	return 0;
   1095fail:
   1096	kfree(m->hw_handler_name);
   1097	m->hw_handler_name = NULL;
   1098	return ret;
   1099}
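/*
 * Example hardware handler blocks: "0" (no handler), "1 alua" (attach
 * scsi_dh_alua with no parameters), or "2 <handler> <param>" where the
 * extra words are packed into hw_handler_params for scsi_dh_set_params().
 */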
   1100
   1101static int parse_features(struct dm_arg_set *as, struct multipath *m)
   1102{
   1103	int r;
   1104	unsigned argc;
   1105	struct dm_target *ti = m->ti;
   1106	const char *arg_name;
   1107
   1108	static const struct dm_arg _args[] = {
   1109		{0, 8, "invalid number of feature args"},
   1110		{1, 50, "pg_init_retries must be between 1 and 50"},
   1111		{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
   1112	};
   1113
   1114	r = dm_read_arg_group(_args, as, &argc, &ti->error);
   1115	if (r)
   1116		return -EINVAL;
   1117
   1118	if (!argc)
   1119		return 0;
   1120
   1121	do {
   1122		arg_name = dm_shift_arg(as);
   1123		argc--;
   1124
   1125		if (!strcasecmp(arg_name, "queue_if_no_path")) {
   1126			r = queue_if_no_path(m, true, false, __func__);
   1127			continue;
   1128		}
   1129
   1130		if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
   1131			set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
   1132			continue;
   1133		}
   1134
   1135		if (!strcasecmp(arg_name, "pg_init_retries") &&
   1136		    (argc >= 1)) {
   1137			r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
   1138			argc--;
   1139			continue;
   1140		}
   1141
   1142		if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
   1143		    (argc >= 1)) {
   1144			r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
   1145			argc--;
   1146			continue;
   1147		}
   1148
   1149		if (!strcasecmp(arg_name, "queue_mode") &&
   1150		    (argc >= 1)) {
   1151			const char *queue_mode_name = dm_shift_arg(as);
   1152
   1153			if (!strcasecmp(queue_mode_name, "bio"))
   1154				m->queue_mode = DM_TYPE_BIO_BASED;
   1155			else if (!strcasecmp(queue_mode_name, "rq") ||
   1156				 !strcasecmp(queue_mode_name, "mq"))
   1157				m->queue_mode = DM_TYPE_REQUEST_BASED;
   1158			else {
   1159				ti->error = "Unknown 'queue_mode' requested";
   1160				r = -EINVAL;
   1161			}
   1162			argc--;
   1163			continue;
   1164		}
   1165
   1166		ti->error = "Unrecognised multipath feature request";
   1167		r = -EINVAL;
   1168	} while (argc && !r);
   1169
   1170	return r;
   1171}
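/*
 * Example feature blocks accepted here: "0" (none), "1 queue_if_no_path",
 * "3 queue_if_no_path pg_init_retries 5", or "2 queue_mode bio". The
 * leading count covers every word that follows, including option values.
 */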
   1172
   1173static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
   1174{
   1175	/* target arguments */
   1176	static const struct dm_arg _args[] = {
   1177		{0, 1024, "invalid number of priority groups"},
   1178		{0, 1024, "invalid initial priority group number"},
   1179	};
   1180
   1181	int r;
   1182	struct multipath *m;
   1183	struct dm_arg_set as;
   1184	unsigned pg_count = 0;
   1185	unsigned next_pg_num;
   1186	unsigned long flags;
   1187
   1188	as.argc = argc;
   1189	as.argv = argv;
   1190
   1191	m = alloc_multipath(ti);
   1192	if (!m) {
   1193		ti->error = "can't allocate multipath";
   1194		return -EINVAL;
   1195	}
   1196
   1197	r = parse_features(&as, m);
   1198	if (r)
   1199		goto bad;
   1200
   1201	r = alloc_multipath_stage2(ti, m);
   1202	if (r)
   1203		goto bad;
   1204
   1205	r = parse_hw_handler(&as, m);
   1206	if (r)
   1207		goto bad;
   1208
   1209	r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
   1210	if (r)
   1211		goto bad;
   1212
   1213	r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
   1214	if (r)
   1215		goto bad;
   1216
   1217	if ((!m->nr_priority_groups && next_pg_num) ||
   1218	    (m->nr_priority_groups && !next_pg_num)) {
   1219		ti->error = "invalid initial priority group";
   1220		r = -EINVAL;
   1221		goto bad;
   1222	}
   1223
   1224	/* parse the priority groups */
   1225	while (as.argc) {
   1226		struct priority_group *pg;
   1227		unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths);
   1228
   1229		pg = parse_priority_group(&as, m);
   1230		if (IS_ERR(pg)) {
   1231			r = PTR_ERR(pg);
   1232			goto bad;
   1233		}
   1234
   1235		nr_valid_paths += pg->nr_pgpaths;
   1236		atomic_set(&m->nr_valid_paths, nr_valid_paths);
   1237
   1238		list_add_tail(&pg->list, &m->priority_groups);
   1239		pg_count++;
   1240		pg->pg_num = pg_count;
   1241		if (!--next_pg_num)
   1242			m->next_pg = pg;
   1243	}
   1244
   1245	if (pg_count != m->nr_priority_groups) {
   1246		ti->error = "priority group count mismatch";
   1247		r = -EINVAL;
   1248		goto bad;
   1249	}
   1250
   1251	spin_lock_irqsave(&m->lock, flags);
   1252	enable_nopath_timeout(m);
   1253	spin_unlock_irqrestore(&m->lock, flags);
   1254
   1255	ti->num_flush_bios = 1;
   1256	ti->num_discard_bios = 1;
   1257	ti->num_write_zeroes_bios = 1;
   1258	if (m->queue_mode == DM_TYPE_BIO_BASED)
   1259		ti->per_io_data_size = multipath_per_bio_data_size();
   1260	else
   1261		ti->per_io_data_size = sizeof(struct dm_mpath_io);
   1262
   1263	return 0;
   1264
   1265 bad:
   1266	free_multipath(m);
   1267	return r;
   1268}
   1269
   1270static void multipath_wait_for_pg_init_completion(struct multipath *m)
   1271{
   1272	DEFINE_WAIT(wait);
   1273
   1274	while (1) {
   1275		prepare_to_wait(&m->pg_init_wait, &wait, TASK_UNINTERRUPTIBLE);
   1276
   1277		if (!atomic_read(&m->pg_init_in_progress))
   1278			break;
   1279
   1280		io_schedule();
   1281	}
   1282	finish_wait(&m->pg_init_wait, &wait);
   1283}
   1284
   1285static void flush_multipath_work(struct multipath *m)
   1286{
   1287	if (m->hw_handler_name) {
   1288		unsigned long flags;
   1289
   1290		if (!atomic_read(&m->pg_init_in_progress))
   1291			goto skip;
   1292
   1293		spin_lock_irqsave(&m->lock, flags);
   1294		if (atomic_read(&m->pg_init_in_progress) &&
   1295		    !test_and_set_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) {
   1296			spin_unlock_irqrestore(&m->lock, flags);
   1297
   1298			flush_workqueue(kmpath_handlerd);
   1299			multipath_wait_for_pg_init_completion(m);
   1300
   1301			spin_lock_irqsave(&m->lock, flags);
   1302			clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
   1303		}
   1304		spin_unlock_irqrestore(&m->lock, flags);
   1305	}
   1306skip:
   1307	if (m->queue_mode == DM_TYPE_BIO_BASED)
   1308		flush_work(&m->process_queued_bios);
   1309	flush_work(&m->trigger_event);
   1310}
   1311
   1312static void multipath_dtr(struct dm_target *ti)
   1313{
   1314	struct multipath *m = ti->private;
   1315
   1316	disable_nopath_timeout(m);
   1317	flush_multipath_work(m);
   1318	free_multipath(m);
   1319}
   1320
   1321/*
   1322 * Take a path out of use.
   1323 */
   1324static int fail_path(struct pgpath *pgpath)
   1325{
   1326	unsigned long flags;
   1327	struct multipath *m = pgpath->pg->m;
   1328
   1329	spin_lock_irqsave(&m->lock, flags);
   1330
   1331	if (!pgpath->is_active)
   1332		goto out;
   1333
   1334	DMWARN("%s: Failing path %s.",
   1335	       dm_table_device_name(m->ti->table),
   1336	       pgpath->path.dev->name);
   1337
   1338	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
   1339	pgpath->is_active = false;
   1340	pgpath->fail_count++;
   1341
   1342	atomic_dec(&m->nr_valid_paths);
   1343
   1344	if (pgpath == m->current_pgpath)
   1345		m->current_pgpath = NULL;
   1346
   1347	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
   1348		       pgpath->path.dev->name, atomic_read(&m->nr_valid_paths));
   1349
   1350	schedule_work(&m->trigger_event);
   1351
   1352	enable_nopath_timeout(m);
   1353
   1354out:
   1355	spin_unlock_irqrestore(&m->lock, flags);
   1356
   1357	return 0;
   1358}
   1359
   1360/*
   1361 * Reinstate a previously-failed path
   1362 */
   1363static int reinstate_path(struct pgpath *pgpath)
   1364{
   1365	int r = 0, run_queue = 0;
   1366	unsigned long flags;
   1367	struct multipath *m = pgpath->pg->m;
   1368	unsigned nr_valid_paths;
   1369
   1370	spin_lock_irqsave(&m->lock, flags);
   1371
   1372	if (pgpath->is_active)
   1373		goto out;
   1374
   1375	DMWARN("%s: Reinstating path %s.",
   1376	       dm_table_device_name(m->ti->table),
   1377	       pgpath->path.dev->name);
   1378
   1379	r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
   1380	if (r)
   1381		goto out;
   1382
   1383	pgpath->is_active = true;
   1384
   1385	nr_valid_paths = atomic_inc_return(&m->nr_valid_paths);
   1386	if (nr_valid_paths == 1) {
   1387		m->current_pgpath = NULL;
   1388		run_queue = 1;
   1389	} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
   1390		if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
   1391			atomic_inc(&m->pg_init_in_progress);
   1392	}
   1393
   1394	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
   1395		       pgpath->path.dev->name, nr_valid_paths);
   1396
   1397	schedule_work(&m->trigger_event);
   1398
   1399out:
   1400	spin_unlock_irqrestore(&m->lock, flags);
   1401	if (run_queue) {
   1402		dm_table_run_md_queue_async(m->ti->table);
   1403		process_queued_io_list(m);
   1404	}
   1405
   1406	if (pgpath->is_active)
   1407		disable_nopath_timeout(m);
   1408
   1409	return r;
   1410}
   1411
   1412/*
   1413 * Fail or reinstate all paths that match the provided struct dm_dev.
   1414 */
   1415static int action_dev(struct multipath *m, struct dm_dev *dev,
   1416		      action_fn action)
   1417{
   1418	int r = -EINVAL;
   1419	struct pgpath *pgpath;
   1420	struct priority_group *pg;
   1421
   1422	list_for_each_entry(pg, &m->priority_groups, list) {
   1423		list_for_each_entry(pgpath, &pg->pgpaths, list) {
   1424			if (pgpath->path.dev == dev)
   1425				r = action(pgpath);
   1426		}
   1427	}
   1428
   1429	return r;
   1430}
   1431
   1432/*
   1433 * Temporarily try to avoid having to use the specified PG
   1434 */
   1435static void bypass_pg(struct multipath *m, struct priority_group *pg,
   1436		      bool bypassed)
   1437{
   1438	unsigned long flags;
   1439
   1440	spin_lock_irqsave(&m->lock, flags);
   1441
   1442	pg->bypassed = bypassed;
   1443	m->current_pgpath = NULL;
   1444	m->current_pg = NULL;
   1445
   1446	spin_unlock_irqrestore(&m->lock, flags);
   1447
   1448	schedule_work(&m->trigger_event);
   1449}
   1450
   1451/*
   1452 * Switch to using the specified PG from the next I/O that gets mapped
   1453 */
   1454static int switch_pg_num(struct multipath *m, const char *pgstr)
   1455{
   1456	struct priority_group *pg;
   1457	unsigned pgnum;
   1458	unsigned long flags;
   1459	char dummy;
   1460
   1461	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
   1462	    !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
   1463		DMWARN("invalid PG number supplied to switch_pg_num");
   1464		return -EINVAL;
   1465	}
   1466
   1467	spin_lock_irqsave(&m->lock, flags);
   1468	list_for_each_entry(pg, &m->priority_groups, list) {
   1469		pg->bypassed = false;
   1470		if (--pgnum)
   1471			continue;
   1472
   1473		m->current_pgpath = NULL;
   1474		m->current_pg = NULL;
   1475		m->next_pg = pg;
   1476	}
   1477	spin_unlock_irqrestore(&m->lock, flags);
   1478
   1479	schedule_work(&m->trigger_event);
   1480	return 0;
   1481}
   1482
   1483/*
   1484 * Set/clear bypassed status of a PG.
   1485 * PGs are numbered upwards from 1 in the order they were declared.
   1486 */
   1487static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
   1488{
   1489	struct priority_group *pg;
   1490	unsigned pgnum;
   1491	char dummy;
   1492
   1493	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
   1494	    !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
   1495		DMWARN("invalid PG number supplied to bypass_pg");
   1496		return -EINVAL;
   1497	}
   1498
   1499	list_for_each_entry(pg, &m->priority_groups, list) {
   1500		if (!--pgnum)
   1501			break;
   1502	}
   1503
   1504	bypass_pg(m, pg, bypassed);
   1505	return 0;
   1506}
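/*
 * switch_pg_num() and bypass_pg_num() are driven from the target message
 * interface (multipath_message(), further down in this file), e.g.:
 *
 *   dmsetup message <mpath-dev> 0 switch_group 2
 *   dmsetup message <mpath-dev> 0 disable_group 1
 *   dmsetup message <mpath-dev> 0 enable_group 1
 */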
   1507
   1508/*
   1509 * Should we retry pg_init immediately?
   1510 */
   1511static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
   1512{
   1513	unsigned long flags;
   1514	bool limit_reached = false;
   1515
   1516	spin_lock_irqsave(&m->lock, flags);
   1517
   1518	if (atomic_read(&m->pg_init_count) <= m->pg_init_retries &&
   1519	    !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
   1520		set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
   1521	else
   1522		limit_reached = true;
   1523
   1524	spin_unlock_irqrestore(&m->lock, flags);
   1525
   1526	return limit_reached;
   1527}
   1528
   1529static void pg_init_done(void *data, int errors)
   1530{
   1531	struct pgpath *pgpath = data;
   1532	struct priority_group *pg = pgpath->pg;
   1533	struct multipath *m = pg->m;
   1534	unsigned long flags;
   1535	bool delay_retry = false;
   1536
   1537	/* device or driver problems */
   1538	switch (errors) {
   1539	case SCSI_DH_OK:
   1540		break;
   1541	case SCSI_DH_NOSYS:
   1542		if (!m->hw_handler_name) {
   1543			errors = 0;
   1544			break;
   1545		}
   1546		DMERR("Could not failover the device: Handler scsi_dh_%s "
   1547		      "Error %d.", m->hw_handler_name, errors);
   1548		/*
   1549		 * Fail path for now, so we do not ping pong
   1550		 */
   1551		fail_path(pgpath);
   1552		break;
   1553	case SCSI_DH_DEV_TEMP_BUSY:
   1554		/*
   1555		 * Probably doing something like FW upgrade on the
   1556		 * controller so try the other pg.
   1557		 */
   1558		bypass_pg(m, pg, true);
   1559		break;
   1560	case SCSI_DH_RETRY:
   1561		/* Wait before retrying. */
   1562		delay_retry = true;
   1563		fallthrough;
   1564	case SCSI_DH_IMM_RETRY:
   1565	case SCSI_DH_RES_TEMP_UNAVAIL:
   1566		if (pg_init_limit_reached(m, pgpath))
   1567			fail_path(pgpath);
   1568		errors = 0;
   1569		break;
   1570	case SCSI_DH_DEV_OFFLINED:
   1571	default:
   1572		/*
   1573		 * We probably do not want to fail the path for a device
   1574		 * error, but this is what the old dm did. In future
   1575		 * patches we can do more advanced handling.
   1576		 */
   1577		fail_path(pgpath);
   1578	}
   1579
   1580	spin_lock_irqsave(&m->lock, flags);
   1581	if (errors) {
   1582		if (pgpath == m->current_pgpath) {
   1583			DMERR("Could not failover device. Error %d.", errors);
   1584			m->current_pgpath = NULL;
   1585			m->current_pg = NULL;
   1586		}
   1587	} else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
   1588		pg->bypassed = false;
   1589
   1590	if (atomic_dec_return(&m->pg_init_in_progress) > 0)
    1591		/* Activations of other paths are still ongoing */
   1592		goto out;
   1593
   1594	if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
   1595		if (delay_retry)
   1596			set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
   1597		else
   1598			clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
   1599
   1600		if (__pg_init_all_paths(m))
   1601			goto out;
   1602	}
   1603	clear_bit(MPATHF_QUEUE_IO, &m->flags);
   1604
   1605	process_queued_io_list(m);
   1606
   1607	/*
   1608	 * Wake up any thread waiting to suspend.
   1609	 */
   1610	wake_up(&m->pg_init_wait);
   1611
   1612out:
   1613	spin_unlock_irqrestore(&m->lock, flags);
   1614}
   1615
   1616static void activate_or_offline_path(struct pgpath *pgpath)
   1617{
   1618	struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
   1619
   1620	if (pgpath->is_active && !blk_queue_dying(q))
   1621		scsi_dh_activate(q, pg_init_done, pgpath);
   1622	else
   1623		pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
   1624}
   1625
   1626static void activate_path_work(struct work_struct *work)
   1627{
   1628	struct pgpath *pgpath =
   1629		container_of(work, struct pgpath, activate_path.work);
   1630
   1631	activate_or_offline_path(pgpath);
   1632}
   1633
   1634static int multipath_end_io(struct dm_target *ti, struct request *clone,
   1635			    blk_status_t error, union map_info *map_context)
   1636{
   1637	struct dm_mpath_io *mpio = get_mpio(map_context);
   1638	struct pgpath *pgpath = mpio->pgpath;
   1639	int r = DM_ENDIO_DONE;
   1640
   1641	/*
   1642	 * We don't queue any clone request inside the multipath target
   1643	 * during end I/O handling, since those clone requests don't have
   1644	 * bio clones.  If we queue them inside the multipath target,
   1645	 * we need to make bio clones, that requires memory allocation.
   1646	 * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests
   1647	 *  don't have bio clones.)
   1648	 * Instead of queueing the clone request here, we queue the original
   1649	 * request into dm core, which will remake a clone request and
   1650	 * clone bios for it and resubmit it later.
   1651	 */
   1652	if (error && blk_path_error(error)) {
   1653		struct multipath *m = ti->private;
   1654
   1655		if (error == BLK_STS_RESOURCE)
   1656			r = DM_ENDIO_DELAY_REQUEUE;
   1657		else
   1658			r = DM_ENDIO_REQUEUE;
   1659
   1660		if (pgpath)
   1661			fail_path(pgpath);
   1662
   1663		if (!atomic_read(&m->nr_valid_paths) &&
   1664		    !must_push_back_rq(m)) {
   1665			if (error == BLK_STS_IOERR)
   1666				dm_report_EIO(m);
   1667			/* complete with the original error */
   1668			r = DM_ENDIO_DONE;
   1669		}
   1670	}
   1671
   1672	if (pgpath) {
   1673		struct path_selector *ps = &pgpath->pg->ps;
   1674
   1675		if (ps->type->end_io)
   1676			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
   1677					 clone->io_start_time_ns);
   1678	}
   1679
   1680	return r;
   1681}
   1682
   1683static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
   1684				blk_status_t *error)
   1685{
   1686	struct multipath *m = ti->private;
   1687	struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
   1688	struct pgpath *pgpath = mpio->pgpath;
   1689	unsigned long flags;
   1690	int r = DM_ENDIO_DONE;
   1691
   1692	if (!*error || !blk_path_error(*error))
   1693		goto done;
   1694
   1695	if (pgpath)
   1696		fail_path(pgpath);
   1697
   1698	if (!atomic_read(&m->nr_valid_paths)) {
   1699		spin_lock_irqsave(&m->lock, flags);
   1700		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
   1701			if (__must_push_back(m)) {
   1702				r = DM_ENDIO_REQUEUE;
   1703			} else {
   1704				dm_report_EIO(m);
   1705				*error = BLK_STS_IOERR;
   1706			}
   1707			spin_unlock_irqrestore(&m->lock, flags);
   1708			goto done;
   1709		}
   1710		spin_unlock_irqrestore(&m->lock, flags);
   1711	}
   1712
   1713	multipath_queue_bio(m, clone);
   1714	r = DM_ENDIO_INCOMPLETE;
   1715done:
   1716	if (pgpath) {
   1717		struct path_selector *ps = &pgpath->pg->ps;
   1718
   1719		if (ps->type->end_io)
   1720			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
   1721					 (mpio->start_time_ns ?:
   1722					  dm_start_time_ns_from_clone(clone)));
   1723	}
   1724
   1725	return r;
   1726}
   1727
   1728/*
   1729 * Suspend with flush can't complete until all the I/O is processed
   1730 * so if the last path fails we must error any remaining I/O.
   1731 * - Note that if the freeze_bdev fails while suspending, the
   1732 *   queue_if_no_path state is lost - userspace should reset it.
   1733 * Otherwise, during noflush suspend, queue_if_no_path will not change.
   1734 */
   1735static void multipath_presuspend(struct dm_target *ti)
   1736{
   1737	struct multipath *m = ti->private;
   1738
   1739	/* FIXME: bio-based shouldn't need to always disable queue_if_no_path */
   1740	if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti))
   1741		queue_if_no_path(m, false, true, __func__);
   1742}
   1743
   1744static void multipath_postsuspend(struct dm_target *ti)
   1745{
   1746	struct multipath *m = ti->private;
   1747
   1748	mutex_lock(&m->work_mutex);
   1749	flush_multipath_work(m);
   1750	mutex_unlock(&m->work_mutex);
   1751}
   1752
   1753/*
   1754 * Restore the queue_if_no_path setting.
   1755 */
   1756static void multipath_resume(struct dm_target *ti)
   1757{
   1758	struct multipath *m = ti->private;
   1759	unsigned long flags;
   1760
   1761	spin_lock_irqsave(&m->lock, flags);
   1762	if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) {
   1763		set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
   1764		clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
   1765	}
   1766
   1767	DMDEBUG("%s: %s finished; QIFNP = %d; SQIFNP = %d",
   1768		dm_table_device_name(m->ti->table), __func__,
   1769		test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags),
   1770		test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags));
   1771
   1772	spin_unlock_irqrestore(&m->lock, flags);
   1773}
   1774
   1775/*
   1776 * Info output has the following format:
   1777 * num_multipath_feature_args [multipath_feature_args]*
   1778 * num_handler_status_args [handler_status_args]*
   1779 * num_groups init_group_number
   1780 *            [A|D|E num_ps_status_args [ps_status_args]*
   1781 *             num_paths num_selector_args
   1782 *             [path_dev A|F fail_count [selector_args]* ]+ ]+
   1783 *
   1784 * Table output has the following format (identical to the constructor string):
   1785 * num_feature_args [features_args]*
   1786 * num_handler_args hw_handler [hw_handler_args]*
   1787 * num_groups init_group_number
   1788 *     [priority selector-name num_ps_args [ps_args]*
   1789 *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
   1790 */
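       /*
        * A hypothetical example, for illustration only (the path device and
        * repeat count below are placeholders): a single-path map using the
        * round-robin selector, with no features and no hardware handler,
        * would produce a table line along the lines of
        *
        *   0 0 1 1 round-robin 0 1 1 8:16 1
        *
        * i.e. 0 feature args, 0 hw handler args, 1 priority group, initial
        * group 1, selector "round-robin" with 0 selector args, 1 path with
        * 1 selector arg per path, path 8:16 with a repeat count of 1.
        */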
   1791static void multipath_status(struct dm_target *ti, status_type_t type,
   1792			     unsigned status_flags, char *result, unsigned maxlen)
   1793{
   1794	int sz = 0, pg_counter, pgpath_counter;
   1795	unsigned long flags;
   1796	struct multipath *m = ti->private;
   1797	struct priority_group *pg;
   1798	struct pgpath *p;
   1799	unsigned pg_num;
   1800	char state;
   1801
   1802	spin_lock_irqsave(&m->lock, flags);
   1803
   1804	/* Features */
   1805	if (type == STATUSTYPE_INFO)
   1806		DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags),
   1807		       atomic_read(&m->pg_init_count));
   1808	else {
   1809		DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
   1810			      (m->pg_init_retries > 0) * 2 +
   1811			      (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
   1812			      test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) +
   1813			      (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2);
   1814
   1815		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
   1816			DMEMIT("queue_if_no_path ");
   1817		if (m->pg_init_retries)
   1818			DMEMIT("pg_init_retries %u ", m->pg_init_retries);
   1819		if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
   1820			DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
   1821		if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
   1822			DMEMIT("retain_attached_hw_handler ");
   1823		if (m->queue_mode != DM_TYPE_REQUEST_BASED) {
   1824			switch (m->queue_mode) {
   1825			case DM_TYPE_BIO_BASED:
   1826				DMEMIT("queue_mode bio ");
   1827				break;
   1828			default:
   1829				WARN_ON_ONCE(true);
   1830				break;
   1831			}
   1832		}
   1833	}
   1834
   1835	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
   1836		DMEMIT("0 ");
   1837	else
   1838		DMEMIT("1 %s ", m->hw_handler_name);
   1839
   1840	DMEMIT("%u ", m->nr_priority_groups);
   1841
   1842	if (m->next_pg)
   1843		pg_num = m->next_pg->pg_num;
   1844	else if (m->current_pg)
   1845		pg_num = m->current_pg->pg_num;
   1846	else
   1847		pg_num = (m->nr_priority_groups ? 1 : 0);
   1848
   1849	DMEMIT("%u ", pg_num);
   1850
   1851	switch (type) {
   1852	case STATUSTYPE_INFO:
   1853		list_for_each_entry(pg, &m->priority_groups, list) {
   1854			if (pg->bypassed)
   1855				state = 'D';	/* Disabled */
   1856			else if (pg == m->current_pg)
   1857				state = 'A';	/* Currently Active */
   1858			else
   1859				state = 'E';	/* Enabled */
   1860
   1861			DMEMIT("%c ", state);
   1862
   1863			if (pg->ps.type->status)
   1864				sz += pg->ps.type->status(&pg->ps, NULL, type,
   1865							  result + sz,
   1866							  maxlen - sz);
   1867			else
   1868				DMEMIT("0 ");
   1869
   1870			DMEMIT("%u %u ", pg->nr_pgpaths,
   1871			       pg->ps.type->info_args);
   1872
   1873			list_for_each_entry(p, &pg->pgpaths, list) {
   1874				DMEMIT("%s %s %u ", p->path.dev->name,
   1875				       p->is_active ? "A" : "F",
   1876				       p->fail_count);
   1877				if (pg->ps.type->status)
   1878					sz += pg->ps.type->status(&pg->ps,
   1879					      &p->path, type, result + sz,
   1880					      maxlen - sz);
   1881			}
   1882		}
   1883		break;
   1884
   1885	case STATUSTYPE_TABLE:
   1886		list_for_each_entry(pg, &m->priority_groups, list) {
   1887			DMEMIT("%s ", pg->ps.type->name);
   1888
   1889			if (pg->ps.type->status)
   1890				sz += pg->ps.type->status(&pg->ps, NULL, type,
   1891							  result + sz,
   1892							  maxlen - sz);
   1893			else
   1894				DMEMIT("0 ");
   1895
   1896			DMEMIT("%u %u ", pg->nr_pgpaths,
   1897			       pg->ps.type->table_args);
   1898
   1899			list_for_each_entry(p, &pg->pgpaths, list) {
   1900				DMEMIT("%s ", p->path.dev->name);
   1901				if (pg->ps.type->status)
   1902					sz += pg->ps.type->status(&pg->ps,
   1903					      &p->path, type, result + sz,
   1904					      maxlen - sz);
   1905			}
   1906		}
   1907		break;
   1908
   1909	case STATUSTYPE_IMA:
   1910		sz = 0; /* reset the result pointer */
   1911
   1912		DMEMIT_TARGET_NAME_VERSION(ti->type);
   1913		DMEMIT(",nr_priority_groups=%u", m->nr_priority_groups);
   1914
   1915		pg_counter = 0;
   1916		list_for_each_entry(pg, &m->priority_groups, list) {
   1917			if (pg->bypassed)
   1918				state = 'D';	/* Disabled */
   1919			else if (pg == m->current_pg)
   1920				state = 'A';	/* Currently Active */
   1921			else
   1922				state = 'E';	/* Enabled */
   1923			DMEMIT(",pg_state_%d=%c", pg_counter, state);
   1924			DMEMIT(",nr_pgpaths_%d=%u", pg_counter, pg->nr_pgpaths);
   1925			DMEMIT(",path_selector_name_%d=%s", pg_counter, pg->ps.type->name);
   1926
   1927			pgpath_counter = 0;
   1928			list_for_each_entry(p, &pg->pgpaths, list) {
   1929				DMEMIT(",path_name_%d_%d=%s,is_active_%d_%d=%c,fail_count_%d_%d=%u",
   1930				       pg_counter, pgpath_counter, p->path.dev->name,
   1931				       pg_counter, pgpath_counter, p->is_active ? 'A' : 'F',
   1932				       pg_counter, pgpath_counter, p->fail_count);
   1933				if (pg->ps.type->status) {
   1934					DMEMIT(",path_selector_status_%d_%d=",
   1935					       pg_counter, pgpath_counter);
   1936					sz += pg->ps.type->status(&pg->ps, &p->path,
   1937								  type, result + sz,
   1938								  maxlen - sz);
   1939				}
   1940				pgpath_counter++;
   1941			}
   1942			pg_counter++;
   1943		}
   1944		DMEMIT(";");
   1945		break;
   1946	}
   1947
   1948	spin_unlock_irqrestore(&m->lock, flags);
   1949}
   1950
   1951static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
   1952			     char *result, unsigned maxlen)
   1953{
   1954	int r = -EINVAL;
   1955	struct dm_dev *dev;
   1956	struct multipath *m = ti->private;
   1957	action_fn action;
   1958	unsigned long flags;
   1959
   1960	mutex_lock(&m->work_mutex);
   1961
   1962	if (dm_suspended(ti)) {
   1963		r = -EBUSY;
   1964		goto out;
   1965	}
   1966
   1967	if (argc == 1) {
   1968		if (!strcasecmp(argv[0], "queue_if_no_path")) {
   1969			r = queue_if_no_path(m, true, false, __func__);
   1970			spin_lock_irqsave(&m->lock, flags);
   1971			enable_nopath_timeout(m);
   1972			spin_unlock_irqrestore(&m->lock, flags);
   1973			goto out;
   1974		} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
   1975			r = queue_if_no_path(m, false, false, __func__);
   1976			disable_nopath_timeout(m);
   1977			goto out;
   1978		}
   1979	}
   1980
   1981	if (argc != 2) {
   1982		DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
   1983		goto out;
   1984	}
   1985
   1986	if (!strcasecmp(argv[0], "disable_group")) {
   1987		r = bypass_pg_num(m, argv[1], true);
   1988		goto out;
   1989	} else if (!strcasecmp(argv[0], "enable_group")) {
   1990		r = bypass_pg_num(m, argv[1], false);
   1991		goto out;
   1992	} else if (!strcasecmp(argv[0], "switch_group")) {
   1993		r = switch_pg_num(m, argv[1]);
   1994		goto out;
   1995	} else if (!strcasecmp(argv[0], "reinstate_path"))
   1996		action = reinstate_path;
   1997	else if (!strcasecmp(argv[0], "fail_path"))
   1998		action = fail_path;
   1999	else {
   2000		DMWARN("Unrecognised multipath message received: %s", argv[0]);
   2001		goto out;
   2002	}
   2003
   2004	r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
   2005	if (r) {
   2006		DMWARN("message: error getting device %s",
   2007		       argv[1]);
   2008		goto out;
   2009	}
   2010
   2011	r = action_dev(m, dev, action);
   2012
   2013	dm_put_device(ti, dev);
   2014
   2015out:
   2016	mutex_unlock(&m->work_mutex);
   2017	return r;
   2018}
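
       /*
        * Illustrative userspace usage of the messages parsed above (the
        * device name, group number and path device are placeholders):
        *
        *   dmsetup message <mpath-device> 0 queue_if_no_path
        *   dmsetup message <mpath-device> 0 fail_if_no_path
        *   dmsetup message <mpath-device> 0 "switch_group 2"
        *   dmsetup message <mpath-device> 0 "fail_path 8:32"
        *   dmsetup message <mpath-device> 0 "reinstate_path 8:32"
        */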
   2019
   2020static int multipath_prepare_ioctl(struct dm_target *ti,
   2021				   struct block_device **bdev)
   2022{
   2023	struct multipath *m = ti->private;
   2024	struct pgpath *pgpath;
   2025	unsigned long flags;
   2026	int r;
   2027
   2028	pgpath = READ_ONCE(m->current_pgpath);
   2029	if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
   2030		pgpath = choose_pgpath(m, 0);
   2031
   2032	if (pgpath) {
   2033		if (!mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) {
   2034			*bdev = pgpath->path.dev->bdev;
   2035			r = 0;
   2036		} else {
   2037			/* pg_init has not started or completed */
   2038			r = -ENOTCONN;
   2039		}
   2040	} else {
   2041		/* No path is available */
   2042		r = -EIO;
   2043		spin_lock_irqsave(&m->lock, flags);
   2044		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
   2045			r = -ENOTCONN;
   2046		spin_unlock_irqrestore(&m->lock, flags);
   2047	}
   2048
   2049	if (r == -ENOTCONN) {
   2050		if (!READ_ONCE(m->current_pg)) {
   2051			/* Path status changed, redo selection */
   2052			(void) choose_pgpath(m, 0);
   2053		}
   2054		spin_lock_irqsave(&m->lock, flags);
   2055		if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
   2056			(void) __pg_init_all_paths(m);
   2057		spin_unlock_irqrestore(&m->lock, flags);
   2058		dm_table_run_md_queue_async(m->ti->table);
   2059		process_queued_io_list(m);
   2060	}
   2061
   2062	/*
   2063	 * Only pass ioctls through if the device sizes match exactly.
   2064	 */
   2065	if (!r && ti->len != bdev_nr_sectors((*bdev)))
   2066		return 1;
   2067	return r;
   2068}
   2069
   2070static int multipath_iterate_devices(struct dm_target *ti,
   2071				     iterate_devices_callout_fn fn, void *data)
   2072{
   2073	struct multipath *m = ti->private;
   2074	struct priority_group *pg;
   2075	struct pgpath *p;
   2076	int ret = 0;
   2077
   2078	list_for_each_entry(pg, &m->priority_groups, list) {
   2079		list_for_each_entry(p, &pg->pgpaths, list) {
   2080			ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
   2081			if (ret)
   2082				goto out;
   2083		}
   2084	}
   2085
   2086out:
   2087	return ret;
   2088}
   2089
   2090static int pgpath_busy(struct pgpath *pgpath)
   2091{
   2092	struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
   2093
   2094	return blk_lld_busy(q);
   2095}
   2096
   2097/*
   2098 * We return "busy" only when we can map I/Os but the underlying devices
   2099 * are busy (so even if we mapped I/Os now, they would just wait on
   2100 * the underlying queue).
   2101 * In other words, if we want to kill I/Os or queue them inside us
   2102 * because the map is unavailable, we don't return "busy"; otherwise,
   2103 * dm core won't give us the I/Os and we can't do what we want.
   2104 */
   2105static int multipath_busy(struct dm_target *ti)
   2106{
   2107	bool busy = false, has_active = false;
   2108	struct multipath *m = ti->private;
   2109	struct priority_group *pg, *next_pg;
   2110	struct pgpath *pgpath;
   2111
   2112	/* pg_init in progress */
   2113	if (atomic_read(&m->pg_init_in_progress))
   2114		return true;
   2115
   2116	/* No paths available; for blk-mq, rely on IO mapping to delay requeue */
   2117	if (!atomic_read(&m->nr_valid_paths)) {
   2118		unsigned long flags;
   2119		spin_lock_irqsave(&m->lock, flags);
   2120		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
   2121			spin_unlock_irqrestore(&m->lock, flags);
   2122			return (m->queue_mode != DM_TYPE_REQUEST_BASED);
   2123		}
   2124		spin_unlock_irqrestore(&m->lock, flags);
   2125	}
   2126
   2127	/* Guess which priority_group will be used at next mapping time */
   2128	pg = READ_ONCE(m->current_pg);
   2129	next_pg = READ_ONCE(m->next_pg);
   2130	if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg))
   2131		pg = next_pg;
   2132
   2133	if (!pg) {
   2134		/*
   2135		 * We don't know which pg will be used at next mapping time.
   2136		 * We don't call choose_pgpath() here to avoid triggering
   2137		 * pg_init just by doing a busy check.
   2138		 * So we don't know whether the underlying devices we will be
   2139		 * using at next mapping time are busy or not. Just try mapping.
   2140		 */
   2141		return busy;
   2142	}
   2143
   2144	/*
   2145	 * If there is at least one non-busy active path, the path selector
   2146	 * will be able to select it, so we consider such a pg as not busy.
   2147	 */
   2148	busy = true;
   2149	list_for_each_entry(pgpath, &pg->pgpaths, list) {
   2150		if (pgpath->is_active) {
   2151			has_active = true;
   2152			if (!pgpath_busy(pgpath)) {
   2153				busy = false;
   2154				break;
   2155			}
   2156		}
   2157	}
   2158
   2159	if (!has_active) {
   2160		/*
   2161		 * No active path in this pg, so this pg won't be used and
   2162		 * the current_pg will be changed at next mapping time.
   2163		 * We need to try mapping to determine it.
   2164		 */
   2165		busy = false;
   2166	}
   2167
   2168	return busy;
   2169}
   2170
   2171/*-----------------------------------------------------------------
   2172 * Module setup
   2173 *---------------------------------------------------------------*/
   2174static struct target_type multipath_target = {
   2175	.name = "multipath",
   2176	.version = {1, 14, 0},
   2177	.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE |
   2178		    DM_TARGET_PASSES_INTEGRITY,
   2179	.module = THIS_MODULE,
   2180	.ctr = multipath_ctr,
   2181	.dtr = multipath_dtr,
   2182	.clone_and_map_rq = multipath_clone_and_map,
   2183	.release_clone_rq = multipath_release_clone,
   2184	.rq_end_io = multipath_end_io,
   2185	.map = multipath_map_bio,
   2186	.end_io = multipath_end_io_bio,
   2187	.presuspend = multipath_presuspend,
   2188	.postsuspend = multipath_postsuspend,
   2189	.resume = multipath_resume,
   2190	.status = multipath_status,
   2191	.message = multipath_message,
   2192	.prepare_ioctl = multipath_prepare_ioctl,
   2193	.iterate_devices = multipath_iterate_devices,
   2194	.busy = multipath_busy,
   2195};
   2196
   2197static int __init dm_multipath_init(void)
   2198{
   2199	int r;
   2200
   2201	kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
   2202	if (!kmultipathd) {
   2203		DMERR("failed to create workqueue kmpathd");
   2204		r = -ENOMEM;
   2205		goto bad_alloc_kmultipathd;
   2206	}
   2207
   2208	/*
   2209	 * A separate workqueue is used to run the device handlers,
   2210	 * to avoid overloading the existing workqueue. Overloading the
   2211	 * old workqueue would also create a bottleneck in the path of
   2212	 * storage hardware device activation.
   2213	 */
   2214	kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
   2215						  WQ_MEM_RECLAIM);
   2216	if (!kmpath_handlerd) {
   2217		DMERR("failed to create workqueue kmpath_handlerd");
   2218		r = -ENOMEM;
   2219		goto bad_alloc_kmpath_handlerd;
   2220	}
   2221
   2222	r = dm_register_target(&multipath_target);
   2223	if (r < 0) {
   2224		DMERR("request-based register failed %d", r);
   2225		r = -EINVAL;
   2226		goto bad_register_target;
   2227	}
   2228
   2229	return 0;
   2230
   2231bad_register_target:
   2232	destroy_workqueue(kmpath_handlerd);
   2233bad_alloc_kmpath_handlerd:
   2234	destroy_workqueue(kmultipathd);
   2235bad_alloc_kmultipathd:
   2236	return r;
   2237}
   2238
   2239static void __exit dm_multipath_exit(void)
   2240{
   2241	destroy_workqueue(kmpath_handlerd);
   2242	destroy_workqueue(kmultipathd);
   2243
   2244	dm_unregister_target(&multipath_target);
   2245}
   2246
   2247module_init(dm_multipath_init);
   2248module_exit(dm_multipath_exit);
   2249
   2250module_param_named(queue_if_no_path_timeout_secs,
   2251		   queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR);
   2252MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds");
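
       /*
        * Illustrative example (the sysfs path assumes the parameter is
        * exposed under the dm_multipath module directory): with
        * S_IRUGO | S_IWUSR the timeout can typically be read by anyone and
        * updated at runtime by root, e.g.
        *
        *   echo 120 > /sys/module/dm_multipath/parameters/queue_if_no_path_timeout_secs
        */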
   2253
   2254MODULE_DESCRIPTION(DM_NAME " multipath target");
   2255MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
   2256MODULE_LICENSE("GPL");