dm-switch.c (15714B)
1/* 2 * Copyright (C) 2010-2012 by Dell Inc. All rights reserved. 3 * Copyright (C) 2011-2013 Red Hat, Inc. 4 * 5 * This file is released under the GPL. 6 * 7 * dm-switch is a device-mapper target that maps IO to underlying block 8 * devices efficiently when there are a large number of fixed-sized 9 * address regions but there is no simple pattern to allow for a compact 10 * mapping representation such as dm-stripe. 11 */ 12 13#include <linux/device-mapper.h> 14 15#include <linux/module.h> 16#include <linux/init.h> 17#include <linux/vmalloc.h> 18 19#define DM_MSG_PREFIX "switch" 20 21/* 22 * One region_table_slot_t holds <region_entries_per_slot> region table 23 * entries each of which is <region_table_entry_bits> in size. 24 */ 25typedef unsigned long region_table_slot_t; 26 27/* 28 * A device with the offset to its start sector. 29 */ 30struct switch_path { 31 struct dm_dev *dmdev; 32 sector_t start; 33}; 34 35/* 36 * Context block for a dm switch device. 37 */ 38struct switch_ctx { 39 struct dm_target *ti; 40 41 unsigned nr_paths; /* Number of paths in path_list. */ 42 43 unsigned region_size; /* Region size in 512-byte sectors */ 44 unsigned long nr_regions; /* Number of regions making up the device */ 45 signed char region_size_bits; /* log2 of region_size or -1 */ 46 47 unsigned char region_table_entry_bits; /* Number of bits in one region table entry */ 48 unsigned char region_entries_per_slot; /* Number of entries in one region table slot */ 49 signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */ 50 51 region_table_slot_t *region_table; /* Region table */ 52 53 /* 54 * Array of dm devices to switch between. 55 */ 56 struct switch_path path_list[]; 57}; 58 59static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths, 60 unsigned region_size) 61{ 62 struct switch_ctx *sctx; 63 64 sctx = kzalloc(struct_size(sctx, path_list, nr_paths), GFP_KERNEL); 65 if (!sctx) 66 return NULL; 67 68 sctx->ti = ti; 69 sctx->region_size = region_size; 70 71 ti->private = sctx; 72 73 return sctx; 74} 75 76static int alloc_region_table(struct dm_target *ti, unsigned nr_paths) 77{ 78 struct switch_ctx *sctx = ti->private; 79 sector_t nr_regions = ti->len; 80 sector_t nr_slots; 81 82 if (!(sctx->region_size & (sctx->region_size - 1))) 83 sctx->region_size_bits = __ffs(sctx->region_size); 84 else 85 sctx->region_size_bits = -1; 86 87 sctx->region_table_entry_bits = 1; 88 while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 && 89 (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths) 90 sctx->region_table_entry_bits++; 91 92 sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits; 93 if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1))) 94 sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot); 95 else 96 sctx->region_entries_per_slot_bits = -1; 97 98 if (sector_div(nr_regions, sctx->region_size)) 99 nr_regions++; 100 101 if (nr_regions >= ULONG_MAX) { 102 ti->error = "Region table too large"; 103 return -EINVAL; 104 } 105 sctx->nr_regions = nr_regions; 106 107 nr_slots = nr_regions; 108 if (sector_div(nr_slots, sctx->region_entries_per_slot)) 109 nr_slots++; 110 111 if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) { 112 ti->error = "Region table too large"; 113 return -EINVAL; 114 } 115 116 sctx->region_table = vmalloc(array_size(nr_slots, 117 sizeof(region_table_slot_t))); 118 if (!sctx->region_table) { 119 ti->error = "Cannot allocate region table"; 120 return -ENOMEM; 121 } 122 123 return 0; 124} 125 126static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr, 127 unsigned long *region_index, unsigned *bit) 128{ 129 if (sctx->region_entries_per_slot_bits >= 0) { 130 *region_index = region_nr >> sctx->region_entries_per_slot_bits; 131 *bit = region_nr & (sctx->region_entries_per_slot - 1); 132 } else { 133 *region_index = region_nr / sctx->region_entries_per_slot; 134 *bit = region_nr % sctx->region_entries_per_slot; 135 } 136 137 *bit *= sctx->region_table_entry_bits; 138} 139 140static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr) 141{ 142 unsigned long region_index; 143 unsigned bit; 144 145 switch_get_position(sctx, region_nr, ®ion_index, &bit); 146 147 return (READ_ONCE(sctx->region_table[region_index]) >> bit) & 148 ((1 << sctx->region_table_entry_bits) - 1); 149} 150 151/* 152 * Find which path to use at given offset. 153 */ 154static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset) 155{ 156 unsigned path_nr; 157 sector_t p; 158 159 p = offset; 160 if (sctx->region_size_bits >= 0) 161 p >>= sctx->region_size_bits; 162 else 163 sector_div(p, sctx->region_size); 164 165 path_nr = switch_region_table_read(sctx, p); 166 167 /* This can only happen if the processor uses non-atomic stores. */ 168 if (unlikely(path_nr >= sctx->nr_paths)) 169 path_nr = 0; 170 171 return path_nr; 172} 173 174static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr, 175 unsigned value) 176{ 177 unsigned long region_index; 178 unsigned bit; 179 region_table_slot_t pte; 180 181 switch_get_position(sctx, region_nr, ®ion_index, &bit); 182 183 pte = sctx->region_table[region_index]; 184 pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit); 185 pte |= (region_table_slot_t)value << bit; 186 sctx->region_table[region_index] = pte; 187} 188 189/* 190 * Fill the region table with an initial round robin pattern. 191 */ 192static void initialise_region_table(struct switch_ctx *sctx) 193{ 194 unsigned path_nr = 0; 195 unsigned long region_nr; 196 197 for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) { 198 switch_region_table_write(sctx, region_nr, path_nr); 199 if (++path_nr >= sctx->nr_paths) 200 path_nr = 0; 201 } 202} 203 204static int parse_path(struct dm_arg_set *as, struct dm_target *ti) 205{ 206 struct switch_ctx *sctx = ti->private; 207 unsigned long long start; 208 int r; 209 210 r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), 211 &sctx->path_list[sctx->nr_paths].dmdev); 212 if (r) { 213 ti->error = "Device lookup failed"; 214 return r; 215 } 216 217 if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) { 218 ti->error = "Invalid device starting offset"; 219 dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev); 220 return -EINVAL; 221 } 222 223 sctx->path_list[sctx->nr_paths].start = start; 224 225 sctx->nr_paths++; 226 227 return 0; 228} 229 230/* 231 * Destructor: Don't free the dm_target, just the ti->private data (if any). 232 */ 233static void switch_dtr(struct dm_target *ti) 234{ 235 struct switch_ctx *sctx = ti->private; 236 237 while (sctx->nr_paths--) 238 dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev); 239 240 vfree(sctx->region_table); 241 kfree(sctx); 242} 243 244/* 245 * Constructor arguments: 246 * <num_paths> <region_size> <num_optional_args> [<optional_args>...] 247 * [<dev_path> <offset>]+ 248 * 249 * Optional args are to allow for future extension: currently this 250 * parameter must be 0. 251 */ 252static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv) 253{ 254 static const struct dm_arg _args[] = { 255 {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"}, 256 {1, UINT_MAX, "Invalid region size"}, 257 {0, 0, "Invalid number of optional args"}, 258 }; 259 260 struct switch_ctx *sctx; 261 struct dm_arg_set as; 262 unsigned nr_paths, region_size, nr_optional_args; 263 int r; 264 265 as.argc = argc; 266 as.argv = argv; 267 268 r = dm_read_arg(_args, &as, &nr_paths, &ti->error); 269 if (r) 270 return -EINVAL; 271 272 r = dm_read_arg(_args + 1, &as, ®ion_size, &ti->error); 273 if (r) 274 return r; 275 276 r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error); 277 if (r) 278 return r; 279 /* parse optional arguments here, if we add any */ 280 281 if (as.argc != nr_paths * 2) { 282 ti->error = "Incorrect number of path arguments"; 283 return -EINVAL; 284 } 285 286 sctx = alloc_switch_ctx(ti, nr_paths, region_size); 287 if (!sctx) { 288 ti->error = "Cannot allocate redirection context"; 289 return -ENOMEM; 290 } 291 292 r = dm_set_target_max_io_len(ti, region_size); 293 if (r) 294 goto error; 295 296 while (as.argc) { 297 r = parse_path(&as, ti); 298 if (r) 299 goto error; 300 } 301 302 r = alloc_region_table(ti, nr_paths); 303 if (r) 304 goto error; 305 306 initialise_region_table(sctx); 307 308 /* For UNMAP, sending the request down any path is sufficient */ 309 ti->num_discard_bios = 1; 310 311 return 0; 312 313error: 314 switch_dtr(ti); 315 316 return r; 317} 318 319static int switch_map(struct dm_target *ti, struct bio *bio) 320{ 321 struct switch_ctx *sctx = ti->private; 322 sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector); 323 unsigned path_nr = switch_get_path_nr(sctx, offset); 324 325 bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev); 326 bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset; 327 328 return DM_MAPIO_REMAPPED; 329} 330 331/* 332 * We need to parse hex numbers in the message as quickly as possible. 333 * 334 * This table-based hex parser improves performance. 335 * It improves a time to load 1000000 entries compared to the condition-based 336 * parser. 337 * table-based parser condition-based parser 338 * PA-RISC 0.29s 0.31s 339 * Opteron 0.0495s 0.0498s 340 */ 341static const unsigned char hex_table[256] = { 342255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 343255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 344255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 3450, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 346255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, 347255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 348255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, 349255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 350255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 351255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 352255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 353255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 354255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 355255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 356255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 357255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 358}; 359 360static __always_inline unsigned long parse_hex(const char **string) 361{ 362 unsigned char d; 363 unsigned long r = 0; 364 365 while ((d = hex_table[(unsigned char)**string]) < 16) { 366 r = (r << 4) | d; 367 (*string)++; 368 } 369 370 return r; 371} 372 373static int process_set_region_mappings(struct switch_ctx *sctx, 374 unsigned argc, char **argv) 375{ 376 unsigned i; 377 unsigned long region_index = 0; 378 379 for (i = 1; i < argc; i++) { 380 unsigned long path_nr; 381 const char *string = argv[i]; 382 383 if ((*string & 0xdf) == 'R') { 384 unsigned long cycle_length, num_write; 385 386 string++; 387 if (unlikely(*string == ',')) { 388 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); 389 return -EINVAL; 390 } 391 cycle_length = parse_hex(&string); 392 if (unlikely(*string != ',')) { 393 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); 394 return -EINVAL; 395 } 396 string++; 397 if (unlikely(!*string)) { 398 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); 399 return -EINVAL; 400 } 401 num_write = parse_hex(&string); 402 if (unlikely(*string)) { 403 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); 404 return -EINVAL; 405 } 406 407 if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) { 408 DMWARN("invalid set_region_mappings cycle length: %lu > %lu", 409 cycle_length - 1, region_index); 410 return -EINVAL; 411 } 412 if (unlikely(region_index + num_write < region_index) || 413 unlikely(region_index + num_write >= sctx->nr_regions)) { 414 DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu", 415 region_index, num_write, sctx->nr_regions); 416 return -EINVAL; 417 } 418 419 while (num_write--) { 420 region_index++; 421 path_nr = switch_region_table_read(sctx, region_index - cycle_length); 422 switch_region_table_write(sctx, region_index, path_nr); 423 } 424 425 continue; 426 } 427 428 if (*string == ':') 429 region_index++; 430 else { 431 region_index = parse_hex(&string); 432 if (unlikely(*string != ':')) { 433 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); 434 return -EINVAL; 435 } 436 } 437 438 string++; 439 if (unlikely(!*string)) { 440 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); 441 return -EINVAL; 442 } 443 444 path_nr = parse_hex(&string); 445 if (unlikely(*string)) { 446 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); 447 return -EINVAL; 448 } 449 if (unlikely(region_index >= sctx->nr_regions)) { 450 DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions); 451 return -EINVAL; 452 } 453 if (unlikely(path_nr >= sctx->nr_paths)) { 454 DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths); 455 return -EINVAL; 456 } 457 458 switch_region_table_write(sctx, region_index, path_nr); 459 } 460 461 return 0; 462} 463 464/* 465 * Messages are processed one-at-a-time. 466 * 467 * Only set_region_mappings is supported. 468 */ 469static int switch_message(struct dm_target *ti, unsigned argc, char **argv, 470 char *result, unsigned maxlen) 471{ 472 static DEFINE_MUTEX(message_mutex); 473 474 struct switch_ctx *sctx = ti->private; 475 int r = -EINVAL; 476 477 mutex_lock(&message_mutex); 478 479 if (!strcasecmp(argv[0], "set_region_mappings")) 480 r = process_set_region_mappings(sctx, argc, argv); 481 else 482 DMWARN("Unrecognised message received."); 483 484 mutex_unlock(&message_mutex); 485 486 return r; 487} 488 489static void switch_status(struct dm_target *ti, status_type_t type, 490 unsigned status_flags, char *result, unsigned maxlen) 491{ 492 struct switch_ctx *sctx = ti->private; 493 unsigned sz = 0; 494 int path_nr; 495 496 switch (type) { 497 case STATUSTYPE_INFO: 498 result[0] = '\0'; 499 break; 500 501 case STATUSTYPE_TABLE: 502 DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size); 503 for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) 504 DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name, 505 (unsigned long long)sctx->path_list[path_nr].start); 506 break; 507 508 case STATUSTYPE_IMA: 509 result[0] = '\0'; 510 break; 511 } 512} 513 514/* 515 * Switch ioctl: 516 * 517 * Passthrough all ioctls to the path for sector 0 518 */ 519static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) 520{ 521 struct switch_ctx *sctx = ti->private; 522 unsigned path_nr; 523 524 path_nr = switch_get_path_nr(sctx, 0); 525 526 *bdev = sctx->path_list[path_nr].dmdev->bdev; 527 528 /* 529 * Only pass ioctls through if the device sizes match exactly. 530 */ 531 if (ti->len + sctx->path_list[path_nr].start != 532 bdev_nr_sectors((*bdev))) 533 return 1; 534 return 0; 535} 536 537static int switch_iterate_devices(struct dm_target *ti, 538 iterate_devices_callout_fn fn, void *data) 539{ 540 struct switch_ctx *sctx = ti->private; 541 int path_nr; 542 int r; 543 544 for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) { 545 r = fn(ti, sctx->path_list[path_nr].dmdev, 546 sctx->path_list[path_nr].start, ti->len, data); 547 if (r) 548 return r; 549 } 550 551 return 0; 552} 553 554static struct target_type switch_target = { 555 .name = "switch", 556 .version = {1, 1, 0}, 557 .features = DM_TARGET_NOWAIT, 558 .module = THIS_MODULE, 559 .ctr = switch_ctr, 560 .dtr = switch_dtr, 561 .map = switch_map, 562 .message = switch_message, 563 .status = switch_status, 564 .prepare_ioctl = switch_prepare_ioctl, 565 .iterate_devices = switch_iterate_devices, 566}; 567 568static int __init dm_switch_init(void) 569{ 570 int r; 571 572 r = dm_register_target(&switch_target); 573 if (r < 0) 574 DMERR("dm_register_target() failed %d", r); 575 576 return r; 577} 578 579static void __exit dm_switch_exit(void) 580{ 581 dm_unregister_target(&switch_target); 582} 583 584module_init(dm_switch_init); 585module_exit(dm_switch_exit); 586 587MODULE_DESCRIPTION(DM_NAME " dynamic path switching target"); 588MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>"); 589MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>"); 590MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>"); 591MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>"); 592MODULE_LICENSE("GPL");