dm-ebs-target.c (12538B)
/*
 * Copyright (C) 2020 Red Hat GmbH
 *
 * This file is released under the GPL.
 *
 * Device-mapper target to emulate smaller logical block
 * size on backing devices exposing (natively) larger ones.
 *
 * E.g. 512 byte sector emulation on 4K native disks.
 */

#include "dm.h"
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/dm-bufio.h>

#define DM_MSG_PREFIX "ebs"

static void ebs_dtr(struct dm_target *ti);

/* Emulated block size context. */
struct ebs_c {
	struct dm_dev *dev;		/* Underlying device to emulate block size on. */
	struct dm_bufio_client *bufio;	/* Use dm-bufio for read and read-modify-write processing. */
	struct workqueue_struct *wq;	/* Workqueue for ^ processing of bios. */
	struct work_struct ws;		/* Work item used for ^. */
	struct bio_list bios_in;	/* Worker bios input list. */
	spinlock_t lock;		/* Guard bios input list above. */
	sector_t start;			/* <start> table line argument, see ebs_ctr below. */
	unsigned int e_bs;		/* Emulated block size in sectors exposed to upper layer. */
	unsigned int u_bs;		/* Underlying block size in sectors retrieved from/set on lower layer device. */
	unsigned char block_shift;	/* bitshift sectors -> blocks used in dm-bufio API. */
	bool u_bs_set:1;		/* Flag to indicate underlying block size is set on table line. */
};

static inline sector_t __sector_to_block(struct ebs_c *ec, sector_t sector)
{
	return sector >> ec->block_shift;
}

static inline sector_t __block_mod(sector_t sector, unsigned int bs)
{
	return sector & (bs - 1);
}

/* Return number of blocks for a bio, accounting for misalignment of start and end sectors. */
static inline unsigned int __nr_blocks(struct ebs_c *ec, struct bio *bio)
{
	sector_t end_sector = __block_mod(bio->bi_iter.bi_sector, ec->u_bs) + bio_sectors(bio);

	return __sector_to_block(ec, end_sector) + (__block_mod(end_sector, ec->u_bs) ? 1 : 0);
}
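
/*
 * Worked example of __nr_blocks() above (illustrative numbers, not fixed by
 * this target): assume u_bs = 8 sectors (4 KiB, block_shift = 3) and a bio
 * starting at sector 3 with bio_sectors() = 10, i.e. covering sectors 3..12.
 * Then end_sector = __block_mod(3, 8) + 10 = 13, __sector_to_block() gives
 * 13 >> 3 = 1, and __block_mod(13, 8) = 5 != 0 adds one more, so the result
 * is 2 blocks, matching the two underlying 4 KiB blocks (sectors 0..7 and
 * 8..15) the bio touches.
 */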

static inline bool __ebs_check_bs(unsigned int bs)
{
	return bs && is_power_of_2(bs);
}

/*
 * READ/WRITE:
 *
 * Copy data between bufio blocks and the bio vector's (partial/overlapping) pages.
 */
static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bvec_iter *iter)
{
	int r = 0;
	unsigned char *ba, *pa;
	unsigned int cur_len;
	unsigned int bv_len = bv->bv_len;
	unsigned int buf_off = to_bytes(__block_mod(iter->bi_sector, ec->u_bs));
	sector_t block = __sector_to_block(ec, iter->bi_sector);
	struct dm_buffer *b;

	if (unlikely(!bv->bv_page || !bv_len))
		return -EIO;

	pa = bvec_virt(bv);

	/* Handle overlapping page <-> blocks */
	while (bv_len) {
		cur_len = min(dm_bufio_get_block_size(ec->bufio) - buf_off, bv_len);

		/* Avoid reading for writes in case the bio vector's page overwrites the block completely. */
		if (rw == READ || buf_off || bv_len < dm_bufio_get_block_size(ec->bufio))
			ba = dm_bufio_read(ec->bufio, block, &b);
		else
			ba = dm_bufio_new(ec->bufio, block, &b);

		if (IS_ERR(ba)) {
			/*
			 * Carry on with next buffer, if any, to issue all possible
			 * data but return error.
			 */
			r = PTR_ERR(ba);
		} else {
			/* Copy data between bio page and buffer if the read/new above was successful. */
			ba += buf_off;
			if (rw == READ) {
				memcpy(pa, ba, cur_len);
				flush_dcache_page(bv->bv_page);
			} else {
				flush_dcache_page(bv->bv_page);
				memcpy(ba, pa, cur_len);
				dm_bufio_mark_partial_buffer_dirty(b, buf_off, buf_off + cur_len);
			}

			dm_bufio_release(b);
		}

		pa += cur_len;
		bv_len -= cur_len;
		buf_off = 0;
		block++;
	}

	return r;
}

/* READ/WRITE: iterate the bio's vectors, copying between (partial) pages and bufio blocks. */
static int __ebs_rw_bio(struct ebs_c *ec, int rw, struct bio *bio)
{
	int r = 0, rr;
	struct bio_vec bv;
	struct bvec_iter iter;

	bio_for_each_bvec(bv, bio, iter) {
		rr = __ebs_rw_bvec(ec, rw, &bv, &iter);
		if (rr)
			r = rr;
	}

	return r;
}

/*
 * Discard bio's blocks, i.e. pass discards down.
 *
 * Avoid discarding partial blocks at beginning and end;
 * return 0 in case no blocks can be discarded as a result.
 */
static int __ebs_discard_bio(struct ebs_c *ec, struct bio *bio)
{
	sector_t block, blocks, sector = bio->bi_iter.bi_sector;

	block = __sector_to_block(ec, sector);
	blocks = __nr_blocks(ec, bio);

	/*
	 * Partial first underlying block (__nr_blocks() may have
	 * resulted in one block).
	 */
	if (__block_mod(sector, ec->u_bs)) {
		block++;
		blocks--;
	}

	/* Partial last underlying block if any. */
	if (blocks && __block_mod(bio_end_sector(bio), ec->u_bs))
		blocks--;

	return blocks ? dm_bufio_issue_discard(ec->bufio, block, blocks) : 0;
}

/* Release the bio's blocks from the bufio cache. */
static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio)
{
	sector_t blocks, sector = bio->bi_iter.bi_sector;

	blocks = __nr_blocks(ec, bio);

	dm_bufio_forget_buffers(ec->bufio, __sector_to_block(ec, sector), blocks);
}

/* Worker function to process incoming bios. */
static void __ebs_process_bios(struct work_struct *ws)
{
	int r;
	bool write = false;
	sector_t block1, block2;
	struct ebs_c *ec = container_of(ws, struct ebs_c, ws);
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);

	spin_lock_irq(&ec->lock);
	bios = ec->bios_in;
	bio_list_init(&ec->bios_in);
	spin_unlock_irq(&ec->lock);

	/* Prefetch all read and any mis-aligned write buffers */
	bio_list_for_each(bio, &bios) {
		block1 = __sector_to_block(ec, bio->bi_iter.bi_sector);
		if (bio_op(bio) == REQ_OP_READ)
			dm_bufio_prefetch(ec->bufio, block1, __nr_blocks(ec, bio));
		else if (bio_op(bio) == REQ_OP_WRITE && !(bio->bi_opf & REQ_PREFLUSH)) {
			block2 = __sector_to_block(ec, bio_end_sector(bio));
			if (__block_mod(bio->bi_iter.bi_sector, ec->u_bs))
				dm_bufio_prefetch(ec->bufio, block1, 1);
			if (__block_mod(bio_end_sector(bio), ec->u_bs) && block2 != block1)
				dm_bufio_prefetch(ec->bufio, block2, 1);
		}
	}

	bio_list_for_each(bio, &bios) {
		r = -EIO;
		if (bio_op(bio) == REQ_OP_READ)
			r = __ebs_rw_bio(ec, READ, bio);
		else if (bio_op(bio) == REQ_OP_WRITE) {
			write = true;
			r = __ebs_rw_bio(ec, WRITE, bio);
		} else if (bio_op(bio) == REQ_OP_DISCARD) {
			__ebs_forget_bio(ec, bio);
			r = __ebs_discard_bio(ec, bio);
		}

		if (r < 0)
			bio->bi_status = errno_to_blk_status(r);
	}

	/*
	 * We write dirty buffers after processing I/O on them
	 * but before we endio, thus addressing REQ_FUA/REQ_SYNC.
	 */
	r = write ? dm_bufio_write_dirty_buffers(ec->bufio) : 0;

	while ((bio = bio_list_pop(&bios))) {
		/* Any other request is endioed. */
		if (unlikely(r && bio_op(bio) == REQ_OP_WRITE))
			bio_io_error(bio);
		else
			bio_endio(bio);
	}
}

/*
 * Construct an emulated block size mapping: <dev_path> <offset> <ebs> [<ubs>]
 *
 * <dev_path>: path of the underlying device
 * <offset>: offset in 512-byte sectors into <dev_path>
 * <ebs>: emulated block size in units of 512-byte sectors exposed to the upper layer
 * [<ubs>]: underlying block size in units of 512-byte sectors imposed on the lower layer;
 *	    optional, if not supplied, retrieve logical block size from underlying device
 */
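/*
 * Illustrative example (device path and length are placeholders, not part of
 * this target): to expose 512-byte logical blocks on a 4 KiB-native /dev/sda,
 * a table line could look like
 *
 *	0 <dev_sectors> ebs /dev/sda 0 1 8
 *
 * i.e. no offset, 1-sector (512 byte) emulated blocks and 8-sector (4 KiB)
 * underlying blocks; omitting the trailing 8 takes the underlying block size
 * from the device's logical block size instead.
 */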
static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	unsigned short tmp1;
	unsigned long long tmp;
	char dummy;
	struct ebs_c *ec;

	if (argc < 3 || argc > 4) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	ec = ti->private = kzalloc(sizeof(*ec), GFP_KERNEL);
	if (!ec) {
		ti->error = "Cannot allocate ebs context";
		return -ENOMEM;
	}

	r = -EINVAL;
	if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 ||
	    tmp != (sector_t)tmp ||
	    (sector_t)tmp >= ti->len) {
		ti->error = "Invalid device offset sector";
		goto bad;
	}
	ec->start = tmp;

	if (sscanf(argv[2], "%hu%c", &tmp1, &dummy) != 1 ||
	    !__ebs_check_bs(tmp1) ||
	    to_bytes(tmp1) > PAGE_SIZE) {
		ti->error = "Invalid emulated block size";
		goto bad;
	}
	ec->e_bs = tmp1;

	if (argc > 3) {
		if (sscanf(argv[3], "%hu%c", &tmp1, &dummy) != 1 || !__ebs_check_bs(tmp1)) {
			ti->error = "Invalid underlying block size";
			goto bad;
		}
		ec->u_bs = tmp1;
		ec->u_bs_set = true;
	} else
		ec->u_bs_set = false;

	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ec->dev);
	if (r) {
		ti->error = "Device lookup failed";
		ec->dev = NULL;
		goto bad;
	}

	r = -EINVAL;
	if (!ec->u_bs_set) {
		ec->u_bs = to_sector(bdev_logical_block_size(ec->dev->bdev));
		if (!__ebs_check_bs(ec->u_bs)) {
			ti->error = "Invalid retrieved underlying block size";
			goto bad;
		}
	}

	if (!ec->u_bs_set && ec->e_bs == ec->u_bs)
		DMINFO("Emulation superfluous: emulated equal to underlying block size");

	if (__block_mod(ec->start, ec->u_bs)) {
		ti->error = "Device offset must be multiple of underlying block size";
		goto bad;
	}

	ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, 0, NULL, NULL);
	if (IS_ERR(ec->bufio)) {
		ti->error = "Cannot create dm bufio client";
		r = PTR_ERR(ec->bufio);
		ec->bufio = NULL;
		goto bad;
	}

	ec->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
	if (!ec->wq) {
		ti->error = "Cannot create dm-" DM_MSG_PREFIX " workqueue";
		r = -ENOMEM;
		goto bad;
	}

	ec->block_shift = __ffs(ec->u_bs);
	INIT_WORK(&ec->ws, &__ebs_process_bios);
	bio_list_init(&ec->bios_in);
	spin_lock_init(&ec->lock);

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_secure_erase_bios = 0;
	ti->num_write_zeroes_bios = 0;
	return 0;
bad:
	ebs_dtr(ti);
	return r;
}

static void ebs_dtr(struct dm_target *ti)
{
	struct ebs_c *ec = ti->private;

	if (ec->wq)
		destroy_workqueue(ec->wq);
	if (ec->bufio)
		dm_bufio_client_destroy(ec->bufio);
	if (ec->dev)
		dm_put_device(ti, ec->dev);
	kfree(ec);
}

static int ebs_map(struct dm_target *ti, struct bio *bio)
{
	struct ebs_c *ec = ti->private;

	bio_set_dev(bio, ec->dev->bdev);
	bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector);

	if (unlikely(bio_op(bio) == REQ_OP_FLUSH))
		return DM_MAPIO_REMAPPED;
	/*
	 * Only queue for bufio processing in case of partial or overlapping buffers
	 * -or-
	 * emulation with ebs == ubs aiming for tests of dm-bufio overhead.
	 */
	if (likely(__block_mod(bio->bi_iter.bi_sector, ec->u_bs) ||
		   __block_mod(bio_end_sector(bio), ec->u_bs) ||
		   ec->e_bs == ec->u_bs)) {
		spin_lock_irq(&ec->lock);
		bio_list_add(&ec->bios_in, bio);
		spin_unlock_irq(&ec->lock);

		queue_work(ec->wq, &ec->ws);

		return DM_MAPIO_SUBMITTED;
	}

	/* Forget any buffer content relative to this direct backing device I/O. */
	__ebs_forget_bio(ec, bio);

	return DM_MAPIO_REMAPPED;
}
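
/*
 * Illustrative note on ebs_map() above (the numbers are assumptions, not
 * requirements): with e_bs = 1 and u_bs = 8, a 4 KiB bio starting at
 * (remapped) sector 8 is aligned on both ends and is remapped straight to
 * the backing device, whereas a 512-byte bio at sector 3 is misaligned and
 * gets queued for read-modify-write through dm-bufio.
 */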

static void ebs_status(struct dm_target *ti, status_type_t type,
		       unsigned status_flags, char *result, unsigned maxlen)
{
	struct ebs_c *ec = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		*result = '\0';
		break;
	case STATUSTYPE_TABLE:
		snprintf(result, maxlen, ec->u_bs_set ? "%s %llu %u %u" : "%s %llu %u",
			 ec->dev->name, (unsigned long long) ec->start, ec->e_bs, ec->u_bs);
		break;
	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
}

static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
	struct ebs_c *ec = ti->private;
	struct dm_dev *dev = ec->dev;

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	*bdev = dev->bdev;
	return !!(ec->start || ti->len != bdev_nr_sectors(dev->bdev));
}

static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct ebs_c *ec = ti->private;

	limits->logical_block_size = to_bytes(ec->e_bs);
	limits->physical_block_size = to_bytes(ec->u_bs);
	limits->alignment_offset = limits->physical_block_size;
	blk_limits_io_min(limits, limits->logical_block_size);
}

static int ebs_iterate_devices(struct dm_target *ti,
			       iterate_devices_callout_fn fn, void *data)
{
	struct ebs_c *ec = ti->private;

	return fn(ti, ec->dev, ec->start, ti->len, data);
}

static struct target_type ebs_target = {
	.name = "ebs",
	.version = {1, 0, 1},
	.features = DM_TARGET_PASSES_INTEGRITY,
	.module = THIS_MODULE,
	.ctr = ebs_ctr,
	.dtr = ebs_dtr,
	.map = ebs_map,
	.status = ebs_status,
	.io_hints = ebs_io_hints,
	.prepare_ioctl = ebs_prepare_ioctl,
	.iterate_devices = ebs_iterate_devices,
};

static int __init dm_ebs_init(void)
{
	int r = dm_register_target(&ebs_target);

	if (r < 0)
		DMERR("register failed %d", r);

	return r;
}

static void dm_ebs_exit(void)
{
	dm_unregister_target(&ebs_target);
}

module_init(dm_ebs_init);
module_exit(dm_ebs_exit);

MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " emulated block size target");
MODULE_LICENSE("GPL");