aoedev.c (11164B)
/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
/*
 * aoedev.c
 * AoE device utility functions; maintains device list.
 */

#include <linux/hdreg.h>
#include <linux/blk-mq.h>
#include <linux/netdevice.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/bitmap.h>
#include <linux/kdev_t.h>
#include <linux/moduleparam.h>
#include <linux/string.h>
#include "aoe.h"

static void freetgt(struct aoedev *d, struct aoetgt *t);
static void skbpoolfree(struct aoedev *d);

static int aoe_dyndevs = 1;
module_param(aoe_dyndevs, int, 0644);
MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices.");

static struct aoedev *devlist;
static DEFINE_SPINLOCK(devlist_lock);

/* Because some systems will have one, many, or no
 *   - partitions,
 *   - slots per shelf,
 *   - or shelves,
 * we need some flexibility in the way the minor numbers
 * are allocated.  So they are dynamic.
 */
#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS)

static DEFINE_SPINLOCK(used_minors_lock);
static DECLARE_BITMAP(used_minors, N_DEVS);

static int
minor_get_dyn(ulong *sysminor)
{
	ulong flags;
	ulong n;
	int error = 0;

	spin_lock_irqsave(&used_minors_lock, flags);
	n = find_first_zero_bit(used_minors, N_DEVS);
	if (n < N_DEVS)
		set_bit(n, used_minors);
	else
		error = -1;
	spin_unlock_irqrestore(&used_minors_lock, flags);

	*sysminor = n * AOE_PARTITIONS;
	return error;
}

static int
minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin)
{
	ulong flags;
	ulong n;
	int error = 0;
	enum {
		/* for backwards compatibility when !aoe_dyndevs,
		 * a static number of supported slots per shelf */
		NPERSHELF = 16,
	};

	if (aoemin >= NPERSHELF) {
		pr_err("aoe: %s %d slots per shelf\n",
			"static minor device numbers support only",
			NPERSHELF);
		error = -1;
		goto out;
	}

	n = aoemaj * NPERSHELF + aoemin;
	if (n >= N_DEVS) {
		pr_err("aoe: %s with e%ld.%d\n",
			"cannot use static minor device numbers",
			aoemaj, aoemin);
		error = -1;
		goto out;
	}

	spin_lock_irqsave(&used_minors_lock, flags);
	if (test_bit(n, used_minors)) {
		pr_err("aoe: %s %lu\n",
			"existing device already has static minor number",
			n);
		error = -1;
	} else
		set_bit(n, used_minors);
	spin_unlock_irqrestore(&used_minors_lock, flags);
	*sysminor = n * AOE_PARTITIONS;
out:
	return error;
}

static int
minor_get(ulong *sysminor, ulong aoemaj, int aoemin)
{
	if (aoe_dyndevs)
		return minor_get_dyn(sysminor);
	else
		return minor_get_static(sysminor, aoemaj, aoemin);
}

static void
minor_free(ulong minor)
{
	ulong flags;

	minor /= AOE_PARTITIONS;
	BUG_ON(minor >= N_DEVS);

	spin_lock_irqsave(&used_minors_lock, flags);
	BUG_ON(!test_bit(minor, used_minors));
	clear_bit(minor, used_minors);
	spin_unlock_irqrestore(&used_minors_lock, flags);
}

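/*
 * Worked example, for illustration only (not part of the driver):
 * with the kernel's MINORBITS of 20 and assuming AOE_PARTITIONS is
 * 16, N_DEVS is (1U << 20) / 16 == 65536 possible devices.  Dynamic
 * numbering hands out the first clear bit n in used_minors, so the
 * first device gets sysminor 0, the next 16, and so on.  Static
 * numbering maps the AoE address instead: e1.3 gives
 * n = 1 * NPERSHELF + 3 == 19 and sysminor = 19 * AOE_PARTITIONS == 304.
 */
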
/*
 * Users who grab a pointer to the device with aoedev_by_aoeaddr
 * automatically get a reference count and are responsible
 * for performing an aoedev_put.  With the addition of async
 * kthread processing I'm no longer confident that we can
 * guarantee consistency in the face of device flushes.
 *
 * For the time being, we only bother to add extra references for
 * frames sitting on the iocq.  When the kthreads finish processing
 * these frames, they will aoedev_put the device.
 */

void
aoedev_put(struct aoedev *d)
{
	ulong flags;

	spin_lock_irqsave(&devlist_lock, flags);
	d->ref--;
	spin_unlock_irqrestore(&devlist_lock, flags);
}

static void
dummy_timer(struct timer_list *t)
{
	struct aoedev *d;

	d = from_timer(d, t, timer);
	if (d->flags & DEVFL_TKILL)
		return;
	d->timer.expires = jiffies + HZ;
	add_timer(&d->timer);
}

static void
aoe_failip(struct aoedev *d)
{
	struct request *rq;
	struct aoe_req *req;
	struct bio *bio;

	aoe_failbuf(d, d->ip.buf);
	rq = d->ip.rq;
	if (rq == NULL)
		return;

	req = blk_mq_rq_to_pdu(rq);
	while ((bio = d->ip.nxbio)) {
		bio->bi_status = BLK_STS_IOERR;
		d->ip.nxbio = bio->bi_next;
		req->nr_bios--;
	}

	if (!req->nr_bios)
		aoe_end_request(d, rq, 0);
}

static void
downdev_frame(struct list_head *pos)
{
	struct frame *f;

	f = list_entry(pos, struct frame, head);
	list_del(pos);
	if (f->buf) {
		f->buf->nframesout--;
		aoe_failbuf(f->t->d, f->buf);
	}
	aoe_freetframe(f);
}

void
aoedev_downdev(struct aoedev *d)
{
	struct aoetgt *t, **tt, **te;
	struct list_head *head, *pos, *nx;
	int i;

	d->flags &= ~DEVFL_UP;

	/* clean out active and to-be-retransmitted buffers */
	for (i = 0; i < NFACTIVE; i++) {
		head = &d->factive[i];
		list_for_each_safe(pos, nx, head)
			downdev_frame(pos);
	}
	head = &d->rexmitq;
	list_for_each_safe(pos, nx, head)
		downdev_frame(pos);

	/* reset window dressings */
	tt = d->targets;
	te = tt + d->ntargets;
	for (; tt < te && (t = *tt); tt++) {
		aoecmd_wreset(t);
		t->nout = 0;
	}

	/* clean out the in-process request (if any) */
	aoe_failip(d);

	/* fast fail all pending I/O */
	if (d->blkq) {
		/* UP is cleared, freeze+quiesce to ensure all are errored */
		blk_mq_freeze_queue(d->blkq);
		blk_mq_quiesce_queue(d->blkq);
		blk_mq_unquiesce_queue(d->blkq);
		blk_mq_unfreeze_queue(d->blkq);
	}

	if (d->gd)
		set_capacity(d->gd, 0);
}

/* return whether the user asked for this particular
 * device to be flushed
 */
static int
user_req(char *s, size_t slen, struct aoedev *d)
{
	const char *p;
	size_t lim;

	if (!d->gd)
		return 0;
	p = kbasename(d->gd->disk_name);
	lim = sizeof(d->gd->disk_name);
	lim -= p - d->gd->disk_name;
	if (slen < lim)
		lim = slen;

	return !strncmp(s, p, lim);
}

static void
freedev(struct aoedev *d)
{
	struct aoetgt **t, **e;
	int freeing = 0;
	unsigned long flags;

	spin_lock_irqsave(&d->lock, flags);
	if (d->flags & DEVFL_TKILL
	&& !(d->flags & DEVFL_FREEING)) {
		d->flags |= DEVFL_FREEING;
		freeing = 1;
	}
	spin_unlock_irqrestore(&d->lock, flags);
	if (!freeing)
		return;

	del_timer_sync(&d->timer);
	if (d->gd) {
		aoedisk_rm_debugfs(d);
		del_gendisk(d->gd);
		blk_cleanup_disk(d->gd);
		blk_mq_free_tag_set(&d->tag_set);
	}
	t = d->targets;
	e = t + d->ntargets;
	for (; t < e && *t; t++)
		freetgt(d, *t);

	mempool_destroy(d->bufpool);
	skbpoolfree(d);
	minor_free(d->sysminor);

	spin_lock_irqsave(&d->lock, flags);
	d->flags |= DEVFL_FREED;
	spin_unlock_irqrestore(&d->lock, flags);
}

enum flush_parms {
	NOT_EXITING = 0,
	EXITING = 1,
};

static int
flush(const char __user *str, size_t cnt, int exiting)
{
	ulong flags;
	struct aoedev *d, **dd;
	char buf[16];
	int all = 0;
	int specified = 0;	/* flush a specific device */
	unsigned int skipflags;

	skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL;

	if (!exiting && cnt >= 3) {
		if (cnt > sizeof buf)
			cnt = sizeof buf;
		if (copy_from_user(buf, str, cnt))
			return -EFAULT;
		all = !strncmp(buf, "all", 3);
		if (!all)
			specified = 1;
	}

	flush_workqueue(aoe_wq);
	/* pass one: do aoedev_downdev, which might sleep */
restart1:
	spin_lock_irqsave(&devlist_lock, flags);
	for (d = devlist; d; d = d->next) {
		spin_lock(&d->lock);
		if (d->flags & DEVFL_TKILL)
			goto cont;

		if (exiting) {
			/* unconditionally take each device down */
		} else if (specified) {
			if (!user_req(buf, cnt, d))
				goto cont;
		} else if ((!all && (d->flags & DEVFL_UP))
		|| d->flags & skipflags
		|| d->nopen
		|| d->ref)
			goto cont;

		spin_unlock(&d->lock);
		spin_unlock_irqrestore(&devlist_lock, flags);
		aoedev_downdev(d);
		d->flags |= DEVFL_TKILL;
		goto restart1;
cont:
		spin_unlock(&d->lock);
	}
	spin_unlock_irqrestore(&devlist_lock, flags);

	/* pass two: call freedev, which might sleep,
	 * for aoedevs marked with DEVFL_TKILL
	 */
restart2:
	spin_lock_irqsave(&devlist_lock, flags);
	for (d = devlist; d; d = d->next) {
		spin_lock(&d->lock);
		if (d->flags & DEVFL_TKILL
		&& !(d->flags & DEVFL_FREEING)) {
			spin_unlock(&d->lock);
			spin_unlock_irqrestore(&devlist_lock, flags);
			freedev(d);
			goto restart2;
		}
		spin_unlock(&d->lock);
	}

	/* pass three: remove aoedevs marked with DEVFL_FREED */
	for (dd = &devlist, d = *dd; d; d = *dd) {
		struct aoedev *doomed = NULL;

		spin_lock(&d->lock);
		if (d->flags & DEVFL_FREED) {
			*dd = d->next;
			doomed = d;
		} else {
			dd = &d->next;
		}
		spin_unlock(&d->lock);
		if (doomed)
			kfree(doomed->targets);
		kfree(doomed);
	}
	spin_unlock_irqrestore(&devlist_lock, flags);

	return 0;
}

int
aoedev_flush(const char __user *str, size_t cnt)
{
	return flush(str, cnt, NOT_EXITING);
}

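/*
 * Illustrative sketch, not part of the driver: aoedev_flush() above is
 * what a write to the aoe control node is assumed to reach (via the aoe
 * character device, typically /dev/etherd/flush), with the user buffer
 * passed through unmodified.  Writing "all" takes down every device
 * that is not open, referenced, or mid-allocation, even if it is still
 * up; writing a device name flushes just that device.  user_req()
 * compares the written bytes against the gendisk's base name, so a
 * trailing newline would defeat an exact name match:
 *
 *	printf all > /dev/etherd/flush		# hypothetical shell usage
 *	printf e1.0 > /dev/etherd/flush
 */
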
/* This has been confirmed to occur once with Tms=3*1000 due to the
 * driver changing link and not processing its transmit ring.  The
 * problem is hard enough to solve by returning an error that I'm
 * still punting on "solving" this.
 */
static void
skbfree(struct sk_buff *skb)
{
	enum { Sms = 250, Tms = 30 * 1000};
	int i = Tms / Sms;

	if (skb == NULL)
		return;
	while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0)
		msleep(Sms);
	if (i < 0) {
		printk(KERN_ERR
			"aoe: %s holds ref: %s\n",
			skb->dev ? skb->dev->name : "netif",
			"cannot free skb -- memory leaked.");
		return;
	}
	skb->truesize -= skb->data_len;
	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
	skb_trim(skb, 0);
	dev_kfree_skb(skb);
}

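/*
 * Worked numbers for the wait in skbfree() above, for illustration
 * only: with Sms == 250 and Tms == 30 * 1000, the loop msleep()s in
 * 250 ms steps for at most 30000 / 250 == 120 iterations, i.e. about
 * 30 seconds, before giving up and deliberately leaking the skb
 * rather than freeing memory a network driver may still reference.
 */
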
static void
skbpoolfree(struct aoedev *d)
{
	struct sk_buff *skb, *tmp;

	skb_queue_walk_safe(&d->skbpool, skb, tmp)
		skbfree(skb);

	__skb_queue_head_init(&d->skbpool);
}

/* find it or allocate it */
struct aoedev *
aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
{
	struct aoedev *d;
	int i;
	ulong flags;
	ulong sysminor = 0;

	spin_lock_irqsave(&devlist_lock, flags);

	for (d=devlist; d; d=d->next)
		if (d->aoemajor == maj && d->aoeminor == min) {
			spin_lock(&d->lock);
			if (d->flags & DEVFL_TKILL) {
				spin_unlock(&d->lock);
				d = NULL;
				goto out;
			}
			d->ref++;
			spin_unlock(&d->lock);
			break;
		}
	if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0)
		goto out;
	d = kcalloc(1, sizeof *d, GFP_ATOMIC);
	if (!d)
		goto out;
	d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC);
	if (!d->targets) {
		kfree(d);
		d = NULL;
		goto out;
	}
	d->ntargets = NTARGETS;
	INIT_WORK(&d->work, aoecmd_sleepwork);
	spin_lock_init(&d->lock);
	INIT_LIST_HEAD(&d->rq_list);
	skb_queue_head_init(&d->skbpool);
	timer_setup(&d->timer, dummy_timer, 0);
	d->timer.expires = jiffies + HZ;
	add_timer(&d->timer);
	d->bufpool = NULL;	/* defer to aoeblk_gdalloc */
	d->tgt = d->targets;
	d->ref = 1;
	for (i = 0; i < NFACTIVE; i++)
		INIT_LIST_HEAD(&d->factive[i]);
	INIT_LIST_HEAD(&d->rexmitq);
	d->sysminor = sysminor;
	d->aoemajor = maj;
	d->aoeminor = min;
	d->rttavg = RTTAVG_INIT;
	d->rttdev = RTTDEV_INIT;
	d->next = devlist;
	devlist = d;
 out:
	spin_unlock_irqrestore(&devlist_lock, flags);
	return d;
}

static void
freetgt(struct aoedev *d, struct aoetgt *t)
{
	struct frame *f;
	struct list_head *pos, *nx, *head;
	struct aoeif *ifp;

	for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) {
		if (!ifp->nd)
			break;
		dev_put(ifp->nd);
	}

	head = &t->ffree;
	list_for_each_safe(pos, nx, head) {
		list_del(pos);
		f = list_entry(pos, struct frame, head);
		skbfree(f->skb);
		kfree(f);
	}
	kfree(t);
}

void
aoedev_exit(void)
{
	flush_workqueue(aoe_wq);
	flush(NULL, 0, EXITING);
}

int __init
aoedev_init(void)
{
	return 0;
}

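/*
 * Illustrative sketch, not part of the driver: the lookup/reference
 * pairing that aoedev_by_aoeaddr() and aoedev_put() provide, as
 * described in the comment above aoedev_put().  The caller below is
 * hypothetical; only the two aoedev_* calls are real.
 *
 *	static void example_caller(ulong maj, int min)
 *	{
 *		struct aoedev *d;
 *
 *		d = aoedev_by_aoeaddr(maj, min, 0); // lookup only; takes a ref
 *		if (d == NULL)
 *			return;                     // unknown or dying device
 *		// ... use d while holding the reference ...
 *		aoedev_put(d);                      // drop the reference
 *	}
 */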