habanalabs_drv.c (15612B)
1// SPDX-License-Identifier: GPL-2.0 2 3/* 4 * Copyright 2016-2021 HabanaLabs, Ltd. 5 * All Rights Reserved. 6 * 7 */ 8 9#define pr_fmt(fmt) "habanalabs: " fmt 10 11#include "habanalabs.h" 12 13#include <linux/pci.h> 14#include <linux/aer.h> 15#include <linux/module.h> 16 17#define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team" 18 19#define HL_DRIVER_DESC "Driver for HabanaLabs's AI Accelerators" 20 21MODULE_AUTHOR(HL_DRIVER_AUTHOR); 22MODULE_DESCRIPTION(HL_DRIVER_DESC); 23MODULE_LICENSE("GPL v2"); 24 25static int hl_major; 26static struct class *hl_class; 27static DEFINE_IDR(hl_devs_idr); 28static DEFINE_MUTEX(hl_devs_idr_lock); 29 30static int timeout_locked = 30; 31static int reset_on_lockup = 1; 32static int memory_scrub; 33static ulong boot_error_status_mask = ULONG_MAX; 34 35module_param(timeout_locked, int, 0444); 36MODULE_PARM_DESC(timeout_locked, 37 "Device lockup timeout in seconds (0 = disabled, default 30s)"); 38 39module_param(reset_on_lockup, int, 0444); 40MODULE_PARM_DESC(reset_on_lockup, 41 "Do device reset on lockup (0 = no, 1 = yes, default yes)"); 42 43module_param(memory_scrub, int, 0444); 44MODULE_PARM_DESC(memory_scrub, 45 "Scrub device memory in various states (0 = no, 1 = yes, default no)"); 46 47module_param(boot_error_status_mask, ulong, 0444); 48MODULE_PARM_DESC(boot_error_status_mask, 49 "Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)"); 50 51#define PCI_VENDOR_ID_HABANALABS 0x1da3 52 53#define PCI_IDS_GOYA 0x0001 54#define PCI_IDS_GAUDI 0x1000 55#define PCI_IDS_GAUDI_SEC 0x1010 56 57static const struct pci_device_id ids[] = { 58 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), }, 59 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), }, 60 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), }, 61 { 0, } 62}; 63MODULE_DEVICE_TABLE(pci, ids); 64 65/* 66 * get_asic_type - translate device id to asic type 67 * 68 * @device: id of the PCI device 69 * 70 * Translate device id to asic type. 71 * In case of unidentified device, return -1 72 */ 73static enum hl_asic_type get_asic_type(u16 device) 74{ 75 enum hl_asic_type asic_type; 76 77 switch (device) { 78 case PCI_IDS_GOYA: 79 asic_type = ASIC_GOYA; 80 break; 81 case PCI_IDS_GAUDI: 82 asic_type = ASIC_GAUDI; 83 break; 84 case PCI_IDS_GAUDI_SEC: 85 asic_type = ASIC_GAUDI_SEC; 86 break; 87 default: 88 asic_type = ASIC_INVALID; 89 break; 90 } 91 92 return asic_type; 93} 94 95static bool is_asic_secured(enum hl_asic_type asic_type) 96{ 97 switch (asic_type) { 98 case ASIC_GAUDI_SEC: 99 return true; 100 default: 101 return false; 102 } 103} 104 105/* 106 * hl_device_open - open function for habanalabs device 107 * 108 * @inode: pointer to inode structure 109 * @filp: pointer to file structure 110 * 111 * Called when process opens an habanalabs device. 112 */ 113int hl_device_open(struct inode *inode, struct file *filp) 114{ 115 enum hl_device_status status; 116 struct hl_device *hdev; 117 struct hl_fpriv *hpriv; 118 int rc; 119 120 mutex_lock(&hl_devs_idr_lock); 121 hdev = idr_find(&hl_devs_idr, iminor(inode)); 122 mutex_unlock(&hl_devs_idr_lock); 123 124 if (!hdev) { 125 pr_err("Couldn't find device %d:%d\n", 126 imajor(inode), iminor(inode)); 127 return -ENXIO; 128 } 129 130 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL); 131 if (!hpriv) 132 return -ENOMEM; 133 134 hpriv->hdev = hdev; 135 filp->private_data = hpriv; 136 hpriv->filp = filp; 137 138 mutex_init(&hpriv->notifier_event.lock); 139 mutex_init(&hpriv->restore_phase_mutex); 140 kref_init(&hpriv->refcount); 141 nonseekable_open(inode, filp); 142 143 hl_ctx_mgr_init(&hpriv->ctx_mgr); 144 hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr); 145 146 hpriv->taskpid = get_task_pid(current, PIDTYPE_PID); 147 148 mutex_lock(&hdev->fpriv_list_lock); 149 150 if (!hl_device_operational(hdev, &status)) { 151 dev_err_ratelimited(hdev->dev, 152 "Can't open %s because it is %s\n", 153 dev_name(hdev->dev), hdev->status[status]); 154 155 if (status == HL_DEVICE_STATUS_IN_RESET) 156 rc = -EAGAIN; 157 else 158 rc = -EPERM; 159 160 goto out_err; 161 } 162 163 if (hdev->is_in_dram_scrub) { 164 dev_dbg_ratelimited(hdev->dev, 165 "Can't open %s during dram scrub\n", 166 dev_name(hdev->dev)); 167 rc = -EAGAIN; 168 goto out_err; 169 } 170 171 if (hdev->compute_ctx_in_release) { 172 dev_dbg_ratelimited(hdev->dev, 173 "Can't open %s because another user is still releasing it\n", 174 dev_name(hdev->dev)); 175 rc = -EAGAIN; 176 goto out_err; 177 } 178 179 if (hdev->is_compute_ctx_active) { 180 dev_dbg_ratelimited(hdev->dev, 181 "Can't open %s because another user is working on it\n", 182 dev_name(hdev->dev)); 183 rc = -EBUSY; 184 goto out_err; 185 } 186 187 rc = hl_ctx_create(hdev, hpriv); 188 if (rc) { 189 dev_err(hdev->dev, "Failed to create context %d\n", rc); 190 goto out_err; 191 } 192 193 list_add(&hpriv->dev_node, &hdev->fpriv_list); 194 mutex_unlock(&hdev->fpriv_list_lock); 195 196 hl_debugfs_add_file(hpriv); 197 198 atomic_set(&hdev->last_error.cs_timeout.write_disable, 0); 199 atomic_set(&hdev->last_error.razwi.write_disable, 0); 200 201 hdev->open_counter++; 202 hdev->last_successful_open_jif = jiffies; 203 hdev->last_successful_open_ktime = ktime_get(); 204 205 return 0; 206 207out_err: 208 mutex_unlock(&hdev->fpriv_list_lock); 209 hl_mem_mgr_fini(&hpriv->mem_mgr); 210 hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr); 211 filp->private_data = NULL; 212 mutex_destroy(&hpriv->restore_phase_mutex); 213 mutex_destroy(&hpriv->notifier_event.lock); 214 put_pid(hpriv->taskpid); 215 216 kfree(hpriv); 217 218 return rc; 219} 220 221int hl_device_open_ctrl(struct inode *inode, struct file *filp) 222{ 223 struct hl_device *hdev; 224 struct hl_fpriv *hpriv; 225 int rc; 226 227 mutex_lock(&hl_devs_idr_lock); 228 hdev = idr_find(&hl_devs_idr, iminor(inode)); 229 mutex_unlock(&hl_devs_idr_lock); 230 231 if (!hdev) { 232 pr_err("Couldn't find device %d:%d\n", 233 imajor(inode), iminor(inode)); 234 return -ENXIO; 235 } 236 237 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL); 238 if (!hpriv) 239 return -ENOMEM; 240 241 /* Prevent other routines from reading partial hpriv data by 242 * initializing hpriv fields before inserting it to the list 243 */ 244 hpriv->hdev = hdev; 245 filp->private_data = hpriv; 246 hpriv->filp = filp; 247 248 mutex_init(&hpriv->notifier_event.lock); 249 nonseekable_open(inode, filp); 250 251 hpriv->taskpid = get_task_pid(current, PIDTYPE_PID); 252 253 mutex_lock(&hdev->fpriv_ctrl_list_lock); 254 255 if (!hl_device_operational(hdev, NULL)) { 256 dev_err_ratelimited(hdev->dev_ctrl, 257 "Can't open %s because it is disabled or in reset\n", 258 dev_name(hdev->dev_ctrl)); 259 rc = -EPERM; 260 goto out_err; 261 } 262 263 list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list); 264 mutex_unlock(&hdev->fpriv_ctrl_list_lock); 265 266 return 0; 267 268out_err: 269 mutex_unlock(&hdev->fpriv_ctrl_list_lock); 270 filp->private_data = NULL; 271 put_pid(hpriv->taskpid); 272 273 kfree(hpriv); 274 275 return rc; 276} 277 278static void set_driver_behavior_per_device(struct hl_device *hdev) 279{ 280 hdev->pldm = 0; 281 hdev->fw_components = FW_TYPE_ALL_TYPES; 282 hdev->cpu_queues_enable = 1; 283 hdev->heartbeat = 1; 284 hdev->mmu_enable = 1; 285 hdev->sram_scrambler_enable = 1; 286 hdev->dram_scrambler_enable = 1; 287 hdev->bmc_enable = 1; 288 hdev->hard_reset_on_fw_events = 1; 289 hdev->reset_on_preboot_fail = 1; 290 hdev->reset_if_device_not_idle = 1; 291 292 hdev->reset_pcilink = 0; 293 hdev->axi_drain = 0; 294} 295 296static void copy_kernel_module_params_to_device(struct hl_device *hdev) 297{ 298 hdev->major = hl_major; 299 hdev->memory_scrub = memory_scrub; 300 hdev->reset_on_lockup = reset_on_lockup; 301 hdev->boot_error_status_mask = boot_error_status_mask; 302 303 if (timeout_locked) 304 hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000); 305 else 306 hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT; 307 308} 309 310static int fixup_device_params(struct hl_device *hdev) 311{ 312 hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type); 313 314 hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC; 315 hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC; 316 317 hdev->stop_on_err = true; 318 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; 319 hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; 320 321 /* Enable only after the initialization of the device */ 322 hdev->disabled = true; 323 324 return 0; 325} 326 327/** 328 * create_hdev - create habanalabs device instance 329 * 330 * @dev: will hold the pointer to the new habanalabs device structure 331 * @pdev: pointer to the pci device 332 * 333 * Allocate memory for habanalabs device and initialize basic fields 334 * Identify the ASIC type 335 * Allocate ID (minor) for the device (only for real devices) 336 */ 337static int create_hdev(struct hl_device **dev, struct pci_dev *pdev) 338{ 339 int main_id, ctrl_id = 0, rc = 0; 340 struct hl_device *hdev; 341 342 *dev = NULL; 343 344 hdev = kzalloc(sizeof(*hdev), GFP_KERNEL); 345 if (!hdev) 346 return -ENOMEM; 347 348 /* can be NULL in case of simulator device */ 349 hdev->pdev = pdev; 350 351 /* Assign status description string */ 352 strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX); 353 strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX); 354 strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX); 355 strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX); 356 strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION], 357 "in device creation", HL_STR_MAX); 358 359 /* First, we must find out which ASIC are we handling. This is needed 360 * to configure the behavior of the driver (kernel parameters) 361 */ 362 hdev->asic_type = get_asic_type(pdev->device); 363 if (hdev->asic_type == ASIC_INVALID) { 364 dev_err(&pdev->dev, "Unsupported ASIC\n"); 365 rc = -ENODEV; 366 goto free_hdev; 367 } 368 369 copy_kernel_module_params_to_device(hdev); 370 371 set_driver_behavior_per_device(hdev); 372 373 fixup_device_params(hdev); 374 375 mutex_lock(&hl_devs_idr_lock); 376 377 /* Always save 2 numbers, 1 for main device and 1 for control. 378 * They must be consecutive 379 */ 380 main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL); 381 382 if (main_id >= 0) 383 ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1, 384 main_id + 2, GFP_KERNEL); 385 386 mutex_unlock(&hl_devs_idr_lock); 387 388 if ((main_id < 0) || (ctrl_id < 0)) { 389 if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC)) 390 pr_err("too many devices in the system\n"); 391 392 if (main_id >= 0) { 393 mutex_lock(&hl_devs_idr_lock); 394 idr_remove(&hl_devs_idr, main_id); 395 mutex_unlock(&hl_devs_idr_lock); 396 } 397 398 rc = -EBUSY; 399 goto free_hdev; 400 } 401 402 hdev->id = main_id; 403 hdev->id_control = ctrl_id; 404 405 *dev = hdev; 406 407 return 0; 408 409free_hdev: 410 kfree(hdev); 411 return rc; 412} 413 414/* 415 * destroy_hdev - destroy habanalabs device instance 416 * 417 * @dev: pointer to the habanalabs device structure 418 * 419 */ 420static void destroy_hdev(struct hl_device *hdev) 421{ 422 /* Remove device from the device list */ 423 mutex_lock(&hl_devs_idr_lock); 424 idr_remove(&hl_devs_idr, hdev->id); 425 idr_remove(&hl_devs_idr, hdev->id_control); 426 mutex_unlock(&hl_devs_idr_lock); 427 428 kfree(hdev); 429} 430 431static int hl_pmops_suspend(struct device *dev) 432{ 433 struct hl_device *hdev = dev_get_drvdata(dev); 434 435 pr_debug("Going to suspend PCI device\n"); 436 437 if (!hdev) { 438 pr_err("device pointer is NULL in suspend\n"); 439 return 0; 440 } 441 442 return hl_device_suspend(hdev); 443} 444 445static int hl_pmops_resume(struct device *dev) 446{ 447 struct hl_device *hdev = dev_get_drvdata(dev); 448 449 pr_debug("Going to resume PCI device\n"); 450 451 if (!hdev) { 452 pr_err("device pointer is NULL in resume\n"); 453 return 0; 454 } 455 456 return hl_device_resume(hdev); 457} 458 459/** 460 * hl_pci_probe - probe PCI habanalabs devices 461 * 462 * @pdev: pointer to pci device 463 * @id: pointer to pci device id structure 464 * 465 * Standard PCI probe function for habanalabs device. 466 * Create a new habanalabs device and initialize it according to the 467 * device's type 468 */ 469static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) 470{ 471 struct hl_device *hdev; 472 int rc; 473 474 dev_info(&pdev->dev, HL_NAME 475 " device found [%04x:%04x] (rev %x)\n", 476 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision); 477 478 rc = create_hdev(&hdev, pdev); 479 if (rc) 480 return rc; 481 482 pci_set_drvdata(pdev, hdev); 483 484 pci_enable_pcie_error_reporting(pdev); 485 486 rc = hl_device_init(hdev, hl_class); 487 if (rc) { 488 dev_err(&pdev->dev, "Fatal error during habanalabs device init\n"); 489 rc = -ENODEV; 490 goto disable_device; 491 } 492 493 return 0; 494 495disable_device: 496 pci_disable_pcie_error_reporting(pdev); 497 pci_set_drvdata(pdev, NULL); 498 destroy_hdev(hdev); 499 500 return rc; 501} 502 503/* 504 * hl_pci_remove - remove PCI habanalabs devices 505 * 506 * @pdev: pointer to pci device 507 * 508 * Standard PCI remove function for habanalabs device 509 */ 510static void hl_pci_remove(struct pci_dev *pdev) 511{ 512 struct hl_device *hdev; 513 514 hdev = pci_get_drvdata(pdev); 515 if (!hdev) 516 return; 517 518 hl_device_fini(hdev); 519 pci_disable_pcie_error_reporting(pdev); 520 pci_set_drvdata(pdev, NULL); 521 destroy_hdev(hdev); 522} 523 524/** 525 * hl_pci_err_detected - a PCI bus error detected on this device 526 * 527 * @pdev: pointer to pci device 528 * @state: PCI error type 529 * 530 * Called by the PCI subsystem whenever a non-correctable 531 * PCI bus error is detected 532 */ 533static pci_ers_result_t 534hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) 535{ 536 struct hl_device *hdev = pci_get_drvdata(pdev); 537 enum pci_ers_result result; 538 539 switch (state) { 540 case pci_channel_io_normal: 541 return PCI_ERS_RESULT_CAN_RECOVER; 542 543 case pci_channel_io_frozen: 544 dev_warn(hdev->dev, "frozen state error detected\n"); 545 result = PCI_ERS_RESULT_NEED_RESET; 546 break; 547 548 case pci_channel_io_perm_failure: 549 dev_warn(hdev->dev, "failure state error detected\n"); 550 result = PCI_ERS_RESULT_DISCONNECT; 551 break; 552 553 default: 554 result = PCI_ERS_RESULT_NONE; 555 } 556 557 hdev->asic_funcs->halt_engines(hdev, true, false); 558 559 return result; 560} 561 562/** 563 * hl_pci_err_resume - resume after a PCI slot reset 564 * 565 * @pdev: pointer to pci device 566 * 567 */ 568static void hl_pci_err_resume(struct pci_dev *pdev) 569{ 570 struct hl_device *hdev = pci_get_drvdata(pdev); 571 572 dev_warn(hdev->dev, "Resuming device after PCI slot reset\n"); 573 hl_device_resume(hdev); 574} 575 576/** 577 * hl_pci_err_slot_reset - a PCI slot reset has just happened 578 * 579 * @pdev: pointer to pci device 580 * 581 * Determine if the driver can recover from the PCI slot reset 582 */ 583static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev) 584{ 585 return PCI_ERS_RESULT_RECOVERED; 586} 587 588static const struct dev_pm_ops hl_pm_ops = { 589 .suspend = hl_pmops_suspend, 590 .resume = hl_pmops_resume, 591}; 592 593static const struct pci_error_handlers hl_pci_err_handler = { 594 .error_detected = hl_pci_err_detected, 595 .slot_reset = hl_pci_err_slot_reset, 596 .resume = hl_pci_err_resume, 597}; 598 599static struct pci_driver hl_pci_driver = { 600 .name = HL_NAME, 601 .id_table = ids, 602 .probe = hl_pci_probe, 603 .remove = hl_pci_remove, 604 .shutdown = hl_pci_remove, 605 .driver = { 606 .name = HL_NAME, 607 .pm = &hl_pm_ops, 608 .probe_type = PROBE_PREFER_ASYNCHRONOUS, 609 }, 610 .err_handler = &hl_pci_err_handler, 611}; 612 613/* 614 * hl_init - Initialize the habanalabs kernel driver 615 */ 616static int __init hl_init(void) 617{ 618 int rc; 619 dev_t dev; 620 621 pr_info("loading driver\n"); 622 623 rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME); 624 if (rc < 0) { 625 pr_err("unable to get major\n"); 626 return rc; 627 } 628 629 hl_major = MAJOR(dev); 630 631 hl_class = class_create(THIS_MODULE, HL_NAME); 632 if (IS_ERR(hl_class)) { 633 pr_err("failed to allocate class\n"); 634 rc = PTR_ERR(hl_class); 635 goto remove_major; 636 } 637 638 hl_debugfs_init(); 639 640 rc = pci_register_driver(&hl_pci_driver); 641 if (rc) { 642 pr_err("failed to register pci device\n"); 643 goto remove_debugfs; 644 } 645 646 pr_debug("driver loaded\n"); 647 648 return 0; 649 650remove_debugfs: 651 hl_debugfs_fini(); 652 class_destroy(hl_class); 653remove_major: 654 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS); 655 return rc; 656} 657 658/* 659 * hl_exit - Release all resources of the habanalabs kernel driver 660 */ 661static void __exit hl_exit(void) 662{ 663 pci_unregister_driver(&hl_pci_driver); 664 665 /* 666 * Removing debugfs must be after all devices or simulator devices 667 * have been removed because otherwise we get a bug in the 668 * debugfs module for referencing NULL objects 669 */ 670 hl_debugfs_fini(); 671 672 class_destroy(hl_class); 673 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS); 674 675 idr_destroy(&hl_devs_idr); 676 677 pr_debug("driver removed\n"); 678} 679 680module_init(hl_init); 681module_exit(hl_exit);